diff --git a/.circleci/config.yml b/.circleci/config.yml
deleted file mode 100644
index 90afb1ce29684..0000000000000
--- a/.circleci/config.yml
+++ /dev/null
@@ -1,137 +0,0 @@
-version: 2.1
-
-jobs:
-  test-arm:
-    machine:
-      image: ubuntu-2004:2022.04.1
-    resource_class: arm.large
-    environment:
-      ENV_FILE: ci/deps/circle-310-arm64.yaml
-      PYTEST_WORKERS: auto
-      PATTERN: "not single_cpu and not slow and not network and not clipboard and not arm_slow and not db"
-      PYTEST_TARGET: "pandas"
-      PANDAS_CI: "1"
-    steps:
-      - checkout
-      - run: .circleci/setup_env.sh
-      - run: >
-          PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH
-          LD_PRELOAD=$HOME/miniconda3/envs/pandas-dev/lib/libgomp.so.1:$LD_PRELOAD
-          ci/run_tests.sh
-  linux-musl:
-    docker:
-      - image: quay.io/pypa/musllinux_1_1_aarch64
-    resource_class: arm.large
-    steps:
-      # Install pkgs first to have git in the image
-      # (needed for checkout)
-      - run: |
-          apk update
-          apk add git
-          apk add musl-locales
-      - checkout
-      - run: |
-          /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev
-          . ~/virtualenvs/pandas-dev/bin/activate
-          python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1
-          python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1
-          python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror"
-          python -m pip list --no-cache-dir
-      - run: |
-          . ~/virtualenvs/pandas-dev/bin/activate
-          export PANDAS_CI=1
-          python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml
-  build-aarch64:
-    parameters:
-      cibw-build:
-        type: string
-    machine:
-      image: ubuntu-2004:2022.04.1
-    resource_class: arm.large
-    environment:
-      TRIGGER_SOURCE: << pipeline.trigger_source >>
-    steps:
-      - checkout
-      - run:
-          name: Check if build is necessary
-          command: |
-            # Check if tag is defined or TRIGGER_SOURCE is scheduled
-            if [[ -n "$CIRCLE_TAG" ]]; then
-              echo 'export IS_PUSH="true"' >> "$BASH_ENV"
-            elif [[ $TRIGGER_SOURCE == "scheduled_pipeline" ]]; then
-              echo 'export IS_SCHEDULE_DISPATCH="true"' >> "$BASH_ENV"
-            # Look for the build label/[wheel build] in commit
-            # grep takes a regex, so need to escape brackets
-            elif (git log --format=oneline -n 1 $CIRCLE_SHA1) | grep -q '\[wheel build\]'; then
-              : # Do nothing
-            elif ! (curl https://api.github.com/repos/pandas-dev/pandas/issues/$CIRCLE_PR_NUMBER | jq '.labels' | grep -q 'Build'); then
-              circleci-agent step halt
-            fi
-      - run:
-          name: Build aarch64 wheels
-          no_output_timeout: 30m # Sometimes the tests won't generate any output, make sure the job doesn't get killed by that
-          command: |
-            pip3 install cibuildwheel==2.15.0
-            # When this is a nightly wheel build, allow picking up NumPy 2.0 dev wheels:
-            if [[ "$IS_SCHEDULE_DISPATCH" == "true" || "$IS_PUSH" != 'true' ]]; then
-              export CIBW_ENVIRONMENT="PIP_EXTRA_INDEX_URL=https://pypi.anaconda.org/scientific-python-nightly-wheels/simple"
-            fi
-            cibuildwheel --prerelease-pythons --output-dir wheelhouse
-          environment:
-            CIBW_BUILD: << parameters.cibw-build >>
-
-      - run:
-          name: Install Anaconda Client & Upload Wheels
-          command: |
-            echo "Install Mambaforge"
-            MAMBA_URL="https://github.com/conda-forge/miniforge/releases/download/23.1.0-0/Mambaforge-23.1.0-0-Linux-aarch64.sh"
-            echo "Downloading $MAMBA_URL"
-            wget -q $MAMBA_URL -O minimamba.sh
-            chmod +x minimamba.sh
-
-            MAMBA_DIR="$HOME/miniconda3"
-            rm -rf $MAMBA_DIR
-            ./minimamba.sh -b -p $MAMBA_DIR
-
-            export PATH=$MAMBA_DIR/bin:$PATH
-
-            mamba install -y -c conda-forge anaconda-client
-
-            source ci/upload_wheels.sh
-            set_upload_vars
-            upload_wheels
-      - store_artifacts:
-          path: wheelhouse/
-
-workflows:
-  test:
-    # Don't run trigger this one when scheduled pipeline runs
-    when:
-      not:
-        equal: [ scheduled_pipeline, << pipeline.trigger_source >> ]
-    jobs:
-      - test-arm
-  test-musl:
-    # Don't run trigger this one when scheduled pipeline runs
-    when:
-      not:
-        equal: [ scheduled_pipeline, << pipeline.trigger_source >> ]
-    jobs:
-      - linux-musl
-  build-wheels:
-    jobs:
-      - build-aarch64:
-          filters:
-            tags:
-              only: /^v.*/
-          matrix:
-            parameters:
-              cibw-build: ["cp39-manylinux_aarch64",
-                           "cp310-manylinux_aarch64",
-                           "cp311-manylinux_aarch64",
-                           "cp312-manylinux_aarch64",
-                           "cp39-musllinux_aarch64",
-                           "cp310-musllinux_aarch64",
-                           "cp311-musllinux_aarch64",
-                           "cp312-musllinux_aarch64",]
diff --git a/.circleci/setup_env.sh b/.circleci/setup_env.sh
deleted file mode 100755
index eef4db1191a9a..0000000000000
--- a/.circleci/setup_env.sh
+++ /dev/null
@@ -1,60 +0,0 @@
-#!/bin/bash -e
-
-echo "Install Mambaforge"
-MAMBA_URL="https://github.com/conda-forge/miniforge/releases/download/23.1.0-0/Mambaforge-23.1.0-0-Linux-aarch64.sh"
-echo "Downloading $MAMBA_URL"
-wget -q $MAMBA_URL -O minimamba.sh
-chmod +x minimamba.sh
-
-MAMBA_DIR="$HOME/miniconda3"
-rm -rf $MAMBA_DIR
-./minimamba.sh -b -p $MAMBA_DIR
-
-export PATH=$MAMBA_DIR/bin:$PATH
-
-echo
-echo "which conda"
-which conda
-
-echo
-echo "update conda"
-conda config --set ssl_verify false
-conda config --set quiet true --set always_yes true --set changeps1 false
-mamba install -y -c conda-forge -n base pip setuptools
-
-echo "conda info -a"
-conda info -a
-
-echo "conda list (root environment)"
-conda list
-
-echo
-# Clean up any left-over from a previous build
-mamba env remove -n pandas-dev
-echo "mamba env update --file=${ENV_FILE}"
-# See https://github.com/mamba-org/mamba/issues/633
-mamba create -q -n pandas-dev
-time mamba env update -n pandas-dev --file="${ENV_FILE}"
-
-echo "conda list -n pandas-dev"
-conda list -n pandas-dev
-
-echo "activate pandas-dev"
-source activate pandas-dev
-
-# Explicitly set an environment variable indicating that this is pandas' CI environment.
-#
-# This allows us to enable things like -Werror that shouldn't be activated in
-# downstream CI jobs that may also build pandas from source.
-export PANDAS_CI=1
-
-if pip show pandas 1>/dev/null; then
-  echo
-  echo "remove any installed pandas package w/o removing anything else"
-  pip uninstall -y pandas
-fi
-
-echo "Install pandas"
-python -m pip install --no-build-isolation -ve . --config-settings=setup-args="--werror"
-
-echo "done"
diff --git a/.devcontainer.json b/.devcontainer.json
index 7c5d009260c64..54ddfa1a130f8 100644
--- a/.devcontainer.json
+++ b/.devcontainer.json
@@ -8,7 +8,6 @@
     // Use 'settings' to set *default* container specific settings.json values on container create.
     // You can edit these settings after create using File > Preferences > Settings > Remote.
     "settings": {
-        "terminal.integrated.shell.linux": "/bin/bash",
         "python.pythonPath": "/usr/local/bin/python",
         "python.formatting.provider": "black",
         "python.linting.enabled": true,
diff --git a/.gitattributes b/.gitattributes
index 19c6fd2fd1d47..bc7dec642df0f 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -61,14 +61,13 @@ pandas/_version.py export-subst
 *.pxi export-ignore

 # Ignoring stuff from the top level
-.circleci export-ignore
 .github export-ignore
 asv_bench export-ignore
 ci export-ignore
 doc export-ignore
 gitpod export-ignore
 MANIFEST.in export-ignore
-scripts export-ignore
+scripts/** export-ignore
 typings export-ignore
 web export-ignore
 CITATION.cff export-ignore
@@ -82,3 +81,6 @@ setup.py export-ignore
 # csv_dir_path fixture checks the existence of the directory
 # exclude the whole directory to avoid running related tests in sdist
 pandas/tests/io/parser/data export-ignore
+
+# Include cibw script in sdist since it's needed for building wheels
+scripts/cibw_before_build.sh -export-ignore
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index bacca20780191..3a7c71af02bf9 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -4,19 +4,13 @@
 # ci
 ci/ @mroeschke

-# web
-web/ @datapythonista
-
 # docs
 doc/cheatsheet @Dr-Irv
 doc/source/development @noatamir

 # pandas
-pandas/_libs/ @WillAyd
-pandas/_libs/tslibs/* @MarcoGorelli
 pandas/_typing.py @Dr-Irv
 pandas/core/groupby/* @rhshadrach
-pandas/core/tools/datetimes.py @MarcoGorelli
 pandas/io/excel/* @rhshadrach
 pandas/io/formats/style.py @attack68
 pandas/io/formats/style_render.py @attack68
diff --git a/.github/ISSUE_TEMPLATE/feature_request.yaml b/.github/ISSUE_TEMPLATE/feature_request.yaml
index 6e6cd78ace11d..9c15218794499 100644
--- a/.github/ISSUE_TEMPLATE/feature_request.yaml
+++ b/.github/ISSUE_TEMPLATE/feature_request.yaml
@@ -31,7 +31,7 @@ body:
     attributes:
       label: Feature Description
       description: >
-        Please describe how the new feature would be implemented, using psudocode if relevant.
+        Please describe how the new feature would be implemented, using pseudocode if relevant.
       placeholder: >
         Add a new parameter to DataFrame, to_series, to return a Series if possible.
diff --git a/.github/ISSUE_TEMPLATE/pdep_vote.yaml b/.github/ISSUE_TEMPLATE/pdep_vote.yaml
new file mode 100644
index 0000000000000..6dcbd76eb0f74
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/pdep_vote.yaml
@@ -0,0 +1,74 @@
+name: PDEP Vote
+description: Call for a vote on a PDEP
+title: "VOTE: "
+labels: [Vote]
+
+body:
+  - type: markdown
+    attributes:
+      value: >
+        As per [PDEP-1](https://pandas.pydata.org/pdeps/0001-purpose-and-guidelines.html), the following issue template should be used when a
+        maintainer has opened a PDEP discussion and is ready to call for a vote.
+  - type: checkboxes
+    attributes:
+      label: Locked issue
+      options:
+        - label: >
+            I locked this voting issue so that only voting members are able to cast their votes or
+            comment on this issue.
+          required: true
+  - type: input
+    id: PDEP-name
+    attributes:
+      label: PDEP number and title
+      placeholder: >
+        PDEP-1: Purpose and guidelines
+    validations:
+      required: true
+  - type: input
+    id: PDEP-link
+    attributes:
+      label: Pull request with discussion
+      description: e.g. https://github.com/pandas-dev/pandas/pull/47444
+    validations:
+      required: true
+  - type: input
+    id: PDEP-rendered-link
+    attributes:
+      label: Rendered PDEP for easy reading
+      description: e.g. https://github.com/pandas-dev/pandas/pull/47444/files?short_path=7c449e6#diff-7c449e698132205b235c501f7e47ebba38da4d2b7f9492c98f16745dba787041
+    validations:
+      required: true
+  - type: input
+    id: PDEP-number-of-discussion-participants
+    attributes:
+      label: Discussion participants
+      description: >
+        You may find it useful to list or total the number of participating members in the
+        PDEP discussion PR. This would be the maximum possible disapprove votes.
+      placeholder: >
+        14 voting members participated in the PR discussion thus far.
+  - type: input
+    id: PDEP-vote-end
+    attributes:
+      label: Voting will close in 15 days.
+      description: The voting period end date. ('Voting will close in 15 days.' will be automatically written)
+  - type: markdown
+    attributes:
+      value: ---
+  - type: textarea
+    id: Vote
+    attributes:
+      label: Vote
+      value: |
+        Cast your vote in a comment below.
+        * +1: approve.
+        * 0: abstain.
+          * Reason: A one sentence reason is required.
+        * -1: disapprove
+          * Reason: A one sentence reason is required.
+        A disapprove vote requires prior participation in the linked discussion PR.
+
+        @pandas-dev/pandas-core
+    validations:
+      required: true
diff --git a/.github/actions/build_pandas/action.yml b/.github/actions/build_pandas/action.yml
index 63f687324b0ae..2d208cb38725a 100644
--- a/.github/actions/build_pandas/action.yml
+++ b/.github/actions/build_pandas/action.yml
@@ -4,12 +4,9 @@ inputs:
   editable:
     description: Whether to build pandas in editable mode (default true)
     default: true
-  meson_args:
-    description: Extra flags to pass to meson
-    required: false
-  cflags_adds:
-    description: Items to append to the CFLAGS variable
-    required: false
+  werror:
+    description: Enable werror flag for build
+    default: true
 runs:
   using: composite
   steps:
@@ -30,12 +27,11 @@ runs:

     - name: Build Pandas
       run: |
-        export CFLAGS="$CFLAGS ${{ inputs.cflags_adds }}"
         if [[ ${{ inputs.editable }} == "true" ]]; then
-          pip install -e . --no-build-isolation -v --no-deps ${{ inputs.meson_args }} \
-            --config-settings=setup-args="--werror"
+          pip install -e . --no-build-isolation -v --no-deps \
+            ${{ inputs.werror == 'true' && '-Csetup-args="--werror"' || '' }}
         else
-          pip install . --no-build-isolation -v --no-deps ${{ inputs.meson_args }} \
-            --config-settings=setup-args="--werror"
+          pip install . --no-build-isolation -v --no-deps \
+            ${{ inputs.werror == 'true' && '-Csetup-args="--werror"' || '' }}
         fi
       shell: bash -el {0}
diff --git a/.github/actions/run-tests/action.yml b/.github/actions/run-tests/action.yml
index b4778b74df335..b60245d20e8e4 100644
--- a/.github/actions/run-tests/action.yml
+++ b/.github/actions/run-tests/action.yml
@@ -1,32 +1,20 @@
 name: Run tests and report results
-inputs:
-  preload:
-    description: Preload arguments for sanitizer
-    required: false
-  asan_options:
-    description: Arguments for Address Sanitizer (ASAN)
-    required: false
 runs:
   using: composite
   steps:
     - name: Test
-      run: ${{ inputs.asan_options }} ${{ inputs.preload }} ci/run_tests.sh
+      run: ci/run_tests.sh
      shell: bash -el {0}

    - name: Publish test results
-      uses: actions/upload-artifact@v3
+      uses: actions/upload-artifact@v4
      with:
        name: Test results
        path: test-data.xml
      if: failure()

-    - name: Report Coverage
-      run: coverage report -m
-      shell: bash -el {0}
-      if: failure()
-
    - name: Upload coverage to Codecov
-      uses: codecov/codecov-action@v3
+      uses: codecov/codecov-action@v5
      with:
        flags: unittests
        name: codecov-pandas
diff --git a/.github/actions/setup-conda/action.yml b/.github/actions/setup-conda/action.yml
index ceeebfcd1c90c..a09ac1a4e5ffb 100644
--- a/.github/actions/setup-conda/action.yml
+++ b/.github/actions/setup-conda/action.yml
@@ -7,10 +7,16 @@ runs:
   using: composite
   steps:
     - name: Install ${{ inputs.environment-file }}
-      uses: mamba-org/setup-micromamba@v1
+      uses: mamba-org/setup-micromamba@v2
       with:
         environment-file: ${{ inputs.environment-file }}
         environment-name: test
         condarc-file: ci/.condarc
         cache-environment: true
         cache-downloads: true
+
+    - name: Uninstall pyarrow
+      if: ${{ env.REMOVE_PYARROW == '1' }}
+      run: |
+        micromamba remove -y pyarrow
+      shell: bash -el {0}
diff --git a/.github/workflows/cache-cleanup-weekly.yml b/.github/workflows/cache-cleanup-daily.yml
similarity index 90%
rename from .github/workflows/cache-cleanup-weekly.yml
rename to .github/workflows/cache-cleanup-daily.yml
index 6da31f7354457..8eadfb2ccd2a9 100644
--- a/.github/workflows/cache-cleanup-weekly.yml
+++ b/.github/workflows/cache-cleanup-daily.yml
@@ -1,8 +1,8 @@
-name: Purge caches once a week
+name: Purge caches daily

 on:
   schedule:
-    # 4:10 UTC on Sunday
-    - cron: "10 4 * * 0"
+    # 4:10 UTC daily
+    - cron: "10 4 * * *"

 jobs:
   cleanup:
diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml
index b49b9a67c4743..728019b06e053 100644
--- a/.github/workflows/code-checks.yml
+++ b/.github/workflows/code-checks.yml
@@ -4,11 +4,11 @@ on:
   push:
     branches:
       - main
-      - 2.1.x
+      - 2.3.x
   pull_request:
     branches:
       - main
-      - 2.1.x
+      - 2.3.x

 env:
   ENV_FILE: environment.yml
@@ -21,7 +21,7 @@ permissions:
 jobs:
   docstring_typing_manual_hooks:
     name: Docstring validation, typing, and other manual pre-commit hooks
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
     defaults:
       run:
         shell: bash -el {0}
@@ -51,6 +51,11 @@ jobs:
       # TODO: The doctests have to be run first right now, since the Cython doctests only work
      # with pandas installed in non-editable mode
      # This can be removed once pytest-cython doesn't require C extensions to be installed inplace
+
+      - name: Extra installs
+        # https://pytest-qt.readthedocs.io/en/latest/troubleshooting.html#github-actions-azure-pipelines-travis-ci-and-gitlab-ci-cd
+        run: sudo apt-get update && sudo apt-get install -y libegl1 libopengl0
+
      - name: Run doctests
        run: cd ci && ./code_checks.sh doctests
        if: ${{ steps.build.outcome == 'success' && always() }}
@@ -85,8 +90,8 @@ jobs:
           echo "PYTHONPATH=$PYTHONPATH" >> $GITHUB_ENV
         if: ${{ steps.build.outcome == 'success' && always() }}

-      - name: Typing + pylint
-        uses: pre-commit/action@v3.0.0
+      - name: Typing
+        uses: pre-commit/action@v3.0.1
         with:
           extra_args: --verbose --hook-stage manual --all-files
         if: ${{ steps.build.outcome == 'success' && always() }}
@@ -97,7 +102,7 @@ jobs:

   asv-benchmarks:
     name: ASV Benchmarks
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
     defaults:
       run:
         shell: bash -el {0}
@@ -128,7 +133,7 @@ jobs:

   build_docker_dev_environment:
     name: Build Docker Dev Environment
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
     defaults:
       run:
         shell: bash -el {0}
@@ -155,7 +160,7 @@ jobs:

   requirements-dev-text-installable:
     name: Test install requirements-dev.txt
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04

     concurrency:
       # https://github.community/t/concurrecy-not-work-for-push/183068/7
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 4d0066bc0b48d..44a9b4bfa20b8 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -13,7 +13,7 @@ permissions:

 jobs:
   analyze:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
     permissions:
       actions: read
       contents: read
diff --git a/.github/workflows/comment-commands.yml b/.github/workflows/comment-commands.yml
index 425abef850184..b843363ae8c4d 100644
--- a/.github/workflows/comment-commands.yml
+++ b/.github/workflows/comment-commands.yml
@@ -10,7 +10,7 @@ permissions:

 jobs:
   issue_assign:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
     if: (!github.event.issue.pull_request) && github.event.comment.body == 'take'
     concurrency:
       group: ${{ github.actor }}-issue-assign
@@ -19,19 +19,17 @@ jobs:
         echo "Assigning issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}"
         curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"assignees": ["${{ github.event.comment.user.login }}"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees
   preview_docs:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
     if: github.event.issue.pull_request && github.event.comment.body == '/preview'
     concurrency:
       group: ${{ github.actor }}-preview-docs
     steps:
-      - run: |
-          if curl --output /dev/null --silent --head --fail "https://pandas.pydata.org/preview/${{ github.event.issue.number }}/"; then
-            curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"body": "Website preview of this PR available at: https://pandas.pydata.org/preview/${{ github.event.issue.number }}/"}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/comments
-          else
-            curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"body": "No preview found for PR #${{ github.event.issue.number }}. Did the docs build complete?"}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/comments
-          fi
+      - uses: pandas-dev/github-doc-previewer@v0.3.2
+        with:
+          previewer-server: "https://pandas.pydata.org/preview"
+          artifact-job: "Doc Build and Upload"
   asv_run:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
     # TODO: Support more benchmarking options later, against different branches, against self, etc
     if: github.event.issue.pull_request && startsWith(github.event.comment.body, '@github-actions benchmark')
     defaults:
diff --git a/.github/workflows/deprecation-tracking-bot.yml b/.github/workflows/deprecation-tracking-bot.yml
index ec71daf6f84ab..334a5d77b407b 100644
--- a/.github/workflows/deprecation-tracking-bot.yml
+++ b/.github/workflows/deprecation-tracking-bot.yml
@@ -17,9 +17,9 @@ jobs:
   deprecation_update:
     permissions:
       issues: write
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
     env:
-      DEPRECATION_TRACKER_ISSUE: 50578
+      DEPRECATION_TRACKER_ISSUE: 56596
     steps:
       - uses: actions/github-script@v7
         id: update-deprecation-issue
diff --git a/.github/workflows/docbuild-and-upload.yml b/.github/workflows/docbuild-and-upload.yml
index da232404e6ff5..ba9e30e088c66 100644
--- a/.github/workflows/docbuild-and-upload.yml
+++ b/.github/workflows/docbuild-and-upload.yml
@@ -4,13 +4,13 @@ on:
   push:
     branches:
       - main
-      - 2.1.x
+      - 2.3.x
     tags:
       - '*'
   pull_request:
     branches:
       - main
-      - 2.1.x
+      - 2.3.x

 env:
   ENV_FILE: environment.yml
@@ -23,7 +23,7 @@ permissions:
 jobs:
   web_and_docs:
     name: Doc Build and Upload
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04

     concurrency:
       # https://github.community/t/concurrecy-not-work-for-push/183068/7
@@ -46,6 +46,10 @@ jobs:
       - name: Build Pandas
         uses: ./.github/actions/build_pandas

+      - name: Extra installs
+        # https://pytest-qt.readthedocs.io/en/latest/troubleshooting.html#github-actions-azure-pipelines-travis-ci-and-gitlab-ci-cd
+        run: sudo apt-get update && sudo apt-get install -y libegl1 libopengl0
+
       - name: Test website
         run: python -m pytest web/

@@ -55,6 +59,10 @@ jobs:
       - name: Build documentation
         run: doc/make.py --warnings-are-errors

+      - name: Build the interactive terminal
+        working-directory: web/interactive_terminal
+        run: jupyter lite build
+
       - name: Build documentation zip
         run: doc/make.py zip_html

@@ -90,10 +98,3 @@ jobs:
           name: website
           path: web/build
           retention-days: 14
-
-      - name: Trigger web/doc preview
-        run: curl -X POST https://pandas.pydata.org/preview/submit/$RUN_ID/$PR_ID/
-        env:
-          RUN_ID: ${{ github.run_id }}
-          PR_ID: ${{ github.event.pull_request.number }}
-        if: github.event_name == 'pull_request'
diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml
index 04d8b8e006985..9800cc1694313 100644
--- a/.github/workflows/package-checks.yml
+++ b/.github/workflows/package-checks.yml
@@ -4,11 +4,11 @@ on:
   push:
     branches:
       - main
-      - 2.1.x
+      - 2.3.x
   pull_request:
     branches:
       - main
-      - 2.1.x
+      - 2.3.x
     types: [ labeled, opened, synchronize, reopened ]

 permissions:
@@ -21,10 +21,10 @@ defaults:
 jobs:
   pip:
     if: ${{ github.event.label.name == 'Build' || contains(github.event.pull_request.labels.*.name, 'Build') || github.event_name == 'push'}}
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
     strategy:
       matrix:
-        extra: ["test", "performance", "computation", "fss", "aws", "gcp", "excel", "parquet", "feather", "hdf5", "spss", "postgresql", "mysql", "sql-other", "html", "xml", "plot", "output-formatting", "clipboard", "compression", "consortium-standard", "all"]
+        extra: ["test", "pyarrow", "performance", "computation", "fss", "aws", "gcp", "excel", "parquet", "feather", "hdf5", "spss", "postgresql", "mysql", "sql-other", "html", "xml", "plot", "output-formatting", "clipboard", "compression", "all"]
       fail-fast: false
     name: Install Extras - ${{ matrix.extra }}
     concurrency:
@@ -50,10 +50,10 @@ jobs:
         shell: bash -el {0}
   conda_forge_recipe:
     if: ${{ github.event.label.name == 'Build' || contains(github.event.pull_request.labels.*.name, 'Build') || github.event_name == 'push'}}
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
     strategy:
       matrix:
-        python-version: ['3.9', '3.10', '3.11']
+        python-version: ['3.10', '3.11']
       fail-fast: false
     name: Test Conda Forge Recipe - Python ${{ matrix.python-version }}
     concurrency:
@@ -67,7 +67,7 @@ jobs:
         fetch-depth: 0

       - name: Set up Python
-        uses: mamba-org/setup-micromamba@v1
+        uses: mamba-org/setup-micromamba@v2
         with:
           environment-name: recipe-test
           create-args: >-
diff --git a/.github/workflows/stale-pr.yml b/.github/workflows/stale-pr.yml
index 792afe8f4faf5..3a51dbefc6bb0 100644
--- a/.github/workflows/stale-pr.yml
+++ b/.github/workflows/stale-pr.yml
@@ -12,7 +12,7 @@ jobs:
     permissions:
       pull-requests: write
     if: github.repository_owner == 'pandas-dev'
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
     steps:
       - uses: actions/stale@v9
         with:
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index 6ca4d19196874..59512ddc91a8a 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -4,11 +4,11 @@ on:
   push:
     branches:
       - main
-      - 2.1.x
+      - 2.3.x
   pull_request:
     branches:
       - main
-      - 2.1.x
+      - 2.3.x
     paths-ignore:
       - "doc/**"
       - "web/**"
@@ -22,21 +22,29 @@ defaults:

 jobs:
   ubuntu:
-    runs-on: ubuntu-22.04
+    runs-on: ${{ matrix.platform }}
     timeout-minutes: 90
     strategy:
       matrix:
-        env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml, actions-312.yaml]
+        platform: [ubuntu-24.04, ubuntu-24.04-arm]
+        env_file: [actions-310.yaml, actions-311.yaml, actions-312.yaml, actions-313.yaml]
         # Prevent the include jobs from overriding other jobs
         pattern: [""]
+        pandas_future_infer_string: ["0"]
         include:
           - name: "Downstream Compat"
             env_file: actions-311-downstream_compat.yaml
             pattern: "not slow and not network and not single_cpu"
             pytest_target: "pandas/tests/test_downstream.py"
+            platform: ubuntu-24.04
           - name: "Minimum Versions"
-            env_file: actions-39-minimum_versions.yaml
+            env_file: actions-310-minimum_versions.yaml
             pattern: "not slow and not network and not single_cpu"
+            platform: ubuntu-24.04
+          - name: "Freethreading"
+            env_file: actions-313-freethreading.yaml
+            pattern: "not slow and not network and not single_cpu"
+            platform: ubuntu-24.04
           - name: "Locale: it_IT"
             env_file: actions-311.yaml
             pattern: "not slow and not network and not single_cpu"
@@ -47,6 +55,7 @@ jobs:
             # Also install it_IT (its encoding is ISO8859-1) but do not activate it.
             # It will be temporarily activated during tests with locale.setlocale
             extra_loc: "it_IT"
+            platform: ubuntu-24.04
           - name: "Locale: zh_CN"
             env_file: actions-311.yaml
             pattern: "not slow and not network and not single_cpu"
@@ -57,75 +66,52 @@ jobs:
             # Also install zh_CN (its encoding is gb2312) but do not activate it.
             # It will be temporarily activated during tests with locale.setlocale
             extra_loc: "zh_CN"
-          - name: "Copy-on-Write 3.9"
-            env_file: actions-39.yaml
-            pattern: "not slow and not network and not single_cpu"
-            pandas_copy_on_write: "1"
-          - name: "Copy-on-Write 3.10"
-            env_file: actions-310.yaml
-            pattern: "not slow and not network and not single_cpu"
-            pandas_copy_on_write: "1"
-          - name: "Copy-on-Write 3.11"
-            env_file: actions-311.yaml
-            pattern: "not slow and not network and not single_cpu"
-            pandas_copy_on_write: "1"
-          - name: "Copy-on-Write 3.12"
+            platform: ubuntu-24.04
+          - name: "Future infer strings"
             env_file: actions-312.yaml
-            pattern: "not slow and not network and not single_cpu"
-            pandas_copy_on_write: "1"
-          - name: "Copy-on-Write 3.11 (warnings)"
+            pandas_future_infer_string: "1"
+            platform: ubuntu-24.04
+          - name: "Future infer strings (without pyarrow)"
             env_file: actions-311.yaml
-            pattern: "not slow and not network and not single_cpu"
-            pandas_copy_on_write: "warn"
-          - name: "Copy-on-Write 3.10 (warnings)"
-            env_file: actions-310.yaml
-            pattern: "not slow and not network and not single_cpu"
-            pandas_copy_on_write: "warn"
-          - name: "Copy-on-Write 3.9 (warnings)"
-            env_file: actions-39.yaml
-            pattern: "not slow and not network and not single_cpu"
-            pandas_copy_on_write: "warn"
+            pandas_future_infer_string: "1"
+            platform: ubuntu-24.04
           - name: "Pypy"
             env_file: actions-pypy-39.yaml
             pattern: "not slow and not network and not single_cpu"
             test_args: "--max-worker-restart 0"
+            platform: ubuntu-24.04
           - name: "Numpy Dev"
             env_file: actions-311-numpydev.yaml
             pattern: "not slow and not network and not single_cpu"
-            test_args: "-W error::FutureWarning"
+            test_args: "-W error::DeprecationWarning -W error::FutureWarning"
+            platform: ubuntu-24.04
           - name: "Pyarrow Nightly"
             env_file: actions-311-pyarrownightly.yaml
             pattern: "not slow and not network and not single_cpu"
-          - name: "ASAN / UBSAN"
-            env_file: actions-311-sanitizers.yaml
-            pattern: "not slow and not network and not single_cpu and not skip_ubsan"
-            asan_options: "ASAN_OPTIONS=detect_leaks=0"
-            preload: LD_PRELOAD=$(gcc -print-file-name=libasan.so)
-            meson_args: --config-settings=setup-args="-Db_sanitize=address,undefined"
-            cflags_adds: -fno-sanitize-recover=all
-            pytest_workers: -1 # disable pytest-xdist as it swallows stderr from ASAN
+            pandas_future_infer_string: "1"
+            platform: ubuntu-24.04
       fail-fast: false
-    name: ${{ matrix.name || format('ubuntu-latest {0}', matrix.env_file) }}
+    name: ${{ matrix.name || format('{0} {1}', matrix.platform, matrix.env_file) }}
     env:
       PATTERN: ${{ matrix.pattern }}
       LANG: ${{ matrix.lang || 'C.UTF-8' }}
       LC_ALL: ${{ matrix.lc_all || '' }}
-      PANDAS_COPY_ON_WRITE: ${{ matrix.pandas_copy_on_write || '0' }}
-      PANDAS_CI: ${{ matrix.pandas_ci || '1' }}
+      PANDAS_CI: '1'
+      PANDAS_FUTURE_INFER_STRING: ${{ matrix.pandas_future_infer_string || '0' }}
       TEST_ARGS: ${{ matrix.test_args || '' }}
-      PYTEST_WORKERS: ${{ matrix.pytest_workers || 'auto' }}
+      PYTEST_WORKERS: 'auto'
       PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }}
-      NPY_PROMOTION_STATE: ${{ matrix.env_file == 'actions-311-numpydev.yaml' && 'weak' || 'legacy' }}
       # Clipboard tests
       QT_QPA_PLATFORM: offscreen
+      REMOVE_PYARROW: ${{ matrix.name == 'Future infer strings (without pyarrow)' && '1' || '0' }}
     concurrency:
       # https://github.community/t/concurrecy-not-work-for-push/183068/7
-      group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_copy_on_write || '' }}
+      group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_future_infer_string }}-${{ matrix.platform }}
       cancel-in-progress: true

     services:
       mysql:
-        image: mysql:8.0.33
+        image: mysql:9
         env:
           MYSQL_ALLOW_EMPTY_PASSWORD: yes
           MYSQL_DATABASE: pandas
@@ -138,7 +124,7 @@ jobs:
           - 3306:3306

       postgres:
-        image: postgres:13
+        image: postgres:17
         env:
           PGUSER: postgres
           POSTGRES_USER: postgres
@@ -153,7 +139,7 @@ jobs:
           - 5432:5432

       moto:
-        image: motoserver/moto:4.1.13
+        image: motoserver/moto:5.0.27
         env:
           AWS_ACCESS_KEY_ID: foobar_key
           AWS_SECRET_ACCESS_KEY: foobar_secret
@@ -167,8 +153,8 @@ jobs:
         fetch-depth: 0

     - name: Extra installs
-      run: sudo apt-get update && sudo apt-get install -y ${{ matrix.extra_apt }}
-      if: ${{ matrix.extra_apt }}
+      # https://pytest-qt.readthedocs.io/en/latest/troubleshooting.html#github-actions-azure-pipelines-travis-ci-and-gitlab-ci-cd
+      run: sudo apt-get update && sudo apt-get install -y libegl1 libopengl0 ${{ matrix.extra_apt || ''}}

     - name: Generate extra locales
       # These extra locales will be available for locale.setlocale() calls in tests
@@ -184,24 +170,20 @@ jobs:
       id: build
       uses: ./.github/actions/build_pandas
       with:
-        meson_args: ${{ matrix.meson_args }}
-        cflags_adds: ${{ matrix.cflags_adds }}
+        # xref https://github.com/cython/cython/issues/6870
+        werror: ${{ matrix.name != 'Freethreading' }}
+      # TODO: Re-enable once Pypy has Pypy 3.10 on conda-forge
+      if: ${{ matrix.name != 'Pypy' }}

     - name: Test (not single_cpu)
       uses: ./.github/actions/run-tests
       if: ${{ matrix.name != 'Pypy' }}
-      with:
-        preload: ${{ matrix.preload }}
-        asan_options: ${{ matrix.asan_options }}
       env:
         # Set pattern to not single_cpu if not already set
         PATTERN: ${{ env.PATTERN == '' && 'not single_cpu' || matrix.pattern }}

     - name: Test (single_cpu)
       uses: ./.github/actions/run-tests
-      with:
-        preload: ${{ matrix.preload }}
-        asan_options: ${{ matrix.asan_options }}
       env:
         PATTERN: 'single_cpu'
         PYTEST_WORKERS: 0
@@ -211,8 +193,9 @@ jobs:
     timeout-minutes: 90
     strategy:
       matrix:
-        os: [macos-latest, windows-latest]
-        env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml, actions-312.yaml]
+        # Note: Don't use macOS latest since macos 14 appears to be arm64 only
+        os: [macos-13, macos-14, windows-latest]
+        env_file: [actions-310.yaml, actions-311.yaml, actions-312.yaml, actions-313.yaml]
       fail-fast: false
     runs-on: ${{ matrix.os }}
     name: ${{ format('{0} {1}', matrix.os, matrix.env_file) }}
@@ -224,8 +207,7 @@ jobs:
       PANDAS_CI: 1
       PYTEST_TARGET: pandas
       PATTERN: "not slow and not db and not network and not single_cpu"
-      # GH 47443: PYTEST_WORKERS > 0 crashes Windows builds with memory related errors
-      PYTEST_WORKERS: ${{ matrix.os == 'macos-latest' && 'auto' || '0' }}
+      PYTEST_WORKERS: 'auto'

     steps:
       - name: Checkout
@@ -245,7 +227,7 @@ jobs:
         uses: ./.github/actions/run-tests

   Linux-32-bit:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
     container:
       image: quay.io/pypa/manylinux2014_i686
       options: --platform linux/386
@@ -266,25 +248,26 @@ jobs:
         fi
       - name: Build environment and Run Tests
         # https://github.com/numpy/numpy/issues/24703#issuecomment-1722379388
+        # Note: Pinned to Cython 3.0.10 to avoid numerical instability in 32-bit environments
+        # https://github.com/pandas-dev/pandas/pull/61423
         run: |
-          /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev
+          /opt/python/cp313-cp313/bin/python -m venv ~/virtualenvs/pandas-dev
           . ~/virtualenvs/pandas-dev/bin/activate
           python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.2.1 meson-python==0.13.1
-          python -m pip install numpy --config-settings=setup-args="-Dallow-noblas=true"
-          python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1
-          python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror"
+          python -m pip install numpy -Csetup-args="-Dallow-noblas=true"
+          python -m pip install --no-cache-dir versioneer[toml] cython==3.0.10 python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0
+          python -m pip install --no-cache-dir --no-build-isolation -e . -Csetup-args="--werror"
           python -m pip list --no-cache-dir
-          export PANDAS_CI=1
-          python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml
+          PANDAS_CI=1 python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml
     concurrency:
       # https://github.community/t/concurrecy-not-work-for-push/183068/7
       group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-32bit
       cancel-in-progress: true

   Linux-Musl:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
     container:
-      image: quay.io/pypa/musllinux_1_1_x86_64
+      image: quay.io/pypa/musllinux_1_2_x86_64
     steps:
       - name: Checkout pandas Repo
         # actions/checkout does not work since it requires node
@@ -306,18 +289,17 @@ jobs:
           apk add musl-locales
       - name: Build environment
         run: |
-          /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev
+          /opt/python/cp313-cp313/bin/python -m venv ~/virtualenvs/pandas-dev
           . ~/virtualenvs/pandas-dev/bin/activate
           python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1
-          python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1
-          python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror"
+          python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0
+          python -m pip install --no-cache-dir --no-build-isolation -e . -Csetup-args="--werror"
           python -m pip list --no-cache-dir
       - name: Run Tests
         run: |
           . ~/virtualenvs/pandas-dev/bin/activate
-          export PANDAS_CI=1
-          python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml
+          PANDAS_CI=1 python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml
     concurrency:
       # https://github.community/t/concurrecy-not-work-for-push/183068/7
       group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-musl
@@ -330,7 +312,7 @@ jobs:
     # In general, this will remain frozen(present, but not running) until:
     #    - The next unreleased Python version has released beta 1
     #    - This version should be available on GitHub Actions.
-    #    - Our required build/runtime dependencies(numpy, pytz, Cython, python-dateutil)
+    #    - Our required build/runtime dependencies(numpy, Cython, python-dateutil)
# To unfreeze, comment out the ``if: false`` condition, and make sure you update # the name of the workflow and Python version in actions/setup-python ``python-version:`` @@ -343,7 +325,7 @@ jobs: # To freeze this file, uncomment out the ``if: false`` condition, and migrate the jobs # to the corresponding posix/windows-macos/sdist etc. workflows. # Feel free to modify this comment as necessary. - if: false # Uncomment this to freeze the workflow, comment it to unfreeze + if: false defaults: run: shell: bash -eou pipefail {0} @@ -351,13 +333,14 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-22.04, macOS-latest, windows-latest] + # Separate out macOS 13 and 14, since macOS 14 is arm64 only + os: [ubuntu-24.04, macOS-13, macOS-14, windows-latest] timeout-minutes: 90 concurrency: - #https://github.community/t/concurrecy-not-work-for-push/183068/7 - group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.os }}-${{ matrix.pytest_target }}-dev + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.os }}-python-dev cancel-in-progress: true env: @@ -374,17 +357,79 @@ jobs: - name: Set up Python Dev Version uses: actions/setup-python@v5 with: - python-version: '3.12-dev' + python-version: '3.13-dev' - name: Build Environment run: | python --version python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1 python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy - python -m pip install versioneer[toml] - python -m pip install python-dateutil pytz tzdata cython hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov - python -m pip install -ve . --no-build-isolation --no-index --no-deps --config-settings=setup-args="--werror" + python -m pip install versioneer[toml] python-dateutil tzdata cython hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov + python -m pip install -ve . --no-build-isolation --no-index --no-deps -Csetup-args="--werror" python -m pip list - name: Run Tests uses: ./.github/actions/run-tests + + # NOTE: this job must be kept in sync with the Pyodide build job in wheels.yml + emscripten: + # Note: the Python version, Emscripten toolchain version are determined + # by the Pyodide version. 
The appropriate versions can be found in the + # Pyodide repodata.json "info" field, or in the Makefile.envs file: + # https://github.com/pyodide/pyodide/blob/stable/Makefile.envs#L2 + # The Node.js version can be determined via Pyodide: + # https://pyodide.org/en/stable/usage/index.html#node-js + name: Pyodide build + runs-on: ubuntu-24.04 + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-wasm + cancel-in-progress: true + steps: + - name: Checkout pandas Repo + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Python for pyodide-build + id: setup-python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + + - name: Set up Emscripten toolchain + uses: mymindstorm/setup-emsdk@v14 + with: + version: '3.1.58' + actions-cache-folder: emsdk-cache + + - name: Install pyodide-build + run: pip install "pyodide-build>=0.29.2" + + - name: Build pandas for Pyodide + run: | + pyodide build + + - name: Set up Node.js + uses: actions/setup-node@v4 + with: + node-version: '20' + + - name: Set up Pyodide virtual environment + env: + pyodide-version: '0.27.1' + run: | + pyodide xbuildenv install ${{ env.pyodide-version }} + pyodide venv .venv-pyodide + source .venv-pyodide/bin/activate + pip install dist/*.whl + + - name: Test pandas for Pyodide + env: + PANDAS_CI: 1 + run: | + source .venv-pyodide/bin/activate + pip install pytest hypothesis + # do not import pandas from the checked out repo + cd .. + python -c 'import pandas as pd; pd.test(extra_args=["-m not clipboard and not single_cpu and not slow and not network and not db"])' diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 841559c8e9799..af9c48c57c535 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -40,7 +40,7 @@ jobs: (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'Build')) || (github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && ( ! endsWith(github.ref, 'dev0'))) - runs-on: ubuntu-22.04 + runs-on: ubuntu-24.04 env: IS_PUSH: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }} IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} @@ -92,12 +92,23 @@ jobs: # GitHub Actions doesn't support pairing matrix values together, let's improvise # https://github.com/github/feedback/discussions/7835#discussioncomment-1769026 buildplat: - - [ubuntu-22.04, manylinux_x86_64] - - [ubuntu-22.04, musllinux_x86_64] - - [macos-12, macosx_*] + - [ubuntu-24.04, manylinux_x86_64] + - [ubuntu-24.04, musllinux_x86_64] + - [ubuntu-24.04-arm, manylinux_aarch64] + - [macos-13, macosx_x86_64] + # Note: M1 images on Github Actions start from macOS 14 + - [macos-14, macosx_arm64] - [windows-2022, win_amd64] # TODO: support PyPy? - python: [["cp39", "3.9"], ["cp310", "3.10"], ["cp311", "3.11"], ["cp312", "3.12"]] + python: [["cp310", "3.10"], ["cp311", "3.11"], ["cp312", "3.12"], ["cp313", "3.13"], ["cp313t", "3.13"]] + include: + # Build Pyodide wheels and upload them to Anaconda.org + # NOTE: this job is similar to the one in unit-tests.yml except for the fact + # that it uses cibuildwheel instead of a standard Pyodide xbuildenv setup. 
+ - buildplat: [ubuntu-24.04, pyodide_wasm32] + python: ["cp312", "3.12"] + cibw_build_frontend: 'build' + env: IS_PUSH: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }} IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} @@ -128,7 +139,7 @@ jobs: # Python version used to build sdist doesn't matter # wheel will be built from sdist with the correct version - name: Unzip sdist (macOS) - if: ${{ matrix.buildplat[1] == 'macosx_*' }} + if: ${{ startsWith(matrix.buildplat[1], 'macosx') }} run: | tar -xzf ./dist/${{ env.sdist_name }} -C ./dist @@ -137,29 +148,17 @@ jobs: shell: bash -el {0} run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV" - - name: Build normal wheels - if: ${{ (env.IS_SCHEDULE_DISPATCH != 'true' || env.IS_PUSH == 'true') }} - uses: pypa/cibuildwheel@v2.16.2 + - name: Build wheels + uses: pypa/cibuildwheel@v2.23.3 with: - package-dir: ./dist/${{ matrix.buildplat[1] == 'macosx_*' && env.sdist_name || needs.build_sdist.outputs.sdist_file }} + package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: - CIBW_PRERELEASE_PYTHONS: True - CIBW_BUILD: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }} - - - name: Build nightly wheels (with NumPy pre-release) - if: ${{ (env.IS_SCHEDULE_DISPATCH == 'true' && env.IS_PUSH != 'true') }} - uses: pypa/cibuildwheel@v2.16.2 - with: - package-dir: ./dist/${{ matrix.buildplat[1] == 'macosx_*' && env.sdist_name || needs.build_sdist.outputs.sdist_file }} - env: - # The nightly wheels should be build witht he NumPy 2.0 pre-releases - # which requires the additional URL. - CIBW_ENVIRONMENT: PIP_EXTRA_INDEX_URL=https://pypi.anaconda.org/scientific-python-nightly-wheels/simple - CIBW_PRERELEASE_PYTHONS: True CIBW_BUILD: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }} + CIBW_BUILD_FRONTEND: ${{ matrix.cibw_build_frontend || 'pip' }} + CIBW_PLATFORM: ${{ matrix.buildplat[1] == 'pyodide_wasm32' && 'pyodide' || 'auto' }} - name: Set up Python - uses: mamba-org/setup-micromamba@v1 + uses: mamba-org/setup-micromamba@v2 with: environment-name: wheel-env # Use a fixed Python, since we might have an unreleased Python not @@ -175,23 +174,9 @@ jobs: shell: bash -el {0} run: for whl in $(ls wheelhouse); do wheel unpack wheelhouse/$whl -d /tmp; done - # Testing on windowsservercore instead of GHA runner to fail on missing DLLs - - name: Test Windows Wheels - if: ${{ matrix.buildplat[1] == 'win_amd64' }} - shell: pwsh - run: | - $TST_CMD = @" - python -m pip install hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0; - python -m pip install `$(Get-Item pandas\wheelhouse\*.whl); - python -c `'import pandas as pd; pd.test(extra_args=[\"`\"--no-strict-data-files`\"\", \"`\"-m not clipboard and not single_cpu and not slow and not network and not db`\"\"])`'; - "@ - # add rc to the end of the image name if the Python version is unreleased - docker pull python:${{ matrix.python[1] == '3.12' && '3.12-rc' || format('{0}-windowsservercore', matrix.python[1]) }} - docker run --env PANDAS_CI='1' -v ${PWD}:C:\pandas python:${{ matrix.python[1] == '3.12' && '3.12-rc' || format('{0}-windowsservercore', matrix.python[1]) }} powershell -Command $TST_CMD - - uses: actions/upload-artifact@v4 with: - name: ${{ matrix.python[0] }}-${{ startsWith(matrix.buildplat[1], 'macosx') && 'macosx' || matrix.buildplat[1] }} + name: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }} path: ./wheelhouse/*.whl - name: Upload wheels 
& sdist diff --git a/.gitignore b/.gitignore index a188e216d9f70..d951f3fb9cbad 100644 --- a/.gitignore +++ b/.gitignore @@ -137,3 +137,7 @@ doc/source/savefig/ # Interactive terminal generated files # ######################################## .jupyterlite.doit.db + +# Pyodide/WASM related files # +############################## +/.pyodide-xbuildenv-* diff --git a/.gitpod.yml b/.gitpod.yml index 9222639136a17..5bf028750f30f 100644 --- a/.gitpod.yml +++ b/.gitpod.yml @@ -13,10 +13,10 @@ tasks: mkdir -p .vscode cp gitpod/settings.json .vscode/settings.json git fetch --tags - python -m pip install -ve . --no-build-isolation --config-settings editable-verbose=true - pre-commit install + python -m pip install -ve . --no-build-isolation -Ceditable-verbose=true + pre-commit install --install-hooks command: | - python -m pip install -ve . --no-build-isolation --config-settings editable-verbose=true + python -m pip install -ve . --no-build-isolation -Ceditable-verbose=true echo "✨ Pre-build complete! You can close this terminal ✨ " # -------------------------------------------------------- diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2a070e9a49b97..af93c7f6c76b9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,10 +1,10 @@ -minimum_pre_commit_version: 2.15.0 +minimum_pre_commit_version: 4.0.0 exclude: ^LICENSES/|\.(html|csv|svg)$ # reserve "manual" for relatively slow hooks which we still want to run in CI default_stages: [ - commit, - merge-commit, - push, + pre-commit, + pre-merge-commit, + pre-push, prepare-commit-msg, commit-msg, post-checkout, @@ -16,51 +16,49 @@ ci: autofix_prs: false autoupdate_schedule: monthly # manual stage hooks - skip: [pylint, pyright, mypy] + skip: [pyright, mypy] repos: -- repo: https://github.com/hauntsaninja/black-pre-commit-mirror - # black compiled with mypyc - rev: 23.11.0 - hooks: - - id: black - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.1.6 + rev: v0.11.8 hooks: - id: ruff args: [--exit-non-zero-on-fix] + exclude: ^pandas/tests/frame/test_query_eval.py - id: ruff - # TODO: remove autofixe-only rules when they are checked by ruff + # TODO: remove autofix only rules when they are checked by ruff name: ruff-selected-autofixes alias: ruff-selected-autofixes + files: ^pandas + exclude: ^pandas/tests args: [--select, "ANN001,ANN2", --fix-only, --exit-non-zero-on-fix] + - id: ruff-format + exclude: ^scripts|^pandas/tests/frame/test_query_eval.py - repo: https://github.com/jendrikseipp/vulture - rev: 'v2.10' + rev: v2.14 hooks: - id: vulture entry: python scripts/run_vulture.py pass_filenames: true require_serial: false - repo: https://github.com/codespell-project/codespell - rev: v2.2.6 + rev: v2.4.1 hooks: - id: codespell types_or: [python, rst, markdown, cython, c] additional_dependencies: [tomli] - repo: https://github.com/MarcoGorelli/cython-lint - rev: v0.16.0 + rev: v0.16.6 hooks: - id: cython-lint - id: double-quote-cython-strings - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.5.0 + rev: v5.0.0 hooks: - - id: check-ast - id: check-case-conflict - id: check-toml - id: check-xml - id: check-yaml exclude: ^ci/meta.yaml$ - - id: debug-statements - id: end-of-file-fixer exclude: \.txt$ - id: mixed-line-ending @@ -70,57 +68,49 @@ repos: - id: fix-encoding-pragma args: [--remove] - id: trailing-whitespace -- repo: https://github.com/pylint-dev/pylint - rev: v3.0.1 - hooks: - - id: pylint - stages: [manual] - args: [--load-plugins=pylint.extensions.redefined_loop_name] - - id: pylint - 
alias: redefined-outer-name - name: Redefining name from outer scope - files: ^pandas/ - exclude: | - (?x) - ^pandas/tests # keep excluded - |/_testing/ # keep excluded - |^pandas/util/_test_decorators\.py # keep excluded - |^pandas/_version\.py # keep excluded - |^pandas/conftest\.py # keep excluded - args: [--disable=all, --enable=redefined-outer-name] - stages: [manual] + args: [--markdown-linebreak-ext=md] - repo: https://github.com/PyCQA/isort - rev: 5.12.0 + rev: 6.0.1 hooks: - id: isort - repo: https://github.com/asottile/pyupgrade - rev: v3.15.0 + rev: v3.19.1 hooks: - id: pyupgrade - args: [--py39-plus] + args: [--py310-plus] - repo: https://github.com/pre-commit/pygrep-hooks rev: v1.10.0 hooks: - - id: python-check-blanket-noqa - - id: python-check-blanket-type-ignore - id: rst-backticks - id: rst-directive-colons types: [text] # overwrite types: [rst] types_or: [python, rst] - id: rst-inline-touching-normal + exclude: ^pandas/tests/frame/test_query_eval.py types: [text] # overwrite types: [rst] types_or: [python, rst] - repo: https://github.com/sphinx-contrib/sphinx-lint - rev: v0.9.1 + rev: v1.0.0 hooks: - id: sphinx-lint + args: ["--enable", "all", "--disable", "line-too-long"] - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v17.0.6 + rev: v20.1.3 hooks: - id: clang-format files: ^pandas/_libs/src|^pandas/_libs/include args: [-i] types_or: [c, c++] +- repo: https://github.com/trim21/pre-commit-mirror-meson + rev: v1.8.0 + hooks: + - id: meson-fmt + args: ['--inplace'] +- repo: https://github.com/shellcheck-py/shellcheck-py + rev: v0.10.0.1 + hooks: + - id: shellcheck + args: ["--severity=warning"] - repo: local hooks: - id: pyright @@ -132,7 +122,7 @@ repos: types: [python] stages: [manual] additional_dependencies: &pyright_dependencies - - pyright@1.1.339 + - pyright@1.1.383 - id: pyright # note: assumes python env is setup and activated name: pyright reportGeneralTypeIssues @@ -150,7 +140,7 @@ repos: pass_filenames: false types: [python] stages: [manual] - - id: mypy + - id: stubtest # note: assumes python env is setup and activated # note: requires pandas dev to be installed name: mypy (stubtest) @@ -190,9 +180,6 @@ repos: # Check for deprecated messages without sphinx directive |(DEPRECATED|DEPRECATE|Deprecated)(:|,|\.) - # {foo!r} instead of {repr(foo)} - |!r} - # builtin filter function |(? -
- + + + Pandas Logo + ----------------- @@ -95,7 +96,7 @@ The source code is currently hosted on GitHub at: https://github.com/pandas-dev/pandas Binary installers for the latest released version are available at the [Python -Package Index (PyPI)](https://pypi.org/project/pandas) and on [Conda](https://docs.conda.io/en/latest/). +Package Index (PyPI)](https://pypi.org/project/pandas) and on [Conda](https://anaconda.org/conda-forge/pandas). ```sh # conda @@ -137,7 +138,7 @@ or for installing in [development mode](https://pip.pypa.io/en/latest/cli/pip_in ```sh -python -m pip install -ve . --no-build-isolation --config-settings=editable-verbose=true +python -m pip install -ve . --no-build-isolation -Ceditable-verbose=true ``` See the full instructions for [installing from source](https://pandas.pydata.org/docs/dev/development/contributing_environment.html). diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index 9b0dc14fe6747..30c692115eab1 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -41,7 +41,8 @@ // pip (with all the conda available packages installed first, // followed by the pip installed packages). "matrix": { - "Cython": ["3.0.5"], + "pip+build": [], + "Cython": ["3.0"], "matplotlib": [], "sqlalchemy": [], "scipy": [], diff --git a/asv_bench/benchmarks/algos/isin.py b/asv_bench/benchmarks/algos/isin.py index 2b3d32fb579dc..a17732c70c2c7 100644 --- a/asv_bench/benchmarks/algos/isin.py +++ b/asv_bench/benchmarks/algos/isin.py @@ -59,10 +59,11 @@ def setup(self, dtype): elif dtype in ["str", "string[python]", "string[pyarrow]"]: try: self.series = Series( - Index([f"i-{i}" for i in range(N)], dtype=object), dtype=dtype + Index([f"i-{i}" for i in range(N)], dtype=object)._values, + dtype=dtype, ) - except ImportError: - raise NotImplementedError + except ImportError as err: + raise NotImplementedError from err self.values = list(self.series[:2]) else: diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py index aefc6e6d3c307..953af5c868356 100644 --- a/asv_bench/benchmarks/array.py +++ b/asv_bench/benchmarks/array.py @@ -76,8 +76,8 @@ class ArrowStringArray: def setup(self, multiple_chunks): try: import pyarrow as pa - except ImportError: - raise NotImplementedError + except ImportError as err: + raise NotImplementedError from err strings = np.array([str(i) for i in range(10_000)], dtype=object) if multiple_chunks: chunks = [strings[i : i + 100] for i in range(0, len(strings), 100)] diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 1716110b619d6..7d5b250c7b157 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -24,7 +24,7 @@ def setup(self): self.codes = np.tile(range(len(self.categories)), N) self.datetimes = pd.Series( - pd.date_range("1995-01-01 00:00:00", periods=N / 10, freq="s") + pd.date_range("1995-01-01 00:00:00", periods=N // 10, freq="s") ) self.datetimes_with_nat = self.datetimes.copy() self.datetimes_with_nat.iloc[-1] = pd.NaT @@ -88,7 +88,7 @@ def setup(self): ) for col in ("int", "float", "timestamp"): - self.df[col + "_as_str"] = self.df[col].astype(str) + self.df[f"{col}_as_str"] = self.df[col].astype(str) for col in self.df.columns: self.df[col] = self.df[col].astype("category") diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index a283afd1f0f1e..cd7851acae3f2 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -159,12 +159,6 @@ def 
setup(self): def time_items(self): # (monitor no-copying behaviour) - if hasattr(self.df, "_item_cache"): - self.df._item_cache.clear() - for name, col in self.df.items(): - pass - - def time_items_cached(self): for name, col in self.df.items(): pass @@ -523,7 +517,7 @@ def setup(self): self.df = DataFrame(np.random.randn(1000, 100)) self.s = Series(np.arange(1028.0)) - self.df2 = DataFrame({i: self.s for i in range(1028)}) + self.df2 = DataFrame(dict.fromkeys(range(1028), self.s)) self.df3 = DataFrame(np.random.randn(1000, 3), columns=list("ABC")) def time_apply_user_func(self): @@ -593,8 +587,6 @@ def setup(self): N = 10000 # this is the worst case, where every column has NaNs. arr = np.random.randn(N, 100) - # NB: we need to set values in array, not in df.values, otherwise - # the benchmark will be misleading for ArrayManager arr[::2] = np.nan self.df = DataFrame(arr) @@ -870,4 +862,28 @@ def time_last_valid_index(self, dtype): self.df.last_valid_index() +class Update: + def setup(self): + rng = np.random.default_rng() + self.df = DataFrame(rng.uniform(size=(1_000_000, 10))) + + idx = rng.choice(range(1_000_000), size=1_000_000, replace=False) + self.df_random = DataFrame(self.df, index=idx) + + idx = rng.choice(range(1_000_000), size=100_000, replace=False) + cols = rng.choice(range(10), size=2, replace=False) + self.df_sample = DataFrame( + rng.uniform(size=(100_000, 2)), index=idx, columns=cols + ) + + def time_to_update_big_frame_small_arg(self): + self.df.update(self.df_sample) + + def time_to_update_random_indices(self): + self.df_random.update(self.df_sample) + + def time_to_update_small_frame_big_arg(self): + self.df_sample.update(self.df) + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index abffa1f702b9c..19c556dfe9d1f 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -511,8 +511,7 @@ def setup(self, dtype, method, application, ncols, engine): # grouping on multiple columns # and we lack kernels for a bunch of methods if ( - engine == "numba" - and method in _numba_unsupported_methods + (engine == "numba" and method in _numba_unsupported_methods) or ncols > 1 or application == "transformation" or dtype == "datetime" diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index 637b1b40f59a3..9c1e9656503f7 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -136,10 +136,14 @@ def setup(self): self.int_idxs.append(i_idx) o_idx = i_idx.astype(str) self.object_idxs.append(o_idx) + self.same_range_idx = [self.range_idx] * N def time_append_range_list(self): self.range_idx.append(self.range_idxs) + def time_append_range_list_same(self): + self.range_idx.append(self.same_range_idx) + def time_append_int_list(self): self.int_idx.append(self.int_idxs) diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 9ad1f5b31016d..b2495356f134c 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -3,6 +3,7 @@ lower-level methods directly on Index and subclasses, see index_object.py, indexing_engine.py, and index_cached.py """ + from datetime import datetime import warnings @@ -84,9 +85,7 @@ def time_loc_slice(self, index, index_structure): class NumericMaskedIndexing: monotonic_list = list(range(10**6)) - non_monotonic_list = ( - list(range(50)) + [54, 53, 52, 51] + list(range(55, 10**6 - 1)) - ) + non_monotonic_list = 
list(range(50)) + [54, 53, 52, 51] + list(range(55, 10**6 - 1)) params = [ ("Int64", "UInt64", "Float64"), @@ -547,24 +546,17 @@ def time_chained_indexing(self, mode): class Block: - params = [ - (True, "True"), - (np.array(True), "np.array(True)"), - ] - - def setup(self, true_value, mode): + def setup(self): self.df = DataFrame( False, columns=np.arange(500).astype(str), index=date_range("2010-01-01", "2011-01-01"), ) - self.true_value = true_value - - def time_test(self, true_value, mode): + def time_test(self): start = datetime(2010, 5, 1) end = datetime(2010, 9, 1) - self.df.loc[start:end, :] = true_value + self.df.loc[start:end, :] = True from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/indexing_engines.py b/asv_bench/benchmarks/indexing_engines.py index fd3d0f0b9cf2e..da0e7de585391 100644 --- a/asv_bench/benchmarks/indexing_engines.py +++ b/asv_bench/benchmarks/indexing_engines.py @@ -67,6 +67,14 @@ class NumericEngineIndexing: def setup(self, engine_and_dtype, index_type, unique, N): engine, dtype = engine_and_dtype + if ( + index_type == "non_monotonic" + and dtype in [np.int16, np.int8, np.uint8] + and unique + ): + # Values overflow + raise NotImplementedError + if index_type == "monotonic_incr": if unique: arr = np.arange(N * 3, dtype=dtype) @@ -87,7 +95,7 @@ def setup(self, engine_and_dtype, index_type, unique, N): arr = np.array([1, 2, 3], dtype=dtype).repeat(N) self.data = engine(arr) - # code belows avoids populating the mapping etc. while timing. + # code below avoids populating the mapping etc. while timing. self.data.get_loc(2) self.key_middle = arr[len(arr) // 2] @@ -115,6 +123,14 @@ def setup(self, engine_and_dtype, index_type, unique, N): engine, dtype = engine_and_dtype dtype = dtype.lower() + if ( + index_type == "non_monotonic" + and dtype in ["int16", "int8", "uint8"] + and unique + ): + # Values overflow + raise NotImplementedError + if index_type == "monotonic_incr": if unique: arr = np.arange(N * 3, dtype=dtype) @@ -140,7 +156,7 @@ def setup(self, engine_and_dtype, index_type, unique, N): mask[-1] = True self.data = engine(BaseMaskedArray(arr, mask)) - # code belows avoids populating the mapping etc. while timing. + # code below avoids populating the mapping etc. while timing. self.data.get_loc(2) self.key_middle = arr[len(arr) // 2] @@ -169,7 +185,7 @@ def setup(self, index_type): }[index_type] self.data = libindex.ObjectEngine(arr) - # code belows avoids populating the mapping etc. while timing. + # code below avoids populating the mapping etc. while timing. 
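
For context on the new dtype guards in the indexing-engine hunks above: airspeed velocity treats a NotImplementedError raised inside setup() as "skip this parameter combination" rather than as a failure, which is why the int8/int16/uint8 non-monotonic cases bail out before building arrays whose values would overflow. A minimal, self-contained sketch of that convention follows; the class name and sizes are illustrative only, not part of the diff:

import numpy as np

class SmallIntEngineIndexing:
    # asv-style parameterized benchmark
    params = [np.int8, np.int16, np.int64]
    param_names = ["dtype"]

    def setup(self, dtype):
        n = 2_000_000
        if np.iinfo(dtype).max < n:
            # dtype cannot represent values up to n; skip this
            # combination instead of timing silently-overflowed data
            raise NotImplementedError
        self.arr = np.arange(n, dtype=dtype)

    def time_middle_lookup(self, dtype):
        self.arr.searchsorted(self.arr[len(self.arr) // 2])
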
self.data.get_loc("b") def time_get_loc(self, index_type): diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py index d5c58033c1157..ce3935d2cd0ac 100644 --- a/asv_bench/benchmarks/inference.py +++ b/asv_bench/benchmarks/inference.py @@ -22,23 +22,20 @@ class ToNumeric: - params = ["ignore", "coerce"] - param_names = ["errors"] - - def setup(self, errors): + def setup(self): N = 10000 self.float = Series(np.random.randn(N)) self.numstr = self.float.astype("str") self.str = Series(Index([f"i-{i}" for i in range(N)], dtype=object)) - def time_from_float(self, errors): - to_numeric(self.float, errors=errors) + def time_from_float(self): + to_numeric(self.float, errors="coerce") - def time_from_numeric_str(self, errors): - to_numeric(self.numstr, errors=errors) + def time_from_numeric_str(self): + to_numeric(self.numstr, errors="coerce") - def time_from_str(self, errors): - to_numeric(self.str, errors=errors) + def time_from_str(self): + to_numeric(self.str, errors="coerce") class ToNumericDowncast: @@ -187,7 +184,7 @@ def time_iso8601_tz_spaceformat(self): def time_iso8601_infer_zero_tz_fromat(self): # GH 41047 - to_datetime(self.strings_zero_tz, infer_datetime_format=True) + to_datetime(self.strings_zero_tz) class ToDatetimeNONISO8601: @@ -203,7 +200,7 @@ def time_same_offset(self): to_datetime(self.same_offset) def time_different_offset(self): - to_datetime(self.diff_offset) + to_datetime(self.diff_offset, utc=True) class ToDatetimeFormatQuarters: @@ -234,9 +231,6 @@ def time_no_exact(self): def time_same_offset(self): to_datetime(self.same_offset, format="%m/%d/%Y %H:%M:%S.%f%z") - def time_different_offset(self): - to_datetime(self.diff_offset, format="%m/%d/%Y %H:%M:%S.%f%z") - def time_same_offset_to_utc(self): to_datetime(self.same_offset, format="%m/%d/%Y %H:%M:%S.%f%z", utc=True) @@ -271,16 +265,6 @@ def time_dup_string_tzoffset_dates(self, cache): to_datetime(self.dup_string_with_tz, cache=cache) -# GH 43901 -class ToDatetimeInferDatetimeFormat: - def setup(self): - rng = date_range(start="1/1/2000", periods=100000, freq="h") - self.strings = rng.strftime("%Y-%m-%d %H:%M:%S").tolist() - - def time_infer_datetime_format(self): - to_datetime(self.strings, infer_datetime_format=True) - - class ToTimedelta: def setup(self): self.ints = np.random.randint(0, 60, size=10000) @@ -301,16 +285,13 @@ def time_convert_string_seconds(self): class ToTimedeltaErrors: - params = ["coerce", "ignore"] - param_names = ["errors"] - - def setup(self, errors): + def setup(self): ints = np.random.randint(0, 60, size=10000) self.arr = [f"{i} days" for i in ints] self.arr[-1] = "apple" - def time_convert(self, errors): - to_timedelta(self.arr, errors=errors) + def time_convert(self): + to_timedelta(self.arr, errors="coerce") from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 9ac83db4f85b9..3a15f754ae523 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -408,6 +408,9 @@ def time_read_stringcsv(self, engine): def time_read_bytescsv(self, engine): read_csv(self.data(self.BytesIO_input), engine=engine) + def peakmem_read_csv(self, engine): + read_csv(self.data(self.BytesIO_input), engine=engine) + class ReadCSVCategorical(BaseIO): fname = "__test__.csv" @@ -442,16 +445,6 @@ def setup(self, engine): data = data.format(*two_cols) self.StringIO_input = StringIO(data) - def time_multiple_date(self, engine): - read_csv( - self.data(self.StringIO_input), - 
engine=engine, - sep=",", - header=None, - names=list(string.digits[:9]), - parse_dates=[[1, 2], [1, 3]], - ) - def time_baseline(self, engine): read_csv( self.data(self.StringIO_input), @@ -601,7 +594,7 @@ def setup(self): self.StringIO_input = StringIO(data) def time_read_csv_index_col(self): - read_csv(self.StringIO_input, index_col="a") + read_csv(self.data(self.StringIO_input), index_col="a") class ReadCSVDatePyarrowEngine(StringIORewind): @@ -612,7 +605,7 @@ def setup(self): def time_read_csv_index_col(self): read_csv( - self.StringIO_input, + self.data(self.StringIO_input), parse_dates=["a"], engine="pyarrow", dtype_backend="pyarrow", diff --git a/asv_bench/benchmarks/io/hdf.py b/asv_bench/benchmarks/io/hdf.py index acf0ec4b9d359..2eb4c8c7f674b 100644 --- a/asv_bench/benchmarks/io/hdf.py +++ b/asv_bench/benchmarks/io/hdf.py @@ -123,12 +123,12 @@ def setup(self, format): index=date_range("20000101", periods=N, freq="h"), ) self.df["object"] = Index([f"i-{i}" for i in range(N)], dtype=object) - self.df.to_hdf(self.fname, "df", format=format) + self.df.to_hdf(self.fname, key="df", format=format) # Numeric df self.df1 = self.df.copy() self.df1 = self.df1.reset_index() - self.df1.to_hdf(self.fname, "df1", format=format) + self.df1.to_hdf(self.fname, key="df1", format=format) def time_read_hdf(self, format): read_hdf(self.fname, "df") @@ -137,7 +137,7 @@ def peakmem_read_hdf(self, format): read_hdf(self.fname, "df") def time_write_hdf(self, format): - self.df.to_hdf(self.fname, "df", format=format) + self.df.to_hdf(self.fname, key="df", format=format) from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/parsers.py b/asv_bench/benchmarks/io/parsers.py index 1078837a8e395..d3fd5075a4707 100644 --- a/asv_bench/benchmarks/io/parsers.py +++ b/asv_bench/benchmarks/io/parsers.py @@ -1,10 +1,5 @@ -import numpy as np - try: - from pandas._libs.tslibs.parsing import ( - _does_string_look_like_datetime, - concat_date_cols, - ) + from pandas._libs.tslibs.parsing import _does_string_look_like_datetime except ImportError: # Avoid whole benchmark suite import failure on asv (currently 0.4) pass @@ -20,21 +15,3 @@ def setup(self, value): def time_check_datetimes(self, value): for obj in self.objects: _does_string_look_like_datetime(obj) - - -class ConcatDateCols: - params = ([1234567890, "AAAA"], [1, 2]) - param_names = ["value", "dim"] - - def setup(self, value, dim): - count_elem = 10000 - if dim == 1: - self.object = (np.array([value] * count_elem),) - if dim == 2: - self.object = ( - np.array([value] * count_elem), - np.array([value] * count_elem), - ) - - def time_check_concat(self, value, dim): - concat_date_cols(self.object) diff --git a/asv_bench/benchmarks/io/style.py b/asv_bench/benchmarks/io/style.py index af9eef337e78e..0486cabb29845 100644 --- a/asv_bench/benchmarks/io/style.py +++ b/asv_bench/benchmarks/io/style.py @@ -13,8 +13,8 @@ class Render: def setup(self, cols, rows): self.df = DataFrame( np.random.randn(rows, cols), - columns=[f"float_{i+1}" for i in range(cols)], - index=[f"row_{i+1}" for i in range(rows)], + columns=[f"float_{i + 1}" for i in range(cols)], + index=[f"row_{i + 1}" for i in range(rows)], ) def time_apply_render(self, cols, rows): @@ -76,7 +76,8 @@ def _style_format(self): # apply a formatting function # subset is flexible but hinders vectorised solutions self.st = self.df.style.format( - "{:,.3f}", subset=IndexSlice["row_1":f"row_{ir}", "float_1":f"float_{ic}"] + "{:,.3f}", + subset=IndexSlice["row_1" : f"row_{ir}", 
"float_1" : f"float_{ic}"], ) def _style_apply_format_hide(self): diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index ce64304731116..a6c6990892d38 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -328,6 +328,23 @@ def time_i8merge(self, how): merge(self.left, self.right, how=how) +class UniqueMerge: + params = [4_000_000, 1_000_000] + param_names = ["unique_elements"] + + def setup(self, unique_elements): + N = 1_000_000 + self.left = DataFrame({"a": np.random.randint(1, unique_elements, (N,))}) + self.right = DataFrame({"a": np.random.randint(1, unique_elements, (N,))}) + uniques = self.right.a.drop_duplicates() + self.right["a"] = concat( + [uniques, Series(np.arange(0, -(N - len(uniques)), -1))], ignore_index=True + ) + + def time_unique_merge(self, unique_elements): + merge(self.left, self.right, how="inner") + + class MergeDatetime: params = [ [ diff --git a/asv_bench/benchmarks/libs.py b/asv_bench/benchmarks/libs.py index 3419163bcfe09..7da2d27d98dbb 100644 --- a/asv_bench/benchmarks/libs.py +++ b/asv_bench/benchmarks/libs.py @@ -5,6 +5,7 @@ If a PR does not edit anything in _libs/, then it is unlikely that the benchmarks will be affected. """ + import numpy as np from pandas._libs.lib import ( diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py index 54788d41d83fe..0a588c9a2e22e 100644 --- a/asv_bench/benchmarks/multiindex_object.py +++ b/asv_bench/benchmarks/multiindex_object.py @@ -223,14 +223,20 @@ def time_categorical_level(self): class Equals: def setup(self): - idx_large_fast = RangeIndex(100000) - idx_small_slow = date_range(start="1/1/2012", periods=1) - self.mi_large_slow = MultiIndex.from_product([idx_large_fast, idx_small_slow]) - + self.mi = MultiIndex.from_product( + [ + date_range("2000-01-01", periods=1000), + RangeIndex(1000), + ] + ) + self.mi_deepcopy = self.mi.copy(deep=True) self.idx_non_object = RangeIndex(1) + def time_equals_deepcopy(self): + self.mi.equals(self.mi_deepcopy) + def time_equals_non_object_index(self): - self.mi_large_slow.equals(self.idx_non_object) + self.mi.equals(self.idx_non_object) class SetOperations: diff --git a/asv_bench/benchmarks/package.py b/asv_bench/benchmarks/package.py index 257c82cba8878..f8b51a523dab8 100644 --- a/asv_bench/benchmarks/package.py +++ b/asv_bench/benchmarks/package.py @@ -1,6 +1,7 @@ """ Benchmarks for pandas at the package-level. """ + import subprocess import sys diff --git a/asv_bench/benchmarks/period.py b/asv_bench/benchmarks/period.py index ccd86cae06d58..3b8b60e790380 100644 --- a/asv_bench/benchmarks/period.py +++ b/asv_bench/benchmarks/period.py @@ -2,6 +2,7 @@ Period benchmarks with non-tslibs dependencies. See benchmarks.tslibs.period for benchmarks that rely only on tslibs. 
""" + from pandas import ( DataFrame, Period, diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index bd4da00bfd2ad..f9a5f38c2e349 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -10,7 +10,19 @@ class Methods: ["DataFrame", "Series"], [("rolling", {"window": 10}), ("rolling", {"window": 1000}), ("expanding", {})], ["int", "float"], - ["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum", "sem"], + [ + "median", + "mean", + "max", + "min", + "std", + "count", + "skew", + "kurt", + "sum", + "sem", + "nunique", + ], ) param_names = ["constructor", "window_kwargs", "dtype", "method"] diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index b021af4694d7d..85d34cac5a7bf 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -148,10 +148,14 @@ def time_searchsorted(self, dtype): class Map: - params = (["dict", "Series", "lambda"], ["object", "category", "int"]) - param_names = "mapper" - - def setup(self, mapper, dtype): + params = ( + ["dict", "Series", "lambda"], + ["object", "category", "int"], + [None, "ignore"], + ) + param_names = ["mapper", "dtype", "na_action"] + + def setup(self, mapper, dtype, na_action): map_size = 1000 map_data = Series(map_size - np.arange(map_size), dtype=dtype) @@ -168,8 +172,8 @@ def setup(self, mapper, dtype): self.s = Series(np.random.randint(0, map_size, 10000), dtype=dtype) - def time_map(self, mapper, *args, **kwargs): - self.s.map(self.map_data) + def time_map(self, mapper, dtype, na_action): + self.s.map(self.map_data, na_action=na_action) class Clip: diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py index 89bda81ccf08c..8913293dfa20e 100644 --- a/asv_bench/benchmarks/stat_ops.py +++ b/asv_bench/benchmarks/stat_ops.py @@ -33,6 +33,7 @@ def setup(self, op, axis): ("median", 1), ("median", None), ("std", 1), + ("std", None), ) ): # Skipping cases where datetime aggregations are not implemented diff --git a/asv_bench/benchmarks/strftime.py b/asv_bench/benchmarks/strftime.py index 47f25b331ab9b..5f2832e361eb5 100644 --- a/asv_bench/benchmarks/strftime.py +++ b/asv_bench/benchmarks/strftime.py @@ -79,12 +79,6 @@ def time_frame_period_formatting_default(self, nobs, freq): def time_frame_period_formatting_default_explicit(self, nobs, freq): self.data["p"].dt.strftime(date_format=self.default_fmt) - def time_frame_period_formatting_index_default(self, nobs, freq): - self.data.index.format() - - def time_frame_period_formatting_index_default_explicit(self, nobs, freq): - self.data.index.format(self.default_fmt) - def time_frame_period_formatting_custom(self, nobs, freq): self.data["p"].dt.strftime(date_format="%Y-%m-%d --- %H:%M:%S") diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 1f4a104255057..467fab857d306 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -19,10 +19,11 @@ class Dtypes: def setup(self, dtype): try: self.s = Series( - Index([f"i-{i}" for i in range(10000)], dtype=object), dtype=dtype + Index([f"i-{i}" for i in range(10000)], dtype=object)._values, + dtype=dtype, ) - except ImportError: - raise NotImplementedError + except ImportError as err: + raise NotImplementedError from err class Construction: diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index 8e1deb99a66a4..8deec502898d9 100644 --- a/asv_bench/benchmarks/timeseries.py +++ 
b/asv_bench/benchmarks/timeseries.py @@ -29,7 +29,7 @@ def setup(self, index_type): "dst": date_range( start="10/29/2000 1:00:00", end="10/29/2000 1:59:59", freq="s" ), - "repeated": date_range(start="2000", periods=N / 10, freq="s").repeat(10), + "repeated": date_range(start="2000", periods=N // 10, freq="s").repeat(10), "tz_aware": date_range(start="2000", periods=N, freq="s", tz="US/Eastern"), "tz_local": date_range( start="2000", periods=N, freq="s", tz=dateutil.tz.tzlocal() @@ -183,7 +183,7 @@ def setup(self): self.dt_ts = Series(5, rng3, dtype="datetime64[ns]") def time_resample(self): - self.dt_ts.resample("1S").last() + self.dt_ts.resample("1s").last() class AsOf: diff --git a/asv_bench/benchmarks/tslibs/fields.py b/asv_bench/benchmarks/tslibs/fields.py index 3a2baec54109a..fe31879e67a67 100644 --- a/asv_bench/benchmarks/tslibs/fields.py +++ b/asv_bench/benchmarks/tslibs/fields.py @@ -19,10 +19,15 @@ class TimeGetTimedeltaField: def setup(self, size, field): arr = np.random.randint(0, 10, size=size, dtype="i8") self.i8data = arr + arr = np.random.randint(-86400 * 1_000_000_000, 0, size=size, dtype="i8") + self.i8data_negative = arr def time_get_timedelta_field(self, size, field): get_timedelta_field(self.i8data, field) + def time_get_timedelta_field_negative_td(self, size, field): + get_timedelta_field(self.i8data_negative, field) + class TimeGetDateField: params = [ @@ -72,3 +77,6 @@ def setup(self, size, side, period, freqstr, month_kw): def time_get_start_end_field(self, size, side, period, freqstr, month_kw): get_start_end_field(self.i8data, self.attrname, freqstr, month_kw=month_kw) + + +from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/tslibs/offsets.py b/asv_bench/benchmarks/tslibs/offsets.py index 1f48ec504acf1..55bd3c31c055c 100644 --- a/asv_bench/benchmarks/tslibs/offsets.py +++ b/asv_bench/benchmarks/tslibs/offsets.py @@ -2,6 +2,7 @@ offsets benchmarks that rely only on tslibs. See benchmarks.offset for offsets benchmarks that rely on other parts of pandas. """ + from datetime import datetime import numpy as np diff --git a/asv_bench/benchmarks/tslibs/resolution.py b/asv_bench/benchmarks/tslibs/resolution.py index 44f288c7de216..6317d299379d3 100644 --- a/asv_bench/benchmarks/tslibs/resolution.py +++ b/asv_bench/benchmarks/tslibs/resolution.py @@ -17,6 +17,7 @@ df.loc[key] = (val.average, val.stdev) """ + import numpy as np try: diff --git a/asv_bench/benchmarks/tslibs/timedelta.py b/asv_bench/benchmarks/tslibs/timedelta.py index 2daf1861eb80a..9d9689fcfa94b 100644 --- a/asv_bench/benchmarks/tslibs/timedelta.py +++ b/asv_bench/benchmarks/tslibs/timedelta.py @@ -2,6 +2,7 @@ Timedelta benchmarks that rely only on tslibs. See benchmarks.timedeltas for Timedelta benchmarks that rely on other parts of pandas. 
""" + import datetime import numpy as np @@ -19,7 +20,7 @@ def time_from_int(self): Timedelta(123456789) def time_from_unit(self): - Timedelta(1, unit="d") + Timedelta(1, unit="D") def time_from_components(self): Timedelta( diff --git a/asv_bench/benchmarks/tslibs/timestamp.py b/asv_bench/benchmarks/tslibs/timestamp.py index 082220ee0dff2..6145966fb6a0e 100644 --- a/asv_bench/benchmarks/tslibs/timestamp.py +++ b/asv_bench/benchmarks/tslibs/timestamp.py @@ -1,7 +1,10 @@ -from datetime import datetime +from datetime import ( + datetime, + timezone, +) +import zoneinfo import numpy as np -import pytz from pandas import Timestamp @@ -12,7 +15,7 @@ class TimestampConstruction: def setup(self): self.npdatetime64 = np.datetime64("2020-01-01 00:00:00") self.dttime_unaware = datetime(2020, 1, 1, 0, 0, 0) - self.dttime_aware = datetime(2020, 1, 1, 0, 0, 0, 0, pytz.UTC) + self.dttime_aware = datetime(2020, 1, 1, 0, 0, 0, 0, timezone.utc) self.ts = Timestamp("2020-01-01 00:00:00") def time_parse_iso8601_no_tz(self): @@ -113,7 +116,7 @@ def setup(self, tz): self.ts = Timestamp("2017-08-25 08:16:14", tz=tz) def time_replace_tz(self, tz): - self.ts.replace(tzinfo=pytz.timezone("US/Eastern")) + self.ts.replace(tzinfo=zoneinfo.ZoneInfo("US/Eastern")) def time_replace_None(self, tz): self.ts.replace(tzinfo=None) @@ -144,8 +147,8 @@ def time_ceil(self, tz): class TimestampAcrossDst: def setup(self): - dt = datetime(2016, 3, 27, 1) - self.tzinfo = pytz.timezone("CET").localize(dt, is_dst=False).tzinfo + dt = datetime(2016, 3, 27, 1, fold=0) + self.tzinfo = dt.astimezone(zoneinfo.ZoneInfo("Europe/Berlin")).tzinfo self.ts2 = Timestamp(dt) def time_replace_across_dst(self): diff --git a/asv_bench/benchmarks/tslibs/tslib.py b/asv_bench/benchmarks/tslibs/tslib.py index 97ec80201dd16..885cf48d01743 100644 --- a/asv_bench/benchmarks/tslibs/tslib.py +++ b/asv_bench/benchmarks/tslibs/tslib.py @@ -15,17 +15,18 @@ val = %timeit -o tr.time_ints_to_pydatetime(box, size, tz) df.loc[key] = (val.average, val.stdev) """ + from datetime import ( timedelta, timezone, ) +import zoneinfo from dateutil.tz import ( gettz, tzlocal, ) import numpy as np -import pytz try: from pandas._libs.tslibs import ints_to_pydatetime @@ -37,7 +38,7 @@ None, timezone.utc, timezone(timedelta(minutes=60)), - pytz.timezone("US/Pacific"), + zoneinfo.ZoneInfo("US/Pacific"), gettz("Asia/Tokyo"), tzlocal_obj, ] diff --git a/asv_bench/benchmarks/tslibs/tz_convert.py b/asv_bench/benchmarks/tslibs/tz_convert.py index c6b510efdca69..c87adb5e5d0e9 100644 --- a/asv_bench/benchmarks/tslibs/tz_convert.py +++ b/asv_bench/benchmarks/tslibs/tz_convert.py @@ -1,5 +1,6 @@ +from datetime import timezone + import numpy as np -from pytz import UTC from pandas._libs.tslibs.tzconversion import tz_localize_to_utc @@ -41,7 +42,7 @@ def time_tz_convert_from_utc(self, size, tz): # dti = DatetimeIndex(self.i8data, tz=tz) # dti.tz_localize(None) if old_sig: - tz_convert_from_utc(self.i8data, UTC, tz) + tz_convert_from_utc(self.i8data, timezone.utc, tz) else: tz_convert_from_utc(self.i8data, tz) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index e41f625e583c0..a0d23aa0478d2 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -16,17 +16,23 @@ set -uo pipefail -[[ -z "$1" || "$1" == "code" || "$1" == "doctests" || "$1" == "docstrings" || "$1" == "single-docs" || "$1" == "notebooks" ]] || \ - { echo "Unknown command $1. 
Usage: $0 [code|doctests|docstrings|single-docs|notebooks]"; exit 9999; } +if [[ -v 1 ]]; then + CHECK=$1 +else + # script will fail if it uses an unset variable (i.e. $1 is not provided) + CHECK="" +fi + +[[ -z "$CHECK" || "$CHECK" == "code" || "$CHECK" == "doctests" || "$CHECK" == "docstrings" || "$CHECK" == "single-docs" || "$CHECK" == "notebooks" ]] || \ + { echo "Unknown command $1. Usage: $0 [code|doctests|docstrings|single-docs|notebooks]"; exit 1; } -BASE_DIR="$(dirname $0)/.." +BASE_DIR="$(dirname "$0")/.." RET=0 -CHECK=$1 ### CODE ### if [[ -z "$CHECK" || "$CHECK" == "code" ]]; then - MSG='Check import. No warnings, and blocklist some optional dependencies' ; echo $MSG + MSG='Check import. No warnings, and blocklist some optional dependencies' ; echo "$MSG" python -W error -c " import sys import pandas @@ -43,126 +49,227 @@ if mods: sys.stderr.write('err: pandas should not import: {}\n'.format(', '.join(mods))) sys.exit(len(mods)) " - RET=$(($RET + $?)) ; echo $MSG "DONE" + RET=$(($RET + $?)) ; echo "$MSG" "DONE" fi ### DOCTESTS ### if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then - MSG='Python and Cython Doctests' ; echo $MSG + MSG='Python and Cython Doctests' ; echo "$MSG" python -c 'import pandas as pd; pd.test(run_doctests=True)' - RET=$(($RET + $?)) ; echo $MSG "DONE" + RET=$(($RET + $?)) ; echo "$MSG" "DONE" fi ### DOCSTRINGS ### if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then - MSG='Validate docstrings (EX01, EX02, EX04, GL01, GL02, GL03, GL04, GL05, GL06, GL07, GL09, GL10, PR03, PR04, PR05, PR06, PR08, PR09, PR10, RT01, RT02, RT04, RT05, SA02, SA03, SA04, SS01, SS02, SS03, SS04, SS05, SS06)' ; echo $MSG - $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX01,EX02,EX04,GL01,GL02,GL03,GL04,GL05,GL06,GL07,GL09,GL10,PR03,PR04,PR05,PR06,PR08,PR09,PR10,RT01,RT02,RT04,RT05,SA02,SA03,SA04,SS01,SS02,SS03,SS04,SS05,SS06 - RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Validate Docstrings' ; echo "$MSG" + "$BASE_DIR"/scripts/validate_docstrings.py \ + --format=actions \ + -i ES01 `# For now it is ok if docstrings are missing the extended summary` \ + -i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \ + -i "pandas.Period.freq GL08" \ + -i "pandas.Period.ordinal GL08" \ + -i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \ + -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \ + -i "pandas.core.resample.Resampler.quantile PR01,PR07" \ + -i "pandas.tseries.offsets.BDay PR02,SA01" \ + -i "pandas.tseries.offsets.BHalfYearBegin.is_on_offset GL08" \ + -i "pandas.tseries.offsets.BHalfYearBegin.n GL08" \ + -i "pandas.tseries.offsets.BHalfYearBegin.normalize GL08" \ + -i "pandas.tseries.offsets.BHalfYearBegin.rule_code GL08" \ + -i "pandas.tseries.offsets.BHalfYearBegin.startingMonth GL08" \ + -i "pandas.tseries.offsets.BHalfYearEnd.is_on_offset GL08" \ + -i "pandas.tseries.offsets.BHalfYearEnd.n GL08" \ + -i "pandas.tseries.offsets.BHalfYearEnd.normalize GL08" \ + -i "pandas.tseries.offsets.BHalfYearEnd.rule_code GL08" \ + -i "pandas.tseries.offsets.BHalfYearEnd.startingMonth GL08" \ + -i "pandas.tseries.offsets.BQuarterBegin.is_on_offset GL08" \ + -i "pandas.tseries.offsets.BQuarterBegin.n GL08" \ + -i "pandas.tseries.offsets.BQuarterBegin.normalize GL08" \ + -i "pandas.tseries.offsets.BQuarterBegin.rule_code GL08" \ + -i "pandas.tseries.offsets.BQuarterBegin.startingMonth GL08" \ + -i "pandas.tseries.offsets.BQuarterEnd.is_on_offset GL08" \ + -i "pandas.tseries.offsets.BQuarterEnd.n GL08" \ + -i 
"pandas.tseries.offsets.BQuarterEnd.normalize GL08" \ + -i "pandas.tseries.offsets.BQuarterEnd.rule_code GL08" \ + -i "pandas.tseries.offsets.BQuarterEnd.startingMonth GL08" \ + -i "pandas.tseries.offsets.BYearBegin.is_on_offset GL08" \ + -i "pandas.tseries.offsets.BYearBegin.month GL08" \ + -i "pandas.tseries.offsets.BYearBegin.n GL08" \ + -i "pandas.tseries.offsets.BYearBegin.normalize GL08" \ + -i "pandas.tseries.offsets.BYearEnd.is_on_offset GL08" \ + -i "pandas.tseries.offsets.BYearEnd.month GL08" \ + -i "pandas.tseries.offsets.BYearEnd.n GL08" \ + -i "pandas.tseries.offsets.BYearEnd.normalize GL08" \ + -i "pandas.tseries.offsets.BusinessDay PR02,SA01" \ + -i "pandas.tseries.offsets.BusinessDay.calendar GL08" \ + -i "pandas.tseries.offsets.BusinessDay.holidays GL08" \ + -i "pandas.tseries.offsets.BusinessDay.is_on_offset GL08" \ + -i "pandas.tseries.offsets.BusinessDay.n GL08" \ + -i "pandas.tseries.offsets.BusinessDay.normalize GL08" \ + -i "pandas.tseries.offsets.BusinessDay.weekmask GL08" \ + -i "pandas.tseries.offsets.BusinessHour PR02,SA01" \ + -i "pandas.tseries.offsets.BusinessHour.calendar GL08" \ + -i "pandas.tseries.offsets.BusinessHour.end GL08" \ + -i "pandas.tseries.offsets.BusinessHour.holidays GL08" \ + -i "pandas.tseries.offsets.BusinessHour.is_on_offset GL08" \ + -i "pandas.tseries.offsets.BusinessHour.n GL08" \ + -i "pandas.tseries.offsets.BusinessHour.normalize GL08" \ + -i "pandas.tseries.offsets.BusinessHour.start GL08" \ + -i "pandas.tseries.offsets.BusinessHour.weekmask GL08" \ + -i "pandas.tseries.offsets.BusinessMonthBegin.is_on_offset GL08" \ + -i "pandas.tseries.offsets.BusinessMonthBegin.n GL08" \ + -i "pandas.tseries.offsets.BusinessMonthBegin.normalize GL08" \ + -i "pandas.tseries.offsets.BusinessMonthEnd.is_on_offset GL08" \ + -i "pandas.tseries.offsets.BusinessMonthEnd.n GL08" \ + -i "pandas.tseries.offsets.BusinessMonthEnd.normalize GL08" \ + -i "pandas.tseries.offsets.CBMonthBegin PR02" \ + -i "pandas.tseries.offsets.CBMonthEnd PR02" \ + -i "pandas.tseries.offsets.CDay PR02,SA01" \ + -i "pandas.tseries.offsets.CustomBusinessDay PR02,SA01" \ + -i "pandas.tseries.offsets.CustomBusinessDay.calendar GL08" \ + -i "pandas.tseries.offsets.CustomBusinessDay.holidays GL08" \ + -i "pandas.tseries.offsets.CustomBusinessDay.is_on_offset GL08" \ + -i "pandas.tseries.offsets.CustomBusinessDay.n GL08" \ + -i "pandas.tseries.offsets.CustomBusinessDay.normalize GL08" \ + -i "pandas.tseries.offsets.CustomBusinessDay.weekmask GL08" \ + -i "pandas.tseries.offsets.CustomBusinessHour PR02,SA01" \ + -i "pandas.tseries.offsets.CustomBusinessHour.calendar GL08" \ + -i "pandas.tseries.offsets.CustomBusinessHour.end GL08" \ + -i "pandas.tseries.offsets.CustomBusinessHour.holidays GL08" \ + -i "pandas.tseries.offsets.CustomBusinessHour.is_on_offset GL08" \ + -i "pandas.tseries.offsets.CustomBusinessHour.n GL08" \ + -i "pandas.tseries.offsets.CustomBusinessHour.normalize GL08" \ + -i "pandas.tseries.offsets.CustomBusinessHour.start GL08" \ + -i "pandas.tseries.offsets.CustomBusinessHour.weekmask GL08" \ + -i "pandas.tseries.offsets.CustomBusinessMonthBegin PR02" \ + -i "pandas.tseries.offsets.CustomBusinessMonthBegin.calendar GL08" \ + -i "pandas.tseries.offsets.CustomBusinessMonthBegin.holidays GL08" \ + -i "pandas.tseries.offsets.CustomBusinessMonthBegin.m_offset GL08" \ + -i "pandas.tseries.offsets.CustomBusinessMonthBegin.n GL08" \ + -i "pandas.tseries.offsets.CustomBusinessMonthBegin.normalize GL08" \ + -i "pandas.tseries.offsets.CustomBusinessMonthBegin.weekmask GL08" \ + 
-i "pandas.tseries.offsets.CustomBusinessMonthEnd PR02" \ + -i "pandas.tseries.offsets.CustomBusinessMonthEnd.calendar GL08" \ + -i "pandas.tseries.offsets.CustomBusinessMonthEnd.holidays GL08" \ + -i "pandas.tseries.offsets.CustomBusinessMonthEnd.m_offset GL08" \ + -i "pandas.tseries.offsets.CustomBusinessMonthEnd.n GL08" \ + -i "pandas.tseries.offsets.CustomBusinessMonthEnd.normalize GL08" \ + -i "pandas.tseries.offsets.CustomBusinessMonthEnd.weekmask GL08" \ + -i "pandas.tseries.offsets.DateOffset.is_on_offset GL08" \ + -i "pandas.tseries.offsets.DateOffset.n GL08" \ + -i "pandas.tseries.offsets.DateOffset.normalize GL08" \ + -i "pandas.tseries.offsets.Day.is_on_offset GL08" \ + -i "pandas.tseries.offsets.Day.n GL08" \ + -i "pandas.tseries.offsets.Day.normalize GL08" \ + -i "pandas.tseries.offsets.Easter.is_on_offset GL08" \ + -i "pandas.tseries.offsets.Easter.n GL08" \ + -i "pandas.tseries.offsets.Easter.normalize GL08" \ + -i "pandas.tseries.offsets.FY5253.get_rule_code_suffix GL08" \ + -i "pandas.tseries.offsets.FY5253.get_year_end GL08" \ + -i "pandas.tseries.offsets.FY5253.is_on_offset GL08" \ + -i "pandas.tseries.offsets.FY5253.n GL08" \ + -i "pandas.tseries.offsets.FY5253.normalize GL08" \ + -i "pandas.tseries.offsets.FY5253.rule_code GL08" \ + -i "pandas.tseries.offsets.FY5253.startingMonth GL08" \ + -i "pandas.tseries.offsets.FY5253.variation GL08" \ + -i "pandas.tseries.offsets.FY5253.weekday GL08" \ + -i "pandas.tseries.offsets.FY5253Quarter.get_rule_code_suffix GL08" \ + -i "pandas.tseries.offsets.FY5253Quarter.get_weeks GL08" \ + -i "pandas.tseries.offsets.FY5253Quarter.is_on_offset GL08" \ + -i "pandas.tseries.offsets.FY5253Quarter.n GL08" \ + -i "pandas.tseries.offsets.FY5253Quarter.normalize GL08" \ + -i "pandas.tseries.offsets.FY5253Quarter.qtr_with_extra_week GL08" \ + -i "pandas.tseries.offsets.FY5253Quarter.rule_code GL08" \ + -i "pandas.tseries.offsets.FY5253Quarter.startingMonth GL08" \ + -i "pandas.tseries.offsets.FY5253Quarter.variation GL08" \ + -i "pandas.tseries.offsets.FY5253Quarter.weekday GL08" \ + -i "pandas.tseries.offsets.FY5253Quarter.year_has_extra_week GL08" \ + -i "pandas.tseries.offsets.HalfYearBegin.is_on_offset GL08" \ + -i "pandas.tseries.offsets.HalfYearBegin.n GL08" \ + -i "pandas.tseries.offsets.HalfYearBegin.normalize GL08" \ + -i "pandas.tseries.offsets.HalfYearBegin.rule_code GL08" \ + -i "pandas.tseries.offsets.HalfYearBegin.startingMonth GL08" \ + -i "pandas.tseries.offsets.HalfYearEnd.is_on_offset GL08" \ + -i "pandas.tseries.offsets.HalfYearEnd.n GL08" \ + -i "pandas.tseries.offsets.HalfYearEnd.normalize GL08" \ + -i "pandas.tseries.offsets.HalfYearEnd.rule_code GL08" \ + -i "pandas.tseries.offsets.HalfYearEnd.startingMonth GL08" \ + -i "pandas.tseries.offsets.Hour.is_on_offset GL08" \ + -i "pandas.tseries.offsets.Hour.n GL08" \ + -i "pandas.tseries.offsets.Hour.normalize GL08" \ + -i "pandas.tseries.offsets.LastWeekOfMonth.is_on_offset GL08" \ + -i "pandas.tseries.offsets.LastWeekOfMonth.n GL08" \ + -i "pandas.tseries.offsets.LastWeekOfMonth.normalize GL08" \ + -i "pandas.tseries.offsets.LastWeekOfMonth.week GL08" \ + -i "pandas.tseries.offsets.LastWeekOfMonth.weekday GL08" \ + -i "pandas.tseries.offsets.Micro.is_on_offset GL08" \ + -i "pandas.tseries.offsets.Micro.n GL08" \ + -i "pandas.tseries.offsets.Micro.normalize GL08" \ + -i "pandas.tseries.offsets.Milli.is_on_offset GL08" \ + -i "pandas.tseries.offsets.Milli.n GL08" \ + -i "pandas.tseries.offsets.Milli.normalize GL08" \ + -i "pandas.tseries.offsets.Minute.is_on_offset GL08" \ + 
-i "pandas.tseries.offsets.Minute.n GL08" \ + -i "pandas.tseries.offsets.Minute.normalize GL08" \ + -i "pandas.tseries.offsets.MonthBegin.is_on_offset GL08" \ + -i "pandas.tseries.offsets.MonthBegin.n GL08" \ + -i "pandas.tseries.offsets.MonthBegin.normalize GL08" \ + -i "pandas.tseries.offsets.MonthEnd.is_on_offset GL08" \ + -i "pandas.tseries.offsets.MonthEnd.n GL08" \ + -i "pandas.tseries.offsets.MonthEnd.normalize GL08" \ + -i "pandas.tseries.offsets.Nano.is_on_offset GL08" \ + -i "pandas.tseries.offsets.Nano.normalize GL08" \ + -i "pandas.tseries.offsets.Nano.n GL08" \ + -i "pandas.tseries.offsets.QuarterBegin.is_on_offset GL08" \ + -i "pandas.tseries.offsets.QuarterBegin.n GL08" \ + -i "pandas.tseries.offsets.QuarterBegin.normalize GL08" \ + -i "pandas.tseries.offsets.QuarterBegin.rule_code GL08" \ + -i "pandas.tseries.offsets.QuarterBegin.startingMonth GL08" \ + -i "pandas.tseries.offsets.QuarterEnd.is_on_offset GL08" \ + -i "pandas.tseries.offsets.QuarterEnd.n GL08" \ + -i "pandas.tseries.offsets.QuarterEnd.normalize GL08" \ + -i "pandas.tseries.offsets.QuarterEnd.rule_code GL08" \ + -i "pandas.tseries.offsets.QuarterEnd.startingMonth GL08" \ + -i "pandas.tseries.offsets.Second.is_on_offset GL08" \ + -i "pandas.tseries.offsets.Second.n GL08" \ + -i "pandas.tseries.offsets.Second.normalize GL08" \ + -i "pandas.tseries.offsets.SemiMonthBegin.day_of_month GL08" \ + -i "pandas.tseries.offsets.SemiMonthBegin.is_on_offset GL08" \ + -i "pandas.tseries.offsets.SemiMonthBegin.n GL08" \ + -i "pandas.tseries.offsets.SemiMonthBegin.normalize GL08" \ + -i "pandas.tseries.offsets.SemiMonthBegin.rule_code GL08" \ + -i "pandas.tseries.offsets.SemiMonthEnd.day_of_month GL08" \ + -i "pandas.tseries.offsets.SemiMonthEnd.is_on_offset GL08" \ + -i "pandas.tseries.offsets.SemiMonthEnd.n GL08" \ + -i "pandas.tseries.offsets.SemiMonthEnd.normalize GL08" \ + -i "pandas.tseries.offsets.SemiMonthEnd.rule_code GL08" \ + -i "pandas.tseries.offsets.Tick GL08" \ + -i "pandas.tseries.offsets.Tick.is_on_offset GL08" \ + -i "pandas.tseries.offsets.Tick.n GL08" \ + -i "pandas.tseries.offsets.Tick.normalize GL08" \ + -i "pandas.tseries.offsets.Week.is_on_offset GL08" \ + -i "pandas.tseries.offsets.Week.n GL08" \ + -i "pandas.tseries.offsets.Week.normalize GL08" \ + -i "pandas.tseries.offsets.Week.weekday GL08" \ + -i "pandas.tseries.offsets.WeekOfMonth.is_on_offset GL08" \ + -i "pandas.tseries.offsets.WeekOfMonth.n GL08" \ + -i "pandas.tseries.offsets.WeekOfMonth.normalize GL08" \ + -i "pandas.tseries.offsets.WeekOfMonth.week GL08" \ + -i "pandas.tseries.offsets.WeekOfMonth.weekday GL08" \ + -i "pandas.tseries.offsets.YearBegin.is_on_offset GL08" \ + -i "pandas.tseries.offsets.YearBegin.month GL08" \ + -i "pandas.tseries.offsets.YearBegin.n GL08" \ + -i "pandas.tseries.offsets.YearBegin.normalize GL08" \ + -i "pandas.tseries.offsets.YearEnd.is_on_offset GL08" \ + -i "pandas.tseries.offsets.YearEnd.month GL08" \ + -i "pandas.tseries.offsets.YearEnd.n GL08" \ + -i "pandas.tseries.offsets.YearEnd.normalize GL08" \ + -i "pandas.util.hash_pandas_object PR07,SA01" # There should be no backslash in the final line, please keep this comment in the last ignored function - MSG='Partially validate docstrings (EX03)' ; echo $MSG - $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX03 --ignore_functions \ - pandas.Series.dt.day_name \ - pandas.Series.str.len \ - pandas.Series.cat.set_categories \ - pandas.Series.plot.bar \ - pandas.Series.plot.hist \ - pandas.Series.plot.line \ - pandas.Series.to_sql \ - 
pandas.Series.to_latex \ - pandas.errors.CategoricalConversionWarning \ - pandas.errors.ChainedAssignmentError \ - pandas.errors.ClosedFileError \ - pandas.errors.DatabaseError \ - pandas.errors.IndexingError \ - pandas.errors.InvalidColumnName \ - pandas.errors.NumExprClobberingError \ - pandas.errors.PossibleDataLossError \ - pandas.errors.PossiblePrecisionLoss \ - pandas.errors.SettingWithCopyError \ - pandas.errors.SettingWithCopyWarning \ - pandas.errors.SpecificationError \ - pandas.errors.UndefinedVariableError \ - pandas.errors.ValueLabelTypeMismatch \ - pandas.Timestamp.ceil \ - pandas.Timestamp.floor \ - pandas.Timestamp.round \ - pandas.read_pickle \ - pandas.ExcelWriter \ - pandas.read_json \ - pandas.io.json.build_table_schema \ - pandas.DataFrame.to_latex \ - pandas.io.formats.style.Styler.to_latex \ - pandas.read_parquet \ - pandas.DataFrame.to_sql \ - pandas.read_stata \ - pandas.core.resample.Resampler.pipe \ - pandas.core.resample.Resampler.fillna \ - pandas.core.resample.Resampler.interpolate \ - pandas.plotting.scatter_matrix \ - pandas.pivot \ - pandas.merge_asof \ - pandas.wide_to_long \ - pandas.Index.rename \ - pandas.Index.droplevel \ - pandas.Index.isin \ - pandas.CategoricalIndex.set_categories \ - pandas.MultiIndex.names \ - pandas.MultiIndex.droplevel \ - pandas.IndexSlice \ - pandas.DatetimeIndex.month_name \ - pandas.DatetimeIndex.day_name \ - pandas.core.window.rolling.Rolling.corr \ - pandas.Grouper \ - pandas.core.groupby.SeriesGroupBy.apply \ - pandas.core.groupby.DataFrameGroupBy.apply \ - pandas.core.groupby.SeriesGroupBy.transform \ - pandas.core.groupby.SeriesGroupBy.pipe \ - pandas.core.groupby.DataFrameGroupBy.pipe \ - pandas.core.groupby.DataFrameGroupBy.describe \ - pandas.core.groupby.DataFrameGroupBy.idxmax \ - pandas.core.groupby.DataFrameGroupBy.idxmin \ - pandas.core.groupby.DataFrameGroupBy.value_counts \ - pandas.core.groupby.SeriesGroupBy.describe \ - pandas.core.groupby.DataFrameGroupBy.boxplot \ - pandas.core.groupby.DataFrameGroupBy.hist \ - pandas.io.formats.style.Styler.map \ - pandas.io.formats.style.Styler.apply_index \ - pandas.io.formats.style.Styler.map_index \ - pandas.io.formats.style.Styler.format \ - pandas.io.formats.style.Styler.format_index \ - pandas.io.formats.style.Styler.relabel_index \ - pandas.io.formats.style.Styler.hide \ - pandas.io.formats.style.Styler.set_td_classes \ - pandas.io.formats.style.Styler.set_tooltips \ - pandas.io.formats.style.Styler.set_uuid \ - pandas.io.formats.style.Styler.pipe \ - pandas.io.formats.style.Styler.highlight_between \ - pandas.io.formats.style.Styler.highlight_quantile \ - pandas.io.formats.style.Styler.background_gradient \ - pandas.io.formats.style.Styler.text_gradient \ - pandas.DataFrame.values \ - pandas.DataFrame.loc \ - pandas.DataFrame.iloc \ - pandas.DataFrame.groupby \ - pandas.DataFrame.describe \ - pandas.DataFrame.skew \ - pandas.DataFrame.var \ - pandas.DataFrame.idxmax \ - pandas.DataFrame.idxmin \ - pandas.DataFrame.last \ - pandas.DataFrame.pivot \ - pandas.DataFrame.sort_values \ - pandas.DataFrame.tz_convert \ - pandas.DataFrame.tz_localize \ - pandas.DataFrame.plot.bar \ - pandas.DataFrame.plot.hexbin \ - pandas.DataFrame.plot.hist \ - pandas.DataFrame.plot.line \ - pandas.DataFrame.hist \ RET=$(($RET + $?)) ; echo $MSG "DONE" fi @@ -171,7 +278,7 @@ fi if [[ -z "$CHECK" || "$CHECK" == "notebooks" ]]; then MSG='Notebooks' ; echo $MSG - jupyter nbconvert --execute $(find doc/source -name '*.ipynb') --to notebook + jupyter nbconvert --execute "$(find doc/source 
-name '*.ipynb')" --to notebook RET=$(($RET + $?)) ; echo $MSG "DONE" fi diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-310-minimum_versions.yaml similarity index 54% rename from ci/deps/actions-39-minimum_versions.yaml rename to ci/deps/actions-310-minimum_versions.yaml index fd71315d2e7ac..f8522594f36f4 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ b/ci/deps/actions-310-minimum_versions.yaml @@ -4,63 +4,62 @@ name: pandas-dev channels: - conda-forge dependencies: - - python=3.9 + - python=3.10 # build dependencies - - versioneer[toml] + - versioneer - cython>=0.29.33 - - meson[ninja]=1.2.1 + - meson=1.2.1 - meson-python=0.13.1 # test dependencies - pytest>=7.3.2 - pytest-cov - - pytest-xdist>=2.2.0 - - pytest-localserver>=0.7.1 - - pytest-qt>=4.2.0 + - pytest-xdist>=3.4.0 + - pytest-localserver>=0.8.1 + - pytest-qt>=4.4.0 - boto3 # required dependencies - python-dateutil=2.8.2 - - numpy=1.22.4, <2 - - pytz=2020.1 + - numpy=1.23.5 # optional dependencies - - beautifulsoup4=4.11.2 - - blosc=1.21.3 + - beautifulsoup4=4.12.3 - bottleneck=1.3.6 - - fastparquet=2022.12.0 - - fsspec=2022.11.0 + - fastparquet=2024.2.0 + - fsspec=2023.12.2 - html5lib=1.1 - - hypothesis=6.46.1 - - gcsfs=2022.11.0 - - jinja2=3.1.2 + - hypothesis=6.84.0 + - gcsfs=2023.12.2 + - jinja2=3.1.3 - lxml=4.9.2 - - matplotlib=3.6.3 - - numba=0.56.4 - - numexpr=2.8.4 + - matplotlib=3.8.3 + - numba=0.59.0 + - numexpr=2.9.0 - odfpy=1.4.1 - qtpy=2.3.0 - - openpyxl=3.1.0 + - openpyxl=3.1.2 - psycopg2=2.9.6 - pyarrow=10.0.1 - - pymysql=1.0.2 + - pyiceberg=0.7.1 + - pymysql=1.1.0 - pyqt=5.15.9 - - pyreadstat=1.2.0 + - pyreadstat=1.2.6 - pytables=3.8.0 - python-calamine=0.1.7 + - pytz=2023.4 - pyxlsb=1.0.10 - - s3fs=2022.11.0 - - scipy=1.10.0 + - s3fs=2023.12.2 + - scipy=1.12.0 - sqlalchemy=2.0.0 - tabulate=0.9.0 - - xarray=2022.12.0 + - xarray=2024.1.1 - xlrd=2.0.1 - - xlsxwriter=3.0.5 - - zstandard=0.19.0 + - xlsxwriter=3.2.0 + - zstandard=0.22.0 - pip: - - adbc-driver-postgresql==0.8.0 + - adbc-driver-postgresql==0.10.0 - adbc-driver-sqlite==0.8.0 - - dataframe-api-compat==0.1.7 - tzdata==2022.7 diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 4b62ecc79e4ef..ea22bc411dedd 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -5,59 +5,59 @@ dependencies: - python=3.10 # build dependencies - - versioneer[toml] + - versioneer - cython>=0.29.33 - - meson[ninja]=1.2.1 + - meson=1.2.1 - meson-python=0.13.1 # test dependencies - pytest>=7.3.2 - pytest-cov - - pytest-xdist>=2.2.0 - - pytest-localserver>=0.7.1 - - pytest-qt>=4.2.0 + - pytest-xdist>=3.4.0 + - pytest-localserver>=0.8.1 + - pytest-qt>=4.4.0 - boto3 # required dependencies - python-dateutil - - numpy<2 - - pytz + - numpy # optional dependencies - - beautifulsoup4>=4.11.2 - - blosc>=1.21.3 + - beautifulsoup4>=4.12.3 - bottleneck>=1.3.6 - - fastparquet>=2022.12.0 - - fsspec>=2022.11.0 + - fastparquet>=2024.2.0 + - fsspec>=2023.12.2 - html5lib>=1.1 - - hypothesis>=6.46.1 - - gcsfs>=2022.11.0 - - jinja2>=3.1.2 + - hypothesis>=6.84.0 + - gcsfs>=2023.12.2 + - jinja2>=3.1.3 - lxml>=4.9.2 - - matplotlib>=3.6.3 - - numba>=0.56.4 - - numexpr>=2.8.4 + - matplotlib>=3.8.3 + - numba>=0.59.0 + - numexpr>=2.9.0 - odfpy>=1.4.1 - qtpy>=2.3.0 - - openpyxl>=3.1.0 + - openpyxl>=3.1.2 - psycopg2>=2.9.6 - pyarrow>=10.0.1 - - pymysql>=1.0.2 + - pyiceberg>=0.7.1 + - pymysql>=1.1.0 - pyqt>=5.15.9 - - pyreadstat>=1.2.0 + - pyreadstat>=1.2.6 - pytables>=3.8.0 - python-calamine>=0.1.7 + - pytz>=2023.4 - pyxlsb>=1.0.10 - - 
s3fs>=2022.11.0 - - scipy>=1.10.0 + - s3fs>=2023.12.2 + - scipy>=1.12.0 - sqlalchemy>=2.0.0 - tabulate>=0.9.0 - - xarray>=2022.12.0 + - xarray>=2024.1.1 - xlrd>=2.0.1 - - xlsxwriter>=3.0.5 - - zstandard>=0.19.0 + - xlsxwriter>=3.2.0 + - zstandard>=0.22.0 - pip: - - adbc-driver-postgresql>=0.8.0 + - adbc-driver-postgresql>=0.10.0 - adbc-driver-sqlite>=0.8.0 - tzdata>=2022.7 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index 95c0319d6f5b8..e981be9891dec 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -6,73 +6,70 @@ dependencies: - python=3.11 # build dependencies - - versioneer[toml] + - versioneer - cython>=0.29.33 - - meson[ninja]=1.2.1 + - meson=1.2.1 - meson-python=0.13.1 # test dependencies - pytest>=7.3.2 - pytest-cov - - pytest-xdist>=2.2.0 - - pytest-localserver>=0.7.1 - - pytest-qt>=4.2.0 + - pytest-xdist>=3.4.0 + - pytest-localserver>=0.8.1 + - pytest-qt>=4.4.0 - boto3 # required dependencies - python-dateutil - - numpy<2 - - pytz + - numpy # optional dependencies - - beautifulsoup4>=4.11.2 - - blosc>=1.21.3 + - beautifulsoup4>=4.12.3 - bottleneck>=1.3.6 - - fastparquet>=2022.12.0 - - fsspec>=2022.11.0 + - fastparquet>=2024.2.0 + - fsspec>=2023.12.2 - html5lib>=1.1 - - hypothesis>=6.46.1 - - gcsfs>=2022.11.0 - - jinja2>=3.1.2 + - hypothesis>=6.84.0 + - gcsfs>=2023.12.2 + - jinja2>=3.1.3 - lxml>=4.9.2 - - matplotlib>=3.6.3 - - numba>=0.56.4 - - numexpr>=2.8.4 + - matplotlib>=3.8.3 + - numba>=0.59.0 + - numexpr>=2.9.0 - odfpy>=1.4.1 - qtpy>=2.3.0 - - openpyxl>=3.1.0 + - openpyxl>=3.1.2 - psycopg2>=2.9.6 - pyarrow>=10.0.1 - - pymysql>=1.0.2 + - pyiceberg>=0.7.1 + - pymysql>=1.1.0 - pyqt>=5.15.9 - - pyreadstat>=1.2.0 + - pyreadstat>=1.2.6 - pytables>=3.8.0 - python-calamine>=0.1.7 + - pytz>=2023.4 - pyxlsb>=1.0.10 - - s3fs>=2022.11.0 - - scipy>=1.10.0 + - s3fs>=2023.12.2 + - scipy>=1.12.0 - sqlalchemy>=2.0.0 - tabulate>=0.9.0 - - xarray>=2022.12.0 + - xarray>=2024.1.1 - xlrd>=2.0.1 - - xlsxwriter>=3.0.5 - - zstandard>=0.19.0 + - xlsxwriter>=3.2.0 + - zstandard>=0.22.0 # downstream packages - botocore - cftime - dask - ipython - - geopandas-base - seaborn - scikit-learn - statsmodels - coverage - pandas-datareader - pyyaml - - py - pip: - - adbc-driver-postgresql>=0.8.0 + - adbc-driver-postgresql>=0.10.0 - adbc-driver-sqlite>=0.8.0 - - dataframe-api-compat>=0.1.7 - tzdata>=2022.7 diff --git a/ci/deps/actions-311-numpydev.yaml b/ci/deps/actions-311-numpydev.yaml index b62e8630f2059..325a6d45d74fd 100644 --- a/ci/deps/actions-311-numpydev.yaml +++ b/ci/deps/actions-311-numpydev.yaml @@ -5,23 +5,19 @@ dependencies: - python=3.11 # build dependencies - - versioneer[toml] - - meson[ninja]=1.2.1 + - versioneer + - meson=1.2.1 - meson-python=0.13.1 - cython>=0.29.33 # test dependencies - pytest>=7.3.2 - pytest-cov - # Once pytest-cov > 4 comes out, unpin this - # Right now, a DeprecationWarning related to rsyncdir - # causes an InternalError within pytest - - pytest-xdist>=2.2.0, <3 - - hypothesis>=6.46.1 + - pytest-xdist>=3.4.0 + - hypothesis>=6.84.0 # pandas dependencies - python-dateutil - - pytz - pip - pip: diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index 5455b9b84b034..2d3d11c294e12 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -5,26 +5,25 @@ dependencies: - python=3.11 # build dependencies - - versioneer[toml] - - meson[ninja]=1.2.1 + - versioneer + - meson=1.2.1 - 
cython>=0.29.33 - meson-python=0.13.1 # test dependencies - pytest>=7.3.2 - pytest-cov - - pytest-xdist>=2.2.0 - - hypothesis>=6.46.1 + - pytest-xdist>=3.4.0 + - hypothesis>=6.84.0 # required dependencies - python-dateutil - - numpy<2 - - pytz + - numpy - pip - pip: - "tzdata>=2022.7" - - "--extra-index-url https://pypi.fury.io/arrow-nightlies/" + - "--extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" - "--prefer-binary" - "--pre" - "pyarrow" diff --git a/ci/deps/actions-311-sanitizers.yaml b/ci/deps/actions-311-sanitizers.yaml deleted file mode 100644 index dcd381066b0ea..0000000000000 --- a/ci/deps/actions-311-sanitizers.yaml +++ /dev/null @@ -1,32 +0,0 @@ -name: pandas-dev -channels: - - conda-forge -dependencies: - - python=3.11 - - # build dependencies - - versioneer[toml] - - cython>=0.29.33 - - meson[ninja]=1.2.1 - - meson-python=0.13.1 - - # test dependencies - - pytest>=7.3.2 - - pytest-cov - - pytest-xdist>=2.2.0 - - pytest-localserver>=0.7.1 - - pytest-qt>=4.2.0 - - boto3 - - hypothesis>=6.46.1 - - pyqt>=5.15.9 - - # required dependencies - - python-dateutil - - numpy<2 - - pytz - - # pandas dependencies - - pip - - - pip: - - "tzdata>=2022.7" diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 52074ae00ea18..f03d518fd22fb 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -5,59 +5,58 @@ dependencies: - python=3.11 # build dependencies - - versioneer[toml] + - versioneer - cython>=0.29.33 - - meson[ninja]=1.2.1 + - meson=1.2.1 - meson-python=0.13.1 # test dependencies - pytest>=7.3.2 - pytest-cov - - pytest-xdist>=2.2.0 - - pytest-localserver>=0.7.1 - - pytest-qt>=4.2.0 + - pytest-xdist>=3.4.0 + - pytest-localserver>=0.8.1 + - pytest-qt>=4.4.0 - boto3 # required dependencies - python-dateutil - - numpy<2 - - pytz + - numpy # optional dependencies - - beautifulsoup4>=4.11.2 - - blosc>=1.21.3 + - beautifulsoup4>=4.12.3 - bottleneck>=1.3.6 - - fastparquet>=2022.12.0 - - fsspec>=2022.11.0 + - fastparquet>=2024.2.0 + - fsspec>=2023.12.2 - html5lib>=1.1 - - hypothesis>=6.46.1 - - gcsfs>=2022.11.0 - - jinja2>=3.1.2 + - hypothesis>=6.84.0 + - gcsfs>=2023.12.2 + - jinja2>=3.1.3 - lxml>=4.9.2 - - matplotlib>=3.6.3 - - numba>=0.56.4 - - numexpr>=2.8.4 + - matplotlib>=3.8.3 + - numba>=0.59.0 + - numexpr>=2.9.0 - odfpy>=1.4.1 - qtpy>=2.3.0 - pyqt>=5.15.9 - - openpyxl>=3.1.0 + - openpyxl>=3.1.2 - psycopg2>=2.9.6 - pyarrow>=10.0.1 - - pymysql>=1.0.2 - - pyreadstat>=1.2.0 + - pyiceberg>=0.7.1 + - pymysql>=1.1.0 + - pyreadstat>=1.2.6 - pytables>=3.8.0 - python-calamine>=0.1.7 + - pytz>=2023.4 - pyxlsb>=1.0.10 - - s3fs>=2022.11.0 - - scipy>=1.10.0 + - s3fs>=2023.12.2 + - scipy>=1.12.0 - sqlalchemy>=2.0.0 - tabulate>=0.9.0 - - xarray>=2022.12.0 + - xarray>=2024.1.1 - xlrd>=2.0.1 - - xlsxwriter>=3.0.5 - - zstandard>=0.19.0 + - xlsxwriter>=3.2.0 + - zstandard>=0.22.0 - pip: - - adbc-driver-postgresql>=0.8.0 + - adbc-driver-postgresql>=0.10.0 - adbc-driver-sqlite>=0.8.0 - - tzdata>=2022.7 diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml index 4c51e9e6029e3..58c6c2ca3210c 100644 --- a/ci/deps/actions-312.yaml +++ b/ci/deps/actions-312.yaml @@ -5,59 +5,59 @@ dependencies: - python=3.12 # build dependencies - - versioneer[toml] + - versioneer - cython>=0.29.33 - - meson[ninja]=1.2.1 + - meson=1.2.1 - meson-python=0.13.1 # test dependencies - pytest>=7.3.2 - pytest-cov - - pytest-xdist>=2.2.0 - - pytest-localserver>=0.7.1 - - pytest-qt>=4.2.0 + - pytest-xdist>=3.4.0 + - pytest-localserver>=0.8.1 + - pytest-qt>=4.4.0 
- boto3 # required dependencies - python-dateutil - - numpy<2 - - pytz + - numpy # optional dependencies - - beautifulsoup4>=4.11.2 - - blosc>=1.21.3 + - beautifulsoup4>=4.12.3 - bottleneck>=1.3.6 - - fastparquet>=2022.12.0 - - fsspec>=2022.11.0 + - fastparquet>=2024.2.0 + - fsspec>=2023.12.2 - html5lib>=1.1 - - hypothesis>=6.46.1 - - gcsfs>=2022.11.0 - - jinja2>=3.1.2 + - hypothesis>=6.84.0 + - gcsfs>=2023.12.2 + - jinja2>=3.1.3 - lxml>=4.9.2 - - matplotlib>=3.6.3 - # - numba>=0.56.4 - - numexpr>=2.8.4 + - matplotlib>=3.8.3 + - numba>=0.59.0 + - numexpr>=2.9.0 - odfpy>=1.4.1 - qtpy>=2.3.0 - pyqt>=5.15.9 - - openpyxl>=3.1.0 + - openpyxl>=3.1.2 - psycopg2>=2.9.6 - pyarrow>=10.0.1 - - pymysql>=1.0.2 - - pyreadstat>=1.2.0 - # - pytables>=3.8.0 + - pyiceberg>=0.7.1 + - pymysql>=1.1.0 + - pyreadstat>=1.2.6 + - pytables>=3.8.0 - python-calamine>=0.1.7 + - pytz>=2023.4 - pyxlsb>=1.0.10 - - s3fs>=2022.11.0 - - scipy>=1.10.0 + - s3fs>=2023.12.2 + - scipy>=1.12.0 - sqlalchemy>=2.0.0 - tabulate>=0.9.0 - - xarray>=2022.12.0 + - xarray>=2024.1.1 - xlrd>=2.0.1 - - xlsxwriter>=3.0.5 - - zstandard>=0.19.0 + - xlsxwriter>=3.2.0 + - zstandard>=0.22.0 - pip: - - adbc-driver-postgresql>=0.8.0 + - adbc-driver-postgresql>=0.10.0 - adbc-driver-sqlite>=0.8.0 - tzdata>=2022.7 diff --git a/ci/deps/actions-313-freethreading.yaml b/ci/deps/actions-313-freethreading.yaml new file mode 100644 index 0000000000000..d10c70bc1fbfb --- /dev/null +++ b/ci/deps/actions-313-freethreading.yaml @@ -0,0 +1,30 @@ +name: pandas-dev-313-freethreading +channels: + - conda-forge +dependencies: + - python-freethreading + + # build dependencies + - setuptools + - versioneer + - meson=1.8.0 + - meson-python=0.18.0 + + # test dependencies + - pytest>=7.3.2 + - pytest-xdist>=3.4.0 + + # required dependencies + - python-dateutil + - numpy + + # optional dependencies + - hypothesis>=6.84.0 + + # Move Cython to build dependencies, once they release a version that supports freethreading + - pip: + # No free-threaded coveragepy (with the C-extension) on conda-forge yet + - pytest-cov + - "tzdata>=2022.7" + - "--extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" + - "cython" diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-313.yaml similarity index 51% rename from ci/deps/actions-39.yaml rename to ci/deps/actions-313.yaml index cbe8f77c15730..f94138a98e127 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-313.yaml @@ -1,63 +1,63 @@ -name: pandas-dev +name: pandas-dev-313 channels: - conda-forge dependencies: - - python=3.9 + - python=3.13 # build dependencies - - versioneer[toml] + - versioneer - cython>=0.29.33 - - meson[ninja]=1.2.1 + - meson=1.2.1 - meson-python=0.13.1 # test dependencies - pytest>=7.3.2 - pytest-cov - - pytest-xdist>=2.2.0 - - pytest-localserver>=0.7.1 - - pytest-qt>=4.2.0 + - pytest-xdist>=3.4.0 + - pytest-localserver>=0.8.1 + - pytest-qt>=4.4.0 - boto3 # required dependencies - python-dateutil - - numpy<2 - - pytz + - numpy # optional dependencies - - beautifulsoup4>=4.11.2 + - beautifulsoup4>=4.12.3 - blosc>=1.21.3 - bottleneck>=1.3.6 - - fastparquet>=2022.12.0 - - fsspec>=2022.11.0 + - fastparquet>=2024.2.0 + - fsspec>=2023.12.2 - html5lib>=1.1 - - hypothesis>=6.46.1 - - gcsfs>=2022.11.0 - - jinja2>=3.1.2 + - hypothesis>=6.84.0 + - gcsfs>=2023.12.2 + - jinja2>=3.1.3 - lxml>=4.9.2 - - matplotlib>=3.6.3 - - numba>=0.56.4 - - numexpr>=2.8.4 + - matplotlib>=3.8.3 + - numba>=0.59.0 + - numexpr>=2.9.0 - odfpy>=1.4.1 - qtpy>=2.3.0 - - openpyxl>=3.1.0 + - pyqt>=5.15.9 + - 
openpyxl>=3.1.2 - psycopg2>=2.9.6 - pyarrow>=10.0.1 - - pymysql>=1.0.2 - - pyqt>=5.15.9 - - pyreadstat>=1.2.0 + - pymysql>=1.1.0 + - pyreadstat>=1.2.6 - pytables>=3.8.0 - python-calamine>=0.1.7 + - pytz>=2023.4 - pyxlsb>=1.0.10 - - s3fs>=2022.11.0 - - scipy>=1.10.0 + - s3fs>=2023.12.2 + - scipy>=1.12.0 - sqlalchemy>=2.0.0 - tabulate>=0.9.0 - - xarray>=2022.12.0 + - xarray>=2024.1.1 - xlrd>=2.0.1 - - xlsxwriter>=3.0.5 - - zstandard>=0.19.0 + - xlsxwriter>=3.2.0 + - zstandard>=0.22.0 - pip: - - adbc-driver-postgresql>=0.8.0 + - adbc-driver-postgresql>=0.10.0 - adbc-driver-sqlite>=0.8.0 - tzdata>=2022.7 diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml index 5a5a01f7aec72..90933b24b88db 100644 --- a/ci/deps/actions-pypy-39.yaml +++ b/ci/deps/actions-pypy-39.yaml @@ -8,20 +8,19 @@ dependencies: - python=3.9[build=*_pypy] # build dependencies - - versioneer[toml] + - versioneer - cython>=0.29.33 - - meson[ninja]=1.2.1 + - meson=1.2.1 - meson-python=0.13.1 # test dependencies - pytest>=7.3.2 - pytest-cov - - pytest-xdist>=2.2.0 - - hypothesis>=6.46.1 + - pytest-xdist>=3.4.0 + - hypothesis>=6.84.0 # required - - numpy<2 + - numpy - python-dateutil - - pytz - pip: - tzdata>=2022.7 diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml deleted file mode 100644 index 8e106445cd4e0..0000000000000 --- a/ci/deps/circle-310-arm64.yaml +++ /dev/null @@ -1,61 +0,0 @@ -name: pandas-dev -channels: - - conda-forge -dependencies: - - python=3.10 - - # build dependencies - - versioneer[toml] - - cython>=0.29.33 - - meson[ninja]=1.2.1 - - meson-python=0.13.1 - - # test dependencies - - pytest>=7.3.2 - - pytest-cov - - pytest-xdist>=2.2.0 - - pytest-localserver>=0.7.1 - - pytest-qt>=4.2.0 - - boto3 - - # required dependencies - - python-dateutil - - numpy<2 - - pytz - - # optional dependencies - - beautifulsoup4>=4.11.2 - - blosc>=1.21.3 - - bottleneck>=1.3.6 - - fastparquet>=2022.12.0 - - fsspec>=2022.11.0 - - html5lib>=1.1 - - hypothesis>=6.46.1 - - gcsfs>=2022.11.0 - - jinja2>=3.1.2 - - lxml>=4.9.2 - - matplotlib>=3.6.3 - - numba>=0.56.4 - - numexpr>=2.8.4 - - odfpy>=1.4.1 - - qtpy>=2.3.0 - - openpyxl>=3.1.0 - - psycopg2>=2.9.6 - - pyarrow>=10.0.1 - - pymysql>=1.0.2 - - pyqt>=5.15.9 - - pyreadstat>=1.2.0 - - pytables>=3.8.0 - - python-calamine>=0.1.7 - - pyxlsb>=1.0.10 - - s3fs>=2022.11.0 - - scipy>=1.10.0 - - sqlalchemy>=2.0.0 - - tabulate>=0.9.0 - - xarray>=2022.12.0 - - xlrd>=2.0.1 - - xlsxwriter>=3.0.5 - - zstandard>=0.19.0 - - pip: - - adbc-driver-postgresql>=0.8.0 - - adbc-driver-sqlite>=0.8.0 diff --git a/ci/meta.yaml b/ci/meta.yaml index aac5593e493b7..a4c9e8189f082 100644 --- a/ci/meta.yaml +++ b/ci/meta.yaml @@ -37,7 +37,6 @@ requirements: - numpy >=1.21.6 # [py<311] - numpy >=1.23.2 # [py>=311] - python-dateutil >=2.8.2 - - pytz >=2020.1 - python-tzdata >=2022.7 test: @@ -64,9 +63,9 @@ test: requires: - pip - pytest >=7.3.2 - - pytest-xdist >=2.2.0 + - pytest-xdist >=3.4.0 - pytest-cov - - hypothesis >=6.46.1 + - hypothesis >=6.84.0 - tomli # [py<311] about: @@ -90,4 +89,3 @@ extra: - datapythonista - phofl - lithomas1 - - marcogorelli diff --git a/ci/run_tests.sh b/ci/run_tests.sh index 48ef21686a26f..16292beec612b 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -3,18 +3,16 @@ # Workaround for pytest-xdist (it collects different tests in the workers if PYTHONHASHSEED is not set) # https://github.com/pytest-dev/pytest/issues/920 # https://github.com/pytest-dev/pytest/issues/1075 -export PYTHONHASHSEED=$(python -c 'import random; 
print(random.randint(1, 4294967295))') - -# May help reproduce flaky CI builds if set in subsequent runs -echo PYTHONHASHSEED=$PYTHONHASHSEED +PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))') +export PYTHONHASHSEED COVERAGE="-s --cov=pandas --cov-report=xml --cov-append --cov-config=pyproject.toml" -PYTEST_CMD="MESONPY_EDITABLE_VERBOSE=1 PYTHONDEVMODE=1 PYTHONWARNDEFAULTENCODING=1 pytest -r fEs -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE $PYTEST_TARGET" +PYTEST_CMD="MESONPY_EDITABLE_VERBOSE=1 PYTHONDEVMODE=1 PYTHONWARNDEFAULTENCODING=1 pytest -r fE -n $PYTEST_WORKERS --dist=worksteal $TEST_ARGS $COVERAGE $PYTEST_TARGET" if [[ "$PATTERN" ]]; then PYTEST_CMD="$PYTEST_CMD -m \"$PATTERN\"" fi -echo $PYTEST_CMD +echo "$PYTEST_CMD" sh -c "$PYTEST_CMD" diff --git a/ci/upload_wheels.sh b/ci/upload_wheels.sh index 3c4aa76c02003..c7c7ca00ee466 100644 --- a/ci/upload_wheels.sh +++ b/ci/upload_wheels.sh @@ -1,3 +1,4 @@ +#!/bin/bash # Modified from numpy's https://github.com/numpy/numpy/blob/main/tools/wheels/upload_wheels.sh set_upload_vars() { @@ -19,20 +20,20 @@ set_upload_vars() { fi } upload_wheels() { - echo ${PWD} + echo "${PWD}" if [[ ${ANACONDA_UPLOAD} == true ]]; then - if [ -z ${TOKEN} ]; then + if [ -z "${TOKEN}" ]; then echo no token set, not uploading else # sdists are located under dist folder when built through setup.py if compgen -G "./dist/*.gz"; then echo "Found sdist" - anaconda -q -t ${TOKEN} upload --skip -u ${ANACONDA_ORG} ./dist/*.gz + anaconda -q -t "${TOKEN}" upload --skip -u "${ANACONDA_ORG}" ./dist/*.gz echo "Uploaded sdist" fi if compgen -G "./wheelhouse/*.whl"; then echo "Found wheel" - anaconda -q -t ${TOKEN} upload --skip -u ${ANACONDA_ORG} ./wheelhouse/*.whl + anaconda -q -t "${TOKEN}" upload --skip -u "${ANACONDA_ORG}" ./wheelhouse/*.whl echo "Uploaded wheel" fi echo "PyPI-style index: https://pypi.anaconda.org/$ANACONDA_ORG/simple" diff --git a/doc/cheatsheet/Pandas_Cheat_Sheet.pdf b/doc/cheatsheet/Pandas_Cheat_Sheet.pdf index 3582e0c0dabf9..87f3e9252cca2 100644 Binary files a/doc/cheatsheet/Pandas_Cheat_Sheet.pdf and b/doc/cheatsheet/Pandas_Cheat_Sheet.pdf differ diff --git a/doc/cheatsheet/Pandas_Cheat_Sheet.pptx b/doc/cheatsheet/Pandas_Cheat_Sheet.pptx index 746f508516964..0cefa8958d5d4 100644 Binary files a/doc/cheatsheet/Pandas_Cheat_Sheet.pptx and b/doc/cheatsheet/Pandas_Cheat_Sheet.pptx differ diff --git a/doc/cheatsheet/Pandas_Cheat_Sheet_FA.pdf b/doc/cheatsheet/Pandas_Cheat_Sheet_FA.pdf new file mode 100644 index 0000000000000..ea356385e9fb1 Binary files /dev/null and b/doc/cheatsheet/Pandas_Cheat_Sheet_FA.pdf differ diff --git a/doc/cheatsheet/Pandas_Cheat_Sheet_FA.pptx b/doc/cheatsheet/Pandas_Cheat_Sheet_FA.pptx new file mode 100644 index 0000000000000..1b812c1a2595a Binary files /dev/null and b/doc/cheatsheet/Pandas_Cheat_Sheet_FA.pptx differ diff --git a/doc/cheatsheet/README.md b/doc/cheatsheet/README.md index 6c33de104ed90..b72c093b4ba2f 100644 --- a/doc/cheatsheet/README.md +++ b/doc/cheatsheet/README.md @@ -6,11 +6,13 @@ and pick "PDF" as the format. This cheat sheet, originally written by Irv Lustig, [Princeton Consultants](https://www.princetonoptimization.com/), was inspired by the [RStudio Data Wrangling Cheatsheet](https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf). 
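The free-threaded environment file added above is self-contained, so it can be exercised outside CI as well. A minimal sketch, assuming conda is on PATH and the command is run from the repository root (the environment name comes from the `name:` field of the new file):

    # Create and activate the free-threaded Python 3.13 environment
    conda env create --file=ci/deps/actions-313-freethreading.yaml
    conda activate pandas-dev-313-freethreading
    # Sanity check: on 3.13+ this reports whether the GIL is currently enabled
    python -c "import sys; print(sys._is_gil_enabled())"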
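The `run_tests.sh` rewrite above follows the assign-then-export pattern that shellcheck recommends (SC2155): `export VAR=$(cmd)` returns the exit status of `export`, masking a failure of the command substitution, so the two steps are split; expansions such as `$PYTEST_CMD` are quoted to prevent word splitting. A minimal standalone sketch of the same idiom (the `SEED` name is illustrative, not taken from the scripts):

    #!/bin/bash -e
    # Assign first: under `set -e`, a failing command substitution aborts here
    SEED=$(python -c 'import random; print(random.randint(1, 4294967295))')
    # Export separately so the substitution's exit status is not masked
    export SEED
    # Quote expansions so values containing spaces survive intact
    echo "seed=$SEED"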
-| Topic | PDF | PPT | -|------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Pandas_Cheat_Sheet | | | -| Pandas_Cheat_Sheet_JA | | | +| Topic | Language | PDF | PPT | +|------------------------|-------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Pandas_Cheat_Sheet | English | | | +| Pandas_Cheat_Sheet_JA | Japanese | | | +| Pandas_Cheat_Sheet_FA | Persian | | | +The English version has additional material that is not in the versions in other languages. **Alternative** diff --git a/doc/data/titanic.csv b/doc/data/titanic.csv index 5cc466e97cf12..0f7d184728a17 100644 --- a/doc/data/titanic.csv +++ b/doc/data/titanic.csv @@ -1,93 +1,93 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S 2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38,1,0,PC 17599,71.2833,C85,C -3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S +3,1,3,"Heikkinen, Miss Laina",female,26,0,0,STON/O2. 3101282,7.925,,S 4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S 5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S 6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q 7,0,1,"McCarthy, Mr. Timothy J",male,54,0,0,17463,51.8625,E46,S -8,0,3,"Palsson, Master. Gosta Leonard",male,2,3,1,349909,21.075,,S +8,0,3,"Palsson, Master Gosta Leonard",male,2,3,1,349909,21.075,,S 9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27,0,2,347742,11.1333,,S 10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14,1,0,237736,30.0708,,C -11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4,1,1,PP 9549,16.7,G6,S -12,1,1,"Bonnell, Miss. Elizabeth",female,58,0,0,113783,26.55,C103,S +11,1,3,"Sandstrom, Miss Marguerite Rut",female,4,1,1,PP 9549,16.7,G6,S +12,1,1,"Bonnell, Miss Elizabeth",female,58,0,0,113783,26.55,C103,S 13,0,3,"Saundercock, Mr. William Henry",male,20,0,0,A/5. 2151,8.05,,S 14,0,3,"Andersson, Mr. Anders Johan",male,39,1,5,347082,31.275,,S -15,0,3,"Vestrom, Miss. Hulda Amanda Adolfina",female,14,0,0,350406,7.8542,,S +15,0,3,"Vestrom, Miss Hulda Amanda Adolfina",female,14,0,0,350406,7.8542,,S 16,1,2,"Hewlett, Mrs. (Mary D Kingcome) ",female,55,0,0,248706,16,,S -17,0,3,"Rice, Master. Eugene",male,2,4,1,382652,29.125,,Q +17,0,3,"Rice, Master Eugene",male,2,4,1,382652,29.125,,Q 18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13,,S 19,0,3,"Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)",female,31,1,0,345763,18,,S 20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.225,,C 21,0,2,"Fynney, Mr. Joseph J",male,35,0,0,239865,26,,S 22,1,2,"Beesley, Mr. 
Lawrence",male,34,0,0,248698,13,D56,S -23,1,3,"McGowan, Miss. Anna ""Annie""",female,15,0,0,330923,8.0292,,Q +23,1,3,"McGowan, Miss Anna ""Annie""",female,15,0,0,330923,8.0292,,Q 24,1,1,"Sloper, Mr. William Thompson",male,28,0,0,113788,35.5,A6,S -25,0,3,"Palsson, Miss. Torborg Danira",female,8,3,1,349909,21.075,,S +25,0,3,"Palsson, Miss Torborg Danira",female,8,3,1,349909,21.075,,S 26,1,3,"Asplund, Mrs. Carl Oscar (Selma Augusta Emilia Johansson)",female,38,1,5,347077,31.3875,,S 27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.225,,C 28,0,1,"Fortune, Mr. Charles Alexander",male,19,3,2,19950,263,C23 C25 C27,S -29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q +29,1,3,"O'Dwyer, Miss Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q 30,0,3,"Todoroff, Mr. Lalio",male,,0,0,349216,7.8958,,S 31,0,1,"Uruchurtu, Don. Manuel E",male,40,0,0,PC 17601,27.7208,,C 32,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,,1,0,PC 17569,146.5208,B78,C -33,1,3,"Glynn, Miss. Mary Agatha",female,,0,0,335677,7.75,,Q +33,1,3,"Glynn, Miss Mary Agatha",female,,0,0,335677,7.75,,Q 34,0,2,"Wheadon, Mr. Edward H",male,66,0,0,C.A. 24579,10.5,,S 35,0,1,"Meyer, Mr. Edgar Joseph",male,28,1,0,PC 17604,82.1708,,C 36,0,1,"Holverson, Mr. Alexander Oskar",male,42,1,0,113789,52,,S 37,1,3,"Mamee, Mr. Hanna",male,,0,0,2677,7.2292,,C 38,0,3,"Cann, Mr. Ernest Charles",male,21,0,0,A./5. 2152,8.05,,S -39,0,3,"Vander Planke, Miss. Augusta Maria",female,18,2,0,345764,18,,S -40,1,3,"Nicola-Yarred, Miss. Jamila",female,14,1,0,2651,11.2417,,C +39,0,3,"Vander Planke, Miss Augusta Maria",female,18,2,0,345764,18,,S +40,1,3,"Nicola-Yarred, Miss Jamila",female,14,1,0,2651,11.2417,,C 41,0,3,"Ahlin, Mrs. Johan (Johanna Persdotter Larsson)",female,40,1,0,7546,9.475,,S 42,0,2,"Turpin, Mrs. William John Robert (Dorothy Ann Wonnacott)",female,27,1,0,11668,21,,S 43,0,3,"Kraeff, Mr. Theodor",male,,0,0,349253,7.8958,,C -44,1,2,"Laroche, Miss. Simonne Marie Anne Andree",female,3,1,2,SC/Paris 2123,41.5792,,C -45,1,3,"Devaney, Miss. Margaret Delia",female,19,0,0,330958,7.8792,,Q +44,1,2,"Laroche, Miss Simonne Marie Anne Andree",female,3,1,2,SC/Paris 2123,41.5792,,C +45,1,3,"Devaney, Miss Margaret Delia",female,19,0,0,330958,7.8792,,Q 46,0,3,"Rogers, Mr. William John",male,,0,0,S.C./A.4. 23567,8.05,,S 47,0,3,"Lennon, Mr. Denis",male,,1,0,370371,15.5,,Q -48,1,3,"O'Driscoll, Miss. Bridget",female,,0,0,14311,7.75,,Q +48,1,3,"O'Driscoll, Miss Bridget",female,,0,0,14311,7.75,,Q 49,0,3,"Samaan, Mr. Youssef",male,,2,0,2662,21.6792,,C 50,0,3,"Arnold-Franchi, Mrs. Josef (Josefine Franchi)",female,18,1,0,349237,17.8,,S -51,0,3,"Panula, Master. Juha Niilo",male,7,4,1,3101295,39.6875,,S +51,0,3,"Panula, Master Juha Niilo",male,7,4,1,3101295,39.6875,,S 52,0,3,"Nosworthy, Mr. Richard Cater",male,21,0,0,A/4. 39886,7.8,,S 53,1,1,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",female,49,1,0,PC 17572,76.7292,D33,C 54,1,2,"Faunthorpe, Mrs. Lizzie (Elizabeth Anne Wilkinson)",female,29,1,0,2926,26,,S 55,0,1,"Ostby, Mr. Engelhart Cornelius",male,65,0,1,113509,61.9792,B30,C 56,1,1,"Woolner, Mr. Hugh",male,,0,0,19947,35.5,C52,S -57,1,2,"Rugg, Miss. Emily",female,21,0,0,C.A. 31026,10.5,,S +57,1,2,"Rugg, Miss Emily",female,21,0,0,C.A. 31026,10.5,,S 58,0,3,"Novel, Mr. Mansouer",male,28.5,0,0,2697,7.2292,,C -59,1,2,"West, Miss. Constance Mirium",female,5,1,2,C.A. 34651,27.75,,S -60,0,3,"Goodwin, Master. William Frederick",male,11,5,2,CA 2144,46.9,,S +59,1,2,"West, Miss Constance Mirium",female,5,1,2,C.A. 
34651,27.75,,S +60,0,3,"Goodwin, Master William Frederick",male,11,5,2,CA 2144,46.9,,S 61,0,3,"Sirayanian, Mr. Orsen",male,22,0,0,2669,7.2292,,C -62,1,1,"Icard, Miss. Amelie",female,38,0,0,113572,80,B28, +62,1,1,"Icard, Miss Amelie",female,38,0,0,113572,80,B28, 63,0,1,"Harris, Mr. Henry Birkhardt",male,45,1,0,36973,83.475,C83,S -64,0,3,"Skoog, Master. Harald",male,4,3,2,347088,27.9,,S +64,0,3,"Skoog, Master Harald",male,4,3,2,347088,27.9,,S 65,0,1,"Stewart, Mr. Albert A",male,,0,0,PC 17605,27.7208,,C -66,1,3,"Moubarek, Master. Gerios",male,,1,1,2661,15.2458,,C +66,1,3,"Moubarek, Master Gerios",male,,1,1,2661,15.2458,,C 67,1,2,"Nye, Mrs. (Elizabeth Ramell)",female,29,0,0,C.A. 29395,10.5,F33,S 68,0,3,"Crease, Mr. Ernest James",male,19,0,0,S.P. 3464,8.1583,,S -69,1,3,"Andersson, Miss. Erna Alexandra",female,17,4,2,3101281,7.925,,S +69,1,3,"Andersson, Miss Erna Alexandra",female,17,4,2,3101281,7.925,,S 70,0,3,"Kink, Mr. Vincenz",male,26,2,0,315151,8.6625,,S 71,0,2,"Jenkin, Mr. Stephen Curnow",male,32,0,0,C.A. 33111,10.5,,S -72,0,3,"Goodwin, Miss. Lillian Amy",female,16,5,2,CA 2144,46.9,,S +72,0,3,"Goodwin, Miss Lillian Amy",female,16,5,2,CA 2144,46.9,,S 73,0,2,"Hood, Mr. Ambrose Jr",male,21,0,0,S.O.C. 14879,73.5,,S 74,0,3,"Chronopoulos, Mr. Apostolos",male,26,1,0,2680,14.4542,,C 75,1,3,"Bing, Mr. Lee",male,32,0,0,1601,56.4958,,S 76,0,3,"Moen, Mr. Sigurd Hansen",male,25,0,0,348123,7.65,F G73,S 77,0,3,"Staneff, Mr. Ivan",male,,0,0,349208,7.8958,,S 78,0,3,"Moutal, Mr. Rahamin Haim",male,,0,0,374746,8.05,,S -79,1,2,"Caldwell, Master. Alden Gates",male,0.83,0,2,248738,29,,S -80,1,3,"Dowdell, Miss. Elizabeth",female,30,0,0,364516,12.475,,S +79,1,2,"Caldwell, Master Alden Gates",male,0.83,0,2,248738,29,,S +80,1,3,"Dowdell, Miss Elizabeth",female,30,0,0,364516,12.475,,S 81,0,3,"Waelens, Mr. Achille",male,22,0,0,345767,9,,S 82,1,3,"Sheerlinck, Mr. Jan Baptist",male,29,0,0,345779,9.5,,S -83,1,3,"McDermott, Miss. Brigdet Delia",female,,0,0,330932,7.7875,,Q +83,1,3,"McDermott, Miss Brigdet Delia",female,,0,0,330932,7.7875,,Q 84,0,1,"Carrau, Mr. Francisco M",male,28,0,0,113059,47.1,,S -85,1,2,"Ilett, Miss. Bertha",female,17,0,0,SO/C 14885,10.5,,S +85,1,2,"Ilett, Miss Bertha",female,17,0,0,SO/C 14885,10.5,,S 86,1,3,"Backstrom, Mrs. Karl Alfred (Maria Mathilda Gustafsson)",female,33,3,0,3101278,15.85,,S 87,0,3,"Ford, Mr. William Neal",male,16,1,3,W./C. 6608,34.375,,S 88,0,3,"Slocovski, Mr. Selman Francis",male,,0,0,SOTON/OQ 392086,8.05,,S -89,1,1,"Fortune, Miss. Mabel Helen",female,23,3,2,19950,263,C23 C25 C27,S +89,1,1,"Fortune, Miss Mabel Helen",female,23,3,2,19950,263,C23 C25 C27,S 90,0,3,"Celotti, Mr. Francesco",male,24,0,0,343275,8.05,,S 91,0,3,"Christmann, Mr. Emil",male,29,0,0,343276,8.05,,S 92,0,3,"Andreasson, Mr. Paul Edvin",male,20,0,0,347466,7.8542,,S @@ -99,35 +99,35 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 98,1,1,"Greenfield, Mr. William Bertram",male,23,0,1,PC 17759,63.3583,D10 D12,C 99,1,2,"Doling, Mrs. John T (Ada Julia Bone)",female,34,0,1,231919,23,,S 100,0,2,"Kantor, Mr. Sinai",male,34,1,0,244367,26,,S -101,0,3,"Petranec, Miss. Matilda",female,28,0,0,349245,7.8958,,S +101,0,3,"Petranec, Miss Matilda",female,28,0,0,349245,7.8958,,S 102,0,3,"Petroff, Mr. Pastcho (""Pentcho"")",male,,0,0,349215,7.8958,,S 103,0,1,"White, Mr. Richard Frasar",male,21,0,1,35281,77.2875,D26,S 104,0,3,"Johansson, Mr. Gustaf Joel",male,33,0,0,7540,8.6542,,S 105,0,3,"Gustafsson, Mr. Anders Vilhelm",male,37,2,0,3101276,7.925,,S 106,0,3,"Mionoff, Mr. 
Stoytcho",male,28,0,0,349207,7.8958,,S -107,1,3,"Salkjelsvik, Miss. Anna Kristine",female,21,0,0,343120,7.65,,S +107,1,3,"Salkjelsvik, Miss Anna Kristine",female,21,0,0,343120,7.65,,S 108,1,3,"Moss, Mr. Albert Johan",male,,0,0,312991,7.775,,S 109,0,3,"Rekic, Mr. Tido",male,38,0,0,349249,7.8958,,S -110,1,3,"Moran, Miss. Bertha",female,,1,0,371110,24.15,,Q +110,1,3,"Moran, Miss Bertha",female,,1,0,371110,24.15,,Q 111,0,1,"Porter, Mr. Walter Chamberlain",male,47,0,0,110465,52,C110,S -112,0,3,"Zabour, Miss. Hileni",female,14.5,1,0,2665,14.4542,,C +112,0,3,"Zabour, Miss Hileni",female,14.5,1,0,2665,14.4542,,C 113,0,3,"Barton, Mr. David John",male,22,0,0,324669,8.05,,S -114,0,3,"Jussila, Miss. Katriina",female,20,1,0,4136,9.825,,S -115,0,3,"Attalah, Miss. Malake",female,17,0,0,2627,14.4583,,C +114,0,3,"Jussila, Miss Katriina",female,20,1,0,4136,9.825,,S +115,0,3,"Attalah, Miss Malake",female,17,0,0,2627,14.4583,,C 116,0,3,"Pekoniemi, Mr. Edvard",male,21,0,0,STON/O 2. 3101294,7.925,,S 117,0,3,"Connors, Mr. Patrick",male,70.5,0,0,370369,7.75,,Q 118,0,2,"Turpin, Mr. William John Robert",male,29,1,0,11668,21,,S 119,0,1,"Baxter, Mr. Quigg Edmond",male,24,0,1,PC 17558,247.5208,B58 B60,C -120,0,3,"Andersson, Miss. Ellis Anna Maria",female,2,4,2,347082,31.275,,S +120,0,3,"Andersson, Miss Ellis Anna Maria",female,2,4,2,347082,31.275,,S 121,0,2,"Hickman, Mr. Stanley George",male,21,2,0,S.O.C. 14879,73.5,,S 122,0,3,"Moore, Mr. Leonard Charles",male,,0,0,A4. 54510,8.05,,S 123,0,2,"Nasser, Mr. Nicholas",male,32.5,1,0,237736,30.0708,,C -124,1,2,"Webber, Miss. Susan",female,32.5,0,0,27267,13,E101,S +124,1,2,"Webber, Miss Susan",female,32.5,0,0,27267,13,E101,S 125,0,1,"White, Mr. Percival Wayland",male,54,0,1,35281,77.2875,D26,S -126,1,3,"Nicola-Yarred, Master. Elias",male,12,1,0,2651,11.2417,,C +126,1,3,"Nicola-Yarred, Master Elias",male,12,1,0,2651,11.2417,,C 127,0,3,"McMahon, Mr. Martin",male,,0,0,370372,7.75,,Q 128,1,3,"Madsen, Mr. Fridtjof Arne",male,24,0,0,C 17369,7.1417,,S -129,1,3,"Peter, Miss. Anna",female,,1,1,2668,22.3583,F E69,C +129,1,3,"Peter, Miss Anna",female,,1,1,2668,22.3583,F E69,C 130,0,3,"Ekstrom, Mr. Johan",male,45,0,0,347061,6.975,,S 131,0,3,"Drazenoic, Mr. Jozef",male,33,0,0,349241,7.8958,,C 132,0,3,"Coelho, Mr. Domingos Fernandeo",male,20,0,0,SOTON/O.Q. 3101307,7.05,,S @@ -135,18 +135,18 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 134,1,2,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",female,29,1,0,228414,26,,S 135,0,2,"Sobey, Mr. Samuel James Hayden",male,25,0,0,C.A. 29178,13,,S 136,0,2,"Richard, Mr. Emile",male,23,0,0,SC/PARIS 2133,15.0458,,C -137,1,1,"Newsom, Miss. Helen Monypeny",female,19,0,2,11752,26.2833,D47,S +137,1,1,"Newsom, Miss Helen Monypeny",female,19,0,2,11752,26.2833,D47,S 138,0,1,"Futrelle, Mr. Jacques Heath",male,37,1,0,113803,53.1,C123,S 139,0,3,"Osen, Mr. Olaf Elon",male,16,0,0,7534,9.2167,,S 140,0,1,"Giglio, Mr. Victor",male,24,0,0,PC 17593,79.2,B86,C 141,0,3,"Boulos, Mrs. Joseph (Sultana)",female,,0,2,2678,15.2458,,C -142,1,3,"Nysten, Miss. Anna Sofia",female,22,0,0,347081,7.75,,S +142,1,3,"Nysten, Miss Anna Sofia",female,22,0,0,347081,7.75,,S 143,1,3,"Hakkarainen, Mrs. Pekka Pietari (Elin Matilda Dolck)",female,24,1,0,STON/O2. 3101279,15.85,,S 144,0,3,"Burke, Mr. Jeremiah",male,19,0,0,365222,6.75,,Q 145,0,2,"Andrew, Mr. Edgardo Samuel",male,18,0,0,231945,11.5,,S 146,0,2,"Nicholls, Mr. Joseph Charles",male,19,1,1,C.A. 33112,36.75,,S 147,1,3,"Andersson, Mr. 
August Edvard (""Wennerstrom"")",male,27,0,0,350043,7.7958,,S -148,0,3,"Ford, Miss. Robina Maggie ""Ruby""",female,9,2,2,W./C. 6608,34.375,,S +148,0,3,"Ford, Miss Robina Maggie ""Ruby""",female,9,2,2,W./C. 6608,34.375,,S 149,0,2,"Navratil, Mr. Michel (""Louis M Hoffman"")",male,36.5,0,2,230080,26,F2,S 150,0,2,"Byles, Rev. Thomas Roussel Davids",male,42,0,0,244310,13,,S 151,0,2,"Bateman, Rev. Robert James",male,51,0,0,S.O.P. 1166,12.525,,S @@ -155,35 +155,35 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 154,0,3,"van Billiard, Mr. Austin Blyler",male,40.5,0,2,A/5. 851,14.5,,S 155,0,3,"Olsen, Mr. Ole Martin",male,,0,0,Fa 265302,7.3125,,S 156,0,1,"Williams, Mr. Charles Duane",male,51,0,1,PC 17597,61.3792,,C -157,1,3,"Gilnagh, Miss. Katherine ""Katie""",female,16,0,0,35851,7.7333,,Q +157,1,3,"Gilnagh, Miss Katherine ""Katie""",female,16,0,0,35851,7.7333,,Q 158,0,3,"Corn, Mr. Harry",male,30,0,0,SOTON/OQ 392090,8.05,,S 159,0,3,"Smiljanic, Mr. Mile",male,,0,0,315037,8.6625,,S -160,0,3,"Sage, Master. Thomas Henry",male,,8,2,CA. 2343,69.55,,S +160,0,3,"Sage, Master Thomas Henry",male,,8,2,CA. 2343,69.55,,S 161,0,3,"Cribb, Mr. John Hatfield",male,44,0,1,371362,16.1,,S 162,1,2,"Watt, Mrs. James (Elizabeth ""Bessie"" Inglis Milne)",female,40,0,0,C.A. 33595,15.75,,S 163,0,3,"Bengtsson, Mr. John Viktor",male,26,0,0,347068,7.775,,S 164,0,3,"Calic, Mr. Jovo",male,17,0,0,315093,8.6625,,S -165,0,3,"Panula, Master. Eino Viljami",male,1,4,1,3101295,39.6875,,S -166,1,3,"Goldsmith, Master. Frank John William ""Frankie""",male,9,0,2,363291,20.525,,S +165,0,3,"Panula, Master Eino Viljami",male,1,4,1,3101295,39.6875,,S +166,1,3,"Goldsmith, Master Frank John William ""Frankie""",male,9,0,2,363291,20.525,,S 167,1,1,"Chibnall, Mrs. (Edith Martha Bowerman)",female,,0,1,113505,55,E33,S 168,0,3,"Skoog, Mrs. William (Anna Bernhardina Karlsson)",female,45,1,4,347088,27.9,,S 169,0,1,"Baumann, Mr. John D",male,,0,0,PC 17318,25.925,,S 170,0,3,"Ling, Mr. Lee",male,28,0,0,1601,56.4958,,S 171,0,1,"Van der hoef, Mr. Wyckoff",male,61,0,0,111240,33.5,B19,S -172,0,3,"Rice, Master. Arthur",male,4,4,1,382652,29.125,,Q -173,1,3,"Johnson, Miss. Eleanor Ileen",female,1,1,1,347742,11.1333,,S +172,0,3,"Rice, Master Arthur",male,4,4,1,382652,29.125,,Q +173,1,3,"Johnson, Miss Eleanor Ileen",female,1,1,1,347742,11.1333,,S 174,0,3,"Sivola, Mr. Antti Wilhelm",male,21,0,0,STON/O 2. 3101280,7.925,,S 175,0,1,"Smith, Mr. James Clinch",male,56,0,0,17764,30.6958,A7,C 176,0,3,"Klasen, Mr. Klas Albin",male,18,1,1,350404,7.8542,,S -177,0,3,"Lefebre, Master. Henry Forbes",male,,3,1,4133,25.4667,,S -178,0,1,"Isham, Miss. Ann Elizabeth",female,50,0,0,PC 17595,28.7125,C49,C +177,0,3,"Lefebre, Master Henry Forbes",male,,3,1,4133,25.4667,,S +178,0,1,"Isham, Miss Ann Elizabeth",female,50,0,0,PC 17595,28.7125,C49,C 179,0,2,"Hale, Mr. Reginald",male,30,0,0,250653,13,,S 180,0,3,"Leonard, Mr. Lionel",male,36,0,0,LINE,0,,S -181,0,3,"Sage, Miss. Constance Gladys",female,,8,2,CA. 2343,69.55,,S +181,0,3,"Sage, Miss Constance Gladys",female,,8,2,CA. 2343,69.55,,S 182,0,2,"Pernot, Mr. Rene",male,,0,0,SC/PARIS 2131,15.05,,C -183,0,3,"Asplund, Master. Clarence Gustaf Hugo",male,9,4,2,347077,31.3875,,S -184,1,2,"Becker, Master. Richard F",male,1,2,1,230136,39,F4,S -185,1,3,"Kink-Heilmann, Miss. 
Luise Gretchen",female,4,0,2,315153,22.025,,S +183,0,3,"Asplund, Master Clarence Gustaf Hugo",male,9,4,2,347077,31.3875,,S +184,1,2,"Becker, Master Richard F",male,1,2,1,230136,39,F4,S +185,1,3,"Kink-Heilmann, Miss Luise Gretchen",female,4,0,2,315153,22.025,,S 186,0,1,"Rood, Mr. Hugh Roscoe",male,,0,0,113767,50,A32,S 187,1,3,"O'Brien, Mrs. Thomas (Johanna ""Hannah"" Godfrey)",female,,1,0,370365,15.5,,Q 188,1,1,"Romaine, Mr. Charles Hallace (""Mr C Rolmane"")",male,45,0,0,111428,26.55,,S @@ -191,33 +191,33 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 190,0,3,"Turcin, Mr. Stjepan",male,36,0,0,349247,7.8958,,S 191,1,2,"Pinsky, Mrs. (Rosa)",female,32,0,0,234604,13,,S 192,0,2,"Carbines, Mr. William",male,19,0,0,28424,13,,S -193,1,3,"Andersen-Jensen, Miss. Carla Christine Nielsine",female,19,1,0,350046,7.8542,,S -194,1,2,"Navratil, Master. Michel M",male,3,1,1,230080,26,F2,S +193,1,3,"Andersen-Jensen, Miss Carla Christine Nielsine",female,19,1,0,350046,7.8542,,S +194,1,2,"Navratil, Master Michel M",male,3,1,1,230080,26,F2,S 195,1,1,"Brown, Mrs. James Joseph (Margaret Tobin)",female,44,0,0,PC 17610,27.7208,B4,C -196,1,1,"Lurette, Miss. Elise",female,58,0,0,PC 17569,146.5208,B80,C +196,1,1,"Lurette, Miss Elise",female,58,0,0,PC 17569,146.5208,B80,C 197,0,3,"Mernagh, Mr. Robert",male,,0,0,368703,7.75,,Q 198,0,3,"Olsen, Mr. Karl Siegwart Andreas",male,42,0,1,4579,8.4042,,S -199,1,3,"Madigan, Miss. Margaret ""Maggie""",female,,0,0,370370,7.75,,Q -200,0,2,"Yrois, Miss. Henriette (""Mrs Harbeck"")",female,24,0,0,248747,13,,S +199,1,3,"Madigan, Miss Margaret ""Maggie""",female,,0,0,370370,7.75,,Q +200,0,2,"Yrois, Miss Henriette (""Mrs Harbeck"")",female,24,0,0,248747,13,,S 201,0,3,"Vande Walle, Mr. Nestor Cyriel",male,28,0,0,345770,9.5,,S 202,0,3,"Sage, Mr. Frederick",male,,8,2,CA. 2343,69.55,,S 203,0,3,"Johanson, Mr. Jakob Alfred",male,34,0,0,3101264,6.4958,,S 204,0,3,"Youseff, Mr. Gerious",male,45.5,0,0,2628,7.225,,C 205,1,3,"Cohen, Mr. Gurshon ""Gus""",male,18,0,0,A/5 3540,8.05,,S -206,0,3,"Strom, Miss. Telma Matilda",female,2,0,1,347054,10.4625,G6,S +206,0,3,"Strom, Miss Telma Matilda",female,2,0,1,347054,10.4625,G6,S 207,0,3,"Backstrom, Mr. Karl Alfred",male,32,1,0,3101278,15.85,,S 208,1,3,"Albimona, Mr. Nassef Cassem",male,26,0,0,2699,18.7875,,C -209,1,3,"Carr, Miss. Helen ""Ellen""",female,16,0,0,367231,7.75,,Q +209,1,3,"Carr, Miss Helen ""Ellen""",female,16,0,0,367231,7.75,,Q 210,1,1,"Blank, Mr. Henry",male,40,0,0,112277,31,A31,C 211,0,3,"Ali, Mr. Ahmed",male,24,0,0,SOTON/O.Q. 3101311,7.05,,S -212,1,2,"Cameron, Miss. Clear Annie",female,35,0,0,F.C.C. 13528,21,,S +212,1,2,"Cameron, Miss Clear Annie",female,35,0,0,F.C.C. 13528,21,,S 213,0,3,"Perkin, Mr. John Henry",male,22,0,0,A/5 21174,7.25,,S 214,0,2,"Givard, Mr. Hans Kristensen",male,30,0,0,250646,13,,S 215,0,3,"Kiernan, Mr. Philip",male,,1,0,367229,7.75,,Q -216,1,1,"Newell, Miss. Madeleine",female,31,1,0,35273,113.275,D36,C -217,1,3,"Honkanen, Miss. Eliina",female,27,0,0,STON/O2. 3101283,7.925,,S +216,1,1,"Newell, Miss Madeleine",female,31,1,0,35273,113.275,D36,C +217,1,3,"Honkanen, Miss Eliina",female,27,0,0,STON/O2. 3101283,7.925,,S 218,0,2,"Jacobsohn, Mr. Sidney Samuel",male,42,1,0,243847,27,,S -219,1,1,"Bazzani, Miss. Albina",female,32,0,0,11813,76.2917,D15,C +219,1,1,"Bazzani, Miss Albina",female,32,0,0,11813,76.2917,D15,C 220,0,2,"Harris, Mr. Walter",male,30,0,0,W/C 14208,10.5,,S 221,1,3,"Sunderland, Mr. Victor Francis",male,16,0,0,SOTON/OQ 392089,8.05,,S 222,0,2,"Bracken, Mr. 
James H",male,27,0,0,220367,13,,S @@ -228,24 +228,24 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 227,1,2,"Mellors, Mr. William John",male,19,0,0,SW/PP 751,10.5,,S 228,0,3,"Lovell, Mr. John Hall (""Henry"")",male,20.5,0,0,A/5 21173,7.25,,S 229,0,2,"Fahlstrom, Mr. Arne Jonas",male,18,0,0,236171,13,,S -230,0,3,"Lefebre, Miss. Mathilde",female,,3,1,4133,25.4667,,S +230,0,3,"Lefebre, Miss Mathilde",female,,3,1,4133,25.4667,,S 231,1,1,"Harris, Mrs. Henry Birkhardt (Irene Wallach)",female,35,1,0,36973,83.475,C83,S 232,0,3,"Larsson, Mr. Bengt Edvin",male,29,0,0,347067,7.775,,S 233,0,2,"Sjostedt, Mr. Ernst Adolf",male,59,0,0,237442,13.5,,S -234,1,3,"Asplund, Miss. Lillian Gertrud",female,5,4,2,347077,31.3875,,S +234,1,3,"Asplund, Miss Lillian Gertrud",female,5,4,2,347077,31.3875,,S 235,0,2,"Leyson, Mr. Robert William Norman",male,24,0,0,C.A. 29566,10.5,,S -236,0,3,"Harknett, Miss. Alice Phoebe",female,,0,0,W./C. 6609,7.55,,S +236,0,3,"Harknett, Miss Alice Phoebe",female,,0,0,W./C. 6609,7.55,,S 237,0,2,"Hold, Mr. Stephen",male,44,1,0,26707,26,,S -238,1,2,"Collyer, Miss. Marjorie ""Lottie""",female,8,0,2,C.A. 31921,26.25,,S +238,1,2,"Collyer, Miss Marjorie ""Lottie""",female,8,0,2,C.A. 31921,26.25,,S 239,0,2,"Pengelly, Mr. Frederick William",male,19,0,0,28665,10.5,,S 240,0,2,"Hunt, Mr. George Henry",male,33,0,0,SCO/W 1585,12.275,,S -241,0,3,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C -242,1,3,"Murphy, Miss. Katherine ""Kate""",female,,1,0,367230,15.5,,Q +241,0,3,"Zabour, Miss Thamine",female,,1,0,2665,14.4542,,C +242,1,3,"Murphy, Miss Katherine ""Kate""",female,,1,0,367230,15.5,,Q 243,0,2,"Coleridge, Mr. Reginald Charles",male,29,0,0,W./C. 14263,10.5,,S 244,0,3,"Maenpaa, Mr. Matti Alexanteri",male,22,0,0,STON/O 2. 3101275,7.125,,S 245,0,3,"Attalah, Mr. Sleiman",male,30,0,0,2694,7.225,,C 246,0,1,"Minahan, Dr. William Edward",male,44,2,0,19928,90,C78,Q -247,0,3,"Lindahl, Miss. Agda Thorilda Viktoria",female,25,0,0,347071,7.775,,S +247,0,3,"Lindahl, Miss Agda Thorilda Viktoria",female,25,0,0,347071,7.775,,S 248,1,2,"Hamalainen, Mrs. William (Anna)",female,24,0,2,250649,14.5,,S 249,1,1,"Beckwith, Mr. Richard Leonard",male,37,1,1,11751,52.5542,D35,S 250,0,2,"Carter, Rev. Ernest Courtenay",male,54,1,0,244252,26,,S @@ -256,28 +256,28 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 255,0,3,"Rosblom, Mrs. Viktor (Helena Wilhelmina)",female,41,0,2,370129,20.2125,,S 256,1,3,"Touma, Mrs. Darwis (Hanne Youssef Razi)",female,29,0,2,2650,15.2458,,C 257,1,1,"Thorne, Mrs. Gertrude Maybelle",female,,0,0,PC 17585,79.2,,C -258,1,1,"Cherry, Miss. Gladys",female,30,0,0,110152,86.5,B77,S -259,1,1,"Ward, Miss. Anna",female,35,0,0,PC 17755,512.3292,,C +258,1,1,"Cherry, Miss Gladys",female,30,0,0,110152,86.5,B77,S +259,1,1,"Ward, Miss Anna",female,35,0,0,PC 17755,512.3292,,C 260,1,2,"Parrish, Mrs. (Lutie Davis)",female,50,0,1,230433,26,,S 261,0,3,"Smith, Mr. Thomas",male,,0,0,384461,7.75,,Q -262,1,3,"Asplund, Master. Edvin Rojj Felix",male,3,4,2,347077,31.3875,,S +262,1,3,"Asplund, Master Edvin Rojj Felix",male,3,4,2,347077,31.3875,,S 263,0,1,"Taussig, Mr. Emil",male,52,1,1,110413,79.65,E67,S 264,0,1,"Harrison, Mr. William",male,40,0,0,112059,0,B94,S -265,0,3,"Henry, Miss. Delia",female,,0,0,382649,7.75,,Q +265,0,3,"Henry, Miss Delia",female,,0,0,382649,7.75,,Q 266,0,2,"Reeves, Mr. David",male,36,0,0,C.A. 17248,10.5,,S 267,0,3,"Panula, Mr. Ernesti Arvid",male,16,4,1,3101295,39.6875,,S 268,1,3,"Persson, Mr. 
Ernst Ulrik",male,25,1,0,347083,7.775,,S 269,1,1,"Graham, Mrs. William Thompson (Edith Junkins)",female,58,0,1,PC 17582,153.4625,C125,S -270,1,1,"Bissette, Miss. Amelia",female,35,0,0,PC 17760,135.6333,C99,S +270,1,1,"Bissette, Miss Amelia",female,35,0,0,PC 17760,135.6333,C99,S 271,0,1,"Cairns, Mr. Alexander",male,,0,0,113798,31,,S 272,1,3,"Tornquist, Mr. William Henry",male,25,0,0,LINE,0,,S 273,1,2,"Mellinger, Mrs. (Elizabeth Anne Maidment)",female,41,0,1,250644,19.5,,S 274,0,1,"Natsch, Mr. Charles H",male,37,0,1,PC 17596,29.7,C118,C -275,1,3,"Healy, Miss. Hanora ""Nora""",female,,0,0,370375,7.75,,Q -276,1,1,"Andrews, Miss. Kornelia Theodosia",female,63,1,0,13502,77.9583,D7,S -277,0,3,"Lindblom, Miss. Augusta Charlotta",female,45,0,0,347073,7.75,,S +275,1,3,"Healy, Miss Hanora ""Nora""",female,,0,0,370375,7.75,,Q +276,1,1,"Andrews, Miss Kornelia Theodosia",female,63,1,0,13502,77.9583,D7,S +277,0,3,"Lindblom, Miss Augusta Charlotta",female,45,0,0,347073,7.75,,S 278,0,2,"Parkes, Mr. Francis ""Frank""",male,,0,0,239853,0,,S -279,0,3,"Rice, Master. Eric",male,7,4,1,382652,29.125,,Q +279,0,3,"Rice, Master Eric",male,7,4,1,382652,29.125,,Q 280,1,3,"Abbott, Mrs. Stanton (Rosa Hunt)",female,35,1,1,C.A. 2673,20.25,,S 281,0,3,"Duane, Mr. Frank",male,65,0,0,336439,7.75,,Q 282,0,3,"Olsson, Mr. Nils Johan Goransson",male,28,0,0,347464,7.8542,,S @@ -288,66 +288,66 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 287,1,3,"de Mulder, Mr. Theodore",male,30,0,0,345774,9.5,,S 288,0,3,"Naidenoff, Mr. Penko",male,22,0,0,349206,7.8958,,S 289,1,2,"Hosono, Mr. Masabumi",male,42,0,0,237798,13,,S -290,1,3,"Connolly, Miss. Kate",female,22,0,0,370373,7.75,,Q -291,1,1,"Barber, Miss. Ellen ""Nellie""",female,26,0,0,19877,78.85,,S +290,1,3,"Connolly, Miss Kate",female,22,0,0,370373,7.75,,Q +291,1,1,"Barber, Miss Ellen ""Nellie""",female,26,0,0,19877,78.85,,S 292,1,1,"Bishop, Mrs. Dickinson H (Helen Walton)",female,19,1,0,11967,91.0792,B49,C 293,0,2,"Levy, Mr. Rene Jacques",male,36,0,0,SC/Paris 2163,12.875,D,C -294,0,3,"Haas, Miss. Aloisia",female,24,0,0,349236,8.85,,S +294,0,3,"Haas, Miss Aloisia",female,24,0,0,349236,8.85,,S 295,0,3,"Mineff, Mr. Ivan",male,24,0,0,349233,7.8958,,S 296,0,1,"Lewy, Mr. Ervin G",male,,0,0,PC 17612,27.7208,,C 297,0,3,"Hanna, Mr. Mansour",male,23.5,0,0,2693,7.2292,,C -298,0,1,"Allison, Miss. Helen Loraine",female,2,1,2,113781,151.55,C22 C26,S +298,0,1,"Allison, Miss Helen Loraine",female,2,1,2,113781,151.55,C22 C26,S 299,1,1,"Saalfeld, Mr. Adolphe",male,,0,0,19988,30.5,C106,S 300,1,1,"Baxter, Mrs. James (Helene DeLaudeniere Chaput)",female,50,0,1,PC 17558,247.5208,B58 B60,C -301,1,3,"Kelly, Miss. Anna Katherine ""Annie Kate""",female,,0,0,9234,7.75,,Q +301,1,3,"Kelly, Miss Anna Katherine ""Annie Kate""",female,,0,0,9234,7.75,,Q 302,1,3,"McCoy, Mr. Bernard",male,,2,0,367226,23.25,,Q 303,0,3,"Johnson, Mr. William Cahoone Jr",male,19,0,0,LINE,0,,S -304,1,2,"Keane, Miss. Nora A",female,,0,0,226593,12.35,E101,Q +304,1,2,"Keane, Miss Nora A",female,,0,0,226593,12.35,E101,Q 305,0,3,"Williams, Mr. Howard Hugh ""Harry""",male,,0,0,A/5 2466,8.05,,S -306,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S -307,1,1,"Fleming, Miss. Margaret",female,,0,0,17421,110.8833,,C +306,1,1,"Allison, Master Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S +307,1,1,"Fleming, Miss Margaret",female,,0,0,17421,110.8833,,C 308,1,1,"Penasco y Castellana, Mrs. 
Victor de Satode (Maria Josefa Perez de Soto y Vallejo)",female,17,1,0,PC 17758,108.9,C65,C 309,0,2,"Abelson, Mr. Samuel",male,30,1,0,P/PP 3381,24,,C -310,1,1,"Francatelli, Miss. Laura Mabel",female,30,0,0,PC 17485,56.9292,E36,C -311,1,1,"Hays, Miss. Margaret Bechstein",female,24,0,0,11767,83.1583,C54,C -312,1,1,"Ryerson, Miss. Emily Borie",female,18,2,2,PC 17608,262.375,B57 B59 B63 B66,C +310,1,1,"Francatelli, Miss Laura Mabel",female,30,0,0,PC 17485,56.9292,E36,C +311,1,1,"Hays, Miss Margaret Bechstein",female,24,0,0,11767,83.1583,C54,C +312,1,1,"Ryerson, Miss Emily Borie",female,18,2,2,PC 17608,262.375,B57 B59 B63 B66,C 313,0,2,"Lahtinen, Mrs. William (Anna Sylfven)",female,26,1,1,250651,26,,S 314,0,3,"Hendekovic, Mr. Ignjac",male,28,0,0,349243,7.8958,,S 315,0,2,"Hart, Mr. Benjamin",male,43,1,1,F.C.C. 13529,26.25,,S -316,1,3,"Nilsson, Miss. Helmina Josefina",female,26,0,0,347470,7.8542,,S +316,1,3,"Nilsson, Miss Helmina Josefina",female,26,0,0,347470,7.8542,,S 317,1,2,"Kantor, Mrs. Sinai (Miriam Sternin)",female,24,1,0,244367,26,,S 318,0,2,"Moraweck, Dr. Ernest",male,54,0,0,29011,14,,S -319,1,1,"Wick, Miss. Mary Natalie",female,31,0,2,36928,164.8667,C7,S +319,1,1,"Wick, Miss Mary Natalie",female,31,0,2,36928,164.8667,C7,S 320,1,1,"Spedden, Mrs. Frederic Oakley (Margaretta Corning Stone)",female,40,1,1,16966,134.5,E34,C 321,0,3,"Dennis, Mr. Samuel",male,22,0,0,A/5 21172,7.25,,S 322,0,3,"Danoff, Mr. Yoto",male,27,0,0,349219,7.8958,,S -323,1,2,"Slayter, Miss. Hilda Mary",female,30,0,0,234818,12.35,,Q +323,1,2,"Slayter, Miss Hilda Mary",female,30,0,0,234818,12.35,,Q 324,1,2,"Caldwell, Mrs. Albert Francis (Sylvia Mae Harbaugh)",female,22,1,1,248738,29,,S 325,0,3,"Sage, Mr. George John Jr",male,,8,2,CA. 2343,69.55,,S -326,1,1,"Young, Miss. Marie Grice",female,36,0,0,PC 17760,135.6333,C32,C +326,1,1,"Young, Miss Marie Grice",female,36,0,0,PC 17760,135.6333,C32,C 327,0,3,"Nysveen, Mr. Johan Hansen",male,61,0,0,345364,6.2375,,S 328,1,2,"Ball, Mrs. (Ada E Hall)",female,36,0,0,28551,13,D,S 329,1,3,"Goldsmith, Mrs. Frank John (Emily Alice Brown)",female,31,1,1,363291,20.525,,S -330,1,1,"Hippach, Miss. Jean Gertrude",female,16,0,1,111361,57.9792,B18,C -331,1,3,"McCoy, Miss. Agnes",female,,2,0,367226,23.25,,Q +330,1,1,"Hippach, Miss Jean Gertrude",female,16,0,1,111361,57.9792,B18,C +331,1,3,"McCoy, Miss Agnes",female,,2,0,367226,23.25,,Q 332,0,1,"Partner, Mr. Austen",male,45.5,0,0,113043,28.5,C124,S 333,0,1,"Graham, Mr. George Edward",male,38,0,1,PC 17582,153.4625,C91,S 334,0,3,"Vander Planke, Mr. Leo Edmondus",male,16,2,0,345764,18,,S 335,1,1,"Frauenthal, Mrs. Henry William (Clara Heinsheimer)",female,,1,0,PC 17611,133.65,,S 336,0,3,"Denkoff, Mr. Mitto",male,,0,0,349225,7.8958,,S 337,0,1,"Pears, Mr. Thomas Clinton",male,29,1,0,113776,66.6,C2,S -338,1,1,"Burns, Miss. Elizabeth Margaret",female,41,0,0,16966,134.5,E40,C +338,1,1,"Burns, Miss Elizabeth Margaret",female,41,0,0,16966,134.5,E40,C 339,1,3,"Dahl, Mr. Karl Edwart",male,45,0,0,7598,8.05,,S 340,0,1,"Blackwell, Mr. Stephen Weart",male,45,0,0,113784,35.5,T,S -341,1,2,"Navratil, Master. Edmond Roger",male,2,1,1,230080,26,F2,S -342,1,1,"Fortune, Miss. Alice Elizabeth",female,24,3,2,19950,263,C23 C25 C27,S +341,1,2,"Navratil, Master Edmond Roger",male,2,1,1,230080,26,F2,S +342,1,1,"Fortune, Miss Alice Elizabeth",female,24,3,2,19950,263,C23 C25 C27,S 343,0,2,"Collander, Mr. Erik Gustaf",male,28,0,0,248740,13,,S 344,0,2,"Sedgwick, Mr. Charles Frederick Waddington",male,25,0,0,244361,13,,S 345,0,2,"Fox, Mr. 
Stanley Hubert",male,36,0,0,229236,13,,S -346,1,2,"Brown, Miss. Amelia ""Mildred""",female,24,0,0,248733,13,F33,S -347,1,2,"Smith, Miss. Marion Elsie",female,40,0,0,31418,13,,S +346,1,2,"Brown, Miss Amelia ""Mildred""",female,24,0,0,248733,13,F33,S +347,1,2,"Smith, Miss Marion Elsie",female,40,0,0,31418,13,,S 348,1,3,"Davison, Mrs. Thomas Henry (Mary E Finck)",female,,1,0,386525,16.1,,S -349,1,3,"Coutts, Master. William Loch ""William""",male,3,1,1,C.A. 37671,15.9,,S +349,1,3,"Coutts, Master William Loch ""William""",male,3,1,1,C.A. 37671,15.9,,S 350,0,3,"Dimic, Mr. Jovan",male,42,0,0,315088,8.6625,,S 351,0,3,"Odahl, Mr. Nils Martin",male,23,0,0,7267,9.225,,S 352,0,1,"Williams-Lambert, Mr. Fletcher Fellows",male,,0,0,113510,35,C128,S @@ -355,10 +355,10 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 354,0,3,"Arnold-Franchi, Mr. Josef",male,25,1,0,349237,17.8,,S 355,0,3,"Yousif, Mr. Wazli",male,,0,0,2647,7.225,,C 356,0,3,"Vanden Steen, Mr. Leo Peter",male,28,0,0,345783,9.5,,S -357,1,1,"Bowerman, Miss. Elsie Edith",female,22,0,1,113505,55,E33,S -358,0,2,"Funk, Miss. Annie Clemmer",female,38,0,0,237671,13,,S -359,1,3,"McGovern, Miss. Mary",female,,0,0,330931,7.8792,,Q -360,1,3,"Mockler, Miss. Helen Mary ""Ellie""",female,,0,0,330980,7.8792,,Q +357,1,1,"Bowerman, Miss Elsie Edith",female,22,0,1,113505,55,E33,S +358,0,2,"Funk, Miss Annie Clemmer",female,38,0,0,237671,13,,S +359,1,3,"McGovern, Miss Mary",female,,0,0,330931,7.8792,,Q +360,1,3,"Mockler, Miss Helen Mary ""Ellie""",female,,0,0,330980,7.8792,,Q 361,0,3,"Skoog, Mr. Wilhelm",male,40,1,4,347088,27.9,,S 362,0,2,"del Carlo, Mr. Sebastiano",male,29,1,0,SC/PARIS 2167,27.7208,,C 363,0,3,"Barbara, Mrs. (Catherine David)",female,45,0,1,2691,14.4542,,C @@ -367,58 +367,58 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 366,0,3,"Adahl, Mr. Mauritz Nils Martin",male,30,0,0,C 7076,7.25,,S 367,1,1,"Warren, Mrs. Frank Manley (Anna Sophia Atkinson)",female,60,1,0,110813,75.25,D37,C 368,1,3,"Moussa, Mrs. (Mantoura Boulos)",female,,0,0,2626,7.2292,,C -369,1,3,"Jermyn, Miss. Annie",female,,0,0,14313,7.75,,Q +369,1,3,"Jermyn, Miss Annie",female,,0,0,14313,7.75,,Q 370,1,1,"Aubart, Mme. Leontine Pauline",female,24,0,0,PC 17477,69.3,B35,C 371,1,1,"Harder, Mr. George Achilles",male,25,1,0,11765,55.4417,E50,C 372,0,3,"Wiklund, Mr. Jakob Alfred",male,18,1,0,3101267,6.4958,,S 373,0,3,"Beavan, Mr. William Thomas",male,19,0,0,323951,8.05,,S 374,0,1,"Ringhini, Mr. Sante",male,22,0,0,PC 17760,135.6333,,C -375,0,3,"Palsson, Miss. Stina Viola",female,3,3,1,349909,21.075,,S +375,0,3,"Palsson, Miss Stina Viola",female,3,3,1,349909,21.075,,S 376,1,1,"Meyer, Mrs. Edgar Joseph (Leila Saks)",female,,1,0,PC 17604,82.1708,,C -377,1,3,"Landergren, Miss. Aurora Adelia",female,22,0,0,C 7077,7.25,,S +377,1,3,"Landergren, Miss Aurora Adelia",female,22,0,0,C 7077,7.25,,S 378,0,1,"Widener, Mr. Harry Elkins",male,27,0,2,113503,211.5,C82,C 379,0,3,"Betros, Mr. Tannous",male,20,0,0,2648,4.0125,,C 380,0,3,"Gustafsson, Mr. Karl Gideon",male,19,0,0,347069,7.775,,S -381,1,1,"Bidois, Miss. Rosalie",female,42,0,0,PC 17757,227.525,,C -382,1,3,"Nakid, Miss. Maria (""Mary"")",female,1,0,2,2653,15.7417,,C +381,1,1,"Bidois, Miss Rosalie",female,42,0,0,PC 17757,227.525,,C +382,1,3,"Nakid, Miss Maria (""Mary"")",female,1,0,2,2653,15.7417,,C 383,0,3,"Tikkanen, Mr. Juho",male,32,0,0,STON/O 2. 3101293,7.925,,S 384,1,1,"Holverson, Mrs. Alexander Oskar (Mary Aline Towner)",female,35,1,0,113789,52,,S 385,0,3,"Plotcharsky, Mr. 
Vasil",male,,0,0,349227,7.8958,,S 386,0,2,"Davies, Mr. Charles Henry",male,18,0,0,S.O.C. 14879,73.5,,S -387,0,3,"Goodwin, Master. Sidney Leonard",male,1,5,2,CA 2144,46.9,,S -388,1,2,"Buss, Miss. Kate",female,36,0,0,27849,13,,S +387,0,3,"Goodwin, Master Sidney Leonard",male,1,5,2,CA 2144,46.9,,S +388,1,2,"Buss, Miss Kate",female,36,0,0,27849,13,,S 389,0,3,"Sadlier, Mr. Matthew",male,,0,0,367655,7.7292,,Q -390,1,2,"Lehmann, Miss. Bertha",female,17,0,0,SC 1748,12,,C +390,1,2,"Lehmann, Miss Bertha",female,17,0,0,SC 1748,12,,C 391,1,1,"Carter, Mr. William Ernest",male,36,1,2,113760,120,B96 B98,S 392,1,3,"Jansson, Mr. Carl Olof",male,21,0,0,350034,7.7958,,S 393,0,3,"Gustafsson, Mr. Johan Birger",male,28,2,0,3101277,7.925,,S -394,1,1,"Newell, Miss. Marjorie",female,23,1,0,35273,113.275,D36,C +394,1,1,"Newell, Miss Marjorie",female,23,1,0,35273,113.275,D36,C 395,1,3,"Sandstrom, Mrs. Hjalmar (Agnes Charlotta Bengtsson)",female,24,0,2,PP 9549,16.7,G6,S 396,0,3,"Johansson, Mr. Erik",male,22,0,0,350052,7.7958,,S -397,0,3,"Olsson, Miss. Elina",female,31,0,0,350407,7.8542,,S +397,0,3,"Olsson, Miss Elina",female,31,0,0,350407,7.8542,,S 398,0,2,"McKane, Mr. Peter David",male,46,0,0,28403,26,,S 399,0,2,"Pain, Dr. Alfred",male,23,0,0,244278,10.5,,S 400,1,2,"Trout, Mrs. William H (Jessie L)",female,28,0,0,240929,12.65,,S 401,1,3,"Niskanen, Mr. Juha",male,39,0,0,STON/O 2. 3101289,7.925,,S 402,0,3,"Adams, Mr. John",male,26,0,0,341826,8.05,,S -403,0,3,"Jussila, Miss. Mari Aina",female,21,1,0,4137,9.825,,S +403,0,3,"Jussila, Miss Mari Aina",female,21,1,0,4137,9.825,,S 404,0,3,"Hakkarainen, Mr. Pekka Pietari",male,28,1,0,STON/O2. 3101279,15.85,,S -405,0,3,"Oreskovic, Miss. Marija",female,20,0,0,315096,8.6625,,S +405,0,3,"Oreskovic, Miss Marija",female,20,0,0,315096,8.6625,,S 406,0,2,"Gale, Mr. Shadrach",male,34,1,0,28664,21,,S 407,0,3,"Widegren, Mr. Carl/Charles Peter",male,51,0,0,347064,7.75,,S -408,1,2,"Richards, Master. William Rowe",male,3,1,1,29106,18.75,,S +408,1,2,"Richards, Master William Rowe",male,3,1,1,29106,18.75,,S 409,0,3,"Birkeland, Mr. Hans Martin Monsen",male,21,0,0,312992,7.775,,S -410,0,3,"Lefebre, Miss. Ida",female,,3,1,4133,25.4667,,S +410,0,3,"Lefebre, Miss Ida",female,,3,1,4133,25.4667,,S 411,0,3,"Sdycoff, Mr. Todor",male,,0,0,349222,7.8958,,S 412,0,3,"Hart, Mr. Henry",male,,0,0,394140,6.8583,,Q -413,1,1,"Minahan, Miss. Daisy E",female,33,1,0,19928,90,C78,Q +413,1,1,"Minahan, Miss Daisy E",female,33,1,0,19928,90,C78,Q 414,0,2,"Cunningham, Mr. Alfred Fleming",male,,0,0,239853,0,,S 415,1,3,"Sundman, Mr. Johan Julian",male,44,0,0,STON/O 2. 3101269,7.925,,S 416,0,3,"Meek, Mrs. Thomas (Annie Louise Rowley)",female,,0,0,343095,8.05,,S 417,1,2,"Drew, Mrs. James Vivian (Lulu Thorne Christian)",female,34,1,1,28220,32.5,,S -418,1,2,"Silven, Miss. Lyyli Karoliina",female,18,0,2,250652,13,,S +418,1,2,"Silven, Miss Lyyli Karoliina",female,18,0,2,250652,13,,S 419,0,2,"Matthews, Mr. William John",male,30,0,0,28228,13,,S -420,0,3,"Van Impe, Miss. Catharina",female,10,0,2,345773,24.15,,S +420,0,3,"Van Impe, Miss Catharina",female,10,0,2,345773,24.15,,S 421,0,3,"Gheorgheff, Mr. Stanio",male,,0,0,349254,7.8958,,C 422,0,3,"Charters, Mr. David",male,21,0,0,A/5. 13032,7.7333,,Q 423,0,3,"Zimmerman, Mr. Leo",male,29,0,0,315082,7.875,,S @@ -426,7 +426,7 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 425,0,3,"Rosblom, Mr. Viktor Richard",male,18,1,1,370129,20.2125,,S 426,0,3,"Wiseman, Mr. Phillippe",male,,0,0,A/4. 34244,7.25,,S 427,1,2,"Clarke, Mrs. 
Charles V (Ada Maria Winfield)",female,28,1,0,2003,26,,S -428,1,2,"Phillips, Miss. Kate Florence (""Mrs Kate Louise Phillips Marshall"")",female,19,0,0,250655,26,,S +428,1,2,"Phillips, Miss Kate Florence (""Mrs Kate Louise Phillips Marshall"")",female,19,0,0,250655,26,,S 429,0,3,"Flynn, Mr. James",male,,0,0,364851,7.75,,Q 430,1,3,"Pickard, Mr. Berk (Berk Trembisky)",male,32,0,0,SOTON/O.Q. 392078,8.05,E10,S 431,1,1,"Bjornstrom-Steffansson, Mr. Mauritz Hakan",male,28,0,0,110564,26.55,C52,S @@ -434,8 +434,8 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 433,1,2,"Louch, Mrs. Charles Alexander (Alice Adelaide Slow)",female,42,1,0,SC/AH 3085,26,,S 434,0,3,"Kallio, Mr. Nikolai Erland",male,17,0,0,STON/O 2. 3101274,7.125,,S 435,0,1,"Silvey, Mr. William Baird",male,50,1,0,13507,55.9,E44,S -436,1,1,"Carter, Miss. Lucile Polk",female,14,1,2,113760,120,B96 B98,S -437,0,3,"Ford, Miss. Doolina Margaret ""Daisy""",female,21,2,2,W./C. 6608,34.375,,S +436,1,1,"Carter, Miss Lucile Polk",female,14,1,2,113760,120,B96 B98,S +437,0,3,"Ford, Miss Doolina Margaret ""Daisy""",female,21,2,2,W./C. 6608,34.375,,S 438,1,2,"Richards, Mrs. Sidney (Emily Hocking)",female,24,2,3,29106,18.75,,S 439,0,1,"Fortune, Mr. Mark",male,64,1,4,19950,263,C23 C25 C27,S 440,0,2,"Kvillner, Mr. Johan Henrik Johannesson",male,31,0,0,C.A. 18723,10.5,,S @@ -444,10 +444,10 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 443,0,3,"Petterson, Mr. Johan Emil",male,25,1,0,347076,7.775,,S 444,1,2,"Reynaldo, Ms. Encarnacion",female,28,0,0,230434,13,,S 445,1,3,"Johannesen-Bratthammer, Mr. Bernt",male,,0,0,65306,8.1125,,S -446,1,1,"Dodge, Master. Washington",male,4,0,2,33638,81.8583,A34,S -447,1,2,"Mellinger, Miss. Madeleine Violet",female,13,0,1,250644,19.5,,S +446,1,1,"Dodge, Master Washington",male,4,0,2,33638,81.8583,A34,S +447,1,2,"Mellinger, Miss Madeleine Violet",female,13,0,1,250644,19.5,,S 448,1,1,"Seward, Mr. Frederic Kimber",male,34,0,0,113794,26.55,,S -449,1,3,"Baclini, Miss. Marie Catherine",female,5,2,1,2666,19.2583,,C +449,1,3,"Baclini, Miss Marie Catherine",female,5,2,1,2666,19.2583,,C 450,1,1,"Peuchen, Major. Arthur Godfrey",male,52,0,0,113786,30.5,C104,S 451,0,2,"West, Mr. Edwy Arthur",male,36,1,2,C.A. 34651,27.75,,S 452,0,3,"Hagland, Mr. Ingvald Olai Olsen",male,,1,0,65303,19.9667,,S @@ -457,7 +457,7 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 456,1,3,"Jalsevac, Mr. Ivan",male,29,0,0,349240,7.8958,,C 457,0,1,"Millet, Mr. Francis Davis",male,65,0,0,13509,26.55,E38,S 458,1,1,"Kenyon, Mrs. Frederick R (Marion)",female,,1,0,17464,51.8625,D21,S -459,1,2,"Toomey, Miss. Ellen",female,50,0,0,F.C.C. 13531,10.5,,S +459,1,2,"Toomey, Miss Ellen",female,50,0,0,F.C.C. 13531,10.5,,S 460,0,3,"O'Connor, Mr. Maurice",male,,0,0,371060,7.75,,Q 461,1,1,"Anderson, Mr. Harry",male,48,0,0,19952,26.55,E12,S 462,0,3,"Morley, Mr. William",male,34,0,0,364506,8.05,,S @@ -468,42 +468,42 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 467,0,2,"Campbell, Mr. William",male,,0,0,239853,0,,S 468,0,1,"Smart, Mr. John Montgomery",male,56,0,0,113792,26.55,,S 469,0,3,"Scanlan, Mr. James",male,,0,0,36209,7.725,,Q -470,1,3,"Baclini, Miss. Helene Barbara",female,0.75,2,1,2666,19.2583,,C +470,1,3,"Baclini, Miss Helene Barbara",female,0.75,2,1,2666,19.2583,,C 471,0,3,"Keefe, Mr. Arthur",male,,0,0,323592,7.25,,S 472,0,3,"Cacic, Mr. Luka",male,38,0,0,315089,8.6625,,S 473,1,2,"West, Mrs. Edwy Arthur (Ada Mary Worth)",female,33,1,2,C.A. 
34651,27.75,,S 474,1,2,"Jerwan, Mrs. Amin S (Marie Marthe Thuillard)",female,23,0,0,SC/AH Basle 541,13.7917,D,C -475,0,3,"Strandberg, Miss. Ida Sofia",female,22,0,0,7553,9.8375,,S +475,0,3,"Strandberg, Miss Ida Sofia",female,22,0,0,7553,9.8375,,S 476,0,1,"Clifford, Mr. George Quincy",male,,0,0,110465,52,A14,S 477,0,2,"Renouf, Mr. Peter Henry",male,34,1,0,31027,21,,S 478,0,3,"Braund, Mr. Lewis Richard",male,29,1,0,3460,7.0458,,S 479,0,3,"Karlsson, Mr. Nils August",male,22,0,0,350060,7.5208,,S -480,1,3,"Hirvonen, Miss. Hildur E",female,2,0,1,3101298,12.2875,,S -481,0,3,"Goodwin, Master. Harold Victor",male,9,5,2,CA 2144,46.9,,S +480,1,3,"Hirvonen, Miss Hildur E",female,2,0,1,3101298,12.2875,,S +481,0,3,"Goodwin, Master Harold Victor",male,9,5,2,CA 2144,46.9,,S 482,0,2,"Frost, Mr. Anthony Wood ""Archie""",male,,0,0,239854,0,,S 483,0,3,"Rouse, Mr. Richard Henry",male,50,0,0,A/5 3594,8.05,,S 484,1,3,"Turkula, Mrs. (Hedwig)",female,63,0,0,4134,9.5875,,S 485,1,1,"Bishop, Mr. Dickinson H",male,25,1,0,11967,91.0792,B49,C -486,0,3,"Lefebre, Miss. Jeannie",female,,3,1,4133,25.4667,,S +486,0,3,"Lefebre, Miss Jeannie",female,,3,1,4133,25.4667,,S 487,1,1,"Hoyt, Mrs. Frederick Maxfield (Jane Anne Forby)",female,35,1,0,19943,90,C93,S 488,0,1,"Kent, Mr. Edward Austin",male,58,0,0,11771,29.7,B37,C 489,0,3,"Somerton, Mr. Francis William",male,30,0,0,A.5. 18509,8.05,,S -490,1,3,"Coutts, Master. Eden Leslie ""Neville""",male,9,1,1,C.A. 37671,15.9,,S +490,1,3,"Coutts, Master Eden Leslie ""Neville""",male,9,1,1,C.A. 37671,15.9,,S 491,0,3,"Hagland, Mr. Konrad Mathias Reiersen",male,,1,0,65304,19.9667,,S 492,0,3,"Windelov, Mr. Einar",male,21,0,0,SOTON/OQ 3101317,7.25,,S 493,0,1,"Molson, Mr. Harry Markland",male,55,0,0,113787,30.5,C30,S 494,0,1,"Artagaveytia, Mr. Ramon",male,71,0,0,PC 17609,49.5042,,C 495,0,3,"Stanley, Mr. Edward Roland",male,21,0,0,A/4 45380,8.05,,S 496,0,3,"Yousseff, Mr. Gerious",male,,0,0,2627,14.4583,,C -497,1,1,"Eustis, Miss. Elizabeth Mussey",female,54,1,0,36947,78.2667,D20,C +497,1,1,"Eustis, Miss Elizabeth Mussey",female,54,1,0,36947,78.2667,D20,C 498,0,3,"Shellard, Mr. Frederick William",male,,0,0,C.A. 6212,15.1,,S 499,0,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25,1,2,113781,151.55,C22 C26,S 500,0,3,"Svensson, Mr. Olof",male,24,0,0,350035,7.7958,,S 501,0,3,"Calic, Mr. Petar",male,17,0,0,315086,8.6625,,S -502,0,3,"Canavan, Miss. Mary",female,21,0,0,364846,7.75,,Q -503,0,3,"O'Sullivan, Miss. Bridget Mary",female,,0,0,330909,7.6292,,Q -504,0,3,"Laitinen, Miss. Kristina Sofia",female,37,0,0,4135,9.5875,,S -505,1,1,"Maioni, Miss. Roberta",female,16,0,0,110152,86.5,B79,S +502,0,3,"Canavan, Miss Mary",female,21,0,0,364846,7.75,,Q +503,0,3,"O'Sullivan, Miss Bridget Mary",female,,0,0,330909,7.6292,,Q +504,0,3,"Laitinen, Miss Kristina Sofia",female,37,0,0,4135,9.5875,,S +505,1,1,"Maioni, Miss Roberta",female,16,0,0,110152,86.5,B79,S 506,0,1,"Penasco y Castellana, Mr. Victor de Satode",male,18,1,0,PC 17758,108.9,C65,C 507,1,2,"Quick, Mrs. Frederick Charles (Jane Richards)",female,33,0,2,26360,26,,S 508,1,1,"Bradley, Mr. George (""George Arthur Brayton"")",male,,0,0,111427,26.55,,S @@ -519,41 +519,41 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 518,0,3,"Ryan, Mr. Patrick",male,,0,0,371110,24.15,,Q 519,1,2,"Angle, Mrs. William A (Florence ""Mary"" Agnes Hughes)",female,36,1,0,226875,26,,S 520,0,3,"Pavlovic, Mr. Stefo",male,32,0,0,349242,7.8958,,S -521,1,1,"Perreault, Miss. 
Anne",female,30,0,0,12749,93.5,B73,S +521,1,1,"Perreault, Miss Anne",female,30,0,0,12749,93.5,B73,S 522,0,3,"Vovk, Mr. Janko",male,22,0,0,349252,7.8958,,S 523,0,3,"Lahoud, Mr. Sarkis",male,,0,0,2624,7.225,,C 524,1,1,"Hippach, Mrs. Louis Albert (Ida Sophia Fischer)",female,44,0,1,111361,57.9792,B18,C 525,0,3,"Kassem, Mr. Fared",male,,0,0,2700,7.2292,,C 526,0,3,"Farrell, Mr. James",male,40.5,0,0,367232,7.75,,Q -527,1,2,"Ridsdale, Miss. Lucy",female,50,0,0,W./C. 14258,10.5,,S +527,1,2,"Ridsdale, Miss Lucy",female,50,0,0,W./C. 14258,10.5,,S 528,0,1,"Farthing, Mr. John",male,,0,0,PC 17483,221.7792,C95,S 529,0,3,"Salonen, Mr. Johan Werner",male,39,0,0,3101296,7.925,,S 530,0,2,"Hocking, Mr. Richard George",male,23,2,1,29104,11.5,,S -531,1,2,"Quick, Miss. Phyllis May",female,2,1,1,26360,26,,S +531,1,2,"Quick, Miss Phyllis May",female,2,1,1,26360,26,,S 532,0,3,"Toufik, Mr. Nakli",male,,0,0,2641,7.2292,,C 533,0,3,"Elias, Mr. Joseph Jr",male,17,1,1,2690,7.2292,,C 534,1,3,"Peter, Mrs. Catherine (Catherine Rizk)",female,,0,2,2668,22.3583,,C -535,0,3,"Cacic, Miss. Marija",female,30,0,0,315084,8.6625,,S -536,1,2,"Hart, Miss. Eva Miriam",female,7,0,2,F.C.C. 13529,26.25,,S +535,0,3,"Cacic, Miss Marija",female,30,0,0,315084,8.6625,,S +536,1,2,"Hart, Miss Eva Miriam",female,7,0,2,F.C.C. 13529,26.25,,S 537,0,1,"Butt, Major. Archibald Willingham",male,45,0,0,113050,26.55,B38,S -538,1,1,"LeRoy, Miss. Bertha",female,30,0,0,PC 17761,106.425,,C +538,1,1,"LeRoy, Miss Bertha",female,30,0,0,PC 17761,106.425,,C 539,0,3,"Risien, Mr. Samuel Beard",male,,0,0,364498,14.5,,S -540,1,1,"Frolicher, Miss. Hedwig Margaritha",female,22,0,2,13568,49.5,B39,C -541,1,1,"Crosby, Miss. Harriet R",female,36,0,2,WE/P 5735,71,B22,S -542,0,3,"Andersson, Miss. Ingeborg Constanzia",female,9,4,2,347082,31.275,,S -543,0,3,"Andersson, Miss. Sigrid Elisabeth",female,11,4,2,347082,31.275,,S +540,1,1,"Frolicher, Miss Hedwig Margaritha",female,22,0,2,13568,49.5,B39,C +541,1,1,"Crosby, Miss Harriet R",female,36,0,2,WE/P 5735,71,B22,S +542,0,3,"Andersson, Miss Ingeborg Constanzia",female,9,4,2,347082,31.275,,S +543,0,3,"Andersson, Miss Sigrid Elisabeth",female,11,4,2,347082,31.275,,S 544,1,2,"Beane, Mr. Edward",male,32,1,0,2908,26,,S 545,0,1,"Douglas, Mr. Walter Donald",male,50,1,0,PC 17761,106.425,C86,C 546,0,1,"Nicholson, Mr. Arthur Ernest",male,64,0,0,693,26,,S 547,1,2,"Beane, Mrs. Edward (Ethel Clarke)",female,19,1,0,2908,26,,S 548,1,2,"Padro y Manent, Mr. Julian",male,,0,0,SC/PARIS 2146,13.8625,,C 549,0,3,"Goldsmith, Mr. Frank John",male,33,1,1,363291,20.525,,S -550,1,2,"Davies, Master. John Morgan Jr",male,8,1,1,C.A. 33112,36.75,,S +550,1,2,"Davies, Master John Morgan Jr",male,8,1,1,C.A. 33112,36.75,,S 551,1,1,"Thayer, Mr. John Borland Jr",male,17,0,2,17421,110.8833,C70,C 552,0,2,"Sharp, Mr. Percival James R",male,27,0,0,244358,26,,S 553,0,3,"O'Brien, Mr. Timothy",male,,0,0,330979,7.8292,,Q 554,1,3,"Leeni, Mr. Fahim (""Philip Zenni"")",male,22,0,0,2620,7.225,,C -555,1,3,"Ohman, Miss. Velin",female,22,0,0,347085,7.775,,S +555,1,3,"Ohman, Miss Velin",female,22,0,0,347085,7.775,,S 556,0,1,"Wright, Mr. George",male,62,0,0,113807,26.55,,S 557,1,1,"Duff Gordon, Lady. (Lucille Christiana Sutherland) (""Mrs Morgan"")",female,48,1,0,11755,39.6,A16,C 558,0,1,"Robbins, Mr. Victor",male,,0,0,PC 17757,227.525,,C @@ -563,7 +563,7 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 562,0,3,"Sivic, Mr. Husein",male,40,0,0,349251,7.8958,,S 563,0,2,"Norman, Mr. Robert Douglas",male,28,0,0,218629,13.5,,S 564,0,3,"Simmons, Mr. 
John",male,,0,0,SOTON/OQ 392082,8.05,,S -565,0,3,"Meanwell, Miss. (Marion Ogden)",female,,0,0,SOTON/O.Q. 392087,8.05,,S +565,0,3,"Meanwell, Miss (Marion Ogden)",female,,0,0,SOTON/O.Q. 392087,8.05,,S 566,0,3,"Davies, Mr. Alfred J",male,24,2,0,A/4 48871,24.15,,S 567,0,3,"Stoytcheff, Mr. Ilia",male,19,0,0,349205,7.8958,,S 568,0,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29,0,4,349909,21.075,,S @@ -572,19 +572,19 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 571,1,2,"Harris, Mr. George",male,62,0,0,S.W./PP 752,10.5,,S 572,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53,2,0,11769,51.4792,C101,S 573,1,1,"Flynn, Mr. John Irwin (""Irving"")",male,36,0,0,PC 17474,26.3875,E25,S -574,1,3,"Kelly, Miss. Mary",female,,0,0,14312,7.75,,Q +574,1,3,"Kelly, Miss Mary",female,,0,0,14312,7.75,,Q 575,0,3,"Rush, Mr. Alfred George John",male,16,0,0,A/4. 20589,8.05,,S 576,0,3,"Patchett, Mr. George",male,19,0,0,358585,14.5,,S -577,1,2,"Garside, Miss. Ethel",female,34,0,0,243880,13,,S +577,1,2,"Garside, Miss Ethel",female,34,0,0,243880,13,,S 578,1,1,"Silvey, Mrs. William Baird (Alice Munger)",female,39,1,0,13507,55.9,E44,S 579,0,3,"Caram, Mrs. Joseph (Maria Elias)",female,,1,0,2689,14.4583,,C 580,1,3,"Jussila, Mr. Eiriik",male,32,0,0,STON/O 2. 3101286,7.925,,S -581,1,2,"Christy, Miss. Julie Rachel",female,25,1,1,237789,30,,S +581,1,2,"Christy, Miss Julie Rachel",female,25,1,1,237789,30,,S 582,1,1,"Thayer, Mrs. John Borland (Marian Longstreth Morris)",female,39,1,1,17421,110.8833,C68,C 583,0,2,"Downton, Mr. William James",male,54,0,0,28403,26,,S 584,0,1,"Ross, Mr. John Hugo",male,36,0,0,13049,40.125,A10,C 585,0,3,"Paulner, Mr. Uscher",male,,0,0,3411,8.7125,,C -586,1,1,"Taussig, Miss. Ruth",female,18,0,2,110413,79.65,E68,S +586,1,1,"Taussig, Miss Ruth",female,18,0,2,110413,79.65,E68,S 587,0,2,"Jarvis, Mr. John Denzil",male,47,0,0,237565,15,,S 588,1,1,"Frolicher-Stehli, Mr. Maxmillian",male,60,1,1,13567,79.2,B41,C 589,0,3,"Gilinski, Mr. Eliezer",male,22,0,0,14973,8.05,,S @@ -592,10 +592,10 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 591,0,3,"Rintamaki, Mr. Matti",male,35,0,0,STON/O 2. 3101273,7.125,,S 592,1,1,"Stephenson, Mrs. Walter Bertram (Martha Eustis)",female,52,1,0,36947,78.2667,D20,C 593,0,3,"Elsbury, Mr. William James",male,47,0,0,A/5 3902,7.25,,S -594,0,3,"Bourke, Miss. Mary",female,,0,2,364848,7.75,,Q +594,0,3,"Bourke, Miss Mary",female,,0,2,364848,7.75,,Q 595,0,2,"Chapman, Mr. John Henry",male,37,1,0,SC/AH 29037,26,,S 596,0,3,"Van Impe, Mr. Jean Baptiste",male,36,1,1,345773,24.15,,S -597,1,2,"Leitch, Miss. Jessie Wills",female,,0,0,248727,33,,S +597,1,2,"Leitch, Miss Jessie Wills",female,,0,0,248727,33,,S 598,0,3,"Johnson, Mr. Alfred",male,49,0,0,LINE,0,,S 599,0,3,"Boulos, Mr. Hanna",male,,0,0,2664,7.225,,C 600,1,1,"Duff Gordon, Sir. Cosmo Edmund (""Mr Morgan"")",male,49,1,0,PC 17485,56.9292,A20,C @@ -608,16 +608,16 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 607,0,3,"Karaic, Mr. Milan",male,30,0,0,349246,7.8958,,S 608,1,1,"Daniel, Mr. Robert Williams",male,27,0,0,113804,30.5,,S 609,1,2,"Laroche, Mrs. Joseph (Juliette Marie Louise Lafargue)",female,22,1,2,SC/Paris 2123,41.5792,,C -610,1,1,"Shutes, Miss. Elizabeth W",female,40,0,0,PC 17582,153.4625,C125,S +610,1,1,"Shutes, Miss Elizabeth W",female,40,0,0,PC 17582,153.4625,C125,S 611,0,3,"Andersson, Mrs. Anders Johan (Alfrida Konstantia Brogren)",female,39,1,5,347082,31.275,,S 612,0,3,"Jardin, Mr. 
Jose Neto",male,,0,0,SOTON/O.Q. 3101305,7.05,,S -613,1,3,"Murphy, Miss. Margaret Jane",female,,1,0,367230,15.5,,Q +613,1,3,"Murphy, Miss Margaret Jane",female,,1,0,367230,15.5,,Q 614,0,3,"Horgan, Mr. John",male,,0,0,370377,7.75,,Q 615,0,3,"Brocklebank, Mr. William Alfred",male,35,0,0,364512,8.05,,S -616,1,2,"Herman, Miss. Alice",female,24,1,2,220845,65,,S +616,1,2,"Herman, Miss Alice",female,24,1,2,220845,65,,S 617,0,3,"Danbom, Mr. Ernst Gilbert",male,34,1,1,347080,14.4,,S 618,0,3,"Lobb, Mrs. William Arthur (Cordelia K Stanlick)",female,26,1,0,A/5. 3336,16.1,,S -619,1,2,"Becker, Miss. Marion Louise",female,4,2,1,230136,39,F4,S +619,1,2,"Becker, Miss Marion Louise",female,4,2,1,230136,39,F4,S 620,0,2,"Gavey, Mr. Lawrence",male,26,0,0,31028,10.5,,S 621,0,3,"Yasbeck, Mr. Antoni",male,27,1,0,2659,14.4542,,C 622,1,1,"Kimball, Mr. Edwin Nelson Jr",male,42,1,0,11753,52.5542,D19,S @@ -626,34 +626,34 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 625,0,3,"Bowen, Mr. David John ""Dai""",male,21,0,0,54636,16.1,,S 626,0,1,"Sutton, Mr. Frederick",male,61,0,0,36963,32.3208,D50,S 627,0,2,"Kirkland, Rev. Charles Leonard",male,57,0,0,219533,12.35,,Q -628,1,1,"Longley, Miss. Gretchen Fiske",female,21,0,0,13502,77.9583,D9,S +628,1,1,"Longley, Miss Gretchen Fiske",female,21,0,0,13502,77.9583,D9,S 629,0,3,"Bostandyeff, Mr. Guentcho",male,26,0,0,349224,7.8958,,S 630,0,3,"O'Connell, Mr. Patrick D",male,,0,0,334912,7.7333,,Q 631,1,1,"Barkworth, Mr. Algernon Henry Wilson",male,80,0,0,27042,30,A23,S 632,0,3,"Lundahl, Mr. Johan Svensson",male,51,0,0,347743,7.0542,,S 633,1,1,"Stahelin-Maeglin, Dr. Max",male,32,0,0,13214,30.5,B50,C 634,0,1,"Parr, Mr. William Henry Marsh",male,,0,0,112052,0,,S -635,0,3,"Skoog, Miss. Mabel",female,9,3,2,347088,27.9,,S -636,1,2,"Davis, Miss. Mary",female,28,0,0,237668,13,,S +635,0,3,"Skoog, Miss Mabel",female,9,3,2,347088,27.9,,S +636,1,2,"Davis, Miss Mary",female,28,0,0,237668,13,,S 637,0,3,"Leinonen, Mr. Antti Gustaf",male,32,0,0,STON/O 2. 3101292,7.925,,S 638,0,2,"Collyer, Mr. Harvey",male,31,1,1,C.A. 31921,26.25,,S 639,0,3,"Panula, Mrs. Juha (Maria Emilia Ojala)",female,41,0,5,3101295,39.6875,,S 640,0,3,"Thorneycroft, Mr. Percival",male,,1,0,376564,16.1,,S 641,0,3,"Jensen, Mr. Hans Peder",male,20,0,0,350050,7.8542,,S 642,1,1,"Sagesser, Mlle. Emma",female,24,0,0,PC 17477,69.3,B35,C -643,0,3,"Skoog, Miss. Margit Elizabeth",female,2,3,2,347088,27.9,,S +643,0,3,"Skoog, Miss Margit Elizabeth",female,2,3,2,347088,27.9,,S 644,1,3,"Foo, Mr. Choong",male,,0,0,1601,56.4958,,S -645,1,3,"Baclini, Miss. Eugenie",female,0.75,2,1,2666,19.2583,,C +645,1,3,"Baclini, Miss Eugenie",female,0.75,2,1,2666,19.2583,,C 646,1,1,"Harper, Mr. Henry Sleeper",male,48,1,0,PC 17572,76.7292,D33,C 647,0,3,"Cor, Mr. Liudevit",male,19,0,0,349231,7.8958,,S 648,1,1,"Simonius-Blumer, Col. Oberst Alfons",male,56,0,0,13213,35.5,A26,C 649,0,3,"Willey, Mr. Edward",male,,0,0,S.O./P.P. 751,7.55,,S -650,1,3,"Stanley, Miss. Amy Zillah Elsie",female,23,0,0,CA. 2314,7.55,,S +650,1,3,"Stanley, Miss Amy Zillah Elsie",female,23,0,0,CA. 2314,7.55,,S 651,0,3,"Mitkoff, Mr. Mito",male,,0,0,349221,7.8958,,S -652,1,2,"Doling, Miss. Elsie",female,18,0,1,231919,23,,S +652,1,2,"Doling, Miss Elsie",female,18,0,1,231919,23,,S 653,0,3,"Kalvik, Mr. Johannes Halvorsen",male,21,0,0,8475,8.4333,,S -654,1,3,"O'Leary, Miss. Hanora ""Norah""",female,,0,0,330919,7.8292,,Q -655,0,3,"Hegarty, Miss. 
Hanora ""Nora""",female,18,0,0,365226,6.75,,Q +654,1,3,"O'Leary, Miss Hanora ""Norah""",female,,0,0,330919,7.8292,,Q +655,0,3,"Hegarty, Miss Hanora ""Nora""",female,18,0,0,365226,6.75,,Q 656,0,2,"Hickman, Mr. Leonard Mark",male,24,2,0,S.O.C. 14879,73.5,,S 657,0,3,"Radeff, Mr. Alexander",male,,0,0,349223,7.8958,,S 658,0,3,"Bourke, Mrs. John (Catherine)",female,32,1,1,364849,15.5,,Q @@ -676,10 +676,10 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 675,0,2,"Watson, Mr. Ennis Hastings",male,,0,0,239856,0,,S 676,0,3,"Edvardsson, Mr. Gustaf Hjalmar",male,18,0,0,349912,7.775,,S 677,0,3,"Sawyer, Mr. Frederick Charles",male,24.5,0,0,342826,8.05,,S -678,1,3,"Turja, Miss. Anna Sofia",female,18,0,0,4138,9.8417,,S +678,1,3,"Turja, Miss Anna Sofia",female,18,0,0,4138,9.8417,,S 679,0,3,"Goodwin, Mrs. Frederick (Augusta Tyler)",female,43,1,6,CA 2144,46.9,,S 680,1,1,"Cardeza, Mr. Thomas Drake Martinez",male,36,0,1,PC 17755,512.3292,B51 B53 B55,C -681,0,3,"Peters, Miss. Katie",female,,0,0,330935,8.1375,,Q +681,0,3,"Peters, Miss Katie",female,,0,0,330935,8.1375,,Q 682,1,1,"Hassab, Mr. Hammad",male,27,0,0,PC 17572,76.7292,D49,C 683,0,3,"Olsvigen, Mr. Thor Anderson",male,20,0,0,6563,9.225,,S 684,0,3,"Goodwin, Mr. Charles Edward",male,14,5,2,CA 2144,46.9,,S @@ -688,48 +688,48 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 687,0,3,"Panula, Mr. Jaako Arnold",male,14,4,1,3101295,39.6875,,S 688,0,3,"Dakic, Mr. Branko",male,19,0,0,349228,10.1708,,S 689,0,3,"Fischer, Mr. Eberhard Thelander",male,18,0,0,350036,7.7958,,S -690,1,1,"Madill, Miss. Georgette Alexandra",female,15,0,1,24160,211.3375,B5,S +690,1,1,"Madill, Miss Georgette Alexandra",female,15,0,1,24160,211.3375,B5,S 691,1,1,"Dick, Mr. Albert Adrian",male,31,1,0,17474,57,B20,S -692,1,3,"Karun, Miss. Manca",female,4,0,1,349256,13.4167,,C +692,1,3,"Karun, Miss Manca",female,4,0,1,349256,13.4167,,C 693,1,3,"Lam, Mr. Ali",male,,0,0,1601,56.4958,,S 694,0,3,"Saad, Mr. Khalil",male,25,0,0,2672,7.225,,C 695,0,1,"Weir, Col. John",male,60,0,0,113800,26.55,,S 696,0,2,"Chapman, Mr. Charles Henry",male,52,0,0,248731,13.5,,S 697,0,3,"Kelly, Mr. James",male,44,0,0,363592,8.05,,S -698,1,3,"Mullens, Miss. Katherine ""Katie""",female,,0,0,35852,7.7333,,Q +698,1,3,"Mullens, Miss Katherine ""Katie""",female,,0,0,35852,7.7333,,Q 699,0,1,"Thayer, Mr. John Borland",male,49,1,1,17421,110.8833,C68,C 700,0,3,"Humblen, Mr. Adolf Mathias Nicolai Olsen",male,42,0,0,348121,7.65,F G63,S 701,1,1,"Astor, Mrs. John Jacob (Madeleine Talmadge Force)",female,18,1,0,PC 17757,227.525,C62 C64,C 702,1,1,"Silverthorne, Mr. Spencer Victor",male,35,0,0,PC 17475,26.2875,E24,S -703,0,3,"Barbara, Miss. Saiide",female,18,0,1,2691,14.4542,,C +703,0,3,"Barbara, Miss Saiide",female,18,0,1,2691,14.4542,,C 704,0,3,"Gallagher, Mr. Martin",male,25,0,0,36864,7.7417,,Q 705,0,3,"Hansen, Mr. Henrik Juul",male,26,1,0,350025,7.8542,,S 706,0,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39,0,0,250655,26,,S 707,1,2,"Kelly, Mrs. Florence ""Fannie""",female,45,0,0,223596,13.5,,S 708,1,1,"Calderhead, Mr. Edward Pennington",male,42,0,0,PC 17476,26.2875,E24,S -709,1,1,"Cleaver, Miss. Alice",female,22,0,0,113781,151.55,,S -710,1,3,"Moubarek, Master. Halim Gonios (""William George"")",male,,1,1,2661,15.2458,,C +709,1,1,"Cleaver, Miss Alice",female,22,0,0,113781,151.55,,S +710,1,3,"Moubarek, Master Halim Gonios (""William George"")",male,,1,1,2661,15.2458,,C 711,1,1,"Mayne, Mlle. 
Berthe Antonine (""Mrs de Villiers"")",female,24,0,0,PC 17482,49.5042,C90,C 712,0,1,"Klaber, Mr. Herman",male,,0,0,113028,26.55,C124,S 713,1,1,"Taylor, Mr. Elmer Zebley",male,48,1,0,19996,52,C126,S 714,0,3,"Larsson, Mr. August Viktor",male,29,0,0,7545,9.4833,,S 715,0,2,"Greenberg, Mr. Samuel",male,52,0,0,250647,13,,S 716,0,3,"Soholt, Mr. Peter Andreas Lauritz Andersen",male,19,0,0,348124,7.65,F G73,S -717,1,1,"Endres, Miss. Caroline Louise",female,38,0,0,PC 17757,227.525,C45,C -718,1,2,"Troutt, Miss. Edwina Celia ""Winnie""",female,27,0,0,34218,10.5,E101,S +717,1,1,"Endres, Miss Caroline Louise",female,38,0,0,PC 17757,227.525,C45,C +718,1,2,"Troutt, Miss Edwina Celia ""Winnie""",female,27,0,0,34218,10.5,E101,S 719,0,3,"McEvoy, Mr. Michael",male,,0,0,36568,15.5,,Q 720,0,3,"Johnson, Mr. Malkolm Joackim",male,33,0,0,347062,7.775,,S -721,1,2,"Harper, Miss. Annie Jessie ""Nina""",female,6,0,1,248727,33,,S +721,1,2,"Harper, Miss Annie Jessie ""Nina""",female,6,0,1,248727,33,,S 722,0,3,"Jensen, Mr. Svend Lauritz",male,17,1,0,350048,7.0542,,S 723,0,2,"Gillespie, Mr. William Henry",male,34,0,0,12233,13,,S 724,0,2,"Hodges, Mr. Henry Price",male,50,0,0,250643,13,,S 725,1,1,"Chambers, Mr. Norman Campbell",male,27,1,0,113806,53.1,E8,S 726,0,3,"Oreskovic, Mr. Luka",male,20,0,0,315094,8.6625,,S 727,1,2,"Renouf, Mrs. Peter Henry (Lillian Jefferys)",female,30,3,0,31027,21,,S -728,1,3,"Mannion, Miss. Margareth",female,,0,0,36866,7.7375,,Q +728,1,3,"Mannion, Miss Margareth",female,,0,0,36866,7.7375,,Q 729,0,2,"Bryhl, Mr. Kurt Arnold Gottfrid",male,25,1,0,236853,26,,S -730,0,3,"Ilmakangas, Miss. Pieta Sofia",female,25,1,0,STON/O2. 3101271,7.925,,S -731,1,1,"Allen, Miss. Elisabeth Walton",female,29,0,0,24160,211.3375,B5,S +730,0,3,"Ilmakangas, Miss Pieta Sofia",female,25,1,0,STON/O2. 3101271,7.925,,S +731,1,1,"Allen, Miss Elisabeth Walton",female,29,0,0,24160,211.3375,B5,S 732,0,3,"Hassan, Mr. Houssein G N",male,11,0,0,2699,18.7875,,C 733,0,2,"Knight, Mr. Robert J",male,,0,0,239855,0,,S 734,0,2,"Berriman, Mr. William John",male,23,0,0,28425,13,,S @@ -741,20 +741,20 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 740,0,3,"Nankoff, Mr. Minko",male,,0,0,349218,7.8958,,S 741,1,1,"Hawksford, Mr. Walter James",male,,0,0,16988,30,D45,S 742,0,1,"Cavendish, Mr. Tyrell William",male,36,1,0,19877,78.85,C46,S -743,1,1,"Ryerson, Miss. Susan Parker ""Suzette""",female,21,2,2,PC 17608,262.375,B57 B59 B63 B66,C +743,1,1,"Ryerson, Miss Susan Parker ""Suzette""",female,21,2,2,PC 17608,262.375,B57 B59 B63 B66,C 744,0,3,"McNamee, Mr. Neal",male,24,1,0,376566,16.1,,S 745,1,3,"Stranden, Mr. Juho",male,31,0,0,STON/O 2. 3101288,7.925,,S 746,0,1,"Crosby, Capt. Edward Gifford",male,70,1,1,WE/P 5735,71,B22,S 747,0,3,"Abbott, Mr. Rossmore Edward",male,16,1,1,C.A. 2673,20.25,,S -748,1,2,"Sinkkonen, Miss. Anna",female,30,0,0,250648,13,,S +748,1,2,"Sinkkonen, Miss Anna",female,30,0,0,250648,13,,S 749,0,1,"Marvin, Mr. Daniel Warner",male,19,1,0,113773,53.1,D30,S 750,0,3,"Connaghton, Mr. Michael",male,31,0,0,335097,7.75,,Q -751,1,2,"Wells, Miss. Joan",female,4,1,1,29103,23,,S -752,1,3,"Moor, Master. Meier",male,6,0,1,392096,12.475,E121,S +751,1,2,"Wells, Miss Joan",female,4,1,1,29103,23,,S +752,1,3,"Moor, Master Meier",male,6,0,1,392096,12.475,E121,S 753,0,3,"Vande Velde, Mr. Johannes Joseph",male,33,0,0,345780,9.5,,S 754,0,3,"Jonkoff, Mr. Lalio",male,23,0,0,349204,7.8958,,S 755,1,2,"Herman, Mrs. Samuel (Jane Laver)",female,48,1,2,220845,65,,S -756,1,2,"Hamalainen, Master. 
Viljo",male,0.67,1,1,250649,14.5,,S +756,1,2,"Hamalainen, Master Viljo",male,0.67,1,1,250649,14.5,,S 757,0,3,"Carlsson, Mr. August Sigfrid",male,28,0,0,350042,7.7958,,S 758,0,2,"Bailey, Mr. Percy Andrew",male,18,0,0,29108,11.5,,S 759,0,3,"Theobald, Mr. Thomas Leonard",male,34,0,0,363294,8.05,,S @@ -766,7 +766,7 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 765,0,3,"Eklund, Mr. Hans Linus",male,16,0,0,347074,7.775,,S 766,1,1,"Hogeboom, Mrs. John C (Anna Andrews)",female,51,1,0,13502,77.9583,D11,S 767,0,1,"Brewe, Dr. Arthur Jackson",male,,0,0,112379,39.6,,C -768,0,3,"Mangan, Miss. Mary",female,30.5,0,0,364850,7.75,,Q +768,0,3,"Mangan, Miss Mary",female,30.5,0,0,364850,7.75,,Q 769,0,3,"Moran, Mr. Daniel J",male,,1,0,371110,24.15,,Q 770,0,3,"Gronnestad, Mr. Daniel Danielsen",male,32,0,0,8471,8.3625,,S 771,0,3,"Lievens, Mr. Rene Aime",male,24,0,0,345781,9.5,,S @@ -776,22 +776,22 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 775,1,2,"Hocking, Mrs. Elizabeth (Eliza Needs)",female,54,1,3,29105,23,,S 776,0,3,"Myhrman, Mr. Pehr Fabian Oliver Malkolm",male,18,0,0,347078,7.75,,S 777,0,3,"Tobin, Mr. Roger",male,,0,0,383121,7.75,F38,Q -778,1,3,"Emanuel, Miss. Virginia Ethel",female,5,0,0,364516,12.475,,S +778,1,3,"Emanuel, Miss Virginia Ethel",female,5,0,0,364516,12.475,,S 779,0,3,"Kilgannon, Mr. Thomas J",male,,0,0,36865,7.7375,,Q 780,1,1,"Robert, Mrs. Edward Scott (Elisabeth Walton McMillan)",female,43,0,1,24160,211.3375,B3,S -781,1,3,"Ayoub, Miss. Banoura",female,13,0,0,2687,7.2292,,C +781,1,3,"Ayoub, Miss Banoura",female,13,0,0,2687,7.2292,,C 782,1,1,"Dick, Mrs. Albert Adrian (Vera Gillespie)",female,17,1,0,17474,57,B20,S 783,0,1,"Long, Mr. Milton Clyde",male,29,0,0,113501,30,D6,S 784,0,3,"Johnston, Mr. Andrew G",male,,1,2,W./C. 6607,23.45,,S 785,0,3,"Ali, Mr. William",male,25,0,0,SOTON/O.Q. 3101312,7.05,,S 786,0,3,"Harmer, Mr. Abraham (David Lishin)",male,25,0,0,374887,7.25,,S -787,1,3,"Sjoblom, Miss. Anna Sofia",female,18,0,0,3101265,7.4958,,S -788,0,3,"Rice, Master. George Hugh",male,8,4,1,382652,29.125,,Q -789,1,3,"Dean, Master. Bertram Vere",male,1,1,2,C.A. 2315,20.575,,S +787,1,3,"Sjoblom, Miss Anna Sofia",female,18,0,0,3101265,7.4958,,S +788,0,3,"Rice, Master George Hugh",male,8,4,1,382652,29.125,,Q +789,1,3,"Dean, Master Bertram Vere",male,1,1,2,C.A. 2315,20.575,,S 790,0,1,"Guggenheim, Mr. Benjamin",male,46,0,0,PC 17593,79.2,B82 B84,C 791,0,3,"Keane, Mr. Andrew ""Andy""",male,,0,0,12460,7.75,,Q 792,0,2,"Gaskell, Mr. Alfred",male,16,0,0,239865,26,,S -793,0,3,"Sage, Miss. Stella Anna",female,,8,2,CA. 2343,69.55,,S +793,0,3,"Sage, Miss Stella Anna",female,,8,2,CA. 2343,69.55,,S 794,0,1,"Hoyt, Mr. William Fisher",male,,0,0,PC 17600,30.6958,,C 795,0,3,"Dantcheff, Mr. Ristiu",male,25,0,0,349203,7.8958,,S 796,0,2,"Otter, Mr. Richard",male,39,0,0,28213,13,,S @@ -801,47 +801,47 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 800,0,3,"Van Impe, Mrs. Jean Baptiste (Rosalie Paula Govaert)",female,30,1,1,345773,24.15,,S 801,0,2,"Ponesell, Mr. Martin",male,34,0,0,250647,13,,S 802,1,2,"Collyer, Mrs. Harvey (Charlotte Annie Tate)",female,31,1,1,C.A. 31921,26.25,,S -803,1,1,"Carter, Master. William Thornton II",male,11,1,2,113760,120,B96 B98,S -804,1,3,"Thomas, Master. Assad Alexander",male,0.42,0,1,2625,8.5167,,C +803,1,1,"Carter, Master William Thornton II",male,11,1,2,113760,120,B96 B98,S +804,1,3,"Thomas, Master Assad Alexander",male,0.42,0,1,2625,8.5167,,C 805,1,3,"Hedman, Mr. 
Oskar Arvid",male,27,0,0,347089,6.975,,S 806,0,3,"Johansson, Mr. Karl Johan",male,31,0,0,347063,7.775,,S 807,0,1,"Andrews, Mr. Thomas Jr",male,39,0,0,112050,0,A36,S -808,0,3,"Pettersson, Miss. Ellen Natalia",female,18,0,0,347087,7.775,,S +808,0,3,"Pettersson, Miss Ellen Natalia",female,18,0,0,347087,7.775,,S 809,0,2,"Meyer, Mr. August",male,39,0,0,248723,13,,S 810,1,1,"Chambers, Mrs. Norman Campbell (Bertha Griggs)",female,33,1,0,113806,53.1,E8,S 811,0,3,"Alexander, Mr. William",male,26,0,0,3474,7.8875,,S 812,0,3,"Lester, Mr. James",male,39,0,0,A/4 48871,24.15,,S 813,0,2,"Slemen, Mr. Richard James",male,35,0,0,28206,10.5,,S -814,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6,4,2,347082,31.275,,S +814,0,3,"Andersson, Miss Ebba Iris Alfrida",female,6,4,2,347082,31.275,,S 815,0,3,"Tomlin, Mr. Ernest Portage",male,30.5,0,0,364499,8.05,,S 816,0,1,"Fry, Mr. Richard",male,,0,0,112058,0,B102,S -817,0,3,"Heininen, Miss. Wendla Maria",female,23,0,0,STON/O2. 3101290,7.925,,S +817,0,3,"Heininen, Miss Wendla Maria",female,23,0,0,STON/O2. 3101290,7.925,,S 818,0,2,"Mallet, Mr. Albert",male,31,1,1,S.C./PARIS 2079,37.0042,,C 819,0,3,"Holm, Mr. John Fredrik Alexander",male,43,0,0,C 7075,6.45,,S -820,0,3,"Skoog, Master. Karl Thorsten",male,10,3,2,347088,27.9,,S +820,0,3,"Skoog, Master Karl Thorsten",male,10,3,2,347088,27.9,,S 821,1,1,"Hays, Mrs. Charles Melville (Clara Jennings Gregg)",female,52,1,1,12749,93.5,B69,S 822,1,3,"Lulic, Mr. Nikola",male,27,0,0,315098,8.6625,,S 823,0,1,"Reuchlin, Jonkheer. John George",male,38,0,0,19972,0,,S 824,1,3,"Moor, Mrs. (Beila)",female,27,0,1,392096,12.475,E121,S -825,0,3,"Panula, Master. Urho Abraham",male,2,4,1,3101295,39.6875,,S +825,0,3,"Panula, Master Urho Abraham",male,2,4,1,3101295,39.6875,,S 826,0,3,"Flynn, Mr. John",male,,0,0,368323,6.95,,Q 827,0,3,"Lam, Mr. Len",male,,0,0,1601,56.4958,,S -828,1,2,"Mallet, Master. Andre",male,1,0,2,S.C./PARIS 2079,37.0042,,C +828,1,2,"Mallet, Master Andre",male,1,0,2,S.C./PARIS 2079,37.0042,,C 829,1,3,"McCormack, Mr. Thomas Joseph",male,,0,0,367228,7.75,,Q 830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62,0,0,113572,80,B28, 831,1,3,"Yasbeck, Mrs. Antoni (Selini Alexander)",female,15,1,0,2659,14.4542,,C -832,1,2,"Richards, Master. George Sibley",male,0.83,1,1,29106,18.75,,S +832,1,2,"Richards, Master George Sibley",male,0.83,1,1,29106,18.75,,S 833,0,3,"Saad, Mr. Amin",male,,0,0,2671,7.2292,,C 834,0,3,"Augustsson, Mr. Albert",male,23,0,0,347468,7.8542,,S 835,0,3,"Allum, Mr. Owen George",male,18,0,0,2223,8.3,,S -836,1,1,"Compton, Miss. Sara Rebecca",female,39,1,1,PC 17756,83.1583,E49,C +836,1,1,"Compton, Miss Sara Rebecca",female,39,1,1,PC 17756,83.1583,E49,C 837,0,3,"Pasic, Mr. Jakob",male,21,0,0,315097,8.6625,,S 838,0,3,"Sirota, Mr. Maurice",male,,0,0,392092,8.05,,S 839,1,3,"Chip, Mr. Chang",male,32,0,0,1601,56.4958,,S 840,1,1,"Marechal, Mr. Pierre",male,,0,0,11774,29.7,C47,C 841,0,3,"Alhomaki, Mr. Ilmari Rudolf",male,20,0,0,SOTON/O2 3101287,7.925,,S 842,0,2,"Mudd, Mr. Thomas Charles",male,16,0,0,S.O./P.P. 3,10.5,,S -843,1,1,"Serepeca, Miss. Augusta",female,30,0,0,113798,31,,C +843,1,1,"Serepeca, Miss Augusta",female,30,0,0,113798,31,,C 844,0,3,"Lemberopolous, Mr. Peter L",male,34.5,0,0,2683,6.4375,,C 845,0,3,"Culumovic, Mr. Jeso",male,17,0,0,315090,8.6625,,S 846,0,3,"Abbing, Mr. Anthony",male,42,0,0,C.A. 5547,7.55,,S @@ -849,10 +849,10 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 848,0,3,"Markoff, Mr. Marin",male,35,0,0,349213,7.8958,,C 849,0,2,"Harper, Rev. 
John",male,28,0,1,248727,33,,S 850,1,1,"Goldenberg, Mrs. Samuel L (Edwiga Grabowska)",female,,1,0,17453,89.1042,C92,C -851,0,3,"Andersson, Master. Sigvard Harald Elias",male,4,4,2,347082,31.275,,S +851,0,3,"Andersson, Master Sigvard Harald Elias",male,4,4,2,347082,31.275,,S 852,0,3,"Svensson, Mr. Johan",male,74,0,0,347060,7.775,,S -853,0,3,"Boulos, Miss. Nourelain",female,9,1,1,2678,15.2458,,C -854,1,1,"Lines, Miss. Mary Conover",female,16,0,1,PC 17592,39.4,D28,S +853,0,3,"Boulos, Miss Nourelain",female,9,1,1,2678,15.2458,,C +854,1,1,"Lines, Miss Mary Conover",female,16,0,1,PC 17592,39.4,D28,S 855,0,2,"Carter, Mrs. Ernest Courtenay (Lilian Hughes)",female,44,1,0,244252,26,,S 856,1,3,"Aks, Mrs. Sam (Leah Rosen)",female,18,0,1,392091,9.35,,S 857,1,1,"Wick, Mrs. George Dennick (Mary Hitchcock)",female,45,1,1,36928,164.8667,,S @@ -862,31 +862,31 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 861,0,3,"Hansen, Mr. Claus Peter",male,41,2,0,350026,14.1083,,S 862,0,2,"Giles, Mr. Frederick Edward",male,21,1,0,28134,11.5,,S 863,1,1,"Swift, Mrs. Frederick Joel (Margaret Welles Barron)",female,48,0,0,17466,25.9292,D17,S -864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.55,,S +864,0,3,"Sage, Miss Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.55,,S 865,0,2,"Gill, Mr. John William",male,24,0,0,233866,13,,S 866,1,2,"Bystrom, Mrs. (Karolina)",female,42,0,0,236852,13,,S -867,1,2,"Duran y More, Miss. Asuncion",female,27,1,0,SC/PARIS 2149,13.8583,,C +867,1,2,"Duran y More, Miss Asuncion",female,27,1,0,SC/PARIS 2149,13.8583,,C 868,0,1,"Roebling, Mr. Washington Augustus II",male,31,0,0,PC 17590,50.4958,A24,S 869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5,,S -870,1,3,"Johnson, Master. Harold Theodor",male,4,1,1,347742,11.1333,,S +870,1,3,"Johnson, Master Harold Theodor",male,4,1,1,347742,11.1333,,S 871,0,3,"Balkic, Mr. Cerin",male,26,0,0,349248,7.8958,,S 872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47,1,1,11751,52.5542,D35,S 873,0,1,"Carlsson, Mr. Frans Olof",male,33,0,0,695,5,B51 B53 B55,S 874,0,3,"Vander Cruyssen, Mr. Victor",male,47,0,0,345765,9,,S 875,1,2,"Abelson, Mrs. Samuel (Hannah Wizosky)",female,28,1,0,P/PP 3381,24,,C -876,1,3,"Najib, Miss. Adele Kiamie ""Jane""",female,15,0,0,2667,7.225,,C +876,1,3,"Najib, Miss Adele Kiamie ""Jane""",female,15,0,0,2667,7.225,,C 877,0,3,"Gustafsson, Mr. Alfred Ossian",male,20,0,0,7534,9.8458,,S 878,0,3,"Petroff, Mr. Nedelio",male,19,0,0,349212,7.8958,,S 879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S 880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56,0,1,11767,83.1583,C50,C 881,1,2,"Shelley, Mrs. William (Imanita Parrish Hall)",female,25,0,1,230433,26,,S 882,0,3,"Markun, Mr. Johann",male,33,0,0,349257,7.8958,,S -883,0,3,"Dahlberg, Miss. Gerda Ulrika",female,22,0,0,7552,10.5167,,S +883,0,3,"Dahlberg, Miss Gerda Ulrika",female,22,0,0,7552,10.5167,,S 884,0,2,"Banfield, Mr. Frederick James",male,28,0,0,C.A./SOTON 34068,10.5,,S 885,0,3,"Sutehall, Mr. Henry Jr",male,25,0,0,SOTON/OQ 392076,7.05,,S 886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39,0,5,382652,29.125,,Q 887,0,2,"Montvila, Rev. Juozas",male,27,0,0,211536,13,,S -888,1,1,"Graham, Miss. Margaret Edith",female,19,0,0,112053,30,B42,S -889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S +888,1,1,"Graham, Miss Margaret Edith",female,19,0,0,112053,30,B42,S +889,0,3,"Johnston, Miss Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S 890,1,1,"Behr, Mr. 
Karl Howell",male,26,0,0,111369,30,C148,C 891,0,3,"Dooley, Mr. Patrick",male,32,0,0,370376,7.75,,Q diff --git a/doc/make.py b/doc/make.py index 2583242786fc8..9542563dc037b 100755 --- a/doc/make.py +++ b/doc/make.py @@ -11,6 +11,7 @@ $ python make.py html $ python make.py latex """ + import argparse import csv import importlib @@ -113,7 +114,7 @@ def _run_os(*args) -> None: Examples -------- - >>> DocBuilder()._run_os('python', '--version') + >>> DocBuilder()._run_os("python", "--version") """ subprocess.check_call(args, stdout=sys.stdout, stderr=sys.stderr) @@ -129,7 +130,7 @@ def _sphinx_build(self, kind: str): Examples -------- - >>> DocBuilder(num_jobs=4)._sphinx_build('html') + >>> DocBuilder(num_jobs=4)._sphinx_build("html") """ if kind not in ("html", "latex", "linkcheck"): raise ValueError(f"kind must be html, latex or linkcheck, not {kind}") @@ -233,7 +234,7 @@ def html(self): ret_code = self._sphinx_build("html") zip_fname = os.path.join(BUILD_PATH, "html", "pandas.zip") if os.path.exists(zip_fname): - os.remove(zip_fname) + os.remove(zip_fname) # noqa: TID251 if ret_code == 0: if self.single_doc_html is not None: @@ -259,8 +260,7 @@ def latex(self, force=False): for i in range(3): self._run_os("pdflatex", "-interaction=nonstopmode", "pandas.tex") raise SystemExit( - "You should check the file " - '"build/latex/pandas.pdf" for problems.' + 'You should check the file "build/latex/pandas.pdf" for problems.' ) self._run_os("make") return ret_code @@ -285,7 +285,7 @@ def zip_html(self) -> None: """ zip_fname = os.path.join(BUILD_PATH, "html", "pandas.zip") if os.path.exists(zip_fname): - os.remove(zip_fname) + os.remove(zip_fname) # noqa: TID251 dirname = os.path.join(BUILD_PATH, "html") fnames = os.listdir(dirname) os.chdir(dirname) @@ -342,8 +342,7 @@ def main(): dest="verbosity", default=0, help=( - "increase verbosity (can be repeated), " - "passed to the sphinx build command" + "increase verbosity (can be repeated), passed to the sphinx build command" ), ) argparser.add_argument( diff --git a/doc/redirects.csv b/doc/redirects.csv index 27b41da63c513..c11e4e242f128 100644 --- a/doc/redirects.csv +++ b/doc/redirects.csv @@ -118,7 +118,6 @@ generated/pandas.api.types.is_int64_dtype,../reference/api/pandas.api.types.is_i generated/pandas.api.types.is_integer_dtype,../reference/api/pandas.api.types.is_integer_dtype generated/pandas.api.types.is_integer,../reference/api/pandas.api.types.is_integer generated/pandas.api.types.is_interval_dtype,../reference/api/pandas.api.types.is_interval_dtype -generated/pandas.api.types.is_interval,../reference/api/pandas.api.types.is_interval generated/pandas.api.types.is_iterator,../reference/api/pandas.api.types.is_iterator generated/pandas.api.types.is_list_like,../reference/api/pandas.api.types.is_list_like generated/pandas.api.types.is_named_tuple,../reference/api/pandas.api.types.is_named_tuple @@ -239,7 +238,6 @@ generated/pandas.core.resample.Resampler.backfill,../reference/api/pandas.core.r generated/pandas.core.resample.Resampler.bfill,../reference/api/pandas.core.resample.Resampler.bfill generated/pandas.core.resample.Resampler.count,../reference/api/pandas.core.resample.Resampler.count generated/pandas.core.resample.Resampler.ffill,../reference/api/pandas.core.resample.Resampler.ffill -generated/pandas.core.resample.Resampler.fillna,../reference/api/pandas.core.resample.Resampler.fillna generated/pandas.core.resample.Resampler.first,../reference/api/pandas.core.resample.Resampler.first 
generated/pandas.core.resample.Resampler.get_group,../reference/api/pandas.core.resample.Resampler.get_group generated/pandas.core.resample.Resampler.groups,../reference/api/pandas.core.resample.Resampler.groups @@ -312,7 +310,6 @@ generated/pandas.DataFrame.align,../reference/api/pandas.DataFrame.align generated/pandas.DataFrame.all,../reference/api/pandas.DataFrame.all generated/pandas.DataFrame.any,../reference/api/pandas.DataFrame.any generated/pandas.DataFrame.apply,../reference/api/pandas.DataFrame.apply -generated/pandas.DataFrame.applymap,../reference/api/pandas.DataFrame.applymap generated/pandas.DataFrame.as_blocks,../reference/api/pandas.DataFrame.as_blocks generated/pandas.DataFrame.asfreq,../reference/api/pandas.DataFrame.asfreq generated/pandas.DataFrame.as_matrix,../reference/api/pandas.DataFrame.as_matrix @@ -325,7 +322,6 @@ generated/pandas.DataFrame.axes,../reference/api/pandas.DataFrame.axes generated/pandas.DataFrame.between_time,../reference/api/pandas.DataFrame.between_time generated/pandas.DataFrame.bfill,../reference/api/pandas.DataFrame.bfill generated/pandas.DataFrame.blocks,../reference/api/pandas.DataFrame.blocks -generated/pandas.DataFrame.bool,../reference/api/pandas.DataFrame.bool generated/pandas.DataFrame.boxplot,../reference/api/pandas.DataFrame.boxplot generated/pandas.DataFrame.clip,../reference/api/pandas.DataFrame.clip generated/pandas.DataFrame.clip_lower,../reference/api/pandas.DataFrame.clip_lower @@ -484,7 +480,6 @@ generated/pandas.DataFrame.style,../reference/api/pandas.DataFrame.style generated/pandas.DataFrame.sub,../reference/api/pandas.DataFrame.sub generated/pandas.DataFrame.subtract,../reference/api/pandas.DataFrame.subtract generated/pandas.DataFrame.sum,../reference/api/pandas.DataFrame.sum -generated/pandas.DataFrame.swapaxes,../reference/api/pandas.DataFrame.swapaxes generated/pandas.DataFrame.swaplevel,../reference/api/pandas.DataFrame.swaplevel generated/pandas.DataFrame.tail,../reference/api/pandas.DataFrame.tail generated/pandas.DataFrame.take,../reference/api/pandas.DataFrame.take @@ -495,7 +490,6 @@ generated/pandas.DataFrame.to_csv,../reference/api/pandas.DataFrame.to_csv generated/pandas.DataFrame.to_dict,../reference/api/pandas.DataFrame.to_dict generated/pandas.DataFrame.to_excel,../reference/api/pandas.DataFrame.to_excel generated/pandas.DataFrame.to_feather,../reference/api/pandas.DataFrame.to_feather -generated/pandas.DataFrame.to_gbq,../reference/api/pandas.DataFrame.to_gbq generated/pandas.DataFrame.to_hdf,../reference/api/pandas.DataFrame.to_hdf generated/pandas.DataFrame.to,../reference/api/pandas.DataFrame.to generated/pandas.DataFrame.to_json,../reference/api/pandas.DataFrame.to_json @@ -750,8 +744,6 @@ generated/pandas.Interval.overlaps,../reference/api/pandas.Interval.overlaps generated/pandas.interval_range,../reference/api/pandas.interval_range generated/pandas.Interval.right,../reference/api/pandas.Interval.right generated/pandas.io.formats.style.Styler.apply,../reference/api/pandas.io.formats.style.Styler.apply -generated/pandas.io.formats.style.Styler.applymap,../reference/api/pandas.io.formats.style.Styler.map -generated/pandas.io.formats.style.Styler.applymap_index,../reference/api/pandas.io.formats.style.Styler.map_index generated/pandas.io.formats.style.Styler.background_gradient,../reference/api/pandas.io.formats.style.Styler.background_gradient generated/pandas.io.formats.style.Styler.bar,../reference/api/pandas.io.formats.style.Styler.bar 
generated/pandas.io.formats.style.Styler.clear,../reference/api/pandas.io.formats.style.Styler.clear @@ -890,7 +882,6 @@ generated/pandas.read_csv,../reference/api/pandas.read_csv generated/pandas.read_excel,../reference/api/pandas.read_excel generated/pandas.read_feather,../reference/api/pandas.read_feather generated/pandas.read_fwf,../reference/api/pandas.read_fwf -generated/pandas.read_gbq,../reference/api/pandas.read_gbq generated/pandas.read_hdf,../reference/api/pandas.read_hdf generated/pandas.read,../reference/api/pandas.read generated/pandas.read_json,../reference/api/pandas.read_json @@ -932,7 +923,6 @@ generated/pandas.Series.between,../reference/api/pandas.Series.between generated/pandas.Series.between_time,../reference/api/pandas.Series.between_time generated/pandas.Series.bfill,../reference/api/pandas.Series.bfill generated/pandas.Series.blocks,../reference/api/pandas.Series.blocks -generated/pandas.Series.bool,../reference/api/pandas.Series.bool generated/pandas.Series.cat.add_categories,../reference/api/pandas.Series.cat.add_categories generated/pandas.Series.cat.as_ordered,../reference/api/pandas.Series.cat.as_ordered generated/pandas.Series.cat.as_unordered,../reference/api/pandas.Series.cat.as_unordered @@ -1116,7 +1106,6 @@ generated/pandas.Series.ptp,../reference/api/pandas.Series.ptp generated/pandas.Series.quantile,../reference/api/pandas.Series.quantile generated/pandas.Series.radd,../reference/api/pandas.Series.radd generated/pandas.Series.rank,../reference/api/pandas.Series.rank -generated/pandas.Series.ravel,../reference/api/pandas.Series.ravel generated/pandas.Series.rdiv,../reference/api/pandas.Series.rdiv generated/pandas.Series.rdivmod,../reference/api/pandas.Series.rdivmod generated/pandas.Series.real,../reference/api/pandas.Series.real @@ -1215,7 +1204,6 @@ generated/pandas.Series.str.zfill,../reference/api/pandas.Series.str.zfill generated/pandas.Series.sub,../reference/api/pandas.Series.sub generated/pandas.Series.subtract,../reference/api/pandas.Series.subtract generated/pandas.Series.sum,../reference/api/pandas.Series.sum -generated/pandas.Series.swapaxes,../reference/api/pandas.Series.swapaxes generated/pandas.Series.swaplevel,../reference/api/pandas.Series.swaplevel generated/pandas.Series.tail,../reference/api/pandas.Series.tail generated/pandas.Series.take,../reference/api/pandas.Series.take @@ -1251,7 +1239,6 @@ generated/pandas.Series.valid,../reference/api/pandas.Series.valid generated/pandas.Series.value_counts,../reference/api/pandas.Series.value_counts generated/pandas.Series.values,../reference/api/pandas.Series.values generated/pandas.Series.var,../reference/api/pandas.Series.var -generated/pandas.Series.view,../reference/api/pandas.Series.view generated/pandas.Series.where,../reference/api/pandas.Series.where generated/pandas.Series.xs,../reference/api/pandas.Series.xs generated/pandas.set_option,../reference/api/pandas.set_option @@ -1435,12 +1422,9 @@ reference/api/pandas.Series.transpose,pandas.Series.T reference/api/pandas.Index.transpose,pandas.Index.T reference/api/pandas.Index.notnull,pandas.Index.notna reference/api/pandas.Index.tolist,pandas.Index.to_list -reference/api/pandas.arrays.PandasArray,pandas.arrays.NumpyExtensionArray reference/api/pandas.core.groupby.DataFrameGroupBy.backfill,pandas.core.groupby.DataFrameGroupBy.bfill reference/api/pandas.core.groupby.GroupBy.backfill,pandas.core.groupby.DataFrameGroupBy.bfill reference/api/pandas.core.resample.Resampler.backfill,pandas.core.resample.Resampler.bfill 
-reference/api/pandas.io.formats.style.Styler.applymap,pandas.io.formats.style.Styler.map -reference/api/pandas.io.formats.style.Styler.applymap_index,pandas.io.formats.style.Styler.map_index # EWM -> ExponentialMovingWindow reference/api/pandas.core.window.ewm.EWM.corr,pandas.core.window.ewm.ExponentialMovingWindow.corr diff --git a/doc/scripts/eval_performance.py b/doc/scripts/eval_performance.py index dded8362a4cda..0383d4d598d55 100644 --- a/doc/scripts/eval_performance.py +++ b/doc/scripts/eval_performance.py @@ -17,7 +17,7 @@ def bench_with(n, times=10, repeat=3, engine="numexpr"): return ( np.array( timeit( - f"df.eval(s, engine={repr(engine)})", + f"df.eval(s, engine={engine!r})", setup=setup_common % (n, setup_with), repeat=repeat, number=times, @@ -34,7 +34,7 @@ def bench_subset(n, times=20, repeat=3, engine="numexpr"): return ( np.array( timeit( - f"df.query(s, engine={repr(engine)})", + f"df.query(s, engine={engine!r})", setup=setup_common % (n, setup_subset), repeat=repeat, number=times, @@ -55,7 +55,7 @@ def bench(mn=3, mx=7, num=100, engines=("python", "numexpr"), verbose=False): for engine in engines: for i, n in enumerate(r): if verbose & (i % 10 == 0): - print(f"engine: {repr(engine)}, i == {i:d}") + print(f"engine: {engine!r}, i == {i:d}") ev_times = bench_with(n, times=1, repeat=1, engine=engine) ev.loc[i, engine] = np.mean(ev_times) qu_times = bench_subset(n, times=1, repeat=1, engine=engine) diff --git a/doc/source/_static/css/getting_started.css b/doc/source/_static/css/getting_started.css index 0d53bbde94ae3..55141f8955066 100644 --- a/doc/source/_static/css/getting_started.css +++ b/doc/source/_static/css/getting_started.css @@ -248,6 +248,8 @@ ul.task-bullet > li > p:first-child { } .tutorial-card .card-header { + --bs-card-cap-color: var(--pst-color-text-base); + color: var(--pst-color-text-base); cursor: pointer; background-color: var(--pst-color-surface); border: 1px solid var(--pst-color-border) @@ -255,6 +257,7 @@ ul.task-bullet > li > p:first-child { .tutorial-card .card-body { background-color: var(--pst-color-on-background); + color: var(--pst-color-text-base); } .tutorial-card .badge { @@ -269,7 +272,7 @@ ul.task-bullet > li > p:first-child { .tutorial-card .gs-badge-link a { - color: var(--pst-color-text-base); + color: var(--pst-color-primary-text); text-decoration: none; } diff --git a/doc/source/_static/schemas/01_table_spreadsheet.png b/doc/source/_static/schemas/01_table_spreadsheet.png index b3cf5a0245b9c..4e3497879de31 100644 Binary files a/doc/source/_static/schemas/01_table_spreadsheet.png and b/doc/source/_static/schemas/01_table_spreadsheet.png differ diff --git a/doc/source/conf.py b/doc/source/conf.py index be6150d4e54ba..677ee6274b093 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -242,7 +242,6 @@ "external_links": [], "footer_start": ["pandas_footer", "sphinx-version"], "github_url": "https://github.com/pandas-dev/pandas", - "twitter_url": "https://twitter.com/pandas_dev", "analytics": { "plausible_analytics_domain": "pandas.pydata.org", "plausible_analytics_url": "https://views.scientific-python.org/js/script.js", @@ -254,8 +253,15 @@ "json_url": "https://pandas.pydata.org/versions.json", "version_match": switcher_version, }, - "show_version_warning_banner": True, + # This shows a warning for patch releases since the + # patch version doesn't compare as equal (e.g. 
2.2.1 != 2.2.0 but it should be) + "show_version_warning_banner": False, "icon_links": [ + { + "name": "X", + "url": "https://x.com/pandas_dev", + "icon": "fa-brands fa-square-x-twitter", + }, { "name": "Mastodon", "url": "https://fosstodon.org/@pandas_dev", @@ -323,7 +329,6 @@ ("pandas.io.clipboard.read_clipboard", "pandas.read_clipboard"), ("pandas.io.excel.ExcelFile.parse", "pandas.ExcelFile.parse"), ("pandas.io.excel.read_excel", "pandas.read_excel"), - ("pandas.io.gbq.read_gbq", "pandas.read_gbq"), ("pandas.io.html.read_html", "pandas.read_html"), ("pandas.io.json.read_json", "pandas.read_json"), ("pandas.io.parsers.read_csv", "pandas.read_csv"), @@ -432,7 +437,7 @@ "index", "pandas.tex", "pandas: powerful Python data analysis toolkit", - "Wes McKinney and the Pandas Development Team", + "Wes McKinney and the pandas Development Team", "manual", ) ] @@ -460,7 +465,6 @@ "dateutil": ("https://dateutil.readthedocs.io/en/latest/", None), "matplotlib": ("https://matplotlib.org/stable/", None), "numpy": ("https://numpy.org/doc/stable/", None), - "py": ("https://pylib.readthedocs.io/en/latest/", None), "python": ("https://docs.python.org/3/", None), "scipy": ("https://docs.scipy.org/doc/scipy/", None), "pyarrow": ("https://arrow.apache.org/docs/", None), @@ -813,8 +817,6 @@ def setup(app) -> None: for link in [ "http://scatterci.github.io/pydata/pandas", "http://specs.frictionlessdata.io/json-table-schema/", - "https://cloud.google.com/bigquery/docs/access-control#roles", - "https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.query", "https://crates.io/crates/calamine", "https://devguide.python.org/setup/#macos", "https://en.wikipedia.org/wiki/Imputation_statistics", @@ -830,7 +832,6 @@ def setup(app) -> None: "https://nipunbatra.github.io/blog/visualisation/2013/05/01/aggregation-timeseries.html", "https://nbviewer.ipython.org/gist/metakermit/5720498", "https://numpy.org/doc/stable/user/basics.byteswapping.html", - "https://pandas-gbq.readthedocs.io/en/latest/changelog.html#changelog-0-8-0", "https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking", "https://pandas.pydata.org/pandas-docs/stable/ecosystem.html", "https://sqlalchemy.readthedocs.io/en/latest/dialects/index.html", diff --git a/doc/source/development/community.rst b/doc/source/development/community.rst index 9a4de3c2580ab..ab8294b8f135a 100644 --- a/doc/source/development/community.rst +++ b/doc/source/development/community.rst @@ -100,6 +100,8 @@ The pandas mailing list `pandas-dev@python.org `_. + .. _community.slack: Community slack @@ -112,7 +114,7 @@ people who are hesitant to bring up their questions or ideas on a large public mailing list or GitHub. If this sounds like the right place for you, you are welcome to join using -`this link `_! +`this link `_! Please remember to follow our `Code of Conduct `_, and be aware that our admins are monitoring for irrelevant messages and will remove folks who use our diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index 82af8122a6bbd..4d99f282aa695 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -19,7 +19,7 @@ Bug reports and enhancement requests ==================================== Bug reports and enhancement requests are an important part of making pandas more stable and -are curated though Github issues. When reporting and issue or request, please select the `appropriate +are curated though Github issues. 
When reporting an issue or request, please select the `appropriate category and fill out the issue form fully `_ to ensure others and the core development team can fully understand the scope of the issue. @@ -74,7 +74,6 @@ If you are new to Git, you can reference some of these resources for learning Gi to the :ref:`contributor community ` for help if needed: * `Git documentation `_. -* `Numpy's Git resources `_ tutorial. Also, the project follows a forking workflow further described on this page whereby contributors fork the repository, make changes and then create a pull request. @@ -306,15 +305,15 @@ It is important to periodically update your local ``main`` branch with updates f branch and update your development environment to reflect any changes to the various packages that are used during development. -If using :ref:`mamba `, run: +If using :ref:`conda `, run: .. code-block:: shell git checkout main git fetch upstream git merge upstream/main - mamba activate pandas-dev - mamba env update -f environment.yml --prune + conda activate pandas-dev + conda env update -f environment.yml --prune If using :ref:`pip ` , do: diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst index be8249cd3a287..b8d568428c156 100644 --- a/doc/source/development/contributing_codebase.rst +++ b/doc/source/development/contributing_codebase.rst @@ -38,7 +38,7 @@ Pre-commit ---------- Additionally, :ref:`Continuous Integration ` will run code formatting checks -like ``black``, ``ruff``, +like ``ruff``, ``isort``, and ``clang-format`` and more using `pre-commit hooks `_. Any warnings from these checks will cause the :ref:`Continuous Integration ` to fail; therefore, it is helpful to run the check yourself before submitting code. This @@ -198,7 +198,7 @@ In some cases you may be tempted to use ``cast`` from the typing module when you obj = cast(str, obj) # Mypy complains without this! return obj.upper() -The limitation here is that while a human can reasonably understand that ``is_number`` would catch the ``int`` and ``float`` types mypy cannot make that same inference just yet (see `mypy #5206 `_. While the above works, the use of ``cast`` is **strongly discouraged**. Where applicable a refactor of the code to appease static analysis is preferable +The limitation here is that while a human can reasonably understand that ``is_number`` would catch the ``int`` and ``float`` types mypy cannot make that same inference just yet (see `mypy #5206 `_). While the above works, the use of ``cast`` is **strongly discouraged**. Where applicable a refactor of the code to appease static analysis is preferable .. code-block:: python @@ -244,7 +244,7 @@ in your python environment. .. warning:: - * Please be aware that the above commands will use the current python environment. If your python packages are older/newer than those installed by the pandas CI, the above commands might fail. This is often the case when the ``mypy`` or ``numpy`` versions do not match. Please see :ref:`how to setup the python environment ` or select a `recently succeeded workflow `_, select the "Docstring validation, typing, and other manual pre-commit hooks" job, then click on "Set up Conda" and "Environment info" to see which versions the pandas CI installs. + * Please be aware that the above commands will use the current python environment. If your python packages are older/newer than those installed by the pandas CI, the above commands might fail. 
This is often the case when the ``mypy`` or ``numpy`` versions do not match. Please see :ref:`how to setup the python environment ` or select a `recently succeeded workflow `_, select the "Docstring validation, typing, and other manual pre-commit hooks" job, then click on "Set up Conda" and "Environment info" to see which versions the pandas CI installs.

.. _contributing.ci:

@@ -253,7 +253,7 @@ Testing type hints in code using pandas

.. warning::

-  * Pandas is not yet a py.typed library (:pep:`561`)!
+  * pandas is not yet a py.typed library (:pep:`561`)!

The primary purpose of locally declaring pandas as a py.typed library is to test and
improve the pandas-builtin type annotations.

@@ -298,6 +298,12 @@ So, before actually writing any code, you should write your tests. Often the test can be
taken from the original GitHub issue. However, it is always worth considering additional
use cases and writing corresponding tests.

+We use `code coverage `_ to help understand
+the amount of code which is covered by a test. We recommend striving to ensure code
+you add or change within pandas is covered by a test. Please see our
+`code coverage dashboard through Codecov `_
+for more information.
+
Adding tests is one of the most common requests after code is pushed to pandas.
Therefore, it is worth getting in the habit of writing tests ahead of time so this is never an issue.

@@ -338,7 +344,7 @@ be located.
  - tests.scalar
  - tests.tseries.offsets

-2. Does your test depend only on code in pd._libs?
+2. Does your test depend only on code in ``pd._libs``?

  This test likely belongs in one of:
  - tests.libs

@@ -531,7 +537,7 @@ Preferred ``pytest`` idioms
test and does not check if the test will fail. If this is the behavior you desire, use ``pytest.skip`` instead. If a test is known to fail but the manner in which it fails
-is not meant to be captured, use ``pytest.mark.xfail`` It is common to use this method for a test that
+is not meant to be captured, use ``pytest.mark.xfail``. It is common to use this method for a test that
exhibits buggy behavior or a non-implemented feature. If the failing test has flaky behavior, use the argument ``strict=False``. This will make it so pytest does not fail if the test happens to pass. Using ``strict=False`` is highly undesirable, please use it only as a last resort.

@@ -557,11 +563,12 @@ is being raised, using ``pytest.raises`` instead.

Testing a warning
^^^^^^^^^^^^^^^^^
-Use ``tm.assert_produces_warning`` as a context manager to check that a block of code raises a warning.
+Use ``tm.assert_produces_warning`` as a context manager to check that a block of code raises a warning
+and specify the warning message using the ``match`` argument.

.. code-block:: python

-    with tm.assert_produces_warning(DeprecationWarning):
+    with tm.assert_produces_warning(DeprecationWarning, match="the warning message"):
         pd.deprecated_function()

If a warning should specifically not happen in a block of code, pass ``False`` into the context manager.

@@ -596,14 +603,15 @@ with the specific exception subclass (i.e. never use :py:class:`Exception`) and

Testing involving files
^^^^^^^^^^^^^^^^^^^^^^^
-The ``tm.ensure_clean`` context manager creates a temporary file for testing,
-with a generated filename (or your filename if provided), that is automatically
-deleted when the context block is exited.
+The ``temp_file`` pytest fixture creates a temporary file :py:class:`pathlib.Path` object for testing:

..
code-block:: python - with tm.ensure_clean('my_file_path') as path: - # do something with the path + def test_something(temp_file): + pd.DataFrame([1]).to_csv(str(temp_file)) + +Please reference `pytest's documentation `_ +for the file retention policy. Testing involving network connectivity ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -760,8 +768,7 @@ install pandas) by typing:: your installation is probably fine and you can start contributing! Often it is worth running only a subset of tests first around your changes before running the -entire suite (tip: you can use the `pandas-coverage app `_) -to find out which tests hit the lines of code you've modified, and then run only those). +entire suite. The easiest way to do this is with:: diff --git a/doc/source/development/contributing_docstring.rst b/doc/source/development/contributing_docstring.rst index 87aecb6936c9c..59d7957275e15 100644 --- a/doc/source/development/contributing_docstring.rst +++ b/doc/source/development/contributing_docstring.rst @@ -142,7 +142,7 @@ backticks. The following are considered inline code: With several mistakes in the docstring. - It has a blank like after the signature ``def func():``. + It has a blank line after the signature ``def func():``. The text 'Some function' should go in the line after the opening quotes of the docstring, not in the same line. @@ -939,8 +939,8 @@ Each shared docstring will have a base template with variables, like Finally, docstrings can also be appended to with the ``doc`` decorator. In this example, we'll create a parent docstring normally (this is like -``pandas.core.generic.NDFrame``. Then we'll have two children (like -``pandas.core.series.Series`` and ``pandas.core.frame.DataFrame``). We'll +``pandas.core.generic.NDFrame``). Then we'll have two children (like +``pandas.Series`` and ``pandas.DataFrame``). We'll substitute the class names in this docstring. .. code-block:: python @@ -995,5 +995,5 @@ mapping function names to docstrings. Wherever possible, we prefer using ``doc``, since the docstring-writing processes is slightly closer to normal. See ``pandas.core.generic.NDFrame.fillna`` for an example template, and -``pandas.core.series.Series.fillna`` and ``pandas.core.generic.frame.fillna`` +``pandas.Series.fillna`` and ``pandas.core.generic.frame.fillna`` for the filled versions. diff --git a/doc/source/development/contributing_documentation.rst b/doc/source/development/contributing_documentation.rst index 964f82be4fa7b..443470e6c50f9 100644 --- a/doc/source/development/contributing_documentation.rst +++ b/doc/source/development/contributing_documentation.rst @@ -14,7 +14,7 @@ experts. If something in the docs doesn't make sense to you, updating the relevant section after you figure it out is a great way to ensure it will help the next person. Please visit the `issues page `__ for a full list of issues that are currently open regarding the -Pandas documentation. +pandas documentation. diff --git a/doc/source/development/contributing_environment.rst b/doc/source/development/contributing_environment.rst index 325c902dd4f9e..98bd4b00d016b 100644 --- a/doc/source/development/contributing_environment.rst +++ b/doc/source/development/contributing_environment.rst @@ -35,6 +35,10 @@ You will need `Build Tools for Visual Studio 2022 scrolling down to "All downloads" -> "Tools for Visual Studio". In the installer, select the "Desktop development with C++" Workloads. 
+ If you encounter an error indicating ``cl.exe`` is not found when building with Meson, + reopen the installer and also select the optional component + **MSVC v142 - VS 2019 C++ x64/x86 build tools** in the right pane for installation. + Alternatively, you can install the necessary components on the commandline using `vs_BuildTools.exe `_ @@ -43,7 +47,7 @@ and consult the ``Linux`` instructions below. **macOS** -To use the :ref:`mamba `-based compilers, you will need to install the +To use the :ref:`conda `-based compilers, you will need to install the Developer Tools using ``xcode-select --install``. If you prefer to use a different compiler, general information can be found here: @@ -51,9 +55,9 @@ https://devguide.python.org/setup/#macos **Linux** -For Linux-based :ref:`mamba ` installations, you won't have to install any -additional components outside of the mamba environment. The instructions -below are only needed if your setup isn't based on mamba environments. +For Linux-based :ref:`conda ` installations, you won't have to install any +additional components outside of the conda environment. The instructions +below are only needed if your setup isn't based on conda environments. Some Linux distributions will come with a pre-installed C compiler. To find out which compilers (and versions) are installed on your system:: @@ -82,19 +86,18 @@ Before we begin, please: * Make sure that you have :any:`cloned the repository ` * ``cd`` to the pandas source directory you just created with the clone command -.. _contributing.mamba: +.. _contributing.conda: -Option 1: using mamba (recommended) +Option 1: using conda (recommended) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -* Install miniforge to get `mamba `_ -* Make sure your mamba is up to date (``mamba update mamba``) -* Create and activate the ``pandas-dev`` mamba environment using the following commands: +* Install miniforge to get `conda `_ +* Create and activate the ``pandas-dev`` conda environment using the following commands: -.. code-block:: none +.. code-block:: bash - mamba env create --file environment.yml - mamba activate pandas-dev + conda env create --file environment.yml + conda activate pandas-dev .. _contributing.pip: @@ -130,7 +133,7 @@ Consult the docs for setting up pyenv `here `__. pyenv virtualenv # For instance: - pyenv virtualenv 3.9.10 pandas-dev + pyenv virtualenv 3.10 pandas-dev # Activate the virtualenv pyenv activate pandas-dev @@ -227,7 +230,7 @@ To compile pandas with meson, run:: # By default, this will print verbose output # showing the "rebuild" taking place on import (see section below for explanation) # If you do not want to see this, omit everything after --no-build-isolation - python -m pip install -ve . --no-build-isolation --config-settings editable-verbose=true + python -m pip install -ve . --no-build-isolation -Ceditable-verbose=true .. note:: The version number is pulled from the latest repository tag. Be sure to fetch the latest tags from upstream @@ -242,15 +245,15 @@ To compile pandas with meson, run:: It is possible to pass options from the pip frontend to the meson backend if you would like to configure your install. Occasionally, you'll want to use this to adjust the build directory, and/or toggle debug/optimization levels. -You can pass a build directory to pandas by appending ``--config-settings builddir="your builddir here"`` to your pip command. +You can pass a build directory to pandas by appending ``-Cbuilddir="your builddir here"`` to your pip command. 
This option allows you to configure where meson stores your built C extensions, and allows for fast rebuilds. Sometimes, it might be useful to compile pandas with debugging symbols, when debugging C extensions. -Appending ``--config-settings setup-args="-Ddebug=true"`` will do the trick. +Appending ``-Csetup-args="-Ddebug=true"`` will do the trick. With pip, it is possible to chain together multiple config settings (for example specifying both a build directory and building with debug symbols would look like -``--config-settings builddir="your builddir here" --config-settings=setup-args="-Dbuildtype=debug"``. +``-Cbuilddir="your builddir here" -Csetup-args="-Dbuildtype=debug"``. **Compiling pandas with setup.py** @@ -291,7 +294,7 @@ At this point you may want to try When building pandas with meson, importing pandas will automatically trigger a rebuild, even when C/Cython files are modified. By default, no output will be produced by this rebuild (the import will just take longer). If you would like to see meson's -output when importing pandas, you can set the environment variable ``MESONPY_EDTIABLE_VERBOSE``. For example, this would be:: +output when importing pandas, you can set the environment variable ``MESONPY_EDITABLE_VERBOSE``. For example, this would be:: # On Linux/macOS MESONPY_EDITABLE_VERBOSE=1 python @@ -302,7 +305,7 @@ output when importing pandas, you can set the environment variable ``MESONPY_EDT If you would like to see this verbose output every time, you can set the ``editable-verbose`` config setting to ``true`` like so:: - python -m pip install -ve . --config-settings editable-verbose=true + python -m pip install -ve . -Ceditable-verbose=true .. tip:: If you ever find yourself wondering whether setuptools or meson was used to build your pandas, diff --git a/doc/source/development/contributing_gitpod.rst b/doc/source/development/contributing_gitpod.rst index 2ba43a44b87d3..b70981b4d307d 100644 --- a/doc/source/development/contributing_gitpod.rst +++ b/doc/source/development/contributing_gitpod.rst @@ -109,7 +109,7 @@ development experience: * `VSCode rst extension `_ * `Markdown All in One `_ -* `VSCode Gitlens extension `_ +* `VSCode GitLens extension `_ * `VSCode Git Graph extension `_ Development workflow with Gitpod diff --git a/doc/source/development/debugging_extensions.rst b/doc/source/development/debugging_extensions.rst index d63ecb3157cff..3b955e7bbb0a5 100644 --- a/doc/source/development/debugging_extensions.rst +++ b/doc/source/development/debugging_extensions.rst @@ -6,7 +6,7 @@ Debugging C extensions ====================== -Pandas uses Cython and C/C++ `extension modules `_ to optimize performance. Unfortunately, the standard Python debugger does not allow you to step into these extensions. Cython extensions can be debugged with the `Cython debugger `_ and C/C++ extensions can be debugged using the tools shipped with your platform's compiler. +pandas uses Cython and C/C++ `extension modules `_ to optimize performance. Unfortunately, the standard Python debugger does not allow you to step into these extensions. Cython extensions can be debugged with the `Cython debugger `_ and C/C++ extensions can be debugged using the tools shipped with your platform's compiler. For Python developers with limited or no C/C++ experience this can seem a daunting task. 
Core developer Will Ayd has written a 3 part blog series to help guide you from the standard Python debugger into these other tools: @@ -19,18 +19,18 @@ Debugging locally By default building pandas from source will generate a release build. To generate a development build you can type:: - pip install -ve . --no-build-isolation --config-settings=builddir="debug" --config-settings=setup-args="-Dbuildtype=debug" + pip install -ve . --no-build-isolation -Cbuilddir="debug" -Csetup-args="-Dbuildtype=debug" .. note:: - conda environments update CFLAGS/CPPFLAGS with flags that are geared towards generating releases. If using conda, you may need to set ``CFLAGS="$CFLAGS -O0"`` and ``CPPFLAGS="$CPPFLAGS -O0"`` to ensure optimizations are turned off for debugging + conda environments update CFLAGS/CPPFLAGS with flags that are geared towards generating releases, and may work counter towards usage in a development environment. If using conda, you should unset these environment variables via ``export CFLAGS=`` and ``export CPPFLAGS=`` By specifying ``builddir="debug"`` all of the targets will be built and placed in the debug directory relative to the project root. This helps to keep your debug and release artifacts separate; you are of course able to choose a different directory name or omit altogether if you do not care to separate build types. Using Docker ------------ -To simplify the debugging process, pandas has created a Docker image with a debug build of Python and the gdb/Cython debuggers pre-installed. You may either ``docker pull pandas/pandas-debug`` to get access to this image or build it from the ``tooling/debug`` folder locallly. +To simplify the debugging process, pandas has created a Docker image with a debug build of Python and the gdb/Cython debuggers pre-installed. You may either ``docker pull pandas/pandas-debug`` to get access to this image or build it from the ``tooling/debug`` folder locally. You can then mount your pandas repository into this image via: @@ -42,7 +42,7 @@ Inside the image, you can use meson to build/install pandas and place the build .. code-block:: sh - python -m pip install -ve . --no-build-isolation --config-settings=builddir="debug" --config-settings=setup-args="-Dbuildtype=debug" + python -m pip install -ve . --no-build-isolation -Cbuilddir="debug" -Csetup-args="-Dbuildtype=debug" If planning to use cygdb, the files required by that application are placed within the build folder. So you have to first ``cd`` to the build folder, then start that application. diff --git a/doc/source/development/developer.rst b/doc/source/development/developer.rst index 6de237b70f08d..c5c4b7c449ce7 100644 --- a/doc/source/development/developer.rst +++ b/doc/source/development/developer.rst @@ -99,7 +99,7 @@ Column metadata * Boolean: ``'bool'`` * Integers: ``'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64'`` * Floats: ``'float16', 'float32', 'float64'`` -* Date and Time Types: ``'datetime', 'datetimetz'``, ``'timedelta'`` +* Date and Time Types: ``'datetime', 'datetimetz', 'timedelta'`` * String: ``'unicode', 'bytes'`` * Categorical: ``'categorical'`` * Other Python objects: ``'object'`` diff --git a/doc/source/development/maintaining.rst b/doc/source/development/maintaining.rst index c7803d8401e4e..c572559dcc3e0 100644 --- a/doc/source/development/maintaining.rst +++ b/doc/source/development/maintaining.rst @@ -84,7 +84,7 @@ Here's a typical workflow for triaging a newly opened issue. example. 
See https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports for a good explanation. If the example is not reproducible, or if it's *clearly* not minimal, feel free to ask the reporter if they can provide - and example or simplify the provided one. Do acknowledge that writing + an example or simplify the provided one. Do acknowledge that writing minimal reproducible examples is hard work. If the reporter is struggling, you can try to write one yourself and we'll edit the original post to include it. @@ -93,6 +93,9 @@ Here's a typical workflow for triaging a newly opened issue. If a reproducible example is provided, but you see a simplification, edit the original post with your simpler reproducible example. + If this is a regression report, post the result of a ``git bisect`` run. + More info on this can be found in the :ref:`maintaining.regressions` section. + Ensure the issue exists on the main branch and that it has the "Needs Triage" tag until all steps have been completed. Add a comment to the issue once you have verified it exists on the main branch, so others know it has been confirmed. @@ -125,7 +128,10 @@ Here's a typical workflow for triaging a newly opened issue. If the issue is clearly defined and the fix seems relatively straightforward, label the issue as "Good first issue". - Once you have completed the above, make sure to remove the "needs triage" label. + If the issue is a regression report, add the "Regression" label and the next patch + release milestone. + + Once you have completed the above, make sure to remove the "Needs Triage" label. .. _maintaining.regressions: @@ -151,7 +157,7 @@ and then run:: git bisect start git bisect good v1.4.0 git bisect bad v1.5.0 - git bisect run bash -c "python setup.py build_ext -j 4; python t.py" + git bisect run bash -c "python -m pip install -ve . --no-build-isolation -Ceditable-verbose=true; python t.py" This finds the first commit that changed the behavior. The C extensions have to be rebuilt at every step, so the search can take a while. @@ -159,7 +165,7 @@ rebuilt at every step, so the search can take a while. Exit bisect and rebuild the current version:: git bisect reset - python setup.py build_ext -j 4 + python -m pip install -ve . --no-build-isolation -Ceditable-verbose=true Report your findings under the corresponding issue and ping the commit author to get their input. @@ -326,34 +332,6 @@ a milestone before tagging, you can request the bot to backport it with: @Meeseeksdev backport -.. _maintaining.asv-machine: - -Benchmark machine ------------------ - -The team currently owns dedicated hardware for hosting a website for pandas' ASV performance benchmark. The results -are published to https://asv-runner.github.io/asv-collection/pandas/ - -Configuration -````````````` - -The machine can be configured with the `Ansible `_ playbook in https://github.com/tomaugspurger/asv-runner. - -Publishing -`````````` - -The results are published to another GitHub repository, https://github.com/tomaugspurger/asv-collection. -Finally, we have a cron job on our docs server to pull from https://github.com/tomaugspurger/asv-collection, to serve them from ``/speed``. -Ask Tom or Joris for access to the webserver. - -Debugging -````````` - -The benchmarks are scheduled by Airflow. It has a dashboard for viewing and debugging the results. You'll need to setup an SSH tunnel to view them - - ssh -L 8080:localhost:8080 pandas@panda.likescandy.com - - .. 
_maintaining.release: Release process @@ -366,7 +344,7 @@ in the next places:
- Git repo with a `new tag `_
- Source distribution in a `GitHub release `_
- Pip packages in the `PyPI `_
-- Conda/Mamba packages in `conda-forge `_
+- Conda packages in `conda-forge `_
The process for releasing a new version of pandas is detailed in the next section. @@ -430,7 +408,7 @@ Release
git checkout  git pull --ff-only upstream  git clean -xdf
- git commit --allow-empty --author="Pandas Development Team " -m "RLS: "
+ git commit --allow-empty --author="pandas Development Team " -m "RLS: "
git tag -a v -m "Version " # NOTE that the tag is v1.5.2 with "v" not 1.5.2  git push upstream --follow-tags
@@ -460,7 +438,7 @@ which will be triggered when the tag is pushed. 4. Create a `new GitHub release `_:
- Tag: ````
- - Title: ``Pandas ``
+ - Title: ``pandas ``
- Description: Copy the description of the last release of the same kind (release candidate, major/minor or patch release)
- Files: ``pandas-.tar.gz`` source distribution just generated
- Set as a pre-release: Only check for a release candidate
@@ -490,9 +468,9 @@ Post-Release the appropriate ones for the version you are releasing):
- Log in to the server and use the correct user.
- - `cd /var/www/html/pandas-docs/`
- - `ln -sfn version/2.1 stable` (for a major or minor release)
- - `ln -sfn version/2.0.3 version/2.0` (for a patch release)
+ - ``cd /var/www/html/pandas-docs/``
+ - ``ln -sfn version/2.1 stable`` (for a major or minor release)
+ - ``ln -sfn version/2.0.3 version/2.0`` (for a patch release)
2. If releasing a major or minor release, open a PR in our source code to update ``web/pandas/versions.json``, to have the desired versions in the documentation
@@ -510,7 +488,7 @@ Post-Release for reference):
- The pandas-dev and pydata mailing lists
- - Twitter, Mastodon, Telegram and LinkedIn
+ - X, Mastodon, Telegram and LinkedIn
7. Update these release instructions to fix anything incorrect and to note any changes since the last release.
diff --git a/doc/source/development/policies.rst b/doc/source/development/policies.rst index 8465820452353..a3665c5bb4d1f 100644 --- a/doc/source/development/policies.rst +++ b/doc/source/development/policies.rst @@ -46,10 +46,12 @@ deprecation removed in the next major release (2.0.0). These policies do not apply to features marked as **experimental** in the documentation. pandas may change the behavior of experimental features at any time.
+.. _policies.python_support:
+
Python support ~~~~~~~~~~~~~~
-pandas mirrors the `NumPy guidelines for Python support `__.
+pandas mirrors the `SPEC 0 guideline for Python support `__.
Security policy ~~~~~~~~~~~~~~~
diff --git a/doc/source/getting_started/comparison/comparison_with_r.rst b/doc/source/getting_started/comparison/comparison_with_r.rst index a6cfcd4614984..cc7add87b5935 100644 --- a/doc/source/getting_started/comparison/comparison_with_r.rst +++ b/doc/source/getting_started/comparison/comparison_with_r.rst @@ -383,7 +383,7 @@ In Python, since ``a`` is a list, you can simply use list comprehension. .. ipython:: python
- a = np.array(list(range(1, 24)) + [np.NAN]).reshape(2, 3, 4)
+ a = np.array(list(range(1, 24)) + [np.nan]).reshape(2, 3, 4)
pd.DataFrame([tuple(list(x) + [val]) for x, val in np.ndenumerate(a)]) meltlist @@ -402,10 +402,10 @@ In Python, this list would be a list of tuples, so ..
ipython:: python - a = list(enumerate(list(range(1, 5)) + [np.NAN])) + a = list(enumerate(list(range(1, 5)) + [np.nan])) pd.DataFrame(a) -For more details and examples see :ref:`the Into to Data Structures +For more details and examples see :ref:`the Intro to Data Structures documentation `. meltdf @@ -438,7 +438,7 @@ In Python, the :meth:`~pandas.melt` method is the R equivalent: ) pd.melt(cheese, id_vars=["first", "last"]) - cheese.set_index(["first", "last"]).stack(future_stack=True) # alternative way + cheese.set_index(["first", "last"]).stack() # alternative way For more details and examples see :ref:`the reshaping documentation `. diff --git a/doc/source/getting_started/comparison/comparison_with_spss.rst b/doc/source/getting_started/comparison/comparison_with_spss.rst new file mode 100644 index 0000000000000..12c64bfd180a3 --- /dev/null +++ b/doc/source/getting_started/comparison/comparison_with_spss.rst @@ -0,0 +1,229 @@ +.. _compare_with_spss: + +{{ header }} + +Comparison with SPSS +******************** +For potential users coming from `SPSS `__, this page is meant to demonstrate +how various SPSS operations would be performed using pandas. + +.. include:: includes/introduction.rst + +Data structures +--------------- + +General terminology translation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. csv-table:: + :header: "pandas", "SPSS" + :widths: 20, 20 + + :class:`DataFrame`, data file + column, variable + row, case + groupby, split file + :class:`NaN`, system-missing + +:class:`DataFrame` +~~~~~~~~~~~~~~~~~~ + +A :class:`DataFrame` in pandas is analogous to an SPSS data file - a two-dimensional +data source with labeled columns that can be of different types. As will be shown in this +document, almost any operation that can be performed in SPSS can also be accomplished in pandas. + +:class:`Series` +~~~~~~~~~~~~~~~ + +A :class:`Series` is the data structure that represents one column of a :class:`DataFrame`. SPSS doesn't have a +separate data structure for a single variable, but in general, working with a :class:`Series` is analogous +to working with a variable in SPSS. + +:class:`Index` +~~~~~~~~~~~~~~ + +Every :class:`DataFrame` and :class:`Series` has an :class:`Index` -- labels on the *rows* of the data. SPSS does not +have an exact analogue, as cases are simply numbered sequentially from 1. In pandas, if no index is +specified, a :class:`RangeIndex` is used by default (first row = 0, second row = 1, and so on). + +While using a labeled :class:`Index` or :class:`MultiIndex` can enable sophisticated analyses and is ultimately an +important part of pandas to understand, for this comparison we will essentially ignore the :class:`Index` and +just treat the :class:`DataFrame` as a collection of columns. Please see the :ref:`indexing documentation` +for much more on how to use an :class:`Index` effectively. + + +Copies vs. in place operations +------------------------------ + +.. include:: includes/copies.rst + + +Data input / output +------------------- + +Reading external data +~~~~~~~~~~~~~~~~~~~~~ + +Like SPSS, pandas provides utilities for reading in data from many formats. The ``tips`` dataset, found within +the pandas tests (`csv `_) +will be used in many of the following examples. + +In SPSS, you would use File > Open > Data to import a CSV file: + +.. code-block:: text + + FILE > OPEN > DATA + /TYPE=CSV + /FILE='tips.csv' + /DELIMITERS="," + /FIRSTCASE=2 + /VARIABLES=col1 col2 col3. + +The pandas equivalent would use :func:`read_csv`: + +.. 
code-block:: python + + url = ( + "https://raw.githubusercontent.com/pandas-dev" + "/pandas/main/pandas/tests/io/data/csv/tips.csv" + ) + tips = pd.read_csv(url) + tips + +Like SPSS's data import wizard, ``read_csv`` can take a number of parameters to specify how the data should be parsed. +For example, if the data was instead tab delimited, and did not have column names, the pandas command would be: + +.. code-block:: python + + tips = pd.read_csv("tips.csv", sep="\t", header=None) + + # alternatively, read_table is an alias to read_csv with tab delimiter + tips = pd.read_table("tips.csv", header=None) + + +Data operations +--------------- + +Filtering +~~~~~~~~~ + +In SPSS, filtering is done through Data > Select Cases: + +.. code-block:: text + + SELECT IF (total_bill > 10). + EXECUTE. + +In pandas, boolean indexing can be used: + +.. code-block:: python + + tips[tips["total_bill"] > 10] + + +Sorting +~~~~~~~ + +In SPSS, sorting is done through Data > Sort Cases: + +.. code-block:: text + + SORT CASES BY sex total_bill. + EXECUTE. + +In pandas, this would be written as: + +.. code-block:: python + + tips.sort_values(["sex", "total_bill"]) + + +String processing +----------------- + +Finding length of string +~~~~~~~~~~~~~~~~~~~~~~~~ + +In SPSS: + +.. code-block:: text + + COMPUTE length = LENGTH(time). + EXECUTE. + +.. include:: includes/length.rst + + +Changing case +~~~~~~~~~~~~~ + +In SPSS: + +.. code-block:: text + + COMPUTE upper = UPCASE(time). + COMPUTE lower = LOWER(time). + EXECUTE. + +.. include:: includes/case.rst + + +Merging +------- + +In SPSS, merging data files is done through Data > Merge Files. + +.. include:: includes/merge_setup.rst +.. include:: includes/merge.rst + + +GroupBy operations +------------------ + +Split-file processing +~~~~~~~~~~~~~~~~~~~~~ + +In SPSS, split-file analysis is done through Data > Split File: + +.. code-block:: text + + SORT CASES BY sex. + SPLIT FILE BY sex. + DESCRIPTIVES VARIABLES=total_bill tip + /STATISTICS=MEAN STDDEV MIN MAX. + +The pandas equivalent would be: + +.. code-block:: python + + tips.groupby("sex")[["total_bill", "tip"]].agg(["mean", "std", "min", "max"]) + + +Missing data +------------ + +SPSS uses the period (``.``) for numeric missing values and blank spaces for string missing values. +pandas uses ``NaN`` (Not a Number) for numeric missing values and ``None`` or ``NaN`` for string +missing values. + +.. include:: includes/missing.rst + + +Other considerations +-------------------- + +Output management +----------------- + +While pandas does not have a direct equivalent to SPSS's Output Management System (OMS), you can +capture and export results in various ways: + +.. 
code-block:: python
+
+ # Save summary statistics to CSV
+ tips.groupby('sex')[['total_bill', 'tip']].mean().to_csv('summary.csv')
+
+ # Save multiple results to Excel sheets
+ with pd.ExcelWriter('results.xlsx') as writer:
+     tips.describe().to_excel(writer, sheet_name='Descriptives')
+     tips.groupby('sex')[['total_bill', 'tip']].mean().to_excel(writer, sheet_name='Means by Gender')
diff --git a/doc/source/getting_started/comparison/comparison_with_sql.rst b/doc/source/getting_started/comparison/comparison_with_sql.rst index daa528c7d408a..dc0590f18751a 100644 --- a/doc/source/getting_started/comparison/comparison_with_sql.rst +++ b/doc/source/getting_started/comparison/comparison_with_sql.rst @@ -505,7 +505,7 @@ DELETE DELETE FROM tips WHERE tip > 9;
-In pandas we select the rows that should remain instead of deleting them:
+In pandas we select the rows that should remain instead of deleting the rows that should be removed:
.. ipython:: python
diff --git a/doc/source/getting_started/comparison/index.rst b/doc/source/getting_started/comparison/index.rst index c3f58ce1f3d6d..3133d74afa3db 100644 --- a/doc/source/getting_started/comparison/index.rst +++ b/doc/source/getting_started/comparison/index.rst @@ -14,3 +14,4 @@ Comparison with other tools comparison_with_spreadsheets comparison_with_sas comparison_with_stata + comparison_with_spss
diff --git a/doc/source/getting_started/index.rst b/doc/source/getting_started/index.rst index d9cb1de14aded..a17699a71fbd3 100644 --- a/doc/source/getting_started/index.rst +++ b/doc/source/getting_started/index.rst @@ -17,8 +17,7 @@ Installation :columns: 12 12 6 6 :padding: 3
- pandas is part of the `Anaconda `__
- distribution and can be installed with Anaconda or Miniconda:
+ pandas can be installed via conda from `conda-forge `__.
++++++++++++++++++++++ @@ -134,8 +133,8 @@ to explore, clean, and process your data. In pandas, a data table is called a :c
-pandas supports the integration with many file formats or data sources out of the box (csv, excel, sql, json, parquet,…). Importing data from each of these
-data sources is provided by function with the prefix ``read_*``. Similarly, the ``to_*`` methods are used to store data.
+pandas supports the integration with many file formats or data sources out of the box (csv, excel, sql, json, parquet,…). The ability to import data from each of these
+data sources is provided by functions with the prefix ``read_*``. Similarly, the ``to_*`` methods are used to store data.
.. image:: ../_static/schemas/02_io_readwrite.svg :align: center @@ -181,7 +180,7 @@ data sources is provided by function with the prefix ``read_*``. Similarly, the
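As a minimal sketch of this ``read_*``/``to_*`` pattern (the file names here are placeholders, and the Parquet export assumes an optional engine such as pyarrow is installed):

.. code-block:: python

    import pandas as pd

    df = pd.read_csv("data.csv")   # import with a read_* function
    df.to_parquet("data.parquet")  # export with a to_* method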
-Selecting or filtering specific rows and/or columns? Filtering the data on a condition? Methods for slicing, selecting, and extracting the +Selecting or filtering specific rows and/or columns? Filtering the data on a particular condition? Methods for slicing, selecting, and extracting the data you need are available in pandas. .. image:: ../_static/schemas/03_subset_columns_rows.svg @@ -228,7 +227,7 @@ data you need are available in pandas.
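For illustration, a small sketch of such selections on a made-up table:

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({"name": ["Ann", "Ben", "Cy"], "age": [22, 35, 58]})

    df["age"]                       # select a single column
    df[df["age"] > 30]              # filter rows on a condition
    df.loc[df["age"] > 30, "name"]  # rows and columns in one step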
-pandas provides plotting your data out of the box, using the power of Matplotlib. You can pick the plot type (scatter, bar, boxplot,...) +pandas provides plotting for your data right out of the box with the power of Matplotlib. Simply pick the plot type (scatter, bar, boxplot,...) corresponding to your data. .. image:: ../_static/schemas/04_plot_overview.svg @@ -275,7 +274,7 @@ corresponding to your data.
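A quick sketch of this, with invented numbers:

.. code-block:: python

    import pandas as pd
    import matplotlib.pyplot as plt

    df = pd.DataFrame({"x": [1, 2, 3, 4], "y": [4.0, 2.5, 5.1, 3.3]})
    df.plot.scatter(x="x", y="y")  # other types: .line(), .bar(), .box(), ...
    plt.show()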
-There is no need to loop over all rows of your data table to do calculations. Data manipulations on a column work elementwise. +There's no need to loop over all rows of your data table to do calculations. Column data manipulations work elementwise in pandas. Adding a column to a :class:`DataFrame` based on existing data in other columns is straightforward. .. image:: ../_static/schemas/05_newcolumn_2.svg @@ -322,7 +321,7 @@ Adding a column to a :class:`DataFrame` based on existing data in other columns
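A minimal sketch of such an elementwise calculation (the column names are invented for illustration):

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({"price": [100.0, 250.0], "quantity": [3, 2]})

    # the multiplication is applied to every row at once; no explicit loop needed
    df["total"] = df["price"] * df["quantity"]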
-Basic statistics (mean, median, min, max, counts...) are easily calculable. These or custom aggregations can be applied on the entire +Basic statistics (mean, median, min, max, counts...) are easily calculable across data frames. These, or even custom aggregations, can be applied on the entire data set, a sliding window of the data, or grouped by categories. The latter is also known as the split-apply-combine approach. .. image:: ../_static/schemas/06_groupby.svg @@ -369,8 +368,8 @@ data set, a sliding window of the data, or grouped by categories. The latter is
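A small sketch of these three flavours of aggregation, on made-up data:

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({"group": ["a", "a", "b"], "value": [1.0, 3.0, 5.0]})

    df["value"].agg(["mean", "max"])      # entire data set
    df["value"].rolling(window=2).mean()  # sliding window
    df.groupby("group")["value"].mean()   # split-apply-combine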
-Change the structure of your data table in multiple ways. You can :func:`~pandas.melt` your data table from wide to long/tidy form or :func:`~pandas.pivot`
-from long to wide format. With aggregations built-in, a pivot table is created with a single command.
+Change the structure of your data table in a variety of ways. You can use :func:`~pandas.melt` to reshape your data from a wide format to a long and tidy one. Use :func:`~pandas.pivot`
+to go from long to wide format. With aggregations built-in, a pivot table can be created with a single command.
.. image:: ../_static/schemas/07_melt.svg :align: center @@ -416,7 +415,7 @@ from long to wide format. With aggregations built-in, a pivot table is created w
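A minimal sketch of both reshapes, using an invented table:

.. code-block:: python

    import pandas as pd

    wide = pd.DataFrame({"id": [1, 2], "2023": [10, 20], "2024": [30, 40]})

    long = wide.melt(id_vars="id", var_name="year", value_name="value")  # wide -> long
    wide_again = long.pivot(index="id", columns="year", values="value")  # long -> wide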
-Multiple tables can be concatenated both column wise and row wise as database-like join/merge operations are provided to combine multiple tables of data.
+Multiple tables can be concatenated column wise or row wise, and database-like join and merge operations are provided to combine multiple tables of data.
.. image:: ../_static/schemas/08_concat_row.svg :align: center @@ -505,7 +504,7 @@ pandas has great support for time series and has an extensive set of tools for w
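For illustration, a minimal sketch with two invented tables:

.. code-block:: python

    import pandas as pd

    df1 = pd.DataFrame({"key": ["a", "b"], "x": [1, 2]})
    df2 = pd.DataFrame({"key": ["b", "c"], "y": [3, 4]})

    pd.concat([df1, df2])                      # stack row wise
    pd.concat([df1, df2], axis=1)              # stack column wise
    pd.merge(df1, df2, on="key", how="inner")  # database-like join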
-Data sets do not only contain numerical data. pandas provides a wide range of functions to clean textual data and extract useful information from it.
+Data sets often contain more than just numerical data. pandas provides a wide range of functions to clean textual data and extract useful information from it.
.. raw:: html @@ -551,9 +550,9 @@ the pandas-equivalent operations compared to software you already know: :class-card: comparison-card :shadow: md
- The `R programming language `__ provides the
- ``data.frame`` data structure and multiple packages, such as
- `tidyverse `__ use and extend ``data.frame``
+ The `R programming language `__ provides a
+ ``data.frame`` data structure as well as packages like
+ `tidyverse `__ which use and extend ``data.frame``
for convenient data handling functionalities similar to pandas. +++ @@ -572,8 +571,8 @@ the pandas-equivalent operations compared to software you already know: :class-card: comparison-card :shadow: md
- Already familiar to ``SELECT``, ``GROUP BY``, ``JOIN``, etc.?
- Most of these SQL manipulations do have equivalents in pandas.
+ Already familiar with ``SELECT``, ``GROUP BY``, ``JOIN``, etc.?
+ Many SQL manipulations have equivalents in pandas.
+++ @@ -613,7 +612,7 @@ the pandas-equivalent operations compared to software you already know: Users of `Excel `__ or other spreadsheet programs will find that many of the concepts are
- transferrable to pandas.
+ transferable to pandas.
+++ @@ -631,10 +630,10 @@ the pandas-equivalent operations compared to software you already know: :class-card: comparison-card :shadow: md
- The `SAS `__ statistical software suite
- also provides the ``data set`` corresponding to the pandas ``DataFrame``.
- Also SAS vectorized operations, filtering, string processing operations,
- and more have similar functions in pandas.
+ `SAS `__, the statistical software suite,
+ uses the ``data set`` structure, which closely corresponds to pandas' ``DataFrame``.
+ SAS vectorized operations, such as filtering and string processing,
+ also have similar functions in pandas.
+++ diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 1d7eca5223544..93663c1cced7e 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -6,88 +6,75 @@ Installation ============
-The easiest way to install pandas is to install it
-as part of the `Anaconda `__ distribution, a
-cross platform distribution for data analysis and scientific computing.
-The `Conda `__ package manager is the
-recommended installation method for most users.
+The pandas development team officially distributes pandas for installation
+through the following methods:
-Instructions for installing :ref:`from source `,
-:ref:`PyPI `, or a
-:ref:`development version ` are also provided.
+* Available on `conda-forge `__ for installation with the conda package manager.
+* Available on `PyPI `__ for installation with pip.
+* Available on `GitHub `__ for installation from source.
+
+.. note::
+ pandas may be installable from other sources besides the ones listed above,
+ but they are **not** managed by the pandas development team.
.. _install.version: Python version support ----------------------
-Officially Python 3.9, 3.10 and 3.11.
+See :ref:`Python support policy `.
Installing pandas -----------------
-.. _install.anaconda: ..
_install.conda: -Installing with Anaconda -~~~~~~~~~~~~~~~~~~~~~~~~ +Installing with Conda +~~~~~~~~~~~~~~~~~~~~~ -For users that are new to Python, the easiest way to install Python, pandas, and the -packages that make up the `PyData `__ stack -(`SciPy `__, `NumPy `__, -`Matplotlib `__, `and more `__) -is with `Anaconda `__, a cross-platform -(Linux, macOS, Windows) Python distribution for data analytics and -scientific computing. Installation instructions for Anaconda -`can be found here `__. +For users working with the `Conda `__ package manager, +pandas can be installed from the ``conda-forge`` channel. -.. _install.miniconda: +.. code-block:: shell -Installing with Miniconda -~~~~~~~~~~~~~~~~~~~~~~~~~ + conda install -c conda-forge pandas -For users experienced with Python, the recommended way to install pandas with -`Miniconda `__. -Miniconda allows you to create a minimal, self-contained Python installation compared to Anaconda and use the -`Conda `__ package manager to install additional packages -and create a virtual environment for your installation. Installation instructions for Miniconda -`can be found here `__. +To install the Conda package manager on your system, the +`Miniforge distribution `__ +is recommended. -The next step is to create a new conda environment. A conda environment is like a -virtualenv that allows you to specify a specific version of Python and set of libraries. -Run the following commands from a terminal window. +Additionally, it is recommended to install and run pandas from a virtual environment. .. code-block:: shell conda create -c conda-forge -n name_of_my_env python pandas - -This will create a minimal environment with only Python and pandas installed. -To put your self inside this environment run. - -.. code-block:: shell - + # On Linux or MacOS source activate name_of_my_env # On Windows activate name_of_my_env -.. _install.pypi: +.. tip:: + For users that are new to Python, the easiest way to install Python, pandas, and the + packages that make up the `PyData `__ stack such as + `SciPy `__, `NumPy `__ and + `Matplotlib `__ + is with `Anaconda `__, a cross-platform + (Linux, macOS, Windows) Python distribution for data analytics and + scientific computing. -Installing from PyPI -~~~~~~~~~~~~~~~~~~~~ + However, pandas from Anaconda is **not** officially managed by the pandas development team. -pandas can be installed via pip from -`PyPI `__. +.. _install.pip: -.. code-block:: shell - - pip install pandas +Installing with pip +~~~~~~~~~~~~~~~~~~~ -.. note:: - You must have ``pip>=19.3`` to install from PyPI. +For users working with the `pip `__ package manager, +pandas can be installed from `PyPI `__. -.. note:: +.. code-block:: shell - It is recommended to install and run pandas from a virtual environment, for example, - using the Python standard library's `venv `__ + pip install pandas pandas can also be installed with sets of optional dependencies to enable certain functionality. For example, to install pandas with the optional dependencies to read Excel files. @@ -98,25 +85,8 @@ to install pandas with the optional dependencies to read Excel files. The full list of extras that can be installed can be found in the :ref:`dependency section.` -Handling ImportErrors -~~~~~~~~~~~~~~~~~~~~~ - -If you encounter an ``ImportError``, it usually means that Python couldn't find pandas in the list of available -libraries. Python internally has a list of directories it searches through, to find packages. You can -obtain these directories with. - -.. 
code-block:: python
-
- import sys
- sys.path
-One way you could be encountering this error is if you have multiple Python installations on your system
-and you don't have pandas installed in the Python installation you're currently using.
-In Linux/Mac you can run ``which python`` on your terminal and it will tell you which Python installation you're
-using. If it's something like "/usr/bin/python", you're using the Python from the system, which is not recommended.
-It is highly recommended to use ``conda``, for quick installation and for package and dependency updates.
-You can find simple installation instructions for pandas :ref:`in this document `.
+Additionally, it is recommended to install and run pandas from a virtual environment, for example,
+using the Python standard library's `venv `__.
.. _install.source: @@ -144,49 +114,24 @@ index from the PyPI registry of anaconda.org. You can install it by running. pip install --pre --extra-index https://pypi.anaconda.org/scientific-python-nightly-wheels/simple pandas
-Note that you might be required to uninstall an existing version of pandas to install the development version.
+.. note::
+ You might be required to uninstall an existing version of pandas to install the development version.
-.. code-block:: shell
+ .. code-block:: shell
- pip uninstall pandas -y
+ pip uninstall pandas -y
Running the test suite ----------------------
-pandas is equipped with an exhaustive set of unit tests. The packages required to run the tests
-can be installed with ``pip install "pandas[test]"``. To run the tests from a
-Python terminal.
-.. code-block:: python
-
- >>> import pandas as pd
- >>> pd.test()
- running: pytest -m "not slow and not network and not db" /home/user/anaconda3/lib/python3.9/site-packages/pandas
-
- ============================= test session starts ==============================
- platform linux -- Python 3.9.7, pytest-6.2.5, py-1.11.0, pluggy-1.0.0
- rootdir: /home/user
- plugins: dash-1.19.0, anyio-3.5.0, hypothesis-6.29.3
- collected 154975 items / 4 skipped / 154971 selected
- ........................................................................ [ 0%]
- ........................................................................ [ 99%]
- ....................................... [100%]
-
- ==================================== ERRORS ====================================
-
- =================================== FAILURES ===================================
-
- =============================== warnings summary ===============================
-
- =========================== short test summary info ============================
-
- = 1 failed, 146194 passed, 7402 skipped, 1367 xfailed, 5 xpassed, 197 warnings, 10 errors in 1090.16s (0:18:10) =
+If pandas has been installed :ref:`from source `, running ``pytest pandas`` will run all of pandas' unit tests.
+The unit tests can also be run from the pandas module itself with the :func:`test` function. The packages required to run the tests
+can be installed with ``pip install "pandas[test]"``.
.. note::
- This is just an example of what information is shown. Test failures are not necessarily indicative
- of a broken pandas installation.
+ Test failures are not necessarily indicative of a broken pandas installation.
.. _install.dependencies: @@ -203,9 +148,8 @@ pandas requires the following dependencies.
================================================================ ========================== Package Minimum supported version ================================================================ ========================== -`NumPy `__ 1.22.4 +`NumPy `__ 1.23.5 `python-dateutil `__ 2.8.2 -`pytz `__ 2020.1 `tzdata `__ 2022.7 ================================================================ ========================== @@ -220,7 +164,7 @@ For example, :func:`pandas.read_hdf` requires the ``pytables`` package, while optional dependency is not installed, pandas will raise an ``ImportError`` when the method requiring that dependency is called. -If using pip, optional pandas dependencies can be installed or managed in a file (e.g. requirements.txt or pyproject.toml) +With pip, optional pandas dependencies can be installed or managed in a file (e.g. requirements.txt or pyproject.toml) as optional extras (e.g. ``pandas[performance, aws]``). All optional dependencies can be installed with ``pandas[all]``, and specific sets of dependencies are listed in the sections below. @@ -239,9 +183,9 @@ Installable with ``pip install "pandas[performance]"`` ===================================================== ================== ================== =================================================================================================================================================================================== Dependency Minimum Version pip extra Notes ===================================================== ================== ================== =================================================================================================================================================================================== -`numexpr `__ 2.8.4 performance Accelerates certain numerical operations by using multiple cores as well as smart chunking and caching to achieve large speedups +`numexpr `__ 2.9.0 performance Accelerates certain numerical operations by using multiple cores as well as smart chunking and caching to achieve large speedups `bottleneck `__ 1.3.6 performance Accelerates certain types of ``nan`` by using specialized cython routines to achieve large speedup. -`numba `__ 0.56.4 performance Alternative execution engine for operations that accept ``engine="numba"`` using a JIT compiler that translates Python functions to optimized machine code using the LLVM compiler. +`numba `__ 0.59.0 performance Alternative execution engine for operations that accept ``engine="numba"`` using a JIT compiler that translates Python functions to optimized machine code using the LLVM compiler. ===================================================== ================== ================== =================================================================================================================================================================================== Visualization @@ -249,53 +193,56 @@ Visualization Installable with ``pip install "pandas[plot, output-formatting]"``. 
-========================= ================== ================== ============================================================= -Dependency Minimum Version pip extra Notes -========================= ================== ================== ============================================================= -matplotlib 3.6.3 plot Plotting library -Jinja2 3.1.2 output-formatting Conditional formatting with DataFrame.style -tabulate 0.9.0 output-formatting Printing in Markdown-friendly format (see `tabulate`_) -========================= ================== ================== ============================================================= +========================================================== ================== ================== ======================================================= +Dependency Minimum Version pip extra Notes +========================================================== ================== ================== ======================================================= +`matplotlib `__ 3.8.3 plot Plotting library +`Jinja2 `__ 3.1.3 output-formatting Conditional formatting with DataFrame.style +`tabulate `__ 0.9.0 output-formatting Printing in Markdown-friendly format (see `tabulate`_) +========================================================== ================== ================== ======================================================= Computation ^^^^^^^^^^^ Installable with ``pip install "pandas[computation]"``. -========================= ================== =============== ============================================================= -Dependency Minimum Version pip extra Notes -========================= ================== =============== ============================================================= -SciPy 1.10.0 computation Miscellaneous statistical functions -xarray 2022.12.0 computation pandas-like API for N-dimensional data -========================= ================== =============== ============================================================= +============================================== ================== =============== ======================================= +Dependency Minimum Version pip extra Notes +============================================== ================== =============== ======================================= +`SciPy `__ 1.12.0 computation Miscellaneous statistical functions +`xarray `__ 2024.1.1 computation pandas-like API for N-dimensional data +============================================== ================== =============== ======================================= + +.. _install.excel_dependencies: Excel files ^^^^^^^^^^^ Installable with ``pip install "pandas[excel]"``. 
-========================= ================== =============== ============================================================= -Dependency Minimum Version pip extra Notes -========================= ================== =============== ============================================================= -xlrd 2.0.1 excel Reading Excel -xlsxwriter 3.0.5 excel Writing Excel -openpyxl 3.1.0 excel Reading / writing for xlsx files -pyxlsb 1.0.10 excel Reading for xlsb files -python-calamine 0.1.7 excel Reading for xls/xlsx/xlsb/ods files -========================= ================== =============== ============================================================= +================================================================== ================== =============== ============================================================= +Dependency Minimum Version pip extra Notes +================================================================== ================== =============== ============================================================= +`xlrd `__ 2.0.1 excel Reading for xls files +`xlsxwriter `__ 3.2.0 excel Writing for xlsx files +`openpyxl `__ 3.1.2 excel Reading / writing for Excel 2010 xlsx/xlsm/xltx/xltm files +`pyxlsb `__ 1.0.10 excel Reading for xlsb files +`python-calamine `__ 0.1.7 excel Reading for xls/xlsx/xlsm/xlsb/xla/xlam/ods files +`odfpy `__ 1.4.1 excel Reading / writing for OpenDocument 1.2 files +================================================================== ================== =============== ============================================================= HTML ^^^^ Installable with ``pip install "pandas[html]"``. -========================= ================== =============== ============================================================= -Dependency Minimum Version pip extra Notes -========================= ================== =============== ============================================================= -BeautifulSoup4 4.11.2 html HTML parser for read_html -html5lib 1.1 html HTML parser for read_html -lxml 4.9.2 html HTML parser for read_html -========================= ================== =============== ============================================================= +=============================================================== ================== =============== ========================== +Dependency Minimum Version pip extra Notes +=============================================================== ================== =============== ========================== +`BeautifulSoup4 `__ 4.12.3 html HTML parser for read_html +`html5lib `__ 1.1 html HTML parser for read_html +`lxml `__ 4.9.2 html HTML parser for read_html +=============================================================== ================== =============== ========================== One of the following combinations of libraries is needed to use the top-level :func:`~pandas.read_html` function: @@ -326,45 +273,45 @@ XML Installable with ``pip install "pandas[xml]"``. 
-========================= ================== =============== ============================================================= -Dependency Minimum Version pip extra Notes -========================= ================== =============== ============================================================= -lxml 4.9.2 xml XML parser for read_xml and tree builder for to_xml -========================= ================== =============== ============================================================= +======================================== ================== =============== ==================================================== +Dependency Minimum Version pip extra Notes +======================================== ================== =============== ==================================================== +`lxml `__ 4.9.2 xml XML parser for read_xml and tree builder for to_xml +======================================== ================== =============== ==================================================== SQL databases ^^^^^^^^^^^^^ Traditional drivers are installable with ``pip install "pandas[postgresql, mysql, sql-other]"`` -========================= ================== =============== ============================================================= -Dependency Minimum Version pip extra Notes -========================= ================== =============== ============================================================= -SQLAlchemy 2.0.0 postgresql, SQL support for databases other than sqlite - mysql, - sql-other -psycopg2 2.9.6 postgresql PostgreSQL engine for sqlalchemy -pymysql 1.0.2 mysql MySQL engine for sqlalchemy -adbc-driver-postgresql 0.8.0 postgresql ADBC Driver for PostgreSQL -adbc-driver-sqlite 0.8.0 sql-other ADBC Driver for SQLite -========================= ================== =============== ============================================================= +================================================================== ================== =============== ============================================ +Dependency Minimum Version pip extra Notes +================================================================== ================== =============== ============================================ +`SQLAlchemy `__ 2.0.0 postgresql, SQL support for databases other than sqlite + mysql, + sql-other +`psycopg2 `__ 2.9.6 postgresql PostgreSQL engine for sqlalchemy +`pymysql `__ 1.1.0 mysql MySQL engine for sqlalchemy +`adbc-driver-postgresql `__ 0.10.0 postgresql ADBC Driver for PostgreSQL +`adbc-driver-sqlite `__ 0.8.0 sql-other ADBC Driver for SQLite +================================================================== ================== =============== ============================================ Other data sources ^^^^^^^^^^^^^^^^^^ -Installable with ``pip install "pandas[hdf5, parquet, feather, spss, excel]"`` +Installable with ``pip install "pandas[hdf5, parquet, iceberg, feather, spss, excel]"`` -========================= ================== ================ ============================================================= -Dependency Minimum Version pip extra Notes -========================= ================== ================ ============================================================= -PyTables 3.8.0 hdf5 HDF5-based reading / writing -blosc 1.21.3 hdf5 Compression for HDF5; only available on ``conda`` -zlib hdf5 Compression for HDF5 -fastparquet 2022.12.0 - Parquet reading / writing (pyarrow is default) -pyarrow 10.0.1 parquet, feather Parquet, ORC, and feather reading / writing -pyreadstat 1.2.0 spss SPSS files (.sav) 
reading -odfpy 1.4.1 excel Open document format (.odf, .ods, .odt) reading / writing -========================= ================== ================ ============================================================= +====================================================== ================== ================ ========================================================== +Dependency Minimum Version pip extra Notes +====================================================== ================== ================ ========================================================== +`PyTables `__ 3.8.0 hdf5 HDF5-based reading / writing +`zlib `__ hdf5 Compression for HDF5 +`fastparquet `__ 2024.2.0 - Parquet reading / writing (pyarrow is default) +`pyarrow `__ 10.0.1 parquet, feather Parquet, ORC, and feather reading / writing +`PyIceberg `__ 0.7.1 iceberg Apache Iceberg reading +`pyreadstat `__ 1.2.6 spss SPSS files (.sav) reading +`odfpy `__ 1.4.1 excel Open document format (.odf, .ods, .odt) reading / writing +====================================================== ================== ================ ========================================================== .. _install.warn_orc: @@ -379,27 +326,26 @@ Access data in the cloud Installable with ``pip install "pandas[fss, aws, gcp]"`` -========================= ================== =============== ============================================================= -Dependency Minimum Version pip extra Notes -========================= ================== =============== ============================================================= -fsspec 2022.11.0 fss, gcp, aws Handling files aside from simple local and HTTP (required - dependency of s3fs, gcsfs). -gcsfs 2022.11.0 gcp Google Cloud Storage access -pandas-gbq 0.19.0 gcp Google Big Query access -s3fs 2022.11.0 aws Amazon S3 access -========================= ================== =============== ============================================================= +============================================ ================== =============== ========================================================== +Dependency Minimum Version pip extra Notes +============================================ ================== =============== ========================================================== +`fsspec `__ 2023.12.2 fss, gcp, aws Handling files aside from simple local and HTTP (required + dependency of s3fs, gcsfs). +`gcsfs `__ 2023.12.2 gcp Google Cloud Storage access +`s3fs `__ 2023.12.2 aws Amazon S3 access +============================================ ================== =============== ========================================================== Clipboard ^^^^^^^^^ Installable with ``pip install "pandas[clipboard]"``. 
-========================= ================== =============== ============================================================= -Dependency Minimum Version pip extra Notes -========================= ================== =============== ============================================================= -PyQt4/PyQt5 5.15.9 clipboard Clipboard I/O -qtpy 2.3.0 clipboard Clipboard I/O -========================= ================== =============== ============================================================= +======================================================================================== ================== =============== ============== +Dependency Minimum Version pip extra Notes +======================================================================================== ================== =============== ============== +`PyQt4 `__/`PyQt5 `__ 5.15.9 clipboard Clipboard I/O +`qtpy `__ 2.3.0 clipboard Clipboard I/O +======================================================================================== ================== =============== ============== .. note:: @@ -412,19 +358,19 @@ Compression Installable with ``pip install "pandas[compression]"`` -========================= ================== =============== ============================================================= -Dependency Minimum Version pip extra Notes -========================= ================== =============== ============================================================= -Zstandard 0.19.0 compression Zstandard compression -========================= ================== =============== ============================================================= +================================================= ================== =============== ====================== +Dependency Minimum Version pip extra Notes +================================================= ================== =============== ====================== +`Zstandard `__ 0.19.0 compression Zstandard compression +================================================= ================== =============== ====================== -Consortium Standard -^^^^^^^^^^^^^^^^^^^ +Timezone +^^^^^^^^ -Installable with ``pip install "pandas[consortium-standard]"`` +Installable with ``pip install "pandas[timezone]"`` -========================= ================== =================== ============================================================= -Dependency Minimum Version pip extra Notes -========================= ================== =================== ============================================================= -dataframe-api-compat 0.1.7 consortium-standard Consortium Standard-compatible implementation based on pandas -========================= ================== =================== ============================================================= +========================================== ================== =================== ============================================== +Dependency Minimum Version pip extra Notes +========================================== ================== =================== ============================================== +`pytz `__ 2023.4 timezone Alternative timezone library to ``zoneinfo``. 
+========================================== ================== =================== ============================================== diff --git a/doc/source/getting_started/intro_tutorials/01_table_oriented.rst b/doc/source/getting_started/intro_tutorials/01_table_oriented.rst index caaff3557ae40..efcdb22778ef4 100644 --- a/doc/source/getting_started/intro_tutorials/01_table_oriented.rst +++ b/doc/source/getting_started/intro_tutorials/01_table_oriented.rst @@ -46,7 +46,7 @@ I want to store passenger data of the Titanic. For a number of passengers, I kno "Name": [ "Braund, Mr. Owen Harris", "Allen, Mr. William Henry", - "Bonnell, Miss. Elizabeth", + "Bonnell, Miss Elizabeth", ], "Age": [22, 35, 58], "Sex": ["male", "male", "female"], @@ -192,8 +192,8 @@ Check more options on ``describe`` in the user guide section about :ref:`aggrega .. note:: This is just a starting point. Similar to spreadsheet software, pandas represents data as a table with columns and rows. Apart - from the representation, also the data manipulations and calculations - you would do in spreadsheet software are supported by pandas. Continue + from the representation, the data manipulations and calculations + you would do in spreadsheet software are also supported by pandas. Continue reading the next tutorials to get started! .. raw:: html @@ -204,7 +204,7 @@ Check more options on ``describe`` in the user guide section about :ref:`aggrega - Import the package, aka ``import pandas as pd`` - A table of data is stored as a pandas ``DataFrame`` - Each column in a ``DataFrame`` is a ``Series`` -- You can do things by applying a method to a ``DataFrame`` or ``Series`` +- You can do things by applying a method on a ``DataFrame`` or ``Series`` .. raw:: html @@ -215,7 +215,7 @@ Check more options on ``describe`` in the user guide section about :ref:`aggrega
To user guide
-A more extended explanation to ``DataFrame`` and ``Series`` is provided in the :ref:`introduction to data structures `.
+A more extended explanation of ``DataFrame`` and ``Series`` is provided in the :ref:`introduction to data structures ` page.
.. raw:: html
diff --git a/doc/source/getting_started/intro_tutorials/02_read_write.rst b/doc/source/getting_started/intro_tutorials/02_read_write.rst index 832c2cc25712f..0549c17a1013c 100644 --- a/doc/source/getting_started/intro_tutorials/02_read_write.rst +++ b/doc/source/getting_started/intro_tutorials/02_read_write.rst @@ -97,11 +97,11 @@ in this ``DataFrame`` are integers (``int64``), floats (``float64``) and strings (``object``).
.. note::
- When asking for the ``dtypes``, no brackets are used!
+ When asking for the ``dtypes``, no parentheses ``()`` are used!
``dtypes`` is an attribute of a ``DataFrame`` and ``Series``. Attributes
- of a ``DataFrame`` or ``Series`` do not need brackets. Attributes
+ of a ``DataFrame`` or ``Series`` do not need ``()``. Attributes
represent a characteristic of a ``DataFrame``/``Series``, whereas
- methods (which require brackets) *do* something with the
+ methods (which require parentheses ``()``) *do* something with the
``DataFrame``/``Series`` as introduced in the :ref:`first tutorial <10min_tut_01_tableoriented>`.
.. raw:: html
My colleague requested the Titanic data as a spreadsheet.
+.. note::
+ If you want to use :func:`~pandas.to_excel` and :func:`~pandas.read_excel`,
+ you need to install an Excel reader as outlined in the
+ :ref:`Excel files ` section of the
+ installation documentation.
+
.. ipython:: python
titanic.to_excel("titanic.xlsx", sheet_name="passengers", index=False)
@@ -166,11 +172,11 @@ The method :meth:`~DataFrame.info` provides technical information about a
- The table has 12 columns. Most columns have a value for each of the rows (all 891 values are ``non-null``). Some columns do have missing values and less than 891 ``non-null`` values.
-- The columns ``Name``, ``Sex``, ``Cabin`` and ``Embarked`` consists of
+- The columns ``Name``, ``Sex``, ``Cabin`` and ``Embarked`` consist of
textual data (strings, aka ``object``). The other columns are
- numerical data with some of them whole numbers (aka ``integer``) and
- others are real numbers (aka ``float``).
-- The kind of data (characters, integers,…) in the different columns
+ numerical data; some of them are whole numbers (``integer``) and
+ others are real numbers (``float``).
+- The kind of data (characters, integers, …) in the different columns
are summarized by listing the ``dtypes``.
- The approximate amount of RAM used to hold the DataFrame is provided as well.
@@ -188,7 +194,7 @@ The method :meth:`~DataFrame.info` provides technical information about a
- Getting data into pandas from many different file formats or data sources is supported by ``read_*`` functions.
- Exporting data out of pandas is provided by different
- ``to_*``\ methods.
+ ``to_*`` methods.
- The ``head``/``tail``/``info`` methods and the ``dtypes`` attribute are convenient for a first check.
diff --git a/doc/source/getting_started/intro_tutorials/03_subset_data.rst b/doc/source/getting_started/intro_tutorials/03_subset_data.rst index 6d7ec01551572..ced976f680885 100644 --- a/doc/source/getting_started/intro_tutorials/03_subset_data.rst +++ b/doc/source/getting_started/intro_tutorials/03_subset_data.rst @@ -101,7 +101,7 @@ selection brackets ``[]``. ..
note:: The inner square brackets define a :ref:`Python list ` with column names, whereas - the outer brackets are used to select the data from a pandas + the outer square brackets are used to select the data from a pandas ``DataFrame`` as seen in the previous example. The returned data type is a pandas DataFrame: @@ -300,7 +300,7 @@ want to select. -When using the column names, row labels or a condition expression, use +When using column names, row labels or a condition expression, use the ``loc`` operator in front of the selection brackets ``[]``. For both the part before and after the comma, you can use a single label, a list of labels, a slice of labels, a conditional expression or a colon. Using @@ -335,14 +335,14 @@ the name ``anonymous`` to the first 3 elements of the fourth column: .. ipython:: python titanic.iloc[0:3, 3] = "anonymous" - titanic.head() + titanic.iloc[:5, 3] .. raw:: html
To user guide -See the user guide section on :ref:`different choices for indexing ` to get more insight in the usage of ``loc`` and ``iloc``. +See the user guide section on :ref:`different choices for indexing ` to get more insight into the usage of ``loc`` and ``iloc``. .. raw:: html @@ -354,13 +354,11 @@ See the user guide section on :ref:`different choices for indexing REMEMBER - When selecting subsets of data, square brackets ``[]`` are used. -- Inside these brackets, you can use a single column/row label, a list +- Inside these square brackets, you can use a single column/row label, a list of column/row labels, a slice of labels, a conditional expression or a colon. -- Select specific rows and/or columns using ``loc`` when using the row - and column names. -- Select specific rows and/or columns using ``iloc`` when using the - positions in the table. +- Use ``loc`` for label-based selection (using row/column names). +- Use ``iloc`` for position-based selection (using table positions). - You can assign new values to a selection based on ``loc``/``iloc``. .. raw:: html diff --git a/doc/source/getting_started/intro_tutorials/04_plotting.rst b/doc/source/getting_started/intro_tutorials/04_plotting.rst index e96eb7c51a12a..e9f83c602d086 100644 --- a/doc/source/getting_started/intro_tutorials/04_plotting.rst +++ b/doc/source/getting_started/intro_tutorials/04_plotting.rst @@ -32,8 +32,10 @@ How do I create plots in pandas? air_quality.head() .. note:: - The usage of the ``index_col`` and ``parse_dates`` parameters of the ``read_csv`` function to define the first (0th) column as - index of the resulting ``DataFrame`` and convert the dates in the column to :class:`Timestamp` objects, respectively. + The ``index_col=0`` and ``parse_dates=True`` parameters passed to the ``read_csv`` function define + the first (0th) column as index of the resulting ``DataFrame`` and convert the dates in the column + to :class:`Timestamp` objects, respectively. + .. raw:: html @@ -85,7 +87,7 @@ I want to plot only the columns of the data table with the data from Paris. air_quality["station_paris"].plot() plt.show() -To plot a specific column, use the selection method of the +To plot a specific column, use a selection method from the :ref:`subset data tutorial <10min_tut_03_subset>` in combination with the :meth:`~DataFrame.plot` method. Hence, the :meth:`~DataFrame.plot` method works on both ``Series`` and ``DataFrame``. @@ -127,7 +129,7 @@ standard Python to get an overview of the available plot methods: ] .. note:: - In many development environments as well as IPython and + In many development environments such as IPython and Jupyter Notebook, use the TAB button to get an overview of the available methods, for example ``air_quality.plot.`` + TAB. @@ -238,7 +240,7 @@ This strategy is applied in the previous example: - The ``.plot.*`` methods are applicable on both Series and DataFrames. - By default, each of the columns is plotted as a different element - (line, boxplot,…). + (line, boxplot, …). - Any plot created by pandas is a Matplotlib object. .. 
raw:: html
diff --git a/doc/source/getting_started/intro_tutorials/05_add_columns.rst b/doc/source/getting_started/intro_tutorials/05_add_columns.rst index d59a70cc2818e..481c094870e12 100644 --- a/doc/source/getting_started/intro_tutorials/05_add_columns.rst +++ b/doc/source/getting_started/intro_tutorials/05_add_columns.rst @@ -51,7 +51,7 @@ hPa, the conversion factor is 1.882*) air_quality["london_mg_per_cubic"] = air_quality["station_london"] * 1.882 air_quality.head()
-To create a new column, use the ``[]`` brackets with the new column name
+To create a new column, use the square brackets ``[]`` with the new column name
at the left side of the assignment.
.. raw:: html @@ -89,8 +89,8 @@ values in each row*.
-Also other mathematical operators (``+``, ``-``, ``*``, ``/``,…) or
-logical operators (``<``, ``>``, ``==``,…) work element-wise. The latter was already
+Other mathematical operators (``+``, ``-``, ``*``, ``/``, …) and logical
+operators (``<``, ``>``, ``==``, …) also work element-wise. The latter was already
used in the :ref:`subset data tutorial <10min_tut_03_subset>` to filter rows of a table using a conditional expression.
diff --git a/doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst b/doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst index fe3ae820e7085..1399ab66426f4 100644 --- a/doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst +++ b/doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst @@ -162,7 +162,7 @@ columns by passing ``numeric_only=True``: It does not make much sense to get the average value of the ``Pclass``. If we are only interested in the average age for each gender, the
-selection of columns (rectangular brackets ``[]`` as usual) is supported
+selection of columns (square brackets ``[]`` as usual) is supported
on the grouped data as well:
.. ipython:: python @@ -235,7 +235,7 @@ category in a column.
-The function is a shortcut, as it is actually a groupby operation in combination with counting of the number of records
+The function is a shortcut; it is actually a groupby operation in combination with counting the number of records
within each group:
.. ipython:: python
diff --git a/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst b/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst index 9081f274cd941..024300bb8a9b0 100644 --- a/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst +++ b/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst @@ -137,7 +137,7 @@ Hence, the resulting table has 3178 = 1110 + 2068 rows. Most operations like concatenation or summary statistics are by default across rows (axis 0), but can be applied across columns as well.
-Sorting the table on the datetime information illustrates also the
+Sorting the table on the datetime information also illustrates the
combination of both tables, with the ``parameter`` column defining the origin of the table (either ``no2`` from table ``air_quality_no2`` or ``pm25`` from table ``air_quality_pm25``):
@@ -271,7 +271,7 @@ Add the parameters' full description and name, provided by the parameters metada Compared to the previous example, there is no common column name. However, the ``parameter`` column in the ``air_quality`` table and the
-``id`` column in the ``air_quality_parameters_name`` both provide the
+``id`` column in the ``air_quality_parameters`` table both provide the
measured variable in a common format.
The ``left_on`` and ``right_on`` arguments are used here (instead of just ``on``) to make the link between the two tables. @@ -286,7 +286,7 @@ between the two tables.
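A minimal sketch of that keyword pattern, with invented tables standing in for the air-quality data:

.. code-block:: python

    import pandas as pd

    measurements = pd.DataFrame({"parameter": ["no2", "pm25"], "value": [23.0, 7.5]})
    meta = pd.DataFrame({"id": ["no2", "pm25"], "name": ["nitrogen dioxide", "particulate matter"]})

    # the key columns carry different names, so left_on/right_on replace on
    pd.merge(measurements, meta, how="left", left_on="parameter", right_on="id")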
To user guide -pandas supports also inner, outer, and right joins. +pandas also supports inner, outer, and right joins. More information on join/merge of tables is provided in the user guide section on :ref:`database style merging of tables `. Or have a look at the :ref:`comparison with SQL` page. @@ -300,7 +300,7 @@ More information on join/merge of tables is provided in the user guide section o

REMEMBER

-- Multiple tables can be concatenated both column-wise and row-wise using +- Multiple tables can be concatenated column-wise or row-wise using the ``concat`` function. - For database-like merging/joining of tables, use the ``merge`` function. diff --git a/doc/source/getting_started/intro_tutorials/09_timeseries.rst b/doc/source/getting_started/intro_tutorials/09_timeseries.rst index b0530087e5b84..6ba3c17fac3c3 100644 --- a/doc/source/getting_started/intro_tutorials/09_timeseries.rst +++ b/doc/source/getting_started/intro_tutorials/09_timeseries.rst @@ -77,9 +77,9 @@ I want to work with the dates in the column ``datetime`` as datetime objects ins Initially, the values in ``datetime`` are character strings and do not provide any datetime operations (e.g. extract the year, day of the -week,…). By applying the ``to_datetime`` function, pandas interprets the +week, …). By applying the ``to_datetime`` function, pandas interprets the strings and converts these to datetime (i.e. ``datetime64[ns, UTC]``) -objects. In pandas we call these datetime objects similar to +objects. In pandas, we call these datetime objects, which are similar to ``datetime.datetime`` from the standard library, :class:`pandas.Timestamp`. .. raw:: html @@ -117,7 +117,7 @@ length of our time series: air_quality["datetime"].max() - air_quality["datetime"].min() The result is a :class:`pandas.Timedelta` object, similar to ``datetime.timedelta`` -from the standard Python library and defining a time duration. +from the standard Python library, which defines a time duration. .. raw:: html @@ -257,7 +257,7 @@ the adapted time scale on plots. Let’s apply this on our data.
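A minimal sketch of the ``to_datetime`` conversion and ``Timedelta`` arithmetic summarized above, on two hand-written timestamps rather than the air-quality data:

.. code-block:: python

    import pandas as pd

    s = pd.Series(["2019-05-20 10:00", "2019-05-21 14:30"])
    dt = pd.to_datetime(s)  # character strings become datetime64[ns] values
    dt.max() - dt.min()     # a pandas.Timedelta: Timedelta('1 days 04:30:00')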
  • -Create a plot of the :math:`NO_2` values in the different stations from the 20th of May till the end of 21st of May +Create a plot of the :math:`NO_2` values in the different stations from May 20th till the end of May 21st. .. ipython:: python :okwarning: @@ -295,7 +295,7 @@ Aggregate the current hourly time series values to the monthly maximum value in .. ipython:: python - monthly_max = no_2.resample("ME").max() + monthly_max = no_2.resample("MS").max() monthly_max A very powerful method on time series data with a datetime index, is the @@ -310,7 +310,7 @@ converting secondly data into 5-minutely data). The :meth:`~Series.resample` method is similar to a groupby operation: - it provides a time-based grouping, by using a string (e.g. ``M``, - ``5H``,…) that defines the target frequency + ``5H``, …) that defines the target frequency - it requires an aggregation function such as ``mean``, ``max``,… .. raw:: html diff --git a/doc/source/getting_started/intro_tutorials/10_text_data.rst b/doc/source/getting_started/intro_tutorials/10_text_data.rst index 5b1885791d8fb..8493a071863c4 100644 --- a/doc/source/getting_started/intro_tutorials/10_text_data.rst +++ b/doc/source/getting_started/intro_tutorials/10_text_data.rst @@ -134,8 +134,8 @@ only one countess on the Titanic, we get one row as a result. .. note:: More powerful extractions on strings are supported, as the :meth:`Series.str.contains` and :meth:`Series.str.extract` methods accept `regular - expressions `__, but out of - scope of this tutorial. + expressions `__, but are out of + the scope of this tutorial. .. raw:: html @@ -200,7 +200,7 @@ In the "Sex" column, replace values of "male" by "M" and values of "female" by " Whereas :meth:`~Series.replace` is not a string method, it provides a convenient way to use mappings or vocabularies to translate certain values. It requires -a ``dictionary`` to define the mapping ``{from : to}``. +a ``dictionary`` to define the mapping ``{from: to}``. .. raw:: html diff --git a/doc/source/getting_started/intro_tutorials/includes/titanic.rst b/doc/source/getting_started/intro_tutorials/includes/titanic.rst index 6e03b848aab06..41159516200fa 100644 --- a/doc/source/getting_started/intro_tutorials/includes/titanic.rst +++ b/doc/source/getting_started/intro_tutorials/includes/titanic.rst @@ -11,7 +11,7 @@ This tutorial uses the Titanic data set, stored as CSV. The data consists of the following data columns: - PassengerId: Id of every passenger. -- Survived: Indication whether passenger survived. ``0`` for yes and ``1`` for no. +- Survived: Indication whether passenger survived. ``0`` for no and ``1`` for yes. - Pclass: One out of the 3 ticket classes: Class ``1``, Class ``2`` and Class ``3``. - Name: Name of passenger. - Sex: Gender of passenger. diff --git a/doc/source/getting_started/overview.rst b/doc/source/getting_started/overview.rst index 05a7d63b7ff47..98a68080d33ef 100644 --- a/doc/source/getting_started/overview.rst +++ b/doc/source/getting_started/overview.rst @@ -6,11 +6,11 @@ Package overview **************** -pandas is a `Python `__ package providing fast, +pandas is a `Python `__ package that provides fast, flexible, and expressive data structures designed to make working with "relational" or "labeled" data both easy and intuitive. It aims to be the -fundamental high-level building block for doing practical, **real-world** data -analysis in Python. 
Additionally, it has the broader goal of becoming **the +fundamental high-level building block for Python's practical, **real-world** data +analysis. Additionally, it seeks to become **the most powerful and flexible open source data analysis/manipulation tool available in any language**. It is already well on its way toward this goal. @@ -174,3 +174,4 @@ License ------- .. literalinclude:: ../../../LICENSE + :language: none diff --git a/doc/source/getting_started/tutorials.rst b/doc/source/getting_started/tutorials.rst index 4393c3716bdad..eae7771418485 100644 --- a/doc/source/getting_started/tutorials.rst +++ b/doc/source/getting_started/tutorials.rst @@ -112,7 +112,7 @@ Various tutorials * `Wes McKinney's (pandas BDFL) blog `_ * `Statistical analysis made easy in Python with SciPy and pandas DataFrames, by Randal Olson `_ -* `Statistical Data Analysis in Python, tutorial videos, by Christopher Fonnesbeck from SciPy 2013 `_ +* `Statistical Data Analysis in Python, tutorial by Christopher Fonnesbeck from SciPy 2013 `_ * `Financial analysis in Python, by Thomas Wiecki `_ * `Intro to pandas data structures, by Greg Reda `_ * `Pandas DataFrames Tutorial, by Karlijn Willems `_ diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst index fe65364896f54..d37eebef5c0c0 100644 --- a/doc/source/reference/arrays.rst +++ b/doc/source/reference/arrays.rst @@ -61,7 +61,7 @@ is an :class:`ArrowDtype`. support as NumPy including first-class nullability support for all data types, immutability and more. The table below shows the equivalent pyarrow-backed (``pa``), pandas extension, and numpy (``np``) types that are recognized by pandas. -Pyarrow-backed types below need to be passed into :class:`ArrowDtype` to be recognized by pandas e.g. ``pd.ArrowDtype(pa.bool_())`` +Pyarrow-backed types below need to be passed into :class:`ArrowDtype` to be recognized by pandas e.g. ``pd.ArrowDtype(pa.bool_())``. =============================================== ========================== =================== PyArrow type pandas extension type NumPy type @@ -114,7 +114,7 @@ values. ArrowDtype -For more information, please see the :ref:`PyArrow user guide ` +For more information, please see the :ref:`PyArrow user guide `. .. _api.arrays.datetime: @@ -495,7 +495,7 @@ a :class:`CategoricalDtype`. CategoricalDtype.categories CategoricalDtype.ordered -Categorical data can be stored in a :class:`pandas.Categorical` +Categorical data can be stored in a :class:`pandas.Categorical`: .. autosummary:: :toctree: api/ @@ -539,6 +539,21 @@ To create a Series of dtype ``category``, use ``cat = s.astype(dtype)`` or If the :class:`Series` is of dtype :class:`CategoricalDtype`, ``Series.cat`` can be used to change the categorical data. See :ref:`api.series.cat` for more. +More methods are available on :class:`Categorical`: + +.. autosummary:: + :toctree: api/ + + Categorical.as_ordered + Categorical.as_unordered + Categorical.set_categories + Categorical.rename_categories + Categorical.reorder_categories + Categorical.add_categories + Categorical.remove_categories + Categorical.remove_unused_categories + Categorical.map + .. 
_api.arrays.sparse: Sparse @@ -649,6 +664,7 @@ Data type introspection api.types.is_datetime64_dtype api.types.is_datetime64_ns_dtype api.types.is_datetime64tz_dtype + api.types.is_dtype_equal api.types.is_extension_array_dtype api.types.is_float_dtype api.types.is_int64_dtype @@ -685,7 +701,6 @@ Scalar introspection api.types.is_float api.types.is_hashable api.types.is_integer - api.types.is_interval api.types.is_number api.types.is_re api.types.is_re_compilable diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index fefb02dd916cd..e701d48a89db7 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -48,7 +48,7 @@ Conversion DataFrame.convert_dtypes DataFrame.infer_objects DataFrame.copy - DataFrame.bool + DataFrame.to_numpy Indexing, iteration ~~~~~~~~~~~~~~~~~~~ @@ -74,6 +74,7 @@ Indexing, iteration DataFrame.where DataFrame.mask DataFrame.query + DataFrame.isetitem For more information on ``.at``, ``.iat``, ``.loc``, and ``.iloc``, see the :ref:`indexing documentation `. @@ -117,7 +118,6 @@ Function application, GroupBy & window DataFrame.apply DataFrame.map - DataFrame.applymap DataFrame.pipe DataFrame.agg DataFrame.aggregate @@ -185,11 +185,8 @@ Reindexing / selection / label manipulation DataFrame.duplicated DataFrame.equals DataFrame.filter - DataFrame.first - DataFrame.head DataFrame.idxmax DataFrame.idxmin - DataFrame.last DataFrame.reindex DataFrame.reindex_like DataFrame.rename @@ -198,7 +195,6 @@ Reindexing / selection / label manipulation DataFrame.sample DataFrame.set_axis DataFrame.set_index - DataFrame.tail DataFrame.take DataFrame.truncate @@ -209,7 +205,6 @@ Missing data handling .. autosummary:: :toctree: api/ - DataFrame.backfill DataFrame.bfill DataFrame.dropna DataFrame.ffill @@ -219,7 +214,6 @@ Missing data handling DataFrame.isnull DataFrame.notna DataFrame.notnull - DataFrame.pad DataFrame.replace Reshaping, sorting, transposing @@ -238,7 +232,6 @@ Reshaping, sorting, transposing DataFrame.swaplevel DataFrame.stack DataFrame.unstack - DataFrame.swapaxes DataFrame.melt DataFrame.explode DataFrame.squeeze @@ -382,7 +375,6 @@ Serialization / IO / conversion DataFrame.to_feather DataFrame.to_latex DataFrame.to_stata - DataFrame.to_gbq DataFrame.to_records DataFrame.to_string DataFrame.to_clipboard diff --git a/doc/source/reference/groupby.rst b/doc/source/reference/groupby.rst index 771163ae1b0bc..004651ac0074f 100644 --- a/doc/source/reference/groupby.rst +++ b/doc/source/reference/groupby.rst @@ -79,8 +79,9 @@ Function application DataFrameGroupBy.cumsum DataFrameGroupBy.describe DataFrameGroupBy.diff + DataFrameGroupBy.ewm + DataFrameGroupBy.expanding DataFrameGroupBy.ffill - DataFrameGroupBy.fillna DataFrameGroupBy.first DataFrameGroupBy.head DataFrameGroupBy.idxmax @@ -105,6 +106,7 @@ Function application DataFrameGroupBy.shift DataFrameGroupBy.size DataFrameGroupBy.skew + DataFrameGroupBy.kurt DataFrameGroupBy.std DataFrameGroupBy.sum DataFrameGroupBy.var @@ -130,8 +132,9 @@ Function application SeriesGroupBy.cumsum SeriesGroupBy.describe SeriesGroupBy.diff + SeriesGroupBy.ewm + SeriesGroupBy.expanding SeriesGroupBy.ffill - SeriesGroupBy.fillna SeriesGroupBy.first SeriesGroupBy.head SeriesGroupBy.last @@ -161,6 +164,7 @@ Function application SeriesGroupBy.shift SeriesGroupBy.size SeriesGroupBy.skew + SeriesGroupBy.kurt SeriesGroupBy.std SeriesGroupBy.sum SeriesGroupBy.var diff --git a/doc/source/reference/index.rst b/doc/source/reference/index.rst index 7da02f7958416..639bac4d40b70 100644 --- 
a/doc/source/reference/index.rst +++ b/doc/source/reference/index.rst @@ -24,13 +24,14 @@ The following subpackages are public. `pandas-stubs `_ package which has classes in addition to those that occur in pandas for type-hinting. -In addition, public functions in ``pandas.io`` and ``pandas.tseries`` submodules -are mentioned in the documentation. +In addition, public functions in ``pandas.io``, ``pandas.tseries``, and ``pandas.util`` submodules +are explicitly mentioned in the documentation. Further APIs in these modules are not guaranteed +to be stable. .. warning:: - The ``pandas.core``, ``pandas.compat``, and ``pandas.util`` top-level modules are PRIVATE. Stable functionality in such modules is not guaranteed. + The ``pandas.core`` and ``pandas.compat`` top-level modules are PRIVATE. Stable functionality in such modules is not guaranteed. .. If you update this toctree, also update the manual toctree in the .. main index.rst.template @@ -61,7 +62,6 @@ are mentioned in the documentation. .. .. toctree:: - api/pandas.Index.holds_integer api/pandas.Index.nlevels api/pandas.Index.sort diff --git a/doc/source/reference/indexing.rst b/doc/source/reference/indexing.rst index fa6105761df0a..7a4bc0f467f9a 100644 --- a/doc/source/reference/indexing.rst +++ b/doc/source/reference/indexing.rst @@ -41,6 +41,7 @@ Properties Index.empty Index.T Index.memory_usage + Index.array Modifying and computations ~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -61,13 +62,6 @@ Modifying and computations Index.identical Index.insert Index.is_ - Index.is_boolean - Index.is_categorical - Index.is_floating - Index.is_integer - Index.is_interval - Index.is_numeric - Index.is_object Index.min Index.max Index.reindex @@ -110,6 +104,7 @@ Conversion Index.to_list Index.to_series Index.to_frame + Index.to_numpy Index.view Sorting diff --git a/doc/source/reference/io.rst b/doc/source/reference/io.rst index fbd0f6bd200b9..6e5992916f800 100644 --- a/doc/source/reference/io.rst +++ b/doc/source/reference/io.rst @@ -156,6 +156,15 @@ Parquet read_parquet DataFrame.to_parquet +Iceberg +~~~~~~~ +.. autosummary:: + :toctree: api/ + + read_iceberg + +.. warning:: ``read_iceberg`` is experimental and may change without warning. + ORC ~~~ .. autosummary:: @@ -188,13 +197,6 @@ SQL read_sql DataFrame.to_sql -Google BigQuery ~~~~~~~~~~~~~~~ -.. autosummary:: - :toctree: api/ - - read_gbq - STATA ~~~~~ ..
autosummary:: diff --git a/doc/source/reference/offset_frequency.rst b/doc/source/reference/offset_frequency.rst index ab89fe74e7337..5876e005574fd 100644 --- a/doc/source/reference/offset_frequency.rst +++ b/doc/source/reference/offset_frequency.rst @@ -26,8 +26,6 @@ Properties DateOffset.normalize DateOffset.rule_code DateOffset.n - DateOffset.is_month_start - DateOffset.is_month_end Methods ~~~~~~~ @@ -35,7 +33,6 @@ Methods :toctree: api/ DateOffset.copy - DateOffset.is_anchored DateOffset.is_on_offset DateOffset.is_month_start DateOffset.is_month_end @@ -82,7 +79,6 @@ Methods :toctree: api/ BusinessDay.copy - BusinessDay.is_anchored BusinessDay.is_on_offset BusinessDay.is_month_start BusinessDay.is_month_end @@ -122,7 +118,6 @@ Methods :toctree: api/ BusinessHour.copy - BusinessHour.is_anchored BusinessHour.is_on_offset BusinessHour.is_month_start BusinessHour.is_month_end @@ -169,7 +164,6 @@ Methods :toctree: api/ CustomBusinessDay.copy - CustomBusinessDay.is_anchored CustomBusinessDay.is_on_offset CustomBusinessDay.is_month_start CustomBusinessDay.is_month_end @@ -209,7 +203,6 @@ Methods :toctree: api/ CustomBusinessHour.copy - CustomBusinessHour.is_anchored CustomBusinessHour.is_on_offset CustomBusinessHour.is_month_start CustomBusinessHour.is_month_end @@ -244,7 +237,6 @@ Methods :toctree: api/ MonthEnd.copy - MonthEnd.is_anchored MonthEnd.is_on_offset MonthEnd.is_month_start MonthEnd.is_month_end @@ -279,7 +271,6 @@ Methods :toctree: api/ MonthBegin.copy - MonthBegin.is_anchored MonthBegin.is_on_offset MonthBegin.is_month_start MonthBegin.is_month_end @@ -323,7 +314,6 @@ Methods :toctree: api/ BusinessMonthEnd.copy - BusinessMonthEnd.is_anchored BusinessMonthEnd.is_on_offset BusinessMonthEnd.is_month_start BusinessMonthEnd.is_month_end @@ -367,7 +357,6 @@ Methods :toctree: api/ BusinessMonthBegin.copy - BusinessMonthBegin.is_anchored BusinessMonthBegin.is_on_offset BusinessMonthBegin.is_month_start BusinessMonthBegin.is_month_end @@ -415,7 +404,6 @@ Methods :toctree: api/ CustomBusinessMonthEnd.copy - CustomBusinessMonthEnd.is_anchored CustomBusinessMonthEnd.is_on_offset CustomBusinessMonthEnd.is_month_start CustomBusinessMonthEnd.is_month_end @@ -463,7 +451,6 @@ Methods :toctree: api/ CustomBusinessMonthBegin.copy - CustomBusinessMonthBegin.is_anchored CustomBusinessMonthBegin.is_on_offset CustomBusinessMonthBegin.is_month_start CustomBusinessMonthBegin.is_month_end @@ -499,7 +486,6 @@ Methods :toctree: api/ SemiMonthEnd.copy - SemiMonthEnd.is_anchored SemiMonthEnd.is_on_offset SemiMonthEnd.is_month_start SemiMonthEnd.is_month_end @@ -535,7 +521,6 @@ Methods :toctree: api/ SemiMonthBegin.copy - SemiMonthBegin.is_anchored SemiMonthBegin.is_on_offset SemiMonthBegin.is_month_start SemiMonthBegin.is_month_end @@ -571,7 +556,6 @@ Methods :toctree: api/ Week.copy - Week.is_anchored Week.is_on_offset Week.is_month_start Week.is_month_end @@ -607,7 +591,6 @@ Methods :toctree: api/ WeekOfMonth.copy - WeekOfMonth.is_anchored WeekOfMonth.is_on_offset WeekOfMonth.weekday WeekOfMonth.is_month_start @@ -645,7 +628,6 @@ Methods :toctree: api/ LastWeekOfMonth.copy - LastWeekOfMonth.is_anchored LastWeekOfMonth.is_on_offset LastWeekOfMonth.is_month_start LastWeekOfMonth.is_month_end @@ -681,7 +663,6 @@ Methods :toctree: api/ BQuarterEnd.copy - BQuarterEnd.is_anchored BQuarterEnd.is_on_offset BQuarterEnd.is_month_start BQuarterEnd.is_month_end @@ -717,7 +698,6 @@ Methods :toctree: api/ BQuarterBegin.copy - BQuarterBegin.is_anchored BQuarterBegin.is_on_offset BQuarterBegin.is_month_start 
BQuarterBegin.is_month_end @@ -753,7 +733,6 @@ Methods :toctree: api/ QuarterEnd.copy - QuarterEnd.is_anchored QuarterEnd.is_on_offset QuarterEnd.is_month_start QuarterEnd.is_month_end @@ -789,7 +768,6 @@ Methods :toctree: api/ QuarterBegin.copy - QuarterBegin.is_anchored QuarterBegin.is_on_offset QuarterBegin.is_month_start QuarterBegin.is_month_end @@ -798,6 +776,146 @@ Methods QuarterBegin.is_year_start QuarterBegin.is_year_end +BHalfYearEnd +------------ +.. autosummary:: + :toctree: api/ + + BHalfYearEnd + +Properties +~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + BHalfYearEnd.freqstr + BHalfYearEnd.kwds + BHalfYearEnd.name + BHalfYearEnd.nanos + BHalfYearEnd.normalize + BHalfYearEnd.rule_code + BHalfYearEnd.n + BHalfYearEnd.startingMonth + +Methods +~~~~~~~ +.. autosummary:: + :toctree: api/ + + BHalfYearEnd.copy + BHalfYearEnd.is_on_offset + BHalfYearEnd.is_month_start + BHalfYearEnd.is_month_end + BHalfYearEnd.is_quarter_start + BHalfYearEnd.is_quarter_end + BHalfYearEnd.is_year_start + BHalfYearEnd.is_year_end + +BHalfYearBegin +-------------- +.. autosummary:: + :toctree: api/ + + BHalfYearBegin + +Properties +~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + BHalfYearBegin.freqstr + BHalfYearBegin.kwds + BHalfYearBegin.name + BHalfYearBegin.nanos + BHalfYearBegin.normalize + BHalfYearBegin.rule_code + BHalfYearBegin.n + BHalfYearBegin.startingMonth + +Methods +~~~~~~~ +.. autosummary:: + :toctree: api/ + + BHalfYearBegin.copy + BHalfYearBegin.is_on_offset + BHalfYearBegin.is_month_start + BHalfYearBegin.is_month_end + BHalfYearBegin.is_quarter_start + BHalfYearBegin.is_quarter_end + BHalfYearBegin.is_year_start + BHalfYearBegin.is_year_end + +HalfYearEnd +----------- +.. autosummary:: + :toctree: api/ + + HalfYearEnd + +Properties +~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + HalfYearEnd.freqstr + HalfYearEnd.kwds + HalfYearEnd.name + HalfYearEnd.nanos + HalfYearEnd.normalize + HalfYearEnd.rule_code + HalfYearEnd.n + HalfYearEnd.startingMonth + +Methods +~~~~~~~ +.. autosummary:: + :toctree: api/ + + HalfYearEnd.copy + HalfYearEnd.is_on_offset + HalfYearEnd.is_month_start + HalfYearEnd.is_month_end + HalfYearEnd.is_quarter_start + HalfYearEnd.is_quarter_end + HalfYearEnd.is_year_start + HalfYearEnd.is_year_end + +HalfYearBegin +------------- +.. autosummary:: + :toctree: api/ + + HalfYearBegin + +Properties +~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + HalfYearBegin.freqstr + HalfYearBegin.kwds + HalfYearBegin.name + HalfYearBegin.nanos + HalfYearBegin.normalize + HalfYearBegin.rule_code + HalfYearBegin.n + HalfYearBegin.startingMonth + +Methods +~~~~~~~ +.. autosummary:: + :toctree: api/ + + HalfYearBegin.copy + HalfYearBegin.is_on_offset + HalfYearBegin.is_month_start + HalfYearBegin.is_month_end + HalfYearBegin.is_quarter_start + HalfYearBegin.is_quarter_end + HalfYearBegin.is_year_start + HalfYearBegin.is_year_end + BYearEnd -------- .. 
autosummary:: @@ -825,7 +943,6 @@ Methods :toctree: api/ BYearEnd.copy - BYearEnd.is_anchored BYearEnd.is_on_offset BYearEnd.is_month_start BYearEnd.is_month_end @@ -861,7 +978,6 @@ Methods :toctree: api/ BYearBegin.copy - BYearBegin.is_anchored BYearBegin.is_on_offset BYearBegin.is_month_start BYearBegin.is_month_end @@ -897,7 +1013,6 @@ Methods :toctree: api/ YearEnd.copy - YearEnd.is_anchored YearEnd.is_on_offset YearEnd.is_month_start YearEnd.is_month_end @@ -933,7 +1048,6 @@ Methods :toctree: api/ YearBegin.copy - YearBegin.is_anchored YearBegin.is_on_offset YearBegin.is_month_start YearBegin.is_month_end @@ -973,7 +1087,6 @@ Methods FY5253.copy FY5253.get_rule_code_suffix FY5253.get_year_end - FY5253.is_anchored FY5253.is_on_offset FY5253.is_month_start FY5253.is_month_end @@ -1014,7 +1127,6 @@ Methods FY5253Quarter.copy FY5253Quarter.get_rule_code_suffix FY5253Quarter.get_weeks - FY5253Quarter.is_anchored FY5253Quarter.is_on_offset FY5253Quarter.year_has_extra_week FY5253Quarter.is_month_start @@ -1050,7 +1162,6 @@ Methods :toctree: api/ Easter.copy - Easter.is_anchored Easter.is_on_offset Easter.is_month_start Easter.is_month_end @@ -1071,7 +1182,6 @@ Properties .. autosummary:: :toctree: api/ - Tick.delta Tick.freqstr Tick.kwds Tick.name @@ -1086,7 +1196,6 @@ Methods :toctree: api/ Tick.copy - Tick.is_anchored Tick.is_on_offset Tick.is_month_start Tick.is_month_end @@ -1107,7 +1216,6 @@ Properties .. autosummary:: :toctree: api/ - Day.delta Day.freqstr Day.kwds Day.name @@ -1122,7 +1230,6 @@ Methods :toctree: api/ Day.copy - Day.is_anchored Day.is_on_offset Day.is_month_start Day.is_month_end @@ -1143,7 +1250,6 @@ Properties .. autosummary:: :toctree: api/ - Hour.delta Hour.freqstr Hour.kwds Hour.name @@ -1158,7 +1264,6 @@ Methods :toctree: api/ Hour.copy - Hour.is_anchored Hour.is_on_offset Hour.is_month_start Hour.is_month_end @@ -1179,7 +1284,6 @@ Properties .. autosummary:: :toctree: api/ - Minute.delta Minute.freqstr Minute.kwds Minute.name @@ -1194,7 +1298,6 @@ Methods :toctree: api/ Minute.copy - Minute.is_anchored Minute.is_on_offset Minute.is_month_start Minute.is_month_end @@ -1215,7 +1318,6 @@ Properties .. autosummary:: :toctree: api/ - Second.delta Second.freqstr Second.kwds Second.name @@ -1230,7 +1332,6 @@ Methods :toctree: api/ Second.copy - Second.is_anchored Second.is_on_offset Second.is_month_start Second.is_month_end @@ -1251,7 +1352,6 @@ Properties .. autosummary:: :toctree: api/ - Milli.delta Milli.freqstr Milli.kwds Milli.name @@ -1266,7 +1366,6 @@ Methods :toctree: api/ Milli.copy - Milli.is_anchored Milli.is_on_offset Milli.is_month_start Milli.is_month_end @@ -1287,7 +1386,6 @@ Properties .. autosummary:: :toctree: api/ - Micro.delta Micro.freqstr Micro.kwds Micro.name @@ -1302,7 +1400,6 @@ Methods :toctree: api/ Micro.copy - Micro.is_anchored Micro.is_on_offset Micro.is_month_start Micro.is_month_end @@ -1323,7 +1420,6 @@ Properties .. 
autosummary:: :toctree: api/ - Nano.delta Nano.freqstr Nano.kwds Nano.name @@ -1338,7 +1434,6 @@ Methods :toctree: api/ Nano.copy - Nano.is_anchored Nano.is_on_offset Nano.is_month_start Nano.is_month_end diff --git a/doc/source/reference/resampling.rst b/doc/source/reference/resampling.rst index edbc8090fc849..2e0717081b129 100644 --- a/doc/source/reference/resampling.rst +++ b/doc/source/reference/resampling.rst @@ -38,7 +38,6 @@ Upsampling Resampler.ffill Resampler.bfill Resampler.nearest - Resampler.fillna Resampler.asfreq Resampler.interpolate diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index af262f9e6c336..6006acc8f5e16 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -25,6 +25,7 @@ Attributes Series.array Series.values Series.dtype + Series.info Series.shape Series.nbytes Series.ndim @@ -47,7 +48,6 @@ Conversion Series.convert_dtypes Series.infer_objects Series.copy - Series.bool Series.to_numpy Series.to_period Series.to_timestamp @@ -177,17 +177,16 @@ Reindexing / selection / label manipulation :toctree: api/ Series.align + Series.case_when Series.drop Series.droplevel Series.drop_duplicates Series.duplicated Series.equals - Series.first Series.head Series.idxmax Series.idxmin Series.isin - Series.last Series.reindex Series.reindex_like Series.rename @@ -209,7 +208,6 @@ Missing data handling .. autosummary:: :toctree: api/ - Series.backfill Series.bfill Series.dropna Series.ffill @@ -219,7 +217,6 @@ Missing data handling Series.isnull Series.notna Series.notnull - Series.pad Series.replace Reshaping, sorting @@ -237,10 +234,8 @@ Reshaping, sorting Series.unstack Series.explode Series.searchsorted - Series.ravel Series.repeat Series.squeeze - Series.view Combining / comparing / joining / merging ----------------------------------------- @@ -341,7 +336,6 @@ Datetime properties Series.dt.tz Series.dt.freq Series.dt.unit - Series.dt.normalize Datetime methods ^^^^^^^^^^^^^^^^ diff --git a/doc/source/reference/style.rst b/doc/source/reference/style.rst index 2256876c93e01..742263c788c2f 100644 --- a/doc/source/reference/style.rst +++ b/doc/source/reference/style.rst @@ -27,6 +27,7 @@ Styler properties Styler.template_html_style Styler.template_html_table Styler.template_latex + Styler.template_typst Styler.template_string Styler.loader @@ -41,6 +42,7 @@ Style application Styler.map_index Styler.format Styler.format_index + Styler.format_index_names Styler.relabel_index Styler.hide Styler.concat @@ -76,6 +78,7 @@ Style export and import Styler.to_html Styler.to_latex + Styler.to_typst Styler.to_excel Styler.to_string Styler.export diff --git a/doc/source/reference/testing.rst b/doc/source/reference/testing.rst index a5d61703aceed..1f164d1aa98b4 100644 --- a/doc/source/reference/testing.rst +++ b/doc/source/reference/testing.rst @@ -58,8 +58,6 @@ Exceptions and warnings errors.PossiblePrecisionLoss errors.PyperclipException errors.PyperclipWindowsException - errors.SettingWithCopyError - errors.SettingWithCopyWarning errors.SpecificationError errors.UndefinedVariableError errors.UnsortedIndexError diff --git a/doc/source/reference/window.rst b/doc/source/reference/window.rst index 14af2b8a120e0..2bd63f02faf69 100644 --- a/doc/source/reference/window.rst +++ b/doc/source/reference/window.rst @@ -30,15 +30,19 @@ Rolling window functions Rolling.std Rolling.min Rolling.max + Rolling.first + Rolling.last Rolling.corr Rolling.cov Rolling.skew Rolling.kurt Rolling.apply + Rolling.pipe Rolling.aggregate Rolling.quantile 
Rolling.sem Rolling.rank + Rolling.nunique .. _api.functions_window: @@ -71,15 +75,19 @@ Expanding window functions Expanding.std Expanding.min Expanding.max + Expanding.first + Expanding.last Expanding.corr Expanding.cov Expanding.skew Expanding.kurt Expanding.apply + Expanding.pipe Expanding.aggregate Expanding.quantile Expanding.sem Expanding.rank + Expanding.nunique .. _api.functions_ewm: diff --git a/doc/source/user_guide/10min.rst b/doc/source/user_guide/10min.rst index c8e67710c85a9..72bb93d21a99f 100644 --- a/doc/source/user_guide/10min.rst +++ b/doc/source/user_guide/10min.rst @@ -19,7 +19,7 @@ Customarily, we import as follows: Basic data structures in pandas ------------------------------- -Pandas provides two types of classes for handling data: +pandas provides two types of classes for handling data: 1. :class:`Series`: a one-dimensional labeled array holding data of any type such as integers, strings, Python objects etc. @@ -91,8 +91,8 @@ will be completed: df2.any df2.combine df2.append df2.D df2.apply df2.describe - df2.applymap df2.diff df2.B df2.duplicated + df2.diff As you can see, the columns ``A``, ``B``, ``C``, and ``D`` are automatically tab completed. ``E`` and ``F`` are there as well; the rest of the attributes have been @@ -101,7 +101,7 @@ truncated for brevity. Viewing data ------------ -See the :ref:`Essentially basics functionality section `. +See the :ref:`Essential basic functionality section `. Use :meth:`DataFrame.head` and :meth:`DataFrame.tail` to view the top and bottom rows of the frame respectively: @@ -177,7 +177,7 @@ See the indexing documentation :ref:`Indexing and Selecting Data ` and Getitem (``[]``) ~~~~~~~~~~~~~~~~ -For a :class:`DataFrame`, passing a single label selects a columns and +For a :class:`DataFrame`, passing a single label selects a column and yields a :class:`Series` equivalent to ``df.A``: .. ipython:: python @@ -563,7 +563,7 @@ columns: .. ipython:: python - stacked = df2.stack(future_stack=True) + stacked = df2.stack() stacked With a "stacked" DataFrame or Series (having a :class:`MultiIndex` as the diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index 453536098cfbb..f7ab466e92d93 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -11,13 +11,6 @@ and :ref:`other advanced indexing features `. See the :ref:`Indexing and Selecting Data ` for general indexing documentation. -.. warning:: - - Whether a copy or a reference is returned for a setting operation may - depend on the context. This is sometimes called ``chained assignment`` and - should be avoided. See :ref:`Returning a View versus Copy - `. - See the :ref:`cookbook` for some advanced strategies. .. _advanced.hierarchical: @@ -402,6 +395,7 @@ slicers on a single axis. Furthermore, you can *set* the values using the following methods. .. ipython:: python + :okwarning: df2 = dfmi.copy() df2.loc(axis=0)[:, :, ["C1", "C3"]] = -10 diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index f7d89110e6c8f..8155aa0ae03fa 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -36,7 +36,7 @@ of elements to display is five, but you may pass a custom number. Attributes and underlying data ------------------------------ -pandas objects have a number of attributes enabling you to access the metadata +pandas objects have a number of attributes enabling you to access the metadata. 
* **shape**: gives the axis dimensions of the object, consistent with ndarray * Axis labels @@ -59,7 +59,7 @@ NumPy's type system to add support for custom arrays (see :ref:`basics.dtypes`). To get the actual data inside a :class:`Index` or :class:`Series`, use -the ``.array`` property +the ``.array`` property. .. ipython:: python @@ -88,18 +88,18 @@ NumPy doesn't have a dtype to represent timezone-aware datetimes, so there are two possibly useful representations: 1. An object-dtype :class:`numpy.ndarray` with :class:`Timestamp` objects, each - with the correct ``tz`` + with the correct ``tz``. 2. A ``datetime64[ns]`` -dtype :class:`numpy.ndarray`, where the values have - been converted to UTC and the timezone discarded + been converted to UTC and the timezone discarded. -Timezones may be preserved with ``dtype=object`` +Timezones may be preserved with ``dtype=object``: .. ipython:: python ser = pd.Series(pd.date_range("2000", periods=2, tz="CET")) ser.to_numpy(dtype=object) -Or thrown away with ``dtype='datetime64[ns]'`` +Or thrown away with ``dtype='datetime64[ns]'``: .. ipython:: python @@ -155,17 +155,6 @@ speedups. ``numexpr`` uses smart chunking, caching, and multiple cores. ``bottleneck`` is a set of specialized cython routines that are especially fast when dealing with arrays that have ``nans``. -Here is a sample (using 100 column x 100,000 row ``DataFrames``): - -.. csv-table:: - :header: "Operation", "0.11.0 (ms)", "Prior Version (ms)", "Ratio to Prior" - :widths: 25, 25, 25, 25 - :delim: ; - - ``df1 > df2``; 13.32; 125.35; 0.1063 - ``df1 * df2``; 21.71; 36.63; 0.5928 - ``df1 + df2``; 22.04; 36.50; 0.6039 - You are highly encouraged to install both libraries. See the section :ref:`Recommended Dependencies ` for more installation info. @@ -299,8 +288,7 @@ Boolean reductions ~~~~~~~~~~~~~~~~~~ You can apply the reductions: :attr:`~DataFrame.empty`, :meth:`~DataFrame.any`, -:meth:`~DataFrame.all`, and :meth:`~DataFrame.bool` to provide a -way to summarize a boolean result. +:meth:`~DataFrame.all`. .. ipython:: python @@ -477,15 +465,15 @@ For example: .. ipython:: python df - df.mean(0) - df.mean(1) + df.mean(axis=0) + df.mean(axis=1) All such methods have a ``skipna`` option signaling whether to exclude missing data (``True`` by default): .. ipython:: python - df.sum(0, skipna=False) + df.sum(axis=0, skipna=False) df.sum(axis=1, skipna=True) Combined with the broadcasting / arithmetic behavior, one can describe various @@ -496,8 +484,8 @@ standard deviation of 1), very concisely: ts_stand = (df - df.mean()) / df.std() ts_stand.std() - xs_stand = df.sub(df.mean(1), axis=0).div(df.std(1), axis=0) - xs_stand.std(1) + xs_stand = df.sub(df.mean(axis=1), axis=0).div(df.std(axis=1), axis=0) + xs_stand.std(axis=1) Note that methods like :meth:`~DataFrame.cumsum` and :meth:`~DataFrame.cumprod` preserve the location of ``NaN`` values. This is somewhat different from @@ -1309,8 +1297,8 @@ filling method chosen from the following table: :header: "Method", "Action" :widths: 30, 50 - pad / ffill, Fill values forward - bfill / backfill, Fill values backward + ffill, Fill values forward + bfill, Fill values backward nearest, Fill from the nearest index value We illustrate these fill methods on a simple Series: @@ -1608,7 +1596,7 @@ For instance: This method does not convert the row to a Series object; it merely returns the values inside a namedtuple. Therefore, :meth:`~DataFrame.itertuples` preserves the data type of the values -and is generally faster as :meth:`~DataFrame.iterrows`.
+and is generally faster than :meth:`~DataFrame.iterrows`. .. note:: @@ -2076,12 +2064,12 @@ different numeric dtypes will **NOT** be combined. The following example will gi .. ipython:: python - df1 = pd.DataFrame(np.random.randn(8, 1), columns=["A"], dtype="float32") + df1 = pd.DataFrame(np.random.randn(8, 1), columns=["A"], dtype="float64") df1 df1.dtypes df2 = pd.DataFrame( { - "A": pd.Series(np.random.randn(8), dtype="float16"), + "A": pd.Series(np.random.randn(8), dtype="float32"), "B": pd.Series(np.random.randn(8)), "C": pd.Series(np.random.randint(0, 255, size=8), dtype="uint8"), # [0,255] (range of uint8) } diff --git a/doc/source/user_guide/boolean.rst b/doc/source/user_guide/boolean.rst index 3c361d4de17e5..7de0430123fd2 100644 --- a/doc/source/user_guide/boolean.rst +++ b/doc/source/user_guide/boolean.rst @@ -37,6 +37,19 @@ If you would prefer to keep the ``NA`` values you can manually fill them with `` s[mask.fillna(True)] +If you create a column of ``NA`` values (for example to fill them later) +with ``df['new_col'] = pd.NA``, the ``dtype`` would be set to ``object`` in the +new column. The performance on this column will be worse than with +the appropriate type. It's better to use +``df['new_col'] = pd.Series(pd.NA, dtype="boolean")`` +(or another ``dtype`` that supports ``NA``). + +.. ipython:: python + + df = pd.DataFrame() + df['objects'] = pd.NA + df.dtypes + .. _boolean.kleene: Kleene logical operations diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index 8fb991dca02db..1e7d66dfeb142 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -245,7 +245,8 @@ Equality semantics Two instances of :class:`~pandas.api.types.CategoricalDtype` compare equal whenever they have the same categories and order. When comparing two -unordered categoricals, the order of the ``categories`` is not considered. +unordered categoricals, the order of the ``categories`` is not considered. Note +that categories with different dtypes are not the same. .. ipython:: python @@ -263,6 +264,16 @@ All instances of ``CategoricalDtype`` compare equal to the string ``'category'`` c1 == "category" +Notice that the ``categories_dtype`` should be considered, especially when comparing +two empty ``CategoricalDtype`` instances. + +.. ipython:: python + + c2 = pd.Categorical(np.array([], dtype=object)) + c3 = pd.Categorical(np.array([], dtype=float)) + + c2.dtype == c3.dtype + Description ----------- @@ -782,7 +793,7 @@ Assigning a ``Categorical`` to parts of a column of other types will use the val :okwarning: df = pd.DataFrame({"a": [1, 1, 1, 1, 1], "b": ["a", "a", "a", "a", "a"]}) - df.loc[1:2, "a"] = pd.Categorical(["b", "b"], categories=["a", "b"]) + df.loc[1:2, "a"] = pd.Categorical([2, 2], categories=[2, 3]) df.loc[2:3, "b"] = pd.Categorical(["b", "b"], categories=["a", "b"]) df df.dtypes diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index b1a6aa8753be1..91a0b4a4fe967 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -35,7 +35,7 @@ These are some neat pandas ``idioms`` ) df -if-then... +If-then... ********** An if-then on one column @@ -176,7 +176,7 @@ One could hard code: Selection --------- -Dataframes +DataFrames ********** The :ref:`indexing ` docs. @@ -311,7 +311,7 @@ The :ref:`multindexing ` docs.
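The nullable-dtype advice added above, restated as a short runnable sketch (the column names here are made up for illustration):

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3]})
    # A bare pd.NA assignment falls back to object dtype.
    df["flag_object"] = pd.NA
    # An explicitly typed Series keeps the NA-aware nullable boolean dtype.
    df["flag_boolean"] = pd.Series(pd.NA, dtype="boolean")
    df.dtypes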
df.columns = pd.MultiIndex.from_tuples([tuple(c.split("_")) for c in df.columns]) df # Now stack & Reset - df = df.stack(0, future_stack=True).reset_index(1) + df = df.stack(0).reset_index(1) df # And fix the labels (Notice the label 'level_1' got added automatically) df.columns = ["Sample", "All_X", "All_Y"] @@ -459,7 +459,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to df # List the size of the animals with the highest weight. - df.groupby("animal").apply(lambda subf: subf["size"][subf["weight"].idxmax()], include_groups=False) + df.groupby("animal").apply(lambda subf: subf["size"][subf["weight"].idxmax()]) `Using get_group `__ @@ -482,7 +482,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to return pd.Series(["L", avg_weight, True], index=["size", "weight", "adult"]) - expected_df = gb.apply(GrowUp, include_groups=False) + expected_df = gb.apply(GrowUp) expected_df `Expanding apply @@ -688,7 +688,7 @@ The :ref:`Pivot ` docs. aggfunc="sum", margins=True, ) - table.stack("City", future_stack=True) + table.stack("City") `Frequency table like plyr in R `__ @@ -874,7 +874,7 @@ Timeseries `__ `Aggregation and plotting time series -`__ +`__ Turn a matrix with hours in columns and days in rows into a continuous row sequence in the form of a time series. `How to rearrange a Python pandas DataFrame? @@ -914,7 +914,7 @@ Using TimeGrouper and another grouping to create subgroups, then apply a custom `__ `Resample intraday frame without adding new days -`__ +`__ `Resample minute data `__ @@ -1043,7 +1043,7 @@ CSV The :ref:`CSV ` docs -`read_csv in action `__ +`read_csv in action `__ `appending to a csv `__ @@ -1489,7 +1489,7 @@ of the data values: ) df -Constant series +Constant Series --------------- To assess if a series has a constant value, we can check if ``series.nunique() <= 1``. diff --git a/doc/source/user_guide/copy_on_write.rst b/doc/source/user_guide/copy_on_write.rst index 050c3901c3420..90353d9f49f00 100644 --- a/doc/source/user_guide/copy_on_write.rst +++ b/doc/source/user_guide/copy_on_write.rst @@ -8,16 +8,12 @@ Copy-on-Write (CoW) .. note:: - Copy-on-Write will become the default in pandas 3.0. We recommend - :ref:`turning it on now ` - to benefit from all improvements. + Copy-on-Write is now the default with pandas 3.0. Copy-on-Write was first introduced in version 1.5.0. Starting from version 2.0 most of the optimizations that become possible through CoW are implemented and supported. All possible optimizations are supported starting from pandas 2.1. -CoW will be enabled by default in version 3.0. - CoW will lead to more predictable behavior since it is not possible to update more than one object with one statement, e.g. indexing operations or methods won't have side-effects. Additionally, through delaying copies as long as possible, the average performance and memory usage will improve. @@ -29,21 +25,25 @@ pandas indexing behavior is tricky to understand. Some operations return views w other return copies. Depending on the result of the operation, mutating one object might accidentally mutate another: -.. ipython:: python +.. code-block:: ipython - df = pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - subset = df["foo"] - subset.iloc[0] = 100 - df + In [1]: df = pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + In [2]: subset = df["foo"] + In [3]: subset.iloc[0] = 100 + In [4]: df + Out[4]: + foo bar + 0 100 4 + 1 2 5 + 2 3 6 -Mutating ``subset``, e.g. updating its values, also updates ``df``. 
The exact behavior is + +Mutating ``subset``, e.g. updating its values, also updated ``df``. The exact behavior was hard to predict. Copy-on-Write solves accidentally modifying more than one object; -it explicitly disallows this. With CoW enabled, ``df`` is unchanged: +it explicitly disallows this. ``df`` is unchanged: .. ipython:: python - pd.options.mode.copy_on_write = True - df = pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) subset = df["foo"] subset.iloc[0] = 100 @@ -57,13 +57,13 @@ applications. Migrating to Copy-on-Write -------------------------- -Copy-on-Write will be the default and only mode in pandas 3.0. This means that users +Copy-on-Write is the default and only mode in pandas 3.0. This means that users need to migrate their code to be compliant with CoW rules. -The default mode in pandas will raise warnings for certain cases that will actively +The default mode in pandas < 3.0 raises warnings for certain cases that will actively change behavior and thus change user intended behavior. -We added another mode, e.g. +pandas 2.2 has a warning mode .. code-block:: python @@ -84,7 +84,6 @@ The following few items describe the user visible changes: **Accessing the underlying array of a pandas object will return a read-only view** - .. ipython:: python ser = pd.Series([1, 2, 3]) @@ -101,16 +100,21 @@ for more details. **Only one pandas object is updated at once** -The following code snippet updates both ``df`` and ``subset`` without CoW: +The following code snippet updated both ``df`` and ``subset`` without CoW: -.. ipython:: python + In [1]: df = pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - df = pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - subset = df["foo"] - subset.iloc[0] = 100 - df + In [2]: subset = df["foo"] + In [3]: subset.iloc[0] = 100 + In [4]: df + Out[4]: + foo bar + 0 100 4 + 1 2 5 + 2 3 6 -This won't be possible anymore with CoW, since the CoW rules explicitly forbid this. +This is not possible anymore with CoW, since the CoW rules explicitly forbid this. This includes updating a single column as a :class:`Series` and relying on the change propagating back to the parent :class:`DataFrame`. This statement can be rewritten into a single statement with ``loc`` or ``iloc`` if @@ -146,7 +150,7 @@ A different alternative would be to not use ``inplace``: **Constructors now copy NumPy arrays by default** -The Series and DataFrame constructors will now copy NumPy array by default when not +The Series and DataFrame constructors now copy a NumPy array by default when not otherwise specified. This was changed to avoid mutating a pandas object when the NumPy array is changed inplace outside of pandas. You can set ``copy=False`` to avoid this copy. @@ -162,7 +166,7 @@ that shares data with another DataFrame or Series object inplace. This avoids side-effects when modifying values and hence, most methods can avoid actually copying the data and only trigger a copy when necessary. -The following example will operate inplace with CoW: +The following example will operate inplace: .. ipython:: python @@ -207,15 +211,17 @@ listed in :ref:`Copy-on-Write optimizations `. Previously, when operating on views, the view and the parent object were modified: -.. ipython:: python - - with pd.option_context("mode.copy_on_write", False): - df = pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - view = df[:] - df.iloc[0, 0] = 100 +..
code-block:: ipython - df - view + In [1]: df = pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + In [2]: view = df[:] + In [3]: df.iloc[0, 0] = 100 + In [4]: df + Out[4]: + foo bar + 0 100 4 + 1 2 5 + 2 3 6 + In [5]: view + Out[5]: + foo bar + 0 100 4 + 1 2 5 + 2 3 6 CoW triggers a copy when ``df`` is changed to avoid mutating ``view`` as well: @@ -236,16 +242,19 @@ Chained Assignment Chained assignment references a technique where an object is updated through two subsequent indexing operations, e.g. -.. ipython:: python - :okwarning: - with pd.option_context("mode.copy_on_write", False): - df = pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - df["foo"][df["bar"] > 5] = 100 - df +.. code-block:: ipython + In [1]: df = pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + In [2]: df["foo"][df["bar"] > 5] = 100 + In [3]: df + Out[3]: + foo bar + 0 1 4 + 1 2 5 + 2 100 6 -The column ``foo`` is updated where the column ``bar`` is greater than 5. -This violates the CoW principles though, because it would have to modify the +The column ``foo`` was updated where the column ``bar`` is greater than 5. +This violated the CoW principles though, because it would have to modify the view ``df["foo"]`` and ``df`` in one step. Hence, chained assignment will consistently never work and raise a ``ChainedAssignmentError`` warning with CoW enabled: @@ -272,7 +281,6 @@ shares data with the initial DataFrame: The array is a copy if the initial DataFrame consists of more than one array: - .. ipython:: python df = pd.DataFrame({"a": [1, 2], "b": [1.5, 2.5]}) @@ -295,7 +303,7 @@ This array is read-only, which means that it can't be modified inplace: The same holds true for a Series, since a Series always consists of a single array. -There are two potential solution to this: +There are two potential solutions to this: - Trigger a copy manually if you want to avoid updating DataFrames that share memory with your array. - Make the array writeable. This is a more performant solution but circumvents Copy-on-Write rules, so @@ -317,7 +325,7 @@ you are modifying one object inplace. .. ipython:: python df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - df2 = df.reset_index() + df2 = df.reset_index(drop=True) df2.iloc[0, 0] = 100 This creates two objects that share data and thus the setitem operation will trigger a @@ -328,7 +336,7 @@ held by the object. .. ipython:: python df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - df = df.reset_index() + df = df.reset_index(drop=True) df.iloc[0, 0] = 100 No copy is necessary in this example. @@ -347,22 +355,3 @@ and :meth:`DataFrame.rename`. These methods return views when Copy-on-Write is enabled, which provides a significant performance improvement compared to the regular execution. - -.. _copy_on_write_enabling: - -How to enable CoW ------------------ - -Copy-on-Write can be enabled through the configuration option ``copy_on_write``. The option can -be turned on __globally__ through either of the following: - -.. ipython:: python - - pd.set_option("mode.copy_on_write", True) - - pd.options.mode.copy_on_write = True - -.. ipython:: python - :suppress: - - pd.options.mode.copy_on_write = False diff --git a/doc/source/user_guide/dsintro.rst b/doc/source/user_guide/dsintro.rst index d1e981ee1bbdc..89981786d60b5 100644 --- a/doc/source/user_guide/dsintro.rst +++ b/doc/source/user_guide/dsintro.rst @@ -41,8 +41,8 @@ Here, ``data`` can be many different things: * an ndarray * a scalar value (like 5) -The passed **index** is a list of axis labels.
Thus, this separates into a few -cases depending on what **data is**: +The passed **index** is a list of axis labels. The constructor's behavior +depends on **data**'s type: **From ndarray** @@ -87,8 +87,9 @@ index will be pulled out. **From scalar value** -If ``data`` is a scalar value, an index must be -provided. The value will be repeated to match the length of **index**. +If ``data`` is a scalar value, the value will be repeated to match +the length of **index**. If the **index** is not provided, it defaults +to ``RangeIndex(1)``. .. ipython:: python @@ -97,7 +98,7 @@ provided. The value will be repeated to match the length of **index**. Series is ndarray-like ~~~~~~~~~~~~~~~~~~~~~~ -:class:`Series` acts very similarly to a ``ndarray`` and is a valid argument to most NumPy functions. +:class:`Series` acts very similarly to a :class:`numpy.ndarray` and is a valid argument to most NumPy functions. However, operations such as slicing will also slice the index. .. ipython:: python @@ -111,7 +112,7 @@ However, operations such as slicing will also slice the index. .. note:: We will address array-based indexing like ``s.iloc[[4, 3, 1]]`` - in :ref:`section on indexing `. + in the :ref:`section on indexing `. Like a NumPy array, a pandas :class:`Series` has a single :attr:`~Series.dtype`. @@ -325,7 +326,7 @@ This case is handled identically to a dict of arrays. .. ipython:: python - data = np.zeros((2,), dtype=[("A", "i4"), ("B", "f4"), ("C", "a10")]) + data = np.zeros((2,), dtype=[("A", "i4"), ("B", "f4"), ("C", "S10")]) data[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")] pd.DataFrame(data) diff --git a/doc/source/user_guide/enhancingperf.rst b/doc/source/user_guide/enhancingperf.rst index 8c510173819e0..647b0f760f4d4 100644 --- a/doc/source/user_guide/enhancingperf.rst +++ b/doc/source/user_guide/enhancingperf.rst @@ -171,6 +171,7 @@ can be improved by passing an ``np.ndarray``. In [4]: %%cython ...: cimport numpy as np ...: import numpy as np + ...: np.import_array() ...: cdef double f_typed(double x) except? -2: ...: return x * (x - 1) ...: cpdef double integrate_f_typed(double a, double b, int N): @@ -225,6 +226,7 @@ and ``wraparound`` checks can yield more performance. ...: cimport cython ...: cimport numpy as np ...: import numpy as np + ...: np.import_array() ...: cdef np.float64_t f_typed(np.float64_t x) except? -2: ...: return x * (x - 1) ...: cpdef np.float64_t integrate_f_typed(np.float64_t a, np.float64_t b, np.int64_t N): @@ -427,7 +429,7 @@ prefer that Numba throw an error if it cannot compile a function in a way that speeds up your code, pass Numba the argument ``nopython=True`` (e.g. ``@jit(nopython=True)``). For more on troubleshooting Numba modes, see the `Numba troubleshooting page -`__. +`__. Using ``parallel=True`` (e.g. ``@jit(parallel=True)``) may result in a ``SIGABRT`` if the threading layer leads to unsafe behavior. You can first `specify a safe threading layer `__ @@ -453,7 +455,7 @@ by evaluate arithmetic and boolean expression all at once for large :class:`~pan :func:`~pandas.eval` is many orders of magnitude slower for smaller expressions or objects than plain Python. A good rule of thumb is to only use :func:`~pandas.eval` when you have a - :class:`.DataFrame` with more than 10,000 rows. + :class:`~pandas.core.frame.DataFrame` with more than 10,000 rows. 
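To ground that rule of thumb, a hedged sketch of :func:`~pandas.eval` on frames comfortably above the 10,000-row mark (sizes are illustrative; the actual crossover depends on hardware and on whether ``numexpr`` is installed):

.. code-block:: python

    import numpy as np
    import pandas as pd

    df1 = pd.DataFrame(np.random.randn(100_000, 3), columns=list("abc"))
    df2 = pd.DataFrame(np.random.randn(100_000, 3), columns=list("abc"))

    # Evaluated as a single expression (via numexpr when available),
    # avoiding the temporaries that plain ``df1 + df2 * 2`` would allocate.
    result = pd.eval("df1 + df2 * 2")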
Supported syntax ~~~~~~~~~~~~~~~~ diff --git a/doc/source/user_guide/gotchas.rst b/doc/source/user_guide/gotchas.rst index 99c85ac66623d..e85eead4e0f09 100644 --- a/doc/source/user_guide/gotchas.rst +++ b/doc/source/user_guide/gotchas.rst @@ -121,7 +121,7 @@ Below is how to check if any of the values are ``True``: if pd.Series([False, True, False]).any(): print("I am any") -Bitwise boolean +Bitwise Boolean ~~~~~~~~~~~~~~~ Bitwise boolean operators like ``==`` and ``!=`` return a boolean :class:`Series` @@ -315,19 +315,8 @@ Why not make NumPy like R? Many people have suggested that NumPy should simply emulate the ``NA`` support present in the more domain-specific statistical programming language `R -`__. Part of the reason is the NumPy type hierarchy: - -.. csv-table:: - :header: "Typeclass","Dtypes" - :widths: 30,70 - :delim: | - - ``numpy.floating`` | ``float16, float32, float64, float128`` - ``numpy.integer`` | ``int8, int16, int32, int64`` - ``numpy.unsignedinteger`` | ``uint8, uint16, uint32, uint64`` - ``numpy.object_`` | ``object_`` - ``numpy.bool_`` | ``bool_`` - ``numpy.character`` | ``bytes_, str_`` +`__. Part of the reason is the +`NumPy type hierarchy `__. The R language, by contrast, only has a handful of built-in data types: ``integer``, ``numeric`` (floating-point), ``character``, and @@ -383,5 +372,5 @@ constructors using something similar to the following: s = pd.Series(newx) See `the NumPy documentation on byte order -`__ for more +`__ for more details. diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 11863f8aead31..4ec34db6ed959 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -137,15 +137,6 @@ We could naturally group by either the ``A`` or ``B`` columns, or both: ``df.groupby('A')`` is just syntactic sugar for ``df.groupby(df['A'])``. -If we also have a MultiIndex on columns ``A`` and ``B``, we can group by all -the columns except the one we specify: - -.. ipython:: python - - df2 = df.set_index(["A", "B"]) - grouped = df2.groupby(level=df2.index.names.difference(["B"])) - grouped.sum() - The above GroupBy will split the DataFrame on its index (rows). To split by columns, first do a transpose: @@ -247,7 +238,7 @@ GroupBy object attributes ~~~~~~~~~~~~~~~~~~~~~~~~~ The ``groups`` attribute is a dictionary whose keys are the computed unique groups -and corresponding values are the axis labels belonging to each group. In the +and corresponding values are the index labels belonging to each group. In the above example we have: .. ipython:: python @@ -289,7 +280,7 @@ the number of groups, which is the same as the length of the ``groups`` dictiona In [1]: gb. # noqa: E225, E999 gb.agg gb.boxplot gb.cummin gb.describe gb.filter gb.get_group gb.height gb.last gb.median gb.ngroups gb.plot gb.rank gb.std gb.transform gb.aggregate gb.count gb.cumprod gb.dtype gb.first gb.groups gb.hist gb.max gb.min gb.nth gb.prod gb.resample gb.sum gb.var - gb.apply gb.cummax gb.cumsum gb.fillna gb.gender gb.head gb.indices gb.mean gb.name gb.ohlc gb.quantile gb.size gb.tail gb.weight + gb.apply gb.cummax gb.cumsum gb.gender gb.head gb.indices gb.mean gb.name gb.ohlc gb.quantile gb.size gb.tail gb.weight .. _groupby.multiindex: @@ -425,6 +416,12 @@ You can also include the grouping columns if you want to operate on them. grouped[["A", "B"]].sum() +.. note:: + + The ``groupby`` operation in pandas drops the ``name`` field of the columns Index object + after the operation. 
This change ensures consistency in syntax between different + column selection methods within groupby operations. + .. _groupby.iterating-label: Iterating through groups @@ -509,29 +506,28 @@ listed below, those with a ``*`` do *not* have an efficient, GroupBy-specific, i .. csv-table:: :header: "Method", "Description" :widths: 20, 80 - :delim: ; - - :meth:`~.DataFrameGroupBy.any`;Compute whether any of the values in the groups are truthy - :meth:`~.DataFrameGroupBy.all`;Compute whether all of the values in the groups are truthy - :meth:`~.DataFrameGroupBy.count`;Compute the number of non-NA values in the groups - :meth:`~.DataFrameGroupBy.cov` * ;Compute the covariance of the groups - :meth:`~.DataFrameGroupBy.first`;Compute the first occurring value in each group - :meth:`~.DataFrameGroupBy.idxmax`;Compute the index of the maximum value in each group - :meth:`~.DataFrameGroupBy.idxmin`;Compute the index of the minimum value in each group - :meth:`~.DataFrameGroupBy.last`;Compute the last occurring value in each group - :meth:`~.DataFrameGroupBy.max`;Compute the maximum value in each group - :meth:`~.DataFrameGroupBy.mean`;Compute the mean of each group - :meth:`~.DataFrameGroupBy.median`;Compute the median of each group - :meth:`~.DataFrameGroupBy.min`;Compute the minimum value in each group - :meth:`~.DataFrameGroupBy.nunique`;Compute the number of unique values in each group - :meth:`~.DataFrameGroupBy.prod`;Compute the product of the values in each group - :meth:`~.DataFrameGroupBy.quantile`;Compute a given quantile of the values in each group - :meth:`~.DataFrameGroupBy.sem`;Compute the standard error of the mean of the values in each group - :meth:`~.DataFrameGroupBy.size`;Compute the number of values in each group - :meth:`~.DataFrameGroupBy.skew` *;Compute the skew of the values in each group - :meth:`~.DataFrameGroupBy.std`;Compute the standard deviation of the values in each group - :meth:`~.DataFrameGroupBy.sum`;Compute the sum of the values in each group - :meth:`~.DataFrameGroupBy.var`;Compute the variance of the values in each group + + :meth:`~.DataFrameGroupBy.any`,Compute whether any of the values in the groups are truthy + :meth:`~.DataFrameGroupBy.all`,Compute whether all of the values in the groups are truthy + :meth:`~.DataFrameGroupBy.count`,Compute the number of non-NA values in the groups + :meth:`~.DataFrameGroupBy.cov` * ,Compute the covariance of the groups + :meth:`~.DataFrameGroupBy.first`,Compute the first occurring value in each group + :meth:`~.DataFrameGroupBy.idxmax`,Compute the index of the maximum value in each group + :meth:`~.DataFrameGroupBy.idxmin`,Compute the index of the minimum value in each group + :meth:`~.DataFrameGroupBy.last`,Compute the last occurring value in each group + :meth:`~.DataFrameGroupBy.max`,Compute the maximum value in each group + :meth:`~.DataFrameGroupBy.mean`,Compute the mean of each group + :meth:`~.DataFrameGroupBy.median`,Compute the median of each group + :meth:`~.DataFrameGroupBy.min`,Compute the minimum value in each group + :meth:`~.DataFrameGroupBy.nunique`,Compute the number of unique values in each group + :meth:`~.DataFrameGroupBy.prod`,Compute the product of the values in each group + :meth:`~.DataFrameGroupBy.quantile`,Compute a given quantile of the values in each group + :meth:`~.DataFrameGroupBy.sem`,Compute the standard error of the mean of the values in each group + :meth:`~.DataFrameGroupBy.size`,Compute the number of values in each group + :meth:`~.DataFrameGroupBy.skew` * ,Compute the skew of the values 
in each group + :meth:`~.DataFrameGroupBy.std`,Compute the standard deviation of the values in each group + :meth:`~.DataFrameGroupBy.sum`,Compute the sum of the values in each group + :meth:`~.DataFrameGroupBy.var`,Compute the variance of the values in each group Some examples: @@ -622,7 +618,7 @@ this will make an extra copy. .. _groupby.aggregate.udf: -Aggregation with User-Defined Functions +Aggregation with user-defined functions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Users can also provide their own User-Defined Functions (UDFs) for custom aggregations. @@ -672,8 +668,9 @@ column, which produces an aggregated result with a hierarchical column index: grouped[["C", "D"]].agg(["sum", "mean", "std"]) -The resulting aggregations are named after the functions themselves. If you -need to rename, then you can add in a chained operation for a ``Series`` like this: +The resulting aggregations are named after the functions themselves. + +For a ``Series``, if you need to rename, you can add in a chained operation like this: .. ipython:: python @@ -683,8 +680,19 @@ need to rename, then you can add in a chained operation for a ``Series`` like th .rename(columns={"sum": "foo", "mean": "bar", "std": "baz"}) ) +Or, you can simply pass a list of tuples, each with the name of the new column and the aggregate function: + +.. ipython:: python + + ( + grouped["C"] + .agg([("foo", "sum"), ("bar", "mean"), ("baz", "std")]) + ) + For a grouped ``DataFrame``, you can rename in a similar manner: +By chaining the ``rename`` operation, + .. ipython:: python ( @@ -693,6 +701,16 @@ For a grouped ``DataFrame``, you can rename in a similar manner: ) ) +Or, passing a list of tuples, + +.. ipython:: python + + ( + grouped[["C", "D"]].agg( + [("foo", "sum"), ("bar", "mean"), ("baz", "std")] + ) + ) + .. note:: In general, the output column names should be unique, but pandas will allow @@ -835,19 +853,18 @@ The following methods on GroupBy act as transformations. ..
csv-table:: :header: "Method", "Description" :widths: 20, 80 - :delim: ; - - :meth:`~.DataFrameGroupBy.bfill`;Back fill NA values within each group - :meth:`~.DataFrameGroupBy.cumcount`;Compute the cumulative count within each group - :meth:`~.DataFrameGroupBy.cummax`;Compute the cumulative max within each group - :meth:`~.DataFrameGroupBy.cummin`;Compute the cumulative min within each group - :meth:`~.DataFrameGroupBy.cumprod`;Compute the cumulative product within each group - :meth:`~.DataFrameGroupBy.cumsum`;Compute the cumulative sum within each group - :meth:`~.DataFrameGroupBy.diff`;Compute the difference between adjacent values within each group - :meth:`~.DataFrameGroupBy.ffill`;Forward fill NA values within each group - :meth:`~.DataFrameGroupBy.pct_change`;Compute the percent change between adjacent values within each group - :meth:`~.DataFrameGroupBy.rank`;Compute the rank of each value within each group - :meth:`~.DataFrameGroupBy.shift`;Shift values up or down within each group + + :meth:`~.DataFrameGroupBy.bfill`,Back fill NA values within each group + :meth:`~.DataFrameGroupBy.cumcount`,Compute the cumulative count within each group + :meth:`~.DataFrameGroupBy.cummax`,Compute the cumulative max within each group + :meth:`~.DataFrameGroupBy.cummin`,Compute the cumulative min within each group + :meth:`~.DataFrameGroupBy.cumprod`,Compute the cumulative product within each group + :meth:`~.DataFrameGroupBy.cumsum`,Compute the cumulative sum within each group + :meth:`~.DataFrameGroupBy.diff`,Compute the difference between adjacent values within each group + :meth:`~.DataFrameGroupBy.ffill`,Forward fill NA values within each group + :meth:`~.DataFrameGroupBy.pct_change`,Compute the percent change between adjacent values within each group + :meth:`~.DataFrameGroupBy.rank`,Compute the rank of each value within each group + :meth:`~.DataFrameGroupBy.shift`,Shift values up or down within each group In addition, passing any built-in aggregation method as a string to :meth:`~.DataFrameGroupBy.transform` (see the next section) will broadcast the result @@ -1057,7 +1074,7 @@ missing values with the ``ffill()`` method. ).set_index("date") df_re - df_re.groupby("group").resample("1D", include_groups=False).ffill() + df_re.groupby("group").resample("1D").ffill() .. _groupby.filter: @@ -1095,11 +1112,10 @@ efficient, GroupBy-specific, implementation. .. csv-table:: :header: "Method", "Description" :widths: 20, 80 - :delim: ; - :meth:`~.DataFrameGroupBy.head`;Select the top row(s) of each group - :meth:`~.DataFrameGroupBy.nth`;Select the nth row(s) of each group - :meth:`~.DataFrameGroupBy.tail`;Select the bottom row(s) of each group + :meth:`~.DataFrameGroupBy.head`,Select the top row(s) of each group + :meth:`~.DataFrameGroupBy.nth`,Select the nth row(s) of each group + :meth:`~.DataFrameGroupBy.tail`,Select the bottom row(s) of each group Users can also use transformations along with Boolean indexing to construct complex filtrations within groups. For example, suppose we are given groups of products and @@ -1236,16 +1252,16 @@ the argument ``group_keys`` which defaults to ``True``. Compare .. ipython:: python - df.groupby("A", group_keys=True).apply(lambda x: x, include_groups=False) + df.groupby("A", group_keys=True).apply(lambda x: x) with .. ipython:: python - df.groupby("A", group_keys=False).apply(lambda x: x, include_groups=False) + df.groupby("A", group_keys=False).apply(lambda x: x) -Numba Accelerated Routines +Numba accelerated routines -------------------------- .. 
versionadded:: 1.1 @@ -1680,7 +1696,7 @@ introduction ` and the dfg.groupby(["A", [0, 0, 0, 1, 1]]).ngroup() -Groupby by indexer to 'resample' data +GroupBy by indexer to 'resample' data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Resampling produces new hypothetical samples (resamples) from already existing observed data or from a model that generates data. These new samples are similar to the pre-existing samples. @@ -1726,8 +1742,8 @@ column index name will be used as the name of the inserted column: result = {"b_sum": x["b"].sum(), "c_mean": x["c"].mean()} return pd.Series(result, name="metrics") - result = df.groupby("a").apply(compute_metrics, include_groups=False) + result = df.groupby("a").apply(compute_metrics) result - result.stack(future_stack=True) + result.stack() diff --git a/doc/source/user_guide/index.rst b/doc/source/user_guide/index.rst index f0d6a76f0de5b..230b2b86b2ffd 100644 --- a/doc/source/user_guide/index.rst +++ b/doc/source/user_guide/index.rst @@ -78,6 +78,7 @@ Guides boolean visualization style + user_defined_functions groupby window timeseries diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index 4954ee1538697..ed5c7806b2e23 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -29,13 +29,6 @@ this area. production code, we recommended that you take advantage of the optimized pandas data access methods exposed in this chapter. -.. warning:: - - Whether a copy or a reference is returned for a setting operation, may - depend on the context. This is sometimes called ``chained assignment`` and - should be avoided. See :ref:`Returning a View versus Copy - `. - See the :ref:`MultiIndex / Advanced Indexing ` for ``MultiIndex`` and more advanced indexing documentation. See the :ref:`cookbook` for some advanced strategies. @@ -101,13 +94,14 @@ well). Any of the axes accessors may be the null slice ``:``. Axes left out of the specification are assumed to be ``:``, e.g. ``p.loc['a']`` is equivalent to ``p.loc['a', :]``. -.. csv-table:: - :header: "Object Type", "Indexers" - :widths: 30, 50 - :delim: ; - Series; ``s.loc[indexer]`` - DataFrame; ``df.loc[row_indexer,column_indexer]`` +.. ipython:: python + + ser = pd.Series(range(5), index=list("abcde")) + ser.loc[["a", "c", "e"]] + + df = pd.DataFrame(np.arange(25).reshape(5, 5), index=list("abcde"), columns=list("abcde")) + df.loc[["a", "c", "e"], ["b", "d"]] .. _indexing.basics: @@ -123,10 +117,9 @@ indexing pandas objects with ``[]``: .. csv-table:: :header: "Object Type", "Selection", "Return Value Type" :widths: 30, 30, 60 - :delim: ; - Series; ``series[label]``; scalar value - DataFrame; ``frame[colname]``; ``Series`` corresponding to colname + Series, ``series[label]``, scalar value + DataFrame, ``frame[colname]``, ``Series`` corresponding to colname Here we construct a simple time series data set to use for illustrating the indexing functionality: @@ -269,6 +262,10 @@ The most robust and consistent way of slicing ranges along arbitrary axes is described in the :ref:`Selection by Position ` section detailing the ``.iloc`` method. For now, we explain the semantics of slicing using the ``[]`` operator. + .. note:: + + When the :class:`Series` has float indices, slicing will select by position. + With Series, the syntax works exactly as with an ndarray, returning a slice of the values and the corresponding labels: @@ -299,12 +296,6 @@ largely as a convenience since it is such a common operation. Selection by label ------------------ -.. 
warning:: - - Whether a copy or a reference is returned for a setting operation, may depend on the context. - This is sometimes called ``chained assignment`` and should be avoided. - See :ref:`Returning a View versus Copy `. - .. warning:: ``.loc`` is strict when you present slicers that are not compatible (or convertible) with the index type. For example @@ -412,9 +403,9 @@ are returned: s = pd.Series(list('abcde'), index=[0, 3, 2, 5, 4]) s.loc[3:5] -If at least one of the two is absent, but the index is sorted, and can be -compared against start and stop labels, then slicing will still work as -expected, by selecting labels which *rank* between the two: +If the index is sorted, and can be compared against start and stop labels, +then slicing will still work as expected, by selecting labels which *rank* +between the two: .. ipython:: python @@ -445,12 +436,6 @@ For more information about duplicate labels, see Selection by position --------------------- -.. warning:: - - Whether a copy or a reference is returned for a setting operation, may depend on the context. - This is sometimes called ``chained assignment`` and should be avoided. - See :ref:`Returning a View versus Copy `. - pandas provides a suite of methods in order to get **purely integer based indexing**. The semantics follow closely Python and NumPy slicing. These are ``0-based`` indexing. When slicing, the start bound is *included*, while the upper bound is *excluded*. Trying to use a non-integer, even a **valid** label will raise an ``IndexError``. The ``.iloc`` attribute is the primary access method. The following are valid inputs: @@ -873,9 +858,10 @@ and :ref:`Advanced Indexing ` you may select along more than one axis .. warning:: - ``iloc`` supports two kinds of boolean indexing. If the indexer is a boolean ``Series``, - an error will be raised. For instance, in the following example, ``df.iloc[s.values, 1]`` is ok. - The boolean indexer is an array. But ``df.iloc[s, 1]`` would raise ``ValueError``. + While ``loc`` supports two kinds of boolean indexing, ``iloc`` only supports indexing with a + boolean array. If the indexer is a boolean ``Series``, an error will be raised. For instance, + in the following example, ``df.iloc[s.values, 1]`` is ok. The boolean indexer is an array. + But ``df.iloc[s, 1]`` would raise ``ValueError``. .. ipython:: python @@ -967,7 +953,7 @@ To select a row where each column meets its own criterion: values = {'ids': ['a', 'b'], 'ids2': ['a', 'c'], 'vals': [1, 3]} - row_mask = df.isin(values).all(1) + row_mask = df.isin(values).all(axis=1) df[row_mask] @@ -1722,234 +1708,10 @@ You can assign a custom index to the ``index`` attribute: df_idx.index = pd.Index([10, 20, 30, 40], name="a") df_idx -.. _indexing.view_versus_copy: - -Returning a view versus a copy ------------------------------- - -.. warning:: - - :ref:`Copy-on-Write ` - will become the new default in pandas 3.0. This means than chained indexing will - never work. As a consequence, the ``SettingWithCopyWarning`` won't be necessary - anymore. - See :ref:`this section ` - for more context. - We recommend turning Copy-on-Write on to leverage the improvements with - - ``` - pd.options.mode.copy_on_write = True - ``` - - even before pandas 3.0 is available. - -When setting values in a pandas object, care must be taken to avoid what is called -``chained indexing``. Here is an example. - -.. 
ipython:: python - - dfmi = pd.DataFrame([list('abcd'), - list('efgh'), - list('ijkl'), - list('mnop')], - columns=pd.MultiIndex.from_product([['one', 'two'], - ['first', 'second']])) - dfmi - -Compare these two access methods: - -.. ipython:: python - - dfmi['one']['second'] - -.. ipython:: python - - dfmi.loc[:, ('one', 'second')] - -These both yield the same results, so which should you use? It is instructive to understand the order -of operations on these and why method 2 (``.loc``) is much preferred over method 1 (chained ``[]``). - -``dfmi['one']`` selects the first level of the columns and returns a DataFrame that is singly-indexed. -Then another Python operation ``dfmi_with_one['second']`` selects the series indexed by ``'second'``. -This is indicated by the variable ``dfmi_with_one`` because pandas sees these operations as separate events. -e.g. separate calls to ``__getitem__``, so it has to treat them as linear operations, they happen one after another. - -Contrast this to ``df.loc[:,('one','second')]`` which passes a nested tuple of ``(slice(None),('one','second'))`` to a single call to -``__getitem__``. This allows pandas to deal with this as a single entity. Furthermore this order of operations *can* be significantly -faster, and allows one to index *both* axes if so desired. - Why does assignment fail when using chained indexing? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. warning:: - - :ref:`Copy-on-Write ` - will become the new default in pandas 3.0. This means than chained indexing will - never work. As a consequence, the ``SettingWithCopyWarning`` won't be necessary - anymore. - See :ref:`this section ` - for more context. - We recommend turning Copy-on-Write on to leverage the improvements with - - ``` - pd.options.mode.copy_on_write = True - ``` - - even before pandas 3.0 is available. - -The problem in the previous section is just a performance issue. What's up with -the ``SettingWithCopy`` warning? We don't **usually** throw warnings around when -you do something that might cost a few extra milliseconds! - -But it turns out that assigning to the product of chained indexing has -inherently unpredictable results. To see this, think about how the Python -interpreter executes this code: - -.. code-block:: python - - dfmi.loc[:, ('one', 'second')] = value - # becomes - dfmi.loc.__setitem__((slice(None), ('one', 'second')), value) - -But this code is handled differently: - -.. code-block:: python - - dfmi['one']['second'] = value - # becomes - dfmi.__getitem__('one').__setitem__('second', value) - -See that ``__getitem__`` in there? Outside of simple cases, it's very hard to -predict whether it will return a view or a copy (it depends on the memory layout -of the array, about which pandas makes no guarantees), and therefore whether -the ``__setitem__`` will modify ``dfmi`` or a temporary object that gets thrown -out immediately afterward. **That's** what ``SettingWithCopy`` is warning you -about! - -.. note:: You may be wondering whether we should be concerned about the ``loc`` - property in the first example. But ``dfmi.loc`` is guaranteed to be ``dfmi`` - itself with modified indexing behavior, so ``dfmi.loc.__getitem__`` / - ``dfmi.loc.__setitem__`` operate on ``dfmi`` directly. Of course, - ``dfmi.loc.__getitem__(idx)`` may be a view or a copy of ``dfmi``. - -Sometimes a ``SettingWithCopy`` warning will arise at times when there's no -obvious chained indexing going on. **These** are the bugs that -``SettingWithCopy`` is designed to catch! 
pandas is probably trying to warn you -that you've done this: - -.. code-block:: python - - def do_something(df): - foo = df[['bar', 'baz']] # Is foo a view? A copy? Nobody knows! - # ... many lines here ... - # We don't know whether this will modify df or not! - foo['quux'] = value - return foo - -Yikes! - -.. _indexing.evaluation_order: - -Evaluation order matters -~~~~~~~~~~~~~~~~~~~~~~~~ - -.. warning:: - - :ref:`Copy-on-Write ` - will become the new default in pandas 3.0. This means than chained indexing will - never work. As a consequence, the ``SettingWithCopyWarning`` won't be necessary - anymore. - See :ref:`this section ` - for more context. - We recommend turning Copy-on-Write on to leverage the improvements with - - ``` - pd.options.mode.copy_on_write = True - ``` - - even before pandas 3.0 is available. - -When you use chained indexing, the order and type of the indexing operation -partially determine whether the result is a slice into the original object, or -a copy of the slice. - -pandas has the ``SettingWithCopyWarning`` because assigning to a copy of a -slice is frequently not intentional, but a mistake caused by chained indexing -returning a copy where a slice was expected. - -If you would like pandas to be more or less trusting about assignment to a -chained indexing expression, you can set the :ref:`option ` -``mode.chained_assignment`` to one of these values: - -* ``'warn'``, the default, means a ``SettingWithCopyWarning`` is printed. -* ``'raise'`` means pandas will raise a ``SettingWithCopyError`` - you have to deal with. -* ``None`` will suppress the warnings entirely. - -.. ipython:: python - :okwarning: - - dfb = pd.DataFrame({'a': ['one', 'one', 'two', - 'three', 'two', 'one', 'six'], - 'c': np.arange(7)}) - - # This will show the SettingWithCopyWarning - # but the frame values will be set - dfb['c'][dfb['a'].str.startswith('o')] = 42 - -This however is operating on a copy and will not work. - -.. ipython:: python - :okwarning: - :okexcept: - - with pd.option_context('mode.chained_assignment','warn'): - dfb[dfb['a'].str.startswith('o')]['c'] = 42 - -A chained assignment can also crop up in setting in a mixed dtype frame. - -.. note:: - - These setting rules apply to all of ``.loc/.iloc``. - -The following is the recommended access method using ``.loc`` for multiple items (using ``mask``) and a single item using a fixed index: - -.. ipython:: python - - dfc = pd.DataFrame({'a': ['one', 'one', 'two', - 'three', 'two', 'one', 'six'], - 'c': np.arange(7)}) - dfd = dfc.copy() - # Setting multiple items using a mask - mask = dfd['a'].str.startswith('o') - dfd.loc[mask, 'c'] = 42 - dfd - - # Setting a single item - dfd = dfc.copy() - dfd.loc[2, 'a'] = 11 - dfd - -The following *can* work at times, but it is not guaranteed to, and therefore should be avoided: - -.. ipython:: python - :okwarning: - - dfd = dfc.copy() - dfd['a'][2] = 111 - dfd - -Last, the subsequent example will **not** work at all, and so should be avoided: - -.. ipython:: python - :okwarning: - :okexcept: - - with pd.option_context('mode.chained_assignment','raise'): - dfd.loc[0]['a'] = 1111 - -.. warning:: - - The chained assignment warnings / exceptions are aiming to inform the user of a possibly invalid - assignment. There may be false positives; situations where a chained assignment is inadvertently - reported. +:ref:`Copy-on-Write ` is the new default with pandas 3.0. +This means that chained indexing will never work. +See :ref:`this section ` +for more context. 
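The behavior this hunk documents can be shown with a minimal sketch (assuming pandas 3.0, where Copy-on-Write is the default; the frame and column names are illustrative only): the inner ``__getitem__`` of a chained assignment returns an intermediate object, so the write never reaches the original object, and a single ``.loc`` call is the supported pattern.

.. code-block:: python

    import pandas as pd

    dfc = pd.DataFrame({"a": ["one", "two", "three"], "c": [0, 1, 2]})

    # Chained assignment writes to an intermediate copy under Copy-on-Write,
    # so it never modifies ``dfc`` (pandas emits a ChainedAssignmentError warning):
    # dfc["c"][0] = 42

    # Supported pattern: assign through a single indexing call instead.
    dfc.loc[0, "c"] = 42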
diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst index 1a727cd78af09..8d35d1583d3bd 100644 --- a/doc/source/user_guide/integer_na.rst +++ b/doc/source/user_guide/integer_na.rst @@ -84,6 +84,19 @@ with the dtype. In the future, we may provide an option for :class:`Series` to infer a nullable-integer dtype. +If you create a column of ``NA`` values (for example to fill them later) +with ``df['new_col'] = pd.NA``, the ``dtype`` would be set to ``object`` in the +new column. The performance on this column will be worse than with +an appropriate type. It's better to use +``df['new_col'] = pd.Series(pd.NA, dtype="Int64")`` +(or another ``dtype`` that supports ``NA``). + +.. ipython:: python + + df = pd.DataFrame() + df['objects'] = pd.NA + df.dtypes + Operations ---------- @@ -134,7 +147,7 @@ Reduction and groupby operations such as :meth:`~DataFrame.sum` work as well. df.sum() df.groupby("B").A.sum() -Scalar NA Value +Scalar NA value --------------- :class:`arrays.IntegerArray` uses :attr:`pandas.NA` as its scalar diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 6148086452d54..2a7cab701eecf 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -16,27 +16,26 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like .. csv-table:: :header: "Format Type", "Data Description", "Reader", "Writer" :widths: 30, 100, 60, 60 - :delim: ; - - text;`CSV `__;:ref:`read_csv`;:ref:`to_csv` - text;Fixed-Width Text File;:ref:`read_fwf` - text;`JSON `__;:ref:`read_json`;:ref:`to_json` - text;`HTML `__;:ref:`read_html`;:ref:`to_html` - text;`LaTeX `__;;:ref:`Styler.to_latex` - text;`XML `__;:ref:`read_xml`;:ref:`to_xml` - text; Local clipboard;:ref:`read_clipboard`;:ref:`to_clipboard` - binary;`MS Excel `__;:ref:`read_excel`;:ref:`to_excel` - binary;`OpenDocument `__;:ref:`read_excel`; - binary;`HDF5 Format `__;:ref:`read_hdf`;:ref:`to_hdf` - binary;`Feather Format `__;:ref:`read_feather`;:ref:`to_feather` - binary;`Parquet Format `__;:ref:`read_parquet`;:ref:`to_parquet` - binary;`ORC Format `__;:ref:`read_orc`;:ref:`to_orc` - binary;`Stata `__;:ref:`read_stata`;:ref:`to_stata` - binary;`SAS `__;:ref:`read_sas`; - binary;`SPSS `__;:ref:`read_spss`; - binary;`Python Pickle Format `__;:ref:`read_pickle`;:ref:`to_pickle` - SQL;`SQL `__;:ref:`read_sql`;:ref:`to_sql` - SQL;`Google BigQuery `__;:ref:`read_gbq`;:ref:`to_gbq` + + text,`CSV `__, :ref:`read_csv`, :ref:`to_csv` + text,Fixed-Width Text File, :ref:`read_fwf`, NA + text,`JSON `__, :ref:`read_json`, :ref:`to_json` + text,`HTML `__, :ref:`read_html`, :ref:`to_html` + text,`LaTeX `__, NA, :ref:`Styler.to_latex` + text,`XML `__, :ref:`read_xml`, :ref:`to_xml` + text, Local clipboard, :ref:`read_clipboard`, :ref:`to_clipboard` + binary,`MS Excel `__, :ref:`read_excel`, :ref:`to_excel` + binary,`OpenDocument `__, :ref:`read_excel`, NA + binary,`HDF5 Format `__, :ref:`read_hdf`, :ref:`to_hdf` + binary,`Feather Format `__, :ref:`read_feather`, :ref:`to_feather` + binary,`Parquet Format `__, :ref:`read_parquet`, :ref:`to_parquet` + binary,`Apache Iceberg `__, :ref:`read_iceberg`, NA + binary,`ORC Format `__, :ref:`read_orc`, :ref:`to_orc` + binary,`Stata `__, :ref:`read_stata`, :ref:`to_stata` + binary,`SAS `__, :ref:`read_sas`, NA + binary,`SPSS `__, :ref:`read_spss`, NA + binary,`Python Pickle Format `__, :ref:`read_pickle`, :ref:`to_pickle` + SQL,`SQL `__, :ref:`read_sql`, :ref:`to_sql` :ref:`Here ` is an informal performance comparison for some of
these IO methods. @@ -61,8 +60,8 @@ Basic +++++ filepath_or_buffer : various - Either a path to a file (a :class:`python:str`, :class:`python:pathlib.Path`, - or :class:`py:py._path.local.LocalPath`), URL (including http, ftp, and S3 + Either a path to a file (a :class:`python:str` or :class:`python:pathlib.Path`), + a URL (including http, ftp, and S3 locations), or any object with a ``read()`` method (such as an open file or :class:`~python:io.StringIO`). sep : str, defaults to ``','`` for :func:`read_csv`, ``\t`` for :func:`read_table` @@ -75,14 +74,6 @@ sep : str, defaults to ``','`` for :func:`read_csv`, ``\t`` for :func:`read_tabl delimiters are prone to ignoring quoted data. Regex example: ``'\\r\\t'``. delimiter : str, default ``None`` Alternative argument name for sep. -delim_whitespace : boolean, default False - Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) - will be used as the delimiter. Equivalent to setting ``sep='\s+'``. - If this option is set to ``True``, nothing should be passed in for the - ``delimiter`` parameter. - - .. deprecated: 2.2.0 - Use ``sep="\\s+" instead. Column and index locations and names ++++++++++++++++++++++++++++++++++++ @@ -179,7 +170,7 @@ dtype_backend : {"numpy_nullable", "pyarrow"}, defaults to NumPy backed DataFram implementation when "numpy_nullable" is set, pyarrow is used for all dtypes if "pyarrow" is set. - The dtype_backends are still experimential. + The dtype_backends are still experimental. .. versionadded:: 2.0 @@ -272,34 +263,9 @@ parse_dates : boolean or list of ints or names or list of lists or dict, default * If ``True`` -> try parsing the index. * If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3 each as a separate date column. - * If ``[[1, 3]]`` -> combine columns 1 and 3 and parse as a single date - column. - * If ``{'foo': [1, 3]}`` -> parse columns 1, 3 as date and call result 'foo'. .. note:: A fast-path exists for iso8601-formatted dates. -infer_datetime_format : boolean, default ``False`` - If ``True`` and parse_dates is enabled for a column, attempt to infer the - datetime format to speed up the processing. - - .. deprecated:: 2.0.0 - A strict version of this argument is now the default, passing it has no effect. -keep_date_col : boolean, default ``False`` - If ``True`` and parse_dates specifies combining multiple columns then keep the - original columns. -date_parser : function, default ``None`` - Function to use for converting a sequence of string columns to an array of - datetime instances. The default uses ``dateutil.parser.parser`` to do the - conversion. pandas will try to call date_parser in three different ways, - advancing to the next if an exception occurs: 1) Pass one or more arrays (as - defined by parse_dates) as arguments; 2) concatenate (row-wise) the string - values from the columns defined by parse_dates into a single array and pass - that; and 3) call date_parser once for each row using one or more strings - (corresponding to the columns defined by parse_dates) as arguments. - - .. deprecated:: 2.0.0 - Use ``date_format`` instead, or read in as ``object`` and then apply - :func:`to_datetime` as-needed. date_format : str or dict of column -> format, default ``None`` If used in conjunction with ``parse_dates``, will parse dates according to this format. For anything more complex, @@ -831,71 +797,8 @@ The simplest case is to just pass in ``parse_dates=True``: It is often the case that we may want to store date and time data separately, or store various date fields separately.
the ``parse_dates`` keyword can be -used to specify a combination of columns to parse the dates and/or times from. - -You can specify a list of column lists to ``parse_dates``, the resulting date -columns will be prepended to the output (so as to not affect the existing column -order) and the new column names will be the concatenation of the component -column names: - -.. ipython:: python - :okwarning: - - data = ( - "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" - "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" - "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" - "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" - "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" - "KORD,19990127, 23:00:00, 22:56:00, -0.5900" - ) - - with open("tmp.csv", "w") as fh: - fh.write(data) - - df = pd.read_csv("tmp.csv", header=None, parse_dates=[[1, 2], [1, 3]]) - df - -By default the parser removes the component date columns, but you can choose -to retain them via the ``keep_date_col`` keyword: - -.. ipython:: python - :okwarning: - - df = pd.read_csv( - "tmp.csv", header=None, parse_dates=[[1, 2], [1, 3]], keep_date_col=True - ) - df - -Note that if you wish to combine multiple columns into a single date column, a -nested list must be used. In other words, ``parse_dates=[1, 2]`` indicates that -the second and third columns should each be parsed as separate date columns -while ``parse_dates=[[1, 2]]`` means the two columns should be parsed into a -single column. - -You can also use a dict to specify custom name columns: - -.. ipython:: python - :okwarning: +used to specify the columns to parse as dates and/or times. - date_spec = {"nominal": [1, 2], "actual": [1, 3]} - df = pd.read_csv("tmp.csv", header=None, parse_dates=date_spec) - df - -It is important to remember that if multiple text columns are to be parsed into -a single date column, then a new column is prepended to the data. The ``index_col`` -specification is based off of this new set of columns rather than the original -data columns: - - -.. ipython:: python - :okwarning: - - date_spec = {"nominal": [1, 2], "actual": [1, 3]} - df = pd.read_csv( - "tmp.csv", header=None, parse_dates=date_spec, index_col=0 - ) # index is the nominal column - df .. note:: If a column or index contains an unparsable date, the entire column or @@ -909,10 +812,6 @@ data columns: for your data to store datetimes in this format, load times will be significantly faster, ~20x has been observed. -.. deprecated:: 2.2.0 - Combining date columns inside read_csv is deprecated. Use ``pd.to_datetime`` - on the relevant result columns instead. - Date parsing functions ++++++++++++++++++++++ @@ -928,12 +827,6 @@ Performance-wise, you should try these methods of parsing dates in order: then use ``to_datetime``. -.. ipython:: python - :suppress: - - os.remove("tmp.csv") - - .. _io.csv.mixed_timezones: Parsing a CSV with mixed timezones @@ -1045,7 +938,7 @@ Writing CSVs to binary file objects ``df.to_csv(..., mode="wb")`` allows writing a CSV to a file object opened binary mode. In most cases, it is not necessary to specify -``mode`` as Pandas will auto-detect whether the file object is +``mode`` as pandas will auto-detect whether the file object is opened in text or binary mode. .. ipython:: python @@ -1098,7 +991,7 @@ Thousand separators For large numbers that have been written with a thousands separator, you can set the ``thousands`` keyword to a string of length 1 so that integers will be parsed -correctly: +correctly.
By default, numbers with a thousands separator will be parsed as strings: @@ -1605,7 +1498,7 @@ Specifying ``iterator=True`` will also return the ``TextFileReader`` object: Specifying the parser engine '''''''''''''''''''''''''''' -Pandas currently supports three engines, the C engine, the python engine, and an experimental +pandas currently supports three engines, the C engine, the python engine, and an experimental pyarrow engine (requires the ``pyarrow`` package). In general, the pyarrow engine is fastest on larger workloads and is equivalent in speed to the C engine on most other workloads. The python engine tends to be slower than the pyarrow and C engines on most workloads. However, @@ -1619,7 +1512,6 @@ Currently, options unsupported by the C and pyarrow engines include: * ``sep`` other than a single character (e.g. regex separators) * ``skipfooter`` -* ``sep=None`` with ``delim_whitespace=False`` Specifying any of the above options will produce a ``ParserWarning`` unless the python engine is selected explicitly using ``engine='python'``. @@ -1634,14 +1526,12 @@ Options that are unsupported by the pyarrow engine which are not covered by the * ``memory_map`` * ``dialect`` * ``on_bad_lines`` -* ``delim_whitespace`` * ``quoting`` * ``lineterminator`` * ``converters`` * ``decimal`` * ``iterator`` * ``dayfirst`` -* ``infer_datetime_format`` * ``verbose`` * ``skipinitialspace`` * ``low_memory`` @@ -1704,7 +1594,7 @@ option parameter: .. code-block:: python - storage_options = {"client_kwargs": {"endpoint_url": "http://127.0.0.1:5555"}}} + storage_options = {"client_kwargs": {"endpoint_url": "http://127.0.0.1:5555"}} df = pd.read_json("s3://pandas-test/test-1", storage_options=storage_options) More sample configurations and documentation can be found at `S3Fs documentation @@ -1838,14 +1728,13 @@ with optional parameters: .. csv-table:: :widths: 20, 150 - :delim: ; - ``split``; dict like {index -> [index], columns -> [columns], data -> [values]} - ``records``; list like [{column -> value}, ... , {column -> value}] - ``index``; dict like {index -> {column -> value}} - ``columns``; dict like {column -> {index -> value}} - ``values``; just the values array - ``table``; adhering to the JSON `Table Schema`_ + ``split``, dict like {index -> [index]; columns -> [columns]; data -> [values]} + ``records``, list like [{column -> value}; ... ] + ``index``, dict like {index -> {column -> value}} + ``columns``, dict like {column -> {index -> value}} + ``values``, just the values array + ``table``, adhering to the JSON `Table Schema`_ * ``date_format`` : string, type of date conversion, 'epoch' for timestamp, 'iso' for ISO8601. * ``double_precision`` : The number of decimal places to use when encoding floating point values, default 10. @@ -1950,13 +1839,6 @@ Writing in ISO date format, with microseconds: json = dfd.to_json(date_format="iso", date_unit="us") json -Epoch timestamps, in seconds: - -.. ipython:: python - - json = dfd.to_json(date_format="epoch", date_unit="s") - json - Writing to a file, with a date index and a date column: .. ipython:: python @@ -1966,7 +1848,7 @@ Writing to a file, with a date index and a date column: dfj2["ints"] = list(range(5)) dfj2["bools"] = True dfj2.index = pd.date_range("20130101", periods=5) - dfj2.to_json("test.json") + dfj2.to_json("test.json", date_format="iso") with open("test.json") as fh: print(fh.read()) @@ -2033,14 +1915,13 @@ is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series`` .. 
csv-table:: :widths: 20, 150 - :delim: ; - ``split``; dict like {index -> [index], columns -> [columns], data -> [values]} - ``records``; list like [{column -> value}, ... , {column -> value}] - ``index``; dict like {index -> {column -> value}} - ``columns``; dict like {column -> {index -> value}} - ``values``; just the values array - ``table``; adhering to the JSON `Table Schema`_ + ``split``, dict like {index -> [index]; columns -> [columns]; data -> [values]} + ``records``, list like [{column -> value} ...] + ``index``, dict like {index -> {column -> value}} + ``columns``, dict like {column -> {index -> value}} + ``values``, just the values array + ``table``, adhering to the JSON `Table Schema`_ * ``dtype`` : if True, infer dtypes, if a dict of column to dtype, then use those, if ``False``, then don't infer dtypes at all, default is True, apply only to the data. @@ -2141,7 +2022,7 @@ Dates written in nanoseconds need to be read back in nanoseconds: .. ipython:: python from io import StringIO - json = dfj2.to_json(date_unit="ns") + json = dfj2.to_json(date_format="iso", date_unit="ns") # Try to parse timestamps as milliseconds -> Won't Work dfju = pd.read_json(StringIO(json), date_unit="ms") @@ -2281,7 +2162,7 @@ a JSON string with two fields, ``schema`` and ``data``. { "A": [1, 2, 3], "B": ["a", "b", "c"], - "C": pd.date_range("2016-01-01", freq="d", periods=3), + "C": pd.date_range("2016-01-01", freq="D", periods=3), }, index=pd.Index(range(3), name="idx"), ) @@ -2390,7 +2271,7 @@ round-trippable manner. { "foo": [1, 2, 3, 4], "bar": ["a", "b", "c", "d"], - "baz": pd.date_range("2018-01-01", freq="d", periods=4), + "baz": pd.date_range("2018-01-01", freq="D", periods=4), "qux": pd.Categorical(["a", "b", "c", "c"]), }, index=pd.Index(range(4), name="idx"), @@ -2460,6 +2341,7 @@ Read a URL with no options: .. code-block:: ipython In [320]: url = "https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list" + In [321]: pd.read_html(url) Out[321]: [ Bank NameBank CityCity StateSt ... Acquiring InstitutionAI Closing DateClosing FundFund @@ -2486,6 +2368,7 @@ Read a URL while passing headers alongside the HTTP request: .. code-block:: ipython In [322]: url = 'https://www.sump.org/notes/request/' # HTTP request reflector + In [323]: pd.read_html(url) Out[323]: [ 0 1 @@ -2498,14 +2381,16 @@ Read a URL while passing headers alongside the HTTP request: 1 Host: www.sump.org 2 User-Agent: Python-urllib/3.8 3 Connection: close] + In [324]: headers = { - In [325]: 'User-Agent':'Mozilla Firefox v14.0', - In [326]: 'Accept':'application/json', - In [327]: 'Connection':'keep-alive', - In [328]: 'Auth':'Bearer 2*/f3+fe68df*4' - In [329]: } - In [340]: pd.read_html(url, storage_options=headers) - Out[340]: + .....: 'User-Agent':'Mozilla Firefox v14.0', + .....: 'Accept':'application/json', + .....: 'Connection':'keep-alive', + .....: 'Auth':'Bearer 2*/f3+fe68df*4' + .....: } + + In [325]: pd.read_html(url, storage_options=headers) + Out[325]: [ 0 1 0 Remote Socket: 51.15.105.256:51760 1 Protocol Version: HTTP/1.1 @@ -3013,16 +2898,17 @@ Read in the content of the "books.xml" as instance of ``StringIO`` or df Even read XML from AWS S3 buckets such as NIH NCBI PMC Article Datasets providing -Biomedical and Life Science Jorurnals: +Biomedical and Life Science Journals: -.. ipython:: python - :okwarning: +.. code-block:: python - df = pd.read_xml( - "s3://pmc-oa-opendata/oa_comm/xml/all/PMC1236943.xml", - xpath=".//journal-meta", - ) - df + >>> df = pd.read_xml( + ... 
"s3://pmc-oa-opendata/oa_comm/xml/all/PMC1236943.xml", + ... xpath=".//journal-meta", + ...) + >>> df + journal-id journal-title issn publisher + 0 Cardiovasc Ultrasound Cardiovascular Ultrasound 1476-7120 NaN With `lxml`_ as default ``parser``, you access the full-featured XML library that extends Python's ElementTree API. One powerful tool is ability to query @@ -3122,7 +3008,7 @@ However, if XPath does not reference node names such as default, ``/*``, then .. note:: Since ``xpath`` identifies the parent of content to be parsed, only immediate - desendants which include child nodes or current attributes are parsed. + descendants which include child nodes or current attributes are parsed. Therefore, ``read_xml`` will not parse the text of grandchildren or other descendants and will not parse attributes of any descendant. To retrieve lower level content, adjust xpath to lower level. For example, @@ -3247,7 +3133,7 @@ output (as shown below for demonstration) for easier parse into ``DataFrame``: """ - df = pd.read_xml(StringIO(xml), stylesheet=xsl) + df = pd.read_xml(StringIO(xml), stylesheet=StringIO(xsl)) df For very large XML files that can range in hundreds of megabytes to gigabytes, :func:`pandas.read_xml` @@ -3418,7 +3304,7 @@ Write an XML and transform with stylesheet: """ - print(geom_df.to_xml(stylesheet=xsl)) + print(geom_df.to_xml(stylesheet=StringIO(xsl))) XML Final Notes @@ -3471,20 +3357,15 @@ saving a ``DataFrame`` to Excel. Generally the semantics are similar to working with :ref:`csv` data. See the :ref:`cookbook` for some advanced strategies. -.. warning:: - - The `xlrd `__ package is now only for reading - old-style ``.xls`` files. +.. note:: - Before pandas 1.3.0, the default argument ``engine=None`` to :func:`~pandas.read_excel` - would result in using the ``xlrd`` engine in many cases, including new - Excel 2007+ (``.xlsx``) files. pandas will now default to using the - `openpyxl `__ engine. + When ``engine=None``, the following logic will be used to determine the engine: - It is strongly encouraged to install ``openpyxl`` to read Excel 2007+ - (``.xlsx``) files. - **Please do not report issues when using ``xlrd`` to read ``.xlsx`` files.** - This is no longer supported, switch to using ``openpyxl`` instead. + - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), + then `odf `_ will be used. + - Otherwise if ``path_or_buffer`` is an xls format, ``xlrd`` will be used. + - Otherwise if ``path_or_buffer`` is in xlsb format, ``pyxlsb`` will be used. + - Otherwise ``openpyxl`` will be used. .. _io.excel_reader: @@ -3659,7 +3540,7 @@ For example, to read in a ``MultiIndex`` index without names: df = pd.read_excel("path_to_file.xlsx", index_col=[0, 1]) df -If the index has level names, they will parsed as well, using the same +If the index has level names, they will be parsed as well, using the same parameters. .. ipython:: python @@ -3913,6 +3794,20 @@ The look and feel of Excel worksheets created from pandas can be modified using * ``float_format`` : Format string for floating point numbers (default ``None``). * ``freeze_panes`` : A tuple of two integers representing the bottommost row and rightmost column to freeze. Each of these parameters is one-based, so (1, 1) will freeze the first row and first column (default ``None``). +.. note:: + + As of pandas 3.0, by default spreadsheets created with the ``to_excel`` method + will not contain any styling. 
Users wishing to bold text, add bordered styles, + etc. in a worksheet output by ``to_excel`` can do so by using :meth:`Styler.to_excel` + to create styled excel files. For documentation on styling spreadsheets, see + `here `__. + + +.. code-block:: python + + css = "border: 1px solid black; font-weight: bold;" + df.style.map_index(lambda x: css).map_index(lambda x: css, axis=1).to_excel("myfile.xlsx") + Using the `Xlsxwriter`_ engine provides many options for controlling the format of an Excel worksheet created with the ``to_excel`` method. Excellent examples can be found in the `Xlsxwriter`_ documentation here: https://xlsxwriter.readthedocs.io/working_with_pandas.html @@ -5100,7 +4995,7 @@ Caveats convenience you can use ``store.flush(fsync=True)`` to do this for you. * Once a ``table`` is created columns (DataFrame) are fixed; only exactly the same columns can be appended -* Be aware that timezones (e.g., ``pytz.timezone('US/Eastern')``) +* Be aware that timezones (e.g., ``zoneinfo.ZoneInfo('US/Eastern')``) are not necessarily equal across timezone versions. So if data is localized to a specific timezone in the HDFStore using one version of a timezone library and that data is updated with another version, the data @@ -5279,6 +5174,8 @@ See the `Full Documentation `__. .. ipython:: python + import pytz + df = pd.DataFrame( { "a": list("abc"), "b": list(range(1, 4)), "c": np.arange(3, 6).astype("u1"), "d": np.arange(4.0, 7.0, dtype="float64"), "e": [True, False, True], "f": pd.Categorical(list("abc")), "g": pd.date_range("20130101", periods=3), - "h": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "h": pd.date_range("20130101", periods=3, tz=pytz.timezone("US/Eastern")), "i": pd.date_range("20130101", periods=3, freq="ns"), } ) @@ -5507,6 +5404,102 @@ The above example creates a partitioned dataset that may look like: except OSError: pass +.. _io.iceberg: + +Iceberg +------- + +.. versionadded:: 3.0.0 + +Apache Iceberg is a high performance open-source format for large analytic tables. +Iceberg enables the use of SQL tables for big data while making it possible for different +engines to safely work with the same tables at the same time. + +Iceberg supports predicate pushdown and column pruning, which are available to pandas +users via the ``row_filter`` and ``selected_fields`` parameters of the :func:`~pandas.read_iceberg` +function. This is convenient to extract from large tables a subset that fits in memory as a +pandas ``DataFrame``. + +Internally, pandas uses PyIceberg_ to query Iceberg. + +.. _PyIceberg: https://py.iceberg.apache.org/ + +A simple example loading all data from an Iceberg table ``my_table`` defined in the +``my_catalog`` catalog: + +.. code-block:: python + + df = pd.read_iceberg("my_table", catalog_name="my_catalog") + +Catalogs must be defined in the ``.pyiceberg.yaml`` file, usually in the home directory. +It is possible to change properties of the catalog definition with the +``catalog_properties`` parameter: + +.. code-block:: python + + df = pd.read_iceberg( + "my_table", + catalog_name="my_catalog", + catalog_properties={"s3.secret-access-key": "my_secret"}, + ) + +It is also possible to fully specify the catalog in ``catalog_properties`` and not provide +a ``catalog_name``: + +.. code-block:: python + + df = pd.read_iceberg( + "my_table", + catalog_properties={ + "uri": "http://127.0.0.1:8181", + "s3.endpoint": "http://127.0.0.1:9000", + }, + ) + +To create the ``DataFrame`` with only a subset of the columns: + +..
code-block:: python + + df = pd.read_iceberg( + "my_table", + catalog_name="my_catalog", + selected_fields=["my_column_3", "my_column_7"] + ) + +This will execute faster, since the other columns won't be read. It will also +save memory, since the data from the other columns won't be loaded into the underlying memory of +the ``DataFrame``. + +To fetch only a subset of the rows, use the ``limit`` parameter: + +.. code-block:: python + + df = pd.read_iceberg( + "my_table", + catalog_name="my_catalog", + limit=100, + ) + +This will create a ``DataFrame`` with 100 rows, assuming the table contains at least +that many rows. + +To fetch a subset of the rows based on a condition, use the ``row_filter`` +parameter: + +.. code-block:: python + + df = pd.read_iceberg( + "my_table", + catalog_name="my_catalog", + row_filter="distance > 10.0", + ) + +Reading a particular snapshot is also possible by providing the snapshot ID as an argument to +``snapshot_id``. + +More information about the Iceberg format can be found in the `Apache Iceberg official +page `__. + .. _io.orc: ORC @@ -5957,10 +5950,10 @@ You can check if a table exists using :func:`~pandas.io.sql.has_table` Schema support '''''''''''''' -Reading from and writing to different schema's is supported through the ``schema`` +Reading from and writing to different schemas is supported through the ``schema`` keyword in the :func:`~pandas.read_sql_table` and :func:`~pandas.DataFrame.to_sql` functions. Note however that this depends on the database flavor (sqlite does not -have schema's). For example: +have schemas). For example: .. code-block:: python @@ -6100,15 +6093,11 @@ Google BigQuery The ``pandas-gbq`` package provides functionality to read/write from Google BigQuery. -pandas integrates with this external package. if ``pandas-gbq`` is installed, you can -use the pandas methods ``pd.read_gbq`` and ``DataFrame.to_gbq``, which will call the -respective functions from ``pandas-gbq``. - Full documentation can be found `here `__. .. _io.stata: -Stata format +STATA format ------------ .. _io.stata_writer: @@ -6395,7 +6384,7 @@ ignored. In [2]: df = pd.DataFrame({'A': np.random.randn(sz), 'B': [1] * sz}) In [3]: df.info() - + RangeIndex: 1000000 entries, 0 to 999999 Data columns (total 2 columns): A 1000000 non-null float64 diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst index c9c8478a719f0..af377dd7a32f2 100644 --- a/doc/source/user_guide/merging.rst +++ b/doc/source/user_guide/merging.rst @@ -107,7 +107,7 @@ Joining logic of the resulting axis The ``join`` keyword specifies how to handle axis values that don't exist in the first :class:`DataFrame`. -``join='outer'`` takes the union of all axis values +``join='outer'`` takes the union of all axis values. .. ipython:: python @@ -130,7 +130,7 @@ The ``join`` keyword specifies how to handle axis values that don't exist in the p.plot([df1, df4], result, labels=["df1", "df4"], vertical=False); plt.close("all"); -``join='inner'`` takes the intersection of the axis values +``join='inner'`` takes the intersection of the axis values. .. ipython:: python @@ -249,7 +249,7 @@ a :class:`MultiIndex`) associate specific keys with each original :class:`DataFr p.plot(frames, result, labels=["df1", "df2", "df3"], vertical=True) plt.close("all"); -The ``keys`` argument cane override the column names +The ``keys`` argument can override the column names when creating a new :class:`DataFrame` based on existing :class:`Series`. ..
ipython:: python @@ -296,7 +296,7 @@ the index of the :class:`DataFrame` pieces: result.index.levels -``levels`` argument allows specifying resulting levels associated with the ``keys`` +``levels`` argument allows specifying resulting levels associated with the ``keys``. .. ipython:: python @@ -322,7 +322,7 @@ Appending rows to a :class:`DataFrame` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ If you have a :class:`Series` that you want to append as a single row to a :class:`DataFrame`, you can convert the row into a -:class:`DataFrame` and use :func:`concat` +:class:`DataFrame` and use :func:`concat`. .. ipython:: python @@ -355,7 +355,7 @@ Merge types their indexes which must contain unique values. * **many-to-one**: joining a unique index to one or more columns in a different :class:`DataFrame`. -* **many-to-many** : joining columns on columns. +* **many-to-many**: joining columns on columns. .. note:: @@ -484,9 +484,10 @@ either the left or right tables, the values in the joined table will be p.plot([left, right], result, labels=["left", "right"], vertical=False); plt.close("all"); -You can :class:`Series` and a :class:`DataFrame` with a :class:`MultiIndex` if the names of -the :class:`MultiIndex` correspond to the columns from the :class:`DataFrame`. Transform -the :class:`Series` to a :class:`DataFrame` using :meth:`Series.reset_index` before merging +You can merge :class:`Series` and a :class:`DataFrame` with a :class:`MultiIndex` if the names of +the :class:`MultiIndex` correspond to the columns from the :class:`DataFrame`. You can also +transform the :class:`Series` to a :class:`DataFrame` using :meth:`Series.reset_index` +before merging: .. ipython:: python @@ -504,7 +505,7 @@ the :class:`Series` to a :class:`DataFrame` using :meth:`Series.reset_index` bef pd.merge(df, ser.reset_index(), on=["Let", "Num"]) -Performing an outer join with duplicate join keys in :class:`DataFrame` +Performing an outer join with duplicate join keys in :class:`DataFrame`: .. ipython:: python @@ -586,7 +587,7 @@ A string argument to ``indicator`` will use the value as the name for the indica Overlapping value columns ~~~~~~~~~~~~~~~~~~~~~~~~~ -The merge ``suffixes`` argument takes a tuple of list of strings to append to +The merge ``suffixes`` argument takes a tuple or list of strings to append to overlapping column names in the input :class:`DataFrame` to disambiguate the result columns: @@ -763,7 +764,7 @@ Joining a single Index to a MultiIndex ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ You can join a :class:`DataFrame` with a :class:`Index` to a :class:`DataFrame` with a :class:`MultiIndex` on a level. -The ``name`` of the :class:`Index` with match the level name of the :class:`MultiIndex`. +The ``name`` of the :class:`Index` will match the level name of the :class:`MultiIndex`. .. ipython:: python @@ -906,7 +907,7 @@ resetting indexes. Joining multiple :class:`DataFrame` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -A list or tuple of ``:class:`DataFrame``` can also be passed to :meth:`~DataFrame.join` +A list or tuple of :class:`DataFrame` can also be passed to :meth:`~DataFrame.join` to join them together on their indexes. .. ipython:: python @@ -956,7 +957,7 @@ location. :func:`merge_ordered` --------------------- -:func:`merge_ordered` combines order data such as numeric or time series data +:func:`merge_ordered` combines ordered data such as numeric or time series data with optional filling of missing data with ``fill_method``. .. 
ipython:: python @@ -974,12 +975,12 @@ with optional filling of missing data with ``fill_method``. :func:`merge_asof` --------------------- -:func:`merge_asof` is similar to an ordered left-join except that mactches are on the +:func:`merge_asof` is similar to an ordered left-join except that matches are on the nearest key rather than equal keys. For each row in the ``left`` :class:`DataFrame`, the last row in the ``right`` :class:`DataFrame` are selected where the ``on`` key is less than the left's key. Both :class:`DataFrame` must be sorted by the key. -Optionally an :func:`merge_asof` can perform a group-wise merge by matching the +Optionally :func:`merge_asof` can perform a group-wise merge by matching the ``by`` key in addition to the nearest match on the ``on`` key. .. ipython:: python @@ -1073,7 +1074,7 @@ compare two :class:`DataFrame` or :class:`Series`, respectively, and summarize t df.compare(df2) By default, if two corresponding values are equal, they will be shown as ``NaN``. -Furthermore, if all values in an entire row / column, the row / column will be +Furthermore, if all values in an entire row / column are equal, that row / column will be omitted from the result. The remaining differences will be aligned on columns. Stack the differences on rows. @@ -1082,7 +1083,7 @@ Stack the differences on rows. df.compare(df2, align_axis=0) -Keep all original rows and columns with ``keep_shape=True`` +Keep all original rows and columns with ``keep_shape=True``. .. ipython:: python diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 4a2aa565dd15c..56f4c80cbde16 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -21,7 +21,7 @@ is that the original data type will be coerced to ``np.float64`` or ``object``. pd.Series([True, False], dtype=np.bool_).reindex([0, 1, 2]) :class:`NaT` for NumPy ``np.datetime64``, ``np.timedelta64``, and :class:`PeriodDtype`. For typing applications, -use :class:`api.types.NaTType`. +use :class:`api.typing.NaTType`. .. ipython:: python @@ -30,9 +30,9 @@ use :class:`api.types.NaTType`. pd.Series(["2020", "2020"], dtype=pd.PeriodDtype("D")).reindex([0, 1, 2]) :class:`NA` for :class:`StringDtype`, :class:`Int64Dtype` (and other bit widths), -:class:`Float64Dtype`(and other bit widths), :class:`BooleanDtype` and :class:`ArrowDtype`. +:class:`Float64Dtype` (and other bit widths), :class:`BooleanDtype` and :class:`ArrowDtype`. These types will maintain the original data type of the data. -For typing applications, use :class:`api.types.NAType`. +For typing applications, use :class:`api.typing.NAType`. .. ipython:: python @@ -60,7 +60,7 @@ To detect these missing value, use the :func:`isna` or :func:`notna` methods. .. warning:: - Equality compaisons between ``np.nan``, :class:`NaT`, and :class:`NA` + Equality comparisons between ``np.nan``, :class:`NaT`, and :class:`NA` do not act like ``None`` .. ipython:: python @@ -88,7 +88,7 @@ To detect these missing value, use the :func:`isna` or :func:`notna` methods. .. warning:: - Experimental: the behaviour of :class:`NA`` can still change without warning. + Experimental: the behaviour of :class:`NA` can still change without warning. Starting from pandas 1.0, an experimental :class:`NA` value (singleton) is available to represent scalar missing values. 
The goal of :class:`NA` is provide a @@ -105,7 +105,7 @@ dtype, it will use :class:`NA`: s[2] s[2] is pd.NA -Currently, pandas does not yet use those data types using :class:`NA` by default +Currently, pandas does not use those data types using :class:`NA` by default in a :class:`DataFrame` or :class:`Series`, so you need to specify the dtype explicitly. An easy way to convert to those dtypes is explained in the :ref:`conversion section `. @@ -253,14 +253,11 @@ Conversion ^^^^^^^^^^ If you have a :class:`DataFrame` or :class:`Series` using ``np.nan``, -:meth:`Series.convert_dtypes` and :meth:`DataFrame.convert_dtypes` -in :class:`DataFrame` that can convert data to use the data types that use :class:`NA` +:meth:`DataFrame.convert_dtypes` and :meth:`Series.convert_dtypes`, respectively, +will convert your data to use the nullable data types supporting :class:`NA`, such as :class:`Int64Dtype` or :class:`ArrowDtype`. This is especially helpful after reading in data sets from IO methods where data types were inferred. -In this example, while the dtypes of all columns are changed, we show the results for -the first 10 columns. - .. ipython:: python import io @@ -319,7 +316,7 @@ Missing values propagate through arithmetic operations between pandas objects. The descriptive statistics and computational methods discussed in the :ref:`data structure overview ` (and listed :ref:`here -` and :ref:`here `) are all +` and :ref:`here `) all account for missing data. When summing data, NA values or empty data will be treated as zero. @@ -337,10 +334,8 @@ When taking the product, NA values or empty data will be treated as 1. pd.Series([], dtype="float64").prod() Cumulative methods like :meth:`~DataFrame.cumsum` and :meth:`~DataFrame.cumprod` -ignore NA values by default preserve them in the result. This behavior can be changed -with ``skipna`` - -* Cumulative methods like :meth:`~DataFrame.cumsum` and :meth:`~DataFrame.cumprod` ignore NA values by default, but preserve them in the resulting arrays. To override this behaviour and include NA values, use ``skipna=False``. +ignore NA values by default, but preserve them in the resulting array. To override +this behaviour and include NA values in the calculation, use ``skipna=False``. .. ipython:: python @@ -355,7 +350,7 @@ with ``skipna`` Dropping missing data ~~~~~~~~~~~~~~~~~~~~~ -:meth:`~DataFrame.dropna` dropa rows or columns with missing data. +:meth:`~DataFrame.dropna` drops rows or columns with missing data. .. ipython:: python @@ -386,6 +381,27 @@ Replace NA with a scalar value df df.fillna(0) +When the data has object dtype, you can control what type of NA values are present. + +.. ipython:: python + + df = pd.DataFrame({"a": [pd.NA, np.nan, None]}, dtype=object) + df + df.fillna(None) + df.fillna(np.nan) + df.fillna(pd.NA) + +However when the dtype is not object, these will all be replaced with the proper NA value for the dtype. + +.. ipython:: python + + data = {"np": [1.0, np.nan, np.nan, 2], "arrow": pd.array([1.0, pd.NA, pd.NA, 2], dtype="float64[pyarrow]")} + df = pd.DataFrame(data) + df + df.fillna(None) + df.fillna(np.nan) + df.fillna(pd.NA) + Fill gaps forward or backward .. ipython:: python @@ -415,7 +431,7 @@ where the index and column aligns between the original object and the filled obj .. note:: - :meth:`DataFrame.where` can also be used to fill NA values.Same result as above. + :meth:`DataFrame.where` can also be used to fill NA values. Same result as above. .. 
ipython:: python diff --git a/doc/source/user_guide/options.rst b/doc/source/user_guide/options.rst index ce805f98ca528..7757d95c2bccd 100644 --- a/doc/source/user_guide/options.rst +++ b/doc/source/user_guide/options.rst @@ -8,7 +8,7 @@ Options and settings Overview -------- -pandas has an options API configure and customize global behavior related to +pandas has an options API to configure and customize global behavior related to :class:`DataFrame` display, data behavior and more. Options have a full "dotted-style", case-insensitive name (e.g. ``display.max_rows``). diff --git a/doc/source/user_guide/pyarrow.rst b/doc/source/user_guide/pyarrow.rst index 61b383afb7c43..1807341530e69 100644 --- a/doc/source/user_guide/pyarrow.rst +++ b/doc/source/user_guide/pyarrow.rst @@ -22,7 +22,7 @@ Data Structure Integration A :class:`Series`, :class:`Index`, or the columns of a :class:`DataFrame` can be directly backed by a :external+pyarrow:py:class:`pyarrow.ChunkedArray` which is similar to a NumPy array. To construct these from the main pandas data structures, you can pass in a string of the type followed by -``[pyarrow]``, e.g. ``"int64[pyarrow]""`` into the ``dtype`` parameter +``[pyarrow]``, e.g. ``"int64[pyarrow]"`` into the ``dtype`` parameter .. ipython:: python @@ -159,9 +159,11 @@ PyArrow also provides IO reading functionality that has been integrated into sev functions provide an ``engine`` keyword that can dispatch to PyArrow to accelerate reading from an IO source. * :func:`read_csv` +* :func:`read_feather` * :func:`read_json` * :func:`read_orc` -* :func:`read_feather` +* :func:`read_parquet` +* :func:`read_table` (experimental) .. ipython:: python diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index a35cfb396f1f6..8c5e98791a9ef 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -163,7 +163,7 @@ as having a multi-level index: .. ipython:: python - table.stack(future_stack=True) + table.stack() .. _reshaping.stacking: @@ -209,7 +209,7 @@ stacked level becomes the new lowest level in a :class:`MultiIndex` on the colum .. ipython:: python - stacked = df2.stack(future_stack=True) + stacked = df2.stack() stacked With a "stacked" :class:`DataFrame` or :class:`Series` (having a :class:`MultiIndex` as the @@ -245,7 +245,7 @@ will result in a **sorted** copy of the original :class:`DataFrame` or :class:`S index = pd.MultiIndex.from_product([[2, 1], ["a", "b"]]) df = pd.DataFrame(np.random.randn(4), index=index, columns=["A"]) df - all(df.unstack().stack(future_stack=True) == df.sort_index()) + all(df.unstack().stack() == df.sort_index()) .. _reshaping.stack_multiple: @@ -270,16 +270,16 @@ processed individually. df = pd.DataFrame(np.random.randn(4, 4), columns=columns) df - df.stack(level=["animal", "hair_length"], future_stack=True) + df.stack(level=["animal", "hair_length"]) The list of levels can contain either level names or level numbers but not a mixture of the two. .. ipython:: python - # df.stack(level=['animal', 'hair_length'], future_stack=True) + # df.stack(level=['animal', 'hair_length']) # from above is equivalent to: - df.stack(level=[1, 2], future_stack=True) + df.stack(level=[1, 2]) Missing data ~~~~~~~~~~~~ @@ -321,7 +321,7 @@ The missing value can be filled with a specific value with the ``fill_value`` ar .. 
image:: ../_static/reshaping_melt.png The top-level :func:`~pandas.melt` function and the corresponding :meth:`DataFrame.melt` -are useful to massage a :class:`DataFrame` into a format where one or more columns +are useful to reshape a :class:`DataFrame` into a format where one or more columns are *identifier variables*, while all other columns, considered *measured variables*, are "unpivoted" to the row axis, leaving just two non-identifier columns, "variable" and "value". The names of those columns can be customized diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst index b262de5d71439..d12993f7ead4b 100644 --- a/doc/source/user_guide/scale.rst +++ b/doc/source/user_guide/scale.rst @@ -5,7 +5,7 @@ Scaling to large datasets ************************* pandas provides data structures for in-memory analytics, which makes using pandas -to analyze datasets that are larger than memory datasets somewhat tricky. Even datasets +to analyze datasets that are larger than memory somewhat tricky. Even datasets that are a sizable fraction of memory become unwieldy, as some pandas operations need to make intermediate copies. @@ -156,7 +156,7 @@ fits in memory, you can work with datasets that are much larger than memory. Chunking works well when the operation you're performing requires zero or minimal coordination between chunks. For more complicated workflows, you're better off - :ref:`using another library `. + :ref:`using other libraries `. Suppose we have an even larger "logical dataset" on disk that's a directory of parquet files. Each file in the directory represents a different year of the entire dataset. @@ -219,160 +219,10 @@ different library that implements these out-of-core algorithms for you. .. _scale.other_libraries: -Use Dask --------- +Use Other Libraries +------------------- -pandas is just one library offering a DataFrame API. Because of its popularity, -pandas' API has become something of a standard that other libraries implement. -The pandas documentation maintains a list of libraries implementing a DataFrame API -in `the ecosystem page `_. - -For example, `Dask`_, a parallel computing library, has `dask.dataframe`_, a -pandas-like API for working with larger than memory datasets in parallel. Dask -can use multiple threads or processes on a single machine, or a cluster of -machines to process data in parallel. - - -We'll import ``dask.dataframe`` and notice that the API feels similar to pandas. -We can use Dask's ``read_parquet`` function, but provide a globstring of files to read in. - -.. ipython:: python - :okwarning: - - import dask.dataframe as dd - - ddf = dd.read_parquet("data/timeseries/ts*.parquet", engine="pyarrow") - ddf - -Inspecting the ``ddf`` object, we see a few things - -* There are familiar attributes like ``.columns`` and ``.dtypes`` -* There are familiar methods like ``.groupby``, ``.sum``, etc. -* There are new attributes like ``.npartitions`` and ``.divisions`` - -The partitions and divisions are how Dask parallelizes computation. A **Dask** -DataFrame is made up of many pandas :class:`pandas.DataFrame`. A single method call on a -Dask DataFrame ends up making many pandas method calls, and Dask knows how to -coordinate everything to get the result. - -.. ipython:: python - - ddf.columns - ddf.dtypes - ddf.npartitions - -One major difference: the ``dask.dataframe`` API is *lazy*. If you look at the -repr above, you'll notice that the values aren't actually printed out; just the -column names and dtypes. 
That's because Dask hasn't actually read the data yet. -Rather than executing immediately, doing operations build up a **task graph**. - -.. ipython:: python - :okwarning: - - ddf - ddf["name"] - ddf["name"].value_counts() - -Each of these calls is instant because the result isn't being computed yet. -We're just building up a list of computation to do when someone needs the -result. Dask knows that the return type of a :class:`pandas.Series.value_counts` -is a pandas :class:`pandas.Series` with a certain dtype and a certain name. So the Dask version -returns a Dask Series with the same dtype and the same name. - -To get the actual result you can call ``.compute()``. - -.. ipython:: python - :okwarning: - - %time ddf["name"].value_counts().compute() - -At that point, you get back the same thing you'd get with pandas, in this case -a concrete pandas :class:`pandas.Series` with the count of each ``name``. - -Calling ``.compute`` causes the full task graph to be executed. This includes -reading the data, selecting the columns, and doing the ``value_counts``. The -execution is done *in parallel* where possible, and Dask tries to keep the -overall memory footprint small. You can work with datasets that are much larger -than memory, as long as each partition (a regular pandas :class:`pandas.DataFrame`) fits in memory. - -By default, ``dask.dataframe`` operations use a threadpool to do operations in -parallel. We can also connect to a cluster to distribute the work on many -machines. In this case we'll connect to a local "cluster" made up of several -processes on this single machine. - -.. code-block:: python - - >>> from dask.distributed import Client, LocalCluster - - >>> cluster = LocalCluster() - >>> client = Client(cluster) - >>> client - - -Once this ``client`` is created, all of Dask's computation will take place on -the cluster (which is just processes in this case). - -Dask implements the most used parts of the pandas API. For example, we can do -a familiar groupby aggregation. - -.. ipython:: python - :okwarning: - - %time ddf.groupby("name")[["x", "y"]].mean().compute().head() - -The grouping and aggregation is done out-of-core and in parallel. - -When Dask knows the ``divisions`` of a dataset, certain optimizations are -possible. When reading parquet datasets written by dask, the divisions will be -known automatically. In this case, since we created the parquet files manually, -we need to supply the divisions manually. - -.. ipython:: python - :okwarning: - - N = 12 - starts = [f"20{i:>02d}-01-01" for i in range(N)] - ends = [f"20{i:>02d}-12-13" for i in range(N)] - - divisions = tuple(pd.to_datetime(starts)) + (pd.Timestamp(ends[-1]),) - ddf.divisions = divisions - ddf - -Now we can do things like fast random access with ``.loc``. - -.. ipython:: python - :okwarning: - - ddf.loc["2002-01-01 12:01":"2002-01-01 12:05"].compute() - -Dask knows to just look in the 3rd partition for selecting values in 2002. It -doesn't need to look at any other data. - -Many workflows involve a large amount of data and processing it in a way that -reduces the size to something that fits in memory. In this case, we'll resample -to daily frequency and take the mean. Once we've taken the mean, we know the -results will fit in memory, so we can safely call ``compute`` without running -out of memory. At that point it's just a regular pandas object. - -.. ipython:: python - :okwarning: - - @savefig dask_resample.png - ddf[["x", "y"]].resample("1D").mean().cumsum().compute().plot() - -.. 
ipython:: python - :suppress: - - import shutil - - shutil.rmtree("data/timeseries") - -These Dask examples have all be done using multiple processes on a single -machine. Dask can be `deployed on a cluster -`_ to scale up to even larger -datasets. - -You see more dask examples at https://examples.dask.org. - -.. _Dask: https://dask.org -.. _dask.dataframe: https://docs.dask.org/en/latest/dataframe.html +There are other libraries that provide APIs similar to pandas and work nicely with pandas DataFrames, +and they can give you the ability to scale large dataset processing and analytics +via parallel runtimes, distributed memory, clustering, etc. You can find more information +in `the ecosystem page <https://pandas.pydata.org/community/ecosystem.html>`_. diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index f22a506499cf4..9cda1486eb48b 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -38,19 +38,6 @@ "[concatfunc]: ../reference/api/pandas.io.formats.style.Styler.concat.rst" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "nbsphinx": "hidden" - }, - "outputs": [], - "source": [ - "import matplotlib.pyplot\n", - "# We have this here to trigger matplotlib's font cache stuff.\n", - "# This cell is hidden from the output" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -78,17 +65,13 @@ "source": [ "import pandas as pd\n", "import numpy as np\n", - "import matplotlib as mpl\n", "\n", - "df = pd.DataFrame({\n", - " \"strings\": [\"Adam\", \"Mike\"],\n", - " \"ints\": [1, 3],\n", - " \"floats\": [1.123, 1000.23]\n", - "})\n", - "df.style \\\n", - " .format(precision=3, thousands=\".\", decimal=\",\") \\\n", - " .format_index(str.upper, axis=1) \\\n", - " .relabel_index([\"row 1\", \"row 2\"], axis=0)" + "df = pd.DataFrame(\n", + " {\"strings\": [\"Adam\", \"Mike\"], \"ints\": [1, 3], \"floats\": [1.123, 1000.23]}\n", + ")\n", + "df.style.format(precision=3, thousands=\".\", decimal=\",\").format_index(\n", + " str.upper, axis=1\n", + ").relabel_index([\"row 1\", \"row 2\"], axis=0)" ] }, { @@ -104,17 +87,21 @@ "metadata": {}, "outputs": [], "source": [ - "weather_df = pd.DataFrame(np.random.rand(10,2)*5, \n", - " index=pd.date_range(start=\"2021-01-01\", periods=10),\n", - " columns=[\"Tokyo\", \"Beijing\"])\n", + "weather_df = pd.DataFrame(\n", + " np.random.default_rng().random((10, 2)) * 5,\n", + " index=pd.date_range(start=\"2021-01-01\", periods=10),\n", + " columns=[\"Tokyo\", \"Beijing\"],\n", + ")\n", "\n", - "def rain_condition(v): \n", + "\n", + "def rain_condition(v):\n", " if v < 1.75:\n", " return \"Dry\"\n", " elif v < 2.75:\n", " return \"Rain\"\n", " return \"Heavy Rain\"\n", "\n", + "\n", "def make_pretty(styler):\n", " styler.set_caption(\"Weather Conditions\")\n", " styler.format(rain_condition)\n", @@ -122,6 +109,7 @@ " styler.background_gradient(axis=None, vmin=1, vmax=5, cmap=\"YlGnBu\")\n", " return styler\n", "\n", + "\n", "weather_df" ] }, @@ -157,10 +145,8 @@ "metadata": {}, "outputs": [], "source": [ - "df = pd.DataFrame(np.random.randn(5, 5))\n", - "df.style \\\n", - " .hide(subset=[0, 2, 4], axis=0) \\\n", - " .hide(subset=[0, 2, 4], axis=1)" + "df = pd.DataFrame(np.random.default_rng().standard_normal((5, 5)))\n", + "df.style.hide(subset=[0, 2, 4], axis=0).hide(subset=[0, 2, 4], axis=1)" ] }, { @@ -177,9 +163,9 @@ "outputs": [], "source": [ "show = [0, 2, 4]\n", - "df.style \\\n", - " .hide([row for row in df.index if row not in show], axis=0) \\\n", - " .hide([col for col in df.columns if col not in show], 
axis=1)" + "df.style.hide([row for row in df.index if row not in show], axis=0).hide(\n", + " [col for col in df.columns if col not in show], axis=1\n", + ")" ] }, { @@ -199,9 +185,9 @@ "metadata": {}, "outputs": [], "source": [ - "summary_styler = df.agg([\"sum\", \"mean\"]).style \\\n", - " .format(precision=3) \\\n", - " .relabel_index([\"Sum\", \"Average\"])\n", + "summary_styler = (\n", + " df.agg([\"sum\", \"mean\"]).style.format(precision=3).relabel_index([\"Sum\", \"Average\"])\n", + ")\n", "df.style.format(precision=1).concat(summary_styler)" ] }, @@ -211,7 +197,7 @@ "source": [ "## Styler Object and HTML \n", "\n", - "The [Styler][styler] was originally constructed to support the wide array of HTML formatting options. Its HTML output creates an HTML `` and leverages CSS styling language to manipulate many parameters including colors, fonts, borders, background, etc. See [here][w3schools] for more information on styling HTML tables. This allows a lot of flexibility out of the box, and even enables web developers to integrate DataFrames into their exiting user interface designs.\n", + "The [Styler][styler] was originally constructed to support the wide array of HTML formatting options. Its HTML output creates an HTML `
    ` and leverages CSS styling language to manipulate many parameters including colors, fonts, borders, background, etc. See [here][w3schools] for more information on styling HTML tables. This allows a lot of flexibility out of the box, and even enables web developers to integrate DataFrames into their existing user interface designs.\n", "\n", "Below we demonstrate the default output, which looks very similar to the standard DataFrame HTML representation. But the HTML here has already attached some CSS classes to each cell, even if we haven't yet created any styles. We can view these by calling the [.to_html()][tohtml] method, which returns the raw HTML as string, which is useful for further processing or adding to a file - read on in [More about CSS and HTML](#More-About-CSS-and-HTML). This section will also provide a walkthrough for how to convert this default output to represent a DataFrame output that is more communicative. For example how we can build `s`:\n", "\n", @@ -227,9 +213,16 @@ "metadata": {}, "outputs": [], "source": [ - "df = pd.DataFrame([[38.0, 2.0, 18.0, 22.0, 21, np.nan],[19, 439, 6, 452, 226,232]], \n", - " index=pd.Index(['Tumour (Positive)', 'Non-Tumour (Negative)'], name='Actual Label:'), \n", - " columns=pd.MultiIndex.from_product([['Decision Tree', 'Regression', 'Random'],['Tumour', 'Non-Tumour']], names=['Model:', 'Predicted:']))\n", + "idx = pd.Index([\"Tumour (Positive)\", \"Non-Tumour (Negative)\"], name=\"Actual Label:\")\n", + "cols = pd.MultiIndex.from_product(\n", + " [[\"Decision Tree\", \"Regression\", \"Random\"], [\"Tumour\", \"Non-Tumour\"]],\n", + " names=[\"Model:\", \"Predicted:\"],\n", + ")\n", + "df = pd.DataFrame(\n", + " [[38.0, 2.0, 18.0, 22.0, 21, np.nan], [19, 439, 6, 452, 226, 232]],\n", + " index=idx,\n", + " columns=cols,\n", + ")\n", "df.style" ] }, @@ -242,63 +235,68 @@ "outputs": [], "source": [ "# Hidden cell to just create the below example: code is covered throughout the guide.\n", - "s = df.style\\\n", - " .hide([('Random', 'Tumour'), ('Random', 'Non-Tumour')], axis='columns')\\\n", - " .format('{:.0f}')\\\n", - " .set_table_styles([{\n", - " 'selector': '',\n", - " 'props': 'border-collapse: separate;'\n", - " },{\n", - " 'selector': 'caption',\n", - " 'props': 'caption-side: bottom; font-size:1.3em;'\n", - " },{\n", - " 'selector': '.index_name',\n", - " 'props': 'font-style: italic; color: darkgrey; font-weight:normal;'\n", - " },{\n", - " 'selector': 'th:not(.index_name)',\n", - " 'props': 'background-color: #000066; color: white;'\n", - " },{\n", - " 'selector': 'th.col_heading',\n", - " 'props': 'text-align: center;'\n", - " },{\n", - " 'selector': 'th.col_heading.level0',\n", - " 'props': 'font-size: 1.5em;'\n", - " },{\n", - " 'selector': 'th.col2',\n", - " 'props': 'border-left: 1px solid white;'\n", - " },{\n", - " 'selector': '.col2',\n", - " 'props': 'border-left: 1px solid #000066;'\n", - " },{\n", - " 'selector': 'td',\n", - " 'props': 'text-align: center; font-weight:bold;'\n", - " },{\n", - " 'selector': '.true',\n", - " 'props': 'background-color: #e6ffe6;'\n", - " },{\n", - " 'selector': '.false',\n", - " 'props': 'background-color: #ffe6e6;'\n", - " },{\n", - " 'selector': '.border-red',\n", - " 'props': 'border: 2px dashed red;'\n", - " },{\n", - " 'selector': '.border-green',\n", - " 'props': 'border: 2px dashed green;'\n", - " },{\n", - " 'selector': 'td:hover',\n", - " 'props': 'background-color: #ffffb3;'\n", - " }])\\\n", - " .set_td_classes(pd.DataFrame([['true border-green', 'false', 'true', 'false 
border-red', '', ''],\n", - " ['false', 'true', 'false', 'true', '', '']], \n", - " index=df.index, columns=df.columns))\\\n", - " .set_caption(\"Confusion matrix for multiple cancer prediction models.\")\\\n", - " .set_tooltips(pd.DataFrame([['This model has a very strong true positive rate', '', '', \"This model's total number of false negatives is too high\", '', ''],\n", - " ['', '', '', '', '', '']], \n", - " index=df.index, columns=df.columns),\n", - " css_class='pd-tt', props=\n", - " 'visibility: hidden; position: absolute; z-index: 1; border: 1px solid #000066;'\n", - " 'background-color: white; color: #000066; font-size: 0.8em;' \n", - " 'transform: translate(0px, -24px); padding: 0.6em; border-radius: 0.5em;')\n" + "s = (\n", + " df.style.hide([(\"Random\", \"Tumour\"), (\"Random\", \"Non-Tumour\")], axis=\"columns\")\n", + " .format(\"{:.0f}\")\n", + " .set_table_styles(\n", + " [\n", + " {\"selector\": \"\", \"props\": \"border-collapse: separate;\"},\n", + " {\"selector\": \"caption\", \"props\": \"caption-side: bottom; font-size:1.3em;\"},\n", + " {\n", + " \"selector\": \".index_name\",\n", + " \"props\": \"font-style: italic; color: darkgrey; font-weight:normal;\",\n", + " },\n", + " {\n", + " \"selector\": \"th:not(.index_name)\",\n", + " \"props\": \"background-color: #000066; color: white;\",\n", + " },\n", + " {\"selector\": \"th.col_heading\", \"props\": \"text-align: center;\"},\n", + " {\"selector\": \"th.col_heading.level0\", \"props\": \"font-size: 1.5em;\"},\n", + " {\"selector\": \"th.col2\", \"props\": \"border-left: 1px solid white;\"},\n", + " {\"selector\": \".col2\", \"props\": \"border-left: 1px solid #000066;\"},\n", + " {\"selector\": \"td\", \"props\": \"text-align: center; font-weight:bold;\"},\n", + " {\"selector\": \".true\", \"props\": \"background-color: #e6ffe6;\"},\n", + " {\"selector\": \".false\", \"props\": \"background-color: #ffe6e6;\"},\n", + " {\"selector\": \".border-red\", \"props\": \"border: 2px dashed red;\"},\n", + " {\"selector\": \".border-green\", \"props\": \"border: 2px dashed green;\"},\n", + " {\"selector\": \"td:hover\", \"props\": \"background-color: #ffffb3;\"},\n", + " ]\n", + " )\n", + " .set_td_classes(\n", + " pd.DataFrame(\n", + " [\n", + " [\"true border-green\", \"false\", \"true\", \"false border-red\", \"\", \"\"],\n", + " [\"false\", \"true\", \"false\", \"true\", \"\", \"\"],\n", + " ],\n", + " index=df.index,\n", + " columns=df.columns,\n", + " )\n", + " )\n", + " .set_caption(\"Confusion matrix for multiple cancer prediction models.\")\n", + " .set_tooltips(\n", + " pd.DataFrame(\n", + " [\n", + " [\n", + " \"This model has a very strong true positive rate\",\n", + " \"\",\n", + " \"\",\n", + " \"This model's total number of false negatives is too high\",\n", + " \"\",\n", + " \"\",\n", + " ],\n", + " [\"\", \"\", \"\", \"\", \"\", \"\"],\n", + " ],\n", + " index=df.index,\n", + " columns=df.columns,\n", + " ),\n", + " css_class=\"pd-tt\",\n", + " props=\"visibility: hidden; \"\n", + " \"position: absolute; z-index: 1; \"\n", + " \"border: 1px solid #000066;\"\n", + " \"background-color: white; color: #000066; font-size: 0.8em;\"\n", + " \"transform: translate(0px, -24px); padding: 0.6em; border-radius: 0.5em;\",\n", + " )\n", + ")" ] }, { @@ -325,7 +323,9 @@ "metadata": {}, "outputs": [], "source": [ - "s = df.style.format('{:.0f}').hide([('Random', 'Tumour'), ('Random', 'Non-Tumour')], axis=\"columns\")\n", + "s = df.style.format(\"{:.0f}\").hide(\n", + " [(\"Random\", \"Tumour\"), (\"Random\", 
\"Non-Tumour\")], axis=\"columns\"\n", + ")\n", "s" ] }, @@ -337,8 +337,8 @@ }, "outputs": [], "source": [ - "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting \n", - "s.set_uuid('after_hide')" + "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting\n", + "s.set_uuid(\"after_hide\")" ] }, { @@ -351,7 +351,7 @@ "\n", "- Using [.set_table_styles()][table] to control broader areas of the table with specified internal CSS. Although table styles allow the flexibility to add CSS selectors and properties controlling all individual parts of the table, they are unwieldy for individual cell specifications. Also, note that table styles cannot be exported to Excel. \n", "- Using [.set_td_classes()][td_class] to directly link either external CSS classes to your data cells or link the internal CSS classes created by [.set_table_styles()][table]. See [here](#Setting-Classes-and-Linking-to-External-CSS). These cannot be used on column header rows or indexes, and also won't export to Excel. \n", - "- Using the [.apply()][apply] and [.map()][map] functions to add direct internal CSS to specific data cells. See [here](#Styler-Functions). As of v1.4.0 there are also methods that work directly on column header rows or indexes; [.apply_index()][applyindex] and [.map_index()][mapindex]. Note that only these methods add styles that will export to Excel. These methods work in a similar way to [DataFrame.apply()][dfapply] and [DataFrame.map()][dfmap].\n", + "- Using the [.apply()][apply] and [.map()][map] functions to add direct internal CSS to specific data cells. See [here](#Styler-Functions). As of v1.4.0 there are also methods that work directly on column header rows or indexes: [.apply_index()][applyindex] and [.map_index()][mapindex]. Note that only these methods add styles that will export to Excel. These methods work in a similar way to [DataFrame.apply()][dfapply] and [DataFrame.map()][dfmap].\n", "\n", "[table]: ../reference/api/pandas.io.formats.style.Styler.set_table_styles.rst\n", "[styler]: ../reference/api/pandas.io.formats.style.Styler.rst\n", @@ -395,16 +395,16 @@ "outputs": [], "source": [ "cell_hover = { # for row hover use instead of \".format(align)\n", - " for series in [test1,test2,test3, test4]:\n", + " for series in [test1, test2, test3, test4]:\n", " s = series.copy()\n", - " s.name=''\n", - " row += \"\".format(s.to_frame().style.hide(axis='index').bar(align=align, \n", - " color=['#d65f5f', '#5fba7d'], \n", - " width=100).to_html()) #testn['width']\n", - " row += ''\n", + " s.name = \"\"\n", + " row += \"\".format(\n", + " s.to_frame()\n", + " .style.hide(axis=\"index\")\n", + " .bar(align=align, color=[\"#d65f5f\", \"#5fba7d\"], width=100)\n", + " .to_html()\n", + " ) # testn['width']\n", + " row += \"\"\n", " head += row\n", - " \n", - "head+= \"\"\"\n", + "\n", + "head += \"\"\"\n", "\n", "
    \n", - " 'selector': 'td:hover',\n", - " 'props': [('background-color', '#ffffb3')]\n", + " \"selector\": \"td:hover\",\n", + " \"props\": [(\"background-color\", \"#ffffb3\")],\n", "}\n", "index_names = {\n", - " 'selector': '.index_name',\n", - " 'props': 'font-style: italic; color: darkgrey; font-weight:normal;'\n", + " \"selector\": \".index_name\",\n", + " \"props\": \"font-style: italic; color: darkgrey; font-weight:normal;\",\n", "}\n", "headers = {\n", - " 'selector': 'th:not(.index_name)',\n", - " 'props': 'background-color: #000066; color: white;'\n", + " \"selector\": \"th:not(.index_name)\",\n", + " \"props\": \"background-color: #000066; color: white;\",\n", "}\n", "s.set_table_styles([cell_hover, index_names, headers])" ] @@ -417,8 +417,8 @@ }, "outputs": [], "source": [ - "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting \n", - "s.set_uuid('after_tab_styles1')" + "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting\n", + "s.set_uuid(\"after_tab_styles1\")" ] }, { @@ -434,11 +434,14 @@ "metadata": {}, "outputs": [], "source": [ - "s.set_table_styles([\n", - " {'selector': 'th.col_heading', 'props': 'text-align: center;'},\n", - " {'selector': 'th.col_heading.level0', 'props': 'font-size: 1.5em;'},\n", - " {'selector': 'td', 'props': 'text-align: center; font-weight: bold;'},\n", - "], overwrite=False)" + "s.set_table_styles(\n", + " [\n", + " {\"selector\": \"th.col_heading\", \"props\": \"text-align: center;\"},\n", + " {\"selector\": \"th.col_heading.level0\", \"props\": \"font-size: 1.5em;\"},\n", + " {\"selector\": \"td\", \"props\": \"text-align: center; font-weight: bold;\"},\n", + " ],\n", + " overwrite=False,\n", + ")" ] }, { @@ -449,8 +452,8 @@ }, "outputs": [], "source": [ - "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting \n", - "s.set_uuid('after_tab_styles2')" + "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting\n", + "s.set_uuid(\"after_tab_styles2\")" ] }, { @@ -468,10 +471,16 @@ "metadata": {}, "outputs": [], "source": [ - "s.set_table_styles({\n", - " ('Regression', 'Tumour'): [{'selector': 'th', 'props': 'border-left: 1px solid white'},\n", - " {'selector': 'td', 'props': 'border-left: 1px solid #000066'}]\n", - "}, overwrite=False, axis=0)" + "s.set_table_styles(\n", + " {\n", + " (\"Regression\", \"Tumour\"): [\n", + " {\"selector\": \"th\", \"props\": \"border-left: 1px solid white\"},\n", + " {\"selector\": \"td\", \"props\": \"border-left: 1px solid #000066\"},\n", + " ]\n", + " },\n", + " overwrite=False,\n", + " axis=0,\n", + ")" ] }, { @@ -482,8 +491,8 @@ }, "outputs": [], "source": [ - "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting \n", - "s.set_uuid('xyz01')" + "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting\n", + "s.set_uuid(\"xyz01\")" ] }, { @@ -508,7 +517,7 @@ "outputs": [], "source": [ "out = s.set_table_attributes('class=\"my-table-cls\"').to_html()\n", - "print(out[out.find('
    \"\"\"" ] @@ -1463,11 +1532,12 @@ "metadata": {}, "outputs": [], "source": [ - "style1 = df2.style\\\n", - " .map(style_negative, props='color:red;')\\\n", - " .map(lambda v: 'opacity: 20%;' if (v < 0.3) and (v > -0.3) else None)\\\n", - " .set_table_styles([{\"selector\": \"th\", \"props\": \"color: blue;\"}])\\\n", - " .hide(axis=\"index\")\n", + "style1 = (\n", + " df2.style.map(style_negative, props=\"color:red;\")\n", + " .map(lambda v: \"opacity: 20%;\" if (v < 0.3) and (v > -0.3) else None)\n", + " .set_table_styles([{\"selector\": \"th\", \"props\": \"color: blue;\"}])\n", + " .hide(axis=\"index\")\n", + ")\n", "style1" ] }, @@ -1526,11 +1596,14 @@ "outputs": [], "source": [ "from ipywidgets import widgets\n", + "\n", + "\n", "@widgets.interact\n", - "def f(h_neg=(0, 359, 1), h_pos=(0, 359), s=(0., 99.9), l=(0., 99.9)):\n", + "def f(h_neg=(0, 359, 1), h_pos=(0, 359), s=(0.0, 99.9), l_post=(0.0, 99.9)):\n", " return df2.style.background_gradient(\n", - " cmap=sns.palettes.diverging_palette(h_neg=h_neg, h_pos=h_pos, s=s, l=l,\n", - " as_cmap=True)\n", + " cmap=sns.palettes.diverging_palette(\n", + " h_neg=h_neg, h_pos=h_pos, s=s, l=l_post, as_cmap=True\n", + " )\n", " )" ] }, @@ -1548,16 +1621,15 @@ "outputs": [], "source": [ "def magnify():\n", - " return [dict(selector=\"th\",\n", - " props=[(\"font-size\", \"4pt\")]),\n", - " dict(selector=\"td\",\n", - " props=[('padding', \"0em 0em\")]),\n", - " dict(selector=\"th:hover\",\n", - " props=[(\"font-size\", \"12pt\")]),\n", - " dict(selector=\"tr:hover td:hover\",\n", - " props=[('max-width', '200px'),\n", - " ('font-size', '12pt')])\n", - "]" + " return [\n", + " {\"selector\": \"th\", \"props\": [(\"font-size\", \"4pt\")]},\n", + " {\"selector\": \"td\", \"props\": [(\"padding\", \"0em 0em\")]},\n", + " {\"selector\": \"th:hover\", \"props\": [(\"font-size\", \"12pt\")]},\n", + " {\n", + " \"selector\": \"tr:hover td:hover\",\n", + " \"props\": [(\"max-width\", \"200px\"), (\"font-size\", \"12pt\")],\n", + " },\n", + " ]" ] }, { @@ -1566,15 +1638,12 @@ "metadata": {}, "outputs": [], "source": [ - "np.random.seed(25)\n", - "cmap = cmap=sns.diverging_palette(5, 250, as_cmap=True)\n", - "bigdf = pd.DataFrame(np.random.randn(20, 25)).cumsum()\n", + "cmap = sns.diverging_palette(5, 250, as_cmap=True)\n", + "bigdf = pd.DataFrame(np.random.default_rng(25).standard_normal((20, 25))).cumsum()\n", "\n", - "bigdf.style.background_gradient(cmap, axis=1)\\\n", - " .set_properties(**{'max-width': '80px', 'font-size': '1pt'})\\\n", - " .set_caption(\"Hover to magnify\")\\\n", - " .format(precision=2)\\\n", - " .set_table_styles(magnify())" + "bigdf.style.background_gradient(cmap, axis=1).set_properties(\n", + " **{\"max-width\": \"80px\", \"font-size\": \"1pt\"}\n", + ").set_caption(\"Hover to magnify\").format(precision=2).set_table_styles(magnify())" ] }, { @@ -1594,7 +1663,7 @@ "metadata": {}, "outputs": [], "source": [ - "bigdf = pd.DataFrame(np.random.randn(16, 100))\n", + "bigdf = pd.DataFrame(np.random.default_rng().standard_normal((16, 100)))\n", "bigdf.style.set_sticky(axis=\"index\")" ] }, @@ -1611,8 +1680,8 @@ "metadata": {}, "outputs": [], "source": [ - "bigdf.index = pd.MultiIndex.from_product([[\"A\",\"B\"],[0,1],[0,1,2,3]])\n", - "bigdf.style.set_sticky(axis=\"index\", pixel_size=18, levels=[1,2])" + "bigdf.index = pd.MultiIndex.from_product([[\"A\", \"B\"], [0, 1], [0, 1, 2, 3]])\n", + "bigdf.style.set_sticky(axis=\"index\", pixel_size=18, levels=[1, 2])" ] }, { @@ -1621,7 +1690,9 @@ "source": [ "### HTML Escaping\n", "\n", - 
"Suppose you have to display HTML within HTML, that can be a bit of pain when the renderer can't distinguish. You can use the `escape` formatting option to handle this, and even use it within a formatter that contains HTML itself." + "Suppose you have to display HTML within HTML, that can be a bit of pain when the renderer can't distinguish. You can use the `escape` formatting option to handle this, and even use it within a formatter that contains HTML itself.\n", + "\n", + "Note that if you're using `Styler` on untrusted, user-provided input to serve HTML then you should escape the input to prevent security vulnerabilities. See the Jinja2 documentation for more." ] }, { @@ -1630,7 +1701,7 @@ "metadata": {}, "outputs": [], "source": [ - "df4 = pd.DataFrame([['
    ', '\"&other\"', '']])\n", + "df4 = pd.DataFrame([[\"
    \", '\"&other\"', \"\"]])\n", "df4.style" ] }, @@ -1649,7 +1720,9 @@ "metadata": {}, "outputs": [], "source": [ - "df4.style.format('{}', escape=\"html\")" + "df4.style.format(\n", + " '{}', escape=\"html\"\n", + ")" ] }, { @@ -1691,10 +1764,9 @@ "metadata": {}, "outputs": [], "source": [ - "df2.style.\\\n", - " map(style_negative, props='color:red;').\\\n", - " highlight_max(axis=0).\\\n", - " to_excel('styled.xlsx', engine='openpyxl')" + "df2.style.map(style_negative, props=\"color:red;\").highlight_max(axis=0).to_excel(\n", + " \"styled.xlsx\", engine=\"openpyxl\"\n", + ")" ] }, { @@ -1763,7 +1835,11 @@ "metadata": {}, "outputs": [], "source": [ - "print(pd.DataFrame([[1,2],[3,4]], index=['i1', 'i2'], columns=['c1', 'c2']).style.to_html())" + "print(\n", + " pd.DataFrame(\n", + " [[1, 2], [3, 4]], index=[\"i1\", \"i2\"], columns=[\"c1\", \"c2\"]\n", + " ).style.to_html()\n", + ")" ] }, { @@ -1781,9 +1857,8 @@ "metadata": {}, "outputs": [], "source": [ - "df4 = pd.DataFrame([['text']])\n", - "df4.style.map(lambda x: 'color:green;')\\\n", - " .map(lambda x: 'color:red;')" + "df4 = pd.DataFrame([[\"text\"]])\n", + "df4.style.map(lambda x: \"color:green;\").map(lambda x: \"color:red;\")" ] }, { @@ -1792,8 +1867,7 @@ "metadata": {}, "outputs": [], "source": [ - "df4.style.map(lambda x: 'color:red;')\\\n", - " .map(lambda x: 'color:green;')" + "df4.style.map(lambda x: \"color:red;\").map(lambda x: \"color:green;\")" ] }, { @@ -1818,9 +1892,9 @@ "metadata": {}, "outputs": [], "source": [ - "df4.style.set_uuid('a_')\\\n", - " .set_table_styles([{'selector': 'td', 'props': 'color:red;'}])\\\n", - " .map(lambda x: 'color:green;')" + "df4.style.set_uuid(\"a_\").set_table_styles(\n", + " [{\"selector\": \"td\", \"props\": \"color:red;\"}]\n", + ").map(lambda x: \"color:green;\")" ] }, { @@ -1836,11 +1910,12 @@ "metadata": {}, "outputs": [], "source": [ - "df4.style.set_uuid('b_')\\\n", - " .set_table_styles([{'selector': 'td', 'props': 'color:red;'},\n", - " {'selector': '.cls-1', 'props': 'color:blue;'}])\\\n", - " .map(lambda x: 'color:green;')\\\n", - " .set_td_classes(pd.DataFrame([['cls-1']]))" + "df4.style.set_uuid(\"b_\").set_table_styles(\n", + " [\n", + " {\"selector\": \"td\", \"props\": \"color:red;\"},\n", + " {\"selector\": \".cls-1\", \"props\": \"color:blue;\"},\n", + " ]\n", + ").map(lambda x: \"color:green;\").set_td_classes(pd.DataFrame([[\"cls-1\"]]))" ] }, { @@ -1856,12 +1931,13 @@ "metadata": {}, "outputs": [], "source": [ - "df4.style.set_uuid('c_')\\\n", - " .set_table_styles([{'selector': 'td', 'props': 'color:red;'},\n", - " {'selector': '.cls-1', 'props': 'color:blue;'},\n", - " {'selector': 'td.data', 'props': 'color:yellow;'}])\\\n", - " .map(lambda x: 'color:green;')\\\n", - " .set_td_classes(pd.DataFrame([['cls-1']]))" + "df4.style.set_uuid(\"c_\").set_table_styles(\n", + " [\n", + " {\"selector\": \"td\", \"props\": \"color:red;\"},\n", + " {\"selector\": \".cls-1\", \"props\": \"color:blue;\"},\n", + " {\"selector\": \"td.data\", \"props\": \"color:yellow;\"},\n", + " ]\n", + ").map(lambda x: \"color:green;\").set_td_classes(pd.DataFrame([[\"cls-1\"]]))" ] }, { @@ -1879,12 +1955,13 @@ "metadata": {}, "outputs": [], "source": [ - "df4.style.set_uuid('d_')\\\n", - " .set_table_styles([{'selector': 'td', 'props': 'color:red;'},\n", - " {'selector': '.cls-1', 'props': 'color:blue;'},\n", - " {'selector': 'td.data', 'props': 'color:yellow;'}])\\\n", - " .map(lambda x: 'color:green !important;')\\\n", - " .set_td_classes(pd.DataFrame([['cls-1']]))" + 
"df4.style.set_uuid(\"d_\").set_table_styles(\n", + " [\n", + " {\"selector\": \"td\", \"props\": \"color:red;\"},\n", + " {\"selector\": \".cls-1\", \"props\": \"color:blue;\"},\n", + " {\"selector\": \"td.data\", \"props\": \"color:yellow;\"},\n", + " ]\n", + ").map(lambda x: \"color:green !important;\").set_td_classes(pd.DataFrame([[\"cls-1\"]]))" ] }, { @@ -1906,7 +1983,7 @@ "- Provide an API that is pleasing to use interactively and is \"good enough\" for many tasks\n", "- Provide the foundations for dedicated libraries to build on\n", "\n", - "If you build a great library on top of this, let us know and we'll [link](https://pandas.pydata.org/pandas-docs/stable/ecosystem.html) to it.\n", + "If you build a great library on top of this, let us know and we'll [link](https://pandas.pydata.org/community/ecosystem.html) to it.\n", "\n", "### Subclassing\n", "\n", @@ -1938,8 +2015,8 @@ "metadata": {}, "outputs": [], "source": [ - "with open(\"templates/myhtml.tpl\") as f:\n", - " print(f.read())" + "with open(\"templates/myhtml.tpl\") as f_html:\n", + " print(f_html.read())" ] }, { @@ -1958,10 +2035,12 @@ "source": [ "class MyStyler(Styler):\n", " env = Environment(\n", - " loader=ChoiceLoader([\n", - " FileSystemLoader(\"templates\"), # contains ours\n", - " Styler.loader, # the default\n", - " ])\n", + " loader=ChoiceLoader(\n", + " [\n", + " FileSystemLoader(\"templates\"), # contains ours\n", + " Styler.loader, # the default\n", + " ]\n", + " )\n", " )\n", " template_html_table = env.get_template(\"myhtml.tpl\")" ] @@ -2043,8 +2122,8 @@ }, "outputs": [], "source": [ - "with open(\"templates/html_style_structure.html\") as f:\n", - " style_structure = f.read()" + "with open(\"templates/html_style_structure.html\") as f_sty:\n", + " style_structure = f_sty.read()" ] }, { @@ -2071,8 +2150,8 @@ }, "outputs": [], "source": [ - "with open(\"templates/html_table_structure.html\") as f:\n", - " table_structure = f.read()" + "with open(\"templates/html_table_structure.html\") as f_table_struct:\n", + " table_structure = f_table_struct.read()" ] }, { @@ -2104,7 +2183,7 @@ "# from IPython.display import HTML\n", "# with open(\"themes/nature_with_gtoc/static/nature.css_t\") as f:\n", "# css = f.read()\n", - " \n", + "\n", "# HTML(''.format(css))" ] } diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index cf27fc8385223..3bb151a2dd339 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -13,7 +13,7 @@ Text data types There are two ways to store text data in pandas: -1. ``object`` -dtype NumPy array. +1. ``object`` dtype NumPy array. 2. :class:`StringDtype` extension type. We recommend using :class:`StringDtype` to store text data. @@ -40,20 +40,20 @@ to significantly increase the performance and lower the memory overhead of and parts of the API may change without warning. For backwards-compatibility, ``object`` dtype remains the default type we -infer a list of strings to +infer a list of strings to: .. ipython:: python pd.Series(["a", "b", "c"]) -To explicitly request ``string`` dtype, specify the ``dtype`` +To explicitly request ``string`` dtype, specify the ``dtype``: .. ipython:: python pd.Series(["a", "b", "c"], dtype="string") pd.Series(["a", "b", "c"], dtype=pd.StringDtype()) -Or ``astype`` after the ``Series`` or ``DataFrame`` is created +Or ``astype`` after the ``Series`` or ``DataFrame`` is created: .. 
ipython:: python @@ -88,9 +88,9 @@ Behavior differences ^^^^^^^^^^^^^^^^^^^^ These are places where the behavior of ``StringDtype`` objects differ from -``object`` dtype +``object`` dtype: -l. For ``StringDtype``, :ref:`string accessor methods` +1. For ``StringDtype``, :ref:`string accessor methods` that return **numeric** output will always return a nullable integer dtype, rather than either int or float dtype, depending on the presence of NA values. Methods returning **boolean** output will return a nullable boolean dtype. @@ -102,7 +102,7 @@ l. For ``StringDtype``, :ref:`string accessor methods` s.str.count("a") s.dropna().str.count("a") - Both outputs are ``Int64`` dtype. Compare that with object-dtype + Both outputs are ``Int64`` dtype. Compare that with object-dtype: .. ipython:: python @@ -204,7 +204,7 @@ and replacing any remaining whitespaces with underscores: .. warning:: - The type of the Series is inferred and the allowed types (i.e. strings). + The type of the Series is inferred and is one among the allowed types (i.e. strings). Generally speaking, the ``.str`` accessor is intended to work only on strings. With very few exceptions, other uses are not supported, and may be disabled at a later point. @@ -332,8 +332,8 @@ regular expression object will raise a ``ValueError``. --------------------------------------------------------------------------- ValueError: case and flags cannot be set when pat is a compiled regex -``removeprefix`` and ``removesuffix`` have the same effect as ``str.removeprefix`` and ``str.removesuffix`` added in Python 3.9 -`__: +``removeprefix`` and ``removesuffix`` have the same effect as ``str.removeprefix`` and ``str.removesuffix`` added in +`Python 3.9 `__: .. versionadded:: 1.4.0 @@ -726,57 +726,56 @@ Method summary .. csv-table:: :header: "Method", "Description" :widths: 20, 80 - :delim: ; - - :meth:`~Series.str.cat`;Concatenate strings - :meth:`~Series.str.split`;Split strings on delimiter - :meth:`~Series.str.rsplit`;Split strings on delimiter working from the end of the string - :meth:`~Series.str.get`;Index into each element (retrieve i-th element) - :meth:`~Series.str.join`;Join strings in each element of the Series with passed separator - :meth:`~Series.str.get_dummies`;Split strings on the delimiter returning DataFrame of dummy variables - :meth:`~Series.str.contains`;Return boolean array if each string contains pattern/regex - :meth:`~Series.str.replace`;Replace occurrences of pattern/regex/string with some other string or the return value of a callable given the occurrence - :meth:`~Series.str.removeprefix`;Remove prefix from string, i.e. only remove if string starts with prefix. - :meth:`~Series.str.removesuffix`;Remove suffix from string, i.e. only remove if string ends with suffix. 
- :meth:`~Series.str.repeat`;Duplicate values (``s.str.repeat(3)`` equivalent to ``x * 3``) - :meth:`~Series.str.pad`;"Add whitespace to left, right, or both sides of strings" - :meth:`~Series.str.center`;Equivalent to ``str.center`` - :meth:`~Series.str.ljust`;Equivalent to ``str.ljust`` - :meth:`~Series.str.rjust`;Equivalent to ``str.rjust`` - :meth:`~Series.str.zfill`;Equivalent to ``str.zfill`` - :meth:`~Series.str.wrap`;Split long strings into lines with length less than a given width - :meth:`~Series.str.slice`;Slice each string in the Series - :meth:`~Series.str.slice_replace`;Replace slice in each string with passed value - :meth:`~Series.str.count`;Count occurrences of pattern - :meth:`~Series.str.startswith`;Equivalent to ``str.startswith(pat)`` for each element - :meth:`~Series.str.endswith`;Equivalent to ``str.endswith(pat)`` for each element - :meth:`~Series.str.findall`;Compute list of all occurrences of pattern/regex for each string - :meth:`~Series.str.match`;"Call ``re.match`` on each element, returning matched groups as list" - :meth:`~Series.str.extract`;"Call ``re.search`` on each element, returning DataFrame with one row for each element and one column for each regex capture group" - :meth:`~Series.str.extractall`;"Call ``re.findall`` on each element, returning DataFrame with one row for each match and one column for each regex capture group" - :meth:`~Series.str.len`;Compute string lengths - :meth:`~Series.str.strip`;Equivalent to ``str.strip`` - :meth:`~Series.str.rstrip`;Equivalent to ``str.rstrip`` - :meth:`~Series.str.lstrip`;Equivalent to ``str.lstrip`` - :meth:`~Series.str.partition`;Equivalent to ``str.partition`` - :meth:`~Series.str.rpartition`;Equivalent to ``str.rpartition`` - :meth:`~Series.str.lower`;Equivalent to ``str.lower`` - :meth:`~Series.str.casefold`;Equivalent to ``str.casefold`` - :meth:`~Series.str.upper`;Equivalent to ``str.upper`` - :meth:`~Series.str.find`;Equivalent to ``str.find`` - :meth:`~Series.str.rfind`;Equivalent to ``str.rfind`` - :meth:`~Series.str.index`;Equivalent to ``str.index`` - :meth:`~Series.str.rindex`;Equivalent to ``str.rindex`` - :meth:`~Series.str.capitalize`;Equivalent to ``str.capitalize`` - :meth:`~Series.str.swapcase`;Equivalent to ``str.swapcase`` - :meth:`~Series.str.normalize`;Return Unicode normal form. 
Equivalent to ``unicodedata.normalize`` - :meth:`~Series.str.translate`;Equivalent to ``str.translate`` - :meth:`~Series.str.isalnum`;Equivalent to ``str.isalnum`` - :meth:`~Series.str.isalpha`;Equivalent to ``str.isalpha`` - :meth:`~Series.str.isdigit`;Equivalent to ``str.isdigit`` - :meth:`~Series.str.isspace`;Equivalent to ``str.isspace`` - :meth:`~Series.str.islower`;Equivalent to ``str.islower`` - :meth:`~Series.str.isupper`;Equivalent to ``str.isupper`` - :meth:`~Series.str.istitle`;Equivalent to ``str.istitle`` - :meth:`~Series.str.isnumeric`;Equivalent to ``str.isnumeric`` - :meth:`~Series.str.isdecimal`;Equivalent to ``str.isdecimal`` + + :meth:`~Series.str.cat`,Concatenate strings + :meth:`~Series.str.split`,Split strings on delimiter + :meth:`~Series.str.rsplit`,Split strings on delimiter working from the end of the string + :meth:`~Series.str.get`,Index into each element (retrieve i-th element) + :meth:`~Series.str.join`,Join strings in each element of the Series with passed separator + :meth:`~Series.str.get_dummies`,Split strings on the delimiter returning DataFrame of dummy variables + :meth:`~Series.str.contains`,Return boolean array if each string contains pattern/regex + :meth:`~Series.str.replace`,Replace occurrences of pattern/regex/string with some other string or the return value of a callable given the occurrence + :meth:`~Series.str.removeprefix`,Remove prefix from string i.e. only remove if string starts with prefix. + :meth:`~Series.str.removesuffix`,Remove suffix from string i.e. only remove if string ends with suffix. + :meth:`~Series.str.repeat`,Duplicate values (``s.str.repeat(3)`` equivalent to ``x * 3``) + :meth:`~Series.str.pad`,Add whitespace to the sides of strings + :meth:`~Series.str.center`,Equivalent to ``str.center`` + :meth:`~Series.str.ljust`,Equivalent to ``str.ljust`` + :meth:`~Series.str.rjust`,Equivalent to ``str.rjust`` + :meth:`~Series.str.zfill`,Equivalent to ``str.zfill`` + :meth:`~Series.str.wrap`,Split long strings into lines with length less than a given width + :meth:`~Series.str.slice`,Slice each string in the Series + :meth:`~Series.str.slice_replace`,Replace slice in each string with passed value + :meth:`~Series.str.count`,Count occurrences of pattern + :meth:`~Series.str.startswith`,Equivalent to ``str.startswith(pat)`` for each element + :meth:`~Series.str.endswith`,Equivalent to ``str.endswith(pat)`` for each element + :meth:`~Series.str.findall`,Compute list of all occurrences of pattern/regex for each string + :meth:`~Series.str.match`,Call ``re.match`` on each element returning matched groups as list + :meth:`~Series.str.extract`,Call ``re.search`` on each element returning DataFrame with one row for each element and one column for each regex capture group + :meth:`~Series.str.extractall`,Call ``re.findall`` on each element returning DataFrame with one row for each match and one column for each regex capture group + :meth:`~Series.str.len`,Compute string lengths + :meth:`~Series.str.strip`,Equivalent to ``str.strip`` + :meth:`~Series.str.rstrip`,Equivalent to ``str.rstrip`` + :meth:`~Series.str.lstrip`,Equivalent to ``str.lstrip`` + :meth:`~Series.str.partition`,Equivalent to ``str.partition`` + :meth:`~Series.str.rpartition`,Equivalent to ``str.rpartition`` + :meth:`~Series.str.lower`,Equivalent to ``str.lower`` + :meth:`~Series.str.casefold`,Equivalent to ``str.casefold`` + :meth:`~Series.str.upper`,Equivalent to ``str.upper`` + :meth:`~Series.str.find`,Equivalent to ``str.find`` + :meth:`~Series.str.rfind`,Equivalent to 
``str.rfind`` + :meth:`~Series.str.index`,Equivalent to ``str.index`` + :meth:`~Series.str.rindex`,Equivalent to ``str.rindex`` + :meth:`~Series.str.capitalize`,Equivalent to ``str.capitalize`` + :meth:`~Series.str.swapcase`,Equivalent to ``str.swapcase`` + :meth:`~Series.str.normalize`,Return Unicode normal form. Equivalent to ``unicodedata.normalize`` + :meth:`~Series.str.translate`,Equivalent to ``str.translate`` + :meth:`~Series.str.isalnum`,Equivalent to ``str.isalnum`` + :meth:`~Series.str.isalpha`,Equivalent to ``str.isalpha`` + :meth:`~Series.str.isdigit`,Equivalent to ``str.isdigit`` + :meth:`~Series.str.isspace`,Equivalent to ``str.isspace`` + :meth:`~Series.str.islower`,Equivalent to ``str.islower`` + :meth:`~Series.str.isupper`,Equivalent to ``str.isupper`` + :meth:`~Series.str.istitle`,Equivalent to ``str.istitle`` + :meth:`~Series.str.isnumeric`,Equivalent to ``str.isnumeric`` + :meth:`~Series.str.isdecimal`,Equivalent to ``str.isdecimal`` diff --git a/doc/source/user_guide/timedeltas.rst b/doc/source/user_guide/timedeltas.rst index 5daf204f39bcf..01df17bac5fd7 100644 --- a/doc/source/user_guide/timedeltas.rst +++ b/doc/source/user_guide/timedeltas.rst @@ -35,7 +35,7 @@ You can construct a ``Timedelta`` scalar through various arguments, including `I pd.Timedelta(days=1, seconds=1) # integers with a unit - pd.Timedelta(1, unit="d") + pd.Timedelta(1, unit="D") # from a datetime.timedelta/np.timedelta64 pd.Timedelta(datetime.timedelta(days=1, seconds=1)) @@ -94,7 +94,7 @@ is numeric: .. ipython:: python pd.to_timedelta(np.arange(5), unit="s") - pd.to_timedelta(np.arange(5), unit="d") + pd.to_timedelta(np.arange(5), unit="D") .. warning:: If a string or array of strings is passed as an input then the ``unit`` keyword diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 0f0e6271d8329..10260cb011d90 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -326,7 +326,7 @@ which can be specified. These are computed from the starting point specified by .. note:: The ``unit`` parameter does not use the same strings as the ``format`` parameter - that was discussed :ref:`above`). The + that was discussed :ref:`above`. The available units are listed on the documentation for :func:`pandas.to_datetime`. Constructing a :class:`Timestamp` or :class:`DatetimeIndex` with an epoch timestamp @@ -797,8 +797,6 @@ There are several time/date properties that one can access from ``Timestamp`` or timetz,"Returns datetime.time as local time with timezone information" dayofyear,"The ordinal day of year" day_of_year,"The ordinal day of year" - weekofyear,"The week ordinal of the year" - week,"The week ordinal of the year" dayofweek,"The number of the day of the week with Monday=0, Sunday=6" day_of_week,"The number of the day of the week with Monday=0, Sunday=6" weekday,"The number of the day of the week with Monday=0, Sunday=6" @@ -812,6 +810,10 @@ There are several time/date properties that one can access from ``Timestamp`` or is_year_end,"Logical indicating if last day of year (defined by frequency)" is_leap_year,"Logical indicating if the date belongs to a leap year" +.. note:: + + You can use ``DatetimeIndex.isocalendar().week`` to access week of year date information. + Furthermore, if you have a ``Series`` with datetimelike values, then you can access these properties via the ``.dt`` accessor, as detailed in the section on :ref:`.dt accessors`. @@ -889,6 +891,10 @@ into ``freq`` keyword arguments. 
The available date offsets and associated frequ :class:`~pandas.tseries.offsets.BQuarterEnd`, ``'BQE``, "business quarter end" :class:`~pandas.tseries.offsets.BQuarterBegin`, ``'BQS'``, "business quarter begin" :class:`~pandas.tseries.offsets.FY5253Quarter`, ``'REQ'``, "retail (aka 52-53 week) quarter" + :class:`~pandas.tseries.offsets.HalfYearEnd`, ``'HYE'``, "calendar half year end" + :class:`~pandas.tseries.offsets.HalfYearBegin`, ``'HYS'``, "calendar half year begin" + :class:`~pandas.tseries.offsets.BHalfYearEnd`, ``'BHYE``, "business half year end" + :class:`~pandas.tseries.offsets.BHalfYearBegin`, ``'BHYS'``, "business half year begin" :class:`~pandas.tseries.offsets.YearEnd`, ``'YE'``, "calendar year end" :class:`~pandas.tseries.offsets.YearBegin`, ``'YS'`` or ``'BYS'``,"calendar year begin" :class:`~pandas.tseries.offsets.BYearEnd`, ``'BYE'``, "business year end" @@ -1271,6 +1277,10 @@ frequencies. We will refer to these aliases as *offset aliases*. are deprecated in favour of the aliases ``h``, ``bh``, ``cbh``, ``min``, ``s``, ``ms``, ``us``, and ``ns``. + Aliases ``Y``, ``M``, and ``Q`` are deprecated in favour of the aliases + ``YE``, ``ME``, ``QE``. + + .. note:: When using the offset aliases above, it should be noted that functions @@ -1327,8 +1337,8 @@ frequencies. We will refer to these aliases as *period aliases*. .. deprecated:: 2.2.0 - Aliases ``A``, ``H``, ``T``, ``S``, ``L``, ``U``, and ``N`` are deprecated in favour of the aliases - ``Y``, ``h``, ``min``, ``s``, ``ms``, ``us``, and ``ns``. + Aliases ``H``, ``T``, ``S``, ``L``, ``U``, and ``N`` are deprecated in favour of the aliases + ``h``, ``min``, ``s``, ``ms``, ``us``, and ``ns``. Combining aliases @@ -1466,11 +1476,16 @@ or some other non-observed day. Defined observance rules are: :header: "Rule", "Description" :widths: 15, 70 + "next_workday", "move Saturday and Sunday to Monday" + "previous_workday", "move Saturday and Sunday to Friday" "nearest_workday", "move Saturday to Friday and Sunday to Monday" + "before_nearest_workday", "apply ``nearest_workday`` and then move to previous workday before that day" + "after_nearest_workday", "apply ``nearest_workday`` and then move to next workday after that day" "sunday_to_monday", "move Sunday to following Monday" "next_monday_or_tuesday", "move Saturday to Monday and Sunday/Monday to Tuesday" - "previous_friday", move Saturday and Sunday to previous Friday" + "previous_friday", "move Saturday and Sunday to previous Friday" "next_monday", "move Saturday and Sunday to following Monday" + "weekend_to_monday", "same as ``next_monday``" An example of how holidays and holiday calendars are defined: @@ -1569,7 +1584,7 @@ the pandas objects. ts = ts[:5] ts.shift(1) -The ``shift`` method accepts an ``freq`` argument which can accept a +The ``shift`` method accepts a ``freq`` argument which can accept a ``DateOffset`` class or other ``timedelta``-like object or also an :ref:`offset alias `. @@ -1853,7 +1868,7 @@ to resample based on datetimelike column in the frame, it can passed to the ), ) df - df.resample("ME", on="date")[["a"]].sum() + df.resample("MS", on="date")[["a"]].sum() Similarly, if you instead want to resample by a datetimelike level of ``MultiIndex``, its name or location can be passed to the @@ -1861,7 +1876,7 @@ level of ``MultiIndex``, its name or location can be passed to the .. ipython:: python - df.resample("ME", level="d")[["a"]].sum() + df.resample("MS", level="d")[["a"]].sum() .. 
_timeseries.iterating-label: @@ -2326,7 +2341,7 @@ Time zone handling ------------------ pandas provides rich support for working with timestamps in different time -zones using the ``pytz`` and ``dateutil`` libraries or :class:`datetime.timezone` +zones using the ``zoneinfo``, ``pytz`` and ``dateutil`` libraries or :class:`datetime.timezone` objects from the standard library. @@ -2343,14 +2358,14 @@ By default, pandas objects are time zone unaware: To localize these dates to a time zone (assign a particular time zone to a naive date), you can use the ``tz_localize`` method or the ``tz`` keyword argument in :func:`date_range`, :class:`Timestamp`, or :class:`DatetimeIndex`. -You can either pass ``pytz`` or ``dateutil`` time zone objects or Olson time zone database strings. +You can either pass ``zoneinfo``, ``pytz`` or ``dateutil`` time zone objects or Olson time zone database strings. Olson time zone strings will return ``pytz`` time zone objects by default. To return ``dateutil`` time zone objects, append ``dateutil/`` before the string. -* In ``pytz`` you can find a list of common (and less common) time zones using - ``from pytz import common_timezones, all_timezones``. +* For ``zoneinfo``, a list of available timezones are available from :py:func:`zoneinfo.available_timezones`. +* In ``pytz`` you can find a list of common (and less common) time zones using ``pytz.all_timezones``. * ``dateutil`` uses the OS time zones so there isn't a fixed list available. For - common zones, the names are the same as ``pytz``. + common zones, the names are the same as ``pytz`` and ``zoneinfo``. .. ipython:: python @@ -2455,7 +2470,7 @@ you can use the ``tz_convert`` method. .. warning:: - If you are using dates beyond 2038-01-18, due to current deficiencies + If you are using dates beyond 2038-01-18 with ``pytz``, due to current deficiencies in the underlying libraries caused by the year 2038 problem, daylight saving time (DST) adjustments to timezone aware dates will not be applied. If and when the underlying libraries are fixed, the DST transitions will be applied. @@ -2464,9 +2479,11 @@ you can use the ``tz_convert`` method. .. ipython:: python + import pytz + d_2037 = "2037-03-31T010101" d_2038 = "2038-03-31T010101" - DST = "Europe/London" + DST = pytz.timezone("Europe/London") assert pd.Timestamp(d_2037, tz=DST) != pd.Timestamp(d_2037, tz="GMT") assert pd.Timestamp(d_2038, tz=DST) == pd.Timestamp(d_2038, tz="GMT") @@ -2556,8 +2573,8 @@ Ambiguous times when localizing because daylight savings time (DST) in a local time zone causes some times to occur twice within one day ("clocks fall back"). The following options are available: -* ``'raise'``: Raises a ``pytz.AmbiguousTimeError`` (the default behavior) -* ``'infer'``: Attempt to determine the correct offset base on the monotonicity of the timestamps +* ``'raise'``: Raises a ``ValueError`` (the default behavior) +* ``'infer'``: Attempt to determine the correct offset based on the monotonicity of the timestamps * ``'NaT'``: Replaces ambiguous times with ``NaT`` * ``bool``: ``True`` represents a DST time, ``False`` represents non-DST time. An array-like of ``bool`` values is supported for a sequence of times. @@ -2591,7 +2608,7 @@ A DST transition may also shift the local time ahead by 1 hour creating nonexist local times ("clocks spring forward"). The behavior of localizing a timeseries with nonexistent times can be controlled by the ``nonexistent`` argument. 
@@ -2591,7 +2608,7 @@ A DST transition may also shift the local time ahead by 1 hour creating nonexistent
 local times ("clocks spring forward"). The behavior of localizing a timeseries with nonexistent times
 can be controlled by the ``nonexistent`` argument. The following options are available:
 
-* ``'raise'``: Raises a ``pytz.NonExistentTimeError`` (the default behavior)
+* ``'raise'``: Raises a ``ValueError`` (the default behavior)
 * ``'NaT'``: Replaces nonexistent times with ``NaT``
 * ``'shift_forward'``: Shifts nonexistent times forward to the closest real time
 * ``'shift_backward'``: Shifts nonexistent times backward to the closest real time
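+As a short illustration, ``"2015-03-29 02:30"`` does not exist in ``Europe/Warsaw``
+(the clocks jump from 02:00 to 03:00), so a plain ``tz_localize`` raises, while the
+options above give control over the result (a minimal sketch):
+
+.. ipython:: python
+
+   ts = pd.Timestamp("2015-03-29 02:30:00")
+   ts.tz_localize("Europe/Warsaw", nonexistent="shift_forward")
+   ts.tz_localize("Europe/Warsaw", nonexistent="NaT")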
diff --git a/doc/source/user_guide/user_defined_functions.rst b/doc/source/user_guide/user_defined_functions.rst
new file mode 100644
index 0000000000000..c2472b3c229db
--- /dev/null
+++ b/doc/source/user_guide/user_defined_functions.rst
@@ -0,0 +1,305 @@
+.. _user_defined_functions:
+
+{{ header }}
+
+*****************************
+User-Defined Functions (UDFs)
+*****************************
+
+In pandas, User-Defined Functions (UDFs) provide a way to extend the library's
+functionality by allowing users to apply custom computations to their data. While
+pandas comes with a set of built-in functions for data manipulation, UDFs offer
+flexibility when built-in methods are not sufficient. These functions can be
+applied at different levels: element-wise, row-wise, column-wise, or group-wise,
+and behave differently depending on the method used.
+
+Here's a simple example to illustrate a UDF applied to a Series:
+
+.. ipython:: python
+
+    s = pd.Series([1, 2, 3])
+
+    # Simple UDF that adds 1 to a value
+    def add_one(x):
+        return x + 1
+
+    # Apply the function element-wise using .map
+    s.map(add_one)
+
+You can also apply UDFs to an entire DataFrame. For example:
+
+.. ipython:: python
+
+    df = pd.DataFrame({"A": [1, 2, 3], "B": [10, 20, 30]})
+
+    # UDF that takes a row and returns the sum of columns A and B
+    def sum_row(row):
+        return row["A"] + row["B"]
+
+    # Apply the function row-wise (axis=1 means apply across columns per row)
+    df.apply(sum_row, axis=1)
+
+
+Why Not To Use User-Defined Functions
+-------------------------------------
+
+While UDFs provide flexibility, they come with significant drawbacks, primarily
+related to performance and behavior. When using UDFs, pandas must perform inference
+on the result, and that inference could be incorrect. Furthermore, unlike vectorized operations,
+UDFs are slower because pandas can't optimize their computations, leading to
+inefficient processing.
+
+.. note::
+    In general, most tasks can and should be accomplished using pandas' built-in methods or vectorized operations.
+
+Despite their drawbacks, UDFs can be helpful when:
+
+* **Custom Computations Are Needed**: Implementing complex logic or domain-specific calculations that pandas'
+  built-in methods cannot handle.
+* **Extending pandas' Functionality**: Applying external libraries or specialized algorithms unavailable in pandas.
+* **Handling Complex Grouped Operations**: Performing operations on grouped data that standard methods do not support.
+
+For example:
+
+.. code-block:: python
+
+    from sklearn.linear_model import LinearRegression
+
+    # Sample data
+    df = pd.DataFrame({
+        'group': ['A', 'A', 'A', 'B', 'B', 'B'],
+        'x': [1, 2, 3, 1, 2, 3],
+        'y': [2, 4, 6, 1, 2, 1.5]
+    })
+
+    # Function to fit a model to each group
+    def fit_model(group):
+        model = LinearRegression()
+        model.fit(group[['x']], group['y'])
+        group['y_pred'] = model.predict(group[['x']])
+        return group
+
+    result = df.groupby('group').apply(fit_model)
+
+
+Methods that support User-Defined Functions
+-------------------------------------------
+
+User-Defined Functions can be applied across various pandas methods:
+
++----------------------------+---------------------+---------------------+--------------------------------------------------------------------------+
+| Method                     | Function Input      | Function Output     | Description                                                              |
++============================+=====================+=====================+==========================================================================+
+| :meth:`map`                | Scalar              | Scalar              | Apply a function to each element                                         |
++----------------------------+---------------------+---------------------+--------------------------------------------------------------------------+
+| :meth:`apply` (axis=0)     | Column (Series)     | Column (Series)     | Apply a function to each column                                          |
++----------------------------+---------------------+---------------------+--------------------------------------------------------------------------+
+| :meth:`apply` (axis=1)     | Row (Series)        | Row (Series)        | Apply a function to each row                                             |
++----------------------------+---------------------+---------------------+--------------------------------------------------------------------------+
+| :meth:`agg`                | Series/DataFrame    | Scalar or Series    | Aggregates and summarizes values, e.g., sum or a custom reducer          |
++----------------------------+---------------------+---------------------+--------------------------------------------------------------------------+
+| :meth:`transform` (axis=0) | Column (Series)     | Column (Series)     | Same as :meth:`apply` with ``axis=0``, but raises an exception if the    |
+|                            |                     |                     | function changes the shape of the data                                   |
++----------------------------+---------------------+---------------------+--------------------------------------------------------------------------+
+| :meth:`transform` (axis=1) | Row (Series)        | Row (Series)        | Same as :meth:`apply` with ``axis=1``, but raises an exception if the    |
+|                            |                     |                     | function changes the shape of the data                                   |
++----------------------------+---------------------+---------------------+--------------------------------------------------------------------------+
+| :meth:`filter`             | Series or DataFrame | Boolean             | Only accepts UDFs in groupby. Function is called for each group, and     |
+|                            |                     |                     | the group is removed from the result if the function returns ``False``  |
++----------------------------+---------------------+---------------------+--------------------------------------------------------------------------+
+| :meth:`pipe`               | Series/DataFrame    | Series/DataFrame    | Chain functions together to apply to a Series or DataFrame              |
++----------------------------+---------------------+---------------------+--------------------------------------------------------------------------+
+
+When applying UDFs in pandas, it is essential to select the appropriate method based
+on your specific task. Each method has its strengths and is designed for different use
+cases. Understanding the purpose and behavior of each method will help you make informed
+decisions, ensuring more efficient and maintainable code.
+
+.. note::
+    Some of these methods can also be applied to groupby, resample, and various window objects.
+    See :ref:`groupby`, :ref:`resample()`, :ref:`rolling()`, :ref:`expanding()`,
+    and :ref:`ewm()` for details.
+
+
+:meth:`DataFrame.apply`
+~~~~~~~~~~~~~~~~~~~~~~~
+
+The :meth:`apply` method allows you to apply UDFs along either rows or columns. While flexible,
+it is slower than vectorized operations and should be used only when you need operations
+that cannot be achieved with built-in pandas functions.
+
+When to use: :meth:`apply` is suitable when no alternative vectorized method or UDF method is available,
+but consider optimizing performance with vectorized operations wherever possible.
+
+:meth:`DataFrame.agg`
+~~~~~~~~~~~~~~~~~~~~~
+
+If you need to aggregate data, :meth:`agg` is a better choice than apply because it is
+specifically designed for aggregation operations.
+
+When to use: Use :meth:`agg` for performing custom aggregations, where the operation returns
+a scalar value on each input.
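+For example, a minimal sketch of a custom reducer (the frame below is illustrative):
+
+.. code-block:: python
+
+    df = pd.DataFrame({"A": [1, 2, 3], "B": [10, 20, 30]})
+
+    # UDF that reduces a whole column (a Series) to a single scalar
+    def half_range(col):
+        return (col.max() - col.min()) / 2
+
+    df.agg(half_range)                 # one scalar per column
+    df["A"].agg(["min", half_range])   # built-ins and UDFs can be mixed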
+:meth:`DataFrame.transform`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The :meth:`transform` method is ideal for performing element-wise transformations while preserving the shape of the original DataFrame.
+It is generally faster than apply because it can take advantage of pandas' internal optimizations.
+
+When to use: When you need to perform element-wise transformations that retain the original structure of the DataFrame.
+
+.. code-block:: python
+
+    from sklearn.linear_model import LinearRegression
+
+    df = pd.DataFrame({
+        'group': ['A', 'A', 'A', 'B', 'B', 'B'],
+        'x': [1, 2, 3, 1, 2, 3],
+        'y': [2, 4, 6, 1, 2, 1.5]
+    }).set_index("x")
+
+    # Function to fit a model to each group
+    def fit_model(group):
+        x = group.index.to_frame()
+        y = group
+        model = LinearRegression()
+        model.fit(x, y)
+        pred = model.predict(x)
+        return pred
+
+    result = df.groupby('group').transform(fit_model)
+
+:meth:`DataFrame.filter`
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+The :meth:`filter` method is used to select subsets of the DataFrame's
+columns or rows. It is useful when you want to extract specific columns or rows that
+match particular conditions.
+
+When to use: Use :meth:`filter` when you want to use a UDF to create a subset of a DataFrame or Series.
+
+.. note::
+    :meth:`DataFrame.filter` does not accept UDFs, but can accept
+    list comprehensions that have UDFs applied to them.
+
+.. ipython:: python
+
+    # Sample DataFrame
+    df = pd.DataFrame({
+        'AA': [1, 2, 3],
+        'BB': [4, 5, 6],
+        'C': [7, 8, 9],
+        'D': [10, 11, 12]
+    })
+
+    # Function that filters out columns where the name is longer than 1 character
+    def is_long_name(column_name):
+        return len(column_name) > 1
+
+    df_filtered = df.filter(items=[col for col in df.columns if is_long_name(col)])
+    print(df_filtered)
+
+Since filter does not directly accept a UDF, you have to apply the UDF indirectly,
+for example, by using list comprehensions.
+
+:meth:`DataFrame.map`
+~~~~~~~~~~~~~~~~~~~~~
+
+The :meth:`map` method is used specifically to apply element-wise UDFs.
+
+When to use: Use :meth:`map` for applying element-wise UDFs to DataFrames or Series.
+
+:meth:`DataFrame.pipe`
+~~~~~~~~~~~~~~~~~~~~~~
+
+The :meth:`pipe` method is useful for chaining operations together into a clean and readable pipeline.
+It is a helpful tool for organizing complex data processing workflows.
+
+When to use: Use :meth:`pipe` when you need to create a pipeline of operations and want to keep the code readable and maintainable.
+
+
+Performance
+-----------
+
+While UDFs provide flexibility, their use is generally discouraged as they can introduce
+performance issues, especially when written in pure Python. To improve efficiency,
+consider using built-in ``NumPy`` or ``pandas`` functions instead of UDFs
+for common operations.
+
+.. note::
+    If performance is critical, explore **vectorized operations** before resorting
+    to UDFs.
+
+Vectorized Operations
+~~~~~~~~~~~~~~~~~~~~~
+
+Below is a comparison of using UDFs versus using vectorized operations:
+
+.. code-block:: python
+
+    # User-defined function
+    def calc_ratio(row):
+        return 100 * (row["one"] / row["two"])
+
+    df["new_col"] = df.apply(calc_ratio, axis=1)
+
+    # Vectorized Operation
+    df["new_col2"] = 100 * (df["one"] / df["two"])
+
+Measuring how long each operation takes:
+
+.. code-block:: text
+
+    User-defined function: 5.6435 secs
+    Vectorized: 0.0043 secs
+
+Vectorized operations in pandas are significantly faster than using :meth:`DataFrame.apply`
+with UDFs because they leverage highly optimized C functions
+via ``NumPy`` to process entire arrays at once. This approach avoids the overhead of looping
+through rows in Python and making separate function calls for each row, which is slow and
+inefficient. Additionally, ``NumPy`` arrays benefit from memory efficiency and CPU-level
+optimizations, making vectorized operations the preferred choice whenever possible.
+
+
+Improving Performance with UDFs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In scenarios where UDFs are necessary, there are still ways to mitigate their performance drawbacks.
+One approach is to use **Numba**, a Just-In-Time (JIT) compiler that can significantly speed up numerical
+Python code by compiling Python functions to optimized machine code at runtime.
+
+By annotating your UDFs with ``@numba.jit``, you can achieve performance closer to vectorized operations,
+especially for computationally heavy tasks.
+
+.. note::
+    You may also refer to the user guide on `Enhancing performance `_
+    for a more detailed guide to using **Numba**.
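+For example, a minimal sketch using the ``engine="numba"`` option of rolling
+``apply``, which JIT-compiles a plain-Python function for you (this requires the
+optional ``numba`` dependency and ``raw=True``; the data is illustrative):
+
+.. code-block:: python
+
+    import numpy as np
+
+    s = pd.Series(np.random.randn(1_000_000))
+
+    # Plain-Python UDF; compiled to machine code on first call
+    def mean_abs(window):
+        total = 0.0
+        for value in window:
+            total += abs(value)
+        return total / len(window)
+
+    s.rolling(100).apply(mean_abs, engine="numba", raw=True)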
+Using :meth:`DataFrame.pipe` for Composable Logic
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Another useful pattern for improving readability and composability, especially when mixing
+vectorized logic with UDFs, is to use the :meth:`DataFrame.pipe` method.
+
+:meth:`DataFrame.pipe` doesn't improve performance directly, but it enables cleaner
+method chaining by passing the entire object into a function. This is especially helpful
+when chaining custom transformations:
+
+.. code-block:: python
+
+    def add_ratio_column(df):
+        df["ratio"] = 100 * (df["one"] / df["two"])
+        return df
+
+    df = (
+        df
+        .query("one > 0")
+        .pipe(add_ratio_column)
+        .dropna()
+    )
+
+This is functionally equivalent to calling ``add_ratio_column(df)``, but keeps your code
+clean and composable. The function you pass to :meth:`DataFrame.pipe` can use vectorized operations,
+row-wise UDFs, or any other logic; :meth:`DataFrame.pipe` is agnostic.
+
+.. note::
+    While :meth:`DataFrame.pipe` does not improve performance on its own,
+    it promotes clean, modular design and allows both vectorized and UDF-based logic
+    to be composed in method chains.
diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst
index 9081d13ef2cf1..4b5cdca23103c 100644
--- a/doc/source/user_guide/visualization.rst
+++ b/doc/source/user_guide/visualization.rst
@@ -1210,11 +1210,6 @@ You may set the ``xlabel`` and ``ylabel`` arguments to give the plot custom labels
 for x and y axis. By default, pandas will pick up index name as xlabel, while leaving
 it empty for ylabel.
 
-.. ipython:: python
-   :suppress:
-
-   plt.figure();
-
 .. ipython:: python
 
    df.plot();
@@ -1504,7 +1499,7 @@ Plotting with error bars
 Plotting with error bars is supported in :meth:`DataFrame.plot` and :meth:`Series.plot`.
 
-Horizontal and vertical error bars can be supplied to the ``xerr`` and ``yerr`` keyword arguments to :meth:`~DataFrame.plot()`. The error values can be specified using a variety of formats:
+Horizontal and vertical error bars can be supplied to the ``xerr`` and ``yerr`` keyword arguments to :meth:`~DataFrame.plot`. The error values can be specified using a variety of formats:
 
 * As a :class:`DataFrame` or ``dict`` of errors with column names matching the ``columns`` attribute of the plotting :class:`DataFrame` or matching the ``name`` attribute of the :class:`Series`.
 * As a ``str`` indicating which of the columns of plotting :class:`DataFrame` contain the error values.
diff --git a/doc/source/user_guide/window.rst b/doc/source/user_guide/window.rst
index d997aa119b359..5b27442c80bb8 100644
--- a/doc/source/user_guide/window.rst
+++ b/doc/source/user_guide/window.rst
@@ -70,7 +70,8 @@ which will first group the data by the specified keys and then perform a windowing operation.
 
    Some windowing aggregation, ``mean``, ``sum``, ``var`` and ``std`` methods may suffer from numerical
    imprecision due to the underlying windowing algorithms accumulating sums. When values differ
-   with magnitude :math:`1/np.finfo(np.double).eps` this results in truncation. It must be
+   with magnitude ``1/np.finfo(np.double).eps`` (approximately :math:`4.5 \times 10^{15}`),
+   this results in truncation. It must be
    noted, that large values may have an impact on windows, which do not include these values. `Kahan summation
    `__ is used
    to compute the rolling sums to preserve accuracy as much as possible.
@@ -79,9 +80,9 @@ which will first group the data by the specified keys and then perform a windowing operation.
 
 .. versionadded:: 1.3.0
 
 Some windowing operations also support the ``method='table'`` option in the constructor which
-performs the windowing operation over an entire :class:`DataFrame` instead of a single column or row at a time.
-This can provide a useful performance benefit for a :class:`DataFrame` with many columns or rows -(with the corresponding ``axis`` argument) or the ability to utilize other columns during the windowing +performs the windowing operation over an entire :class:`DataFrame` instead of a single column at a time. +This can provide a useful performance benefit for a :class:`DataFrame` with many columns +or the ability to utilize other columns during the windowing operation. The ``method='table'`` option can only be used if ``engine='numba'`` is specified in the corresponding method call. @@ -356,11 +357,11 @@ See :ref:`enhancing performance with Numba ` for general us Numba will be applied in potentially two routines: -#. If ``func`` is a standard Python function, the engine will `JIT `__ the passed function. ``func`` can also be a JITed function in which case the engine will not JIT the function again. +#. If ``func`` is a standard Python function, the engine will `JIT `__ the passed function. ``func`` can also be a JITed function in which case the engine will not JIT the function again. #. The engine will JIT the for loop where the apply function is applied to each window. The ``engine_kwargs`` argument is a dictionary of keyword arguments that will be passed into the -`numba.jit decorator `__. +`numba.jit decorator `__. These keyword arguments will be applied to *both* the passed function (if a standard Python function) and the apply for loop over each window. @@ -567,9 +568,9 @@ One must have :math:`0 < \alpha \leq 1`, and while it is possible to pass \alpha = \begin{cases} - \frac{2}{s + 1}, & \text{for span}\ s \geq 1\\ - \frac{1}{1 + c}, & \text{for center of mass}\ c \geq 0\\ - 1 - \exp^{\frac{\log 0.5}{h}}, & \text{for half-life}\ h > 0 + \frac{2}{s + 1}, & \text{for span}\ s \geq 1\\ + \frac{1}{1 + c}, & \text{for center of mass}\ c \geq 0\\ + 1 - e^{\frac{\log 0.5}{h}}, & \text{for half-life}\ h > 0 \end{cases} One must specify precisely one of **span**, **center of mass**, **half-life** diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index ec024f36d78b1..1dd6c5fabef04 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -10,12 +10,31 @@ This is the list of changes to pandas between each release. For full details, see the `commit logs `_. For install and upgrade instructions, see :ref:`install`. +Version 3.0 +----------- + +.. toctree:: + :maxdepth: 2 + + v3.0.0 + +Version 2.3 +----------- + +.. toctree:: + :maxdepth: 2 + + v2.3.0 + Version 2.2 ----------- .. toctree:: :maxdepth: 2 + v2.2.3 + v2.2.2 + v2.2.1 v2.2.0 Version 2.1 diff --git a/doc/source/whatsnew/v0.10.0.rst b/doc/source/whatsnew/v0.10.0.rst index be50c34d7d14c..905583c708905 100644 --- a/doc/source/whatsnew/v0.10.0.rst +++ b/doc/source/whatsnew/v0.10.0.rst @@ -242,18 +242,42 @@ labeled the aggregated group with the end of the interval: the next day). - Calling ``fillna`` on Series or DataFrame with no arguments is no longer valid code. You must either specify a fill value or an interpolation method: -.. ipython:: python - :okwarning: +.. 
code-block:: ipython - s = pd.Series([np.nan, 1.0, 2.0, np.nan, 4]) - s - s.fillna(0) - s.fillna(method="pad") + In [6]: s = pd.Series([np.nan, 1.0, 2.0, np.nan, 4]) + + In [7]: s + Out[7]: + 0 NaN + 1 1.0 + 2 2.0 + 3 NaN + 4 4.0 + dtype: float64 + + In [8]: s.fillna(0) + Out[8]: + 0 0.0 + 1 1.0 + 2 2.0 + 3 0.0 + 4 4.0 + dtype: float64 + + In [9]: s.fillna(method="pad") + Out[9]: + 0 NaN + 1 1.0 + 2 2.0 + 3 2.0 + 4 4.0 + dtype: float64 Convenience methods ``ffill`` and ``bfill`` have been added: .. ipython:: python + s = pd.Series([np.nan, 1.0, 2.0, np.nan, 4]) s.ffill() diff --git a/doc/source/whatsnew/v0.11.0.rst b/doc/source/whatsnew/v0.11.0.rst index f05cbc7f07d7d..28c9d46f21fd8 100644 --- a/doc/source/whatsnew/v0.11.0.rst +++ b/doc/source/whatsnew/v0.11.0.rst @@ -12,7 +12,7 @@ Data have had quite a number of additions, and Dtype support is now full-fledged There are also a number of important API changes that long-time pandas users should pay close attention to. -There is a new section in the documentation, :ref:`10 Minutes to Pandas <10min>`, +There is a new section in the documentation, :ref:`10 Minutes to pandas <10min>`, primarily geared to new users. There is a new section in the documentation, :ref:`Cookbook `, a collection @@ -74,10 +74,10 @@ Numeric dtypes will propagate and can coexist in DataFrames. If a dtype is passe .. ipython:: python - df1 = pd.DataFrame(np.random.randn(8, 1), columns=['A'], dtype='float32') + df1 = pd.DataFrame(np.random.randn(8, 1), columns=['A'], dtype='float64') df1 df1.dtypes - df2 = pd.DataFrame({'A': pd.Series(np.random.randn(8), dtype='float16'), + df2 = pd.DataFrame({'A': pd.Series(np.random.randn(8), dtype='float32'), 'B': pd.Series(np.random.randn(8)), 'C': pd.Series(range(8), dtype='uint8')}) df2 diff --git a/doc/source/whatsnew/v0.12.0.rst b/doc/source/whatsnew/v0.12.0.rst index 59d104cb3e96c..08d3a6b188322 100644 --- a/doc/source/whatsnew/v0.12.0.rst +++ b/doc/source/whatsnew/v0.12.0.rst @@ -133,9 +133,9 @@ API changes to be inserted if ``True``, default is ``False`` (same as prior to 0.12) (:issue:`3679`) - Implement ``__nonzero__`` for ``NDFrame`` objects (:issue:`3691`, :issue:`3696`) - - IO api + - IO API - - added top-level function ``read_excel`` to replace the following, + - Added top-level function ``read_excel`` to replace the following, The original API is deprecated and will be removed in a future version .. code-block:: python @@ -153,7 +153,7 @@ API changes pd.read_excel("path_to_file.xls", "Sheet1", index_col=None, na_values=["NA"]) - - added top-level function ``read_sql`` that is equivalent to the following + - Added top-level function ``read_sql`` that is equivalent to the following .. code-block:: python @@ -201,12 +201,12 @@ IO enhancements You can use ``pd.read_html()`` to read the output from ``DataFrame.to_html()`` like so .. 
ipython:: python - :okwarning: + import io df = pd.DataFrame({"a": range(3), "b": list("abc")}) print(df) html = df.to_html() - alist = pd.read_html(html, index_col=0) + alist = pd.read_html(io.StringIO(html), index_col=0) print(df == alist[0]) Note that ``alist`` here is a Python ``list`` so ``pd.read_html()`` and @@ -482,11 +482,11 @@ Bug fixes - ``HDFStore`` - - will retain index attributes (freq,tz,name) on recreation (:issue:`3499`) - - will warn with a ``AttributeConflictWarning`` if you are attempting to append + - Will retain index attributes (freq,tz,name) on recreation (:issue:`3499`) + - Will warn with a ``AttributeConflictWarning`` if you are attempting to append an index with a different frequency than the existing, or attempting to append an index with a different name than the existing - - support datelike columns with a timezone as data_columns (:issue:`2852`) + - Support datelike columns with a timezone as data_columns (:issue:`2852`) - Non-unique index support clarified (:issue:`3468`). diff --git a/doc/source/whatsnew/v0.13.0.rst b/doc/source/whatsnew/v0.13.0.rst index f2e29121760ab..8e323d8aac5e3 100644 --- a/doc/source/whatsnew/v0.13.0.rst +++ b/doc/source/whatsnew/v0.13.0.rst @@ -172,7 +172,7 @@ API changes statistical mode(s) by axis/Series. (:issue:`5367`) - Chained assignment will now by default warn if the user is assigning to a copy. This can be changed - with the option ``mode.chained_assignment``, allowed options are ``raise/warn/None``. See :ref:`the docs`. + with the option ``mode.chained_assignment``, allowed options are ``raise/warn/None``. .. ipython:: python @@ -345,7 +345,6 @@ Float64Index API change .. ipython:: python :okwarning: - s[2:4] s.loc[2:4] s.iloc[2:4] @@ -524,13 +523,25 @@ Enhancements Using the new top-level ``to_timedelta``, you can convert a scalar or array from the standard timedelta format (produced by ``to_csv``) into a timedelta type (``np.timedelta64`` in ``nanoseconds``). - .. ipython:: python + .. code-block:: ipython + + In [53]: pd.to_timedelta('1 days 06:05:01.00003') + Out[53]: Timedelta('1 days 06:05:01.000030') + + In [54]: pd.to_timedelta('15.5us') + Out[54]: Timedelta('0 days 00:00:00.000015500') + + In [55]: pd.to_timedelta(['1 days 06:05:01.00003', '15.5us', 'nan']) + Out[55]: TimedeltaIndex(['1 days 06:05:01.000030', '0 days 00:00:00.000015500', NaT], dtype='timedelta64[ns]', freq=None) + + In [56]: pd.to_timedelta(np.arange(5), unit='s') + Out[56]: + TimedeltaIndex(['0 days 00:00:00', '0 days 00:00:01', '0 days 00:00:02', + '0 days 00:00:03', '0 days 00:00:04'], + dtype='timedelta64[ns]', freq=None) - pd.to_timedelta('1 days 06:05:01.00003') - pd.to_timedelta('15.5us') - pd.to_timedelta(['1 days 06:05:01.00003', '15.5us', 'nan']) - pd.to_timedelta(np.arange(5), unit='s') - pd.to_timedelta(np.arange(5), unit='d') + In [57]: pd.to_timedelta(np.arange(5), unit='d') + Out[57]: TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], dtype='timedelta64[ns]', freq=None) A Series of dtype ``timedelta64[ns]`` can now be divided by another ``timedelta64[ns]`` object, or astyped to yield a ``float64`` dtyped Series. This diff --git a/doc/source/whatsnew/v0.13.1.rst b/doc/source/whatsnew/v0.13.1.rst index 8c85868e1aedb..483dd15a8467a 100644 --- a/doc/source/whatsnew/v0.13.1.rst +++ b/doc/source/whatsnew/v0.13.1.rst @@ -24,8 +24,8 @@ Highlights include: .. warning:: 0.13.1 fixes a bug that was caused by a combination of having numpy < 1.8, and doing - chained assignment on a string-like array. 
Please review :ref:`the docs`, - chained indexing can have unexpected results and should generally be avoided. + chained assignment on a string-like array. + Chained indexing can have unexpected results and should generally be avoided. This would previously segfault: diff --git a/doc/source/whatsnew/v0.15.0.rst b/doc/source/whatsnew/v0.15.0.rst index 8dafed1efee97..1ee7c5cbc6b9e 100644 --- a/doc/source/whatsnew/v0.15.0.rst +++ b/doc/source/whatsnew/v0.15.0.rst @@ -445,7 +445,7 @@ Rolling/expanding moments improvements 3 5 dtype: float64 -- :func:`rolling_window` now normalizes the weights properly in rolling mean mode (`mean=True`) so that +- :func:`rolling_window` now normalizes the weights properly in rolling mean mode (``mean=True``) so that the calculated weighted means (e.g. 'triang', 'gaussian') are distributed about the same means as those calculated without weighting (i.e. 'boxcar'). See :ref:`the note on normalization ` for further details. (:issue:`7618`) @@ -490,7 +490,7 @@ Rolling/expanding moments improvements ``ddof`` argument (with a default value of ``1``) was previously undocumented. (:issue:`8064`) - :func:`ewma`, :func:`ewmstd`, :func:`ewmvol`, :func:`ewmvar`, :func:`ewmcov`, and :func:`ewmcorr` - now interpret ``min_periods`` in the same manner that the :func:`rolling_*()` and :func:`expanding_*()` functions do: + now interpret ``min_periods`` in the same manner that the :func:`rolling_*` and :func:`expanding_*` functions do: a given result entry will be ``NaN`` if the (expanding, in this case) window does not contain at least ``min_periods`` values. The previous behavior was to set to ``NaN`` the ``min_periods`` entries starting with the first non- ``NaN`` value. (:issue:`7977`) @@ -567,7 +567,7 @@ Rolling/expanding moments improvements .. warning:: - By default (``ignore_na=False``) the :func:`ewm*()` functions' weights calculation + By default (``ignore_na=False``) the :func:`ewm*` functions' weights calculation in the presence of missing values is different than in pre-0.15.0 versions. To reproduce the pre-0.15.0 calculation of weights in the presence of missing values one must specify explicitly ``ignore_na=True``. @@ -576,7 +576,7 @@ Rolling/expanding moments improvements returning results with columns sorted by name and producing an error for non-unique columns; now handles non-unique columns and returns columns in original order (except for the case of two DataFrames with ``pairwise=False``, where behavior is unchanged) (:issue:`7542`) -- Bug in :func:`rolling_count` and :func:`expanding_*()` functions unnecessarily producing error message for zero-length data (:issue:`8056`) +- Bug in :func:`rolling_count` and :func:`expanding_*` functions unnecessarily producing error message for zero-length data (:issue:`8056`) - Bug in :func:`rolling_apply` and :func:`expanding_apply` interpreting ``min_periods=0`` as ``min_periods=1`` (:issue:`8080`) - Bug in :func:`expanding_std` and :func:`expanding_var` for a single value producing a confusing error message (:issue:`7900`) - Bug in :func:`rolling_std` and :func:`rolling_var` for a single value producing ``0`` rather than ``NaN`` (:issue:`7900`) @@ -875,7 +875,7 @@ Other notable API changes: The behaviour of assigning a column to an existing dataframe as ``df['a'] = i`` remains unchanged (this already returned an ``object`` column with a timezone). 
-- When passing multiple levels to :meth:`~pandas.DataFrame.stack()`, it will now raise a ``ValueError`` when the +- When passing multiple levels to :meth:`~pandas.DataFrame.stack`, it will now raise a ``ValueError`` when the levels aren't all level names or all level numbers (:issue:`7660`). See :ref:`Reshaping by stacking and unstacking `. @@ -1110,7 +1110,7 @@ Other: - ``DataFrame.fillna`` can now accept a ``DataFrame`` as a fill value (:issue:`8377`) -- Passing multiple levels to :meth:`~pandas.DataFrame.stack()` will now work when multiple level +- Passing multiple levels to :meth:`~pandas.DataFrame.stack` will now work when multiple level numbers are passed (:issue:`7660`). See :ref:`Reshaping by stacking and unstacking `. diff --git a/doc/source/whatsnew/v0.15.1.rst b/doc/source/whatsnew/v0.15.1.rst index 09b59f35972cd..f16c9b3f5d45b 100644 --- a/doc/source/whatsnew/v0.15.1.rst +++ b/doc/source/whatsnew/v0.15.1.rst @@ -92,7 +92,7 @@ API changes .. code-block:: ipython - In [4]: gr.apply(sum) + In [4]: gr.apply("sum") Out[4]: joe jim @@ -102,9 +102,8 @@ API changes current behavior: .. ipython:: python - :okwarning: - gr.apply(sum) + gr.apply("sum") - Support for slicing with monotonic decreasing indexes, even if ``start`` or ``stop`` is not found in the index (:issue:`7860`): @@ -264,7 +263,7 @@ Enhancements - Raise errors in certain aggregation cases where an argument such as ``numeric_only`` is not handled (:issue:`8592`). -- Added support for 3-character ISO and non-standard country codes in :func:`io.wb.download()` (:issue:`8482`) +- Added support for 3-character ISO and non-standard country codes in :func:`io.wb.download` (:issue:`8482`) - World Bank data requests now will warn/raise based on an ``errors`` argument, as well as a list of hard-coded country codes and diff --git a/doc/source/whatsnew/v0.15.2.rst b/doc/source/whatsnew/v0.15.2.rst index acc5409b86d09..a9003710540d7 100644 --- a/doc/source/whatsnew/v0.15.2.rst +++ b/doc/source/whatsnew/v0.15.2.rst @@ -212,7 +212,7 @@ Other enhancements: 4 True True True True - Added support for ``utcfromtimestamp()``, ``fromtimestamp()``, and ``combine()`` on ``Timestamp`` class (:issue:`5351`). -- Added Google Analytics (`pandas.io.ga`) basic documentation (:issue:`8835`). See `here `__. +- Added Google Analytics (``pandas.io.ga``) basic documentation (:issue:`8835`). See `here `__. - ``Timedelta`` arithmetic returns ``NotImplemented`` in unknown cases, allowing extensions by custom classes (:issue:`8813`). - ``Timedelta`` now supports arithmetic with ``numpy.ndarray`` objects of the appropriate dtype (numpy 1.8 or newer only) (:issue:`8884`). - Added ``Timedelta.to_timedelta64()`` method to the public API (:issue:`8884`). diff --git a/doc/source/whatsnew/v0.18.0.rst b/doc/source/whatsnew/v0.18.0.rst index 569197fe9daf5..563035e0e2940 100644 --- a/doc/source/whatsnew/v0.18.0.rst +++ b/doc/source/whatsnew/v0.18.0.rst @@ -322,15 +322,28 @@ Tz-aware are rounded, floored and ceiled in local times Timedeltas -.. ipython:: python +.. 
code-block:: ipython + + In [37]: t = pd.timedelta_range('1 days 2 hr 13 min 45 us', periods=3, freq='d') - t = pd.timedelta_range('1 days 2 hr 13 min 45 us', periods=3, freq='d') - t - t.round('10min') + In [38]: t + Out[38]: + TimedeltaIndex(['1 days 02:13:00.000045', '2 days 02:13:00.000045', + '3 days 02:13:00.000045'], + dtype='timedelta64[ns]', freq='D') + + In [39]: t.round('10min') + Out[39]: + TimedeltaIndex(['1 days 02:10:00', '2 days 02:10:00', + '3 days 02:10:00'], + dtype='timedelta64[ns]', freq=None) # Timedelta scalar - t[0] - t[0].round('2h') + In [40]: t[0] + Out[40]: Timedelta('1 days 02:13:00.000045') + + In [41]: t[0].round('2h') + Out[41]: Timedelta('1 days 02:00:00') In addition, ``.round()``, ``.floor()`` and ``.ceil()`` will be available through the ``.dt`` accessor of ``Series``. diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst index 09bf5428d0432..d6d1d96ccc878 100644 --- a/doc/source/whatsnew/v0.20.0.rst +++ b/doc/source/whatsnew/v0.20.0.rst @@ -308,15 +308,26 @@ The new orient ``'table'`` for :meth:`DataFrame.to_json` will generate a `Table Schema`_ compatible string representation of the data. -.. ipython:: python +.. code-block:: ipython - df = pd.DataFrame( - {'A': [1, 2, 3], - 'B': ['a', 'b', 'c'], - 'C': pd.date_range('2016-01-01', freq='d', periods=3)}, - index=pd.Index(range(3), name='idx')) - df - df.to_json(orient='table') + In [38]: df = pd.DataFrame( + ....: {'A': [1, 2, 3], + ....: 'B': ['a', 'b', 'c'], + ....: 'C': pd.date_range('2016-01-01', freq='d', periods=3)}, + ....: index=pd.Index(range(3), name='idx')) + In [39]: df + Out[39]: + A B C + idx + 0 1 a 2016-01-01 + 1 2 b 2016-01-02 + 2 3 c 2016-01-03 + + [3 rows x 3 columns] + + In [40]: df.to_json(orient='table') + Out[40]: + '{"schema":{"fields":[{"name":"idx","type":"integer"},{"name":"A","type":"integer"},{"name":"B","type":"string"},{"name":"C","type":"datetime"}],"primaryKey":["idx"],"pandas_version":"1.4.0"},"data":[{"idx":0,"A":1,"B":"a","C":"2016-01-01T00:00:00.000"},{"idx":1,"A":2,"B":"b","C":"2016-01-02T00:00:00.000"},{"idx":2,"A":3,"B":"c","C":"2016-01-03T00:00:00.000"}]}' See :ref:`IO: Table Schema for more information `. @@ -369,7 +380,6 @@ Experimental support has been added to export ``DataFrame.style`` formats to Exc For example, after running the following, ``styled.xlsx`` renders as below: .. ipython:: python - :okwarning: np.random.seed(24) df = pd.DataFrame({'A': np.linspace(1, 10, 10)}) @@ -379,7 +389,7 @@ For example, after running the following, ``styled.xlsx`` renders as below: df.iloc[0, 2] = np.nan df styled = (df.style - .applymap(lambda val: 'color:red;' if val < 0 else 'color:black;') + .map(lambda val: 'color:red;' if val < 0 else 'color:black;') .highlight_max()) styled.to_excel('styled.xlsx', engine='openpyxl') diff --git a/doc/source/whatsnew/v0.21.0.rst b/doc/source/whatsnew/v0.21.0.rst index dad69b99ee6a4..43719cd53b0ff 100644 --- a/doc/source/whatsnew/v0.21.0.rst +++ b/doc/source/whatsnew/v0.21.0.rst @@ -318,7 +318,7 @@ New keywords - :func:`Series.set_axis` and :func:`DataFrame.set_axis` now support the ``inplace`` parameter. (:issue:`14636`) - :func:`Series.to_pickle` and :func:`DataFrame.to_pickle` have gained a ``protocol`` parameter (:issue:`16252`). By default, this parameter is set to `HIGHEST_PROTOCOL `__ - :func:`read_feather` has gained the ``nthreads`` parameter for multi-threaded operations (:issue:`16359`) -- :func:`DataFrame.clip()` and :func:`Series.clip()` have gained an ``inplace`` argument. 
(:issue:`15388`) +- :func:`DataFrame.clip` and :func:`Series.clip` have gained an ``inplace`` argument. (:issue:`15388`) - :func:`crosstab` has gained a ``margins_name`` parameter to define the name of the row / column that will contain the totals when ``margins=True``. (:issue:`15972`) - :func:`read_json` now accepts a ``chunksize`` parameter that can be used when ``lines=True``. If ``chunksize`` is passed, read_json now returns an iterator which reads in ``chunksize`` lines with each iteration. (:issue:`17048`) - :func:`read_json` and :func:`~DataFrame.to_json` now accept a ``compression`` argument which allows them to transparently handle compressed files. (:issue:`17798`) @@ -977,10 +977,10 @@ Other API changes Deprecations ~~~~~~~~~~~~ -- :meth:`DataFrame.from_csv` and :meth:`Series.from_csv` have been deprecated in favor of :func:`read_csv()` (:issue:`4191`) -- :func:`read_excel()` has deprecated ``sheetname`` in favor of ``sheet_name`` for consistency with ``.to_excel()`` (:issue:`10559`). -- :func:`read_excel()` has deprecated ``parse_cols`` in favor of ``usecols`` for consistency with :func:`read_csv` (:issue:`4988`) -- :func:`read_csv()` has deprecated the ``tupleize_cols`` argument. Column tuples will always be converted to a ``MultiIndex`` (:issue:`17060`) +- :meth:`DataFrame.from_csv` and :meth:`Series.from_csv` have been deprecated in favor of :func:`read_csv` (:issue:`4191`) +- :func:`read_excel` has deprecated ``sheetname`` in favor of ``sheet_name`` for consistency with ``.to_excel()`` (:issue:`10559`). +- :func:`read_excel` has deprecated ``parse_cols`` in favor of ``usecols`` for consistency with :func:`read_csv` (:issue:`4988`) +- :func:`read_csv` has deprecated the ``tupleize_cols`` argument. Column tuples will always be converted to a ``MultiIndex`` (:issue:`17060`) - :meth:`DataFrame.to_csv` has deprecated the ``tupleize_cols`` argument. MultiIndex columns will be always written as rows in the CSV file (:issue:`17060`) - The ``convert`` parameter has been deprecated in the ``.take()`` method, as it was not being respected (:issue:`16948`) - ``pd.options.html.border`` has been deprecated in favor of ``pd.options.display.html.border`` (:issue:`15793`). @@ -1045,7 +1045,7 @@ return the position of the maximum or minimum. 
Removal of prior version deprecations/changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- :func:`read_excel()` has dropped the ``has_index_names`` parameter (:issue:`10967`) +- :func:`read_excel` has dropped the ``has_index_names`` parameter (:issue:`10967`) - The ``pd.options.display.height`` configuration has been dropped (:issue:`3663`) - The ``pd.options.display.line_width`` configuration has been dropped (:issue:`2881`) - The ``pd.options.display.mpl_style`` configuration has been dropped (:issue:`12190`) @@ -1154,7 +1154,7 @@ GroupBy/resample/rolling - Bug in ``DataFrame.resample(...).size()`` where an empty ``DataFrame`` did not return a ``Series`` (:issue:`14962`) - Bug in :func:`infer_freq` causing indices with 2-day gaps during the working week to be wrongly inferred as business daily (:issue:`16624`) -- Bug in ``.rolling(...).quantile()`` which incorrectly used different defaults than :func:`Series.quantile()` and :func:`DataFrame.quantile()` (:issue:`9413`, :issue:`16211`) +- Bug in ``.rolling(...).quantile()`` which incorrectly used different defaults than :func:`Series.quantile` and :func:`DataFrame.quantile` (:issue:`9413`, :issue:`16211`) - Bug in ``groupby.transform()`` that would coerce boolean dtypes back to float (:issue:`16875`) - Bug in ``Series.resample(...).apply()`` where an empty ``Series`` modified the source index and did not return the name of a ``Series`` (:issue:`14313`) - Bug in ``.rolling(...).apply(...)`` with a ``DataFrame`` with a ``DatetimeIndex``, a ``window`` of a timedelta-convertible and ``min_periods >= 1`` (:issue:`15305`) @@ -1194,7 +1194,7 @@ Reshaping Numeric ^^^^^^^ - Bug in ``.clip()`` with ``axis=1`` and a list-like for ``threshold`` is passed; previously this raised ``ValueError`` (:issue:`15390`) -- :func:`Series.clip()` and :func:`DataFrame.clip()` now treat NA values for upper and lower arguments as ``None`` instead of raising ``ValueError`` (:issue:`17276`). +- :func:`Series.clip` and :func:`DataFrame.clip` now treat NA values for upper and lower arguments as ``None`` instead of raising ``ValueError`` (:issue:`17276`). Categorical diff --git a/doc/source/whatsnew/v0.21.1.rst b/doc/source/whatsnew/v0.21.1.rst index e217e1a75efc5..a8f9f9c9e0840 100644 --- a/doc/source/whatsnew/v0.21.1.rst +++ b/doc/source/whatsnew/v0.21.1.rst @@ -141,7 +141,7 @@ IO Plotting ^^^^^^^^ -- Bug in ``DataFrame.plot()`` and ``Series.plot()`` with :class:`DatetimeIndex` where a figure generated by them is not pickleable in Python 3 (:issue:`18439`) +- Bug in ``DataFrame.plot()`` and ``Series.plot()`` with :class:`DatetimeIndex` where a figure generated by them is not picklable in Python 3 (:issue:`18439`) GroupBy/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ @@ -176,7 +176,7 @@ Categorical String ^^^^^^ -- :meth:`Series.str.split()` will now propagate ``NaN`` values across all expanded columns instead of ``None`` (:issue:`18450`) +- :meth:`Series.str.split` will now propagate ``NaN`` values across all expanded columns instead of ``None`` (:issue:`18450`) .. _whatsnew_0.21.1.contributors: diff --git a/doc/source/whatsnew/v0.22.0.rst b/doc/source/whatsnew/v0.22.0.rst index a33a8f7addeef..8a9227ac37b67 100644 --- a/doc/source/whatsnew/v0.22.0.rst +++ b/doc/source/whatsnew/v0.22.0.rst @@ -157,16 +157,27 @@ sum and ``1`` for product. *pandas 0.22.0* -.. ipython:: python +.. 
code-block:: ipython + + In [11]: s = pd.Series([1, 1, np.nan, np.nan], + ....: index=pd.date_range("2017", periods=4)) - s = pd.Series([1, 1, np.nan, np.nan], index=pd.date_range("2017", periods=4)) - s.resample("2d").sum() + In [12]: s.resample("2d").sum() + Out[12]: + 2017-01-01 2.0 + 2017-01-03 0.0 + Freq: 2D, Length: 2, dtype: float64 To restore the 0.21 behavior of returning ``NaN``, use ``min_count>=1``. -.. ipython:: python +.. code-block:: ipython + + In [13]: s.resample("2d").sum(min_count=1) + Out[13]: + 2017-01-01 2.0 + 2017-01-03 NaN + Freq: 2D, Length: 2, dtype: float64 - s.resample("2d").sum(min_count=1) In particular, upsampling and taking the sum or product is affected, as upsampling introduces missing values even if the original series was diff --git a/doc/source/whatsnew/v0.23.0.rst b/doc/source/whatsnew/v0.23.0.rst index 808741ccf4475..7f7609edc27b6 100644 --- a/doc/source/whatsnew/v0.23.0.rst +++ b/doc/source/whatsnew/v0.23.0.rst @@ -50,19 +50,55 @@ JSON read/write round-trippable with ``orient='table'`` A ``DataFrame`` can now be written to and subsequently read back via JSON while preserving metadata through usage of the ``orient='table'`` argument (see :issue:`18912` and :issue:`9146`). Previously, none of the available ``orient`` values guaranteed the preservation of dtypes and index names, amongst other metadata. -.. ipython:: python +.. code-block:: ipython - df = pd.DataFrame({'foo': [1, 2, 3, 4], - 'bar': ['a', 'b', 'c', 'd'], - 'baz': pd.date_range('2018-01-01', freq='d', periods=4), - 'qux': pd.Categorical(['a', 'b', 'c', 'c'])}, - index=pd.Index(range(4), name='idx')) - df - df.dtypes - df.to_json('test.json', orient='table') - new_df = pd.read_json('test.json', orient='table') - new_df - new_df.dtypes + In [1]: df = pd.DataFrame({'foo': [1, 2, 3, 4], + ...: 'bar': ['a', 'b', 'c', 'd'], + ...: 'baz': pd.date_range('2018-01-01', freq='d', periods=4), + ...: 'qux': pd.Categorical(['a', 'b', 'c', 'c'])}, + ...: index=pd.Index(range(4), name='idx')) + + In [2]: df + Out[2]: + foo bar baz qux + idx + 0 1 a 2018-01-01 a + 1 2 b 2018-01-02 b + 2 3 c 2018-01-03 c + 3 4 d 2018-01-04 c + + [4 rows x 4 columns] + + In [3]: df.dtypes + Out[3]: + foo int64 + bar object + baz datetime64[ns] + qux category + Length: 4, dtype: object + + In [4]: df.to_json('test.json', orient='table') + + In [5]: new_df = pd.read_json('test.json', orient='table') + + In [6]: new_df + Out[6]: + foo bar baz qux + idx + 0 1 a 2018-01-01 a + 1 2 b 2018-01-02 b + 2 3 c 2018-01-03 c + 3 4 d 2018-01-04 c + + [4 rows x 4 columns] + + In [7]: new_df.dtypes + Out[7]: + foo int64 + bar object + baz datetime64[ns] + qux category + Length: 4, dtype: object Please note that the string ``index`` is not supported with the round trip format, as it is used by default in ``write_json`` to indicate a missing index name. @@ -538,7 +574,7 @@ Other enhancements - :func:`DataFrame.corrwith` now silently drops non-numeric columns when passed a Series. Before, an exception was raised (:issue:`18570`). - :class:`IntervalIndex` now supports time zone aware ``Interval`` objects (:issue:`18537`, :issue:`18538`) - :func:`Series` / :func:`DataFrame` tab completion also returns identifiers in the first level of a :func:`MultiIndex`. (:issue:`16326`) -- :func:`read_excel()` has gained the ``nrows`` parameter (:issue:`16645`) +- :func:`read_excel` has gained the ``nrows`` parameter (:issue:`16645`) - :meth:`DataFrame.append` can now in more cases preserve the type of the calling dataframe's columns (e.g. 
if both are ``CategoricalIndex``) (:issue:`18359`) - :meth:`DataFrame.to_json` and :meth:`Series.to_json` now accept an ``index`` argument which allows the user to exclude the index from the JSON output (:issue:`17394`) - ``IntervalIndex.to_tuples()`` has gained the ``na_tuple`` parameter to control whether NA is returned as a tuple of NA, or NA itself (:issue:`18756`) @@ -1056,10 +1092,10 @@ Other API changes - :func:`pandas.merge` now raises a ``ValueError`` when trying to merge on incompatible data types (:issue:`9780`) - The default NA value for :class:`UInt64Index` has changed from 0 to ``NaN``, which impacts methods that mask with NA, such as ``UInt64Index.where()`` (:issue:`18398`) - Refactored ``setup.py`` to use ``find_packages`` instead of explicitly listing out all subpackages (:issue:`18535`) -- Rearranged the order of keyword arguments in :func:`read_excel()` to align with :func:`read_csv()` (:issue:`16672`) +- Rearranged the order of keyword arguments in :func:`read_excel` to align with :func:`read_csv` (:issue:`16672`) - :func:`wide_to_long` previously kept numeric-like suffixes as ``object`` dtype. Now they are cast to numeric if possible (:issue:`17627`) - In :func:`read_excel`, the ``comment`` argument is now exposed as a named parameter (:issue:`18735`) -- Rearranged the order of keyword arguments in :func:`read_excel()` to align with :func:`read_csv()` (:issue:`16672`) +- Rearranged the order of keyword arguments in :func:`read_excel` to align with :func:`read_csv` (:issue:`16672`) - The options ``html.border`` and ``mode.use_inf_as_null`` were deprecated in prior versions, these will now show ``FutureWarning`` rather than a ``DeprecationWarning`` (:issue:`19003`) - :class:`IntervalIndex` and ``IntervalDtype`` no longer support categorical, object, and string subtypes (:issue:`19016`) - ``IntervalDtype`` now returns ``True`` when compared against ``'interval'`` regardless of subtype, and ``IntervalDtype.name`` now returns ``'interval'`` regardless of subtype (:issue:`18980`) @@ -1171,7 +1207,7 @@ Performance improvements - ``Series`` construction will reduce the number of copies made of the input data in certain cases (:issue:`17449`) - Improved performance of :func:`Series.dt.date` and :func:`DatetimeIndex.date` (:issue:`18058`) - Improved performance of :func:`Series.dt.time` and :func:`DatetimeIndex.time` (:issue:`18461`) -- Improved performance of :func:`IntervalIndex.symmetric_difference()` (:issue:`18475`) +- Improved performance of :func:`IntervalIndex.symmetric_difference` (:issue:`18475`) - Improved performance of ``DatetimeIndex`` and ``Series`` arithmetic operations with Business-Month and Business-Quarter frequencies (:issue:`18489`) - :func:`Series` / :func:`DataFrame` tab completion limits to 100 values, for better performance. 
(:issue:`18587`) - Improved performance of :func:`DataFrame.median` with ``axis=1`` when bottleneck is not installed (:issue:`16468`) @@ -1290,7 +1326,7 @@ Timedelta - Bug in :func:`Timedelta.__add__`, :func:`Timedelta.__sub__` where adding or subtracting a ``np.timedelta64`` object would return another ``np.timedelta64`` instead of a ``Timedelta`` (:issue:`19738`) - Bug in :func:`Timedelta.__floordiv__`, :func:`Timedelta.__rfloordiv__` where operating with a ``Tick`` object would raise a ``TypeError`` instead of returning a numeric value (:issue:`19738`) - Bug in :func:`Period.asfreq` where periods near ``datetime(1, 1, 1)`` could be converted incorrectly (:issue:`19643`, :issue:`19834`) -- Bug in :func:`Timedelta.total_seconds()` causing precision errors, for example ``Timedelta('30S').total_seconds()==30.000000000000004`` (:issue:`19458`) +- Bug in :func:`Timedelta.total_seconds` causing precision errors, for example ``Timedelta('30S').total_seconds()==30.000000000000004`` (:issue:`19458`) - Bug in :func:`Timedelta.__rmod__` where operating with a ``numpy.timedelta64`` returned a ``timedelta64`` object instead of a ``Timedelta`` (:issue:`19820`) - Multiplication of :class:`TimedeltaIndex` by ``TimedeltaIndex`` will now raise ``TypeError`` instead of raising ``ValueError`` in cases of length mismatch (:issue:`19333`) - Bug in indexing a :class:`TimedeltaIndex` with a ``np.timedelta64`` object which was raising a ``TypeError`` (:issue:`20393`) @@ -1394,12 +1430,12 @@ IO - Bug in :func:`read_csv` where missing values were not being handled properly when ``keep_default_na=False`` with dictionary ``na_values`` (:issue:`19227`) - Bug in :func:`read_csv` causing heap corruption on 32-bit, big-endian architectures (:issue:`20785`) - Bug in :func:`read_sas` where a file with 0 variables gave an ``AttributeError`` incorrectly. 
Now it gives an ``EmptyDataError`` (:issue:`18184`) -- Bug in :func:`DataFrame.to_latex()` where pairs of braces meant to serve as invisible placeholders were escaped (:issue:`18667`) -- Bug in :func:`DataFrame.to_latex()` where a ``NaN`` in a ``MultiIndex`` would cause an ``IndexError`` or incorrect output (:issue:`14249`) -- Bug in :func:`DataFrame.to_latex()` where a non-string index-level name would result in an ``AttributeError`` (:issue:`19981`) -- Bug in :func:`DataFrame.to_latex()` where the combination of an index name and the ``index_names=False`` option would result in incorrect output (:issue:`18326`) -- Bug in :func:`DataFrame.to_latex()` where a ``MultiIndex`` with an empty string as its name would result in incorrect output (:issue:`18669`) -- Bug in :func:`DataFrame.to_latex()` where missing space characters caused wrong escaping and produced non-valid latex in some cases (:issue:`20859`) +- Bug in :func:`DataFrame.to_latex` where pairs of braces meant to serve as invisible placeholders were escaped (:issue:`18667`) +- Bug in :func:`DataFrame.to_latex` where a ``NaN`` in a ``MultiIndex`` would cause an ``IndexError`` or incorrect output (:issue:`14249`) +- Bug in :func:`DataFrame.to_latex` where a non-string index-level name would result in an ``AttributeError`` (:issue:`19981`) +- Bug in :func:`DataFrame.to_latex` where the combination of an index name and the ``index_names=False`` option would result in incorrect output (:issue:`18326`) +- Bug in :func:`DataFrame.to_latex` where a ``MultiIndex`` with an empty string as its name would result in incorrect output (:issue:`18669`) +- Bug in :func:`DataFrame.to_latex` where missing space characters caused wrong escaping and produced non-valid latex in some cases (:issue:`20859`) - Bug in :func:`read_json` where large numeric values were causing an ``OverflowError`` (:issue:`18842`) - Bug in :func:`DataFrame.to_parquet` where an exception was raised if the write destination is S3 (:issue:`19134`) - :class:`Interval` now supported in :func:`DataFrame.to_excel` for all Excel file types (:issue:`19242`) diff --git a/doc/source/whatsnew/v0.23.1.rst b/doc/source/whatsnew/v0.23.1.rst index 685fe1b3836bf..a98933e7f5969 100644 --- a/doc/source/whatsnew/v0.23.1.rst +++ b/doc/source/whatsnew/v0.23.1.rst @@ -106,7 +106,7 @@ Bug fixes **Data-type specific** -- Bug in :meth:`Series.str.replace()` where the method throws ``TypeError`` on Python 3.5.2 (:issue:`21078`) +- Bug in :meth:`Series.str.replace` where the method throws ``TypeError`` on Python 3.5.2 (:issue:`21078`) - Bug in :class:`Timedelta` where passing a float with a unit would prematurely round the float precision (:issue:`14156`) - Bug in :func:`pandas.testing.assert_index_equal` which raised ``AssertionError`` incorrectly, when comparing two :class:`CategoricalIndex` objects with param ``check_categorical=False`` (:issue:`19776`) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index cb12962256a55..60e77a8c5d8c5 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -418,7 +418,7 @@ Other enhancements - :func:`~DataFrame.to_parquet` now supports writing a ``DataFrame`` as a directory of parquet files partitioned by a subset of the columns when ``engine = 'pyarrow'`` (:issue:`23283`) - :meth:`Timestamp.tz_localize`, :meth:`DatetimeIndex.tz_localize`, and :meth:`Series.tz_localize` have gained the ``nonexistent`` argument for alternative handling of nonexistent times. 
See :ref:`timeseries.timezone_nonexistent` (:issue:`8917`, :issue:`24466`) - :meth:`Index.difference`, :meth:`Index.intersection`, :meth:`Index.union`, and :meth:`Index.symmetric_difference` now have an optional ``sort`` parameter to control whether the results should be sorted if possible (:issue:`17839`, :issue:`24471`) -- :meth:`read_excel()` now accepts ``usecols`` as a list of column names or callable (:issue:`18273`) +- :meth:`read_excel` now accepts ``usecols`` as a list of column names or callable (:issue:`18273`) - :meth:`MultiIndex.to_flat_index` has been added to flatten multiple levels into a single-level :class:`Index` object. - :meth:`DataFrame.to_stata` and :class:`pandas.io.stata.StataWriter117` can write mixed string columns to Stata strl format (:issue:`23633`) - :meth:`DataFrame.between_time` and :meth:`DataFrame.at_time` have gained the ``axis`` parameter (:issue:`8839`) @@ -723,8 +723,8 @@ Time values in ``dt.end_time`` and ``to_timestamp(how='end')`` The time values in :class:`Period` and :class:`PeriodIndex` objects are now set to '23:59:59.999999999' when calling :attr:`Series.dt.end_time`, :attr:`Period.end_time`, -:attr:`PeriodIndex.end_time`, :func:`Period.to_timestamp()` with ``how='end'``, -or :func:`PeriodIndex.to_timestamp()` with ``how='end'`` (:issue:`17157`) +:attr:`PeriodIndex.end_time`, :func:`Period.to_timestamp` with ``how='end'``, +or :func:`PeriodIndex.to_timestamp` with ``how='end'`` (:issue:`17157`) *Previous behavior*: @@ -840,7 +840,7 @@ then all the columns are dummy-encoded, and a :class:`SparseDataFrame` was retur In [2]: df = pd.DataFrame({"A": [1, 2], "B": ['a', 'b'], "C": ['a', 'a']}) In [3]: type(pd.get_dummies(df, sparse=True)) - Out[3]: pandas.core.frame.DataFrame + Out[3]: pandas.DataFrame In [4]: type(pd.get_dummies(df[['B', 'C']], sparse=True)) Out[4]: pandas.core.sparse.frame.SparseDataFrame @@ -1289,15 +1289,15 @@ ways of adding operator support. - Bug in :meth:`Series.get` for ``Series`` using ``ExtensionArray`` and integer index (:issue:`21257`) - :meth:`~Series.shift` now dispatches to :meth:`ExtensionArray.shift` (:issue:`22386`) -- :meth:`Series.combine()` works correctly with :class:`~pandas.api.extensions.ExtensionArray` inside of :class:`Series` (:issue:`20825`) -- :meth:`Series.combine()` with scalar argument now works for any function type (:issue:`21248`) +- :meth:`Series.combine` works correctly with :class:`~pandas.api.extensions.ExtensionArray` inside of :class:`Series` (:issue:`20825`) +- :meth:`Series.combine` with scalar argument now works for any function type (:issue:`21248`) - :meth:`Series.astype` and :meth:`DataFrame.astype` now dispatch to :meth:`ExtensionArray.astype` (:issue:`21185`). - Slicing a single row of a ``DataFrame`` with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`) - Bug when concatenating multiple ``Series`` with different extension dtypes not casting to object dtype (:issue:`22994`) - Series backed by an ``ExtensionArray`` now work with :func:`util.hash_pandas_object` (:issue:`23066`) - :meth:`DataFrame.stack` no longer converts to object dtype for DataFrames where each column has the same extension dtype. The output Series will have the same dtype as the columns (:issue:`23077`). - :meth:`Series.unstack` and :meth:`DataFrame.unstack` no longer convert extension arrays to object-dtype ndarrays. Each column in the output ``DataFrame`` will now have the same dtype as the input (:issue:`23077`). 
-- Bug when grouping :meth:`Dataframe.groupby()` and aggregating on ``ExtensionArray`` it was not returning the actual ``ExtensionArray`` dtype (:issue:`23227`). +- Bug when grouping :meth:`Dataframe.groupby` and aggregating on ``ExtensionArray`` it was not returning the actual ``ExtensionArray`` dtype (:issue:`23227`). - Bug in :func:`pandas.merge` when merging on an extension array-backed column (:issue:`23020`). @@ -1586,7 +1586,7 @@ Categorical - Constructing a :class:`CategoricalIndex` with empty values and boolean categories was raising a ``ValueError`` after a change to dtype coercion (:issue:`22702`). - Bug in :meth:`Categorical.take` with a user-provided ``fill_value`` not encoding the ``fill_value``, which could result in a ``ValueError``, incorrect results, or a segmentation fault (:issue:`23296`). - In :meth:`Series.unstack`, specifying a ``fill_value`` not present in the categories now raises a ``TypeError`` rather than ignoring the ``fill_value`` (:issue:`23284`) -- Bug when resampling :meth:`DataFrame.resample()` and aggregating on categorical data, the categorical dtype was getting lost. (:issue:`23227`) +- Bug when resampling :meth:`DataFrame.resample` and aggregating on categorical data, the categorical dtype was getting lost. (:issue:`23227`) - Bug in many methods of the ``.str``-accessor, which always failed on calling the ``CategoricalIndex.str`` constructor (:issue:`23555`, :issue:`23556`) - Bug in :meth:`Series.where` losing the categorical dtype for categorical data (:issue:`24077`) - Bug in :meth:`Categorical.apply` where ``NaN`` values could be handled unpredictably. They now remain unchanged (:issue:`24241`) @@ -1653,10 +1653,10 @@ Timedelta - Bug in :class:`Index` with numeric dtype when multiplying or dividing an array with dtype ``timedelta64`` (:issue:`22390`) - Bug in :class:`TimedeltaIndex` incorrectly allowing indexing with ``Timestamp`` object (:issue:`20464`) - Fixed bug where subtracting :class:`Timedelta` from an object-dtyped array would raise ``TypeError`` (:issue:`21980`) -- Fixed bug in adding a :class:`DataFrame` with all-`timedelta64[ns]` dtypes to a :class:`DataFrame` with all-integer dtypes returning incorrect results instead of raising ``TypeError`` (:issue:`22696`) +- Fixed bug in adding a :class:`DataFrame` with all-``timedelta64[ns]`` dtypes to a :class:`DataFrame` with all-integer dtypes returning incorrect results instead of raising ``TypeError`` (:issue:`22696`) - Bug in :class:`TimedeltaIndex` where adding a timezone-aware datetime scalar incorrectly returned a timezone-naive :class:`DatetimeIndex` (:issue:`23215`) - Bug in :class:`TimedeltaIndex` where adding ``np.timedelta64('NaT')`` incorrectly returned an all-``NaT`` :class:`DatetimeIndex` instead of an all-``NaT`` :class:`TimedeltaIndex` (:issue:`23215`) -- Bug in :class:`Timedelta` and :func:`to_timedelta()` have inconsistencies in supported unit string (:issue:`21762`) +- Bug in :class:`Timedelta` and :func:`to_timedelta` have inconsistencies in supported unit string (:issue:`21762`) - Bug in :class:`TimedeltaIndex` division where dividing by another :class:`TimedeltaIndex` raised ``TypeError`` instead of returning a :class:`Float64Index` (:issue:`23829`, :issue:`22631`) - Bug in :class:`TimedeltaIndex` comparison operations where comparing against non-``Timedelta``-like objects would raise ``TypeError`` instead of returning all-``False`` for ``__eq__`` and all-``True`` for ``__ne__`` (:issue:`24056`) - Bug in :class:`Timedelta` comparisons when comparing with a ``Tick`` object 
incorrectly raising ``TypeError`` (:issue:`24710`) @@ -1803,39 +1803,39 @@ IO - Bug in :func:`read_csv` in which unicode column names were not being properly recognized with Python 2.x (:issue:`13253`) - Bug in :meth:`DataFrame.to_sql` when writing timezone aware data (``datetime64[ns, tz]`` dtype) would raise a ``TypeError`` (:issue:`9086`) - Bug in :meth:`DataFrame.to_sql` where a naive :class:`DatetimeIndex` would be written as ``TIMESTAMP WITH TIMEZONE`` type in supported databases, e.g. PostgreSQL (:issue:`23510`) -- Bug in :meth:`read_excel()` when ``parse_cols`` is specified with an empty dataset (:issue:`9208`) -- :func:`read_html()` no longer ignores all-whitespace ``<tr>`` within ``<thead>`` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`) -- :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`) -- :func:`read_csv()` and :func:`read_table()` will throw ``UnicodeError`` and not coredump on badly encoded strings (:issue:`22748`) -- :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`) -- Bug in :func:`read_csv()` in which memory management was prematurely optimized for the C engine when the data was being read in chunks (:issue:`23509`) -- Bug in :func:`read_csv()` in which unnamed columns were being improperly identified when extracting a multi-index (:issue:`23687`) -- :func:`read_sas()` will parse numbers in sas7bdat-files that have width less than 8 bytes correctly. (:issue:`21616`) -- :func:`read_sas()` will correctly parse sas7bdat files with many columns (:issue:`22628`) -- :func:`read_sas()` will correctly parse sas7bdat files with data page types having also bit 7 set (so page type is 128 + 256 = 384) (:issue:`16615`) -- Bug in :func:`read_sas()` in which an incorrect error was raised on an invalid file format. (:issue:`24548`) +- Bug in :meth:`read_excel` when ``parse_cols`` is specified with an empty dataset (:issue:`9208`) +- :func:`read_html` no longer ignores all-whitespace ``<tr>`` within ``<thead>`` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`) +- :func:`read_excel` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`) +- :func:`read_csv` and :func:`read_table` will throw ``UnicodeError`` and not coredump on badly encoded strings (:issue:`22748`) +- :func:`read_csv` will correctly parse timezone-aware datetimes (:issue:`22256`) +- Bug in :func:`read_csv` in which memory management was prematurely optimized for the C engine when the data was being read in chunks (:issue:`23509`) +- Bug in :func:`read_csv` in which unnamed columns were being improperly identified when extracting a multi-index (:issue:`23687`) +- :func:`read_sas` will parse numbers in sas7bdat-files that have width less than 8 bytes correctly. (:issue:`21616`) +- :func:`read_sas` will correctly parse sas7bdat files with many columns (:issue:`22628`) +- :func:`read_sas` will correctly parse sas7bdat files with data page types having also bit 7 set (so page type is 128 + 256 = 384) (:issue:`16615`) +- Bug in :func:`read_sas` in which an incorrect error was raised on an invalid file format. 
(:issue:`24548`) - Bug in :meth:`detect_client_encoding` where potential ``IOError`` goes unhandled when importing in a mod_wsgi process due to restricted access to stdout. (:issue:`21552`) -- Bug in :func:`DataFrame.to_html()` with ``index=False`` misses truncation indicators (...) on truncated DataFrame (:issue:`15019`, :issue:`22783`) -- Bug in :func:`DataFrame.to_html()` with ``index=False`` when both columns and row index are ``MultiIndex`` (:issue:`22579`) -- Bug in :func:`DataFrame.to_html()` with ``index_names=False`` displaying index name (:issue:`22747`) -- Bug in :func:`DataFrame.to_html()` with ``header=False`` not displaying row index names (:issue:`23788`) -- Bug in :func:`DataFrame.to_html()` with ``sparsify=False`` that caused it to raise ``TypeError`` (:issue:`22887`) -- Bug in :func:`DataFrame.to_string()` that broke column alignment when ``index=False`` and width of first column's values is greater than the width of first column's header (:issue:`16839`, :issue:`13032`) -- Bug in :func:`DataFrame.to_string()` that caused representations of :class:`DataFrame` to not take up the whole window (:issue:`22984`) +- Bug in :func:`DataFrame.to_html` with ``index=False`` misses truncation indicators (...) on truncated DataFrame (:issue:`15019`, :issue:`22783`) +- Bug in :func:`DataFrame.to_html` with ``index=False`` when both columns and row index are ``MultiIndex`` (:issue:`22579`) +- Bug in :func:`DataFrame.to_html` with ``index_names=False`` displaying index name (:issue:`22747`) +- Bug in :func:`DataFrame.to_html` with ``header=False`` not displaying row index names (:issue:`23788`) +- Bug in :func:`DataFrame.to_html` with ``sparsify=False`` that caused it to raise ``TypeError`` (:issue:`22887`) +- Bug in :func:`DataFrame.to_string` that broke column alignment when ``index=False`` and width of first column's values is greater than the width of first column's header (:issue:`16839`, :issue:`13032`) +- Bug in :func:`DataFrame.to_string` that caused representations of :class:`DataFrame` to not take up the whole window (:issue:`22984`) - Bug in :func:`DataFrame.to_csv` where a single level MultiIndex incorrectly wrote a tuple. Now just the value of the index is written (:issue:`19589`). 
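The ``to_string`` and ``to_csv`` entries just above lend themselves to a quick demonstration. A minimal sketch with a made-up two-row frame (the frame and its values are illustrative only):

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({"col": [1.5, 22.75]})

    # index=False now keeps the column aligned even when the values
    # are wider than the header (GH 16839).
    print(df.to_string(index=False))

    # A single-level MultiIndex is written to CSV as the bare value,
    # not as a tuple like "('a',)" (GH 19589).
    df.index = pd.MultiIndex.from_tuples([("a",), ("b",)])
    print(df.to_csv())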
- :class:`HDFStore` will raise ``ValueError`` when the ``format`` kwarg is passed to the constructor (:issue:`13291`) - Bug in :meth:`HDFStore.append` when appending a :class:`DataFrame` with an empty string column and ``min_itemsize`` < 8 (:issue:`12242`) -- Bug in :func:`read_csv()` in which memory leaks occurred in the C engine when parsing ``NaN`` values due to insufficient cleanup on completion or error (:issue:`21353`) -- Bug in :func:`read_csv()` in which incorrect error messages were being raised when ``skipfooter`` was passed in along with ``nrows``, ``iterator``, or ``chunksize`` (:issue:`23711`) -- Bug in :func:`read_csv()` in which :class:`MultiIndex` index names were being improperly handled in the cases when they were not provided (:issue:`23484`) -- Bug in :func:`read_csv()` in which unnecessary warnings were being raised when the dialect's values conflicted with the default arguments (:issue:`23761`) -- Bug in :func:`read_html()` in which the error message was not displaying the valid flavors when an invalid one was provided (:issue:`23549`) -- Bug in :meth:`read_excel()` in which extraneous header names were extracted, even though none were specified (:issue:`11733`) -- Bug in :meth:`read_excel()` in which column names were not being properly converted to string sometimes in Python 2.x (:issue:`23874`) -- Bug in :meth:`read_excel()` in which ``index_col=None`` was not being respected and parsing index columns anyway (:issue:`18792`, :issue:`20480`) -- Bug in :meth:`read_excel()` in which ``usecols`` was not being validated for proper column names when passed in as a string (:issue:`20480`) +- Bug in :func:`read_csv` in which memory leaks occurred in the C engine when parsing ``NaN`` values due to insufficient cleanup on completion or error (:issue:`21353`) +- Bug in :func:`read_csv` in which incorrect error messages were being raised when ``skipfooter`` was passed in along with ``nrows``, ``iterator``, or ``chunksize`` (:issue:`23711`) +- Bug in :func:`read_csv` in which :class:`MultiIndex` index names were being improperly handled in the cases when they were not provided (:issue:`23484`) +- Bug in :func:`read_csv` in which unnecessary warnings were being raised when the dialect's values conflicted with the default arguments (:issue:`23761`) +- Bug in :func:`read_html` in which the error message was not displaying the valid flavors when an invalid one was provided (:issue:`23549`) +- Bug in :meth:`read_excel` in which extraneous header names were extracted, even though none were specified (:issue:`11733`) +- Bug in :meth:`read_excel` in which column names were not being properly converted to string sometimes in Python 2.x (:issue:`23874`) +- Bug in :meth:`read_excel` in which ``index_col=None`` was not being respected and parsing index columns anyway (:issue:`18792`, :issue:`20480`) +- Bug in :meth:`read_excel` in which ``usecols`` was not being validated for proper column names when passed in as a string (:issue:`20480`) - Bug in :meth:`DataFrame.to_dict` when the resulting dict contains non-Python scalars in the case of numeric data (:issue:`23753`) -- :func:`DataFrame.to_string()`, :func:`DataFrame.to_html()`, :func:`DataFrame.to_latex()` will correctly format output when a string is passed as the ``float_format`` argument (:issue:`21625`, :issue:`22270`) +- :func:`DataFrame.to_string`, :func:`DataFrame.to_html`, :func:`DataFrame.to_latex` will correctly format output when a string is passed as the ``float_format`` argument (:issue:`21625`, :issue:`22270`) - Bug in 
:func:`read_csv` that caused it to raise ``OverflowError`` when trying to use 'inf' as ``na_value`` with integer index column (:issue:`17128`) - Bug in :func:`read_csv` that caused the C engine on Python 3.6+ on Windows to improperly read CSV filenames with accented or special characters (:issue:`15086`) - Bug in :func:`read_fwf` in which the compression type of a file was not being properly inferred (:issue:`22199`) @@ -1843,7 +1843,7 @@ IO - Bug in :meth:`DataFrame.to_stata`, :class:`pandas.io.stata.StataWriter` and :class:`pandas.io.stata.StataWriter117` where a exception would leave a partially written and invalid dta file (:issue:`23573`) - Bug in :meth:`DataFrame.to_stata` and :class:`pandas.io.stata.StataWriter117` that produced invalid files when using strLs with non-ASCII characters (:issue:`23573`) - Bug in :class:`HDFStore` that caused it to raise ``ValueError`` when reading a Dataframe in Python 3 from fixed format written in Python 2 (:issue:`24510`) -- Bug in :func:`DataFrame.to_string()` and more generally in the floating ``repr`` formatter. Zeros were not trimmed if ``inf`` was present in a columns while it was the case with NA values. Zeros are now trimmed as in the presence of NA (:issue:`24861`). +- Bug in :func:`DataFrame.to_string` and more generally in the floating ``repr`` formatter. Zeros were not trimmed if ``inf`` was present in a columns while it was the case with NA values. Zeros are now trimmed as in the presence of NA (:issue:`24861`). - Bug in the ``repr`` when truncating the number of columns and having a wide last column (:issue:`24849`). Plotting diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index 9a8c2ee5d00fa..d8f2e17cb9e4f 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -22,7 +22,7 @@ Fixed regressions - Fixed issue in ``DataFrame`` construction with passing a mixed list of mixed types could segfault. (:issue:`25075`) - Fixed regression in :meth:`DataFrame.apply` causing ``RecursionError`` when ``dict``-like classes were passed as argument. (:issue:`25196`) - Fixed regression in :meth:`DataFrame.replace` where ``regex=True`` was only replacing patterns matching the start of the string (:issue:`25259`) -- Fixed regression in :meth:`DataFrame.duplicated()`, where empty dataframe was not returning a boolean dtyped Series. (:issue:`25184`) +- Fixed regression in :meth:`DataFrame.duplicated`, where empty dataframe was not returning a boolean dtyped Series. 
(:issue:`25184`) - Fixed regression in :meth:`Series.min` and :meth:`Series.max` where ``numeric_only=True`` was ignored when the ``Series`` contained ``Categorical`` data (:issue:`25299`) - Fixed regression in subtraction between :class:`Series` objects with ``datetime64[ns]`` dtype incorrectly raising ``OverflowError`` when the ``Series`` on the right contains null values (:issue:`25317`) - Fixed regression in :class:`TimedeltaIndex` where ``np.sum(index)`` incorrectly returned a zero-dimensional object instead of a scalar (:issue:`25282`) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 5cf5623f73036..bddb47cd3f629 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -988,7 +988,7 @@ Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Significant speedup in :class:`SparseArray` initialization that benefits most operations, fixing performance regression introduced in v0.20.0 (:issue:`24985`) -- :meth:`DataFrame.to_stata()` is now faster when outputting data with any string or non-native endian columns (:issue:`25045`) +- :meth:`DataFrame.to_stata` is now faster when outputting data with any string or non-native endian columns (:issue:`25045`) - Improved performance of :meth:`Series.searchsorted`. The speedup is especially large when the dtype is int8/int16/int32 and the searched key is within the integer bounds for the dtype (:issue:`22034`) - Improved performance of :meth:`.GroupBy.quantile` (:issue:`20405`) @@ -1082,13 +1082,13 @@ Numeric - Bug in :meth:`~pandas.eval` when comparing floats with scalar operators, for example: ``x < -0.1`` (:issue:`25928`) - Fixed bug where casting all-boolean array to integer extension array failed (:issue:`25211`) - Bug in ``divmod`` with a :class:`Series` object containing zeros incorrectly raising ``AttributeError`` (:issue:`26987`) -- Inconsistency in :class:`Series` floor-division (`//`) and ``divmod`` filling positive//zero with ``NaN`` instead of ``Inf`` (:issue:`27321`) +- Inconsistency in :class:`Series` floor-division (``//``) and ``divmod`` filling positive//zero with ``NaN`` instead of ``Inf`` (:issue:`27321`) - Conversion ^^^^^^^^^^ -- Bug in :func:`DataFrame.astype()` when passing a dict of columns and types the ``errors`` parameter was ignored. (:issue:`25905`) +- Bug in :func:`DataFrame.astype` when passing a dict of columns and types the ``errors`` parameter was ignored. 
(:issue:`25905`) - Strings @@ -1146,7 +1146,7 @@ MultiIndex IO ^^ -- Bug in :func:`DataFrame.to_html()` where values were truncated using display options instead of outputting the full content (:issue:`17004`) +- Bug in :func:`DataFrame.to_html` where values were truncated using display options instead of outputting the full content (:issue:`17004`) - Fixed bug in missing text when using :meth:`to_clipboard` if copying utf-16 characters in Python 3 on Windows (:issue:`25040`) - Bug in :func:`read_json` for ``orient='table'`` when it tries to infer dtypes by default, which is not applicable as dtypes are already defined in the JSON schema (:issue:`21345`) - Bug in :func:`read_json` for ``orient='table'`` and float index, as it infers index dtype by default, which is not applicable because index dtype is already defined in the JSON schema (:issue:`25433`) @@ -1159,7 +1159,7 @@ IO - Bug in :meth:`DataFrame.to_html` where header numbers would ignore display options when rounding (:issue:`17280`) - Bug in :func:`read_hdf` where reading a table from an HDF5 file written directly with PyTables fails with a ``ValueError`` when using a sub-selection via the ``start`` or ``stop`` arguments (:issue:`11188`) - Bug in :func:`read_hdf` not properly closing store after a ``KeyError`` is raised (:issue:`25766`) -- Improved the explanation for the failure when value labels are repeated in Stata dta files and suggested work-arounds (:issue:`25772`) +- Improved the explanation for the failure when value labels are repeated in Stata dta files and suggested workarounds (:issue:`25772`) - Improved :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` to read incorrectly formatted 118 format files saved by Stata (:issue:`25960`) - Improved the ``col_space`` parameter in :meth:`DataFrame.to_html` to accept a string so CSS length values can be set correctly (:issue:`25941`) - Fixed bug in loading objects from S3 that contain ``#`` characters in the URL (:issue:`25945`) @@ -1171,7 +1171,7 @@ IO - Fixed bug in :func:`pandas.read_csv` where a BOM would result in incorrect parsing using engine='python' (:issue:`26545`) - :func:`read_excel` now raises a ``ValueError`` when input is of type :class:`pandas.io.excel.ExcelFile` and ``engine`` param is passed since :class:`pandas.io.excel.ExcelFile` has an engine defined (:issue:`26566`) - Bug while selecting from :class:`HDFStore` with ``where=''`` specified (:issue:`26610`). -- Fixed bug in :func:`DataFrame.to_excel()` where custom objects (i.e. ``PeriodIndex``) inside merged cells were not being converted into types safe for the Excel writer (:issue:`27006`) +- Fixed bug in :func:`DataFrame.to_excel` where custom objects (i.e. ``PeriodIndex``) inside merged cells were not being converted into types safe for the Excel writer (:issue:`27006`) - Bug in :meth:`read_hdf` where reading a timezone aware :class:`DatetimeIndex` would raise a ``TypeError`` (:issue:`11926`) - Bug in :meth:`to_msgpack` and :meth:`read_msgpack` which would raise a ``ValueError`` rather than a ``FileNotFoundError`` for an invalid path (:issue:`27160`) - Fixed bug in :meth:`DataFrame.to_parquet` which would raise a ``ValueError`` when the dataframe had no columns (:issue:`27339`) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 94a8ee7cd1a5d..98cb9c4ad7b45 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -414,7 +414,7 @@ Extended verbose info output for :class:`~pandas.DataFrame` ... "text_col": ["a", "b", "c"], ... 
"float_col": [0.0, 0.1, 0.2]}) In [2]: df.info(verbose=True) - + RangeIndex: 3 entries, 0 to 2 Data columns (total 3 columns): int_col 3 non-null int64 @@ -900,7 +900,7 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more. - Removed ``pandas.plotting._matplotlib.tsplot``, use :meth:`Series.plot` instead (:issue:`19980`) - ``pandas.tseries.converter.register`` has been moved to :func:`pandas.plotting.register_matplotlib_converters` (:issue:`18307`) - :meth:`Series.plot` no longer accepts positional arguments, pass keyword arguments instead (:issue:`30003`) -- :meth:`DataFrame.hist` and :meth:`Series.hist` no longer allows ``figsize="default"``, specify figure size by passinig a tuple instead (:issue:`30003`) +- :meth:`DataFrame.hist` and :meth:`Series.hist` no longer allows ``figsize="default"``, specify figure size by passing a tuple instead (:issue:`30003`) - Floordiv of integer-dtyped array by :class:`Timedelta` now raises ``TypeError`` (:issue:`21036`) - :class:`TimedeltaIndex` and :class:`DatetimeIndex` no longer accept non-nanosecond dtype strings like "timedelta64" or "datetime64", use "timedelta64[ns]" and "datetime64[ns]" instead (:issue:`24806`) - Changed the default "skipna" argument in :func:`pandas.api.types.infer_dtype` from ``False`` to ``True`` (:issue:`24050`) diff --git a/doc/source/whatsnew/v1.0.2.rst b/doc/source/whatsnew/v1.0.2.rst index 0a5716f52c836..7354e2bafacc0 100644 --- a/doc/source/whatsnew/v1.0.2.rst +++ b/doc/source/whatsnew/v1.0.2.rst @@ -47,7 +47,7 @@ Fixed regressions .. --------------------------------------------------------------------------- -Indexing with nullable boolean arrays +Indexing with nullable Boolean arrays ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Previously indexing with a nullable Boolean array containing ``NA`` would raise a ``ValueError``, however this is now permitted with ``NA`` being treated as ``False``. (:issue:`31503`) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 37d021efddf0b..b199b113d26f2 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1083,7 +1083,7 @@ IO timestamps with ``version="2.0"`` (:issue:`31652`). 
- Bug in :func:`read_csv` was raising ``TypeError`` when ``sep=None`` was used in combination with ``comment`` keyword (:issue:`31396`) - Bug in :class:`HDFStore` that caused it to set to ``int64`` the dtype of a ``datetime64`` column when reading a :class:`DataFrame` in Python 3 from fixed format written in Python 2 (:issue:`31750`) -- :func:`read_sas()` now handles dates and datetimes larger than :attr:`Timestamp.max` returning them as :class:`datetime.datetime` objects (:issue:`20927`) +- :func:`read_sas` now handles dates and datetimes larger than :attr:`Timestamp.max` returning them as :class:`datetime.datetime` objects (:issue:`20927`) - Bug in :meth:`DataFrame.to_json` where ``Timedelta`` objects would not be serialized correctly with ``date_format="iso"`` (:issue:`28256`) - :func:`read_csv` will raise a ``ValueError`` when the column names passed in ``parse_dates`` are missing in the :class:`Dataframe` (:issue:`31251`) - Bug in :func:`read_excel` where a UTF-8 string with a high surrogate would cause a segmentation violation (:issue:`23809`) @@ -1174,13 +1174,13 @@ Reshaping - :meth:`DataFrame.agg` now provides more descriptive ``SpecificationError`` message when attempting to aggregate a non-existent column (:issue:`32755`) - Bug in :meth:`DataFrame.unstack` when :class:`MultiIndex` columns and :class:`MultiIndex` rows were used (:issue:`32624`, :issue:`24729` and :issue:`28306`) - Appending a dictionary to a :class:`DataFrame` without passing ``ignore_index=True`` will raise ``TypeError: Can only append a dict if ignore_index=True`` instead of ``TypeError: Can only append a :class:`Series` if ignore_index=True or if the :class:`Series` has a name`` (:issue:`30871`) -- Bug in :meth:`DataFrame.corrwith()`, :meth:`DataFrame.memory_usage()`, :meth:`DataFrame.dot()`, - :meth:`DataFrame.idxmin()`, :meth:`DataFrame.idxmax()`, :meth:`DataFrame.duplicated()`, :meth:`DataFrame.isin()`, - :meth:`DataFrame.count()`, :meth:`Series.explode()`, :meth:`Series.asof()` and :meth:`DataFrame.asof()` not +- Bug in :meth:`DataFrame.corrwith`, :meth:`DataFrame.memory_usage`, :meth:`DataFrame.dot`, + :meth:`DataFrame.idxmin`, :meth:`DataFrame.idxmax`, :meth:`DataFrame.duplicated`, :meth:`DataFrame.isin`, + :meth:`DataFrame.count`, :meth:`Series.explode`, :meth:`Series.asof` and :meth:`DataFrame.asof` not returning subclassed types. 
(:issue:`31331`) - Bug in :func:`concat` was not allowing for concatenation of :class:`DataFrame` and :class:`Series` with duplicate keys (:issue:`33654`) - Bug in :func:`cut` raised an error when the argument ``labels`` contains duplicates (:issue:`33141`) -- Ensure only named functions can be used in :func:`eval()` (:issue:`32460`) +- Ensure only named functions can be used in :func:`eval` (:issue:`32460`) - Bug in :meth:`Dataframe.aggregate` and :meth:`Series.aggregate` was causing a recursive loop in some cases (:issue:`34224`) - Fixed bug in :func:`melt` where melting :class:`MultiIndex` columns with ``col_level > 0`` would raise a ``KeyError`` on ``id_vars`` (:issue:`34129`) - Bug in :meth:`Series.where` with an empty :class:`Series` and empty ``cond`` having non-bool dtype (:issue:`34592`) @@ -1203,7 +1203,7 @@ ExtensionArray - Fixed bug in :func:`concat` when concatenating :class:`DataFrame` objects with non-overlapping columns resulting in object-dtype columns rather than preserving the extension dtype (:issue:`27692`, :issue:`33027`) - Fixed bug where :meth:`StringArray.isna` would return ``False`` for NA values when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`33655`) - Fixed bug in :class:`Series` construction with EA dtype and index but no data or scalar data fails (:issue:`26469`) -- Fixed bug that caused :meth:`Series.__repr__()` to crash for extension types whose elements are multidimensional arrays (:issue:`33770`). +- Fixed bug that caused :meth:`Series.__repr__` to crash for extension types whose elements are multidimensional arrays (:issue:`33770`). - Fixed bug where :meth:`Series.update` would raise a ``ValueError`` for ``ExtensionArray`` dtypes with missing values (:issue:`33980`) - Fixed bug where :meth:`StringArray.memory_usage` was not implemented (:issue:`33963`) - Fixed bug where :meth:`DataFrameGroupBy` would ignore the ``min_count`` argument for aggregations on nullable Boolean dtypes (:issue:`34051`) diff --git a/doc/source/whatsnew/v1.1.4.rst b/doc/source/whatsnew/v1.1.4.rst index ef0c4d741ca58..c6fda04070240 100644 --- a/doc/source/whatsnew/v1.1.4.rst +++ b/doc/source/whatsnew/v1.1.4.rst @@ -31,7 +31,7 @@ Fixed regressions - Fixed regression in setitem with :meth:`DataFrame.iloc` which raised error when trying to set a value while filtering with a boolean list (:issue:`36741`) - Fixed regression in setitem with a Series getting aligned before setting the values (:issue:`37427`) - Fixed regression in :attr:`MultiIndex.is_monotonic_increasing` returning wrong results with ``NaN`` in at least one of the levels (:issue:`37220`) -- Fixed regression in inplace arithmetic operation (`+=`) on a Series not updating the parent DataFrame/Series (:issue:`36373`) +- Fixed regression in inplace arithmetic operation (``+=``) on a Series not updating the parent DataFrame/Series (:issue:`36373`) .. 
--------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.2.1.rst b/doc/source/whatsnew/v1.2.1.rst index 25e616dcdf37f..cb344cc728566 100644 --- a/doc/source/whatsnew/v1.2.1.rst +++ b/doc/source/whatsnew/v1.2.1.rst @@ -22,7 +22,7 @@ Fixed regressions - Fixed regression in ``DataFrame.__setitem__`` raising ``ValueError`` when expanding :class:`DataFrame` and new column is from type ``"0 - name"`` (:issue:`39010`) - Fixed regression in setting with :meth:`DataFrame.loc` raising ``ValueError`` when :class:`DataFrame` has unsorted :class:`MultiIndex` columns and indexer is a scalar (:issue:`38601`) - Fixed regression in setting with :meth:`DataFrame.loc` raising ``KeyError`` with :class:`MultiIndex` and list-like columns indexer enlarging :class:`DataFrame` (:issue:`39147`) -- Fixed regression in :meth:`~DataFrame.groupby()` with :class:`Categorical` grouping column not showing unused categories for ``grouped.indices`` (:issue:`38642`) +- Fixed regression in :meth:`~DataFrame.groupby` with :class:`Categorical` grouping column not showing unused categories for ``grouped.indices`` (:issue:`38642`) - Fixed regression in :meth:`.DataFrameGroupBy.sem` and :meth:`.SeriesGroupBy.sem` where the presence of non-numeric columns would cause an error instead of being dropped (:issue:`38774`) - Fixed regression in :meth:`.DataFrameGroupBy.diff` raising for ``int8`` and ``int16`` columns (:issue:`39050`) - Fixed regression in :meth:`DataFrame.groupby` when aggregating an ``ExtensionDType`` that could fail for non-numeric values (:issue:`38980`) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 17aab87b93f8e..0e2d487a89ff5 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -975,7 +975,7 @@ Numeric - Bug in :class:`Series` and :class:`DataFrame` reductions with methods ``any`` and ``all`` not returning Boolean results for object data (:issue:`12863`, :issue:`35450`, :issue:`27709`) - Bug in :meth:`Series.clip` would fail if the Series contains NA values and has nullable int or float as a data type (:issue:`40851`) - Bug in :meth:`UInt64Index.where` and :meth:`UInt64Index.putmask` with an ``np.int64`` dtype ``other`` incorrectly raising ``TypeError`` (:issue:`41974`) -- Bug in :meth:`DataFrame.agg()` not sorting the aggregated axis in the order of the provided aggregation functions when one or more aggregation function fails to produce results (:issue:`33634`) +- Bug in :meth:`DataFrame.agg` not sorting the aggregated axis in the order of the provided aggregation functions when one or more aggregation function fails to produce results (:issue:`33634`) - Bug in :meth:`DataFrame.clip` not interpreting missing values as no threshold (:issue:`40420`) Conversion diff --git a/doc/source/whatsnew/v1.3.1.rst b/doc/source/whatsnew/v1.3.1.rst index a57995eb0db9a..f3b21554e668c 100644 --- a/doc/source/whatsnew/v1.3.1.rst +++ b/doc/source/whatsnew/v1.3.1.rst @@ -14,7 +14,7 @@ including other versions of pandas. 
Fixed regressions ~~~~~~~~~~~~~~~~~ -- Pandas could not be built on PyPy (:issue:`42355`) +- pandas could not be built on PyPy (:issue:`42355`) - :class:`DataFrame` constructed with an older version of pandas could not be unpickled (:issue:`42345`) - Performance regression in constructing a :class:`DataFrame` from a dictionary of dictionaries (:issue:`42248`) - Fixed regression in :meth:`DataFrame.agg` dropping values when the DataFrame had an Extension Array dtype, a duplicate index, and ``axis=1`` (:issue:`42380`) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index ec0b04f8b3b77..7b1aef07e5f00 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -607,7 +607,7 @@ Deprecated Int64Index, UInt64Index & Float64Index :class:`Int64Index`, :class:`UInt64Index` and :class:`Float64Index` have been deprecated in favor of the base :class:`Index` class and will be removed in -Pandas 2.0 (:issue:`43028`). +pandas 2.0 (:issue:`43028`). For constructing a numeric index, you can use the base :class:`Index` class instead specifying the data type (which will also work on older pandas @@ -1045,7 +1045,7 @@ Reshaping - Bug in :meth:`DataFrame.stack` with ``ExtensionDtype`` columns incorrectly raising (:issue:`43561`) - Bug in :func:`merge` raising ``KeyError`` when joining over differently named indexes with on keywords (:issue:`45094`) - Bug in :meth:`Series.unstack` with object doing unwanted type inference on resulting columns (:issue:`44595`) -- Bug in :meth:`MultiIndex.join()` with overlapping ``IntervalIndex`` levels (:issue:`44096`) +- Bug in :meth:`MultiIndex.join` with overlapping ``IntervalIndex`` levels (:issue:`44096`) - Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` results is different ``dtype`` based on ``regex`` parameter (:issue:`44864`) - Bug in :meth:`DataFrame.pivot` with ``index=None`` when the :class:`DataFrame` index was a :class:`MultiIndex` (:issue:`23955`) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 8fa1361cc30c1..43aa63c284f38 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -63,7 +63,7 @@ We recommend installing the latest version of PyArrow to access the most recentl DataFrame interchange protocol implementation ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Pandas now implement the DataFrame interchange API spec. +pandas now implements the DataFrame interchange API spec. 
See the full details on the API at https://data-apis.org/dataframe-protocol/latest/index.html The protocol consists of two parts: @@ -383,7 +383,7 @@ Other enhancements - Added ``validate`` argument to :meth:`DataFrame.join` (:issue:`46622`) - Added ``numeric_only`` argument to :meth:`.Resampler.sum`, :meth:`.Resampler.prod`, :meth:`.Resampler.min`, :meth:`.Resampler.max`, :meth:`.Resampler.first`, and :meth:`.Resampler.last` (:issue:`46442`) - ``times`` argument in :class:`.ExponentialMovingWindow` now accepts ``np.timedelta64`` (:issue:`47003`) -- :class:`.DataError`, :class:`.SpecificationError`, :class:`.SettingWithCopyError`, :class:`.SettingWithCopyWarning`, :class:`.NumExprClobberingError`, :class:`.UndefinedVariableError`, :class:`.IndexingError`, :class:`.PyperclipException`, :class:`.PyperclipWindowsException`, :class:`.CSSWarning`, :class:`.PossibleDataLossError`, :class:`.ClosedFileError`, :class:`.IncompatibilityWarning`, :class:`.AttributeConflictWarning`, :class:`.DatabaseError`, :class:`.PossiblePrecisionLoss`, :class:`.ValueLabelTypeMismatch`, :class:`.InvalidColumnName`, and :class:`.CategoricalConversionWarning` are now exposed in ``pandas.errors`` (:issue:`27656`) +- :class:`.DataError`, :class:`.SpecificationError`, ``SettingWithCopyError``, ``SettingWithCopyWarning``, :class:`.NumExprClobberingError`, :class:`.UndefinedVariableError`, :class:`.IndexingError`, :class:`.PyperclipException`, :class:`.PyperclipWindowsException`, :class:`.CSSWarning`, :class:`.PossibleDataLossError`, :class:`.ClosedFileError`, :class:`.IncompatibilityWarning`, :class:`.AttributeConflictWarning`, :class:`.DatabaseError`, :class:`.PossiblePrecisionLoss`, :class:`.ValueLabelTypeMismatch`, :class:`.InvalidColumnName`, and :class:`.CategoricalConversionWarning` are now exposed in ``pandas.errors`` (:issue:`27656`) - Added ``check_like`` argument to :func:`testing.assert_series_equal` (:issue:`47247`) - Add support for :meth:`.DataFrameGroupBy.ohlc` and :meth:`.SeriesGroupBy.ohlc` for extension array dtypes (:issue:`37493`) - Allow reading compressed SAS files with :func:`read_sas` (e.g., ``.sas7bdat.gz`` files) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index be63da09846c8..ddcd69c3fd962 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -57,7 +57,7 @@ can it now take all numpy numeric dtypes, i.e. pd.Index([1, 2, 3], dtype=np.uint16) pd.Index([1, 2, 3], dtype=np.float32) -The ability for :class:`Index` to hold the numpy numeric dtypes has meant some changes in Pandas +The ability for :class:`Index` to hold the numpy numeric dtypes has meant some changes in pandas functionality. In particular, operations that previously were forced to create 64-bit indexes, can now create indexes with lower bit sizes, e.g. 32-bit indexes. 
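The v2.0.0 paragraph above, together with the v1.5.0 interchange-protocol note, can be seen interactively. A minimal sketch (assuming pandas >= 2.0 for the low-bit index; ``from_dataframe`` accepts any object exposing ``__dataframe__``):

.. code-block:: python

    import numpy as np
    import pandas as pd

    # Index can now hold any numpy numeric dtype directly instead of
    # being upcast to a 64-bit variant.
    pd.Index([1, 2, 3], dtype=np.uint16).dtype  # uint16

    # Round-trip through the DataFrame interchange protocol.
    df = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})
    pd.api.interchange.from_dataframe(df)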
@@ -732,7 +732,7 @@ or, if your formats are all ISO8601 (but possibly not identically-formatted) :: Other API changes ^^^^^^^^^^^^^^^^^ -- The ``freq``, ``tz``, ``nanosecond``, and ``unit`` keywords in the :class:`Timestamp` constructor are now keyword-only (:issue:`45307`, :issue:`32526`) +- The ``tz``, ``nanosecond``, and ``unit`` keywords in the :class:`Timestamp` constructor are now keyword-only (:issue:`45307`, :issue:`32526`) - Passing ``nanoseconds`` greater than 999 or less than 0 in :class:`Timestamp` now raises a ``ValueError`` (:issue:`48538`, :issue:`48255`) - :func:`read_csv`: specifying an incorrect number of columns with ``index_col`` of now raises ``ParserError`` instead of ``IndexError`` when using the c parser. - Default value of ``dtype`` in :func:`get_dummies` is changed to ``bool`` from ``uint8`` (:issue:`45848`) @@ -1130,7 +1130,7 @@ Performance improvements - Performance improvement in :func:`to_datetime` when using ``'%Y%m%d'`` format (:issue:`17410`) - Performance improvement in :func:`to_datetime` when format is given or can be inferred (:issue:`50465`) - Performance improvement in :meth:`Series.median` for nullable dtypes (:issue:`50838`) -- Performance improvement in :func:`read_csv` when passing :func:`to_datetime` lambda-function to ``date_parser`` and inputs have mixed timezone offsetes (:issue:`35296`) +- Performance improvement in :func:`read_csv` when passing :func:`to_datetime` lambda-function to ``date_parser`` and inputs have mixed timezone offsets (:issue:`35296`) - Performance improvement in :func:`isna` and :func:`isnull` (:issue:`50658`) - Performance improvement in :meth:`.SeriesGroupBy.value_counts` with categorical dtype (:issue:`46202`) - Fixed a reference leak in :func:`read_hdf` (:issue:`37441`) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 51b4c4f297b07..495c8244142f9 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -67,7 +67,7 @@ DataFrame reductions preserve extension dtypes In previous versions of pandas, the results of DataFrame reductions (:meth:`DataFrame.sum` :meth:`DataFrame.mean` etc.) had NumPy dtypes, even when the DataFrames -were of extension dtypes. Pandas can now keep the dtypes when doing reductions over DataFrame +were of extension dtypes. pandas can now keep the dtypes when doing reductions over DataFrame columns with a common dtype (:issue:`52788`). *Old Behavior* @@ -432,7 +432,7 @@ In a future version, these will raise an error and you should cast to a common d In [3]: ser[0] = 'not an int64' FutureWarning: - Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. + Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'not an int64' has dtype incompatible with int64, please explicitly cast to a compatible dtype first. In [4]: ser diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index 57b83a294963b..73b1103c1bd37 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -42,4 +42,4 @@ Bug fixes Contributors ~~~~~~~~~~~~ -.. contributors:: v2.1.3..v2.1.4|HEAD +.. contributors:: v2.1.3..v2.1.4 diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index d1481639ca5a0..329ef2859f56f 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -1,7 +1,7 @@ .. 
_whatsnew_220: -What's new in 2.2.0 (Month XX, 2024) ------------------------------------- +What's new in 2.2.0 (January 19, 2024) +-------------------------------------- These are the changes in pandas 2.2.0. See :ref:`release` for a full changelog including other versions of pandas. @@ -123,7 +123,7 @@ nullability handling. with pg_dbapi.connect(uri) as conn: df.to_sql("pandas_table", conn, index=False) - # for roundtripping + # for round-tripping with pg_dbapi.connect(uri) as conn: df2 = pd.read_sql("pandas_table", conn) @@ -176,7 +176,7 @@ leverage the ``dtype_backend="pyarrow"`` argument of :func:`~pandas.read_sql` .. code-block:: ipython - # for roundtripping + # for round-tripping with pg_dbapi.connect(uri) as conn: df2 = pd.read_sql("pandas_table", conn, dtype_backend="pyarrow") @@ -188,6 +188,26 @@ For a full list of ADBC drivers and their development status, see the `ADBC Driv Implementation Status `_ documentation. +.. _whatsnew_220.enhancements.case_when: + +Create a pandas Series based on one or more conditions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The :meth:`Series.case_when` method has been added to create a Series object based on one or more conditions. (:issue:`39154`) + +.. ipython:: python + + import pandas as pd + + df = pd.DataFrame(dict(a=[1, 2, 3], b=[4, 5, 6])) + default = pd.Series('default', index=df.index) + default.case_when( + caselist=[ + (df.a == 1, 'first'), # condition, replacement + (df.a.gt(1) & df.b.eq(5), 'second'), # condition, replacement + ], + ) + .. _whatsnew_220.enhancements.to_numpy_ea: ``to_numpy`` for NumPy nullable and Arrow types converts to suitable NumPy dtype @@ -251,6 +271,14 @@ DataFrame. (:issue:`54938`) ) series.struct.explode() +Use :meth:`Series.struct.field` to index into a (possibly nested) +struct field. + + +.. ipython:: python + + series.struct.field("project") + .. _whatsnew_220.enhancements.list_accessor: Series.list accessor for PyArrow list data @@ -306,22 +334,23 @@ Other enhancements - :meth:`~DataFrame.to_sql` with method parameter set to ``multi`` works with Oracle on the backend - :attr:`Series.attrs` / :attr:`DataFrame.attrs` now uses a deepcopy for propagating ``attrs`` (:issue:`54134`). - :func:`get_dummies` now returning extension dtypes ``boolean`` or ``bool[pyarrow]`` that are compatible with the input dtype (:issue:`56273`) -- :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`) +- :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"`` (:issue:`54480`) - :func:`read_sas` returns ``datetime64`` dtypes with resolutions better matching those stored natively in SAS, and avoids returning object-dtype in cases that cannot be stored with ``datetime64[ns]`` dtype (:issue:`56127`) -- :func:`read_spss` now returns a :class:`DataFrame` that stores the metadata in :attr:`DataFrame.attrs`. 
(:issue:`54264`) +- :func:`read_spss` now returns a :class:`DataFrame` that stores the metadata in :attr:`DataFrame.attrs` (:issue:`54264`) - :func:`tseries.api.guess_datetime_format` is now part of the public API (:issue:`54727`) +- :meth:`DataFrame.apply` now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`) - :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`) - :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`) -- :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, and :meth:`DataFrame.bfill` have gained the argument ``limit_area`` (:issue:`56492`) +- :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, and :meth:`DataFrame.bfill` have gained the argument ``limit_area``; 3rd party :class:`.ExtensionArray` authors need to add this argument to the method ``_pad_or_backfill`` (:issue:`56492`) - Allow passing ``read_only``, ``data_only`` and ``keep_links`` arguments to openpyxl using ``engine_kwargs`` of :func:`read_excel` (:issue:`55027`) -- DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`) +- Implement :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` for :class:`ArrowDtype` and masked dtypes (:issue:`56267`) - Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`) +- Implemented :meth:`Series.dt` methods and attributes for :class:`ArrowDtype` with ``pyarrow.duration`` type (:issue:`52284`) - Implemented :meth:`Series.str.extract` for :class:`ArrowDtype` (:issue:`56268`) -- Improved error message that appears in :meth:`DatetimeIndex.to_period` with frequencies which are not supported as period frequencies, such as "BMS" (:issue:`56243`) -- Improved error message when constructing :class:`Period` with invalid offsets such as "QS" (:issue:`55785`) +- Improved error message that appears in :meth:`DatetimeIndex.to_period` with frequencies which are not supported as period frequencies, such as ``"BMS"`` (:issue:`56243`) +- Improved error message when constructing :class:`Period` with invalid offsets such as ``"QS"`` (:issue:`55785`) - The dtypes ``string[pyarrow]`` and ``string[pyarrow_numpy]`` now both utilize the ``large_string`` type from PyArrow to avoid overflow for long columns (:issue:`56259`) - .. --------------------------------------------------------------------------- .. _whatsnew_220.notable_bug_fixes: @@ -386,6 +415,8 @@ index levels when joining on two indexes with different levels (:issue:`34133`). left = pd.DataFrame({"left": 1}, index=pd.MultiIndex.from_tuples([("x", 1), ("x", 2)], names=["A", "B"])) right = pd.DataFrame({"right": 2}, index=pd.MultiIndex.from_tuples([(1, 1), (2, 2)], names=["B", "C"])) + left + right result = left.join(right) *Old Behavior* @@ -405,36 +436,67 @@ index levels when joining on two indexes with different levels (:issue:`34133`). result -.. --------------------------------------------------------------------------- -.. _whatsnew_220.api_breaking: - -Backwards incompatible API changes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - .. _whatsnew_220.api_breaking.deps: Increased minimum versions for dependencies ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Some minimum supported versions of dependencies were updated. 
-If installed, we now require: - -+-----------------+-----------------+----------+---------+ -| Package | Minimum Version | Required | Changed | -+=================+=================+==========+=========+ -| | | X | X | -+-----------------+-----------------+----------+---------+ - -For `optional libraries `_ the general recommendation is to use the latest version. -The following table lists the lowest version per library that is currently being tested throughout the development of pandas. -Optional libraries below the lowest tested version may still work, but are not considered supported. - -+-----------------+-----------------+---------+ -| Package | Minimum Version | Changed | -+=================+=================+=========+ -| mypy (dev) | 1.7.1 | X | -+-----------------+-----------------+---------+ -| | | X | -+-----------------+-----------------+---------+ +For `optional dependencies `_ the general recommendation is to use the latest version. +Optional dependencies below the lowest tested version may still work but are not considered supported. +The following table lists the optional dependencies that have had their minimum tested version increased. + ++-----------------+---------------------+ +| Package | New Minimum Version | ++=================+=====================+ +| beautifulsoup4 | 4.11.2 | ++-----------------+---------------------+ +| blosc | 1.21.3 | ++-----------------+---------------------+ +| bottleneck | 1.3.6 | ++-----------------+---------------------+ +| fastparquet | 2022.12.0 | ++-----------------+---------------------+ +| fsspec | 2022.11.0 | ++-----------------+---------------------+ +| gcsfs | 2022.11.0 | ++-----------------+---------------------+ +| lxml | 4.9.2 | ++-----------------+---------------------+ +| matplotlib | 3.6.3 | ++-----------------+---------------------+ +| numba | 0.56.4 | ++-----------------+---------------------+ +| numexpr | 2.8.4 | ++-----------------+---------------------+ +| qtpy | 2.3.0 | ++-----------------+---------------------+ +| openpyxl | 3.1.0 | ++-----------------+---------------------+ +| psycopg2 | 2.9.6 | ++-----------------+---------------------+ +| pyreadstat | 1.2.0 | ++-----------------+---------------------+ +| pytables | 3.8.0 | ++-----------------+---------------------+ +| pyxlsb | 1.0.10 | ++-----------------+---------------------+ +| s3fs | 2022.11.0 | ++-----------------+---------------------+ +| scipy | 1.10.0 | ++-----------------+---------------------+ +| sqlalchemy | 2.0.0 | ++-----------------+---------------------+ +| tabulate | 0.9.0 | ++-----------------+---------------------+ +| xarray | 2022.12.0 | ++-----------------+---------------------+ +| xlsxwriter | 3.0.5 | ++-----------------+---------------------+ +| zstandard | 0.19.0 | ++-----------------+---------------------+ +| pyqt5 | 5.15.8 | ++-----------------+---------------------+ +| tzdata | 2022.7 | ++-----------------+---------------------+ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. @@ -557,6 +619,8 @@ For example: pd.date_range('2020-01-01', periods=3, freq='QE-NOV') +.. 
_whatsnew_220.silent_downcasting: + Deprecated automatic downcasting ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -594,32 +658,33 @@ Other Deprecations - Changed :meth:`Timedelta.resolution_string` to return ``h``, ``min``, ``s``, ``ms``, ``us``, and ``ns`` instead of ``H``, ``T``, ``S``, ``L``, ``U``, and ``N``, for compatibility with respective deprecations in frequency aliases (:issue:`52536`) - Deprecated :attr:`offsets.Day.delta`, :attr:`offsets.Hour.delta`, :attr:`offsets.Minute.delta`, :attr:`offsets.Second.delta`, :attr:`offsets.Milli.delta`, :attr:`offsets.Micro.delta`, :attr:`offsets.Nano.delta`, use ``pd.Timedelta(obj)`` instead (:issue:`55498`) - Deprecated :func:`pandas.api.types.is_interval` and :func:`pandas.api.types.is_period`, use ``isinstance(obj, pd.Interval)`` and ``isinstance(obj, pd.Period)`` instead (:issue:`55264`) -- Deprecated :func:`pd.core.internals.api.make_block`, use public APIs instead (:issue:`40226`) - Deprecated :func:`read_gbq` and :meth:`DataFrame.to_gbq`. Use ``pandas_gbq.read_gbq`` and ``pandas_gbq.to_gbq`` instead https://pandas-gbq.readthedocs.io/en/latest/api.html (:issue:`55525`) - Deprecated :meth:`.DataFrameGroupBy.fillna` and :meth:`.SeriesGroupBy.fillna`; use :meth:`.DataFrameGroupBy.ffill`, :meth:`.DataFrameGroupBy.bfill` for forward and backward filling or :meth:`.DataFrame.fillna` to fill with a single value (or the Series equivalents) (:issue:`55718`) +- Deprecated :meth:`DateOffset.is_anchored`, use ``obj.n == 1`` for non-Tick subclasses (for Tick this was always False) (:issue:`55388`) - Deprecated :meth:`DatetimeArray.__init__` and :meth:`TimedeltaArray.__init__`, use :func:`array` instead (:issue:`55623`) - Deprecated :meth:`Index.format`, use ``index.astype(str)`` or ``index.map(formatter)`` instead (:issue:`55413`) - Deprecated :meth:`Series.ravel`, the underlying array is already 1D, so ravel is not necessary (:issue:`52511`) - Deprecated :meth:`Series.resample` and :meth:`DataFrame.resample` with a :class:`PeriodIndex` (and the 'convention' keyword), convert to :class:`DatetimeIndex` (with ``.to_timestamp()``) before resampling instead (:issue:`53481`) - Deprecated :meth:`Series.view`, use :meth:`Series.astype` instead to change the dtype (:issue:`20251`) +- Deprecated :meth:`offsets.Tick.is_anchored`, use ``False`` instead (:issue:`55388`) - Deprecated ``core.internals`` members ``Block``, ``ExtensionBlock``, and ``DatetimeTZBlock``, use public APIs instead (:issue:`55139`) - Deprecated ``year``, ``month``, ``quarter``, ``day``, ``hour``, ``minute``, and ``second`` keywords in the :class:`PeriodIndex` constructor, use :meth:`PeriodIndex.from_fields` instead (:issue:`55960`) - Deprecated accepting a type as an argument in :meth:`Index.view`, call without any arguments instead (:issue:`55709`) - Deprecated allowing non-integer ``periods`` argument in :func:`date_range`, :func:`timedelta_range`, :func:`period_range`, and :func:`interval_range` (:issue:`56036`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_clipboard`. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_csv` except ``path_or_buf``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_dict`. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_excel` except ``excel_writer``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_gbq` except ``destination_table``. 
(:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_hdf` except ``path_or_buf``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_html` except ``buf``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_json` except ``path_or_buf``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_latex` except ``buf``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_markdown` except ``buf``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_parquet` except ``path``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_pickle` except ``path``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_string` except ``buf``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_xml` except ``path_or_buffer``. (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_clipboard` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_csv` except ``path_or_buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_dict` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_excel` except ``excel_writer`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_gbq` except ``destination_table`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_hdf` except ``path_or_buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_html` except ``buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_json` except ``path_or_buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_latex` except ``buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_markdown` except ``buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_parquet` except ``path`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_pickle` except ``path`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_string` except ``buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_xml` except ``path_or_buffer`` (:issue:`54229`) - Deprecated allowing passing :class:`BlockManager` objects to :class:`DataFrame` or :class:`SingleBlockManager` objects to :class:`Series` (:issue:`52419`) - Deprecated behavior of :meth:`Index.insert` with an object-dtype index silently performing type inference on the result, explicitly call ``result.infer_objects(copy=False)`` for the old behavior instead (:issue:`51363`) - Deprecated casting non-datetimelike values (mainly strings) in :meth:`Series.isin` and :meth:`Index.isin` with ``datetime64``, ``timedelta64``, and :class:`PeriodDtype` dtypes (:issue:`53111`) @@ -652,6 +717,7 @@ Other Deprecations - Deprecated the extension test classes ``BaseNoReduceTests``, ``BaseBooleanReduceTests``, and ``BaseNumericReduceTests``, use ``BaseReduceTests`` instead (:issue:`54663`) - Deprecated the option ``mode.data_manager`` and the ``ArrayManager``; only the ``BlockManager`` will be available in future versions (:issue:`55043`) - Deprecated the previous implementation of :class:`DataFrame.stack`; 
specify ``future_stack=True`` to adopt the future version (:issue:`53515`) +- .. --------------------------------------------------------------------------- .. _whatsnew_220.performance: @@ -692,37 +758,38 @@ Bug fixes Categorical ^^^^^^^^^^^ - :meth:`Categorical.isin` raising ``InvalidIndexError`` for categorical containing overlapping :class:`Interval` values (:issue:`34974`) -- Bug in :meth:`CategoricalDtype.__eq__` returning false for unordered categorical data with mixed types (:issue:`55468`) -- +- Bug in :meth:`CategoricalDtype.__eq__` returning ``False`` for unordered categorical data with mixed types (:issue:`55468`) +- Bug when casting ``pa.dictionary`` to :class:`CategoricalDtype` using a ``pa.DictionaryArray`` as categories (:issue:`56672`) Datetimelike ^^^^^^^^^^^^ - Bug in :class:`DatetimeIndex` construction when passing both a ``tz`` and either ``dayfirst`` or ``yearfirst`` ignoring dayfirst/yearfirst (:issue:`55813`) - Bug in :class:`DatetimeIndex` when passing an object-dtype ndarray of float objects and a ``tz`` incorrectly localizing the result (:issue:`55780`) - Bug in :func:`Series.isin` with :class:`DatetimeTZDtype` dtype and comparison values that are all ``NaT`` incorrectly returning all-``False`` even if the series contains ``NaT`` entries (:issue:`56427`) -- Bug in :func:`concat` raising ``AttributeError`` when concatenating all-NA DataFrame with :class:`DatetimeTZDtype` dtype DataFrame. (:issue:`52093`) +- Bug in :func:`concat` raising ``AttributeError`` when concatenating all-NA DataFrame with :class:`DatetimeTZDtype` dtype DataFrame (:issue:`52093`) - Bug in :func:`testing.assert_extension_array_equal` that could use the wrong unit when comparing resolutions (:issue:`55730`) - Bug in :func:`to_datetime` and :class:`DatetimeIndex` when passing a list of mixed-string-and-numeric types incorrectly raising (:issue:`55780`) - Bug in :func:`to_datetime` and :class:`DatetimeIndex` when passing mixed-type objects with a mix of timezones or mix of timezone-awareness failing to raise ``ValueError`` (:issue:`55693`) +- Bug in :meth:`.Tick.delta` with very large ticks raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) - Bug in :meth:`DatetimeIndex.shift` with non-nanosecond resolution incorrectly returning with nanosecond resolution (:issue:`56117`) - Bug in :meth:`DatetimeIndex.union` returning object dtype for tz-aware indexes with the same timezone but different units (:issue:`55238`) - Bug in :meth:`Index.is_monotonic_increasing` and :meth:`Index.is_monotonic_decreasing` always caching :meth:`Index.is_unique` as ``True`` when first value in index is ``NaT`` (:issue:`55755`) - Bug in :meth:`Index.view` to a datetime64 dtype with non-supported resolution incorrectly raising (:issue:`55710`) - Bug in :meth:`Series.dt.round` with non-nanosecond resolution and ``NaT`` entries incorrectly raising ``OverflowError`` (:issue:`56158`) - Bug in :meth:`Series.fillna` with non-nanosecond resolution dtypes and higher-resolution vector values returning incorrect (internally-corrupted) results (:issue:`56410`) -- Bug in :meth:`Tick.delta` with very large ticks raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) - Bug in :meth:`Timestamp.unit` being inferred incorrectly from an ISO8601 format string with minute or hour resolution and a timezone offset (:issue:`56208`) -- Bug in ``.astype`` converting from a higher-resolution ``datetime64`` dtype to a lower-resolution ``datetime64`` dtype (e.g. 
``datetime64[us]->datetim64[ms]``) silently overflowing with values near the lower implementation bound (:issue:`55979`) +- Bug in ``.astype`` converting from a higher-resolution ``datetime64`` dtype to a lower-resolution ``datetime64`` dtype (e.g. ``datetime64[us]->datetime64[ms]``) silently overflowing with values near the lower implementation bound (:issue:`55979`) - Bug in adding or subtracting a :class:`Week` offset to a ``datetime64`` :class:`Series`, :class:`Index`, or :class:`DataFrame` column with non-nanosecond resolution returning incorrect results (:issue:`55583`) - Bug in addition or subtraction of :class:`BusinessDay` offset with ``offset`` attribute to non-nanosecond :class:`Index`, :class:`Series`, or :class:`DataFrame` column giving incorrect results (:issue:`55608`) - Bug in addition or subtraction of :class:`DateOffset` objects with microsecond components to ``datetime64`` :class:`Index`, :class:`Series`, or :class:`DataFrame` columns with non-nanosecond resolution (:issue:`55595`) -- Bug in addition or subtraction of very large :class:`Tick` objects with :class:`Timestamp` or :class:`Timedelta` objects raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) +- Bug in addition or subtraction of very large :class:`.Tick` objects with :class:`Timestamp` or :class:`Timedelta` objects raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) - Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond :class:`DatetimeTZDtype` and inputs that would be out of bounds with nanosecond resolution incorrectly raising ``OutOfBoundsDatetime`` (:issue:`54620`) - Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` (or :class:`DatetimeTZDtype`) from mixed-numeric inputs treating those as nanoseconds instead of as multiples of the dtype's unit (which would happen with non-mixed numeric inputs) (:issue:`56004`) - Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` dtype and inputs that would be out of bounds for a ``datetime64[ns]`` incorrectly raising ``OutOfBoundsDatetime`` (:issue:`55756`) - Bug in parsing datetime strings with nanosecond resolution with non-ISO8601 formats incorrectly truncating sub-microsecond components (:issue:`56051`) - Bug in parsing datetime strings with sub-second resolution and trailing zeros incorrectly inferring second or millisecond resolution (:issue:`55737`) - Bug in the results of :func:`to_datetime` with an floating-dtype argument with ``unit`` not matching the pointwise results of :class:`Timestamp` (:issue:`56037`) +- Fixed regression where :func:`concat` would raise an error when concatenating ``datetime64`` columns with differing resolutions (:issue:`53641`) Timedelta ^^^^^^^^^ @@ -738,15 +805,18 @@ Timezones Numeric ^^^^^^^ - Bug in :func:`read_csv` with ``engine="pyarrow"`` causing rounding errors for large integers (:issue:`52505`) +- Bug in :meth:`Series.__floordiv__` and :meth:`Series.__truediv__` for :class:`ArrowDtype` with integral dtypes raising for large divisors (:issue:`56706`) +- Bug in :meth:`Series.__floordiv__` for :class:`ArrowDtype` with integral dtypes raising for large values (:issue:`56645`) - Bug in :meth:`Series.pow` not filling missing values correctly (:issue:`55512`) -- +- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` matching float ``0.0`` with ``False`` and vice versa (:issue:`55398`) +- Bug in :meth:`Series.round` 
raising for nullable boolean dtype (:issue:`55936`)

 Conversion
 ^^^^^^^^^^
 - Bug in :meth:`DataFrame.astype` when called with ``str`` on unpickled array - the array might change in-place (:issue:`54654`)
 - Bug in :meth:`DataFrame.astype` where ``errors="ignore"`` had no effect for extension types (:issue:`54654`)
 - Bug in :meth:`Series.convert_dtypes` not converting all NA column to ``null[pyarrow]`` (:issue:`55346`)
--
+- Bug in :meth:`DataFrame.loc` not raising the "incompatible dtype" warning (see `PDEP6 `_) when assigning a ``Series`` with a different dtype using a full column setter (e.g. ``df.loc[:, 'a'] = incompatible_value``) (:issue:`39584`)

 Strings
 ^^^^^^^
@@ -756,6 +826,7 @@ Strings
 - Bug in :meth:`Index.str.cat` always casting result to object dtype (:issue:`56157`)
 - Bug in :meth:`Series.__mul__` for :class:`ArrowDtype` with ``pyarrow.string`` dtype and ``string[pyarrow]`` for the pyarrow backend (:issue:`51970`)
 - Bug in :meth:`Series.str.find` when ``start < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56411`)
+- Bug in :meth:`Series.str.fullmatch` with ``dtype=pandas.ArrowDtype(pyarrow.string())`` allowing partial matches when the regex ends in the literal ``//$`` (:issue:`56652`)
 - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56404`)
 - Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for :class:`ArrowDtype` with ``pyarrow.string`` dtype (:issue:`56579`)
 - Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for ``string[pyarrow]`` (:issue:`54942`)
@@ -763,16 +834,17 @@ Interval
 ^^^^^^^^
-- Bug in :class:`Interval` ``__repr__`` not displaying UTC offsets for :class:`Timestamp` bounds. Additionally the hour, minute and second components will now be shown. (:issue:`55015`)
+- Bug in :class:`Interval` ``__repr__`` not displaying UTC offsets for :class:`Timestamp` bounds.
Additionally the hour, minute and second components will now be shown (:issue:`55015`) - Bug in :meth:`IntervalIndex.factorize` and :meth:`Series.factorize` with :class:`IntervalDtype` with datetime64 or timedelta64 intervals not preserving non-nanosecond units (:issue:`56099`) - Bug in :meth:`IntervalIndex.from_arrays` when passed ``datetime64`` or ``timedelta64`` arrays with mismatched resolutions constructing an invalid ``IntervalArray`` object (:issue:`55714`) +- Bug in :meth:`IntervalIndex.from_tuples` raising if subtype is a nullable extension dtype (:issue:`56765`) - Bug in :meth:`IntervalIndex.get_indexer` with datetime or timedelta intervals incorrectly matching on integer targets (:issue:`47772`) - Bug in :meth:`IntervalIndex.get_indexer` with timezone-aware datetime intervals incorrectly matching on a sequence of timezone-naive targets (:issue:`47772`) - Bug in setting values on a :class:`Series` with an :class:`IntervalIndex` using a slice incorrectly raising (:issue:`54722`) -- Indexing ^^^^^^^^ +- Bug in :meth:`DataFrame.loc` mutating a boolean indexer when :class:`DataFrame` has a :class:`MultiIndex` (:issue:`56635`) - Bug in :meth:`DataFrame.loc` when setting :class:`Series` with extension dtype into NumPy dtype (:issue:`55604`) - Bug in :meth:`Index.difference` not returning a unique set of values when ``other`` is empty or ``other`` is considered non-comparable (:issue:`55113`) - Bug in setting :class:`Categorical` values into a :class:`DataFrame` with numpy dtypes raising ``RecursionError`` (:issue:`52927`) @@ -781,25 +853,24 @@ Indexing Missing ^^^^^^^ - Bug in :meth:`DataFrame.update` wasn't updating in-place for tz-aware datetime64 dtypes (:issue:`56227`) -- MultiIndex ^^^^^^^^^^ - Bug in :meth:`MultiIndex.get_indexer` not raising ``ValueError`` when ``method`` provided and index is non-monotonic (:issue:`53452`) -- I/O ^^^ -- Bug in :func:`read_csv` where ``engine="python"`` did not respect ``chunksize`` arg when ``skiprows`` was specified. (:issue:`56323`) -- Bug in :func:`read_csv` where ``engine="python"`` was causing a ``TypeError`` when a callable ``skiprows`` and a chunk size was specified. (:issue:`55677`) -- Bug in :func:`read_csv` where ``on_bad_lines="warn"`` would write to ``stderr`` instead of raise a Python warning. 
This now yields a :class:`.errors.ParserWarning` (:issue:`54296`) +- Bug in :func:`read_csv` where ``engine="python"`` did not respect ``chunksize`` arg when ``skiprows`` was specified (:issue:`56323`) +- Bug in :func:`read_csv` where ``engine="python"`` was causing a ``TypeError`` when a callable ``skiprows`` and a chunk size was specified (:issue:`55677`) +- Bug in :func:`read_csv` where ``on_bad_lines="warn"`` would write to ``stderr`` instead of raising a Python warning; this now yields a :class:`.errors.ParserWarning` (:issue:`54296`) - Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``quotechar`` was ignored (:issue:`52266`) -- Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``usecols`` wasn't working with a csv with no headers (:issue:`54459`) -- Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`) +- Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``usecols`` wasn't working with a CSV with no headers (:issue:`54459`) +- Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when the file contains ``NaN`` or ``Inf`` (:issue:`54564`) - Bug in :func:`read_json` not handling dtype conversion properly if ``infer_string`` is set (:issue:`56195`) -- Bug in :meth:`DataFrame.to_excel`, with ``OdsWriter`` (``ods`` files) writing boolean/string value (:issue:`54994`) +- Bug in :meth:`DataFrame.to_excel`, with ``OdsWriter`` (``ods`` files) writing Boolean/string value (:issue:`54994`) - Bug in :meth:`DataFrame.to_hdf` and :func:`read_hdf` with ``datetime64`` dtypes with non-nanosecond resolution failing to round-trip correctly (:issue:`55622`) -- Bug in :meth:`~pandas.read_excel` with ``engine="odf"`` (``ods`` files) when string contains annotation (:issue:`55200`) +- Bug in :meth:`DataFrame.to_stata` raising for extension dtypes (:issue:`54671`) +- Bug in :meth:`~pandas.read_excel` with ``engine="odf"`` (``ods`` files) when a string cell contains an annotation (:issue:`55200`) - Bug in :meth:`~pandas.read_excel` with an ODS file without cached formatted cell for float values (:issue:`55219`) - Bug where :meth:`DataFrame.to_json` would raise an ``OverflowError`` instead of a ``TypeError`` with unsupported NumPy types (:issue:`55403`) @@ -808,29 +879,30 @@ Period - Bug in :class:`PeriodIndex` construction when more than one of ``data``, ``ordinal`` and ``**fields`` are passed failing to raise ``ValueError`` (:issue:`55961`) - Bug in :class:`Period` addition silently wrapping around instead of raising ``OverflowError`` (:issue:`55503`) - Bug in casting from :class:`PeriodDtype` with ``astype`` to ``datetime64`` or :class:`DatetimeTZDtype` with non-nanosecond unit incorrectly returning with nanosecond unit (:issue:`55958`) -- Plotting ^^^^^^^^ -- Bug in :meth:`DataFrame.plot.box` with ``vert=False`` and a matplotlib ``Axes`` created with ``sharey=True`` (:issue:`54941`) -- Bug in :meth:`DataFrame.plot.scatter` discaring string columns (:issue:`56142`) +- Bug in :meth:`DataFrame.plot.box` with ``vert=False`` and a Matplotlib ``Axes`` created with ``sharey=True`` (:issue:`54941`) +- Bug in :meth:`DataFrame.plot.scatter` discarding string columns (:issue:`56142`) - Bug in :meth:`Series.plot` when reusing an ``ax`` object failing to raise when a ``how`` keyword is passed (:issue:`55953`) Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- Bug in :class:`.Rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and 
``closed='neither'`` (:issue:`20712`) - Bug in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, and :meth:`.SeriesGroupBy.idxmax` would not retain :class:`.Categorical` dtype when the index was a :class:`.CategoricalIndex` that contained NA values (:issue:`54234`) - Bug in :meth:`.DataFrameGroupBy.transform` and :meth:`.SeriesGroupBy.transform` when ``observed=False`` and ``f="idxmin"`` or ``f="idxmax"`` would incorrectly raise on unobserved categories (:issue:`54234`) -- Bug in :meth:`.DataFrameGroupBy.value_counts` and :meth:`.SeriesGroupBy.value_count` could result in incorrect sorting if the columns of the DataFrame or name of the Series are integers (:issue:`55951`) -- Bug in :meth:`.DataFrameGroupBy.value_counts` and :meth:`.SeriesGroupBy.value_count` would not respect ``sort=False`` in :meth:`DataFrame.groupby` and :meth:`Series.groupby` (:issue:`55951`) -- Bug in :meth:`.DataFrameGroupBy.value_counts` and :meth:`.SeriesGroupBy.value_count` would sort by proportions rather than frequencies when ``sort=True`` and ``normalize=True`` (:issue:`55951`) +- Bug in :meth:`.DataFrameGroupBy.value_counts` and :meth:`.SeriesGroupBy.value_counts` could result in incorrect sorting if the columns of the DataFrame or name of the Series are integers (:issue:`55951`) +- Bug in :meth:`.DataFrameGroupBy.value_counts` and :meth:`.SeriesGroupBy.value_counts` would not respect ``sort=False`` in :meth:`DataFrame.groupby` and :meth:`Series.groupby` (:issue:`55951`) +- Bug in :meth:`.DataFrameGroupBy.value_counts` and :meth:`.SeriesGroupBy.value_counts` would sort by proportions rather than frequencies when ``sort=True`` and ``normalize=True`` (:issue:`55951`) - Bug in :meth:`DataFrame.asfreq` and :meth:`Series.asfreq` with a :class:`DatetimeIndex` with non-nanosecond resolution incorrectly converting to nanosecond resolution (:issue:`55958`) - Bug in :meth:`DataFrame.ewm` when passed ``times`` with non-nanosecond ``datetime64`` or :class:`DatetimeTZDtype` dtype (:issue:`56262`) - Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` where grouping by a combination of ``Decimal`` and NA values would fail when ``sort=True`` (:issue:`54847`) +- Bug in :meth:`DataFrame.groupby` for DataFrame subclasses when selecting a subset of columns to apply the function to (:issue:`56761`) - Bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) - Bug in :meth:`DataFrame.resample` when resampling on a :class:`ArrowDtype` of ``pyarrow.timestamp`` or ``pyarrow.duration`` type (:issue:`55989`) - Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) - Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.MonthBegin` (:issue:`55271`) +- Bug in :meth:`DataFrame.rolling` and :meth:`Series.rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`) +- Bug in :meth:`DataFrame.rolling` and :meth:`Series.rolling` where either the ``index`` or ``on`` column was :class:`ArrowDtype` with ``pyarrow.timestamp`` type (:issue:`55849`) Reshaping ^^^^^^^^^ @@ -839,50 +911,41 @@ Reshaping - Bug in :func:`merge_asof` raising ``TypeError`` when ``by`` dtype is not ``object``, ``int64``, or ``uint64`` (:issue:`22794`) - Bug in :func:`merge_asof` raising incorrect error for 
string dtype (:issue:`56444`) - Bug in :func:`merge_asof` when using a :class:`Timedelta` tolerance on a :class:`ArrowDtype` column (:issue:`56486`) +- Bug in :func:`merge` not raising when merging datetime columns with timedelta columns (:issue:`56455`) - Bug in :func:`merge` not raising when merging string columns with numeric columns (:issue:`56441`) +- Bug in :func:`merge` not sorting for new string dtype (:issue:`56442`) - Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`) - Bug in :meth:`DataFrame.melt` where an exception was raised if ``var_name`` was not a string (:issue:`55948`) - Bug in :meth:`DataFrame.melt` where it would not preserve the datetime (:issue:`55254`) - Bug in :meth:`DataFrame.pivot_table` where the row margin is incorrect when the columns have numeric names (:issue:`26568`) - Bug in :meth:`DataFrame.pivot` with numeric columns and extension dtype for data (:issue:`56528`) -- Bug in :meth:`DataFrame.stack` and :meth:`Series.stack` with ``future_stack=True`` would not preserve NA values in the index (:issue:`56573`) +- Bug in :meth:`DataFrame.stack` with ``future_stack=True`` would not preserve NA values in the index (:issue:`56573`) Sparse ^^^^^^ -- Bug in :meth:`SparseArray.take` when using a different fill value than the array's fill value (:issue:`55181`) -- - -ExtensionArray -^^^^^^^^^^^^^^ -- -- - -Styler -^^^^^^ -- -- +- Bug in :meth:`arrays.SparseArray.take` when using a different fill value than the array's fill value (:issue:`55181`) Other ^^^^^ +- :meth:`DataFrame.__dataframe__` did not support pyarrow large strings (:issue:`56702`) - Bug in :func:`DataFrame.describe` when formatting percentiles in the resulting percentile 99.999% is rounded to 100% (:issue:`55765`) +- Bug in :func:`api.interchange.from_dataframe` where it raised ``NotImplementedError`` when handling empty string columns (:issue:`56703`) - Bug in :func:`cut` and :func:`qcut` with ``datetime64`` dtype values with non-nanosecond units incorrectly returning nanosecond-unit bins (:issue:`56101`) - Bug in :func:`cut` incorrectly allowing cutting of timezone-aware datetimes with timezone-naive bins (:issue:`54964`) - Bug in :func:`infer_freq` and :meth:`DatetimeIndex.inferred_freq` with weekly frequencies and non-nanosecond resolutions (:issue:`55609`) - Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`) - Bug in :meth:`DataFrame.from_dict` which would always sort the rows of the created :class:`DataFrame`. (:issue:`55683`) - Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` raising a ``ValueError`` (:issue:`56478`) -- Bug in rendering ``inf`` values inside a a :class:`DataFrame` with the ``use_inf_as_na`` option enabled (:issue:`55483`) +- Bug in rendering ``inf`` values inside a :class:`DataFrame` with the ``use_inf_as_na`` option enabled (:issue:`55483`) - Bug in rendering a :class:`Series` with a :class:`MultiIndex` when one of the index level's names is 0 not having that name displayed (:issue:`55415`) - Bug in the error message when assigning an empty :class:`DataFrame` to a column (:issue:`55956`) - Bug when time-like strings were being cast to :class:`ArrowDtype` with ``pyarrow.time64`` type (:issue:`56463`) - -.. 
***DO NOT USE THIS SECTION*** - -- -- +- Fixed a spurious deprecation warning from ``numba`` >= 0.58.0 when passing a numpy ufunc in :class:`core.window.Rolling.apply` with ``engine="numba"`` (:issue:`55247`) .. --------------------------------------------------------------------------- .. _whatsnew_220.contributors: Contributors ~~~~~~~~~~~~ + +.. contributors:: v2.1.4..v2.2.0 diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst new file mode 100644 index 0000000000000..4db0069ec4b95 --- /dev/null +++ b/doc/source/whatsnew/v2.2.1.rst @@ -0,0 +1,90 @@ +.. _whatsnew_221: + +What's new in 2.2.1 (February 22, 2024) +--------------------------------------- + +These are the changes in pandas 2.2.1. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- +.. _whatsnew_221.enhancements: + +Enhancements +~~~~~~~~~~~~ +- Added ``pyarrow`` pip extra so users can install pandas and pyarrow with pip with ``pip install pandas[pyarrow]`` (:issue:`54466`) + +.. _whatsnew_221.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ +- Fixed memory leak in :func:`read_csv` (:issue:`57039`) +- Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`) +- Fixed regression causing overflow for near-minimum timestamps (:issue:`57150`) +- Fixed regression in :func:`concat` changing long-standing behavior that always sorted the non-concatenation axis when the axis was a :class:`DatetimeIndex` (:issue:`57006`) +- Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) +- Fixed regression in :func:`pandas.testing.assert_series_equal` defaulting to ``check_exact=True`` when checking the :class:`Index` (:issue:`57067`) +- Fixed regression in :func:`read_json` where an :class:`Index` would be returned instead of a :class:`RangeIndex` (:issue:`57429`) +- Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`) +- Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) +- Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) +- Fixed regression in :meth:`CategoricalIndex.difference` raising ``KeyError`` when other contains null values other than NaN (:issue:`57318`) +- Fixed regression in :meth:`DataFrame.groupby` raising ``ValueError`` when grouping by a :class:`Series` in some cases (:issue:`57276`) +- Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`) +- Fixed regression in :meth:`DataFrame.loc` which was unnecessarily throwing "incompatible dtype warning" when expanding with partial row indexer and multiple columns (see `PDEP6 `_) (:issue:`56503`) +- Fixed regression in :meth:`DataFrame.map` with ``na_action="ignore"`` not being respected for NumPy nullable and :class:`ArrowDtypes` (:issue:`57316`) +- Fixed regression in :meth:`DataFrame.merge` raising ``ValueError`` for certain types of 3rd-party extension arrays (:issue:`57316`) +- Fixed regression in :meth:`DataFrame.query` with all 
``NaT`` column with object dtype (:issue:`57068`)
+- Fixed regression in :meth:`DataFrame.shift` raising ``AssertionError`` for ``axis=1`` and empty :class:`DataFrame` (:issue:`57301`)
+- Fixed regression in :meth:`DataFrame.sort_index` not producing a stable sort for an index with duplicates (:issue:`57151`)
+- Fixed regression in :meth:`DataFrame.to_dict` with ``orient='list'`` and datetime or timedelta types returning integers (:issue:`54824`)
+- Fixed regression in :meth:`DataFrame.to_json` converting nullable integers to floats (:issue:`57224`)
+- Fixed regression in :meth:`DataFrame.to_sql` when ``method="multi"`` is passed and the dialect type is not Oracle (:issue:`57310`)
+- Fixed regression in :meth:`DataFrame.transpose` with nullable extension dtypes not having F-contiguous data potentially causing exceptions when used (:issue:`57315`)
+- Fixed regression in :meth:`DataFrame.update` emitting incorrect warnings about downcasting (:issue:`57124`)
+- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`)
+- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`)
+- Fixed regression in :meth:`ExtensionArray.to_numpy` raising for non-numeric masked dtypes (:issue:`56991`)
+- Fixed regression in :meth:`Index.join` raising ``TypeError`` when joining an empty index to a non-empty index containing mixed dtype values (:issue:`57048`)
+- Fixed regression in :meth:`Series.astype` introducing decimals when converting from integer with missing values to string dtype (:issue:`57418`)
+- Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`)
+- Fixed regression in :meth:`Series.to_numpy` when dtype is given as float and the data contains NaNs (:issue:`57121`)
+- Fixed regression in addition or subtraction of :class:`DateOffset` objects with millisecond components to ``datetime64`` :class:`Index`, :class:`Series`, or :class:`DataFrame` (:issue:`57529`)
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_221.bug_fixes:
+
+Bug fixes
+~~~~~~~~~
+- Fixed bug in :func:`pandas.api.interchange.from_dataframe` which was raising for nullable integers (:issue:`55069`)
+- Fixed bug in :func:`pandas.api.interchange.from_dataframe` which was raising for empty inputs (:issue:`56700`)
+- Fixed bug in :func:`pandas.api.interchange.from_dataframe` which wasn't converting column names to strings (:issue:`55069`)
+- Fixed bug in :meth:`DataFrame.__getitem__` for empty :class:`DataFrame` with Copy-on-Write enabled (:issue:`57130`)
+- Fixed bug in :meth:`PeriodIndex.asfreq` which was silently converting frequencies which are not supported as period frequencies instead of raising an error (:issue:`56945`)
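+
+A minimal sketch of the interchange round-trip that previously raised (the
+``Int64`` column is only illustrative):
+
+.. code-block:: python
+
+    import pandas as pd
+    from pandas.api.interchange import from_dataframe
+
+    df = pd.DataFrame({"a": pd.array([1, 2, None], dtype="Int64")})
+    # Round-trip through the dataframe interchange protocol; before 2.2.1
+    # this raised for nullable integer columns and for empty inputs.
+    result = from_dataframe(df.__dataframe__())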
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_221.other:
+
+Other
+~~~~~
+
+.. note::
+
+    The ``DeprecationWarning`` that was raised when pandas was imported without PyArrow being
+    installed has been removed. This decision was made because the warning was too noisy for too
+    many users and a lot of feedback was collected about the decision to make PyArrow a required
+    dependency. pandas is still deciding whether or not PyArrow should become a hard dependency
+    in 3.0. Interested users can follow the discussion `here `_.
+
+- Added the argument ``skipna`` to :meth:`DataFrameGroupBy.first`, :meth:`DataFrameGroupBy.last`, :meth:`SeriesGroupBy.first`, and :meth:`SeriesGroupBy.last`; achieving ``skipna=False`` used to be available via :meth:`DataFrameGroupBy.nth`, but the behavior was changed in pandas 2.0.0 (:issue:`57019`)
+- Added the argument ``skipna`` to :meth:`Resampler.first` and :meth:`Resampler.last` (:issue:`57019`)
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_221.contributors:
+
+Contributors
+~~~~~~~~~~~~
+
+.. contributors:: v2.2.0..v2.2.1
diff --git a/doc/source/whatsnew/v2.2.2.rst b/doc/source/whatsnew/v2.2.2.rst
new file mode 100644
index 0000000000000..fbe5e9b4febb5
--- /dev/null
+++ b/doc/source/whatsnew/v2.2.2.rst
@@ -0,0 +1,59 @@
+.. _whatsnew_222:
+
+What's new in 2.2.2 (April 10, 2024)
+------------------------------------
+
+These are the changes in pandas 2.2.2. See :ref:`release` for a full changelog
+including other versions of pandas.
+
+{{ header }}
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_220.np2_compat:
+
+Pandas 2.2.2 is now compatible with numpy 2.0
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Pandas 2.2.2 is the first version of pandas that is generally compatible with the upcoming
+numpy 2.0 release, and wheels for pandas 2.2.2 will work with both numpy 1.x and 2.x.
+
+One major caveat is that arrays created with numpy 2.0's new ``StringDtype`` will convert
+to ``object`` dtyped arrays upon :class:`Series`/:class:`DataFrame` creation.
+Full support for numpy 2.0's StringDtype is expected to land in pandas 3.0.
+
+As usual please report any bugs discovered to our `issue tracker `_
+
+.. _whatsnew_222.regressions:
+
+Fixed regressions
+~~~~~~~~~~~~~~~~~
+- :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when a column's type was a pandas nullable dtype with missing values (:issue:`56702`)
+- :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when a column's type was a pyarrow nullable dtype with missing values (:issue:`57664`)
+- Avoid issuing a spurious ``DeprecationWarning`` when a custom :class:`DataFrame` or :class:`Series` subclass method is called (:issue:`57553`)
+- Fixed regression in precision of :func:`to_datetime` with string and ``unit`` input (:issue:`57051`)
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_222.bug_fixes:
+
+Bug fixes
+~~~~~~~~~
+- :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when the column's type was nullable boolean (:issue:`55332`)
+- :meth:`DataFrame.__dataframe__` was showing a bytemask instead of a bitmask for the ``'string[pyarrow]'`` validity buffer (:issue:`57762`)
+- :meth:`DataFrame.__dataframe__` was showing a non-null validity buffer (instead of ``None``) for ``'string[pyarrow]'`` columns without missing values (:issue:`57761`)
+- :meth:`DataFrame.to_sql` was failing to find the right table when using the schema argument (:issue:`57539`)
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_222.other:
+
+Other
+~~~~~
+-
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_222.contributors:
+
+Contributors
+~~~~~~~~~~~~
+
+.. contributors:: v2.2.1..v2.2.2
diff --git a/doc/source/whatsnew/v2.2.3.rst b/doc/source/whatsnew/v2.2.3.rst
new file mode 100644
index 0000000000000..1696a7b6449af
--- /dev/null
+++ b/doc/source/whatsnew/v2.2.3.rst
@@ -0,0 +1,45 @@
+.. _whatsnew_223:
+
+What's new in 2.2.3 (September 20, 2024)
+----------------------------------------
+
+These are the changes in pandas 2.2.3. See :ref:`release` for a full changelog
+including other versions of pandas.
+
+{{ header }}
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_220.py13_compat:
+
+Pandas 2.2.3 is now compatible with Python 3.13
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Pandas 2.2.3 is the first version of pandas that is generally compatible with the upcoming
+Python 3.13, and both wheels for free-threaded and normal Python 3.13 will be uploaded for
+this release.
+
+As usual please report any bugs discovered to our `issue tracker `_
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_223.bug_fixes:
+
+Bug fixes
+~~~~~~~~~
+- Bug in :func:`eval` where division (``/``) on :class:`complex` values discarded the imaginary part (:issue:`21374`)
+- Minor fixes for numpy 2.1 compatibility (:issue:`59444`)
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_223.other:
+
+Other
+~~~~~
+- Missing licenses for 3rd-party dependencies were added back into the wheels (:issue:`58632`)
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_223.contributors:
+
+Contributors
+~~~~~~~~~~~~
+
+.. contributors:: v2.2.2..v2.2.3|HEAD
diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst
new file mode 100644
index 0000000000000..65bd941800294
--- /dev/null
+++ b/doc/source/whatsnew/v2.3.0.rst
@@ -0,0 +1,210 @@
+.. _whatsnew_230:
+
+What's new in 2.3.0 (Month XX, 2024)
+------------------------------------
+
+These are the changes in pandas 2.3.0. See :ref:`release` for a full changelog
+including other versions of pandas.
+
+{{ header }}
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_230.upcoming_changes:
+
+Upcoming changes in pandas 3.0
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+
+.. _whatsnew_230.enhancements:
+
+Enhancements
+~~~~~~~~~~~~
+
+.. _whatsnew_230.enhancements.enhancement1:
+
+enhancement1
+^^^^^^^^^^^^
+
+
+.. _whatsnew_230.enhancements.other:
+
+Other enhancements
+^^^^^^^^^^^^^^^^^^
+
+- The semantics for the ``copy`` keyword in ``__array__`` methods (i.e. called
+  when using ``np.array()`` or ``np.asarray()`` on pandas objects) have been
+  updated to work correctly with NumPy >= 2 (:issue:`57739`)
+- :meth:`Series.str.decode` result now has ``StringDtype`` when ``future.infer_string`` is True (:issue:`60709`)
+- :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype`` (:issue:`60663`)
+- Improved ``repr`` of :class:`.NumpyExtensionArray` to account for NEP51 (:issue:`61085`)
+- :meth:`Series.str.decode` has gained the argument ``dtype`` to control the dtype of the result (:issue:`60940`)
+- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns (:issue:`60633`)
+- The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`); see the example below
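+
+A short sketch of the new string reductions (the ``dtype="string"`` Series here
+is only illustrative):
+
+.. code-block:: python
+
+    import pandas as pd
+
+    s = pd.Series(["a", "b", "c"], dtype="string")
+    # sum concatenates the elements; cumsum concatenates cumulatively
+    s.sum()     # "abc"
+    s.cumsum()  # ["a", "ab", "abc"] as a string Series
+
+.. ---------------------------------------------------------------------------
+..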
_whatsnew_230.notable_bug_fixes: + +Notable bug fixes +~~~~~~~~~~~~~~~~~ + +These are bug fixes that might have notable behavior changes. + +.. _whatsnew_230.notable_bug_fixes.string_comparisons: + +Comparisons between different string dtypes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In previous versions, comparing Series of different string dtypes (e.g. ``pd.StringDtype("pyarrow", na_value=pd.NA)`` against ``pd.StringDtype("python", na_value=np.nan)``) would result in inconsistent resulting dtype or incorrectly raise. pandas will now use the hierarchy + + object < (python, NaN) < (pyarrow, NaN) < (python, NA) < (pyarrow, NA) + +in determining the result dtype when there are different string dtypes compared. Some examples: + +- When ``pd.StringDtype("pyarrow", na_value=pd.NA)`` is compared against any other string dtype, the result will always be ``boolean[pyarrow]``. +- When ``pd.StringDtype("python", na_value=pd.NA)`` is compared against ``pd.StringDtype("pyarrow", na_value=np.nan)``, the result will be ``boolean``, the NumPy-backed nullable extension array. +- When ``pd.StringDtype("python", na_value=pd.NA)`` is compared against ``pd.StringDtype("python", na_value=np.nan)``, the result will be ``boolean``, the NumPy-backed nullable extension array. + +.. _whatsnew_230.api_changes: + +API changes +~~~~~~~~~~~ + +- When enabling the ``future.infer_string`` option: Index set operations (like + union or intersection) will now ignore the dtype of an empty ``RangeIndex`` or + empty ``Index`` with object dtype when determining the dtype of the resulting + Index (:issue:`60797`) + +.. --------------------------------------------------------------------------- +.. _whatsnew_230.deprecations: + +Deprecations +~~~~~~~~~~~~ +- Deprecated allowing non-``bool`` values for ``na`` in :meth:`.str.contains`, :meth:`.str.startswith`, and :meth:`.str.endswith` for dtypes that do not already disallow these (:issue:`59615`) +- Deprecated the ``"pyarrow_numpy"`` storage option for :class:`StringDtype` (:issue:`60152`) + +.. --------------------------------------------------------------------------- +.. _whatsnew_230.performance: + +Performance improvements +~~~~~~~~~~~~~~~~~~~~~~~~ +- +- + +.. --------------------------------------------------------------------------- +.. 
_whatsnew_230.bug_fixes: + +Bug fixes +~~~~~~~~~ + +Categorical +^^^^^^^^^^^ +- +- + +Datetimelike +^^^^^^^^^^^^ +- +- + +Timedelta +^^^^^^^^^ +- +- + +Timezones +^^^^^^^^^ +- +- + +Numeric +^^^^^^^ +- Enabled :class:`Series.mode` and :class:`DataFrame.mode` with ``dropna=False`` to sort the result for all dtypes in the presence of NA values; previously only certain dtypes would sort (:issue:`60702`) +- + +Conversion +^^^^^^^^^^ +- +- + +Strings +^^^^^^^ +- Bug in :meth:`.DataFrameGroupBy.min`, :meth:`.DataFrameGroupBy.max`, :meth:`.Resampler.min`, :meth:`.Resampler.max` on string input of all NA values would return float dtype; now returns string (:issue:`60810`) +- Bug in :meth:`DataFrame.sum` with ``axis=1``, :meth:`.DataFrameGroupBy.sum` or :meth:`.SeriesGroupBy.sum` with ``skipna=True``, and :meth:`.Resampler.sum` on :class:`StringDtype` with all NA values resulted in ``0`` and is now the empty string ``""`` (:issue:`60229`) +- Bug in :meth:`Series.__pos__` and :meth:`DataFrame.__pos__` did not raise for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`60710`) +- Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`) +- Bug in :meth:`Series.replace` with :class:`StringDtype` when replacing with a non-string value was not upcasting to ``object`` dtype (:issue:`60282`) +- Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`) +- Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`) +- Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`) + +Interval +^^^^^^^^ +- +- + +Indexing +^^^^^^^^ +- Fixed bug in :meth:`Index.get_indexer` round-tripping through string dtype when ``infer_string`` is enabled (:issue:`55834`) +- + +Missing +^^^^^^^ +- +- + +MultiIndex +^^^^^^^^^^ +- +- + +I/O +^^^ +- :meth:`DataFrame.to_excel` was storing decimals as strings instead of numbers (:issue:`49598`) +- + +Period +^^^^^^ +- +- + +Plotting +^^^^^^^^ +- +- + +Groupby/resample/rolling +^^^^^^^^^^^^^^^^^^^^^^^^ +- +- + +Reshaping +^^^^^^^^^ +- +- + +Sparse +^^^^^^ +- +- + +ExtensionArray +^^^^^^^^^^^^^^ +- +- + +Styler +^^^^^^ +- +- + +Other +^^^^^ +- Fixed usage of ``inspect`` when the optional dependencies ``pyarrow`` or ``jinja2`` + are not installed (:issue:`60196`) + +.. --------------------------------------------------------------------------- +.. _whatsnew_230.contributors: + +Contributors +~~~~~~~~~~~~ diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst new file mode 100644 index 0000000000000..099e5bc48353a --- /dev/null +++ b/doc/source/whatsnew/v3.0.0.rst @@ -0,0 +1,920 @@ +.. _whatsnew_300: + +What's new in 3.0.0 (Month XX, 2024) +------------------------------------ + +These are the changes in pandas 3.0.0. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- +.. _whatsnew_300.enhancements: + +Enhancements +~~~~~~~~~~~~ + +.. _whatsnew_300.enhancements.enhancement1: + +Enhancement1 +^^^^^^^^^^^^ + +.. 
_whatsnew_300.enhancements.enhancement2:
+
+Enhancement2
+^^^^^^^^^^^^
+
+.. _whatsnew_300.enhancements.other:
+
+Other enhancements
+^^^^^^^^^^^^^^^^^^
+- :class:`pandas.api.typing.FrozenList` is available for typing the outputs of :attr:`MultiIndex.names`, :attr:`MultiIndex.codes` and :attr:`MultiIndex.levels` (:issue:`58237`)
+- :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`)
+- :meth:`pandas.api.interchange.from_dataframe` now uses the `PyCapsule Interface `_ if available, only falling back to the Dataframe Interchange Protocol if that fails (:issue:`60739`)
+- Added :meth:`.Styler.to_typst` to write Styler objects to file, buffer or string in Typst format (:issue:`57617`)
+- Added missing :meth:`pandas.Series.info` to the API reference (:issue:`60926`)
+- :class:`pandas.api.typing.NoDefault` is available for typing ``no_default``
+- :func:`DataFrame.to_excel` now raises a ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`)
+- :func:`pandas.merge` now validates the ``how`` parameter input (merge type) (:issue:`59435`)
+- :func:`pandas.merge`, :meth:`DataFrame.merge` and :meth:`DataFrame.join` now support anti joins (``left_anti`` and ``right_anti``) in the ``how`` parameter (:issue:`42916`)
+- :func:`read_spss` now supports kwargs to be passed to pyreadstat (:issue:`56356`)
+- :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the Stata format (:issue:`55642`)
+- :meth:`DataFrame.agg` called with ``axis=1`` and a ``func`` which relabels the result index now raises a ``NotImplementedError`` (:issue:`58807`)
+- :meth:`Index.get_loc` now also accepts subclasses of ``tuple`` as keys (:issue:`57922`)
+- :meth:`Styler.set_tooltips` provides an alternative method for storing tooltips by using the ``title`` attribute of ``<td>`` elements (:issue:`56981`)
+- Added missing parameter ``weights`` in :meth:`DataFrame.plot.kde` for the estimation of the PDF (:issue:`59337`)
+- Allow dictionaries to be passed to :meth:`pandas.Series.str.replace` via the ``pat`` parameter (:issue:`51748`)
+- Support passing a :class:`Series` input to :func:`json_normalize` that retains the :class:`Series` :class:`Index` (:issue:`51452`)
+- Support reading value labels from Stata 108-format (Stata 6) and earlier files (:issue:`58154`)
+- Users can globally disable any ``PerformanceWarning`` by setting the option ``mode.performance_warnings`` to ``False`` (:issue:`56920`)
+- :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`)
+- :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`)
+- :class:`Rolling` and :class:`Expanding` now support the ``pipe`` method (:issue:`57076`)
+- :class:`Series` now supports the Arrow PyCapsule Interface for export (:issue:`59518`)
+- :func:`DataFrame.to_excel` argument ``merge_cells`` now accepts a value of ``"columns"`` to only merge :class:`MultiIndex` column header cells (:issue:`35384`)
+- :meth:`DataFrame.corrwith` now accepts ``min_periods`` as an optional argument, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`)
+- :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`)
+- :meth:`DataFrame.ewm` now allows ``adjust=False`` when ``times`` is provided (:issue:`54328`)
+- :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`)
+- :meth:`DataFrame.pivot_table` and :func:`pivot_table` now allow the passing of keyword arguments to ``aggfunc`` through ``**kwargs`` (:issue:`57884`)
+- :meth:`DataFrame.to_json` now encodes ``Decimal`` as strings instead of floats (:issue:`60698`)
+- :meth:`Series.cummin` and :meth:`Series.cummax` now support :class:`CategoricalDtype` (:issue:`52335`)
+- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
+- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
+- :meth:`Series.nlargest` uses a 'stable' sort internally and will preserve original ordering
+- :class:`ArrowDtype` now supports ``pyarrow.JsonType`` (:issue:`60958`)
+- :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``median``, ``prod``, ``min``, ``max``, ``std``, ``var`` and ``sem`` now accept a ``skipna`` parameter (:issue:`15675`)
+- :class:`Rolling` and :class:`Expanding` now support ``nunique`` (:issue:`26958`)
+- :class:`Rolling` and :class:`Expanding` now support the aggregations ``first`` and ``last`` (:issue:`33155`)
+- :func:`read_parquet` accepts ``to_pandas_kwargs``, which are forwarded to :meth:`pyarrow.Table.to_pandas`; this enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
+- :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`)
+- :meth:`DataFrame.apply` supports using third-party execution engines like the Bodo.ai JIT compiler (:issue:`60668`)
+- :meth:`DataFrame.iloc` and :meth:`Series.iloc` now support boolean masks in ``__getitem__`` for more consistent indexing behavior (:issue:`60994`)
+- :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now support positional arguments passed as kwargs (:issue:`58995`)
+- :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`)
+- :meth:`Series.map` can now accept kwargs to pass on to ``func`` (:issue:`59814`); see the example after this list
+- :meth:`Series.str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`)
+- :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
+- :py:class:`frozenset` elements in pandas objects are now natively printed (:issue:`60690`)
+- Add ``"delete_rows"`` option to the ``if_exists`` argument in :meth:`DataFrame.to_sql`, deleting all records of the table before inserting data (:issue:`37210`)
+- Added half-year offset classes :class:`HalfYearBegin`, :class:`HalfYearEnd`, :class:`BHalfYearBegin` and :class:`BHalfYearEnd` (:issue:`60928`)
+- Added support to read from Apache Iceberg tables with the new :func:`read_iceberg` function (:issue:`61383`)
+- Errors occurring during SQL I/O will now throw a generic :class:`.DatabaseError` instead of the raw Exception type from the underlying driver manager library (:issue:`60748`)
+- Implemented :meth:`Series.str.isascii` (:issue:`59091`)
+- Improved deprecation message for offset aliases (:issue:`60820`)
+- Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`)
+- Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`)
+- Support passing an :class:`Iterable[Hashable]` input to :meth:`DataFrame.drop_duplicates` (:issue:`59237`)
+- Support reading Stata 102-format (Stata 1) dta files (:issue:`58978`)
+- Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`)
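+
+A minimal sketch of passing keyword arguments through :meth:`Series.map` (the
+``scale`` function is only illustrative):
+
+.. code-block:: python
+
+    import pandas as pd
+
+    def scale(x, factor=1):
+        return x * factor
+
+    s = pd.Series([1, 2, 3])
+    # keyword arguments after ``func`` are forwarded to it
+    s.map(scale, factor=10)  # 10, 20, 30
+
+.. ---------------------------------------------------------------------------
+..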
_whatsnew_300.notable_bug_fixes: + +Notable bug fixes +~~~~~~~~~~~~~~~~~ + +These are bug fixes that might have notable behavior changes. + +.. _whatsnew_300.notable_bug_fixes.groupby_unobs_and_na: + +Improved behavior in groupby for ``observed=False`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +A number of bugs have been fixed due to improved handling of unobserved groups (:issue:`55738`). All remarks in this section equally impact :class:`.SeriesGroupBy`. + +In previous versions of pandas, a single grouping with :meth:`.DataFrameGroupBy.apply` or :meth:`.DataFrameGroupBy.agg` would pass the unobserved groups to the provided function, resulting in ``0`` below. + +.. ipython:: python + + df = pd.DataFrame( + { + "key1": pd.Categorical(list("aabb"), categories=list("abc")), + "key2": [1, 1, 1, 2], + "values": [1, 2, 3, 4], + } + ) + df + gb = df.groupby("key1", observed=False) + gb[["values"]].apply(lambda x: x.sum()) + +However this was not the case when using multiple groupings, resulting in ``NaN`` below. + +.. code-block:: ipython + + In [1]: gb = df.groupby(["key1", "key2"], observed=False) + In [2]: gb[["values"]].apply(lambda x: x.sum()) + Out[2]: + values + key1 key2 + a 1 3.0 + 2 NaN + b 1 3.0 + 2 4.0 + c 1 NaN + 2 NaN + +Now using multiple groupings will also pass the unobserved groups to the provided function. + +.. ipython:: python + + gb = df.groupby(["key1", "key2"], observed=False) + gb[["values"]].apply(lambda x: x.sum()) + +Similarly: + +- In previous versions of pandas the method :meth:`.DataFrameGroupBy.sum` would result in ``0`` for unobserved groups, but :meth:`.DataFrameGroupBy.prod`, :meth:`.DataFrameGroupBy.all`, and :meth:`.DataFrameGroupBy.any` would all result in NA values. Now these methods result in ``1``, ``True``, and ``False`` respectively. +- :meth:`.DataFrameGroupBy.groups` did not include unobserved groups and now does. + +These improvements also fixed certain bugs in groupby: + +- :meth:`.DataFrameGroupBy.agg` would fail when there are multiple groupings, unobserved groups, and ``as_index=False`` (:issue:`36698`) +- :meth:`.DataFrameGroupBy.groups` with ``sort=False`` would sort groups; they now occur in the order they are observed (:issue:`56966`) +- :meth:`.DataFrameGroupBy.nunique` would fail when there are multiple groupings, unobserved groups, and ``as_index=False`` (:issue:`52848`) +- :meth:`.DataFrameGroupBy.sum` would have incorrect values when there are multiple groupings, unobserved groups, and non-numeric data (:issue:`43891`) +- :meth:`.DataFrameGroupBy.value_counts` would produce incorrect results when used with some categorical and some non-categorical groupings and ``observed=False`` (:issue:`56016`) + +.. _whatsnew_300.notable_bug_fixes.notable_bug_fix2: + +notable_bug_fix2 +^^^^^^^^^^^^^^^^ + +.. --------------------------------------------------------------------------- +.. _whatsnew_300.api_breaking: + +Backwards incompatible API changes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. _whatsnew_300.api_breaking.datetime_resolution_inference: + +Datetime resolution inference +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Converting a sequence of strings, ``datetime`` objects, or ``np.datetime64`` objects to +a ``datetime64`` dtype now performs inference on the appropriate resolution (AKA unit) for the output dtype. This affects :class:`Series`, :class:`DataFrame`, :class:`Index`, :class:`DatetimeIndex`, and :func:`to_datetime`. + +Previously, these would always give nanosecond resolution: + +.. 
code-block:: ipython
+
+    In [1]: dt = pd.Timestamp("2024-03-22 11:36").to_pydatetime()
+    In [2]: pd.to_datetime([dt]).dtype
+    Out[2]: dtype('<M8[ns]')
+
+With the new behavior, the resolution of the resulting dtype is inferred from
+the input, so the same call now returns microsecond (``datetime64[us]``)
+resolution.
+
+.. _whatsnew_300.api_breaking.deps:
+
+Increased minimum versions for dependencies
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+For `optional libraries `_ the general recommendation is to use the latest version.
+The following table lists the lowest version per library that is currently being tested throughout the development of pandas.
+Optional libraries below the lowest tested version may still work, but are not considered supported.
+
++------------------------+---------------------+
+| Package                | New Minimum Version |
++========================+=====================+
+| pytz                   | 2023.4              |
++------------------------+---------------------+
+| fastparquet            | 2024.2.0            |
++------------------------+---------------------+
+| adbc-driver-postgresql | 0.10.0              |
++------------------------+---------------------+
+| mypy (dev)             | 1.9.0               |
++------------------------+---------------------+
+| beautifulsoup4         | 4.12.3              |
++------------------------+---------------------+
+| fsspec                 | 2024.2.0            |
++------------------------+---------------------+
+| gcsfs                  | 2024.2.0            |
++------------------------+---------------------+
+| s3fs                   | 2024.2.0            |
++------------------------+---------------------+
+| Jinja2                 | 3.1.3               |
++------------------------+---------------------+
+| matplotlib             | 3.8.3               |
++------------------------+---------------------+
+| numba                  | 0.59.0              |
++------------------------+---------------------+
+| numexpr                | 2.9.0               |
++------------------------+---------------------+
+| pymysql                | 1.1.0               |
++------------------------+---------------------+
+| pyreadstat             | 1.2.6               |
++------------------------+---------------------+
+| SciPy                  | 1.12.0              |
++------------------------+---------------------+
+| xarray                 | 2024.1.0            |
++------------------------+---------------------+
+| xlsxwriter             | 3.2.0               |
++------------------------+---------------------+
+| zstandard              | 0.22.0              |
++------------------------+---------------------+
+
+See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more.
+
+.. _whatsnew_300.api_breaking.pytz:
+
+``pytz`` now an optional dependency
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+pandas now uses :py:mod:`zoneinfo` from the standard library as the default timezone implementation when passing a timezone
+string to various methods (:issue:`34916`).
+
+*Old behavior:*
+
+.. code-block:: ipython
+
+    In [1]: ts = pd.Timestamp(2024, 1, 1).tz_localize("US/Pacific")
+    In [2]: ts.tz
+    Out[2]: <DstTzInfo 'US/Pacific' LMT-1 day, 16:07:00 STD>
+
+*New behavior:*
+
+.. ipython:: python
+
+    ts = pd.Timestamp(2024, 1, 1).tz_localize("US/Pacific")
+    ts.tz
+
+``pytz`` timezone objects are still supported when passed directly, but they will no longer be returned by default
+from string inputs. Moreover, ``pytz`` is no longer a required dependency of pandas, but can be installed
+with the pip extra ``pip install pandas[timezone]``.
+
+Additionally, pandas no longer throws ``pytz`` exceptions for timezone operations leading to ambiguous or nonexistent
+times. These cases will now raise a ``ValueError``.
+
+.. _whatsnew_300.api_breaking.other:
+
+Other API changes
+^^^^^^^^^^^^^^^^^
+- 3rd party ``py.path`` objects are no longer explicitly supported in IO methods.
Use :py:class:`pathlib.Path` objects instead (:issue:`57091`) +- :func:`read_table`'s ``parse_dates`` argument defaults to ``None`` to improve consistency with :func:`read_csv` (:issue:`57476`) +- All classes inheriting from builtin ``tuple`` (including types created with :func:`collections.namedtuple`) are now hashed and compared as builtin ``tuple`` during indexing operations (:issue:`57922`) +- Made ``dtype`` a required argument in :meth:`ExtensionArray._from_sequence_of_strings` (:issue:`56519`) +- Passing a :class:`Series` input to :func:`json_normalize` will now retain the :class:`Series` :class:`Index`, previously output had a new :class:`RangeIndex` (:issue:`51452`) +- Removed :meth:`Index.sort` which always raised a ``TypeError``. This attribute is not defined and will raise an ``AttributeError`` (:issue:`59283`) +- Unused ``dtype`` argument has been removed from the :class:`MultiIndex` constructor (:issue:`60962`) +- Updated :meth:`DataFrame.to_excel` so that the output spreadsheet has no styling. Custom styling can still be done using :meth:`Styler.to_excel` (:issue:`54154`) +- pickle and HDF (``.h5``) files created with Python 2 are no longer explicitly supported (:issue:`57387`) +- pickled objects from pandas version less than ``1.0.0`` are no longer supported (:issue:`57155`) +- when comparing the indexes in :func:`testing.assert_series_equal`, check_exact defaults to True if an :class:`Index` is of integer dtypes. (:issue:`57386`) +- Index set operations (like union or intersection) will now ignore the dtype of + an empty ``RangeIndex`` or empty ``Index`` with object dtype when determining + the dtype of the resulting Index (:issue:`60797`) + +.. --------------------------------------------------------------------------- +.. _whatsnew_300.deprecations: + +Deprecations +~~~~~~~~~~~~ + +Copy keyword +^^^^^^^^^^^^ + +The ``copy`` keyword argument in the following methods is deprecated and +will be removed in a future version: + +- :meth:`DataFrame.truncate` / :meth:`Series.truncate` +- :meth:`DataFrame.tz_convert` / :meth:`Series.tz_convert` +- :meth:`DataFrame.tz_localize` / :meth:`Series.tz_localize` +- :meth:`DataFrame.infer_objects` / :meth:`Series.infer_objects` +- :meth:`DataFrame.align` / :meth:`Series.align` +- :meth:`DataFrame.astype` / :meth:`Series.astype` +- :meth:`DataFrame.reindex` / :meth:`Series.reindex` +- :meth:`DataFrame.reindex_like` / :meth:`Series.reindex_like` +- :meth:`DataFrame.set_axis` / :meth:`Series.set_axis` +- :meth:`DataFrame.to_period` / :meth:`Series.to_period` +- :meth:`DataFrame.to_timestamp` / :meth:`Series.to_timestamp` +- :meth:`DataFrame.rename` / :meth:`Series.rename` +- :meth:`DataFrame.transpose` +- :meth:`DataFrame.swaplevel` +- :meth:`DataFrame.merge` / :func:`pd.merge` + +Copy-on-Write utilizes a lazy copy mechanism that defers copying the data until +necessary. Use ``.copy`` to trigger an eager copy. The copy keyword has no effect +starting with 3.0, so it can be safely removed from your code. 
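+
+For example (a small sketch):
+
+.. code-block:: python
+
+    import pandas as pd
+
+    df = pd.DataFrame({"a": [1, 2]})
+
+    # previously: df.rename(columns=str.upper, copy=True)
+    renamed = df.rename(columns=str.upper)        # the copy keyword can be dropped
+    eager = df.rename(columns=str.upper).copy()   # trigger an eager copy if needed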
+
+Other Deprecations
+^^^^^^^^^^^^^^^^^^
+
+- Deprecated :func:`core.internals.api.make_block`, use public APIs instead (:issue:`56815`)
+- Deprecated :meth:`.DataFrameGroupBy.corrwith` (:issue:`57158`)
+- Deprecated :meth:`Timestamp.utcfromtimestamp`, use ``Timestamp.fromtimestamp(ts, "UTC")`` instead (:issue:`56680`)
+- Deprecated :meth:`Timestamp.utcnow`, use ``Timestamp.now("UTC")`` instead (:issue:`56680`)
+- Deprecated allowing non-keyword arguments in :meth:`DataFrame.all`, :meth:`DataFrame.min`, :meth:`DataFrame.max`, :meth:`DataFrame.sum`, :meth:`DataFrame.prod`, :meth:`DataFrame.mean`, :meth:`DataFrame.median`, :meth:`DataFrame.sem`, :meth:`DataFrame.var`, :meth:`DataFrame.std`, :meth:`DataFrame.skew`, :meth:`DataFrame.kurt`, :meth:`Series.all`, :meth:`Series.min`, :meth:`Series.max`, :meth:`Series.sum`, :meth:`Series.prod`, :meth:`Series.mean`, :meth:`Series.median`, :meth:`Series.sem`, :meth:`Series.var`, :meth:`Series.std`, :meth:`Series.skew`, and :meth:`Series.kurt` (:issue:`57087`)
+- Deprecated allowing non-keyword arguments in :meth:`Series.to_markdown` except ``buf`` (:issue:`57280`)
+- Deprecated allowing non-keyword arguments in :meth:`Series.to_string` except ``buf`` (:issue:`57280`)
+- Deprecated behavior of :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupBy.groups`; in a future version, grouping by a one-element list will return a tuple instead of a scalar (:issue:`58858`)
+- Deprecated behavior of :meth:`Series.dt.to_pytimedelta`; in a future version this will return a :class:`Series` containing python ``datetime.timedelta`` objects instead of an ``ndarray`` of timedelta, matching the behavior of other :meth:`Series.dt` properties (:issue:`57463`)
+- Deprecated lowercase strings ``d``, ``b`` and ``c`` denoting frequencies in :class:`Day`, :class:`BusinessDay` and :class:`CustomBusinessDay` in favour of ``D``, ``B`` and ``C`` (:issue:`58998`)
+- Deprecated lowercase strings ``w``, ``w-mon``, ``w-tue``, etc. denoting frequencies in :class:`Week` in favour of ``W``, ``W-MON``, ``W-TUE``, etc. (:issue:`58998`)
+- Deprecated parameter ``method`` in :meth:`DataFrame.reindex_like` / :meth:`Series.reindex_like` (:issue:`58667`)
+- Deprecated strings ``w``, ``d``, ``MIN``, ``MS``, ``US`` and ``NS`` denoting units in :class:`Timedelta` in favour of ``W``, ``D``, ``min``, ``ms``, ``us`` and ``ns`` (:issue:`59051`)
+- Deprecated the ``arg`` parameter of ``Series.map``; pass the added ``func`` argument instead (:issue:`61260`)
+- Deprecated using ``epoch`` date format in :meth:`DataFrame.to_json` and :meth:`Series.to_json`, use ``iso`` instead (:issue:`57063`)
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_300.prior_deprecations:
+
+Removal of prior version deprecations/changes
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Enforced deprecation of aliases ``M``, ``Q``, ``Y``, etc.
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Renamed the following offset aliases (:issue:`57986`):
+
++-------------------------------+------------------+------------------+
+| offset                        | removed alias    | new alias        |
++===============================+==================+==================+
+|:class:`MonthEnd`              | ``M``            | ``ME``           |
++-------------------------------+------------------+------------------+
+|:class:`BusinessMonthEnd`      | ``BM``           | ``BME``          |
++-------------------------------+------------------+------------------+
+|:class:`SemiMonthEnd`          | ``SM``           | ``SME``          |
++-------------------------------+------------------+------------------+
+|:class:`CustomBusinessMonthEnd`| ``CBM``          | ``CBME``         |
++-------------------------------+------------------+------------------+
+|:class:`QuarterEnd`            | ``Q``            | ``QE``           |
++-------------------------------+------------------+------------------+
+|:class:`BQuarterEnd`           | ``BQ``           | ``BQE``          |
++-------------------------------+------------------+------------------+
+|:class:`YearEnd`               | ``Y``            | ``YE``           |
++-------------------------------+------------------+------------------+
+|:class:`BYearEnd`              | ``BY``           | ``BYE``          |
++-------------------------------+------------------+------------------+
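+
+For example (a minimal sketch; per the table above, the removed aliases now raise a
+``ValueError``):
+
+.. code-block:: python
+
+    import pandas as pd
+
+    pd.date_range("2024-01-31", periods=3, freq="ME")  # month end; was freq="M"
+    pd.date_range("2024-03-31", periods=4, freq="QE")  # quarter end; was freq="Q"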
+
+Other Removals
+^^^^^^^^^^^^^^
+- :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, and :meth:`.SeriesGroupBy.idxmax` will now raise a ``ValueError`` when used with ``skipna=False`` and an NA value is encountered (:issue:`10694`)
+- :func:`concat` no longer ignores empty objects when determining output dtypes (:issue:`39122`)
+- :func:`concat` with all-NA entries no longer ignores the dtype of those entries when determining the result dtype (:issue:`40893`)
+- :func:`read_excel`, :func:`read_json`, :func:`read_html`, and :func:`read_xml` no longer accept raw string or byte representation of the data. That type of data must be wrapped in a :py:class:`StringIO` or :py:class:`BytesIO` (:issue:`53767`)
+- :func:`to_datetime` with a ``unit`` specified no longer parses strings into floats, instead parses them the same way as without ``unit`` (:issue:`50735`)
+- :meth:`DataFrame.groupby` with ``as_index=False`` and aggregation methods will no longer exclude from the result the groupings that do not arise from the input (:issue:`49519`)
+- :meth:`ExtensionArray._reduce` now requires a ``keepdims: bool = False`` parameter in the signature (:issue:`52788`)
+- :meth:`Series.dt.to_pydatetime` now returns a :class:`Series` of :py:class:`datetime.datetime` objects (:issue:`52459`)
+- :meth:`SeriesGroupBy.agg` no longer pins the name of the group to the input passed to the provided ``func`` (:issue:`51703`)
+- All arguments except ``name`` in :meth:`Index.rename` are now keyword only (:issue:`56493`)
+- All arguments except the first ``path``-like argument in IO writers are now keyword only (:issue:`54229`)
+- Changed behavior of :meth:`Series.__getitem__` and :meth:`Series.__setitem__` to always treat integer keys as labels, never as positional, consistent with :class:`DataFrame` behavior (:issue:`50617`)
+- Changed behavior of :meth:`Series.__getitem__`, :meth:`Series.__setitem__`, :meth:`DataFrame.__getitem__`, :meth:`DataFrame.__setitem__` with an integer slice on objects with a floating-dtype index; this is now treated as *positional* indexing (:issue:`49612`)
+- Disallow a callable argument to :meth:`Series.iloc` that returns a ``tuple`` (:issue:`53769`)
+- Disallow logical operations (``|``, ``&``, ``^``) between pandas objects and dtype-less sequences (e.g. ``list``, ``tuple``); wrap the objects in :class:`Series`, :class:`Index`, or ``np.array`` first instead (:issue:`52264`)
+- Disallow automatic casting to object in :class:`Series` logical operations (``&``, ``^``, ``|``) between series with mismatched indexes and dtypes other than ``object`` or ``bool`` (:issue:`52538`)
+- Disallow calling :meth:`Series.replace` or :meth:`DataFrame.replace` without a ``value`` and with non-dict-like ``to_replace`` (:issue:`33302`)
+- Disallow constructing a :class:`arrays.SparseArray` with scalar data (:issue:`53039`)
+- Disallow indexing an :class:`Index` with a boolean indexer of length zero; this now raises a ``ValueError`` (:issue:`55820`)
+- Disallow non-standard inputs (anything other than ``np.ndarray``, :class:`Index`, :class:`ExtensionArray`, or :class:`Series`) to :func:`isin`, :func:`unique`, :func:`factorize` (:issue:`52986`)
+- Disallow passing a pandas type to :meth:`Index.view` (:issue:`55709`)
+- Disallow units other than ``"s"``, ``"ms"``, ``"us"``, ``"ns"`` for datetime64 and timedelta64 dtypes in :func:`array` (:issue:`53817`)
+- Removed the ``freq`` keyword from the :class:`PeriodArray` constructor, use ``dtype`` instead (:issue:`52462`)
+- Removed the ``fastpath`` keyword in the :class:`Categorical` constructor (:issue:`20110`)
+- Removed the ``kind`` keyword in :meth:`Series.resample` and :meth:`DataFrame.resample` (:issue:`58125`)
+- Removed ``Block``, ``DatetimeTZBlock``, ``ExtensionBlock``, ``create_block_manager_from_blocks`` from ``pandas.core.internals`` and ``pandas.core.internals.api`` (:issue:`55139`)
+- Removed alias :class:`arrays.PandasArray` for :class:`arrays.NumpyExtensionArray` (:issue:`53694`)
+- Removed the deprecated ``method`` and ``limit`` keywords from :meth:`Series.replace` and :meth:`DataFrame.replace` (:issue:`53492`)
+- Removed extension test classes ``BaseNoReduceTests``, ``BaseNumericReduceTests``, ``BaseBooleanReduceTests`` (:issue:`54663`)
+- Removed the ``closed`` and ``normalize`` keywords in :meth:`DatetimeIndex.__new__` (:issue:`52628`)
+- Removed the deprecated ``delim_whitespace`` keyword in :func:`read_csv` and :func:`read_table`, use ``sep=r"\s+"`` instead (:issue:`55569`)
+- Require :meth:`SparseDtype.fill_value` to be a valid value for the :meth:`SparseDtype.subtype` (:issue:`53043`)
+- Stopped automatically casting non-datetimelike values (mainly strings) in :meth:`Series.isin` and :meth:`Index.isin` with ``datetime64``, ``timedelta64``, and :class:`PeriodDtype` dtypes (:issue:`53111`)
+- Stopped performing dtype inference in :class:`Index`, :class:`Series` and :class:`DataFrame` constructors when given a pandas object (:class:`Series`, :class:`Index`, :class:`ExtensionArray`); call ``.infer_objects`` on the input to keep the current behavior (:issue:`56012`)
+- Stopped performing dtype inference when setting a :class:`Index` into a :class:`DataFrame` (:issue:`56102`)
+- Stopped performing dtype inference in :meth:`Index.insert` with object-dtype index; this often affects the index/columns that result when setting new entries into an empty :class:`Series` or :class:`DataFrame` (:issue:`51363`)
+- Removed the ``closed`` and ``unit`` keywords in :meth:`TimedeltaIndex.__new__` (:issue:`52628`, :issue:`55499`)
+- All arguments in :meth:`Index.sort_values` are now keyword only (:issue:`56493`)
+- All arguments in :meth:`Series.to_dict` are now keyword only (:issue:`56493`)
+- Changed the default value of ``na_action`` in :meth:`Categorical.map` to ``None`` (:issue:`51645`)
+- Changed the default value of ``observed`` in :meth:`DataFrame.groupby` and :meth:`Series.groupby` to ``True`` (:issue:`51811`)
+- Enforced deprecation in :func:`testing.assert_series_equal` and :func:`testing.assert_frame_equal` with object dtype and mismatched null-like values, which are now considered not-equal (:issue:`18463`)
+- Enforced ban of upcasting in in-place setitem-like operations (:issue:`59007`) (see `PDEP6 <https://pandas.pydata.org/pdeps/0006-ban-upcasting.html>`_)
+- Enforced deprecation of ``all`` and ``any`` reductions with ``datetime64``, :class:`DatetimeTZDtype`, and :class:`PeriodDtype` dtypes (:issue:`58029`)
+- Enforced deprecation disallowing ``float`` ``periods`` in :func:`date_range`, :func:`period_range`, :func:`timedelta_range`, :func:`interval_range` (:issue:`56036`)
+- Enforced deprecation disallowing parsing datetimes with mixed time zones unless the user passes ``utc=True`` to :func:`to_datetime` (:issue:`57275`)
+- Enforced deprecation in :meth:`Series.value_counts` and :meth:`Index.value_counts` with object dtype performing dtype inference on the ``.index`` of the result (:issue:`56161`)
+- Enforced deprecation of :meth:`.DataFrameGroupBy.get_group` and :meth:`.SeriesGroupBy.get_group` allowing the ``name`` argument to be a non-tuple when grouping by a list of length 1 (:issue:`54155`)
+- Enforced deprecation of :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` for object-dtype (:issue:`57820`)
+- Enforced deprecation of :meth:`offsets.Tick.delta`, use ``pd.Timedelta(obj)`` instead (:issue:`55498`)
+- Enforced deprecation of ``axis=None`` acting the same as ``axis=0`` in the DataFrame reductions ``sum``, ``prod``, ``std``, ``var``, and ``sem``; passing ``axis=None`` will now reduce over both axes, which matters in particular when doing e.g. ``numpy.sum(df)`` (:issue:`21597`)
+- Enforced deprecation of ``core.internals`` member ``DatetimeTZBlock`` (:issue:`58467`)
+- Enforced deprecation of ``date_parser`` in :func:`read_csv`, :func:`read_table`, :func:`read_fwf`, and :func:`read_excel` in favour of ``date_format`` (:issue:`50601`)
+- Enforced deprecation of the ``keep_date_col`` keyword in :func:`read_csv` (:issue:`55569`)
+- Enforced deprecation of the ``quantile`` keyword in :meth:`.Rolling.quantile` and :meth:`.Expanding.quantile`, renamed to ``q`` instead (:issue:`52550`)
+- Enforced deprecation of the argument ``infer_datetime_format`` in :func:`read_csv`, as a strict version of it is now the default (:issue:`48621`)
+- Enforced deprecation of combining parsed datetime columns in :func:`read_csv` via ``parse_dates`` (:issue:`55569`)
+- Enforced deprecation of non-standard (``np.ndarray``, :class:`ExtensionArray`, :class:`Index`, or :class:`Series`) argument to :func:`api.extensions.take` (:issue:`52981`)
+- Enforced deprecation of parsing system timezone strings to ``tzlocal``, which depended on the system timezone; pass the ``tz`` keyword instead (:issue:`50791`)
+- Enforced deprecation of passing a dictionary to :meth:`SeriesGroupBy.agg` (:issue:`52268`)
+- Enforced deprecation of string ``AS`` denoting frequency in :class:`YearBegin` and strings ``AS-DEC``, ``AS-JAN``, etc. denoting annual frequencies with various fiscal year starts (:issue:`57793`)
+- Enforced deprecation of string ``A`` denoting frequency in :class:`YearEnd` and strings ``A-DEC``, ``A-JAN``, etc. denoting annual frequencies with various fiscal year ends (:issue:`57699`)
+- Enforced deprecation of string ``BAS`` denoting frequency in :class:`BYearBegin` and strings ``BAS-DEC``, ``BAS-JAN``, etc. denoting annual frequencies with various fiscal year starts (:issue:`57793`)
+- Enforced deprecation of string ``BA`` denoting frequency in :class:`BYearEnd` and strings ``BA-DEC``, ``BA-JAN``, etc. denoting annual frequencies with various fiscal year ends (:issue:`57793`)
+- Enforced deprecation of strings ``H``, ``BH``, and ``CBH`` denoting frequencies in :class:`Hour`, :class:`BusinessHour`, :class:`CustomBusinessHour` (:issue:`59143`)
+- Enforced deprecation of strings ``H``, ``BH``, and ``CBH`` denoting units in :class:`Timedelta` (:issue:`59143`)
+- Enforced deprecation of strings ``T``, ``L``, ``U``, and ``N`` denoting frequencies in :class:`Minute`, :class:`Milli`, :class:`Micro`, :class:`Nano` (:issue:`57627`)
+- Enforced deprecation of strings ``T``, ``L``, ``U``, and ``N`` denoting units in :class:`Timedelta` (:issue:`57627`)
+- Enforced deprecation of the behavior of :func:`concat` when ``len(keys) != len(objs)``, which would truncate to the shorter of the two; this now raises a ``ValueError`` (:issue:`43485`)
+- Enforced deprecation of the behavior of :meth:`DataFrame.replace` and :meth:`Series.replace` with :class:`CategoricalDtype` that would introduce new categories (:issue:`58270`)
+- Enforced deprecation of the behavior of :meth:`Series.argsort` in the presence of NA values (:issue:`58232`)
+- Enforced deprecation of values ``"pad"``, ``"ffill"``, ``"bfill"``, and ``"backfill"`` for :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` (:issue:`57869`)
+- Enforced deprecation removing :meth:`Categorical.to_list`, use ``obj.tolist()`` instead (:issue:`51254`)
+- Enforced silent-downcasting deprecation for :ref:`all relevant methods ` (:issue:`54710`)
+- In :meth:`DataFrame.stack`, the default value of ``future_stack`` is now ``True``; specifying ``False`` will raise a ``FutureWarning`` (:issue:`55448`)
+- Iterating over a :class:`.DataFrameGroupBy` or :class:`.SeriesGroupBy` will return tuples of length 1 for the groups when grouping by a ``level`` that is a list of length 1 (:issue:`50064`)
+- Methods ``apply``, ``agg``, and ``transform`` will no longer replace NumPy functions (e.g. ``np.sum``) and built-in functions (e.g. ``min``) with the equivalent pandas implementation; use string aliases (e.g. ``"sum"`` and ``"min"``) if you want the pandas implementation (:issue:`53974`)
``"sum"`` and ``"min"``) if you desire to use the pandas implementation (:issue:`53974`) +- Passing both ``freq`` and ``fill_value`` in :meth:`DataFrame.shift` and :meth:`Series.shift` and :meth:`.DataFrameGroupBy.shift` now raises a ``ValueError`` (:issue:`54818`) +- Removed :meth:`.DataFrameGroupBy.quantile` and :meth:`.SeriesGroupBy.quantile` supporting bool dtype (:issue:`53975`) +- Removed :meth:`DateOffset.is_anchored` and :meth:`offsets.Tick.is_anchored` (:issue:`56594`) +- Removed ``DataFrame.applymap``, ``Styler.applymap`` and ``Styler.applymap_index`` (:issue:`52364`) +- Removed ``DataFrame.bool`` and ``Series.bool`` (:issue:`51756`) +- Removed ``DataFrame.first`` and ``DataFrame.last`` (:issue:`53710`) +- Removed ``DataFrame.swapaxes`` and ``Series.swapaxes`` (:issue:`51946`) +- Removed ``DataFrameGroupBy.grouper`` and ``SeriesGroupBy.grouper`` (:issue:`56521`) +- Removed ``DataFrameGroupby.fillna`` and ``SeriesGroupBy.fillna``` (:issue:`55719`) +- Removed ``Index.format``, use :meth:`Index.astype` with ``str`` or :meth:`Index.map` with a ``formatter`` function instead (:issue:`55439`) +- Removed ``Resample.fillna`` (:issue:`55719`) +- Removed ``Series.__int__`` and ``Series.__float__``. Call ``int(Series.iloc[0])`` or ``float(Series.iloc[0])`` instead. (:issue:`51131`) +- Removed ``Series.ravel`` (:issue:`56053`) +- Removed ``Series.view`` (:issue:`56054`) +- Removed ``StataReader.close`` (:issue:`49228`) +- Removed ``_data`` from :class:`DataFrame`, :class:`Series`, :class:`.arrays.ArrowExtensionArray` (:issue:`52003`) +- Removed ``axis`` argument from :meth:`DataFrame.groupby`, :meth:`Series.groupby`, :meth:`DataFrame.rolling`, :meth:`Series.rolling`, :meth:`DataFrame.resample`, and :meth:`Series.resample` (:issue:`51203`) +- Removed ``axis`` argument from all groupby operations (:issue:`50405`) +- Removed ``convert_dtype`` from :meth:`Series.apply` (:issue:`52257`) +- Removed ``method``, ``limit`` ``fill_axis`` and ``broadcast_axis`` keywords from :meth:`DataFrame.align` (:issue:`51968`) +- Removed ``pandas.api.types.is_interval`` and ``pandas.api.types.is_period``, use ``isinstance(obj, pd.Interval)`` and ``isinstance(obj, pd.Period)`` instead (:issue:`55264`) +- Removed ``pandas.io.sql.execute`` (:issue:`50185`) +- Removed ``pandas.value_counts``, use :meth:`Series.value_counts` instead (:issue:`53493`) +- Removed ``read_gbq`` and ``DataFrame.to_gbq``. 
+- Removed ``use_nullable_dtypes`` from :func:`read_parquet` (:issue:`51853`)
+- Removed the ``year``, ``month``, ``quarter``, ``day``, ``hour``, ``minute``, and ``second`` keywords in the :class:`PeriodIndex` constructor, use :meth:`PeriodIndex.from_fields` instead (:issue:`55960`)
+- Removed the argument ``limit`` from :meth:`DataFrame.pct_change`, :meth:`Series.pct_change`, :meth:`.DataFrameGroupBy.pct_change`, and :meth:`.SeriesGroupBy.pct_change`; the argument ``method`` must be set to ``None`` and will be removed in a future version of pandas (:issue:`53520`)
+- Removed the deprecated argument ``obj`` in :meth:`.DataFrameGroupBy.get_group` and :meth:`.SeriesGroupBy.get_group` (:issue:`53545`)
+- Removed the deprecated behavior of :meth:`Series.agg` using :meth:`Series.apply` (:issue:`53325`)
+- Removed the deprecated keyword ``method`` on :meth:`Series.fillna`, :meth:`DataFrame.fillna` (:issue:`57760`)
+- Removed the option ``mode.use_inf_as_na``; convert inf entries to ``NaN`` beforehand instead (:issue:`51684`)
+- Removed support for :class:`DataFrame` in :meth:`DataFrame.from_records` (:issue:`51697`)
+- Removed support for ``errors="ignore"`` in :func:`to_datetime`, :func:`to_timedelta` and :func:`to_numeric` (:issue:`55734`)
+- Removed support for ``slice`` in :meth:`DataFrame.take` (:issue:`51539`)
+- Removed the ``ArrayManager`` (:issue:`55043`)
+- Removed the ``fastpath`` argument from the :class:`Series` constructor (:issue:`55466`)
+- Removed the ``is_boolean``, ``is_integer``, ``is_floating``, ``holds_integer``, ``is_numeric``, ``is_categorical``, ``is_object``, and ``is_interval`` attributes of :class:`Index` (:issue:`50042`)
+- Removed the ``ordinal`` keyword in :class:`PeriodIndex`, use :meth:`PeriodIndex.from_ordinals` instead (:issue:`55960`)
+- Removed unused arguments ``*args`` and ``**kwargs`` in :class:`Resampler` methods (:issue:`50977`)
+- Unrecognized timezones when parsing strings to datetimes now raise a ``ValueError`` (:issue:`51477`)
+- Removed the :class:`Grouper` attributes ``ax``, ``groups``, ``indexer``, and ``obj`` (:issue:`51206`, :issue:`51182`)
+- Removed the deprecated keyword ``verbose`` on :func:`read_csv` and :func:`read_table` (:issue:`56556`)
+- Removed the ``method`` keyword in ``ExtensionArray.fillna``, implement ``ExtensionArray._pad_or_backfill`` instead (:issue:`53621`)
+- Removed the attribute ``dtypes`` from :class:`.DataFrameGroupBy` (:issue:`51997`)
+- Enforced deprecation of ``argmin``, ``argmax``, ``idxmin``, and ``idxmax`` returning a result when ``skipna=False`` and an NA value is encountered or all values are NA values; these operations will now raise in such cases (:issue:`33941`, :issue:`51276`)
+- Removed specifying ``include_groups=True`` in :meth:`.DataFrameGroupBy.apply` and :meth:`.Resampler.apply` (:issue:`7155`)
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_300.performance:
+
+Performance improvements
+~~~~~~~~~~~~~~~~~~~~~~~~
+- Eliminated circular reference to the original pandas object in accessor attributes (e.g. :attr:`Series.str`). However, accessor instantiation is no longer cached (:issue:`47667`, :issue:`41357`)
+- :attr:`Categorical.categories` returns a :class:`RangeIndex` instead of an :class:`Index` if the constructed ``values`` was a ``range`` (:issue:`57787`)
+- :class:`DataFrame` returns :class:`RangeIndex` columns when possible when ``data`` is a ``dict`` (:issue:`57943`)
+- :class:`Series` returns a :class:`RangeIndex` index when possible when ``data`` is a ``dict`` (:issue:`58118`)
+- :func:`concat` returns :class:`RangeIndex` columns when possible when ``objs`` contains :class:`Series` and :class:`DataFrame` and ``axis=0`` (:issue:`58119`)
+- :func:`concat` returns a :class:`RangeIndex` level in the :class:`MultiIndex` result when ``keys`` is a ``range`` or :class:`RangeIndex` (:issue:`57542`)
+- :meth:`RangeIndex.append` returns a :class:`RangeIndex` instead of an :class:`Index` when appending values that could continue the :class:`RangeIndex` (:issue:`57467`)
+- :meth:`Series.nlargest` has improved performance when there are duplicate values in the index (:issue:`55767`)
+- :meth:`Series.str.extract` returns :class:`RangeIndex` columns instead of :class:`Index` columns when possible (:issue:`57542`)
+- :meth:`Series.str.partition` with :class:`ArrowDtype` returns :class:`RangeIndex` columns instead of :class:`Index` columns when possible (:issue:`57768`)
+- Performance improvement in :class:`DataFrame` when ``data`` is a ``dict`` and ``columns`` is specified (:issue:`24368`)
+- Performance improvement in :class:`MultiIndex`: setting :attr:`MultiIndex.names` no longer invalidates all cached operations (:issue:`59578`)
+- Performance improvement in :meth:`DataFrame.join` for sorted but non-unique indexes (:issue:`56941`)
+- Performance improvement in :meth:`DataFrame.join` when left and/or right are non-unique and ``how`` is ``"left"``, ``"right"``, or ``"inner"`` (:issue:`56817`)
+- Performance improvement in :meth:`DataFrame.join` with ``how="left"`` or ``how="right"`` and ``sort=True`` (:issue:`56919`)
+- Performance improvement in :meth:`DataFrame.to_csv` when ``index=False`` (:issue:`59312`)
+- Performance improvement in :meth:`DataFrameGroupBy.ffill`, :meth:`DataFrameGroupBy.bfill`, :meth:`SeriesGroupBy.ffill`, and :meth:`SeriesGroupBy.bfill` (:issue:`56902`)
+- Performance improvement in :meth:`Index.join` by propagating cached attributes in cases where the result matches one of the inputs (:issue:`57023`)
+- Performance improvement in :meth:`Index.take` when ``indices`` is a full range indexer from zero to length of index (:issue:`56806`)
+- Performance improvement in :meth:`Index.to_frame` returning :class:`RangeIndex` columns instead of :class:`Index` columns when possible (:issue:`58018`)
+- Performance improvement in :meth:`MultiIndex._engine` to use smaller dtypes if possible (:issue:`58411`)
+- Performance improvement in :meth:`MultiIndex.equals` for equal length indexes (:issue:`56990`)
+- Performance improvement in :meth:`MultiIndex.memory_usage` to ignore the index engine when it isn't already cached (:issue:`58385`)
+- Performance improvement in :meth:`RangeIndex.__getitem__` with a boolean mask or integers returning a :class:`RangeIndex` instead of an :class:`Index` when possible (:issue:`57588`)
+- Performance improvement in :meth:`RangeIndex.append` when appending the same index (:issue:`57252`)
+- Performance improvement in :meth:`RangeIndex.argmin` and :meth:`RangeIndex.argmax` (:issue:`57823`)
+- Performance improvement in :meth:`RangeIndex.insert` returning a :class:`RangeIndex` instead of an :class:`Index` when the :class:`RangeIndex` is empty (:issue:`57833`)
+- Performance improvement in :meth:`RangeIndex.round` returning a :class:`RangeIndex` instead of an :class:`Index` when possible (:issue:`57824`)
+- Performance improvement in :meth:`RangeIndex.searchsorted` (:issue:`58376`)
+- Performance improvement in :meth:`RangeIndex.to_numpy` when specifying an ``na_value`` (:issue:`58376`)
+- Performance improvement in :meth:`RangeIndex.value_counts` (:issue:`58376`)
+- Performance improvement in :meth:`RangeIndex.join` returning a :class:`RangeIndex` instead of an :class:`Index` when possible (:issue:`57651`, :issue:`57752`)
+- Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of an :class:`Index` when possible (:issue:`57647`, :issue:`57752`)
+- Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of an :class:`Index` when possible (:issue:`57445`, :issue:`57752`)
+- Performance improvement in :func:`merge` if a hash-join can be used (:issue:`57970`)
+- Performance improvement in :meth:`CategoricalDtype.update_dtype` when ``dtype`` is a :class:`CategoricalDtype` with non-``None`` categories and ``ordered`` (:issue:`59647`)
+- Performance improvement in :meth:`DataFrame.__getitem__` when ``key`` is a :class:`DataFrame` with many columns (:issue:`61010`)
+- Performance improvement in :meth:`DataFrame.astype` when converting to extension floating dtypes, e.g. "Float64" (:issue:`60066`)
+- Performance improvement in :meth:`DataFrame.stack` when using ``future_stack=True`` and the DataFrame does not have a :class:`MultiIndex` (:issue:`58391`)
+- Performance improvement in :meth:`DataFrame.where` when ``cond`` is a :class:`DataFrame` with many columns (:issue:`61010`)
+- Performance improvement in :meth:`to_hdf`, avoiding unnecessary reopenings of the HDF5 file to speed up adding data to files with a very large number of groups (:issue:`58248`)
+- Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`)
+- Performance improvement in indexing operations for string dtypes (:issue:`56997`)
+- Performance improvement in unary methods on a :class:`RangeIndex` returning a :class:`RangeIndex` instead of an :class:`Index` when possible (:issue:`57825`)
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_300.bug_fixes:
+
+Bug fixes
+~~~~~~~~~
+
+Categorical
+^^^^^^^^^^^
+- Bug in :meth:`Series.apply` where ``nan`` was ignored for :class:`CategoricalDtype` (:issue:`59938`)
+- Bug in :meth:`DataFrame.pivot` and :meth:`DataFrame.set_index` raising an ``ArrowNotImplementedError`` for columns with pyarrow dictionary dtype (:issue:`53051`)
+- Bug in :meth:`Series.convert_dtypes` with ``dtype_backend="pyarrow"`` where an empty :class:`CategoricalDtype` :class:`Series` raised an error or got converted to ``null[pyarrow]`` (:issue:`59934`)
+-
+
+Datetimelike
+^^^^^^^^^^^^
+- Bug in :attr:`DatetimeIndex.is_year_start` where a :class:`DatetimeIndex` constructed via :func:`date_range` with frequency ``"MS"`` wouldn't have the correct year or quarter start attributes (:issue:`57377`)
+- Bug in :class:`DataFrame` raising ``ValueError`` when ``dtype`` is ``timedelta64`` and ``data`` is a list containing ``None`` (:issue:`60064`)
+- Bug in :class:`Timestamp` constructor failing to raise when ``tz=None`` is explicitly specified in conjunction with timezone-aware ``tzinfo`` or data (:issue:`48688`)
+- Bug in :func:`date_range` where the last valid timestamp would sometimes not be produced (:issue:`56134`)
+- Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56147`)
+- Bug in :func:`tseries.api.guess_datetime_format` failing to infer the time format when ``"%Y" == "%H%M"`` (:issue:`57452`)
+- Bug in :func:`tseries.frequencies.to_offset` failing to parse frequency strings starting with "LWOM" (:issue:`59218`)
+- Bug in :meth:`DataFrame.fillna` raising an ``AssertionError`` instead of ``OutOfBoundsDatetime`` when filling a ``datetime64[ns]`` column with an out-of-bounds timestamp; now correctly raises ``OutOfBoundsDatetime`` (:issue:`61208`)
+- Bug in :meth:`DataFrame.min` and :meth:`DataFrame.max` casting ``datetime64`` and ``timedelta64`` columns to ``float64`` and losing precision (:issue:`60850`)
+- Bug in :meth:`DataFrame.agg` on a :class:`DataFrame` with missing values resulting in an ``IndexError`` (:issue:`58810`)
+- Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` not raising on custom business day frequencies bigger than "1C" (:issue:`58664`)
+- Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` returning ``False`` on double-digit frequencies (:issue:`58523`)
+- Bug in :meth:`DatetimeIndex.union` and :meth:`DatetimeIndex.intersection` when ``unit`` was non-nanosecond (:issue:`59036`)
+- Bug in :meth:`Series.dt.microsecond` producing incorrect results for pyarrow-backed :class:`Series` (:issue:`59154`)
+- Bug in :func:`to_datetime` not respecting ``dayfirst`` if an uncommon date string was passed (:issue:`58859`)
+- Bug in :func:`to_datetime` on a float array with missing values throwing ``FloatingPointError`` (:issue:`58419`)
+- Bug in :func:`to_datetime` on a ``float32`` :class:`DataFrame` with year, month, day etc. columns leading to precision issues and incorrect results (:issue:`60506`)
+- Bug in :func:`to_datetime` reporting an incorrect index in case of any failure scenario (:issue:`58298`)
+- Bug in :func:`to_datetime` with ``format="ISO8601"`` and ``utc=True`` where naive timestamps incorrectly inherited the timezone offset from previous timestamps in a series (:issue:`61389`)
+- Bug in :func:`to_datetime` wrongly converting when ``arg`` is a ``np.datetime64`` object with unit ``ps`` (:issue:`60341`)
+- Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`)
+
+Timedelta
+^^^^^^^^^
+- Accuracy improvement in :meth:`Timedelta.to_pytimedelta` to round microseconds consistently for large nanosecond-based :class:`Timedelta` (:issue:`57841`)
+- Bug in :meth:`DataFrame.cumsum` which was raising ``IndexError`` if dtype is ``timedelta64[ns]`` (:issue:`57956`)
+
+Timezones
+^^^^^^^^^
+-
+-
+
+Numeric
+^^^^^^^
+- Bug in :meth:`DataFrame.corr` where numerical precision errors resulted in correlations above ``1.0`` (:issue:`61120`)
+- Bug in :meth:`DataFrame.quantile` where the column type was not preserved when ``numeric_only=True`` with a list-like ``q`` produced an empty result (:issue:`59035`)
+- Bug in :meth:`Series.dot` returning ``object`` dtype for :class:`ArrowDtype` and nullable-dtype data (:issue:`61375`)
+- Bug in ``np.matmul`` with :class:`Index` inputs raising a ``TypeError`` (:issue:`57079`)
+
+Conversion
+^^^^^^^^^^
+- Bug in :meth:`DataFrame.astype` not casting ``values`` for Arrow-based dictionary dtype correctly (:issue:`58479`)
+- Bug in :meth:`DataFrame.update` where bool dtype was being converted to object (:issue:`55509`)
+- Bug in :meth:`Series.astype` potentially modifying a read-only array in place when casting to a string dtype (:issue:`57212`)
+- Bug in :meth:`Series.convert_dtypes` and :meth:`DataFrame.convert_dtypes` removing timezone information for objects with :class:`ArrowDtype` (:issue:`60237`)
+- Bug in :meth:`Series.reindex` not maintaining ``float32`` type when a ``reindex`` introduces a missing value (:issue:`45857`)
+
+Strings
+^^^^^^^
+- Bug in :meth:`Series.value_counts` not respecting ``sort=False`` for a :class:`Series` with ``string`` dtype (:issue:`55224`)
+-
+
+Interval
+^^^^^^^^
+- :attr:`Index.is_monotonic_decreasing`, :attr:`Index.is_monotonic_increasing`, and :attr:`Index.is_unique` could incorrectly be ``False`` for an ``Index`` created from a slice of another ``Index`` (:issue:`57911`)
+- Bug in :func:`interval_range` where start and end numeric types were always cast to 64-bit (:issue:`57268`)
+-
+
+Indexing
+^^^^^^^^
+- Bug in :meth:`DataFrame.__getitem__` returning modified columns when called with ``slice`` in Python 3.12 (:issue:`57500`)
+- Bug in :meth:`DataFrame.__getitem__` when slicing a :class:`DataFrame` with many rows raised an ``OverflowError`` (:issue:`59531`)
+- Bug in :meth:`DataFrame.from_records` throwing a ``ValueError`` when passed an empty list in ``index`` (:issue:`58594`)
+- Bug in :meth:`DataFrame.loc` behaving inconsistently when setting a :class:`Series` with two given indexes (:issue:`59933`)
+- Bug in :meth:`Index.get_indexer` and similar methods when ``NaN`` is located at or after position 128 (:issue:`58924`)
+- Bug in :meth:`MultiIndex.insert` when a new value inserted into a datetime-like level gets cast to ``NaT`` and fails indexing (:issue:`60388`)
+- Bug in printing :attr:`Index.names` and :attr:`MultiIndex.levels` would not escape single quotes (:issue:`60190`)
+- Bug in reindexing of :class:`DataFrame` with :class:`PeriodDtype` columns in case of consolidated block (:issue:`60980`, :issue:`60273`)
+
+Missing
+^^^^^^^
+- Bug in :meth:`DataFrame.fillna` and :meth:`Series.fillna` that would ignore the ``limit`` argument on :class:`.ExtensionArray` dtypes (:issue:`58001`)
+-
+
+MultiIndex
+^^^^^^^^^^
+- :meth:`DataFrame.loc` with ``axis=0`` and :class:`MultiIndex` when setting a value adds extra columns (:issue:`58116`)
+- :meth:`DataFrame.melt` would not accept multiple names in ``var_name`` when the columns were a :class:`MultiIndex` (:issue:`58033`)
+- :meth:`MultiIndex.insert` would not insert NA values correctly at the unified location of index -1 (:issue:`59003`)
+- :meth:`MultiIndex.get_level_values` accessing a :class:`DatetimeIndex` did not carry the frequency attribute along (:issue:`58327`, :issue:`57949`)
+- Bug in :class:`DataFrame` arithmetic operations in case of unaligned MultiIndex columns (:issue:`60498`)
+- Bug in :class:`DataFrame` arithmetic operations with :class:`Series` in case of unaligned MultiIndex (:issue:`61009`)
+- Bug in :meth:`MultiIndex.from_tuples` causing wrong output when the input tuples contain NaN values (:issue:`60695`, :issue:`60988`)
+
+I/O
+^^^
+- Bug in :class:`DataFrame` and :class:`Series` ``repr`` of :py:class:`collections.abc.Mapping` elements (:issue:`57915`)
+- Bug in :meth:`DataFrame.to_json` when ``"index"`` was a value in :attr:`DataFrame.columns` and :attr:`Index.name` was ``None``; now this will fail with a ``ValueError`` (:issue:`58925`)
+- Bug in :func:`.io.common.is_fsspec_url` not recognizing chained fsspec URLs (:issue:`48978`)
+- Bug in :meth:`DataFrame._repr_html_` which ignored the ``"display.float_format"`` option (:issue:`59876`)
+- Bug in :meth:`DataFrame.from_records` where the ``columns`` parameter with a numpy structured array was not reordering and filtering out the columns (:issue:`59717`)
+- Bug in :meth:`DataFrame.to_dict` raising an unnecessary ``UserWarning`` when columns are not unique and ``orient='tight'`` (:issue:`58281`)
+- Bug in :meth:`DataFrame.to_excel` when writing an empty :class:`DataFrame` with a :class:`MultiIndex` on both axes (:issue:`57696`)
+- Bug in :meth:`DataFrame.to_excel` where a period level of a :class:`MultiIndex` index was not written as a date (:issue:`60099`)
+- Bug in :meth:`DataFrame.to_stata` when exporting a column containing both long strings (Stata strL) and ``pd.NA`` values (:issue:`23633`)
+- Bug in :meth:`DataFrame.to_stata` when writing a :class:`DataFrame` with ``byteorder="big"`` (:issue:`58969`)
+- Bug in :meth:`DataFrame.to_stata` when writing more than 32,000 value labels (:issue:`60107`)
+- Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames (:issue:`16098`)
+- Bug in :meth:`HDFStore.get` failing to save data of dtype ``datetime64[s]`` correctly (:issue:`59004`)
+- Bug in :meth:`HDFStore.select` causing queries on categorical string columns to return unexpected results (:issue:`57608`)
+- Bug in :func:`read_csv` causing a segmentation fault when ``encoding_errors`` is not a string (:issue:`59059`)
+- Bug in :func:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None`` (:issue:`57547`)
+- Bug in :func:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize`` (:issue:`59079`)
+- Bug in :func:`read_csv` where the order of ``na_values`` caused inconsistent results when ``na_values`` is a list of non-string values (:issue:`59303`)
+- Bug in :func:`read_excel` raising ``ValueError`` when passing an array of boolean values with ``dtype="boolean"`` (:issue:`58159`)
+- Bug in :func:`read_html` where ``rowspan`` in a header row caused incorrect conversion to ``DataFrame`` (:issue:`60210`)
+- Bug in :func:`read_json` ignoring the given ``dtype`` when ``engine="pyarrow"`` (:issue:`59516`)
+- Bug in :func:`read_json` not validating that the ``typ`` argument is exactly ``"frame"`` or ``"series"`` (:issue:`59124`)
+- Bug in :func:`read_json` where extremely large integers in string format were incorrectly parsed as a different integer value (:issue:`20608`)
+- Bug in :func:`read_stata` raising ``KeyError`` when the input file is stored in big-endian format and contains strL data (:issue:`58638`)
+- Bug in :func:`read_stata` where extreme integer values were incorrectly interpreted as missing for format versions 111 and prior (:issue:`58130`)
+- Bug in :func:`read_stata` where the missing code for double was not recognised for format versions 105 and prior (:issue:`58149`)
+- Bug in :func:`set_option` where setting the pandas option ``display.html.use_mathjax`` to ``False`` had no effect (:issue:`59884`)
+- Bug in :meth:`DataFrame.to_excel` where :class:`MultiIndex` columns would be merged to a single row when ``merge_cells=False`` was passed (:issue:`60274`)
+
+Period
+^^^^^^
+- Fixed error message when passing an invalid period alias to :meth:`PeriodIndex.to_timestamp` (:issue:`58974`)
+-
+
+Plotting
+^^^^^^^^
+- Bug in :meth:`.DataFrameGroupBy.boxplot` failing when there were multiple groupings (:issue:`14701`)
+- Bug in :meth:`DataFrame.plot.bar` when ``subplots`` and ``stacked=True`` were used in conjunction, causing incorrect stacking (:issue:`61018`)
+- Bug in :meth:`DataFrame.plot.bar` with ``stacked=True`` where labels on stacked bars with zero-height segments were incorrectly positioned at the base instead of at the label position of the previous segment (:issue:`59429`)
+- Bug in :meth:`DataFrame.plot.line` raising ``ValueError`` when both ``color`` and a ``dict`` ``style`` were set (:issue:`59461`)
+- Bug in :meth:`DataFrame.plot` that caused a shift to the right when the frequency multiplier is greater than one (:issue:`57587`)
+- Bug in :meth:`DataFrame.plot` where ``title`` would require extra titles when plotting more than one column per subplot (:issue:`61019`)
+- Bug in :meth:`Series.plot` preventing a line and bar from being aligned on the same plot (:issue:`61161`)
+- Bug in :meth:`Series.plot` preventing a line and scatter plot from being aligned (:issue:`61005`)
+- Bug in :meth:`Series.plot` with ``kind="pie"`` with :class:`ArrowDtype` (:issue:`59192`)
+
+Groupby/resample/rolling
+^^^^^^^^^^^^^^^^^^^^^^^^
+- Bug in :meth:`.DataFrameGroupBy.__len__` and :meth:`.SeriesGroupBy.__len__` that would raise when the grouping contained NA values and ``dropna=False`` (:issue:`58644`)
+- Bug in :meth:`.DataFrameGroupBy.any` that returned ``True`` for groups where all Timedelta values are ``NaT`` (:issue:`59712`)
+- Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupBy.groups` that would fail when the groups were :class:`Categorical` with an NA value (:issue:`61356`)
+- Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupBy.groups` that would not respect the groupby argument ``dropna`` (:issue:`55919`)
+- Bug in :meth:`.DataFrameGroupBy.median` where ``NaT`` values gave an incorrect result (:issue:`57926`)
+- Bug in :meth:`.DataFrameGroupBy.quantile` where ``interpolation="nearest"`` was inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`)
+- Bug in :meth:`.Resampler.interpolate` on a :class:`DataFrame` with non-uniform sampling and/or indices not aligning with the resulting resampled index would result in wrong interpolation (:issue:`21351`)
+- Bug in :meth:`Series.rolling` when used with a :class:`.BaseIndexer` subclass and computing min/max (:issue:`46726`)
+- Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`)
+- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` not keeping the index name when the index had :class:`ArrowDtype` timestamp dtype (:issue:`61222`)
+- Bug in :meth:`DataFrame.resample` changing index type to :class:`MultiIndex` when the dataframe is empty and using an upsample method (:issue:`55572`)
+- Bug in :meth:`.DataFrameGroupBy.agg` that raised ``AttributeError`` when there is dictionary input and duplicated columns, instead of returning a DataFrame with the aggregation of all duplicate columns (:issue:`55041`)
+- Bug in :meth:`.DataFrameGroupBy.apply` and :meth:`.SeriesGroupBy.apply` for an empty data frame with ``group_keys=False`` still creating the output index using group keys (:issue:`60471`)
+- Bug in :meth:`.DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None``, instead of returning an empty DataFrame with the original columns and dtypes (:issue:`57775`)
+- Bug in :meth:`.DataFrameGroupBy.apply` with ``as_index=False`` that was returning :class:`MultiIndex` instead of returning :class:`Index` (:issue:`58291`)
+- Bug in :meth:`.DataFrameGroupBy.cumsum` and :meth:`.DataFrameGroupBy.cumprod` where the ``numeric_only`` parameter was passed indirectly through kwargs instead of passing directly (:issue:`58811`)
+- Bug in :meth:`.DataFrameGroupBy.cumsum` where it did not return the correct dtype when the label contained ``None`` (:issue:`58811`)
+- Bug in :meth:`.DataFrameGroupBy.transform` and :meth:`.SeriesGroupBy.transform` with a reducer and ``observed=False`` that coerced dtype to float when there are unobserved categories (:issue:`55326`)
+- Bug in :meth:`Rolling.apply` for ``method="table"`` where column order was not being respected due to the columns getting sorted by default (:issue:`59666`)
+- Bug in :meth:`Rolling.apply` where the applied function could be called on fewer than ``min_periods`` periods if ``method="table"`` (:issue:`58868`)
+- Bug in :meth:`Series.resample` that could raise when the date range ended shortly before a non-existent time (:issue:`58380`)
+
+Reshaping
+^^^^^^^^^
+- Bug in :func:`qcut` where values at the quantile boundaries could be incorrectly assigned (:issue:`59355`)
+- Bug in :meth:`DataFrame.combine_first` not preserving the column order (:issue:`60427`)
+- Bug in :meth:`DataFrame.explode` producing incorrect result for :class:`pyarrow.large_list` type (:issue:`61091`)
+- Bug in :meth:`DataFrame.join` inconsistently setting the result index name (:issue:`55815`)
+- Bug in :meth:`DataFrame.join` when a :class:`DataFrame` with a :class:`MultiIndex` would raise an ``AssertionError`` when :attr:`MultiIndex.names` contained ``None`` (:issue:`58721`)
+- Bug in :meth:`DataFrame.merge` where merging on a column containing only ``NaN`` values resulted in an out-of-bounds array access (:issue:`59421`)
+- Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`)
+- Bug in :meth:`DataFrame.merge` when merging two :class:`DataFrame` on ``intc`` or ``uintc`` types on Windows (:issue:`60091`, :issue:`58713`)
+- Bug in :meth:`DataFrame.pivot_table` incorrectly subaggregating results when called without an ``index`` argument (:issue:`58722`)
+- Bug in :meth:`DataFrame.pivot_table` incorrectly ignoring the ``values`` argument when also supplied to the ``index`` or ``columns`` parameters (:issue:`57876`, :issue:`61292`)
+- Bug in :meth:`DataFrame.stack` with the new implementation where ``ValueError`` is raised when ``level=[]`` (:issue:`60740`)
+- Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating an empty :class:`DataFrame` with an :class:`ExtensionDtype` (:issue:`59123`)
+- Bug in :func:`concat` where concatenating a DataFrame and a Series with ``ignore_index=True`` dropped the Series name (:issue:`60723`, :issue:`56257`)
+
+Sparse
+^^^^^^
+- Bug in :class:`SparseDtype` for equality comparison with an NA fill value (:issue:`54770`)
+- Bug in :meth:`DataFrame.sparse.from_spmatrix` which hard-coded an invalid ``fill_value`` for certain subtypes (:issue:`59063`)
+- Bug in :meth:`DataFrame.sparse.to_dense` which ignored subclassing and always returned an instance of :class:`DataFrame` (:issue:`59913`)
+
+ExtensionArray
+^^^^^^^^^^^^^^
+- Bug in :class:`Categorical` when constructing with an :class:`Index` with :class:`ArrowDtype` (:issue:`60563`)
+- Bug in :meth:`.arrays.ArrowExtensionArray.__setitem__` which caused wrong behavior when using an integer array with repeated values as a key (:issue:`58530`)
+- Bug in :meth:`ArrowExtensionArray.factorize` where NA values were dropped when the input was dictionary-encoded, even when ``dropna`` was set to ``False`` (:issue:`60567`)
+- Bug in :meth:`api.types.is_datetime64_any_dtype` where a custom :class:`ExtensionDtype` would return ``False`` for array-likes (:issue:`57055`)
+- Bug in comparison between an object with :class:`ArrowDtype` and an incompatibly-dtyped object (e.g. string vs bool) incorrectly raising instead of returning all-``False`` (for ``==``) or all-``True`` (for ``!=``) (:issue:`59505`)
+- Bug in constructing pandas data structures when passing into ``dtype`` a string of the type followed by ``[pyarrow]`` while PyArrow is not installed would raise ``NameError`` rather than ``ImportError`` (:issue:`57928`)
+- Bug in various :class:`DataFrame` reductions for pyarrow temporal dtypes returning incorrect dtype when the result was null (:issue:`59234`)
+
+Styler
+^^^^^^
+- Fixed bug in :meth:`Styler.to_latex` where styling column headers failed when combined with a hidden index or hidden index levels
+
+Other
+^^^^^
+- Bug in :class:`DataFrame` when passing a ``dict`` with a NA scalar and ``columns`` that would always return ``np.nan`` (:issue:`57205`)
+- Bug in :class:`Series` ignoring errors when trying to convert :class:`Series` input data to the given ``dtype`` (:issue:`60728`)
+- Bug in :func:`eval` where division ``/`` on an :class:`ExtensionArray` failed with a ``TypeError`` (:issue:`58748`)
+- Bug in :func:`eval` where method calls on binary operations like ``(x + y).dropna()`` would raise ``AttributeError: 'BinOp' object has no attribute 'value'`` (:issue:`61175`)
+- Bug in :func:`eval` where the names of the :class:`Series` were not preserved when using ``engine="numexpr"`` (:issue:`10239`)
+- Bug in :func:`eval` with ``engine="numexpr"`` returning an unexpected result for float division (:issue:`59736`)
+- Bug in :func:`to_numeric` raising ``TypeError`` when ``arg`` is a :class:`Timedelta` or :class:`Timestamp` scalar (:issue:`59944`)
+- Bug in :func:`unique` on an :class:`Index` not always returning an :class:`Index` (:issue:`57043`)
+- Bug in :meth:`DataFrame.apply` where passing ``engine="numba"`` ignored ``args`` passed to the applied function (:issue:`58712`)
+- Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which caused an exception when using NumPy attributes via ``@`` notation, e.g. ``df.eval("@np.floor(a)")`` (:issue:`58041`)
+- Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which did not allow using the ``tan`` function (:issue:`55091`)
+- Bug in :meth:`DataFrame.query` where using duplicate column names led to a ``TypeError`` (:issue:`59950`)
+- Bug in :meth:`DataFrame.query` which raised an exception or produced incorrect results when expressions contained backtick-quoted column names containing the hash character ``#``, backticks, or characters that fall outside the ASCII range (U+0001..U+007F) (:issue:`59285`, :issue:`49633`)
+- Bug in :meth:`DataFrame.query` which raised an exception when querying integer column names using backticks (:issue:`60494`)
+- Bug in :meth:`DataFrame.shift` where passing a ``freq`` on a DataFrame with no columns did not shift the index correctly (:issue:`60102`)
+- Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` and ``ascending=False`` not returning :class:`RangeIndex` columns (:issue:`57293`)
+- Bug in :meth:`DataFrame.transform` that was returning the wrong order unless the index was monotonically increasing (:issue:`57069`)
+- Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would raise a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`)
+- Bug in :meth:`Index.sort_values` when passing a key function that turns values into tuples, e.g. ``key=natsort.natsort_key``, would raise ``TypeError`` (:issue:`56081`)
+- Bug in :meth:`MultiIndex.fillna` where the error message referred to ``isna`` instead of ``fillna`` (:issue:`60974`)
+- Bug in :meth:`Series.describe` where the median percentile was always included when the ``percentiles`` argument was passed (:issue:`60550`)
+- Bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument (:issue:`56607`)
+- Bug in :meth:`Series.dt` methods with :class:`ArrowDtype` that were returning incorrect values (:issue:`57355`)
+- Bug in :meth:`Series.isin` raising ``TypeError`` when the series is large (>10**6) and ``values`` contains NA (:issue:`60678`)
+- Bug in :meth:`Series.mode` where an exception was raised when taking the mode with nullable types with no null values in the series (:issue:`58926`)
+- Bug in :meth:`Series.rank` that did not preserve missing values for nullable integers when ``na_option="keep"`` (:issue:`56976`)
+- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present (:issue:`56599`)
+- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` throwing ``ValueError`` when ``regex=True`` and all NA values (:issue:`60688`)
+- Bug in :meth:`Series.to_string` when the series contains complex floats with exponents (:issue:`60405`)
+- Bug in :func:`read_csv` where a chained fsspec TAR file and ``compression="infer"`` fails with ``tarfile.ReadError`` (:issue:`60028`)
+- Bug in the DataFrame Interchange Protocol implementation returning incorrect results for data buffers' associated dtype for string and datetime columns (:issue:`54781`)
+- Bug in ``Series.list`` methods not preserving the original :class:`Index` (:issue:`58425`)
+- Bug in ``Series.list`` methods not preserving the original name (:issue:`60522`)
+- Bug in printing a :class:`DataFrame` with a :class:`DataFrame` stored in :attr:`DataFrame.attrs` raising a ``ValueError`` (:issue:`60455`)
+- Bug in printing a :class:`Series` with a :class:`DataFrame` stored in :attr:`Series.attrs` raising a ``ValueError`` (:issue:`60568`)
+- Fixed bug where the :class:`DataFrame` constructor misclassified array-like objects with a ``.name`` attribute as :class:`Series` or :class:`Index` (:issue:`61443`)
+- Fixed regression in :meth:`DataFrame.from_records` not initializing subclasses properly (:issue:`57008`)
+
+.. ***DO NOT USE THIS SECTION***
+
+-
+-
+
+.. ---------------------------------------------------------------------------
+..
_whatsnew_300.contributors: + +Contributors +~~~~~~~~~~~~ diff --git a/doc/sphinxext/announce.py b/doc/sphinxext/announce.py index 89361eb75606c..66e999e251c5e 100755 --- a/doc/sphinxext/announce.py +++ b/doc/sphinxext/announce.py @@ -32,6 +32,7 @@ $ ./scripts/announce.py $GITHUB v1.11.0..v1.11.1 > announce.rst """ + import codecs import os import re diff --git a/doc/sphinxext/contributors.py b/doc/sphinxext/contributors.py index c2b21e40cadad..06f205b5cc3ce 100644 --- a/doc/sphinxext/contributors.py +++ b/doc/sphinxext/contributors.py @@ -14,6 +14,7 @@ While the v0.23.1 tag does not exist, that will use the HEAD of the branch as the end of the revision range. """ + from announce import build_components from docutils import nodes from docutils.parsers.rst import Directive diff --git a/environment.yml b/environment.yml index 74317d47e2e53..6300e32b5a1b5 100644 --- a/environment.yml +++ b/environment.yml @@ -7,57 +7,58 @@ dependencies: - pip # build dependencies - - versioneer[toml] - - cython=3.0.5 - - meson[ninja]=1.2.1 + - versioneer + - cython~=3.0.5 + - meson=1.2.1 - meson-python=0.13.1 # test dependencies - pytest>=7.3.2 - pytest-cov - - pytest-xdist>=2.2.0 - - pytest-qt>=4.2.0 + - pytest-xdist>=3.4.0 + - pytest-qt>=4.4.0 + - pytest-localserver - pyqt>=5.15.9 - coverage # required dependencies - python-dateutil - - numpy<2 - - pytz + - numpy<3 # optional dependencies - - beautifulsoup4>=4.11.2 - - blosc + - beautifulsoup4>=4.12.3 - bottleneck>=1.3.6 - - fastparquet>=2022.12.0 - - fsspec>=2022.11.0 + - fastparquet>=2024.2.0 + - fsspec>=2023.12.2 - html5lib>=1.1 - - hypothesis>=6.46.1 - - gcsfs>=2022.11.0 + - hypothesis>=6.84.0 + - gcsfs>=2023.12.2 - ipython - - jinja2>=3.1.2 + - pickleshare # Needed for IPython Sphinx directive in the docs GH#60429 + - jinja2>=3.1.3 - lxml>=4.9.2 - - matplotlib>=3.6.3 - - numba>=0.56.4 - - numexpr>=2.8.4 - - openpyxl>=3.1.0 + - matplotlib>=3.8.3 + - numba>=0.59.0 + - numexpr>=2.9.0 + - openpyxl>=3.1.2 - odfpy>=1.4.1 - - py - psycopg2>=2.9.6 - pyarrow>=10.0.1 - - pymysql>=1.0.2 - - pyreadstat>=1.2.0 + - pyiceberg>=0.7.1 + - pymysql>=1.1.0 + - pyreadstat>=1.2.6 - pytables>=3.8.0 - python-calamine>=0.1.7 + - pytz>=2023.4 - pyxlsb>=1.0.10 - - s3fs>=2022.11.0 - - scipy>=1.10.0 + - s3fs>=2023.12.2 + - scipy>=1.12.0 - sqlalchemy>=2.0.0 - tabulate>=0.9.0 - - xarray>=2022.12.0 + - xarray>=2024.1.1 - xlrd>=2.0.1 - - xlsxwriter>=3.0.5 - - zstandard>=0.19.0 + - xlsxwriter>=3.2.0 + - zstandard>=0.22.0 # downstream packages - dask-core @@ -75,18 +76,16 @@ dependencies: - cxx-compiler # code checks - - flake8=6.1.0 # run in subprocess over docstring examples - - mypy=1.7.1 # pre-commit uses locally installed mypy + - flake8=7.1.0 # run in subprocess over docstring examples + - mypy=1.13.0 # pre-commit uses locally installed mypy - tokenize-rt # scripts/check_for_inconsistent_pandas_namespace.py - - pre-commit>=3.6.0 + - pre-commit>=4.2.0 # documentation - gitpython # obtain contributors from git for whatsnew - - gitdb - - google-auth - natsort # DataFrame.sort_values doctest - numpydoc - - pydata-sphinx-theme=0.14 + - pydata-sphinx-theme=0.16 - pytest-cython # doctest - sphinx - sphinx-design @@ -114,9 +113,15 @@ dependencies: - requests - pygments # Code highlighting + # web interactive REPL + # see the following links for more context: + # 1. https://jupyterlite-pyodide-kernel.readthedocs.io/en/stable/#compatibility + # 2. 
https://pyodide.org/en/stable/usage/packages-in-pyodide.html + - jupyterlite-core + - jupyterlite-pyodide-kernel + - pip: - - adbc-driver-postgresql>=0.8.0 + - adbc-driver-postgresql>=0.10.0 - adbc-driver-sqlite>=0.8.0 - - dataframe-api-compat>=0.1.7 - typing_extensions; python_version<"3.11" - tzdata>=2022.7 diff --git a/meson.build b/meson.build index 06623a305ab54..6bf88cc772a59 100644 --- a/meson.build +++ b/meson.build @@ -1,15 +1,13 @@ # This file is adapted from https://github.com/scipy/scipy/blob/main/meson.build project( 'pandas', - 'c', 'cpp', 'cython', + 'c', + 'cpp', + 'cython', version: run_command(['generate_version.py', '--print'], check: true).stdout().strip(), license: 'BSD-3', meson_version: '>=1.2.1', - default_options: [ - 'buildtype=release', - 'c_std=c11', - 'warning_level=2', - ] + default_options: ['buildtype=release', 'c_std=c11', 'warning_level=2'], ) fs = import('fs') @@ -18,36 +16,61 @@ tempita = files('generate_pxi.py') versioneer = files('generate_version.py') -add_project_arguments('-DNPY_NO_DEPRECATED_API=0', language : 'c') -add_project_arguments('-DNPY_NO_DEPRECATED_API=0', language : 'cpp') +add_project_arguments('-DNPY_NO_DEPRECATED_API=0', language: 'c') +add_project_arguments('-DNPY_NO_DEPRECATED_API=0', language: 'cpp') # Allow supporting older numpys than the version compiled against # Set the define to the min supported version of numpy for pandas # e.g. right now this is targeting numpy 1.21+ -add_project_arguments('-DNPY_TARGET_VERSION=NPY_1_21_API_VERSION', language : 'c') -add_project_arguments('-DNPY_TARGET_VERSION=NPY_1_21_API_VERSION', language : 'cpp') +add_project_arguments('-DNPY_TARGET_VERSION=NPY_1_21_API_VERSION', language: 'c') +add_project_arguments( + '-DNPY_TARGET_VERSION=NPY_1_21_API_VERSION', + language: 'cpp', +) if fs.exists('_version_meson.py') py.install_sources('_version_meson.py', subdir: 'pandas') else - custom_target('write_version_file', + custom_target( + 'write_version_file', output: '_version_meson.py', - command: [ - py, versioneer, '-o', '@OUTPUT@' - ], + command: [py, versioneer, '-o', '@OUTPUT@'], build_by_default: true, build_always_stale: true, install: true, - install_dir: py.get_install_dir() / 'pandas' + install_dir: py.get_install_dir() / 'pandas', ) meson.add_dist_script(py, versioneer, '-o', '_version_meson.py') endif +cy = meson.get_compiler('cython') +if cy.version().version_compare('>=3.1.0') + add_project_arguments('-Xfreethreading_compatible=true', language: 'cython') + + # Use shared utility code to reduce wheel sizes + # copied from https://github.com/scikit-learn/scikit-learn/pull/31151/files + cy = find_program(cy.cmd_array()[0]) + cython_shared_src = custom_target( + install: false, + output: '_cyutility.c', + command: [ + cy, + '-3', + '--fast-fail', + '--generate-shared=' + meson.current_build_dir() / '_cyutility.c', + ], + ) + + py.extension_module( + '_cyutility', + cython_shared_src, + subdir: 'pandas/_libs', + install: true, + ) +endif + # Needed by pandas.test() when it looks for the pytest ini options -py.install_sources( - 'pyproject.toml', - subdir: 'pandas' -) +py.install_sources('pyproject.toml', subdir: 'pandas') subdir('pandas') diff --git a/pandas/__init__.py b/pandas/__init__.py index 7fab662ed2de4..8b92ad6cdfebb 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -1,25 +1,20 @@ from __future__ import annotations -import os -import warnings - __docformat__ = "restructuredtext" # Let users know if they're missing any of our hard dependencies -_hard_dependencies = ("numpy", 
"pytz", "dateutil") -_missing_dependencies = [] +_hard_dependencies = ("numpy", "dateutil", "tzdata") for _dependency in _hard_dependencies: try: __import__(_dependency) except ImportError as _e: # pragma: no cover - _missing_dependencies.append(f"{_dependency}: {_e}") + raise ImportError( + f"Unable to import required dependency {_dependency}. " + "Please see the traceback for details." + ) from _e -if _missing_dependencies: # pragma: no cover - raise ImportError( - "Unable to import required dependencies:\n" + "\n".join(_missing_dependencies) - ) -del _hard_dependencies, _dependency, _missing_dependencies +del _hard_dependencies, _dependency try: # numpy compat @@ -31,7 +26,8 @@ raise ImportError( f"C extension: {_module} not built. If you want to import " "pandas from the source directory, you may need to run " - "'python setup.py build_ext' to build the C extensions first." + "'python -m pip install -ve . --no-build-isolation -Ceditable-verbose=true' " + "to build the C extensions first." ) from _err from pandas._config import ( @@ -102,7 +98,6 @@ Grouper, factorize, unique, - value_counts, NamedAgg, array, Categorical, @@ -163,13 +158,13 @@ read_parquet, read_orc, read_feather, - read_gbq, read_html, read_xml, read_json, read_stata, read_sas, read_spss, + read_iceberg, ) from pandas.io.json._normalize import json_normalize @@ -193,17 +188,6 @@ __git_version__ = v.get("full-revisionid") del get_versions, v -# GH#55043 - deprecation of the data_manager option -if "PANDAS_DATA_MANAGER" in os.environ: - warnings.warn( - "The env variable PANDAS_DATA_MANAGER is set. The data_manager option is " - "deprecated and will be removed in a future version. Only the BlockManager " - "will be available. Unset this environment variable to silence this warning.", - FutureWarning, - stacklevel=2, - ) -# Don't allow users to use pandas.os or pandas.warnings -del os, warnings # module level doc-string __doc__ = """ @@ -250,6 +234,7 @@ # Pandas is not (yet) a py.typed library: the public API is determined # based on the documentation. __all__ = [ + "NA", "ArrowDtype", "BooleanDtype", "Categorical", @@ -268,15 +253,14 @@ "HDFStore", "Index", "IndexSlice", + "Int8Dtype", "Int16Dtype", "Int32Dtype", "Int64Dtype", - "Int8Dtype", "Interval", "IntervalDtype", "IntervalIndex", "MultiIndex", - "NA", "NaT", "NamedAgg", "Period", @@ -289,10 +273,10 @@ "Timedelta", "TimedeltaIndex", "Timestamp", + "UInt8Dtype", "UInt16Dtype", "UInt32Dtype", "UInt64Dtype", - "UInt8Dtype", "api", "array", "arrays", @@ -305,8 +289,8 @@ "errors", "eval", "factorize", - "get_dummies", "from_dummies", + "get_dummies", "get_option", "infer_freq", "interval_range", @@ -334,9 +318,9 @@ "read_excel", "read_feather", "read_fwf", - "read_gbq", "read_hdf", "read_html", + "read_iceberg", "read_json", "read_orc", "read_parquet", @@ -362,6 +346,5 @@ "to_timedelta", "tseries", "unique", - "value_counts", "wide_to_long", ] diff --git a/pandas/_config/__init__.py b/pandas/_config/__init__.py index 97784c924dab4..463e8af7cc561 100644 --- a/pandas/_config/__init__.py +++ b/pandas/_config/__init__.py @@ -5,17 +5,16 @@ importing `dates` and `display` ensures that keys needed by _libs are initialized. 
""" + __all__ = [ "config", + "describe_option", "detect_console_encoding", "get_option", - "set_option", - "reset_option", - "describe_option", "option_context", "options", - "using_copy_on_write", - "warn_copy_on_write", + "reset_option", + "set_option", ] from pandas._config import config from pandas._config import dates # pyright: ignore[reportUnusedImport] # noqa: F401 @@ -31,27 +30,6 @@ from pandas._config.display import detect_console_encoding -def using_copy_on_write() -> bool: - _mode_options = _global_config["mode"] - return ( - _mode_options["copy_on_write"] is True - and _mode_options["data_manager"] == "block" - ) - - -def warn_copy_on_write() -> bool: - _mode_options = _global_config["mode"] - return ( - _mode_options["copy_on_write"] == "warn" - and _mode_options["data_manager"] == "block" - ) - - -def using_nullable_dtypes() -> bool: - _mode_options = _global_config["mode"] - return _mode_options["nullable_dtypes"] - - -def using_pyarrow_string_dtype() -> bool: +def using_string_dtype() -> bool: _mode_options = _global_config["future"] return _mode_options["infer_string"] diff --git a/pandas/_config/config.py b/pandas/_config/config.py index c391939d22491..ce53e05608ba7 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -50,31 +50,24 @@ from __future__ import annotations -from contextlib import ( - ContextDecorator, - contextmanager, -) +from contextlib import contextmanager import re from typing import ( TYPE_CHECKING, Any, - Callable, - Generic, NamedTuple, cast, ) import warnings -from pandas._typing import ( - F, - T, -) +from pandas._typing import F from pandas.util._exceptions import find_stack_level if TYPE_CHECKING: from collections.abc import ( + Callable, Generator, - Iterable, + Sequence, ) @@ -87,7 +80,7 @@ class DeprecatedOption(NamedTuple): class RegisteredOption(NamedTuple): key: str - defval: object + defval: Any doc: str validator: Callable[[object], Any] | None cb: Callable[[str], Any] | None @@ -112,6 +105,10 @@ class OptionError(AttributeError, KeyError): Backwards compatible with KeyError checks. + See Also + -------- + options : Access and modify global pandas settings. + Examples -------- >>> pd.options.context @@ -124,68 +121,197 @@ class OptionError(AttributeError, KeyError): # User API -def _get_single_key(pat: str, silent: bool) -> str: +def _get_single_key(pat: str) -> str: keys = _select_options(pat) if len(keys) == 0: - if not silent: - _warn_if_deprecated(pat) - raise OptionError(f"No such keys(s): {repr(pat)}") + _warn_if_deprecated(pat) + raise OptionError(f"No such keys(s): {pat!r}") if len(keys) > 1: raise OptionError("Pattern matched multiple keys") key = keys[0] - if not silent: - _warn_if_deprecated(key) + _warn_if_deprecated(key) key = _translate_key(key) return key -def _get_option(pat: str, silent: bool = False) -> Any: - key = _get_single_key(pat, silent) +def get_option(pat: str) -> Any: + """ + Retrieve the value of the specified option. + + This method allows users to query the current value of a given option + in the pandas configuration system. Options control various display, + performance, and behavior-related settings within pandas. + + Parameters + ---------- + pat : str + Regexp which should match a single option. + + .. warning:: + + Partial matches are supported for convenience, but unless you use the + full option name (e.g. x.y.z.option_name), your code may break in future + versions if new options with similar names are introduced. + + Returns + ------- + Any + The value of the option. 
+ + Raises + ------ + OptionError : if no such option exists + + See Also + -------- + set_option : Set the value of the specified option or options. + reset_option : Reset one or more options to their default value. + describe_option : Print the description for one or more registered options. + + Notes + ----- + For all available options, please view the :ref:`User Guide ` + or use ``pandas.describe_option()``. + + Examples + -------- + >>> pd.get_option("display.max_columns") # doctest: +SKIP + 4 + """ + key = _get_single_key(pat) # walk the nested dict root, k = _get_root(key) return root[k] -def _set_option(*args, **kwargs) -> None: +def set_option(*args) -> None: + """ + Set the value of the specified option or options. + + This method allows fine-grained control over the behavior and display settings + of pandas. Options affect various functionalities such as output formatting, + display limits, and operational behavior. Settings can be modified at runtime + without requiring changes to global configurations or environment variables. + + Parameters + ---------- + *args : str | object + Arguments provided in pairs, which will be interpreted as (pattern, value) + pairs. + pattern: str + Regexp which should match a single option + value: object + New value of option + + .. warning:: + + Partial pattern matches are supported for convenience, but unless you + use the full option name (e.g. x.y.z.option_name), your code may break in + future versions if new options with similar names are introduced. + + Returns + ------- + None + No return value. + + Raises + ------ + ValueError if odd numbers of non-keyword arguments are provided + TypeError if keyword arguments are provided + OptionError if no such option exists + + See Also + -------- + get_option : Retrieve the value of the specified option. + reset_option : Reset one or more options to their default value. + describe_option : Print the description for one or more registered options. + option_context : Context manager to temporarily set options in a ``with`` + statement. + + Notes + ----- + For all available options, please view the :ref:`User Guide ` + or use ``pandas.describe_option()``. + + Examples + -------- + >>> pd.set_option("display.max_columns", 4) + >>> df = pd.DataFrame([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]) + >>> df + 0 1 ... 3 4 + 0 1 2 ... 4 5 + 1 6 7 ... 9 10 + [2 rows x 5 columns] + >>> pd.reset_option("display.max_columns") + """ # must at least 1 arg deal with constraints later nargs = len(args) if not nargs or nargs % 2 != 0: raise ValueError("Must provide an even number of non-keyword arguments") - # default to false - silent = kwargs.pop("silent", False) - - if kwargs: - kwarg = next(iter(kwargs.keys())) - raise TypeError(f'_set_option() got an unexpected keyword argument "{kwarg}"') - for k, v in zip(args[::2], args[1::2]): - key = _get_single_key(k, silent) + key = _get_single_key(k) - o = _get_registered_option(key) - if o and o.validator: - o.validator(v) + opt = _get_registered_option(key) + if opt and opt.validator: + opt.validator(v) # walk the nested dict root, k_root = _get_root(key) root[k_root] = v - if o.cb: - if silent: - with warnings.catch_warnings(record=True): - o.cb(key) - else: - o.cb(key) + if opt.cb: + opt.cb(key) -def _describe_option(pat: str = "", _print_desc: bool = True) -> str | None: +def describe_option(pat: str = "", _print_desc: bool = True) -> str | None: + """ + Print the description for one or more registered options. 
+ + Call with no arguments to get a listing for all registered options. + + Parameters + ---------- + pat : str, default "" + String or string regexp pattern. + Empty string will return all options. + For regexp strings, all matching keys will have their description displayed. + _print_desc : bool, default True + If True (default) the description(s) will be printed to stdout. + Otherwise, the description(s) will be returned as a string + (for testing). + + Returns + ------- + None + If ``_print_desc=True``. + str + The description(s) as a string if ``_print_desc=False``. + + See Also + -------- + get_option : Retrieve the value of the specified option. + set_option : Set the value of the specified option or options. + reset_option : Reset one or more options to their default value. + + Notes + ----- + For all available options, please view the + :ref:`User Guide `. + + Examples + -------- + >>> pd.describe_option("display.max_columns") # doctest: +SKIP + display.max_columns : int + If max_cols is exceeded, switch to truncate view... + """ keys = _select_options(pat) if len(keys) == 0: - raise OptionError("No such keys(s)") + raise OptionError(f"No such keys(s) for {pat=}") s = "\n".join([_build_option_description(k) for k in keys]) @@ -195,11 +321,51 @@ def _describe_option(pat: str = "", _print_desc: bool = True) -> str | None: return s -def _reset_option(pat: str, silent: bool = False) -> None: +def reset_option(pat: str) -> None: + """ + Reset one or more options to their default value. + + This method resets the specified pandas option(s) back to their default + values. It allows partial string matching for convenience, but users should + exercise caution to avoid unintended resets due to changes in option names + in future versions. + + Parameters + ---------- + pat : str/regex + If specified only options matching ``pat*`` will be reset. + Pass ``"all"`` as argument to reset all options. + + .. warning:: + + Partial matches are supported for convenience, but unless you + use the full option name (e.g. x.y.z.option_name), your code may break + in future versions if new options with similar names are introduced. + + Returns + ------- + None + No return value. + + See Also + -------- + get_option : Retrieve the value of the specified option. + set_option : Set the value of the specified option or options. + describe_option : Print the description for one or more registered options. + + Notes + ----- + For all available options, please view the + :ref:`User Guide `. 
+ + Examples + -------- + >>> pd.reset_option("display.max_columns") # doctest: +SKIP + """ keys = _select_options(pat) if len(keys) == 0: - raise OptionError("No such keys(s)") + raise OptionError(f"No such keys(s) for {pat=}") if len(keys) > 1 and len(pat) < 4 and pat != "all": raise ValueError( @@ -209,11 +375,11 @@ def _reset_option(pat: str, silent: bool = False) -> None: ) for k in keys: - _set_option(k, _registered_options[k].defval, silent=silent) + set_option(k, _registered_options[k].defval) def get_default_val(pat: str): - key = _get_single_key(pat, silent=True) + key = _get_single_key(pat) return _get_registered_option(key).defval @@ -234,7 +400,7 @@ def __setattr__(self, key: str, val: Any) -> None: # you can't set new keys # can you can't overwrite subtrees if key in self.d and not isinstance(self.d[key], dict): - _set_option(prefix, val) + set_option(prefix, val) else: raise OptionError("You can only set the value of existing options") @@ -250,242 +416,77 @@ def __getattr__(self, key: str): if isinstance(v, dict): return DictWrapper(v, prefix) else: - return _get_option(prefix) + return get_option(prefix) def __dir__(self) -> list[str]: return list(self.d.keys()) -# For user convenience, we'd like to have the available options described -# in the docstring. For dev convenience we'd like to generate the docstrings -# dynamically instead of maintaining them by hand. To this, we use the -# class below which wraps functions inside a callable, and converts -# __doc__ into a property function. The doctsrings below are templates -# using the py2.6+ advanced formatting syntax to plug in a concise list -# of options, and option descriptions. - - -class CallableDynamicDoc(Generic[T]): - def __init__(self, func: Callable[..., T], doc_tmpl: str) -> None: - self.__doc_tmpl__ = doc_tmpl - self.__func__ = func - - def __call__(self, *args, **kwds) -> T: - return self.__func__(*args, **kwds) - - # error: Signature of "__doc__" incompatible with supertype "object" - @property - def __doc__(self) -> str: # type: ignore[override] - opts_desc = _describe_option("all", _print_desc=False) - opts_list = pp_options_list(list(_registered_options.keys())) - return self.__doc_tmpl__.format(opts_desc=opts_desc, opts_list=opts_list) - - -_get_option_tmpl = """ -get_option(pat) - -Retrieves the value of the specified option. - -Available options: - -{opts_list} - -Parameters ----------- -pat : str - Regexp which should match a single option. - Note: partial matches are supported for convenience, but unless you use the - full option name (e.g. x.y.z.option_name), your code may break in future - versions if new options with similar names are introduced. - -Returns -------- -result : the value of the option - -Raises ------- -OptionError : if no such option exists - -Notes ------ -Please reference the :ref:`User Guide ` for more information. - -The available options with its descriptions: - -{opts_desc} - -Examples --------- ->>> pd.get_option('display.max_columns') # doctest: +SKIP -4 -""" - -_set_option_tmpl = """ -set_option(pat, value) - -Sets the value of the specified option. - -Available options: - -{opts_list} - -Parameters ----------- -pat : str - Regexp which should match a single option. - Note: partial matches are supported for convenience, but unless you use the - full option name (e.g. x.y.z.option_name), your code may break in future - versions if new options with similar names are introduced. -value : object - New value of option. 
- -Returns -------- -None - -Raises ------- -OptionError if no such option exists - -Notes ------ -Please reference the :ref:`User Guide ` for more information. - -The available options with its descriptions: - -{opts_desc} - -Examples --------- ->>> pd.set_option('display.max_columns', 4) ->>> df = pd.DataFrame([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]) ->>> df - 0 1 ... 3 4 -0 1 2 ... 4 5 -1 6 7 ... 9 10 -[2 rows x 5 columns] ->>> pd.reset_option('display.max_columns') -""" - -_describe_option_tmpl = """ -describe_option(pat, _print_desc=False) - -Prints the description for one or more registered options. - -Call with no arguments to get a listing for all registered options. - -Available options: - -{opts_list} - -Parameters ----------- -pat : str - Regexp pattern. All matching keys will have their description displayed. -_print_desc : bool, default True - If True (default) the description(s) will be printed to stdout. - Otherwise, the description(s) will be returned as a unicode string - (for testing). - -Returns -------- -None by default, the description(s) as a unicode string if _print_desc -is False - -Notes ------ -Please reference the :ref:`User Guide ` for more information. - -The available options with its descriptions: - -{opts_desc} - -Examples --------- ->>> pd.describe_option('display.max_columns') # doctest: +SKIP -display.max_columns : int - If max_cols is exceeded, switch to truncate view... -""" - -_reset_option_tmpl = """ -reset_option(pat) - -Reset one or more options to their default value. - -Pass "all" as argument to reset all options. - -Available options: - -{opts_list} - -Parameters ----------- -pat : str/regex - If specified only options matching `prefix*` will be reset. - Note: partial matches are supported for convenience, but unless you - use the full option name (e.g. x.y.z.option_name), your code may break - in future versions if new options with similar names are introduced. - -Returns -------- -None +options = DictWrapper(_global_config) -Notes ------ -Please reference the :ref:`User Guide ` for more information. +# +# Functions for use by pandas developers, in addition to User - api -The available options with its descriptions: -{opts_desc} +@contextmanager +def option_context(*args) -> Generator[None]: + """ + Context manager to temporarily set options in a ``with`` statement. -Examples --------- ->>> pd.reset_option('display.max_columns') # doctest: +SKIP -""" + This method allows users to set one or more pandas options temporarily + within a controlled block. The previous options' values are restored + once the block is exited. This is useful when making temporary adjustments + to pandas' behavior without affecting the global state. -# bind the functions with their docstrings into a Callable -# and use that as the functions exposed in pd.api -get_option = CallableDynamicDoc(_get_option, _get_option_tmpl) -set_option = CallableDynamicDoc(_set_option, _set_option_tmpl) -reset_option = CallableDynamicDoc(_reset_option, _reset_option_tmpl) -describe_option = CallableDynamicDoc(_describe_option, _describe_option_tmpl) -options = DictWrapper(_global_config) + Parameters + ---------- + *args : str | object + An even amount of arguments provided in pairs which will be + interpreted as (pattern, value) pairs. -# -# Functions for use by pandas developers, in addition to User - api + Returns + ------- + None + No return value. + Yields + ------ + None + No yield value. 
-class option_context(ContextDecorator): - """ - Context manager to temporarily set options in the `with` statement context. + See Also + -------- + get_option : Retrieve the value of the specified option. + set_option : Set the value of the specified option. + reset_option : Reset one or more options to their default value. + describe_option : Print the description for one or more registered options. - You need to invoke as ``option_context(pat, val, [(pat, val), ...])``. + Notes + ----- + For all available options, please view the :ref:`User Guide ` + or use ``pandas.describe_option()``. Examples -------- >>> from pandas import option_context - >>> with option_context('display.max_rows', 10, 'display.max_columns', 5): + >>> with option_context("display.max_rows", 10, "display.max_columns", 5): ... pass """ + if len(args) % 2 != 0 or len(args) < 2: + raise ValueError( + "Provide an even amount of arguments as " + "option_context(pat, val, pat, val...)." + ) - def __init__(self, *args) -> None: - if len(args) % 2 != 0 or len(args) < 2: - raise ValueError( - "Need to invoke as option_context(pat, val, [(pat, val), ...])." - ) - - self.ops = list(zip(args[::2], args[1::2])) - - def __enter__(self) -> None: - self.undo = [(pat, _get_option(pat)) for pat, val in self.ops] - - for pat, val in self.ops: - _set_option(pat, val, silent=True) - - def __exit__(self, *args) -> None: - if self.undo: - for pat, val in self.undo: - _set_option(pat, val, silent=True) + ops = tuple(zip(args[::2], args[1::2])) + try: + undo = tuple((pat, get_option(pat)) for pat, val in ops) + for pat, val in ops: + set_option(pat, val) + yield + finally: + for pat, val in undo: + set_option(pat, val) def register_option( @@ -640,12 +641,6 @@ def _get_root(key: str) -> tuple[dict[str, Any], str]: return cursor, path[-1] -def _is_deprecated(key: str) -> bool: - """Returns True if the given option has been deprecated""" - key = key.lower() - return key in _deprecated_options - - def _get_deprecated_option(key: str): """ Retrieves the metadata for a deprecated option, if `key` is deprecated. @@ -728,7 +723,10 @@ def _build_option_description(k: str) -> str: s += "No description available." 
if o: - s += f"\n [default: {o.defval}] [currently: {_get_option(k, True)}]" + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + warnings.simplefilter("ignore", DeprecationWarning) + s += f"\n [default: {o.defval}] [currently: {get_option(k)}]" if d: rkey = d.rkey or "" @@ -739,46 +737,11 @@ def _build_option_description(k: str) -> str: return s -def pp_options_list(keys: Iterable[str], width: int = 80, _print: bool = False): - """Builds a concise listing of available options, grouped by prefix""" - from itertools import groupby - from textwrap import wrap - - def pp(name: str, ks: Iterable[str]) -> list[str]: - pfx = "- " + name + ".[" if name else "" - ls = wrap( - ", ".join(ks), - width, - initial_indent=pfx, - subsequent_indent=" ", - break_long_words=False, - ) - if ls and ls[-1] and name: - ls[-1] = ls[-1] + "]" - return ls - - ls: list[str] = [] - singles = [x for x in sorted(keys) if x.find(".") < 0] - if singles: - ls += pp("", singles) - keys = [x for x in keys if x.find(".") >= 0] - - for k, g in groupby(sorted(keys), lambda x: x[: x.rfind(".")]): - ks = [x[len(k) + 1 :] for x in list(g)] - ls += pp(k, ks) - s = "\n".join(ls) - if _print: - print(s) - else: - return s - - -# # helpers @contextmanager -def config_prefix(prefix: str) -> Generator[None, None, None]: +def config_prefix(prefix: str) -> Generator[None]: """ contextmanager for multiple invocations of API with a common prefix @@ -853,7 +816,7 @@ def inner(x) -> None: return inner -def is_instance_factory(_type) -> Callable[[Any], None]: +def is_instance_factory(_type: type | tuple[type, ...]) -> Callable[[Any], None]: """ Parameters @@ -866,8 +829,7 @@ def is_instance_factory(_type) -> Callable[[Any], None]: ValueError if x is not an instance of `_type` """ - if isinstance(_type, (tuple, list)): - _type = tuple(_type) + if isinstance(_type, tuple): type_repr = "|".join(map(str, _type)) else: type_repr = f"'{_type}'" @@ -879,7 +841,7 @@ def inner(x) -> None: return inner -def is_one_of_factory(legal_values) -> Callable[[Any], None]: +def is_one_of_factory(legal_values: Sequence) -> Callable[[Any], None]: callables = [c for c in legal_values if callable(c)] legal_values = [c for c in legal_values if not callable(c)] @@ -930,7 +892,7 @@ def is_nonnegative_int(value: object) -> None: is_text = is_instance_factory((str, bytes)) -def is_callable(obj) -> bool: +def is_callable(obj: object) -> bool: """ Parameters diff --git a/pandas/_config/dates.py b/pandas/_config/dates.py index b37831f96eb73..2d9f5d390dc9c 100644 --- a/pandas/_config/dates.py +++ b/pandas/_config/dates.py @@ -1,6 +1,7 @@ """ config for datetime formatting """ + from __future__ import annotations from pandas._config import config as cf diff --git a/pandas/_config/localization.py b/pandas/_config/localization.py index 5c1a0ff139533..6602633f20399 100644 --- a/pandas/_config/localization.py +++ b/pandas/_config/localization.py @@ -3,6 +3,7 @@ Name `localization` is chosen to avoid overlap with builtin `locale` module. 
""" + from __future__ import annotations from contextlib import contextmanager @@ -10,7 +11,10 @@ import platform import re import subprocess -from typing import TYPE_CHECKING +from typing import ( + TYPE_CHECKING, + cast, +) from pandas._config.config import options @@ -21,7 +25,7 @@ @contextmanager def set_locale( new_locale: str | tuple[str, str], lc_var: int = locale.LC_ALL -) -> Generator[str | tuple[str, str], None, None]: +) -> Generator[str | tuple[str, str]]: """ Context manager for temporarily setting a locale. @@ -152,7 +156,7 @@ def get_locales( out_locales = [] for x in split_raw_locales: try: - out_locales.append(str(x, encoding=options.display.encoding)) + out_locales.append(str(x, encoding=cast(str, options.display.encoding))) except UnicodeError: # 'locale -a' is used to populated 'raw_locales' and on # Redhat 7 Linux (and maybe others) prints locale names diff --git a/pandas/_libs/__init__.py b/pandas/_libs/__init__.py index 26a872a90e493..d499f9a6cd75e 100644 --- a/pandas/_libs/__init__.py +++ b/pandas/_libs/__init__.py @@ -1,4 +1,5 @@ __all__ = [ + "Interval", "NaT", "NaTType", "OutOfBoundsDatetime", @@ -6,7 +7,6 @@ "Timedelta", "Timestamp", "iNaT", - "Interval", ] diff --git a/pandas/_libs/algos.pyi b/pandas/_libs/algos.pyi index caf5425dfc7b4..0a6be851e1efd 100644 --- a/pandas/_libs/algos.pyi +++ b/pandas/_libs/algos.pyi @@ -165,6 +165,15 @@ def take_1d_int32_float64( def take_1d_int64_int64( values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... ) -> None: ... +def take_1d_uint16_uint16( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_1d_uint32_uint32( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_1d_uint64_uint64( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... def take_1d_int64_float64( values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... ) -> None: ... @@ -225,6 +234,15 @@ def take_2d_axis0_int64_int64( def take_2d_axis0_int64_float64( values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... ) -> None: ... +def take_2d_axis0_uint16_uint16( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis0_uint32_uint32( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis0_uint64_uint64( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... def take_2d_axis0_float32_float32( values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... ) -> None: ... @@ -279,6 +297,15 @@ def take_2d_axis1_int32_float64( def take_2d_axis1_int64_int64( values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... ) -> None: ... +def take_2d_axis1_uint16_uint16( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis1_uint32_uint32( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... +def take_2d_axis1_uint64_uint64( + values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... +) -> None: ... def take_2d_axis1_int64_float64( values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=... ) -> None: ... 
diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index e70ac26a2c28e..f584c0ff9f614 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -180,6 +180,8 @@ def is_lexsorted(list_of_arrays: list) -> bool: n = len(list_of_arrays[0]) cdef int64_t **vecs = malloc(nlevels * sizeof(int64_t*)) + if vecs is NULL: + raise MemoryError() for i in range(nlevels): arr = list_of_arrays[i] assert arr.dtype.name == "int64" @@ -349,12 +351,11 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None): Py_ssize_t i, xi, yi, N, K int64_t minpv float64_t[:, ::1] result - ndarray[uint8_t, ndim=2] mask + uint8_t[:, :] mask int64_t nobs = 0 - float64_t vx, vy, dx, dy, meanx, meany, divisor, ssqdmx, ssqdmy, covxy + float64_t vx, vy, dx, dy, meanx, meany, divisor, ssqdmx, ssqdmy, covxy, val N, K = (mat).shape - if minp is None: minpv = 1 else: @@ -387,8 +388,15 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None): else: divisor = (nobs - 1.0) if cov else sqrt(ssqdmx * ssqdmy) + # clip `covxy / divisor` to ensure coeff is within bounds if divisor != 0: - result[xi, yi] = result[yi, xi] = covxy / divisor + val = covxy / divisor + if not cov: + if val > 1.0: + val = 1.0 + elif val < -1.0: + val = -1.0 + result[xi, yi] = result[yi, xi] = val else: result[xi, yi] = result[yi, xi] = NaN @@ -405,7 +413,7 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr Py_ssize_t i, xi, yi, N, K ndarray[float64_t, ndim=2] result ndarray[float64_t, ndim=2] ranked_mat - ndarray[float64_t, ndim=1] rankedx, rankedy + float64_t[::1] rankedx, rankedy float64_t[::1] maskedx, maskedy ndarray[uint8_t, ndim=2] mask int64_t nobs = 0 @@ -564,8 +572,8 @@ def get_fill_indexer(const uint8_t[:] mask, limit=None): @cython.boundscheck(False) @cython.wraparound(False) def pad( - ndarray[numeric_object_t] old, - ndarray[numeric_object_t] new, + const numeric_object_t[:] old, + const numeric_object_t[:] new, limit=None ) -> ndarray: # -> ndarray[intp_t, ndim=1] @@ -689,8 +697,8 @@ def pad_2d_inplace(numeric_object_t[:, :] values, uint8_t[:, :] mask, limit=None @cython.boundscheck(False) @cython.wraparound(False) def backfill( - ndarray[numeric_object_t] old, - ndarray[numeric_object_t] new, + const numeric_object_t[:] old, + const numeric_object_t[:] new, limit=None ) -> ndarray: # -> ndarray[intp_t, ndim=1] """ @@ -784,7 +792,7 @@ def backfill_2d_inplace(numeric_object_t[:, :] values, @cython.boundscheck(False) @cython.wraparound(False) -def is_monotonic(ndarray[numeric_object_t, ndim=1] arr, bint timelike): +def is_monotonic(const numeric_object_t[:] arr, bint timelike): """ Returns ------- @@ -816,33 +824,7 @@ def is_monotonic(ndarray[numeric_object_t, ndim=1] arr, bint timelike): if timelike and arr[0] == NPY_NAT: return False, False, False - if numeric_object_t is not object: - with nogil: - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if timelike and cur == NPY_NAT: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if cur < prev: - is_monotonic_inc = 0 - elif cur > prev: - is_monotonic_dec = 0 - elif cur == prev: - is_unique = 0 - else: - # cur or prev is NaN - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if not is_monotonic_inc and not is_monotonic_dec: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - prev = cur - else: - # object-dtype, identical to above except we cannot use `with nogil` + with nogil(numeric_object_t is not object): prev = arr[0] for i in range(1, n): cur = arr[i] @@ -1087,8 +1069,7 @@ cdef void rank_sorted_1d( 
float64_t[::1] out, int64_t[::1] grp_sizes, const intp_t[:] sort_indexer, - # TODO(cython3): make const (https://github.com/cython/cython/issues/3222) - numeric_object_t[:] masked_vals, + const numeric_object_t[:] masked_vals, const uint8_t[:] mask, bint check_mask, Py_ssize_t N, @@ -1376,7 +1357,7 @@ ctypedef fused out_t: @cython.boundscheck(False) @cython.wraparound(False) def diff_2d( - ndarray[diff_t, ndim=2] arr, # TODO(cython3) update to "const diff_t[:, :] arr" + const diff_t[:, :] arr, ndarray[out_t, ndim=2] out, Py_ssize_t periods, int axis, @@ -1384,8 +1365,7 @@ def diff_2d( ): cdef: Py_ssize_t i, j, sx, sy, start, stop - bint f_contig = arr.flags.f_contiguous - # bint f_contig = arr.is_f_contig() # TODO(cython3) once arr is memoryview + bint f_contig = arr.is_f_contig() diff_t left, right # Disable for unsupported dtype combinations, diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in index 88c3abba506a3..e6b39896280b9 100644 --- a/pandas/_libs/algos_take_helper.pxi.in +++ b/pandas/_libs/algos_take_helper.pxi.in @@ -15,6 +15,9 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in dtypes = [ ('uint8_t', 'uint8_t'), ('uint8_t', 'object'), + ('uint16_t', 'uint16_t'), + ('uint32_t', 'uint32_t'), + ('uint64_t', 'uint64_t'), ('int8_t', 'int8_t'), ('int8_t', 'int32_t'), ('int8_t', 'int64_t'), @@ -184,6 +187,17 @@ def take_2d_axis1_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, fv = fill_value + {{if c_type_in == c_type_out != "object"}} + with nogil: + for i in range(n): + for j in range(k): + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + {{else}} for i in range(n): for j in range(k): idx = indexer[j] @@ -195,6 +209,7 @@ def take_2d_axis1_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, {{else}} out[i, j] = values[i, idx] {{endif}} + {{endif}} @cython.wraparound(False) diff --git a/pandas/_libs/arrays.pyi b/pandas/_libs/arrays.pyi index 86f69c3cdfc75..60e4ff3fab74e 100644 --- a/pandas/_libs/arrays.pyi +++ b/pandas/_libs/arrays.pyi @@ -1,4 +1,4 @@ -from typing import Sequence +from collections.abc import Sequence import numpy as np @@ -14,9 +14,9 @@ class NDArrayBacked: _ndarray: np.ndarray def __init__(self, values: np.ndarray, dtype: DtypeObj) -> None: ... @classmethod - def _simple_new(cls, values: np.ndarray, dtype: DtypeObj): ... - def _from_backing_data(self, values: np.ndarray): ... - def __setstate__(self, state): ... + def _simple_new(cls, values: np.ndarray, dtype: DtypeObj) -> Self: ... + def _from_backing_data(self, values: np.ndarray) -> Self: ... + def __setstate__(self, state) -> None: ... def __len__(self) -> int: ... @property def shape(self) -> Shape: ... @@ -26,14 +26,14 @@ class NDArrayBacked: def size(self) -> int: ... @property def nbytes(self) -> int: ... - def copy(self, order=...): ... - def delete(self, loc, axis=...): ... - def swapaxes(self, axis1, axis2): ... - def repeat(self, repeats: int | Sequence[int], axis: int | None = ...): ... - def reshape(self, *args, **kwargs): ... - def ravel(self, order=...): ... + def copy(self, order=...) -> Self: ... + def delete(self, loc, axis=...) -> Self: ... + def swapaxes(self, axis1, axis2) -> Self: ... + def repeat(self, repeats: int | Sequence[int], axis: int | None = ...) -> Self: ... + def reshape(self, *args, **kwargs) -> Self: ... + def ravel(self, order=...) -> Self: ... @property - def T(self): ... + def T(self) -> Self: ... 
@classmethod def _concat_same_type( cls, to_concat: Sequence[Self], axis: AxisInt = ... diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 9889436a542c1..2932f3ff56396 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -67,6 +67,10 @@ cdef class NDArrayBacked: """ Construct a new ExtensionArray `new_array` with `arr` as its _ndarray. + The returned array has the same dtype as self. + + Caller is responsible for ensuring `values.dtype == self._ndarray.dtype`. + This should round-trip: self == self._from_backing_data(self._ndarray) """ diff --git a/pandas/_libs/byteswap.pyx b/pandas/_libs/byteswap.pyx index 67cd7ad58d229..7a8a9fc5a9139 100644 --- a/pandas/_libs/byteswap.pyx +++ b/pandas/_libs/byteswap.pyx @@ -15,7 +15,7 @@ from libc.string cimport memcpy def read_float_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): cdef uint32_t value - assert offset + sizeof(value) < len(data) + assert offset + sizeof(value) < len(data) cdef const void *ptr = (data) + offset memcpy(&value, ptr, sizeof(value)) if byteswap: @@ -28,7 +28,7 @@ def read_float_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): def read_double_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): cdef uint64_t value - assert offset + sizeof(value) < len(data) + assert offset + sizeof(value) < len(data) cdef const void *ptr = (data) + offset memcpy(&value, ptr, sizeof(value)) if byteswap: @@ -41,7 +41,7 @@ def read_double_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): def read_uint16_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): cdef uint16_t res - assert offset + sizeof(res) < len(data) + assert offset + sizeof(res) < len(data) memcpy(&res, (data) + offset, sizeof(res)) if byteswap: res = _byteswap2(res) @@ -50,7 +50,7 @@ def read_uint16_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): def read_uint32_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): cdef uint32_t res - assert offset + sizeof(res) < len(data) + assert offset + sizeof(res) < len(data) memcpy(&res, (data) + offset, sizeof(res)) if byteswap: res = _byteswap4(res) @@ -59,7 +59,7 @@ def read_uint32_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): def read_uint64_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): cdef uint64_t res - assert offset + sizeof(res) < len(data) + assert offset + sizeof(res) < len(data) memcpy(&res, (data) + offset, sizeof(res)) if byteswap: res = _byteswap8(res) diff --git a/pandas/_libs/dtypes.pxd b/pandas/_libs/dtypes.pxd index de4b70d387b5f..ccfb2d2ef4a23 100644 --- a/pandas/_libs/dtypes.pxd +++ b/pandas/_libs/dtypes.pxd @@ -34,8 +34,3 @@ ctypedef fused numeric_t: ctypedef fused numeric_object_t: numeric_t object - -ctypedef fused uint8_int64_object_t: - uint8_t - int64_t - object diff --git a/pandas/_libs/free_threading_config.pxi.in b/pandas/_libs/free_threading_config.pxi.in new file mode 100644 index 0000000000000..fe7ca399389ab --- /dev/null +++ b/pandas/_libs/free_threading_config.pxi.in @@ -0,0 +1,3 @@ +# Autogenerated file containing Cython compile-time defines + +DEF CYTHON_COMPATIBLE_WITH_FREE_THREADING = @freethreading_compatible@ diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index 135828a23648a..163fc23535022 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -12,6 +12,8 @@ def group_median_float64( min_count: int = ..., # Py_ssize_t mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., + is_datetimelike: bool = ..., # bint + skipna: 
bool = ..., ) -> None: ... def group_cumprod( out: np.ndarray, # float64_t[:, ::1] @@ -42,10 +44,10 @@ def group_shift_indexer( def group_fillna_indexer( out: np.ndarray, # ndarray[intp_t] labels: np.ndarray, # ndarray[int64_t] - sorted_labels: npt.NDArray[np.intp], mask: npt.NDArray[np.uint8], limit: int, # int64_t - dropna: bool, + compute_ffill: bool, + ngroups: int, ) -> None: ... def group_any_all( out: np.ndarray, # uint8_t[::1] @@ -65,6 +67,7 @@ def group_sum( result_mask: np.ndarray | None = ..., min_count: int = ..., is_datetimelike: bool = ..., + skipna: bool = ..., ) -> None: ... def group_prod( out: np.ndarray, # int64float_t[:, ::1] @@ -74,6 +77,7 @@ def group_prod( mask: np.ndarray | None, result_mask: np.ndarray | None = ..., min_count: int = ..., + skipna: bool = ..., ) -> None: ... def group_var( out: np.ndarray, # floating[:, ::1] @@ -86,6 +90,7 @@ def group_var( result_mask: np.ndarray | None = ..., is_datetimelike: bool = ..., name: str = ..., + skipna: bool = ..., ) -> None: ... def group_skew( out: np.ndarray, # float64_t[:, ::1] @@ -96,6 +101,15 @@ def group_skew( result_mask: np.ndarray | None = ..., skipna: bool = ..., ) -> None: ... +def group_kurt( + out: np.ndarray, # float64_t[:, ::1] + counts: np.ndarray, # int64_t[::1] + values: np.ndarray, # ndarray[float64_T, ndim=2] + labels: np.ndarray, # const intp_t[::1] + mask: np.ndarray | None = ..., + result_mask: np.ndarray | None = ..., + skipna: bool = ..., +) -> None: ... def group_mean( out: np.ndarray, # floating[:, ::1] counts: np.ndarray, # int64_t[::1] @@ -105,6 +119,7 @@ def group_mean( is_datetimelike: bool = ..., # bint mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., + skipna: bool = ..., ) -> None: ... def group_ohlc( out: np.ndarray, # floatingintuint_t[:, ::1] @@ -136,6 +151,7 @@ def group_last( result_mask: npt.NDArray[np.bool_] | None = ..., min_count: int = ..., # Py_ssize_t is_datetimelike: bool = ..., + skipna: bool = ..., ) -> None: ... def group_nth( out: np.ndarray, # rank_t[:, ::1] @@ -147,6 +163,7 @@ def group_nth( min_count: int = ..., # int64_t rank: int = ..., # int64_t is_datetimelike: bool = ..., + skipna: bool = ..., ) -> None: ... def group_rank( out: np.ndarray, # float64_t[:, ::1] @@ -169,6 +186,7 @@ def group_max( is_datetimelike: bool = ..., mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., + skipna: bool = ..., ) -> None: ... def group_min( out: np.ndarray, # groupby_t[:, ::1] @@ -179,6 +197,7 @@ def group_min( is_datetimelike: bool = ..., mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., + skipna: bool = ..., ) -> None: ... 
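
For context, the skipna flag being threaded through these group_* signatures is the kernel-level switch behind the skipna keyword on groupby reductions. A short user-level sketch of the intended semantics; hedged, since the keyword's availability on a given reduction depends on the pandas version carrying this change:

import numpy as np
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1.0, np.nan, 3.0]})

# Default skipna=True: the NaN in group "a" is ignored -> a: 1.0, b: 3.0
print(df.groupby("key")["val"].sum())

# skipna=False: a NaN poisons its group's result -> a: NaN, b: 3.0
print(df.groupby("key")["val"].sum(skipna=False))
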
def group_idxmin_idxmax( out: npt.NDArray[np.intp], diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 19d71b0a6fde3..f65fa2368967a 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -62,7 +62,12 @@ cdef enum InterpolationEnumType: INTERPOLATION_MIDPOINT -cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept nogil: +cdef float64_t median_linear_mask( + float64_t* a, + int n, + uint8_t* mask, + bint skipna=True +) noexcept nogil: cdef: int i, j, na_count = 0 float64_t* tmp @@ -77,10 +82,12 @@ cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept n na_count += 1 if na_count: - if na_count == n: + if na_count == n or not skipna: return NaN tmp = malloc((n - na_count) * sizeof(float64_t)) + if tmp is NULL: + raise MemoryError() j = 0 for i in range(n): @@ -99,7 +106,12 @@ cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept n return result -cdef float64_t median_linear(float64_t* a, int n) noexcept nogil: +cdef float64_t median_linear( + float64_t* a, + int n, + bint is_datetimelike=False, + bint skipna=True, +) noexcept nogil: cdef: int i, j, na_count = 0 float64_t* tmp @@ -109,21 +121,34 @@ cdef float64_t median_linear(float64_t* a, int n) noexcept nogil: return NaN # count NAs - for i in range(n): - if a[i] != a[i]: - na_count += 1 + if is_datetimelike: + for i in range(n): + if a[i] == NPY_NAT: + na_count += 1 + else: + for i in range(n): + if a[i] != a[i]: + na_count += 1 if na_count: - if na_count == n: + if na_count == n or not skipna: return NaN tmp = malloc((n - na_count) * sizeof(float64_t)) + if tmp is NULL: + raise MemoryError() j = 0 - for i in range(n): - if a[i] == a[i]: - tmp[j] = a[i] - j += 1 + if is_datetimelike: + for i in range(n): + if a[i] != NPY_NAT: + tmp[j] = a[i] + j += 1 + else: + for i in range(n): + if a[i] == a[i]: + tmp[j] = a[i] + j += 1 a = tmp n -= na_count @@ -166,6 +191,8 @@ def group_median_float64( Py_ssize_t min_count=-1, const uint8_t[:, :] mask=None, uint8_t[:, ::1] result_mask=None, + bint is_datetimelike=False, + bint skipna=True, ) -> None: """ Only aggregates on axis=0 @@ -209,7 +236,7 @@ def group_median_float64( for j in range(ngroups): size = _counts[j + 1] - result = median_linear_mask(ptr, size, ptr_mask) + result = median_linear_mask(ptr, size, ptr_mask, skipna) out[j, i] = result if result != result: @@ -224,7 +251,7 @@ def group_median_float64( ptr += _counts[0] for j in range(ngroups): size = _counts[j + 1] - out[j, i] = median_linear(ptr, size) + out[j, i] = median_linear(ptr, size, is_datetimelike, skipna) ptr += size @@ -378,8 +405,14 @@ def group_cumsum( for i in range(N): lab = labels[i] - if lab < 0: + if uses_mask and lab < 0: + # GH#58811 + result_mask[i, :] = True + out[i, :] = 0 + continue + elif lab < 0: continue + for j in range(K): val = values[i, j] @@ -491,12 +524,12 @@ def group_shift_indexer( @cython.wraparound(False) @cython.boundscheck(False) def group_fillna_indexer( - ndarray[intp_t] out, - ndarray[intp_t] labels, - ndarray[intp_t] sorted_labels, - ndarray[uint8_t] mask, + Py_ssize_t[::1] out, + const intp_t[::1] labels, + const uint8_t[:] mask, int64_t limit, - bint dropna, + bint compute_ffill, + int ngroups, ) -> None: """ Indexes how to fill values forwards or backwards within a group. @@ -508,50 +541,52 @@ def group_fillna_indexer( labels : np.ndarray[np.intp] Array containing unique label for each group, with its ordering matching up to the corresponding record in `values`. 
- sorted_labels : np.ndarray[np.intp] - obtained by `np.argsort(labels, kind="mergesort")` - values : np.ndarray[np.uint8] - Containing the truth value of each element. mask : np.ndarray[np.uint8] Indicating whether a value is na or not. - limit : Consecutive values to fill before stopping, or -1 for no limit - dropna : Flag to indicate if NaN groups should return all NaN values + limit : int64_t + Consecutive values to fill before stopping, or -1 for no limit. + compute_ffill : bint + Whether to compute ffill or bfill. + ngroups : int + Number of groups, larger than all entries of `labels`. Notes ----- This method modifies the `out` parameter rather than returning an object """ cdef: - Py_ssize_t i, N, idx - intp_t curr_fill_idx=-1 - int64_t filled_vals = 0 - - N = len(out) + Py_ssize_t idx, N = len(out) + intp_t label + intp_t[::1] last = -1 * np.ones(ngroups, dtype=np.intp) + intp_t[::1] fill_count = np.zeros(ngroups, dtype=np.intp) # Make sure all arrays are the same size assert N == len(labels) == len(mask) with nogil: - for i in range(N): - idx = sorted_labels[i] - if dropna and labels[idx] == -1: # nan-group gets nan-values - curr_fill_idx = -1 + # Can't use for loop with +/- step + # https://github.com/cython/cython/issues/1106 + idx = 0 if compute_ffill else N-1 + for _ in range(N): + label = labels[idx] + if label == -1: # na-group gets na-values + out[idx] = -1 elif mask[idx] == 1: # is missing # Stop filling once we've hit the limit - if filled_vals >= limit and limit != -1: - curr_fill_idx = -1 - filled_vals += 1 - else: # reset items when not missing - filled_vals = 0 - curr_fill_idx = idx - - out[idx] = curr_fill_idx + if limit != -1 and fill_count[label] >= limit: + out[idx] = -1 + else: + out[idx] = last[label] + fill_count[label] += 1 + else: + fill_count[label] = 0 # reset items when not missing + last[label] = idx + out[idx] = idx - # If we move to the next group, reset - # the fill_idx and counter - if i == N - 1 or labels[idx] != labels[sorted_labels[i + 1]]: - curr_fill_idx = -1 - filled_vals = 0 + if compute_ffill: + idx += 1 + else: + idx -= 1 @cython.boundscheck(False) @@ -672,18 +707,19 @@ def group_sum( uint8_t[:, ::1] result_mask=None, Py_ssize_t min_count=0, bint is_datetimelike=False, + bint skipna=True, ) -> None: """ Only aggregates on axis=0 using Kahan summation """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - sum_t val, t, y + sum_t val, t, y, nan_val sum_t[:, ::1] sumx, compensation int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) bint uses_mask = mask is not None - bint isna_entry + bint isna_entry, isna_result if len_values != len_labels: raise ValueError("len(index) != len(labels)") @@ -694,6 +730,15 @@ def group_sum( compensation = np.zeros((out).shape, dtype=(out).base.dtype) N, K = (values).shape + if uses_mask: + nan_val = 0 + elif is_datetimelike: + nan_val = NPY_NAT + elif sum_t is int64_t or sum_t is uint64_t: + # This has no effect as int64 can't be nan. Setting to 0 to avoid type error + nan_val = 0 + else: + nan_val = NAN with nogil(sum_t is not object): for i in range(N): @@ -711,6 +756,18 @@ def group_sum( else: isna_entry = _treat_as_na(val, is_datetimelike) + if not skipna: + if uses_mask: + isna_result = result_mask[lab, j] + else: + isna_result = _treat_as_na(sumx[lab, j], is_datetimelike) + + if isna_result: + # If sum is already NA, don't add to it. 
This is important for + # datetimelike because adding a value to NPY_NAT may not result + # in a NPY_NAT + continue + if not isna_entry: nobs[lab, j] += 1 @@ -737,6 +794,11 @@ def group_sum( # because of no gil compensation[lab, j] = 0 sumx[lab, j] = t + elif not skipna: + if uses_mask: + result_mask[lab, j] = True + else: + sumx[lab, j] = nan_val _check_below_mincount( out, uses_mask, result_mask, ncounts, K, nobs, min_count, sumx @@ -753,17 +815,18 @@ def group_prod( const uint8_t[:, ::1] mask, uint8_t[:, ::1] result_mask=None, Py_ssize_t min_count=0, + bint skipna=True, ) -> None: """ Only aggregates on axis=0 """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - int64float_t val + int64float_t val, nan_val int64float_t[:, ::1] prodx int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) - bint isna_entry, uses_mask = mask is not None + bint isna_entry, isna_result, uses_mask = mask is not None if len_values != len_labels: raise ValueError("len(index) != len(labels)") @@ -772,6 +835,7 @@ def group_prod( prodx = np.ones((out).shape, dtype=(out).base.dtype) N, K = (values).shape + nan_val = _get_na_val(0, False) with nogil: for i in range(N): @@ -788,9 +852,24 @@ def group_prod( else: isna_entry = _treat_as_na(val, False) + if not skipna: + if uses_mask: + isna_result = result_mask[lab, j] + else: + isna_result = _treat_as_na(prodx[lab, j], False) + + if isna_result: + # If prod is already NA, no need to update it + continue + if not isna_entry: nobs[lab, j] += 1 prodx[lab, j] *= val + elif not skipna: + if uses_mask: + result_mask[lab, j] = True + else: + prodx[lab, j] = nan_val _check_below_mincount( out, uses_mask, result_mask, ncounts, K, nobs, min_count, prodx @@ -811,6 +890,7 @@ def group_var( uint8_t[:, ::1] result_mask=None, bint is_datetimelike=False, str name="var", + bint skipna=True, ) -> None: cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) @@ -818,7 +898,7 @@ def group_var( floating[:, ::1] mean int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) - bint isna_entry, uses_mask = mask is not None + bint isna_entry, isna_result, uses_mask = mask is not None bint is_std = name == "std" bint is_sem = name == "sem" @@ -855,11 +935,34 @@ def group_var( else: isna_entry = _treat_as_na(val, is_datetimelike) + if not skipna: + if uses_mask: + isna_result = result_mask[lab, j] + elif is_datetimelike: + # With group_var, we cannot just use _treat_as_na bc + # datetimelike dtypes get cast to float64 instead of + # to int64. + isna_result = out[lab, j] == NPY_NAT + else: + isna_result = _treat_as_na(out[lab, j], is_datetimelike) + + if isna_result: + # If aggregate is already NA, don't add to it. 
This is + # important for datetimelike because adding a value to NPY_NAT + # may not result in a NPY_NAT + continue + if not isna_entry: nobs[lab, j] += 1 oldmean = mean[lab, j] mean[lab, j] += (val - oldmean) / nobs[lab, j] out[lab, j] += (val - mean[lab, j]) * (val - oldmean) + elif not skipna: + nobs[lab, j] = 0 + if uses_mask: + result_mask[lab, j] = True + else: + out[lab, j] = NAN for i in range(ncounts): for j in range(K): @@ -882,7 +985,7 @@ def group_var( @cython.wraparound(False) @cython.boundscheck(False) @cython.cdivision(True) -@cython.cpow +@cython.cpow(True) def group_skew( float64_t[:, ::1] out, int64_t[::1] counts, @@ -933,7 +1036,7 @@ def group_skew( isna_entry = _treat_as_na(val, False) if not isna_entry: - # Based on RunningStats::Push from + # Running stats update based on RunningStats::Push from # https://www.johndcook.com/blog/skewness_kurtosis/ n1 = nobs[lab, j] n = n1 + 1 @@ -967,6 +1070,100 @@ def group_skew( ) +@cython.wraparound(False) +@cython.boundscheck(False) +@cython.cdivision(True) +@cython.cpow(True) +def group_kurt( + float64_t[:, ::1] out, + int64_t[::1] counts, + ndarray[float64_t, ndim=2] values, + const intp_t[::1] labels, + const uint8_t[:, ::1] mask=None, + uint8_t[:, ::1] result_mask=None, + bint skipna=True, +) -> None: + cdef: + Py_ssize_t i, j, N, K, lab, ngroups = len(counts) + int64_t[:, ::1] nobs + Py_ssize_t len_values = len(values), len_labels = len(labels) + bint isna_entry, uses_mask = mask is not None + float64_t[:, ::1] M1, M2, M3, M4 + float64_t delta, delta_n, delta_n2, term1, val + int64_t n1, n + float64_t ct, num, den, adj + + if len_values != len_labels: + raise ValueError("len(index) != len(labels)") + + nobs = np.zeros((out).shape, dtype=np.int64) + + # M1, M2, M3 and M4 correspond to 1st, 2nd, 3rd and 4th Moments + M1 = np.zeros((out).shape, dtype=np.float64) + M2 = np.zeros((out).shape, dtype=np.float64) + M3 = np.zeros((out).shape, dtype=np.float64) + M4 = np.zeros((out).shape, dtype=np.float64) + + N, K = (values).shape + + out[:, :] = 0.0 + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + + for j in range(K): + val = values[i, j] + + if uses_mask: + isna_entry = mask[i, j] + else: + isna_entry = _treat_as_na(val, False) + + if not isna_entry: + # Running stats update based on RunningStats::Push from + # https://www.johndcook.com/blog/skewness_kurtosis/ + n1 = nobs[lab, j] + n = n1 + 1 + + nobs[lab, j] = n + delta = val - M1[lab, j] + delta_n = delta / n + delta_n2 = delta_n * delta_n + term1 = delta * delta_n * n1 + + M1[lab, j] += delta_n + M4[lab, j] += (term1 * delta_n2 * (n*n - 3*n + 3) + + 6 * delta_n2 * M2[lab, j] + - 4 * delta_n * M3[lab, j]) + M3[lab, j] += term1 * delta_n * (n - 2) - 3 * delta_n * M2[lab, j] + M2[lab, j] += term1 + elif not skipna: + M1[lab, j] = NaN + M2[lab, j] = NaN + M3[lab, j] = NaN + M4[lab, j] = NaN + + for i in range(ngroups): + for j in range(K): + ct = nobs[i, j] + if ct < 4: + if result_mask is not None: + result_mask[i, j] = 1 + out[i, j] = NaN + elif M2[i, j] == 0: + out[i, j] = 0 + else: + num = ct * (ct + 1) * (ct - 1) * M4[i, j] + den = (ct - 2) * (ct - 3) * M2[i, j] ** 2 + adj = 3.0 * (ct - 1) ** 2 / ((ct - 2) * (ct - 3)) + out[i, j] = num / den - adj + + @cython.wraparound(False) @cython.boundscheck(False) def group_mean( @@ -978,6 +1175,7 @@ def group_mean( bint is_datetimelike=False, const uint8_t[:, ::1] mask=None, uint8_t[:, ::1] result_mask=None, + bint skipna=True, ) -> None: """ Compute the mean per label given a label 
assignment for each value. @@ -1003,6 +1201,8 @@ def group_mean( Mask of the input values. result_mask : ndarray[bool, ndim=2], optional Mask of the out array + skipna : bool, optional + If True, ignore nans in `values`. Notes ----- @@ -1016,7 +1216,7 @@ def group_mean( mean_t[:, ::1] sumx, compensation int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) - bint isna_entry, uses_mask = mask is not None + bint isna_entry, isna_result, uses_mask = mask is not None assert min_count == -1, "'min_count' only used in sum and prod" @@ -1056,6 +1256,23 @@ def group_mean( else: isna_entry = _treat_as_na(val, is_datetimelike) + if not skipna: + if uses_mask: + isna_result = result_mask[lab, j] + elif is_datetimelike: + # With group_mean, we cannot just use _treat_as_na bc + # datetimelike dtypes get cast to float64 instead of + # to int64. + isna_result = sumx[lab, j] == NPY_NAT + else: + isna_result = _treat_as_na(sumx[lab, j], is_datetimelike) + + if isna_result: + # If sum is already NA, don't add to it. This is important for + # datetimelike because adding a value to NPY_NAT may not result + # in NPY_NAT + continue + if not isna_entry: nobs[lab, j] += 1 y = val - compensation[lab, j] @@ -1069,6 +1286,14 @@ def group_mean( # because of no gil compensation[lab, j] = 0. sumx[lab, j] = t + elif not skipna: + # Set the nobs to 0 so that in case of datetimelike, + # dividing NPY_NAT by nobs may not result in a NPY_NAT + nobs[lab, j] = 0 + if uses_mask: + result_mask[lab, j] = True + else: + sumx[lab, j] = nan_val for i in range(ncounts): for j in range(K): @@ -1157,13 +1382,13 @@ def group_ohlc( @cython.boundscheck(False) @cython.wraparound(False) def group_quantile( - ndarray[float64_t, ndim=2] out, + float64_t[:, ::1] out, ndarray[numeric_t, ndim=1] values, - ndarray[intp_t] labels, + const intp_t[::1] labels, const uint8_t[:] mask, const float64_t[:] qs, - ndarray[int64_t] starts, - ndarray[int64_t] ends, + const int64_t[::1] starts, + const int64_t[::1] ends, str interpolation, uint8_t[:, ::1] result_mask, bint is_datetimelike, @@ -1284,7 +1509,9 @@ def group_quantile( elif interp == INTERPOLATION_MIDPOINT: out[i, k] = (val + next_val) / 2.0 elif interp == INTERPOLATION_NEAREST: - if frac > .5 or (frac == .5 and q_val > .5): # Always OK? + if frac > .5 or (frac == .5 and idx % 2 == 1): + # If quantile lies in the middle of two indexes, + # take the even index, as np.quantile. 
out[i, k] = next_val else: out[i, k] = val @@ -1364,7 +1591,7 @@ cdef inline void _check_below_mincount( uint8_t[:, ::1] result_mask, Py_ssize_t ncounts, Py_ssize_t K, - int64_t[:, ::1] nobs, + const int64_t[:, ::1] nobs, int64_t min_count, mincount_t[:, ::1] resx, ) noexcept: @@ -1411,19 +1638,18 @@ cdef inline void _check_below_mincount( out[i, j] = 0 -# TODO(cython3): GH#31710 use memorviews once cython 0.30 is released so we can -# use `const numeric_object_t[:, :] values` @cython.wraparound(False) @cython.boundscheck(False) def group_last( numeric_object_t[:, ::1] out, int64_t[::1] counts, - ndarray[numeric_object_t, ndim=2] values, + const numeric_object_t[:, :] values, const intp_t[::1] labels, const uint8_t[:, :] mask, uint8_t[:, ::1] result_mask=None, Py_ssize_t min_count=-1, bint is_datetimelike=False, + bint skipna=True, ) -> None: """ Only aggregates on axis=0 @@ -1458,34 +1684,38 @@ def group_last( for j in range(K): val = values[i, j] - if uses_mask: - isna_entry = mask[i, j] - else: - isna_entry = _treat_as_na(val, is_datetimelike) + if skipna: + if uses_mask: + isna_entry = mask[i, j] + else: + isna_entry = _treat_as_na(val, is_datetimelike) + if isna_entry: + continue - if not isna_entry: - nobs[lab, j] += 1 - resx[lab, j] = val + nobs[lab, j] += 1 + resx[lab, j] = val + + if uses_mask and not skipna: + result_mask[lab, j] = mask[i, j] _check_below_mincount( out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx ) -# TODO(cython3): GH#31710 use memorviews once cython 0.30 is released so we can -# use `const numeric_object_t[:, :] values` @cython.wraparound(False) @cython.boundscheck(False) def group_nth( numeric_object_t[:, ::1] out, int64_t[::1] counts, - ndarray[numeric_object_t, ndim=2] values, + const numeric_object_t[:, :] values, const intp_t[::1] labels, const uint8_t[:, :] mask, uint8_t[:, ::1] result_mask=None, int64_t min_count=-1, int64_t rank=1, bint is_datetimelike=False, + bint skipna=True, ) -> None: """ Only aggregates on axis=0 @@ -1520,15 +1750,19 @@ def group_nth( for j in range(K): val = values[i, j] - if uses_mask: - isna_entry = mask[i, j] - else: - isna_entry = _treat_as_na(val, is_datetimelike) + if skipna: + if uses_mask: + isna_entry = mask[i, j] + else: + isna_entry = _treat_as_na(val, is_datetimelike) + if isna_entry: + continue - if not isna_entry: - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + if uses_mask and not skipna: + result_mask[lab, j] = mask[i, j] _check_below_mincount( out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx @@ -1631,6 +1865,7 @@ cdef group_min_max( bint compute_max=True, const uint8_t[:, ::1] mask=None, uint8_t[:, ::1] result_mask=None, + bint skipna=True, ): """ Compute minimum/maximum of columns of `values`, in row groups `labels`. @@ -1658,6 +1893,8 @@ cdef group_min_max( result_mask : ndarray[bool, ndim=2], optional If not None, these specify locations in the output that are NA. Modified in-place. + skipna : bool, default True + If True, ignore nans in `values`. 
Notes ----- @@ -1666,17 +1903,18 @@ cdef group_min_max( """ cdef: Py_ssize_t i, j, N, K, lab, ngroups = len(counts) - numeric_t val + numeric_t val, nan_val numeric_t[:, ::1] group_min_or_max int64_t[:, ::1] nobs bint uses_mask = mask is not None - bint isna_entry + bint isna_entry, isna_result if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") min_count = max(min_count, 1) nobs = np.zeros((out).shape, dtype=np.int64) + nan_val = _get_na_val(0, is_datetimelike) group_min_or_max = np.empty_like(out) group_min_or_max[:] = _get_min_or_max(0, compute_max, is_datetimelike) @@ -1698,6 +1936,18 @@ cdef group_min_max( else: isna_entry = _treat_as_na(val, is_datetimelike) + if not skipna: + if uses_mask: + isna_result = result_mask[lab, j] + else: + isna_result = _treat_as_na( + group_min_or_max[lab, j], is_datetimelike + ) + + if isna_result: + # If current min/max is already NA, it will always be NA + continue + if not isna_entry: nobs[lab, j] += 1 if compute_max: @@ -1706,6 +1956,11 @@ cdef group_min_max( else: if val < group_min_or_max[lab, j]: group_min_or_max[lab, j] = val + elif not skipna: + if uses_mask: + result_mask[lab, j] = True + else: + group_min_or_max[lab, j] = nan_val _check_below_mincount( out, uses_mask, result_mask, ngroups, K, nobs, min_count, group_min_or_max @@ -1767,6 +2022,7 @@ def group_idxmin_idxmax( Py_ssize_t i, j, N, K, lab numeric_object_t val numeric_object_t[:, ::1] group_min_or_max + uint8_t[:, ::1] seen bint uses_mask = mask is not None bint isna_entry bint compute_max = name == "idxmax" @@ -1780,13 +2036,10 @@ def group_idxmin_idxmax( if numeric_object_t is object: group_min_or_max = np.empty((out).shape, dtype=object) + seen = np.zeros((out).shape, dtype=np.uint8) else: group_min_or_max = np.empty_like(out, dtype=values.dtype) - if N > 0 and K > 0: - # When N or K is zero, we never use group_min_or_max - group_min_or_max[:] = _get_min_or_max( - values[0, 0], compute_max, is_datetimelike - ) + seen = np.zeros_like(out, dtype=np.uint8) # When using transform, we need a valid value for take in the case # a category is not observed; these values will be dropped @@ -1802,6 +2055,7 @@ def group_idxmin_idxmax( if not skipna and out[lab, j] == -1: # Once we've hit NA there is no going back continue + val = values[i, j] if uses_mask: @@ -1810,10 +2064,14 @@ def group_idxmin_idxmax( isna_entry = _treat_as_na(val, is_datetimelike) if isna_entry: - if not skipna: + if not skipna or not seen[lab, j]: out[lab, j] = -1 else: - if compute_max: + if not seen[lab, j]: + seen[lab, j] = True + group_min_or_max[lab, j] = val + out[lab, j] = i + elif compute_max: if val > group_min_or_max[lab, j]: group_min_or_max[lab, j] = val out[lab, j] = i @@ -1834,6 +2092,7 @@ def group_max( bint is_datetimelike=False, const uint8_t[:, ::1] mask=None, uint8_t[:, ::1] result_mask=None, + bint skipna=True, ) -> None: """See group_min_max.__doc__""" group_min_max( @@ -1846,6 +2105,7 @@ def group_max( compute_max=True, mask=mask, result_mask=result_mask, + skipna=skipna, ) @@ -1860,6 +2120,7 @@ def group_min( bint is_datetimelike=False, const uint8_t[:, ::1] mask=None, uint8_t[:, ::1] result_mask=None, + bint skipna=True, ) -> None: """See group_min_max.__doc__""" group_min_max( @@ -1872,6 +2133,7 @@ def group_min( compute_max=False, mask=mask, result_mask=result_mask, + skipna=skipna, ) diff --git a/pandas/_libs/hashing.pyx b/pandas/_libs/hashing.pyx index be6958e3315e9..a1fd70529efa7 100644 --- a/pandas/_libs/hashing.pyx +++ b/pandas/_libs/hashing.pyx @@ -23,7 
+23,7 @@ from pandas._libs.util cimport is_nan @cython.boundscheck(False) def hash_object_array( - ndarray[object] arr, str key, str encoding="utf8" + ndarray[object, ndim=1] arr, str key, str encoding="utf8" ) -> np.ndarray[np.uint64]: """ Parameters @@ -68,7 +68,11 @@ def hash_object_array( # create an array of bytes vecs = malloc(n * sizeof(char *)) + if vecs is NULL: + raise MemoryError() lens = malloc(n * sizeof(uint64_t)) + if lens is NULL: + raise MemoryError() for i in range(n): val = arr[i] diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index eaec9e8462450..0480ee54ffb4e 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -41,7 +41,7 @@ cdef class HashTable: cdef class UInt64HashTable(HashTable): cdef kh_uint64_t *table - cdef int64_t na_position + cdef Py_ssize_t na_position cdef bint uses_mask cpdef get_item(self, uint64_t val) @@ -51,7 +51,7 @@ cdef class UInt64HashTable(HashTable): cdef class Int64HashTable(HashTable): cdef kh_int64_t *table - cdef int64_t na_position + cdef Py_ssize_t na_position cdef bint uses_mask cpdef get_item(self, int64_t val) @@ -61,7 +61,7 @@ cdef class Int64HashTable(HashTable): cdef class UInt32HashTable(HashTable): cdef kh_uint32_t *table - cdef int64_t na_position + cdef Py_ssize_t na_position cdef bint uses_mask cpdef get_item(self, uint32_t val) @@ -71,7 +71,7 @@ cdef class UInt32HashTable(HashTable): cdef class Int32HashTable(HashTable): cdef kh_int32_t *table - cdef int64_t na_position + cdef Py_ssize_t na_position cdef bint uses_mask cpdef get_item(self, int32_t val) @@ -81,7 +81,7 @@ cdef class Int32HashTable(HashTable): cdef class UInt16HashTable(HashTable): cdef kh_uint16_t *table - cdef int64_t na_position + cdef Py_ssize_t na_position cdef bint uses_mask cpdef get_item(self, uint16_t val) @@ -91,7 +91,7 @@ cdef class UInt16HashTable(HashTable): cdef class Int16HashTable(HashTable): cdef kh_int16_t *table - cdef int64_t na_position + cdef Py_ssize_t na_position cdef bint uses_mask cpdef get_item(self, int16_t val) @@ -101,7 +101,7 @@ cdef class Int16HashTable(HashTable): cdef class UInt8HashTable(HashTable): cdef kh_uint8_t *table - cdef int64_t na_position + cdef Py_ssize_t na_position cdef bint uses_mask cpdef get_item(self, uint8_t val) @@ -111,7 +111,7 @@ cdef class UInt8HashTable(HashTable): cdef class Int8HashTable(HashTable): cdef kh_int8_t *table - cdef int64_t na_position + cdef Py_ssize_t na_position cdef bint uses_mask cpdef get_item(self, int8_t val) @@ -121,7 +121,7 @@ cdef class Int8HashTable(HashTable): cdef class Float64HashTable(HashTable): cdef kh_float64_t *table - cdef int64_t na_position + cdef Py_ssize_t na_position cdef bint uses_mask cpdef get_item(self, float64_t val) @@ -131,7 +131,7 @@ cdef class Float64HashTable(HashTable): cdef class Float32HashTable(HashTable): cdef kh_float32_t *table - cdef int64_t na_position + cdef Py_ssize_t na_position cdef bint uses_mask cpdef get_item(self, float32_t val) @@ -141,7 +141,7 @@ cdef class Float32HashTable(HashTable): cdef class Complex64HashTable(HashTable): cdef kh_complex64_t *table - cdef int64_t na_position + cdef Py_ssize_t na_position cdef bint uses_mask cpdef get_item(self, complex64_t val) @@ -151,7 +151,7 @@ cdef class Complex64HashTable(HashTable): cdef class Complex128HashTable(HashTable): cdef kh_complex128_t *table - cdef int64_t na_position + cdef Py_ssize_t na_position cdef bint uses_mask cpdef get_item(self, complex128_t val) @@ -174,16 +174,16 @@ cdef class StringHashTable(HashTable): cdef struct 
Int64VectorData: int64_t *data - Py_ssize_t n, m + Py_ssize_t size, capacity cdef class Vector: cdef bint external_view_exists cdef class Int64Vector(Vector): - cdef Int64VectorData *data + cdef Int64VectorData data cdef ndarray ao - cdef resize(self) + cdef resize(self, Py_ssize_t new_size) cpdef ndarray to_array(self) cdef void append(self, int64_t x) noexcept cdef extend(self, int64_t[:] x) diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi index 555ec73acd9b2..5ee359d84a6ed 100644 --- a/pandas/_libs/hashtable.pyi +++ b/pandas/_libs/hashtable.pyi @@ -1,7 +1,8 @@ +from collections.abc import Hashable from typing import ( Any, - Hashable, Literal, + overload, ) import numpy as np @@ -15,7 +16,7 @@ def unique_label_indices( class Factorizer: count: int uniques: Any - def __init__(self, size_hint: int) -> None: ... + def __init__(self, size_hint: int, uses_mask: bool = False) -> None: ... def get_count(self) -> int: ... def factorize( self, @@ -24,6 +25,9 @@ class Factorizer: na_value=..., mask=..., ) -> npt.NDArray[np.intp]: ... + def hash_inner_join( + self, values: np.ndarray, mask=... + ) -> tuple[np.ndarray, np.ndarray]: ... class ObjectFactorizer(Factorizer): table: PyObjectHashTable @@ -180,18 +184,33 @@ class HashTable: na_value: object = ..., mask=..., ) -> npt.NDArray[np.intp]: ... + @overload def unique( self, values: np.ndarray, # np.ndarray[subclass-specific] - return_inverse: bool = ..., - mask=..., - ) -> ( - tuple[ - np.ndarray, # np.ndarray[subclass-specific] - npt.NDArray[np.intp], - ] - | np.ndarray - ): ... # np.ndarray[subclass-specific] + *, + return_inverse: Literal[False] = ..., + mask: None = ..., + ) -> np.ndarray: ... # np.ndarray[subclass-specific] + @overload + def unique( + self, + values: np.ndarray, # np.ndarray[subclass-specific] + *, + return_inverse: Literal[True], + mask: None = ..., + ) -> tuple[np.ndarray, npt.NDArray[np.intp]]: ... # np.ndarray[subclass-specific] + @overload + def unique( + self, + values: np.ndarray, # np.ndarray[subclass-specific] + *, + return_inverse: Literal[False] = ..., + mask: npt.NDArray[np.bool_], + ) -> tuple[ + np.ndarray, + npt.NDArray[np.bool_], + ]: ... # np.ndarray[subclass-specific] def factorize( self, values: np.ndarray, # np.ndarray[subclass-specific] @@ -200,6 +219,9 @@ class HashTable: mask=..., ignore_na: bool = True, ) -> tuple[np.ndarray, npt.NDArray[np.intp]]: ... # np.ndarray[subclass-specific] + def hash_inner_join( + self, values: np.ndarray, mask=... + ) -> tuple[np.ndarray, np.ndarray]: ... class Complex128HashTable(HashTable): ... class Complex64HashTable(HashTable): ... 
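Taken together, the `group_mean`, `group_last`, `group_nth`, and `group_min_max` hunks earlier in this patch thread a `skipna` flag through the grouped reductions: with `skipna=False`, the first NA seen in a group poisons that group's result (including the NPY_NAT bookkeeping for datetimelike data) instead of being dropped. A minimal sketch of the intended user-visible behavior, assuming a pandas build whose GroupBy reductions already expose this flag (the Python-level signatures are not part of this diff):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1.0, np.nan, 3.0]})

# Default skipna=True: the NaN in group "a" is ignored, so "a" -> 1.0.
print(df.groupby("key")["val"].min())

# skipna=False: the NaN propagates, so "a" -> NaN while "b" -> 3.0.
# This is the branch the isna_result checks above implement in Cython.
print(df.groupby("key")["val"].min(skipna=False))
```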
diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index ccac3d0b50d45..b5ae5a3440f39 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -1,8 +1,4 @@ cimport cython -from cpython.mem cimport ( - PyMem_Free, - PyMem_Malloc, -) from cpython.ref cimport ( Py_INCREF, PyObject, @@ -11,6 +7,7 @@ from libc.stdlib cimport ( free, malloc, ) +from libc.string cimport memcpy import numpy as np @@ -33,7 +30,10 @@ from pandas._libs.khash cimport ( kh_python_hash_func, khiter_t, ) -from pandas._libs.missing cimport checknull +from pandas._libs.missing cimport ( + checknull, + is_matching_na, +) def get_hashtable_trace_domain(): @@ -73,7 +73,7 @@ cdef class Factorizer: cdef readonly: Py_ssize_t count - def __cinit__(self, size_hint: int): + def __cinit__(self, size_hint: int, uses_mask: bool = False): self.count = 0 def get_count(self) -> int: @@ -82,13 +82,16 @@ cdef class Factorizer: def factorize(self, values, na_sentinel=-1, na_value=None, mask=None) -> np.ndarray: raise NotImplementedError + def hash_inner_join(self, values, mask=None): + raise NotImplementedError + cdef class ObjectFactorizer(Factorizer): cdef public: PyObjectHashTable table ObjectVector uniques - def __cinit__(self, size_hint: int): + def __cinit__(self, size_hint: int, uses_mask: bool = False): self.table = PyObjectHashTable(size_hint) self.uniques = ObjectVector() diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index c0723392496c1..eae393f33bfd3 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -3,7 +3,7 @@ Template for each `dtype` helper function for hashtable WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ - +from cpython.unicode cimport PyUnicode_AsUTF8 {{py: @@ -98,7 +98,6 @@ from pandas._libs.khash cimport ( # VectorData # ---------------------------------------------------------------------- -from pandas._libs.tslibs.util cimport get_c_string from pandas._libs.missing cimport C_NA @@ -133,7 +132,7 @@ dtypes = [('Complex128', 'complex128', 'khcomplex128_t'), ctypedef struct {{name}}VectorData: {{c_type}} *data - Py_ssize_t n, m + Py_ssize_t size, capacity {{endif}} @@ -143,8 +142,8 @@ ctypedef struct {{name}}VectorData: cdef void append_data_{{dtype}}({{name}}VectorData *data, {{c_type}} x) noexcept nogil: - data.data[data.n] = x - data.n += 1 + data.data[data.size] = x + data.size += 1 {{endfor}} @@ -163,8 +162,9 @@ ctypedef fused vector_data: Complex64VectorData StringVectorData -cdef bint needs_resize(vector_data *data) noexcept nogil: - return data.n == data.m + +cdef bint needs_resize(Py_ssize_t nelems, Py_ssize_t capacity) noexcept nogil: + return nelems >= capacity # ---------------------------------------------------------------------- # Vector @@ -204,98 +204,93 @@ cdef class {{name}}Vector(Vector): # Int64Vector is the only one we need exposed for other cython files. 
{{if dtype != 'int64'}} cdef: - {{name}}VectorData *data + {{name}}VectorData data ndarray ao {{endif}} def __cinit__(self): - self.data = <{{name}}VectorData *>PyMem_Malloc( - sizeof({{name}}VectorData)) - if not self.data: - raise MemoryError() - self.data.n = 0 - self.data.m = _INIT_VEC_CAP - self.ao = np.empty(self.data.m, dtype=np.{{dtype}}) + self.data.size = 0 + self.data.capacity = _INIT_VEC_CAP + self.ao = np.empty(self.data.capacity, dtype=np.{{dtype}}) self.data.data = <{{c_type}}*>self.ao.data - cdef resize(self): - self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) - self.ao.resize(self.data.m, refcheck=False) + cdef resize(self, Py_ssize_t new_size): + self.data.capacity = max(new_size, _INIT_VEC_CAP) + self.ao.resize(self.data.capacity, refcheck=False) self.data.data = <{{c_type}}*>self.ao.data - def __dealloc__(self): - if self.data is not NULL: - PyMem_Free(self.data) - self.data = NULL - def __len__(self) -> int: - return self.data.n + return self.data.size cpdef ndarray to_array(self): - if self.data.m != self.data.n: + if self.data.capacity != self.data.size: if self.external_view_exists: # should never happen raise ValueError("should have raised on append()") - self.ao.resize(self.data.n, refcheck=False) - self.data.m = self.data.n + self.ao.resize(self.data.size, refcheck=False) + self.data.capacity = self.data.size self.external_view_exists = True return self.ao cdef void append(self, {{c_type}} x) noexcept: - if needs_resize(self.data): + if needs_resize(self.data.size, self.data.capacity): if self.external_view_exists: raise ValueError("external reference but " "Vector.resize() needed") - self.resize() + self.resize(self.data.capacity * 4) - append_data_{{dtype}}(self.data, x) + append_data_{{dtype}}(&self.data, x) cdef extend(self, const {{c_type}}[:] x): - for i in range(len(x)): - self.append(x[i]) + cdef Py_ssize_t x_size = len(x) + if x_size == 0: + return + + cdef Py_ssize_t needed_size = self.data.size + x_size + if needs_resize(needed_size, self.data.capacity): + if self.external_view_exists: + raise ValueError("external reference but " + "Vector.resize() needed") + self.resize(needed_size) + + memcpy(self.data.data + self.data.size, &x[0], x_size * sizeof({{c_type}})) + self.data.size = needed_size {{endfor}} cdef class StringVector(Vector): cdef: - StringVectorData *data + StringVectorData data def __cinit__(self): - self.data = PyMem_Malloc(sizeof(StringVectorData)) - if not self.data: - raise MemoryError() - self.data.n = 0 - self.data.m = _INIT_VEC_CAP - self.data.data = malloc(self.data.m * sizeof(char *)) - if not self.data.data: + self.data.size = 0 + self.data.capacity = _INIT_VEC_CAP + self.data.data = malloc(self.data.capacity * sizeof(char *)) + if self.data.data is NULL: raise MemoryError() - cdef resize(self): + cdef resize(self, Py_ssize_t new_size): cdef: char **orig_data - Py_ssize_t i, m + Py_ssize_t i, orig_capacity - m = self.data.m - self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) + orig_capacity = self.data.capacity + self.data.capacity = max(self.data.capacity * 4, _INIT_VEC_CAP) orig_data = self.data.data - self.data.data = malloc(self.data.m * sizeof(char *)) - if not self.data.data: + self.data.data = malloc(self.data.capacity * sizeof(char *)) + if self.data.data is NULL: raise MemoryError() - for i in range(m): + for i in range(orig_capacity): self.data.data[i] = orig_data[i] def __dealloc__(self): - if self.data is not NULL: - if self.data.data is not NULL: - free(self.data.data) - PyMem_Free(self.data) - self.data = NULL + 
free(self.data.data) def __len__(self) -> int: - return self.data.n + return self.data.size cpdef ndarray[object, ndim=1] to_array(self): cdef: @@ -303,20 +298,20 @@ cdef class StringVector(Vector): Py_ssize_t n object val - ao = np.empty(self.data.n, dtype=object) - for i in range(self.data.n): + ao = np.empty(self.data.size, dtype=object) + for i in range(self.data.size): val = self.data.data[i] ao[i] = val self.external_view_exists = True - self.data.m = self.data.n + self.data.capacity = self.data.size return ao cdef void append(self, char *x) noexcept: - if needs_resize(self.data): - self.resize() + if needs_resize(self.data.size, self.data.capacity): + self.resize(self.data.capacity * 4) - append_data_string(self.data, x) + append_data_string(&self.data, x) cdef extend(self, ndarray[object] x): for i in range(len(x)): @@ -327,37 +322,37 @@ cdef class ObjectVector(Vector): cdef: PyObject **data - Py_ssize_t n, m + Py_ssize_t size, capacity ndarray ao def __cinit__(self): - self.n = 0 - self.m = _INIT_VEC_CAP + self.size = 0 + self.capacity = _INIT_VEC_CAP self.ao = np.empty(_INIT_VEC_CAP, dtype=object) self.data = self.ao.data def __len__(self) -> int: - return self.n + return self.size cdef append(self, object obj): - if self.n == self.m: + if self.size == self.capacity: if self.external_view_exists: raise ValueError("external reference but " "Vector.resize() needed") - self.m = max(self.m * 2, _INIT_VEC_CAP) - self.ao.resize(self.m, refcheck=False) + self.capacity = max(self.capacity * 2, _INIT_VEC_CAP) + self.ao.resize(self.capacity, refcheck=False) self.data = self.ao.data Py_INCREF(obj) - self.data[self.n] = obj - self.n += 1 + self.data[self.size] = obj + self.size += 1 cpdef ndarray[object, ndim=1] to_array(self): - if self.m != self.n: + if self.capacity != self.size: if self.external_view_exists: raise ValueError("should have raised on append()") - self.ao.resize(self.n, refcheck=False) - self.m = self.n + self.ao.resize(self.size, refcheck=False) + self.capacity = self.size self.external_view_exists = True return self.ao @@ -540,7 +535,7 @@ cdef class {{name}}HashTable(HashTable): int ret = 0 {{c_type}} val khiter_t k - int8_t na_position = self.na_position + Py_ssize_t na_position = self.na_position if self.uses_mask and mask is None: raise NotImplementedError # pragma: no cover @@ -561,6 +556,49 @@ cdef class {{name}}HashTable(HashTable): self.table.vals[k] = i self.na_position = na_position + @cython.wraparound(False) + @cython.boundscheck(False) + def hash_inner_join(self, const {{dtype}}_t[:] values, const uint8_t[:] mask = None) -> tuple[ndarray, ndarray]: + cdef: + Py_ssize_t i, n = len(values) + {{c_type}} val + khiter_t k + Int64Vector locs = Int64Vector() + Int64Vector self_locs = Int64Vector() + Int64VectorData *l + Int64VectorData *sl + Py_ssize_t na_position = self.na_position + + l = &locs.data + sl = &self_locs.data + + if self.uses_mask and mask is None: + raise NotImplementedError # pragma: no cover + + with nogil: + for i in range(n): + if self.uses_mask and mask[i]: + if self.na_position == -1: + continue + if needs_resize(l.size, l.capacity): + with gil: + locs.resize(locs.data.capacity * 4) + self_locs.resize(locs.data.capacity * 4) + append_data_int64(l, i) + append_data_int64(sl, na_position) + else: + val = {{to_c_type}}(values[i]) + k = kh_get_{{dtype}}(self.table, val) + if k != self.table.n_buckets: + if needs_resize(l.size, l.capacity): + with gil: + locs.resize(locs.data.capacity * 4) + self_locs.resize(locs.data.capacity * 4) + 
append_data_int64(l, i) + append_data_int64(sl, self.table.vals[k]) + + return self_locs.to_array(), locs.to_array() + @cython.boundscheck(False) def lookup(self, const {{dtype}}_t[:] values, const uint8_t[:] mask = None) -> ndarray: # -> np.ndarray[np.intp] @@ -571,7 +609,7 @@ cdef class {{name}}HashTable(HashTable): {{c_type}} val khiter_t k intp_t[::1] locs = np.empty(n, dtype=np.intp) - int8_t na_position = self.na_position + Py_ssize_t na_position = self.na_position if self.uses_mask and mask is None: raise NotImplementedError # pragma: no cover @@ -652,7 +690,7 @@ cdef class {{name}}HashTable(HashTable): if return_inverse: labels = np.empty(n, dtype=np.intp) - ud = uniques.data + ud = &uniques.data use_na_value = na_value is not None use_mask = mask is not None if not use_mask and use_result_mask: @@ -662,7 +700,7 @@ cdef class {{name}}HashTable(HashTable): raise NotImplementedError # pragma: no cover result_mask = UInt8Vector() - rmd = result_mask.data + rmd = &result_mask.data if use_mask: mask_values = mask.view("uint8") @@ -700,18 +738,18 @@ cdef class {{name}}HashTable(HashTable): continue seen_na = True - if needs_resize(ud): + if needs_resize(ud.size, ud.capacity): with gil: if uniques.external_view_exists: raise ValueError("external reference to " "uniques held, but " "Vector.resize() needed") - uniques.resize() + uniques.resize(uniques.data.capacity * 4) if result_mask.external_view_exists: raise ValueError("external reference to " "result_mask held, but " "Vector.resize() needed") - result_mask.resize() + result_mask.resize(result_mask.data.capacity * 4) append_data_{{dtype}}(ud, val) append_data_uint8(rmd, 1) continue @@ -722,19 +760,19 @@ cdef class {{name}}HashTable(HashTable): # k hasn't been seen yet k = kh_put_{{dtype}}(self.table, val, &ret) - if needs_resize(ud): + if needs_resize(ud.size, ud.capacity): with gil: if uniques.external_view_exists: raise ValueError("external reference to " "uniques held, but " "Vector.resize() needed") - uniques.resize() + uniques.resize(uniques.data.capacity * 4) if use_result_mask: if result_mask.external_view_exists: raise ValueError("external reference to " "result_mask held, but " "Vector.resize() needed") - result_mask.resize() + result_mask.resize(result_mask.data.capacity * 4) append_data_{{dtype}}(ud, val) if use_result_mask: append_data_uint8(rmd, 0) @@ -755,7 +793,7 @@ cdef class {{name}}HashTable(HashTable): return uniques.to_array(), result_mask.to_array() return uniques.to_array() - def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False, object mask=None): + def unique(self, const {{dtype}}_t[:] values, *, bint return_inverse=False, object mask=None): """ Calculate unique values and labels (no sorting!) 
@@ -846,7 +884,7 @@ cdef class {{name}}HashTable(HashTable): {{name}}VectorData *ud labels = np.empty(n, dtype=np.intp) - ud = uniques.data + ud = &uniques.data with nogil: for i in range(n): @@ -865,9 +903,9 @@ cdef class {{name}}HashTable(HashTable): k = kh_put_{{dtype}}(self.table, val, &ret) self.table.vals[k] = count - if needs_resize(ud): + if needs_resize(ud.size, ud.capacity): with gil: - uniques.resize() + uniques.resize(uniques.data.capacity * 4) append_data_{{dtype}}(ud, val) labels[i] = count count += 1 @@ -883,8 +921,8 @@ cdef class {{name}}Factorizer(Factorizer): {{name}}HashTable table {{name}}Vector uniques - def __cinit__(self, size_hint: int): - self.table = {{name}}HashTable(size_hint) + def __cinit__(self, size_hint: int, uses_mask: bool = False): + self.table = {{name}}HashTable(size_hint, uses_mask=uses_mask) self.uniques = {{name}}Vector() def factorize(self, const {{c_type}}[:] values, @@ -915,6 +953,9 @@ cdef class {{name}}Factorizer(Factorizer): self.count = len(self.uniques) return labels + def hash_inner_join(self, const {{c_type}}[:] values, const uint8_t[:] mask = None) -> tuple[np.ndarray, np.ndarray]: + return self.table.hash_inner_join(values, mask) + {{endfor}} @@ -933,6 +974,9 @@ cdef class StringHashTable(HashTable): kh_destroy_str(self.table) self.table = NULL + def __len__(self) -> int: + return self.table.size + def sizeof(self, deep: bool = False) -> int: overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*) for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t) @@ -953,7 +997,7 @@ cdef class StringHashTable(HashTable): cdef: khiter_t k const char *v - v = get_c_string(val) + v = PyUnicode_AsUTF8(val) k = kh_get_str(self.table, v) if k != self.table.n_buckets: @@ -967,7 +1011,7 @@ cdef class StringHashTable(HashTable): int ret = 0 const char *v - v = get_c_string(key) + v = PyUnicode_AsUTF8(key) k = kh_put_str(self.table, v, &ret) if kh_exist_str(self.table, k): @@ -988,9 +1032,11 @@ cdef class StringHashTable(HashTable): const char **vecs vecs = malloc(n * sizeof(char *)) + if vecs is NULL: + raise MemoryError() for i in range(n): val = values[i] - v = get_c_string(val) + v = PyUnicode_AsUTF8(val) vecs[i] = v with nogil: @@ -1018,15 +1064,17 @@ cdef class StringHashTable(HashTable): # these by-definition *must* be strings vecs = malloc(n * sizeof(char *)) + if vecs is NULL: + raise MemoryError() for i in range(n): val = values[i] if isinstance(val, str): - # GH#31499 if we have a np.str_ get_c_string won't recognize + # GH#31499 if we have a np.str_ PyUnicode_AsUTF8 won't recognize # it as a str, even though isinstance does. - v = get_c_string(val) + v = PyUnicode_AsUTF8(val) else: - v = get_c_string(self.na_string_sentinel) + v = PyUnicode_AsUTF8(self.na_string_sentinel) vecs[i] = v with nogil: @@ -1054,15 +1102,17 @@ cdef class StringHashTable(HashTable): # these by-definition *must* be strings vecs = malloc(n * sizeof(char *)) + if vecs is NULL: + raise MemoryError() for i in range(n): val = values[i] if isinstance(val, str): - # GH#31499 if we have a np.str_ get_c_string won't recognize + # GH#31499 if we have a np.str_ PyUnicode_AsUTF8 won't recognize # it as a str, even though isinstance does. 
- v = get_c_string(val) + v = PyUnicode_AsUTF8(val) else: - v = get_c_string(self.na_string_sentinel) + v = PyUnicode_AsUTF8(self.na_string_sentinel) vecs[i] = v with nogil: @@ -1121,20 +1171,29 @@ cdef class StringHashTable(HashTable): const char **vecs khiter_t k bint use_na_value + bint non_null_na_value if return_inverse: labels = np.zeros(n, dtype=np.intp) uindexer = np.empty(n, dtype=np.int64) use_na_value = na_value is not None + non_null_na_value = not checknull(na_value) # assign pointers and pre-filter out missing (if ignore_na) vecs = malloc(n * sizeof(char *)) + if vecs is NULL: + raise MemoryError() for i in range(n): val = values[i] if (ignore_na and (not isinstance(val, str) - or (use_na_value and val == na_value))): + or (use_na_value and ( + (non_null_na_value and val == na_value) or + (not non_null_na_value and is_matching_na(val, na_value))) + ) + ) + ): # if missing values do not count as unique values (i.e. if # ignore_na is True), we can skip the actual value, and # replace the label with na_sentinel directly @@ -1142,9 +1201,9 @@ cdef class StringHashTable(HashTable): else: # if ignore_na is False, we also stringify NaN/None/etc. try: - v = get_c_string(val) + v = PyUnicode_AsUTF8(val) except UnicodeEncodeError: - v = get_c_string(repr(val)) + v = PyUnicode_AsUTF8(repr(val)) vecs[i] = v # compute @@ -1180,7 +1239,7 @@ cdef class StringHashTable(HashTable): return uniques.to_array(), labels.base # .base -> underlying ndarray return uniques.to_array() - def unique(self, ndarray[object] values, bint return_inverse=False, object mask=None): + def unique(self, ndarray[object] values, *, bint return_inverse=False, object mask=None): """ Calculate unique values and labels (no sorting!) @@ -1400,10 +1459,11 @@ cdef class PyObjectHashTable(HashTable): object val khiter_t k bint use_na_value - + bint non_null_na_value if return_inverse: labels = np.empty(n, dtype=np.intp) use_na_value = na_value is not None + non_null_na_value = not checknull(na_value) for i in range(n): val = values[i] @@ -1411,7 +1471,11 @@ cdef class PyObjectHashTable(HashTable): if ignore_na and ( checknull(val) - or (use_na_value and val == na_value) + or (use_na_value and ( + (non_null_na_value and val == na_value) or + (not non_null_na_value and is_matching_na(val, na_value)) + ) + ) ): # if missing values do not count as unique values (i.e. if # ignore_na is True), skip the hashtable entry for them, and @@ -1438,7 +1502,7 @@ cdef class PyObjectHashTable(HashTable): return uniques.to_array(), labels.base # .base -> underlying ndarray return uniques.to_array() - def unique(self, ndarray[object] values, bint return_inverse=False, object mask=None): + def unique(self, ndarray[object] values, *, bint return_inverse=False, object mask=None): """ Calculate unique values and labels (no sorting!) 
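The vector refactor above renames the `(n, m)` fields to the self-documenting `(size, capacity)`, has `resize()` take an explicit target size, and rewrites `extend()` as one pre-sized reallocation plus a `memcpy` instead of element-wise appends. A rough pure-Python sketch of that amortized-growth pattern (illustrative only; the real code manages a raw C buffer and hands ownership of it to the caller in `to_array`):

```python
import numpy as np

_INIT_VEC_CAP = 32  # stand-in for the real _INIT_VEC_CAP constant


class Int64VectorSketch:
    """Toy model of the size/capacity growth scheme used above."""

    def __init__(self) -> None:
        self.size = 0
        self.capacity = _INIT_VEC_CAP
        self.ao = np.empty(self.capacity, dtype=np.int64)

    def _resize(self, new_size: int) -> None:
        # Mirrors Vector.resize(new_size): grow to at least new_size.
        self.capacity = max(new_size, _INIT_VEC_CAP)
        self.ao = np.resize(self.ao, self.capacity)

    def append(self, x: int) -> None:
        if self.size >= self.capacity:       # needs_resize(size, capacity)
            self._resize(self.capacity * 4)  # geometric growth, as above
        self.ao[self.size] = x
        self.size += 1

    def extend(self, xs: np.ndarray) -> None:
        # One up-front resize and a bulk copy, like the memcpy version.
        needed = self.size + len(xs)
        if needed >= self.capacity:
            self._resize(needed)
        self.ao[self.size:needed] = xs
        self.size = needed

    def to_array(self) -> np.ndarray:
        # Simplified: the Cython version shrinks the buffer and returns
        # it directly, guarding against further resizes afterwards.
        return self.ao[: self.size].copy()
```

Growing geometrically (4x here, matching the Cython code) keeps repeated `append` calls amortized O(1), and the single resize in `extend` avoids reallocating once per element for bulk input.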
diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 336af306d410f..3487f5ebd050d 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -415,20 +415,7 @@ def mode(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None): modes = np.empty(nkeys, dtype=values.dtype) - if htfunc_t is not object: - with nogil: - for k in range(nkeys): - count = counts[k] - if count == max_count: - j += 1 - elif count > max_count: - max_count = count - j = 0 - else: - continue - - modes[j] = keys[k] - else: + with nogil(htfunc_t is not object): for k in range(nkeys): count = counts[k] if count == max_count: @@ -443,7 +430,7 @@ def mode(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None): if na_counter > 0: res_mask = np.zeros(j+1, dtype=np.bool_) - res_mask[j] = True + res_mask[j] = (na_counter == max_count) return modes[:j + 1], res_mask @@ -472,7 +459,7 @@ def _unique_label_indices_{{dtype}}(const {{c_type}}[:] labels) -> ndarray: kh_{{ttype}}_t *table = kh_init_{{ttype}}() {{name}}Vector idx = {{name}}Vector() ndarray[{{c_type}}, ndim=1] arr - {{name}}VectorData *ud = idx.data + {{name}}VectorData *ud = &idx.data kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT)) @@ -480,9 +467,9 @@ def _unique_label_indices_{{dtype}}(const {{c_type}}[:] labels) -> ndarray: for i in range(n): kh_put_{{ttype}}(table, labels[i], &ret) if ret != 0: - if needs_resize(ud): + if needs_resize(ud.size, ud.capacity): with gil: - idx.resize() + idx.resize(idx.data.capacity * 4) append_data_{{ttype}}(ud, i) kh_destroy_{{ttype}}(table) diff --git a/pandas/_libs/include/pandas/datetime/date_conversions.h b/pandas/_libs/include/pandas/datetime/date_conversions.h index e039991847a62..043805a8b25f4 100644 --- a/pandas/_libs/include/pandas/datetime/date_conversions.h +++ b/pandas/_libs/include/pandas/datetime/date_conversions.h @@ -9,6 +9,7 @@ The full license is in the LICENSE file, distributed with this software. #define PY_SSIZE_T_CLEAN #include + #include // Scales value inplace from nanosecond resolution to unit resolution diff --git a/pandas/_libs/include/pandas/parser/io.h b/pandas/_libs/include/pandas/parser/io.h index c707c23b567d2..41f1bb9312724 100644 --- a/pandas/_libs/include/pandas/parser/io.h +++ b/pandas/_libs/include/pandas/parser/io.h @@ -10,9 +10,10 @@ The full license is in the LICENSE file, distributed with this software. 
#pragma once #define PY_SSIZE_T_CLEAN -#include "tokenizer.h" #include +#include "tokenizer.h" + #define FS(source) ((file_source *)source) typedef struct _rd_source { diff --git a/pandas/_libs/include/pandas/parser/pd_parser.h b/pandas/_libs/include/pandas/parser/pd_parser.h index 58a09ae1bba39..543839b5d75bf 100644 --- a/pandas/_libs/include/pandas/parser/pd_parser.h +++ b/pandas/_libs/include/pandas/parser/pd_parser.h @@ -13,9 +13,10 @@ extern "C" { #endif #define PY_SSIZE_T_CLEAN -#include "pandas/parser/tokenizer.h" #include +#include "pandas/parser/tokenizer.h" + typedef struct { int (*to_double)(char *, double *, char, char, int *); int (*floatify)(PyObject *, double *, int *); diff --git a/pandas/_libs/include/pandas/vendored/klib/khash_python.h b/pandas/_libs/include/pandas/vendored/klib/khash_python.h index 5a933b45d9e21..04e2d1e39ce2a 100644 --- a/pandas/_libs/include/pandas/vendored/klib/khash_python.h +++ b/pandas/_libs/include/pandas/vendored/klib/khash_python.h @@ -1,6 +1,10 @@ // Licence at LICENSES/KLIB_LICENSE +#pragma once + #include + +#include #include typedef struct { @@ -12,20 +16,8 @@ typedef struct { double imag; } khcomplex128_t; -// khash should report usage to tracemalloc -#if PY_VERSION_HEX >= 0x03060000 -#include -#if PY_VERSION_HEX < 0x03070000 -#define PyTraceMalloc_Track _PyTraceMalloc_Track -#define PyTraceMalloc_Untrack _PyTraceMalloc_Untrack -#endif -#else -#define PyTraceMalloc_Track(...) -#define PyTraceMalloc_Untrack(...) -#endif - static const int KHASH_TRACE_DOMAIN = 424242; -void *traced_malloc(size_t size) { +static void *traced_malloc(size_t size) { void *ptr = malloc(size); if (ptr != NULL) { PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, size); @@ -33,7 +25,7 @@ void *traced_malloc(size_t size) { return ptr; } -void *traced_calloc(size_t num, size_t size) { +static void *traced_calloc(size_t num, size_t size) { void *ptr = calloc(num, size); if (ptr != NULL) { PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, num * size); @@ -41,18 +33,16 @@ void *traced_calloc(size_t num, size_t size) { return ptr; } -void *traced_realloc(void *old_ptr, size_t size) { +static void *traced_realloc(void *old_ptr, size_t size) { + PyTraceMalloc_Untrack(KHASH_TRACE_DOMAIN, (uintptr_t)old_ptr); void *ptr = realloc(old_ptr, size); if (ptr != NULL) { - if (old_ptr != ptr) { - PyTraceMalloc_Untrack(KHASH_TRACE_DOMAIN, (uintptr_t)old_ptr); - } PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, size); } return ptr; } -void traced_free(void *ptr) { +static void traced_free(void *ptr) { if (ptr != NULL) { PyTraceMalloc_Untrack(KHASH_TRACE_DOMAIN, (uintptr_t)ptr); } @@ -165,7 +155,7 @@ KHASH_MAP_INIT_COMPLEX128(complex128, size_t) // NaN-floats should be in the same equivalency class, see GH 22119 static inline int floatobject_cmp(PyFloatObject *a, PyFloatObject *b) { - return (Py_IS_NAN(PyFloat_AS_DOUBLE(a)) && Py_IS_NAN(PyFloat_AS_DOUBLE(b))) || + return (isnan(PyFloat_AS_DOUBLE(a)) && isnan(PyFloat_AS_DOUBLE(b))) || (PyFloat_AS_DOUBLE(a) == PyFloat_AS_DOUBLE(b)); } @@ -173,12 +163,12 @@ static inline int floatobject_cmp(PyFloatObject *a, PyFloatObject *b) { // PyObject_RichCompareBool for complexobjects has a different behavior // needs to be replaced static inline int complexobject_cmp(PyComplexObject *a, PyComplexObject *b) { - return (Py_IS_NAN(a->cval.real) && Py_IS_NAN(b->cval.real) && - Py_IS_NAN(a->cval.imag) && Py_IS_NAN(b->cval.imag)) || - (Py_IS_NAN(a->cval.real) && Py_IS_NAN(b->cval.real) && + return (isnan(a->cval.real) && 
isnan(b->cval.real) && isnan(a->cval.imag) && + isnan(b->cval.imag)) || + (isnan(a->cval.real) && isnan(b->cval.real) && a->cval.imag == b->cval.imag) || - (a->cval.real == b->cval.real && Py_IS_NAN(a->cval.imag) && - Py_IS_NAN(b->cval.imag)) || + (a->cval.real == b->cval.real && isnan(a->cval.imag) && + isnan(b->cval.imag)) || (a->cval.real == b->cval.real && a->cval.imag == b->cval.imag); } @@ -216,7 +206,8 @@ static inline int pyobject_cmp(PyObject *a, PyObject *b) { if (PyComplex_CheckExact(a)) { return complexobject_cmp((PyComplexObject *)a, (PyComplexObject *)b); } - if (PyTuple_CheckExact(a)) { + if (PyTuple_Check(a)) { + // compare tuple subclasses as builtin tuples return tupleobject_cmp((PyTupleObject *)a, (PyTupleObject *)b); } // frozenset isn't yet supported @@ -232,7 +223,7 @@ static inline int pyobject_cmp(PyObject *a, PyObject *b) { static inline Py_hash_t _Pandas_HashDouble(double val) { // Since Python3.10, nan is no longer has hash 0 - if (Py_IS_NAN(val)) { + if (isnan(val)) { return 0; } #if PY_VERSION_HEX < 0x030A0000 @@ -320,7 +311,8 @@ static inline khuint32_t kh_python_hash_func(PyObject *key) { // because complex(k,0) == k holds for any int-object k // and kh_complex128_hash_func doesn't respect it hash = complexobject_hash((PyComplexObject *)key); - } else if (PyTuple_CheckExact(key)) { + } else if (PyTuple_Check(key)) { + // hash tuple subclasses as builtin tuples hash = tupleobject_hash((PyTupleObject *)key); } else { hash = PyObject_Hash(key); diff --git a/pandas/_libs/include/pandas/vendored/ujson/lib/ultrajson.h b/pandas/_libs/include/pandas/vendored/ujson/lib/ultrajson.h index 0d62bb0ba915c..51fdbc50bba57 100644 --- a/pandas/_libs/include/pandas/vendored/ujson/lib/ultrajson.h +++ b/pandas/_libs/include/pandas/vendored/ujson/lib/ultrajson.h @@ -170,8 +170,8 @@ typedef void (*JSPFN_ITERBEGIN)(JSOBJ obj, JSONTypeContext *tc); typedef int (*JSPFN_ITERNEXT)(JSOBJ obj, JSONTypeContext *tc); typedef void (*JSPFN_ITEREND)(JSOBJ obj, JSONTypeContext *tc); typedef JSOBJ (*JSPFN_ITERGETVALUE)(JSOBJ obj, JSONTypeContext *tc); -typedef char *(*JSPFN_ITERGETNAME)(JSOBJ obj, JSONTypeContext *tc, - size_t *outLen); +typedef const char *(*JSPFN_ITERGETNAME)(JSOBJ obj, JSONTypeContext *tc, + size_t *outLen); typedef void *(*JSPFN_MALLOC)(size_t size); typedef void (*JSPFN_FREE)(void *pptr); typedef void *(*JSPFN_REALLOC)(void *base, size_t size); diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi index 75db47bf3160e..3af2856d2fbbf 100644 --- a/pandas/_libs/index.pyi +++ b/pandas/_libs/index.pyi @@ -2,7 +2,10 @@ import numpy as np from pandas._typing import npt -from pandas import MultiIndex +from pandas import ( + Index, + MultiIndex, +) from pandas.core.arrays import ExtensionArray multiindex_nulls_shift: int @@ -50,6 +53,7 @@ class UInt32Engine(IndexEngine): ... class UInt16Engine(IndexEngine): ... class UInt8Engine(IndexEngine): ... class ObjectEngine(IndexEngine): ... +class StringEngine(IndexEngine): ... class DatetimeEngine(Int64Engine): ... class TimedeltaEngine(DatetimeEngine): ... class PeriodEngine(Int64Engine): ... @@ -68,15 +72,18 @@ class MaskedUInt16Engine(MaskedIndexEngine): ... class MaskedUInt8Engine(MaskedIndexEngine): ... class MaskedBoolEngine(MaskedUInt8Engine): ... +class StringObjectEngine(ObjectEngine): + def __init__(self, values: object, na_value) -> None: ... 
+ class BaseMultiIndexCodesEngine: levels: list[np.ndarray] - offsets: np.ndarray # ndarray[uint64_t, ndim=1] + offsets: np.ndarray # np.ndarray[..., ndim=1] def __init__( self, - levels: list[np.ndarray], # all entries hashable + levels: list[Index], # all entries hashable labels: list[np.ndarray], # all entries integer-dtyped - offsets: np.ndarray, # np.ndarray[np.uint64, ndim=1] + offsets: np.ndarray, # np.ndarray[..., ndim=1] ) -> None: ... def get_indexer(self, target: npt.NDArray[np.object_]) -> npt.NDArray[np.intp]: ... def _extract_level_codes(self, target: MultiIndex) -> np.ndarray: ... diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 0dc139781f58d..c219d0b63870f 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -9,7 +9,6 @@ from numpy cimport ( intp_t, ndarray, uint8_t, - uint64_t, ) cnp.import_array() @@ -43,6 +42,8 @@ from pandas._libs.missing cimport ( is_matching_na, ) +from decimal import InvalidOperation + # Defines shift of MultiIndex codes to avoid negative codes (missing values) multiindex_nulls_shift = 2 @@ -96,6 +97,20 @@ cdef ndarray _get_bool_indexer(ndarray values, object val, ndarray mask = None): return indexer.view(bool) +cdef _maybe_resize_array(ndarray values, Py_ssize_t loc, Py_ssize_t max_length): + """ + Resize array if loc is out of bounds. + """ + cdef: + Py_ssize_t n = len(values) + + if loc >= n: + while loc >= n: + n *= 2 + values = np.resize(values, min(n, max_length)) + return values + + # Don't populate hash tables in monotonic indexes larger than this _SIZE_CUTOFF = 1_000_000 @@ -237,17 +252,31 @@ cdef class IndexEngine: return self.sizeof() cpdef _update_from_sliced(self, IndexEngine other, reverse: bool): - self.unique = other.unique - self.need_unique_check = other.need_unique_check + if other.unique: + self.unique = other.unique + self.need_unique_check = other.need_unique_check + if not other.need_monotonic_check and ( other.is_monotonic_increasing or other.is_monotonic_decreasing): - self.need_monotonic_check = other.need_monotonic_check - # reverse=True means the index has been reversed - self.monotonic_inc = other.monotonic_dec if reverse else other.monotonic_inc - self.monotonic_dec = other.monotonic_inc if reverse else other.monotonic_dec + self.need_monotonic_check = 0 + if len(self.values) > 0 and self.values[0] != self.values[-1]: + # reverse=True means the index has been reversed + if reverse: + self.monotonic_inc = other.monotonic_dec + self.monotonic_dec = other.monotonic_inc + else: + self.monotonic_inc = other.monotonic_inc + self.monotonic_dec = other.monotonic_dec + else: + self.monotonic_inc = 1 + self.monotonic_dec = 1 @property def is_unique(self) -> bool: + # for why we check is_monotonic_increasing here, see + # https://github.com/pandas-dev/pandas/pull/55342#discussion_r1361405781 + if self.need_monotonic_check: + self.is_monotonic_increasing if self.need_unique_check: self._do_unique_check() @@ -281,7 +310,7 @@ cdef class IndexEngine: values = self.values self.monotonic_inc, self.monotonic_dec, is_strict_monotonic = \ self._call_monotonic(values) - except TypeError: + except (TypeError, InvalidOperation, ValueError): self.monotonic_inc = 0 self.monotonic_dec = 0 is_strict_monotonic = 0 @@ -450,27 +479,18 @@ cdef class IndexEngine: # found if val in d: key = val - + result = _maybe_resize_array( + result, + count + len(d[key]) - 1, + max_alloc + ) for j in d[key]: - - # realloc if needed - if count >= n_alloc: - n_alloc *= 2 - if n_alloc > max_alloc: - n_alloc = max_alloc - result = 
np.resize(result, n_alloc) - result[count] = j count += 1 # value not found else: - - if count >= n_alloc: - n_alloc *= 2 - if n_alloc > max_alloc: - n_alloc = max_alloc - result = np.resize(result, n_alloc) + result = _maybe_resize_array(result, count, max_alloc) result[count] = -1 count += 1 missing[count_missing] = i @@ -527,6 +547,34 @@ cdef class ObjectEngine(IndexEngine): return loc +cdef class StringEngine(IndexEngine): + + cdef _make_hash_table(self, Py_ssize_t n): + return _hash.StringHashTable(n) + + cdef _check_type(self, object val): + if not isinstance(val, str): + raise KeyError(val) + return str(val) + +cdef class StringObjectEngine(ObjectEngine): + + cdef: + object na_value + + def __init__(self, ndarray values, na_value): + super().__init__(values) + self.na_value = na_value + + cdef _check_type(self, object val): + if isinstance(val, str): + return val + elif checknull(val): + return self.na_value + else: + raise KeyError(val) + + cdef class DatetimeEngine(Int64Engine): cdef: @@ -677,8 +725,7 @@ cdef class BaseMultiIndexCodesEngine: Keys are located by first locating each component against the respective level, then locating (the integer representation of) codes. """ - def __init__(self, object levels, object labels, - ndarray[uint64_t, ndim=1] offsets): + def __init__(self, object levels, object labels, ndarray offsets): """ Parameters ---------- @@ -686,7 +733,7 @@ cdef class BaseMultiIndexCodesEngine: Levels of the MultiIndex. labels : list-like of numpy arrays of integer dtype Labels of the MultiIndex. - offsets : numpy array of uint64 dtype + offsets : numpy array of int dtype Pre-calculated offsets, one for each level of the index. """ self.levels = levels @@ -696,8 +743,9 @@ cdef class BaseMultiIndexCodesEngine: # with positive integers (-1 for NaN becomes 1). This enables us to # differentiate between values that are missing in other and matching # NaNs. We will set values that are not found to 0 later: - labels_arr = np.array(labels, dtype="int64").T + multiindex_nulls_shift - codes = labels_arr.astype("uint64", copy=False) + codes = np.array(labels).T + codes += multiindex_nulls_shift # inplace sum optimisation + self.level_has_nans = [-1 in lab for lab in labels] # Map each codes combination in the index to an integer unambiguously @@ -709,8 +757,37 @@ cdef class BaseMultiIndexCodesEngine: # integers representing labels: we will use its get_loc and get_indexer self._base.__init__(self, lab_ints) - def _codes_to_ints(self, ndarray[uint64_t] codes) -> np.ndarray: - raise NotImplementedError("Implemented by subclass") # pragma: no cover + def _codes_to_ints(self, ndarray codes) -> np.ndarray: + """ + Transform combination(s) of uint in one uint or Python integer (each), in a + strictly monotonic way (i.e. respecting the lexicographic order of integer + combinations). + + Parameters + ---------- + codes : 1- or 2-dimensional array of dtype uint + Combinations of integers (one per row) + + Returns + ------- + scalar or 1-dimensional array, of dtype _codes_dtype + Integer(s) representing one combination (each). + """ + # To avoid overflows, first make sure we are working with the right dtype: + codes = codes.astype(self._codes_dtype, copy=False) + + # Shift the representation of each level by the pre-calculated number of bits: + codes <<= self.offsets # inplace shift optimisation + + # Now sum and OR are in fact interchangeable. This is a simple + # composition of the (disjunct) significant bits of each level (i.e. 
+ # each column in "codes") in a single positive integer (per row): + if codes.ndim == 1: + # Single key + return np.bitwise_or.reduce(codes) + + # Multiple keys + return np.bitwise_or.reduce(codes, axis=1) def _extract_level_codes(self, target) -> np.ndarray: """ @@ -735,7 +812,7 @@ cdef class BaseMultiIndexCodesEngine: codes[codes > 0] += 1 if self.level_has_nans[i]: codes[target.codes[i] == -1] += 1 - return self._codes_to_ints(np.array(level_codes, dtype="uint64").T) + return self._codes_to_ints(np.array(level_codes, dtype=self._codes_dtype).T) def get_indexer(self, target: np.ndarray) -> np.ndarray: """ @@ -766,7 +843,7 @@ cdef class BaseMultiIndexCodesEngine: raise KeyError(key) # Transform indices into single integer: - lab_int = self._codes_to_ints(np.array(indices, dtype="uint64")) + lab_int = self._codes_to_ints(np.array(indices, dtype=self._codes_dtype)) return self._base.get_loc(self, lab_int) @@ -832,17 +909,31 @@ cdef class SharedEngine: pass cpdef _update_from_sliced(self, ExtensionEngine other, reverse: bool): - self.unique = other.unique - self.need_unique_check = other.need_unique_check + if other.unique: + self.unique = other.unique + self.need_unique_check = other.need_unique_check + if not other.need_monotonic_check and ( other.is_monotonic_increasing or other.is_monotonic_decreasing): - self.need_monotonic_check = other.need_monotonic_check - # reverse=True means the index has been reversed - self.monotonic_inc = other.monotonic_dec if reverse else other.monotonic_inc - self.monotonic_dec = other.monotonic_inc if reverse else other.monotonic_dec + self.need_monotonic_check = 0 + if len(self.values) > 0 and self.values[0] != self.values[-1]: + # reverse=True means the index has been reversed + if reverse: + self.monotonic_inc = other.monotonic_dec + self.monotonic_dec = other.monotonic_inc + else: + self.monotonic_inc = other.monotonic_inc + self.monotonic_dec = other.monotonic_dec + else: + self.monotonic_inc = 1 + self.monotonic_dec = 1 @property def is_unique(self) -> bool: + # for why we check is_monotonic_increasing here, see + # https://github.com/pandas-dev/pandas/pull/55342#discussion_r1361405781 + if self.need_monotonic_check: + self.is_monotonic_increasing if self.need_unique_check: arr = self.values.unique() self.unique = len(arr) == len(self.values) @@ -1193,13 +1284,12 @@ cdef class MaskedIndexEngine(IndexEngine): if PySequence_GetItem(target_mask, i): if na_pos: + result = _maybe_resize_array( + result, + count + len(na_pos) - 1, + max_alloc, + ) for na_idx in na_pos: - # realloc if needed - if count >= n_alloc: - n_alloc *= 2 - if n_alloc > max_alloc: - n_alloc = max_alloc - result[count] = na_idx count += 1 continue @@ -1207,23 +1297,18 @@ cdef class MaskedIndexEngine(IndexEngine): elif val in d: # found key = val - + result = _maybe_resize_array( + result, + count + len(d[key]) - 1, + max_alloc, + ) for j in d[key]: - - # realloc if needed - if count >= n_alloc: - n_alloc *= 2 - if n_alloc > max_alloc: - n_alloc = max_alloc - result[count] = j count += 1 continue # value not found - if count >= n_alloc: - n_alloc += 10_000 - result = np.resize(result, n_alloc) + result = _maybe_resize_array(result, count, max_alloc) result[count] = -1 count += 1 missing[count_missing] = i diff --git a/pandas/_libs/internals.pyi b/pandas/_libs/internals.pyi index ffe6c7730bcdc..a680304d55ea2 100644 --- a/pandas/_libs/internals.pyi +++ b/pandas/_libs/internals.pyi @@ -1,6 +1,8 @@ -from typing import ( +from collections.abc import ( Iterator, Sequence, +) +from typing 
import ( final, overload, ) diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 05c4e7bd5e9dc..8caf7e5fb99b2 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -1,9 +1,10 @@ from collections import defaultdict -import weakref cimport cython +from cpython.object cimport PyObject from cpython.pyport cimport PY_SSIZE_T_MAX from cpython.slice cimport PySlice_GetIndicesEx +from cpython.weakref cimport PyWeakref_NewRef from cython cimport Py_ssize_t import numpy as np @@ -25,6 +26,18 @@ from pandas._libs.util cimport ( is_integer_object, ) +include "free_threading_config.pxi" + +IF CYTHON_COMPATIBLE_WITH_FREE_THREADING: + from cpython.ref cimport Py_DECREF + from cpython.weakref cimport PyWeakref_GetRef +ELSE: + from cpython.weakref cimport PyWeakref_GetObject + + +cdef extern from "Python.h": + PyObject* Py_None + @cython.final @cython.freelist(32) @@ -494,7 +507,7 @@ def get_concat_blkno_indexers(list blknos_list not None): @cython.boundscheck(False) @cython.wraparound(False) def get_blkno_indexers( - int64_t[:] blknos, bint group=True + const int64_t[:] blknos, bint group=True ) -> list[tuple[int, slice | np.ndarray]]: """ Enumerate contiguous runs of integers in ndarray. @@ -588,8 +601,8 @@ def get_blkno_placements(blknos, group: bool = True): @cython.boundscheck(False) @cython.wraparound(False) cpdef update_blklocs_and_blknos( - ndarray[intp_t, ndim=1] blklocs, - ndarray[intp_t, ndim=1] blknos, + const intp_t[:] blklocs, + const intp_t[:] blknos, Py_ssize_t loc, intp_t nblocks, ): @@ -746,7 +759,7 @@ cdef class BlockManager: # ------------------------------------------------------------------- # Block Placement - def _rebuild_blknos_and_blklocs(self) -> None: + cpdef _rebuild_blknos_and_blklocs(self): """ Update mgr._blknos / mgr._blklocs. """ @@ -890,27 +903,48 @@ cdef class BlockValuesRefs: def __cinit__(self, blk: Block | None = None) -> None: if blk is not None: - self.referenced_blocks = [weakref.ref(blk)] + self.referenced_blocks = [PyWeakref_NewRef(blk, None)] else: self.referenced_blocks = [] self.clear_counter = 500 # set reasonably high - def _clear_dead_references(self, force=False) -> None: + cdef _clear_dead_references(self, bint force=False): # Use exponential backoff to decide when we want to clear references # if force=False. Clearing for every insertion causes slowdowns if # all these objects stay alive, e.g. 
df.items() for wide DataFrames # see GH#55245 and GH#55008 + IF CYTHON_COMPATIBLE_WITH_FREE_THREADING: + cdef PyObject* pobj + cdef bint status + if force or len(self.referenced_blocks) > self.clear_counter: - self.referenced_blocks = [ - ref for ref in self.referenced_blocks if ref() is not None - ] + IF CYTHON_COMPATIBLE_WITH_FREE_THREADING: + new_referenced_blocks = [] + for ref in self.referenced_blocks: + status = PyWeakref_GetRef(ref, &pobj) + if status == -1: + return + elif status == 1: + new_referenced_blocks.append(ref) + Py_DECREF(pobj) + self.referenced_blocks = new_referenced_blocks + ELSE: + self.referenced_blocks = [ + ref for ref in self.referenced_blocks + if PyWeakref_GetObject(ref) != Py_None + ] + nr_of_refs = len(self.referenced_blocks) if nr_of_refs < self.clear_counter // 2: self.clear_counter = max(self.clear_counter // 2, 500) elif nr_of_refs > self.clear_counter: self.clear_counter = max(self.clear_counter * 2, nr_of_refs) - def add_reference(self, blk: Block) -> None: + cpdef _add_reference_maybe_locked(self, Block blk): + self._clear_dead_references() + self.referenced_blocks.append(PyWeakref_NewRef(blk, None)) + + cpdef add_reference(self, Block blk): """Adds a new reference to our reference collection. Parameters @@ -918,8 +952,15 @@ cdef class BlockValuesRefs: blk : Block The block that the new references should point to. """ + IF CYTHON_COMPATIBLE_WITH_FREE_THREADING: + with cython.critical_section(self): + self._add_reference_maybe_locked(blk) + ELSE: + self._add_reference_maybe_locked(blk) + + def _add_index_reference_maybe_locked(self, index: object) -> None: self._clear_dead_references() - self.referenced_blocks.append(weakref.ref(blk)) + self.referenced_blocks.append(PyWeakref_NewRef(index, None)) def add_index_reference(self, index: object) -> None: """Adds a new reference to our reference collection when creating an index. @@ -929,8 +970,16 @@ cdef class BlockValuesRefs: index : Index The index that the new reference should point to. """ - self._clear_dead_references() - self.referenced_blocks.append(weakref.ref(index)) + IF CYTHON_COMPATIBLE_WITH_FREE_THREADING: + with cython.critical_section(self): + self._add_index_reference_maybe_locked(index) + ELSE: + self._add_index_reference_maybe_locked(index) + + def _has_reference_maybe_locked(self) -> bool: + self._clear_dead_references(force=True) + # Checking for more references than block pointing to itself + return len(self.referenced_blocks) > 1 def has_reference(self) -> bool: """Checks if block has foreign references. @@ -942,6 +991,8 @@ cdef class BlockValuesRefs: ------- bool """ - self._clear_dead_references(force=True) - # Checking for more references than block pointing to itself - return len(self.referenced_blocks) > 1 + IF CYTHON_COMPATIBLE_WITH_FREE_THREADING: + with cython.critical_section(self): + return self._has_reference_maybe_locked() + ELSE: + return self._has_reference_maybe_locked() diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index a37ab45dd57ed..5d0876591a151 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -167,6 +167,12 @@ cdef class IntervalMixin: """ Return the midpoint of the Interval. + See Also + -------- + Interval.left : Return the left bound for the interval. + Interval.right : Return the right bound for the interval. + Interval.length : Return the length of the interval. 
+ Examples -------- >>> iv = pd.Interval(0, 5) @@ -203,6 +209,12 @@ cdef class IntervalMixin: """ Indicates if an interval is empty, meaning it contains no points. + An interval is considered empty if its `left` and `right` endpoints + are equal, and it is not closed on both sides. This means that the + interval does not include any real points. In the case of an + :class:`pandas.arrays.IntervalArray` or :class:`IntervalIndex`, the + property returns a boolean array indicating the emptiness of each interval. + Returns ------- bool or ndarray @@ -285,7 +297,7 @@ cdef class Interval(IntervalMixin): """ Immutable object implementing an Interval, a bounded slice-like interval. - Parameters + Attributes ---------- left : orderable scalar Left bound for the interval. @@ -377,6 +389,12 @@ cdef class Interval(IntervalMixin): """ Left bound for the interval. + See Also + -------- + Interval.right : Return the right bound for the interval. + numpy.ndarray.left : A similar method in numpy for obtaining + the left endpoint(s) of intervals. + Examples -------- >>> interval = pd.Interval(left=1, right=2, closed='left') @@ -390,6 +408,12 @@ cdef class Interval(IntervalMixin): """ Right bound for the interval. + See Also + -------- + Interval.left : Return the left bound for the interval. + numpy.ndarray.right : A similar method in numpy for obtaining + the right endpoint(s) of intervals. + Examples -------- >>> interval = pd.Interval(left=1, right=2, closed='left') @@ -405,6 +429,13 @@ cdef class Interval(IntervalMixin): Either ``left``, ``right``, ``both`` or ``neither``. + See Also + -------- + Interval.closed_left : Check if the interval is closed on the left side. + Interval.closed_right : Check if the interval is closed on the right side. + Interval.open_left : Check if the interval is open on the left side. + Interval.open_right : Check if the interval is open on the right side. 
+ Examples -------- >>> interval = pd.Interval(left=1, right=2, closed='left') diff --git a/pandas/_libs/intervaltree.pxi.in b/pandas/_libs/intervaltree.pxi.in index a6cec0fb30ecc..b94f60c272e5d 100644 --- a/pandas/_libs/intervaltree.pxi.in +++ b/pandas/_libs/intervaltree.pxi.in @@ -145,12 +145,12 @@ cdef class IntervalTree(IntervalMixin): # overflow -> no match, which is already handled below pass - if result.data.n == old_len: + if result.data.size == old_len: result.append(-1) - elif result.data.n > old_len + 1: + elif result.data.size > old_len + 1: raise KeyError( 'indexer does not intersect a unique set of intervals') - old_len = result.data.n + old_len = result.data.size return result.to_array().astype('intp') def get_indexer_non_unique(self, ndarray[scalar_t, ndim=1] target): @@ -172,10 +172,10 @@ cdef class IntervalTree(IntervalMixin): # overflow -> no match, which is already handled below pass - if result.data.n == old_len: + if result.data.size == old_len: result.append(-1) missing.append(i) - old_len = result.data.n + old_len = result.data.size return (result.to_array().astype('intp'), missing.to_array().astype('intp')) diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index 368abe60d7237..e6bd5a8de39dc 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -225,7 +225,10 @@ def full_outer_join(const intp_t[:] left, const intp_t[:] right, @cython.wraparound(False) @cython.boundscheck(False) -cdef void _get_result_indexer(intp_t[::1] sorter, intp_t[::1] indexer) noexcept nogil: +cdef void _get_result_indexer( + const intp_t[::1] sorter, + intp_t[::1] indexer, +) noexcept nogil: """NOTE: overwrites indexer with the result to avoid allocating another array""" cdef: Py_ssize_t i, n, idx @@ -681,8 +684,8 @@ def outer_join_indexer(ndarray[numeric_object_t] left, ndarray[numeric_object_t] from pandas._libs.hashtable cimport Int64HashTable -def asof_join_backward_on_X_by_Y(ndarray[numeric_t] left_values, - ndarray[numeric_t] right_values, +def asof_join_backward_on_X_by_Y(const numeric_t[:] left_values, + const numeric_t[:] right_values, const int64_t[:] left_by_values, const int64_t[:] right_by_values, bint allow_exact_matches=True, @@ -752,8 +755,8 @@ def asof_join_backward_on_X_by_Y(ndarray[numeric_t] left_values, return left_indexer, right_indexer -def asof_join_forward_on_X_by_Y(ndarray[numeric_t] left_values, - ndarray[numeric_t] right_values, +def asof_join_forward_on_X_by_Y(const numeric_t[:] left_values, + const numeric_t[:] right_values, const int64_t[:] left_by_values, const int64_t[:] right_by_values, bint allow_exact_matches=1, @@ -824,8 +827,8 @@ def asof_join_forward_on_X_by_Y(ndarray[numeric_t] left_values, return left_indexer, right_indexer -def asof_join_nearest_on_X_by_Y(ndarray[numeric_t] left_values, - ndarray[numeric_t] right_values, +def asof_join_nearest_on_X_by_Y(const numeric_t[:] left_values, + const numeric_t[:] right_values, const int64_t[:] left_by_values, const int64_t[:] right_by_values, bint allow_exact_matches=True, diff --git a/pandas/_libs/json.pyi b/pandas/_libs/json.pyi index bc4fe68573b94..349320d69d707 100644 --- a/pandas/_libs/json.pyi +++ b/pandas/_libs/json.pyi @@ -1,6 +1,6 @@ +from collections.abc import Callable from typing import ( Any, - Callable, ) def ujson_dumps( diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index b9fd970e68f5b..331233f37f63d 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -1,12 +1,14 @@ # TODO(npdtypes): Many types specified here can be made more specific/accurate; # the 
more specific versions are specified in comments +from collections.abc import ( + Callable, + Generator, + Hashable, +) from decimal import Decimal from typing import ( Any, - Callable, Final, - Generator, - Hashable, Literal, TypeAlias, overload, @@ -14,8 +16,6 @@ from typing import ( import numpy as np -from pandas._libs.interval import Interval -from pandas._libs.tslibs import Period from pandas._typing import ( ArrayLike, DtypeObj, @@ -44,8 +44,6 @@ def is_iterator(obj: object) -> bool: ... def is_scalar(val: object) -> bool: ... def is_list_like(obj: object, allow_sets: bool = ...) -> bool: ... def is_pyarrow_array(obj: object) -> bool: ... -def is_period(val: object) -> TypeGuard[Period]: ... -def is_interval(obj: object) -> TypeGuard[Interval]: ... def is_decimal(obj: object) -> TypeGuard[Decimal]: ... def is_complex(obj: object) -> TypeGuard[complex]: ... def is_bool(obj: object) -> TypeGuard[bool | np.bool_]: ... @@ -69,16 +67,25 @@ def fast_multiget( mapping: dict, keys: np.ndarray, # object[:] default=..., -) -> np.ndarray: ... +) -> ArrayLike: ... def fast_unique_multiple_list_gen(gen: Generator, sort: bool = ...) -> list: ... -def fast_unique_multiple_list(lists: list, sort: bool | None = ...) -> list: ... +@overload def map_infer( arr: np.ndarray, f: Callable[[Any], Any], - convert: bool = ..., + *, + convert: Literal[False], ignore_na: bool = ..., ) -> np.ndarray: ... @overload +def map_infer( + arr: np.ndarray, + f: Callable[[Any], Any], + *, + convert: bool = ..., + ignore_na: bool = ..., +) -> ArrayLike: ... +@overload def maybe_convert_objects( objects: npt.NDArray[np.object_], *, @@ -164,14 +171,26 @@ def is_all_arraylike(obj: list) -> bool: ... # Functions which in reality take memoryviews def memory_usage_of_objects(arr: np.ndarray) -> int: ... # object[:] # np.int64 +@overload def map_infer_mask( arr: np.ndarray, f: Callable[[Any], Any], mask: np.ndarray, # const uint8_t[:] - convert: bool = ..., + *, + convert: Literal[False], na_value: Any = ..., dtype: np.dtype = ..., ) -> np.ndarray: ... +@overload +def map_infer_mask( + arr: np.ndarray, + f: Callable[[Any], Any], + mask: np.ndarray, # const uint8_t[:] + *, + convert: bool = ..., + na_value: Any = ..., + dtype: np.dtype = ..., +) -> ArrayLike: ... def indices_fast( index: npt.NDArray[np.intp], labels: np.ndarray, # const int64_t[:] @@ -179,7 +198,8 @@ def indices_fast( sorted_labels: list[npt.NDArray[np.int64]], ) -> dict[Hashable, npt.NDArray[np.intp]]: ... def generate_slices( - labels: np.ndarray, ngroups: int # const intp_t[:] + labels: np.ndarray, + ngroups: int, # const intp_t[:] ) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]: ... def count_level_2d( mask: np.ndarray, # ndarray[uint8_t, ndim=2, cast=True], @@ -209,5 +229,10 @@ def get_reverse_indexer( def is_bool_list(obj: list) -> bool: ... def dtypes_all_equal(types: list[DtypeObj]) -> bool: ... def is_range_indexer( - left: np.ndarray, n: int # np.ndarray[np.int64, ndim=1] + left: np.ndarray, + n: int, # np.ndarray[np.int64, ndim=1] +) -> bool: ... +def is_sequence_range( + sequence: np.ndarray, + step: int, # np.ndarray[np.int64, ndim=1] ) -> bool: ... 
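Note: the `Literal[False]` overloads added to `lib.pyi` above encode the rule that `convert=False` is the only case guaranteed to return a plain ndarray, while the general signature falls back to `ArrayLike`. A minimal, self-contained sketch of the same typing pattern follows; `map_values` and its `ArrayLike` alias are illustrative stand-ins, not pandas API.

from typing import Literal, Union, overload

import numpy as np

# Stand-in for pandas' ndarray-or-ExtensionArray alias; illustrative only.
ArrayLike = Union[np.ndarray, list]

@overload
def map_values(arr: np.ndarray, *, convert: Literal[False]) -> np.ndarray: ...
@overload
def map_values(arr: np.ndarray, *, convert: bool = ...) -> ArrayLike: ...

def map_values(arr: np.ndarray, *, convert: bool = True) -> ArrayLike:
    # convert=False always returns the object ndarray unchanged, which is
    # what lets the Literal[False] overload promise a plain np.ndarray.
    result = arr.astype(object)
    if convert:
        return result.tolist()  # placeholder for real dtype inference
    return result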
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index c483f35513a40..cded299f77876 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -37,7 +37,7 @@ from cython cimport ( floating, ) -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs.missing import check_na_tuples_nonequal @@ -53,6 +53,7 @@ from numpy cimport ( PyArray_ITER_DATA, PyArray_ITER_NEXT, PyArray_IterNew, + PyArray_SETITEM, complex128_t, flatiter, float64_t, @@ -75,7 +76,6 @@ cdef extern from "pandas/parser/pd_parser.h": PandasParser_IMPORT from pandas._libs cimport util -from pandas._libs.dtypes cimport uint8_int64_object_t from pandas._libs.util cimport ( INT64_MAX, INT64_MIN, @@ -96,16 +96,12 @@ from pandas._libs.missing cimport ( is_null_datetime64, is_null_timedelta64, ) -from pandas._libs.tslibs.conversion cimport ( - _TSObject, - convert_to_tsobject, -) +from pandas._libs.tslibs.conversion cimport convert_to_tsobject from pandas._libs.tslibs.nattype cimport ( NPY_NAT, c_NaT as NaT, checknull_with_nat, ) -from pandas._libs.tslibs.np_datetime cimport NPY_FR_ns from pandas._libs.tslibs.offsets cimport is_offset_object from pandas._libs.tslibs.period cimport is_period_object from pandas._libs.tslibs.timedeltas cimport convert_to_timedelta64 @@ -184,6 +180,13 @@ def is_scalar(val: object) -> bool: bool Return True if given object is scalar. + See Also + -------- + api.types.is_list_like : Check if the input is list-like. + api.types.is_integer : Check if the input is an integer. + api.types.is_float : Check if the input is a float. + api.types.is_bool : Check if the input is a boolean. + Examples -------- >>> import datetime @@ -256,15 +259,23 @@ def is_iterator(obj: object) -> bool: Check if the object is an iterator. This is intended for generators, not list-like objects. + This method checks whether the passed object is an iterator. It + returns `True` if the object is an iterator, and `False` otherwise. Parameters ---------- obj : The object to check + The object to check for iterator type. Returns ------- is_iter : bool Whether `obj` is an iterator. + `True` if the object is of iterator type, otherwise `False`. + + See Also + -------- + api.types.is_list_like : Check if the input is list-like. 
Examples -------- @@ -312,34 +323,6 @@ def item_from_zerodim(val: object) -> object: return val -@cython.wraparound(False) -@cython.boundscheck(False) -def fast_unique_multiple_list(lists: list, sort: bool | None = True) -> list: - cdef: - list buf - Py_ssize_t k = len(lists) - Py_ssize_t i, j, n - list uniques = [] - dict table = {} - object val, stub = 0 - - for i in range(k): - buf = lists[i] - n = len(buf) - for j in range(n): - val = buf[j] - if val not in table: - table[val] = stub - uniques.append(val) - if sort: - try: - uniques.sort() - except TypeError: - pass - - return uniques - - @cython.wraparound(False) @cython.boundscheck(False) def fast_unique_multiple_list_gen(object gen, bint sort=True) -> list: @@ -361,15 +344,15 @@ def fast_unique_multiple_list_gen(object gen, bint sort=True) -> list: list buf Py_ssize_t j, n list uniques = [] - dict table = {} - object val, stub = 0 + set table = set() + object val for buf in gen: n = len(buf) for j in range(n): val = buf[j] if val not in table: - table[val] = stub + table.add(val) uniques.append(val) if sort: try: @@ -505,7 +488,7 @@ def has_infs(const floating[:] arr) -> bool: @cython.boundscheck(False) @cython.wraparound(False) -def has_only_ints_or_nan(floating[:] arr) -> bool: +def has_only_ints_or_nan(const floating[:] arr) -> bool: cdef: floating val intp_t i @@ -519,7 +502,7 @@ def has_only_ints_or_nan(floating[:] arr) -> bool: return True -def maybe_indices_to_slice(ndarray[intp_t, ndim=1] indices, int max_len): +def maybe_indices_to_slice(ndarray[intp_t, ndim=1] indices, intp_t max_len): cdef: Py_ssize_t i, n = len(indices) intp_t k, vstart, vlast, v @@ -625,6 +608,8 @@ def array_equivalent_object(ndarray left, ndarray right) -> bool: if not array_equivalent(x, y): return False + elif PyArray_Check(x) or PyArray_Check(y): + return False elif (x is C_NA) ^ (y is C_NA): return False elif not ( @@ -659,7 +644,7 @@ ctypedef fused int6432_t: @cython.wraparound(False) @cython.boundscheck(False) -def is_range_indexer(ndarray[int6432_t, ndim=1] left, Py_ssize_t n) -> bool: +def is_range_indexer(const int6432_t[:] left, Py_ssize_t n) -> bool: """ Perform an element by element comparison on 1-d integer arrays, meant for indexer comparisons @@ -678,6 +663,28 @@ def is_range_indexer(ndarray[int6432_t, ndim=1] left, Py_ssize_t n) -> bool: return True +@cython.wraparound(False) +@cython.boundscheck(False) +def is_sequence_range(const int6432_t[:] sequence, int64_t step) -> bool: + """ + Check if sequence is equivalent to a range with the specified step. + """ + cdef: + Py_ssize_t i, n = len(sequence) + int6432_t first_element + + if step == 0: + return False + if n == 0: + return True + + first_element = sequence[0] + for i in range(1, n): + if sequence[i] != first_element + i * step: + return False + return True + + ctypedef fused ndarr_object: ndarray[object, ndim=1] ndarray[object, ndim=2] @@ -736,7 +743,9 @@ cpdef ndarray[object] ensure_string_array( convert_na_value : bool, default True If False, existing na values will be used unchanged in the new array. copy : bool, default True - Whether to ensure that a new array is returned. + Whether to ensure that a new array is returned. When True, a new array + is always returned. When False, a new array is only returned when needed + to avoid mutating the input array. skipna : bool, default True Whether or not to coerce nulls to their stringified form (e.g. if False, NaN becomes 'nan'). 
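For reference, the `is_sequence_range` helper added above behaves like the following pure-Python model; the `_py` suffix marks it as an illustrative stand-in for the Cython function, which runs over typed memoryviews with bounds checks disabled.

def is_sequence_range_py(sequence, step):
    # True when `sequence` is exactly
    # range(sequence[0], sequence[0] + len(sequence) * step, step).
    if step == 0:
        return False
    if len(sequence) == 0:
        return True
    first = sequence[0]
    return all(
        sequence[i] == first + i * step for i in range(1, len(sequence))
    )

assert is_sequence_range_py([2, 5, 8, 11], 3)
assert not is_sequence_range_py([2, 5, 9], 3)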
@@ -753,22 +762,39 @@ cpdef ndarray[object] ensure_string_array( if hasattr(arr, "to_numpy"): - if hasattr(arr, "dtype") and arr.dtype.kind in "mM": + if ( + hasattr(arr, "dtype") + and arr.dtype.kind in "mM" + # TODO: we should add a custom ArrowExtensionArray.astype implementation + # that handles astype(str) specifically, avoiding ending up here and + # then we can remove the below check for `_pa_array` (for ArrowEA) + and not hasattr(arr, "_pa_array") + ): # dtype check to exclude DataFrame # GH#41409 TODO: not a great place for this out = arr.astype(str).astype(object) out[arr.isna()] = na_value return out - arr = arr.to_numpy() + arr = arr.to_numpy(dtype=object) elif not util.is_array(arr): - arr = np.array(arr, dtype="object") + # GH#61155: Guarantee a 1-d result when array is a list of lists + input_arr = arr + arr = np.empty(len(arr), dtype="object") + arr[:] = input_arr result = np.asarray(arr, dtype="object") - if copy and (result is arr or np.shares_memory(arr, result)): - # GH#54654 - result = result.copy() - elif not copy and result is arr: + if result is arr or np.may_share_memory(arr, result): + # if np.asarray(..) did not make a copy of the input arr, we still need + # to do that to avoid mutating the input array + # GH#54654: share_memory check is needed for rare cases where np.asarray + # returns a new object without making a copy of the actual data + if copy: + result = result.copy() + else: + already_copied = False + elif not copy and not result.flags.writeable: + # Weird edge case where result is a view already_copied = False if issubclass(arr.dtype.type, np.str_): @@ -958,16 +984,14 @@ def get_level_sorter( @cython.boundscheck(False) @cython.wraparound(False) -def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask, +def count_level_2d(const uint8_t[:, :] mask, const intp_t[:] labels, Py_ssize_t max_bin, ): cdef: - Py_ssize_t i, j, k, n + Py_ssize_t i, j, k = mask.shape[1], n = mask.shape[0] ndarray[int64_t, ndim=2] counts - n, k = (mask).shape - counts = np.zeros((n, max_bin), dtype="i8") with nogil: for i in range(n): @@ -1074,9 +1098,23 @@ def is_float(obj: object) -> bool: """ Return True if given object is float. + This method checks whether the passed object is a float type. It + returns `True` if the object is a float, and `False` otherwise. + + Parameters + ---------- + obj : object + The object to check for float type. + Returns ------- bool + `True` if the object is of float type, otherwise `False`. + + See Also + -------- + api.types.is_integer : Check if an object is of integer type. + api.types.is_numeric_dtype : Check if an object is of numeric type. Examples -------- @@ -1093,9 +1131,23 @@ def is_integer(obj: object) -> bool: """ Return True if given object is integer. + This method checks whether the passed object is an integer type. It + returns `True` if the object is an integer, and `False` otherwise. + + Parameters + ---------- + obj : object + The object to check for integer type. + Returns ------- bool + `True` if the object is of integer type, otherwise `False`. + + See Also + -------- + api.types.is_float : Check if an object is of float type. + api.types.is_numeric_dtype : Check if an object is of numeric type. Examples -------- @@ -1123,10 +1175,21 @@ def is_bool(obj: object) -> bool: """ Return True if given object is boolean. + Parameters + ---------- + obj : object + Object to check. + Returns ------- bool + See Also + -------- + api.types.is_scalar : Check if the input is a scalar. 
+ api.types.is_integer : Check if the input is an integer. + api.types.is_float : Check if the input is a float. + Examples -------- >>> pd.api.types.is_bool(True) @@ -1142,10 +1205,22 @@ def is_complex(obj: object) -> bool: """ Return True if given object is complex. + Parameters + ---------- + obj : object + Object to check. + Returns ------- bool + See Also + -------- + api.types.is_complex_dtype : Check whether the provided array or + dtype is of a complex dtype. + api.types.is_number : Check if the object is a number. + api.types.is_integer : Return True if given object is integer. + Examples -------- >>> pd.api.types.is_complex(1 + 1j) @@ -1161,43 +1236,6 @@ cpdef bint is_decimal(object obj): return isinstance(obj, Decimal) -cpdef bint is_interval(object obj): - import warnings - - from pandas.util._exceptions import find_stack_level - - warnings.warn( - # GH#55264 - "is_interval is deprecated and will be removed in a future version. " - "Use isinstance(obj, pd.Interval) instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - return getattr(obj, "_typ", "_typ") == "interval" - - -def is_period(val: object) -> bool: - """ - Return True if given object is Period. - - Returns - ------- - bool - """ - import warnings - - from pandas.util._exceptions import find_stack_level - - warnings.warn( - # GH#55264 - "is_period is deprecated and will be removed in a future version. " - "Use isinstance(obj, pd.Period) instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - return is_period_object(val) - - def is_list_like(obj: object, allow_sets: bool = True) -> bool: """ Check if the object is list-like. @@ -1219,6 +1257,12 @@ def is_list_like(obj: object, allow_sets: bool = True) -> bool: bool Whether `obj` has list-like properties. + See Also + -------- + Series : One-dimensional ndarray with axis labels (including time series). + Index : Immutable sequence used for indexing and alignment. + numpy.ndarray : Array object from NumPy, which is considered list-like. + Examples -------- >>> import datetime @@ -1477,11 +1521,17 @@ cdef object _try_infer_map(object dtype): def infer_dtype(value: object, skipna: bool = True) -> str: """ - Return a string label of the type of a scalar or list-like of values. + Return a string label of the type of the elements in a list-like input. + + This method inspects the elements of the provided input and determines + the classification of its data type. It is particularly useful for + handling heterogeneous data inputs where explicit dtype conversion may not + be possible or necessary. Parameters ---------- - value : scalar, list, ndarray, or pandas type + value : list, ndarray, or pandas type + The input data for which to infer the dtype. skipna : bool, default True Ignore NaN values when inferring the type. @@ -1516,6 +1566,14 @@ def infer_dtype(value: object, skipna: bool = True) -> str: TypeError If ndarray-like but cannot infer the dtype + See Also + -------- + api.types.is_scalar : Check if the input is a scalar. + api.types.is_list_like : Check if the input is list-like. + api.types.is_integer : Check if the input is an integer. + api.types.is_float : Check if the input is a float. + api.types.is_bool : Check if the input is a boolean.
+ Notes ----- - 'mixed' is the catchall for anything that is not otherwise @@ -1830,7 +1888,7 @@ cdef class BoolValidator(Validator): cpdef bint is_bool_array(ndarray values, bint skipna=False): cdef: - BoolValidator validator = BoolValidator(len(values), + BoolValidator validator = BoolValidator(values.size, values.dtype, skipna=skipna) return validator.validate(values) @@ -1848,7 +1906,7 @@ cdef class IntegerValidator(Validator): # Note: only python-exposed for tests cpdef bint is_integer_array(ndarray values, bint skipna=True): cdef: - IntegerValidator validator = IntegerValidator(len(values), + IntegerValidator validator = IntegerValidator(values.size, values.dtype, skipna=skipna) return validator.validate(values) @@ -1863,7 +1921,7 @@ cdef class IntegerNaValidator(Validator): cdef bint is_integer_na_array(ndarray values, bint skipna=True): cdef: - IntegerNaValidator validator = IntegerNaValidator(len(values), + IntegerNaValidator validator = IntegerNaValidator(values.size, values.dtype, skipna=skipna) return validator.validate(values) @@ -1879,7 +1937,7 @@ cdef class IntegerFloatValidator(Validator): cdef bint is_integer_float_array(ndarray values, bint skipna=True): cdef: - IntegerFloatValidator validator = IntegerFloatValidator(len(values), + IntegerFloatValidator validator = IntegerFloatValidator(values.size, values.dtype, skipna=skipna) return validator.validate(values) @@ -1897,7 +1955,7 @@ cdef class FloatValidator(Validator): # Note: only python-exposed for tests cpdef bint is_float_array(ndarray values): cdef: - FloatValidator validator = FloatValidator(len(values), values.dtype) + FloatValidator validator = FloatValidator(values.size, values.dtype) return validator.validate(values) @@ -1915,7 +1973,7 @@ cdef class ComplexValidator(Validator): cdef bint is_complex_array(ndarray values): cdef: - ComplexValidator validator = ComplexValidator(len(values), values.dtype) + ComplexValidator validator = ComplexValidator(values.size, values.dtype) return validator.validate(values) @@ -1928,7 +1986,7 @@ cdef class DecimalValidator(Validator): cdef bint is_decimal_array(ndarray values, bint skipna=False): cdef: DecimalValidator validator = DecimalValidator( - len(values), values.dtype, skipna=skipna + values.size, values.dtype, skipna=skipna ) return validator.validate(values) @@ -1944,7 +2002,7 @@ cdef class StringValidator(Validator): cpdef bint is_string_array(ndarray values, bint skipna=False): cdef: - StringValidator validator = StringValidator(len(values), + StringValidator validator = StringValidator(values.size, values.dtype, skipna=skipna) return validator.validate(values) @@ -1961,7 +2019,7 @@ cdef class BytesValidator(Validator): cdef bint is_bytes_array(ndarray values, bint skipna=False): cdef: - BytesValidator validator = BytesValidator(len(values), values.dtype, + BytesValidator validator = BytesValidator(values.size, values.dtype, skipna=skipna) return validator.validate(values) @@ -2012,7 +2070,7 @@ cdef class DatetimeValidator(TemporalValidator): cpdef bint is_datetime_array(ndarray values, bint skipna=True): cdef: - DatetimeValidator validator = DatetimeValidator(len(values), + DatetimeValidator validator = DatetimeValidator(values.size, skipna=skipna) return validator.validate(values) @@ -2026,7 +2084,7 @@ cdef class Datetime64Validator(DatetimeValidator): # Note: only python-exposed for tests cpdef bint is_datetime64_array(ndarray values, bint skipna=True): cdef: - Datetime64Validator validator = Datetime64Validator(len(values), + Datetime64Validator validator = 
Datetime64Validator(values.size, skipna=skipna) return validator.validate(values) @@ -2041,7 +2099,7 @@ cdef class AnyDatetimeValidator(DatetimeValidator): cdef bint is_datetime_or_datetime64_array(ndarray values, bint skipna=True): cdef: - AnyDatetimeValidator validator = AnyDatetimeValidator(len(values), + AnyDatetimeValidator validator = AnyDatetimeValidator(values.size, skipna=skipna) return validator.validate(values) @@ -2053,7 +2111,7 @@ def is_datetime_with_singletz_array(values: ndarray) -> bool: Doesn't check values are datetime-like types. """ cdef: - Py_ssize_t i = 0, j, n = len(values) + Py_ssize_t i = 0, j, n = values.size object base_val, base_tz, val, tz if n == 0: @@ -2101,7 +2159,7 @@ cpdef bint is_timedelta_or_timedelta64_array(ndarray values, bint skipna=True): Infer with timedeltas and/or nat/none. """ cdef: - AnyTimedeltaValidator validator = AnyTimedeltaValidator(len(values), + AnyTimedeltaValidator validator = AnyTimedeltaValidator(values.size, skipna=skipna) return validator.validate(values) @@ -2115,7 +2173,7 @@ cdef class DateValidator(Validator): # Note: only python-exposed for tests cpdef bint is_date_array(ndarray values, bint skipna=False): cdef: - DateValidator validator = DateValidator(len(values), skipna=skipna) + DateValidator validator = DateValidator(values.size, skipna=skipna) return validator.validate(values) @@ -2128,7 +2186,7 @@ cdef class TimeValidator(Validator): # Note: only python-exposed for tests cpdef bint is_time_array(ndarray values, bint skipna=False): cdef: - TimeValidator validator = TimeValidator(len(values), skipna=skipna) + TimeValidator validator = TimeValidator(values.size, skipna=skipna) return validator.validate(values) @@ -2179,14 +2237,14 @@ cpdef bint is_interval_array(ndarray values): Is this an ndarray of Interval (or np.nan) with a single dtype? """ cdef: - Py_ssize_t i, n = len(values) + Py_ssize_t i, n = values.size str closed = None bint numeric = False bint dt64 = False bint td64 = False object val - if len(values) == 0: + if n == 0: return False for i in range(n): @@ -2521,7 +2579,6 @@ def maybe_convert_objects(ndarray[object] objects, ndarray[uint8_t] mask Seen seen = Seen() object val - _TSObject tsobj float64_t fnan = NaN if dtype_if_all_nat is not None: @@ -2628,8 +2685,7 @@ def maybe_convert_objects(ndarray[object] objects, else: seen.datetime_ = True try: - tsobj = convert_to_tsobject(val, None, None, 0, 0) - tsobj.ensure_reso(NPY_FR_ns) + convert_to_tsobject(val, None, None, 0, 0) except OutOfBoundsDatetime: # e.g. 
test_out_of_s_bounds_datetime64 seen.object_ = True @@ -2668,7 +2724,11 @@ def maybe_convert_objects(ndarray[object] objects, seen.object_ = True break elif val is C_NA: - seen.object_ = True + if convert_to_nullable_dtype: + seen.null_ = True + mask[i] = True + else: + seen.object_ = True continue else: seen.object_ = True @@ -2725,10 +2785,16 @@ def maybe_convert_objects(ndarray[object] objects, seen.object_ = True elif seen.str_: - if using_pyarrow_string_dtype() and is_string_array(objects, skipna=True): + if convert_to_nullable_dtype and is_string_array(objects, skipna=True): + from pandas.core.arrays.string_ import StringDtype + + dtype = StringDtype() + return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) + + elif using_string_dtype() and is_string_array(objects, skipna=True): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(storage="pyarrow_numpy") + dtype = StringDtype(na_value=np.nan) return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) seen.object_ = True @@ -2774,12 +2840,12 @@ def maybe_convert_objects(ndarray[object] objects, return objects if seen.bool_: - if seen.is_bool: - # is_bool property rules out everything else - return bools.view(np.bool_) - elif convert_to_nullable_dtype and seen.is_bool_or_na: + if convert_to_nullable_dtype and seen.is_bool_or_na: from pandas.core.arrays import BooleanArray return BooleanArray(bools.view(np.bool_), mask) + elif seen.is_bool: + # is_bool property rules out everything else + return bools.view(np.bool_) seen.object_ = True if not seen.object_: @@ -2792,11 +2858,11 @@ def maybe_convert_objects(ndarray[object] objects, result = floats elif seen.int_ or seen.uint_: if convert_to_nullable_dtype: - from pandas.core.arrays import IntegerArray + # Below we will wrap in IntegerArray if seen.uint_: - result = IntegerArray(uints, mask) + result = uints else: - result = IntegerArray(ints, mask) + result = ints else: result = floats elif seen.nan_: @@ -2811,7 +2877,6 @@ def maybe_convert_objects(ndarray[object] objects, result = uints else: result = ints - else: # don't cast int to float, etc. if seen.null_: @@ -2834,6 +2899,22 @@ def maybe_convert_objects(ndarray[object] objects, else: result = ints + # TODO: do these after the itemsize check? + if (result is ints or result is uints) and convert_to_nullable_dtype: + from pandas.core.arrays import IntegerArray + + # Set these values to 1 to be deterministic, match + # IntegerDtype._internal_fill_value + result[mask] = 1 + result = IntegerArray(result, mask) + elif result is floats and convert_to_nullable_dtype: + from pandas.core.arrays import FloatingArray + + # Set these values to 1.0 to be deterministic, match + # FloatingDtype._internal_fill_value + result[mask] = 1.0 + result = FloatingArray(result, mask) + if result is uints or result is ints or result is floats or result is complexes: # cast to the largest itemsize when all values are NumPy scalars if itemsize_max > 0 and itemsize_max != result.dtype.itemsize: @@ -2860,14 +2941,17 @@ no_default = _NoDefault.no_default # Sentinel indicating the default value. 
NoDefault = Literal[_NoDefault.no_default] +@cython.boundscheck(False) +@cython.wraparound(False) def map_infer_mask( - ndarray[object] arr, - object f, - const uint8_t[:] mask, - bint convert=True, - object na_value=no_default, - cnp.dtype dtype=np.dtype(object) -) -> np.ndarray: + ndarray arr, + object f, + const uint8_t[:] mask, + *, + bint convert=True, + object na_value=no_default, + cnp.dtype dtype=np.dtype(object) +) -> "ArrayLike": """ Substitute for np.vectorize with pandas-friendly dtype inference. @@ -2887,62 +2971,48 @@ def map_infer_mask( Returns ------- - np.ndarray - """ - cdef Py_ssize_t n = len(arr) - result = np.empty(n, dtype=dtype) - - _map_infer_mask( - result, - arr, - f, - mask, - na_value, - ) - if convert: - return maybe_convert_objects(result) - else: - return result - - -@cython.boundscheck(False) -@cython.wraparound(False) -def _map_infer_mask( - ndarray[uint8_int64_object_t] out, - ndarray[object] arr, - object f, - const uint8_t[:] mask, - object na_value=no_default, -) -> None: - """ - Helper for map_infer_mask, split off to use fused types based on the result. + np.ndarray or an ExtensionArray """ cdef: - Py_ssize_t i, n + Py_ssize_t i + Py_ssize_t n = len(arr) object val - n = len(arr) + ndarray result = np.empty(n, dtype=dtype) + + flatiter arr_it = PyArray_IterNew(arr) + flatiter result_it = PyArray_IterNew(result) + for i in range(n): if mask[i]: if na_value is no_default: - val = arr[i] + val = PyArray_GETITEM(arr, PyArray_ITER_DATA(arr_it)) else: val = na_value else: - val = f(arr[i]) + val = PyArray_GETITEM(arr, PyArray_ITER_DATA(arr_it)) + val = f(val) if cnp.PyArray_IsZeroDim(val): # unbox 0-dim arrays, GH#690 val = val.item() - out[i] = val + PyArray_SETITEM(result, PyArray_ITER_DATA(result_it), val) + + PyArray_ITER_NEXT(arr_it) + PyArray_ITER_NEXT(result_it) + + if convert: + return maybe_convert_objects(result) + else: + return result @cython.boundscheck(False) @cython.wraparound(False) def map_infer( - ndarray arr, object f, bint convert=True, bint ignore_na=False -) -> np.ndarray: + ndarray arr, object f, *, bint convert=True, bint ignore_na=False +) -> "ArrayLike": """ Substitute for np.vectorize with pandas-friendly dtype inference. 
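A usage sketch for the reworked `map_infer_mask` above, assuming it is called through `pandas._libs.lib` as in pandas internals: `convert` and `na_value` are now keyword-only, and masked positions never reach `f`.

import numpy as np
from pandas._libs import lib

arr = np.array(["1", "2", None], dtype=object)
mask = np.array([0, 0, 1], dtype=np.uint8)  # third element is masked

# int() is never called on None; the masked slot is filled with na_value.
# With convert=True the object result goes through maybe_convert_objects,
# which should infer a float64 array here: [1.0, 2.0, nan].
result = lib.map_infer_mask(arr, int, mask, convert=True, na_value=np.nan)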
@@ -2956,7 +3026,7 @@ def map_infer( Returns ------- - np.ndarray + np.ndarray or an ExtensionArray """ cdef: Py_ssize_t i, n @@ -3091,7 +3161,7 @@ def to_object_array_tuples(rows: object) -> np.ndarray: @cython.wraparound(False) @cython.boundscheck(False) -def fast_multiget(dict mapping, object[:] keys, default=np.nan) -> np.ndarray: +def fast_multiget(dict mapping, object[:] keys, default=np.nan) -> "ArrayLike": cdef: Py_ssize_t i, n = len(keys) object val diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build index c27386743c6e9..33fc65e5034d0 100644 --- a/pandas/_libs/meson.build +++ b/pandas/_libs/meson.build @@ -1,94 +1,132 @@ -_algos_take_helper = custom_target('algos_take_helper_pxi', +_algos_take_helper = custom_target( + 'algos_take_helper_pxi', output: 'algos_take_helper.pxi', input: 'algos_take_helper.pxi.in', - command: [ - py, tempita, '@INPUT@', '-o', '@OUTDIR@' - ] + command: [py, tempita, '@INPUT@', '-o', '@OUTDIR@'], ) -_algos_common_helper = custom_target('algos_common_helper_pxi', +_algos_common_helper = custom_target( + 'algos_common_helper_pxi', output: 'algos_common_helper.pxi', input: 'algos_common_helper.pxi.in', - command: [ - py, tempita, '@INPUT@', '-o', '@OUTDIR@' - ] + command: [py, tempita, '@INPUT@', '-o', '@OUTDIR@'], ) -_khash_primitive_helper = custom_target('khash_primitive_helper_pxi', +_khash_primitive_helper = custom_target( + 'khash_primitive_helper_pxi', output: 'khash_for_primitive_helper.pxi', input: 'khash_for_primitive_helper.pxi.in', - command: [ - py, tempita, '@INPUT@', '-o', '@OUTDIR@' - ] + command: [py, tempita, '@INPUT@', '-o', '@OUTDIR@'], ) -_hashtable_class_helper = custom_target('hashtable_class_helper_pxi', +_hashtable_class_helper = custom_target( + 'hashtable_class_helper_pxi', output: 'hashtable_class_helper.pxi', input: 'hashtable_class_helper.pxi.in', - command: [ - py, tempita, '@INPUT@', '-o', '@OUTDIR@' - ] + command: [py, tempita, '@INPUT@', '-o', '@OUTDIR@'], ) -_hashtable_func_helper = custom_target('hashtable_func_helper_pxi', +_hashtable_func_helper = custom_target( + 'hashtable_func_helper_pxi', output: 'hashtable_func_helper.pxi', input: 'hashtable_func_helper.pxi.in', - command: [ - py, tempita, '@INPUT@', '-o', '@OUTDIR@' - ] + command: [py, tempita, '@INPUT@', '-o', '@OUTDIR@'], ) -_index_class_helper = custom_target('index_class_helper_pxi', +_index_class_helper = custom_target( + 'index_class_helper_pxi', output: 'index_class_helper.pxi', input: 'index_class_helper.pxi.in', - command: [ - py, tempita, '@INPUT@', '-o', '@OUTDIR@' - ] + command: [py, tempita, '@INPUT@', '-o', '@OUTDIR@'], ) -_sparse_op_helper = custom_target('sparse_op_helper_pxi', +_sparse_op_helper = custom_target( + 'sparse_op_helper_pxi', output: 'sparse_op_helper.pxi', input: 'sparse_op_helper.pxi.in', - command: [ - py, tempita, '@INPUT@', '-o', '@OUTDIR@' - ] + command: [py, tempita, '@INPUT@', '-o', '@OUTDIR@'], ) -_intervaltree_helper = custom_target('intervaltree_helper_pxi', +_intervaltree_helper = custom_target( + 'intervaltree_helper_pxi', output: 'intervaltree.pxi', input: 'intervaltree.pxi.in', - command: [ - py, tempita, '@INPUT@', '-o', '@OUTDIR@' - ] + command: [py, tempita, '@INPUT@', '-o', '@OUTDIR@'], +) +_khash_primitive_helper_dep = declare_dependency( + sources: _khash_primitive_helper, +) + +cdata = configuration_data() +if cy.version().version_compare('>=3.1.0') + cdata.set('freethreading_compatible', '1') +else + cdata.set('freethreading_compatible', '0') +endif +_free_threading_config = configure_file( + input: 
'free_threading_config.pxi.in', + output: 'free_threading_config.pxi', + configuration: cdata, + install: false, ) -_khash_primitive_helper_dep = declare_dependency(sources: _khash_primitive_helper) subdir('tslibs') libs_sources = { # Dict of extension name -> dict of {sources, include_dirs, and deps} # numpy include dir is implicitly included - 'algos': {'sources': ['algos.pyx', _algos_common_helper, _algos_take_helper], 'deps': _khash_primitive_helper_dep}, + 'algos': { + 'sources': ['algos.pyx', _algos_common_helper, _algos_take_helper], + 'deps': _khash_primitive_helper_dep, + }, 'arrays': {'sources': ['arrays.pyx']}, 'groupby': {'sources': ['groupby.pyx']}, 'hashing': {'sources': ['hashing.pyx']}, - 'hashtable': {'sources': ['hashtable.pyx', _hashtable_class_helper, _hashtable_func_helper], 'deps': _khash_primitive_helper_dep}, - 'index': {'sources': ['index.pyx', _index_class_helper], 'deps': _khash_primitive_helper_dep}, + 'hashtable': { + 'sources': [ + 'hashtable.pyx', + _hashtable_class_helper, + _hashtable_func_helper, + ], + 'deps': _khash_primitive_helper_dep, + }, + 'index': { + 'sources': ['index.pyx', _index_class_helper], + 'deps': _khash_primitive_helper_dep, + }, 'indexing': {'sources': ['indexing.pyx']}, 'internals': {'sources': ['internals.pyx']}, - 'interval': {'sources': ['interval.pyx', _intervaltree_helper], - 'deps': _khash_primitive_helper_dep}, - 'join': {'sources': ['join.pyx', _khash_primitive_helper], - 'deps': _khash_primitive_helper_dep}, + 'interval': { + 'sources': ['interval.pyx', _intervaltree_helper], + 'deps': _khash_primitive_helper_dep, + }, + 'join': { + 'sources': ['join.pyx', _khash_primitive_helper], + 'deps': _khash_primitive_helper_dep, + }, 'lib': {'sources': ['lib.pyx', 'src/parser/tokenizer.c']}, 'missing': {'sources': ['missing.pyx']}, - 'pandas_datetime': {'sources': ['src/vendored/numpy/datetime/np_datetime.c', - 'src/vendored/numpy/datetime/np_datetime_strings.c', - 'src/datetime/date_conversions.c', - 'src/datetime/pd_datetime.c']}, - 'pandas_parser': {'sources': ['src/parser/tokenizer.c', - 'src/parser/io.c', - 'src/parser/pd_parser.c']}, - 'parsers': {'sources': ['parsers.pyx', 'src/parser/tokenizer.c', 'src/parser/io.c'], - 'deps': _khash_primitive_helper_dep}, - 'json': {'sources': ['src/vendored/ujson/python/ujson.c', - 'src/vendored/ujson/python/objToJSON.c', - 'src/vendored/ujson/python/JSONtoObj.c', - 'src/vendored/ujson/lib/ultrajsonenc.c', - 'src/vendored/ujson/lib/ultrajsondec.c']}, + 'pandas_datetime': { + 'sources': [ + 'src/vendored/numpy/datetime/np_datetime.c', + 'src/vendored/numpy/datetime/np_datetime_strings.c', + 'src/datetime/date_conversions.c', + 'src/datetime/pd_datetime.c', + ], + }, + 'pandas_parser': { + 'sources': [ + 'src/parser/tokenizer.c', + 'src/parser/io.c', + 'src/parser/pd_parser.c', + ], + }, + 'parsers': { + 'sources': ['parsers.pyx', 'src/parser/tokenizer.c', 'src/parser/io.c'], + 'deps': _khash_primitive_helper_dep, + }, + 'json': { + 'sources': [ + 'src/vendored/ujson/python/ujson.c', + 'src/vendored/ujson/python/objToJSON.c', + 'src/vendored/ujson/python/JSONtoObj.c', + 'src/vendored/ujson/lib/ultrajsonenc.c', + 'src/vendored/ujson/lib/ultrajsondec.c', + ], + }, 'ops': {'sources': ['ops.pyx']}, 'ops_dispatch': {'sources': ['ops_dispatch.pyx']}, 'properties': {'sources': ['properties.pyx']}, @@ -98,18 +136,24 @@ libs_sources = { 'sparse': {'sources': ['sparse.pyx', _sparse_op_helper]}, 'tslib': {'sources': ['tslib.pyx']}, 'testing': {'sources': ['testing.pyx']}, - 'writers': {'sources': 
['writers.pyx']} + 'writers': {'sources': ['writers.pyx']}, } cython_args = [ '--include-dir', meson.current_build_dir(), - '-X always_allow_keywords=true' + '-X always_allow_keywords=true', ] if get_option('buildtype') == 'debug' cython_args += ['--gdb'] endif +# Use shared utility code to reduce wheel sizes +# copied from https://github.com/scikit-learn/scikit-learn/pull/31151/files +if cy.version().version_compare('>=3.1.0') + cython_args += ['--shared=pandas._libs._cyutility'] +endif + foreach ext_name, ext_dict : libs_sources py.extension_module( ext_name, @@ -118,7 +162,7 @@ foreach ext_name, ext_dict : libs_sources include_directories: [inc_np, inc_pd], dependencies: ext_dict.get('deps', ''), subdir: 'pandas/_libs', - install: true + install: true, ) endforeach @@ -148,14 +192,11 @@ sources_to_install = [ 'sparse.pyi', 'testing.pyi', 'tslib.pyi', - 'writers.pyi' + 'writers.pyi', ] -foreach source: sources_to_install - py.install_sources( - source, - subdir: 'pandas/_libs' - ) +foreach source : sources_to_install + py.install_sources(source, subdir: 'pandas/_libs') endforeach subdir('window') diff --git a/pandas/_libs/missing.pxd b/pandas/_libs/missing.pxd index 5920649519442..899d729690451 100644 --- a/pandas/_libs/missing.pxd +++ b/pandas/_libs/missing.pxd @@ -7,8 +7,8 @@ from numpy cimport ( cpdef bint is_matching_na(object left, object right, bint nan_matches_none=*) cpdef bint check_na_tuples_nonequal(object left, object right) -cpdef bint checknull(object val, bint inf_as_na=*) -cpdef ndarray[uint8_t] isnaobj(ndarray arr, bint inf_as_na=*) +cpdef bint checknull(object val) +cpdef ndarray[uint8_t] isnaobj(ndarray arr) cdef bint is_null_datetime64(v) cdef bint is_null_timedelta64(v) diff --git a/pandas/_libs/missing.pyi b/pandas/_libs/missing.pyi index 282dcee3ed6cf..6bf30a03cef32 100644 --- a/pandas/_libs/missing.pyi +++ b/pandas/_libs/missing.pyi @@ -11,6 +11,6 @@ def is_matching_na( ) -> bool: ... def isposinf_scalar(val: object) -> bool: ... def isneginf_scalar(val: object) -> bool: ... -def checknull(val: object, inf_as_na: bool = ...) -> bool: ... -def isnaobj(arr: np.ndarray, inf_as_na: bool = ...) -> npt.NDArray[np.bool_]: ... +def checknull(val: object) -> bool: ... +def isnaobj(arr: np.ndarray) -> npt.NDArray[np.bool_]: ... def is_numeric_na(values: np.ndarray) -> npt.NDArray[np.bool_]: ... diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index 0fd1c2b21d5cd..390a527c22bbb 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -137,7 +137,7 @@ cpdef bint is_matching_na(object left, object right, bint nan_matches_none=False return False -cpdef bint checknull(object val, bint inf_as_na=False): +cpdef bint checknull(object val): """ Return boolean describing of the input is NA-like, defined here as any of: @@ -152,8 +152,6 @@ cpdef bint checknull(object val, bint inf_as_na=False): Parameters ---------- val : object - inf_as_na : bool, default False - Whether to treat INF and -INF as NA values. 
Returns ------- @@ -164,8 +162,6 @@ cpdef bint checknull(object val, bint inf_as_na=False): elif util.is_float_object(val) or util.is_complex_object(val): if val != val: return True - elif inf_as_na: - return val == INF or val == NEGINF return False elif cnp.is_timedelta64_object(val): return cnp.get_timedelta64_value(val) == NPY_NAT @@ -184,7 +180,7 @@ cdef bint is_decimal_na(object val): @cython.wraparound(False) @cython.boundscheck(False) -cpdef ndarray[uint8_t] isnaobj(ndarray arr, bint inf_as_na=False): +cpdef ndarray[uint8_t] isnaobj(ndarray arr): """ Return boolean mask denoting which elements of a 1-D array are na-like, according to the criteria defined in `checknull`: @@ -217,7 +213,7 @@ cpdef ndarray[uint8_t] isnaobj(ndarray arr, bint inf_as_na=False): # equivalents to `val = values[i]` val = cnp.PyArray_GETITEM(arr, cnp.PyArray_ITER_DATA(it)) cnp.PyArray_ITER_NEXT(it) - is_null = checknull(val, inf_as_na=inf_as_na) + is_null = checknull(val) # Dereference pointer (set value) ((cnp.PyArray_ITER_DATA(it2)))[0] = is_null cnp.PyArray_ITER_NEXT(it2) @@ -351,6 +347,14 @@ class NAType(C_NAType): The NA singleton is a missing value indicator defined by pandas. It is used in certain new extension dtypes (currently the "string" dtype). + See Also + -------- + numpy.nan : Floating point representation of Not a Number (NaN) for numerical data. + isna : Detect missing values for an array-like object. + notna : Detect non-missing values for an array-like object. + DataFrame.fillna : Fill missing values in a DataFrame. + Series.fillna : Fill missing values in a Series. + Examples -------- >>> pd.NA diff --git a/pandas/_libs/ops.pyi b/pandas/_libs/ops.pyi index 6738a1dff4a9e..81fe81930539d 100644 --- a/pandas/_libs/ops.pyi +++ b/pandas/_libs/ops.pyi @@ -1,7 +1,9 @@ -from typing import ( - Any, +from collections.abc import ( Callable, Iterable, +) +from typing import ( + Any, Literal, TypeAlias, overload, diff --git a/pandas/_libs/ops.pyx b/pandas/_libs/ops.pyx index 9154e836b3477..567bfc02a2950 100644 --- a/pandas/_libs/ops.pyx +++ b/pandas/_libs/ops.pyx @@ -29,7 +29,7 @@ from pandas._libs.util cimport is_nan @cython.wraparound(False) @cython.boundscheck(False) -def scalar_compare(object[:] values, object val, object op) -> ndarray: +def scalar_compare(ndarray[object] values, object val, object op) -> ndarray: """ Compare each element of `values` array with the scalar `val`, with the comparison operation described by `op`. 
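With the `inf_as_na` path removed from `checknull` above, the function has a single behavior; a quick sanity-check sketch of the resulting semantics:

import numpy as np
from pandas._libs.missing import checknull

assert checknull(None)
assert checknull(np.nan)
assert checknull(np.datetime64("NaT"))
assert not checknull(np.inf)   # +/-inf are ordinary values, never NA
assert not checknull(-np.inf)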
diff --git a/pandas/_libs/parsers.pyi b/pandas/_libs/parsers.pyi index 253bb7303cefb..d18f54c546232 100644 --- a/pandas/_libs/parsers.pyi +++ b/pandas/_libs/parsers.pyi @@ -1,5 +1,5 @@ +from collections.abc import Hashable from typing import ( - Hashable, Literal, ) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 82e9812094af2..c29cdbcf5975e 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -6,7 +6,6 @@ from csv import ( QUOTE_NONE, QUOTE_NONNUMERIC, ) -import time import warnings from pandas.util._exceptions import find_stack_level @@ -344,10 +343,9 @@ cdef class TextReader: object true_values, false_values object handle object orig_header - bint na_filter, keep_default_na, verbose, has_usecols, has_mi_columns + bint na_filter, keep_default_na, has_usecols, has_mi_columns bint allow_leading_cols uint64_t parser_start # this is modified after __init__ - list clocks const char *encoding_errors kh_str_starts_t *false_set kh_str_starts_t *true_set @@ -400,7 +398,6 @@ cdef class TextReader: bint allow_leading_cols=True, skiprows=None, skipfooter=0, # int64_t - bint verbose=False, float_precision=None, bint skip_blank_lines=True, encoding_errors=b"strict", @@ -417,9 +414,6 @@ cdef class TextReader: self.parser = parser_new() self.parser.chunksize = tokenize_chunksize - # For timekeeping - self.clocks = [] - self.parser.usecols = (usecols is not None) self._setup_parser_source(source) @@ -507,8 +501,6 @@ cdef class TextReader: self.converters = converters self.na_filter = na_filter - self.verbose = verbose - if float_precision == "round_trip": # see gh-15140 self.parser.double_converter = round_trip_wrapper @@ -896,8 +888,6 @@ cdef class TextReader: int64_t buffered_lines int64_t irows - self._start_clock() - if rows is not None: irows = rows buffered_lines = self.parser.lines - self.parser_start @@ -915,12 +905,8 @@ cdef class TextReader: if self.parser_start >= self.parser.lines: raise StopIteration - self._end_clock("Tokenization") - self._start_clock() columns = self._convert_column_data(rows) - self._end_clock("Type conversion") - self._start_clock() if len(columns) > 0: rows_read = len(list(columns.values())[0]) # trim @@ -929,18 +915,8 @@ cdef class TextReader: parser_trim_buffers(self.parser) self.parser_start -= rows_read - self._end_clock("Parser memory cleanup") - return columns - cdef _start_clock(self): - self.clocks.append(time.time()) - - cdef _end_clock(self, str what): - if self.verbose: - elapsed = time.time() - self.clocks.pop(-1) - print(f"{what} took: {elapsed * 1000:.2f} ms") - def set_noconvert(self, i: int) -> None: self.noconvert.add(i) @@ -1603,7 +1579,7 @@ cdef _categorical_convert(parser_t *parser, int64_t col, # -> ndarray[f'|S{width}'] cdef _to_fw_string(parser_t *parser, int64_t col, int64_t line_start, - int64_t line_end, int64_t width) noexcept: + int64_t line_end, int64_t width): cdef: char *data ndarray result diff --git a/pandas/_libs/properties.pyi b/pandas/_libs/properties.pyi index aaa44a0cf47bf..bbde6ec454202 100644 --- a/pandas/_libs/properties.pyi +++ b/pandas/_libs/properties.pyi @@ -1,5 +1,5 @@ +from collections.abc import Sequence from typing import ( - Sequence, overload, ) diff --git a/pandas/_libs/reshape.pyx b/pandas/_libs/reshape.pyx index 21d1405328da6..51c1c75ba631b 100644 --- a/pandas/_libs/reshape.pyx +++ b/pandas/_libs/reshape.pyx @@ -19,7 +19,7 @@ from pandas._libs.lib cimport c_is_list_like @cython.wraparound(False) @cython.boundscheck(False) -def unstack(numeric_object_t[:, :] values, const 
uint8_t[:] mask, +def unstack(const numeric_object_t[:, :] values, const uint8_t[:] mask, Py_ssize_t stride, Py_ssize_t length, Py_ssize_t width, numeric_object_t[:, :] new_values, uint8_t[:, :] new_mask) -> None: """ @@ -40,27 +40,7 @@ def unstack(numeric_object_t[:, :] values, const uint8_t[:] mask, cdef: Py_ssize_t i, j, w, nulls, s, offset - if numeric_object_t is not object: - # evaluated at compile-time - with nogil: - for i in range(stride): - - nulls = 0 - for j in range(length): - - for w in range(width): - - offset = j * width + w - - if mask[offset]: - s = i * width + w - new_values[j, s] = values[offset - nulls, i] - new_mask[j, s] = 1 - else: - nulls += 1 - - else: - # object-dtype, identical to above but we cannot use nogil + with nogil(numeric_object_t is not object): for i in range(stride): nulls = 0 @@ -80,7 +60,7 @@ def unstack(numeric_object_t[:, :] values, const uint8_t[:] mask, @cython.wraparound(False) @cython.boundscheck(False) -def explode(ndarray[object] values): +def explode(object[:] values): """ transform array list-likes to long form preserve non-list entries diff --git a/pandas/_libs/sas.pyx b/pandas/_libs/sas.pyx index 9e1af2cb9c3e7..209e82c6284f5 100644 --- a/pandas/_libs/sas.pyx +++ b/pandas/_libs/sas.pyx @@ -49,7 +49,7 @@ cdef bytes buf_as_bytes(Buffer buf, size_t offset, size_t length): cdef Buffer buf_new(size_t length) except *: cdef uint8_t *data = calloc(length, sizeof(uint8_t)) - if data == NULL: + if data is NULL: raise MemoryError(f"Failed to allocate {length} bytes") return Buffer(data, length) diff --git a/pandas/_libs/sparse.pyi b/pandas/_libs/sparse.pyi index 536265b25425e..8727b1a5b0420 100644 --- a/pandas/_libs/sparse.pyi +++ b/pandas/_libs/sparse.pyi @@ -1,4 +1,4 @@ -from typing import Sequence +from collections.abc import Sequence import numpy as np diff --git a/pandas/_libs/src/datetime/pd_datetime.c b/pandas/_libs/src/datetime/pd_datetime.c index 19de51be6e1b2..2c32fb0481486 100644 --- a/pandas/_libs/src/datetime/pd_datetime.c +++ b/pandas/_libs/src/datetime/pd_datetime.c @@ -20,6 +20,9 @@ This file is derived from NumPy 1.7. 
See NUMPY_LICENSE.txt #include #include "datetime.h" +/* Need to import_array for np_datetime.c (for NumPy 1.x support only) */ +#define PY_ARRAY_UNIQUE_SYMBOL PANDAS_DATETIME_NUMPY +#include "numpy/ndarrayobject.h" #include "pandas/datetime/pd_datetime.h" #include "pandas/portable.h" @@ -242,7 +245,12 @@ static int pandas_datetime_exec(PyObject *Py_UNUSED(module)) { } static PyModuleDef_Slot pandas_datetime_slots[] = { - {Py_mod_exec, pandas_datetime_exec}, {0, NULL}}; + {Py_mod_exec, pandas_datetime_exec}, +#if PY_VERSION_HEX >= 0x030D0000 + {Py_mod_gil, Py_MOD_GIL_NOT_USED}, +#endif + {0, NULL}, +}; static struct PyModuleDef pandas_datetimemodule = { PyModuleDef_HEAD_INIT, @@ -255,5 +263,6 @@ static struct PyModuleDef pandas_datetimemodule = { PyMODINIT_FUNC PyInit_pandas_datetime(void) { PyDateTime_IMPORT; + import_array(); return PyModuleDef_Init(&pandas_datetimemodule); } diff --git a/pandas/_libs/src/parser/pd_parser.c b/pandas/_libs/src/parser/pd_parser.c index 48f3cd14cbc30..51cdf071a15cf 100644 --- a/pandas/_libs/src/parser/pd_parser.c +++ b/pandas/_libs/src/parser/pd_parser.c @@ -161,7 +161,12 @@ static int pandas_parser_exec(PyObject *Py_UNUSED(module)) { } static PyModuleDef_Slot pandas_parser_slots[] = { - {Py_mod_exec, pandas_parser_exec}, {0, NULL}}; + {Py_mod_exec, pandas_parser_exec}, +#if PY_VERSION_HEX >= 0x030D0000 + {Py_mod_gil, Py_MOD_GIL_NOT_USED}, +#endif + {0, NULL}, +}; static struct PyModuleDef pandas_parsermodule = { PyModuleDef_HEAD_INIT, diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 0e4188bea4dc7..61e96fc835e4d 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -109,6 +109,14 @@ void parser_set_default_options(parser_t *self) { parser_t *parser_new(void) { return (parser_t *)calloc(1, sizeof(parser_t)); } +static void parser_clear_data_buffers(parser_t *self) { + free_if_not_null((void *)&self->stream); + free_if_not_null((void *)&self->words); + free_if_not_null((void *)&self->word_starts); + free_if_not_null((void *)&self->line_start); + free_if_not_null((void *)&self->line_fields); +} + static void parser_cleanup(parser_t *self) { // XXX where to put this free_if_not_null((void *)&self->error_msg); @@ -119,6 +127,7 @@ static void parser_cleanup(parser_t *self) { self->skipset = NULL; } + parser_clear_data_buffers(self); if (self->cb_cleanup != NULL) { self->cb_cleanup(self->source); self->cb_cleanup = NULL; @@ -139,7 +148,7 @@ int parser_init(parser_t *self) { self->warn_msg = NULL; // token stream - self->stream = malloc(STREAM_INIT_SIZE * sizeof(char)); + self->stream = malloc(STREAM_INIT_SIZE); if (self->stream == NULL) { parser_cleanup(self); return PARSER_OUT_OF_MEMORY; @@ -212,9 +221,8 @@ static int make_stream_space(parser_t *self, size_t nbytes) { char *orig_ptr = (void *)self->stream; TRACE(("\n\nmake_stream_space: nbytes = %zu. 
grow_buffer(self->stream...)\n", nbytes)) - self->stream = - (char *)grow_buffer((void *)self->stream, self->stream_len, - &self->stream_cap, nbytes * 2, sizeof(char), &status); + self->stream = (char *)grow_buffer((void *)self->stream, self->stream_len, + &self->stream_cap, nbytes * 2, 1, &status); TRACE(("make_stream_space: self->stream=%p, self->stream_len = %zu, " "self->stream_cap=%zu, status=%zu\n", self->stream, self->stream_len, self->stream_cap, status)) diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c index 06e3251db8315..9a022095feee9 100644 --- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c +++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c @@ -16,17 +16,16 @@ This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt // Licence at LICENSES/NUMPY_LICENSE -#define NO_IMPORT - #ifndef NPY_NO_DEPRECATED_API #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION #endif // NPY_NO_DEPRECATED_API -#include - #include "pandas/vendored/numpy/datetime/np_datetime.h" -#include +#define NO_IMPORT_ARRAY +#define PY_ARRAY_UNIQUE_SYMBOL PANDAS_DATETIME_NUMPY +#include #include +#include #if defined(_WIN32) #ifndef ENABLE_INTSAFE_SIGNED_FUNCTIONS @@ -57,12 +56,15 @@ _Static_assert(0, "__has_builtin not detected; please try a newer compiler"); #endif #endif +#define XSTR(a) STR(a) +#define STR(a) #a + #define PD_CHECK_OVERFLOW(FUNC) \ do { \ if ((FUNC) != 0) { \ PyGILState_STATE gstate = PyGILState_Ensure(); \ PyErr_SetString(PyExc_OverflowError, \ - "Overflow occurred in npy_datetimestruct_to_datetime"); \ + "Overflow occurred at " __FILE__ ":" XSTR(__LINE__)); \ PyGILState_Release(gstate); \ return -1; \ } \ @@ -138,8 +140,8 @@ npy_int64 get_datetimestruct_days(const npy_datetimestruct *dts) { npy_int64 year, days = 0; const int *month_lengths; - year = dts->year - 1970; - days = year * 365; + PD_CHECK_OVERFLOW(checked_int64_sub(dts->year, 1970, &year)); + PD_CHECK_OVERFLOW(checked_int64_mul(year, 365, &days)); /* Adjust for leap years */ if (days >= 0) { @@ -147,32 +149,32 @@ npy_int64 get_datetimestruct_days(const npy_datetimestruct *dts) { * 1968 is the closest leap year before 1970. * Exclude the current year, so add 1. */ - year += 1; + PD_CHECK_OVERFLOW(checked_int64_add(year, 1, &year)); /* Add one day for each 4 years */ - days += year / 4; + PD_CHECK_OVERFLOW(checked_int64_add(days, year / 4, &days)); /* 1900 is the closest previous year divisible by 100 */ - year += 68; + PD_CHECK_OVERFLOW(checked_int64_add(year, 68, &year)); /* Subtract one day for each 100 years */ - days -= year / 100; + PD_CHECK_OVERFLOW(checked_int64_sub(days, year / 100, &days)); /* 1600 is the closest previous year divisible by 400 */ - year += 300; + PD_CHECK_OVERFLOW(checked_int64_add(year, 300, &year)); /* Add one day for each 400 years */ - days += year / 400; + PD_CHECK_OVERFLOW(checked_int64_add(days, year / 400, &days)); } else { /* * 1972 is the closest later year after 1970. * Include the current year, so subtract 2. 
*/ - year -= 2; + PD_CHECK_OVERFLOW(checked_int64_sub(year, 2, &year)); /* Subtract one day for each 4 years */ - days += year / 4; + PD_CHECK_OVERFLOW(checked_int64_add(days, year / 4, &days)); /* 2000 is the closest later year divisible by 100 */ - year -= 28; + PD_CHECK_OVERFLOW(checked_int64_sub(year, 28, &year)); /* Add one day for each 100 years */ - days -= year / 100; + PD_CHECK_OVERFLOW(checked_int64_sub(days, year / 100, &days)); /* 2000 is also the closest later year divisible by 400 */ /* Subtract one day for each 400 years */ - days += year / 400; + PD_CHECK_OVERFLOW(checked_int64_add(days, year / 400, &days)); } month_lengths = days_per_month_table[is_leapyear(dts->year)]; @@ -180,11 +182,11 @@ npy_int64 get_datetimestruct_days(const npy_datetimestruct *dts) { /* Add the months */ for (i = 0; i < month; ++i) { - days += month_lengths[i]; + PD_CHECK_OVERFLOW(checked_int64_add(days, month_lengths[i], &days)); } /* Add the days */ - days += dts->day - 1; + PD_CHECK_OVERFLOW(checked_int64_add(days, dts->day - 1, &days)); return days; } @@ -429,6 +431,15 @@ npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT base, } const int64_t days = get_datetimestruct_days(dts); + if (days == -1) { + PyGILState_STATE gstate = PyGILState_Ensure(); + bool did_error = PyErr_Occurred() == NULL ? false : true; + PyGILState_Release(gstate); + if (did_error) { + return -1; + } + } + if (base == NPY_FR_D) { return days; } @@ -482,10 +493,20 @@ npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT base, if (base == NPY_FR_ns) { int64_t nanoseconds; - PD_CHECK_OVERFLOW( - scaleMicrosecondsToNanoseconds(microseconds, &nanoseconds)); - PD_CHECK_OVERFLOW( - checked_int64_add(nanoseconds, dts->ps / 1000, &nanoseconds)); + + // Minimum valid timestamp in nanoseconds (1677-09-21 00:12:43.145224193). + const int64_t min_nanoseconds = NPY_MIN_INT64 + 1; + if (microseconds == min_nanoseconds / 1000 - 1) { + // For values within one microsecond of min_nanoseconds, use it as base + // and offset it with nanosecond delta to avoid overflow during scaling. 
+ PD_CHECK_OVERFLOW(checked_int64_add( + min_nanoseconds, (dts->ps - _NS_MIN_DTS.ps) / 1000, &nanoseconds)); + } else { + PD_CHECK_OVERFLOW( + scaleMicrosecondsToNanoseconds(microseconds, &nanoseconds)); + PD_CHECK_OVERFLOW( + checked_int64_add(nanoseconds, dts->ps / 1000, &nanoseconds)); + } return nanoseconds; } @@ -639,11 +660,12 @@ void pandas_datetime_to_datetimestruct(npy_datetime dt, NPY_DATETIMEUNIT base, perday = 24LL * 60 * 60 * 1000 * 1000 * 1000 * 1000; set_datetimestruct_days(extract_unit(&dt, perday), out); - out->hour = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 60 * 60); - out->min = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 60); - out->sec = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000); - out->us = (npy_int32)extract_unit(&dt, 1000LL); - out->ps = (npy_int32)(dt * 1000); + out->hour = + (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * 60 * 60); + out->min = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * 60); + out->sec = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000); + out->us = (npy_int32)extract_unit(&dt, 1000LL * 1000); + out->ps = (npy_int32)(dt); break; case NPY_FR_fs: @@ -699,355 +721,121 @@ void pandas_datetime_to_datetimestruct(npy_datetime dt, NPY_DATETIMEUNIT base, void pandas_timedelta_to_timedeltastruct(npy_timedelta td, NPY_DATETIMEUNIT base, pandas_timedeltastruct *out) { - npy_int64 frac; - npy_int64 sfrac; - npy_int64 ifrac; - int sign; - npy_int64 per_day; - npy_int64 per_sec; - /* Initialize the output to all zeros */ memset(out, 0, sizeof(pandas_timedeltastruct)); - switch (base) { - case NPY_FR_ns: - - per_day = 86400000000000LL; - per_sec = 1000LL * 1000LL * 1000LL; - - // put frac in seconds - if (td < 0 && td % per_sec != 0) - frac = td / per_sec - 1; - else - frac = td / per_sec; - - if (frac < 0) { - sign = -1; - - // even fraction - if ((-frac % 86400LL) != 0) { - out->days = -frac / 86400LL + 1; - frac += 86400LL * out->days; - } else { - frac = -frac; - } - } else { - sign = 1; - out->days = 0; - } - - if (frac >= 86400) { - out->days += frac / 86400LL; - frac -= out->days * 86400LL; - } - - if (frac >= 3600) { - out->hrs = (npy_int32)(frac / 3600LL); - frac -= out->hrs * 3600LL; - } else { - out->hrs = 0; - } - - if (frac >= 60) { - out->min = (npy_int32)(frac / 60LL); - frac -= out->min * 60LL; - } else { - out->min = 0; - } - - if (frac >= 0) { - out->sec = (npy_int32)frac; - frac -= out->sec; - } else { - out->sec = 0; - } + const npy_int64 sec_per_hour = 3600; + const npy_int64 sec_per_min = 60; - sfrac = (out->hrs * 3600LL + out->min * 60LL + out->sec) * per_sec; - - if (sign < 0) - out->days = -out->days; - - ifrac = td - (out->days * per_day + sfrac); - - if (ifrac != 0) { - out->ms = (npy_int32)(ifrac / (1000LL * 1000LL)); - ifrac -= out->ms * 1000LL * 1000LL; - out->us = (npy_int32)(ifrac / 1000LL); - ifrac -= out->us * 1000LL; - out->ns = (npy_int32)ifrac; - } else { - out->ms = 0; - out->us = 0; - out->ns = 0; - } + switch (base) { + case NPY_FR_W: + out->days = 7 * td; break; - - case NPY_FR_us: - - per_day = 86400000000LL; - per_sec = 1000LL * 1000LL; - - // put frac in seconds - if (td < 0 && td % per_sec != 0) - frac = td / per_sec - 1; - else - frac = td / per_sec; - - if (frac < 0) { - sign = -1; - - // even fraction - if ((-frac % 86400LL) != 0) { - out->days = -frac / 86400LL + 1; - frac += 86400LL * out->days; - } else { - frac = -frac; - } - } else { - sign = 1; - out->days = 0; - } - - if (frac >= 86400) { - out->days += frac / 86400LL; - frac -= out->days * 
86400LL; - } - - if (frac >= 3600) { - out->hrs = (npy_int32)(frac / 3600LL); - frac -= out->hrs * 3600LL; - } else { - out->hrs = 0; - } - - if (frac >= 60) { - out->min = (npy_int32)(frac / 60LL); - frac -= out->min * 60LL; - } else { - out->min = 0; - } - - if (frac >= 0) { - out->sec = (npy_int32)frac; - frac -= out->sec; - } else { - out->sec = 0; - } - - sfrac = (out->hrs * 3600LL + out->min * 60LL + out->sec) * per_sec; - - if (sign < 0) - out->days = -out->days; - - ifrac = td - (out->days * per_day + sfrac); - - if (ifrac != 0) { - out->ms = (npy_int32)(ifrac / 1000LL); - ifrac -= out->ms * 1000LL; - out->us = (npy_int32)(ifrac / 1L); - ifrac -= out->us * 1L; - out->ns = (npy_int32)ifrac; - } else { - out->ms = 0; - out->us = 0; - out->ns = 0; - } + case NPY_FR_D: + out->days = td; break; - + case NPY_FR_h: + out->days = td / 24LL; + td -= out->days * 24LL; + out->hrs = (npy_int32)td; + break; + case NPY_FR_m: + out->days = td / 1440LL; + td -= out->days * 1440LL; + out->hrs = (npy_int32)(td / 60LL); + td -= out->hrs * 60LL; + out->min = (npy_int32)td; + break; + case NPY_FR_s: case NPY_FR_ms: - - per_day = 86400000LL; - per_sec = 1000LL; - - // put frac in seconds - if (td < 0 && td % per_sec != 0) - frac = td / per_sec - 1; - else - frac = td / per_sec; - - if (frac < 0) { - sign = -1; - - // even fraction - if ((-frac % 86400LL) != 0) { - out->days = -frac / 86400LL + 1; - frac += 86400LL * out->days; - } else { - frac = -frac; - } - } else { - sign = 1; - out->days = 0; - } - - if (frac >= 86400) { - out->days += frac / 86400LL; - frac -= out->days * 86400LL; - } - - if (frac >= 3600) { - out->hrs = (npy_int32)(frac / 3600LL); - frac -= out->hrs * 3600LL; - } else { - out->hrs = 0; - } - - if (frac >= 60) { - out->min = (npy_int32)(frac / 60LL); - frac -= out->min * 60LL; - } else { - out->min = 0; - } - - if (frac >= 0) { - out->sec = (npy_int32)frac; - frac -= out->sec; - } else { - out->sec = 0; - } - - sfrac = (out->hrs * 3600LL + out->min * 60LL + out->sec) * per_sec; - - if (sign < 0) - out->days = -out->days; - - ifrac = td - (out->days * per_day + sfrac); - - if (ifrac != 0) { - out->ms = (npy_int32)ifrac; - out->us = 0; - out->ns = 0; + case NPY_FR_us: + case NPY_FR_ns: { + const npy_int64 sec_per_day = 86400; + npy_int64 per_sec; + if (base == NPY_FR_s) { + per_sec = 1; + } else if (base == NPY_FR_ms) { + per_sec = 1000; + } else if (base == NPY_FR_us) { + per_sec = 1000000; } else { - out->ms = 0; - out->us = 0; - out->ns = 0; + per_sec = 1000000000; } - break; - - case NPY_FR_s: - // special case where we can simplify many expressions bc per_sec=1 - - per_day = 86400LL; - per_sec = 1L; + const npy_int64 per_day = sec_per_day * per_sec; + npy_int64 frac; // put frac in seconds if (td < 0 && td % per_sec != 0) frac = td / per_sec - 1; else frac = td / per_sec; + const int sign = frac < 0 ? 
-1 : 1; if (frac < 0) { - sign = -1; - // even fraction - if ((-frac % 86400LL) != 0) { - out->days = -frac / 86400LL + 1; - frac += 86400LL * out->days; + if ((-frac % sec_per_day) != 0) { + out->days = -frac / sec_per_day + 1; + frac += sec_per_day * out->days; } else { frac = -frac; } - } else { - sign = 1; - out->days = 0; } - if (frac >= 86400) { - out->days += frac / 86400LL; - frac -= out->days * 86400LL; + if (frac >= sec_per_day) { + out->days += frac / sec_per_day; + frac -= out->days * sec_per_day; } - if (frac >= 3600) { - out->hrs = (npy_int32)(frac / 3600LL); - frac -= out->hrs * 3600LL; - } else { - out->hrs = 0; + if (frac >= sec_per_hour) { + out->hrs = (npy_int32)(frac / sec_per_hour); + frac -= out->hrs * sec_per_hour; } - if (frac >= 60) { - out->min = (npy_int32)(frac / 60LL); - frac -= out->min * 60LL; - } else { - out->min = 0; + if (frac >= sec_per_min) { + out->min = (npy_int32)(frac / sec_per_min); + frac -= out->min * sec_per_min; } if (frac >= 0) { out->sec = (npy_int32)frac; frac -= out->sec; - } else { - out->sec = 0; } - sfrac = (out->hrs * 3600LL + out->min * 60LL + out->sec) * per_sec; - if (sign < 0) out->days = -out->days; - ifrac = td - (out->days * per_day + sfrac); - - if (ifrac != 0) { - out->ms = 0; - out->us = 0; - out->ns = 0; - } else { - out->ms = 0; - out->us = 0; - out->ns = 0; + if (base > NPY_FR_s) { + const npy_int64 sfrac = + (out->hrs * sec_per_hour + out->min * sec_per_min + out->sec) * + per_sec; + + npy_int64 ifrac = td - (out->days * per_day + sfrac); + + if (base == NPY_FR_ms) { + out->ms = (npy_int32)ifrac; + } else if (base == NPY_FR_us) { + out->ms = (npy_int32)(ifrac / 1000LL); + ifrac = ifrac % 1000LL; + out->us = (npy_int32)ifrac; + } else if (base == NPY_FR_ns) { + out->ms = (npy_int32)(ifrac / (1000LL * 1000LL)); + ifrac = ifrac % (1000LL * 1000LL); + out->us = (npy_int32)(ifrac / 1000LL); + ifrac = ifrac % 1000LL; + out->ns = (npy_int32)ifrac; + } } - break; - - case NPY_FR_m: - - out->days = td / 1440LL; - td -= out->days * 1440LL; - out->hrs = (npy_int32)(td / 60LL); - td -= out->hrs * 60LL; - out->min = (npy_int32)td; - - out->sec = 0; - out->ms = 0; - out->us = 0; - out->ns = 0; - break; - - case NPY_FR_h: - out->days = td / 24LL; - td -= out->days * 24LL; - out->hrs = (npy_int32)td; - - out->min = 0; - out->sec = 0; - out->ms = 0; - out->us = 0; - out->ns = 0; - break; - - case NPY_FR_D: - out->days = td; - out->hrs = 0; - out->min = 0; - out->sec = 0; - out->ms = 0; - out->us = 0; - out->ns = 0; - break; - - case NPY_FR_W: - out->days = 7 * td; - out->hrs = 0; - out->min = 0; - out->sec = 0; - out->ms = 0; - out->us = 0; - out->ns = 0; - break; + } break; default: PyErr_SetString(PyExc_RuntimeError, "NumPy timedelta metadata is corrupted with " "invalid base unit"); + break; } - out->seconds = out->hrs * 3600 + out->min * 60 + out->sec; + out->seconds = + (npy_int32)(out->hrs * sec_per_hour + out->min * sec_per_min + out->sec); out->microseconds = out->ms * 1000 + out->us; out->nanoseconds = out->ns; } @@ -1060,5 +848,8 @@ void pandas_timedelta_to_timedeltastruct(npy_timedelta td, */ PyArray_DatetimeMetaData get_datetime_metadata_from_dtype(PyArray_Descr *dtype) { - return (((PyArray_DatetimeDTypeMetaData *)dtype->c_metadata)->meta); +#if NPY_ABI_VERSION < 0x02000000 +#define PyDataType_C_METADATA(dtype) ((dtype)->c_metadata) +#endif + return ((PyArray_DatetimeDTypeMetaData *)PyDataType_C_METADATA(dtype))->meta; } diff --git a/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c 
b/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c index c8d8b5ab6bd6e..1564ecb64b01d 100644 --- a/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c +++ b/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c @@ -920,7 +920,7 @@ Perhaps implement recursion detection */ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t cbName) { const char *value; - char *objName; + const char *objName; int count; JSOBJ iterObj; size_t szlen; diff --git a/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c b/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c index 7cc20a52f1849..ef6f1104a1fb9 100644 --- a/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c +++ b/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c @@ -38,10 +38,11 @@ Numeric decoder derived from TCL library // Licence at LICENSES/ULTRAJSON_LICENSE -#include "pandas/vendored/ujson/lib/ultrajson.h" #define PY_SSIZE_T_CLEAN #include +#include "pandas/vendored/ujson/lib/ultrajson.h" + static int Object_objectAddKey(void *Py_UNUSED(prv), JSOBJ obj, JSOBJ name, JSOBJ value) { int ret = PyDict_SetItem(obj, name, value); diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c index 8bba95dd456de..8342dbcd1763d 100644 --- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c +++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c @@ -53,8 +53,8 @@ Numeric decoder derived from TCL library npy_int64 get_nat(void) { return NPY_MIN_INT64; } -typedef char *(*PFN_PyTypeToUTF8)(JSOBJ obj, JSONTypeContext *ti, - size_t *_outLen); +typedef const char *(*PFN_PyTypeToUTF8)(JSOBJ obj, JSONTypeContext *ti, + size_t *_outLen); int object_is_decimal_type(PyObject *obj); int object_is_dataframe_type(PyObject *obj); @@ -74,7 +74,6 @@ typedef struct __NpyArrContext { npy_intp ndim; npy_intp index[NPY_MAXDIMS]; int type_num; - PyArray_GetItemFunc *getitem; char **rowLabels; char **columnLabels; @@ -107,7 +106,7 @@ typedef struct __TypeContext { double doubleValue; JSINT64 longValue; - char *cStr; + const char *cStr; NpyArrContext *npyarr; PdBlockContext *pdblock; int transpose; @@ -302,14 +301,15 @@ static npy_float64 total_seconds(PyObject *td) { return double_val; } -static char *PyBytesToUTF8(JSOBJ _obj, JSONTypeContext *Py_UNUSED(tc), - size_t *_outLen) { +static const char *PyBytesToUTF8(JSOBJ _obj, JSONTypeContext *Py_UNUSED(tc), + size_t *_outLen) { PyObject *obj = (PyObject *)_obj; *_outLen = PyBytes_GET_SIZE(obj); return PyBytes_AS_STRING(obj); } -static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, size_t *_outLen) { +static const char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, + size_t *_outLen) { char *encoded = (char *)PyUnicode_AsUTF8AndSize(_obj, (Py_ssize_t *)_outLen); if (encoded == NULL) { /* Something went wrong. @@ -322,8 +322,8 @@ static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, size_t *_outLen) { } /* JSON callback. returns a char* and mutates the pointer to *len */ -static char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused), - JSONTypeContext *tc, size_t *len) { +static const char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused), + JSONTypeContext *tc, size_t *len) { NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; NPY_DATETIMEUNIT valueUnit = ((PyObjectEncoder *)tc->encoder)->valueUnit; GET_TC(tc)->cStr = int64ToIso(GET_TC(tc)->longValue, valueUnit, base, len); @@ -331,15 +331,15 @@ static char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused), } /* JSON callback. 
returns a char* and mutates the pointer to *len */ -static char *NpyTimeDeltaToIsoCallback(JSOBJ Py_UNUSED(unused), - JSONTypeContext *tc, size_t *len) { +static const char *NpyTimeDeltaToIsoCallback(JSOBJ Py_UNUSED(unused), + JSONTypeContext *tc, size_t *len) { GET_TC(tc)->cStr = int64ToIsoDuration(GET_TC(tc)->longValue, len); return GET_TC(tc)->cStr; } /* JSON callback */ -static char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc, - size_t *len) { +static const char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc, + size_t *len) { if (!PyDate_Check(obj) && !PyDateTime_Check(obj)) { PyErr_SetString(PyExc_TypeError, "Expected date or datetime object"); ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; @@ -350,7 +350,8 @@ static char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc, return PyDateTimeToIso(obj, base, len); } -static char *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, size_t *outLen) { +static const char *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, + size_t *outLen) { PyObject *obj = (PyObject *)_obj; PyObject *str = PyObject_CallMethod(obj, "isoformat", NULL); if (str == NULL) { @@ -374,6 +375,27 @@ static char *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, size_t *outLen) { return outValue; } +static const char *PyDecimalToUTF8Callback(JSOBJ _obj, JSONTypeContext *tc, + size_t *len) { + PyObject *obj = (PyObject *)_obj; + PyObject *format_spec = PyUnicode_FromStringAndSize("f", 1); + PyObject *str = PyObject_Format(obj, format_spec); + Py_DECREF(format_spec); + + if (str == NULL) { + ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; + return NULL; + } + + GET_TC(tc)->newObj = str; + + Py_ssize_t s_len; + char *outValue = (char *)PyUnicode_AsUTF8AndSize(str, &s_len); + *len = s_len; + + return outValue; +} + //============================================================================= // Numpy array iteration functions //============================================================================= @@ -405,15 +427,14 @@ static void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { } npyarr->array = (PyObject *)obj; - npyarr->getitem = (PyArray_GetItemFunc *)PyArray_DESCR(obj)->f->getitem; npyarr->dataptr = PyArray_DATA(obj); npyarr->ndim = PyArray_NDIM(obj) - 1; npyarr->curdim = 0; npyarr->type_num = PyArray_DESCR(obj)->type_num; if (GET_TC(tc)->transpose) { - npyarr->dim = PyArray_DIM(obj, npyarr->ndim); - npyarr->stride = PyArray_STRIDE(obj, npyarr->ndim); + npyarr->dim = PyArray_DIM(obj, (int)npyarr->ndim); + npyarr->stride = PyArray_STRIDE(obj, (int)npyarr->ndim); npyarr->stridedim = npyarr->ndim; npyarr->index[npyarr->ndim] = 0; npyarr->inc = -1; @@ -447,8 +468,15 @@ static void NpyArrPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) { npyarr->curdim--; npyarr->dataptr -= npyarr->stride * npyarr->index[npyarr->stridedim]; npyarr->stridedim -= npyarr->inc; - npyarr->dim = PyArray_DIM(npyarr->array, npyarr->stridedim); - npyarr->stride = PyArray_STRIDE(npyarr->array, npyarr->stridedim); + + if (!PyArray_Check(npyarr->array)) { + PyErr_SetString(PyExc_TypeError, + "NpyArrayPassThru_iterEnd received a non-array object"); + return; + } + const PyArrayObject *arrayobj = (const PyArrayObject *)npyarr->array; + npyarr->dim = PyArray_DIM(arrayobj, (int)npyarr->stridedim); + npyarr->stride = PyArray_STRIDE(arrayobj, (int)npyarr->stridedim); npyarr->dataptr += npyarr->stride; NpyArr_freeItemValue(obj, tc); @@ -467,18 +495,25 @@ static int NpyArr_iterNextItem(JSOBJ obj, JSONTypeContext *tc) { NpyArr_freeItemValue(obj, tc); - if 
(PyArray_ISDATETIME(npyarr->array)) { + if (!PyArray_Check(npyarr->array)) { + PyErr_SetString(PyExc_TypeError, + "NpyArr_iterNextItem received a non-array object"); + return 0; + } + PyArrayObject *arrayobj = (PyArrayObject *)npyarr->array; + + if (PyArray_ISDATETIME(arrayobj)) { GET_TC(tc)->itemValue = obj; Py_INCREF(obj); - ((PyObjectEncoder *)tc->encoder)->npyType = PyArray_TYPE(npyarr->array); + ((PyObjectEncoder *)tc->encoder)->npyType = PyArray_TYPE(arrayobj); // Also write the resolution (unit) of the ndarray - PyArray_Descr *dtype = PyArray_DESCR(npyarr->array); + PyArray_Descr *dtype = PyArray_DESCR(arrayobj); ((PyObjectEncoder *)tc->encoder)->valueUnit = get_datetime_metadata_from_dtype(dtype).base; ((PyObjectEncoder *)tc->encoder)->npyValue = npyarr->dataptr; ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr; } else { - GET_TC(tc)->itemValue = npyarr->getitem(npyarr->dataptr, npyarr->array); + GET_TC(tc)->itemValue = PyArray_GETITEM(arrayobj, npyarr->dataptr); } npyarr->dataptr += npyarr->stride; @@ -505,8 +540,15 @@ static int NpyArr_iterNext(JSOBJ _obj, JSONTypeContext *tc) { npyarr->curdim++; npyarr->stridedim += npyarr->inc; - npyarr->dim = PyArray_DIM(npyarr->array, npyarr->stridedim); - npyarr->stride = PyArray_STRIDE(npyarr->array, npyarr->stridedim); + if (!PyArray_Check(npyarr->array)) { + PyErr_SetString(PyExc_TypeError, + "NpyArr_iterNext received a non-array object"); + return 0; + } + const PyArrayObject *arrayobj = (const PyArrayObject *)npyarr->array; + + npyarr->dim = PyArray_DIM(arrayobj, (int)npyarr->stridedim); + npyarr->stride = PyArray_STRIDE(arrayobj, (int)npyarr->stridedim); npyarr->index[npyarr->stridedim] = 0; ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr; @@ -518,10 +560,10 @@ static JSOBJ NpyArr_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *NpyArr_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { +static const char *NpyArr_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { NpyArrContext *npyarr = GET_TC(tc)->npyarr; - char *cStr; + const char *cStr; if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) { const npy_intp idx = npyarr->index[npyarr->stridedim] - 1; @@ -569,11 +611,11 @@ static int PdBlock_iterNextItem(JSOBJ obj, JSONTypeContext *tc) { return NpyArr_iterNextItem(obj, tc); } -static char *PdBlock_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { +static const char *PdBlock_iterGetName(JSOBJ Py_UNUSED(obj), + JSONTypeContext *tc, size_t *outLen) { PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; NpyArrContext *npyarr = blkCtxt->npyCtxts[0]; - char *cStr; + const char *cStr; if (GET_TC(tc)->iterNext == PdBlock_iterNextItem) { const npy_intp idx = blkCtxt->colIdx - 1; @@ -591,12 +633,12 @@ static char *PdBlock_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, return cStr; } -static char *PdBlock_iterGetName_Transpose(JSOBJ Py_UNUSED(obj), - JSONTypeContext *tc, - size_t *outLen) { +static const char *PdBlock_iterGetName_Transpose(JSOBJ Py_UNUSED(obj), + JSONTypeContext *tc, + size_t *outLen) { PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; NpyArrContext *npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx]; - char *cStr; + const char *cStr; if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) { const npy_intp idx = npyarr->index[npyarr->stridedim] - 1; @@ -777,9 +819,9 @@ static JSOBJ Tuple_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char 
*Tuple_iterGetName(JSOBJ Py_UNUSED(obj), - JSONTypeContext *Py_UNUSED(tc), - size_t *Py_UNUSED(outLen)) { +static const char *Tuple_iterGetName(JSOBJ Py_UNUSED(obj), + JSONTypeContext *Py_UNUSED(tc), + size_t *Py_UNUSED(outLen)) { return NULL; } @@ -824,9 +866,9 @@ static JSOBJ Set_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *Set_iterGetName(JSOBJ Py_UNUSED(obj), - JSONTypeContext *Py_UNUSED(tc), - size_t *Py_UNUSED(outLen)) { +static const char *Set_iterGetName(JSOBJ Py_UNUSED(obj), + JSONTypeContext *Py_UNUSED(tc), + size_t *Py_UNUSED(outLen)) { return NULL; } @@ -922,8 +964,8 @@ static JSOBJ Dir_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *Dir_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { +static const char *Dir_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { *outLen = PyBytes_GET_SIZE(GET_TC(tc)->itemName); return PyBytes_AS_STRING(GET_TC(tc)->itemName); } @@ -954,9 +996,9 @@ static JSOBJ List_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *List_iterGetName(JSOBJ Py_UNUSED(obj), - JSONTypeContext *Py_UNUSED(tc), - size_t *Py_UNUSED(outLen)) { +static const char *List_iterGetName(JSOBJ Py_UNUSED(obj), + JSONTypeContext *Py_UNUSED(tc), + size_t *Py_UNUSED(outLen)) { return NULL; } @@ -965,24 +1007,16 @@ static char *List_iterGetName(JSOBJ Py_UNUSED(obj), //============================================================================= static void Index_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { GET_TC(tc)->index = 0; - GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); - if (!GET_TC(tc)->cStr) { - PyErr_NoMemory(); - } } static int Index_iterNext(JSOBJ obj, JSONTypeContext *tc) { - if (!GET_TC(tc)->cStr) { - return 0; - } - const Py_ssize_t index = GET_TC(tc)->index; Py_XDECREF(GET_TC(tc)->itemValue); if (index == 0) { - memcpy(GET_TC(tc)->cStr, "name", sizeof(char) * 5); + GET_TC(tc)->cStr = "name"; GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name"); } else if (index == 1) { - memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5); + GET_TC(tc)->cStr = "data"; GET_TC(tc)->itemValue = get_values(obj); if (!GET_TC(tc)->itemValue) { return 0; @@ -1002,8 +1036,8 @@ static JSOBJ Index_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *Index_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { +static const char *Index_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { *outLen = strlen(GET_TC(tc)->cStr); return GET_TC(tc)->cStr; } @@ -1014,28 +1048,20 @@ static char *Index_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, static void Series_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; GET_TC(tc)->index = 0; - GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); enc->outputFormat = VALUES; // for contained series - if (!GET_TC(tc)->cStr) { - PyErr_NoMemory(); - } } static int Series_iterNext(JSOBJ obj, JSONTypeContext *tc) { - if (!GET_TC(tc)->cStr) { - return 0; - } - const Py_ssize_t index = GET_TC(tc)->index; Py_XDECREF(GET_TC(tc)->itemValue); if (index == 0) { - memcpy(GET_TC(tc)->cStr, "name", sizeof(char) * 5); + GET_TC(tc)->cStr = "name"; GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name"); } else if (index == 1) { - memcpy(GET_TC(tc)->cStr, "index", sizeof(char) * 
6); + GET_TC(tc)->cStr = "index"; GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index"); } else if (index == 2) { - memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5); + GET_TC(tc)->cStr = "data"; GET_TC(tc)->itemValue = get_values(obj); if (!GET_TC(tc)->itemValue) { return 0; @@ -1057,8 +1083,8 @@ static JSOBJ Series_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *Series_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { +static const char *Series_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { *outLen = strlen(GET_TC(tc)->cStr); return GET_TC(tc)->cStr; } @@ -1069,28 +1095,20 @@ static char *Series_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, static void DataFrame_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; GET_TC(tc)->index = 0; - GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); enc->outputFormat = VALUES; // for contained series & index - if (!GET_TC(tc)->cStr) { - PyErr_NoMemory(); - } } static int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { - if (!GET_TC(tc)->cStr) { - return 0; - } - const Py_ssize_t index = GET_TC(tc)->index; Py_XDECREF(GET_TC(tc)->itemValue); if (index == 0) { - memcpy(GET_TC(tc)->cStr, "columns", sizeof(char) * 8); + GET_TC(tc)->cStr = "columns"; GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "columns"); } else if (index == 1) { - memcpy(GET_TC(tc)->cStr, "index", sizeof(char) * 6); + GET_TC(tc)->cStr = "index"; GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index"); } else if (index == 2) { - memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5); + GET_TC(tc)->cStr = "data"; Py_INCREF(obj); GET_TC(tc)->itemValue = obj; } else { @@ -1110,8 +1128,8 @@ static JSOBJ DataFrame_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *DataFrame_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { +static const char *DataFrame_iterGetName(JSOBJ Py_UNUSED(obj), + JSONTypeContext *tc, size_t *outLen) { *outLen = strlen(GET_TC(tc)->cStr); return GET_TC(tc)->cStr; } @@ -1161,8 +1179,8 @@ static JSOBJ Dict_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *Dict_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { +static const char *Dict_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { *outLen = PyBytes_GET_SIZE(GET_TC(tc)->itemName); return PyBytes_AS_STRING(GET_TC(tc)->itemName); } @@ -1448,8 +1466,18 @@ static void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { tc->type = JT_UTF8; return; } else if (object_is_decimal_type(obj)) { - pc->doubleValue = PyFloat_AsDouble(obj); - tc->type = JT_DOUBLE; + PyObject *is_nan_py = PyObject_RichCompare(obj, obj, Py_NE); + if (is_nan_py == NULL) { + goto INVALID; + } + int is_nan = (is_nan_py == Py_True); + Py_DECREF(is_nan_py); + if (is_nan) { + tc->type = JT_NULL; + return; + } + pc->PyTypeToUTF8 = PyDecimalToUTF8Callback; + tc->type = JT_UTF8; return; } else if (PyDateTime_Check(obj) || PyDate_Check(obj)) { if (object_is_nat_type(obj)) { @@ -1610,7 +1638,14 @@ static void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { if (!values) { goto INVALID; } - pc->columnLabelsLen = PyArray_DIM(pc->newObj, 0); + + if (!PyArray_Check(pc->newObj)) { + PyErr_SetString(PyExc_TypeError, + "Object_beginTypeContext received a non-array object"); + 
goto INVALID; + } + const PyArrayObject *arrayobj = (const PyArrayObject *)pc->newObj; + pc->columnLabelsLen = PyArray_DIM(arrayobj, 0); pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, pc->columnLabelsLen); if (!pc->columnLabels) { @@ -1845,7 +1880,6 @@ static void Object_endTypeContext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { GET_TC(tc)->rowLabels = NULL; NpyArr_freeLabels(GET_TC(tc)->columnLabels, GET_TC(tc)->columnLabelsLen); GET_TC(tc)->columnLabels = NULL; - PyObject_Free(GET_TC(tc)->cStr); GET_TC(tc)->cStr = NULL; PyObject_Free(tc->prv); tc->prv = NULL; @@ -1896,8 +1930,8 @@ static JSOBJ Object_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { return GET_TC(tc)->iterGetValue(obj, tc); } -static char *Object_iterGetName(JSOBJ obj, JSONTypeContext *tc, - size_t *outLen) { +static const char *Object_iterGetName(JSOBJ obj, JSONTypeContext *tc, + size_t *outLen) { return GET_TC(tc)->iterGetName(obj, tc, outLen); } diff --git a/pandas/_libs/src/vendored/ujson/python/ujson.c b/pandas/_libs/src/vendored/ujson/python/ujson.c index 075411a23b075..2ee084b9304f4 100644 --- a/pandas/_libs/src/vendored/ujson/python/ujson.c +++ b/pandas/_libs/src/vendored/ujson/python/ujson.c @@ -40,6 +40,7 @@ Numeric decoder derived from TCL library #define PY_SSIZE_T_CLEAN #include + #define PY_ARRAY_UNIQUE_SYMBOL UJSON_NUMPY #include "numpy/arrayobject.h" @@ -384,6 +385,10 @@ PyMODINIT_FUNC PyInit_json(void) { return NULL; } +#ifdef Py_GIL_DISABLED + PyUnstable_Module_SetGIL(module, Py_MOD_GIL_NOT_USED); +#endif + #ifndef PYPY_VERSION PyObject *mod_decimal = PyImport_ImportModule("decimal"); if (mod_decimal) { diff --git a/pandas/_libs/testing.pyi b/pandas/_libs/testing.pyi index 01da496975f51..4758483b3b5e7 100644 --- a/pandas/_libs/testing.pyi +++ b/pandas/_libs/testing.pyi @@ -1,4 +1,6 @@ -def assert_dict_equal(a, b, compare_keys: bool = ...): ... +from collections.abc import Mapping + +def assert_dict_equal(a: Mapping, b: Mapping, compare_keys: bool = ...) -> bool: ... def assert_almost_equal( a, b, @@ -9,4 +11,4 @@ def assert_almost_equal( lobj=..., robj=..., index_values=..., -): ... +) -> bool: ... diff --git a/pandas/_libs/testing.pyx b/pandas/_libs/testing.pyx index aed0f4b082d4e..cfd31fa610e69 100644 --- a/pandas/_libs/testing.pyx +++ b/pandas/_libs/testing.pyx @@ -1,6 +1,5 @@ import cmath import math -import warnings import numpy as np @@ -18,7 +17,6 @@ from pandas._libs.util cimport ( is_real_number_object, ) -from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.missing import array_equivalent @@ -188,15 +186,7 @@ cpdef assert_almost_equal(a, b, return True elif checknull(b): # GH#18463 - warnings.warn( - f"Mismatched null-like values {a} and {b} found. In a future " - "version, pandas equality-testing functions " - "(e.g. assert_frame_equal) will consider these not-matching " - "and raise.", - FutureWarning, - stacklevel=find_stack_level(), - ) - return True + raise AssertionError(f"Mismatched null-like values {a} != {b}") raise AssertionError(f"{a} != {b}") elif checknull(b): raise AssertionError(f"{a} != {b}") diff --git a/pandas/_libs/tslib.pyi b/pandas/_libs/tslib.pyi index 5a340c1d88bc4..7e3372a80db9d 100644 --- a/pandas/_libs/tslib.pyi +++ b/pandas/_libs/tslib.pyi @@ -11,11 +11,6 @@ def format_array_from_datetime( na_rep: str | float = ..., reso: int = ..., # NPY_DATETIMEUNIT ) -> npt.NDArray[np.object_]: ... 
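The stub removed just below tracks the Cython change further down: the dedicated numeric-with-unit converter is folded into array_to_datetime via the new unit_for_numerics argument, and the deprecated path that coerced strings through float(val) when a unit was given is dropped rather than warned about. At the user level this is the pd.to_datetime(..., unit=...) code path; a rough sketch of the surviving behavior:

    import pandas as pd

    # Numeric values are interpreted in the given unit:
    pd.to_datetime([1_700_000_000, 1_700_000_060], unit="s")
    # DatetimeIndex(['2023-11-14 22:13:20', '2023-11-14 22:14:20'], ...)

    # Strings are no longer silently coerced via float() when unit= is passed;
    # cast to a numeric dtype explicitly if that behavior is wanted:
    pd.to_datetime(pd.Series(["1700000000"]).astype("int64"), unit="s")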
-def array_with_unit_to_datetime( - values: npt.NDArray[np.object_], - unit: str, - errors: str = ..., -) -> tuple[np.ndarray, tzinfo | None]: ... def first_non_null(values: np.ndarray) -> int: ... def array_to_datetime( values: npt.NDArray[np.object_], @@ -24,6 +19,7 @@ def array_to_datetime( yearfirst: bool = ..., utc: bool = ..., creso: int = ..., + unit_for_numerics: str | None = ..., ) -> tuple[np.ndarray, tzinfo | None]: ... # returned ndarray may be object dtype or datetime64[ns] diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 017fdc4bc834f..3c5854602df53 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -1,7 +1,3 @@ -import warnings - -from pandas.util._exceptions import find_stack_level - cimport cython from datetime import timezone @@ -9,7 +5,6 @@ from datetime import timezone from cpython.datetime cimport ( PyDate_Check, PyDateTime_Check, - datetime, import_datetime, timedelta, tzinfo, @@ -68,14 +63,15 @@ from pandas._libs.tslibs.conversion cimport ( get_datetime64_nanos, parse_pydatetime, ) -from pandas._libs.tslibs.dtypes cimport npy_unit_to_abbrev +from pandas._libs.tslibs.dtypes cimport ( + get_supported_reso, + npy_unit_to_abbrev, +) from pandas._libs.tslibs.nattype cimport ( NPY_NAT, - c_NaT as NaT, c_nat_strings as nat_strings, ) from pandas._libs.tslibs.timestamps cimport _Timestamp -from pandas._libs.tslibs.timezones cimport tz_compare from pandas._libs.tslibs import ( Resolution, @@ -237,156 +233,6 @@ def format_array_from_datetime( return result -def array_with_unit_to_datetime( - ndarray[object] values, - str unit, - str errors="coerce" -): - """ - Convert the ndarray to datetime according to the time unit. - - This function converts an array of objects into a numpy array of - datetime64[ns]. It returns the converted array - and also returns the timezone offset - - if errors: - - raise: return converted values or raise OutOfBoundsDatetime - if out of range on the conversion or - ValueError for other conversions (e.g. a string) - - ignore: return non-convertible values as the same unit - - coerce: NaT for non-convertibles - - Parameters - ---------- - values : ndarray - Date-like objects to convert. - unit : str - Time unit to use during conversion. - errors : str, default 'raise' - Error behavior when parsing. - - Returns - ------- - result : ndarray of m8 values - tz : parsed timezone offset or None - """ - cdef: - Py_ssize_t i, n=len(values) - bint is_ignore = errors == "ignore" - bint is_coerce = errors == "coerce" - bint is_raise = errors == "raise" - ndarray[int64_t] iresult - tzinfo tz = None - float fval - - assert is_ignore or is_coerce or is_raise - - if unit == "ns": - result, tz = array_to_datetime( - values.astype(object, copy=False), - errors=errors, - creso=NPY_FR_ns, - ) - return result, tz - - result = np.empty(n, dtype="M8[ns]") - iresult = result.view("i8") - - for i in range(n): - val = values[i] - - try: - if checknull_with_nat_and_na(val): - iresult[i] = NPY_NAT - - elif is_integer_object(val) or is_float_object(val): - - if val != val or val == NPY_NAT: - iresult[i] = NPY_NAT - else: - iresult[i] = cast_from_unit(val, unit) - - elif isinstance(val, str): - if len(val) == 0 or val in nat_strings: - iresult[i] = NPY_NAT - - else: - - try: - fval = float(val) - except ValueError: - raise ValueError( - f"non convertible value {val} with the unit '{unit}'" - ) - warnings.warn( - "The behavior of 'to_datetime' with 'unit' when parsing " - "strings is deprecated. 
In a future version, strings will " - "be parsed as datetime strings, matching the behavior " - "without a 'unit'. To retain the old behavior, explicitly " - "cast ints or floats to numeric type before calling " - "to_datetime.", - FutureWarning, - stacklevel=find_stack_level(), - ) - - iresult[i] = cast_from_unit(fval, unit) - - else: - # TODO: makes more sense as TypeError, but that would be an - # API change. - raise ValueError( - f"unit='{unit}' not valid with non-numerical val='{val}'" - ) - - except (ValueError, TypeError) as err: - if is_raise: - err.args = (f"{err}, at position {i}",) - raise - elif is_ignore: - # we have hit an exception - # and are in ignore mode - # redo as object - return _array_with_unit_to_datetime_object_fallback(values, unit) - else: - # is_coerce - iresult[i] = NPY_NAT - - return result, tz - - -cdef _array_with_unit_to_datetime_object_fallback(ndarray[object] values, str unit): - cdef: - Py_ssize_t i, n = len(values) - ndarray[object] oresult - tzinfo tz = None - - # TODO: fix subtle differences between this and no-unit code - oresult = cnp.PyArray_EMPTY(values.ndim, values.shape, cnp.NPY_OBJECT, 0) - for i in range(n): - val = values[i] - - if checknull_with_nat_and_na(val): - oresult[i] = NaT - elif is_integer_object(val) or is_float_object(val): - - if val != val or val == NPY_NAT: - oresult[i] = NaT - else: - try: - oresult[i] = Timestamp(val, unit=unit) - except OutOfBoundsDatetime: - oresult[i] = val - - elif isinstance(val, str): - if len(val) == 0 or val in nat_strings: - oresult[i] = NaT - - else: - oresult[i] = val - - return oresult, tz - - @cython.wraparound(False) @cython.boundscheck(False) def first_non_null(values: ndarray) -> int: @@ -417,7 +263,8 @@ cpdef array_to_datetime( bint dayfirst=False, bint yearfirst=False, bint utc=False, - NPY_DATETIMEUNIT creso=NPY_FR_ns, + NPY_DATETIMEUNIT creso=NPY_DATETIMEUNIT.NPY_FR_GENERIC, + str unit_for_numerics=None, ): """ Converts a 1D array of date-like values to a numpy array of either: @@ -444,8 +291,9 @@ cpdef array_to_datetime( yearfirst parsing behavior when encountering datetime strings utc : bool, default False indicator whether the dates should be UTC - creso : NPY_DATETIMEUNIT, default NPY_FR_ns - Set to NPY_FR_GENERIC to infer a resolution. + creso : NPY_DATETIMEUNIT, default NPY_FR_GENERIC + If NPY_FR_GENERIC, conduct inference. 
+ unit_for_numerics : str, default "ns" Returns ------- @@ -459,14 +307,9 @@ cpdef array_to_datetime( ndarray[int64_t] iresult npy_datetimestruct dts bint utc_convert = bool(utc) - bint seen_datetime_offset = False bint is_raise = errors == "raise" - bint is_ignore = errors == "ignore" bint is_coerce = errors == "coerce" - bint is_same_offsets _TSObject tsobj - float tz_offset - set out_tzoffset_vals = set() tzinfo tz, tz_out = None cnp.flatiter it = cnp.PyArray_IterNew(values) NPY_DATETIMEUNIT item_reso @@ -475,12 +318,19 @@ cpdef array_to_datetime( str abbrev # specify error conditions - assert is_raise or is_ignore or is_coerce + assert is_raise or is_coerce if infer_reso: abbrev = "ns" else: abbrev = npy_unit_to_abbrev(creso) + + if unit_for_numerics is not None: + # either creso or unit_for_numerics should be passed, not both + assert creso == NPY_FR_ns + else: + unit_for_numerics = abbrev + result = np.empty((values).shape, dtype=f"M8[{abbrev}]") iresult = result.view("i8").ravel() @@ -532,7 +382,8 @@ cpdef array_to_datetime( creso = state.creso # we now need to parse this as if unit=abbrev - iresult[i] = cast_from_unit(val, abbrev, out_reso=creso) + iresult[i] = cast_from_unit(val, unit_for_numerics, out_reso=creso) + state.found_other = True elif isinstance(val, str): @@ -541,7 +392,7 @@ cpdef array_to_datetime( # GH#32264 np.str_ object val = str(val) - if parse_today_now(val, &iresult[i], utc, creso): + if parse_today_now(val, &iresult[i], utc, creso, infer_reso=infer_reso): # We can't _quite_ dispatch this to convert_str_to_tsobject # bc there isn't a nice way to pass "utc" item_reso = NPY_DATETIMEUNIT.NPY_FR_us @@ -576,19 +427,19 @@ cpdef array_to_datetime( # dateutil timezone objects cannot be hashed, so # store the UTC offsets in seconds instead nsecs = tz.utcoffset(None).total_seconds() - out_tzoffset_vals.add(nsecs) - seen_datetime_offset = True + state.out_tzoffset_vals.add(nsecs) + state.found_aware_str = True else: # Add a marker for naive string, to track if we are # parsing mixed naive and aware strings - out_tzoffset_vals.add("naive") + state.out_tzoffset_vals.add("naive") state.found_naive_str = True else: raise TypeError(f"{type(val)} is not convertible to datetime") except (TypeError, OverflowError, ValueError) as ex: - ex.args = (f"{ex}, at position {i}",) + ex.args = (f"{ex}",) if is_coerce: iresult[i] = NPY_NAT continue @@ -596,38 +447,7 @@ cpdef array_to_datetime( raise return values, None - if seen_datetime_offset and not utc_convert: - # GH#17697 - # 1) If all the offsets are equal, return one offset for - # the parsed dates to (maybe) pass to DatetimeIndex - # 2) If the offsets are different, then force the parsing down the - # object path where an array of datetimes - # (with individual dateutil.tzoffsets) are returned - is_same_offsets = len(out_tzoffset_vals) == 1 - if not is_same_offsets: - return _array_to_datetime_object(values, errors, dayfirst, yearfirst) - elif state.found_naive or state.found_other: - # e.g. test_to_datetime_mixed_awareness_mixed_types - raise ValueError("Cannot mix tz-aware with tz-naive values") - elif tz_out is not None: - # GH#55693 - tz_offset = out_tzoffset_vals.pop() - tz_out2 = timezone(timedelta(seconds=tz_offset)) - if not tz_compare(tz_out, tz_out2): - # e.g. test_to_datetime_mixed_tzs_mixed_types - raise ValueError( - "Mixed timezones detected. pass utc=True in to_datetime " - "or tz='UTC' in DatetimeIndex to convert to a common timezone." - ) - # e.g. 
test_to_datetime_mixed_types_matching_tzs - else: - tz_offset = out_tzoffset_vals.pop() - tz_out = timezone(timedelta(seconds=tz_offset)) - elif not utc_convert: - if tz_out and (state.found_other or state.found_naive_str): - # found_other indicates a tz-naive int, float, dt64, or date - # e.g. test_to_datetime_mixed_awareness_mixed_types - raise ValueError("Cannot mix tz-aware with tz-naive values") + tz_out = state.check_for_mixed_inputs(tz_out, utc) if infer_reso: if state.creso_ever_changed: @@ -654,116 +474,6 @@ cpdef array_to_datetime( return result, tz_out -@cython.wraparound(False) -@cython.boundscheck(False) -cdef _array_to_datetime_object( - ndarray[object] values, - str errors, - bint dayfirst=False, - bint yearfirst=False, -): - """ - Fall back function for array_to_datetime - - Attempts to parse datetime strings with dateutil to return an array - of datetime objects - - Parameters - ---------- - values : ndarray[object] - date-like objects to convert - errors : str - error behavior when parsing - dayfirst : bool, default False - dayfirst parsing behavior when encountering datetime strings - yearfirst : bool, default False - yearfirst parsing behavior when encountering datetime strings - - Returns - ------- - np.ndarray[object] - Literal[None] - """ - cdef: - Py_ssize_t i, n = values.size - object val - bint is_ignore = errors == "ignore" - bint is_coerce = errors == "coerce" - bint is_raise = errors == "raise" - ndarray oresult_nd - ndarray[object] oresult - npy_datetimestruct dts - cnp.broadcast mi - _TSObject tsobj - - assert is_raise or is_ignore or is_coerce - - oresult_nd = cnp.PyArray_EMPTY(values.ndim, values.shape, cnp.NPY_OBJECT, 0) - mi = cnp.PyArray_MultiIterNew2(oresult_nd, values) - oresult = oresult_nd.ravel() - - # We return an object array and only attempt to parse: - # 1) NaT or NaT-like values - # 2) datetime strings, which we return as datetime.datetime - # 3) special strings - "now" & "today" - for i in range(n): - # Analogous to: val = values[i] - val = (cnp.PyArray_MultiIter_DATA(mi, 1))[0] - - if checknull_with_nat_and_na(val) or PyDateTime_Check(val): - # GH 25978. No need to parse NaT-like or datetime-like vals - oresult[i] = val - elif isinstance(val, str): - if type(val) is not str: - # GH#32264 np.str_ objects - val = str(val) - - if len(val) == 0 or val in nat_strings: - oresult[i] = "NaT" - cnp.PyArray_MultiIter_NEXT(mi) - continue - - try: - tsobj = convert_str_to_tsobject( - val, None, dayfirst=dayfirst, yearfirst=yearfirst - ) - tsobj.ensure_reso(NPY_FR_ns, val) - - dts = tsobj.dts - oresult[i] = datetime( - dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, - tzinfo=tsobj.tzinfo, - fold=tsobj.fold, - ) - - except (ValueError, OverflowError) as ex: - ex.args = (f"{ex}, at position {i}", ) - if is_coerce: - oresult[i] = NaT - cnp.PyArray_MultiIter_NEXT(mi) - continue - if is_raise: - raise - return values, None - else: - if is_raise: - raise - return values, None - - cnp.PyArray_MultiIter_NEXT(mi) - - warnings.warn( - "In a future version of pandas, parsing datetimes with mixed time " - "zones will raise an error unless `utc=True`. " - "Please specify `utc=True` to opt in to the new behaviour " - "and silence this warning. 
To create a `Series` with mixed offsets and " - "`object` dtype, please use `apply` and `datetime.datetime.strptime`", - FutureWarning, - stacklevel=find_stack_level(), - ) - return oresult_nd, None - - def array_to_datetime_with_tz( ndarray values, tzinfo tz, bint dayfirst, bint yearfirst, NPY_DATETIMEUNIT creso ): @@ -826,7 +536,9 @@ def array_to_datetime_with_tz( if state.creso_ever_changed: # We encountered mismatched resolutions, need to re-parse with # the correct one. - return array_to_datetime_with_tz(values, tz=tz, creso=creso) + return array_to_datetime_with_tz( + values, tz=tz, dayfirst=dayfirst, yearfirst=yearfirst, creso=creso + ) elif creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC: # i.e. we never encountered anything non-NaT, default to "s". This # ensures that insert and concat-like operations with NaT diff --git a/pandas/_libs/tslibs/__init__.py b/pandas/_libs/tslibs/__init__.py index 88a9a259ac8ec..f433a3acf356f 100644 --- a/pandas/_libs/tslibs/__init__.py +++ b/pandas/_libs/tslibs/__init__.py @@ -1,42 +1,42 @@ __all__ = [ - "dtypes", - "localize_pydatetime", + "BaseOffset", + "IncompatibleFrequency", "NaT", "NaTType", - "iNaT", - "nat_strings", "OutOfBoundsDatetime", "OutOfBoundsTimedelta", - "IncompatibleFrequency", "Period", "Resolution", + "Tick", "Timedelta", - "normalize_i8_timestamps", - "is_date_array_normalized", - "dt64arr_to_periodarr", + "Timestamp", + "add_overflowsafe", + "astype_overflowsafe", "delta_to_nanoseconds", + "dt64arr_to_periodarr", + "dtypes", + "get_resolution", + "get_supported_dtype", + "get_unit_from_dtype", + "guess_datetime_format", + "iNaT", "ints_to_pydatetime", "ints_to_pytimedelta", - "get_resolution", - "Timestamp", - "tz_convert_from_utc_single", - "tz_convert_from_utc", - "to_offset", - "Tick", - "BaseOffset", - "tz_compare", + "is_date_array_normalized", + "is_supported_dtype", "is_unitless", - "astype_overflowsafe", - "get_unit_from_dtype", + "localize_pydatetime", + "nat_strings", + "normalize_i8_timestamps", "periods_per_day", "periods_per_second", - "guess_datetime_format", - "add_overflowsafe", - "get_supported_dtype", - "is_supported_dtype", + "to_offset", + "tz_compare", + "tz_convert_from_utc", + "tz_convert_from_utc_single", ] -from pandas._libs.tslibs import dtypes # pylint: disable=import-self +from pandas._libs.tslibs import dtypes from pandas._libs.tslibs.conversion import localize_pydatetime from pandas._libs.tslibs.dtypes import ( Resolution, diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 3a55f5fa0c003..c4acf72ab87d8 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -69,6 +69,7 @@ from pandas._libs.tslibs.timestamps cimport _Timestamp from pandas._libs.tslibs.timezones cimport ( get_utcoffset, is_utc, + treat_tz_as_pytz, ) from pandas._libs.tslibs.tzconversion cimport ( Localizer, @@ -136,7 +137,7 @@ def cast_from_unit_vectorized( out = np.empty(shape, dtype="i8") base = np.empty(shape, dtype="i8") - frac = np.empty(shape, dtype="f8") + frac = np.zeros(shape, dtype="f8") for i in range(len(values)): if is_nan(values[i]): @@ -148,18 +149,18 @@ def cast_from_unit_vectorized( if p: frac = np.round(frac, p) - try: - for i in range(len(values)): + for i in range(len(values)): + try: if base[i] == NPY_NAT: out[i] = NPY_NAT else: out[i] = (base[i] * m) + (frac[i] * m) - except (OverflowError, FloatingPointError) as err: - # FloatingPointError can be issued if we have float dtype and have - # set np.errstate(over="raise") - raise 
OutOfBoundsDatetime( - f"cannot convert input {values[i]} with the unit '{unit}'" - ) from err + except (OverflowError, FloatingPointError) as err: + # FloatingPointError can be issued if we have float dtype and have + # set np.errstate(over="raise") + raise OutOfBoundsDatetime( + f"cannot convert input {values[i]} with the unit '{unit}'" + ) from err return out @@ -606,37 +607,42 @@ cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz, # equiv: datetime.today().replace(tzinfo=tz) return convert_datetime_to_tsobject(dt, tz, nanos=0, reso=NPY_FR_us) else: - string_to_dts_failed = string_to_dts( - ts, &dts, &out_bestunit, &out_local, - &out_tzoffset, False - ) - if not string_to_dts_failed: - reso = get_supported_reso(out_bestunit) - check_dts_bounds(&dts, reso) - obj = _TSObject() - obj.dts = dts - obj.creso = reso - ival = npy_datetimestruct_to_datetime(reso, &dts) - - if out_local == 1: - obj.tzinfo = timezone(timedelta(minutes=out_tzoffset)) - obj.value = tz_localize_to_utc_single( - ival, obj.tzinfo, ambiguous="raise", nonexistent=None, creso=reso - ) - if tz is None: - check_overflows(obj, reso) - return obj - _adjust_tsobject_tz_using_offset(obj, tz) - return obj - else: - if tz is not None: - # shift for _localize_tso - ival = tz_localize_to_utc_single( - ival, tz, ambiguous="raise", nonexistent=None, creso=reso + if not dayfirst: # GH 58859 + string_to_dts_failed = string_to_dts( + ts, &dts, &out_bestunit, &out_local, + &out_tzoffset, False + ) + if not string_to_dts_failed: + reso = get_supported_reso(out_bestunit) + check_dts_bounds(&dts, reso) + obj = _TSObject() + obj.dts = dts + obj.creso = reso + ival = npy_datetimestruct_to_datetime(reso, &dts) + + if out_local == 1: + obj.tzinfo = timezone(timedelta(minutes=out_tzoffset)) + obj.value = tz_localize_to_utc_single( + ival, + obj.tzinfo, + ambiguous="raise", + nonexistent=None, + creso=reso, ) - obj.value = ival - maybe_localize_tso(obj, tz, obj.creso) - return obj + if tz is None: + check_overflows(obj, reso) + return obj + _adjust_tsobject_tz_using_offset(obj, tz) + return obj + else: + if tz is not None: + # shift for _localize_tso + ival = tz_localize_to_utc_single( + ival, tz, ambiguous="raise", nonexistent=None, creso=reso + ) + obj.value = ival + maybe_localize_tso(obj, tz, obj.creso) + return obj dt = parse_datetime_string( ts, @@ -742,11 +748,17 @@ cdef datetime _localize_pydatetime(datetime dt, tzinfo tz): identically, i.e. discards nanos from Timestamps. It also assumes that the `tz` input is not None. """ - try: + if treat_tz_as_pytz(tz): + import pytz + # datetime.replace with pytz may be incorrect result # TODO: try to respect `fold` attribute - return tz.localize(dt, is_dst=None) - except AttributeError: + try: + return tz.localize(dt, is_dst=None) + except (pytz.AmbiguousTimeError, pytz.NonExistentTimeError) as err: + # As of pandas 3.0, we raise ValueErrors instead of pytz exceptions + raise ValueError(str(err)) from err + else: return dt.replace(tzinfo=tz) diff --git a/pandas/_libs/tslibs/dtypes.pxd b/pandas/_libs/tslibs/dtypes.pxd index 88cfa6ca60d93..d8c536a34bc04 100644 --- a/pandas/_libs/tslibs/dtypes.pxd +++ b/pandas/_libs/tslibs/dtypes.pxd @@ -11,14 +11,15 @@ cpdef int64_t periods_per_second(NPY_DATETIMEUNIT reso) except? 
-1 cdef NPY_DATETIMEUNIT get_supported_reso(NPY_DATETIMEUNIT reso) cdef bint is_supported_unit(NPY_DATETIMEUNIT reso) -cpdef freq_to_period_freqstr(freq_n, freq_name) cdef dict c_OFFSET_TO_PERIOD_FREQSTR -cdef dict c_OFFSET_DEPR_FREQSTR -cdef dict c_REVERSE_OFFSET_DEPR_FREQSTR -cdef dict c_DEPR_ABBREVS +cdef dict c_PERIOD_TO_OFFSET_FREQSTR +cdef dict c_OFFSET_RENAMED_FREQSTR +cdef dict c_DEPR_UNITS +cdef dict c_PERIOD_AND_OFFSET_DEPR_FREQSTR cdef dict attrname_to_abbrevs cdef dict npy_unit_to_attrname cdef dict attrname_to_npy_unit +cdef str INVALID_FREQ_ERR_MSG cdef enum c_FreqGroup: # Mirrors FreqGroup in the .pyx file diff --git a/pandas/_libs/tslibs/dtypes.pyi b/pandas/_libs/tslibs/dtypes.pyi index 7fdeb88d498ac..b435b9e2d8b31 100644 --- a/pandas/_libs/tslibs/dtypes.pyi +++ b/pandas/_libs/tslibs/dtypes.pyi @@ -1,18 +1,19 @@ from enum import Enum +from pandas._typing import Self + OFFSET_TO_PERIOD_FREQSTR: dict[str, str] def periods_per_day(reso: int = ...) -> int: ... def periods_per_second(reso: int) -> int: ... def abbrev_to_npy_unit(abbrev: str | None) -> int: ... -def freq_to_period_freqstr(freq_n: int, freq_name: str) -> str: ... class PeriodDtypeBase: _dtype_code: int # PeriodDtypeCode _n: int # actually __cinit__ - def __new__(cls, code: int, n: int): ... + def __new__(cls, code: int, n: int) -> Self: ... @property def _freq_group_code(self) -> int: ... @property diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx index 52e1133e596c5..4100f3d90e817 100644 --- a/pandas/_libs/tslibs/dtypes.pyx +++ b/pandas/_libs/tslibs/dtypes.pyx @@ -1,9 +1,6 @@ # period frequency constants corresponding to scikits timeseries # originals from enum import Enum -import warnings - -from pandas.util._exceptions import find_stack_level from pandas._libs.tslibs.ccalendar cimport c_MONTH_NUMBERS from pandas._libs.tslibs.np_datetime cimport ( @@ -176,6 +173,10 @@ OFFSET_TO_PERIOD_FREQSTR: dict = { "EOM": "M", "BME": "M", "SME": "M", + "BMS": "M", + "CBME": "M", + "CBMS": "M", + "SMS": "M", "BQS": "Q", "QS": "Q", "BQE": "Q", @@ -228,7 +229,6 @@ OFFSET_TO_PERIOD_FREQSTR: dict = { "YE-NOV": "Y-NOV", "W": "W", "ME": "M", - "Y": "Y", "BYE": "Y", "BYE-DEC": "Y-DEC", "BYE-JAN": "Y-JAN", @@ -245,7 +245,7 @@ OFFSET_TO_PERIOD_FREQSTR: dict = { "YS": "Y", "BYS": "Y", } -cdef dict c_OFFSET_DEPR_FREQSTR = { +cdef dict c_OFFSET_RENAMED_FREQSTR = { "M": "ME", "Q": "QE", "Q-DEC": "QE-DEC", @@ -273,19 +273,6 @@ cdef dict c_OFFSET_DEPR_FREQSTR = { "Y-SEP": "YE-SEP", "Y-OCT": "YE-OCT", "Y-NOV": "YE-NOV", - "A": "YE", - "A-DEC": "YE-DEC", - "A-JAN": "YE-JAN", - "A-FEB": "YE-FEB", - "A-MAR": "YE-MAR", - "A-APR": "YE-APR", - "A-MAY": "YE-MAY", - "A-JUN": "YE-JUN", - "A-JUL": "YE-JUL", - "A-AUG": "YE-AUG", - "A-SEP": "YE-SEP", - "A-OCT": "YE-OCT", - "A-NOV": "YE-NOV", "BY": "BYE", "BY-DEC": "BYE-DEC", "BY-JAN": "BYE-JAN", @@ -299,19 +286,6 @@ cdef dict c_OFFSET_DEPR_FREQSTR = { "BY-SEP": "BYE-SEP", "BY-OCT": "BYE-OCT", "BY-NOV": "BYE-NOV", - "BA": "BYE", - "BA-DEC": "BYE-DEC", - "BA-JAN": "BYE-JAN", - "BA-FEB": "BYE-FEB", - "BA-MAR": "BYE-MAR", - "BA-APR": "BYE-APR", - "BA-MAY": "BYE-MAY", - "BA-JUN": "BYE-JUN", - "BA-JUL": "BYE-JUL", - "BA-AUG": "BYE-AUG", - "BA-SEP": "BYE-SEP", - "BA-OCT": "BYE-OCT", - "BA-NOV": "BYE-NOV", "BM": "BME", "CBM": "CBME", "SM": "SME", @@ -329,89 +303,68 @@ cdef dict c_OFFSET_DEPR_FREQSTR = { "BQ-OCT": "BQE-OCT", "BQ-NOV": "BQE-NOV", } -cdef dict c_OFFSET_TO_PERIOD_FREQSTR = OFFSET_TO_PERIOD_FREQSTR -cdef dict c_REVERSE_OFFSET_DEPR_FREQSTR = { - v: k for k, v in 
c_OFFSET_DEPR_FREQSTR.items() +PERIOD_TO_OFFSET_FREQSTR = { + "M": "ME", + "Q": "QE", + "Q-DEC": "QE-DEC", + "Q-JAN": "QE-JAN", + "Q-FEB": "QE-FEB", + "Q-MAR": "QE-MAR", + "Q-APR": "QE-APR", + "Q-MAY": "QE-MAY", + "Q-JUN": "QE-JUN", + "Q-JUL": "QE-JUL", + "Q-AUG": "QE-AUG", + "Q-SEP": "QE-SEP", + "Q-OCT": "QE-OCT", + "Q-NOV": "QE-NOV", + "Y": "YE", + "Y-DEC": "YE-DEC", + "Y-JAN": "YE-JAN", + "Y-FEB": "YE-FEB", + "Y-MAR": "YE-MAR", + "Y-APR": "YE-APR", + "Y-MAY": "YE-MAY", + "Y-JUN": "YE-JUN", + "Y-JUL": "YE-JUL", + "Y-AUG": "YE-AUG", + "Y-SEP": "YE-SEP", + "Y-OCT": "YE-OCT", + "Y-NOV": "YE-NOV", } +cdef dict c_OFFSET_TO_PERIOD_FREQSTR = OFFSET_TO_PERIOD_FREQSTR +cdef dict c_PERIOD_TO_OFFSET_FREQSTR = PERIOD_TO_OFFSET_FREQSTR -cpdef freq_to_period_freqstr(freq_n, freq_name): - if freq_n == 1: - freqstr = f"""{c_OFFSET_TO_PERIOD_FREQSTR.get( - freq_name, freq_name)}""" - else: - freqstr = f"""{freq_n}{c_OFFSET_TO_PERIOD_FREQSTR.get( - freq_name, freq_name)}""" - return freqstr - -# Map deprecated resolution abbreviations to correct resolution abbreviations -cdef dict c_DEPR_ABBREVS = { - "A": "Y", - "a": "Y", - "A-DEC": "Y-DEC", - "A-JAN": "Y-JAN", - "A-FEB": "Y-FEB", - "A-MAR": "Y-MAR", - "A-APR": "Y-APR", - "A-MAY": "Y-MAY", - "A-JUN": "Y-JUN", - "A-JUL": "Y-JUL", - "A-AUG": "Y-AUG", - "A-SEP": "Y-SEP", - "A-OCT": "Y-OCT", - "A-NOV": "Y-NOV", - "BA": "BY", - "BA-DEC": "BY-DEC", - "BA-JAN": "BY-JAN", - "BA-FEB": "BY-FEB", - "BA-MAR": "BY-MAR", - "BA-APR": "BY-APR", - "BA-MAY": "BY-MAY", - "BA-JUN": "BY-JUN", - "BA-JUL": "BY-JUL", - "BA-AUG": "BY-AUG", - "BA-SEP": "BY-SEP", - "BA-OCT": "BY-OCT", - "BA-NOV": "BY-NOV", - "AS": "YS", - "AS-DEC": "YS-DEC", - "AS-JAN": "YS-JAN", - "AS-FEB": "YS-FEB", - "AS-MAR": "YS-MAR", - "AS-APR": "YS-APR", - "AS-MAY": "YS-MAY", - "AS-JUN": "YS-JUN", - "AS-JUL": "YS-JUL", - "AS-AUG": "YS-AUG", - "AS-SEP": "YS-SEP", - "AS-OCT": "YS-OCT", - "AS-NOV": "YS-NOV", - "BAS": "BYS", - "BAS-DEC": "BYS-DEC", - "BAS-JAN": "BYS-JAN", - "BAS-FEB": "BYS-FEB", - "BAS-MAR": "BYS-MAR", - "BAS-APR": "BYS-APR", - "BAS-MAY": "BYS-MAY", - "BAS-JUN": "BYS-JUN", - "BAS-JUL": "BYS-JUL", - "BAS-AUG": "BYS-AUG", - "BAS-SEP": "BYS-SEP", - "BAS-OCT": "BYS-OCT", - "BAS-NOV": "BYS-NOV", +cdef dict c_DEPR_UNITS = { + "w": "W", + "d": "D", "H": "h", - "BH": "bh", - "CBH": "cbh", - "T": "min", - "t": "min", + "MIN": "min", "S": "s", - "L": "ms", - "l": "ms", - "U": "us", - "u": "us", - "N": "ns", - "n": "ns", + "MS": "ms", + "US": "us", + "NS": "ns", +} + +cdef dict c_PERIOD_AND_OFFSET_DEPR_FREQSTR = { + "w": "W", + "w-mon": "W-MON", + "w-tue": "W-TUE", + "w-wed": "W-WED", + "w-thu": "W-THU", + "w-fri": "W-FRI", + "w-sat": "W-SAT", + "w-sun": "W-SUN", + "d": "D", + "b": "B", + "c": "C", + "MIN": "min", + "US": "us", + "NS": "ns", } +cdef str INVALID_FREQ_ERR_MSG = "Invalid frequency: {0}" + class FreqGroup(Enum): # Mirrors c_FreqGroup in the .pxd file @@ -501,39 +454,18 @@ class Resolution(Enum): >>> Resolution.get_reso_from_freqstr('h') == Resolution.RESO_HR True """ - cdef: - str abbrev try: - if freq in c_DEPR_ABBREVS: - abbrev = c_DEPR_ABBREVS[freq] - warnings.warn( - f"\'{freq}\' is deprecated and will be removed in a future " - f"version. Please use \'{abbrev}\' " - "instead of \'{freq}\'.", - FutureWarning, - stacklevel=find_stack_level(), - ) - freq = abbrev attr_name = _abbrev_to_attrnames[freq] - except KeyError: + except KeyError as exc: + msg = INVALID_FREQ_ERR_MSG.format(freq) # For quarterly and yearly resolutions, we need to chop off # a month string. 
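That is, an anchored abbreviation such as "Q-DEC" is split into ("Q", "DEC") and only the base part is looked up. With c_DEPR_ABBREVS gone, retired aliases such as "T", "L", "U", "N" and the "A"-family no longer resolve with a FutureWarning; they now fail with the INVALID_FREQ_ERR_MSG ValueError. A hedged example of the user-facing effect (exact message wording may differ by entry point):

    import pandas as pd

    pd.date_range("2024-01-01", periods=3, freq="min")    # canonical spelling works
    try:
        pd.date_range("2024-01-01", periods=3, freq="T")  # retired alias for minutes
    except ValueError as err:
        print(err)  # e.g. "Invalid frequency: T" (plus parser detail)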
split_freq = freq.split("-") if len(split_freq) != 2: - raise + raise ValueError(msg) from exc if split_freq[1] not in _month_names: # i.e. we want e.g. "Q-DEC", not "Q-INVALID" - raise - if split_freq[0] in c_DEPR_ABBREVS: - abbrev = c_DEPR_ABBREVS[split_freq[0]] - warnings.warn( - f"\'{split_freq[0]}\' is deprecated and will be removed in a " - f"future version. Please use \'{abbrev}\' " - f"instead of \'{split_freq[0]}\'.", - FutureWarning, - stacklevel=find_stack_level(), - ) - split_freq[0] = abbrev + raise ValueError(msg) from exc attr_name = _abbrev_to_attrnames[split_freq[0]] return cls.from_attrname(attr_name) diff --git a/pandas/_libs/tslibs/fields.pyi b/pandas/_libs/tslibs/fields.pyi index c6cfd44e9f6ab..bc55e34f3d208 100644 --- a/pandas/_libs/tslibs/fields.pyi +++ b/pandas/_libs/tslibs/fields.pyi @@ -16,7 +16,7 @@ def get_date_name_field( def get_start_end_field( dtindex: npt.NDArray[np.int64], field: str, - freqstr: str | None = ..., + freq_name: str | None = ..., month_kw: int = ..., reso: int = ..., # NPY_DATETIMEUNIT ) -> npt.NDArray[np.bool_]: ... diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index ff4fb4d635d17..e523ac2e7b5c6 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -210,7 +210,7 @@ cdef bint _is_on_month(int month, int compare_month, int modby) noexcept nogil: def get_start_end_field( const int64_t[:] dtindex, str field, - str freqstr=None, + str freq_name=None, int month_kw=12, NPY_DATETIMEUNIT reso=NPY_FR_ns, ): @@ -223,7 +223,7 @@ def get_start_end_field( ---------- dtindex : ndarray[int64] field : str - frestr : str or None, default None + freq_name : str or None, default None month_kw : int, default 12 reso : NPY_DATETIMEUNIT, default NPY_FR_ns @@ -243,20 +243,20 @@ def get_start_end_field( out = np.zeros(count, dtype="int8") - if freqstr: - if freqstr == "C": + if freq_name: + if freq_name == "C": raise ValueError(f"Custom business days is not supported by {field}") - is_business = freqstr[0] == "B" + is_business = freq_name[0] == "B" # YearBegin(), BYearBegin() use month = starting month of year. # QuarterBegin(), BQuarterBegin() use startingMonth = starting # month of year. Other offsets use month, startingMonth as ending # month of year. 
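In other words, begin-anchored offsets (QS, YS, and their business variants) carry the starting month in month_kw while end-anchored offsets carry the ending month, and a leading "B" only marks the business flavor; the refactored check below strips that prefix before classifying the offset. A small sketch of the user-visible field semantics (outputs assume the stated anchor):

    import pandas as pd

    idx = pd.date_range("2024-02-01", periods=3, freq="QS-FEB")
    print(idx.is_quarter_start)  # [ True  True  True ]  quarters begin Feb/May/Aug/Nov
    print(idx.is_quarter_end)    # [False False False]   quarter ends fall in Jan/Apr/Jul/Oct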
- if (freqstr[0:2] in ["MS", "QS", "YS"]) or ( - freqstr[1:3] in ["MS", "QS", "YS"]): + if freq_name.lstrip("B")[0:2] in ["QS", "YS"]: end_month = 12 if month_kw == 1 else month_kw - 1 start_month = month_kw + else: end_month = month_kw start_month = (end_month % 12) + 1 diff --git a/pandas/_libs/tslibs/meson.build b/pandas/_libs/tslibs/meson.build index 85410f771233f..ac43dc7db5fb7 100644 --- a/pandas/_libs/tslibs/meson.build +++ b/pandas/_libs/tslibs/meson.build @@ -22,12 +22,18 @@ tslibs_sources = { cython_args = [ '--include-dir', meson.current_build_dir(), - '-X always_allow_keywords=true' + '-X always_allow_keywords=true', ] if get_option('buildtype') == 'debug' cython_args += ['--gdb'] endif +# Use shared utility code to reduce wheel sizes +# copied from https://github.com/scikit-learn/scikit-learn/pull/31151/files +if cy.version().version_compare('>=3.1.0') + cython_args += ['--shared=pandas._libs._cyutility'] +endif + foreach ext_name, ext_dict : tslibs_sources py.extension_module( ext_name, @@ -36,7 +42,7 @@ foreach ext_name, ext_dict : tslibs_sources include_directories: [inc_np, inc_pd], dependencies: ext_dict.get('deps', ''), subdir: 'pandas/_libs/tslibs', - install: true + install: true, ) endforeach @@ -56,12 +62,9 @@ sources_to_install = [ 'timestamps.pyi', 'timezones.pyi', 'tzconversion.pyi', - 'vectorized.pyi' + 'vectorized.pyi', ] -foreach source: sources_to_install - py.install_sources( - source, - subdir: 'pandas/_libs/tslibs' - ) +foreach source : sources_to_install + py.install_sources(source, subdir: 'pandas/_libs/tslibs') endforeach diff --git a/pandas/_libs/tslibs/nattype.pyi b/pandas/_libs/tslibs/nattype.pyi index bd86a6fdc2174..ff3bb5b70801e 100644 --- a/pandas/_libs/tslibs/nattype.pyi +++ b/pandas/_libs/tslibs/nattype.pyi @@ -3,23 +3,28 @@ from datetime import ( timedelta, tzinfo as _tzinfo, ) -import typing +from typing import ( + Literal, + NoReturn, + TypeAlias, + overload, +) import numpy as np from pandas._libs.tslibs.period import Period -from pandas._typing import Self +from pandas._typing import ( + Frequency, + Self, + TimestampNonexistent, +) NaT: NaTType iNaT: int nat_strings: set[str] -_NaTComparisonTypes: typing.TypeAlias = ( - datetime | timedelta | Period | np.datetime64 | np.timedelta64 -) - -class _NatComparison: - def __call__(self, other: _NaTComparisonTypes) -> bool: ... +_TimeLike: TypeAlias = datetime | timedelta | Period | np.datetime64 | np.timedelta64 +_TimeDelta: TypeAlias = timedelta | np.timedelta64 class NaTType: _value: np.int64 @@ -61,18 +66,37 @@ class NaTType: def week(self) -> float: ... @property def weekofyear(self) -> float: ... + @property + def fold(self) -> int: ... def day_name(self) -> float: ... def month_name(self) -> float: ... def weekday(self) -> float: ... def isoweekday(self) -> float: ... + def isoformat(self, sep: str = ..., timespec: str = ...) -> str: ... + def strftime(self, format: str) -> NoReturn: ... def total_seconds(self) -> float: ... def today(self, *args, **kwargs) -> NaTType: ... def now(self, *args, **kwargs) -> NaTType: ... def to_pydatetime(self) -> NaTType: ... def date(self) -> NaTType: ... - def round(self) -> NaTType: ... - def floor(self) -> NaTType: ... - def ceil(self) -> NaTType: ... + def round( + self, + freq: Frequency, + ambiguous: bool | Literal["raise"] | NaTType = ..., + nonexistent: TimestampNonexistent = ..., + ) -> NaTType: ... 
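round above, and floor and ceil continued just below, now spell out the same freq/ambiguous/nonexistent parameters as their Timestamp counterparts; on NaT the arguments are accepted and validated, but the result is always NaT. For instance:

    import pandas as pd

    pd.NaT.round("h")                              # NaT
    pd.NaT.floor("15min")                          # NaT
    pd.NaT.ceil("D", nonexistent="shift_forward")  # NaT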
+ def floor( + self, + freq: Frequency, + ambiguous: bool | Literal["raise"] | NaTType = ..., + nonexistent: TimestampNonexistent = ..., + ) -> NaTType: ... + def ceil( + self, + freq: Frequency, + ambiguous: bool | Literal["raise"] | NaTType = ..., + nonexistent: TimestampNonexistent = ..., + ) -> NaTType: ... @property def tzinfo(self) -> None: ... @property @@ -81,8 +105,8 @@ class NaTType: def tz_localize( self, tz: _tzinfo | str | None, - ambiguous: str = ..., - nonexistent: str = ..., + ambiguous: bool | Literal["raise"] | NaTType = ..., + nonexistent: TimestampNonexistent = ..., ) -> NaTType: ... def replace( self, @@ -121,21 +145,39 @@ class NaTType: @property def days(self) -> float: ... @property + def seconds(self) -> float: ... + @property def microseconds(self) -> float: ... @property def nanoseconds(self) -> float: ... # inject Period properties @property def qyear(self) -> float: ... - def __eq__(self, other: object) -> bool: ... - def __ne__(self, other: object) -> bool: ... - __lt__: _NatComparison - __le__: _NatComparison - __gt__: _NatComparison - __ge__: _NatComparison - def __sub__(self, other: Self | timedelta | datetime) -> Self: ... - def __rsub__(self, other: Self | timedelta | datetime) -> Self: ... - def __add__(self, other: Self | timedelta | datetime) -> Self: ... - def __radd__(self, other: Self | timedelta | datetime) -> Self: ... + # comparisons + def __eq__(self, other: object, /) -> Literal[False]: ... + def __ne__(self, other: object, /) -> Literal[True]: ... + def __lt__(self, other: Self | _TimeLike, /) -> Literal[False]: ... + def __le__(self, other: Self | _TimeLike, /) -> Literal[False]: ... + def __gt__(self, other: Self | _TimeLike, /) -> Literal[False]: ... + def __ge__(self, other: Self | _TimeLike, /) -> Literal[False]: ... + # unary operators + def __pos__(self) -> Self: ... + def __neg__(self) -> Self: ... + # binary operators + def __sub__(self, other: Self | _TimeLike, /) -> Self: ... + def __rsub__(self, other: Self | _TimeLike, /) -> Self: ... + def __add__(self, other: Self | _TimeLike, /) -> Self: ... + def __radd__(self, other: Self | _TimeLike, /) -> Self: ... + def __mul__(self, other: float, /) -> Self: ... # analogous to timedelta + def __rmul__(self, other: float, /) -> Self: ... + @overload # analogous to timedelta + def __truediv__(self, other: Self | _TimeDelta, /) -> float: ... # Literal[NaN] + @overload + def __truediv__(self, other: float, /) -> Self: ... + @overload # analogous to timedelta + def __floordiv__(self, other: Self | _TimeDelta, /) -> float: ... # Literal[NaN] + @overload + def __floordiv__(self, other: float, /) -> Self: ... + # other def __hash__(self) -> int: ... def as_unit(self, unit: str, round_ok: bool = ...) -> NaTType: ... diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 65db0e05f859c..2657b1b9d197b 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -76,7 +76,7 @@ cdef _nat_rdivide_op(self, other): return NotImplemented -def __nat_unpickle(*args): +def _nat_unpickle(*args): # return constant defined in the module return c_NaT @@ -229,7 +229,17 @@ cdef class _NaT(datetime): def to_datetime64(self) -> np.datetime64: """ - Return a numpy.datetime64 object with same precision. + Return a NumPy datetime64 object with same precision. + + This method returns a numpy.datetime64 object with the same + date and time information and precision as the pd.Timestamp object. 
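The tightened nattype.pyi stubs above encode NaT's runtime semantics: every ordering comparison is False and arithmetic propagates NaT, mirroring timedelta for division. A quick check (outputs from a recent pandas):

    >>> import pandas as pd
    >>> pd.NaT == pd.NaT
    False
    >>> pd.NaT <= pd.Timestamp("2024-01-01")
    False
    >>> pd.NaT + pd.Timedelta("1D")
    NaT
    >>> pd.NaT / pd.Timedelta("1h")  # timedelta-like division yields float nan
    nan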
+ + See Also + -------- + numpy.datetime64 : Class to represent dates and times with high precision. + Timestamp.to_numpy : Alias for this method. + Timestamp.asm8 : Alias for this method. + pd.to_datetime : Convert argument to datetime. Examples -------- @@ -244,16 +254,24 @@ cdef class _NaT(datetime): def to_numpy(self, dtype=None, copy=False) -> np.datetime64 | np.timedelta64: """ - Convert the Timestamp to a NumPy datetime64 or timedelta64. + Convert the Timestamp to a NumPy datetime64. - With the default 'dtype', this is an alias method for `NaT.to_datetime64()`. - - The copy parameter is available here only for compatibility. Its value + This is an alias method for `Timestamp.to_datetime64()`. The dtype and + copy parameters are available here only for compatibility. Their values will not affect the return value. + Parameters + ---------- + dtype : dtype, optional + Data type of the output, ignored in this method as the return type + is always `numpy.datetime64`. + copy : bool, default False + Whether to ensure that the returned value is a new object. This + parameter is also ignored as the method does not support copying. + Returns ------- - numpy.datetime64 or numpy.timedelta64 + numpy.datetime64 See Also -------- @@ -269,9 +287,6 @@ cdef class _NaT(datetime): >>> pd.NaT.to_numpy() numpy.datetime64('NaT') - - >>> pd.NaT.to_numpy("m8[ns]") - numpy.timedelta64('NaT','ns') """ if dtype is not None: # GH#44460 @@ -333,6 +348,22 @@ class NaTType(_NaT): """ (N)ot-(A)-(T)ime, the time equivalent of NaN. + NaT is used to denote missing or null values in datetime and timedelta objects + in pandas. It functions similarly to how NaN is used for numerical data. + Operations with NaT will generally propagate the NaT value, similar to NaN. + NaT can be used in pandas data structures like Series and DataFrame + to represent missing datetime values. It is useful in data analysis + and time series analysis when working with incomplete or sparse + time-based data. Pandas provides robust handling of NaT to ensure + consistency and reliability in computations involving datetime objects. + + See Also + -------- + NA : NA ("not available") missing value indicator. + isna : Detect missing values (NaN or NaT) in an array-like object. + notna : Detect non-missing values. + numpy.nan : Floating point representation of Not a Number (NaN) for numerical data. + Examples -------- >>> pd.DataFrame([pd.Timestamp("2023"), np.nan], columns=["col_1"]) @@ -360,7 +391,7 @@ class NaTType(_NaT): return self.__reduce__() def __reduce__(self): - return (__nat_unpickle, (None, )) + return (_nat_unpickle, (None, )) def __rtruediv__(self, other): return _nat_rdivide_op(self, other) @@ -419,6 +450,12 @@ class NaTType(_NaT): Monday == 0 ... Sunday == 6. + See Also + -------- + Timestamp.dayofweek : Return the day of the week with Monday=0, Sunday=6. + Timestamp.isoweekday : Return the day of the week with Monday=1, Sunday=7. + datetime.date.weekday : Equivalent method in datetime module. + Examples -------- >>> ts = pd.Timestamp('2023-01-01') @@ -435,6 +472,13 @@ class NaTType(_NaT): Monday == 1 ... Sunday == 7. + See Also + -------- + Timestamp.weekday : Return the day of the week with Monday=0, Sunday=6. + Timestamp.isocalendar : Return a tuple containing ISO year, week number + and weekday. + datetime.date.isoweekday : Equivalent method in datetime module. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00') @@ -449,6 +493,16 @@ class NaTType(_NaT): """ Total seconds in the duration. 
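The expanded NaTType docstring describes NaT as the datetime analogue of NaN; a minimal illustration of that propagation and detection:

    >>> s = pd.Series([pd.Timestamp("2023-01-01"), pd.NaT])
    >>> s.isna()
    0    False
    1     True
    dtype: bool
    >>> s.max()  # reductions skip NaT, just as they skip NaN
    Timestamp('2023-01-01 00:00:00')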
+ This method calculates the total duration in seconds by combining + the days, seconds, and microseconds of the `Timedelta` object. + + See Also + -------- + to_timedelta : Convert argument to timedelta. + Timedelta : Represents a duration, the difference between two dates or times. + Timedelta.seconds : Returns the seconds component of the timedelta. + Timedelta.microseconds : Returns the microseconds component of the timedelta. + Examples -------- >>> td = pd.Timedelta('1min') @@ -463,6 +517,11 @@ class NaTType(_NaT): """ Return the month name of the Timestamp with specified locale. + This method returns the full name of the month corresponding to the + `Timestamp`, such as 'January', 'February', etc. The month name can + be returned in a specified locale if provided; otherwise, it defaults + to the English locale. + Parameters ---------- locale : str, default None (English locale) @@ -471,9 +530,18 @@ class NaTType(_NaT): Returns ------- str + The full month name as a string. + + See Also + -------- + Timestamp.day_name : Returns the name of the day of the week. + Timestamp.strftime : Returns a formatted string of the Timestamp. + datetime.datetime.strftime : Returns a string representing the date and time. Examples -------- + Get the month name in English (default): + >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') >>> ts.month_name() 'March' @@ -498,6 +566,11 @@ class NaTType(_NaT): ------- str + See Also + -------- + Timestamp.day_of_week : Return day of the week. + Timestamp.day_of_year : Return day of the year. + Examples -------- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') @@ -523,6 +596,12 @@ class NaTType(_NaT): """ Return a named tuple containing ISO year, week number, and weekday. + See Also + -------- + DatetimeIndex.isocalendar : Return a 3-tuple containing ISO year, + week number, and weekday for the given DatetimeIndex object. + datetime.date.isocalendar : The equivalent method for `datetime.date` objects. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00') @@ -537,6 +616,14 @@ class NaTType(_NaT): """ Return the daylight saving time (DST) adjustment. + This method returns the DST adjustment as a `datetime.timedelta` object + if the Timestamp is timezone-aware and DST is applicable. + + See Also + -------- + Timestamp.tz_localize : Localize the Timestamp to a timezone. + Timestamp.tz_convert : Convert timezone-aware Timestamp to another time zone. + Examples -------- >>> ts = pd.Timestamp('2000-06-01 00:00:00', tz='Europe/Brussels') @@ -549,10 +636,25 @@ class NaTType(_NaT): date = _make_nat_func( "date", """ - Return date object with same year, month and day. + Returns `datetime.date` with the same year, month, and day. + + This method extracts the date component from the `Timestamp` and returns + it as a `datetime.date` object, discarding the time information. + + Returns + ------- + datetime.date + The date part of the `Timestamp`. + + See Also + -------- + Timestamp : Represents a single timestamp, similar to `datetime`. + datetime.datetime.date : Extract the date component from a `datetime` object. Examples -------- + Extract the date from a Timestamp: + >>> ts = pd.Timestamp('2023-01-01 10:00:00.00') >>> ts Timestamp('2023-01-01 10:00:00') @@ -563,7 +665,24 @@ class NaTType(_NaT): utctimetuple = _make_error_func( "utctimetuple", """ - Return UTC time tuple, compatible with time.localtime(). + Return UTC time tuple, compatible with `time.localtime()`. 
+ + This method converts the Timestamp to UTC and returns a time tuple + containing 9 components: year, month, day, hour, minute, second, + weekday, day of year, and DST flag. This is particularly useful for + converting a Timestamp to a format compatible with time module functions. + + Returns + ------- + time.struct_time + A time.struct_time object representing the UTC time. + + See Also + -------- + datetime.datetime.utctimetuple : + Return UTC time tuple, compatible with time.localtime(). + Timestamp.timetuple : Return time tuple of local time. + time.struct_time : Time tuple structure used by time functions. Examples -------- @@ -580,6 +699,22 @@ class NaTType(_NaT): """ Return utc offset. + This method returns the difference between UTC and the local time + as a `timedelta` object. It is useful for understanding the time + difference between the current timezone and UTC. + + Returns + ------- + timedelta + The difference between UTC and the local time as a `timedelta` object. + + See Also + -------- + datetime.datetime.utcoffset : + Standard library method to get the UTC offset of a datetime object. + Timestamp.tzname : Return the name of the timezone. + Timestamp.dst : Return the daylight saving time (DST) adjustment. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00', tz='Europe/Brussels') @@ -594,6 +729,13 @@ class NaTType(_NaT): """ Return time zone name. + This method returns the name of the Timestamp's time zone as a string. + + See Also + -------- + Timestamp.tzinfo : Returns the timezone information of the Timestamp. + Timestamp.tz_convert : Convert timezone-aware Timestamp to another time zone. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00', tz='Europe/Brussels') @@ -608,6 +750,16 @@ class NaTType(_NaT): """ Return time object with same time but with tzinfo=None. + This method extracts the time part of the `Timestamp` object, excluding any + timezone information. It returns a `datetime.time` object which only represents + the time (hours, minutes, seconds, and microseconds). + + See Also + -------- + Timestamp.date : Return date object with same year, month and day. + Timestamp.tz_convert : Convert timezone-aware Timestamp to another time zone. + Timestamp.tz_localize : Localize the Timestamp to a timezone. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00') @@ -622,6 +774,17 @@ class NaTType(_NaT): """ Return time tuple, compatible with time.localtime(). + This method converts the `Timestamp` into a time tuple, which is compatible + with functions like `time.localtime()`. The time tuple is a named tuple with + attributes such as year, month, day, hour, minute, second, weekday, + day of the year, and daylight savings indicator. + + See Also + -------- + time.localtime : Converts a POSIX timestamp into a time tuple. + Timestamp : The `Timestamp` that represents a specific point in time. + datetime.datetime.timetuple : Equivalent method in the `datetime` module. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00') @@ -637,6 +800,19 @@ class NaTType(_NaT): """ Return time object with same time and tzinfo. + This method returns a datetime.time object with + the time and tzinfo corresponding to the pd.Timestamp + object, ignoring any information about the day/date. + + See Also + -------- + datetime.datetime.timetz : Return datetime.time object with the + same time attributes as the datetime object. + datetime.time : Class to represent the time of day, independent + of any particular day. 
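The utcoffset and tzname docstrings above describe two views of the same tzinfo; for a timezone-aware Timestamp (illustrative, winter CET):

    >>> ts = pd.Timestamp("2023-01-01 10:00:00", tz="Europe/Brussels")
    >>> ts.utcoffset()
    datetime.timedelta(seconds=3600)
    >>> ts.tzname()
    'CET'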
+ datetime.datetime.tzinfo : Attribute of datetime.datetime objects + representing the timezone of the datetime object. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00', tz='Europe/Brussels') >>> ts.timetz() datetime.time(10, 0, tzinfo=<DstTzInfo 'Europe/Brussels' CET+1:00:00 STD>) """, ) toordinal = _make_error_func( "toordinal", """ Return proleptic Gregorian ordinal. January 1 of year 1 is day 1. + The proleptic Gregorian ordinal is a continuous count of days since + January 1 of year 1, which is considered day 1. This method converts + the `Timestamp` to its equivalent ordinal number, useful for date arithmetic + and comparison operations. + + See Also + -------- + datetime.datetime.toordinal : Equivalent method in the `datetime` module. + Timestamp : The `Timestamp` that represents a specific point in time. + Timestamp.fromordinal : Create a `Timestamp` from an ordinal. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:50') @@ -663,7 +850,25 @@ class NaTType(_NaT): ctime = _make_error_func( "ctime", """ - Return ctime() style string. + Return a ctime() style string representing the Timestamp. + + This method returns a string representing the date and time + in the format returned by the standard library's `time.ctime()` + function, which is typically in the form 'Day Mon DD HH:MM:SS YYYY'. + + If the `Timestamp` is outside the range supported by Python's + standard library, a `NotImplementedError` is raised. + + Returns + ------- + str + A string representing the Timestamp in ctime format. + + See Also + -------- + time.ctime : Return a string representing time in ctime format. + Timestamp : Represents a single timestamp, similar to `datetime`. + datetime.datetime.ctime : Return a ctime style string from a datetime object. Examples -------- @@ -687,6 +892,12 @@ class NaTType(_NaT): See strftime documentation for more information on the format string: https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior. + See Also + -------- + Timestamp.isoformat : Return the time formatted according to ISO 8601. + pd.to_datetime : Convert argument to datetime. + Period.strftime : Format a single Period. + Examples -------- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') @@ -698,9 +909,27 @@ class NaTType(_NaT): strptime = _make_error_func( "strptime", """ - Timestamp.strptime(string, format) + Convert string argument to datetime. + + This method is not implemented; calling it will raise NotImplementedError. + Use pd.to_datetime() instead. + + Parameters + ---------- + date_string : str + String to convert to a datetime. + format : str, default None + The format string to parse time, e.g. "%d/%m/%Y". - Function is not implemented. Use pd.to_datetime(). + See Also + -------- + pd.to_datetime : Convert argument to datetime. + datetime.datetime.strptime : Return a datetime corresponding to a string + representing a date and time, parsed according to a separate + format string. + datetime.datetime.strftime : Return a string representing the date and + time, controlled by an explicit format string. + Timestamp.isoformat : Return the time formatted according to ISO 8601. Examples -------- @@ -717,6 +946,21 @@ class NaTType(_NaT): Construct a timezone-aware UTC datetime from a POSIX timestamp. + This method creates a datetime object from a POSIX timestamp, keeping the + Timestamp object's timezone. + + Parameters + ---------- + ts : float + POSIX timestamp. + + See Also + -------- + Timestamp.tzname : Return time zone name. + Timestamp.utcnow : Return a new Timestamp representing UTC day and time.
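As the rewritten strptime docstring notes, the method only raises NotImplementedError; the supported route is pd.to_datetime with an explicit format:

    >>> pd.to_datetime("14/03/2020", format="%d/%m/%Y")
    Timestamp('2020-03-14 00:00:00')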
+ Timestamp.fromtimestamp : Transform timestamp[, tz] to tz's local + time from POSIX timestamp. + Notes ----- Timestamp.utcfromtimestamp behavior differs from datetime.utcfromtimestamp @@ -731,16 +975,43 @@ class NaTType(_NaT): fromtimestamp = _make_error_func( "fromtimestamp", """ - Timestamp.fromtimestamp(ts) + Create a `Timestamp` object from a POSIX timestamp. - Transform timestamp[, tz] to tz's local time from POSIX timestamp. + This method converts a POSIX timestamp (the number of seconds since + January 1, 1970, 00:00:00 UTC) into a `Timestamp` object. The resulting + `Timestamp` can be localized to a specific time zone if provided. + + Parameters + ---------- + ts : float + The POSIX timestamp to convert, representing seconds since + the epoch (1970-01-01 00:00:00 UTC). + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile, optional + Time zone for the `Timestamp`. If not provided, the `Timestamp` will + be timezone-naive (i.e., without time zone information). + + Returns + ------- + Timestamp + A `Timestamp` object representing the given POSIX timestamp. + + See Also + -------- + Timestamp : Represents a single timestamp, similar to `datetime`. + to_datetime : Converts various types of data to datetime. + datetime.datetime.fromtimestamp : Returns a datetime from a POSIX timestamp. Examples -------- + Convert a POSIX timestamp to a `Timestamp`: + >>> pd.Timestamp.fromtimestamp(1584199972) # doctest: +SKIP Timestamp('2020-03-14 15:32:52') - Note that the output may change depending on your local time. + Note that the output may change depending on your local time and time zone: + + >>> pd.Timestamp.fromtimestamp(1584199972, tz='UTC') # doctest: +SKIP + Timestamp('2020-03-14 15:32:52+0000', tz='UTC') """, ) combine = _make_error_func( @@ -748,7 +1019,28 @@ class NaTType(_NaT): """ Timestamp.combine(date, time) - Combine date, time into datetime with same date and time fields. + Combine a date and time into a single Timestamp object. + + This method takes a `date` object and a `time` object + and combines them into a single `Timestamp` + that has the same date and time fields. + + Parameters + ---------- + date : datetime.date + The date part of the Timestamp. + time : datetime.time + The time part of the Timestamp. + + Returns + ------- + Timestamp + A new `Timestamp` object representing the combined date and time. + + See Also + -------- + Timestamp : Represents a single timestamp, similar to `datetime`. + to_datetime : Converts various types of data to datetime. Examples -------- @@ -764,6 +1056,18 @@ class NaTType(_NaT): Return a new Timestamp representing UTC day and time. + See Also + -------- + Timestamp : Constructs an arbitrary datetime. + Timestamp.now : Return the current local date and time, which + can be timezone-aware. + Timestamp.today : Return the current local date and time with + timezone information set to None. + to_datetime : Convert argument to datetime. + date_range : Return a fixed frequency DatetimeIndex. + Timestamp.utctimetuple : Return UTC time tuple, compatible with + time.localtime(). + Examples -------- >>> pd.Timestamp.utcnow() # doctest: +SKIP @@ -776,6 +1080,23 @@ class NaTType(_NaT): """ Return POSIX timestamp as float. + This method converts the `Timestamp` object to a POSIX timestamp, which is + the number of seconds since the Unix epoch (January 1, 1970). The returned + value is a floating-point number, where the integer part represents the + seconds, and the fractional part represents the microseconds. 
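A short usage check for the combine docstring above (values chosen arbitrarily):

    >>> from datetime import date, time
    >>> pd.Timestamp.combine(date(2020, 3, 14), time(15, 30, 15))
    Timestamp('2020-03-14 15:30:15')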
+ + Returns + ------- + float + The POSIX timestamp representation of the `Timestamp` object. + + See Also + -------- + Timestamp.fromtimestamp : Construct a `Timestamp` from a POSIX timestamp. + datetime.datetime.timestamp : Equivalent method from the `datetime` module. + Timestamp.to_pydatetime : Convert the `Timestamp` to a `datetime` object. + Timestamp.to_datetime64 : Converts `Timestamp` to `numpy.datetime64`. + Examples -------- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548') @@ -791,9 +1112,14 @@ class NaTType(_NaT): """ Convert timezone-aware Timestamp to another time zone. + This method is used to convert a timezone-aware Timestamp object to a + different time zone. The original UTC time remains the same; only the + time zone information is changed. If the Timestamp is timezone-naive, a + TypeError is raised. + Parameters ---------- - tz : str, pytz.timezone, dateutil.tz.tzfile or None + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile or None Time zone for time which Timestamp will be converted to. None will remove timezone holding UTC time. @@ -806,6 +1132,13 @@ class NaTType(_NaT): TypeError If Timestamp is tz-naive. + See Also + -------- + Timestamp.tz_localize : Localize the Timestamp to a timezone. + DatetimeIndex.tz_convert : Convert a DatetimeIndex to another time zone. + DatetimeIndex.tz_localize : Localize a DatetimeIndex to a specific time zone. + datetime.datetime.astimezone : Convert a datetime object to another time zone. + Examples -------- Create a timestamp object with UTC timezone: @@ -835,21 +1168,43 @@ class NaTType(_NaT): """ Construct a timestamp from a a proleptic Gregorian ordinal. + This method creates a `Timestamp` object corresponding to the given + proleptic Gregorian ordinal, which is a count of days from January 1, + 0001 (using the proleptic Gregorian calendar). The time part of the + `Timestamp` is set to midnight (00:00:00) by default. + Parameters ---------- ordinal : int Date corresponding to a proleptic Gregorian ordinal. - tz : str, pytz.timezone, dateutil.tz.tzfile or None + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile or None Time zone for the Timestamp. + Returns + ------- + Timestamp + A `Timestamp` object representing the specified ordinal date. + + See Also + -------- + Timestamp : Represents a single timestamp, similar to `datetime`. + to_datetime : Converts various types of data to datetime. + Notes ----- By definition there cannot be any tz info on the ordinal itself. Examples -------- + Convert an ordinal to a `Timestamp`: + >>> pd.Timestamp.fromordinal(737425) Timestamp('2020-01-01 00:00:00') + + Create a `Timestamp` from an ordinal with timezone information: + + >>> pd.Timestamp.fromordinal(737425, tz='UTC') + Timestamp('2020-01-01 00:00:00+0000', tz='UTC') """, ) @@ -859,7 +1214,30 @@ class NaTType(_NaT): """ Convert a Timestamp object to a native Python datetime object. - If warn=True, issue a warning if nanoseconds is nonzero. + This method is useful for when you need to utilize a pandas Timestamp + object in contexts where native Python datetime objects are expected + or required. The conversion discards the nanoseconds component, and a + warning can be issued in such cases if desired. + + Parameters + ---------- + warn : bool, default True + If True, issues a warning when the timestamp includes nonzero + nanoseconds, as these will be discarded during the conversion. 
+ + Returns + ------- + datetime.datetime or NaT + Returns a datetime.datetime object representing the timestamp, + with year, month, day, hour, minute, second, and microsecond components. + If the timestamp is NaT (Not a Time), returns NaT. + + See Also + -------- + datetime.datetime : The standard Python datetime class that this method + returns. + Timestamp.timestamp : Convert a Timestamp object to POSIX timestamp. + Timestamp.to_datetime64 : Convert a Timestamp object to numpy.datetime64. Examples -------- @@ -879,11 +1257,21 @@ class NaTType(_NaT): """ Return new Timestamp object representing current time local to tz. + This method returns a new `Timestamp` object that represents the current time. + If a timezone is provided, the current time will be localized to that timezone. + Otherwise, it returns the current local time. + Parameters ---------- tz : str or timezone object, default None Timezone to localize to. + See Also + -------- + to_datetime : Convert argument to datetime. + Timestamp.utcnow : Return a new Timestamp representing UTC day and time. + Timestamp.today : Return the current time in the local timezone. + Examples -------- >>> pd.Timestamp.now() # doctest: +SKIP @@ -908,6 +1296,12 @@ class NaTType(_NaT): tz : str or timezone object, default None Timezone to localize to. + See Also + -------- + datetime.datetime.today : Returns the current local date. + Timestamp.now : Returns current time with optional timezone. + Timestamp : A class representing a specific timestamp. + Examples -------- >>> pd.Timestamp.today() # doctest: +SKIP @@ -924,6 +1318,12 @@ class NaTType(_NaT): """ Round the Timestamp to the specified resolution. + This method rounds the given Timestamp to the nearest multiple of a + specified frequency. It is particularly useful in data analysis to normalize timestamps + to regular frequency intervals. For instance, rounding to the nearest + minute, hour, or day can help in time series comparisons or resampling + operations. + Parameters ---------- freq : str @@ -934,9 +1334,9 @@ class NaTType(_NaT): * bool contains flags to determine if time is dst or not (note that this flag is only applicable for ambiguous fall dst dates). * 'NaT' will return NaT for an ambiguous time. - * 'raise' will raise an AmbiguousTimeError for an ambiguous time. + * 'raise' will raise a ValueError for an ambiguous time. - nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ + nonexistent : {'raise', 'shift_forward', 'shift_backward', 'NaT', \ timedelta}, default 'raise' A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. @@ -947,7 +1347,7 @@ timedelta}, default 'raise' closest existing time. * 'NaT' will return NaT where there are nonexistent times. * timedelta objects will shift nonexistent times by the timedelta. - * 'raise' will raise an NonExistentTimeError if there are + * 'raise' will raise a ValueError if there are nonexistent times. Returns ------- @@ -958,6 +1358,14 @@ timedelta}, default 'raise' ------ ValueError if the freq cannot be converted + See Also + -------- + Series.dt.round : Round the datetime values in a Series. + Timestamp.floor : Round the Timestamp downward to the nearest multiple + of the specified frequency. + Timestamp.ceil : Round the Timestamp upward to the nearest multiple of + the specified frequency.
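round picks the nearest multiple of the frequency, unlike floor and ceil documented below; side by side on one timestamp:

    >>> ts = pd.Timestamp("2020-03-14 15:32:52")
    >>> ts.round("h")
    Timestamp('2020-03-14 16:00:00')
    >>> ts.floor("h")
    Timestamp('2020-03-14 15:00:00')
    >>> ts.ceil("h")
    Timestamp('2020-03-14 16:00:00')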
+ Notes ----- If the Timestamp has a timezone, rounding will take place relative to the @@ -973,16 +1381,16 @@ timedelta}, default 'raise' A timestamp can be rounded using multiple frequency units: - >>> ts.round(freq='h') # hour + >>> ts.round(freq='h') # hour Timestamp('2020-03-14 16:00:00') - >>> ts.round(freq='min') # minute + >>> ts.round(freq='min') # minute Timestamp('2020-03-14 15:33:00') - >>> ts.round(freq='s') # seconds + >>> ts.round(freq='s') # seconds Timestamp('2020-03-14 15:32:52') - >>> ts.round(freq='ms') # milliseconds + >>> ts.round(freq='ms') # milliseconds Timestamp('2020-03-14 15:32:52.193000') ``freq`` can also be a multiple of a single unit, like '5min' (i.e. 5 minutes): @@ -1027,9 +1435,9 @@ timedelta}, default 'raise' * bool contains flags to determine if time is dst or not (note that this flag is only applicable for ambiguous fall dst dates). * 'NaT' will return NaT for an ambiguous time. - * 'raise' will raise an AmbiguousTimeError for an ambiguous time. + * 'raise' will raise a ValueError for an ambiguous time. - nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ + nonexistent : {'raise', 'shift_forward', 'shift_backward', 'NaT', \ timedelta}, default 'raise' A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. @@ -1040,13 +1448,19 @@ timedelta}, default 'raise' closest existing time. * 'NaT' will return NaT where there are nonexistent times. * timedelta objects will shift nonexistent times by the timedelta. - * 'raise' will raise an NonExistentTimeError if there are + * 'raise' will raise a ValueError if there are nonexistent times. Raises ------ ValueError if the freq cannot be converted. + See Also + -------- + Timestamp.ceil : Round up a Timestamp to the specified resolution. + Timestamp.round : Round a Timestamp to the specified resolution. + Series.dt.floor : Round down the datetime values in a Series. + Notes ----- If the Timestamp has a timezone, flooring will take place relative to the @@ -1062,16 +1476,16 @@ timedelta}, default 'raise' A timestamp can be floored using multiple frequency units: - >>> ts.floor(freq='h') # hour + >>> ts.floor(freq='h') # hour Timestamp('2020-03-14 15:00:00') - >>> ts.floor(freq='min') # minute + >>> ts.floor(freq='min') # minute Timestamp('2020-03-14 15:32:00') - >>> ts.floor(freq='s') # seconds + >>> ts.floor(freq='s') # seconds Timestamp('2020-03-14 15:32:52') - >>> ts.floor(freq='ns') # nanoseconds + >>> ts.floor(freq='ns') # nanoseconds Timestamp('2020-03-14 15:32:52.192548651') ``freq`` can also be a multiple of a single unit, like '5min' (i.e. 5 minutes): @@ -1116,9 +1530,9 @@ timedelta}, default 'raise' * bool contains flags to determine if time is dst or not (note that this flag is only applicable for ambiguous fall dst dates). * 'NaT' will return NaT for an ambiguous time. - * 'raise' will raise an AmbiguousTimeError for an ambiguous time. + * 'raise' will raise a ValueError for an ambiguous time. - nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ + nonexistent : {'raise', 'shift_forward', 'shift_backward', 'NaT', \ timedelta}, default 'raise' A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. @@ -1129,13 +1543,19 @@ timedelta}, default 'raise' closest existing time. * 'NaT' will return NaT where there are nonexistent times. * timedelta objects will shift nonexistent times by the timedelta. 
- * 'raise' will raise an NonExistentTimeError if there are + * 'raise' will raise a ValueError if there are nonexistent times. Raises ------ ValueError if the freq cannot be converted. + See Also + -------- + Timestamp.floor : Round down a Timestamp to the specified resolution. + Timestamp.round : Round a Timestamp to the specified resolution. + Series.dt.ceil : Ceil the datetime values in a Series. + Notes ----- If the Timestamp has a timezone, ceiling will take place relative to the @@ -1151,16 +1571,16 @@ timedelta}, default 'raise' A timestamp can be ceiled using multiple frequency units: - >>> ts.ceil(freq='h') # hour + >>> ts.ceil(freq='h') # hour Timestamp('2020-03-14 16:00:00') - >>> ts.ceil(freq='min') # minute + >>> ts.ceil(freq='min') # minute Timestamp('2020-03-14 15:33:00') - >>> ts.ceil(freq='s') # seconds + >>> ts.ceil(freq='s') # seconds Timestamp('2020-03-14 15:32:53') - >>> ts.ceil(freq='us') # microseconds + >>> ts.ceil(freq='us') # microseconds Timestamp('2020-03-14 15:32:52.192549') ``freq`` can also be a multiple of a single unit, like '5min' (i.e. 5 minutes): @@ -1196,9 +1616,14 @@ timedelta}, default 'raise' """ Convert timezone-aware Timestamp to another time zone. + This method is used to convert a timezone-aware Timestamp object to a + different time zone. The original UTC time remains the same; only the + time zone information is changed. If the Timestamp is timezone-naive, a + TypeError is raised. + Parameters ---------- - tz : str, pytz.timezone, dateutil.tz.tzfile or None + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile or None Time zone for time which Timestamp will be converted to. None will remove timezone holding UTC time. @@ -1211,6 +1636,13 @@ timedelta}, default 'raise' TypeError If Timestamp is tz-naive. + See Also + -------- + Timestamp.tz_localize : Localize the Timestamp to a timezone. + DatetimeIndex.tz_convert : Convert a DatetimeIndex to another time zone. + DatetimeIndex.tz_localize : Localize a DatetimeIndex to a specific time zone. + datetime.datetime.astimezone : Convert a datetime object to another time zone. + Examples -------- Create a timestamp object with UTC timezone: @@ -1245,7 +1677,7 @@ timedelta}, default 'raise' Parameters ---------- - tz : str, pytz.timezone, dateutil.tz.tzfile or None + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile or None Time zone for time which Timestamp will be converted to. None will remove timezone holding local time. @@ -1262,9 +1694,9 @@ timedelta}, default 'raise' * bool contains flags to determine if time is dst or not (note that this flag is only applicable for ambiguous fall dst dates). * 'NaT' will return NaT for an ambiguous time. - * 'raise' will raise an AmbiguousTimeError for an ambiguous time. + * 'raise' will raise a ValueError for an ambiguous time. - nonexistent : 'shift_forward', 'shift_backward, 'NaT', timedelta, \ + nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta, \ default 'raise' A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. @@ -1277,7 +1709,7 @@ default 'raise' closest existing time. * 'NaT' will return NaT where there are nonexistent times. * timedelta objects will shift nonexistent times by the timedelta. - * 'raise' will raise an NonExistentTimeError if there are + * 'raise' will raise a ValueError if there are nonexistent times. Returns @@ -1289,6 +1721,13 @@ default 'raise' TypeError If the Timestamp is tz-aware and tz is not None. 
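The edits above consistently replace AmbiguousTimeError/NonExistentTimeError with ValueError in the docs. The nonexistent options in action, using the 2021-03-28 Brussels DST jump (02:00 to 03:00) as an example (outputs from a recent pandas):

    >>> ts = pd.Timestamp("2021-03-28 02:30:00")
    >>> ts.tz_localize("Europe/Brussels", nonexistent="shift_forward")
    Timestamp('2021-03-28 03:00:00+0200', tz='Europe/Brussels')
    >>> ts.tz_localize("Europe/Brussels", nonexistent="NaT")
    NaT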
+ See Also + -------- + Timestamp.tzinfo : Returns the timezone information of the Timestamp. + Timestamp.tz_convert : Convert timezone-aware Timestamp to another time zone. + DatetimeIndex.tz_localize : Localize a DatetimeIndex to a specific time zone. + datetime.datetime.astimezone : Convert a datetime object to another time zone. + Examples -------- Create a naive timestamp object: @@ -1313,22 +1752,48 @@ default 'raise' """ Implements datetime.replace, handles nanoseconds. + This method creates a new `Timestamp` object by replacing the specified + fields with new values. The new `Timestamp` retains the original fields + that are not explicitly replaced. This method handles nanoseconds, and + the `tzinfo` parameter allows for timezone replacement without conversion. + Parameters ---------- year : int, optional + The year to replace. If `None`, the year is not changed. month : int, optional + The month to replace. If `None`, the month is not changed. day : int, optional + The day to replace. If `None`, the day is not changed. hour : int, optional + The hour to replace. If `None`, the hour is not changed. minute : int, optional + The minute to replace. If `None`, the minute is not changed. second : int, optional + The second to replace. If `None`, the second is not changed. microsecond : int, optional + The microsecond to replace. If `None`, the microsecond is not changed. nanosecond : int, optional + The nanosecond to replace. If `None`, the nanosecond is not changed. tzinfo : tz-convertible, optional + The timezone information to replace. If `None`, the timezone is not changed. fold : int, optional + The fold information to replace. If `None`, the fold is not changed. Returns ------- - Timestamp with fields replaced + Timestamp + A new `Timestamp` object with the specified fields replaced. + + See Also + -------- + Timestamp : Represents a single timestamp, similar to `datetime`. + to_datetime : Converts various types of data to datetime. + + Notes + ----- + The `replace` method does not perform timezone conversions. If you need + to convert the timezone, use the `tz_convert` method instead. Examples -------- @@ -1345,13 +1810,13 @@ default 'raise' Replace timezone (not a conversion): - >>> import pytz - >>> ts.replace(tzinfo=pytz.timezone('US/Pacific')) + >>> import zoneinfo + >>> ts.replace(tzinfo=zoneinfo.ZoneInfo('US/Pacific')) Timestamp('2020-03-14 15:32:52.192548651-0700', tz='US/Pacific') Analogous for ``pd.NaT``: - >>> pd.NaT.replace(tzinfo=pytz.timezone('US/Pacific')) + >>> pd.NaT.replace(tzinfo=zoneinfo.ZoneInfo('US/Pacific')) NaT """, ) @@ -1366,7 +1831,7 @@ default 'raise' def as_unit(self, str unit, bint round_ok=True) -> "NaTType": """ - Convert the underlying int64 representaton to the given unit. + Convert the underlying int64 representation to the given unit. Parameters ---------- @@ -1378,6 +1843,14 @@ default 'raise' ------- Timestamp + See Also + -------- + Timestamp.asm8 : Return numpy datetime64 format with same precision. + Timestamp.to_pydatetime : Convert Timestamp object to a native + Python datetime object. + to_timedelta : Convert argument into timedelta object, + which can represent differences in times. 
+ Examples -------- >>> ts = pd.Timestamp('2023-01-01 00:00:00.01') diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd index cb2658d343772..3e5654b70cd92 100644 --- a/pandas/_libs/tslibs/np_datetime.pxd +++ b/pandas/_libs/tslibs/np_datetime.pxd @@ -34,7 +34,7 @@ cdef extern from "numpy/ndarraytypes.h": NPY_FR_as NPY_FR_GENERIC - int64_t NPY_DATETIME_NAT # elswhere we call this NPY_NAT + int64_t NPY_DATETIME_NAT # elsewhere we call this NPY_NAT cdef extern from "pandas/datetime/pd_datetime.h": @@ -89,7 +89,7 @@ cdef int string_to_dts( int* out_local, int* out_tzoffset, bint want_exc, - format: str | None = *, + str format = *, bint exact = * ) except? -1 diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index aa01a05d0d932..1b7f04fe17238 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -18,6 +18,7 @@ from cpython.object cimport ( Py_LT, Py_NE, ) +from cpython.unicode cimport PyUnicode_AsUTF8AndSize from libc.stdint cimport INT64_MAX import_datetime() @@ -44,7 +45,6 @@ from pandas._libs.tslibs.dtypes cimport ( npy_unit_to_abbrev, npy_unit_to_attrname, ) -from pandas._libs.tslibs.util cimport get_c_string_buf_and_size cdef extern from "pandas/datetime/pd_datetime.h": @@ -176,6 +176,15 @@ class OutOfBoundsDatetime(ValueError): """ Raised when the datetime is outside the range that can be represented. + This error occurs when attempting to convert or parse a datetime value + that exceeds the bounds supported by pandas' internal datetime + representation. + + See Also + -------- + to_datetime : Convert argument to datetime. + Timestamp : Pandas replacement for python ``datetime.datetime`` object. + Examples -------- >>> pd.to_datetime("08335394550") @@ -192,6 +201,10 @@ class OutOfBoundsTimedelta(ValueError): Representation should be within a timedelta64[ns]. + See Also + -------- + date_range : Return a fixed frequency DatetimeIndex. + Examples -------- >>> pd.date_range(start="1/1/1700", freq="B", periods=100000) @@ -331,7 +344,7 @@ cdef int string_to_dts( int* out_local, int* out_tzoffset, bint want_exc, - format: str | None=None, + str format=None, bint exact=True, ) except? 
-1: cdef: @@ -341,13 +354,13 @@ cdef int string_to_dts( const char* format_buf FormatRequirement format_requirement - buf = get_c_string_buf_and_size(val, &length) + buf = PyUnicode_AsUTF8AndSize(val, &length) if format is None: format_buf = b"" format_length = 0 format_requirement = INFER_FORMAT else: - format_buf = get_c_string_buf_and_size(format, &format_length) + format_buf = PyUnicode_AsUTF8AndSize(format, &format_length) format_requirement = exact return parse_iso_8601_datetime(buf, length, want_exc, dts, out_bestunit, out_local, out_tzoffset, diff --git a/pandas/_libs/tslibs/offsets.pyi b/pandas/_libs/tslibs/offsets.pyi index 7eb8dc0813868..ad579a5e41522 100644 --- a/pandas/_libs/tslibs/offsets.pyi +++ b/pandas/_libs/tslibs/offsets.pyi @@ -1,3 +1,4 @@ +from collections.abc import Collection from datetime import ( datetime, time, @@ -5,7 +6,6 @@ from datetime import ( ) from typing import ( Any, - Collection, Literal, TypeVar, overload, @@ -20,8 +20,6 @@ from pandas._typing import ( npt, ) -from .timedeltas import Timedelta - _BaseOffsetT = TypeVar("_BaseOffsetT", bound=BaseOffset) _DatetimeT = TypeVar("_DatetimeT", bound=datetime) _TimedeltaT = TypeVar("_TimedeltaT", bound=timedelta) @@ -64,7 +62,7 @@ class BaseOffset: @overload def __rsub__(self, other: npt.NDArray[np.object_]) -> npt.NDArray[np.object_]: ... @overload - def __rsub__(self, other: BaseOffset): ... + def __rsub__(self, other: BaseOffset) -> Self: ... @overload def __rsub__(self, other: _DatetimeT) -> _DatetimeT: ... @overload @@ -72,7 +70,7 @@ class BaseOffset: @overload def __mul__(self, other: np.ndarray) -> np.ndarray: ... @overload - def __mul__(self, other: int): ... + def __mul__(self, other: int) -> Self: ... @overload def __rmul__(self, other: np.ndarray) -> np.ndarray: ... @overload @@ -94,13 +92,12 @@ class BaseOffset: def __getstate__(self): ... @property def nanos(self) -> int: ... - def is_anchored(self) -> bool: ... def _get_offset(name: str) -> BaseOffset: ... class SingleConstructorOffset(BaseOffset): @classmethod - def _from_name(cls, suffix: None = ...): ... + def _from_name(cls, suffix: None = ...) -> Self: ... def __reduce__(self): ... @overload @@ -115,8 +112,6 @@ class Tick(SingleConstructorOffset): _prefix: str def __init__(self, n: int = ..., normalize: bool = ...) -> None: ... @property - def delta(self) -> Timedelta: ... - @property def nanos(self) -> int: ... def delta_to_tick(delta: timedelta) -> Tick: ... @@ -173,6 +168,16 @@ class BQuarterEnd(QuarterOffset): ... class BQuarterBegin(QuarterOffset): ... class QuarterEnd(QuarterOffset): ... class QuarterBegin(QuarterOffset): ... + +class HalfYearOffset(SingleConstructorOffset): + def __init__( + self, n: int = ..., normalize: bool = ..., startingMonth: int | None = ... + ) -> None: ... + +class BHalfYearEnd(HalfYearOffset): ... +class BHalfYearBegin(HalfYearOffset): ... +class HalfYearEnd(HalfYearOffset): ... +class HalfYearBegin(HalfYearOffset): ... class MonthOffset(SingleConstructorOffset): ... class MonthEnd(MonthOffset): ... class MonthBegin(MonthOffset): ... @@ -197,7 +202,10 @@ class WeekOfMonth(WeekOfMonthMixin): self, n: int = ..., normalize: bool = ..., week: int = ..., weekday: int = ... ) -> None: ... -class LastWeekOfMonth(WeekOfMonthMixin): ... +class LastWeekOfMonth(WeekOfMonthMixin): + def __init__( + self, n: int = ..., normalize: bool = ..., weekday: int = ... + ) -> None: ... 
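The stub changes above tighten __mul__ and __rsub__ to return Self; at runtime, integer multiplication scales the offset's n, so the annotations now match behavior:

    >>> 3 * pd.offsets.MonthEnd()
    <3 * MonthEnds>
    >>> pd.Timestamp("2020-01-15") + 2 * pd.offsets.QuarterEnd(startingMonth=3)
    Timestamp('2020-06-30 00:00:00')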
class FY5253Mixin(SingleConstructorOffset): def __init__( diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index b3788b6003e67..a16964435ef50 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -32,6 +32,8 @@ cnp.import_array() # TODO: formalize having _libs.properties "above" tslibs in the dependency structure +from typing import ClassVar + from pandas._libs.properties import cache_readonly from pandas._libs.tslibs cimport util @@ -56,9 +58,10 @@ from pandas._libs.tslibs.ccalendar cimport ( ) from pandas._libs.tslibs.conversion cimport localize_pydatetime from pandas._libs.tslibs.dtypes cimport ( - c_DEPR_ABBREVS, - c_OFFSET_DEPR_FREQSTR, - c_REVERSE_OFFSET_DEPR_FREQSTR, + c_OFFSET_RENAMED_FREQSTR, + c_OFFSET_TO_PERIOD_FREQSTR, + c_PERIOD_AND_OFFSET_DEPR_FREQSTR, + c_PERIOD_TO_OFFSET_FREQSTR, periods_per_day, ) from pandas._libs.tslibs.nattype cimport ( @@ -219,8 +222,7 @@ cdef _get_calendar(weekmask, holidays, calendar): holidays = holidays + calendar.holidays().tolist() except AttributeError: pass - holidays = [_to_dt64D(dt) for dt in holidays] - holidays = tuple(sorted(holidays)) + holidays = tuple(sorted(_to_dt64D(dt) for dt in holidays)) kwargs = {"weekmask": weekmask} if holidays: @@ -419,11 +421,10 @@ cdef class BaseOffset: if "holidays" in all_paras and not all_paras["holidays"]: all_paras.pop("holidays") - exclude = ["kwds", "name", "calendar"] - attrs = [(k, v) for k, v in all_paras.items() - if (k not in exclude) and (k[0] != "_")] - attrs = sorted(set(attrs)) - params = tuple([str(type(self))] + attrs) + exclude = {"kwds", "name", "calendar"} + attrs = {(k, v) for k, v in all_paras.items() + if (k not in exclude) and (k[0] != "_")} + params = tuple([str(type(self))] + sorted(attrs)) return params @property @@ -431,6 +432,12 @@ cdef class BaseOffset: """ Return a dict of extra parameters for the offset. + See Also + -------- + tseries.offsets.DateOffset : The base class for all pandas date offsets. + tseries.offsets.WeekOfMonth : Represents the week of the month. + tseries.offsets.LastWeekOfMonth : Represents the last week of the month. + Examples -------- >>> pd.DateOffset(5).kwds @@ -486,6 +493,12 @@ cdef class BaseOffset: elif is_integer_object(other): return type(self)(n=other * self.n, normalize=self.normalize, **self.kwds) + elif isinstance(other, BaseOffset): + # Otherwise raises RecursionError due to __rmul__ + raise TypeError( + f"Cannot multiply {type(self).__name__} with " + f"{type(other).__name__}." + ) return NotImplemented def __rmul__(self, other): @@ -502,6 +515,13 @@ cdef class BaseOffset: """ Return a copy of the frequency. + See Also + -------- + tseries.offsets.Week.copy : Return a copy of Week offset. + tseries.offsets.DateOffset.copy : Return a copy of date offset. + tseries.offsets.MonthEnd.copy : Return a copy of MonthEnd offset. + tseries.offsets.YearBegin.copy : Return a copy of YearBegin offset. + Examples -------- >>> freq = pd.DateOffset(1) @@ -553,6 +573,14 @@ cdef class BaseOffset: """ Return a string representing the base frequency. + See Also + -------- + tseries.offsets.Week : Represents a weekly offset. + DateOffset : Base class for all other offset classes. + tseries.offsets.Day : Represents a single day offset. + tseries.offsets.MonthEnd : Represents a monthly offset that + snaps to the end of the month.
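The new isinstance guard in __mul__ above turns what used to recurse through __rmul__ into an immediate, descriptive error; roughly:

    >>> pd.offsets.MonthEnd() * pd.offsets.Day()
    TypeError: Cannot multiply MonthEnd with Day.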
+ Examples -------- >>> pd.offsets.Hour().name 'h' @@ -569,6 +597,24 @@ cdef class BaseOffset: @property def rule_code(self) -> str: + """ + Return a string representing the base frequency. + + See Also + -------- + tseries.offsets.Hour.rule_code : + Returns a string representing the base frequency of 'h'. + tseries.offsets.Day.rule_code : + Returns a string representing the base frequency of 'D'. + + Examples + -------- + >>> pd.offsets.Hour().rule_code + 'h' + + >>> pd.offsets.Week(5).rule_code + 'W' + """ return self._prefix @cache_readonly @@ -576,6 +622,17 @@ cdef class BaseOffset: """ Return a string representing the frequency. + See Also + -------- + tseries.offsets.BusinessDay.freqstr : + Return a string representing an offset frequency in Business Days. + tseries.offsets.BusinessHour.freqstr : + Return a string representing an offset frequency in Business Hours. + tseries.offsets.Week.freqstr : + Return a string representing an offset frequency in Weeks. + tseries.offsets.Hour.freqstr : + Return a string representing an offset frequency in Hours. + Examples -------- >>> pd.DateOffset(5).freqstr @@ -665,11 +722,24 @@ cdef class BaseOffset: """ Return boolean whether a timestamp intersects with this frequency. + This method determines if a given timestamp aligns with the start + of a custom business month, as defined by this offset. It accounts + for custom rules, such as skipping weekends or other non-business days, + and checks whether the provided datetime falls on a valid business day + that marks the beginning of the custom business month. + Parameters ---------- dt : datetime.datetime Timestamp to check intersections with frequency. + See Also + -------- + tseries.offsets.CustomBusinessMonthBegin : Represents the start of a custom + business month. + tseries.offsets.CustomBusinessMonthEnd : Represents the end of a custom + business month. + Examples -------- >>> ts = pd.Timestamp(2022, 1, 1) @@ -737,7 +807,7 @@ cdef class BaseOffset: def __getstate__(self): """ - Return a pickleable state + Return a picklable state """ state = {} state["n"] = self.n @@ -753,22 +823,27 @@ cdef class BaseOffset: @property def nanos(self): - raise ValueError(f"{self} is a non-fixed frequency") - - def is_anchored(self) -> bool: - # TODO: Does this make sense for the general case? It would help - # if there were a canonical docstring for what is_anchored means. """ - Return boolean whether the frequency is a unit frequency (n=1). + Returns an integer of the total number of nanoseconds for fixed frequencies. + + Raises + ------ + ValueError + If the frequency is non-fixed. + + See Also + -------- + tseries.offsets.Hour.nanos : + Returns an integer of the total number of nanoseconds. + tseries.offsets.Day.nanos : + Returns an integer of the total number of nanoseconds. Examples -------- - >>> pd.DateOffset().is_anchored() - True - >>> pd.DateOffset(2).is_anchored() - False + >>> pd.offsets.Week(n=1).nanos + ValueError: Week: weekday=None is a non-fixed frequency """ - return self.n == 1 + raise ValueError(f"{self} is a non-fixed frequency") # ------------------------------------------------------------------ @@ -776,6 +851,15 @@ cdef class BaseOffset: """ Return boolean whether a timestamp occurs on the month start. + Parameters + ---------- + ts : Timestamp + The timestamp to check. + + See Also + -------- + is_month_end : Return boolean whether a timestamp occurs on the month end.
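With BaseOffset.is_anchored removed above (it had been deprecated), the equivalent inline check, taken from the removed implementation, is a comparison on n:

    >>> freq = pd.DateOffset(2)
    >>> freq.n == 1  # former freq.is_anchored()
    False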
+ Examples -------- >>> ts = pd.Timestamp(2022, 1, 1) @@ -789,6 +873,15 @@ cdef class BaseOffset: """ Return boolean whether a timestamp occurs on the month end. + Parameters + ---------- + ts : Timestamp + The timestamp to check. + + See Also + -------- + is_month_start : Return boolean whether a timestamp occurs on the month start. + Examples -------- >>> ts = pd.Timestamp(2022, 1, 1) @@ -802,6 +895,15 @@ cdef class BaseOffset: """ Return boolean whether a timestamp occurs on the quarter start. + Parameters + ---------- + ts : Timestamp + The timestamp to check. + + See Also + -------- + is_quarter_end : Return boolean whether a timestamp occurs on the quarter end. + Examples -------- >>> ts = pd.Timestamp(2022, 1, 1) @@ -815,6 +917,16 @@ cdef class BaseOffset: """ Return boolean whether a timestamp occurs on the quarter end. + Parameters + ---------- + ts : Timestamp + The timestamp to check. + + See Also + -------- + is_quarter_start : Return boolean whether a timestamp + occurs on the quarter start. + Examples -------- >>> ts = pd.Timestamp(2022, 1, 1) @@ -828,6 +940,15 @@ cdef class BaseOffset: """ Return boolean whether a timestamp occurs on the year start. + Parameters + ---------- + ts : Timestamp + The timestamp to check. + + See Also + -------- + is_year_end : Return boolean whether a timestamp occurs on the year end. + Examples -------- >>> ts = pd.Timestamp(2022, 1, 1) @@ -841,6 +962,15 @@ cdef class BaseOffset: """ Return boolean whether a timestamp occurs on the year end. + Parameters + ---------- + ts : Timestamp + The timestamp to check. + + See Also + -------- + is_year_start : Return boolean whether a timestamp occurs on the year start. + Examples -------- >>> ts = pd.Timestamp(2022, 1, 1) @@ -917,31 +1047,17 @@ cdef class Tick(SingleConstructorOffset): def _as_pd_timedelta(self): return Timedelta(self) - @property - def delta(self): - warnings.warn( - # GH#55498 - f"{type(self).__name__}.delta is deprecated and will be removed in " - "a future version. Use pd.Timedelta(obj) instead", - FutureWarning, - stacklevel=find_stack_level(), - ) - try: - return self.n * Timedelta(self._nanos_inc) - except OverflowError as err: - # GH#55503 as_unit will raise a more useful OutOfBoundsTimedelta - Timedelta(self).as_unit("ns") - raise AssertionError("This should not be reached.") - @property def nanos(self) -> int64_t: """ - Return an integer of the total number of nanoseconds. + Returns an integer of the total number of nanoseconds. - Raises - ------ - ValueError - If the frequency is non-fixed. + See Also + -------- + tseries.offsets.Hour.nanos : + Returns an integer of the total number of nanoseconds. + tseries.offsets.Day.nanos : + Returns an integer of the total number of nanoseconds. Examples -------- @@ -953,9 +1069,6 @@ cdef class Tick(SingleConstructorOffset): def is_on_offset(self, dt: datetime) -> bool: return True - def is_anchored(self) -> bool: - return False - # This is identical to BaseOffset.__hash__, but has to be redefined here # for Python 3, because we've redefined __eq__. def __hash__(self) -> int: @@ -1064,7 +1177,7 @@ cdef class Day(Tick): """ Offset ``n`` days. - Parameters + Attributes ---------- n : int, default 1 The number of days represented. @@ -1100,7 +1213,7 @@ cdef class Hour(Tick): """ Offset ``n`` hours. - Parameters + Attributes ---------- n : int, default 1 The number of hours represented. @@ -1136,7 +1249,7 @@ cdef class Minute(Tick): """ Offset ``n`` minutes. 
- Parameters + Attributes ---------- n : int, default 1 The number of minutes represented. @@ -1172,7 +1285,7 @@ cdef class Second(Tick): """ Offset ``n`` seconds. - Parameters + Attributes ---------- n : int, default 1 The number of seconds represented. @@ -1208,7 +1321,7 @@ cdef class Milli(Tick): """ Offset ``n`` milliseconds. - Parameters + Attributes ---------- n : int, default 1 The number of milliseconds represented. @@ -1245,7 +1358,7 @@ cdef class Micro(Tick): """ Offset ``n`` microseconds. - Parameters + Attributes ---------- n : int, default 1 The number of microseconds represented. @@ -1282,7 +1395,7 @@ cdef class Nano(Tick): """ Offset ``n`` nanoseconds. - Parameters + Attributes ---------- n : int, default 1 The number of nanoseconds represented. @@ -1358,7 +1471,7 @@ cdef class RelativeDeltaOffset(BaseOffset): def __getstate__(self): """ - Return a pickleable state + Return a picklable state """ # RelativeDeltaOffset (technically DateOffset) is the only non-cdef # class, so the only one with __dict__ @@ -1428,13 +1541,22 @@ cdef class RelativeDeltaOffset(BaseOffset): "minutes", "seconds", "microseconds", + "milliseconds", } # relativedelta/_offset path only valid for base DateOffset if self._use_relativedelta and set(kwds).issubset(relativedelta_fast): + td_args = { + "days", + "hours", + "minutes", + "seconds", + "microseconds", + "milliseconds" + } td_kwds = { key: val for key, val in kwds.items() - if key in ["days", "hours", "minutes", "seconds", "microseconds"] + if key in td_args } if "weeks" in kwds: days = td_kwds.get("days", 0) @@ -1444,6 +1566,8 @@ cdef class RelativeDeltaOffset(BaseOffset): delta = Timedelta(**td_kwds) if "microseconds" in kwds: delta = delta.as_unit("us") + elif "milliseconds" in kwds: + delta = delta.as_unit("ms") else: delta = delta.as_unit("s") else: @@ -1461,6 +1585,8 @@ cdef class RelativeDeltaOffset(BaseOffset): delta = Timedelta(self._offset * self.n) if "microseconds" in kwds: delta = delta.as_unit("us") + elif "milliseconds" in kwds: + delta = delta.as_unit("ms") else: delta = delta.as_unit("s") return delta @@ -1514,7 +1640,7 @@ class DateOffset(RelativeDeltaOffset, metaclass=OffsetMeta): Standard kind of date increment used for a date range. Works exactly like the keyword argument form of relativedelta. - Note that the positional argument form of relativedelata is not + Note that the positional argument form of relativedelta is not supported. Use of the keyword n is discouraged-- you would be better off specifying n in the keywords you use, but regardless it is there for you. n is needed for DateOffset subclasses. @@ -1556,7 +1682,7 @@ class DateOffset(RelativeDeltaOffset, metaclass=OffsetMeta): Besides, adding a DateOffsets specified by the singular form of the date component can be used to replace certain component of the timestamp. - Parameters + Attributes ---------- n : int, default 1 The number of time periods the offset represents. @@ -2366,6 +2492,24 @@ cdef class WeekOfMonthMixin(SingleConstructorOffset): @property def rule_code(self) -> str: + """ + Return a string representing the base frequency. + + See Also + -------- + tseries.offsets.Hour.rule_code : + Returns a string representing the base frequency of 'h'. + tseries.offsets.Day.rule_code : + Returns a string representing the base frequency of 'D'. 
+ + Examples + -------- + >>> pd.offsets.Week(5).rule_code + 'W' + + >>> pd.offsets.WeekOfMonth(n=1, week=0, weekday=0).rule_code + 'WOM-1MON' + """ weekday = int_to_weekday.get(self.weekday, "") if self.week == -1: # LastWeekOfMonth @@ -2382,8 +2526,7 @@ cdef class YearOffset(SingleConstructorOffset): """ _attributes = tuple(["n", "normalize", "month"]) - # FIXME(cython#4446): python annotation here gives compile-time errors - # _default_month: int + _default_month: ClassVar[int] cdef readonly: int month @@ -2412,6 +2555,24 @@ cdef class YearOffset(SingleConstructorOffset): @property def rule_code(self) -> str: + """ + Return a string representing the base frequency. + + See Also + -------- + tseries.offsets.Hour.rule_code : + Returns a string representing the base frequency of 'h'. + tseries.offsets.Day.rule_code : + Returns a string representing the base frequency of 'D'. + + Examples + -------- + >>> pd.tseries.offsets.YearBegin(n=1, month=2).rule_code + 'YS-FEB' + + >>> pd.tseries.offsets.YearEnd(n=1, month=6).rule_code + 'YE-JUN' + """ month = MONTH_ALIASES[self.month] return f"{self._prefix}-{month}" @@ -2446,7 +2607,7 @@ cdef class BYearEnd(YearOffset): """ DateOffset increments between the last business day of the year. - Parameters + Attributes ---------- n : int, default 1 The number of years represented. @@ -2485,7 +2646,7 @@ cdef class BYearBegin(YearOffset): """ DateOffset increments between the first business day of the year. - Parameters + Attributes ---------- n : int, default 1 The number of years represented. @@ -2526,7 +2687,7 @@ cdef class YearEnd(YearOffset): YearEnd goes to the next date which is the end of the year. - Parameters + Attributes ---------- n : int, default 1 The number of years represented. @@ -2580,7 +2741,7 @@ cdef class YearBegin(YearOffset): YearBegin goes to the next date which is the start of the year. - Parameters + Attributes ---------- n : int, default 1 The number of years represented. @@ -2628,9 +2789,8 @@ cdef class QuarterOffset(SingleConstructorOffset): # point. Also apply_index, is_on_offset, rule_code if # startingMonth vs month attr names are resolved - # FIXME(cython#4446): python annotation here gives compile-time errors - # _default_starting_month: int - # _from_name_starting_month: int + _default_starting_month: ClassVar[int] + _from_name_starting_month: ClassVar[int] cdef readonly: int startingMonth @@ -2662,9 +2822,6 @@ cdef class QuarterOffset(SingleConstructorOffset): month = MONTH_ALIASES[self.startingMonth] return f"{self._prefix}-{month}" - def is_anchored(self) -> bool: - return self.n == 1 and self.startingMonth is not None - def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False @@ -2706,7 +2863,7 @@ cdef class BQuarterEnd(QuarterOffset): startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ... startingMonth = 3 corresponds to dates like 3/30/2007, 6/29/2007, ... - Parameters + Attributes ---------- n : int, default 1 The number of quarters represented. @@ -2747,7 +2904,7 @@ cdef class BQuarterBegin(QuarterOffset): startingMonth = 2 corresponds to dates like 2/01/2007, 5/01/2007, ... startingMonth = 3 corresponds to dates like 3/01/2007, 6/01/2007, ... - Parameters + Attributes ---------- n : int, default 1 The number of quarters represented. @@ -2788,7 +2945,7 @@ cdef class QuarterEnd(QuarterOffset): startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ... startingMonth = 3 corresponds to dates like 3/31/2007, 6/30/2007, ... 
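The ``rule_code`` strings illustrated above double as frequency aliases, so they round-trip through ``to_offset``; for example (a sketch using the current public API):

>>> pd.offsets.YearEnd(month=6).rule_code
'YE-JUN'
>>> pd.tseries.frequencies.to_offset('YE-JUN')
<YearEnd: month=6>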
- Parameters + Attributes ---------- n : int, default 1 The number of quarters represented. @@ -2829,7 +2986,7 @@ cdef class QuarterBegin(QuarterOffset): startingMonth = 2 corresponds to dates like 2/01/2007, 5/01/2007, ... startingMonth = 3 corresponds to dates like 3/01/2007, 6/01/2007, ... - Parameters + Attributes ---------- n : int, default 1 The number of quarters represented. @@ -2854,6 +3011,227 @@ cdef class QuarterBegin(QuarterOffset): _day_opt = "start" +# ---------------------------------------------------------------------- +# HalfYear-Based Offset Classes + +cdef class HalfYearOffset(SingleConstructorOffset): + _attributes = tuple(["n", "normalize", "startingMonth"]) + # TODO: Consider combining HalfYearOffset, QuarterOffset and YearOffset + + _default_starting_month: ClassVar[int] + _from_name_starting_month: ClassVar[int] + + cdef readonly: + int startingMonth + + def __init__(self, n=1, normalize=False, startingMonth=None): + BaseOffset.__init__(self, n, normalize) + + if startingMonth is None: + startingMonth = self._default_starting_month + self.startingMonth = startingMonth + + cpdef __setstate__(self, state): + self.startingMonth = state.pop("startingMonth") + self.n = state.pop("n") + self.normalize = state.pop("normalize") + + @classmethod + def _from_name(cls, suffix=None): + kwargs = {} + if suffix: + kwargs["startingMonth"] = MONTH_TO_CAL_NUM[suffix] + else: + if cls._from_name_starting_month is not None: + kwargs["startingMonth"] = cls._from_name_starting_month + return cls(**kwargs) + + @property + def rule_code(self) -> str: + month = MONTH_ALIASES[self.startingMonth] + return f"{self._prefix}-{month}" + + def is_on_offset(self, dt: datetime) -> bool: + if self.normalize and not _is_normalized(dt): + return False + mod_month = (dt.month - self.startingMonth) % 6 + return mod_month == 0 and dt.day == self._get_offset_day(dt) + + @apply_wraps + def _apply(self, other: datetime) -> datetime: + # months_since: find the calendar half containing other.month, + # e.g. if other.month == 8, the calendar half is [Jul, Aug, Sep, ..., Dec]. + # Then find the month in that half containing an is_on_offset date for + # self. `months_since` is the number of months to shift other.month + # to get to this on-offset month. + months_since = other.month % 6 - self.startingMonth % 6 + hlvs = roll_qtrday( + other, self.n, self.startingMonth, day_opt=self._day_opt, modby=6 + ) + months = hlvs * 6 - months_since + return shift_month(other, months, self._day_opt) + + def _apply_array(self, dtarr: np.ndarray) -> np.ndarray: + reso = get_unit_from_dtype(dtarr.dtype) + shifted = shift_quarters( + dtarr.view("i8"), + self.n, + self.startingMonth, + self._day_opt, + modby=6, + reso=reso, + ) + return shifted + + +cdef class BHalfYearEnd(HalfYearOffset): + """ + DateOffset increments between the last business day of each half-year. + + startingMonth = 1 corresponds to dates like 1/31/2007, 7/31/2007, ... + startingMonth = 2 corresponds to dates like 2/28/2007, 8/31/2007, ... + startingMonth = 6 corresponds to dates like 6/30/2007, 12/31/2007, ... + + Attributes + ---------- + n : int, default 1 + The number of half-years represented. + normalize : bool, default False + Normalize start/end dates to midnight before generating date range. + startingMonth : int, default 6 + A specific integer for the month of the year from which we start half-years. + + See Also + -------- + :class:`~pandas.tseries.offsets.DateOffset` : Standard kind of date increment. 
+ + Examples + -------- + >>> from pandas.tseries.offsets import BHalfYearEnd + >>> ts = pd.Timestamp('2020-05-24 05:01:15') + >>> ts + BHalfYearEnd() + Timestamp('2020-06-30 05:01:15') + >>> ts + BHalfYearEnd(2) + Timestamp('2020-12-31 05:01:15') + >>> ts + BHalfYearEnd(1, startingMonth=2) + Timestamp('2020-08-31 05:01:15') + >>> ts + BHalfYearEnd(startingMonth=2) + Timestamp('2020-08-31 05:01:15') + """ + _output_name = "BusinessHalfYearEnd" + _default_starting_month = 6 + _from_name_starting_month = 12 + _prefix = "BHYE" + _day_opt = "business_end" + + +cdef class BHalfYearBegin(HalfYearOffset): + """ + DateOffset increments between the first business day of each half-year. + + startingMonth = 1 corresponds to dates like 1/01/2007, 7/01/2007, ... + startingMonth = 2 corresponds to dates like 2/01/2007, 8/01/2007, ... + startingMonth = 3 corresponds to dates like 3/01/2007, 9/01/2007, ... + + Attributes + ---------- + n : int, default 1 + The number of half-years represented. + normalize : bool, default False + Normalize start/end dates to midnight before generating date range. + startingMonth : int, default 1 + A specific integer for the month of the year from which we start half-years. + + See Also + -------- + :class:`~pandas.tseries.offsets.DateOffset` : Standard kind of date increment. + + Examples + -------- + >>> from pandas.tseries.offsets import BHalfYearBegin + >>> ts = pd.Timestamp('2020-05-24 05:01:15') + >>> ts + BHalfYearBegin() + Timestamp('2020-07-01 05:01:15') + >>> ts + BHalfYearBegin(2) + Timestamp('2021-01-01 05:01:15') + >>> ts + BHalfYearBegin(startingMonth=2) + Timestamp('2020-08-03 05:01:15') + >>> ts + BHalfYearBegin(-1) + Timestamp('2020-01-01 05:01:15') + """ + _output_name = "BusinessHalfYearBegin" + _default_starting_month = 1 + _from_name_starting_month = 1 + _prefix = "BHYS" + _day_opt = "business_start" + + +cdef class HalfYearEnd(HalfYearOffset): + """ + DateOffset increments between half-year end dates. + + startingMonth = 1 corresponds to dates like 1/31/2007, 7/31/2007, ... + startingMonth = 2 corresponds to dates like 2/28/2007, 8/31/2007, ... + startingMonth = 6 corresponds to dates like 6/30/2007, 12/31/2007, ... + + Attributes + ---------- + n : int, default 1 + The number of half-years represented. + normalize : bool, default False + Normalize start/end dates to midnight before generating date range. + startingMonth : int, default 6 + A specific integer for the month of the year from which we start half-years. + + See Also + -------- + :class:`~pandas.tseries.offsets.DateOffset` : Standard kind of date increment. + + Examples + -------- + >>> ts = pd.Timestamp(2022, 1, 1) + >>> ts + pd.offsets.HalfYearEnd() + Timestamp('2022-06-30 00:00:00') + """ + _default_starting_month = 6 + _from_name_starting_month = 12 + _prefix = "HYE" + _day_opt = "end" + + +cdef class HalfYearBegin(HalfYearOffset): + """ + DateOffset increments between half-year start dates. + + startingMonth = 1 corresponds to dates like 1/01/2007, 7/01/2007, ... + startingMonth = 2 corresponds to dates like 2/01/2007, 8/01/2007, ... + startingMonth = 3 corresponds to dates like 3/01/2007, 9/01/2007, ... + + Attributes + ---------- + n : int, default 1 + The number of half-years represented. + normalize : bool, default False + Normalize start/end dates to midnight before generating date range. + startingMonth : int, default 1 + A specific integer for the month of the year from which we start half-years. 
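The half-year classes in this section register the new 'HYS'/'HYE' aliases (see the prefix_mapping additions further down), so they can drive ``date_range`` directly; a usage sketch assuming a build with this patch:

>>> idx = pd.date_range('2024-01-01', periods=4, freq='HYS')
>>> list(idx.strftime('%Y-%m-%d'))
['2024-01-01', '2024-07-01', '2025-01-01', '2025-07-01']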
+ + See Also + -------- + :class:`~pandas.tseries.offsets.DateOffset` : Standard kind of date increment. + + Examples + -------- + >>> ts = pd.Timestamp(2022, 2, 1) + >>> ts + pd.offsets.HalfYearBegin() + Timestamp('2022-07-01 00:00:00') + """ + _default_starting_month = 1 + _from_name_starting_month = 1 + _prefix = "HYS" + _day_opt = "start" + + # ---------------------------------------------------------------------- # Month-Based Offset Classes @@ -2889,7 +3267,7 @@ cdef class MonthEnd(MonthOffset): MonthEnd goes to the next date which is an end of the month. - Parameters + Attributes ---------- n : int, default 1 The number of months represented. @@ -2927,7 +3305,7 @@ cdef class MonthBegin(MonthOffset): MonthBegin goes to the next date which is a start of the month. - Parameters + Attributes ---------- n : int, default 1 The number of months represented. @@ -2964,7 +3342,7 @@ cdef class BusinessMonthEnd(MonthOffset): BusinessMonthEnd goes to the next date which is the last business day of the month. - Parameters + Attributes ---------- n : int, default 1 The number of months represented. @@ -3002,7 +3380,7 @@ cdef class BusinessMonthBegin(MonthOffset): BusinessMonthBegin goes to the next date which is the first business day of the month. - Parameters + Attributes ---------- n : int, default 1 The number of months represented. @@ -3172,7 +3550,12 @@ cdef class SemiMonthEnd(SemiMonthOffset): """ Two DateOffset's per month repeating on the last day of the month & day_of_month. - Parameters + This offset allows for flexibility in generating date ranges or adjusting dates + to the end of a month or a specific day in the month, such as the 15th or the last + day of the month. It is useful for financial or scheduling applications where + events occur bi-monthly. + + Attributes ---------- n : int, default 1 The number of months represented. @@ -3181,6 +3564,13 @@ cdef class SemiMonthEnd(SemiMonthOffset): day_of_month : int, {1, 3,...,27}, default 15 A specific integer for the day of the month. + See Also + -------- + tseries.offsets.SemiMonthBegin : Offset for semi-monthly frequencies, starting at + the beginning of the month. + tseries.offsets.MonthEnd : Offset to the last calendar day of the month. + tseries.offsets.MonthBegin : Offset to the first calendar day of the month. + Examples -------- >>> ts = pd.Timestamp(2022, 1, 14) @@ -3215,7 +3605,11 @@ cdef class SemiMonthBegin(SemiMonthOffset): """ Two DateOffset's per month repeating on the first day of the month & day_of_month. - Parameters + This offset moves dates to the first day of the month and an additional specified + day (typically the 15th by default), useful in scenarios where bi-monthly processing + occurs on set days. + + Attributes ---------- n : int, default 1 The number of months represented. @@ -3224,6 +3618,13 @@ cdef class SemiMonthBegin(SemiMonthOffset): day_of_month : int, {1, 3,...,27}, default 15 A specific integer for the day of the month. + See Also + -------- + tseries.offsets.SemiMonthEnd : Two DateOffset's per month repeating on the last day + of the month & day_of_month. + tseries.offsets.MonthEnd : Offset to the last calendar day of the month. + tseries.offsets.MonthBegin : Offset to the first calendar day of the month. + Examples -------- >>> ts = pd.Timestamp(2022, 1, 1) @@ -3247,7 +3648,7 @@ cdef class Week(SingleConstructorOffset): """ Weekly offset. - Parameters + Attributes ---------- n : int, default 1 The number of weeks represented. 
@@ -3307,9 +3708,6 @@ cdef class Week(SingleConstructorOffset): self.weekday = state.pop("weekday") self._cache = state.pop("_cache", {}) - def is_anchored(self) -> bool: - return self.n == 1 and self.weekday is not None - @apply_wraps def _apply(self, other): if self.weekday is None: @@ -3404,6 +3802,24 @@ cdef class Week(SingleConstructorOffset): @property def rule_code(self) -> str: + """ + Return a string representing the base frequency. + + See Also + -------- + tseries.offsets.Hour.name : + Returns a string representing the base frequency of 'h'. + tseries.offsets.Day.name : + Returns a string representing the base frequency of 'D'. + + Examples + -------- + >>> pd.offsets.Hour().rule_code + 'h' + + >>> pd.offsets.Week(5).rule_code + 'W' + """ suffix = "" if self.weekday is not None: weekday = int_to_weekday[self.weekday] @@ -3423,7 +3839,12 @@ cdef class WeekOfMonth(WeekOfMonthMixin): """ Describes monthly dates like "the Tuesday of the 2nd week of each month". - Parameters + This offset allows for generating or adjusting dates by specifying + a particular week and weekday within a month. The week is zero-indexed, + where 0 corresponds to the first week of the month, and weekday follows + a Monday=0 convention. + + Attributes ---------- n : int, default 1 The number of months represented. @@ -3443,6 +3864,12 @@ cdef class WeekOfMonth(WeekOfMonthMixin): - 5 is Saturday - 6 is Sunday. + See Also + -------- + offsets.Week : Describes weekly frequency adjustments. + offsets.MonthEnd : Describes month-end frequency adjustments. + date_range : Generates a range of dates based on a specific frequency. + Examples -------- >>> ts = pd.Timestamp(2022, 1, 1) @@ -3500,7 +3927,7 @@ cdef class LastWeekOfMonth(WeekOfMonthMixin): For example "the last Tuesday of each month". - Parameters + Attributes ---------- n : int, default 1 The number of months represented. @@ -3517,6 +3944,15 @@ cdef class LastWeekOfMonth(WeekOfMonthMixin): - 5 is Saturday - 6 is Sunday. + See Also + -------- + tseries.offsets.WeekOfMonth : + Date offset for a specific weekday in a month. + tseries.offsets.MonthEnd : + Date offset for the end of the month. + tseries.offsets.BMonthEnd : + Date offset for the last business day of the month. + Examples -------- >>> ts = pd.Timestamp(2022, 1, 1) @@ -3596,11 +4032,6 @@ cdef class FY5253Mixin(SingleConstructorOffset): self.weekday = state.pop("weekday") self.variation = state.pop("variation") - def is_anchored(self) -> bool: - return ( - self.n == 1 and self.startingMonth is not None and self.weekday is not None - ) - # -------------------------------------------------------------------- # Name-related methods @@ -3645,7 +4076,7 @@ cdef class FY5253(FY5253Mixin): X is a specific day of the week. Y is a certain month of the year - Parameters + Attributes ---------- n : int The number of fiscal years represented. @@ -3848,7 +4279,7 @@ cdef class FY5253Quarter(FY5253Mixin): startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ... startingMonth = 3 corresponds to dates like 3/30/2007, 6/29/2007, ... - Parameters + Attributes ---------- n : int The number of business quarters represented. @@ -4083,7 +4514,7 @@ cdef class Easter(SingleConstructorOffset): Right now uses the revised method which is valid in years 1583-4099. - Parameters + Attributes ---------- n : int, default 1 The number of years represented. 
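LastWeekOfMonth, documented above, also gains a registered 'LWOM' prefix in this patch (see prefix_mapping below); a short sketch:

>>> ts = pd.Timestamp('2022-01-01')
>>> ts + pd.offsets.LastWeekOfMonth(weekday=1)
Timestamp('2022-01-25 00:00:00')
>>> pd.offsets.LastWeekOfMonth(weekday=1).rule_code
'LWOM-TUE'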
@@ -4221,9 +4652,7 @@ cdef class CustomBusinessDay(BusinessDay): @property def _period_dtype_code(self): # GH#52534 - raise TypeError( - "CustomBusinessDay is not supported as period frequency" - ) + raise ValueError(f"{self.base} is not supported as period frequency") _apply_array = BaseOffset._apply_array @@ -4615,6 +5044,8 @@ prefix_mapping = { BusinessMonthEnd, # 'BME' BQuarterEnd, # 'BQE' BQuarterBegin, # 'BQS' + BHalfYearEnd, # 'BHYE' + BHalfYearBegin, # 'BHYS' BusinessHour, # 'bh' CustomBusinessDay, # 'C' CustomBusinessMonthEnd, # 'CBME' @@ -4631,10 +5062,13 @@ prefix_mapping = { Micro, # 'us' QuarterEnd, # 'QE' QuarterBegin, # 'QS' + HalfYearEnd, # 'HYE' + HalfYearBegin, # 'HYS' Milli, # 'ms' Hour, # 'h' Day, # 'D' WeekOfMonth, # 'WOM' + LastWeekOfMonth, # 'LWOM' FY5253, FY5253Quarter, ] @@ -4655,35 +5089,9 @@ _lite_rule_alias = { "BYS": "BYS-JAN", # BYearBegin(month=1), "Min": "min", - "min": "min", - "ms": "ms", - "us": "us", - "ns": "ns", } -_dont_uppercase = { - "h", - "bh", - "cbh", - "MS", - "ms", - "s", - "me", - "qe", - "qe-dec", - "qe-jan", - "qe-feb", - "qe-mar", - "qe-apr", - "qe-may", - "qe-jun", - "qe-jul", - "qe-aug", - "qe-sep", - "qe-oct", - "qe-nov", - "ye", -} +_dont_uppercase = {"min", "h", "bh", "cbh", "s", "ms", "us", "ns"} INVALID_FREQ_ERR_MSG = "Invalid frequency: {0}" @@ -4693,6 +5101,61 @@ INVALID_FREQ_ERR_MSG = "Invalid frequency: {0}" _offset_map = {} +def _warn_about_deprecated_aliases(name: str, is_period: bool) -> str: + if name in _lite_rule_alias: + return name + if name in c_PERIOD_AND_OFFSET_DEPR_FREQSTR: + warnings.warn( + f"\'{name}\' is deprecated and will be removed " + f"in a future version, please use " + f"\'{c_PERIOD_AND_OFFSET_DEPR_FREQSTR.get(name)}\'" + f" instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return c_PERIOD_AND_OFFSET_DEPR_FREQSTR[name] + + for _name in (name.lower(), name.upper()): + if name == _name: + continue + if _name in c_PERIOD_AND_OFFSET_DEPR_FREQSTR.values(): + warnings.warn( + f"\'{name}\' is deprecated and will be removed " + f"in a future version, please use " + f"\'{_name}\'" + f" instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return _name + + return name + + +def _validate_to_offset_alias(alias: str, is_period: bool) -> None: + if not is_period: + if alias.upper() in c_OFFSET_RENAMED_FREQSTR: + raise ValueError( + f"\'{alias}\' is no longer supported for offsets. Please " + f"use \'{c_OFFSET_RENAMED_FREQSTR.get(alias.upper())}\' " + f"instead." + ) + if (alias.upper() != alias and + alias.lower() not in {"s", "ms", "us", "ns"} and + alias.upper().split("-")[0].endswith(("S", "E"))): + raise ValueError(INVALID_FREQ_ERR_MSG.format(alias)) + if ( + is_period and + alias in c_OFFSET_TO_PERIOD_FREQSTR and + alias != c_OFFSET_TO_PERIOD_FREQSTR[alias] + ): + alias_msg = c_OFFSET_TO_PERIOD_FREQSTR.get(alias) + raise ValueError( + f"for Period, please use \'{alias_msg}\' " + f"instead of \'{alias}\'" + ) + + # TODO: better name? 
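# Net effect of the validation helper above for a renamed offset alias
# (illustrative; message text per _validate_to_offset_alias, assuming
# c_OFFSET_RENAMED_FREQSTR maps "M" -> "ME"):
# >>> pd.tseries.frequencies.to_offset("M")
# ValueError: 'M' is no longer supported for offsets. Please use 'ME' instead.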
def _get_offset(name: str) -> BaseOffset: """ @@ -4702,13 +5165,6 @@ def _get_offset(name: str) -> BaseOffset: -------- _get_offset('EOM') --> BMonthEnd(1) """ - if name.lower() not in _dont_uppercase: - name = name.upper() - name = _lite_rule_alias.get(name, name) - name = _lite_rule_alias.get(name.lower(), name) - else: - name = _lite_rule_alias.get(name, name) - if name not in _offset_map: try: split = name.split("-") @@ -4734,6 +5190,10 @@ cpdef to_offset(freq, bint is_period=False): Parameters ---------- freq : str, datetime.timedelta, BaseOffset or None + The frequency represented. + is_period : bool, default False + Convert string denoting period frequency to corresponding offsets + frequency if is_period=True. Returns ------- @@ -4768,23 +5228,34 @@ cpdef to_offset(freq, bint is_period=False): >>> to_offset(pd.offsets.Hour()) + + Passing the parameter ``is_period`` equal to True, you can use a string + denoting period frequency: + + >>> freq = to_offset(freq="ME", is_period=False) + >>> freq.rule_code + 'ME' + + >>> freq = to_offset(freq="M", is_period=True) + >>> freq.rule_code + 'ME' """ if freq is None: return None - if isinstance(freq, BaseOffset): - return freq - if isinstance(freq, tuple): raise TypeError( f"to_offset does not support tuples {freq}, pass as a string instead" ) + if isinstance(freq, BaseOffset): + result = freq + elif PyDelta_Check(freq): - return delta_to_tick(freq) + result = delta_to_tick(freq) elif isinstance(freq, str): - delta = None + result = None stride_sign = None try: @@ -4795,64 +5266,30 @@ cpdef to_offset(freq, bint is_period=False): tups = zip(split[0::4], split[1::4], split[2::4]) for n, (sep, stride, name) in enumerate(tups): - if is_period is False and name in c_OFFSET_DEPR_FREQSTR: - warnings.warn( - f"\'{name}\' is deprecated and will be removed " - f"in a future version, please use " - f"\'{c_OFFSET_DEPR_FREQSTR.get(name)}\' instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - name = c_OFFSET_DEPR_FREQSTR[name] - if is_period is True and name in c_REVERSE_OFFSET_DEPR_FREQSTR: - if name.startswith("Y"): - raise ValueError( - f"for Period, please use \'Y{name[2:]}\' " - f"instead of \'{name}\'" - ) - if (name.startswith("B") or - name.startswith("S") or name.startswith("C")): - raise ValueError(INVALID_FREQ_ERR_MSG.format(name)) - else: - raise ValueError( - f"for Period, please use " - f"\'{c_REVERSE_OFFSET_DEPR_FREQSTR.get(name)}\' " - f"instead of \'{name}\'" - ) - elif is_period is True and name in c_OFFSET_DEPR_FREQSTR: - if name.startswith("A"): - warnings.warn( - f"\'{name}\' is deprecated and will be removed in a future " - f"version, please use \'{c_DEPR_ABBREVS.get(name)}\' " - f"instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - name = c_OFFSET_DEPR_FREQSTR.get(name) + name = _warn_about_deprecated_aliases(name, is_period) + _validate_to_offset_alias(name, is_period) + if is_period: + if name.upper() in c_PERIOD_TO_OFFSET_FREQSTR: + if name.upper() != name: + raise ValueError( + f"\'{name}\' is no longer supported, " + f"please use \'{name.upper()}\' instead.", + ) + name = c_PERIOD_TO_OFFSET_FREQSTR[name.upper()] + name = _lite_rule_alias.get(name, name) if sep != "" and not sep.isspace(): raise ValueError("separator must be spaces") - prefix = _lite_rule_alias.get(name) or name if stride_sign is None: stride_sign = -1 if stride.startswith("-") else 1 if not stride: stride = 1 - if prefix in c_DEPR_ABBREVS: - warnings.warn( - f"\'{prefix}\' is deprecated and will be removed " - f"in a future version, 
please use " - f"\'{c_DEPR_ABBREVS.get(prefix)}\' instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - prefix = c_DEPR_ABBREVS[prefix] - - if prefix in {"D", "h", "min", "s", "ms", "us", "ns"}: + if name in {"D", "h", "min", "s", "ms", "us", "ns"}: # For these prefixes, we have something like "3h" or # "2.5min", so we can construct a Timedelta with the # matching unit and get our offset from delta_to_tick - td = Timedelta(1, unit=prefix) + td = Timedelta(1, unit=name) off = delta_to_tick(td) offset = off * float(stride) if n != 0: @@ -4861,24 +5298,35 @@ cpdef to_offset(freq, bint is_period=False): offset *= stride_sign else: stride = int(stride) - offset = _get_offset(prefix) + offset = _get_offset(name) offset = offset * int(np.fabs(stride) * stride_sign) - if delta is None: - delta = offset + if result is None: + result = offset else: - delta = delta + offset + result = result + offset except (ValueError, TypeError) as err: raise ValueError(INVALID_FREQ_ERR_MSG.format( f"{freq}, failed to parse with error message: {repr(err)}") - ) + ) from err else: - delta = None + result = None - if delta is None: + if result is None: raise ValueError(INVALID_FREQ_ERR_MSG.format(freq)) - return delta + try: + has_period_dtype_code = hasattr(result, "_period_dtype_code") + except ValueError: + has_period_dtype_code = False + + if is_period and not has_period_dtype_code: + if isinstance(freq, str): + raise ValueError(f"{result.name} is not supported as period frequency") + else: + raise ValueError(f"{freq} is not supported as period frequency") + + return result # ---------------------------------------------------------------------- diff --git a/pandas/_libs/tslibs/parsing.pyi b/pandas/_libs/tslibs/parsing.pyi index 40394f915d4b0..845bd9a5a5635 100644 --- a/pandas/_libs/tslibs/parsing.pyi +++ b/pandas/_libs/tslibs/parsing.pyi @@ -27,7 +27,4 @@ def guess_datetime_format( dt_str: str, dayfirst: bool | None = ..., ) -> str | None: ... -def concat_date_cols( - date_cols: tuple, -) -> npt.NDArray[np.object_]: ... def get_rule_month(source: str) -> str: ... 
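The Tick fast path retained in to_offset above builds a one-unit Timedelta and converts it with delta_to_tick, which is what makes fractional strides work; e.g.:

>>> pd.tseries.frequencies.to_offset('2.5min')
<150 * Seconds>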
diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index d0872a509c440..308183402198d 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -7,7 +7,6 @@ import warnings from pandas.util._exceptions import find_stack_level -cimport cython from cpython.datetime cimport ( datetime, datetime_new, @@ -18,7 +17,7 @@ from cpython.datetime cimport ( from datetime import timezone -from cpython.object cimport PyObject_Str +from cpython.unicode cimport PyUnicode_AsUTF8AndSize from cython cimport Py_ssize_t from libc.string cimport strchr @@ -27,15 +26,7 @@ import_datetime() import numpy as np cimport numpy as cnp -from numpy cimport ( - PyArray_GETITEM, - PyArray_ITER_DATA, - PyArray_ITER_NEXT, - PyArray_IterNew, - flatiter, - float64_t, - int64_t, -) +from numpy cimport int64_t cnp.import_array() @@ -45,7 +36,6 @@ from decimal import InvalidOperation from dateutil.parser import DEFAULTPARSER from dateutil.tz import ( - tzlocal as _dateutil_tzlocal, tzoffset, tzutc as _dateutil_tzutc, ) @@ -75,11 +65,6 @@ import_pandas_datetime() from pandas._libs.tslibs.strptime import array_strptime -from pandas._libs.tslibs.util cimport ( - get_c_string_buf_and_size, - is_array, -) - cdef extern from "pandas/portable.h": int getdigit_ascii(char c, int default) nogil @@ -176,7 +161,7 @@ cdef datetime _parse_delimited_date( int day = 1, month = 1, year bint can_swap = 0 - buf = get_c_string_buf_and_size(date_string, &length) + buf = PyUnicode_AsUTF8AndSize(date_string, &length) if length == 10 and _is_delimiter(buf[2]) and _is_delimiter(buf[5]): # parsing MM?DD?YYYY and DD?MM?YYYY dates month = _parse_2digit(buf) @@ -252,7 +237,7 @@ cdef bint _does_string_look_like_time(str parse_string): Py_ssize_t length int hour = -1, minute = -1 - buf = get_c_string_buf_and_size(parse_string, &length) + buf = PyUnicode_AsUTF8AndSize(parse_string, &length) if length >= 4: if buf[1] == b":": # h:MM format @@ -392,32 +377,33 @@ def parse_datetime_string_with_reso( raise ValueError(f'Given date string "{date_string}" not likely a datetime') # Try iso8601 first, as it handles nanoseconds - string_to_dts_failed = string_to_dts( - date_string, &dts, &out_bestunit, &out_local, - &out_tzoffset, False - ) - if not string_to_dts_failed: - # Match Timestamp and drop picoseconds, femtoseconds, attoseconds - # The new resolution will just be nano - # GH#50417 - if out_bestunit in _timestamp_units: - out_bestunit = NPY_DATETIMEUNIT.NPY_FR_ns - - if out_bestunit == NPY_DATETIMEUNIT.NPY_FR_ns: - # TODO: avoid circular import - from pandas import Timestamp - parsed = Timestamp(date_string) - else: - if out_local: - tz = timezone(timedelta(minutes=out_tzoffset)) + if not dayfirst: # GH 58859 + string_to_dts_failed = string_to_dts( + date_string, &dts, &out_bestunit, &out_local, + &out_tzoffset, False + ) + if not string_to_dts_failed: + # Match Timestamp and drop picoseconds, femtoseconds, attoseconds + # The new resolution will just be nano + # GH#50417 + if out_bestunit in _timestamp_units: + out_bestunit = NPY_DATETIMEUNIT.NPY_FR_ns + + if out_bestunit == NPY_DATETIMEUNIT.NPY_FR_ns: + # TODO: avoid circular import + from pandas import Timestamp + parsed = Timestamp(date_string) else: - tz = None - parsed = datetime_new( - dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz - ) + if out_local: + tz = timezone(timedelta(minutes=out_tzoffset)) + else: + tz = None + parsed = datetime_new( + dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz + ) - reso = 
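# With the `not dayfirst` guard above (GH 58859), dayfirst parsing skips the
# ISO fast path and falls through to the delimited-date parser, so dayfirst
# can be honored; a sketch of the intended user-facing behavior (not part of
# this hunk):
# >>> pd.to_datetime('01-02-2000', dayfirst=True)
# Timestamp('2000-02-01 00:00:00')
# >>> pd.to_datetime('01-02-2000', dayfirst=False)
# Timestamp('2000-01-02 00:00:00')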
npy_unit_to_attrname[out_bestunit] - return parsed, reso + reso = npy_unit_to_attrname[out_bestunit] + return parsed, reso parsed = _parse_delimited_date(date_string, dayfirst, &out_bestunit) if parsed is not None: @@ -468,7 +454,7 @@ cpdef bint _does_string_look_like_datetime(str py_string): char first int error = 0 - buf = get_c_string_buf_and_size(py_string, &length) + buf = PyUnicode_AsUTF8AndSize(py_string, &length) if length >= 1: first = buf[0] if first == b"0": @@ -522,7 +508,7 @@ cdef datetime _parse_dateabbr_string(str date_string, datetime default, pass if 4 <= date_len <= 7: - buf = get_c_string_buf_and_size(date_string, &date_len) + buf = PyUnicode_AsUTF8AndSize(date_string, &date_len) try: i = date_string.index("Q", 1, 6) if i == 1: @@ -703,17 +689,12 @@ cdef datetime dateutil_parse( if res.tzname and res.tzname in time.tzname: # GH#50791 if res.tzname != "UTC": - # If the system is localized in UTC (as many CI runs are) - # we get tzlocal, once the deprecation is enforced will get - # timezone.utc, not raise. - warnings.warn( + raise ValueError( f"Parsing '{res.tzname}' as tzlocal (dependent on system timezone) " - "is deprecated and will raise in a future version. Pass the 'tz' " + "is no longer supported. Pass the 'tz' " "keyword or call tz_localize after construction instead", - FutureWarning, - stacklevel=find_stack_level() ) - ret = ret.replace(tzinfo=_dateutil_tzlocal()) + ret = ret.replace(tzinfo=timezone.utc) elif res.tzoffset == 0: ret = ret.replace(tzinfo=_dateutil_tzutc()) elif res.tzoffset: @@ -733,15 +714,10 @@ cdef datetime dateutil_parse( ) elif res.tzname is not None: # e.g. "1994 Jan 15 05:16 FOO" where FOO is not recognized - # GH#18702 - warnings.warn( + # GH#18702, # GH 50235 enforced in 3.0 + raise ValueError( f'Parsed string "{timestr}" included an un-recognized timezone ' - f'"{res.tzname}". Dropping unrecognized timezones is deprecated; ' - "in a future version this will raise. Instead pass the string " - "without the timezone, then use .tz_localize to convert to a " - "recognized timezone.", - FutureWarning, - stacklevel=find_stack_level() + f'"{res.tzname}".' ) out_bestunit[0] = attrname_to_npy_unit[reso] @@ -884,6 +860,10 @@ def guess_datetime_format(dt_str: str, bint dayfirst=False) -> str | None: """ Guess the datetime format of a given datetime string. + This function attempts to deduce the format of a given datetime string. It is + useful for situations where the datetime format is unknown and needs to be + determined for proper parsing. The function is not guaranteed to return a format. + Parameters ---------- dt_str : str @@ -901,6 +881,12 @@ def guess_datetime_format(dt_str: str, bint dayfirst=False) -> str | None: datetime format string (for `strftime` or `strptime`), or None if it can't be guessed. + See Also + -------- + to_datetime : Convert argument to datetime. + Timestamp : Pandas replacement for python datetime.datetime object. + DatetimeIndex : Immutable ndarray-like of datetime64 data. 
+ Examples -------- >>> from pandas.tseries.api import guess_datetime_format @@ -920,8 +906,8 @@ def guess_datetime_format(dt_str: str, bint dayfirst=False) -> str | None: (("year", "month", "day", "hour"), "%Y%m%d%H", 0), (("year", "month", "day"), "%Y%m%d", 0), (("hour", "minute", "second"), "%H%M%S", 0), - (("hour", "minute"), "%H%M", 0), (("year",), "%Y", 0), + (("hour", "minute"), "%H%M", 0), (("month",), "%B", 0), (("month",), "%b", 0), (("month",), "%m", 2), @@ -941,8 +927,10 @@ def guess_datetime_format(dt_str: str, bint dayfirst=False) -> str | None: datetime_attrs_to_format.remove(day_attribute_and_format) datetime_attrs_to_format.insert(0, day_attribute_and_format) - # same default used by dateutil - default = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0) + # Use this instead of the dateutil default of + # `datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)` + # as that causes issues on the 29th of February. + default = datetime(1970, 1, 1) try: parsed_datetime = dateutil_parse( dt_str, @@ -1108,115 +1096,6 @@ cdef void _maybe_warn_about_dayfirst(format: str, bint dayfirst) noexcept: ) -@cython.wraparound(False) -@cython.boundscheck(False) -cdef object convert_to_unicode(object item, bint keep_trivial_numbers): - """ - Convert `item` to str. - - Parameters - ---------- - item : object - keep_trivial_numbers : bool - if True, then conversion (to string from integer/float zero) - is not performed - - Returns - ------- - str or int or float - """ - cdef: - float64_t float_item - - if keep_trivial_numbers: - if isinstance(item, int): - if item == 0: - return item - elif isinstance(item, float): - float_item = item - if float_item == 0.0 or float_item != float_item: - return item - - if not isinstance(item, str): - item = PyObject_Str(item) - - return item - - -@cython.wraparound(False) -@cython.boundscheck(False) -def concat_date_cols(tuple date_cols) -> np.ndarray: - """ - Concatenates elements from numpy arrays in `date_cols` into strings. 
- - Parameters - ---------- - date_cols : tuple[ndarray] - - Returns - ------- - arr_of_rows : ndarray[object] - - Examples - -------- - >>> dates=np.array(['3/31/2019', '4/31/2019'], dtype=object) - >>> times=np.array(['11:20', '10:45'], dtype=object) - >>> result = concat_date_cols((dates, times)) - >>> result - array(['3/31/2019 11:20', '4/31/2019 10:45'], dtype=object) - """ - cdef: - Py_ssize_t rows_count = 0, col_count = len(date_cols) - Py_ssize_t col_idx, row_idx - list list_to_join - cnp.ndarray[object] iters - object[::1] iters_view - flatiter it - cnp.ndarray[object] result - object[::1] result_view - - if col_count == 0: - return np.zeros(0, dtype=object) - - if not all(is_array(array) for array in date_cols): - raise ValueError("not all elements from date_cols are numpy arrays") - - rows_count = min(len(array) for array in date_cols) - result = np.zeros(rows_count, dtype=object) - result_view = result - - if col_count == 1: - array = date_cols[0] - it = PyArray_IterNew(array) - for row_idx in range(rows_count): - item = PyArray_GETITEM(array, PyArray_ITER_DATA(it)) - result_view[row_idx] = convert_to_unicode(item, True) - PyArray_ITER_NEXT(it) - else: - # create fixed size list - more efficient memory allocation - list_to_join = [None] * col_count - iters = np.zeros(col_count, dtype=object) - - # create memoryview of iters ndarray, that will contain some - # flatiter's for each array in `date_cols` - more efficient indexing - iters_view = iters - for col_idx, array in enumerate(date_cols): - iters_view[col_idx] = PyArray_IterNew(array) - - # array elements that are on the same line are converted to one string - for row_idx in range(rows_count): - for col_idx, array in enumerate(date_cols): - # this cast is needed, because we did not find a way - # to efficiently store `flatiter` type objects in ndarray - it = iters_view[col_idx] - item = PyArray_GETITEM(array, PyArray_ITER_DATA(it)) - list_to_join[col_idx] = convert_to_unicode(item, False) - PyArray_ITER_NEXT(it) - result_view[row_idx] = " ".join(list_to_join) - - return result - - cpdef str get_rule_month(str source): """ Return starting month of given freq, default is December. 
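With concat_date_cols removed above, the same result is available through public string operations; a minimal sketch (not the removed implementation; uses a valid April date, and the result's resolution may vary by pandas version):

>>> dates = pd.Series(['3/31/2019', '4/30/2019'])
>>> times = pd.Series(['11:20', '10:45'])
>>> pd.to_datetime(dates + ' ' + times, format='%m/%d/%Y %H:%M')  # doctest: +SKIP
0   2019-03-31 11:20:00
1   2019-04-30 10:45:00
dtype: datetime64[ns]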
diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 74804336f09a3..350216cf89ce4 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -38,10 +38,7 @@ from libc.time cimport ( tm, ) -from pandas._libs.tslibs.dtypes cimport ( - c_OFFSET_TO_PERIOD_FREQSTR, - freq_to_period_freqstr, -) +from pandas._libs.tslibs.dtypes cimport c_OFFSET_TO_PERIOD_FREQSTR from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime @@ -97,9 +94,6 @@ from pandas._libs.tslibs.dtypes cimport ( attrname_to_abbrevs, freq_group_code_to_npy_unit, ) - -from pandas._libs.tslibs.dtypes import freq_to_period_freqstr - from pandas._libs.tslibs.parsing cimport quarter_to_myear from pandas._libs.tslibs.parsing import parse_datetime_string_with_reso @@ -120,6 +114,7 @@ from pandas._libs.tslibs.offsets import ( INVALID_FREQ_ERR_MSG, BDay, ) +from pandas.util._decorators import set_module cdef: enum: @@ -684,7 +679,9 @@ cdef char* c_strftime(npy_datetimestruct *dts, char *fmt): c_date.tm_yday = get_day_of_year(dts.year, dts.month, dts.day) - 1 c_date.tm_isdst = -1 - result = malloc(result_len * sizeof(char)) + result = malloc(result_len) + if result is NULL: + raise MemoryError() strftime(result, result_len, fmt, &c_date) @@ -1554,7 +1551,7 @@ def extract_ordinals(ndarray values, freq) -> np.ndarray: # if we don't raise here, we'll segfault later! raise TypeError("extract_ordinals values must be object-dtype") - freqstr = freq_to_period_freqstr(freq.n, freq.name) + freqstr = PeriodDtypeBase(freq._period_dtype_code, freq.n)._freqstr for i in range(n): # Analogous to: p = values[i] @@ -1722,8 +1719,15 @@ cdef class PeriodMixin: condition = self.freq != other if condition: - freqstr = freq_to_period_freqstr(self.freq.n, self.freq.name) - other_freqstr = freq_to_period_freqstr(other.n, other.name) + freqstr = PeriodDtypeBase( + self.freq._period_dtype_code, self.freq.n + )._freqstr + if hasattr(other, "_period_dtype_code"): + other_freqstr = PeriodDtypeBase( + other._period_dtype_code, other.n + )._freqstr + else: + other_freqstr = other.freqstr msg = DIFFERENT_FREQ.format( cls=type(self).__name__, own_freq=freqstr, @@ -1748,9 +1752,6 @@ cdef class _Period(PeriodMixin): def __cinit__(self, int64_t ordinal, BaseOffset freq): self.ordinal = ordinal self.freq = freq - # Note: this is more performant than PeriodDtype.from_date_offset(freq) - # because from_date_offset cannot be made a cdef method (until cython - # supported cdef classmethods) self._dtype = PeriodDtypeBase(freq._period_dtype_code, freq.n) @classmethod @@ -1909,21 +1910,59 @@ cdef class _Period(PeriodMixin): Parameters ---------- - freq : str, BaseOffset - The desired frequency. If passing a `str`, it needs to be a - valid :ref:`period alias `. + freq : str, DateOffset + The target frequency to convert the Period object to. + If a string is provided, + it must be a valid :ref:`period alias `. + how : {'E', 'S', 'end', 'start'}, default 'end' - Start or end of the timespan. + Specifies whether to align the period to the start or end of the interval: + - 'E' or 'end': Align to the end of the interval. + - 'S' or 'start': Align to the start of the interval. Returns ------- - resampled : Period + Period : Period object with the specified frequency, aligned to the parameter. + + See Also + -------- + Period.end_time : Return the end Timestamp. + Period.start_time : Return the start Timestamp. + Period.dayofyear : Return the day of the year. + Period.dayofweek : Return the day of the week. 
Examples -------- - >>> period = pd.Period('2023-1-1', freq='D') + Convert a daily period to an hourly period, aligning to the end of the day: + + >>> period = pd.Period('2023-01-01', freq='D') >>> period.asfreq('h') Period('2023-01-01 23:00', 'h') + + Convert a monthly period to a daily period, aligning to the start of the month: + + >>> period = pd.Period('2023-01', freq='M') + >>> period.asfreq('D', how='start') + Period('2023-01-01', 'D') + + Convert a yearly period to a monthly period, aligning to the last month: + + >>> period = pd.Period('2023', freq='Y') + >>> period.asfreq('M', how='end') + Period('2023-12', 'M') + + Convert a monthly period to an hourly period, + aligning to the first day of the month: + + >>> period = pd.Period('2023-01', freq='M') + >>> period.asfreq('h', how='start') + Period('2023-01-01 00:00', 'h') + + Convert a weekly period to a daily period, aligning to the last day of the week: + + >>> period = pd.Period('2023-08-01', freq='W') + >>> period.asfreq('D', how='end') + Period('2023-08-06', 'D') """ freq = self._maybe_convert_freq(freq) how = validate_end_alias(how) @@ -1960,6 +1999,12 @@ cdef class _Period(PeriodMixin): ------- Timestamp + See Also + -------- + Timestamp : A class representing a single point in time. + Period : Represents a span of time with a fixed frequency. + PeriodIndex.to_timestamp : Convert a `PeriodIndex` to a `DatetimeIndex`. + Examples -------- >>> period = pd.Period('2023-1-1', freq='D') @@ -1997,11 +2042,44 @@ cdef class _Period(PeriodMixin): """ Return the year this Period falls on. + Returns + ------- + int + + See Also + -------- + Period.month : Get the month of the year for the given Period. + Period.day : Return the day of the month the Period falls on. + + Notes + ----- + The year is based on the `ordinal` and `base` attributes of the Period. + Examples -------- - >>> period = pd.Period('2022-01', 'M') + Create a Period object for January 2023 and get the year: + + >>> period = pd.Period('2023-01', 'M') + >>> period.year + 2023 + + Create a daily Period object for 1 January 2023 and get the year: + + >>> period = pd.Period('2023', 'D') + >>> period.year + 2023 + + Get the year for a period representing a quarter: + + >>> period = pd.Period('2023Q2', 'Q') + >>> period.year + 2023 + + Handle a missing Period (``NaT``), where the result is ``NaN``: + + >>> period = pd.Period('nan', 'M') >>> period.year - 2022 + nan """ base = self._dtype._dtype_code return pyear(self.ordinal, base) @@ -2011,11 +2089,45 @@ cdef class _Period(PeriodMixin): """ Return the month this Period falls on. + Returns + ------- + int + + See Also + -------- + Period.week : Get the week of the year for the given Period. + Period.year : Return the year this Period falls on. + Period.day : Return the day of the month this Period falls on. + + Notes + ----- + The month is based on the `ordinal` and `base` attributes of the Period.
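The See Also entries added for ``to_timestamp`` above pair naturally with a worked example (long-standing behavior, not introduced by this patch):

>>> pd.Period('2023-01', freq='M').to_timestamp(how='end')
Timestamp('2023-01-31 23:59:59.999999999')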
+ Examples -------- + Create a Period object for January 2022 and get the month: + >>> period = pd.Period('2022-01', 'M') >>> period.month 1 + + For a yearly Period, the reported month is the last month of the year: + + >>> period = pd.Period('2022', 'Y') + >>> period.month + 12 + + Create a monthly Period from an incomplete date string; the month defaults to January: + + >>> period = pd.Period('2022', 'M') + >>> period.month + 1 + + Handle a missing Period (``NaT``), where the result is ``NaN``: + + >>> period = pd.Period('nan', 'M') + >>> period.month + nan """ base = self._dtype._dtype_code return pmonth(self.ordinal, base) @@ -2025,6 +2137,12 @@ cdef class _Period(PeriodMixin): """ Get day of the month that a Period falls on. + The `day` property provides a simple way to access the day component + of a `Period` object, which represents time spans in various frequencies + (e.g., daily, hourly, monthly). If the period's frequency does not include + a day component (e.g., yearly or quarterly periods), the returned day + corresponds to the first day of that period. + Returns ------- int @@ -2326,6 +2444,12 @@ cdef class _Period(PeriodMixin): """ Return the quarter this Period falls on. + See Also + -------- + Timestamp.quarter : Return the quarter of the Timestamp. + Period.year : Return the year of the period. + Period.month : Return the month of the period. + Examples -------- >>> period = pd.Period('2022-04', 'M') @@ -2440,6 +2564,12 @@ cdef class _Period(PeriodMixin): """ Return True if the period's year is in a leap year. + See Also + -------- + Timestamp.is_leap_year : Check if the year in a Timestamp is a leap year. + DatetimeIndex.is_leap_year : Boolean indicator if the date belongs to a + leap year. + Examples -------- >>> period = pd.Period('2022-01', 'M') @@ -2457,11 +2587,24 @@ cdef class _Period(PeriodMixin): """ Return the period of now's date. + The `now` method provides a convenient way to generate a period + object for the current date and time. This can be particularly + useful in financial and economic analysis, where data is often + collected and analyzed in regular intervals (e.g., hourly, daily, + monthly). By specifying the frequency, users can create periods + that match the granularity of their data. + Parameters ---------- - freq : str, BaseOffset + freq : str, DateOffset Frequency to use for the returned period. + See Also + -------- + to_datetime : Convert argument to datetime. + Period : Represents a period of time. + Period.to_timestamp : Return the Timestamp representation of the Period. + Examples -------- >>> pd.Period.now('h') # doctest: +SKIP @@ -2474,12 +2617,23 @@ cdef class _Period(PeriodMixin): """ Return a string representation of the frequency. + This property provides the frequency string associated with the `Period` + object. The frequency string describes the granularity of the time span + represented by the `Period`. Common frequency strings include 'D' for + daily, 'M' for monthly, 'Y' for yearly, etc. + + See Also + -------- + Period.asfreq : Convert Period to desired frequency, at the start or end + of the interval. + period_range : Return a fixed frequency PeriodIndex. + Examples -------- >>> pd.Period('2020-01', 'D').freqstr 'D' """ - freqstr = freq_to_period_freqstr(self.freq.n, self.freq.name) + freqstr = PeriodDtypeBase(self.freq._period_dtype_code, self.freq.n)._freqstr return freqstr def __repr__(self) -> str: @@ -2611,6 +2765,27 @@ cdef class _Period(PeriodMixin): | ``%%`` | A literal ``'%'`` character.
| | +-----------+--------------------------------+-------+ + The `strftime` method provides a way to represent a :class:`Period` + object as a string in a specified format. This is particularly useful + when displaying date and time data in different locales or customized + formats, suitable for reports or user interfaces. It extends the standard + Python string formatting capabilities with additional directives specific + to `pandas`, accommodating features like fiscal years and precise + sub-second components. + + Parameters + ---------- + fmt : str or None + String containing the desired format directives. If ``None``, the + format is determined based on the Period's frequency. + + See Also + -------- + Timestamp.strftime : Return a formatted string of the Timestamp. + to_datetime : Convert argument to datetime. + time.strftime : Format a time object as a string according to a + specified format string in the standard Python library. + Notes ----- @@ -2659,10 +2834,16 @@ cdef class _Period(PeriodMixin): return period_format(self.ordinal, base, fmt) +@set_module("pandas") class Period(_Period): """ Represents a period of time. + A `Period` represents a specific time span rather than a point in time. + Unlike `Timestamp`, which represents a single instant, a `Period` defines a + duration, such as a month, quarter, or year. The exact representation is + determined by the `freq` parameter. + Parameters ---------- value : Period, str, datetime, date or pandas.Timestamp, default None @@ -2690,6 +2871,12 @@ class Period(_Period): second : int, default 0 Second value of the period. + See Also + -------- + Timestamp : Pandas replacement for python datetime.datetime object. + date_range : Return a fixed frequency DatetimeIndex. + timedelta_range : Generates a fixed frequency range of timedeltas. 
+ Examples -------- >>> period = pd.Period('2012-1-1', freq='D') diff --git a/pandas/_libs/tslibs/strptime.pxd b/pandas/_libs/tslibs/strptime.pxd index dd8936f080b31..d2eae910a87b5 100644 --- a/pandas/_libs/tslibs/strptime.pxd +++ b/pandas/_libs/tslibs/strptime.pxd @@ -18,9 +18,12 @@ cdef class DatetimeParseState: bint found_tz bint found_naive bint found_naive_str + bint found_aware_str bint found_other bint creso_ever_changed NPY_DATETIMEUNIT creso + set out_tzoffset_vals cdef tzinfo process_datetime(self, datetime dt, tzinfo tz, bint utc_convert) cdef bint update_creso(self, NPY_DATETIMEUNIT item_reso) noexcept + cdef tzinfo check_for_mixed_inputs(self, tzinfo tz_out, bint utc) diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index ee72b1311051e..b443aa7bede22 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -16,6 +16,7 @@ FUNCTIONS: strptime -- Calculates the time struct represented by the passed-in string """ from datetime import timezone +import zoneinfo from cpython.datetime cimport ( PyDate_Check, @@ -38,7 +39,6 @@ from _thread import allocate_lock as _thread_allocate_lock import re import numpy as np -import pytz cimport numpy as cnp from numpy cimport ( @@ -58,7 +58,6 @@ from pandas._libs.tslibs.dtypes cimport ( ) from pandas._libs.tslibs.nattype cimport ( NPY_NAT, - c_NaT as NaT, c_nat_strings as nat_strings, ) from pandas._libs.tslibs.np_datetime cimport ( @@ -132,7 +131,7 @@ cdef bint parse_today_now( if infer_reso: creso = NPY_DATETIMEUNIT.NPY_FR_us if utc: - ts = <_Timestamp>Timestamp.utcnow() + ts = <_Timestamp>Timestamp.now(timezone.utc) iresult[0] = ts._as_creso(creso)._value else: # GH#18705 make sure to_datetime("now") matches Timestamp("now") @@ -253,8 +252,11 @@ cdef class DatetimeParseState: # found_naive_str refers to a string that was parsed to a timezone-naive # datetime. self.found_naive_str = False + self.found_aware_str = False self.found_other = False + self.out_tzoffset_vals = set() + self.creso = creso self.creso_ever_changed = False @@ -293,6 +295,58 @@ cdef class DatetimeParseState: "tz-naive values") return tz + cdef tzinfo check_for_mixed_inputs( + self, + tzinfo tz_out, + bint utc, + ): + cdef: + bint is_same_offsets + float tz_offset + + if self.found_aware_str and not utc: + # GH#17697, GH#57275 + # 1) If all the offsets are equal, return one offset for + # the parsed dates to (maybe) pass to DatetimeIndex + # 2) If the offsets are different, then do not force the parsing + # and raise a ValueError: "cannot parse datetimes with + # mixed time zones unless `utc=True`" instead + is_same_offsets = len(self.out_tzoffset_vals) == 1 + if not is_same_offsets or (self.found_naive or self.found_other): + # e.g. test_to_datetime_mixed_awareness_mixed_types (array_to_datetime) + raise ValueError( + "Mixed timezones detected. Pass utc=True in to_datetime " + "or tz='UTC' in DatetimeIndex to convert to a common timezone." + ) + elif tz_out is not None: + # GH#55693 + tz_offset = self.out_tzoffset_vals.pop() + tz_out2 = timezone(timedelta(seconds=tz_offset)) + if not tz_compare(tz_out, tz_out2): + # e.g. (array_strptime) + # test_to_datetime_mixed_offsets_with_utc_false_removed + # e.g. test_to_datetime_mixed_tzs_mixed_types (array_to_datetime) + raise ValueError( + "Mixed timezones detected. Pass utc=True in to_datetime " + "or tz='UTC' in DatetimeIndex to convert to a common timezone." + ) + # e.g. (array_strptime) + # test_guess_datetime_format_with_parseable_formats + # e.g. 
test_to_datetime_mixed_types_matching_tzs (array_to_datetime) + else: + # e.g. test_to_datetime_iso8601_with_timezone_valid (array_strptime) + tz_offset = self.out_tzoffset_vals.pop() + tz_out = timezone(timedelta(seconds=tz_offset)) + elif not utc: + if tz_out and (self.found_other or self.found_naive_str): + # found_other indicates a tz-naive int, float, dt64, or date + # e.g. test_to_datetime_mixed_awareness_mixed_types (array_to_datetime) + raise ValueError( + "Mixed timezones detected. Pass utc=True in to_datetime " + "or tz='UTC' in DatetimeIndex to convert to a common timezone." + ) + return tz_out + def array_strptime( ndarray[object] values, @@ -300,7 +354,7 @@ def array_strptime( bint exact=True, errors="raise", bint utc=False, - NPY_DATETIMEUNIT creso=NPY_FR_ns, + NPY_DATETIMEUNIT creso=NPY_DATETIMEUNIT.NPY_FR_GENERIC, ): """ Calculates the datetime structs represented by the passed array of strings @@ -310,8 +364,8 @@ def array_strptime( values : ndarray of string-like objects fmt : string-like regex exact : matches must be exact if True, search if False - errors : string specifying error handling, {'raise', 'ignore', 'coerce'} - creso : NPY_DATETIMEUNIT, default NPY_FR_ns + errors : string specifying error handling, {'raise', 'coerce'} + creso : NPY_DATETIMEUNIT, default NPY_FR_GENERIC Set to NPY_FR_GENERIC to infer a resolution. """ @@ -320,12 +374,8 @@ def array_strptime( npy_datetimestruct dts int64_t[::1] iresult object val - bint seen_datetime_offset = False bint is_raise = errors=="raise" - bint is_ignore = errors=="ignore" bint is_coerce = errors=="coerce" - bint is_same_offsets - set out_tzoffset_vals = set() tzinfo tz, tz_out = None bint iso_format = format_is_iso(fmt) NPY_DATETIMEUNIT out_bestunit, item_reso @@ -334,7 +384,7 @@ def array_strptime( bint infer_reso = creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC DatetimeParseState state = DatetimeParseState(creso) - assert is_raise or is_ignore or is_coerce + assert is_raise or is_coerce _validate_fmt(fmt) format_regex, locale_time = _get_format_regex(fmt) @@ -394,6 +444,9 @@ def array_strptime( else: val = str(val) + out_local = 0 + out_tzoffset = 0 + if fmt == "ISO8601": string_to_dts_succeeded = not string_to_dts( val, &dts, &out_bestunit, &out_local, @@ -420,15 +473,15 @@ def array_strptime( ) from err if out_local == 1: nsecs = out_tzoffset * 60 - out_tzoffset_vals.add(nsecs) - seen_datetime_offset = True + state.out_tzoffset_vals.add(nsecs) + state.found_aware_str = True tz = timezone(timedelta(minutes=out_tzoffset)) value = tz_localize_to_utc_single( value, tz, ambiguous="raise", nonexistent=None, creso=creso ) else: tz = None - out_tzoffset_vals.add("naive") + state.out_tzoffset_vals.add("naive") state.found_naive_str = True iresult[i] = value continue @@ -477,16 +530,16 @@ def array_strptime( elif creso == NPY_DATETIMEUNIT.NPY_FR_ms: nsecs = nsecs // 10**3 - out_tzoffset_vals.add(nsecs) - seen_datetime_offset = True + state.out_tzoffset_vals.add(nsecs) + state.found_aware_str = True else: state.found_naive_str = True tz = None - out_tzoffset_vals.add("naive") + state.out_tzoffset_vals.add("naive") except ValueError as ex: ex.args = ( - f"{str(ex)}, at position {i}. You might want to try:\n" + f"{str(ex)}. 
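# User-facing effect of check_for_mixed_inputs above (illustrative; error
# message per this patch):
# >>> pd.to_datetime(['2020-01-01 00:00+01:00', '2020-01-01 00:00+02:00'])
# ValueError: Mixed timezones detected. Pass utc=True in to_datetime
# or tz='UTC' in DatetimeIndex to convert to a common timezone.
# Passing utc=True instead converts both strings to a single UTC index.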
You might want to try:\n" " - passing `format` if your strings have a consistent format;\n" " - passing `format='ISO8601'` if your strings are " "all ISO8601 but not necessarily in exactly the same format;\n" @@ -501,35 +554,7 @@ def array_strptime( raise return values, None - if seen_datetime_offset and not utc: - is_same_offsets = len(out_tzoffset_vals) == 1 - if not is_same_offsets or (state.found_naive or state.found_other): - result2 = _array_strptime_object_fallback( - values, fmt=fmt, exact=exact, errors=errors, utc=utc - ) - return result2, None - elif tz_out is not None: - # GH#55693 - tz_offset = out_tzoffset_vals.pop() - tz_out2 = timezone(timedelta(seconds=tz_offset)) - if not tz_compare(tz_out, tz_out2): - # e.g. test_to_datetime_mixed_offsets_with_utc_false_deprecated - result2 = _array_strptime_object_fallback( - values, fmt=fmt, exact=exact, errors=errors, utc=utc - ) - return result2, None - # e.g. test_guess_datetime_format_with_parseable_formats - else: - # e.g. test_to_datetime_iso8601_with_timezone_valid - tz_offset = out_tzoffset_vals.pop() - tz_out = timezone(timedelta(seconds=tz_offset)) - elif not utc: - if tz_out and (state.found_other or state.found_naive_str): - # found_other indicates a tz-naive int, float, dt64, or date - result2 = _array_strptime_object_fallback( - values, fmt=fmt, exact=exact, errors=errors, utc=utc - ) - return result2, None + tz_out = state.check_for_mixed_inputs(tz_out, utc) if infer_reso: if state.creso_ever_changed: @@ -690,7 +715,7 @@ cdef tzinfo _parse_with_format( elif len(s) <= 6: item_reso[0] = NPY_DATETIMEUNIT.NPY_FR_us else: - item_reso[0] = NPY_DATETIMEUNIT.NPY_FR_ns + item_reso[0] = NPY_FR_ns # Pad to always return nanoseconds s += "0" * (9 - len(s)) us = int(s) @@ -725,7 +750,7 @@ cdef tzinfo _parse_with_format( week_of_year_start = 0 elif parse_code == 17: # e.g. val='2011-12-30T00:00:00.000000UTC'; fmt='%Y-%m-%dT%H:%M:%S.%f%Z' - tz = pytz.timezone(found_dict["Z"]) + tz = zoneinfo.ZoneInfo(found_dict["Z"]) elif parse_code == 19: # e.g. 
val='March 1, 2018 12:00:00+0400'; fmt='%B %d, %Y %H:%M:%S%z' tz = parse_timezone_directive(found_dict["z"]) @@ -791,157 +816,6 @@ cdef tzinfo _parse_with_format( return tz -def _array_strptime_object_fallback( - ndarray[object] values, - str fmt, - bint exact=True, - errors="raise", - bint utc=False, -): - - cdef: - Py_ssize_t i, n = len(values) - npy_datetimestruct dts - int64_t iresult - object val - tzinfo tz - bint is_raise = errors=="raise" - bint is_ignore = errors=="ignore" - bint is_coerce = errors=="coerce" - bint iso_format = format_is_iso(fmt) - NPY_DATETIMEUNIT creso, out_bestunit, item_reso - int out_local = 0, out_tzoffset = 0 - bint string_to_dts_succeeded = 0 - - assert is_raise or is_ignore or is_coerce - - item_reso = NPY_DATETIMEUNIT.NPY_FR_GENERIC - format_regex, locale_time = _get_format_regex(fmt) - - result = np.empty(n, dtype=object) - - dts.us = dts.ps = dts.as = 0 - - for i in range(n): - val = values[i] - try: - if isinstance(val, str): - if len(val) == 0 or val in nat_strings: - result[i] = NaT - continue - elif checknull_with_nat_and_na(val): - result[i] = NaT - continue - elif PyDateTime_Check(val): - result[i] = Timestamp(val) - continue - elif PyDate_Check(val): - result[i] = Timestamp(val) - continue - elif cnp.is_datetime64_object(val): - result[i] = Timestamp(val) - continue - elif ( - (is_integer_object(val) or is_float_object(val)) - and (val != val or val == NPY_NAT) - ): - result[i] = NaT - continue - else: - val = str(val) - - if fmt == "ISO8601": - string_to_dts_succeeded = not string_to_dts( - val, &dts, &out_bestunit, &out_local, - &out_tzoffset, False, None, False - ) - elif iso_format: - string_to_dts_succeeded = not string_to_dts( - val, &dts, &out_bestunit, &out_local, - &out_tzoffset, False, fmt, exact - ) - if string_to_dts_succeeded: - # No error reported by string_to_dts, pick back up - # where we left off - creso = get_supported_reso(out_bestunit) - try: - value = npy_datetimestruct_to_datetime(creso, &dts) - except OverflowError as err: - raise OutOfBoundsDatetime( - f"Out of bounds nanosecond timestamp: {val}" - ) from err - if out_local == 1: - tz = timezone(timedelta(minutes=out_tzoffset)) - value = tz_localize_to_utc_single( - value, tz, ambiguous="raise", nonexistent=None, creso=creso - ) - else: - tz = None - ts = Timestamp._from_value_and_reso(value, creso, tz) - result[i] = ts - continue - - if parse_today_now(val, &iresult, utc, NPY_FR_ns): - result[i] = Timestamp(val) - continue - - # Some ISO formats can't be parsed by string_to_dts - # For example, 6-digit YYYYMD. So, if there's an error, and a format - # was specified, then try the string-matching code below. If the format - # specified was 'ISO8601', then we need to error, because - # only string_to_dts handles mixed ISO8601 formats. - if not string_to_dts_succeeded and fmt == "ISO8601": - raise ValueError(f"Time data {val} is not ISO8601 format") - - tz = _parse_with_format( - val, fmt, exact, format_regex, locale_time, &dts, &item_reso - ) - try: - iresult = npy_datetimestruct_to_datetime(item_reso, &dts) - except OverflowError as err: - raise OutOfBoundsDatetime( - f"Out of bounds nanosecond timestamp: {val}" - ) from err - if tz is not None: - iresult = tz_localize_to_utc_single( - iresult, tz, ambiguous="raise", nonexistent=None, creso=item_reso - ) - ts = Timestamp._from_value_and_reso(iresult, item_reso, tz) - result[i] = ts - - except (ValueError, OutOfBoundsDatetime) as ex: - ex.args = ( - f"{str(ex)}, at position {i}. 
You might want to try:\n" - " - passing `format` if your strings have a consistent format;\n" - " - passing `format='ISO8601'` if your strings are " - "all ISO8601 but not necessarily in exactly the same format;\n" - " - passing `format='mixed'`, and the format will be " - "inferred for each element individually. " - "You might want to use `dayfirst` alongside this.", - ) - if is_coerce: - result[i] = NaT - continue - elif is_raise: - raise - return values - - import warnings - - from pandas.util._exceptions import find_stack_level - warnings.warn( - "In a future version of pandas, parsing datetimes with mixed time " - "zones will raise an error unless `utc=True`. Please specify `utc=True` " - "to opt in to the new behaviour and silence this warning. " - "To create a `Series` with mixed offsets and `object` dtype, " - "please use `apply` and `datetime.datetime.strptime`", - FutureWarning, - stacklevel=find_stack_level(), - ) - - return result - - class TimeRE(_TimeRE): """ Handle conversion from format directives to regexes. @@ -966,7 +840,7 @@ class TimeRE(_TimeRE): if key == "Z": # lazy computation if self._Z is None: - self._Z = self.__seqToRE(pytz.all_timezones, "Z") + self._Z = self.__seqToRE(zoneinfo.available_timezones(), "Z") # Note: handling Z is the key difference vs using the stdlib # _strptime.TimeRE. test_to_datetime_parse_tzname_or_tzoffset with # fmt='%Y-%m-%d %H:%M:%S %Z' fails with the stdlib version. @@ -1053,6 +927,13 @@ cdef (int, int) _calc_julian_from_V(int iso_year, int iso_week, int iso_weekday) correction = date(iso_year, 1, 4).isoweekday() + 3 ordinal = (iso_week * 7) + iso_weekday - correction + + if iso_week == 53: + now = date.fromordinal(date(iso_year, 1, 1).toordinal() + ordinal - iso_weekday) + jan_4th = date(iso_year+1, 1, 4) + if (jan_4th - now).days < 7: + raise ValueError(f"Week 53 does not exist in ISO year {iso_year}.") + # ordinal may be negative or 0 now, which means the date is in the previous # calendar year if ordinal < 1: diff --git a/pandas/_libs/tslibs/timedeltas.pyi b/pandas/_libs/tslibs/timedeltas.pyi index 24ec6c8891a89..c885543b2fc6d 100644 --- a/pandas/_libs/tslibs/timedeltas.pyi +++ b/pandas/_libs/tslibs/timedeltas.pyi @@ -3,7 +3,6 @@ from typing import ( ClassVar, Literal, TypeAlias, - TypeVar, overload, ) @@ -39,8 +38,6 @@ UnitChoices: TypeAlias = Literal[ "minute", "min", "minutes", - "T", - "t", "s", "seconds", "sec", @@ -50,23 +47,18 @@ UnitChoices: TypeAlias = Literal[ "millisecond", "milli", "millis", - "L", - "l", "us", "microseconds", "microsecond", "µs", "micro", "micros", - "u", "ns", "nanoseconds", "nano", "nanos", "nanosecond", - "n", ] -_S = TypeVar("_S", bound=timedelta) def get_unit_for_round(freq, creso: int) -> int: ... def disallow_ambiguous_unit(unit: str | None) -> None: ... @@ -101,11 +93,11 @@ class Timedelta(timedelta): _value: int # np.int64 # error: "__new__" must return a class instance (got "Union[Timestamp, NaTType]") def __new__( # type: ignore[misc] - cls: type[_S], + cls: type[Self], value=..., unit: str | None = ..., **kwargs: float | np.integer | np.floating, - ) -> _S | NaTType: ... + ) -> Self | NaTType: ... @classmethod def _from_value_and_reso(cls, value: np.int64, reso: int) -> Timedelta: ... 
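# Illustrative sketch of the two behaviour changes above, assuming a pandas
# build that includes this patch (outputs shown in comments, not doctests).
import pandas as pd

# With _array_strptime_object_fallback removed, strings carrying mixed UTC
# offsets now raise under a fixed format unless utc=True is passed.
try:
    pd.to_datetime(["2020-01-01 00:00+01:00", "2020-01-01 00:00+02:00"],
                   format="%Y-%m-%d %H:%M%z")
except ValueError as err:
    print(err)  # Mixed timezones detected. Pass utc=True in to_datetime ...
pd.to_datetime(["2020-01-01 00:00+01:00", "2020-01-01 00:00+02:00"],
               format="%Y-%m-%d %H:%M%z", utc=True)  # converted to UTC: OK

# The new guard in _calc_julian_from_V rejects week 53 in 52-week ISO years.
pd.to_datetime("2020-W53-1", format="%G-W%V-%u")  # 2020 has 53 ISO weeks: OK
try:
    pd.to_datetime("2021-W53-1", format="%G-W%V-%u")
except ValueError as err:
    print(err)  # Week 53 does not exist in ISO year 2021.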
@property diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index f6c69cf6d3875..222a6070016e0 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1,6 +1,7 @@ import collections import warnings +from pandas.util._decorators import set_module from pandas.util._exceptions import find_stack_level cimport cython @@ -43,7 +44,7 @@ from pandas._libs.tslibs.conversion cimport ( precision_from_unit, ) from pandas._libs.tslibs.dtypes cimport ( - c_DEPR_ABBREVS, + c_DEPR_UNITS, get_supported_reso, is_supported_unit, npy_unit_to_abbrev, @@ -719,15 +720,15 @@ cpdef inline str parse_timedelta_unit(str unit): return "ns" elif unit == "M": return unit - elif unit in c_DEPR_ABBREVS: + elif unit in c_DEPR_UNITS: warnings.warn( f"\'{unit}\' is deprecated and will be removed in a " - f"future version. Please use \'{c_DEPR_ABBREVS.get(unit)}\' " + f"future version. Please use \'{c_DEPR_UNITS.get(unit)}\' " f"instead of \'{unit}\'.", FutureWarning, stacklevel=find_stack_level(), ) - unit = c_DEPR_ABBREVS[unit] + unit = c_DEPR_UNITS[unit] try: return timedelta_abbrevs[unit.lower()] except KeyError: @@ -997,8 +998,9 @@ class MinMaxReso: and Timedelta class. On an instance, these depend on the object's _reso. On the class, we default to the values we would get with nanosecond _reso. """ - def __init__(self, name): + def __init__(self, name, docstring): self._name = name + self.__doc__ = docstring def __get__(self, obj, type=None): if self._name == "min": @@ -1011,9 +1013,13 @@ class MinMaxReso: if obj is None: # i.e. this is on the class, default to nanos - return Timedelta(val) + result = Timedelta(val) else: - return Timedelta._from_value_and_reso(val, obj._creso) + result = Timedelta._from_value_and_reso(val, obj._creso) + + result.__doc__ = self.__doc__ + + return result def __set__(self, obj, value): raise AttributeError(f"{self._name} is not settable.") @@ -1032,12 +1038,97 @@ cdef class _Timedelta(timedelta): # higher than np.ndarray and np.matrix __array_priority__ = 100 - min = MinMaxReso("min") - max = MinMaxReso("max") - resolution = MinMaxReso("resolution") + + _docstring_min = """ + Returns the minimum bound possible for Timedelta. + + This property provides access to the smallest possible value that + can be represented by a Timedelta object. + + Returns + ------- + Timedelta + + See Also + -------- + Timedelta.max: Returns the maximum bound possible for Timedelta. + Timedelta.resolution: Returns the smallest possible difference between + non-equal Timedelta objects. + + Examples + -------- + >>> pd.Timedelta.min + -106752 days +00:12:43.145224193 + """ + + _docstring_max = """ + Returns the maximum bound possible for Timedelta. + + This property provides access to the largest possible value that + can be represented by a Timedelta object. + + Returns + ------- + Timedelta + + See Also + -------- + Timedelta.min: Returns the minimum bound possible for Timedelta. + Timedelta.resolution: Returns the smallest possible difference between + non-equal Timedelta objects. + + Examples + -------- + >>> pd.Timedelta.max + 106751 days 23:47:16.854775807 + """ + + _docstring_reso = """ + Returns the smallest possible difference between non-equal Timedelta objects. + + The resolution value is determined by the underlying representation of time + units and is equivalent to Timedelta(nanoseconds=1). + + Returns + ------- + Timedelta + + See Also + -------- + Timedelta.max: Returns the maximum bound possible for Timedelta. 
+ Timedelta.min: Returns the minimum bound possible for Timedelta. + + Examples + -------- + >>> pd.Timedelta.resolution + 0 days 00:00:00.000000001 + """ + + min = MinMaxReso("min", _docstring_min) + max = MinMaxReso("max", _docstring_max) + resolution = MinMaxReso("resolution", _docstring_reso) @property def value(self): + """ + Return the value of Timedelta object in nanoseconds. + + Return the total seconds, milliseconds and microseconds + of the timedelta as nanoseconds. + + Returns + ------- + int + + See Also + -------- + Timedelta.unit : Return the unit of Timedelta object. + + Examples + -------- + >>> pd.Timedelta(1, "us").value + 1000 + """ try: return convert_reso(self._value, self._creso, NPY_FR_ns, False) except OverflowError: @@ -1059,10 +1150,22 @@ cdef class _Timedelta(timedelta): """ Returns the days of the timedelta. + The `days` attribute of a `pandas.Timedelta` object provides the number + of days represented by the `Timedelta`. This is useful for extracting + the day component from a `Timedelta` that may also include hours, minutes, + seconds, and smaller time units. This attribute simplifies the process + of working with durations where only the day component is of interest. + Returns ------- int + See Also + -------- + Timedelta.seconds : Returns the seconds component of the timedelta. + Timedelta.microseconds : Returns the microseconds component of the timedelta. + Timedelta.total_seconds : Returns the total duration in seconds. + Examples -------- >>> td = pd.Timedelta(1, "d") @@ -1120,6 +1223,37 @@ cdef class _Timedelta(timedelta): def microseconds(self) -> int: # TODO(cython3): make cdef property # NB: using the python C-API PyDateTime_DELTA_GET_MICROSECONDS will fail # (or be incorrect) + """ + Return the number of microseconds (n), where 0 <= n < 1 millisecond. + + Timedelta.microseconds = milliseconds * 1000 + microseconds. + + Returns + ------- + int + Number of microseconds. + + See Also + -------- + Timedelta.components : Return all attributes with assigned values + (i.e. days, hours, minutes, seconds, milliseconds, microseconds, + nanoseconds). + + Examples + -------- + **Using string input** + + >>> td = pd.Timedelta('1 days 2 min 3 us') + + >>> td.microseconds + 3 + + **Using integer input** + + >>> td = pd.Timedelta(42, unit='us') + >>> td.microseconds + 42 + """ self._ensure_components() return self._ms * 1000 + self._us @@ -1127,6 +1261,16 @@ cdef class _Timedelta(timedelta): """ Total seconds in the duration. + This method calculates the total duration in seconds by combining + the days, seconds, and microseconds of the `Timedelta` object. + + See Also + -------- + to_timedelta : Convert argument to timedelta. + Timedelta : Represents a duration, the difference between two dates or times. + Timedelta.seconds : Returns the seconds component of the timedelta. + Timedelta.microseconds : Returns the microseconds component of the timedelta. + Examples -------- >>> td = pd.Timedelta('1min') @@ -1141,6 +1285,26 @@ cdef class _Timedelta(timedelta): @property def unit(self) -> str: + """ + Return the unit of Timedelta object. + + The unit of Timedelta object is nanosecond, i.e., 'ns' by default. + + Returns + ------- + str + + See Also + -------- + Timedelta.value : Return the value of Timedelta object in nanoseconds. + Timedelta.as_unit : Convert the underlying int64 representation to + the given unit. 
+ + Examples + -------- + >>> td = pd.Timedelta(42, unit='us') + >>> td.unit + 'ns' + """ + return npy_unit_to_abbrev(self._creso) def __hash__(_Timedelta self): @@ -1306,7 +1470,10 @@ cdef class _Timedelta(timedelta): datetime.timedelta(days=3) """ if self._creso == NPY_FR_ns: - return timedelta(microseconds=int(self._value) / 1000) + us, remainder = divmod(self._value, 1000) + if remainder >= 500: + us += 1 + return timedelta(microseconds=us) # TODO(@WillAyd): is this the right way to use components? self._ensure_components() @@ -1318,6 +1485,18 @@ cdef class _Timedelta(timedelta): """ Return a numpy.timedelta64 object with 'ns' precision. + Since NumPy uses ``timedelta64`` objects for its time operations, converting + a pandas ``Timedelta`` into a NumPy ``timedelta64`` provides seamless + integration between the two libraries, especially when working in environments + that heavily rely on NumPy for array-based calculations. + + See Also + -------- + to_timedelta : Convert argument to timedelta. + numpy.timedelta64 : A NumPy object for time duration. + Timedelta : Represents a duration, the difference between two dates + or times. + Examples -------- >>> td = pd.Timedelta('3D') @@ -1336,9 +1515,16 @@ cdef class _Timedelta(timedelta): """ Convert the Timedelta to a NumPy timedelta64. - This is an alias method for `Timedelta.to_timedelta64()`. The dtype and - copy parameters are available here only for compatibility. Their values - will not affect the return value. + This is an alias method for `Timedelta.to_timedelta64()`. + + Parameters + ---------- + dtype : NoneType + It is available here only for compatibility. Its value will not + affect the return value. + copy : bool, default False + It is available here only for compatibility. Its value will not + affect the return value. Returns ------- @@ -1366,11 +1552,27 @@ cdef class _Timedelta(timedelta): """ Array view compatibility. + This method allows you to reinterpret the underlying data of a Timedelta + object as a different dtype. The `view` method provides a way to reinterpret + the internal representation of the `Timedelta` object without modifying its + data. This is particularly useful when you need to work with the underlying + data directly, such as for performance optimizations or interfacing with + low-level APIs. The returned value is typically the number of nanoseconds + since the epoch, represented as an integer or another specified dtype. + Parameters ---------- dtype : str or dtype The dtype to view the underlying data as. + See Also + -------- + Timedelta.asm8 : Return a numpy timedelta64 array scalar view. + numpy.ndarray.view : Returns a view of an array with the same data. + Timedelta.to_numpy : Converts the Timedelta to a NumPy timedelta64. + Timedelta.total_seconds : Returns the total duration of the Timedelta + object in seconds. + Examples -------- >>> td = pd.Timedelta('3D') @@ -1386,6 +1588,17 @@ cdef class _Timedelta(timedelta): """ Return a components namedtuple-like. + Each component represents a different time unit, allowing you to access the + breakdown of the total duration in terms of days, hours, minutes, seconds, + milliseconds, microseconds, and nanoseconds. + + See Also + -------- + Timedelta.total_seconds : Returns the total duration of the Timedelta in + seconds. + to_timedelta : Convert argument to Timedelta. + Timedelta : Represents a duration, the difference between two dates or times.
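# Quick check of the divmod-based rounding introduced in to_pytimedelta above:
# sub-microsecond remainders now round half-up to the nearest microsecond
# (illustrative, assuming this patch):
import pandas as pd

print(pd.Timedelta(nanoseconds=1500).to_pytimedelta())  # 2 us: remainder 500 rounds up
print(pd.Timedelta(nanoseconds=1499).to_pytimedelta())  # 1 us: remainder 499 truncates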
+ Examples -------- >>> td = pd.Timedelta('2 day 4 min 3 us 42 ns') @@ -1413,6 +1626,12 @@ cdef class _Timedelta(timedelta): numpy timedelta64 array scalar view Array scalar view of the timedelta in nanoseconds. + See Also + -------- + Timedelta.total_seconds : Return the total seconds in the duration. + Timedelta.components : Return a namedtuple of the Timedelta's components. + Timedelta.to_timedelta64 : Convert the Timedelta to a numpy.timedelta64. + Examples -------- >>> td = pd.Timedelta('1 days 2 min 3 us 42 ns') @@ -1592,7 +1811,8 @@ cdef class _Timedelta(timedelta): Format the Timedelta as ISO 8601 Duration. ``P[n]Y[n]M[n]DT[n]H[n]M[n]S``, where the ``[n]`` s are replaced by the - values. See https://en.wikipedia.org/wiki/ISO_8601#Durations. + values. See Wikipedia: + `ISO 8601 § Durations `_. Returns ------- @@ -1658,6 +1878,12 @@ cdef class _Timedelta(timedelta): ------- Timedelta + See Also + -------- + Timedelta : Represents a duration, the difference between two dates or times. + to_timedelta : Convert argument to timedelta. + Timedelta.asm8 : Return a numpy timedelta64 array scalar view. + Examples -------- >>> td = pd.Timedelta('1001ms') @@ -1701,7 +1927,7 @@ cdef class _Timedelta(timedelta): # Python front end to C extension type _Timedelta # This serves as the box for timedelta64 - +@set_module("pandas") class Timedelta(_Timedelta): """ Represents a duration, the difference between two dates or times. @@ -1711,9 +1937,12 @@ class Timedelta(_Timedelta): Parameters ---------- - value : Timedelta, timedelta, np.timedelta64, str, or int + value : Timedelta, timedelta, np.timedelta64, str, int or float + Input value. unit : str, default 'ns' - Denote the unit of the input, if input is an integer. + If input is an integer, denote the unit of the input. + If input is a float, denote the unit of the integer parts. + The decimal parts with resolution lower than 1 nanosecond are ignored. Possible values: @@ -1726,10 +1955,10 @@ class Timedelta(_Timedelta): * 'microseconds', 'microsecond', 'micros', 'micro', or 'us' * 'nanoseconds', 'nanosecond', 'nanos', 'nano', or 'ns'. - .. deprecated:: 2.2.0 + .. deprecated:: 3.0.0 - Values `H`, `T`, `S`, `L`, `U`, and `N` are deprecated in favour - of the values `h`, `min`, `s`, `ms`, `us`, and `ns`. + Allowing the values `w`, `d`, `MIN`, `MS`, `US` and `NS` to denote units + are deprecated in favour of the values `W`, `D`, `min`, `ms`, `us` and `ns`. **kwargs Available kwargs: {days, seconds, microseconds, @@ -1737,6 +1966,15 @@ class Timedelta(_Timedelta): Values for construction in compat with datetime.timedelta. Numpy ints and floats will be coerced to python ints and floats. + See Also + -------- + Timestamp : Represents a single timestamp in time. + TimedeltaIndex : Immutable Index of timedelta64 data. + DateOffset : Standard kind of date increment used for a date range. + to_timedelta : Convert argument to timedelta. + datetime.timedelta : Represents a duration in the datetime module. + numpy.timedelta64 : Represents a duration compatible with NumPy. + Notes ----- The constructor may take in either both values of value and unit or @@ -1751,7 +1989,7 @@ class Timedelta(_Timedelta): -------- Here we initialize Timedelta object with both value and unit - >>> td = pd.Timedelta(1, "d") + >>> td = pd.Timedelta(1, "D") >>> td Timedelta('1 days 00:00:00') @@ -1960,6 +2198,12 @@ class Timedelta(_Timedelta): ------ ValueError if the freq cannot be converted + See Also + -------- + Timedelta.floor : Floor the Timedelta to the specified resolution. 
+ Timedelta.round : Round the Timedelta to the nearest specified resolution. + Timestamp.ceil : Similar method for Timestamp objects. + Examples -------- >>> td = pd.Timedelta('1001ms') @@ -1980,6 +2224,16 @@ class Timedelta(_Timedelta): Frequency string indicating the flooring resolution. It uses the same units as class constructor :class:`~pandas.Timedelta`. + Returns + ------- + Timedelta + A new Timedelta object floored to the specified resolution. + + See Also + -------- + Timestamp.ceil : Round the Timestamp up to the nearest specified resolution. + Timestamp.round : Round the Timestamp to the nearest specified resolution. + Examples -------- >>> td = pd.Timedelta('1001ms') @@ -1997,8 +2251,20 @@ class Timedelta(_Timedelta): Parameters ---------- freq : str - Frequency string indicating the ceiling resolution. - It uses the same units as class constructor :class:`~pandas.Timedelta`. + Frequency string indicating the ceiling resolution. Must be a fixed + frequency like 's' (second) not 'ME' (month end). See + :ref:`frequency aliases ` for + a list of possible `freq` values. + + Returns + ------- + Timedelta + A new Timedelta object ceiled to the specified resolution. + + See Also + -------- + Timedelta.floor : Floor the Timedelta to the specified resolution. + Timedelta.round : Round the Timedelta to the nearest specified resolution. Examples -------- diff --git a/pandas/_libs/tslibs/timestamps.pxd b/pandas/_libs/tslibs/timestamps.pxd index bd73c713f6c04..c5ec92fabc7f8 100644 --- a/pandas/_libs/tslibs/timestamps.pxd +++ b/pandas/_libs/tslibs/timestamps.pxd @@ -21,7 +21,7 @@ cdef _Timestamp create_timestamp_from_ts(int64_t value, cdef class _Timestamp(ABCTimestamp): cdef readonly: - int64_t _value, nanosecond, year + int64_t _value, _nanosecond, _year NPY_DATETIMEUNIT _creso cdef bint _get_start_end_field(self, str field, freq) diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 568539b53aee0..390267db8267f 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -50,6 +50,7 @@ import datetime as dt from pandas._libs.tslibs cimport ccalendar from pandas._libs.tslibs.base cimport ABCTimestamp +from pandas.util._decorators import set_module from pandas.util._exceptions import find_stack_level from pandas._libs.tslibs.conversion cimport ( @@ -161,8 +162,8 @@ cdef _Timestamp create_timestamp_from_ts( dts.sec, dts.us, tz, fold=fold) ts_base._value = value - ts_base.year = dts.year - ts_base.nanosecond = dts.ps // 1000 + ts_base._year = dts.year + ts_base._nanosecond = dts.ps // 1000 ts_base._creso = reso return ts_base @@ -199,8 +200,9 @@ class MinMaxReso: See also: timedeltas.MinMaxReso """ - def __init__(self, name): + def __init__(self, name, docstring): self._name = name + self.__doc__ = docstring def __get__(self, obj, type=None): cls = Timestamp @@ -215,11 +217,15 @@ class MinMaxReso: if obj is None: # i.e. 
this is on the class, default to nanos - return cls(val) + result = cls(val) elif self._name == "resolution": - return Timedelta._from_value_and_reso(val, obj._creso) + result = Timedelta._from_value_and_reso(val, obj._creso) else: - return Timestamp._from_value_and_reso(val, obj._creso, tz=None) + result = Timestamp._from_value_and_reso(val, obj._creso, tz=None) + + result.__doc__ = self.__doc__ + + return result def __set__(self, obj, value): raise AttributeError(f"{self._name} is not settable.") @@ -234,12 +240,98 @@ cdef class _Timestamp(ABCTimestamp): dayofweek = _Timestamp.day_of_week dayofyear = _Timestamp.day_of_year - min = MinMaxReso("min") - max = MinMaxReso("max") - resolution = MinMaxReso("resolution") # GH#21336, GH#21365 + _docstring_min = """ + Returns the minimum bound possible for Timestamp. + + This property provides access to the smallest possible value that + can be represented by a Timestamp object. + + Returns + ------- + Timestamp + + See Also + -------- + Timestamp.max: Returns the maximum bound possible for Timestamp. + Timestamp.resolution: Returns the smallest possible difference between + non-equal Timestamp objects. + + Examples + -------- + >>> pd.Timestamp.min + Timestamp('1677-09-21 00:12:43.145224193') + """ + + _docstring_max = """ + Returns the maximum bound possible for Timestamp. + + This property provides access to the largest possible value that + can be represented by a Timestamp object. + + Returns + ------- + Timestamp + + See Also + -------- + Timestamp.min: Returns the minimum bound possible for Timestamp. + Timestamp.resolution: Returns the smallest possible difference between + non-equal Timestamp objects. + + Examples + -------- + >>> pd.Timestamp.max + Timestamp('2262-04-11 23:47:16.854775807') + """ + + _docstring_reso = """ + Returns the smallest possible difference between non-equal Timestamp objects. + + The resolution value is determined by the underlying representation of time + units and is equivalent to Timedelta(nanoseconds=1). + + Returns + ------- + Timedelta + + See Also + -------- + Timestamp.max: Returns the maximum bound possible for Timestamp. + Timestamp.min: Returns the minimum bound possible for Timestamp. + + Examples + -------- + >>> pd.Timestamp.resolution + Timedelta('0 days 00:00:00.000000001') + """ + + min = MinMaxReso("min", _docstring_min) + max = MinMaxReso("max", _docstring_max) + resolution = MinMaxReso("resolution", _docstring_reso) # GH#21336, GH#21365 @property def value(self) -> int: + """ + Return the value of the Timestamp. + + Returns + ------- + int + The integer representation of the Timestamp object in nanoseconds + since the Unix epoch (1970-01-01 00:00:00 UTC). + + See Also + -------- + Timestamp.second : Return the second of the Timestamp. + Timestamp.minute : Return the minute of the Timestamp. + + Examples + -------- + >>> ts = pd.Timestamp("2024-08-31 16:16:30") + >>> ts.value + 1725120990000000000 + """ + try: return convert_reso(self._value, self._creso, NPY_FR_ns, False) except OverflowError: @@ -254,6 +346,28 @@ cdef class _Timestamp(ABCTimestamp): """ The abbreviation associated with self._creso. + This property returns a string representing the time unit of the Timestamp's + resolution. It corresponds to the smallest time unit that can be represented + by this Timestamp object. 
The possible values are: + - 's' (second) + - 'ms' (millisecond) + - 'us' (microsecond) + - 'ns' (nanosecond) + + Returns + ------- + str + A string abbreviation of the Timestamp's resolution unit: + - 's' for second + - 'ms' for millisecond + - 'us' for microsecond + - 'ns' for nanosecond + + See Also + -------- + Timestamp.resolution : Return resolution of the Timestamp. + Timedelta : A duration expressing the difference between two dates or times. + Examples -------- >>> pd.Timestamp("2020-01-01 12:34:56").unit @@ -299,7 +413,7 @@ cdef class _Timestamp(ABCTimestamp): def _from_dt64(cls, dt64: np.datetime64): # construct a Timestamp from a np.datetime64 object, keeping the # resolution of the input. - # This is herely mainly so we can incrementally implement non-nano + # This is here mainly so we can incrementally implement non-nano # (e.g. only tznaive at first) cdef: int64_t value @@ -312,9 +426,9 @@ cdef class _Timestamp(ABCTimestamp): # ----------------------------------------------------------------- def __hash__(_Timestamp self): - if self.nanosecond: + if self._nanosecond: return hash(self._value) - if not (1 <= self.year <= 9999): + if not (1 <= self._year <= 9999): # out of bounds for pydatetime return hash(self._value) if self.fold: @@ -332,7 +446,7 @@ cdef class _Timestamp(ABCTimestamp): elif cnp.is_datetime64_object(other): ots = Timestamp(other) elif PyDateTime_Check(other): - if self.nanosecond == 0: + if self._nanosecond == 0: val = self.to_pydatetime() return PyObject_RichCompareBool(val, other, op) @@ -411,7 +525,7 @@ cdef class _Timestamp(ABCTimestamp): if not self._can_compare(other): return NotImplemented - if self.nanosecond == 0: + if self._nanosecond == 0: return PyObject_RichCompareBool(dtval, other, op) # otherwise we have dtval < self @@ -420,9 +534,9 @@ cdef class _Timestamp(ABCTimestamp): if op == Py_EQ: return False if op == Py_LE or op == Py_LT: - return self.year <= other.year + return self._year <= other.year if op == Py_GE or op == Py_GT: - return self.year >= other.year + return self._year >= other.year cdef bint _can_compare(self, datetime other): if self.tzinfo is not None: @@ -563,7 +677,7 @@ cdef class _Timestamp(ABCTimestamp): if own_tz is not None and not is_utc(own_tz): pydatetime_to_dtstruct(self, &dts) - val = npy_datetimestruct_to_datetime(self._creso, &dts) + self.nanosecond + val = npy_datetimestruct_to_datetime(self._creso, &dts) + self._nanosecond else: val = self._value return val @@ -579,15 +693,15 @@ cdef class _Timestamp(ABCTimestamp): if freq: kwds = freq.kwds month_kw = kwds.get("startingMonth", kwds.get("month", 12)) - freqstr = freq.freqstr + freq_name = freq.name else: month_kw = 12 - freqstr = None + freq_name = None val = self._maybe_convert_value_to_local() out = get_start_end_field(np.array([val], dtype=np.int64), - field, freqstr, month_kw, self._creso) + field, freq_name, month_kw, self._creso) return out[0] @property @@ -771,6 +885,11 @@ cdef class _Timestamp(ABCTimestamp): ------- str + See Also + -------- + Timestamp.day_of_week : Return day of the week. + Timestamp.day_of_year : Return day of the year. + Examples -------- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') @@ -788,6 +907,11 @@ cdef class _Timestamp(ABCTimestamp): """ Return the month name of the Timestamp with specified locale. + This method returns the full name of the month corresponding to the + `Timestamp`, such as 'January', 'February', etc. 
The month name can + be returned in a specified locale if provided; otherwise, it defaults + to the English locale. + Parameters ---------- locale : str, default None (English locale) @@ -796,9 +920,18 @@ cdef class _Timestamp(ABCTimestamp): Returns ------- str + The full month name as a string. + + See Also + -------- + Timestamp.day_name : Returns the name of the day of the week. + Timestamp.strftime : Returns a formatted string of the Timestamp. + datetime.datetime.strftime : Returns a string representing the date and time. Examples -------- + Get the month name in English (default): + >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') >>> ts.month_name() 'March' @@ -815,9 +948,20 @@ cdef class _Timestamp(ABCTimestamp): """ Return True if year is a leap year. + A leap year is a year, which has 366 days (instead of 365) including 29th of + February as an intercalary day. Leap years are years which are multiples of + four with the exception of years divisible by 100 but not by 400. + Returns ------- bool + True if year is a leap year, else False + + See Also + -------- + Period.is_leap_year : Return True if the period’s year is in a leap year. + DatetimeIndex.is_leap_year : Boolean indicator if the date belongs to a + leap year. Examples -------- @@ -825,7 +969,7 @@ cdef class _Timestamp(ABCTimestamp): >>> ts.is_leap_year True """ - return bool(ccalendar.is_leapyear(self.year)) + return bool(ccalendar.is_leapyear(self._year)) @property def day_of_week(self) -> int: @@ -836,6 +980,12 @@ cdef class _Timestamp(ABCTimestamp): ------- int + See Also + -------- + Timestamp.isoweekday : Return the ISO day of the week represented by the date. + Timestamp.weekday : Return the day of the week represented by the date. + Timestamp.day_of_year : Return day of the year. + Examples -------- >>> ts = pd.Timestamp(2020, 3, 14) @@ -853,31 +1003,264 @@ cdef class _Timestamp(ABCTimestamp): ------- int + See Also + -------- + Timestamp.day_of_week : Return day of the week. + Examples -------- >>> ts = pd.Timestamp(2020, 3, 14) >>> ts.day_of_year 74 """ - return ccalendar.get_day_of_year(self.year, self.month, self.day) + return ccalendar.get_day_of_year(self._year, self.month, self.day) @property def quarter(self) -> int: """ - Return the quarter of the year. + Return the quarter of the year for the `Timestamp`. + + This property returns an integer representing the quarter of the year in + which the `Timestamp` falls. The quarters are defined as follows: + - Q1: January 1 to March 31 + - Q2: April 1 to June 30 + - Q3: July 1 to September 30 + - Q4: October 1 to December 31 Returns ------- int + The quarter of the year (1 through 4). + + See Also + -------- + Timestamp.month : Returns the month of the `Timestamp`. + Timestamp.year : Returns the year of the `Timestamp`. Examples -------- + Get the quarter for a `Timestamp`: + >>> ts = pd.Timestamp(2020, 3, 14) >>> ts.quarter 1 + + For a `Timestamp` in the fourth quarter: + + >>> ts = pd.Timestamp(2020, 10, 14) + >>> ts.quarter + 4 """ return ((self.month - 1) // 3) + 1 + @property + def day(self) -> int: + """ + Return the day of the Timestamp. + + Returns + ------- + int + The day of the Timestamp. + + See Also + -------- + Timestamp.week : Return the week number of the year. + Timestamp.weekday : Return the day of the week. + + Examples + -------- + >>> ts = pd.Timestamp("2024-08-31 16:16:30") + >>> ts.day + 31 + """ + return super().day + + @property + def fold(self) -> int: + """ + Return the fold value of the Timestamp. 
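# The accessors documented above, exercised together (illustrative output):
import pandas as pd

ts = pd.Timestamp("2020-02-29 15:30")
print(ts.month_name())  # 'February'
print(ts.is_leap_year)  # True: 2020 is a multiple of 4 and not a century year
print(ts.day_of_week)   # 5 (Saturday; Monday == 0)
print(ts.quarter)       # 1 (January 1 to March 31)
print(ts.day, ts.fold)  # 29 0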
+ + Returns + ------- + int + The fold value of the Timestamp, where 0 indicates the first occurrence + of the ambiguous time, and 1 indicates the second. + + See Also + -------- + Timestamp.dst : Return the daylight saving time (DST) adjustment. + Timestamp.tzinfo : Return the timezone information associated. + + Examples + -------- + >>> ts = pd.Timestamp("2024-11-03 01:30:00") + >>> ts.fold + 0 + """ + return super().fold + + @property + def year(self) -> int: + """ + Return the year of the Timestamp. + + Returns + ------- + int + The year of the Timestamp. + + See Also + -------- + Timestamp.month : Return the month of the Timestamp. + Timestamp.day : Return the day of the Timestamp. + + Examples + -------- + >>> ts = pd.Timestamp("2024-08-31 16:16:30") + >>> ts.year + 2024 + """ + return self._year + + @property + def month(self) -> int: + """ + Return the month of the Timestamp. + + Returns + ------- + int + The month of the Timestamp. + + See Also + -------- + Timestamp.day : Return the day of the Timestamp. + Timestamp.year : Return the year of the Timestamp. + + Examples + -------- + >>> ts = pd.Timestamp("2024-08-31 16:16:30") + >>> ts.month + 8 + """ + return super().month + + @property + def hour(self) -> int: + """ + Return the hour of the Timestamp. + + Returns + ------- + int + The hour of the Timestamp. + + See Also + -------- + Timestamp.minute : Return the minute of the Timestamp. + Timestamp.second : Return the second of the Timestamp. + + Examples + -------- + >>> ts = pd.Timestamp("2024-08-31 16:16:30") + >>> ts.hour + 16 + """ + return super().hour + + @property + def minute(self) -> int: + """ + Return the minute of the Timestamp. + + Returns + ------- + int + The minute of the Timestamp. + + See Also + -------- + Timestamp.hour : Return the hour of the Timestamp. + Timestamp.second : Return the second of the Timestamp. + + Examples + -------- + >>> ts = pd.Timestamp("2024-08-31 16:16:30") + >>> ts.minute + 16 + """ + return super().minute + + @property + def second(self) -> int: + """ + Return the second of the Timestamp. + + Returns + ------- + int + The second of the Timestamp. + + See Also + -------- + Timestamp.microsecond : Return the microsecond of the Timestamp. + Timestamp.minute : Return the minute of the Timestamp. + + Examples + -------- + >>> ts = pd.Timestamp("2024-08-31 16:16:30") + >>> ts.second + 30 + """ + return super().second + + @property + def microsecond(self) -> int: + """ + Return the microsecond of the Timestamp. + + Returns + ------- + int + The microsecond of the Timestamp. + + See Also + -------- + Timestamp.second : Return the second of the Timestamp. + Timestamp.minute : Return the minute of the Timestamp. + + Examples + -------- + >>> ts = pd.Timestamp("2024-08-31 16:16:30.2304") + >>> ts.microsecond + 230400 + """ + return super().microsecond + + @property + def nanosecond(self) -> int: + """ + Return the nanosecond of the Timestamp. + + Returns + ------- + int + The nanosecond of the Timestamp. + + See Also + -------- + Timestamp.second : Return the second of the Timestamp. + Timestamp.microsecond : Return the microsecond of the Timestamp. + + Examples + -------- + >>> ts = pd.Timestamp("2024-08-31 16:16:30.230400015") + >>> ts.nanosecond + 15 + """ + return self._nanosecond + @property def week(self) -> int: """ @@ -887,13 +1270,18 @@ cdef class _Timestamp(ABCTimestamp): ------- int + See Also + -------- + Timestamp.weekday : Return the day of the week. + Timestamp.quarter : Return the quarter of the year. 
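# The properties above keep their public names while reading the renamed cdef
# fields (_year, _nanosecond); a quick round-trip (illustrative):
import pandas as pd

ts = pd.Timestamp("2024-08-31 16:16:30.230400015")
print(ts.year, ts.month, ts.day)      # 2024 8 31
print(ts.hour, ts.minute, ts.second)  # 16 16 30
print(ts.microsecond, ts.nanosecond)  # 230400 15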
+ Examples -------- >>> ts = pd.Timestamp(2020, 3, 14) >>> ts.week 11 """ - return ccalendar.get_week_of_year(self.year, self.month, self.day) + return ccalendar.get_week_of_year(self._year, self.month, self.day) @property def days_in_month(self) -> int: @@ -904,13 +1292,18 @@ cdef class _Timestamp(ABCTimestamp): ------- int + See Also + -------- + Timestamp.month_name : Return the month name of the Timestamp with + specified locale. + Examples -------- >>> ts = pd.Timestamp(2020, 3, 14) >>> ts.days_in_month 31 """ - return ccalendar.get_days_in_month(self.year, self.month) + return ccalendar.get_days_in_month(self._year, self.month) # ----------------------------------------------------------------- # Transformation Methods @@ -919,6 +1312,21 @@ cdef class _Timestamp(ABCTimestamp): """ Normalize Timestamp to midnight, preserving tz information. + This method sets the time component of the `Timestamp` to midnight (00:00:00), + while preserving the date and time zone information. It is useful when you + need to standardize the time across different `Timestamp` objects without + altering the time zone or the date. + + Returns + ------- + Timestamp + + See Also + -------- + Timestamp.floor : Rounds `Timestamp` down to the nearest frequency. + Timestamp.ceil : Rounds `Timestamp` up to the nearest frequency. + Timestamp.round : Rounds `Timestamp` to the nearest frequency. + Examples -------- >>> ts = pd.Timestamp(2020, 3, 14, 15, 30) @@ -969,9 +1377,9 @@ cdef class _Timestamp(ABCTimestamp): The full format looks like 'YYYY-MM-DD HH:MM:SS.mmmmmmnnn'. By default, the fractional part is omitted if self.microsecond == 0 - and self.nanosecond == 0. + and self._nanosecond == 0. - If self.tzinfo is not None, the UTC offset is also attached, giving + If self.tzinfo is not None, the UTC offset is also attached, giving a full format of 'YYYY-MM-DD HH:MM:SS.mmmmmmnnn+HH:MM'. Parameters @@ -988,6 +1396,12 @@ cdef class _Timestamp(ABCTimestamp): ------- str + See Also + -------- + Timestamp.strftime : Return a formatted string. + Timestamp.isocalendar : Return a tuple containing ISO year, week number and + weekday. 
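# normalize() as documented above zeroes the time while preserving the date
# and timezone (illustrative; Europe/Brussels is +01:00 on this date):
import pandas as pd

ts = pd.Timestamp("2020-03-14 15:30:00", tz="Europe/Brussels")
print(ts.normalize())    # 2020-03-14 00:00:00+01:00
print(ts.week)           # 11
print(ts.days_in_month)  # 31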
+ Examples -------- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') @@ -999,9 +1413,9 @@ cdef class _Timestamp(ABCTimestamp): base_ts = "microseconds" if timespec == "nanoseconds" else timespec base = super(_Timestamp, self).isoformat(sep=sep, timespec=base_ts) # We need to replace the fake year 1970 with our real year - base = f"{self.year:04d}-" + base.split("-", 1)[1] + base = f"{self._year:04d}-" + base.split("-", 1)[1] - if self.nanosecond == 0 and timespec != "nanoseconds": + if self._nanosecond == 0 and timespec != "nanoseconds": return base if self.tzinfo is not None: @@ -1009,11 +1423,11 @@ cdef class _Timestamp(ABCTimestamp): else: base1, base2 = base, "" - if timespec == "nanoseconds" or (timespec == "auto" and self.nanosecond): + if timespec == "nanoseconds" or (timespec == "auto" and self._nanosecond): if self.microsecond or timespec == "nanoseconds": - base1 += f"{self.nanosecond:03d}" + base1 += f"{self._nanosecond:03d}" else: - base1 += f".{self.nanosecond:09d}" + base1 += f".{self._nanosecond:09d}" return base1 + base2 @@ -1047,14 +1461,14 @@ cdef class _Timestamp(ABCTimestamp): def _date_repr(self) -> str: # Ideal here would be self.strftime("%Y-%m-%d"), but # the datetime strftime() methods require year >= 1900 and is slower - return f"{self.year}-{self.month:02d}-{self.day:02d}" + return f"{self._year}-{self.month:02d}-{self.day:02d}" @property def _time_repr(self) -> str: result = f"{self.hour:02d}:{self.minute:02d}:{self.second:02d}" - if self.nanosecond != 0: - result += f".{self.nanosecond + 1000 * self.microsecond:09d}" + if self._nanosecond != 0: + result += f".{self._nanosecond + 1000 * self.microsecond:09d}" elif self.microsecond != 0: result += f".{self.microsecond:06d}" @@ -1083,7 +1497,7 @@ cdef class _Timestamp(ABCTimestamp): def as_unit(self, str unit, bint round_ok=True): """ - Convert the underlying int64 representaton to the given unit. + Convert the underlying int64 representation to the given unit. Parameters ---------- @@ -1095,6 +1509,14 @@ cdef class _Timestamp(ABCTimestamp): ------- Timestamp + See Also + -------- + Timestamp.asm8 : Return numpy datetime64 format with same precision. + Timestamp.to_pydatetime : Convert Timestamp object to a native + Python datetime object. + to_timedelta : Convert argument into timedelta object, + which can represent differences in times. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 00:00:00.01') @@ -1120,7 +1542,13 @@ cdef class _Timestamp(ABCTimestamp): @property def asm8(self) -> np.datetime64: """ - Return numpy datetime64 format in nanoseconds. + Return numpy datetime64 format with same precision. + + See Also + -------- + numpy.datetime64 : Numpy datatype for dates and times with high precision. + Timestamp.to_numpy : Convert the Timestamp to a NumPy datetime64. + to_datetime : Convert argument to datetime. Examples -------- @@ -1134,6 +1562,23 @@ cdef class _Timestamp(ABCTimestamp): """ Return POSIX timestamp as float. + This method converts the `Timestamp` object to a POSIX timestamp, which is + the number of seconds since the Unix epoch (January 1, 1970). The returned + value is a floating-point number, where the integer part represents the + seconds, and the fractional part represents the microseconds. + + Returns + ------- + float + The POSIX timestamp representation of the `Timestamp` object. + + See Also + -------- + Timestamp.fromtimestamp : Construct a `Timestamp` from a POSIX timestamp. + datetime.datetime.timestamp : Equivalent method from the `datetime` module. 
+ Timestamp.to_pydatetime : Convert the `Timestamp` to a `datetime` object. + Timestamp.to_datetime64 : Converts `Timestamp` to `numpy.datetime64`. + Examples -------- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548') @@ -1151,7 +1596,30 @@ cdef class _Timestamp(ABCTimestamp): """ Convert a Timestamp object to a native Python datetime object. - If warn=True, issue a warning if nanoseconds is nonzero. + This method is useful for when you need to utilize a pandas Timestamp + object in contexts where native Python datetime objects are expected + or required. The conversion discards the nanoseconds component, and a + warning can be issued in such cases if desired. + + Parameters + ---------- + warn : bool, default True + If True, issues a warning when the timestamp includes nonzero + nanoseconds, as these will be discarded during the conversion. + + Returns + ------- + datetime.datetime or NaT + Returns a datetime.datetime object representing the timestamp, + with year, month, day, hour, minute, second, and microsecond components. + If the timestamp is NaT (Not a Time), returns NaT. + + See Also + -------- + datetime.datetime : The standard Python datetime class that this method + returns. + Timestamp.timestamp : Convert a Timestamp object to POSIX timestamp. + Timestamp.to_datetime64 : Convert a Timestamp object to numpy.datetime64. Examples -------- @@ -1164,17 +1632,27 @@ cdef class _Timestamp(ABCTimestamp): >>> pd.NaT.to_pydatetime() NaT """ - if self.nanosecond != 0 and warn: + if self._nanosecond != 0 and warn: warnings.warn("Discarding nonzero nanoseconds in conversion.", UserWarning, stacklevel=find_stack_level()) - return datetime(self.year, self.month, self.day, + return datetime(self._year, self.month, self.day, self.hour, self.minute, self.second, self.microsecond, self.tzinfo, fold=self.fold) cpdef to_datetime64(self): """ - Return a numpy.datetime64 object with same precision. + Return a NumPy datetime64 object with same precision. + + This method returns a numpy.datetime64 object with the same + date and time information and precision as the pd.Timestamp object. + + See Also + -------- + numpy.datetime64 : Class to represent dates and times with high precision. + Timestamp.to_numpy : Alias for this method. + Timestamp.asm8 : Alias for this method. + pd.to_datetime : Convert argument to datetime. Examples -------- @@ -1197,6 +1675,15 @@ cdef class _Timestamp(ABCTimestamp): copy parameters are available here only for compatibility. Their values will not affect the return value. + Parameters + ---------- + dtype : dtype, optional + Data type of the output, ignored in this method as the return type + is always `numpy.datetime64`. + copy : bool, default False + Whether to ensure that the returned value is a new object. This + parameter is also ignored as the method does not support copying. + Returns ------- numpy.datetime64 @@ -1226,6 +1713,21 @@ cdef class _Timestamp(ABCTimestamp): """ Return an period of which this timestamp is an observation. + This method converts the given Timestamp to a Period object, + which represents a span of time,such as a year, month, etc., + based on the specified frequency. + + Parameters + ---------- + freq : str, optional + Frequency string for the period (e.g., 'Y', 'M', 'W'). Defaults to `None`. + + See Also + -------- + Timestamp : Represents a specific timestamp. + Period : Represents a span of time. + to_period : Converts an object to a Period. 
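# Converting away from nanosecond precision, per the docstrings above:
# to_pydatetime drops the nanoseconds (with a UserWarning), to_datetime64
# keeps them, and to_period buckets the instant into a span (illustrative):
import pandas as pd

ts = pd.Timestamp("2020-03-14 15:32:52.192548651")
print(ts.to_pydatetime())  # 2020-03-14 15:32:52.192548, warns about nanoseconds
print(ts.to_datetime64())  # 2020-03-14T15:32:52.192548651
print(ts.to_period("M"))   # 2020-03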
+ Examples -------- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') @@ -1263,7 +1765,7 @@ cdef class _Timestamp(ABCTimestamp): # Python front end to C extension type _Timestamp # This serves as the box for datetime64 - +@set_module("pandas") class Timestamp(_Timestamp): """ Pandas replacement for python datetime.datetime object. @@ -1277,15 +1779,29 @@ class Timestamp(_Timestamp): ---------- ts_input : datetime-like, str, int, float Value to be converted to Timestamp. - year, month, day : int - hour, minute, second, microsecond : int, optional, default 0 + year : int + Value of year. + month : int + Value of month. + day : int + Value of day. + hour : int, optional, default 0 + Value of hour. + minute : int, optional, default 0 + Value of minute. + second : int, optional, default 0 + Value of second. + microsecond : int, optional, default 0 + Value of microsecond. tzinfo : datetime.tzinfo, optional, default None + Timezone info. nanosecond : int, optional, default 0 - tz : str, pytz.timezone, dateutil.tz.tzfile or None + Value of nanosecond. + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile or None Time zone for time which Timestamp will have. unit : str Unit used for conversion if ts_input is of type int or float. The - valid values are 'D', 'h', 'm', 's', 'ms', 'us', and 'ns'. For + valid values are 'W', 'D', 'h', 'm', 's', 'ms', 'us', and 'ns'. For example, 's' means seconds and 'ms' means milliseconds. For float inputs, the result will be stored in nanoseconds, and @@ -1296,6 +1812,11 @@ class Timestamp(_Timestamp): datetime-like corresponds to the first (0) or the second time (1) the wall clock hits the ambiguous time. + See Also + -------- + Timedelta : Represents a duration, the difference between two dates or times. + datetime.datetime : Python datetime.datetime object. + Notes ----- There are essentially three calling conventions for the constructor. The @@ -1319,6 +1840,11 @@ class Timestamp(_Timestamp): >>> pd.Timestamp(1513393355.5, unit='s') Timestamp('2017-12-16 03:02:35.500000') + This converts an int representing a Unix-epoch in units of weeks + + >>> pd.Timestamp(1535, unit='W') + Timestamp('1999-06-03 00:00:00') + This converts an int representing a Unix-epoch in units of seconds and for a particular timezone @@ -1339,21 +1865,43 @@ class Timestamp(_Timestamp): """ Construct a timestamp from a a proleptic Gregorian ordinal. + This method creates a `Timestamp` object corresponding to the given + proleptic Gregorian ordinal, which is a count of days from January 1, + 0001 (using the proleptic Gregorian calendar). The time part of the + `Timestamp` is set to midnight (00:00:00) by default. + Parameters ---------- ordinal : int Date corresponding to a proleptic Gregorian ordinal. - tz : str, pytz.timezone, dateutil.tz.tzfile or None + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile or None Time zone for the Timestamp. + Returns + ------- + Timestamp + A `Timestamp` object representing the specified ordinal date. + + See Also + -------- + Timestamp : Represents a single timestamp, similar to `datetime`. + to_datetime : Converts various types of data to datetime. + Notes ----- By definition there cannot be any tz info on the ordinal itself. 
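# Two constructor-level changes above, sketched under the assumption that
# @set_module rewrites the class's reported module (as it does elsewhere in
# pandas) and that 'W' is now accepted as an integer unit:
import pandas as pd

print(pd.Timestamp.__module__)       # 'pandas', not 'pandas._libs.tslibs.timestamps'
print(pd.Timestamp(1535, unit="W"))  # 1999-06-03 00:00:00 (weeks since the epoch)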
Examples -------- + Convert an ordinal to a `Timestamp`: + >>> pd.Timestamp.fromordinal(737425) Timestamp('2020-01-01 00:00:00') + + Create a `Timestamp` from an ordinal with timezone information: + + >>> pd.Timestamp.fromordinal(737425, tz='UTC') + Timestamp('2020-01-01 00:00:00+0000', tz='UTC') """ return cls(datetime.fromordinal(ordinal), tz=tz) @@ -1362,11 +1910,21 @@ class Timestamp(_Timestamp): """ Return new Timestamp object representing current time local to tz. + This method returns a new `Timestamp` object that represents the current time. + If a timezone is provided, the current time will be localized to that timezone. + Otherwise, it returns the current local time. + Parameters ---------- tz : str or timezone object, default None Timezone to localize to. + See Also + -------- + to_datetime : Convert argument to datetime. + Timestamp.utcnow : Return a new Timestamp representing UTC day and time. + Timestamp.today : Return the current time in the local timezone. + Examples -------- >>> pd.Timestamp.now() # doctest: +SKIP @@ -1394,6 +1952,12 @@ class Timestamp(_Timestamp): tz : str or timezone object, default None Timezone to localize to. + See Also + -------- + datetime.datetime.today : Returns the current local date. + Timestamp.now : Returns current time with optional timezone. + Timestamp : A class representing a specific timestamp. + Examples -------- >>> pd.Timestamp.today() # doctest: +SKIP @@ -1413,11 +1977,31 @@ class Timestamp(_Timestamp): Return a new Timestamp representing UTC day and time. + See Also + -------- + Timestamp : Constructs an arbitrary datetime. + Timestamp.now : Return the current local date and time, which + can be timezone-aware. + Timestamp.today : Return the current local date and time with + timezone information set to None. + to_datetime : Convert argument to datetime. + date_range : Return a fixed frequency DatetimeIndex. + Timestamp.utctimetuple : Return UTC time tuple, compatible with + time.localtime(). + Examples -------- >>> pd.Timestamp.utcnow() # doctest: +SKIP Timestamp('2020-11-16 22:50:18.092888+0000', tz='UTC') """ + warnings.warn( + # The stdlib datetime.utcnow is deprecated, so we deprecate to match. + # GH#56680 + "Timestamp.utcnow is deprecated and will be removed in a future " + "version. Use Timestamp.now('UTC') instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) return cls.now(UTC) @classmethod @@ -1427,6 +2011,21 @@ class Timestamp(_Timestamp): Construct a timezone-aware UTC datetime from a POSIX timestamp. + This method creates a datetime object from a POSIX timestamp, keeping the + Timestamp object's timezone. + + Parameters + ---------- + ts : float + POSIX timestamp. + + See Also + -------- + Timezone.tzname : Return time zone name. + Timestamp.utcnow : Return a new Timestamp representing UTC day and time. + Timestamp.fromtimestamp : Transform timestamp[, tz] to tz's local + time from POSIX timestamp. + Notes ----- Timestamp.utcfromtimestamp behavior differs from datetime.utcfromtimestamp @@ -1438,21 +2037,56 @@ class Timestamp(_Timestamp): Timestamp('2020-03-14 15:32:52+0000', tz='UTC') """ # GH#22451 + warnings.warn( + # The stdlib datetime.utcfromtimestamp is deprecated, so we deprecate + # to match. GH#56680 + "Timestamp.utcfromtimestamp is deprecated and will be removed in a " + "future version. 
Use Timestamp.fromtimestamp(ts, 'UTC') instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) return cls.fromtimestamp(ts, tz="UTC") @classmethod def fromtimestamp(cls, ts, tz=None): """ - Timestamp.fromtimestamp(ts) + Create a `Timestamp` object from a POSIX timestamp. + + This method converts a POSIX timestamp (the number of seconds since + January 1, 1970, 00:00:00 UTC) into a `Timestamp` object. The resulting + `Timestamp` can be localized to a specific time zone if provided. - Transform timestamp[, tz] to tz's local time from POSIX timestamp. + Parameters + ---------- + ts : float + The POSIX timestamp to convert, representing seconds since + the epoch (1970-01-01 00:00:00 UTC). + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile, optional + Time zone for the `Timestamp`. If not provided, the `Timestamp` will + be timezone-naive (i.e., without time zone information). + + Returns + ------- + Timestamp + A `Timestamp` object representing the given POSIX timestamp. + + See Also + -------- + Timestamp : Represents a single timestamp, similar to `datetime`. + to_datetime : Converts various types of data to datetime. + datetime.datetime.fromtimestamp : Returns a datetime from a POSIX timestamp. Examples -------- + Convert a POSIX timestamp to a `Timestamp`: + >>> pd.Timestamp.fromtimestamp(1584199972) # doctest: +SKIP Timestamp('2020-03-14 15:32:52') - Note that the output may change depending on your local time. + Note that the output may change depending on your local time and time zone: + + >>> pd.Timestamp.fromtimestamp(1584199972, tz='UTC') # doctest: +SKIP + Timestamp('2020-03-14 15:32:52+0000', tz='UTC') """ tz = maybe_get_tz(tz) return cls(datetime.fromtimestamp(ts, tz)) @@ -1468,6 +2102,12 @@ class Timestamp(_Timestamp): See strftime documentation for more information on the format string: https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior. + See Also + -------- + Timestamp.isoformat : Return the time formatted according to ISO 8601. + pd.to_datetime : Convert argument to datetime. + Period.strftime : Format a single Period. + Examples -------- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') @@ -1475,7 +2115,7 @@ class Timestamp(_Timestamp): '2020-03-14 15:32:52' """ try: - _dt = datetime(self.year, self.month, self.day, + _dt = datetime(self._year, self.month, self.day, self.hour, self.minute, self.second, self.microsecond, self.tzinfo, fold=self.fold) except ValueError as err: @@ -1489,7 +2129,25 @@ class Timestamp(_Timestamp): def ctime(self): """ - Return ctime() style string. + Return a ctime() style string representing the Timestamp. + + This method returns a string representing the date and time + in the format returned by the standard library's `time.ctime()` + function, which is typically in the form 'Day Mon DD HH:MM:SS YYYY'. + + If the `Timestamp` is outside the range supported by Python's + standard library, a `NotImplementedError` is raised. + + Returns + ------- + str + A string representing the Timestamp in ctime format. + + See Also + -------- + time.ctime : Return a string representing time in ctime format. + Timestamp : Represents a single timestamp, similar to `datetime`. + datetime.datetime.ctime : Return a ctime style string from a datetime object. 
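# Migration sketch for the two deprecations above (each deprecated call now
# emits a FutureWarning pointing at its replacement):
import pandas as pd

pd.Timestamp.utcnow()                      # deprecated
pd.Timestamp.now("UTC")                    # preferred replacement
pd.Timestamp.utcfromtimestamp(1584199972)  # deprecated
pd.Timestamp.fromtimestamp(1584199972, tz="UTC")  # preferred replacement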
Examples -------- @@ -1500,7 +2158,7 @@ class Timestamp(_Timestamp): 'Sun Jan 1 10:00:00 2023' """ try: - _dt = datetime(self.year, self.month, self.day, + _dt = datetime(self._year, self.month, self.day, self.hour, self.minute, self.second, self.microsecond, self.tzinfo, fold=self.fold) except ValueError as err: @@ -1514,10 +2172,25 @@ class Timestamp(_Timestamp): def date(self): """ - Return date object with same year, month and day. + Returns `datetime.date` with the same year, month, and day. + + This method extracts the date component from the `Timestamp` and returns + it as a `datetime.date` object, discarding the time information. + + Returns + ------- + datetime.date + The date part of the `Timestamp`. + + See Also + -------- + Timestamp : Represents a single timestamp, similar to `datetime`. + datetime.datetime.date : Extract the date component from a `datetime` object. Examples -------- + Extract the date from a Timestamp: + >>> ts = pd.Timestamp('2023-01-01 10:00:00.00') >>> ts Timestamp('2023-01-01 10:00:00') @@ -1525,7 +2198,7 @@ class Timestamp(_Timestamp): datetime.date(2023, 1, 1) """ try: - _dt = dt.date(self.year, self.month, self.day) + _dt = dt.date(self._year, self.month, self.day) except ValueError as err: raise NotImplementedError( "date not yet supported on Timestamps which " @@ -1537,6 +2210,14 @@ class Timestamp(_Timestamp): """ Return the daylight saving time (DST) adjustment. + This method returns the DST adjustment as a `datetime.timedelta` object + if the Timestamp is timezone-aware and DST is applicable. + + See Also + -------- + Timestamp.tz_localize : Localize the Timestamp to a timezone. + Timestamp.tz_convert : Convert timezone-aware Timestamp to another time zone. + Examples -------- >>> ts = pd.Timestamp('2000-06-01 00:00:00', tz='Europe/Brussels') @@ -1551,6 +2232,12 @@ class Timestamp(_Timestamp): """ Return a named tuple containing ISO year, week number, and weekday. + See Also + -------- + DatetimeIndex.isocalendar : Return a 3-tuple containing ISO year, + week number, and weekday for the given DatetimeIndex object. + datetime.date.isocalendar : The equivalent method for `datetime.date` objects. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00') @@ -1560,7 +2247,7 @@ class Timestamp(_Timestamp): datetime.IsoCalendarDate(year=2022, week=52, weekday=7) """ try: - _dt = datetime(self.year, self.month, self.day, + _dt = datetime(self._year, self.month, self.day, self.hour, self.minute, self.second, self.microsecond, self.tzinfo, fold=self.fold) except ValueError as err: @@ -1574,6 +2261,13 @@ class Timestamp(_Timestamp): """ Return time zone name. + This method returns the name of the Timestamp's time zone as a string. + + See Also + -------- + Timestamp.tzinfo : Returns the timezone information of the Timestamp. + Timestamp.tz_convert : Convert timezone-aware Timestamp to another time zone. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00', tz='Europe/Brussels') @@ -1584,10 +2278,54 @@ class Timestamp(_Timestamp): """ return super().tzname() + @property + def tzinfo(self): + """ + Returns the timezone info of the Timestamp. + + This property returns a `datetime.tzinfo` object if the Timestamp + is timezone-aware. If the Timestamp has no timezone, it returns `None`. + If the Timestamp is in UTC or a fixed-offset timezone, + it returns `datetime.timezone`. If the Timestamp uses an + IANA timezone (e.g., "America/New_York"), it returns `zoneinfo.ZoneInfo`. 
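# The tzinfo property wrapper above preserves stdlib semantics: UTC and fixed
# offsets come back as datetime.timezone, naive stamps give None (illustrative):
import pandas as pd

aware = pd.Timestamp("2023-01-01 12:00:00", tz="UTC")
print(aware.tzinfo)         # datetime.timezone.utc
print(aware.isocalendar())  # IsoCalendarDate(year=2022, week=52, weekday=7)
print(pd.Timestamp("2023-01-01 12:00:00").tzinfo)  # None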
+ + See Also + -------- + Timestamp.tz : Alias for `tzinfo`, may return a `zoneinfo.ZoneInfo` object. + Timestamp.tz_convert : Convert timezone-aware Timestamp to another time zone. + Timestamp.tz_localize : Localize the Timestamp to a specific timezone. + + Examples + -------- + >>> ts = pd.Timestamp("2023-01-01 12:00:00", tz="UTC") + >>> ts.tzinfo + datetime.timezone.utc + + >>> ts_naive = pd.Timestamp("2023-01-01 12:00:00") + >>> ts_naive.tzinfo + """ + return super().tzinfo + def utcoffset(self): """ Return utc offset. + This method returns the difference between UTC and the local time + as a `timedelta` object. It is useful for understanding the time + difference between the current timezone and UTC. + + Returns + ------- + timedelta + The difference between UTC and the local time as a `timedelta` object. + + See Also + -------- + datetime.datetime.utcoffset : + Standard library method to get the UTC offset of a datetime object. + Timestamp.tzname : Return the name of the timezone. + Timestamp.dst : Return the daylight saving time (DST) adjustment. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00', tz='Europe/Brussels') @@ -1600,7 +2338,24 @@ class Timestamp(_Timestamp): def utctimetuple(self): """ - Return UTC time tuple, compatible with time.localtime(). + Return UTC time tuple, compatible with `time.localtime()`. + + This method converts the Timestamp to UTC and returns a time tuple + containing 9 components: year, month, day, hour, minute, second, + weekday, day of year, and DST flag. This is particularly useful for + converting a Timestamp to a format compatible with time module functions. + + Returns + ------- + time.struct_time + A time.struct_time object representing the UTC time. + + See Also + -------- + datetime.datetime.utctimetuple : + Return UTC time tuple, compatible with time.localtime(). + Timestamp.timetuple : Return time tuple of local time. + time.struct_time : Time tuple structure used by time functions. Examples -------- @@ -1617,6 +2372,16 @@ class Timestamp(_Timestamp): """ Return time object with same time but with tzinfo=None. + This method extracts the time part of the `Timestamp` object, excluding any + timezone information. It returns a `datetime.time` object which only represents + the time (hours, minutes, seconds, and microseconds). + + See Also + -------- + Timestamp.date : Return date object with same year, month and day. + Timestamp.tz_convert : Convert timezone-aware Timestamp to another time zone. + Timestamp.tz_localize : Localize the Timestamp to a timezone. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00') @@ -1631,6 +2396,17 @@ class Timestamp(_Timestamp): """ Return time tuple, compatible with time.localtime(). + This method converts the `Timestamp` into a time tuple, which is compatible + with functions like `time.localtime()`. The time tuple is a named tuple with + attributes such as year, month, day, hour, minute, second, weekday, + day of the year, and daylight savings indicator. + + See Also + -------- + time.localtime : Converts a POSIX timestamp into a time tuple. + Timestamp : The `Timestamp` that represents a specific point in time. + datetime.datetime.timetuple : Equivalent method in the `datetime` module. 
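+        Timestamp.utctimetuple : Return UTC time tuple, compatible with time.localtime().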
+ Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00') @@ -1641,7 +2417,7 @@ class Timestamp(_Timestamp): tm_hour=10, tm_min=0, tm_sec=0, tm_wday=6, tm_yday=1, tm_isdst=-1) """ try: - _dt = datetime(self.year, self.month, self.day, + _dt = datetime(self._year, self.month, self.day, self.hour, self.minute, self.second, self.microsecond, self.tzinfo, fold=self.fold) except ValueError as err: @@ -1655,6 +2431,19 @@ class Timestamp(_Timestamp): """ Return time object with same time and tzinfo. + This method returns a datetime.time object with + the time and tzinfo corresponding to the pd.Timestamp + object, ignoring any information about the day/date. + + See Also + -------- + datetime.datetime.timetz : Return datetime.time object with the + same time attributes as the datetime object. + datetime.time : Class to represent the time of day, independent + of any particular day. + datetime.datetime.tzinfo : Attribute of datetime.datetime objects + representing the timezone of the datetime object. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00', tz='Europe/Brussels') @@ -1669,6 +2458,17 @@ class Timestamp(_Timestamp): """ Return proleptic Gregorian ordinal. January 1 of year 1 is day 1. + The proleptic Gregorian ordinal is a continuous count of days since + January 1 of year 1, which is considered day 1. This method converts + the `Timestamp` to its equivalent ordinal number, useful for date arithmetic + and comparison operations. + + See Also + -------- + datetime.datetime.toordinal : Equivalent method in the `datetime` module. + Timestamp : The `Timestamp` that represents a specific point in time. + Timestamp.fromordinal : Create a `Timestamp` from an ordinal. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:50') @@ -1678,7 +2478,7 @@ class Timestamp(_Timestamp): 738521 """ try: - _dt = datetime(self.year, self.month, self.day, + _dt = datetime(self._year, self.month, self.day, self.hour, self.minute, self.second, self.microsecond, self.tzinfo, fold=self.fold) except ValueError as err: @@ -1692,9 +2492,27 @@ class Timestamp(_Timestamp): @classmethod def strptime(cls, date_string, format): """ - Timestamp.strptime(string, format) + Convert string argument to datetime. + + This method is not implemented; calling it will raise NotImplementedError. + Use pd.to_datetime() instead. + + Parameters + ---------- + date_string : str + String to convert to a datetime. + format : str, default None + The format string to parse time, e.g. "%d/%m/%Y". - Function is not implemented. Use pd.to_datetime(). + See Also + -------- + pd.to_datetime : Convert argument to datetime. + datetime.datetime.strptime : Return a datetime corresponding to a string + representing a date and time, parsed according to a separate + format string. + datetime.datetime.strftime : Return a string representing the date and + time, controlled by an explicit format string. + Timestamp.isoformat : Return the time formatted according to ISO 8601. Examples -------- @@ -1712,7 +2530,28 @@ class Timestamp(_Timestamp): """ Timestamp.combine(date, time) - Combine date, time into datetime with same date and time fields. + Combine a date and time into a single Timestamp object. + + This method takes a `date` object and a `time` object + and combines them into a single `Timestamp` + that has the same date and time fields. + + Parameters + ---------- + date : datetime.date + The date part of the Timestamp. + time : datetime.time + The time part of the Timestamp. 
+ + Returns + ------- + Timestamp + A new `Timestamp` object representing the combined date and time. + + See Also + -------- + Timestamp : Represents a single timestamp, similar to `datetime`. + to_datetime : Converts various types of data to datetime. Examples -------- @@ -1735,7 +2574,7 @@ class Timestamp(_Timestamp): tzinfo_type tzinfo=None, *, nanosecond=None, - tz=None, + tz=_no_input, unit=None, fold=None, ): @@ -1767,6 +2606,10 @@ class Timestamp(_Timestamp): _date_attributes = [year, month, day, hour, minute, second, microsecond, nanosecond] + explicit_tz_none = tz is None + if tz is _no_input: + tz = None + if tzinfo is not None: # GH#17690 tzinfo must be a datetime.tzinfo object, ensured # by the cython annotation. @@ -1867,6 +2710,11 @@ class Timestamp(_Timestamp): if ts.value == NPY_NAT: return NaT + if ts.tzinfo is not None and explicit_tz_none: + raise ValueError( + "Passed data is timezone-aware, incompatible with 'tz=None'." + ) + return create_timestamp_from_ts(ts.value, ts.dts, ts.tzinfo, ts.fold, ts.creso) def _round(self, freq, mode, ambiguous="raise", nonexistent="raise"): @@ -1908,6 +2756,12 @@ class Timestamp(_Timestamp): """ Round the Timestamp to the specified resolution. + This method rounds the given Timestamp to the nearest multiple of + the specified frequency. It is particularly useful in data analysis to + normalize timestamps to regular frequency intervals. For instance, + rounding to the nearest minute, hour, or day can help in time series + comparisons or resampling operations. + Parameters ---------- freq : str @@ -1918,9 +2772,9 @@ class Timestamp(_Timestamp): * bool contains flags to determine if time is dst or not (note that this flag is only applicable for ambiguous fall dst dates). * 'NaT' will return NaT for an ambiguous time. - * 'raise' will raise an AmbiguousTimeError for an ambiguous time. + * 'raise' will raise a ValueError for an ambiguous time. - nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ + nonexistent : {'raise', 'shift_forward', 'shift_backward', 'NaT', \ timedelta}, default 'raise' A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. @@ -1931,7 +2785,7 @@ timedelta}, default 'raise' closest existing time. * 'NaT' will return NaT where there are nonexistent times. * timedelta objects will shift nonexistent times by the timedelta. - * 'raise' will raise an NonExistentTimeError if there are + * 'raise' will raise a ValueError if there are nonexistent times. Returns ------- a new Timestamp rounded to the given resolution of `freq` Raises ------ ValueError if the freq cannot be converted + See Also + -------- + DatetimeIndex.round : Perform round operation on the data to the specified freq. + Timestamp.floor : Round the Timestamp downward to the nearest multiple + of the specified frequency. + Timestamp.ceil : Round the Timestamp upward to the nearest multiple of + the specified frequency.
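+        Series.dt.round : Round the datetime values in a Series.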
+ Notes ----- If the Timestamp has a timezone, rounding will take place relative to the @@ -1957,16 +2819,16 @@ timedelta}, default 'raise' A timestamp can be rounded using multiple frequency units: - >>> ts.round(freq='h') # hour + >>> ts.round(freq='h') # hour Timestamp('2020-03-14 16:00:00') - >>> ts.round(freq='min') # minute + >>> ts.round(freq='min') # minute Timestamp('2020-03-14 15:33:00') - >>> ts.round(freq='s') # seconds + >>> ts.round(freq='s') # seconds Timestamp('2020-03-14 15:32:52') - >>> ts.round(freq='ms') # milliseconds + >>> ts.round(freq='ms') # milliseconds Timestamp('2020-03-14 15:32:52.193000') ``freq`` can also be a multiple of a single unit, like '5min' (i.e. 5 minutes): @@ -2013,9 +2875,9 @@ timedelta}, default 'raise' * bool contains flags to determine if time is dst or not (note that this flag is only applicable for ambiguous fall dst dates). * 'NaT' will return NaT for an ambiguous time. - * 'raise' will raise an AmbiguousTimeError for an ambiguous time. + * 'raise' will raise a ValueError for an ambiguous time. - nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ + nonexistent : {'raise', 'shift_forward', 'shift_backward', 'NaT', \ timedelta}, default 'raise' A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. @@ -2026,13 +2888,19 @@ timedelta}, default 'raise' closest existing time. * 'NaT' will return NaT where there are nonexistent times. * timedelta objects will shift nonexistent times by the timedelta. - * 'raise' will raise an NonExistentTimeError if there are + * 'raise' will raise a ValueError if there are nonexistent times. Raises ------ ValueError if the freq cannot be converted. + See Also + -------- + Timestamp.ceil : Round up a Timestamp to the specified resolution. + Timestamp.round : Round a Timestamp to the specified resolution. + Series.dt.floor : Round down the datetime values in a Series. + Notes ----- If the Timestamp has a timezone, flooring will take place relative to the @@ -2048,16 +2916,16 @@ timedelta}, default 'raise' A timestamp can be floored using multiple frequency units: - >>> ts.floor(freq='h') # hour + >>> ts.floor(freq='h') # hour Timestamp('2020-03-14 15:00:00') - >>> ts.floor(freq='min') # minute + >>> ts.floor(freq='min') # minute Timestamp('2020-03-14 15:32:00') - >>> ts.floor(freq='s') # seconds + >>> ts.floor(freq='s') # seconds Timestamp('2020-03-14 15:32:52') - >>> ts.floor(freq='ns') # nanoseconds + >>> ts.floor(freq='ns') # nanoseconds Timestamp('2020-03-14 15:32:52.192548651') ``freq`` can also be a multiple of a single unit, like '5min' (i.e. 5 minutes): @@ -2102,9 +2970,9 @@ timedelta}, default 'raise' * bool contains flags to determine if time is dst or not (note that this flag is only applicable for ambiguous fall dst dates). * 'NaT' will return NaT for an ambiguous time. - * 'raise' will raise an AmbiguousTimeError for an ambiguous time. + * 'raise' will raise a ValueError for an ambiguous time. - nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ + nonexistent : {'raise', 'shift_forward', 'shift_backward', 'NaT', \ timedelta}, default 'raise' A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. @@ -2115,13 +2983,19 @@ timedelta}, default 'raise' closest existing time. * 'NaT' will return NaT where there are nonexistent times. * timedelta objects will shift nonexistent times by the timedelta. 
- * 'raise' will raise an NonExistentTimeError if there are + * 'raise' will raise a ValueError if there are nonexistent times. Raises ------ ValueError if the freq cannot be converted. + See Also + -------- + Timestamp.floor : Round down a Timestamp to the specified resolution. + Timestamp.round : Round a Timestamp to the specified resolution. + Series.dt.ceil : Ceil the datetime values in a Series. + Notes ----- If the Timestamp has a timezone, ceiling will take place relative to the @@ -2137,16 +3011,16 @@ timedelta}, default 'raise' A timestamp can be ceiled using multiple frequency units: - >>> ts.ceil(freq='h') # hour + >>> ts.ceil(freq='h') # hour Timestamp('2020-03-14 16:00:00') - >>> ts.ceil(freq='min') # minute + >>> ts.ceil(freq='min') # minute Timestamp('2020-03-14 15:33:00') - >>> ts.ceil(freq='s') # seconds + >>> ts.ceil(freq='s') # seconds Timestamp('2020-03-14 15:32:53') - >>> ts.ceil(freq='us') # microseconds + >>> ts.ceil(freq='us') # microseconds Timestamp('2020-03-14 15:32:52.192549') ``freq`` can also be a multiple of a single unit, like '5min' (i.e. 5 minutes): @@ -2182,11 +3056,22 @@ timedelta}, default 'raise' """ Alias for tzinfo. + The `tz` property provides a simple and direct way to retrieve the timezone + information of a `Timestamp` object. It is particularly useful when working + with time series data that includes timezone information, allowing for easy + access and manipulation of the timezone context. + + See Also + -------- + Timestamp.tzinfo : Returns the timezone information of the Timestamp. + Timestamp.tz_convert : Convert timezone-aware Timestamp to another time zone. + Timestamp.tz_localize : Localize the Timestamp to a timezone. + Examples -------- >>> ts = pd.Timestamp(1584226800, unit='s', tz='Europe/Stockholm') >>> ts.tz - + zoneinfo.ZoneInfo(key='Europe/Stockholm') """ return self.tzinfo @@ -2207,7 +3092,7 @@ timedelta}, default 'raise' Parameters ---------- - tz : str, pytz.timezone, dateutil.tz.tzfile or None + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile or None Time zone for time which Timestamp will be converted to. None will remove timezone holding local time. @@ -2224,9 +3109,9 @@ timedelta}, default 'raise' * bool contains flags to determine if time is dst or not (note that this flag is only applicable for ambiguous fall dst dates). * 'NaT' will return NaT for an ambiguous time. - * 'raise' will raise an AmbiguousTimeError for an ambiguous time. + * 'raise' will raise a ValueError for an ambiguous time. - nonexistent : 'shift_forward', 'shift_backward, 'NaT', timedelta, \ + nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta, \ default 'raise' A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. @@ -2239,7 +3124,7 @@ default 'raise' closest existing time. * 'NaT' will return NaT where there are nonexistent times. * timedelta objects will shift nonexistent times by the timedelta. - * 'raise' will raise an NonExistentTimeError if there are + * 'raise' will raise a ValueError if there are nonexistent times. Returns @@ -2251,6 +3136,13 @@ default 'raise' TypeError If the Timestamp is tz-aware and tz is not None. + See Also + -------- + Timestamp.tzinfo : Returns the timezone information of the Timestamp. + Timestamp.tz_convert : Convert timezone-aware Timestamp to another time zone. + DatetimeIndex.tz_localize : Localize a DatetimeIndex to a specific time zone. + datetime.datetime.astimezone : Convert a datetime object to another time zone. 
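+        Series.dt.tz_localize : Localize the values of a timezone-naive Series.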
+ Examples -------- Create a naive timestamp object: @@ -2307,9 +3199,14 @@ default 'raise' """ Convert timezone-aware Timestamp to another time zone. + This method is used to convert a timezone-aware Timestamp object to a + different time zone. The original UTC time remains the same; only the + time zone information is changed. If the Timestamp is timezone-naive, a + TypeError is raised. + Parameters ---------- - tz : str, pytz.timezone, dateutil.tz.tzfile or None + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile or None Time zone for time which Timestamp will be converted to. None will remove timezone holding UTC time. @@ -2322,6 +3219,13 @@ default 'raise' TypeError If Timestamp is tz-naive. + See Also + -------- + Timestamp.tz_localize : Localize the Timestamp to a timezone. + DatetimeIndex.tz_convert : Convert a DatetimeIndex to another time zone. + DatetimeIndex.tz_localize : Localize a DatetimeIndex to a specific time zone. + datetime.datetime.astimezone : Convert a datetime object to another time zone. + Examples -------- Create a timestamp object with UTC timezone: @@ -2374,22 +3278,48 @@ default 'raise' """ Implements datetime.replace, handles nanoseconds. + This method creates a new `Timestamp` object by replacing the specified + fields with new values. The new `Timestamp` retains the original fields + that are not explicitly replaced. This method handles nanoseconds, and + the `tzinfo` parameter allows for timezone replacement without conversion. + Parameters ---------- year : int, optional + The year to replace. If `None`, the year is not changed. month : int, optional + The month to replace. If `None`, the month is not changed. day : int, optional + The day to replace. If `None`, the day is not changed. hour : int, optional + The hour to replace. If `None`, the hour is not changed. minute : int, optional + The minute to replace. If `None`, the minute is not changed. second : int, optional + The second to replace. If `None`, the second is not changed. microsecond : int, optional + The microsecond to replace. If `None`, the microsecond is not changed. nanosecond : int, optional + The nanosecond to replace. If `None`, the nanosecond is not changed. tzinfo : tz-convertible, optional + The timezone information to replace. If `None`, the timezone is not changed. fold : int, optional + The fold information to replace. If `None`, the fold is not changed. Returns ------- - Timestamp with fields replaced + Timestamp + A new `Timestamp` object with the specified fields replaced. + + See Also + -------- + Timestamp : Represents a single timestamp, similar to `datetime`. + to_datetime : Converts various types of data to datetime. + + Notes + ----- + The `replace` method does not perform timezone conversions. If you need + to convert the timezone, use the `tz_convert` method instead. 
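+        Because `Timestamp` objects are immutable, `replace` returns a new +        object rather than modifying the original one.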
Examples -------- @@ -2406,13 +3336,13 @@ default 'raise' Replace timezone (not a conversion): - >>> import pytz - >>> ts.replace(tzinfo=pytz.timezone('US/Pacific')) + >>> import zoneinfo + >>> ts.replace(tzinfo=zoneinfo.ZoneInfo('US/Pacific')) Timestamp('2020-03-14 15:32:52.192548651-0700', tz='US/Pacific') Analogous for ``pd.NaT``: - >>> pd.NaT.replace(tzinfo=pytz.timezone('US/Pacific')) + >>> pd.NaT.replace(tzinfo=zoneinfo.ZoneInfo('US/Pacific')) NaT """ @@ -2437,7 +3367,7 @@ default 'raise' # setup components pandas_datetime_to_datetimestruct(value, self._creso, &dts) - dts.ps = self.nanosecond * 1000 + dts.ps = self._nanosecond * 1000 # replace def validate(k, v): @@ -2512,7 +3442,14 @@ default 'raise' """ Convert TimeStamp to a Julian Date. - 0 Julian date is noon January 1, 4713 BC. + This method returns the number of days as a float since + 0 Julian date, which is noon January 1, 4713 BC. + + See Also + -------- + Timestamp.toordinal : Return proleptic Gregorian ordinal. + Timestamp.timestamp : Return POSIX timestamp as float. + Timestamp : Represents a single timestamp. Examples -------- @@ -2520,7 +3457,7 @@ default 'raise' >>> ts.to_julian_date() 2458923.147824074 """ - year = self.year + year = self._year month = self.month day = self.day if month <= 2: @@ -2537,7 +3474,7 @@ default 'raise' self.minute / 60.0 + self.second / 3600.0 + self.microsecond / 3600.0 / 1e+6 + - self.nanosecond / 3600.0 / 1e+9 + self._nanosecond / 3600.0 / 1e+9 ) / 24.0) def isoweekday(self): @@ -2546,6 +3483,13 @@ default 'raise' Monday == 1 ... Sunday == 7. + See Also + -------- + Timestamp.weekday : Return the day of the week with Monday=0, Sunday=6. + Timestamp.isocalendar : Return a tuple containing ISO year, week number + and weekday. + datetime.date.isoweekday : Equivalent method in datetime module. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00') @@ -2565,6 +3509,12 @@ default 'raise' Monday == 0 ... Sunday == 6. + See Also + -------- + Timestamp.dayofweek : Return the day of the week with Monday=0, Sunday=6. + Timestamp.isoweekday : Return the day of the week with Monday=1, Sunday=7. + datetime.date.weekday : Equivalent method in datetime module. 
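+        Timestamp.day_name : Return the day name of the Timestamp with specified locale.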
+ Examples -------- >>> ts = pd.Timestamp('2023-01-01') @@ -2575,7 +3525,7 @@ default 'raise' """ # same as super().weekday(), but that breaks because of how # we have overridden year, see note in create_timestamp_from_ts - return ccalendar.dayofweek(self.year, self.month, self.day) + return ccalendar.dayofweek(self._year, self.month, self.day) # Aliases diff --git a/pandas/_libs/tslibs/timezones.pyi b/pandas/_libs/tslibs/timezones.pyi index 4e9f0c6ae6c33..26ffa568a8480 100644 --- a/pandas/_libs/tslibs/timezones.pyi +++ b/pandas/_libs/tslibs/timezones.pyi @@ -1,8 +1,8 @@ +from collections.abc import Callable from datetime import ( datetime, tzinfo, ) -from typing import Callable import numpy as np diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index 10e5790dd1c35..36b644ffc826d 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -2,17 +2,10 @@ from datetime import ( timedelta, timezone, ) +import zoneinfo from pandas.compat._optional import import_optional_dependency -try: - # py39+ - import zoneinfo - from zoneinfo import ZoneInfo -except ImportError: - zoneinfo = None - ZoneInfo = None - from cpython.datetime cimport ( datetime, timedelta, @@ -28,8 +21,8 @@ from dateutil.tz import ( tzutc as _dateutil_tzutc, ) import numpy as np -import pytz -from pytz.tzinfo import BaseTzInfo as _pytz_BaseTzInfo + +pytz = import_optional_dependency("pytz", errors="ignore") cimport numpy as cnp from numpy cimport int64_t @@ -45,10 +38,11 @@ from pandas._libs.tslibs.util cimport ( cdef int64_t NPY_NAT = get_nat() cdef tzinfo utc_stdlib = timezone.utc -cdef tzinfo utc_pytz = pytz.utc +cdef tzinfo utc_pytz = pytz.UTC if pytz else None cdef tzinfo utc_dateutil_str = dateutil_gettz("UTC") # NB: *not* the same as tzutc() cdef tzinfo utc_zoneinfo = None +cdef type ZoneInfo = zoneinfo.ZoneInfo # ---------------------------------------------------------------------- @@ -56,13 +50,13 @@ cdef tzinfo utc_zoneinfo = None cdef bint is_utc_zoneinfo(tzinfo tz): # Workaround for cases with missing tzdata # https://github.com/pandas-dev/pandas/pull/46425#discussion_r830633025 - if tz is None or zoneinfo is None: + if tz is None: return False global utc_zoneinfo if utc_zoneinfo is None: try: - utc_zoneinfo = ZoneInfo("UTC") + utc_zoneinfo = zoneinfo.ZoneInfo("UTC") except zoneinfo.ZoneInfoNotFoundError: return False # Warn if tzdata is too old, even if there is a system tzdata to alert @@ -74,17 +68,15 @@ cdef bint is_utc_zoneinfo(tzinfo tz): cpdef inline bint is_utc(tzinfo tz): return ( - tz is utc_pytz - or tz is utc_stdlib + tz is utc_stdlib or isinstance(tz, _dateutil_tzutc) or tz is utc_dateutil_str or is_utc_zoneinfo(tz) + or (utc_pytz is not None and tz is utc_pytz) ) cdef bint is_zoneinfo(tzinfo tz): - if ZoneInfo is None: - return False return isinstance(tz, ZoneInfo) @@ -119,27 +111,26 @@ cpdef inline object get_timezone(tzinfo tz): raise TypeError("tz argument cannot be None") if is_utc(tz): return tz + elif is_zoneinfo(tz): + return tz.key + elif treat_tz_as_pytz(tz): + zone = tz.zone + if zone is None: + return tz + return zone + elif treat_tz_as_dateutil(tz): + if ".tar.gz" in tz._filename: + raise ValueError( + "Bad tz filename. Dateutil on python 3 on windows has a " + "bug which causes tzfile._filename to be the same for all " + "timezone files. Please construct dateutil timezones " + 'implicitly by passing a string like "dateutil/Europe' + '/London" when you construct your pandas objects instead ' + "of passing a timezone object. 
See " + "https://github.com/pandas-dev/pandas/pull/7362") + return "dateutil/" + tz._filename else: - if treat_tz_as_dateutil(tz): - if ".tar.gz" in tz._filename: - raise ValueError( - "Bad tz filename. Dateutil on python 3 on windows has a " - "bug which causes tzfile._filename to be the same for all " - "timezone files. Please construct dateutil timezones " - 'implicitly by passing a string like "dateutil/Europe' - '/London" when you construct your pandas objects instead ' - "of passing a timezone object. See " - "https://github.com/pandas-dev/pandas/pull/7362") - return "dateutil/" + tz._filename - else: - # tz is a pytz timezone or unknown. - try: - zone = tz.zone - if zone is None: - return tz - return zone - except AttributeError: - return tz + return tz cpdef inline tzinfo maybe_get_tz(object tz): @@ -167,7 +158,7 @@ cpdef inline tzinfo maybe_get_tz(object tz): elif tz == "UTC" or tz == "utc": tz = utc_stdlib else: - tz = pytz.timezone(tz) + tz = zoneinfo.ZoneInfo(tz) elif is_integer_object(tz): tz = timezone(timedelta(seconds=tz)) elif isinstance(tz, tzinfo): @@ -206,7 +197,7 @@ cdef object tz_cache_key(tzinfo tz): the same tz file). Also, pytz objects are not always hashable so we use str(tz) instead. """ - if isinstance(tz, _pytz_BaseTzInfo): + if pytz is not None and isinstance(tz, pytz.tzinfo.BaseTzInfo): return tz.zone elif isinstance(tz, _dateutil_tzfile): if ".tar.gz" in tz._filename: @@ -240,7 +231,7 @@ cpdef inline bint is_fixed_offset(tzinfo tz): return 1 else: return 0 - elif treat_tz_as_pytz(tz): + elif treat_tz_as_pytz(tz) and pytz is not None: if (len(tz._transition_info) == 0 and len(tz._utc_transition_times) == 0): return 1 diff --git a/pandas/_libs/tslibs/tzconversion.pyi b/pandas/_libs/tslibs/tzconversion.pyi index 2108fa0f35547..07ee46858577a 100644 --- a/pandas/_libs/tslibs/tzconversion.pyi +++ b/pandas/_libs/tslibs/tzconversion.pyi @@ -1,8 +1,8 @@ +from collections.abc import Iterable from datetime import ( timedelta, tzinfo, ) -from typing import Iterable import numpy as np diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index 2c4f0cd14db13..c100f315e9a19 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -15,7 +15,6 @@ from cython cimport Py_ssize_t import_datetime() import numpy as np -import pytz cimport numpy as cnp from numpy cimport ( @@ -196,8 +195,8 @@ def tz_localize_to_utc( NPY_DATETIMEUNIT creso=NPY_DATETIMEUNIT.NPY_FR_ns, ): """ - Localize tzinfo-naive i8 to given time zone (using pytz). If - there are ambiguities in the values, raise AmbiguousTimeError. + Localize tzinfo-naive i8 to given time zone. If + there are ambiguities in the values, raise ValueError. Parameters ---------- @@ -368,7 +367,7 @@ timedelta-like} result[i] = NPY_NAT else: stamp = _render_tstamp(val, creso=creso) - raise pytz.AmbiguousTimeError( + raise ValueError( f"Cannot infer dst time from {stamp}, try using the " "'ambiguous' argument" ) @@ -428,7 +427,10 @@ timedelta-like} result[i] = NPY_NAT else: stamp = _render_tstamp(val, creso=creso) - raise pytz.NonExistentTimeError(stamp) + raise ValueError( + f"{stamp} is a nonexistent time due to daylight savings time. " + "Try using the 'nonexistent' argument." 
+ ) return result.base # .base to get underlying ndarray @@ -607,7 +609,8 @@ cdef ndarray[int64_t] _get_dst_hours( ndarray[uint8_t, cast=True] mismatch ndarray[int64_t] delta, dst_hours ndarray[intp_t] switch_idxs, trans_idx, grp, a_idx, b_idx, one_diff - list trans_grp + # TODO: Can uncomment when numpy >=2 is the minimum + # tuple trans_grp intp_t switch_idx int64_t left, right @@ -630,7 +633,7 @@ cdef ndarray[int64_t] _get_dst_hours( if trans_idx.size == 1: # see test_tz_localize_to_utc_ambiguous_infer stamp = _render_tstamp(vals[trans_idx[0]], creso=creso) - raise pytz.AmbiguousTimeError( + raise ValueError( f"Cannot infer dst time from {stamp} as there " "are no repeated times" ) @@ -652,14 +655,16 @@ cdef ndarray[int64_t] _get_dst_hours( if grp.size == 1 or np.all(delta > 0): # see test_tz_localize_to_utc_ambiguous_infer stamp = _render_tstamp(vals[grp[0]], creso=creso) - raise pytz.AmbiguousTimeError(stamp) + raise ValueError( + f"{stamp} is an ambiguous time and cannot be inferred." + ) # Find the index for the switch and pull from a for dst and b # for standard switch_idxs = (delta <= 0).nonzero()[0] if switch_idxs.size > 1: # see test_tz_localize_to_utc_ambiguous_infer - raise pytz.AmbiguousTimeError( + raise ValueError( f"There are {switch_idxs.size} dst switches when " "there should only be 1." ) diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd index e4ac3a9e167a3..f144275e0ee6a 100644 --- a/pandas/_libs/tslibs/util.pxd +++ b/pandas/_libs/tslibs/util.pxd @@ -1,6 +1,5 @@ from cpython.object cimport PyTypeObject -from cpython.unicode cimport PyUnicode_AsUTF8AndSize cdef extern from "Python.h": @@ -155,41 +154,11 @@ cdef inline bint is_nan(object val): return is_complex_object(val) and val != val -cdef inline const char* get_c_string_buf_and_size(str py_string, - Py_ssize_t *length) except NULL: - """ - Extract internal char* buffer of unicode or bytes object `py_string` with - getting length of this internal buffer saved in `length`. - - Notes - ----- - Python object owns memory, thus returned char* must not be freed. - `length` can be NULL if getting buffer length is not needed. - - Parameters - ---------- - py_string : str - length : Py_ssize_t* - - Returns - ------- - buf : const char* - """ - # Note PyUnicode_AsUTF8AndSize() can - # potentially allocate memory inside in unlikely case of when underlying - # unicode object was stored as non-utf8 and utf8 wasn't requested before. - return PyUnicode_AsUTF8AndSize(py_string, length) - - -cdef inline const char* get_c_string(str py_string) except NULL: - return get_c_string_buf_and_size(py_string, NULL) - - -cdef inline bytes string_encode_locale(str py_string) noexcept: +cdef inline bytes string_encode_locale(str py_string): """As opposed to PyUnicode_Encode, use current system locale to encode.""" return PyUnicode_EncodeLocale(py_string, NULL) -cdef inline object char_to_string_locale(const char* data) noexcept: +cdef inline object char_to_string_locale(const char* data): """As opposed to PyUnicode_FromString, use current system locale to decode.""" return PyUnicode_DecodeLocale(data, NULL) diff --git a/pandas/_libs/tslibs/vectorized.pyi b/pandas/_libs/tslibs/vectorized.pyi index de19f592da62b..f377c2e26ab81 100644 --- a/pandas/_libs/tslibs/vectorized.pyi +++ b/pandas/_libs/tslibs/vectorized.pyi @@ -1,7 +1,5 @@ -""" -For cython types that cannot be represented precisely, closest-available -python equivalents are used, and the precise types kept as adjacent comments. 
-""" +# For cython types that cannot be represented precisely, closest-available +# python equivalents are used, and the precise types kept as adjacent comments. from datetime import tzinfo import numpy as np diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index 0a19092f57706..1e09874639d4f 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -234,7 +234,7 @@ def get_resolution( for i in range(n): # Analogous to: utc_val = stamps[i] - utc_val = cnp.PyArray_GETITEM(stamps, cnp.PyArray_ITER_DATA(it)) + utc_val = (cnp.PyArray_ITER_DATA(it))[0] if utc_val == NPY_NAT: pass @@ -331,7 +331,7 @@ def is_date_array_normalized(ndarray stamps, tzinfo tz, NPY_DATETIMEUNIT reso) - for i in range(n): # Analogous to: utc_val = stamps[i] - utc_val = cnp.PyArray_GETITEM(stamps, cnp.PyArray_ITER_DATA(it)) + utc_val = (cnp.PyArray_ITER_DATA(it))[0] local_val = info.utc_val_to_local_val(utc_val, &pos) diff --git a/pandas/_libs/window/aggregations.pyi b/pandas/_libs/window/aggregations.pyi index a6cfbec9b15b9..99413751cd5c2 100644 --- a/pandas/_libs/window/aggregations.pyi +++ b/pandas/_libs/window/aggregations.pyi @@ -1,6 +1,6 @@ +from collections.abc import Callable from typing import ( Any, - Callable, Literal, ) @@ -60,6 +60,18 @@ def roll_min( end: np.ndarray, # np.ndarray[np.int64] minp: int, # int64_t ) -> np.ndarray: ... # np.ndarray[float] +def roll_first( + values: np.ndarray, # np.ndarray[np.float64] + start: np.ndarray, # np.ndarray[np.int64] + end: np.ndarray, # np.ndarray[np.int64] + minp: int, # int64_t +) -> np.ndarray: ... # np.ndarray[float] +def roll_last( + values: np.ndarray, # np.ndarray[np.float64] + start: np.ndarray, # np.ndarray[np.int64] + end: np.ndarray, # np.ndarray[np.int64] + minp: int, # int64_t +) -> np.ndarray: ... # np.ndarray[float] def roll_quantile( values: np.ndarray, # const float64_t[:] start: np.ndarray, # np.ndarray[np.int64] @@ -77,6 +89,12 @@ def roll_rank( method: WindowingRankType, ascending: bool, ) -> np.ndarray: ... # np.ndarray[float] +def roll_nunique( + values: np.ndarray, # const float64_t[:] + start: np.ndarray, # np.ndarray[np.int64] + end: np.ndarray, # np.ndarray[np.int64] + minp: int, # int64_t +) -> np.ndarray: ... 
# np.ndarray[float] def roll_apply( obj: object, start: np.ndarray, # np.ndarray[np.int64] diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 6365c030b695b..0c8ea28b60ce8 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -6,6 +6,8 @@ from libc.math cimport ( sqrt, ) from libcpp.deque cimport deque +from libcpp.stack cimport stack +from libcpp.unordered_map cimport unordered_map from pandas._libs.algos cimport TiebreakEnumType @@ -987,39 +989,29 @@ def roll_median_c(const float64_t[:] values, ndarray[int64_t] start, # ---------------------------------------------------------------------- -# Moving maximum / minimum code taken from Bottleneck -# Licence at LICENSES/BOTTLENECK_LICENCE - - -cdef float64_t init_mm(float64_t ai, Py_ssize_t *nobs, bint is_max) noexcept nogil: - - if ai == ai: - nobs[0] = nobs[0] + 1 - elif is_max: - ai = MINfloat64 - else: - ai = MAXfloat64 - - return ai - - -cdef void remove_mm(float64_t aold, Py_ssize_t *nobs) noexcept nogil: - """ remove a value from the mm calc """ - if aold == aold: - nobs[0] = nobs[0] - 1 - - -cdef float64_t calc_mm(int64_t minp, Py_ssize_t nobs, - float64_t value) noexcept nogil: - cdef: - float64_t result +cdef int64_t bisect_left( + deque[int64_t]& a, + int64_t x, + int64_t lo=0, + int64_t hi=-1 +) nogil: + """Same as https://docs.python.org/3/library/bisect.html.""" + + cdef int64_t mid + if hi == -1: + hi = a.size() + while lo < hi: + mid = (lo + hi) // 2 + if a.at(mid) < x: + lo = mid + 1 + else: + hi = mid + return lo - if nobs >= minp: - result = value - else: - result = NaN +from libc.math cimport isnan - return result +# Prior version of moving maximum / minimum code taken from Bottleneck +# Licence at LICENSES/BOTTLENECK_LICENCE def roll_max(ndarray[float64_t] values, ndarray[int64_t] start, @@ -1067,69 +1059,193 @@ def roll_min(ndarray[float64_t] values, ndarray[int64_t] start, return _roll_min_max(values, start, end, minp, is_max=0) -cdef _roll_min_max(ndarray[float64_t] values, - ndarray[int64_t] starti, - ndarray[int64_t] endi, - int64_t minp, - bint is_max): +def _roll_min_max( + ndarray[float64_t] values, + ndarray[int64_t] start, + ndarray[int64_t] end, + int64_t minp, + bint is_max +): cdef: - float64_t ai - int64_t curr_win_size, start - Py_ssize_t i, k, nobs = 0, N = len(starti) - deque Q[int64_t] # min/max always the front - deque W[int64_t] # track the whole window for nobs compute + Py_ssize_t i, i_next, k, valid_start, last_end, last_start, N = len(start) + # Indices of bounded extrema in `values`. `candidates[i]` is always increasing. + # `values[candidates[i]]` is decreasing for max and increasing for min. + deque candidates[int64_t] + # Indices of largest windows that "cover" preceding windows. + stack dominators[int64_t] ndarray[float64_t, ndim=1] output + Py_ssize_t this_start, this_end, stash_start + int64_t q_idx + output = np.empty(N, dtype=np.float64) - Q = deque[int64_t]() - W = deque[int64_t]() + candidates = deque[int64_t]() + dominators = stack[int64_t]() + + # This function was "ported" / translated from sliding_min_max() + # in /pandas/core/_numba/kernels/min_max_.py. + # (See there for credits and some comments.) 
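+    # In short: `candidates` holds indices that may still be a window extremum, +    # while `dominators` records later windows that start earlier than the +    # current one, so candidates those windows will still need are not evicted +    # prematurely.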
+ # Code translation assumptions/rules: + # - min_periods --> minp + # - deque[0] --> front() + # - deque[-1] --> back() + # - stack[-1] --> top() + # - bool(stack/deque) --> !empty() + # - deque.append() --> push_back() + # - stack.append() --> push() + # - deque.popleft --> pop_front() + # - deque.pop() --> pop_back() with nogil: + if minp < 1: + minp = 1 + + if N>2: + i_next = N - 1 + for i in range(N - 2, -1, -1): + if start[i_next] < start[i] \ + and ( + dominators.empty() + or start[dominators.top()] > start[i_next] + ): + dominators.push(i_next) + i_next = i + + # NaN tracking to guarantee minp + valid_start = -minp + + last_end = 0 + last_start = -1 - # This is using a modified version of the C++ code in this - # SO post: https://stackoverflow.com/a/12239580 - # The original impl didn't deal with variable window sizes - # So the code was optimized for that - - # first window's size - curr_win_size = endi[0] - starti[0] - # GH 32865 - # Anchor output index to values index to provide custom - # BaseIndexer support for i in range(N): + this_start = start[i] + this_end = end[i] - curr_win_size = endi[i] - starti[i] - if i == 0: - start = starti[i] - else: - start = endi[i - 1] - - for k in range(start, endi[i]): - ai = init_mm(values[k], &nobs, is_max) - # Discard previous entries if we find new min or max - if is_max: - while not Q.empty() and ((ai >= values[Q.back()]) or - values[Q.back()] != values[Q.back()]): - Q.pop_back() - else: - while not Q.empty() and ((ai <= values[Q.back()]) or - values[Q.back()] != values[Q.back()]): - Q.pop_back() - Q.push_back(k) - W.push_back(k) - - # Discard entries outside and left of current window - while not Q.empty() and Q.front() <= starti[i] - 1: - Q.pop_front() - while not W.empty() and W.front() <= starti[i] - 1: - remove_mm(values[W.front()], &nobs) - W.pop_front() - - # Save output based on index in input value array - if not Q.empty() and curr_win_size > 0: - output[i] = calc_mm(minp, nobs, values[Q.front()]) + if (not dominators.empty() and dominators.top() == i): + dominators.pop() + + if not (this_end > last_end + or (this_end == last_end and this_start >= last_start)): + raise ValueError( + "Start/End ordering requirement is violated at index {}".format(i)) + + if dominators.empty(): + stash_start = this_start else: + stash_start = min(this_start, start[dominators.top()]) + + while not candidates.empty() and candidates.front() < stash_start: + candidates.pop_front() + + for k in range(last_end, this_end): + if not isnan(values[k]): + valid_start += 1 + while valid_start >= 0 and isnan(values[valid_start]): + valid_start += 1 + + if is_max: + while (not candidates.empty() + and values[k] >= values[candidates.back()]): + candidates.pop_back() + else: + while (not candidates.empty() + and values[k] <= values[candidates.back()]): + candidates.pop_back() + candidates.push_back(k) + + if candidates.empty() or this_start > valid_start: output[i] = NaN + elif candidates.front() >= this_start: + # ^^ This is here to avoid costly bisection for fixed window sizes. 
+ output[i] = values[candidates.front()] + else: + q_idx = bisect_left(candidates, this_start, lo=1) + output[i] = values[candidates[q_idx]] + last_end = this_end + last_start = this_start + + return output + +# ---------------------------------------------------------------------- +# Rolling first, last + + +def roll_first(const float64_t[:] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp) -> np.ndarray: + return _roll_first_last(values, start, end, minp, is_first=1) + + +def roll_last(const float64_t[:] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp) -> np.ndarray: + return _roll_first_last(values, start, end, minp, is_first=0) + + +cdef _roll_first_last(const float64_t[:] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp, bint is_first): + cdef: + Py_ssize_t i, j, fl_idx + bint is_monotonic_increasing_bounds + int64_t nobs = 0, N = len(start), s, e + float64_t val, res + ndarray[float64_t] output + + is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds( + start, end + ) + + output = np.empty(N, dtype=np.float64) + + if (end - start).max() == 0: + output[:] = NaN + return output + + with nogil: + for i in range(0, N): + s = start[i] + e = end[i] + + if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]: + fl_idx = -1 + nobs = 0 + for j in range(s, e): + val = values[j] + if val == val: + if not is_first or fl_idx < s: + fl_idx = j + nobs += 1 + else: + # handle deletes + for j in range(start[i - 1], s): + val = values[j] + if val == val: + nobs -= 1 + + # update fl_idx if out of range, if first + if is_first and fl_idx < s: + fl_idx = -1 + for j in range(s, end[i - 1]): + val = values[j] + if val == val: + fl_idx = j + break + + # handle adds + for j in range(end[i - 1], e): + val = values[j] + if val == val: + if not is_first or fl_idx < s: + fl_idx = j + nobs += 1 + + if nobs >= minp and fl_idx >= s: + res = values[fl_idx] + else: + res = NaN + + output[i] = res + + if not is_monotonic_increasing_bounds: + nobs = 0 return output @@ -1238,8 +1354,8 @@ def roll_quantile(const float64_t[:] values, ndarray[int64_t] start, if interpolation_type == LINEAR: vlow = skiplist_get(skiplist, idx, &ret) vhigh = skiplist_get(skiplist, idx + 1, &ret) - output[i] = ((vlow + (vhigh - vlow) * - (idx_with_fraction - idx))) + output[i] = (vlow + (vhigh - vlow) * + (idx_with_fraction - idx)) elif interpolation_type == LOWER: output[i] = skiplist_get(skiplist, idx, &ret) elif interpolation_type == HIGHER: @@ -1387,6 +1503,66 @@ def roll_rank(const float64_t[:] values, ndarray[int64_t] start, return np.asarray(output) +def roll_nunique(const float64_t[:] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp) -> np.ndarray: + """ + Rolling number of unique elements in the window + """ + cdef: + Py_ssize_t i, j, s, e, N = len(start) + int64_t nobs = 0 + float64_t val + float64_t[::1] output + unordered_map[float64_t, int64_t] value_counts + + is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds( + start, end + ) + output = np.empty(N, dtype=np.float64) + value_counts = unordered_map[float64_t, int64_t]() + + with nogil: + for i in range(N): + s = start[i] + e = end[i] + + if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]: + if i != 0: + nobs = 0 + value_counts.clear() + + # setup + for j in range(s, e): + val = values[j] + if val == val: + nobs += 1 + value_counts[val] += 1 + + else: + # calculate deletes + for j in range(start[i - 1], s): + val = values[j] + if 
val == val: + value_counts[val] -= 1 + if value_counts[val] == 0: + value_counts.erase(val) + nobs -= 1 + + # calculate adds + for j in range(end[i - 1], e): + val = values[j] + if val == val: + nobs += 1 + value_counts[val] += 1 + + if nobs >= minp: + output[i] = value_counts.size() + else: + output[i] = NaN + + return np.asarray(output) + + def roll_apply(object obj, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp, @@ -1813,6 +1989,9 @@ def ewm(const float64_t[:] vals, const int64_t[:] start, const int64_t[:] end, if normalize: # avoid numerical errors on constant series if weighted != cur: + if not adjust and com == 1: + # update in case of irregular-interval series + new_wt = 1. - old_wt weighted = old_wt * weighted + new_wt * cur weighted /= (old_wt + new_wt) if adjust: diff --git a/pandas/_libs/window/meson.build b/pandas/_libs/window/meson.build index ad15644f73a0c..8c00a98b1241a 100644 --- a/pandas/_libs/window/meson.build +++ b/pandas/_libs/window/meson.build @@ -1,31 +1,31 @@ +cy_args = ['-X always_allow_keywords=true'] +# Use shared utility code to reduce wheel sizes +# copied from https://github.com/scikit-learn/scikit-learn/pull/31151/files +if cy.version().version_compare('>=3.1.0') + cy_args += ['--shared=pandas._libs._cyutility'] +endif + py.extension_module( 'aggregations', ['aggregations.pyx'], - cython_args: ['-X always_allow_keywords=true'], + cython_args: cy_args, include_directories: [inc_np, inc_pd], subdir: 'pandas/_libs/window', - override_options : ['cython_language=cpp'], - install: true + override_options: ['cython_language=cpp'], + install: true, ) py.extension_module( 'indexers', ['indexers.pyx'], - cython_args: ['-X always_allow_keywords=true'], + cython_args: cy_args, include_directories: [inc_np, inc_pd], subdir: 'pandas/_libs/window', - install: true + install: true, ) -sources_to_install = [ - '__init__.py', - 'aggregations.pyi', - 'indexers.pyi' -] +sources_to_install = ['__init__.py', 'aggregations.pyi', 'indexers.pyi'] -foreach source: sources_to_install - py.install_sources( - source, - subdir: 'pandas/_libs/window' - ) +foreach source : sources_to_install + py.install_sources(source, subdir: 'pandas/_libs/window') endforeach diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 672c16a85086c..ec9b5098c97c9 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -6,14 +6,12 @@ from sys import byteorder from typing import ( TYPE_CHECKING, - Callable, ContextManager, - cast, ) -import warnings import numpy as np +from pandas._config import using_string_dtype from pandas._config.localization import ( can_set_locale, get_locales, @@ -22,8 +20,6 @@ from pandas.compat import pa_version_under10p1 -from pandas.core.dtypes.common import is_string_dtype - import pandas as pd from pandas import ( ArrowDtype, @@ -34,7 +30,6 @@ Series, ) from pandas._testing._io import ( - round_trip_localpath, round_trip_pathlib, round_trip_pickle, write_to_compressed, @@ -59,7 +54,6 @@ assert_indexing_slices_equivalent, assert_interval_array_equal, assert_is_sorted, - assert_is_valid_plot_return_object, assert_metadata_equivalent, assert_numpy_array_equal, assert_period_array_equal, @@ -73,29 +67,28 @@ get_obj, ) from pandas._testing.contexts import ( - assert_cow_warning, decompress_file, ensure_clean, raises_chained_assignment_error, set_timezone, - use_numexpr, with_csv_dialect, ) from pandas.core.arrays import ( + ArrowExtensionArray, BaseMaskedArray, - ExtensionArray, NumpyExtensionArray, ) from 
pandas.core.arrays._mixins import NDArrayBackedExtensionArray from pandas.core.construction import extract_array if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import ( Dtype, NpDtype, ) - from pandas.core.arrays import ArrowExtensionArray UNSIGNED_INT_NUMPY_DTYPES: list[NpDtype] = ["uint8", "uint16", "uint32", "uint64"] UNSIGNED_INT_EA_DTYPES: list[Dtype] = ["UInt8", "UInt16", "UInt32", "UInt64"] @@ -110,7 +103,11 @@ ALL_FLOAT_DTYPES: list[Dtype] = [*FLOAT_NUMPY_DTYPES, *FLOAT_EA_DTYPES] COMPLEX_DTYPES: list[Dtype] = [complex, "complex64", "complex128"] -STRING_DTYPES: list[Dtype] = [str, "str", "U"] +if using_string_dtype(): + STRING_DTYPES: list[Dtype] = ["U"] +else: + STRING_DTYPES: list[Dtype] = [str, "str", "U"] # type: ignore[no-redef] +COMPLEX_FLOAT_DTYPES: list[Dtype] = [*COMPLEX_DTYPES, *FLOAT_NUMPY_DTYPES] DATETIME64_DTYPES: list[Dtype] = ["datetime64[ns]", "M8[ns]"] TIMEDELTA64_DTYPES: list[Dtype] = ["timedelta64[ns]", "m8[ns]"] @@ -236,11 +233,18 @@ + TIMEDELTA_PYARROW_DTYPES + BOOL_PYARROW_DTYPES ) + ALL_REAL_PYARROW_DTYPES_STR_REPR = ( + ALL_INT_PYARROW_DTYPES_STR_REPR + FLOAT_PYARROW_DTYPES_STR_REPR + ) else: FLOAT_PYARROW_DTYPES_STR_REPR = [] ALL_INT_PYARROW_DTYPES_STR_REPR = [] ALL_PYARROW_DTYPES = [] + ALL_REAL_PYARROW_DTYPES_STR_REPR = [] +ALL_REAL_NULLABLE_DTYPES = ( + FLOAT_NUMPY_DTYPES + ALL_REAL_EXTENSION_DTYPES + ALL_REAL_PYARROW_DTYPES_STR_REPR +) arithmetic_dunder_methods = [ "__add__", @@ -286,17 +290,11 @@ def box_expected(expected, box_cls, transpose: bool = True): else: expected = pd.array(expected, copy=False) elif box_cls is Index: - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "Dtype inference", category=FutureWarning) - expected = Index(expected) + expected = Index(expected) elif box_cls is Series: - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "Dtype inference", category=FutureWarning) - expected = Series(expected) + expected = Series(expected) elif box_cls is DataFrame: - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "Dtype inference", category=FutureWarning) - expected = Series(expected).to_frame() + expected = Series(expected).to_frame() if transpose: # for vector operations, we need a DataFrame to be a single-row, # not a single-column, in order to operate against non-DataFrame @@ -394,9 +392,6 @@ def external_error_raised(expected_exception: type[Exception]) -> ContextManager return pytest.raises(expected_exception, match=None) -cython_table = pd.core.common._cython_table.items() - - def get_cython_table_params(ndframe, func_names_and_expected): """ Combine frame, functions from com._cython_table @@ -417,11 +412,6 @@ def get_cython_table_params(ndframe, func_names_and_expected): results = [] for func_name, expected in func_names_and_expected: results.append((ndframe, func_name, expected)) - results += [ - (ndframe, func, expected) - for func, name in cython_table - if name == func_name - ] return results @@ -483,7 +473,7 @@ def iat(x): _UNITS = ["s", "ms", "us", "ns"] -def get_finest_unit(left: str, right: str): +def get_finest_unit(left: str, right: str) -> str: """ Find the higher of two datetime64 units. 
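+ +    Examples +    -------- +    >>> get_finest_unit("s", "ms")  # doctest: +SKIP +    'ms'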
""" @@ -507,6 +497,8 @@ def shares_memory(left, right) -> bool: if isinstance(left, MultiIndex): return shares_memory(left._codes, right) if isinstance(left, (Index, Series)): + if isinstance(right, (Index, Series)): + return shares_memory(left._values, right._values) return shares_memory(left._values, right) if isinstance(left, NDArrayBackedExtensionArray): @@ -516,24 +508,18 @@ def shares_memory(left, right) -> bool: if isinstance(left, pd.core.arrays.IntervalArray): return shares_memory(left._left, right) or shares_memory(left._right, right) - if ( - isinstance(left, ExtensionArray) - and is_string_dtype(left.dtype) - and left.dtype.storage in ("pyarrow", "pyarrow_numpy") # type: ignore[attr-defined] - ): - # https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669 - left = cast("ArrowExtensionArray", left) - if ( - isinstance(right, ExtensionArray) - and is_string_dtype(right.dtype) - and right.dtype.storage in ("pyarrow", "pyarrow_numpy") # type: ignore[attr-defined] - ): - right = cast("ArrowExtensionArray", right) + if isinstance(left, ArrowExtensionArray): + if isinstance(right, ArrowExtensionArray): + # https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669 left_pa_data = left._pa_array right_pa_data = right._pa_array left_buf1 = left_pa_data.chunk(0).buffers()[1] right_buf1 = right_pa_data.chunk(0).buffers()[1] - return left_buf1 == right_buf1 + return left_buf1.address == right_buf1.address + else: + # if we have one one ArrowExtensionArray and one other array, assume + # they can only share memory if they share the same numpy buffer + return np.shares_memory(left, right) if isinstance(left, BaseMaskedArray) and isinstance(right, BaseMaskedArray): # By convention, we'll say these share memory if they share *either* @@ -542,8 +528,8 @@ def shares_memory(left, right) -> bool: left._mask, right._mask ) - if isinstance(left, DataFrame) and len(left._mgr.arrays) == 1: - arr = left._mgr.arrays[0] + if isinstance(left, DataFrame) and len(left._mgr.blocks) == 1: + arr = left._mgr.blocks[0].values return shares_memory(arr, right) raise NotImplementedError(type(left), type(right)) @@ -554,6 +540,25 @@ def shares_memory(left, right) -> bool: "ALL_INT_NUMPY_DTYPES", "ALL_NUMPY_DTYPES", "ALL_REAL_NUMPY_DTYPES", + "BOOL_DTYPES", + "BYTES_DTYPES", + "COMPLEX_DTYPES", + "DATETIME64_DTYPES", + "ENDIAN", + "FLOAT_EA_DTYPES", + "FLOAT_NUMPY_DTYPES", + "NARROW_NP_DTYPES", + "NP_NAT_OBJECTS", + "NULL_OBJECTS", + "OBJECT_DTYPES", + "SIGNED_INT_EA_DTYPES", + "SIGNED_INT_NUMPY_DTYPES", + "STRING_DTYPES", + "TIMEDELTA64_DTYPES", + "UNSIGNED_INT_EA_DTYPES", + "UNSIGNED_INT_NUMPY_DTYPES", + "SubclassedDataFrame", + "SubclassedSeries", "assert_almost_equal", "assert_attr_equal", "assert_categorical_equal", @@ -569,7 +574,6 @@ def shares_memory(left, right) -> bool: "assert_indexing_slices_equivalent", "assert_interval_array_equal", "assert_is_sorted", - "assert_is_valid_plot_return_object", "assert_metadata_equivalent", "assert_numpy_array_equal", "assert_period_array_equal", @@ -577,55 +581,33 @@ def shares_memory(left, right) -> bool: "assert_series_equal", "assert_sp_array_equal", "assert_timedelta_array_equal", - "assert_cow_warning", "at", - "BOOL_DTYPES", "box_expected", - "BYTES_DTYPES", "can_set_locale", - "COMPLEX_DTYPES", "convert_rows_list_to_csv_str", - "DATETIME64_DTYPES", "decompress_file", - "ENDIAN", "ensure_clean", "external_error_raised", - "FLOAT_EA_DTYPES", - "FLOAT_NUMPY_DTYPES", "get_cython_table_params", "get_dtype", - "getitem", - "get_locales", 
"get_finest_unit", + "get_locales", "get_obj", "get_op_from_name", + "getitem", "iat", "iloc", "loc", "maybe_produces_warning", - "NARROW_NP_DTYPES", - "NP_NAT_OBJECTS", - "NULL_OBJECTS", - "OBJECT_DTYPES", "raise_assert_detail", "raises_chained_assignment_error", - "round_trip_localpath", "round_trip_pathlib", "round_trip_pickle", - "setitem", "set_locale", "set_timezone", + "setitem", "shares_memory", - "SIGNED_INT_EA_DTYPES", - "SIGNED_INT_NUMPY_DTYPES", - "STRING_DTYPES", - "SubclassedDataFrame", - "SubclassedSeries", - "TIMEDELTA64_DTYPES", "to_array", - "UNSIGNED_INT_EA_DTYPES", - "UNSIGNED_INT_NUMPY_DTYPES", - "use_numexpr", "with_csv_dialect", "write_to_compressed", ] diff --git a/pandas/_testing/_hypothesis.py b/pandas/_testing/_hypothesis.py index 084ca9c306d19..bbad21d8ab8d1 100644 --- a/pandas/_testing/_hypothesis.py +++ b/pandas/_testing/_hypothesis.py @@ -1,11 +1,11 @@ """ Hypothesis data generator helpers. """ + from datetime import datetime from hypothesis import strategies as st from hypothesis.extra.dateutil import timezones as dateutil_timezones -from hypothesis.extra.pytz import timezones as pytz_timezones from pandas.compat import is_platform_windows @@ -54,13 +54,9 @@ DATETIME_NO_TZ = st.datetimes() DATETIME_JAN_1_1900_OPTIONAL_TZ = st.datetimes( - min_value=pd.Timestamp( - 1900, 1, 1 - ).to_pydatetime(), # pyright: ignore[reportGeneralTypeIssues] - max_value=pd.Timestamp( - 1900, 1, 1 - ).to_pydatetime(), # pyright: ignore[reportGeneralTypeIssues] - timezones=st.one_of(st.none(), dateutil_timezones(), pytz_timezones()), + min_value=pd.Timestamp(1900, 1, 1).to_pydatetime(), # pyright: ignore[reportArgumentType] + max_value=pd.Timestamp(1900, 1, 1).to_pydatetime(), # pyright: ignore[reportArgumentType] + timezones=st.one_of(st.none(), dateutil_timezones(), st.timezones()), ) DATETIME_IN_PD_TIMESTAMP_RANGE_NO_TZ = st.datetimes( diff --git a/pandas/_testing/_io.py b/pandas/_testing/_io.py index 95977edb600ad..e1841c95dcdfe 100644 --- a/pandas/_testing/_io.py +++ b/pandas/_testing/_io.py @@ -7,21 +7,18 @@ from typing import ( TYPE_CHECKING, Any, - Callable, ) import uuid import zipfile -from pandas.compat import ( - get_bz2_file, - get_lzma_file, -) from pandas.compat._optional import import_optional_dependency import pandas as pd from pandas._testing.contexts import ensure_clean if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import ( FilePath, ReadPickleBuffer, @@ -89,35 +86,6 @@ def round_trip_pathlib(writer, reader, path: str | None = None): return obj -def round_trip_localpath(writer, reader, path: str | None = None): - """ - Write an object to file specified by a py.path LocalPath and read it back. - - Parameters - ---------- - writer : callable bound to pandas object - IO writing function (e.g. DataFrame.to_csv ) - reader : callable - IO reading function (e.g. pd.read_csv ) - path : str, default None - The path where the object is written and then read. - - Returns - ------- - pandas object - The original object that was serialized and then re-read. - """ - import pytest - - LocalPath = pytest.importorskip("py.path").local - if path is None: - path = "___localpath___" - with ensure_clean(path) as path: - writer(LocalPath(path)) - obj = reader(LocalPath(path)) - return obj - - def write_to_compressed(compression, path, data, dest: str = "test") -> None: """ Write data to a compressed file. 
@@ -158,11 +126,15 @@ def write_to_compressed(compression, path, data, dest: str = "test") -> None: elif compression == "gzip": compress_method = gzip.GzipFile elif compression == "bz2": - compress_method = get_bz2_file() + import bz2 + + compress_method = bz2.BZ2File elif compression == "zstd": compress_method = import_optional_dependency("zstandard").open elif compression == "xz": - compress_method = get_lzma_file() + import lzma + + compress_method = lzma.LZMAFile else: raise ValueError(f"Unrecognized compression type: {compression}") diff --git a/pandas/_testing/_warnings.py b/pandas/_testing/_warnings.py index f11dc11f6ac0d..a752c8db90f38 100644 --- a/pandas/_testing/_warnings.py +++ b/pandas/_testing/_warnings.py @@ -1,6 +1,7 @@ from __future__ import annotations from contextlib import ( + AbstractContextManager, contextmanager, nullcontext, ) @@ -10,6 +11,7 @@ from typing import ( TYPE_CHECKING, Literal, + Union, cast, ) import warnings @@ -31,8 +33,9 @@ def assert_produces_warning( ] = "always", check_stacklevel: bool = True, raise_on_extra_warnings: bool = True, - match: str | None = None, -) -> Generator[list[warnings.WarningMessage], None, None]: + match: str | tuple[str | None, ...] | None = None, + must_find_all_warnings: bool = True, +) -> Generator[list[warnings.WarningMessage]]: """ Context manager for running code expected to either raise a specific warning, multiple specific warnings, or not raise any warnings. Verifies that the code @@ -67,18 +70,23 @@ class for all warnings. To raise multiple types of exceptions, raise_on_extra_warnings : bool, default True Whether extra warnings not of the type `expected_warning` should cause the test to fail. - match : str, optional - Match warning message. + match : {str, tuple[str, ...]}, optional + Match warning message. If it's a tuple, it must have the same length + as `expected_warning`. If additionally `must_find_all_warnings` is + True, each expected warning's message is matched against the + corresponding pattern. Otherwise, the patterns are treated as + alternatives. + must_find_all_warnings : bool, default True + If True and `expected_warning` is a tuple, each expected warning + type must be encountered. Otherwise, encountering any one of the + expected warnings counts as success. Examples -------- >>> import warnings >>> with assert_produces_warning(): ... warnings.warn(UserWarning()) - ... >>> with assert_produces_warning(False): ... warnings.warn(RuntimeWarning()) - ... Traceback (most recent call last): ... AssertionError: Caused unexpected warning(s): ['RuntimeWarning']. @@ -98,13 +106,35 @@ class for all warnings. 
To raise multiple types of exceptions, yield w finally: if expected_warning: - expected_warning = cast(type[Warning], expected_warning) - _assert_caught_expected_warning( - caught_warnings=w, - expected_warning=expected_warning, - match=match, - check_stacklevel=check_stacklevel, - ) + if isinstance(expected_warning, tuple) and must_find_all_warnings: + match = ( + match + if isinstance(match, tuple) + else (match,) * len(expected_warning) + ) + for warning_type, warning_match in zip(expected_warning, match): + _assert_caught_expected_warnings( + caught_warnings=w, + expected_warning=warning_type, + match=warning_match, + check_stacklevel=check_stacklevel, + ) + else: + expected_warning = cast( + Union[type[Warning], tuple[type[Warning], ...]], + expected_warning, + ) + match = ( + "|".join(m for m in match if m) + if isinstance(match, tuple) + else match + ) + _assert_caught_expected_warnings( + caught_warnings=w, + expected_warning=expected_warning, + match=match, + check_stacklevel=check_stacklevel, + ) if raise_on_extra_warnings: _assert_caught_no_extra_warnings( caught_warnings=w, @@ -112,7 +142,9 @@ class for all warnings. To raise multiple types of exceptions, ) -def maybe_produces_warning(warning: type[Warning], condition: bool, **kwargs): +def maybe_produces_warning( + warning: type[Warning], condition: bool, **kwargs +) -> AbstractContextManager: """ Return a context manager that possibly checks a warning based on the condition """ @@ -122,10 +154,10 @@ def maybe_produces_warning(warning: type[Warning], condition: bool, **kwargs): return nullcontext() -def _assert_caught_expected_warning( +def _assert_caught_expected_warnings( *, caught_warnings: Sequence[warnings.WarningMessage], - expected_warning: type[Warning], + expected_warning: type[Warning] | tuple[type[Warning], ...], match: str | None, check_stacklevel: bool, ) -> None: @@ -133,6 +165,11 @@ def _assert_caught_expected_warning( saw_warning = False matched_message = False unmatched_messages = [] + warning_name = ( + tuple(x.__name__ for x in expected_warning) + if isinstance(expected_warning, tuple) + else expected_warning.__name__ + ) for actual_warning in caught_warnings: if issubclass(actual_warning.category, expected_warning): @@ -148,14 +185,11 @@ def _assert_caught_expected_warning( unmatched_messages.append(actual_warning.message) if not saw_warning: - raise AssertionError( - f"Did not see expected warning of class " - f"{repr(expected_warning.__name__)}" - ) + raise AssertionError(f"Did not see expected warning of class {warning_name!r}") if match and not matched_message: raise AssertionError( - f"Did not see warning {repr(expected_warning.__name__)} " + f"Did not see warning {warning_name!r} " f"matching '{match}'. The emitted warning messages are " f"{unmatched_messages}" ) @@ -197,7 +231,7 @@ def _assert_caught_no_extra_warnings( ) if extra_warnings: - raise AssertionError(f"Caused unexpected warning(s): {repr(extra_warnings)}") + raise AssertionError(f"Caused unexpected warning(s): {extra_warnings!r}") def _is_unexpected_warning( @@ -218,7 +252,12 @@ def _assert_raised_with_correct_stacklevel( frame = inspect.currentframe() for _ in range(4): frame = frame.f_back # type: ignore[union-attr] - caller_filename = inspect.getfile(frame) # type: ignore[arg-type] + try: + caller_filename = inspect.getfile(frame) # type: ignore[arg-type] + finally: + # See note in + # https://docs.python.org/3/library/inspect.html#inspect.Traceback + del frame msg = ( "Warning not set with correct stacklevel. 
" f"File where warning is raised: {actual_warning.filename} != " diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index e342f76dc724b..daa5187cdb636 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -4,11 +4,13 @@ from typing import ( TYPE_CHECKING, Literal, + NoReturn, cast, ) import numpy as np +from pandas._libs import lib from pandas._libs.missing import is_matching_na from pandas._libs.sparse import SparseIndex import pandas._libs.testing as _testing @@ -143,7 +145,7 @@ def assert_almost_equal( ) -def _check_isinstance(left, right, cls): +def _check_isinstance(left, right, cls) -> None: """ Helper method for our assert_* methods that ensures that the two objects being compared have the right type before @@ -186,7 +188,7 @@ def assert_index_equal( check_order: bool = True, rtol: float = 1.0e-5, atol: float = 1.0e-8, - obj: str = "Index", + obj: str | None = None, ) -> None: """ Check that left and right Index are equal. @@ -194,7 +196,9 @@ def assert_index_equal( Parameters ---------- left : Index + The first index to compare. right : Index + The second index to compare. exact : bool or {'equiv'}, default 'equiv' Whether to check the Index class, dtype and inferred_type are identical. If 'equiv', then RangeIndex can be substituted for @@ -213,10 +217,15 @@ def assert_index_equal( Relative tolerance. Only used when check_exact is False. atol : float, default 1e-8 Absolute tolerance. Only used when check_exact is False. - obj : str, default 'Index' + obj : str, default 'Index' or 'MultiIndex' Specify object name being compared, internally used to show appropriate assertion message. + See Also + -------- + testing.assert_series_equal : Check that two Series are equal. + testing.assert_frame_equal : Check that two DataFrames are equal. 
+ Examples -------- >>> from pandas import testing as tm @@ -226,6 +235,9 @@ def assert_index_equal( """ __tracebackhide__ = True + if obj is None: + obj = "MultiIndex" if isinstance(left, MultiIndex) else "Index" + def _check_types(left, right, obj: str = "Index") -> None: if not exact: return @@ -274,7 +286,7 @@ def _check_types(left, right, obj: str = "Index") -> None: right = cast(MultiIndex, right) for level in range(left.nlevels): - lobj = f"MultiIndex level [{level}]" + lobj = f"{obj} level [{level}]" try: # try comparison on levels/codes to avoid densifying MultiIndex assert_index_equal( @@ -305,7 +317,7 @@ def _check_types(left, right, obj: str = "Index") -> None: obj=lobj, ) # get_level_values may change dtype - _check_types(left.levels[level], right.levels[level], obj=obj) + _check_types(left.levels[level], right.levels[level], obj=lobj) # skip exact index checking when `check_categorical` is False elif check_exact and check_categorical: @@ -420,28 +432,6 @@ def assert_attr_equal(attr: str, left, right, obj: str = "Attributes") -> None: return None -def assert_is_valid_plot_return_object(objs) -> None: - from matplotlib.artist import Artist - from matplotlib.axes import Axes - - if isinstance(objs, (Series, np.ndarray)): - if isinstance(objs, Series): - objs = objs._values - for el in objs.ravel(): - msg = ( - "one of 'objs' is not a matplotlib Axes instance, " - f"type encountered {repr(type(el).__name__)}" - ) - assert isinstance(el, (Axes, dict)), msg - else: - msg = ( - "objs is neither an ndarray of Artist instances nor a single " - "ArtistArtist instance, tuple, or dict, 'objs' is a " - f"{repr(type(objs).__name__)}" - ) - assert isinstance(objs, (Artist, tuple, dict)), msg - - def assert_is_sorted(seq) -> None: """Assert that the sequence is sorted.""" if isinstance(seq, (Index, Series)): @@ -540,7 +530,7 @@ def assert_interval_array_equal( kwargs["check_freq"] = False assert_equal(left._left, right._left, obj=f"{obj}.left", **kwargs) - assert_equal(left._right, right._right, obj=f"{obj}.left", **kwargs) + assert_equal(left._right, right._right, obj=f"{obj}.right", **kwargs) assert_attr_equal("closed", left, right, obj=obj) @@ -576,7 +566,7 @@ def assert_timedelta_array_equal( def raise_assert_detail( obj, message, left, right, diff=None, first_diff=None, index_values=None -): +) -> NoReturn: __tracebackhide__ = True msg = f"""{obj} are different @@ -591,13 +581,19 @@ def raise_assert_detail( if isinstance(left, np.ndarray): left = pprint_thing(left) - elif isinstance(left, (CategoricalDtype, NumpyEADtype, StringDtype)): + elif isinstance(left, (CategoricalDtype, NumpyEADtype)): left = repr(left) + elif isinstance(left, StringDtype): + # TODO(infer_string) this special case could be avoided if we have + # a more informative repr https://github.com/pandas-dev/pandas/issues/59342 + left = f"StringDtype(storage={left.storage}, na_value={left.na_value})" if isinstance(right, np.ndarray): right = pprint_thing(right) - elif isinstance(right, (CategoricalDtype, NumpyEADtype, StringDtype)): + elif isinstance(right, (CategoricalDtype, NumpyEADtype)): right = repr(right) + elif isinstance(right, StringDtype): + right = f"StringDtype(storage={right.storage}, na_value={right.na_value})" msg += f""" [left]: {left} @@ -659,12 +655,12 @@ def _get_base(obj): if check_same == "same": if left_base is not right_base: - raise AssertionError(f"{repr(left_base)} is not {repr(right_base)}") + raise AssertionError(f"{left_base!r} is not {right_base!r}") elif check_same == "copy": if left_base is 
right_base: - raise AssertionError(f"{repr(left_base)} is {repr(right_base)}") + raise AssertionError(f"{left_base!r} is {right_base!r}") - def _raise(left, right, err_msg): + def _raise(left, right, err_msg) -> NoReturn: if err_msg is None: if left.shape != right.shape: raise_assert_detail( @@ -697,14 +693,18 @@ def assert_extension_array_equal( right, check_dtype: bool | Literal["equiv"] = True, index_values=None, - check_exact: bool = False, - rtol: float = 1.0e-5, - atol: float = 1.0e-8, + check_exact: bool | lib.NoDefault = lib.no_default, + rtol: float | lib.NoDefault = lib.no_default, + atol: float | lib.NoDefault = lib.no_default, obj: str = "ExtensionArray", ) -> None: """ Check that left and right ExtensionArrays are equal. + This method compares two ``ExtensionArray`` instances for equality, + including checks for missing values, the dtype of the arrays, and + the exactness of the comparison (or tolerance when comparing floats). + Parameters ---------- left, right : ExtensionArray @@ -714,7 +714,12 @@ def assert_extension_array_equal( index_values : Index | numpy.ndarray, default None Optional index (shared by both left and right), used in output. check_exact : bool, default False - Whether to compare number exactly. Only takes effect for float dtypes. + Whether to compare number exactly. + + .. versionchanged:: 2.2.0 + + Defaults to True for integer dtypes if none of + ``check_exact``, ``rtol`` and ``atol`` are specified. rtol : float, default 1e-5 Relative tolerance. Only used when check_exact is False. atol : float, default 1e-8 @@ -725,6 +730,12 @@ def assert_extension_array_equal( .. versionadded:: 2.0.0 + See Also + -------- + testing.assert_series_equal : Check that left and right ``Series`` are equal. + testing.assert_frame_equal : Check that left and right ``DataFrame`` are equal. + testing.assert_index_equal : Check that left and right ``Index`` are equal. + Notes ----- Missing values are checked separately from valid values. 
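The hunk that follows moves the exact-versus-approximate decision out of the comparison branch and into default resolution: when none of check_exact, rtol and atol are passed, integer dtypes are compared exactly while float dtypes keep the tolerance path. A minimal sketch of the resulting behavior, assuming nullable extension dtypes and a perturbation below the default rtol of 1e-5:

import pandas as pd
import pandas._testing as tm

# Integer extension arrays are now compared exactly by default.
tm.assert_extension_array_equal(
    pd.array([1, 2, 3], dtype="Int64"),
    pd.array([1, 2, 3], dtype="Int64"),
)

# Float extension arrays still compare approximately by default, so a
# difference well inside atol + rtol * |b| passes.
tm.assert_extension_array_equal(
    pd.array([1.0, 2.0], dtype="Float64"),
    pd.array([1.0, 2.0 + 1e-9], dtype="Float64"),
)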
@@ -738,6 +749,20 @@ def assert_extension_array_equal( >>> b, c = a.array, a.array >>> tm.assert_extension_array_equal(b, c) """ + if ( + check_exact is lib.no_default + and rtol is lib.no_default + and atol is lib.no_default + ): + check_exact = ( + is_numeric_dtype(left.dtype) and not is_float_dtype(left.dtype) + ) or (is_numeric_dtype(right.dtype) and not is_float_dtype(right.dtype)) + elif check_exact is lib.no_default: + check_exact = False + + rtol = rtol if rtol is not lib.no_default else 1.0e-5 + atol = atol if atol is not lib.no_default else 1.0e-8 + assert isinstance(left, ExtensionArray), "left is not an ExtensionArray" assert isinstance(right, ExtensionArray), "right is not an ExtensionArray" if check_dtype: @@ -781,12 +806,27 @@ def assert_extension_array_equal( left_na, right_na, obj=f"{obj} NA mask", index_values=index_values ) + # Specifically for StringArrayNumpySemantics, validate here we have a valid array + if ( + isinstance(left.dtype, StringDtype) + and left.dtype.storage == "python" + and left.dtype.na_value is np.nan + ): + assert np.all( + [np.isnan(val) for val in left._ndarray[left_na]] # type: ignore[attr-defined] + ), "wrong missing value sentinels" + if ( + isinstance(right.dtype, StringDtype) + and right.dtype.storage == "python" + and right.dtype.na_value is np.nan + ): + assert np.all( + [np.isnan(val) for val in right._ndarray[right_na]] # type: ignore[attr-defined] + ), "wrong missing value sentinels" + left_valid = left[~left_na].to_numpy(dtype=object) right_valid = right[~right_na].to_numpy(dtype=object) - if check_exact or ( - (is_numeric_dtype(left.dtype) and not is_float_dtype(left.dtype)) - or (is_numeric_dtype(right.dtype) and not is_float_dtype(right.dtype)) - ): + if check_exact: assert_numpy_array_equal( left_valid, right_valid, obj=obj, index_values=index_values ) @@ -810,14 +850,14 @@ def assert_series_equal( check_index_type: bool | Literal["equiv"] = "equiv", check_series_type: bool = True, check_names: bool = True, - check_exact: bool = False, + check_exact: bool | lib.NoDefault = lib.no_default, check_datetimelike_compat: bool = False, check_categorical: bool = True, check_category_order: bool = True, check_freq: bool = True, check_flags: bool = True, - rtol: float = 1.0e-5, - atol: float = 1.0e-8, + rtol: float | lib.NoDefault = lib.no_default, + atol: float | lib.NoDefault = lib.no_default, obj: str = "Series", *, check_index: bool = True, @@ -829,7 +869,9 @@ def assert_series_equal( Parameters ---------- left : Series + First Series to compare. right : Series + Second Series to compare. check_dtype : bool, default True Whether to check the Series dtype is identical. check_index_type : bool or {'equiv'}, default 'equiv' @@ -840,7 +882,19 @@ check_names : bool, default True Whether to check the Series and Index names attribute. check_exact : bool, default False - Whether to compare number exactly. Only takes effect for float dtypes. + Whether to compare number exactly. This also applies when checking + Index equivalence. + + .. versionchanged:: 2.2.0 + + Defaults to True for integer dtypes if none of + ``check_exact``, ``rtol`` and ``atol`` are specified. + + .. versionchanged:: 3.0.0 + + Whether check_exact defaults to True for the Index comparison is + determined by checking if the Index is of integer dtype. + check_datetimelike_compat : bool, default False Compare datetime-like which is comparable ignoring dtype. check_categorical : bool, default True @@ -868,6 +922,11 @@ .. 
versionadded:: 1.5.0 + See Also + -------- + testing.assert_index_equal : Check that two Indexes are equal. + testing.assert_frame_equal : Check that two DataFrames are equal. + Examples -------- >>> from pandas import testing as tm @@ -876,6 +935,31 @@ def assert_series_equal( >>> tm.assert_series_equal(a, b) """ __tracebackhide__ = True + if ( + check_exact is lib.no_default + and rtol is lib.no_default + and atol is lib.no_default + ): + check_exact = ( + is_numeric_dtype(left.dtype) and not is_float_dtype(left.dtype) + ) or (is_numeric_dtype(right.dtype) and not is_float_dtype(right.dtype)) + left_index_dtypes = ( + [left.index.dtype] if left.index.nlevels == 1 else left.index.dtypes + ) + right_index_dtypes = ( + [right.index.dtype] if right.index.nlevels == 1 else right.index.dtypes + ) + check_exact_index = all( + dtype.kind in "iu" for dtype in left_index_dtypes + ) or all(dtype.kind in "iu" for dtype in right_index_dtypes) + elif check_exact is lib.no_default: + check_exact = False + check_exact_index = False + else: + check_exact_index = check_exact + + rtol = rtol if rtol is not lib.no_default else 1.0e-5 + atol = atol if atol is not lib.no_default else 1.0e-8 if not check_index and check_like: raise ValueError("check_like must be False if check_index is False") @@ -893,7 +977,7 @@ def assert_series_equal( raise_assert_detail(obj, "Series length are different", msg1, msg2) if check_flags: - assert left.flags == right.flags, f"{repr(left.flags)} != {repr(right.flags)}" + assert left.flags == right.flags, f"{left.flags!r} != {right.flags!r}" if check_index: # GH #38183 @@ -902,7 +986,7 @@ def assert_series_equal( right.index, exact=check_index_type, check_names=check_names, - check_exact=check_exact, + check_exact=check_exact_index, check_categorical=check_categorical, check_order=not check_like, rtol=rtol, @@ -930,10 +1014,7 @@ def assert_series_equal( pass else: assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}") - if check_exact or ( - (is_numeric_dtype(left.dtype) and not is_float_dtype(left.dtype)) - or (is_numeric_dtype(right.dtype) and not is_float_dtype(right.dtype)) - ): + if check_exact: left_values = left._values right_values = right._values # Only check exact if dtype is numeric @@ -948,9 +1029,15 @@ def assert_series_equal( obj=str(obj), ) else: + # convert both to NumPy if not, check_dtype would raise earlier + lv, rv = left_values, right_values + if isinstance(left_values, ExtensionArray): + lv = left_values.to_numpy() + if isinstance(right_values, ExtensionArray): + rv = right_values.to_numpy() assert_numpy_array_equal( - left_values, - right_values, + lv, + rv, check_dtype=check_dtype, obj=str(obj), index_values=left.index, @@ -1054,14 +1141,14 @@ def assert_frame_equal( check_frame_type: bool = True, check_names: bool = True, by_blocks: bool = False, - check_exact: bool = False, + check_exact: bool | lib.NoDefault = lib.no_default, check_datetimelike_compat: bool = False, check_categorical: bool = True, check_like: bool = False, check_freq: bool = True, check_flags: bool = True, - rtol: float = 1.0e-5, - atol: float = 1.0e-8, + rtol: float | lib.NoDefault = lib.no_default, + atol: float | lib.NoDefault = lib.no_default, obj: str = "DataFrame", ) -> None: """ @@ -1096,7 +1183,15 @@ def assert_frame_equal( Specify how to compare internal data. If False, compare by columns. If True, compare by blocks. check_exact : bool, default False - Whether to compare number exactly. Only takes effect for float dtypes. + Whether to compare number exactly. 
Only takes effect for float dtypes. + Whether to compare number exactly. 
If False, the comparison uses the + relative tolerance (``rtol``) and absolute tolerance (``atol``) + parameters to determine if two values are considered close, + according to the formula: ``|a - b| <= (atol + rtol * |b|)``. + + .. versionchanged:: 2.2.0 + + Defaults to True for integer dtypes if none of + ``check_exact``, ``rtol`` and ``atol`` are specified. check_datetimelike_compat : bool, default False Compare datetime-like which is comparable ignoring dtype. check_categorical : bool, default True @@ -1128,8 +1223,8 @@ def assert_frame_equal( but with columns of differing dtypes. >>> from pandas.testing import assert_frame_equal - >>> df1 = pd.DataFrame({'a': [1, 2], 'b': [3, 4]}) - >>> df2 = pd.DataFrame({'a': [1, 2], 'b': [3.0, 4.0]}) + >>> df1 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + >>> df2 = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]}) df1 equals itself. @@ -1151,6 +1246,9 @@ def assert_frame_equal( >>> assert_frame_equal(df1, df2, check_dtype=False) """ __tracebackhide__ = True + _rtol = rtol if rtol is not lib.no_default else 1.0e-5 + _atol = atol if atol is not lib.no_default else 1.0e-8 + _check_exact = check_exact if check_exact is not lib.no_default else False # instance validation _check_isinstance(left, right, DataFrame) @@ -1162,11 +1260,11 @@ def assert_frame_equal( # shape comparison if left.shape != right.shape: raise_assert_detail( - obj, f"{obj} shape mismatch", f"{repr(left.shape)}", f"{repr(right.shape)}" + obj, f"{obj} shape mismatch", f"{left.shape!r}", f"{right.shape!r}" ) if check_flags: - assert left.flags == right.flags, f"{repr(left.flags)} != {repr(right.flags)}" + assert left.flags == right.flags, f"{left.flags!r} != {right.flags!r}" # index comparison assert_index_equal( @@ -1174,11 +1272,11 @@ def assert_frame_equal( right.index, exact=check_index_type, check_names=check_names, - check_exact=check_exact, + check_exact=_check_exact, check_categorical=check_categorical, check_order=not check_like, - rtol=rtol, - atol=atol, + rtol=_rtol, + atol=_atol, obj=f"{obj}.index", ) @@ -1188,11 +1286,11 @@ def assert_frame_equal( right.columns, exact=check_column_type, check_names=check_names, - check_exact=check_exact, + check_exact=_check_exact, check_categorical=check_categorical, check_order=not check_like, - rtol=rtol, - atol=atol, + rtol=_rtol, + atol=_atol, obj=f"{obj}.columns", ) @@ -1316,7 +1414,7 @@ def assert_sp_array_equal(left, right) -> None: def assert_contains_all(iterable, dic) -> None: for k in iterable: - assert k in dic, f"Did not contain item: {repr(k)}" + assert k in dic, f"Did not contain item: {k!r}" def assert_copy(iter1, iter2, **eql_kwargs) -> None: @@ -1331,7 +1429,7 @@ def assert_copy(iter1, iter2, **eql_kwargs) -> None: for elem1, elem2 in zip(iter1, iter2): assert_almost_equal(elem1, elem2, **eql_kwargs) msg = ( - f"Expected object {repr(type(elem1))} and object {repr(type(elem2))} to be " + f"Expected object {type(elem1)!r} and object {type(elem2)!r} to be " "different objects, but they were the same object." 
) assert elem1 is not elem2, msg diff --git a/pandas/_testing/compat.py b/pandas/_testing/compat.py index cc352ba7b8f2f..722ba61a3227f 100644 --- a/pandas/_testing/compat.py +++ b/pandas/_testing/compat.py @@ -1,6 +1,7 @@ """ Helpers for sharing tests between DataFrame/Series """ + from __future__ import annotations from typing import TYPE_CHECKING diff --git a/pandas/_testing/contexts.py b/pandas/_testing/contexts.py index eb6e4a917889a..da147c117ad43 100644 --- a/pandas/_testing/contexts.py +++ b/pandas/_testing/contexts.py @@ -3,6 +3,7 @@ from contextlib import contextmanager import os from pathlib import Path +import sys import tempfile from typing import ( IO, @@ -11,13 +12,9 @@ ) import uuid -from pandas._config import using_copy_on_write - from pandas.compat import PYPY from pandas.errors import ChainedAssignmentError -from pandas import set_option - from pandas.io.common import get_handle if TYPE_CHECKING: @@ -33,7 +30,7 @@ @contextmanager def decompress_file( path: FilePath | BaseBuffer, compression: CompressionOptions -) -> Generator[IO[bytes], None, None]: +) -> Generator[IO[bytes]]: """ Open a compressed file and return a file object. @@ -54,7 +51,7 @@ def decompress_file( @contextmanager -def set_timezone(tz: str) -> Generator[None, None, None]: +def set_timezone(tz: str) -> Generator[None]: """ Context manager for temporarily setting a timezone. @@ -70,22 +67,24 @@ def set_timezone(tz: str) -> Generator[None, None, None]: >>> tzlocal().tzname(datetime(2021, 1, 1)) # doctest: +SKIP 'IST' - >>> with set_timezone('US/Eastern'): + >>> with set_timezone("US/Eastern"): ... tzlocal().tzname(datetime(2021, 1, 1)) - ... 'EST' """ import time def setTZ(tz) -> None: - if tz is None: - try: - del os.environ["TZ"] - except KeyError: - pass - else: - os.environ["TZ"] = tz - time.tzset() + if hasattr(time, "tzset"): + if tz is None: + try: + del os.environ["TZ"] + except KeyError: + pass + else: + os.environ["TZ"] = tz + # Next line allows typing checks to pass on Windows + if sys.platform != "win32": + time.tzset() orig_tz = os.environ.get("TZ") setTZ(tz) @@ -96,9 +95,7 @@ def setTZ(tz) -> None: @contextmanager -def ensure_clean( - filename=None, return_filelike: bool = False, **kwargs: Any -) -> Generator[Any, None, None]: +def ensure_clean(filename=None) -> Generator[Any]: """ Gets a temporary path and agrees to remove on close. @@ -110,12 +107,6 @@ def ensure_clean( ---------- filename : str (optional) suffix of the created file. - return_filelike : bool (default False) - if True, returns a file-like which is *always* cleaned. Necessary for - savefig and other functions which want to append extensions. - **kwargs - Additional keywords are passed to open(). - """ folder = Path(tempfile.gettempdir()) @@ -126,25 +117,17 @@ def ensure_clean( path.touch() - handle_or_str: str | IO = str(path) - encoding = kwargs.pop("encoding", None) - if return_filelike: - kwargs.setdefault("mode", "w+b") - if encoding is None and "b" not in kwargs["mode"]: - encoding = "utf-8" - handle_or_str = open(path, encoding=encoding, **kwargs) + handle_or_str = str(path) try: yield handle_or_str finally: - if not isinstance(handle_or_str, str): - handle_or_str.close() if path.is_file(): path.unlink() @contextmanager -def with_csv_dialect(name: str, **kwargs) -> Generator[None, None, None]: +def with_csv_dialect(name: str, **kwargs) -> Generator[None]: """ Context manager to temporarily register a CSV dialect for parsing CSV. 
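As a usage note for the docstring that opens above, a minimal sketch of the with_csv_dialect registration round trip; the dialect name "pipes" and the sample data are illustrative only:

import csv
from io import StringIO

import pandas as pd
import pandas._testing as tm

data = "a|b\n1|2\n3|4"
with tm.with_csv_dialect("pipes", delimiter="|"):
    # read_csv resolves the dialect name through the csv registry
    df = pd.read_csv(StringIO(data), dialect="pipes")
assert "pipes" not in csv.list_dialects()  # unregistered on exit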
@@ -177,81 +160,28 @@ def with_csv_dialect(name: str, **kwargs) -> Generator[None, None, None]: csv.unregister_dialect(name) -@contextmanager -def use_numexpr(use, min_elements=None) -> Generator[None, None, None]: - from pandas.core.computation import expressions as expr - - if min_elements is None: - min_elements = expr._MIN_ELEMENTS - - olduse = expr.USE_NUMEXPR - oldmin = expr._MIN_ELEMENTS - set_option("compute.use_numexpr", use) - expr._MIN_ELEMENTS = min_elements - try: - yield - finally: - expr._MIN_ELEMENTS = oldmin - set_option("compute.use_numexpr", olduse) - - -def raises_chained_assignment_error(warn=True, extra_warnings=(), extra_match=()): +def raises_chained_assignment_error(extra_warnings=(), extra_match=()): from pandas._testing import assert_produces_warning - if not warn: - from contextlib import nullcontext - - return nullcontext() + if PYPY: + if not extra_warnings: + from contextlib import nullcontext - if PYPY and not extra_warnings: - from contextlib import nullcontext - - return nullcontext() - elif PYPY and extra_warnings: - return assert_produces_warning( - extra_warnings, - match="|".join(extra_match), - ) - else: - if using_copy_on_write(): - warning = ChainedAssignmentError - match = ( - "A value is trying to be set on a copy of a DataFrame or Series " - "through chained assignment" - ) + return nullcontext() else: - warning = FutureWarning # type: ignore[assignment] - # TODO update match - match = "ChainedAssignmentError" + return assert_produces_warning( + extra_warnings, + match=extra_match, + ) + else: + warning = ChainedAssignmentError + match = ( + "A value is trying to be set on a copy of a DataFrame or Series " + "through chained assignment" + ) if extra_warnings: warning = (warning, *extra_warnings) # type: ignore[assignment] return assert_produces_warning( warning, - match="|".join((match, *extra_match)), + match=(match, *extra_match), ) - - -def assert_cow_warning(warn=True, match=None, **kwargs): - """ - Assert that a warning is raised in the CoW warning mode. - - Parameters - ---------- - warn : bool, default True - By default, check that a warning is raised. Can be turned off by passing False. - match : str - The warning message to match against, if different from the default. 
- kwargs - Passed through to assert_produces_warning - """ - from pandas._testing import assert_produces_warning - - if not warn: - from contextlib import nullcontext - - return nullcontext() - - if not match: - match = "Setting a value on a view" - - return assert_produces_warning(FutureWarning, match=match, **kwargs) diff --git a/pandas/_typing.py b/pandas/_typing.py index 3df9a47a35fca..889252bb00438 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -1,6 +1,8 @@ from __future__ import annotations +from builtins import type as type_t # pyright: ignore[reportUnusedImport] from collections.abc import ( + Callable, Hashable, Iterator, Mapping, @@ -18,24 +20,24 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, - Optional, Protocol, - Type as type_t, + TypeAlias, TypeVar, Union, overload, ) import numpy as np +import numpy.typing as npt # To prevent import cycles place any internal imports in the branch below # and use a string literal forward reference to it in subsequent types # https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles -if TYPE_CHECKING: - import numpy.typing as npt +# Note that Union is needed when a Union includes a pandas type + +if TYPE_CHECKING: from pandas._libs import ( NaTType, Period, @@ -46,7 +48,12 @@ from pandas.core.dtypes.dtypes import ExtensionDtype - from pandas import Interval + from pandas import ( + DatetimeIndex, + Interval, + PeriodIndex, + TimedeltaIndex, + ) from pandas.arrays import ( DatetimeArray, TimedeltaArray, @@ -61,9 +68,7 @@ ) from pandas.core.indexes.base import Index from pandas.core.internals import ( - ArrayManager, BlockManager, - SingleArrayManager, SingleBlockManager, ) from pandas.core.resample import Resampler @@ -73,44 +78,46 @@ from pandas.io.formats.format import EngFormatter from pandas.tseries.holiday import AbstractHolidayCalendar - ScalarLike_co = Union[ - int, - float, - complex, - str, - bytes, - np.generic, - ] + ScalarLike_co: TypeAlias = int | float | complex | str | bytes | np.generic # numpy compatible types - NumpyValueArrayLike = Union[ScalarLike_co, npt.ArrayLike] + NumpyValueArrayLike: TypeAlias = ScalarLike_co | npt.ArrayLike # Name "npt._ArrayLikeInt_co" is not defined [name-defined] - NumpySorter = Optional[npt._ArrayLikeInt_co] # type: ignore[name-defined] + NumpySorter: TypeAlias = npt._ArrayLikeInt_co | None # type: ignore[name-defined] - from typing import SupportsIndex + from typing import ( + ParamSpec, + SupportsIndex, + ) + from typing import Concatenate # pyright: ignore[reportUnusedImport] + from typing import TypeGuard # pyright: ignore[reportUnusedImport] - if sys.version_info >= (3, 10): - from typing import TypeGuard # pyright: ignore[reportUnusedImport] - else: - from typing_extensions import TypeGuard # pyright: ignore[reportUnusedImport] + P = ParamSpec("P") if sys.version_info >= (3, 11): from typing import Self # pyright: ignore[reportUnusedImport] + from typing import Unpack # pyright: ignore[reportUnusedImport] else: from typing_extensions import Self # pyright: ignore[reportUnusedImport] + from typing_extensions import Unpack # pyright: ignore[reportUnusedImport] + else: - npt: Any = None + ParamSpec: Any = None Self: Any = None TypeGuard: Any = None + Concatenate: Any = None + Unpack: Any = None HashableT = TypeVar("HashableT", bound=Hashable) +HashableT2 = TypeVar("HashableT2", bound=Hashable) MutableMappingT = TypeVar("MutableMappingT", bound=MutableMapping) # array-like -ArrayLike = Union["ExtensionArray", np.ndarray] -AnyArrayLike = Union[ArrayLike, 
"Index", "Series"] -TimeArrayLike = Union["DatetimeArray", "TimedeltaArray"] +ArrayLike: TypeAlias = Union["ExtensionArray", np.ndarray] +ArrayLikeT = TypeVar("ArrayLikeT", "ExtensionArray", np.ndarray) +AnyArrayLike: TypeAlias = Union[ArrayLike, "Index", "Series"] +TimeArrayLike: TypeAlias = Union["DatetimeArray", "TimedeltaArray"] # list-like @@ -121,57 +128,49 @@ class SequenceNotStr(Protocol[_T_co]): @overload - def __getitem__(self, index: SupportsIndex, /) -> _T_co: - ... + def __getitem__(self, index: SupportsIndex, /) -> _T_co: ... @overload - def __getitem__(self, index: slice, /) -> Sequence[_T_co]: - ... + def __getitem__(self, index: slice, /) -> Sequence[_T_co]: ... - def __contains__(self, value: object, /) -> bool: - ... + def __contains__(self, value: object, /) -> bool: ... - def __len__(self) -> int: - ... + def __len__(self) -> int: ... - def __iter__(self) -> Iterator[_T_co]: - ... + def __iter__(self) -> Iterator[_T_co]: ... - def index(self, value: Any, /, start: int = 0, stop: int = ...) -> int: - ... + def index(self, value: Any, start: int = ..., stop: int = ..., /) -> int: ... - def count(self, value: Any, /) -> int: - ... + def count(self, value: Any, /) -> int: ... - def __reversed__(self) -> Iterator[_T_co]: - ... + def __reversed__(self) -> Iterator[_T_co]: ... -ListLike = Union[AnyArrayLike, SequenceNotStr, range] +ListLike: TypeAlias = AnyArrayLike | SequenceNotStr | range # scalars -PythonScalar = Union[str, float, bool] -DatetimeLikeScalar = Union["Period", "Timestamp", "Timedelta"] -PandasScalar = Union["Period", "Timestamp", "Timedelta", "Interval"] -Scalar = Union[PythonScalar, PandasScalar, np.datetime64, np.timedelta64, date] -IntStrT = TypeVar("IntStrT", bound=Union[int, str]) - +PythonScalar: TypeAlias = str | float | bool +DatetimeLikeScalar: TypeAlias = Union["Period", "Timestamp", "Timedelta"] +PandasScalar: TypeAlias = Union["Period", "Timestamp", "Timedelta", "Interval"] +Scalar: TypeAlias = PythonScalar | PandasScalar | np.datetime64 | np.timedelta64 | date +IntStrT = TypeVar("IntStrT", bound=int | str) # timestamp and timedelta convertible types -TimestampConvertibleTypes = Union[ +TimestampConvertibleTypes: TypeAlias = Union[ "Timestamp", date, np.datetime64, np.int64, float, str ] -TimestampNonexistent = Union[ - Literal["shift_forward", "shift_backward", "NaT", "raise"], timedelta -] -TimedeltaConvertibleTypes = Union[ +TimestampNonexistent: TypeAlias = ( + Literal["shift_forward", "shift_backward", "NaT", "raise"] | timedelta +) + +TimedeltaConvertibleTypes: TypeAlias = Union[ "Timedelta", timedelta, np.timedelta64, np.int64, float, str ] -Timezone = Union[str, tzinfo] +Timezone: TypeAlias = str | tzinfo -ToTimestampHow = Literal["s", "e", "start", "end"] +ToTimestampHow: TypeAlias = Literal["s", "e", "start", "end"] # NDFrameT is stricter and ensures that the same subclass of NDFrame always is # used. E.g. `def func(a: NDFrameT) -> NDFrameT: ...` means that if a @@ -179,70 +178,70 @@ def __reversed__(self) -> Iterator[_T_co]: # passed in, a DataFrame is always returned. NDFrameT = TypeVar("NDFrameT", bound="NDFrame") +IndexT = TypeVar("IndexT", bound="Index") +FreqIndexT = TypeVar("FreqIndexT", "DatetimeIndex", "PeriodIndex", "TimedeltaIndex") NumpyIndexT = TypeVar("NumpyIndexT", np.ndarray, "Index") -AxisInt = int -Axis = Union[AxisInt, Literal["index", "columns", "rows"]] -IndexLabel = Union[Hashable, Sequence[Hashable]] -Level = Hashable -Shape = tuple[int, ...] 
-Suffixes = tuple[Optional[str], Optional[str]] -Ordered = Optional[bool] -JSONSerializable = Optional[Union[PythonScalar, list, dict]] -Frequency = Union[str, "BaseOffset"] -Axes = ListLike - -RandomState = Union[ - int, - np.ndarray, - np.random.Generator, - np.random.BitGenerator, - np.random.RandomState, -] +AxisInt: TypeAlias = int +Axis: TypeAlias = AxisInt | Literal["index", "columns", "rows"] +IndexLabel: TypeAlias = Hashable | Sequence[Hashable] +Level: TypeAlias = Hashable +Shape: TypeAlias = tuple[int, ...] +Suffixes: TypeAlias = Sequence[str | None] +Ordered: TypeAlias = bool | None +JSONSerializable: TypeAlias = PythonScalar | list | dict | None +Frequency: TypeAlias = Union[str, "BaseOffset"] +Axes: TypeAlias = ListLike + +RandomState: TypeAlias = ( + int + | np.ndarray + | np.random.Generator + | np.random.BitGenerator + | np.random.RandomState +) + # dtypes -NpDtype = Union[str, np.dtype, type_t[Union[str, complex, bool, object]]] -Dtype = Union["ExtensionDtype", NpDtype] -AstypeArg = Union["ExtensionDtype", "npt.DTypeLike"] +NpDtype: TypeAlias = str | np.dtype | type[str | complex | bool | object] +Dtype: TypeAlias = Union["ExtensionDtype", NpDtype] +AstypeArg: TypeAlias = Union["ExtensionDtype", npt.DTypeLike] # DtypeArg specifies all allowable dtypes in a functions its dtype argument -DtypeArg = Union[Dtype, dict[Hashable, Dtype]] -DtypeObj = Union[np.dtype, "ExtensionDtype"] +DtypeArg: TypeAlias = Dtype | Mapping[Hashable, Dtype] +DtypeObj: TypeAlias = Union[np.dtype, "ExtensionDtype"] # converters -ConvertersArg = dict[Hashable, Callable[[Dtype], Dtype]] +ConvertersArg: TypeAlias = dict[Hashable, Callable[[Dtype], Dtype]] # parse_dates -ParseDatesArg = Union[ - bool, list[Hashable], list[list[Hashable]], dict[Hashable, list[Hashable]] -] +ParseDatesArg: TypeAlias = ( + bool | list[Hashable] | list[list[Hashable]] | dict[Hashable, list[Hashable]] +) # For functions like rename that convert one label to another -Renamer = Union[Mapping[Any, Hashable], Callable[[Any], Hashable]] +Renamer: TypeAlias = Mapping[Any, Hashable] | Callable[[Any], Hashable] # to maintain type information across generic functions and parametrization T = TypeVar("T") # used in decorators to preserve the signature of the function it decorates # see https://mypy.readthedocs.io/en/stable/generics.html#declaring-decorators -FuncType = Callable[..., Any] +FuncType: TypeAlias = Callable[..., Any] F = TypeVar("F", bound=FuncType) +TypeT = TypeVar("TypeT", bound=type) # types of vectorized key functions for DataFrame::sort_values and # DataFrame::sort_index, among others -ValueKeyFunc = Optional[Callable[["Series"], Union["Series", AnyArrayLike]]] -IndexKeyFunc = Optional[Callable[["Index"], Union["Index", AnyArrayLike]]] +ValueKeyFunc: TypeAlias = Callable[["Series"], Union["Series", AnyArrayLike]] | None +IndexKeyFunc: TypeAlias = Callable[["Index"], Union["Index", AnyArrayLike]] | None # types of `func` kwarg for DataFrame.aggregate and Series.aggregate -AggFuncTypeBase = Union[Callable, str] -AggFuncTypeDict = MutableMapping[ - Hashable, Union[AggFuncTypeBase, list[AggFuncTypeBase]] -] -AggFuncType = Union[ - AggFuncTypeBase, - list[AggFuncTypeBase], - AggFuncTypeDict, +AggFuncTypeBase: TypeAlias = Callable | str +AggFuncTypeDict: TypeAlias = MutableMapping[ + Hashable, AggFuncTypeBase | list[AggFuncTypeBase] ] -AggObjType = Union[ +AggFuncType: TypeAlias = AggFuncTypeBase | list[AggFuncTypeBase] | AggFuncTypeDict +AggObjType: TypeAlias = Union[ "Series", "DataFrame", "GroupBy", @@ -252,7 +251,7 @@ 
def __reversed__(self) -> Iterator[_T_co]: "Resampler", ] -PythonFuncType = Callable[[Any], Any] +PythonFuncType: TypeAlias = Callable[[Any], Any] # filenames and file-like-objects AnyStr_co = TypeVar("AnyStr_co", str, bytes, covariant=True) @@ -265,7 +264,7 @@ def mode(self) -> str: # for _get_filepath_or_buffer ... - def seek(self, __offset: int, __whence: int = ...) -> int: + def seek(self, offset: int, whence: int = ..., /) -> int: # with one argument: gzip.GzipFile, bz2.BZ2File # with two arguments: zip.ZipFile, read_sas ... @@ -280,13 +279,13 @@ def tell(self) -> int: class ReadBuffer(BaseBuffer, Protocol[AnyStr_co]): - def read(self, __n: int = ...) -> AnyStr_co: + def read(self, n: int = ..., /) -> AnyStr_co: # for BytesIOWrapper, gzip.GzipFile, bz2.BZ2File ... class WriteBuffer(BaseBuffer, Protocol[AnyStr_contra]): - def write(self, __b: AnyStr_contra) -> Any: + def write(self, b: AnyStr_contra, /) -> Any: # for gzip.GzipFile, bz2.BZ2File ... @@ -296,13 +295,11 @@ def flush(self) -> Any: class ReadPickleBuffer(ReadBuffer[bytes], Protocol): - def readline(self) -> bytes: - ... + def readline(self) -> bytes: ... class WriteExcelBuffer(WriteBuffer[bytes], Protocol): - def truncate(self, size: int | None = ...) -> int: - ... + def truncate(self, size: int | None = ..., /) -> int: ... class ReadCsvBuffer(ReadBuffer[AnyStr_co], Protocol): @@ -320,35 +317,34 @@ def readline(self) -> AnyStr_co: @property def closed(self) -> bool: - # for enine=pyarrow + # for engine=pyarrow ... -FilePath = Union[str, "PathLike[str]"] +FilePath: TypeAlias = str | PathLike[str] # for arbitrary kwargs passed during reading/writing files -StorageOptions = Optional[dict[str, Any]] - +StorageOptions: TypeAlias = dict[str, Any] | None # compression keywords and compression -CompressionDict = dict[str, Any] -CompressionOptions = Optional[ - Union[Literal["infer", "gzip", "bz2", "zip", "xz", "zstd", "tar"], CompressionDict] -] +CompressionDict: TypeAlias = dict[str, Any] +CompressionOptions: TypeAlias = ( + Literal["infer", "gzip", "bz2", "zip", "xz", "zstd", "tar"] | CompressionDict | None +) # types in DataFrameFormatter -FormattersType = Union[ - list[Callable], tuple[Callable, ...], Mapping[Union[str, int], Callable] -] -ColspaceType = Mapping[Hashable, Union[str, int]] -FloatFormatType = Union[str, Callable, "EngFormatter"] -ColspaceArgType = Union[ - str, int, Sequence[Union[str, int]], Mapping[Hashable, Union[str, int]] -] +FormattersType: TypeAlias = ( + list[Callable] | tuple[Callable, ...] | Mapping[str | int, Callable] +) +ColspaceType: TypeAlias = Mapping[Hashable, str | int] +FloatFormatType: TypeAlias = Union[str, Callable, "EngFormatter"] +ColspaceArgType: TypeAlias = ( + str | int | Sequence[str | int] | Mapping[Hashable, str | int] +) # Arguments for fillna() -FillnaOptions = Literal["backfill", "bfill", "ffill", "pad"] -InterpolateOptions = Literal[ +FillnaOptions: TypeAlias = Literal["backfill", "bfill", "ffill", "pad"] +InterpolateOptions: TypeAlias = Literal[ "linear", "time", "index", @@ -370,11 +366,7 @@ def closed(self) -> bool: ] # internals -Manager = Union[ - "ArrayManager", "SingleArrayManager", "BlockManager", "SingleBlockManager" -] -SingleManager = Union["SingleArrayManager", "SingleBlockManager"] -Manager2D = Union["ArrayManager", "BlockManager"] +Manager: TypeAlias = Union["BlockManager", "SingleBlockManager"] # indexing # PositionalIndexer -> valid 1D positional indexer, e.g. 
can pass @@ -387,61 +379,62 @@ def closed(self) -> bool: # https://github.com/python/typing/issues/684#issuecomment-548203158 # https://bugs.python.org/issue41810 # Using List[int] here rather than Sequence[int] to disallow tuples. -ScalarIndexer = Union[int, np.integer] -SequenceIndexer = Union[slice, list[int], np.ndarray] -PositionalIndexer = Union[ScalarIndexer, SequenceIndexer] -PositionalIndexerTuple = tuple[PositionalIndexer, PositionalIndexer] -PositionalIndexer2D = Union[PositionalIndexer, PositionalIndexerTuple] -if TYPE_CHECKING: - TakeIndexer = Union[Sequence[int], Sequence[np.integer], npt.NDArray[np.integer]] -else: - TakeIndexer = Any +ScalarIndexer: TypeAlias = int | np.integer +SequenceIndexer: TypeAlias = slice | list[int] | np.ndarray +PositionalIndexer: TypeAlias = ScalarIndexer | SequenceIndexer +PositionalIndexerTuple: TypeAlias = tuple[PositionalIndexer, PositionalIndexer] +PositionalIndexer2D: TypeAlias = PositionalIndexer | PositionalIndexerTuple +TakeIndexer: TypeAlias = Sequence[int] | Sequence[np.integer] | npt.NDArray[np.integer] # Shared by functions such as drop and astype -IgnoreRaise = Literal["ignore", "raise"] +IgnoreRaise: TypeAlias = Literal["ignore", "raise"] # Windowing rank methods -WindowingRankType = Literal["average", "min", "max"] +WindowingRankType: TypeAlias = Literal["average", "min", "max"] # read_csv engines -CSVEngine = Literal["c", "python", "pyarrow", "python-fwf"] +CSVEngine: TypeAlias = Literal["c", "python", "pyarrow", "python-fwf"] # read_json engines -JSONEngine = Literal["ujson", "pyarrow"] +JSONEngine: TypeAlias = Literal["ujson", "pyarrow"] # read_xml parsers -XMLParsers = Literal["lxml", "etree"] +XMLParsers: TypeAlias = Literal["lxml", "etree"] # read_html flavors -HTMLFlavors = Literal["lxml", "html5lib", "bs4"] +HTMLFlavors: TypeAlias = Literal["lxml", "html5lib", "bs4"] # Interval closed type -IntervalLeftRight = Literal["left", "right"] -IntervalClosedType = Union[IntervalLeftRight, Literal["both", "neither"]] +IntervalLeftRight: TypeAlias = Literal["left", "right"] +IntervalClosedType: TypeAlias = IntervalLeftRight | Literal["both", "neither"] # datetime and NaTType -DatetimeNaTType = Union[datetime, "NaTType"] -DateTimeErrorChoices = Union[IgnoreRaise, Literal["coerce"]] +DatetimeNaTType: TypeAlias = Union[datetime, "NaTType"] +DateTimeErrorChoices: TypeAlias = Literal["raise", "coerce"] # sort_index -SortKind = Literal["quicksort", "mergesort", "heapsort", "stable"] -NaPosition = Literal["first", "last"] +SortKind: TypeAlias = Literal["quicksort", "mergesort", "heapsort", "stable"] +NaPosition: TypeAlias = Literal["first", "last"] -# Arguments for nsmalles and n_largest -NsmallestNlargestKeep = Literal["first", "last", "all"] +# Arguments for nsmallest and nlargest +NsmallestNlargestKeep: TypeAlias = Literal["first", "last", "all"] # quantile interpolation -QuantileInterpolation = Literal["linear", "lower", "higher", "midpoint", "nearest"] +QuantileInterpolation: TypeAlias = Literal[ + "linear", "lower", "higher", "midpoint", "nearest" +] # plotting -PlottingOrientation = Literal["horizontal", "vertical"] +PlottingOrientation: TypeAlias = Literal["horizontal", "vertical"] # dropna -AnyAll = Literal["any", "all"] +AnyAll: TypeAlias = Literal["any", "all"] # merge -MergeHow = Literal["left", "right", "inner", "outer", "cross"] -MergeValidate = Literal[ +MergeHow: TypeAlias = Literal[ + "left", "right", "inner", "outer", "cross", "left_anti", "right_anti" +] +MergeValidate: TypeAlias = Literal[ "one_to_one", "1:1", 
"one_to_many", @@ -453,8 +446,8 @@ def closed(self) -> bool: ] # join -JoinHow = Literal["left", "right", "inner", "outer"] -JoinValidate = Literal[ +JoinHow: TypeAlias = Literal["left", "right", "inner", "outer"] +JoinValidate: TypeAlias = Literal[ "one_to_one", "1:1", "one_to_many", @@ -466,25 +459,28 @@ def closed(self) -> bool: ] # reindex -ReindexMethod = Union[FillnaOptions, Literal["nearest"]] +ReindexMethod: TypeAlias = FillnaOptions | Literal["nearest"] -MatplotlibColor = Union[str, Sequence[float]] -TimeGrouperOrigin = Union[ +MatplotlibColor: TypeAlias = str | Sequence[float] +TimeGrouperOrigin: TypeAlias = Union[ "Timestamp", Literal["epoch", "start", "start_day", "end", "end_day"] ] -TimeAmbiguous = Union[Literal["infer", "NaT", "raise"], "npt.NDArray[np.bool_]"] -TimeNonexistent = Union[ - Literal["shift_forward", "shift_backward", "NaT", "raise"], timedelta -] -DropKeep = Literal["first", "last", False] -CorrelationMethod = Union[ - Literal["pearson", "kendall", "spearman"], Callable[[np.ndarray, np.ndarray], float] -] -AlignJoin = Literal["outer", "inner", "left", "right"] -DtypeBackend = Literal["pyarrow", "numpy_nullable"] +TimeAmbiguous: TypeAlias = Literal["infer", "NaT", "raise"] | npt.NDArray[np.bool_] +TimeNonexistent: TypeAlias = ( + Literal["shift_forward", "shift_backward", "NaT", "raise"] | timedelta +) + +DropKeep: TypeAlias = Literal["first", "last", False] +CorrelationMethod: TypeAlias = ( + Literal["pearson", "kendall", "spearman"] + | Callable[[np.ndarray, np.ndarray], float] +) + +AlignJoin: TypeAlias = Literal["outer", "inner", "left", "right"] +DtypeBackend: TypeAlias = Literal["pyarrow", "numpy_nullable"] -TimeUnit = Literal["s", "ms", "us", "ns"] -OpenFileErrors = Literal[ +TimeUnit: TypeAlias = Literal["s", "ms", "us", "ns"] +OpenFileErrors: TypeAlias = Literal[ "strict", "ignore", "replace", @@ -495,31 +491,32 @@ def closed(self) -> bool: ] # update -UpdateJoin = Literal["left"] +UpdateJoin: TypeAlias = Literal["left"] # applymap -NaAction = Literal["ignore"] +NaAction: TypeAlias = Literal["ignore"] # from_dict -FromDictOrient = Literal["columns", "index", "tight"] - -# to_gbc -ToGbqIfexist = Literal["fail", "replace", "append"] +FromDictOrient: TypeAlias = Literal["columns", "index", "tight"] # to_stata -ToStataByteorder = Literal[">", "<", "little", "big"] +ToStataByteorder: TypeAlias = Literal[">", "<", "little", "big"] # ExcelWriter -ExcelWriterIfSheetExists = Literal["error", "new", "replace", "overlay"] +ExcelWriterIfSheetExists: TypeAlias = Literal["error", "new", "replace", "overlay"] +ExcelWriterMergeCells: TypeAlias = bool | Literal["columns"] # Offsets -OffsetCalendar = Union[np.busdaycalendar, "AbstractHolidayCalendar"] +OffsetCalendar: TypeAlias = Union[np.busdaycalendar, "AbstractHolidayCalendar"] # read_csv: usecols -UsecolsArgType = Union[ - SequenceNotStr[Hashable], - range, - AnyArrayLike, - Callable[[HashableT], bool], - None, -] +UsecolsArgType: TypeAlias = ( + SequenceNotStr[Hashable] | range | AnyArrayLike | Callable[[HashableT], bool] | None +) + +# maintain the sub-type of any hashable sequence +SequenceT = TypeVar("SequenceT", bound=Sequence[Hashable]) + +SliceType: TypeAlias = Hashable | None + +__all__ = ["type_t"] diff --git a/pandas/_version.py b/pandas/_version.py index 5d610b5e1ea7e..c5e3c16d3f773 100644 --- a/pandas/_version.py +++ b/pandas/_version.py @@ -1,5 +1,5 @@ # This file helps to compute a version number in source trees obtained from -# git-archive tarball (such as those provided by githubs download-from-tag +# 
git-archive tarball (such as those provided by github's download-from-tag # feature). Distribution tarballs (built by setup.py sdist) and build # directories (produced by setup.py build) will contain a much shorter file # that just contains the computed version number. @@ -10,13 +10,13 @@ """Git implementation of _version.py.""" +from collections.abc import Callable import errno import functools import os import re import subprocess import sys -from typing import Callable def get_keywords(): @@ -141,7 +141,7 @@ def versions_from_parentdir(parentdir_prefix, root, verbose): if verbose: print( - f"Tried directories {str(rootdirs)} \ + f"Tried directories {rootdirs!s} \ but none started with prefix {parentdir_prefix}" ) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") @@ -358,9 +358,9 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): if verbose: fmt = "tag '%s' doesn't start with prefix '%s'" print(fmt % (full_tag, tag_prefix)) - pieces[ - "error" - ] = f"tag '{full_tag}' doesn't start with prefix '{tag_prefix}'" + pieces["error"] = ( + f"tag '{full_tag}' doesn't start with prefix '{tag_prefix}'" + ) return pieces pieces["closest-tag"] = full_tag[len(tag_prefix) :] @@ -386,7 +386,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): return pieces -def plus_or_dot(pieces): +def plus_or_dot(pieces) -> str: """Return a + if we don't already have one, else return a .""" if "+" in pieces.get("closest-tag", ""): return "." diff --git a/pandas/api/__init__.py b/pandas/api/__init__.py index a0d42b6541fdf..a016e67a41360 100644 --- a/pandas/api/__init__.py +++ b/pandas/api/__init__.py @@ -1,5 +1,7 @@ -""" public toolkit API """ +"""public toolkit API""" + from pandas.api import ( + executors, extensions, indexers, interchange, @@ -8,9 +10,10 @@ ) __all__ = [ - "interchange", + "executors", "extensions", "indexers", + "interchange", "types", "typing", ] diff --git a/pandas/api/executors/__init__.py b/pandas/api/executors/__init__.py new file mode 100644 index 0000000000000..04c94ee688332 --- /dev/null +++ b/pandas/api/executors/__init__.py @@ -0,0 +1,7 @@ +""" +Public API for function executor engines to be used with ``map`` and ``apply``. 
+""" + +from pandas.core.apply import BaseExecutionEngine + +__all__ = ["BaseExecutionEngine"] diff --git a/pandas/api/extensions/__init__.py b/pandas/api/extensions/__init__.py index ea5f1ba926899..1c88c0d35b4d7 100644 --- a/pandas/api/extensions/__init__.py +++ b/pandas/api/extensions/__init__.py @@ -21,13 +21,13 @@ ) __all__ = [ - "no_default", + "ExtensionArray", "ExtensionDtype", - "register_extension_dtype", + "ExtensionScalarOpsMixin", + "no_default", "register_dataframe_accessor", + "register_extension_dtype", "register_index_accessor", "register_series_accessor", "take", - "ExtensionArray", - "ExtensionScalarOpsMixin", ] diff --git a/pandas/api/indexers/__init__.py b/pandas/api/indexers/__init__.py index 78357f11dc3b7..f3c6546218de4 100644 --- a/pandas/api/indexers/__init__.py +++ b/pandas/api/indexers/__init__.py @@ -10,8 +10,8 @@ ) __all__ = [ - "check_array_indexer", "BaseIndexer", "FixedForwardWindowIndexer", "VariableOffsetWindowIndexer", + "check_array_indexer", ] diff --git a/pandas/api/interchange/__init__.py b/pandas/api/interchange/__init__.py index 2f3a73bc46b31..aded37abc7224 100644 --- a/pandas/api/interchange/__init__.py +++ b/pandas/api/interchange/__init__.py @@ -5,4 +5,4 @@ from pandas.core.interchange.dataframe_protocol import DataFrame from pandas.core.interchange.from_dataframe import from_dataframe -__all__ = ["from_dataframe", "DataFrame"] +__all__ = ["DataFrame", "from_dataframe"] diff --git a/pandas/api/internals.py b/pandas/api/internals.py new file mode 100644 index 0000000000000..03d8992a87575 --- /dev/null +++ b/pandas/api/internals.py @@ -0,0 +1,62 @@ +import numpy as np + +from pandas._typing import ArrayLike + +from pandas import ( + DataFrame, + Index, +) +from pandas.core.internals.api import _make_block +from pandas.core.internals.managers import BlockManager as _BlockManager + + +def create_dataframe_from_blocks( + blocks: list[tuple[ArrayLike, np.ndarray]], index: Index, columns: Index +) -> DataFrame: + """ + Low-level function to create a DataFrame from arrays as they are + representing the block structure of the resulting DataFrame. + + Attention: this is an advanced, low-level function that should only be + used if you know that the below-mentioned assumptions are guaranteed. + If passing data that do not follow those assumptions, subsequent + subsequent operations on the resulting DataFrame might lead to strange + errors. + For almost all use cases, you should use the standard pd.DataFrame(..) + constructor instead. If you are planning to use this function, let us + know by opening an issue at https://github.com/pandas-dev/pandas/issues. + + Assumptions: + + - The block arrays are either a 2D numpy array or a pandas ExtensionArray + - In case of a numpy array, it is assumed to already be in the expected + shape for Blocks (2D, (cols, rows), i.e. transposed compared to the + DataFrame columns). + - All arrays are taken as is (no type inference) and expected to have the + correct size. + - The placement arrays have the correct length (equalling the number of + columns that its equivalent block array represents), and all placement + arrays together form a complete set of 0 to n_columns - 1. + + Parameters + ---------- + blocks : list of tuples of (block_array, block_placement) + This should be a list of tuples existing of (block_array, block_placement), + where: + + - block_array is a 2D numpy array or a 1D ExtensionArray, following the + requirements listed above. 
+ - block_placement is a 1D integer numpy array + index : Index + The Index object for the `index` of the resulting DataFrame. + columns : Index + The Index object for the `columns` of the resulting DataFrame. + + Returns + ------- + DataFrame + """ + block_objs = [_make_block(*block) for block in blocks] + axes = [columns, index] + mgr = _BlockManager(block_objs, axes) + return DataFrame._from_mgr(mgr, mgr.axes) diff --git a/pandas/api/types/__init__.py b/pandas/api/types/__init__.py index c601086bb9f86..4a5c742b1628b 100644 --- a/pandas/api/types/__init__.py +++ b/pandas/api/types/__init__.py @@ -14,10 +14,10 @@ ) __all__ = [ - "infer_dtype", - "union_categoricals", "CategoricalDtype", "DatetimeTZDtype", "IntervalDtype", "PeriodDtype", + "infer_dtype", + "union_categoricals", ] diff --git a/pandas/api/typing/__init__.py b/pandas/api/typing/__init__.py index 9b5d2cb06b523..c1178c72f3edc 100644 --- a/pandas/api/typing/__init__.py +++ b/pandas/api/typing/__init__.py @@ -3,12 +3,14 @@ """ from pandas._libs import NaTType +from pandas._libs.lib import NoDefault from pandas._libs.missing import NAType from pandas.core.groupby import ( DataFrameGroupBy, SeriesGroupBy, ) +from pandas.core.indexes.frozen import FrozenList from pandas.core.resample import ( DatetimeIndexResamplerGroupby, PeriodIndexResamplerGroupby, @@ -29,6 +31,7 @@ # TODO: Can't import Styler without importing jinja2 # from pandas.io.formats.style import Styler from pandas.io.json._json import JsonReader +from pandas.io.sas.sasreader import SASReader from pandas.io.stata import StataReader __all__ = [ @@ -38,18 +41,19 @@ "ExpandingGroupby", "ExponentialMovingWindow", "ExponentialMovingWindowGroupby", + "FrozenList", "JsonReader", - "NaTType", "NAType", + "NaTType", + "NoDefault", "PeriodIndexResamplerGroupby", "Resampler", "Rolling", "RollingGroupby", + "SASReader", "SeriesGroupBy", "StataReader", - # See TODO above - # "Styler", - "TimedeltaIndexResamplerGroupby", "TimeGrouper", + "TimedeltaIndexResamplerGroupby", "Window", ] diff --git a/pandas/arrays/__init__.py b/pandas/arrays/__init__.py index a11755275d00e..b5c1c98da1c78 100644 --- a/pandas/arrays/__init__.py +++ b/pandas/arrays/__init__.py @@ -3,6 +3,7 @@ See :ref:`extending.extension-types` for more. """ + from pandas.core.arrays import ( ArrowExtensionArray, ArrowStringArray, @@ -34,20 +35,3 @@ "StringArray", "TimedeltaArray", ] - - -def __getattr__(name: str) -> type[NumpyExtensionArray]: - if name == "PandasArray": - # GH#53694 - import warnings - - from pandas.util._exceptions import find_stack_level - - warnings.warn( - "PandasArray has been renamed NumpyExtensionArray. Use that " - "instead. 
This alias will be removed in a future version.", - FutureWarning, - stacklevel=find_stack_level(), - ) - return NumpyExtensionArray - raise AttributeError(f"module 'pandas.arrays' has no attribute '{name}'") diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 738442fab8c70..9f3bfdc205498 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -7,6 +7,7 @@ Other items: * platform checker """ + from __future__ import annotations import os @@ -17,19 +18,24 @@ from pandas.compat._constants import ( IS64, ISMUSL, - PY310, PY311, PY312, PYPY, + WASM, ) -import pandas.compat.compressors from pandas.compat.numpy import is_numpy_dev from pandas.compat.pyarrow import ( + HAS_PYARROW, pa_version_under10p1, pa_version_under11p0, pa_version_under13p0, pa_version_under14p0, pa_version_under14p1, + pa_version_under16p0, + pa_version_under17p0, + pa_version_under18p0, + pa_version_under19p0, + pa_version_under20p0, ) if TYPE_CHECKING: @@ -120,76 +126,48 @@ def is_platform_power() -> bool: return platform.machine() in ("ppc64", "ppc64le") -def is_ci_environment() -> bool: +def is_platform_riscv64() -> bool: """ - Checking if running in a continuous integration environment by checking - the PANDAS_CI environment variable. + Checking if the running platform uses riscv64 architecture. Returns ------- bool - True if the running in a continuous integration environment. - """ - return os.environ.get("PANDAS_CI", "0") == "1" - - -def get_lzma_file() -> type[pandas.compat.compressors.LZMAFile]: - """ - Importing the `LZMAFile` class from the `lzma` module. - - Returns - ------- - class - The `LZMAFile` class from the `lzma` module. - - Raises - ------ - RuntimeError - If the `lzma` module was not imported correctly, or didn't exist. + True if the running platform uses riscv64 architecture. """ - if not pandas.compat.compressors.has_lzma: - raise RuntimeError( - "lzma module not available. " - "A Python re-install with the proper dependencies, " - "might be required to solve this issue." - ) - return pandas.compat.compressors.LZMAFile + return platform.machine() == "riscv64" -def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]: +def is_ci_environment() -> bool: """ - Importing the `BZ2File` class from the `bz2` module. + Checking if running in a continuous integration environment by checking + the PANDAS_CI environment variable. Returns ------- - class - The `BZ2File` class from the `bz2` module. - - Raises - ------ - RuntimeError - If the `bz2` module was not imported correctly, or didn't exist. + bool + True if running in a continuous integration environment. """ - if not pandas.compat.compressors.has_bz2: - raise RuntimeError( - "bz2 module not available. " - "A Python re-install with the proper dependencies, " - "might be required to solve this issue."
- ) - return pandas.compat.compressors.BZ2File + return os.environ.get("PANDAS_CI", "0") == "1" __all__ = [ + "HAS_PYARROW", + "IS64", + "ISMUSL", + "PY311", + "PY312", + "PYPY", + "WASM", "is_numpy_dev", "pa_version_under10p1", "pa_version_under11p0", "pa_version_under13p0", "pa_version_under14p0", "pa_version_under14p1", - "IS64", - "ISMUSL", - "PY310", - "PY311", - "PY312", - "PYPY", + "pa_version_under16p0", + "pa_version_under17p0", + "pa_version_under18p0", + "pa_version_under19p0", + "pa_version_under20p0", ] diff --git a/pandas/compat/_constants.py b/pandas/compat/_constants.py index 7bc3fbaaefebf..c7b7341013251 100644 --- a/pandas/compat/_constants.py +++ b/pandas/compat/_constants.py @@ -13,18 +13,18 @@ IS64 = sys.maxsize > 2**32 -PY310 = sys.version_info >= (3, 10) PY311 = sys.version_info >= (3, 11) PY312 = sys.version_info >= (3, 12) PYPY = platform.python_implementation() == "PyPy" +WASM = (sys.platform == "emscripten") or (platform.machine() in ["wasm32", "wasm64"]) ISMUSL = "musl" in (sysconfig.get_config_var("HOST_GNU_TYPE") or "") REF_COUNT = 2 if PY311 else 3 __all__ = [ "IS64", "ISMUSL", - "PY310", "PY311", "PY312", "PYPY", + "WASM", ] diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 9d04d7c0a1216..f01dfab0de829 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -2,7 +2,11 @@ import importlib import sys -from typing import TYPE_CHECKING +from typing import ( + TYPE_CHECKING, + Literal, + overload, +) import warnings from pandas.util._exceptions import find_stack_level @@ -12,44 +16,44 @@ if TYPE_CHECKING: import types -# Update install.rst & setup.cfg when updating versions! +# Update install.rst, actions-310-minimum_versions.yaml, +# deps_minimum.toml & pyproject.toml when updating versions! 
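The _optional.py hunks below make the ``errors`` argument of ``import_optional_dependency`` keyword-only and narrow its return type with ``overload``. As orientation, here is a minimal sketch of how callers typically use the helper under that post-change signature; the call sites are illustrative, not taken from this diff:

from pandas.compat._optional import import_optional_dependency

# errors="raise" (the default): raise a uniform ImportError when the
# package is missing or older than the minimum version (taken from the
# VERSIONS mapping below when min_version is not given explicitly).
pa = import_optional_dependency("pyarrow", min_version="10.0.1")

# errors="ignore": return the module when importable and recent enough,
# otherwise None, so callers can branch without a try/except.
numba = import_optional_dependency("numba", errors="ignore")
if numba is None:
    pass  # e.g. fall back to a non-jitted code path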
VERSIONS = { - "adbc-driver-postgresql": "0.8.0", + "adbc-driver-postgresql": "0.10.0", "adbc-driver-sqlite": "0.8.0", - "bs4": "4.11.2", - "blosc": "1.21.3", + "bs4": "4.12.3", "bottleneck": "1.3.6", - "dataframe-api-compat": "0.1.7", - "fastparquet": "2022.12.0", - "fsspec": "2022.11.0", + "fastparquet": "2024.2.0", + "fsspec": "2023.12.2", "html5lib": "1.1", - "hypothesis": "6.46.1", - "gcsfs": "2022.11.0", - "jinja2": "3.1.2", + "hypothesis": "6.84.0", + "gcsfs": "2023.12.2", + "jinja2": "3.1.3", "lxml.etree": "4.9.2", - "matplotlib": "3.6.3", - "numba": "0.56.4", - "numexpr": "2.8.4", + "matplotlib": "3.8.3", + "numba": "0.59.0", + "numexpr": "2.9.0", "odfpy": "1.4.1", - "openpyxl": "3.1.0", - "pandas_gbq": "0.19.0", + "openpyxl": "3.1.2", "psycopg2": "2.9.6", # (dt dec pq3 ext lo64) - "pymysql": "1.0.2", + "pymysql": "1.1.0", "pyarrow": "10.0.1", - "pyreadstat": "1.2.0", + "pyiceberg": "0.7.1", + "pyreadstat": "1.2.6", "pytest": "7.3.2", "python-calamine": "0.1.7", + "pytz": "2023.4", "pyxlsb": "1.0.10", - "s3fs": "2022.11.0", - "scipy": "1.10.0", + "s3fs": "2023.12.2", + "scipy": "1.12.0", "sqlalchemy": "2.0.0", "tables": "3.8.0", "tabulate": "0.9.0", - "xarray": "2022.12.0", + "xarray": "2024.1.1", "xlrd": "2.0.1", - "xlsxwriter": "3.0.5", - "zstandard": "0.19.0", + "xlsxwriter": "3.2.0", + "zstandard": "0.22.0", "tzdata": "2022.7", "qtpy": "2.3.0", "pyqt5": "5.15.9", @@ -64,7 +68,6 @@ "jinja2": "Jinja2", "lxml.etree": "lxml", "odf": "odfpy", - "pandas_gbq": "pandas-gbq", "python_calamine": "python-calamine", "sqlalchemy": "SQLAlchemy", "tables": "pytables", @@ -82,12 +85,33 @@ def get_version(module: types.ModuleType) -> str: return version +@overload +def import_optional_dependency( + name: str, + extra: str = ..., + min_version: str | None = ..., + *, + errors: Literal["raise"] = ..., +) -> types.ModuleType: ... + + +@overload +def import_optional_dependency( + name: str, + extra: str = ..., + min_version: str | None = ..., + *, + errors: Literal["warn", "ignore"], +) -> types.ModuleType | None: ... + + def import_optional_dependency( name: str, extra: str = "", - errors: str = "raise", min_version: str | None = None, -): + *, + errors: Literal["raise", "warn", "ignore"] = "raise", +) -> types.ModuleType | None: """ Import an optional dependency. @@ -120,9 +144,8 @@ def import_optional_dependency( The imported module, when found and the version is correct. None is returned when the package is not found and `errors` is False, or when the package's version is too old and `errors` - is ``'warn'``. + is ``'warn'`` or ``'ignore'``. """ - assert errors in {"warn", "raise", "ignore"} package_name = INSTALL_MAPPING.get(name) @@ -134,9 +157,9 @@ def import_optional_dependency( ) try: module = importlib.import_module(name) - except ImportError: + except ImportError as err: if errors == "raise": - raise ImportError(msg) + raise ImportError(msg) from err return None # Handle submodules: if we have submodule, grab parent module from sys.modules @@ -163,5 +186,7 @@ def import_optional_dependency( return None elif errors == "raise": raise ImportError(msg) + else: + return None return module diff --git a/pandas/compat/compressors.py b/pandas/compat/compressors.py deleted file mode 100644 index 1f31e34c092c9..0000000000000 --- a/pandas/compat/compressors.py +++ /dev/null @@ -1,77 +0,0 @@ -""" -Patched ``BZ2File`` and ``LZMAFile`` to handle pickle protocol 5. 
-""" - -from __future__ import annotations - -from pickle import PickleBuffer - -from pandas.compat._constants import PY310 - -try: - import bz2 - - has_bz2 = True -except ImportError: - has_bz2 = False - -try: - import lzma - - has_lzma = True -except ImportError: - has_lzma = False - - -def flatten_buffer( - b: bytes | bytearray | memoryview | PickleBuffer, -) -> bytes | bytearray | memoryview: - """ - Return some 1-D `uint8` typed buffer. - - Coerces anything that does not match that description to one that does - without copying if possible (otherwise will copy). - """ - - if isinstance(b, (bytes, bytearray)): - return b - - if not isinstance(b, PickleBuffer): - b = PickleBuffer(b) - - try: - # coerce to 1-D `uint8` C-contiguous `memoryview` zero-copy - return b.raw() - except BufferError: - # perform in-memory copy if buffer is not contiguous - return memoryview(b).tobytes("A") - - -if has_bz2: - - class BZ2File(bz2.BZ2File): - if not PY310: - - def write(self, b) -> int: - # Workaround issue where `bz2.BZ2File` expects `len` - # to return the number of bytes in `b` by converting - # `b` into something that meets that constraint with - # minimal copying. - # - # Note: This is fixed in Python 3.10. - return super().write(flatten_buffer(b)) - - -if has_lzma: - - class LZMAFile(lzma.LZMAFile): - if not PY310: - - def write(self, b) -> int: - # Workaround issue where `lzma.LZMAFile` expects `len` - # to return the number of bytes in `b` by converting - # `b` into something that meets that constraint with - # minimal copying. - # - # Note: This is fixed in Python 3.10. - return super().write(flatten_buffer(b)) diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index 3014bd652d8c4..e95b44c879940 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -1,4 +1,5 @@ -""" support numpy compatibility across versions """ +"""support numpy compatibility across versions""" + import warnings import numpy as np @@ -8,13 +9,12 @@ # numpy versioning _np_version = np.__version__ _nlv = Version(_np_version) -np_version_lt1p23 = _nlv < Version("1.23") np_version_gte1p24 = _nlv >= Version("1.24") np_version_gte1p24p3 = _nlv >= Version("1.24.3") np_version_gte1p25 = _nlv >= Version("1.25") -np_version_gt2 = _nlv >= Version("2.0.0.dev0") +np_version_gt2 = _nlv >= Version("2.0.0") is_numpy_dev = _nlv.dev is not None -_min_numpy_ver = "1.22.4" +_min_numpy_ver = "1.23.5" if _nlv < Version(_min_numpy_ver): @@ -36,8 +36,8 @@ r".*In the future `np\.long` will be defined as.*", FutureWarning, ) - np_long = np.long # type: ignore[attr-defined] - np_ulong = np.ulong # type: ignore[attr-defined] + np_long = np.long + np_ulong = np.ulong except AttributeError: np_long = np.int_ np_ulong = np.uint @@ -47,7 +47,7 @@ __all__ = [ - "np", "_np_version", "is_numpy_dev", + "np", ] diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index a36e25a9df410..abf86fc415641 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -15,6 +15,7 @@ methods that are spread throughout the codebase. This module will make it easier to adjust to future upstream changes in the analogous numpy signatures. 
""" + from __future__ import annotations from typing import ( @@ -138,6 +139,7 @@ def validate_argmax_with_skipna(skipna: bool | ndarray | None, args, kwargs) -> ARGSORT_DEFAULTS["kind"] = "quicksort" ARGSORT_DEFAULTS["order"] = None ARGSORT_DEFAULTS["kind"] = None +ARGSORT_DEFAULTS["stable"] = None validate_argsort = CompatValidator( @@ -149,6 +151,7 @@ def validate_argmax_with_skipna(skipna: bool | ndarray | None, args, kwargs) -> ARGSORT_DEFAULTS_KIND: dict[str, int | None] = {} ARGSORT_DEFAULTS_KIND["axis"] = -1 ARGSORT_DEFAULTS_KIND["order"] = None +ARGSORT_DEFAULTS_KIND["stable"] = None validate_argsort_kind = CompatValidator( ARGSORT_DEFAULTS_KIND, fname="argsort", max_fname_arg_count=0, method="both" ) @@ -177,13 +180,11 @@ def validate_argsort_with_ascending(ascending: bool | int | None, args, kwargs) @overload -def validate_clip_with_axis(axis: ndarray, args, kwargs) -> None: - ... +def validate_clip_with_axis(axis: ndarray, args, kwargs) -> None: ... @overload -def validate_clip_with_axis(axis: AxisNoneT, args, kwargs) -> AxisNoneT: - ... +def validate_clip_with_axis(axis: AxisNoneT, args, kwargs) -> AxisNoneT: ... def validate_clip_with_axis( @@ -257,10 +258,6 @@ def validate_cum_func_with_skipna(skipna: bool, args, kwargs, name) -> bool: MINMAX_DEFAULTS, fname="max", method="both", max_fname_arg_count=1 ) -RESHAPE_DEFAULTS: dict[str, str] = {"order": "C"} -validate_reshape = CompatValidator( - RESHAPE_DEFAULTS, fname="reshape", method="both", max_fname_arg_count=1 -) REPEAT_DEFAULTS: dict[str, Any] = {"axis": None} validate_repeat = CompatValidator( @@ -272,12 +269,6 @@ def validate_cum_func_with_skipna(skipna: bool, args, kwargs, name) -> bool: ROUND_DEFAULTS, fname="round", method="both", max_fname_arg_count=1 ) -SORT_DEFAULTS: dict[str, int | str | None] = {} -SORT_DEFAULTS["axis"] = -1 -SORT_DEFAULTS["kind"] = "quicksort" -SORT_DEFAULTS["order"] = None -validate_sort = CompatValidator(SORT_DEFAULTS, fname="sort", method="kwargs") - STAT_FUNC_DEFAULTS: dict[str, Any | None] = {} STAT_FUNC_DEFAULTS["dtype"] = None STAT_FUNC_DEFAULTS["out"] = None @@ -323,20 +314,6 @@ def validate_cum_func_with_skipna(skipna: bool, args, kwargs, name) -> bool: validate_take = CompatValidator(TAKE_DEFAULTS, fname="take", method="kwargs") -def validate_take_with_convert(convert: ndarray | bool | None, args, kwargs) -> bool: - """ - If this function is called via the 'numpy' library, the third parameter in - its signature is 'axis', which takes either an ndarray or 'None', so check - if the 'convert' parameter is either an instance of ndarray or is None - """ - if isinstance(convert, ndarray) or convert is None: - args = (convert,) + args - convert = True - - validate_take(args, kwargs, max_fname_arg_count=3, method="both") - return convert - - TRANSPOSE_DEFAULTS = {"axes": None} validate_transpose = CompatValidator( TRANSPOSE_DEFAULTS, fname="transpose", method="both", max_fname_arg_count=0 @@ -361,23 +338,6 @@ def validate_groupby_func(name: str, args, kwargs, allowed=None) -> None: ) -RESAMPLER_NUMPY_OPS = ("min", "max", "sum", "prod", "mean", "std", "var") - - -def validate_resampler_func(method: str, args, kwargs) -> None: - """ - 'args' and 'kwargs' should be empty because all of their necessary - parameters are explicitly listed in the function signature - """ - if len(args) + len(kwargs) > 0: - if method in RESAMPLER_NUMPY_OPS: - raise UnsupportedFunctionCall( - "numpy operations are not valid with resample. 
" - f"Use .resample(...).{method}() instead" - ) - raise TypeError("too many arguments passed in") - - def validate_minmax_axis(axis: AxisInt | None, ndim: int = 1) -> None: """ Ensure that the axis argument passed to min, max, argmin, or argmax is zero diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index cd98087c06c18..beaaa3f8ed3cc 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -1,20 +1,22 @@ """ -Support pre-0.12 series pickle compatibility. +Pickle compatibility to pandas version 1.0 """ + from __future__ import annotations import contextlib -import copy import io -import pickle as pkl -from typing import TYPE_CHECKING +import pickle +from typing import ( + TYPE_CHECKING, + Any, +) import numpy as np from pandas._libs.arrays import NDArrayBacked from pandas._libs.tslibs import BaseOffset -from pandas import Index from pandas.core.arrays import ( DatetimeArray, PeriodArray, @@ -26,111 +28,20 @@ from collections.abc import Generator -def load_reduce(self) -> None: - stack = self.stack - args = stack.pop() - func = stack[-1] - - try: - stack[-1] = func(*args) - return - except TypeError as err: - # If we have a deprecated function, - # try to replace and try again. - - msg = "_reconstruct: First argument must be a sub-type of ndarray" - - if msg in str(err): - try: - cls = args[0] - stack[-1] = object.__new__(cls) - return - except TypeError: - pass - elif args and isinstance(args[0], type) and issubclass(args[0], BaseOffset): - # TypeError: object.__new__(Day) is not safe, use Day.__new__() - cls = args[0] - stack[-1] = cls.__new__(*args) - return - elif args and issubclass(args[0], PeriodArray): - cls = args[0] - stack[-1] = NDArrayBacked.__new__(*args) - return - - raise - - # If classes are moved, provide compat here. 
_class_locations_map = { - ("pandas.core.sparse.array", "SparseArray"): ("pandas.core.arrays", "SparseArray"), - # 15477 - ("pandas.core.base", "FrozenNDArray"): ("numpy", "ndarray"), # Re-routing unpickle block logic to go through _unpickle_block instead # for pandas <= 1.3.5 ("pandas.core.internals.blocks", "new_block"): ( "pandas._libs.internals", "_unpickle_block", ), - ("pandas.core.indexes.frozen", "FrozenNDArray"): ("numpy", "ndarray"), - ("pandas.core.base", "FrozenList"): ("pandas.core.indexes.frozen", "FrozenList"), - # 10890 - ("pandas.core.series", "TimeSeries"): ("pandas.core.series", "Series"), - ("pandas.sparse.series", "SparseTimeSeries"): ( - "pandas.core.sparse.series", - "SparseSeries", - ), - # 12588, extensions moving - ("pandas._sparse", "BlockIndex"): ("pandas._libs.sparse", "BlockIndex"), - ("pandas.tslib", "Timestamp"): ("pandas._libs.tslib", "Timestamp"), - # 18543 moving period - ("pandas._period", "Period"): ("pandas._libs.tslibs.period", "Period"), - ("pandas._libs.period", "Period"): ("pandas._libs.tslibs.period", "Period"), - # 18014 moved __nat_unpickle from _libs.tslib-->_libs.tslibs.nattype - ("pandas.tslib", "__nat_unpickle"): ( + # Avoid Cython's warning "contradiction to Python 'class private name' rules" + ("pandas._libs.tslibs.nattype", "__nat_unpickle"): ( "pandas._libs.tslibs.nattype", - "__nat_unpickle", - ), - ("pandas._libs.tslib", "__nat_unpickle"): ( - "pandas._libs.tslibs.nattype", - "__nat_unpickle", - ), - # 15998 top-level dirs moving - ("pandas.sparse.array", "SparseArray"): ( - "pandas.core.arrays.sparse", - "SparseArray", - ), - ("pandas.indexes.base", "_new_Index"): ("pandas.core.indexes.base", "_new_Index"), - ("pandas.indexes.base", "Index"): ("pandas.core.indexes.base", "Index"), - ("pandas.indexes.numeric", "Int64Index"): ( - "pandas.core.indexes.base", - "Index", # updated in 50775 - ), - ("pandas.indexes.range", "RangeIndex"): ("pandas.core.indexes.range", "RangeIndex"), - ("pandas.indexes.multi", "MultiIndex"): ("pandas.core.indexes.multi", "MultiIndex"), - ("pandas.tseries.index", "_new_DatetimeIndex"): ( - "pandas.core.indexes.datetimes", - "_new_DatetimeIndex", - ), - ("pandas.tseries.index", "DatetimeIndex"): ( - "pandas.core.indexes.datetimes", - "DatetimeIndex", - ), - ("pandas.tseries.period", "PeriodIndex"): ( - "pandas.core.indexes.period", - "PeriodIndex", - ), - # 19269, arrays moving - ("pandas.core.categorical", "Categorical"): ("pandas.core.arrays", "Categorical"), - # 19939, add timedeltaindex, float64index compat from 15998 move - ("pandas.tseries.tdi", "TimedeltaIndex"): ( - "pandas.core.indexes.timedeltas", - "TimedeltaIndex", - ), - ("pandas.indexes.numeric", "Float64Index"): ( - "pandas.core.indexes.base", - "Index", # updated in 50775 + "_nat_unpickle", ), - # 50775, remove Int64Index, UInt64Index & Float64Index from codabase + # 50775, remove Int64Index, UInt64Index & Float64Index from codebase ("pandas.core.indexes.numeric", "Int64Index"): ( "pandas.core.indexes.base", "Index", @@ -152,85 +63,55 @@ def load_reduce(self) -> None: # our Unpickler sub-class to override methods and some dispatcher # functions for compat and uses a non-public class of the pickle module.
- - -class Unpickler(pkl._Unpickler): - def find_class(self, module, name): - # override superclass +class Unpickler(pickle._Unpickler): + def find_class(self, module: str, name: str) -> Any: key = (module, name) module, name = _class_locations_map.get(key, key) return super().find_class(module, name) + dispatch = pickle._Unpickler.dispatch.copy() -Unpickler.dispatch = copy.copy(Unpickler.dispatch) -Unpickler.dispatch[pkl.REDUCE[0]] = load_reduce - - -def load_newobj(self) -> None: - args = self.stack.pop() - cls = self.stack[-1] - - # compat - if issubclass(cls, Index): - obj = object.__new__(cls) - elif issubclass(cls, DatetimeArray) and not args: - arr = np.array([], dtype="M8[ns]") - obj = cls.__new__(cls, arr, arr.dtype) - elif issubclass(cls, TimedeltaArray) and not args: - arr = np.array([], dtype="m8[ns]") - obj = cls.__new__(cls, arr, arr.dtype) - elif cls is BlockManager and not args: - obj = cls.__new__(cls, (), [], False) - else: - obj = cls.__new__(cls, *args) - - self.stack[-1] = obj - - -Unpickler.dispatch[pkl.NEWOBJ[0]] = load_newobj - - -def load_newobj_ex(self) -> None: - kwargs = self.stack.pop() - args = self.stack.pop() - cls = self.stack.pop() - - # compat - if issubclass(cls, Index): - obj = object.__new__(cls) - else: - obj = cls.__new__(cls, *args, **kwargs) - self.append(obj) + def load_reduce(self) -> None: + stack = self.stack # type: ignore[attr-defined] + args = stack.pop() + func = stack[-1] - -try: - Unpickler.dispatch[pkl.NEWOBJ_EX[0]] = load_newobj_ex -except (AttributeError, KeyError): - pass - - -def load(fh, encoding: str | None = None, is_verbose: bool = False): - """ - Load a pickle, with a provided encoding, - - Parameters - ---------- - fh : a filelike object - encoding : an optional encoding - is_verbose : show exception output - """ - try: - fh.seek(0) - if encoding is not None: - up = Unpickler(fh, encoding=encoding) + try: + stack[-1] = func(*args) + except TypeError: + # If we have a deprecated function, + # try to replace and try again. + if args and isinstance(args[0], type) and issubclass(args[0], BaseOffset): + # TypeError: object.__new__(Day) is not safe, use Day.__new__() + cls = args[0] + stack[-1] = cls.__new__(*args) + return + elif args and issubclass(args[0], PeriodArray): + cls = args[0] + stack[-1] = NDArrayBacked.__new__(*args) + return + raise + + dispatch[pickle.REDUCE[0]] = load_reduce # type: ignore[assignment] + + def load_newobj(self) -> None: + args = self.stack.pop() # type: ignore[attr-defined] + cls = self.stack.pop() # type: ignore[attr-defined] + + # compat + if issubclass(cls, DatetimeArray) and not args: + arr = np.array([], dtype="M8[ns]") + obj = cls.__new__(cls, arr, arr.dtype) + elif issubclass(cls, TimedeltaArray) and not args: + arr = np.array([], dtype="m8[ns]") + obj = cls.__new__(cls, arr, arr.dtype) + elif cls is BlockManager and not args: + obj = cls.__new__(cls, (), [], False) else: - up = Unpickler(fh) - # "Unpickler" has no attribute "is_verbose" [attr-defined] - up.is_verbose = is_verbose # type: ignore[attr-defined] + obj = cls.__new__(cls, *args) + self.append(obj) # type: ignore[attr-defined] - return up.load() - except (ValueError, TypeError): - raise + dispatch[pickle.NEWOBJ[0]] = load_newobj # type: ignore[assignment] def loads( @@ -239,7 +120,7 @@ def loads( fix_imports: bool = True, encoding: str = "ASCII", errors: str = "strict", -): +) -> Any: """ Analogous to pickle._loads. 
""" @@ -250,13 +131,13 @@ def loads( @contextlib.contextmanager -def patch_pickle() -> Generator[None, None, None]: +def patch_pickle() -> Generator[None]: """ Temporarily patch pickle to use our unpickler. """ - orig_loads = pkl.loads + orig_loads = pickle.loads try: - setattr(pkl, "loads", loads) + setattr(pickle, "loads", loads) yield finally: - setattr(pkl, "loads", orig_loads) + setattr(pickle, "loads", orig_loads) diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index beb4814914101..163934bee509c 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -1,4 +1,4 @@ -""" support pyarrow compatibility across versions """ +"""support pyarrow compatibility across versions""" from __future__ import annotations @@ -15,6 +15,12 @@ pa_version_under14p0 = _palv < Version("14.0.0") pa_version_under14p1 = _palv < Version("14.0.1") pa_version_under15p0 = _palv < Version("15.0.0") + pa_version_under16p0 = _palv < Version("16.0.0") + pa_version_under17p0 = _palv < Version("17.0.0") + pa_version_under18p0 = _palv < Version("18.0.0") + pa_version_under19p0 = _palv < Version("19.0.0") + pa_version_under20p0 = _palv < Version("20.0.0") + HAS_PYARROW = True except ImportError: pa_version_under10p1 = True pa_version_under11p0 = True @@ -23,3 +29,9 @@ pa_version_under14p0 = True pa_version_under14p1 = True pa_version_under15p0 = True + pa_version_under16p0 = True + pa_version_under17p0 = True + pa_version_under18p0 = True + pa_version_under19p0 = True + pa_version_under20p0 = True + HAS_PYARROW = False diff --git a/pandas/conftest.py b/pandas/conftest.py index 983272d79081e..f9c10a7758bd2 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -17,6 +17,7 @@ - Dtypes - Misc """ + from __future__ import annotations from collections import abc @@ -28,12 +29,14 @@ timezone, ) from decimal import Decimal +import gc import operator import os from typing import ( TYPE_CHECKING, - Callable, + Any, ) +import uuid from dateutil.tz import ( tzlocal, @@ -43,13 +46,8 @@ from hypothesis import strategies as st import numpy as np import pytest -from pytz import ( - FixedOffset, - utc, -) - -from pandas._config.config import _get_option +from pandas.compat._optional import import_optional_dependency import pandas.util._test_decorators as td from pandas.core.dtypes.dtypes import ( @@ -78,10 +76,10 @@ Index, MultiIndex, ) -from pandas.util.version import Version if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Iterator, ) @@ -94,12 +92,7 @@ del pa has_pyarrow = True -import zoneinfo - -try: - zoneinfo.ZoneInfo("UTC") -except zoneinfo.ZoneInfoNotFoundError: - zoneinfo = None # type: ignore[assignment] +pytz = import_optional_dependency("pytz", errors="ignore") # ---------------------------------------------------------------- @@ -124,7 +117,7 @@ def ignore_doctest_warning(item: pytest.Item, path: str, message: str) -> None: item : pytest.Item pytest test item. path : str - Module path to Python object, e.g. "pandas.core.frame.DataFrame.append". A + Module path to Python object, e.g. "pandas.DataFrame.append". A warning will be filtered when item.name ends with in given path. So it is sufficient to specify e.g. "DataFrame.append". 
message : str @@ -149,7 +142,7 @@ def pytest_collection_modifyitems(items, config) -> None: ("is_categorical_dtype", "is_categorical_dtype is deprecated"), ("is_sparse", "is_sparse is deprecated"), ("DataFrameGroupBy.fillna", "DataFrameGroupBy.fillna is deprecated"), - ("NDFrame.replace", "The 'method' keyword"), + ("DataFrameGroupBy.corrwith", "DataFrameGroupBy.corrwith is deprecated"), ("NDFrame.replace", "Series.replace without 'value'"), ("NDFrame.clip", "Downcasting behavior in Series and DataFrame methods"), ("Series.idxmin", "The behavior of Series.idxmin"), @@ -157,17 +150,10 @@ def pytest_collection_modifyitems(items, config) -> None: ("SeriesGroupBy.fillna", "SeriesGroupBy.fillna is deprecated"), ("SeriesGroupBy.idxmin", "The behavior of Series.idxmin"), ("SeriesGroupBy.idxmax", "The behavior of Series.idxmax"), + ("to_pytimedelta", "The behavior of TimedeltaProperties.to_pytimedelta"), + ("NDFrame.reindex_like", "keyword argument 'method' is deprecated"), # Docstring divides by zero to show behavior difference ("missing.mask_zero_div_zero", "divide by zero encountered"), - ( - "to_pydatetime", - "The behavior of DatetimeProperties.to_pydatetime is deprecated", - ), - ( - "pandas.core.generic.NDFrame.bool", - "(Series|DataFrame).bool is now deprecated and will be removed " - "in future version of pandas", - ), ( "pandas.core.generic.NDFrame.first", "first is deprecated and will be removed in a future version. " @@ -181,26 +167,19 @@ def pytest_collection_modifyitems(items, config) -> None: "DataFrameGroupBy.fillna", "DataFrameGroupBy.fillna with 'method' is deprecated", ), - ( - "DataFrameGroupBy.fillna", - "DataFrame.fillna with 'method' is deprecated", - ), ("read_parquet", "Passing a BlockManager to DataFrame is deprecated"), ] if is_doctest: for item in items: - # autouse=True for the add_doctest_imports can lead to expensive teardowns - # since doctest_namespace is a session fixture - item.add_marker(pytest.mark.usefixtures("add_doctest_imports")) - for path, message in ignored_doctest_warnings: ignore_doctest_warning(item, path, message) -hypothesis_health_checks = [hypothesis.HealthCheck.too_slow] -if Version(hypothesis.__version__) >= Version("6.83.2"): - hypothesis_health_checks.append(hypothesis.HealthCheck.differing_executors) +hypothesis_health_checks = [ + hypothesis.HealthCheck.too_slow, + hypothesis.HealthCheck.differing_executors, +] # Hypothesis hypothesis.settings.register_profile( @@ -250,7 +229,14 @@ def pytest_collection_modifyitems(items, config) -> None: ) -@pytest.fixture +# ---------------------------------------------------------------- +# Autouse fixtures +# ---------------------------------------------------------------- + + +# https://github.com/pytest-dev/pytest/issues/11873 +# Would like to avoid autouse=True, but cannot as of pytest 8.0.0 +@pytest.fixture(autouse=True) def add_doctest_imports(doctest_namespace) -> None: """ Make `np` and `pd` names available for doctests. 
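The conftest hunks below add a batch of small parametrized fixtures (``dropna``, ``sort``, ``na_action``, ``ascending``, and friends). As a reminder of the pattern, a fixture declared with ``params`` fans every dependent test out into one run per parameter; a minimal standalone sketch, not taken from the diff:

import pytest

@pytest.fixture(params=[True, False])
def dropna(request):
    """Boolean 'dropna' parameter, mirroring the fixtures added below."""
    return request.param

def test_example(dropna):
    # Collected twice: test_example[True] and test_example[False].
    assert dropna in (True, False)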
@@ -259,9 +245,6 @@ def add_doctest_imports(doctest_namespace) -> None: doctest_namespace["pd"] = pd -# ---------------------------------------------------------------- -# Autouse fixtures -# ---------------------------------------------------------------- @pytest.fixture(autouse=True) def configure_tests() -> None: """ @@ -273,7 +256,7 @@ def configure_tests() -> None: # ---------------------------------------------------------------- # Common arguments # ---------------------------------------------------------------- -@pytest.fixture(params=[0, 1, "index", "columns"], ids=lambda x: f"axis={repr(x)}") +@pytest.fixture(params=[0, 1, "index", "columns"], ids=lambda x: f"axis={x!r}") def axis(request): """ Fixture for returning the axis numbers of a DataFrame. @@ -281,18 +264,7 @@ def axis(request): return request.param -axis_frame = axis - - -@pytest.fixture(params=[1, "columns"], ids=lambda x: f"axis={repr(x)}") -def axis_1(request): - """ - Fixture for returning aliases of axis 1 of a DataFrame. - """ - return request.param - - -@pytest.fixture(params=[True, False, None]) +@pytest.fixture(params=[True, False]) def observed(request): """ Pass in the observed keyword to groupby for [True, False] @@ -313,6 +285,22 @@ def ordered(request): return request.param +@pytest.fixture(params=[True, False]) +def dropna(request): + """ + Boolean 'dropna' parameter. + """ + return request.param + + +@pytest.fixture(params=[True, False]) +def sort(request): + """ + Boolean 'sort' parameter. + """ + return request.param + + @pytest.fixture(params=[True, False]) def skipna(request): """ @@ -414,6 +402,74 @@ def nselect_method(request): return request.param +@pytest.fixture(params=[None, "ignore"]) +def na_action(request): + """ + Fixture for 'na_action' argument in map. + """ + return request.param + + +@pytest.fixture(params=[True, False]) +def ascending(request): + """ + Fixture for 'ascending' argument in sort_values/sort_index/rank. + """ + return request.param + + +@pytest.fixture(params=["average", "min", "max", "first", "dense"]) +def rank_method(request): + """ + Fixture for 'method' argument in rank. + """ + return request.param + + +@pytest.fixture(params=[True, False]) +def as_index(request): + """ + Fixture for 'as_index' argument in groupby. + """ + return request.param + + +@pytest.fixture(params=[True, False]) +def cache(request): + """ + Fixture for 'cache' argument in to_datetime. + """ + return request.param + + +@pytest.fixture(params=[True, False]) +def parallel(request): + """ + Fixture for parallel keyword argument for numba.jit. + """ + return request.param + + +# Can parameterize nogil & nopython over True | False, but limiting per +# https://github.com/pandas-dev/pandas/pull/41971#issuecomment-860607472 + + +@pytest.fixture(params=[False]) +def nogil(request): + """ + Fixture for nogil keyword argument for numba.jit. + """ + return request.param + + +@pytest.fixture(params=[True]) +def nopython(request): + """ + Fixture for nopython keyword argument for numba.jit. + """ + return request.param + + # ---------------------------------------------------------------- # Missing values & co.
# ---------------------------------------------------------------- @@ -478,10 +534,6 @@ def index_or_series(request): return request.param -# Generate cartesian product of index_or_series fixture: -index_or_series2 = index_or_series - - @pytest.fixture(params=[Index, Series, pd.array], ids=["index", "series", "array"]) def index_or_series_or_array(request): """ @@ -548,7 +600,7 @@ def multiindex_year_month_day_dataframe_random_data(): """ tdf = DataFrame( np.random.default_rng(2).standard_normal((100, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=100, freq="B"), ) ymd = tdf.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]).sum() @@ -615,47 +667,48 @@ def _create_mi_with_dt64tz_level(): indices_dict = { - "string": Index([f"pandas_{i}" for i in range(100)]), - "datetime": date_range("2020-01-01", periods=100), - "datetime-tz": date_range("2020-01-01", periods=100, tz="US/Pacific"), - "period": period_range("2020-01-01", periods=100, freq="D"), - "timedelta": timedelta_range(start="1 day", periods=100, freq="D"), - "range": RangeIndex(100), - "int8": Index(np.arange(100), dtype="int8"), - "int16": Index(np.arange(100), dtype="int16"), - "int32": Index(np.arange(100), dtype="int32"), - "int64": Index(np.arange(100), dtype="int64"), - "uint8": Index(np.arange(100), dtype="uint8"), - "uint16": Index(np.arange(100), dtype="uint16"), - "uint32": Index(np.arange(100), dtype="uint32"), - "uint64": Index(np.arange(100), dtype="uint64"), - "float32": Index(np.arange(100), dtype="float32"), - "float64": Index(np.arange(100), dtype="float64"), + "object": Index([f"pandas_{i}" for i in range(10)], dtype=object), + "string": Index([f"pandas_{i}" for i in range(10)], dtype="str"), + "datetime": date_range("2020-01-01", periods=10), + "datetime-tz": date_range("2020-01-01", periods=10, tz="US/Pacific"), + "period": period_range("2020-01-01", periods=10, freq="D"), + "timedelta": timedelta_range(start="1 day", periods=10, freq="D"), + "range": RangeIndex(10), + "int8": Index(np.arange(10), dtype="int8"), + "int16": Index(np.arange(10), dtype="int16"), + "int32": Index(np.arange(10), dtype="int32"), + "int64": Index(np.arange(10), dtype="int64"), + "uint8": Index(np.arange(10), dtype="uint8"), + "uint16": Index(np.arange(10), dtype="uint16"), + "uint32": Index(np.arange(10), dtype="uint32"), + "uint64": Index(np.arange(10), dtype="uint64"), + "float32": Index(np.arange(10), dtype="float32"), + "float64": Index(np.arange(10), dtype="float64"), "bool-object": Index([True, False] * 5, dtype=object), "bool-dtype": Index([True, False] * 5, dtype=bool), "complex64": Index( - np.arange(100, dtype="complex64") + 1.0j * np.arange(100, dtype="complex64") + np.arange(10, dtype="complex64") + 1.0j * np.arange(10, dtype="complex64") ), "complex128": Index( - np.arange(100, dtype="complex128") + 1.0j * np.arange(100, dtype="complex128") + np.arange(10, dtype="complex128") + 1.0j * np.arange(10, dtype="complex128") ), - "categorical": CategoricalIndex(list("abcd") * 25), - "interval": IntervalIndex.from_breaks(np.linspace(0, 100, num=101)), + "categorical": CategoricalIndex(list("abcd") * 2), + "interval": IntervalIndex.from_breaks(np.linspace(0, 100, num=11)), "empty": Index([]), "tuples": MultiIndex.from_tuples(zip(["foo", "bar", "baz"], [1, 2, 3])), "mi-with-dt64tz-level": _create_mi_with_dt64tz_level(), "multi": _create_multiindex(), "repeats": Index([0, 0, 1, 1, 2, 2]), - "nullable_int": Index(np.arange(100), dtype="Int64"), - 
"nullable_uint": Index(np.arange(100), dtype="UInt16"), - "nullable_float": Index(np.arange(100), dtype="Float32"), - "nullable_bool": Index(np.arange(100).astype(bool), dtype="boolean"), + "nullable_int": Index(np.arange(10), dtype="Int64"), + "nullable_uint": Index(np.arange(10), dtype="UInt16"), + "nullable_float": Index(np.arange(10), dtype="Float32"), + "nullable_bool": Index(np.arange(10).astype(bool), dtype="boolean"), "string-python": Index( - pd.array([f"pandas_{i}" for i in range(100)], dtype="string[python]") + pd.array([f"pandas_{i}" for i in range(10)], dtype="string[python]") ), } if has_pyarrow: - idx = Index(pd.array([f"pandas_{i}" for i in range(100)], dtype="string[pyarrow]")) + idx = Index(pd.array([f"pandas_{i}" for i in range(10)], dtype="string[pyarrow]")) indices_dict["string-pyarrow"] = idx @@ -674,10 +727,6 @@ def index(request): return indices_dict[request.param].copy() -# Needed to generate cartesian product of indices -index_fixture2 = index - - @pytest.fixture( params=[ key for key, value in indices_dict.items() if not isinstance(value, MultiIndex) @@ -691,10 +740,6 @@ def index_flat(request): return indices_dict[key].copy() -# Alias so we can test with cartesian product of index_flat -index_flat2 = index_flat - - @pytest.fixture( params=[ key @@ -742,7 +787,7 @@ def string_series() -> Series: """ return Series( np.arange(30, dtype=np.float64) * 1.1, - index=Index([f"i_{i}" for i in range(30)], dtype=object), + index=Index([f"i_{i}" for i in range(30)]), name="series", ) @@ -753,7 +798,7 @@ def object_series() -> Series: Fixture for Series of dtype object with Index of unique strings """ data = [f"foo_{i}" for i in range(30)] - index = Index([f"bar_{i}" for i in range(30)], dtype=object) + index = Index([f"bar_{i}" for i in range(30)]) return Series(data, index=index, name="objects", dtype=object) @@ -845,8 +890,8 @@ def int_frame() -> DataFrame: """ return DataFrame( np.ones((30, 4), dtype=np.int64), - index=Index([f"foo_{i}" for i in range(30)], dtype=object), - columns=Index(list("ABCD"), dtype=object), + index=Index([f"foo_{i}" for i in range(30)]), + columns=Index(list("ABCD")), ) @@ -902,6 +947,9 @@ def rand_series_with_duplicate_datetimeindex() -> Series: ] ) def ea_scalar_and_dtype(request): + """ + Fixture that tests each scalar and datetime type. 
+ """ return request.param @@ -1147,19 +1195,19 @@ def deco(*args): "UTC-02:15", tzutc(), tzlocal(), - FixedOffset(300), - FixedOffset(0), - FixedOffset(-300), timezone.utc, timezone(timedelta(hours=1)), timezone(timedelta(hours=-1), name="foo"), ] -if zoneinfo is not None: +if pytz is not None: TIMEZONES.extend( - [ - zoneinfo.ZoneInfo("US/Pacific"), # type: ignore[list-item] - zoneinfo.ZoneInfo("UTC"), # type: ignore[list-item] - ] + ( + pytz.FixedOffset(300), + pytz.FixedOffset(0), + pytz.FixedOffset(-300), + pytz.timezone("US/Pacific"), + pytz.timezone("UTC"), + ) ) TIMEZONE_IDS = [repr(i) for i in TIMEZONES] @@ -1182,13 +1230,10 @@ def tz_aware_fixture(request): return request.param -# Generate cartesian product of tz_aware_fixture: -tz_aware_fixture2 = tz_aware_fixture - +_UTCS = ["utc", "dateutil/UTC", tzutc(), timezone.utc] -_UTCS = ["utc", "dateutil/UTC", utc, tzutc(), timezone.utc] -if zoneinfo is not None: - _UTCS.append(zoneinfo.ZoneInfo("UTC")) +if pytz is not None: + _UTCS.append(pytz.utc) @pytest.fixture(params=_UTCS) @@ -1228,6 +1273,34 @@ def string_dtype(request): return request.param +@pytest.fixture( + params=[ + ("python", pd.NA), + pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), + pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), + ("python", np.nan), + ], + ids=[ + "string=string[python]", + "string=string[pyarrow]", + "string=str[pyarrow]", + "string=str[python]", + ], +) +def string_dtype_no_object(request): + """ + Parametrized fixture for string dtypes. + * 'string[python]' (NA variant) + * 'string[pyarrow]' (NA variant) + * 'str' (NaN variant, with pyarrow) + * 'str' (NaN variant, without pyarrow) + """ + # need to instantiate the StringDtype here instead of in the params + # to avoid importing pyarrow during test collection + storage, na_value = request.param + return pd.StringDtype(storage, na_value) + + @pytest.fixture( params=[ "string[python]", @@ -1244,11 +1317,26 @@ def nullable_string_dtype(request): return request.param +@pytest.fixture( + params=[ + pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), + pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), + ] +) +def pyarrow_string_dtype(request): + """ + Parametrized fixture for string dtypes backed by Pyarrow. + + * 'str[pyarrow]' + * 'string[pyarrow]' + """ + return pd.StringDtype(*request.param) + + @pytest.fixture( params=[ "python", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")), - pytest.param("pyarrow_numpy", marks=td.skip_if_no("pyarrow")), ] ) def string_storage(request): @@ -1257,7 +1345,31 @@ def string_storage(request): * 'python' * 'pyarrow' - * 'pyarrow_numpy' + """ + return request.param + + +@pytest.fixture( + params=[ + ("python", pd.NA), + pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), + pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), + ("python", np.nan), + ], + ids=[ + "string=string[python]", + "string=string[pyarrow]", + "string=str[pyarrow]", + "string=str[python]", + ], +) +def string_dtype_arguments(request): + """ + Parametrized fixture for StringDtype storage and na_value. 
+ + * 'python' + pd.NA + * 'pyarrow' + pd.NA + * 'pyarrow' + np.nan """ return request.param @@ -1280,6 +1392,7 @@ def dtype_backend(request): # Alias so we can test with cartesian product of string_storage string_storage2 = string_storage +string_dtype_arguments2 = string_dtype_arguments @pytest.fixture(params=tm.BYTES_DTYPES) @@ -1306,20 +1419,36 @@ def object_dtype(request): @pytest.fixture( params=[ - "object", - "string[python]", - pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - ] + np.dtype("object"), + ("python", pd.NA), + pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), + pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), + ("python", np.nan), + ], + ids=[ + "string=object", + "string=string[python]", + "string=string[pyarrow]", + "string=str[pyarrow]", + "string=str[python]", + ], ) def any_string_dtype(request): """ Parametrized fixture for string dtypes. * 'object' - * 'string[python]' - * 'string[pyarrow]' + * 'string[python]' (NA variant) + * 'string[pyarrow]' (NA variant) + * 'str' (NaN variant, with pyarrow) + * 'str' (NaN variant, without pyarrow) """ - return request.param + if isinstance(request.param, np.dtype): + return request.param + else: + # need to instantiate the StringDtype here instead of in the params + # to avoid importing pyarrow during test collection + storage, na_value = request.param + return pd.StringDtype(storage, na_value) @pytest.fixture(params=tm.DATETIME64_DTYPES) @@ -1349,7 +1478,7 @@ def fixed_now_ts() -> Timestamp: """ Fixture emits fixed Timestamp.now() """ - return Timestamp( # pyright: ignore[reportGeneralTypeIssues] + return Timestamp( # pyright: ignore[reportReturnType] year=2021, month=1, day=1, hour=12, minute=4, second=13, microsecond=22 ) @@ -1403,6 +1532,21 @@ def complex_dtype(request): return request.param +@pytest.fixture(params=tm.COMPLEX_FLOAT_DTYPES) +def complex_or_float_dtype(request): + """ + Parameterized fixture for complex and numpy float dtypes. + + * complex + * 'complex64' + * 'complex128' + * float + * 'float32' + * 'float64' + """ + return request.param + + @pytest.fixture(params=tm.SIGNED_INT_NUMPY_DTYPES) def any_signed_int_numpy_dtype(request): """ @@ -1642,6 +1786,38 @@ def any_numpy_dtype(request): return request.param +@pytest.fixture(params=tm.ALL_REAL_NULLABLE_DTYPES) +def any_real_nullable_dtype(request): + """ + Parameterized fixture for all real dtypes that can hold NA. + + * float + * 'float32' + * 'float64' + * 'Float32' + * 'Float64' + * 'UInt8' + * 'UInt16' + * 'UInt32' + * 'UInt64' + * 'Int8' + * 'Int16' + * 'Int32' + * 'Int64' + * 'uint8[pyarrow]' + * 'uint16[pyarrow]' + * 'uint32[pyarrow]' + * 'uint64[pyarrow]' + * 'int8[pyarrow]' + * 'int16[pyarrow]' + * 'int32[pyarrow]' + * 'int64[pyarrow]' + * 'float[pyarrow]' + * 'double[pyarrow]' + """ + return request.param + + @pytest.fixture(params=tm.ALL_NUMERIC_DTYPES) def any_numeric_dtype(request): """ @@ -1777,14 +1953,37 @@ def ip(): return InteractiveShell(config=c) -@pytest.fixture(params=["bsr", "coo", "csc", "csr", "dia", "dok", "lil"]) -def spmatrix(request): - """ - Yields scipy sparse matrix classes. +@pytest.fixture +def mpl_cleanup(): """ - sparse = pytest.importorskip("scipy.sparse") + Ensure Matplotlib is cleaned up around a test. + + Before a test is run: - return getattr(sparse, request.param + "_matrix") + 1) Set the backend to "template" to avoid requiring a GUI. 
+ + After a test is run: + + 1) Reset units registry + 2) Reset rc_context + 3) Close all figures + + See matplotlib/testing/decorators.py#L24. + """ + mpl = pytest.importorskip("matplotlib") + mpl_units = pytest.importorskip("matplotlib.units") + plt = pytest.importorskip("matplotlib.pyplot") + orig_units_registry = mpl_units.registry.copy() + try: + with mpl.rc_context(): + mpl.use("template") + yield + finally: + mpl_units.registry.clear() + mpl_units.registry.update(orig_units_registry) + plt.close("all") + # https://matplotlib.org/stable/users/prev_whats_new/whats_new_3.6.0.html#garbage-collection-is-no-longer-run-on-figure-close # noqa: E501 + gc.collect(1) @pytest.fixture( @@ -1877,34 +2076,14 @@ def indexer_ial(request): return request.param -@pytest.fixture -def using_array_manager() -> bool: - """ - Fixture to check if the array manager is being used. - """ - return _get_option("mode.data_manager", silent=True) == "array" - - -@pytest.fixture -def using_copy_on_write() -> bool: - """ - Fixture to check if Copy-on-Write is enabled. - """ - return ( - pd.options.mode.copy_on_write is True - and _get_option("mode.data_manager", silent=True) == "block" - ) - - -@pytest.fixture -def warn_copy_on_write() -> bool: +@pytest.fixture(params=[True, False]) +def performance_warning(request) -> Iterator[bool | type[Warning]]: """ - Fixture to check if Copy-on-Write is in warning mode. + Fixture to check if performance warnings are enabled. Either produces + ``PerformanceWarning`` if they are enabled, otherwise ``False``. """ - return ( - pd.options.mode.copy_on_write == "warn" - and _get_option("mode.data_manager", silent=True) == "block" - ) + with pd.option_context("mode.performance_warnings", request.param): + yield pd.errors.PerformanceWarning if request.param else False @pytest.fixture @@ -1915,12 +2094,12 @@ def using_infer_string() -> bool: return pd.options.future.infer_string is True -warsaws = ["Europe/Warsaw", "dateutil/Europe/Warsaw"] -if zoneinfo is not None: - warsaws.append(zoneinfo.ZoneInfo("Europe/Warsaw")) # type: ignore[arg-type] +_warsaws: list[Any] = ["Europe/Warsaw", "dateutil/Europe/Warsaw"] +if pytz is not None: + _warsaws.append(pytz.timezone("Europe/Warsaw")) -@pytest.fixture(params=warsaws) +@pytest.fixture(params=_warsaws) def warsaw(request) -> str: """ tzinfo for Europe/Warsaw using pytz, dateutil, or zoneinfo. @@ -1928,6 +2107,12 @@ def warsaw(request) -> str: return request.param -@pytest.fixture() -def arrow_string_storage(): - return ("pyarrow", "pyarrow_numpy") +@pytest.fixture +def temp_file(tmp_path): + """ + Generate a unique file for testing use. See link for removal policy. 
+ https://docs.pytest.org/en/7.1.x/how-to/tmp_path.html#the-default-base-temporary-directory + """ + file_path = tmp_path / str(uuid.uuid4()) + file_path.touch() + return file_path diff --git a/pandas/core/_numba/executor.py b/pandas/core/_numba/executor.py index 0a26acb7df60a..3f3ebe8dbe023 100644 --- a/pandas/core/_numba/executor.py +++ b/pandas/core/_numba/executor.py @@ -4,16 +4,18 @@ from typing import ( TYPE_CHECKING, Any, - Callable, ) if TYPE_CHECKING: + from collections.abc import Callable from pandas._typing import Scalar import numpy as np from pandas.compat._optional import import_optional_dependency +from pandas.core.util.numba_ import jit_user_function + @functools.cache def generate_apply_looper(func, nopython=True, nogil=True, parallel=False): @@ -21,10 +23,10 @@ def generate_apply_looper(func, nopython=True, nogil=True, parallel=False): import numba else: numba = import_optional_dependency("numba") - nb_compat_func = numba.extending.register_jitable(func) + nb_compat_func = jit_user_function(func) @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) - def nb_looper(values, axis): + def nb_looper(values, axis, *args): # Operate on the first row/col in order to get # the output shape if axis == 0: @@ -33,7 +35,7 @@ def nb_looper(values, axis): else: first_elem = values[0] dim0 = values.shape[0] - res0 = nb_compat_func(first_elem) + res0 = nb_compat_func(first_elem, *args) # Use np.asarray to get shape for # https://github.com/numba/numba/issues/4202#issuecomment-1185981507 buf_shape = (dim0,) + np.atleast_1d(np.asarray(res0)).shape @@ -44,11 +46,11 @@ def nb_looper(values, axis): if axis == 1: buff[0] = res0 for i in numba.prange(1, values.shape[0]): - buff[i] = nb_compat_func(values[i]) + buff[i] = nb_compat_func(values[i], *args) else: buff[:, 0] = res0 for j in numba.prange(1, values.shape[1]): - buff[:, j] = nb_compat_func(values[:, j]) + buff[:, j] = nb_compat_func(values[:, j], *args) return buff return nb_looper diff --git a/pandas/core/_numba/extensions.py b/pandas/core/_numba/extensions.py index ee09c9380fb0f..413fdafc7fd04 100644 --- a/pandas/core/_numba/extensions.py +++ b/pandas/core/_numba/extensions.py @@ -12,6 +12,7 @@ from contextlib import contextmanager import operator +from typing import TYPE_CHECKING import numba from numba import types @@ -40,6 +41,9 @@ from pandas.core.internals import SingleBlockManager from pandas.core.series import Series +if TYPE_CHECKING: + from pandas._typing import Self + # Helper function to hack around fact that Index casts numpy string dtype to object # @@ -49,7 +53,8 @@ @contextmanager def set_numba_data(index: Index): numba_data = index._data - if numba_data.dtype == object: + if numba_data.dtype in (object, "string"): + numba_data = np.asarray(numba_data) if not lib.is_string_array(numba_data): raise ValueError( "The numba engine only supports using string or numeric column names" @@ -84,7 +89,7 @@ def key(self): def as_array(self): return types.Array(self.dtype, 1, self.layout) - def copy(self, dtype=None, ndim: int = 1, layout=None): + def copy(self, dtype=None, ndim: int = 1, layout=None) -> Self: assert ndim == 1 if dtype is None: dtype = self.dtype @@ -114,7 +119,7 @@ def key(self): def as_array(self): return self.values - def copy(self, dtype=None, ndim: int = 1, layout: str = "C"): + def copy(self, dtype=None, ndim: int = 1, layout: str = "C") -> Self: assert ndim == 1 assert layout == "C" if dtype is None: @@ -123,7 +128,7 @@ def copy(self, dtype=None, ndim: int = 1, layout: str = "C"): 
@typeof_impl.register(Index) -def typeof_index(val, c): +def typeof_index(val, c) -> IndexType: """ This will assume that only strings are in object dtype index. @@ -136,7 +141,7 @@ def typeof_index(val, c): @typeof_impl.register(Series) -def typeof_series(val, c): +def typeof_series(val, c) -> SeriesType: index = typeof_impl(val.index, c) arrty = typeof_impl(val.values, c) namety = typeof_impl(val.name, c) @@ -532,7 +537,7 @@ def key(self): @typeof_impl.register(_iLocIndexer) -def typeof_iloc(val, c): +def typeof_iloc(val, c) -> IlocType: objtype = typeof_impl(val.obj, c) return IlocType(objtype) diff --git a/pandas/core/_numba/kernels/__init__.py b/pandas/core/_numba/kernels/__init__.py index 1116c61c4ca8e..6983711480455 100644 --- a/pandas/core/_numba/kernels/__init__.py +++ b/pandas/core/_numba/kernels/__init__.py @@ -16,12 +16,12 @@ ) __all__ = [ - "sliding_mean", "grouped_mean", - "sliding_sum", + "grouped_min_max", "grouped_sum", - "sliding_var", "grouped_var", + "sliding_mean", "sliding_min_max", - "grouped_min_max", + "sliding_sum", + "sliding_var", ] diff --git a/pandas/core/_numba/kernels/mean_.py b/pandas/core/_numba/kernels/mean_.py index f415804781753..2b59ea2fe12a5 100644 --- a/pandas/core/_numba/kernels/mean_.py +++ b/pandas/core/_numba/kernels/mean_.py @@ -6,6 +6,7 @@ Mirrors pandas/_libs/window/aggregation.pyx """ + from __future__ import annotations from typing import TYPE_CHECKING @@ -107,7 +108,7 @@ def sliding_mean( neg_ct, compensation_add, num_consecutive_same_value, - prev_value, # pyright: ignore[reportGeneralTypeIssues] + prev_value, # pyright: ignore[reportArgumentType] ) else: for j in range(start[i - 1], s): @@ -132,7 +133,7 @@ def sliding_mean( neg_ct, compensation_add, num_consecutive_same_value, - prev_value, # pyright: ignore[reportGeneralTypeIssues] + prev_value, # pyright: ignore[reportArgumentType] ) if nobs >= min_periods and nobs > 0: @@ -168,9 +169,10 @@ def grouped_mean( labels: npt.NDArray[np.intp], ngroups: int, min_periods: int, + skipna: bool, ) -> tuple[np.ndarray, list[int]]: output, nobs_arr, comp_arr, consecutive_counts, prev_vals = grouped_kahan_sum( - values, result_dtype, labels, ngroups + values, result_dtype, labels, ngroups, skipna ) # Post-processing, replace sums that don't satisfy min_periods diff --git a/pandas/core/_numba/kernels/min_max_.py b/pandas/core/_numba/kernels/min_max_.py index c9803980e64a6..c03f20c871012 100644 --- a/pandas/core/_numba/kernels/min_max_.py +++ b/pandas/core/_numba/kernels/min_max_.py @@ -6,9 +6,13 @@ Mirrors pandas/_libs/window/aggregation.pyx """ + from __future__ import annotations -from typing import TYPE_CHECKING +from typing import ( + TYPE_CHECKING, + Any, +) import numba import numpy as np @@ -17,6 +21,20 @@ from pandas._typing import npt +@numba.njit(nogil=True, parallel=False) +def bisect_left(a: list[Any], x: Any, lo: int = 0, hi: int = -1) -> int: + """Same as https://docs.python.org/3/library/bisect.html; not in numba yet!""" + if hi == -1: + hi = len(a) + while lo < hi: + mid = (lo + hi) // 2 + if a[mid] < x: + lo = mid + 1 + else: + hi = mid + return lo + + @numba.jit(nopython=True, nogil=True, parallel=False) def sliding_min_max( values: np.ndarray, @@ -26,55 +44,87 @@ def sliding_min_max( min_periods: int, is_max: bool, ) -> tuple[np.ndarray, list[int]]: + # Basic idea of the algorithm: https://stackoverflow.com/a/12239580 + # It was generalized to work with an arbitrary list of any window size and position + # by adding the Dominators stack. 
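The comment above cites the classic monotonic-queue trick that the rewritten kernel generalizes to arbitrary window bounds. For reference, a plain-Python sketch of the fixed-window case (not part of the diff, and omitting the NaN and min_periods handling the kernel needs); the front of the queue is always the current maximum, giving O(n) overall:

from collections import deque

def sliding_max_fixed(values: list[float], window: int) -> list[float]:
    out: list[float] = []
    q: deque[int] = deque()  # indices; values[q[0]] is the current max
    for i, v in enumerate(values):
        while q and values[q[-1]] <= v:
            q.pop()  # drop entries dominated by the incoming value
        q.append(i)
        if q[0] <= i - window:
            q.popleft()  # expire the index that slid out of the window
        if i >= window - 1:
            out.append(values[q[0]])
    return out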
+ N = len(start) - nobs = 0 - output = np.empty(N, dtype=result_dtype) na_pos = [] - # Use deque once numba supports it - # https://github.com/numba/numba/issues/7417 - Q: list = [] - W: list = [] - for i in range(N): - curr_win_size = end[i] - start[i] - if i == 0: - st = start[i] - else: - st = end[i - 1] - - for k in range(st, end[i]): - ai = values[k] - if not np.isnan(ai): - nobs += 1 - elif is_max: - ai = -np.inf - else: - ai = np.inf - # Discard previous entries if we find new min or max - if is_max: - while Q and ((ai >= values[Q[-1]]) or values[Q[-1]] != values[Q[-1]]): - Q.pop() - else: - while Q and ((ai <= values[Q[-1]]) or values[Q[-1]] != values[Q[-1]]): - Q.pop() - Q.append(k) - W.append(k) - - # Discard entries outside and left of current window - while Q and Q[0] <= start[i] - 1: - Q.pop(0) - while W and W[0] <= start[i] - 1: - if not np.isnan(values[W[0]]): - nobs -= 1 - W.pop(0) - - # Save output based on index in input value array - if Q and curr_win_size > 0 and nobs >= min_periods: - output[i] = values[Q[0]] + output = np.empty(N, dtype=result_dtype) + + def cmp(a: Any, b: Any, is_max: bool) -> bool: + if is_max: + return a >= b else: + return a <= b + + # Indices of bounded extrema in `values`. `candidates[i]` is always increasing. + # `values[candidates[i]]` is decreasing for max and increasing for min. + candidates: list[int] = [] # this is a queue + # Indices of largest windows that "cover" preceding windows. + dominators: list[int] = [] # this is a stack + + if min_periods < 1: + min_periods = 1 + + if N > 2: + i_next = N - 1 # equivalent to i_next = i+1 inside the loop + for i in range(N - 2, -1, -1): + next_dominates = start[i_next] < start[i] + if next_dominates and ( + not dominators or start[dominators[-1]] > start[i_next] + ): + dominators.append(i_next) + i_next = i + + # NaN tracking to guarantee min_periods + valid_start = -min_periods + + last_end = 0 + last_start = -1 + + for i in range(N): + this_start = start[i].item() + this_end = end[i].item() + + if dominators and dominators[-1] == i: + dominators.pop() + + if not ( + this_end > last_end or (this_end == last_end and this_start >= last_start) + ): + raise ValueError( + "Start/End ordering requirement is violated at index " + str(i) + ) + + stash_start = ( + this_start if not dominators else min(this_start, start[dominators[-1]]) + ) + while candidates and candidates[0] < stash_start: + candidates.pop(0) + + for k in range(last_end, this_end): + if not np.isnan(values[k]): + valid_start += 1 + while valid_start >= 0 and np.isnan(values[valid_start]): + valid_start += 1 + while candidates and cmp(values[k], values[candidates[-1]], is_max): + candidates.pop() # Q.pop_back() + candidates.append(k) # Q.push_back(k) + + if not candidates or (this_start > valid_start): if values.dtype.kind != "i": output[i] = np.nan else: na_pos.append(i) + elif candidates[0] >= this_start: + # ^^ This is here to avoid costly bisection for fixed window sizes. 
+ output[i] = values[candidates[0]] + else: + q_idx = bisect_left(candidates, this_start, lo=1) + output[i] = values[candidates[q_idx]] + last_end = this_end + last_start = this_start return output, na_pos @@ -87,6 +137,7 @@ def grouped_min_max( ngroups: int, min_periods: int, is_max: bool, + skipna: bool = True, ) -> tuple[np.ndarray, list[int]]: N = len(labels) nobs = np.zeros(ngroups, dtype=np.int64) @@ -96,13 +147,16 @@ def grouped_min_max( for i in range(N): lab = labels[i] val = values[i] - if lab < 0: + if lab < 0 or (not skipna and nobs[lab] >= 1 and np.isnan(output[lab])): continue if values.dtype.kind == "i" or not np.isnan(val): nobs[lab] += 1 else: - # NaN value cannot be a min/max value + if not skipna: + # If skipna is False and we encounter a NaN, + # both min and max of the group will be NaN + output[lab] = np.nan continue if nobs[lab] == 1: diff --git a/pandas/core/_numba/kernels/sum_.py b/pandas/core/_numba/kernels/sum_.py index 94db84267ceec..9f2e9541b31d0 100644 --- a/pandas/core/_numba/kernels/sum_.py +++ b/pandas/core/_numba/kernels/sum_.py @@ -6,6 +6,7 @@ Mirrors pandas/_libs/window/aggregation.pyx """ + from __future__ import annotations from typing import ( @@ -164,6 +165,7 @@ def grouped_kahan_sum( result_dtype: np.dtype, labels: npt.NDArray[np.intp], ngroups: int, + skipna: bool, ) -> tuple[ np.ndarray, npt.NDArray[np.int64], np.ndarray, npt.NDArray[np.int64], np.ndarray ]: @@ -179,7 +181,15 @@ def grouped_kahan_sum( lab = labels[i] val = values[i] - if lab < 0: + if lab < 0 or np.isnan(output[lab]): + continue + + if not skipna and np.isnan(val): + output[lab] = np.nan + nobs_arr[lab] += 1 + comp_arr[lab] = np.nan + consecutive_counts[lab] = 1 + prev_vals[lab] = np.nan continue sum_x = output[lab] @@ -218,11 +228,12 @@ def grouped_sum( labels: npt.NDArray[np.intp], ngroups: int, min_periods: int, + skipna: bool, ) -> tuple[np.ndarray, list[int]]: na_pos = [] output, nobs_arr, comp_arr, consecutive_counts, prev_vals = grouped_kahan_sum( - values, result_dtype, labels, ngroups + values, result_dtype, labels, ngroups, skipna ) # Post-processing, replace sums that don't satisfy min_periods diff --git a/pandas/core/_numba/kernels/var_.py b/pandas/core/_numba/kernels/var_.py index c63d0b90b0fc3..5d720c877815d 100644 --- a/pandas/core/_numba/kernels/var_.py +++ b/pandas/core/_numba/kernels/var_.py @@ -6,6 +6,7 @@ Mirrors pandas/_libs/window/aggregation.pyx """ + from __future__ import annotations from typing import TYPE_CHECKING @@ -175,6 +176,7 @@ def grouped_var( ngroups: int, min_periods: int, ddof: int = 1, + skipna: bool = True, ) -> tuple[np.ndarray, list[int]]: N = len(labels) @@ -189,7 +191,11 @@ def grouped_var( lab = labels[i] val = values[i] - if lab < 0: + if lab < 0 or np.isnan(output[lab]): + continue + + if not skipna and np.isnan(val): + output[lab] = np.nan continue mean_x = means[lab] diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index 698abb2ec4989..0331c26c805b6 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -4,10 +4,12 @@ that can be mixed into or pinned onto other pandas classes. 
""" + from __future__ import annotations +import functools from typing import ( - Callable, + TYPE_CHECKING, final, ) import warnings @@ -15,6 +17,14 @@ from pandas.util._decorators import doc from pandas.util._exceptions import find_stack_level +if TYPE_CHECKING: + from collections.abc import Callable + + from pandas._typing import TypeT + + from pandas import Index + from pandas.core.generic import NDFrame + class DirNamesMixin: _accessors: set[str] = set() @@ -54,7 +64,7 @@ class PandasDelegate: def _delegate_property_get(self, name: str, *args, **kwargs): raise TypeError(f"You cannot access the property {name}") - def _delegate_property_set(self, name: str, value, *args, **kwargs): + def _delegate_property_set(self, name: str, value, *args, **kwargs) -> None: raise TypeError(f"The property {name} cannot be set") def _delegate_method(self, name: str, *args, **kwargs): @@ -108,12 +118,12 @@ def _setter(self, new_values): ) def _create_delegator_method(name: str): + method = getattr(delegate, accessor_mapping(name)) + + @functools.wraps(method) def f(self, *args, **kwargs): return self._delegate_method(name, *args, **kwargs) - f.__name__ = name - f.__doc__ = getattr(delegate, accessor_mapping(name)).__doc__ - return f for name in accessors: @@ -187,17 +197,11 @@ def add_delegate_accessors(cls): return add_delegate_accessors -# Ported with modifications from xarray; licence at LICENSES/XARRAY_LICENSE -# https://github.com/pydata/xarray/blob/master/xarray/core/extensions.py -# 1. We don't need to catch and re-raise AttributeErrors as RuntimeErrors -# 2. We use a UserWarning instead of a custom Warning - - -class CachedAccessor: +class Accessor: """ Custom property-like object. - A descriptor for caching accessors. + A descriptor for accessors. Parameters ---------- @@ -221,17 +225,18 @@ def __get__(self, obj, cls): if obj is None: # we're accessing the attribute of the class, i.e., Dataset.geo return self._accessor - accessor_obj = self._accessor(obj) - # Replace the property with the accessor object. Inspired by: - # https://www.pydanny.com/cached-property.html - # We need to use object.__setattr__ because we overwrite __setattr__ on - # NDFrame - object.__setattr__(obj, self._name, accessor_obj) - return accessor_obj + return self._accessor(obj) + +# Alias kept for downstream libraries +# TODO: Deprecate as name is now misleading +CachedAccessor = Accessor -@doc(klass="", others="") -def _register_accessor(name: str, cls): + +@doc(klass="", examples="", others="") +def _register_accessor( + name: str, cls: type[NDFrame | Index] +) -> Callable[[TypeT], TypeT]: """ Register a custom accessor on {klass} objects. @@ -254,87 +259,137 @@ def _register_accessor(name: str, cls): Notes ----- - When accessed, your accessor will be initialized with the pandas object - the user is interacting with. So the signature must be - - .. code-block:: python - - def __init__(self, pandas_object): # noqa: E999 - ... - - For consistency with pandas methods, you should raise an ``AttributeError`` - if the data passed to your accessor has an incorrect dtype. - - >>> pd.Series(['a', 'b']).dt - Traceback (most recent call last): - ... - AttributeError: Can only use .dt accessor with datetimelike values + This function allows you to register a custom-defined accessor class for {klass}. 
+ The requirements for the accessor class are as follows: - Examples - -------- - In your library code:: - - import pandas as pd + * Must contain an init method that: - @pd.api.extensions.register_dataframe_accessor("geo") - class GeoAccessor: - def __init__(self, pandas_obj): - self._obj = pandas_obj + * accepts a single {klass} object - @property - def center(self): - # return the geographic center point of this DataFrame - lat = self._obj.latitude - lon = self._obj.longitude - return (float(lon.mean()), float(lat.mean())) + * raises an AttributeError if the {klass} object does not have correctly + matching inputs for the accessor - def plot(self): - # plot this array's data on a map, e.g., using Cartopy - pass + * Must contain a method for each access pattern. - Back in an interactive IPython session: + * The methods should be able to take any argument signature. - .. code-block:: ipython + * Accessible using the @property decorator if no additional arguments are + needed. - In [1]: ds = pd.DataFrame({{"longitude": np.linspace(0, 10), - ...: "latitude": np.linspace(0, 20)}}) - In [2]: ds.geo.center - Out[2]: (5.0, 10.0) - In [3]: ds.geo.plot() # plots data on a map + Examples + -------- + {examples} """ - def decorator(accessor): + def decorator(accessor: TypeT) -> TypeT: if hasattr(cls, name): warnings.warn( - f"registration of accessor {repr(accessor)} under name " - f"{repr(name)} for type {repr(cls)} is overriding a preexisting " + f"registration of accessor {accessor!r} under name " + f"{name!r} for type {cls!r} is overriding a preexisting " f"attribute with the same name.", UserWarning, stacklevel=find_stack_level(), ) - setattr(cls, name, CachedAccessor(name, accessor)) + setattr(cls, name, Accessor(name, accessor)) cls._accessors.add(name) return accessor return decorator -@doc(_register_accessor, klass="DataFrame") -def register_dataframe_accessor(name: str): +_register_df_examples = """ +An accessor that only accepts integers could +have a class defined like this: + +>>> @pd.api.extensions.register_dataframe_accessor("int_accessor") +... class IntAccessor: +... def __init__(self, pandas_obj): +... if not all(pandas_obj[col].dtype == 'int64' for col in pandas_obj.columns): +... raise AttributeError("All columns must contain integer values only") +... self._obj = pandas_obj +... +... def sum(self): +... return self._obj.sum() +... +>>> df = pd.DataFrame([[1, 2], ['x', 'y']]) +>>> df.int_accessor +Traceback (most recent call last): +... +AttributeError: All columns must contain integer values only. +>>> df = pd.DataFrame([[1, 2], [3, 4]]) +>>> df.int_accessor.sum() +0 4 +1 6 +dtype: int64""" + + +@doc(_register_accessor, klass="DataFrame", examples=_register_df_examples) +def register_dataframe_accessor(name: str) -> Callable[[TypeT], TypeT]: from pandas import DataFrame return _register_accessor(name, DataFrame) -@doc(_register_accessor, klass="Series") -def register_series_accessor(name: str): +_register_series_examples = """ +An accessor that only accepts integers could +have a class defined like this: + +>>> @pd.api.extensions.register_series_accessor("int_accessor") +... class IntAccessor: +... def __init__(self, pandas_obj): +... if not pandas_obj.dtype == 'int64': +... raise AttributeError("The series must contain integer data only") +... self._obj = pandas_obj +... +... def sum(self): +... return self._obj.sum() +... +>>> df = pd.Series([1, 2, 'x']) +>>> df.int_accessor +Traceback (most recent call last): +... +AttributeError: The series must contain integer data only. 
+>>> df = pd.Series([1, 2, 3]) +>>> df.int_accessor.sum() +np.int64(6)""" + + +@doc(_register_accessor, klass="Series", examples=_register_series_examples) +def register_series_accessor(name: str) -> Callable[[TypeT], TypeT]: from pandas import Series return _register_accessor(name, Series) -@doc(_register_accessor, klass="Index") -def register_index_accessor(name: str): +_register_index_examples = """ +An accessor that only accepts integers could +have a class defined like this: + +>>> @pd.api.extensions.register_index_accessor("int_accessor") +... class IntAccessor: +... def __init__(self, pandas_obj): +... if not all(isinstance(x, int) for x in pandas_obj): +... raise AttributeError("The index must only be an integer value") +... self._obj = pandas_obj +... +... def even(self): +... return [x for x in self._obj if x % 2 == 0] +>>> df = pd.DataFrame.from_dict( +... {"row1": {"1": 1, "2": "a"}, "row2": {"1": 2, "2": "b"}}, orient="index" +... ) +>>> df.index.int_accessor +Traceback (most recent call last): +... +AttributeError: The index must only be an integer value. +>>> df = pd.DataFrame( +... {"col1": [1, 2, 3, 4], "col2": ["a", "b", "c", "d"]}, index=[1, 2, 5, 8] +... ) +>>> df.index.int_accessor.even() +[2, 8]""" + + +@doc(_register_accessor, klass="Index", examples=_register_index_examples) +def register_index_accessor(name: str) -> Callable[[TypeT], TypeT]: from pandas import Index return _register_accessor(name, Index) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 15a07da76d2f7..7fc391d3ffb51 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -2,6 +2,7 @@ Generic data algorithms. This module is experimental at the moment and not intended for public consumption """ + from __future__ import annotations import decimal @@ -22,9 +23,11 @@ iNaT, lib, ) +from pandas._libs.missing import NA from pandas._typing import ( AnyArrayLike, ArrayLike, + ArrayLikeT, AxisInt, DtypeObj, TakeIndexer, @@ -41,11 +44,12 @@ ensure_float64, ensure_object, ensure_platform_int, - is_array_like, is_bool_dtype, is_complex_dtype, is_dict_like, + is_dtype_equal, is_extension_array_dtype, + is_float, is_float_dtype, is_integer, is_integer_dtype, @@ -66,6 +70,7 @@ ABCExtensionArray, ABCIndex, ABCMultiIndex, + ABCNumpyExtensionArray, ABCSeries, ABCTimedeltaArray, ) @@ -182,8 +187,8 @@ def _ensure_data(values: ArrayLike) -> np.ndarray: def _reconstruct_data( - values: ArrayLike, dtype: DtypeObj, original: AnyArrayLike -) -> ArrayLike: + values: ArrayLikeT, dtype: DtypeObj, original: AnyArrayLike +) -> ArrayLikeT: """ reverse of _ensure_data @@ -206,10 +211,12 @@ def _reconstruct_data( # that values.dtype == dtype cls = dtype.construct_array_type() - values = cls._from_sequence(values, dtype=dtype) + # error: Incompatible types in assignment (expression has type + # "ExtensionArray", variable has type "ndarray[Any, Any]") + values = cls._from_sequence(values, dtype=dtype) # type: ignore[assignment] else: - values = values.astype(dtype, copy=False) + values = values.astype(dtype, copy=False) # type: ignore[assignment] return values @@ -218,16 +225,17 @@ def _ensure_arraylike(values, func_name: str) -> ArrayLike: """ ensure that we are arraylike if not already """ - if not isinstance(values, (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray)): + if not isinstance( + values, + (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray, ABCNumpyExtensionArray), + ): # GH#52986 if func_name != "isin-targets": # Make an exception for the comps argument in isin. 
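# A sketch of the behavior change made just below (assuming a pandas build
# that includes this patch): non-array-like inputs now fail fast with a
# TypeError instead of emitting a FutureWarning:
#
#     import pandas as pd
#
#     pd.unique(pd.Series([1, 1, 2]))  # array([1, 2]) -- unchanged
#     pd.unique([1, 1, 2])             # TypeError: unique requires a Series,
#                                      # Index, ExtensionArray, np.ndarray ...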
- warnings.warn( - f"{func_name} with argument that is not not a Series, Index, " - "ExtensionArray, or np.ndarray is deprecated and will raise in a " - "future version.", - FutureWarning, - stacklevel=find_stack_level(), + raise TypeError( + f"{func_name} requires a Series, Index, " + f"ExtensionArray, np.ndarray or NumpyExtensionArray " + f"got {type(values).__name__}." ) inferred = lib.infer_dtype(values, skipna=False) @@ -259,7 +267,9 @@ def _ensure_arraylike(values, func_name: str) -> ArrayLike: } -def _get_hashtable_algo(values: np.ndarray): +def _get_hashtable_algo( + values: np.ndarray, +) -> tuple[type[htable.HashTable], np.ndarray]: """ Parameters ---------- @@ -316,10 +326,12 @@ def unique(values): Parameters ---------- values : 1d array-like + The input array-like object containing values from which to extract + unique values. Returns ------- - numpy.ndarray or ExtensionArray + numpy.ndarray, ExtensionArray or NumpyExtensionArray The return can be: @@ -327,7 +339,7 @@ def unique(values): * Categorical : when the input is a Categorical dtype * ndarray : when the input is a Series/ndarray - Return numpy.ndarray or ExtensionArray. + Return numpy.ndarray, ExtensionArray or NumpyExtensionArray. See Also -------- @@ -343,14 +355,15 @@ def unique(values): array([2, 1]) >>> pd.unique(pd.Series([pd.Timestamp("20160101"), pd.Timestamp("20160101")])) - array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]') + array(['2016-01-01T00:00:00'], dtype='datetime64[s]') >>> pd.unique( ... pd.Series( ... [ ... pd.Timestamp("20160101", tz="US/Eastern"), ... pd.Timestamp("20160101", tz="US/Eastern"), - ... ] + ... ], + ... dtype="M8[ns, US/Eastern]", ... ) ... ) @@ -362,7 +375,8 @@ def unique(values): ... [ ... pd.Timestamp("20160101", tz="US/Eastern"), ... pd.Timestamp("20160101", tz="US/Eastern"), - ... ] + ... ], + ... dtype="M8[ns, US/Eastern]", ... ) ... ) DatetimeIndex(['2016-01-01 00:00:00-05:00'], @@ -397,6 +411,13 @@ def unique(values): >>> pd.unique(pd.Series([("a", "b"), ("b", "a"), ("a", "c"), ("b", "a")]).values) array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object) + + An NumpyExtensionArray of complex + + >>> pd.unique(pd.array([1 + 1j, 2, 3])) + + [(1+1j), (2+0j), (3+0j)] + Length: 3, dtype: complex128 """ return unique_with_mask(values) @@ -432,6 +453,10 @@ def unique_with_mask(values, mask: npt.NDArray[np.bool_] | None = None): # Dispatch to extension dtype's unique. return values.unique() + if isinstance(values, ABCIndex): + # Dispatch to Index's unique. + return values.unique() + original = values hashtable, values = _get_hashtable_algo(values) @@ -487,6 +512,7 @@ def isin(comps: ListLike, values: ListLike) -> npt.NDArray[np.bool_]: len(values) > 0 and values.dtype.kind in "iufcb" and not is_signed_integer_dtype(comps) + and not is_dtype_equal(values, comps) ): # GH#46485 Use object to avoid upcast to float64 later # TODO: Share with _find_common_type_compat @@ -521,10 +547,15 @@ def isin(comps: ListLike, values: ListLike) -> npt.NDArray[np.bool_]: # Ensure np.isin doesn't get object types or it *may* throw an exception # Albeit hashmap has O(1) look-up (vs. 
O(logn) in sorted array), - isin is faster for small sizes + + # GH60678 + # Ensure values don't contain <NA>, otherwise it throws exception with np.in1d + if ( len(comps_array) > _MINIMUM_COMP_ARR_LEN and len(values) <= 26 and comps_array.dtype != object + and not any(v is NA for v in values) ): # If the values include nan we need to check for nan explicitly # since np.nan is not equal to np.nan @@ -812,53 +843,6 @@ def factorize( return codes, uniques -def value_counts( - values, - sort: bool = True, - ascending: bool = False, - normalize: bool = False, - bins=None, - dropna: bool = True, -) -> Series: - """ - Compute a histogram of the counts of non-null values. - - Parameters - ---------- - values : ndarray (1-d) - sort : bool, default True - Sort by values - ascending : bool, default False - Sort in ascending order - normalize: bool, default False - If True then compute a relative histogram - bins : integer, optional - Rather than count values, group them into half-open bins, - convenience for pd.cut, only works with numeric data - dropna : bool, default True - Don't include counts of NaN - - Returns - ------- - Series - """ - warnings.warn( - # GH#53493 - "pandas.value_counts is deprecated and will be removed in a " - "future version. Use pd.Series(obj).value_counts() instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - return value_counts_internal( - values, - sort=sort, - ascending=ascending, - normalize=normalize, - bins=bins, - dropna=dropna, - ) - - def value_counts_internal( values, sort: bool = True, @@ -928,26 +912,9 @@ def value_counts_internal( if keys.dtype == np.float16: keys = keys.astype(np.float32) - # For backwards compatibility, we let Index do its normal type - # inference, _except_ for if it infers from object to bool. - idx = Index(keys) - if idx.dtype == bool and keys.dtype == object: - idx = idx.astype(object) - elif ( - idx.dtype != keys.dtype # noqa: PLR1714 # # pylint: disable=R1714 - and idx.dtype != "string[pyarrow_numpy]" - ): - warnings.warn( - # GH#56161 - "The behavior of value_counts with object-dtype is deprecated. " - "In a future version, this will *not* perform dtype inference " - "on the resulting index. To retain the old behavior, use " - "`result.index = result.index.infer_objects()`", - FutureWarning, - stacklevel=find_stack_level(), - ) - idx.name = index_name - + # Starting in 3.0, we no longer perform dtype inference on the + # Index object we construct here, xref GH#56161 + idx = Index(keys, dtype=keys.dtype, name=index_name) result = Series(counts, index=idx, name=name, copy=False) if sort: @@ -1022,7 +989,7 @@ def duplicated( def mode( values: ArrayLike, dropna: bool = True, mask: npt.NDArray[np.bool_] | None = None -) -> ArrayLike: +) -> tuple[np.ndarray, npt.NDArray[np.bool_]] | ExtensionArray: """ Returns the mode(s) of an array.
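# A minimal sketch of why the `v is NA` guard above is needed (GH60678):
# comparisons against pd.NA return NA, whose truth value is ambiguous, so the
# fast np.isin path can raise and pandas must fall back to the hashtable path:

    import numpy as np
    import pandas as pd

    comps = np.array([1.0, 2.0])
    pd.NA == 1.0  # <NA> -- not a plain bool
    # np.isin(comps, [1.0, pd.NA]) can raise
    # TypeError: boolean value of NA is ambiguous
    pd.Series(comps).isin([1.0, pd.NA]).tolist()  # [True, False]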
@@ -1035,7 +1002,7 @@ def mode( Returns ------- - np.ndarray or ExtensionArray + Union[Tuple[np.ndarray, npt.NDArray[np.bool_]], ExtensionArray] """ values = _ensure_arraylike(values, func_name="mode") original = values @@ -1049,11 +1016,13 @@ def mode( values = _ensure_data(values) npresult, res_mask = htable.mode(values, dropna=dropna, mask=mask) - if res_mask is not None: - return npresult, res_mask # type: ignore[return-value] + if res_mask is None: + res_mask = np.zeros(npresult.shape, dtype=np.bool_) + else: + return npresult, res_mask try: - npresult = np.sort(npresult) + npresult = safe_sort(npresult) except TypeError as err: warnings.warn( f"Unable to sort modes: {err}", @@ -1061,7 +1030,7 @@ def mode( ) result = _reconstruct_data(npresult, original.dtype, original) - return result + return result, res_mask def rank( @@ -1209,34 +1178,40 @@ def take( >>> pd.api.extensions.take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True) array([10., 10., nan]) - >>> pd.api.extensions.take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True, - ... fill_value=-10) + >>> pd.api.extensions.take( + ... np.array([10, 20, 30]), [0, 0, -1], allow_fill=True, fill_value=-10 + ... ) array([ 10, 10, -10]) """ - if not isinstance(arr, (np.ndarray, ABCExtensionArray, ABCIndex, ABCSeries)): + if not isinstance( + arr, + (np.ndarray, ABCExtensionArray, ABCIndex, ABCSeries, ABCNumpyExtensionArray), + ): # GH#52981 - warnings.warn( - "pd.api.extensions.take accepting non-standard inputs is deprecated " - "and will raise in a future version. Pass either a numpy.ndarray, " - "ExtensionArray, Index, or Series instead.", - FutureWarning, - stacklevel=find_stack_level(), + raise TypeError( + "pd.api.extensions.take requires a numpy.ndarray, ExtensionArray, " + f"Index, Series, or NumpyExtensionArray got {type(arr).__name__}." 
) - if not is_array_like(arr): - arr = np.asarray(arr) - indices = ensure_platform_int(indices) if allow_fill: # Pandas style, -1 means NA validate_indices(indices, arr.shape[axis]) + # error: Argument 1 to "take_nd" has incompatible type + # "ndarray[Any, Any] | ExtensionArray | Index | Series"; expected + # "ndarray[Any, Any]" result = take_nd( - arr, indices, axis=axis, allow_fill=True, fill_value=fill_value + arr, # type: ignore[arg-type] + indices, + axis=axis, + allow_fill=True, + fill_value=fill_value, ) else: # NumPy style - result = arr.take(indices, axis=axis) + # error: Unexpected keyword argument "axis" for "take" of "ExtensionArray" + result = arr.take(indices, axis=axis) # type: ignore[call-arg,assignment] return result @@ -1356,7 +1331,12 @@ def diff(arr, n: int, axis: AxisInt = 0): shifted """ - n = int(n) + # added a check on the integer value of period + # see https://github.com/pandas-dev/pandas/issues/56607 + if not lib.is_integer(n): + if not (is_float(n) and n.is_integer()): + raise ValueError("periods must be an integer") + n = int(n) na = np.nan dtype = arr.dtype @@ -1550,16 +1530,16 @@ def safe_sort( hash_klass, values = _get_hashtable_algo(values) # type: ignore[arg-type] t = hash_klass(len(values)) t.map_locations(values) - sorter = ensure_platform_int(t.lookup(ordered)) + # error: Argument 1 to "lookup" of "HashTable" has incompatible type + # "ExtensionArray | ndarray[Any, Any] | Index | Series"; expected "ndarray" + sorter = ensure_platform_int(t.lookup(ordered)) # type: ignore[arg-type] if use_na_sentinel: # take_nd is faster, but only works for na_sentinels of -1 order2 = sorter.argsort() if verify: mask = (codes < -len(values)) | (codes >= len(values)) - codes[mask] = 0 - else: - mask = None + codes[mask] = -1 new_codes = take_nd(order2, codes, fill_value=-1) else: reverse_indexer = np.empty(len(sorter), dtype=int) @@ -1568,14 +1548,6 @@ def safe_sort( # may deal with them here without performance loss using `mode='wrap'` new_codes = reverse_indexer.take(codes, mode="wrap") - if use_na_sentinel: - mask = codes == -1 - if verify: - mask = mask | (codes < -len(values)) | (codes >= len(values)) - - if use_na_sentinel and mask is not None: - np.putmask(new_codes, mask, -1) - return ordered, ensure_platform_int(new_codes) @@ -1634,16 +1606,8 @@ def union_with_duplicates( """ from pandas import Series - with warnings.catch_warnings(): - # filter warning from object dtype inference; we will end up discarding - # the index here, so the deprecation does not affect the end result here. - warnings.filterwarnings( - "ignore", - "The behavior of value_counts with object-dtype is deprecated", - category=FutureWarning, - ) - l_count = value_counts_internal(lvals, dropna=False) - r_count = value_counts_internal(rvals, dropna=False) + l_count = value_counts_internal(lvals, dropna=False) + r_count = value_counts_internal(rvals, dropna=False) l_count, r_count = l_count.align(r_count, fill_value=0) final_count = np.maximum(l_count.values, r_count.values) final_count = Series(final_count, index=l_count.index, dtype="int", copy=False) @@ -1668,7 +1632,6 @@ def map_array( arr: ArrayLike, mapper, na_action: Literal["ignore"] | None = None, - convert: bool = True, ) -> np.ndarray | ExtensionArray | Index: """ Map values using an input mapping or function. @@ -1680,9 +1643,6 @@ def map_array( na_action : {None, 'ignore'}, default None If 'ignore', propagate NA values, without passing them to the mapping correspondence. 
- convert : bool, default True - Try to find better dtype for elementwise function results. If - False, leave as dtype=object. Returns ------- @@ -1691,6 +1651,8 @@ def map_array( If the function returns a tuple with more than one element a MultiIndex will be returned. """ + from pandas import Index + if na_action not in (None, "ignore"): msg = f"na_action must either be 'ignore' or None, {na_action} was passed" raise ValueError(msg) @@ -1720,6 +1682,10 @@ def map_array( if len(mapper) == 0: mapper = Series(mapper, dtype=np.float64) + elif isinstance(mapper, dict): + mapper = Series( + mapper.values(), index=Index(mapper.keys(), tupleize_cols=False) + ) else: mapper = Series(mapper) @@ -1740,8 +1706,6 @@ def map_array( # we must convert to python types values = arr.astype(object, copy=False) if na_action is None: - return lib.map_infer(values, mapper, convert=convert) + return lib.map_infer(values, mapper) else: - return lib.map_infer_mask( - values, mapper, mask=isna(values).view(np.uint8), convert=convert - ) + return lib.map_infer_mask(values, mapper, mask=isna(values).view(np.uint8)) diff --git a/pandas/core/api.py b/pandas/core/api.py index 2cfe5ffc0170d..ec12d543d8389 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -23,7 +23,6 @@ from pandas.core.algorithms import ( factorize, unique, - value_counts, ) from pandas.core.arrays import Categorical from pandas.core.arrays.boolean import BooleanDtype @@ -42,7 +41,7 @@ UInt64Dtype, ) from pandas.core.arrays.string_ import StringDtype -from pandas.core.construction import array +from pandas.core.construction import array # noqa: ICN001 from pandas.core.flags import Flags from pandas.core.groupby import ( Grouper, @@ -81,60 +80,59 @@ from pandas.core.frame import DataFrame # isort:skip __all__ = [ - "array", + "NA", "ArrowDtype", - "bdate_range", "BooleanDtype", "Categorical", "CategoricalDtype", "CategoricalIndex", "DataFrame", "DateOffset", - "date_range", "DatetimeIndex", "DatetimeTZDtype", - "factorize", "Flags", "Float32Dtype", "Float64Dtype", "Grouper", "Index", "IndexSlice", + "Int8Dtype", "Int16Dtype", "Int32Dtype", "Int64Dtype", - "Int8Dtype", "Interval", "IntervalDtype", "IntervalIndex", - "interval_range", - "isna", - "isnull", "MultiIndex", - "NA", - "NamedAgg", "NaT", - "notna", - "notnull", + "NamedAgg", "Period", "PeriodDtype", "PeriodIndex", - "period_range", "RangeIndex", "Series", - "set_eng_float_format", "StringDtype", "Timedelta", "TimedeltaIndex", - "timedelta_range", "Timestamp", - "to_datetime", - "to_numeric", - "to_timedelta", + "UInt8Dtype", "UInt16Dtype", "UInt32Dtype", "UInt64Dtype", - "UInt8Dtype", + "array", + "bdate_range", + "date_range", + "factorize", + "interval_range", + "isna", + "isnull", + "notna", + "notnull", + "period_range", + "set_eng_float_format", + "timedelta_range", + "to_datetime", + "to_numeric", + "to_timedelta", "unique", - "value_counts", ] diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 25a71ce5b5f4f..2c96f1ef020ac 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -2,23 +2,19 @@ import abc from collections import defaultdict +from collections.abc import Callable import functools from functools import partial import inspect from typing import ( TYPE_CHECKING, Any, - Callable, Literal, cast, ) -import warnings import numpy as np -from pandas._config import option_context - -from pandas._libs import lib from pandas._libs.internals import BlockValuesRefs from pandas._typing import ( AggFuncType, @@ -33,7 +29,6 @@ from pandas.compat._optional import 
import_optional_dependency from pandas.errors import SpecificationError from pandas.util._decorators import cache_readonly -from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.cast import is_nested_object from pandas.core.dtypes.common import ( @@ -43,10 +38,7 @@ is_numeric_dtype, is_sequence, ) -from pandas.core.dtypes.dtypes import ( - CategoricalDtype, - ExtensionDtype, -) +from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ( ABCDataFrame, ABCNDFrame, @@ -56,6 +48,10 @@ from pandas.core._numba.executor import generate_apply_looper import pandas.core.common as com from pandas.core.construction import ensure_wrapped_if_datetimelike +from pandas.core.util.numba_ import ( + get_jit_arguments, + prepare_function_arguments, +) if TYPE_CHECKING: from collections.abc import ( @@ -75,10 +71,113 @@ from pandas.core.resample import Resampler from pandas.core.window.rolling import BaseWindow - ResType = dict[int, Any] +class BaseExecutionEngine(abc.ABC): + """ + Base class for execution engines for map and apply methods. + + An execution engine receives all the parameters of a call to + ``apply`` or ``map``, such as the data container, the function, + etc. and takes care of running the execution. + + Supporting different engines allows functions to be JIT compiled, + run in parallel, and others. Besides the default executor which + simply runs the code with the Python interpreter and pandas. + """ + + @staticmethod + @abc.abstractmethod + def map( + data: Series | DataFrame | np.ndarray, + func: AggFuncType, + args: tuple, + kwargs: dict[str, Any], + decorator: Callable | None, + skip_na: bool, + ): + """ + Executor method to run functions elementwise. + + In general, pandas uses ``map`` for running functions elementwise, + but ``Series.apply`` with the default ``by_row='compat'`` will also + call this executor function. + + Parameters + ---------- + data : Series, DataFrame or NumPy ndarray + The object to use for the data. Some methods implement a ``raw`` + parameter which will convert the original pandas object to a + NumPy array, which will then be passed here to the executor. + func : function or NumPy ufunc + The function to execute. + args : tuple + Positional arguments to be passed to ``func``. + kwargs : dict + Keyword arguments to be passed to ``func``. + decorator : function, optional + For JIT compilers and other engines that need to decorate the + function ``func``, this is the decorator to use. While the + executor may already know which is the decorator to use, this + is useful as for a single executor the user can specify for + example ``numba.jit`` or ``numba.njit(nogil=True)``, and this + decorator parameter will contain the exact decorator from the + executor the user wants to use. + skip_na : bool + Whether the function should be called for missing values or not. + This is specified by the pandas user as ``map(na_action=None)`` + or ``map(na_action='ignore')``. + """ + + @staticmethod + @abc.abstractmethod + def apply( + data: Series | DataFrame | np.ndarray, + func: AggFuncType, + args: tuple, + kwargs: dict[str, Any], + decorator: Callable, + axis: Axis, + ): + """ + Executor method to run functions by an axis. + + While we can see ``map`` as executing the function for each cell + in a ``DataFrame`` (or ``Series``), ``apply`` will execute the + function for each column (or row). + + Parameters + ---------- + data : Series, DataFrame or NumPy ndarray + The object to use for the data. 
Some methods implement a ``raw`` + parameter which will convert the original pandas object to a + NumPy array, which will then be passed here to the executor. + func : function or NumPy ufunc + The function to execute. + args : tuple + Positional arguments to be passed to ``func``. + kwargs : dict + Keyword arguments to be passed to ``func``. + decorator : function, optional + For JIT compilers and other engines that need to decorate the + function ``func``, this is the decorator to use. While the + executor may already know which is the decorator to use, this + is useful as for a single executor the user can specify for + example ``numba.jit`` or ``numba.njit(nogil=True)``, and this + decorator parameter will contain the exact decorator from the + executor the user wants to use. + axis : {0 or 'index', 1 or 'columns'} + 0 or 'index' should execute the function passing each column as + parameter. 1 or 'columns' should execute the function passing + each row as parameter. The default executor engine passes rows + as pandas ``Series``. Other executor engines should probably + expect functions to be implemented this way for compatibility. + But passing rows as other data structures is technically possible + as far as the function ``func`` is implemented accordingly. + """ + + def frame_apply( obj: DataFrame, func: AggFuncType, @@ -92,16 +191,19 @@ def frame_apply( kwargs=None, ) -> FrameApply: """construct and return a row or column based frame apply object""" + _, func, columns, _ = reconstruct_func(func, **kwargs) + axis = obj._get_axis_number(axis) klass: type[FrameApply] if axis == 0: klass = FrameRowApply elif axis == 1: + if columns: + raise NotImplementedError( + f"Named aggregation is not supported when {axis=}." + ) klass = FrameColumnApply - _, func, _, _ = reconstruct_func(func, **kwargs) - assert func is not None - return klass( obj, func, @@ -178,10 +280,7 @@ def agg(self) -> DataFrame | Series | None: Result of aggregation, or None if agg cannot be performed by this method. 
""" - obj = self.obj func = self.func - args = self.args - kwargs = self.kwargs if isinstance(func, str): return self.apply_str() @@ -192,12 +291,6 @@ def agg(self) -> DataFrame | Series | None: # we require a list, but not a 'str' return self.agg_list_like() - if callable(func): - f = com.get_cython_func(func) - if f and not args and not kwargs: - warn_alias_replacement(obj, func, f) - return getattr(obj, f)() - # caller can react return None @@ -234,7 +327,7 @@ def transform(self) -> DataFrame | Series: if is_series: func = {com.get_callable_name(v) or v: v for v in func} else: - func = {col: func for col in obj} + func = dict.fromkeys(obj, func) if is_dict_like(func): func = cast(AggFuncTypeDict, func) @@ -257,12 +350,8 @@ def transform(self) -> DataFrame | Series: and not obj.empty ): raise ValueError("Transform function failed") - # error: Argument 1 to "__get__" of "AxisProperty" has incompatible type - # "Union[Series, DataFrame, GroupBy[Any], SeriesGroupBy, - # DataFrameGroupBy, BaseWindow, Resampler]"; expected "Union[DataFrame, - # Series]" if not isinstance(result, (ABCSeries, ABCDataFrame)) or not result.index.equals( - obj.index # type: ignore[arg-type] + obj.index ): raise ValueError("Function did not transform") @@ -303,12 +392,6 @@ def transform_str_or_callable(self, func) -> DataFrame | Series: if isinstance(func, str): return self._apply_str(obj, func, *args, **kwargs) - if not args and not kwargs: - f = com.get_cython_func(func) - if f: - warn_alias_replacement(obj, func, f) - return getattr(obj, f)() - # Two possible ways to use a UDF - apply or call directly try: return obj.apply(func, args=args, **kwargs) @@ -491,8 +574,24 @@ def compute_dict_like( keys += [key] * len(key_data) results += key_data - else: + elif is_groupby: # key used for column selection and output + + df = selected_obj + results, keys = [], [] + for key, how in func.items(): + cols = df[key] + + if cols.ndim == 1: + series = obj._gotitem(key, ndim=1, subset=cols) + results.append(getattr(series, op_name)(how, **kwargs)) + keys.append(key) + else: + for _, col in cols.items(): + series = obj._gotitem(key, ndim=1, subset=col) + results.append(getattr(series, op_name)(how, **kwargs)) + keys.append(key) + else: results = [ getattr(obj._gotitem(key, ndim=1), op_name)(how, **kwargs) for key, how in func.items() @@ -516,11 +615,14 @@ def wrap_results_dict_like( is_ndframe = [isinstance(r, ABCNDFrame) for r in result_data] if all(is_ndframe): - results = dict(zip(result_index, result_data)) + results = [result for result in result_data if not result.empty] keys_to_use: Iterable[Hashable] - keys_to_use = [k for k in result_index if not results[k].empty] + keys_to_use = [k for k, v in zip(result_index, result_data) if not v.empty] # Have to check, if at least one DataFrame is not empty. 
- keys_to_use = keys_to_use if keys_to_use != [] else result_index + if keys_to_use == []: + keys_to_use = result_index + results = result_data + if selected_obj.ndim == 2: # keys are columns, so we can preserve names ktu = Index(keys_to_use) @@ -529,7 +631,7 @@ def wrap_results_dict_like( axis: AxisInt = 0 if isinstance(obj, ABCSeries) else 1 result = concat( - {k: results[k] for k in keys_to_use}, + results, axis=axis, keys=keys_to_use, ) @@ -584,22 +686,10 @@ def apply_str(self) -> DataFrame | Series: "axis" not in arg_names or func in ("corrwith", "skew") ): raise ValueError(f"Operation {func} does not support axis=1") - if "axis" in arg_names: - if isinstance(obj, (SeriesGroupBy, DataFrameGroupBy)): - # Try to avoid FutureWarning for deprecated axis keyword; - # If self.axis matches the axis we would get by not passing - # axis, we safely exclude the keyword. - - default_axis = 0 - if func in ["idxmax", "idxmin"]: - # DataFrameGroupBy.idxmax, idxmin axis defaults to self.axis, - # whereas other axis keywords default to 0 - default_axis = self.obj.axis - - if default_axis != self.axis: - self.kwargs["axis"] = self.axis - else: - self.kwargs["axis"] = self.axis + if "axis" in arg_names and not isinstance( + obj, (SeriesGroupBy, DataFrameGroupBy) + ): + self.kwargs["axis"] = self.axis return self._apply_str(obj, func, *self.args, **self.kwargs) def apply_list_or_dict_like(self) -> DataFrame | Series: @@ -660,7 +750,8 @@ def normalize_dictlike_arg( cols = Index(list(func.keys())).difference(obj.columns, sort=True) if len(cols) > 0: - raise KeyError(f"Column(s) {list(cols)} do not exist") + # GH 58474 + raise KeyError(f"Label(s) {list(cols)} do not exist") aggregator_types = (list, tuple, dict) @@ -695,7 +786,7 @@ def _apply_str(self, obj, func: str, *args, **kwargs): # people may aggregate on a non-callable attribute # but don't let them think they can pass args to it assert len(args) == 0 - assert len([kwarg for kwarg in kwargs if kwarg not in ["axis"]]) == 0 + assert not any(kwarg == "axis" for kwarg in kwargs) return f elif hasattr(np, func) and hasattr(obj, "__array__"): # in particular exclude Window @@ -812,7 +903,7 @@ def result_columns(self) -> Index: @property @abc.abstractmethod - def series_generator(self) -> Generator[Series, None, None]: + def series_generator(self) -> Generator[Series]: pass @staticmethod @@ -827,7 +918,7 @@ def generate_numba_apply_func( def apply_with_numba(self): pass - def validate_values_for_numba(self): + def validate_values_for_numba(self) -> None: # Validate column dtyps all OK for colname, dtype in self.obj.dtypes.items(): if not is_numeric_dtype(dtype): @@ -1003,16 +1094,21 @@ def wrapper(*args, **kwargs): return wrapper if engine == "numba": - engine_kwargs = {} if engine_kwargs is None else engine_kwargs - + args, kwargs = prepare_function_arguments( + self.func, # type: ignore[arg-type] + self.args, + self.kwargs, + num_required_args=1, + ) # error: Argument 1 to "__call__" of "_lru_cache_wrapper" has # incompatible type "Callable[..., Any] | str | list[Callable # [..., Any] | str] | dict[Hashable,Callable[..., Any] | str | # list[Callable[..., Any] | str]]"; expected "Hashable" nb_looper = generate_apply_looper( - self.func, **engine_kwargs # type: ignore[arg-type] + self.func, # type: ignore[arg-type] + **get_jit_arguments(engine_kwargs), ) - result = nb_looper(self.values, self.axis) + result = nb_looper(self.values, self.axis, *args) # If we made the result 2-D, squeeze it back to 1-D result = np.squeeze(result) else: @@ -1075,14 +1171,12 @@ 
def apply_series_generator(self) -> tuple[ResType, Index]: results = {} - with option_context("mode.chained_assignment", None): - for i, v in enumerate(series_gen): - # ignore SettingWithCopy here in case the user mutates - results[i] = self.func(v, *self.args, **self.kwargs) - if isinstance(results[i], ABCSeries): - # If we have a view on v, we need to make a copy because - # series_generator will swap out the underlying data - results[i] = results[i].copy(deep=False) + for i, v in enumerate(series_gen): + results[i] = self.func(v, *self.args, **self.kwargs) + if isinstance(results[i], ABCSeries): + # If we have a view on v, we need to make a copy because + # series_generator will swap out the underlying data + results[i] = results[i].copy(deep=False) return results, res_index @@ -1135,7 +1229,7 @@ class FrameRowApply(FrameApply): axis: AxisInt = 0 @property - def series_generator(self) -> Generator[Series, None, None]: + def series_generator(self) -> Generator[Series]: return (self.obj._ixs(i, axis=1) for i in range(len(self.columns))) @staticmethod @@ -1155,36 +1249,35 @@ def generate_numba_apply_func( # Currently the parallel argument doesn't get passed through here # (it's disabled) since the dicts in numba aren't thread-safe. @numba.jit(nogil=nogil, nopython=nopython, parallel=parallel) - def numba_func(values, col_names, df_index): + def numba_func(values, col_names, df_index, *args): results = {} for j in range(values.shape[1]): # Create the series ser = Series( values[:, j], index=df_index, name=maybe_cast_str(col_names[j]) ) - results[j] = jitted_udf(ser) + results[j] = jitted_udf(ser, *args) return results return numba_func def apply_with_numba(self) -> dict[int, Any]: + func = cast(Callable, self.func) + args, kwargs = prepare_function_arguments( + func, self.args, self.kwargs, num_required_args=1 + ) nb_func = self.generate_numba_apply_func( - cast(Callable, self.func), **self.engine_kwargs + func, **get_jit_arguments(self.engine_kwargs) ) from pandas.core._numba.extensions import set_numba_data index = self.obj.index - if index.dtype == "string": - index = index.astype(object) - columns = self.obj.columns - if columns.dtype == "string": - columns = columns.astype(object) # Convert from numba dict to regular dict # Our isinstance checks in the df constructor don't pass for numbas typed dict with set_numba_data(index) as index, set_numba_data(columns) as columns: - res = dict(nb_func(self.values, columns, index)) + res = dict(nb_func(self.values, columns, index, *args)) return res @property @@ -1245,7 +1338,7 @@ def apply_broadcast(self, target: DataFrame) -> DataFrame: return result.T @property - def series_generator(self) -> Generator[Series, None, None]: + def series_generator(self) -> Generator[Series]: values = self.values values = ensure_wrapped_if_datetimelike(values) assert len(values) > 0 @@ -1255,7 +1348,7 @@ def series_generator(self) -> Generator[Series, None, None]: ser = self.obj._ixs(0, axis=0) mgr = ser._mgr - is_view = mgr.blocks[0].refs.has_reference() # type: ignore[union-attr] + is_view = mgr.blocks[0].refs.has_reference() if isinstance(ser.dtype, ExtensionDtype): # values will be incorrect for this block @@ -1277,7 +1370,7 @@ def series_generator(self) -> Generator[Series, None, None]: # -> if that happened and `ser` is already a copy, then we reset # the refs here to avoid triggering a unnecessary CoW inside the # applied function (https://github.com/pandas-dev/pandas/pull/56212) - mgr.blocks[0].refs = BlockValuesRefs(mgr.blocks[0]) # type: 
ignore[union-attr] + mgr.blocks[0].refs = BlockValuesRefs(mgr.blocks[0]) yield ser @staticmethod @@ -1292,7 +1385,7 @@ def generate_numba_apply_func( jitted_udf = numba.extending.register_jitable(func) @numba.jit(nogil=nogil, nopython=nopython, parallel=parallel) - def numba_func(values, col_names_index, index): + def numba_func(values, col_names_index, index, *args): results = {} # Currently the parallel argument doesn't get passed through here # (it's disabled) since the dicts in numba aren't thread-safe. @@ -1304,25 +1397,30 @@ def numba_func(values, col_names_index, index): index=col_names_index, name=maybe_cast_str(index[i]), ) - results[i] = jitted_udf(ser) + results[i] = jitted_udf(ser, *args) return results return numba_func def apply_with_numba(self) -> dict[int, Any]: + func = cast(Callable, self.func) + args, kwargs = prepare_function_arguments( + func, self.args, self.kwargs, num_required_args=1 + ) nb_func = self.generate_numba_apply_func( - cast(Callable, self.func), **self.engine_kwargs + func, **get_jit_arguments(self.engine_kwargs) ) from pandas.core._numba.extensions import set_numba_data # Convert from numba dict to regular dict # Our isinstance checks in the df constructor don't pass for numbas typed dict - with set_numba_data(self.obj.index) as index, set_numba_data( - self.columns - ) as columns: - res = dict(nb_func(self.values, columns, index)) + with ( + set_numba_data(self.obj.index) as index, + set_numba_data(self.columns) as columns, + ): + res = dict(nb_func(self.values, columns, index, *args)) return res @@ -1364,7 +1462,7 @@ def infer_to_same_shape(self, results: ResType, res_index: Index) -> DataFrame: result.index = res_index # infer dtypes - result = result.infer_objects(copy=False) + result = result.infer_objects() return result @@ -1379,23 +1477,10 @@ def __init__( obj: Series, func: AggFuncType, *, - convert_dtype: bool | lib.NoDefault = lib.no_default, by_row: Literal[False, "compat", "_compat"] = "compat", args, kwargs, ) -> None: - if convert_dtype is lib.no_default: - convert_dtype = True - else: - warnings.warn( - "the convert_dtype parameter is deprecated and will be removed in a " - "future version. Do ``ser.astype(object).apply()`` " - "instead if you want ``convert_dtype=False``.", - FutureWarning, - stacklevel=find_stack_level(), - ) - self.convert_dtype = convert_dtype - super().__init__( obj, func, @@ -1433,22 +1518,7 @@ def agg(self): func = self.func # string, list-like, and dict-like are entirely handled in super assert callable(func) - - # GH53325: The setup below is just to keep current behavior while emitting a - # deprecation message. In the future this will all be replaced with a simple - # `result = f(self.obj, *self.args, **self.kwargs)`. - try: - result = obj.apply(func, args=self.args, **self.kwargs) - except (ValueError, AttributeError, TypeError): - result = func(obj, *self.args, **self.kwargs) - else: - msg = ( - f"using {func} in {type(obj).__name__}.agg cannot aggregate and " - f"has been deprecated. Use {type(obj).__name__}.transform to " - f"keep behavior unchanged." - ) - warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) - + result = func(obj, *self.args, **self.kwargs) return result def apply_empty_result(self) -> Series: @@ -1497,16 +1567,7 @@ def curried(x): else: curried = func - - # row-wise access - # apply doesn't have a `na_action` keyword and for backward compat reasons - # we need to give `na_action="ignore"` for categorical data. 
- # TODO: remove the `na_action="ignore"` when that default has been changed in - # Categorical (GH51645). - action = "ignore" if isinstance(obj.dtype, CategoricalDtype) else None - mapped = obj._map_values( - mapper=curried, na_action=action, convert=self.convert_dtype - ) + mapped = obj._map_values(mapper=curried) if len(mapped) and isinstance(mapped[0], ABCSeries): # GH#43986 Need to do list(mapped) in order to get treated as nested @@ -1688,8 +1749,7 @@ def reconstruct_func( # GH 28426 will raise error if duplicated function names are used and # there is no reassigned name raise SpecificationError( - "Function names must be unique if there is no new column names " - "assigned" + "Function names must be unique if there is no new column names assigned" ) if func is None: # nicer error message @@ -1772,9 +1832,9 @@ def normalize_keyword_aggregation( # TODO: aggspec type: typing.Dict[str, List[AggScalar]] aggspec = defaultdict(list) order = [] - columns, pairs = list(zip(*kwargs.items())) + columns = tuple(kwargs.keys()) - for column, aggfunc in pairs: + for column, aggfunc in kwargs.values(): aggspec[column].append(aggfunc) order.append((column, com.get_callable_name(aggfunc) or aggfunc)) @@ -1797,14 +1857,14 @@ def normalize_keyword_aggregation( def _make_unique_kwarg_list( - seq: Sequence[tuple[Any, Any]] + seq: Sequence[tuple[Any, Any]], ) -> Sequence[tuple[Any, Any]]: """ Uniquify aggfunc name of the pairs in the order list Examples: -------- - >>> kwarg_list = [('a', ''), ('a', ''), ('b', '')] + >>> kwarg_list = [("a", ""), ("a", ""), ("b", "")] >>> _make_unique_kwarg_list(kwarg_list) [('a', '_0'), ('a', '_1'), ('b', '')] """ @@ -1836,7 +1896,7 @@ def relabel_result( >>> from pandas.core.apply import relabel_result >>> result = pd.DataFrame( ... {"A": [np.nan, 2, np.nan], "C": [6, np.nan, np.nan], "B": [np.nan, 4, 2.5]}, - ... index=["max", "mean", "min"] + ... index=["max", "mean", "min"], ... ) >>> funcs = {"A": ["max"], "C": ["max"], "B": ["mean", "min"]} >>> columns = ("foo", "aab", "bar", "dat") @@ -1886,12 +1946,14 @@ def relabel_result( com.get_callable_name(f) if not isinstance(f, str) else f for f in fun ] col_idx_order = Index(s.index).get_indexer(fun) - s = s.iloc[col_idx_order] - + valid_idx = col_idx_order != -1 + if valid_idx.any(): + s = s.iloc[col_idx_order[valid_idx]] # assign the new user-provided "named aggregation" as index names, and reindex # it based on the whole user-provided names. - s.index = reordered_indexes[idx : idx + len(fun)] - reordered_result_in_dict[col] = s.reindex(columns, copy=False) + if not s.empty: + s.index = reordered_indexes[idx : idx + len(fun)] + reordered_result_in_dict[col] = s.reindex(columns) idx = idx + len(fun) return reordered_result_in_dict @@ -1975,7 +2037,7 @@ def maybe_mangle_lambdas(agg_spec: Any) -> Any: Examples -------- - >>> maybe_mangle_lambdas('sum') + >>> maybe_mangle_lambdas("sum") 'sum' >>> maybe_mangle_lambdas([lambda: 1, lambda: 2]) # doctest: +SKIP [, @@ -2020,7 +2082,7 @@ def validate_func_kwargs( Examples -------- - >>> validate_func_kwargs({'one': 'min', 'two': 'max'}) + >>> validate_func_kwargs({"one": "min", "two": "max"}) (['one', 'two'], ['min', 'max']) """ tuple_given_message = "func is expected but received {} in **kwargs." 
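# A sketch of the reshaping normalize_keyword_aggregation performs above;
# the kwargs values are (column, aggfunc) pairs, which is why iterating
# kwargs.values() with 2-tuple unpacking is correct (names here are
# illustrative, not from the patch):

    kwargs = {"low": ("x", "min"), "high": ("x", "max")}
    columns = tuple(kwargs.keys())           # ("low", "high")
    aggspec = {}
    for column, aggfunc in kwargs.values():  # unpack (col, func) pairs
        aggspec.setdefault(column, []).append(aggfunc)
    aggspec                                  # {"x": ["min", "max"]}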
@@ -2040,23 +2102,3 @@ def include_axis(op_name: Literal["agg", "apply"], colg: Series | DataFrame) -> return isinstance(colg, ABCDataFrame) or ( isinstance(colg, ABCSeries) and op_name == "agg" ) - - -def warn_alias_replacement( - obj: AggObjType, - func: Callable, - alias: str, -) -> None: - if alias.startswith("np."): - full_alias = alias - else: - full_alias = f"{type(obj).__name__}.{alias}" - alias = f'"{alias}"' - warnings.warn( - f"The provided callable {func} is currently using " - f"{full_alias}. In a future version of pandas, " - f"the provided callable will be used directly. To keep current " - f"behavior pass the string {alias} instead.", - category=FutureWarning, - stacklevel=find_stack_level(), - ) diff --git a/pandas/core/array_algos/datetimelike_accumulations.py b/pandas/core/array_algos/datetimelike_accumulations.py index 825fe60ee6cf8..bc10dbfbec90d 100644 --- a/pandas/core/array_algos/datetimelike_accumulations.py +++ b/pandas/core/array_algos/datetimelike_accumulations.py @@ -4,7 +4,7 @@ from __future__ import annotations -from typing import Callable +from typing import TYPE_CHECKING import numpy as np @@ -12,13 +12,16 @@ from pandas.core.dtypes.missing import isna +if TYPE_CHECKING: + from collections.abc import Callable + def _cum_func( func: Callable, values: np.ndarray, *, skipna: bool = True, -): +) -> np.ndarray: """ Accumulations for 1D datetimelike arrays. @@ -37,8 +40,10 @@ def _cum_func( np.cumsum: 0, np.minimum.accumulate: np.iinfo(np.int64).max, }[func] - except KeyError: - raise ValueError(f"No accumulation for {func} implemented on BaseMaskedArray") + except KeyError as err: + raise ValueError( + f"No accumulation for {func} implemented on BaseMaskedArray" + ) from err mask = isna(values) y = values.view("i8") @@ -47,7 +52,8 @@ def _cum_func( if not skipna: mask = np.maximum.accumulate(mask) - result = func(y) + # GH 57956 + result = func(y, axis=0) result[mask] = iNaT if values.dtype.kind in "mM": @@ -59,9 +65,9 @@ def cumsum(values: np.ndarray, *, skipna: bool = True) -> np.ndarray: return _cum_func(np.cumsum, values, skipna=skipna) -def cummin(values: np.ndarray, *, skipna: bool = True): +def cummin(values: np.ndarray, *, skipna: bool = True) -> np.ndarray: return _cum_func(np.minimum.accumulate, values, skipna=skipna) -def cummax(values: np.ndarray, *, skipna: bool = True): +def cummax(values: np.ndarray, *, skipna: bool = True) -> np.ndarray: return _cum_func(np.maximum.accumulate, values, skipna=skipna) diff --git a/pandas/core/array_algos/masked_accumulations.py b/pandas/core/array_algos/masked_accumulations.py index ad9e96d398a24..b4e116388b85e 100644 --- a/pandas/core/array_algos/masked_accumulations.py +++ b/pandas/core/array_algos/masked_accumulations.py @@ -5,14 +5,13 @@ from __future__ import annotations -from typing import ( - TYPE_CHECKING, - Callable, -) +from typing import TYPE_CHECKING import numpy as np if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import npt @@ -22,7 +21,7 @@ def _cum_func( mask: npt.NDArray[np.bool_], *, skipna: bool = True, -): +) -> tuple[np.ndarray, npt.NDArray[np.bool_]]: """ Accumulations for 1D masked array. 
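# A plausible reading of the GH 57956 fix above: np.cumsum flattens its input
# when no axis is given, whereas ufunc.accumulate already defaults to axis=0,
# so passing axis=0 explicitly keeps both accumulator paths column-wise:

    import numpy as np

    a = np.array([[1, 2], [3, 4]])
    np.cumsum(a)              # array([ 1,  3,  6, 10]) -- flattened
    np.cumsum(a, axis=0)      # array([[1, 2], [4, 6]]) -- per column
    np.maximum.accumulate(a)  # array([[1, 2], [3, 4]]) -- axis 0 by default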
@@ -60,10 +59,10 @@ def _cum_func( np.cumsum: 0, np.minimum.accumulate: dtype_info.max, }[func] - except KeyError: + except KeyError as err: raise NotImplementedError( f"No accumulation for {func} implemented on BaseMaskedArray" - ) + ) from err values[mask] = fill_value @@ -74,17 +73,25 @@ def _cum_func( return values, mask -def cumsum(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True): +def cumsum( + values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True +) -> tuple[np.ndarray, npt.NDArray[np.bool_]]: return _cum_func(np.cumsum, values, mask, skipna=skipna) -def cumprod(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True): +def cumprod( + values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True +) -> tuple[np.ndarray, npt.NDArray[np.bool_]]: return _cum_func(np.cumprod, values, mask, skipna=skipna) -def cummin(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True): +def cummin( + values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True +) -> tuple[np.ndarray, npt.NDArray[np.bool_]]: return _cum_func(np.minimum.accumulate, values, mask, skipna=skipna) -def cummax(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True): +def cummax( + values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True +) -> tuple[np.ndarray, npt.NDArray[np.bool_]]: return _cum_func(np.maximum.accumulate, values, mask, skipna=skipna) diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 335fa1afc0f4e..bdf88f2e9fa07 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -2,12 +2,10 @@ masked_reductions.py is for reduction algorithms using a mask-based approach for missing values. 
""" + from __future__ import annotations -from typing import ( - TYPE_CHECKING, - Callable, -) +from typing import TYPE_CHECKING import warnings import numpy as np @@ -17,6 +15,8 @@ from pandas.core.nanops import check_below_min_count if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import ( AxisInt, npt, @@ -62,6 +62,10 @@ def _reductions( ): return libmissing.NA + if values.dtype == np.dtype(object): + # object dtype does not support `where` without passing an initial + values = values[~mask] + return func(values, axis=axis, **kwargs) return func(values, where=~mask, axis=axis, **kwargs) diff --git a/pandas/core/array_algos/putmask.py b/pandas/core/array_algos/putmask.py index f65d2d20e028e..464a4d552af68 100644 --- a/pandas/core/array_algos/putmask.py +++ b/pandas/core/array_algos/putmask.py @@ -1,6 +1,7 @@ """ EA-compatible analogue to np.putmask """ + from __future__ import annotations from typing import ( diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py index ee6f00b219a15..eb5026454552c 100644 --- a/pandas/core/array_algos/quantile.py +++ b/pandas/core/array_algos/quantile.py @@ -91,24 +91,24 @@ def quantile_with_mask( if is_empty: # create the array of na_values # 2d len(values) * len(qs) - flat = np.array([fill_value] * len(qs)) + flat = np.full(len(qs), fill_value) result = np.repeat(flat, len(values)).reshape(len(values), len(qs)) else: - result = _nanpercentile( + result = _nanquantile( values, - qs * 100.0, + qs, na_value=fill_value, mask=mask, interpolation=interpolation, ) - result = np.array(result, copy=False) + result = np.asarray(result) # type: ignore[assignment] result = result.T return result -def _nanpercentile_1d( +def _nanquantile_1d( values: np.ndarray, mask: npt.NDArray[np.bool_], qs: npt.NDArray[np.float64], @@ -116,7 +116,7 @@ def _nanpercentile_1d( interpolation: str, ) -> Scalar | np.ndarray: """ - Wrapper for np.percentile that skips missing values, specialized to + Wrapper for np.quantile that skips missing values, specialized to 1-dimensional case. Parameters @@ -142,7 +142,7 @@ def _nanpercentile_1d( # equiv: 'np.array([na_value] * len(qs))' but much faster return np.full(len(qs), na_value) - return np.percentile( + return np.quantile( values, qs, # error: No overload variant of "percentile" matches argument @@ -152,7 +152,7 @@ def _nanpercentile_1d( ) -def _nanpercentile( +def _nanquantile( values: np.ndarray, qs: npt.NDArray[np.float64], *, @@ -161,7 +161,7 @@ def _nanpercentile( interpolation: str, ): """ - Wrapper for np.percentile that skips missing values. + Wrapper for np.quantile that skips missing values. 
     Parameters
     ----------
@@ -180,7 +180,7 @@ def _nanpercentile(

     if values.dtype.kind in "mM":
         # need to cast to integer to avoid rounding errors in numpy
-        result = _nanpercentile(
+        result = _nanquantile(
             values.view("i8"),
             qs=qs,
             na_value=na_value.view("i8"),
@@ -196,14 +196,14 @@ def _nanpercentile(
         # Caller is responsible for ensuring mask shape match
         assert mask.shape == values.shape
         result = [
-            _nanpercentile_1d(val, m, qs, na_value, interpolation=interpolation)
+            _nanquantile_1d(val, m, qs, na_value, interpolation=interpolation)  # type: ignore[arg-type]
            for (val, m) in zip(list(values), list(mask))
         ]
         if values.dtype.kind == "f":
             # preserve itemsize
-            result = np.array(result, dtype=values.dtype, copy=False).T
+            result = np.asarray(result, dtype=values.dtype).T
         else:
-            result = np.array(result, copy=False).T
+            result = np.asarray(result).T
             if (
                 result.dtype != values.dtype
                 and not mask.all()
@@ -215,7 +215,7 @@ def _nanpercentile(
                 result = result.astype(values.dtype, copy=False)
         return result
     else:
-        return np.percentile(
+        return np.quantile(
             values,
             qs,
             axis=1,
diff --git a/pandas/core/array_algos/replace.py b/pandas/core/array_algos/replace.py
index 5f377276be480..debd6368e98a4 100644
--- a/pandas/core/array_algos/replace.py
+++ b/pandas/core/array_algos/replace.py
@@ -1,6 +1,7 @@
 """
 Methods used by Block.replace and related methods.
 """
+
 from __future__ import annotations

 import operator
@@ -67,7 +68,7 @@ def compare_or_regex_search(

     def _check_comparison_types(
         result: ArrayLike | bool, a: ArrayLike, b: Scalar | Pattern
-    ):
+    ) -> None:
         """
         Raises an error if the two arrays (a,b) cannot be compared.
         Otherwise, returns the comparison result as expected.
@@ -78,7 +79,7 @@ def _check_comparison_types(
             type_names[0] = f"ndarray(dtype={a.dtype})"

         raise TypeError(
-            f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}"
+            f"Cannot compare types {type_names[0]!r} and {type_names[1]!r}"
         )

     if not regex or not should_use_regex(regex, b):
@@ -88,21 +89,23 @@ def _check_comparison_types(
         op = np.vectorize(
             lambda x: bool(re.search(b, x))
             if isinstance(x, str) and isinstance(b, (str, Pattern))
-            else False
+            else False,
+            otypes=[bool],
         )

     # GH#32621 use mask to avoid comparing to NAs
-    if isinstance(a, np.ndarray):
+    if isinstance(a, np.ndarray) and mask is not None:
         a = a[mask]
-
-    result = op(a)
-
-    if isinstance(result, np.ndarray) and mask is not None:
-        # The shape of the mask can differ to that of the result
-        # since we may compare only a subset of a's or b's elements
-        tmp = np.zeros(mask.shape, dtype=np.bool_)
-        np.place(tmp, mask, result)
-        result = tmp
+        result = op(a)
+
+        if isinstance(result, np.ndarray):
+            # The shape of the mask can differ from that of the result
+            # since we may compare only a subset of a's or b's elements
+            tmp = np.zeros(mask.shape, dtype=np.bool_)
+            np.place(tmp, mask, result)
+            result = tmp
+    else:
+        result = op(a)

     _check_comparison_types(result, a, b)
     return result
@@ -149,4 +152,6 @@ def re_replacer(s):
     if mask is None:
         values[:] = f(values)
     else:
+        if values.ndim != mask.ndim:
+            mask = np.broadcast_to(mask, values.shape)
         values[mask] = f(values[mask])
diff --git a/pandas/core/array_algos/take.py b/pandas/core/array_algos/take.py
index ac674e31586e7..4d33b01f616cc 100644
--- a/pandas/core/array_algos/take.py
+++ b/pandas/core/array_algos/take.py
@@ -41,8 +41,7 @@ def take_nd(
     axis: AxisInt = ...,
     fill_value=...,
     allow_fill: bool = ...,
-) -> np.ndarray:
-    ...
+) -> np.ndarray: ...
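Note on the `otypes=[bool]` addition in `compare_or_regex_search` above: `np.vectorize` infers its output dtype by evaluating the function on the first element, so a size-0 selection (possible once `a = a[mask]` filters everything out) fails unless the dtype is pinned. A minimal plain-NumPy sketch of the failure mode (the `search` helper here is illustrative, not pandas code):

    import re

    import numpy as np

    def search(x):
        # mirrors the lambda above: regex-search strings, everything else False
        return bool(re.search("ab", x)) if isinstance(x, str) else False

    empty = np.array([], dtype=object)

    try:
        np.vectorize(search)(empty)
    except ValueError:
        # numpy: cannot call `vectorize` on size 0 inputs unless `otypes` is set
        pass

    result = np.vectorize(search, otypes=[bool])(empty)
    print(result.dtype, result.shape)  # bool (0,)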
@overload @@ -52,8 +51,7 @@ def take_nd( axis: AxisInt = ..., fill_value=..., allow_fill: bool = ..., -) -> ArrayLike: - ... +) -> ArrayLike: ... def take_nd( @@ -166,64 +164,6 @@ def _take_nd_ndarray( return out -def take_1d( - arr: ArrayLike, - indexer: npt.NDArray[np.intp], - fill_value=None, - allow_fill: bool = True, - mask: npt.NDArray[np.bool_] | None = None, -) -> ArrayLike: - """ - Specialized version for 1D arrays. Differences compared to `take_nd`: - - - Assumes input array has already been converted to numpy array / EA - - Assumes indexer is already guaranteed to be intp dtype ndarray - - Only works for 1D arrays - - To ensure the lowest possible overhead. - - Note: similarly to `take_nd`, this function assumes that the indexer is - a valid(ated) indexer with no out of bound indices. - - Parameters - ---------- - arr : np.ndarray or ExtensionArray - Input array. - indexer : ndarray - 1-D array of indices to take (validated indices, intp dtype). - fill_value : any, default np.nan - Fill value to replace -1 values with - allow_fill : bool, default True - If False, indexer is assumed to contain no -1 values so no filling - will be done. This short-circuits computation of a mask. Result is - undefined if allow_fill == False and -1 is present in indexer. - mask : np.ndarray, optional, default None - If `allow_fill` is True, and the mask (where indexer == -1) is already - known, it can be passed to avoid recomputation. - """ - if not isinstance(arr, np.ndarray): - # ExtensionArray -> dispatch to their method - return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) - - if not allow_fill: - return arr.take(indexer) - - dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value( - arr, indexer, fill_value, True, mask - ) - - # at this point, it's guaranteed that dtype can hold both the arr values - # and the fill_value - out = np.empty(indexer.shape, dtype=dtype) - - func = _get_take_nd_function( - arr.ndim, arr.dtype, out.dtype, axis=0, mask_info=mask_info - ) - func(arr, indexer, out, fill_value) - - return out - - def take_2d_multi( arr: np.ndarray, indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]], @@ -397,6 +337,10 @@ def wrapper( ("int32", "int64"): libalgos.take_1d_int32_int64, ("int32", "float64"): libalgos.take_1d_int32_float64, ("int64", "int64"): libalgos.take_1d_int64_int64, + ("uint8", "uint8"): libalgos.take_1d_bool_bool, + ("uint16", "int64"): libalgos.take_1d_uint16_uint16, + ("uint32", "int64"): libalgos.take_1d_uint32_uint32, + ("uint64", "int64"): libalgos.take_1d_uint64_uint64, ("int64", "float64"): libalgos.take_1d_int64_float64, ("float32", "float32"): libalgos.take_1d_float32_float32, ("float32", "float64"): libalgos.take_1d_float32_float64, @@ -426,6 +370,10 @@ def wrapper( ("int32", "float64"): libalgos.take_2d_axis0_int32_float64, ("int64", "int64"): libalgos.take_2d_axis0_int64_int64, ("int64", "float64"): libalgos.take_2d_axis0_int64_float64, + ("uint8", "uint8"): libalgos.take_2d_axis0_bool_bool, + ("uint16", "uint16"): libalgos.take_2d_axis0_uint16_uint16, + ("uint32", "uint32"): libalgos.take_2d_axis0_uint32_uint32, + ("uint64", "uint64"): libalgos.take_2d_axis0_uint64_uint64, ("float32", "float32"): libalgos.take_2d_axis0_float32_float32, ("float32", "float64"): libalgos.take_2d_axis0_float32_float64, ("float64", "float64"): libalgos.take_2d_axis0_float64_float64, @@ -458,6 +406,10 @@ def wrapper( ("int32", "float64"): libalgos.take_2d_axis1_int32_float64, ("int64", "int64"): libalgos.take_2d_axis1_int64_int64, 
("int64", "float64"): libalgos.take_2d_axis1_int64_float64, + ("uint8", "uint8"): libalgos.take_2d_axis1_bool_bool, + ("uint16", "uint16"): libalgos.take_2d_axis1_uint16_uint16, + ("uint32", "uint32"): libalgos.take_2d_axis1_uint32_uint32, + ("uint64", "uint64"): libalgos.take_2d_axis1_uint64_uint64, ("float32", "float32"): libalgos.take_2d_axis1_float32_float32, ("float32", "float64"): libalgos.take_2d_axis1_float32_float64, ("float64", "float64"): libalgos.take_2d_axis1_float64_float64, diff --git a/pandas/core/arraylike.py b/pandas/core/arraylike.py index fd585b3e19468..51ddd9e91b227 100644 --- a/pandas/core/arraylike.py +++ b/pandas/core/arraylike.py @@ -4,6 +4,7 @@ Index ExtensionArray """ + from __future__ import annotations import operator @@ -119,8 +120,9 @@ def __add__(self, other): Examples -------- - >>> df = pd.DataFrame({'height': [1.5, 2.6], 'weight': [500, 800]}, - ... index=['elk', 'moose']) + >>> df = pd.DataFrame( + ... {"height": [1.5, 2.6], "weight": [500, 800]}, index=["elk", "moose"] + ... ) >>> df height weight elk 1.5 500 @@ -128,14 +130,14 @@ def __add__(self, other): Adding a scalar affects all rows and columns. - >>> df[['height', 'weight']] + 1.5 + >>> df[["height", "weight"]] + 1.5 height weight elk 3.0 501.5 moose 4.1 801.5 Each element of a list is added to a column of the DataFrame, in order. - >>> df[['height', 'weight']] + [0.5, 1.5] + >>> df[["height", "weight"]] + [0.5, 1.5] height weight elk 2.0 501.5 moose 3.1 801.5 @@ -143,7 +145,7 @@ def __add__(self, other): Keys of a dictionary are aligned to the DataFrame, based on column names; each value in the dictionary is added to the corresponding column. - >>> df[['height', 'weight']] + {'height': 0.5, 'weight': 1.5} + >>> df[["height", "weight"]] + {"height": 0.5, "weight": 1.5} height weight elk 2.0 501.5 moose 3.1 801.5 @@ -151,8 +153,8 @@ def __add__(self, other): When `other` is a :class:`Series`, the index of `other` is aligned with the columns of the DataFrame. - >>> s1 = pd.Series([0.5, 1.5], index=['weight', 'height']) - >>> df[['height', 'weight']] + s1 + >>> s1 = pd.Series([0.5, 1.5], index=["weight", "height"]) + >>> df[["height", "weight"]] + s1 height weight elk 3.0 500.5 moose 4.1 800.5 @@ -161,13 +163,13 @@ def __add__(self, other): the :class:`Series` will not be reoriented. If index-wise alignment is desired, :meth:`DataFrame.add` should be used with `axis='index'`. - >>> s2 = pd.Series([0.5, 1.5], index=['elk', 'moose']) - >>> df[['height', 'weight']] + s2 + >>> s2 = pd.Series([0.5, 1.5], index=["elk", "moose"]) + >>> df[["height", "weight"]] + s2 elk height moose weight elk NaN NaN NaN NaN moose NaN NaN NaN NaN - >>> df[['height', 'weight']].add(s2, axis='index') + >>> df[["height", "weight"]].add(s2, axis="index") height weight elk 2.0 500.5 moose 4.1 801.5 @@ -175,9 +177,10 @@ def __add__(self, other): When `other` is a :class:`DataFrame`, both columns names and the index are aligned. - >>> other = pd.DataFrame({'height': [0.2, 0.4, 0.6]}, - ... index=['elk', 'moose', 'deer']) - >>> df[['height', 'weight']] + other + >>> other = pd.DataFrame( + ... {"height": [0.2, 0.4, 0.6]}, index=["elk", "moose", "deer"] + ... 
) + >>> df[["height", "weight"]] + other height weight deer NaN NaN elk 1.7 NaN @@ -263,10 +266,7 @@ def array_ufunc(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) Series, ) from pandas.core.generic import NDFrame - from pandas.core.internals import ( - ArrayManager, - BlockManager, - ) + from pandas.core.internals import BlockManager cls = type(self) @@ -329,8 +329,8 @@ def array_ufunc(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) reconstruct_axes = dict(zip(self._AXIS_ORDERS, self.axes)) if self.ndim == 1: - names = [getattr(x, "name") for x in inputs if hasattr(x, "name")] - name = names[0] if len(set(names)) == 1 else None + names = {x.name for x in inputs if hasattr(x, "name")} + name = names.pop() if len(names) == 1 else None reconstruct_kwargs = {"name": name} else: reconstruct_kwargs = {} @@ -350,7 +350,7 @@ def _reconstruct(result): if method == "outer": raise NotImplementedError return result - if isinstance(result, (BlockManager, ArrayManager)): + if isinstance(result, BlockManager): # we went through BlockManager.apply e.g. np.sqrt result = self._constructor_from_mgr(result, axes=result.axes) else: @@ -403,12 +403,12 @@ def _reconstruct(result): # for np.(..) calls # kwargs cannot necessarily be handled block-by-block, so only # take this path if there are no kwargs - mgr = inputs[0]._mgr + mgr = inputs[0]._mgr # pyright: ignore[reportGeneralTypeIssues] result = mgr.apply(getattr(ufunc, method)) else: # otherwise specific ufunc methods (eg np..accumulate(..)) # Those can have an axis keyword and thus can't be called block-by-block - result = default_array_ufunc(inputs[0], ufunc, method, *inputs, **kwargs) + result = default_array_ufunc(inputs[0], ufunc, method, *inputs, **kwargs) # pyright: ignore[reportGeneralTypeIssues] # e.g. np.negative (only one reached), with "where" and "out" in kwargs result = reconstruct(result) @@ -522,7 +522,7 @@ def dispatch_reduction_ufunc(self, ufunc: np.ufunc, method: str, *inputs, **kwar # so calls DataFrame.min (without ever getting here) with the np.min # default of axis=None, which DataFrame.min catches and changes to axis=0. 
# np.minimum.reduce(df) gets here bc axis is not in kwargs, - # so we set axis=0 to match the behaviorof np.minimum.reduce(df.values) + # so we set axis=0 to match the behavior of np.minimum.reduce(df.values) kwargs["axis"] = 0 # By default, numpy's reductions do not skip NaNs, so we have to diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index 245a171fea74b..f183e9236471e 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -23,21 +23,21 @@ __all__ = [ "ArrowExtensionArray", - "ExtensionArray", - "ExtensionOpsMixin", - "ExtensionScalarOpsMixin", "ArrowStringArray", "BaseMaskedArray", "BooleanArray", "Categorical", "DatetimeArray", + "ExtensionArray", + "ExtensionOpsMixin", + "ExtensionScalarOpsMixin", "FloatingArray", "IntegerArray", "IntervalArray", "NumpyExtensionArray", "PeriodArray", - "period_array", "SparseArray", "StringArray", "TimedeltaArray", + "period_array", ] diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index cc41985843574..1ca52ce64bd77 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -1,41 +1,115 @@ from __future__ import annotations -from typing import Literal +from functools import partial +import re +from typing import ( + TYPE_CHECKING, + Any, + Literal, +) import numpy as np -from pandas.compat import pa_version_under10p1 +from pandas._libs import lib +from pandas.compat import ( + pa_version_under10p1, + pa_version_under11p0, + pa_version_under13p0, + pa_version_under17p0, +) if not pa_version_under10p1: import pyarrow as pa import pyarrow.compute as pc +if TYPE_CHECKING: + from collections.abc import Callable + + from pandas._typing import ( + Scalar, + Self, + ) + class ArrowStringArrayMixin: - _pa_array = None + _pa_array: pa.ChunkedArray def __init__(self, *args, **kwargs) -> None: raise NotImplementedError + def _convert_bool_result(self, result, na=lib.no_default, method_name=None): + # Convert a bool-dtype result to the appropriate result type + raise NotImplementedError + + def _convert_int_result(self, result): + # Convert an integer-dtype result to the appropriate result type + raise NotImplementedError + + def _apply_elementwise(self, func: Callable) -> list[list[Any]]: + raise NotImplementedError + + def _str_len(self): + result = pc.utf8_length(self._pa_array) + return self._convert_int_result(result) + + def _str_lower(self) -> Self: + return type(self)(pc.utf8_lower(self._pa_array)) + + def _str_upper(self) -> Self: + return type(self)(pc.utf8_upper(self._pa_array)) + + def _str_strip(self, to_strip=None) -> Self: + if to_strip is None: + result = pc.utf8_trim_whitespace(self._pa_array) + else: + result = pc.utf8_trim(self._pa_array, characters=to_strip) + return type(self)(result) + + def _str_lstrip(self, to_strip=None) -> Self: + if to_strip is None: + result = pc.utf8_ltrim_whitespace(self._pa_array) + else: + result = pc.utf8_ltrim(self._pa_array, characters=to_strip) + return type(self)(result) + + def _str_rstrip(self, to_strip=None) -> Self: + if to_strip is None: + result = pc.utf8_rtrim_whitespace(self._pa_array) + else: + result = pc.utf8_rtrim(self._pa_array, characters=to_strip) + return type(self)(result) + def _str_pad( self, width: int, side: Literal["left", "right", "both"] = "left", fillchar: str = " ", - ): + ) -> Self: if side == "left": pa_pad = pc.utf8_lpad elif side == "right": pa_pad = pc.utf8_rpad elif side == "both": - pa_pad = pc.utf8_center + if 
pa_version_under17p0: + # GH#59624 fall back to object dtype + from pandas import array + + obj_arr = self.astype(object, copy=False) # type: ignore[attr-defined] + obj = array(obj_arr, dtype=object) + result = obj._str_pad(width, side, fillchar) # type: ignore[attr-defined] + return type(self)._from_sequence(result, dtype=self.dtype) # type: ignore[attr-defined] + else: + # GH#54792 + # https://github.com/apache/arrow/issues/15053#issuecomment-2317032347 + lean_left = (width % 2) == 0 + pa_pad = partial(pc.utf8_center, lean_left_on_odd_padding=lean_left) else: raise ValueError( f"Invalid side: {side}. Side must be one of 'left', 'right', 'both'" ) return type(self)(pa_pad(self._pa_array, width=width, padding=fillchar)) - def _str_get(self, i: int): + def _str_get(self, i: int) -> Self: lengths = pc.utf8_length(self._pa_array) if i >= 0: out_of_bounds = pc.greater_equal(i, lengths) @@ -51,15 +125,32 @@ def _str_get(self, i: int): selected = pc.utf8_slice_codeunits( self._pa_array, start=start, stop=stop, step=step ) - null_value = pa.scalar( - None, type=self._pa_array.type # type: ignore[attr-defined] - ) + null_value = pa.scalar(None, type=self._pa_array.type) result = pc.if_else(not_out_of_bounds, selected, null_value) return type(self)(result) + def _str_slice( + self, start: int | None = None, stop: int | None = None, step: int | None = None + ) -> Self: + if pa_version_under11p0: + # GH#59724 + result = self._apply_elementwise(lambda val: val[start:stop:step]) + return type(self)(pa.chunked_array(result, type=self._pa_array.type)) + if start is None: + if step is not None and step < 0: + # GH#59710 + start = -1 + else: + start = 0 + if step is None: + step = 1 + return type(self)( + pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) + ) + def _str_slice_replace( self, start: int | None = None, stop: int | None = None, repl: str | None = None - ): + ) -> Self: if repl is None: repl = "" if start is None: @@ -68,17 +159,202 @@ def _str_slice_replace( stop = np.iinfo(np.int64).max return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl)) - def _str_capitalize(self): + def _str_replace( + self, + pat: str | re.Pattern, + repl: str | Callable, + n: int = -1, + case: bool = True, + flags: int = 0, + regex: bool = True, + ) -> Self: + if isinstance(pat, re.Pattern) or callable(repl) or not case or flags: + raise NotImplementedError( + "replace is not supported with a re.Pattern, callable repl, " + "case=False, or flags!=0" + ) + + func = pc.replace_substring_regex if regex else pc.replace_substring + # https://github.com/apache/arrow/issues/39149 + # GH 56404, unexpected behavior with negative max_replacements with pyarrow. 
+ pa_max_replacements = None if n < 0 else n + result = func( + self._pa_array, + pattern=pat, + replacement=repl, + max_replacements=pa_max_replacements, + ) + return type(self)(result) + + def _str_capitalize(self) -> Self: return type(self)(pc.utf8_capitalize(self._pa_array)) - def _str_title(self): + def _str_title(self) -> Self: return type(self)(pc.utf8_title(self._pa_array)) - def _str_swapcase(self): + def _str_swapcase(self) -> Self: return type(self)(pc.utf8_swapcase(self._pa_array)) + def _str_removeprefix(self, prefix: str): + if not pa_version_under13p0: + starts_with = pc.starts_with(self._pa_array, pattern=prefix) + removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) + result = pc.if_else(starts_with, removed, self._pa_array) + return type(self)(result) + predicate = lambda val: val.removeprefix(prefix) + result = self._apply_elementwise(predicate) + return type(self)(pa.chunked_array(result)) + def _str_removesuffix(self, suffix: str): ends_with = pc.ends_with(self._pa_array, pattern=suffix) removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) result = pc.if_else(ends_with, removed, self._pa_array) return type(self)(result) + + def _str_startswith( + self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default + ): + if isinstance(pat, str): + result = pc.starts_with(self._pa_array, pattern=pat) + else: + if len(pat) == 0: + # For empty tuple we return null for missing values and False + # for valid values. + result = pc.if_else(pc.is_null(self._pa_array), None, False) + else: + result = pc.starts_with(self._pa_array, pattern=pat[0]) + + for p in pat[1:]: + result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) + return self._convert_bool_result(result, na=na, method_name="startswith") + + def _str_endswith( + self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default + ): + if isinstance(pat, str): + result = pc.ends_with(self._pa_array, pattern=pat) + else: + if len(pat) == 0: + # For empty tuple we return null for missing values and False + # for valid values. 
+ result = pc.if_else(pc.is_null(self._pa_array), None, False) + else: + result = pc.ends_with(self._pa_array, pattern=pat[0]) + + for p in pat[1:]: + result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) + return self._convert_bool_result(result, na=na, method_name="endswith") + + def _str_isalnum(self): + result = pc.utf8_is_alnum(self._pa_array) + return self._convert_bool_result(result) + + def _str_isalpha(self): + result = pc.utf8_is_alpha(self._pa_array) + return self._convert_bool_result(result) + + def _str_isascii(self): + result = pc.string_is_ascii(self._pa_array) + return self._convert_bool_result(result) + + def _str_isdecimal(self): + result = pc.utf8_is_decimal(self._pa_array) + return self._convert_bool_result(result) + + def _str_isdigit(self): + result = pc.utf8_is_digit(self._pa_array) + return self._convert_bool_result(result) + + def _str_islower(self): + result = pc.utf8_is_lower(self._pa_array) + return self._convert_bool_result(result) + + def _str_isnumeric(self): + result = pc.utf8_is_numeric(self._pa_array) + return self._convert_bool_result(result) + + def _str_isspace(self): + result = pc.utf8_is_space(self._pa_array) + return self._convert_bool_result(result) + + def _str_istitle(self): + result = pc.utf8_is_title(self._pa_array) + return self._convert_bool_result(result) + + def _str_isupper(self): + result = pc.utf8_is_upper(self._pa_array) + return self._convert_bool_result(result) + + def _str_contains( + self, + pat, + case: bool = True, + flags: int = 0, + na: Scalar | lib.NoDefault = lib.no_default, + regex: bool = True, + ): + if flags: + raise NotImplementedError(f"contains not implemented with {flags=}") + + if regex: + pa_contains = pc.match_substring_regex + else: + pa_contains = pc.match_substring + result = pa_contains(self._pa_array, pat, ignore_case=not case) + return self._convert_bool_result(result, na=na, method_name="contains") + + def _str_match( + self, + pat: str, + case: bool = True, + flags: int = 0, + na: Scalar | lib.NoDefault = lib.no_default, + ): + if not pat.startswith("^"): + pat = f"^{pat}" + return self._str_contains(pat, case, flags, na, regex=True) + + def _str_fullmatch( + self, + pat, + case: bool = True, + flags: int = 0, + na: Scalar | lib.NoDefault = lib.no_default, + ): + if not pat.endswith("$") or pat.endswith("\\$"): + pat = f"{pat}$" + return self._str_match(pat, case, flags, na) + + def _str_find(self, sub: str, start: int = 0, end: int | None = None): + if ( + pa_version_under13p0 + and not (start != 0 and end is not None) + and not (start == 0 and end is None) + ): + # GH#59562 + res_list = self._apply_elementwise(lambda val: val.find(sub, start, end)) + return self._convert_int_result(pa.chunked_array(res_list)) + + if (start == 0 or start is None) and end is None: + result = pc.find_substring(self._pa_array, sub) + else: + if sub == "": + # GH#56792 + res_list = self._apply_elementwise( + lambda val: val.find(sub, start, end) + ) + return self._convert_int_result(pa.chunked_array(res_list)) + if start is None: + start_offset = 0 + start = 0 + elif start < 0: + start_offset = pc.add(start, pc.utf8_length(self._pa_array)) + start_offset = pc.if_else(pc.less(start_offset, 0), 0, start_offset) + else: + start_offset = start + slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) + result = pc.find_substring(slices, sub) + found = pc.not_equal(result, pa.scalar(-1, type=result.type)) + offset_result = pc.add(result, start_offset) + result = pc.if_else(found, offset_result, -1) + return 
self._convert_int_result(result) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 9ece12cf51a7b..26585e7bab8e3 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -33,7 +33,6 @@ from pandas.util._decorators import doc from pandas.util._validators import ( validate_bool_kwarg, - validate_fillna_kwargs, validate_insert_loc, ) @@ -89,7 +88,9 @@ def method(self, *args, **kwargs): return cast(F, method) -class NDArrayBackedExtensionArray(NDArrayBacked, ExtensionArray): +# error: Definition of "delete/ravel/T/repeat/copy" in base class "NDArrayBacked" +# is incompatible with definition in base class "ExtensionArray" +class NDArrayBackedExtensionArray(NDArrayBacked, ExtensionArray): # type: ignore[misc] """ ExtensionArray that is backed by a single NumPy ndarray. """ @@ -141,18 +142,12 @@ def view(self, dtype: Dtype | None = None) -> ArrayLike: dt64_values = arr.view(dtype) return DatetimeArray._simple_new(dt64_values, dtype=dtype) - elif lib.is_np_dtype(dtype, "m") and is_supported_dtype(dtype): from pandas.core.arrays import TimedeltaArray td64_values = arr.view(dtype) return TimedeltaArray._simple_new(td64_values, dtype=dtype) - - # error: Argument "dtype" to "view" of "_ArrayOrScalarCommon" has incompatible - # type "Union[ExtensionDtype, dtype[Any]]"; expected "Union[dtype[Any], None, - # type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any, Union[int, - # Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]" - return arr.view(dtype=dtype) # type: ignore[arg-type] + return arr.view(dtype=dtype) def take( self, @@ -209,7 +204,7 @@ def argmin(self, axis: AxisInt = 0, skipna: bool = True): # type: ignore[overri # override base class by adding axis keyword validate_bool_kwarg(skipna, "skipna") if not skipna and self._hasna: - raise NotImplementedError + raise ValueError("Encountered an NA value with skipna=False") return nargminmax(self, "argmin", axis=axis) # Signature of "argmax" incompatible with supertype "ExtensionArray" @@ -217,7 +212,7 @@ def argmax(self, axis: AxisInt = 0, skipna: bool = True): # type: ignore[overri # override base class by adding axis keyword validate_bool_kwarg(skipna, "skipna") if not skipna and self._hasna: - raise NotImplementedError + raise ValueError("Encountered an NA value with skipna=False") return nargminmax(self, "argmax", axis=axis) def unique(self) -> Self: @@ -248,7 +243,7 @@ def searchsorted( return self._ndarray.searchsorted(npvalue, side=side, sorter=sorter) @doc(ExtensionArray.shift) - def shift(self, periods: int = 1, fill_value=None): + def shift(self, periods: int = 1, fill_value=None) -> Self: # NB: shift is always along axis=0 axis = 0 fill_value = self._validate_scalar(fill_value) @@ -265,15 +260,13 @@ def _validate_setitem_value(self, value): return value @overload - def __getitem__(self, key: ScalarIndexer) -> Any: - ... + def __getitem__(self, key: ScalarIndexer) -> Any: ... @overload def __getitem__( self, key: SequenceIndexer | PositionalIndexerTuple, - ) -> Self: - ... + ) -> Self: ... 
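The `argmin`/`argmax` hunks above swap a bare NotImplementedError for a ValueError with an explicit message when an NA is present and skipna=False. A hedged sketch of the user-visible effect, using a DatetimeArray (one NDArray-backed extension array) with an example input of my choosing:

    import pandas as pd

    # NDArray-backed extension array with a missing value (NaT) in the middle
    arr = pd.array(pd.to_datetime(["2024-01-02", None, "2024-01-01"]))

    try:
        arr.argmin(skipna=False)
    except ValueError as err:
        print(err)  # Encountered an NA value with skipna=False

    print(arr.argmin())  # 2 -- with the default skipna=True, NaT is ignored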
def __getitem__( self, @@ -297,15 +290,13 @@ def __getitem__( result = self._from_backing_data(result) return result - def _fill_mask_inplace( - self, method: str, limit: int | None, mask: npt.NDArray[np.bool_] - ) -> None: - # (for now) when self.ndim == 2, we assume axis=0 - func = missing.get_fill_func(method, ndim=self.ndim) - func(self._ndarray.T, limit=limit, mask=mask.T) - def _pad_or_backfill( - self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, ) -> Self: mask = self.isna() if mask.any(): @@ -315,7 +306,7 @@ def _pad_or_backfill( npvalues = self._ndarray.T if copy: npvalues = npvalues.copy() - func(npvalues, limit=limit, mask=mask.T) + func(npvalues, limit=limit, limit_area=limit_area, mask=mask.T) npvalues = npvalues.T if copy: @@ -331,44 +322,33 @@ def _pad_or_backfill( return new_values @doc(ExtensionArray.fillna) - def fillna( - self, value=None, method=None, limit: int | None = None, copy: bool = True - ) -> Self: - value, method = validate_fillna_kwargs( - value, method, validate_scalar_dict_value=False - ) - + def fillna(self, value, limit: int | None = None, copy: bool = True) -> Self: mask = self.isna() + if limit is not None and limit < len(self): + # mypy doesn't like that mask can be an EA which need not have `cumsum` + modify = mask.cumsum() > limit # type: ignore[union-attr] + if modify.any(): + # Only copy mask if necessary + mask = mask.copy() + mask[modify] = False # error: Argument 2 to "check_value_size" has incompatible type # "ExtensionArray"; expected "ndarray" value = missing.check_value_size( - value, mask, len(self) # type: ignore[arg-type] + value, + mask, # type: ignore[arg-type] + len(self), ) if mask.any(): - if method is not None: - # (for now) when self.ndim == 2, we assume axis=0 - func = missing.get_fill_func(method, ndim=self.ndim) - npvalues = self._ndarray.T - if copy: - npvalues = npvalues.copy() - func(npvalues, limit=limit, mask=mask.T) - npvalues = npvalues.T - - # TODO: NumpyExtensionArray didn't used to copy, need tests - # for this - new_values = self._from_backing_data(npvalues) + # fill with value + if copy: + new_values = self.copy() else: - # fill with value - if copy: - new_values = self.copy() - else: - new_values = self[:] - new_values[mask] = value + new_values = self[:] + new_values[mask] = value else: # We validate the fill_value even if there is nothing to fill - if value is not None: - self._validate_setitem_value(value) + self._validate_setitem_value(value) if not copy: new_values = self[:] @@ -379,7 +359,7 @@ def fillna( # ------------------------------------------------------------------------ # Reductions - def _wrap_reduction_result(self, axis: AxisInt | None, result): + def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any: if axis is None or self.ndim == 1: return self._box_func(result) return self._from_backing_data(result) @@ -510,17 +490,14 @@ def _quantile( fill_value = self._internal_fill_value res_values = quantile_with_mask(arr, mask, fill_value, qs, interpolation) - - res_values = self._cast_quantile_result(res_values) - return self._from_backing_data(res_values) - - # TODO: see if we can share this with other dispatch-wrapping methods - def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray: - """ - Cast the result of quantile_with_mask to an appropriate dtype - to pass to _from_backing_data in _quantile. 
- """ - return res_values + if res_values.dtype == self._ndarray.dtype: + return self._from_backing_data(res_values) + else: + # e.g. test_quantile_empty we are empty integer dtype and res_values + # has floating dtype + # TODO: technically __init__ isn't defined here. + # Should we raise NotImplementedError and handle this on NumpyEA? + return type(self)(res_values) # type: ignore[call-arg] # ------------------------------------------------------------------------ # numpy-like methods diff --git a/pandas/core/arrays/_ranges.py b/pandas/core/arrays/_ranges.py index 3e89391324ad4..88f5ac4ebdea4 100644 --- a/pandas/core/arrays/_ranges.py +++ b/pandas/core/arrays/_ranges.py @@ -2,6 +2,7 @@ Helper functions to generate range-like data for DatetimeArray (and possibly TimedeltaArray/PeriodArray) """ + from __future__ import annotations from typing import TYPE_CHECKING @@ -17,6 +18,8 @@ iNaT, ) +from pandas.core.construction import range_to_ndarray + if TYPE_CHECKING: from pandas._typing import npt @@ -81,17 +84,7 @@ def generate_regular_range( "at least 'start' or 'end' should be specified if a 'period' is given." ) - with np.errstate(over="raise"): - # If the range is sufficiently large, np.arange may overflow - # and incorrectly return an empty array if not caught. - try: - values = np.arange(b, e, stride, dtype=np.int64) - except FloatingPointError: - xdr = [b] - while xdr[-1] != e: - xdr.append(xdr[-1] + stride) - values = np.array(xdr[:-1], dtype=np.int64) - return values + return range_to_ndarray(range(b, e, stride)) def _generate_range_overflow_safe( diff --git a/pandas/core/arrays/_utils.py b/pandas/core/arrays/_utils.py index c75ec7f843ed2..6b46396d5efdf 100644 --- a/pandas/core/arrays/_utils.py +++ b/pandas/core/arrays/_utils.py @@ -44,7 +44,16 @@ def to_numpy_dtype_inference( dtype_given = True if na_value is lib.no_default: - na_value = arr.dtype.na_value + if dtype is None or not hasna: + na_value = arr.dtype.na_value + elif dtype.kind == "f": # type: ignore[union-attr] + na_value = np.nan + elif dtype.kind == "M": # type: ignore[union-attr] + na_value = np.datetime64("nat") + elif dtype.kind == "m": # type: ignore[union-attr] + na_value = np.timedelta64("nat") + else: + na_value = arr.dtype.na_value if not dtype_given and hasna: try: diff --git a/pandas/core/arrays/arrow/__init__.py b/pandas/core/arrays/arrow/__init__.py index 5fc50f786fc6a..50274a2de2cc1 100644 --- a/pandas/core/arrays/arrow/__init__.py +++ b/pandas/core/arrays/arrow/__init__.py @@ -4,4 +4,4 @@ ) from pandas.core.arrays.arrow.array import ArrowExtensionArray -__all__ = ["ArrowExtensionArray", "StructAccessor", "ListAccessor"] +__all__ = ["ArrowExtensionArray", "ListAccessor", "StructAccessor"] diff --git a/pandas/core/arrays/arrow/_arrow_utils.py b/pandas/core/arrays/arrow/_arrow_utils.py index 2a053fac2985c..7da83e2257e30 100644 --- a/pandas/core/arrays/arrow/_arrow_utils.py +++ b/pandas/core/arrays/arrow/_arrow_utils.py @@ -1,24 +1,8 @@ from __future__ import annotations -import warnings - import numpy as np import pyarrow -from pandas.errors import PerformanceWarning -from pandas.util._exceptions import find_stack_level - - -def fallback_performancewarning(version: str | None = None) -> None: - """ - Raise a PerformanceWarning for falling back to ExtensionArray's - non-pyarrow method - """ - msg = "Falling back on a non-pyarrow code path which may decrease performance." - if version is not None: - msg += f" Upgrade to pyarrow >={version} to possibly suppress this warning." 
- warnings.warn(msg, PerformanceWarning, stacklevel=find_stack_level()) - def pyarrow_array_to_numpy_and_mask( arr, dtype: np.dtype @@ -60,7 +44,7 @@ def pyarrow_array_to_numpy_and_mask( mask = pyarrow.BooleanArray.from_buffers( pyarrow.bool_(), len(arr), [None, bitmask], offset=arr.offset ) - mask = np.asarray(mask) + mask = np.asarray(mask) # type: ignore[assignment] else: mask = np.ones(len(arr), dtype=bool) return data, mask diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py index 7f88267943526..b220a94d032b5 100644 --- a/pandas/core/arrays/arrow/accessors.py +++ b/pandas/core/arrays/arrow/accessors.py @@ -6,13 +6,18 @@ ABCMeta, abstractmethod, ) -from typing import TYPE_CHECKING +from typing import ( + TYPE_CHECKING, + cast, +) from pandas.compat import ( pa_version_under10p1, pa_version_under11p0, ) +from pandas.core.dtypes.common import is_list_like + if not pa_version_under10p1: import pyarrow as pa import pyarrow.compute as pc @@ -39,9 +44,9 @@ def __init__(self, data, validation_msg: str) -> None: def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool: pass - def _validate(self, data): + def _validate(self, data) -> None: dtype = data.dtype - if not isinstance(dtype, ArrowDtype): + if pa_version_under10p1 or not isinstance(dtype, ArrowDtype): # Raise AttributeError so that inspect can handle non-struct Series. raise AttributeError(self._validation_msg.format(dtype=dtype)) @@ -87,6 +92,12 @@ def len(self) -> Series: pandas.Series The length of each list. + See Also + -------- + str.len : Python built-in function returning the length of an object. + Series.size : Returns the length of the Series. + StringMethods.len : Compute the length of each element in the Series/Index. + Examples -------- >>> import pyarrow as pa @@ -95,9 +106,7 @@ def len(self) -> Series: ... [1, 2, 3], ... [3], ... ], - ... dtype=pd.ArrowDtype(pa.list_( - ... pa.int64() - ... )) + ... dtype=pd.ArrowDtype(pa.list_(pa.int64())), ... ) >>> s.list.len() 0 3 @@ -107,7 +116,12 @@ def len(self) -> Series: from pandas import Series value_lengths = pc.list_value_length(self._pa_array) - return Series(value_lengths, dtype=ArrowDtype(value_lengths.type)) + return Series( + value_lengths, + dtype=ArrowDtype(value_lengths.type), + index=self._data.index, + name=self._data.name, + ) def __getitem__(self, key: int | slice) -> Series: """ @@ -123,6 +137,10 @@ def __getitem__(self, key: int | slice) -> Series: pandas.Series The list at requested index. + See Also + -------- + ListAccessor.flatten : Flatten list values. + Examples -------- >>> import pyarrow as pa @@ -131,9 +149,7 @@ def __getitem__(self, key: int | slice) -> Series: ... [1, 2, 3], ... [3], ... ], - ... dtype=pd.ArrowDtype(pa.list_( - ... pa.int64() - ... )) + ... dtype=pd.ArrowDtype(pa.list_(pa.int64())), ... 
) >>> s.list[0] 0 1 @@ -148,7 +164,12 @@ def __getitem__(self, key: int | slice) -> Series: # if key < 0: # key = pc.add(key, pc.list_value_length(self._pa_array)) element = pc.list_element(self._pa_array, key) - return Series(element, dtype=ArrowDtype(element.type)) + return Series( + element, + dtype=ArrowDtype(element.type), + index=self._data.index, + name=self._data.name, + ) elif isinstance(key, slice): if pa_version_under11p0: raise NotImplementedError( @@ -166,7 +187,12 @@ def __getitem__(self, key: int | slice) -> Series: if step is None: step = 1 sliced = pc.list_slice(self._pa_array, start, stop, step) - return Series(sliced, dtype=ArrowDtype(sliced.type)) + return Series( + sliced, + dtype=ArrowDtype(sliced.type), + index=self._data.index, + name=self._data.name, + ) else: raise ValueError(f"key must be an int or slice, got {type(key).__name__}") @@ -182,6 +208,10 @@ def flatten(self) -> Series: pandas.Series The data from all lists in the series flattened. + See Also + -------- + ListAccessor.__getitem__ : Index or slice values in the Series. + Examples -------- >>> import pyarrow as pa @@ -190,21 +220,26 @@ def flatten(self) -> Series: ... [1, 2, 3], ... [3], ... ], - ... dtype=pd.ArrowDtype(pa.list_( - ... pa.int64() - ... )) + ... dtype=pd.ArrowDtype(pa.list_(pa.int64())), ... ) >>> s.list.flatten() 0 1 - 1 2 - 2 3 - 3 3 + 0 2 + 0 3 + 1 3 dtype: int64[pyarrow] """ from pandas import Series - flattened = pc.list_flatten(self._pa_array) - return Series(flattened, dtype=ArrowDtype(flattened.type)) + counts = pa.compute.list_value_length(self._pa_array) + flattened = pa.compute.list_flatten(self._pa_array) + index = self._data.index.repeat(counts.fill_null(pa.scalar(0, counts.type))) + return Series( + flattened, + dtype=ArrowDtype(flattened.type), + index=index, + name=self._data.name, + ) class StructAccessor(ArrowAccessor): @@ -239,6 +274,10 @@ def dtypes(self) -> Series: pandas.Series The data type of each child field. + See Also + -------- + Series.dtype: Return the dtype object of the underlying data. + Examples -------- >>> import pyarrow as pa @@ -248,9 +287,9 @@ def dtypes(self) -> Series: ... {"version": 2, "project": "pandas"}, ... {"version": 1, "project": "numpy"}, ... ], - ... dtype=pd.ArrowDtype(pa.struct( - ... [("version", pa.int64()), ("project", pa.string())] - ... )) + ... dtype=pd.ArrowDtype( + ... pa.struct([("version", pa.int64()), ("project", pa.string())]) + ... ), ... ) >>> s.struct.dtypes version int64[pyarrow] @@ -267,15 +306,27 @@ def dtypes(self) -> Series: names = [struct.name for struct in pa_type] return Series(types, index=Index(names)) - def field(self, name_or_index: str | int) -> Series: + def field( + self, + name_or_index: list[str] + | list[bytes] + | list[int] + | pc.Expression + | bytes + | str + | int, + ) -> Series: """ Extract a child field of a struct as a Series. Parameters ---------- - name_or_index : str | int + name_or_index : str | bytes | int | expression | list Name or index of the child field to extract. + For list-like inputs, this will index into a nested + struct. + Returns ------- pandas.Series @@ -285,6 +336,19 @@ def field(self, name_or_index: str | int) -> Series: -------- Series.struct.explode : Return all child fields as a DataFrame. + Notes + ----- + The name of the resulting Series will be set using the following + rules: + + - For string, bytes, or integer `name_or_index` (or a list of these, for + a nested selection), the Series name is set to the selected + field's name. 
+ - For a :class:`pyarrow.compute.Expression`, this is set to + the string form of the expression. + - For list-like `name_or_index`, the name will be set to the + name of the final field selected. + Examples -------- >>> import pyarrow as pa @@ -294,9 +358,9 @@ def field(self, name_or_index: str | int) -> Series: ... {"version": 2, "project": "pandas"}, ... {"version": 1, "project": "numpy"}, ... ], - ... dtype=pd.ArrowDtype(pa.struct( - ... [("version", pa.int64()), ("project", pa.string())] - ... )) + ... dtype=pd.ArrowDtype( + ... pa.struct([("version", pa.int64()), ("project", pa.string())]) + ... ), ... ) Extract by field name. @@ -314,27 +378,94 @@ def field(self, name_or_index: str | int) -> Series: 1 2 2 1 Name: version, dtype: int64[pyarrow] + + Or an expression + + >>> import pyarrow.compute as pc + >>> s.struct.field(pc.field("project")) + 0 pandas + 1 pandas + 2 numpy + Name: project, dtype: string[pyarrow] + + For nested struct types, you can pass a list of values to index + multiple levels: + + >>> version_type = pa.struct( + ... [ + ... ("major", pa.int64()), + ... ("minor", pa.int64()), + ... ] + ... ) + >>> s = pd.Series( + ... [ + ... {"version": {"major": 1, "minor": 5}, "project": "pandas"}, + ... {"version": {"major": 2, "minor": 1}, "project": "pandas"}, + ... {"version": {"major": 1, "minor": 26}, "project": "numpy"}, + ... ], + ... dtype=pd.ArrowDtype( + ... pa.struct([("version", version_type), ("project", pa.string())]) + ... ), + ... ) + >>> s.struct.field(["version", "minor"]) + 0 5 + 1 1 + 2 26 + Name: minor, dtype: int64[pyarrow] + >>> s.struct.field([0, 0]) + 0 1 + 1 2 + 2 1 + Name: major, dtype: int64[pyarrow] """ from pandas import Series + def get_name( + level_name_or_index: list[str] + | list[bytes] + | list[int] + | pc.Expression + | bytes + | str + | int, + data: pa.ChunkedArray, + ): + if isinstance(level_name_or_index, int): + name = data.type.field(level_name_or_index).name + elif isinstance(level_name_or_index, (str, bytes)): + name = level_name_or_index + elif isinstance(level_name_or_index, pc.Expression): + name = str(level_name_or_index) + elif is_list_like(level_name_or_index): + # For nested input like [2, 1, 2] + # iteratively get the struct and field name. The last + # one is used for the name of the index. + level_name_or_index = list(reversed(level_name_or_index)) + selected = data + while level_name_or_index: + # we need the cast, otherwise mypy complains about + # getting ints, bytes, or str here, which isn't possible. 
+ level_name_or_index = cast(list, level_name_or_index) + name_or_index = level_name_or_index.pop() + name = get_name(name_or_index, selected) + selected = selected.type.field(selected.type.get_field_index(name)) + name = selected.name + else: + raise ValueError( + "name_or_index must be an int, str, bytes, " + "pyarrow.compute.Expression, or list of those" + ) + return name + pa_arr = self._data.array._pa_array - if isinstance(name_or_index, int): - index = name_or_index - elif isinstance(name_or_index, str): - index = pa_arr.type.get_field_index(name_or_index) - else: - raise ValueError( - "name_or_index must be an int or str, " - f"got {type(name_or_index).__name__}" - ) + name = get_name(name_or_index, pa_arr) + field_arr = pc.struct_field(pa_arr, name_or_index) - pa_field = pa_arr.type[index] - field_arr = pc.struct_field(pa_arr, [index]) return Series( field_arr, dtype=ArrowDtype(field_arr.type), index=self._data.index, - name=pa_field.name, + name=name, ) def explode(self) -> DataFrame: @@ -359,9 +490,9 @@ def explode(self) -> DataFrame: ... {"version": 2, "project": "pandas"}, ... {"version": 1, "project": "numpy"}, ... ], - ... dtype=pd.ArrowDtype(pa.struct( - ... [("version", pa.int64()), ("project", pa.string())] - ... )) + ... dtype=pd.ArrowDtype( + ... pa.struct([("version", pa.int64()), ("project", pa.string())]) + ... ), ... ) >>> s.struct.explode() diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 23b5448029dd9..0b90bcea35100 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -7,9 +7,9 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, cast, + overload, ) import unicodedata @@ -27,20 +27,21 @@ pa_version_under13p0, ) from pandas.util._decorators import doc -from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.cast import ( can_hold_element, infer_dtype_from_scalar, ) from pandas.core.dtypes.common import ( - CategoricalDtype, is_array_like, is_bool_dtype, + is_float_dtype, is_integer, is_list_like, is_numeric_dtype, is_scalar, + is_string_dtype, + pandas_dtype, ) from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import isna @@ -67,6 +68,7 @@ unpack_tuple_and_ellipses, validate_indices, ) +from pandas.core.nanops import check_below_min_count from pandas.core.strings.base import BaseStringArrayMethods from pandas.io._util import _arrow_dtype_mapping @@ -107,25 +109,50 @@ def cast_for_truediv( arrow_array: pa.ChunkedArray, pa_object: pa.Array | pa.Scalar - ) -> pa.ChunkedArray: + ) -> tuple[pa.ChunkedArray, pa.Array | pa.Scalar]: # Ensure int / int -> float mirroring Python/Numpy behavior # as pc.divide_checked(int, int) -> int if pa.types.is_integer(arrow_array.type) and pa.types.is_integer( pa_object.type ): - return arrow_array.cast(pa.float64()) - return arrow_array + # GH: 56645. + # https://github.com/apache/arrow/issues/35563 + return pc.cast(arrow_array, pa.float64(), safe=False), pc.cast( + pa_object, pa.float64(), safe=False + ) + + return arrow_array, pa_object def floordiv_compat( left: pa.ChunkedArray | pa.Array | pa.Scalar, right: pa.ChunkedArray | pa.Array | pa.Scalar, ) -> pa.ChunkedArray: - # Ensure int // int -> int mirroring Python/Numpy behavior - # as pc.floor(pc.divide_checked(int, int)) -> float - converted_left = cast_for_truediv(left, right) - result = pc.floor(pc.divide(converted_left, right)) + # TODO: Replace with pyarrow floordiv kernel. 
+ # https://github.com/apache/arrow/issues/39386 if pa.types.is_integer(left.type) and pa.types.is_integer(right.type): + divided = pc.divide_checked(left, right) + if pa.types.is_signed_integer(divided.type): + # GH 56676 + has_remainder = pc.not_equal(pc.multiply(divided, right), left) + has_one_negative_operand = pc.less( + pc.bit_wise_xor(left, right), + pa.scalar(0, type=divided.type), + ) + result = pc.if_else( + pc.and_( + has_remainder, + has_one_negative_operand, + ), + # GH: 55561 + pc.subtract(divided, pa.scalar(1, type=divided.type)), + divided, + ) + else: + result = divided result = result.cast(left.type) + else: + divided = pc.divide(left, right) + result = pc.floor(divided) return result ARROW_ARITHMETIC_FUNCS = { @@ -135,8 +162,8 @@ def floordiv_compat( "rsub": lambda x, y: pc.subtract_checked(y, x), "mul": pc.multiply_checked, "rmul": lambda x, y: pc.multiply_checked(y, x), - "truediv": lambda x, y: pc.divide(cast_for_truediv(x, y), y), - "rtruediv": lambda x, y: pc.divide(y, cast_for_truediv(x, y)), + "truediv": lambda x, y: pc.divide(*cast_for_truediv(x, y)), + "rtruediv": lambda x, y: pc.divide(*cast_for_truediv(y, x)), "floordiv": lambda x, y: floordiv_compat(x, y), "rfloordiv": lambda x, y: floordiv_compat(y, x), "mod": NotImplemented, @@ -148,13 +175,18 @@ def floordiv_compat( } if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import ( + Callable, + Sequence, + ) + from pandas._libs.missing import NAType from pandas._typing import ( ArrayLike, AxisInt, Dtype, FillnaOptions, + InterpolateOptions, Iterator, NpDtype, NumpySorter, @@ -169,12 +201,14 @@ def floordiv_compat( npt, ) + from pandas.core.dtypes.dtypes import ExtensionDtype + from pandas import Series from pandas.core.arrays.datetimes import DatetimeArray from pandas.core.arrays.timedeltas import TimedeltaArray -def get_unit_from_pa_dtype(pa_dtype): +def get_unit_from_pa_dtype(pa_dtype) -> str: # https://github.com/pandas-dev/pandas/pull/50998#discussion_r1100344804 if pa_version_under11p0: unit = str(pa_dtype).split("[", 1)[-1][:-1] @@ -223,6 +257,7 @@ class ArrowExtensionArray( Parameters ---------- values : pyarrow.Array or pyarrow.ChunkedArray + The input data to initialize the ArrowExtensionArray. Attributes ---------- @@ -236,6 +271,12 @@ class ArrowExtensionArray( ------- ArrowExtensionArray + See Also + -------- + array : Create a Pandas array with a specified dtype. + DataFrame.to_feather : Write a DataFrame to the binary Feather format. + read_feather : Load a feather-format object from the file path. + Notes ----- Most methods are implemented using `pyarrow compute functions. `__ @@ -273,7 +314,9 @@ def __init__(self, values: pa.Array | pa.ChunkedArray) -> None: self._dtype = ArrowDtype(self._pa_array.type) @classmethod - def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): + def _from_sequence( + cls, scalars, *, dtype: Dtype | None = None, copy: bool = False + ) -> Self: """ Construct a new ExtensionArray from a sequence of scalars. """ @@ -284,8 +327,8 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal @classmethod def _from_sequence_of_strings( - cls, strings, *, dtype: Dtype | None = None, copy: bool = False - ): + cls, strings, *, dtype: ExtensionDtype, copy: bool = False + ) -> Self: """ Construct a new ExtensionArray from a sequence of strings. 
""" @@ -492,21 +535,20 @@ def _box_pa_array( if pa_type is not None and pa_array.type != pa_type: if pa.types.is_dictionary(pa_type): pa_array = pa_array.dictionary_encode() + if pa_array.type != pa_type: + pa_array = pa_array.cast(pa_type) else: try: pa_array = pa_array.cast(pa_type) - except ( - pa.ArrowInvalid, - pa.ArrowTypeError, - pa.ArrowNotImplementedError, - ): + except (pa.ArrowNotImplementedError, pa.ArrowTypeError): if pa.types.is_string(pa_array.type) or pa.types.is_large_string( pa_array.type ): # TODO: Move logic in _from_sequence_of_strings into # _box_pa_array + dtype = ArrowDtype(pa_type) return cls._from_sequence_of_strings( - value, dtype=pa_type + value, dtype=dtype )._pa_array else: raise @@ -542,10 +584,11 @@ def __getitem__(self, item: PositionalIndexer): if isinstance(item, np.ndarray): if not len(item): # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string] - if self._dtype.name == "string" and self._dtype.storage in ( - "pyarrow", - "pyarrow_numpy", + if ( + isinstance(self._dtype, StringDtype) + and self._dtype.storage == "pyarrow" ): + # TODO(infer_string) should this be large_string? pa_dtype = pa.string() else: pa_dtype = self._dtype.pyarrow_dtype @@ -628,9 +671,20 @@ def __arrow_array__(self, type=None): """Convert myself to a pyarrow ChunkedArray.""" return self._pa_array - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: """Correctly construct numpy arrays when passed to `np.asarray()`.""" - return self.to_numpy(dtype=dtype) + if copy is False: + # TODO: By using `zero_copy_only` it may be possible to implement this + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + elif copy is None: + # `to_numpy(copy=False)` has the meaning of NumPy `copy=None`. + copy = False + + return self.to_numpy(dtype=dtype, copy=copy) def __invert__(self) -> Self: # This is a bit wise op for integer types @@ -645,7 +699,12 @@ def __invert__(self) -> Self: return type(self)(pc.invert(self._pa_array)) def __neg__(self) -> Self: - return type(self)(pc.negate_checked(self._pa_array)) + try: + return type(self)(pc.negate_checked(self._pa_array)) + except pa.ArrowNotImplementedError as err: + raise TypeError( + f"unary '-' not supported for dtype '{self.dtype}'" + ) from err def __pos__(self) -> Self: return type(self)(self._pa_array) @@ -668,12 +727,16 @@ def __setstate__(self, state) -> None: state["_pa_array"] = pa.chunked_array(data) self.__dict__.update(state) - def _cmp_method(self, other, op): + def _cmp_method(self, other, op) -> ArrowExtensionArray: pc_func = ARROW_CMP_FUNCS[op.__name__] - if isinstance( - other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray) - ) or isinstance(getattr(other, "dtype", None), CategoricalDtype): - result = pc_func(self._pa_array, self._box_pa(other)) + if isinstance(other, (ExtensionArray, np.ndarray, list)): + try: + result = pc_func(self._pa_array, self._box_pa(other)) + except pa.ArrowNotImplementedError: + # TODO: could this be wrong if other is object dtype? + # in which case we need to operate pointwise? 
+ result = ops.invalid_comparison(self, other, op) + result = pa.array(result, type=pa.bool_()) elif is_scalar(other): try: result = pc_func(self._pa_array, self._box_pa(other)) @@ -685,7 +748,7 @@ def _cmp_method(self, other, op): try: result[valid] = op(np_array[valid], other) except TypeError: - result = ops.invalid_comparison(np_array, other, op) + result = ops.invalid_comparison(self, other, op) result = pa.array(result, type=pa.bool_()) result = pc.if_else(valid, result, None) else: @@ -694,8 +757,19 @@ def _cmp_method(self, other, op): ) return ArrowExtensionArray(result) - def _evaluate_op_method(self, other, op, arrow_funcs): + def _op_method_error_message(self, other, op) -> str: + if hasattr(other, "dtype"): + other_type = f"dtype '{other.dtype}'" + else: + other_type = f"object of type {type(other)}" + return ( + f"operation '{op.__name__}' not supported for " + f"dtype '{self.dtype}' with {other_type}" + ) + + def _evaluate_op_method(self, other, op, arrow_funcs) -> Self: pa_type = self._pa_array.type + other_original = other other = self._box_pa(other) if ( @@ -705,10 +779,15 @@ def _evaluate_op_method(self, other, op, arrow_funcs): ): if op in [operator.add, roperator.radd]: sep = pa.scalar("", type=pa_type) - if op is operator.add: - result = pc.binary_join_element_wise(self._pa_array, other, sep) - elif op is roperator.radd: - result = pc.binary_join_element_wise(other, self._pa_array, sep) + try: + if op is operator.add: + result = pc.binary_join_element_wise(self._pa_array, other, sep) + elif op is roperator.radd: + result = pc.binary_join_element_wise(other, self._pa_array, sep) + except pa.ArrowNotImplementedError as err: + raise TypeError( + self._op_method_error_message(other_original, op) + ) from err return type(self)(result) elif op in [operator.mul, roperator.rmul]: binary = self._pa_array @@ -740,12 +819,17 @@ def _evaluate_op_method(self, other, op, arrow_funcs): pc_func = arrow_funcs[op.__name__] if pc_func is NotImplemented: + if pa.types.is_string(pa_type) or pa.types.is_large_string(pa_type): + raise TypeError(self._op_method_error_message(other_original, op)) raise NotImplementedError(f"{op.__name__} not implemented.") - result = pc_func(self._pa_array, other) + try: + result = pc_func(self._pa_array, other) + except pa.ArrowNotImplementedError as err: + raise TypeError(self._op_method_error_message(other_original, op)) from err return type(self)(result) - def _logical_method(self, other, op): + def _logical_method(self, other, op) -> Self: # For integer types `^`, `|`, `&` are bitwise operators and return # integer types. Otherwise these are boolean ops. if pa.types.is_integer(self._pa_array.type): @@ -753,7 +837,7 @@ def _logical_method(self, other, op): else: return self._evaluate_op_method(other, op, ARROW_LOGICAL_FUNCS) - def _arith_method(self, other, op): + def _arith_method(self, other, op) -> Self: return self._evaluate_op_method(other, op, ARROW_ARITHMETIC_FUNCS) def equals(self, other) -> bool: @@ -818,7 +902,13 @@ def isna(self) -> npt.NDArray[np.bool_]: return self._pa_array.is_null().to_numpy() - def any(self, *, skipna: bool = True, **kwargs): + @overload + def any(self, *, skipna: Literal[True] = ..., **kwargs) -> bool: ... + + @overload + def any(self, *, skipna: bool, **kwargs) -> bool | NAType: ... + + def any(self, *, skipna: bool = True, **kwargs) -> bool | NAType: """ Return whether any element is truthy. 
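The try/except added around the pyarrow compute calls in `_evaluate_op_method` above means unsupported arithmetic now surfaces as a TypeError built by `_op_method_error_message`, rather than a raw `pa.ArrowNotImplementedError`. A hedged sketch of the resulting behavior (the exact message text shown is indicative):

    import pandas as pd
    import pyarrow as pa

    ser = pd.Series(["a", "b"], dtype=pd.ArrowDtype(pa.string()))

    try:
        ser - ser  # pyarrow has no subtraction kernel for strings
    except TypeError as err:
        # operation 'sub' not supported for dtype 'string[pyarrow]'
        # with dtype 'string[pyarrow]'
        print(err)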
@@ -876,7 +966,13 @@ def any(self, *, skipna: bool = True, **kwargs): """ return self._reduce("any", skipna=skipna, **kwargs) - def all(self, *, skipna: bool = True, **kwargs): + @overload + def all(self, *, skipna: Literal[True] = ..., **kwargs) -> bool: ... + + @overload + def all(self, *, skipna: bool, **kwargs) -> bool | NAType: ... + + def all(self, *, skipna: bool = True, **kwargs) -> bool | NAType: """ Return whether all elements are truthy. @@ -998,13 +1094,17 @@ def dropna(self) -> Self: return type(self)(pc.drop_null(self._pa_array)) def _pad_or_backfill( - self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, ) -> Self: if not self._hasna: - # TODO(CoW): Not necessary anymore when CoW is the default - return self.copy() + return self - if limit is None: + if limit is None and limit_area is None: method = missing.clean_fill_method(method) try: if method == "pad": @@ -1018,29 +1118,25 @@ def _pad_or_backfill( # a kernel for duration types. pass + # TODO: Why do we no longer need the above cases? # TODO(3.0): after EA.fillna 'method' deprecation is enforced, we can remove # this method entirely. - return super()._pad_or_backfill(method=method, limit=limit, copy=copy) + return super()._pad_or_backfill( + method=method, limit=limit, limit_area=limit_area, copy=copy + ) @doc(ExtensionArray.fillna) def fillna( self, - value: object | ArrayLike | None = None, - method: FillnaOptions | None = None, + value: object | ArrayLike, limit: int | None = None, copy: bool = True, ) -> Self: - value, method = validate_fillna_kwargs(value, method) - if not self._hasna: - # TODO(CoW): Not necessary anymore when CoW is the default return self.copy() if limit is not None: - return super().fillna(value=value, method=method, limit=limit, copy=copy) - - if method is not None: - return super().fillna(method=method, limit=limit, copy=copy) + return super().fillna(value=value, limit=limit, copy=copy) if isinstance(value, (np.ndarray, ExtensionArray)): # Similar to check_value_size, but we do not mask here since we may @@ -1054,7 +1150,7 @@ def fillna( try: fill_value = self._box_pa(value, pa_type=self._pa_array.type) except pa.ArrowTypeError as err: - msg = f"Invalid value '{str(value)}' for dtype {self.dtype}" + msg = f"Invalid value '{value!s}' for dtype '{self.dtype}'" raise TypeError(msg) from err try: @@ -1066,7 +1162,7 @@ def fillna( # a kernel for duration types. pass - return super().fillna(value=value, method=method, limit=limit, copy=copy) + return super().fillna(value=value, limit=limit, copy=copy) def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: # short-circuit to return all False array. 
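With the `fillna` rework above, an incompatible fill value is boxed via `_box_pa` and pyarrow's conversion failure is re-raised as a TypeError carrying the f-string message shown in the hunk. A hedged sketch, assuming pyarrow rejects the string-to-int64 conversion with ArrowTypeError as the except clause expects:

    import pandas as pd
    import pyarrow as pa

    arr = pd.array([1, None, 3], dtype=pd.ArrowDtype(pa.int64()))

    try:
        arr.fillna("oops")  # cannot be boxed as an int64 pyarrow scalar
    except TypeError as err:
        print(err)  # Invalid value 'oops' for dtype 'int64[pyarrow]'

    print(arr.fillna(0))  # [1, 0, 3] -- a compatible scalar fills the nulls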
@@ -1109,7 +1205,12 @@ def factorize( data = data.cast(pa.int64()) if pa.types.is_dictionary(data.type): - encoded = data + if null_encoding == "encode": + # dictionary encode does nothing if an already encoded array is given + data = data.cast(data.type.value_type) + encoded = data.dictionary_encode(null_encoding=null_encoding) + else: + encoded = data else: encoded = data.dictionary_encode(null_encoding=null_encoding) if encoded.length() == 0: @@ -1299,7 +1400,7 @@ def _to_datetimearray(self) -> DatetimeArray: np_dtype = np.dtype(f"M8[{pa_type.unit}]") dtype = tz_to_dtype(pa_type.tz, pa_type.unit) np_array = self._pa_array.to_numpy() - np_array = np_array.astype(np_dtype) + np_array = np_array.astype(np_dtype, copy=False) return DatetimeArray._simple_new(np_array, dtype=dtype) def _to_timedeltaarray(self) -> TimedeltaArray: @@ -1310,9 +1411,14 @@ def _to_timedeltaarray(self) -> TimedeltaArray: assert pa.types.is_duration(pa_type) np_dtype = np.dtype(f"m8[{pa_type.unit}]") np_array = self._pa_array.to_numpy() - np_array = np_array.astype(np_dtype) + np_array = np_array.astype(np_dtype, copy=False) return TimedeltaArray._simple_new(np_array, dtype=np_dtype) + def _values_for_json(self) -> np.ndarray: + if is_numeric_dtype(self.dtype): + return np.asarray(self, dtype=object) + return super()._values_for_json() + @doc(ExtensionArray.to_numpy) def to_numpy( self, @@ -1320,6 +1426,7 @@ def to_numpy( copy: bool = False, na_value: object = lib.no_default, ) -> np.ndarray: + original_na_value = na_value dtype, na_value = to_numpy_dtype_inference(self, dtype, na_value, self._hasna) pa_type = self._pa_array.type if not self._hasna or isna(na_value) or pa.types.is_null(pa_type): @@ -1345,7 +1452,13 @@ def to_numpy( if dtype is not None and isna(na_value): na_value = None result = np.full(len(data), fill_value=na_value, dtype=dtype) - elif not data._hasna or (pa.types.is_floating(pa_type) and na_value is np.nan): + elif not data._hasna or ( + pa.types.is_floating(pa_type) + and ( + na_value is np.nan + or (original_na_value is lib.no_default and is_float_dtype(dtype)) + ) + ): result = data._pa_array.to_numpy() if dtype is not None: result = result.astype(dtype, copy=False) @@ -1364,9 +1477,9 @@ def to_numpy( result[~mask] = data[~mask]._pa_array.to_numpy() return result - def map(self, mapper, na_action=None): + def map(self, mapper, na_action: Literal["ignore"] | None = None): if is_numeric_dtype(self.dtype): - return map_array(self.to_numpy(), mapper, na_action=None) + return map_array(self.to_numpy(), mapper, na_action=na_action) else: return super().map(mapper, na_action) @@ -1516,6 +1629,9 @@ def _accumulate( ------ NotImplementedError : subclass does not define accumulations """ + if is_string_dtype(self): + return self._str_accumulate(name=name, skipna=skipna, **kwargs) + pyarrow_name = { "cummax": "cumulative_max", "cummin": "cumulative_min", @@ -1540,13 +1656,68 @@ def _accumulate( else: data_to_accum = data_to_accum.cast(pa.int64()) - result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs) + try: + result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs) + except pa.ArrowNotImplementedError as err: + msg = f"operation '{name}' not supported for dtype '{self.dtype}'" + raise TypeError(msg) from err if convert_to_int: result = result.cast(pa_dtype) return type(self)(result) + def _str_accumulate( + self, name: str, *, skipna: bool = True, **kwargs + ) -> ArrowExtensionArray | ExtensionArray: + """ + Accumulate implementation for strings, see `_accumulate` docstring for 
details. + + pyarrow.compute does not implement these methods for strings. + """ + if name == "cumprod": + msg = f"operation '{name}' not supported for dtype '{self.dtype}'" + raise TypeError(msg) + + # We may need to strip out trailing NA values + tail: pa.array | None = None + na_mask: pa.array | None = None + pa_array = self._pa_array + np_func = { + "cumsum": np.cumsum, + "cummin": np.minimum.accumulate, + "cummax": np.maximum.accumulate, + }[name] + + if self._hasna: + na_mask = pc.is_null(pa_array) + if pc.all(na_mask) == pa.scalar(True): + return type(self)(pa_array) + if skipna: + if name == "cumsum": + pa_array = pc.fill_null(pa_array, "") + else: + # We can retain the running min/max by forward/backward filling. + pa_array = pc.fill_null_forward(pa_array) + pa_array = pc.fill_null_backward(pa_array) + else: + # When not skipping NA values, the result should be null from + # the first NA value onward. + idx = pc.index(na_mask, True).as_py() + tail = pa.nulls(len(pa_array) - idx, type=pa_array.type) + pa_array = pa_array[:idx] + + # error: Cannot call function of unknown type + pa_result = pa.array(np_func(pa_array), type=pa_array.type) # type: ignore[operator] + + if tail is not None: + pa_result = pa.concat_arrays([pa_result, tail]) + elif na_mask is not None: + pa_result = pc.if_else(na_mask, None, pa_result) + + result = type(self)(pa_result) + return result + def _reduce_pyarrow(self, name: str, *, skipna: bool = True, **kwargs) -> pa.Scalar: """ Return a pyarrow scalar result of performing the reduction operation. @@ -1611,6 +1782,37 @@ def pyarrow_meth(data, skip_nulls, **kwargs): denominator = pc.sqrt_checked(pc.count(self._pa_array)) return pc.divide_checked(numerator, denominator) + elif name == "sum" and ( + pa.types.is_string(pa_type) or pa.types.is_large_string(pa_type) + ): + + def pyarrow_meth(data, skip_nulls, min_count=0): # type: ignore[misc] + mask = pc.is_null(data) if data.null_count > 0 else None + if skip_nulls: + if min_count > 0 and check_below_min_count( + (len(data),), + None if mask is None else mask.to_numpy(), + min_count, + ): + return pa.scalar(None, type=data.type) + if data.null_count > 0: + # binary_join returns null if there is any null -> + # have to filter out any nulls + data = data.filter(pc.invert(mask)) + else: + if mask is not None or check_below_min_count( + (len(data),), None, min_count + ): + return pa.scalar(None, type=data.type) + + if pa.types.is_large_string(data.type): + # binary_join only supports string, not large_string + data = data.cast(pa.string()) + data_list = pa.ListArray.from_arrays( + [0, len(data)], data.combine_chunks() + )[0] + return pc.binary_join(data_list, "") + else: pyarrow_name = { "median": "quantile", @@ -1638,7 +1840,7 @@ def pyarrow_meth(data, skip_nulls, **kwargs): except (AttributeError, NotImplementedError, TypeError) as err: msg = ( f"'{type(self).__name__}' with dtype {self.dtype} " - f"does not support reduction '{name}' with pyarrow " + f"does not support operation '{name}' with pyarrow " f"version {pa.__version__}. '{name}' may be supported by " f"upgrading pyarrow." 
) @@ -1646,8 +1848,6 @@ def pyarrow_meth(data, skip_nulls, **kwargs): if name == "median": # GH 52679: Use quantile instead of approximate_median; returns array result = result[0] - if pc.is_null(result).as_py(): - return result if name in ["min", "max", "sum"] and pa.types.is_duration(pa_type): result = result.cast(pa_type) @@ -1735,7 +1935,10 @@ def _explode(self): """ # child class explode method supports only list types; return # default implementation for non list types. - if not pa.types.is_list(self.dtype.pyarrow_dtype): + if not ( + pa.types.is_list(self.dtype.pyarrow_dtype) + or pa.types.is_large_list(self.dtype.pyarrow_dtype) + ): return super()._explode() values = self counts = pa.compute.list_value_length(values._pa_array) @@ -1819,7 +2022,8 @@ def __setitem__(self, key, value) -> None: raise ValueError("Length of indexer and values mismatch") if len(indices) == 0: return - argsort = np.argsort(indices) + # GH#58530 wrong item assignment by repeated key + _, argsort = np.unique(indices, return_index=True) indices = indices[argsort] value = value.take(argsort) mask = np.zeros(len(self), dtype=np.bool_) @@ -1902,11 +2106,11 @@ def _rank( na_option: str = "keep", ascending: bool = True, pct: bool = False, - ): + ) -> Self: """ See Series.rank.__doc__. """ - return type(self)( + return self._convert_rank_result( self._rank_calc( axis=axis, method=method, @@ -2002,17 +2206,73 @@ def _maybe_convert_setitem_value(self, value): try: value = self._box_pa(value, self._pa_array.type) except pa.ArrowTypeError as err: - msg = f"Invalid value '{str(value)}' for dtype {self.dtype}" + msg = f"Invalid value '{value!s}' for dtype '{self.dtype}'" raise TypeError(msg) from err return value + def interpolate( + self, + *, + method: InterpolateOptions, + axis: int, + index, + limit, + limit_direction, + limit_area, + copy: bool, + **kwargs, + ) -> Self: + """ + See NDFrame.interpolate.__doc__. + """ + # NB: we return type(self) even if copy=False + if not self.dtype._is_numeric: + raise TypeError(f"Cannot interpolate with {self.dtype} dtype") + + if ( + not pa_version_under13p0 + and method == "linear" + and limit_area is None + and limit is None + and limit_direction == "forward" + ): + values = self._pa_array.combine_chunks() + na_value = pa.array([None], type=values.type) + y_diff_2 = pc.fill_null_backward(pc.pairwise_diff_checked(values, period=2)) + prev_values = pa.concat_arrays([na_value, values[:-2], na_value]) + interps = pc.add_checked(prev_values, pc.divide_checked(y_diff_2, 2)) + return type(self)(pc.coalesce(self._pa_array, interps)) + + mask = self.isna() + if self.dtype.kind == "f": + data = self._pa_array.to_numpy() + elif self.dtype.kind in "iu": + data = self.to_numpy(dtype="f8", na_value=0.0) + else: + raise NotImplementedError( + f"interpolate is not implemented for dtype={self.dtype}" + ) + + missing.interpolate_2d_inplace( + data, + method=method, + axis=0, + index=index, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + mask=mask, + **kwargs, + ) + return type(self)(self._box_pa_array(pa.array(data, mask=mask))) + @classmethod def _if_else( cls, cond: npt.NDArray[np.bool_] | bool, left: ArrayLike | Scalar, right: ArrayLike | Scalar, - ): + ) -> pa.Array: """ Choose values based on a condition. @@ -2056,7 +2316,7 @@ def _replace_with_mask( values: pa.Array | pa.ChunkedArray, mask: npt.NDArray[np.bool_] | bool, replacements: ArrayLike | Scalar, - ): + ) -> pa.Array | pa.ChunkedArray: """ Replace items selected with a mask. 
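The new string paths in _accumulate/_reduce_pyarrow and the pyarrow-native linear interpolation fast path can be exercised end to end. Again a sketch under the same assumptions (this patch applied, pyarrow >= 13 for the interpolation branch):

    import pyarrow as pa
    import pandas as pd

    s = pd.Series(["b", None, "a", "c"], dtype=pd.ArrowDtype(pa.string()))

    # Strings fall back to _str_accumulate (NumPy accumulate over object
    # values), since pyarrow.compute has no string kernels for these:
    print(s.cummax().tolist())              # ['b', <NA>, 'b', 'c']
    print(s.cumsum(skipna=False).tolist())  # ['b', <NA>, <NA>, <NA>]

    # "sum" drops nulls and concatenates via pc.binary_join:
    print(s.sum())                          # 'bac'

    try:
        s.cumprod()  # has no sensible string meaning
    except TypeError as err:
        print(err)   # operation 'cumprod' not supported for dtype ...

    # Linear interpolation stays in Arrow via pairwise_diff_checked/coalesce:
    f = pd.Series([1.0, None, 3.0], dtype=pd.ArrowDtype(pa.float64()))
    print(f.interpolate().tolist())         # [1.0, 2.0, 3.0]

With skipna=True the running min/max is retained by forward- then backward-filling nulls before accumulating; with skipna=False the result is null from the first NA onward, which is what the tail/na_mask handling above implements.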
@@ -2122,6 +2382,20 @@ def _groupby_op( **kwargs, ): if isinstance(self.dtype, StringDtype): + if how in [ + "prod", + "mean", + "median", + "cumsum", + "cumprod", + "std", + "sem", + "var", + "skew", + ]: + raise TypeError( + f"dtype '{self.dtype}' does not support operation '{how}'" + ) return super()._groupby_op( how=how, has_dropped_na=has_dropped_na, @@ -2151,7 +2425,13 @@ def _groupby_op( ) if isinstance(result, np.ndarray): return result - return type(self)._from_sequence(result, copy=False) + elif isinstance(result, BaseMaskedArray): + pa_result = result.__arrow_array__() + return type(self)(pa_result) + else: + # DatetimeArray, TimedeltaArray + pa_result = pa.array(result, from_pandas=True) + return type(self)(pa_result) def _apply_elementwise(self, func: Callable) -> list[list[Any]]: """Apply a callable to each element while maintaining the chunking structure.""" @@ -2163,127 +2443,30 @@ def _apply_elementwise(self, func: Callable) -> list[list[Any]]: for chunk in self._pa_array.iterchunks() ] - def _str_count(self, pat: str, flags: int = 0): - if flags: - raise NotImplementedError(f"count not implemented with {flags=}") - return type(self)(pc.count_substring_regex(self._pa_array, pat)) - - def _str_contains( - self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True - ): - if flags: - raise NotImplementedError(f"contains not implemented with {flags=}") - - if regex: - pa_contains = pc.match_substring_regex - else: - pa_contains = pc.match_substring - result = pa_contains(self._pa_array, pat, ignore_case=not case) - if not isna(na): + def _convert_bool_result(self, result, na=lib.no_default, method_name=None): + if na is not lib.no_default and not isna(na): # pyright: ignore [reportGeneralTypeIssues] result = result.fill_null(na) return type(self)(result) - def _str_startswith(self, pat: str | tuple[str, ...], na=None): - if isinstance(pat, str): - result = pc.starts_with(self._pa_array, pattern=pat) - else: - if len(pat) == 0: - # For empty tuple, pd.StringDtype() returns null for missing values - # and false for valid values. - result = pc.if_else(pc.is_null(self._pa_array), None, False) - else: - result = pc.starts_with(self._pa_array, pattern=pat[0]) - - for p in pat[1:]: - result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) - if not isna(na): - result = result.fill_null(na) + def _convert_int_result(self, result): return type(self)(result) - def _str_endswith(self, pat: str | tuple[str, ...], na=None): - if isinstance(pat, str): - result = pc.ends_with(self._pa_array, pattern=pat) - else: - if len(pat) == 0: - # For empty tuple, pd.StringDtype() returns null for missing values - # and false for valid values. 
- result = pc.if_else(pc.is_null(self._pa_array), None, False) - else: - result = pc.ends_with(self._pa_array, pattern=pat[0]) - - for p in pat[1:]: - result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) - if not isna(na): - result = result.fill_null(na) + def _convert_rank_result(self, result): return type(self)(result) - def _str_replace( - self, - pat: str | re.Pattern, - repl: str | Callable, - n: int = -1, - case: bool = True, - flags: int = 0, - regex: bool = True, - ): - if isinstance(pat, re.Pattern) or callable(repl) or not case or flags: - raise NotImplementedError( - "replace is not supported with a re.Pattern, callable repl, " - "case=False, or flags!=0" - ) - - func = pc.replace_substring_regex if regex else pc.replace_substring - # https://github.com/apache/arrow/issues/39149 - # GH 56404, unexpected behavior with negative max_replacements with pyarrow. - pa_max_replacements = None if n < 0 else n - result = func( - self._pa_array, - pattern=pat, - replacement=repl, - max_replacements=pa_max_replacements, - ) - return type(self)(result) + def _str_count(self, pat: str, flags: int = 0) -> Self: + if flags: + raise NotImplementedError(f"count not implemented with {flags=}") + return type(self)(pc.count_substring_regex(self._pa_array, pat)) - def _str_repeat(self, repeats: int | Sequence[int]): + def _str_repeat(self, repeats: int | Sequence[int]) -> Self: if not isinstance(repeats, int): raise NotImplementedError( f"repeat is not implemented when repeats is {type(repeats).__name__}" ) - else: - return type(self)(pc.binary_repeat(self._pa_array, repeats)) + return type(self)(pc.binary_repeat(self._pa_array, repeats)) - def _str_match( - self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None - ): - if not pat.startswith("^"): - pat = f"^{pat}" - return self._str_contains(pat, case, flags, na, regex=True) - - def _str_fullmatch( - self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None - ): - if not pat.endswith("$") or pat.endswith("//$"): - pat = f"{pat}$" - return self._str_match(pat, case, flags, na) - - def _str_find(self, sub: str, start: int = 0, end: int | None = None): - if start != 0 and end is not None: - slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) - result = pc.find_substring(slices, sub) - not_found = pc.equal(result, -1) - start_offset = max(0, start) - offset_result = pc.add(result, start_offset) - result = pc.if_else(not_found, result, offset_result) - elif start == 0 and end is None: - slices = self._pa_array - result = pc.find_substring(slices, sub) - else: - raise NotImplementedError( - f"find not implemented with {sub=}, {start=}, {end=}" - ) - return type(self)(result) - - def _str_join(self, sep: str): + def _str_join(self, sep: str) -> Self: if pa.types.is_string(self._pa_array.type) or pa.types.is_large_string( self._pa_array.type ): @@ -2293,100 +2476,22 @@ def _str_join(self, sep: str): result = self._pa_array return type(self)(pc.binary_join(result, sep)) - def _str_partition(self, sep: str, expand: bool): + def _str_partition(self, sep: str, expand: bool) -> Self: predicate = lambda val: val.partition(sep) result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) - def _str_rpartition(self, sep: str, expand: bool): + def _str_rpartition(self, sep: str, expand: bool) -> Self: predicate = lambda val: val.rpartition(sep) result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) - def _str_slice( - self, start: int | None = None, 
stop: int | None = None, step: int | None = None - ): - if start is None: - start = 0 - if step is None: - step = 1 - return type(self)( - pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) - ) - - def _str_isalnum(self): - return type(self)(pc.utf8_is_alnum(self._pa_array)) - - def _str_isalpha(self): - return type(self)(pc.utf8_is_alpha(self._pa_array)) - - def _str_isdecimal(self): - return type(self)(pc.utf8_is_decimal(self._pa_array)) - - def _str_isdigit(self): - return type(self)(pc.utf8_is_digit(self._pa_array)) - - def _str_islower(self): - return type(self)(pc.utf8_is_lower(self._pa_array)) - - def _str_isnumeric(self): - return type(self)(pc.utf8_is_numeric(self._pa_array)) - - def _str_isspace(self): - return type(self)(pc.utf8_is_space(self._pa_array)) - - def _str_istitle(self): - return type(self)(pc.utf8_is_title(self._pa_array)) - - def _str_isupper(self): - return type(self)(pc.utf8_is_upper(self._pa_array)) - - def _str_len(self): - return type(self)(pc.utf8_length(self._pa_array)) - - def _str_lower(self): - return type(self)(pc.utf8_lower(self._pa_array)) - - def _str_upper(self): - return type(self)(pc.utf8_upper(self._pa_array)) - - def _str_strip(self, to_strip=None): - if to_strip is None: - result = pc.utf8_trim_whitespace(self._pa_array) - else: - result = pc.utf8_trim(self._pa_array, characters=to_strip) - return type(self)(result) - - def _str_lstrip(self, to_strip=None): - if to_strip is None: - result = pc.utf8_ltrim_whitespace(self._pa_array) - else: - result = pc.utf8_ltrim(self._pa_array, characters=to_strip) - return type(self)(result) - - def _str_rstrip(self, to_strip=None): - if to_strip is None: - result = pc.utf8_rtrim_whitespace(self._pa_array) - else: - result = pc.utf8_rtrim(self._pa_array, characters=to_strip) - return type(self)(result) - - def _str_removeprefix(self, prefix: str): - if not pa_version_under13p0: - starts_with = pc.starts_with(self._pa_array, pattern=prefix) - removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) - result = pc.if_else(starts_with, removed, self._pa_array) - return type(self)(result) - predicate = lambda val: val.removeprefix(prefix) - result = self._apply_elementwise(predicate) - return type(self)(pa.chunked_array(result)) - - def _str_casefold(self): + def _str_casefold(self) -> Self: predicate = lambda val: val.casefold() result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) - def _str_encode(self, encoding: str, errors: str = "strict"): + def _str_encode(self, encoding: str, errors: str = "strict") -> Self: predicate = lambda val: val.encode(encoding, errors) result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) @@ -2406,13 +2511,15 @@ def _str_extract(self, pat: str, flags: int = 0, expand: bool = True): else: return type(self)(pc.struct_field(result, [0])) - def _str_findall(self, pat: str, flags: int = 0): + def _str_findall(self, pat: str, flags: int = 0) -> Self: regex = re.compile(pat, flags=flags) predicate = lambda val: regex.findall(val) result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) - def _str_get_dummies(self, sep: str = "|"): + def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): + if dtype is None: + dtype = np.bool_ split = pc.split_pattern(self._pa_array, sep) flattened_values = pc.list_flatten(split) uniques = flattened_values.unique() @@ -2422,28 +2529,34 @@ def _str_get_dummies(self, sep: str = "|"): n_cols = len(uniques) indices = 
pc.index_in(flattened_values, uniques_sorted).to_numpy() indices = indices + np.arange(n_rows).repeat(lengths) * n_cols - dummies = np.zeros(n_rows * n_cols, dtype=np.bool_) + _dtype = pandas_dtype(dtype) + dummies_dtype: NpDtype + if isinstance(_dtype, np.dtype): + dummies_dtype = _dtype + else: + dummies_dtype = np.bool_ + dummies = np.zeros(n_rows * n_cols, dtype=dummies_dtype) dummies[indices] = True - dummies = dummies.reshape((n_rows, n_cols)) + dummies = dummies.reshape((n_rows, n_cols)) # type: ignore[assignment] result = type(self)(pa.array(list(dummies))) return result, uniques_sorted.to_pylist() - def _str_index(self, sub: str, start: int = 0, end: int | None = None): + def _str_index(self, sub: str, start: int = 0, end: int | None = None) -> Self: predicate = lambda val: val.index(sub, start, end) result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) - def _str_rindex(self, sub: str, start: int = 0, end: int | None = None): + def _str_rindex(self, sub: str, start: int = 0, end: int | None = None) -> Self: predicate = lambda val: val.rindex(sub, start, end) result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) - def _str_normalize(self, form: str): + def _str_normalize(self, form: Literal["NFC", "NFD", "NFKC", "NFKD"]) -> Self: predicate = lambda val: unicodedata.normalize(form, val) result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) - def _str_rfind(self, sub: str, start: int = 0, end=None): + def _str_rfind(self, sub: str, start: int = 0, end=None) -> Self: predicate = lambda val: val.rfind(sub, start, end) result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) @@ -2454,7 +2567,7 @@ def _str_split( n: int | None = -1, expand: bool = False, regex: bool | None = None, - ): + ) -> Self: if n in {-1, 0}: n = None if pat is None: @@ -2465,24 +2578,23 @@ def _str_split( split_func = functools.partial(pc.split_pattern, pattern=pat) return type(self)(split_func(self._pa_array, max_splits=n)) - def _str_rsplit(self, pat: str | None = None, n: int | None = -1): + def _str_rsplit(self, pat: str | None = None, n: int | None = -1) -> Self: if n in {-1, 0}: n = None if pat is None: return type(self)( pc.utf8_split_whitespace(self._pa_array, max_splits=n, reverse=True) ) - else: - return type(self)( - pc.split_pattern(self._pa_array, pat, max_splits=n, reverse=True) - ) + return type(self)( + pc.split_pattern(self._pa_array, pat, max_splits=n, reverse=True) + ) - def _str_translate(self, table: dict[int, str]): + def _str_translate(self, table: dict[int, str]) -> Self: predicate = lambda val: val.translate(table) result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) - def _str_wrap(self, width: int, **kwargs): + def _str_wrap(self, width: int, **kwargs) -> Self: kwargs["width"] = width tw = textwrap.TextWrapper(**kwargs) predicate = lambda val: "\n".join(tw.wrap(val)) @@ -2490,43 +2602,131 @@ def _str_wrap(self, width: int, **kwargs): return type(self)(pa.chunked_array(result)) @property - def _dt_year(self): + def _dt_days(self) -> Self: + return type(self)( + pa.array( + self._to_timedeltaarray().components.days, + from_pandas=True, + type=pa.int32(), + ) + ) + + @property + def _dt_hours(self) -> Self: + return type(self)( + pa.array( + self._to_timedeltaarray().components.hours, + from_pandas=True, + type=pa.int32(), + ) + ) + + @property + def _dt_minutes(self) -> Self: + return type(self)( + pa.array( + 
self._to_timedeltaarray().components.minutes, + from_pandas=True, + type=pa.int32(), + ) + ) + + @property + def _dt_seconds(self) -> Self: + return type(self)( + pa.array( + self._to_timedeltaarray().components.seconds, + from_pandas=True, + type=pa.int32(), + ) + ) + + @property + def _dt_milliseconds(self) -> Self: + return type(self)( + pa.array( + self._to_timedeltaarray().components.milliseconds, + from_pandas=True, + type=pa.int32(), + ) + ) + + @property + def _dt_microseconds(self) -> Self: + return type(self)( + pa.array( + self._to_timedeltaarray().components.microseconds, + from_pandas=True, + type=pa.int32(), + ) + ) + + @property + def _dt_nanoseconds(self) -> Self: + return type(self)( + pa.array( + self._to_timedeltaarray().components.nanoseconds, + from_pandas=True, + type=pa.int32(), + ) + ) + + def _dt_to_pytimedelta(self) -> np.ndarray: + data = self._pa_array.to_pylist() + if self._dtype.pyarrow_dtype.unit == "ns": + data = [None if ts is None else ts.to_pytimedelta() for ts in data] + return np.array(data, dtype=object) + + def _dt_total_seconds(self) -> Self: + return type(self)( + pa.array(self._to_timedeltaarray().total_seconds(), from_pandas=True) + ) + + def _dt_as_unit(self, unit: str) -> Self: + if pa.types.is_date(self.dtype.pyarrow_dtype): + raise NotImplementedError("as_unit not implemented for date types") + pd_array = self._maybe_convert_datelike_array() + # Don't just cast _pa_array in order to follow pandas unit conversion rules + return type(self)(pa.array(pd_array.as_unit(unit), from_pandas=True)) + + @property + def _dt_year(self) -> Self: return type(self)(pc.year(self._pa_array)) @property - def _dt_day(self): + def _dt_day(self) -> Self: return type(self)(pc.day(self._pa_array)) @property - def _dt_day_of_week(self): + def _dt_day_of_week(self) -> Self: return type(self)(pc.day_of_week(self._pa_array)) _dt_dayofweek = _dt_day_of_week _dt_weekday = _dt_day_of_week @property - def _dt_day_of_year(self): + def _dt_day_of_year(self) -> Self: return type(self)(pc.day_of_year(self._pa_array)) _dt_dayofyear = _dt_day_of_year @property - def _dt_hour(self): + def _dt_hour(self) -> Self: return type(self)(pc.hour(self._pa_array)) - def _dt_isocalendar(self): + def _dt_isocalendar(self) -> Self: return type(self)(pc.iso_calendar(self._pa_array)) @property - def _dt_is_leap_year(self): + def _dt_is_leap_year(self) -> Self: return type(self)(pc.is_leap_year(self._pa_array)) @property - def _dt_is_month_start(self): + def _dt_is_month_start(self) -> Self: return type(self)(pc.equal(pc.day(self._pa_array), 1)) @property - def _dt_is_month_end(self): + def _dt_is_month_end(self) -> Self: result = pc.equal( pc.days_between( pc.floor_temporal(self._pa_array, unit="day"), @@ -2537,7 +2737,7 @@ def _dt_is_month_end(self): return type(self)(result) @property - def _dt_is_year_start(self): + def _dt_is_year_start(self) -> Self: return type(self)( pc.and_( pc.equal(pc.month(self._pa_array), 1), @@ -2546,7 +2746,7 @@ def _dt_is_year_start(self): ) @property - def _dt_is_year_end(self): + def _dt_is_year_end(self) -> Self: return type(self)( pc.and_( pc.equal(pc.month(self._pa_array), 12), @@ -2555,7 +2755,7 @@ def _dt_is_year_end(self): ) @property - def _dt_is_quarter_start(self): + def _dt_is_quarter_start(self) -> Self: result = pc.equal( pc.floor_temporal(self._pa_array, unit="quarter"), pc.floor_temporal(self._pa_array, unit="day"), @@ -2563,7 +2763,7 @@ def _dt_is_quarter_start(self): return type(self)(result) @property - def _dt_is_quarter_end(self): + def 
_dt_is_quarter_end(self) -> Self: result = pc.equal( pc.days_between( pc.floor_temporal(self._pa_array, unit="day"), @@ -2574,7 +2774,7 @@ def _dt_is_quarter_end(self): return type(self)(result) @property - def _dt_days_in_month(self): + def _dt_days_in_month(self) -> Self: result = pc.days_between( pc.floor_temporal(self._pa_array, unit="month"), pc.ceil_temporal(self._pa_array, unit="month"), @@ -2584,35 +2784,38 @@ def _dt_days_in_month(self): _dt_daysinmonth = _dt_days_in_month @property - def _dt_microsecond(self): - return type(self)(pc.microsecond(self._pa_array)) + def _dt_microsecond(self) -> Self: + # GH 59154 + us = pc.microsecond(self._pa_array) + ms_to_us = pc.multiply(pc.millisecond(self._pa_array), 1000) + return type(self)(pc.add(us, ms_to_us)) @property - def _dt_minute(self): + def _dt_minute(self) -> Self: return type(self)(pc.minute(self._pa_array)) @property - def _dt_month(self): + def _dt_month(self) -> Self: return type(self)(pc.month(self._pa_array)) @property - def _dt_nanosecond(self): + def _dt_nanosecond(self) -> Self: return type(self)(pc.nanosecond(self._pa_array)) @property - def _dt_quarter(self): + def _dt_quarter(self) -> Self: return type(self)(pc.quarter(self._pa_array)) @property - def _dt_second(self): + def _dt_second(self) -> Self: return type(self)(pc.second(self._pa_array)) @property - def _dt_date(self): + def _dt_date(self) -> Self: return type(self)(self._pa_array.cast(pa.date32())) @property - def _dt_time(self): + def _dt_time(self) -> Self: unit = ( self.dtype.pyarrow_dtype.unit if self.dtype.pyarrow_dtype.unit in {"us", "ns"} @@ -2628,10 +2831,10 @@ def _dt_tz(self): def _dt_unit(self): return self.dtype.pyarrow_dtype.unit - def _dt_normalize(self): + def _dt_normalize(self) -> Self: return type(self)(pc.floor_temporal(self._pa_array, 1, "day")) - def _dt_strftime(self, format: str): + def _dt_strftime(self, format: str) -> Self: return type(self)(pc.strftime(self._pa_array, format=format)) def _round_temporally( @@ -2640,7 +2843,7 @@ def _round_temporally( freq, ambiguous: TimeAmbiguous = "raise", nonexistent: TimeNonexistent = "raise", - ): + ) -> Self: if ambiguous != "raise": raise NotImplementedError("ambiguous is not supported.") if nonexistent != "raise": @@ -2676,7 +2879,7 @@ def _dt_ceil( freq, ambiguous: TimeAmbiguous = "raise", nonexistent: TimeNonexistent = "raise", - ): + ) -> Self: return self._round_temporally("ceil", freq, ambiguous, nonexistent) def _dt_floor( @@ -2684,7 +2887,7 @@ def _dt_floor( freq, ambiguous: TimeAmbiguous = "raise", nonexistent: TimeNonexistent = "raise", - ): + ) -> Self: return self._round_temporally("floor", freq, ambiguous, nonexistent) def _dt_round( @@ -2692,20 +2895,22 @@ def _dt_round( freq, ambiguous: TimeAmbiguous = "raise", nonexistent: TimeNonexistent = "raise", - ): + ) -> Self: return self._round_temporally("round", freq, ambiguous, nonexistent) - def _dt_day_name(self, locale: str | None = None): + def _dt_day_name(self, locale: str | None = None) -> Self: if locale is None: locale = "C" return type(self)(pc.strftime(self._pa_array, format="%A", locale=locale)) - def _dt_month_name(self, locale: str | None = None): + def _dt_month_name(self, locale: str | None = None) -> Self: if locale is None: locale = "C" return type(self)(pc.strftime(self._pa_array, format="%B", locale=locale)) - def _dt_to_pydatetime(self): + def _dt_to_pydatetime(self) -> Series: + from pandas import Series + if pa.types.is_date(self.dtype.pyarrow_dtype): raise ValueError( f"to_pydatetime cannot be called with 
{self.dtype.pyarrow_dtype} type. " @@ -2714,14 +2919,14 @@ def _dt_to_pydatetime(self): data = self._pa_array.to_pylist() if self._dtype.pyarrow_dtype.unit == "ns": data = [None if ts is None else ts.to_pydatetime(warn=False) for ts in data] - return np.array(data, dtype=object) + return Series(data, dtype=object) def _dt_tz_localize( self, tz, ambiguous: TimeAmbiguous = "raise", nonexistent: TimeNonexistent = "raise", - ): + ) -> Self: if ambiguous != "raise": raise NotImplementedError(f"{ambiguous=} is not supported") nonexistent_pa = { @@ -2729,7 +2934,8 @@ def _dt_tz_localize( "shift_backward": "earliest", "shift_forward": "latest", }.get( - nonexistent, None # type: ignore[arg-type] + nonexistent, # type: ignore[arg-type] + None, ) if nonexistent_pa is None: raise NotImplementedError(f"{nonexistent=} is not supported") @@ -2741,7 +2947,7 @@ def _dt_tz_localize( ) return type(self)(result) - def _dt_tz_convert(self, tz): + def _dt_tz_convert(self, tz) -> Self: if self.dtype.pyarrow_dtype.tz is None: raise TypeError( "Cannot convert tz-naive timestamps, use tz_localize to localize" @@ -2761,7 +2967,7 @@ def transpose_homogeneous_pyarrow( """ arrays = list(arrays) nrows, ncols = len(arrays[0]), len(arrays) - indices = np.arange(nrows * ncols).reshape(ncols, nrows).T.flatten() + indices = np.arange(nrows * ncols).reshape(ncols, nrows).T.reshape(-1) arr = pa.chunked_array([chunk for arr in arrays for chunk in arr._pa_array.chunks]) arr = arr.take(indices) return [ArrowExtensionArray(arr.slice(i * ncols, ncols)) for i in range(nrows)] diff --git a/pandas/core/arrays/arrow/extension_types.py b/pandas/core/arrays/arrow/extension_types.py index 72bfd6f2212f8..2fa5f7a882cc7 100644 --- a/pandas/core/arrays/arrow/extension_types.py +++ b/pandas/core/arrays/arrow/extension_types.py @@ -135,7 +135,7 @@ def to_pandas_dtype(self) -> IntervalDtype: """ -def patch_pyarrow(): +def patch_pyarrow() -> None: # starting from pyarrow 14.0.1, it has its own mechanism if not pa_version_under14p1: return @@ -145,7 +145,7 @@ def patch_pyarrow(): return class ForbiddenExtensionType(pyarrow.ExtensionType): - def __arrow_ext_serialize__(self): + def __arrow_ext_serialize__(self) -> bytes: return b"" @classmethod diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 59c6d911cfaef..d0048e122051a 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -6,13 +6,13 @@ This is an experimental API and subject to breaking changes without warning. """ + from __future__ import annotations import operator from typing import ( TYPE_CHECKING, Any, - Callable, ClassVar, Literal, cast, @@ -37,7 +37,6 @@ from pandas.util._exceptions import find_stack_level from pandas.util._validators import ( validate_bool_kwarg, - validate_fillna_kwargs, validate_insert_loc, ) @@ -70,6 +69,7 @@ unique, ) from pandas.core.array_algos.quantile import quantile_with_mask +from pandas.core.missing import _fill_limit_area_1d from pandas.core.sorting import ( nargminmax, nargsort, @@ -77,10 +77,12 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Iterator, Sequence, ) + from pandas._libs.missing import NAType from pandas._typing import ( ArrayLike, AstypeArg, @@ -156,6 +158,12 @@ class ExtensionArray: _values_for_argsort _values_for_factorize + See Also + -------- + api.extensions.ExtensionDtype : A custom data type, to be paired with an + ExtensionArray. + api.extensions.ExtensionArray.dtype : An instance of ExtensionDtype. 
+ Notes ----- The interface includes the following abstract methods that must be @@ -266,7 +274,9 @@ class ExtensionArray: # ------------------------------------------------------------------------ @classmethod - def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): + def _from_sequence( + cls, scalars, *, dtype: Dtype | None = None, copy: bool = False + ) -> Self: """ Construct a new ExtensionArray from a sequence of scalars. @@ -285,6 +295,13 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal ------- ExtensionArray + See Also + -------- + api.extensions.ExtensionArray._from_sequence_of_strings : Construct a new + ExtensionArray from a sequence of strings. + api.extensions.ExtensionArray._hash_pandas_object : Hook for + hash_pandas_object. + Examples -------- >>> pd.arrays.IntegerArray._from_sequence([4, 5]) @@ -328,8 +345,8 @@ def _from_scalars(cls, scalars, *, dtype: DtypeObj) -> Self: @classmethod def _from_sequence_of_strings( - cls, strings, *, dtype: Dtype | None = None, copy: bool = False - ): + cls, strings, *, dtype: ExtensionDtype, copy: bool = False + ) -> Self: """ Construct a new ExtensionArray from a sequence of strings. @@ -338,7 +355,7 @@ def _from_sequence_of_strings( strings : Sequence Each element will be an instance of the scalar type for this array, ``cls.dtype.type``. - dtype : dtype, optional + dtype : ExtensionDtype Construct for this particular dtype. This should be a Dtype compatible with the ExtensionArray. copy : bool, default False @@ -348,9 +365,21 @@ def _from_sequence_of_strings( ------- ExtensionArray + See Also + -------- + api.extensions.ExtensionArray._from_sequence : Construct a new ExtensionArray + from a sequence of scalars. + api.extensions.ExtensionArray._from_factorized : Reconstruct an ExtensionArray + after factorization. + api.extensions.ExtensionArray._from_scalars : Strict analogue to _from_sequence, + allowing only sequences of scalars that should be specifically inferred to + the given dtype. + Examples -------- - >>> pd.arrays.IntegerArray._from_sequence_of_strings(["1", "2", "3"]) + >>> pd.arrays.IntegerArray._from_sequence_of_strings( + ... ["1", "2", "3"], dtype=pd.Int64Dtype() + ... ) [1, 2, 3] Length: 3, dtype: Int64 @@ -376,8 +405,9 @@ def _from_factorized(cls, values, original): Examples -------- - >>> interv_arr = pd.arrays.IntervalArray([pd.Interval(0, 1), - ... pd.Interval(1, 5), pd.Interval(1, 5)]) + >>> interv_arr = pd.arrays.IntervalArray( + ... [pd.Interval(0, 1), pd.Interval(1, 5), pd.Interval(1, 5)] + ... ) >>> codes, uniques = pd.factorize(interv_arr) >>> pd.arrays.IntervalArray._from_factorized(uniques, interv_arr) @@ -390,12 +420,10 @@ def _from_factorized(cls, values, original): # Must be a Sequence # ------------------------------------------------------------------------ @overload - def __getitem__(self, item: ScalarIndexer) -> Any: - ... + def __getitem__(self, item: ScalarIndexer) -> Any: ... @overload - def __getitem__(self, item: SequenceIndexer) -> Self: - ... + def __getitem__(self, item: SequenceIndexer) -> Self: ... 
def __getitem__(self, item: PositionalIndexer) -> Self | Any: """ @@ -568,7 +596,7 @@ def to_numpy( if copy or na_value is not lib.no_default: result = result.copy() if na_value is not lib.no_default: - result[self.isna()] = na_value + result[self.isna()] = na_value # type: ignore[index] return result # ------------------------------------------------------------------------ @@ -580,6 +608,14 @@ def dtype(self) -> ExtensionDtype: """ An instance of ExtensionDtype. + See Also + -------- + api.extensions.ExtensionDtype : Base class for extension dtypes. + api.extensions.ExtensionArray : Base class for extension array types. + api.extensions.ExtensionArray.dtype : The dtype of an ExtensionArray. + Series.dtype : The dtype of a Series. + DataFrame.dtype : The dtype of a DataFrame. + Examples -------- >>> pd.array([1, 2, 3]).dtype @@ -592,6 +628,13 @@ def shape(self) -> Shape: """ Return a tuple of the array dimensions. + See Also + -------- + numpy.ndarray.shape : Similar attribute which returns the shape of an array. + DataFrame.shape : Return a tuple representing the dimensionality of the + DataFrame. + Series.shape : Return a tuple representing the dimensionality of the Series. + Examples -------- >>> arr = pd.array([1, 2, 3]) @@ -614,6 +657,11 @@ def ndim(self) -> int: """ Extension Arrays are only allowed to be 1-dimensional. + See Also + -------- + ExtensionArray.shape: Return a tuple of the array dimensions. + ExtensionArray.size: The number of elements in the array. + Examples -------- >>> arr = pd.array([1, 2, 3]) @@ -627,6 +675,11 @@ def nbytes(self) -> int: """ The number of bytes needed to store this object in memory. + See Also + -------- + ExtensionArray.shape: Return a tuple of the array dimensions. + ExtensionArray.size: The number of elements in the array. + Examples -------- >>> pd.array([1, 2, 3]).nbytes @@ -641,16 +694,13 @@ def nbytes(self) -> int: # ------------------------------------------------------------------------ @overload - def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray: - ... + def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray: ... @overload - def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray: - ... + def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray: ... @overload - def astype(self, dtype: AstypeArg, copy: bool = ...) -> ArrayLike: - ... + def astype(self, dtype: AstypeArg, copy: bool = ...) -> ArrayLike: ... def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: """ @@ -671,6 +721,16 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: An ``ExtensionArray`` if ``dtype`` is ``ExtensionDtype``, otherwise a Numpy ndarray with ``dtype`` for its dtype. + See Also + -------- + Series.astype : Cast a Series to a different dtype. + DataFrame.astype : Cast a DataFrame to a different dtype. + api.extensions.ExtensionArray : Base class for ExtensionArray objects. + core.arrays.DatetimeArray._from_sequence : Create a DatetimeArray from a + sequence. + core.arrays.TimedeltaArray._from_sequence : Create a TimedeltaArray from + a sequence. 
+ Examples -------- >>> arr = pd.array([1, 2, 3]) @@ -681,7 +741,7 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: Casting to another ``ExtensionDtype`` returns an ``ExtensionArray``: - >>> arr1 = arr.astype('Float64') + >>> arr1 = arr.astype("Float64") >>> arr1 [1.0, 2.0, 3.0] @@ -691,7 +751,7 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: Otherwise, we will get a Numpy ndarray: - >>> arr2 = arr.astype('float64') + >>> arr2 = arr.astype("float64") >>> arr2 array([1., 2., 3.]) >>> arr2.dtype @@ -718,7 +778,10 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: return TimedeltaArray._from_sequence(self, dtype=dtype, copy=copy) - return np.array(self, dtype=dtype, copy=copy) + if not copy: + return np.asarray(self, dtype=dtype) + else: + return np.array(self, dtype=dtype, copy=copy) def isna(self) -> np.ndarray | ExtensionArraySupportsAnyAll: """ @@ -732,12 +795,18 @@ def isna(self) -> np.ndarray | ExtensionArraySupportsAnyAll: an ndarray would be expensive, an ExtensionArray may be returned. + See Also + -------- + ExtensionArray.dropna: Return ExtensionArray without NA values. + ExtensionArray.fillna: Fill NA/NaN values using the specified method. + Notes ----- If returning an ExtensionArray, then * ``na_values._is_boolean`` should be True - * `na_values` should implement :func:`ExtensionArray._reduce` + * ``na_values`` should implement :func:`ExtensionArray._reduce` + * ``na_values`` should implement :func:`ExtensionArray._accumulate` * ``na_values.any`` and ``na_values.all`` should be implemented Examples @@ -814,7 +883,7 @@ def argsort( na_position : {'first', 'last'}, default 'last' If ``'first'``, put ``NaN`` values at the beginning. If ``'last'``, put ``NaN`` values at the end. - *args, **kwargs: + **kwargs Passed through to :func:`numpy.argsort`. Returns @@ -872,7 +941,7 @@ def argmin(self, skipna: bool = True) -> int: -------- >>> arr = pd.array([3, 1, 2, 5, 4]) >>> arr.argmin() - 1 + np.int64(1) """ # Implementer note: You have two places to override the behavior of # argmin. @@ -880,7 +949,7 @@ def argmin(self, skipna: bool = True) -> int: # 2. argmin itself : total control over sorting. validate_bool_kwarg(skipna, "skipna") if not skipna and self._hasna: - raise NotImplementedError + raise ValueError("Encountered an NA value with skipna=False") return nargminmax(self, "argmin") def argmax(self, skipna: bool = True) -> int: @@ -906,7 +975,7 @@ def argmax(self, skipna: bool = True) -> int: -------- >>> arr = pd.array([3, 1, 2, 5, 4]) >>> arr.argmax() - 3 + np.int64(3) """ # Implementer note: You have two places to override the behavior of # argmax. @@ -914,7 +983,7 @@ def argmax(self, skipna: bool = True) -> int: # 2. argmax itself : total control over sorting. validate_bool_kwarg(skipna, "skipna") if not skipna and self._hasna: - raise NotImplementedError + raise ValueError("Encountered an NA value with skipna=False") return nargminmax(self, "argmax") def interpolate( @@ -930,23 +999,97 @@ def interpolate( **kwargs, ) -> Self: """ - See DataFrame.interpolate.__doc__. + Fill NaN values using an interpolation method. + + Parameters + ---------- + method : str, default 'linear' + Interpolation technique to use. One of: + * 'linear': Ignore the index and treat the values as equally spaced. + This is the only method supported on MultiIndexes. + * 'time': Works on daily and higher resolution data to interpolate + given length of interval. + * 'index', 'values': use the actual numerical values of the index. 
+ * 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'barycentric', + 'polynomial': Passed to scipy.interpolate.interp1d, whereas 'spline' + is passed to scipy.interpolate.UnivariateSpline. These methods use + the numerical values of the index. + Both 'polynomial' and 'spline' require that you also specify an + order (int), e.g. arr.interpolate(method='polynomial', order=5). + * 'krogh', 'piecewise_polynomial', 'spline', 'pchip', 'akima', + 'cubicspline': Wrappers around the SciPy interpolation methods + of similar names. See Notes. + * 'from_derivatives': Refers to scipy.interpolate.BPoly.from_derivatives. + axis : int + Axis to interpolate along. For 1-dimensional data, use 0. + index : Index + Index to use for interpolation. + limit : int or None + Maximum number of consecutive NaNs to fill. Must be greater than 0. + limit_direction : {'forward', 'backward', 'both'} + Consecutive NaNs will be filled in this direction. + limit_area : {'inside', 'outside'} or None + If limit is specified, consecutive NaNs will be filled with this + restriction. + * None: No fill restriction. + * 'inside': Only fill NaNs surrounded by valid values (interpolate). + * 'outside': Only fill NaNs outside valid values (extrapolate). + copy : bool + If True, a copy of the object is returned with interpolated values. + **kwargs : optional + Keyword arguments to pass on to the interpolating function. + + Returns + ------- + ExtensionArray + An ExtensionArray with interpolated values. + + See Also + -------- + Series.interpolate : Interpolate values in a Series. + DataFrame.interpolate : Interpolate values in a DataFrame. + + Notes + ----- + - All parameters must be specified as keyword arguments. + - The 'krogh', 'piecewise_polynomial', 'spline', 'pchip' and 'akima' + methods are wrappers around the respective SciPy implementations of + similar names. These use the actual numerical values of the index. Examples -------- + Interpolating values in a NumPy array: + >>> arr = pd.arrays.NumpyExtensionArray(np.array([0, 1, np.nan, 3])) - >>> arr.interpolate(method="linear", - ... limit=3, - ... limit_direction="forward", - ... index=pd.Index([1, 2, 3, 4]), - ... fill_value=1, - ... copy=False, - ... axis=0, - ... limit_area="inside" - ... ) + >>> arr.interpolate( + ... method="linear", + ... limit=3, + ... limit_direction="forward", + ... index=pd.Index(range(len(arr))), + ... fill_value=1, + ... copy=False, + ... axis=0, + ... limit_area="inside", + ... ) [0.0, 1.0, 2.0, 3.0] Length: 4, dtype: float64 + + Interpolating values in a FloatingArray: + + >>> arr = pd.array([1.0, pd.NA, 3.0, 4.0, pd.NA, 6.0], dtype="Float64") + >>> arr.interpolate( + ... method="linear", + ... axis=0, + ... index=pd.Index(range(len(arr))), + ... limit=None, + ... limit_direction="both", + ... limit_area=None, + ... copy=True, + ... ) + + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0] + Length: 6, dtype: Float64 """ # NB: we return type(self) even if copy=False raise NotImplementedError( @@ -954,7 +1097,12 @@ def interpolate( ) def _pad_or_backfill( - self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, ) -> Self: """ Pad or backfill values, used by Series/DataFrame ffill and bfill. @@ -975,6 +1123,12 @@ def _pad_or_backfill( maximum number of entries along the entire axis where NaNs will be filled. 
+ limit_area : {'inside', 'outside'} or None, default None + Specifies which area to limit filling. + - 'inside': Limit the filling to the area within the gaps. + - 'outside': Limit the filling to the area outside the gaps. + If `None`, no limitation is applied. + copy : bool, default True Whether to make a copy of the data before filling. If False, then the original should be modified and no new memory should be allocated. @@ -986,6 +1140,16 @@ def _pad_or_backfill( Returns ------- Same type as self + The filled array with the same type as the original. + + See Also + -------- + Series.ffill : Forward fill missing values. + Series.bfill : Backward fill missing values. + DataFrame.ffill : Forward fill missing values in DataFrame. + DataFrame.bfill : Backward fill missing values in DataFrame. + api.types.isna : Check for missing values. + api.types.isnull : Check for missing values. Examples -------- @@ -995,25 +1159,6 @@ def _pad_or_backfill( [, 2, 2, 3, , ] Length: 6, dtype: Int64 """ - - # If a 3rd-party EA has implemented this functionality in fillna, - # we warn that they need to implement _pad_or_backfill instead. - if ( - type(self).fillna is not ExtensionArray.fillna - and type(self)._pad_or_backfill is ExtensionArray._pad_or_backfill - ): - # Check for _pad_or_backfill here allows us to call - # super()._pad_or_backfill without getting this warning - warnings.warn( - "ExtensionArray.fillna 'method' keyword is deprecated. " - "In a future version. arr._pad_or_backfill will be called " - "instead. 3rd-party ExtensionArray authors need to implement " - "_pad_or_backfill.", - DeprecationWarning, - stacklevel=find_stack_level(), - ) - return self.fillna(method=method, limit=limit) - mask = self.isna() if mask.any(): @@ -1021,6 +1166,8 @@ def _pad_or_backfill( meth = missing.clean_fill_method(method) npmask = np.asarray(mask) + if limit_area is not None and not npmask.all(): + _fill_limit_area_1d(npmask, limit_area) if meth == "pad": indexer = libalgos.get_fill_indexer(npmask, limit=limit) return self.take(indexer, allow_fill=True) @@ -1037,8 +1184,7 @@ def _pad_or_backfill( def fillna( self, - value: object | ArrayLike | None = None, - method: FillnaOptions | None = None, + value: object | ArrayLike, limit: int | None = None, copy: bool = True, ) -> Self: @@ -1051,37 +1197,26 @@ def fillna( If a scalar value is passed it is used to fill all missing values. Alternatively, an array-like "value" can be given. It's expected that the array-like have the same length as 'self'. - method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None - Method to use for filling holes in reindexed Series: - - * pad / ffill: propagate last valid observation forward to next valid. - * backfill / bfill: use NEXT valid observation to fill gap. - - .. deprecated:: 2.1.0 - limit : int, default None - If method is specified, this is the maximum number of consecutive - NaN values to forward/backward fill. In other words, if there is - a gap with more than this number of consecutive NaNs, it will only - be partially filled. If method is not specified, this is the - maximum number of entries along the entire axis where NaNs will be - filled. - - .. deprecated:: 2.1.0 - + The maximum number of entries where NA values will be filled. copy : bool, default True Whether to make a copy of the data before filling. If False, then the original should be modified and no new memory should be allocated. 
For ExtensionArray subclasses that cannot do this, it is at the author's discretion whether to ignore "copy=False" or to raise. - The base class implementation ignores the keyword in pad/backfill - cases. Returns ------- ExtensionArray With NA/NaN filled. + See Also + -------- + api.extensions.ExtensionArray.dropna : Return ExtensionArray without + NA values. + api.extensions.ExtensionArray.isna : A 1-D array indicating if + each value is missing. + Examples -------- >>> arr = pd.array([np.nan, np.nan, 2, 3, np.nan, np.nan]) @@ -1090,42 +1225,31 @@ def fillna( [0, 0, 2, 3, 0, 0] Length: 6, dtype: Int64 """ - if method is not None: - warnings.warn( - f"The 'method' keyword in {type(self).__name__}.fillna is " - "deprecated and will be removed in a future version.", - FutureWarning, - stacklevel=find_stack_level(), - ) - - value, method = validate_fillna_kwargs(value, method) - mask = self.isna() + if limit is not None and limit < len(self): + # isna can return an ExtensionArray, we're assuming that comparisons + # are implemented. + # mypy doesn't like that mask can be an EA which need not have `cumsum` + modify = mask.cumsum() > limit # type: ignore[union-attr] + if modify.any(): + # Only copy mask if necessary + mask = mask.copy() + mask[modify] = False # error: Argument 2 to "check_value_size" has incompatible type # "ExtensionArray"; expected "ndarray" value = missing.check_value_size( - value, mask, len(self) # type: ignore[arg-type] + value, + mask, # type: ignore[arg-type] + len(self), ) if mask.any(): - if method is not None: - meth = missing.clean_fill_method(method) - - npmask = np.asarray(mask) - if meth == "pad": - indexer = libalgos.get_fill_indexer(npmask, limit=limit) - return self.take(indexer, allow_fill=True) - else: - # i.e. meth == "backfill" - indexer = libalgos.get_fill_indexer(npmask[::-1], limit=limit)[::-1] - return self[::-1].take(indexer, allow_fill=True) + # fill with value + if not copy: + new_values = self[:] else: - # fill with value - if not copy: - new_values = self[:] - else: - new_values = self.copy() - new_values[mask] = value + new_values = self.copy() + new_values[mask] = value else: if not copy: new_values = self[:] @@ -1139,6 +1263,16 @@ def dropna(self) -> Self: Returns ------- + Self + An ExtensionArray of the same type as the original but with all + NA values removed. + + See Also + -------- + Series.dropna : Remove missing values from a Series. + DataFrame.dropna : Remove missing values from a DataFrame. + api.extensions.ExtensionArray.isna : Check for missing values in + an ExtensionArray. Examples -------- @@ -1166,6 +1300,15 @@ def duplicated( Returns ------- ndarray[bool] + With true in indices where elements are duplicated and false otherwise. + + See Also + -------- + DataFrame.duplicated : Return boolean Series denoting + duplicate rows. + Series.duplicated : Indicate duplicate Series values. + api.extensions.ExtensionArray.unique : Compute the ExtensionArray + of unique values. Examples -------- @@ -1197,6 +1340,13 @@ def shift(self, periods: int = 1, fill_value: object = None) -> ExtensionArray: ExtensionArray Shifted. + See Also + -------- + api.extensions.ExtensionArray.transpose : Return a transposed view on + this array. + api.extensions.ExtensionArray.factorize : Encode the extension array as an + enumerated type. + Notes ----- If ``self`` is empty or ``periods`` is 0, a copy of ``self`` is @@ -1242,6 +1392,13 @@ def unique(self) -> Self: Returns ------- pandas.api.extensions.ExtensionArray + With unique values from the input array. 
+ + See Also + -------- + Index.unique: Return unique values in the index. + Series.unique: Return unique values of Series object. + unique: Return unique values based on a hash table. Examples -------- @@ -1332,12 +1489,23 @@ def equals(self, other: object) -> bool: boolean Whether the arrays are equivalent. + See Also + -------- + numpy.array_equal : Equivalent method for numpy array. + Series.equals : Equivalent method for Series. + DataFrame.equals : Equivalent method for DataFrame. + Examples -------- >>> arr1 = pd.array([1, 2, np.nan]) >>> arr2 = pd.array([1, 2, np.nan]) >>> arr1.equals(arr2) True + + >>> arr1 = pd.array([1, 3, np.nan]) + >>> arr2 = pd.array([1, 2, np.nan]) + >>> arr1.equals(arr2) + False """ if type(self) != type(other): return False @@ -1364,10 +1532,18 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: Parameters ---------- values : np.ndarray or ExtensionArray + Values to compare every element in the array against. Returns ------- np.ndarray[bool] + With true at indices where value is in `values`. + + See Also + -------- + DataFrame.isin: Whether each element in the DataFrame is contained in values. + Index.isin: Return a boolean array where the index values are in values. + Series.isin: Whether elements in Series are contained in values. Examples -------- @@ -1395,6 +1571,10 @@ def _values_for_factorize(self) -> tuple[np.ndarray, Any]: `-1` and not included in `uniques`. By default, ``np.nan`` is used. + See Also + -------- + util.hash_pandas_object : Hash the pandas object. + Notes ----- The values returned by this method are also used in @@ -1448,8 +1628,10 @@ def factorize( Examples -------- - >>> idx1 = pd.PeriodIndex(["2014-01", "2014-01", "2014-02", "2014-02", - ... "2014-03", "2014-03"], freq="M") + >>> idx1 = pd.PeriodIndex( + ... ["2014-01", "2014-01", "2014-02", "2014-02", "2014-03", "2014-03"], + ... freq="M", + ... ) >>> arr, idx = idx1.factorize() >>> arr array([0, 0, 1, 1, 2, 2]) @@ -1473,9 +1655,7 @@ def factorize( uniques_ea = self._from_factorized(uniques, self) return codes, uniques_ea - _extension_array_shared_docs[ - "repeat" - ] = """ + _extension_array_shared_docs["repeat"] = """ Repeat elements of a %(klass)s. Returns a new %(klass)s where each element of the current %(klass)s @@ -1567,6 +1747,7 @@ def take( Returns ------- ExtensionArray + An array formed with selected `indices`. Raises ------ @@ -1610,8 +1791,9 @@ def take(self, indices, allow_fill=False, fill_value=None): # type for the array, to the physical storage type for # the data, before passing to take. - result = take(data, indices, fill_value=fill_value, - allow_fill=allow_fill) + result = take( + data, indices, fill_value=fill_value, allow_fill=allow_fill + ) return self._from_sequence(result, dtype=self.dtype) """ # Implementer note: The `fill_value` parameter should be a user-facing @@ -1627,9 +1809,19 @@ def copy(self) -> Self: """ Return a copy of the array. + This method creates a copy of the `ExtensionArray` where modifying the + data in the copy will not affect the original array. This is useful when + you want to manipulate data without altering the original dataset. + Returns ------- ExtensionArray + A new `ExtensionArray` object that is a copy of the current instance. + + See Also + -------- + DataFrame.copy : Return a copy of the DataFrame. + Series.copy : Return a copy of the Series. 
Examples -------- @@ -1657,11 +1849,17 @@ def view(self, dtype: Dtype | None = None) -> ArrayLike: ExtensionArray or np.ndarray A view on the :class:`ExtensionArray`'s data. + See Also + -------- + api.extensions.ExtensionArray.ravel: Return a flattened view on input array. + Index.view: Equivalent function for Index. + ndarray.view: New view of array with the same data. + Examples -------- This gives view on the underlying data of an ``ExtensionArray`` and is not a copy. Modifications on either the view or the original ``ExtensionArray`` - will be reflectd on the underlying data: + will be reflected on the underlying data: >>> arr = pd.array([1, 2, 3]) >>> arr2 = arr.view() @@ -1746,14 +1944,25 @@ def _formatter(self, boxed: bool = False) -> Callable[[Any], str | None]: when ``boxed=False`` and :func:`str` is used when ``boxed=True``. + See Also + -------- + api.extensions.ExtensionArray._concat_same_type : Concatenate multiple + arrays of this dtype. + api.extensions.ExtensionArray._explode : Transform each element of + list-like to a row. + api.extensions.ExtensionArray._from_factorized : Reconstruct an + ExtensionArray after factorization. + api.extensions.ExtensionArray._from_sequence : Construct a new + ExtensionArray from a sequence of scalars. + Examples -------- >>> class MyExtensionArray(pd.arrays.NumpyExtensionArray): ... def _formatter(self, boxed=False): - ... return lambda x: '*' + str(x) + '*' if boxed else repr(x) + '*' + ... return lambda x: "*" + str(x) + "*" >>> MyExtensionArray(np.array([1, 2, 3, 4])) - [1*, 2*, 3*, 4*] + [*1*, *2*, *3*, *4*] Length: 4, dtype: int64 """ if boxed: @@ -1764,7 +1973,7 @@ def _formatter(self, boxed: bool = False) -> Callable[[Any], str | None]: # Reshaping # ------------------------------------------------------------------------ - def transpose(self, *axes: int) -> ExtensionArray: + def transpose(self, *axes: int) -> Self: """ Return a transposed view on this array. @@ -1785,10 +1994,10 @@ def transpose(self, *axes: int) -> ExtensionArray: return self[:] @property - def T(self) -> ExtensionArray: + def T(self) -> Self: return self.transpose() - def ravel(self, order: Literal["C", "F", "A", "K"] | None = "C") -> ExtensionArray: + def ravel(self, order: Literal["C", "F", "A", "K"] | None = "C") -> Self: """ Return a flattened view on this array. @@ -1799,6 +2008,11 @@ def ravel(self, order: Literal["C", "F", "A", "K"] | None = "C") -> ExtensionArr Returns ------- ExtensionArray + A flattened view on the array. + + See Also + -------- + ExtensionArray.tolist: Return a list of the values. Notes ----- @@ -1822,11 +2036,21 @@ def _concat_same_type(cls, to_concat: Sequence[Self]) -> Self: Parameters ---------- to_concat : sequence of this type + Arrays of the same dtype to concatenate. Returns ------- ExtensionArray + See Also + -------- + api.extensions.ExtensionArray._explode : Transform each element of + list-like to a row. + api.extensions.ExtensionArray._formatter : Formatting function for + scalar values. + api.extensions.ExtensionArray._from_factorized : Reconstruct an + ExtensionArray after factorization. + Examples -------- >>> arr1 = pd.array([1, 2, 3]) @@ -1877,15 +2101,24 @@ def _accumulate( Returns ------- array + An array containing the accumulated values. Raises ------ NotImplementedError : subclass does not define accumulations + See Also + -------- + api.extensions.ExtensionArray._concat_same_type : Concatenate multiple + arrays of this dtype. + api.extensions.ExtensionArray.view : Return a view on the array.
+ api.extensions.ExtensionArray._explode : Transform each element of + list-like to a row. + Examples -------- >>> arr = pd.array([1, 2, 3]) - >>> arr._accumulate(name='cumsum') + >>> arr._accumulate(name="cumsum") [1, 3, 6] Length: 3, dtype: Int64 @@ -1909,38 +2142,62 @@ def _reduce( keepdims : bool, default False If False, a scalar is returned. If True, the result has dimension with size one along the reduced axis. - - .. versionadded:: 2.1 - - This parameter is not required in the _reduce signature to keep backward - compatibility, but will become required in the future. If the parameter - is not found in the method signature, a FutureWarning will be emitted. **kwargs Additional keyword arguments passed to the reduction function. Currently, `ddof` is the only supported kwarg. Returns ------- - scalar + scalar or ndarray + The result of the reduction operation. The type of the result + depends on `keepdims`: + - If `keepdims` is `False`, a scalar value is returned. + - If `keepdims` is `True`, the result is wrapped in a numpy array with + a single element. Raises ------ - TypeError : subclass does not define reductions + TypeError : subclass does not define operations + + See Also + -------- + Series.min : Return the minimum value. + Series.max : Return the maximum value. + Series.sum : Return the sum of values. + Series.mean : Return the mean of values. + Series.median : Return the median of values. + Series.std : Return the standard deviation. + Series.var : Return the variance. + Series.prod : Return the product of values. + Series.sem : Return the standard error of the mean. + Series.kurt : Return the kurtosis. + Series.skew : Return the skewness. Examples -------- >>> pd.array([1, 2, 3])._reduce("min") - 1 + np.int64(1) + >>> pd.array([1, 2, 3])._reduce("max") + np.int64(3) + >>> pd.array([1, 2, 3])._reduce("sum") + np.int64(6) + >>> pd.array([1, 2, 3])._reduce("mean") + np.float64(2.0) + >>> pd.array([1, 2, 3])._reduce("median") + np.float64(2.0) """ meth = getattr(self, name, None) if meth is None: raise TypeError( f"'{type(self).__name__}' with dtype {self.dtype} " - f"does not support reduction '{name}'" + f"does not support operation '{name}'" ) result = meth(skipna=skipna, **kwargs) if keepdims: - result = np.array([result]) + if name in ["min", "max"]: + result = self._from_sequence([result], dtype=self.dtype) + else: + result = np.array([result]) return result @@ -1987,13 +2244,19 @@ def _hash_pandas_object( Returns ------- np.ndarray[uint64] + An array of hashed values. + + See Also + -------- + api.extensions.ExtensionArray._values_for_factorize : Return an array and + missing value suitable for factorization. + util.hash_array : Given a 1d array, return an array of hashed values. Examples -------- - >>> pd.array([1, 2])._hash_pandas_object(encoding='utf-8', - ... hash_key="1000000000000000", - ... categorize=False - ... ) + >>> pd.array([1, 2])._hash_pandas_object( + ... encoding="utf-8", hash_key="1000000000000000", categorize=False + ... ) array([ 6238072747940578789, 15839785061582574730], dtype=uint64) """ from pandas.core.util.hashing import hash_array @@ -2027,8 +2290,9 @@ def _explode(self) -> tuple[Self, npt.NDArray[np.uint64]]: Examples -------- >>> import pyarrow as pa - >>> a = pd.array([[1, 2, 3], [4], [5, 6]], - ... dtype=pd.ArrowDtype(pa.list_(pa.int64()))) + >>> a = pd.array( + ... [[1, 2, 3], [4], [5, 6]], dtype=pd.ArrowDtype(pa.list_(pa.int64())) + ...
) >>> a._explode() ( [1, 2, 3, 4, 5, 6] @@ -2049,6 +2313,12 @@ def tolist(self) -> list: Returns ------- list + Python list of values in array. + + See Also + -------- + Index.to_list: Return a list of the values in the Index. + Series.to_list: Return a list of the values in the Series. Examples -------- @@ -2071,11 +2341,18 @@ def insert(self, loc: int, item) -> Self: Parameters ---------- loc : int + Index where the `item` needs to be inserted. item : scalar-like + Value to be inserted. Returns ------- - same type as self + ExtensionArray + With `item` inserted at `loc`. + + See Also + -------- + Index.insert: Make new Index inserting new item at location. Notes ----- @@ -2150,25 +2427,6 @@ def _where(self, mask: npt.NDArray[np.bool_], value) -> Self: result[~mask] = val return result - # TODO(3.0): this can be removed once GH#33302 deprecation is enforced - def _fill_mask_inplace( - self, method: str, limit: int | None, mask: npt.NDArray[np.bool_] - ) -> None: - """ - Replace values in locations specified by 'mask' using pad or backfill. - - See also - -------- - ExtensionArray.fillna - """ - func = missing.get_fill_func(method) - npvalues = self.astype(object) - # NB: if we don't copy mask here, it may be altered inplace, which - # would mess up the `self[mask] = ...` below. - func(npvalues, limit=limit, mask=mask.copy()) - new_values = self._from_sequence(npvalues, dtype=self.dtype) - self[mask] = new_values[mask] - def _rank( self, *, @@ -2185,7 +2443,7 @@ def _rank( raise NotImplementedError return rank( - self._values_for_argsort(), + self, axis=axis, method=method, na_option=na_option, @@ -2253,8 +2511,9 @@ def _mode(self, dropna: bool = True) -> Self: Sorted, if possible. """ # error: Incompatible return value type (got "Union[ExtensionArray, - # ndarray[Any, Any]]", expected "Self") - return mode(self, dropna=dropna) # type: ignore[return-value] + # Tuple[np.ndarray, npt.NDArray[np.bool_]]", expected "Self") + result, _ = mode(self, dropna=dropna) + return result # type: ignore[return-value] def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): if any( @@ -2282,7 +2541,7 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): return arraylike.default_array_ufunc(self, ufunc, method, *inputs, **kwargs) - def map(self, mapper, na_action=None): + def map(self, mapper, na_action: Literal["ignore"] | None = None): """ Map values using an input mapping or function. @@ -2352,10 +2611,33 @@ def _groupby_op( # GH#43682 if isinstance(self.dtype, StringDtype): # StringArray + if op.how in [ + "prod", + "mean", + "median", + "cumsum", + "cumprod", + "std", + "sem", + "var", + "skew", + "kurt", + ]: + raise TypeError( + f"dtype '{self.dtype}' does not support operation '{how}'" + ) if op.how not in ["any", "all"]: # Fail early to avoid conversion to object op._get_cython_function(op.kind, op.how, np.dtype(object), False) - npvalues = self.to_numpy(object, na_value=np.nan) + + arr = self + if op.how == "sum": + # https://github.com/pandas-dev/pandas/issues/60229 + # All NA should result in the empty string. + assert "skipna" in kwargs + if kwargs["skipna"] and min_count == 0: + arr = arr.fillna("") + npvalues = arr.to_numpy(object, na_value=np.nan) else: raise NotImplementedError( f"function is not implemented for this dtype: {self.dtype}" @@ -2385,10 +2667,22 @@ def _groupby_op( class ExtensionArraySupportsAnyAll(ExtensionArray): - def any(self, *, skipna: bool = True) -> bool: + @overload + def any(self, *, skipna: Literal[True] = ...) 
-> bool: ... + + @overload + def any(self, *, skipna: bool) -> bool | NAType: ... + + def any(self, *, skipna: bool = True) -> bool | NAType: raise AbstractMethodError(self) - def all(self, *, skipna: bool = True) -> bool: + @overload + def all(self, *, skipna: Literal[True] = ...) -> bool: ... + + @overload + def all(self, *, skipna: bool) -> bool | NAType: ... + + def all(self, *, skipna: bool = True) -> bool | NAType: raise AbstractMethodError(self) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 04e6f0a0bcdde..87c18fe346c62 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -29,13 +29,14 @@ import pyarrow from pandas._typing import ( - Dtype, DtypeObj, Self, npt, type_t, ) + from pandas.core.dtypes.dtypes import ExtensionDtype + @register_extension_dtype class BooleanDtype(BaseMaskedDtype): @@ -55,6 +56,10 @@ class BooleanDtype(BaseMaskedDtype): ------- None + See Also + -------- + StringDtype : Extension dtype for string data. + Examples -------- >>> pd.BooleanDtype() @@ -63,6 +68,9 @@ class BooleanDtype(BaseMaskedDtype): name: ClassVar[str] = "boolean" + # The value used to fill '_data' to avoid upcasting + _internal_fill_value = False + # https://github.com/python/mypy/issues/4125 # error: Signature of "type" incompatible with supertype "BaseMaskedDtype" @property @@ -278,6 +286,13 @@ class BooleanArray(BaseMaskedArray): ------- BooleanArray + See Also + -------- + array : Create an array from data with the appropriate dtype. + BooleanDtype : Extension dtype for boolean data. + Series : One-dimensional ndarray with axis labels (including time series). + DataFrame : Two-dimensional, size-mutable, potentially heterogeneous tabular data. + Examples -------- Create a BooleanArray with :func:`pandas.array`: @@ -288,13 +303,6 @@ class BooleanArray(BaseMaskedArray): Length: 3, dtype: boolean """ - # The value used to fill '_data' to avoid upcasting - _internal_fill_value = False - # Fill values used for any/all - # Incompatible types in assignment (expression has type "bool", base class - # "BaseMaskedArray" defined the type as "") - _truthy_value = True # type: ignore[assignment] - _falsey_value = False # type: ignore[assignment] _TRUE_VALUES = {"True", "TRUE", "true", "1", "1.0"} _FALSE_VALUES = {"False", "FALSE", "false", "0", "0.0"} @@ -324,19 +332,25 @@ def _from_sequence_of_strings( cls, strings: list[str], *, - dtype: Dtype | None = None, + dtype: ExtensionDtype, copy: bool = False, true_values: list[str] | None = None, false_values: list[str] | None = None, + none_values: list[str] | None = None, ) -> BooleanArray: true_values_union = cls._TRUE_VALUES.union(true_values or []) false_values_union = cls._FALSE_VALUES.union(false_values or []) - def map_string(s) -> bool: + if none_values is None: + none_values = [] + + def map_string(s) -> bool | None: if s in true_values_union: return True elif s in false_values_union: return False + elif s in none_values: + return None else: raise ValueError(f"{s} cannot be cast to bool") @@ -355,7 +369,7 @@ def _coerce_to_array( assert dtype == "boolean" return coerce_to_array(value, copy=copy) - def _logical_method(self, other, op): + def _logical_method(self, other, op): # type: ignore[override] assert op.__name__ in {"or_", "ror_", "and_", "rand_", "xor", "rxor"} other_is_scalar = lib.is_scalar(other) mask = None diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 065a942cae768..df1aa21e9203c 100644 --- a/pandas/core/arrays/categorical.py +++
b/pandas/core/arrays/categorical.py @@ -10,7 +10,6 @@ cast, overload, ) -import warnings import numpy as np @@ -23,7 +22,6 @@ ) from pandas._libs.arrays import NDArrayBacked from pandas.compat.numpy import function as nv -from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( @@ -44,7 +42,9 @@ pandas_dtype, ) from pandas.core.dtypes.dtypes import ( + ArrowDtype, CategoricalDtype, + CategoricalDtypeType, ExtensionDtype, ) from pandas.core.dtypes.generic import ( @@ -91,6 +91,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Iterator, Sequence, @@ -241,7 +242,9 @@ def contains(cat, key, container) -> bool: return any(loc_ in container for loc_ in loc) -class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMixin): +# error: Definition of "delete/ravel/T/repeat/copy" in base class "NDArrayBacked" +# is incompatible with definition in base class "ExtensionArray" +class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMixin): # type: ignore[misc] """ Represent a categorical variable in classic R / S-plus fashion. @@ -272,6 +275,8 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi provided). dtype : CategoricalDtype An instance of ``CategoricalDtype`` to use for this categorical. + copy : bool, default True + Whether to copy if the codes are unchanged. Attributes ---------- @@ -289,6 +294,15 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi Methods ------- from_codes + as_ordered + as_unordered + set_categories + rename_categories + reorder_categories + add_categories + remove_categories + remove_unused_categories + map __array__ Raises @@ -316,7 +330,7 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi [1, 2, 3, 1, 2, 3] Categories (3, int64): [1, 2, 3] - >>> pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c']) + >>> pd.Categorical(["a", "b", "c", "a", "b", "c"]) ['a', 'b', 'c', 'a', 'b', 'c'] Categories (3, object): ['a', 'b', 'c'] @@ -336,8 +350,9 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi Ordered `Categoricals` can be sorted according to the custom order of the categories and can have a min and max value. - >>> c = pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], ordered=True, - ... categories=['c', 'b', 'a']) + >>> c = pd.Categorical( + ... ["a", "b", "c", "a", "b", "c"], ordered=True, categories=["c", "b", "a"] + ... ) >>> c ['a', 'b', 'c', 'a', 'b', 'c'] Categories (3, object): ['c' < 'b' < 'a'] @@ -372,20 +387,8 @@ def __init__( categories=None, ordered=None, dtype: Dtype | None = None, - fastpath: bool | lib.NoDefault = lib.no_default, copy: bool = True, ) -> None: - if fastpath is not lib.no_default: - # GH#20110 - warnings.warn( - "The 'fastpath' keyword in Categorical is deprecated and will " - "be removed in a future version. 
Use Categorical.from_codes instead", - DeprecationWarning, - stacklevel=find_stack_level(), - ) - else: - fastpath = False - dtype = CategoricalDtype._from_values_or_dtype( values, categories, ordered, dtype ) @@ -393,12 +396,6 @@ def __init__( # we may have dtype.categories be None, and we need to # infer categories in a factorization step further below - if fastpath: - codes = coerce_indexer_dtype(values, dtype.categories) - dtype = CategoricalDtype(ordered=False).update_dtype(dtype) - super().__init__(codes, dtype) - return - if not is_list_like(values): # GH#38433 raise TypeError("Categorical input must be list-like") @@ -412,6 +409,10 @@ def __init__( if isinstance(vdtype, CategoricalDtype): if dtype.categories is None: dtype = CategoricalDtype(values.categories, dtype.ordered) + elif isinstance(values, range): + from pandas.core.indexes.range import RangeIndex + + values = RangeIndex(values) elif not isinstance(values, (ABCIndex, ABCSeries, ExtensionArray)): values = com.convert_to_list_like(values) if isinstance(values, list) and len(values) == 0: @@ -443,24 +444,37 @@ def __init__( values = arr if dtype.categories is None: - if not isinstance(values, ABCIndex): - # in particular RangeIndex xref test_index_equal_range_categories - values = sanitize_array(values, None) - try: - codes, categories = factorize(values, sort=True) - except TypeError as err: - codes, categories = factorize(values, sort=False) - if dtype.ordered: - # raise, as we don't have a sortable data structure and so - # the user should give us one by specifying categories - raise TypeError( - "'values' is not ordered, please " - "explicitly specify the categories order " - "by passing in a categories argument." - ) from err - - # we're inferring from values - dtype = CategoricalDtype(categories, dtype.ordered) + if isinstance(values.dtype, ArrowDtype) and issubclass( + values.dtype.type, CategoricalDtypeType + ): + from pandas import Index + + if isinstance(values, Index): + arr = values._data._pa_array.combine_chunks() + else: + arr = extract_array(values)._pa_array.combine_chunks() + categories = arr.dictionary.to_pandas(types_mapper=ArrowDtype) + codes = arr.indices.to_numpy() + dtype = CategoricalDtype(categories, values.dtype.pyarrow_dtype.ordered) + else: + if not isinstance(values, ABCIndex): + # in particular RangeIndex xref test_index_equal_range_categories + values = sanitize_array(values, None) + try: + codes, categories = factorize(values, sort=True) + except TypeError as err: + codes, categories = factorize(values, sort=False) + if dtype.ordered: + # raise, as we don't have a sortable data structure and so + # the user should give us one by specifying categories + raise TypeError( + "'values' is not ordered, please " + "explicitly specify the categories order " + "by passing in a categories argument." + ) from err + + # we're inferring from values + dtype = CategoricalDtype(categories, dtype.ordered) elif isinstance(values.dtype, CategoricalDtype): old_codes = extract_array(values)._codes @@ -486,9 +500,14 @@ def dtype(self) -> CategoricalDtype: """ The :class:`~pandas.api.types.CategoricalDtype` for this instance. + See Also + -------- + astype : Cast argument to a specified dtype. + CategoricalDtype : Type for categorical data. 
+ Examples -------- - >>> cat = pd.Categorical(['a', 'b'], ordered=True) + >>> cat = pd.Categorical(["a", "b"], ordered=True) >>> cat ['a', 'b'] Categories (2, object): ['a' < 'b'] @@ -527,16 +546,13 @@ def _from_scalars(cls, scalars, *, dtype: DtypeObj) -> Self: return res @overload - def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray: - ... + def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray: ... @overload - def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray: - ... + def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray: ... @overload - def astype(self, dtype: AstypeArg, copy: bool = ...) -> ArrayLike: - ... + def astype(self, dtype: AstypeArg, copy: bool = ...) -> ArrayLike: ... def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: """ @@ -551,6 +567,7 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: object is returned. """ dtype = pandas_dtype(dtype) + result: Categorical | np.ndarray if self.dtype is dtype: result = self.copy() if copy else self @@ -567,11 +584,12 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: raise ValueError("Cannot convert float NaN to integer") elif len(self.codes) == 0 or len(self.categories) == 0: - result = np.array( - self, - dtype=dtype, - copy=copy, - ) + # For NumPy 1.x compatibility we cannot use copy=None. And + # `copy=False` has the meaning of `copy=None` here: + if not copy: + result = np.asarray(self, dtype=dtype) + else: + result = np.array(self, dtype=dtype) else: # GH8628 (PERF): astype category codes instead of astyping array @@ -587,9 +605,9 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: except ( TypeError, # downstream error msg for CategoricalIndex is misleading ValueError, - ): + ) as err: msg = f"Cannot cast {self.categories.dtype} dtype to {dtype}" - raise ValueError(msg) + raise ValueError(msg) from err result = take_nd( new_cats, ensure_platform_int(self._codes), fill_value=fill_value @@ -597,19 +615,6 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: return result - def to_list(self): - """ - Alias for tolist. - """ - # GH#51254 - warnings.warn( - "Categorical.to_list is deprecated and will be removed in a future " - "version. Use obj.tolist() instead", - FutureWarning, - stacklevel=find_stack_level(), - ) - return self.tolist() - @classmethod def _from_inferred_categories( cls, inferred_categories, inferred_codes, dtype, true_values=None @@ -725,9 +730,14 @@ def from_codes( ------- Categorical + See Also + -------- + codes : The category codes of the categorical. + CategoricalIndex : An Index with an underlying ``Categorical``. 
+ Examples -------- - >>> dtype = pd.CategoricalDtype(['a', 'b'], ordered=True) + >>> dtype = pd.CategoricalDtype(["a", "b"], ordered=True) >>> pd.Categorical.from_codes(codes=[0, 1, 0, 1], dtype=dtype) ['a', 'b', 'a', 'b'] Categories (2, object): ['a' < 'b'] @@ -782,28 +792,28 @@ def categories(self) -> Index: -------- For :class:`pandas.Series`: - >>> ser = pd.Series(['a', 'b', 'c', 'a'], dtype='category') + >>> ser = pd.Series(["a", "b", "c", "a"], dtype="category") >>> ser.cat.categories Index(['a', 'b', 'c'], dtype='object') - >>> raw_cat = pd.Categorical(['a', 'b', 'c', 'a'], categories=['b', 'c', 'd']) + >>> raw_cat = pd.Categorical(["a", "b", "c", "a"], categories=["b", "c", "d"]) >>> ser = pd.Series(raw_cat) >>> ser.cat.categories Index(['b', 'c', 'd'], dtype='object') For :class:`pandas.Categorical`: - >>> cat = pd.Categorical(['a', 'b'], ordered=True) + >>> cat = pd.Categorical(["a", "b"], ordered=True) >>> cat.categories Index(['a', 'b'], dtype='object') For :class:`pandas.CategoricalIndex`: - >>> ci = pd.CategoricalIndex(['a', 'c', 'b', 'a', 'c', 'b']) + >>> ci = pd.CategoricalIndex(["a", "c", "b", "a", "c", "b"]) >>> ci.categories Index(['a', 'b', 'c'], dtype='object') - >>> ci = pd.CategoricalIndex(['a', 'c'], categories=['c', 'b', 'a']) + >>> ci = pd.CategoricalIndex(["a", "c"], categories=["c", "b", "a"]) >>> ci.categories Index(['c', 'b', 'a'], dtype='object') """ @@ -814,36 +824,42 @@ def ordered(self) -> Ordered: """ Whether the categories have an ordered relationship. + See Also + -------- + set_ordered : Set the ordered attribute. + as_ordered : Set the Categorical to be ordered. + as_unordered : Set the Categorical to be unordered. + Examples -------- For :class:`pandas.Series`: - >>> ser = pd.Series(['a', 'b', 'c', 'a'], dtype='category') + >>> ser = pd.Series(["a", "b", "c", "a"], dtype="category") >>> ser.cat.ordered False - >>> raw_cat = pd.Categorical(['a', 'b', 'c', 'a'], ordered=True) + >>> raw_cat = pd.Categorical(["a", "b", "c", "a"], ordered=True) >>> ser = pd.Series(raw_cat) >>> ser.cat.ordered True For :class:`pandas.Categorical`: - >>> cat = pd.Categorical(['a', 'b'], ordered=True) + >>> cat = pd.Categorical(["a", "b"], ordered=True) >>> cat.ordered True - >>> cat = pd.Categorical(['a', 'b'], ordered=False) + >>> cat = pd.Categorical(["a", "b"], ordered=False) >>> cat.ordered False For :class:`pandas.CategoricalIndex`: - >>> ci = pd.CategoricalIndex(['a', 'b'], ordered=True) + >>> ci = pd.CategoricalIndex(["a", "b"], ordered=True) >>> ci.ordered True - >>> ci = pd.CategoricalIndex(['a', 'b'], ordered=False) + >>> ci = pd.CategoricalIndex(["a", "b"], ordered=False) >>> ci.ordered False """ @@ -865,21 +881,26 @@ def codes(self) -> np.ndarray: ndarray[int] A non-writable view of the ``codes`` array. + See Also + -------- + Categorical.from_codes : Make a Categorical from codes. + CategoricalIndex : An Index with an underlying ``Categorical``. 
+ Examples -------- For :class:`pandas.Categorical`: - >>> cat = pd.Categorical(['a', 'b'], ordered=True) + >>> cat = pd.Categorical(["a", "b"], ordered=True) >>> cat.codes array([0, 1], dtype=int8) For :class:`pandas.CategoricalIndex`: - >>> ci = pd.CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c']) + >>> ci = pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"]) >>> ci.codes array([0, 1, 2, 0, 1, 2], dtype=int8) - >>> ci = pd.CategoricalIndex(['a', 'c'], categories=['c', 'b', 'a']) + >>> ci = pd.CategoricalIndex(["a", "c"], categories=["c", "b", "a"]) >>> ci.codes array([2, 0], dtype=int8) """ @@ -898,12 +919,12 @@ def _set_categories(self, categories, fastpath: bool = False) -> None: Examples -------- - >>> c = pd.Categorical(['a', 'b']) + >>> c = pd.Categorical(["a", "b"]) >>> c ['a', 'b'] Categories (2, object): ['a', 'b'] - >>> c._set_categories(pd.Index(['a', 'c'])) + >>> c._set_categories(pd.Index(["a", "c"])) >>> c ['a', 'c'] Categories (2, object): ['a', 'c'] @@ -963,11 +984,15 @@ def as_ordered(self) -> Self: Categorical Ordered Categorical. + See Also + -------- + as_unordered : Set the Categorical to be unordered. + Examples -------- For :class:`pandas.Series`: - >>> ser = pd.Series(['a', 'b', 'c', 'a'], dtype='category') + >>> ser = pd.Series(["a", "b", "c", "a"], dtype="category") >>> ser.cat.ordered False >>> ser = ser.cat.as_ordered() @@ -976,7 +1001,7 @@ def as_ordered(self) -> Self: For :class:`pandas.CategoricalIndex`: - >>> ci = pd.CategoricalIndex(['a', 'b', 'c', 'a']) + >>> ci = pd.CategoricalIndex(["a", "b", "c", "a"]) >>> ci.ordered False >>> ci = ci.as_ordered() @@ -994,11 +1019,15 @@ def as_unordered(self) -> Self: Categorical Unordered Categorical. + See Also + -------- + as_ordered : Set the Categorical to be ordered. + Examples -------- For :class:`pandas.Series`: - >>> raw_cat = pd.Categorical(['a', 'b', 'c', 'a'], ordered=True) + >>> raw_cat = pd.Categorical(["a", "b", "c", "a"], ordered=True) >>> ser = pd.Series(raw_cat) >>> ser.cat.ordered True @@ -1008,7 +1037,7 @@ def as_unordered(self) -> Self: For :class:`pandas.CategoricalIndex`: - >>> ci = pd.CategoricalIndex(['a', 'b', 'c', 'a'], ordered=True) + >>> ci = pd.CategoricalIndex(["a", "b", "c", "a"], ordered=True) >>> ci.ordered True >>> ci = ci.as_unordered() @@ -1017,7 +1046,9 @@ def as_unordered(self) -> Self: """ return self.set_ordered(False) - def set_categories(self, new_categories, ordered=None, rename: bool = False): + def set_categories( + self, new_categories, ordered=None, rename: bool = False + ) -> Self: """ Set the categories to the specified new categories. @@ -1034,14 +1065,14 @@ def set_categories(self, new_categories, ordered=None, rename: bool = False): On the other hand this method does not do checks (e.g., whether the old categories are included in the new categories on a reorder), which can result in surprising changes, for example when using special string - dtypes, which does not considers a S1 string equal to a single char + dtypes, which do not consider a S1 string equal to a single char python string. Parameters ---------- new_categories : Index-like The categories in new order. - ordered : bool, default False + ordered : bool, default None Whether or not the categorical is treated as an ordered categorical. If not given, do not change the ordered information. rename : bool, default False @@ -1050,7 +1081,8 @@ def set_categories(self, new_categories, ordered=None, rename: bool = False): Returns ------- - Categorical with reordered categories.
+ Categorical + New categories to be used, with optional ordering changes. Raises ------ @@ -1069,8 +1101,9 @@ def set_categories(self, new_categories, ordered=None, rename: bool = False): -------- For :class:`pandas.Series`: - >>> raw_cat = pd.Categorical(['a', 'b', 'c', 'A'], - ... categories=['a', 'b', 'c'], ordered=True) + >>> raw_cat = pd.Categorical( + ... ["a", "b", "c", "A"], categories=["a", "b", "c"], ordered=True + ... ) >>> ser = pd.Series(raw_cat) >>> ser 0 a @@ -1080,7 +1113,7 @@ def set_categories(self, new_categories, ordered=None, rename: bool = False): dtype: category Categories (3, object): ['a' < 'b' < 'c'] - >>> ser.cat.set_categories(['A', 'B', 'C'], rename=True) + >>> ser.cat.set_categories(["A", "B", "C"], rename=True) 0 A 1 B 2 C @@ -1090,16 +1123,17 @@ def set_categories(self, new_categories, ordered=None, rename: bool = False): For :class:`pandas.CategoricalIndex`: - >>> ci = pd.CategoricalIndex(['a', 'b', 'c', 'A'], - ... categories=['a', 'b', 'c'], ordered=True) + >>> ci = pd.CategoricalIndex( + ... ["a", "b", "c", "A"], categories=["a", "b", "c"], ordered=True + ... ) >>> ci CategoricalIndex(['a', 'b', 'c', nan], categories=['a', 'b', 'c'], ordered=True, dtype='category') - >>> ci.set_categories(['A', 'b', 'c']) + >>> ci.set_categories(["A", "b", "c"]) CategoricalIndex([nan, 'b', 'c', nan], categories=['A', 'b', 'c'], ordered=True, dtype='category') - >>> ci.set_categories(['A', 'b', 'c'], rename=True) + >>> ci.set_categories(["A", "b", "c"], rename=True) CategoricalIndex(['A', 'b', 'c', nan], categories=['A', 'b', 'c'], ordered=True, dtype='category') """ @@ -1127,6 +1161,12 @@ def rename_categories(self, new_categories) -> Self: """ Rename categories. + This method is commonly used to re-label or adjust the + category names in categorical data without changing the + underlying data. It is useful in situations where you want + to modify the labels used for clarity, consistency, + or readability. 
+ Parameters ---------- new_categories : list-like, dict-like or callable @@ -1165,7 +1205,7 @@ def rename_categories(self, new_categories) -> Self: Examples -------- - >>> c = pd.Categorical(['a', 'a', 'b']) + >>> c = pd.Categorical(["a", "a", "b"]) >>> c.rename_categories([0, 1]) [0, 0, 1] Categories (2, int64): [0, 1] For dict-like ``new_categories``, extra keys are ignored and categories not in the dictionary are passed through - >>> c.rename_categories({'a': 'A', 'c': 'C'}) + >>> c.rename_categories({"a": "A", "c": "C"}) ['A', 'A', 'b'] Categories (2, object): ['A', 'b'] @@ -1233,8 +1273,8 @@ def reorder_categories(self, new_categories, ordered=None) -> Self: -------- For :class:`pandas.Series`: - >>> ser = pd.Series(['a', 'b', 'c', 'a'], dtype='category') - >>> ser = ser.cat.reorder_categories(['c', 'b', 'a'], ordered=True) + >>> ser = pd.Series(["a", "b", "c", "a"], dtype="category") + >>> ser = ser.cat.reorder_categories(["c", "b", "a"], ordered=True) >>> ser 0 a 1 b 2 c 3 a dtype: category Categories (3, object): ['c' < 'b' < 'a'] @@ -1253,11 +1293,11 @@ def reorder_categories(self, new_categories, ordered=None) -> Self: For :class:`pandas.CategoricalIndex`: - >>> ci = pd.CategoricalIndex(['a', 'b', 'c', 'a']) + >>> ci = pd.CategoricalIndex(["a", "b", "c", "a"]) >>> ci CategoricalIndex(['a', 'b', 'c', 'a'], categories=['a', 'b', 'c'], ordered=False, dtype='category') - >>> ci.reorder_categories(['c', 'b', 'a'], ordered=True) + >>> ci.reorder_categories(["c", "b", "a"], ordered=True) CategoricalIndex(['a', 'b', 'c', 'a'], categories=['c', 'b', 'a'], ordered=True, dtype='category') """ @@ -1280,7 +1320,7 @@ def add_categories(self, new_categories) -> Self: Parameters ---------- new_categories : category or list-like of category - The new categories to be included. + The new categories to be included. Returns ------- @@ -1303,12 +1343,12 @@ def add_categories(self, new_categories) -> Self: Examples -------- - >>> c = pd.Categorical(['c', 'b', 'c']) + >>> c = pd.Categorical(["c", "b", "c"]) >>> c ['c', 'b', 'c'] Categories (2, object): ['b', 'c'] - >>> c.add_categories(['d', 'a']) + >>> c.add_categories(["d", "a"]) ['c', 'b', 'c'] Categories (4, object): ['b', 'c', 'd', 'a'] """ @@ -1343,8 +1383,8 @@ def remove_categories(self, removals) -> Self: """ Remove the specified categories. - `removals` must be included in the old categories. Values which were in - the removed categories will be set to NaN + The ``removals`` argument must be a subset of the current categories. + Any values that were part of the removed categories will be set to NaN. Parameters ---------- @@ -1371,12 +1411,12 @@ def remove_categories(self, removals) -> Self: Examples -------- - >>> c = pd.Categorical(['a', 'c', 'b', 'c', 'd']) + >>> c = pd.Categorical(["a", "c", "b", "c", "d"]) >>> c ['a', 'c', 'b', 'c', 'd'] Categories (4, object): ['a', 'b', 'c', 'd'] - >>> c.remove_categories(['d', 'a']) + >>> c.remove_categories(["d", "a"]) [NaN, 'c', 'b', 'c', NaN] Categories (2, object): ['b', 'c'] """ @@ -1403,6 +1443,10 @@ def remove_unused_categories(self) -> Self: """ Remove categories which are not used. + This method is useful when working with datasets + that undergo dynamic changes where categories may no longer be + relevant, allowing you to maintain a clean, efficient data structure.
+ Returns ------- Categorical @@ -1418,13 +1462,13 @@ def remove_unused_categories(self) -> Self: Examples -------- - >>> c = pd.Categorical(['a', 'c', 'b', 'c', 'd']) + >>> c = pd.Categorical(["a", "c", "b", "c", "d"]) >>> c ['a', 'c', 'b', 'c', 'd'] Categories (4, object): ['a', 'b', 'c', 'd'] - >>> c[2] = 'a' - >>> c[4] = 'c' + >>> c[2] = "a" + >>> c[4] = "c" >>> c ['a', 'c', 'a', 'c', 'c'] Categories (4, object): ['a', 'b', 'c', 'd'] @@ -1453,7 +1497,7 @@ def remove_unused_categories(self) -> Self: def map( self, mapper, - na_action: Literal["ignore"] | None | lib.NoDefault = lib.no_default, + na_action: Literal["ignore"] | None = None, ): """ Map categories using an input mapping or function. @@ -1471,15 +1515,10 @@ def map( ---------- mapper : function, dict, or Series Mapping correspondence. - na_action : {None, 'ignore'}, default 'ignore' + na_action : {None, 'ignore'}, default None If 'ignore', propagate NaN values, without passing them to the mapping correspondence. - .. deprecated:: 2.1.0 - - The default value of 'ignore' has been deprecated and will be changed to - None in the future. - Returns ------- pandas.Categorical or pandas.Index @@ -1498,50 +1537,39 @@ def map( Examples -------- - >>> cat = pd.Categorical(['a', 'b', 'c']) + >>> cat = pd.Categorical(["a", "b", "c"]) >>> cat ['a', 'b', 'c'] Categories (3, object): ['a', 'b', 'c'] >>> cat.map(lambda x: x.upper(), na_action=None) ['A', 'B', 'C'] Categories (3, object): ['A', 'B', 'C'] - >>> cat.map({'a': 'first', 'b': 'second', 'c': 'third'}, na_action=None) + >>> cat.map({"a": "first", "b": "second", "c": "third"}, na_action=None) ['first', 'second', 'third'] Categories (3, object): ['first', 'second', 'third'] If the mapping is one-to-one the ordering of the categories is preserved: - >>> cat = pd.Categorical(['a', 'b', 'c'], ordered=True) + >>> cat = pd.Categorical(["a", "b", "c"], ordered=True) >>> cat ['a', 'b', 'c'] Categories (3, object): ['a' < 'b' < 'c'] - >>> cat.map({'a': 3, 'b': 2, 'c': 1}, na_action=None) + >>> cat.map({"a": 3, "b": 2, "c": 1}, na_action=None) [3, 2, 1] Categories (3, int64): [3 < 2 < 1] If the mapping is not one-to-one an :class:`~pandas.Index` is returned: - >>> cat.map({'a': 'first', 'b': 'second', 'c': 'first'}, na_action=None) + >>> cat.map({"a": "first", "b": "second", "c": "first"}, na_action=None) Index(['first', 'second', 'first'], dtype='object') If a `dict` is used, all unmapped categories are mapped to `NaN` and the result is an :class:`~pandas.Index`: - >>> cat.map({'a': 'first', 'b': 'second'}, na_action=None) + >>> cat.map({"a": "first", "b": "second"}, na_action=None) Index(['first', 'second', nan], dtype='object') """ - if na_action is lib.no_default: - warnings.warn( - "The default value of 'ignore' for the `na_action` parameter in " - "pandas.Categorical.map is deprecated and will be " - "changed to 'None' in a future version. Please set na_action to the " - "desired value to avoid seeing this warning", - FutureWarning, - stacklevel=find_stack_level(), - ) - na_action = "ignore" - assert callable(mapper) or is_dict_like(mapper) new_categories = self.categories.map(mapper) @@ -1626,10 +1654,23 @@ def _validate_codes_for_dtype(cls, codes, *, dtype: CategoricalDtype) -> np.ndar # ------------------------------------------------------------- @ravel_compat - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: """ The numpy array interface. 
+ Users should not call this directly. Rather, it is invoked by + :func:`numpy.array` and :func:`numpy.asarray`. + + Parameters + ---------- + dtype : np.dtype or None + Specifies the dtype for the array. + + copy : bool or None, optional + See :func:`numpy.asarray`. + Returns ------- numpy.array A numpy array of either the specified dtype or, @@ -1637,23 +1678,32 @@ def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: if dtype==None (default), the same dtype as categorical.categories.dtype. + See Also + -------- + numpy.asarray : Convert input to numpy.ndarray. + Examples -------- - >>> cat = pd.Categorical(['a', 'b'], ordered=True) + >>> cat = pd.Categorical(["a", "b"], ordered=True) The following calls ``cat.__array__`` >>> np.asarray(cat) array(['a', 'b'], dtype=object) """ + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + ret = take_nd(self.categories._values, self._codes) - if dtype and np.dtype(dtype) != self.categories.dtype: - return np.asarray(ret, dtype) # When we're a Categorical[ExtensionArray], like Interval, # we need to ensure __array__ gets all the way to an # ndarray. - return np.asarray(ret) + + # `take_nd` should already make a copy, so don't force again. + return np.asarray(ret, dtype=dtype) def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): # for binary ops, use our custom dunder methods @@ -1803,13 +1853,17 @@ def value_counts(self, dropna: bool = True) -> Series: count = np.bincount(obs, minlength=ncat or 0) else: count = np.bincount(np.where(mask, code, ncat)) - ix = np.append(ix, -1) + ix = np.append(ix, -1) # type: ignore[assignment] ix = coerce_indexer_dtype(ix, self.dtype.categories) - ix = self._from_backing_data(ix) + ix_categorical = self._from_backing_data(ix) return Series( - count, index=CategoricalIndex(ix), dtype="int64", name="count", copy=False + count, + index=CategoricalIndex(ix_categorical), + dtype="int64", + name="count", + copy=False, ) # error: Argument 2 of "_empty" is incompatible with supertype @@ -1870,7 +1924,7 @@ def check_for_ordered(self, op) -> None: def argsort( self, *, ascending: bool = True, kind: SortKind = "quicksort", **kwargs - ): + ) -> npt.NDArray[np.intp]: """ Return the indices that would sort the Categorical. @@ -1904,12 +1958,12 @@ def argsort( Examples -------- - >>> pd.Categorical(['b', 'b', 'a', 'c']).argsort() + >>> pd.Categorical(["b", "b", "a", "c"]).argsort() array([2, 0, 1, 3]) - >>> cat = pd.Categorical(['b', 'b', 'a', 'c'], - ... categories=['c', 'b', 'a'], - ... ordered=True) + >>> cat = pd.Categorical( + ... ["b", "b", "a", "c"], categories=["c", "b", "a"], ordered=True + ... ) >>> cat.argsort() array([3, 0, 1, 2]) @@ -1928,14 +1982,12 @@ def sort_values( inplace: Literal[False] = ..., ascending: bool = ..., na_position: str = ..., - ) -> Self: - ... + ) -> Self: ... @overload def sort_values( self, *, inplace: Literal[True], ascending: bool = ..., na_position: str = ... - ) -> None: - ... + ) -> None: ...
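Note: the ``__array__`` hunk above opts ``Categorical`` into the ``copy`` keyword of the array protocol, and a zero-copy request cannot be honored because ``take_nd`` materializes the category values. A minimal sketch of the resulting behavior (assuming NumPy >= 2, which is what forwards ``copy=False`` to ``__array__``; the snippet is illustrative and not part of this diff):

    import numpy as np
    import pandas as pd

    cat = pd.Categorical(["a", "b"])
    np.asarray(cat)  # fine: materializes the values, copying is allowed
    try:
        np.array(cat, copy=False)  # zero-copy conversion is impossible here
    except ValueError as err:
        print(err)  # "Unable to avoid copy while creating an array as requested."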
def sort_values( self, @@ -2003,16 +2055,16 @@ def sort_values( >>> c.sort_values(ascending=False) [5, 2, 2, NaN, NaN] Categories (2, int64): [2, 5] - >>> c.sort_values(na_position='first') + >>> c.sort_values(na_position="first") [NaN, NaN, 2, 2, 5] Categories (2, int64): [2, 5] - >>> c.sort_values(ascending=False, na_position='first') + >>> c.sort_values(ascending=False, na_position="first") [NaN, NaN, 5, 2, 2] Categories (2, int64): [2, 5] """ inplace = validate_bool_kwarg(inplace, "inplace") if na_position not in ["last", "first"]: - raise ValueError(f"invalid na_position: {repr(na_position)}") + raise ValueError(f"invalid na_position: {na_position!r}") sorted_idx = nargsort(self, ascending=ascending, na_position=na_position) @@ -2164,7 +2216,9 @@ def __contains__(self, key) -> bool: # ------------------------------------------------------------------ # Rendering Methods - def _formatter(self, boxed: bool = False): + # error: Return type "None" of "_formatter" incompatible with return + # type "Callable[[Any], str | None]" in supertype "ExtensionArray" + def _formatter(self, boxed: bool = False) -> None: # type: ignore[override] # Returning None here will cause format_array to do inference. return None @@ -2318,7 +2372,7 @@ def _reverse_indexer(self) -> dict[Hashable, npt.NDArray[np.intp]]: Examples -------- - >>> c = pd.Categorical(list('aabca')) + >>> c = pd.Categorical(list("aabca")) >>> c ['a', 'a', 'b', 'c', 'a'] Categories (3, object): ['a', 'b', 'c'] @@ -2423,7 +2477,7 @@ def _mode(self, dropna: bool = True) -> Categorical: if dropna: mask = self.isna() - res_codes = algorithms.mode(codes, mask=mask) + res_codes, _ = algorithms.mode(codes, mask=mask) res_codes = cast(np.ndarray, res_codes) assert res_codes.dtype == codes.dtype res = self._from_backing_data(res_codes) @@ -2460,14 +2514,8 @@ def unique(self) -> Self: ['b', 'a'] Categories (3, object): ['a' < 'b' < 'c'] """ - # pylint: disable=useless-parent-delegation return super().unique() - def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray: - # make sure we have correct itemsize for resulting codes - assert res_values.dtype == self._ndarray.dtype - return res_values - def equals(self, other: object) -> bool: """ Returns True if categorical arrays are equal. 
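Note: the ``@@ -2487`` hunk below adds ``Categorical._accumulate``, which implements ``cummin``/``cummax`` for ordered categoricals by accumulating over the integer codes and patching NA codes so missing values cannot win the running min/max. A rough usage sketch of the expected behavior (outputs are inferred, not copied from the test suite):

    import pandas as pd

    cat = pd.Categorical(["a", "c", "b"], categories=["a", "b", "c"], ordered=True)
    ser = pd.Series(cat)
    print(ser.cummax().tolist())  # expected: ['a', 'c', 'c']
    print(ser.cummin().tolist())  # expected: ['a', 'a', 'a']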
@@ -2487,6 +2535,28 @@ def equals(self, other: object) -> bool: return np.array_equal(self._codes, other._codes) return False + def _accumulate(self, name: str, skipna: bool = True, **kwargs) -> Self: + func: Callable + if name == "cummin": + func = np.minimum.accumulate + elif name == "cummax": + func = np.maximum.accumulate + else: + raise TypeError(f"Accumulation {name} not supported for {type(self)}") + self.check_for_ordered(name) + + codes = self.codes.copy() + mask = self.isna() + if func == np.minimum.accumulate: + codes[mask] = np.iinfo(codes.dtype.type).max + # no need to change codes for maximum because codes[mask] is already -1 + if not skipna: + mask = np.maximum.accumulate(mask) + + codes = func(codes) + codes[mask] = -1 + return self._simple_new(codes, dtype=self._dtype) + @classmethod def _concat_same_type(cls, to_concat: Sequence[Self], axis: AxisInt = 0) -> Self: from pandas.core.dtypes.concat import union_categoricals @@ -2512,7 +2582,9 @@ def _concat_same_type(cls, to_concat: Sequence[Self], axis: AxisInt = 0) -> Self result = res_flat.reshape(len(first), -1, order="F") return result - result = union_categoricals(to_concat) + # error: Incompatible types in assignment (expression has type "Categorical", + # variable has type "Self") + result = union_categoricals(to_concat) # type: ignore[assignment] return result # ------------------------------------------------------------------ @@ -2564,7 +2636,7 @@ def describe(self) -> DataFrame: from pandas import Index from pandas.core.reshape.concat import concat - result = concat([counts, freqs], axis=1) + result = concat([counts, freqs], ignore_index=True, axis=1) result.columns = Index(["counts", "freqs"]) result.index.name = "categories" @@ -2600,15 +2672,14 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: Examples -------- - >>> s = pd.Categorical(['lama', 'cow', 'lama', 'beetle', 'lama', - ... 'hippo']) - >>> s.isin(['cow', 'lama']) + >>> s = pd.Categorical(["llama", "cow", "llama", "beetle", "llama", "hippo"]) + >>> s.isin(["cow", "llama"]) array([ True, True, True, False, True, False]) - Passing a single string as ``s.isin('lama')`` will raise an error. Use + Passing a single string as ``s.isin('llama')`` will raise an error. 
Use a list of one element instead: - >>> s.isin(['lama']) + >>> s.isin(["llama"]) array([ True, False, True, False, True, False]) """ null_mask = np.asarray(isna(values)) @@ -2616,75 +2687,40 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: code_values = code_values[null_mask | (code_values >= 0)] return algorithms.isin(self.codes, code_values) - def _replace(self, *, to_replace, value, inplace: bool = False): - from pandas import Index - - orig_dtype = self.dtype - - inplace = validate_bool_kwarg(inplace, "inplace") - cat = self if inplace else self.copy() - - mask = isna(np.asarray(value)) - if mask.any(): - removals = np.asarray(to_replace)[mask] - removals = cat.categories[cat.categories.isin(removals)] - new_cat = cat.remove_categories(removals) - NDArrayBacked.__init__(cat, new_cat.codes, new_cat.dtype) - - ser = cat.categories.to_series() - ser = ser.replace(to_replace=to_replace, value=value) - - all_values = Index(ser) - - # GH51016: maintain order of existing categories - idxr = cat.categories.get_indexer_for(all_values) - locs = np.arange(len(ser)) - locs = np.where(idxr == -1, locs, idxr) - locs = locs.argsort() - - new_categories = ser.take(locs) - new_categories = new_categories.drop_duplicates(keep="first") - new_categories = Index(new_categories) - new_codes = recode_for_categories( - cat._codes, all_values, new_categories, copy=False - ) - new_dtype = CategoricalDtype(new_categories, ordered=self.dtype.ordered) - NDArrayBacked.__init__(cat, new_codes, new_dtype) - - if new_dtype != orig_dtype: - warnings.warn( - # GH#55147 - "The behavior of Series.replace (and DataFrame.replace) with " - "CategoricalDtype is deprecated. In a future version, replace " - "will only be used for cases that preserve the categories. " - "To change the categories, use ser.cat.rename_categories " - "instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - if not inplace: - return cat - # ------------------------------------------------------------------------ # String methods interface def _str_map( - self, f, na_value=np.nan, dtype=np.dtype("object"), convert: bool = True + self, f, na_value=lib.no_default, dtype=np.dtype("object"), convert: bool = True ): # Optimization to apply the callable `f` to the categories once # and rebuild the result by `take`ing from the result with the codes. # Returns the same type as the object-dtype implementation though. - from pandas.core.arrays import NumpyExtensionArray - categories = self.categories codes = self.codes - result = NumpyExtensionArray(categories.to_numpy())._str_map(f, na_value, dtype) + if categories.dtype == "string": + result = categories.array._str_map(f, na_value, dtype) # type: ignore[attr-defined] + if ( + categories.dtype.na_value is np.nan # type: ignore[union-attr] + and is_bool_dtype(dtype) + and (na_value is lib.no_default or isna(na_value)) + ): + # NaN propagates as False for functions with boolean return type + na_value = False + else: + from pandas.core.arrays import NumpyExtensionArray + + result = NumpyExtensionArray(categories.to_numpy())._str_map( + f, na_value, dtype + ) return take_nd(result, codes, fill_value=na_value) - def _str_get_dummies(self, sep: str = "|"): + def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): # sep may not be in categories. Just bail on this. 
from pandas.core.arrays import NumpyExtensionArray - return NumpyExtensionArray(self.astype(str))._str_get_dummies(sep) + return NumpyExtensionArray(self.to_numpy(str, na_value="NaN"))._str_get_dummies( + sep, dtype + ) # ------------------------------------------------------------------------ # GroupBy Methods @@ -2705,7 +2741,7 @@ def _groupby_op( op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na) dtype = self.dtype - if how in ["sum", "prod", "cumsum", "cumprod", "skew"]: + if how in ["sum", "prod", "cumsum", "cumprod", "skew", "kurt"]: raise TypeError(f"{dtype} type does not support {how} operations") if how in ["min", "max", "rank", "idxmin", "idxmax"] and not dtype.ordered: # raise TypeError instead of NotImplementedError to ensure we @@ -2783,6 +2819,12 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin): Parameters ---------- data : Series or CategoricalIndex + The object to which the categorical accessor is attached. + + See Also + -------- + Series.dt : Accessor object for datetimelike properties of the Series values. + Series.sparse : Accessor for sparse matrix data types. Examples -------- @@ -2890,7 +2932,7 @@ def __init__(self, data) -> None: self._freeze() @staticmethod - def _validate(data): + def _validate(data) -> None: if not isinstance(data.dtype, CategoricalDtype): raise AttributeError("Can only use .cat accessor with a 'category' dtype") @@ -2899,14 +2941,20 @@ def _delegate_property_get(self, name: str): # error: Signature of "_delegate_property_set" incompatible with supertype # "PandasDelegate" - def _delegate_property_set(self, name: str, new_values): # type: ignore[override] - return setattr(self._parent, name, new_values) + def _delegate_property_set(self, name: str, new_values) -> None: # type: ignore[override] + setattr(self._parent, name, new_values) @property def codes(self) -> Series: """ Return Series of codes as well as the index. + See Also + -------- + Series.cat.categories : Return the categories of this categorical. + Series.cat.as_ordered : Set the Categorical to be ordered. + Series.cat.as_unordered : Set the Categorical to be unordered. 
+ Examples -------- >>> raw_cate = pd.Categorical(["a", "b", "c", "a"], categories=["a", "b"]) @@ -2966,8 +3014,8 @@ def recode_for_categories( Examples -------- - >>> old_cat = pd.Index(['b', 'a', 'c']) - >>> new_cat = pd.Index(['a', 'b']) + >>> old_cat = pd.Index(["b", "a", "c"]) + >>> new_cat = pd.Index(["a", "b"]) >>> codes = np.array([0, 1, 1, 2]) >>> recode_for_categories(codes, old_cat, new_cat) array([ 1, 0, 0, -1], dtype=int8) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 11a0c7bf18fcb..994d7b1d0081c 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -9,7 +9,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, Union, cast, @@ -20,11 +19,13 @@ import numpy as np +from pandas._config import using_string_dtype +from pandas._config.config import get_option + from pandas._libs import ( algos, lib, ) -from pandas._libs.arrays import NDArrayBacked from pandas._libs.tslibs import ( BaseOffset, IncompatibleFrequency, @@ -65,6 +66,7 @@ ScalarIndexer, Self, SequenceIndexer, + TakeIndexer, TimeAmbiguous, TimeNonexistent, npt, @@ -92,6 +94,7 @@ pandas_dtype, ) from pandas.core.dtypes.dtypes import ( + ArrowDtype, CategoricalDtype, DatetimeTZDtype, ExtensionDtype, @@ -146,6 +149,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Iterator, Sequence, ) @@ -269,9 +273,9 @@ def _unbox_scalar( Examples -------- - >>> arr = pd.array(np.array(['1970-01-01'], 'datetime64[ns]')) + >>> arr = pd.array(np.array(["1970-01-01"], "datetime64[ns]")) >>> arr._unbox_scalar(arr[0]) - numpy.datetime64('1970-01-01T00:00:00.000000000') + np.datetime64('1970-01-01T00:00:00.000000000') """ raise AbstractMethodError(self) @@ -343,29 +347,36 @@ def _format_native_types( """ raise AbstractMethodError(self) - def _formatter(self, boxed: bool = False): + def _formatter(self, boxed: bool = False) -> Callable[[object], str]: # TODO: Remove Datetime & DatetimeTZ formatters. return "'{}'".format # ---------------------------------------------------------------- # Array-Like / EA-Interface Methods - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: # used for Timedelta/DatetimeArray, overwritten by PeriodArray if is_object_dtype(dtype): + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) return np.array(list(self), dtype=object) + + if copy is True: + return np.array(self._ndarray, dtype=dtype) return self._ndarray @overload - def __getitem__(self, item: ScalarIndexer) -> DTScalarOrNaT: - ... + def __getitem__(self, key: ScalarIndexer) -> DTScalarOrNaT: ... @overload def __getitem__( self, - item: SequenceIndexer | PositionalIndexerTuple, - ) -> Self: - ... + key: SequenceIndexer | PositionalIndexerTuple, + ) -> Self: ... 
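Note on the datetimelike ``__array__`` hunk above: ``np.asarray`` still returns the backing ``_ndarray`` without copying, an explicit copy request yields an independent buffer, and ``copy=False`` combined with object dtype now raises. A small illustrative sketch (the ``np.shares_memory`` checks are assumptions about the intended behavior, not taken from this diff):

    import numpy as np
    import pandas as pd

    dta = pd.array(np.array(["1970-01-01"], "datetime64[ns]"))
    view = np.asarray(dta)  # hands back the underlying ndarray, no copy
    assert np.shares_memory(view, np.asarray(dta))
    copied = np.array(dta)  # an independent copy of the data
    assert not np.shares_memory(view, copied)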
def __getitem__(self, key: PositionalIndexer2D) -> Self | DTScalarOrNaT: """ @@ -467,10 +478,16 @@ def astype(self, dtype, copy: bool = True): return self._box_values(self.asi8.ravel()).reshape(self.shape) + elif is_string_dtype(dtype): + if isinstance(dtype, ExtensionDtype): + arr_object = self._format_native_types(na_rep=dtype.na_value) # type: ignore[arg-type] + cls = dtype.construct_array_type() + return cls._from_sequence(arr_object, dtype=dtype, copy=False) + else: + return self._format_native_types() + elif isinstance(dtype, ExtensionDtype): return super().astype(dtype, copy=copy) - elif is_string_dtype(dtype): - return self._format_native_types() elif dtype.kind in "iu": # we deliberately ignore int32 vs. int64 here. # See https://github.com/pandas-dev/pandas/issues/24381 for more. @@ -493,22 +510,17 @@ def astype(self, dtype, copy: bool = True): return np.asarray(self, dtype=dtype) @overload - def view(self) -> Self: - ... + def view(self) -> Self: ... @overload - def view(self, dtype: Literal["M8[ns]"]) -> DatetimeArray: - ... + def view(self, dtype: Literal["M8[ns]"]) -> DatetimeArray: ... @overload - def view(self, dtype: Literal["m8[ns]"]) -> TimedeltaArray: - ... + def view(self, dtype: Literal["m8[ns]"]) -> TimedeltaArray: ... @overload - def view(self, dtype: Dtype | None = ...) -> ArrayLike: - ... + def view(self, dtype: Dtype | None = ...) -> ArrayLike: ... - # pylint: disable-next=useless-parent-delegation def view(self, dtype: Dtype | None = None) -> ArrayLike: # we need to explicitly call super() method as long as the `@overload`s # are present in this file. @@ -523,9 +535,9 @@ def _validate_comparison_value(self, other): try: # GH#18435 strings get a pass from tzawareness compat other = self._scalar_from_string(other) - except (ValueError, IncompatibleFrequency): + except (ValueError, IncompatibleFrequency) as err: # failed to parse as Timestamp/Timedelta/Period - raise InvalidComparison(other) + raise InvalidComparison(other) from err if isinstance(other, self._recognized_scalars) or other is NaT: other = self._scalar_type(other) @@ -648,7 +660,7 @@ def _validation_error_message(self, value, allow_listlike: bool = False) -> str: def _validate_listlike(self, value, allow_object: bool = False): if isinstance(value, type(self)): - if self.dtype.kind in "mM" and not allow_object: + if self.dtype.kind in "mM" and not allow_object and self.unit != value.unit: # type: ignore[attr-defined] # error: "DatetimeLikeArrayMixin" has no attribute "as_unit" value = value.as_unit(self.unit, round_ok=False) # type: ignore[attr-defined] return value @@ -663,11 +675,11 @@ def _validate_listlike(self, value, allow_object: bool = False): if lib.infer_dtype(value) in self._infer_matches: try: value = type(self)._from_sequence(value) - except (ValueError, TypeError): + except (ValueError, TypeError) as err: if allow_object: return value msg = self._validation_error_message(value, True) - raise TypeError(msg) + raise TypeError(msg) from err # Do type inference if necessary up front (after unpacking # NumpyExtensionArray) @@ -731,7 +743,7 @@ def _unbox(self, other) -> np.int64 | np.datetime64 | np.timedelta64 | np.ndarra # pandas assumes they're there. 
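Note on the ``astype`` hunk above: casting a datetime-like array to a string extension dtype now formats directly via ``_format_native_types`` with the target dtype's own NA value instead of detouring through object dtype. A short sketch of the user-visible result (output shown as expected for the nullable ``string`` dtype):

    import pandas as pd

    ser = pd.Series(pd.to_datetime(["2020-01-01", None]))
    print(ser.astype("string"))
    # expected:
    # 0    2020-01-01
    # 1          <NA>
    # dtype: string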
@ravel_compat - def map(self, mapper, na_action=None): + def map(self, mapper, na_action: Literal["ignore"] | None = None): from pandas import Index result = map_array(self, mapper, na_action=na_action) @@ -762,14 +774,6 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: values = ensure_wrapped_if_datetimelike(values) if not isinstance(values, type(self)): - inferable = [ - "timedelta", - "timedelta64", - "datetime", - "datetime64", - "date", - "period", - ] if values.dtype == object: values = lib.maybe_convert_objects( values, # type: ignore[arg-type] @@ -778,38 +782,16 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: ) if values.dtype != object: return self.isin(values) - - inferred = lib.infer_dtype(values, skipna=False) - if inferred not in inferable: - if inferred == "string": - pass - - elif "mixed" in inferred: - return isin(self.astype(object), values) - else: - return np.zeros(self.shape, dtype=bool) - - try: - values = type(self)._from_sequence(values) - except ValueError: - return isin(self.astype(object), values) - else: - warnings.warn( - # GH#53111 - f"The behavior of 'isin' with dtype={self.dtype} and " - "castable values (e.g. strings) is deprecated. In a " - "future version, these will not be considered matching " - "by isin. Explicitly cast to the appropriate dtype before " - "calling isin instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) + else: + # TODO: Deprecate this case + # https://github.com/pandas-dev/pandas/pull/58645/files#r1604055791 + return isin(self.astype(object), values) + return np.zeros(self.shape, dtype=bool) if self.dtype.kind in "mM": self = cast("DatetimeArray | TimedeltaArray", self) - # error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]" - # has no attribute "as_unit" - values = values.as_unit(self.unit) # type: ignore[union-attr] + # error: "DatetimeLikeArrayMixin" has no attribute "as_unit" + values = values.as_unit(self.unit) # type: ignore[attr-defined] try: # error: Argument 1 to "_check_compatible_with" of "DatetimeLikeArrayMixin" @@ -879,6 +861,11 @@ def freqstr(self) -> str | None: """ Return the frequency object as a string if it's set, otherwise None. + See Also + -------- + DatetimeIndex.inferred_freq : Returns a string representing a frequency + generated by infer_freq. + Examples -------- For DatetimeIndex: @@ -889,8 +876,9 @@ def freqstr(self) -> str | None: The frequency can be inferred if there are more than 2 points: - >>> idx = pd.DatetimeIndex(["2018-01-01", "2018-01-03", "2018-01-05"], - ... freq="infer") + >>> idx = pd.DatetimeIndex( + ... ["2018-01-01", "2018-01-03", "2018-01-05"], freq="infer" + ... ) >>> idx.freqstr '2D' @@ -911,6 +899,11 @@ def inferred_freq(self) -> str | None: Returns None if it can't autodetect the frequency. + See Also + -------- + DatetimeIndex.freqstr : Return the frequency object as a string if it's set, + otherwise None. 
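[Editor's note] The `isin` rewrite above drops the GH#53111 deprecation path: castable strings are no longer coerced to the array's dtype before matching. A reading of the new code, not a tested transcript:

    import pandas as pd

    dti = pd.date_range("2020-01-01", periods=3)
    dti.isin(pd.to_datetime(["2020-01-01"]))  # dtype-matched: [True, False, False]
    dti.isin(["2020-01-01"])  # falls through to the object-dtype comparison
                              # (see the TODO branch above), so strings no longer match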
+ Examples -------- For DatetimeIndex: @@ -1208,7 +1201,7 @@ def _add_timedeltalike_scalar(self, other): self, other = self._ensure_matching_resos(other) return self._add_timedeltalike(other) - def _add_timedelta_arraylike(self, other: TimedeltaArray): + def _add_timedelta_arraylike(self, other: TimedeltaArray) -> Self: """ Add a delta of a TimedeltaIndex @@ -1221,30 +1214,28 @@ def _add_timedelta_arraylike(self, other: TimedeltaArray): if len(self) != len(other): raise ValueError("cannot add indices of unequal length") - self = cast("DatetimeArray | TimedeltaArray", self) - - self, other = self._ensure_matching_resos(other) + self, other = cast( + "DatetimeArray | TimedeltaArray", self + )._ensure_matching_resos(other) return self._add_timedeltalike(other) @final - def _add_timedeltalike(self, other: Timedelta | TimedeltaArray): - self = cast("DatetimeArray | TimedeltaArray", self) - + def _add_timedeltalike(self, other: Timedelta | TimedeltaArray) -> Self: other_i8, o_mask = self._get_i8_values_and_mask(other) new_values = add_overflowsafe(self.asi8, np.asarray(other_i8, dtype="i8")) res_values = new_values.view(self._ndarray.dtype) new_freq = self._get_arithmetic_result_freq(other) - # error: Argument "dtype" to "_simple_new" of "DatetimeArray" has - # incompatible type "Union[dtype[datetime64], DatetimeTZDtype, - # dtype[timedelta64]]"; expected "Union[dtype[datetime64], DatetimeTZDtype]" + # error: Unexpected keyword argument "freq" for "_simple_new" of "NDArrayBacked" return type(self)._simple_new( - res_values, dtype=self.dtype, freq=new_freq # type: ignore[arg-type] + res_values, + dtype=self.dtype, + freq=new_freq, # type: ignore[call-arg] ) @final - def _add_nat(self): + def _add_nat(self) -> Self: """ Add pd.NaT to self """ @@ -1252,22 +1243,21 @@ def _add_nat(self): raise TypeError( f"Cannot add {type(self).__name__} and {type(NaT).__name__}" ) - self = cast("TimedeltaArray | DatetimeArray", self) # GH#19124 pd.NaT is treated like a timedelta for both timedelta # and datetime dtypes result = np.empty(self.shape, dtype=np.int64) result.fill(iNaT) result = result.view(self._ndarray.dtype) # preserve reso - # error: Argument "dtype" to "_simple_new" of "DatetimeArray" has - # incompatible type "Union[dtype[timedelta64], dtype[datetime64], - # DatetimeTZDtype]"; expected "Union[dtype[datetime64], DatetimeTZDtype]" + # error: Unexpected keyword argument "freq" for "_simple_new" of "NDArrayBacked" return type(self)._simple_new( - result, dtype=self.dtype, freq=None # type: ignore[arg-type] + result, + dtype=self.dtype, + freq=None, # type: ignore[call-arg] ) @final - def _sub_nat(self): + def _sub_nat(self) -> np.ndarray: """ Subtract pd.NaT from self """ @@ -1312,7 +1302,7 @@ def _sub_periodlike(self, other: Period | PeriodArray) -> npt.NDArray[np.object_ return new_data @final - def _addsub_object_array(self, other: npt.NDArray[np.object_], op): + def _addsub_object_array(self, other: npt.NDArray[np.object_], op) -> np.ndarray: """ Add or subtract array-like of DateOffset objects @@ -1334,12 +1324,13 @@ def _addsub_object_array(self, other: npt.NDArray[np.object_], op): # If both 1D then broadcasting is unambiguous return op(self, other[0]) - warnings.warn( - "Adding/subtracting object-dtype array to " - f"{type(self).__name__} not vectorized.", - PerformanceWarning, - stacklevel=find_stack_level(), - ) + if get_option("performance_warnings"): + warnings.warn( + "Adding/subtracting object-dtype array to " + f"{type(self).__name__} not vectorized.", + PerformanceWarning, + 
stacklevel=find_stack_level(), + ) # Caller is responsible for broadcasting if necessary assert self.shape == other.shape, (self.shape, other.shape) @@ -1363,7 +1354,7 @@ def __add__(self, other): # scalar others if other is NaT: - result = self._add_nat() + result: np.ndarray | DatetimeLikeArrayMixin = self._add_nat() elif isinstance(other, (Tick, timedelta, np.timedelta64)): result = self._add_timedeltalike_scalar(other) elif isinstance(other, BaseOffset): @@ -1409,7 +1400,7 @@ def __add__(self, other): if isinstance(result, np.ndarray) and lib.is_np_dtype(result.dtype, "m"): from pandas.core.arrays import TimedeltaArray - return TimedeltaArray._from_sequence(result) + return TimedeltaArray._from_sequence(result, dtype=result.dtype) return result def __radd__(self, other): @@ -1423,7 +1414,7 @@ def __sub__(self, other): # scalar others if other is NaT: - result = self._sub_nat() + result: np.ndarray | DatetimeLikeArrayMixin = self._sub_nat() elif isinstance(other, (Tick, timedelta, np.timedelta64)): result = self._add_timedeltalike_scalar(-other) elif isinstance(other, BaseOffset): @@ -1469,7 +1460,7 @@ def __sub__(self, other): if isinstance(result, np.ndarray) and lib.is_np_dtype(result.dtype, "m"): from pandas.core.arrays import TimedeltaArray - return TimedeltaArray._from_sequence(result) + return TimedeltaArray._from_sequence(result, dtype=result.dtype) return result def __rsub__(self, other): @@ -1488,7 +1479,7 @@ def __rsub__(self, other): # Avoid down-casting DatetimeIndex from pandas.core.arrays import DatetimeArray - other = DatetimeArray._from_sequence(other) + other = DatetimeArray._from_sequence(other, dtype=other.dtype) return other - self elif self.dtype.kind == "M" and hasattr(other, "dtype") and not other_is_dt64: # GH#19959 datetime - datetime is well-defined as timedelta, @@ -1580,6 +1571,7 @@ def mean(self, *, skipna: bool = True, axis: AxisInt | None = 0): skipna : bool, default True Whether to ignore any NaT elements. axis : int, optional, default 0 + Axis for the function to be applied on. 
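[Editor's note] The warning in `_addsub_object_array` just above is now gated on the `performance_warnings` config flag rather than always firing. A sketch of silencing it (the public option key is assumed to be ``mode.performance_warnings``):

    import numpy as np
    import pandas as pd

    dti = pd.date_range("2020-01-01", periods=2)
    offsets = np.array([pd.offsets.Day(1), pd.offsets.MonthEnd(1)], dtype=object)
    with pd.option_context("mode.performance_warnings", False):
        dti + offsets  # still the slow elementwise path, but no PerformanceWarning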
Returns ------- @@ -1599,7 +1591,7 @@ def mean(self, *, skipna: bool = True, axis: AxisInt | None = 0): -------- For :class:`pandas.DatetimeIndex`: - >>> idx = pd.date_range('2001-01-01 00:00', periods=3) + >>> idx = pd.date_range("2001-01-01 00:00", periods=3) >>> idx DatetimeIndex(['2001-01-01', '2001-01-02', '2001-01-03'], dtype='datetime64[ns]', freq='D') @@ -1608,7 +1600,7 @@ def mean(self, *, skipna: bool = True, axis: AxisInt | None = 0): For :class:`pandas.TimedeltaIndex`: - >>> tdelta_idx = pd.to_timedelta([1, 2, 3], unit='D') + >>> tdelta_idx = pd.to_timedelta([1, 2, 3], unit="D") >>> tdelta_idx TimedeltaIndex(['1 days', '2 days', '3 days'], dtype='timedelta64[ns]', freq=None) @@ -1643,7 +1635,7 @@ def _mode(self, dropna: bool = True): if dropna: mask = self.isna() - i8modes = algorithms.mode(self.view("i8"), mask=mask) + i8modes, _ = algorithms.mode(self.view("i8"), mask=mask) npmodes = i8modes.view(self._ndarray.dtype) npmodes = cast(np.ndarray, npmodes) return self._from_backing_data(npmodes) @@ -1664,32 +1656,28 @@ def _groupby_op( dtype = self.dtype if dtype.kind == "M": # Adding/multiplying datetimes is not valid - if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew"]: - raise TypeError(f"datetime64 type does not support {how} operations") + if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew", "kurt"]: + raise TypeError(f"datetime64 type does not support operation '{how}'") if how in ["any", "all"]: # GH#34479 - warnings.warn( - f"'{how}' with datetime64 dtypes is deprecated and will raise in a " - f"future version. Use (obj != pd.Timestamp(0)).{how}() instead.", - FutureWarning, - stacklevel=find_stack_level(), + raise TypeError( + f"'{how}' with datetime64 dtypes is no longer supported. " + f"Use (obj != pd.Timestamp(0)).{how}() instead." ) elif isinstance(dtype, PeriodDtype): # Adding/multiplying Periods is not valid - if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew"]: + if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew", "kurt"]: raise TypeError(f"Period type does not support {how} operations") if how in ["any", "all"]: # GH#34479 - warnings.warn( - f"'{how}' with PeriodDtype is deprecated and will raise in a " - f"future version. Use (obj != pd.Period(0, freq)).{how}() instead.", - FutureWarning, - stacklevel=find_stack_level(), + raise TypeError( + f"'{how}' with PeriodDtype is no longer supported. " + f"Use (obj != pd.Period(0, freq)).{how}() instead." ) else: # timedeltas we can add but not multiply - if how in ["prod", "cumprod", "skew", "var"]: + if how in ["prod", "cumprod", "skew", "kurt", "var"]: raise TypeError(f"timedelta64 type does not support {how} operations") # All of the functions implemented here are ordinal, so we can @@ -1778,14 +1766,17 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: Examples -------- - >>> rng = pd.date_range(pd.Timestamp("2018-03-10 09:00"), - ... 
periods=3, freq='s') - >>> rng.strftime('%%B %%d, %%Y, %%r') + >>> rng = pd.date_range(pd.Timestamp("2018-03-10 09:00"), periods=3, freq="s") + >>> rng.strftime("%%B %%d, %%Y, %%r") Index(['March 10, 2018, 09:00:00 AM', 'March 10, 2018, 09:00:01 AM', 'March 10, 2018, 09:00:02 AM'], dtype='object') """ result = self._format_native_types(date_format=date_format, na_rep=np.nan) + if using_string_dtype(): + from pandas import StringDtype + + return pd_array(result, dtype=StringDtype(na_value=np.nan)) # type: ignore[return-value] return result.astype(object, copy=False) @@ -1796,7 +1787,7 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: ---------- freq : str or Offset The frequency level to {op} the index to. Must be a fixed - frequency like 'S' (second) not 'ME' (month end). See + frequency like 's' (second) not 'ME' (month end). See :ref:`frequency aliases <timeseries.offset_aliases>` for a list of possible `freq` values. ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' Only relevant for DatetimeIndex: - 'infer' will attempt to infer fall dst-transition hours based on order - bool-ndarray where True signifies a DST time, False designates a non-DST time (note that this flag is only applicable for ambiguous times) - 'NaT' will return NaT where there are ambiguous times - 'raise' will raise an AmbiguousTimeError if there are ambiguous + - 'raise' will raise a ValueError if there are ambiguous times. nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta, default 'raise' A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. - 'shift_forward' will shift the nonexistent time forward to the closest existing time - 'shift_backward' will shift the nonexistent time backward to the closest existing time - 'NaT' will return NaT where there are nonexistent times - timedelta objects will shift nonexistent times by the timedelta - 'raise' will raise an NonExistentTimeError if there are + - 'raise' will raise a ValueError if there are nonexistent times. Returns ------- @@ -1834,6 +1825,11 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: ------ ValueError if the `freq` cannot be converted. + See Also + -------- + DatetimeIndex.floor : Perform floor operation on the data to the specified `freq`. + DatetimeIndex.snap : Snap time stamps to nearest occurring frequency.
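[Editor's note] Per the `using_string_dtype()` branch added to `strftime` above, formatted output becomes a NaN-backed string array once the future string dtype is enabled. A hedged sketch (the flag backing `using_string_dtype()` is assumed to be ``future.infer_string``):

    import pandas as pd

    rng = pd.date_range("2018-03-10 09:00", periods=2, freq="s")
    rng.strftime("%B %d, %Y")  # object dtype by default
    with pd.option_context("future.infer_string", True):
        rng.strftime("%B %d, %Y")  # backed by StringDtype(na_value=np.nan)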
+ Notes ----- If the timestamps have a timezone, {op}ing will take place relative to the @@ -1872,11 +1868,11 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: >>> rng_tz.floor("2h", ambiguous=False) DatetimeIndex(['2021-10-31 02:00:00+01:00'], - dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + dtype='datetime64[s, Europe/Amsterdam]', freq=None) >>> rng_tz.floor("2h", ambiguous=True) DatetimeIndex(['2021-10-31 02:00:00+02:00'], - dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + dtype='datetime64[s, Europe/Amsterdam]', freq=None) """ _floor_example = """>>> rng.floor('h') @@ -1899,11 +1895,11 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: >>> rng_tz.floor("2h", ambiguous=False) DatetimeIndex(['2021-10-31 02:00:00+01:00'], - dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + dtype='datetime64[s, Europe/Amsterdam]', freq=None) >>> rng_tz.floor("2h", ambiguous=True) DatetimeIndex(['2021-10-31 02:00:00+02:00'], - dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + dtype='datetime64[s, Europe/Amsterdam]', freq=None) """ _ceil_example = """>>> rng.ceil('h') @@ -1926,11 +1922,11 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: >>> rng_tz.ceil("h", ambiguous=False) DatetimeIndex(['2021-10-31 02:00:00+01:00'], - dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + dtype='datetime64[s, Europe/Amsterdam]', freq=None) >>> rng_tz.ceil("h", ambiguous=True) DatetimeIndex(['2021-10-31 02:00:00+02:00'], - dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + dtype='datetime64[s, Europe/Amsterdam]', freq=None) """ @@ -1939,100 +1935,6 @@ class TimelikeOps(DatetimeLikeArrayMixin): Common ops for TimedeltaIndex/DatetimeIndex, but not PeriodIndex. """ - _default_dtype: np.dtype - - def __init__( - self, values, dtype=None, freq=lib.no_default, copy: bool = False - ) -> None: - warnings.warn( - # GH#55623 - f"{type(self).__name__}.__init__ is deprecated and will be " - "removed in a future version. Use pd.array instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - if dtype is not None: - dtype = pandas_dtype(dtype) - - values = extract_array(values, extract_numpy=True) - if isinstance(values, IntegerArray): - values = values.to_numpy("int64", na_value=iNaT) - - inferred_freq = getattr(values, "_freq", None) - explicit_none = freq is None - freq = freq if freq is not lib.no_default else None - - if isinstance(values, type(self)): - if explicit_none: - # don't inherit from values - pass - elif freq is None: - freq = values.freq - elif freq and values.freq: - freq = to_offset(freq) - freq = _validate_inferred_freq(freq, values.freq) - - if dtype is not None and dtype != values.dtype: - # TODO: we only have tests for this for DTA, not TDA (2022-07-01) - raise TypeError( - f"dtype={dtype} does not match data dtype {values.dtype}" - ) - - dtype = values.dtype - values = values._ndarray - - elif dtype is None: - if isinstance(values, np.ndarray) and values.dtype.kind in "Mm": - dtype = values.dtype - else: - dtype = self._default_dtype - if isinstance(values, np.ndarray) and values.dtype == "i8": - values = values.view(dtype) - - if not isinstance(values, np.ndarray): - raise ValueError( - f"Unexpected type '{type(values).__name__}'. 'values' must be a " - f"{type(self).__name__}, ndarray, or Series or Index " - "containing one of those." 
- ) - if values.ndim not in [1, 2]: - raise ValueError("Only 1-dimensional input arrays are supported.") - - if values.dtype == "i8": - # for compat with datetime/timedelta/period shared methods, - # we can sometimes get here with int64 values. These represent - # nanosecond UTC (or tz-naive) unix timestamps - if dtype is None: - dtype = self._default_dtype - values = values.view(self._default_dtype) - elif lib.is_np_dtype(dtype, "mM"): - values = values.view(dtype) - elif isinstance(dtype, DatetimeTZDtype): - kind = self._default_dtype.kind - new_dtype = f"{kind}8[{dtype.unit}]" - values = values.view(new_dtype) - - dtype = self._validate_dtype(values, dtype) - - if freq == "infer": - raise ValueError( - f"Frequency inference not allowed in {type(self).__name__}.__init__. " - "Use 'pd.array()' instead." - ) - - if copy: - values = values.copy() - if freq: - freq = to_offset(freq) - if values.dtype.kind == "m" and not isinstance(freq, Tick): - raise TypeError("TimedeltaArray/Index freq must be a Tick") - - NDArrayBacked.__init__(self, values=values, dtype=dtype) - self._freq = freq - - if inferred_freq is None and freq is not None: - type(self)._validate_frequency(self, freq) - @classmethod def _validate_dtype(cls, values, dtype): raise AbstractMethodError(cls) @@ -2041,6 +1943,29 @@ def _validate_dtype(cls, values, dtype): def freq(self): """ Return the frequency object if it is set, otherwise None. + + To learn more about the frequency strings, please see + :ref:`this link<timeseries.offset_aliases>`. + + See Also + -------- + DatetimeIndex.freq : Return the frequency object if it is set, otherwise None. + PeriodIndex.freq : Return the frequency object if it is set, otherwise None. + + Examples + -------- + >>> datetimeindex = pd.date_range( + ... "2022-02-22 02:22:22", periods=10, tz="America/Chicago", freq="h" + ... ) + >>> datetimeindex + DatetimeIndex(['2022-02-22 02:22:22-06:00', '2022-02-22 03:22:22-06:00', + '2022-02-22 04:22:22-06:00', '2022-02-22 05:22:22-06:00', + '2022-02-22 06:22:22-06:00', '2022-02-22 07:22:22-06:00', + '2022-02-22 08:22:22-06:00', '2022-02-22 09:22:22-06:00', + '2022-02-22 10:22:22-06:00', '2022-02-22 11:22:22-06:00'], + dtype='datetime64[ns, America/Chicago]', freq='h') + >>> datetimeindex.freq + <Hour> """ return self._freq @@ -2058,7 +1983,7 @@ def freq(self, value) -> None: self._freq = value @final - def _maybe_pin_freq(self, freq, validate_kwds: dict): + def _maybe_pin_freq(self, freq, validate_kwds: dict) -> None: """ Constructor helper to pin the appropriate `freq` attribute. Assumes that self._freq is currently set to any freq inferred in @@ -2092,7 +2017,7 @@ def _maybe_pin_freq(self, freq, validate_kwds: dict): @final @classmethod - def _validate_frequency(cls, index, freq: BaseOffset, **kwargs): + def _validate_frequency(cls, index, freq: BaseOffset, **kwargs) -> None: """ Validate that a frequency is compatible with the values of a given Datetime Array/Index or Timedelta Array/Index @@ -2148,12 +2073,75 @@ def _creso(self) -> int: @cache_readonly def unit(self) -> str: - # e.g. "ns", "us", "ms" + """ + The precision unit of the datetime data. + + Returns the precision unit for the dtype. + It means the smallest time frame that can be stored within this dtype. + + Returns + ------- + str + Unit string representation (e.g. "ns"). + + See Also + -------- + TimelikeOps.as_unit : Converts to a specific unit.
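[Editor's note] The large deletion above removes the deprecated `TimelikeOps.__init__` (GH#55623), so `DatetimeArray`/`TimedeltaArray` can no longer be instantiated directly. As the removed warning itself advised, construction now goes through `pd.array`:

    import numpy as np
    import pandas as pd

    values = np.array(["2020-01-01", "2020-01-02"], dtype="M8[ns]")
    arr = pd.array(values)  # DatetimeArray; replaces the removed DatetimeArray(values)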
+ + Examples + -------- + >>> idx = pd.DatetimeIndex(["2020-01-02 01:02:03.004005006"]) + >>> idx.unit + 'ns' + >>> idx.as_unit("s").unit + 's' + """ # error: Argument 1 to "dtype_to_unit" has incompatible type # "ExtensionDtype"; expected "Union[DatetimeTZDtype, dtype[Any]]" return dtype_to_unit(self.dtype) # type: ignore[arg-type] def as_unit(self, unit: str, round_ok: bool = True) -> Self: + """ + Convert to a dtype with the given unit resolution. + + The limits of timestamp representation depend on the chosen resolution. + Different resolutions can be converted to each other through as_unit. + + Parameters + ---------- + unit : {'s', 'ms', 'us', 'ns'} + round_ok : bool, default True + If False and the conversion requires rounding, raise ValueError. + + Returns + ------- + same type as self + Converted to the specified unit. + + See Also + -------- + Timestamp.as_unit : Convert to the given unit. + + Examples + -------- + For :class:`pandas.DatetimeIndex`: + + >>> idx = pd.DatetimeIndex(["2020-01-02 01:02:03.004005006"]) + >>> idx + DatetimeIndex(['2020-01-02 01:02:03.004005006'], + dtype='datetime64[ns]', freq=None) + >>> idx.as_unit("s") + DatetimeIndex(['2020-01-02 01:02:03'], dtype='datetime64[s]', freq=None) + + For :class:`pandas.TimedeltaIndex`: + + >>> tdelta_idx = pd.to_timedelta(["1 day 3 min 2 us 42 ns"]) + >>> tdelta_idx + TimedeltaIndex(['1 days 00:03:00.000002042'], + dtype='timedelta64[ns]', freq=None) + >>> tdelta_idx.as_unit("s") + TimedeltaIndex(['1 days 00:03:00'], dtype='timedelta64[s]', freq=None) + """ if unit not in ["s", "ms", "us", "ns"]: raise ValueError("Supported units are 's', 'ms', 'us', 'ns'") @@ -2169,7 +2157,9 @@ def as_unit(self, unit: str, round_ok: bool = True) -> Self: # error: Unexpected keyword argument "freq" for "_simple_new" of # "NDArrayBacked" [call-arg] return type(self)._simple_new( - new_values, dtype=new_dtype, freq=self.freq # type: ignore[call-arg] + new_values, + dtype=new_dtype, + freq=self.freq, # type: ignore[call-arg] ) # TODO: annotate other as DatetimeArray | TimedeltaArray | Timestamp | Timedelta @@ -2249,11 +2239,11 @@ def ceil( # Reductions def any(self, *, axis: AxisInt | None = None, skipna: bool = True) -> bool: - # GH#34479 the nanops call will issue a FutureWarning for non-td64 dtype + # GH#34479 the nanops call will raise a TypeError for non-td64 dtype return nanops.nanany(self._ndarray, axis=axis, skipna=skipna, mask=self.isna()) def all(self, *, axis: AxisInt | None = None, skipna: bool = True) -> bool: - # GH#34479 the nanops call will issue a FutureWarning for non-td64 dtype + # GH#34479 the nanops call will raise a TypeError for non-td64 dtype return nanops.nanall(self._ndarray, axis=axis, skipna=skipna, mask=self.isna()) @@ -2308,11 +2298,12 @@ def factorize( ): if self.freq is not None: # We must be unique, so can short-circuit (and retain freq) - codes = np.arange(len(self), dtype=np.intp) - uniques = self.copy() # TODO: copy or view? if sort and self.freq.n < 0: - codes = codes[::-1] - uniques = uniques[::-1] + codes = np.arange(len(self) - 1, -1, -1, dtype=np.intp) + uniques = self[::-1] + else: + codes = np.arange(len(self), dtype=np.intp) + uniques = self.copy() # TODO: copy or view? 
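[Editor's note] The `factorize` tweak just above reverses via slicing when a negative-`n` freq is sorted, so `uniques` keeps a (reversed, now ascending) freq instead of being a plain reversed copy. A sketch of the intended effect (exact reprs assumed):

    import pandas as pd

    dti = pd.date_range("2020-01-03", periods=3, freq="-1D")  # descending, freq.n < 0
    codes, uniques = dti.factorize(sort=True)
    # codes: array([2, 1, 0]); uniques: the ascending index with its freq retained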
return codes, uniques if sort: @@ -2390,6 +2381,27 @@ def interpolate( return self return type(self)._simple_new(out_data, dtype=self.dtype) + def take( + self, + indices: TakeIndexer, + *, + allow_fill: bool = False, + fill_value: Any = None, + axis: AxisInt = 0, + ) -> Self: + result = super().take( + indices=indices, allow_fill=allow_fill, fill_value=fill_value, axis=axis + ) + + indices = np.asarray(indices, dtype=np.intp) + maybe_slice = lib.maybe_indices_to_slice(indices, len(self)) # type: ignore[arg-type] + + if isinstance(maybe_slice, slice): + freq = self._get_getitem_freq(maybe_slice) + result._freq = freq + + return result + # -------------------------------------------------------------- # Unsorted @@ -2459,23 +2471,21 @@ def ensure_arraylike_for_datetimelike( @overload -def validate_periods(periods: None) -> None: - ... +def validate_periods(periods: None) -> None: ... @overload -def validate_periods(periods: int | float) -> int: - ... +def validate_periods(periods: int) -> int: ... -def validate_periods(periods: int | float | None) -> int | None: +def validate_periods(periods: int | None) -> int | None: """ If a `periods` argument is passed to the Datetime/Timedelta Array/Index constructor, cast it to an integer. Parameters ---------- - periods : None, float, int + periods : None, int Returns ------- @@ -2484,22 +2494,13 @@ Raises ------ TypeError - if periods is None, float, or int + if periods is neither None nor an integer """ - if periods is not None: - if lib.is_float(periods): - warnings.warn( - # GH#56036 - "Non-integer 'periods' in pd.date_range, pd.timedelta_range, " - "pd.period_range, and pd.interval_range are deprecated and " - "will raise in a future version.", - FutureWarning, - stacklevel=find_stack_level(), - ) - periods = int(periods) - elif not lib.is_integer(periods): - raise TypeError(f"periods must be a number, got {periods}") - return periods + if periods is not None and not lib.is_integer(periods): + raise TypeError(f"periods must be an integer, got {periods}") + # error: Incompatible return value type (got "int | integer[Any] | None", + # expected "int | None") + return periods # type: ignore[return-value] def _validate_inferred_freq( @@ -2531,7 +2532,7 @@ return freq -def dtype_to_unit(dtype: DatetimeTZDtype | np.dtype) -> str: +def dtype_to_unit(dtype: DatetimeTZDtype | np.dtype | ArrowDtype) -> str: """ Return the unit str corresponding to the dtype's resolution.
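[Editor's note] `validate_periods` above drops the GH#56036 deprecation path, so a float `periods` is no longer silently truncated with a warning; it now raises immediately:

    import pandas as pd

    pd.date_range("2020-01-01", periods=3)    # OK
    pd.date_range("2020-01-01", periods=3.0)  # TypeError: periods must be an
                                              # integer, got 3.0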
@@ -2546,4 +2547,8 @@ def dtype_to_unit(dtype: DatetimeTZDtype | np.dtype) -> str: """ if isinstance(dtype, DatetimeTZDtype): return dtype.unit + elif isinstance(dtype, ArrowDtype): + if dtype.kind not in "mM": + raise ValueError(f"{dtype=} does not have a resolution.") + return dtype.pyarrow_dtype.unit return np.datetime_data(dtype)[0] diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 6b7ddc4a72957..b31c543188282 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -7,6 +7,7 @@ ) from typing import ( TYPE_CHECKING, + TypeVar, cast, overload, ) @@ -14,6 +15,9 @@ import numpy as np +from pandas._config import using_string_dtype +from pandas._config.config import get_option + from pandas._libs import ( lib, tslib, @@ -39,10 +43,7 @@ tz_convert_from_utc, tzconversion, ) -from pandas._libs.tslibs.dtypes import ( - abbrev_to_npy_unit, - freq_to_period_freqstr, -) +from pandas._libs.tslibs.dtypes import abbrev_to_npy_unit from pandas.errors import PerformanceWarning from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_inclusive @@ -73,7 +74,10 @@ ) if TYPE_CHECKING: - from collections.abc import Iterator + from collections.abc import ( + Generator, + Iterator, + ) from pandas._typing import ( ArrayLike, @@ -86,21 +90,25 @@ npt, ) - from pandas import DataFrame + from pandas import ( + DataFrame, + Timedelta, + ) from pandas.core.arrays import PeriodArray + _TimestampNoneT1 = TypeVar("_TimestampNoneT1", Timestamp, None) + _TimestampNoneT2 = TypeVar("_TimestampNoneT2", Timestamp, None) + _ITER_CHUNKSIZE = 10_000 @overload -def tz_to_dtype(tz: tzinfo, unit: str = ...) -> DatetimeTZDtype: - ... +def tz_to_dtype(tz: tzinfo, unit: str = ...) -> DatetimeTZDtype: ... @overload -def tz_to_dtype(tz: None, unit: str = ...) -> np.dtype[np.datetime64]: - ... +def tz_to_dtype(tz: None, unit: str = ...) -> np.dtype[np.datetime64]: ... def tz_to_dtype( @@ -136,10 +144,14 @@ def f(self): month_kw = 12 if freq: kwds = freq.kwds - month_kw = kwds.get("startingMonth", kwds.get("month", 12)) + month_kw = kwds.get("startingMonth", kwds.get("month", month_kw)) + if freq is not None: + freq_name = freq.name + else: + freq_name = None result = fields.get_start_end_field( - values, field, self.freqstr, month_kw, reso=self._creso + values, field, freq_name, month_kw, reso=self._creso ) else: result = fields.get_date_field(values, field, reso=self._creso) @@ -147,15 +159,8 @@ def f(self): # these return a boolean by-definition return result - if field in self._object_ops: - result = fields.get_date_name_field(values, field, reso=self._creso) - result = self._maybe_mask_results(result, fill_value=None) - - else: - result = fields.get_date_field(values, field, reso=self._creso) - result = self._maybe_mask_results( - result, fill_value=None, convert="float64" - ) + result = fields.get_date_field(values, field, reso=self._creso) + result = self._maybe_mask_results(result, fill_value=None, convert="float64") return result @@ -179,7 +184,7 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): # type: ignore[misc] Parameters ---------- - values : Series, Index, DatetimeArray, ndarray + data : Series, Index, DatetimeArray, ndarray The datetime data. For DatetimeArray `values` (or a Series or Index boxing one), @@ -200,13 +205,22 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): # type: ignore[misc] ------- None + See Also + -------- + DatetimeIndex : Immutable Index for datetime-like data. 
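[Editor's note] The `dtype_to_unit` extension at the top of this chunk reads the resolution off pyarrow-backed temporal dtypes. A sketch against this internal helper (assuming pyarrow is installed):

    import pyarrow as pa
    import pandas as pd
    from pandas.core.arrays.datetimelike import dtype_to_unit

    dtype_to_unit(pd.ArrowDtype(pa.timestamp("us")))  # 'us'
    dtype_to_unit(pd.ArrowDtype(pa.duration("ms")))   # 'ms'
    # non-temporal Arrow dtypes (kind not in "mM") now raise ValueError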
+ Series : One-dimensional labeled array capable of holding datetime-like data. + Timestamp : Pandas replacement for python datetime.datetime object. + to_datetime : Convert argument to datetime. + period_range : Return a fixed frequency PeriodIndex. + Examples -------- >>> pd.arrays.DatetimeArray._from_sequence( - ... pd.DatetimeIndex(['2023-01-01', '2023-01-02'], freq='D')) + ... pd.DatetimeIndex(["2023-01-01", "2023-01-02"], freq="D") + ... ) <DatetimeArray> ['2023-01-01 00:00:00', '2023-01-02 00:00:00'] - Length: 2, dtype: datetime64[ns] + Length: 2, dtype: datetime64[s] """ _typ = "datetimearray" @@ -231,7 +245,6 @@ def _scalar_type(self) -> type[Timestamp]: "is_year_end", "is_leap_year", ] - _object_ops: list[str] = ["freq", "tz"] _field_ops: list[str] = [ "year", "month", @@ -252,7 +265,7 @@ def _scalar_type(self) -> type[Timestamp]: ] _other_ops: list[str] = ["date", "time", "timetz"] _datetimelike_ops: list[str] = ( - _field_ops + _object_ops + _bool_ops + _other_ops + ["unit"] + _field_ops + _bool_ops + _other_ops + ["unit", "freq", "tz"] ) _datetimelike_methods: list[str] = [ "to_period", @@ -279,7 +292,6 @@ def _scalar_type(self) -> type[Timestamp]: _dtype: np.dtype[np.datetime64] | DatetimeTZDtype _freq: BaseOffset | None = None - _default_dtype = DT64NS_DTYPE # used in TimeLikeOps.__init__ @classmethod def _from_scalars(cls, scalars, *, dtype: DtypeObj) -> Self: @@ -319,14 +331,14 @@ def _simple_new( # type: ignore[override] else: # DatetimeTZDtype. If we have e.g. DatetimeTZDtype[us, UTC], # then values.dtype should be M8[us]. - assert dtype._creso == get_unit_from_dtype(values.dtype) + assert dtype._creso == get_unit_from_dtype(values.dtype) # type: ignore[union-attr] result = super()._simple_new(values, dtype) result._freq = freq return result @classmethod - def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False): + def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False) -> Self: return cls._from_sequence_not_strict(scalars, dtype=dtype, copy=copy) @classmethod @@ -530,9 +542,9 @@ def _unbox_scalar(self, value) -> np.datetime64: raise ValueError("'value' should be a Timestamp.") self._check_compatible_with(value) if value is NaT: - return np.datetime64(value._value, self.unit) + return np.datetime64(value._value, self.unit) # type: ignore[call-overload] else: - return value.as_unit(self.unit).asm8 + return value.as_unit(self.unit, round_ok=False).asm8 def _scalar_from_string(self, value) -> Timestamp | NaTType: return Timestamp(value, tz=self.tz) @@ -583,9 +595,16 @@ def tz(self) -> tzinfo | None: Returns ------- - datetime.tzinfo, pytz.tzinfo.BaseTZInfo, dateutil.tz.tz.tzfile, or None + zoneinfo.ZoneInfo, datetime.tzinfo, pytz.tzinfo.BaseTZInfo, dateutil.tz.tz.tzfile, or None Returns None when the array is tz-naive. + See Also + -------- + DatetimeIndex.tz_localize : Localize tz-naive DatetimeIndex to a + given time zone, or remove timezone from a tz-aware DatetimeIndex. + DatetimeIndex.tz_convert : Convert tz-aware DatetimeIndex from + one time zone to another. + Examples -------- For Series: >>> s = pd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) >>> s = pd.to_datetime(s) >>> s 0 2020-01-01 10:00:00+00:00 1 2020-02-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.tz datetime.timezone.utc For DatetimeIndex: - >>> idx = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00", - ... "2/1/2020 11:00:00+00:00"]) + >>> idx = pd.DatetimeIndex( + ... ["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"] + ...
) >>> idx.tz datetime.timezone.utc - """ + """ # noqa: E501 # GH 18595 return getattr(self.dtype, "tz", None) @@ -638,12 +658,12 @@ def _resolution_obj(self) -> Resolution: # ---------------------------------------------------------------- # Array-Like / EA-Interface Methods - def __array__(self, dtype=None) -> np.ndarray: + def __array__(self, dtype=None, copy=None) -> np.ndarray: if dtype is None and self.tz: # The default for tz-aware is object, to preserve tz info dtype = object - return super().__array__(dtype=dtype) + return super().__array__(dtype=dtype, copy=copy) def __iter__(self) -> Iterator: """ @@ -758,17 +778,6 @@ def _format_native_types( # ----------------------------------------------------------------- # Comparison Methods - def _has_same_tz(self, other) -> bool: - # vzone shouldn't be None if value is non-datetime like - if isinstance(other, np.datetime64): - # convert to Timestamp as np.datetime64 doesn't have tz attr - other = Timestamp(other) - - if not hasattr(other, "tzinfo"): - return False - other_tz = other.tzinfo - return timezones.tz_compare(self.tzinfo, other_tz) - def _assert_tzawareness_compat(self, other) -> None: # adapted from _Timestamp._assert_tzawareness_compat other_tz = getattr(other, "tzinfo", None) @@ -804,22 +813,17 @@ def _add_offset(self, offset: BaseOffset) -> Self: try: res_values = offset._apply_array(values._ndarray) if res_values.dtype.kind == "i": - # error: Argument 1 to "view" of "ndarray" has incompatible type - # "dtype[datetime64] | DatetimeTZDtype"; expected - # "dtype[Any] | type[Any] | _SupportsDType[dtype[Any]]" - res_values = res_values.view(values.dtype) # type: ignore[arg-type] + res_values = res_values.view(values.dtype) except NotImplementedError: - warnings.warn( - "Non-vectorized DateOffset being applied to Series or DatetimeIndex.", - PerformanceWarning, - stacklevel=find_stack_level(), - ) + if get_option("performance_warnings"): + warnings.warn( + "Non-vectorized DateOffset being applied to Series or " + "DatetimeIndex.", + PerformanceWarning, + stacklevel=find_stack_level(), + ) res_values = self.astype("O") + offset - # TODO(GH#55564): as_unit will be unnecessary - result = type(self)._from_sequence(res_values).as_unit(self.unit) - if not len(self): - # GH#30336 _from_sequence won't be able to infer self.tz - return result.tz_localize(self.tz) + result = type(self)._from_sequence(res_values, dtype=self.dtype) else: result = type(self)._simple_new(res_values, dtype=res_values.dtype) @@ -853,7 +857,7 @@ def tz_convert(self, tz) -> Self: Parameters ---------- - tz : str, pytz.timezone, dateutil.tz.tzfile, datetime.tzinfo or None + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile, datetime.tzinfo or None Time zone for time. Corresponding timestamps would be converted to this time zone of the Datetime Array/Index. A `tz` of None will convert to UTC and remove the timezone information. Returns ------- Array or Index + Datetime Array/Index with target `tz`. Raises ------ @@ -878,8 +883,9 @@ With the `tz` parameter, we can change the DatetimeIndex to other time zones: - >>> dti = pd.date_range(start='2014-08-01 09:00', - ... freq='h', periods=3, tz='Europe/Berlin') + >>> dti = pd.date_range( + ... start="2014-08-01 09:00", freq="h", periods=3, tz="Europe/Berlin" + ...
) >>> dti DatetimeIndex(['2014-08-01 09:00:00+02:00', @@ -887,7 +893,7 @@ '2014-08-01 11:00:00+02:00'], dtype='datetime64[ns, Europe/Berlin]', freq='h') - >>> dti.tz_convert('US/Central') + >>> dti.tz_convert("US/Central") DatetimeIndex(['2014-08-01 02:00:00-05:00', '2014-08-01 03:00:00-05:00', '2014-08-01 04:00:00-05:00'], dtype='datetime64[ns, US/Central]', freq='h') @@ -896,8 +902,9 @@ With the ``tz=None``, we can remove the timezone (after converting to UTC if necessary): - >>> dti = pd.date_range(start='2014-08-01 09:00', freq='h', - ... periods=3, tz='Europe/Berlin') + >>> dti = pd.date_range( + ... start="2014-08-01 09:00", freq="h", periods=3, tz="Europe/Berlin" + ... ) >>> dti DatetimeIndex(['2014-08-01 09:00:00+02:00', @@ -910,7 +917,7 @@ '2014-08-01 08:00:00', '2014-08-01 09:00:00'], dtype='datetime64[ns]', freq='h') - """ + """ # noqa: E501 tz = timezones.maybe_get_tz(tz) if self.tz is None: @@ -942,7 +949,7 @@ def tz_localize( Parameters ---------- - tz : str, pytz.timezone, dateutil.tz.tzfile, datetime.tzinfo or None + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile, datetime.tzinfo or None Time zone to convert timestamps to. Passing ``None`` will remove the time zone information preserving local time. ambiguous : 'infer', 'NaT', bool array, default 'raise' @@ -959,7 +966,7 @@ non-DST time (note that this flag is only applicable for ambiguous times) - 'NaT' will return NaT where there are ambiguous times - - 'raise' will raise an AmbiguousTimeError if there are ambiguous + - 'raise' will raise a ValueError if there are ambiguous times. nonexistent : 'shift_forward', 'shift_backward, 'NaT', timedelta, \ @@ -973,7 +980,7 @@ closest existing time - 'NaT' will return NaT where there are nonexistent times - timedelta objects will shift nonexistent times by the timedelta - - 'raise' will raise an NonExistentTimeError if there are + - 'raise' will raise a ValueError if there are nonexistent times. Returns @@ -1034,7 +1041,7 @@ 4 2018-10-28 02:30:00+01:00 5 2018-10-28 03:00:00+01:00 6 2018-10-28 03:30:00+01:00 - dtype: datetime64[ns, CET] + dtype: datetime64[s, CET] In some cases, inferring the DST is impossible. In such cases, you can pass an ndarray to the ambiguous parameter to set the DST explicitly @@ -1046,14 +1053,14 @@ 0 2018-10-28 01:20:00+02:00 1 2018-10-28 02:36:00+02:00 2 2018-10-28 03:46:00+01:00 - dtype: datetime64[ns, CET] + dtype: datetime64[s, CET] If the DST transition causes nonexistent times, you can shift these dates forward or backwards with a timedelta object or `'shift_forward'` or `'shift_backwards'`. >>> s = pd.to_datetime(pd.Series(['2015-03-29 02:30:00', - ... '2015-03-29 03:30:00'])) + ... '2015-03-29 03:30:00'], dtype="M8[ns]")) >>> s.dt.tz_localize('Europe/Warsaw', nonexistent='shift_forward') 0 2015-03-29 03:00:00+02:00 1 2015-03-29 03:30:00+02:00 @@ -1068,7 +1075,7 @@ 0 2015-03-29 03:30:00+02:00 1 2015-03-29 03:30:00+02:00 dtype: datetime64[ns, Europe/Warsaw] - """ + """ # noqa: E501 nonexistent_options = ("raise", "NaT", "shift_forward", "shift_backward") if nonexistent not in nonexistent_options and not isinstance( nonexistent, timedelta @@ -1118,10 +1125,16 @@ def to_pydatetime(self) -> npt.NDArray[np.object_]: Returns ------- numpy.ndarray + An ndarray of ``datetime.datetime`` objects.
+ + See Also + -------- + DatetimeIndex.to_julian_date : Converts Datetime Array to float64 ndarray + of Julian Dates. Examples -------- - >>> idx = pd.date_range('2018-02-27', periods=3) + >>> idx = pd.date_range("2018-02-27", periods=3) >>> idx.to_pydatetime() array([datetime.datetime(2018, 2, 27, 0, 0), datetime.datetime(2018, 2, 28, 0, 0), @@ -1154,8 +1167,9 @@ def normalize(self) -> Self: Examples -------- - >>> idx = pd.date_range(start='2014-08-01 10:00', freq='h', - ... periods=3, tz='Asia/Calcutta') + >>> idx = pd.date_range( + ... start="2014-08-01 10:00", freq="h", periods=3, tz="Asia/Calcutta" + ... ) >>> idx DatetimeIndex(['2014-08-01 10:00:00+05:30', '2014-08-01 11:00:00+05:30', @@ -1191,6 +1205,7 @@ def to_period(self, freq=None) -> PeriodArray: Returns ------- PeriodArray/PeriodIndex + Immutable ndarray holding ordinal values at a particular frequency. Raises ------ @@ -1205,10 +1220,16 @@ def to_period(self, freq=None) -> PeriodArray: Examples -------- - >>> df = pd.DataFrame({"y": [1, 2, 3]}, - ... index=pd.to_datetime(["2000-03-31 00:00:00", - ... "2000-05-31 00:00:00", - ... "2000-08-31 00:00:00"])) + >>> df = pd.DataFrame( + ... {"y": [1, 2, 3]}, + ... index=pd.to_datetime( + ... [ + ... "2000-03-31 00:00:00", + ... "2000-05-31 00:00:00", + ... "2000-08-31 00:00:00", + ... ] + ... ), + ... ) >>> df.index.to_period("M") PeriodIndex(['2000-03', '2000-05', '2000-08'], dtype='period[M]') @@ -1232,8 +1253,10 @@ def to_period(self, freq=None) -> PeriodArray: if freq is None: freq = self.freqstr or self.inferred_freq - if isinstance(self.freq, BaseOffset): - freq = freq_to_period_freqstr(self.freq.n, self.freq.name) + if isinstance(self.freq, BaseOffset) and hasattr( + self.freq, "_period_dtype_code" + ): + freq = PeriodDtype(self.freq)._freqstr if freq is None: raise ValueError( @@ -1269,9 +1292,13 @@ def month_name(self, locale=None) -> npt.NDArray[np.object_]: Series or Index Series or Index of month names. + See Also + -------- + DatetimeIndex.day_name : Return the day names with specified locale. + Examples -------- - >>> s = pd.Series(pd.date_range(start='2018-01', freq='ME', periods=3)) + >>> s = pd.Series(pd.date_range(start="2018-01", freq="ME", periods=3)) >>> s 0 2018-01-31 1 2018-02-28 @@ -1283,7 +1310,7 @@ def month_name(self, locale=None) -> npt.NDArray[np.object_]: 2 March dtype: object - >>> idx = pd.date_range(start='2018-01', freq='ME', periods=3) + >>> idx = pd.date_range(start="2018-01", freq="ME", periods=3) >>> idx DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31'], dtype='datetime64[ns]', freq='ME') @@ -1294,11 +1321,11 @@ def month_name(self, locale=None) -> npt.NDArray[np.object_]: for example: ``idx.month_name(locale='pt_BR.utf8')`` will return month names in Brazilian Portuguese language. 
- >>> idx = pd.date_range(start='2018-01', freq='ME', periods=3) + >>> idx = pd.date_range(start="2018-01", freq="ME", periods=3) >>> idx DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31'], dtype='datetime64[ns]', freq='ME') - >>> idx.month_name(locale='pt_BR.utf8') # doctest: +SKIP + >>> idx.month_name(locale="pt_BR.utf8") # doctest: +SKIP Index(['Janeiro', 'Fevereiro', 'Março'], dtype='object') """ values = self._local_timestamps() @@ -1307,6 +1334,13 @@ def month_name(self, locale=None) -> npt.NDArray[np.object_]: values, "month_name", locale=locale, reso=self._creso ) result = self._maybe_mask_results(result, fill_value=None) + if using_string_dtype(): + from pandas import ( + StringDtype, + array as pd_array, + ) + + return pd_array(result, dtype=StringDtype(na_value=np.nan)) # type: ignore[return-value] return result def day_name(self, locale=None) -> npt.NDArray[np.object_]: @@ -1326,9 +1360,13 @@ def day_name(self, locale=None) -> npt.NDArray[np.object_]: Series or Index Series or Index of day names. + See Also + -------- + DatetimeIndex.month_name : Return the month names with specified locale. + Examples -------- - >>> s = pd.Series(pd.date_range(start='2018-01-01', freq='D', periods=3)) + >>> s = pd.Series(pd.date_range(start="2018-01-01", freq="D", periods=3)) >>> s 0 2018-01-01 1 2018-01-02 @@ -1340,7 +1378,7 @@ def day_name(self, locale=None) -> npt.NDArray[np.object_]: 2 Wednesday dtype: object - >>> idx = pd.date_range(start='2018-01-01', freq='D', periods=3) + >>> idx = pd.date_range(start="2018-01-01", freq="D", periods=3) >>> idx DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03'], dtype='datetime64[ns]', freq='D') @@ -1351,11 +1389,11 @@ def day_name(self, locale=None) -> npt.NDArray[np.object_]: for example: ``idx.day_name(locale='pt_BR.utf8')`` will return day names in Brazilian Portuguese language. - >>> idx = pd.date_range(start='2018-01-01', freq='D', periods=3) + >>> idx = pd.date_range(start="2018-01-01", freq="D", periods=3) >>> idx DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03'], dtype='datetime64[ns]', freq='D') - >>> idx.day_name(locale='pt_BR.utf8') # doctest: +SKIP + >>> idx.day_name(locale="pt_BR.utf8") # doctest: +SKIP Index(['Segunda', 'Terça', 'Quarta'], dtype='object') """ values = self._local_timestamps() @@ -1364,6 +1402,14 @@ def day_name(self, locale=None) -> npt.NDArray[np.object_]: values, "day_name", locale=locale, reso=self._creso ) result = self._maybe_mask_results(result, fill_value=None) + if using_string_dtype(): + # TODO: no tests that check for dtype of result as of 2024-08-15 + from pandas import ( + StringDtype, + array as pd_array, + ) + + return pd_array(result, dtype=StringDtype(na_value=np.nan)) # type: ignore[return-value] return result @property @@ -1373,6 +1419,14 @@ def time(self) -> npt.NDArray[np.object_]: The time part of the Timestamps. + See Also + -------- + DatetimeIndex.timetz : Returns numpy array of :class:`datetime.time` + objects with timezones. The time part of the Timestamps. + DatetimeIndex.date : Returns numpy array of python :class:`datetime.date` + objects. Namely, the date part of Timestamps without time and timezone + information. 
+ Examples -------- For Series: @@ -1382,7 +1436,7 @@ def time(self) -> npt.NDArray[np.object_]: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-02-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.time 0 10:00:00 1 11:00:00 @@ -1390,8 +1444,9 @@ def time(self) -> npt.NDArray[np.object_]: For DatetimeIndex: - >>> idx = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00", - ... "2/1/2020 11:00:00+00:00"]) + >>> idx = pd.DatetimeIndex( + ... ["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"] + ... ) >>> idx.time array([datetime.time(10, 0), datetime.time(11, 0)], dtype=object) """ @@ -1409,6 +1464,12 @@ def timetz(self) -> npt.NDArray[np.object_]: The time part of the Timestamps. + See Also + -------- + DatetimeIndex.time : Returns numpy array of :class:`datetime.time` objects. + The time part of the Timestamps. + DatetimeIndex.tz : Return the timezone. + Examples -------- For Series: @@ -1418,7 +1479,7 @@ def timetz(self) -> npt.NDArray[np.object_]: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-02-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.timetz 0 10:00:00+00:00 1 11:00:00+00:00 @@ -1426,8 +1487,9 @@ def timetz(self) -> npt.NDArray[np.object_]: For DatetimeIndex: - >>> idx = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00", - ... "2/1/2020 11:00:00+00:00"]) + >>> idx = pd.DatetimeIndex( + ... ["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"] + ... ) >>> idx.timetz array([datetime.time(10, 0, tzinfo=datetime.timezone.utc), datetime.time(11, 0, tzinfo=datetime.timezone.utc)], dtype=object) @@ -1442,6 +1504,14 @@ def date(self) -> npt.NDArray[np.object_]: Namely, the date part of Timestamps without time and timezone information. + See Also + -------- + DatetimeIndex.time : Returns numpy array of :class:`datetime.time` objects. + The time part of the Timestamps. + DatetimeIndex.year : The year of the datetime. + DatetimeIndex.month : The month as January=1, December=12. + DatetimeIndex.day : The day of the datetime. + Examples -------- For Series: @@ -1451,7 +1521,7 @@ def date(self) -> npt.NDArray[np.object_]: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-02-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.date 0 2020-01-01 1 2020-02-01 @@ -1459,8 +1529,9 @@ def date(self) -> npt.NDArray[np.object_]: For DatetimeIndex: - >>> idx = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00", - ... "2/1/2020 11:00:00+00:00"]) + >>> idx = pd.DatetimeIndex( + ... ["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"] + ... ) >>> idx.date array([datetime.date(2020, 1, 1), datetime.date(2020, 2, 1)], dtype=object) """ @@ -1489,7 +1560,7 @@ def isocalendar(self) -> DataFrame: Examples -------- - >>> idx = pd.date_range(start='2019-12-29', freq='D', periods=4) + >>> idx = pd.date_range(start="2019-12-29", freq="D", periods=4) >>> idx.isocalendar() year week day 2019-12-29 2019 52 7 @@ -1520,6 +1591,11 @@ def isocalendar(self) -> DataFrame: """ The year of the datetime. + See Also + -------- + DatetimeIndex.month: The month as January=1, December=12. + DatetimeIndex.day: The day of the datetime. + Examples -------- >>> datetime_series = pd.Series( @@ -1543,6 +1619,11 @@ def isocalendar(self) -> DataFrame: """ The month as January=1, December=12. + See Also + -------- + DatetimeIndex.year: The year of the datetime. + DatetimeIndex.day: The day of the datetime. + Examples -------- >>> datetime_series = pd.Series( @@ -1566,6 +1647,12 @@ def isocalendar(self) -> DataFrame: """ The day of the datetime. 
+ See Also + -------- + DatetimeIndex.year: The year of the datetime. + DatetimeIndex.month: The month as January=1, December=12. + DatetimeIndex.hour: The hours of the datetime. + Examples -------- >>> datetime_series = pd.Series( @@ -1589,6 +1676,12 @@ def isocalendar(self) -> DataFrame: """ The hours of the datetime. + See Also + -------- + DatetimeIndex.day: The day of the datetime. + DatetimeIndex.minute: The minutes of the datetime. + DatetimeIndex.second: The seconds of the datetime. + Examples -------- >>> datetime_series = pd.Series( @@ -1612,6 +1705,11 @@ def isocalendar(self) -> DataFrame: """ The minutes of the datetime. + See Also + -------- + DatetimeIndex.hour: The hours of the datetime. + DatetimeIndex.second: The seconds of the datetime. + Examples -------- >>> datetime_series = pd.Series( @@ -1635,6 +1733,12 @@ def isocalendar(self) -> DataFrame: """ The seconds of the datetime. + See Also + -------- + DatetimeIndex.minute: The minutes of the datetime. + DatetimeIndex.microsecond: The microseconds of the datetime. + DatetimeIndex.nanosecond: The nanoseconds of the datetime. + Examples -------- >>> datetime_series = pd.Series( @@ -1658,6 +1762,11 @@ def isocalendar(self) -> DataFrame: """ The microseconds of the datetime. + See Also + -------- + DatetimeIndex.second: The seconds of the datetime. + DatetimeIndex.nanosecond: The nanoseconds of the datetime. + Examples -------- >>> datetime_series = pd.Series( @@ -1681,6 +1790,11 @@ def isocalendar(self) -> DataFrame: """ The nanoseconds of the datetime. + See Also + -------- + DatetimeIndex.second: The seconds of the datetime. + DatetimeIndex.microsecond: The microseconds of the datetime. + Examples -------- >>> datetime_series = pd.Series( @@ -1742,6 +1856,11 @@ def isocalendar(self) -> DataFrame: """ The ordinal day of the year. + See Also + -------- + DatetimeIndex.dayofweek : The day of the week with Monday=0, Sunday=6. + DatetimeIndex.day : The day of the datetime. + Examples -------- For Series: @@ -1751,7 +1870,7 @@ def isocalendar(self) -> DataFrame: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-02-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.dayofyear 0 1 1 32 @@ -1772,6 +1891,12 @@ def isocalendar(self) -> DataFrame: """ The quarter of the date. + See Also + -------- + DatetimeIndex.snap : Snap time stamps to nearest occurring frequency. + DatetimeIndex.time : Returns numpy array of datetime.time objects. + The time part of the Timestamps. + Examples -------- For Series: @@ -1781,7 +1906,7 @@ def isocalendar(self) -> DataFrame: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-04-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.quarter 0 1 1 2 @@ -1801,6 +1926,15 @@ def isocalendar(self) -> DataFrame: """ The number of days in the month. + See Also + -------- + Series.dt.day : Return the day of the month. + Series.dt.is_month_end : Return a boolean indicating if the + date is the last day of the month. + Series.dt.is_month_start : Return a boolean indicating if the + date is the first day of the month. + Series.dt.month : Return the month as January=1 through December=12. 
+ Examples -------- >>> s = pd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) @@ -1808,7 +1942,7 @@ def isocalendar(self) -> DataFrame: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-02-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.daysinmonth 0 31 1 29 @@ -1994,6 +2128,32 @@ def isocalendar(self) -> DataFrame: >>> idx.is_year_start array([False, False, True]) + + This method, when applied to Series with datetime values under + the ``.dt`` accessor, will lose information about Business offsets. + + >>> dates = pd.Series(pd.date_range("2020-10-30", periods=4, freq="BYS")) + >>> dates + 0 2021-01-01 + 1 2022-01-03 + 2 2023-01-02 + 3 2024-01-01 + dtype: datetime64[ns] + + >>> dates.dt.is_year_start + 0 True + 1 False + 2 False + 3 True + dtype: bool + + >>> idx = pd.date_range("2020-10-30", periods=4, freq="BYS") + >>> idx + DatetimeIndex(['2021-01-01', '2022-01-03', '2023-01-02', '2024-01-01'], + dtype='datetime64[ns]', freq='BYS-JAN') + + >>> idx.is_year_start + array([ True, True, True, True]) """, ) is_year_end = _field_accessor( @@ -2056,6 +2216,13 @@ def isocalendar(self) -> DataFrame: Series or ndarray Booleans indicating if dates belong to a leap year. + See Also + -------- + DatetimeIndex.is_year_end : Indicate whether the date is the + last day of the year. + DatetimeIndex.is_year_start : Indicate whether the date is the first + day of a year. + Examples -------- This method is available on Series with datetime values under @@ -2117,6 +2284,19 @@ def to_julian_date(self) -> npt.NDArray[np.float64]: # ----------------------------------------------------------------- # Reductions + def _reduce( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): + result = super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs) + if keepdims and isinstance(result, np.ndarray): + if name == "std": + from pandas.core.arrays import TimedeltaArray + + return TimedeltaArray._from_sequence(result) + else: + return self._from_sequence(result, dtype=self.dtype) + return result + def std( self, axis=None, @@ -2125,7 +2305,7 @@ def std( ddof: int = 1, keepdims: bool = False, skipna: bool = True, - ): + ) -> Timedelta: """ Return sample standard deviation over requested axis. @@ -2136,9 +2316,25 @@ def std( axis : int, optional Axis for the function to be applied on. For :class:`pandas.Series` this parameter is unused and defaults to ``None``. + dtype : dtype, optional, default None + Type to use in computing the standard deviation. For arrays of + integer type the default is float64, for arrays of float types + it is the same as the array type. + out : ndarray, optional, default None + Alternative output array in which to place the result. It must have + the same shape as the expected output but the type (of the + calculated values) will be cast if necessary. ddof : int, default 1 Degrees of Freedom. The divisor used in calculations is `N - ddof`, where `N` represents the number of elements. + keepdims : bool, optional + If this is set to True, the axes which are reduced are left in the + result as dimensions with size one. With this option, the result + will broadcast correctly against the input array. If the default + value is passed, then keepdims will not be passed through to the + std method of sub-classes of ndarray, however any non-default value + will be. If the sub-class method does not implement keepdims any + exceptions will be raised. skipna : bool, default True Exclude NA/null values. 
If an entire row/column is ``NA``, the result will be ``NA``. @@ -2146,6 +2342,7 @@ def std( Returns ------- Timedelta + Standard deviation over requested axis. See Also -------- @@ -2157,7 +2354,7 @@ def std( -------- For :class:`pandas.DatetimeIndex`: - >>> idx = pd.date_range('2001-01-01 00:00', periods=3) + >>> idx = pd.date_range("2001-01-01 00:00", periods=3) >>> idx DatetimeIndex(['2001-01-01', '2001-01-02', '2001-01-03'], dtype='datetime64[ns]', freq='D') @@ -2191,7 +2388,7 @@ def _sequence_to_dt64( yearfirst: bool = False, ambiguous: TimeAmbiguous = "raise", out_unit: str | None = None, -): +) -> tuple[np.ndarray, tzinfo | None]: """ Parameters ---------- @@ -2223,9 +2420,9 @@ def _sequence_to_dt64( data, copy = maybe_convert_dtype(data, copy, tz=tz) data_dtype = getattr(data, "dtype", None) - if out_unit is None: - out_unit = "ns" - out_dtype = np.dtype(f"M8[{out_unit}]") + out_dtype = DT64NS_DTYPE + if out_unit is not None: + out_dtype = np.dtype(f"M8[{out_unit}]") if data_dtype == object or is_string_dtype(data_dtype): # TODO: We do not have tests specific to string-dtypes, @@ -2251,7 +2448,7 @@ def _sequence_to_dt64( dayfirst=dayfirst, yearfirst=yearfirst, allow_object=False, - out_unit=out_unit or "ns", + out_unit=out_unit, ) copy = False if tz and inferred_tz: @@ -2359,8 +2556,8 @@ def objects_to_datetime64( utc: bool = False, errors: DateTimeErrorChoices = "raise", allow_object: bool = False, - out_unit: str = "ns", -): + out_unit: str | None = None, +) -> tuple[np.ndarray, tzinfo | None]: """ Convert data to array of timestamps. @@ -2371,11 +2568,12 @@ def objects_to_datetime64( yearfirst : bool utc : bool, default False Whether to convert/localize timestamps to UTC. - errors : {'raise', 'ignore', 'coerce'} + errors : {'raise', 'coerce'} allow_object : bool Whether to return an object-dtype ndarray instead of raising if the data contains more than one timezone. - out_unit : str, default "ns" + out_unit : str or None, default None + None indicates we should do resolution inference. Returns ------- @@ -2391,10 +2589,10 @@ def objects_to_datetime64( ValueError : if data cannot be converted to datetimes TypeError : When a type cannot be converted to datetime """ - assert errors in ["raise", "ignore", "coerce"] + assert errors in ["raise", "coerce"] # if str-dtype, convert - data = np.array(data, copy=False, dtype=np.object_) + data = np.asarray(data, dtype=np.object_) result, tz_parsed = tslib.array_to_datetime( data, @@ -2506,8 +2704,7 @@ def _maybe_infer_tz(tz: tzinfo | None, inferred_tz: tzinfo | None) -> tzinfo | N pass elif not timezones.tz_compare(tz, inferred_tz): raise TypeError( - f"data is already tz-aware {inferred_tz}, unable to " - f"set specified tz: {tz}" + f"data is already tz-aware {inferred_tz}, unable to set specified tz: {tz}" ) return tz @@ -2665,8 +2862,8 @@ def _infer_tz_from_endpoints( def _maybe_normalize_endpoints( - start: Timestamp | None, end: Timestamp | None, normalize: bool -): + start: _TimestampNoneT1, end: _TimestampNoneT2, normalize: bool +) -> tuple[_TimestampNoneT1, _TimestampNoneT2]: if normalize: if start is not None: start = start.normalize() @@ -2717,7 +2914,7 @@ def _generate_range( offset: BaseOffset, *, unit: str, -): +) -> Generator[Timestamp]: """ Generates a sequence of dates corresponding to the specified time offset. 
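[Editor's note] The new `DatetimeArray._reduce` override earlier in this datetimes.py hunk re-wraps `keepdims` results in an array type: `std` comes back as a `TimedeltaArray`, other reductions as a `DatetimeArray`. An internal-API sketch, not a public contract:

    import pandas as pd

    arr = pd.array(pd.date_range("2001-01-01", periods=3))  # DatetimeArray
    arr._reduce("mean", keepdims=True)  # length-1 DatetimeArray
    arr._reduce("std", keepdims=True)   # length-1 TimedeltaArray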
Similar to dateutil.rrule except uses pandas DateOffset @@ -2763,12 +2960,12 @@ def _generate_range( if start and not offset.is_on_offset(start): # Incompatible types in assignment (expression has type "datetime", # variable has type "Optional[Timestamp]") - start = offset.rollforward(start) # type: ignore[assignment] - elif end and not offset.is_on_offset(end): - # Incompatible types in assignment (expression has type "datetime", - # variable has type "Optional[Timestamp]") - end = offset.rollback(end) # type: ignore[assignment] + # GH #56147 account for negative direction and range bounds + if offset.n >= 0: + start = offset.rollforward(start) # type: ignore[assignment] + else: + start = offset.rollback(start) # type: ignore[assignment] # Unsupported operand types for < ("Timestamp" and "None") if periods is None and end < start and offset.n >= 0: # type: ignore[operator] diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index 74b8cfb65cbc7..67c23f4825a7f 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -23,6 +23,8 @@ class FloatingDtype(NumericDtype): The attributes name & type are set when these subclasses are created. """ + # The value used to fill '_data' to avoid upcasting + _internal_fill_value = np.nan _default_np_dtype = np.dtype(np.float64) _checker = is_float_dtype @@ -94,6 +96,14 @@ class FloatingArray(NumericArray): ------- FloatingArray + See Also + -------- + array : Create an array. + Float32Dtype : Float32 dtype for FloatingArray. + Float64Dtype : Float64 dtype for FloatingArray. + Series : One-dimensional labeled array capable of holding data. + DataFrame : Two-dimensional, size-mutable, potentially heterogeneous tabular data. + Examples -------- Create an FloatingArray with :func:`pandas.array`: @@ -113,14 +123,6 @@ class FloatingArray(NumericArray): _dtype_cls = FloatingDtype - # The value used to fill '_data' to avoid upcasting - _internal_fill_value = np.nan - # Fill values used for any/all - # Incompatible types in assignment (expression has type "float", base class - # "BaseMaskedArray" defined the type as "") - _truthy_value = 1.0 # type: ignore[assignment] - _falsey_value = 0.0 # type: ignore[assignment] - _dtype_docstring = """ An ExtensionDtype for {dtype} data. @@ -135,6 +137,12 @@ class FloatingArray(NumericArray): ------- None +See Also +-------- +CategoricalDtype : Type for categorical data with the categories and orderedness. +IntegerDtype : An ExtensionDtype to hold a single size & kind of integer dtype. +StringDtype : An ExtensionDtype for string data. + Examples -------- For Float32Dtype: diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index f9384e25ba9d9..afbadd754cdbc 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -23,6 +23,8 @@ class IntegerDtype(NumericDtype): The attributes name & type are set when these subclasses are created. """ + # The value used to fill '_data' to avoid upcasting + _internal_fill_value = 1 _default_np_dtype = np.dtype(np.int64) _checker = is_integer_dtype @@ -103,6 +105,12 @@ class IntegerArray(NumericArray): ------- IntegerArray + See Also + -------- + array : Create an array using the appropriate dtype, including ``IntegerArray``. + Int32Dtype : An ExtensionDtype for int32 integer data. + UInt16Dtype : An ExtensionDtype for uint16 integer data. + Examples -------- Create an IntegerArray with :func:`pandas.array`. 
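A minimal sketch of the rollforward/rollback distinction that the _generate_range fix above (GH #56147) builds on: with a negative-n offset the range is walked backwards, so the start bound must be snapped backwards onto the offset rather than forwards. Only public pandas offset APIs are assumed here.

import pandas as pd

# 2024-01-06 is a Saturday, so it does not lie "on" a business-day offset.
ts = pd.Timestamp("2024-01-06")
bday = pd.offsets.BDay()

print(bday.rollforward(ts))  # 2024-01-08 -- snaps forward to the next business day
print(bday.rollback(ts))     # 2024-01-05 -- snaps back to the previous business day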
@@ -115,12 +123,12 @@ class IntegerArray(NumericArray): String aliases for the dtypes are also available. They are capitalized. - >>> pd.array([1, None, 3], dtype='Int32') + >>> pd.array([1, None, 3], dtype="Int32") <IntegerArray> [1, <NA>, 3] Length: 3, dtype: Int32 - >>> pd.array([1, None, 3], dtype='UInt16') + >>> pd.array([1, None, 3], dtype="UInt16") <IntegerArray> [1, <NA>, 3] Length: 3, dtype: UInt16 @@ -128,14 +136,6 @@ class IntegerArray(NumericArray): _dtype_cls = IntegerDtype - # The value used to fill '_data' to avoid upcasting - _internal_fill_value = 1 - # Fill values used for any/all - # Incompatible types in assignment (expression has type "int", base class - # "BaseMaskedArray" defined the type as "") - _truthy_value = 1 # type: ignore[assignment] - _falsey_value = 0 # type: ignore[assignment] - _dtype_docstring = """ An ExtensionDtype for {dtype} integer data. @@ -150,6 +150,13 @@ class IntegerArray(NumericArray): ------- None +See Also +-------- +Int8Dtype : 8-bit nullable integer type. +Int16Dtype : 16-bit nullable integer type. +Int32Dtype : 32-bit nullable integer type. +Int64Dtype : 64-bit nullable integer type. + Examples -------- For Int8Dtype: diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index a19b304529383..6cb79e915c78b 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -12,7 +12,6 @@ Union, overload, ) -import warnings import numpy as np @@ -28,7 +27,6 @@ ArrayLike, AxisInt, Dtype, - FillnaOptions, IntervalClosedType, NpDtype, PositionalIndexer, @@ -79,6 +77,7 @@ unique, value_counts_internal as value_counts, ) +from pandas.core.arrays import ArrowExtensionArray from pandas.core.arrays.base import ( ExtensionArray, _extension_array_shared_docs, @@ -99,6 +98,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Iterator, Sequence, ) @@ -121,9 +121,7 @@ } -_interval_shared_docs[ - "class" -] = """ +_interval_shared_docs["class"] = """ %(summary)s Parameters ---------- @@ -232,7 +230,7 @@ def __new__( dtype: Dtype | None = None, copy: bool = False, verify_integrity: bool = True, - ): + ) -> Self: data = extract_array(data, extract_numpy=True) if isinstance(data, cls): @@ -370,11 +368,18 @@ def _ensure_simple_new_inputs( right = ensure_wrapped_if_datetimelike(right) right = extract_array(right, extract_numpy=True) - lbase = getattr(left, "_ndarray", left).base - rbase = getattr(right, "_ndarray", right).base - if lbase is not None and lbase is rbase: - # If these share data, then setitem could corrupt our IA - right = right.copy() + if isinstance(left, ArrowExtensionArray) or isinstance( + right, ArrowExtensionArray + ): + pass + else: + lbase = getattr(left, "_ndarray", left) + lbase = getattr(lbase, "_data", lbase).base + rbase = getattr(right, "_ndarray", right) + rbase = getattr(rbase, "_data", rbase).base + if lbase is not None and lbase is rbase: + # If these share data, then setitem could corrupt our IA + right = right.copy() dtype = IntervalDtype(left.dtype, closed=closed) @@ -699,12 +704,10 @@ def __len__(self) -> int: return len(self._left) @overload - def __getitem__(self, key: ScalarIndexer) -> IntervalOrNA: - ... + def __getitem__(self, key: ScalarIndexer) -> IntervalOrNA: ... @overload - def __getitem__(self, key: SequenceIndexer) -> Self: - ... + def __getitem__(self, key: SequenceIndexer) -> Self: ...
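The reworked _ensure_simple_new_inputs hunk above guards against left and right aliasing one buffer (Arrow-backed arrays are exempted, as they carry no ndarray ``.base``). A rough illustration of the ``.base`` check with plain NumPy -- an assumption-level sketch, not the pandas code path itself:

import numpy as np

breaks = np.arange(4)
left, right = breaks[:-1], breaks[1:]  # two views carved from one buffer

# Both views report the original array as their .base, which is the condition
# the constructor checks; without a defensive copy, writing through `right`
# could silently corrupt `left`.
assert left.base is breaks and right.base is breaks
right = right.copy()  # the defensive copy the constructor takes
assert right.base is None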
def __getitem__(self, key: PositionalIndexer) -> Self | IntervalOrNA: key = check_array_indexer(self, key) @@ -889,16 +892,7 @@ def max(self, *, axis: AxisInt | None = None, skipna: bool = True) -> IntervalOr indexer = obj.argsort()[-1] return obj[indexer] - def _pad_or_backfill( # pylint: disable=useless-parent-delegation - self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True - ) -> Self: - # TODO(3.0): after EA.fillna 'method' deprecation is enforced, we can remove - # this method entirely. - return super()._pad_or_backfill(method=method, limit=limit, copy=copy) - - def fillna( - self, value=None, method=None, limit: int | None = None, copy: bool = True - ) -> Self: + def fillna(self, value, limit: int | None = None, copy: bool = True) -> Self: """ Fill NA/NaN values using the specified method. @@ -909,17 +903,9 @@ def fillna( Alternatively, a Series or dict can be used to fill in different values for each index. The value should not be a list. The value(s) passed should be either Interval objects or NA/NaN. - method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None - (Not implemented yet for IntervalArray) - Method to use for filling holes in reindexed Series limit : int, default None (Not implemented yet for IntervalArray) - If method is specified, this is the maximum number of consecutive - NaN values to forward/backward fill. In other words, if there is - a gap with more than this number of consecutive NaNs, it will only - be partially filled. If method is not specified, this is the - maximum number of entries along the entire axis where NaNs will be - filled. + The maximum number of entries where NA values will be filled. copy : bool, default True Whether to make a copy of the data before filling. If False, then the original should be modified and no new memory should be allocated. @@ -932,8 +918,8 @@ def fillna( """ if copy is False: raise NotImplementedError - if method is not None: - return super().fillna(value=value, method=method, limit=limit) + if limit is not None: + raise ValueError("limit must be None") value_left, value_right = self._validate_scalar(value) @@ -1069,7 +1055,9 @@ def shift(self, periods: int = 1, fill_value: object = None) -> IntervalArray: from pandas import Index fill_value = Index(self._left, copy=False)._na_value - empty = IntervalArray.from_breaks([fill_value] * (empty_len + 1)) + empty = IntervalArray.from_breaks( + [fill_value] * (empty_len + 1), closed=self.closed + ) else: empty = self._from_sequence([fill_value] * empty_len, dtype=self.dtype) @@ -1227,21 +1215,14 @@ def value_counts(self, dropna: bool = True) -> Series: Series.value_counts """ # TODO: implement this is a non-naive way! - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - "The behavior of value_counts with object-dtype is deprecated", - category=FutureWarning, - ) - result = value_counts(np.asarray(self), dropna=dropna) - # Once the deprecation is enforced, we will need to do - # `result.index = result.index.astype(self.dtype)` + result = value_counts(np.asarray(self), dropna=dropna) + result.index = result.index.astype(self.dtype) return result # --------------------------------------------------------------------- # Rendering Methods - def _formatter(self, boxed: bool = False): + def _formatter(self, boxed: bool = False) -> Callable[[object], str]: # returning 'str' here causes us to render as e.g. 
"(0, 1]" instead of # "Interval(0, 1, closed='right')" return str @@ -1254,6 +1235,22 @@ def left(self) -> Index: """ Return the left endpoints of each Interval in the IntervalArray as an Index. + This property provides access to the left endpoints of the intervals + contained within the IntervalArray. This can be useful for analyses where + the starting point of each interval is of interest, such as in histogram + creation, data aggregation, or any scenario requiring the identification + of the beginning of defined ranges. This property returns a ``pandas.Index`` + object containing the midpoint for each interval. + + See Also + -------- + arrays.IntervalArray.right : Return the right endpoints of each Interval in + the IntervalArray as an Index. + arrays.IntervalArray.mid : Return the midpoint of each Interval in the + IntervalArray as an Index. + arrays.IntervalArray.contains : Check elementwise if the Intervals contain + the value. + Examples -------- @@ -1274,6 +1271,21 @@ def right(self) -> Index: """ Return the right endpoints of each Interval in the IntervalArray as an Index. + This property extracts the right endpoints from each interval contained within + the IntervalArray. This can be helpful in use cases where you need to work + with or compare only the upper bounds of intervals, such as when performing + range-based filtering, determining interval overlaps, or visualizing the end + boundaries of data segments. + + See Also + -------- + arrays.IntervalArray.left : Return the left endpoints of each Interval in + the IntervalArray as an Index. + arrays.IntervalArray.mid : Return the midpoint of each Interval in the + IntervalArray as an Index. + arrays.IntervalArray.contains : Check elementwise if the Intervals contain + the value. + Examples -------- @@ -1294,6 +1306,20 @@ def length(self) -> Index: """ Return an Index with entries denoting the length of each Interval. + The length of an interval is calculated as the difference between + its `right` and `left` bounds. This property is particularly useful + when working with intervals where the size of the interval is an important + attribute, such as in time-series analysis or spatial data analysis. + + See Also + -------- + arrays.IntervalArray.left : Return the left endpoints of each Interval in + the IntervalArray as an Index. + arrays.IntervalArray.right : Return the right endpoints of each Interval in + the IntervalArray as an Index. + arrays.IntervalArray.mid : Return the midpoint of each Interval in the + IntervalArray as an Index. + Examples -------- @@ -1312,6 +1338,16 @@ def mid(self) -> Index: """ Return the midpoint of each Interval in the IntervalArray as an Index. + The midpoint of an interval is calculated as the average of its + ``left`` and ``right`` bounds. This property returns a ``pandas.Index`` object + containing the midpoint for each interval. + + See Also + -------- + Interval.left : Return left bound for the interval. + Interval.right : Return right bound for the interval. + Interval.length : Return the length of each interval. + Examples -------- @@ -1410,6 +1446,12 @@ def closed(self) -> IntervalClosedType: Either ``left``, ``right``, ``both`` or ``neither``. + See Also + -------- + IntervalArray.closed : Returns inclusive side of the IntervalArray. + Interval.closed : Returns inclusive side of the Interval. + IntervalIndex.closed : Returns inclusive side of the IntervalIndex. 
+ Examples -------- @@ -1451,12 +1493,26 @@ def closed(self) -> IntervalClosedType: """ ) - @Appender( - _interval_shared_docs["set_closed"] - % { - "klass": "IntervalArray", - "examples": textwrap.dedent( - """\ + def set_closed(self, closed: IntervalClosedType) -> Self: + """ + Return an identical IntervalArray closed on the specified side. + + Parameters + ---------- + closed : {'left', 'right', 'both', 'neither'} + Whether the intervals are closed on the left-side, right-side, both + or neither. + + Returns + ------- + IntervalArray + A new IntervalArray with the specified side closures. + + See Also + -------- + IntervalArray.closed : Returns inclusive side of the Interval. + arrays.IntervalArray.closed : Returns inclusive side of the IntervalArray. + Examples -------- >>> index = pd.arrays.IntervalArray.from_breaks(range(4)) >>> index <IntervalArray> [(0, 1], (1, 2], (2, 3]] Length: 3, dtype: interval[int64, right] - >>> index.set_closed('both') + >>> index.set_closed("both") <IntervalArray> [[0, 1], [1, 2], [2, 3]] Length: 3, dtype: interval[int64, both] """ - ), - } - ) - def set_closed(self, closed: IntervalClosedType) -> Self: if closed not in VALID_CLOSED: msg = f"invalid option for 'closed': {closed}" raise ValueError(msg) @@ -1481,9 +1533,7 @@ def set_closed(self, closed: IntervalClosedType) -> Self: dtype = IntervalDtype(left.dtype, closed=closed) return self._simple_new(left, right, dtype=dtype) - _interval_shared_docs[ - "is_non_overlapping_monotonic" - ] = """ + _interval_shared_docs["is_non_overlapping_monotonic"] = """ Return a boolean whether the %(klass)s is non-overlapping and monotonic. Non-overlapping means (no Intervals share points), and monotonic means either monotonic increasing or monotonic decreasing. @@ -1526,10 +1576,54 @@ def set_closed(self, closed: IntervalClosedType) -> Self: """ @property - @Appender( - _interval_shared_docs["is_non_overlapping_monotonic"] % _shared_docs_kwargs - ) def is_non_overlapping_monotonic(self) -> bool: + """ + Return a boolean whether the IntervalArray/IntervalIndex\ + is non-overlapping and monotonic. + + Non-overlapping means (no Intervals share points), and monotonic means + either monotonic increasing or monotonic decreasing. + + See Also + -------- + overlaps : Check if two IntervalIndex objects overlap. + + Examples + -------- + For arrays: + + >>> interv_arr = pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]) + >>> interv_arr + <IntervalArray> + [(0, 1], (1, 5]] + Length: 2, dtype: interval[int64, right] + >>> interv_arr.is_non_overlapping_monotonic + True + + >>> interv_arr = pd.arrays.IntervalArray( + ... [pd.Interval(0, 1), pd.Interval(-1, 0.1)] + ... ) + >>> interv_arr + <IntervalArray> + [(0.0, 1.0], (-1.0, 0.1]] + Length: 2, dtype: interval[float64, right] + >>> interv_arr.is_non_overlapping_monotonic + False + + For Interval Index: + + >>> interv_idx = pd.interval_range(start=0, end=2) + >>> interv_idx + IntervalIndex([(0, 1], (1, 2]], dtype='interval[int64, right]') + >>> interv_idx.is_non_overlapping_monotonic + True + + >>> interv_idx = pd.interval_range(start=0, end=2, closed="both") + >>> interv_idx + IntervalIndex([[0, 1], [1, 2]], dtype='interval[int64, both]') + >>> interv_idx.is_non_overlapping_monotonic + False + """ # must be increasing (e.g., [0, 1), [1, 2), [2, 3), ... ) # or decreasing (e.g., [-1, 0), [-2, -1), [-3, -2), ...)
# we already require left <= right @@ -1552,11 +1646,18 @@ def is_non_overlapping_monotonic(self) -> bool: # --------------------------------------------------------------------- # Conversion - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: """ Return the IntervalArray's data as a numpy array of Interval objects (with dtype='object') """ + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + left = self._left right = self._right mask = self.isna() @@ -1639,39 +1740,52 @@ def __arrow_array__(self, type=None): """ ) - @Appender( - _interval_shared_docs["to_tuples"] - % { - "return_type": ( - "ndarray (if self is IntervalArray) or Index (if self is IntervalIndex)" - ), - "examples": textwrap.dedent( - """\ - - Examples - -------- - For :class:`pandas.IntervalArray`: - - >>> idx = pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 2)]) - >>> idx - <IntervalArray> - [(0, 1], (1, 2]] - Length: 2, dtype: interval[int64, right] - >>> idx.to_tuples() - array([(0, 1), (1, 2)], dtype=object) - - For :class:`pandas.IntervalIndex`: - - >>> idx = pd.interval_range(start=0, end=2) - >>> idx - IntervalIndex([(0, 1], (1, 2]], dtype='interval[int64, right]') - >>> idx.to_tuples() - Index([(0, 1), (1, 2)], dtype='object') - """ - ), - } - ) def to_tuples(self, na_tuple: bool = True) -> np.ndarray: + """ + Return an ndarray (if self is IntervalArray) or Index \ + (if self is IntervalIndex) of tuples of the form (left, right). + + Parameters + ---------- + na_tuple : bool, default True + If ``True``, return ``NA`` as a tuple ``(nan, nan)``. If ``False``, + just return ``NA`` as ``nan``. + + Returns + ------- + ndarray or Index + An ndarray of tuples representing the intervals + if `self` is an IntervalArray. + An Index of tuples representing the intervals + if `self` is an IntervalIndex. + + See Also + -------- + IntervalArray.to_list : Convert IntervalArray to a list of tuples. + IntervalArray.to_numpy : Convert IntervalArray to a numpy array. + IntervalArray.unique : Find unique intervals in an IntervalArray. + + Examples + -------- + For :class:`pandas.IntervalArray`: + + >>> idx = pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 2)]) + >>> idx + <IntervalArray> + [(0, 1], (1, 2]] + Length: 2, dtype: interval[int64, right] + >>> idx.to_tuples() + array([(np.int64(0), np.int64(1)), (np.int64(1), np.int64(2))], + dtype=object) + + For :class:`pandas.IntervalIndex`: + + >>> idx = pd.interval_range(start=0, end=2) + >>> idx + IntervalIndex([(0, 1], (1, 2]], dtype='interval[int64, right]') + >>> idx.to_tuples() + Index([(0, 1), (1, 2)], dtype='object') + """ tuples = com.asarray_tuplesafe(zip(self._left, self._right)) if not na_tuple: # GH 18756 @@ -1715,6 +1829,8 @@ def insert(self, loc: int, item: Interval) -> Self: return self._shallow_copy(new_left, new_right) def delete(self, loc) -> Self: + new_left: np.ndarray | DatetimeArray | TimedeltaArray + new_right: np.ndarray | DatetimeArray | TimedeltaArray if isinstance(self._left, np.ndarray): new_left = np.delete(self._left, loc) assert isinstance(self._right, np.ndarray) @@ -1766,22 +1882,40 @@ def repeat( """ ) - @Appender( - _interval_shared_docs["contains"] - % { - "klass": "IntervalArray", - "examples": textwrap.dedent( - """\ + def contains(self, other): + """ + Check elementwise if the Intervals contain the value. + + Return a boolean mask whether the value is contained in the Intervals + of the IntervalArray.
+ + Parameters + ---------- + other : scalar + The value to check whether it is contained in the Intervals. + + Returns + ------- + boolean array + A boolean mask whether the value is contained in the Intervals. + + See Also + -------- + Interval.contains : Check whether Interval object contains value. + IntervalArray.overlaps : Check if an Interval overlaps the values in the + IntervalArray. + + Examples + -------- >>> intervals = pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 3), (2, 4)]) >>> intervals <IntervalArray> [(0, 1], (1, 3], (2, 4]] Length: 3, dtype: interval[int64, right] + + >>> intervals.contains(0.5) + array([ True, False, False]) """ - ), - } - ) - def contains(self, other): if isinstance(other, Interval): raise NotImplementedError("contains not implemented for two intervals") @@ -1842,9 +1976,13 @@ def _from_combined(self, combined: np.ndarray) -> IntervalArray: dtype = self._left.dtype if needs_i8_conversion(dtype): assert isinstance(self._left, (DatetimeArray, TimedeltaArray)) - new_left = type(self._left)._from_sequence(nc[:, 0], dtype=dtype) + new_left: DatetimeArray | TimedeltaArray | np.ndarray = type( + self._left + )._from_sequence(nc[:, 0], dtype=dtype) assert isinstance(self._right, (DatetimeArray, TimedeltaArray)) - new_right = type(self._right)._from_sequence(nc[:, 1], dtype=dtype) + new_right: DatetimeArray | TimedeltaArray | np.ndarray = type( + self._right + )._from_sequence(nc[:, 1], dtype=dtype) else: assert isinstance(dtype, np.dtype) new_left = nc[:, 0].view(dtype) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 03c09c5b2fd18..e7a6b207363c3 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -3,8 +3,8 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, + cast, overload, ) import warnings @@ -16,28 +16,12 @@ missing as libmissing, ) from pandas._libs.tslibs import is_supported_dtype -from pandas._typing import ( - ArrayLike, - AstypeArg, - AxisInt, - DtypeObj, - FillnaOptions, - NpDtype, - PositionalIndexer, - Scalar, - ScalarIndexer, - Self, - SequenceIndexer, - Shape, - npt, -) from pandas.compat import ( IS64, is_platform_windows, ) from pandas.errors import AbstractMethodError from pandas.util._decorators import doc -from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import ( @@ -88,6 +72,7 @@ from pandas.core.util.hashing import hash_array if TYPE_CHECKING: + from collections.abc import Callable from collections.abc import ( Iterator, Sequence, ) @@ -97,7 +82,23 @@ from pandas._typing import ( NumpySorter, NumpyValueArrayLike, + ArrayLike, + AstypeArg, + AxisInt, + DtypeObj, + FillnaOptions, + InterpolateOptions, + NpDtype, + PositionalIndexer, + Scalar, + ScalarIndexer, + Self, + SequenceIndexer, + Shape, + npt, ) + from pandas._libs.missing import NAType + from pandas.core.arrays import FloatingArray from pandas.compat.numpy import function as nv @@ -109,16 +110,10 @@ class BaseMaskedArray(OpsMixin, ExtensionArray): numpy based """ - # The value used to fill '_data' to avoid upcasting - _internal_fill_value: Scalar # our underlying data and mask are each ndarrays _data: np.ndarray _mask: npt.NDArray[np.bool_] - # Fill values used for any/all - _truthy_value = Scalar # bool(_truthy_value) = True - _falsey_value = Scalar # bool(_falsey_value) = False - @classmethod def _simple_new(cls, values: np.ndarray, mask: npt.NDArray[np.bool_]) -> Self: result = BaseMaskedArray.__new__(cls) @@ -152,9 +147,10 @@ def 
_from_sequence(cls, scalars, *, dtype=None, copy: bool = False) -> Self: @classmethod @doc(ExtensionArray._empty) - def _empty(cls, shape: Shape, dtype: ExtensionDtype): - values = np.empty(shape, dtype=dtype.type) - values.fill(cls._internal_fill_value) + def _empty(cls, shape: Shape, dtype: ExtensionDtype) -> Self: + dtype = cast(BaseMaskedDtype, dtype) + values: np.ndarray = np.empty(shape, dtype=dtype.type) + values.fill(dtype._internal_fill_value) mask = np.ones(shape, dtype=bool) result = cls(values, mask) if not isinstance(result, cls) or dtype != result.dtype: @@ -172,12 +168,10 @@ def dtype(self) -> BaseMaskedDtype: raise AbstractMethodError(self) @overload - def __getitem__(self, item: ScalarIndexer) -> Any: - ... + def __getitem__(self, item: ScalarIndexer) -> Any: ... @overload - def __getitem__(self, item: SequenceIndexer) -> Self: - ... + def __getitem__(self, item: SequenceIndexer) -> Self: ... def __getitem__(self, item: PositionalIndexer) -> Self | Any: item = check_array_indexer(self, item) @@ -192,7 +186,12 @@ def __getitem__(self, item: PositionalIndexer) -> Self | Any: return self._simple_new(self._data[item], newmask) def _pad_or_backfill( - self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, ) -> Self: mask = self._mask @@ -204,7 +203,21 @@ def _pad_or_backfill( if copy: npvalues = npvalues.copy() new_mask = new_mask.copy() + elif limit_area is not None: + mask = mask.copy() func(npvalues, limit=limit, mask=new_mask) + + if limit_area is not None and not mask.all(): + mask = mask.T + neg_mask = ~mask + first = neg_mask.argmax() + last = len(neg_mask) - neg_mask[::-1].argmax() - 1 + if limit_area == "inside": + new_mask[:first] |= mask[:first] + new_mask[last + 1 :] |= mask[last + 1 :] + elif limit_area == "outside": + new_mask[first + 1 : last] |= mask[first + 1 : last] + if copy: return self._simple_new(npvalues.T, new_mask.T) else: @@ -217,32 +230,24 @@ def _pad_or_backfill( return new_values @doc(ExtensionArray.fillna) - def fillna( - self, value=None, method=None, limit: int | None = None, copy: bool = True - ) -> Self: - value, method = validate_fillna_kwargs(value, method) - + def fillna(self, value, limit: int | None = None, copy: bool = True) -> Self: mask = self._mask + if limit is not None and limit < len(self): + modify = mask.cumsum() > limit + if modify.any(): + # Only copy mask if necessary + mask = mask.copy() + mask[modify] = False value = missing.check_value_size(value, mask, len(self)) if mask.any(): - if method is not None: - func = missing.get_fill_func(method, ndim=self.ndim) - npvalues = self._data.T - new_mask = mask.T - if copy: - npvalues = npvalues.copy() - new_mask = new_mask.copy() - func(npvalues, limit=limit, mask=new_mask) - return self._simple_new(npvalues.T, new_mask.T) + # fill with value + if copy: + new_values = self.copy() else: - # fill with value - if copy: - new_values = self.copy() - else: - new_values = self[:] - new_values[mask] = value + new_values = self[:] + new_values[mask] = value else: if copy: new_values = self.copy() @@ -281,7 +286,7 @@ def _validate_setitem_value(self, value): # Note: without the "str" here, the f-string rendering raises in # py38 builds. 
- raise TypeError(f"Invalid value '{str(value)}' for dtype {self.dtype}") + raise TypeError(f"Invalid value '{value!s}' for dtype '{self.dtype}'") def __setitem__(self, key, value) -> None: key = check_array_indexer(self, key) @@ -384,6 +389,8 @@ def round(self, decimals: int = 0, *args, **kwargs): DataFrame.round : Round values of a DataFrame. Series.round : Round values of a Series. """ + if self.dtype.kind == "b": + return self nv.validate_round(args, kwargs) values = np.round(self._data, decimals=decimals, **kwargs) @@ -407,6 +414,9 @@ def __abs__(self) -> Self: # ------------------------------------------------------------------ + def _values_for_json(self) -> np.ndarray: + return np.asarray(self, dtype=object) + def to_numpy( self, dtype: npt.DTypeLike | None = None, @@ -475,6 +485,8 @@ def to_numpy( """ hasna = self._hasna dtype, na_value = to_numpy_dtype_inference(self, dtype, na_value, hasna) + if dtype is None: + dtype = object if hasna: if ( @@ -499,23 +511,20 @@ def to_numpy( return data @doc(ExtensionArray.tolist) - def tolist(self): + def tolist(self) -> list: if self.ndim > 1: return [x.tolist() for x in self] dtype = None if self._hasna else self._data.dtype return self.to_numpy(dtype=dtype, na_value=libmissing.NA).tolist() @overload - def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray: - ... + def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray: ... @overload - def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray: - ... + def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray: ... @overload - def astype(self, dtype: AstypeArg, copy: bool = ...) -> ArrayLike: - ... + def astype(self, dtype: AstypeArg, copy: bool = ...) -> ArrayLike: ... def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: dtype = pandas_dtype(dtype) @@ -565,12 +574,24 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: __array_priority__ = 1000 # higher than ndarray so ops dispatch to us - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: """ the array interface, return my values We return an object array here to preserve our scalar values """ - return self.to_numpy(dtype=dtype) + if copy is False: + if not self._hasna: + # special case, here we can simply return the underlying data + return np.array(self._data, dtype=dtype, copy=copy) + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + + if copy is None: + copy = False # The NumPy copy=False meaning is different here. + return self.to_numpy(dtype=dtype, copy=copy) _HANDLED_TYPES: tuple[type, ...] 
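A short sketch of the __array__ contract added above, assuming NumPy >= 2 (which forwards copy= through np.asarray to __array__) and a pandas build containing this change: a masked array holding NA values cannot hand out a zero-copy view, so copy=False must raise.

import numpy as np
import pandas as pd

arr = pd.array([1, None, 3], dtype="Int64")
print(np.asarray(arr))  # array([1, <NA>, 3], dtype=object) -- a materialized copy

try:
    # NA values force materialization, so a no-copy request cannot be honored.
    np.asarray(arr, copy=False)
except ValueError as err:
    print(err)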
@@ -906,7 +927,9 @@ def take( ) -> Self: # we always fill with 1 internally # to avoid upcasting - data_fill_value = self._internal_fill_value if isna(fill_value) else fill_value + data_fill_value = ( + self.dtype._internal_fill_value if isna(fill_value) else fill_value + ) result = take( self._data, indexer, @@ -1069,18 +1092,15 @@ def value_counts(self, dropna: bool = True) -> Series: arr = IntegerArray(value_counts, mask) index = Index( self.dtype.construct_array_type()( - keys, mask_index # type: ignore[arg-type] + keys, # type: ignore[arg-type] + mask_index, ) ) return Series(arr, index=index, name="count", copy=False) def _mode(self, dropna: bool = True) -> Self: - if dropna: - result = mode(self._data, dropna=dropna, mask=self._mask) - res_mask = np.zeros(result.shape, dtype=np.bool_) - else: - result, res_mask = mode(self._data, dropna=dropna, mask=self._mask) - result = type(self)(result, res_mask) # type: ignore[arg-type] + result, res_mask = mode(self._data, dropna=dropna, mask=self._mask) + result = type(self)(result, res_mask) return result[result.argsort()] @doc(ExtensionArray.equals) @@ -1184,7 +1204,7 @@ def _wrap_na_result(self, *, name, axis, mask_size): mask = np.ones(mask_size, dtype=bool) float_dtyp = "float32" if self.dtype == "Float32" else "float64" - if name in ["mean", "median", "var", "std", "skew", "kurt"]: + if name in ["mean", "median", "var", "std", "skew", "kurt", "sem"]: np_dtype = float_dtyp elif name in ["min", "max"] or self.dtype.itemsize == 8: np_dtype = self.dtype.numpy_dtype.name @@ -1304,10 +1324,22 @@ def max(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): ) return self._wrap_reduction_result("max", result, skipna=skipna, axis=axis) - def map(self, mapper, na_action=None): - return map_array(self.to_numpy(), mapper, na_action=None) + def map(self, mapper, na_action: Literal["ignore"] | None = None): + return map_array(self.to_numpy(), mapper, na_action=na_action) + + @overload + def any( + self, *, skipna: Literal[True] = ..., axis: AxisInt | None = ..., **kwargs + ) -> np.bool_: ... + + @overload + def any( + self, *, skipna: bool, axis: AxisInt | None = ..., **kwargs + ) -> np.bool_ | NAType: ... - def any(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): + def any( + self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs + ) -> np.bool_ | NAType: """ Return whether any element is truthy. 
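The map change above is a behavior fix, not a refactor: na_action was previously discarded (na_action=None was always forwarded to map_array). A hedged sketch of what now works, assuming a pandas version with the fix; int(pd.NA) raises, so this succeeds only because NA values genuinely bypass the mapper.

import pandas as pd

s = pd.Series([1, None, 3], dtype="Int64")
print(s.map(lambda x: int(x) + 1, na_action="ignore"))
# 0       2
# 1    <NA>
# 2       4
# dtype: object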
@@ -1346,25 +1378,25 @@ def any(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): skips NAs): >>> pd.array([True, False, True]).any() - True + np.True_ >>> pd.array([True, False, pd.NA]).any() - True + np.True_ >>> pd.array([False, False, pd.NA]).any() - False + np.False_ >>> pd.array([], dtype="boolean").any() - False + np.False_ >>> pd.array([pd.NA], dtype="boolean").any() - False + np.False_ >>> pd.array([pd.NA], dtype="Float64").any() - False + np.False_ With ``skipna=False``, the result can be NA if this is logically required (whether ``pd.NA`` is True or False influences the result): >>> pd.array([True, False, pd.NA]).any(skipna=False) - True + np.True_ >>> pd.array([1, 0, pd.NA]).any(skipna=False) - True + np.True_ >>> pd.array([False, False, pd.NA]).any(skipna=False) >>> pd.array([0, 0, pd.NA]).any(skipna=False) @@ -1373,12 +1405,7 @@ def any(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): nv.validate_any((), kwargs) values = self._data.copy() - # error: Argument 3 to "putmask" has incompatible type "object"; - # expected "Union[_SupportsArray[dtype[Any]], - # _NestedSequence[_SupportsArray[dtype[Any]]], - # bool, int, float, complex, str, bytes, - # _NestedSequence[Union[bool, int, float, complex, str, bytes]]]" - np.putmask(values, self._mask, self._falsey_value) # type: ignore[arg-type] + np.putmask(values, self._mask, self.dtype._falsey_value) result = values.any() if skipna: return result @@ -1388,7 +1415,19 @@ def any(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): else: return self.dtype.na_value - def all(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): + @overload + def all( + self, *, skipna: Literal[True] = ..., axis: AxisInt | None = ..., **kwargs + ) -> np.bool_: ... + + @overload + def all( + self, *, skipna: bool, axis: AxisInt | None = ..., **kwargs + ) -> np.bool_ | NAType: ... + + def all( + self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs + ) -> np.bool_ | NAType: """ Return whether all elements are truthy. 
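The doctest churn in these any/all hunks is NEP 51 repr noise (NumPy scalars now print as np.True_/np.False_); the logic is unchanged. The skipna=False corner remains the interesting one: the result is pd.NA exactly when filling the masked slots could flip the answer.

import pandas as pd

print(pd.array([False, False, pd.NA]).any(skipna=False))  # <NA>: a hidden True would flip it
print(pd.array([True, True, pd.NA]).all(skipna=False))    # <NA>: a hidden False would flip it
print(pd.array([True, False, pd.NA]).any(skipna=False))   # True regardless of the NA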
@@ -1427,17 +1466,17 @@ def all(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): skips NAs): >>> pd.array([True, True, pd.NA]).all() - True + np.True_ >>> pd.array([1, 1, pd.NA]).all() - True + np.True_ >>> pd.array([True, False, pd.NA]).all() - False + np.False_ >>> pd.array([], dtype="boolean").all() - True + np.True_ >>> pd.array([pd.NA], dtype="boolean").all() - True + np.True_ >>> pd.array([pd.NA], dtype="Float64").all() - True + np.True_ With ``skipna=False``, the result can be NA if this is logically required (whether ``pd.NA`` is True or False influences the result): @@ -1447,29 +1486,76 @@ def all(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): >>> pd.array([1, 1, pd.NA]).all(skipna=False) >>> pd.array([True, False, pd.NA]).all(skipna=False) - False + np.False_ >>> pd.array([1, 0, pd.NA]).all(skipna=False) - False + np.False_ """ nv.validate_all((), kwargs) values = self._data.copy() - # error: Argument 3 to "putmask" has incompatible type "object"; - # expected "Union[_SupportsArray[dtype[Any]], - # _NestedSequence[_SupportsArray[dtype[Any]]], - # bool, int, float, complex, str, bytes, - # _NestedSequence[Union[bool, int, float, complex, str, bytes]]]" - np.putmask(values, self._mask, self._truthy_value) # type: ignore[arg-type] + np.putmask(values, self._mask, self.dtype._truthy_value) result = values.all(axis=axis) if skipna: - return result + return result # type: ignore[return-value] else: if not result or len(self) == 0 or not self._mask.any(): - return result + return result # type: ignore[return-value] else: return self.dtype.na_value + def interpolate( + self, + *, + method: InterpolateOptions, + axis: int, + index, + limit, + limit_direction, + limit_area, + copy: bool, + **kwargs, + ) -> FloatingArray: + """ + See NDFrame.interpolate.__doc__. + """ + # NB: we return type(self) even if copy=False + if self.dtype.kind == "f": + if copy: + data = self._data.copy() + mask = self._mask.copy() + else: + data = self._data + mask = self._mask + elif self.dtype.kind in "iu": + copy = True + data = self._data.astype("f8") + mask = self._mask.copy() + else: + raise NotImplementedError( + f"interpolate is not implemented for dtype={self.dtype}" + ) + + missing.interpolate_2d_inplace( + data, + method=method, + axis=0, + index=index, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + mask=mask, + **kwargs, + ) + if not copy: + return self # type: ignore[return-value] + if self.dtype.kind == "f": + return type(self)._simple_new(data, mask) # type: ignore[return-value] + else: + from pandas.core.arrays import FloatingArray + + return FloatingArray._simple_new(data, mask) + def _accumulate( self, name: str, *, skipna: bool = True, **kwargs ) -> BaseMaskedArray: @@ -1541,13 +1627,24 @@ def transpose_homogeneous_masked_arrays( same dtype. The caller is responsible for ensuring validity of input data. 
""" masked_arrays = list(masked_arrays) + dtype = masked_arrays[0].dtype + values = [arr._data.reshape(1, -1) for arr in masked_arrays] - transposed_values = np.concatenate(values, axis=0) + transposed_values = np.concatenate( + values, + axis=0, + out=np.empty( + (len(masked_arrays), len(masked_arrays[0])), + order="F", + dtype=dtype.numpy_dtype, + ), + ) masks = [arr._mask.reshape(1, -1) for arr in masked_arrays] - transposed_masks = np.concatenate(masks, axis=0) + transposed_masks = np.concatenate( + masks, axis=0, out=np.empty_like(transposed_values, dtype=bool) + ) - dtype = masked_arrays[0].dtype arr_type = dtype.construct_array_type() transposed_arrays: list[BaseMaskedArray] = [] for i in range(transposed_values.shape[1]): diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index 210450e868698..f319a3cc05575 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -4,7 +4,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, ) import numpy as np @@ -28,17 +27,21 @@ ) if TYPE_CHECKING: - from collections.abc import Mapping + from collections.abc import ( + Callable, + Mapping, + ) import pyarrow from pandas._typing import ( - Dtype, DtypeObj, Self, npt, ) + from pandas.core.dtypes.dtypes import ExtensionDtype + class NumericDtype(BaseMaskedDtype): _default_np_dtype: np.dtype @@ -159,7 +162,10 @@ def _coerce_to_data_and_mask( return values, mask, dtype, inferred_type original = values - values = np.array(values, copy=copy) + if not copy: + values = np.asarray(values) + else: + values = np.array(values, copy=copy) inferred_type = None if values.dtype == object or is_string_dtype(values.dtype): inferred_type = lib.infer_dtype(values, skipna=True) @@ -168,7 +174,12 @@ def _coerce_to_data_and_mask( raise TypeError(f"{values.dtype} cannot be converted to {name}") elif values.dtype.kind == "b" and checker(dtype): - values = np.array(values, dtype=default_dtype, copy=copy) + # fastpath + mask = np.zeros(len(values), dtype=np.bool_) + if not copy: + values = np.asarray(values, dtype=default_dtype) + else: + values = np.array(values, dtype=default_dtype, copy=copy) elif values.dtype.kind not in "iuf": name = dtype_cls.__name__.strip("_") @@ -181,6 +192,10 @@ def _coerce_to_data_and_mask( if values.dtype.kind in "iu": # fastpath mask = np.zeros(len(values), dtype=np.bool_) + elif values.dtype.kind == "f": + # np.isnan is faster than is_numeric_na() for floats + # github issue: #60066 + mask = np.isnan(values) else: mask = libmissing.is_numeric_na(values) else: @@ -207,14 +222,14 @@ def _coerce_to_data_and_mask( inferred_type not in ["floating", "mixed-integer-float"] and not mask.any() ): - values = np.array(original, dtype=dtype, copy=False) + values = np.asarray(original, dtype=dtype) else: - values = np.array(original, dtype="object", copy=False) + values = np.asarray(original, dtype="object") # we copy as need to coerce here if mask.any(): values = values.copy() - values[mask] = cls._internal_fill_value + values[mask] = dtype_cls._internal_fill_value if inferred_type in ("string", "unicode"): # casts from str are always safe since they raise # a ValueError if the str cannot be parsed into a float @@ -270,7 +285,7 @@ def _coerce_to_array( @classmethod def _from_sequence_of_strings( - cls, strings, *, dtype: Dtype | None = None, copy: bool = False + cls, strings, *, dtype: ExtensionDtype, copy: bool = False ) -> Self: from pandas.core.tools.numeric import to_numeric diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py 
index d83a37088daec..fd2c8c9d63362 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -2,6 +2,7 @@ from typing import ( TYPE_CHECKING, + Any, Literal, ) @@ -29,6 +30,8 @@ from pandas.core.strings.object_array import ObjectStringArrayMixin if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import ( AxisInt, Dtype, @@ -71,6 +74,11 @@ class NumpyExtensionArray( # type: ignore[misc] ------- None + See Also + -------- + array : Create an array. + Series.to_numpy : Convert a Series to a NumPy array. + Examples -------- >>> pd.arrays.NumpyExtensionArray(np.array([0, 1, 2, 3])) @@ -137,9 +145,6 @@ def _from_sequence( result = result.copy() return cls(result) - def _from_backing_data(self, arr: np.ndarray) -> NumpyExtensionArray: - return type(self)(arr) - # ------------------------------------------------------------------------ # Data @@ -150,7 +155,12 @@ def dtype(self) -> NumpyEADtype: # ------------------------------------------------------------------------ # NumPy Array Interface - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: + if copy is not None: + # Note: branch avoids `copy=None` for NumPy 1.x support + return np.array(self._ndarray, dtype=dtype, copy=copy) return np.asarray(self._ndarray, dtype=dtype) def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): @@ -285,6 +295,9 @@ def interpolate( See NDFrame.interpolate.__doc__. """ # NB: we return type(self) even if copy=False + if not self.dtype._is_numeric: + raise TypeError(f"Cannot interpolate with {self.dtype} dtype") + if not copy: out_data = self._ndarray else: @@ -556,6 +569,11 @@ def _wrap_ndarray_result(self, result: np.ndarray): return TimedeltaArray._simple_new(result, dtype=result.dtype) return type(self)(result) - # ------------------------------------------------------------------------ - # String methods interface - _str_na_value = np.nan + def _formatter(self, boxed: bool = False) -> Callable[[Any], str | None]: + # NEP 51: https://github.com/numpy/numpy/pull/22449 + if self.dtype.kind in "SU": + return "'{}'".format + elif self.dtype == "object": + return repr + else: + return str diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 2930b979bfe78..ae92e17332c76 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -5,7 +5,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, TypeVar, cast, @@ -37,7 +36,6 @@ from pandas._libs.tslibs.dtypes import ( FreqGroup, PeriodDtypeBase, - freq_to_period_freqstr, ) from pandas._libs.tslibs.fields import isleapyear_arr from pandas._libs.tslibs.offsets import ( @@ -55,7 +53,6 @@ cache_readonly, doc, ) -from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( ensure_object, @@ -77,7 +74,10 @@ import pandas.core.common as com if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import ( + Callable, + Sequence, + ) from pandas._typing import ( AnyArrayLike, @@ -90,6 +90,8 @@ npt, ) + from pandas.core.dtypes.dtypes import ExtensionDtype + from pandas.core.arrays import ( DatetimeArray, TimedeltaArray, @@ -134,11 +136,6 @@ class PeriodArray(dtl.DatelikeOps, libperiod.PeriodMixin): # type: ignore[misc] dtype : PeriodDtype, optional A PeriodDtype instance from which to extract a `freq`. If both `freq` and `dtype` are specified, then the frequencies must match. 
- freq : str or DateOffset - The `freq` to use for the array. Mostly applicable when `values` - is an ndarray of integers, when `freq` is required. When `values` - is a PeriodArray (or box around), it's checked that ``values.freq`` - matches `freq`. copy : bool, default False Whether to copy the ordinals before storing. @@ -172,8 +169,7 @@ class PeriodArray(dtl.DatelikeOps, libperiod.PeriodMixin): # type: ignore[misc] Examples -------- - >>> pd.arrays.PeriodArray(pd.PeriodIndex(['2023-01-01', - ... '2023-01-02'], freq='D')) + >>> pd.arrays.PeriodArray(pd.PeriodIndex(["2023-01-01", "2023-01-02"], freq="D")) <PeriodArray> ['2023-01-01', '2023-01-02'] Length: 2, dtype: period[D] @@ -224,20 +220,7 @@ def _scalar_type(self) -> type[Period]: # -------------------------------------------------------------------- # Constructors - def __init__( - self, values, dtype: Dtype | None = None, freq=None, copy: bool = False - ) -> None: - if freq is not None: - # GH#52462 - warnings.warn( - "The 'freq' keyword in the PeriodArray constructor is deprecated " - "and will be removed in a future version. Pass 'dtype' instead", - FutureWarning, - stacklevel=find_stack_level(), - ) - freq = validate_dtype_freq(dtype, freq) - dtype = PeriodDtype(freq) - + def __init__(self, values, dtype: Dtype | None = None, copy: bool = False) -> None: if dtype is not None: dtype = pandas_dtype(dtype) if not isinstance(dtype, PeriodDtype): @@ -256,7 +239,10 @@ def __init__( raise raise_on_incompatible(values, dtype.freq) values, dtype = values._ndarray, values.dtype - values = np.array(values, dtype="int64", copy=copy) + if not copy: + values = np.asarray(values, dtype="int64") + else: + values = np.array(values, dtype="int64", copy=copy) if dtype is None: raise ValueError("dtype is not specified and cannot be inferred") dtype = cast(PeriodDtype, dtype) @@ -304,7 +290,7 @@ def _from_sequence( @classmethod def _from_sequence_of_strings( - cls, strings, *, dtype: Dtype | None = None, copy: bool = False + cls, strings, *, dtype: ExtensionDtype, copy: bool = False ) -> Self: return cls._from_sequence(strings, dtype=dtype, copy=copy) @@ -324,7 +310,7 @@ def _from_datetime64(cls, data, freq, tz=None) -> Self: PeriodArray[freq] """ if isinstance(freq, BaseOffset): - freq = freq_to_period_freqstr(freq.n, freq.name) + freq = PeriodDtype(freq)._freqstr data, freq = dt64arr_to_periodarr(data, freq, tz) dtype = PeriodDtype(freq) return cls(data, dtype=dtype) @@ -398,12 +384,25 @@ def freq(self) -> BaseOffset: @property def freqstr(self) -> str: - return freq_to_period_freqstr(self.freq.n, self.freq.name) + return PeriodDtype(self.freq)._freqstr - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: if dtype == "i8": - return self.asi8 - elif dtype == bool: + # For NumPy 1.x compatibility we cannot use copy=None. And + # `copy=False` has the meaning of `copy=None` here: + if not copy: + return np.asarray(self.asi8, dtype=dtype) + else: + return np.array(self.asi8, dtype=dtype) + + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + + if dtype == bool: return ~self._isnan # This will raise TypeError for non-object dtypes @@ -444,6 +443,15 @@ def __arrow_array__(self, type=None): """ The year of the period. + See Also + -------- + PeriodIndex.day_of_year : The ordinal day of the year. + PeriodIndex.dayofyear : The ordinal day of the year.
+ PeriodIndex.is_leap_year : Logical indicating if the date belongs to a + leap year. + PeriodIndex.weekofyear : The week ordinal of the year. + PeriodIndex.year : The year of the period. + Examples -------- >>> idx = pd.PeriodIndex(["2023", "2024", "2025"], freq="Y") @@ -456,6 +464,11 @@ def __arrow_array__(self, type=None): """ The month as January=1, December=12. + See Also + -------- + PeriodIndex.days_in_month : The number of days in the month. + PeriodIndex.daysinmonth : The number of days in the month. + Examples -------- >>> idx = pd.PeriodIndex(["2023-01", "2023-02", "2023-03"], freq="M") @@ -468,6 +481,16 @@ def __arrow_array__(self, type=None): """ The days of the period. + See Also + -------- + PeriodIndex.day_of_week : The day of the week with Monday=0, Sunday=6. + PeriodIndex.day_of_year : The ordinal day of the year. + PeriodIndex.dayofweek : The day of the week with Monday=0, Sunday=6. + PeriodIndex.dayofyear : The ordinal day of the year. + PeriodIndex.days_in_month : The number of days in the month. + PeriodIndex.daysinmonth : The number of days in the month. + PeriodIndex.weekday : The day of the week with Monday=0, Sunday=6. + Examples -------- >>> idx = pd.PeriodIndex(['2020-01-31', '2020-02-28'], freq='D') @@ -480,6 +503,12 @@ def __arrow_array__(self, type=None): """ The hour of the period. + See Also + -------- + PeriodIndex.minute : The minute of the period. + PeriodIndex.second : The second of the period. + PeriodIndex.to_timestamp : Cast to DatetimeArray/Index. + Examples -------- >>> idx = pd.PeriodIndex(["2023-01-01 10:00", "2023-01-01 11:00"], freq='h') @@ -492,6 +521,12 @@ def __arrow_array__(self, type=None): """ The minute of the period. + See Also + -------- + PeriodIndex.hour : The hour of the period. + PeriodIndex.second : The second of the period. + PeriodIndex.to_timestamp : Cast to DatetimeArray/Index. + Examples -------- >>> idx = pd.PeriodIndex(["2023-01-01 10:30:00", @@ -505,6 +540,12 @@ def __arrow_array__(self, type=None): """ The second of the period. + See Also + -------- + PeriodIndex.hour : The hour of the period. + PeriodIndex.minute : The minute of the period. + PeriodIndex.to_timestamp : Cast to DatetimeArray/Index. + Examples -------- >>> idx = pd.PeriodIndex(["2023-01-01 10:00:30", @@ -518,6 +559,14 @@ def __arrow_array__(self, type=None): """ The week ordinal of the year. + See Also + -------- + PeriodIndex.day_of_week : The day of the week with Monday=0, Sunday=6. + PeriodIndex.dayofweek : The day of the week with Monday=0, Sunday=6. + PeriodIndex.week : The week ordinal of the year. + PeriodIndex.weekday : The day of the week with Monday=0, Sunday=6. + PeriodIndex.year : The year of the period. + Examples -------- >>> idx = pd.PeriodIndex(["2023-01", "2023-02", "2023-03"], freq="M") @@ -531,6 +580,17 @@ def __arrow_array__(self, type=None): """ The day of the week with Monday=0, Sunday=6. + See Also + -------- + PeriodIndex.day : The days of the period. + PeriodIndex.day_of_week : The day of the week with Monday=0, Sunday=6. + PeriodIndex.day_of_year : The ordinal day of the year. + PeriodIndex.dayofweek : The day of the week with Monday=0, Sunday=6. + PeriodIndex.dayofyear : The ordinal day of the year. + PeriodIndex.week : The week ordinal of the year. + PeriodIndex.weekday : The day of the week with Monday=0, Sunday=6. + PeriodIndex.weekofyear : The week ordinal of the year. 
+ Examples -------- >>> idx = pd.PeriodIndex(["2023-01-01", "2023-01-02", "2023-01-03"], freq="D") @@ -545,6 +605,17 @@ def __arrow_array__(self, type=None): """ The ordinal day of the year. + See Also + -------- + PeriodIndex.day : The days of the period. + PeriodIndex.day_of_week : The day of the week with Monday=0, Sunday=6. + PeriodIndex.day_of_year : The ordinal day of the year. + PeriodIndex.dayofweek : The day of the week with Monday=0, Sunday=6. + PeriodIndex.dayofyear : The ordinal day of the year. + PeriodIndex.weekday : The day of the week with Monday=0, Sunday=6. + PeriodIndex.weekofyear : The week ordinal of the year. + PeriodIndex.year : The year of the period. + Examples -------- >>> idx = pd.PeriodIndex(["2023-01-10", "2023-02-01", "2023-03-01"], freq="D") @@ -563,6 +634,11 @@ def __arrow_array__(self, type=None): """ The quarter of the date. + See Also + -------- + PeriodIndex.qyear : Fiscal year the Period lies in according to its + starting-quarter. + Examples -------- >>> idx = pd.PeriodIndex(["2023-01", "2023-02", "2023-03"], freq="M") @@ -570,12 +646,62 @@ def __arrow_array__(self, type=None): Index([1, 1, 1], dtype='int64') """, ) - qyear = _field_accessor("qyear") + qyear = _field_accessor( + "qyear", + """ + Fiscal year the Period lies in according to its starting-quarter. + + The `year` and the `qyear` of the period will be the same if the fiscal + and calendar years are the same. When they are not, the fiscal year + can be different from the calendar year of the period. + + Returns + ------- + int + The fiscal year of the period. + + See Also + -------- + PeriodIndex.quarter : The quarter of the date. + PeriodIndex.year : The year of the period. + + Examples + -------- + If the natural and fiscal year are the same, `qyear` and `year` will + be the same. + + >>> per = pd.Period('2018Q1', freq='Q') + >>> per.qyear + 2018 + >>> per.year + 2018 + + If the fiscal year starts in April (`Q-MAR`), the first quarter of + 2018 will start in April 2017. `year` will then be 2017, but `qyear` + will be the fiscal year, 2018. + + >>> per = pd.Period('2018Q1', freq='Q-MAR') + >>> per.start_time + Timestamp('2017-04-01 00:00:00') + >>> per.qyear + 2018 + >>> per.year + 2017 + """, + ) + days_in_month = _field_accessor( "days_in_month", """ The number of days in the month. + See Also + -------- + PeriodIndex.day : The days of the period. + PeriodIndex.days_in_month : The number of days in the month. + PeriodIndex.daysinmonth : The number of days in the month. + PeriodIndex.month : The month as January=1, December=12. + Examples -------- For Series: @@ -607,6 +733,12 @@ def is_leap_year(self) -> npt.NDArray[np.bool_]: """ Logical indicating if the date belongs to a leap year. + See Also + -------- + PeriodIndex.qyear : Fiscal year the Period lies in according to its + starting-quarter. + PeriodIndex.year : The year of the period. + Examples -------- >>> idx = pd.PeriodIndex(["2023", "2024", "2025"], freq="Y") @@ -630,6 +762,19 @@ def to_timestamp(self, freq=None, how: str = "start") -> DatetimeArray: Returns ------- DatetimeArray/Index + Timestamp representation of given Period-like object. + + See Also + -------- + PeriodIndex.day : The days of the period. + PeriodIndex.from_fields : Construct a PeriodIndex from fields + (year, month, day, etc.). + PeriodIndex.from_ordinals : Construct a PeriodIndex from ordinals. + PeriodIndex.hour : The hour of the period. + PeriodIndex.minute : The minute of the period. + PeriodIndex.month : The month as January=1, December=12. 
+ PeriodIndex.second : The second of the period. + PeriodIndex.year : The year of the period. Examples -------- @@ -637,6 +782,20 @@ def to_timestamp(self, freq=None, how: str = "start") -> DatetimeArray: >>> idx.to_timestamp() DatetimeIndex(['2023-01-01', '2023-02-01', '2023-03-01'], dtype='datetime64[ns]', freq='MS') + + The frequency will not be inferred if the index contains less than + three elements, or if the values of index are not strictly monotonic: + + >>> idx = pd.PeriodIndex(["2023-01", "2023-02"], freq="M") + >>> idx.to_timestamp() + DatetimeIndex(['2023-01-01', '2023-02-01'], dtype='datetime64[ns]', freq=None) + + >>> idx = pd.PeriodIndex( + ... ["2023-01", "2023-02", "2023-02", "2023-03"], freq="2M" + ... ) + >>> idx.to_timestamp() + DatetimeIndex(['2023-01-01', '2023-02-01', '2023-02-01', '2023-03-01'], + dtype='datetime64[ns]', freq=None) """ from pandas.core.arrays import DatetimeArray @@ -664,7 +823,7 @@ def to_timestamp(self, freq=None, how: str = "start") -> DatetimeArray: new_parr = self.asfreq(freq, how=how) new_data = libperiod.periodarr_to_dt64arr(new_parr.asi8, base) - dta = DatetimeArray._from_sequence(new_data) + dta = DatetimeArray._from_sequence(new_data, dtype=np.dtype("M8[ns]")) if self.freq.name == "B": # See if we can retain BDay instead of Day in cases where @@ -719,22 +878,22 @@ def asfreq(self, freq=None, how: str = "E") -> Self: Examples -------- - >>> pidx = pd.period_range('2010-01-01', '2015-01-01', freq='Y') + >>> pidx = pd.period_range("2010-01-01", "2015-01-01", freq="Y") >>> pidx PeriodIndex(['2010', '2011', '2012', '2013', '2014', '2015'], dtype='period[Y-DEC]') - >>> pidx.asfreq('M') + >>> pidx.asfreq("M") PeriodIndex(['2010-12', '2011-12', '2012-12', '2013-12', '2014-12', '2015-12'], dtype='period[M]') - >>> pidx.asfreq('M', how='S') + >>> pidx.asfreq("M", how="S") PeriodIndex(['2010-01', '2011-01', '2012-01', '2013-01', '2014-01', '2015-01'], dtype='period[M]') """ how = libperiod.validate_end_alias(how) - if isinstance(freq, BaseOffset): - freq = freq_to_period_freqstr(freq.n, freq.name) + if isinstance(freq, BaseOffset) and hasattr(freq, "_period_dtype_code"): + freq = PeriodDtype(freq)._freqstr freq = Period._maybe_convert_freq(freq) base1 = self._dtype._dtype_code @@ -759,7 +918,7 @@ def asfreq(self, freq=None, how: str = "E") -> Self: # ------------------------------------------------------------------ # Rendering Methods - def _formatter(self, boxed: bool = False): + def _formatter(self, boxed: bool = False) -> Callable[[object], str]: if boxed: return str return "'{}'".format @@ -810,30 +969,24 @@ def searchsorted( return m8arr.searchsorted(npvalue, side=side, sorter=sorter) def _pad_or_backfill( - self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, ) -> Self: # view as dt64 so we get treated as timelike in core.missing, # similar to dtl._period_dispatch dta = self.view("M8[ns]") - result = dta._pad_or_backfill(method=method, limit=limit, copy=copy) + result = dta._pad_or_backfill( + method=method, limit=limit, limit_area=limit_area, copy=copy + ) if copy: return cast("Self", result.view(self.dtype)) else: return self - def fillna( - self, value=None, method=None, limit: int | None = None, copy: bool = True - ) -> Self: - if method is not None: - # view as dt64 so we get treated as timelike in core.missing, - # similar to dtl._period_dispatch - dta = 
self.view("M8[ns]") - result = dta.fillna(value=value, method=method, limit=limit, copy=copy) - # error: Incompatible return value type (got "Union[ExtensionArray, - # ndarray[Any, Any]]", expected "PeriodArray") - return result.view(self.dtype) # type: ignore[return-value] - return super().fillna(value=value, method=method, limit=limit, copy=copy) - # ------------------------------------------------------------------ # Arithmetic Methods @@ -960,6 +1113,17 @@ def _check_timedeltalike_freq_compat(self, other): delta = delta.view("i8") return lib.item_from_zerodim(delta) + # ------------------------------------------------------------------ + # Reductions + + def _reduce( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): + result = super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs) + if keepdims and isinstance(result, np.ndarray): + return self._from_sequence(result, dtype=self.dtype) + return result + def raise_on_incompatible(left, right) -> IncompatibleFrequency: """ @@ -980,13 +1144,17 @@ def raise_on_incompatible(left, right) -> IncompatibleFrequency: if isinstance(right, (np.ndarray, ABCTimedeltaArray)) or right is None: other_freq = None elif isinstance(right, BaseOffset): - other_freq = freq_to_period_freqstr(right.n, right.name) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", r"PeriodDtype\[B\] is deprecated", category=FutureWarning + ) + other_freq = PeriodDtype(right)._freqstr elif isinstance(right, (ABCPeriodIndex, PeriodArray, Period)): other_freq = right.freqstr else: other_freq = delta_to_tick(Timedelta(right)).freqstr - own_freq = freq_to_period_freqstr(left.freq.n, left.freq.name) + own_freq = PeriodDtype(left.freq)._freqstr msg = DIFFERENT_FREQ.format( cls=type(left).__name__, own_freq=own_freq, other_freq=other_freq ) @@ -1028,29 +1196,26 @@ def period_array( Examples -------- - >>> period_array([pd.Period('2017', freq='Y'), - ... pd.Period('2018', freq='Y')]) + >>> period_array([pd.Period("2017", freq="Y"), pd.Period("2018", freq="Y")]) ['2017', '2018'] Length: 2, dtype: period[Y-DEC] - >>> period_array([pd.Period('2017', freq='Y'), - ... pd.Period('2018', freq='Y'), - ... pd.NaT]) + >>> period_array([pd.Period("2017", freq="Y"), pd.Period("2018", freq="Y"), pd.NaT]) ['2017', '2018', 'NaT'] Length: 3, dtype: period[Y-DEC] Integers that look like years are handled - >>> period_array([2000, 2001, 2002], freq='D') + >>> period_array([2000, 2001, 2002], freq="D") ['2000-01-01', '2001-01-01', '2002-01-01'] Length: 3, dtype: period[D] Datetime-like strings may also be passed - >>> period_array(['2000-Q1', '2000-Q2', '2000-Q3', '2000-Q4'], freq='Q') + >>> period_array(["2000-Q1", "2000-Q2", "2000-Q3", "2000-Q4"], freq="Q") ['2000Q1', '2000Q2', '2000Q3', '2000Q4'] Length: 4, dtype: period[Q-DEC] @@ -1097,13 +1262,11 @@ def period_array( @overload -def validate_dtype_freq(dtype, freq: BaseOffsetT) -> BaseOffsetT: - ... +def validate_dtype_freq(dtype, freq: BaseOffsetT) -> BaseOffsetT: ... @overload -def validate_dtype_freq(dtype, freq: timedelta | str | None) -> BaseOffset: - ... +def validate_dtype_freq(dtype, freq: timedelta | str | None) -> BaseOffset: ... 
def validate_dtype_freq( @@ -1179,12 +1342,7 @@ def dt64arr_to_periodarr( reso = get_unit_from_dtype(data.dtype) freq = Period._maybe_convert_freq(freq) - try: - base = freq._period_dtype_code - except (AttributeError, TypeError): - # AttributeError: _period_dtype_code might not exist - # TypeError: _period_dtype_code might intentionally raise - raise TypeError(f"{freq.name} is not supported as period frequency") + base = freq._period_dtype_code return c_dt64arr_to_periodarr(data.view("i8"), base, tz, reso=reso), freq diff --git a/pandas/core/arrays/sparse/__init__.py b/pandas/core/arrays/sparse/__init__.py index adf83963aca39..93d5cb8cc335a 100644 --- a/pandas/core/arrays/sparse/__init__.py +++ b/pandas/core/arrays/sparse/__init__.py @@ -12,8 +12,8 @@ __all__ = [ "BlockIndex", "IntIndex", - "make_sparse_index", "SparseAccessor", "SparseArray", "SparseFrameAccessor", + "make_sparse_index", ] diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index fc7debb1f31e4..7dde03b30cd6a 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -1,4 +1,5 @@ """Sparse accessor""" + from __future__ import annotations from typing import TYPE_CHECKING @@ -17,6 +18,11 @@ from pandas.core.arrays.sparse.array import SparseArray if TYPE_CHECKING: + from scipy.sparse import ( + coo_matrix, + spmatrix, + ) + from pandas import ( DataFrame, Series, @@ -30,7 +36,7 @@ def __init__(self, data=None) -> None: self._parent = data self._validate(data) - def _validate(self, data): + def _validate(self, data) -> None: raise NotImplementedError @@ -41,6 +47,18 @@ class SparseAccessor(BaseAccessor, PandasDelegate): """ Accessor for sparse data in a Series, including conversion from other sparse matrix data types. + Parameters + ---------- + data : Series or DataFrame + The Series or DataFrame to which the SparseAccessor is attached. + + See Also + -------- + Series.sparse.to_coo : Create a scipy.sparse.coo_matrix from a Series with + MultiIndex. + Series.sparse.from_coo : Create a Series with sparse values from a + scipy.sparse.coo_matrix. + Examples -------- >>> ser = pd.Series([0, 0, 2, 2, 2], dtype="Sparse[int]") @@ -50,7 +68,7 @@ class SparseAccessor(BaseAccessor, PandasDelegate): array([2, 2, 2]) """ - def _validate(self, data): + def _validate(self, data) -> None: if not isinstance(data.dtype, SparseDtype): raise AttributeError(self._validation_msg) @@ -70,9 +88,17 @@ def from_coo(cls, A, dense_index: bool = False) -> Series: """ Create a Series with sparse values from a scipy.sparse.coo_matrix. + This method takes a ``scipy.sparse.coo_matrix`` (coordinate format) as input and + returns a pandas ``Series`` where the non-zero elements are represented as + sparse values. The index of the Series can either include only the coordinates + of non-zero elements (default behavior) or the full sorted set of coordinates + from the matrix if ``dense_index`` is set to `True`. + Parameters ---------- A : scipy.sparse.coo_matrix + The sparse matrix in coordinate format from which the sparse Series + will be created. dense_index : bool, default False If False (default), the index consists of only the coords of the non-null entries of the original coo_matrix. @@ -84,6 +110,12 @@ def from_coo(cls, A, dense_index: bool = False) -> Series: s : Series A Series with sparse values. + See Also + -------- + DataFrame.sparse.from_spmatrix : Create a new DataFrame from a scipy sparse + matrix. + scipy.sparse.coo_matrix : A sparse matrix in COOrdinate format. 
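A quick doctest-style sketch of the ``dense_index`` behavior described above (matrix shape and values chosen arbitrarily): with ``dense_index=True`` the resulting index is the full row/column coordinate product rather than just the stored points.

>>> from scipy import sparse
>>> A = sparse.coo_matrix(([1.0], ([0], [1])), shape=(2, 2))
>>> ser = pd.Series.sparse.from_coo(A, dense_index=True)
>>> len(ser)  # 2 x 2 coordinate product, although only one point is stored
4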
+ Examples -------- >>> from scipy import sparse >>> A = sparse.coo_matrix( ... ([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4) ... ) >>> A - <3x4 sparse matrix of type '<class 'numpy.float64'>' - with 3 stored elements in COOrdinate format> + <COOrdinate sparse matrix of dtype 'float64' + with 3 stored elements and shape (3, 4)> >>> A.todense() matrix([[0., 0., 1., 2.], [3., 0., 0., 0.], [0., 0., 0., 0.]]) @@ -115,7 +147,9 @@ def from_coo(cls, A, dense_index: bool = False) -> Series: return result - def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels: bool = False): + def to_coo( + self, row_levels=(0,), column_levels=(1,), sort_labels: bool = False + ) -> tuple[coo_matrix, list, list]: """ Create a scipy.sparse.coo_matrix from a Series with MultiIndex. @@ -127,7 +161,9 @@ def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels: bool = False) Parameters ---------- row_levels : tuple/list + MultiIndex levels to use for row coordinates, specified by name or index. column_levels : tuple/list + MultiIndex levels to use for column coordinates, specified by name or index. sort_labels : bool, default False Sort the row and column labels before forming the sparse matrix. When `row_levels` and/or `column_levels` refer to a single level, @@ -136,8 +172,16 @@ def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels: bool = False) Returns ------- y : scipy.sparse.coo_matrix + The sparse matrix in coordinate format. rows : list (row labels) + Labels corresponding to the row coordinates. columns : list (column labels) + Labels corresponding to the column coordinates. + + See Also + -------- + Series.sparse.from_coo : Create a Series with sparse values from a + scipy.sparse.coo_matrix. Examples -------- @@ -149,7 +193,7 @@ def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels: bool = False) ... (1, 1, "b", 0), ... (1, 1, "b", 1), ... (2, 1, "b", 0), - ... (2, 1, "b", 1) + ... (2, 1, "b", 1), ... ], ... names=["A", "B", "C", "D"], ... ) @@ -178,8 +222,8 @@ def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels: bool = False) ... row_levels=["A", "B"], column_levels=["C", "D"], sort_labels=True ... ) >>> A - <3x4 sparse matrix of type '<class 'numpy.float64'>' - with 3 stored elements in COOrdinate format> + <COOrdinate sparse matrix of dtype 'float64' + with 3 stored elements and shape (3, 4)> >>> A.todense() matrix([[0., 0., 1., 3.], [3., 0., 0., 0.], [0., 0., 0., 0.]]) @@ -235,15 +279,28 @@ class SparseFrameAccessor(BaseAccessor, PandasDelegate): """ DataFrame accessor for sparse data. + It allows users to interact with a `DataFrame` that contains sparse data types + (`SparseDtype`). It provides methods and attributes to efficiently work with sparse + storage, reducing memory usage while maintaining compatibility with standard pandas + operations. + + Parameters + ---------- + data : DataFrame + The DataFrame to which the SparseFrameAccessor is attached. + + See Also + -------- + DataFrame.sparse.density : Ratio of non-sparse points to total (dense) data points. + Examples -------- - >>> df = pd.DataFrame({"a": [1, 2, 0, 0], - ... "b": [3, 0, 0, 4]}, dtype="Sparse[int]") + >>> df = pd.DataFrame({"a": [1, 2, 0, 0], "b": [3, 0, 0, 4]}, dtype="Sparse[int]") >>> df.sparse.density - 0.5 + np.float64(0.5) """ - def _validate(self, data): + def _validate(self, data) -> None: dtypes = data.dtypes if not all(isinstance(t, SparseDtype) for t in dtypes): raise AttributeError(self._validation_msg) @@ -267,15 +324,20 @@ def from_spmatrix(cls, data, index=None, columns=None) -> DataFrame: Each column of the DataFrame is stored as a :class:`arrays.SparseArray`. + See Also + -------- + DataFrame.sparse.to_coo : Return the contents of the frame as a + sparse SciPy COO matrix. 
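As a rough sketch of the memory model behind ``from_spmatrix`` (values here are arbitrary; ``scipy.sparse.random`` draws a matrix with the requested fraction of stored points), only the stored points of the matrix are materialized, which the ``density`` accessor below reflects:

>>> import scipy.sparse
>>> mat = scipy.sparse.random(10, 2, density=0.25, random_state=0)
>>> df = pd.DataFrame.sparse.from_spmatrix(mat)
>>> df.sparse.density
np.float64(0.25)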
+ Examples -------- >>> import scipy.sparse - >>> mat = scipy.sparse.eye(3, dtype=float) + >>> mat = scipy.sparse.eye(3, dtype=int) >>> pd.DataFrame.sparse.from_spmatrix(mat) 0 1 2 - 0 1.0 0 0 - 1 0 1.0 0 - 2 0 0 1.0 + 0 1 0 0 + 1 0 1 0 + 2 0 0 1 """ from pandas._libs.sparse import IntIndex @@ -292,7 +354,7 @@ def from_spmatrix(cls, data, index=None, columns=None) -> DataFrame: indices = data.indices indptr = data.indptr array_data = data.data - dtype = SparseDtype(array_data.dtype, 0) + dtype = SparseDtype(array_data.dtype) arrays = [] for i in range(n_columns): sl = slice(indptr[i], indptr[i + 1]) @@ -312,6 +374,11 @@ def to_dense(self) -> DataFrame: DataFrame A DataFrame with the same values stored as dense arrays. + See Also + -------- + DataFrame.sparse.density : Ratio of non-sparse points to total + (dense) data points. + Examples -------- >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0])}) @@ -321,12 +388,12 @@ def to_dense(self) -> DataFrame: 1 1 2 0 """ - from pandas import DataFrame - data = {k: v.array.to_dense() for k, v in self._parent.items()} - return DataFrame(data, index=self._parent.index, columns=self._parent.columns) + return self._parent._constructor( + data, index=self._parent.index, columns=self._parent.columns + ) - def to_coo(self): + def to_coo(self) -> spmatrix: """ Return the contents of the frame as a sparse SciPy COO matrix. @@ -336,6 +403,10 @@ def to_coo(self): If the caller is heterogeneous and contains booleans or objects, the result will be of dtype=object. See Notes. + See Also + -------- + DataFrame.sparse.to_dense : Convert a DataFrame with sparse values to dense. + Notes ----- The dtype will be the lowest-common-denominator type (implicit @@ -350,8 +421,8 @@ def to_coo(self): -------- >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0, 1])}) >>> df.sparse.to_coo() - <4x1 sparse matrix of type '<class 'numpy.int64'>' - with 2 stored elements in COOrdinate format> + <COOrdinate sparse matrix of dtype 'int64' + with 2 stored elements and shape (4, 1)> """ import_optional_dependency("scipy") from scipy.sparse import coo_matrix @@ -363,8 +434,6 @@ def to_coo(self): cols, rows, data = [], [], [] for col, (_, ser) in enumerate(self._parent.items()): sp_arr = ser.array - if sp_arr.fill_value != 0: - raise ValueError("fill value must be 0 when converting to COO matrix") row = sp_arr.sp_index.indices cols.append(np.repeat(col, len(row))) @@ -381,11 +450,16 @@ def density(self) -> float: """ Ratio of non-sparse points to total (dense) data points. + See Also + -------- + DataFrame.sparse.from_spmatrix : Create a new DataFrame from a + scipy sparse matrix. 
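``density`` is simply the mean of the per-column ``SparseArray`` densities, which is also why the scalar now renders in NumPy 2 style; a minimal sketch:

>>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0, 1])})
>>> df["A"].array.density  # per-column density is a plain float
0.5
>>> df.sparse.density  # the accessor averages the columns via np.mean
np.float64(0.5)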
+ Examples -------- >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0, 1])}) >>> df.sparse.density - 0.5 + np.float64(0.5) """ tmp = np.mean([column.array.density for _, column in self._parent.items()]) return tmp diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 5db77db2a9c66..137dbb6e4d139 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -1,6 +1,7 @@ """ SparseArray data structure """ + from __future__ import annotations from collections import abc @@ -9,7 +10,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, cast, overload, @@ -18,6 +18,8 @@ import numpy as np +from pandas._config.config import get_option + from pandas._libs import lib import pandas._libs.sparse as splib from pandas._libs.sparse import ( @@ -37,7 +39,6 @@ from pandas.core.dtypes.astype import astype_array from pandas.core.dtypes.cast import ( - construct_1d_arraylike_from_scalar, find_common_type, maybe_box_datetimelike, ) @@ -85,7 +86,10 @@ # See https://github.com/python/typing/issues/684 if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import ( + Callable, + Sequence, + ) from enum import Enum class ellipsis(Enum): @@ -95,10 +99,7 @@ class ellipsis(Enum): from scipy.sparse import spmatrix - from pandas._typing import ( - FillnaOptions, - NumpySorter, - ) + from pandas._typing import NumpySorter SparseIndexKind = Literal["integer", "block"] @@ -288,12 +289,18 @@ class SparseArray(OpsMixin, PandasObject, ExtensionArray): """ An ExtensionArray for storing sparse data. + SparseArray efficiently stores data with a high frequency of a + specific fill value (e.g., zeros), saving memory by only retaining + non-fill elements and their indices. This class is particularly + useful for large datasets where most values are redundant. + Parameters ---------- data : array-like or scalar A dense array of values to store in the SparseArray. This may contain `fill_value`. sparse_index : SparseIndex, optional + Index indicating the locations of sparse elements. fill_value : scalar, optional Elements in data that are ``fill_value`` are not stored in the SparseArray. For memory savings, this should be the most common value @@ -344,6 +351,10 @@ class SparseArray(OpsMixin, PandasObject, ExtensionArray): ------- None + See Also + -------- + SparseDtype : Dtype for sparse data. + Examples -------- >>> from pandas.arrays import SparseArray @@ -399,19 +410,10 @@ def __init__( dtype = dtype.subtype if is_scalar(data): - warnings.warn( - f"Constructing {type(self).__name__} with scalar data is deprecated " - "and will raise in a future version. Pass a sequence instead.", - FutureWarning, - stacklevel=find_stack_level(), + raise TypeError( + f"Cannot construct {type(self).__name__} from scalar data. " + "Pass a sequence instead." 
) - if sparse_index is None: - npoints = 1 - else: - npoints = sparse_index.length - - data = construct_1d_arraylike_from_scalar(data, npoints, dtype=None) - dtype = data.dtype if dtype is not None: dtype = pandas_dtype(dtype) @@ -454,7 +456,8 @@ def __init__( # error: Argument "dtype" to "asarray" has incompatible type # "Union[ExtensionDtype, dtype[Any], None]"; expected "None" sparse_values = np.asarray( - data.sp_values, dtype=dtype # type: ignore[arg-type] + data.sp_values, + dtype=dtype, # type: ignore[arg-type] ) elif sparse_index is None: data = extract_array(data, extract_numpy=True) @@ -551,12 +554,23 @@ def from_spmatrix(cls, data: spmatrix) -> Self: return cls._simple_new(arr, index, dtype) - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: - fill_value = self.fill_value - + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: if self.sp_index.ngaps == 0: # Compat for na dtype and int values. - return self.sp_values + if copy is True: + return np.array(self.sp_values) + else: + return self.sp_values + + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + + fill_value = self.fill_value + if dtype is None: # Can NumPy represent this type? # If not, `np.result_type` will raise. We catch that @@ -584,11 +598,13 @@ def __setitem__(self, key, value) -> None: raise TypeError(msg) @classmethod - def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): + def _from_sequence( + cls, scalars, *, dtype: Dtype | None = None, copy: bool = False + ) -> Self: return cls(scalars, dtype=dtype) @classmethod - def _from_factorized(cls, values, original): + def _from_factorized(cls, values, original) -> Self: return cls(values, dtype=original.dtype) # ------------------------------------------------------------------------ @@ -606,6 +622,18 @@ def sp_values(self) -> np.ndarray: """ An ndarray containing the non- ``fill_value`` values. + This property returns the actual data values stored in the sparse + representation, excluding the values that are equal to the ``fill_value``. + The result is an ndarray of the underlying values, preserving the sparse + structure by omitting the default ``fill_value`` entries. + + See Also + -------- + Series.sparse.to_dense : Convert a Series from sparse values to dense. + Series.sparse.fill_value : Elements in `data` that are `fill_value` are + not stored. + Series.sparse.density : The percent of non- ``fill_value`` points, as decimal. + Examples -------- >>> from pandas.arrays import SparseArray @@ -626,6 +654,12 @@ def fill_value(self): For memory savings, this should be the most common value in the array. + See Also + -------- + SparseDtype : Dtype for data stored in :class:`SparseArray`. + Series.value_counts : Return a Series containing counts of unique values. + Series.fillna : Fill NA/NaN in a Series with a specified value. + Examples -------- >>> ser = pd.Series([0, 0, 2, 2, 2], dtype="Sparse[int]") @@ -665,12 +699,6 @@ def __len__(self) -> int: def _null_fill_value(self) -> bool: return self._dtype._is_na_fill_value - def _fill_value_matches(self, fill_value) -> bool: - if self._null_fill_value: - return isna(fill_value) - else: - return self.fill_value == fill_value - @property def nbytes(self) -> int: return self.sp_values.nbytes + self.sp_index.nbytes @@ -680,6 +708,11 @@ def density(self) -> float: """ The percent of non- ``fill_value`` points, as decimal. 
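Following the ``__array__`` change above, a zero-copy conversion is only honored when the sparse array has no gaps to fill; otherwise it raises. A sketch assuming NumPy 2 semantics, where ``np.asarray(..., copy=False)`` forwards ``copy`` to ``__array__``:

>>> import numpy as np
>>> arr = pd.arrays.SparseArray([0, 1, 0])
>>> np.asarray(arr, copy=False)
Traceback (most recent call last):
...
ValueError: Unable to avoid copy while creating an array as requested.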
+ See Also + -------- + DataFrame.sparse.from_spmatrix : Create a new DataFrame from a + scipy sparse matrix. + Examples -------- >>> from pandas.arrays import SparseArray @@ -694,6 +727,18 @@ def npoints(self) -> int: """ The number of non- ``fill_value`` points. + This property returns the number of elements in the sparse series that are + not equal to the ``fill_value``. Sparse data structures store only the + non-``fill_value`` elements, reducing memory usage when the majority of + values are the same. + + See Also + -------- + Series.sparse.to_dense : Convert a Series from sparse values to dense. + Series.sparse.fill_value : Elements in ``data`` that are ``fill_value`` are + not stored. + Series.sparse.density : The percent of non- ``fill_value`` points, as decimal. + Examples -------- >>> from pandas.arrays import SparseArray @@ -715,17 +760,9 @@ def isna(self) -> Self: # type: ignore[override] mask[self.sp_index.indices] = isna(self.sp_values) return type(self)(mask, fill_value=False, dtype=dtype) - def _pad_or_backfill( # pylint: disable=useless-parent-delegation - self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True - ) -> Self: - # TODO(3.0): We can remove this method once deprecation for fillna method - # keyword is enforced. - return super()._pad_or_backfill(method=method, limit=limit, copy=copy) - def fillna( self, - value=None, - method: FillnaOptions | None = None, + value, limit: int | None = None, copy: bool = True, ) -> Self: @@ -734,17 +771,9 @@ def fillna( Parameters ---------- - value : scalar, optional - method : str, optional - - .. warning:: - - Using 'method' will result in high memory use, - as all `fill_value` methods will be converted to - an in-memory ndarray - + value : scalar limit : int, optional - + Not supported for SparseArray, must be None. copy: bool, default True Ignored for SparseArray. @@ -764,22 +793,15 @@ def fillna( When ``self.fill_value`` is not NA, the result dtype will be ``self.dtype``. Again, this preserves the amount of memory used. """ - if (method is None and value is None) or ( - method is not None and value is not None - ): - raise ValueError("Must specify one of 'method' or 'value'.") - - if method is not None: - return super().fillna(method=method, limit=limit) + if limit is not None: + raise ValueError("limit must be None") + new_values = np.where(isna(self.sp_values), value, self.sp_values) + if self._null_fill_value: + # This is essentially just updating the dtype. + new_dtype = SparseDtype(self.dtype.subtype, fill_value=value) else: - new_values = np.where(isna(self.sp_values), value, self.sp_values) - - if self._null_fill_value: - # This is essentially just updating the dtype. - new_dtype = SparseDtype(self.dtype.subtype, fill_value=value) - else: - new_dtype = self.dtype + new_dtype = self.dtype return self._simple_new(new_values, self._sparse_index, new_dtype) @@ -916,15 +938,13 @@ def value_counts(self, dropna: bool = True) -> Series: # Indexing # -------- @overload - def __getitem__(self, key: ScalarIndexer) -> Any: - ... + def __getitem__(self, key: ScalarIndexer) -> Any: ... @overload def __getitem__( self, key: SequenceIndexer | tuple[int | ellipsis, ...], - ) -> Self: - ... + ) -> Self: ... def __getitem__( self, @@ -1144,8 +1164,9 @@ def searchsorted( side: Literal["left", "right"] = "left", sorter: NumpySorter | None = None, ) -> npt.NDArray[np.intp] | np.intp: - msg = "searchsorted requires high memory usage." 
- warnings.warn(msg, PerformanceWarning, stacklevel=find_stack_level()) + if get_option("performance_warnings"): + msg = "searchsorted requires high memory usage." + warnings.warn(msg, PerformanceWarning, stacklevel=find_stack_level()) v = np.asarray(v) return np.asarray(self, dtype=self.dtype.subtype).searchsorted(v, side, sorter) @@ -1241,7 +1262,7 @@ def astype(self, dtype: AstypeArg | None = None, copy: bool = True): IntIndex Indices: array([2, 3], dtype=int32) - >>> arr.astype(SparseDtype(np.dtype('int32'))) + >>> arr.astype(SparseDtype(np.dtype("int32"))) [0, 0, 1, 2] Fill: 0 IntIndex @@ -1250,7 +1271,7 @@ def astype(self, dtype: AstypeArg | None = None, copy: bool = True): Using a NumPy dtype with a different kind (e.g. float) will coerce just ``self.sp_values``. - >>> arr.astype(SparseDtype(np.dtype('float64'))) + >>> arr.astype(SparseDtype(np.dtype("float64"))) ... # doctest: +NORMALIZE_WHITESPACE [nan, nan, 1.0, 2.0] Fill: nan @@ -1288,7 +1309,7 @@ def astype(self, dtype: AstypeArg | None = None, copy: bool = True): return self._simple_new(sp_values, self.sp_index, dtype) - def map(self, mapper, na_action=None) -> Self: + def map(self, mapper, na_action: Literal["ignore"] | None = None) -> Self: """ Map categories using an input mapping or function. @@ -1649,13 +1670,13 @@ def _argmin_argmax(self, kind: Literal["argmin", "argmax"]) -> int: def argmax(self, skipna: bool = True) -> int: validate_bool_kwarg(skipna, "skipna") if not skipna and self._hasna: - raise NotImplementedError + raise ValueError("Encountered an NA value with skipna=False") return self._argmin_argmax("argmax") def argmin(self, skipna: bool = True) -> int: validate_bool_kwarg(skipna, "skipna") if not skipna and self._hasna: - raise NotImplementedError + raise ValueError("Encountered an NA value with skipna=False") return self._argmin_argmax("argmin") # ------------------------------------------------------------------------ @@ -1830,7 +1851,9 @@ def __repr__(self) -> str: pp_index = printing.pprint_thing(self.sp_index) return f"{pp_str}\nFill: {pp_fill}\n{pp_index}" - def _formatter(self, boxed: bool = False): + # error: Return type "None" of "_formatter" incompatible with return + # type "Callable[[Any], str | None]" in supertype "ExtensionArray" + def _formatter(self, boxed: bool = False) -> None: # type: ignore[override] # Defer to the formatter from the GenericArrayFormatter calling us. # This will infer the correct formatter from the dtype of the values. return None @@ -1899,13 +1922,11 @@ def _make_sparse( @overload -def make_sparse_index(length: int, indices, kind: Literal["block"]) -> BlockIndex: - ... +def make_sparse_index(length: int, indices, kind: Literal["block"]) -> BlockIndex: ... @overload -def make_sparse_index(length: int, indices, kind: Literal["integer"]) -> IntIndex: - ... +def make_sparse_index(length: int, indices, kind: Literal["integer"]) -> IntIndex: ... def make_sparse_index(length: int, indices, kind: SparseIndexKind) -> SparseIndex: diff --git a/pandas/core/arrays/sparse/scipy_sparse.py b/pandas/core/arrays/sparse/scipy_sparse.py index 71b71a9779da5..d4ef3003583c3 100644 --- a/pandas/core/arrays/sparse/scipy_sparse.py +++ b/pandas/core/arrays/sparse/scipy_sparse.py @@ -3,6 +3,7 @@ Currently only includes to_coo helpers. 
""" + from __future__ import annotations from typing import TYPE_CHECKING @@ -27,7 +28,7 @@ ) -def _check_is_partition(parts: Iterable, whole: Iterable): +def _check_is_partition(parts: Iterable, whole: Iterable) -> None: whole = set(whole) parts = [set(x) for x in parts] if set.intersection(*parts) != set(): @@ -78,7 +79,7 @@ def _levels_to_axis( ax_coords = codes[valid_ilocs] ax_labels = ax_labels.tolist() - return ax_coords, ax_labels + return ax_coords, ax_labels # pyright: ignore[reportReturnType] def _to_ijv( diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 00197a150fb97..8048306df91a2 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -1,14 +1,21 @@ from __future__ import annotations +from functools import partial +import operator from typing import ( TYPE_CHECKING, - ClassVar, + Any, Literal, + cast, ) +import warnings import numpy as np -from pandas._config import get_option +from pandas._config import ( + get_option, + using_string_dtype, +) from pandas._libs import ( lib, @@ -16,9 +23,16 @@ ) from pandas._libs.arrays import NDArrayBacked from pandas._libs.lib import ensure_string_array -from pandas.compat import pa_version_under10p1 +from pandas.compat import ( + HAS_PYARROW, + pa_version_under10p1, +) from pandas.compat.numpy import function as nv -from pandas.util._decorators import doc +from pandas.util._decorators import ( + doc, + set_module, +) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.base import ( ExtensionDtype, @@ -34,7 +48,12 @@ pandas_dtype, ) -from pandas.core import ops +from pandas.core import ( + missing, + nanops, + ops, +) +from pandas.core.algorithms import isin from pandas.core.array_algos import masked_reductions from pandas.core.arrays.base import ExtensionArray from pandas.core.arrays.floating import ( @@ -50,10 +69,13 @@ from pandas.core.indexers import check_array_indexer from pandas.core.missing import isna +from pandas.io.formats import printing + if TYPE_CHECKING: import pyarrow from pandas._typing import ( + ArrayLike, AxisInt, Dtype, DtypeObj, @@ -68,6 +90,7 @@ from pandas import Series +@set_module("pandas") @register_extension_dtype class StringDtype(StorageExtensionDtype): """ @@ -80,8 +103,10 @@ class StringDtype(StorageExtensionDtype): Parameters ---------- - storage : {"python", "pyarrow", "pyarrow_numpy"}, optional + storage : {"python", "pyarrow"}, optional If not given, the value of ``pd.options.mode.string_storage``. + na_value : {np.nan, pd.NA}, default pd.NA + Whether the dtype follows NaN or NA missing value semantics. Attributes ---------- @@ -91,47 +116,114 @@ class StringDtype(StorageExtensionDtype): ------- None + See Also + -------- + BooleanDtype : Extension dtype for boolean data. + Examples -------- >>> pd.StringDtype() - string[python] + )> >>> pd.StringDtype(storage="pyarrow") - string[pyarrow] + )> """ - # error: Cannot override instance variable (previously declared on - # base class "StorageExtensionDtype") with class variable - name: ClassVar[str] = "string" # type: ignore[misc] + @property + def name(self) -> str: # type: ignore[override] + if self._na_value is libmissing.NA: + return "string" + else: + return "str" #: StringDtype().na_value uses pandas.NA except the implementation that # follows NumPy semantics, which uses nan. 
@property def na_value(self) -> libmissing.NAType | float: # type: ignore[override] - if self.storage == "pyarrow_numpy": - return np.nan - else: - return libmissing.NA + return self._na_value - _metadata = ("storage",) + _metadata = ("storage", "_na_value") # type: ignore[assignment] - def __init__(self, storage=None) -> None: + def __init__( + self, + storage: str | None = None, + na_value: libmissing.NAType | float = libmissing.NA, + ) -> None: + # infer defaults if storage is None: - infer_string = get_option("future.infer_string") - if infer_string: - storage = "pyarrow_numpy" + if na_value is not libmissing.NA: + storage = get_option("mode.string_storage") + if storage == "auto": + if HAS_PYARROW: + storage = "pyarrow" + else: + storage = "python" else: storage = get_option("mode.string_storage") - if storage not in {"python", "pyarrow", "pyarrow_numpy"}: + if storage == "auto": + storage = "python" + + if storage == "pyarrow_numpy": + warnings.warn( + "The 'pyarrow_numpy' storage option name is deprecated and will be " + 'removed in pandas 3.0. Use \'pd.StringDtype(storage="pyarrow", ' + "na_value=np.nan)' to construct the same dtype.\nOr enable the " + "'pd.options.future.infer_string = True' option globally and use " + 'the "str" alias as a shorthand notation to specify a dtype ' + '(instead of "string[pyarrow_numpy]").', + FutureWarning, + stacklevel=find_stack_level(), + ) + storage = "pyarrow" + na_value = np.nan + + # validate options + if storage not in {"python", "pyarrow"}: raise ValueError( - f"Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'. " - f"Got {storage} instead." + f"Storage must be 'python' or 'pyarrow'. Got {storage} instead." ) - if storage in ("pyarrow", "pyarrow_numpy") and pa_version_under10p1: + if storage == "pyarrow" and pa_version_under10p1: raise ImportError( "pyarrow>=10.0.1 is required for PyArrow backed StringArray." ) - self.storage = storage + + if isinstance(na_value, float) and np.isnan(na_value): + # when passed a NaN value, always set to np.nan to ensure we use + # a consistent NaN value (and we can use `dtype.na_value is np.nan`) + na_value = np.nan + elif na_value is not libmissing.NA: + raise ValueError(f"'na_value' must be np.nan or pd.NA, got {na_value}") + + self.storage = cast(str, storage) + self._na_value = na_value + + def __repr__(self) -> str: + storage = "" if self.storage == "pyarrow" else "storage='python', " + return f"<StringDtype({storage}na_value={self._na_value})>" + + def __eq__(self, other: object) -> bool: + # we need to override the base class __eq__ because na_value (NA or NaN) + # cannot be checked with normal `==` + if isinstance(other, str): + # TODO should dtype == "string" work for the NaN variant? 
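# Sketch of the comparison semantics implemented below: two instances must
# match on both storage and na_value, while the "string" alias still matches
# the NA variant.
#
# >>> pd.StringDtype("python", na_value=float("nan")) == pd.StringDtype("python")
# False
# >>> pd.StringDtype("python") == "string"
# True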
+ if other == "string" or other == self.name: + return True + try: + other = self.construct_from_string(other) + except (TypeError, ImportError): + # TypeError if `other` is not a valid string for StringDtype + # ImportError if pyarrow is not installed for "string[pyarrow]" + return False + if isinstance(other, type(self)): + return self.storage == other.storage and self.na_value is other.na_value + return False + + def __hash__(self) -> int: + # need to override __hash__ as well because of overriding __eq__ + return super().__hash__() + + def __reduce__(self): + return StringDtype, (self.storage, self.na_value) @property def type(self) -> type[str]: @@ -171,11 +263,14 @@ def construct_from_string(cls, string) -> Self: ) if string == "string": return cls() + elif string == "str" and using_string_dtype(): + return cls(na_value=np.nan) elif string == "string[python]": return cls(storage="python") elif string == "string[pyarrow]": return cls(storage="pyarrow") elif string == "string[pyarrow_numpy]": + # this is deprecated in the dtype __init__, remove this in pandas 3.0 return cls(storage="pyarrow_numpy") else: raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") @@ -198,13 +293,43 @@ def construct_array_type( # type: ignore[override] ArrowStringArrayNumpySemantics, ) - if self.storage == "python": + if self.storage == "python" and self._na_value is libmissing.NA: return StringArray - elif self.storage == "pyarrow": + elif self.storage == "pyarrow" and self._na_value is libmissing.NA: return ArrowStringArray + elif self.storage == "python": + return StringArrayNumpySemantics else: return ArrowStringArrayNumpySemantics + def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: + storages = set() + na_values = set() + + for dtype in dtypes: + if isinstance(dtype, StringDtype): + storages.add(dtype.storage) + na_values.add(dtype.na_value) + elif isinstance(dtype, np.dtype) and dtype.kind in ("U", "T"): + continue + else: + return None + + if len(storages) == 2: + # if both python and pyarrow storage -> priority to pyarrow + storage = "pyarrow" + else: + storage = next(iter(storages)) # type: ignore[assignment] + + na_value: libmissing.NAType | float + if len(na_values) == 2: + # if both NaN and NA -> priority to NA + na_value = libmissing.NA + else: + na_value = next(iter(na_values)) + + return StringDtype(storage=storage, na_value=na_value) + def __from_arrow__( self, array: pyarrow.Array | pyarrow.ChunkedArray ) -> BaseStringArray: @@ -212,13 +337,17 @@ def __from_arrow__( Construct StringArray from pyarrow Array/ChunkedArray. 
""" if self.storage == "pyarrow": - from pandas.core.arrays.string_arrow import ArrowStringArray + if self._na_value is libmissing.NA: + from pandas.core.arrays.string_arrow import ArrowStringArray - return ArrowStringArray(array) - elif self.storage == "pyarrow_numpy": - from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics + return ArrowStringArray(array) + else: + from pandas.core.arrays.string_arrow import ( + ArrowStringArrayNumpySemantics, + ) + + return ArrowStringArrayNumpySemantics(array) - return ArrowStringArrayNumpySemantics(array) else: import pyarrow @@ -233,7 +362,7 @@ def __from_arrow__( # convert chunk by chunk to numpy and concatenate then, to avoid # overflow for large string data when concatenating the pyarrow arrays arr = arr.to_numpy(zero_copy_only=False) - arr = ensure_string_array(arr, na_value=libmissing.NA) + arr = ensure_string_array(arr, na_value=self.na_value) results.append(arr) if len(chunks) == 0: @@ -243,11 +372,7 @@ def __from_arrow__( # Bypass validation inside StringArray constructor, see GH#47781 new_string_array = StringArray.__new__(StringArray) - NDArrayBacked.__init__( - new_string_array, - arr, - StringDtype(storage="python"), - ) + NDArrayBacked.__init__(new_string_array, arr, self) return new_string_array @@ -256,8 +381,10 @@ class BaseStringArray(ExtensionArray): Mixin class for StringArray, ArrowStringArray. """ + dtype: StringDtype + @doc(ExtensionArray.tolist) - def tolist(self): + def tolist(self) -> list: if self.ndim > 1: return [x.tolist() for x in self] return list(self.to_numpy()) @@ -269,6 +396,146 @@ def _from_scalars(cls, scalars, dtype: DtypeObj) -> Self: raise ValueError return cls._from_sequence(scalars, dtype=dtype) + def _formatter(self, boxed: bool = False): + formatter = partial( + printing.pprint_thing, + escape_chars=("\t", "\r", "\n"), + quote_strings=not boxed, + ) + return formatter + + def _str_map( + self, + f, + na_value=lib.no_default, + dtype: Dtype | None = None, + convert: bool = True, + ): + if self.dtype.na_value is np.nan: + return self._str_map_nan_semantics(f, na_value=na_value, dtype=dtype) + + from pandas.arrays import BooleanArray + + if dtype is None: + dtype = self.dtype + if na_value is lib.no_default: + na_value = self.dtype.na_value + + mask = isna(self) + arr = np.asarray(self) + + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + constructor: type[IntegerArray | BooleanArray] + if is_integer_dtype(dtype): + constructor = IntegerArray + else: + constructor = BooleanArray + + na_value_is_na = isna(na_value) + if na_value_is_na: + na_value = 1 + elif dtype == np.dtype("bool"): + # GH#55736 + na_value = bool(na_value) + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + # error: Argument 1 to "dtype" has incompatible type + # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected + # "Type[object]" + dtype=np.dtype(cast(type, dtype)), + ) + + if not na_value_is_na: + mask[:] = False + + return constructor(result, mask) + + else: + return self._str_map_str_or_object(dtype, na_value, arr, f, mask) + + def _str_map_str_or_object( + self, + dtype, + na_value, + arr: np.ndarray, + f, + mask: npt.NDArray[np.bool_], + ): + # _str_map helper for case where dtype is either string dtype or object + if is_string_dtype(dtype) and not is_object_dtype(dtype): + # i.e. 
StringDtype + result = lib.map_infer_mask( + arr, f, mask.view("uint8"), convert=False, na_value=na_value + ) + if self.dtype.storage == "pyarrow": + import pyarrow as pa + + result = pa.array( + result, mask=mask, type=pa.large_string(), from_pandas=True + ) + # error: Too many arguments for "BaseStringArray" + return type(self)(result) # type: ignore[call-arg] + + else: + # This is when the result type is object. We reach this when + # -> We know the result type is truly object (e.g. .encode returns bytes + # or .findall returns a list). + # -> We don't know the result type. E.g. `.get` can return anything. + return lib.map_infer_mask(arr, f, mask.view("uint8")) + + def _str_map_nan_semantics( + self, f, na_value=lib.no_default, dtype: Dtype | None = None + ): + if dtype is None: + dtype = self.dtype + if na_value is lib.no_default: + if is_bool_dtype(dtype): + # NaN propagates as False + na_value = False + else: + na_value = self.dtype.na_value + + mask = isna(self) + arr = np.asarray(self) + + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + na_value_is_na = isna(na_value) + if na_value_is_na: + if is_integer_dtype(dtype): + na_value = 0 + else: + # NaN propagates as False + na_value = False + + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + dtype=np.dtype(cast(type, dtype)), + ) + if na_value_is_na and is_integer_dtype(dtype) and mask.any(): + # TODO: we could alternatively do this check before map_infer_mask + # and adjust the dtype/na_value we pass there. Which is more + # performant? + result = result.astype("float64") + result[mask] = np.nan + + return result + + else: + return self._str_map_str_or_object(dtype, na_value, arr, f, mask) + + def view(self, dtype: Dtype | None = None) -> ArrayLike: + if dtype is not None: + raise TypeError("Cannot change data-type for string array.") + return super().view(dtype=dtype) + # error: Definition of "_concat_same_type" in base class "NDArrayBacked" is # incompatible with definition in base class "ExtensionArray" @@ -314,7 +581,7 @@ class StringArray(BaseStringArray, NumpyExtensionArray): # type: ignore[misc] See Also -------- - :func:`pandas.array` + :func:`array` The recommended function for creating a StringArray. Series.str The string methods are available on Series backed by @@ -326,7 +593,7 @@ class StringArray(BaseStringArray, NumpyExtensionArray): # type: ignore[misc] Examples -------- - >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string") + >>> pd.array(["This is", "some text", None, "data."], dtype="string") ['This is', 'some text', , 'data.'] Length: 4, dtype: string @@ -334,11 +601,11 @@ class StringArray(BaseStringArray, NumpyExtensionArray): # type: ignore[misc] Unlike arrays instantiated with ``dtype="object"``, ``StringArray`` will convert the values to strings. 
- >>> pd.array(['1', 1], dtype="object") + >>> pd.array(["1", 1], dtype="object") ['1', 1] Length: 2, dtype: object - >>> pd.array(['1', 1], dtype="string") + >>> pd.array(["1", 1], dtype="string") ['1', '1'] Length: 2, dtype: string @@ -355,6 +622,8 @@ class StringArray(BaseStringArray, NumpyExtensionArray): # type: ignore[misc] # undo the NumpyExtensionArray hack _typ = "extension" + _storage = "python" + _na_value: libmissing.NAType | float = libmissing.NA def __init__(self, values, copy: bool = False) -> None: values = extract_array(values) @@ -362,9 +631,13 @@ def __init__(self, values, copy: bool = False) -> None: super().__init__(values, copy=copy) if not isinstance(values, type(self)): self._validate() - NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python")) + NDArrayBacked.__init__( + self, + self._ndarray, + StringDtype(storage=self._storage, na_value=self._na_value), + ) - def _validate(self): + def _validate(self) -> None: """Validate that we only store NA or strings.""" if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): raise ValueError("StringArray requires a sequence of strings or pandas.NA") @@ -380,20 +653,39 @@ def _validate(self): else: lib.convert_nans_to_NA(self._ndarray) + def _validate_scalar(self, value): + # used by NDArrayBackedExtensionIndex.insert + if isna(value): + return self.dtype.na_value + elif not isinstance(value, str): + raise TypeError( + f"Invalid value '{value}' for dtype '{self.dtype}'. Value should be a " + f"string or missing value, got '{type(value).__name__}' instead." + ) + return value + @classmethod - def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): + def _from_sequence( + cls, scalars, *, dtype: Dtype | None = None, copy: bool = False + ) -> Self: if dtype and not (isinstance(dtype, str) and dtype == "string"): dtype = pandas_dtype(dtype) assert isinstance(dtype, StringDtype) and dtype.storage == "python" + else: + if using_string_dtype(): + dtype = StringDtype(storage="python", na_value=np.nan) + else: + dtype = StringDtype(storage="python") from pandas.core.arrays.masked import BaseMaskedArray + na_value = dtype.na_value if isinstance(scalars, BaseMaskedArray): # avoid costly conversion to object dtype na_values = scalars._mask result = scalars._data result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) - result[na_values] = libmissing.NA + result[na_values] = na_value else: if lib.is_pyarrow_array(scalars): @@ -402,19 +694,19 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal # zero_copy_only to True which caused problems see GH#52076 scalars = np.array(scalars) # convert non-na-likes to str, and nan-likes to StringDtype().na_value - result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy) + result = lib.ensure_string_array(scalars, na_value=na_value, copy=copy) # Manually creating new array avoids the validation step in the __init__, so is # faster. Refactor need for validation? 
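# Doctest-style sketch of the stricter validation above (error text
# abbreviated): NA-likes are normalized to the dtype's na_value, while
# non-strings now raise a dtype-aware TypeError.
#
# >>> arr = pd.array(["a", "b"], dtype="string")
# >>> arr[0] = 1
# TypeError: Invalid value '1' for dtype 'string'. Value should be a string
# or missing value, got 'int' instead.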
new_string_array = cls.__new__(cls) - NDArrayBacked.__init__(new_string_array, result, StringDtype(storage="python")) + NDArrayBacked.__init__(new_string_array, result, dtype) return new_string_array @classmethod def _from_sequence_of_strings( - cls, strings, *, dtype: Dtype | None = None, copy: bool = False - ): + cls, strings, *, dtype: ExtensionDtype, copy: bool = False + ) -> Self: return cls._from_sequence(strings, dtype=dtype, copy=copy) @classmethod @@ -436,42 +728,57 @@ def __arrow_array__(self, type=None): values[self.isna()] = None return pa.array(values, type=type, from_pandas=True) - def _values_for_factorize(self): - arr = self._ndarray.copy() - mask = self.isna() - arr[mask] = None - return arr, None + def _values_for_factorize(self) -> tuple[np.ndarray, libmissing.NAType | float]: # type: ignore[override] + arr = self._ndarray - def __setitem__(self, key, value) -> None: - value = extract_array(value, extract_numpy=True) - if isinstance(value, type(self)): - # extract_array doesn't extract NumpyExtensionArray subclasses - value = value._ndarray + return arr, self.dtype.na_value - key = check_array_indexer(self, key) - scalar_key = lib.is_scalar(key) - scalar_value = lib.is_scalar(value) - if scalar_key and not scalar_value: - raise ValueError("setting an array element with a sequence.") - - # validate new items - if scalar_value: + def _maybe_convert_setitem_value(self, value): + """Maybe convert value to be pyarrow compatible.""" + if lib.is_scalar(value): if isna(value): - value = libmissing.NA + value = self.dtype.na_value elif not isinstance(value, str): raise TypeError( - f"Cannot set non-string value '{value}' into a StringArray." + f"Invalid value '{value}' for dtype '{self.dtype}'. Value should " + f"be a string or missing value, got '{type(value).__name__}' " + "instead." ) else: + value = extract_array(value, extract_numpy=True) if not is_array_like(value): value = np.asarray(value, dtype=object) + elif isinstance(value.dtype, type(self.dtype)): + return value + else: + # cast categories and friends to arrays to see if values are + # compatible, compatibility with arrow backed strings + value = np.asarray(value) if len(value) and not lib.is_string_array(value, skipna=True): - raise TypeError("Must provide strings.") + raise TypeError( + "Invalid value for dtype 'str'. Value should be a " + "string or missing value (or array of those)." 
+ ) + return value + + def __setitem__(self, key, value) -> None: + value = self._maybe_convert_setitem_value(value) + + key = check_array_indexer(self, key) + scalar_key = lib.is_scalar(key) + scalar_value = lib.is_scalar(value) + if scalar_key and not scalar_value: + raise ValueError("setting an array element with a sequence.") - mask = isna(value) - if mask.any(): - value = value.copy() - value[isna(value)] = libmissing.NA + if not scalar_value: + if value.dtype == self.dtype: + value = value._ndarray + else: + value = np.asarray(value) + mask = isna(value) + if mask.any(): + value = value.copy() + value[isna(value)] = self.dtype.na_value super().__setitem__(key, value) @@ -481,6 +788,30 @@ def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None: # base class implementation that uses __setitem__ ExtensionArray._putmask(self, mask, value) + def _where(self, mask: npt.NDArray[np.bool_], value) -> Self: + # the super() method NDArrayBackedExtensionArray._where uses + # np.putmask which doesn't properly handle None/pd.NA, so using the + # base class implementation that uses __setitem__ + return ExtensionArray._where(self, mask, value) + + def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: + if isinstance(values, BaseStringArray) or ( + isinstance(values, ExtensionArray) and is_string_dtype(values.dtype) + ): + values = values.astype(self.dtype, copy=False) + else: + if not lib.is_string_array(np.asarray(values), skipna=True): + values = np.array( + [val for val in values if isinstance(val, str) or isna(val)], + dtype=object, + ) + if not len(values): + return np.zeros(self.shape, dtype=bool) + + values = self._from_sequence(values, dtype=self.dtype) + + return isin(np.asarray(self), np.asarray(values)) + def astype(self, dtype, copy: bool = True): dtype = pandas_dtype(dtype) @@ -496,10 +827,10 @@ def astype(self, dtype, copy: bool = True): values = arr.astype(dtype.numpy_dtype) return IntegerArray(values, mask, copy=False) elif isinstance(dtype, FloatingDtype): - arr = self.copy() + arr_ea = self.copy() mask = self.isna() - arr[mask] = "0" - values = arr.astype(dtype.numpy_dtype) + arr_ea[mask] = "0" + values = arr_ea.astype(dtype.numpy_dtype) return FloatingArray(values, mask, copy=False) elif isinstance(dtype, ExtensionDtype): # Skip the NumpyExtensionArray.astype method @@ -515,13 +846,116 @@ def astype(self, dtype, copy: bool = True): return super().astype(dtype, copy) def _reduce( - self, name: str, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs + self, + name: str, + *, + skipna: bool = True, + keepdims: bool = False, + axis: AxisInt | None = 0, + **kwargs, ): - if name in ["min", "max"]: - return getattr(self, name)(skipna=skipna, axis=axis) + if self.dtype.na_value is np.nan and name in ["any", "all"]: + if name == "any": + return nanops.nanany(self._ndarray, skipna=skipna) + else: + return nanops.nanall(self._ndarray, skipna=skipna) + + if name in ["min", "max", "argmin", "argmax", "sum"]: + result = getattr(self, name)(skipna=skipna, axis=axis, **kwargs) + if keepdims: + return self._from_sequence([result], dtype=self.dtype) + return result raise TypeError(f"Cannot perform reduction '{name}' with string dtype") + def _accumulate(self, name: str, *, skipna: bool = True, **kwargs) -> StringArray: + """ + Return an ExtensionArray performing an accumulation operation. + + The underlying data type might change. 
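A small doctest-style sketch of the accumulation semantics implemented below, together with the new ``sum`` reduction (both concatenate non-missing strings; ``cumprod`` raises ``TypeError``; output formatting abbreviated):

>>> ser = pd.Series(["a", None, "c"], dtype="string")
>>> ser.cumsum()
0       a
1    <NA>
2      ac
dtype: string
>>> ser.sum()
'ac'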
+ + Parameters + ---------- + name : str + Name of the function, supported values are: + - cummin + - cummax + - cumsum + - cumprod + skipna : bool, default True + If True, skip NA values. + **kwargs + Additional keyword arguments passed to the accumulation function. + Currently, there is no supported kwarg. + + Returns + ------- + array + + Raises + ------ + NotImplementedError : subclass does not define accumulations + """ + if name == "cumprod": + msg = f"operation '{name}' not supported for dtype '{self.dtype}'" + raise TypeError(msg) + + # We may need to strip out trailing NA values + tail: np.ndarray | None = None + na_mask: np.ndarray | None = None + ndarray = self._ndarray + np_func = { + "cumsum": np.cumsum, + "cummin": np.minimum.accumulate, + "cummax": np.maximum.accumulate, + }[name] + + if self._hasna: + na_mask = cast("npt.NDArray[np.bool_]", isna(ndarray)) + if np.all(na_mask): + return type(self)(ndarray) + if skipna: + if name == "cumsum": + ndarray = np.where(na_mask, "", ndarray) + else: + # We can retain the running min/max by forward/backward filling. + ndarray = ndarray.copy() + missing.pad_or_backfill_inplace( + ndarray, + method="pad", + axis=0, + ) + missing.pad_or_backfill_inplace( + ndarray, + method="backfill", + axis=0, + ) + else: + # When not skipping NA values, the result should be null from + # the first NA value onward. + idx = np.argmax(na_mask) + tail = np.empty(len(ndarray) - idx, dtype="object") + tail[:] = self.dtype.na_value + ndarray = ndarray[:idx] + + # mypy: Cannot call function of unknown type + np_result = np_func(ndarray) # type: ignore[operator] + + if tail is not None: + np_result = np.hstack((np_result, tail)) + elif na_mask is not None: + # Argument 2 to "where" has incompatible type "NAType | float" + np_result = np.where(na_mask, self.dtype.na_value, np_result) # type: ignore[arg-type] + + result = type(self)(np_result) + return result + + def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any: + if self.dtype.na_value is np.nan and result is libmissing.NA: + # the masked_reductions use pd.NA -> convert to np.nan + return np.nan + return super()._wrap_reduction_result(axis, result) + def min(self, axis=None, skipna: bool = True, **kwargs) -> Scalar: nv.validate_min((), kwargs) result = masked_reductions.min( @@ -536,11 +970,28 @@ def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar: ) return self._wrap_reduction_result(axis, result) + def sum( + self, + *, + axis: AxisInt | None = None, + skipna: bool = True, + min_count: int = 0, + **kwargs, + ) -> Scalar: + nv.validate_sum((), kwargs) + result = masked_reductions.sum( + values=self._ndarray, mask=self.isna(), skipna=skipna + ) + return self._wrap_reduction_result(axis, result) + def value_counts(self, dropna: bool = True) -> Series: from pandas.core.algorithms import value_counts_internal as value_counts - result = value_counts(self._ndarray, dropna=dropna).astype("Int64") + result = value_counts(self._ndarray, sort=False, dropna=dropna) result.index = result.index.astype(self.dtype) + + if self.dtype.na_value is libmissing.NA: + result = result.astype("Int64") return result def memory_usage(self, deep: bool = False) -> int: @@ -564,7 +1015,30 @@ def searchsorted( return super().searchsorted(value=value, side=side, sorter=sorter) def _cmp_method(self, other, op): - from pandas.arrays import BooleanArray + from pandas.arrays import ( + ArrowExtensionArray, + BooleanArray, + ) + + if ( + isinstance(other, BaseStringArray) + and self.dtype.na_value is not 
libmissing.NA + and other.dtype.na_value is libmissing.NA + ): + # NA has priority of NaN semantics + return NotImplemented + + if isinstance(other, ArrowExtensionArray): + if isinstance(other, BaseStringArray): + # pyarrow storage has priority over python storage + # (except if we have NA semantics and other not) + if not ( + self.dtype.na_value is libmissing.NA + and other.dtype.na_value is not libmissing.NA + ): + return NotImplemented + else: + return NotImplemented if isinstance(other, StringArray): other = other._ndarray @@ -579,79 +1053,52 @@ def _cmp_method(self, other, op): f"Lengths of operands do not match: {len(self)} != {len(other)}" ) - other = np.asarray(other) + # for array-likes, first filter out NAs before converting to numpy + if not is_array_like(other): + other = np.asarray(other) other = other[valid] if op.__name__ in ops.ARITHMETIC_BINOPS: result = np.empty_like(self._ndarray, dtype="object") - result[mask] = libmissing.NA + result[mask] = self.dtype.na_value result[valid] = op(self._ndarray[valid], other) - return StringArray(result) + return self._from_backing_data(result) else: # logical result = np.zeros(len(self._ndarray), dtype="bool") result[valid] = op(self._ndarray[valid], other) - return BooleanArray(result, mask) + res_arr = BooleanArray(result, mask) + if self.dtype.na_value is np.nan: + if op == operator.ne: + return res_arr.to_numpy(np.bool_, na_value=True) + else: + return res_arr.to_numpy(np.bool_, na_value=False) + return res_arr _arith_method = _cmp_method - # ------------------------------------------------------------------------ - # String methods interface - # error: Incompatible types in assignment (expression has type "NAType", - # base class "NumpyExtensionArray" defined the type as "float") - _str_na_value = libmissing.NA # type: ignore[assignment] - def _str_map( - self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True - ): - from pandas.arrays import BooleanArray +class StringArrayNumpySemantics(StringArray): + _storage = "python" + _na_value = np.nan - if dtype is None: - dtype = StringDtype(storage="python") - if na_value is None: - na_value = self.dtype.na_value - - mask = isna(self) - arr = np.asarray(self) - - if is_integer_dtype(dtype) or is_bool_dtype(dtype): - constructor: type[IntegerArray | BooleanArray] - if is_integer_dtype(dtype): - constructor = IntegerArray - else: - constructor = BooleanArray - - na_value_is_na = isna(na_value) - if na_value_is_na: - na_value = 1 - elif dtype == np.dtype("bool"): - na_value = bool(na_value) - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - # error: Argument 1 to "dtype" has incompatible type - # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected - # "Type[object]" - dtype=np.dtype(dtype), # type: ignore[arg-type] + def _validate(self) -> None: + """Validate that we only store NaN or strings.""" + if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): + raise ValueError( + "StringArrayNumpySemantics requires a sequence of strings or NaN" ) - - if not na_value_is_na: - mask[:] = False - - return constructor(result, mask) - - elif is_string_dtype(dtype) and not is_object_dtype(dtype): - # i.e. StringDtype - result = lib.map_infer_mask( - arr, f, mask.view("uint8"), convert=False, na_value=na_value + if self._ndarray.dtype != "object": + raise ValueError( + "StringArrayNumpySemantics requires a sequence of strings or NaN. Got " + f"'{self._ndarray.dtype}' dtype instead." 
) - return StringArray(result) - else: - # This is when the result type is object. We reach this when - # -> We know the result type is truly object (e.g. .encode returns bytes - # or .findall returns a list). - # -> We don't know the result type. E.g. `.get` can return anything. - return lib.map_infer_mask(arr, f, mask.view("uint8")) + # TODO validate or force NA/None to NaN + + @classmethod + def _from_sequence( + cls, scalars, *, dtype: Dtype | None = None, copy: bool = False + ) -> Self: + if dtype is None: + dtype = StringDtype(storage="python", na_value=np.nan) + return super()._from_sequence(scalars, dtype=dtype, copy=copy) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index d5a76811a12e6..9668981df827b 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,11 +1,9 @@ from __future__ import annotations -from functools import partial import operator import re from typing import ( TYPE_CHECKING, - Callable, Union, ) import warnings @@ -19,15 +17,12 @@ from pandas.compat import ( pa_version_under10p1, pa_version_under13p0, + pa_version_under16p0, ) from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( - is_bool_dtype, - is_integer_dtype, - is_object_dtype, is_scalar, - is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.missing import isna @@ -35,33 +30,36 @@ from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin from pandas.core.arrays.arrow import ArrowExtensionArray from pandas.core.arrays.boolean import BooleanDtype +from pandas.core.arrays.floating import Float64Dtype from pandas.core.arrays.integer import Int64Dtype from pandas.core.arrays.numeric import NumericDtype from pandas.core.arrays.string_ import ( BaseStringArray, StringDtype, ) -from pandas.core.ops import invalid_comparison from pandas.core.strings.object_array import ObjectStringArrayMixin if not pa_version_under10p1: import pyarrow as pa import pyarrow.compute as pc - from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning - if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import ( + Callable, + Sequence, + ) from pandas._typing import ( ArrayLike, - AxisInt, Dtype, - Scalar, + NpDtype, + Self, npt, ) + from pandas.core.dtypes.dtypes import ExtensionDtype + from pandas import Series @@ -74,6 +72,10 @@ def _chk_pyarrow_available() -> None: raise ImportError(msg) +def _is_string_view(typ): + return not pa_version_under16p0 and pa.types.is_string_view(typ) + + # TODO: Inherit directly from BaseStringArrayMethods. Currently we inherit from # ObjectStringArrayMixin because we want to have the object-dtype based methods as # fallback for the ones that pyarrow doesn't yet support @@ -103,7 +105,7 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr See Also -------- - :func:`pandas.array` + :func:`array` The recommended function for creating a ArrowStringArray. 
Series.str The string methods are available on Series backed by @@ -115,7 +117,7 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr Examples -------- - >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string[pyarrow]") + >>> pd.array(["This is", "some text", None, "data."], dtype="string[pyarrow]") ['This is', 'some text', , 'data.'] Length: 4, dtype: string @@ -125,21 +127,28 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr # base class "ArrowExtensionArray" defined the type as "ArrowDtype") _dtype: StringDtype # type: ignore[assignment] _storage = "pyarrow" + _na_value: libmissing.NAType | float = libmissing.NA def __init__(self, values) -> None: _chk_pyarrow_available() - if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_string( - values.type + if isinstance(values, (pa.Array, pa.ChunkedArray)) and ( + pa.types.is_string(values.type) + or _is_string_view(values.type) + or ( + pa.types.is_dictionary(values.type) + and ( + pa.types.is_string(values.type.value_type) + or pa.types.is_large_string(values.type.value_type) + or _is_string_view(values.type.value_type) + ) + ) ): values = pc.cast(values, pa.large_string()) super().__init__(values) - self._dtype = StringDtype(storage=self._storage) + self._dtype = StringDtype(storage=self._storage, na_value=self._na_value) - if not pa.types.is_large_string(self._pa_array.type) and not ( - pa.types.is_dictionary(self._pa_array.type) - and pa.types.is_large_string(self._pa_array.type.value_type) - ): + if not pa.types.is_large_string(self._pa_array.type): raise ValueError( "ArrowStringArray requires a PyArrow (chunked) array of " "large_string type" @@ -172,17 +181,16 @@ def __len__(self) -> int: return len(self._pa_array) @classmethod - def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): + def _from_sequence( + cls, scalars, *, dtype: Dtype | None = None, copy: bool = False + ) -> Self: from pandas.core.arrays.masked import BaseMaskedArray _chk_pyarrow_available() if dtype and not (isinstance(dtype, str) and dtype == "string"): dtype = pandas_dtype(dtype) - assert isinstance(dtype, StringDtype) and dtype.storage in ( - "pyarrow", - "pyarrow_numpy", - ) + assert isinstance(dtype, StringDtype) and dtype.storage == "pyarrow" if isinstance(scalars, BaseMaskedArray): # avoid costly conversion to object dtype in ensure_string_array and @@ -190,18 +198,18 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal na_values = scalars._mask result = scalars._data result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) - return cls(pa.array(result, mask=na_values, type=pa.string())) + return cls(pa.array(result, mask=na_values, type=pa.large_string())) elif isinstance(scalars, (pa.Array, pa.ChunkedArray)): - return cls(pc.cast(scalars, pa.string())) + return cls(pc.cast(scalars, pa.large_string())) # convert non-na-likes to str result = lib.ensure_string_array(scalars, copy=copy) - return cls(pa.array(result, type=pa.string(), from_pandas=True)) + return cls(pa.array(result, type=pa.large_string(), from_pandas=True)) @classmethod def _from_sequence_of_strings( - cls, strings, dtype: Dtype | None = None, copy: bool = False - ): + cls, strings, *, dtype: ExtensionDtype, copy: bool = False + ) -> Self: return cls._from_sequence(strings, dtype=dtype, copy=copy) @property @@ -212,12 +220,36 @@ def dtype(self) -> StringDtype: # type: ignore[override] return self._dtype def insert(self, loc: 
int, item) -> ArrowStringArray: + if self.dtype.na_value is np.nan and item is np.nan: + item = libmissing.NA if not isinstance(item, str) and item is not libmissing.NA: - raise TypeError("Scalar must be NA or str") + raise TypeError( + f"Invalid value '{item}' for dtype 'str'. Value should be a " + f"string or missing value, got '{type(item).__name__}' instead." + ) return super().insert(loc, item) - @classmethod - def _result_converter(cls, values, na=None): + def _convert_bool_result(self, values, na=lib.no_default, method_name=None): + if na is not lib.no_default and not isna(na) and not isinstance(na, bool): + # GH#59561 + warnings.warn( + f"Allowing a non-bool 'na' in obj.str.{method_name} is deprecated " + "and will raise in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) + na = bool(na) + + if self.dtype.na_value is np.nan: + if na is lib.no_default or isna(na): + # NaN propagates as False + values = values.fill_null(False) + else: + values = values.fill_null(na) + return values.to_numpy() + else: + if na is not lib.no_default and not isna(na): # pyright: ignore [reportGeneralTypeIssues] + values = values.fill_null(na) return BooleanDtype().__from_arrow__(values) def _maybe_convert_setitem_value(self, value): @@ -226,30 +258,36 @@ def _maybe_convert_setitem_value(self, value): if isna(value): value = None elif not isinstance(value, str): - raise TypeError("Scalar must be NA or str") + raise TypeError( + f"Invalid value '{value}' for dtype 'str'. Value should be a " + f"string or missing value, got '{type(value).__name__}' instead." + ) else: value = np.array(value, dtype=object, copy=True) value[isna(value)] = None for v in value: if not (v is None or isinstance(v, str)): - raise TypeError("Scalar must be NA or str") + raise TypeError( + "Invalid value for dtype 'str'. Value should be a " + "string or missing value (or array of those)." + ) return super()._maybe_convert_setitem_value(value) def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: value_set = [ pa_scalar.as_py() for pa_scalar in [pa.scalar(value, from_pandas=True) for value in values] - if pa_scalar.type in (pa.string(), pa.null()) + if pa_scalar.type in (pa.string(), pa.null(), pa.large_string()) ] # short-circuit to return all False array. - if not len(value_set): + if not value_set: return np.zeros(len(self), dtype=bool) result = pc.is_in( self._pa_array, value_set=pa.array(value_set, type=self._pa_array.type) ) - # pyarrow 2.0.0 returned nulls, so we explicily specify dtype to convert nulls + # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls # to False return np.array(result, dtype=np.bool_) @@ -268,137 +306,51 @@ def astype(self, dtype, copy: bool = True): return super().astype(dtype, copy=copy) - @property - def _data(self): - # dask accesses ._data directlys - warnings.warn( - f"{type(self).__name__}._data is a deprecated and will be removed " - "in a future version, use ._pa_array instead", - FutureWarning, - stacklevel=find_stack_level(), - ) - return self._pa_array - # ------------------------------------------------------------------------ # String methods interface - # error: Incompatible types in assignment (expression has type "NAType", - # base class "ObjectStringArrayMixin" defined the type as "float") - _str_na_value = libmissing.NA # type: ignore[assignment] - - def _str_map( - self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True - ): - # TODO: de-duplicate with StringArray method. 
This method is moreless copy and - # paste. - - from pandas.arrays import ( - BooleanArray, - IntegerArray, - ) - - if dtype is None: - dtype = self.dtype - if na_value is None: - na_value = self.dtype.na_value - - mask = isna(self) - arr = np.asarray(self) - - if is_integer_dtype(dtype) or is_bool_dtype(dtype): - constructor: type[IntegerArray | BooleanArray] - if is_integer_dtype(dtype): - constructor = IntegerArray - else: - constructor = BooleanArray - - na_value_is_na = isna(na_value) - if na_value_is_na: - na_value = 1 - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - # error: Argument 1 to "dtype" has incompatible type - # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected - # "Type[object]" - dtype=np.dtype(dtype), # type: ignore[arg-type] - ) - - if not na_value_is_na: - mask[:] = False - - return constructor(result, mask) - - elif is_string_dtype(dtype) and not is_object_dtype(dtype): - # i.e. StringDtype - result = lib.map_infer_mask( - arr, f, mask.view("uint8"), convert=False, na_value=na_value - ) - result = pa.array(result, mask=mask, type=pa.string(), from_pandas=True) - return type(self)(result) - else: - # This is when the result type is object. We reach this when - # -> We know the result type is truly object (e.g. .encode returns bytes - # or .findall returns a list). - # -> We don't know the result type. E.g. `.get` can return anything. - return lib.map_infer_mask(arr, f, mask.view("uint8")) + _str_isalnum = ArrowStringArrayMixin._str_isalnum + _str_isalpha = ArrowStringArrayMixin._str_isalpha + _str_isdecimal = ArrowStringArrayMixin._str_isdecimal + _str_isdigit = ArrowStringArrayMixin._str_isdigit + _str_islower = ArrowStringArrayMixin._str_islower + _str_isnumeric = ArrowStringArrayMixin._str_isnumeric + _str_isspace = ArrowStringArrayMixin._str_isspace + _str_istitle = ArrowStringArrayMixin._str_istitle + _str_isupper = ArrowStringArrayMixin._str_isupper + + _str_map = BaseStringArray._str_map + _str_startswith = ArrowStringArrayMixin._str_startswith + _str_endswith = ArrowStringArrayMixin._str_endswith + _str_pad = ArrowStringArrayMixin._str_pad + _str_match = ArrowStringArrayMixin._str_match + _str_fullmatch = ArrowStringArrayMixin._str_fullmatch + _str_lower = ArrowStringArrayMixin._str_lower + _str_upper = ArrowStringArrayMixin._str_upper + _str_strip = ArrowStringArrayMixin._str_strip + _str_lstrip = ArrowStringArrayMixin._str_lstrip + _str_rstrip = ArrowStringArrayMixin._str_rstrip + _str_removesuffix = ArrowStringArrayMixin._str_removesuffix + _str_get = ArrowStringArrayMixin._str_get + _str_capitalize = ArrowStringArrayMixin._str_capitalize + _str_title = ArrowStringArrayMixin._str_title + _str_swapcase = ArrowStringArrayMixin._str_swapcase + _str_slice_replace = ArrowStringArrayMixin._str_slice_replace + _str_len = ArrowStringArrayMixin._str_len + _str_slice = ArrowStringArrayMixin._str_slice def _str_contains( - self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True + self, + pat, + case: bool = True, + flags: int = 0, + na=lib.no_default, + regex: bool = True, ): if flags: - fallback_performancewarning() return super()._str_contains(pat, case, flags, na, regex) - if regex: - result = pc.match_substring_regex(self._pa_array, pat, ignore_case=not case) - else: - result = pc.match_substring(self._pa_array, pat, ignore_case=not case) - result = self._result_converter(result, na=na) - if not isna(na): - result[isna(result)] = bool(na) - return result - - def 
_str_startswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): - if isinstance(pat, str): - result = pc.starts_with(self._pa_array, pattern=pat) - else: - if len(pat) == 0: - # mimic existing behaviour of string extension array - # and python string method - result = pa.array( - np.zeros(len(self._pa_array), dtype=bool), mask=isna(self._pa_array) - ) - else: - result = pc.starts_with(self._pa_array, pattern=pat[0]) - - for p in pat[1:]: - result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) - if not isna(na): - result = result.fill_null(na) - return self._result_converter(result) - - def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): - if isinstance(pat, str): - result = pc.ends_with(self._pa_array, pattern=pat) - else: - if len(pat) == 0: - # mimic existing behaviour of string extension array - # and python string method - result = pa.array( - np.zeros(len(self._pa_array), dtype=bool), mask=isna(self._pa_array) - ) - else: - result = pc.ends_with(self._pa_array, pattern=pat[0]) - - for p in pat[1:]: - result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) - if not isna(na): - result = result.fill_null(na) - return self._result_converter(result) + return ArrowStringArrayMixin._str_contains(self, pat, case, flags, na, regex) def _str_replace( self, @@ -410,306 +362,136 @@ def _str_replace( regex: bool = True, ): if isinstance(pat, re.Pattern) or callable(repl) or not case or flags: - fallback_performancewarning() return super()._str_replace(pat, repl, n, case, flags, regex) - func = pc.replace_substring_regex if regex else pc.replace_substring - result = func(self._pa_array, pattern=pat, replacement=repl, max_replacements=n) - return type(self)(result) + return ArrowStringArrayMixin._str_replace( + self, pat, repl, n, case, flags, regex + ) def _str_repeat(self, repeats: int | Sequence[int]): if not isinstance(repeats, int): return super()._str_repeat(repeats) else: - return type(self)(pc.binary_repeat(self._pa_array, repeats)) - - def _str_match( - self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None - ): - if not pat.startswith("^"): - pat = f"^{pat}" - return self._str_contains(pat, case, flags, na, regex=True) - - def _str_fullmatch( - self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None - ): - if not pat.endswith("$") or pat.endswith("//$"): - pat = f"{pat}$" - return self._str_match(pat, case, flags, na) - - def _str_slice( - self, start: int | None = None, stop: int | None = None, step: int | None = None - ): - if stop is None: - return super()._str_slice(start, stop, step) - if start is None: - start = 0 - if step is None: - step = 1 - return type(self)( - pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) - ) - - def _str_isalnum(self): - result = pc.utf8_is_alnum(self._pa_array) - return self._result_converter(result) - - def _str_isalpha(self): - result = pc.utf8_is_alpha(self._pa_array) - return self._result_converter(result) - - def _str_isdecimal(self): - result = pc.utf8_is_decimal(self._pa_array) - return self._result_converter(result) - - def _str_isdigit(self): - result = pc.utf8_is_digit(self._pa_array) - return self._result_converter(result) - - def _str_islower(self): - result = pc.utf8_is_lower(self._pa_array) - return self._result_converter(result) - - def _str_isnumeric(self): - result = pc.utf8_is_numeric(self._pa_array) - return self._result_converter(result) - - def _str_isspace(self): - result = pc.utf8_is_space(self._pa_array) - 
return self._result_converter(result) - - def _str_istitle(self): - result = pc.utf8_is_title(self._pa_array) - return self._result_converter(result) - - def _str_isupper(self): - result = pc.utf8_is_upper(self._pa_array) - return self._result_converter(result) - - def _str_len(self): - result = pc.utf8_length(self._pa_array) - return self._convert_int_dtype(result) - - def _str_lower(self): - return type(self)(pc.utf8_lower(self._pa_array)) - - def _str_upper(self): - return type(self)(pc.utf8_upper(self._pa_array)) - - def _str_strip(self, to_strip=None): - if to_strip is None: - result = pc.utf8_trim_whitespace(self._pa_array) - else: - result = pc.utf8_trim(self._pa_array, characters=to_strip) - return type(self)(result) - - def _str_lstrip(self, to_strip=None): - if to_strip is None: - result = pc.utf8_ltrim_whitespace(self._pa_array) - else: - result = pc.utf8_ltrim(self._pa_array, characters=to_strip) - return type(self)(result) - - def _str_rstrip(self, to_strip=None): - if to_strip is None: - result = pc.utf8_rtrim_whitespace(self._pa_array) - else: - result = pc.utf8_rtrim(self._pa_array, characters=to_strip) - return type(self)(result) + return ArrowExtensionArray._str_repeat(self, repeats=repeats) def _str_removeprefix(self, prefix: str): if not pa_version_under13p0: - starts_with = pc.starts_with(self._pa_array, pattern=prefix) - removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) - result = pc.if_else(starts_with, removed, self._pa_array) - return type(self)(result) + return ArrowStringArrayMixin._str_removeprefix(self, prefix) return super()._str_removeprefix(prefix) - def _str_removesuffix(self, suffix: str): - ends_with = pc.ends_with(self._pa_array, pattern=suffix) - removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) - result = pc.if_else(ends_with, removed, self._pa_array) - return type(self)(result) - def _str_count(self, pat: str, flags: int = 0): if flags: return super()._str_count(pat, flags) result = pc.count_substring_regex(self._pa_array, pat) - return self._convert_int_dtype(result) + return self._convert_int_result(result) def _str_find(self, sub: str, start: int = 0, end: int | None = None): - if start != 0 and end is not None: - slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) - result = pc.find_substring(slices, sub) - not_found = pc.equal(result, -1) - offset_result = pc.add(result, end - start) - result = pc.if_else(not_found, result, offset_result) - elif start == 0 and end is None: - slices = self._pa_array - result = pc.find_substring(slices, sub) - else: + if ( + pa_version_under13p0 + and not (start != 0 and end is not None) + and not (start == 0 and end is None) + ): + # GH#59562 return super()._str_find(sub, start, end) - return self._convert_int_dtype(result) + return ArrowStringArrayMixin._str_find(self, sub, start, end) - def _str_get_dummies(self, sep: str = "|"): - dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies(sep) + def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): + if dtype is None: + dtype = np.int64 + dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies( + sep, dtype + ) if len(labels) == 0: - return np.empty(shape=(0, 0), dtype=np.int64), labels + return np.empty(shape=(0, 0), dtype=dtype), labels dummies = np.vstack(dummies_pa.to_numpy()) - return dummies.astype(np.int64, copy=False), labels + _dtype = pandas_dtype(dtype) + dummies_dtype: NpDtype + if isinstance(_dtype, np.dtype): + dummies_dtype = _dtype + else: + 
dummies_dtype = np.bool_ + return dummies.astype(dummies_dtype, copy=False), labels + + def _convert_int_result(self, result): + if self.dtype.na_value is np.nan: + if isinstance(result, pa.Array): + result = result.to_numpy(zero_copy_only=False) + else: + result = result.to_numpy() + if result.dtype == np.int32: + result = result.astype(np.int64) + return result - def _convert_int_dtype(self, result): return Int64Dtype().__from_arrow__(result) + def _convert_rank_result(self, result): + if self.dtype.na_value is np.nan: + if isinstance(result, pa.Array): + result = result.to_numpy(zero_copy_only=False) + else: + result = result.to_numpy() + return result.astype("float64", copy=False) + + return Float64Dtype().__from_arrow__(result) + def _reduce( self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs ): - result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs) + if self.dtype.na_value is np.nan and name in ["any", "all"]: + if not skipna: + nas = pc.is_null(self._pa_array) + arr = pc.or_kleene(nas, pc.not_equal(self._pa_array, "")) + else: + arr = pc.not_equal(self._pa_array, "") + result = ArrowExtensionArray(arr)._reduce( + name, skipna=skipna, keepdims=keepdims, **kwargs + ) + if keepdims: + # ArrowExtensionArray will return a length-1 bool[pyarrow] array + return result.astype(np.bool_) + return result + + if name in ("min", "max", "sum", "argmin", "argmax"): + result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs) + else: + raise TypeError(f"Cannot perform reduction '{name}' with string dtype") + if name in ("argmin", "argmax") and isinstance(result, pa.Array): - return self._convert_int_dtype(result) + return self._convert_int_result(result) elif isinstance(result, pa.Array): return type(self)(result) else: return result - def _rank( - self, - *, - axis: AxisInt = 0, - method: str = "average", - na_option: str = "keep", - ascending: bool = True, - pct: bool = False, - ): - """ - See Series.rank.__doc__. 
- """ - return self._convert_int_dtype( - self._rank_calc( - axis=axis, - method=method, - na_option=na_option, - ascending=ascending, - pct=pct, + def value_counts(self, dropna: bool = True) -> Series: + result = super().value_counts(dropna=dropna) + if self.dtype.na_value is np.nan: + res_values = result._values.to_numpy() + return result._constructor( + res_values, index=result.index, name=result.name, copy=False ) - ) - - -class ArrowStringArrayNumpySemantics(ArrowStringArray): - _storage = "pyarrow_numpy" + return result - @classmethod - def _result_converter(cls, values, na=None): - if not isna(na): - values = values.fill_null(bool(na)) - return ArrowExtensionArray(values).to_numpy(na_value=np.nan) - - def __getattribute__(self, item): - # ArrowStringArray and we both inherit from ArrowExtensionArray, which - # creates inheritance problems (Diamond inheritance) - if item in ArrowStringArrayMixin.__dict__ and item not in ( - "_pa_array", - "__dict__", + def _cmp_method(self, other, op): + if ( + isinstance(other, (BaseStringArray, ArrowExtensionArray)) + and self.dtype.na_value is not libmissing.NA + and other.dtype.na_value is libmissing.NA ): - return partial(getattr(ArrowStringArrayMixin, item), self) - return super().__getattribute__(item) + # NA has priority of NaN semantics + return NotImplemented - def _str_map( - self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True - ): - if dtype is None: - dtype = self.dtype - if na_value is None: - na_value = self.dtype.na_value - - mask = isna(self) - arr = np.asarray(self) - - if is_integer_dtype(dtype) or is_bool_dtype(dtype): - if is_integer_dtype(dtype): - na_value = np.nan + result = super()._cmp_method(other, op) + if self.dtype.na_value is np.nan: + if op == operator.ne: + return result.to_numpy(np.bool_, na_value=True) else: - na_value = False - try: - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - dtype=np.dtype(dtype), # type: ignore[arg-type] - ) - return result - - except ValueError: - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - ) - if convert and result.dtype == object: - result = lib.maybe_convert_objects(result) - return result - - elif is_string_dtype(dtype) and not is_object_dtype(dtype): - # i.e. StringDtype - result = lib.map_infer_mask( - arr, f, mask.view("uint8"), convert=False, na_value=na_value - ) - result = pa.array(result, mask=mask, type=pa.string(), from_pandas=True) - return type(self)(result) - else: - # This is when the result type is object. We reach this when - # -> We know the result type is truly object (e.g. .encode returns bytes - # or .findall returns a list). - # -> We don't know the result type. E.g. `.get` can return anything. 
- return lib.map_infer_mask(arr, f, mask.view("uint8")) - - def _convert_int_dtype(self, result): - if isinstance(result, pa.Array): - result = result.to_numpy(zero_copy_only=False) - else: - result = result.to_numpy() - if result.dtype == np.int32: - result = result.astype(np.int64) + return result.to_numpy(np.bool_, na_value=False) return result - def _cmp_method(self, other, op): - try: - result = super()._cmp_method(other, op) - except pa.ArrowNotImplementedError: - return invalid_comparison(self, other, op) - if op == operator.ne: - return result.to_numpy(np.bool_, na_value=True) - else: - return result.to_numpy(np.bool_, na_value=False) + def __pos__(self) -> Self: + raise TypeError(f"bad operand type for unary +: '{self.dtype}'") - def value_counts(self, dropna: bool = True) -> Series: - from pandas import Series - result = super().value_counts(dropna) - return Series( - result._values.to_numpy(), index=result.index, name=result.name, copy=False - ) - - def _reduce( - self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs - ): - if name in ["any", "all"]: - if not skipna and name == "all": - nas = pc.invert(pc.is_null(self._pa_array)) - arr = pc.and_kleene(nas, pc.not_equal(self._pa_array, "")) - else: - arr = pc.not_equal(self._pa_array, "") - return ArrowExtensionArray(arr)._reduce( - name, skipna=skipna, keepdims=keepdims, **kwargs - ) - else: - return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs) - - def insert(self, loc: int, item) -> ArrowStringArrayNumpySemantics: - if item is np.nan: - item = libmissing.NA - return super().insert(loc, item) # type: ignore[return-value] +class ArrowStringArrayNumpySemantics(ArrowStringArray): + _na_value = np.nan diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 1b885a2bdcd47..9012b9f36348a 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -113,12 +113,12 @@ class TimedeltaArray(dtl.TimelikeOps): Parameters ---------- - values : array-like + data : array-like The timedelta data. - dtype : numpy.dtype Currently, only ``numpy.dtype("timedelta64[ns]")`` is accepted. freq : Offset, optional + Frequency of the data. copy : bool, default False Whether to copy the underlying array of data. @@ -130,9 +130,15 @@ class TimedeltaArray(dtl.TimelikeOps): ------- None + See Also + -------- + Timedelta : Represents a duration, the difference between two dates or times. + TimedeltaIndex : Immutable Index of timedelta64 data. + to_timedelta : Convert argument to timedelta. 
+ Examples -------- - >>> pd.arrays.TimedeltaArray._from_sequence(pd.TimedeltaIndex(['1h', '2h'])) + >>> pd.arrays.TimedeltaArray._from_sequence(pd.TimedeltaIndex(["1h", "2h"])) ['0 days 01:00:00', '0 days 02:00:00'] Length: 2, dtype: timedelta64[ns] @@ -152,9 +158,8 @@ def _scalar_type(self) -> type[Timedelta]: # define my properties & methods for delegation _other_ops: list[str] = [] _bool_ops: list[str] = [] - _object_ops: list[str] = ["freq"] _field_ops: list[str] = ["days", "seconds", "microseconds", "nanoseconds"] - _datetimelike_ops: list[str] = _field_ops + _object_ops + _bool_ops + ["unit"] + _datetimelike_ops: list[str] = _field_ops + _bool_ops + ["unit", "freq"] _datetimelike_methods: list[str] = [ "to_pytimedelta", "total_seconds", @@ -196,7 +201,6 @@ def dtype(self) -> np.dtype[np.timedelta64]: # type: ignore[override] # Constructors _freq = None - _default_dtype = TD64NS_DTYPE # used in TimeLikeOps.__init__ @classmethod def _validate_dtype(cls, values, dtype): @@ -321,9 +325,9 @@ def _unbox_scalar(self, value) -> np.timedelta64: raise ValueError("'value' should be a Timedelta.") self._check_compatible_with(value) if value is NaT: - return np.timedelta64(value._value, self.unit) + return np.timedelta64(value._value, self.unit) # type: ignore[call-overload] else: - return value.as_unit(self.unit).asm8 + return value.as_unit(self.unit, round_ok=False).asm8 def _scalar_from_string(self, value) -> Timedelta | NaTType: return Timedelta(value) @@ -468,6 +472,10 @@ def __mul__(self, other) -> Self: if is_scalar(other): # numpy will accept float and int, raise TypeError for others result = self._ndarray * other + if result.dtype.kind != "m": + # numpy >= 2.1 may not raise a TypeError + # and seems to dispatch to others.__rmul__? + raise TypeError(f"Cannot multiply with {type(other).__name__}") freq = None if self.freq is not None and not isna(other): freq = self.freq * other @@ -495,6 +503,10 @@ def __mul__(self, other) -> Self: # numpy will accept float or int dtype, raise TypeError for others result = self._ndarray * other + if result.dtype.kind != "m": + # numpy >= 2.1 may not raise a TypeError + # and seems to dispatch to others.__rmul__? + raise TypeError(f"Cannot multiply with {type(other).__name__}") return type(self)._simple_new(result, dtype=result.dtype) __rmul__ = __mul__ @@ -747,7 +759,7 @@ def total_seconds(self) -> npt.NDArray[np.float64]: -------- **Series** - >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit='d')) + >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit="D")) >>> s 0 0 days 1 1 days @@ -766,7 +778,7 @@ def total_seconds(self) -> npt.NDArray[np.float64]: **TimedeltaIndex** - >>> idx = pd.to_timedelta(np.arange(5), unit='d') + >>> idx = pd.to_timedelta(np.arange(5), unit="D") >>> idx TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], dtype='timedelta64[ns]', freq=None) @@ -784,27 +796,54 @@ def to_pytimedelta(self) -> npt.NDArray[np.object_]: Returns ------- numpy.ndarray + A NumPy ``timedelta64`` object representing the same duration as the + original pandas ``Timedelta`` object. The precision of the resulting + object is in nanoseconds, which is the default + time resolution used by pandas for ``Timedelta`` objects, ensuring + high precision for time-based calculations. + + See Also + -------- + to_timedelta : Convert argument to timedelta format. + Timedelta : Represents a duration between two dates or times. + DatetimeIndex: Index of datetime64 data. 
+    Timedelta.components : Return a components namedtuple-like
+        of a single timedelta.

    Examples
    --------
-    >>> tdelta_idx = pd.to_timedelta([1, 2, 3], unit='D')
+    >>> tdelta_idx = pd.to_timedelta([1, 2, 3], unit="D")
    >>> tdelta_idx
    TimedeltaIndex(['1 days', '2 days', '3 days'],
                    dtype='timedelta64[ns]', freq=None)
    >>> tdelta_idx.to_pytimedelta()
    array([datetime.timedelta(days=1), datetime.timedelta(days=2),
           datetime.timedelta(days=3)], dtype=object)
+
+    >>> tidx = pd.TimedeltaIndex(data=["1 days 02:30:45", "3 days 04:15:10"])
+    >>> tidx
+    TimedeltaIndex(['1 days 02:30:45', '3 days 04:15:10'],
+                   dtype='timedelta64[ns]', freq=None)
+    >>> tidx.to_pytimedelta()
+    array([datetime.timedelta(days=1, seconds=9045),
+           datetime.timedelta(days=3, seconds=15310)], dtype=object)
    """
    return ints_to_pytimedelta(self._ndarray)

    days_docstring = textwrap.dedent(
        """Number of days for each element.

+    See Also
+    --------
+    Series.dt.seconds : Return number of seconds for each element.
+    Series.dt.microseconds : Return number of microseconds for each element.
+    Series.dt.nanoseconds : Return number of nanoseconds for each element.
+
    Examples
    --------
    For Series:

-    >>> ser = pd.Series(pd.to_timedelta([1, 2, 3], unit='d'))
+    >>> ser = pd.Series(pd.to_timedelta([1, 2, 3], unit='D'))
    >>> ser
    0   1 days
    1   2 days
@@ -830,6 +869,11 @@ def to_pytimedelta(self) -> npt.NDArray[np.object_]:
    seconds_docstring = textwrap.dedent(
        """Number of seconds (>= 0 and less than 1 day) for each element.

+    See Also
+    --------
+    Series.dt.days : Return number of days for each element.
+    Series.dt.nanoseconds : Return number of nanoseconds for each element.
+
    Examples
    --------
    For Series:
@@ -864,6 +908,12 @@ def to_pytimedelta(self) -> npt.NDArray[np.object_]:
    microseconds_docstring = textwrap.dedent(
        """Number of microseconds (>= 0 and less than 1 second) for each element.

+    See Also
+    --------
+    Timedelta.microseconds : Number of microseconds (>= 0 and less than 1 second).
+    datetime.timedelta.microseconds : Number of microseconds (>= 0 and less
+        than 1 second) of a datetime.timedelta.
+
    Examples
    --------
    For Series:
@@ -899,6 +949,11 @@ def to_pytimedelta(self) -> npt.NDArray[np.object_]:
    nanoseconds_docstring = textwrap.dedent(
        """Number of nanoseconds (>= 0 and less than 1 microsecond) for each element.

+    See Also
+    --------
+    Series.dt.seconds : Return number of seconds for each element.
+    Series.dt.microseconds : Return number of microseconds for each element.
+
    Examples
    --------
    For Series:
@@ -943,9 +998,15 @@ def components(self) -> DataFrame:
    -------
    DataFrame

+    See Also
+    --------
+    TimedeltaIndex.total_seconds : Return total duration expressed in seconds.
+    Timedelta.components : Return a components namedtuple-like of a single
+        timedelta.
+
    Examples
    --------
-    >>> tdelta_idx = pd.to_timedelta(['1 day 3 min 2 us 42 ns'])
+    >>> tdelta_idx = pd.to_timedelta(["1 day 3 min 2 us 42 ns"])
    >>> tdelta_idx
    TimedeltaIndex(['1 days 00:03:00.000002042'],
                   dtype='timedelta64[ns]', freq=None)
@@ -1072,7 +1133,10 @@ def sequence_to_td64ns(
        # This includes datetime64-dtype, see GH#23539, GH#29794
        raise TypeError(f"dtype {data.dtype} cannot be converted to timedelta64[ns]")

-    data = np.array(data, copy=copy)
+    if not copy:
+        data = np.asarray(data)
+    else:
+        data = np.array(data, copy=copy)

    assert data.dtype.kind == "m"
    assert data.dtype != "m8"  # i.e.
not unit-less @@ -1080,7 +1144,7 @@ def sequence_to_td64ns( return data, inferred_freq -def _ints_to_td64ns(data, unit: str = "ns"): +def _ints_to_td64ns(data, unit: str = "ns") -> tuple[np.ndarray, bool]: """ Convert an ndarray with integer-dtype to timedelta64[ns] dtype, treating the integers as multiples of the given timedelta unit. @@ -1120,7 +1184,9 @@ def _ints_to_td64ns(data, unit: str = "ns"): return data, copy_made -def _objects_to_td64ns(data, unit=None, errors: DateTimeErrorChoices = "raise"): +def _objects_to_td64ns( + data, unit=None, errors: DateTimeErrorChoices = "raise" +) -> np.ndarray: """ Convert a object-dtyped or string-dtyped array into an timedelta64[ns]-dtyped array. @@ -1150,7 +1216,7 @@ def _objects_to_td64ns(data, unit=None, errors: DateTimeErrorChoices = "raise"): higher level. """ # coerce Index to np.ndarray, converting string-dtype if necessary - values = np.array(data, dtype=np.object_, copy=False) + values = np.asarray(data, dtype=np.object_) result = array_to_timedelta64(values, unit=unit, errors=errors) return result.view("timedelta64[ns]") diff --git a/pandas/core/base.py b/pandas/core/base.py index e98f1157572bb..6cc28d4e46634 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -14,12 +14,9 @@ final, overload, ) -import warnings import numpy as np -from pandas._config import using_copy_on_write - from pandas._libs import lib from pandas._typing import ( AxisInt, @@ -37,7 +34,6 @@ cache_readonly, doc, ) -from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.cast import can_hold_element from pandas.core.dtypes.common import ( @@ -48,6 +44,7 @@ from pandas.core.dtypes.generic import ( ABCDataFrame, ABCIndex, + ABCMultiIndex, ABCSeries, ) from pandas.core.dtypes.missing import ( @@ -89,12 +86,6 @@ _shared_docs: dict[str, str] = {} -_indexops_doc_kwargs = { - "klass": "IndexOpsMixin", - "inplace": "", - "unique": "IndexOpsMixin", - "duplicated": "IndexOpsMixin", -} class PandasObject(DirNamesMixin): @@ -106,7 +97,7 @@ class PandasObject(DirNamesMixin): _cache: dict[str, Any] @property - def _constructor(self): + def _constructor(self) -> type[Self]: """ Class constructor (for this class it's just `__class__`). """ @@ -137,7 +128,7 @@ def __sizeof__(self) -> int: """ memory_usage = getattr(self, "memory_usage", None) if memory_usage: - mem = memory_usage(deep=True) # pylint: disable=not-callable + mem = memory_usage(deep=True) return int(mem if is_scalar(mem) else mem.sum()) # no memory_usage attribute, so fall back to object's 'sizeof' @@ -218,7 +209,7 @@ def _obj_with_exclusions(self): return self.obj if self._selection is not None: - return self.obj._getitem_nocopy(self._selection_list) + return self.obj[self._selection_list] if len(self.exclusions) > 0: # equivalent to `self.obj.drop(self.exclusions, axis=1) @@ -319,6 +310,10 @@ def transpose(self, *args, **kwargs) -> Self: doc=""" Return the transpose, which is by definition self. + See Also + -------- + Index : Immutable sequence used for indexing and alignment. + Examples -------- For Series: @@ -348,6 +343,12 @@ def shape(self) -> Shape: """ Return a tuple of the shape of the underlying data. + See Also + -------- + Series.ndim : Number of dimensions of the underlying data. + Series.size : Return the number of elements in the underlying data. + Series.nbytes : Return the number of bytes in the underlying data. 
+ Examples -------- >>> s = pd.Series([1, 2, 3]) @@ -360,14 +361,24 @@ def __len__(self) -> int: # We need this defined here for mypy raise AbstractMethodError(self) + # Temporarily avoid using `-> Literal[1]:` because of an IPython (jedi) bug + # https://github.com/ipython/ipython/issues/14412 + # https://github.com/davidhalter/jedi/issues/1990 @property - def ndim(self) -> Literal[1]: + def ndim(self) -> int: """ Number of dimensions of the underlying data, by definition 1. + See Also + -------- + Series.size: Return the number of elements in the underlying data. + Series.shape: Return a tuple of the shape of the underlying data. + Series.dtype: Return the dtype object of the underlying data. + Series.values: Return Series as ndarray or ndarray-like depending on the dtype. + Examples -------- - >>> s = pd.Series(['Ant', 'Bear', 'Cow']) + >>> s = pd.Series(["Ant", "Bear", "Cow"]) >>> s 0 Ant 1 Bear @@ -401,6 +412,11 @@ def item(self): ValueError If the data is not length = 1. + See Also + -------- + Index.values : Returns an array representing the data in the Index. + Series.head : Returns the first `n` rows. + Examples -------- >>> s = pd.Series([1]) @@ -409,7 +425,7 @@ def item(self): For an index: - >>> s = pd.Series([1], index=['a']) + >>> s = pd.Series([1], index=["a"]) >>> s.index.item() 'a' """ @@ -422,11 +438,16 @@ def nbytes(self) -> int: """ Return the number of bytes in the underlying data. + See Also + -------- + Series.ndim : Number of dimensions of the underlying data. + Series.size : Return the number of elements in the underlying data. + Examples -------- For Series: - >>> s = pd.Series(['Ant', 'Bear', 'Cow']) + >>> s = pd.Series(["Ant", "Bear", "Cow"]) >>> s 0 Ant 1 Bear @@ -450,11 +471,18 @@ def size(self) -> int: """ Return the number of elements in the underlying data. + See Also + -------- + Series.ndim: Number of dimensions of the underlying data, by definition 1. + Series.shape: Return a tuple of the shape of the underlying data. + Series.dtype: Return the dtype object of the underlying data. + Series.values: Return Series as ndarray or ndarray-like depending on the dtype. + Examples -------- For Series: - >>> s = pd.Series(['Ant', 'Bear', 'Cow']) + >>> s = pd.Series(["Ant", "Bear", "Cow"]) >>> s 0 Ant 1 Bear @@ -478,6 +506,11 @@ def array(self) -> ExtensionArray: """ The ExtensionArray of the data backing this Series or Index. + This property provides direct access to the underlying array data of a + Series or Index without requiring conversion to a NumPy array. It + returns an ExtensionArray, which is the native storage format for + pandas extension dtypes. + Returns ------- ExtensionArray @@ -531,14 +564,13 @@ def array(self) -> ExtensionArray: For extension types, like Categorical, the actual ExtensionArray is returned - >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a'])) + >>> ser = pd.Series(pd.Categorical(["a", "b", "a"])) >>> ser.array ['a', 'b', 'a'] Categories (2, object): ['a', 'b'] """ raise AbstractMethodError(self) - @final def to_numpy( self, dtype: npt.DTypeLike | None = None, @@ -568,6 +600,8 @@ def to_numpy( Returns ------- numpy.ndarray + The NumPy ndarray holding the values from this Series or Index. + The dtype of the array may differ. See Notes. 
See Also -------- @@ -610,7 +644,7 @@ def to_numpy( Examples -------- - >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a'])) + >>> ser = pd.Series(pd.Categorical(["a", "b", "a"])) >>> ser.to_numpy() array(['a', 'b', 'a'], dtype=object) @@ -618,7 +652,7 @@ def to_numpy( Use ``dtype=object`` to return an ndarray of pandas :class:`Timestamp` objects, each with the correct ``tz``. - >>> ser = pd.Series(pd.date_range('2000', periods=2, tz="CET")) + >>> ser = pd.Series(pd.date_range("2000", periods=2, tz="CET")) >>> ser.to_numpy(dtype=object) array([Timestamp('2000-01-01 00:00:00+0100', tz='CET'), Timestamp('2000-01-02 00:00:00+0100', tz='CET')], @@ -648,7 +682,7 @@ def to_numpy( ) values = self._values - if fillna: + if fillna and self.hasnans: if not can_hold_element(values, na_value): # if we can't hold the na_value asarray either makes a copy or we # error before modifying values. The asarray later on thus won't make @@ -661,10 +695,10 @@ def to_numpy( result = np.asarray(values, dtype=dtype) - if (copy and not fillna) or (not copy and using_copy_on_write()): + if (copy and not fillna) or not copy: if np.shares_memory(self._values[:2], result[:2]): # Take slices to improve performance of check - if using_copy_on_write() and not copy: + if not copy: result = result.view() result.flags.writeable = False else: @@ -675,6 +709,45 @@ def to_numpy( @final @property def empty(self) -> bool: + """ + Indicator whether Index is empty. + + An Index is considered empty if it has no elements. This property can be + useful for quickly checking the state of an Index, especially in data + processing and analysis workflows where handling of empty datasets might + be required. + + Returns + ------- + bool + If Index is empty, return True, if not return False. + + See Also + -------- + Index.size : Return the number of elements in the underlying data. + + Examples + -------- + >>> idx = pd.Index([1, 2, 3]) + >>> idx + Index([1, 2, 3], dtype='int64') + >>> idx.empty + False + + >>> idx_empty = pd.Index([]) + >>> idx_empty + Index([], dtype='object') + >>> idx_empty.empty + True + + If we only have NaNs in our DataFrame, it is not considered empty! + + >>> idx = pd.Index([np.nan, np.nan]) + >>> idx + Index([nan, nan], dtype='float64') + >>> idx.empty + False + """ return not self.size @doc(op="max", oppose="min", value="largest") @@ -692,7 +765,8 @@ def argmax( axis : {{None}} Unused. Parameter needed for compatibility with DataFrame. skipna : bool, default True - Exclude NA/null values when showing the result. + Exclude NA/null values. If the entire Series is NA, or if ``skipna=False`` + and there is an NA value, this method will raise a ``ValueError``. *args, **kwargs Additional arguments and keywords for compatibility with NumPy. @@ -713,8 +787,15 @@ def argmax( -------- Consider dataset containing cereal calories - >>> s = pd.Series({{'Corn Flakes': 100.0, 'Almond Delight': 110.0, - ... 'Cinnamon Toast Crunch': 120.0, 'Cocoa Puff': 110.0}}) + >>> s = pd.Series( + ... [100.0, 110.0, 120.0, 110.0], + ... index=[ + ... "Corn Flakes", + ... "Almond Delight", + ... "Cinnamon Toast Crunch", + ... "Cocoa Puff", + ... ], + ... 
)
    >>> s
    Corn Flakes              100.0
    Almond Delight           110.0
    Cinnamon Toast Crunch    120.0
    Cocoa Puff               110.0
    dtype: float64

    >>> s.argmax()
-    2
+    np.int64(2)
    >>> s.argmin()
-    0
+    np.int64(0)

    The maximum cereal calories is the third element and
    the minimum cereal calories is the first element,
@@ -736,27 +817,9 @@ def argmax(
        skipna = nv.validate_argmax_with_skipna(skipna, args, kwargs)

        if isinstance(delegate, ExtensionArray):
-            if not skipna and delegate.isna().any():
-                warnings.warn(
-                    f"The behavior of {type(self).__name__}.argmax/argmin "
-                    "with skipna=False and NAs, or with all-NAs is deprecated. "
-                    "In a future version this will raise ValueError.",
-                    FutureWarning,
-                    stacklevel=find_stack_level(),
-                )
-                return -1
-            else:
-                return delegate.argmax()
+            return delegate.argmax(skipna=skipna)
        else:
            result = nanops.nanargmax(delegate, skipna=skipna)
-            if result == -1:
-                warnings.warn(
-                    f"The behavior of {type(self).__name__}.argmax/argmin "
-                    "with skipna=False and NAs, or with all-NAs is deprecated. "
-                    "In a future version this will raise ValueError.",
-                    FutureWarning,
-                    stacklevel=find_stack_level(),
-                )
            # error: Incompatible return value type (got "Union[int, ndarray]", expected
            # "int")
            return result  # type: ignore[return-value]
@@ -767,35 +830,17 @@ def argmin(
    ) -> int:
        delegate = self._values
        nv.validate_minmax_axis(axis)
        skipna = nv.validate_argmin_with_skipna(skipna, args, kwargs)

        if isinstance(delegate, ExtensionArray):
-            if not skipna and delegate.isna().any():
-                warnings.warn(
-                    f"The behavior of {type(self).__name__}.argmax/argmin "
-                    "with skipna=False and NAs, or with all-NAs is deprecated. "
-                    "In a future version this will raise ValueError.",
-                    FutureWarning,
-                    stacklevel=find_stack_level(),
-                )
-                return -1
-            else:
-                return delegate.argmin()
+            return delegate.argmin(skipna=skipna)
        else:
            result = nanops.nanargmin(delegate, skipna=skipna)
-            if result == -1:
-                warnings.warn(
-                    f"The behavior of {type(self).__name__}.argmax/argmin "
-                    "with skipna=False and NAs, or with all-NAs is deprecated. "
-                    "In a future version this will raise ValueError.",
-                    FutureWarning,
-                    stacklevel=find_stack_level(),
-                )
            # error: Incompatible return value type (got "Union[int, ndarray]", expected
            # "int")
            return result  # type: ignore[return-value]

-    def tolist(self):
+    def tolist(self) -> list:
        """
        Return a list of the values.

        Returns
        -------
        list
+            List containing the values as Python or pandas scalars.

        See Also
        --------
@@ -844,6 +890,11 @@ def __iter__(self) -> Iterator:
        Returns
        -------
        iterator
+            An iterator yielding scalar values from the Series.
+
+        See Also
+        --------
+        Series.items : Lazily iterate over (index, value) tuples.

        Examples
        --------
@@ -872,6 +923,11 @@ def hasnans(self) -> bool:
        -------
        bool

+        See Also
+        --------
+        Series.isna : Detect missing values.
+        Series.notna : Detect existing (non-missing) values.
+
        Examples
        --------
        >>> s = pd.Series([1, 2, 3, None])
@@ -889,7 +945,7 @@ def hasnans(self) -> bool:
        return bool(isna(self).any())  # type: ignore[union-attr]

    @final
-    def _map_values(self, mapper, na_action=None, convert: bool = True):
+    def _map_values(self, mapper, na_action=None):
        """
        An internal function that maps values using the input correspondence
        (which can be a dict, Series, or function).
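# ---------------------------------------------------------------------------
# Editor's illustrative sketch, not part of the patch: the argmax/argmin hunks
# above remove the deprecated warn-and-return -1 fallback, so once this change
# is applied an NA encountered with ``skipna=False`` (or an all-NA object) is
# expected to raise ValueError, matching the docstring updated earlier in this
# diff. A minimal, hypothetical usage example assuming a build with the patch:
import numpy as np
import pandas as pd

s = pd.Series([100.0, np.nan, 120.0])
assert s.argmax() == 2  # NA is skipped by default (skipna=True)
try:
    s.argmax(skipna=False)  # NA present: now raises instead of returning -1
except ValueError:
    pass  # enforced behavior; previously a FutureWarning and a -1 result
# ---------------------------------------------------------------------------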
@@ -901,10 +957,6 @@ def _map_values(self, mapper, na_action=None, convert: bool = True): na_action : {None, 'ignore'} If 'ignore', propagate NA values, without passing them to the mapping function - convert : bool, default True - Try to find better dtype for elementwise function results. If - False, leave as dtype=object. Note that the dtype is always - preserved for some extension array dtypes, such as Categorical. Returns ------- @@ -918,9 +970,8 @@ def _map_values(self, mapper, na_action=None, convert: bool = True): if isinstance(arr, ExtensionArray): return arr.map(mapper, na_action=na_action) - return algorithms.map_array(arr, mapper, na_action=na_action, convert=convert) + return algorithms.map_array(arr, mapper, na_action=na_action) - @final def value_counts( self, normalize: bool = False, @@ -954,6 +1005,7 @@ def value_counts( Returns ------- Series + Series containing counts of unique values. See Also -------- @@ -1006,6 +1058,34 @@ def value_counts( 4.0 1 NaN 1 Name: count, dtype: int64 + + **Categorical Dtypes** + + Rows with categorical type will be counted as one group + if they have same categories and order. + In the example below, even though ``a``, ``c``, and ``d`` + all have the same data types of ``category``, + only ``c`` and ``d`` will be counted as one group + since ``a`` doesn't have the same categories. + + >>> df = pd.DataFrame({"a": [1], "b": ["2"], "c": [3], "d": [3]}) + >>> df = df.astype({"a": "category", "c": "category", "d": "category"}) + >>> df + a b c d + 0 1 2 3 3 + + >>> df.dtypes + a category + b object + c category + d category + dtype: object + + >>> df.dtypes.value_counts() + category 2 + category 1 + object 1 + Name: count, dtype: int64 """ return algorithms.value_counts_internal( self, @@ -1040,6 +1120,7 @@ def nunique(self, dropna: bool = True) -> int: Returns ------- int + A integer indicating the number of unique elements in the object. See Also -------- @@ -1068,12 +1149,18 @@ def nunique(self, dropna: bool = True) -> int: @property def is_unique(self) -> bool: """ - Return boolean if values in the object are unique. + Return True if values in the object are unique. Returns ------- bool + See Also + -------- + Series.unique : Return unique values of Series object. + Series.drop_duplicates : Return Series with duplicate values removed. + Series.duplicated : Indicate duplicate Series values. + Examples -------- >>> s = pd.Series([1, 2, 3]) @@ -1089,12 +1176,17 @@ def is_unique(self) -> bool: @property def is_monotonic_increasing(self) -> bool: """ - Return boolean if values in the object are monotonically increasing. + Return True if values in the object are monotonically increasing. Returns ------- bool + See Also + -------- + Series.is_monotonic_decreasing : Return boolean if values in the object are + monotonically decreasing. + Examples -------- >>> s = pd.Series([1, 2, 2]) @@ -1112,12 +1204,17 @@ def is_monotonic_increasing(self) -> bool: @property def is_monotonic_decreasing(self) -> bool: """ - Return boolean if values in the object are monotonically decreasing. + Return True if values in the object are monotonically decreasing. Returns ------- bool + See Also + -------- + Series.is_monotonic_increasing : Return boolean if values in the object are + monotonically increasing. + Examples -------- >>> s = pd.Series([3, 2, 2, 1]) @@ -1146,6 +1243,7 @@ def _memory_usage(self, deep: bool = False) -> int: Returns ------- bytes used + Returns memory usage of the values in the Index in bytes. 
See Also -------- @@ -1164,7 +1262,7 @@ def _memory_usage(self, deep: bool = False) -> int: 24 """ if hasattr(self.array, "memory_usage"): - return self.array.memory_usage( # pyright: ignore[reportGeneralTypeIssues] + return self.array.memory_usage( # pyright: ignore[reportAttributeAccessIssue] deep=deep, ) @@ -1198,18 +1296,21 @@ def factorize( if uniques.dtype == np.float16: uniques = uniques.astype(np.float32) - if isinstance(self, ABCIndex): - # preserve e.g. MultiIndex + if isinstance(self, ABCMultiIndex): + # preserve MultiIndex uniques = self._constructor(uniques) else: from pandas import Index - uniques = Index(uniques) + try: + uniques = Index(uniques, dtype=self.dtype) + except NotImplementedError: + # not all dtypes are supported in Index that are allowed for Series + # e.g. float16 or bytes + uniques = Index(uniques) return codes, uniques - _shared_docs[ - "searchsorted" - ] = """ + _shared_docs["searchsorted"] = """ Find indices where elements should be inserted to maintain order. Find the indices into a sorted {klass} `self` such that, if the @@ -1259,7 +1360,7 @@ def factorize( dtype: int64 >>> ser.searchsorted(4) - 3 + np.int64(3) >>> ser.searchsorted([0, 4]) array([0, 3]) @@ -1275,10 +1376,10 @@ def factorize( 0 2000-03-11 1 2000-03-12 2 2000-03-13 - dtype: datetime64[ns] + dtype: datetime64[s] >>> ser.searchsorted('3/14/2000') - 3 + np.int64(3) >>> ser = pd.Categorical( ... ['apple', 'bread', 'bread', 'cheese', 'milk'], ordered=True @@ -1288,7 +1389,7 @@ def factorize( Categories (4, object): ['apple' < 'bread' < 'cheese' < 'milk'] >>> ser.searchsorted('bread') - 1 + np.int64(1) >>> ser.searchsorted(['bread'], side='right') array([3]) @@ -1318,8 +1419,7 @@ def searchsorted( # type: ignore[overload-overlap] value: ScalarLike_co, side: Literal["left", "right"] = ..., sorter: NumpySorter = ..., - ) -> np.intp: - ... + ) -> np.intp: ... @overload def searchsorted( @@ -1327,8 +1427,7 @@ def searchsorted( value: npt.ArrayLike | ExtensionArray, side: Literal["left", "right"] = ..., sorter: NumpySorter = ..., - ) -> npt.NDArray[np.intp]: - ... + ) -> npt.NDArray[np.intp]: ... @doc(_shared_docs["searchsorted"], klass="Index") def searchsorted( @@ -1356,7 +1455,7 @@ def searchsorted( sorter=sorter, ) - def drop_duplicates(self, *, keep: DropKeep = "first"): + def drop_duplicates(self, *, keep: DropKeep = "first") -> Self: duplicated = self._duplicated(keep=keep) # error: Value of type "IndexOpsMixin" is not indexable return self[~duplicated] # type: ignore[index] @@ -1381,9 +1480,9 @@ def _arith_method(self, other, op): with np.errstate(all="ignore"): result = ops.arithmetic_op(lvalues, rvalues, op) - return self._construct_result(result, name=res_name) + return self._construct_result(result, name=res_name, other=other) - def _construct_result(self, result, name): + def _construct_result(self, result, name, other): """ Construct an appropriately-wrapped result from the ArrayLike result of an arithmetic-like operation. diff --git a/pandas/core/common.py b/pandas/core/common.py index 7d864e02be54e..75f8a56aac5db 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -3,6 +3,7 @@ Note: pandas.core.common is *not* part of the public API. 
""" + from __future__ import annotations import builtins @@ -11,6 +12,7 @@ defaultdict, ) from collections.abc import ( + Callable, Collection, Generator, Hashable, @@ -23,7 +25,7 @@ from typing import ( TYPE_CHECKING, Any, - Callable, + TypeVar, cast, overload, ) @@ -51,7 +53,9 @@ from pandas._typing import ( AnyArrayLike, ArrayLike, + Concatenate, NpDtype, + P, RandomState, T, ) @@ -141,7 +145,7 @@ def is_bool_indexer(key: Any) -> bool: elif isinstance(key, list): # check if np.array(key).dtype would be bool if len(key) > 0: - if type(key) is not list: # noqa: E721 + if type(key) is not list: # GH#42461 cython will raise TypeError if we pass a subclass key = list(key) return lib.is_bool_list(key) @@ -224,8 +228,7 @@ def asarray_tuplesafe( @overload -def asarray_tuplesafe(values: Iterable, dtype: NpDtype | None = ...) -> ArrayLike: - ... +def asarray_tuplesafe(values: Iterable, dtype: NpDtype | None = ...) -> ArrayLike: ... def asarray_tuplesafe(values: Iterable, dtype: NpDtype | None = None) -> ArrayLike: @@ -233,6 +236,8 @@ def asarray_tuplesafe(values: Iterable, dtype: NpDtype | None = None) -> ArrayLi values = list(values) elif isinstance(values, ABCIndex): return values._values + elif isinstance(values, ABCSeries): + return values._values if isinstance(values, list) and dtype in [np.object_, object]: return construct_1d_object_array_from_listlike(values) @@ -241,7 +246,8 @@ def asarray_tuplesafe(values: Iterable, dtype: NpDtype | None = None) -> ArrayLi with warnings.catch_warnings(): # Can remove warning filter once NumPy 1.24 is min version if not np_version_gte1p24: - warnings.simplefilter("ignore", np.VisibleDeprecationWarning) + # np.VisibleDeprecationWarning only in np.exceptions in 2.0 + warnings.simplefilter("ignore", np.VisibleDeprecationWarning) # type: ignore[attr-defined] result = np.asarray(values, dtype=dtype) except ValueError: # Using try/except since it's more performant than checking is_list_like @@ -330,11 +336,12 @@ def is_empty_slice(obj) -> bool: ) -def is_true_slices(line) -> list[bool]: +def is_true_slices(line: abc.Iterable) -> abc.Generator[bool, None, None]: """ - Find non-trivial slices in "line": return a list of booleans with same length. + Find non-trivial slices in "line": yields a bool. """ - return [isinstance(k, slice) and not is_null_slice(k) for k in line] + for k in line: + yield isinstance(k, slice) and not is_null_slice(k) # TODO: used only once in indexing; belongs elsewhere? @@ -353,7 +360,7 @@ def is_full_slice(obj, line: int) -> bool: def get_callable_name(obj): # typical case has name if hasattr(obj, "__name__"): - return getattr(obj, "__name__") + return obj.__name__ # some objects don't; could recurse if isinstance(obj, partial): return get_callable_name(obj.func) @@ -417,15 +424,13 @@ def standardize_mapping(into): @overload -def random_state(state: np.random.Generator) -> np.random.Generator: - ... +def random_state(state: np.random.Generator) -> np.random.Generator: ... @overload def random_state( state: int | np.ndarray | np.random.BitGenerator | np.random.RandomState | None, -) -> np.random.RandomState: - ... +) -> np.random.RandomState: ... 
def random_state(state: RandomState | None = None): @@ -463,8 +468,32 @@ def random_state(state: RandomState | None = None): ) +_T = TypeVar("_T") # Secondary TypeVar for use in pipe's type hints + + +@overload def pipe( - obj, func: Callable[..., T] | tuple[Callable[..., T], str], *args, **kwargs + obj: _T, + func: Callable[Concatenate[_T, P], T], + *args: P.args, + **kwargs: P.kwargs, +) -> T: ... + + +@overload +def pipe( + obj: Any, + func: tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, +) -> T: ... + + +def pipe( + obj: _T, + func: Callable[Concatenate[_T, P], T] | tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, ) -> T: """ Apply a function ``func`` to object ``obj`` either by passing obj as the @@ -490,12 +519,13 @@ def pipe( object : the return type of ``func``. """ if isinstance(func, tuple): - func, target = func + # Assigning to func_ so pyright understands that it's a callable + func_, target = func if target in kwargs: msg = f"{target} is both the pipe target and a keyword argument" raise ValueError(msg) kwargs[target] = obj - return func(*args, **kwargs) + return func_(*args, **kwargs) else: return func(obj, *args, **kwargs) @@ -531,9 +561,7 @@ def convert_to_list_like( @contextlib.contextmanager -def temp_setattr( - obj, attr: str, value, condition: bool = True -) -> Generator[None, None, None]: +def temp_setattr(obj, attr: str, value, condition: bool = True) -> Generator[None]: """ Temporarily set attribute on an object. @@ -576,22 +604,6 @@ def require_length_match(data, index: Index) -> None: ) -# the ufuncs np.maximum.reduce and np.minimum.reduce default to axis=0, -# whereas np.min and np.max (which directly call obj.min and obj.max) -# default to axis=None. -_builtin_table = { - builtins.sum: np.sum, - builtins.max: np.maximum.reduce, - builtins.min: np.minimum.reduce, -} - -# GH#53425: Only for deprecation -_builtin_table_alias = { - builtins.sum: "np.sum", - builtins.max: "np.maximum.reduce", - builtins.min: "np.minimum.reduce", -} - _cython_table = { builtins.sum: "sum", builtins.max: "max", @@ -628,14 +640,6 @@ def get_cython_func(arg: Callable) -> str | None: return _cython_table.get(arg) -def is_builtin_func(arg): - """ - if we define a builtin function for this argument, return it, - otherwise return the arg - """ - return _builtin_table.get(arg, arg) - - def fill_missing_names(names: Sequence[Hashable | None]) -> list[Hashable]: """ If a name is missing then replace it by level_n, where n is the count diff --git a/pandas/core/computation/align.py b/pandas/core/computation/align.py index cd852ba9249cf..6158c4f4d0539 100644 --- a/pandas/core/computation/align.py +++ b/pandas/core/computation/align.py @@ -1,20 +1,20 @@ """ Core eval alignment algorithms. 
""" + from __future__ import annotations from functools import ( partial, wraps, ) -from typing import ( - TYPE_CHECKING, - Callable, -) +from typing import TYPE_CHECKING import warnings import numpy as np +from pandas._config.config import get_option + from pandas.errors import PerformanceWarning from pandas.util._exceptions import find_stack_level @@ -28,7 +28,10 @@ from pandas.core.computation.common import result_type_many if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import ( + Callable, + Sequence, + ) from pandas._typing import F @@ -124,17 +127,21 @@ def _align_core(terms): reindexer_size = len(reindexer) ordm = np.log10(max(1, abs(reindexer_size - term_axis_size))) - if ordm >= 1 and reindexer_size >= 10000: + if ( + get_option("performance_warnings") + and ordm >= 1 + and reindexer_size >= 10000 + ): w = ( f"Alignment difference on axis {axis} is larger " - f"than an order of magnitude on term {repr(terms[i].name)}, " + f"than an order of magnitude on term {terms[i].name!r}, " f"by more than {ordm:.4g}; performance may suffer." ) warnings.warn( w, category=PerformanceWarning, stacklevel=find_stack_level() ) - obj = ti.reindex(reindexer, axis=axis, copy=False) + obj = ti.reindex(reindexer, axis=axis) terms[i].update(obj) terms[i].update(terms[i].value.values) @@ -153,19 +160,24 @@ def align_terms(terms): # can't iterate so it must just be a constant or single variable if isinstance(terms.value, (ABCSeries, ABCDataFrame)): typ = type(terms.value) - return typ, _zip_axes_from_type(typ, terms.value.axes) - return np.result_type(terms.type), None + name = terms.value.name if isinstance(terms.value, ABCSeries) else None + return typ, _zip_axes_from_type(typ, terms.value.axes), name + return np.result_type(terms.type), None, None # if all resolved variables are numeric scalars if all(term.is_scalar for term in terms): - return result_type_many(*(term.value for term in terms)).type, None + return result_type_many(*(term.value for term in terms)).type, None, None + + # if all input series have a common name, propagate it to the returned series + names = {term.value.name for term in terms if isinstance(term.value, ABCSeries)} + name = names.pop() if len(names) == 1 else None # perform the main alignment typ, axes = _align_core(terms) - return typ, axes + return typ, axes, name -def reconstruct_object(typ, obj, axes, dtype): +def reconstruct_object(typ, obj, axes, dtype, name): """ Reconstruct an object given its type, raw value, and possibly empty (None) axes. @@ -193,13 +205,15 @@ def reconstruct_object(typ, obj, axes, dtype): res_t = np.result_type(obj.dtype, dtype) if not isinstance(typ, partial) and issubclass(typ, PandasObject): - return typ(obj, dtype=res_t, **axes) + if name is None: + return typ(obj, dtype=res_t, **axes) + return typ(obj, dtype=res_t, name=name, **axes) # special case for pathological things like ~True/~False if hasattr(res_t, "type") and typ == np.bool_ and res_t != np.bool_: ret_value = res_t.type(obj) else: - ret_value = typ(obj).astype(res_t) + ret_value = res_t.type(obj) # The condition is to distinguish 0-dim array (returned in case of # scalar) and 1 element array # e.g. 
np.array(0) and np.array([0]) diff --git a/pandas/core/computation/engines.py b/pandas/core/computation/engines.py index a3a05a9d75c6e..d2a181cbb3c36 100644 --- a/pandas/core/computation/engines.py +++ b/pandas/core/computation/engines.py @@ -1,6 +1,7 @@ """ Engine classes for :func:`~pandas.eval` """ + from __future__ import annotations import abc @@ -53,6 +54,7 @@ def __init__(self, expr) -> None: self.expr = expr self.aligned_axes = None self.result_type = None + self.result_name = None def convert(self) -> str: """ @@ -75,12 +77,18 @@ def evaluate(self) -> object: The result of the passed expression. """ if not self._is_aligned: - self.result_type, self.aligned_axes = align_terms(self.expr.terms) + self.result_type, self.aligned_axes, self.result_name = align_terms( + self.expr.terms + ) # make sure no names in resolvers and locals/globals clash res = self._evaluate() return reconstruct_object( - self.result_type, res, self.aligned_axes, self.expr.terms.return_type + self.result_type, + res, + self.aligned_axes, + self.expr.terms.return_type, + self.result_name, ) @property diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index f1fe528de06f8..f8e3200ef2ba0 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -1,16 +1,23 @@ """ Top level ``eval`` module. """ + from __future__ import annotations import tokenize -from typing import TYPE_CHECKING +from typing import ( + TYPE_CHECKING, + Any, +) import warnings from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_bool_kwarg -from pandas.core.dtypes.common import is_extension_array_dtype +from pandas.core.dtypes.common import ( + is_extension_array_dtype, + is_string_dtype, +) from pandas.core.computation.engines import ENGINES from pandas.core.computation.expr import ( @@ -72,7 +79,7 @@ def _check_engine(engine: str | None) -> str: return engine -def _check_parser(parser: str): +def _check_parser(parser: str) -> None: """ Make sure a valid parser is passed. @@ -91,7 +98,7 @@ def _check_parser(parser: str): ) -def _check_resolvers(resolvers): +def _check_resolvers(resolvers) -> None: if resolvers is not None: for resolver in resolvers: if not hasattr(resolver, "__getitem__"): @@ -102,7 +109,7 @@ def _check_resolvers(resolvers): ) -def _check_expression(expr): +def _check_expression(expr) -> None: """ Make sure an expression is not an empty string @@ -149,7 +156,7 @@ def _convert_expression(expr) -> str: return s -def _check_for_locals(expr: str, stack_level: int, parser: str): +def _check_for_locals(expr: str, stack_level: int, parser: str) -> None: at_top_of_stack = stack_level == 0 not_pandas_parser = parser != "pandas" @@ -177,18 +184,14 @@ def eval( level: int = 0, target=None, inplace: bool = False, -): +) -> Any: """ Evaluate a Python expression as a string using various backends. - The following arithmetic operations are supported: ``+``, ``-``, ``*``, - ``/``, ``**``, ``%``, ``//`` (python engine only) along with the following - boolean operations: ``|`` (or), ``&`` (and), and ``~`` (not). - Additionally, the ``'pandas'`` parser allows the use of :keyword:`and`, - :keyword:`or`, and :keyword:`not` with the same semantics as the - corresponding bitwise operators. :class:`~pandas.Series` and - :class:`~pandas.DataFrame` objects are supported and behave as they would - with plain ol' Python evaluation. + .. 
warning:: + + This function can run arbitrary code which can make you vulnerable to code + injection if you pass user input to this function. Parameters ---------- @@ -198,6 +201,34 @@ def eval( `__, only Python `expressions `__. + + By default, with the numexpr engine, the following operations are supported: + + - Arithmetic operations: ``+``, ``-``, ``*``, ``/``, ``**``, ``%`` + - Boolean operations: ``|`` (or), ``&`` (and), and ``~`` (not) + - Comparison operators: ``<``, ``<=``, ``==``, ``!=``, ``>=``, ``>`` + + Furthermore, the following mathematical functions are supported: + + - Trigonometric: ``sin``, ``cos``, ``tan``, ``arcsin``, ``arccos``, \ + ``arctan``, ``arctan2``, ``sinh``, ``cosh``, ``tanh``, ``arcsinh``, \ + ``arccosh`` and ``arctanh`` + - Logarithms: ``log`` natural, ``log10`` base 10, ``log1p`` log(1+x) + - Absolute Value ``abs`` + - Square root ``sqrt`` + - Exponential ``exp`` and Exponential minus one ``expm1`` + + See the numexpr engine `documentation + `__ + for further function support details. + + Using the ``'python'`` engine allows the use of native Python operators + such as floor division ``//``, in addition to built-in and user-defined + Python functions. + + Additionally, the ``'pandas'`` parser allows the use of :keyword:`and`, + :keyword:`or`, and :keyword:`not` with the same semantics as the + corresponding bitwise operators. parser : {'pandas', 'python'}, default 'pandas' The parser to use to construct the syntax tree from the expression. The default of ``'pandas'`` parses code slightly different than standard @@ -336,11 +367,16 @@ def eval( parsed_expr = Expr(expr, engine=engine, parser=parser, env=env) if engine == "numexpr" and ( - is_extension_array_dtype(parsed_expr.terms.return_type) - or getattr(parsed_expr.terms, "operand_types", None) is not None - and any( - is_extension_array_dtype(elem) - for elem in parsed_expr.terms.operand_types + ( + is_extension_array_dtype(parsed_expr.terms.return_type) + and not is_string_dtype(parsed_expr.terms.return_type) + ) + or ( + getattr(parsed_expr.terms, "operand_types", None) is not None + and any( + (is_extension_array_dtype(elem) and not is_string_dtype(elem)) + for elem in parsed_expr.terms.operand_types + ) ) ): warnings.warn( @@ -375,7 +411,7 @@ def eval( try: target = env.target if isinstance(target, NDFrame): - target = target.copy(deep=None) + target = target.copy(deep=False) else: target = target.copy() except AttributeError as err: @@ -391,7 +427,7 @@ def eval( if inplace and isinstance(target, NDFrame): target.loc[:, assigner] = ret else: - target[assigner] = ret # pyright: ignore[reportGeneralTypeIssues] + target[assigner] = ret # pyright: ignore[reportIndexIssue] except (TypeError, IndexError) as err: raise ValueError("Cannot assign expression output to target") from err diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index 4770f403b1bdb..b53596fe28e70 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -1,6 +1,7 @@ """ :func:`~pandas.eval` parsers. 
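Expressions are parsed by the ``ast``-based visitor classes defined below; a
typical entry point, as an illustrative sketch assuming the default keyword
arguments, is ``Expr("a + b", engine="numexpr", parser="pandas")``.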
""" + from __future__ import annotations import ast @@ -11,7 +12,7 @@ from keyword import iskeyword import tokenize from typing import ( - Callable, + TYPE_CHECKING, ClassVar, TypeVar, ) @@ -20,6 +21,8 @@ from pandas.errors import UndefinedVariableError +from pandas.core.dtypes.common import is_string_dtype + import pandas.core.common as com from pandas.core.computation.ops import ( ARITH_OPS_SYMS, @@ -31,7 +34,6 @@ UNARY_OPS_SYMS, BinOp, Constant, - Div, FuncNode, Op, Term, @@ -46,6 +48,9 @@ from pandas.io.formats import printing +if TYPE_CHECKING: + from collections.abc import Callable + def _rewrite_assign(tok: tuple[int, str]) -> tuple[int, str]: """ @@ -163,7 +168,7 @@ def _preparse( the ``tokenize`` module and ``tokval`` is a string. """ assert callable(f), "f must be callable" - return tokenize.untokenize(f(x) for x in tokenize_string(source)) + return tokenize.untokenize(f(x) for x in tokenize_string(source)) # pyright: ignore[reportArgumentType] def _is_type(t): @@ -370,7 +375,7 @@ class BaseExprVisitor(ast.NodeVisitor): "Add", "Sub", "Mult", - None, + "Div", "Pow", "FloorDiv", "Mod", @@ -507,8 +512,7 @@ def _maybe_evaluate_binop( ) if self.engine != "pytables" and ( - res.op in CMP_OPS_SYMS - and getattr(lhs, "is_datetime", False) + (res.op in CMP_OPS_SYMS and getattr(lhs, "is_datetime", False)) or getattr(rhs, "is_datetime", False) ): # all date ops must be done in python bc numexpr doesn't work @@ -521,10 +525,12 @@ def _maybe_evaluate_binop( elif self.engine != "pytables": if ( getattr(lhs, "return_type", None) == object + or is_string_dtype(getattr(lhs, "return_type", None)) or getattr(rhs, "return_type", None) == object + or is_string_dtype(getattr(rhs, "return_type", None)) ): # evaluate "==" and "!=" in python if either of our operands - # has an object return type + # has an object or string return type return self._maybe_eval(res, eval_in_python + maybe_eval_in_python) return res @@ -533,9 +539,6 @@ def visit_BinOp(self, node, **kwargs): left, right = self._maybe_downcast_constants(left, right) return self._maybe_evaluate_binop(op, op_class, left, right) - def visit_Div(self, node, **kwargs): - return lambda lhs, rhs: Div(lhs, rhs) - def visit_UnaryOp(self, node, **kwargs): op = self.visit(node.op) operand = self.visit(node.operand) @@ -641,7 +644,11 @@ def visit_Attribute(self, node, **kwargs): ctx = node.ctx if isinstance(ctx, ast.Load): # resolve the value - resolved = self.visit(value).value + visited_value = self.visit(value) + if hasattr(visited_value, "value"): + resolved = visited_value.value + else: + resolved = visited_value(self.env) try: v = getattr(resolved, attr) name = self.env.add_tmp(v) @@ -695,8 +702,7 @@ def visit_Call(self, node, side=None, **kwargs): if not isinstance(key, ast.keyword): # error: "expr" has no attribute "id" raise ValueError( - "keyword error in function call " # type: ignore[attr-defined] - f"'{node.func.id}'" + f"keyword error in function call '{node.func.id}'" # type: ignore[attr-defined] ) if key.arg: diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index 6219cac4aeb16..5a5fad0d83d7a 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -5,6 +5,7 @@ Offer fast expression evaluation through numexpr """ + from __future__ import annotations import operator @@ -64,23 +65,23 @@ def set_numexpr_threads(n=None) -> None: ne.set_num_threads(n) -def _evaluate_standard(op, op_str, a, b): +def _evaluate_standard(op, op_str, left_op, right_op): """ 
Standard evaluation.
    """
    if _TEST_MODE:
        _store_test_result(False)
-    return op(a, b)
+    return op(left_op, right_op)


-def _can_use_numexpr(op, op_str, a, b, dtype_check) -> bool:
-    """return a boolean if we WILL be using numexpr"""
+def _can_use_numexpr(op, op_str, left_op, right_op, dtype_check) -> bool:
+    """return a boolean if we WILL be using numexpr"""
     if op_str is not None:
         # required min elements (otherwise we are adding overhead)
-        if a.size > _MIN_ELEMENTS:
+        if left_op.size > _MIN_ELEMENTS:
             # check for dtype compatibility
             dtypes: set[str] = set()
-            for o in [a, b]:
+            for o in [left_op, right_op]:
                 # ndarray and Series Case
                 if hasattr(o, "dtype"):
                     dtypes |= {o.dtype.name}
@@ -92,22 +93,22 @@ def _can_use_numexpr(op, op_str, a, b, dtype_check) -> bool:
     return False


-def _evaluate_numexpr(op, op_str, a, b):
+def _evaluate_numexpr(op, op_str, left_op, right_op):
     result = None

-    if _can_use_numexpr(op, op_str, a, b, "evaluate"):
+    if _can_use_numexpr(op, op_str, left_op, right_op, "evaluate"):
         is_reversed = op.__name__.strip("_").startswith("r")
         if is_reversed:
             # we were originally called by a reversed op method
-            a, b = b, a
+            left_op, right_op = right_op, left_op

-        a_value = a
-        b_value = b
+        left_value = left_op
+        right_value = right_op

         try:
             result = ne.evaluate(
-                f"a_value {op_str} b_value",
-                local_dict={"a_value": a_value, "b_value": b_value},
+                f"left_value {op_str} right_value",
+                local_dict={"left_value": left_value, "right_value": right_value},
                 casting="safe",
             )
         except TypeError:
@@ -115,20 +116,20 @@ def _evaluate_numexpr(op, op_str, a, b):
             # (https://github.com/pydata/numexpr/issues/379)
             pass
         except NotImplementedError:
-            if _bool_arith_fallback(op_str, a, b):
+            if _bool_arith_fallback(op_str, left_op, right_op):
                 pass
             else:
                 raise

         if is_reversed:
             # reverse order to original for fallback
-            a, b = b, a
+            left_op, right_op = right_op, left_op

     if _TEST_MODE:
         _store_test_result(result is not None)

     if result is None:
-        result = _evaluate_standard(op, op_str, a, b)
+        result = _evaluate_standard(op, op_str, left_op, right_op)

     return result
@@ -169,24 +170,24 @@ def _evaluate_numexpr(op, op_str, a, b):
 }


-def _where_standard(cond, a, b):
+def _where_standard(cond, left_op, right_op):
     # Caller is responsible for extracting ndarray if necessary
-    return np.where(cond, a, b)
+    return np.where(cond, left_op, right_op)


-def _where_numexpr(cond, a, b):
+def _where_numexpr(cond, left_op, right_op):
     # Caller is responsible for extracting ndarray if necessary
     result = None

-    if _can_use_numexpr(None, "where", a, b, "where"):
+    if _can_use_numexpr(None, "where", left_op, right_op, "where"):
         result = ne.evaluate(
             "where(cond_value, a_value, b_value)",
-            local_dict={"cond_value": cond, "a_value": a, "b_value": b},
+            local_dict={"cond_value": cond, "a_value": left_op, "b_value": right_op},
             casting="safe",
         )

     if result is None:
-        result = _where_standard(cond, a, b)
+        result = _where_standard(cond, left_op, right_op)

     return result
@@ -205,33 +206,33 @@ def _has_bool_dtype(x):
 _BOOL_OP_UNSUPPORTED = {"+": "|", "*": "&", "-": "^"}


-def _bool_arith_fallback(op_str, a, b) -> bool:
+def _bool_arith_fallback(op_str, left_op, right_op) -> bool:
     """
     Check if we should fallback to the python `_evaluate_standard` in case
     of an unsupported operation by numexpr, which is the case for some
     boolean ops.
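    For example, ``+`` on two all-boolean operands is rejected by numexpr, so
    evaluation falls back to Python after a warning suggesting ``|`` instead
    (see ``_BOOL_OP_UNSUPPORTED`` above).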
""" - if _has_bool_dtype(a) and _has_bool_dtype(b): + if _has_bool_dtype(left_op) and _has_bool_dtype(right_op): if op_str in _BOOL_OP_UNSUPPORTED: warnings.warn( - f"evaluating in Python space because the {repr(op_str)} " + f"evaluating in Python space because the {op_str!r} " "operator is not supported by numexpr for the bool dtype, " - f"use {repr(_BOOL_OP_UNSUPPORTED[op_str])} instead.", + f"use {_BOOL_OP_UNSUPPORTED[op_str]!r} instead.", stacklevel=find_stack_level(), ) return True return False -def evaluate(op, a, b, use_numexpr: bool = True): +def evaluate(op, left_op, right_op, use_numexpr: bool = True): """ - Evaluate and return the expression of the op on a and b. + Evaluate and return the expression of the op on left_op and right_op. Parameters ---------- op : the actual operand - a : left operand - b : right operand + left_op : left operand + right_op : right operand use_numexpr : bool, default True Whether to try to use numexpr. """ @@ -239,24 +240,27 @@ def evaluate(op, a, b, use_numexpr: bool = True): if op_str is not None: if use_numexpr: # error: "None" not callable - return _evaluate(op, op_str, a, b) # type: ignore[misc] - return _evaluate_standard(op, op_str, a, b) + return _evaluate(op, op_str, left_op, right_op) # type: ignore[misc] + return _evaluate_standard(op, op_str, left_op, right_op) -def where(cond, a, b, use_numexpr: bool = True): +def where(cond, left_op, right_op, use_numexpr: bool = True): """ - Evaluate the where condition cond on a and b. + Evaluate the where condition cond on left_op and right_op. Parameters ---------- cond : np.ndarray[bool] - a : return if cond is True - b : return if cond is False + left_op : return if cond is True + right_op : return if cond is False use_numexpr : bool, default True Whether to try to use numexpr. 
""" assert _where is not None - return _where(cond, a, b) if use_numexpr else _where_standard(cond, a, b) + if use_numexpr: + return _where(cond, left_op, right_op) + else: + return _where_standard(cond, left_op, right_op) def set_test_mode(v: bool = True) -> None: diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index 95ac20ba39edc..f06ded6d9f98e 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -9,7 +9,6 @@ import operator from typing import ( TYPE_CHECKING, - Callable, Literal, ) @@ -36,6 +35,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Iterable, Iterator, ) @@ -45,6 +45,7 @@ _unary_math_ops = ( "sin", "cos", + "tan", "exp", "log", "expm1", @@ -75,8 +76,7 @@ class Term: def __new__(cls, name, env, side=None, encoding=None): klass = Constant if not isinstance(name, str) else cls - # error: Argument 2 for "super" not an instance of argument 1 - supr_new = super(Term, klass).__new__ # type: ignore[misc] + supr_new = super(Term, klass).__new__ return supr_new(klass) is_local: bool @@ -115,7 +115,7 @@ def _resolve_name(self): res = self.env.resolve(local_name, is_local=is_local) self.update(res) - if hasattr(res, "ndim") and res.ndim > 2: + if hasattr(res, "ndim") and isinstance(res.ndim, int) and res.ndim > 2: raise NotImplementedError( "N-dimensional objects, where N > 2, are not supported with eval" ) @@ -160,7 +160,7 @@ def type(self): @property def raw(self) -> str: - return f"{type(self).__name__}(name={repr(self.name)}, type={self.type})" + return f"{type(self).__name__}(name={self.name!r}, type={self.type})" @property def is_datetime(self) -> bool: @@ -320,43 +320,12 @@ def _not_in(x, y): ) _arith_ops_dict = dict(zip(ARITH_OPS_SYMS, _arith_ops_funcs)) -SPECIAL_CASE_ARITH_OPS_SYMS = ("**", "//", "%") -_special_case_arith_ops_funcs = (operator.pow, operator.floordiv, operator.mod) -_special_case_arith_ops_dict = dict( - zip(SPECIAL_CASE_ARITH_OPS_SYMS, _special_case_arith_ops_funcs) -) - _binary_ops_dict = {} for d in (_cmp_ops_dict, _bool_ops_dict, _arith_ops_dict): _binary_ops_dict.update(d) -def _cast_inplace(terms, acceptable_dtypes, dtype) -> None: - """ - Cast an expression inplace. - - Parameters - ---------- - terms : Op - The expression that should cast. - acceptable_dtypes : list of acceptable numpy.dtype - Will not cast if term's dtype in this list. - dtype : str or numpy.dtype - The dtype to cast to. - """ - dt = np.dtype(dtype) - for term in terms: - if term.type in acceptable_dtypes: - continue - - try: - new_value = term.value.astype(dt) - except AttributeError: - new_value = dt.type(term.value) - term.update(new_value) - - def is_term(obj) -> bool: return isinstance(obj, Term) @@ -387,7 +356,7 @@ def __init__(self, op: str, lhs, rhs) -> None: # has to be made a list for python3 keys = list(_binary_ops_dict.keys()) raise ValueError( - f"Invalid binary operator {repr(op)}, valid operators are {keys}" + f"Invalid binary operator {op!r}, valid operators are {keys}" ) from err def __call__(self, env): @@ -491,7 +460,7 @@ def stringify(value): v = v.tz_convert("UTC") self.lhs.update(v) - def _disallow_scalar_only_bool_ops(self): + def _disallow_scalar_only_bool_ops(self) -> None: rhs = self.rhs lhs = self.lhs @@ -513,34 +482,6 @@ def _disallow_scalar_only_bool_ops(self): raise NotImplementedError("cannot evaluate scalar only bool ops") -def isnumeric(dtype) -> bool: - return issubclass(np.dtype(dtype).type, np.number) - - -class Div(BinOp): - """ - Div operator to special case casting. 
- - Parameters - ---------- - lhs, rhs : Term or Op - The Terms or Ops in the ``/`` expression. - """ - - def __init__(self, lhs, rhs) -> None: - super().__init__("/", lhs, rhs) - - if not isnumeric(lhs.return_type) or not isnumeric(rhs.return_type): - raise TypeError( - f"unsupported operand type(s) for {self.op}: " - f"'{lhs.return_type}' and '{rhs.return_type}'" - ) - - # do not upcast float32s to float64 un-necessarily - acceptable_dtypes = [np.float32, np.float64] - _cast_inplace(com.flatten(self), acceptable_dtypes, np.float64) - - UNARY_OPS_SYMS = ("+", "-", "~", "not") _unary_ops_funcs = (operator.pos, operator.neg, operator.invert, operator.invert) _unary_ops_dict = dict(zip(UNARY_OPS_SYMS, _unary_ops_funcs)) @@ -571,8 +512,7 @@ def __init__(self, op: Literal["+", "-", "~", "not"], operand) -> None: self.func = _unary_ops_dict[op] except KeyError as err: raise ValueError( - f"Invalid unary operator {repr(op)}, " - f"valid operators are {UNARY_OPS_SYMS}" + f"Invalid unary operator {op!r}, valid operators are {UNARY_OPS_SYMS}" ) from err def __call__(self, env) -> MathCall: diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py index 4cfa0f2baffd5..8441941797a6e 100644 --- a/pandas/core/computation/parsing.py +++ b/pandas/core/computation/parsing.py @@ -1,8 +1,10 @@ """ :func:`~pandas.eval` source string parsing functions """ + from __future__ import annotations +from enum import Enum from io import StringIO from keyword import iskeyword import token @@ -31,13 +33,21 @@ def create_valid_python_identifier(name: str) -> str: ------ SyntaxError If the returned name is not a Python valid identifier, raise an exception. - This can happen if there is a hashtag in the name, as the tokenizer will - than terminate and not find the backtick. - But also for characters that fall out of the range of (U+0001..U+007F). """ if name.isidentifier() and not iskeyword(name): return name + # Escape characters that fall outside the ASCII range (U+0001..U+007F). + # GH 49633 + gen = ( + (c, "".join(chr(b) for b in c.encode("ascii", "backslashreplace"))) + for c in name + ) + name = "".join( + c_escaped.replace("\\", "_UNICODE_" if c != c_escaped else "_BACKSLASH_") + for c, c_escaped in gen + ) + # Create a dict with the special characters and their replacement string. # EXACT_TOKEN_TYPES contains these special characters # token.tok_name contains a readable description of the replacement string. @@ -53,11 +63,10 @@ def create_valid_python_identifier(name: str) -> str: "$": "_DOLLARSIGN_", "€": "_EUROSIGN_", "°": "_DEGREESIGN_", - # Including quotes works, but there are exceptions. "'": "_SINGLEQUOTE_", '"': "_DOUBLEQUOTE_", - # Currently not possible. Terminates parser and won't find backtick. - # "#": "_HASH_", + "#": "_HASH_", + "`": "_BACKTICK_", } ) @@ -114,18 +123,11 @@ def clean_column_name(name: Hashable) -> Hashable: ------- name : hashable Returns the name after tokenizing and cleaning. - - Notes - ----- - For some cases, a name cannot be converted to a valid Python identifier. - In that case :func:`tokenize_string` raises a SyntaxError. - In that case, we just return the name unmodified. - - If this name was used in the query string (this makes the query call impossible) - an error will be raised by :func:`tokenize_backtick_quoted_string` instead, - which is not caught and propagates to the user level. 
""" try: + # Escape backticks + name = name.replace("`", "``") if isinstance(name, str) else name + tokenized = tokenize_string(f"`{name}`") tokval = next(tokenized)[1] return create_valid_python_identifier(tokval) @@ -133,38 +135,89 @@ def clean_column_name(name: Hashable) -> Hashable: return name -def tokenize_backtick_quoted_string( - token_generator: Iterator[tokenize.TokenInfo], source: str, string_start: int -) -> tuple[int, str]: +class ParseState(Enum): + DEFAULT = 0 + IN_BACKTICK = 1 + IN_SINGLE_QUOTE = 2 + IN_DOUBLE_QUOTE = 3 + + +def _split_by_backtick(s: str) -> list[tuple[bool, str]]: """ - Creates a token from a backtick quoted string. + Splits a str into substrings along backtick characters (`). - Moves the token_generator forwards till right after the next backtick. + Disregards backticks inside quotes. Parameters ---------- - token_generator : Iterator[tokenize.TokenInfo] - The generator that yields the tokens of the source string (Tuple[int, str]). - The generator is at the first token after the backtick (`) - - source : str + s : str The Python source code string. - string_start : int - This is the start of backtick quoted string inside the source string. - Returns ------- - tok: Tuple[int, str] - The token that represents the backtick quoted string. - The integer is equal to BACKTICK_QUOTED_STRING (100). + substrings: list[tuple[bool, str]] + List of tuples, where each tuple has two elements: + The first is a boolean indicating if the substring is backtick-quoted. + The second is the actual substring. """ - for _, tokval, start, _, _ in token_generator: - if tokval == "`": - string_end = start[1] - break - - return BACKTICK_QUOTED_STRING, source[string_start:string_end] + substrings = [] + substr: list[str] = [] # Will join into a string before adding to `substrings` + i = 0 + parse_state = ParseState.DEFAULT + while i < len(s): + char = s[i] + + match char: + case "`": + # start of a backtick-quoted string + if parse_state == ParseState.DEFAULT: + if substr: + substrings.append((False, "".join(substr))) + + substr = [char] + i += 1 + parse_state = ParseState.IN_BACKTICK + continue + + elif parse_state == ParseState.IN_BACKTICK: + # escaped backtick inside a backtick-quoted string + next_char = s[i + 1] if (i != len(s) - 1) else None + if next_char == "`": + substr.append(char) + substr.append(next_char) + i += 2 + continue + + # end of the backtick-quoted string + else: + substr.append(char) + substrings.append((True, "".join(substr))) + + substr = [] + i += 1 + parse_state = ParseState.DEFAULT + continue + case "'": + # start of a single-quoted string + if parse_state == ParseState.DEFAULT: + parse_state = ParseState.IN_SINGLE_QUOTE + # end of a single-quoted string + elif (parse_state == ParseState.IN_SINGLE_QUOTE) and (s[i - 1] != "\\"): + parse_state = ParseState.DEFAULT + case '"': + # start of a double-quoted string + if parse_state == ParseState.DEFAULT: + parse_state = ParseState.IN_DOUBLE_QUOTE + # end of a double-quoted string + elif (parse_state == ParseState.IN_DOUBLE_QUOTE) and (s[i - 1] != "\\"): + parse_state = ParseState.DEFAULT + substr.append(char) + i += 1 + + if substr: + substrings.append((False, "".join(substr))) + + return substrings def tokenize_string(source: str) -> Iterator[tuple[int, str]]: @@ -181,18 +234,19 @@ def tokenize_string(source: str) -> Iterator[tuple[int, str]]: tok_generator : Iterator[Tuple[int, str]] An iterator yielding all tokens with only toknum and tokval (Tuple[ing, str]). 
""" + # GH 59285 + # Escape characters, including backticks + source = "".join( + ( + create_valid_python_identifier(substring[1:-1]) + if is_backtick_quoted + else substring + ) + for is_backtick_quoted, substring in _split_by_backtick(source) + ) + line_reader = StringIO(source).readline token_generator = tokenize.generate_tokens(line_reader) - # Loop over all tokens till a backtick (`) is found. - # Then, take all tokens till the next backtick to form a backtick quoted string - for toknum, tokval, start, _, _ in token_generator: - if tokval == "`": - try: - yield tokenize_backtick_quoted_string( - token_generator, source, string_start=start[1] + 1 - ) - except Exception as err: - raise SyntaxError(f"Failed to parse backticks in '{source}'.") from err - else: - yield toknum, tokval + for toknum, tokval, _, _, _ in token_generator: + yield toknum, tokval diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 04a8ad7ef0be6..77b7d9ad11a6c 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -1,4 +1,5 @@ -""" manage PyTables query interface via Expressions """ +"""manage PyTables query interface via Expressions""" + from __future__ import annotations import ast @@ -81,7 +82,7 @@ def _resolve_name(self): if self.side == "left": # Note: The behavior of __new__ ensures that self.name is a str here if self.name not in self.env.queryables: - raise NameError(f"name {repr(self.name)} is not defined") + raise NameError(f"name {self.name!r} is not defined") return self.name # resolve the rhs (and allow it to be None) @@ -204,7 +205,7 @@ def generate(self, v) -> str: val = v.tostring(self.encoding) return f"({self.lhs} {self.op} {val})" - def convert_value(self, v) -> TermValue: + def convert_value(self, conv_val) -> TermValue: """ convert the expression that is in the term to something that is accepted by pytables @@ -218,44 +219,45 @@ def stringify(value): kind = ensure_decoded(self.kind) meta = ensure_decoded(self.meta) if kind == "datetime" or (kind and kind.startswith("datetime64")): - if isinstance(v, (int, float)): - v = stringify(v) - v = ensure_decoded(v) - v = Timestamp(v).as_unit("ns") - if v.tz is not None: - v = v.tz_convert("UTC") - return TermValue(v, v._value, kind) + if isinstance(conv_val, (int, float)): + conv_val = stringify(conv_val) + conv_val = ensure_decoded(conv_val) + conv_val = Timestamp(conv_val).as_unit("ns") + if conv_val.tz is not None: + conv_val = conv_val.tz_convert("UTC") + return TermValue(conv_val, conv_val._value, kind) elif kind in ("timedelta64", "timedelta"): - if isinstance(v, str): - v = Timedelta(v) + if isinstance(conv_val, str): + conv_val = Timedelta(conv_val) else: - v = Timedelta(v, unit="s") - v = v.as_unit("ns")._value - return TermValue(int(v), v, kind) + conv_val = Timedelta(conv_val, unit="s") + conv_val = conv_val.as_unit("ns")._value + return TermValue(int(conv_val), conv_val, kind) elif meta == "category": metadata = extract_array(self.metadata, extract_numpy=True) result: npt.NDArray[np.intp] | np.intp | int - if v not in metadata: + if conv_val not in metadata: result = -1 else: - result = metadata.searchsorted(v, side="left") + # Find the index of the first match of conv_val in metadata + result = np.flatnonzero(metadata == conv_val)[0] return TermValue(result, result, "integer") elif kind == "integer": try: - v_dec = Decimal(v) + v_dec = Decimal(conv_val) except InvalidOperation: # GH 54186 # convert v to float to raise float's ValueError - float(v) + 
float(conv_val) else: - v = int(v_dec.to_integral_exact(rounding="ROUND_HALF_EVEN")) - return TermValue(v, v, kind) + conv_val = int(v_dec.to_integral_exact(rounding="ROUND_HALF_EVEN")) + return TermValue(conv_val, conv_val, kind) elif kind == "float": - v = float(v) - return TermValue(v, v, kind) + conv_val = float(conv_val) + return TermValue(conv_val, conv_val, kind) elif kind == "bool": - if isinstance(v, str): - v = v.strip().lower() not in [ + if isinstance(conv_val, str): + conv_val = conv_val.strip().lower() not in [ "false", "f", "no", @@ -267,13 +269,15 @@ def stringify(value): "", ] else: - v = bool(v) - return TermValue(v, v, kind) - elif isinstance(v, str): + conv_val = bool(conv_val) + return TermValue(conv_val, conv_val, kind) + elif isinstance(conv_val, str): # string quoting - return TermValue(v, stringify(v), "string") + return TermValue(conv_val, stringify(conv_val), "string") else: - raise TypeError(f"Cannot compare {v} of type {type(v)} to {kind} column") + raise TypeError( + f"Cannot compare {conv_val} of type {type(conv_val)} to {kind} column" + ) def convert_values(self) -> None: pass @@ -407,11 +411,12 @@ def prune(self, klass): operand = operand.prune(klass) if operand is not None and ( - issubclass(klass, ConditionBinOp) - and operand.condition is not None - or not issubclass(klass, ConditionBinOp) - and issubclass(klass, FilterBinOp) - and operand.filter is not None + (issubclass(klass, ConditionBinOp) and operand.condition is not None) + or ( + not issubclass(klass, ConditionBinOp) + and issubclass(klass, FilterBinOp) + and operand.filter is not None + ) ): return operand.invert() return None @@ -467,9 +472,7 @@ def visit_Subscript(self, node, **kwargs) -> ops.Term: try: return self.const_type(value[slobj], self.env) except TypeError as err: - raise ValueError( - f"cannot subscript {repr(value)} with {repr(slobj)}" - ) from err + raise ValueError(f"cannot subscript {value!r} with {slobj!r}") from err def visit_Attribute(self, node, **kwargs): attr = node.attr diff --git a/pandas/core/computation/scope.py b/pandas/core/computation/scope.py index 7e553ca448218..336d62b9d9579 100644 --- a/pandas/core/computation/scope.py +++ b/pandas/core/computation/scope.py @@ -1,6 +1,7 @@ """ Module for scope operations """ + from __future__ import annotations from collections import ChainMap @@ -139,7 +140,7 @@ class Scope: temps : dict """ - __slots__ = ["level", "scope", "target", "resolvers", "temps"] + __slots__ = ["level", "resolvers", "scope", "target", "temps"] level: int scope: DeepChainMap resolvers: DeepChainMap diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index a8b63f97141c2..20fe8cbab1c9f 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -9,10 +9,12 @@ module is imported, register them here rather than in the module. 
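A typical registration, as a minimal sketch (``example.flag`` is a made-up
option name used only for illustration)::

    with cf.config_prefix("example"):
        cf.register_option("flag", False, "doc for example.flag", validator=is_bool)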
""" + from __future__ import annotations +from collections.abc import Callable import os -from typing import Callable +from typing import Any import pandas._config.config as cf from pandas._config.config import ( @@ -36,7 +38,7 @@ """ -def use_bottleneck_cb(key) -> None: +def use_bottleneck_cb(key: str) -> None: from pandas.core import nanops nanops.set_use_bottleneck(cf.get_option(key)) @@ -50,7 +52,7 @@ def use_bottleneck_cb(key) -> None: """ -def use_numexpr_cb(key) -> None: +def use_numexpr_cb(key: str) -> None: from pandas.core.computation import expressions expressions.set_use_numexpr(cf.get_option(key)) @@ -64,7 +66,7 @@ def use_numexpr_cb(key) -> None: """ -def use_numba_cb(key) -> None: +def use_numba_cb(key: str) -> None: from pandas.core.util import numba_ numba_.set_use_numba(cf.get_option(key)) @@ -94,16 +96,14 @@ def use_numba_cb(key) -> None: to ``precision`` in :meth:`numpy.set_printoptions`. """ -pc_colspace_doc = """ -: int - Default space for DataFrame columns. -""" - pc_max_rows_doc = """ : int If max_rows is exceeded, switch to truncate view. Depending on `large_repr`, objects are either centrally truncated or printed as - a summary view. 'None' value means unlimited. + a summary view. + + 'None' value means unlimited. Beware that printing a large number of rows + could cause your rendering environment (the browser, etc.) to crash. In case python/IPython is running in a terminal and `large_repr` equals 'truncate' this can be set to 0 and pandas will auto-detect @@ -124,7 +124,11 @@ def use_numba_cb(key) -> None: : int If max_cols is exceeded, switch to truncate view. Depending on `large_repr`, objects are either centrally truncated or printed as - a summary view. 'None' value means unlimited. + a summary view. + + 'None' value means unlimited. Beware that printing a large number of + columns could cause your rendering environment (the browser, etc.) to + crash. 
In case python/IPython is running in a terminal and `large_repr` equals 'truncate' this can be set to 0 or None and pandas will auto-detect @@ -204,11 +208,6 @@ def use_numba_cb(key) -> None: Enabling this may affect to the performance (default: False) """ -pc_ambiguous_as_wide_doc = """ -: boolean - Whether to handle Unicode characters belong to Ambiguous as Wide (width=2) - (default: False) -""" pc_table_schema_doc = """ : boolean @@ -286,7 +285,7 @@ def use_numba_cb(key) -> None: """ -def table_schema_cb(key) -> None: +def table_schema_cb(key: str) -> None: from pandas.io.formats.printing import enable_data_resource_formatter enable_data_resource_formatter(cf.get_option(key)) @@ -329,7 +328,7 @@ def is_terminal() -> bool: "min_rows", 10, pc_min_rows_doc, - validator=is_instance_factory([type(None), int]), + validator=is_instance_factory((type(None), int)), ) cf.register_option("max_categories", 8, pc_max_categories_doc, validator=is_int) @@ -369,7 +368,7 @@ def is_terminal() -> bool: cf.register_option("chop_threshold", None, pc_chop_threshold_doc) cf.register_option("max_seq_items", 100, pc_max_seq_items) cf.register_option( - "width", 80, pc_width_doc, validator=is_instance_factory([type(None), int]) + "width", 80, pc_width_doc, validator=is_instance_factory((type(None), int)) ) cf.register_option( "memory_usage", @@ -406,61 +405,6 @@ def is_terminal() -> bool: with cf.config_prefix("mode"): cf.register_option("sim_interactive", False, tc_sim_interactive_doc) -use_inf_as_na_doc = """ -: boolean - True means treat None, NaN, INF, -INF as NA (old way), - False means None and NaN are null, but INF, -INF are not NA - (new way). - - This option is deprecated in pandas 2.1.0 and will be removed in 3.0. -""" - -# We don't want to start importing everything at the global context level -# or we'll hit circular deps. - - -def use_inf_as_na_cb(key) -> None: - # TODO(3.0): enforcing this deprecation will close GH#52501 - from pandas.core.dtypes.missing import _use_inf_as_na - - _use_inf_as_na(key) - - -with cf.config_prefix("mode"): - cf.register_option("use_inf_as_na", False, use_inf_as_na_doc, cb=use_inf_as_na_cb) - -cf.deprecate_option( - # GH#51684 - "mode.use_inf_as_na", - "use_inf_as_na option is deprecated and will be removed in a future " - "version. Convert inf values to NaN before operating instead.", -) - -data_manager_doc = """ -: string - Internal data manager type; can be "block" or "array". Defaults to "block", - unless overridden by the 'PANDAS_DATA_MANAGER' environment variable (needs - to be set before pandas is imported). -""" - - -with cf.config_prefix("mode"): - cf.register_option( - "data_manager", - # Get the default from an environment variable, if set, otherwise defaults - # to "block". This environment variable can be set for testing. - os.environ.get("PANDAS_DATA_MANAGER", "block"), - data_manager_doc, - validator=is_one_of_factory(["block", "array"]), - ) - -cf.deprecate_option( - # GH#55043 - "mode.data_manager", - "data_manager option is deprecated and will be removed in a future " - "version. Only the BlockManager will be available.", -) - # TODO better name? copy_on_write_doc = """ @@ -499,19 +443,46 @@ def use_inf_as_na_cb(key) -> None: validator=is_one_of_factory([None, "warn", "raise"]), ) +performance_warnings = """ +: boolean + Whether to show or hide PerformanceWarnings. 
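+
+    Disable with ``pd.set_option("mode.performance_warnings", False)``.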
+""" + +with cf.config_prefix("mode"): + cf.register_option( + "performance_warnings", + True, + performance_warnings, + validator=is_bool, + ) + string_storage_doc = """ : string - The default storage for StringDtype. This option is ignored if - ``future.infer_string`` is set to True. + The default storage for StringDtype. """ + +def is_valid_string_storage(value: Any) -> None: + legal_values = ["auto", "python", "pyarrow"] + if value not in legal_values: + msg = "Value must be one of python|pyarrow" + if value == "pyarrow_numpy": + # TODO: we can remove extra message after 3.0 + msg += ( + ". 'pyarrow_numpy' was specified, but this option should be " + "enabled using pandas.options.future.infer_string instead" + ) + raise ValueError(msg) + + with cf.config_prefix("mode"): cf.register_option( "string_storage", - "python", + "auto", string_storage_doc, - validator=is_one_of_factory(["python", "pyarrow", "pyarrow_numpy"]), + # validator=is_one_of_factory(["python", "pyarrow"]), + validator=is_valid_string_storage, ) @@ -653,7 +624,7 @@ def use_inf_as_na_cb(key) -> None: """ -def register_plotting_backend_cb(key) -> None: +def register_plotting_backend_cb(key: str | None) -> None: if key == "matplotlib": # We defer matplotlib validation, since it's the default return @@ -667,7 +638,7 @@ def register_plotting_backend_cb(key) -> None: "backend", defval="matplotlib", doc=plotting_backend_doc, - validator=register_plotting_backend_cb, + validator=register_plotting_backend_cb, # type: ignore[arg-type] ) @@ -679,7 +650,7 @@ def register_plotting_backend_cb(key) -> None: """ -def register_converter_cb(key) -> None: +def register_converter_cb(key: str) -> None: from pandas.plotting import ( deregister_matplotlib_converters, register_matplotlib_converters, @@ -850,14 +821,14 @@ def register_converter_cb(key) -> None: "format.thousands", None, styler_thousands, - validator=is_instance_factory([type(None), str]), + validator=is_instance_factory((type(None), str)), ) cf.register_option( "format.na_rep", None, styler_na_rep, - validator=is_instance_factory([type(None), str]), + validator=is_instance_factory((type(None), str)), ) cf.register_option( @@ -867,11 +838,15 @@ def register_converter_cb(key) -> None: validator=is_one_of_factory([None, "html", "latex", "latex-math"]), ) + # error: Argument 1 to "is_instance_factory" has incompatible type "tuple[ + # ..., , ...]"; expected "type | tuple[type, ...]" cf.register_option( "format.formatter", None, styler_formatter, - validator=is_instance_factory([type(None), dict, Callable, str]), + validator=is_instance_factory( + (type(None), dict, Callable, str) # type: ignore[arg-type] + ), ) cf.register_option("html.mathjax", True, styler_mathjax, validator=is_bool) @@ -898,14 +873,14 @@ def register_converter_cb(key) -> None: "latex.environment", None, styler_environment, - validator=is_instance_factory([type(None), str]), + validator=is_instance_factory((type(None), str)), ) with cf.config_prefix("future"): cf.register_option( "infer_string", - False, + True if os.environ.get("PANDAS_FUTURE_INFER_STRING", "0") == "1" else False, "Whether to infer sequence of str objects as pyarrow string " "dtype, which will be the default in pandas 3.0 " "(at which point this option will be deprecated).", diff --git a/pandas/core/construction.py b/pandas/core/construction.py index d41a9c80a10ec..ada492787a179 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -4,37 +4,25 @@ These should not depend on core.internals. 
""" + from __future__ import annotations -from collections.abc import Sequence from typing import ( TYPE_CHECKING, - Optional, - Union, cast, overload, ) -import warnings import numpy as np from numpy import ma -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import lib from pandas._libs.tslibs import ( - Period, get_supported_dtype, is_supported_dtype, ) -from pandas._typing import ( - AnyArrayLike, - ArrayLike, - Dtype, - DtypeObj, - T, -) -from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.cast import ( @@ -47,9 +35,9 @@ maybe_promote, ) from pandas.core.dtypes.common import ( + ensure_object, is_list_like, is_object_dtype, - is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import NumpyEADtype @@ -64,11 +52,25 @@ import pandas.core.common as com if TYPE_CHECKING: + from collections.abc import Sequence + + from pandas._typing import ( + AnyArrayLike, + ArrayLike, + Dtype, + DtypeObj, + T, + ) + from pandas import ( Index, Series, ) - from pandas.core.arrays.base import ExtensionArray + from pandas.core.arrays import ( + DatetimeArray, + ExtensionArray, + TimedeltaArray, + ) def array( @@ -79,6 +81,10 @@ def array( """ Create an array. + This method constructs an array using pandas extension types when possible. + If `dtype` is specified, it determines the type of array returned. Otherwise, + pandas attempts to infer the appropriate dtype based on `data`. + Parameters ---------- data : Sequence of objects @@ -169,7 +175,7 @@ def array( would no longer return a :class:`arrays.NumpyExtensionArray` backed by a NumPy array. - >>> pd.array(['a', 'b'], dtype=str) + >>> pd.array(["a", "b"], dtype=str) ['a', 'b'] Length: 2, dtype: str32 @@ -178,7 +184,7 @@ def array( data. If you really need the new array to be backed by a NumPy array, specify that in the dtype. - >>> pd.array(['a', 'b'], dtype=np.dtype(">> pd.array(["a", "b"], dtype=np.dtype(" ['a', 'b'] Length: 2, dtype: str32 @@ -193,12 +199,12 @@ def array( rather than a ``NumpyExtensionArray``. This is for symmetry with the case of timezone-aware data, which NumPy does not natively support. - >>> pd.array(['2015', '2016'], dtype='datetime64[ns]') + >>> pd.array(["2015", "2016"], dtype="datetime64[ns]") ['2015-01-01 00:00:00', '2016-01-01 00:00:00'] Length: 2, dtype: datetime64[ns] - >>> pd.array(["1h", "2h"], dtype='timedelta64[ns]') + >>> pd.array(["1h", "2h"], dtype="timedelta64[ns]") ['0 days 01:00:00', '0 days 02:00:00'] Length: 2, dtype: timedelta64[ns] @@ -230,27 +236,27 @@ def array( >>> with pd.option_context("string_storage", "pyarrow"): ... arr = pd.array(["a", None, "c"]) - ... >>> arr ['a', , 'c'] Length: 3, dtype: string - >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")]) + >>> pd.array([pd.Period("2000", freq="D"), pd.Period("2000", freq="D")]) ['2000-01-01', '2000-01-01'] Length: 2, dtype: period[D] You can use the string alias for `dtype` - >>> pd.array(['a', 'b', 'a'], dtype='category') + >>> pd.array(["a", "b", "a"], dtype="category") ['a', 'b', 'a'] Categories (2, object): ['a', 'b'] Or specify the actual dtype - >>> pd.array(['a', 'b', 'a'], - ... dtype=pd.CategoricalDtype(['a', 'b', 'c'], ordered=True)) + >>> pd.array( + ... ["a", "b", "a"], dtype=pd.CategoricalDtype(["a", "b", "c"], ordered=True) + ... 
) ['a', 'b', 'a'] Categories (3, object): ['a' < 'b' < 'c'] @@ -287,9 +293,7 @@ def array( ExtensionArray, FloatingArray, IntegerArray, - IntervalArray, NumpyExtensionArray, - PeriodArray, TimedeltaArray, ) from pandas.core.arrays.string_ import StringDtype @@ -321,46 +325,60 @@ def array( return cls._from_sequence(data, dtype=dtype, copy=copy) if dtype is None: - inferred_dtype = lib.infer_dtype(data, skipna=True) - if inferred_dtype == "period": - period_data = cast(Union[Sequence[Optional[Period]], AnyArrayLike], data) - return PeriodArray._from_sequence(period_data, copy=copy) - - elif inferred_dtype == "interval": - return IntervalArray(data, copy=copy) - - elif inferred_dtype.startswith("datetime"): - # datetime, datetime64 - try: - return DatetimeArray._from_sequence(data, copy=copy) - except ValueError: - # Mixture of timezones, fall back to NumpyExtensionArray - pass - - elif inferred_dtype.startswith("timedelta"): - # timedelta, timedelta64 - return TimedeltaArray._from_sequence(data, copy=copy) - - elif inferred_dtype == "string": + was_ndarray = isinstance(data, np.ndarray) + # error: Item "Sequence[object]" of "Sequence[object] | ExtensionArray | + # ndarray[Any, Any]" has no attribute "dtype" + if not was_ndarray or data.dtype == object: # type: ignore[union-attr] + result = lib.maybe_convert_objects( + ensure_object(data), + convert_non_numeric=True, + convert_to_nullable_dtype=True, + dtype_if_all_nat=None, + ) + result = ensure_wrapped_if_datetimelike(result) + if isinstance(result, np.ndarray): + if len(result) == 0 and not was_ndarray: + # e.g. empty list + return FloatingArray._from_sequence(data, dtype="Float64") + return NumpyExtensionArray._from_sequence( + data, dtype=result.dtype, copy=copy + ) + if result is data and copy: + return result.copy() + return result + + data = cast(np.ndarray, data) + result = ensure_wrapped_if_datetimelike(data) + if result is not data: + result = cast("DatetimeArray | TimedeltaArray", result) + if copy and result.dtype == data.dtype: + return result.copy() + return result + + if data.dtype.kind in "SU": # StringArray/ArrowStringArray depending on pd.options.mode.string_storage dtype = StringDtype() cls = dtype.construct_array_type() return cls._from_sequence(data, dtype=dtype, copy=copy) - elif inferred_dtype == "integer": - return IntegerArray._from_sequence(data, copy=copy) - elif inferred_dtype == "empty" and not hasattr(data, "dtype") and not len(data): - return FloatingArray._from_sequence(data, copy=copy) - elif ( - inferred_dtype in ("floating", "mixed-integer-float") - and getattr(data, "dtype", None) != np.float16 - ): + elif data.dtype.kind in "iu": + dtype = IntegerArray._dtype_cls._get_dtype_mapping()[data.dtype] + return IntegerArray._from_sequence(data, dtype=dtype, copy=copy) + elif data.dtype.kind == "f": # GH#44715 Exclude np.float16 bc FloatingArray does not support it; # we will fall back to NumpyExtensionArray. - return FloatingArray._from_sequence(data, copy=copy) - - elif inferred_dtype == "boolean": + if data.dtype == np.float16: + return NumpyExtensionArray._from_sequence( + data, dtype=data.dtype, copy=copy + ) + dtype = FloatingArray._dtype_cls._get_dtype_mapping()[data.dtype] + return FloatingArray._from_sequence(data, dtype=dtype, copy=copy) + + elif data.dtype.kind == "b": return BooleanArray._from_sequence(data, dtype="boolean", copy=copy) + else: + # e.g. complex + return NumpyExtensionArray._from_sequence(data, dtype=data.dtype, copy=copy) # Pandas overrides NumPy for # 1. 
datetime64[ns,us,ms,s] @@ -372,13 +390,10 @@ def array( return TimedeltaArray._from_sequence(data, dtype=dtype, copy=copy) elif lib.is_np_dtype(dtype, "mM"): - warnings.warn( + raise ValueError( + # GH#53817 r"datetime64 and timedelta64 dtype resolutions other than " - r"'s', 'ms', 'us', and 'ns' are deprecated. " - r"In future releases passing unsupported resolutions will " - r"raise an exception.", - FutureWarning, - stacklevel=find_stack_level(), + r"'s', 'ms', 'us', and 'ns' are no longer supported." ) return NumpyExtensionArray._from_sequence(data, dtype=dtype, copy=copy) @@ -402,15 +417,13 @@ def array( @overload def extract_array( obj: Series | Index, extract_numpy: bool = ..., extract_range: bool = ... -) -> ArrayLike: - ... +) -> ArrayLike: ... @overload def extract_array( obj: T, extract_numpy: bool = ..., extract_range: bool = ... -) -> T | ArrayLike: - ... +) -> T | ArrayLike: ... def extract_array( @@ -439,7 +452,7 @@ def extract_array( Examples -------- - >>> extract_array(pd.Series(['a', 'b', 'c'], dtype='category')) + >>> extract_array(pd.Series(["a", "b", "c"], dtype="category")) ['a', 'b', 'c'] Categories (3, object): ['a', 'b', 'c'] @@ -547,9 +560,7 @@ def sanitize_array( # Avoid ending up with a NumpyExtensionArray dtype = dtype.numpy_dtype - object_index = False - if isinstance(data, ABCIndex) and data.dtype == object and dtype is None: - object_index = True + infer_object = not isinstance(data, (ABCIndex, ABCSeries)) # extract ndarray or ExtensionArray, ensure we have no NumpyExtensionArray data = extract_array(data, extract_numpy=True, extract_range=True) @@ -566,14 +577,10 @@ def sanitize_array( if not is_list_like(data): if index is None: raise ValueError("index must be specified when data is not list-like") - if ( - isinstance(data, str) - and using_pyarrow_string_dtype() - and original_dtype is None - ): + if isinstance(data, str) and using_string_dtype() and original_dtype is None: from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype("pyarrow_numpy") + dtype = StringDtype(na_value=np.nan) data = construct_1d_arraylike_from_scalar(data, len(index), dtype) return data @@ -593,6 +600,8 @@ def sanitize_array( # create an extension array from its dtype _sanitize_non_ordered(data) cls = dtype.construct_array_type() + if not hasattr(data, "__array__"): + data = list(data) subarr = cls._from_sequence(data, dtype=dtype, copy=copy) # GH#846 @@ -602,22 +611,18 @@ def sanitize_array( if dtype is None: subarr = data - if data.dtype == object: + if data.dtype == object and infer_object: subarr = maybe_infer_to_datetimelike(data) - if ( - object_index - and using_pyarrow_string_dtype() - and is_string_dtype(subarr) - ): - # Avoid inference when string option is set - subarr = data - elif data.dtype.kind == "U" and using_pyarrow_string_dtype(): + elif data.dtype.kind == "U" and using_string_dtype(): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(storage="pyarrow_numpy") + dtype = StringDtype(na_value=np.nan) subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype) - if subarr is data and copy: + if ( + subarr is data + or (subarr.dtype == "str" and subarr.dtype.storage == "python") # type: ignore[union-attr] + ) and copy: subarr = subarr.copy() else: @@ -626,7 +631,10 @@ def sanitize_array( elif hasattr(data, "__array__"): # e.g. 
dask array GH#38645 - data = np.array(data, copy=copy) + if not copy: + data = np.asarray(data) + else: + data = np.array(data, copy=copy) return sanitize_array( data, index=index, @@ -744,8 +752,11 @@ def _sanitize_str_dtypes( # GH#19853: If data is a scalar, result has already the result if not lib.is_scalar(data): if not np.all(isna(data)): - data = np.array(data, dtype=dtype, copy=False) - result = np.array(data, dtype=object, copy=copy) + data = np.asarray(data, dtype=dtype) + if not copy: + result = np.asarray(data, dtype=object) + else: + result = np.array(data, dtype=object, copy=copy) return result @@ -802,6 +813,12 @@ def _try_cast( ) elif dtype.kind in "mM": + if is_ndarray: + arr = cast(np.ndarray, arr) + if arr.ndim == 2 and arr.shape[1] == 1: + # GH#60081: DataFrame Constructor converts 1D data to array of + # shape (N, 1), but maybe_cast_to_datetime assumes 1D input + return maybe_cast_to_datetime(arr[:, 0], dtype).reshape(arr.shape) return maybe_cast_to_datetime(arr, dtype) # GH#15832: Check if we are requesting a numeric dtype and @@ -810,6 +827,8 @@ def _try_cast( # this will raise if we have e.g. floats subarr = maybe_cast_to_integer_array(arr, dtype) + elif not copy: + subarr = np.asarray(arr, dtype=dtype) else: subarr = np.array(arr, dtype=dtype, copy=copy) diff --git a/pandas/core/dtypes/api.py b/pandas/core/dtypes/api.py index 254abe330b8e7..e66104d6afcd9 100644 --- a/pandas/core/dtypes/api.py +++ b/pandas/core/dtypes/api.py @@ -20,7 +20,6 @@ is_int64_dtype, is_integer, is_integer_dtype, - is_interval, is_interval_dtype, is_iterator, is_list_like, @@ -63,7 +62,6 @@ "is_int64_dtype", "is_integer", "is_integer_dtype", - "is_interval", "is_interval_dtype", "is_iterator", "is_list_like", diff --git a/pandas/core/dtypes/astype.py b/pandas/core/dtypes/astype.py index f5579082c679b..086f7d2da6640 100644 --- a/pandas/core/dtypes/astype.py +++ b/pandas/core/dtypes/astype.py @@ -2,6 +2,7 @@ Functions for implementing 'astype' methods according to pandas conventions, particularly ones that differ from numpy. """ + from __future__ import annotations import inspect @@ -36,21 +37,17 @@ from pandas.core.arrays import ExtensionArray -_dtype_obj = np.dtype(object) - @overload def _astype_nansafe( arr: np.ndarray, dtype: np.dtype, copy: bool = ..., skipna: bool = ... -) -> np.ndarray: - ... +) -> np.ndarray: ... @overload def _astype_nansafe( arr: np.ndarray, dtype: ExtensionDtype, copy: bool = ..., skipna: bool = ... -) -> ExtensionArray: - ... +) -> ExtensionArray: ... def _astype_nansafe( @@ -258,6 +255,11 @@ def astype_is_view(dtype: DtypeObj, new_dtype: DtypeObj) -> bool: ------- True if new data is a view or not guaranteed to be a copy, False otherwise """ + if dtype.kind in "iufb" and dtype.kind == new_dtype.kind: + # fastpath for numeric dtypes + if hasattr(dtype, "itemsize") and hasattr(new_dtype, "itemsize"): + return dtype.itemsize == new_dtype.itemsize # pyright: ignore[reportAttributeAccessIssue] + if isinstance(dtype, np.dtype) and not isinstance(new_dtype, np.dtype): new_dtype, dtype = dtype, new_dtype diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 6b00a5284ec5b..428fc24cd08ac 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -1,6 +1,7 @@ """ Extend pandas with custom array types. """ + from __future__ import annotations from typing import ( @@ -44,6 +45,11 @@ class ExtensionDtype: """ A custom data type, to be paired with an ExtensionArray. 
+ This enables support for third-party and custom dtypes within the + pandas ecosystem. By implementing this interface and pairing it with a custom + `ExtensionArray`, users can create rich data types that integrate cleanly + with pandas operations, such as grouping, joining, or aggregation. + See Also -------- extensions.register_extension_dtype: Register an ExtensionType @@ -96,10 +102,8 @@ class property**. >>> from pandas.api.extensions import ExtensionArray >>> class ExtensionDtype: ... def __from_arrow__( - ... self, - ... array: pyarrow.Array | pyarrow.ChunkedArray - ... ) -> ExtensionArray: - ... ... + ... self, array: pyarrow.Array | pyarrow.ChunkedArray + ... ) -> ExtensionArray: ... This class does not inherit from 'abc.ABCMeta' for performance reasons. Methods and properties required by the interface raise @@ -487,6 +491,14 @@ def register_extension_dtype(cls: type_t[ExtensionDtypeT]) -> type_t[ExtensionDt callable A class decorator. + See Also + -------- + api.extensions.ExtensionDtype : The base class for creating custom pandas + data types. + Series : One-dimensional array with axis labels. + DataFrame : Two-dimensional, size-mutable, potentially heterogeneous + tabular data. + Examples -------- >>> from pandas.api.extensions import register_extension_dtype, ExtensionDtype @@ -529,22 +541,18 @@ def register(self, dtype: type_t[ExtensionDtype]) -> None: self.dtypes.append(dtype) @overload - def find(self, dtype: type_t[ExtensionDtypeT]) -> type_t[ExtensionDtypeT]: - ... + def find(self, dtype: type_t[ExtensionDtypeT]) -> type_t[ExtensionDtypeT]: ... @overload - def find(self, dtype: ExtensionDtypeT) -> ExtensionDtypeT: - ... + def find(self, dtype: ExtensionDtypeT) -> ExtensionDtypeT: ... @overload - def find(self, dtype: str) -> ExtensionDtype | None: - ... + def find(self, dtype: str) -> ExtensionDtype | None: ... @overload def find( self, dtype: npt.DTypeLike - ) -> type_t[ExtensionDtype] | ExtensionDtype | None: - ... + ) -> type_t[ExtensionDtype] | ExtensionDtype | None: ... 
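The `Registry.find` overloads above are the machinery that lets string aliases resolve to registered dtypes. A toy round-trip sketch; `UnitDtype` is hypothetical and reuses `NumpyExtensionArray` only for illustration, where a real dtype would pair with its own `ExtensionArray` subclass:

```python
import pandas as pd
from pandas.api.extensions import ExtensionDtype, register_extension_dtype
from pandas.arrays import NumpyExtensionArray

@register_extension_dtype
class UnitDtype(ExtensionDtype):
    # Hypothetical toy dtype, just to show the registration round-trip.
    name = "unit"
    type = float

    @classmethod
    def construct_array_type(cls):
        # Illustration only; a real implementation returns its paired array.
        return NumpyExtensionArray

# Registration makes the string alias resolvable through Registry.find:
print(pd.api.types.pandas_dtype("unit"))  # -> a UnitDtype instance
```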
def find( self, dtype: type_t[ExtensionDtype] | ExtensionDtype | npt.DTypeLike diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 7a088bf84c48e..dae04ba6244d4 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -18,7 +18,7 @@ import numpy as np -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import ( Interval, @@ -39,7 +39,6 @@ is_supported_dtype, ) from pandas._libs.tslibs.timedeltas import array_to_timedelta64 -from pandas.compat.numpy import np_version_gt2 from pandas.errors import ( IntCastingNaNError, LossySetitemError, @@ -88,8 +87,8 @@ if TYPE_CHECKING: from collections.abc import ( + Collection, Sequence, - Sized, ) from pandas._typing import ( @@ -231,7 +230,7 @@ def _maybe_unbox_datetimelike(value: Scalar, dtype: DtypeObj) -> Scalar: return value -def _disallow_mismatched_datetimelike(value, dtype: DtypeObj): +def _disallow_mismatched_datetimelike(value, dtype: DtypeObj) -> None: """ numpy allows np.array(dt64values, dtype="timedelta64[ns]") and vice-versa, but we do not want to allow this, so we need to @@ -243,17 +242,19 @@ def _disallow_mismatched_datetimelike(value, dtype: DtypeObj): elif (vdtype.kind == "m" and dtype.kind == "M") or ( vdtype.kind == "M" and dtype.kind == "m" ): - raise TypeError(f"Cannot cast {repr(value)} to {dtype}") + raise TypeError(f"Cannot cast {value!r} to {dtype}") @overload -def maybe_downcast_to_dtype(result: np.ndarray, dtype: str | np.dtype) -> np.ndarray: - ... +def maybe_downcast_to_dtype( + result: np.ndarray, dtype: str | np.dtype +) -> np.ndarray: ... @overload -def maybe_downcast_to_dtype(result: ExtensionArray, dtype: str | np.dtype) -> ArrayLike: - ... +def maybe_downcast_to_dtype( + result: ExtensionArray, dtype: str | np.dtype +) -> ArrayLike: ... def maybe_downcast_to_dtype(result: ArrayLike, dtype: str | np.dtype) -> ArrayLike: @@ -317,15 +318,13 @@ def maybe_downcast_to_dtype(result: ArrayLike, dtype: str | np.dtype) -> ArrayLi @overload def maybe_downcast_numeric( result: np.ndarray, dtype: np.dtype, do_round: bool = False -) -> np.ndarray: - ... +) -> np.ndarray: ... @overload def maybe_downcast_numeric( result: ExtensionArray, dtype: DtypeObj, do_round: bool = False -) -> ArrayLike: - ... +) -> ArrayLike: ... def maybe_downcast_numeric( @@ -513,13 +512,11 @@ def _maybe_cast_to_extension_array( @overload -def ensure_dtype_can_hold_na(dtype: np.dtype) -> np.dtype: - ... +def ensure_dtype_can_hold_na(dtype: np.dtype) -> np.dtype: ... @overload -def ensure_dtype_can_hold_na(dtype: ExtensionDtype) -> ExtensionDtype: - ... +def ensure_dtype_can_hold_na(dtype: ExtensionDtype) -> ExtensionDtype: ... def ensure_dtype_can_hold_na(dtype: DtypeObj) -> DtypeObj: @@ -589,7 +586,9 @@ def maybe_promote(dtype: np.dtype, fill_value=np.nan): # error: Argument 3 to "__call__" of "_lru_cache_wrapper" has incompatible type # "Type[Any]"; expected "Hashable" [arg-type] dtype, fill_value = _maybe_promote_cached( - dtype, fill_value, type(fill_value) # type: ignore[arg-type] + dtype, + fill_value, + type(fill_value), # type: ignore[arg-type] ) except TypeError: # if fill_value is not hashable (required for caching) @@ -799,10 +798,10 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]: # coming out as np.str_! 
dtype = _dtype_obj - if using_pyarrow_string_dtype(): + if using_string_dtype(): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(storage="pyarrow_numpy") + dtype = StringDtype(na_value=np.nan) elif isinstance(val, (np.datetime64, dt.datetime)): try: @@ -891,10 +890,10 @@ def infer_dtype_from_array(arr) -> tuple[DtypeObj, ArrayLike]: Examples -------- - >>> np.asarray([1, '1']) + >>> np.asarray([1, "1"]) array(['1', '1'], dtype='<U21') - >>> infer_dtype_from_array([1, '1']) + >>> infer_dtype_from_array([1, "1"]) (dtype('O'), [1, '1']) """ if isinstance(arr, np.ndarray): @@ -1015,10 +1014,8 @@ def convert_dtypes( Back-end data type applied to the resultant :class:`DataFrame` (still experimental). Behaviour is as follows: - * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). + * ``"numpy_nullable"``: returns nullable-dtype * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. .. versionadded:: 2.0 @@ -1026,6 +1023,8 @@ def convert_dtypes( ------- np.dtype, or ExtensionDtype """ + from pandas.core.arrays.string_ import StringDtype + inferred_dtype: str | DtypeObj if ( @@ -1104,10 +1103,17 @@ def convert_dtypes( # If we couldn't do anything else, then we retain the dtype inferred_dtype = input_array.dtype + elif ( + convert_string + and isinstance(input_array.dtype, StringDtype) + and input_array.dtype.na_value is np.nan + ): + inferred_dtype = pandas_dtype_func("string") + else: inferred_dtype = input_array.dtype - if dtype_backend == "pyarrow": + if dtype_backend == "pyarrow" and not isinstance(inferred_dtype, ArrowDtype): from pandas.core.arrays.arrow.array import to_pyarrow_type from pandas.core.arrays.string_ import StringDtype @@ -1121,6 +1127,7 @@ def convert_dtypes( or ( inferred_dtype.kind not in "iufcb" and not isinstance(inferred_dtype, StringDtype) + and not isinstance(inferred_dtype, CategoricalDtype) ) ): if isinstance(inferred_dtype, PandasExtensionDtype) and not isinstance( @@ -1156,6 +1163,7 @@ def convert_dtypes( def maybe_infer_to_datetimelike( value: npt.NDArray[np.object_], + convert_to_nullable_dtype: bool = False, ) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray | IntervalArray: """ we might have a array (or single object) that is datetime like, @@ -1193,13 +1201,14 @@ def maybe_infer_to_datetimelike( # numpy would have done it for us. convert_numeric=False, convert_non_numeric=True, - dtype_if_all_nat=np.dtype("M8[ns]"), + convert_to_nullable_dtype=convert_to_nullable_dtype, + dtype_if_all_nat=np.dtype("M8[s]"), ) def maybe_cast_to_datetime( value: np.ndarray | list, dtype: np.dtype -) -> ExtensionArray | np.ndarray: +) -> DatetimeArray | TimedeltaArray | np.ndarray: """ try to cast the array/value to a datetimelike dtype, converting float nan to iNaT @@ -1332,7 +1341,7 @@ def find_result_type(left_dtype: DtypeObj, right: Any) -> DtypeObj: right = left_dtype elif ( not np.issubdtype(left_dtype, np.unsignedinteger) - and 0 < right <= 2 ** (8 * right_dtype.itemsize - 1) - 1 + and 0 < right <= np.iinfo(right_dtype).max ): # If left dtype isn't unsigned, check if it fits in the signed dtype right = np.dtype(f"i{right_dtype.itemsize}") @@ -1372,7 +1381,7 @@ def common_dtype_categorical_compat( # TODO: more generally, could do `not can_hold_na(dtype)` if lib.is_np_dtype(dtype, "iu"): for obj in objs: - # We don't want to accientally allow e.g. "categorical" str here + # We don't want to accidentally allow e.g.
"categorical" str here obj_dtype = getattr(obj, "dtype", None) if isinstance(obj_dtype, CategoricalDtype): if isinstance(obj, ABCIndex): @@ -1416,18 +1425,15 @@ def np_find_common_type(*dtypes: np.dtype) -> np.dtype: @overload -def find_common_type(types: list[np.dtype]) -> np.dtype: - ... +def find_common_type(types: list[np.dtype]) -> np.dtype: ... @overload -def find_common_type(types: list[ExtensionDtype]) -> DtypeObj: - ... +def find_common_type(types: list[ExtensionDtype]) -> DtypeObj: ... @overload -def find_common_type(types: list[DtypeObj]) -> DtypeObj: - ... +def find_common_type(types: list[DtypeObj]) -> DtypeObj: ... def find_common_type(types): @@ -1501,7 +1507,10 @@ def construct_2d_arraylike_from_scalar( # Attempt to coerce to a numpy array try: - arr = np.array(value, dtype=dtype, copy=copy) + if not copy: + arr = np.asarray(value, dtype=dtype) + else: + arr = np.array(value, dtype=dtype, copy=copy) except (ValueError, TypeError) as err: raise TypeError( f"DataFrame constructor called with incompatible data and dtype: {err}" @@ -1541,25 +1550,24 @@ def construct_1d_arraylike_from_scalar( if isinstance(dtype, ExtensionDtype): cls = dtype.construct_array_type() seq = [] if length == 0 else [value] - subarr = cls._from_sequence(seq, dtype=dtype).repeat(length) + return cls._from_sequence(seq, dtype=dtype).repeat(length) + + if length and dtype.kind in "iu" and isna(value): + # coerce if we have nan for an integer dtype + dtype = np.dtype("float64") + elif lib.is_np_dtype(dtype, "US"): + # we need to coerce to object dtype to avoid + # to allow numpy to take our string as a scalar value + dtype = np.dtype("object") + if not isna(value): + value = ensure_str(value) + elif dtype.kind in "mM": + value = _maybe_box_and_unbox_datetimelike(value, dtype) - else: - if length and dtype.kind in "iu" and isna(value): - # coerce if we have nan for an integer dtype - dtype = np.dtype("float64") - elif lib.is_np_dtype(dtype, "US"): - # we need to coerce to object dtype to avoid - # to allow numpy to take our string as a scalar value - dtype = np.dtype("object") - if not isna(value): - value = ensure_str(value) - elif dtype.kind in "mM": - value = _maybe_box_and_unbox_datetimelike(value, dtype) - - subarr = np.empty(length, dtype=dtype) - if length: - # GH 47391: numpy > 1.24 will raise filling np.nan into int dtypes - subarr.fill(value) + subarr = np.empty(length, dtype=dtype) + if length: + # GH 47391: numpy > 1.24 will raise filling np.nan into int dtypes + subarr.fill(value) return subarr @@ -1574,7 +1582,7 @@ def _maybe_box_and_unbox_datetimelike(value: Scalar, dtype: DtypeObj): return _maybe_unbox_datetimelike(value, dtype) -def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray: +def construct_1d_object_array_from_listlike(values: Collection) -> np.ndarray: """ Transform any list-like object in a 1-dimensional numpy array of object dtype. 
@@ -1592,11 +1600,9 @@ def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray: ------- 1-dimensional numpy array of dtype object """ - # numpy will try to interpret nested lists as further dimensions, hence - # making a 1D array that contains list-likes is a bit tricky: - result = np.empty(len(values), dtype="object") - result[:] = values - return result + # numpy will try to interpret nested lists as further dimensions in np.array(), + # hence explicitly making a 1D array using np.fromiter + return np.fromiter(values, dtype="object", count=len(values)) def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.ndarray: @@ -1644,14 +1650,12 @@ def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.n with warnings.catch_warnings(): # We already disallow dtype=uint w/ negative numbers # (test_constructor_coercion_signed_to_unsigned) so safe to ignore. - if not np_version_gt2: - warnings.filterwarnings( - "ignore", - "NumPy will stop allowing conversion of " - "out-of-bound Python int", - DeprecationWarning, - ) - casted = np.array(arr, dtype=dtype, copy=False) + warnings.filterwarnings( + "ignore", + "NumPy will stop allowing conversion of out-of-bound Python int", + DeprecationWarning, + ) + casted = np.asarray(arr, dtype=dtype) else: with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=RuntimeWarning) @@ -1682,6 +1686,7 @@ def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.n arr = np.asarray(arr) if np.issubdtype(arr.dtype, str): + # TODO(numpy-2.0 min): This case will raise an OverflowError above if (casted.astype(str) == arr).all(): return casted raise ValueError(f"string values cannot be losslessly cast to {dtype}") @@ -1697,7 +1702,7 @@ def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.n ) raise ValueError("Trying to coerce float values to integers") if arr.dtype == object: - raise ValueError("Trying to coerce float values to integers") + raise ValueError("Trying to coerce object values to integers") if casted.dtype < arr.dtype: # TODO: Can this path be hit anymore with numpy > 2 @@ -1745,6 +1750,13 @@ def can_hold_element(arr: ArrayLike, element: Any) -> bool: except (ValueError, TypeError): return False + if dtype == "string": + try: + arr._maybe_convert_setitem_value(element) # type: ignore[union-attr] + return True + except (ValueError, TypeError): + return False + # This is technically incorrect, but maintains the behavior of # ExtensionBlock._can_hold_element return True @@ -1816,8 +1828,8 @@ def np_can_hold_element(dtype: np.dtype, element: Any) -> Any: # TODO: general-case for EAs? try: casted = element.astype(dtype) - except (ValueError, TypeError): - raise LossySetitemError + except (ValueError, TypeError) as err: + raise LossySetitemError from err # Check for cases of either # a) lossy overflow/rounding or # b) semantic changes like dt64->int64 diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 2245359fd8eac..68d99937f728c 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1,17 +1,19 @@ """ Common type operations. 
""" + from __future__ import annotations from typing import ( TYPE_CHECKING, Any, - Callable, ) import warnings import numpy as np +from pandas._config import using_string_dtype + from pandas._libs import ( Interval, Period, @@ -42,7 +44,6 @@ is_float, is_hashable, is_integer, - is_interval, is_iterator, is_list_like, is_named_tuple, @@ -55,6 +56,8 @@ ) if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import ( ArrayLike, DtypeObj, @@ -138,6 +141,11 @@ def is_object_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of the object dtype. + This method examines the input to determine if it is of the + object data type. Object dtype is a generic data type that can + hold any Python objects, including strings, lists, and custom + objects. + Parameters ---------- arr_or_dtype : array-like or dtype @@ -148,6 +156,15 @@ def is_object_dtype(arr_or_dtype) -> bool: boolean Whether or not the array-like or dtype is of the object dtype. + See Also + -------- + api.types.is_numeric_dtype : Check whether the provided array or dtype is of a + numeric dtype. + api.types.is_string_dtype : Check whether the provided array or dtype is of + the string dtype. + api.types.is_bool_dtype : Check whether the provided array or dtype is of a + boolean dtype. + Examples -------- >>> from pandas.api.types import is_object_dtype @@ -169,6 +186,9 @@ def is_sparse(arr) -> bool: """ Check whether an array-like is a 1-D pandas sparse array. + .. deprecated:: 2.1.0 + Use isinstance(dtype, pd.SparseDtype) instead. + Check that the one-dimensional array-like is a pandas sparse array. Returns True if it is a pandas sparse array, not another type of sparse array. @@ -183,6 +203,10 @@ def is_sparse(arr) -> bool: bool Whether or not the array-like is a pandas sparse array. + See Also + -------- + api.types.SparseDtype : The dtype object for pandas sparse arrays. + Examples -------- Returns `True` if the parameter is a 1-D pandas sparse array. @@ -247,7 +271,7 @@ def is_scipy_sparse(arr) -> bool: """ global _is_scipy_sparse - if _is_scipy_sparse is None: # pylint: disable=used-before-assignment + if _is_scipy_sparse is None: try: from scipy.sparse import issparse as _is_scipy_sparse except ImportError: @@ -271,6 +295,13 @@ def is_datetime64_dtype(arr_or_dtype) -> bool: boolean Whether or not the array-like or dtype is of the datetime64 dtype. + See Also + -------- + api.types.is_datetime64_ns_dtype: Check whether the provided array or + dtype is of the datetime64[ns] dtype. + api.types.is_datetime64_any_dtype: Check whether the provided array or + dtype is of the datetime64 dtype. + Examples -------- >>> from pandas.api.types import is_datetime64_dtype @@ -295,6 +326,9 @@ def is_datetime64tz_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of a DatetimeTZDtype dtype. + .. deprecated:: 2.1.0 + Use isinstance(dtype, pd.DatetimeTZDtype) instead. + Parameters ---------- arr_or_dtype : array-like or dtype @@ -305,6 +339,13 @@ def is_datetime64tz_dtype(arr_or_dtype) -> bool: boolean Whether or not the array-like or dtype is of a DatetimeTZDtype dtype. + See Also + -------- + api.types.is_datetime64_dtype: Check whether an array-like or + dtype is of the datetime64 dtype. + api.types.is_datetime64_any_dtype: Check whether the provided array or + dtype is of the datetime64 dtype. 
+ Examples -------- >>> from pandas.api.types import is_datetime64tz_dtype @@ -356,6 +397,13 @@ def is_timedelta64_dtype(arr_or_dtype) -> bool: boolean Whether or not the array-like or dtype is of the timedelta64 dtype. + See Also + -------- + api.types.is_timedelta64_ns_dtype : Check whether the provided array or dtype is + of the timedelta64[ns] dtype. + api.types.is_period_dtype : Check whether an array-like or dtype is of the + Period dtype. + Examples -------- >>> from pandas.core.dtypes.common import is_timedelta64_dtype @@ -367,7 +415,7 @@ def is_timedelta64_dtype(arr_or_dtype) -> bool: False >>> is_timedelta64_dtype(pd.Series([], dtype="timedelta64[ns]")) True - >>> is_timedelta64_dtype('0 days') + >>> is_timedelta64_dtype("0 days") False """ if isinstance(arr_or_dtype, np.dtype): @@ -381,6 +429,9 @@ def is_period_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of the Period dtype. + .. deprecated:: 2.2.0 + Use isinstance(dtype, pd.PeriodDtype) instead. + Parameters ---------- arr_or_dtype : array-like or dtype @@ -391,6 +442,13 @@ def is_period_dtype(arr_or_dtype) -> bool: boolean Whether or not the array-like or dtype is of the Period dtype. + See Also + -------- + api.types.is_timedelta64_ns_dtype : Check whether the provided array or dtype is + of the timedelta64[ns] dtype. + api.types.is_timedelta64_dtype: Check whether an array-like or dtype + is of the timedelta64 dtype. + Examples -------- >>> from pandas.core.dtypes.common import is_period_dtype @@ -424,6 +482,9 @@ def is_interval_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of the Interval dtype. + .. deprecated:: 2.2.0 + Use isinstance(dtype, pd.IntervalDtype) instead. + Parameters ---------- arr_or_dtype : array-like or dtype @@ -434,6 +495,15 @@ def is_interval_dtype(arr_or_dtype) -> bool: boolean Whether or not the array-like or dtype is of the Interval dtype. + See Also + -------- + api.types.is_object_dtype : Check whether an array-like or dtype is of the + object dtype. + api.types.is_numeric_dtype : Check whether the provided array or dtype is + of a numeric dtype. + api.types.is_categorical_dtype : Check whether an array-like or dtype is of + the Categorical dtype. + Examples -------- >>> from pandas.core.dtypes.common import is_interval_dtype @@ -470,6 +540,9 @@ def is_categorical_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of the Categorical dtype. + .. deprecated:: 2.2.0 + Use isinstance(dtype, pd.CategoricalDtype) instead. + Parameters ---------- arr_or_dtype : array-like or dtype @@ -480,6 +553,12 @@ def is_categorical_dtype(arr_or_dtype) -> bool: boolean Whether or not the array-like or dtype is of the Categorical dtype. + See Also + -------- + api.types.is_list_like: Check if the object is list-like. + api.types.is_complex_dtype: Check whether the provided array or + dtype is of a complex dtype. + Examples -------- >>> from pandas.api.types import is_categorical_dtype @@ -535,6 +614,11 @@ def is_string_dtype(arr_or_dtype) -> bool: boolean Whether or not the array or dtype is of the string dtype. + See Also + -------- + api.types.is_string_dtype : Check whether the provided array or dtype + is of the string dtype. 
+ Examples -------- >>> from pandas.api.types import is_string_dtype @@ -544,7 +628,7 @@ def is_string_dtype(arr_or_dtype) -> bool: True >>> is_string_dtype(int) False - >>> is_string_dtype(np.array(['a', 'b'])) + >>> is_string_dtype(np.array(["a", "b"])) True >>> is_string_dtype(pd.Series([1, 2])) False @@ -571,24 +655,38 @@ def is_dtype_equal(source, target) -> bool: Parameters ---------- - source : The first dtype to compare - target : The second dtype to compare + source : type or str + The first dtype to compare. + target : type or str + The second dtype to compare. Returns ------- boolean Whether or not the two dtypes are equal. + See Also + -------- + api.types.is_categorical_dtype : Check whether the provided array or dtype + is of the Categorical dtype. + api.types.is_string_dtype : Check whether the provided array or dtype + is of the string dtype. + api.types.is_object_dtype : Check whether an array-like or dtype is of the + object dtype. + Examples -------- + >>> from pandas.api.types import is_dtype_equal >>> is_dtype_equal(int, float) False >>> is_dtype_equal("int", int) True >>> is_dtype_equal(object, "category") False + >>> from pandas.core.dtypes.dtypes import CategoricalDtype >>> is_dtype_equal(CategoricalDtype(), "category") True + >>> from pandas.core.dtypes.dtypes import DatetimeTZDtype >>> is_dtype_equal(DatetimeTZDtype(tz="UTC"), "datetime64") False """ @@ -635,6 +733,15 @@ def is_integer_dtype(arr_or_dtype) -> bool: Whether or not the array or dtype is of an integer dtype and not an instance of timedelta64. + See Also + -------- + api.types.is_integer : Return True if given object is integer. + api.types.is_numeric_dtype : Check whether the provided array or dtype is of a + numeric dtype. + api.types.is_float_dtype : Check whether the provided array or dtype is of a + float dtype. + Int64Dtype : An ExtensionDtype for Int64Dtype integer data. + Examples -------- >>> from pandas.api.types import is_integer_dtype @@ -646,9 +753,9 @@ def is_integer_dtype(arr_or_dtype) -> bool: False >>> is_integer_dtype(np.uint64) True - >>> is_integer_dtype('int8') + >>> is_integer_dtype("int8") True - >>> is_integer_dtype('Int8') + >>> is_integer_dtype("Int8") True >>> is_integer_dtype(pd.Int8Dtype) True @@ -656,13 +763,13 @@ def is_integer_dtype(arr_or_dtype) -> bool: False >>> is_integer_dtype(np.timedelta64) False - >>> is_integer_dtype(np.array(['a', 'b'])) + >>> is_integer_dtype(np.array(["a", "b"])) False >>> is_integer_dtype(pd.Series([1, 2])) True >>> is_integer_dtype(np.array([], dtype=np.timedelta64)) False - >>> is_integer_dtype(pd.Index([1, 2.])) # float + >>> is_integer_dtype(pd.Index([1, 2.0])) # float False """ return _is_dtype_type( @@ -692,6 +799,15 @@ def is_signed_integer_dtype(arr_or_dtype) -> bool: Whether or not the array or dtype is of a signed integer dtype and not an instance of timedelta64. + See Also + -------- + api.types.is_integer_dtype: Check whether the provided array or dtype + is of an integer dtype. + api.types.is_numeric_dtype: Check whether the provided array or dtype + is of a numeric dtype. + api.types.is_unsigned_integer_dtype: Check whether the provided array + or dtype is of an unsigned integer dtype. 
+ Examples -------- >>> from pandas.core.dtypes.common import is_signed_integer_dtype @@ -703,9 +819,9 @@ def is_signed_integer_dtype(arr_or_dtype) -> bool: False >>> is_signed_integer_dtype(np.uint64) # unsigned False - >>> is_signed_integer_dtype('int8') + >>> is_signed_integer_dtype("int8") True - >>> is_signed_integer_dtype('Int8') + >>> is_signed_integer_dtype("Int8") True >>> is_signed_integer_dtype(pd.Int8Dtype) True @@ -713,13 +829,13 @@ def is_signed_integer_dtype(arr_or_dtype) -> bool: False >>> is_signed_integer_dtype(np.timedelta64) False - >>> is_signed_integer_dtype(np.array(['a', 'b'])) + >>> is_signed_integer_dtype(np.array(["a", "b"])) False >>> is_signed_integer_dtype(pd.Series([1, 2])) True >>> is_signed_integer_dtype(np.array([], dtype=np.timedelta64)) False - >>> is_signed_integer_dtype(pd.Index([1, 2.])) # float + >>> is_signed_integer_dtype(pd.Index([1, 2.0])) # float False >>> is_signed_integer_dtype(np.array([1, 2], dtype=np.uint32)) # unsigned False @@ -748,6 +864,15 @@ def is_unsigned_integer_dtype(arr_or_dtype) -> bool: boolean Whether or not the array or dtype is of an unsigned integer dtype. + See Also + -------- + api.types.is_signed_integer_dtype : Check whether the provided array + or dtype is of an signed integer dtype. + api.types.is_integer_dtype : Check whether the provided array or dtype + is of an integer dtype. + api.types.is_numeric_dtype : Check whether the provided array or dtype + is of a numeric dtype. + Examples -------- >>> from pandas.api.types import is_unsigned_integer_dtype @@ -759,17 +884,17 @@ def is_unsigned_integer_dtype(arr_or_dtype) -> bool: False >>> is_unsigned_integer_dtype(np.uint64) True - >>> is_unsigned_integer_dtype('uint8') + >>> is_unsigned_integer_dtype("uint8") True - >>> is_unsigned_integer_dtype('UInt8') + >>> is_unsigned_integer_dtype("UInt8") True >>> is_unsigned_integer_dtype(pd.UInt8Dtype) True - >>> is_unsigned_integer_dtype(np.array(['a', 'b'])) + >>> is_unsigned_integer_dtype(np.array(["a", "b"])) False >>> is_unsigned_integer_dtype(pd.Series([1, 2])) # signed False - >>> is_unsigned_integer_dtype(pd.Index([1, 2.])) # float + >>> is_unsigned_integer_dtype(pd.Index([1, 2.0])) # float False >>> is_unsigned_integer_dtype(np.array([1, 2], dtype=np.uint32)) True @@ -800,6 +925,16 @@ def is_int64_dtype(arr_or_dtype) -> bool: boolean Whether or not the array or dtype is of the int64 dtype. + See Also + -------- + api.types.is_float_dtype : Check whether the provided array or dtype is of a + float dtype. + api.types.is_bool_dtype : Check whether the provided array or dtype is of a + boolean dtype. + api.types.is_object_dtype : Check whether an array-like or dtype is of the + object dtype. + numpy.int64 : Numpy's 64-bit integer type. 
+ Notes ----- Depending on system architecture, the return value of `is_int64_dtype( @@ -815,9 +950,9 @@ def is_int64_dtype(arr_or_dtype) -> bool: False >>> is_int64_dtype(np.int64) # doctest: +SKIP True - >>> is_int64_dtype('int8') # doctest: +SKIP + >>> is_int64_dtype("int8") # doctest: +SKIP False - >>> is_int64_dtype('Int8') # doctest: +SKIP + >>> is_int64_dtype("Int8") # doctest: +SKIP False >>> is_int64_dtype(pd.Int64Dtype) # doctest: +SKIP True @@ -825,11 +960,11 @@ def is_int64_dtype(arr_or_dtype) -> bool: False >>> is_int64_dtype(np.uint64) # unsigned # doctest: +SKIP False - >>> is_int64_dtype(np.array(['a', 'b'])) # doctest: +SKIP + >>> is_int64_dtype(np.array(["a", "b"])) # doctest: +SKIP False >>> is_int64_dtype(np.array([1, 2], dtype=np.int64)) # doctest: +SKIP True - >>> is_int64_dtype(pd.Index([1, 2.])) # float # doctest: +SKIP + >>> is_int64_dtype(pd.Index([1, 2.0])) # float # doctest: +SKIP False >>> is_int64_dtype(np.array([1, 2], dtype=np.uint32)) # unsigned # doctest: +SKIP False @@ -858,6 +993,15 @@ def is_datetime64_any_dtype(arr_or_dtype) -> bool: bool Whether or not the array or dtype is of the datetime64 dtype. + See Also + -------- + api.types.is_datetime64_dtype : Check whether an array-like or dtype is of the + datetime64 dtype. + api.is_datetime64_ns_dtype : Check whether the provided array or dtype is of the + datetime64[ns] dtype. + api.is_datetime64tz_dtype : Check whether an array-like or dtype is of a + DatetimeTZDtype dtype. + Examples -------- >>> from pandas.api.types import is_datetime64_any_dtype @@ -870,7 +1014,7 @@ def is_datetime64_any_dtype(arr_or_dtype) -> bool: True >>> is_datetime64_any_dtype(DatetimeTZDtype("ns", "US/Eastern")) True - >>> is_datetime64_any_dtype(np.array(['a', 'b'])) + >>> is_datetime64_any_dtype(np.array(["a", "b"])) False >>> is_datetime64_any_dtype(np.array([1, 2])) False @@ -890,7 +1034,11 @@ def is_datetime64_any_dtype(arr_or_dtype) -> bool: tipo = _get_dtype(arr_or_dtype) except TypeError: return False - return lib.is_np_dtype(tipo, "M") or isinstance(tipo, DatetimeTZDtype) + return ( + lib.is_np_dtype(tipo, "M") + or isinstance(tipo, DatetimeTZDtype) + or (isinstance(tipo, ExtensionDtype) and tipo.kind == "M") + ) def is_datetime64_ns_dtype(arr_or_dtype) -> bool: @@ -907,6 +1055,13 @@ def is_datetime64_ns_dtype(arr_or_dtype) -> bool: bool Whether or not the array or dtype is of the datetime64[ns] dtype. + See Also + -------- + api.types.is_datetime64_dtype: Check whether an array-like or + dtype is of the datetime64 dtype. + api.types.is_datetime64_any_dtype: Check whether the provided array or + dtype is of the datetime64 dtype. + Examples -------- >>> from pandas.api.types import is_datetime64_ns_dtype @@ -919,7 +1074,7 @@ def is_datetime64_ns_dtype(arr_or_dtype) -> bool: False >>> is_datetime64_ns_dtype(DatetimeTZDtype("ns", "US/Eastern")) True - >>> is_datetime64_ns_dtype(np.array(['a', 'b'])) + >>> is_datetime64_ns_dtype(np.array(["a", "b"])) False >>> is_datetime64_ns_dtype(np.array([1, 2])) False @@ -958,14 +1113,19 @@ def is_timedelta64_ns_dtype(arr_or_dtype) -> bool: boolean Whether or not the array or dtype is of the timedelta64[ns] dtype. + See Also + -------- + api.types.is_timedelta64_dtype: Check whether an array-like or dtype + is of the timedelta64 dtype. 
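The extra branch added to `is_datetime64_any_dtype` above accepts any `ExtensionDtype` whose `kind` is `"M"`, not just numpy and `DatetimeTZDtype` dtypes. One consequence, assuming pyarrow is installed, is that Arrow-backed timestamps now qualify:

```python
import pyarrow as pa
import pandas as pd
from pandas.api.types import is_datetime64_any_dtype

# ArrowDtype exposes kind == "M" for timestamp types, so the new
# ExtensionDtype branch returns True for it:
print(is_datetime64_any_dtype(pd.ArrowDtype(pa.timestamp("ns"))))  # True
```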
+ Examples -------- >>> from pandas.core.dtypes.common import is_timedelta64_ns_dtype - >>> is_timedelta64_ns_dtype(np.dtype('m8[ns]')) + >>> is_timedelta64_ns_dtype(np.dtype("m8[ns]")) True - >>> is_timedelta64_ns_dtype(np.dtype('m8[ps]')) # Wrong frequency + >>> is_timedelta64_ns_dtype(np.dtype("m8[ps]")) # Wrong frequency False - >>> is_timedelta64_ns_dtype(np.array([1, 2], dtype='m8[ns]')) + >>> is_timedelta64_ns_dtype(np.array([1, 2], dtype="m8[ns]")) True >>> is_timedelta64_ns_dtype(np.array([1, 2], dtype=np.timedelta64)) False @@ -1047,7 +1207,7 @@ def needs_i8_conversion(dtype: DtypeObj | None) -> bool: False >>> needs_i8_conversion(np.dtype(np.datetime64)) True - >>> needs_i8_conversion(np.array(['a', 'b'])) + >>> needs_i8_conversion(np.array(["a", "b"])) False >>> needs_i8_conversion(pd.Series([1, 2])) False @@ -1077,6 +1237,15 @@ def is_numeric_dtype(arr_or_dtype) -> bool: boolean Whether or not the array or dtype is of a numeric dtype. + See Also + -------- + api.types.is_integer_dtype: Check whether the provided array or dtype + is of an integer dtype. + api.types.is_unsigned_integer_dtype: Check whether the provided array + or dtype is of an unsigned integer dtype. + api.types.is_signed_integer_dtype: Check whether the provided array + or dtype is of an signed integer dtype. + Examples -------- >>> from pandas.api.types import is_numeric_dtype @@ -1092,11 +1261,11 @@ def is_numeric_dtype(arr_or_dtype) -> bool: False >>> is_numeric_dtype(np.timedelta64) False - >>> is_numeric_dtype(np.array(['a', 'b'])) + >>> is_numeric_dtype(np.array(["a", "b"])) False >>> is_numeric_dtype(pd.Series([1, 2])) True - >>> is_numeric_dtype(pd.Index([1, 2.])) + >>> is_numeric_dtype(pd.Index([1, 2.0])) True >>> is_numeric_dtype(np.array([], dtype=np.timedelta64)) False @@ -1122,6 +1291,12 @@ def is_any_real_numeric_dtype(arr_or_dtype) -> bool: boolean Whether or not the array or dtype is of a real number dtype. + See Also + -------- + is_numeric_dtype : Check if a dtype is numeric. + is_complex_dtype : Check if a dtype is complex. + is_bool_dtype : Check if a dtype is boolean. + Examples -------- >>> from pandas.api.types import is_any_real_numeric_dtype @@ -1149,6 +1324,9 @@ def is_float_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of a float dtype. + The function checks for floating-point data types, which represent real numbers + that may have fractional components. + Parameters ---------- arr_or_dtype : array-like or dtype @@ -1159,6 +1337,15 @@ def is_float_dtype(arr_or_dtype) -> bool: boolean Whether or not the array or dtype is of a float dtype. + See Also + -------- + api.types.is_numeric_dtype : Check whether the provided array or dtype is of + a numeric dtype. + api.types.is_integer_dtype : Check whether the provided array or dtype is of + an integer dtype. + api.types.is_object_dtype : Check whether an array-like or dtype is of the + object dtype. + Examples -------- >>> from pandas.api.types import is_float_dtype @@ -1168,11 +1355,11 @@ def is_float_dtype(arr_or_dtype) -> bool: False >>> is_float_dtype(float) True - >>> is_float_dtype(np.array(['a', 'b'])) + >>> is_float_dtype(np.array(["a", "b"])) False >>> is_float_dtype(pd.Series([1, 2])) False - >>> is_float_dtype(pd.Index([1, 2.])) + >>> is_float_dtype(pd.Index([1, 2.0])) True """ return _is_dtype_type(arr_or_dtype, classes(np.floating)) or _is_dtype( @@ -1184,6 +1371,10 @@ def is_bool_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of a boolean dtype. 
+ This function verifies whether a given object is a boolean data type. The input + can be an array or a dtype object. Accepted array types include instances + of ``np.array``, ``pd.Series``, ``pd.Index``, and similar array-like structures. + Parameters ---------- arr_or_dtype : array-like or dtype @@ -1194,6 +1385,10 @@ def is_bool_dtype(arr_or_dtype) -> bool: boolean Whether or not the array or dtype is of a boolean dtype. + See Also + -------- + api.types.is_bool : Check if an object is a boolean. + Notes ----- An ExtensionArray is considered boolean when the ``_is_boolean`` @@ -1210,7 +1405,7 @@ def is_bool_dtype(arr_or_dtype) -> bool: True >>> is_bool_dtype(np.bool_) True - >>> is_bool_dtype(np.array(['a', 'b'])) + >>> is_bool_dtype(np.array(["a", "b"])) False >>> is_bool_dtype(pd.Series([1, 2])) False @@ -1276,6 +1471,10 @@ def is_extension_array_dtype(arr_or_dtype) -> bool: bool Whether the `arr_or_dtype` is an extension array type. + See Also + -------- + api.extensions.ExtensionArray : Abstract base class for pandas extension arrays. + Notes ----- This checks whether an object implements the pandas extension @@ -1294,13 +1493,13 @@ def is_extension_array_dtype(arr_or_dtype) -> bool: Examples -------- >>> from pandas.api.types import is_extension_array_dtype - >>> arr = pd.Categorical(['a', 'b']) + >>> arr = pd.Categorical(["a", "b"]) >>> is_extension_array_dtype(arr) True >>> is_extension_array_dtype(arr.dtype) True - >>> arr = np.array(['a', 'b']) + >>> arr = np.array(["a", "b"]) >>> is_extension_array_dtype(arr.dtype) False """ @@ -1310,7 +1509,15 @@ def is_extension_array_dtype(arr_or_dtype) -> bool: elif isinstance(dtype, np.dtype): return False else: - return registry.find(dtype) is not None + try: + with warnings.catch_warnings(): + # pandas_dtype(..) can raise UserWarning for class input + warnings.simplefilter("ignore", UserWarning) + dtype = pandas_dtype(dtype) + except (TypeError, ValueError): + # np.dtype(..) can raise ValueError + return False + return isinstance(dtype, ExtensionDtype) def is_ea_or_datetimelike_dtype(dtype: DtypeObj | None) -> bool: @@ -1338,6 +1545,14 @@ def is_complex_dtype(arr_or_dtype) -> bool: boolean Whether or not the array or dtype is of a complex dtype. + See Also + -------- + api.types.is_complex: Return True if given object is complex. + api.types.is_numeric_dtype: Check whether the provided array or + dtype is of a numeric dtype. + api.types.is_integer_dtype: Check whether the provided array or + dtype is of an integer dtype. + Examples -------- >>> from pandas.api.types import is_complex_dtype @@ -1347,7 +1562,7 @@ def is_complex_dtype(arr_or_dtype) -> bool: False >>> is_complex_dtype(np.complex128) True - >>> is_complex_dtype(np.array(['a', 'b'])) + >>> is_complex_dtype(np.array(["a", "b"])) False >>> is_complex_dtype(pd.Series([1, 2])) False @@ -1548,8 +1763,8 @@ def _validate_date_like_dtype(dtype) -> None: raise TypeError(e) from e if typ not in ["generic", "ns"]: raise ValueError( - f"{repr(dtype.name)} is too specific of a frequency, " - f"try passing {repr(dtype.type.__name__)}" + f"{dtype.name!r} is too specific of a frequency, " + f"try passing {dtype.type.__name__!r}" ) @@ -1584,16 +1799,22 @@ def pandas_dtype(dtype) -> DtypeObj: Parameters ---------- - dtype : object to be converted + dtype : object + The object to be converted into a dtype. Returns ------- np.dtype or a pandas dtype + The converted dtype, which can be either a numpy dtype or a pandas dtype. 
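One behavioral note on the `pandas_dtype` hunk continuing below: the builtin `str` alias now resolves to a NaN-backed `StringDtype` when the string-dtype option is active. A sketch, assuming a build where `using_string_dtype()` reflects the `future.infer_string` option:

```python
import numpy as np
import pandas as pd

pd.set_option("future.infer_string", True)  # opt in to the string dtype
dtype = pd.api.types.pandas_dtype(str)
print(dtype, dtype.na_value is np.nan)  # a StringDtype with NaN semantics
```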
Raises ------ TypeError if not a dtype + See Also + -------- + api.types.is_dtype : Return true if the condition is satisfied for the arr_or_dtype. + Examples -------- >>> pd.api.types.pandas_dtype(int) @@ -1605,6 +1826,12 @@ def pandas_dtype(dtype) -> DtypeObj: elif isinstance(dtype, (np.dtype, ExtensionDtype)): return dtype + # builtin aliases + if dtype is str and using_string_dtype(): + from pandas.core.arrays.string_ import StringDtype + + return StringDtype(na_value=np.nan) + # registered extension types result = registry.find(dtype) if result is not None: @@ -1623,6 +1850,8 @@ def pandas_dtype(dtype) -> DtypeObj: # raise a consistent TypeError if failed try: with warnings.catch_warnings(): + # TODO: warnings.catch_warnings can be removed when numpy>2.3.0 + # is the minimum version # GH#51523 - Series.astype(np.integer) doesn't show # numpy deprecation warning of np.integer # Hence enabling DeprecationWarning @@ -1676,13 +1905,14 @@ def is_all_strings(value: ArrayLike) -> bool: __all__ = [ - "classes", "DT64NS_DTYPE", + "INT64_DTYPE", + "TD64NS_DTYPE", + "classes", "ensure_float64", "ensure_python_int", "ensure_str", "infer_dtype_from_object", - "INT64_DTYPE", "is_1d_only_ea_dtype", "is_all_strings", "is_any_real_numeric_dtype", @@ -1706,7 +1936,6 @@ def is_all_strings(value: ArrayLike) -> bool: "is_float_dtype", "is_int64_dtype", "is_integer_dtype", - "is_interval", "is_interval_dtype", "is_iterator", "is_named_tuple", @@ -1728,6 +1957,5 @@ def is_all_strings(value: ArrayLike) -> bool: "is_unsigned_integer_dtype", "needs_i8_conversion", "pandas_dtype", - "TD64NS_DTYPE", "validate_all_hashable", ] diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 9ec662a6cd352..dcf8cb5c78536 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -1,18 +1,17 @@ """ Utility functions related to concat. """ + from __future__ import annotations from typing import ( TYPE_CHECKING, cast, ) -import warnings import numpy as np from pandas._libs import lib -from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.astype import astype_array from pandas.core.dtypes.cast import ( @@ -41,7 +40,7 @@ ) -def _is_nonempty(x, axis) -> bool: +def _is_nonempty(x: ArrayLike, axis: AxisInt) -> bool: # filter empty arrays # 1-d dtypes always are included here if x.ndim <= axis: @@ -100,28 +99,10 @@ def concat_compat( # Creating an empty array directly is tempting, but the winnings would be # marginal given that it would still require shape & dtype calculation and # np.concatenate which has them both implemented is compiled. - orig = to_concat non_empties = [x for x in to_concat if _is_nonempty(x, axis)] - if non_empties and axis == 0 and not ea_compat_axis: - # ea_compat_axis see GH#39574 - to_concat = non_empties any_ea, kinds, target_dtype = _get_result_dtype(to_concat, non_empties) - if len(to_concat) < len(orig): - _, _, alt_dtype = _get_result_dtype(orig, non_empties) - if alt_dtype != target_dtype: - # GH#39122 - warnings.warn( - "The behavior of array concatenation with empty entries is " - "deprecated. In a future version, this will no longer exclude " - "empty items when determining the result dtype. 
" - "To retain the old behavior, exclude the empty entries before " - "the concat operation.", - FutureWarning, - stacklevel=find_stack_level(), - ) - if target_dtype is not None: to_concat = [astype_array(arr, target_dtype, copy=False) for arr in to_concat] @@ -209,6 +190,7 @@ def union_categoricals( Returns ------- Categorical + The union of categories being combined. Raises ------ @@ -220,6 +202,11 @@ def union_categoricals( ValueError Empty list of categoricals passed + See Also + -------- + CategoricalDtype : Type for categorical data with the categories and orderedness. + Categorical : Represent a categorical variable in classic R / S-plus fashion. + Notes ----- To learn more about categories, see `link @@ -278,8 +265,8 @@ def union_categoricals( containing categorical data, but note that the resulting array will always be a plain `Categorical` - >>> a = pd.Series(["b", "c"], dtype='category') - >>> b = pd.Series(["a", "b"], dtype='category') + >>> a = pd.Series(["b", "c"], dtype="category") + >>> b = pd.Series(["a", "b"], dtype="category") >>> pd.api.types.union_categoricals([a, b]) ['b', 'c', 'a', 'b'] Categories (3, object): ['b', 'c', 'a'] diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index ed5256922377a..570074e047da6 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -1,6 +1,7 @@ """ Define extension dtypes. """ + from __future__ import annotations from datetime import ( @@ -17,9 +18,11 @@ cast, ) import warnings +import zoneinfo import numpy as np -import pytz + +from pandas._config.config import get_option from pandas._libs import ( lib, @@ -45,6 +48,7 @@ from pandas._libs.tslibs.offsets import BDay from pandas.compat import pa_version_under10p1 from pandas.errors import PerformanceWarning +from pandas.util._decorators import set_module from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.base import ( @@ -62,8 +66,6 @@ is_list_like, ) -from pandas.util import capitalize_first_letter - if not pa_version_under10p1: import pyarrow as pa @@ -71,13 +73,14 @@ from collections.abc import MutableMapping from datetime import tzinfo - import pyarrow as pa # noqa: TCH004 + import pyarrow as pa # noqa: TC004 from pandas._typing import ( Dtype, DtypeObj, IntervalClosedType, Ordered, + Scalar, Self, npt, type_t, @@ -153,10 +156,16 @@ class CategoricalDtypeType(type): @register_extension_dtype +@set_module("pandas") class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): """ Type for categorical data with the categories and orderedness. + It is a dtype representation for categorical data, which allows users to define + a fixed set of values and optionally impose an ordering. This is particularly + useful for handling categorical variables efficiently, as it can significantly + reduce memory usage compared to using object dtypes. + Parameters ---------- categories : sequence, optional @@ -190,8 +199,8 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): Examples -------- - >>> t = pd.CategoricalDtype(categories=['b', 'a'], ordered=True) - >>> pd.Series(['a', 'b', 'a', 'c'], dtype=t) + >>> t = pd.CategoricalDtype(categories=["b", "a"], ordered=True) + >>> pd.Series(["a", "b", "a", "c"], dtype=t) 0 a 1 b 2 a @@ -203,7 +212,7 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): by providing an empty index. 
As follows, >>> pd.CategoricalDtype(pd.DatetimeIndex([])).categories.dtype dtype('<M8[ns]') >>> pd.CategoricalDtype._from_values_or_dtype() CategoricalDtype(categories=None, ordered=None, categories_dtype=None) >>> pd.CategoricalDtype._from_values_or_dtype( - ... categories=['a', 'b'], ordered=True + ... categories=["a", "b"], ordered=True ... ) CategoricalDtype(categories=['a', 'b'], ordered=True, categories_dtype=object) - >>> dtype1 = pd.CategoricalDtype(['a', 'b'], ordered=True) - >>> dtype2 = pd.CategoricalDtype(['x', 'y'], ordered=False) + >>> dtype1 = pd.CategoricalDtype(["a", "b"], ordered=True) + >>> dtype2 = pd.CategoricalDtype(["x", "y"], ordered=False) >>> c = pd.Categorical([0, 1], dtype=dtype1) >>> pd.CategoricalDtype._from_values_or_dtype( - ... c, ['x', 'y'], ordered=True, dtype=dtype2 + ... c, ["x", "y"], ordered=True, dtype=dtype2 ... ) Traceback (most recent call last): ... @@ -318,7 +327,7 @@ def _from_values_or_dtype( dtype = CategoricalDtype(categories, ordered) else: - raise ValueError(f"Unknown dtype {repr(dtype)}") + raise ValueError(f"Unknown dtype {dtype!r}") elif categories is not None or ordered is not None: raise ValueError( "Cannot specify `categories` or `ordered` together with `dtype`." @@ -453,7 +462,7 @@ def __eq__(self, other: object) -> bool: # Because left and right have the same length and are unique, # `indexer` not having any -1s implies that there is a # bijection between `left` and `right`. - return (indexer != -1).all() + return bool((indexer != -1).all()) # With object-dtype we need a comparison that identifies # e.g. int(2) as distinct from float(2) @@ -511,7 +520,7 @@ def _hash_categories(self) -> int: [cat_array, np.arange(len(cat_array), dtype=cat_array.dtype)] ) else: - cat_array = np.array([cat_array]) + cat_array = cat_array.reshape(1, len(cat_array)) combined_hashed = combine_hash_arrays(iter(cat_array), num_items=len(cat_array)) return np.bitwise_xor.reduce(combined_hashed) @@ -566,7 +575,7 @@ def validate_categories(categories, fastpath: bool = False) -> Index: if not fastpath and not is_list_like(categories): raise TypeError( - f"Parameter 'categories' must be list-like, was {repr(categories)}" + f"Parameter 'categories' must be list-like, was {categories!r}" ) if not isinstance(categories, ABCIndex): categories = Index._with_infer(categories, tupleize_cols=False) @@ -601,14 +610,20 @@ def update_dtype(self, dtype: str_type | CategoricalDtype) -> CategoricalDtype: return self elif not self.is_dtype(dtype): raise ValueError( - f"a CategoricalDtype must be passed to perform an update, " - f"got {repr(dtype)}" + f"a CategoricalDtype must be passed to perform an update, got {dtype!r}" ) else: # from here on, dtype is a CategoricalDtype dtype = cast(CategoricalDtype, dtype) # update categories/ordered unless they've been explicitly passed as None + if ( + isinstance(dtype, CategoricalDtype) + and dtype.categories is not None + and dtype.ordered is not None + ): + # Avoid re-validation in CategoricalDtype constructor + return dtype new_categories = ( dtype.categories if dtype.categories is not None else self.categories ) @@ -621,9 +636,13 @@ def categories(self) -> Index: """ An ``Index`` containing the unique categories allowed. + See Also + -------- + ordered : Whether the categories have an ordered relationship.
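The `update_dtype` fastpath added above short-circuits when the incoming dtype fully specifies both categories and orderedness; merging with `self` only happens for partially-specified dtypes. A quick illustration of both paths:

```python
import pandas as pd

base = pd.CategoricalDtype(["a", "b"])

# Partially specified: categories are kept from `base`, ordered is updated.
print(base.update_dtype(pd.CategoricalDtype(ordered=True)))

# Fully specified: returned as-is via the new fastpath, skipping re-validation.
print(base.update_dtype(pd.CategoricalDtype(["x", "y"], ordered=False)))
```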
+ Examples -------- - >>> cat_type = pd.CategoricalDtype(categories=['a', 'b'], ordered=True) + >>> cat_type = pd.CategoricalDtype(categories=["a", "b"], ordered=True) >>> cat_type.categories Index(['a', 'b'], dtype='object') """ @@ -634,13 +653,17 @@ def ordered(self) -> Ordered: """ Whether the categories have an ordered relationship. + See Also + -------- + categories : An Index containing the unique categories allowed. + Examples -------- - >>> cat_type = pd.CategoricalDtype(categories=['a', 'b'], ordered=True) + >>> cat_type = pd.CategoricalDtype(categories=["a", "b"], ordered=True) >>> cat_type.ordered True - >>> cat_type = pd.CategoricalDtype(categories=['a', 'b'], ordered=False) + >>> cat_type = pd.CategoricalDtype(categories=["a", "b"], ordered=False) >>> cat_type.ordered False """ @@ -670,10 +693,11 @@ def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: return None # categorical is aware of Sparse -> extract sparse subdtypes - dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes] + subtypes = (x.subtype if isinstance(x, SparseDtype) else x for x in dtypes) # extract the categories' dtype non_cat_dtypes = [ - x.categories.dtype if isinstance(x, CategoricalDtype) else x for x in dtypes + x.categories.dtype if isinstance(x, CategoricalDtype) else x + for x in subtypes ] # TODO should categorical always give an answer? from pandas.core.dtypes.cast import find_common_type @@ -688,6 +712,7 @@ def index_class(self) -> type_t[CategoricalIndex]: @register_extension_dtype +@set_module("pandas") class DatetimeTZDtype(PandasExtensionDtype): """ An ExtensionDtype for timezone-aware datetime data. @@ -697,8 +722,8 @@ class DatetimeTZDtype(PandasExtensionDtype): Parameters ---------- unit : str, default "ns" - The precision of the datetime data. Currently limited - to ``"ns"``. + The precision of the datetime data. Valid options are + ``"s"``, ``"ms"``, ``"us"``, ``"ns"``. tz : str, int, or datetime.tzinfo The timezone. @@ -716,13 +741,18 @@ class DatetimeTZDtype(PandasExtensionDtype): ZoneInfoNotFoundError When the requested timezone cannot be found. + See Also + -------- + numpy.datetime64 : Numpy data type for datetime. + datetime.datetime : Python datetime object. + Examples -------- >>> from zoneinfo import ZoneInfo - >>> pd.DatetimeTZDtype(tz=ZoneInfo('UTC')) + >>> pd.DatetimeTZDtype(tz=ZoneInfo("UTC")) datetime64[ns, UTC] - >>> pd.DatetimeTZDtype(tz=ZoneInfo('Europe/Paris')) + >>> pd.DatetimeTZDtype(tz=ZoneInfo("Europe/Paris")) datetime64[ns, Europe/Paris] """ @@ -773,7 +803,7 @@ def __init__(self, unit: str_type | DatetimeTZDtype = "ns", tz=None) -> None: tz = timezones.maybe_get_tz(tz) tz = timezones.tz_standardize(tz) elif tz is not None: - raise pytz.UnknownTimeZoneError(tz) + raise zoneinfo.ZoneInfoNotFoundError(tz) if tz is None: raise TypeError("A 'tz' is required.") @@ -792,10 +822,14 @@ def unit(self) -> str_type: """ The precision of the datetime data. + See Also + -------- + DatetimeTZDtype.tz : Retrieves the timezone. + Examples -------- >>> from zoneinfo import ZoneInfo - >>> dtype = pd.DatetimeTZDtype(tz=ZoneInfo('America/Los_Angeles')) + >>> dtype = pd.DatetimeTZDtype(tz=ZoneInfo("America/Los_Angeles")) >>> dtype.unit 'ns' """ @@ -806,10 +840,14 @@ def tz(self) -> tzinfo: """ The timezone. + See Also + -------- + DatetimeTZDtype.unit : Retrieves precision of the datetime data. 
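Per the updated `unit` documentation above, `DatetimeTZDtype` accepts any of the four supported resolutions, not only nanoseconds. A small check, assuming zoneinfo timezone data is available:

```python
from zoneinfo import ZoneInfo
import pandas as pd

# All four documented resolutions pair with a timezone:
for unit in ("s", "ms", "us", "ns"):
    print(pd.DatetimeTZDtype(unit=unit, tz=ZoneInfo("UTC")))
# datetime64[s, UTC] ... datetime64[ns, UTC]
```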
+ Examples -------- >>> from zoneinfo import ZoneInfo - >>> dtype = pd.DatetimeTZDtype(tz=ZoneInfo('America/Los_Angeles')) + >>> dtype = pd.DatetimeTZDtype(tz=ZoneInfo("America/Los_Angeles")) >>> dtype.tz zoneinfo.ZoneInfo(key='America/Los_Angeles') """ @@ -842,7 +880,7 @@ def construct_from_string(cls, string: str_type) -> DatetimeTZDtype: Examples -------- - >>> DatetimeTZDtype.construct_from_string('datetime64[ns, UTC]') + >>> DatetimeTZDtype.construct_from_string("datetime64[ns, UTC]") datetime64[ns, UTC] """ if not isinstance(string, str): @@ -858,7 +896,7 @@ def construct_from_string(cls, string: str_type) -> DatetimeTZDtype: return cls(unit=d["unit"], tz=d["tz"]) except (KeyError, TypeError, ValueError) as err: # KeyError if maybe_get_tz tries and fails to get a - # pytz timezone (actually pytz.UnknownTimeZoneError). + # zoneinfo timezone (actually zoneinfo.ZoneInfoNotFoundError). # TypeError if we pass a nonsense tz; # ValueError if we pass a unit other than "ns" raise TypeError(msg) from err @@ -919,7 +957,7 @@ def __from_arrow__(self, array: pa.Array | pa.ChunkedArray) -> DatetimeArray: else: np_arr = array.to_numpy() - return DatetimeArray._from_sequence(np_arr, dtype=self, copy=False) + return DatetimeArray._simple_new(np_arr, dtype=self) def __setstate__(self, state) -> None: # for pickle compat. __get_state__ is defined in the @@ -943,6 +981,7 @@ def index_class(self) -> type_t[DatetimeIndex]: @register_extension_dtype +@set_module("pandas") class PeriodDtype(PeriodDtypeBase, PandasExtensionDtype): """ An ExtensionDtype for Period data. @@ -962,9 +1001,17 @@ class PeriodDtype(PeriodDtypeBase, PandasExtensionDtype): ------- None + See Also + -------- + Period : Represents a single time period. + PeriodIndex : Immutable index for period data. + date_range : Return a fixed frequency DatetimeIndex. + Series : One-dimensional array with axis labels. + DataFrame : Two-dimensional, size-mutable, potentially heterogeneous tabular data. + Examples -------- - >>> pd.PeriodDtype(freq='D') + >>> pd.PeriodDtype(freq="D") period[D] >>> pd.PeriodDtype(freq=pd.offsets.MonthEnd()) @@ -1026,9 +1073,23 @@ def freq(self) -> BaseOffset: """ The frequency object of this PeriodDtype. + The `freq` property returns the `BaseOffset` object that represents the + frequency of the PeriodDtype. This frequency specifies the interval (e.g., + daily, monthly, yearly) associated with the Period type. It is essential + for operations that depend on time-based calculations within a period index + or series. + + See Also + -------- + Period : Represents a period of time. + PeriodIndex : Immutable ndarray holding ordinal values indicating + regular periods. + PeriodDtype : An ExtensionDtype for Period data. + date_range : Return a fixed frequency range of dates. 
+ Examples -------- - >>> dtype = pd.PeriodDtype(freq='D') + >>> dtype = pd.PeriodDtype(freq="D") >>> dtype.freq <Day> @@ -1058,10 +1119,8 @@ def construct_from_string(cls, string: str_type) -> PeriodDtype: possible """ if ( - isinstance(string, str) - and (string.startswith(("period[", "Period["))) - or isinstance(string, BaseOffset) - ): + isinstance(string, str) and (string.startswith(("period[", "Period["))) + ) or isinstance(string, BaseOffset): # do not parse string like U as period[U] # avoid tuple to be regarded as freq try: @@ -1087,7 +1146,7 @@ def na_value(self) -> NaTType: def __eq__(self, other: object) -> bool: if isinstance(other, str): - return other in [self.name, capitalize_first_letter(self.name)] + return other[:1].lower() + other[1:] == self.name return super().__eq__(other) @@ -1097,7 +1156,7 @@ def __ne__(self, other: object) -> bool: @classmethod def is_dtype(cls, dtype: object) -> bool: """ - Return a boolean if we if the passed type is an actual dtype that we + Return a boolean if the passed type is an actual dtype that we can match (via string or type) """ if isinstance(dtype, str): @@ -1162,6 +1221,7 @@ def index_class(self) -> type_t[PeriodIndex]: @register_extension_dtype +@set_module("pandas") class IntervalDtype(PandasExtensionDtype): """ An ExtensionDtype for Interval data. @@ -1172,6 +1232,9 @@ class IntervalDtype(PandasExtensionDtype): ---------- subtype : str, np.dtype The dtype of the Interval bounds. + closed : {'right', 'left', 'both', 'neither'}, default 'right' + Whether the interval is closed on the left-side, right-side, both or + neither. See the Notes for more detailed explanation. Attributes ---------- @@ -1181,9 +1244,13 @@ class IntervalDtype(PandasExtensionDtype): ------- None + See Also + -------- + PeriodDtype : An ExtensionDtype for Period data. + Examples -------- - >>> pd.IntervalDtype(subtype='int64', closed='both') + >>> pd.IntervalDtype(subtype="int64", closed="both") interval[int64, both] """ @@ -1281,9 +1348,13 @@ def subtype(self): """ The dtype of the Interval bounds. + See Also + -------- + IntervalDtype: An ExtensionDtype for Interval data.
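Backing up to the `PeriodDtype.__eq__` change above: the inline `other[:1].lower() + other[1:]` comparison replaces `capitalize_first_letter`, so only the leading character of the string is case-insensitive. A quick sketch of the resulting semantics:

```python
import pandas as pd

dtype = pd.PeriodDtype("D")
print(dtype == "period[D]")  # True
print(dtype == "Period[D]")  # True  -- leading char is lowercased for the check
print(dtype == "PERIOD[D]")  # False -- remaining characters must match exactly
```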
+ Examples -------- - >>> dtype = pd.IntervalDtype(subtype='int64', closed='both') + >>> dtype = pd.IntervalDtype(subtype="int64", closed="both") >>> dtype.subtype dtype('int64') """ @@ -1365,7 +1436,7 @@ def __setstate__(self, state) -> None: @classmethod def is_dtype(cls, dtype: object) -> bool: """ - Return a boolean if we if the passed type is an actual dtype that we + Return a boolean if the passed type is an actual dtype that we can match (via string or type) """ if isinstance(dtype, str): @@ -1458,7 +1529,7 @@ def __init__(self, dtype: npt.DTypeLike | NumpyEADtype | None) -> None: self._dtype = np.dtype(dtype) def __repr__(self) -> str: - return f"NumpyEADtype({repr(self.name)})" + return f"NumpyEADtype({self.name!r})" @property def numpy_dtype(self) -> np.dtype: @@ -1537,6 +1608,25 @@ class BaseMaskedDtype(ExtensionDtype): base = None type: type + _internal_fill_value: Scalar + + @property + def _truthy_value(self): + # Fill values used for 'any' + if self.kind == "f": + return 1.0 + if self.kind in "iu": + return 1 + return True + + @property + def _falsey_value(self): + # Fill values used for 'all' + if self.kind == "f": + return 0.0 + if self.kind in "iu": + return 0 + return False @property def na_value(self) -> libmissing.NAType: @@ -1608,11 +1698,15 @@ def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: @register_extension_dtype +@set_module("pandas") class SparseDtype(ExtensionDtype): """ Dtype for data stored in :class:`SparseArray`. - This dtype implements the pandas ExtensionDtype interface. + ``SparseDtype`` is used as the data type for :class:`SparseArray`, enabling + more efficient storage of data that contains a significant number of + repetitive values typically represented by a fill value. It supports any + scalar dtype as the underlying data type of the non-fill values. Parameters ---------- @@ -1620,19 +1714,20 @@ class SparseDtype(ExtensionDtype): The dtype of the underlying array storing the non-fill value values. fill_value : scalar, optional The scalar value not stored in the SparseArray. By default, this - depends on `dtype`. + depends on ``dtype``. =========== ========== dtype na_value =========== ========== float ``np.nan`` + complex ``np.nan`` int ``0`` bool ``False`` datetime64 ``pd.NaT`` timedelta64 ``pd.NaT`` =========== ========== - The default value may be overridden by specifying a `fill_value`. + The default value may be overridden by specifying a ``fill_value``. Attributes ---------- @@ -1642,6 +1737,11 @@ class SparseDtype(ExtensionDtype): ------- None + See Also + -------- + arrays.SparseArray : The array structure that uses SparseDtype + for data representation. + Examples -------- >>> ser = pd.Series([1, 0, 0], dtype=pd.SparseDtype(dtype=int, fill_value=0)) @@ -1704,17 +1804,15 @@ def __eq__(self, other: object) -> bool: if isinstance(other, type(self)): subtype = self.subtype == other.subtype - if self._is_na_fill_value: + if self._is_na_fill_value or other._is_na_fill_value: # this case is complicated by two things: # SparseDtype(float, float(nan)) == SparseDtype(float, np.nan) # SparseDtype(float, np.nan) != SparseDtype(float, pd.NaT) # i.e. we want to treat any floating-point NaN as equal, but # not a floating-point NaN and a datetime NaT. 
- fill_value = ( - other._is_na_fill_value - and isinstance(self.fill_value, type(other.fill_value)) - or isinstance(other.fill_value, type(self.fill_value)) - ) + fill_value = isinstance( + self.fill_value, type(other.fill_value) + ) or isinstance(other.fill_value, type(self.fill_value)) else: with warnings.catch_warnings(): # Ignore spurious numpy warning @@ -1763,24 +1861,18 @@ def _check_fill_value(self) -> None: val = self._fill_value if isna(val): if not is_valid_na_for_dtype(val, self.subtype): - warnings.warn( - "Allowing arbitrary scalar fill_value in SparseDtype is " - "deprecated. In a future version, the fill_value must be " - "a valid value for the SparseDtype.subtype.", - FutureWarning, - stacklevel=find_stack_level(), + raise ValueError( + # GH#53043 + "fill_value must be a valid value for the SparseDtype.subtype" ) else: dummy = np.empty(0, dtype=self.subtype) dummy = ensure_wrapped_if_datetimelike(dummy) if not can_hold_element(dummy, val): - warnings.warn( - "Allowing arbitrary scalar fill_value in SparseDtype is " - "deprecated. In a future version, the fill_value must be " - "a valid value for the SparseDtype.subtype.", - FutureWarning, - stacklevel=find_stack_level(), + raise ValueError( + # GH#53043 + "fill_value must be a valid value for the SparseDtype.subtype" ) @property @@ -1814,7 +1906,7 @@ def subtype(self): @property def name(self) -> str: - return f"Sparse[{self.subtype.name}, {repr(self.fill_value)}]" + return f"Sparse[{self.subtype.name}, {self.fill_value!r}]" def __repr__(self) -> str: return self.name @@ -2001,7 +2093,7 @@ def _subtype_with_str(self): >>> SparseDtype(object, 1)._subtype_with_str dtype('O') - >>> dtype = SparseDtype(str, '') + >>> dtype = SparseDtype(str, "") >>> dtype.subtype dtype('O') @@ -2030,7 +2122,9 @@ def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: # np.nan isn't a singleton, so we may end up with multiple # NaNs here, so we ignore the all NA case too. - if not (len(set(fill_values)) == 1 or isna(fill_values).all()): + if get_option("performance_warnings") and ( + not (len(set(fill_values)) == 1 or isna(fill_values).all()) + ): warnings.warn( "Concatenating sparse arrays with multiple fill " f"values: '{fill_values}'. Picking the first and " @@ -2038,12 +2132,15 @@ def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: PerformanceWarning, stacklevel=find_stack_level(), ) np_dtypes = (x.subtype if isinstance(x, SparseDtype) else x for x in dtypes) - return SparseDtype(np_find_common_type(*np_dtypes), fill_value=fill_value) + # error: Argument 1 to "np_find_common_type" has incompatible type + # "*Generator[Any | dtype[Any] | ExtensionDtype, None, None]"; + # expected "dtype[Any]" [arg-type] + return SparseDtype(np_find_common_type(*np_dtypes), fill_value=fill_value) # type: ignore [arg-type] @register_extension_dtype +@set_module("pandas") class ArrowDtype(StorageExtensionDtype): """ An ExtensionDtype for PyArrow data types. @@ -2074,6 +2171,10 @@ class ArrowDtype(StorageExtensionDtype): ------- ArrowDtype + See Also + -------- + DataFrame.convert_dtypes : Convert columns to the best possible dtypes. + Examples -------- >>> import pyarrow as pa @@ -2164,7 +2265,7 @@ def type(self): elif pa.types.is_null(pa_type): # TODO: None? pd.NA? pa.null?
return type(pa_type) - elif isinstance(pa_type, pa.ExtensionType): + elif isinstance(pa_type, pa.BaseExtensionType): return type(self)(pa_type.storage_type).type raise NotImplementedError(pa_type) @@ -2173,7 +2274,7 @@ def name(self) -> str: # type: ignore[override] """ A string identifying the data type. """ - return f"{str(self.pyarrow_dtype)}[{self.storage}]" + return f"{self.pyarrow_dtype!s}[{self.storage}]" @cache_readonly def numpy_dtype(self) -> np.dtype: @@ -2190,7 +2291,9 @@ def numpy_dtype(self) -> np.dtype: # This can be removed if/when pyarrow addresses it: # https://github.com/apache/arrow/issues/34462 return np.dtype(f"timedelta64[{self.pyarrow_dtype.unit}]") - if pa.types.is_string(self.pyarrow_dtype): + if pa.types.is_string(self.pyarrow_dtype) or pa.types.is_large_string( + self.pyarrow_dtype + ): # pa.string().to_pandas_dtype() = object which we don't want return np.dtype(str) try: @@ -2240,9 +2343,11 @@ def construct_from_string(cls, string: str) -> ArrowDtype: ) if not string.endswith("[pyarrow]"): raise TypeError(f"'{string}' must end with '[pyarrow]'") - if string == "string[pyarrow]": + if string in ("string[pyarrow]", "str[pyarrow]"): # Ensure Registry.find skips ArrowDtype to use StringDtype instead raise TypeError("string[pyarrow] should be constructed by StringDtype") + if pa_version_under10p1: + raise ImportError("pyarrow>=10.0.1 is required for ArrowDtype") base_type = string[:-9] # get rid of "[pyarrow]" try: @@ -2337,7 +2442,7 @@ def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: except NotImplementedError: return None - def __from_arrow__(self, array: pa.Array | pa.ChunkedArray): + def __from_arrow__(self, array: pa.Array | pa.ChunkedArray) -> ArrowExtensionArray: """ Construct IntegerArray/FloatingArray from pyarrow Array/ChunkedArray. """ diff --git a/pandas/core/dtypes/generic.py b/pandas/core/dtypes/generic.py index 9718ad600cb80..8d3d86217dedf 100644 --- a/pandas/core/dtypes/generic.py +++ b/pandas/core/dtypes/generic.py @@ -1,4 +1,5 @@ -""" define generic base classes for pandas objects """ +"""define generic base classes for pandas objects""" + from __future__ import annotations from typing import ( @@ -33,7 +34,7 @@ # define abstract base classes to enable isinstance type checking on our # objects -def create_pandas_abc_type(name, attr, comp): +def create_pandas_abc_type(name, attr, comp) -> type: def _check(inst) -> bool: return getattr(inst, attr, "_typ") in comp diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index f551716772f61..918d107f2ce6c 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -1,4 +1,4 @@ -""" basic inference routines """ +"""basic inference routines""" from __future__ import annotations @@ -29,14 +29,12 @@ is_decimal = lib.is_decimal -is_interval = lib.is_interval - is_list_like = lib.is_list_like is_iterator = lib.is_iterator -def is_number(obj) -> TypeGuard[Number | np.number]: +def is_number(obj: object) -> TypeGuard[Number | np.number]: """ Check if the object is a number. @@ -77,7 +75,7 @@ def is_number(obj) -> TypeGuard[Number | np.number]: return isinstance(obj, (Number, np.number)) -def iterable_not_string(obj) -> bool: +def iterable_not_string(obj: object) -> bool: """ Check if the object is an iterable but not a string. 
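The inference predicates in this module (`is_number`, `iterable_not_string`, and the rest) are ordinary runtime isinstance-style checks; the `obj: object` and `TypeGuard` annotations added in this diff only inform static type checkers. A minimal usage sketch of the public entry point (a standalone illustration, not part of the diff):

import numpy as np
from pandas.api.types import is_number

print(is_number(3.5))            # True
print(is_number(np.float64(1)))  # True: NumPy scalars count as numbers
print(is_number("3.5"))          # False: numeric-looking strings do not
# For a static checker, a True result narrows the argument's type to
# Number | np.number, which is the point of the TypeGuard return type.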
@@ -102,7 +100,7 @@ def iterable_not_string(obj) -> bool: return isinstance(obj, abc.Iterable) and not isinstance(obj, str) -def is_file_like(obj) -> bool: +def is_file_like(obj: object) -> bool: """ Check if the object is a file-like object. @@ -115,13 +113,24 @@ def is_file_like(obj) -> bool: Parameters ---------- - obj : The object to check + obj : object + The object to check for file-like properties. + This can be any Python object, and the function will + check if it has attributes typically associated with + file-like objects (e.g., `read`, `write`, `__iter__`). Returns ------- bool Whether `obj` has file-like properties. + See Also + -------- + api.types.is_dict_like : Check if the object is dict-like. + api.types.is_hashable : Return True if hash(obj) will succeed, False otherwise. + api.types.is_named_tuple : Check if the object is a named tuple. + api.types.is_iterator : Check if the object is an iterator. + Examples -------- >>> import io @@ -138,19 +147,30 @@ def is_file_like(obj) -> bool: return bool(hasattr(obj, "__iter__")) -def is_re(obj) -> TypeGuard[Pattern]: +def is_re(obj: object) -> TypeGuard[Pattern]: """ Check if the object is a regex pattern instance. Parameters ---------- - obj : The object to check + obj : object + The object to check for being a regex pattern. Typically, + this would be an object that you expect to be a compiled + pattern from the `re` module. Returns ------- bool Whether `obj` is a regex pattern. + See Also + -------- + api.types.is_float : Return True if given object is float. + api.types.is_iterator : Check if the object is an iterator. + api.types.is_integer : Return True if given object is integer. + api.types.is_re_compilable : Check if the object can be compiled + into a regex pattern instance. + Examples -------- >>> from pandas.api.types import is_re @@ -163,19 +183,24 @@ def is_re(obj) -> TypeGuard[Pattern]: return isinstance(obj, Pattern) -def is_re_compilable(obj) -> bool: +def is_re_compilable(obj: object) -> bool: """ Check if the object can be compiled into a regex pattern instance. Parameters ---------- obj : The object to check + The object to check if the object can be compiled into a regex pattern instance. Returns ------- bool Whether `obj` can be compiled as a regex pattern. + See Also + -------- + api.types.is_re : Check if the object is a regex pattern instance. + Examples -------- >>> from pandas.api.types import is_re_compilable @@ -185,14 +210,14 @@ def is_re_compilable(obj) -> bool: False """ try: - re.compile(obj) + re.compile(obj) # type: ignore[call-overload] except TypeError: return False else: return True -def is_array_like(obj) -> bool: +def is_array_like(obj: object) -> bool: """ Check if the object is array-like. @@ -224,7 +249,7 @@ def is_array_like(obj) -> bool: return is_list_like(obj) and hasattr(obj, "dtype") -def is_nested_list_like(obj) -> bool: +def is_nested_list_like(obj: object) -> bool: """ Check if the object is list-like, and that all of its elements are also list-like. @@ -265,24 +290,34 @@ def is_nested_list_like(obj) -> bool: return ( is_list_like(obj) and hasattr(obj, "__len__") - and len(obj) > 0 - and all(is_list_like(item) for item in obj) + # need PEP 724 to handle these typing errors + and len(obj) > 0 # pyright: ignore[reportArgumentType] + and all(is_list_like(item) for item in obj) # type: ignore[attr-defined] ) -def is_dict_like(obj) -> bool: +def is_dict_like(obj: object) -> bool: """ Check if the object is dict-like. 
Parameters ---------- - obj : The object to check + obj : object + The object to check. This can be any Python object, + and the function will determine whether it + behaves like a dictionary. Returns ------- bool Whether `obj` has dict-like properties. + See Also + -------- + api.types.is_list_like : Check if the object is list-like. + api.types.is_file_like : Check if the object is a file-like. + api.types.is_named_tuple : Check if the object is a named tuple. + Examples -------- >>> from pandas.api.types import is_dict_like @@ -303,19 +338,28 @@ def is_dict_like(obj) -> bool: ) -def is_named_tuple(obj) -> bool: +def is_named_tuple(obj: object) -> bool: """ Check if the object is a named tuple. Parameters ---------- - obj : The object to check + obj : object + The object that will be checked to determine + whether it is a named tuple. Returns ------- bool Whether `obj` is a named tuple. + See Also + -------- + api.types.is_dict_like: Check if the object is dict-like. + api.types.is_hashable: Return True if hash(obj) + will succeed, False otherwise. + api.types.is_categorical_dtype : Check if the dtype is categorical. + Examples -------- >>> from collections import namedtuple @@ -331,7 +375,7 @@ def is_named_tuple(obj) -> bool: return isinstance(obj, abc.Sequence) and hasattr(obj, "_fields") -def is_hashable(obj) -> TypeGuard[Hashable]: +def is_hashable(obj: object) -> TypeGuard[Hashable]: """ Return True if hash(obj) will succeed, False otherwise. @@ -341,9 +385,24 @@ def is_hashable(obj) -> TypeGuard[Hashable]: Distinguish between these and other types by trying the call to hash() and seeing if they raise TypeError. + Parameters + ---------- + obj : object + The object to check for hashability. Any Python object can be passed here. + Returns ------- bool + True if object can be hashed (i.e., does not raise TypeError when + passed to hash()), and False otherwise (e.g., if object is mutable + like a list or dictionary). + + See Also + -------- + api.types.is_float : Return True if given object is float. + api.types.is_iterator : Check if the object is an iterator. + api.types.is_list_like : Check if the object is list-like. + api.types.is_dict_like : Check if the object is dict-like. Examples -------- @@ -370,7 +429,7 @@ def is_hashable(obj) -> TypeGuard[Hashable]: return True -def is_sequence(obj) -> bool: +def is_sequence(obj: object) -> bool: """ Check if the object is a sequence of objects. String types are not included as sequences here. @@ -394,14 +453,16 @@ def is_sequence(obj) -> bool: False """ try: - iter(obj) # Can iterate over it. - len(obj) # Has a length associated with it. + # Can iterate over it. + iter(obj) # type: ignore[call-overload] + # Has a length associated with it. 
+ len(obj) # type: ignore[arg-type] return not isinstance(obj, (str, bytes)) except (TypeError, AttributeError): return False -def is_dataclass(item) -> bool: +def is_dataclass(item: object) -> bool: """ Checks if the object is a data-class instance @@ -425,7 +486,7 @@ def is_dataclass(item) -> bool: >>> is_dataclass(Point) False - >>> is_dataclass(Point(0,2)) + >>> is_dataclass(Point(0, 2)) True """ diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 4dc0d477f89e8..71fe0f6e4feb0 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -1,10 +1,10 @@ """ missing types & inference """ + from __future__ import annotations from decimal import Decimal -from functools import partial from typing import ( TYPE_CHECKING, overload, @@ -13,14 +13,13 @@ import numpy as np -from pandas._config import get_option - from pandas._libs import lib import pandas._libs.missing as libmissing from pandas._libs.tslibs import ( NaT, iNaT, ) +from pandas.util._decorators import set_module from pandas.core.dtypes.common import ( DT64NS_DTYPE, @@ -48,6 +47,8 @@ if TYPE_CHECKING: from re import Pattern + from pandas._libs.missing import NAType + from pandas._libs.tslibs import NaTType from pandas._typing import ( ArrayLike, DtypeObj, @@ -64,40 +65,36 @@ isposinf_scalar = libmissing.isposinf_scalar isneginf_scalar = libmissing.isneginf_scalar -nan_checker = np.isnan -INF_AS_NA = False _dtype_object = np.dtype("object") _dtype_str = np.dtype(str) @overload -def isna(obj: Scalar | Pattern) -> bool: - ... +def isna(obj: Scalar | Pattern | NAType | NaTType) -> bool: ... @overload def isna( obj: ArrayLike | Index | list, -) -> npt.NDArray[np.bool_]: - ... +) -> npt.NDArray[np.bool_]: ... @overload -def isna(obj: NDFrameT) -> NDFrameT: - ... +def isna(obj: NDFrameT) -> NDFrameT: ... # handle unions @overload -def isna(obj: NDFrameT | ArrayLike | Index | list) -> NDFrameT | npt.NDArray[np.bool_]: - ... +def isna( + obj: NDFrameT | ArrayLike | Index | list, +) -> NDFrameT | npt.NDArray[np.bool_]: ... @overload -def isna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame: - ... +def isna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame: ... +@set_module("pandas") def isna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame: """ Detect missing values for an array-like object. @@ -129,7 +126,7 @@ def isna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame: -------- Scalar arguments (including strings) result in a scalar boolean. - >>> pd.isna('dog') + >>> pd.isna("dog") False >>> pd.isna(pd.NA) @@ -150,17 +147,16 @@ def isna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame: For indexes, an ndarray of booleans is returned. - >>> index = pd.DatetimeIndex(["2017-07-05", "2017-07-06", None, - ... "2017-07-08"]) + >>> index = pd.DatetimeIndex(["2017-07-05", "2017-07-06", None, "2017-07-08"]) >>> index DatetimeIndex(['2017-07-05', '2017-07-06', 'NaT', '2017-07-08'], - dtype='datetime64[ns]', freq=None) + dtype='datetime64[s]', freq=None) >>> pd.isna(index) array([False, False, True, False]) For Series and DataFrame, the same type is returned, containing booleans. - >>> df = pd.DataFrame([['ant', 'bee', 'cat'], ['dog', None, 'fly']]) + >>> df = pd.DataFrame([["ant", "bee", "cat"], ["dog", None, "fly"]]) >>> df 0 1 2 0 ant bee cat @@ -181,84 +177,50 @@ def isna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame: isnull = isna -def _isna(obj, inf_as_na: bool = False): +def _isna(obj): """ - Detect missing values, treating None, NaN or NA as null. 
Infinite - values will also be treated as null if inf_as_na is True. + Detect missing values, treating None, NaN or NA as null. Parameters ---------- obj: ndarray or object value Input array or scalar value. - inf_as_na: bool - Whether to treat infinity as null. Returns ------- boolean ndarray or boolean """ if is_scalar(obj): - return libmissing.checknull(obj, inf_as_na=inf_as_na) + return libmissing.checknull(obj) elif isinstance(obj, ABCMultiIndex): raise NotImplementedError("isna is not defined for MultiIndex") elif isinstance(obj, type): return False elif isinstance(obj, (np.ndarray, ABCExtensionArray)): - return _isna_array(obj, inf_as_na=inf_as_na) + return _isna_array(obj) elif isinstance(obj, ABCIndex): # Try to use cached isna, which also short-circuits for integer dtypes # and avoids materializing RangeIndex._values if not obj._can_hold_na: return obj.isna() - return _isna_array(obj._values, inf_as_na=inf_as_na) + return _isna_array(obj._values) elif isinstance(obj, ABCSeries): - result = _isna_array(obj._values, inf_as_na=inf_as_na) + result = _isna_array(obj._values) # box result = obj._constructor(result, index=obj.index, name=obj.name, copy=False) return result elif isinstance(obj, ABCDataFrame): return obj.isna() elif isinstance(obj, list): - return _isna_array(np.asarray(obj, dtype=object), inf_as_na=inf_as_na) + return _isna_array(np.asarray(obj, dtype=object)) elif hasattr(obj, "__array__"): - return _isna_array(np.asarray(obj), inf_as_na=inf_as_na) + return _isna_array(np.asarray(obj)) else: return False -def _use_inf_as_na(key) -> None: - """ - Option change callback for na/inf behaviour. - - Choose which replacement for numpy.isnan / -numpy.isfinite is used. - - Parameters - ---------- - flag: bool - True means treat None, NaN, INF, -INF as null (old way), - False means None and NaN are null, but INF, -INF are not null - (new way). - - Notes - ----- - This approach to setting global module values is discussed and - approved here: - - * https://stackoverflow.com/questions/4859217/ - programmatically-creating-variables-in-python/4859312#4859312 - """ - inf_as_na = get_option(key) - globals()["_isna"] = partial(_isna, inf_as_na=inf_as_na) - if inf_as_na: - globals()["nan_checker"] = lambda x: ~np.isfinite(x) - globals()["INF_AS_NA"] = True - else: - globals()["nan_checker"] = np.isnan - globals()["INF_AS_NA"] = False - - -def _isna_array(values: ArrayLike, inf_as_na: bool = False): +def _isna_array(values: ArrayLike) -> npt.NDArray[np.bool_] | NDFrame: """ Return an array indicating which values of the input array are NaN / NA. @@ -266,8 +228,6 @@ def _isna_array(values: ArrayLike, inf_as_na: bool = False): ---------- obj: ndarray or ExtensionArray The input array whose elements are to be checked. - inf_as_na: bool - Whether or not to treat infinite values as NA. Returns ------- @@ -275,34 +235,29 @@ def _isna_array(values: ArrayLike, inf_as_na: bool = False): Array of boolean values denoting the NA status of each element. """ dtype = values.dtype + result: npt.NDArray[np.bool_] | NDFrame if not isinstance(values, np.ndarray): # i.e. 
ExtensionArray - if inf_as_na and isinstance(dtype, CategoricalDtype): - result = libmissing.isnaobj(values.to_numpy(), inf_as_na=inf_as_na) - else: - # error: Incompatible types in assignment (expression has type - # "Union[ndarray[Any, Any], ExtensionArraySupportsAnyAll]", variable has - # type "ndarray[Any, dtype[bool_]]") - result = values.isna() # type: ignore[assignment] + # error: Incompatible types in assignment (expression has type + # "Union[ndarray[Any, Any], ExtensionArraySupportsAnyAll]", variable has + # type "ndarray[Any, dtype[bool_]]") + result = values.isna() # type: ignore[assignment] elif isinstance(values, np.rec.recarray): # GH 48526 - result = _isna_recarray_dtype(values, inf_as_na=inf_as_na) + result = _isna_recarray_dtype(values) elif is_string_or_object_np_dtype(values.dtype): - result = _isna_string_dtype(values, inf_as_na=inf_as_na) + result = _isna_string_dtype(values) elif dtype.kind in "mM": # this is the NaT pattern result = values.view("i8") == iNaT else: - if inf_as_na: - result = ~np.isfinite(values) - else: - result = np.isnan(values) + result = np.isnan(values) return result -def _isna_string_dtype(values: np.ndarray, inf_as_na: bool) -> npt.NDArray[np.bool_]: +def _isna_string_dtype(values: np.ndarray) -> npt.NDArray[np.bool_]: # Working around NumPy ticket 1542 dtype = values.dtype @@ -310,73 +265,51 @@ def _isna_string_dtype(values: np.ndarray, inf_as_na: bool) -> npt.NDArray[np.bo result = np.zeros(values.shape, dtype=bool) else: if values.ndim in {1, 2}: - result = libmissing.isnaobj(values, inf_as_na=inf_as_na) + result = libmissing.isnaobj(values) else: # 0-D, reached via e.g. mask_missing - result = libmissing.isnaobj(values.ravel(), inf_as_na=inf_as_na) + result = libmissing.isnaobj(values.ravel()) result = result.reshape(values.shape) return result -def _has_record_inf_value(record_as_array: np.ndarray) -> np.bool_: - is_inf_in_record = np.zeros(len(record_as_array), dtype=bool) - for i, value in enumerate(record_as_array): - is_element_inf = False - try: - is_element_inf = np.isinf(value) - except TypeError: - is_element_inf = False - is_inf_in_record[i] = is_element_inf - - return np.any(is_inf_in_record) - - -def _isna_recarray_dtype( - values: np.rec.recarray, inf_as_na: bool -) -> npt.NDArray[np.bool_]: +def _isna_recarray_dtype(values: np.rec.recarray) -> npt.NDArray[np.bool_]: result = np.zeros(values.shape, dtype=bool) for i, record in enumerate(values): record_as_array = np.array(record.tolist()) does_record_contain_nan = isna_all(record_as_array) - does_record_contain_inf = False - if inf_as_na: - does_record_contain_inf = bool(_has_record_inf_value(record_as_array)) - result[i] = np.any( - np.logical_or(does_record_contain_nan, does_record_contain_inf) - ) + result[i] = np.any(does_record_contain_nan) return result @overload -def notna(obj: Scalar) -> bool: - ... +def notna(obj: Scalar | Pattern | NAType | NaTType) -> bool: ... @overload def notna( obj: ArrayLike | Index | list, -) -> npt.NDArray[np.bool_]: - ... +) -> npt.NDArray[np.bool_]: ... @overload -def notna(obj: NDFrameT) -> NDFrameT: - ... +def notna(obj: NDFrameT) -> NDFrameT: ... # handle unions @overload -def notna(obj: NDFrameT | ArrayLike | Index | list) -> NDFrameT | npt.NDArray[np.bool_]: - ... +def notna( + obj: NDFrameT | ArrayLike | Index | list, +) -> NDFrameT | npt.NDArray[np.bool_]: ... @overload -def notna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame: - ... +def notna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame: ... 
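The datetimelike branch of `_isna_array` above (`values.view("i8") == iNaT`) works because NaT is stored as the minimum int64 sentinel. A small standalone sketch of that invariant, using plain NumPy with the sentinel spelled out rather than imported from pandas internals:

import numpy as np

iNaT = np.iinfo(np.int64).min  # the sentinel value pandas stores for NaT

arr = np.array(["2017-07-05", "NaT", "2017-07-08"], dtype="datetime64[ns]")
# Reinterpreting the datetime64 buffer as int64 exposes the sentinel,
# so building a NaT mask is a single vectorized integer comparison:
print(arr.view("i8") == iNaT)  # [False  True False]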
+@set_module("pandas") def notna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame: """ Detect non-missing values for an array-like object. @@ -408,7 +341,7 @@ def notna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame: -------- Scalar arguments (including strings) result in a scalar boolean. - >>> pd.notna('dog') + >>> pd.notna("dog") True >>> pd.notna(pd.NA) @@ -429,17 +362,16 @@ def notna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame: For indexes, an ndarray of booleans is returned. - >>> index = pd.DatetimeIndex(["2017-07-05", "2017-07-06", None, - ... "2017-07-08"]) + >>> index = pd.DatetimeIndex(["2017-07-05", "2017-07-06", None, "2017-07-08"]) >>> index DatetimeIndex(['2017-07-05', '2017-07-06', 'NaT', '2017-07-08'], - dtype='datetime64[ns]', freq=None) + dtype='datetime64[s]', freq=None) >>> pd.notna(index) array([ True, True, False, True]) For Series and DataFrame, the same type is returned, containing booleans. - >>> df = pd.DataFrame([['ant', 'bee', 'cat'], ['dog', None, 'fly']]) + >>> df = pd.DataFrame([["ant", "bee", "cat"], ["dog", None, "fly"]]) >>> df 0 1 2 0 ant bee cat @@ -495,14 +427,10 @@ def array_equivalent( Examples -------- - >>> array_equivalent( - ... np.array([1, 2, np.nan]), - ... np.array([1, 2, np.nan])) - True - >>> array_equivalent( - ... np.array([1, np.nan, 2]), - ... np.array([1, 2, np.nan])) - False + >>> array_equivalent(np.array([1, 2, np.nan]), np.array([1, 2, np.nan])) + np.True_ + >>> array_equivalent(np.array([1, np.nan, 2]), np.array([1, 2, np.nan])) + np.False_ """ left, right = np.asarray(left), np.asarray(right) @@ -557,11 +485,13 @@ def _array_equivalent_float(left: np.ndarray, right: np.ndarray) -> bool: return bool(((left == right) | (np.isnan(left) & np.isnan(right))).all()) -def _array_equivalent_datetimelike(left: np.ndarray, right: np.ndarray): +def _array_equivalent_datetimelike(left: np.ndarray, right: np.ndarray) -> bool: return np.array_equal(left.view("i8"), right.view("i8")) -def _array_equivalent_object(left: np.ndarray, right: np.ndarray, strict_nan: bool): +def _array_equivalent_object( + left: np.ndarray, right: np.ndarray, strict_nan: bool +) -> bool: left = ensure_object(left) right = ensure_object(right) @@ -632,7 +562,7 @@ def infer_fill_value(val): """ if not is_list_like(val): val = [val] - val = np.array(val, copy=False) + val = np.asarray(val) if val.dtype.kind in "mM": return np.array("NaT", dtype=val.dtype) elif val.dtype == object: @@ -647,6 +577,20 @@ def infer_fill_value(val): return np.nan +def construct_1d_array_from_inferred_fill_value( + value: object, length: int +) -> ArrayLike: + # Find our empty_value dtype by constructing an array + # from our value and doing a .take on it + from pandas.core.algorithms import take_nd + from pandas.core.construction import sanitize_array + from pandas.core.indexes.base import Index + + arr = sanitize_array(value, Index(range(1)), copy=False) + taker = -1 * np.ones(length, dtype=np.intp) + return take_nd(arr, taker) + + def maybe_fill(arr: np.ndarray) -> np.ndarray: """ Fill numpy.ndarray with NaN, unless we have a integer or boolean dtype. 
@@ -671,16 +615,18 @@ def na_value_for_dtype(dtype: DtypeObj, compat: bool = True): Examples -------- - >>> na_value_for_dtype(np.dtype('int64')) + >>> na_value_for_dtype(np.dtype("int64")) 0 - >>> na_value_for_dtype(np.dtype('int64'), compat=False) + >>> na_value_for_dtype(np.dtype("int64"), compat=False) + nan + >>> na_value_for_dtype(np.dtype("float64")) nan - >>> na_value_for_dtype(np.dtype('float64')) + >>> na_value_for_dtype(np.dtype("complex128")) nan - >>> na_value_for_dtype(np.dtype('bool')) + >>> na_value_for_dtype(np.dtype("bool")) False - >>> na_value_for_dtype(np.dtype('datetime64[ns]')) - numpy.datetime64('NaT') + >>> na_value_for_dtype(np.dtype("datetime64[ns]")) + np.datetime64('NaT') """ if isinstance(dtype, ExtensionDtype): @@ -688,7 +634,7 @@ def na_value_for_dtype(dtype: DtypeObj, compat: bool = True): elif dtype.kind in "mM": unit = np.datetime_data(dtype)[0] return dtype.type("NaT", unit) - elif dtype.kind == "f": + elif dtype.kind in "fc": return np.nan elif dtype.kind in "iu": if compat: @@ -775,7 +721,7 @@ def isna_all(arr: ArrayLike) -> bool: dtype = arr.dtype if lib.is_np_dtype(dtype, "f"): - checker = nan_checker + checker = np.isnan elif (lib.is_np_dtype(dtype, "mM")) or isinstance( dtype, (DatetimeTZDtype, PeriodDtype) @@ -787,9 +733,7 @@ def isna_all(arr: ArrayLike) -> bool: else: # error: Incompatible types in assignment (expression has type "Callable[[Any], # Any]", variable has type "ufunc") - checker = lambda x: _isna_array( # type: ignore[assignment] - x, inf_as_na=INF_AS_NA - ) + checker = _isna_array # type: ignore[assignment] return all( checker(arr[i : i + chunk_len]).all() for i in range(0, total_len, chunk_len) diff --git a/pandas/core/flags.py b/pandas/core/flags.py index aff7a15f283ba..eceb86dc61d9f 100644 --- a/pandas/core/flags.py +++ b/pandas/core/flags.py @@ -11,6 +11,10 @@ class Flags: """ Flags that apply to pandas objects. + “Flags” differ from “metadata”. Flags reflect properties of the pandas + object (the Series or DataFrame). Metadata refer to properties of the + dataset, and should be stored in DataFrame.attrs. + Parameters ---------- obj : Series or DataFrame @@ -30,6 +34,11 @@ class Flags: it is expected that every method taking or returning one or more DataFrame or Series objects will propagate ``allows_duplicate_labels``. + See Also + -------- + DataFrame.attrs : Dictionary of global attributes of this dataset. + Series.attrs : Dictionary of global attributes of this dataset. 
+ Examples -------- Attributes can be set in two ways: >>> df = pd.DataFrame() >>> df.flags <Flags(allows_duplicate_labels=True)> - >>> df.flags['allows_duplicate_labels'] = True + >>> df.flags["allows_duplicate_labels"] = True >>> df.flags <Flags(allows_duplicate_labels=True)> """ @@ -71,7 +80,7 @@ def allows_duplicate_labels(self) -> bool: Examples -------- - >>> df = pd.DataFrame({"A": [1, 2]}, index=['a', 'a']) + >>> df = pd.DataFrame({"A": [1, 2]}, index=["a", "a"]) >>> df.flags.allows_duplicate_labels True >>> df.flags.allows_duplicate_labels = False @@ -111,7 +120,7 @@ def __setitem__(self, key: str, value) -> None: def __repr__(self) -> str: return f"<Flags(allows_duplicate_labels={self.allows_duplicate_labels})>" - def __eq__(self, other) -> bool: + def __eq__(self, other: object) -> bool: if isinstance(other, type(self)): return self.allows_duplicate_labels == other.allows_duplicate_labels return False diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3e2e589440bd9..dc3fecba7fb8c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8,11 +8,13 @@ alignment and a host of useful data manipulation methods having to do with the labeling information """ + from __future__ import annotations import collections from collections import abc from collections.abc import ( + Callable, Hashable, Iterable, Iterator, @@ -20,7 +22,6 @@ Sequence, ) import functools -from inspect import signature from io import StringIO import itertools import operator @@ -29,7 +30,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, cast, overload, @@ -39,12 +39,7 @@ import numpy as np from numpy import ma -from pandas._config import ( - get_option, - using_copy_on_write, - warn_copy_on_write, -) -from pandas._config.config import _get_option +from pandas._config import get_option from pandas._libs import ( algos as libalgos, @@ -60,16 +55,17 @@ from pandas.errors import ( ChainedAssignmentError, InvalidIndexError, +) +from pandas.errors.cow import ( _chained_assignment_method_msg, _chained_assignment_msg, - _chained_assignment_warning_method_msg, - _chained_assignment_warning_msg, ) from pandas.util._decorators import ( Appender, Substitution, deprecate_nonkeyword_arguments, doc, + set_module, ) from pandas.util._exceptions import ( find_stack_level, @@ -89,7 +85,6 @@ find_common_type, infer_dtype_from_scalar, invalidate_string_dtypes, - maybe_box_native, maybe_downcast_to_dtype, ) from pandas.core.dtypes.common import ( @@ -117,6 +112,10 @@ BaseMaskedDtype, ExtensionDtype, ) +from pandas.core.dtypes.generic import ( + ABCIndex, + ABCSeries, +) from pandas.core.dtypes.missing import ( isna, notna, @@ -129,7 +128,7 @@ ops, roperator, ) -from pandas.core.accessor import CachedAccessor +from pandas.core.accessor import Accessor from pandas.core.apply import reconstruct_and_relabel_result from pandas.core.array_algos.take import take_2d_multi from pandas.core.arraylike import OpsMixin @@ -167,15 +166,11 @@ check_bool_indexer, check_dict_or_set_indexers, ) -from pandas.core.internals import ( - ArrayManager, - BlockManager, -) +from pandas.core.internals import BlockManager from pandas.core.internals.construction import ( arrays_to_mgr, dataclasses_to_dicts, dict_to_mgr, - mgr_to_mgr, ndarray_to_mgr, nested_data_to_arrays, rec_array_to_mgr, @@ -228,15 +223,17 @@ FormattersType, Frequency, FromDictOrient, + HashableT, + HashableT2, IgnoreRaise, IndexKeyFunc, IndexLabel, JoinValidate, Level, + ListLike, MergeHow, MergeValidate, MutableMappingT, - NaAction, NaPosition, NsmallestNlargestKeep, PythonFuncType, @@ -250,7 +247,7 @@ SortKind, StorageOptions, Suffixes, - ToGbqIfexist, + T, ToStataByteorder,
ToTimestampHow, UpdateJoin, @@ -262,7 +259,7 @@ from pandas.core.groupby.generic import DataFrameGroupBy from pandas.core.interchange.dataframe_protocol import DataFrame as DataFrameXchg - from pandas.core.internals import SingleDataManager + from pandas.core.internals.managers import SingleBlockManager from pandas.io.formats.style import Styler @@ -322,7 +319,8 @@ ----------%s right : DataFrame or named Series Object to merge with. -how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner' +how : {'left', 'right', 'outer', 'inner', 'cross', 'left_anti', 'right_anti'}, + default 'inner' Type of merge to be performed. * left: use only keys from left frame, similar to a SQL left outer join; @@ -335,6 +333,10 @@ join; preserve the order of the left keys. * cross: creates the cartesian product from both frames, preserves the order of the left keys. + * left_anti: use only keys from left frame that are not in right frame, similar + to SQL left anti join; preserve key order. + * right_anti: use only keys from right frame that are not in left frame, similar + to SQL right anti join; preserve key order. on : label or list Column or index level names to join on. These must be found in both DataFrames. If `on` is None and not merging on indexes then this defaults @@ -364,7 +366,7 @@ of a string to indicate that the column name from `left` or `right` should be left as-is, with no suffix. At least one of the values must not be None. -copy : bool, default True +copy : bool, default False If False, avoid copy if possible. .. note:: @@ -378,6 +380,8 @@ You can already get the future behavior and improvements through enabling copy on write ``pd.options.mode.copy_on_write = True`` + + .. deprecated:: 3.0.0 indicator : bool or str, default False If True, adds a column to the output DataFrame called "_merge" with information on the source of each row. The column can be given a different @@ -506,6 +510,7 @@ # DataFrame class +@set_module("pandas") class DataFrame(NDFrame, OpsMixin): """ Two-dimensional, size-mutable, potentially heterogeneous tabular data. @@ -535,6 +540,7 @@ class DataFrame(NDFrame, OpsMixin): will perform column selection instead. dtype : dtype, default None Data type to force. Only a single dtype is allowed. If None, infer. + If ``data`` is DataFrame then is ignored. copy : bool or None, default None Copy data from inputs. For dict data, the default of None behaves like ``copy=True``. For DataFrame @@ -560,7 +566,7 @@ class DataFrame(NDFrame, OpsMixin): -------- Constructing DataFrame from a dictionary. - >>> d = {'col1': [1, 2], 'col2': [3, 4]} + >>> d = {"col1": [1, 2], "col2": [3, 4]} >>> df = pd.DataFrame(data=d) >>> df col1 col2 @@ -584,7 +590,7 @@ class DataFrame(NDFrame, OpsMixin): Constructing DataFrame from a dictionary including Series: - >>> d = {'col1': [0, 1, 2, 3], 'col2': pd.Series([2, 3], index=[2, 3])} + >>> d = {"col1": [0, 1, 2, 3], "col2": pd.Series([2, 3], index=[2, 3])} >>> pd.DataFrame(data=d, index=[0, 1, 2, 3]) col1 col2 0 0 NaN @@ -594,8 +600,9 @@ class DataFrame(NDFrame, OpsMixin): Constructing DataFrame from numpy ndarray: - >>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), - ... columns=['a', 'b', 'c']) + >>> df2 = pd.DataFrame( + ... np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), columns=["a", "b", "c"] + ... ) >>> df2 a b c 0 1 2 3 @@ -604,10 +611,11 @@ class DataFrame(NDFrame, OpsMixin): Constructing DataFrame from a numpy ndarray that has labeled columns: - >>> data = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)], - ... 
dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")]) - >>> df3 = pd.DataFrame(data, columns=['c', 'a']) - ... + >>> data = np.array( + ... [(1, 2, 3), (4, 5, 6), (7, 8, 9)], + ... dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")], + ... ) + >>> df3 = pd.DataFrame(data, columns=["c", "a"]) >>> df3 c a 0 3 1 @@ -646,36 +654,47 @@ class DataFrame(NDFrame, OpsMixin): _HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray) _accessors: set[str] = {"sparse"} _hidden_attrs: frozenset[str] = NDFrame._hidden_attrs | frozenset([]) - _mgr: BlockManager | ArrayManager + _mgr: BlockManager # similar to __array_priority__, positions DataFrame before Series, Index, # and ExtensionArray. Should NOT be overridden by subclasses. __pandas_priority__ = 4000 @property - def _constructor(self) -> Callable[..., DataFrame]: + def _constructor(self) -> type[DataFrame]: return DataFrame - def _constructor_from_mgr(self, mgr, axes): - if self._constructor is DataFrame: - # we are pandas.DataFrame (or a subclass that doesn't override _constructor) - return DataFrame._from_mgr(mgr, axes=axes) - else: - assert axes is mgr.axes + def _constructor_from_mgr(self, mgr, axes) -> DataFrame: + df = DataFrame._from_mgr(mgr, axes=axes) + + if type(self) is DataFrame: + # This would also work `if self._constructor is DataFrame`, but + # this check is slightly faster, benefiting the most-common case. + return df + + elif type(self).__name__ == "GeoDataFrame": + # Shim until geopandas can override their _constructor_from_mgr + # bc they have different behavior for Managers than for DataFrames return self._constructor(mgr) + # We assume that the subclass __init__ knows how to handle a + # pd.DataFrame object. + return self._constructor(df) + _constructor_sliced: Callable[..., Series] = Series - def _sliced_from_mgr(self, mgr, axes) -> Series: - return Series._from_mgr(mgr, axes) + def _constructor_sliced_from_mgr(self, mgr, axes) -> Series: + ser = Series._from_mgr(mgr, axes) + ser._name = None # caller is responsible for setting real name - def _constructor_sliced_from_mgr(self, mgr, axes): - if self._constructor_sliced is Series: - ser = self._sliced_from_mgr(mgr, axes) - ser._name = None # caller is responsible for setting real name + if type(self) is DataFrame: + # This would also work `if self._constructor_sliced is Series`, but + # this check is slightly faster, benefiting the most-common case. return ser - assert axes is mgr.axes - return self._constructor_sliced(mgr) + + # We assume that the subclass __init__ knows how to handle a + # pd.Series object. + return self._constructor_sliced(ser) # ---------------------------------------------------------------------- # Constructors @@ -700,7 +719,7 @@ def __init__( # to avoid the result sharing the same Manager data = data.copy(deep=False) - if isinstance(data, (BlockManager, ArrayManager)): + if isinstance(data, BlockManager): if not allow_mgr: # GH#52419 warnings.warn( @@ -708,11 +727,10 @@ def __init__( "is deprecated and will raise in a future version. 
" "Use public APIs instead.", DeprecationWarning, - stacklevel=1, # bump to 2 once pyarrow 15.0 is released with fix + stacklevel=2, ) - if using_copy_on_write(): - data = data.copy(deep=False) + data = data.copy(deep=False) # first check if a Manager is passed without any other arguments # -> use fastpath (without checking Manager type) if index is None and columns is None and dtype is None and not copy: @@ -720,12 +738,6 @@ def __init__( NDFrame.__init__(self, data) return - manager = _get_option("mode.data_manager", silent=True) - - is_pandas_object = isinstance(data, (Series, Index, ExtensionArray)) - data_dtype = getattr(data, "dtype", None) - original_dtype = dtype - # GH47215 if isinstance(index, set): raise ValueError("index cannot be a set") @@ -736,17 +748,7 @@ def __init__( if isinstance(data, dict): # retain pre-GH#38939 default behavior copy = True - elif ( - manager == "array" - and isinstance(data, (np.ndarray, ExtensionArray)) - and data.ndim == 2 - ): - # INFO(ArrayManager) by default copy the 2D input array to get - # contiguous 1D arrays - copy = True - elif using_copy_on_write() and not isinstance( - data, (Index, DataFrame, Series) - ): + elif not isinstance(data, (Index, DataFrame, Series)): copy = True else: copy = False @@ -757,14 +759,14 @@ def __init__( dtype = dtype if dtype is not None else pandas_dtype(object) data = [] - if isinstance(data, (BlockManager, ArrayManager)): + if isinstance(data, BlockManager): mgr = self._init_mgr( data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy ) elif isinstance(data, dict): # GH#38939 de facto copy defaults to False only in non-dict cases - mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager) + mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy) elif isinstance(data, ma.MaskedArray): from numpy.ma import mrecords @@ -784,7 +786,6 @@ def __init__( columns, dtype=dtype, copy=copy, - typ=manager, ) elif isinstance(data, (np.ndarray, Series, Index, ExtensionArray)): @@ -797,20 +798,17 @@ def __init__( columns, dtype, copy, - typ=manager, ) - elif getattr(data, "name", None) is not None: + elif isinstance(data, (ABCSeries, ABCIndex)) and data.name is not None: # i.e. 
Series/Index with non-None name - _copy = copy if using_copy_on_write() else True mgr = dict_to_mgr( # error: Item "ndarray" of "Union[ndarray, Series, Index]" has no # attribute "name" - {data.name: data}, # type: ignore[union-attr] + {data.name: data}, index, columns, dtype=dtype, - typ=manager, - copy=_copy, + copy=copy, ) else: mgr = ndarray_to_mgr( @@ -819,7 +817,6 @@ def __init__( columns, dtype=dtype, copy=copy, - typ=manager, ) # For data is list-like, or Iterable (will consume into list) @@ -850,7 +847,6 @@ def __init__( columns, index, dtype=dtype, - typ=manager, ) else: mgr = ndarray_to_mgr( @@ -859,7 +855,6 @@ def __init__( columns, dtype=dtype, copy=copy, - typ=manager, ) else: mgr = dict_to_mgr( @@ -867,7 +862,6 @@ def __init__( index, columns if columns is not None else default_index(0), dtype=dtype, - typ=manager, ) # For data is scalar else: @@ -888,7 +882,7 @@ def __init__( construct_1d_arraylike_from_scalar(data, len(index), dtype) for _ in range(len(columns)) ] - mgr = arrays_to_mgr(values, columns, index, dtype=None, typ=manager) + mgr = arrays_to_mgr(values, columns, index, dtype=None) else: arr2d = construct_2d_arraylike_from_scalar( data, @@ -904,26 +898,10 @@ def __init__( columns, dtype=arr2d.dtype, copy=False, - typ=manager, ) - # ensure correct Manager type according to settings - mgr = mgr_to_mgr(mgr, typ=manager) - NDFrame.__init__(self, mgr) - if original_dtype is None and is_pandas_object and data_dtype == np.object_: - if self.dtypes.iloc[0] != data_dtype: - warnings.warn( - "Dtype inference on a pandas object " - "(Series, Index, ExtensionArray) is deprecated. The DataFrame " - "constructor will keep the original dtype in the future. " - "Call `infer_objects` on the result to get the old " - "behavior.", - FutureWarning, - stacklevel=2, - ) - # ---------------------------------------------------------------------- def __dataframe__( @@ -932,6 +910,19 @@ def __dataframe__( """ Return the dataframe interchange object implementing the interchange protocol. + .. note:: + + For new development, we highly recommend using the Arrow C Data Interface + alongside the Arrow PyCapsule Interface instead of the interchange protocol + + .. warning:: + + Due to severe implementation issues, we recommend only considering using the + interchange protocol in the following cases: + + - converting to pandas: for pandas >= 2.0.3 + - converting from pandas: for pandas >= 3.0.0 + Parameters ---------- nan_as_null : bool, default False @@ -946,6 +937,11 @@ def __dataframe__( DataFrame interchange object The object which consuming library can use to ingress the dataframe. + See Also + -------- + DataFrame.from_records : Constructor from tuples, also record arrays. + DataFrame.from_dict : From dicts of Series, arrays, or dicts. + Notes ----- Details on the interchange protocol: @@ -953,12 +949,13 @@ def __dataframe__( Examples -------- - >>> df_not_necessarily_pandas = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) + >>> df_not_necessarily_pandas = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) >>> interchange_object = df_not_necessarily_pandas.__dataframe__() >>> interchange_object.column_names() Index(['A', 'B'], dtype='object') - >>> df_pandas = (pd.api.interchange.from_dataframe - ... (interchange_object.select_columns_by_name(['A']))) + >>> df_pandas = pd.api.interchange.from_dataframe( + ... interchange_object.select_columns_by_name(["A"]) + ... 
) >>> df_pandas A 0 1 @@ -972,20 +969,32 @@ def __dataframe__( return PandasDataFrameXchg(self, allow_copy=allow_copy) - def __dataframe_consortium_standard__( - self, *, api_version: str | None = None - ) -> Any: + def __arrow_c_stream__(self, requested_schema=None): """ - Provide entry point to the Consortium DataFrame Standard API. + Export the pandas DataFrame as an Arrow C stream PyCapsule. + + This relies on pyarrow to convert the pandas DataFrame to the Arrow + format (and follows the default behaviour of ``pyarrow.Table.from_pandas`` + in its handling of the index, i.e. store the index as a column except + for RangeIndex). + This conversion is not necessarily zero-copy. + + Parameters + ---------- + requested_schema : PyCapsule, default None + The schema to which the dataframe should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. - This is developed and maintained outside of pandas. - Please report any issues to https://github.com/data-apis/dataframe-api-compat. + Returns + ------- + PyCapsule """ - dataframe_api_compat = import_optional_dependency("dataframe_api_compat") - convert_to_standard_compliant_dataframe = ( - dataframe_api_compat.pandas_standard.convert_to_standard_compliant_dataframe - ) - return convert_to_standard_compliant_dataframe(self, api_version=api_version) + pa = import_optional_dependency("pyarrow", min_version="14.0.0") + if requested_schema is not None: + requested_schema = pa.Schema._import_from_c_capsule(requested_schema) + table = pa.Table.from_pandas(self, schema=requested_schema) + return table.__arrow_c_stream__() # ---------------------------------------------------------------------- @@ -997,9 +1006,14 @@ def axes(self) -> list[Index]: It has the row axis labels and column axis labels as the only members. They are returned in that order. + See Also + -------- + DataFrame.index: The index (row labels) of the DataFrame. + DataFrame.columns: The column labels of the DataFrame. + Examples -------- - >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) >>> df.axes [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'], dtype='object')] @@ -1011,18 +1025,21 @@ def shape(self) -> tuple[int, int]: """ Return a tuple representing the dimensionality of the DataFrame. + Unlike the `len()` method, which only returns the number of rows, `shape` + provides both row and column counts, making it a more informative method for + understanding dataset size. + See Also -------- - ndarray.shape : Tuple of array dimensions. + numpy.ndarray.shape : Tuple of array dimensions. Examples -------- - >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) >>> df.shape (2, 2) - >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4], - ... 'col3': [5, 6]}) + >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4], "col3": [5, 6]}) >>> df.shape (2, 3) """ @@ -1047,21 +1064,22 @@ def _is_homogeneous_type(self) -> bool: Items with the same type but different sizes are considered different types. - >>> DataFrame({ - ... "A": np.array([1, 2], dtype=np.int32), - ... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type + >>> DataFrame( + ... { + ... "A": np.array([1, 2], dtype=np.int32), + ... "B": np.array([1, 2], dtype=np.int64), + ... } + ... 
)._is_homogeneous_type False """ # The "<" part of "<=" here is for empty DataFrame cases - return len({arr.dtype for arr in self._mgr.arrays}) <= 1 + return len({block.values.dtype for block in self._mgr.blocks}) <= 1 @property def _can_fast_transpose(self) -> bool: """ Can we transpose this DataFrame without creating any new array objects. """ - if isinstance(self._mgr, ArrayManager): - return False blocks = self._mgr.blocks if len(blocks) != 1: return False @@ -1077,13 +1095,6 @@ def _values(self) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray: """ mgr = self._mgr - if isinstance(mgr, ArrayManager): - if len(mgr.arrays) == 1 and not is_1d_only_ea_dtype(mgr.arrays[0].dtype): - # error: Item "ExtensionArray" of "Union[ndarray, ExtensionArray]" - # has no attribute "reshape" - return mgr.arrays[0].reshape(-1, 1) # type: ignore[union-attr] - return ensure_wrapped_if_datetimelike(self.values) - blocks = mgr.blocks if len(blocks) != 1: return ensure_wrapped_if_datetimelike(self.values) @@ -1194,6 +1205,7 @@ def _repr_html_(self) -> str | None: min_rows = get_option("display.min_rows") max_cols = get_option("display.max_columns") show_dimensions = get_option("display.show_dimensions") + show_floats = get_option("display.float_format") formatter = fmt.DataFrameFormatter( self, @@ -1201,7 +1213,7 @@ def _repr_html_(self) -> str | None: col_space=None, na_rep="NaN", formatters=None, - float_format=None, + float_format=show_floats, sparsify=None, justify=None, index_names=True, @@ -1223,6 +1235,7 @@ def _repr_html_(self) -> str | None: def to_string( self, buf: None = ..., + *, columns: Axes | None = ..., col_space: int | list[int] | dict[Hashable, int] | None = ..., header: bool | SequenceNotStr[str] = ..., @@ -1241,13 +1254,13 @@ def to_string( min_rows: int | None = ..., max_colwidth: int | None = ..., encoding: str | None = ..., - ) -> str: - ... + ) -> str: ... @overload def to_string( self, buf: FilePath | WriteBuffer[str], + *, columns: Axes | None = ..., col_space: int | list[int] | dict[Hashable, int] | None = ..., header: bool | SequenceNotStr[str] = ..., @@ -1266,12 +1279,8 @@ def to_string( min_rows: int | None = ..., max_colwidth: int | None = ..., encoding: str | None = ..., - ) -> None: - ... + ) -> None: ... - @deprecate_nonkeyword_arguments( - version="3.0", allowed_args=["self", "buf"], name="to_string" - ) @Substitution( header_type="bool or list of str", header="Write out the column names. If a list of columns " @@ -1286,6 +1295,7 @@ def to_string( def to_string( self, buf: FilePath | WriteBuffer[str] | None = None, + *, columns: Axes | None = None, col_space: int | list[int] | dict[Hashable, int] | None = None, header: bool | SequenceNotStr[str] = True, @@ -1324,7 +1334,7 @@ def to_string( Examples -------- - >>> d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]} + >>> d = {"col1": [1, 2, 3], "col2": [4, 5, 6]} >>> df = pd.DataFrame(d) >>> print(df.to_string()) col1 col2 @@ -1367,7 +1377,7 @@ def _get_values_for_csv( decimal: str, na_rep: str, quoting, # int csv.QUOTE_FOO from stdlib - ) -> Self: + ) -> DataFrame: # helper used by to_csv mgr = self._mgr.get_values_for_csv( float_format=float_format, @@ -1394,19 +1404,22 @@ def style(self) -> Styler: Examples -------- - >>> df = pd.DataFrame({'A': [1, 2, 3]}) + >>> df = pd.DataFrame({"A": [1, 2, 3]}) >>> df.style # doctest: +SKIP Please see `Table Visualization <../../user_guide/style.ipynb>`_ for more examples. """ + # Raise AttributeError so that inspect works even if jinja2 is not installed. 
+ has_jinja2 = import_optional_dependency("jinja2", errors="ignore") + if not has_jinja2: + raise AttributeError("The '.style' accessor requires jinja2") + from pandas.io.formats.style import Styler return Styler(self) - _shared_docs[ - "items" - ] = r""" + _shared_docs["items"] = r""" Iterate over (column name, Series) pairs. Iterates over the DataFrame columns, returning a tuple with @@ -1456,12 +1469,8 @@ def style(self) -> Styler: @Appender(_shared_docs["items"]) def items(self) -> Iterable[tuple[Hashable, Series]]: - if self.columns.is_unique and hasattr(self, "_item_cache"): - for k in self.columns: - yield k, self._get_item_cache(k) - else: - for i, k in enumerate(self.columns): - yield k, self._ixs(i, axis=1) + for i, k in enumerate(self.columns): + yield k, self._ixs(i, axis=1) def iterrows(self) -> Iterable[tuple[Hashable, Series]]: """ @@ -1497,24 +1506,23 @@ def iterrows(self) -> Iterable[tuple[Hashable, Series]]: Examples -------- - >>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float']) + >>> df = pd.DataFrame([[1, 1.5]], columns=["int", "float"]) >>> row = next(df.iterrows())[1] >>> row int 1.0 float 1.5 Name: 0, dtype: float64 - >>> print(row['int'].dtype) + >>> print(row["int"].dtype) float64 - >>> print(df['int'].dtype) + >>> print(df["int"].dtype) int64 """ columns = self.columns klass = self._constructor_sliced - using_cow = using_copy_on_write() for k, v in zip(self.index, self.values): s = klass(v, index=columns, name=k).__finalize__(self) - if using_cow and self._mgr.is_single_block: - s._mgr.add_references(self._mgr) # type: ignore[arg-type] + if self._mgr.is_single_block: + s._mgr.add_references(self._mgr) yield k, s def itertuples( @@ -1551,15 +1559,15 @@ def itertuples( Examples -------- - >>> df = pd.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]}, - ... index=['dog', 'hawk']) + >>> df = pd.DataFrame( + ... {"num_legs": [4, 2], "num_wings": [0, 2]}, index=["dog", "hawk"] + ... ) >>> df num_legs num_wings dog 4 0 hawk 2 2 >>> for row in df.itertuples(): ... print(row) - ... Pandas(Index='dog', num_legs=4, num_wings=0) Pandas(Index='hawk', num_legs=2, num_wings=2) @@ -1568,16 +1576,14 @@ def itertuples( >>> for row in df.itertuples(index=False): ... print(row) - ... Pandas(num_legs=4, num_wings=0) Pandas(num_legs=2, num_wings=2) With the `name` parameter set we set a custom name for the yielded namedtuples: - >>> for row in df.itertuples(name='Animal'): + >>> for row in df.itertuples(name="Animal"): ... print(row) - ... Animal(Index='dog', num_legs=4, num_wings=0) Animal(Index='hawk', num_legs=2, num_wings=2) """ @@ -1608,12 +1614,10 @@ def __len__(self) -> int: return len(self.index) @overload - def dot(self, other: Series) -> Series: - ... + def dot(self, other: Series) -> Series: ... @overload - def dot(self, other: DataFrame | Index | ArrayLike) -> DataFrame: - ... + def dot(self, other: DataFrame | Index | ArrayLike) -> DataFrame: ... 
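The `iterrows` docstring above warns that each row is materialized as a Series, so mixed dtypes are upcast within the row, while the `itertuples` examples show namedtuples that keep per-column values. A quick comparison sketch:

import pandas as pd

df = pd.DataFrame({"int": [1], "float": [1.5]})

row = next(df.iterrows())[1]            # Series: one shared dtype per row
print(row["int"])                       # 1.0 -- upcast to float64

tup = next(df.itertuples(index=False))  # namedtuple: dtypes preserved
print(tup.int)                          # 1 -- still an integer value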
def dot(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series: """ @@ -1697,8 +1701,8 @@ def dot(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series: if len(common) > len(self.columns) or len(common) > len(other.index): raise ValueError("matrices are not aligned") - left = self.reindex(columns=common, copy=False) - right = other.reindex(index=common, copy=False) + left = self.reindex(columns=common) + right = other.reindex(index=common) lvals = left.values rvals = right._values else: @@ -1734,12 +1738,10 @@ def dot(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series: raise TypeError(f"unsupported type: {type(other)}") @overload - def __matmul__(self, other: Series) -> Series: - ... + def __matmul__(self, other: Series) -> Series: ... @overload - def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series: - ... + def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series: ... def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series: """ @@ -1812,7 +1814,7 @@ def from_dict( -------- By default the keys of the dict become the DataFrame columns: - >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']} + >>> data = {"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]} >>> pd.DataFrame.from_dict(data) col_1 col_2 0 3 a @@ -1823,8 +1825,8 @@ def from_dict( Specify ``orient='index'`` to create the DataFrame using dictionary keys as rows: - >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']} - >>> pd.DataFrame.from_dict(data, orient='index') + >>> data = {"row_1": [3, 2, 1, 0], "row_2": ["a", "b", "c", "d"]} + >>> pd.DataFrame.from_dict(data, orient="index") 0 1 2 3 row_1 3 2 1 0 row_2 a b c d @@ -1832,8 +1834,7 @@ def from_dict( When using the 'index' orientation, the column names can be specified manually: - >>> pd.DataFrame.from_dict(data, orient='index', - ... columns=['A', 'B', 'C', 'D']) + >>> pd.DataFrame.from_dict(data, orient="index", columns=["A", "B", "C", "D"]) A B C D row_1 3 2 1 0 row_2 a b c d @@ -1841,19 +1842,21 @@ def from_dict( Specify ``orient='tight'`` to create the DataFrame using a 'tight' format: - >>> data = {'index': [('a', 'b'), ('a', 'c')], - ... 'columns': [('x', 1), ('y', 2)], - ... 'data': [[1, 3], [2, 4]], - ... 'index_names': ['n1', 'n2'], - ... 'column_names': ['z1', 'z2']} - >>> pd.DataFrame.from_dict(data, orient='tight') + >>> data = { + ... "index": [("a", "b"), ("a", "c")], + ... "columns": [("x", 1), ("y", 2)], + ... "data": [[1, 3], [2, 4]], + ... "index_names": ["n1", "n2"], + ... "column_names": ["z1", "z2"], + ... } + >>> pd.DataFrame.from_dict(data, orient="tight") z1 x y z2 1 2 n1 n2 a b 1 3 c 2 4 """ - index = None + index: list | Index | None = None orient = orient.lower() # type: ignore[assignment] if orient == "index": if len(data) > 0: @@ -1879,7 +1882,7 @@ def from_dict( else: realdata = data["data"] - def create_index(indexlist, namelist): + def create_index(indexlist, namelist) -> Index: index: Index if len(namelist) > 1: index = MultiIndex.from_tuples(indexlist, names=namelist) @@ -1922,6 +1925,7 @@ def to_numpy( Returns ------- numpy.ndarray + The NumPy array representing the values in the DataFrame. See Also -------- @@ -1944,7 +1948,7 @@ def to_numpy( For a mix of numeric and non-numeric types, the output array will have object dtype. 
- >>> df['C'] = pd.date_range('2000', periods=2) + >>> df["C"] = pd.date_range("2000", periods=2) >>> df.to_numpy() array([[1, 3.0, Timestamp('2000-01-01 00:00:00')], [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object) @@ -1953,32 +1957,10 @@ def to_numpy( dtype = np.dtype(dtype) result = self._mgr.as_array(dtype=dtype, copy=copy, na_value=na_value) if result.dtype is not dtype: - result = np.array(result, dtype=dtype, copy=False) + result = np.asarray(result, dtype=dtype) return result - def _create_data_for_split_and_tight_to_dict( - self, are_all_object_dtype_cols: bool, object_dtype_indices: list[int] - ) -> list: - """ - Simple helper method to create data for to ``to_dict(orient="split")`` and - ``to_dict(orient="tight")`` to create the main output data - """ - if are_all_object_dtype_cols: - data = [ - list(map(maybe_box_native, t)) - for t in self.itertuples(index=False, name=None) - ] - else: - data = [list(t) for t in self.itertuples(index=False, name=None)] - if object_dtype_indices: - # If we have object_dtype_cols, apply maybe_box_naive after list - # comprehension for perf - for row in data: - for i in object_dtype_indices: - row[i] = maybe_box_native(row[i]) - return data - @overload def to_dict( self, @@ -1986,8 +1968,7 @@ def to_dict( *, into: type[MutableMappingT] | MutableMappingT, index: bool = ..., - ) -> MutableMappingT: - ... + ) -> MutableMappingT: ... @overload def to_dict( @@ -1996,8 +1977,7 @@ def to_dict( *, into: type[MutableMappingT] | MutableMappingT, index: bool = ..., - ) -> list[MutableMappingT]: - ... + ) -> list[MutableMappingT]: ... @overload def to_dict( @@ -2006,8 +1986,7 @@ def to_dict( *, into: type[dict] = ..., index: bool = ..., - ) -> dict: - ... + ) -> dict: ... @overload def to_dict( @@ -2016,21 +1995,17 @@ def to_dict( *, into: type[dict] = ..., index: bool = ..., - ) -> list[dict]: - ... + ) -> list[dict]: ... # error: Incompatible default for argument "into" (default has type "type # [dict[Any, Any]]", argument has type "type[MutableMappingT] | MutableMappingT") - @deprecate_nonkeyword_arguments( - version="3.0", allowed_args=["self", "orient"], name="to_dict" - ) def to_dict( self, orient: Literal[ "dict", "list", "series", "split", "tight", "records", "index" ] = "dict", - into: type[MutableMappingT] - | MutableMappingT = dict, # type: ignore[assignment] + *, + into: type[MutableMappingT] | MutableMappingT = dict, # type: ignore[assignment] index: bool = True, ) -> MutableMappingT | list[MutableMappingT]: """ @@ -2068,7 +2043,9 @@ def to_dict( index : bool, default True Whether to include the index item (and index_names item if `orient` is 'tight') in the returned dictionary. Can only be ``False`` - when `orient` is 'split' or 'tight'. + when `orient` is 'split' or 'tight'. Note that when `orient` is + 'records', this parameter does not take effect (index item always + not included). .. versionadded:: 2.0.0 @@ -2086,9 +2063,9 @@ def to_dict( Examples -------- - >>> df = pd.DataFrame({'col1': [1, 2], - ... 'col2': [0.5, 0.75]}, - ... index=['row1', 'row2']) + >>> df = pd.DataFrame( + ... {"col1": [1, 2], "col2": [0.5, 0.75]}, index=["row1", "row2"] + ... ) >>> df col1 col2 row1 1 0.50 @@ -2098,7 +2075,7 @@ def to_dict( You can specify the return orientation. 
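Before the orientation examples resume: since the hunk above removes ``deprecate_nonkeyword_arguments`` and inserts a bare ``*`` into the ``to_dict`` signature, ``into`` and ``index`` are keyword-only from this change on. A quick sketch (``OrderedDict`` chosen only for illustration):

```python
from collections import OrderedDict

import pandas as pd

df = pd.DataFrame({"col1": [1, 2], "col2": [0.5, 0.75]}, index=["row1", "row2"])

# Keyword form is unchanged...
print(df.to_dict("index", into=OrderedDict))

# ...but the old positional form, df.to_dict("index", OrderedDict),
# now raises TypeError instead of a deprecation warning.
```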
- >>> df.to_dict('series') + >>> df.to_dict("series") {'col1': row1 1 row2 2 Name: col1, dtype: int64, @@ -2106,17 +2083,17 @@ def to_dict( row2 0.75 Name: col2, dtype: float64} - >>> df.to_dict('split') + >>> df.to_dict("split") {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'], 'data': [[1, 0.5], [2, 0.75]]} - >>> df.to_dict('records') + >>> df.to_dict("records") [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}] - >>> df.to_dict('index') + >>> df.to_dict("index") {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}} - >>> df.to_dict('tight') + >>> df.to_dict("tight") {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'], 'data': [[1, 0.5], [2, 0.75]], 'index_names': [None], 'column_names': [None]} @@ -2130,7 +2107,7 @@ def to_dict( If you want a `defaultdict`, you need to initialize it: >>> dd = defaultdict(list) - >>> df.to_dict('records', into=dd) + >>> df.to_dict("records", into=dd) [defaultdict(, {'col1': 1, 'col2': 0.5}), defaultdict(, {'col1': 2, 'col2': 0.75})] """ @@ -2138,144 +2115,6 @@ def to_dict( return to_dict(self, orient, into=into, index=index) - @deprecate_nonkeyword_arguments( - version="3.0", allowed_args=["self", "destination_table"], name="to_gbq" - ) - def to_gbq( - self, - destination_table: str, - project_id: str | None = None, - chunksize: int | None = None, - reauth: bool = False, - if_exists: ToGbqIfexist = "fail", - auth_local_webserver: bool = True, - table_schema: list[dict[str, str]] | None = None, - location: str | None = None, - progress_bar: bool = True, - credentials=None, - ) -> None: - """ - Write a DataFrame to a Google BigQuery table. - - .. deprecated:: 2.2.0 - - Please use ``pandas_gbq.to_gbq`` instead. - - This function requires the `pandas-gbq package - `__. - - See the `How to authenticate with Google BigQuery - `__ - guide for authentication instructions. - - Parameters - ---------- - destination_table : str - Name of table to be written, in the form ``dataset.tablename``. - project_id : str, optional - Google BigQuery Account project ID. Optional when available from - the environment. - chunksize : int, optional - Number of rows to be inserted in each chunk from the dataframe. - Set to ``None`` to load the whole dataframe at once. - reauth : bool, default False - Force Google BigQuery to re-authenticate the user. This is useful - if multiple accounts are used. - if_exists : str, default 'fail' - Behavior when the destination table exists. Value can be one of: - - ``'fail'`` - If table exists raise pandas_gbq.gbq.TableCreationError. - ``'replace'`` - If table exists, drop it, recreate it, and insert data. - ``'append'`` - If table exists, insert data. Create if does not exist. - auth_local_webserver : bool, default True - Use the `local webserver flow`_ instead of the `console flow`_ - when getting user credentials. - - .. _local webserver flow: - https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server - .. _console flow: - https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console - - *New in version 0.2.0 of pandas-gbq*. - - .. versionchanged:: 1.5.0 - Default value is changed to ``True``. Google has deprecated the - ``auth_local_webserver = False`` `"out of band" (copy-paste) - flow - `_. - table_schema : list of dicts, optional - List of BigQuery table fields to which according DataFrame - columns conform to, e.g. 
``[{'name': 'col1', 'type': - 'STRING'},...]``. If schema is not provided, it will be - generated according to dtypes of DataFrame columns. See - BigQuery API documentation on available names of a field. - - *New in version 0.3.1 of pandas-gbq*. - location : str, optional - Location where the load job should run. See the `BigQuery locations - documentation - `__ for a - list of available locations. The location must match that of the - target dataset. - - *New in version 0.5.0 of pandas-gbq*. - progress_bar : bool, default True - Use the library `tqdm` to show the progress bar for the upload, - chunk by chunk. - - *New in version 0.5.0 of pandas-gbq*. - credentials : google.auth.credentials.Credentials, optional - Credentials for accessing Google APIs. Use this parameter to - override default credentials, such as to use Compute Engine - :class:`google.auth.compute_engine.Credentials` or Service - Account :class:`google.oauth2.service_account.Credentials` - directly. - - *New in version 0.8.0 of pandas-gbq*. - - See Also - -------- - pandas_gbq.to_gbq : This function in the pandas-gbq library. - read_gbq : Read a DataFrame from Google BigQuery. - - Examples - -------- - Example taken from `Google BigQuery documentation - `_ - - >>> project_id = "my-project" - >>> table_id = 'my_dataset.my_table' - >>> df = pd.DataFrame({ - ... "my_string": ["a", "b", "c"], - ... "my_int64": [1, 2, 3], - ... "my_float64": [4.0, 5.0, 6.0], - ... "my_bool1": [True, False, True], - ... "my_bool2": [False, True, False], - ... "my_dates": pd.date_range("now", periods=3), - ... } - ... ) - - >>> df.to_gbq(table_id, project_id=project_id) # doctest: +SKIP - """ - from pandas.io import gbq - - gbq.to_gbq( - self, - destination_table, - project_id=project_id, - chunksize=chunksize, - reauth=reauth, - if_exists=if_exists, - auth_local_webserver=auth_local_webserver, - table_schema=table_schema, - location=location, - progress_bar=progress_bar, - credentials=credentials, - ) - @classmethod def from_records( cls, @@ -2289,16 +2128,13 @@ def from_records( """ Convert structured or record ndarray to DataFrame. - Creates a DataFrame object from a structured ndarray, sequence of - tuples or dicts, or DataFrame. + Creates a DataFrame object from a structured ndarray, or sequence of + tuples or dicts. Parameters ---------- - data : structured ndarray, sequence of tuples or dicts, or DataFrame + data : structured ndarray, sequence of tuples or dicts Structured input data. - - .. deprecated:: 2.1.0 - Passing a DataFrame is deprecated. index : str, list of fields, array-like Field of array to use as the index, alternately a specific set of input labels to use. @@ -2307,9 +2143,10 @@ def from_records( columns : sequence, default None Column names to use. If the passed data do not have names associated with them, this argument provides names for the - columns. Otherwise this argument indicates the order of the columns + columns. Otherwise, this argument indicates the order of the columns in the result (any names not found in the data will become all-NA - columns). + columns) and limits the data to these columns if not all column names + are provided. coerce_float : bool, default False Attempt to convert values of non-string, non-numeric objects (like decimal.Decimal) to floating point, useful for SQL result sets. @@ -2329,8 +2166,10 @@ def from_records( -------- Data can be provided as a structured ndarray: - >>> data = np.array([(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')], - ... 
dtype=[('col_1', 'i4'), ('col_2', 'U1')]) + >>> data = np.array( + ... [(3, "a"), (2, "b"), (1, "c"), (0, "d")], + ... dtype=[("col_1", "i4"), ("col_2", "U1")], + ... ) >>> pd.DataFrame.from_records(data) col_1 col_2 0 3 a @@ -2340,10 +2179,12 @@ def from_records( Data can be provided as a list of dicts: - >>> data = [{'col_1': 3, 'col_2': 'a'}, - ... {'col_1': 2, 'col_2': 'b'}, - ... {'col_1': 1, 'col_2': 'c'}, - ... {'col_1': 0, 'col_2': 'd'}] + >>> data = [ + ... {"col_1": 3, "col_2": "a"}, + ... {"col_1": 2, "col_2": "b"}, + ... {"col_1": 1, "col_2": "c"}, + ... {"col_1": 0, "col_2": "d"}, + ... ] >>> pd.DataFrame.from_records(data) col_1 col_2 0 3 a @@ -2353,8 +2194,8 @@ def from_records( Data can be provided as a list of tuples with corresponding columns: - >>> data = [(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')] - >>> pd.DataFrame.from_records(data, columns=['col_1', 'col_2']) + >>> data = [(3, "a"), (2, "b"), (1, "c"), (0, "d")] + >>> pd.DataFrame.from_records(data, columns=["col_1", "col_2"]) col_1 col_2 0 3 a 1 2 b @@ -2362,21 +2203,10 @@ def from_records( 3 0 d """ if isinstance(data, DataFrame): - warnings.warn( - "Passing a DataFrame to DataFrame.from_records is deprecated. Use " + raise TypeError( + "Passing a DataFrame to DataFrame.from_records is not supported. Use " "set_index and/or drop to modify the DataFrame instead.", - FutureWarning, - stacklevel=find_stack_level(), ) - if columns is not None: - if is_scalar(columns): - columns = [columns] - data = data[columns] - if index is not None: - data = data.set_index(index) - if exclude is not None: - data = data.drop(columns=exclude) - return data.copy(deep=False) result_index = None @@ -2389,7 +2219,7 @@ def maybe_reorder( ) -> tuple[list[ArrayLike], Index, Index | None]: """ If our desired 'columns' do not match the data's pre-existing 'arr_columns', - we re-order our arrays. This is like a pre-emptive (cheap) reindex. + we re-order our arrays. This is like a preemptive (cheap) reindex. """ if len(arrays): length = len(arrays[0]) @@ -2493,16 +2323,17 @@ def maybe_reorder( exclude.update(index) if any(exclude): - arr_exclude = [x for x in exclude if x in arr_columns] - to_remove = [arr_columns.get_loc(col) for col in arr_exclude] + arr_exclude = (x for x in exclude if x in arr_columns) + to_remove = {arr_columns.get_loc(col) for col in arr_exclude} # pyright: ignore[reportUnhashable] arrays = [v for i, v in enumerate(arrays) if i not in to_remove] columns = columns.drop(exclude) - manager = _get_option("mode.data_manager", silent=True) - mgr = arrays_to_mgr(arrays, columns, result_index, typ=manager) - - return cls._from_mgr(mgr, axes=mgr.axes) + mgr = arrays_to_mgr(arrays, columns, result_index) + df = DataFrame._from_mgr(mgr, axes=mgr.axes) + if cls is not DataFrame: + return cls(df, copy=False) + return df def to_records( self, index: bool = True, column_dtypes=None, index_dtypes=None @@ -2545,8 +2376,7 @@ def to_records( Examples -------- - >>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]}, - ... 
index=['a', 'b']) + >>> df = pd.DataFrame({"A": [1, 2], "B": [0.5, 0.75]}, index=["a", "b"]) >>> df A B a 1 0.50 @@ -2700,7 +2530,6 @@ def _from_arrays( if dtype is not None: dtype = pandas_dtype(dtype) - manager = _get_option("mode.data_manager", silent=True) columns = ensure_index(columns) if len(columns) != len(arrays): raise ValueError("len(columns) must match len(arrays)") @@ -2710,7 +2539,6 @@ def _from_arrays( index, dtype=dtype, verify_integrity=verify_integrity, - typ=manager, ) return cls._from_mgr(mgr, axes=mgr.axes) @@ -2819,10 +2647,10 @@ def to_stata( Examples -------- - >>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon', - ... 'parrot'], - ... 'speed': [350, 18, 361, 15]}}) - >>> df.to_stata('animals.dta') # doctest: +SKIP + >>> df = pd.DataFrame( + ... [["falcon", 350], ["parrot", 18]], columns=["animal", "parrot"] + ... ) + >>> df.to_stata("animals.dta") # doctest: +SKIP """ if version not in (114, 117, 118, 119, None): raise ValueError("Only formats 114, 117, 118 and 119 are supported.") @@ -2882,6 +2710,16 @@ def to_feather(self, path: FilePath | WriteBuffer[bytes], **kwargs) -> None: This includes the `compression`, `compression_level`, `chunksize` and `version` keywords. + See Also + -------- + DataFrame.to_parquet : Write a DataFrame to the binary parquet format. + DataFrame.to_excel : Write object to an Excel sheet. + DataFrame.to_sql : Write to a sql table. + DataFrame.to_csv : Write a csv file. + DataFrame.to_json : Convert the object to a JSON string. + DataFrame.to_html : Render a DataFrame as an HTML table. + DataFrame.to_string : Convert DataFrame to a string. + Notes ----- This function writes the dataframe as a `feather file @@ -2898,14 +2736,88 @@ def to_feather(self, path: FilePath | WriteBuffer[bytes], **kwargs) -> None: to_feather(self, path, **kwargs) - @deprecate_nonkeyword_arguments( - version="3.0", allowed_args=["self", "buf"], name="to_markdown" - ) - @doc( - Series.to_markdown, - klass=_shared_doc_kwargs["klass"], - storage_options=_shared_docs["storage_options"], - examples="""Examples + @overload + def to_markdown( + self, + buf: None = ..., + *, + mode: str = ..., + index: bool = ..., + storage_options: StorageOptions | None = ..., + **kwargs, + ) -> str: ... + + @overload + def to_markdown( + self, + buf: FilePath | WriteBuffer[str], + *, + mode: str = ..., + index: bool = ..., + storage_options: StorageOptions | None = ..., + **kwargs, + ) -> None: ... + + @overload + def to_markdown( + self, + buf: FilePath | WriteBuffer[str] | None, + *, + mode: str = ..., + index: bool = ..., + storage_options: StorageOptions | None = ..., + **kwargs, + ) -> str | None: ... + + def to_markdown( + self, + buf: FilePath | WriteBuffer[str] | None = None, + *, + mode: str = "wt", + index: bool = True, + storage_options: StorageOptions | None = None, + **kwargs, + ) -> str | None: + """ + Print DataFrame in Markdown-friendly format. + + Parameters + ---------- + buf : str, Path or StringIO-like, optional, default None + Buffer to write to. If None, the output is returned as a string. + mode : str, optional + Mode in which file is opened, "wt" by default. + index : bool, optional, default True + Add index (row) labels. + + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc. For HTTP(S) URLs the key-value pairs + are forwarded to ``urllib.request.Request`` as header options. For other + URLs (e.g. 
starting with "s3://", and "gcs://") the key-value pairs are + forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more + details, and for more examples on storage options refer `here + `_. + + **kwargs + These parameters will be passed to `tabulate `_. + + Returns + ------- + str + DataFrame in Markdown-friendly format. + + See Also + -------- + DataFrame.to_html : Render DataFrame to HTML-formatted table. + DataFrame.to_latex : Render DataFrame to LaTeX-formatted table. + + Notes + ----- + Requires the `tabulate `_ package. + + Examples -------- >>> df = pd.DataFrame( ... data={"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]} @@ -2925,16 +2837,8 @@ def to_feather(self, path: FilePath | WriteBuffer[bytes], **kwargs) -> None: | 0 | elk | dog | +----+------------+------------+ | 1 | pig | quetzal | - +----+------------+------------+""", - ) - def to_markdown( - self, - buf: FilePath | WriteBuffer[str] | None = None, - mode: str = "wt", - index: bool = True, - storage_options: StorageOptions | None = None, - **kwargs, - ) -> str | None: + +----+------------+------------+ + """ if "showindex" in kwargs: raise ValueError("Pass 'index' instead of 'showindex") @@ -2954,35 +2858,33 @@ def to_markdown( def to_parquet( self, path: None = ..., + *, engine: Literal["auto", "pyarrow", "fastparquet"] = ..., compression: str | None = ..., index: bool | None = ..., partition_cols: list[str] | None = ..., storage_options: StorageOptions = ..., **kwargs, - ) -> bytes: - ... + ) -> bytes: ... @overload def to_parquet( self, path: FilePath | WriteBuffer[bytes], + *, engine: Literal["auto", "pyarrow", "fastparquet"] = ..., compression: str | None = ..., index: bool | None = ..., partition_cols: list[str] | None = ..., storage_options: StorageOptions = ..., **kwargs, - ) -> None: - ... + ) -> None: ... - @deprecate_nonkeyword_arguments( - version="3.0", allowed_args=["self", "path"], name="to_parquet" - ) @doc(storage_options=_shared_docs["storage_options"]) def to_parquet( self, path: FilePath | WriteBuffer[bytes] | None = None, + *, engine: Literal["auto", "pyarrow", "fastparquet"] = "auto", compression: str | None = "snappy", index: bool | None = None, @@ -3034,6 +2936,9 @@ def to_parquet( Returns ------- bytes if no path argument is provided else None + Returns the DataFrame converted to the binary parquet format as bytes if no + path argument. Returns None and writes the DataFrame to the specified + location in the Parquet format if the path argument is provided. See Also -------- @@ -3045,16 +2950,22 @@ def to_parquet( Notes ----- - This function requires either the `fastparquet - `_ or `pyarrow - `_ library. + * This function requires either the `fastparquet + `_ or `pyarrow + `_ library. + * When saving a DataFrame with categorical columns to parquet, + the file size may increase due to the inclusion of all possible + categories, not just those present in the data. This behavior + is expected and consistent with pandas' handling of categorical data. + To manage file size and ensure a more predictable roundtrip process, + consider using :meth:`Categorical.remove_unused_categories` on the + DataFrame before saving. Examples -------- - >>> df = pd.DataFrame(data={{'col1': [1, 2], 'col2': [3, 4]}}) - >>> df.to_parquet('df.parquet.gzip', - ... 
compression='gzip') # doctest: +SKIP - >>> pd.read_parquet('df.parquet.gzip') # doctest: +SKIP + >>> df = pd.DataFrame(data={{"col1": [1, 2], "col2": [3, 4]}}) + >>> df.to_parquet("df.parquet.gzip", compression="gzip") # doctest: +SKIP + >>> pd.read_parquet("df.parquet.gzip") # doctest: +SKIP col1 col2 0 1 3 1 2 4 @@ -3082,6 +2993,36 @@ def to_parquet( **kwargs, ) + @overload + def to_orc( + self, + path: None = ..., + *, + engine: Literal["pyarrow"] = ..., + index: bool | None = ..., + engine_kwargs: dict[str, Any] | None = ..., + ) -> bytes: ... + + @overload + def to_orc( + self, + path: FilePath | WriteBuffer[bytes], + *, + engine: Literal["pyarrow"] = ..., + index: bool | None = ..., + engine_kwargs: dict[str, Any] | None = ..., + ) -> None: ... + + @overload + def to_orc( + self, + path: FilePath | WriteBuffer[bytes] | None, + *, + engine: Literal["pyarrow"] = ..., + index: bool | None = ..., + engine_kwargs: dict[str, Any] | None = ..., + ) -> bytes | None: ... + def to_orc( self, path: FilePath | WriteBuffer[bytes] | None = None, @@ -3091,7 +3032,7 @@ def to_orc( engine_kwargs: dict[str, Any] | None = None, ) -> bytes | None: """ - Write a DataFrame to the ORC format. + Write a DataFrame to the Optimized Row Columnar (ORC) format. .. versionadded:: 1.5.0 @@ -3118,7 +3059,8 @@ def to_orc( Returns ------- - bytes if no path argument is provided else None + bytes if no ``path`` argument is provided else None + Bytes object with DataFrame data if ``path`` is not specified else None. Raises ------ @@ -3138,6 +3080,8 @@ def to_orc( Notes ----- + * Find more information on ORC + `here `__. * Before using this function you should read the :ref:`user guide about ORC ` and :ref:`install optional dependencies `. * This function requires `pyarrow `_ @@ -3149,9 +3093,9 @@ def to_orc( Examples -------- - >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [4, 3]}) - >>> df.to_orc('df.orc') # doctest: +SKIP - >>> pd.read_orc('df.orc') # doctest: +SKIP + >>> df = pd.DataFrame(data={"col1": [1, 2], "col2": [4, 3]}) + >>> df.to_orc("df.orc") # doctest: +SKIP + >>> pd.read_orc("df.orc") # doctest: +SKIP col1 col2 0 1 4 1 2 3 @@ -3174,6 +3118,7 @@ def to_orc( def to_html( self, buf: FilePath | WriteBuffer[str], + *, columns: Axes | None = ..., col_space: ColspaceArgType | None = ..., header: bool = ..., @@ -3196,13 +3141,13 @@ def to_html( table_id: str | None = ..., render_links: bool = ..., encoding: str | None = ..., - ) -> None: - ... + ) -> None: ... @overload def to_html( self, buf: None = ..., + *, columns: Axes | None = ..., col_space: ColspaceArgType | None = ..., header: bool = ..., @@ -3225,12 +3170,8 @@ def to_html( table_id: str | None = ..., render_links: bool = ..., encoding: str | None = ..., - ) -> str: - ... + ) -> str: ... - @deprecate_nonkeyword_arguments( - version="3.0", allowed_args=["self", "buf"], name="to_html" - ) @Substitution( header_type="bool", header="Whether to print column labels, default True", @@ -3242,6 +3183,7 @@ def to_html( def to_html( self, buf: FilePath | WriteBuffer[str] | None = None, + *, columns: Axes | None = None, col_space: ColspaceArgType | None = None, header: bool = True, @@ -3276,9 +3218,13 @@ def to_html( Convert the characters <, >, and & to HTML-safe sequences. notebook : {True, False}, default False Whether the generated HTML is for IPython Notebook. - border : int - A ``border=border`` attribute is included in the opening - `` tag. Default ``pd.options.display.html.border``. 
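On the clarified ``to_parquet`` return-type note above (bytes when ``path`` is omitted, None otherwise): a small round-trip sketch, assuming pyarrow or fastparquet is installed as the Notes require:

```python
import io

import pandas as pd

df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})

# With no path, to_parquet returns the serialized file as bytes
payload = df.to_parquet(path=None)
assert pd.read_parquet(io.BytesIO(payload)).equals(df)
```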
+ border : int or bool + When an integer value is provided, it sets the border attribute in + the opening tag, specifying the thickness of the border. + If ``False`` or ``0`` is passed, the border attribute will not + be present in the ``<table>`` tag. + The default value for this parameter is governed by + ``pd.options.display.html.border``. table_id : str, optional A css id is included in the opening `<table>` tag if specified. render_links : bool, default False @@ -3292,7 +3238,7 @@ def to_html( Examples -------- - >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [4, 3]}) + >>> df = pd.DataFrame(data={"col1": [1, 2], "col2": [4, 3]}) >>> html_string = '''
    ... ... @@ -3369,8 +3315,7 @@ def to_xml( stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = ..., compression: CompressionOptions = ..., storage_options: StorageOptions | None = ..., - ) -> str: - ... + ) -> str: ... @overload def to_xml( @@ -3392,12 +3337,8 @@ def to_xml( stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = ..., compression: CompressionOptions = ..., storage_options: StorageOptions | None = ..., - ) -> None: - ... + ) -> None: ... - @deprecate_nonkeyword_arguments( - version="3.0", allowed_args=["self", "path_or_buffer"], name="to_xml" - ) @doc( storage_options=_shared_docs["storage_options"], compression_options=_shared_docs["compression_options"] % "path_or_buffer", @@ -3405,6 +3346,7 @@ def to_xml( def to_xml( self, path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None, + *, index: bool = True, root_name: str | None = "data", row_name: str | None = "row", @@ -3497,9 +3439,10 @@ def to_xml( Examples -------- - >>> df = pd.DataFrame({{'shape': ['square', 'circle', 'triangle'], - ... 'degrees': [360, 360, 180], - ... 'sides': [4, np.nan, 3]}}) + >>> df = pd.DataFrame( + ... [["square", 360, 4], ["circle", 360, np.nan], ["triangle", 180, 3]], + ... columns=["shape", "degrees", "sides"], + ... ) >>> df.to_xml() # doctest: +SKIP @@ -3524,9 +3467,9 @@ def to_xml( - >>> df.to_xml(attr_cols=[ - ... 'index', 'shape', 'degrees', 'sides' - ... ]) # doctest: +SKIP + >>> df.to_xml( + ... attr_cols=["index", "shape", "degrees", "sides"] + ... ) # doctest: +SKIP @@ -3534,8 +3477,9 @@ def to_xml( - >>> df.to_xml(namespaces={{"doc": "https://example.com"}}, - ... prefix="doc") # doctest: +SKIP + >>> df.to_xml( + ... namespaces={{"doc": "https://example.com"}}, prefix="doc" + ... ) # doctest: +SKIP @@ -3667,9 +3611,8 @@ def memory_usage(self, index: bool = True, deep: bool = False) -> Series: Examples -------- - >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool'] - >>> data = dict([(t, np.ones(shape=5000, dtype=int).astype(t)) - ... for t in dtypes]) + >>> dtypes = ["int64", "float64", "complex128", "object", "bool"] + >>> data = dict([(t, np.ones(shape=5000, dtype=int).astype(t)) for t in dtypes]) >>> df = pd.DataFrame(data) >>> df.head() int64 float64 complex128 object bool @@ -3710,8 +3653,8 @@ def memory_usage(self, index: bool = True, deep: bool = False) -> Series: Use a Categorical for efficient storage of an object-dtype column with many repeated values. - >>> df['object'].astype('category').memory_usage(deep=True) - 5244 + >>> df["object"].astype("category").memory_usage(deep=True) + 5136 """ result = self._constructor_sliced( [c.memory_usage(index=False, deep=deep) for col, c in self.items()], @@ -3725,7 +3668,11 @@ def memory_usage(self, index: bool = True, deep: bool = False) -> Series: result = index_memory_usage._append(result) return result - def transpose(self, *args, copy: bool = False) -> DataFrame: + def transpose( + self, + *args, + copy: bool | lib.NoDefault = lib.no_default, + ) -> DataFrame: """ Transpose index and columns. @@ -3756,6 +3703,8 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: You can already get the future behavior and improvements through enabling copy on write ``pd.options.mode.copy_on_write = True`` + .. 
deprecated:: 3.0.0 + Returns ------- DataFrame @@ -3775,7 +3724,7 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: -------- **Square DataFrame with homogeneous dtype** - >>> d1 = {'col1': [1, 2], 'col2': [3, 4]} + >>> d1 = {"col1": [1, 2], "col2": [3, 4]} >>> df1 = pd.DataFrame(data=d1) >>> df1 col1 col2 @@ -3802,10 +3751,12 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: **Non-square DataFrame with mixed dtypes** - >>> d2 = {'name': ['Alice', 'Bob'], - ... 'score': [9.5, 8], - ... 'employed': [False, True], - ... 'kids': [0, 0]} + >>> d2 = { + ... "name": ["Alice", "Bob"], + ... "score": [9.5, 8], + ... "employed": [False, True], + ... "kids": [0, 0], + ... } >>> df2 = pd.DataFrame(data=d2) >>> df2 name score employed kids @@ -3834,16 +3785,15 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: 1 object dtype: object """ + self._check_copy_deprecation(copy) nv.validate_transpose(args, {}) # construct the args - dtypes = list(self.dtypes) + first_dtype = self.dtypes.iloc[0] if len(self.columns) else None if self._can_fast_transpose: # Note: tests pass without this, but this improves perf quite a bit. new_vals = self._values.T - if copy and not using_copy_on_write(): - new_vals = new_vals.copy() result = self._constructor( new_vals, @@ -3852,16 +3802,16 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: copy=False, dtype=new_vals.dtype, ) - if using_copy_on_write() and len(self) > 0: - result._mgr.add_references(self._mgr) # type: ignore[arg-type] + if len(self) > 0: + result._mgr.add_references(self._mgr) elif ( self._is_homogeneous_type - and dtypes - and isinstance(dtypes[0], ExtensionDtype) + and first_dtype is not None + and isinstance(first_dtype, ExtensionDtype) ): new_values: list - if isinstance(dtypes[0], BaseMaskedDtype): + if isinstance(first_dtype, BaseMaskedDtype): # We have masked arrays with the same dtype. We can transpose faster. from pandas.core.arrays.masked import ( transpose_homogeneous_masked_arrays, @@ -3870,7 +3820,7 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: new_values = transpose_homogeneous_masked_arrays( cast(Sequence[BaseMaskedArray], self._iter_column_arrays()) ) - elif isinstance(dtypes[0], ArrowDtype): + elif isinstance(first_dtype, ArrowDtype): # We have arrow EAs with the same dtype. We can transpose faster. from pandas.core.arrays.arrow.array import ( ArrowExtensionArray, @@ -3882,10 +3832,11 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: ) else: # We have other EAs with the same dtype. We preserve dtype in transpose. 
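While reviewing the ``transpose`` rewrite, keep the docstring's mixed-dtype warning in mind: the homogeneous-dtype fast paths guarded by ``first_dtype`` exist precisely because mixed dtypes fall back to an object-dtype transpose. Illustration (not part of the patch):

```python
import pandas as pd

df = pd.DataFrame({"name": ["Alice", "Bob"], "score": [9.5, 8.0]})

# Mixed dtypes collapse to object on transpose...
print(df.T.dtypes.unique())  # [dtype('O')]

# ...and transposing back does not restore the original dtypes
print(df.T.T.dtypes)  # name and score are both object now
```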
- dtyp = dtypes[0] - arr_typ = dtyp.construct_array_type() + arr_typ = first_dtype.construct_array_type() values = self.values - new_values = [arr_typ._from_sequence(row, dtype=dtyp) for row in values] + new_values = [ + arr_typ._from_sequence(row, dtype=first_dtype) for row in values + ] result = type(self)._from_arrays( new_values, @@ -3896,8 +3847,6 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: else: new_arr = self.values.T - if copy and not using_copy_on_write(): - new_arr = new_arr.copy() result = self._constructor( new_arr, index=self.columns, @@ -3925,7 +3874,7 @@ def T(self) -> DataFrame: Examples -------- - >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) >>> df col1 col2 0 1 3 @@ -3956,24 +3905,14 @@ def _ixs(self, i: int, axis: AxisInt = 0) -> Series: if axis == 0: new_mgr = self._mgr.fast_xs(i) - # if we are a copy, mark as such - copy = isinstance(new_mgr.array, np.ndarray) and new_mgr.array.base is None result = self._constructor_sliced_from_mgr(new_mgr, axes=new_mgr.axes) result._name = self.index[i] - result = result.__finalize__(self) - result._set_is_copy(self, copy=copy) - return result + return result.__finalize__(self) # icol else: - label = self.columns[i] - col_mgr = self._mgr.iget(i) - result = self._box_col_values(col_mgr, i) - - # this is a cached value, mark it so - result._set_as_cached(label, self) - return result + return self._box_col_values(col_mgr, i) def _get_column_array(self, i: int) -> ArrayLike: """ @@ -3993,48 +3932,27 @@ def _iter_column_arrays(self) -> Iterator[ArrayLike]: Warning! The returned array is a view but doesn't handle Copy-on-Write, so this should be used with caution (for read-only purposes). """ - if isinstance(self._mgr, ArrayManager): - yield from self._mgr.arrays - else: - for i in range(len(self.columns)): - yield self._get_column_array(i) - - def _getitem_nocopy(self, key: list): - """ - Behaves like __getitem__, but returns a view in cases where __getitem__ - would make a copy. - """ - # TODO(CoW): can be removed if/when we are always Copy-on-Write - indexer = self.columns._get_indexer_strict(key, "columns")[1] - new_axis = self.columns[indexer] - - new_mgr = self._mgr.reindex_indexer( - new_axis, - indexer, - axis=0, - allow_dups=True, - copy=False, - only_slice=True, - ) - return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes) + for i in range(len(self.columns)): + yield self._get_column_array(i) def __getitem__(self, key): check_dict_or_set_indexers(key) key = lib.item_from_zerodim(key) key = com.apply_if_callable(key, self) - if is_hashable(key) and not is_iterator(key): + if is_hashable(key) and not is_iterator(key) and not isinstance(key, slice): # is_iterator to exclude generator e.g. 
test_getitem_listlike + # As of Python 3.12, slice is hashable which breaks MultiIndex (GH#57500) + # shortcut if the key is in columns is_mi = isinstance(self.columns, MultiIndex) # GH#45316 Return view if key is not duplicated # Only use drop_duplicates with duplicates for performance if not is_mi and ( - self.columns.is_unique - and key in self.columns + (self.columns.is_unique and key in self.columns) or key in self.columns.drop_duplicates(keep=False) ): - return self._get_item_cache(key) + return self._get_item(key) elif is_mi and self.columns.is_unique and key in self.columns: return self._getitem_multilevel(key) @@ -4073,7 +3991,7 @@ def __getitem__(self, key): if isinstance(indexer, slice): return self._slice(indexer, axis=1) - data = self._take_with_is_copy(indexer, axis=1) + data = self.take(indexer, axis=1) if is_single_key: # What does looking for a single key in a non-unique index return? @@ -4082,7 +4000,7 @@ def __getitem__(self, key): # - we have a MultiIndex on columns (test on self.columns, #21309) if data.shape[1] == 1 and not isinstance(self.columns, MultiIndex): # GH#26490 using data[key] can cause RecursionError - return data._get_item_cache(key) + return data._get_item(key) return data @@ -4108,10 +4026,10 @@ def _getitem_bool_array(self, key): key = check_bool_indexer(self.index, key) if key.all(): - return self.copy(deep=None) + return self.copy(deep=False) indexer = key.nonzero()[0] - return self._take_with_is_copy(indexer, axis=0) + return self.take(indexer, axis=0) def _getitem_multilevel(self, key): # self.columns is a MultiIndex @@ -4141,7 +4059,6 @@ def _getitem_multilevel(self, key): result, index=self.index, name=key ) - result._set_is_copy(self) return result else: # loc is neither a slice nor ndarray, so must be an int @@ -4170,8 +4087,7 @@ def _get_value(self, index, col, takeable: bool = False) -> Scalar: series = self._ixs(col, axis=1) return series._values[index] - series = self._get_item_cache(col) - engine = self.index._engine + series = self._get_item(col) if not isinstance(self.index, MultiIndex): # CategoricalIndex: Trying to use the engine fastpath may give incorrect @@ -4182,7 +4098,7 @@ def _get_value(self, index, col, takeable: bool = False) -> Scalar: # For MultiIndex going through engine effectively restricts us to # same-length tuples; see test_get_set_value_no_partial_indexing - loc = engine.get_loc(index) + loc = self.index._engine.get_loc(index) return series._values[loc] def isetitem(self, loc, value) -> None: @@ -4198,6 +4114,11 @@ def isetitem(self, loc, value) -> None: value : scalar or arraylike Value(s) for the column. + See Also + -------- + DataFrame.iloc : Purely integer-location based indexing for selection by + position. + Notes ----- ``frame.isetitem(loc, value)`` is an in-place method as it will @@ -4208,6 +4129,15 @@ def isetitem(self, loc, value) -> None: In cases where ``frame.columns`` is unique, this is equivalent to ``frame[frame.columns[i]] = value``. 
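Before the added ``isetitem`` example below: the practical difference from ``iloc`` assignment, per the Notes, is that ``isetitem`` swaps in a new column array rather than trying to write into the existing one. A sketch of that behavior:

```python
import pandas as pd

df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})

# Replaces column "B" (positional loc=1) wholesale, so the dtype
# follows the new values instead of the old int64 array.
df.isetitem(1, [5.5, 6.5])
print(df.dtypes["B"])  # float64
```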
+ + Examples + -------- + >>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) + >>> df.isetitem(1, [5, 6]) + >>> df + A B + 0 1 5 + 1 2 6 """ if isinstance(value, DataFrame): if is_integer(loc): @@ -4228,22 +4158,11 @@ def isetitem(self, loc, value) -> None: self._iset_item_mgr(loc, arraylike, inplace=False, refs=refs) def __setitem__(self, key, value) -> None: - if not PYPY and using_copy_on_write(): + if not PYPY: if sys.getrefcount(self) <= 3: warnings.warn( _chained_assignment_msg, ChainedAssignmentError, stacklevel=2 ) - elif not PYPY and not using_copy_on_write(): - if sys.getrefcount(self) <= 3 and ( - warn_copy_on_write() - or ( - not warn_copy_on_write() - and any(b.refs.has_reference() for b in self._mgr.blocks) # type: ignore[union-attr] - ) - ): - warnings.warn( - _chained_assignment_warning_msg, FutureWarning, stacklevel=2 - ) key = com.apply_if_callable(key, self) @@ -4273,10 +4192,9 @@ def _setitem_slice(self, key: slice, value) -> None: # NB: we can't just use self.loc[key] = value because that # operates on labels and we need to operate positional for # backwards-compat, xref GH#31469 - self._check_setitem_copy() self.iloc[key] = value - def _setitem_array(self, key, value): + def _setitem_array(self, key, value) -> None: # also raises Exception if object array with NA values if com.is_bool_indexer(key): # bool indexer is indexing along rows @@ -4286,7 +4204,6 @@ def _setitem_array(self, key, value): ) key = check_bool_indexer(self.index, key) indexer = key.nonzero()[0] - self._check_setitem_copy() if isinstance(value, DataFrame): # GH#39931 reindex since iloc does not align value = value.reindex(self.index.take(indexer)) @@ -4311,12 +4228,12 @@ def _setitem_array(self, key, value): elif np.ndim(value) > 1: # list of lists value = DataFrame(value).values - return self._setitem_array(key, value) + self._setitem_array(key, value) else: self._iset_not_inplace(key, value) - def _iset_not_inplace(self, key, value): + def _iset_not_inplace(self, key, value) -> None: # GH#39510 when setting with df[key] = obj with a list-like key and # list-like value, we iterate over those listlikes and set columns # one at a time. This is different from dispatching to @@ -4360,7 +4277,7 @@ def igetitem(obj, i: int): finally: self.columns = orig_columns - def _setitem_frame(self, key, value): + def _setitem_frame(self, key, value) -> None: # support boolean setting with DataFrame input, e.g. 
# df[df > df2] = 0 if isinstance(key, np.ndarray): @@ -4368,12 +4285,11 @@ def _setitem_frame(self, key, value): raise ValueError("Array conditional must be same shape as self") key = self._constructor(key, **self._construct_axes_dict(), copy=False) - if key.size and not all(is_bool_dtype(dtype) for dtype in key.dtypes): + if key.size and not all(is_bool_dtype(blk.dtype) for blk in key._mgr.blocks): raise TypeError( "Must pass DataFrame or 2-d ndarray with boolean values only" ) - self._check_setitem_copy() self._where(-key, value, inplace=True) def _set_item_frame_value(self, key, value: DataFrame) -> None: @@ -4435,7 +4351,6 @@ def _iset_item_mgr( ) -> None: # when called from _set_item_mgr loc can be anything returned from get_loc self._mgr.iset(loc, value, inplace=inplace, refs=refs) - self._clear_item_cache() def _set_item_mgr( self, key, value: ArrayLike, refs: BlockValuesRefs | None = None @@ -4448,27 +4363,10 @@ def _set_item_mgr( else: self._iset_item_mgr(loc, value, refs=refs) - # check if we are modifying a copy - # try to set first as we want an invalid - # value exception to occur first - if len(self): - self._check_setitem_copy() - def _iset_item(self, loc: int, value: Series, inplace: bool = True) -> None: # We are only called from _replace_columnwise which guarantees that # no reindex is necessary - if using_copy_on_write(): - self._iset_item_mgr( - loc, value._values, inplace=inplace, refs=value._references - ) - else: - self._iset_item_mgr(loc, value._values.copy(), inplace=True) - - # check if we are modifying a copy - # try to set first as we want an invalid - # value exception to occur first - if len(self): - self._check_setitem_copy() + self._iset_item_mgr(loc, value._values, inplace=inplace, refs=value._references) def _set_item(self, key, value) -> None: """ @@ -4520,7 +4418,6 @@ def _set_value( icol = self.columns.get_loc(col) iindex = self.index.get_loc(index) self._mgr.column_setitem(icol, iindex, value, inplace_only=True) - self._clear_item_cache() except (KeyError, TypeError, ValueError, LossySetitemError): # get_loc might raise a KeyError for missing labels (falling back @@ -4532,7 +4429,6 @@ def _set_value( self.iloc[index, col] = value else: self.loc[index, col] = value - self._item_cache.pop(col, None) except InvalidIndexError as ii_err: # GH48729: Seems like you are trying to assign a value to a @@ -4564,106 +4460,132 @@ def _ensure_valid_index(self, value) -> None: self._mgr = self._mgr.reindex_axis(index_copy, axis=1, fill_value=np.nan) - def _box_col_values(self, values: SingleDataManager, loc: int) -> Series: + def _box_col_values(self, values: SingleBlockManager, loc: int) -> Series: """ Provide boxed values for a column. """ # Lookup in columns so that if e.g. a str datetime was passed # we attach the Timestamp object as the name. 
name = self.columns[loc] - # We get index=self.index bc values is a SingleDataManager + # We get index=self.index bc values is a SingleBlockManager obj = self._constructor_sliced_from_mgr(values, axes=values.axes) obj._name = name return obj.__finalize__(self) - # ---------------------------------------------------------------------- - # Lookup Caching - - def _clear_item_cache(self) -> None: - self._item_cache.clear() - - def _get_item_cache(self, item: Hashable) -> Series: - """Return the cached item, item represents a label indexer.""" - if using_copy_on_write() or warn_copy_on_write(): - loc = self.columns.get_loc(item) - return self._ixs(loc, axis=1) - - cache = self._item_cache - res = cache.get(item) - if res is None: - # All places that call _get_item_cache have unique columns, - # pending resolution of GH#33047 - - loc = self.columns.get_loc(item) - res = self._ixs(loc, axis=1) - - cache[item] = res - - # for a chain - res._is_copy = self._is_copy - return res - - def _reset_cacher(self) -> None: - # no-op for DataFrame - pass - - def _maybe_cache_changed(self, item, value: Series, inplace: bool) -> None: - """ - The object has called back to us saying maybe it has changed. - """ - loc = self._info_axis.get_loc(item) - arraylike = value._values - - old = self._ixs(loc, axis=1) - if old._values is value._values and inplace: - # GH#46149 avoid making unnecessary copies/block-splitting - return - - self._mgr.iset(loc, arraylike, inplace=inplace) + def _get_item(self, item: Hashable) -> Series: + loc = self.columns.get_loc(item) + return self._ixs(loc, axis=1) # ---------------------------------------------------------------------- # Unsorted @overload - def query(self, expr: str, *, inplace: Literal[False] = ..., **kwargs) -> DataFrame: - ... + def query( + self, + expr: str, + *, + parser: Literal["pandas", "python"] = ..., + engine: Literal["python", "numexpr"] | None = ..., + local_dict: dict[str, Any] | None = ..., + global_dict: dict[str, Any] | None = ..., + resolvers: list[Mapping] | None = ..., + level: int = ..., + inplace: Literal[False] = ..., + ) -> DataFrame: ... @overload - def query(self, expr: str, *, inplace: Literal[True], **kwargs) -> None: - ... + def query( + self, + expr: str, + *, + parser: Literal["pandas", "python"] = ..., + engine: Literal["python", "numexpr"] | None = ..., + local_dict: dict[str, Any] | None = ..., + global_dict: dict[str, Any] | None = ..., + resolvers: list[Mapping] | None = ..., + level: int = ..., + inplace: Literal[True], + ) -> None: ... @overload - def query(self, expr: str, *, inplace: bool = ..., **kwargs) -> DataFrame | None: - ... + def query( + self, + expr: str, + *, + parser: Literal["pandas", "python"] = ..., + engine: Literal["python", "numexpr"] | None = ..., + local_dict: dict[str, Any] | None = ..., + global_dict: dict[str, Any] | None = ..., + resolvers: list[Mapping] | None = ..., + level: int = ..., + inplace: bool = ..., + ) -> DataFrame | None: ... - def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | None: + def query( + self, + expr: str, + *, + parser: Literal["pandas", "python"] = "pandas", + engine: Literal["python", "numexpr"] | None = None, + local_dict: dict[str, Any] | None = None, + global_dict: dict[str, Any] | None = None, + resolvers: list[Mapping] | None = None, + level: int = 0, + inplace: bool = False, + ) -> DataFrame | None: """ Query the columns of a DataFrame with a boolean expression. + .. 
warning:: + + This method can run arbitrary code which can make you vulnerable to code + injection if you pass user input to this function. + Parameters ---------- expr : str The query string to evaluate. - You can refer to variables - in the environment by prefixing them with an '@' character like - ``@a + b``. - - You can refer to column names that are not valid Python variable names - by surrounding them in backticks. Thus, column names containing spaces - or punctuations (besides underscores) or starting with digits must be - surrounded by backticks. (For example, a column named "Area (cm^2)" would - be referenced as ```Area (cm^2)```). Column names which are Python keywords - (like "list", "for", "import", etc) cannot be used. - - For example, if one of your columns is called ``a a`` and you want - to sum it with ``b``, your query should be ```a a` + b``. - + See the documentation for :func:`eval` for details of + supported operations and functions in the query string. + + See the documentation for :meth:`DataFrame.eval` for details on + referring to column names and variables in the query string. + parser : {'pandas', 'python'}, default 'pandas' + The parser to use to construct the syntax tree from the expression. The + default of ``'pandas'`` parses code slightly different than standard + Python. Alternatively, you can parse an expression using the + ``'python'`` parser to retain strict Python semantics. See the + :ref:`enhancing performance ` documentation for + more details. + engine : {'python', 'numexpr'}, default 'numexpr' + + The engine used to evaluate the expression. Supported engines are + + - None : tries to use ``numexpr``, falls back to ``python`` + - ``'numexpr'`` : This default engine evaluates pandas objects using + numexpr for large speed ups in complex expressions with large frames. + - ``'python'`` : Performs operations as if you had ``eval``'d in top + level python. This engine is generally not that useful. + + More backends may be available in the future. + local_dict : dict or None, optional + A dictionary of local variables, taken from locals() by default. + global_dict : dict or None, optional + A dictionary of global variables, taken from globals() by default. + resolvers : list of dict-like or None, optional + A list of objects implementing the ``__getitem__`` special method that + you can use to inject an additional collection of namespaces to use for + variable lookup. For example, this is used in the + :meth:`~DataFrame.query` method to inject the + ``DataFrame.index`` and ``DataFrame.columns`` + variables that refer to their respective :class:`~pandas.DataFrame` + instance attributes. + level : int, optional + The number of prior stack frames to traverse and add to the current + scope. Most users will **not** need to change this parameter. inplace : bool Whether to modify the DataFrame rather than creating a new one. - **kwargs - See the documentation for :func:`eval` for complete details - on the keyword arguments accepted by :meth:`DataFrame.query`. Returns ------- @@ -4723,63 +4645,70 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No quoted string are replaced by strings that are allowed as a Python identifier. These characters include all operators in Python, the space character, the question mark, the exclamation mark, the dollar sign, and the euro sign. 
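Since ``query`` now declares ``parser``, ``engine``, ``local_dict``, ``global_dict``, ``resolvers`` and ``level`` explicitly instead of funneling ``**kwargs`` into ``eval``, option typos surface at the ``query`` call itself. Sketch:

```python
import pandas as pd

df = pd.DataFrame({"A": range(1, 6), "B": range(10, 0, -2)})

# Explicit, keyword-only parameters:
print(df.query("A > B", engine="python", parser="pandas"))

# A typo such as engin="python" now raises TypeError at the query()
# call rather than deeper inside eval().
```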
- For other characters that fall outside the ASCII range (U+0001..U+007F) - and those that are not further specified in PEP 3131, - the query parser will raise an error. - This excludes whitespace different than the space character, - but also the hashtag (as it is used for comments) and the backtick - itself (backtick can also not be escaped). - - In a special case, quotes that make a pair around a backtick can - confuse the parser. - For example, ```it's` > `that's``` will raise an error, - as it forms a quoted string (``'s > `that'``) with a backtick inside. - - See also the Python documentation about lexical analysis - (https://docs.python.org/3/reference/lexical_analysis.html) + + A backtick can be escaped by double backticks. + + See also the `Python documentation about lexical analysis + `__ in combination with the source code in :mod:`pandas.core.computation.parsing`. Examples -------- - >>> df = pd.DataFrame({'A': range(1, 6), - ... 'B': range(10, 0, -2), - ... 'C C': range(10, 5, -1)}) + >>> df = pd.DataFrame( + ... {"A": range(1, 6), "B": range(10, 0, -2), "C&C": range(10, 5, -1)} + ... ) >>> df - A B C C + A B C&C 0 1 10 10 1 2 8 9 2 3 6 8 3 4 4 7 4 5 2 6 - >>> df.query('A > B') - A B C C + >>> df.query("A > B") + A B C&C 4 5 2 6 The previous expression is equivalent to >>> df[df.A > df.B] - A B C C + A B C&C 4 5 2 6 For columns with spaces in their name, you can use backtick quoting. - >>> df.query('B == `C C`') - A B C C + >>> df.query("B == `C&C`") + A B C&C 0 1 10 10 The previous expression is equivalent to - >>> df[df.B == df['C C']] - A B C C + >>> df[df.B == df["C&C"]] + A B C&C 0 1 10 10 + + Using local variable: + + >>> local_var = 2 + >>> df.query("A <= @local_var") + A B C&C + 0 1 10 10 + 1 2 8 9 """ inplace = validate_bool_kwarg(inplace, "inplace") if not isinstance(expr, str): msg = f"expr must be a string to be evaluated, {type(expr)} given" raise ValueError(msg) - kwargs["level"] = kwargs.pop("level", 0) + 1 - kwargs["target"] = None - res = self.eval(expr, **kwargs) + + res = self.eval( + expr, + level=level + 1, + parser=parser, + target=None, + engine=engine, + local_dict=local_dict, + global_dict=global_dict, + resolvers=resolvers or (), + ) try: result = self.loc[res] @@ -4795,17 +4724,20 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No return result @overload - def eval(self, expr: str, *, inplace: Literal[False] = ..., **kwargs) -> Any: - ... + def eval(self, expr: str, *, inplace: Literal[False] = ..., **kwargs) -> Any: ... @overload - def eval(self, expr: str, *, inplace: Literal[True], **kwargs) -> None: - ... + def eval(self, expr: str, *, inplace: Literal[True], **kwargs) -> None: ... def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: """ Evaluate a string describing operations on DataFrame columns. + .. warning:: + + This method can run arbitrary code which can make you vulnerable to code + injection if you pass user input to this function. + Operates on columns only, not specific rows or elements. This allows `eval` to run arbitrary code, which can make you vulnerable to code injection if you pass user input to this function. @@ -4814,6 +4746,23 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: ---------- expr : str The expression string to evaluate. + + You can refer to variables + in the environment by prefixing them with an '@' character like + ``@a + b``. 
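The ``eval`` docstring continues below; since assignment combined with ``inplace`` is the part users most often get wrong, a small sketch of the semantics described there:

```python
import pandas as pd

df = pd.DataFrame({"A": [1, 2, 3], "B": [10, 8, 6]})

# Default: the assignment lands on a returned copy, not on df itself
out = df.eval("C = A + B")
print("C" in df.columns, "C" in out.columns)  # False True

# inplace=True mutates df and returns None
df.eval("C = A + B", inplace=True)
print("C" in df.columns)  # True
```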
+ + You can refer to column names that are not valid Python variable names + by surrounding them in backticks. Thus, column names containing spaces + or punctuation (besides underscores) or starting with digits must be + surrounded by backticks. (For example, a column named "Area (cm^2)" would + be referenced as ```Area (cm^2)```). Column names which are Python keywords + (like "if", "for", "import", etc) cannot be used. + + For example, if one of your columns is called ``a a`` and you want + to sum it with ``b``, your query should be ```a a` + b``. + + See the documentation for :func:`eval` for full details of + supported operations and functions in the expression string. inplace : bool, default False If the expression contains an assignment, whether to perform the operation inplace and mutate the existing DataFrame. Otherwise, @@ -4821,7 +4770,7 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: **kwargs See the documentation for :func:`eval` for complete details on the keyword arguments accepted by - :meth:`~pandas.DataFrame.query`. + :meth:`~pandas.DataFrame.eval`. Returns ------- @@ -4845,15 +4794,17 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: Examples -------- - >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)}) + >>> df = pd.DataFrame( + ... {"A": range(1, 6), "B": range(10, 0, -2), "C&C": range(10, 5, -1)} + ... ) >>> df - A B - 0 1 10 - 1 2 8 - 2 3 6 - 3 4 4 - 4 5 2 - >>> df.eval('A + B') + A B C&C + 0 1 10 10 + 1 2 8 9 + 2 3 6 8 + 3 4 4 7 + 4 5 2 6 + >>> df.eval("A + B") 0 11 1 10 2 9 @@ -4864,35 +4815,56 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: Assignment is allowed though by default the original DataFrame is not modified. - >>> df.eval('C = A + B') - A B C - 0 1 10 11 - 1 2 8 10 - 2 3 6 9 - 3 4 4 8 - 4 5 2 7 + >>> df.eval("D = A + B") + A B C&C D + 0 1 10 10 11 + 1 2 8 9 10 + 2 3 6 8 9 + 3 4 4 7 8 + 4 5 2 6 7 >>> df - A B - 0 1 10 - 1 2 8 - 2 3 6 - 3 4 4 - 4 5 2 + A B C&C + 0 1 10 10 + 1 2 8 9 + 2 3 6 8 + 3 4 4 7 + 4 5 2 6 Multiple columns can be assigned to using multi-line expressions: >>> df.eval( ... ''' - ... C = A + B - ... D = A - B + ... D = A + B + ... E = A - B ... ''' ... ) - A B C D - 0 1 10 11 -9 - 1 2 8 10 -6 - 2 3 6 9 -3 - 3 4 4 8 0 - 4 5 2 7 3 + A B C&C D E + 0 1 10 10 11 -9 + 1 2 8 9 10 -6 + 2 3 6 8 9 -3 + 3 4 4 7 8 0 + 4 5 2 6 7 3 + + For columns with spaces or other disallowed characters in their name, you can + use backtick quoting. + + >>> df.eval("B * `C&C`") + 0 100 + 1 72 + 2 48 + 3 28 + 4 12 + + Local variables shall be explicitly referenced using ``@`` + character in front of the name: + + >>> local_var = 2 + >>> df.eval("@local_var * A") + 0 2 + 1 4 + 2 6 + 3 8 + 4 10 """ from pandas.core.computation.eval import eval as _eval @@ -4907,10 +4879,14 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: return _eval(expr, inplace=inplace, **kwargs) - def select_dtypes(self, include=None, exclude=None) -> Self: + def select_dtypes(self, include=None, exclude=None) -> DataFrame: """ Return a subset of the DataFrame's columns based on the column dtypes. + This method allows for filtering columns based on their data types. + It is useful when working with heterogeneous DataFrames where operations + need to be performed on a specific subset of data types. 
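One dtype-kind subtlety worth a line next to the new ``select_dtypes`` intro above: booleans are their own kind, so they are not captured by ``np.number``. Sketch (illustrative):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [True, False], "c": [1.0, 2.0]})

# np.number matches the int and float columns but not the bool column
print(df.select_dtypes(include=np.number).columns.tolist())  # ['a', 'c']
print(df.select_dtypes(include="bool").columns.tolist())  # ['b']
```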
+ Parameters ---------- include, exclude : scalar or list-like @@ -4928,6 +4904,7 @@ def select_dtypes(self, include=None, exclude=None) -> Self: ValueError * If both of ``include`` and ``exclude`` are empty * If ``include`` and ``exclude`` have overlapping elements + TypeError * If any kind of string dtype is passed in. See Also @@ -4938,7 +4915,9 @@ def select_dtypes(self, include=None, exclude=None) -> Self: ----- * To select all *numeric* types, use ``np.number`` or ``'number'`` * To select strings you must use the ``object`` dtype, but note that - this will return *all* object dtype columns + this will return *all* object dtype columns. With + ``pd.options.future.infer_string`` enabled, using ``"str"`` will + work to select all string columns. * See the `numpy dtype hierarchy `__ * To select datetimes, use ``np.datetime64``, ``'datetime'`` or @@ -4951,9 +4930,9 @@ def select_dtypes(self, include=None, exclude=None) -> Self: Examples -------- - >>> df = pd.DataFrame({'a': [1, 2] * 3, - ... 'b': [True, False] * 3, - ... 'c': [1.0, 2.0] * 3}) + >>> df = pd.DataFrame( + ... {"a": [1, 2] * 3, "b": [True, False] * 3, "c": [1.0, 2.0] * 3} + ... ) >>> df a b c 0 1 True 1.0 @@ -4963,7 +4942,7 @@ def select_dtypes(self, include=None, exclude=None) -> Self: 4 1 True 1.0 5 2 False 2.0 - >>> df.select_dtypes(include='bool') + >>> df.select_dtypes(include="bool") b 0 True 1 False @@ -4972,7 +4951,7 @@ def select_dtypes(self, include=None, exclude=None) -> Self: 4 True 5 False - >>> df.select_dtypes(include=['float64']) + >>> df.select_dtypes(include=["float64"]) c 0 1.0 1 2.0 @@ -4981,7 +4960,7 @@ def select_dtypes(self, include=None, exclude=None) -> Self: 4 1.0 5 2.0 - >>> df.select_dtypes(exclude=['int64']) + >>> df.select_dtypes(exclude=["int64"]) b c 0 True 1.0 1 False 2.0 @@ -5047,14 +5026,14 @@ def predicate(arr: ArrayLike) -> bool: return True - mgr = self._mgr._get_data_subset(predicate).copy(deep=None) + mgr = self._mgr._get_data_subset(predicate).copy(deep=False) return self._constructor_from_mgr(mgr, axes=mgr.axes).__finalize__(self) def insert( self, loc: int, column: Hashable, - value: Scalar | AnyArrayLike, + value: object, allow_duplicates: bool | lib.NoDefault = lib.no_default, ) -> None: """ @@ -5080,7 +5059,7 @@ def insert( Examples -------- - >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) >>> df col1 col2 0 1 3 @@ -5138,7 +5117,7 @@ def assign(self, **kwargs) -> DataFrame: Parameters ---------- - **kwargs : dict of {str: callable or Series} + **kwargs : callable or Series The column names are keywords. If the values are callable, they are computed on the DataFrame and assigned to the new columns. The callable must not @@ -5152,6 +5131,11 @@ def assign(self, **kwargs) -> DataFrame: A new DataFrame with the new columns in addition to all the existing columns. + See Also + -------- + DataFrame.loc : Select a subset of a DataFrame by labels. + DataFrame.iloc : Select a subset of a DataFrame by positions. + Notes ----- Assigning multiple columns within the same ``assign`` is possible. @@ -5160,8 +5144,7 @@ def assign(self, **kwargs) -> DataFrame: Examples -------- - >>> df = pd.DataFrame({'temp_c': [17.0, 25.0]}, - ... 
index=['Portland', 'Berkeley']) + >>> df = pd.DataFrame({"temp_c": [17.0, 25.0]}, index=["Portland", "Berkeley"]) >>> df temp_c Portland 17.0 @@ -5177,7 +5160,7 @@ def assign(self, **kwargs) -> DataFrame: Alternatively, the same behavior can be achieved by directly referencing an existing Series or sequence: - >>> df.assign(temp_f=df['temp_c'] * 9 / 5 + 32) + >>> df.assign(temp_f=df["temp_c"] * 9 / 5 + 32) temp_c temp_f Portland 17.0 62.6 Berkeley 25.0 77.0 @@ -5185,13 +5168,15 @@ def assign(self, **kwargs) -> DataFrame: You can create multiple columns within the same assign where one of the columns depends on another one defined within the same assign: - >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32, - ... temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9) + >>> df.assign( + ... temp_f=lambda x: x["temp_c"] * 9 / 5 + 32, + ... temp_k=lambda x: (x["temp_f"] + 459.67) * 5 / 9, + ... ) temp_c temp_f temp_k Portland 17.0 62.6 290.15 Berkeley 25.0 77.0 298.15 """ - data = self.copy(deep=None) + data = self.copy(deep=False) for k, v in kwargs.items(): data[k] = com.apply_if_callable(v, data) @@ -5222,22 +5207,7 @@ def _sanitize_column(self, value) -> tuple[ArrayLike, BlockValuesRefs | None]: if is_list_like(value): com.require_length_match(value, self.index) - arr = sanitize_array(value, self.index, copy=True, allow_2d=True) - if ( - isinstance(value, Index) - and value.dtype == "object" - and arr.dtype != value.dtype - ): # - # TODO: Remove kludge in sanitize_array for string mode when enforcing - # this deprecation - warnings.warn( - "Setting an Index with object dtype into a DataFrame will stop " - "inferring another dtype in a future version. Cast the Index " - "explicitly before setting it into the DataFrame.", - FutureWarning, - stacklevel=find_stack_level(), - ) - return arr, None + return sanitize_array(value, self.index, copy=True, allow_2d=True), None @property def _series(self): @@ -5246,9 +5216,7 @@ def _series(self): # ---------------------------------------------------------------------- # Reindexing and alignment - def _reindex_multi( - self, axes: dict[str, Index], copy: bool, fill_value - ) -> DataFrame: + def _reindex_multi(self, axes: dict[str, Index], fill_value) -> DataFrame: """ We are guaranteed non-Nones in the axes. """ @@ -5270,7 +5238,6 @@ def _reindex_multi( else: return self._reindex_with_indexers( {0: [new_index, row_indexer], 1: [new_columns, col_indexer]}, - copy=copy, fill_value=fill_value, ) @@ -5310,7 +5277,7 @@ def set_axis( labels, *, axis: Axis = 0, - copy: bool | None = None, + copy: bool | lib.NoDefault = lib.no_default, ) -> DataFrame: return super().set_axis(labels, axis=axis, copy=copy) @@ -5327,7 +5294,7 @@ def reindex( columns=None, axis: Axis | None = None, method: ReindexMethod | None = None, - copy: bool | None = None, + copy: bool | lib.NoDefault = lib.no_default, level: Level | None = None, fill_value: Scalar | None = np.nan, limit: int | None = None, @@ -5339,62 +5306,59 @@ def reindex( columns=columns, axis=axis, method=method, - copy=copy, level=level, fill_value=fill_value, limit=limit, tolerance=tolerance, + copy=copy, ) @overload def drop( self, - labels: IndexLabel = ..., + labels: IndexLabel | ListLike = ..., *, axis: Axis = ..., - index: IndexLabel = ..., - columns: IndexLabel = ..., + index: IndexLabel | ListLike = ..., + columns: IndexLabel | ListLike = ..., level: Level = ..., inplace: Literal[True], errors: IgnoreRaise = ..., - ) -> None: - ... + ) -> None: ... 
@overload def drop( self, - labels: IndexLabel = ..., + labels: IndexLabel | ListLike = ..., *, axis: Axis = ..., - index: IndexLabel = ..., - columns: IndexLabel = ..., + index: IndexLabel | ListLike = ..., + columns: IndexLabel | ListLike = ..., level: Level = ..., inplace: Literal[False] = ..., errors: IgnoreRaise = ..., - ) -> DataFrame: - ... + ) -> DataFrame: ... @overload def drop( self, - labels: IndexLabel = ..., + labels: IndexLabel | ListLike = ..., *, axis: Axis = ..., - index: IndexLabel = ..., - columns: IndexLabel = ..., + index: IndexLabel | ListLike = ..., + columns: IndexLabel | ListLike = ..., level: Level = ..., inplace: bool = ..., errors: IgnoreRaise = ..., - ) -> DataFrame | None: - ... + ) -> DataFrame | None: ... def drop( self, - labels: IndexLabel | None = None, + labels: IndexLabel | ListLike = None, *, axis: Axis = 0, - index: IndexLabel | None = None, - columns: IndexLabel | None = None, + index: IndexLabel | ListLike = None, + columns: IndexLabel | ListLike = None, level: Level | None = None, inplace: bool = False, errors: IgnoreRaise = "raise", @@ -5410,16 +5374,16 @@ def drop( Parameters ---------- - labels : single label or list-like + labels : single label or iterable of labels Index or column labels to drop. A tuple will be used as a single - label and not treated as a list-like. + label and not treated as an iterable. axis : {0 or 'index', 1 or 'columns'}, default 0 Whether to drop labels from the index (0 or 'index') or columns (1 or 'columns'). - index : single label or list-like + index : single label or iterable of labels Alternative to specifying axis (``labels, axis=0`` is equivalent to ``index=labels``). - columns : single label or list-like + columns : single label or iterable of labels Alternative to specifying axis (``labels, axis=1`` is equivalent to ``columns=labels``). level : int or level name, optional @@ -5453,8 +5417,7 @@ def drop( Examples -------- - >>> df = pd.DataFrame(np.arange(12).reshape(3, 4), - ... columns=['A', 'B', 'C', 'D']) + >>> df = pd.DataFrame(np.arange(12).reshape(3, 4), columns=["A", "B", "C", "D"]) >>> df A B C D 0 0 1 2 3 @@ -5463,13 +5426,13 @@ def drop( Drop columns - >>> df.drop(['B', 'C'], axis=1) + >>> df.drop(["B", "C"], axis=1) A D 0 0 3 1 4 7 2 8 11 - >>> df.drop(columns=['B', 'C']) + >>> df.drop(columns=["B", "C"]) A D 0 0 3 1 4 7 @@ -5483,14 +5446,25 @@ def drop( Drop columns and/or rows of MultiIndex DataFrame - >>> midx = pd.MultiIndex(levels=[['llama', 'cow', 'falcon'], - ... ['speed', 'weight', 'length']], - ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], - ... [0, 1, 2, 0, 1, 2, 0, 1, 2]]) - >>> df = pd.DataFrame(index=midx, columns=['big', 'small'], - ... data=[[45, 30], [200, 100], [1.5, 1], [30, 20], - ... [250, 150], [1.5, 0.8], [320, 250], - ... [1, 0.8], [0.3, 0.2]]) + >>> midx = pd.MultiIndex( + ... levels=[["llama", "cow", "falcon"], ["speed", "weight", "length"]], + ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], + ... ) + >>> df = pd.DataFrame( + ... index=midx, + ... columns=["big", "small"], + ... data=[ + ... [45, 30], + ... [200, 100], + ... [1.5, 1], + ... [30, 20], + ... [250, 150], + ... [1.5, 0.8], + ... [320, 250], + ... [1, 0.8], + ... [0.3, 0.2], + ... ], + ... 
) >>> df big small llama speed 45.0 30.0 @@ -5507,7 +5481,7 @@ def drop( DataFrame, i.e., drop the combination ``'falcon'`` and ``'weight'``, which deletes only the corresponding row - >>> df.drop(index=('falcon', 'weight')) + >>> df.drop(index=("falcon", "weight")) big small llama speed 45.0 30.0 weight 200.0 100.0 @@ -5518,7 +5492,7 @@ def drop( falcon speed 320.0 250.0 length 0.3 0.2 - >>> df.drop(index='cow', columns='small') + >>> df.drop(index="cow", columns="small") big llama speed 45.0 weight 200.0 @@ -5527,7 +5501,7 @@ def drop( weight 1.0 length 0.3 - >>> df.drop(index='length', level=1) + >>> df.drop(index="length", level=1) big small llama speed 45.0 30.0 weight 200.0 100.0 @@ -5554,12 +5528,11 @@ def rename( index: Renamer | None = ..., columns: Renamer | None = ..., axis: Axis | None = ..., - copy: bool | None = ..., + copy: bool | lib.NoDefault = lib.no_default, inplace: Literal[True], level: Level = ..., errors: IgnoreRaise = ..., - ) -> None: - ... + ) -> None: ... @overload def rename( @@ -5569,12 +5542,11 @@ def rename( index: Renamer | None = ..., columns: Renamer | None = ..., axis: Axis | None = ..., - copy: bool | None = ..., + copy: bool | lib.NoDefault = lib.no_default, inplace: Literal[False] = ..., level: Level = ..., errors: IgnoreRaise = ..., - ) -> DataFrame: - ... + ) -> DataFrame: ... @overload def rename( @@ -5584,12 +5556,11 @@ def rename( index: Renamer | None = ..., columns: Renamer | None = ..., axis: Axis | None = ..., - copy: bool | None = ..., + copy: bool | lib.NoDefault = lib.no_default, inplace: bool = ..., level: Level = ..., errors: IgnoreRaise = ..., - ) -> DataFrame | None: - ... + ) -> DataFrame | None: ... def rename( self, @@ -5598,7 +5569,7 @@ def rename( index: Renamer | None = None, columns: Renamer | None = None, axis: Axis | None = None, - copy: bool | None = None, + copy: bool | lib.NoDefault = lib.no_default, inplace: bool = False, level: Level | None = None, errors: IgnoreRaise = "ignore", @@ -5628,7 +5599,7 @@ def rename( axis : {0 or 'index', 1 or 'columns'}, default 0 Axis to target with ``mapper``. Can be either the axis name ('index', 'columns') or number (0, 1). The default is 'index'. - copy : bool, default True + copy : bool, default False Also copy underlying data. .. note:: @@ -5642,6 +5613,8 @@ def rename( You can already get the future behavior and improvements through enabling copy on write ``pd.options.mode.copy_on_write = True`` + + .. deprecated:: 3.0.0 inplace : bool, default False Whether to modify the DataFrame rather than creating a new one. If True then value of copy is ignored. @@ -5710,24 +5683,24 @@ def rename( Using axis-style parameters: - >>> df.rename(str.lower, axis='columns') + >>> df.rename(str.lower, axis="columns") a b 0 1 4 1 2 5 2 3 6 - >>> df.rename({1: 2, 2: 4}, axis='index') + >>> df.rename({1: 2, 2: 4}, axis="index") A B 0 1 4 2 2 5 4 3 6 """ + self._check_copy_deprecation(copy) return super()._rename( mapper=mapper, index=index, columns=columns, axis=axis, - copy=copy, inplace=inplace, level=level, errors=errors, @@ -5735,7 +5708,7 @@ def rename( def pop(self, item: Hashable) -> Series: """ - Return item and drop from frame. Raise KeyError if not found. + Return item and drop it from DataFrame. Raise KeyError if not found. Parameters ---------- @@ -5745,14 +5718,24 @@ def pop(self, item: Hashable) -> Series: Returns ------- Series + Series representing the item that is dropped. + + See Also + -------- + DataFrame.drop: Drop specified labels from rows or columns. 
+ DataFrame.drop_duplicates: Return DataFrame with duplicate rows removed. Examples -------- - >>> df = pd.DataFrame([('falcon', 'bird', 389.0), - ... ('parrot', 'bird', 24.0), - ... ('lion', 'mammal', 80.5), - ... ('monkey', 'mammal', np.nan)], - ... columns=('name', 'class', 'max_speed')) + >>> df = pd.DataFrame( + ... [ + ... ("falcon", "bird", 389.0), + ... ("parrot", "bird", 24.0), + ... ("lion", "mammal", 80.5), + ... ("monkey", "mammal", np.nan), + ... ], + ... columns=("name", "class", "max_speed"), + ... ) >>> df name class max_speed 0 falcon bird 389.0 @@ -5760,7 +5743,7 @@ def pop(self, item: Hashable) -> Series: 2 lion mammal 80.5 3 monkey mammal NaN - >>> df.pop('class') + >>> df.pop("class") 0 bird 1 bird 2 mammal @@ -5776,9 +5759,19 @@ def pop(self, item: Hashable) -> Series: """ return super().pop(item=item) + @overload + def _replace_columnwise( + self, mapping: dict[Hashable, tuple[Any, Any]], inplace: Literal[True], regex + ) -> None: ... + + @overload + def _replace_columnwise( + self, mapping: dict[Hashable, tuple[Any, Any]], inplace: Literal[False], regex + ) -> Self: ... + def _replace_columnwise( self, mapping: dict[Hashable, tuple[Any, Any]], inplace: bool, regex - ): + ) -> Self | None: """ Dispatch to Series.replace column-wise. @@ -5794,7 +5787,7 @@ def _replace_columnwise( DataFrame or None """ # Operate column-wise - res = self if inplace else self.copy(deep=None) + res = self if inplace else self.copy(deep=False) ax = self.columns for i, ax_value in enumerate(ax): @@ -5807,7 +5800,7 @@ def _replace_columnwise( res._iset_item(i, newobj, inplace=inplace) if inplace: - return + return None return res.__finalize__(self) @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"]) @@ -5821,14 +5814,12 @@ def shift( ) -> DataFrame: if freq is not None and fill_value is not lib.no_default: # GH#53832 - warnings.warn( - "Passing a 'freq' together with a 'fill_value' silently ignores " - "the fill_value and is deprecated. This will raise in a future " - "version.", - FutureWarning, - stacklevel=find_stack_level(), + raise ValueError( + "Passing a 'freq' together with a 'fill_value' is not allowed." ) - fill_value = lib.no_default + + if self.empty and freq is None: + return self.copy() axis = self._get_axis_number(axis) @@ -5860,7 +5851,6 @@ def shift( periods = cast(int, periods) ncols = len(self.columns) - arrays = self._mgr.arrays if axis == 1 and periods != 0 and ncols > 0 and freq is None: if fill_value is lib.no_default: # We will infer fill_value to match the closest column @@ -5886,12 +5876,12 @@ def shift( result.columns = self.columns.copy() return result - elif len(arrays) > 1 or ( + elif len(self._mgr.blocks) > 1 or ( # If we only have one block and we know that we can't # keep the same dtype (i.e. the _can_hold_element check) # then we can go through the reindex_indexer path # (and avoid casting logic in the Block method). - not can_hold_element(arrays[0], fill_value) + not can_hold_element(self._mgr.blocks[0].values, fill_value) ): # GH#35488 we need to watch out for multi-block cases # We only get here with fill_value not-lib.no_default @@ -5930,8 +5920,7 @@ def set_index( append: bool = ..., inplace: Literal[False] = ..., verify_integrity: bool = ..., - ) -> DataFrame: - ... + ) -> DataFrame: ... @overload def set_index( @@ -5942,8 +5931,7 @@ def set_index( append: bool = ..., inplace: Literal[True], verify_integrity: bool = ..., - ) -> None: - ... + ) -> None: ... 
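Note on the ``shift`` hunk above: combining ``freq`` with ``fill_value`` moves from a deprecation warning to a hard error. A doctest-style sketch of the new behaviour (index and values are illustrative; the message is the one raised in the diff):

>>> import pandas as pd
>>> df = pd.DataFrame(
...     {"x": [1, 2, 3]},
...     index=pd.date_range("2024-01-01", periods=3, freq="D"),
... )
>>> df.shift(1, freq="D", fill_value=0)  # now raises instead of warning
Traceback (most recent call last):
    ...
ValueError: Passing a 'freq' together with a 'fill_value' is not allowed.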
def set_index( self, @@ -5973,6 +5961,8 @@ def set_index( Delete columns to be used as the new index. append : bool, default False Whether to append columns to existing index. + Setting to True will add the new columns to existing index. + When set to False, the current index will be dropped from the DataFrame. inplace : bool, default False Whether to modify the DataFrame rather than creating a new one. verify_integrity : bool, default False @@ -5993,9 +5983,13 @@ def set_index( Examples -------- - >>> df = pd.DataFrame({'month': [1, 4, 7, 10], - ... 'year': [2012, 2014, 2013, 2014], - ... 'sale': [55, 40, 84, 31]}) + >>> df = pd.DataFrame( + ... { + ... "month": [1, 4, 7, 10], + ... "year": [2012, 2014, 2013, 2014], + ... "sale": [55, 40, 84, 31], + ... } + ... ) >>> df month year sale 0 1 2012 55 @@ -6005,7 +5999,7 @@ def set_index( Set the index to become the 'month' column: - >>> df.set_index('month') + >>> df.set_index("month") year sale month 1 2012 55 @@ -6015,7 +6009,7 @@ def set_index( Create a MultiIndex using columns 'year' and 'month': - >>> df.set_index(['year', 'month']) + >>> df.set_index(["year", "month"]) sale year month 2012 1 55 @@ -6025,7 +6019,7 @@ def set_index( Create a MultiIndex using an Index and a column: - >>> df.set_index([pd.Index([1, 2, 3, 4]), 'year']) + >>> df.set_index([pd.Index([1, 2, 3, 4]), "year"]) month sale year 1 2012 1 55 @@ -6042,6 +6036,25 @@ def set_index( 2 4 4 2014 40 3 9 7 2013 84 4 16 10 2014 31 + + Append a column to the existing index: + + >>> df = df.set_index("month") + >>> df.set_index("year", append=True) + sale + month year + 1 2012 55 + 4 2014 40 + 7 2013 84 + 10 2014 31 + + >>> df.set_index("year", append=False) + sale + year + 2012 55 + 2014 40 + 2013 84 + 2014 31 """ inplace = validate_bool_kwarg(inplace, "inplace") self._check_inplace_and_allows_duplicate_labels(inplace) @@ -6079,8 +6092,7 @@ def set_index( if inplace: frame = self else: - # GH 49473 Use "lazy copy" with Copy-on-Write - frame = self.copy(deep=None) + frame = self.copy(deep=False) arrays: list[Index] = [] names: list[Hashable] = [] @@ -6093,7 +6105,7 @@ def set_index( else: arrays.append(self.index) - to_remove: list[Hashable] = [] + to_remove: set[Hashable] = set() for col in keys: if isinstance(col, MultiIndex): arrays.extend(col._get_level_values(n) for n in range(col.nlevels)) @@ -6120,7 +6132,7 @@ def set_index( arrays.append(frame[col]) names.append(col) if drop: - to_remove.append(col) + to_remove.add(col) if len(arrays[-1]) != len(self): # check newest element against length of calling frame, since @@ -6137,7 +6149,7 @@ def set_index( raise ValueError(f"Index has duplicate keys: {duplicates}") # use set to handle duplicate column names gracefully in case of drop - for c in set(to_remove): + for c in to_remove: del frame[c] # clear up memory usage @@ -6160,8 +6172,7 @@ def reset_index( col_fill: Hashable = ..., allow_duplicates: bool | lib.NoDefault = ..., names: Hashable | Sequence[Hashable] | None = None, - ) -> DataFrame: - ... + ) -> DataFrame: ... @overload def reset_index( @@ -6174,8 +6185,7 @@ def reset_index( col_fill: Hashable = ..., allow_duplicates: bool | lib.NoDefault = ..., names: Hashable | Sequence[Hashable] | None = None, - ) -> None: - ... + ) -> None: ... @overload def reset_index( @@ -6188,8 +6198,7 @@ def reset_index( col_fill: Hashable = ..., allow_duplicates: bool | lib.NoDefault = ..., names: Hashable | Sequence[Hashable] | None = None, - ) -> DataFrame | None: - ... + ) -> DataFrame | None: ... 
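On the ``set_index`` internals above: with ``to_remove`` now a ``set``, a column passed twice as a key is still deleted exactly once under ``drop=True`` (previously this relied on the ``set(to_remove)`` call at deletion time). A small sketch of the guarded case (data is illustrative):

>>> import pandas as pd
>>> df = pd.DataFrame({"month": [1, 4], "sale": [55, 40]})
>>> df.set_index(["month", "month"]).index.names
FrozenList(['month', 'month'])
>>> "month" in df.set_index(["month", "month"]).columns
False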
def reset_index( self, @@ -6233,8 +6242,8 @@ def reset_index( names : int, str or 1-dimensional list, default None Using the given string, rename the DataFrame column which contains the - index data. If the DataFrame has a MultiIndex, this has to be a list or - tuple with length equal to the number of levels. + index data. If the DataFrame has a MultiIndex, this has to be a list + with length equal to the number of levels. .. versionadded:: 1.5.0 @@ -6251,12 +6260,11 @@ def reset_index( Examples -------- - >>> df = pd.DataFrame([('bird', 389.0), - ... ('bird', 24.0), - ... ('mammal', 80.5), - ... ('mammal', np.nan)], - ... index=['falcon', 'parrot', 'lion', 'monkey'], - ... columns=('class', 'max_speed')) + >>> df = pd.DataFrame( + ... [("bird", 389.0), ("bird", 24.0), ("mammal", 80.5), ("mammal", np.nan)], + ... index=["falcon", "parrot", "lion", "monkey"], + ... columns=("class", "max_speed"), + ... ) >>> df class max_speed falcon bird 389.0 @@ -6286,19 +6294,21 @@ class max_speed You can also use `reset_index` with `MultiIndex`. - >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'), - ... ('bird', 'parrot'), - ... ('mammal', 'lion'), - ... ('mammal', 'monkey')], - ... names=['class', 'name']) - >>> columns = pd.MultiIndex.from_tuples([('speed', 'max'), - ... ('species', 'type')]) - >>> df = pd.DataFrame([(389.0, 'fly'), - ... (24.0, 'fly'), - ... (80.5, 'run'), - ... (np.nan, 'jump')], - ... index=index, - ... columns=columns) + >>> index = pd.MultiIndex.from_tuples( + ... [ + ... ("bird", "falcon"), + ... ("bird", "parrot"), + ... ("mammal", "lion"), + ... ("mammal", "monkey"), + ... ], + ... names=["class", "name"], + ... ) + >>> columns = pd.MultiIndex.from_tuples([("speed", "max"), ("species", "type")]) + >>> df = pd.DataFrame( + ... [(389.0, "fly"), (24.0, "fly"), (80.5, "run"), (np.nan, "jump")], + ... index=index, + ... columns=columns, + ... ) >>> df speed species max type @@ -6310,7 +6320,7 @@ class name Using the `names` parameter, choose a name for the index column: - >>> df.reset_index(names=['classes', 'names']) + >>> df.reset_index(names=["classes", "names"]) classes names speed species max type 0 bird falcon 389.0 fly @@ -6320,7 +6330,7 @@ class name If the index has multiple levels, we can reset a subset of them: - >>> df.reset_index(level='class') + >>> df.reset_index(level="class") class speed species max type name @@ -6332,7 +6342,7 @@ class speed species If we are not dropping the index, by default, it is placed in the top level. 
We can place it in another level: - >>> df.reset_index(level='class', col_level=1) + >>> df.reset_index(level="class", col_level=1) speed species class max type name @@ -6344,7 +6354,7 @@ class max type When the index is inserted under another level, we can specify under which one with the parameter `col_fill`: - >>> df.reset_index(level='class', col_level=1, col_fill='species') + >>> df.reset_index(level="class", col_level=1, col_fill="species") species speed species class max type name @@ -6355,7 +6365,7 @@ class max type If we specify a nonexistent level for `col_fill`, it is created: - >>> df.reset_index(level='class', col_level=1, col_fill='genus') + >>> df.reset_index(level="class", col_level=1, col_fill="genus") genus speed species class max type name @@ -6369,7 +6379,7 @@ class max type if inplace: new_obj = self else: - new_obj = self.copy(deep=None) + new_obj = self.copy(deep=False) if allow_duplicates is not lib.no_default: allow_duplicates = validate_bool_kwarg(allow_duplicates, "allow_duplicates") @@ -6388,12 +6398,13 @@ class max type names = self.index._get_default_index_names(names, default) if isinstance(self.index, MultiIndex): - to_insert = zip(self.index.levels, self.index.codes) + to_insert = zip(reversed(self.index.levels), reversed(self.index.codes)) else: to_insert = ((self.index, None),) multi_col = isinstance(self.columns, MultiIndex) - for i, (lev, lab) in reversed(list(enumerate(to_insert))): + for j, (lev, lab) in enumerate(to_insert, start=1): + i = self.index.nlevels - j if level is not None and i not in level: continue name = names[i] @@ -6474,8 +6485,7 @@ def dropna( subset: IndexLabel = ..., inplace: Literal[False] = ..., ignore_index: bool = ..., - ) -> DataFrame: - ... + ) -> DataFrame: ... @overload def dropna( @@ -6487,8 +6497,7 @@ def dropna( subset: IndexLabel = ..., inplace: Literal[True], ignore_index: bool = ..., - ) -> None: - ... + ) -> None: ... def dropna( self, @@ -6496,7 +6505,7 @@ def dropna( axis: Axis = 0, how: AnyAll | lib.NoDefault = lib.no_default, thresh: int | lib.NoDefault = lib.no_default, - subset: IndexLabel | None = None, + subset: IndexLabel | AnyArrayLike | None = None, inplace: bool = False, ignore_index: bool = False, ) -> DataFrame | None: @@ -6526,7 +6535,7 @@ def dropna( thresh : int, optional Require that many non-NA values. Cannot be combined with how. - subset : column label or sequence of labels, optional + subset : column label or iterable of labels, optional Labels along other axis to consider, e.g. if you are dropping rows these would be a list of columns to include. inplace : bool, default False @@ -6551,10 +6560,13 @@ def dropna( Examples -------- - >>> df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'], - ... "toy": [np.nan, 'Batmobile', 'Bullwhip'], - ... "born": [pd.NaT, pd.Timestamp("1940-04-25"), - ... pd.NaT]}) + >>> df = pd.DataFrame( + ... { + ... "name": ["Alfred", "Batman", "Catwoman"], + ... "toy": [np.nan, "Batmobile", "Bullwhip"], + ... "born": [pd.NaT, pd.Timestamp("1940-04-25"), pd.NaT], + ... } + ... ) >>> df name toy born 0 Alfred NaN NaT @@ -6569,7 +6581,7 @@ def dropna( Drop the columns where at least one element is missing. - >>> df.dropna(axis='columns') + >>> df.dropna(axis="columns") name 0 Alfred 1 Batman @@ -6577,7 +6589,7 @@ def dropna( Drop the rows where all elements are missing. - >>> df.dropna(how='all') + >>> df.dropna(how="all") name toy born 0 Alfred NaN NaT 1 Batman Batmobile 1940-04-25 @@ -6592,7 +6604,7 @@ def dropna( Define in which columns to look for missing values. 
- >>> df.dropna(subset=['name', 'toy']) + >>> df.dropna(subset=["name", "toy"]) name toy born 1 Batman Batmobile 1940-04-25 2 Catwoman Bullwhip NaT @@ -6617,7 +6629,7 @@ def dropna( if subset is not None: # subset needs to be list if not is_list_like(subset): - subset = [subset] + subset = [cast(Hashable, subset)] ax = self._get_axis(agg_axis) indices = ax.get_indexer_for(subset) check = indices == -1 @@ -6638,7 +6650,7 @@ def dropna( raise ValueError(f"invalid how option: {how}") if np.all(mask): - result = self.copy(deep=None) + result = self.copy(deep=False) else: result = self.loc(axis=axis)[mask] @@ -6653,39 +6665,36 @@ def dropna( @overload def drop_duplicates( self, - subset: Hashable | Sequence[Hashable] | None = ..., + subset: Hashable | Iterable[Hashable] | None = ..., *, keep: DropKeep = ..., inplace: Literal[True], ignore_index: bool = ..., - ) -> None: - ... + ) -> None: ... @overload def drop_duplicates( self, - subset: Hashable | Sequence[Hashable] | None = ..., + subset: Hashable | Iterable[Hashable] | None = ..., *, keep: DropKeep = ..., inplace: Literal[False] = ..., ignore_index: bool = ..., - ) -> DataFrame: - ... + ) -> DataFrame: ... @overload def drop_duplicates( self, - subset: Hashable | Sequence[Hashable] | None = ..., + subset: Hashable | Iterable[Hashable] | None = ..., *, keep: DropKeep = ..., inplace: bool = ..., ignore_index: bool = ..., - ) -> DataFrame | None: - ... + ) -> DataFrame | None: ... def drop_duplicates( self, - subset: Hashable | Sequence[Hashable] | None = None, + subset: Hashable | Iterable[Hashable] | None = None, *, keep: DropKeep = "first", inplace: bool = False, @@ -6699,7 +6708,7 @@ def drop_duplicates( Parameters ---------- - subset : column label or sequence of labels, optional + subset : column label or iterable of labels, optional Only consider certain columns for identifying duplicates, by default use all of the columns. keep : {'first', 'last', ``False``}, default 'first' @@ -6723,15 +6732,22 @@ def drop_duplicates( -------- DataFrame.value_counts: Count unique combinations of columns. + Notes + ----- + This method requires columns specified by ``subset`` to be of hashable type. + Passing unhashable columns will raise a ``TypeError``. + Examples -------- Consider dataset containing ramen rating. - >>> df = pd.DataFrame({ - ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'], - ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'], - ... 'rating': [4, 4, 3.5, 15, 5] - ... }) + >>> df = pd.DataFrame( + ... { + ... "brand": ["Yum Yum", "Yum Yum", "Indomie", "Indomie", "Indomie"], + ... "style": ["cup", "cup", "cup", "pack", "pack"], + ... "rating": [4, 4, 3.5, 15, 5], + ... } + ... ) >>> df brand style rating 0 Yum Yum cup 4.0 @@ -6751,21 +6767,21 @@ def drop_duplicates( To remove duplicates on specific column(s), use ``subset``. - >>> df.drop_duplicates(subset=['brand']) + >>> df.drop_duplicates(subset=["brand"]) brand style rating 0 Yum Yum cup 4.0 2 Indomie cup 3.5 To remove duplicates and keep last occurrences, use ``keep``. 
- >>> df.drop_duplicates(subset=['brand', 'style'], keep='last') + >>> df.drop_duplicates(subset=["brand", "style"], keep="last") brand style rating 1 Yum Yum cup 4.0 2 Indomie cup 3.5 4 Indomie pack 5.0 """ if self.empty: - return self.copy(deep=None) + return self.copy(deep=False) inplace = validate_bool_kwarg(inplace, "inplace") ignore_index = validate_bool_kwarg(ignore_index, "ignore_index") @@ -6782,7 +6798,7 @@ def drop_duplicates( def duplicated( self, - subset: Hashable | Sequence[Hashable] | None = None, + subset: Hashable | Iterable[Hashable] | None = None, keep: DropKeep = "first", ) -> Series: """ @@ -6792,7 +6808,7 @@ def duplicated( Parameters ---------- - subset : column label or sequence of labels, optional + subset : column label or iterable of labels, optional Only consider certain columns for identifying duplicates, by default use all of the columns. keep : {'first', 'last', False}, default 'first' @@ -6818,11 +6834,13 @@ def duplicated( -------- Consider dataset containing ramen rating. - >>> df = pd.DataFrame({ - ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'], - ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'], - ... 'rating': [4, 4, 3.5, 15, 5] - ... }) + >>> df = pd.DataFrame( + ... { + ... "brand": ["Yum Yum", "Yum Yum", "Indomie", "Indomie", "Indomie"], + ... "style": ["cup", "cup", "cup", "pack", "pack"], + ... "rating": [4, 4, 3.5, 15, 5], + ... } + ... ) >>> df brand style rating 0 Yum Yum cup 4.0 @@ -6845,7 +6863,7 @@ def duplicated( By using 'last', the last occurrence of each set of duplicated values is set on False and all others on True. - >>> df.duplicated(keep='last') + >>> df.duplicated(keep="last") 0 True 1 False 2 False @@ -6865,7 +6883,7 @@ def duplicated( To find duplicates on specific column(s), use ``subset``. - >>> df.duplicated(subset=['brand']) + >>> df.duplicated(subset=["brand"]) 0 False 1 True 2 False @@ -6879,18 +6897,14 @@ def duplicated( def f(vals) -> tuple[np.ndarray, int]: labels, shape = algorithms.factorize(vals, size_hint=len(self)) - return labels.astype("i8", copy=False), len(shape) + return labels.astype("i8"), len(shape) if subset is None: - # https://github.com/pandas-dev/pandas/issues/28770 - # Incompatible types in assignment (expression has type "Index", variable - # has type "Sequence[Any]") - subset = self.columns # type: ignore[assignment] + subset = self.columns elif ( not np.iterable(subset) or isinstance(subset, str) - or isinstance(subset, tuple) - and subset in self.columns + or (isinstance(subset, tuple) and subset in self.columns) ): subset = (subset,) @@ -6906,7 +6920,7 @@ def f(vals) -> tuple[np.ndarray, int]: if len(subset) == 1 and self.columns.is_unique: # GH#45236 This is faster than get_group_index below - result = self[subset[0]].duplicated(keep) + result = self[next(iter(subset))].duplicated(keep) result.name = None else: vals = (col.values for name, col in self.items() if name in subset) @@ -6931,8 +6945,7 @@ def sort_values( na_position: NaPosition = ..., ignore_index: bool = ..., key: ValueKeyFunc = ..., - ) -> DataFrame: - ... + ) -> DataFrame: ... @overload def sort_values( @@ -6946,8 +6959,7 @@ def sort_values( na_position: str = ..., ignore_index: bool = ..., key: ValueKeyFunc = ..., - ) -> None: - ... + ) -> None: ... def sort_values( self, @@ -6997,7 +7009,8 @@ def sort_values( builtin :meth:`sorted` function, with the notable difference that this `key` function should be *vectorized*. It should expect a ``Series`` and return a Series with the same shape as the input. 
- It will be applied to each column in `by` independently. + It will be applied to each column in `by` independently. The values in the + returned Series will be used as the keys for sorting. Returns ------- DataFrame or None DataFrame with sorted values or None if ``inplace=True``. @@ -7011,12 +7024,14 @@ def sort_values( Examples -------- - >>> df = pd.DataFrame({ - ... 'col1': ['A', 'A', 'B', np.nan, 'D', 'C'], - ... 'col2': [2, 1, 9, 8, 7, 4], - ... 'col3': [0, 1, 9, 4, 2, 3], - ... 'col4': ['a', 'B', 'c', 'D', 'e', 'F'] - ... }) + >>> df = pd.DataFrame( + ... { + ... "col1": ["A", "A", "B", np.nan, "D", "C"], + ... "col2": [2, 1, 9, 8, 7, 4], + ... "col3": [0, 1, 9, 4, 2, 3], + ... "col4": ["a", "B", "c", "D", "e", "F"], + ... } + ... ) >>> df col1 col2 col3 col4 0 A 2 0 a 1 A 1 1 B 2 B 9 9 c 3 NaN 8 4 D 4 D 7 2 e 5 C 4 3 F - Sort by col1 + **Sort by a single column** + + In this case, we are sorting the rows according to the values in ``col1``: - >>> df.sort_values(by=['col1']) + >>> df.sort_values(by=["col1"]) col1 col2 col3 col4 0 A 2 0 a 1 A 1 1 B 2 B 9 9 c 5 C 4 3 F 4 D 7 2 e 3 NaN 8 4 D - Sort by multiple columns + **Sort by multiple columns** - >>> df.sort_values(by=['col1', 'col2']) + You can also provide multiple columns to the ``by`` argument, as shown below. + In this example, the rows are first sorted according to ``col1``, and then + the rows that have an identical value in ``col1`` are sorted according + to ``col2``. + + >>> df.sort_values(by=["col1", "col2"]) col1 col2 col3 col4 1 A 1 1 B 0 A 2 0 a 2 B 9 9 c 5 C 4 3 F 4 D 7 2 e 3 NaN 8 4 D - Sort Descending + **Sort in descending order** + + The sort order can be reversed using the ``ascending`` argument, as shown below: - >>> df.sort_values(by='col1', ascending=False) + >>> df.sort_values(by="col1", ascending=False) col1 col2 col3 col4 4 D 7 2 e 5 C 4 3 F 0 A 2 0 a 1 A 1 1 B 3 NaN 8 4 D - Putting NAs first + **Placing any** ``NA`` **first** - >>> df.sort_values(by='col1', ascending=False, na_position='first') + Note that in the above example, the rows that contain an ``NA`` value in their + ``col1`` are placed at the end of the DataFrame. This behavior can be modified + via the ``na_position`` argument, as shown below: + + >>> df.sort_values(by="col1", ascending=False, na_position="first") col1 col2 col3 col4 3 NaN 8 4 D 4 D 7 2 e 5 C 4 3 F 0 A 2 0 a 1 A 1 1 B - Sorting with a key function + **Customized sort order** + + The ``key`` argument allows for further customization of the sorting behaviour. + For example, you may want + to ignore the `letter's case <https://en.wikipedia.org/wiki/Letter_case>`__ + when sorting strings: - >>> df.sort_values(by='col4', key=lambda col: col.str.lower()) + >>> df.sort_values(by="col4", key=lambda col: col.str.lower()) col1 col2 col3 col4 0 A 2 0 a 1 A 1 1 B 2 B 9 9 c 3 NaN 8 4 D 4 D 7 2 e 5 C 4 3 F - Natural sort with the key argument, - using the `natsort <https://github.com/SethMMorton/natsort>` package. + Another typical example is + `natural sorting <https://en.wikipedia.org/wiki/Natural_sort_order>`__. + This can be done using + the `natsort <https://github.com/SethMMorton/natsort>`__ package, + which provides sorted indices according + to their natural order, as shown below: - >>> df = pd.DataFrame({ - ... "time": ['0hr', '128hr', '72hr', '48hr', '96hr'], - ... "value": [10, 20, 30, 40, 50] - ... }) + >>> df = pd.DataFrame( + ... { + ... "time": ["0hr", "128hr", "72hr", "48hr", "96hr"], + ... "value": [10, 20, 30, 40, 50], + ... } + ... 
) >>> df time value 0 0hr 10 1 128hr 20 2 72hr 30 3 48hr 40 4 96hr 50 >>> from natsort import index_natsorted + >>> index_natsorted(df["time"]) + [0, 3, 2, 4, 1] >>> df.sort_values( ... by="time", - ... key=lambda x: np.argsort(index_natsorted(df["time"])) + ... key=lambda x: np.argsort(index_natsorted(x)), ... ) time value 0 0hr 10 3 48hr 40 2 72hr 30 4 96hr 50 1 128hr 20 @@ -7124,19 +7165,19 @@ def sort_values( f" != length of by ({len(by)})" ) if len(by) > 1: - keys = [self._get_label_or_level_values(x, axis=axis) for x in by] + keys = (self._get_label_or_level_values(x, axis=axis) for x in by) # need to rewrap columns in Series to apply key function if key is not None: - # error: List comprehension has incompatible type List[Series]; - # expected List[ndarray] - keys = [ - Series(k, name=name) # type: ignore[misc] - for (k, name) in zip(keys, by) - ] + keys_data = [Series(k, name=name) for (k, name) in zip(keys, by)] + else: + # error: Argument 1 to "list" has incompatible type + # "Generator[ExtensionArray | ndarray[Any, Any], None, None]"; + # expected "Iterable[Series]" + keys_data = list(keys) # type: ignore[arg-type] indexer = lexsort_indexer( - keys, orders=ascending, na_position=na_position, key=key + keys_data, orders=ascending, na_position=na_position, key=key ) elif len(by): # len(by) == 1 @@ -7159,10 +7200,10 @@ def sort_values( if inplace: return self._update_inplace(self) else: - return self.copy(deep=None) + return self.copy(deep=False) if is_range_indexer(indexer, len(indexer)): - result = self.copy(deep=(not inplace and not using_copy_on_write())) + result = self.copy(deep=False) if ignore_index: result.index = default_index(len(result)) @@ -7199,8 +7240,7 @@ def sort_index( sort_remaining: bool = ..., ignore_index: bool = ..., key: IndexKeyFunc = ..., - ) -> None: - ... + ) -> None: ... @overload def sort_index( @@ -7215,8 +7255,7 @@ def sort_index( sort_remaining: bool = ..., ignore_index: bool = ..., key: IndexKeyFunc = ..., - ) -> DataFrame: - ... + ) -> DataFrame: ... @overload def sort_index( @@ -7231,8 +7270,7 @@ def sort_index( sort_remaining: bool = ..., ignore_index: bool = ..., key: IndexKeyFunc = ..., - ) -> DataFrame | None: - ... + ) -> DataFrame | None: ... def sort_index( self, @@ -7299,8 +7337,9 @@ def sort_index( Examples -------- - >>> df = pd.DataFrame([1, 2, 3, 4, 5], index=[100, 29, 234, 1, 150], - ... columns=['A']) + >>> df = pd.DataFrame( + ... [1, 2, 3, 4, 5], index=[100, 29, 234, 1, 150], columns=["A"] + ... ) >>> df.sort_index() A 1 4 29 2 100 1 150 5 234 3 @@ -7323,7 +7362,7 @@ def sort_index( A key function can be specified which is applied to the index before sorting. For a ``MultiIndex`` this is applied to each level separately. - >>> df = pd.DataFrame({"a": [1, 2, 3, 4]}, index=['A', 'b', 'C', 'd']) + >>> df = pd.DataFrame({"a": [1, 2, 3, 4]}, index=["A", "b", "C", "d"]) >>> df.sort_index(key=lambda x: x.str.lower()) a A 1 b 2 C 3 d 4 @@ -7352,7 +7391,7 @@ def value_counts( dropna: bool = True, ) -> Series: """ - Return a Series containing the frequency of each distinct row in the Dataframe. + Return a Series containing the frequency of each distinct row in the DataFrame. Parameters ---------- @@ -7361,17 +7400,22 @@ def value_counts( normalize : bool, default False Return proportions rather than frequencies. sort : bool, default True - Sort by frequencies when True. Sort by DataFrame column values when False. + Sort by frequencies when True. Preserve the order of the data when False. + + .. versionchanged:: 3.0.0 + + Prior to 3.0.0, ``sort=False`` would sort by the column values. 
ascending : bool, default False Sort in ascending order. dropna : bool, default True - Don't include counts of rows that contain NA values. + Do not include counts of rows that contain NA values. .. versionadded:: 1.3.0 Returns ------- Series + Series containing the frequency of each distinct row in the DataFrame. See Also -------- @@ -7382,14 +7426,15 @@ def value_counts( The returned Series will have a MultiIndex with one level per input column but an Index (non-multi) for a single label. By default, rows that contain any NA values are omitted from the result. By default, - the resulting Series will be in descending order so that the first - element is the most frequently-occurring row. + the resulting Series will be sorted by frequencies in descending order so that + the first element is the most frequently-occurring row. Examples -------- - >>> df = pd.DataFrame({'num_legs': [2, 4, 4, 6], - ... 'num_wings': [2, 0, 0, 0]}, - ... index=['falcon', 'dog', 'cat', 'ant']) + >>> df = pd.DataFrame( + ... {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, + ... index=["falcon", "dog", "cat", "ant"], + ... ) >>> df num_legs num_wings falcon 2 2 @@ -7427,8 +7472,12 @@ def value_counts( With `dropna` set to `False` we can also count rows with NA values. - >>> df = pd.DataFrame({'first_name': ['John', 'Anne', 'John', 'Beth'], - ... 'middle_name': ['Smith', pd.NA, pd.NA, 'Louise']}) + >>> df = pd.DataFrame( + ... { + ... "first_name": ["John", "Anne", "John", "Beth"], + ... "middle_name": ["Smith", pd.NA, pd.NA, "Louise"], + ... } + ... ) >>> df first_name middle_name 0 John Smith @@ -7461,7 +7510,9 @@ def value_counts( subset = self.columns.tolist() name = "proportion" if normalize else "count" - counts = self.groupby(subset, dropna=dropna, observed=False)._grouper.size() + counts = self.groupby( + subset, sort=False, dropna=dropna, observed=False + )._grouper.size() counts.name = name if sort: @@ -7526,16 +7577,34 @@ def nlargest( Examples -------- - >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000, - ... 434000, 434000, 337000, 11300, - ... 11300, 11300], - ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128, - ... 17036, 182, 38, 311], - ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN", - ... "IS", "NR", "TV", "AI"]}, - ... index=["Italy", "France", "Malta", - ... "Maldives", "Brunei", "Iceland", - ... "Nauru", "Tuvalu", "Anguilla"]) + >>> df = pd.DataFrame( + ... { + ... "population": [ + ... 59000000, + ... 65000000, + ... 434000, + ... 434000, + ... 434000, + ... 337000, + ... 11300, + ... 11300, + ... 11300, + ... ], + ... "GDP": [1937894, 2583560, 12011, 4520, 12128, 17036, 182, 38, 311], + ... "alpha-2": ["IT", "FR", "MT", "MV", "BN", "IS", "NR", "TV", "AI"], + ... }, + ... index=[ + ... "Italy", + ... "France", + ... "Malta", + ... "Maldives", + ... "Brunei", + ... "Iceland", + ... "Nauru", + ... "Tuvalu", + ... "Anguilla", + ... ], + ... ) >>> df population GDP alpha-2 Italy 59000000 1937894 IT @@ -7551,7 +7620,7 @@ def nlargest( In the following example, we will use ``nlargest`` to select the three rows having the largest values in column "population". 
- >>> df.nlargest(3, 'population') + >>> df.nlargest(3, "population") population GDP alpha-2 France 65000000 2583560 FR Italy 59000000 1937894 IT @@ -7559,7 +7628,7 @@ def nlargest( When using ``keep='last'``, ties are resolved in reverse order: - >>> df.nlargest(3, 'population', keep='last') + >>> df.nlargest(3, "population", keep="last") population GDP alpha-2 France 65000000 2583560 FR Italy 59000000 1937894 IT @@ -7569,7 +7638,7 @@ def nlargest( if there are duplicate values for the smallest element, all the ties are kept: - >>> df.nlargest(3, 'population', keep='all') + >>> df.nlargest(3, "population", keep="all") population GDP alpha-2 France 65000000 2583560 FR Italy 59000000 1937894 IT @@ -7579,7 +7648,7 @@ def nlargest( However, ``nlargest`` does not keep ``n`` distinct largest elements: - >>> df.nlargest(5, 'population', keep='all') + >>> df.nlargest(5, "population", keep="all") population GDP alpha-2 France 65000000 2583560 FR Italy 59000000 1937894 IT @@ -7590,7 +7659,7 @@ def nlargest( To order by the largest values in column "population" and then "GDP", we can specify multiple columns like in the next example. - >>> df.nlargest(3, ['population', 'GDP']) + >>> df.nlargest(3, ["population", "GDP"]) population GDP alpha-2 France 65000000 2583560 FR Italy 59000000 1937894 IT @@ -7629,6 +7698,7 @@ def nsmallest( Returns ------- DataFrame + DataFrame with the first `n` rows ordered by `columns` in ascending order. See Also -------- @@ -7639,16 +7709,34 @@ def nsmallest( Examples -------- - >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000, - ... 434000, 434000, 337000, 337000, - ... 11300, 11300], - ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128, - ... 17036, 182, 38, 311], - ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN", - ... "IS", "NR", "TV", "AI"]}, - ... index=["Italy", "France", "Malta", - ... "Maldives", "Brunei", "Iceland", - ... "Nauru", "Tuvalu", "Anguilla"]) + >>> df = pd.DataFrame( + ... { + ... "population": [ + ... 59000000, + ... 65000000, + ... 434000, + ... 434000, + ... 434000, + ... 337000, + ... 337000, + ... 11300, + ... 11300, + ... ], + ... "GDP": [1937894, 2583560, 12011, 4520, 12128, 17036, 182, 38, 311], + ... "alpha-2": ["IT", "FR", "MT", "MV", "BN", "IS", "NR", "TV", "AI"], + ... }, + ... index=[ + ... "Italy", + ... "France", + ... "Malta", + ... "Maldives", + ... "Brunei", + ... "Iceland", + ... "Nauru", + ... "Tuvalu", + ... "Anguilla", + ... ], + ... ) >>> df population GDP alpha-2 Italy 59000000 1937894 IT @@ -7664,7 +7752,7 @@ def nsmallest( In the following example, we will use ``nsmallest`` to select the three rows having the smallest values in column "population". - >>> df.nsmallest(3, 'population') + >>> df.nsmallest(3, "population") population GDP alpha-2 Tuvalu 11300 38 TV Anguilla 11300 311 AI @@ -7672,7 +7760,7 @@ def nsmallest( When using ``keep='last'``, ties are resolved in reverse order: - >>> df.nsmallest(3, 'population', keep='last') + >>> df.nsmallest(3, "population", keep="last") population GDP alpha-2 Anguilla 11300 311 AI Tuvalu 11300 38 TV @@ -7682,7 +7770,7 @@ def nsmallest( if there are duplicate values for the largest element, all the ties are kept. 
- >>> df.nsmallest(3, 'population', keep='all') + >>> df.nsmallest(3, "population", keep="all") population GDP alpha-2 Tuvalu 11300 38 TV Anguilla 11300 311 AI @@ -7692,7 +7780,7 @@ def nsmallest( However, ``nsmallest`` does not keep ``n`` distinct smallest elements: - >>> df.nsmallest(4, 'population', keep='all') + >>> df.nsmallest(4, "population", keep="all") population GDP alpha-2 Tuvalu 11300 38 TV Anguilla 11300 311 AI @@ -7702,7 +7790,7 @@ def nsmallest( To order by the smallest values in column "population" and then "GDP", we can specify multiple columns like in the next example. - >>> df.nsmallest(3, ['population', 'GDP']) + >>> df.nsmallest(3, ["population", "GDP"]) population GDP alpha-2 Tuvalu 11300 38 TV Anguilla 11300 311 AI @@ -7710,16 +7798,30 @@ def nsmallest( """ return selectn.SelectNFrame(self, n=n, keep=keep, columns=columns).nsmallest() - @doc( - Series.swaplevel, - klass=_shared_doc_kwargs["klass"], - extra_params=dedent( - """axis : {0 or 'index', 1 or 'columns'}, default 0 - The axis to swap levels on. 0 or 'index' for row-wise, 1 or - 'columns' for column-wise.""" - ), - examples=dedent( - """\ + def swaplevel(self, i: Axis = -2, j: Axis = -1, axis: Axis = 0) -> DataFrame: + """ + Swap levels i and j in a :class:`MultiIndex`. + + Default is to swap the two innermost levels of the index. + + Parameters + ---------- + i, j : int or str + Levels of the indices to be swapped. Can pass level name as string. + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to swap levels on. 0 or 'index' for row-wise, 1 or + 'columns' for column-wise. + + Returns + ------- + DataFrame + DataFrame with levels swapped in MultiIndex. + + See Also + -------- + DataFrame.reorder_levels: Reorder levels of MultiIndex. + DataFrame.sort_index: Sort MultiIndex. + Examples -------- >>> df = pd.DataFrame( @@ -7769,11 +7871,9 @@ def nsmallest( History Final exam January A Geography Final exam February B History Coursework March A - Geography Coursework April C""" - ), - ) - def swaplevel(self, i: Axis = -2, j: Axis = -1, axis: Axis = 0) -> DataFrame: - result = self.copy(deep=None) + Geography Coursework April C + """ + result = self.copy(deep=False) axis = self._get_axis_number(axis) @@ -7790,7 +7890,9 @@ def swaplevel(self, i: Axis = -2, j: Axis = -1, axis: Axis = 0) -> DataFrame: def reorder_levels(self, order: Sequence[int | str], axis: Axis = 0) -> DataFrame: """ - Rearrange index levels using input order. May not drop or duplicate levels. + Rearrange index or column levels using input ``order``. + + May not drop or duplicate levels. Parameters ---------- @@ -7803,6 +7905,11 @@ def reorder_levels(self, order: Sequence[int | str], axis: Axis = 0) -> DataFram Returns ------- DataFrame + DataFrame with indices or columns with reordered levels. + + See Also + -------- + DataFrame.swaplevel : Swap levels i and j in a MultiIndex. 
Examples -------- @@ -7833,7 +7940,7 @@ class diet if not isinstance(self._get_axis(axis), MultiIndex): # pragma: no cover raise TypeError("Can only reorder levels on a hierarchical axis.") - result = self.copy(deep=None) + result = self.copy(deep=False) if axis == 0: assert isinstance(result.index, MultiIndex) @@ -7853,7 +7960,7 @@ def _cmp_method(self, other, op): # See GH#4537 for discussion of scalar op behavior new_data = self._dispatch_frame_op(other, op, axis=axis) - return self._construct_result(new_data) + return self._construct_result(new_data, other=other) def _arith_method(self, other, op): if self._should_reindex_frame_op(other, op, 1, None, None): @@ -7866,7 +7973,7 @@ def _arith_method(self, other, op): with np.errstate(all="ignore"): new_data = self._dispatch_frame_op(other, op, axis=axis) - return self._construct_result(new_data) + return self._construct_result(new_data, other=other) _logical_method = _arith_method @@ -7909,13 +8016,7 @@ def _dispatch_frame_op( # TODO operate_blockwise expects a manager of the same type bm = self._mgr.operate_blockwise( - # error: Argument 1 to "operate_blockwise" of "ArrayManager" has - # incompatible type "Union[ArrayManager, BlockManager]"; expected - # "ArrayManager" - # error: Argument 1 to "operate_blockwise" of "BlockManager" has - # incompatible type "Union[ArrayManager, BlockManager]"; expected - # "BlockManager" - right._mgr, # type: ignore[arg-type] + right._mgr, array_op, ) return self._constructor_from_mgr(bm, axes=bm.axes) @@ -7983,19 +8084,27 @@ def _arith_method_with_reindex(self, right: DataFrame, op) -> DataFrame: left = self # GH#31623, only operate on shared columns - cols, lcols, rcols = left.columns.join( - right.columns, how="inner", level=None, return_indexers=True + cols, lcol_indexer, rcol_indexer = left.columns.join( + right.columns, how="inner", return_indexers=True ) - new_left = left.iloc[:, lcols] - new_right = right.iloc[:, rcols] + new_left = left if lcol_indexer is None else left.iloc[:, lcol_indexer] + new_right = right if rcol_indexer is None else right.iloc[:, rcol_indexer] + + # GH#60498 For MultiIndex column alignment + if isinstance(cols, MultiIndex): + # When overwriting column names, make a shallow copy so as to not modify + # the input DFs + new_left = new_left.copy(deep=False) + new_right = new_right.copy(deep=False) + new_left.columns = cols + new_right.columns = cols + result = op(new_left, new_right) # Do the join on the columns instead of using left._align_for_op # to avoid constructing two potentially large/sparse DataFrames - join_columns, _, _ = left.columns.join( - right.columns, how="outer", level=None, return_indexers=True - ) + join_columns = left.columns.join(right.columns, how="outer") if result.columns.has_duplicates: # Avoid reindexing with a duplicate axis. @@ -8021,6 +8130,18 @@ def _should_reindex_frame_op(self, right, op, axis: int, fill_value, level) -> b if not isinstance(right, DataFrame): return False + if ( + ( + isinstance(self.columns, MultiIndex) + or isinstance(right.columns, MultiIndex) + ) + and not self.columns.equals(right.columns) + and fill_value is None + ): + # GH#60498 Reindex if MultiIndex columns are not matching + # GH#60903 Don't reindex if fill_value is provided + return True + if fill_value is None and level is None and axis == 1: # TODO: any other cases we should handle here? 
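On the GH#60498/GH#60903 branch added above: two frames whose MultiIndex columns differ (and no ``fill_value``) now route through ``_arith_method_with_reindex``, operating on the shared columns only and reindexing the result to the outer join. A doctest-style sketch (values are illustrative):

>>> import pandas as pd
>>> c1 = pd.MultiIndex.from_tuples([("a", 1), ("b", 1)])
>>> c2 = pd.MultiIndex.from_tuples([("a", 1), ("c", 1)])
>>> df1 = pd.DataFrame([[1, 2]], columns=c1)
>>> df2 = pd.DataFrame([[10, 20]], columns=c2)
>>> (df1 + df2).columns.tolist()
[('a', 1), ('b', 1), ('c', 1)]
>>> (df1 + df2)[("a", 1)].tolist()
[11]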
@@ -8048,8 +8169,7 @@ def _align_for_op( Parameters ---------- - left : DataFrame - right : Any + other : Any axis : int flex : bool or None, default False Whether this is a flex op, in which case we reindex. @@ -8144,9 +8264,7 @@ def to_series(right): if flex is not None and isinstance(right, DataFrame): if not left._indexed_same(right): if flex: - left, right = left.align( - right, join="outer", level=level, copy=False - ) + left, right = left.align(right, join="outer", level=level) else: raise ValueError( "Can only compare identically-labeled (both index and columns) " @@ -8159,7 +8277,7 @@ def to_series(right): if not left.axes[axis].equals(right.index): raise ValueError( "Operands are not aligned. Do " - "`left, right = left.align(right, axis=1, copy=False)` " + "`left, right = left.align(right, axis=1)` " "before operating." ) @@ -8168,10 +8286,8 @@ def to_series(right): join="outer", axis=axis, level=level, - copy=False, ) right = left._maybe_align_series_as_frame(right, axis) - return left, right def _maybe_align_series_as_frame(self, series: Series, axis: AxisInt): @@ -8200,7 +8316,7 @@ def _maybe_align_series_as_frame(self, series: Series, axis: AxisInt): index=self.index, columns=self.columns, dtype=rvalues.dtype, - ) + ).__finalize__(series) def _flex_arith_method( self, other, op, *, axis: Axis = "columns", level=None, fill_value=None @@ -8232,9 +8348,9 @@ def _flex_arith_method( new_data = self._dispatch_frame_op(other, op) - return self._construct_result(new_data) + return self._construct_result(new_data, other=other) - def _construct_result(self, result) -> DataFrame: + def _construct_result(self, result, other) -> DataFrame: """ Wrap the result of an arithmetic, comparison, or logical operation. @@ -8251,6 +8367,7 @@ def _construct_result(self, result) -> DataFrame: # non-unique columns case out.columns = self.columns out.index = self.index + out = out.__finalize__(other) return out def __divmod__(self, other) -> tuple[DataFrame, DataFrame]: @@ -8271,7 +8388,7 @@ def _flex_cmp_method(self, other, op, *, axis: Axis = "columns", level=None): self, other = self._align_for_op(other, axis, flex=True, level=level) new_data = self._dispatch_frame_op(other, op, axis=axis) - return self._construct_result(new_data) + return self._construct_result(new_data, other=other) @Appender(ops.make_flex_doc("eq", "dataframe")) def eq(self, other, axis: Axis = "columns", level=None) -> DataFrame: @@ -8595,8 +8712,8 @@ def combine( -------- Combine using a simple function that chooses the smaller column. - >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]}) - >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) + >>> df1 = pd.DataFrame({"A": [0, 0], "B": [4, 4]}) + >>> df2 = pd.DataFrame({"A": [1, 1], "B": [3, 3]}) >>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2 >>> df1.combine(df2, take_smaller) A B @@ -8605,8 +8722,8 @@ def combine( Example using a true element-wise combine function. - >>> df1 = pd.DataFrame({'A': [5, 0], 'B': [2, 4]}) - >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) + >>> df1 = pd.DataFrame({"A": [5, 0], "B": [2, 4]}) + >>> df2 = pd.DataFrame({"A": [1, 1], "B": [3, 3]}) >>> df1.combine(df2, np.minimum) A B 0 1 2 @@ -8615,8 +8732,8 @@ def combine( Using `fill_value` fills Nones prior to passing the column to the merge function. 
- >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]}) - >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) + >>> df1 = pd.DataFrame({"A": [0, 0], "B": [None, 4]}) + >>> df2 = pd.DataFrame({"A": [1, 1], "B": [3, 3]}) >>> df1.combine(df2, take_smaller, fill_value=-5) A B 0 0 -5.0 @@ -8625,8 +8742,8 @@ def combine( However, if the same element in both dataframes is None, that None is preserved - >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]}) - >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [None, 3]}) + >>> df1 = pd.DataFrame({"A": [0, 0], "B": [None, 4]}) + >>> df2 = pd.DataFrame({"A": [1, 1], "B": [None, 3]}) >>> df1.combine(df2, take_smaller, fill_value=-5) A B 0 0 -5.0 @@ -8635,8 +8752,14 @@ def combine( Example that demonstrates the use of `overwrite` and behavior when the axis differ between the dataframes. - >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]}) - >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [-10, 1], }, index=[1, 2]) + >>> df1 = pd.DataFrame({"A": [0, 0], "B": [4, 4]}) + >>> df2 = pd.DataFrame( + ... { + ... "B": [3, 3], + ... "C": [-10, 1], + ... }, + ... index=[1, 2], + ... ) >>> df1.combine(df2, take_smaller) A B C 0 NaN NaN NaN @@ -8651,7 +8774,13 @@ def combine( Demonstrating the preference of the passed in dataframe. - >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1], }, index=[1, 2]) + >>> df2 = pd.DataFrame( + ... { + ... "B": [3, 3], + ... "C": [1, 1], + ... }, + ... index=[1, 2], + ... ) >>> df2.combine(df1, take_smaller) A B C 0 0.0 NaN NaN @@ -8665,8 +8794,9 @@ def combine( 2 NaN 3.0 1.0 """ other_idxlen = len(other.index) # save for compare + other_columns = other.columns - this, other = self.align(other, copy=False) + this, other = self.align(other) new_index = this.index if other.empty and len(new_index) == len(self.index): @@ -8675,8 +8805,8 @@ def combine( if self.empty and len(other) == other_idxlen: return other.copy() - # sorts if possible; otherwise align above ensures that these are set-equal - new_columns = this.columns.union(other.columns) + # preserve column order + new_columns = self.columns.union(other_columns, sort=False) do_fill = fill_value is not None result = {} for col in new_columns: @@ -8706,15 +8836,15 @@ def combine( # try to promote series, which is all NaN, as other_dtype. new_dtype = other_dtype try: - series = series.astype(new_dtype, copy=False) + series = series.astype(new_dtype) except ValueError: # e.g. 
new_dtype is integer types pass else: # if we have different dtypes, possibly promote new_dtype = find_common_type([this_dtype, other_dtype]) - series = series.astype(new_dtype, copy=False) - other_series = other_series.astype(new_dtype, copy=False) + series = series.astype(new_dtype) + other_series = other_series.astype(new_dtype) arr = func(series, other_series) if isinstance(new_dtype, np.dtype): @@ -8761,8 +8891,8 @@ def combine_first(self, other: DataFrame) -> DataFrame: Examples -------- - >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]}) - >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) + >>> df1 = pd.DataFrame({"A": [None, 0], "B": [None, 4]}) + >>> df2 = pd.DataFrame({"A": [1, 1], "B": [3, 3]}) >>> df1.combine_first(df2) A B 0 1.0 3.0 @@ -8771,8 +8901,8 @@ def combine_first(self, other: DataFrame) -> DataFrame: Null values still persist if the location of that null value does not exist in `other` - >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [4, None]}) - >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1]}, index=[1, 2]) + >>> df1 = pd.DataFrame({"A": [None, 0], "B": [4, None]}) + >>> df2 = pd.DataFrame({"B": [3, 3], "C": [1, 1]}, index=[1, 2]) >>> df1.combine_first(df2) A B C 0 NaN 4.0 NaN @@ -8869,12 +8999,14 @@ def update( dict.update : Similar method for dictionaries. DataFrame.merge : For column(s)-on-column(s) operations. + Notes + ----- + Duplicate indices on `other` are not supported and raise a `ValueError`. + Examples -------- - >>> df = pd.DataFrame({'A': [1, 2, 3], - ... 'B': [400, 500, 600]}) - >>> new_df = pd.DataFrame({'B': [4, 5, 6], - ... 'C': [7, 8, 9]}) + >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [400, 500, 600]}) + >>> new_df = pd.DataFrame({"B": [4, 5, 6], "C": [7, 8, 9]}) >>> df.update(new_df) >>> df A B 0 1 4 1 2 5 2 3 6 @@ -8885,9 +9017,8 @@ def update( The DataFrame's length does not increase as a result of the update, only values at matching index/column labels are updated. - >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], - ... 'B': ['x', 'y', 'z']}) - >>> new_df = pd.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']}) + >>> df = pd.DataFrame({"A": ["a", "b", "c"], "B": ["x", "y", "z"]}) + >>> new_df = pd.DataFrame({"B": ["d", "e", "f", "g", "h", "i"]}) >>> df.update(new_df) >>> df A B 0 a d 1 b e 2 c f - >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], - ... 'B': ['x', 'y', 'z']}) - >>> new_df = pd.DataFrame({'B': ['d', 'f']}, index=[0, 2]) + >>> df = pd.DataFrame({"A": ["a", "b", "c"], "B": ["x", "y", "z"]}) + >>> new_df = pd.DataFrame({"B": ["d", "f"]}, index=[0, 2]) >>> df.update(new_df) >>> df A B 0 a d 1 b y 2 c f For Series, its name attribute must be set. - >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], - ... 'B': ['x', 'y', 'z']}) - >>> new_column = pd.Series(['d', 'e', 'f'], name='B') + >>> df = pd.DataFrame({"A": ["a", "b", "c"], "B": ["x", "y", "z"]}) + >>> new_column = pd.Series(["d", "e", "f"], name="B") >>> df.update(new_column) >>> df A B 0 a d 1 b e 2 c f @@ -8920,9 +9049,8 @@ def update( If `other` contains NaNs the corresponding values are not updated in the original dataframe. - >>> df = pd.DataFrame({'A': [1, 2, 3], - ... 
'B': [400., 500., 600.]}) - >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]}) + >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [400.0, 500.0, 600.0]}) + >>> new_df = pd.DataFrame({"B": [4, np.nan, 6]}) >>> df.update(new_df) >>> df A B @@ -8930,20 +9058,13 @@ def update( 1 2 500.0 2 3 6.0 """ - if not PYPY and using_copy_on_write(): + if not PYPY: if sys.getrefcount(self) <= REF_COUNT: warnings.warn( _chained_assignment_method_msg, ChainedAssignmentError, stacklevel=2, ) - elif not PYPY and not using_copy_on_write() and self._is_view_after_cow_rules(): - if sys.getrefcount(self) <= REF_COUNT: - warnings.warn( - _chained_assignment_warning_method_msg, - FutureWarning, - stacklevel=2, - ) # TODO: Support other joins if join != "left": # pragma: no cover @@ -8954,11 +9075,22 @@ def update( if not isinstance(other, DataFrame): other = DataFrame(other) - other = other.reindex(self.index) + if other.index.has_duplicates: + raise ValueError("Update not allowed with duplicate indexes on other.") + + index_intersection = other.index.intersection(self.index) + if index_intersection.empty: + raise ValueError( + "Update not allowed when the index on `other` has no intersection " + "with this dataframe." + ) + + other = other.reindex(index_intersection) + this_data = self.loc[index_intersection] for col in self.columns.intersection(other.columns): - this = self[col]._values - that = other[col]._values + this = this_data[col] + that = other[col] if filter_func is not None: mask = ~filter_func(this) | isna(that) @@ -8978,7 +9110,7 @@ def update( if mask.all(): continue - self.loc[:, col] = self[col].where(mask, that) + self.loc[index_intersection, col] = this.where(mask, that) # ---------------------------------------------------------------------- # Data reshaping @@ -9033,8 +9165,8 @@ def update( We can also choose to include NA in group keys or not by setting `dropna` parameter, the default setting is `True`. - >>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]] - >>> df = pd.DataFrame(l, columns=["a", "b", "c"]) + >>> arr = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]] + >>> df = pd.DataFrame(arr, columns=["a", "b", "c"]) >>> df.groupby(by=["b"]).sum() a c @@ -9049,8 +9181,8 @@ def update( 2.0 2 5 NaN 1 4 - >>> l = [["a", 12, 12], [None, 12.3, 33.], ["b", 12.3, 123], ["a", 1, 1]] - >>> df = pd.DataFrame(l, columns=["a", "b", "c"]) + >>> arr = [["a", 12, 12], [None, 12.3, 33.], ["b", 12.3, 123], ["a", 1, 1]] + >>> df = pd.DataFrame(arr, columns=["a", "b", "c"]) >>> df.groupby(by="a").sum() b c @@ -9092,33 +9224,13 @@ def update( def groupby( self, by=None, - axis: Axis | lib.NoDefault = lib.no_default, level: IndexLabel | None = None, as_index: bool = True, sort: bool = True, group_keys: bool = True, - observed: bool | lib.NoDefault = lib.no_default, + observed: bool = True, dropna: bool = True, ) -> DataFrameGroupBy: - if axis is not lib.no_default: - axis = self._get_axis_number(axis) - if axis == 1: - warnings.warn( - "DataFrame.groupby with axis=1 is deprecated. 
Do " - "`frame.T.groupby(...)` without axis instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - warnings.warn( - "The 'axis' keyword in DataFrame.groupby is deprecated and " - "will be removed in a future version.", - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - axis = 0 - from pandas.core.groupby.generic import DataFrameGroupBy if level is None and by is None: @@ -9127,7 +9239,6 @@ def groupby( return DataFrameGroupBy( obj=self, keys=by, - axis=axis, level=level, as_index=as_index, sort=sort, @@ -9136,9 +9247,7 @@ def groupby( dropna=dropna, ) - _shared_docs[ - "pivot" - ] = """ + _shared_docs["pivot"] = """ Return reshaped DataFrame organized by given index / column values. Reshape data (produce a "pivot" table) based on column values. Uses @@ -9149,11 +9258,11 @@ def groupby( Parameters ----------%s - columns : str or object or a list of str + columns : Hashable or a sequence of the previous Column to use to make new frame's columns. - index : str or object or a list of str, optional + index : Hashable or a sequence of the previous, optional Column to use to make new frame's index. If not given, uses existing index. - values : str, object or a list of the previous, optional + values : Hashable or a sequence of the previous, optional Column(s) to use for populating new frame's values. If not specified, all remaining columns will be used and the result will have hierarchically indexed columns. @@ -9223,11 +9332,11 @@ def groupby( You could also assign a list of column names or a list of index names. >>> df = pd.DataFrame({ - ... "lev1": [1, 1, 1, 2, 2, 2], - ... "lev2": [1, 1, 2, 1, 1, 2], - ... "lev3": [1, 2, 1, 2, 1, 2], - ... "lev4": [1, 2, 3, 4, 5, 6], - ... "values": [0, 1, 2, 3, 4, 5]}) + ... "lev1": [1, 1, 1, 2, 2, 2], + ... "lev2": [1, 1, 2, 1, 1, 2], + ... "lev3": [1, 2, 1, 2, 1, 2], + ... "lev4": [1, 2, 3, 4, 5, 6], + ... "values": [0, 1, 2, 3, 4, 5]}) >>> df lev1 lev2 lev3 lev4 values 0 1 1 1 1 0 @@ -9282,9 +9391,7 @@ def pivot( return pivot(self, index=index, columns=columns, values=values) - _shared_docs[ - "pivot_table" - ] = """ + _shared_docs["pivot_table"] = """ Create a spreadsheet-style pivot table as a DataFrame. The levels in the pivot table will be stored in MultiIndex objects @@ -9294,12 +9401,12 @@ def pivot( ----------%s values : list-like or scalar, optional Column or columns to aggregate. - index : column, Grouper, array, or list of the previous + index : column, Grouper, array, or sequence of the previous Keys to group by on the pivot table index. If a list is passed, it can contain any of the other types (except list). If an array is passed, it must be the same length as the data and will be used in the same manner as column values. - columns : column, Grouper, array, or list of the previous + columns : column, Grouper, array, or sequence of the previous Keys to group by on the pivot table column. If a list is passed, it can contain any of the other types (except list). If an array is passed, it must be the same length as the data and will be used in @@ -9320,8 +9427,12 @@ def pivot( on the rows and columns. dropna : bool, default True Do not include columns whose entries are all NaN. If True, - rows with a NaN value in any column will be omitted before - computing margins. + + * rows with an NA value in any column will be omitted before computing + margins, + * index/column keys containing NA values will be dropped (see ``dropna`` + parameter in :meth:`DataFrame.groupby`). 
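A minimal sketch of both ``dropna`` effects with an invented frame (the NA-key behavior follows the ``groupby`` semantics referenced above; output omitted, illustration only):

>>> df = pd.DataFrame({"key": ["a", "a", None], "val": [1.0, 2.0, 4.0]})
>>> df.pivot_table(index="key", values="val", aggfunc="sum")  # None key dropped
>>> df.pivot_table(  # dropna=False keeps the NA key as its own row
...     index="key", values="val", aggfunc="sum", dropna=False
... )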
+ margins_name : str, default 'All' Name of the row / column that will contain the totals when margins is True. @@ -9330,16 +9441,20 @@ def pivot( If True: only show observed values for categorical groupers. If False: show all values for categorical groupers. - .. deprecated:: 2.2.0 + .. versionchanged:: 3.0.0 - The default value of ``False`` is deprecated and will change to - ``True`` in a future version of pandas. + The default value is now ``True``. sort : bool, default True Specifies if the result should be sorted. .. versionadded:: 1.3.0 + **kwargs : dict + Optional keyword arguments to pass to ``aggfunc``. + + .. versionadded:: 3.0.0 + Returns ------- DataFrame @@ -9445,8 +9560,9 @@ def pivot_table( margins: bool = False, dropna: bool = True, margins_name: Level = "All", - observed: bool | lib.NoDefault = lib.no_default, + observed: bool = True, sort: bool = True, + **kwargs, ) -> DataFrame: from pandas.core.reshape.pivot import pivot_table @@ -9462,6 +9578,7 @@ def pivot_table( margins_name=margins_name, observed=observed, sort=sort, + **kwargs, ) def stack( @@ -9469,7 +9586,7 @@ def stack( level: IndexLabel = -1, dropna: bool | lib.NoDefault = lib.no_default, sort: bool | lib.NoDefault = lib.no_default, - future_stack: bool = False, + future_stack: bool = True, ): """ Stack the prescribed level(s) from columns to index. @@ -9479,10 +9596,9 @@ def stack( DataFrame. The new inner-most levels are created by pivoting the columns of the current dataframe: - - if the columns have a single level, the output is a Series; - - if the columns have multiple levels, the new index - level(s) is (are) taken from the prescribed level(s) and - the output is a DataFrame. + - if the columns have a single level, the output is a Series; + - if the columns have multiple levels, the new index level(s) is (are) + taken from the prescribed level(s) and the output is a DataFrame. Parameters ---------- @@ -9498,7 +9614,7 @@ def stack( section. sort : bool, default True Whether to sort the levels of the resulting MultiIndex. - future_stack : bool, default False + future_stack : bool, default True Whether to use the new implementation that will replace the current implementation in pandas 3.0. When True, dropna and sort have no impact on the result and must remain unspecified. See :ref:`pandas 2.1.0 Release @@ -9532,9 +9648,9 @@ def stack( -------- **Single level columns** - >>> df_single_level_cols = pd.DataFrame([[0, 1], [2, 3]], - ... index=['cat', 'dog'], - ... columns=['weight', 'height']) + >>> df_single_level_cols = pd.DataFrame( + ... [[0, 1], [2, 3]], index=["cat", "dog"], columns=["weight", "height"] + ... ) Stacking a dataframe with a single level column axis returns a Series: @@ -9542,7 +9658,7 @@ def stack( weight height cat 0 1 dog 2 3 - >>> df_single_level_cols.stack(future_stack=True) + >>> df_single_level_cols.stack() cat weight 0 height 1 dog weight 2 @@ -9551,11 +9667,12 @@ def stack( **Multi level columns: simple case** - >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'), - ... ('weight', 'pounds')]) - >>> df_multi_level_cols1 = pd.DataFrame([[1, 2], [2, 4]], - ... index=['cat', 'dog'], - ... columns=multicol1) + >>> multicol1 = pd.MultiIndex.from_tuples( + ... [("weight", "kg"), ("weight", "pounds")] + ... ) + >>> df_multi_level_cols1 = pd.DataFrame( + ... [[1, 2], [2, 4]], index=["cat", "dog"], columns=multicol1 + ... 
) Stacking a dataframe with a multi-level column axis: @@ -9564,7 +9681,7 @@ def stack( kg pounds cat 1 2 dog 2 4 - >>> df_multi_level_cols1.stack(future_stack=True) + >>> df_multi_level_cols1.stack() weight cat kg 1 pounds 2 @@ -9573,11 +9690,10 @@ def stack( **Missing values** - >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'), - ... ('height', 'm')]) - >>> df_multi_level_cols2 = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]], - ... index=['cat', 'dog'], - ... columns=multicol2) + >>> multicol2 = pd.MultiIndex.from_tuples([("weight", "kg"), ("height", "m")]) + >>> df_multi_level_cols2 = pd.DataFrame( + ... [[1.0, 2.0], [3.0, 4.0]], index=["cat", "dog"], columns=multicol2 + ... ) It is common to have missing values when stacking a dataframe with multi-level columns, as the stacked dataframe typically @@ -9589,7 +9705,7 @@ def stack( kg m cat 1.0 2.0 dog 3.0 4.0 - >>> df_multi_level_cols2.stack(future_stack=True) + >>> df_multi_level_cols2.stack() weight height cat kg 1.0 NaN m NaN 2.0 @@ -9600,13 +9716,13 @@ def stack( The first parameter controls which level or levels are stacked: - >>> df_multi_level_cols2.stack(0, future_stack=True) + >>> df_multi_level_cols2.stack(0) kg m cat weight 1.0 NaN height NaN 2.0 dog weight 3.0 NaN height NaN 4.0 - >>> df_multi_level_cols2.stack([0, 1], future_stack=True) + >>> df_multi_level_cols2.stack([0, 1]) cat weight kg 1.0 height m 2.0 dog weight kg 3.0 @@ -9619,19 +9735,14 @@ def stack( stack_multiple, ) - if ( - dropna is not lib.no_default - or sort is not lib.no_default - or self.columns.nlevels > 1 - ): - warnings.warn( - "The previous implementation of stack is deprecated and will be " - "removed in a future version of pandas. See the What's New notes " - "for pandas 2.1.0 for details. Specify future_stack=True to adopt " - "the new implementation and silence this warning.", - FutureWarning, - stacklevel=find_stack_level(), - ) + warnings.warn( + "The previous implementation of stack is deprecated and will be " + "removed in a future version of pandas. See the What's New notes " + "for pandas 2.1.0 for details. Do not specify the future_stack " + "argument to adopt the new implementation and silence this warning.", + FutureWarning, + stacklevel=find_stack_level(), + ) if dropna is lib.no_default: dropna = True @@ -9647,14 +9758,14 @@ def stack( if dropna is not lib.no_default: raise ValueError( - "dropna must be unspecified with future_stack=True as the new " + "dropna must be unspecified as the new " "implementation does not introduce rows of NA values. This " "argument will be removed in a future version of pandas." ) if sort is not lib.no_default: raise ValueError( - "Cannot specify sort with future_stack=True, this argument will be " + "Cannot specify sort, this argument will be " "removed in a future version of pandas. Sort the result using " ".sort_index instead." ) @@ -9731,9 +9842,13 @@ def explode( Examples -------- - >>> df = pd.DataFrame({'A': [[0, 1, 2], 'foo', [], [3, 4]], - ... 'B': 1, - ... 'C': [['a', 'b', 'c'], np.nan, [], ['d', 'e']]}) + >>> df = pd.DataFrame( + ... { + ... "A": [[0, 1, 2], "foo", [], [3, 4]], + ... "B": 1, + ... "C": [["a", "b", "c"], np.nan, [], ["d", "e"]], + ... } + ... ) >>> df A B C 0 [0, 1, 2] 1 [a, b, c] @@ -9743,7 +9858,7 @@ def explode( Single-column explode. - >>> df.explode('A') + >>> df.explode("A") A B C 0 0 1 [a, b, c] 0 1 1 [a, b, c] @@ -9755,7 +9870,7 @@ def explode( Multi-column explode. 
- >>> df.explode(list('AC')) + >>> df.explode(list("AC")) A B C 0 0 1 a 0 1 1 b @@ -9800,11 +9915,13 @@ def explode( result.index = default_index(len(result)) else: result.index = self.index.take(result.index) - result = result.reindex(columns=self.columns, copy=False) + result = result.reindex(columns=self.columns) return result.__finalize__(self, method="explode") - def unstack(self, level: IndexLabel = -1, fill_value=None, sort: bool = True): + def unstack( + self, level: IndexLabel = -1, fill_value=None, sort: bool = True + ) -> DataFrame | Series: """ Pivot a level of the (necessarily hierarchical) index labels. @@ -9818,7 +9935,7 @@ def unstack(self, level: IndexLabel = -1, fill_value=None, sort: bool = True): ---------- level : int, str, or list of these, default -1 (last level) Level(s) of index to unstack, can pass level name. - fill_value : int, str or dict + fill_value : scalar Replace NaN with this value if the unstack produces missing values. sort : bool, default True Sort the level(s) in the resulting MultiIndex columns. @@ -9826,6 +9943,8 @@ def unstack(self, level: IndexLabel = -1, fill_value=None, sort: bool = True): Returns ------- Series or DataFrame + If index is a MultiIndex: DataFrame with pivoted index labels as new + inner-most level column labels, else Series. See Also -------- @@ -9839,8 +9958,9 @@ def unstack(self, level: IndexLabel = -1, fill_value=None, sort: bool = True): Examples -------- - >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'), - ... ('two', 'a'), ('two', 'b')]) + >>> index = pd.MultiIndex.from_tuples( + ... [("one", "a"), ("one", "b"), ("two", "a"), ("two", "b")] + ... ) >>> s = pd.Series(np.arange(1.0, 5.0), index=index) >>> s one a 1.0 @@ -9873,7 +9993,6 @@ def unstack(self, level: IndexLabel = -1, fill_value=None, sort: bool = True): return result.__finalize__(self, method="unstack") - @Appender(_shared_docs["melt"] % {"caller": "df.melt(", "other": "melt"}) def melt( self, id_vars=None, @@ -9883,6 +10002,127 @@ def melt( col_level: Level | None = None, ignore_index: bool = True, ) -> DataFrame: + """ + Unpivot DataFrame from wide to long format, optionally leaving identifiers set. + + This function is useful to massage a DataFrame into a format where one + or more columns are identifier variables (`id_vars`), while all other + columns, considered measured variables (`value_vars`), are "unpivoted" to + the row axis, leaving just two non-identifier columns, 'variable' and + 'value'. + + Parameters + ---------- + id_vars : scalar, tuple, list, or ndarray, optional + Column(s) to use as identifier variables. + value_vars : scalar, tuple, list, or ndarray, optional + Column(s) to unpivot. If not specified, uses all columns that + are not set as `id_vars`. + var_name : scalar, default None + Name to use for the 'variable' column. If None it uses + ``frame.columns.name`` or 'variable'. + value_name : scalar, default 'value' + Name to use for the 'value' column, can't be an existing column label. + col_level : scalar, optional + If columns are a MultiIndex then use this level to melt. + ignore_index : bool, default True + If True, original index is ignored. If False, original index is retained. + Index labels will be repeated as necessary. + + Returns + ------- + DataFrame + Unpivoted DataFrame. + + See Also + -------- + melt : Identical method. + pivot_table : Create a spreadsheet-style pivot table as a DataFrame. + DataFrame.pivot : Return reshaped DataFrame organized + by given index / column values. 
+ DataFrame.explode : Explode a DataFrame from list-like + columns to long format. + + Notes + ----- + Reference :ref:`the user guide ` for more examples. + + Examples + -------- + >>> df = pd.DataFrame( + ... { + ... "A": {0: "a", 1: "b", 2: "c"}, + ... "B": {0: 1, 1: 3, 2: 5}, + ... "C": {0: 2, 1: 4, 2: 6}, + ... } + ... ) + >>> df + A B C + 0 a 1 2 + 1 b 3 4 + 2 c 5 6 + + >>> df.melt(id_vars=["A"], value_vars=["B"]) + A variable value + 0 a B 1 + 1 b B 3 + 2 c B 5 + + >>> df.melt(id_vars=["A"], value_vars=["B", "C"]) + A variable value + 0 a B 1 + 1 b B 3 + 2 c B 5 + 3 a C 2 + 4 b C 4 + 5 c C 6 + + The names of 'variable' and 'value' columns can be customized: + + >>> df.melt( + ... id_vars=["A"], + ... value_vars=["B"], + ... var_name="myVarname", + ... value_name="myValname", + ... ) + A myVarname myValname + 0 a B 1 + 1 b B 3 + 2 c B 5 + + Original index values can be kept around: + + >>> df.melt(id_vars=["A"], value_vars=["B", "C"], ignore_index=False) + A variable value + 0 a B 1 + 1 b B 3 + 2 c B 5 + 0 a C 2 + 1 b C 4 + 2 c C 6 + + If you have multi-index columns: + + >>> df.columns = [list("ABC"), list("DEF")] + >>> df + A B C + D E F + 0 a 1 2 + 1 b 3 4 + 2 c 5 6 + + >>> df.melt(col_level=0, id_vars=["A"], value_vars=["B"]) + A variable value + 0 a B 1 + 1 b B 3 + 2 c B 5 + + >>> df.melt(id_vars=[("A", "D")], value_vars=[("B", "E")]) + (A, D) variable_0 variable_1 value + 0 a B E 1 + 1 b B E 3 + 2 c B E 5 + """ return melt( self, id_vars=id_vars, @@ -10023,11 +10263,11 @@ def _gotitem( -------- DataFrame.apply : Perform any type of operations. DataFrame.transform : Perform transformation type operations. - pandas.DataFrame.groupby : Perform operations over groups. - pandas.DataFrame.resample : Perform operations over resampled bins. - pandas.DataFrame.rolling : Perform operations over rolling window. - pandas.DataFrame.expanding : Perform operations over expanding window. - pandas.core.window.ewm.ExponentialMovingWindow : Perform operation over exponential + DataFrame.groupby : Perform operations over groups. + DataFrame.resample : Perform operations over resampled bins. + DataFrame.rolling : Perform operations over rolling window. + DataFrame.expanding : Perform operations over expanding window. + core.window.ewm.ExponentialMovingWindow : Perform operation over exponential weighted window. """ ) @@ -10119,7 +10359,7 @@ def apply( result_type: Literal["expand", "reduce", "broadcast"] | None = None, args=(), by_row: Literal[False, "compat"] = "compat", - engine: Literal["python", "numba"] = "python", + engine: Callable | None | Literal["python", "numba"] = None, engine_kwargs: dict[str, bool] | None = None, **kwargs, ): @@ -10130,7 +10370,9 @@ def apply( either the DataFrame's index (``axis=0``) or the DataFrame's columns (``axis=1``). By default (``result_type=None``), the final return type is inferred from the return type of the applied function. Otherwise, - it depends on the `result_type` argument. + it depends on the `result_type` argument. The return type of the applied + function is inferred based on the first computed result obtained after + applying the function to a Series object. Parameters ---------- @@ -10181,28 +10423,24 @@ def apply( .. versionadded:: 2.1.0 - engine : {'python', 'numba'}, default 'python' - Choose between the python (default) engine or the numba engine in apply. + engine : decorator or {'python', 'numba'}, optional + Choose the execution engine to use. If not provided the function + will be executed by the regular Python interpreter. 
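For orientation, a hedged sketch of what such a decorator engine could look like: the dispatch code later in this hunk calls ``engine.__pandas_udf__.apply(...)``, so a minimal engine might be (hypothetical names, illustration only, not a real numba/bodo API):

class _NaiveExecutor:
    @staticmethod
    def apply(data, func, args, kwargs, decorator, axis):
        # ``data`` is the DataFrame (or a 2D ndarray when raw=True).
        # A real engine would compile ``func`` with ``decorator`` first;
        # this sketch just falls back to the default Python path.
        return data.apply(func, axis=axis, args=args, **kwargs)

class naive_engine:  # passed as ``engine=naive_engine``
    __pandas_udf__ = _NaiveExecutor

A call like ``df.apply(f, engine=naive_engine)`` would then route through the hook; engines such as ``numba.jit`` or ``bodo.jit`` are expected to provide this hook themselves.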
-            The numba engine will attempt to JIT compile the passed function,
-            which may result in speedups for large DataFrames.
-            It also supports the following engine_kwargs :
+            Other options include JIT compilers such as Numba and Bodo, which in some
+            cases can speed up the execution. To use an executor you can provide
+            the decorators ``numba.jit``, ``numba.njit`` or ``bodo.jit``. You can
+            also provide the decorator with parameters, like ``numba.jit(nogil=True)``.

-            - nopython (compile the function in nopython mode)
-            - nogil (release the GIL inside the JIT compiled function)
-            - parallel (try to apply the function in parallel over the DataFrame)
+            Not all functions can be executed with all execution engines. In general,
+            JIT compilers will require type stability in the function (no variable
+            should change data type during the execution). And not all pandas and
+            NumPy APIs are supported. Check the engine documentation [1]_ and [2]_
+            for limitations.

-            Note: Due to limitations within numba/how pandas interfaces with numba,
-            you should only use this if raw=True
+            .. warning::

-            Note: The numba compiler only supports a subset of
-            valid Python/numpy operations.
-
-            Please read more about the `supported python features
-            `_
-            and `supported numpy features
-            `_
-            in numba to learn what you can or cannot use in the passed function.
+                String parameters will stop being supported in a future pandas version.

            .. versionadded:: 2.2.0

@@ -10210,6 +10448,7 @@ def apply(
            Pass keyword arguments to the engine. This is currently only used by
            the numba engine, see the documentation for the engine argument for
            more information.
+
        **kwargs
            Additional keyword arguments to pass as keywords arguments to
            `func`.
@@ -10232,9 +10471,16 @@ def apply(
        behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
        for more details.

+        References
+        ----------
+        .. [1] `Numba documentation
+           `_
+        .. [2] `Bodo documentation
+           `_
+
        Examples
        --------
-        >>> df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B'])
+        >>> df = pd.DataFrame([[4, 9]] * 3, columns=["A", "B"])
        >>> df
           A  B
        0  4  9
        1  4  9
        2  4  9
@@ -10274,7 +10520,7 @@ def apply(
        Passing ``result_type='expand'`` will expand list-like results to columns
        of a Dataframe

-        >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand')
+        >>> df.apply(lambda x: [1, 2], axis=1, result_type="expand")
           0  1
        0  1  2
        1  1  2
        2  1  2
@@ -10284,7 +10530,7 @@ def apply(
        ``result_type='expand'``. The resulting column names
        will be the Series index.

-        >>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1)
+        >>> df.apply(lambda x: pd.Series([1, 2], index=["foo", "bar"]), axis=1)
           foo  bar
        0    1    2
        1    1    2
        2    1    2
@@ -10295,30 +10541,107 @@ def apply(
        and broadcast it along the axis. The resulting column names will
        be the originals.

-        >>> df.apply(lambda x: [1, 2], axis=1, result_type='broadcast')
+        >>> df.apply(lambda x: [1, 2], axis=1, result_type="broadcast")
           A  B
        0  1  2
        1  1  2
        2  1  2
+
+        Advanced users can speed up their code by using a Just-in-time (JIT) compiler
+        with ``apply``. The main JIT compilers available for pandas are Numba and Bodo.
+        In general, JIT compilation is only possible when the function passed to
+        ``apply`` has type stability (variables in the function do not change their
+        type during the execution).
+
+        >>> import bodo
+        >>> df.apply(lambda x: x.A + x.B, axis=1, engine=bodo.jit)
+
+        Note that JIT compilation is only recommended for functions that take a
+        significant amount of time to run. Fast functions are unlikely to run faster
+        with JIT compilation.
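To make the type-stability requirement concrete, a small illustrative pair (hypothetical functions, not part of the pandas docs):

>>> def stable(row):  # every variable keeps a single type: JIT-friendly
...     total = 0.0
...     for value in row:
...         total += value
...     return total

>>> def unstable(row):  # ``out`` flips between int and str
...     out = 0
...     if row.iloc[0] > 1:
...         out = "big"
...     return out

A JIT engine can compile ``stable`` because every variable keeps one dtype throughout; ``unstable`` re-binds ``out`` from int to str mid-function, which JIT compilers typically reject.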
""" - from pandas.core.apply import frame_apply + if engine is None or isinstance(engine, str): + from pandas.core.apply import frame_apply - op = frame_apply( - self, - func=func, - axis=axis, - raw=raw, - result_type=result_type, - by_row=by_row, - engine=engine, - engine_kwargs=engine_kwargs, - args=args, - kwargs=kwargs, - ) - return op.apply().__finalize__(self, method="apply") + if engine is None: + engine = "python" + + if engine not in ["python", "numba"]: + raise ValueError(f"Unknown engine '{engine}'") + + op = frame_apply( + self, + func=func, + axis=axis, + raw=raw, + result_type=result_type, + by_row=by_row, + engine=engine, + engine_kwargs=engine_kwargs, + args=args, + kwargs=kwargs, + ) + return op.apply().__finalize__(self, method="apply") + elif hasattr(engine, "__pandas_udf__"): + if result_type is not None: + raise NotImplementedError( + f"{result_type=} only implemented for the default engine" + ) + + agg_axis = self._get_agg_axis(self._get_axis_number(axis)) + + # one axis is empty + if not all(self.shape): + func = cast(Callable, func) + try: + if axis == 0: + r = func(Series([], dtype=np.float64), *args, **kwargs) + else: + r = func( + Series(index=self.columns, dtype=np.float64), + *args, + **kwargs, + ) + except Exception: + pass + else: + if not isinstance(r, Series): + if len(agg_axis): + r = func(Series([], dtype=np.float64), *args, **kwargs) + else: + r = np.nan + + return self._constructor_sliced(r, index=agg_axis) + return self.copy() + + data: DataFrame | np.ndarray = self + if raw: + # This will upcast the whole DataFrame to the same type, + # and likely result in an object 2D array. + # We should probably pass a list of 1D arrays instead, at + # lest for ``axis=0`` + data = self.values + result = engine.__pandas_udf__.apply( + data=data, + func=func, + args=args, + kwargs=kwargs, + decorator=engine, + axis=axis, + ) + if raw: + if result.ndim == 2: + return self._constructor( + result, index=self.index, columns=self.columns + ) + else: + return self._constructor_sliced(result, index=agg_axis) + return result + else: + raise ValueError(f"Unknown engine {engine}") def map( - self, func: PythonFuncType, na_action: str | None = None, **kwargs + self, func: PythonFuncType, na_action: Literal["ignore"] | None = None, **kwargs ) -> DataFrame: """ Apply a function to a Dataframe elementwise. @@ -10368,7 +10691,7 @@ def map( >>> df_copy = df.copy() >>> df_copy.iloc[0, 0] = pd.NA - >>> df_copy.map(lambda x: len(str(x)), na_action='ignore') + >>> df_copy.map(lambda x: len(str(x)), na_action="ignore") 0 1 0 NaN 4 1 5.0 5 @@ -10391,15 +10714,13 @@ def map( But it's better to avoid map in that case. - >>> df ** 2 + >>> df**2 0 1 0 1.000000 4.494400 1 11.262736 20.857489 """ if na_action not in {"ignore", None}: - raise ValueError( - f"na_action must be 'ignore' or None. Got {repr(na_action)}" - ) + raise ValueError(f"na_action must be 'ignore' or None. Got {na_action!r}") if self.empty: return self.copy() @@ -10411,60 +10732,6 @@ def infer(x): return self.apply(infer).__finalize__(self, "map") - def applymap( - self, func: PythonFuncType, na_action: NaAction | None = None, **kwargs - ) -> DataFrame: - """ - Apply a function to a Dataframe elementwise. - - .. deprecated:: 2.1.0 - - DataFrame.applymap has been deprecated. Use DataFrame.map instead. - - This method applies a function that accepts and returns a scalar - to every element of a DataFrame. - - Parameters - ---------- - func : callable - Python function, returns a single value from a single value. 
- na_action : {None, 'ignore'}, default None - If 'ignore', propagate NaN values, without passing them to func. - **kwargs - Additional keyword arguments to pass as keywords arguments to - `func`. - - Returns - ------- - DataFrame - Transformed DataFrame. - - See Also - -------- - DataFrame.apply : Apply a function along input axis of DataFrame. - DataFrame.map : Apply a function along input axis of DataFrame. - DataFrame.replace: Replace values given in `to_replace` with `value`. - - Examples - -------- - >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]]) - >>> df - 0 1 - 0 1.000 2.120 - 1 3.356 4.567 - - >>> df.map(lambda x: len(str(x))) - 0 1 - 0 3 4 - 1 5 5 - """ - warnings.warn( - "DataFrame.applymap has been deprecated. Use DataFrame.map instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - return self.map(func, na_action=na_action, **kwargs) - # ---------------------------------------------------------------------- # Merging / joining methods @@ -10488,16 +10755,16 @@ def _append( index = Index( [other.name], - name=self.index.names - if isinstance(self.index, MultiIndex) - else self.index.name, + name=( + self.index.names + if isinstance(self.index, MultiIndex) + else self.index.name + ), ) row_df = other.to_frame().T # infer_objects is needed for # test_append_empty_frame_to_series_with_dateutil_tz - other = row_df.infer_objects(copy=False).rename_axis( - index.names, copy=False - ) + other = row_df.infer_objects().rename_axis(index.names) elif isinstance(other, list): if not other: pass @@ -10550,7 +10817,8 @@ def join( values given, the `other` DataFrame must have a MultiIndex. Can pass an array as the join key if it is not already contained in the calling DataFrame. Like an Excel VLOOKUP operation. - how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'left' + how : {'left', 'right', 'outer', 'inner', 'cross', 'left_anti', 'right_anti'}, + default 'left' How to handle the operation of the two objects. * left: use calling frame's index (or column if on is specified) @@ -10562,6 +10830,10 @@ def join( of the calling's one. * cross: creates the cartesian product from both frames, preserves the order of the left keys. + * left_anti: use set difference of calling frame's index and `other`'s + index. + * right_anti: use set difference of `other`'s index and calling frame's + index. lsuffix : str, default '' Suffix to use from left frame's overlapping columns. rsuffix : str, default '' @@ -10596,8 +10868,12 @@ def join( Examples -------- - >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'], - ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']}) + >>> df = pd.DataFrame( + ... { + ... "key": ["K0", "K1", "K2", "K3", "K4", "K5"], + ... "A": ["A0", "A1", "A2", "A3", "A4", "A5"], + ... } + ... ) >>> df key A @@ -10608,8 +10884,7 @@ def join( 4 K4 A4 5 K5 A5 - >>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'], - ... 'B': ['B0', 'B1', 'B2']}) + >>> other = pd.DataFrame({"key": ["K0", "K1", "K2"], "B": ["B0", "B1", "B2"]}) >>> other key B @@ -10619,7 +10894,7 @@ def join( Join DataFrames using their indexes. - >>> df.join(other, lsuffix='_caller', rsuffix='_other') + >>> df.join(other, lsuffix="_caller", rsuffix="_other") key_caller A key_other B 0 K0 A0 K0 B0 1 K1 A1 K1 B1 @@ -10632,7 +10907,7 @@ def join( the index in both `df` and `other`. The joined DataFrame will have key as its index. 
- >>> df.set_index('key').join(other.set_index('key')) + >>> df.set_index("key").join(other.set_index("key")) A B key K0 A0 B0 @@ -10647,7 +10922,7 @@ def join( any column in `df`. This method preserves the original DataFrame's index in the result. - >>> df.join(other.set_index('key'), on='key') + >>> df.join(other.set_index("key"), on="key") key A B 0 K0 A0 B0 1 K1 A1 B1 @@ -10658,8 +10933,12 @@ def join( Using non-unique key values shows how they are matched. - >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K1', 'K3', 'K0', 'K1'], - ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']}) + >>> df = pd.DataFrame( + ... { + ... "key": ["K0", "K1", "K1", "K3", "K0", "K1"], + ... "A": ["A0", "A1", "A2", "A3", "A4", "A5"], + ... } + ... ) >>> df key A @@ -10670,7 +10949,7 @@ def join( 4 K0 A4 5 K1 A5 - >>> df.join(other.set_index('key'), on='key', validate='m:1') + >>> df.join(other.set_index("key"), on="key", validate="m:1") key A B 0 K0 A0 B0 1 K1 A1 B1 @@ -10734,7 +11013,7 @@ def join( res = concat( frames, axis=1, join="outer", verify_integrity=True, sort=sort ) - return res.reindex(self.index, copy=False) + return res.reindex(self.index) else: return concat( frames, axis=1, join=how, verify_integrity=True, sort=sort @@ -10767,10 +11046,12 @@ def merge( right_index: bool = False, sort: bool = False, suffixes: Suffixes = ("_x", "_y"), - copy: bool | None = None, + copy: bool | lib.NoDefault = lib.no_default, indicator: str | bool = False, validate: MergeValidate | None = None, ) -> DataFrame: + self._check_copy_deprecation(copy) + from pandas.core.reshape.merge import merge return merge( @@ -10784,7 +11065,6 @@ def merge( right_index=right_index, sort=sort, suffixes=suffixes, - copy=copy, indicator=indicator, validate=validate, ) @@ -10793,7 +11073,7 @@ def round( self, decimals: int | dict[IndexLabel, int] | Series = 0, *args, **kwargs ) -> DataFrame: """ - Round a DataFrame to a variable number of decimal places. + Round numeric columns in a DataFrame to a variable number of decimal places. Parameters ---------- @@ -10824,10 +11104,18 @@ def round( numpy.around : Round a numpy array to the given number of decimals. Series.round : Round a Series to the given number of decimals. + Notes + ----- + For values exactly halfway between rounded decimal values, pandas rounds + to the nearest even value (e.g. -0.5 and 0.5 round to 0.0, 1.5 and 2.5 + round to 2.0, etc.). + Examples -------- - >>> df = pd.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)], - ... columns=['dogs', 'cats']) + >>> df = pd.DataFrame( + ... [(0.21, 0.32), (0.01, 0.67), (0.66, 0.03), (0.21, 0.18)], + ... columns=["dogs", "cats"], + ... 
) >>> df dogs cats 0 0.21 0.32 @@ -10849,7 +11137,7 @@ def round( specified with the column names as key and the number of decimal places as value - >>> df.round({'dogs': 1, 'cats': 0}) + >>> df.round({"dogs": 1, "cats": 0}) dogs cats 0 0.2 0.0 1 0.0 1.0 @@ -10860,7 +11148,7 @@ def round( specified with the column names as index and the number of decimal places as value - >>> decimals = pd.Series([0, 1], index=['cats', 'dogs']) + >>> decimals = pd.Series([0, 1], index=["cats", "dogs"]) >>> df.round(decimals) dogs cats 0 0.2 0.0 @@ -10870,7 +11158,7 @@ def round( """ from pandas.core.reshape.concat import concat - def _dict_round(df: DataFrame, decimals): + def _dict_round(df: DataFrame, decimals) -> Iterator[Series]: for col, vals in df.items(): try: yield _series_round(vals, decimals[col]) @@ -10898,7 +11186,6 @@ def _series_round(ser: Series, decimals: int) -> Series: # type "Union[int, integer[Any]]"; expected "int" new_mgr = self._mgr.round( decimals=decimals, # type: ignore[arg-type] - using_cow=using_copy_on_write(), ) return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__( self, method="round" @@ -10973,15 +11260,18 @@ def corr( >>> def histogram_intersection(a, b): ... v = np.minimum(a, b).sum().round(decimals=1) ... return v - >>> df = pd.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)], - ... columns=['dogs', 'cats']) + >>> df = pd.DataFrame( + ... [(0.2, 0.3), (0.0, 0.6), (0.6, 0.0), (0.2, 0.1)], + ... columns=["dogs", "cats"], + ... ) >>> df.corr(method=histogram_intersection) dogs cats dogs 1.0 0.3 cats 0.3 1.0 - >>> df = pd.DataFrame([(1, 1), (2, np.nan), (np.nan, 3), (4, 4)], - ... columns=['dogs', 'cats']) + >>> df = pd.DataFrame( + ... [(1, 1), (2, np.nan), (np.nan, 3), (4, 4)], columns=["dogs", "cats"] + ... ) >>> df.corr(min_periods=3) dogs cats dogs 1.0 NaN @@ -11107,16 +11397,18 @@ def cov( Examples -------- - >>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)], - ... columns=['dogs', 'cats']) + >>> df = pd.DataFrame( + ... [(1, 2), (0, 3), (2, 0), (1, 1)], columns=["dogs", "cats"] + ... ) >>> df.cov() dogs cats dogs 0.666667 -1.000000 cats -1.000000 1.666667 >>> np.random.seed(42) - >>> df = pd.DataFrame(np.random.randn(1000, 5), - ... columns=['a', 'b', 'c', 'd', 'e']) + >>> df = pd.DataFrame( + ... np.random.randn(1000, 5), columns=["a", "b", "c", "d", "e"] + ... ) >>> df.cov() a b c d e a 0.998438 -0.020161 0.059277 -0.008943 0.014144 @@ -11132,10 +11424,9 @@ def cov( each column pair in order to have a valid result: >>> np.random.seed(42) - >>> df = pd.DataFrame(np.random.randn(20, 3), - ... columns=['a', 'b', 'c']) - >>> df.loc[df.index[:5], 'a'] = np.nan - >>> df.loc[df.index[5:10], 'b'] = np.nan + >>> df = pd.DataFrame(np.random.randn(20, 3), columns=["a", "b", "c"]) + >>> df.loc[df.index[:5], "a"] = np.nan + >>> df.loc[df.index[5:10], "b"] = np.nan >>> df.cov(min_periods=12) a b c a 0.316741 NaN -0.150812 @@ -11167,6 +11458,7 @@ def corrwith( drop: bool = False, method: CorrelationMethod = "pearson", numeric_only: bool = False, + min_periods: int | None = None, ) -> Series: """ Compute pairwise correlation. @@ -11197,6 +11489,9 @@ def corrwith( numeric_only : bool, default False Include only `float`, `int` or `boolean` data. + min_periods : int, optional + Minimum number of observations needed to have a valid result. + .. versionadded:: 1.5.0 .. 
versionchanged:: 2.0.0 @@ -11215,8 +11510,12 @@ def corrwith( -------- >>> index = ["a", "b", "c", "d", "e"] >>> columns = ["one", "two", "three", "four"] - >>> df1 = pd.DataFrame(np.arange(20).reshape(5, 4), index=index, columns=columns) - >>> df2 = pd.DataFrame(np.arange(16).reshape(4, 4), index=index[:4], columns=columns) + >>> df1 = pd.DataFrame( + ... np.arange(20).reshape(5, 4), index=index, columns=columns + ... ) + >>> df2 = pd.DataFrame( + ... np.arange(16).reshape(4, 4), index=index[:4], columns=columns + ... ) >>> df1.corrwith(df2) one 1.0 two 1.0 @@ -11231,16 +11530,19 @@ def corrwith( d 1.0 e NaN dtype: float64 - """ # noqa: E501 + """ axis = self._get_axis_number(axis) this = self._get_numeric_data() if numeric_only else self if isinstance(other, Series): - return this.apply(lambda x: other.corr(x, method=method), axis=axis) + return this.apply( + lambda x: other.corr(x, method=method, min_periods=min_periods), + axis=axis, + ) if numeric_only: other = other._get_numeric_data() - left, right = this.align(other, join="inner", copy=False) + left, right = this.align(other, join="inner") if axis == 1: left = left.T @@ -11299,7 +11601,7 @@ def c(x): # ---------------------------------------------------------------------- # ndarray-like stats methods - def count(self, axis: Axis = 0, numeric_only: bool = False): + def count(self, axis: Axis = 0, numeric_only: bool = False) -> Series: """ Count non-NA cells for each column or row. @@ -11331,10 +11633,13 @@ def count(self, axis: Axis = 0, numeric_only: bool = False): -------- Constructing DataFrame from a dictionary: - >>> df = pd.DataFrame({"Person": - ... ["John", "Myla", "Lewis", "John", "Myla"], - ... "Age": [24., np.nan, 21., 33, 26], - ... "Single": [False, True, True, True, False]}) + >>> df = pd.DataFrame( + ... { + ... "Person": ["John", "Myla", "Lewis", "John", "Myla"], + ... "Age": [24.0, np.nan, 21.0, 33, 26], + ... "Single": [False, True, True, True, False], + ... } + ... 
) >>> df Person Age Single 0 John 24.0 False @@ -11353,7 +11658,7 @@ def count(self, axis: Axis = 0, numeric_only: bool = False): Counts for each **row**: - >>> df.count(axis='columns') + >>> df.count(axis="columns") 0 3 1 2 2 3 @@ -11374,7 +11679,7 @@ def count(self, axis: Axis = 0, numeric_only: bool = False): else: result = notna(frame).sum(axis=axis) - return result.astype("int64", copy=False).__finalize__(self, method="count") + return result.astype("int64").__finalize__(self, method="count") def _reduce( self, @@ -11397,30 +11702,11 @@ def func(values: np.ndarray): # We only use this in the case that operates on self.values return op(values, axis=axis, skipna=skipna, **kwds) - dtype_has_keepdims: dict[ExtensionDtype, bool] = {} - def blk_func(values, axis: Axis = 1): if isinstance(values, ExtensionArray): - if not is_1d_only_ea_dtype(values.dtype) and not isinstance( - self._mgr, ArrayManager - ): + if not is_1d_only_ea_dtype(values.dtype): return values._reduce(name, axis=1, skipna=skipna, **kwds) - has_keepdims = dtype_has_keepdims.get(values.dtype) - if has_keepdims is None: - sign = signature(values._reduce) - has_keepdims = "keepdims" in sign.parameters - dtype_has_keepdims[values.dtype] = has_keepdims - if has_keepdims: - return values._reduce(name, skipna=skipna, keepdims=True, **kwds) - else: - warnings.warn( - f"{type(values)}._reduce will require a `keepdims` parameter " - "in the future", - FutureWarning, - stacklevel=find_stack_level(), - ) - result = values._reduce(name, skipna=skipna, **kwds) - return np.array([result]) + return values._reduce(name, skipna=skipna, keepdims=True, **kwds) else: return op(values, axis=axis, skipna=skipna, **kwds) @@ -11438,9 +11724,9 @@ def _get_data() -> DataFrame: if numeric_only: df = _get_data() if axis is None: - dtype = find_common_type([arr.dtype for arr in df._mgr.arrays]) + dtype = find_common_type([block.values.dtype for block in df._mgr.blocks]) if isinstance(dtype, ExtensionDtype): - df = df.astype(dtype, copy=False) + df = df.astype(dtype) arr = concat_compat(list(df._iter_column_arrays())) return arr._reduce(name, skipna=skipna, keepdims=False, **kwds) return func(df.values) @@ -11463,7 +11749,9 @@ def _get_data() -> DataFrame: # kurtosis excluded since groupby does not implement it if df.shape[1] and name != "kurt": - dtype = find_common_type([arr.dtype for arr in df._mgr.arrays]) + dtype = find_common_type( + [block.values.dtype for block in df._mgr.blocks] + ) if isinstance(dtype, ExtensionDtype): # GH 54341: fastpath for EA-backed axis=1 reductions # This flattens the frame into a single 1D array while keeping @@ -11472,7 +11760,7 @@ def _get_data() -> DataFrame: # be equivalent to transposing the original frame and aggregating # with axis=0. 
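            # Illustrative (not in the source): for a frame with two Float64
            # columns of length 3, the concatenated 1D array has length 6 and
            # ``row_index`` below tiles to [0, 1, 2, 0, 1, 2], so reducing
            # within each original row label reproduces the axis=1 result
            # without actually transposing.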
name = {"argmax": "idxmax", "argmin": "idxmin"}.get(name, name) - df = df.astype(dtype, copy=False) + df = df.astype(dtype) arr = concat_compat(list(df._iter_column_arrays())) nrows, ncols = df.shape row_index = np.tile(np.arange(nrows), ncols) @@ -11537,16 +11825,46 @@ def _reduce_axis1(self, name: str, func, skipna: bool) -> Series: else: raise NotImplementedError(name) - for arr in self._mgr.arrays: - middle = func(arr, axis=0, skipna=skipna) + for blocks in self._mgr.blocks: + middle = func(blocks.values, axis=0, skipna=skipna) result = ufunc(result, middle) res_ser = self._constructor_sliced(result, index=self.index, copy=False) return res_ser - @doc(make_doc("any", ndim=2)) # error: Signature of "any" incompatible with supertype "NDFrame" - def any( # type: ignore[override] + @overload # type: ignore[override] + def any( + self, + *, + axis: Axis = ..., + bool_only: bool = ..., + skipna: bool = ..., + **kwargs, + ) -> Series: ... + + @overload + def any( + self, + *, + axis: None, + bool_only: bool = ..., + skipna: bool = ..., + **kwargs, + ) -> bool: ... + + @overload + def any( + self, + *, + axis: Axis | None, + bool_only: bool = ..., + skipna: bool = ..., + **kwargs, + ) -> Series | bool: ... + + @doc(make_doc("any", ndim=1)) + def any( self, *, axis: Axis | None = 0, @@ -11561,34 +11879,131 @@ def any( # type: ignore[override] result = result.__finalize__(self, method="any") return result - @doc(make_doc("all", ndim=2)) + @overload def all( self, - axis: Axis | None = 0, - bool_only: bool = False, - skipna: bool = True, + *, + axis: Axis = ..., + bool_only: bool = ..., + skipna: bool = ..., **kwargs, - ) -> Series | bool: - result = self._logical_func( - "all", nanops.nanall, axis, bool_only, skipna, **kwargs - ) - if isinstance(result, Series): - result = result.__finalize__(self, method="all") - return result + ) -> Series: ... - @doc(make_doc("min", ndim=2)) + @overload + def all( + self, + *, + axis: None, + bool_only: bool = ..., + skipna: bool = ..., + **kwargs, + ) -> bool: ... + + @overload + def all( + self, + *, + axis: Axis | None, + bool_only: bool = ..., + skipna: bool = ..., + **kwargs, + ) -> Series | bool: ... + + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="all") + @doc(make_doc("all", ndim=1)) + def all( + self, + axis: Axis | None = 0, + bool_only: bool = False, + skipna: bool = True, + **kwargs, + ) -> Series | bool: + result = self._logical_func( + "all", nanops.nanall, axis, bool_only, skipna, **kwargs + ) + if isinstance(result, Series): + result = result.__finalize__(self, method="all") + return result + + # error: Signature of "min" incompatible with supertype "NDFrame" + @overload # type: ignore[override] + def min( + self, + *, + axis: Axis = ..., + skipna: bool = ..., + numeric_only: bool = ..., + **kwargs, + ) -> Series: ... + + @overload + def min( + self, + *, + axis: None, + skipna: bool = ..., + numeric_only: bool = ..., + **kwargs, + ) -> Any: ... + + @overload + def min( + self, + *, + axis: Axis | None, + skipna: bool = ..., + numeric_only: bool = ..., + **kwargs, + ) -> Series | Any: ... 
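    # Illustrative note (not from the diff): the overloads above type ``min``
    # by ``axis`` -- df.min() and df.min(axis=1) are typed as Series, while
    # df.min(axis=None) is typed as a scalar (Any). The
    # deprecate_nonkeyword_arguments decorator below additionally makes
    # positional calls such as df.min(1) emit FutureWarning now, with
    # keyword-only enforcement slated for pandas 4.0; prefer df.min(axis=1).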
+ + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="min") + @doc(make_doc("min", ndim=2)) def min( self, axis: Axis | None = 0, skipna: bool = True, numeric_only: bool = False, **kwargs, - ): - result = super().min(axis, skipna, numeric_only, **kwargs) + ) -> Series | Any: + result = super().min( + axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs + ) if isinstance(result, Series): result = result.__finalize__(self, method="min") return result + # error: Signature of "max" incompatible with supertype "NDFrame" + @overload # type: ignore[override] + def max( + self, + *, + axis: Axis = ..., + skipna: bool = ..., + numeric_only: bool = ..., + **kwargs, + ) -> Series: ... + + @overload + def max( + self, + *, + axis: None, + skipna: bool = ..., + numeric_only: bool = ..., + **kwargs, + ) -> Any: ... + + @overload + def max( + self, + *, + axis: Axis | None, + skipna: bool = ..., + numeric_only: bool = ..., + **kwargs, + ) -> Series | Any: ... + + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="max") @doc(make_doc("max", ndim=2)) def max( self, @@ -11596,13 +12011,15 @@ def max( skipna: bool = True, numeric_only: bool = False, **kwargs, - ): - result = super().max(axis, skipna, numeric_only, **kwargs) + ) -> Series | Any: + result = super().max( + axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs + ) if isinstance(result, Series): result = result.__finalize__(self, method="max") return result - @doc(make_doc("sum", ndim=2)) + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="sum") def sum( self, axis: Axis | None = 0, @@ -11610,11 +12027,100 @@ def sum( numeric_only: bool = False, min_count: int = 0, **kwargs, - ): - result = super().sum(axis, skipna, numeric_only, min_count, **kwargs) - return result.__finalize__(self, method="sum") + ) -> Series: + """ + Return the sum of the values over the requested axis. + + This is equivalent to the method ``numpy.sum``. + + Parameters + ---------- + axis : {index (0), columns (1)} + Axis for the function to be applied on. + For `Series` this parameter is unused and defaults to 0. + + .. warning:: + + The behavior of DataFrame.sum with ``axis=None`` is deprecated, + in a future version this will reduce over both axes and return a scalar + To retain the old behavior, pass axis=0 (or do not pass axis). + + .. versionadded:: 2.0.0 + + skipna : bool, default True + Exclude NA/null values when computing the result. + numeric_only : bool, default False + Include only float, int, boolean columns. Not implemented for Series. + min_count : int, default 0 + The required number of valid values to perform the operation. If fewer than + ``min_count`` non-NA values are present the result will be NA. + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + Series or scalar + Sum over requested axis. + + See Also + -------- + Series.sum : Return the sum over Series values. + DataFrame.mean : Return the mean of the values over the requested axis. + DataFrame.median : Return the median of the values over the requested axis. + DataFrame.mode : Get the mode(s) of each element along the requested axis. + DataFrame.std : Return the standard deviation of the values over the + requested axis. + + Examples + -------- + >>> idx = pd.MultiIndex.from_arrays( + ... [["warm", "warm", "cold", "cold"], ["dog", "falcon", "fish", "spider"]], + ... names=["blooded", "animal"], + ... 
) + >>> s = pd.Series([4, 2, 0, 8], name="legs", index=idx) + >>> s + blooded animal + warm dog 4 + falcon 2 + cold fish 0 + spider 8 + Name: legs, dtype: int64 + + >>> s.sum() + 14 + + By default, the sum of an empty or all-NA Series is ``0``. + + >>> pd.Series([], dtype="float64").sum() # min_count=0 is the default + 0.0 + + This can be controlled with the ``min_count`` parameter. For example, if + you'd like the sum of an empty series to be NaN, pass ``min_count=1``. + + >>> pd.Series([], dtype="float64").sum(min_count=1) + nan + + Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and + empty series identically. - @doc(make_doc("prod", ndim=2)) + >>> pd.Series([np.nan]).sum() + 0.0 + + >>> pd.Series([np.nan]).sum(min_count=1) + nan + """ + result = super().sum( + axis=axis, + skipna=skipna, + numeric_only=numeric_only, + min_count=min_count, + **kwargs, + ) + if isinstance(result, Series): + result = result.__finalize__(self, method="sum") + return result + + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="prod") def prod( self, axis: Axis | None = 0, @@ -11622,122 +12128,840 @@ def prod( numeric_only: bool = False, min_count: int = 0, **kwargs, - ): - result = super().prod(axis, skipna, numeric_only, min_count, **kwargs) - return result.__finalize__(self, method="prod") + ) -> Series: + """ + Return the product of the values over the requested axis. + + Parameters + ---------- + axis : {index (0), columns (1)} + Axis for the function to be applied on. + For `Series` this parameter is unused and defaults to 0. + + .. warning:: + The behavior of DataFrame.prod with ``axis=None`` is deprecated, + in a future version this will reduce over both axes and return a scalar + To retain the old behavior, pass axis=0 (or do not pass axis). + + .. versionadded:: 2.0.0 + + skipna : bool, default True + Exclude NA/null values when computing the result. + numeric_only : bool, default False + Include only float, int, boolean columns. Not implemented for Series. + + min_count : int, default 0 + The required number of valid values to perform the operation. If fewer than + ``min_count`` non-NA values are present the result will be NA. + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + Series or scalar + The product of the values over the requested axis. + + See Also + -------- + Series.sum : Return the sum. + Series.min : Return the minimum. + Series.max : Return the maximum. + Series.idxmin : Return the index of the minimum. + Series.idxmax : Return the index of the maximum. + DataFrame.sum : Return the sum over the requested axis. + DataFrame.min : Return the minimum over the requested axis. + DataFrame.max : Return the maximum over the requested axis. + DataFrame.idxmin : Return the index of the minimum over the requested axis. + DataFrame.idxmax : Return the index of the maximum over the requested axis. + + Examples + -------- + By default, the product of an empty or all-NA Series is ``1`` + + >>> pd.Series([], dtype="float64").prod() + 1.0 + + This can be controlled with the ``min_count`` parameter + + >>> pd.Series([], dtype="float64").prod(min_count=1) + nan + + Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and + empty series identically. 
+ + >>> pd.Series([np.nan]).prod() + 1.0 + + >>> pd.Series([np.nan]).prod(min_count=1) + nan + """ + result = super().prod( + axis=axis, + skipna=skipna, + numeric_only=numeric_only, + min_count=min_count, + **kwargs, + ) + if isinstance(result, Series): + result = result.__finalize__(self, method="prod") + return result + + # error: Signature of "mean" incompatible with supertype "NDFrame" + @overload # type: ignore[override] + def mean( + self, + *, + axis: Axis = ..., + skipna: bool = ..., + numeric_only: bool = ..., + **kwargs, + ) -> Series: ... + + @overload + def mean( + self, + *, + axis: None, + skipna: bool = ..., + numeric_only: bool = ..., + **kwargs, + ) -> Any: ... + + @overload + def mean( + self, + *, + axis: Axis | None, + skipna: bool = ..., + numeric_only: bool = ..., + **kwargs, + ) -> Series | Any: ... + + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="mean") @doc(make_doc("mean", ndim=2)) def mean( self, - axis: Axis | None = 0, - skipna: bool = True, - numeric_only: bool = False, + axis: Axis | None = 0, + skipna: bool = True, + numeric_only: bool = False, + **kwargs, + ) -> Series | Any: + result = super().mean( + axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs + ) + if isinstance(result, Series): + result = result.__finalize__(self, method="mean") + return result + + # error: Signature of "median" incompatible with supertype "NDFrame" + @overload # type: ignore[override] + def median( + self, + *, + axis: Axis = ..., + skipna: bool = ..., + numeric_only: bool = ..., + **kwargs, + ) -> Series: ... + + @overload + def median( + self, + *, + axis: None, + skipna: bool = ..., + numeric_only: bool = ..., + **kwargs, + ) -> Any: ... + + @overload + def median( + self, + *, + axis: Axis | None, + skipna: bool = ..., + numeric_only: bool = ..., + **kwargs, + ) -> Series | Any: ... + + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="median") + @doc(make_doc("median", ndim=2)) + def median( + self, + axis: Axis | None = 0, + skipna: bool = True, + numeric_only: bool = False, + **kwargs, + ) -> Series | Any: + result = super().median( + axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs + ) + if isinstance(result, Series): + result = result.__finalize__(self, method="median") + return result + + # error: Signature of "sem" incompatible with supertype "NDFrame" + @overload # type: ignore[override] + def sem( + self, + *, + axis: Axis = ..., + skipna: bool = ..., + ddof: int = ..., + numeric_only: bool = ..., + **kwargs, + ) -> Series: ... + + @overload + def sem( + self, + *, + axis: None, + skipna: bool = ..., + ddof: int = ..., + numeric_only: bool = ..., + **kwargs, + ) -> Any: ... + + @overload + def sem( + self, + *, + axis: Axis | None, + skipna: bool = ..., + ddof: int = ..., + numeric_only: bool = ..., + **kwargs, + ) -> Series | Any: ... + + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="sem") + def sem( + self, + axis: Axis | None = 0, + skipna: bool = True, + ddof: int = 1, + numeric_only: bool = False, + **kwargs, + ) -> Series | Any: + """ + Return unbiased standard error of the mean over requested axis. + + Normalized by N-1 by default. This can be changed using the ddof argument + + Parameters + ---------- + axis : {index (0), columns (1)} + For `Series` this parameter is unused and defaults to 0. + + .. 
warning:: + + The behavior of DataFrame.sem with ``axis=None`` is deprecated, + in a future version this will reduce over both axes and return a scalar + To retain the old behavior, pass axis=0 (or do not pass axis). + + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations is N - ddof, + where N represents the number of elements. + numeric_only : bool, default False + Include only float, int, boolean columns. Not implemented for Series. + **kwargs : + Additional keywords passed. + + Returns + ------- + Series or DataFrame (if level specified) + Unbiased standard error of the mean over requested axis. + + See Also + -------- + DataFrame.var : Return unbiased variance over requested axis. + DataFrame.std : Returns sample standard deviation over requested axis. + + Examples + -------- + >>> s = pd.Series([1, 2, 3]) + >>> s.sem().round(6) + 0.57735 + + With a DataFrame + + >>> df = pd.DataFrame({"a": [1, 2], "b": [2, 3]}, index=["tiger", "zebra"]) + >>> df + a b + tiger 1 2 + zebra 2 3 + >>> df.sem() + a 0.5 + b 0.5 + dtype: float64 + + Using axis=1 + + >>> df.sem(axis=1) + tiger 0.5 + zebra 0.5 + dtype: float64 + + In this case, `numeric_only` should be set to `True` + to avoid getting an error. + + >>> df = pd.DataFrame({"a": [1, 2], "b": ["T", "Z"]}, index=["tiger", "zebra"]) + >>> df.sem(numeric_only=True) + a 0.5 + dtype: float64 + """ + result = super().sem( + axis=axis, skipna=skipna, ddof=ddof, numeric_only=numeric_only, **kwargs + ) + if isinstance(result, Series): + result = result.__finalize__(self, method="sem") + return result + + # error: Signature of "var" incompatible with supertype "NDFrame" + @overload # type: ignore[override] + def var( + self, + *, + axis: Axis = ..., + skipna: bool = ..., + ddof: int = ..., + numeric_only: bool = ..., + **kwargs, + ) -> Series: ... + + @overload + def var( + self, + *, + axis: None, + skipna: bool = ..., + ddof: int = ..., + numeric_only: bool = ..., + **kwargs, + ) -> Any: ... + + @overload + def var( + self, + *, + axis: Axis | None, + skipna: bool = ..., + ddof: int = ..., + numeric_only: bool = ..., + **kwargs, + ) -> Series | Any: ... + + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="var") + def var( + self, + axis: Axis | None = 0, + skipna: bool = True, + ddof: int = 1, + numeric_only: bool = False, + **kwargs, + ) -> Series | Any: + """ + Return unbiased variance over requested axis. + + Normalized by N-1 by default. This can be changed using the ddof argument. + + Parameters + ---------- + axis : {index (0), columns (1)} + For `Series` this parameter is unused and defaults to 0. + + .. warning:: + + The behavior of DataFrame.var with ``axis=None`` is deprecated, + in a future version this will reduce over both axes and return a scalar + To retain the old behavior, pass axis=0 (or do not pass axis). + + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations is N - ddof, + where N represents the number of elements. + numeric_only : bool, default False + Include only float, int, boolean columns. Not implemented for Series. + **kwargs : + Additional keywords passed. + + Returns + ------- + Series or scalaer + Unbiased variance over requested axis. + + See Also + -------- + numpy.var : Equivalent function in NumPy. 
+ Series.var : Return unbiased variance over Series values. + Series.std : Return standard deviation over Series values. + DataFrame.std : Return standard deviation of the values over + the requested axis. + + Examples + -------- + >>> df = pd.DataFrame( + ... { + ... "person_id": [0, 1, 2, 3], + ... "age": [21, 25, 62, 43], + ... "height": [1.61, 1.87, 1.49, 2.01], + ... } + ... ).set_index("person_id") + >>> df + age height + person_id + 0 21 1.61 + 1 25 1.87 + 2 62 1.49 + 3 43 2.01 + + >>> df.var() + age 352.916667 + height 0.056367 + dtype: float64 + + Alternatively, ``ddof=0`` can be set to normalize by N instead of N-1: + + >>> df.var(ddof=0) + age 264.687500 + height 0.042275 + dtype: float64 + """ + result = super().var( + axis=axis, skipna=skipna, ddof=ddof, numeric_only=numeric_only, **kwargs + ) + if isinstance(result, Series): + result = result.__finalize__(self, method="var") + return result + + # error: Signature of "std" incompatible with supertype "NDFrame" + @overload # type: ignore[override] + def std( + self, + *, + axis: Axis = ..., + skipna: bool = ..., + ddof: int = ..., + numeric_only: bool = ..., + **kwargs, + ) -> Series: ... + + @overload + def std( + self, + *, + axis: None, + skipna: bool = ..., + ddof: int = ..., + numeric_only: bool = ..., + **kwargs, + ) -> Any: ... + + @overload + def std( + self, + *, + axis: Axis | None, + skipna: bool = ..., + ddof: int = ..., + numeric_only: bool = ..., + **kwargs, + ) -> Series | Any: ... + + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="std") + def std( + self, + axis: Axis | None = 0, + skipna: bool = True, + ddof: int = 1, + numeric_only: bool = False, + **kwargs, + ) -> Series | Any: + """ + Return sample standard deviation over requested axis. + + Normalized by N-1 by default. This can be changed using the ddof argument. + + Parameters + ---------- + axis : {index (0), columns (1)} + For `Series` this parameter is unused and defaults to 0. + + .. warning:: + + The behavior of DataFrame.std with ``axis=None`` is deprecated, + in a future version this will reduce over both axes and return a scalar. + To retain the old behavior, pass axis=0 (or do not pass axis). + + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations is N - ddof, + where N represents the number of elements. + numeric_only : bool, default False + Include only float, int, boolean columns. Not implemented for Series. + **kwargs : dict + Additional keyword arguments to be passed to the function. + + Returns + ------- + Series or scalar + Standard deviation over requested axis. + + See Also + -------- + Series.std : Return standard deviation over Series values. + DataFrame.mean : Return the mean of the values over the requested axis. + DataFrame.median : Return the median of the values over the requested axis. + DataFrame.mode : Get the mode(s) of each element along the requested axis. + DataFrame.sum : Return the sum of the values over the requested axis. + + Notes + ----- + To have the same behaviour as `numpy.std`, use `ddof=0` (instead of the + default `ddof=1`). + + Examples + -------- + >>> df = pd.DataFrame( + ... { + ... "person_id": [0, 1, 2, 3], + ... "age": [21, 25, 62, 43], + ... "height": [1.61, 1.87, 1.49, 2.01], + ... } + ... 
).set_index("person_id") + >>> df + age height + person_id + 0 21 1.61 + 1 25 1.87 + 2 62 1.49 + 3 43 2.01 + + The standard deviation of the columns can be found as follows: + + >>> df.std() + age 18.786076 + height 0.237417 + dtype: float64 + + Alternatively, `ddof=0` can be set to normalize by N instead of N-1: + + >>> df.std(ddof=0) + age 16.269219 + height 0.205609 + dtype: float64 + """ + result = super().std( + axis=axis, skipna=skipna, ddof=ddof, numeric_only=numeric_only, **kwargs + ) + if isinstance(result, Series): + result = result.__finalize__(self, method="std") + return result + + # error: Signature of "skew" incompatible with supertype "NDFrame" + @overload # type: ignore[override] + def skew( + self, + *, + axis: Axis = ..., + skipna: bool = ..., + numeric_only: bool = ..., + **kwargs, + ) -> Series: ... + + @overload + def skew( + self, + *, + axis: None, + skipna: bool = ..., + numeric_only: bool = ..., + **kwargs, + ) -> Any: ... + + @overload + def skew( + self, + *, + axis: Axis | None, + skipna: bool = ..., + numeric_only: bool = ..., + **kwargs, + ) -> Series | Any: ... + + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="skew") + def skew( + self, + axis: Axis | None = 0, + skipna: bool = True, + numeric_only: bool = False, + **kwargs, + ) -> Series | Any: + """ + Return unbiased skew over requested axis. + + Normalized by N-1. + + Parameters + ---------- + axis : {index (0), columns (1)} + Axis for the function to be applied on. + For `Series` this parameter is unused and defaults to 0. + + For DataFrames, specifying ``axis=None`` will apply the aggregation + across both axes. + + .. versionadded:: 2.0.0 + + skipna : bool, default True + Exclude NA/null values when computing the result. + numeric_only : bool, default False + Include only float, int, boolean columns. + + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + Series or scalar + Unbiased skew over requested axis. + + See Also + -------- + Dataframe.kurt : Returns unbiased kurtosis over requested axis. + + Examples + -------- + >>> s = pd.Series([1, 2, 3]) + >>> s.skew() + 0.0 + + With a DataFrame + + >>> df = pd.DataFrame( + ... {"a": [1, 2, 3], "b": [2, 3, 4], "c": [1, 3, 5]}, + ... index=["tiger", "zebra", "cow"], + ... ) + >>> df + a b c + tiger 1 2 1 + zebra 2 3 3 + cow 3 4 5 + >>> df.skew() + a 0.0 + b 0.0 + c 0.0 + dtype: float64 + + Using axis=1 + + >>> df.skew(axis=1) + tiger 1.732051 + zebra -1.732051 + cow 0.000000 + dtype: float64 + + In this case, `numeric_only` should be set to `True` to avoid + getting an error. + + >>> df = pd.DataFrame( + ... {"a": [1, 2, 3], "b": ["T", "Z", "X"]}, index=["tiger", "zebra", "cow"] + ... ) + >>> df.skew(numeric_only=True) + a 0.0 + dtype: float64 + """ + result = super().skew( + axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs + ) + if isinstance(result, Series): + result = result.__finalize__(self, method="skew") + return result + + # error: Signature of "kurt" incompatible with supertype "NDFrame" + @overload # type: ignore[override] + def kurt( + self, + *, + axis: Axis = ..., + skipna: bool = ..., + numeric_only: bool = ..., + **kwargs, + ) -> Series: ... + + @overload + def kurt( + self, + *, + axis: None, + skipna: bool = ..., + numeric_only: bool = ..., **kwargs, - ): - result = super().mean(axis, skipna, numeric_only, **kwargs) - if isinstance(result, Series): - result = result.__finalize__(self, method="mean") - return result + ) -> Any: ... 
- @doc(make_doc("median", ndim=2)) - def median( + @overload + def kurt( self, - axis: Axis | None = 0, - skipna: bool = True, - numeric_only: bool = False, + *, + axis: Axis | None, + skipna: bool = ..., + numeric_only: bool = ..., **kwargs, - ): - result = super().median(axis, skipna, numeric_only, **kwargs) - if isinstance(result, Series): - result = result.__finalize__(self, method="median") - return result + ) -> Series | Any: ... - @doc(make_doc("sem", ndim=2)) - def sem( + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="kurt") + def kurt( self, axis: Axis | None = 0, skipna: bool = True, - ddof: int = 1, numeric_only: bool = False, **kwargs, - ): - result = super().sem(axis, skipna, ddof, numeric_only, **kwargs) + ) -> Series | Any: + """ + Return unbiased kurtosis over requested axis. + + Kurtosis obtained using Fisher's definition of + kurtosis (kurtosis of normal == 0.0). Normalized by N-1. + + Parameters + ---------- + axis : {index (0), columns (1)} + Axis for the function to be applied on. + For `Series` this parameter is unused and defaults to 0. + + For DataFrames, specifying ``axis=None`` will apply the aggregation + across both axes. + + .. versionadded:: 2.0.0 + + skipna : bool, default True + Exclude NA/null values when computing the result. + numeric_only : bool, default False + Include only float, int, boolean columns. + + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + Series or scalar + Unbiased kurtosis over requested axis. + + See Also + -------- + Dataframe.kurtosis : Returns unbiased kurtosis over requested axis. + + Examples + -------- + >>> s = pd.Series([1, 2, 2, 3], index=["cat", "dog", "dog", "mouse"]) + >>> s + cat 1 + dog 2 + dog 2 + mouse 3 + dtype: int64 + >>> s.kurt() + 1.5 + + With a DataFrame + + >>> df = pd.DataFrame( + ... {"a": [1, 2, 2, 3], "b": [3, 4, 4, 4]}, + ... index=["cat", "dog", "dog", "mouse"], + ... ) + >>> df + a b + cat 1 3 + dog 2 4 + dog 2 4 + mouse 3 4 + >>> df.kurt() + a 1.5 + b 4.0 + dtype: float64 + + With axis=None + + >>> df.kurt(axis=None).round(6) + -0.988693 + + Using axis=1 + + >>> df = pd.DataFrame( + ... {"a": [1, 2], "b": [3, 4], "c": [3, 4], "d": [1, 2]}, + ... index=["cat", "dog"], + ... 
) + >>> df.kurt(axis=1) + cat -6.0 + dog -6.0 + dtype: float64 + """ + result = super().kurt( + axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs + ) if isinstance(result, Series): - result = result.__finalize__(self, method="sem") + result = result.__finalize__(self, method="kurt") return result - @doc(make_doc("var", ndim=2)) - def var( + # error: Incompatible types in assignment + kurtosis = kurt # type: ignore[assignment] + product = prod + + @doc(make_doc("cummin", ndim=2)) + def cummin( self, - axis: Axis | None = 0, + axis: Axis = 0, skipna: bool = True, - ddof: int = 1, numeric_only: bool = False, + *args, **kwargs, - ): - result = super().var(axis, skipna, ddof, numeric_only, **kwargs) - if isinstance(result, Series): - result = result.__finalize__(self, method="var") - return result + ) -> Self: + data = self._get_numeric_data() if numeric_only else self + return NDFrame.cummin(data, axis, skipna, *args, **kwargs) - @doc(make_doc("std", ndim=2)) - def std( + @doc(make_doc("cummax", ndim=2)) + def cummax( self, - axis: Axis | None = 0, + axis: Axis = 0, skipna: bool = True, - ddof: int = 1, numeric_only: bool = False, + *args, **kwargs, - ): - result = super().std(axis, skipna, ddof, numeric_only, **kwargs) - if isinstance(result, Series): - result = result.__finalize__(self, method="std") - return result + ) -> Self: + data = self._get_numeric_data() if numeric_only else self + return NDFrame.cummax(data, axis, skipna, *args, **kwargs) - @doc(make_doc("skew", ndim=2)) - def skew( + @doc(make_doc("cumsum", ndim=2)) + def cumsum( self, - axis: Axis | None = 0, + axis: Axis = 0, skipna: bool = True, numeric_only: bool = False, + *args, **kwargs, - ): - result = super().skew(axis, skipna, numeric_only, **kwargs) - if isinstance(result, Series): - result = result.__finalize__(self, method="skew") - return result + ) -> Self: + data = self._get_numeric_data() if numeric_only else self + return NDFrame.cumsum(data, axis, skipna, *args, **kwargs) - @doc(make_doc("kurt", ndim=2)) - def kurt( + @doc(make_doc("cumprod", 2)) + def cumprod( self, - axis: Axis | None = 0, + axis: Axis = 0, skipna: bool = True, numeric_only: bool = False, + *args, **kwargs, - ): - result = super().kurt(axis, skipna, numeric_only, **kwargs) - if isinstance(result, Series): - result = result.__finalize__(self, method="kurt") - return result - - kurtosis = kurt - product = prod - - @doc(make_doc("cummin", ndim=2)) - def cummin(self, axis: Axis | None = None, skipna: bool = True, *args, **kwargs): - return NDFrame.cummin(self, axis, skipna, *args, **kwargs) - - @doc(make_doc("cummax", ndim=2)) - def cummax(self, axis: Axis | None = None, skipna: bool = True, *args, **kwargs): - return NDFrame.cummax(self, axis, skipna, *args, **kwargs) - - @doc(make_doc("cumsum", ndim=2)) - def cumsum(self, axis: Axis | None = None, skipna: bool = True, *args, **kwargs): - return NDFrame.cumsum(self, axis, skipna, *args, **kwargs) - - @doc(make_doc("cumprod", 2)) - def cumprod(self, axis: Axis | None = None, skipna: bool = True, *args, **kwargs): - return NDFrame.cumprod(self, axis, skipna, *args, **kwargs) + ) -> Self: + data = self._get_numeric_data() if numeric_only else self + return NDFrame.cumprod(data, axis, skipna, *args, **kwargs) def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series: """ @@ -11757,6 +12981,7 @@ def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series: Returns ------- Series + Series with counts of unique values per row or column, depending on `axis`. 
See Also -------- @@ -11765,7 +12990,7 @@ def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series: Examples -------- - >>> df = pd.DataFrame({'A': [4, 5, 6], 'B': [4, 1, 1]}) + >>> df = pd.DataFrame({"A": [4, 5, 6], "B": [4, 1, 1]}) >>> df.nunique() A 3 B 2 @@ -11779,10 +13004,80 @@ def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series: """ return self.apply(Series.nunique, axis=axis, dropna=dropna) - @doc(_shared_docs["idxmin"], numeric_only_default="False") def idxmin( self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False ) -> Series: + """ + Return index of first occurrence of minimum over requested axis. + + NA/null values are excluded. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. + skipna : bool, default True + Exclude NA/null values. If the entire DataFrame is NA, + or if ``skipna=False`` and there is an NA value, this method + will raise a ``ValueError``. + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + + .. versionadded:: 1.5.0 + + Returns + ------- + Series + Indexes of minima along the specified axis. + + Raises + ------ + ValueError + * If the row/column is empty + + See Also + -------- + Series.idxmin : Return index of the minimum element. + + Notes + ----- + This method is the DataFrame version of ``ndarray.argmin``. + + Examples + -------- + Consider a dataset containing food consumption in Argentina. + + >>> df = pd.DataFrame( + ... { + ... "consumption": [10.51, 103.11, 55.48], + ... "co2_emissions": [37.2, 19.66, 1712], + ... }, + ... index=["Pork", "Wheat Products", "Beef"], + ... ) + + >>> df + consumption co2_emissions + Pork 10.51 37.20 + Wheat Products 103.11 19.66 + Beef 55.48 1712.00 + + By default, it returns the index for the minimum value in each column. + + >>> df.idxmin() + consumption Pork + co2_emissions Wheat Products + dtype: object + + To return the index for the minimum value in each row, use ``axis="columns"``. + + >>> df.idxmin(axis="columns") + Pork consumption + Wheat Products co2_emissions + Beef consumption + dtype: object + """ axis = self._get_axis_number(axis) if self.empty and len(self.axes[axis]): @@ -11816,10 +13111,80 @@ def idxmin( final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis)) return final_result.__finalize__(self, method="idxmin") - @doc(_shared_docs["idxmax"], numeric_only_default="False") def idxmax( self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False ) -> Series: + """ + Return index of first occurrence of maximum over requested axis. + + NA/null values are excluded. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. + skipna : bool, default True + Exclude NA/null values. If the entire DataFrame is NA, + or if ``skipna=False`` and there is an NA value, this method + will raise a ``ValueError``. + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + + .. versionadded:: 1.5.0 + + Returns + ------- + Series + Indexes of maxima along the specified axis. + + Raises + ------ + ValueError + * If the row/column is empty + + See Also + -------- + Series.idxmax : Return index of the maximum element. + + Notes + ----- + This method is the DataFrame version of ``ndarray.argmax``. 
+ + Examples + -------- + Consider a dataset containing food consumption in Argentina. + + >>> df = pd.DataFrame( + ... { + ... "consumption": [10.51, 103.11, 55.48], + ... "co2_emissions": [37.2, 19.66, 1712], + ... }, + ... index=["Pork", "Wheat Products", "Beef"], + ... ) + + >>> df + consumption co2_emissions + Pork 10.51 37.20 + Wheat Products 103.11 19.66 + Beef 55.48 1712.00 + + By default, it returns the index for the maximum value in each column. + + >>> df.idxmax() + consumption Wheat Products + co2_emissions Beef + dtype: object + + To return the index for the maximum value in each row, use ``axis="columns"``. + + >>> df.idxmax(axis="columns") + Pork co2_emissions + Wheat Products consumption + Beef co2_emissions + dtype: object + """ axis = self._get_axis_number(axis) if self.empty and len(self.axes[axis]): @@ -11862,7 +13227,7 @@ def _get_agg_axis(self, axis_num: int) -> Index: elif axis_num == 1: return self.index else: - raise ValueError(f"Axis must be 0 or 1 (got {repr(axis_num)})") + raise ValueError(f"Axis must be 0 or 1 (got {axis_num!r})") def mode( self, axis: Axis = 0, numeric_only: bool = False, dropna: bool = True @@ -11898,12 +13263,16 @@ def mode( Examples -------- - >>> df = pd.DataFrame([('bird', 2, 2), - ... ('mammal', 4, np.nan), - ... ('arthropod', 8, 0), - ... ('bird', 2, np.nan)], - ... index=('falcon', 'horse', 'spider', 'ostrich'), - ... columns=('species', 'legs', 'wings')) + >>> df = pd.DataFrame( + ... [ + ... ("bird", 2, 2), + ... ("mammal", 4, np.nan), + ... ("arthropod", 8, 0), + ... ("bird", 2, np.nan), + ... ], + ... index=("falcon", "horse", "spider", "ostrich"), + ... columns=("species", "legs", "wings"), + ... ) >>> df species legs wings falcon bird 2 2.0 @@ -11937,7 +13306,7 @@ def mode( To compute the mode over columns and not rows, use the axis parameter: - >>> df.mode(axis='columns', numeric_only=True) + >>> df.mode(axis="columns", numeric_only=True) 0 1 falcon 2.0 NaN horse 4.0 NaN @@ -11964,8 +13333,7 @@ def quantile( numeric_only: bool = ..., interpolation: QuantileInterpolation = ..., method: Literal["single", "table"] = ..., - ) -> Series: - ... + ) -> Series: ... @overload def quantile( @@ -11975,8 +13343,7 @@ def quantile( numeric_only: bool = ..., interpolation: QuantileInterpolation = ..., method: Literal["single", "table"] = ..., - ) -> Series | DataFrame: - ... + ) -> Series | DataFrame: ... @overload def quantile( @@ -11986,8 +13353,7 @@ def quantile( numeric_only: bool = ..., interpolation: QuantileInterpolation = ..., method: Literal["single", "table"] = ..., - ) -> Series | DataFrame: - ... + ) -> Series | DataFrame: ... def quantile( self, @@ -12044,24 +13410,25 @@ def quantile( Examples -------- - >>> df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), - ... columns=['a', 'b']) - >>> df.quantile(.1) + >>> df = pd.DataFrame( + ... np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), columns=["a", "b"] + ... ) + >>> df.quantile(0.1) a 1.3 b 3.7 Name: 0.1, dtype: float64 - >>> df.quantile([.1, .5]) + >>> df.quantile([0.1, 0.5]) a b 0.1 1.3 3.7 0.5 2.5 55.0 Specifying `method='table'` will compute the quantile over all columns. 
- >>> df.quantile(.1, method="table", interpolation="nearest") + >>> df.quantile(0.1, method="table", interpolation="nearest") a 1 b 1 Name: 0.1, dtype: int64 - >>> df.quantile([.1, .5], method="table", interpolation="nearest") + >>> df.quantile([0.1, 0.5], method="table", interpolation="nearest") a b 0.1 1 1 0.5 3 100 @@ -12069,11 +13436,13 @@ def quantile( Specifying `numeric_only=False` will also compute the quantile of datetime and timedelta data. - >>> df = pd.DataFrame({'A': [1, 2], - ... 'B': [pd.Timestamp('2010'), - ... pd.Timestamp('2011')], - ... 'C': [pd.Timedelta('1 days'), - ... pd.Timedelta('2 days')]}) + >>> df = pd.DataFrame( + ... { + ... "A": [1, 2], + ... "B": [pd.Timestamp("2010"), pd.Timestamp("2011")], + ... "C": [pd.Timedelta("1 days"), pd.Timedelta("2 days")], + ... } + ... ) >>> df.quantile(0.5, numeric_only=False) A 1.5 B 2010-07-02 12:00:00 @@ -12114,7 +13483,7 @@ def quantile( if len(data.columns) == 0: # GH#23925 _get_numeric_data may have dropped all columns - cols = Index([], name=self.columns.name) + cols = self.columns[:0] dtype = np.float64 if axis == 1: @@ -12169,10 +13538,12 @@ def to_timestamp( freq: Frequency | None = None, how: ToTimestampHow = "start", axis: Axis = 0, - copy: bool | None = None, + copy: bool | lib.NoDefault = lib.no_default, ) -> DataFrame: """ - Cast to DatetimeIndex of timestamps, at *beginning* of period. + Cast PeriodIndex to DatetimeIndex of timestamps, at the *beginning* of the period. + + This can be changed to the *end* of the period by specifying `how="e"`. Parameters ---------- @@ -12183,7 +13554,7 @@ def to_timestamp( vs. end. axis : {0 or 'index', 1 or 'columns'}, default 0 The axis to convert (the index by default). - copy : bool, default True + copy : bool, default False If False then underlying input data is not copied. .. note:: @@ -12198,15 +13569,22 @@ def to_timestamp( You can already get the future behavior and improvements through enabling copy on write ``pd.options.mode.copy_on_write = True`` + .. deprecated:: 3.0.0 + Returns ------- - DataFrame - The DataFrame has a DatetimeIndex. + DataFrame with DatetimeIndex + DataFrame with the PeriodIndex cast to DatetimeIndex. + + See Also + -------- + DataFrame.to_period: Inverse method to cast DatetimeIndex to PeriodIndex. + Series.to_timestamp: Equivalent method for Series. Examples -------- - >>> idx = pd.PeriodIndex(['2023', '2024'], freq='Y') - >>> d = {'col1': [1, 2], 'col2': [3, 4]} + >>> idx = pd.PeriodIndex(["2023", "2024"], freq="Y") + >>> d = {"col1": [1, 2], "col2": [3, 4]} >>> df1 = pd.DataFrame(data=d, index=idx) >>> df1 col1 col2 @@ -12226,7 +13604,7 @@ def to_timestamp( Using `freq`, which is the offset that the Timestamps will have >>> df2 = pd.DataFrame(data=d, index=idx) - >>> df2 = df2.to_timestamp(freq='M') + >>> df2 = df2.to_timestamp(freq="M") >>> df2 col1 col2 2023-01-31 1 3 @@ -12234,7 +13612,8 @@ def to_timestamp( >>> df2.index DatetimeIndex(['2023-01-31', '2024-01-31'], dtype='datetime64[ns]', freq=None) """ - new_obj = self.copy(deep=copy and not using_copy_on_write()) + self._check_copy_deprecation(copy) + new_obj = self.copy(deep=False) axis_name = self._get_axis_name(axis) old_ax = getattr(self, axis_name) @@ -12247,13 +13626,17 @@ def to_timestamp( return new_obj def to_period( - self, freq: Frequency | None = None, axis: Axis = 0, copy: bool | None = None + self, + freq: Frequency | None = None, + axis: Axis = 0, + copy: bool | lib.NoDefault = lib.no_default, ) -> DataFrame: """ Convert DataFrame from DatetimeIndex to PeriodIndex. 
Convert DataFrame from DatetimeIndex to PeriodIndex with desired - frequency (inferred from index if not passed). + frequency (inferred from index if not passed). Either the index or the columns + can be converted, depending on the `axis` argument. Parameters ---------- @@ -12261,7 +13644,7 @@ def to_period( Frequency of the PeriodIndex. axis : {0 or 'index', 1 or 'columns'}, default 0 The axis to convert (the index by default). - copy : bool, default True + copy : bool, default False If False then underlying input data is not copied. .. note:: @@ -12276,10 +13659,17 @@ def to_period( You can already get the future behavior and improvements through enabling copy on write ``pd.options.mode.copy_on_write = True`` + .. deprecated:: 3.0.0 + Returns ------- DataFrame - The DataFrame has a PeriodIndex. + The DataFrame with the converted PeriodIndex. + + See Also + -------- + Series.to_period: Equivalent method for Series. + Series.dt.to_period: Convert DateTime column values. Examples -------- @@ -12293,7 +13683,7 @@ def to_period( >>> idx DatetimeIndex(['2001-03-31', '2002-05-31', '2003-08-31'], - dtype='datetime64[ns]', freq=None) + dtype='datetime64[s]', freq=None) >>> idx.to_period("M") PeriodIndex(['2001-03', '2002-05', '2003-08'], dtype='period[M]') @@ -12303,7 +13693,8 @@ def to_period( >>> idx.to_period("Y") PeriodIndex(['2001', '2002', '2003'], dtype='period[Y-DEC]') """ - new_obj = self.copy(deep=copy and not using_copy_on_write()) + self._check_copy_deprecation(copy) + new_obj = self.copy(deep=False) axis_name = self._get_axis_name(axis) old_ax = getattr(self, axis_name) @@ -12341,10 +13732,16 @@ def isin(self, values: Series | DataFrame | Sequence | Mapping) -> DataFrame: Series.str.contains: Test if pattern or regex is contained within a string of a Series or Index. + Notes + ----- + ``__iter__`` is used (and not ``__contains__``) to iterate over values + when checking if the DataFrame contains the given elements. + Examples -------- - >>> df = pd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]}, - ... index=['falcon', 'dog']) + >>> df = pd.DataFrame( + ... {"num_legs": [2, 4], "num_wings": [2, 0]}, index=["falcon", "dog"] + ... ) >>> df num_legs num_wings falcon 2 2 @@ -12368,7 +13765,7 @@ def isin(self, values: Series | DataFrame | Sequence | Mapping) -> DataFrame: When ``values`` is a dict, we can pass values to check for each column separately: - >>> df.isin({'num_wings': [0, 3]}) + >>> df.isin({"num_wings": [0, 3]}) num_legs num_wings falcon False False dog False True @@ -12377,8 +13774,9 @@ def isin(self, values: Series | DataFrame | Sequence | Mapping) -> DataFrame: match. Note that 'falcon' does not match based on the number of legs in other. - >>> other = pd.DataFrame({'num_legs': [8, 3], 'num_wings': [0, 2]}, - ... index=['spider', 'falcon']) + >>> other = pd.DataFrame( + ... {"num_legs": [8, 3], "num_wings": [0, 2]}, index=["spider", "falcon"] + ... ) >>> df.isin(other) num_legs num_wings falcon False True @@ -12489,29 +13887,41 @@ def isin_(x): ) columns = properties.AxisProperty( axis=0, - doc=dedent( - """ - The column labels of the DataFrame. - - Examples - -------- - >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) - >>> df - A B - 0 1 3 - 1 2 4 - >>> df.columns - Index(['A', 'B'], dtype='object') - """ - ), + doc=""" + The column labels of the DataFrame. + + This property holds the column names as a pandas ``Index`` object. + It provides an immutable sequence of column labels that can be + used for data selection, renaming, and alignment in DataFrame operations. 
+ + Returns + ------- + pandas.Index + The column labels of the DataFrame. + + See Also + -------- + DataFrame.index: The index (row labels) of the DataFrame. + DataFrame.axes: Return a list representing the axes of the DataFrame. + + Examples + -------- + >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) + >>> df + A B + 0 1 3 + 1 2 4 + >>> df.columns + Index(['A', 'B'], dtype='object') + """, ) # ---------------------------------------------------------------------- # Add plotting methods to DataFrame - plot = CachedAccessor("plot", pandas.plotting.PlotAccessor) + plot = Accessor("plot", pandas.plotting.PlotAccessor) hist = pandas.plotting.hist_frame boxplot = pandas.plotting.boxplot_frame - sparse = CachedAccessor("sparse", SparseFrameAccessor) + sparse = Accessor("sparse", SparseFrameAccessor) # ---------------------------------------------------------------------- # Internal Interface Methods @@ -12521,14 +13931,12 @@ def _to_dict_of_blocks(self): Return a dict of dtype -> Constructor Types that each is a homogeneous dtype. - Internal ONLY - only works for BlockManager + Internal ONLY. """ mgr = self._mgr - # convert to BlockManager if needed -> this way support ArrayManager as well - mgr = cast(BlockManager, mgr_to_mgr(mgr, "block")) return { k: self._constructor_from_mgr(v, axes=v.axes).__finalize__(self) - for k, v, in mgr.to_dict().items() + for k, v in mgr.to_iter_dict() } @property @@ -12571,9 +13979,9 @@ def values(self) -> np.ndarray: A DataFrame where all columns are the same type (e.g., int64) results in an array of the same type. - >>> df = pd.DataFrame({'age': [ 3, 29], - ... 'height': [94, 170], - ... 'weight': [31, 115]}) + >>> df = pd.DataFrame( + ... {"age": [3, 29], "height": [94, 170], "weight": [31, 115]} + ... ) >>> df age height weight 0 3 94 31 @@ -12591,10 +13999,14 @@ def values(self) -> np.ndarray: results in an ndarray of the broadest type that accommodates these mixed types (e.g., object). - >>> df2 = pd.DataFrame([('parrot', 24.0, 'second'), - ... ('lion', 80.5, 1), - ... ('monkey', np.nan, None)], - ... columns=('name', 'max_speed', 'rank')) + >>> df2 = pd.DataFrame( + ... [ + ... ("parrot", 24.0, "second"), + ... ("lion", 80.5, 1), + ... ("monkey", np.nan, None), + ... ], + ... columns=("name", "max_speed", "rank"), + ... 
) >>> df2.dtypes name object max_speed float64 @@ -12608,8 +14020,12 @@ def values(self) -> np.ndarray: return self._mgr.as_array() -def _from_nested_dict(data) -> collections.defaultdict: - new_data: collections.defaultdict = collections.defaultdict(dict) +def _from_nested_dict( + data: Mapping[HashableT, Mapping[HashableT2, T]], +) -> collections.defaultdict[HashableT2, dict[HashableT, T]]: + new_data: collections.defaultdict[HashableT2, dict[HashableT, T]] = ( + collections.defaultdict(dict) + ) for index, s in data.items(): for col, v in s.items(): new_data[col][index] = v @@ -12622,7 +14038,7 @@ def _reindex_for_setitem( # reindex if necessary if value.index.equals(index) or not len(index): - if using_copy_on_write() and isinstance(value, Series): + if isinstance(value, Series): return value._values, value._references return value._values.copy(), None diff --git a/pandas/core/generic.py b/pandas/core/generic.py index de25a02c6b37c..13585d7de6beb 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5,7 +5,6 @@ from copy import deepcopy import datetime as dt from functools import partial -import gc from json import loads import operator import pickle @@ -14,7 +13,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, ClassVar, Literal, NoReturn, @@ -23,25 +21,18 @@ overload, ) import warnings -import weakref import numpy as np -from pandas._config import ( - config, - using_copy_on_write, - warn_copy_on_write, -) +from pandas._config import config from pandas._libs import lib from pandas._libs.lib import is_range_indexer from pandas._libs.tslibs import ( Period, - Tick, Timestamp, to_offset, ) -from pandas._libs.tslibs.dtypes import freq_to_period_freqstr from pandas._typing import ( AlignJoin, AnyArrayLike, @@ -50,6 +41,7 @@ Axis, AxisInt, CompressionOptions, + Concatenate, DtypeArg, DtypeBackend, DtypeObj, @@ -65,6 +57,7 @@ IntervalClosedType, JSONSerializable, Level, + ListLike, Manager, NaPosition, NDFrameT, @@ -97,14 +90,10 @@ AbstractMethodError, ChainedAssignmentError, InvalidIndexError, - SettingWithCopyError, - SettingWithCopyWarning, - _chained_assignment_method_msg, - _chained_assignment_warning_method_msg, - _check_cacher, ) +from pandas.errors.cow import _chained_assignment_method_msg from pandas.util._decorators import ( - deprecate_nonkeyword_arguments, + deprecate_kwarg, doc, ) from pandas.util._exceptions import find_stack_level @@ -112,7 +101,6 @@ check_dtype_backend, validate_ascending, validate_bool_kwarg, - validate_fillna_kwargs, validate_inclusive, ) @@ -135,6 +123,7 @@ from pandas.core.dtypes.dtypes import ( DatetimeTZDtype, ExtensionDtype, + PeriodDtype, ) from pandas.core.dtypes.generic import ( ABCDataFrame, @@ -168,19 +157,10 @@ Index, MultiIndex, PeriodIndex, - RangeIndex, default_index, ensure_index, ) -from pandas.core.internals import ( - ArrayManager, - BlockManager, - SingleArrayManager, -) -from pandas.core.internals.construction import ( - mgr_to_mgr, - ndarray_to_mgr, -) +from pandas.core.internals import BlockManager from pandas.core.methods.describe import describe_ndframe from pandas.core.missing import ( clean_fill_method, @@ -204,6 +184,7 @@ from pandas.io.formats.printing import pprint_thing if TYPE_CHECKING: + from collections.abc import Callable from collections.abc import ( Hashable, Iterator, @@ -212,6 +193,7 @@ ) from pandas._libs.tslibs import BaseOffset + from pandas._typing import P from pandas import ( DataFrame, @@ -222,6 +204,8 @@ from pandas.core.indexers.objects import BaseIndexer from pandas.core.resample import 
Resampler +import textwrap + # goal is to be able to define the docs close to function, while still being # able to share _shared_docs = {**_shared_docs} @@ -238,9 +222,6 @@ } -bool_t = bool # Need alias because NDFrame has def bool: - - class NDFrame(PandasObject, indexing.IndexingMixin): """ N-dimensional analogue of DataFrame. Store multi-dimensional in a @@ -255,10 +236,8 @@ class NDFrame(PandasObject, indexing.IndexingMixin): _internal_names: list[str] = [ "_mgr", - "_cacher", "_item_cache", "_cache", - "_is_copy", "_name", "_metadata", "_flags", @@ -267,7 +246,6 @@ class NDFrame(PandasObject, indexing.IndexingMixin): _accessors: set[str] = set() _hidden_attrs: frozenset[str] = frozenset([]) _metadata: list[str] = [] - _is_copy: weakref.ReferenceType[NDFrame] | str | None = None _mgr: Manager _attrs: dict[Hashable, Any] _typ: str @@ -276,9 +254,7 @@ class NDFrame(PandasObject, indexing.IndexingMixin): # Constructors def __init__(self, data: Manager) -> None: - object.__setattr__(self, "_is_copy", None) object.__setattr__(self, "_mgr", data) - object.__setattr__(self, "_item_cache", {}) object.__setattr__(self, "_attrs", {}) object.__setattr__(self, "_flags", Flags(self, allows_duplicate_labels=True)) @@ -289,7 +265,7 @@ def _init_mgr( mgr: Manager, axes: dict[Literal["index", "columns"], Axes | None], dtype: DtypeObj | None = None, - copy: bool_t = False, + copy: bool = False, ) -> Manager: """passed a manager and a axes dict""" for a, axe in axes.items(): @@ -314,28 +290,6 @@ def _init_mgr( return mgr @final - def _as_manager(self, typ: str, copy: bool_t = True) -> Self: - """ - Private helper function to create a DataFrame with specific manager. - - Parameters - ---------- - typ : {"block", "array"} - copy : bool, default True - Only controls whether the conversion from Block->ArrayManager - copies the 1D arrays (to ensure proper/contiguous memory layout). - - Returns - ------- - DataFrame - New DataFrame using specified manager type. Is not guaranteed - to be a copy or not. - """ - new_mgr: Manager - new_mgr = mgr_to_mgr(self._mgr, typ=typ, copy=copy) - # fastpath of passing a manager doesn't check the option/manager class - return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__(self) - @classmethod def _from_mgr(cls, mgr: Manager, axes: list[Index]) -> Self: """ @@ -390,7 +344,7 @@ def attrs(self) -> dict[Hashable, Any]: For DataFrame: - >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) + >>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) >>> df.attrs = {"A": [10, 20, 30]} >>> df.attrs {'A': [10, 20, 30]} @@ -446,12 +400,18 @@ def flags(self) -> Flags: def set_flags( self, *, - copy: bool_t = False, - allows_duplicate_labels: bool_t | None = None, + copy: bool | lib.NoDefault = lib.no_default, + allows_duplicate_labels: bool | None = None, ) -> Self: """ Return a new object with updated flags. + This method creates a shallow copy of the original object, preserving its + underlying data while modifying its global flags. In particular, it allows + you to update properties such as whether duplicate labels are permitted. This + behavior is especially useful in method chains, where one wishes to + adjust DataFrame or Series characteristics without altering the original object. + Parameters ---------- copy : bool, default False @@ -468,6 +428,8 @@ def set_flags( You can already get the future behavior and improvements through enabling copy on write ``pd.options.mode.copy_on_write = True`` + + .. 
deprecated:: 3.0.0 allows_duplicate_labels : bool, optional Whether the returned object allows duplicate labels. @@ -502,7 +464,8 @@ def set_flags( >>> df2.flags.allows_duplicate_labels False """ - df = self.copy(deep=copy and not using_copy_on_write()) + self._check_copy_deprecation(copy) + df = self.copy(deep=False) if allows_duplicate_labels is not None: df.flags["allows_duplicate_labels"] = allows_duplicate_labels return df @@ -526,31 +489,15 @@ def _validate_dtype(cls, dtype) -> DtypeObj | None: # ---------------------------------------------------------------------- # Construction + # error: Signature of "_constructor" incompatible with supertype "PandasObject" @property - def _constructor(self) -> Callable[..., Self]: + def _constructor(self) -> Callable[..., Self]: # type: ignore[override] """ Used when a manipulation result has the same dimensions as the original. """ raise AbstractMethodError(self) - # ---------------------------------------------------------------------- - # Internals - - @final - @property - def _data(self): - # GH#33054 retained because some downstream packages uses this, - # e.g. fastparquet - # GH#33333 - warnings.warn( - f"{type(self).__name__}._data is deprecated and will be removed in " - "a future version. Use public APIs instead.", - DeprecationWarning, - stacklevel=find_stack_level(), - ) - return self._mgr - # ---------------------------------------------------------------------- # Axis _AXIS_ORDERS: list[Literal["index", "columns"]] @@ -560,7 +507,9 @@ def _data(self): _AXIS_LEN: int @final - def _construct_axes_dict(self, axes: Sequence[Axis] | None = None, **kwargs): + def _construct_axes_dict( + self, axes: Sequence[Axis] | None = None, **kwargs: AxisInt + ) -> dict: """Return an axes dictionary for myself.""" d = {a: self._get_axis(a) for a in (axes or self._AXIS_ORDERS)} # error: Argument 1 to "update" of "MutableMapping" has incompatible type @@ -573,8 +522,10 @@ def _construct_axes_dict(self, axes: Sequence[Axis] | None = None, **kwargs): def _get_axis_number(cls, axis: Axis) -> AxisInt: try: return cls._AXIS_TO_AXIS_NUMBER[axis] - except KeyError: - raise ValueError(f"No axis named {axis} for object type {cls.__name__}") + except KeyError as err: + raise ValueError( + f"No axis named {axis} for object type {cls.__name__}" + ) from err @final @classmethod @@ -643,7 +594,7 @@ def _get_index_resolvers(self) -> dict[Hashable, Series | MultiIndex]: @final def _get_cleaned_column_resolvers(self) -> dict[Hashable, Series]: """ - Return the special character free column resolvers of a dataframe. + Return the special character free column resolvers of a DataFrame. Column names with special characters are 'cleaned up' so that they can be referred to by backtick quoting. 
@@ -655,12 +606,12 @@ def _get_cleaned_column_resolvers(self) -> dict[Hashable, Series]: if isinstance(self, ABCSeries): return {clean_column_name(self.name): self} + dtypes = self.dtypes return { clean_column_name(k): Series( - v, copy=False, index=self.index, name=k + v, copy=False, index=self.index, name=k, dtype=dtype ).__finalize__(self) - for k, v in zip(self.columns, self._iter_column_arrays()) - if not isinstance(k, int) + for k, v, dtype in zip(self.columns, self._iter_column_arrays(), dtypes) } @final @@ -668,14 +619,6 @@ def _get_cleaned_column_resolvers(self) -> dict[Hashable, Series]: def _info_axis(self) -> Index: return getattr(self, self._info_axis_name) - def _is_view_after_cow_rules(self): - # Only to be used in cases of chained assignment checks, this is a - # simplified check that assumes that either the whole object is a view - # or a copy - if len(self._mgr.blocks) == 0: # type: ignore[union-attr] - return False - return self._mgr.blocks[0].refs.has_reference() # type: ignore[union-attr] - @property def shape(self) -> tuple[int, ...]: """ @@ -702,15 +645,15 @@ def ndim(self) -> int: See Also -------- - ndarray.ndim : Number of array dimensions. + numpy.ndarray.ndim : Number of array dimensions. Examples -------- - >>> s = pd.Series({'a': 1, 'b': 2, 'c': 3}) + >>> s = pd.Series({"a": 1, "b": 2, "c": 3}) >>> s.ndim 1 - >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) >>> df.ndim 2 """ @@ -727,15 +670,15 @@ def size(self) -> int: See Also -------- - ndarray.size : Number of elements in the array. + numpy.ndarray.size : Number of elements in the array. Examples -------- - >>> s = pd.Series({'a': 1, 'b': 2, 'c': 3}) + >>> s = pd.Series({"a": 1, "b": 2, "c": 3}) >>> s.size 3 - >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) >>> df.size 4 """ @@ -747,7 +690,7 @@ def set_axis( labels, *, axis: Axis = 0, - copy: bool_t | None = None, + copy: bool | lib.NoDefault = lib.no_default, ) -> Self: """ Assign desired index to given axis. @@ -764,7 +707,7 @@ def set_axis( The axis to update. The value 0 identifies the rows. For `Series` this parameter is unused and defaults to 0. - copy : bool, default True + copy : bool, default False Whether to make a copy of the underlying data. .. note:: @@ -779,6 +722,8 @@ def set_axis( You can already get the future behavior and improvements through enabling copy on write ``pd.options.mode.copy_on_write = True`` + .. deprecated:: 3.0.0 + Returns ------- %(klass)s @@ -788,20 +733,28 @@ def set_axis( -------- %(klass)s.rename_axis : Alter the name of the index%(see_also_sub)s. """ - return self._set_axis_nocheck(labels, axis, inplace=False, copy=copy) + self._check_copy_deprecation(copy) + return self._set_axis_nocheck(labels, axis, inplace=False) - @final + @overload def _set_axis_nocheck( - self, labels, axis: Axis, inplace: bool_t, copy: bool_t | None - ): + self, labels, axis: Axis, inplace: Literal[False] + ) -> Self: ... + + @overload + def _set_axis_nocheck(self, labels, axis: Axis, inplace: Literal[True]) -> None: ... + + @overload + def _set_axis_nocheck(self, labels, axis: Axis, inplace: bool) -> Self | None: ... + + @final + def _set_axis_nocheck(self, labels, axis: Axis, inplace: bool) -> Self | None: if inplace: setattr(self, self._get_axis_name(axis), labels) - else: - # With copy=False, we create a new object but don't copy the - # underlying data. 
- obj = self.copy(deep=copy and not using_copy_on_write()) - setattr(obj, obj._get_axis_name(axis), labels) - return obj + return None + obj = self.copy(deep=False) + setattr(obj, obj._get_axis_name(axis), labels) + return obj @final def _set_axis(self, axis: AxisInt, labels: AnyArrayLike | list) -> None: @@ -811,71 +764,6 @@ def _set_axis(self, axis: AxisInt, labels: AnyArrayLike | list) -> None: """ labels = ensure_index(labels) self._mgr.set_axis(axis, labels) - self._clear_item_cache() - - @final - def swapaxes(self, axis1: Axis, axis2: Axis, copy: bool_t | None = None) -> Self: - """ - Interchange axes and swap values axes appropriately. - - .. deprecated:: 2.1.0 - ``swapaxes`` is deprecated and will be removed. - Please use ``transpose`` instead. - - Returns - ------- - same as input - - Examples - -------- - Please see examples for :meth:`DataFrame.transpose`. - """ - warnings.warn( - # GH#51946 - f"'{type(self).__name__}.swapaxes' is deprecated and " - "will be removed in a future version. " - f"Please use '{type(self).__name__}.transpose' instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - - i = self._get_axis_number(axis1) - j = self._get_axis_number(axis2) - - if i == j: - return self.copy(deep=copy and not using_copy_on_write()) - - mapping = {i: j, j: i} - - new_axes = [self._get_axis(mapping.get(k, k)) for k in range(self._AXIS_LEN)] - new_values = self._values.swapaxes(i, j) # type: ignore[union-attr] - if self._mgr.is_single_block and isinstance(self._mgr, BlockManager): - # This should only get hit in case of having a single block, otherwise a - # copy is made, we don't have to set up references. - new_mgr = ndarray_to_mgr( - new_values, - new_axes[0], - new_axes[1], - dtype=None, - copy=False, - typ="block", - ) - assert isinstance(new_mgr, BlockManager) - assert isinstance(self._mgr, BlockManager) - new_mgr.blocks[0].refs = self._mgr.blocks[0].refs - new_mgr.blocks[0].refs.add_reference(new_mgr.blocks[0]) - if not using_copy_on_write() and copy is not False: - new_mgr = new_mgr.copy(deep=True) - - out = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes) - return out.__finalize__(self, method="swapaxes") - - return self._constructor( - new_values, - *new_axes, - # The no-copy case for CoW is handled above - copy=False, - ).__finalize__(self, method="swapaxes") @final @doc(klass=_shared_doc_kwargs["klass"]) @@ -903,17 +791,23 @@ def droplevel(self, level: IndexLabel, axis: Axis = 0) -> Self: {klass} {klass} with requested index / column level(s) removed. + See Also + -------- + DataFrame.replace : Replace values given in `to_replace` with `value`. + DataFrame.pivot : Return reshaped DataFrame organized by given + index / column values. + Examples -------- - >>> df = pd.DataFrame([ - ... [1, 2, 3, 4], - ... [5, 6, 7, 8], - ... [9, 10, 11, 12] - ... ]).set_index([0, 1]).rename_axis(['a', 'b']) + >>> df = ( + ... pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]) + ... .set_index([0, 1]) + ... .rename_axis(["a", "b"]) + ... ) - >>> df.columns = pd.MultiIndex.from_tuples([ - ... ('c', 'e'), ('d', 'f') - ... ], names=['level_1', 'level_2']) + >>> df.columns = pd.MultiIndex.from_tuples( + ... [("c", "e"), ("d", "f")], names=["level_1", "level_2"] + ... 
) >>> df level_1 c d @@ -923,7 +817,7 @@ def droplevel(self, level: IndexLabel, axis: Axis = 0) -> Self: 5 6 7 8 9 10 11 12 - >>> df.droplevel('a') + >>> df.droplevel("a") level_1 c d level_2 e f b @@ -931,7 +825,7 @@ def droplevel(self, level: IndexLabel, axis: Axis = 0) -> Self: 6 7 8 10 11 12 - >>> df.droplevel('level_2', axis=1) + >>> df.droplevel("level_2", axis=1) level_1 c d a b 1 2 3 4 @@ -940,7 +834,7 @@ def droplevel(self, level: IndexLabel, axis: Axis = 0) -> Self: """ labels = self._get_axis(axis) new_labels = labels.droplevel(level) - return self.set_axis(new_labels, axis=axis, copy=None) + return self.set_axis(new_labels, axis=axis) def pop(self, item: Hashable) -> Series | Any: result = self[item] @@ -949,7 +843,7 @@ def pop(self, item: Hashable) -> Series | Any: return result @final - def squeeze(self, axis: Axis | None = None): + def squeeze(self, axis: Axis | None = None) -> Scalar | Series | DataFrame: """ Squeeze 1 dimensional axis objects into scalars. @@ -992,7 +886,7 @@ def squeeze(self, axis: Axis | None = None): dtype: int64 >>> even_primes.squeeze() - 2 + np.int64(2) Squeezing objects with more than one value in every axis does nothing: @@ -1011,7 +905,7 @@ def squeeze(self, axis: Axis | None = None): Squeezing is even more effective when used with DataFrames. - >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['a', 'b']) + >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["a", "b"]) >>> df a b 0 1 2 @@ -1020,7 +914,7 @@ def squeeze(self, axis: Axis | None = None): Slicing a single column will produce a DataFrame with the columns having only one value: - >>> df_a = df[['a']] + >>> df_a = df[["a"]] >>> df_a a 0 1 @@ -1028,7 +922,7 @@ def squeeze(self, axis: Axis | None = None): So the columns can be squeezed down, resulting in a Series: - >>> df_a.squeeze('columns') + >>> df_a.squeeze("columns") 0 1 1 3 Name: a, dtype: int64 @@ -1036,21 +930,21 @@ def squeeze(self, axis: Axis | None = None): Slicing a single row from a single column will produce a single scalar DataFrame: - >>> df_0a = df.loc[df.index < 1, ['a']] + >>> df_0a = df.loc[df.index < 1, ["a"]] >>> df_0a a 0 1 Squeezing the rows produces a single scalar Series: - >>> df_0a.squeeze('rows') + >>> df_0a.squeeze("rows") a 1 Name: 0, dtype: int64 Squeezing all axes will project directly into a scalar: >>> df_0a.squeeze() - 1 + np.int64(1) """ axes = range(self._AXIS_LEN) if axis is None else (self._get_axis_number(axis),) result = self.iloc[ @@ -1066,6 +960,45 @@ def squeeze(self, axis: Axis | None = None): # ---------------------------------------------------------------------- # Rename + @overload + def _rename( + self, + mapper: Renamer | None = ..., + *, + index: Renamer | None = ..., + columns: Renamer | None = ..., + axis: Axis | None = ..., + inplace: Literal[False] = ..., + level: Level | None = ..., + errors: str = ..., + ) -> Self: ... + + @overload + def _rename( + self, + mapper: Renamer | None = ..., + *, + index: Renamer | None = ..., + columns: Renamer | None = ..., + axis: Axis | None = ..., + inplace: Literal[True], + level: Level | None = ..., + errors: str = ..., + ) -> None: ... + + @overload + def _rename( + self, + mapper: Renamer | None = ..., + *, + index: Renamer | None = ..., + columns: Renamer | None = ..., + axis: Axis | None = ..., + inplace: bool, + level: Level | None = ..., + errors: str = ..., + ) -> Self | None: ... 
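Editor's aside on a pattern that recurs through the hunks above (`set_flags`, `set_axis`, `rename_axis`): the `copy` parameter becomes `bool | lib.NoDefault = lib.no_default`, the method calls `self._check_copy_deprecation(copy)`, and then unconditionally does `self.copy(deep=False)`. Under copy-on-write that shallow copy is cheap and safe, because any later write copies only the affected data first. A sketch of the observable behavior, assuming a pandas 3.x-style build with copy-on-write enabled:

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})

# No copy= argument needed (passing one is deprecated): the result is a
# lazy, shallow copy that shares data with `df` until either side writes.
relabeled = df.set_axis(["x", "y"], axis="columns")

# Copy-on-write: this write triggers a copy of the touched data, so the
# original frame is never mutated through `relabeled`.
relabeled.loc[0, "x"] = 100
assert df.loc[0, "a"] == 1  # unchanged
```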
+ @final def _rename( self, @@ -1074,8 +1007,7 @@ def _rename( index: Renamer | None = None, columns: Renamer | None = None, axis: Axis | None = None, - copy: bool_t | None = None, - inplace: bool_t = False, + inplace: bool = False, level: Level | None = None, errors: str = "ignore", ) -> Self | None: @@ -1101,7 +1033,7 @@ def _rename( index = mapper self._check_inplace_and_allows_duplicate_labels(inplace) - result = self if inplace else self.copy(deep=copy and not using_copy_on_write()) + result = self if inplace else self.copy(deep=False) for axis_no, replacements in enumerate((index, columns)): if replacements is None: @@ -1129,8 +1061,7 @@ def _rename( raise KeyError(f"{missing_labels} not found in axis") new_index = ax._transform_index(f, level=level) - result._set_axis_nocheck(new_index, axis=axis_no, inplace=True, copy=False) - result._clear_item_cache() + result._set_axis_nocheck(new_index, axis=axis_no, inplace=True) if inplace: self._update_inplace(result) @@ -1146,10 +1077,9 @@ def rename_axis( index=..., columns=..., axis: Axis = ..., - copy: bool_t | None = ..., + copy: bool | lib.NoDefault = lib.no_default, inplace: Literal[False] = ..., - ) -> Self: - ... + ) -> Self: ... @overload def rename_axis( @@ -1159,10 +1089,9 @@ def rename_axis( index=..., columns=..., axis: Axis = ..., - copy: bool_t | None = ..., + copy: bool | lib.NoDefault = lib.no_default, inplace: Literal[True], - ) -> None: - ... + ) -> None: ... @overload def rename_axis( @@ -1172,10 +1101,9 @@ def rename_axis( index=..., columns=..., axis: Axis = ..., - copy: bool_t | None = ..., - inplace: bool_t = ..., - ) -> Self | None: - ... + copy: bool | lib.NoDefault = lib.no_default, + inplace: bool = ..., + ) -> Self | None: ... def rename_axis( self, @@ -1184,8 +1112,8 @@ def rename_axis( index=lib.no_default, columns=lib.no_default, axis: Axis = 0, - copy: bool_t | None = None, - inplace: bool_t = False, + copy: bool | lib.NoDefault = lib.no_default, + inplace: bool = False, ) -> Self | None: """ Set the name of the axis for the index or columns. @@ -1194,19 +1122,19 @@ def rename_axis( ---------- mapper : scalar, list-like, optional Value to set the axis name attribute. - index, columns : scalar, list-like, dict-like or function, optional - A scalar, list-like, dict-like or functions transformations to - apply to that axis' values. - Note that the ``columns`` parameter is not allowed if the - object is a Series. This parameter only apply for DataFrame - type objects. Use either ``mapper`` and ``axis`` to specify the axis to target with ``mapper``, or ``index`` and/or ``columns``. + index : scalar, list-like, dict-like or function, optional + A scalar, list-like, dict-like or functions transformations to + apply to that axis' values. + columns : scalar, list-like, dict-like or function, optional + A scalar, list-like, dict-like or functions transformations to + apply to that axis' values. axis : {0 or 'index', 1 or 'columns'}, default 0 - The axis to rename. For `Series` this parameter is unused and defaults to 0. - copy : bool, default None + The axis to rename. + copy : bool, default False Also copy underlying data. .. note:: @@ -1220,13 +1148,15 @@ def rename_axis( You can already get the future behavior and improvements through enabling copy on write ``pd.options.mode.copy_on_write = True`` + + .. deprecated:: 3.0.0 inplace : bool, default False Modifies the object directly, instead of creating a new Series or DataFrame. 
Returns ------- - Series, DataFrame, or None + DataFrame, or None The same type as the caller or None if ``inplace=True``. See Also @@ -1256,26 +1186,11 @@ def rename_axis( Examples -------- - **Series** - - >>> s = pd.Series(["dog", "cat", "monkey"]) - >>> s - 0 dog - 1 cat - 2 monkey - dtype: object - >>> s.rename_axis("animal") - animal - 0 dog - 1 cat - 2 monkey - dtype: object - **DataFrame** - >>> df = pd.DataFrame({"num_legs": [4, 4, 2], - ... "num_arms": [0, 0, 2]}, - ... ["dog", "cat", "monkey"]) + >>> df = pd.DataFrame( + ... {"num_legs": [4, 4, 2], "num_arms": [0, 0, 2]}, ["dog", "cat", "monkey"] + ... ) >>> df num_legs num_arms dog 4 0 @@ -1298,9 +1213,9 @@ def rename_axis( **MultiIndex** - >>> df.index = pd.MultiIndex.from_product([['mammal'], - ... ['dog', 'cat', 'monkey']], - ... names=['type', 'name']) + >>> df.index = pd.MultiIndex.from_product( + ... [["mammal"], ["dog", "cat", "monkey"]], names=["type", "name"] + ... ) >>> df limbs num_legs num_arms type name @@ -1308,7 +1223,7 @@ def rename_axis( cat 4 0 monkey 2 2 - >>> df.rename_axis(index={'type': 'class'}) + >>> df.rename_axis(index={"type": "class"}) limbs num_legs num_arms class name mammal dog 4 0 @@ -1322,6 +1237,7 @@ class name cat 4 0 monkey 2 2 """ + self._check_copy_deprecation(copy) axes = {"index": index, "columns": columns} if axis is not None: @@ -1329,24 +1245,19 @@ class name inplace = validate_bool_kwarg(inplace, "inplace") - if copy and using_copy_on_write(): - copy = False - if mapper is not lib.no_default: # Use v0.23 behavior if a scalar or list non_mapper = is_scalar(mapper) or ( is_list_like(mapper) and not is_dict_like(mapper) ) if non_mapper: - return self._set_axis_name( - mapper, axis=axis, inplace=inplace, copy=copy - ) + return self._set_axis_name(mapper, axis=axis, inplace=inplace) else: raise ValueError("Use `.rename` to alter labels with a mapper.") else: # Use new behavior. Means that index and/or columns # is specified - result = self if inplace else self.copy(deep=copy) + result = self if inplace else self.copy(deep=False) for axis in range(self._AXIS_LEN): v = axes.get(self._get_axis_name(axis)) @@ -1359,15 +1270,30 @@ class name f = common.get_rename_function(v) curnames = self._get_axis(axis).names newnames = [f(name) for name in curnames] - result._set_axis_name(newnames, axis=axis, inplace=True, copy=copy) + result._set_axis_name(newnames, axis=axis, inplace=True) if not inplace: return result return None + @overload + def _set_axis_name( + self, name, axis: Axis = ..., *, inplace: Literal[False] = ... + ) -> Self: ... + + @overload + def _set_axis_name( + self, name, axis: Axis = ..., *, inplace: Literal[True] + ) -> None: ... + + @overload + def _set_axis_name( + self, name, axis: Axis = ..., *, inplace: bool + ) -> Self | None: ... + @final def _set_axis_name( - self, name, axis: Axis = 0, inplace: bool_t = False, copy: bool_t | None = True - ): + self, name, axis: Axis = 0, *, inplace: bool = False + ) -> Self | None: """ Set the name(s) of the axis. @@ -1380,8 +1306,6 @@ def _set_axis_name( and the value 1 or 'columns' specifies columns. inplace : bool, default False If `True`, do operation inplace and return None. - copy: - Whether to make a copy of the result. Returns ------- @@ -1397,8 +1321,7 @@ def _set_axis_name( Examples -------- - >>> df = pd.DataFrame({"num_legs": [4, 4, 2]}, - ... 
["dog", "cat", "monkey"]) + >>> df = pd.DataFrame({"num_legs": [4, 4, 2]}, ["dog", "cat", "monkey"]) >>> df num_legs dog 4 @@ -1411,7 +1334,8 @@ def _set_axis_name( cat 4 monkey 2 >>> df.index = pd.MultiIndex.from_product( - ... [["mammal"], ['dog', 'cat', 'monkey']]) + ... [["mammal"], ["dog", "cat", "monkey"]] + ... ) >>> df._set_axis_name(["type", "name"]) num_legs type name @@ -1423,7 +1347,7 @@ def _set_axis_name( idx = self._get_axis(axis).set_names(name) inplace = validate_bool_kwarg(inplace, "inplace") - renamed = self if inplace else self.copy(deep=copy) + renamed = self if inplace else self.copy(deep=False) if axis == 0: renamed.index = idx else: @@ -1431,18 +1355,19 @@ def _set_axis_name( if not inplace: return renamed + return None # ---------------------------------------------------------------------- # Comparison Methods @final - def _indexed_same(self, other) -> bool_t: + def _indexed_same(self, other) -> bool: return all( self._get_axis(a).equals(other._get_axis(a)) for a in self._AXIS_ORDERS ) @final - def equals(self, other: object) -> bool_t: + def equals(self, other: object) -> bool: """ Test whether two objects contain the same elements. @@ -1572,81 +1497,12 @@ def __invert__(self) -> Self: return res.__finalize__(self, method="__invert__") @final - def __nonzero__(self) -> NoReturn: + def __bool__(self) -> NoReturn: raise ValueError( f"The truth value of a {type(self).__name__} is ambiguous. " "Use a.empty, a.bool(), a.item(), a.any() or a.all()." ) - __bool__ = __nonzero__ - - @final - def bool(self) -> bool_t: - """ - Return the bool of a single element Series or DataFrame. - - .. deprecated:: 2.1.0 - - bool is deprecated and will be removed in future version of pandas. - For ``Series`` use ``pandas.Series.item``. - - This must be a boolean scalar value, either True or False. It will raise a - ValueError if the Series or DataFrame does not have exactly 1 element, or that - element is not boolean (integer values 0 and 1 will also raise an exception). - - Returns - ------- - bool - The value in the Series or DataFrame. - - See Also - -------- - Series.astype : Change the data type of a Series, including to boolean. - DataFrame.astype : Change the data type of a DataFrame, including to boolean. - numpy.bool_ : NumPy boolean data type, used by pandas for boolean values. - - Examples - -------- - The method will only work for single element objects with a boolean value: - - >>> pd.Series([True]).bool() # doctest: +SKIP - True - >>> pd.Series([False]).bool() # doctest: +SKIP - False - - >>> pd.DataFrame({'col': [True]}).bool() # doctest: +SKIP - True - >>> pd.DataFrame({'col': [False]}).bool() # doctest: +SKIP - False - - This is an alternative method and will only work - for single element objects with a boolean value: - - >>> pd.Series([True]).item() # doctest: +SKIP - True - >>> pd.Series([False]).item() # doctest: +SKIP - False - """ - - warnings.warn( - f"{type(self).__name__}.bool is now deprecated and will be removed " - "in future version of pandas", - FutureWarning, - stacklevel=find_stack_level(), - ) - v = self.squeeze() - if isinstance(v, (bool, np.bool_)): - return bool(v) - elif is_scalar(v): - raise ValueError( - "bool cannot act on a non-boolean single element " - f"{type(self).__name__}" - ) - - self.__nonzero__() - # for mypy (__nonzero__ raises) - return True - @final def abs(self) -> Self: """ @@ -1689,7 +1545,7 @@ def abs(self) -> Self: Absolute numeric values in a Series with a Timedelta element. 
- >>> s = pd.Series([pd.Timedelta('1 days')]) + >>> s = pd.Series([pd.Timedelta("1 days")]) >>> s.abs() 0 1 days dtype: timedelta64[ns] @@ -1697,11 +1553,9 @@ def abs(self) -> Self: Select rows with data closest to certain value using argsort (from `StackOverflow `__). - >>> df = pd.DataFrame({ - ... 'a': [4, 5, 6, 7], - ... 'b': [10, 20, 30, 40], - ... 'c': [100, 50, -30, -50] - ... }) + >>> df = pd.DataFrame( + ... {"a": [4, 5, 6, 7], "b": [10, 20, 30, 40], "c": [100, 50, -30, -50]} + ... ) >>> df a b c 0 4 10 100 @@ -1737,7 +1591,7 @@ def __round__(self, decimals: int = 0) -> Self: # have consistent precedence and validation logic throughout the library. @final - def _is_level_reference(self, key: Level, axis: Axis = 0) -> bool_t: + def _is_level_reference(self, key: Level, axis: Axis = 0) -> bool: """ Test whether a key is a level reference for a given axis. @@ -1768,7 +1622,7 @@ def _is_level_reference(self, key: Level, axis: Axis = 0) -> bool_t: ) @final - def _is_label_reference(self, key: Level, axis: Axis = 0) -> bool_t: + def _is_label_reference(self, key: Level, axis: Axis = 0) -> bool: """ Test whether a key is a label reference for a given axis. @@ -1798,7 +1652,7 @@ def _is_label_reference(self, key: Level, axis: Axis = 0) -> bool_t: ) @final - def _is_label_or_level_reference(self, key: Level, axis: AxisInt = 0) -> bool_t: + def _is_label_or_level_reference(self, key: Level, axis: AxisInt = 0) -> bool: """ Test whether a key is a label or level reference for a given axis. @@ -1899,11 +1753,15 @@ def _get_label_or_level_values(self, key: Level, axis: AxisInt = 0) -> ArrayLike if `key` matches multiple labels """ axis = self._get_axis_number(axis) - other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis] + first_other_axes = next( + (ax for ax in range(self._AXIS_LEN) if ax != axis), None + ) if self._is_label_reference(key, axis=axis): self._check_label_or_level_ambiguity(key, axis=axis) - values = self.xs(key, axis=other_axes[0])._values + if first_other_axes is None: + raise ValueError("axis matched all axes") + values = self.xs(key, axis=first_other_axes)._values elif self._is_level_reference(key, axis=axis): values = self.axes[axis].get_level_values(key)._values else: @@ -1911,7 +1769,9 @@ def _get_label_or_level_values(self, key: Level, axis: AxisInt = 0) -> ArrayLike # Check for duplicates if values.ndim > 1: - if other_axes and isinstance(self._get_axis(other_axes[0]), MultiIndex): + if first_other_axes is not None and isinstance( + self._get_axis(first_other_axes), MultiIndex + ): multi_message = ( "\n" "For a multi-index, the label must be a " @@ -1995,7 +1855,7 @@ def _drop_labels_or_levels(self, keys, axis: AxisInt = 0): else: # Drop the last level of Index by replacing with # a RangeIndex - dropped.columns = RangeIndex(dropped.columns.size) + dropped.columns = default_index(dropped.columns.size) # Handle dropping index labels if labels_to_drop: @@ -2020,9 +1880,14 @@ def __iter__(self) -> Iterator: iterator Info axis as iterator. + See Also + -------- + DataFrame.items : Iterate over (column name, Series) pairs. + DataFrame.itertuples : Iterate over DataFrame rows as namedtuples. + Examples -------- - >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) + >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) >>> for x in df: ... print(x) A @@ -2042,10 +1907,16 @@ def keys(self) -> Index: Index Info axis. + See Also + -------- + DataFrame.index : The index (row labels) of the DataFrame. + DataFrame.columns: The column labels of the DataFrame. 
+ Examples -------- - >>> d = pd.DataFrame(data={'A': [1, 2, 3], 'B': [0, 4, 8]}, - ... index=['a', 'b', 'c']) + >>> d = pd.DataFrame( + ... data={"A": [1, 2, 3], "B": [0, 4, 8]}, index=["a", "b", "c"] + ... ) >>> d A B a 1 0 @@ -2074,12 +1945,12 @@ def __len__(self) -> int: return len(self._info_axis) @final - def __contains__(self, key) -> bool_t: + def __contains__(self, key) -> bool: """True if the key is in the info axis""" return key in self._info_axis @property - def empty(self) -> bool_t: + def empty(self) -> bool: """ Indicator whether Series/DataFrame is empty. @@ -2106,7 +1977,7 @@ def empty(self) -> bool_t: -------- An example of an actual empty DataFrame. Notice the index is empty: - >>> df_empty = pd.DataFrame({'A' : []}) + >>> df_empty = pd.DataFrame({"A": []}) >>> df_empty Empty DataFrame Columns: [A] @@ -2117,7 +1988,7 @@ def empty(self) -> bool_t: If we only have NaNs in our DataFrame, it is not considered empty! We will need to drop the NaNs to make the DataFrame empty: - >>> df = pd.DataFrame({'A' : [np.nan]}) + >>> df = pd.DataFrame({"A": [np.nan]}) >>> df A 0 NaN @@ -2126,7 +1997,7 @@ def empty(self) -> bool_t: >>> df.dropna().empty True - >>> ser_empty = pd.Series({'A' : []}) + >>> ser_empty = pd.Series({"A": []}) >>> ser_empty A [] dtype: object @@ -2145,12 +2016,25 @@ def empty(self) -> bool_t: # GH#23114 Ensure ndarray.__op__(DataFrame) returns NotImplemented __array_priority__: int = 1000 - def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray: + def __array__( + self, dtype: npt.DTypeLike | None = None, copy: bool | None = None + ) -> np.ndarray: + if copy is False and not self._mgr.is_single_block and not self.empty: + # check this manually, otherwise ._values will already return a copy + # and np.array(values, copy=False) will not raise an error + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) values = self._values - arr = np.asarray(values, dtype=dtype) + if copy is None: + # Note: branch avoids `copy=None` for NumPy 1.x support + arr = np.asarray(values, dtype=dtype) + else: + arr = np.array(values, dtype=dtype, copy=copy) + if ( - astype_is_view(values.dtype, arr.dtype) - and using_copy_on_write() + copy is not True + and astype_is_view(values.dtype, arr.dtype) and self._mgr.is_single_block ): # Check if both conversions can be done without a copy @@ -2204,7 +2088,7 @@ def __setstate__(self, state) -> None: # e.g. say fill_value needing _mgr to be # defined meta = set(self._internal_names + self._metadata) - for k in list(meta): + for k in meta: if k in state and k != "_flags": v = state[k] object.__setattr__(self, k, v) @@ -2218,8 +2102,6 @@ def __setstate__(self, state) -> None: elif len(state) == 2: raise NotImplementedError("Pre-0.12 pickles are no longer supported") - self._item_cache: dict[Hashable, Series] = {} - # ---------------------------------------------------------------------- # Rendering Methods @@ -2257,28 +2139,34 @@ def _repr_data_resource_(self): # I/O Methods @final - @deprecate_nonkeyword_arguments( - version="3.0", allowed_args=["self", "excel_writer"], name="to_excel" - ) @doc( klass="object", storage_options=_shared_docs["storage_options"], storage_options_versionadded="1.2.0", + encoding_parameter="", + verbose_parameter="", + extra_parameters=textwrap.dedent( + """\ + engine_kwargs : dict, optional + Arbitrary keyword arguments passed to excel engine. 
+ """ + ), ) def to_excel( self, excel_writer: FilePath | WriteExcelBuffer | ExcelWriter, + *, sheet_name: str = "Sheet1", na_rep: str = "", float_format: str | None = None, columns: Sequence[Hashable] | None = None, - header: Sequence[Hashable] | bool_t = True, - index: bool_t = True, + header: Sequence[Hashable] | bool = True, + index: bool = True, index_label: IndexLabel | None = None, startrow: int = 0, startcol: int = 0, engine: Literal["openpyxl", "xlsxwriter"] | None = None, - merge_cells: bool_t = True, + merge_cells: bool = True, inf_rep: str = "inf", freeze_panes: tuple[int, int] | None = None, storage_options: StorageOptions | None = None, @@ -2328,20 +2216,21 @@ def to_excel( via the options ``io.excel.xlsx.writer`` or ``io.excel.xlsm.writer``. - merge_cells : bool, default True - Write MultiIndex and Hierarchical Rows as merged cells. + merge_cells : bool or 'columns', default False + If True, write MultiIndex index and columns as merged cells. + If 'columns', merge MultiIndex column cells only. + {encoding_parameter} inf_rep : str, default 'inf' Representation for infinity (there is no native representation for infinity in Excel). + {verbose_parameter} freeze_panes : tuple of int (length 2), optional Specifies the one-based bottommost row and rightmost column that is to be frozen. {storage_options} .. versionadded:: {storage_options_versionadded} - engine_kwargs : dict, optional - Arbitrary keyword arguments passed to excel engine. - + {extra_parameters} See Also -------- to_csv : Write DataFrame to a comma-separated values (csv) file. @@ -2358,40 +2247,44 @@ def to_excel( Once a workbook has been saved it is not possible to write further data without rewriting the whole workbook. + pandas will check the number of rows, columns, + and cell character count does not exceed Excel's limitations. + All other limitations must be checked by the user. + Examples -------- Create, write to and save a workbook: - >>> df1 = pd.DataFrame([['a', 'b'], ['c', 'd']], - ... index=['row 1', 'row 2'], - ... columns=['col 1', 'col 2']) + >>> df1 = pd.DataFrame( + ... [["a", "b"], ["c", "d"]], + ... index=["row 1", "row 2"], + ... columns=["col 1", "col 2"], + ... ) >>> df1.to_excel("output.xlsx") # doctest: +SKIP To specify the sheet name: - >>> df1.to_excel("output.xlsx", - ... sheet_name='Sheet_name_1') # doctest: +SKIP + >>> df1.to_excel("output.xlsx", sheet_name="Sheet_name_1") # doctest: +SKIP If you wish to write to more than one sheet in the workbook, it is necessary to specify an ExcelWriter object: >>> df2 = df1.copy() - >>> with pd.ExcelWriter('output.xlsx') as writer: # doctest: +SKIP - ... df1.to_excel(writer, sheet_name='Sheet_name_1') - ... df2.to_excel(writer, sheet_name='Sheet_name_2') + >>> with pd.ExcelWriter("output.xlsx") as writer: # doctest: +SKIP + ... df1.to_excel(writer, sheet_name="Sheet_name_1") + ... df2.to_excel(writer, sheet_name="Sheet_name_2") ExcelWriter can also be used to append to an existing Excel file: - >>> with pd.ExcelWriter('output.xlsx', - ... mode='a') as writer: # doctest: +SKIP - ... df1.to_excel(writer, sheet_name='Sheet_name_3') + >>> with pd.ExcelWriter("output.xlsx", mode="a") as writer: # doctest: +SKIP + ... 
df1.to_excel(writer, sheet_name="Sheet_name_3") To set the library that is used to write the Excel file, you can pass the `engine` keyword (the default engine is automatically chosen depending on the file extension): - >>> df1.to_excel('output1.xlsx', engine='xlsxwriter') # doctest: +SKIP + >>> df1.to_excel("output1.xlsx", engine="xlsxwriter") # doctest: +SKIP """ if engine_kwargs is None: engine_kwargs = {} @@ -2423,9 +2316,6 @@ def to_excel( ) @final - @deprecate_nonkeyword_arguments( - version="3.0", allowed_args=["self", "path_or_buf"], name="to_json" - ) @doc( storage_options=_shared_docs["storage_options"], compression_options=_shared_docs["compression_options"] % "path_or_buf", @@ -2433,16 +2323,17 @@ def to_excel( def to_json( self, path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None, + *, orient: Literal["split", "records", "index", "table", "columns", "values"] | None = None, date_format: str | None = None, double_precision: int = 10, - force_ascii: bool_t = True, + force_ascii: bool = True, date_unit: TimeUnit = "ms", default_handler: Callable[[Any], JSONSerializable] | None = None, - lines: bool_t = False, + lines: bool = False, compression: CompressionOptions = "infer", - index: bool_t | None = None, + index: bool | None = None, indent: int | None = None, storage_options: StorageOptions | None = None, mode: Literal["a", "w"] = "w", @@ -2490,6 +2381,11 @@ def to_json( 'iso' = ISO8601. The default depends on the `orient`. For ``orient='table'``, the default is 'iso'. For all other orients, the default is 'epoch'. + + .. deprecated:: 3.0.0 + 'epoch' date format is deprecated and will be removed in a future + version, please use 'iso' instead. + double_precision : int, default 10 The number of decimal places to use when encoding floating point values. The possible maximal value is 15. @@ -2515,7 +2411,8 @@ def to_json( index : bool or None, default None The index is only used when 'orient' is 'split', 'index', 'column', or 'table'. Of these, 'index' and 'column' do not support - `index=False`. + `index=False`. The string 'index' as a column name with empty :class:`Index` + or if it is 'index' will raise a ``ValueError``. indent : int, optional Length of whitespace used to indent each record. 
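The ``date_format`` deprecation noted above is easy to stay ahead of by requesting ISO 8601 output explicitly. A minimal sketch using only the public ``to_json`` parameters documented in this hunk (``lines=True`` requires ``orient="records"`` and produces JSON Lines):

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame(
        {"id": [1, 2], "ts": pd.to_datetime(["2024-01-01", "2024-01-02"])}
    )

    # date_format="iso" opts out of the deprecated 'epoch' default
    # for the datetime column; one JSON document per line is emitted.
    print(df.to_json(orient="records", date_format="iso", lines=True))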
@@ -2692,6 +2589,22 @@ def to_json( date_format = "iso" elif date_format is None: date_format = "epoch" + dtypes = self.dtypes if self.ndim == 2 else [self.dtype] + if any(lib.is_np_dtype(dtype, "mM") for dtype in dtypes): + warnings.warn( + "The default 'epoch' date format is deprecated and will be removed " + "in a future version, please use 'iso' date format instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + elif date_format == "epoch": + # GH#57063 + warnings.warn( + "'epoch' date format is deprecated and will be removed in a future " + "version, please use 'iso' date format instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) config.is_nonnegative_int(indent) indent = indent or 0 @@ -2714,22 +2627,20 @@ def to_json( ) @final - @deprecate_nonkeyword_arguments( - version="3.0", allowed_args=["self", "path_or_buf"], name="to_hdf" - ) def to_hdf( self, path_or_buf: FilePath | HDFStore, + *, key: str, mode: Literal["a", "w", "r+"] = "a", complevel: int | None = None, complib: Literal["zlib", "lzo", "bzip2", "blosc"] | None = None, - append: bool_t = False, + append: bool = False, format: Literal["fixed", "table"] | None = None, - index: bool_t = True, + index: bool = True, min_itemsize: int | dict[str, int] | None = None, nan_rep=None, - dropna: bool_t | None = None, + dropna: bool | None = None, data_columns: Literal[True] | list[str] | None = None, errors: OpenFileErrors = "strict", encoding: str = "UTF-8", @@ -2810,6 +2721,7 @@ def to_hdf( See the errors argument for :func:`open` for a full list of options. encoding : str, default "UTF-8" + Set character encoding. See Also -------- @@ -2822,23 +2734,24 @@ def to_hdf( Examples -------- - >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, - ... index=['a', 'b', 'c']) # doctest: +SKIP - >>> df.to_hdf('data.h5', key='df', mode='w') # doctest: +SKIP + >>> df = pd.DataFrame( + ... {"A": [1, 2, 3], "B": [4, 5, 6]}, index=["a", "b", "c"] + ... ) # doctest: +SKIP + >>> df.to_hdf("data.h5", key="df", mode="w") # doctest: +SKIP We can add another object to the same file: >>> s = pd.Series([1, 2, 3, 4]) # doctest: +SKIP - >>> s.to_hdf('data.h5', key='s') # doctest: +SKIP + >>> s.to_hdf("data.h5", key="s") # doctest: +SKIP Reading from HDF file: - >>> pd.read_hdf('data.h5', 'df') # doctest: +SKIP + >>> pd.read_hdf("data.h5", "df") # doctest: +SKIP A B a 1 4 b 2 5 c 3 6 - >>> pd.read_hdf('data.h5', 's') # doctest: +SKIP + >>> pd.read_hdf("data.h5", "s") # doctest: +SKIP 0 1 1 2 2 3 @@ -2868,16 +2781,14 @@ def to_hdf( ) @final - @deprecate_nonkeyword_arguments( - version="3.0", allowed_args=["self", "name", "con"], name="to_sql" - ) def to_sql( self, name: str, con, + *, schema: str | None = None, - if_exists: Literal["fail", "replace", "append"] = "fail", - index: bool_t = True, + if_exists: Literal["fail", "replace", "append", "delete_rows"] = "fail", + index: bool = True, index_label: IndexLabel | None = None, chunksize: int | None = None, dtype: DtypeArg | None = None, @@ -2889,11 +2800,18 @@ def to_sql( Databases supported by SQLAlchemy [1]_ are supported. Tables can be newly created, appended to, or overwritten. + .. warning:: + The pandas library does not attempt to sanitize inputs provided via a to_sql call. + Please refer to the documentation for the underlying database driver to see if it + will properly prevent injection, or alternatively be advised of a security risk when + executing arbitrary commands in a to_sql call. + Parameters ---------- name : str Name of SQL table. 
- con : sqlalchemy.engine.(Engine or Connection) or sqlite3.Connection + con : ADBC connection, sqlalchemy.engine.(Engine or Connection) or sqlite3.Connection + ADBC provides high performance I/O with native type support, where available. Using SQLAlchemy makes it possible to use any DB supported by that library. Legacy support is provided for sqlite3.Connection objects. The user is responsible for engine disposal and connection closure for the SQLAlchemy @@ -2906,12 +2824,13 @@ def to_sql( schema : str, optional Specify the schema (if database flavor supports this). If None, use default schema. - if_exists : {'fail', 'replace', 'append'}, default 'fail' + if_exists : {'fail', 'replace', 'append', 'delete_rows'}, default 'fail' How to behave if the table already exists. * fail: Raise a ValueError. * replace: Drop the table before inserting new values. * append: Insert new values to the existing table. + * delete_rows: If a table exists, delete all records and insert data. index : bool, default True Write DataFrame index as a column. Uses `index_label` as the column @@ -2921,8 +2840,8 @@ def to_sql( `index` is True, then the index names are used. A sequence should be given if the DataFrame uses MultiIndex. chunksize : int, optional - Specify the number of rows in each batch to be written at a time. - By default, all rows will be written at once. + Specify the number of rows in each batch to be written to the database connection at a time. + By default, all rows will be written at once. Also see the method keyword. dtype : dict or scalar, optional Specifying the datatype for columns. If a dictionary is used, the keys should be the column names and the values should be the @@ -2969,6 +2888,9 @@ def to_sql( database. Otherwise, the datetimes will be stored as timezone unaware timestamps local to the original timezone. + Not all datastores support ``method="multi"``. Oracle, for example, + does not support multi-value insert. + References ---------- .. [1] https://docs.sqlalchemy.org @@ -2994,7 +2916,7 @@ def to_sql( 3 >>> from sqlalchemy import text >>> with engine.connect() as conn: - ... conn.execute(text("SELECT * FROM users")).fetchall() + ... conn.execute(text("SELECT * FROM users")).fetchall() [(0, 'User 1'), (1, 'User 2'), (2, 'User 3')] An `sqlalchemy.engine.Connection` can also be passed to `con`: @@ -3011,7 +2933,7 @@ def to_sql( >>> df2.to_sql(name='users', con=engine, if_exists='append') 2 >>> with engine.connect() as conn: - ... conn.execute(text("SELECT * FROM users")).fetchall() + ... conn.execute(text("SELECT * FROM users")).fetchall() [(0, 'User 1'), (1, 'User 2'), (2, 'User 3'), (0, 'User 4'), (1, 'User 5'), (0, 'User 6'), (1, 'User 7')] @@ -3022,9 +2944,19 @@ def to_sql( ... index_label='id') 2 >>> with engine.connect() as conn: - ... conn.execute(text("SELECT * FROM users")).fetchall() + ... conn.execute(text("SELECT * FROM users")).fetchall() [(0, 'User 6'), (1, 'User 7')] + Delete all rows before inserting new records with ``df3`` + + >>> df3 = pd.DataFrame({"name": ['User 8', 'User 9']}) + >>> df3.to_sql(name='users', con=engine, if_exists='delete_rows', + ... index_label='id') + 2 + >>> with engine.connect() as conn: + ... conn.execute(text("SELECT * FROM users")).fetchall() + [(0, 'User 8'), (1, 'User 9')] + Use ``method`` to define a callable insertion method to do nothing if there's a primary key conflict on a table in a PostgreSQL database. @@ -3035,13 +2967,14 @@ def to_sql( ... 
stmt = insert(table.table).values(data).on_conflict_do_nothing(index_elements=["a"]) ... result = conn.execute(stmt) ... return result.rowcount - >>> df_conflict.to_sql(name="conflict_table", con=conn, if_exists="append", method=insert_on_conflict_nothing) # doctest: +SKIP + >>> df_conflict.to_sql(name="conflict_table", con=conn, if_exists="append", # noqa: F821 + ... method=insert_on_conflict_nothing) # doctest: +SKIP 0 For MySQL, a callable to update columns ``b`` and ``c`` if there's a conflict on a primary key. - >>> from sqlalchemy.dialects.mysql import insert + >>> from sqlalchemy.dialects.mysql import insert # noqa: F811 >>> def insert_on_conflict_update(table, conn, keys, data_iter): ... # update columns "b" and "c" on primary key conflict ... data = [dict(zip(keys, row)) for row in data_iter] @@ -3052,7 +2985,8 @@ def to_sql( ... stmt = stmt.on_duplicate_key_update(b=stmt.inserted.b, c=stmt.inserted.c) ... result = conn.execute(stmt) ... return result.rowcount - >>> df_conflict.to_sql(name="conflict_table", con=conn, if_exists="append", method=insert_on_conflict_update) # doctest: +SKIP + >>> df_conflict.to_sql(name="conflict_table", con=conn, if_exists="append", # noqa: F821 + ... method=insert_on_conflict_update) # doctest: +SKIP 2 Specify the dtype (especially useful for integers with missing values). @@ -3073,8 +3007,24 @@ def to_sql( 3 >>> with engine.connect() as conn: - ... conn.execute(text("SELECT * FROM integers")).fetchall() + ... conn.execute(text("SELECT * FROM integers")).fetchall() [(1,), (None,), (2,)] + + .. versionadded:: 2.2.0 + + pandas now supports writing via ADBC drivers + + >>> df = pd.DataFrame({'name' : ['User 10', 'User 11', 'User 12']}) + >>> df + name + 0 User 10 + 1 User 11 + 2 User 12 + + >>> from adbc_driver_sqlite import dbapi # doctest:+SKIP + >>> with dbapi.connect("sqlite://") as conn: # doctest:+SKIP + ... df.to_sql(name="users", con=conn) + 3 """ # noqa: E501 from pandas.io import sql @@ -3092,9 +3042,6 @@ def to_sql( ) @final - @deprecate_nonkeyword_arguments( - version="3.0", allowed_args=["self", "path"], name="to_pickle" - ) @doc( storage_options=_shared_docs["storage_options"], compression_options=_shared_docs["compression_options"] % "path", @@ -3102,6 +3049,7 @@ def to_sql( def to_pickle( self, path: FilePath | WriteBuffer[bytes], + *, compression: CompressionOptions = "infer", protocol: int = pickle.HIGHEST_PROTOCOL, storage_options: StorageOptions | None = None, @@ -3135,7 +3083,9 @@ def to_pickle( Examples -------- - >>> original_df = pd.DataFrame({{"foo": range(5), "bar": range(5, 10)}}) # doctest: +SKIP + >>> original_df = pd.DataFrame( + ... {{"foo": range(5), "bar": range(5, 10)}} + ... ) # doctest: +SKIP >>> original_df # doctest: +SKIP foo bar 0 0 5 @@ -3153,7 +3103,7 @@ def to_pickle( 2 2 7 3 3 8 4 4 9 - """ # noqa: E501 + """ from pandas.io.pickle import to_pickle to_pickle( @@ -3165,11 +3115,8 @@ def to_pickle( ) @final - @deprecate_nonkeyword_arguments( - version="3.0", allowed_args=["self"], name="to_clipboard" - ) def to_clipboard( - self, excel: bool_t = True, sep: str | None = None, **kwargs + self, *, excel: bool = True, sep: str | None = None, **kwargs ) -> None: r""" Copy object to the system clipboard. @@ -3211,9 +3158,9 @@ def to_clipboard( -------- Copy the contents of a DataFrame to the clipboard. 
- >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C']) + >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"]) - >>> df.to_clipboard(sep=',') # doctest: +SKIP + >>> df.to_clipboard(sep=",") # doctest: +SKIP ... # Wrote the following to the system clipboard: ... # ,A,B,C ... # 0,1,2,3 @@ -3222,7 +3169,7 @@ def to_clipboard( We can omit the index by passing the keyword `index` and setting it to false. - >>> df.to_clipboard(sep=',', index=False) # doctest: +SKIP + >>> df.to_clipboard(sep=",", index=False) # doctest: +SKIP ... # Wrote the following to the system clipboard: ... # A,B,C ... # 1,2,3 @@ -3233,6 +3180,7 @@ def to_clipboard( .. code-block:: python import pyperclip + html = df.style.to_html() pyperclip.copy(html) """ @@ -3262,12 +3210,15 @@ def to_xarray(self): Examples -------- - >>> df = pd.DataFrame([('falcon', 'bird', 389.0, 2), - ... ('parrot', 'bird', 24.0, 2), - ... ('lion', 'mammal', 80.5, 4), - ... ('monkey', 'mammal', np.nan, 4)], - ... columns=['name', 'class', 'max_speed', - ... 'num_legs']) + >>> df = pd.DataFrame( + ... [ + ... ("falcon", "bird", 389.0, 2), + ... ("parrot", "bird", 24.0, 2), + ... ("lion", "mammal", 80.5, 4), + ... ("monkey", "mammal", np.nan, 4), + ... ], + ... columns=["name", "class", "max_speed", "num_legs"], + ... ) >>> df name class max_speed num_legs 0 falcon bird 389.0 2 @@ -3275,30 +3226,34 @@ def to_xarray(self): 2 lion mammal 80.5 4 3 monkey mammal NaN 4 - >>> df.to_xarray() + >>> df.to_xarray() # doctest: +SKIP Dimensions: (index: 4) Coordinates: - * index (index) int64 0 1 2 3 + * index (index) int64 32B 0 1 2 3 Data variables: - name (index) object 'falcon' 'parrot' 'lion' 'monkey' - class (index) object 'bird' 'bird' 'mammal' 'mammal' - max_speed (index) float64 389.0 24.0 80.5 nan - num_legs (index) int64 2 2 4 4 + name (index) object 32B 'falcon' 'parrot' 'lion' 'monkey' + class (index) object 32B 'bird' 'bird' 'mammal' 'mammal' + max_speed (index) float64 32B 389.0 24.0 80.5 nan + num_legs (index) int64 32B 2 2 4 4 - >>> df['max_speed'].to_xarray() + >>> df["max_speed"].to_xarray() # doctest: +SKIP array([389. , 24. , 80.5, nan]) Coordinates: * index (index) int64 0 1 2 3 - >>> dates = pd.to_datetime(['2018-01-01', '2018-01-01', - ... '2018-01-02', '2018-01-02']) - >>> df_multiindex = pd.DataFrame({'date': dates, - ... 'animal': ['falcon', 'parrot', - ... 'falcon', 'parrot'], - ... 'speed': [350, 18, 361, 15]}) - >>> df_multiindex = df_multiindex.set_index(['date', 'animal']) + >>> dates = pd.to_datetime( + ... ["2018-01-01", "2018-01-01", "2018-01-02", "2018-01-02"] + ... ) + >>> df_multiindex = pd.DataFrame( + ... { + ... "date": dates, + ... "animal": ["falcon", "parrot", "falcon", "parrot"], + ... "speed": [350, 18, 361, 15], + ... } + ... 
) + >>> df_multiindex = df_multiindex.set_index(["date", "animal"]) >>> df_multiindex speed @@ -3308,11 +3263,11 @@ class (index) object 'bird' 'bird' 'mammal' 'mammal' 2018-01-02 falcon 361 parrot 15 - >>> df_multiindex.to_xarray() + >>> df_multiindex.to_xarray() # doctest: +SKIP Dimensions: (date: 2, animal: 2) Coordinates: - * date (date) datetime64[ns] 2018-01-01 2018-01-02 + * date (date) datetime64[s] 2018-01-01 2018-01-02 * animal (animal) object 'falcon' 'parrot' Data variables: speed (date, animal) int64 350 18 361 15 @@ -3328,80 +3283,78 @@ class (index) object 'bird' 'bird' 'mammal' 'mammal' def to_latex( self, buf: None = ..., + *, columns: Sequence[Hashable] | None = ..., - header: bool_t | SequenceNotStr[str] = ..., - index: bool_t = ..., + header: bool | SequenceNotStr[str] = ..., + index: bool = ..., na_rep: str = ..., formatters: FormattersType | None = ..., float_format: FloatFormatType | None = ..., - sparsify: bool_t | None = ..., - index_names: bool_t = ..., - bold_rows: bool_t = ..., + sparsify: bool | None = ..., + index_names: bool = ..., + bold_rows: bool = ..., column_format: str | None = ..., - longtable: bool_t | None = ..., - escape: bool_t | None = ..., + longtable: bool | None = ..., + escape: bool | None = ..., encoding: str | None = ..., decimal: str = ..., - multicolumn: bool_t | None = ..., + multicolumn: bool | None = ..., multicolumn_format: str | None = ..., - multirow: bool_t | None = ..., + multirow: bool | None = ..., caption: str | tuple[str, str] | None = ..., label: str | None = ..., position: str | None = ..., - ) -> str: - ... + ) -> str: ... @overload def to_latex( self, buf: FilePath | WriteBuffer[str], + *, columns: Sequence[Hashable] | None = ..., - header: bool_t | SequenceNotStr[str] = ..., - index: bool_t = ..., + header: bool | SequenceNotStr[str] = ..., + index: bool = ..., na_rep: str = ..., formatters: FormattersType | None = ..., float_format: FloatFormatType | None = ..., - sparsify: bool_t | None = ..., - index_names: bool_t = ..., - bold_rows: bool_t = ..., + sparsify: bool | None = ..., + index_names: bool = ..., + bold_rows: bool = ..., column_format: str | None = ..., - longtable: bool_t | None = ..., - escape: bool_t | None = ..., + longtable: bool | None = ..., + escape: bool | None = ..., encoding: str | None = ..., decimal: str = ..., - multicolumn: bool_t | None = ..., + multicolumn: bool | None = ..., multicolumn_format: str | None = ..., - multirow: bool_t | None = ..., + multirow: bool | None = ..., caption: str | tuple[str, str] | None = ..., label: str | None = ..., position: str | None = ..., - ) -> None: - ... + ) -> None: ... 
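The two overloads above encode a convention used by several writers in this file: ``buf=None`` means "return the rendered string", while a concrete path or buffer means "write it and return ``None``". A self-contained sketch with a toy ``Report`` class (an illustration of the typing pattern, not pandas code) shows how a type checker narrows the return type from the ``buf`` argument:

.. code-block:: python

    from typing import overload

    class Report:
        @overload
        def to_latex(self, buf: None = ...) -> str: ...
        @overload
        def to_latex(self, buf: str) -> None: ...

        def to_latex(self, buf: str | None = None) -> str | None:
            rendered = "\\begin{tabular}{l}\nx \\\\\n\\end{tabular}\n"
            if buf is None:
                return rendered  # no buffer: hand the LaTeX source back
            with open(buf, "w") as f:  # a path: write the file, return None
                f.write(rendered)
            return None

    text: str = Report().to_latex()  # a type checker infers str here, not str | None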
@final - @deprecate_nonkeyword_arguments( - version="3.0", allowed_args=["self", "buf"], name="to_latex" - ) def to_latex( self, buf: FilePath | WriteBuffer[str] | None = None, + *, columns: Sequence[Hashable] | None = None, - header: bool_t | SequenceNotStr[str] = True, - index: bool_t = True, + header: bool | SequenceNotStr[str] = True, + index: bool = True, na_rep: str = "NaN", formatters: FormattersType | None = None, float_format: FloatFormatType | None = None, - sparsify: bool_t | None = None, - index_names: bool_t = True, - bold_rows: bool_t = False, + sparsify: bool | None = None, + index_names: bool = True, + bold_rows: bool = False, column_format: str | None = None, - longtable: bool_t | None = None, - escape: bool_t | None = None, + longtable: bool | None = None, + escape: bool | None = None, encoding: str | None = None, decimal: str = ".", - multicolumn: bool_t | None = None, + multicolumn: bool | None = None, multicolumn_format: str | None = None, - multirow: bool_t | None = None, + multirow: bool | None = None, caption: str | tuple[str, str] | None = None, label: str | None = None, position: str | None = None, @@ -3409,9 +3362,9 @@ def to_latex( r""" Render object to a LaTeX tabular, longtable, or nested table. - Requires ``\usepackage{{booktabs}}``. The output can be copy/pasted + Requires ``\usepackage{booktabs}``. The output can be copy/pasted into a main LaTeX document or read from an external file - with ``\input{{table.tex}}``. + with ``\input{table.tex}``. .. versionchanged:: 2.0.0 Refactored to use the Styler implementation via jinja2 templating. @@ -3424,18 +3377,18 @@ def to_latex( The subset of columns to write. Writes all columns by default. header : bool or list of str, default True Write out the column names. If a list of strings is given, - it is assumed to be aliases for the column names. + it is assumed to be aliases for the column names. Braces must be escaped. index : bool, default True Write row names (index). na_rep : str, default 'NaN' Missing data representation. - formatters : list of functions or dict of {{str: function}}, optional + formatters : list of functions or dict of {str: function}, optional Formatter functions to apply to columns' elements by position or name. The result of each function must be a unicode string. List must be of length equal to the number of columns. float_format : one-parameter function or str, optional, default None Formatter for floating point numbers. For example - ``float_format="%.2f"`` and ``float_format="{{:0.2f}}".format`` will + ``float_format="%.2f"`` and ``float_format="{:0.2f}".format`` will both result in 0.1234 being formatted as 0.12. sparsify : bool, optional Set to False for a DataFrame with a hierarchical index to print @@ -3452,7 +3405,7 @@ def to_latex( columns of numbers, which default to 'r'. longtable : bool, optional Use a longtable environment instead of tabular. Requires - adding a \usepackage{{longtable}} to your LaTeX preamble. + adding a \usepackage{longtable} to your LaTeX preamble. By default, the value will be read from the pandas config module, and set to `True` if the option ``styler.latex.environment`` is `"longtable"`. @@ -3490,7 +3443,7 @@ def to_latex( default value to "r". multirow : bool, default True Use \multirow to enhance MultiIndex rows. Requires adding a - \usepackage{{multirow}} to your LaTeX preamble. Will print + \usepackage{multirow} to your LaTeX preamble. Will print centered labels (instead of top-aligned) across the contained rows, separating groups via clines. 
The default will be read from the pandas config module, and is set as the option @@ -3501,15 +3454,15 @@ def to_latex( default value to `True`. caption : str or tuple, optional Tuple (full_caption, short_caption), - which results in ``\caption[short_caption]{{full_caption}}``; + which results in ``\caption[short_caption]{full_caption}``; if a single string is passed, no short caption will be set. label : str, optional - The LaTeX label to be placed inside ``\label{{}}`` in the output. - This is used with ``\ref{{}}`` in the main ``.tex`` file. + The LaTeX label to be placed inside ``\label{}`` in the output. + This is used with ``\ref{}`` in the main ``.tex`` file. position : str, optional The LaTeX positional argument for tables, to be placed after - ``\begin{{}}`` in the output. + ``\begin{}`` in the output. Returns ------- @@ -3543,7 +3496,7 @@ def to_latex( >>> print(df.to_latex(index=False, ... formatters={"name": str.upper}, ... float_format="{:.1f}".format, - ... )) # doctest: +SKIP + ... )) # doctest: +SKIP \begin{tabular}{lrr} \toprule name & age & height \\ @@ -3615,6 +3568,7 @@ def _wrap(x, alt_format_): elif formatters is None and float_format is not None: formatters_ = partial(_wrap, alt_format_=lambda v: v) format_index_ = [index_format_, column_format_] + format_index_names_ = [index_format_, column_format_] # Deal with hiding indexes and relabelling column names hide_: list[dict] = [] @@ -3663,6 +3617,7 @@ def _wrap(x, alt_format_): relabel_index=relabel_index_, format={"formatter": formatters_, **base_format_}, format_index=format_index_, + format_index_names=format_index_names_, render_kwargs=render_kwargs_, ) @@ -3675,6 +3630,7 @@ def _to_latex_via_styler( relabel_index: dict | list[dict] | None = None, format: dict | list[dict] | None = None, format_index: dict | list[dict] | None = None, + format_index_names: dict | list[dict] | None = None, render_kwargs: dict | None = None, ): """ @@ -3719,7 +3675,13 @@ def _to_latex_via_styler( self = cast("DataFrame", self) styler = Styler(self, uuid="") - for kw_name in ["hide", "relabel_index", "format", "format_index"]: + for kw_name in [ + "hide", + "relabel_index", + "format", + "format_index", + "format_index_names", + ]: kw = vars()[kw_name] if isinstance(kw, dict): getattr(styler, kw_name)(**kw) @@ -3738,12 +3700,13 @@ def _to_latex_via_styler( def to_csv( self, path_or_buf: None = ..., + *, sep: str = ..., na_rep: str = ..., float_format: str | Callable | None = ..., columns: Sequence[Hashable] | None = ..., - header: bool_t | list[str] = ..., - index: bool_t = ..., + header: bool | list[str] = ..., + index: bool = ..., index_label: IndexLabel | None = ..., mode: str = ..., encoding: str | None = ..., @@ -3753,24 +3716,24 @@ def to_csv( lineterminator: str | None = ..., chunksize: int | None = ..., date_format: str | None = ..., - doublequote: bool_t = ..., + doublequote: bool = ..., escapechar: str | None = ..., decimal: str = ..., errors: OpenFileErrors = ..., storage_options: StorageOptions = ..., - ) -> str: - ... + ) -> str: ... 
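The overload just above returns ``str`` because ``path_or_buf`` is ``None``; its counterpart below returns ``None`` once a real target is passed. A doctest-style sketch of that standard behavior (assuming the usual ``import pandas as pd``):

>>> df = pd.DataFrame({"a": [1, 2]})
>>> df.to_csv()  # path_or_buf=None: the CSV is returned as one string
',a\n0,1\n1,2\n'
>>> df.to_csv("out.csv")  # a path: the file is written, None returned  # doctest: +SKIP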
@overload def to_csv( self, path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str], + *, sep: str = ..., na_rep: str = ..., float_format: str | Callable | None = ..., columns: Sequence[Hashable] | None = ..., - header: bool_t | list[str] = ..., - index: bool_t = ..., + header: bool | list[str] = ..., + index: bool = ..., index_label: IndexLabel | None = ..., mode: str = ..., encoding: str | None = ..., @@ -3780,18 +3743,14 @@ def to_csv( lineterminator: str | None = ..., chunksize: int | None = ..., date_format: str | None = ..., - doublequote: bool_t = ..., + doublequote: bool = ..., escapechar: str | None = ..., decimal: str = ..., errors: OpenFileErrors = ..., storage_options: StorageOptions = ..., - ) -> None: - ... + ) -> None: ... @final - @deprecate_nonkeyword_arguments( - version="3.0", allowed_args=["self", "path_or_buf"], name="to_csv" - ) @doc( storage_options=_shared_docs["storage_options"], compression_options=_shared_docs["compression_options"] % "path_or_buf", @@ -3799,12 +3758,13 @@ def to_csv( def to_csv( self, path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None, + *, sep: str = ",", na_rep: str = "", float_format: str | Callable | None = None, columns: Sequence[Hashable] | None = None, - header: bool_t | list[str] = True, - index: bool_t = True, + header: bool | list[str] = True, + index: bool = True, index_label: IndexLabel | None = None, mode: str = "w", encoding: str | None = None, @@ -3814,7 +3774,7 @@ def to_csv( lineterminator: str | None = None, chunksize: int | None = None, date_format: str | None = None, - doublequote: bool_t = True, + doublequote: bool = True, escapechar: str | None = None, decimal: str = ".", errors: OpenFileErrors = "strict", @@ -3921,31 +3881,42 @@ def to_csv( -------- Create 'out.csv' containing 'df' without indices - >>> df = pd.DataFrame({{'name': ['Raphael', 'Donatello'], - ... 'mask': ['red', 'purple'], - ... 'weapon': ['sai', 'bo staff']}}) - >>> df.to_csv('out.csv', index=False) # doctest: +SKIP + >>> df = pd.DataFrame( + ... [["Raphael", "red", "sai"], ["Donatello", "purple", "bo staff"]], + ... columns=["name", "mask", "weapon"], + ... ) + >>> df.to_csv("out.csv", index=False) # doctest: +SKIP Create 'out.zip' containing 'out.csv' >>> df.to_csv(index=False) 'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n' - >>> compression_opts = dict(method='zip', - ... archive_name='out.csv') # doctest: +SKIP - >>> df.to_csv('out.zip', index=False, - ... compression=compression_opts) # doctest: +SKIP + >>> compression_opts = dict( + ... method="zip", archive_name="out.csv" + ... ) # doctest: +SKIP + >>> df.to_csv( + ... "out.zip", index=False, compression=compression_opts + ... 
) # doctest: +SKIP To write a csv file to a new folder or nested folder you will first need to create it using either Pathlib or os: >>> from pathlib import Path # doctest: +SKIP - >>> filepath = Path('folder/subfolder/out.csv') # doctest: +SKIP + >>> filepath = Path("folder/subfolder/out.csv") # doctest: +SKIP >>> filepath.parent.mkdir(parents=True, exist_ok=True) # doctest: +SKIP >>> df.to_csv(filepath) # doctest: +SKIP >>> import os # doctest: +SKIP - >>> os.makedirs('folder/subfolder', exist_ok=True) # doctest: +SKIP - >>> df.to_csv('folder/subfolder/out.csv') # doctest: +SKIP + >>> os.makedirs("folder/subfolder", exist_ok=True) # doctest: +SKIP + >>> df.to_csv("folder/subfolder/out.csv") # doctest: +SKIP + + Format floats to two decimal places: + + >>> df.to_csv("out1.csv", float_format="%.2f") # doctest: +SKIP + + Format floats using scientific notation: + + >>> df.to_csv("out2.csv", float_format="{{:.2e}}".format) # doctest: +SKIP """ df = self if isinstance(self, ABCDataFrame) else self.to_frame() @@ -3978,50 +3949,12 @@ def to_csv( ) # ---------------------------------------------------------------------- - # Lookup Caching - - def _reset_cacher(self) -> None: - """ - Reset the cacher. - """ - raise AbstractMethodError(self) + # Indexing Methods - def _maybe_update_cacher( - self, - clear: bool_t = False, - verify_is_copy: bool_t = True, - inplace: bool_t = False, - ) -> None: + @final + def take(self, indices, axis: Axis = 0, **kwargs) -> Self: """ - See if we need to update our parent cacher if clear, then clear our - cache. - - Parameters - ---------- - clear : bool, default False - Clear the item cache. - verify_is_copy : bool, default True - Provide is_copy checks. - """ - if using_copy_on_write(): - return - - if verify_is_copy: - self._check_setitem_copy(t="referent") - - if clear: - self._clear_item_cache() - - def _clear_item_cache(self) -> None: - raise AbstractMethodError(self) - - # ---------------------------------------------------------------------- - # Indexing Methods - - @final - def take(self, indices, axis: Axis = 0, **kwargs) -> Self: - """ - Return the elements in the given *positional* indices along an axis. + Return the elements in the given *positional* indices along an axis. This means that we are not indexing according to actual values in the index attribute of the object. We are indexing according to the @@ -4031,7 +3964,7 @@ def take(self, indices, axis: Axis = 0, **kwargs) -> Self: ---------- indices : array-like An array of ints indicating which positions to take. - axis : {0 or 'index', 1 or 'columns', None}, default 0 + axis : {0 or 'index', 1 or 'columns'}, default 0 The axis on which to select elements. ``0`` means that we are selecting rows, ``1`` means that we are selecting columns. For `Series` this parameter is unused and defaults to 0. @@ -4052,12 +3985,16 @@ def take(self, indices, axis: Axis = 0, **kwargs) -> Self: Examples -------- - >>> df = pd.DataFrame([('falcon', 'bird', 389.0), - ... ('parrot', 'bird', 24.0), - ... ('lion', 'mammal', 80.5), - ... ('monkey', 'mammal', np.nan)], - ... columns=['name', 'class', 'max_speed'], - ... index=[0, 2, 3, 1]) + >>> df = pd.DataFrame( + ... [ + ... ("falcon", "bird", 389.0), + ... ("parrot", "bird", 24.0), + ... ("lion", "mammal", 80.5), + ... ("monkey", "mammal", np.nan), + ... ], + ... columns=["name", "class", "max_speed"], + ... index=[0, 2, 3, 1], + ... 
) >>> df name class max_speed 0 falcon bird 389.0 @@ -4096,33 +4033,14 @@ class max_speed nv.validate_take((), kwargs) - if not isinstance(indices, slice): - indices = np.asarray(indices, dtype=np.intp) - if ( - axis == 0 - and indices.ndim == 1 - and using_copy_on_write() - and is_range_indexer(indices, len(self)) - ): - return self.copy(deep=None) - elif self.ndim == 1: + if isinstance(indices, slice): raise TypeError( f"{type(self).__name__}.take requires a sequence of integers, " "not slice." ) - else: - warnings.warn( - # GH#51539 - f"Passing a slice to {type(self).__name__}.take is deprecated " - "and will raise in a future version. Use `obj[slicer]` or pass " - "a sequence of integers instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - # We can get here with a slice via DataFrame.__getitem__ - indices = np.arange( - indices.start, indices.stop, indices.step, dtype=np.intp - ) + indices = np.asarray(indices, dtype=np.intp) + if axis == 0 and indices.ndim == 1 and is_range_indexer(indices, len(self)): + return self.copy(deep=False) new_data = self._mgr.take( indices, @@ -4133,30 +4051,13 @@ class max_speed self, method="take" ) - @final - def _take_with_is_copy(self, indices, axis: Axis = 0) -> Self: - """ - Internal version of the `take` method that sets the `_is_copy` - attribute to keep track of the parent dataframe (using in indexing - for the SettingWithCopyWarning). - - For Series this does the same as the public take (it never sets `_is_copy`). - - See the docstring of `take` for full explanation of the parameters. - """ - result = self.take(indices=indices, axis=axis) - # Maybe set copy if we didn't actually change the index. - if self.ndim == 2 and not result._get_axis(axis).equals(self._get_axis(axis)): - result._set_is_copy(self) - return result - @final def xs( self, key: IndexLabel, axis: Axis = 0, level: IndexLabel | None = None, - drop_level: bool_t = True, + drop_level: bool = True, ) -> Self: """ Return cross-section from the Series/DataFrame. @@ -4200,13 +4101,15 @@ def xs( Examples -------- - >>> d = {'num_legs': [4, 4, 2, 2], - ... 'num_wings': [0, 0, 2, 2], - ... 'class': ['mammal', 'mammal', 'mammal', 'bird'], - ... 'animal': ['cat', 'dog', 'bat', 'penguin'], - ... 'locomotion': ['walks', 'walks', 'flies', 'walks']} + >>> d = { + ... "num_legs": [4, 4, 2, 2], + ... "num_wings": [0, 0, 2, 2], + ... "class": ["mammal", "mammal", "mammal", "bird"], + ... "animal": ["cat", "dog", "bat", "penguin"], + ... "locomotion": ["walks", "walks", "flies", "walks"], + ... } >>> df = pd.DataFrame(data=d) - >>> df = df.set_index(['class', 'animal', 'locomotion']) + >>> df = df.set_index(["class", "animal", "locomotion"]) >>> df num_legs num_wings class animal locomotion @@ -4217,7 +4120,7 @@ class animal locomotion Get values at specified index - >>> df.xs('mammal') + >>> df.xs("mammal") num_legs num_wings animal locomotion cat walks 4 0 @@ -4226,29 +4129,28 @@ class animal locomotion Get values at several indexes - >>> df.xs(('mammal', 'dog', 'walks')) + >>> df.xs(("mammal", "dog", "walks")) num_legs 4 num_wings 0 Name: (mammal, dog, walks), dtype: int64 Get values at specified index and level - >>> df.xs('cat', level=1) + >>> df.xs("cat", level=1) num_legs num_wings class locomotion mammal walks 4 0 Get values at several indexes and levels - >>> df.xs(('bird', 'walks'), - ... 
level=[0, 'locomotion']) + >>> df.xs(("bird", "walks"), level=[0, "locomotion"]) num_legs num_wings animal penguin 2 2 Get values at specified column and axis - >>> df.xs('num_wings', axis=1) + >>> df.xs("num_wings", axis=1) class animal locomotion mammal cat walks 0 dog walks 0 @@ -4297,9 +4199,9 @@ class animal locomotion if isinstance(loc, np.ndarray): if loc.dtype == np.bool_: (inds,) = loc.nonzero() - return self._take_with_is_copy(inds, axis=axis) + return self.take(inds, axis=axis) else: - return self._take_with_is_copy(loc, axis=axis) + return self.take(loc, axis=axis) if not is_scalar(loc): new_index = index[loc] @@ -4325,9 +4227,6 @@ class animal locomotion result = self.iloc[loc] result.index = new_index - # this could be a view - # but only in a single-dtyped view sliceable case - result._set_is_copy(self, copy=not result._is_view) return result def __getitem__(self, item): @@ -4343,9 +4242,7 @@ def _getitem_slice(self, key: slice) -> Self: slobj = self.index._convert_slice_indexer(key, kind="getitem") if isinstance(slobj, np.ndarray): # reachable with DatetimeIndex - indexer = lib.maybe_indices_to_slice( - slobj.astype(np.intp, copy=False), len(self) - ) + indexer = lib.maybe_indices_to_slice(slobj.astype(np.intp), len(self)) if isinstance(indexer, np.ndarray): # GH#43223 If we can not convert, use take return self.take(indexer, axis=0) @@ -4363,111 +4260,8 @@ def _slice(self, slobj: slice, axis: AxisInt = 0) -> Self: new_mgr = self._mgr.get_slice(slobj, axis=axis) result = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes) result = result.__finalize__(self) - - # this could be a view - # but only in a single-dtyped view sliceable case - is_copy = axis != 0 or result._is_view - result._set_is_copy(self, copy=is_copy) return result - @final - def _set_is_copy(self, ref: NDFrame, copy: bool_t = True) -> None: - if not copy: - self._is_copy = None - else: - assert ref is not None - self._is_copy = weakref.ref(ref) - - def _check_is_chained_assignment_possible(self) -> bool_t: - """ - Check if we are a view, have a cacher, and are of mixed type. - If so, then force a setitem_copy check. - - Should be called just near setting a value - - Will return a boolean if it we are a view and are cached, but a - single-dtype meaning that the cacher should be updated following - setting. - """ - if self._is_copy: - self._check_setitem_copy(t="referent") - return False - - @final - def _check_setitem_copy(self, t: str = "setting", force: bool_t = False): - """ - - Parameters - ---------- - t : str, the type of setting error - force : bool, default False - If True, then force showing an error. - - validate if we are doing a setitem on a chained copy. - - It is technically possible to figure out that we are setting on - a copy even WITH a multi-dtyped pandas object. In other words, some - blocks may be views while other are not. Currently _is_view will ALWAYS - return False for multi-blocks to avoid having to handle this case. - - df = DataFrame(np.arange(0,9), columns=['count']) - df['group'] = 'b' - - # This technically need not raise SettingWithCopy if both are view - # (which is not generally guaranteed but is usually True. However, - # this is in general not a good practice and we recommend using .loc. 
- df.iloc[0:5]['group'] = 'a' - - """ - if using_copy_on_write() or warn_copy_on_write(): - return - - # return early if the check is not needed - if not (force or self._is_copy): - return - - value = config.get_option("mode.chained_assignment") - if value is None: - return - - # see if the copy is not actually referred; if so, then dissolve - # the copy weakref - if self._is_copy is not None and not isinstance(self._is_copy, str): - r = self._is_copy() - if not gc.get_referents(r) or (r is not None and r.shape == self.shape): - self._is_copy = None - return - - # a custom message - if isinstance(self._is_copy, str): - t = self._is_copy - - elif t == "referent": - t = ( - "\n" - "A value is trying to be set on a copy of a slice from a " - "DataFrame\n\n" - "See the caveats in the documentation: " - "https://pandas.pydata.org/pandas-docs/stable/user_guide/" - "indexing.html#returning-a-view-versus-a-copy" - ) - - else: - t = ( - "\n" - "A value is trying to be set on a copy of a slice from a " - "DataFrame.\n" - "Try using .loc[row_indexer,col_indexer] = value " - "instead\n\nSee the caveats in the documentation: " - "https://pandas.pydata.org/pandas-docs/stable/user_guide/" - "indexing.html#returning-a-view-versus-a-copy" - ) - - if value == "raise": - raise SettingWithCopyError(t) - if value == "warn": - warnings.warn(t, SettingWithCopyWarning, stacklevel=find_stack_level()) - @final def __delitem__(self, key) -> None: """ @@ -4500,17 +4294,11 @@ def __delitem__(self, key) -> None: loc = self.axes[-1].get_loc(key) self._mgr = self._mgr.idelete(loc) - # delete from the caches - try: - del self._item_cache[key] - except KeyError: - pass - # ---------------------------------------------------------------------- # Unsorted @final - def _check_inplace_and_allows_duplicate_labels(self, inplace: bool_t): + def _check_inplace_and_allows_duplicate_labels(self, inplace: bool) -> None: if inplace and not self.flags.allows_duplicate_labels: raise ValueError( "Cannot specify 'inplace=True' when " @@ -4522,15 +4310,24 @@ def get(self, key, default=None): """ Get item from object for given key (ex: DataFrame column). - Returns default value if not found. + Returns ``default`` value if not found. Parameters ---------- key : object + Key for which item should be returned. + default : object, default None + Default value to return if key is not found. Returns ------- same type as items contained in object + Item for given key or ``default`` value, if key is not found. + + See Also + -------- + DataFrame.get : Get item from object for given key (ex: DataFrame column). + Series.get : Get item from object for given key (ex: DataFrame column). Examples -------- @@ -4559,8 +4356,8 @@ def get(self, key, default=None): 2014-02-14 22.0 medium 2014-02-15 35.0 medium - >>> ser = df['windspeed'] - >>> ser.get('2014-02-13') + >>> ser = df["windspeed"] + >>> ser.get("2014-02-13") 'high' If the key isn't found, the default value will be used. 
@@ -4568,7 +4365,7 @@ def get(self, key, default=None): >>> df.get(["temp_celsius", "temp_kelvin"], default="default_value") 'default_value' - >>> ser.get('2014-02-10', '[unknown]') + >>> ser.get("2014-02-10", "[unknown]") '[unknown]' """ try: @@ -4576,18 +4373,26 @@ def get(self, key, default=None): except (KeyError, ValueError, IndexError): return default - @final - @property - def _is_view(self) -> bool_t: - """Return boolean indicating if self is view of another array""" - return self._mgr.is_view + @staticmethod + def _check_copy_deprecation(copy): + if copy is not lib.no_default: + warnings.warn( + "The copy keyword is deprecated and will be removed in a future " + "version. Copy-on-Write is active in pandas since 3.0 which utilizes " + "a lazy copy mechanism that defers copies until necessary. Use " + ".copy() to make an eager copy if necessary.", + DeprecationWarning, + stacklevel=find_stack_level(), + ) + # issue 58667 + @deprecate_kwarg("method", None) @final def reindex_like( self, other, method: Literal["backfill", "bfill", "pad", "ffill", "nearest"] | None = None, - copy: bool_t | None = None, + copy: bool | lib.NoDefault = lib.no_default, limit: int | None = None, tolerance=None, ) -> Self: @@ -4609,13 +4414,15 @@ def reindex_like( Please note: this is only applicable to DataFrames/Series with a monotonically increasing/decreasing index. + .. deprecated:: 3.0.0 + * None (default): don't fill gaps * pad / ffill: propagate last valid observation forward to next valid * backfill / bfill: use next valid observation to fill gap * nearest: use nearest valid observations to fill gap. - copy : bool, default True + copy : bool, default False Return a new object, even if the passed indexes are the same. .. note:: @@ -4629,6 +4436,8 @@ def reindex_like( You can already get the future behavior and improvements through enabling copy on write ``pd.options.mode.copy_on_write = True`` + + .. deprecated:: 3.0.0 limit : int, default None Maximum number of consecutive labels to fill for inexact matches. tolerance : optional @@ -4660,14 +4469,16 @@ def reindex_like( Examples -------- - >>> df1 = pd.DataFrame([[24.3, 75.7, 'high'], - ... [31, 87.8, 'high'], - ... [22, 71.6, 'medium'], - ... [35, 95, 'medium']], - ... columns=['temp_celsius', 'temp_fahrenheit', - ... 'windspeed'], - ... index=pd.date_range(start='2014-02-12', - ... end='2014-02-15', freq='D')) + >>> df1 = pd.DataFrame( + ... [ + ... [24.3, 75.7, "high"], + ... [31, 87.8, "high"], + ... [22, 71.6, "medium"], + ... [35, 95, "medium"], + ... ], + ... columns=["temp_celsius", "temp_fahrenheit", "windspeed"], + ... index=pd.date_range(start="2014-02-12", end="2014-02-15", freq="D"), + ... ) >>> df1 temp_celsius temp_fahrenheit windspeed @@ -4676,12 +4487,11 @@ def reindex_like( 2014-02-14 22.0 71.6 medium 2014-02-15 35.0 95.0 medium - >>> df2 = pd.DataFrame([[28, 'low'], - ... [30, 'low'], - ... [35.1, 'medium']], - ... columns=['temp_celsius', 'windspeed'], - ... index=pd.DatetimeIndex(['2014-02-12', '2014-02-13', - ... '2014-02-15'])) + >>> df2 = pd.DataFrame( + ... [[28, "low"], [30, "low"], [35.1, "medium"]], + ... columns=["temp_celsius", "windspeed"], + ... index=pd.DatetimeIndex(["2014-02-12", "2014-02-13", "2014-02-15"]), + ... 
) >>> df2 temp_celsius windspeed @@ -4696,10 +4506,10 @@ def reindex_like( 2014-02-14 NaN NaN NaN 2014-02-15 35.1 NaN medium """ + self._check_copy_deprecation(copy) d = other._construct_axes_dict( axes=self._AXIS_ORDERS, method=method, - copy=copy, limit=limit, tolerance=tolerance, ) @@ -4709,54 +4519,51 @@ def reindex_like( @overload def drop( self, - labels: IndexLabel = ..., + labels: IndexLabel | ListLike = ..., *, axis: Axis = ..., - index: IndexLabel = ..., - columns: IndexLabel = ..., + index: IndexLabel | ListLike = ..., + columns: IndexLabel | ListLike = ..., level: Level | None = ..., inplace: Literal[True], errors: IgnoreRaise = ..., - ) -> None: - ... + ) -> None: ... @overload def drop( self, - labels: IndexLabel = ..., + labels: IndexLabel | ListLike = ..., *, axis: Axis = ..., - index: IndexLabel = ..., - columns: IndexLabel = ..., + index: IndexLabel | ListLike = ..., + columns: IndexLabel | ListLike = ..., level: Level | None = ..., inplace: Literal[False] = ..., errors: IgnoreRaise = ..., - ) -> Self: - ... + ) -> Self: ... @overload def drop( self, - labels: IndexLabel = ..., + labels: IndexLabel | ListLike = ..., *, axis: Axis = ..., - index: IndexLabel = ..., - columns: IndexLabel = ..., + index: IndexLabel | ListLike = ..., + columns: IndexLabel | ListLike = ..., level: Level | None = ..., - inplace: bool_t = ..., + inplace: bool = ..., errors: IgnoreRaise = ..., - ) -> Self | None: - ... + ) -> Self | None: ... def drop( self, - labels: IndexLabel | None = None, + labels: IndexLabel | ListLike = None, *, axis: Axis = 0, - index: IndexLabel | None = None, - columns: IndexLabel | None = None, + index: IndexLabel | ListLike = None, + columns: IndexLabel | ListLike = None, level: Level | None = None, - inplace: bool_t = False, + inplace: bool = False, errors: IgnoreRaise = "raise", ) -> Self | None: inplace = validate_bool_kwarg(inplace, "inplace") @@ -4794,7 +4601,7 @@ def _drop_axis( axis, level=None, errors: IgnoreRaise = "raise", - only_slice: bool_t = False, + only_slice: bool = False, ) -> Self: """ Drop labels from specified axis. Used in the ``drop`` method @@ -4865,7 +4672,6 @@ def _drop_axis( indexer, axis=bm_axis, allow_dups=True, - copy=None, only_slice=only_slice, ) result = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes) @@ -4875,22 +4681,17 @@ def _drop_axis( return result.__finalize__(self) @final - def _update_inplace(self, result, verify_is_copy: bool_t = True) -> None: + def _update_inplace(self, result) -> None: """ Replace self internals with result. Parameters ---------- result : same type as self - verify_is_copy : bool, default True - Provide is_copy checks. """ # NOTE: This does *not* call __finalize__ and that's an explicit # decision that we may revisit in the future. 
- self._reset_cache() - self._clear_item_cache() self._mgr = result._mgr - self._maybe_update_cacher(verify_is_copy=verify_is_copy, inplace=True) @final def add_prefix(self, prefix: str, axis: Axis | None = None) -> Self: @@ -4929,14 +4730,14 @@ def add_prefix(self, prefix: str, axis: Axis | None = None) -> Self: 3 4 dtype: int64 - >>> s.add_prefix('item_') + >>> s.add_prefix("item_") item_0 1 item_1 2 item_2 3 item_3 4 dtype: int64 - >>> df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]}) + >>> df = pd.DataFrame({"A": [1, 2, 3, 4], "B": [3, 4, 5, 6]}) >>> df A B 0 1 3 @@ -4944,7 +4745,7 @@ def add_prefix(self, prefix: str, axis: Axis | None = None) -> Self: 2 3 5 3 4 6 - >>> df.add_prefix('col_') + >>> df.add_prefix("col_") col_A col_B 0 1 3 1 2 4 @@ -4959,12 +4760,10 @@ def add_prefix(self, prefix: str, axis: Axis | None = None) -> Self: mapper = {axis_name: f} - # error: Incompatible return value type (got "Optional[Self]", - # expected "Self") - # error: Argument 1 to "rename" of "NDFrame" has incompatible type - # "**Dict[str, partial[str]]"; expected "Union[str, int, None]" # error: Keywords must be strings - return self._rename(**mapper) # type: ignore[return-value, arg-type, misc] + # error: No overload variant of "_rename" of "NDFrame" matches + # argument type "dict[Literal['index', 'columns'], Callable[[Any], str]]" + return self._rename(**mapper) # type: ignore[call-overload, misc] @final def add_suffix(self, suffix: str, axis: Axis | None = None) -> Self: @@ -5003,14 +4802,14 @@ def add_suffix(self, suffix: str, axis: Axis | None = None) -> Self: 3 4 dtype: int64 - >>> s.add_suffix('_item') + >>> s.add_suffix("_item") 0_item 1 1_item 2 2_item 3 3_item 4 dtype: int64 - >>> df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]}) + >>> df = pd.DataFrame({"A": [1, 2, 3, 4], "B": [3, 4, 5, 6]}) >>> df A B 0 1 3 @@ -5018,7 +4817,7 @@ def add_suffix(self, suffix: str, axis: Axis | None = None) -> Self: 2 3 5 3 4 6 - >>> df.add_suffix('_col') + >>> df.add_suffix("_col") A_col B_col 0 1 3 1 2 4 @@ -5032,64 +4831,59 @@ def add_suffix(self, suffix: str, axis: Axis | None = None) -> Self: axis_name = self._get_axis_name(axis) mapper = {axis_name: f} - # error: Incompatible return value type (got "Optional[Self]", - # expected "Self") - # error: Argument 1 to "rename" of "NDFrame" has incompatible type - # "**Dict[str, partial[str]]"; expected "Union[str, int, None]" # error: Keywords must be strings - return self._rename(**mapper) # type: ignore[return-value, arg-type, misc] + # error: No overload variant of "_rename" of "NDFrame" matches argument + # type "dict[Literal['index', 'columns'], Callable[[Any], str]]" + return self._rename(**mapper) # type: ignore[call-overload, misc] @overload def sort_values( self, *, axis: Axis = ..., - ascending: bool_t | Sequence[bool_t] = ..., + ascending: bool | Sequence[bool] = ..., inplace: Literal[False] = ..., kind: SortKind = ..., na_position: NaPosition = ..., - ignore_index: bool_t = ..., + ignore_index: bool = ..., key: ValueKeyFunc = ..., - ) -> Self: - ... + ) -> Self: ... @overload def sort_values( self, *, axis: Axis = ..., - ascending: bool_t | Sequence[bool_t] = ..., + ascending: bool | Sequence[bool] = ..., inplace: Literal[True], kind: SortKind = ..., na_position: NaPosition = ..., - ignore_index: bool_t = ..., + ignore_index: bool = ..., key: ValueKeyFunc = ..., - ) -> None: - ... + ) -> None: ... 
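These signatures type ``ascending`` as ``bool | Sequence[bool]``: a sequence is matched positionally to ``by``, so each sort key can use its own direction. A short doctest-style sketch of that standard behavior:

>>> df = pd.DataFrame({"a": [2, 1, 1], "b": [1, 3, 2]})
>>> df.sort_values(by=["a", "b"], ascending=[True, False])
   a  b
1  1  3
2  1  2
0  2  1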
@overload def sort_values( self, *, axis: Axis = ..., - ascending: bool_t | Sequence[bool_t] = ..., - inplace: bool_t = ..., + ascending: bool | Sequence[bool] = ..., + inplace: bool = ..., kind: SortKind = ..., na_position: NaPosition = ..., - ignore_index: bool_t = ..., + ignore_index: bool = ..., key: ValueKeyFunc = ..., - ) -> Self | None: - ... + ) -> Self | None: ... def sort_values( self, *, axis: Axis = 0, - ascending: bool_t | Sequence[bool_t] = True, - inplace: bool_t = False, + ascending: bool | Sequence[bool] = True, + inplace: bool = False, kind: SortKind = "quicksort", na_position: NaPosition = "last", - ignore_index: bool_t = False, + ignore_index: bool = False, key: ValueKeyFunc | None = None, ) -> Self | None: """ @@ -5121,7 +4915,8 @@ def sort_values( builtin :meth:`sorted` function, with the notable difference that this `key` function should be *vectorized*. It should expect a ``Series`` and return a Series with the same shape as the input. - It will be applied to each column in `by` independently. + It will be applied to each column in `by` independently. The values in the + returned Series will be used as the keys for sorting. Returns ------- @@ -5135,12 +4930,14 @@ def sort_values( Examples -------- - >>> df = pd.DataFrame({ - ... 'col1': ['A', 'A', 'B', np.nan, 'D', 'C'], - ... 'col2': [2, 1, 9, 8, 7, 4], - ... 'col3': [0, 1, 9, 4, 2, 3], - ... 'col4': ['a', 'B', 'c', 'D', 'e', 'F'] - ... }) + >>> df = pd.DataFrame( + ... { + ... "col1": ["A", "A", "B", np.nan, "D", "C"], + ... "col2": [2, 1, 9, 8, 7, 4], + ... "col3": [0, 1, 9, 4, 2, 3], + ... "col4": ["a", "B", "c", "D", "e", "F"], + ... } + ... ) >>> df col1 col2 col3 col4 0 A 2 0 a @@ -5152,7 +4949,7 @@ def sort_values( Sort by col1 - >>> df.sort_values(by=['col1']) + >>> df.sort_values(by=["col1"]) col1 col2 col3 col4 0 A 2 0 a 1 A 1 1 B @@ -5163,7 +4960,7 @@ def sort_values( Sort by multiple columns - >>> df.sort_values(by=['col1', 'col2']) + >>> df.sort_values(by=["col1", "col2"]) col1 col2 col3 col4 1 A 1 1 B 0 A 2 0 a @@ -5174,7 +4971,7 @@ def sort_values( Sort Descending - >>> df.sort_values(by='col1', ascending=False) + >>> df.sort_values(by="col1", ascending=False) col1 col2 col3 col4 4 D 7 2 e 5 C 4 3 F @@ -5185,7 +4982,7 @@ def sort_values( Putting NAs first - >>> df.sort_values(by='col1', ascending=False, na_position='first') + >>> df.sort_values(by="col1", ascending=False, na_position="first") col1 col2 col3 col4 3 NaN 8 4 D 4 D 7 2 e @@ -5196,7 +4993,7 @@ def sort_values( Sorting with a key function - >>> df.sort_values(by='col4', key=lambda col: col.str.lower()) + >>> df.sort_values(by="col4", key=lambda col: col.str.lower()) col1 col2 col3 col4 0 A 2 0 a 1 A 1 1 B @@ -5208,10 +5005,12 @@ def sort_values( Natural sort with the key argument, using the `natsort ` package. - >>> df = pd.DataFrame({ - ... "time": ['0hr', '128hr', '72hr', '48hr', '96hr'], - ... "value": [10, 20, 30, 40, 50] - ... }) + >>> df = pd.DataFrame( + ... { + ... "time": ["0hr", "128hr", "72hr", "48hr", "96hr"], + ... "value": [10, 20, 30, 40, 50], + ... } + ... ) >>> df time value 0 0hr 10 @@ -5221,8 +5020,7 @@ def sort_values( 4 96hr 50 >>> from natsort import index_natsorted >>> df.sort_values( - ... by="time", - ... key=lambda x: np.argsort(index_natsorted(df["time"])) + ... by="time", key=lambda x: np.argsort(index_natsorted(df["time"])) ... 
) time value 0 0hr 10 @@ -5239,15 +5037,14 @@ def sort_index( *, axis: Axis = ..., level: IndexLabel = ..., - ascending: bool_t | Sequence[bool_t] = ..., + ascending: bool | Sequence[bool] = ..., inplace: Literal[True], kind: SortKind = ..., na_position: NaPosition = ..., - sort_remaining: bool_t = ..., - ignore_index: bool_t = ..., + sort_remaining: bool = ..., + ignore_index: bool = ..., key: IndexKeyFunc = ..., - ) -> None: - ... + ) -> None: ... @overload def sort_index( @@ -5255,15 +5052,14 @@ def sort_index( *, axis: Axis = ..., level: IndexLabel = ..., - ascending: bool_t | Sequence[bool_t] = ..., + ascending: bool | Sequence[bool] = ..., inplace: Literal[False] = ..., kind: SortKind = ..., na_position: NaPosition = ..., - sort_remaining: bool_t = ..., - ignore_index: bool_t = ..., + sort_remaining: bool = ..., + ignore_index: bool = ..., key: IndexKeyFunc = ..., - ) -> Self: - ... + ) -> Self: ... @overload def sort_index( @@ -5271,27 +5067,26 @@ def sort_index( *, axis: Axis = ..., level: IndexLabel = ..., - ascending: bool_t | Sequence[bool_t] = ..., - inplace: bool_t = ..., + ascending: bool | Sequence[bool] = ..., + inplace: bool = ..., kind: SortKind = ..., na_position: NaPosition = ..., - sort_remaining: bool_t = ..., - ignore_index: bool_t = ..., + sort_remaining: bool = ..., + ignore_index: bool = ..., key: IndexKeyFunc = ..., - ) -> Self | None: - ... + ) -> Self | None: ... def sort_index( self, *, axis: Axis = 0, level: IndexLabel | None = None, - ascending: bool_t | Sequence[bool_t] = True, - inplace: bool_t = False, + ascending: bool | Sequence[bool] = True, + inplace: bool = False, kind: SortKind = "quicksort", na_position: NaPosition = "last", - sort_remaining: bool_t = True, - ignore_index: bool_t = False, + sort_remaining: bool = True, + ignore_index: bool = False, key: IndexKeyFunc | None = None, ) -> Self | None: inplace = validate_bool_kwarg(inplace, "inplace") @@ -5308,10 +5103,13 @@ def sort_index( if inplace: result = self else: - result = self.copy(deep=None) + result = self.copy(deep=False) if ignore_index: - result.index = default_index(len(self)) + if axis == 1: + result.columns = default_index(len(self.columns)) + else: + result.index = default_index(len(self)) if inplace: return None else: @@ -5346,7 +5144,7 @@ def reindex( columns=None, axis: Axis | None = None, method: ReindexMethod | None = None, - copy: bool_t | None = None, + copy: bool | lib.NoDefault = lib.no_default, level: Level | None = None, fill_value: Scalar | None = np.nan, limit: int | None = None, @@ -5373,7 +5171,7 @@ def reindex( * backfill / bfill: Use next valid observation to fill gap. * nearest: Use nearest valid observations to fill gap. - copy : bool, default True + copy : bool, default False Return a new object, even if the passed indexes are the same. .. note:: @@ -5387,6 +5185,8 @@ def reindex( You can already get the future behavior and improvements through enabling copy on write ``pd.options.mode.copy_on_write = True`` + + .. deprecated:: 3.0.0 level : int or name Broadcast across a level, matching Index values on the passed MultiIndex level. @@ -5408,7 +5208,8 @@ def reindex( Returns ------- - {klass} with changed index. + {klass} + {klass} with changed index. See Also -------- @@ -5426,12 +5227,15 @@ def reindex( We *highly* recommend using keyword arguments to clarify your intent. - Create a dataframe with some fictional data. + Create a DataFrame with some fictional data. 
- >>> index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror'] - >>> df = pd.DataFrame({{'http_status': [200, 200, 404, 404, 301], - ... 'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]}}, - ... index=index) + >>> index = ["Firefox", "Chrome", "Safari", "IE10", "Konqueror"] + >>> columns = ["http_status", "response_time"] + >>> df = pd.DataFrame( + ... [[200, 0.04], [200, 0.02], [404, 0.07], [404, 0.08], [301, 1.0]], + ... columns=columns, + ... index=index, + ... ) >>> df http_status response_time Firefox 200 0.04 @@ -5440,12 +5244,11 @@ def reindex( IE10 404 0.08 Konqueror 301 1.00 - Create a new index and reindex the dataframe. By default + Create a new index and reindex the DataFrame. By default values in the new index that do not have corresponding - records in the dataframe are assigned ``NaN``. + records in the DataFrame are assigned ``NaN``. - >>> new_index = ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10', - ... 'Chrome'] + >>> new_index = ["Safari", "Iceweasel", "Comodo Dragon", "IE10", "Chrome"] >>> df.reindex(new_index) http_status response_time Safari 404.0 0.07 @@ -5467,7 +5270,7 @@ def reindex( IE10 404 0.08 Chrome 200 0.02 - >>> df.reindex(new_index, fill_value='missing') + >>> df.reindex(new_index, fill_value="missing") http_status response_time Safari 404 0.07 Iceweasel missing missing @@ -5477,7 +5280,7 @@ def reindex( We can also reindex the columns. - >>> df.reindex(columns=['http_status', 'user_agent']) + >>> df.reindex(columns=["http_status", "user_agent"]) http_status user_agent Firefox 200 NaN Chrome 200 NaN @@ -5487,7 +5290,7 @@ def reindex( Or we can use "axis-style" keyword arguments - >>> df.reindex(['http_status', 'user_agent'], axis="columns") + >>> df.reindex(["http_status", "user_agent"], axis="columns") http_status user_agent Firefox 200 NaN Chrome 200 NaN @@ -5496,13 +5299,14 @@ def reindex( Konqueror 301 NaN To further illustrate the filling functionality in - ``reindex``, we will create a dataframe with a + ``reindex``, we will create a DataFrame with a monotonically increasing index (for example, a sequence of dates). - >>> date_index = pd.date_range('1/1/2010', periods=6, freq='D') - >>> df2 = pd.DataFrame({{"prices": [100, 101, np.nan, 100, 89, 88]}}, - ... index=date_index) + >>> date_index = pd.date_range("1/1/2010", periods=6, freq="D") + >>> df2 = pd.DataFrame( + ... {{"prices": [100, 101, np.nan, 100, 89, 88]}}, index=date_index + ... ) >>> df2 prices 2010-01-01 100.0 @@ -5512,10 +5316,10 @@ def reindex( 2010-01-05 89.0 2010-01-06 88.0 - Suppose we decide to expand the dataframe to cover a wider + Suppose we decide to expand the DataFrame to cover a wider date range. - >>> date_index2 = pd.date_range('12/29/2009', periods=10, freq='D') + >>> date_index2 = pd.date_range("12/29/2009", periods=10, freq="D") >>> df2.reindex(date_index2) prices 2009-12-29 NaN @@ -5537,7 +5341,7 @@ def reindex( For example, to back-propagate the last valid value to fill the ``NaN`` values, pass ``bfill`` as an argument to the ``method`` keyword. - >>> df2.reindex(date_index2, method='bfill') + >>> df2.reindex(date_index2, method="bfill") prices 2009-12-29 100.0 2009-12-30 100.0 @@ -5550,17 +5354,18 @@ def reindex( 2010-01-06 88.0 2010-01-07 NaN - Please note that the ``NaN`` value present in the original dataframe + Please note that the ``NaN`` value present in the original DataFrame (at index value 2010-01-03) will not be filled by any of the value propagation schemes. 
This is because filling while reindexing - does not look at dataframe values, but only compares the original and + does not look at DataFrame values, but only compares the original and desired indexes. If you do want to fill in the ``NaN`` values present - in the original dataframe, use the ``fillna()`` method. + in the original DataFrame, use the ``fillna()`` method. See the :ref:`user guide ` for more. """ # TODO: Decide if we care about having different examples for different # kinds + self._check_copy_deprecation(copy) if index is not None and columns is not None and labels is not None: raise TypeError("Cannot specify all of 'labels', 'index', 'columns'.") @@ -5587,22 +5392,20 @@ def reindex( # if all axes that are requested to reindex are equal, then only copy # if indicated must have index names equal here as well as values - if copy and using_copy_on_write(): - copy = False if all( self._get_axis(axis_name).identical(ax) for axis_name, ax in axes.items() if ax is not None ): - return self.copy(deep=copy) + return self.copy(deep=False) # check if we are a multi reindex if self._needs_reindex_multi(axes, method, level): - return self._reindex_multi(axes, copy, fill_value) + return self._reindex_multi(axes, fill_value) # perform the reindex on the axes return self._reindex_axes( - axes, level, limit, tolerance, method, fill_value, copy + axes, level, limit, tolerance, method, fill_value ).__finalize__(self, method="reindex") @final @@ -5614,7 +5417,6 @@ def _reindex_axes( tolerance, method, fill_value: Scalar | None, - copy: bool_t | None, ) -> Self: """Perform the reindex for all the axes.""" obj = self @@ -5632,15 +5434,12 @@ def _reindex_axes( obj = obj._reindex_with_indexers( {axis: [new_index, indexer]}, fill_value=fill_value, - copy=copy, allow_dups=False, ) - # If we've made a copy once, no need to make another one - copy = False return obj - def _needs_reindex_multi(self, axes, method, level: Level | None) -> bool_t: + def _needs_reindex_multi(self, axes, method, level: Level | None) -> bool: """Check if we do need a multi reindex.""" return ( (common.count_not_none(*axes.values()) == self._AXIS_LEN) @@ -5651,7 +5450,7 @@ def _needs_reindex_multi(self, axes, method, level: Level | None) -> bool_t: and self._can_fast_transpose ) - def _reindex_multi(self, axes, copy, fill_value): + def _reindex_multi(self, axes, fill_value): raise AbstractMethodError(self) @final @@ -5659,8 +5458,7 @@ def _reindex_with_indexers( self, reindexers, fill_value=None, - copy: bool_t | None = False, - allow_dups: bool_t = False, + allow_dups: bool = False, ) -> Self: """allow_dups indicates an internal call here""" # reindex doing multiple operations on different axes if indicated @@ -5683,18 +5481,9 @@ def _reindex_with_indexers( axis=baxis, fill_value=fill_value, allow_dups=allow_dups, - copy=copy, ) - # If we've made a copy once, no need to make another one - copy = False - if ( - (copy or copy is None) - and new_data is self._mgr - and not using_copy_on_write() - ): - new_data = new_data.copy(deep=copy) - elif using_copy_on_write() and new_data is self._mgr: + if new_data is self._mgr: new_data = new_data.copy(deep=False) return self._constructor_from_mgr(new_data, axes=new_data.axes).__finalize__( @@ -5709,10 +5498,11 @@ def filter( axis: Axis | None = None, ) -> Self: """ - Subset the dataframe rows or columns according to the specified index labels. + Subset the DataFrame or Series according to the specified index labels. - Note that this routine does not filter a dataframe on its - contents. 
The filter is applied to the labels of the index. + For DataFrame, filter rows or columns depending on ``axis`` argument. + Note that this routine does not filter based on content. + The filter is applied to the labels of the index. Parameters ---------- @@ -5725,11 +5515,13 @@ def filter( axis : {0 or 'index', 1 or 'columns', None}, default None The axis to filter on, expressed either as an index (int) or axis name (str). By default this is the info axis, 'columns' for - DataFrame. For `Series` this parameter is unused and defaults to `None`. + ``DataFrame``. For ``Series`` this parameter is unused and defaults to + ``None``. Returns ------- - same type as input object + Same type as caller + The filtered subset of the DataFrame or Series. See Also -------- @@ -5746,36 +5538,37 @@ def filter( Examples -------- - >>> df = pd.DataFrame(np.array(([1, 2, 3], [4, 5, 6])), - ... index=['mouse', 'rabbit'], - ... columns=['one', 'two', 'three']) + >>> df = pd.DataFrame( + ... np.array(([1, 2, 3], [4, 5, 6])), + ... index=["mouse", "rabbit"], + ... columns=["one", "two", "three"], + ... ) >>> df one two three mouse 1 2 3 rabbit 4 5 6 >>> # select columns by name - >>> df.filter(items=['one', 'three']) + >>> df.filter(items=["one", "three"]) one three mouse 1 3 rabbit 4 6 >>> # select columns by regular expression - >>> df.filter(regex='e$', axis=1) + >>> df.filter(regex="e$", axis=1) one three mouse 1 3 rabbit 4 6 >>> # select rows containing 'bbi' - >>> df.filter(like='bbi', axis=0) + >>> df.filter(like="bbi", axis=0) one two three rabbit 4 5 6 """ nkw = common.count_not_none(items, like, regex) if nkw > 1: raise TypeError( - "Keyword arguments `items`, `like`, or `regex` " - "are mutually exclusive" + "Keyword arguments `items`, `like`, or `regex` are mutually exclusive" ) if axis is None: @@ -5792,7 +5585,7 @@ def filter( return self.reindex(**{name: items}) # type: ignore[misc] elif like: - def f(x) -> bool_t: + def f(x) -> bool: assert like is not None # needed for mypy return like in ensure_str(x) @@ -5800,7 +5593,7 @@ def f(x) -> bool_t: return self.loc(axis=axis)[values] elif regex: - def f(x) -> bool_t: + def f(x) -> bool: return matcher.search(ensure_str(x)) is not None matcher = re.compile(regex) @@ -5814,14 +5607,15 @@ def head(self, n: int = 5) -> Self: """ Return the first `n` rows. - This function returns the first `n` rows for the object based - on position. It is useful for quickly testing if your object - has the right type of data in it. + This function exhibits the same behavior as ``df[:n]``, returning the + first ``n`` rows based on position. It is useful for quickly checking + if your object has the right type of data in it. - For negative values of `n`, this function returns all rows except - the last `|n|` rows, equivalent to ``df[:n]``. + When ``n`` is positive, it returns the first ``n`` rows. For ``n`` equal to 0, + it returns an empty object. When ``n`` is negative, it returns + all rows except the last ``|n|`` rows, mirroring the behavior of ``df[:n]``. - If n is larger than the number of rows, this function returns all rows. + If ``n`` is larger than the number of rows, this function returns all rows. Parameters ---------- @@ -5839,8 +5633,21 @@ def head(self, n: int = 5) -> Self: Examples -------- - >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion', - ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']}) + >>> df = pd.DataFrame( + ... { + ... "animal": [ + ... "alligator", + ... "bee", + ... "falcon", + ... "lion", + ... "monkey", + ... 
"parrot", + ... "shark", + ... "whale", + ... "zebra", + ... ] + ... } + ... ) >>> df animal 0 alligator @@ -5882,9 +5689,7 @@ def head(self, n: int = 5) -> Self: 4 monkey 5 parrot """ - if using_copy_on_write(): - return self.iloc[:n].copy() - return self.iloc[:n] + return self.iloc[:n].copy() @final def tail(self, n: int = 5) -> Self: @@ -5898,7 +5703,7 @@ def tail(self, n: int = 5) -> Self: For negative values of `n`, this function returns all rows except the first `|n|` rows, equivalent to ``df[|n|:]``. - If n is larger than the number of rows, this function returns all rows. + If ``n`` is larger than the number of rows, this function returns all rows. Parameters ---------- @@ -5916,8 +5721,21 @@ def tail(self, n: int = 5) -> Self: Examples -------- - >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion', - ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']}) + >>> df = pd.DataFrame( + ... { + ... "animal": [ + ... "alligator", + ... "bee", + ... "falcon", + ... "lion", + ... "monkey", + ... "parrot", + ... "shark", + ... "whale", + ... "zebra", + ... ] + ... } + ... ) >>> df animal 0 alligator @@ -5959,24 +5777,20 @@ def tail(self, n: int = 5) -> Self: 7 whale 8 zebra """ - if using_copy_on_write(): - if n == 0: - return self.iloc[0:0].copy() - return self.iloc[-n:].copy() if n == 0: - return self.iloc[0:0] - return self.iloc[-n:] + return self.iloc[0:0].copy() + return self.iloc[-n:].copy() @final def sample( self, n: int | None = None, frac: float | None = None, - replace: bool_t = False, + replace: bool = False, weights=None, random_state: RandomState | None = None, axis: Axis | None = None, - ignore_index: bool_t = False, + ignore_index: bool = False, ) -> Self: """ Return a random sample of items from an axis of object. @@ -5993,7 +5807,7 @@ def sample( replace : bool, default False Allow or disallow sampling of the same row more than once. weights : str or ndarray-like, optional - Default 'None' results in equal probability weighting. + Default ``None`` results in equal probability weighting. If passed a Series, will align with target object on index. Index values in weights not found in sampled object will be ignored and index values in sampled object not in weights will be assigned @@ -6008,6 +5822,7 @@ def sample( random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional If int, array-like, or BitGenerator, seed for random number generator. If np.random.RandomState or np.random.Generator, use as given. + Default ``None`` results in sampling with the current state of np.random. .. versionchanged:: 1.4.0 @@ -6042,10 +5857,14 @@ def sample( Examples -------- - >>> df = pd.DataFrame({'num_legs': [2, 4, 8, 0], - ... 'num_wings': [2, 0, 0, 0], - ... 'num_specimen_seen': [10, 2, 1, 8]}, - ... index=['falcon', 'dog', 'spider', 'fish']) + >>> df = pd.DataFrame( + ... { + ... "num_legs": [2, 4, 8, 0], + ... "num_wings": [2, 0, 0, 0], + ... "num_specimen_seen": [10, 2, 1, 8], + ... }, + ... index=["falcon", "dog", "spider", "fish"], + ... ) >>> df num_legs num_wings num_specimen_seen falcon 2 2 10 @@ -6057,7 +5876,7 @@ def sample( Note that we use `random_state` to ensure the reproducibility of the examples. - >>> df['num_legs'].sample(n=3, random_state=1) + >>> df["num_legs"].sample(n=3, random_state=1) fish 0 spider 8 falcon 2 @@ -6087,7 +5906,7 @@ def sample( Using a DataFrame column as weights. Rows with larger value in the `num_specimen_seen` column are more likely to be sampled. 
- >>> df.sample(n=2, weights='num_specimen_seen', random_state=1) + >>> df.sample(n=2, weights="num_specimen_seen", random_state=1) num_legs num_wings num_specimen_seen falcon 2 2 10 fish 0 0 8 @@ -6117,13 +5936,29 @@ def sample( return result + @overload + def pipe( + self, + func: Callable[Concatenate[Self, P], T], + *args: P.args, + **kwargs: P.kwargs, + ) -> T: ... + + @overload + def pipe( + self, + func: tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, + ) -> T: ... + @final @doc(klass=_shared_doc_kwargs["klass"]) def pipe( self, - func: Callable[..., T] | tuple[Callable[..., T], str], - *args, - **kwargs, + func: Callable[Concatenate[Self, P], T] | tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, ) -> T: r""" Apply chainable functions that expect Series or DataFrames. @@ -6143,7 +5978,8 @@ def pipe( Returns ------- - the return type of ``func``. + The return type of ``func``. + The result of applying ``func`` to the Series or DataFrame. See Also -------- @@ -6159,10 +5995,10 @@ def pipe( Examples -------- - Constructing a income DataFrame from a dictionary. + Constructing an income DataFrame from a dictionary. >>> data = [[8000, 1000], [9500, np.nan], [5000, 2000]] - >>> df = pd.DataFrame(data, columns=['Salary', 'Others']) + >>> df = pd.DataFrame(data, columns=["Salary", "Others"]) >>> df Salary Others 0 8000 1000.0 @@ -6184,7 +6020,8 @@ def pipe( >>> subtract_national_insurance( ... subtract_state_tax(subtract_federal_tax(df), rate=0.12), ... rate=0.05, - ... rate_increase=0.02) # doctest: +SKIP + ... rate_increase=0.02, + ... ) # doctest: +SKIP You can write @@ -6210,9 +6047,7 @@ def pipe( ... df.pipe(subtract_federal_tax) ... .pipe(subtract_state_tax, rate=0.12) ... .pipe( - ... (subtract_national_insurance, 'df'), - ... rate=0.05, - ... rate_increase=0.02 + ... (subtract_national_insurance, "df"), rate=0.05, rate_increase=0.02 ... ) ... ) Salary Others @@ -6220,9 +6055,7 @@ def pipe( 1 6997.32 NaN 2 3682.80 1473.12 """ - if using_copy_on_write(): - return common.pipe(self.copy(deep=None), func, *args, **kwargs) - return common.pipe(self, func, *args, **kwargs) + return common.pipe(self.copy(deep=False), func, *args, **kwargs) # ---------------------------------------------------------------------- # Attribute access @@ -6252,25 +6085,26 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self: # One could make the deepcopy unconditionally, but a deepcopy # of an empty dict is 50x more expensive than the empty check. self.attrs = deepcopy(other.attrs) - - self.flags.allows_duplicate_labels = other.flags.allows_duplicate_labels + self.flags.allows_duplicate_labels = ( + self.flags.allows_duplicate_labels + and other.flags.allows_duplicate_labels + ) # For subclasses using _metadata. 
for name in set(self._metadata) & set(other._metadata): assert isinstance(name, str) object.__setattr__(self, name, getattr(other, name, None)) if method == "concat": + objs = other.objs # propagate attrs only if all concat arguments have the same attrs - if all(bool(obj.attrs) for obj in other.objs): + if all(bool(obj.attrs) for obj in objs): # all concatenate arguments have non-empty attrs - attrs = other.objs[0].attrs - have_same_attrs = all(obj.attrs == attrs for obj in other.objs[1:]) + attrs = objs[0].attrs + have_same_attrs = all(obj.attrs == attrs for obj in objs[1:]) if have_same_attrs: self.attrs = deepcopy(attrs) - allows_duplicate_labels = all( - x.flags.allows_duplicate_labels for x in other.objs - ) + allows_duplicate_labels = all(x.flags.allows_duplicate_labels for x in objs) self.flags.allows_duplicate_labels = allows_duplicate_labels return self @@ -6348,28 +6182,11 @@ def _dir_additions(self) -> set[str]: # ---------------------------------------------------------------------- # Consolidation of internals - @final - def _protect_consolidate(self, f): - """ - Consolidate _mgr -- if the blocks have changed, then clear the - cache - """ - if isinstance(self._mgr, (ArrayManager, SingleArrayManager)): - return f() - blocks_before = len(self._mgr.blocks) - result = f() - if len(self._mgr.blocks) != blocks_before: - self._clear_item_cache() - return result - @final def _consolidate_inplace(self) -> None: """Consolidate data in place and return None""" - def f() -> None: - self._mgr = self._mgr.consolidate() - - self._protect_consolidate(f) + self._mgr = self._mgr.consolidate() @final def _consolidate(self): @@ -6381,15 +6198,14 @@ def _consolidate(self): ------- consolidated : same type as caller """ - f = lambda: self._mgr.consolidate() - cons_data = self._protect_consolidate(f) + cons_data = self._mgr.consolidate() return self._constructor_from_mgr(cons_data, axes=cons_data.axes).__finalize__( self ) @final @property - def _is_mixed_type(self) -> bool_t: + def _is_mixed_type(self) -> bool: if self._mgr.is_single_block: # Includes all Series cases return False @@ -6438,16 +6254,24 @@ def dtypes(self): pandas.Series The data type of each column. + See Also + -------- + Series.dtypes : Return the dtype object of the underlying data. + Examples -------- - >>> df = pd.DataFrame({'float': [1.0], - ... 'int': [1], - ... 'datetime': [pd.Timestamp('20180310')], - ... 'string': ['foo']}) + >>> df = pd.DataFrame( + ... { + ... "float": [1.0], + ... "int": [1], + ... "datetime": [pd.Timestamp("20180310")], + ... "string": ["foo"], + ... } + ... ) >>> df.dtypes float float64 int int64 - datetime datetime64[ns] + datetime datetime64[s] string object dtype: object """ @@ -6456,11 +6280,19 @@ def dtypes(self): @final def astype( - self, dtype, copy: bool_t | None = None, errors: IgnoreRaise = "raise" + self, + dtype, + copy: bool | lib.NoDefault = lib.no_default, + errors: IgnoreRaise = "raise", ) -> Self: """ Cast a pandas object to a specified dtype ``dtype``. + This method allows the conversion of the data types of pandas objects, + including DataFrames and Series, to the specified dtype. It supports casting + entire objects to a single data type or applying different data types to + individual columns using a mapping. + Parameters ---------- dtype : str, data type, Series or Mapping of column name -> data type @@ -6469,7 +6301,7 @@ def astype( mapping, e.g. 
{col: dtype, ...}, where col is a column label and dtype is a numpy.dtype or Python type to cast one or more of the DataFrame's columns to column-specific types. - copy : bool, default True + copy : bool, default False Return a copy when ``copy=True`` (be very careful setting ``copy=False`` as changes to values then may propagate to other pandas objects). @@ -6485,6 +6317,8 @@ def astype( You can already get the future behavior and improvements through enabling copy on write ``pd.options.mode.copy_on_write = True`` + + .. deprecated:: 3.0.0 errors : {'raise', 'ignore'}, default 'raise' Control raising of exceptions on invalid data for provided dtype. @@ -6494,6 +6328,7 @@ def astype( Returns ------- same type as caller + The pandas object casted to the specified ``dtype``. See Also -------- @@ -6514,7 +6349,7 @@ def astype( -------- Create a DataFrame: - >>> d = {'col1': [1, 2], 'col2': [3, 4]} + >>> d = {"col1": [1, 2], "col2": [3, 4]} >>> df = pd.DataFrame(data=d) >>> df.dtypes col1 int64 @@ -6523,33 +6358,33 @@ def astype( Cast all columns to int32: - >>> df.astype('int32').dtypes + >>> df.astype("int32").dtypes col1 int32 col2 int32 dtype: object Cast col1 to int32 using a dictionary: - >>> df.astype({'col1': 'int32'}).dtypes + >>> df.astype({"col1": "int32"}).dtypes col1 int32 col2 int64 dtype: object Create a series: - >>> ser = pd.Series([1, 2], dtype='int32') + >>> ser = pd.Series([1, 2], dtype="int32") >>> ser 0 1 1 2 dtype: int32 - >>> ser.astype('int64') + >>> ser.astype("int64") 0 1 1 2 dtype: int64 Convert to categorical type: - >>> ser.astype('category') + >>> ser.astype("category") 0 1 1 2 dtype: category @@ -6558,8 +6393,7 @@ def astype( Convert to ordered categorical type with custom ordering: >>> from pandas.api.types import CategoricalDtype - >>> cat_dtype = CategoricalDtype( - ... categories=[2, 1], ordered=True) + >>> cat_dtype = CategoricalDtype(categories=[2, 1], ordered=True) >>> ser.astype(cat_dtype) 0 1 1 2 @@ -6568,16 +6402,14 @@ def astype( Create a series of dates: - >>> ser_date = pd.Series(pd.date_range('20200101', periods=3)) + >>> ser_date = pd.Series(pd.date_range("20200101", periods=3)) >>> ser_date 0 2020-01-01 1 2020-01-02 2 2020-01-03 dtype: datetime64[ns] """ - if copy and using_copy_on_write(): - copy = False - + self._check_copy_deprecation(copy) if is_dict_like(dtype): if self.ndim == 1: # i.e. Series if len(dtype) > 1 or self.name not in dtype: @@ -6586,7 +6418,7 @@ def astype( "the key in Series dtype mappings." ) new_type = dtype[self.name] - return self.astype(new_type, copy, errors) + return self.astype(new_type, errors=errors) # GH#44417 cast to Series so we can use .iat below, which will be # robust in case we @@ -6602,16 +6434,16 @@ def astype( f"'{col_name}' not found in columns." 
) - dtype_ser = dtype_ser.reindex(self.columns, fill_value=None, copy=False) + dtype_ser = dtype_ser.reindex(self.columns, fill_value=None) results = [] for i, (col_name, col) in enumerate(self.items()): cdt = dtype_ser.iat[i] if isna(cdt): - res_col = col.copy(deep=copy) + res_col = col.copy(deep=False) else: try: - res_col = col.astype(dtype=cdt, copy=copy, errors=errors) + res_col = col.astype(dtype=cdt, errors=errors) except ValueError as ex: ex.args = ( f"{ex}: Error while type casting for column '{col_name}'", @@ -6623,27 +6455,25 @@ def astype( # TODO(EA2D): special case not needed with 2D EAs dtype = pandas_dtype(dtype) if isinstance(dtype, ExtensionDtype) and all( - arr.dtype == dtype for arr in self._mgr.arrays + block.values.dtype == dtype for block in self._mgr.blocks ): - return self.copy(deep=copy) + return self.copy(deep=False) # GH 18099/22869: columnwise conversion to extension dtype # GH 24704: self.items handles duplicate column names - results = [ - ser.astype(dtype, copy=copy, errors=errors) for _, ser in self.items() - ] + results = [ser.astype(dtype, errors=errors) for _, ser in self.items()] else: # else, only a single dtype is given - new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors) + new_data = self._mgr.astype(dtype=dtype, errors=errors) res = self._constructor_from_mgr(new_data, axes=new_data.axes) return res.__finalize__(self, method="astype") # GH 33113: handle empty frame or series if not results: - return self.copy(deep=None) + return self.copy(deep=False) # GH 19920: retain column metadata after concat - result = concat(results, axis=1, copy=False) + result = concat(results, axis=1) # GH#40810 retain subclass # error: Incompatible types in assignment # (expression has type "Self", variable has type "DataFrame") @@ -6654,7 +6484,7 @@ def astype( return cast(Self, result) @final - def copy(self, deep: bool_t | None = True) -> Self: + def copy(self, deep: bool = True) -> Self: """ Make a copy of this object's indices and data. @@ -6694,6 +6524,11 @@ def copy(self, deep: bool_t | None = True) -> Self: Series or DataFrame Object type matches caller. + See Also + -------- + copy.copy : Return a shallow copy of an object. + copy.deepcopy : Return a deep copy of an object. + Notes ----- When ``deep=True``, data is copied but actual Python objects @@ -6710,8 +6545,7 @@ def copy(self, deep: bool_t | None = True) -> Self: :ref:`gotchas ` when copying in a threading environment. - When ``copy_on_write`` in pandas config is set to ``True``, the - ``copy_on_write`` config takes effect even when ``deep=False``. + Copy-on-Write protects shallow copies against accidental modifications. This means that any changes to the copied data would make a new copy of the data upon write (and vice versa). Changes made to either the original or copied variable would not be reflected in the counterpart. @@ -6737,12 +6571,15 @@ def copy(self, deep: bool_t | None = True) -> Self: >>> deep = s.copy() >>> shallow = s.copy(deep=False) - Shallow copy shares data and index with original. + Shallow copy shares index with original, the data is a + view of the original. >>> s is shallow False - >>> s.values is shallow.values and s.index is shallow.index - True + >>> s.values is shallow.values + False + >>> s.index is shallow.index + False Deep copy has own copy of data and index. 
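Aside: a usage sketch (not part of the diff) for the dict-valued `astype` path rewritten earlier in this hunk. Columns absent from the mapping hit the `isna(cdt)` branch and now come back as lazy (copy-on-write) copies instead of honouring the deprecated `copy=` keyword:

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [1.5, 2.5], "c": ["x", "y"]})
out = df.astype({"a": "int32"})  # only 'a' is mapped
assert str(out.dtypes["a"]) == "int32"
assert str(out.dtypes["b"]) == "float64"  # unmapped columns pass through unchanged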
@@ -6751,18 +6588,17 @@ def copy(self, deep: bool_t | None = True) -> Self: >>> s.values is deep.values or s.index is deep.index False - Updates to the data shared by shallow copy and original is reflected - in both (NOTE: this will no longer be true for pandas >= 3.0); - deep copy remains unchanged. + The shallow copy is also protected against updates to the original + object. Thus, updates will only be reflected in one of the two objects. >>> s.iloc[0] = 3 >>> shallow.iloc[1] = 4 >>> s a 3 - b 4 + b 2 dtype: int64 >>> shallow - a 3 + a 1 b 4 dtype: int64 >>> deep @@ -6785,31 +6621,14 @@ def copy(self, deep: bool_t | None = True) -> Self: 0 [10, 2] 1 [3, 4] dtype: object - - **Copy-on-Write is set to true**, the shallow copy is not modified - when the original data is changed: - - >>> with pd.option_context("mode.copy_on_write", True): - ... s = pd.Series([1, 2], index=["a", "b"]) - ... copy = s.copy(deep=False) - ... s.iloc[0] = 100 - ... s - a 100 - b 2 - dtype: int64 - >>> copy - a 1 - b 2 - dtype: int64 """ data = self._mgr.copy(deep=deep) - self._clear_item_cache() return self._constructor_from_mgr(data, axes=data.axes).__finalize__( self, method="copy" ) @final - def __copy__(self, deep: bool_t = True) -> Self: + def __copy__(self, deep: bool = True) -> Self: return self.copy(deep=deep) @final @@ -6823,7 +6642,7 @@ def __deepcopy__(self, memo=None) -> Self: return self.copy(deep=True) @final - def infer_objects(self, copy: bool_t | None = None) -> Self: + def infer_objects(self, copy: bool | lib.NoDefault = lib.no_default) -> Self: """ Attempt to infer better dtypes for object columns. @@ -6834,7 +6653,7 @@ def infer_objects(self, copy: bool_t | None = None) -> Self: Parameters ---------- - copy : bool, default True + copy : bool, default False Whether to make a copy for non-object or non-inferable columns or Series. @@ -6850,9 +6669,12 @@ def infer_objects(self, copy: bool_t | None = None) -> Self: You can already get the future behavior and improvements through enabling copy on write ``pd.options.mode.copy_on_write = True`` + .. deprecated:: 3.0.0 + Returns ------- same type as input object + Returns an object of the same type as the input object. See Also -------- @@ -6879,22 +6701,23 @@ def infer_objects(self, copy: bool_t | None = None) -> Self: A int64 dtype: object """ - new_mgr = self._mgr.convert(copy=copy) + self._check_copy_deprecation(copy) + new_mgr = self._mgr.convert() res = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes) return res.__finalize__(self, method="infer_objects") @final def convert_dtypes( self, - infer_objects: bool_t = True, - convert_string: bool_t = True, - convert_integer: bool_t = True, - convert_boolean: bool_t = True, - convert_floating: bool_t = True, + infer_objects: bool = True, + convert_string: bool = True, + convert_integer: bool = True, + convert_boolean: bool = True, + convert_floating: bool = True, dtype_backend: DtypeBackend = "numpy_nullable", ) -> Self: """ - Convert columns to the best possible dtypes using dtypes supporting ``pd.NA``. + Convert columns from numpy dtypes to the best dtypes that support ``pd.NA``. Parameters ---------- @@ -6911,13 +6734,13 @@ def convert_dtypes( If `convert_integer` is also True, preference will be given to integer dtypes if the floats can be faithfully cast to integers. dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable' - Back-end data type applied to the resultant :class:`DataFrame` - (still experimental).
Behaviour is as follows: + Back-end data type applied to the resultant :class:`DataFrame` or + :class:`Series` (still experimental). Behaviour is as follows: - * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). + * ``"numpy_nullable"``: returns nullable-dtype-backed + :class:`DataFrame` or :class:`Series`. * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. + :class:`DataFrame` or :class:`Series`. .. versionadded:: 2.0 @@ -6996,12 +6819,12 @@ def convert_dtypes( 2 3 z 20 200.0 >>> dfn.dtypes - a Int32 - b string[python] - c boolean - d string[python] - e Int64 - f Float64 + a Int32 + b string + c boolean + d string + e Int64 + f Float64 dtype: object Start with a Series of strings and missing data represented by ``np.nan``. @@ -7022,7 +6845,7 @@ def convert_dtypes( dtype: string """ check_dtype_backend(dtype_backend) - new_mgr = self._mgr.convert_dtypes( # type: ignore[union-attr] + new_mgr = self._mgr.convert_dtypes( infer_objects=infer_objects, convert_string=convert_string, convert_integer=convert_integer, @@ -7036,44 +6859,25 @@ def convert_dtypes( # ---------------------------------------------------------------------- # Filling NA's - def _deprecate_downcast(self, downcast, method_name: str): - # GH#40988 - if downcast is not lib.no_default: - warnings.warn( - f"The 'downcast' keyword in {method_name} is deprecated and " - "will be removed in a future version. Use " - "res.infer_objects(copy=False) to infer non-object dtype, or " - "pd.to_numeric with the 'downcast' keyword to downcast numeric " - "results.", - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - downcast = None - return downcast - @final def _pad_or_backfill( self, method: Literal["ffill", "bfill", "pad", "backfill"], *, axis: None | Axis = None, - inplace: bool_t = False, + inplace: bool = False, limit: None | int = None, limit_area: Literal["inside", "outside"] | None = None, - downcast: dict | None = None, ): if axis is None: axis = 0 axis = self._get_axis_number(axis) method = clean_fill_method(method) - if not self._mgr.is_single_block and axis == 1: + if axis == 1: + if not self._mgr.is_single_block and inplace: + raise NotImplementedError # e.g. test_align_fill_method - # TODO(3.0): once downcast is removed, we can do the .T - # in all axis=1 cases, and remove axis kward from mgr.pad_or_backfill. - if inplace: - raise NotImplementedError() result = self.T._pad_or_backfill( method=method, limit=limit, limit_area=limit_area ).T @@ -7082,11 +6886,9 @@ def _pad_or_backfill( new_mgr = self._mgr.pad_or_backfill( method=method, - axis=self._get_block_manager_axis(axis), limit=limit, limit_area=limit_area, inplace=inplace, - downcast=downcast, ) result = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes) if inplace: @@ -7097,41 +6899,32 @@ def _pad_or_backfill( @overload def fillna( self, - value: Hashable | Mapping | Series | DataFrame = ..., + value: Hashable | Mapping | Series | DataFrame, *, - method: FillnaOptions | None = ..., axis: Axis | None = ..., inplace: Literal[False] = ..., limit: int | None = ..., - downcast: dict | None = ..., - ) -> Self: - ... + ) -> Self: ... @overload def fillna( self, - value: Hashable | Mapping | Series | DataFrame = ..., + value: Hashable | Mapping | Series | DataFrame, *, - method: FillnaOptions | None = ..., axis: Axis | None = ..., inplace: Literal[True], limit: int | None = ..., - downcast: dict | None = ..., - ) -> None: - ...
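Aside: with `method=` and `downcast=` removed from the `fillna` overloads above, the old call styles map onto dedicated methods. A migration sketch (the `downcast` replacement shown is only an approximation of the removed behaviour):

import numpy as np

import pandas as pd

s = pd.Series([1.0, np.nan, 3.0])

filled = s.fillna(0.0)  # fillna is now value-only
forward = s.ffill()  # was: s.fillna(method="ffill")
narrowed = filled.astype("int64")  # roughly what downcast="infer" used to do here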
@overload def fillna( self, - value: Hashable | Mapping | Series | DataFrame = ..., + value: Hashable | Mapping | Series | DataFrame, *, - method: FillnaOptions | None = ..., axis: Axis | None = ..., - inplace: bool_t = ..., + inplace: bool = ..., limit: int | None = ..., - downcast: dict | None = ..., - ) -> Self | None: - ... + ) -> Self | None: ... @final @doc( @@ -7140,16 +6933,14 @@ def fillna( ) def fillna( self, - value: Hashable | Mapping | Series | DataFrame | None = None, + value: Hashable | Mapping | Series | DataFrame, *, - method: FillnaOptions | None = None, axis: Axis | None = None, - inplace: bool_t = False, + inplace: bool = False, limit: int | None = None, - downcast: dict | None | lib.NoDefault = lib.no_default, ) -> Self | None: """ - Fill NA/NaN values using the specified method. + Fill NA/NaN values with `value`. Parameters ---------- @@ -7159,15 +6950,6 @@ def fillna( each index (for a Series) or column (for a DataFrame). Values not in the dict/Series/DataFrame will not be filled. This value cannot be a list. - method : {{'backfill', 'bfill', 'ffill', None}}, default None - Method to use for filling holes in reindexed Series: - - * ffill: propagate last valid observation forward to next valid. - * backfill / bfill: use next valid observation to fill gap. - - .. deprecated:: 2.1.0 - Use ffill or bfill instead. - axis : {axes_single_arg} Axis along which to fill missing values. For `Series` this parameter is unused and defaults to 0. @@ -7176,16 +6958,8 @@ def fillna( other views on this object (e.g., a no-copy slice for a column in a DataFrame). limit : int, default None - If method is specified, this is the maximum number of consecutive - NaN values to forward/backward fill. In other words, if there is - a gap with more than this number of consecutive NaNs, it will only - be partially filled. If method is not specified, this is the - maximum number of entries along the entire axis where NaNs will be - filled. Must be greater than 0 if not None. - downcast : dict, default is None - A dict of item->dtype of what to downcast if possible, - or the string 'infer' which will try to downcast to an appropriate - equal type (e.g. float64 to int64 if possible). + This is the maximum number of entries along the entire axis + where NaNs will be filled. Must be greater than 0 if not None. Returns ------- @@ -7200,13 +6974,23 @@ def fillna( reindex : Conform object to new index. asfreq : Convert TimeSeries to specified frequency. + Notes + ----- + For non-object dtype, ``value=None`` will use the NA value of the dtype. + See more details in the :ref:`Filling missing data` + section. + Examples -------- - >>> df = pd.DataFrame([[np.nan, 2, np.nan, 0], - ... [3, 4, np.nan, 1], - ... [np.nan, np.nan, np.nan, np.nan], - ... [np.nan, 3, np.nan, 4]], - ... columns=list("ABCD")) + >>> df = pd.DataFrame( + ... [ + ... [np.nan, 2, np.nan, 0], + ... [3, 4, np.nan, 1], + ... [np.nan, np.nan, np.nan, np.nan], + ... [np.nan, 3, np.nan, 4], + ... ], + ... columns=list("ABCD"), + ... 
) >>> df A B C D 0 NaN 2.0 NaN 0.0 @@ -7258,178 +7042,112 @@ def fillna( """ inplace = validate_bool_kwarg(inplace, "inplace") if inplace: - if not PYPY and using_copy_on_write(): + if not PYPY: if sys.getrefcount(self) <= REF_COUNT: warnings.warn( _chained_assignment_method_msg, ChainedAssignmentError, stacklevel=2, ) - elif ( - not PYPY - and not using_copy_on_write() - and self._is_view_after_cow_rules() - ): - ctr = sys.getrefcount(self) - ref_count = REF_COUNT - if isinstance(self, ABCSeries) and _check_cacher(self): - # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 - ref_count += 1 - if ctr <= ref_count: - warnings.warn( - _chained_assignment_warning_method_msg, - FutureWarning, - stacklevel=2, - ) - value, method = validate_fillna_kwargs(value, method) - if method is not None: - warnings.warn( - f"{type(self).__name__}.fillna with 'method' is deprecated and " - "will raise in a future version. Use obj.ffill() or obj.bfill() " - "instead.", - FutureWarning, - stacklevel=find_stack_level(), + if isinstance(value, (list, tuple)): + raise TypeError( + '"value" parameter must be a scalar or dict, but ' + f'you passed a "{type(value).__name__}"' ) - was_no_default = downcast is lib.no_default - downcast = self._deprecate_downcast(downcast, "fillna") - - # set the default here, so functions examining the signaure + # set the default here, so functions examining the signature # can detect if something was set (e.g. in groupby) (GH9221) if axis is None: axis = 0 axis = self._get_axis_number(axis) - if value is None: - return self._pad_or_backfill( - # error: Argument 1 to "_pad_or_backfill" of "NDFrame" has - # incompatible type "Optional[Literal['backfill', 'bfill', 'ffill', - # 'pad']]"; expected "Literal['ffill', 'bfill', 'pad', 'backfill']" - method, # type: ignore[arg-type] - axis=axis, - limit=limit, - inplace=inplace, - # error: Argument "downcast" to "_fillna_with_method" of "NDFrame" - # has incompatible type "Union[Dict[Any, Any], None, - # Literal[_NoDefault.no_default]]"; expected - # "Optional[Dict[Any, Any]]" - downcast=downcast, # type: ignore[arg-type] - ) - else: - if self.ndim == 1: - if isinstance(value, (dict, ABCSeries)): - if not len(value): - # test_fillna_nonscalar - if inplace: - return None - return self.copy(deep=None) - from pandas import Series - - value = Series(value) - value = value.reindex(self.index, copy=False) - value = value._values - elif not is_list_like(value): - pass - else: - raise TypeError( - '"value" parameter must be a scalar, dict ' - "or Series, but you passed a " - f'"{type(value).__name__}"' - ) + if self.ndim == 1: + if isinstance(value, (dict, ABCSeries)): + if not len(value): + # test_fillna_nonscalar + if inplace: + return None + return self.copy(deep=False) + from pandas import Series - new_data = self._mgr.fillna( - value=value, limit=limit, inplace=inplace, downcast=downcast + value = Series(value) + value = value.reindex(self.index) + value = value._values + elif not is_list_like(value): + pass + else: + raise TypeError( + '"value" parameter must be a scalar, dict ' + "or Series, but you passed a " + f'"{type(value).__name__}"' ) - elif isinstance(value, (dict, ABCSeries)): - if axis == 1: - raise NotImplementedError( - "Currently only can fill " - "with dict/Series column " - "by column" - ) - if using_copy_on_write(): - result = self.copy(deep=None) - else: - result = self if inplace else self.copy() - is_dict = isinstance(downcast, dict) - for k, v in value.items(): - if k not in result: - continue - - 
if was_no_default: - downcast_k = lib.no_default - else: - downcast_k = ( - # error: Incompatible types in assignment (expression - # has type "Union[Dict[Any, Any], None, - # Literal[_NoDefault.no_default], Any]", variable has - # type "_NoDefault") - downcast # type: ignore[assignment] - if not is_dict - # error: Item "None" of "Optional[Dict[Any, Any]]" has - # no attribute "get" - else downcast.get(k) # type: ignore[union-attr] - ) + new_data = self._mgr.fillna(value=value, limit=limit, inplace=inplace) + + elif isinstance(value, (dict, ABCSeries)): + if axis == 1: + raise NotImplementedError( + "Currently only can fill with dict/Series column by column" + ) + result = self if inplace else self.copy(deep=False) + for k, v in value.items(): + if k not in result: + continue - res_k = result[k].fillna(v, limit=limit, downcast=downcast_k) + res_k = result[k].fillna(v, limit=limit) - if not inplace: - result[k] = res_k + if not inplace: + result[k] = res_k + else: + # We can write into our existing column(s) iff dtype + # was preserved. + if isinstance(res_k, ABCSeries): + # i.e. 'k' only shows up once in self.columns + if res_k.dtype == result[k].dtype: + result.loc[:, k] = res_k + else: + # Different dtype -> no way to do inplace. + result[k] = res_k else: - # We can write into our existing column(s) iff dtype - # was preserved. - if isinstance(res_k, ABCSeries): - # i.e. 'k' only shows up once in self.columns - if res_k.dtype == result[k].dtype: - result.loc[:, k] = res_k + # see test_fillna_dict_inplace_nonunique_columns + locs = result.columns.get_loc(k) + if isinstance(locs, slice): + locs = range(self.shape[1])[locs] + elif isinstance(locs, np.ndarray) and locs.dtype.kind == "b": + locs = locs.nonzero()[0] + elif not ( + isinstance(locs, np.ndarray) and locs.dtype.kind == "i" + ): + # Should never be reached, but let's cover our bases + raise NotImplementedError( + "Unexpected get_loc result, please report a bug at " + "https://github.com/pandas-dev/pandas" + ) + + for i, loc in enumerate(locs): + res_loc = res_k.iloc[:, i] + target = self.iloc[:, loc] + + if res_loc.dtype == target.dtype: + result.iloc[:, loc] = res_loc else: - # Different dtype -> no way to do inplace. 
- result[k] = res_k - else: - # see test_fillna_dict_inplace_nonunique_columns - locs = result.columns.get_loc(k) - if isinstance(locs, slice): - locs = np.arange(self.shape[1])[locs] - elif ( - isinstance(locs, np.ndarray) and locs.dtype.kind == "b" - ): - locs = locs.nonzero()[0] - elif not ( - isinstance(locs, np.ndarray) and locs.dtype.kind == "i" - ): - # Should never be reached, but let's cover our bases - raise NotImplementedError( - "Unexpected get_loc result, please report a bug at " - "https://github.com/pandas-dev/pandas" - ) - - for i, loc in enumerate(locs): - res_loc = res_k.iloc[:, i] - target = self.iloc[:, loc] - - if res_loc.dtype == target.dtype: - result.iloc[:, loc] = res_loc - else: - result.isetitem(loc, res_loc) - if inplace: - return self._update_inplace(result) - else: - return result - - elif not is_list_like(value): - if axis == 1: - result = self.T.fillna(value=value, limit=limit).T - new_data = result._mgr - else: - new_data = self._mgr.fillna( - value=value, limit=limit, inplace=inplace, downcast=downcast - ) - elif isinstance(value, ABCDataFrame) and self.ndim == 2: - new_data = self.where(self.notna(), value)._mgr + result.isetitem(loc, res_loc) + if inplace: + return self._update_inplace(result) else: - raise ValueError(f"invalid fill value with a {type(value)}") + return result + + elif not is_list_like(value): + if axis == 1: + result = self.T.fillna(value=value, limit=limit).T + new_data = result._mgr + else: + new_data = self._mgr.fillna(value=value, limit=limit, inplace=inplace) + elif isinstance(value, ABCDataFrame) and self.ndim == 2: + new_data = self.where(self.notna(), value)._mgr + else: + raise ValueError(f"invalid fill value with a {type(value)}") result = self._constructor_from_mgr(new_data, axes=new_data.axes) if inplace: @@ -7445,9 +7163,7 @@ def ffill( inplace: Literal[False] = ..., limit: None | int = ..., limit_area: Literal["inside", "outside"] | None = ..., - downcast: dict | None | lib.NoDefault = ..., - ) -> Self: - ... + ) -> Self: ... @overload def ffill( @@ -7457,21 +7173,17 @@ def ffill( inplace: Literal[True], limit: None | int = ..., limit_area: Literal["inside", "outside"] | None = ..., - downcast: dict | None | lib.NoDefault = ..., - ) -> None: - ... + ) -> None: ... @overload def ffill( self, *, axis: None | Axis = ..., - inplace: bool_t = ..., + inplace: bool = ..., limit: None | int = ..., limit_area: Literal["inside", "outside"] | None = ..., - downcast: dict | None | lib.NoDefault = ..., - ) -> Self | None: - ... + ) -> Self | None: ... @final @doc( @@ -7482,10 +7194,9 @@ def ffill( self, *, axis: None | Axis = None, - inplace: bool_t = False, + inplace: bool = False, limit: None | int = None, limit_area: Literal["inside", "outside"] | None = None, - downcast: dict | None | lib.NoDefault = lib.no_default, ) -> Self | None: """ Fill NA/NaN values by propagating the last valid observation to next valid. @@ -7517,23 +7228,27 @@ def ffill( .. versionadded:: 2.2.0 - downcast : dict, default is None - A dict of item->dtype of what to downcast if possible, - or the string 'infer' which will try to downcast to an appropriate - equal type (e.g. float64 to int64 if possible). - Returns ------- {klass} or None Object with missing values filled or None if ``inplace=True``. + See Also + -------- + DataFrame.bfill : Fill NA/NaN values by using the next valid observation + to fill the gap. + Examples -------- - >>> df = pd.DataFrame([[np.nan, 2, np.nan, 0], - ... [3, 4, np.nan, 1], - ... [np.nan, np.nan, np.nan, np.nan], - ... 
[np.nan, 3, np.nan, 4]], - ... columns=list("ABCD")) + >>> df = pd.DataFrame( + ... [ + ... [np.nan, 2, np.nan, 0], + ... [3, 4, np.nan, 1], + ... [np.nan, np.nan, np.nan, np.nan], + ... [np.nan, 3, np.nan, 4], + ... ], + ... columns=list("ABCD"), + ... ) >>> df A B C D 0 NaN 2.0 NaN 0.0 @@ -7556,32 +7271,15 @@ def ffill( 3 3.0 dtype: float64 """ - downcast = self._deprecate_downcast(downcast, "ffill") inplace = validate_bool_kwarg(inplace, "inplace") if inplace: - if not PYPY and using_copy_on_write(): + if not PYPY: if sys.getrefcount(self) <= REF_COUNT: warnings.warn( _chained_assignment_method_msg, ChainedAssignmentError, stacklevel=2, ) - elif ( - not PYPY - and not using_copy_on_write() - and self._is_view_after_cow_rules() - ): - ctr = sys.getrefcount(self) - ref_count = REF_COUNT - if isinstance(self, ABCSeries) and _check_cacher(self): - # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 - ref_count += 1 - if ctr <= ref_count: - warnings.warn( - _chained_assignment_warning_method_msg, - FutureWarning, - stacklevel=2, - ) return self._pad_or_backfill( "ffill", @@ -7589,46 +7287,8 @@ def ffill( inplace=inplace, limit=limit, limit_area=limit_area, - # error: Argument "downcast" to "_fillna_with_method" of "NDFrame" - # has incompatible type "Union[Dict[Any, Any], None, - # Literal[_NoDefault.no_default]]"; expected "Optional[Dict[Any, Any]]" - downcast=downcast, # type: ignore[arg-type] ) - @final - @doc(klass=_shared_doc_kwargs["klass"]) - def pad( - self, - *, - axis: None | Axis = None, - inplace: bool_t = False, - limit: None | int = None, - downcast: dict | None | lib.NoDefault = lib.no_default, - ) -> Self | None: - """ - Fill NA/NaN values by propagating the last valid observation to next valid. - - .. deprecated:: 2.0 - - {klass}.pad is deprecated. Use {klass}.ffill instead. - - Returns - ------- - {klass} or None - Object with missing values filled or None if ``inplace=True``. - - Examples - -------- - Please see examples for :meth:`DataFrame.ffill` or :meth:`Series.ffill`. - """ - warnings.warn( - "DataFrame.pad/Series.pad is deprecated. Use " - "DataFrame.ffill/Series.ffill instead", - FutureWarning, - stacklevel=find_stack_level(), - ) - return self.ffill(axis=axis, inplace=inplace, limit=limit, downcast=downcast) - @overload def bfill( self, @@ -7637,9 +7297,7 @@ def bfill( inplace: Literal[False] = ..., limit: None | int = ..., limit_area: Literal["inside", "outside"] | None = ..., - downcast: dict | None | lib.NoDefault = ..., - ) -> Self: - ... + ) -> Self: ... @overload def bfill( @@ -7648,21 +7306,17 @@ def bfill( axis: None | Axis = ..., inplace: Literal[True], limit: None | int = ..., - downcast: dict | None | lib.NoDefault = ..., - ) -> None: - ... + ) -> None: ... @overload def bfill( self, *, axis: None | Axis = ..., - inplace: bool_t = ..., + inplace: bool = ..., limit: None | int = ..., limit_area: Literal["inside", "outside"] | None = ..., - downcast: dict | None | lib.NoDefault = ..., - ) -> Self | None: - ... + ) -> Self | None: ... @final @doc( @@ -7673,10 +7327,9 @@ def bfill( self, *, axis: None | Axis = None, - inplace: bool_t = False, + inplace: bool = False, limit: None | int = None, limit_area: Literal["inside", "outside"] | None = None, - downcast: dict | None | lib.NoDefault = lib.no_default, ) -> Self | None: """ Fill NA/NaN values by using the next valid observation to fill the gap. @@ -7708,16 +7361,16 @@ def bfill( .. 
versionadded:: 2.2.0 - downcast : dict, default is None - A dict of item->dtype of what to downcast if possible, - or the string 'infer' which will try to downcast to an appropriate - equal type (e.g. float64 to int64 if possible). - Returns ------- {klass} or None Object with missing values filled or None if ``inplace=True``. + See Also + -------- + DataFrame.ffill : Fill NA/NaN values by propagating the last valid + observation to next valid. + Examples -------- For Series: @@ -7738,7 +7391,7 @@ def bfill( With DataFrame: - >>> df = pd.DataFrame({{'A': [1, None, None, 4], 'B': [None, 5, None, 7]}}) + >>> df = pd.DataFrame({{"A": [1, None, None, 4], "B": [None, 5, None, 7]}}) >>> df A B 0 1.0 NaN @@ -7758,32 +7411,15 @@ def bfill( 2 4.0 7.0 3 4.0 7.0 """ - downcast = self._deprecate_downcast(downcast, "bfill") inplace = validate_bool_kwarg(inplace, "inplace") if inplace: - if not PYPY and using_copy_on_write(): + if not PYPY: if sys.getrefcount(self) <= REF_COUNT: warnings.warn( _chained_assignment_method_msg, ChainedAssignmentError, stacklevel=2, ) - elif ( - not PYPY - and not using_copy_on_write() - and self._is_view_after_cow_rules() - ): - ctr = sys.getrefcount(self) - ref_count = REF_COUNT - if isinstance(self, ABCSeries) and _check_cacher(self): - # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 - ref_count += 1 - if ctr <= ref_count: - warnings.warn( - _chained_assignment_warning_method_msg, - FutureWarning, - stacklevel=2, - ) return self._pad_or_backfill( "bfill", @@ -7791,45 +7427,7 @@ def bfill( inplace=inplace, limit=limit, limit_area=limit_area, - # error: Argument "downcast" to "_fillna_with_method" of "NDFrame" - # has incompatible type "Union[Dict[Any, Any], None, - # Literal[_NoDefault.no_default]]"; expected "Optional[Dict[Any, Any]]" - downcast=downcast, # type: ignore[arg-type] - ) - - @final - @doc(klass=_shared_doc_kwargs["klass"]) - def backfill( - self, - *, - axis: None | Axis = None, - inplace: bool_t = False, - limit: None | int = None, - downcast: dict | None | lib.NoDefault = lib.no_default, - ) -> Self | None: - """ - Fill NA/NaN values by using the next valid observation to fill the gap. - - .. deprecated:: 2.0 - - {klass}.backfill is deprecated. Use {klass}.bfill instead. - - Returns - ------- - {klass} or None - Object with missing values filled or None if ``inplace=True``. - - Examples - -------- - Please see examples for :meth:`DataFrame.bfill` or :meth:`Series.bfill`. - """ - warnings.warn( - "DataFrame.backfill/Series.backfill is deprecated. Use " - "DataFrame.bfill/Series.bfill instead", - FutureWarning, - stacklevel=find_stack_level(), ) - return self.bfill(axis=axis, inplace=inplace, limit=limit, downcast=downcast) @overload def replace( @@ -7838,11 +7436,8 @@ def replace( value=..., *, inplace: Literal[False] = ..., - limit: int | None = ..., - regex: bool_t = ..., - method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ..., - ) -> Self: - ... + regex: bool = ..., + ) -> Self: ... @overload def replace( @@ -7851,11 +7446,8 @@ def replace( value=..., *, inplace: Literal[True], - limit: int | None = ..., - regex: bool_t = ..., - method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ..., - ) -> None: - ... + regex: bool = ..., + ) -> None: ... @overload def replace( @@ -7863,12 +7455,9 @@ def replace( to_replace=..., value=..., *, - inplace: bool_t = ..., - limit: int | None = ..., - regex: bool_t = ..., - method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ..., - ) -> Self | None: - ... 
+ inplace: bool = ..., + regex: bool = ..., + ) -> Self | None: ... @final @doc( @@ -7881,43 +7470,11 @@ def replace( to_replace=None, value=lib.no_default, *, - inplace: bool_t = False, - limit: int | None = None, - regex: bool_t = False, - method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = lib.no_default, + inplace: bool = False, + regex: bool = False, ) -> Self | None: - if method is not lib.no_default: - warnings.warn( - # GH#33302 - f"The 'method' keyword in {type(self).__name__}.replace is " - "deprecated and will be removed in a future version.", - FutureWarning, - stacklevel=find_stack_level(), - ) - elif limit is not None: - warnings.warn( - # GH#33302 - f"The 'limit' keyword in {type(self).__name__}.replace is " - "deprecated and will be removed in a future version.", - FutureWarning, - stacklevel=find_stack_level(), - ) - if ( - value is lib.no_default - and method is lib.no_default - and not is_dict_like(to_replace) - and regex is False - ): - # case that goes through _replace_single and defaults to method="pad" - warnings.warn( - # GH#33302 - f"{type(self).__name__}.replace without 'value' and with " - "non-dict-like 'to_replace' is deprecated " - "and will raise in a future version. " - "Explicitly specify the new values instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) + if not is_bool(regex) and to_replace is not None: + raise ValueError("'to_replace' must be 'None' if 'regex' is not a bool") if not ( is_scalar(to_replace) @@ -7927,77 +7484,32 @@ def replace( raise TypeError( "Expecting 'to_replace' to be either a scalar, array-like, " "dict or None, got invalid type " - f"{repr(type(to_replace).__name__)}" + f"{type(to_replace).__name__!r}" + ) + + if value is lib.no_default and not ( + is_dict_like(to_replace) or is_dict_like(regex) + ): + raise ValueError( + # GH#33302 + f"{type(self).__name__}.replace must specify either 'value', " + "a dict-like 'to_replace', or dict-like 'regex'." ) inplace = validate_bool_kwarg(inplace, "inplace") if inplace: - if not PYPY and using_copy_on_write(): + if not PYPY: if sys.getrefcount(self) <= REF_COUNT: warnings.warn( _chained_assignment_method_msg, ChainedAssignmentError, stacklevel=2, ) - elif ( - not PYPY - and not using_copy_on_write() - and self._is_view_after_cow_rules() - ): - ctr = sys.getrefcount(self) - ref_count = REF_COUNT - if isinstance(self, ABCSeries) and _check_cacher(self): - # in non-CoW mode, chained Series access will populate the - # `_item_cache` which results in an increased ref count not below - # the threshold, while we still need to warn. We detect this case - # of a Series derived from a DataFrame through the presence of - # checking the `_cacher` - ref_count += 1 - if ctr <= ref_count: - warnings.warn( - _chained_assignment_warning_method_msg, - FutureWarning, - stacklevel=2, - ) - - if not is_bool(regex) and to_replace is not None: - raise ValueError("'to_replace' must be 'None' if 'regex' is not a bool") - - if value is lib.no_default or method is not lib.no_default: - # GH#36984 if the user explicitly passes value=None we want to - # respect that. We have the corner case where the user explicitly - # passes value=None *and* a method, which we interpret as meaning - # they want the (documented) default behavior. - if method is lib.no_default: - # TODO: get this to show up as the default in the docs? 
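# Annotation, not part of the patch: with the pad fallback below removed,
# ``replace`` must now be given an explicit ``value`` (or a dict-like
# ``to_replace``/``regex``), matching the ValueError added earlier in this hunk:
# >>> s = pd.Series([1, 2, 1])
# >>> s.replace(1)
# ValueError: Series.replace must specify either 'value', a dict-like
# 'to_replace', or dict-like 'regex'.
# >>> s.replace(1, 0)
# 0    0
# 1    2
# 2    0
# dtype: int64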
- method = "pad" - - # passing a single value that is scalar like - # when value is None (GH5319), for compat - if not is_dict_like(to_replace) and not is_dict_like(regex): - to_replace = [to_replace] - - if isinstance(to_replace, (tuple, list)): - # TODO: Consider copy-on-write for non-replaced columns's here - if isinstance(self, ABCDataFrame): - from pandas import Series - - result = self.apply( - Series._replace_single, - args=(to_replace, method, inplace, limit), - ) - if inplace: - return None - return result - return self._replace_single(to_replace, method, inplace, limit) + if value is lib.no_default: if not is_dict_like(to_replace): - if not is_dict_like(regex): - raise TypeError( - 'If "to_replace" and "value" are both None ' - 'and "to_replace" is not a list, then ' - "regex must be a mapping" - ) + # In this case we have checked above that + # 1) regex is dict-like and 2) to_replace is None to_replace = regex regex = True @@ -8005,8 +7517,6 @@ def replace( if items: keys, values = zip(*items) else: - # error: Incompatible types in assignment (expression has type - # "list[Never]", variable has type "tuple[Any, ...]") keys, values = ([], []) # type: ignore[assignment] are_mappings = [is_dict_like(v) for v in values] @@ -8036,18 +7546,20 @@ def replace( else: to_replace, value = keys, values - return self.replace( - to_replace, value, inplace=inplace, limit=limit, regex=regex - ) + return self.replace(to_replace, value, inplace=inplace, regex=regex) else: # need a non-zero len on all axes if not self.size: if inplace: return None - return self.copy(deep=None) - + return self.copy(deep=False) if is_dict_like(to_replace): if is_dict_like(value): # {'A' : NA} -> {'A' : 0} + if isinstance(self, ABCSeries): + raise ValueError( + "to_replace and value cannot be dict-like for " + "Series.replace" + ) # Note: Checking below for `in foo.keys()` instead of # `in foo` is needed for when we have a Series and not dict mapping = { @@ -8100,11 +7612,9 @@ def replace( raise TypeError( f"'regex' must be a string or a compiled regular expression " f"or a list or dict of strings or regular expressions, " - f"you passed a {repr(type(regex).__name__)}" + f"you passed a {type(regex).__name__!r}" ) - return self.replace( - regex, value, inplace=inplace, limit=limit, regex=True - ) + return self.replace(regex, value, inplace=inplace, regex=True) else: # dest iterable dict-like if is_dict_like(value): # NA -> {'A' : 0, 'B' : -1} @@ -8131,7 +7641,7 @@ def replace( ) else: raise TypeError( - f'Invalid "to_replace" type: {repr(type(to_replace).__name__)}' + f'Invalid "to_replace" type: {type(to_replace).__name__!r}' ) result = self._constructor_from_mgr(new_data, axes=new_data.axes) @@ -8150,10 +7660,8 @@ def interpolate( inplace: Literal[False] = ..., limit_direction: Literal["forward", "backward", "both"] | None = ..., limit_area: Literal["inside", "outside"] | None = ..., - downcast: Literal["infer"] | None | lib.NoDefault = ..., **kwargs, - ) -> Self: - ... + ) -> Self: ... @overload def interpolate( @@ -8165,10 +7673,8 @@ def interpolate( inplace: Literal[True], limit_direction: Literal["forward", "backward", "both"] | None = ..., limit_area: Literal["inside", "outside"] | None = ..., - downcast: Literal["infer"] | None | lib.NoDefault = ..., **kwargs, - ) -> None: - ... + ) -> None: ... 
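# Annotation: when only a dict-like ``regex`` is passed, the surviving branch
# above promotes it to ``to_replace`` with regex matching enabled. Sketch:
# >>> s = pd.Series(["foo", "bar", "baz"])
# >>> s.replace(regex={r"^ba.$": "new"})
# 0    foo
# 1    new
# 2    new
# dtype: object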
@overload def interpolate( @@ -8177,13 +7683,11 @@ def interpolate( *, axis: Axis = ..., limit: int | None = ..., - inplace: bool_t = ..., + inplace: bool = ..., limit_direction: Literal["forward", "backward", "both"] | None = ..., limit_area: Literal["inside", "outside"] | None = ..., - downcast: Literal["infer"] | None | lib.NoDefault = ..., **kwargs, - ) -> Self | None: - ... + ) -> Self | None: ... @final def interpolate( @@ -8192,10 +7696,9 @@ def interpolate( *, axis: Axis = 0, limit: int | None = None, - inplace: bool_t = False, + inplace: bool = False, limit_direction: Literal["forward", "backward", "both"] | None = None, limit_area: Literal["inside", "outside"] | None = None, - downcast: Literal["infer"] | None | lib.NoDefault = lib.no_default, **kwargs, ) -> Self | None: """ @@ -8212,9 +7715,12 @@ def interpolate( * 'linear': Ignore the index and treat the values as equally spaced. This is the only method supported on MultiIndexes. * 'time': Works on daily and higher resolution data to interpolate - given length of interval. - * 'index', 'values': use the actual numerical values of the index. - * 'pad': Fill in NaNs using existing values. + given length of interval. This interpolates values based on + time interval between observations. + * 'index': The interpolation uses the numerical values + of the DataFrame's index to linearly calculate missing values. + * 'values': Interpolation based on the numerical values + in the DataFrame, treating them as equally spaced along the index. * 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'barycentric', 'polynomial': Passed to `scipy.interpolate.interp1d`, whereas 'spline' is passed to @@ -8238,23 +7744,9 @@ def interpolate( 0. inplace : bool, default False Update the data in place if possible. - limit_direction : {{'forward', 'backward', 'both'}}, Optional + limit_direction : {{'forward', 'backward', 'both'}}, optional, default 'forward' Consecutive NaNs will be filled in this direction. - If limit is specified: - * If 'method' is 'pad' or 'ffill', 'limit_direction' must be 'forward'. - * If 'method' is 'backfill' or 'bfill', 'limit_direction' must be - 'backwards'. - - If 'limit' is not specified: - * If 'method' is 'backfill' or 'bfill', the default is 'backward' - * else the default is 'forward' - - raises ValueError if `limit_direction` is 'forward' or 'both' and - method is 'backfill' or 'bfill'. - raises ValueError if `limit_direction` is 'backward' or 'both' and - method is 'pad' or 'ffill'. - limit_area : {{`None`, 'inside', 'outside'}}, default None If limit is specified, consecutive NaNs will be filled with this restriction. @@ -8264,12 +7756,7 @@ def interpolate( (interpolate). * 'outside': Only fill NaNs outside valid values (extrapolate). - downcast : optional, 'infer' or None, defaults to None - Downcast dtypes if possible. - - .. deprecated:: 2.1.0 - - ``**kwargs`` : optional + **kwargs : optional Keyword arguments to pass on to the interpolating function. Returns @@ -8325,7 +7812,7 @@ def interpolate( an ``order`` (int). >>> s = pd.Series([0, 2, np.nan, 8]) - >>> s.interpolate(method='polynomial', order=2) + >>> s.interpolate(method="polynomial", order=2) 0 0.000000 1 2.000000 2 4.666667 @@ -8340,18 +7827,22 @@ def interpolate( Note how the first entry in column 'b' remains ``NaN``, because there is no entry before it to use for interpolation. - >>> df = pd.DataFrame([(0.0, np.nan, -1.0, 1.0), - ... (np.nan, 2.0, np.nan, np.nan), - ... (2.0, 3.0, np.nan, 9.0), - ... (np.nan, 4.0, -4.0, 16.0)], - ... 
columns=list('abcd')) + >>> df = pd.DataFrame( + ... [ + ... (0.0, np.nan, -1.0, 1.0), + ... (np.nan, 2.0, np.nan, np.nan), + ... (2.0, 3.0, np.nan, 9.0), + ... (np.nan, 4.0, -4.0, 16.0), + ... ], + ... columns=list("abcd"), + ... ) >>> df a b c d 0 0.0 NaN -1.0 1.0 1 NaN 2.0 NaN NaN 2 2.0 3.0 NaN 9.0 3 NaN 4.0 -4.0 16.0 - >>> df.interpolate(method='linear', limit_direction='forward', axis=0) + >>> df.interpolate(method="linear", limit_direction="forward", axis=0) a b c d 0 0.0 NaN -1.0 1.0 1 1.0 2.0 -2.0 5.0 @@ -8360,53 +7851,23 @@ def interpolate( Using polynomial interpolation. - >>> df['d'].interpolate(method='polynomial', order=2) + >>> df["d"].interpolate(method="polynomial", order=2) 0 1.0 1 4.0 2 9.0 3 16.0 Name: d, dtype: float64 """ - if downcast is not lib.no_default: - # GH#40988 - warnings.warn( - f"The 'downcast' keyword in {type(self).__name__}.interpolate " - "is deprecated and will be removed in a future version. " - "Call result.infer_objects(copy=False) on the result instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - downcast = None - if downcast is not None and downcast != "infer": - raise ValueError("downcast must be either None or 'infer'") - inplace = validate_bool_kwarg(inplace, "inplace") if inplace: - if not PYPY and using_copy_on_write(): + if not PYPY: if sys.getrefcount(self) <= REF_COUNT: warnings.warn( _chained_assignment_method_msg, ChainedAssignmentError, stacklevel=2, ) - elif ( - not PYPY - and not using_copy_on_write() - and self._is_view_after_cow_rules() - ): - ctr = sys.getrefcount(self) - ref_count = REF_COUNT - if isinstance(self, ABCSeries) and _check_cacher(self): - # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 - ref_count += 1 - if ctr <= ref_count: - warnings.warn( - _chained_assignment_warning_method_msg, - FutureWarning, - stacklevel=2, - ) axis = self._get_axis_number(axis) @@ -8418,37 +7879,7 @@ def interpolate( if not isinstance(method, str): raise ValueError("'method' should be a string, not None.") - fillna_methods = ["ffill", "bfill", "pad", "backfill"] - if method.lower() in fillna_methods: - # GH#53581 - warnings.warn( - f"{type(self).__name__}.interpolate with method={method} is " - "deprecated and will raise in a future version. " - "Use obj.ffill() or obj.bfill() instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - obj, should_transpose = self, False - else: - obj, should_transpose = (self.T, True) if axis == 1 else (self, False) - if np.any(obj.dtypes == object): - # GH#53631 - if not (obj.ndim == 2 and np.all(obj.dtypes == object)): - # don't warn in cases that already raise - warnings.warn( - f"{type(self).__name__}.interpolate with object dtype is " - "deprecated and will raise in a future version. Call " - "obj.infer_objects(copy=False) before interpolating instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - - if method in fillna_methods and "fill_value" in kwargs: - raise ValueError( - "'fill_value' is not a valid keyword for " - f"{type(self).__name__}.interpolate with method from " - f"{fillna_methods}" - ) + obj, should_transpose = (self.T, True) if axis == 1 else (self, False) if isinstance(obj.index, MultiIndex) and method != "linear": raise ValueError( @@ -8457,43 +7888,16 @@ def interpolate( limit_direction = missing.infer_limit_direction(limit_direction, method) - if obj.ndim == 2 and np.all(obj.dtypes == object): - raise TypeError( - "Cannot interpolate with all object-dtype columns " - "in the DataFrame. 
Try setting at least one " - "column to a numeric dtype." - ) - - if method.lower() in fillna_methods: - # TODO(3.0): remove this case - # TODO: warn/raise on limit_direction or kwargs which are ignored? - # as of 2023-06-26 no tests get here with either - if not self._mgr.is_single_block and axis == 1: - # GH#53898 - if inplace: - raise NotImplementedError() - obj, axis, should_transpose = self.T, 1 - axis, True - - new_data = obj._mgr.pad_or_backfill( - method=method, - axis=self._get_block_manager_axis(axis), - limit=limit, - limit_area=limit_area, - inplace=inplace, - downcast=downcast, - ) - else: - index = missing.get_interp_index(method, obj.index) - new_data = obj._mgr.interpolate( - method=method, - index=index, - limit=limit, - limit_direction=limit_direction, - limit_area=limit_area, - inplace=inplace, - downcast=downcast, - **kwargs, - ) + index = missing.get_interp_index(method, obj.index) + new_data = obj._mgr.interpolate( + method=method, + index=index, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + inplace=inplace, + **kwargs, + ) result = self._constructor_from_mgr(new_data, axes=new_data.axes) if should_transpose: @@ -8560,7 +7964,7 @@ def asof(self, where, subset=None): dtype: float64 >>> s.asof(20) - 2.0 + np.float64(2.0) For a sequence `where`, a Series is returned. The first value is NaN, because the first element of `where` is before the first @@ -8575,28 +7979,36 @@ def asof(self, where, subset=None): NaN, even though NaN is at the index location for ``30``. >>> s.asof(30) - 2.0 + np.float64(2.0) Take all columns into consideration - >>> df = pd.DataFrame({'a': [10., 20., 30., 40., 50.], - ... 'b': [None, None, None, None, 500]}, - ... index=pd.DatetimeIndex(['2018-02-27 09:01:00', - ... '2018-02-27 09:02:00', - ... '2018-02-27 09:03:00', - ... '2018-02-27 09:04:00', - ... '2018-02-27 09:05:00'])) - >>> df.asof(pd.DatetimeIndex(['2018-02-27 09:03:30', - ... '2018-02-27 09:04:30'])) + >>> df = pd.DataFrame( + ... { + ... "a": [10.0, 20.0, 30.0, 40.0, 50.0], + ... "b": [None, None, None, None, 500], + ... }, + ... index=pd.DatetimeIndex( + ... [ + ... "2018-02-27 09:01:00", + ... "2018-02-27 09:02:00", + ... "2018-02-27 09:03:00", + ... "2018-02-27 09:04:00", + ... "2018-02-27 09:05:00", + ... ] + ... ), + ... ) + >>> df.asof(pd.DatetimeIndex(["2018-02-27 09:03:30", "2018-02-27 09:04:30"])) a b 2018-02-27 09:03:30 NaN NaN 2018-02-27 09:04:30 NaN NaN Take a single column into consideration - >>> df.asof(pd.DatetimeIndex(['2018-02-27 09:03:30', - ... '2018-02-27 09:04:30']), - ... subset=['a']) + >>> df.asof( + ... pd.DatetimeIndex(["2018-02-27 09:03:30", "2018-02-27 09:04:30"]), + ... subset=["a"], + ... ) a b 2018-02-27 09:03:30 30.0 NaN 2018-02-27 09:04:30 40.0 NaN @@ -8663,7 +8075,9 @@ def asof(self, where, subset=None): np.nan, index=self.columns, name=where[0] ) - locs = self.index.asof_locs(where, ~(nulls._values)) + # error: Unsupported operand type for + # ~ ("ExtensionArray | ndarray[Any, Any] | Any") + locs = self.index.asof_locs(where, ~nulls._values) # type: ignore[operator] # mask the missing mask = locs == -1 @@ -8687,8 +8101,7 @@ def isna(self) -> Self: NA values, such as None or :attr:`numpy.NaN`, gets mapped to True values. Everything else gets mapped to False values. Characters such as empty - strings ``''`` or :attr:`numpy.inf` are not considered NA values - (unless you set ``pandas.options.mode.use_inf_as_na = True``). + strings ``''`` or :attr:`numpy.inf` are not considered NA values. 
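With ``use_inf_as_na`` removed there is no longer any mode in which
infinities count as missing, so the statement above holds unconditionally:

>>> pd.Series([np.inf, np.nan]).isna()
0    False
1     True
dtype: bool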
Returns ------- @@ -8707,11 +8120,18 @@ def isna(self) -> Self: -------- Show which entries in a DataFrame are NA. - >>> df = pd.DataFrame(dict(age=[5, 6, np.nan], - ... born=[pd.NaT, pd.Timestamp('1939-05-27'), - ... pd.Timestamp('1940-04-25')], - ... name=['Alfred', 'Batman', ''], - ... toy=[None, 'Batmobile', 'Joker'])) + >>> df = pd.DataFrame( + ... dict( + ... age=[5, 6, np.nan], + ... born=[ + ... pd.NaT, + ... pd.Timestamp("1939-05-27"), + ... pd.Timestamp("1940-04-25"), + ... ], + ... name=["Alfred", "Batman", ""], + ... toy=[None, "Batmobile", "Joker"], + ... ) + ... ) >>> df age born name toy 0 5.0 NaT Alfred None @@ -8752,8 +8172,7 @@ def notna(self) -> Self: Return a boolean same-sized object indicating if the values are not NA. Non-missing values get mapped to True. Characters such as empty - strings ``''`` or :attr:`numpy.inf` are not considered NA values - (unless you set ``pandas.options.mode.use_inf_as_na = True``). + strings ``''`` or :attr:`numpy.inf` are not considered NA values. NA values, such as None or :attr:`numpy.NaN`, get mapped to False values. @@ -8774,11 +8193,18 @@ def notna(self) -> Self: -------- Show which entries in a DataFrame are not NA. - >>> df = pd.DataFrame(dict(age=[5, 6, np.nan], - ... born=[pd.NaT, pd.Timestamp('1939-05-27'), - ... pd.Timestamp('1940-04-25')], - ... name=['Alfred', 'Batman', ''], - ... toy=[None, 'Batmobile', 'Joker'])) + >>> df = pd.DataFrame( + ... dict( + ... age=[5, 6, np.nan], + ... born=[ + ... pd.NaT, + ... pd.Timestamp("1939-05-27"), + ... pd.Timestamp("1940-04-25"), + ... ], + ... name=["Alfred", "Batman", ""], + ... toy=[None, "Batmobile", "Joker"], + ... ) + ... ) >>> df age born name toy 0 5.0 NaT Alfred None @@ -8813,7 +8239,7 @@ def notnull(self) -> Self: return notna(self).__finalize__(self, method="notnull") @final - def _clip_with_scalar(self, lower, upper, inplace: bool_t = False): + def _clip_with_scalar(self, lower, upper, inplace: bool = False): if (lower is not None and np.any(isna(lower))) or ( upper is not None and np.any(isna(upper)) ): @@ -8824,15 +8250,11 @@ def _clip_with_scalar(self, lower, upper, inplace: bool_t = False): if lower is not None: cond = mask | (self >= lower) - result = result.where( - cond, lower, inplace=inplace - ) # type: ignore[assignment] + result = result.where(cond, lower, inplace=inplace) # type: ignore[assignment] if upper is not None: cond = mask | (self <= upper) result = self if inplace else result - result = result.where( - cond, upper, inplace=inplace - ) # type: ignore[assignment] + result = result.where(cond, upper, inplace=inplace) # type: ignore[assignment] return result @@ -8878,8 +8300,7 @@ def clip( axis: Axis | None = ..., inplace: Literal[False] = ..., **kwargs, - ) -> Self: - ... + ) -> Self: ... @overload def clip( @@ -8890,8 +8311,7 @@ def clip( axis: Axis | None = ..., inplace: Literal[True], **kwargs, - ) -> None: - ... + ) -> None: ... @overload def clip( @@ -8900,10 +8320,9 @@ def clip( upper=..., *, axis: Axis | None = ..., - inplace: bool_t = ..., + inplace: bool = ..., **kwargs, - ) -> Self | None: - ... + ) -> Self | None: ... @final def clip( @@ -8912,7 +8331,7 @@ def clip( upper=None, *, axis: Axis | None = None, - inplace: bool_t = False, + inplace: bool = False, **kwargs, ) -> Self | None: """ @@ -8937,7 +8356,7 @@ def clip( For `Series` this parameter is unused and defaults to `None`. inplace : bool, default False Whether to perform the operation in place on the data. 
- *args, **kwargs + **kwargs Additional keywords have no effect but might be accepted for compatibility with numpy. @@ -8950,12 +8369,12 @@ def clip( See Also -------- Series.clip : Trim values at input threshold in series. - DataFrame.clip : Trim values at input threshold in dataframe. + DataFrame.clip : Trim values at input threshold in DataFrame. numpy.clip : Clip (limit) the values in an array. Examples -------- - >>> data = {'col_0': [9, -3, 0, -1, 5], 'col_1': [-2, -7, 6, 8, -5]} + >>> data = {"col_0": [9, -3, 0, -1, 5], "col_1": [-2, -7, 6, 8, -5]} >>> df = pd.DataFrame(data) >>> df col_0 col_1 @@ -9017,38 +8436,22 @@ def clip( >>> df.clip(t, axis=0) col_0 col_1 - 0 9 2 - 1 -3 -4 - 2 0 6 - 3 6 8 - 4 5 3 + 0 9.0 2.0 + 1 -3.0 -4.0 + 2 0.0 6.0 + 3 6.0 8.0 + 4 5.0 3.0 """ inplace = validate_bool_kwarg(inplace, "inplace") if inplace: - if not PYPY and using_copy_on_write(): + if not PYPY: if sys.getrefcount(self) <= REF_COUNT: warnings.warn( _chained_assignment_method_msg, ChainedAssignmentError, stacklevel=2, ) - elif ( - not PYPY - and not using_copy_on_write() - and self._is_view_after_cow_rules() - ): - ctr = sys.getrefcount(self) - ref_count = REF_COUNT - if isinstance(self, ABCSeries) and hasattr(self, "_cacher"): - # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 - ref_count += 1 - if ctr <= ref_count: - warnings.warn( - _chained_assignment_warning_method_msg, - FutureWarning, - stacklevel=2, - ) axis = nv.validate_clip_with_axis(axis, (), kwargs) if axis is not None: @@ -9106,7 +8509,7 @@ def asfreq( freq: Frequency, method: FillnaOptions | None = None, how: Literal["start", "end"] | None = None, - normalize: bool_t = False, + normalize: bool = False, fill_value: Hashable | None = None, ) -> Self: """ @@ -9121,8 +8524,8 @@ def asfreq( will map one-to-one to the new index). Otherwise, the new index will be equivalent to ``pd.date_range(start, end, - freq=freq)`` where ``start`` and ``end`` are, respectively, the first and - last entries in the original index (see :func:`pandas.date_range`). The + freq=freq)`` where ``start`` and ``end`` are, respectively, the min and + max entries in the original index (see :func:`pandas.date_range`). The values corresponding to any timesteps in the new index which were not present in the original index will be null (``NaN``), unless a method for filling such unknowns is provided (see the ``method`` parameter below). @@ -9140,7 +8543,7 @@ def asfreq( does not fill NaNs that already were present): * 'pad' / 'ffill': propagate last valid observation forward to next - valid + valid based on the order of the index * 'backfill' / 'bfill': use NEXT valid observation to fill. how : {{'start', 'end'}}, default end For PeriodIndex only (see PeriodIndex.asfreq). @@ -9161,16 +8564,16 @@ def asfreq( Notes ----- - To learn more about the frequency strings, please see `this link - `__. + To learn more about the frequency strings, please see + :ref:`this link`. Examples -------- Start by creating a series with 4 one minute timestamps. - >>> index = pd.date_range('1/1/2000', periods=4, freq='min') + >>> index = pd.date_range("1/1/2000", periods=4, freq="min") >>> series = pd.Series([0.0, None, 2.0, 3.0], index=index) - >>> df = pd.DataFrame({{'s': series}}) + >>> df = pd.DataFrame({{"s": series}}) >>> df s 2000-01-01 00:00:00 0.0 @@ -9180,7 +8583,7 @@ def asfreq( Upsample the series into 30 second bins. 
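The half-minute timestamps are absent from the original index, so they
surface as ``NaN`` until one of the filling options below is used. For
comparison, a forward-filling sketch on the same ``df``:

>>> df.asfreq(freq="30s", method="ffill")
                       s
2000-01-01 00:00:00  0.0
2000-01-01 00:00:30  0.0
2000-01-01 00:01:00  NaN
2000-01-01 00:01:30  NaN
2000-01-01 00:02:00  2.0
2000-01-01 00:02:30  2.0
2000-01-01 00:03:00  3.0

Note that the pre-existing ``NaN`` at 00:01:00 is not filled, and 00:01:30
pads from it.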
- >>> df.asfreq(freq='30s') + >>> df.asfreq(freq="30s") s 2000-01-01 00:00:00 0.0 2000-01-01 00:00:30 NaN @@ -9192,7 +8595,7 @@ def asfreq( Upsample again, providing a ``fill value``. - >>> df.asfreq(freq='30s', fill_value=9.0) + >>> df.asfreq(freq="30s", fill_value=9.0) s 2000-01-01 00:00:00 0.0 2000-01-01 00:00:30 9.0 @@ -9204,7 +8607,7 @@ def asfreq( Upsample again, providing a ``method``. - >>> df.asfreq(freq='30s', method='bfill') + >>> df.asfreq(freq="30s", method="bfill") s 2000-01-01 00:00:00 0.0 2000-01-01 00:00:30 NaN @@ -9226,7 +8629,7 @@ def asfreq( ) @final - def at_time(self, time, asof: bool_t = False, axis: Axis | None = None) -> Self: + def at_time(self, time, asof: bool = False, axis: Axis | None = None) -> Self: """ Select values at particular time of day (e.g., 9:30AM). @@ -9234,12 +8637,15 @@ def at_time(self, time, asof: bool_t = False, axis: Axis | None = None) -> Self: ---------- time : datetime.time or str The values to select. + asof : bool, default False + This parameter is currently not supported. axis : {0 or 'index', 1 or 'columns'}, default 0 For `Series` this parameter is unused and defaults to 0. Returns ------- Series or DataFrame + The values with the specified time. Raises ------ @@ -9256,8 +8662,8 @@ def at_time(self, time, asof: bool_t = False, axis: Axis | None = None) -> Self: Examples -------- - >>> i = pd.date_range('2018-04-09', periods=4, freq='12h') - >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i) + >>> i = pd.date_range("2018-04-09", periods=4, freq="12h") + >>> ts = pd.DataFrame({"A": [1, 2, 3, 4]}, index=i) >>> ts A 2018-04-09 00:00:00 1 @@ -9265,7 +8671,7 @@ def at_time(self, time, asof: bool_t = False, axis: Axis | None = None) -> Self: 2018-04-10 00:00:00 3 2018-04-10 12:00:00 4 - >>> ts.at_time('12:00') + >>> ts.at_time("12:00") A 2018-04-09 12:00:00 2 2018-04-10 12:00:00 4 @@ -9280,7 +8686,7 @@ def at_time(self, time, asof: bool_t = False, axis: Axis | None = None) -> Self: raise TypeError("Index must be DatetimeIndex") indexer = index.indexer_at_time(time, asof=asof) - return self._take_with_is_copy(indexer, axis=axis) + return self.take(indexer, axis=axis) @final def between_time( @@ -9328,8 +8734,8 @@ def between_time( Examples -------- - >>> i = pd.date_range('2018-04-09', periods=4, freq='1D20min') - >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i) + >>> i = pd.date_range("2018-04-09", periods=4, freq="1D20min") + >>> ts = pd.DataFrame({"A": [1, 2, 3, 4]}, index=i) >>> ts A 2018-04-09 00:00:00 1 @@ -9337,7 +8743,7 @@ def between_time( 2018-04-11 00:40:00 3 2018-04-12 01:00:00 4 - >>> ts.between_time('0:15', '0:45') + >>> ts.between_time("0:15", "0:45") A 2018-04-10 00:20:00 2 2018-04-11 00:40:00 3 @@ -9345,7 +8751,7 @@ def between_time( You get the times that are *not* between two times by setting ``start_time`` later than ``end_time``: - >>> ts.between_time('0:45', '0:15') + >>> ts.between_time("0:45", "0:15") A 2018-04-09 00:00:00 1 2018-04-12 01:00:00 4 @@ -9365,23 +8771,21 @@ def between_time( include_start=left_inclusive, include_end=right_inclusive, ) - return self._take_with_is_copy(indexer, axis=axis) + return self.take(indexer, axis=axis) @final @doc(klass=_shared_doc_kwargs["klass"]) def resample( self, rule, - axis: Axis | lib.NoDefault = lib.no_default, closed: Literal["right", "left"] | None = None, label: Literal["right", "left"] | None = None, convention: Literal["start", "end", "s", "e"] | lib.NoDefault = lib.no_default, - kind: Literal["timestamp", "period"] | None | lib.NoDefault = lib.no_default, on: Level 
| None = None, level: Level | None = None, origin: str | TimestampConvertibleTypes = "start_day", offset: TimedeltaConvertibleTypes | None = None, - group_keys: bool_t = False, + group_keys: bool = False, ) -> Resampler: """ Resample time-series data. @@ -9395,13 +8799,6 @@ def resample( ---------- rule : DateOffset, Timedelta or str The offset string or object representing target conversion. - axis : {{0 or 'index', 1 or 'columns'}}, default 0 - Which axis to use for up- or down-sampling. For `Series` this parameter - is unused and defaults to 0. Must be - `DatetimeIndex`, `TimedeltaIndex` or `PeriodIndex`. - - .. deprecated:: 2.0.0 - Use frame.T.resample(...) instead. closed : {{'right', 'left'}}, default None Which side of bin interval is closed. The default is 'left' for all frequency offsets except for 'ME', 'YE', 'QE', 'BME', @@ -9416,14 +8813,6 @@ def resample( .. deprecated:: 2.2.0 Convert PeriodIndex to DatetimeIndex before resampling instead. - kind : {{'timestamp', 'period'}}, optional, default None - Pass 'timestamp' to convert the resulting index to a - `DateTimeIndex` or 'period' to convert it to a `PeriodIndex`. - By default the input representation is retained. - - .. deprecated:: 2.2.0 - Convert index to desired type explicitly instead. - on : str, optional For a DataFrame, column to use instead of index for resampling. Column must be datetime-like. @@ -9433,7 +8822,7 @@ def resample( origin : Timestamp or str, default 'start_day' The timestamp on which to adjust the grouping. The timezone of origin must match the timezone of the index. - If string, must be one of the following: + If string, must be Timestamp convertible or one of the following: - 'epoch': `origin` is 1970-01-01 - 'start': `origin` is the first value of the timeseries @@ -9490,7 +8879,7 @@ def resample( -------- Start by creating a series with 9 one minute timestamps. - >>> index = pd.date_range('1/1/2000', periods=9, freq='min') + >>> index = pd.date_range("1/1/2000", periods=9, freq="min") >>> series = pd.Series(range(9), index=index) >>> series 2000-01-01 00:00:00 0 @@ -9507,7 +8896,7 @@ def resample( Downsample the series into 3 minute bins and sum the values of the timestamps falling into a bin. - >>> series.resample('3min').sum() + >>> series.resample("3min").sum() 2000-01-01 00:00:00 3 2000-01-01 00:03:00 12 2000-01-01 00:06:00 21 @@ -9521,7 +8910,7 @@ def resample( value in the resampled bucket with the label ``2000-01-01 00:03:00`` does not include 3 (if it did, the summed value would be 6, not 3). - >>> series.resample('3min', label='right').sum() + >>> series.resample("3min", label="right").sum() 2000-01-01 00:03:00 3 2000-01-01 00:06:00 12 2000-01-01 00:09:00 21 @@ -9530,7 +8919,7 @@ def resample( To include this value close the right side of the bin interval, as shown below. - >>> series.resample('3min', label='right', closed='right').sum() + >>> series.resample("3min", label="right", closed="right").sum() 2000-01-01 00:00:00 0 2000-01-01 00:03:00 6 2000-01-01 00:06:00 15 @@ -9539,7 +8928,7 @@ def resample( Upsample the series into 30 second bins. - >>> series.resample('30s').asfreq()[0:5] # Select first 5 rows + >>> series.resample("30s").asfreq()[0:5] # Select first 5 rows 2000-01-01 00:00:00 0.0 2000-01-01 00:00:30 NaN 2000-01-01 00:01:00 1.0 @@ -9550,7 +8939,7 @@ def resample( Upsample the series into 30 second bins and fill the ``NaN`` values using the ``ffill`` method. 
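Because every new bin receives a propagated value here, the result keeps the
original ``int64`` dtype, unlike the ``asfreq`` upsample above, which
introduced ``NaN`` and therefore upcast to float:

>>> series.resample("30s").ffill()[0:5].dtype
dtype('int64')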
- >>> series.resample('30s').ffill()[0:5] + >>> series.resample("30s").ffill()[0:5] 2000-01-01 00:00:00 0 2000-01-01 00:00:30 0 2000-01-01 00:01:00 1 @@ -9561,7 +8950,7 @@ def resample( Upsample the series into 30 second bins and fill the ``NaN`` values using the ``bfill`` method. - >>> series.resample('30s').bfill()[0:5] + >>> series.resample("30s").bfill()[0:5] 2000-01-01 00:00:00 0 2000-01-01 00:00:30 1 2000-01-01 00:01:00 1 @@ -9573,8 +8962,7 @@ def resample( >>> def custom_resampler(arraylike): ... return np.sum(arraylike) + 5 - ... - >>> series.resample('3min').apply(custom_resampler) + >>> series.resample("3min").apply(custom_resampler) 2000-01-01 00:00:00 8 2000-01-01 00:03:00 17 2000-01-01 00:06:00 26 @@ -9583,12 +8971,9 @@ def resample( For DataFrame objects, the keyword `on` can be used to specify the column instead of the index for resampling. - >>> d = {{'price': [10, 11, 9, 13, 14, 18, 17, 19], - ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]}} - >>> df = pd.DataFrame(d) - >>> df['week_starting'] = pd.date_range('01/01/2018', - ... periods=8, - ... freq='W') + >>> df = pd.DataFrame([10, 11, 9, 13, 14, 18, 17, 19], columns=["price"]) + >>> df["volume"] = [50, 60, 40, 100, 50, 100, 40, 50] + >>> df["week_starting"] = pd.date_range("01/01/2018", periods=8, freq="W") >>> df price volume week_starting 0 10 50 2018-01-07 @@ -9599,7 +8984,7 @@ def resample( 5 18 100 2018-02-11 6 17 40 2018-02-18 7 19 50 2018-02-25 - >>> df.resample('ME', on='week_starting').mean() + >>> df.resample("ME", on="week_starting").mean() price volume week_starting 2018-01-31 10.75 62.5 @@ -9608,14 +8993,20 @@ def resample( For a DataFrame with MultiIndex, the keyword `level` can be used to specify on which level the resampling needs to take place. - >>> days = pd.date_range('1/1/2000', periods=4, freq='D') - >>> d2 = {{'price': [10, 11, 9, 13, 14, 18, 17, 19], - ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]}} + >>> days = pd.date_range("1/1/2000", periods=4, freq="D") >>> df2 = pd.DataFrame( - ... d2, - ... index=pd.MultiIndex.from_product( - ... [days, ['morning', 'afternoon']] - ... ) + ... [ + ... [10, 50], + ... [11, 60], + ... [9, 40], + ... [13, 100], + ... [14, 50], + ... [18, 100], + ... [17, 40], + ... [19, 50], + ... ], + ... columns=["price", "volume"], + ... index=pd.MultiIndex.from_product([days, ["morning", "afternoon"]]), ... 
) >>> df2 price volume @@ -9627,7 +9018,7 @@ def resample( afternoon 18 100 2000-01-04 morning 17 40 afternoon 19 50 - >>> df2.resample('D', level=0).sum() + >>> df2.resample("D", level=0).sum() price volume 2000-01-01 21 110 2000-01-02 22 140 @@ -9636,8 +9027,8 @@ def resample( If you want to adjust the start of the bins based on a fixed timestamp: - >>> start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00' - >>> rng = pd.date_range(start, end, freq='7min') + >>> start, end = "2000-10-01 23:30:00", "2000-10-02 00:30:00" + >>> rng = pd.date_range(start, end, freq="7min") >>> ts = pd.Series(np.arange(len(rng)) * 3, index=rng) >>> ts 2000-10-01 23:30:00 0 @@ -9651,7 +9042,7 @@ def resample( 2000-10-02 00:26:00 24 Freq: 7min, dtype: int64 - >>> ts.resample('17min').sum() + >>> ts.resample("17min").sum() 2000-10-01 23:14:00 0 2000-10-01 23:31:00 9 2000-10-01 23:48:00 21 @@ -9659,7 +9050,7 @@ def resample( 2000-10-02 00:22:00 24 Freq: 17min, dtype: int64 - >>> ts.resample('17min', origin='epoch').sum() + >>> ts.resample("17min", origin="epoch").sum() 2000-10-01 23:18:00 0 2000-10-01 23:35:00 18 2000-10-01 23:52:00 27 @@ -9667,7 +9058,7 @@ def resample( 2000-10-02 00:26:00 24 Freq: 17min, dtype: int64 - >>> ts.resample('17min', origin='2000-01-01').sum() + >>> ts.resample("17min", origin="2000-01-01").sum() 2000-10-01 23:24:00 3 2000-10-01 23:41:00 15 2000-10-01 23:58:00 45 @@ -9677,14 +9068,14 @@ def resample( If you want to adjust the start of the bins with an `offset` Timedelta, the two following lines are equivalent: - >>> ts.resample('17min', origin='start').sum() + >>> ts.resample("17min", origin="start").sum() 2000-10-01 23:30:00 9 2000-10-01 23:47:00 21 2000-10-02 00:04:00 54 2000-10-02 00:21:00 24 Freq: 17min, dtype: int64 - >>> ts.resample('17min', offset='23h30min').sum() + >>> ts.resample("17min", offset="23h30min").sum() 2000-10-01 23:30:00 9 2000-10-01 23:47:00 21 2000-10-02 00:04:00 54 @@ -9693,7 +9084,7 @@ def resample( If you want to take the largest Timestamp as the end of the bins: - >>> ts.resample('17min', origin='end').sum() + >>> ts.resample("17min", origin="end").sum() 2000-10-01 23:35:00 0 2000-10-01 23:52:00 18 2000-10-02 00:09:00 27 @@ -9704,7 +9095,7 @@ def resample( midnight of the largest Timestamp as the end of the bins and drop the bins not containing data: - >>> ts.resample('17min', origin='end_day').sum() + >>> ts.resample("17min", origin="end_day").sum() 2000-10-01 23:38:00 3 2000-10-01 23:55:00 15 2000-10-02 00:12:00 45 @@ -9713,37 +9104,6 @@ def resample( """ from pandas.core.resample import get_resampler - if axis is not lib.no_default: - axis = self._get_axis_number(axis) - if axis == 1: - warnings.warn( - "DataFrame.resample with axis=1 is deprecated. Do " - "`frame.T.resample(...)` without axis instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - warnings.warn( - f"The 'axis' keyword in {type(self).__name__}.resample is " - "deprecated and will be removed in a future version.", - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - axis = 0 - - if kind is not lib.no_default: - # GH#55895 - warnings.warn( - f"The 'kind' keyword in {type(self).__name__}.resample is " - "deprecated and will be removed in a future version. 
" - "Explicitly cast the index to the desired type instead", - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - kind = None - if convention is not lib.no_default: warnings.warn( f"The 'convention' keyword in {type(self).__name__}.resample is " @@ -9761,8 +9121,6 @@ def resample( freq=rule, label=label, closed=closed, - axis=axis, - kind=kind, convention=convention, key=on, level=level, @@ -9771,178 +9129,15 @@ def resample( group_keys=group_keys, ) - @final - def first(self, offset) -> Self: - """ - Select initial periods of time series data based on a date offset. - - .. deprecated:: 2.1 - :meth:`.first` is deprecated and will be removed in a future version. - Please create a mask and filter using `.loc` instead. - - For a DataFrame with a sorted DatetimeIndex, this function can - select the first few rows based on a date offset. - - Parameters - ---------- - offset : str, DateOffset or dateutil.relativedelta - The offset length of the data that will be selected. For instance, - '1ME' will display all the rows having their index within the first month. - - Returns - ------- - Series or DataFrame - A subset of the caller. - - Raises - ------ - TypeError - If the index is not a :class:`DatetimeIndex` - - See Also - -------- - last : Select final periods of time series based on a date offset. - at_time : Select values at a particular time of the day. - between_time : Select values between particular times of the day. - - Examples - -------- - >>> i = pd.date_range('2018-04-09', periods=4, freq='2D') - >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i) - >>> ts - A - 2018-04-09 1 - 2018-04-11 2 - 2018-04-13 3 - 2018-04-15 4 - - Get the rows for the first 3 days: - - >>> ts.first('3D') - A - 2018-04-09 1 - 2018-04-11 2 - - Notice the data for 3 first calendar days were returned, not the first - 3 days observed in the dataset, and therefore data for 2018-04-13 was - not returned. - """ - warnings.warn( - "first is deprecated and will be removed in a future version. " - "Please create a mask and filter using `.loc` instead", - FutureWarning, - stacklevel=find_stack_level(), - ) - if not isinstance(self.index, DatetimeIndex): - raise TypeError("'first' only supports a DatetimeIndex index") - - if len(self.index) == 0: - return self.copy(deep=False) - - offset = to_offset(offset) - if not isinstance(offset, Tick) and offset.is_on_offset(self.index[0]): - # GH#29623 if first value is end of period, remove offset with n = 1 - # before adding the real offset - end_date = end = self.index[0] - offset.base + offset - else: - end_date = end = self.index[0] + offset - - # Tick-like, e.g. 3 weeks - if isinstance(offset, Tick) and end_date in self.index: - end = self.index.searchsorted(end_date, side="left") - return self.iloc[:end] - - return self.loc[:end] - - @final - def last(self, offset) -> Self: - """ - Select final periods of time series data based on a date offset. - - .. deprecated:: 2.1 - :meth:`.last` is deprecated and will be removed in a future version. - Please create a mask and filter using `.loc` instead. - - For a DataFrame with a sorted DatetimeIndex, this function - selects the last few rows based on a date offset. - - Parameters - ---------- - offset : str, DateOffset, dateutil.relativedelta - The offset length of the data that will be selected. For instance, - '3D' will display all the rows having their index within the last 3 days. - - Returns - ------- - Series or DataFrame - A subset of the caller. 
- - Raises - ------ - TypeError - If the index is not a :class:`DatetimeIndex` - - See Also - -------- - first : Select initial periods of time series based on a date offset. - at_time : Select values at a particular time of the day. - between_time : Select values between particular times of the day. - - Notes - ----- - .. deprecated:: 2.1.0 - Please create a mask and filter using `.loc` instead - - Examples - -------- - >>> i = pd.date_range('2018-04-09', periods=4, freq='2D') - >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i) - >>> ts - A - 2018-04-09 1 - 2018-04-11 2 - 2018-04-13 3 - 2018-04-15 4 - - Get the rows for the last 3 days: - - >>> ts.last('3D') # doctest: +SKIP - A - 2018-04-13 3 - 2018-04-15 4 - - Notice the data for 3 last calendar days were returned, not the last - 3 observed days in the dataset, and therefore data for 2018-04-11 was - not returned. - """ - warnings.warn( - "last is deprecated and will be removed in a future version. " - "Please create a mask and filter using `.loc` instead", - FutureWarning, - stacklevel=find_stack_level(), - ) - - if not isinstance(self.index, DatetimeIndex): - raise TypeError("'last' only supports a DatetimeIndex index") - - if len(self.index) == 0: - return self.copy(deep=False) - - offset = to_offset(offset) - - start_date = self.index[-1] - offset - start = self.index.searchsorted(start_date, side="right") - return self.iloc[start:] - @final def rank( self, axis: Axis = 0, method: Literal["average", "min", "max", "first", "dense"] = "average", - numeric_only: bool_t = False, + numeric_only: bool = False, na_option: Literal["keep", "top", "bottom"] = "keep", - ascending: bool_t = True, - pct: bool_t = False, + ascending: bool = True, + pct: bool = False, ) -> Self: """ Compute numerical data ranks (1 through n) along axis. @@ -9995,9 +9190,12 @@ def rank( Examples -------- - >>> df = pd.DataFrame(data={'Animal': ['cat', 'penguin', 'dog', - ... 'spider', 'snake'], - ... 'Number_legs': [4, 2, 4, 8, np.nan]}) + >>> df = pd.DataFrame( + ... data={ + ... "Animal": ["cat", "penguin", "dog", "spider", "snake"], + ... "Number_legs": [4, 2, 4, 8, np.nan], + ... } + ... ) >>> df Animal Number_legs 0 cat 4.0 @@ -10031,10 +9229,10 @@ def rank( * pct_rank: when setting ``pct = True``, the ranking is expressed as percentile rank. 
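For comparison, ``method="dense"`` ranks like ``"min"`` but without gaps
between the rank groups (illustrative, reusing the same ``df``):

>>> df["Number_legs"].rank(method="dense")
0    2.0
1    1.0
2    2.0
3    3.0
4    NaN
Name: Number_legs, dtype: float64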
- >>> df['default_rank'] = df['Number_legs'].rank() - >>> df['max_rank'] = df['Number_legs'].rank(method='max') - >>> df['NA_bottom'] = df['Number_legs'].rank(na_option='bottom') - >>> df['pct_rank'] = df['Number_legs'].rank(pct=True) + >>> df["default_rank"] = df["Number_legs"].rank() + >>> df["max_rank"] = df["Number_legs"].rank(method="max") + >>> df["NA_bottom"] = df["Number_legs"].rank(na_option="bottom") + >>> df["pct_rank"] = df["Number_legs"].rank(pct=True) >>> df Animal Number_legs default_rank max_rank NA_bottom pct_rank 0 cat 4.0 2.5 3.0 2.5 0.625 @@ -10094,10 +9292,10 @@ def ranker(data): @doc(_shared_docs["compare"], klass=_shared_doc_kwargs["klass"]) def compare( self, - other, + other: Self, align_axis: Axis = 1, - keep_shape: bool_t = False, - keep_equal: bool_t = False, + keep_shape: bool = False, + keep_equal: bool = False, result_names: Suffixes = ("self", "other"), ): if type(self) is not type(other): @@ -10106,7 +9304,8 @@ def compare( f"can only compare '{cls_self}' (not '{cls_other}') with '{cls_self}'" ) - mask = ~((self == other) | (self.isna() & other.isna())) + # error: Unsupported left operand type for & ("Self") + mask = ~((self == other) | (self.isna() & other.isna())) # type: ignore[operator] mask.fillna(True, inplace=True) if not keep_equal: @@ -10164,7 +9363,9 @@ def compare( # reorder axis to keep things organized indices = ( - np.arange(diff.shape[axis]).reshape([2, diff.shape[axis] // 2]).T.flatten() + np.arange(diff.shape[axis]) + .reshape([2, diff.shape[axis] // 2]) + .T.reshape(-1) ) diff = diff.take(indices, axis=axis) @@ -10181,12 +9382,8 @@ def align( join: AlignJoin = "outer", axis: Axis | None = None, level: Level | None = None, - copy: bool_t | None = None, + copy: bool | lib.NoDefault = lib.no_default, fill_value: Hashable | None = None, - method: FillnaOptions | None | lib.NoDefault = lib.no_default, - limit: int | None | lib.NoDefault = lib.no_default, - fill_axis: Axis | lib.NoDefault = lib.no_default, - broadcast_axis: Axis | None | lib.NoDefault = lib.no_default, ) -> tuple[Self, NDFrameT]: """ Align two objects on their axes with the specified join method. @@ -10196,6 +9393,7 @@ def align( Parameters ---------- other : DataFrame or Series + The object to align with. join : {{'outer', 'inner', 'left', 'right'}}, default 'outer' Type of alignment to be performed. @@ -10210,7 +9408,7 @@ def align( level : int or level name, default None Broadcast across a level, matching Index values on the passed MultiIndex level. - copy : bool, default True + copy : bool, default False Always returns new objects. If copy=False and no reindexing is required then original objects are returned. @@ -10225,43 +9423,22 @@ def align( You can already get the future behavior and improvements through enabling copy on write ``pd.options.mode.copy_on_write = True`` + + .. deprecated:: 3.0.0 fill_value : scalar, default np.nan Value to use for missing values. Defaults to NaN, but can be any "compatible" value. - method : {{'backfill', 'bfill', 'pad', 'ffill', None}}, default None - Method to use for filling holes in reindexed Series: - - - pad / ffill: propagate last valid observation forward to next valid. - - backfill / bfill: use NEXT valid observation to fill gap. - - .. deprecated:: 2.1 - - limit : int, default None - If method is specified, this is the maximum number of consecutive - NaN values to forward/backward fill. In other words, if there is - a gap with more than this number of consecutive NaNs, it will only - be partially filled. 
If method is not specified, this is the - maximum number of entries along the entire axis where NaNs will be - filled. Must be greater than 0 if not None. - - .. deprecated:: 2.1 - - fill_axis : {axes_single_arg}, default 0 - Filling axis, method and limit. - - .. deprecated:: 2.1 - - broadcast_axis : {axes_single_arg}, default None - Broadcast values along this axis, if aligning two objects of - different dimensions. - - .. deprecated:: 2.1 Returns ------- tuple of ({klass}, type of other) Aligned objects. + See Also + -------- + Series.align : Align two objects on their axes with specified join method. + DataFrame.align : Align two objects on their axes with specified join method. + Examples -------- >>> df = pd.DataFrame( @@ -10327,93 +9504,7 @@ def align( 3 60.0 70.0 80.0 90.0 NaN 4 600.0 700.0 800.0 900.0 NaN """ - if ( - method is not lib.no_default - or limit is not lib.no_default - or fill_axis is not lib.no_default - ): - # GH#51856 - warnings.warn( - "The 'method', 'limit', and 'fill_axis' keywords in " - f"{type(self).__name__}.align are deprecated and will be removed " - "in a future version. Call fillna directly on the returned objects " - "instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - if fill_axis is lib.no_default: - fill_axis = 0 - if method is lib.no_default: - method = None - if limit is lib.no_default: - limit = None - - if method is not None: - method = clean_fill_method(method) - - if broadcast_axis is not lib.no_default: - # GH#51856 - # TODO(3.0): enforcing this deprecation will close GH#13194 - msg = ( - f"The 'broadcast_axis' keyword in {type(self).__name__}.align is " - "deprecated and will be removed in a future version." - ) - if broadcast_axis is not None: - if self.ndim == 1 and other.ndim == 2: - msg += ( - " Use left = DataFrame({col: left for col in right.columns}, " - "index=right.index) before calling `left.align(right)` instead." 
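# Annotation: the replacement pattern named in the removed warnings above is
# to align first and then fill the returned objects directly. Minimal sketch,
# reusing ``df`` and ``other`` from the Examples section:
# >>> left, right = df.align(other, join="outer", axis=1)
# >>> left, right = left.fillna(0), right.fillna(0)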
- ) - elif self.ndim == 2 and other.ndim == 1: - msg += ( - " Use right = DataFrame({col: right for col in left.columns}, " - "index=left.index) before calling `left.align(right)` instead" - ) - warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) - else: - broadcast_axis = None - - if broadcast_axis == 1 and self.ndim != other.ndim: - if isinstance(self, ABCSeries): - # this means other is a DataFrame, and we need to broadcast - # self - cons = self._constructor_expanddim - df = cons( - {c: self for c in other.columns}, **other._construct_axes_dict() - ) - # error: Incompatible return value type (got "Tuple[DataFrame, - # DataFrame]", expected "Tuple[Self, NDFrameT]") - return df._align_frame( # type: ignore[return-value] - other, # type: ignore[arg-type] - join=join, - axis=axis, - level=level, - copy=copy, - fill_value=fill_value, - method=method, - limit=limit, - fill_axis=fill_axis, - )[:2] - elif isinstance(other, ABCSeries): - # this means self is a DataFrame, and we need to broadcast - # other - cons = other._constructor_expanddim - df = cons( - {c: other for c in self.columns}, **self._construct_axes_dict() - ) - # error: Incompatible return value type (got "Tuple[NDFrameT, - # DataFrame]", expected "Tuple[Self, NDFrameT]") - return self._align_frame( # type: ignore[return-value] - df, - join=join, - axis=axis, - level=level, - copy=copy, - fill_value=fill_value, - method=method, - limit=limit, - fill_axis=fill_axis, - )[:2] + self._check_copy_deprecation(copy) _right: DataFrame | Series if axis is not None: @@ -10424,11 +9515,7 @@ def align( join=join, axis=axis, level=level, - copy=copy, fill_value=fill_value, - method=method, - limit=limit, - fill_axis=fill_axis, ) elif isinstance(other, ABCSeries): @@ -10437,11 +9524,7 @@ def align( join=join, axis=axis, level=level, - copy=copy, fill_value=fill_value, - method=method, - limit=limit, - fill_axis=fill_axis, ) else: # pragma: no cover raise TypeError(f"unsupported type: {type(other)}") @@ -10471,11 +9554,7 @@ def _align_frame( join: AlignJoin = "outer", axis: Axis | None = None, level=None, - copy: bool_t | None = None, fill_value=None, - method=None, - limit: int | None = None, - fill_axis: Axis = 0, ) -> tuple[Self, DataFrame, Index | None]: # defaults join_index, join_columns = None, None @@ -10504,20 +9583,14 @@ def _align_frame( reindexers = {0: [join_index, ilidx], 1: [join_columns, clidx]} left = self._reindex_with_indexers( - reindexers, copy=copy, fill_value=fill_value, allow_dups=True + reindexers, fill_value=fill_value, allow_dups=True ) # other must be always DataFrame right = other._reindex_with_indexers( {0: [join_index, iridx], 1: [join_columns, cridx]}, - copy=copy, fill_value=fill_value, allow_dups=True, ) - - if method is not None: - left = left._pad_or_backfill(method, axis=fill_axis, limit=limit) - right = right._pad_or_backfill(method, axis=fill_axis, limit=limit) - return left, right, join_index @final @@ -10527,15 +9600,9 @@ def _align_series( join: AlignJoin = "outer", axis: Axis | None = None, level=None, - copy: bool_t | None = None, fill_value=None, - method=None, - limit: int | None = None, - fill_axis: Axis = 0, ) -> tuple[Self, Series, Index | None]: is_series = isinstance(self, ABCSeries) - if copy and using_copy_on_write(): - copy = False if (not is_series and axis is None) or axis not in [None, 0, 1]: raise ValueError("Must specify axis=0 or 1") @@ -10554,14 +9621,14 @@ def _align_series( ) if is_series: - left = self._reindex_indexer(join_index, lidx, copy) + left = 
self._reindex_indexer(join_index, lidx) elif lidx is None or join_index is None: - left = self.copy(deep=copy) + left = self.copy(deep=False) else: - new_mgr = self._mgr.reindex_indexer(join_index, lidx, axis=1, copy=copy) + new_mgr = self._mgr.reindex_indexer(join_index, lidx, axis=1) left = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes) - right = other._reindex_indexer(join_index, ridx, copy) + right = other._reindex_indexer(join_index, ridx) else: # one has > 1 ndim @@ -10577,39 +9644,61 @@ def _align_series( bm_axis = self._get_block_manager_axis(1) fdata = fdata.reindex_indexer(join_index, lidx, axis=bm_axis) - if copy and fdata is self._mgr: - fdata = fdata.copy() - left = self._constructor_from_mgr(fdata, axes=fdata.axes) - if ridx is None: - right = other.copy(deep=copy) - else: - right = other.reindex(join_index, level=level) + right = other._reindex_indexer(join_index, ridx) + + # fill + fill_na = notna(fill_value) + if fill_na: + left = left.fillna(fill_value) + right = right.fillna(fill_value) + + return left, right, join_index + + @overload + def _where( + self, + cond, + other=..., + *, + inplace: Literal[False] = ..., + axis: Axis | None = ..., + level=..., + ) -> Self: ... - # fill - fill_na = notna(fill_value) or (method is not None) - if fill_na: - fill_value, method = validate_fillna_kwargs(fill_value, method) - if method is not None: - left = left._pad_or_backfill(method, limit=limit, axis=fill_axis) - right = right._pad_or_backfill(method, limit=limit) - else: - left = left.fillna(fill_value, limit=limit, axis=fill_axis) - right = right.fillna(fill_value, limit=limit) + @overload + def _where( + self, + cond, + other=..., + *, + inplace: Literal[True], + axis: Axis | None = ..., + level=..., + ) -> None: ... - return left, right, join_index + @overload + def _where( + self, + cond, + other=..., + *, + inplace: bool, + axis: Axis | None = ..., + level=..., + ) -> Self | None: ... @final def _where( self, cond, other=lib.no_default, - inplace: bool_t = False, + *, + inplace: bool = False, axis: Axis | None = None, level=None, - warn: bool_t = True, - ): + ) -> Self | None: """ Equivalent to public method `where`, except that `other` is not applied as a function even if callable. Used in __setitem__. @@ -10625,11 +9714,11 @@ def _where( # CoW: Make sure reference is not kept alive if cond.ndim == 1 and self.ndim == 2: cond = cond._constructor_expanddim( - {i: cond for i in range(len(self.columns))}, + dict.fromkeys(range(len(self.columns)), cond), copy=False, ) cond.columns = self.columns - cond = cond.align(self, join="right", copy=False)[0] + cond = cond.align(self, join="right")[0] else: if not hasattr(cond, "shape"): cond = np.asanyarray(cond) @@ -10639,14 +9728,8 @@ def _where( # make sure we are boolean fill_value = bool(inplace) - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - "Downcasting object dtype arrays", - category=FutureWarning, - ) - cond = cond.fillna(fill_value) - cond = cond.infer_objects(copy=False) + cond = cond.fillna(fill_value) + cond = cond.infer_objects() msg = "Boolean array expected for the condition, not {dtype}" @@ -10654,11 +9737,11 @@ def _where( if not isinstance(cond, ABCDataFrame): # This is a single-dimensional object. 
if not is_bool_dtype(cond): - raise ValueError(msg.format(dtype=cond.dtype)) + raise TypeError(msg.format(dtype=cond.dtype)) else: - for _dt in cond.dtypes: - if not is_bool_dtype(_dt): - raise ValueError(msg.format(dtype=_dt)) + for block in cond._mgr.blocks: + if not is_bool_dtype(block.dtype): + raise TypeError(msg.format(dtype=block.dtype)) if cond._mgr.any_extension_types: # GH51574: avoid object ndarray conversion later on cond = cond._constructor( @@ -10670,7 +9753,7 @@ def _where( cond = cond.astype(bool) cond = -cond if inplace else cond - cond = cond.reindex(self._info_axis, axis=self._info_axis_number, copy=False) + cond = cond.reindex(self._info_axis, axis=self._info_axis_number) # try to align with other if isinstance(other, NDFrame): @@ -10683,7 +9766,6 @@ def _where( axis=axis, level=level, fill_value=None, - copy=False, )[1] # if we are NOT aligned, raise as we cannot where index @@ -10739,7 +9821,7 @@ def _where( # we may have different type blocks come out of putmask, so # reconstruct the block manager - new_data = self._mgr.putmask(mask=cond, new=other, align=align, warn=warn) + new_data = self._mgr.putmask(mask=cond, new=other, align=align) result = self._constructor_from_mgr(new_data, axes=new_data.axes) return self._update_inplace(result) @@ -10761,8 +9843,7 @@ def where( inplace: Literal[False] = ..., axis: Axis | None = ..., level: Level = ..., - ) -> Self: - ... + ) -> Self: ... @overload def where( @@ -10773,8 +9854,7 @@ def where( inplace: Literal[True], axis: Axis | None = ..., level: Level = ..., - ) -> None: - ... + ) -> None: ... @overload def where( @@ -10782,11 +9862,10 @@ def where( cond, other=..., *, - inplace: bool_t = ..., + inplace: bool = ..., axis: Axis | None = ..., level: Level = ..., - ) -> Self | None: - ... + ) -> Self | None: ... @final @doc( @@ -10801,7 +9880,7 @@ def where( cond, other=np.nan, *, - inplace: bool_t = False, + inplace: bool = False, axis: Axis | None = None, level: Level | None = None, ) -> Self | None: @@ -10835,25 +9914,30 @@ def where( Returns ------- - Same type as caller or None if ``inplace=True``. + Series or DataFrame or None + When applied to a Series, the function will return a Series, + and when applied to a DataFrame, it will return a DataFrame; + if ``inplace=True``, it will return None. See Also -------- :func:`DataFrame.{name_other}` : Return an object of same shape as - self. + caller. + :func:`Series.{name_other}` : Return an object of same shape as + caller. Notes ----- The {name} method is an application of the if-then idiom. For each - element in the calling DataFrame, if ``cond`` is ``{cond}`` the - element is used; otherwise the corresponding element from the DataFrame + element in the caller, if ``cond`` is ``{cond}`` the + element is used; otherwise the corresponding element from ``other`` is used. If the axis of ``other`` does not align with axis of - ``cond`` {klass}, the misaligned index positions will be filled with - {cond_rev}. + ``cond`` {klass}, the values of ``cond`` on misaligned index positions + will be filled with {cond_rev}. - The signature for :func:`DataFrame.where` differs from - :func:`numpy.where`. Roughly ``df1.where(m, df2)`` is equivalent to - ``np.where(m, df1, df2)``. + The signature for :func:`Series.where` or + :func:`DataFrame.where` differs from :func:`numpy.where`. + Roughly ``df1.where(m, df2)`` is equivalent to ``np.where(m, df1, df2)``. For further details and examples see the ``{name}`` documentation in :ref:`indexing `. 
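A consequence of the exception change earlier in this hunk: passing a
non-boolean condition to ``where``/``mask`` now raises ``TypeError`` instead
of ``ValueError``. Sketch:

>>> s = pd.Series([1, 2, 3])
>>> s.where(s)
Traceback (most recent call last):
...
TypeError: Boolean array expected for the condition, not int64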
@@ -10911,7 +9995,7 @@ def where( 4 10 dtype: int64 - >>> df = pd.DataFrame(np.arange(10).reshape(-1, 2), columns=['A', 'B']) + >>> df = pd.DataFrame(np.arange(10).reshape(-1, 2), columns=["A", "B"]) >>> df A B 0 0 1 @@ -10944,32 +10028,16 @@ def where( """ inplace = validate_bool_kwarg(inplace, "inplace") if inplace: - if not PYPY and using_copy_on_write(): + if not PYPY: if sys.getrefcount(self) <= REF_COUNT: warnings.warn( _chained_assignment_method_msg, ChainedAssignmentError, stacklevel=2, ) - elif ( - not PYPY - and not using_copy_on_write() - and self._is_view_after_cow_rules() - ): - ctr = sys.getrefcount(self) - ref_count = REF_COUNT - if isinstance(self, ABCSeries) and hasattr(self, "_cacher"): - # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 - ref_count += 1 - if ctr <= ref_count: - warnings.warn( - _chained_assignment_warning_method_msg, - FutureWarning, - stacklevel=2, - ) other = common.apply_if_callable(other, self) - return self._where(cond, other, inplace, axis, level) + return self._where(cond, other, inplace=inplace, axis=axis, level=level) @overload def mask( @@ -10980,8 +10048,7 @@ def mask( inplace: Literal[False] = ..., axis: Axis | None = ..., level: Level = ..., - ) -> Self: - ... + ) -> Self: ... @overload def mask( @@ -10992,8 +10059,7 @@ def mask( inplace: Literal[True], axis: Axis | None = ..., level: Level = ..., - ) -> None: - ... + ) -> None: ... @overload def mask( @@ -11001,11 +10067,10 @@ def mask( cond, other=..., *, - inplace: bool_t = ..., + inplace: bool = ..., axis: Axis | None = ..., level: Level = ..., - ) -> Self | None: - ... + ) -> Self | None: ... @final @doc( @@ -11021,35 +10086,19 @@ def mask( cond, other=lib.no_default, *, - inplace: bool_t = False, + inplace: bool = False, axis: Axis | None = None, level: Level | None = None, ) -> Self | None: inplace = validate_bool_kwarg(inplace, "inplace") if inplace: - if not PYPY and using_copy_on_write(): + if not PYPY: if sys.getrefcount(self) <= REF_COUNT: warnings.warn( _chained_assignment_method_msg, ChainedAssignmentError, stacklevel=2, ) - elif ( - not PYPY - and not using_copy_on_write() - and self._is_view_after_cow_rules() - ): - ctr = sys.getrefcount(self) - ref_count = REF_COUNT - if isinstance(self, ABCSeries) and hasattr(self, "_cacher"): - # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 - ref_count += 1 - if ctr <= ref_count: - warnings.warn( - _chained_assignment_warning_method_msg, - FutureWarning, - stacklevel=2, - ) cond = common.apply_if_callable(cond, self) other = common.apply_if_callable(other, self) @@ -11107,7 +10156,7 @@ def shift( fill_value : object, optional The scalar value to use for newly introduced missing values. the default depends on the dtype of `self`. - For numeric data, ``np.nan`` is used. + For Boolean and numeric NumPy data types, ``np.nan`` is used. For datetime, timedelta, or period data, etc. :attr:`NaT` is used. For extension dtypes, ``self.dtype.na_value`` is used. suffix : str, optional @@ -11127,10 +10176,11 @@ def shift( Examples -------- - >>> df = pd.DataFrame({{"Col1": [10, 20, 15, 30, 45], - ... "Col2": [13, 23, 18, 33, 48], - ... "Col3": [17, 27, 22, 37, 52]}}, - ... index=pd.date_range("2020-01-01", "2020-01-05")) + >>> df = pd.DataFrame( + ... [[10, 13, 17], [20, 23, 27], [15, 18, 22], [30, 33, 37], [45, 48, 52]], + ... columns=["Col1", "Col2", "Col3"], + ... index=pd.date_range("2020-01-01", "2020-01-05"), + ... 
) >>> df Col1 Col2 Col3 2020-01-01 10 13 17 @@ -11179,7 +10229,7 @@ def shift( 2020-01-07 30 33 37 2020-01-08 45 48 52 - >>> df['Col1'].shift(periods=[0, 1, 2]) + >>> df["Col1"].shift(periods=[0, 1, 2]) Col1_0 Col1_1 Col1_2 2020-01-01 10 NaN NaN 2020-01-02 20 10.0 NaN @@ -11191,17 +10241,12 @@ def shift( if freq is not None and fill_value is not lib.no_default: # GH#53832 - warnings.warn( - "Passing a 'freq' together with a 'fill_value' silently ignores " - "the fill_value and is deprecated. This will raise in a future " - "version.", - FutureWarning, - stacklevel=find_stack_level(), + raise ValueError( + "Passing a 'freq' together with a 'fill_value' is not allowed." ) - fill_value = lib.no_default if periods == 0: - return self.copy(deep=None) + return self.copy(deep=False) if is_list_like(periods) and isinstance(self, ABCSeries): return self.to_frame().shift( @@ -11245,11 +10290,11 @@ def _shift_with_freq(self, periods: int, axis: int, freq) -> Self: if freq != orig_freq: assert orig_freq is not None # for mypy raise ValueError( - f"Given freq {freq_to_period_freqstr(freq.n, freq.name)} " + f"Given freq {PeriodDtype(freq)._freqstr} " f"does not match PeriodIndex freq " - f"{freq_to_period_freqstr(orig_freq.n, orig_freq.name)}" + f"{PeriodDtype(orig_freq)._freqstr}" ) - new_ax = index.shift(periods) + new_ax: Index = index.shift(periods) else: new_ax = index.shift(periods, freq) @@ -11262,7 +10307,7 @@ def truncate( before=None, after=None, axis: Axis | None = None, - copy: bool_t | None = None, + copy: bool | lib.NoDefault = lib.no_default, ) -> Self: """ Truncate a Series or DataFrame before and after some index value. @@ -11279,7 +10324,7 @@ def truncate( axis : {0 or 'index', 1 or 'columns'}, optional Axis to truncate. Truncates the index (rows) by default. For `Series` this parameter is unused and defaults to 0. - copy : bool, default is True, + copy : bool, default is False, Return a copy of the truncated section. .. note:: @@ -11294,6 +10339,8 @@ def truncate( You can already get the future behavior and improvements through enabling copy on write ``pd.options.mode.copy_on_write = True`` + .. deprecated:: 3.0.0 + Returns ------- type of caller @@ -11312,10 +10359,14 @@ def truncate( Examples -------- - >>> df = pd.DataFrame({'A': ['a', 'b', 'c', 'd', 'e'], - ... 'B': ['f', 'g', 'h', 'i', 'j'], - ... 'C': ['k', 'l', 'm', 'n', 'o']}, - ... index=[1, 2, 3, 4, 5]) + >>> df = pd.DataFrame( + ... { + ... "A": ["a", "b", "c", "d", "e"], + ... "B": ["f", "g", "h", "i", "j"], + ... "C": ["k", "l", "m", "n", "o"], + ... }, + ... index=[1, 2, 3, 4, 5], + ... ) >>> df A B C 1 a f k @@ -11342,7 +10393,7 @@ def truncate( For Series, only rows can be truncated. - >>> df['A'].truncate(before=2, after=4) + >>> df["A"].truncate(before=2, after=4) 2 b 3 c 4 d @@ -11351,8 +10402,8 @@ def truncate( The index values in ``truncate`` can be datetimes or string dates. - >>> dates = pd.date_range('2016-01-01', '2016-02-01', freq='s') - >>> df = pd.DataFrame(index=dates, data={'A': 1}) + >>> dates = pd.date_range("2016-01-01", "2016-02-01", freq="s") + >>> df = pd.DataFrame(index=dates, data={"A": 1}) >>> df.tail() A 2016-01-31 23:59:56 1 @@ -11361,8 +10412,9 @@ def truncate( 2016-01-31 23:59:59 1 2016-02-01 00:00:00 1 - >>> df.truncate(before=pd.Timestamp('2016-01-05'), - ... after=pd.Timestamp('2016-01-10')).tail() + >>> df.truncate( + ... before=pd.Timestamp("2016-01-05"), after=pd.Timestamp("2016-01-10") + ... 
).tail() A 2016-01-09 23:59:56 1 2016-01-09 23:59:57 1 @@ -11374,7 +10426,7 @@ def truncate( specify `before` and `after` as strings. They will be coerced to Timestamps before truncation. - >>> df.truncate('2016-01-05', '2016-01-10').tail() + >>> df.truncate("2016-01-05", "2016-01-10").tail() A 2016-01-09 23:59:56 1 2016-01-09 23:59:57 1 @@ -11386,7 +10438,7 @@ def truncate( component (midnight). This differs from partial string slicing, which returns any partially matching dates. - >>> df.loc['2016-01-05':'2016-01-10', :].tail() + >>> df.loc["2016-01-05":"2016-01-10", :].tail() A 2016-01-10 23:59:55 1 2016-01-10 23:59:56 1 @@ -11394,6 +10446,8 @@ def truncate( 2016-01-10 23:59:58 1 2016-01-10 23:59:59 1 """ + self._check_copy_deprecation(copy) + if axis is None: axis = 0 axis = self._get_axis_number(axis) @@ -11425,14 +10479,18 @@ def truncate( if isinstance(ax, MultiIndex): setattr(result, self._get_axis_name(axis), ax.truncate(before, after)) - result = result.copy(deep=copy and not using_copy_on_write()) + result = result.copy(deep=False) return result @final @doc(klass=_shared_doc_kwargs["klass"]) def tz_convert( - self, tz, axis: Axis = 0, level=None, copy: bool_t | None = None + self, + tz, + axis: Axis = 0, + level=None, + copy: bool | lib.NoDefault = lib.no_default, ) -> Self: """ Convert tz-aware axis to target time zone. @@ -11447,7 +10505,7 @@ def tz_convert( level : int, str, default None If axis is a MultiIndex, convert a specific level. Otherwise must be None. - copy : bool, default True + copy : bool, default False Also make a copy of the underlying data. .. note:: @@ -11462,6 +10520,8 @@ def tz_convert( You can already get the future behavior and improvements through enabling copy on write ``pd.options.mode.copy_on_write = True`` + .. deprecated:: 3.0.0 + Returns ------- {klass} @@ -11472,26 +10532,31 @@ def tz_convert( TypeError If the axis is tz-naive. + See Also + -------- + DataFrame.tz_localize: Localize tz-naive index of DataFrame to target time zone. + Series.tz_localize: Localize tz-naive index of Series to target time zone. + Examples -------- Change to another time zone: >>> s = pd.Series( ... [1], - ... index=pd.DatetimeIndex(['2018-09-15 01:30:00+02:00']), + ... index=pd.DatetimeIndex(["2018-09-15 01:30:00+02:00"]), ... ) - >>> s.tz_convert('Asia/Shanghai') + >>> s.tz_convert("Asia/Shanghai") 2018-09-15 07:30:00+08:00 1 dtype: int64 Pass None to convert to UTC and get a tz-naive index: - >>> s = pd.Series([1], - ... index=pd.DatetimeIndex(['2018-09-15 01:30:00+02:00'])) + >>> s = pd.Series([1], index=pd.DatetimeIndex(["2018-09-15 01:30:00+02:00"])) >>> s.tz_convert(None) 2018-09-14 23:30:00 1 dtype: int64 """ + self._check_copy_deprecation(copy) axis = self._get_axis_number(axis) ax = self._get_axis(axis) @@ -11518,8 +10583,8 @@ def _tz_convert(ax, tz): raise ValueError(f"The level {level} is not valid") ax = _tz_convert(ax, tz) - result = self.copy(deep=copy and not using_copy_on_write()) - result = result.set_axis(ax, axis=axis, copy=False) + result = self.copy(deep=False) + result = result.set_axis(ax, axis=axis) return result.__finalize__(self, method="tz_convert") @final @@ -11529,15 +10594,15 @@ def tz_localize( tz, axis: Axis = 0, level=None, - copy: bool_t | None = None, + copy: bool | lib.NoDefault = lib.no_default, ambiguous: TimeAmbiguous = "raise", nonexistent: TimeNonexistent = "raise", ) -> Self: """ - Localize tz-naive index of a Series or DataFrame to target time zone. 
+ Localize time zone naive index of a Series or DataFrame to target time zone. This operation localizes the Index. To localize the values in a - timezone-naive Series, use :meth:`Series.dt.tz_localize`. + time zone naive Series, use :meth:`Series.dt.tz_localize`. Parameters ---------- @@ -11549,7 +10614,7 @@ def tz_localize( level : int, str, default None If axis is a MultiIndex, localize a specific level. Otherwise must be None. - copy : bool, default True + copy : bool, default False Also make a copy of the underlying data. .. note:: @@ -11563,7 +10628,9 @@ You can already get the future behavior and improvements through enabling copy on write ``pd.options.mode.copy_on_write = True`` - ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' + + .. deprecated:: 3.0.0 + ambiguous : 'infer', bool, bool-ndarray, 'NaT', default 'raise' When clocks moved backward due to DST, ambiguous times may arise. For example in Central European Time (UTC+01), when going from 03:00 DST to 02:00 non-DST, 02:30:00 local time occurs both at @@ -11573,11 +10640,11 @@ def tz_localize( - 'infer' will attempt to infer fall dst-transition hours based on order - - bool-ndarray where True signifies a DST time, False designates + - bool (or bool-ndarray) where True signifies a DST time, False designates a non-DST time (note that this flag is only applicable for ambiguous times) - 'NaT' will return NaT where there are ambiguous times - - 'raise' will raise an AmbiguousTimeError if there are ambiguous + - 'raise' will raise a ValueError if there are ambiguous times. nonexistent : str, default 'raise' A nonexistent time does not exist in a particular timezone @@ -11589,35 +10656,40 @@ def tz_localize( closest existing time - 'NaT' will return NaT where there are nonexistent times - timedelta objects will shift nonexistent times by the timedelta - - 'raise' will raise an NonExistentTimeError if there are + - 'raise' will raise a ValueError if there are nonexistent times. Returns ------- {klass} - Same type as the input. + Same type as the input, with time zone naive or aware index, depending on + ``tz``. Raises ------ TypeError If the TimeSeries is tz-aware and tz is not None. + See Also + -------- + Series.dt.tz_localize: Localize the values in a time zone naive Series. + Timestamp.tz_localize: Localize the Timestamp to a timezone. + Examples -------- Localize local times: >>> s = pd.Series( ... [1], - ... index=pd.DatetimeIndex(['2018-09-15 01:30:00']), + ... index=pd.DatetimeIndex(["2018-09-15 01:30:00"]), ... ) - >>> s.tz_localize('CET') + >>> s.tz_localize("CET") 2018-09-15 01:30:00+02:00 1 dtype: int64 Pass None to convert to tz-naive index and preserve local time: - >>> s = pd.Series([1], - ... index=pd.DatetimeIndex(['2018-09-15 01:30:00+02:00'])) + >>> s = pd.Series([1], index=pd.DatetimeIndex(["2018-09-15 01:30:00+02:00"])) >>> s.tz_localize(None) 2018-09-15 01:30:00 1 dtype: int64 @@ -11625,15 +10697,21 @@ def tz_localize( Be careful with DST changes. When there is sequential data, pandas can infer the DST time: - >>> s = pd.Series(range(7), - ... index=pd.DatetimeIndex(['2018-10-28 01:30:00', - ... '2018-10-28 02:00:00', - ... '2018-10-28 02:30:00', - ... '2018-10-28 02:00:00', - ... '2018-10-28 02:30:00', - ... '2018-10-28 03:00:00', - ... '2018-10-28 03:30:00'])) - >>> s.tz_localize('CET', ambiguous='infer') + >>> s = pd.Series( + ... range(7), + ... index=pd.DatetimeIndex( + ... [ + ... "2018-10-28 01:30:00", + ... "2018-10-28 02:00:00", + ... "2018-10-28 02:30:00", + ... 
"2018-10-28 02:00:00", + ... "2018-10-28 02:30:00", + ... "2018-10-28 03:00:00", + ... "2018-10-28 03:30:00", + ... ] + ... ), + ... ) + >>> s.tz_localize("CET", ambiguous="infer") 2018-10-28 01:30:00+02:00 0 2018-10-28 02:00:00+02:00 1 2018-10-28 02:30:00+02:00 2 @@ -11646,11 +10724,17 @@ def tz_localize( In some cases, inferring the DST is impossible. In such cases, you can pass an ndarray to the ambiguous parameter to set the DST explicitly - >>> s = pd.Series(range(3), - ... index=pd.DatetimeIndex(['2018-10-28 01:20:00', - ... '2018-10-28 02:36:00', - ... '2018-10-28 03:46:00'])) - >>> s.tz_localize('CET', ambiguous=np.array([True, True, False])) + >>> s = pd.Series( + ... range(3), + ... index=pd.DatetimeIndex( + ... [ + ... "2018-10-28 01:20:00", + ... "2018-10-28 02:36:00", + ... "2018-10-28 03:46:00", + ... ] + ... ), + ... ) + >>> s.tz_localize("CET", ambiguous=np.array([True, True, False])) 2018-10-28 01:20:00+02:00 0 2018-10-28 02:36:00+02:00 1 2018-10-28 03:46:00+01:00 2 @@ -11660,22 +10744,24 @@ def tz_localize( dates forward or backward with a timedelta object or `'shift_forward'` or `'shift_backward'`. - >>> s = pd.Series(range(2), - ... index=pd.DatetimeIndex(['2015-03-29 02:30:00', - ... '2015-03-29 03:30:00'])) - >>> s.tz_localize('Europe/Warsaw', nonexistent='shift_forward') + >>> dti = pd.DatetimeIndex( + ... ["2015-03-29 02:30:00", "2015-03-29 03:30:00"], dtype="M8[ns]" + ... ) + >>> s = pd.Series(range(2), index=dti) + >>> s.tz_localize("Europe/Warsaw", nonexistent="shift_forward") 2015-03-29 03:00:00+02:00 0 2015-03-29 03:30:00+02:00 1 dtype: int64 - >>> s.tz_localize('Europe/Warsaw', nonexistent='shift_backward') + >>> s.tz_localize("Europe/Warsaw", nonexistent="shift_backward") 2015-03-29 01:59:59.999999999+01:00 0 2015-03-29 03:30:00+02:00 1 dtype: int64 - >>> s.tz_localize('Europe/Warsaw', nonexistent=pd.Timedelta('1h')) + >>> s.tz_localize("Europe/Warsaw", nonexistent=pd.Timedelta("1h")) 2015-03-29 03:30:00+02:00 0 2015-03-29 03:30:00+02:00 1 dtype: int64 """ + self._check_copy_deprecation(copy) nonexistent_options = ("raise", "NaT", "shift_forward", "shift_backward") if nonexistent not in nonexistent_options and not isinstance( nonexistent, dt.timedelta @@ -11712,8 +10798,8 @@ def _tz_localize(ax, tz, ambiguous, nonexistent): raise ValueError(f"The level {level} is not valid") ax = _tz_localize(ax, tz, ambiguous, nonexistent) - result = self.copy(deep=copy and not using_copy_on_write()) - result = result.set_axis(ax, axis=axis, copy=False) + result = self.copy(deep=False) + result = result.set_axis(ax, axis=axis) return result.__finalize__(self, method="tz_localize") # ---------------------------------------------------------------------- @@ -11742,9 +10828,8 @@ def describe( ---------- percentiles : list-like of numbers, optional The percentiles to include in the output. All should - fall between 0 and 1. The default is - ``[.25, .5, .75]``, which returns the 25th, 50th, and - 75th percentiles. + fall between 0 and 1. The default, ``None``, will automatically + return the 25th, 50th, and 75th percentiles. include : 'all', list-like of dtypes or None (default), optional A white list of data types to include in the result. Ignored for ``Series``. Here are the options: @@ -11794,17 +10879,17 @@ def describe( upper percentile is ``75``. The ``50`` percentile is the same as the median. - For object data (e.g. strings or timestamps), the result's index + For object data (e.g. strings), the result's index will include ``count``, ``unique``, ``top``, and ``freq``. 
The ``top`` is the most common value. The ``freq`` is the most common value's - frequency. Timestamps also include the ``first`` and ``last`` items. + frequency. If multiple object values have the highest count, then the ``count`` and ``top`` results will be arbitrarily chosen from among those with the highest count. For mixed data types provided via a ``DataFrame``, the default is to - return only an analysis of numeric columns. If the dataframe consists + return only an analysis of numeric columns. If the DataFrame consists only of object and categorical data without any numeric columns, the default is to return an analysis of both the object and categorical columns. If ``include='all'`` is provided as an option, the result @@ -11832,7 +10917,7 @@ def describe( Describing a categorical ``Series``. - >>> s = pd.Series(['a', 'a', 'b', 'c']) + >>> s = pd.Series(["a", "a", "b", "c"]) >>> s.describe() count 4 unique 3 @@ -11842,11 +10927,13 @@ def describe( Describing a timestamp ``Series``. - >>> s = pd.Series([ - ... np.datetime64("2000-01-01"), - ... np.datetime64("2010-01-01"), - ... np.datetime64("2010-01-01") - ... ]) + >>> s = pd.Series( + ... [ + ... np.datetime64("2000-01-01"), + ... np.datetime64("2010-01-01"), + ... np.datetime64("2010-01-01"), + ... ] + ... ) >>> s.describe() count 3 mean 2006-09-01 08:00:00 @@ -11860,10 +10947,13 @@ def describe( Describing a ``DataFrame``. By default only numeric fields are returned. - >>> df = pd.DataFrame({'categorical': pd.Categorical(['d', 'e', 'f']), - ... 'numeric': [1, 2, 3], - ... 'object': ['a', 'b', 'c'] - ... }) + >>> df = pd.DataFrame( + ... { + ... "categorical": pd.Categorical(["d", "e", "f"]), + ... "numeric": [1, 2, 3], + ... "object": ["a", "b", "c"], + ... } + ... ) >>> df.describe() numeric count 3.0 @@ -11877,7 +10967,7 @@ def describe( Describing all columns of a ``DataFrame`` regardless of data type. - >>> df.describe(include='all') # doctest: +SKIP + >>> df.describe(include="all") # doctest: +SKIP categorical numeric object count 3 3.0 3 unique 3 NaN 3 @@ -11929,7 +11019,7 @@ def describe( Including only categorical columns from a ``DataFrame`` description. - >>> df.describe(include=['category']) + >>> df.describe(include=["category"]) categorical count 3 unique 3 @@ -11972,8 +11062,7 @@ def describe( def pct_change( self, periods: int = 1, - fill_method: FillnaOptions | None | lib.NoDefault = lib.no_default, - limit: int | None | lib.NoDefault = lib.no_default, + fill_method: None = None, freq=None, **kwargs, ) -> Self: @@ -11995,17 +11084,12 @@ def pct_change( ---------- periods : int, default 1 Periods to shift for forming percent change. - fill_method : {'backfill', 'bfill', 'pad', 'ffill', None}, default 'pad' - How to handle NAs **before** computing percent changes. + fill_method : None + Must be None. This argument will be removed in a future version of pandas. .. deprecated:: 2.1 All options of `fill_method` are deprecated except `fill_method=None`. - limit : int, default None - The number of consecutive NAs to fill before stopping. - - .. deprecated:: 2.1 - freq : DateOffset, timedelta, or str, optional Increment to use from time series API (e.g. 'ME' or BDay()). **kwargs @@ -12070,11 +11154,14 @@ def pct_change( Percentage change in French franc, Deutsche Mark, and Italian lira from 1980-01-01 to 1980-03-01. - >>> df = pd.DataFrame({ - ... 'FR': [4.0405, 4.0963, 4.3149], - ... 'GR': [1.7246, 1.7482, 1.8519], - ... 'IT': [804.74, 810.01, 860.13]}, - ... 
index=['1980-01-01', '1980-02-01', '1980-03-01']) + >>> df = pd.DataFrame( + ... { + ... "FR": [4.0405, 4.0963, 4.3149], + ... "GR": [1.7246, 1.7482, 1.8519], + ... "IT": [804.74, 810.01, 860.13], + ... }, + ... index=["1980-01-01", "1980-02-01", "1980-03-01"], + ... ) >>> df FR GR IT 1980-01-01 4.0405 1.7246 804.74 @@ -12090,67 +11177,37 @@ def pct_change( Percentage of change in GOOG and APPL stock volume. Shows computing the percentage change between columns. - >>> df = pd.DataFrame({ - ... '2016': [1769950, 30586265], - ... '2015': [1500923, 40912316], - ... '2014': [1371819, 41403351]}, - ... index=['GOOG', 'APPL']) + >>> df = pd.DataFrame( + ... { + ... "2016": [1769950, 30586265], + ... "2015": [1500923, 40912316], + ... "2014": [1371819, 41403351], + ... }, + ... index=["GOOG", "APPL"], + ... ) >>> df 2016 2015 2014 GOOG 1769950 1500923 1371819 APPL 30586265 40912316 41403351 - >>> df.pct_change(axis='columns', periods=-1) + >>> df.pct_change(axis="columns", periods=-1) 2016 2015 2014 GOOG 0.179241 0.094112 NaN APPL -0.252395 -0.011860 NaN """ # GH#53491 - if fill_method not in (lib.no_default, None) or limit is not lib.no_default: - warnings.warn( - "The 'fill_method' keyword being not None and the 'limit' keyword in " - f"{type(self).__name__}.pct_change are deprecated and will be removed " - "in a future version. Either fill in any non-leading NA values prior " - "to calling pct_change or specify 'fill_method=None' to not fill NA " - "values.", - FutureWarning, - stacklevel=find_stack_level(), - ) - if fill_method is lib.no_default: - if limit is lib.no_default: - cols = self.items() if self.ndim == 2 else [(None, self)] - for _, col in cols: - mask = col.isna().values - mask = mask[np.argmax(~mask) :] - if mask.any(): - warnings.warn( - "The default fill_method='pad' in " - f"{type(self).__name__}.pct_change is deprecated and will " - "be removed in a future version. 
Either fill in any " - "non-leading NA values prior to calling pct_change or " - "specify 'fill_method=None' to not fill NA values.", - FutureWarning, - stacklevel=find_stack_level(), - ) - break - fill_method = "pad" - if limit is lib.no_default: - limit = None + if fill_method is not None: + raise ValueError(f"fill_method must be None; got {fill_method=}.") axis = self._get_axis_number(kwargs.pop("axis", "index")) - if fill_method is None: - data = self - else: - data = self._pad_or_backfill(fill_method, axis=axis, limit=limit) - - shifted = data.shift(periods=periods, freq=freq, axis=axis, **kwargs) + shifted = self.shift(periods=periods, freq=freq, axis=axis, **kwargs) # Unsupported left operand type for / ("Self") - rs = data / shifted - 1 # type: ignore[operator] + rs = self / shifted - 1 # type: ignore[operator] if freq is not None: # Shift method is implemented differently when freq is not None # We want to restore the original index rs = rs.loc[~rs.index.duplicated()] - rs = rs.reindex_like(data) + rs = rs.reindex_like(self) return rs.__finalize__(self, method="pct_change") @final @@ -12159,10 +11216,10 @@ def _logical_func( name: str, func, axis: Axis | None = 0, - bool_only: bool_t = False, - skipna: bool_t = True, + bool_only: bool = False, + skipna: bool = True, **kwargs, - ) -> Series | bool_t: + ) -> Series | bool: nv.validate_logical_func((), kwargs, fname=name) validate_bool_kwarg(skipna, "skipna", none_allowed=False) @@ -12181,9 +11238,9 @@ def _logical_func( if ( self.ndim > 1 and axis == 1 - and len(self._mgr.arrays) > 1 + and len(self._mgr.blocks) > 1 # TODO(EA2D): special-case not needed - and all(x.ndim == 2 for x in self._mgr.arrays) + and all(block.values.ndim == 2 for block in self._mgr.blocks) and not kwargs ): # Fastpath avoiding potentially expensive transpose @@ -12203,22 +11260,24 @@ def _logical_func( def any( self, + *, axis: Axis | None = 0, - bool_only: bool_t = False, - skipna: bool_t = True, + bool_only: bool = False, + skipna: bool = True, **kwargs, - ) -> Series | bool_t: + ) -> Series | bool: return self._logical_func( "any", nanops.nanany, axis, bool_only, skipna, **kwargs ) def all( self, + *, axis: Axis = 0, - bool_only: bool_t = False, - skipna: bool_t = True, + bool_only: bool = False, + skipna: bool = True, **kwargs, - ) -> Series | bool_t: + ) -> Series | bool: return self._logical_func( "all", nanops.nanall, axis, bool_only, skipna, **kwargs ) @@ -12229,7 +11288,7 @@ def _accum_func( name: str, func, axis: Axis | None = None, - skipna: bool_t = True, + skipna: bool = True, *args, **kwargs, ): @@ -12241,7 +11300,12 @@ def _accum_func( if axis == 1: return self.T._accum_func( - name, func, axis=0, skipna=skipna, *args, **kwargs # noqa: B026 + name, + func, + axis=0, + skipna=skipna, + *args, # noqa: B026 + **kwargs, ).T def block_accum_func(blk_values): @@ -12262,20 +11326,20 @@ def block_accum_func(blk_values): self, method=name ) - def cummax(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs): + def cummax(self, axis: Axis = 0, skipna: bool = True, *args, **kwargs) -> Self: return self._accum_func( "cummax", np.maximum.accumulate, axis, skipna, *args, **kwargs ) - def cummin(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs): + def cummin(self, axis: Axis = 0, skipna: bool = True, *args, **kwargs) -> Self: return self._accum_func( "cummin", np.minimum.accumulate, axis, skipna, *args, **kwargs ) - def cumsum(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs): + def cumsum(self, 
axis: Axis = 0, skipna: bool = True, *args, **kwargs) -> Self: return self._accum_func("cumsum", np.cumsum, axis, skipna, *args, **kwargs) - def cumprod(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs): + def cumprod(self, axis: Axis = 0, skipna: bool = True, *args, **kwargs) -> Self: return self._accum_func("cumprod", np.cumprod, axis, skipna, *args, **kwargs) @final @@ -12283,39 +11347,26 @@ def _stat_function_ddof( self, name: str, func, - axis: Axis | None | lib.NoDefault = lib.no_default, - skipna: bool_t = True, + axis: Axis | None = 0, + skipna: bool = True, ddof: int = 1, - numeric_only: bool_t = False, + numeric_only: bool = False, **kwargs, ) -> Series | float: nv.validate_stat_ddof_func((), kwargs, fname=name) validate_bool_kwarg(skipna, "skipna", none_allowed=False) - if axis is None: - if self.ndim > 1: - warnings.warn( - f"The behavior of {type(self).__name__}.{name} with axis=None " - "is deprecated, in a future version this will reduce over both " - "axes and return a scalar. To retain the old behavior, pass " - "axis=0 (or do not pass axis)", - FutureWarning, - stacklevel=find_stack_level(), - ) - axis = 0 - elif axis is lib.no_default: - axis = 0 - return self._reduce( func, name, axis=axis, numeric_only=numeric_only, skipna=skipna, ddof=ddof ) def sem( self, + *, axis: Axis | None = 0, - skipna: bool_t = True, + skipna: bool = True, ddof: int = 1, - numeric_only: bool_t = False, + numeric_only: bool = False, **kwargs, ) -> Series | float: return self._stat_function_ddof( @@ -12324,10 +11375,11 @@ def sem( def var( self, + *, axis: Axis | None = 0, - skipna: bool_t = True, + skipna: bool = True, ddof: int = 1, - numeric_only: bool_t = False, + numeric_only: bool = False, **kwargs, ) -> Series | float: return self._stat_function_ddof( @@ -12336,10 +11388,11 @@ def var( def std( self, + *, axis: Axis | None = 0, - skipna: bool_t = True, + skipna: bool = True, ddof: int = 1, - numeric_only: bool_t = False, + numeric_only: bool = False, **kwargs, ) -> Series | float: return self._stat_function_ddof( @@ -12352,8 +11405,8 @@ def _stat_function( name: str, func, axis: Axis | None = 0, - skipna: bool_t = True, - numeric_only: bool_t = False, + skipna: bool = True, + numeric_only: bool = False, **kwargs, ): assert name in ["median", "mean", "min", "max", "kurt", "skew"], name @@ -12367,9 +11420,10 @@ def _stat_function( def min( self, + *, axis: Axis | None = 0, - skipna: bool_t = True, - numeric_only: bool_t = False, + skipna: bool = True, + numeric_only: bool = False, **kwargs, ): return self._stat_function( @@ -12383,9 +11437,10 @@ def min( def max( self, + *, axis: Axis | None = 0, - skipna: bool_t = True, - numeric_only: bool_t = False, + skipna: bool = True, + numeric_only: bool = False, **kwargs, ): return self._stat_function( @@ -12399,9 +11454,10 @@ def max( def mean( self, + *, axis: Axis | None = 0, - skipna: bool_t = True, - numeric_only: bool_t = False, + skipna: bool = True, + numeric_only: bool = False, **kwargs, ) -> Series | float: return self._stat_function( @@ -12410,9 +11466,10 @@ def mean( def median( self, + *, axis: Axis | None = 0, - skipna: bool_t = True, - numeric_only: bool_t = False, + skipna: bool = True, + numeric_only: bool = False, **kwargs, ) -> Series | float: return self._stat_function( @@ -12421,9 +11478,10 @@ def median( def skew( self, + *, axis: Axis | None = 0, - skipna: bool_t = True, - numeric_only: bool_t = False, + skipna: bool = True, + numeric_only: bool = False, **kwargs, ) -> Series | float: return 
self._stat_function( @@ -12432,9 +11490,10 @@ def skew( def kurt( self, + *, axis: Axis | None = 0, - skipna: bool_t = True, - numeric_only: bool_t = False, + skipna: bool = True, + numeric_only: bool = False, **kwargs, ) -> Series | float: return self._stat_function( @@ -12448,9 +11507,9 @@ def _min_count_stat_function( self, name: str, func, - axis: Axis | None | lib.NoDefault = lib.no_default, - skipna: bool_t = True, - numeric_only: bool_t = False, + axis: Axis | None = 0, + skipna: bool = True, + numeric_only: bool = False, min_count: int = 0, **kwargs, ): @@ -12459,20 +11518,6 @@ def _min_count_stat_function( validate_bool_kwarg(skipna, "skipna", none_allowed=False) - if axis is None: - if self.ndim > 1: - warnings.warn( - f"The behavior of {type(self).__name__}.{name} with axis=None " - "is deprecated, in a future version this will reduce over both " - "axes and return a scalar. To retain the old behavior, pass " - "axis=0 (or do not pass axis)", - FutureWarning, - stacklevel=find_stack_level(), - ) - axis = 0 - elif axis is lib.no_default: - axis = 0 - return self._reduce( func, name=name, @@ -12484,9 +11529,10 @@ def _min_count_stat_function( def sum( self, + *, axis: Axis | None = 0, - skipna: bool_t = True, - numeric_only: bool_t = False, + skipna: bool = True, + numeric_only: bool = False, min_count: int = 0, **kwargs, ): @@ -12496,9 +11542,10 @@ def sum( def prod( self, + *, axis: Axis | None = 0, - skipna: bool_t = True, - numeric_only: bool_t = False, + skipna: bool = True, + numeric_only: bool = False, min_count: int = 0, **kwargs, ): @@ -12520,36 +11567,13 @@ def rolling( self, window: int | dt.timedelta | str | BaseOffset | BaseIndexer, min_periods: int | None = None, - center: bool_t = False, + center: bool = False, win_type: str | None = None, on: str | None = None, - axis: Axis | lib.NoDefault = lib.no_default, closed: IntervalClosedType | None = None, step: int | None = None, method: str = "single", ) -> Window | Rolling: - if axis is not lib.no_default: - axis = self._get_axis_number(axis) - name = "rolling" - if axis == 1: - warnings.warn( - f"Support for axis=1 in {type(self).__name__}.{name} is " - "deprecated and will be removed in a future version. " - f"Use obj.T.{name}(...) instead", - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - warnings.warn( - f"The 'axis' keyword in {type(self).__name__}.{name} is " - "deprecated and will be removed in a future version. " - "Call the method without the axis keyword instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - axis = 0 - if win_type is not None: return Window( self, @@ -12558,7 +11582,6 @@ def rolling( center=center, win_type=win_type, on=on, - axis=axis, closed=closed, step=step, method=method, @@ -12571,7 +11594,6 @@ def rolling( center=center, win_type=win_type, on=on, - axis=axis, closed=closed, step=step, method=method, @@ -12582,31 +11604,9 @@ def rolling( def expanding( self, min_periods: int = 1, - axis: Axis | lib.NoDefault = lib.no_default, method: Literal["single", "table"] = "single", ) -> Expanding: - if axis is not lib.no_default: - axis = self._get_axis_number(axis) - name = "expanding" - if axis == 1: - warnings.warn( - f"Support for axis=1 in {type(self).__name__}.{name} is " - "deprecated and will be removed in a future version. " - f"Use obj.T.{name}(...) instead", - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - warnings.warn( - f"The 'axis' keyword in {type(self).__name__}.{name} is " - "deprecated and will be removed in a future version. 
" - "Call the method without the axis keyword instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - axis = 0 - return Expanding(self, min_periods=min_periods, axis=axis, method=method) + return Expanding(self, min_periods=min_periods, method=method) @final @doc(ExponentialMovingWindow) @@ -12617,34 +11617,11 @@ def ewm( halflife: float | TimedeltaConvertibleTypes | None = None, alpha: float | None = None, min_periods: int | None = 0, - adjust: bool_t = True, - ignore_na: bool_t = False, - axis: Axis | lib.NoDefault = lib.no_default, + adjust: bool = True, + ignore_na: bool = False, times: np.ndarray | DataFrame | Series | None = None, method: Literal["single", "table"] = "single", ) -> ExponentialMovingWindow: - if axis is not lib.no_default: - axis = self._get_axis_number(axis) - name = "ewm" - if axis == 1: - warnings.warn( - f"Support for axis=1 in {type(self).__name__}.{name} is " - "deprecated and will be removed in a future version. " - f"Use obj.T.{name}(...) instead", - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - warnings.warn( - f"The 'axis' keyword in {type(self).__name__}.{name} is " - "deprecated and will be removed in a future version. " - "Call the method without the axis keyword instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - axis = 0 - return ExponentialMovingWindow( self, com=com, @@ -12654,7 +11631,6 @@ def ewm( min_periods=min_periods, adjust=adjust, ignore_na=ignore_na, - axis=axis, times=times, method=method, ) @@ -12667,37 +11643,11 @@ def _inplace_method(self, other, op) -> Self: """ Wrap arithmetic method to operate inplace. """ - warn = True - if not PYPY and warn_copy_on_write(): - if sys.getrefcount(self) <= REF_COUNT + 2: - # we are probably in an inplace setitem context (e.g. df['a'] += 1) - warn = False - result = op(self, other) - if ( - self.ndim == 1 - and result._indexed_same(self) - and result.dtype == self.dtype - and not using_copy_on_write() - and not (warn_copy_on_write() and not warn) - ): - # GH#36498 this inplace op can _actually_ be inplace. - # Item "ArrayManager" of "Union[ArrayManager, SingleArrayManager, - # BlockManager, SingleBlockManager]" has no attribute "setitem_inplace" - self._mgr.setitem_inplace( # type: ignore[union-attr] - slice(None), result._values, warn=warn - ) - return self - - # Delete cacher - self._reset_cacher() - # this makes sure that we are aligned like the input - # we are updating inplace so we want to ignore is_copy - self._update_inplace( - result.reindex_like(self, copy=False), verify_is_copy=False - ) + # we are updating inplace + self._update_inplace(result.reindex_like(self)) return self @final @@ -12719,14 +11669,16 @@ def __imul__(self, other) -> Self: def __itruediv__(self, other) -> Self: # error: Unsupported left operand type for / ("Type[NDFrame]") return self._inplace_method( - other, type(self).__truediv__ # type: ignore[operator] + other, + type(self).__truediv__, # type: ignore[operator] ) @final def __ifloordiv__(self, other) -> Self: # error: Unsupported left operand type for // ("Type[NDFrame]") return self._inplace_method( - other, type(self).__floordiv__ # type: ignore[operator] + other, + type(self).__floordiv__, # type: ignore[operator] ) @final @@ -12757,7 +11709,7 @@ def __ixor__(self, other) -> Self: # Misc methods @final - def _find_valid_index(self, *, how: str) -> Hashable | None: + def _find_valid_index(self, *, how: str) -> Hashable: """ Retrieves the index of the first valid value. 
@@ -12778,13 +11730,25 @@ def _find_valid_index(self, *, how: str) -> Hashable | None: @final @doc(position="first", klass=_shared_doc_kwargs["klass"]) - def first_valid_index(self) -> Hashable | None: + def first_valid_index(self) -> Hashable: """ - Return index for {position} non-NA value or None, if no non-NA value is found. + Return index for {position} non-missing value or None, if no value is found. + + See the :ref:`User Guide <missing_data>` for more information + on which values are considered missing. Returns ------- type of index + Index of {position} non-missing value. + + See Also + -------- + DataFrame.last_valid_index : Return index for last non-NA value or None, if + no non-NA value is found. + Series.last_valid_index : Return index for last non-NA value or None, if no + non-NA value is found. + DataFrame.isna : Detect missing values. Examples -------- @@ -12814,7 +11778,7 @@ def first_valid_index(self) -> Hashable | None: For DataFrame: - >>> df = pd.DataFrame({{'A': [None, None, 2], 'B': [None, 3, 4]}}) + >>> df = pd.DataFrame({{"A": [None, None, 2], "B": [None, 3, 4]}}) >>> df A B 0 NaN NaN @@ -12825,7 +11789,7 @@ def first_valid_index(self) -> Hashable | None: >>> df.last_valid_index() 2 - >>> df = pd.DataFrame({{'A': [None, None, None], 'B': [None, None, None]}}) + >>> df = pd.DataFrame({{"A": [None, None, None], "B": [None, None, None]}}) >>> df A B 0 None None @@ -12854,7 +11818,7 @@ def first_valid_index(self) -> Hashable | None: @final @doc(first_valid_index, position="last", klass=_shared_doc_kwargs["klass"]) - def last_valid_index(self) -> Hashable | None: + def last_valid_index(self) -> Hashable: return self._find_valid_index(how="last") @@ -12875,7 +11839,7 @@ def last_valid_index(self) -> Hashable | None: skipna : bool, default True Exclude NA/null values when computing the result. numeric_only : bool, default False - Include only float, int, boolean columns. Not implemented for Series. + Include only float, int, boolean columns. {min_count}\ **kwargs @@ -12884,6 +11848,8 @@ def last_valid_index(self) -> Hashable | None: Returns ------- {name1} or scalar\ + + Value containing the calculation referenced in the description.\ {see_also}\ {examples} """ @@ -12917,6 +11883,8 @@ def last_valid_index(self) -> Hashable | None: Returns ------- {name1} or scalar\ + + Value containing the calculation referenced in the description.\ {see_also}\ {examples} """ @@ -12943,14 +11911,44 @@ def last_valid_index(self) -> Hashable | None: where N represents the number of elements. numeric_only : bool, default False Include only float, int, boolean columns. Not implemented for Series. +**kwargs : + Additional keywords have no effect but might be accepted + for compatibility with NumPy. Returns ------- -{name1} or {name2} (if level specified) \ +{name1} or {name2} (if level specified) + {return_desc} + +See Also +-------- +{see_also}\ {notes}\ {examples} """ +_sem_see_also = """\ +scipy.stats.sem : Compute standard error of the mean. +{name2}.std : Return sample standard deviation over requested axis. +{name2}.var : Return unbiased variance over requested axis. +{name2}.mean : Return the mean of the values over the requested axis. +{name2}.median : Return the median of the values over the requested axis. +{name2}.mode : Return the mode(s) of the Series.""" + +_sem_return_desc = """\ +Unbiased standard error of the mean over requested axis.""" + +_std_see_also = """\ +numpy.std : Compute the standard deviation along the specified axis. +{name2}.var : Return unbiased variance over requested axis. 
+{name2}.sem : Return unbiased standard error of the mean over requested axis. +{name2}.mean : Return the mean of the values over the requested axis. +{name2}.median : Return the median of the values over the requested axis. +{name2}.mode : Return the mode(s) of the Series.""" + +_std_return_desc = """\ +Standard deviation over requested axis.""" + _std_notes = """ Notes @@ -13044,9 +12042,9 @@ def last_valid_index(self) -> Hashable | None: Returns ------- -{name1} or {name2} - If level is specified, then, {name2} is returned; otherwise, {name1} - is returned. +{name2} or {name1} + If axis=None, then a scalar boolean is returned. + Otherwise a Series is returned with index matching the index argument. {see_also} {examples}""" @@ -13076,7 +12074,7 @@ def last_valid_index(self) -> Hashable | None: **DataFrames** -Create a dataframe from a dictionary. +Create a DataFrame from a dictionary. >>> df = pd.DataFrame({'col1': [True, True], 'col2': [True, False]}) >>> df @@ -13111,7 +12109,45 @@ def last_valid_index(self) -> Hashable | None: DataFrame.any : Return True if one (or more) elements are True. """ -_cnum_doc = """ +_cnum_pd_doc = """ +Return cumulative {desc} over a DataFrame or Series axis. + +Returns a DataFrame or Series of the same size containing the cumulative +{desc}. + +Parameters +---------- +axis : {{0 or 'index', 1 or 'columns'}}, default 0 + The index or the name of the axis. 0 is equivalent to None or 'index'. + For `Series` this parameter is unused and defaults to 0. +skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. +numeric_only : bool, default False + Include only float, int, boolean columns. +*args, **kwargs + Additional keywords have no effect but might be accepted for + compatibility with NumPy. + +Returns +------- +{name1} or {name2} + Return cumulative {desc} of {name1} or {name2}. + +See Also +-------- +core.window.expanding.Expanding.{accum_func_name} : Similar functionality + but ignores ``NaN`` values. +{name2}.{accum_func_name} : Return the {desc} over + {name2} axis. +{name2}.cummax : Return cumulative maximum over {name2} axis. +{name2}.cummin : Return cumulative minimum over {name2} axis. +{name2}.cumsum : Return cumulative sum over {name2} axis. +{name2}.cumprod : Return cumulative product over {name2} axis. + +{examples}""" + +_cnum_series_doc = """ Return cumulative {desc} over a DataFrame or Series axis. Returns a DataFrame or Series of the same size containing the cumulative @@ -13494,9 +12530,7 @@ def last_valid_index(self) -> Hashable | None: Series([], dtype: bool) """ -_shared_docs[ - "stat_func_example" -] = """ +_shared_docs["stat_func_example"] = """ Examples -------- @@ -13550,6 +12584,14 @@ def last_valid_index(self) -> Hashable | None: stat_func="min", verb="Min", default_output=0, level_output_0=2, level_output_1=0 ) +_skew_see_also = """ + +See Also +-------- +Series.skew : Return unbiased skew over requested axis. +Series.var : Return unbiased variance over requested axis. +Series.std : Return unbiased standard deviation over requested axis.""" + _stat_func_see_also = """ See Also @@ -13661,7 +12703,7 @@ def make_doc(name: str, ndim: int) -> str: elif name == "median": base_doc = _num_doc desc = "Return the median of the values over the requested axis." 
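        # Hedged illustration of the behavior this generated docstring describes
        # (example values assumed, not part of the patch):
        # >>> pd.Series([1, 2, 3]).median()
        # 2.0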
- see_also = "" + see_also = _stat_func_see_also examples = """ Examples @@ -13702,7 +12744,7 @@ def make_doc(name: str, ndim: int) -> str: elif name == "mean": base_doc = _num_doc desc = "Return the mean of the values over the requested axis." - see_also = "" + see_also = _stat_func_see_also examples = """ Examples @@ -13758,8 +12800,8 @@ def make_doc(name: str, ndim: int) -> str: "ddof argument." ) examples = _std_examples - see_also = "" - kwargs = {"notes": _std_notes} + see_also = _std_see_also.format(name2=name2) + kwargs = {"notes": "", "return_desc": _std_return_desc} elif name == "sem": base_doc = _num_ddof_doc @@ -13803,13 +12845,13 @@ def make_doc(name: str, ndim: int) -> str: >>> df.sem(numeric_only=True) a 0.5 dtype: float64""" - see_also = "" - kwargs = {"notes": ""} + see_also = _sem_see_also.format(name2=name2) + kwargs = {"notes": "", "return_desc": _sem_return_desc} elif name == "skew": base_doc = _num_doc desc = "Return unbiased skew over requested axis.\n\nNormalized by N-1." - see_also = "" + see_also = _skew_see_also examples = """ Examples @@ -13850,6 +12892,7 @@ def make_doc(name: str, ndim: int) -> str: a 0.0 dtype: float64""" kwargs = {"min_count": ""} + elif name == "kurt": base_doc = _num_doc desc = ( @@ -13904,28 +12947,44 @@ def make_doc(name: str, ndim: int) -> str: kwargs = {"min_count": ""} elif name == "cumsum": - base_doc = _cnum_doc + if ndim == 1: + base_doc = _cnum_series_doc + else: + base_doc = _cnum_pd_doc + desc = "sum" see_also = "" examples = _cumsum_examples kwargs = {"accum_func_name": "sum"} elif name == "cumprod": - base_doc = _cnum_doc + if ndim == 1: + base_doc = _cnum_series_doc + else: + base_doc = _cnum_pd_doc + desc = "product" see_also = "" examples = _cumprod_examples kwargs = {"accum_func_name": "prod"} elif name == "cummin": - base_doc = _cnum_doc + if ndim == 1: + base_doc = _cnum_series_doc + else: + base_doc = _cnum_pd_doc + desc = "minimum" see_also = "" examples = _cummin_examples kwargs = {"accum_func_name": "min"} elif name == "cummax": - base_doc = _cnum_doc + if ndim == 1: + base_doc = _cnum_series_doc + else: + base_doc = _cnum_pd_doc + desc = "maximum" see_also = "" examples = _cummax_examples diff --git a/pandas/core/groupby/__init__.py b/pandas/core/groupby/__init__.py index 8248f378e2c1a..ec477626a098f 100644 --- a/pandas/core/groupby/__init__.py +++ b/pandas/core/groupby/__init__.py @@ -8,8 +8,8 @@ __all__ = [ "DataFrameGroupBy", - "NamedAgg", - "SeriesGroupBy", "GroupBy", "Grouper", + "NamedAgg", + "SeriesGroupBy", ] diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index a443597347283..7699fb3d0f864 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -1,6 +1,7 @@ """ Provide basic components for groupby. 
""" + from __future__ import annotations import dataclasses @@ -49,6 +50,7 @@ class OutputKey: "sem", "size", "skew", + "kurt", "std", "sum", "var", @@ -70,7 +72,6 @@ class OutputKey: "cumsum", "diff", "ffill", - "fillna", "ngroup", "pct_change", "rank", @@ -91,7 +92,6 @@ class OutputKey: "corr", "cov", "describe", - "dtypes", "expanding", "ewm", "filter", diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py index 6ab98cf4fe55e..90cd8e3ffa1c7 100644 --- a/pandas/core/groupby/categorical.py +++ b/pandas/core/groupby/categorical.py @@ -10,9 +10,7 @@ ) -def recode_for_groupby( - c: Categorical, sort: bool, observed: bool -) -> tuple[Categorical, Categorical | None]: +def recode_for_groupby(c: Categorical, sort: bool, observed: bool) -> Categorical: """ Code the categories to ensure we can groupby for categoricals. @@ -42,17 +40,14 @@ def recode_for_groupby( appearance in codes (unless ordered=True, in which case the original order is preserved), followed by any unrepresented categories in the original order. - Categorical or None - If we are observed, return the original categorical, otherwise None """ # we only care about observed values if observed: # In cases with c.ordered, this is equivalent to # return c.remove_unused_categories(), c - unique_codes = unique1d(c.codes) + take_codes = unique1d(c.codes[c.codes != -1]) # type: ignore[no-untyped-call] - take_codes = unique_codes[unique_codes != -1] if sort: take_codes = np.sort(take_codes) @@ -63,25 +58,26 @@ def recode_for_groupby( # return a new categorical that maps our new codes # and categories dtype = CategoricalDtype(categories, ordered=c.ordered) - return Categorical._simple_new(codes, dtype=dtype), c + return Categorical._simple_new(codes, dtype=dtype) # Already sorted according to c.categories; all is fine if sort: - return c, None + return c # sort=False should order groups in as-encountered order (GH-8868) - # xref GH:46909: Re-ordering codes faster than using (set|add|reorder)_categories - all_codes = np.arange(c.categories.nunique()) + # GH:46909: Re-ordering codes faster than using (set|add|reorder)_categories # GH 38140: exclude nan from indexer for categories - unique_notnan_codes = unique1d(c.codes[c.codes != -1]) + unique_notnan_codes = unique1d(c.codes[c.codes != -1]) # type: ignore[no-untyped-call] if sort: unique_notnan_codes = np.sort(unique_notnan_codes) - if len(all_codes) > len(unique_notnan_codes): + if (num_cat := len(c.categories)) > len(unique_notnan_codes): # GH 13179: All categories need to be present, even if missing from the data - missing_codes = np.setdiff1d(all_codes, unique_notnan_codes, assume_unique=True) + missing_codes = np.setdiff1d( + np.arange(num_cat), unique_notnan_codes, assume_unique=True + ) take_codes = np.concatenate((unique_notnan_codes, missing_codes)) else: take_codes = unique_notnan_codes - return Categorical(c, c.unique().categories.take(take_codes)), None + return Categorical(c, c.categories.take(take_codes)) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index f2e314046fb74..b520ad69aae96 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -5,15 +5,16 @@ These are user facing as the result of the ``df.groupby(...)`` operations, which here returns a DataFrameGroupBy object. 
""" + from __future__ import annotations from collections import abc +from collections.abc import Callable from functools import partial from textwrap import dedent from typing import ( TYPE_CHECKING, Any, - Callable, Literal, NamedTuple, TypeVar, @@ -24,16 +25,14 @@ import numpy as np -from pandas._libs import ( - Interval, - lib, -) +from pandas._libs import Interval from pandas._libs.hashtable import duplicated from pandas.errors import SpecificationError from pandas.util._decorators import ( Appender, Substitution, doc, + set_module, ) from pandas.util._exceptions import find_stack_level @@ -62,20 +61,13 @@ maybe_mangle_lambdas, reconstruct_func, validate_func_kwargs, - warn_alias_replacement, ) import pandas.core.common as com from pandas.core.frame import DataFrame -from pandas.core.groupby import ( - base, - ops, -) +from pandas.core.groupby import base from pandas.core.groupby.groupby import ( GroupBy, GroupByPlot, - _agg_template_frame, - _agg_template_series, - _apply_docs, _transform_template, ) from pandas.core.indexes.api import ( @@ -93,20 +85,16 @@ if TYPE_CHECKING: from collections.abc import ( Hashable, - Mapping, Sequence, ) from pandas._typing import ( ArrayLike, - Axis, - AxisInt, + BlockManager, CorrelationMethod, - FillnaOptions, IndexLabel, Manager, - Manager2D, - SingleManager, + SingleBlockManager, TakeIndexer, ) @@ -121,6 +109,7 @@ ScalarResult = TypeVar("ScalarResult") +@set_module("pandas") class NamedAgg(NamedTuple): """ Helper for column specific aggregation with control over output column names. @@ -135,6 +124,10 @@ class NamedAgg(NamedTuple): Function to apply to the provided column. If string, the name of a built-in pandas function. + See Also + -------- + DataFrame.groupby : Group DataFrame using a mapper or by a Series of columns. + Examples -------- >>> df = pd.DataFrame({"key": [1, 1, 2], "a": [-1, 0, 1], 1: [10, 11, 12]}) @@ -151,6 +144,7 @@ class NamedAgg(NamedTuple): aggfunc: AggScalar +@set_module("pandas.api.typing") class SeriesGroupBy(GroupBy[Series]): def _wrap_agged_manager(self, mgr: Manager) -> Series: out = self.obj._constructor_from_mgr(mgr, axes=mgr.axes) @@ -159,7 +153,7 @@ def _wrap_agged_manager(self, mgr: Manager) -> Series: def _get_data_to_aggregate( self, *, numeric_only: bool = False, name: str | None = None - ) -> SingleManager: + ) -> SingleBlockManager: ser = self._obj_with_exclusions single = ser._mgr if numeric_only and not is_numeric_dtype(ser.dtype): @@ -221,16 +215,251 @@ def _get_data_to_aggregate( """ ) - @Appender( - _apply_docs["template"].format( - input="series", examples=_apply_docs["series_examples"] - ) - ) def apply(self, func, *args, **kwargs) -> Series: + """ + Apply function ``func`` group-wise and combine the results together. + + The function passed to ``apply`` must take a series as its first + argument and return a DataFrame, Series or scalar. ``apply`` will + then take care of combining the results back together into a single + dataframe or series. ``apply`` is therefore a highly flexible + grouping method. + + While ``apply`` is a very flexible method, its downside is that + using it can be quite a bit slower than using more specific methods + like ``agg`` or ``transform``. Pandas offers a wide range of method that will + be much faster than using ``apply`` for their specific purposes, so try to + use them before reaching for ``apply``. + + Parameters + ---------- + func : callable + A callable that takes a series as its first argument, and + returns a dataframe, a series or a scalar. 
In addition the + callable may take positional and keyword arguments. + + *args : tuple + Optional positional arguments to pass to ``func``. + + **kwargs : dict + Optional keyword arguments to pass to ``func``. + + Returns + ------- + Series or DataFrame + A pandas object with the result of applying ``func`` to each group. + + See Also + -------- + pipe : Apply function to the full GroupBy object instead of to each + group. + aggregate : Apply aggregate function to the GroupBy object. + transform : Apply function column-by-column to the GroupBy object. + Series.apply : Apply a function to a Series. + DataFrame.apply : Apply a function to each row or column of a DataFrame. + + Notes + ----- + + .. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the passed ``func``, + see the examples below. + + Functions that mutate the passed object can produce unexpected + behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` + for more details. + + Examples + -------- + >>> s = pd.Series([0, 1, 2], index="a a b".split()) + >>> g1 = s.groupby(s.index, group_keys=False) + >>> g2 = s.groupby(s.index, group_keys=True) + + From ``s`` above we can see that ``g`` has two groups, ``a`` and ``b``. + Notice that ``g1`` and ``g2`` have two groups, ``a`` and ``b``, and only + differ in their ``group_keys`` argument. Calling `apply` in various ways, + we can get different grouping results: + + Example 1: The function passed to `apply` takes a Series as + its argument and returns a Series. `apply` combines the result for + each group together into a new Series. + + .. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the passed ``func``. + + >>> g1.apply(lambda x: x * 2 if x.name == "a" else x / 2) + a 0.0 + a 2.0 + b 1.0 + dtype: float64 + + In the above, the groups are not part of the index. We can have them included + by using ``g2`` where ``group_keys=True``: + + >>> g2.apply(lambda x: x * 2 if x.name == "a" else x / 2) + a a 0.0 + a 2.0 + b b 1.0 + dtype: float64 + + Example 2: The function passed to `apply` takes a Series as + its argument and returns a scalar. `apply` combines the result for + each group together into a Series, including setting the index as + appropriate: + + >>> g1.apply(lambda x: x.max() - x.min()) + a 1 + b 0 + dtype: int64 + + The ``group_keys`` argument has no effect here because the result is not + like-indexed (i.e. :ref:`a transform <groupby.transform>`) when compared + to the input. + + >>> g2.apply(lambda x: x.max() - x.min()) + a 1 + b 0 + dtype: int64 + """ return super().apply(func, *args, **kwargs) - @doc(_agg_template_series, examples=_agg_examples_doc, klass="Series") def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): + """ + Aggregate using one or more operations. + + The ``aggregate`` method enables flexible and efficient aggregation of grouped + data using a variety of functions, including built-in, user-defined, and + optimized JIT-compiled functions. + + Parameters + ---------- + func : function, str, list, dict or None + Function to use for aggregating the data. If a function, must either + work when passed a Series or when passed to Series.apply. + + Accepted combinations are: + + - function + - string function name + - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` + - None, in which case ``**kwargs`` are used with Named Aggregation. Here + the output has one column for each element in ``**kwargs``. 
The name of + the column is keyword, whereas the value determines the aggregation + used to compute the values in the column. + + Can also accept a Numba JIT function with + ``engine='numba'`` specified. Only passing a single function is supported + with this engine. + + If the ``'numba'`` engine is chosen, the function must be + a user defined function with ``values`` and ``index`` as the + first and second arguments respectively in the function signature. + Each group's index will be passed to the user defined function + and optionally available for use. + + .. deprecated:: 2.1.0 + + Passing a dictionary is deprecated and will raise in a future version + of pandas. Pass a list of aggregations instead. + *args + Positional arguments to pass to func. + engine : str, default None + * ``'cython'`` : Runs the function through C-extensions from cython. + * ``'numba'`` : Runs the function through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or globally setting + ``compute.use_numba`` + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be + applied to the function + + **kwargs + * If ``func`` is None, ``**kwargs`` are used to define the output names and + aggregations via Named Aggregation. See ``func`` entry. + * Otherwise, keyword arguments to be passed into func. + + Returns + ------- + Series + Aggregated Series based on the grouping and the applied aggregation + functions. + + See Also + -------- + SeriesGroupBy.apply : Apply function func group-wise + and combine the results together. + SeriesGroupBy.transform : Transforms the Series on each group + based on the given function. + Series.aggregate : Aggregate using one or more operations. + + Notes + ----- + When using ``engine='numba'``, there will be no "fall back" behavior internally. + The group data and group index will be passed as numpy arrays to the JITed + user defined function, and no alternative execution attempts will be tried. + + Functions that mutate the passed object can produce unexpected + behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` + for more details. + + .. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the passed ``func``, + see the examples below. + + Examples + -------- + >>> s = pd.Series([1, 2, 3, 4]) + + >>> s + 0 1 + 1 2 + 2 3 + 3 4 + dtype: int64 + + >>> s.groupby([1, 1, 2, 2]).min() + 1 1 + 2 3 + dtype: int64 + + >>> s.groupby([1, 1, 2, 2]).agg("min") + 1 1 + 2 3 + dtype: int64 + + >>> s.groupby([1, 1, 2, 2]).agg(["min", "max"]) + min max + 1 1 2 + 2 3 4 + + The output column names can be controlled by passing + the desired column names and aggregations as keyword arguments. + + >>> s.groupby([1, 1, 2, 2]).agg( + ... minimum="min", + ... maximum="max", + ... ) + minimum maximum + 1 1 2 + 2 3 4 + + .. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the aggregating + function. 
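Before the docstring's dtype example resumes below, a compact sketch of the ``engine='numba'`` contract it describes: the UDF receives ``values`` and ``index`` as its first two arguments, passed as numpy arrays. ``group_range`` is a hypothetical name, and the ``'numba'`` path requires numba to be installed (with the default engine the same function would instead receive a Series):

```python
import pandas as pd

def group_range(values, index):
    # Per the documented numba signature: the group's data and its index
    # arrive as numpy arrays, not as pandas objects.
    return values.max() - values.min()

s = pd.Series([1.0, 4.0, 2.0, 8.0])
result = s.groupby([0, 0, 1, 1]).agg(group_range, engine="numba")
# result: group 0 -> 3.0, group 1 -> 6.0
```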
+ + >>> s.groupby([1, 1, 2, 2]).agg(lambda x: x.astype(float).min()) + 1 1.0 + 2 3.0 + dtype: float64 + """ relabeling = func is None columns = None if relabeling: @@ -264,11 +493,6 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) return ret else: - cyfunc = com.get_cython_func(func) - if cyfunc and not args and not kwargs: - warn_alias_replacement(self, func, cyfunc) - return getattr(self, cyfunc)() - if maybe_use_numba(engine): return self._aggregate_with_numba( func, *args, engine_kwargs=engine_kwargs, **kwargs @@ -286,41 +510,11 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) index=self._grouper.result_index, dtype=obj.dtype, ) - - if self._grouper.nkeys > 1: - return self._python_agg_general(func, *args, **kwargs) - - try: - return self._python_agg_general(func, *args, **kwargs) - except KeyError: - # KeyError raised in test_groupby.test_basic is bc the func does - # a dictionary lookup on group.name, but group name is not - # pinned in _python_agg_general, only in _aggregate_named - result = self._aggregate_named(func, *args, **kwargs) - - warnings.warn( - "Pinning the groupby key to each group in " - f"{type(self).__name__}.agg is deprecated, and cases that " - "relied on it will raise in a future version. " - "If your operation requires utilizing the groupby keys, " - "iterate over the groupby object instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - - # result is a dict whose keys are the elements of result_index - result = Series(result, index=self._grouper.result_index) - result = self._wrap_aggregated_output(result) - return result + return self._python_agg_general(func, *args, **kwargs) agg = aggregate def _python_agg_general(self, func, *args, **kwargs): - orig_func = func - func = com.is_builtin_func(func) - if orig_func != func: - alias = com._builtin_table_alias[func] - warn_alias_replacement(self, orig_func, alias) f = lambda x: func(x, *args, **kwargs) obj = self._obj_with_exclusions @@ -330,24 +524,10 @@ def _python_agg_general(self, func, *args, **kwargs): def _aggregate_multiple_funcs(self, arg, *args, **kwargs) -> DataFrame: if isinstance(arg, dict): - if self.as_index: - # GH 15931 - raise SpecificationError("nested renamer is not supported") - else: - # GH#50684 - This accidentally worked in 1.x - msg = ( - "Passing a dictionary to SeriesGroupBy.agg is deprecated " - "and will raise in a future version of pandas. Pass a list " - "of aggregations instead." 
- ) - warnings.warn( - message=msg, - category=FutureWarning, - stacklevel=find_stack_level(), - ) - arg = list(arg.items()) - elif any(isinstance(x, (tuple, list)) for x in arg): - arg = [(x, x) if not isinstance(x, (tuple, list)) else x for x in arg] + raise SpecificationError("nested renamer is not supported") + + if any(isinstance(x, (tuple, list)) for x in arg): + arg = ((x, x) if not isinstance(x, (tuple, list)) else x for x in arg) else: # list of functions / function names columns = (com.get_callable_name(f) or f for f in arg) @@ -403,6 +583,8 @@ def _wrap_applied_output( if is_transform: # GH#47787 see test_group_on_empty_multiindex res_index = data.index + elif not self.group_keys: + res_index = None else: res_index = self._grouper.result_index @@ -418,10 +600,9 @@ def _wrap_applied_output( # GH #823 #24880 index = self._grouper.result_index res_df = self.obj._constructor_expanddim(values, index=index) - res_df = self._reindex_output(res_df) # if self.observed is False, # keep all-NaN rows created while re-indexing - res_ser = res_df.stack(future_stack=True) + res_ser = res_df.stack() res_ser.name = self.obj.name return res_ser elif isinstance(values[0], (Series, DataFrame)): @@ -444,29 +625,7 @@ def _wrap_applied_output( if not self.as_index: result = self._insert_inaxis_grouper(result) result.index = default_index(len(result)) - return self._reindex_output(result) - - def _aggregate_named(self, func, *args, **kwargs): - # Note: this is very similar to _aggregate_series_pure_python, - # but that does not pin group.name - result = {} - initialized = False - - for name, group in self._grouper.get_iterator( - self._obj_with_exclusions, axis=self.axis - ): - # needed for pandas/tests/groupby/test_groupby.py::test_basic_aggregations - object.__setattr__(group, "name", name) - - output = func(group, *args, **kwargs) - output = ops.extract_result(output) - if not initialized: - # We only do this validation on the first iteration - ops.check_result_array(output, group.dtype) - initialized = True - result[name] = output - - return result + return result __examples_series_doc = dedent( """ @@ -518,16 +677,12 @@ def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs ) - def _cython_transform( - self, how: str, numeric_only: bool = False, axis: AxisInt = 0, **kwargs - ): - assert axis == 0 # handled by caller - + def _cython_transform(self, how: str, numeric_only: bool = False, **kwargs): obj = self._obj_with_exclusions try: result = self._grouper._cython_operation( - "transform", obj._values, how, axis, **kwargs + "transform", obj._values, how, 0, **kwargs ) except NotImplementedError as err: # e.g. test_groupby_raises_string @@ -550,7 +705,7 @@ def _transform_general( results = [] for name, group in self._grouper.get_iterator( - self._obj_with_exclusions, axis=self.axis + self._obj_with_exclusions, ): # this setattr is needed for test_transform_lambda_with_datetimetz object.__setattr__(group, "name", name) @@ -562,7 +717,7 @@ def _transform_general( if results: from pandas.core.reshape.concat import concat - concatenated = concat(results) + concatenated = concat(results, ignore_index=True) result = self._set_result_index_ordered(concatenated) else: result = self.obj._constructor(dtype=np.float64) @@ -581,13 +736,23 @@ def filter(self, func, dropna: bool = True, *args, **kwargs): ---------- func : function Criterion to apply to each group. Should return True or False. 
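The hunk above is the behavioral core of this stretch of the diff: the 2.x ``FutureWarning`` for dict input to ``SeriesGroupBy.agg`` is gone, and a dict now raises ``SpecificationError`` unconditionally. A quick sketch of the failing spelling and its supported replacements:

```python
import pandas as pd

g = pd.Series([1, 2, 3, 4]).groupby([0, 0, 1, 1])

# g.agg({"low": "min"})  # raises SpecificationError: nested renamer is not supported

g.agg(["min", "max"])           # list of aggregations
g.agg(low="min", high="max")    # named aggregation controls the output names
```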
- dropna : bool + dropna : bool, optional Drop groups that do not pass the filter. True by default; if False, groups that evaluate False are filled with NaNs. + *args : tuple + Optional positional arguments to pass to `func`. + **kwargs : dict + Optional keyword arguments to pass to `func`. Returns ------- Series + The filtered subset of the original Series. + + See Also + -------- + Series.filter : Filter elements of ungrouped Series. + DataFrameGroupBy.filter : Filter elements from groups based on a criterion. Notes ----- @@ -597,12 +762,15 @@ def filter(self, func, dropna: bool = True, *args, **kwargs): Examples -------- - >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', - ... 'foo', 'bar'], - ... 'B' : [1, 2, 3, 4, 5, 6], - ... 'C' : [2.0, 5., 8., 1., 2., 9.]}) - >>> grouped = df.groupby('A') - >>> df.groupby('A').B.filter(lambda x: x.mean() > 3.) + >>> df = pd.DataFrame( + ... { + ... "A": ["foo", "bar", "foo", "bar", "foo", "bar"], + ... "B": [1, 2, 3, 4, 5, 6], + ... "C": [2.0, 5.0, 8.0, 1.0, 2.0, 9.0], + ... } + ... ) + >>> grouped = df.groupby("A") + >>> df.groupby("A").B.filter(lambda x: x.mean() > 3.0) 1 2 3 4 5 6 @@ -621,9 +789,7 @@ def true_and_notna(x) -> bool: try: indices = [ self._get_index(name) - for name, group in self._grouper.get_iterator( - self._obj_with_exclusions, axis=self.axis - ) + for name, group in self._grouper.get_iterator(self._obj_with_exclusions) if true_and_notna(group) ] except (ValueError, TypeError) as err: @@ -636,16 +802,23 @@ def nunique(self, dropna: bool = True) -> Series | DataFrame: """ Return number of unique elements in the group. + Parameters + ---------- + dropna : bool, default True + Don't include NaN in the counts. + Returns ------- Series Number of unique values within each group. - Examples + See Also -------- - For SeriesGroupby: + core.resample.Resampler.nunique : Method nunique for Resampler. - >>> lst = ['a', 'a', 'b', 'b'] + Examples + -------- + >>> lst = ["a", "a", "b", "b"] >>> ser = pd.Series([1, 2, 3, 3], index=lst) >>> ser a 1 a 2 b 3 b 3 dtype: int64 >>> ser.groupby(level=0).nunique() @@ -657,23 +830,9 @@ def nunique(self, dropna: bool = True) -> Series | DataFrame: a 2 b 1 dtype: int64 - - For Resampler: - - >>> ser = pd.Series([1, 2, 3, 3], index=pd.DatetimeIndex( - ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) - >>> ser - 2023-01-01 1 - 2023-01-15 2 - 2023-02-01 3 - 2023-02-15 3 - dtype: int64 - >>> ser.resample('MS').nunique() - 2023-01-01 2 - 2023-02-01 1 - Freq: MS, dtype: int64 """ - ids, _, ngroups = self._grouper.group_info + ids = self._grouper.ids + ngroups = self._grouper.ngroups val = self.obj._values codes, uniques = algorithms.factorize(val, use_na_sentinel=dropna, sort=False) @@ -706,7 +865,7 @@ def nunique(self, dropna: bool = True) -> Series | DataFrame: if not self.as_index: result = self._insert_inaxis_grouper(result) result.index = default_index(len(result)) - return self._reindex_output(result, fill_value=0) + return result @doc(Series.describe) def describe(self, percentiles=None, include=None, exclude=None) -> Series: @@ -722,6 +881,85 @@ def value_counts( bins=None, dropna: bool = True, ) -> Series | DataFrame: + """ + Return a Series or DataFrame containing counts of unique rows. + + .. versionadded:: 1.4.0 + + Parameters + ---------- + normalize : bool, default False + Return proportions rather than frequencies. + sort : bool, default True + Sort by frequencies. + ascending : bool, default False + Sort in ascending order.
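For the ``nunique`` rewrite in the hunks above (separate ``ids``/``ngroups`` lookups, and no trailing ``_reindex_output`` with ``fill_value=0``), the observable result for ordinary groupers is still the per-group distinct count; unobserved categories of a categorical grouper are the case the dropped re-indexing used to pad with zeros. A small consistency check, assuming a pandas build with this change applied:

```python
import pandas as pd

ser = pd.Series([1, 2, 3, 3], index=["a", "a", "b", "b"])

fast = ser.groupby(level=0).nunique()                      # factorize-based path
slow = ser.groupby(level=0).apply(lambda x: x.nunique())   # naive per-group count
assert fast.equals(slow)
```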
+ bins : int or list of ints, optional + Rather than count values, group them into half-open bins, + a convenience for pd.cut, only works with numeric data. + dropna : bool, default True + Don't include counts of rows that contain NA values. + + Returns + ------- + Series or DataFrame + Series if the groupby ``as_index`` is True, otherwise DataFrame. + + See Also + -------- + Series.value_counts: Equivalent method on Series. + DataFrame.value_counts: Equivalent method on DataFrame. + DataFrameGroupBy.value_counts: Equivalent method on DataFrameGroupBy. + + Notes + ----- + - If the groupby ``as_index`` is True then the returned Series will have a + MultiIndex with one level per input column. + - If the groupby ``as_index`` is False then the returned DataFrame will have an + additional column with the value_counts. The column is labelled 'count' or + 'proportion', depending on the ``normalize`` parameter. + + By default, rows that contain any NA values are omitted from + the result. + + By default, the result will be in descending order so that the + first element of each group is the most frequently-occurring row. + + Examples + -------- + >>> s = pd.Series( + ... [1, 1, 2, 3, 2, 3, 3, 1, 1, 3, 3, 3], + ... index=["A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "B", "B"], + ... ) + >>> s + A 1 + A 1 + A 2 + A 3 + A 2 + A 3 + B 3 + B 1 + B 1 + B 3 + B 3 + B 3 + dtype: int64 + >>> g1 = s.groupby(s.index) + >>> g1.value_counts(bins=2) + A (0.997, 2.0] 4 + (2.0, 3.0] 2 + B (2.0, 3.0] 4 + (0.997, 2.0] 2 + Name: count, dtype: int64 + >>> g1.value_counts(normalize=True) + A 1 0.333333 + 2 0.333333 + 3 0.333333 + B 3 0.666667 + 1 0.333333 + Name: proportion, dtype: float64 + """ name = "proportion" if normalize else "count" if bins is None: @@ -734,7 +972,7 @@ def value_counts( from pandas.core.reshape.merge import get_join_indexers from pandas.core.reshape.tile import cut - ids, _, _ = self._grouper.group_info + ids = self._grouper.ids val = self.obj._values index_names = self._grouper.names + [self.obj.name] @@ -804,9 +1042,18 @@ def value_counts( rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx)) # multi-index components - codes = self._grouper.reconstructed_codes + if isinstance(self._grouper.result_index, MultiIndex): + codes = list(self._grouper.result_index.codes) + else: + codes = [ + algorithms.factorize( + self._grouper.result_index, + sort=self._grouper._sort, + use_na_sentinel=self._grouper.dropna, + )[0] + ] codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)] - levels = [ping._group_index for ping in self._grouper.groupings] + [lev] + levels = self._grouper.levels + [lev] if dropna: mask = codes[-1] != -1 @@ -848,7 +1095,10 @@ def value_counts( # "List[ndarray[Any, Any]]"; expected "List[Union[Union[ExtensionArray, # ndarray[Any, Any]], Index, Series]] _, idx = get_join_indexers( - left, right, sort=False, how="left" # type: ignore[arg-type] + left, # type: ignore[arg-type] + right, + sort=False, + how="left", ) if idx is not None: out = np.where(idx != -1, out[idx], 0) @@ -875,108 +1125,9 @@ def build_codes(lev_codes: np.ndarray) -> np.ndarray: result = result.reset_index() return result - def fillna( - self, - value: object | ArrayLike | None = None, - method: FillnaOptions | None = None, - axis: Axis | None | lib.NoDefault = lib.no_default, - inplace: bool = False, - limit: int | None = None, - downcast: dict | None | lib.NoDefault = lib.no_default, - ) -> Series | None: - """ - Fill NA/NaN values using the specified method within groups. - - .. 
deprecated:: 2.2.0 - This method is deprecated and will be removed in a future version. - Use the :meth:`.SeriesGroupBy.ffill` or :meth:`.SeriesGroupBy.bfill` - for forward or backward filling instead. If you want to fill with a - single value, use :meth:`Series.fillna` instead. - - Parameters - ---------- - value : scalar, dict, Series, or DataFrame - Value to use to fill holes (e.g. 0), alternately a - dict/Series/DataFrame of values specifying which value to use for - each index (for a Series) or column (for a DataFrame). Values not - in the dict/Series/DataFrame will not be filled. This value cannot - be a list. Users wanting to use the ``value`` argument and not ``method`` - should prefer :meth:`.Series.fillna` as this - will produce the same result and be more performant. - method : {{'bfill', 'ffill', None}}, default None - Method to use for filling holes. ``'ffill'`` will propagate - the last valid observation forward within a group. - ``'bfill'`` will use next valid observation to fill the gap. - axis : {0 or 'index', 1 or 'columns'} - Unused, only for compatibility with :meth:`DataFrameGroupBy.fillna`. - inplace : bool, default False - Broken. Do not set to True. - limit : int, default None - If method is specified, this is the maximum number of consecutive - NaN values to forward/backward fill within a group. In other words, - if there is a gap with more than this number of consecutive NaNs, - it will only be partially filled. If method is not specified, this is the - maximum number of entries along the entire axis where NaNs will be - filled. Must be greater than 0 if not None. - downcast : dict, default is None - A dict of item->dtype of what to downcast if possible, - or the string 'infer' which will try to downcast to an appropriate - equal type (e.g. float64 to int64 if possible). - - Returns - ------- - Series - Object with missing values filled within groups. - - See Also - -------- - ffill : Forward fill values within a group. - bfill : Backward fill values within a group. - - Examples - -------- - For SeriesGroupBy: - - >>> lst = ['cat', 'cat', 'cat', 'mouse', 'mouse'] - >>> ser = pd.Series([1, None, None, 2, None], index=lst) - >>> ser - cat 1.0 - cat NaN - cat NaN - mouse 2.0 - mouse NaN - dtype: float64 - >>> ser.groupby(level=0).fillna(0, limit=1) - cat 1.0 - cat 0.0 - cat NaN - mouse 2.0 - mouse 0.0 - dtype: float64 - """ - warnings.warn( - f"{type(self).__name__}.fillna is deprecated and " - "will be removed in a future version. Use obj.ffill() or obj.bfill() " - "for forward or backward filling instead. If you want to fill with a " - f"single value, use {type(self.obj).__name__}.fillna instead", - FutureWarning, - stacklevel=find_stack_level(), - ) - result = self._op_via_apply( - "fillna", - value=value, - method=method, - axis=axis, - inplace=inplace, - limit=limit, - downcast=downcast, - ) - return result - def take( self, indices: TakeIndexer, - axis: Axis | lib.NoDefault = lib.no_default, **kwargs, ) -> Series: """ @@ -994,14 +1145,6 @@ def take( ---------- indices : array-like An array of ints indicating which positions to take in each group. - axis : {0 or 'index', 1 or 'columns', None}, default 0 - The axis on which to select elements. ``0`` means that we are - selecting rows, ``1`` means that we are selecting columns. - For `SeriesGroupBy` this parameter is unused and defaults to 0. - - .. deprecated:: 2.1.0 - For axis=1, operate on the underlying object instead. Otherwise - the axis keyword is not necessary. 
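The removed ``SeriesGroupBy.fillna`` docstring above already pointed at the surviving replacements (the frame-side removal later in this diff does the same), so a sketch of the group-wise spellings that remain, mirroring the deleted cat/mouse example:

```python
import numpy as np
import pandas as pd

ser = pd.Series([1.0, np.nan, np.nan, 2.0, np.nan],
                index=["cat", "cat", "cat", "mouse", "mouse"])

ser.groupby(level=0).ffill(limit=1)  # forward fill within each group
ser.groupby(level=0).bfill()         # backward fill within each group
ser.fillna(0)                        # a scalar fill never needed the groupby
```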
**kwargs For compatibility with :meth:`numpy.take`. Has no effect on the @@ -1022,13 +1165,17 @@ def take( Examples -------- - >>> df = pd.DataFrame([('falcon', 'bird', 389.0), - ... ('parrot', 'bird', 24.0), - ... ('lion', 'mammal', 80.5), - ... ('monkey', 'mammal', np.nan), - ... ('rabbit', 'mammal', 15.0)], - ... columns=['name', 'class', 'max_speed'], - ... index=[4, 3, 2, 1, 0]) + >>> df = pd.DataFrame( + ... [ + ... ("falcon", "bird", 389.0), + ... ("parrot", "bird", 24.0), + ... ("lion", "mammal", 80.5), + ... ("monkey", "mammal", np.nan), + ... ("rabbit", "mammal", 15.0), + ... ], + ... columns=["name", "class", "max_speed"], + ... index=[4, 3, 2, 1, 0], + ... ) >>> df name class max_speed 4 falcon bird 389.0 @@ -1038,7 +1185,7 @@ def take( 0 rabbit mammal 15.0 >>> gb = df["name"].groupby([1, 1, 2, 2, 2]) - Take elements at positions 0 and 1 along the axis 0 in each group (default). + Take elements at rows 0 and 1 in each group. >>> gb.take([0, 1]) 1 4 falcon @@ -1057,12 +1204,11 @@ def take( 1 monkey Name: name, dtype: object """ - result = self._op_via_apply("take", indices=indices, axis=axis, **kwargs) + result = self._op_via_apply("take", indices=indices, **kwargs) return result def skew( self, - axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True, numeric_only: bool = False, **kwargs, @@ -1074,14 +1220,6 @@ def skew( Parameters ---------- - axis : {0 or 'index', 1 or 'columns', None}, default 0 - Axis for the function to be applied on. - This parameter is only for compatibility with DataFrame and is unused. - - .. deprecated:: 2.1.0 - For axis=1, operate on the underlying object instead. Otherwise - the axis keyword is not necessary. - skipna : bool, default True Exclude NA/null values when computing the result. @@ -1094,6 +1232,7 @@ def skew( Returns ------- Series + Unbiased skew within groups. See Also -------- @@ -1101,10 +1240,19 @@ def skew( Examples -------- - >>> ser = pd.Series([390., 350., 357., np.nan, 22., 20., 30.], - ... index=['Falcon', 'Falcon', 'Falcon', 'Falcon', - ... 'Parrot', 'Parrot', 'Parrot'], - ... name="Max Speed") + >>> ser = pd.Series( + ... [390.0, 350.0, 357.0, np.nan, 22.0, 20.0, 30.0], + ... index=[ + ... "Falcon", + ... "Falcon", + ... "Falcon", + ... "Falcon", + ... "Parrot", + ... "Parrot", + ... "Parrot", + ... ], + ... name="Max Speed", + ... ) >>> ser Falcon 390.0 Falcon 350.0 @@ -1123,26 +1271,87 @@ def skew( Parrot 1.457863 Name: Max Speed, dtype: float64 """ - if axis is lib.no_default: - axis = 0 - - if axis != 0: - result = self._op_via_apply( - "skew", - axis=axis, - skipna=skipna, - numeric_only=numeric_only, - **kwargs, - ) - return result + + return self._cython_agg_general( + "skew", alt=None, skipna=skipna, numeric_only=numeric_only, **kwargs + ) + + def kurt( + self, + skipna: bool = True, + numeric_only: bool = False, + **kwargs, + ) -> Series: + """ + Return unbiased kurtosis within groups. + + Parameters + ---------- + skipna : bool, default True + Exclude NA/null values when computing the result. + + numeric_only : bool, default False + Include only float, int, boolean columns. Not implemented for Series. + + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + Series + Unbiased kurtosis within groups. + + See Also + -------- + Series.kurt : Return unbiased kurtosis over requested axis. + + Examples + -------- + >>> ser = pd.Series( + ... [390.0, 350.0, 357.0, 333.0, np.nan, 22.0, 20.0, 30.0, 40.0, 41.0], + ... index=[ + ... "Falcon", + ... "Falcon", + ... 
"Falcon", + ... "Falcon", + ... "Falcon", + ... "Parrot", + ... "Parrot", + ... "Parrot", + ... "Parrot", + ... "Parrot", + ... ], + ... name="Max Speed", + ... ) + >>> ser + Falcon 390.0 + Falcon 350.0 + Falcon 357.0 + Falcon 333.0 + Falcon NaN + Parrot 22.0 + Parrot 20.0 + Parrot 30.0 + Parrot 40.0 + Parrot 41.0 + Name: Max Speed, dtype: float64 + >>> ser.groupby(level=0).kurt() + Falcon 1.622109 + Parrot -2.878714 + Name: Max Speed, dtype: float64 + >>> ser.groupby(level=0).kurt(skipna=False) + Falcon NaN + Parrot -2.878714 + Name: Max Speed, dtype: float64 + """ def alt(obj): # This should not be reached since the cython path should raise # TypeError and not NotImplementedError. - raise TypeError(f"'skew' is not supported for dtype={obj.dtype}") + raise TypeError(f"'kurt' is not supported for dtype={obj.dtype}") return self._cython_agg_general( - "skew", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs + "kurt", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs ) @property @@ -1173,17 +1382,111 @@ def nsmallest( result = self._python_apply_general(f, data, not_indexed_same=True) return result - @doc(Series.idxmin.__doc__) - def idxmin( - self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True - ) -> Series: - return self._idxmax_idxmin("idxmin", axis=axis, skipna=skipna) + def idxmin(self, skipna: bool = True) -> Series: + """ + Return the row label of the minimum value. - @doc(Series.idxmax.__doc__) - def idxmax( - self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True - ) -> Series: - return self._idxmax_idxmin("idxmax", axis=axis, skipna=skipna) + If multiple values equal the minimum, the first row label with that + value is returned. + + Parameters + ---------- + skipna : bool, default True + Exclude NA values. + + Returns + ------- + Series + Indexes of minima in each group. + + Raises + ------ + ValueError + If the Series is empty or skipna=False and any value is NA. + + See Also + -------- + numpy.argmin : Return indices of the minimum values + along the given axis. + DataFrame.idxmin : Return index of first occurrence of minimum + over requested axis. + Series.idxmax : Return index *label* of the first occurrence + of maximum of values. + + Examples + -------- + >>> ser = pd.Series( + ... [1, 2, 3, 4], + ... index=pd.DatetimeIndex( + ... ["2023-01-01", "2023-01-15", "2023-02-01", "2023-02-15"] + ... ), + ... ) + >>> ser + 2023-01-01 1 + 2023-01-15 2 + 2023-02-01 3 + 2023-02-15 4 + dtype: int64 + + >>> ser.groupby(["a", "a", "b", "b"]).idxmin() + a 2023-01-01 + b 2023-02-01 + dtype: datetime64[s] + """ + return self._idxmax_idxmin("idxmin", skipna=skipna) + + def idxmax(self, skipna: bool = True) -> Series: + """ + Return the row label of the maximum value. + + If multiple values equal the maximum, the first row label with that + value is returned. + + Parameters + ---------- + skipna : bool, default True + Exclude NA values. + + Returns + ------- + Series + Indexes of maxima in each group. + + Raises + ------ + ValueError + If the Series is empty or skipna=False and any value is NA. + + See Also + -------- + numpy.argmax : Return indices of the maximum values + along the given axis. + DataFrame.idxmax : Return index of first occurrence of maximum + over requested axis. + Series.idxmin : Return index *label* of the first occurrence + of minimum of values. + + Examples + -------- + >>> ser = pd.Series( + ... [1, 2, 3, 4], + ... index=pd.DatetimeIndex( + ... ["2023-01-01", "2023-01-15", "2023-02-01", "2023-02-15"] + ... ), + ... 
) + >>> ser + 2023-01-01 1 + 2023-01-15 2 + 2023-02-01 3 + 2023-02-15 4 + dtype: int64 + + >>> ser.groupby(["a", "a", "b", "b"]).idxmax() + a 2023-01-15 + b 2023-02-15 + dtype: datetime64[s] + """ + return self._idxmax_idxmin("idxmax", skipna=skipna) @doc(Series.corr.__doc__) def corr( @@ -1215,9 +1518,14 @@ def is_monotonic_increasing(self) -> Series: ------- Series + See Also + -------- + SeriesGroupBy.is_monotonic_decreasing : Return whether each group's values + are monotonically decreasing. + Examples -------- - >>> s = pd.Series([2, 1, 3, 4], index=['Falcon', 'Falcon', 'Parrot', 'Parrot']) + >>> s = pd.Series([2, 1, 3, 4], index=["Falcon", "Falcon", "Parrot", "Parrot"]) >>> s.groupby(level=0).is_monotonic_increasing Falcon False Parrot True @@ -1234,9 +1542,14 @@ def is_monotonic_decreasing(self) -> Series: ------- Series + See Also + -------- + SeriesGroupBy.is_monotonic_increasing : Return whether each group's values + are monotonically increasing. + Examples -------- - >>> s = pd.Series([2, 1, 3, 4], index=['Falcon', 'Falcon', 'Parrot', 'Parrot']) + >>> s = pd.Series([2, 1, 3, 4], index=["Falcon", "Falcon", "Parrot", "Parrot"]) >>> s.groupby(level=0).is_monotonic_decreasing Falcon True Parrot False @@ -1254,7 +1567,7 @@ def hist( xrot: float | None = None, ylabelsize: int | None = None, yrot: float | None = None, - figsize: tuple[int, int] | None = None, + figsize: tuple[float, float] | None = None, bins: int | Sequence[int] = 10, backend: str | None = None, legend: bool = False, @@ -1300,13 +1613,17 @@ def unique(self) -> Series: Examples -------- - >>> df = pd.DataFrame([('Chihuahua', 'dog', 6.1), - ... ('Beagle', 'dog', 15.2), - ... ('Chihuahua', 'dog', 6.9), - ... ('Persian', 'cat', 9.2), - ... ('Chihuahua', 'dog', 7), - ... ('Persian', 'cat', 8.8)], - ... columns=['breed', 'animal', 'height_in']) + >>> df = pd.DataFrame( + ... [ + ... ("Chihuahua", "dog", 6.1), + ... ("Beagle", "dog", 15.2), + ... ("Chihuahua", "dog", 6.9), + ... ("Persian", "cat", 9.2), + ... ("Chihuahua", "dog", 7), + ... ("Persian", "cat", 8.8), + ... ], + ... columns=["breed", "animal", "height_in"], + ... ) >>> df breed animal height_in 0 Chihuahua dog 6.1 @@ -1315,7 +1632,7 @@ def unique(self) -> Series: 3 Persian cat 9.2 4 Chihuahua dog 7.0 5 Persian cat 8.8 - >>> ser = df.groupby('animal')['breed'].unique() + >>> ser = df.groupby("animal")["breed"].unique() >>> ser animal cat [Persian] @@ -1326,6 +1643,7 @@ def unique(self) -> Series: return result +@set_module("pandas.api.typing") class DataFrameGroupBy(GroupBy[DataFrame]): _agg_examples_doc = dedent( """ @@ -1417,8 +1735,181 @@ class DataFrameGroupBy(GroupBy[DataFrame]): """ ) - @doc(_agg_template_frame, examples=_agg_examples_doc, klass="DataFrame") def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): + """ + Aggregate using one or more operations. + + The ``aggregate`` function allows the application of one or more aggregation + operations on groups of data within a DataFrameGroupBy object. It supports + various aggregation methods, including user-defined functions and predefined + functions such as 'sum', 'mean', etc. + + Parameters + ---------- + func : function, str, list, dict or None + Function to use for aggregating the data. If a function, must either + work when passed a DataFrame or when passed to DataFrame.apply. + + Accepted combinations are: + + - function + - string function name + - list of functions and/or function names, e.g. 
``[np.sum, 'mean']`` + - dict of index labels -> functions, function names or list of such. + - None, in which case ``**kwargs`` are used with Named Aggregation. Here the + output has one column for each element in ``**kwargs``. The name of the + column is keyword, whereas the value determines the aggregation used to + compute the values in the column. + + Can also accept a Numba JIT function with + ``engine='numba'`` specified. Only passing a single function is supported + with this engine. + + If the ``'numba'`` engine is chosen, the function must be + a user defined function with ``values`` and ``index`` as the + first and second arguments respectively in the function signature. + Each group's index will be passed to the user defined function + and optionally available for use. + + *args + Positional arguments to pass to func. + engine : str, default None + * ``'cython'`` : Runs the function through C-extensions from cython. + * ``'numba'`` : Runs the function through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or globally setting + ``compute.use_numba`` + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be + applied to the function + + **kwargs + * If ``func`` is None, ``**kwargs`` are used to define the output names and + aggregations via Named Aggregation. See ``func`` entry. + * Otherwise, keyword arguments to be passed into func. + + Returns + ------- + DataFrame + Aggregated DataFrame based on the grouping and the applied aggregation + functions. + + See Also + -------- + DataFrame.groupby.apply : Apply function func group-wise + and combine the results together. + DataFrame.groupby.transform : Transforms the Series on each group + based on the given function. + DataFrame.aggregate : Aggregate using one or more operations. + + Notes + ----- + When using ``engine='numba'``, there will be no "fall back" behavior internally. + The group data and group index will be passed as numpy arrays to the JITed + user defined function, and no alternative execution attempts will be tried. + + Functions that mutate the passed object can produce unexpected + behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` + for more details. + + .. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the passed ``func``, + see the examples below. + + Examples + -------- + >>> data = { + ... "A": [1, 1, 2, 2], + ... "B": [1, 2, 3, 4], + ... "C": [0.362838, 0.227877, 1.267767, -0.562860], + ... } + >>> df = pd.DataFrame(data) + >>> df + A B C + 0 1 1 0.362838 + 1 1 2 0.227877 + 2 2 3 1.267767 + 3 2 4 -0.562860 + + The aggregation is for each column. 
+ + >>> df.groupby("A").agg("min") + B C + A + 1 1 0.227877 + 2 3 -0.562860 + + Multiple aggregations + + >>> df.groupby("A").agg(["min", "max"]) + B C + min max min max + A + 1 1 2 0.227877 0.362838 + 2 3 4 -0.562860 1.267767 + + Select a column for aggregation + + >>> df.groupby("A").B.agg(["min", "max"]) + min max + A + 1 1 2 + 2 3 4 + + User-defined function for aggregation + + >>> df.groupby("A").agg(lambda x: sum(x) + 2) + B C + A + 1 5 2.590715 + 2 9 2.704907 + + Different aggregations per column + + >>> df.groupby("A").agg({"B": ["min", "max"], "C": "sum"}) + B C + min max sum + A + 1 1 2 0.590715 + 2 3 4 0.704907 + + To control the output names with different aggregations per column, + pandas supports "named aggregation" + + >>> df.groupby("A").agg( + ... b_min=pd.NamedAgg(column="B", aggfunc="min"), + ... c_sum=pd.NamedAgg(column="C", aggfunc="sum"), + ... ) + b_min c_sum + A + 1 1 0.590715 + 2 3 0.704907 + + - The keywords are the *output* column names + - The values are tuples whose first element is the column to select + and the second element is the aggregation to apply to that column. + Pandas provides the ``pandas.NamedAgg`` namedtuple with the fields + ``['column', 'aggfunc']`` to make it clearer what the arguments are. + As usual, the aggregation can be a callable or a string alias. + + See :ref:`groupby.aggregate.named` for more. + + .. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the aggregating + function. + + >>> df.groupby("A")[["B"]].agg(lambda x: x.astype(float).min()) + B + A + 1 1.0 + 2 3.0 + """ relabeling, func, columns, order = reconstruct_func(func, **kwargs) func = maybe_mangle_lambdas(func) @@ -1469,12 +1960,6 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) # can't return early result = self._aggregate_frame(func, *args, **kwargs) - elif self.axis == 1: - # _aggregate_multiple_funcs does not allow self.axis == 1 - # Note: axis == 1 precludes 'not self.as_index', see __init__ - result = self._aggregate_frame(func) - return result - else: # try to treat as if we are passing a list gba = GroupByApply(self, [func], args=(), kwargs={}) @@ -1507,11 +1992,6 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) agg = aggregate def _python_agg_general(self, func, *args, **kwargs): - orig_func = func - func = com.is_builtin_func(func) - if orig_func != func: - alias = com._builtin_table_alias[func] - warn_alias_replacement(self, orig_func, alias) f = lambda x: func(x, *args, **kwargs) if self.ngroups == 0: @@ -1520,8 +2000,6 @@ def _python_agg_general(self, func, *args, **kwargs): return self._python_apply_general(f, self._selected_obj, is_agg=True) obj = self._obj_with_exclusions - if self.axis == 1: - obj = obj.T if not len(obj.columns): # e.g. 
test_margins_no_values_no_cols @@ -1543,15 +2021,13 @@ def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame: obj = self._obj_with_exclusions result: dict[Hashable, NDFrame | np.ndarray] = {} - for name, grp_df in self._grouper.get_iterator(obj, self.axis): + for name, grp_df in self._grouper.get_iterator(obj): fres = func(grp_df, *args, **kwargs) result[name] = fres result_index = self._grouper.result_index - other_ax = obj.axes[1 - self.axis] - out = self.obj._constructor(result, index=other_ax, columns=result_index) - if self.axis == 0: - out = out.T + out = self.obj._constructor(result, index=obj.columns, columns=result_index) + out = out.T return out @@ -1566,11 +2042,13 @@ def _wrap_applied_output( if is_transform: # GH#47787 see test_group_on_empty_multiindex res_index = data.index + elif not self.group_keys: + res_index = None else: res_index = self._grouper.result_index result = self.obj._constructor(index=res_index, columns=data.columns) - result = result.astype(data.dtypes, copy=False) + result = result.astype(data.dtypes) return result # GH12824 @@ -1578,8 +2056,11 @@ def _wrap_applied_output( first_not_none = next(com.not_none(*values), None) if first_not_none is None: - # GH9684 - All values are None, return an empty frame. - return self.obj._constructor() + # GH9684 - All values are None, return an empty frame + # GH57775 - Ensure that columns and dtypes from original frame are kept. + result = self.obj._constructor(columns=data.columns) + result = result.astype(data.dtypes) + return result elif isinstance(first_not_none, DataFrame): return self._concat_objects( values, @@ -1651,18 +2132,13 @@ def _wrap_applied_output_series( # vstack+constructor is faster than concat and handles MI-columns stacked_values = np.vstack([np.asarray(v) for v in values]) - if self.axis == 0: - index = key_index - columns = first_not_none.index.copy() - if columns.name is None: - # GH6124 - propagate name of Series when it's consistent - names = {v.name for v in values} - if len(names) == 1: - columns.name = next(iter(names)) - else: - index = first_not_none.index - columns = key_index - stacked_values = stacked_values.T + index = key_index + columns = first_not_none.index.copy() + if columns.name is None: + # GH6124 - propagate name of Series when it's consistent + names = {v.name for v in values} + if len(names) == 1: + columns.name = next(iter(names)) if stacked_values.dtype == object: # We'll have the DataFrame constructor do inference @@ -1672,23 +2148,18 @@ def _wrap_applied_output_series( if not self.as_index: result = self._insert_inaxis_grouper(result) - return self._reindex_output(result) + return result def _cython_transform( self, how: str, numeric_only: bool = False, - axis: AxisInt = 0, **kwargs, ) -> DataFrame: - assert axis == 0 # handled by caller - - # With self.axis == 0, we have multi-block tests + # We have multi-block tests # e.g. test_rank_min_int, test_cython_transform_frame # test_transform_numeric_ret - # With self.axis == 1, _get_data_to_aggregate does a transpose - # so we always have a single block. 
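These hunks (and the ones just below) strip the last ``self.axis == 1`` branches from the frame paths, completing the removal of ``groupby(..., axis=1)``. The replacement idiom the 2.x deprecations suggested is to transpose before grouping; a sketch with made-up labels:

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]})

# Formerly: df.groupby([0, 0, 1], axis=1).sum()
summed = df.T.groupby([0, 0, 1]).sum().T
# columns "a" and "b" are combined into group 0; "c" stays alone as group 1
```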
- mgr: Manager2D = self._get_data_to_aggregate( + mgr: BlockManager = self._get_data_to_aggregate( numeric_only=numeric_only, name=how ) @@ -1697,13 +2168,9 @@ def arr_func(bvalues: ArrayLike) -> ArrayLike: "transform", bvalues, how, 1, **kwargs ) - # We could use `mgr.apply` here and not have to set_axis, but - # we would have to do shape gymnastics for ArrayManager compat - res_mgr = mgr.grouped_reduce(arr_func) - res_mgr.set_axis(1, mgr.axes[1]) + res_mgr = mgr.apply(arr_func) res_df = self.obj._constructor_from_mgr(res_mgr, axes=res_mgr.axes) - res_df = self._maybe_transpose_result(res_df) return res_df def _transform_general(self, func, engine, engine_kwargs, *args, **kwargs): @@ -1715,7 +2182,7 @@ def _transform_general(self, func, engine, engine_kwargs, *args, **kwargs): applied = [] obj = self._obj_with_exclusions - gen = self._grouper.get_iterator(obj, axis=self.axis) + gen = self._grouper.get_iterator(obj) fast_path, slow_path = self._define_paths(func, *args, **kwargs) # Determine whether to use slow or fast path by evaluating on the first group. @@ -1749,10 +2216,11 @@ def _transform_general(self, func, engine, engine_kwargs, *args, **kwargs): res = _wrap_transform_general_frame(self.obj, group, res) applied.append(res) - concat_index = obj.columns if self.axis == 0 else obj.index - other_axis = 1 if self.axis == 0 else 0 # switches between 0 & 1 - concatenated = concat(applied, axis=self.axis, verify_integrity=False) - concatenated = concatenated.reindex(concat_index, axis=other_axis, copy=False) + concat_index = obj.columns + concatenated = concat( + applied, axis=0, verify_integrity=False, ignore_index=True + ) + concatenated = concatenated.reindex(concat_index, axis=1) return self._set_result_index_ordered(concatenated) __examples_dataframe_doc = dedent( @@ -1820,12 +2288,12 @@ def _define_paths(self, func, *args, **kwargs): if isinstance(func, str): fast_path = lambda group: getattr(group, func)(*args, **kwargs) slow_path = lambda group: group.apply( - lambda x: getattr(x, func)(*args, **kwargs), axis=self.axis + lambda x: getattr(x, func)(*args, **kwargs), axis=0 ) else: fast_path = lambda group: func(group, *args, **kwargs) slow_path = lambda group: group.apply( - lambda x: func(x, *args, **kwargs), axis=self.axis + lambda x: func(x, *args, **kwargs), axis=0 ) return fast_path, slow_path @@ -1865,7 +2333,7 @@ def _choose_path(self, fast_path: Callable, slow_path: Callable, group: DataFram return path, res - def filter(self, func, dropna: bool = True, *args, **kwargs): + def filter(self, func, dropna: bool = True, *args, **kwargs) -> DataFrame: """ Filter elements from groups that don't satisfy a criterion. @@ -1879,10 +2347,20 @@ def filter(self, func, dropna: bool = True, *args, **kwargs): dropna : bool Drop groups that do not pass the filter. True by default; if False, groups that evaluate False are filled with NaNs. + *args : tuple + Additional positional arguments to pass to `func`. + **kwargs : dict + Additional keyword arguments to pass to `func`. Returns ------- DataFrame + The filtered subset of the original DataFrame. + + See Also + -------- + DataFrame.filter: Filter elements of ungrouped DataFrame. + SeriesGroupBy.filter : Filter elements from groups base on criterion. Notes ----- @@ -1895,12 +2373,15 @@ def filter(self, func, dropna: bool = True, *args, **kwargs): Examples -------- - >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', - ... 'foo', 'bar'], - ... 'B' : [1, 2, 3, 4, 5, 6], - ... 
'C' : [2.0, 5., 8., 1., 2., 9.]}) - >>> grouped = df.groupby('A') - >>> grouped.filter(lambda x: x['B'].mean() > 3.) + >>> df = pd.DataFrame( + ... { + ... "A": ["foo", "bar", "foo", "bar", "foo", "bar"], + ... "B": [1, 2, 3, 4, 5, 6], + ... "C": [2.0, 5.0, 8.0, 1.0, 2.0, 9.0], + ... } + ... ) + >>> grouped = df.groupby("A") + >>> grouped.filter(lambda x: x["B"].mean() > 3.0) A B C 1 bar 2 5.0 3 bar 4 1.0 @@ -1909,7 +2390,7 @@ def filter(self, func, dropna: bool = True, *args, **kwargs): indices = [] obj = self._selected_obj - gen = self._grouper.get_iterator(obj, axis=self.axis) + gen = self._grouper.get_iterator(obj) for name, group in gen: # 2023-02-27 no tests are broken this pinning, but it is documented in the @@ -1937,9 +2418,6 @@ def filter(self, func, dropna: bool = True, *args, **kwargs): return self._apply_filter(indices, dropna) def __getitem__(self, key) -> DataFrameGroupBy | SeriesGroupBy: - if self.axis == 1: - # GH 37725 - raise ValueError("Cannot subset columns when using axis=1") # per GH 23566 if isinstance(key, tuple) and len(key) > 1: # if len == 1, then it becomes a SeriesGroupBy and this is actually @@ -1969,7 +2447,6 @@ def _gotitem(self, key, ndim: int, subset=None): return DataFrameGroupBy( subset, self.keys, - axis=self.axis, level=self.level, grouper=self._grouper, exclusions=self.exclusions, @@ -2001,18 +2478,14 @@ def _gotitem(self, key, ndim: int, subset=None): def _get_data_to_aggregate( self, *, numeric_only: bool = False, name: str | None = None - ) -> Manager2D: + ) -> BlockManager: obj = self._obj_with_exclusions - if self.axis == 1: - mgr = obj.T._mgr - else: - mgr = obj._mgr - + mgr = obj._mgr if numeric_only: mgr = mgr.get_numeric_data() return mgr - def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame: + def _wrap_agged_manager(self, mgr: BlockManager) -> DataFrame: return self.obj._constructor_from_mgr(mgr, axes=mgr.axes) def _apply_to_column_groupbys(self, func) -> DataFrame: @@ -2020,7 +2493,7 @@ def _apply_to_column_groupbys(self, func) -> DataFrame: obj = self._obj_with_exclusions columns = obj.columns - sgbs = [ + sgbs = ( SeriesGroupBy( obj.iloc[:, i], selection=colname, @@ -2029,10 +2502,10 @@ def _apply_to_column_groupbys(self, func) -> DataFrame: observed=self.observed, ) for i, colname in enumerate(obj.columns) - ] + ) results = [func(sgb) for sgb in sgbs] - if not len(results): + if not results: # concat would raise res_df = DataFrame([], columns=columns, index=self._grouper.result_index) else: @@ -2055,13 +2528,21 @@ def nunique(self, dropna: bool = True) -> DataFrame: Returns ------- nunique: DataFrame + Counts of unique elements in each position. + + See Also + -------- + DataFrame.nunique : Count number of distinct elements in specified axis. Examples -------- - >>> df = pd.DataFrame({'id': ['spam', 'egg', 'egg', 'spam', - ... 'ham', 'ham'], - ... 'value1': [1, 5, 5, 2, 5, 5], - ... 'value2': list('abbaxy')}) + >>> df = pd.DataFrame( + ... { + ... "id": ["spam", "egg", "egg", "spam", "ham", "ham"], + ... "value1": [1, 5, 5, 2, 5, 5], + ... "value2": list("abbaxy"), + ... } + ... 
) >>> df id value1 value2 0 spam 1 a @@ -2071,7 +2552,7 @@ def nunique(self, dropna: bool = True) -> DataFrame: 4 ham 5 x 5 ham 5 y - >>> df.groupby('id').nunique() + >>> df.groupby("id").nunique() value1 value2 id egg 1 1 @@ -2080,48 +2561,27 @@ def nunique(self, dropna: bool = True) -> DataFrame: Check for rows with the same id but conflicting values: - >>> df.groupby('id').filter(lambda g: (g.nunique() > 1).any()) + >>> df.groupby("id").filter(lambda g: (g.nunique() > 1).any()) id value1 value2 0 spam 1 a 3 spam 2 a 4 ham 5 x 5 ham 5 y """ - - if self.axis != 0: - # see test_groupby_crash_on_nunique - return self._python_apply_general( - lambda sgb: sgb.nunique(dropna), self._obj_with_exclusions, is_agg=True - ) - return self._apply_to_column_groupbys(lambda sgb: sgb.nunique(dropna)) def idxmax( self, - axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool = True, numeric_only: bool = False, ) -> DataFrame: """ - Return index of first occurrence of maximum over requested axis. - - NA/null values are excluded. + Return index of first occurrence of maximum in each group. Parameters ---------- - axis : {{0 or 'index', 1 or 'columns'}}, default None - The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. - If axis is not provided, grouper's axis is used. - - .. versionchanged:: 2.0.0 - - .. deprecated:: 2.1.0 - For axis=1, operate on the underlying object instead. Otherwise - the axis keyword is not necessary. - skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. + Exclude NA values. numeric_only : bool, default False Include only `float`, `int` or `boolean` data. @@ -2129,17 +2589,18 @@ def idxmax( Returns ------- - Series - Indexes of maxima along the specified axis. + DataFrame + Indexes of maxima in each column according to the group. Raises ------ ValueError - * If the row/column is empty + * If a column is empty or skipna=False and any value is NA. See Also -------- Series.idxmax : Return index of the maximum element. + DataFrame.idxmax : Indexes of maxima along the specified axis. Notes ----- @@ -2149,9 +2610,14 @@ def idxmax( -------- Consider a dataset containing food consumption in Argentina. - >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48], - ... 'co2_emissions': [37.2, 19.66, 1712]}, - ... index=['Pork', 'Wheat Products', 'Beef']) + >>> df = pd.DataFrame( + ... { + ... "consumption": [10.51, 103.11, 55.48], + ... "co2_emissions": [37.2, 19.66, 1712], + ... "food_type": ["meat", "plant", "meat"], + ... }, + ... index=["Pork", "Wheat Products", "Beef"], + ... ) >>> df consumption co2_emissions @@ -2159,51 +2625,29 @@ def idxmax( Wheat Products 103.11 19.66 Beef 55.48 1712.00 - By default, it returns the index for the maximum value in each column. - - >>> df.idxmax() - consumption Wheat Products - co2_emissions Beef - dtype: object + By default, it returns the index for the maximum value in each column + according to the group. - To return the index for the maximum value in each row, use ``axis="columns"``. 
- - >>> df.idxmax(axis="columns") - Pork co2_emissions - Wheat Products consumption - Beef co2_emissions - dtype: object + >>> df.groupby("food_type").idxmax() + consumption co2_emissions + food_type + meat Beef Beef + plant Wheat Products Wheat Products """ - return self._idxmax_idxmin( - "idxmax", axis=axis, numeric_only=numeric_only, skipna=skipna - ) + return self._idxmax_idxmin("idxmax", numeric_only=numeric_only, skipna=skipna) def idxmin( self, - axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool = True, numeric_only: bool = False, ) -> DataFrame: """ - Return index of first occurrence of minimum over requested axis. - - NA/null values are excluded. + Return index of first occurrence of minimum in each group. Parameters ---------- - axis : {{0 or 'index', 1 or 'columns'}}, default None - The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. - If axis is not provided, grouper's axis is used. - - .. versionchanged:: 2.0.0 - - .. deprecated:: 2.1.0 - For axis=1, operate on the underlying object instead. Otherwise - the axis keyword is not necessary. - skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. + Exclude NA values. numeric_only : bool, default False Include only `float`, `int` or `boolean` data. @@ -2211,17 +2655,18 @@ def idxmin( Returns ------- - Series - Indexes of minima along the specified axis. + DataFrame + Indexes of minima in each column according to the group. Raises ------ ValueError - * If the row/column is empty + * If a column is empty or skipna=False and any value is NA. See Also -------- Series.idxmin : Return index of the minimum element. + DataFrame.idxmin : Indexes of minima along the specified axis. Notes ----- @@ -2231,9 +2676,14 @@ def idxmin( -------- Consider a dataset containing food consumption in Argentina. - >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48], - ... 'co2_emissions': [37.2, 19.66, 1712]}, - ... index=['Pork', 'Wheat Products', 'Beef']) + >>> df = pd.DataFrame( + ... { + ... "consumption": [10.51, 103.11, 55.48], + ... "co2_emissions": [37.2, 19.66, 1712], + ... "food_type": ["meat", "plant", "meat"], + ... }, + ... index=["Pork", "Wheat Products", "Beef"], + ... ) >>> df consumption co2_emissions @@ -2241,24 +2691,16 @@ def idxmin( Wheat Products 103.11 19.66 Beef 55.48 1712.00 - By default, it returns the index for the minimum value in each column. - - >>> df.idxmin() - consumption Pork - co2_emissions Wheat Products - dtype: object + By default, it returns the index for the minimum value in each column + according to the group. - To return the index for the minimum value in each row, use ``axis="columns"``. - - >>> df.idxmin(axis="columns") - Pork consumption - Wheat Products co2_emissions - Beef consumption - dtype: object + >>> df.groupby("food_type").idxmin() + consumption co2_emissions + food_type + meat Pork Pork + plant Wheat Products Wheat Products """ - return self._idxmax_idxmin( - "idxmin", axis=axis, numeric_only=numeric_only, skipna=skipna - ) + return self._idxmax_idxmin("idxmin", numeric_only=numeric_only, skipna=skipna) boxplot = boxplot_frame_groupby @@ -2282,7 +2724,13 @@ def value_counts( normalize : bool, default False Return proportions rather than frequencies. sort : bool, default True - Sort by frequencies. + Sort by frequencies when True. When False, non-grouping columns will appear + in the order in which they occur within groups. + ..
versionchanged:: 3.0.0 + + In prior versions, ``sort=False`` would sort the non-grouping columns + by label. ascending : bool, default False Sort in ascending order. dropna : bool, default True @@ -2291,7 +2739,7 @@ def value_counts( Returns ------- Series or DataFrame - Series if the groupby as_index is True, otherwise DataFrame. + Series if the groupby ``as_index`` is True, otherwise DataFrame. See Also -------- @@ -2301,9 +2749,9 @@ def value_counts( Notes ----- - - If the groupby as_index is True then the returned Series will have a + - If the groupby ``as_index`` is True then the returned Series will have a MultiIndex with one level per input column. - - If the groupby as_index is False then the returned DataFrame will have an + - If the groupby ``as_index`` is False then the returned DataFrame will have an additional column with the value_counts. The column is labelled 'count' or 'proportion', depending on the ``normalize`` parameter. @@ -2315,11 +2763,13 @@ def value_counts( Examples -------- - >>> df = pd.DataFrame({ - ... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'], - ... 'education': ['low', 'medium', 'high', 'low', 'high', 'low'], - ... 'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR'] - ... }) + >>> df = pd.DataFrame( + ... { + ... "gender": ["male", "male", "female", "male", "female", "male"], + ... "education": ["low", "medium", "high", "low", "high", "low"], + ... "country": ["US", "FR", "US", "FR", "FR", "FR"], + ... } + ... ) >>> df gender education country @@ -2330,201 +2780,54 @@ def value_counts( 4 female high FR 5 male low FR - >>> df.groupby('gender').value_counts() + >>> df.groupby("gender").value_counts() gender education country - female high FR 1 - US 1 + female high US 1 + FR 1 male low FR 2 US 1 medium FR 1 Name: count, dtype: int64 - >>> df.groupby('gender').value_counts(ascending=True) + >>> df.groupby("gender").value_counts(ascending=True) gender education country - female high FR 1 - US 1 + female high US 1 + FR 1 male low US 1 medium FR 1 low FR 2 Name: count, dtype: int64 - >>> df.groupby('gender').value_counts(normalize=True) + >>> df.groupby("gender").value_counts(normalize=True) gender education country - female high FR 0.50 - US 0.50 + female high US 0.50 + FR 0.50 male low FR 0.50 US 0.25 medium FR 0.25 Name: proportion, dtype: float64 - >>> df.groupby('gender', as_index=False).value_counts() + >>> df.groupby("gender", as_index=False).value_counts() gender education country count - 0 female high FR 1 - 1 female high US 1 + 0 female high US 1 + 1 female high FR 1 2 male low FR 2 3 male low US 1 4 male medium FR 1 - >>> df.groupby('gender', as_index=False).value_counts(normalize=True) + >>> df.groupby("gender", as_index=False).value_counts(normalize=True) gender education country proportion - 0 female high FR 0.50 - 1 female high US 0.50 + 0 female high US 0.50 + 1 female high FR 0.50 2 male low FR 0.50 3 male low US 0.25 4 male medium FR 0.25 """ return self._value_counts(subset, normalize, sort, ascending, dropna) - def fillna( - self, - value: Hashable | Mapping | Series | DataFrame | None = None, - method: FillnaOptions | None = None, - axis: Axis | None | lib.NoDefault = lib.no_default, - inplace: bool = False, - limit: int | None = None, - downcast=lib.no_default, - ) -> DataFrame | None: - """ - Fill NA/NaN values using the specified method within groups. - - .. deprecated:: 2.2.0 - This method is deprecated and will be removed in a future version. 
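The ``versionchanged:: 3.0.0`` note above is the behavioral nugget of the ``value_counts`` hunk: with ``sort=False``, non-grouping columns now keep first-occurrence order within each group rather than being sorted by label. A tiny sketch of the new ordering, assuming a build with this change:

```python
import pandas as pd

df = pd.DataFrame({"g": ["x", "x", "x"], "v": ["b", "a", "b"]})

df.groupby("g").value_counts(sort=False)
# g  v
# x  b    2    <- "b" leads because it occurs first within the group;
#    a    1       earlier versions would have sorted these rows by label
# Name: count, dtype: int64
```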
- Use the :meth:`.DataFrameGroupBy.ffill` or :meth:`.DataFrameGroupBy.bfill` - for forward or backward filling instead. If you want to fill with a - single value, use :meth:`DataFrame.fillna` instead. - - Parameters - ---------- - value : scalar, dict, Series, or DataFrame - Value to use to fill holes (e.g. 0), alternately a - dict/Series/DataFrame of values specifying which value to use for - each index (for a Series) or column (for a DataFrame). Values not - in the dict/Series/DataFrame will not be filled. This value cannot - be a list. Users wanting to use the ``value`` argument and not ``method`` - should prefer :meth:`.DataFrame.fillna` as this - will produce the same result and be more performant. - method : {{'bfill', 'ffill', None}}, default None - Method to use for filling holes. ``'ffill'`` will propagate - the last valid observation forward within a group. - ``'bfill'`` will use next valid observation to fill the gap. - axis : {0 or 'index', 1 or 'columns'} - Axis along which to fill missing values. When the :class:`DataFrameGroupBy` - ``axis`` argument is ``0``, using ``axis=1`` here will produce - the same results as :meth:`.DataFrame.fillna`. When the - :class:`DataFrameGroupBy` ``axis`` argument is ``1``, using ``axis=0`` - or ``axis=1`` here will produce the same results. - inplace : bool, default False - Broken. Do not set to True. - limit : int, default None - If method is specified, this is the maximum number of consecutive - NaN values to forward/backward fill within a group. In other words, - if there is a gap with more than this number of consecutive NaNs, - it will only be partially filled. If method is not specified, this is the - maximum number of entries along the entire axis where NaNs will be - filled. Must be greater than 0 if not None. - downcast : dict, default is None - A dict of item->dtype of what to downcast if possible, - or the string 'infer' which will try to downcast to an appropriate - equal type (e.g. float64 to int64 if possible). - - Returns - ------- - DataFrame - Object with missing values filled. - - See Also - -------- - ffill : Forward fill values within a group. - bfill : Backward fill values within a group. - - Examples - -------- - >>> df = pd.DataFrame( - ... { - ... "key": [0, 0, 1, 1, 1], - ... "A": [np.nan, 2, np.nan, 3, np.nan], - ... "B": [2, 3, np.nan, np.nan, np.nan], - ... "C": [np.nan, np.nan, 2, np.nan, np.nan], - ... } - ... ) - >>> df - key A B C - 0 0 NaN 2.0 NaN - 1 0 2.0 3.0 NaN - 2 1 NaN NaN 2.0 - 3 1 3.0 NaN NaN - 4 1 NaN NaN NaN - - Propagate non-null values forward or backward within each group along columns. - - >>> df.groupby("key").fillna(method="ffill") - A B C - 0 NaN 2.0 NaN - 1 2.0 3.0 NaN - 2 NaN NaN 2.0 - 3 3.0 NaN 2.0 - 4 3.0 NaN 2.0 - - >>> df.groupby("key").fillna(method="bfill") - A B C - 0 2.0 2.0 NaN - 1 2.0 3.0 NaN - 2 3.0 NaN 2.0 - 3 3.0 NaN NaN - 4 NaN NaN NaN - - Propagate non-null values forward or backward within each group along rows. - - >>> df.T.groupby(np.array([0, 0, 1, 1])).fillna(method="ffill").T - key A B C - 0 0.0 0.0 2.0 2.0 - 1 0.0 2.0 3.0 3.0 - 2 1.0 1.0 NaN 2.0 - 3 1.0 3.0 NaN NaN - 4 1.0 1.0 NaN NaN - - >>> df.T.groupby(np.array([0, 0, 1, 1])).fillna(method="bfill").T - key A B C - 0 0.0 NaN 2.0 NaN - 1 0.0 2.0 3.0 NaN - 2 1.0 NaN 2.0 2.0 - 3 1.0 3.0 NaN NaN - 4 1.0 NaN NaN NaN - - Only replace the first NaN element within a group along rows. 
- - >>> df.groupby("key").fillna(method="ffill", limit=1) - A B C - 0 NaN 2.0 NaN - 1 2.0 3.0 NaN - 2 NaN NaN 2.0 - 3 3.0 NaN 2.0 - 4 3.0 NaN NaN - """ - warnings.warn( - f"{type(self).__name__}.fillna is deprecated and " - "will be removed in a future version. Use obj.ffill() or obj.bfill() " - "for forward or backward filling instead. If you want to fill with a " - f"single value, use {type(self.obj).__name__}.fillna instead", - FutureWarning, - stacklevel=find_stack_level(), - ) - - result = self._op_via_apply( - "fillna", - value=value, - method=method, - axis=axis, - inplace=inplace, - limit=limit, - downcast=downcast, - ) - return result - def take( self, indices: TakeIndexer, - axis: Axis | None | lib.NoDefault = lib.no_default, **kwargs, ) -> DataFrame: """ @@ -2542,13 +2845,6 @@ def take( ---------- indices : array-like An array of ints indicating which positions to take. - axis : {0 or 'index', 1 or 'columns', None}, default 0 - The axis on which to select elements. ``0`` means that we are - selecting rows, ``1`` means that we are selecting columns. - - .. deprecated:: 2.1.0 - For axis=1, operate on the underlying object instead. Otherwise - the axis keyword is not necessary. **kwargs For compatibility with :meth:`numpy.take`. Has no effect on the @@ -2568,13 +2864,17 @@ def take( Examples -------- - >>> df = pd.DataFrame([('falcon', 'bird', 389.0), - ... ('parrot', 'bird', 24.0), - ... ('lion', 'mammal', 80.5), - ... ('monkey', 'mammal', np.nan), - ... ('rabbit', 'mammal', 15.0)], - ... columns=['name', 'class', 'max_speed'], - ... index=[4, 3, 2, 1, 0]) + >>> df = pd.DataFrame( + ... [ + ... ("falcon", "bird", 389.0), + ... ("parrot", "bird", 24.0), + ... ("lion", "mammal", 80.5), + ... ("monkey", "mammal", np.nan), + ... ("rabbit", "mammal", 15.0), + ... ], + ... columns=["name", "class", "max_speed"], + ... index=[4, 3, 2, 1, 0], + ... ) >>> df name class max_speed 4 falcon bird 389.0 @@ -2584,7 +2884,7 @@ def take( 0 rabbit mammal 15.0 >>> gb = df.groupby([1, 1, 2, 2, 2]) - Take elements at positions 0 and 1 along the axis 0 (default). + Take elements at rows 0 and 1. Note how the indices selected in the result do not correspond to our input indices 0 and 1. That's because we are selecting the 0th @@ -2607,8 +2907,6 @@ def take( 2 1 monkey mammal NaN 2 lion mammal 80.5 - Take elements at indices 1 and 2 along the axis 1 (column selection). - We may take elements using negative integers for positive indices, starting from the end of the object, just like with Python lists. @@ -2619,12 +2917,11 @@ def take( 2 0 rabbit mammal 15.0 1 monkey mammal NaN """ - result = self._op_via_apply("take", indices=indices, axis=axis, **kwargs) + result = self._op_via_apply("take", indices=indices, **kwargs) return result def skew( self, - axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool = True, numeric_only: bool = False, **kwargs, @@ -2636,17 +2933,6 @@ def skew( Parameters ---------- - axis : {0 or 'index', 1 or 'columns', None}, default 0 - Axis for the function to be applied on. - - Specifying ``axis=None`` will apply the aggregation across both axes. - - .. versionadded:: 2.0.0 - - .. deprecated:: 2.1.0 - For axis=1, operate on the underlying object instead. Otherwise - the axis keyword is not necessary. - skipna : bool, default True Exclude NA/null values when computing the result. @@ -2659,6 +2945,7 @@ def skew( Returns ------- DataFrame + Unbiased skew within groups. 
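With the ``axis`` keyword gone from ``DataFrameGroupBy.take``, positions always address rows within each group; a sketch of the replacement for the removed ``axis=1`` path, where column selection moves to the un-grouped frame (data from the doctest above)::

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        [
            ("falcon", "bird", 389.0),
            ("parrot", "bird", 24.0),
            ("lion", "mammal", 80.5),
            ("monkey", "mammal", np.nan),
            ("rabbit", "mammal", 15.0),
        ],
        columns=["name", "class", "max_speed"],
        index=[4, 3, 2, 1, 0],
    )
    gb = df.groupby([1, 1, 2, 2, 2])

    rows = gb.take([0, 1])          # first two positions within each group
    cols = df.take([0, 2], axis=1)  # was gb.take(..., axis=1)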
See Also -------- @@ -2666,14 +2953,15 @@ def skew( Examples -------- - >>> arrays = [['falcon', 'parrot', 'cockatoo', 'kiwi', - ... 'lion', 'monkey', 'rabbit'], - ... ['bird', 'bird', 'bird', 'bird', - ... 'mammal', 'mammal', 'mammal']] - >>> index = pd.MultiIndex.from_arrays(arrays, names=('name', 'class')) - >>> df = pd.DataFrame({'max_speed': [389.0, 24.0, 70.0, np.nan, - ... 80.5, 21.5, 15.0]}, - ... index=index) + >>> arrays = [ + ... ["falcon", "parrot", "cockatoo", "kiwi", "lion", "monkey", "rabbit"], + ... ["bird", "bird", "bird", "bird", "mammal", "mammal", "mammal"], + ... ] + >>> index = pd.MultiIndex.from_arrays(arrays, names=("name", "class")) + >>> df = pd.DataFrame( + ... {"max_speed": [389.0, 24.0, 70.0, np.nan, 80.5, 21.5, 15.0]}, + ... index=index, + ... ) >>> df max_speed name class @@ -2696,18 +2984,6 @@ def skew( bird NaN mammal 1.669046 """ - if axis is lib.no_default: - axis = 0 - - if axis != 0: - result = self._op_via_apply( - "skew", - axis=axis, - skipna=skipna, - numeric_only=numeric_only, - **kwargs, - ) - return result def alt(obj): # This should not be reached since the cython path should raise @@ -2718,6 +2994,111 @@ def alt(obj): "skew", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs ) + def kurt( + self, + skipna: bool = True, + numeric_only: bool = False, + **kwargs, + ) -> DataFrame: + """ + Return unbiased kurtosis within groups. + + Parameters + ---------- + skipna : bool, default True + Exclude NA/null values when computing the result. + + numeric_only : bool, default False + Include only float, int, boolean columns. + + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + DataFrame + Unbiased kurtosis within groups. + + See Also + -------- + DataFrame.kurt : Return unbiased kurtosis over requested axis. + + Examples + -------- + >>> arrays = [ + ... [ + ... "falcon", + ... "parrot", + ... "cockatoo", + ... "kiwi", + ... "eagle", + ... "lion", + ... "monkey", + ... "rabbit", + ... "dog", + ... "wolf", + ... ], + ... [ + ... "bird", + ... "bird", + ... "bird", + ... "bird", + ... "bird", + ... "mammal", + ... "mammal", + ... "mammal", + ... "mammal", + ... "mammal", + ... ], + ... ] + >>> index = pd.MultiIndex.from_arrays(arrays, names=("name", "class")) + >>> df = pd.DataFrame( + ... { + ... "max_speed": [ + ... 389.0, + ... 24.0, + ... 70.0, + ... np.nan, + ... 350.0, + ... 80.5, + ... 21.5, + ... 15.0, + ... 40.0, + ... 50.0, + ... ] + ... }, + ... index=index, + ... 
) + >>> df + max_speed + name class + falcon bird 389.0 + parrot bird 24.0 + cockatoo bird 70.0 + kiwi bird NaN + eagle bird 350.0 + lion mammal 80.5 + monkey mammal 21.5 + rabbit mammal 15.0 + dog mammal 40.0 + wolf mammal 50.0 + >>> gb = df.groupby(["class"]) + >>> gb.kurt() + max_speed + class + bird -5.493277 + mammal 0.204125 + >>> gb.kurt(skipna=False) + max_speed + class + bird NaN + mammal 0.204125 + """ + + return self._cython_agg_general( + "kurt", alt=None, skipna=skipna, numeric_only=numeric_only, **kwargs + ) + @property @doc(DataFrame.plot.__doc__) def plot(self) -> GroupByPlot: @@ -2748,7 +3129,6 @@ def cov( ) return result - @doc(DataFrame.hist.__doc__) def hist( self, column: IndexLabel | None = None, @@ -2761,13 +3141,101 @@ def hist( ax=None, sharex: bool = False, sharey: bool = False, - figsize: tuple[int, int] | None = None, + figsize: tuple[float, float] | None = None, layout: tuple[int, int] | None = None, bins: int | Sequence[int] = 10, backend: str | None = None, legend: bool = False, **kwargs, ): + """ + Make a histogram of the DataFrame's columns. + + A `histogram`_ is a representation of the distribution of data. + This function calls :meth:`matplotlib.pyplot.hist`, on each series in + the DataFrame, resulting in one histogram per column. + + .. _histogram: https://en.wikipedia.org/wiki/Histogram + + Parameters + ---------- + column : str or sequence, optional + If passed, will be used to limit data to a subset of columns. + by : object, optional + If passed, then used to form histograms for separate groups. + grid : bool, default True + Whether to show axis grid lines. + xlabelsize : int, default None + If specified changes the x-axis label size. + xrot : float, default None + Rotation of x axis labels. For example, a value of 90 displays the + x labels rotated 90 degrees clockwise. + ylabelsize : int, default None + If specified changes the y-axis label size. + yrot : float, default None + Rotation of y axis labels. For example, a value of 90 displays the + y labels rotated 90 degrees clockwise. + ax : Matplotlib axes object, default None + The axes to plot the histogram on. + sharex : bool, default True if ax is None else False + In case subplots=True, share x axis and set some x axis labels to + invisible; defaults to True if ax is None otherwise False if an ax + is passed in. + Note that passing in both an ax and sharex=True will alter all x axis + labels for all subplots in a figure. + sharey : bool, default False + In case subplots=True, share y axis and set some y axis labels to + invisible. + figsize : tuple, optional + The size in inches of the figure to create. Uses the value in + `matplotlib.rcParams` by default. + layout : tuple, optional + Tuple of (rows, columns) for the layout of the histograms. + bins : int or sequence, default 10 + Number of histogram bins to be used. If an integer is given, bins + 1 + bin edges are calculated and returned. If bins is a sequence, gives + bin edges, including left edge of first bin and right edge of last + bin. In this case, bins is returned unmodified. + + backend : str, default None + Backend to use instead of the backend specified in the option + ``plotting.backend``. For instance, 'matplotlib'. Alternatively, to + specify the ``plotting.backend`` for the whole session, set + ``pd.options.plotting.backend``. + + legend : bool, default False + Whether to show the legend. + + **kwargs + All other plotting keyword arguments to be passed to + :meth:`matplotlib.pyplot.hist`. 
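A usage sketch for the new ``kurt`` method added above, assuming it matches ``DataFrame.kurt`` computed per group as its docstring states::

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        {
            "max_speed": [389.0, 24.0, 70.0, np.nan, 350.0,
                          80.5, 21.5, 15.0, 40.0, 50.0]
        },
        index=["bird"] * 5 + ["mammal"] * 5,
    )
    gb = df.groupby(level=0)

    per_group = gb.kurt()           # NA values skipped by default
    strict = gb.kurt(skipna=False)  # any NA in a group yields NA for that group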
+ + Returns + ------- + matplotlib.Axes or numpy.ndarray + A ``matplotlib.Axes`` object or an array of ``Axes`` objects, depending on + the layout and grouping. + + See Also + -------- + matplotlib.pyplot.hist : Plot a histogram using matplotlib. + + Examples + -------- + This example draws a histogram based on the length and width of + some animals, displayed in three bins + + .. plot:: + :context: close-figs + + >>> data = { + ... "length": [1.5, 0.5, 1.2, 0.9, 3], + ... "width": [0.7, 0.2, 0.15, 0.2, 1.1], + ... } + >>> index = ["pig", "rabbit", "duck", "chicken", "horse"] + >>> df = pd.DataFrame(data, index=index) + >>> hist = df.groupby("length").hist(bins=3) + """ result = self._op_via_apply( "hist", column=column, @@ -2789,35 +3257,85 @@ def hist( ) return result - @property - @doc(DataFrame.dtypes.__doc__) - def dtypes(self) -> Series: - # GH#51045 - warnings.warn( - f"{type(self).__name__}.dtypes is deprecated and will be removed in " - "a future version. Check the dtypes on the base object instead", - FutureWarning, - stacklevel=find_stack_level(), - ) - - # error: Incompatible return value type (got "DataFrame", expected "Series") - return self._python_apply_general( # type: ignore[return-value] - lambda df: df.dtypes, self._selected_obj - ) - - @doc(DataFrame.corrwith.__doc__) def corrwith( self, other: DataFrame | Series, - axis: Axis | lib.NoDefault = lib.no_default, drop: bool = False, method: CorrelationMethod = "pearson", numeric_only: bool = False, ) -> DataFrame: + """ + Compute pairwise correlation. + + .. deprecated:: 3.0.0 + + Pairwise correlation is computed between rows or columns of + DataFrame with rows or columns of Series or DataFrame. DataFrames + are first aligned along both axes before computing the + correlations. + + Parameters + ---------- + other : DataFrame, Series + Object with which to compute correlations. + drop : bool, default False + Drop missing indices from result. + method : {'pearson', 'kendall', 'spearman'} or callable + Method of correlation: + + * pearson : standard correlation coefficient + * kendall : Kendall Tau correlation coefficient + * spearman : Spearman rank correlation + * callable: callable with input two 1d ndarrays + and returning a float. + + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + + .. versionadded:: 1.5.0 + + .. versionchanged:: 2.0.0 + The default value of ``numeric_only`` is now ``False``. + + Returns + ------- + Series + Pairwise correlations. + + See Also + -------- + DataFrame.corr : Compute pairwise correlation of columns. + + Examples + -------- + >>> df1 = pd.DataFrame( + ... { + ... "Day": [1, 1, 1, 2, 2, 2, 3, 3, 3], + ... "Data": [6, 6, 8, 5, 4, 2, 7, 3, 9], + ... } + ... ) + >>> df2 = pd.DataFrame( + ... { + ... "Day": [1, 1, 1, 2, 2, 2, 3, 3, 3], + ... "Data": [5, 3, 8, 3, 1, 1, 2, 3, 6], + ... } + ... 
) + + >>> df1.groupby("Day").corrwith(df2) + Data Day + Day + 1 0.917663 NaN + 2 0.755929 NaN + 3 0.576557 NaN + """ + warnings.warn( + "DataFrameGroupBy.corrwith is deprecated", + FutureWarning, + stacklevel=find_stack_level(), + ) result = self._op_via_apply( "corrwith", other=other, - axis=axis, drop=drop, method=method, numeric_only=numeric_only, @@ -2835,7 +3353,7 @@ def _wrap_transform_general_frame( # other dimension; this will preserve dtypes # GH14457 if res.index.is_(obj.index): - res_frame = concat([res] * len(group.columns), axis=1) + res_frame = concat([res] * len(group.columns), axis=1, ignore_index=True) res_frame.columns = group.columns res_frame.index = group.index else: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 089e15afd465b..3daee98371844 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -6,10 +6,13 @@ class providing the base-class of operations. (defined in pandas.core.groupby.generic) expose these user-facing objects to provide specific functionality. """ + from __future__ import annotations from collections.abc import ( + Callable, Hashable, + Iterable, Iterator, Mapping, Sequence, @@ -19,23 +22,20 @@ class providing the base-class of operations. partial, wraps, ) -import inspect from textwrap import dedent from typing import ( TYPE_CHECKING, - Callable, Literal, TypeVar, Union, cast, final, + overload, ) import warnings import numpy as np -from pandas._config.config import option_context - from pandas._libs import ( Timestamp, lib, @@ -46,16 +46,12 @@ class providing the base-class of operations. from pandas._typing import ( AnyArrayLike, ArrayLike, - Axis, - AxisInt, DtypeObj, - FillnaOptions, IndexLabel, + IntervalClosedType, NDFrameT, PositionalIndexer, RandomState, - Scalar, - T, npt, ) from pandas.compat.numpy import function as nv @@ -85,6 +81,7 @@ class providing the base-class of operations. is_numeric_dtype, is_object_dtype, is_scalar, + is_string_dtype, needs_i8_conversion, pandas_dtype, ) @@ -99,11 +96,9 @@ class providing the base-class of operations. sample, ) from pandas.core._numba import executor -from pandas.core.apply import warn_alias_replacement from pandas.core.arrays import ( ArrowExtensionArray, BaseMaskedArray, - Categorical, ExtensionArray, FloatingArray, IntegerArray, @@ -132,10 +127,8 @@ class providing the base-class of operations. GroupByNthSelector, ) from pandas.core.indexes.api import ( - CategoricalIndex, Index, MultiIndex, - RangeIndex, default_index, ) from pandas.core.internals.blocks import ensure_block_shape @@ -144,11 +137,21 @@ class providing the base-class of operations. from pandas.core.util.numba_ import ( get_jit_arguments, maybe_use_numba, + prepare_function_arguments, ) if TYPE_CHECKING: - from typing import Any + from pandas._libs.tslibs import BaseOffset + from pandas._libs.tslibs.timedeltas import Timedelta + from pandas._typing import ( + Any, + Concatenate, + P, + Self, + T, + ) + from pandas.core.indexers.objects import BaseIndexer from pandas.core.resample import Resampler from pandas.core.window import ( ExpandingGroupby, @@ -164,192 +167,7 @@ class providing the base-class of operations. to each row or column of a DataFrame. """ -_apply_docs = { - "template": """ - Apply function ``func`` group-wise and combine the results together. - - The function passed to ``apply`` must take a {input} as its first - argument and return a DataFrame, Series or scalar. 
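With ``DataFrameGroupBy.corrwith`` deprecated above, one replacement sketch is per-group ``DataFrame.corrwith`` via ``apply``, assuming the row labels of ``df1`` and ``df2`` align as they do in the doctest::

    import pandas as pd

    df1 = pd.DataFrame(
        {"Day": [1, 1, 1, 2, 2, 2, 3, 3, 3], "Data": [6, 6, 8, 5, 4, 2, 7, 3, 9]}
    )
    df2 = pd.DataFrame(
        {"Day": [1, 1, 1, 2, 2, 2, 3, 3, 3], "Data": [5, 3, 8, 3, 1, 1, 2, 3, 6]}
    )

    # correlate each group against the matching rows of the other frame
    result = df1.groupby("Day").apply(lambda g: g.corrwith(df2.loc[g.index]))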
``apply`` will - then take care of combining the results back together into a single - dataframe or series. ``apply`` is therefore a highly flexible - grouping method. - - While ``apply`` is a very flexible method, its downside is that - using it can be quite a bit slower than using more specific methods - like ``agg`` or ``transform``. Pandas offers a wide range of method that will - be much faster than using ``apply`` for their specific purposes, so try to - use them before reaching for ``apply``. - - Parameters - ---------- - func : callable - A callable that takes a {input} as its first argument, and - returns a dataframe, a series or a scalar. In addition the - callable may take positional and keyword arguments. - include_groups : bool, default True - When True, will attempt to apply ``func`` to the groupings in - the case that they are columns of the DataFrame. If this raises a - TypeError, the result will be computed with the groupings excluded. - When False, the groupings will be excluded when applying ``func``. - - .. versionadded:: 2.2.0 - - .. deprecated:: 2.2.0 - - Setting include_groups to True is deprecated. Only the value - False will be allowed in a future version of pandas. - - args, kwargs : tuple and dict - Optional positional and keyword arguments to pass to ``func``. - - Returns - ------- - Series or DataFrame - - See Also - -------- - pipe : Apply function to the full GroupBy object instead of to each - group. - aggregate : Apply aggregate function to the GroupBy object. - transform : Apply function column-by-column to the GroupBy object. - Series.apply : Apply a function to a Series. - DataFrame.apply : Apply a function to each row or column of a DataFrame. - - Notes - ----- - - .. versionchanged:: 1.3.0 - - The resulting dtype will reflect the return value of the passed ``func``, - see the examples below. - - Functions that mutate the passed object can produce unexpected - behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` - for more details. - - Examples - -------- - {examples} - """, - "dataframe_examples": """ - >>> df = pd.DataFrame({'A': 'a a b'.split(), - ... 'B': [1, 2, 3], - ... 'C': [4, 6, 5]}) - >>> g1 = df.groupby('A', group_keys=False) - >>> g2 = df.groupby('A', group_keys=True) - - Notice that ``g1`` and ``g2`` have two groups, ``a`` and ``b``, and only - differ in their ``group_keys`` argument. Calling `apply` in various ways, - we can get different grouping results: - - Example 1: below the function passed to `apply` takes a DataFrame as - its argument and returns a DataFrame. `apply` combines the result for - each group together into a new DataFrame: - - >>> g1[['B', 'C']].apply(lambda x: x / x.sum()) - B C - 0 0.333333 0.4 - 1 0.666667 0.6 - 2 1.000000 1.0 - - In the above, the groups are not part of the index. We can have them included - by using ``g2`` where ``group_keys=True``: - - >>> g2[['B', 'C']].apply(lambda x: x / x.sum()) - B C - A - a 0 0.333333 0.4 - 1 0.666667 0.6 - b 2 1.000000 1.0 - - Example 2: The function passed to `apply` takes a DataFrame as - its argument and returns a Series. `apply` combines the result for - each group together into a new DataFrame. - - .. versionchanged:: 1.3.0 - - The resulting dtype will reflect the return value of the passed ``func``. 
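The ``group_keys`` contrast walked through in the removed template above, as a compact runnable sketch::

    import pandas as pd

    df = pd.DataFrame({"A": ["a", "a", "b"], "B": [1, 2, 3], "C": [4, 6, 5]})
    g1 = df.groupby("A", group_keys=False)  # result keeps the original index
    g2 = df.groupby("A", group_keys=True)   # result is keyed by group labels

    r1 = g1[["B", "C"]].apply(lambda x: x / x.sum())  # plain integer index
    r2 = g2[["B", "C"]].apply(lambda x: x / x.sum())  # MultiIndex of (A, row)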
- - >>> g1[['B', 'C']].apply(lambda x: x.astype(float).max() - x.min()) - B C - A - a 1.0 2.0 - b 0.0 0.0 - - >>> g2[['B', 'C']].apply(lambda x: x.astype(float).max() - x.min()) - B C - A - a 1.0 2.0 - b 0.0 0.0 - - The ``group_keys`` argument has no effect here because the result is not - like-indexed (i.e. :ref:`a transform `) when compared - to the input. - - Example 3: The function passed to `apply` takes a DataFrame as - its argument and returns a scalar. `apply` combines the result for - each group together into a Series, including setting the index as - appropriate: - - >>> g1.apply(lambda x: x.C.max() - x.B.min(), include_groups=False) - A - a 5 - b 2 - dtype: int64""", - "series_examples": """ - >>> s = pd.Series([0, 1, 2], index='a a b'.split()) - >>> g1 = s.groupby(s.index, group_keys=False) - >>> g2 = s.groupby(s.index, group_keys=True) - - From ``s`` above we can see that ``g`` has two groups, ``a`` and ``b``. - Notice that ``g1`` have ``g2`` have two groups, ``a`` and ``b``, and only - differ in their ``group_keys`` argument. Calling `apply` in various ways, - we can get different grouping results: - - Example 1: The function passed to `apply` takes a Series as - its argument and returns a Series. `apply` combines the result for - each group together into a new Series. - - .. versionchanged:: 1.3.0 - - The resulting dtype will reflect the return value of the passed ``func``. - - >>> g1.apply(lambda x: x * 2 if x.name == 'a' else x / 2) - a 0.0 - a 2.0 - b 1.0 - dtype: float64 - - In the above, the groups are not part of the index. We can have them included - by using ``g2`` where ``group_keys=True``: - - >>> g2.apply(lambda x: x * 2 if x.name == 'a' else x / 2) - a a 0.0 - a 2.0 - b b 1.0 - dtype: float64 - - Example 2: The function passed to `apply` takes a Series as - its argument and returns a scalar. `apply` combines the result for - each group together into a Series, including setting the index as - appropriate: - - >>> g1.apply(lambda x: x.max() - x.min()) - a 1 - b 0 - dtype: int64 - - The ``group_keys`` argument has no effect here because the result is not - like-indexed (i.e. :ref:`a transform `) when compared - to the input. - - >>> g2.apply(lambda x: x.max() - x.min()) - a 1 - b 0 - dtype: int64""", -} - -_groupby_agg_method_template = """ +_groupby_agg_method_engine_template = """ Compute {fname} of group values. Parameters @@ -365,17 +183,40 @@ class providing the base-class of operations. The required number of valid values to perform the operation. If fewer than ``min_count`` non-NA values are present the result will be NA. +engine : str, default None {e} + * ``'cython'`` : Runs rolling apply through C-extensions from cython. + * ``'numba'`` : Runs rolling apply through JIT compiled code from numba. + Only available when ``raw`` is set to ``True``. + * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` + +engine_kwargs : dict, default None {ek} + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be + applied to both the ``func`` and the ``apply`` groupby aggregation. + Returns ------- Series or DataFrame Computed {fname} of values within each group. +See Also +-------- +SeriesGroupBy.min : Return the min of the group values. 
+DataFrameGroupBy.min : Return the min of the group values. +SeriesGroupBy.max : Return the max of the group values. +DataFrameGroupBy.max : Return the max of the group values. +SeriesGroupBy.sum : Return the sum of the group values. +DataFrameGroupBy.sum : Return the sum of the group values. + Examples -------- {example} """ -_groupby_agg_method_engine_template = """ +_groupby_agg_method_skipna_engine_template = """ Compute {fname} of group values. Parameters @@ -391,6 +232,12 @@ class providing the base-class of operations. The required number of valid values to perform the operation. If fewer than ``min_count`` non-NA values are present the result will be NA. +skipna : bool, default {s} + Exclude NA/null values. If the entire group is NA and ``skipna`` is + ``True``, the result will be NA. + + .. versionchanged:: 3.0.0 + engine : str, default None {e} * ``'cython'`` : Runs rolling apply through C-extensions from cython. * ``'numba'`` : Runs rolling apply through JIT compiled code from numba. @@ -410,6 +257,15 @@ class providing the base-class of operations. Series or DataFrame Computed {fname} of values within each group. +See Also +-------- +SeriesGroupBy.min : Return the min of the group values. +DataFrameGroupBy.min : Return the min of the group values. +SeriesGroupBy.max : Return the max of the group values. +DataFrameGroupBy.max : Return the max of the group values. +SeriesGroupBy.sum : Return the sum of the group values. +DataFrameGroupBy.sum : Return the sum of the group values. + Examples -------- {example} @@ -444,14 +300,15 @@ class providing the base-class of operations. a `(callable, data_keyword)` tuple where `data_keyword` is a string indicating the keyword of `callable` that expects the %(klass)s object. -args : iterable, optional +*args : iterable, optional Positional arguments passed into `func`. -kwargs : dict, optional +**kwargs : dict, optional A dictionary of keyword arguments passed into `func`. Returns ------- -the return type of `func`. +%(klass)s + The original object with the function `func` applied. See Also -------- @@ -478,7 +335,7 @@ class providing the base-class of operations. Parameters ---------- -f : function, str +func : function, str Function to apply to each group. See the Notes section below for requirements. Accepted inputs are: @@ -517,13 +374,14 @@ class providing the base-class of operations. Returns ------- %(klass)s + %(klass)s with the same indexes as the original object filled + with transformed values. See Also -------- %(klass)s.groupby.apply : Apply function ``func`` group-wise and combine the results together. -%(klass)s.groupby.aggregate : Aggregate using one or more - operations over the specified axis. +%(klass)s.groupby.aggregate : Aggregate using one or more operations. %(klass)s.transform : Call ``func`` on self producing a %(klass)s with the same axis shape as self. @@ -564,167 +422,6 @@ class providing the base-class of operations. -------- %(example)s""" -_agg_template_series = """ -Aggregate using one or more operations over the specified axis. - -Parameters ----------- -func : function, str, list, dict or None - Function to use for aggregating the data. If a function, must either - work when passed a {klass} or when passed to {klass}.apply. - - Accepted combinations are: - - - function - - string function name - - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` - - None, in which case ``**kwargs`` are used with Named Aggregation. Here the - output has one column for each element in ``**kwargs``. 
The name of the - column is keyword, whereas the value determines the aggregation used to compute - the values in the column. - - Can also accept a Numba JIT function with - ``engine='numba'`` specified. Only passing a single function is supported - with this engine. - - If the ``'numba'`` engine is chosen, the function must be - a user defined function with ``values`` and ``index`` as the - first and second arguments respectively in the function signature. - Each group's index will be passed to the user defined function - and optionally available for use. - - .. deprecated:: 2.1.0 - - Passing a dictionary is deprecated and will raise in a future version - of pandas. Pass a list of aggregations instead. -*args - Positional arguments to pass to func. -engine : str, default None - * ``'cython'`` : Runs the function through C-extensions from cython. - * ``'numba'`` : Runs the function through JIT compiled code from numba. - * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` - -engine_kwargs : dict, default None - * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` - * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` - and ``parallel`` dictionary keys. The values must either be ``True`` or - ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is - ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be - applied to the function - -**kwargs - * If ``func`` is None, ``**kwargs`` are used to define the output names and - aggregations via Named Aggregation. See ``func`` entry. - * Otherwise, keyword arguments to be passed into func. - -Returns -------- -{klass} - -See Also --------- -{klass}.groupby.apply : Apply function func group-wise - and combine the results together. -{klass}.groupby.transform : Transforms the Series on each group - based on the given function. -{klass}.aggregate : Aggregate using one or more - operations over the specified axis. - -Notes ------ -When using ``engine='numba'``, there will be no "fall back" behavior internally. -The group data and group index will be passed as numpy arrays to the JITed -user defined function, and no alternative execution attempts will be tried. - -Functions that mutate the passed object can produce unexpected -behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` -for more details. - -.. versionchanged:: 1.3.0 - - The resulting dtype will reflect the return value of the passed ``func``, - see the examples below. -{examples}""" - -_agg_template_frame = """ -Aggregate using one or more operations over the specified axis. - -Parameters ----------- -func : function, str, list, dict or None - Function to use for aggregating the data. If a function, must either - work when passed a {klass} or when passed to {klass}.apply. - - Accepted combinations are: - - - function - - string function name - - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` - - dict of axis labels -> functions, function names or list of such. - - None, in which case ``**kwargs`` are used with Named Aggregation. Here the - output has one column for each element in ``**kwargs``. The name of the - column is keyword, whereas the value determines the aggregation used to compute - the values in the column. - - Can also accept a Numba JIT function with - ``engine='numba'`` specified. Only passing a single function is supported - with this engine. 
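A sketch of the ``engine="numba"`` contract spelled out in the removed templates above: the UDF must accept ``values`` and ``index`` as its first two arguments, and only a single function may be passed (requires numba to be installed; ``group_mean`` is a hypothetical name)::

    import numpy as np
    import pandas as pd

    def group_mean(values, index):
        # values: 1-D ndarray holding one group's values
        return np.mean(values)

    ser = pd.Series([1.0, 2.0, 4.0], index=[0, 0, 1])
    out = ser.groupby(level=0).agg(group_mean, engine="numba")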
- - If the ``'numba'`` engine is chosen, the function must be - a user defined function with ``values`` and ``index`` as the - first and second arguments respectively in the function signature. - Each group's index will be passed to the user defined function - and optionally available for use. - -*args - Positional arguments to pass to func. -engine : str, default None - * ``'cython'`` : Runs the function through C-extensions from cython. - * ``'numba'`` : Runs the function through JIT compiled code from numba. - * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` - -engine_kwargs : dict, default None - * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` - * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` - and ``parallel`` dictionary keys. The values must either be ``True`` or - ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is - ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be - applied to the function - -**kwargs - * If ``func`` is None, ``**kwargs`` are used to define the output names and - aggregations via Named Aggregation. See ``func`` entry. - * Otherwise, keyword arguments to be passed into func. - -Returns -------- -{klass} - -See Also --------- -{klass}.groupby.apply : Apply function func group-wise - and combine the results together. -{klass}.groupby.transform : Transforms the Series on each group - based on the given function. -{klass}.aggregate : Aggregate using one or more - operations over the specified axis. - -Notes ------ -When using ``engine='numba'``, there will be no "fall back" behavior internally. -The group data and group index will be passed as numpy arrays to the JITed -user defined function, and no alternative execution attempts will be tried. - -Functions that mutate the passed object can produce unexpected -behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` -for more details. - -.. versionchanged:: 1.3.0 - - The resulting dtype will reflect the return value of the passed ``func``, - see the examples below. -{examples}""" - @final class GroupByPlot(PandasObject): @@ -764,7 +461,6 @@ def f(self): class BaseGroupBy(PandasObject, SelectionMixin[NDFrameT], GroupByIndexingMixin): _hidden_attrs = PandasObject._hidden_attrs | { "as_index", - "axis", "dropna", "exclusions", "grouper", @@ -776,7 +472,6 @@ class BaseGroupBy(PandasObject, SelectionMixin[NDFrameT], GroupByIndexingMixin): "sort", } - axis: AxisInt _grouper: ops.BaseGrouper keys: _KeysArgType | None = None level: IndexLabel | None = None @@ -784,7 +479,7 @@ class BaseGroupBy(PandasObject, SelectionMixin[NDFrameT], GroupByIndexingMixin): @final def __len__(self) -> int: - return len(self.groups) + return self._grouper.ngroups @final def __repr__(self) -> str: @@ -793,27 +488,30 @@ def __repr__(self) -> str: @final @property - def grouper(self) -> ops.BaseGrouper: - warnings.warn( - f"{type(self).__name__}.grouper is deprecated and will be removed in a " - "future version of pandas.", - category=FutureWarning, - stacklevel=find_stack_level(), - ) - return self._grouper - - @final - @property - def groups(self) -> dict[Hashable, np.ndarray]: + def groups(self) -> dict[Hashable, Index]: """ Dict {group name -> group labels}. + This property provides a dictionary representation of the groupings formed + during a groupby operation, where each key represents a unique group value from + the specified column(s), and each value is a list of index labels + that belong to that group. 
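The ``__len__`` change above delegates straight to the grouper, and ``groups`` now advertises ``Index`` values; a sketch::

    import pandas as pd

    df = pd.DataFrame({"a": [1, 1, 7], "b": [2, 5, 8]})
    gb = df.groupby("a")

    n = len(gb)          # 2: the grouper's ngroups, one per group
    mapping = gb.groups  # maps each group label to an Index of row labels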
+ + See Also + -------- + core.groupby.DataFrameGroupBy.get_group : Retrieve group from a + ``DataFrameGroupBy`` object with provided name. + core.groupby.SeriesGroupBy.get_group : Retrieve group from a + ``SeriesGroupBy`` object with provided name. + core.resample.Resampler.get_group : Retrieve group from a + ``Resampler`` object with provided name. + Examples -------- For SeriesGroupBy: - >>> lst = ['a', 'a', 'b'] + >>> lst = ["a", "a", "b"] >>> ser = pd.Series([1, 2, 3], index=lst) >>> ser a 1 @@ -832,22 +530,36 @@ def groups(self) -> dict[Hashable, np.ndarray]: 0 1 2 3 1 1 5 6 2 7 8 9 - >>> df.groupby(by=["a"]).groups + >>> df.groupby(by="a").groups {1: [0, 1], 7: [2]} For Resampler: - >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( - ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser = pd.Series( + ... [1, 2, 3, 4], + ... index=pd.DatetimeIndex( + ... ["2023-01-01", "2023-01-15", "2023-02-01", "2023-02-15"] + ... ), + ... ) >>> ser 2023-01-01 1 2023-01-15 2 2023-02-01 3 2023-02-15 4 dtype: int64 - >>> ser.resample('MS').groups - {Timestamp('2023-01-01 00:00:00'): 2, Timestamp('2023-02-01 00:00:00'): 4} + >>> ser.resample("MS").groups + {Timestamp('2023-01-01 00:00:00'): np.int64(2), + Timestamp('2023-02-01 00:00:00'): np.int64(4)} """ + if isinstance(self.keys, list) and len(self.keys) == 1: + warnings.warn( + "`groups` by one element list returns scalar is deprecated " + "and will be removed. In a future version `groups` by one element " + "list will return tuple. Use ``df.groupby(by='a').groups`` " + "instead of ``df.groupby(by=['a']).groups`` to avoid this warning", + FutureWarning, + stacklevel=find_stack_level(), + ) return self._grouper.groups @final @@ -861,12 +573,28 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: """ Dict {group name -> group indices}. + The dictionary keys represent the group labels (e.g., timestamps for a + time-based resampling operation), and the values are arrays of integer + positions indicating where the elements of each group are located in the + original data. This property is particularly useful when working with + resampled data, as it provides insight into how the original time-series data + has been grouped. + + See Also + -------- + core.groupby.DataFrameGroupBy.indices : Provides a mapping of group rows to + positions of the elements. + core.groupby.SeriesGroupBy.indices : Provides a mapping of group rows to + positions of the elements. + core.resample.Resampler.indices : Provides a mapping of group rows to + positions of the elements. + Examples -------- For SeriesGroupBy: - >>> lst = ['a', 'a', 'b'] + >>> lst = ["a", "a", "b"] >>> ser = pd.Series([1, 2, 3], index=lst) >>> ser a 1 @@ -879,27 +607,32 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: For DataFrameGroupBy: >>> data = [[1, 2, 3], [1, 5, 6], [7, 8, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["owl", "toucan", "eagle"]) + >>> df = pd.DataFrame( + ... data, columns=["a", "b", "c"], index=["owl", "toucan", "eagle"] + ... ) >>> df a b c owl 1 2 3 toucan 1 5 6 eagle 7 8 9 >>> df.groupby(by=["a"]).indices - {1: array([0, 1]), 7: array([2])} + {np.int64(1): array([0, 1]), np.int64(7): array([2])} For Resampler: - >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( - ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser = pd.Series( + ... [1, 2, 3, 4], + ... index=pd.DatetimeIndex( + ... ["2023-01-01", "2023-01-15", "2023-02-01", "2023-02-15"] + ... ), + ... 
) >>> ser 2023-01-01 1 2023-01-15 2 2023-02-01 3 2023-02-15 4 dtype: int64 - >>> ser.resample('MS').indices + >>> ser.resample("MS").indices defaultdict(, {Timestamp('2023-01-01 00:00:00'): [0, 1], Timestamp('2023-02-01 00:00:00'): [2, 3]}) """ @@ -947,7 +680,7 @@ def get_converter(s): ) raise ValueError(msg) from err - converters = [get_converter(s) for s in index_sample] + converters = (get_converter(s) for s in index_sample) names = (tuple(f(n) for f, n in zip(converters, name)) for name in names) else: @@ -988,6 +721,22 @@ def _selected_obj(self): def _dir_additions(self) -> set[str]: return self.obj._dir_additions() + @overload + def pipe( + self, + func: Callable[Concatenate[Self, P], T], + *args: P.args, + **kwargs: P.kwargs, + ) -> T: ... + + @overload + def pipe( + self, + func: tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, + ) -> T: ... + @Substitution( klass="GroupBy", examples=dedent( @@ -1013,14 +762,14 @@ def _dir_additions(self) -> set[str]: @Appender(_pipe_template) def pipe( self, - func: Callable[..., T] | tuple[Callable[..., T], str], - *args, - **kwargs, + func: Callable[Concatenate[Self, P], T] | tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, ) -> T: return com.pipe(self, func, *args, **kwargs) @final - def get_group(self, name, obj=None) -> DataFrame | Series: + def get_group(self, name) -> DataFrame | Series: """ Construct DataFrame from group with provided name. @@ -1028,26 +777,29 @@ def get_group(self, name, obj=None) -> DataFrame | Series: ---------- name : object The name of the group to get as a DataFrame. - obj : DataFrame, default None - The DataFrame to take the DataFrame out of. If - it is None, the object groupby was called on will - be used. - - .. deprecated:: 2.1.0 - The obj is deprecated and will be removed in a future version. - Do ``df.iloc[gb.indices.get(name)]`` - instead of ``gb.get_group(name, obj=df)``. Returns ------- - same type as obj + Series or DataFrame + Get the respective Series or DataFrame corresponding to the group provided. + + See Also + -------- + DataFrameGroupBy.groups: Dictionary representation of the groupings formed + during a groupby operation. + DataFrameGroupBy.indices: Provides a mapping of group rows to positions + of the elements. + SeriesGroupBy.groups: Dictionary representation of the groupings formed + during a groupby operation. + SeriesGroupBy.indices: Provides a mapping of group rows to positions + of the elements. Examples -------- For SeriesGroupBy: - >>> lst = ['a', 'a', 'b'] + >>> lst = ["a", "a", "b"] >>> ser = pd.Series([1, 2, 3], index=lst) >>> ser a 1 @@ -1062,8 +814,9 @@ def get_group(self, name, obj=None) -> DataFrame | Series: For DataFrameGroupBy: >>> data = [[1, 2, 3], [1, 5, 6], [7, 8, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["owl", "toucan", "eagle"]) + >>> df = pd.DataFrame( + ... data, columns=["a", "b", "c"], index=["owl", "toucan", "eagle"] + ... ) >>> df a b c owl 1 2 3 @@ -1076,15 +829,19 @@ def get_group(self, name, obj=None) -> DataFrame | Series: For Resampler: - >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( - ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser = pd.Series( + ... [1, 2, 3, 4], + ... index=pd.DatetimeIndex( + ... ["2023-01-01", "2023-01-15", "2023-02-01", "2023-02-15"] + ... ), + ... 
) >>> ser 2023-01-01 1 2023-01-15 2 2023-02-01 3 2023-02-15 4 dtype: int64 - >>> ser.resample('MS').get_group('2023-01-01') + >>> ser.resample("MS").get_group("2023-01-01") 2023-01-01 1 2023-01-15 2 dtype: int64 @@ -1097,51 +854,45 @@ def get_group(self, name, obj=None) -> DataFrame | Series: ): # GH#25971 if isinstance(name, tuple) and len(name) == 1: - # Allow users to pass tuples of length 1 to silence warning name = name[0] - elif not isinstance(name, tuple): - warnings.warn( - "When grouping with a length-1 list-like, " - "you will need to pass a length-1 tuple to get_group in a future " - "version of pandas. Pass `(name,)` instead of `name` to silence " - "this warning.", - FutureWarning, - stacklevel=find_stack_level(), - ) + else: + raise KeyError(name) inds = self._get_index(name) if not len(inds): raise KeyError(name) - - if obj is None: - indexer = inds if self.axis == 0 else (slice(None), inds) - return self._selected_obj.iloc[indexer] - else: - warnings.warn( - "obj is deprecated and will be removed in a future version. " - "Do ``df.iloc[gb.indices.get(name)]`` " - "instead of ``gb.get_group(name, obj=df)``.", - FutureWarning, - stacklevel=find_stack_level(), - ) - return obj._take_with_is_copy(inds, axis=self.axis) + return self._selected_obj.iloc[inds] @final def __iter__(self) -> Iterator[tuple[Hashable, NDFrameT]]: """ Groupby iterator. + This method provides an iterator over the groups created by the ``resample`` + or ``groupby`` operation on the object. The method yields tuples where + the first element is the label (group key) corresponding to each group or + resampled bin, and the second element is the subset of the data that falls + within that group or bin. + Returns ------- - Generator yielding sequence of (name, subsetted object) - for each group + Iterator + Generator yielding a sequence of (name, subsetted object) + for each group. + + See Also + -------- + Series.groupby : Group data by a specific key or column. + DataFrame.groupby : Group DataFrame using mapper or by columns. + DataFrame.resample : Resample a DataFrame. + Series.resample : Resample a Series. Examples -------- For SeriesGroupBy: - >>> lst = ['a', 'a', 'b'] + >>> lst = ["a", "a", "b"] >>> ser = pd.Series([1, 2, 3], index=lst) >>> ser a 1 @@ -1149,7 +900,7 @@ def __iter__(self) -> Iterator[tuple[Hashable, NDFrameT]]: b 3 dtype: int64 >>> for x, y in ser.groupby(level=0): - ... print(f'{x}\\n{y}\\n') + ... print(f"{x}\\n{y}\\n") a a 1 a 2 @@ -1168,7 +919,7 @@ def __iter__(self) -> Iterator[tuple[Hashable, NDFrameT]]: 1 1 5 6 2 7 8 9 >>> for x, y in df.groupby(by=["a"]): - ... print(f'{x}\\n{y}\\n') + ... print(f"{x}\\n{y}\\n") (1,) a b c 0 1 2 3 @@ -1179,16 +930,20 @@ def __iter__(self) -> Iterator[tuple[Hashable, NDFrameT]]: For Resampler: - >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( - ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser = pd.Series( + ... [1, 2, 3, 4], + ... index=pd.DatetimeIndex( + ... ["2023-01-01", "2023-01-15", "2023-02-01", "2023-02-15"] + ... ), + ... ) >>> ser 2023-01-01 1 2023-01-15 2 2023-02-01 3 2023-02-15 4 dtype: int64 - >>> for x, y in ser.resample('MS'): - ... print(f'{x}\\n{y}\\n') + >>> for x, y in ser.resample("MS"): + ... 
print(f"{x}\\n{y}\\n") 2023-01-01 00:00:00 2023-01-01 1 2023-01-15 2 @@ -1200,19 +955,12 @@ def __iter__(self) -> Iterator[tuple[Hashable, NDFrameT]]: """ keys = self.keys level = self.level - result = self._grouper.get_iterator(self._selected_obj, axis=self.axis) - # error: Argument 1 to "len" has incompatible type "Hashable"; expected "Sized" - if is_list_like(level) and len(level) == 1: # type: ignore[arg-type] - # GH 51583 - warnings.warn( - "Creating a Groupby object with a length-1 list-like " - "level parameter will yield indexes as tuples in a future version. " - "To keep indexes as scalars, create Groupby objects with " - "a scalar level parameter instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - if isinstance(keys, list) and len(keys) == 1: + result = self._grouper.get_iterator(self._selected_obj) + # mypy: Argument 1 to "len" has incompatible type "Hashable"; expected "Sized" + if ( + (is_list_like(level) and len(level) == 1) # type: ignore[arg-type] + or (isinstance(keys, list) and len(keys) == 1) + ): # GH#42795 - when keys is a list, return tuples even when length is 1 result = (((key,), group) for key, group in result) return result @@ -1237,7 +985,6 @@ class GroupBy(BaseGroupBy[NDFrameT]): Parameters ---------- obj : pandas object - axis : int, default 0 level : int, default None Level of MultiIndex groupings : list of Grouping objects @@ -1266,7 +1013,7 @@ class GroupBy(BaseGroupBy[NDFrameT]): :: - grouped = obj.groupby(keys, axis=axis) + grouped = obj.groupby(keys) for key, group in grouped: # do something with the data @@ -1298,7 +1045,6 @@ def __init__( self, obj: NDFrameT, keys: _KeysArgType | None = None, - axis: Axis = 0, level: IndexLabel | None = None, grouper: ops.BaseGrouper | None = None, exclusions: frozenset[Hashable] | None = None, @@ -1306,7 +1052,7 @@ def __init__( as_index: bool = True, sort: bool = True, group_keys: bool = True, - observed: bool | lib.NoDefault = lib.no_default, + observed: bool = False, dropna: bool = True, ) -> None: self._selection = selection @@ -1314,11 +1060,6 @@ def __init__( assert isinstance(obj, NDFrame), type(obj) self.level = level - - if not as_index: - if axis != 0: - raise ValueError("as_index=False only valid for axis=0") - self.as_index = as_index self.keys = keys self.sort = sort @@ -1329,28 +1070,14 @@ def __init__( grouper, exclusions, obj = get_grouper( obj, keys, - axis=axis, level=level, sort=sort, - observed=False if observed is lib.no_default else observed, + observed=observed, dropna=self.dropna, ) - if observed is lib.no_default: - if any(ping._passed_categorical for ping in grouper.groupings): - warnings.warn( - "The default of observed=False is deprecated and will be changed " - "to True in a future version of pandas. Pass observed=False to " - "retain current behavior or observed=True to adopt the future " - "default and silence this warning.", - FutureWarning, - stacklevel=find_stack_level(), - ) - observed = False self.observed = observed - self.obj = obj - self.axis = obj._get_axis_number(axis) self._grouper = grouper self.exclusions = frozenset(exclusions) if exclusions else frozenset() @@ -1364,50 +1091,10 @@ def __getattr__(self, attr: str): f"'{type(self).__name__}' object has no attribute '{attr}'" ) - @final - def _deprecate_axis(self, axis: int, name: str) -> None: - if axis == 1: - warnings.warn( - f"{type(self).__name__}.{name} with axis=1 is deprecated and " - "will be removed in a future version. 
Operate on the un-grouped " - "DataFrame instead", - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - warnings.warn( - f"The 'axis' keyword in {type(self).__name__}.{name} is deprecated " - "and will be removed in a future version. " - "Call without passing 'axis' instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - @final def _op_via_apply(self, name: str, *args, **kwargs): """Compute the result of an operation by using GroupBy's apply.""" f = getattr(type(self._obj_with_exclusions), name) - sig = inspect.signature(f) - - if "axis" in kwargs and kwargs["axis"] is not lib.no_default: - axis = self.obj._get_axis_number(kwargs["axis"]) - self._deprecate_axis(axis, name) - elif "axis" in kwargs: - # exclude skew here because that was already defaulting to lib.no_default - # before this deprecation was instituted - if name == "skew": - pass - elif name == "fillna": - # maintain the behavior from before the deprecation - kwargs["axis"] = None - else: - kwargs["axis"] = 0 - - # a little trickery for aggregation functions that need an axis - # argument - if "axis" in sig.parameters: - if kwargs.get("axis", None) is None or kwargs.get("axis") is lib.no_default: - kwargs["axis"] = self.axis def curried(x): return f(x, *args, **kwargs) @@ -1456,24 +1143,21 @@ def _concat_objects( result = concat( values, - axis=self.axis, + axis=0, keys=group_keys, levels=group_levels, names=group_names, sort=False, ) else: - # GH5610, returns a MI, with the first level being a - # range index - keys = list(range(len(values))) - result = concat(values, axis=self.axis, keys=keys) + result = concat(values, axis=0) elif not not_indexed_same: - result = concat(values, axis=self.axis) + result = concat(values, axis=0) - ax = self._selected_obj._get_axis(self.axis) + ax = self._selected_obj.index if self.dropna: - labels = self._grouper.group_info[0] + labels = self._grouper.ids mask = labels != -1 ax = ax[mask] @@ -1483,16 +1167,16 @@ def _concat_objects( # so we resort to this # GH 14776, 30667 # TODO: can we reuse e.g. _reindex_non_unique? - if ax.has_duplicates and not result.axes[self.axis].equals(ax): + if ax.has_duplicates and not result.axes[0].equals(ax): # e.g. 
test_category_order_transformer target = algorithms.unique1d(ax._values) indexer, _ = result.index.get_indexer_non_unique(target) - result = result.take(indexer, axis=self.axis) + result = result.take(indexer, axis=0) else: - result = result.reindex(ax, axis=self.axis, copy=False) + result = result.reindex(ax, axis=0) else: - result = concat(values, axis=self.axis) + result = concat(values, axis=0) if self.obj.ndim == 1: name = self.obj.name @@ -1513,68 +1197,65 @@ def _set_result_index_ordered( # set the result index on the passed values object and # return the new object, xref 8046 - obj_axis = self.obj._get_axis(self.axis) + index = self.obj.index if self._grouper.is_monotonic and not self._grouper.has_dropped_na: # shortcut if we have an already ordered grouper - result = result.set_axis(obj_axis, axis=self.axis, copy=False) + result = result.set_axis(index, axis=0) return result # row order is scrambled => sort the rows by position in original index - original_positions = Index(self._grouper.result_ilocs()) - result = result.set_axis(original_positions, axis=self.axis, copy=False) - result = result.sort_index(axis=self.axis) + original_positions = Index(self._grouper.result_ilocs) + result = result.set_axis(original_positions, axis=0) + result = result.sort_index(axis=0) if self._grouper.has_dropped_na: # Add back in any missing rows due to dropna - index here is integral # with values referring to the row of the input so can use RangeIndex - result = result.reindex(RangeIndex(len(obj_axis)), axis=self.axis) - result = result.set_axis(obj_axis, axis=self.axis, copy=False) + result = result.reindex(default_index(len(index)), axis=0) + result = result.set_axis(index, axis=0) return result @final - def _insert_inaxis_grouper(self, result: Series | DataFrame) -> DataFrame: + def _insert_inaxis_grouper( + self, result: Series | DataFrame, qs: npt.NDArray[np.float64] | None = None + ) -> DataFrame: if isinstance(result, Series): result = result.to_frame() + n_groupings = len(self._grouper.groupings) + + if qs is not None: + result.insert( + 0, f"level_{n_groupings}", np.tile(qs, len(result) // len(qs)) + ) + # zip in reverse so we can always insert at loc 0 - columns = result.columns - for name, lev, in_axis in zip( - reversed(self._grouper.names), - reversed(self._grouper.get_group_levels()), - reversed([grp.in_axis for grp in self._grouper.groupings]), + for level, (name, lev) in enumerate( + zip( + reversed(self._grouper.names), + self._grouper.get_group_levels(), + ) ): + if name is None: + # Behave the same as .reset_index() when a level is unnamed + name = ( + "index" + if n_groupings == 1 and qs is None + else f"level_{n_groupings - level - 1}" + ) + # GH #28549 # When using .apply(-), name will be in columns already - if name not in columns: - if in_axis: + if name not in result.columns: + # if in_axis: + if qs is None: result.insert(0, name, lev) else: - msg = ( - "A grouping was used that is not in the columns of the " - "DataFrame and so was excluded from the result. This grouping " - "will be included in a future version of pandas. Add the " - "grouping as a column of the DataFrame to silence this warning." 
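A sketch of what the ``qs`` plumbing above appears to enable: with ``as_index=False``, a multi-quantile result carries the quantile values as an inserted column, named like ``level_1`` when that level is unnamed::

    import pandas as pd

    df = pd.DataFrame({"key": ["a", "a", "b", "b"], "x": [1.0, 3.0, 2.0, 4.0]})
    out = df.groupby("key", as_index=False).quantile([0.25, 0.75])
    # expected columns: key, level_1 (the quantile), x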
- ) - warnings.warn( - message=msg, - category=FutureWarning, - stacklevel=find_stack_level(), - ) + result.insert(0, name, Index(np.repeat(lev, len(qs)))) return result - @final - def _maybe_transpose_result(self, result: NDFrameT) -> NDFrameT: - if self.axis == 1: - # Only relevant for DataFrameGroupBy, no-op for SeriesGroupBy - result = result.T - if result.index.equals(self.obj.index): - # Retain e.g. DatetimeIndex/TimedeltaIndex freq - # e.g. test_groupby_crash_on_nunique - result.index = self.obj.index.copy() - return result - @final def _wrap_aggregated_output( self, @@ -1598,24 +1279,19 @@ def _wrap_aggregated_output( if not self.as_index: # `not self.as_index` is only relevant for DataFrameGroupBy, # enforced in __init__ - result = self._insert_inaxis_grouper(result) + result = self._insert_inaxis_grouper(result, qs=qs) result = result._consolidate() - index = Index(range(self._grouper.ngroups)) + result.index = default_index(len(result)) else: index = self._grouper.result_index + if qs is not None: + # We get here with len(qs) != 1 and not self.as_index + # in test_pass_args_kwargs + index = _insert_quantile_level(index, qs) + result.index = index - if qs is not None: - # We get here with len(qs) != 1 and not self.as_index - # in test_pass_args_kwargs - index = _insert_quantile_level(index, qs) - - result.index = index - - # error: Argument 1 to "_maybe_transpose_result" of "GroupBy" has - # incompatible type "Union[Series, DataFrame]"; expected "NDFrameT" - res = self._maybe_transpose_result(result) # type: ignore[arg-type] - return self._reindex_output(res, qs=qs) + return result def _wrap_applied_output( self, @@ -1631,11 +1307,11 @@ def _wrap_applied_output( @final def _numba_prep(self, data: DataFrame): - ids, _, ngroups = self._grouper.group_info - sorted_index = self._grouper._sort_idx + ngroups = self._grouper.ngroups + sorted_index = self._grouper.result_ilocs sorted_ids = self._grouper._sorted_ids - sorted_data = data.take(sorted_index, axis=self.axis).to_numpy() + sorted_data = data.take(sorted_index, axis=0).to_numpy() # GH 46867 index_data = data.index if isinstance(index_data, MultiIndex): @@ -1671,8 +1347,6 @@ def _numba_agg_general( raise NotImplementedError( "as_index=False is not supported. Use .reset_index() instead." ) - if self.axis == 1: - raise NotImplementedError("axis=1 is not supported.") data = self._obj_with_exclusions df = data if data.ndim == 2 else data.to_frame() @@ -1685,7 +1359,7 @@ def _numba_agg_general( ) # Pass group ids to kernel directly if it can handle it # (This is faster since it doesn't require a sort) - ids, _, _ = self._grouper.group_info + ids = self._grouper.ids ngroups = self._grouper.ngroups res_mgr = df._mgr.apply( @@ -1711,12 +1385,16 @@ def _transform_with_numba(self, func, *args, engine_kwargs=None, **kwargs): data and indices into a Numba jitted function. 
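The numba helpers above share a sort/apply/unsort pattern; a NumPy-only sketch of how contiguous per-group blocks are derived from group ids::

    import numpy as np

    ids = np.array([1, 0, 1, 0, 2])         # group id per row
    order = np.argsort(ids, kind="stable")  # stable: preserves in-group order
    sorted_ids = ids[order]
    ngroups = 3
    starts = np.searchsorted(sorted_ids, np.arange(ngroups), side="left")
    ends = np.searchsorted(sorted_ids, np.arange(ngroups), side="right")
    # a kernel consumes data[order] slice-by-slice; results are restored to
    # the original row order with np.argsort(order)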
""" data = self._obj_with_exclusions + index_sorting = self._grouper.result_ilocs df = data if data.ndim == 2 else data.to_frame() starts, ends, sorted_index, sorted_data = self._numba_prep(df) numba_.validate_udf(func) + args, kwargs = prepare_function_arguments( + func, args, kwargs, num_required_args=2 + ) numba_transform_func = numba_.generate_numba_transform_func( - func, **get_jit_arguments(engine_kwargs, kwargs) + func, **get_jit_arguments(engine_kwargs) ) result = numba_transform_func( sorted_data, @@ -1728,7 +1406,7 @@ def _transform_with_numba(self, func, *args, engine_kwargs=None, **kwargs): ) # result values needs to be resorted to their original positions since we # evaluated the data sorted by group - result = result.take(np.argsort(sorted_index), axis=0) + result = result.take(np.argsort(index_sorting), axis=0) index = data.index if data.ndim == 1: result_kwargs = {"name": data.name} @@ -1751,8 +1429,11 @@ def _aggregate_with_numba(self, func, *args, engine_kwargs=None, **kwargs): starts, ends, sorted_index, sorted_data = self._numba_prep(df) numba_.validate_udf(func) + args, kwargs = prepare_function_arguments( + func, args, kwargs, num_required_args=2 + ) numba_agg_func = numba_.generate_numba_agg_func( - func, **get_jit_arguments(engine_kwargs, kwargs) + func, **get_jit_arguments(engine_kwargs) ) result = numba_agg_func( sorted_data, @@ -1777,18 +1458,148 @@ def _aggregate_with_numba(self, func, *args, engine_kwargs=None, **kwargs): # ----------------------------------------------------------------- # apply/agg/transform - @Appender( - _apply_docs["template"].format( - input="dataframe", examples=_apply_docs["dataframe_examples"] - ) - ) - def apply(self, func, *args, include_groups: bool = True, **kwargs) -> NDFrameT: - orig_func = func - func = com.is_builtin_func(func) - if orig_func != func: - alias = com._builtin_table_alias[orig_func] - warn_alias_replacement(self, orig_func, alias) + def apply(self, func, *args, include_groups: bool = False, **kwargs) -> NDFrameT: + """ + Apply function ``func`` group-wise and combine the results together. + + The function passed to ``apply`` must take a dataframe as its first + argument and return a DataFrame, Series or scalar. ``apply`` will + then take care of combining the results back together into a single + dataframe or series. ``apply`` is therefore a highly flexible + grouping method. + + While ``apply`` is a very flexible method, its downside is that + using it can be quite a bit slower than using more specific methods + like ``agg`` or ``transform``. Pandas offers a wide range of method that will + be much faster than using ``apply`` for their specific purposes, so try to + use them before reaching for ``apply``. + + Parameters + ---------- + func : callable + A callable that takes a dataframe as its first argument, and + returns a dataframe, a series or a scalar. In addition the + callable may take positional and keyword arguments. + + *args : tuple + Optional positional arguments to pass to ``func``. + + include_groups : bool, default False + When True, will attempt to apply ``func`` to the groupings in + the case that they are columns of the DataFrame. If this raises a + TypeError, the result will be computed with the groupings excluded. + When False, the groupings will be excluded when applying ``func``. + + .. versionadded:: 2.2.0 + + .. versionchanged:: 3.0.0 + + The default changed from True to False, and True is no longer allowed. + + **kwargs : dict + Optional keyword arguments to pass to ``func``. 
+
+        Returns
+        -------
+        Series or DataFrame
+            A pandas object with the result of applying ``func`` to each group.
+
+        See Also
+        --------
+        pipe : Apply function to the full GroupBy object instead of to each
+            group.
+        aggregate : Apply aggregate function to the GroupBy object.
+        transform : Apply function column-by-column to the GroupBy object.
+        Series.apply : Apply a function to a Series.
+        DataFrame.apply : Apply a function to each row or column of a DataFrame.
+
+        Notes
+        -----
+
+        .. versionchanged:: 1.3.0
+
+            The resulting dtype will reflect the return value of the passed ``func``,
+            see the examples below.
+
+        Functions that mutate the passed object can produce unexpected
+        behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
+        for more details.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({"A": "a a b".split(), "B": [1, 2, 3], "C": [4, 6, 5]})
+        >>> g1 = df.groupby("A", group_keys=False)
+        >>> g2 = df.groupby("A", group_keys=True)
+
+        Notice that ``g1`` and ``g2`` have two groups, ``a`` and ``b``, and only
+        differ in their ``group_keys`` argument. Calling `apply` in various ways,
+        we can get different grouping results:
+
+        Example 1: below, the function passed to `apply` takes a DataFrame as
+        its argument and returns a DataFrame. `apply` combines the result for
+        each group together into a new DataFrame:
+
+        >>> g1[["B", "C"]].apply(lambda x: x / x.sum())
+                  B    C
+        0  0.333333  0.4
+        1  0.666667  0.6
+        2  1.000000  1.0
+
+        In the above, the groups are not part of the index. We can have them included
+        by using ``g2`` where ``group_keys=True``:
+
+        >>> g2[["B", "C"]].apply(lambda x: x / x.sum())
+                    B    C
+        A
+        a 0  0.333333  0.4
+          1  0.666667  0.6
+        b 2  1.000000  1.0
+
+        Example 2: The function passed to `apply` takes a DataFrame as
+        its argument and returns a Series. `apply` combines the result for
+        each group together into a new DataFrame.
+
+        .. versionchanged:: 1.3.0
+
+            The resulting dtype will reflect the return value of the passed ``func``.
+
+        >>> g1[["B", "C"]].apply(lambda x: x.astype(float).max() - x.min())
+             B    C
+        A
+        a  1.0  2.0
+        b  0.0  0.0
+
+        >>> g2[["B", "C"]].apply(lambda x: x.astype(float).max() - x.min())
+             B    C
+        A
+        a  1.0  2.0
+        b  0.0  0.0
+
+        The ``group_keys`` argument has no effect here because the result is not
+        like-indexed (i.e. :ref:`a transform <groupby.transform>`) when compared
+        to the input.
+
+        Example 3: The function passed to `apply` takes a DataFrame as
+        its argument and returns a scalar. `apply` combines the result for
+        each group together into a Series, including setting the index as
+        appropriate:
+
+        >>> g1.apply(lambda x: x.C.max() - x.B.min())
+        A
+        a    5
+        b    2
+        dtype: int64
+
+        Example 4: The function passed to ``apply`` returns ``None`` for one of the
+        groups.
This group is filtered from the result: + + >>> g1.apply(lambda x: None if x.iloc[0, 0] == 3 else x) + B C + 0 1 4 + 1 2 6 + """ + if include_groups: + raise ValueError("include_groups=True is no longer allowed.") if isinstance(func, str): if hasattr(self, func): res = getattr(self, func) @@ -1815,37 +1626,7 @@ def f(g): else: f = func - if not include_groups: - return self._python_apply_general(f, self._obj_with_exclusions) - - # ignore SettingWithCopy here in case the user mutates - with option_context("mode.chained_assignment", None): - try: - result = self._python_apply_general(f, self._selected_obj) - if ( - not isinstance(self.obj, Series) - and self._selection is None - and self._selected_obj.shape != self._obj_with_exclusions.shape - ): - warnings.warn( - message=_apply_groupings_depr.format( - type(self).__name__, "apply" - ), - category=FutureWarning, - stacklevel=find_stack_level(), - ) - except TypeError: - # gh-20949 - # try again, with .apply acting as a filtering - # operation, by excluding the grouping column - # This would normally not be triggered - # except if the udf is trying an operation that - # fails on *some* columns, e.g. a numeric operation - # on a string grouper column - - return self._python_apply_general(f, self._obj_with_exclusions) - - return result + return self._python_apply_general(f, self._obj_with_exclusions) @final def _python_apply_general( @@ -1882,7 +1663,7 @@ def _python_apply_general( Series or DataFrame data after applying f """ - values, mutated = self._grouper.apply_groupwise(f, data, self.axis) + values, mutated = self._grouper.apply_groupwise(f, data) if not_indexed_same is None: not_indexed_same = mutated @@ -1945,8 +1726,13 @@ def _agg_py_fallback( # preserve the kind of exception that raised raise type(err)(msg) from err - if ser.dtype == object: + dtype = ser.dtype + if dtype == object: res_values = res_values.astype(object, copy=False) + elif is_string_dtype(dtype): + # mypy doesn't infer dtype is an ExtensionDtype + string_array_cls = dtype.construct_array_type() # type: ignore[union-attr] + res_values = string_array_cls._from_sequence(res_values, dtype=dtype) # If we are DataFrameGroupBy and went through a SeriesGroupByPath # then we need to reshape @@ -2000,23 +1786,14 @@ def array_func(values: ArrayLike) -> ArrayLike: if how in ["idxmin", "idxmax"]: res = self._wrap_idxmax_idxmin(res) out = self._wrap_aggregated_output(res) - if self.axis == 1: - out = out.infer_objects(copy=False) return out - def _cython_transform( - self, how: str, numeric_only: bool = False, axis: AxisInt = 0, **kwargs - ): + def _cython_transform(self, how: str, numeric_only: bool = False, **kwargs): raise AbstractMethodError(self) @final def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): # optimized transforms - orig_func = func - func = com.get_cython_func(func) or func - if orig_func != func: - warn_alias_replacement(self, orig_func, func) - if not isinstance(func, str): return self._transform_general(func, engine, engine_kwargs, *args, **kwargs) @@ -2032,24 +1809,40 @@ def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): else: # i.e. func in base.reduction_kernels + if self.observed: + return self._reduction_kernel_transform( + func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs + ) - # GH#30918 Use _transform_fast only when we know func is an aggregation - # If func is a reduction, we need to broadcast the - # result to the whole group. 
Compute func result - # and deal with possible broadcasting below. - with com.temp_setattr(self, "as_index", True): - # GH#49834 - result needs groups in the index for - # _wrap_transform_fast_result - if func in ["idxmin", "idxmax"]: - func = cast(Literal["idxmin", "idxmax"], func) - result = self._idxmax_idxmin(func, True, *args, **kwargs) - else: - if engine is not None: - kwargs["engine"] = engine - kwargs["engine_kwargs"] = engine_kwargs - result = getattr(self, func)(*args, **kwargs) + with ( + com.temp_setattr(self, "observed", True), + com.temp_setattr(self, "_grouper", self._grouper.observed_grouper), + ): + return self._reduction_kernel_transform( + func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs + ) - return self._wrap_transform_fast_result(result) + @final + def _reduction_kernel_transform( + self, func, *args, engine=None, engine_kwargs=None, **kwargs + ): + # GH#30918 Use _transform_fast only when we know func is an aggregation + # If func is a reduction, we need to broadcast the + # result to the whole group. Compute func result + # and deal with possible broadcasting below. + with com.temp_setattr(self, "as_index", True): + # GH#49834 - result needs groups in the index for + # _wrap_transform_fast_result + if func in ["idxmin", "idxmax"]: + func = cast(Literal["idxmin", "idxmax"], func) + result = self._idxmax_idxmin(func, True, *args, **kwargs) + else: + if engine is not None: + kwargs["engine"] = engine + kwargs["engine_kwargs"] = engine_kwargs + result = getattr(self, func)(*args, **kwargs) + + return self._wrap_transform_fast_result(result) @final def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT: @@ -2059,8 +1852,8 @@ def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT: obj = self._obj_with_exclusions # for each col, reshape to size of original frame by take operation - ids, _, _ = self._grouper.group_info - result = result.reindex(self._grouper.result_index, axis=self.axis, copy=False) + ids = self._grouper.ids + result = result.reindex(self._grouper.result_index, axis=0) if self.obj.ndim == 1: # i.e. SeriesGroupBy @@ -2068,15 +1861,12 @@ def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT: output = obj._constructor(out, index=obj.index, name=obj.name) else: # `.size()` gives Series output on DataFrame input, need axis 0 - axis = 0 if result.ndim == 1 else self.axis # GH#46209 # Don't convert indices: negative indices need to give rise # to null values in the result - new_ax = result.axes[axis].take(ids) - output = result._reindex_with_indexers( - {axis: (new_ax, ids)}, allow_dups=True, copy=False - ) - output = output.set_axis(obj._get_axis(self.axis), axis=axis) + new_ax = result.index.take(ids) + output = result._reindex_with_indexers({0: (new_ax, ids)}, allow_dups=True) + output = output.set_axis(obj.index, axis=0) return output # ----------------------------------------------------------------- @@ -2089,13 +1879,13 @@ def _apply_filter(self, indices, dropna): else: indices = np.sort(np.concatenate(indices)) if dropna: - filtered = self._selected_obj.take(indices, axis=self.axis) + filtered = self._selected_obj.take(indices, axis=0) else: mask = np.empty(len(self._selected_obj.index), dtype=bool) mask.fill(False) mask[indices.astype(int)] = True # mask fails to broadcast when passed to where; broadcast manually. 
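For the reduction-kernel transform path above: the per-group reduction is computed once, then broadcast back to the original rows by taking along the group ids (``result.index.take(ids)``). The observable behavior, as a small sketch:

>>> import pandas as pd
>>> df = pd.DataFrame({"key": ["a", "b", "a"], "val": [1, 2, 3]})
>>> df.groupby("key")["val"].transform("sum")  # group sums broadcast back to rows
0    4
1    2
2    4
Name: val, dtype: int64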
- mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T + mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T # type: ignore[assignment] filtered = self._selected_obj.where(mask) # Fill with NaNs. return filtered @@ -2112,7 +1902,8 @@ def _cumcount_array(self, ascending: bool = True) -> np.ndarray: this is currently implementing sort=False (though the default is sort=True) for groupby in general """ - ids, _, ngroups = self._grouper.group_info + ids = self._grouper.ids + ngroups = self._grouper.ngroups sorter = get_group_index_sorter(ids, ngroups) ids, count = ids[sorter], len(ids) @@ -2170,7 +1961,7 @@ def any(self, skipna: bool = True) -> NDFrameT: -------- For SeriesGroupBy: - >>> lst = ['a', 'a', 'b'] + >>> lst = ["a", "a", "b"] >>> ser = pd.Series([1, 2, 0], index=lst) >>> ser a 1 @@ -2185,8 +1976,9 @@ def any(self, skipna: bool = True) -> NDFrameT: For DataFrameGroupBy: >>> data = [[1, 0, 3], [1, 0, 6], [7, 1, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["ostrich", "penguin", "parrot"]) + >>> df = pd.DataFrame( + ... data, columns=["a", "b", "c"], index=["ostrich", "penguin", "parrot"] + ... ) >>> df a b c ostrich 1 0 3 @@ -2227,7 +2019,7 @@ def all(self, skipna: bool = True) -> NDFrameT: For SeriesGroupBy: - >>> lst = ['a', 'a', 'b'] + >>> lst = ["a", "a", "b"] >>> ser = pd.Series([1, 2, 0], index=lst) >>> ser a 1 @@ -2242,8 +2034,9 @@ def all(self, skipna: bool = True) -> NDFrameT: For DataFrameGroupBy: >>> data = [[1, 0, 3], [1, 5, 6], [7, 8, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["ostrich", "penguin", "parrot"]) + >>> df = pd.DataFrame( + ... data, columns=["a", "b", "c"], index=["ostrich", "penguin", "parrot"] + ... ) >>> df a b c ostrich 1 0 3 @@ -2277,7 +2070,7 @@ def count(self) -> NDFrameT: -------- For SeriesGroupBy: - >>> lst = ['a', 'a', 'b'] + >>> lst = ["a", "a", "b"] >>> ser = pd.Series([1, 2, np.nan], index=lst) >>> ser a 1.0 @@ -2292,8 +2085,9 @@ def count(self) -> NDFrameT: For DataFrameGroupBy: >>> data = [[1, np.nan, 3], [1, np.nan, 6], [7, 8, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["cow", "horse", "bull"]) + >>> df = pd.DataFrame( + ... data, columns=["a", "b", "c"], index=["cow", "horse", "bull"] + ... ) >>> df a b c cow 1 NaN 3 @@ -2307,21 +2101,26 @@ def count(self) -> NDFrameT: For Resampler: - >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( - ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser = pd.Series( + ... [1, 2, 3, 4], + ... index=pd.DatetimeIndex( + ... ["2023-01-01", "2023-01-15", "2023-02-01", "2023-02-15"] + ... ), + ... ) >>> ser 2023-01-01 1 2023-01-15 2 2023-02-01 3 2023-02-15 4 dtype: int64 - >>> ser.resample('MS').count() + >>> ser.resample("MS").count() 2023-01-01 2 2023-02-01 2 Freq: MS, dtype: int64 """ data = self._get_data_to_aggregate() - ids, _, ngroups = self._grouper.group_info + ids = self._grouper.ids + ngroups = self._grouper.ngroups mask = ids != -1 is_series = data.ndim == 1 @@ -2352,15 +2151,9 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike: new_mgr = data.grouped_reduce(hfunc) new_obj = self._wrap_agged_manager(new_mgr) + result = self._wrap_aggregated_output(new_obj) - # If we are grouping on categoricals we want unobserved categories to - # return zero, rather than the default of NaN which the reindexing in - # _wrap_aggregated_output() returns. GH 35028 - # e.g. 
test_dataframe_groupby_on_2_categoricals_when_observed_is_false
-        with com.temp_setattr(self, "observed", True):
-            result = self._wrap_aggregated_output(new_obj)
-
-        return self._reindex_output(result, fill_value=0)
+        return result

    @final
    @Substitution(name="groupby")
@@ -2368,6 +2161,7 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike:
    def mean(
        self,
        numeric_only: bool = False,
+        skipna: bool = True,
        engine: Literal["cython", "numba"] | None = None,
        engine_kwargs: dict[str, bool] | None = None,
    ):
@@ -2383,6 +2177,11 @@ def mean(

            numeric_only no longer accepts ``None`` and defaults to ``False``.

+        skipna : bool, default True
+            Exclude NA/null values. If an entire group is NA, the result will be NA.
+
+            .. versionadded:: 3.0.0
+
        engine : str, default None
            * ``'cython'`` : Runs the operation through C-extensions from cython.
            * ``'numba'`` : Runs the operation through JIT compiled code from numba.
@@ -2403,17 +2202,19 @@ def mean(
        Returns
        -------
        pandas.Series or pandas.DataFrame
+            Mean of values within each group. Same object type as the caller.
        %(see_also)s
        Examples
        --------
-        >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2],
-        ...                    'B': [np.nan, 2, 3, 4, 5],
-        ...                    'C': [1, 2, 1, 1, 2]}, columns=['A', 'B', 'C'])
+        >>> df = pd.DataFrame(
+        ...     {"A": [1, 1, 2, 1, 2], "B": [np.nan, 2, 3, 4, 5], "C": [1, 2, 1, 1, 2]},
+        ...     columns=["A", "B", "C"],
+        ... )

        Groupby one column and return the mean of the remaining columns in
        each group.

-        >>> df.groupby('A').mean()
+        >>> df.groupby("A").mean()
             B         C
        A
        1  3.0  1.333333
@@ -2421,7 +2222,7 @@ def mean(

        Groupby two columns and return the mean of the remaining column.

-        >>> df.groupby(['A', 'B']).mean()
+        >>> df.groupby(["A", "B"]).mean()
               C
        A B
        1 2.0  2.0
@@ -2432,7 +2233,7 @@ def mean(

        Groupby one column and return the mean of only a particular column in
        the group.

-        >>> df.groupby('A')['B'].mean()
+        >>> df.groupby("A")["B"].mean()
        A
        1    3.0
        2    4.0
@@ -2447,17 +2248,21 @@ def mean(
                executor.float_dtype_mapping,
                engine_kwargs,
                min_periods=0,
+                skipna=skipna,
            )
        else:
            result = self._cython_agg_general(
                "mean",
-                alt=lambda x: Series(x, copy=False).mean(numeric_only=numeric_only),
+                alt=lambda x: Series(x, copy=False).mean(
+                    numeric_only=numeric_only, skipna=skipna
+                ),
                numeric_only=numeric_only,
+                skipna=skipna,
            )
            return result.__finalize__(self.obj, method="groupby")

    @final
-    def median(self, numeric_only: bool = False) -> NDFrameT:
+    def median(self, numeric_only: bool = False, skipna: bool = True) -> NDFrameT:
        """
        Compute median of groups, excluding missing values.

@@ -2472,16 +2277,27 @@ def median(

            numeric_only no longer accepts ``None`` and defaults to False.

+        skipna : bool, default True
+            Exclude NA/null values. If an entire group is NA, the result will be NA.
+
+            .. versionadded:: 3.0.0
+
        Returns
        -------
        Series or DataFrame
            Median of values within each group.

+        See Also
+        --------
+        Series.groupby : Apply a function groupby to a Series.
+        DataFrame.groupby : Apply a function groupby to each row or column of a
+            DataFrame.
+
        Examples
        --------
        For SeriesGroupBy:

-        >>> lst = ['a', 'a', 'a', 'b', 'b', 'b']
+        >>> lst = ["a", "a", "a", "b", "b", "b"]
        >>> ser = pd.Series([7, 2, 8, 4, 3, 3], index=lst)
        >>> ser
        a    7
@@ -2498,9 +2314,10 @@ def median(

        For DataFrameGroupBy:

-        >>> data = {'a': [1, 3, 5, 7, 7, 8, 3], 'b': [1, 4, 8, 4, 4, 2, 1]}
-        >>> df = pd.DataFrame(data, index=['dog', 'dog', 'dog',
-        ...
'mouse', 'mouse', 'mouse', 'mouse']) + >>> data = {"a": [1, 3, 5, 7, 7, 8, 3], "b": [1, 4, 8, 4, 4, 2, 1]} + >>> df = pd.DataFrame( + ... data, index=["dog", "dog", "dog", "mouse", "mouse", "mouse", "mouse"] + ... ) >>> df a b dog 1 1 @@ -2517,22 +2334,31 @@ def median(self, numeric_only: bool = False) -> NDFrameT: For Resampler: - >>> ser = pd.Series([1, 2, 3, 3, 4, 5], - ... index=pd.DatetimeIndex(['2023-01-01', - ... '2023-01-10', - ... '2023-01-15', - ... '2023-02-01', - ... '2023-02-10', - ... '2023-02-15'])) - >>> ser.resample('MS').median() + >>> ser = pd.Series( + ... [1, 2, 3, 3, 4, 5], + ... index=pd.DatetimeIndex( + ... [ + ... "2023-01-01", + ... "2023-01-10", + ... "2023-01-15", + ... "2023-02-01", + ... "2023-02-10", + ... "2023-02-15", + ... ] + ... ), + ... ) + >>> ser.resample("MS").median() 2023-01-01 2.0 2023-02-01 4.0 Freq: MS, dtype: float64 """ result = self._cython_agg_general( "median", - alt=lambda x: Series(x, copy=False).median(numeric_only=numeric_only), + alt=lambda x: Series(x, copy=False).median( + numeric_only=numeric_only, skipna=skipna + ), numeric_only=numeric_only, + skipna=skipna, ) return result.__finalize__(self.obj, method="groupby") @@ -2545,6 +2371,7 @@ def std( engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, numeric_only: bool = False, + skipna: bool = True, ): """ Compute standard deviation of groups, excluding missing values. @@ -2554,7 +2381,8 @@ def std( Parameters ---------- ddof : int, default 1 - Degrees of freedom. + Delta Degrees of Freedom. The divisor used in calculations is ``N - ddof``, + where ``N`` represents the number of elements. engine : str, default None * ``'cython'`` : Runs the operation through C-extensions from cython. @@ -2582,6 +2410,11 @@ def std( numeric_only now defaults to ``False``. + skipna : bool, default True + Exclude NA/null values. If an entire group is NA, the result will be NA. + + .. versionadded:: 3.0.0 + Returns ------- Series or DataFrame @@ -2591,7 +2424,7 @@ def std( -------- For SeriesGroupBy: - >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] + >>> lst = ["a", "a", "a", "b", "b", "b"] >>> ser = pd.Series([7, 2, 8, 4, 3, 3], index=lst) >>> ser a 7 @@ -2608,9 +2441,10 @@ def std( For DataFrameGroupBy: - >>> data = {'a': [1, 3, 5, 7, 7, 8, 3], 'b': [1, 4, 8, 4, 4, 2, 1]} - >>> df = pd.DataFrame(data, index=['dog', 'dog', 'dog', - ... 'mouse', 'mouse', 'mouse', 'mouse']) + >>> data = {"a": [1, 3, 5, 7, 7, 8, 3], "b": [1, 4, 8, 4, 4, 2, 1]} + >>> df = pd.DataFrame( + ... data, index=["dog", "dog", "dog", "mouse", "mouse", "mouse", "mouse"] + ... ) >>> df a b dog 1 1 @@ -2635,14 +2469,16 @@ def std( engine_kwargs, min_periods=0, ddof=ddof, + skipna=skipna, ) ) else: return self._cython_agg_general( "std", - alt=lambda x: Series(x, copy=False).std(ddof=ddof), + alt=lambda x: Series(x, copy=False).std(ddof=ddof, skipna=skipna), numeric_only=numeric_only, ddof=ddof, + skipna=skipna, ) @final @@ -2654,6 +2490,7 @@ def var( engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, numeric_only: bool = False, + skipna: bool = True, ): """ Compute variance of groups, excluding missing values. @@ -2691,6 +2528,11 @@ def var( numeric_only now defaults to ``False``. + skipna : bool, default True + Exclude NA/null values. If an entire group is NA, the result will be NA. + + .. 
versionadded:: 3.0.0 + Returns ------- Series or DataFrame @@ -2700,7 +2542,7 @@ def var( -------- For SeriesGroupBy: - >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] + >>> lst = ["a", "a", "a", "b", "b", "b"] >>> ser = pd.Series([7, 2, 8, 4, 3, 3], index=lst) >>> ser a 7 @@ -2717,9 +2559,10 @@ def var( For DataFrameGroupBy: - >>> data = {'a': [1, 3, 5, 7, 7, 8, 3], 'b': [1, 4, 8, 4, 4, 2, 1]} - >>> df = pd.DataFrame(data, index=['dog', 'dog', 'dog', - ... 'mouse', 'mouse', 'mouse', 'mouse']) + >>> data = {"a": [1, 3, 5, 7, 7, 8, 3], "b": [1, 4, 8, 4, 4, 2, 1]} + >>> df = pd.DataFrame( + ... data, index=["dog", "dog", "dog", "mouse", "mouse", "mouse", "mouse"] + ... ) >>> df a b dog 1 1 @@ -2743,13 +2586,15 @@ def var( engine_kwargs, min_periods=0, ddof=ddof, + skipna=skipna, ) else: return self._cython_agg_general( "var", - alt=lambda x: Series(x, copy=False).var(ddof=ddof), + alt=lambda x: Series(x, copy=False).var(ddof=ddof, skipna=skipna), numeric_only=numeric_only, ddof=ddof, + skipna=skipna, ) @final @@ -2767,10 +2612,6 @@ def _value_counts( SeriesGroupBy additionally supports a bins argument. See the docstring of DataFrameGroupBy.value_counts for a description of arguments. """ - if self.axis == 1: - raise NotImplementedError( - "DataFrameGroupBy.value_counts only handles axis=0" - ) name = "proportion" if normalize else "count" df = self.obj @@ -2781,7 +2622,7 @@ def _value_counts( } if isinstance(obj, Series): _name = obj.name - keys = [] if _name in in_axis_names else [obj] + keys: Iterable[Series] = [] if _name in in_axis_names else [obj] else: unique_cols = set(obj.columns) if subset is not None: @@ -2795,26 +2636,24 @@ def _value_counts( doesnt_exist = subsetted - unique_cols if doesnt_exist: raise ValueError( - f"Keys {doesnt_exist} in subset do not " - f"exist in the DataFrame." + f"Keys {doesnt_exist} in subset do not exist in the DataFrame." 
) else: subsetted = unique_cols - keys = [ + keys = ( # Can't use .values because the column label needs to be preserved obj.iloc[:, idx] for idx, _name in enumerate(obj.columns) if _name not in in_axis_names and _name in subsetted - ] + ) groupings = list(self._grouper.groupings) for key in keys: grouper, _, _ = get_grouper( df, key=key, - axis=self.axis, - sort=self.sort, + sort=False, observed=False, dropna=dropna, ) @@ -2823,26 +2662,13 @@ def _value_counts( # Take the size of the overall columns gb = df.groupby( groupings, - sort=self.sort, + sort=False, observed=self.observed, dropna=self.dropna, ) result_series = cast(Series, gb.size()) result_series.name = name - # GH-46357 Include non-observed categories - # of non-grouping columns regardless of `observed` - if any( - isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex)) - and not grouping._observed - for grouping in groupings - ): - levels_list = [ping._result_index for ping in groupings] - multi_index = MultiIndex.from_product( - levels_list, names=[ping.name for ping in groupings] - ) - result_series = result_series.reindex(multi_index, fill_value=0) - if sort: # Sort by the values result_series = result_series.sort_values( @@ -2853,7 +2679,7 @@ def _value_counts( names = result_series.index.names # GH#55951 - Temporarily replace names in case they are integers result_series.index.names = range(len(names)) - index_level = list(range(len(self._grouper.groupings))) + index_level = range(len(self._grouper.groupings)) result_series = result_series.sort_index( level=index_level, sort_remaining=False ) @@ -2897,7 +2723,9 @@ def _value_counts( return result.__finalize__(self.obj, method="value_counts") @final - def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT: + def sem( + self, ddof: int = 1, numeric_only: bool = False, skipna: bool = True + ) -> NDFrameT: """ Compute standard error of the mean of groups, excluding missing values. @@ -2917,16 +2745,26 @@ def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT: numeric_only now defaults to ``False``. + skipna : bool, default True + Exclude NA/null values. If an entire group is NA, the result will be NA. + + .. versionadded:: 3.0.0 + Returns ------- Series or DataFrame Standard error of the mean of values within each group. + See Also + -------- + DataFrame.sem : Return unbiased standard error of the mean over requested axis. + Series.sem : Return unbiased standard error of the mean over requested axis. + Examples -------- For SeriesGroupBy: - >>> lst = ['a', 'a', 'b', 'b'] + >>> lst = ["a", "a", "b", "b"] >>> ser = pd.Series([5, 10, 8, 14], index=lst) >>> ser a 5 @@ -2942,8 +2780,11 @@ def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT: For DataFrameGroupBy: >>> data = [[1, 12, 11], [1, 15, 2], [2, 5, 8], [2, 6, 12]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["tuna", "salmon", "catfish", "goldfish"]) + >>> df = pd.DataFrame( + ... data, + ... columns=["a", "b", "c"], + ... index=["tuna", "salmon", "catfish", "goldfish"], + ... ) >>> df a b c tuna 1 12 11 @@ -2958,14 +2799,20 @@ def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT: For Resampler: - >>> ser = pd.Series([1, 3, 2, 4, 3, 8], - ... index=pd.DatetimeIndex(['2023-01-01', - ... '2023-01-10', - ... '2023-01-15', - ... '2023-02-01', - ... '2023-02-10', - ... '2023-02-15'])) - >>> ser.resample('MS').sem() + >>> ser = pd.Series( + ... [1, 3, 2, 4, 3, 8], + ... index=pd.DatetimeIndex( + ... [ + ... "2023-01-01", + ... 
"2023-01-10", + ... "2023-01-15", + ... "2023-02-01", + ... "2023-02-10", + ... "2023-02-15", + ... ] + ... ), + ... ) + >>> ser.resample("MS").sem() 2023-01-01 0.577350 2023-02-01 1.527525 Freq: MS, dtype: float64 @@ -2977,9 +2824,10 @@ def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT: ) return self._cython_agg_general( "sem", - alt=lambda x: Series(x, copy=False).sem(ddof=ddof), + alt=lambda x: Series(x, copy=False).sem(ddof=ddof, skipna=skipna), numeric_only=numeric_only, ddof=ddof, + skipna=skipna, ) @final @@ -3000,7 +2848,7 @@ def size(self) -> DataFrame | Series: For SeriesGroupBy: - >>> lst = ['a', 'a', 'b'] + >>> lst = ["a", "a", "b"] >>> ser = pd.Series([1, 2, 3], index=lst) >>> ser a 1 @@ -3013,8 +2861,9 @@ def size(self) -> DataFrame | Series: dtype: int64 >>> data = [[1, 2, 3], [1, 5, 6], [7, 8, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["owl", "toucan", "eagle"]) + >>> df = pd.DataFrame( + ... data, columns=["a", "b", "c"], index=["owl", "toucan", "eagle"] + ... ) >>> df a b c owl 1 2 3 @@ -3028,14 +2877,16 @@ def size(self) -> DataFrame | Series: For Resampler: - >>> ser = pd.Series([1, 2, 3], index=pd.DatetimeIndex( - ... ['2023-01-01', '2023-01-15', '2023-02-01'])) + >>> ser = pd.Series( + ... [1, 2, 3], + ... index=pd.DatetimeIndex(["2023-01-01", "2023-01-15", "2023-02-01"]), + ... ) >>> ser 2023-01-01 1 2023-01-15 2 2023-02-01 3 dtype: int64 - >>> ser.resample('MS').size() + >>> ser.resample("MS").size() 2023-01-01 2 2023-02-01 1 Freq: MS, dtype: int64 @@ -3069,10 +2920,6 @@ def size(self) -> DataFrame | Series: dtype_backend=dtype_backend, ) - with com.temp_setattr(self, "as_index", True): - # size already has the desired behavior in GH#49519, but this makes the - # as_index=False path of _reindex_output fail on categorical groupers. - result = self._reindex_output(result, fill_value=0) if not self.as_index: # error: Incompatible types in assignment (expression has # type "DataFrame", variable has type "Series") @@ -3081,10 +2928,11 @@ def size(self) -> DataFrame | Series: @final @doc( - _groupby_agg_method_engine_template, + _groupby_agg_method_skipna_engine_template, fname="sum", no=False, mc=0, + s=True, e=None, ek=None, example=dedent( @@ -3126,6 +2974,7 @@ def sum( self, numeric_only: bool = False, min_count: int = 0, + skipna: bool = True, engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, ): @@ -3137,6 +2986,7 @@ def sum( executor.default_dtype_mapping, engine_kwargs, min_periods=min_count, + skipna=skipna, ) else: # If we are grouping on categoricals we want unobserved categories to @@ -3148,21 +2998,51 @@ def sum( min_count=min_count, alias="sum", npfunc=np.sum, + skipna=skipna, ) - return self._reindex_output(result, fill_value=0) + return result @final - @doc( - _groupby_agg_method_template, - fname="prod", - no=False, - mc=0, - example=dedent( - """\ + def prod( + self, numeric_only: bool = False, min_count: int = 0, skipna: bool = True + ) -> NDFrameT: + """ + Compute prod of group values. + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionchanged:: 2.0.0 + + numeric_only no longer accepts ``None``. + + min_count : int, default 0 + The required number of valid values to perform the operation. If fewer + than ``min_count`` non-NA values are present the result will be NA. + + skipna : bool, default True + Exclude NA/null values. If an entire group is NA, the result will be NA. + + .. 
versionadded:: 3.0.0 + + Returns + ------- + Series or DataFrame + Computed prod of values within each group. + + See Also + -------- + Series.prod : Return the product of the values over the requested axis. + DataFrame.prod : Return the product of the values over the requested axis. + + Examples + -------- For SeriesGroupBy: - >>> lst = ['a', 'a', 'b', 'b'] + >>> lst = ["a", "a", "b", "b"] >>> ser = pd.Series([1, 2, 3, 4], index=lst) >>> ser a 1 @@ -3178,8 +3058,11 @@ def sum( For DataFrameGroupBy: >>> data = [[1, 8, 2], [1, 2, 5], [2, 5, 8], [2, 6, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["tiger", "leopard", "cheetah", "lion"]) + >>> df = pd.DataFrame( + ... data, + ... columns=["a", "b", "c"], + ... index=["tiger", "leopard", "cheetah", "lion"], + ... ) >>> df a b c tiger 1 8 2 @@ -3190,22 +3073,25 @@ def sum( b c a 1 16 10 - 2 30 72""" - ), - ) - def prod(self, numeric_only: bool = False, min_count: int = 0) -> NDFrameT: + 2 30 72 + """ return self._agg_general( - numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod + numeric_only=numeric_only, + min_count=min_count, + skipna=skipna, + alias="prod", + npfunc=np.prod, ) @final @doc( - _groupby_agg_method_engine_template, + _groupby_agg_method_skipna_engine_template, fname="min", no=False, mc=-1, e=None, ek=None, + s=True, example=dedent( """\ For SeriesGroupBy: @@ -3245,6 +3131,7 @@ def min( self, numeric_only: bool = False, min_count: int = -1, + skipna: bool = True, engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, ): @@ -3257,23 +3144,26 @@ def min( engine_kwargs, min_periods=min_count, is_max=False, + skipna=skipna, ) else: return self._agg_general( numeric_only=numeric_only, min_count=min_count, + skipna=skipna, alias="min", npfunc=np.min, ) @final @doc( - _groupby_agg_method_engine_template, + _groupby_agg_method_skipna_engine_template, fname="max", no=False, mc=-1, e=None, ek=None, + s=True, example=dedent( """\ For SeriesGroupBy: @@ -3313,6 +3203,7 @@ def max( self, numeric_only: bool = False, min_count: int = -1, + skipna: bool = True, engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, ): @@ -3325,19 +3216,25 @@ def max( engine_kwargs, min_periods=min_count, is_max=True, + skipna=skipna, ) else: return self._agg_general( numeric_only=numeric_only, min_count=min_count, + skipna=skipna, alias="max", npfunc=np.max, ) @final - def first(self, numeric_only: bool = False, min_count: int = -1) -> NDFrameT: + def first( + self, numeric_only: bool = False, min_count: int = -1, skipna: bool = True + ) -> NDFrameT: """ - Compute the first non-null entry of each column. + Compute the first entry of each column within each group. + + Defaults to skipping NA elements. Parameters ---------- @@ -3345,26 +3242,36 @@ def first(self, numeric_only: bool = False, min_count: int = -1) -> NDFrameT: Include only float, int, boolean columns. min_count : int, default -1 The required number of valid values to perform the operation. If fewer - than ``min_count`` non-NA values are present the result will be NA. + than ``min_count`` valid values are present the result will be NA. + skipna : bool, default True + Exclude NA/null values. If an entire group is NA, the result will be NA. + + .. versionadded:: 2.2.1 Returns ------- Series or DataFrame - First non-null of values within each group. + First values within each group. 
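A quick sketch of the ``skipna`` keyword threaded through the reductions above (``first`` shown; ``prod``, ``min`` and ``max`` behave analogously), assuming the pandas 3.0 behavior this diff introduces:

>>> import pandas as pd
>>> df = pd.DataFrame({"A": [1, 1], "B": [None, 5.0]})
>>> df.groupby("A")["B"].first()  # default skipna=True: first non-NA value
A
1    5.0
Name: B, dtype: float64
>>> df.groupby("A")["B"].first(skipna=False)  # positional first value, NA included
A
1   NaN
Name: B, dtype: float64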
See Also -------- DataFrame.groupby : Apply a function groupby to each row or column of a DataFrame. - pandas.core.groupby.DataFrameGroupBy.last : Compute the last non-null entry + core.groupby.DataFrameGroupBy.last : Compute the last non-null entry of each column. - pandas.core.groupby.DataFrameGroupBy.nth : Take the nth row from each group. + core.groupby.DataFrameGroupBy.nth : Take the nth row from each group. Examples -------- - >>> df = pd.DataFrame(dict(A=[1, 1, 3], B=[None, 5, 6], C=[1, 2, 3], - ... D=['3/11/2000', '3/12/2000', '3/13/2000'])) - >>> df['D'] = pd.to_datetime(df['D']) + >>> df = pd.DataFrame( + ... dict( + ... A=[1, 1, 3], + ... B=[None, 5, 6], + ... C=[1, 2, 3], + ... D=["3/11/2000", "3/12/2000", "3/13/2000"], + ... ) + ... ) + >>> df["D"] = pd.to_datetime(df["D"]) >>> df.groupby("A").first() B C D A @@ -3382,7 +3289,7 @@ def first(self, numeric_only: bool = False, min_count: int = -1) -> NDFrameT: 3 6.0 3 """ - def first_compat(obj: NDFrameT, axis: AxisInt = 0): + def first_compat(obj: NDFrameT): def first(x: Series): """Helper function for first item that isn't NA.""" arr = x.array[notna(x.array)] @@ -3391,7 +3298,7 @@ def first(x: Series): return arr[0] if isinstance(obj, DataFrame): - return obj.apply(first, axis=axis) + return obj.apply(first) elif isinstance(obj, Series): return first(obj) else: # pragma: no cover @@ -3402,12 +3309,17 @@ def first(x: Series): min_count=min_count, alias="first", npfunc=first_compat, + skipna=skipna, ) @final - def last(self, numeric_only: bool = False, min_count: int = -1) -> NDFrameT: + def last( + self, numeric_only: bool = False, min_count: int = -1, skipna: bool = True + ) -> NDFrameT: """ - Compute the last non-null entry of each column. + Compute the last entry of each column within each group. + + Defaults to skipping NA elements. Parameters ---------- @@ -3416,20 +3328,24 @@ def last(self, numeric_only: bool = False, min_count: int = -1) -> NDFrameT: everything, then use only numeric data. min_count : int, default -1 The required number of valid values to perform the operation. If fewer - than ``min_count`` non-NA values are present the result will be NA. + than ``min_count`` valid values are present the result will be NA. + skipna : bool, default True + Exclude NA/null values. If an entire group is NA, the result will be NA. + + .. versionadded:: 2.2.1 Returns ------- Series or DataFrame - Last non-null of values within each group. + Last of values within each group. See Also -------- DataFrame.groupby : Apply a function groupby to each row or column of a DataFrame. - pandas.core.groupby.DataFrameGroupBy.first : Compute the first non-null entry + core.groupby.DataFrameGroupBy.first : Compute the first non-null entry of each column. - pandas.core.groupby.DataFrameGroupBy.nth : Take the nth row from each group. + core.groupby.DataFrameGroupBy.nth : Take the nth row from each group. 
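``min_count`` (documented above for ``first``/``last`` as well as the other reductions) sets the number of valid values a group needs before the result is non-NA; one concrete case, as a sketch:

>>> import pandas as pd
>>> s = pd.Series([1.0, None], index=["a", "a"])
>>> s.groupby(level=0).sum()  # one valid value is enough by default
a    1.0
dtype: float64
>>> s.groupby(level=0).sum(min_count=2)  # fewer than 2 valid values: NA
a   NaN
dtype: float64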
Examples -------- @@ -3441,7 +3357,7 @@ def last(self, numeric_only: bool = False, min_count: int = -1) -> NDFrameT: 3 6.0 3 """ - def last_compat(obj: NDFrameT, axis: AxisInt = 0): + def last_compat(obj: NDFrameT): def last(x: Series): """Helper function for last item that isn't NA.""" arr = x.array[notna(x.array)] @@ -3450,7 +3366,7 @@ def last(x: Series): return arr[-1] if isinstance(obj, DataFrame): - return obj.apply(last, axis=axis) + return obj.apply(last) elif isinstance(obj, Series): return last(obj) else: # pragma: no cover @@ -3461,6 +3377,7 @@ def last(x: Series): min_count=min_count, alias="last", npfunc=last_compat, + skipna=skipna, ) @final @@ -3475,12 +3392,27 @@ def ohlc(self) -> DataFrame: DataFrame Open, high, low and close values within each group. + See Also + -------- + DataFrame.agg : Aggregate using one or more operations over the specified axis. + DataFrame.resample : Resample time-series data. + DataFrame.groupby : Group DataFrame using a mapper or by a Series of columns. + Examples -------- For SeriesGroupBy: - >>> lst = ['SPX', 'CAC', 'SPX', 'CAC', 'SPX', 'CAC', 'SPX', 'CAC',] + >>> lst = [ + ... "SPX", + ... "CAC", + ... "SPX", + ... "CAC", + ... "SPX", + ... "CAC", + ... "SPX", + ... "CAC", + ... ] >>> ser = pd.Series([3.4, 9.0, 7.2, 5.2, 8.8, 9.4, 0.1, 0.5], index=lst) >>> ser SPX 3.4 @@ -3499,10 +3431,13 @@ def ohlc(self) -> DataFrame: For DataFrameGroupBy: - >>> data = {2022: [1.2, 2.3, 8.9, 4.5, 4.4, 3, 2 , 1], - ... 2023: [3.4, 9.0, 7.2, 5.2, 8.8, 9.4, 8.2, 1.0]} - >>> df = pd.DataFrame(data, index=['SPX', 'CAC', 'SPX', 'CAC', - ... 'SPX', 'CAC', 'SPX', 'CAC']) + >>> data = { + ... 2022: [1.2, 2.3, 8.9, 4.5, 4.4, 3, 2, 1], + ... 2023: [3.4, 9.0, 7.2, 5.2, 8.8, 9.4, 8.2, 1.0], + ... } + >>> df = pd.DataFrame( + ... data, index=["SPX", "CAC", "SPX", "CAC", "SPX", "CAC", "SPX", "CAC"] + ... ) >>> df 2022 2023 SPX 1.2 3.4 @@ -3521,14 +3456,20 @@ def ohlc(self) -> DataFrame: For Resampler: - >>> ser = pd.Series([1, 3, 2, 4, 3, 5], - ... index=pd.DatetimeIndex(['2023-01-01', - ... '2023-01-10', - ... '2023-01-15', - ... '2023-02-01', - ... '2023-02-10', - ... '2023-02-15'])) - >>> ser.resample('MS').ohlc() + >>> ser = pd.Series( + ... [1, 3, 2, 4, 3, 5], + ... index=pd.DatetimeIndex( + ... [ + ... "2023-01-01", + ... "2023-01-10", + ... "2023-01-15", + ... "2023-02-01", + ... "2023-02-10", + ... "2023-02-15", + ... ] + ... ), + ... ) + >>> ser.resample("MS").ohlc() open high low close 2023-01-01 1 3 1 2 2023-02-01 4 5 3 5 @@ -3548,7 +3489,7 @@ def ohlc(self) -> DataFrame: result = self.obj._constructor_expanddim( res_values, index=self._grouper.result_index, columns=agg_names ) - return self._reindex_output(result) + return result result = self._apply_to_column_groupbys(lambda sgb: sgb.ohlc()) return result @@ -3580,8 +3521,6 @@ def describe( obj, not_indexed_same=True, ) - if self.axis == 1: - return result.T # GH#49256 - properly handle the grouping column(s) result = result.unstack() @@ -3592,7 +3531,9 @@ def describe( return result @final - def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resampler: + def resample( + self, rule, *args, include_groups: bool = False, **kwargs + ) -> Resampler: """ Provide resampling when using a TimeGrouper. @@ -3617,10 +3558,9 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp .. versionadded:: 2.2.0 - .. deprecated:: 2.2.0 + .. versionchanged:: 3.0 - Setting include_groups to True is deprecated. Only the value - False will be allowed in a future version of pandas. 
+                The default was changed to False, and True is no longer allowed.

        **kwargs
            Possible arguments are `how`, `fill_method`, `limit`, `kind` and
@@ -3628,11 +3568,8 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp

        Returns
        -------
-        pandas.api.typing.DatetimeIndexResamplerGroupby,
-        pandas.api.typing.PeriodIndexResamplerGroupby, or
-        pandas.api.typing.TimedeltaIndexResamplerGroupby
-            Return a new groupby object, with type depending on the data
-            being resampled.
+        DatetimeIndexResampler, PeriodIndexResampler or TimedeltaIndexResampler
+            Resampler object for the type of the index.

        See Also
        --------
@@ -3643,10 +3580,8 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp

        Examples
        --------
-        >>> idx = pd.date_range('1/1/2000', periods=4, freq='min')
-        >>> df = pd.DataFrame(data=4 * [range(2)],
-        ...                   index=idx,
-        ...                   columns=['a', 'b'])
+        >>> idx = pd.date_range("1/1/2000", periods=4, freq="min")
+        >>> df = pd.DataFrame(data=4 * [range(2)], index=idx, columns=["a", "b"])
        >>> df.iloc[2, 0] = 5
        >>> df
                            a  b
@@ -3658,7 +3593,7 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp
        Downsample the DataFrame into 3 minute bins and sum the values of
        the timestamps falling into a bin.

-        >>> df.groupby('a').resample('3min', include_groups=False).sum()
+        >>> df.groupby("a").resample("3min").sum()
                                 b
        a
        0   2000-01-01 00:00:00  2
@@ -3667,7 +3602,7 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp

        Upsample the series into 30 second bins.

-        >>> df.groupby('a').resample('30s', include_groups=False).sum()
+        >>> df.groupby("a").resample("30s").sum()
                                 b
        a
        0   2000-01-01 00:00:00  1
@@ -3681,7 +3616,7 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp

        Resample by month. Values are assigned to the month of the period.

-        >>> df.groupby('a').resample('ME', include_groups=False).sum()
+        >>> df.groupby("a").resample("ME").sum()
                        b
        a
        0   2000-01-31  3
@@ -3690,11 +3625,7 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp
        Downsample the series into 3 minute bins as above, but close the right
        side of the bin interval.

-        >>> (
-        ...     df.groupby('a')
-        ...     .resample('3min', closed='right', include_groups=False)
-        ...     .sum()
-        ... )
+        >>> (df.groupby("a").resample("3min", closed="right").sum())
                                 b
        a
        0   1999-12-31 23:57:00  1
@@ -3705,11 +3636,7 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp
        the bin interval, but label each bin using the right edge instead of the
        left.

-        >>> (
-        ...     df.groupby('a')
-        ...     .resample('3min', closed='right', label='right', include_groups=False)
-        ...     .sum()
-        ... )
+        >>> (df.groupby("a").resample("3min", closed="right", label="right").sum())
                                 b
        a
        0   2000-01-01 00:00:00  1
@@ -3718,30 +3645,38 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp
        """
        from pandas.core.resample import get_resampler_for_grouping

-        # mypy flags that include_groups could be specified via `*args` or `**kwargs`
-        # GH#54961 would resolve.
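With the guard added just below, the removed flag now fails fast instead of being forwarded to the resampler. Reusing the docstring's frame, the expected failure mode would be:

>>> import pandas as pd
>>> idx = pd.date_range("1/1/2000", periods=4, freq="min")
>>> df = pd.DataFrame(data=4 * [range(2)], index=idx, columns=["a", "b"])
>>> df.groupby("a").resample("3min", include_groups=True).sum()
Traceback (most recent call last):
    ...
ValueError: include_groups=True is no longer allowed.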
-        return get_resampler_for_grouping(  # type: ignore[misc]
-            self, rule, *args, include_groups=include_groups, **kwargs
-        )
+        if include_groups:
+            raise ValueError("include_groups=True is no longer allowed.")
+
+        return get_resampler_for_grouping(self, rule, *args, **kwargs)

    @final
-    def rolling(self, *args, **kwargs) -> RollingGroupby:
+    def rolling(
+        self,
+        window: int | datetime.timedelta | str | BaseOffset | BaseIndexer,
+        min_periods: int | None = None,
+        center: bool = False,
+        win_type: str | None = None,
+        on: str | None = None,
+        closed: IntervalClosedType | None = None,
+        method: str = "single",
+    ) -> RollingGroupby:
        """
        Return a rolling grouper, providing rolling functionality per group.

        Parameters
        ----------
        window : int, timedelta, str, offset, or BaseIndexer subclass
-            Size of the moving window.
+            Interval of the moving window.

-            If an integer, the fixed number of observations used for
-            each window.
+            If an integer, the delta between the start and end of each window.
+            The number of points in the window depends on the ``closed`` argument.

            If a timedelta, str, or offset, the time period of each window. Each
            window will be variably sized based on the observations included in
            the time-period. This is only valid for datetimelike indexes.

-            To learn more about the offsets & frequency strings, please see `this link
-            `__.
+            To learn more about the offsets & frequency strings, please see
+            :ref:`this link<timeseries.offset_aliases>`.

            If a BaseIndexer subclass, the window boundaries
            based on the defined ``get_window_bounds`` method. Additional rolling
@@ -3780,22 +3715,23 @@ def rolling(self, *args, **kwargs) -> RollingGroupby:
            Provided integer column is ignored and excluded from result since
            an integer index is not used to calculate the rolling window.

-        axis : int or str, default 0
-            If ``0`` or ``'index'``, roll across the rows.
-
-            If ``1`` or ``'columns'``, roll across the columns.
+        closed : str, default None
+            Determines the inclusivity of points in the window.

-            For `Series` this parameter is unused and defaults to 0.
+            If ``'right'``, uses the window (first, last] meaning the last point
+            is included in the calculations.

-        closed : str, default None
-            If ``'right'``, the first point in the window is excluded from calculations.
+            If ``'left'``, uses the window [first, last) meaning the first point
+            is included in the calculations.

-            If ``'left'``, the last point in the window is excluded from calculations.
+            If ``'both'``, uses the window [first, last] meaning all points in
+            the window are included in the calculations.

-            If ``'both'``, no points in the window are excluded from calculations.
+            If ``'neither'``, uses the window (first, last) meaning the first
+            and last points in the window are excluded from calculations.

-            If ``'neither'``, the first and last points in the window are excluded
-            from calculations.
+            () and [] reference open and closed set
+            notation, respectively.

            Default ``None`` (``'right'``).

@@ -3820,9 +3756,13 @@ def rolling(self, *args, **kwargs) -> RollingGroupby:

        Examples
        --------
-        >>> df = pd.DataFrame({'A': [1, 1, 2, 2],
-        ...                    'B': [1, 2, 3, 4],
-        ...                    'C': [0.362, 0.227, 1.267, -0.562]})
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "A": [1, 1, 2, 2],
+        ...         "B": [1, 2, 3, 4],
+        ...         "C": [0.362, 0.227, 1.267, -0.562],
+        ...     }
+        ...
) >>> df A B C 0 1 1 0.362 @@ -3830,7 +3770,7 @@ def rolling(self, *args, **kwargs) -> RollingGroupby: 2 2 3 1.267 3 2 4 -0.562 - >>> df.groupby('A').rolling(2).sum() + >>> df.groupby("A").rolling(2).sum() B C A 1 0 NaN NaN @@ -3838,7 +3778,7 @@ def rolling(self, *args, **kwargs) -> RollingGroupby: 2 2 NaN NaN 3 7.0 0.705 - >>> df.groupby('A').rolling(2, min_periods=1).sum() + >>> df.groupby("A").rolling(2, min_periods=1).sum() B C A 1 0 1.0 0.362 @@ -3846,7 +3786,7 @@ def rolling(self, *args, **kwargs) -> RollingGroupby: 2 2 3.0 1.267 3 7.0 0.705 - >>> df.groupby('A').rolling(2, on='B').sum() + >>> df.groupby("A").rolling(2, on="B").sum() B C A 1 0 1 NaN @@ -3858,51 +3798,191 @@ def rolling(self, *args, **kwargs) -> RollingGroupby: return RollingGroupby( self._selected_obj, - *args, + window=window, + min_periods=min_periods, + center=center, + win_type=win_type, + on=on, + closed=closed, + method=method, _grouper=self._grouper, _as_index=self.as_index, - **kwargs, ) @final - @Substitution(name="groupby") - @Appender(_common_see_also) - def expanding(self, *args, **kwargs) -> ExpandingGroupby: + def expanding( + self, + min_periods: int = 1, + method: str = "single", + ) -> ExpandingGroupby: """ - Return an expanding grouper, providing expanding - functionality per group. + Return an expanding grouper, providing expanding functionality per group. + + Parameters + ---------- + min_periods : int, default 1 + Minimum number of observations in window required to have a value; + otherwise, result is ``np.nan``. + + method : str {'single', 'table'}, default 'single' + Execute the expanding operation per single column or row (``'single'``) + or over the entire object (``'table'``). + + This argument is only implemented when specifying ``engine='numba'`` + in the method call. Returns ------- pandas.api.typing.ExpandingGroupby + An object that supports expanding transformations over each group. + + See Also + -------- + Series.expanding : Expanding transformations for Series. + DataFrame.expanding : Expanding transformations for DataFrames. + Series.groupby : Apply a function groupby to a Series. + DataFrame.groupby : Apply a function groupby. + + Examples + -------- + >>> df = pd.DataFrame( + ... { + ... "Class": ["A", "A", "A", "B", "B", "B"], + ... "Value": [10, 20, 30, 40, 50, 60], + ... } + ... ) + >>> df + Class Value + 0 A 10 + 1 A 20 + 2 A 30 + 3 B 40 + 4 B 50 + 5 B 60 + + >>> df.groupby("Class").expanding().mean() + Value + Class + A 0 10.0 + 1 15.0 + 2 20.0 + B 3 40.0 + 4 45.0 + 5 50.0 """ from pandas.core.window import ExpandingGroupby return ExpandingGroupby( self._selected_obj, - *args, + min_periods=min_periods, + method=method, _grouper=self._grouper, - **kwargs, ) @final - @Substitution(name="groupby") - @Appender(_common_see_also) - def ewm(self, *args, **kwargs) -> ExponentialMovingWindowGroupby: + def ewm( + self, + com: float | None = None, + span: float | None = None, + halflife: float | str | Timedelta | None = None, + alpha: float | None = None, + min_periods: int | None = 0, + adjust: bool = True, + ignore_na: bool = False, + times: np.ndarray | Series | None = None, + method: str = "single", + ) -> ExponentialMovingWindowGroupby: """ Return an ewm grouper, providing ewm functionality per group. + Parameters + ---------- + com : float, optional + Specify decay in terms of center of mass. + Alternative to ``span``, ``halflife``, and ``alpha``. + + span : float, optional + Specify decay in terms of span. 
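The decay parameters above are interchangeable ways of fixing the smoothing factor: ``com`` maps to it via alpha = 1 / (1 + com), so ``com=1.0`` and ``alpha=0.5`` describe the same weighting. A one-line sketch on a plain Series:

>>> import pandas as pd
>>> s = pd.Series([1.0, 2.0, 3.0])
>>> s.ewm(com=1.0).mean().equals(s.ewm(alpha=0.5).mean())
True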
+ + halflife : float, str, or Timedelta, optional + Specify decay in terms of half-life. + + alpha : float, optional + Specify smoothing factor directly. + + min_periods : int, default 0 + Minimum number of observations in the window required to have a value; + otherwise, result is ``np.nan``. + + adjust : bool, default True + Divide by decaying adjustment factor to account for imbalance in + relative weights. + + ignore_na : bool, default False + Ignore missing values when calculating weights. + + times : str or array-like of datetime64, optional + Times corresponding to the observations. + + method : {'single', 'table'}, default 'single' + Execute the operation per group independently (``'single'``) or over the + entire object before regrouping (``'table'``). Only applicable to + ``mean()``, and only when using ``engine='numba'``. + Returns ------- pandas.api.typing.ExponentialMovingWindowGroupby + An object that supports exponentially weighted moving transformations over + each group. + + See Also + -------- + Series.ewm : EWM transformations for Series. + DataFrame.ewm : EWM transformations for DataFrames. + Series.groupby : Apply a function groupby to a Series. + DataFrame.groupby : Apply a function groupby. + + Examples + -------- + >>> df = pd.DataFrame( + ... { + ... "Class": ["A", "A", "A", "B", "B", "B"], + ... "Value": [10, 20, 30, 40, 50, 60], + ... } + ... ) + >>> df + Class Value + 0 A 10 + 1 A 20 + 2 A 30 + 3 B 40 + 4 B 50 + 5 B 60 + + >>> df.groupby("Class").ewm(com=0.5).mean() + Value + Class + A 0 10.000000 + 1 17.500000 + 2 26.153846 + B 3 40.000000 + 4 47.500000 + 5 56.153846 """ from pandas.core.window import ExponentialMovingWindowGroupby return ExponentialMovingWindowGroupby( self._selected_obj, - *args, + com=com, + span=span, + halflife=halflife, + alpha=alpha, + min_periods=min_periods, + adjust=adjust, + ignore_na=ignore_na, + times=times, + method=method, _grouper=self._grouper, - **kwargs, ) @final @@ -3933,24 +4013,22 @@ def _fill(self, direction: Literal["ffill", "bfill"], limit: int | None = None): if limit is None: limit = -1 - ids, _, _ = self._grouper.group_info - sorted_labels = np.argsort(ids, kind="mergesort").astype(np.intp, copy=False) - if direction == "bfill": - sorted_labels = sorted_labels[::-1] + ids = self._grouper.ids + ngroups = self._grouper.ngroups col_func = partial( libgroupby.group_fillna_indexer, labels=ids, - sorted_labels=sorted_labels, limit=limit, - dropna=self.dropna, + compute_ffill=(direction == "ffill"), + ngroups=ngroups, ) def blk_func(values: ArrayLike) -> ArrayLike: mask = isna(values) if values.ndim == 1: indexer = np.empty(values.shape, dtype=np.intp) - col_func(out=indexer, mask=mask) + col_func(out=indexer, mask=mask) # type: ignore[arg-type] return algorithms.take_nd(values, indexer) else: @@ -3980,12 +4058,6 @@ def blk_func(values: ArrayLike) -> ArrayLike: res_mgr = mgr.apply(blk_func) new_obj = self._wrap_agged_manager(res_mgr) - - if self.axis == 1: - # Only relevant for DataFrameGroupBy - new_obj = new_obj.T - new_obj.columns = self.obj.columns - new_obj.index = self.obj.index return new_obj @@ -4070,7 +4142,7 @@ def ffill(self, limit: int | None = None): 3 1.0 3.0 NaN NaN 4 1.0 1.0 NaN NaN - Only replace the first NaN element within a group along rows. + Only replace the first NaN element within a group along columns. 
>>> df.groupby("key").ffill(limit=1) A B C @@ -4110,7 +4182,7 @@ def bfill(self, limit: int | None = None): With Series: - >>> index = ['Falcon', 'Falcon', 'Parrot', 'Parrot', 'Parrot'] + >>> index = ["Falcon", "Falcon", "Parrot", "Parrot", "Parrot"] >>> s = pd.Series([None, 1, None, None, 3], index=index) >>> s Falcon NaN @@ -4136,8 +4208,10 @@ def bfill(self, limit: int | None = None): With DataFrame: - >>> df = pd.DataFrame({'A': [1, None, None, None, 4], - ... 'B': [None, None, 5, None, 7]}, index=index) + >>> df = pd.DataFrame( + ... {"A": [1, None, None, None, 4], "B": [None, None, 5, None, 7]}, + ... index=index, + ... ) >>> df A B Falcon 1.0 NaN @@ -4177,19 +4251,6 @@ def nth(self) -> GroupByNthSelector: 'all' or 'any'; this is equivalent to calling dropna(how=dropna) before the groupby. - Parameters - ---------- - n : int, slice or list of ints and slices - A single nth value for the row or a list of nth values or slices. - - .. versionchanged:: 1.4.0 - Added slice and lists containing slices. - Added index notation. - - dropna : {'any', 'all', None}, default None - Apply the specified dropna operation before counting which row is - the nth row. Only supported if n is an int. - Returns ------- Series or DataFrame @@ -4198,9 +4259,10 @@ def nth(self) -> GroupByNthSelector: Examples -------- - >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2], - ... 'B': [np.nan, 2, 3, 4, 5]}, columns=['A', 'B']) - >>> g = df.groupby('A') + >>> df = pd.DataFrame( + ... {"A": [1, 1, 2, 1, 2], "B": [np.nan, 2, 3, 4, 5]}, columns=["A", "B"] + ... ) + >>> g = df.groupby("A") >>> g.nth(0) A B 0 1 NaN @@ -4241,7 +4303,7 @@ def nth(self) -> GroupByNthSelector: Specifying `dropna` allows ignoring ``NaN`` values - >>> g.nth(0, dropna='any') + >>> g.nth(0, dropna="any") A B 1 1 2.0 2 2 3.0 @@ -4249,7 +4311,7 @@ def nth(self) -> GroupByNthSelector: When the specified ``n`` is larger than any of the groups, an empty DataFrame is returned - >>> g.nth(3, dropna='any') + >>> g.nth(3, dropna="any") Empty DataFrame Columns: [A, B] Index: [] @@ -4264,7 +4326,7 @@ def _nth( if not dropna: mask = self._make_mask_from_positional_indexer(n) - ids, _, _ = self._grouper.group_info + ids = self._grouper.ids # Drop NA values in grouping mask = mask & (ids != -1) @@ -4287,7 +4349,7 @@ def _nth( # old behaviour, but with all and any support for DataFrames. # modified in GH 7559 to have better perf n = cast(int, n) - dropped = self._selected_obj.dropna(how=dropna, axis=self.axis) + dropped = self._selected_obj.dropna(how=dropna, axis=0) # get a new grouper for our dropped obj grouper: np.ndarray | Index | ops.BaseGrouper @@ -4308,17 +4370,16 @@ def _nth( values = np.where(nulls, NA, grouper) # type: ignore[call-overload] grouper = Index(values, dtype="Int64") - if self.axis == 1: - grb = dropped.T.groupby(grouper, as_index=self.as_index, sort=self.sort) - else: - grb = dropped.groupby(grouper, as_index=self.as_index, sort=self.sort) + grb = dropped.groupby(grouper, as_index=self.as_index, sort=self.sort) return grb.nth(n) @final def quantile( self, q: float | AnyArrayLike = 0.5, - interpolation: str = "linear", + interpolation: Literal[ + "linear", "lower", "higher", "nearest", "midpoint" + ] = "linear", numeric_only: bool = False, ): """ @@ -4352,11 +4413,11 @@ def quantile( Examples -------- - >>> df = pd.DataFrame([ - ... ['a', 1], ['a', 2], ['a', 3], - ... ['b', 1], ['b', 3], ['b', 5] - ... ], columns=['key', 'val']) - >>> df.groupby('key').quantile() + >>> df = pd.DataFrame( + ... 
[["a", 1], ["a", 2], ["a", 3], ["b", 1], ["b", 3], ["b", 5]], + ... columns=["key", "val"], + ... ) + >>> df.groupby("key").quantile() val key a 2.0 @@ -4364,19 +4425,15 @@ def quantile( """ mgr = self._get_data_to_aggregate(numeric_only=numeric_only, name="quantile") obj = self._wrap_agged_manager(mgr) - if self.axis == 1: - splitter = self._grouper._get_splitter(obj.T, axis=self.axis) - sdata = splitter._sorted_data.T - else: - splitter = self._grouper._get_splitter(obj, axis=self.axis) - sdata = splitter._sorted_data + splitter = self._grouper._get_splitter(obj) + sdata = splitter._sorted_data starts, ends = lib.generate_slices(splitter._slabels, splitter.ngroups) def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, DtypeObj | None]: - if is_object_dtype(vals.dtype): + if isinstance(vals.dtype, StringDtype) or is_object_dtype(vals.dtype): raise TypeError( - "'quantile' cannot be performed against 'object' dtypes!" + f"dtype '{vals.dtype}' does not support operation 'quantile'" ) inference: DtypeObj | None = None @@ -4392,16 +4449,8 @@ def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, DtypeObj | None]: elif is_bool_dtype(vals.dtype) and isinstance(vals, ExtensionArray): out = vals.to_numpy(dtype=float, na_value=np.nan) elif is_bool_dtype(vals.dtype): - # GH#51424 deprecate to match Series/DataFrame behavior - warnings.warn( - f"Allowing bool dtype in {type(self).__name__}.quantile is " - "deprecated and will raise in a future version, matching " - "the Series/DataFrame behavior. Cast to uint8 dtype before " - "calling quantile instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - out = np.asarray(vals) + # GH#51424 remove to match Series/DataFrame behavior + raise TypeError("Cannot use quantile with bool dtype") elif needs_i8_conversion(vals.dtype): inference = vals.dtype # In this case we need to delay the casting until after the @@ -4469,13 +4518,18 @@ def post_processor( return vals - qs = np.array(q, dtype=np.float64) - pass_qs: np.ndarray | None = qs if is_scalar(q): qs = np.array([q], dtype=np.float64) - pass_qs = None + pass_qs: None | np.ndarray = None + else: + qs = np.asarray(q, dtype=np.float64) + pass_qs = qs - ids, _, ngroups = self._grouper.group_info + ids = self._grouper.ids + ngroups = self._grouper.ngroups + if self.dropna: + # splitter drops NA groups, we need to do the same + ids = ids[ids >= 0] nqs = len(qs) func = partial( @@ -4514,7 +4568,7 @@ def blk_func(values: ArrayLike) -> ArrayLike: func( out[0], values=vals, - mask=mask, + mask=mask, # type: ignore[arg-type] result_mask=result_mask, is_datetimelike=is_datetimelike, ) @@ -4529,11 +4583,11 @@ def blk_func(values: ArrayLike) -> ArrayLike: ) if vals.ndim == 1: - out = out.ravel("K") + out = out.ravel("K") # type: ignore[assignment] if result_mask is not None: - result_mask = result_mask.ravel("K") + result_mask = result_mask.ravel("K") # type: ignore[assignment] else: - out = out.reshape(ncols, ngroups * nqs) + out = out.reshape(ncols, ngroups * nqs) # type: ignore[assignment] return post_processor(out, inference, result_mask, orig_vals) @@ -4607,8 +4661,8 @@ def ngroup(self, ascending: bool = True): dtype: int64 """ obj = self._obj_with_exclusions - index = obj._get_axis(self.axis) - comp_ids = self._grouper.group_info[0] + index = obj.index + comp_ids = self._grouper.ids dtype: type if self._grouper.has_dropped_na: @@ -4654,8 +4708,7 @@ def cumcount(self, ascending: bool = True): Examples -------- - >>> df = pd.DataFrame([['a'], ['a'], ['a'], ['b'], ['b'], ['a']], - ... 
columns=['A']) + >>> df = pd.DataFrame([["a"], ["a"], ["a"], ["b"], ["b"], ["a"]], columns=["A"]) >>> df A 0 a @@ -4664,7 +4717,7 @@ def cumcount(self, ascending: bool = True): 3 b 4 b 5 a - >>> df.groupby('A').cumcount() + >>> df.groupby("A").cumcount() 0 0 1 1 2 2 @@ -4672,7 +4725,7 @@ def cumcount(self, ascending: bool = True): 4 1 5 3 dtype: int64 - >>> df.groupby('A').cumcount(ascending=False) + >>> df.groupby("A").cumcount(ascending=False) 0 3 1 2 2 1 @@ -4681,7 +4734,7 @@ def cumcount(self, ascending: bool = True): 5 0 dtype: int64 """ - index = self._obj_with_exclusions._get_axis(self.axis) + index = self._obj_with_exclusions.index cumcounts = self._cumcount_array(ascending=ascending) return self._obj_1d_constructor(cumcounts, index) @@ -4694,7 +4747,6 @@ def rank( ascending: bool = True, na_option: str = "keep", pct: bool = False, - axis: AxisInt | lib.NoDefault = lib.no_default, ) -> NDFrameT: """ Provide the rank of values within each group. @@ -4715,16 +4767,11 @@ def rank( * bottom: smallest rank if descending. pct : bool, default False Compute percentage rank of data within each group. - axis : int, default 0 - The axis of the object over which to compute the rank. - - .. deprecated:: 2.1.0 - For axis=1, operate on the underlying object instead. Otherwise - the axis keyword is not necessary. Returns ------- - DataFrame with ranking of values within each group + DataFrame + The ranking of values within each group. %(see_also)s Examples -------- @@ -4746,8 +4793,8 @@ def rank( 7 b 4 8 b 1 9 b 5 - >>> for method in ['average', 'min', 'max', 'dense', 'first']: - ... df[f'{method}_rank'] = df.groupby('group')['value'].rank(method) + >>> for method in ["average", "min", "max", "dense", "first"]: + ... df[f"{method}_rank"] = df.groupby("group")["value"].rank(method) >>> df group value average_rank min_rank max_rank dense_rank first_rank 0 a 2 1.5 1.0 2.0 1.0 1.0 @@ -4765,52 +4812,46 @@ def rank( msg = "na_option must be one of 'keep', 'top', or 'bottom'" raise ValueError(msg) - if axis is not lib.no_default: - axis = self.obj._get_axis_number(axis) - self._deprecate_axis(axis, "rank") - else: - axis = 0 - kwargs = { "ties_method": method, "ascending": ascending, "na_option": na_option, "pct": pct, } - if axis != 0: - # DataFrame uses different keyword name - kwargs["method"] = kwargs.pop("ties_method") - f = lambda x: x.rank(axis=axis, numeric_only=False, **kwargs) - result = self._python_apply_general( - f, self._selected_obj, is_transform=True - ) - return result return self._cython_transform( "rank", numeric_only=False, - axis=axis, **kwargs, ) @final @Substitution(name="groupby") @Substitution(see_also=_common_see_also) - def cumprod( - self, axis: Axis | lib.NoDefault = lib.no_default, *args, **kwargs - ) -> NDFrameT: + def cumprod(self, numeric_only: bool = False, *args, **kwargs) -> NDFrameT: """ Cumulative product for each group. + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + *args : tuple + Positional arguments to be passed to `func`. + **kwargs : dict + Additional/specific keyword arguments to be passed to the function, + such as `skipna`. + Returns ------- Series or DataFrame + Cumulative product for each group. Same object type as the caller.
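A minimal sketch of the replacement pattern for former axis=1 callers of these cumulative ops, using the transpose recipe from the deprecation message this change enforces; the frame and the column grouping key are illustrative:

import pandas as pd

df = pd.DataFrame([[1, 2, 4], [3, 5, 6]], columns=["a", "b", "c"])
# Previously: df.groupby([0, 0, 1], axis=1).cumprod()
# Now: transpose, group along the index, transpose back
result = df.T.groupby([0, 0, 1]).cumprod().T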
%(see_also)s Examples -------- For SeriesGroupBy: - >>> lst = ['a', 'a', 'b'] + >>> lst = ["a", "a", "b"] >>> ser = pd.Series([6, 2, 0], index=lst) >>> ser a 6 @@ -4826,8 +4867,9 @@ def cumprod( For DataFrameGroupBy: >>> data = [[1, 8, 2], [1, 2, 5], [2, 6, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["cow", "horse", "bull"]) + >>> df = pd.DataFrame( + ... data, columns=["a", "b", "c"], index=["cow", "horse", "bull"] + ... ) >>> df a b c cow 1 8 2 @@ -4841,37 +4883,36 @@ def cumprod( horse 16 10 bull 6 9 """ - nv.validate_groupby_func("cumprod", args, kwargs, ["numeric_only", "skipna"]) - if axis is not lib.no_default: - axis = self.obj._get_axis_number(axis) - self._deprecate_axis(axis, "cumprod") - else: - axis = 0 - - if axis != 0: - f = lambda x: x.cumprod(axis=axis, **kwargs) - return self._python_apply_general(f, self._selected_obj, is_transform=True) - - return self._cython_transform("cumprod", **kwargs) + nv.validate_groupby_func("cumprod", args, kwargs, ["skipna"]) + return self._cython_transform("cumprod", numeric_only, **kwargs) @final @Substitution(name="groupby") @Substitution(see_also=_common_see_also) - def cumsum( - self, axis: Axis | lib.NoDefault = lib.no_default, *args, **kwargs - ) -> NDFrameT: + def cumsum(self, numeric_only: bool = False, *args, **kwargs) -> NDFrameT: """ Cumulative sum for each group. + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + *args : tuple + Positional arguments to be passed to `func`. + **kwargs : dict + Additional/specific keyword arguments to be passed to the function, + such as `skipna`. + Returns ------- Series or DataFrame + Cumulative sum for each group. Same object type as the caller. %(see_also)s Examples -------- For SeriesGroupBy: - >>> lst = ['a', 'a', 'b'] + >>> lst = ["a", "a", "b"] >>> ser = pd.Series([6, 2, 0], index=lst) >>> ser a 6 @@ -4887,8 +4928,9 @@ def cumsum( For DataFrameGroupBy: >>> data = [[1, 8, 2], [1, 2, 5], [2, 6, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["fox", "gorilla", "lion"]) + >>> df = pd.DataFrame( + ... data, columns=["a", "b", "c"], index=["fox", "gorilla", "lion"] + ... ) >>> df a b c fox 1 8 2 @@ -4902,40 +4944,38 @@ def cumsum( gorilla 10 7 lion 6 9 """ - nv.validate_groupby_func("cumsum", args, kwargs, ["numeric_only", "skipna"]) - if axis is not lib.no_default: - axis = self.obj._get_axis_number(axis) - self._deprecate_axis(axis, "cumsum") - else: - axis = 0 - - if axis != 0: - f = lambda x: x.cumsum(axis=axis, **kwargs) - return self._python_apply_general(f, self._selected_obj, is_transform=True) - - return self._cython_transform("cumsum", **kwargs) + nv.validate_groupby_func("cumsum", args, kwargs, ["skipna"]) + return self._cython_transform("cumsum", numeric_only, **kwargs) @final @Substitution(name="groupby") @Substitution(see_also=_common_see_also) def cummin( self, - axis: AxisInt | lib.NoDefault = lib.no_default, numeric_only: bool = False, **kwargs, ) -> NDFrameT: """ Cumulative min for each group. + Parameters + ---------- + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + **kwargs : dict, optional + Additional keyword arguments to be passed to the function, such as `skipna`, + to control whether NA/null values are ignored. + Returns ------- Series or DataFrame + Cumulative min for each group. Same object type as the caller.
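A short sketch of the now-explicit numeric_only parameter on these cumulative ops; the frame is illustrative, and with numeric_only=True the non-numeric column is simply excluded from the result:

import pandas as pd

df = pd.DataFrame({"g": ["a", "a", "b"], "x": [1, 2, 3], "s": ["p", "q", "r"]})
out = df.groupby("g").cumsum(numeric_only=True)  # cumulates x only; s is dropped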
%(see_also)s Examples -------- For SeriesGroupBy: - >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] + >>> lst = ["a", "a", "a", "b", "b", "b"] >>> ser = pd.Series([1, 6, 2, 3, 0, 4], index=lst) >>> ser a 1 @@ -4957,8 +4997,9 @@ def cummin( For DataFrameGroupBy: >>> data = [[1, 0, 2], [1, 1, 5], [6, 6, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["snake", "rabbit", "turtle"]) + >>> df = pd.DataFrame( + ... data, columns=["a", "b", "c"], index=["snake", "rabbit", "turtle"] + ... ) >>> df a b c snake 1 0 2 @@ -4973,19 +5014,6 @@ def cummin( turtle 6 9 """ skipna = kwargs.get("skipna", True) - if axis is not lib.no_default: - axis = self.obj._get_axis_number(axis) - self._deprecate_axis(axis, "cummin") - else: - axis = 0 - - if axis != 0: - f = lambda x: np.minimum.accumulate(x, axis) - obj = self._selected_obj - if numeric_only: - obj = obj._get_numeric_data() - return self._python_apply_general(f, obj, is_transform=True) - return self._cython_transform( "cummin", numeric_only=numeric_only, skipna=skipna ) @@ -4995,22 +5023,30 @@ def cummin( @Substitution(see_also=_common_see_also) def cummax( self, - axis: AxisInt | lib.NoDefault = lib.no_default, numeric_only: bool = False, **kwargs, ) -> NDFrameT: """ Cumulative max for each group. + Parameters + ---------- + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + **kwargs : dict, optional + Additional keyword arguments to be passed to the function, such as `skipna`, + to control whether NA/null values are ignored. + Returns ------- Series or DataFrame + Cumulative max for each group. Same object type as the caller. %(see_also)s Examples -------- For SeriesGroupBy: - >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] + >>> lst = ["a", "a", "a", "b", "b", "b"] >>> ser = pd.Series([1, 6, 2, 3, 1, 4], index=lst) >>> ser a 1 @@ -5032,8 +5068,9 @@ def cummax( For DataFrameGroupBy: >>> data = [[1, 8, 2], [1, 1, 0], [2, 6, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["cow", "horse", "bull"]) + >>> df = pd.DataFrame( + ... data, columns=["a", "b", "c"], index=["cow", "horse", "bull"] + ... ) >>> df a b c cow 1 8 2 @@ -5048,19 +5085,6 @@ def cummax( bull 6 9 """ skipna = kwargs.get("skipna", True) - if axis is not lib.no_default: - axis = self.obj._get_axis_number(axis) - self._deprecate_axis(axis, "cummax") - else: - axis = 0 - - if axis != 0: - f = lambda x: np.maximum.accumulate(x, axis) - obj = self._selected_obj - if numeric_only: - obj = obj._get_numeric_data() - return self._python_apply_general(f, obj, is_transform=True) - return self._cython_transform( "cummax", numeric_only=numeric_only, skipna=skipna ) @@ -5071,7 +5095,6 @@ def shift( self, periods: int | Sequence[int] = 1, freq=None, - axis: Axis | lib.NoDefault = lib.no_default, fill_value=lib.no_default, suffix: str | None = None, ): @@ -5087,13 +5110,6 @@ def shift( each period. freq : str, optional Frequency string. - axis : axis to shift, default 0 - Shift direction. - - .. deprecated:: 2.1.0 - For axis=1, operate on the underlying object instead. Otherwise - the axis keyword is not necessary. - fill_value : optional The scalar value to use for newly introduced missing values. 
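A minimal sketch of the surviving shift keywords; the data are illustrative, and the list-of-periods form with suffix assumes the pandas 2.2+ semantics (one shifted column per period, named like v_lag_1):

import pandas as pd

df = pd.DataFrame({"g": ["a", "a", "b", "b"], "v": [1, 2, 3, 4]})
lagged = df.groupby("g")["v"].shift(1, fill_value=0)      # per-group lag, NAs filled with 0
wide = df.groupby("g")["v"].shift([1, 2], suffix="_lag")  # one column per period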
@@ -5118,7 +5134,7 @@ def shift( For SeriesGroupBy: - >>> lst = ['a', 'a', 'b', 'b'] + >>> lst = ["a", "a", "b", "b"] >>> ser = pd.Series([1, 2, 3, 4], index=lst) >>> ser a 1 @@ -5136,8 +5152,11 @@ def shift( For DataFrameGroupBy: >>> data = [[1, 2, 3], [1, 5, 6], [2, 5, 8], [2, 6, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["tuna", "salmon", "catfish", "goldfish"]) + >>> df = pd.DataFrame( + ... data, + ... columns=["a", "b", "c"], + ... index=["tuna", "salmon", "catfish", "goldfish"], + ... ) >>> df a b c tuna 1 2 3 @@ -5151,17 +5170,7 @@ def shift( catfish NaN NaN goldfish 5.0 8.0 """ - if axis is not lib.no_default: - axis = self.obj._get_axis_number(axis) - self._deprecate_axis(axis, "shift") - else: - axis = 0 - if is_list_like(periods): - if axis == 1: - raise ValueError( - "If `periods` contains multiple shifts, `axis` cannot be 1." - ) periods = cast(Sequence, periods) if len(periods) == 0: raise ValueError("If `periods` is an iterable, it cannot be empty.") @@ -5185,9 +5194,12 @@ def shift( f"Periods must be integer, but {period} is {type(period)}." ) period = cast(int, period) - if freq is not None or axis != 0: + if freq is not None: f = lambda x: x.shift( - period, freq, axis, fill_value # pylint: disable=cell-var-from-loop + period, + freq, + 0, # axis + fill_value, ) shifted = self._python_apply_general( f, self._selected_obj, is_transform=True @@ -5195,7 +5207,8 @@ def shift( else: if fill_value is lib.no_default: fill_value = None - ids, _, ngroups = self._grouper.group_info + ids = self._grouper.ids + ngroups = self._grouper.ngroups res_indexer = np.zeros(len(ids), dtype=np.int64) libgroupby.group_shift_indexer(res_indexer, ids, ngroups, period) @@ -5203,7 +5216,7 @@ def shift( obj = self._obj_with_exclusions shifted = obj._reindex_with_indexers( - {self.axis: (obj.axes[self.axis], res_indexer)}, + {0: (obj.index, res_indexer)}, fill_value=fill_value, allow_dups=True, ) @@ -5226,7 +5239,8 @@ def shift( @Substitution(name="groupby") @Substitution(see_also=_common_see_also) def diff( - self, periods: int = 1, axis: AxisInt | lib.NoDefault = lib.no_default + self, + periods: int = 1, ) -> NDFrameT: """ First discrete difference of element. @@ -5238,12 +5252,6 @@ def diff( ---------- periods : int, default 1 Periods to shift for calculating difference, accepts negative values. - axis : axis to shift, default 0 - Take difference over rows (0) or columns (1). - - .. deprecated:: 2.1.0 - For axis=1, operate on the underlying object instead. Otherwise - the axis keyword is not necessary. Returns ------- @@ -5254,7 +5262,7 @@ def diff( -------- For SeriesGroupBy: - >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] + >>> lst = ["a", "a", "a", "b", "b", "b"] >>> ser = pd.Series([7, 2, 8, 4, 3, 3], index=lst) >>> ser a 7 @@ -5275,9 +5283,10 @@ def diff( For DataFrameGroupBy: - >>> data = {'a': [1, 3, 5, 7, 7, 8, 3], 'b': [1, 4, 8, 4, 4, 2, 1]} - >>> df = pd.DataFrame(data, index=['dog', 'dog', 'dog', - ... 'mouse', 'mouse', 'mouse', 'mouse']) + >>> data = {"a": [1, 3, 5, 7, 7, 8, 3], "b": [1, 4, 8, 4, 4, 2, 1]} + >>> df = pd.DataFrame( + ... data, index=["dog", "dog", "dog", "mouse", "mouse", "mouse", "mouse"] + ... 
) >>> df a b dog 1 1 @@ -5297,15 +5306,6 @@ def diff( mouse 1.0 -2.0 mouse -5.0 -1.0 """ - if axis is not lib.no_default: - axis = self.obj._get_axis_number(axis) - self._deprecate_axis(axis, "diff") - else: - axis = 0 - - if axis != 0: - return self.apply(lambda x: x.diff(periods=periods, axis=axis)) - obj = self._obj_with_exclusions shifted = self.shift(periods=periods) @@ -5317,8 +5317,8 @@ def diff( shifted = shifted.astype("float32") else: to_coerce = [c for c, dtype in obj.dtypes.items() if dtype in dtypes_to_f32] - if len(to_coerce): - shifted = shifted.astype({c: "float32" for c in to_coerce}) + if to_coerce: + shifted = shifted.astype(dict.fromkeys(to_coerce, "float32")) return obj - shifted @@ -5328,14 +5328,30 @@ def diff( def pct_change( self, periods: int = 1, - fill_method: FillnaOptions | None | lib.NoDefault = lib.no_default, - limit: int | None | lib.NoDefault = lib.no_default, + fill_method: None = None, freq=None, - axis: Axis | lib.NoDefault = lib.no_default, ): """ Calculate pct_change of each value to previous entry in group. + Parameters + ---------- + periods : int, default 1 + Periods to shift for calculating percentage change. Comparing with + a period of 1 means adjacent elements are compared, whereas a period + of 2 compares every other element. + + fill_method : None + Must be None. This argument will be removed in a future version of pandas. + + .. deprecated:: 2.1 + All options of `fill_method` are deprecated except `fill_method=None`. + + freq : str, pandas offset object, or None, default None + The frequency increment for time series data (e.g., 'M' for month-end). + If None, the frequency is inferred from the index. Relevant for time + series data only. + Returns ------- Series or DataFrame @@ -5346,7 +5362,7 @@ def pct_change( For SeriesGroupBy: - >>> lst = ['a', 'a', 'b', 'b'] + >>> lst = ["a", "a", "b", "b"] >>> ser = pd.Series([1, 2, 3, 4], index=lst) >>> ser a 1 @@ -5364,8 +5380,11 @@ def pct_change( For DataFrameGroupBy: >>> data = [[1, 2, 3], [1, 5, 6], [2, 5, 8], [2, 6, 9]] - >>> df = pd.DataFrame(data, columns=["a", "b", "c"], - ... index=["tuna", "salmon", "catfish", "goldfish"]) + >>> df = pd.DataFrame( + ... data, + ... columns=["a", "b", "c"], + ... index=["tuna", "salmon", "catfish", "goldfish"], + ... ) >>> df a b c tuna 1 2 3 @@ -5380,62 +5399,26 @@ def pct_change( goldfish 0.2 0.125 """ # GH#53491 - if fill_method not in (lib.no_default, None) or limit is not lib.no_default: - warnings.warn( - "The 'fill_method' keyword being not None and the 'limit' keyword in " - f"{type(self).__name__}.pct_change are deprecated and will be removed " - "in a future version. Either fill in any non-leading NA values prior " - "to calling pct_change or specify 'fill_method=None' to not fill NA " - "values.", - FutureWarning, - stacklevel=find_stack_level(), - ) - if fill_method is lib.no_default: - if limit is lib.no_default and any( - grp.isna().values.any() for _, grp in self - ): - warnings.warn( - "The default fill_method='ffill' in " - f"{type(self).__name__}.pct_change is deprecated and will " - "be removed in a future version. 
Either fill in any " - "non-leading NA values prior to calling pct_change or " - "specify 'fill_method=None' to not fill NA values.", - FutureWarning, - stacklevel=find_stack_level(), - ) - fill_method = "ffill" - if limit is lib.no_default: - limit = None - - if axis is not lib.no_default: - axis = self.obj._get_axis_number(axis) - self._deprecate_axis(axis, "pct_change") - else: - axis = 0 + if fill_method is not None: + raise ValueError(f"fill_method must be None; got {fill_method=}.") # TODO(GH#23918): Remove this conditional for SeriesGroupBy when # GH#23918 is fixed - if freq is not None or axis != 0: + if freq is not None: f = lambda x: x.pct_change( periods=periods, - fill_method=fill_method, - limit=limit, freq=freq, - axis=axis, + axis=0, ) return self._python_apply_general(f, self._selected_obj, is_transform=True) if fill_method is None: # GH30463 - fill_method = "ffill" - limit = 0 - filled = getattr(self, fill_method)(limit=limit) - if self.axis == 0: - fill_grp = filled.groupby(self._grouper.codes, group_keys=self.group_keys) + op = "ffill" else: - fill_grp = filled.T.groupby(self._grouper.codes, group_keys=self.group_keys) + op = fill_method + filled = getattr(self, op)(limit=0) + fill_grp = filled.groupby(self._grouper.codes, group_keys=self.group_keys) shifted = fill_grp.shift(periods=periods, freq=freq) - if self.axis == 1: - shifted = shifted.T return (filled / shifted) - 1 @final @@ -5463,13 +5446,12 @@ def head(self, n: int = 5) -> NDFrameT: Examples -------- - >>> df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], - ... columns=['A', 'B']) - >>> df.groupby('A').head(1) + >>> df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"]) + >>> df.groupby("A").head(1) A B 0 1 2 2 5 6 - >>> df.groupby('A').head(-1) + >>> df.groupby("A").head(-1) A B 0 1 2 """ @@ -5501,13 +5483,14 @@ def tail(self, n: int = 5) -> NDFrameT: Examples -------- - >>> df = pd.DataFrame([['a', 1], ['a', 2], ['b', 1], ['b', 2]], - ... columns=['A', 'B']) - >>> df.groupby('A').tail(1) + >>> df = pd.DataFrame( + ... [["a", 1], ["a", 2], ["b", 1], ["b", 2]], columns=["A", "B"] + ... ) + >>> df.groupby("A").tail(1) A B 1 a 2 3 b 2 - >>> df.groupby('A').tail(-1) + >>> df.groupby("A").tail(-1) A B 1 a 2 3 b 2 @@ -5522,7 +5505,7 @@ def tail(self, n: int = 5) -> NDFrameT: @final def _mask_selected_obj(self, mask: npt.NDArray[np.bool_]) -> NDFrameT: """ - Return _selected_obj with mask applied to the correct axis. + Return _selected_obj with mask applied. Parameters ---------- @@ -5534,111 +5517,9 @@ def _mask_selected_obj(self, mask: npt.NDArray[np.bool_]) -> NDFrameT: Series or DataFrame Filtered _selected_obj. """ - ids = self._grouper.group_info[0] + ids = self._grouper.ids mask = mask & (ids != -1) - - if self.axis == 0: - return self._selected_obj[mask] - else: - return self._selected_obj.iloc[:, mask] - - @final - def _reindex_output( - self, - output: OutputFrameOrSeries, - fill_value: Scalar = np.nan, - qs: npt.NDArray[np.float64] | None = None, - ) -> OutputFrameOrSeries: - """ - If we have categorical groupers, then we might want to make sure that - we have a fully re-indexed output to the levels. This means expanding - the output space to accommodate all values in the cartesian product of - our groups, regardless of whether they were observed in the data or - not. This will expand the output space if there are missing groups. - - The method returns early without modifying the input if the number of - groupings is less than 2, self.observed == True or none of the groupers - are categorical. 
- - Parameters - ---------- - output : Series or DataFrame - Object resulting from grouping and applying an operation. - fill_value : scalar, default np.nan - Value to use for unobserved categories if self.observed is False. - qs : np.ndarray[float64] or None, default None - quantile values, only relevant for quantile. - - Returns - ------- - Series or DataFrame - Object (potentially) re-indexed to include all possible groups. - """ - groupings = self._grouper.groupings - if len(groupings) == 1: - return output - - # if we only care about the observed values - # we are done - elif self.observed: - return output - - # reindexing only applies to a Categorical grouper - elif not any( - isinstance(ping.grouping_vector, (Categorical, CategoricalIndex)) - for ping in groupings - ): - return output - - levels_list = [ping._group_index for ping in groupings] - names = self._grouper.names - if qs is not None: - # error: Argument 1 to "append" of "list" has incompatible type - # "ndarray[Any, dtype[floating[_64Bit]]]"; expected "Index" - levels_list.append(qs) # type: ignore[arg-type] - names = names + [None] - index = MultiIndex.from_product(levels_list, names=names) - if self.sort: - index = index.sort_values() - - if self.as_index: - # Always holds for SeriesGroupBy unless GH#36507 is implemented - d = { - self.obj._get_axis_name(self.axis): index, - "copy": False, - "fill_value": fill_value, - } - return output.reindex(**d) # type: ignore[arg-type] - - # GH 13204 - # Here, the categorical in-axis groupers, which need to be fully - # expanded, are columns in `output`. An idea is to do: - # output = output.set_index(self._grouper.names) - # .reindex(index).reset_index() - # but special care has to be taken because of possible not-in-axis - # groupers. - # So, we manually select and drop the in-axis grouper columns, - # reindex `output`, and then reset the in-axis grouper columns. - - # Select in-axis groupers - in_axis_grps = [ - (i, ping.name) for (i, ping) in enumerate(groupings) if ping.in_axis - ] - if len(in_axis_grps) > 0: - g_nums, g_names = zip(*in_axis_grps) - output = output.drop(labels=list(g_names), axis=1) - - # Set a temp index and reindex (possibly expanding) - output = output.set_index(self._grouper.result_index).reindex( - index, copy=False, fill_value=fill_value - ) - - # Reset in-axis grouper columns - # (using level numbers `g_nums` because level names may not be unique) - if len(in_axis_grps) > 0: - output = output.reset_index(level=g_nums) - - return output.reset_index(drop=True) + return self._selected_obj[mask] @final def sample( @@ -5674,6 +5555,7 @@ def sample( random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional If int, array-like, or BitGenerator, seed for random number generator. If np.random.RandomState or np.random.Generator, use as given. + Default ``None`` results in sampling with the current state of np.random. .. versionchanged:: 1.4.0 @@ -5688,6 +5570,7 @@ def sample( See Also -------- DataFrame.sample: Generate random samples from a DataFrame object. + Series.sample: Generate random samples from a Series object. numpy.random.choice: Generate a random sample from a given 1-D numpy array. 
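A minimal sketch of the reproducible sampling documented above; the frame and seed are illustrative:

import pandas as pd

df = pd.DataFrame({"g": ["a", "a", "b", "b"], "v": [1, 2, 3, 4]})
one_per_group = df.groupby("g").sample(n=1, random_state=1)  # same seed, same rows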
@@ -5739,13 +5622,11 @@ def sample( return self._selected_obj size = sample.process_sampling_size(n, frac, replace) if weights is not None: - weights_arr = sample.preprocess_weights( - self._selected_obj, weights, axis=self.axis - ) + weights_arr = sample.preprocess_weights(self._selected_obj, weights, axis=0) random_state = com.random_state(random_state) - group_iterator = self._grouper.get_iterator(self._selected_obj, self.axis) + group_iterator = self._grouper.get_iterator(self._selected_obj) sampled_indices = [] for labels, obj in group_iterator: @@ -5767,13 +5648,12 @@ def sample( sampled_indices.append(grp_indices[grp_sample]) sampled_indices = np.concatenate(sampled_indices) - return self._selected_obj.take(sampled_indices, axis=self.axis) + return self._selected_obj.take(sampled_indices, axis=0) def _idxmax_idxmin( self, how: Literal["idxmax", "idxmin"], ignore_unobserved: bool = False, - axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool = True, numeric_only: bool = False, ) -> NDFrameT: @@ -5783,14 +5663,10 @@ def _idxmax_idxmin( ---------- how : {'idxmin', 'idxmax'} Whether to compute idxmin or idxmax. - axis : {{0 or 'index', 1 or 'columns'}}, default None - The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. - If axis is not provided, grouper's axis is used. numeric_only : bool, default False Include only float, int, boolean columns. skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. + Exclude NA/null values. If an entire group is NA, the result will be NA. ignore_unobserved : bool, default False When True and an unobserved group is encountered, do not raise. This is used for transform, where unobserved groups have no impact on the result. @@ -5800,25 +5676,13 @@ def _idxmax_idxmin( Series or DataFrame idxmax or idxmin for the groupby operation. """ - if axis is not lib.no_default: - if axis is None: - axis = self.axis - axis = self.obj._get_axis_number(axis) - self._deprecate_axis(axis, how) - else: - axis = self.axis - if not self.observed and any( ping._passed_categorical for ping in self._grouper.groupings ): - expected_len = np.prod( - [len(ping._group_index) for ping in self._grouper.groupings] - ) - if len(self._grouper.groupings) == 1: - result_len = len(self._grouper.groupings[0].grouping_vector.unique()) - else: - # result_index only contains observed groups in this case - result_len = len(self._grouper.result_index) + expected_len = len(self._grouper.result_index) + # TODO: Better way to find # of observed groups? + group_sizes = self._grouper.size() + result_len = group_sizes[group_sizes > 0].shape[0] assert result_len <= expected_len has_unobserved = result_len < expected_len @@ -5837,36 +5701,11 @@ def _idxmax_idxmin( f"Can't get {how} of an empty group due to unobserved categories. " "Specify observed=True in groupby instead." ) - elif not skipna: - if self._obj_with_exclusions.isna().any(axis=None): - warnings.warn( - f"The behavior of {type(self).__name__}.{how} with all-NA " - "values, or any-NA and skipna=False, is deprecated.
In a future " - "version this will raise ValueError", - FutureWarning, - stacklevel=find_stack_level(), - ) - - if axis == 1: - try: - - def func(df): - method = getattr(df, how) - return method(axis=axis, skipna=skipna, numeric_only=numeric_only) - - func.__name__ = how - result = self._python_apply_general( - func, self._obj_with_exclusions, not_indexed_same=True - ) - except ValueError as err: - name = "argmax" if how == "idxmax" else "argmin" - if f"attempt to get {name} of an empty sequence" in str(err): - raise ValueError( - f"Can't get {how} of an empty group due to unobserved " - "categories. Specify observed=True in groupby instead." - ) from None - raise - return result + elif not skipna and self._obj_with_exclusions.isna().any(axis=None): + raise ValueError( + f"{type(self).__name__}.{how} with skipna=False encountered an NA " + f"value." + ) result = self._agg_general( numeric_only=numeric_only, @@ -5877,7 +5716,7 @@ def func(df): return result def _wrap_idxmax_idxmin(self, res: NDFrameT) -> NDFrameT: - index = self.obj._get_axis(self.axis) + index = self.obj.index if res.size == 0: result = res.astype(index.dtype) else: @@ -5908,7 +5747,6 @@ def _wrap_idxmax_idxmin(self, res: NDFrameT) -> NDFrameT: def get_groupby( obj: NDFrame, by: _KeysArgType | None = None, - axis: AxisInt = 0, grouper: ops.BaseGrouper | None = None, group_keys: bool = True, ) -> GroupBy: @@ -5927,7 +5765,6 @@ def get_groupby( return klass( obj=obj, keys=by, - axis=axis, grouper=grouper, group_keys=group_keys, ) @@ -5965,13 +5802,3 @@ def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiInde mi = MultiIndex(levels=levels, codes=codes, names=[idx.name, None]) return mi - - -# GH#7155 -_apply_groupings_depr = ( - "{}.{} operated on the grouping columns. This behavior is deprecated, " - "and in a future version of pandas the grouping columns will be excluded " - "from the operation. Either pass `include_groups=False` to exclude the " - "groupings or explicitly select the grouping columns after groupby to silence " - "this warning." -) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index e2224caad9e84..f8e92b7e2650a 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -2,28 +2,26 @@ Provide user facing operators for doing the split part of the split-apply-combine paradigm. 
""" + from __future__ import annotations from typing import ( TYPE_CHECKING, final, ) -import warnings import numpy as np -from pandas._config import ( - using_copy_on_write, - warn_copy_on_write, +from pandas._libs import ( + algos as libalgos, ) - -from pandas._libs import lib from pandas._libs.tslibs import OutOfBoundsDatetime from pandas.errors import InvalidIndexError from pandas.util._decorators import cache_readonly -from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( + ensure_int64, + ensure_platform_int, is_list_like, is_scalar, ) @@ -39,13 +37,16 @@ from pandas.core.groupby import ops from pandas.core.groupby.categorical import recode_for_groupby from pandas.core.indexes.api import ( - CategoricalIndex, Index, MultiIndex, + default_index, ) from pandas.core.series import Series -from pandas.io.formats.printing import pprint_thing +from pandas.io.formats.printing import ( + PrettyDict, + pprint_thing, +) if TYPE_CHECKING: from collections.abc import ( @@ -55,7 +56,6 @@ from pandas._typing import ( ArrayLike, - Axis, NDFrameT, npt, ) @@ -68,14 +68,21 @@ class Grouper: A Grouper allows the user to specify a groupby instruction for an object. This specification will select a column via the key parameter, or if the - level and/or axis parameters are given, a level of the index of the target + level parameter is given, a level of the index of the target object. - If `axis` and/or `level` are passed as keywords to both `Grouper` and + If ``level`` is passed as a keyword to both `Grouper` and `groupby`, the values passed to `Grouper` take precedence. Parameters ---------- + *args + Currently unused, reserved for future use. + **kwargs + Dictionary of the keyword arguments to pass to Grouper. + + Attributes + ---------- key : str, defaults to None Groupby key, which selects the grouping column of the target. level : name/number, defaults to None @@ -83,10 +90,7 @@ class Grouper: freq : str / frequency object, defaults to None This will groupby the specified frequency if the target selection (via key or level) is a datetime-like object. For full specification - of available frequencies, please see `here - `_. - axis : str, int, defaults to 0 - Number/name of the axis. + of available frequencies, please see :ref:`here`. sort : bool, default to False Whether to sort the resulting labels. closed : {'left' or 'right'} @@ -125,6 +129,11 @@ class Grouper: A TimeGrouper is returned if ``freq`` is not ``None``. Otherwise, a Grouper is returned. + See Also + -------- + Series.groupby : Apply a function groupby to a Series. + DataFrame.groupby : Apply a function groupby. + Examples -------- ``df.groupby(pd.Grouper(key="Animal"))`` is equivalent to ``df.groupby('Animal')`` @@ -151,15 +160,15 @@ class Grouper: Specify a resample operation on the column 'Publish date' >>> df = pd.DataFrame( - ... { - ... "Publish date": [ + ... { + ... "Publish date": [ ... pd.Timestamp("2000-01-02"), ... pd.Timestamp("2000-01-02"), ... pd.Timestamp("2000-01-09"), - ... pd.Timestamp("2000-01-16") + ... pd.Timestamp("2000-01-16"), ... ], ... "ID": [0, 1, 2, 3], - ... "Price": [10, 20, 30, 40] + ... "Price": [10, 20, 30, 40], ... } ... 
) >>> df @@ -177,8 +186,8 @@ class Grouper: If you want to adjust the start of the bins based on a fixed timestamp: - >>> start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00' - >>> rng = pd.date_range(start, end, freq='7min') + >>> start, end = "2000-10-01 23:30:00", "2000-10-02 00:30:00" + >>> rng = pd.date_range(start, end, freq="7min") >>> ts = pd.Series(np.arange(len(rng)) * 3, index=rng) >>> ts 2000-10-01 23:30:00 0 @@ -192,7 +201,7 @@ class Grouper: 2000-10-02 00:26:00 24 Freq: 7min, dtype: int64 - >>> ts.groupby(pd.Grouper(freq='17min')).sum() + >>> ts.groupby(pd.Grouper(freq="17min")).sum() 2000-10-01 23:14:00 0 2000-10-01 23:31:00 9 2000-10-01 23:48:00 21 @@ -200,7 +209,7 @@ class Grouper: 2000-10-02 00:22:00 24 Freq: 17min, dtype: int64 - >>> ts.groupby(pd.Grouper(freq='17min', origin='epoch')).sum() + >>> ts.groupby(pd.Grouper(freq="17min", origin="epoch")).sum() 2000-10-01 23:18:00 0 2000-10-01 23:35:00 18 2000-10-01 23:52:00 27 @@ -208,7 +217,7 @@ class Grouper: 2000-10-02 00:26:00 24 Freq: 17min, dtype: int64 - >>> ts.groupby(pd.Grouper(freq='17min', origin='2000-01-01')).sum() + >>> ts.groupby(pd.Grouper(freq="17min", origin="2000-01-01")).sum() 2000-10-01 23:24:00 3 2000-10-01 23:41:00 15 2000-10-01 23:58:00 45 @@ -218,14 +227,14 @@ class Grouper: If you want to adjust the start of the bins with an `offset` Timedelta, the two following lines are equivalent: - >>> ts.groupby(pd.Grouper(freq='17min', origin='start')).sum() + >>> ts.groupby(pd.Grouper(freq="17min", origin="start")).sum() 2000-10-01 23:30:00 9 2000-10-01 23:47:00 21 2000-10-02 00:04:00 54 2000-10-02 00:21:00 24 Freq: 17min, dtype: int64 - >>> ts.groupby(pd.Grouper(freq='17min', offset='23h30min')).sum() + >>> ts.groupby(pd.Grouper(freq="17min", offset="23h30min")).sum() 2000-10-01 23:30:00 9 2000-10-01 23:47:00 21 2000-10-02 00:04:00 54 @@ -235,7 +244,7 @@ class Grouper: To replace the use of the deprecated `base` argument, you can now use `offset`, in this example it is equivalent to have `base=2`: - >>> ts.groupby(pd.Grouper(freq='17min', offset='2min')).sum() + >>> ts.groupby(pd.Grouper(freq="17min", offset="2min")).sum() 2000-10-01 23:16:00 0 2000-10-01 23:33:00 9 2000-10-01 23:50:00 36 @@ -246,10 +255,9 @@ class Grouper: sort: bool dropna: bool - _gpr_index: Index | None _grouper: Index | None - _attributes: tuple[str, ...] = ("key", "level", "freq", "axis", "sort", "dropna") + _attributes: tuple[str, ...] = ("key", "level", "freq", "sort", "dropna") def __new__(cls, *args, **kwargs): if kwargs.get("freq") is not None: @@ -263,36 +271,16 @@ def __init__( key=None, level=None, freq=None, - axis: Axis | lib.NoDefault = lib.no_default, sort: bool = False, dropna: bool = True, ) -> None: - if type(self) is Grouper: - # i.e. not TimeGrouper - if axis is not lib.no_default: - warnings.warn( - "Grouper axis keyword is deprecated and will be removed in a " - "future version. To group on axis=1, use obj.T.groupby(...) 
" - "instead", - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - axis = 0 - if axis is lib.no_default: - axis = 0 - self.key = key self.level = level self.freq = freq - self.axis = axis self.sort = sort self.dropna = dropna - self._grouper_deprecated = None self._indexer_deprecated: npt.NDArray[np.intp] | None = None - self._obj_deprecated = None - self._gpr_index = None self.binner = None self._grouper = None self._indexer: npt.NDArray[np.intp] | None = None @@ -315,16 +303,11 @@ def _get_grouper( grouper, _, obj = get_grouper( obj, [self.key], - axis=self.axis, level=self.level, sort=self.sort, validate=validate, dropna=self.dropna, ) - # Without setting this, subsequent lookups to .groups raise - # error: Incompatible types in assignment (expression has type "BaseGrouper", - # variable has type "None") - self._grouper_deprecated = grouper # type: ignore[assignment] return grouper, obj @@ -381,7 +364,7 @@ def _set_grouper( ax = Index(obj[key], name=key) else: - ax = obj._get_axis(self.axis) + ax = obj.index if self.level is not None: level = self.level @@ -404,79 +387,14 @@ def _set_grouper( kind="mergesort", na_position="first" ) ax = ax.take(indexer) - obj = obj.take(indexer, axis=self.axis) + obj = obj.take(indexer, axis=0) - # error: Incompatible types in assignment (expression has type - # "NDFrameT", variable has type "None") - self._obj_deprecated = obj # type: ignore[assignment] - self._gpr_index = ax return obj, ax, indexer - @final - @property - def ax(self) -> Index: - warnings.warn( - f"{type(self).__name__}.ax is deprecated and will be removed in a " - "future version. Use Resampler.ax instead", - FutureWarning, - stacklevel=find_stack_level(), - ) - index = self._gpr_index - if index is None: - raise ValueError("_set_grouper must be called before ax is accessed") - return index - - @final - @property - def indexer(self): - warnings.warn( - f"{type(self).__name__}.indexer is deprecated and will be removed " - "in a future version. Use Resampler.indexer instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - return self._indexer_deprecated - - @final - @property - def obj(self): - # TODO(3.0): enforcing these deprecations on Grouper should close - # GH#25564, GH#41930 - warnings.warn( - f"{type(self).__name__}.obj is deprecated and will be removed " - "in a future version. Use GroupBy.indexer instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - return self._obj_deprecated - - @final - @property - def grouper(self): - warnings.warn( - f"{type(self).__name__}.grouper is deprecated and will be removed " - "in a future version. Use GroupBy.grouper instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - return self._grouper_deprecated - - @final - @property - def groups(self): - warnings.warn( - f"{type(self).__name__}.groups is deprecated and will be removed " - "in a future version. 
Use GroupBy.groups instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - # error: "None" has no attribute "groups" - return self._grouper_deprecated.groups # type: ignore[attr-defined] - @final def __repr__(self) -> str: attrs_list = ( - f"{attr_name}={repr(getattr(self, attr_name))}" + f"{attr_name}={getattr(self, attr_name)!r}" for attr_name in self._attributes if getattr(self, attr_name) is not None ) @@ -520,7 +438,6 @@ class Grouping: """ _codes: npt.NDArray[np.signedinteger] | None = None - _all_grouper: Categorical | None _orig_cats: Index | None _index: Index @@ -539,7 +456,6 @@ def __init__( self.level = level self._orig_grouper = grouper grouping_vector = _convert_grouper(index, grouper) - self._all_grouper = None self._orig_cats = None self._index = index self._sort = sort @@ -608,8 +524,7 @@ def __init__( ): grper = pprint_thing(grouping_vector) errmsg = ( - "Grouper result violates len(labels) == " - f"len(data)\nresult: {grper}" + f"Grouper result violates len(labels) == len(data)\nresult: {grper}" ) raise AssertionError(errmsg) @@ -623,9 +538,7 @@ def __init__( elif isinstance(getattr(grouping_vector, "dtype", None), CategoricalDtype): # a passed Categorical self._orig_cats = grouping_vector.categories - grouping_vector, self._all_grouper = recode_for_groupby( - grouping_vector, sort, observed - ) + grouping_vector = recode_for_groupby(grouping_vector, sort, observed) self.grouping_vector = grouping_vector @@ -675,7 +588,7 @@ def _ilevel(self) -> int | None: @property def ngroups(self) -> int: - return len(self._group_index) + return len(self.uniques) @cache_readonly def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: @@ -690,89 +603,9 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: def codes(self) -> npt.NDArray[np.signedinteger]: return self._codes_and_uniques[0] - @cache_readonly - def _group_arraylike(self) -> ArrayLike: - """ - Analogous to result_index, but holding an ArrayLike to ensure - we can retain ExtensionDtypes. - """ - if self._all_grouper is not None: - # retain dtype for categories, including unobserved ones - return self._result_index._values - - elif self._passed_categorical: - return self._group_index._values - - return self._codes_and_uniques[1] - @property - def group_arraylike(self) -> ArrayLike: - """ - Analogous to result_index, but holding an ArrayLike to ensure - we can retain ExtensionDtypes. 
- """ - warnings.warn( - "group_arraylike is deprecated and will be removed in a future " - "version of pandas", - category=FutureWarning, - stacklevel=find_stack_level(), - ) - return self._group_arraylike - - @cache_readonly - def _result_index(self) -> Index: - # result_index retains dtype for categories, including unobserved ones, - # which group_index does not - if self._all_grouper is not None: - group_idx = self._group_index - assert isinstance(group_idx, CategoricalIndex) - cats = self._orig_cats - # set_categories is dynamically added - return group_idx.set_categories(cats) # type: ignore[attr-defined] - return self._group_index - - @property - def result_index(self) -> Index: - warnings.warn( - "result_index is deprecated and will be removed in a future " - "version of pandas", - category=FutureWarning, - stacklevel=find_stack_level(), - ) - return self._result_index - - @cache_readonly - def _group_index(self) -> Index: - codes, uniques = self._codes_and_uniques - if not self._dropna and self._passed_categorical: - assert isinstance(uniques, Categorical) - if self._sort and (codes == len(uniques)).any(): - # Add NA value on the end when sorting - uniques = Categorical.from_codes( - np.append(uniques.codes, [-1]), uniques.categories, validate=False - ) - elif len(codes) > 0: - # Need to determine proper placement of NA value when not sorting - cat = self.grouping_vector - na_idx = (cat.codes < 0).argmax() - if cat.codes[na_idx] < 0: - # count number of unique codes that comes before the nan value - na_unique_idx = algorithms.nunique_ints(cat.codes[:na_idx]) - new_codes = np.insert(uniques.codes, na_unique_idx, -1) - uniques = Categorical.from_codes( - new_codes, uniques.categories, validate=False - ) - return Index._with_infer(uniques, name=self.name) - - @property - def group_index(self) -> Index: - warnings.warn( - "group_index is deprecated and will be removed in a future " - "version of pandas", - category=FutureWarning, - stacklevel=find_stack_level(), - ) - return self._group_index + def uniques(self) -> ArrayLike: + return self._codes_and_uniques[1] @cache_readonly def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: @@ -792,29 +625,31 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: else: ucodes = np.arange(len(categories)) - uniques = Categorical.from_codes( - codes=ucodes, categories=categories, ordered=cat.ordered, validate=False - ) - - codes = cat.codes + has_dropped_na = False if not self._dropna: - na_mask = codes < 0 + na_mask = cat.isna() if np.any(na_mask): + has_dropped_na = True if self._sort: - # Replace NA codes with `largest code + 1` + # NA goes at the end, gets `largest non-NA code + 1` na_code = len(categories) - codes = np.where(na_mask, na_code, codes) else: - # Insert NA code into the codes based on first appearance - # A negative code must exist, no need to check codes[na_idx] < 0 + # Insert NA in result based on first appearance, need + # the number of unique codes prior na_idx = na_mask.argmax() - # count number of unique codes that comes before the nan value - na_code = algorithms.nunique_ints(codes[:na_idx]) - codes = np.where(codes >= na_code, codes + 1, codes) - codes = np.where(na_mask, na_code, codes) + na_code = algorithms.nunique_ints(cat.codes[:na_idx]) + ucodes = np.insert(ucodes, na_code, -1) + + uniques = Categorical.from_codes( + codes=ucodes, categories=categories, ordered=cat.ordered, validate=False + ) + codes = cat.codes - if not self._observed: - uniques = 
uniques.reorder_categories(self._orig_cats) + if has_dropped_na: + if not self._sort: + # NA code is based on first appearance, increment higher codes + codes = np.where(codes >= na_code, codes + 1, codes) + codes = np.where(na_mask, na_code, codes) return codes, uniques @@ -838,15 +673,44 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: return codes, uniques @cache_readonly - def groups(self) -> dict[Hashable, np.ndarray]: - cats = Categorical.from_codes(self.codes, self._group_index, validate=False) - return self._index.groupby(cats) + def groups(self) -> dict[Hashable, Index]: + codes, uniques = self._codes_and_uniques + uniques = Index._with_infer(uniques, name=self.name) + + r, counts = libalgos.groupsort_indexer(ensure_platform_int(codes), len(uniques)) + counts = ensure_int64(counts).cumsum() + _result = (r[start:end] for start, end in zip(counts, counts[1:])) + # map to the label + result = {k: self._index.take(v) for k, v in zip(uniques, _result)} + + return PrettyDict(result) + + @property + def observed_grouping(self) -> Grouping: + if self._observed: + return self + + return self._observed_grouping + + @cache_readonly + def _observed_grouping(self) -> Grouping: + grouping = Grouping( + self._index, + self._orig_grouper, + obj=self.obj, + level=self.level, + sort=self._sort, + observed=True, + in_axis=self.in_axis, + dropna=self._dropna, + uniques=self._uniques, + ) + return grouping def get_grouper( obj: NDFrameT, key=None, - axis: Axis = 0, level=None, sort: bool = True, observed: bool = False, @@ -862,8 +726,8 @@ def get_grouper( Groupers are ultimately index mappings. They can originate as: index mappings, keys to columns, functions, or Groupers - Groupers enable local references to axis,level,sort, while - the passed in axis, level, and sort are 'global'. + Groupers enable local references to level and sort, while + the passed-in level and sort are 'global'. This routine tries to figure out what the passing in references are and then creates a Grouping for each one, combined into @@ -875,10 +739,10 @@ def get_grouper( If validate, then check for key/level overlaps. """ - group_axis = obj._get_axis(axis) + group_axis = obj.index # validate that the passed single level is compatible with the passed - # axis of the object + # index of the object if level is not None: # TODO: These if-block and else-block are almost same.
# MultiIndex instance check is removable, but it seems that there are @@ -911,11 +775,8 @@ def get_grouper( raise ValueError("multiple levels only valid with MultiIndex") if isinstance(level, str): - if obj._get_axis(axis).name != level: - raise ValueError( - f"level name {level} is not the name " - f"of the {obj._get_axis_name(axis)}" - ) + if obj.index.name != level: + raise ValueError(f"level name {level} is not the name of the index") elif level > 0 or level < -1: raise ValueError("level > 0 or level < -1 only valid with MultiIndex") @@ -999,28 +860,15 @@ def is_in_axis(key) -> bool: def is_in_obj(gpr) -> bool: if not hasattr(gpr, "name"): return False - if using_copy_on_write() or warn_copy_on_write(): - # For the CoW case, we check the references to determine if the - # series is part of the object - try: - obj_gpr_column = obj[gpr.name] - except (KeyError, IndexError, InvalidIndexError, OutOfBoundsDatetime): - return False - if isinstance(gpr, Series) and isinstance(obj_gpr_column, Series): - return gpr._mgr.references_same_values( # type: ignore[union-attr] - obj_gpr_column._mgr, 0 # type: ignore[arg-type] - ) - return False + # We check the references to determine if the + # series is part of the object try: - return gpr is obj[gpr.name] + obj_gpr_column = obj[gpr.name] except (KeyError, IndexError, InvalidIndexError, OutOfBoundsDatetime): - # IndexError reached in e.g. test_skip_group_keys when we pass - # lambda here - # InvalidIndexError raised on key-types inappropriate for index, - # e.g. DatetimeIndex.get_loc(tuple()) - # OutOfBoundsDatetime raised when obj is a Series with DatetimeIndex - # and gpr.name is month str return False + if isinstance(gpr, Series) and isinstance(obj_gpr_column, Series): + return gpr._mgr.references_same_values(obj_gpr_column._mgr, 0) + return False for gpr, level in zip(keys, levels): if is_in_obj(gpr): # df.groupby(df['name']) @@ -1030,14 +878,14 @@ def is_in_obj(gpr) -> bool: elif is_in_axis(gpr): # df.groupby('name') if obj.ndim != 1 and gpr in obj: if validate: - obj._check_label_or_level_ambiguity(gpr, axis=axis) + obj._check_label_or_level_ambiguity(gpr, axis=0) in_axis, name, gpr = True, gpr, obj[gpr] if gpr.ndim != 1: # non-unique columns; raise here to get the name in the # exception message raise ValueError(f"Grouper for '{name}' not 1-dimensional") exclusions.add(name) - elif obj._is_level_reference(gpr, axis=axis): + elif obj._is_level_reference(gpr, axis=0): in_axis, level, gpr = False, gpr, None else: raise KeyError(gpr) @@ -1070,7 +918,7 @@ def is_in_obj(gpr) -> bool: if len(groupings) == 0 and len(obj): raise ValueError("No group keys passed!") if len(groupings) == 0: - groupings.append(Grouping(Index([], dtype="int"), np.array([], dtype=np.intp))) + groupings.append(Grouping(default_index(0), np.array([], dtype=np.intp))) # create the internals grouper grouper = ops.BaseGrouper(group_axis, groupings, sort=sort, dropna=dropna) diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index a3c5ab8edc94e..c658f625d5ea9 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -99,8 +99,9 @@ def _positional_selector(self) -> GroupByPositionalSelector: Examples -------- - >>> df = pd.DataFrame([["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], - ... columns=["A", "B"]) + >>> df = pd.DataFrame( + ... [["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], columns=["A", "B"] + ... 
) >>> df.groupby("A")._positional_selector[1:2] A B 1 a 2 @@ -113,7 +114,6 @@ def _positional_selector(self) -> GroupByPositionalSelector: 4 b 5 """ if TYPE_CHECKING: - # pylint: disable-next=used-before-assignment groupby_self = cast(groupby.GroupBy, self) else: groupby_self = self diff --git a/pandas/core/groupby/numba_.py b/pandas/core/groupby/numba_.py index 3b7a58e87603e..73b681c64c3a3 100644 --- a/pandas/core/groupby/numba_.py +++ b/pandas/core/groupby/numba_.py @@ -1,4 +1,5 @@ """Common utilities for Numba operations with groupby ops""" + from __future__ import annotations import functools @@ -6,7 +7,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, ) import numpy as np @@ -19,6 +19,8 @@ ) if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import Scalar diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 5e83eaee02afc..75f3495041917 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -5,13 +5,13 @@ operations, primarily in cython. These classes (BaseGrouper and BinGrouper) are contained *in* the SeriesGroupBy and DataFrameGroupBy objects. """ + from __future__ import annotations import collections import functools from typing import ( TYPE_CHECKING, - Callable, Generic, final, ) @@ -49,6 +49,7 @@ maybe_fill, ) +from pandas.core.arrays import Categorical from pandas.core.frame import DataFrame from pandas.core.groupby import grouper from pandas.core.indexes.api import ( @@ -61,7 +62,6 @@ from pandas.core.sorting import ( compress_group_index, decons_obs_group_ids, - get_flattened_list, get_group_index, get_group_index_sorter, get_indexer_dict, @@ -69,9 +69,10 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, + Generator, Hashable, Iterator, - Sequence, ) from pandas.core.generic import NDFrame @@ -143,6 +144,7 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None: "std": functools.partial(libgroupby.group_var, name="std"), "sem": functools.partial(libgroupby.group_var, name="sem"), "skew": "group_skew", + "kurt": "group_kurt", "first": "group_nth", "last": "group_last", "ohlc": "group_ohlc", @@ -192,7 +194,7 @@ def _get_cython_function( elif how in ["std", "sem", "idxmin", "idxmax"]: # We have a partial object that does not have __signatures__ return f - elif how == "skew": + elif how in ["skew", "kurt"]: # _get_cython_vals will convert to float64 pass elif "object" not in f.__signatures__: @@ -223,7 +225,7 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray: """ how = self.how - if how in ["median", "std", "sem", "skew"]: + if how in ["median", "std", "sem", "skew", "kurt"]: # median only has a float64 implementation # We should only get here with is_numeric, as non-numeric cases # should raise in _get_cython_function @@ -370,6 +372,10 @@ def _call_cython_op( is_datetimelike = dtype.kind in "mM" + if self.how in ["any", "all"]: + if mask is None: + mask = isna(values) + if is_datetimelike: values = values.view("int64") is_numeric = True @@ -379,12 +385,10 @@ def _call_cython_op( values = values.astype(np.float32) if self.how in ["any", "all"]: - if mask is None: - mask = isna(values) if dtype == object: if kwargs["skipna"]: # GH#37501: don't raise on pd.NA when skipna=True - if mask.any(): + if mask is not None and mask.any(): # mask on original values computed separately values = values.copy() values[mask] = True @@ -414,6 +418,7 @@ def _call_cython_op( "last", "first", "sum", + "median", ]: func( out=result, @@ -424,8 +429,9 @@ def _call_cython_op( 
mask=mask, result_mask=result_mask, is_datetimelike=is_datetimelike, + **kwargs, ) - elif self.how in ["sem", "std", "var", "ohlc", "prod", "median"]: + elif self.how in ["sem", "std", "var", "ohlc", "prod"]: if self.how in ["std", "sem"]: kwargs["is_datetimelike"] = is_datetimelike func( @@ -448,7 +454,7 @@ def _call_cython_op( **kwargs, ) result = result.astype(bool, copy=False) - elif self.how in ["skew"]: + elif self.how in ["skew", "kurt"]: func( out=result, counts=counts, @@ -577,14 +583,14 @@ class BaseGrouper: def __init__( self, axis: Index, - groupings: Sequence[grouper.Grouping], + groupings: list[grouper.Grouping], sort: bool = True, dropna: bool = True, ) -> None: assert isinstance(axis, Index), axis self.axis = axis - self._groupings: list[grouper.Grouping] = list(groupings) + self._groupings = groupings self._sort = sort self.dropna = dropna @@ -592,10 +598,6 @@ def __init__( def groupings(self) -> list[grouper.Grouping]: return self._groupings - @property - def shape(self) -> Shape: - return tuple(ping.ngroups for ping in self.groupings) - def __iter__(self) -> Iterator[Hashable]: return iter(self.indices) @@ -603,9 +605,7 @@ def __iter__(self) -> Iterator[Hashable]: def nkeys(self) -> int: return len(self.groupings) - def get_iterator( - self, data: NDFrameT, axis: AxisInt = 0 - ) -> Iterator[tuple[Hashable, NDFrameT]]: + def get_iterator(self, data: NDFrameT) -> Iterator[tuple[Hashable, NDFrameT]]: """ Groupby iterator @@ -614,38 +614,31 @@ def get_iterator( Generator yielding sequence of (name, subsetted object) for each group """ - splitter = self._get_splitter(data, axis=axis) - keys = self.group_keys_seq + splitter = self._get_splitter(data) + # TODO: Would be more efficient to skip unobserved for transforms + keys = self.result_index yield from zip(keys, splitter) @final - def _get_splitter(self, data: NDFrame, axis: AxisInt = 0) -> DataSplitter: + def _get_splitter(self, data: NDFrame) -> DataSplitter: """ Returns ------- Generator yielding subsetted objects """ - ids, _, ngroups = self.group_info - return _get_splitter( + if isinstance(data, Series): + klass: type[DataSplitter] = SeriesSplitter + else: + # i.e. DataFrame + klass = FrameSplitter + + return klass( data, - ids, - ngroups, + self.ngroups, sorted_ids=self._sorted_ids, - sort_idx=self._sort_idx, - axis=axis, + sort_idx=self.result_ilocs, ) - @final - @cache_readonly - def group_keys_seq(self): - if len(self.groupings) == 1: - return self.levels[0] - else: - ids, _, ngroups = self.group_info - - # provide "flattened" iterator for multi-group setting - return get_flattened_list(ids, ngroups, self.levels, self.codes) - @cache_readonly def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: """dict {group name -> group indices}""" @@ -653,10 +646,10 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: # This shows unused categories in indices GH#38642 return self.groupings[0].indices codes_list = [ping.codes for ping in self.groupings] - keys = [ping._group_index for ping in self.groupings] - return get_indexer_dict(codes_list, keys) + return get_indexer_dict(codes_list, self.levels) @final + @cache_readonly def result_ilocs(self) -> npt.NDArray[np.intp]: """ Get the original integer locations of result_index in the input. @@ -664,18 +657,15 @@ def result_ilocs(self) -> npt.NDArray[np.intp]: # Original indices are where group_index would go via sorting. # But when dropna is true, we need to remove null values while accounting for # any gaps that then occur because of them. 
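# A standalone sketch of the null-gap bookkeeping below, with the illustrative
# ids array standing in for self.ids (-1 marks rows whose group key was NA and
# dropped) and a stable argsort standing in for get_group_index_sorter:
import numpy as np

ids_sketch = np.array([0, -1, 1, 0, -1, 1])
keep = np.where(ids_sketch >= 0)
null_gaps = np.cumsum(ids_sketch == -1)[keep]        # NA rows seen before each kept row
order = np.argsort(ids_sketch[keep], kind="stable")  # sorter over the NA-free array
result_sketch = order + null_gaps.take(order)        # -> array([0, 3, 2, 5])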
- group_index = get_group_index( - self.codes, self.shape, sort=self._sort, xnull=True - ) - group_index, _ = compress_group_index(group_index, sort=self._sort) + ids = self.ids if self.has_dropped_na: - mask = np.where(group_index >= 0) + mask = np.where(ids >= 0) # Count how many gaps are caused by previous null values for each position - null_gaps = np.cumsum(group_index == -1)[mask] - group_index = group_index[mask] + null_gaps = np.cumsum(ids == -1)[mask] + ids = ids[mask] - result = get_group_index_sorter(group_index, self.ngroups) + result = get_group_index_sorter(ids, self.ngroups) if self.has_dropped_na: # Shift by the number of prior null gaps @@ -683,14 +673,17 @@ def result_ilocs(self) -> npt.NDArray[np.intp]: return result - @final @property def codes(self) -> list[npt.NDArray[np.signedinteger]]: return [ping.codes for ping in self.groupings] @property def levels(self) -> list[Index]: - return [ping._group_index for ping in self.groupings] + if len(self.groupings) > 1: + # mypy doesn't know result_index must be a MultiIndex + return list(self.result_index.levels) # type: ignore[attr-defined] + else: + return [self.result_index] @property def names(self) -> list[Hashable]: @@ -701,7 +694,8 @@ def size(self) -> Series: """ Compute group sizes. """ - ids, _, ngroups = self.group_info + ids = self.ids + ngroups = self.ngroups out: np.ndarray | list if ngroups: out = np.bincount(ids[ids != -1], minlength=ngroups) @@ -710,26 +704,25 @@ def size(self) -> Series: return Series(out, index=self.result_index, dtype="int64", copy=False) @cache_readonly - def groups(self) -> dict[Hashable, np.ndarray]: + def groups(self) -> dict[Hashable, Index]: """dict {group name -> group labels}""" if len(self.groupings) == 1: return self.groupings[0].groups - else: - to_groupby = [] - for ping in self.groupings: - gv = ping.grouping_vector - if not isinstance(gv, BaseGrouper): - to_groupby.append(gv) - else: - to_groupby.append(gv.groupings[0].grouping_vector) - index = MultiIndex.from_arrays(to_groupby) - return self.axis.groupby(index) + result_index, ids = self.result_index_and_ids + values = result_index._values + categories = Categorical(ids, categories=range(len(result_index))) + result = { + # mypy is not aware that group has to be an integer + values[group]: self.axis.take(axis_ilocs) # type: ignore[call-overload] + for group, axis_ilocs in categories._reverse_indexer().items() + } + return result @final @cache_readonly def is_monotonic(self) -> bool: # return if my group orderings are monotonic - return Index(self.group_info[0]).is_monotonic_increasing + return Index(self.ids).is_monotonic_increasing @final @cache_readonly @@ -737,35 +730,12 @@ def has_dropped_na(self) -> bool: """ Whether grouper has null value(s) that are dropped. 
""" - return bool((self.group_info[0] < 0).any()) - - @cache_readonly - def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: - comp_ids, obs_group_ids = self._get_compressed_codes() - - ngroups = len(obs_group_ids) - comp_ids = ensure_platform_int(comp_ids) - - return comp_ids, obs_group_ids, ngroups + return bool((self.ids < 0).any()) @cache_readonly def codes_info(self) -> npt.NDArray[np.intp]: # return the codes of items in original grouped axis - ids, _, _ = self.group_info - return ids - - @final - def _get_compressed_codes( - self, - ) -> tuple[npt.NDArray[np.signedinteger], npt.NDArray[np.intp]]: - # The first returned ndarray may have any signed integer dtype - if len(self.groupings) > 1: - group_index = get_group_index(self.codes, self.shape, sort=True, xnull=True) - return compress_group_index(group_index, sort=self._sort) - # FIXME: compress_group_index's second return value is int64, not intp - - ping = self.groupings[0] - return ping.codes, np.arange(len(ping._group_index), dtype=np.intp) + return self.ids @final @cache_readonly @@ -773,37 +743,171 @@ def ngroups(self) -> int: return len(self.result_index) @property - def reconstructed_codes(self) -> list[npt.NDArray[np.intp]]: - codes = self.codes - ids, obs_ids, _ = self.group_info - return decons_obs_group_ids(ids, obs_ids, self.shape, codes, xnull=True) + def result_index(self) -> Index: + return self.result_index_and_ids[0] + + @property + def ids(self) -> npt.NDArray[np.intp]: + return self.result_index_and_ids[1] @cache_readonly - def result_index(self) -> Index: + def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]: + levels = [Index._with_infer(ping.uniques) for ping in self.groupings] + obs = [ + ping._observed or not ping._passed_categorical for ping in self.groupings + ] + sorts = [ping._sort for ping in self.groupings] + # When passed a categorical grouping, keep all categories + for k, (ping, level) in enumerate(zip(self.groupings, levels)): + if ping._passed_categorical: + levels[k] = level.set_categories(ping._orig_cats) + if len(self.groupings) == 1: - return self.groupings[0]._result_index.rename(self.names[0]) + result_index = levels[0] + result_index.name = self.names[0] + ids = ensure_platform_int(self.codes[0]) + elif all(obs): + result_index, ids = self._ob_index_and_ids( + levels, self.codes, self.names, sorts + ) + elif not any(obs): + result_index, ids = self._unob_index_and_ids(levels, self.codes, self.names) + else: + # Combine unobserved and observed parts + names = self.names + codes = [ping.codes for ping in self.groupings] + ob_indices = [idx for idx, ob in enumerate(obs) if ob] + unob_indices = [idx for idx, ob in enumerate(obs) if not ob] + ob_index, ob_ids = self._ob_index_and_ids( + levels=[levels[idx] for idx in ob_indices], + codes=[codes[idx] for idx in ob_indices], + names=[names[idx] for idx in ob_indices], + sorts=[sorts[idx] for idx in ob_indices], + ) + unob_index, unob_ids = self._unob_index_and_ids( + levels=[levels[idx] for idx in unob_indices], + codes=[codes[idx] for idx in unob_indices], + names=[names[idx] for idx in unob_indices], + ) - codes = self.reconstructed_codes - levels = [ping._result_index for ping in self.groupings] - return MultiIndex( - levels=levels, codes=codes, verify_integrity=False, names=self.names + result_index_codes = np.concatenate( + [ + np.tile(unob_index.codes, len(ob_index)), + np.repeat(ob_index.codes, len(unob_index), axis=1), + ], + axis=0, + ) + _, index = np.unique(unob_indices + ob_indices, 
return_index=True) + result_index = MultiIndex( + levels=list(unob_index.levels) + list(ob_index.levels), + codes=result_index_codes, + names=list(unob_index.names) + list(ob_index.names), + ).reorder_levels(index) + ids = len(unob_index) * ob_ids + unob_ids + + if any(sorts): + # Sort result_index and recode ids using the new order + n_levels = len(sorts) + drop_levels = [ + n_levels - idx + for idx, sort in enumerate(reversed(sorts), 1) + if not sort + ] + if len(drop_levels) > 0: + sorter = result_index._drop_level_numbers(drop_levels).argsort() + else: + sorter = result_index.argsort() + result_index = result_index.take(sorter) + _, index = np.unique(sorter, return_index=True) + ids = ensure_platform_int(ids) + ids = index.take(ids) + else: + # Recode ids and reorder result_index with observed groups up front, + # unobserved at the end + ids, uniques = compress_group_index(ids, sort=False) + ids = ensure_platform_int(ids) + taker = np.concatenate( + [uniques, np.delete(np.arange(len(result_index)), uniques)] + ) + result_index = result_index.take(taker) + + return result_index, ids + + @property + def observed_grouper(self) -> BaseGrouper: + if all(ping._observed for ping in self.groupings): + return self + + return self._observed_grouper + + @cache_readonly + def _observed_grouper(self) -> BaseGrouper: + groupings = [ping.observed_grouping for ping in self.groupings] + grouper = BaseGrouper(self.axis, groupings, sort=self._sort, dropna=self.dropna) + return grouper + + def _ob_index_and_ids( + self, + levels: list[Index], + codes: list[npt.NDArray[np.intp]], + names: list[Hashable], + sorts: list[bool], + ) -> tuple[MultiIndex, npt.NDArray[np.intp]]: + consistent_sorting = all(sorts[0] == sort for sort in sorts[1:]) + sort_in_compress = sorts[0] if consistent_sorting else False + shape = tuple(len(level) for level in levels) + group_index = get_group_index(codes, shape, sort=True, xnull=True) + ob_ids, obs_group_ids = compress_group_index(group_index, sort=sort_in_compress) + ob_ids = ensure_platform_int(ob_ids) + ob_index_codes = decons_obs_group_ids( + ob_ids, obs_group_ids, shape, codes, xnull=True ) + ob_index = MultiIndex( + levels=levels, + codes=ob_index_codes, + names=names, + verify_integrity=False, + ) + if not consistent_sorting and len(ob_index) > 0: + # Sort by the levels where the corresponding sort argument is True + n_levels = len(sorts) + drop_levels = [ + n_levels - idx + for idx, sort in enumerate(reversed(sorts), 1) + if not sort + ] + if len(drop_levels) > 0: + sorter = ob_index._drop_level_numbers(drop_levels).argsort() + else: + sorter = ob_index.argsort() + ob_index = ob_index.take(sorter) + _, index = np.unique(sorter, return_index=True) + ob_ids = np.where(ob_ids == -1, -1, index.take(ob_ids)) + ob_ids = ensure_platform_int(ob_ids) + return ob_index, ob_ids + + def _unob_index_and_ids( + self, + levels: list[Index], + codes: list[npt.NDArray[np.intp]], + names: list[Hashable], + ) -> tuple[MultiIndex, npt.NDArray[np.intp]]: + shape = tuple(len(level) for level in levels) + unob_ids = get_group_index(codes, shape, sort=True, xnull=True) + unob_index = MultiIndex.from_product(levels, names=names) + unob_ids = ensure_platform_int(unob_ids) + return unob_index, unob_ids @final - def get_group_levels(self) -> list[ArrayLike]: + def get_group_levels(self) -> Generator[Index]: # Note: only called from _insert_inaxis_grouper, which # is only called for BaseGrouper, never for BinGrouper + result_index = self.result_index if len(self.groupings) == 1: - return 
[self.groupings[0]._group_arraylike] - - name_list = [] - for ping, codes in zip(self.groupings, self.reconstructed_codes): - codes = ensure_platform_int(codes) - levels = ping._group_arraylike.take(codes) - - name_list.append(levels) - - return name_list + yield result_index + else: + for level in range(result_index.nlevels - 1, -1, -1): + yield result_index.get_level_values(level) # ------------------------------------------------------------ # Aggregation functions @@ -825,14 +929,12 @@ def _cython_operation( cy_op = WrappedCythonOp(kind=kind, how=how, has_dropped_na=self.has_dropped_na) - ids, _, _ = self.group_info - ngroups = self.ngroups return cy_op.cython_operation( values=values, axis=axis, min_count=min_count, - comp_ids=ids, - ngroups=ngroups, + comp_ids=self.ids, + ngroups=self.ngroups, **kwargs, ) @@ -873,12 +975,10 @@ def agg_series( def _aggregate_series_pure_python( self, obj: Series, func: Callable ) -> npt.NDArray[np.object_]: - _, _, ngroups = self.group_info - - result = np.empty(ngroups, dtype="O") + result = np.empty(self.ngroups, dtype="O") initialized = False - splitter = self._get_splitter(obj, axis=0) + splitter = self._get_splitter(obj) for i, group in enumerate(splitter): res = func(group) @@ -895,11 +995,11 @@ def _aggregate_series_pure_python( @final def apply_groupwise( - self, f: Callable, data: DataFrame | Series, axis: AxisInt = 0 + self, f: Callable, data: DataFrame | Series ) -> tuple[list, bool]: mutated = False - splitter = self._get_splitter(data, axis=axis) - group_keys = self.group_keys_seq + splitter = self._get_splitter(data) + group_keys = self.result_index result_values = [] # This calls DataSplitter.__iter__ @@ -916,12 +1016,13 @@ def apply_groupwise( # group might be modified group_axes = group.axes res = f(group) - if not mutated and not _is_indexed_like(res, group_axes, axis): + if not mutated and not _is_indexed_like(res, group_axes): mutated = True result_values.append(res) # getattr pattern for __name__ is needed for functools.partial objects if len(group_keys) == 0 and getattr(f, "__name__", None) in [ "skew", + "kurt", "sum", "prod", ]: @@ -935,18 +1036,14 @@ def apply_groupwise( # ------------------------------------------------------------ # Methods for sorting subsets of our GroupBy's object - @final - @cache_readonly - def _sort_idx(self) -> npt.NDArray[np.intp]: - # Counting sort indexer - ids, _, ngroups = self.group_info - return get_group_index_sorter(ids, ngroups) - @final @cache_readonly def _sorted_ids(self) -> npt.NDArray[np.intp]: - ids, _, _ = self.group_info - return ids.take(self._sort_idx) + result = self.ids.take(self.result_ilocs) + if getattr(self, "dropna", True): + # BinGrouper has no dropna + result = result[result >= 0] + return result class BinGrouper(BaseGrouper): @@ -1017,13 +1114,13 @@ def nkeys(self) -> int: @cache_readonly def codes_info(self) -> npt.NDArray[np.intp]: # return the codes of items in original grouped axis - ids, _, _ = self.group_info + ids = self.ids if self.indexer is not None: sorter = np.lexsort((ids, self.indexer)) ids = ids[sorter] return ids - def get_iterator(self, data: NDFrame, axis: AxisInt = 0): + def get_iterator(self, data: NDFrame): """ Groupby iterator @@ -1032,27 +1129,22 @@ def get_iterator(self, data: NDFrame, axis: AxisInt = 0): Generator yielding sequence of (name, subsetted object) for each group """ - if axis == 0: - slicer = lambda start, edge: data.iloc[start:edge] - else: - slicer = lambda start, edge: data.iloc[:, start:edge] + slicer = lambda start, edge: 
data.iloc[start:edge] - length = len(data.axes[axis]) - - start = 0 + start: np.int64 | int = 0 for edge, label in zip(self.bins, self.binlabels): if label is not NaT: yield label, slicer(start, edge) start = edge - if start < length: + if start < len(data): yield self.binlabels[-1], slicer(start, None) @cache_readonly def indices(self): indices = collections.defaultdict(list) - i = 0 + i: np.int64 | int = 0 for label, bin in zip(self.binlabels, self.bins): if i < bin: if label is not NaT: @@ -1061,34 +1153,26 @@ def indices(self): return indices @cache_readonly - def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: - ngroups = self.ngroups - obs_group_ids = np.arange(ngroups, dtype=np.intp) + def codes(self) -> list[npt.NDArray[np.intp]]: + return [self.ids] + + @cache_readonly + def result_index_and_ids(self): + result_index = self.binlabels + if len(self.binlabels) != 0 and isna(self.binlabels[0]): + result_index = result_index[1:] + + ngroups = len(result_index) rep = np.diff(np.r_[0, self.bins]) rep = ensure_platform_int(rep) if ngroups == len(self.bins): - comp_ids = np.repeat(np.arange(ngroups), rep) + ids = np.repeat(np.arange(ngroups), rep) else: - comp_ids = np.repeat(np.r_[-1, np.arange(ngroups)], rep) + ids = np.repeat(np.r_[-1, np.arange(ngroups)], rep) + ids = ensure_platform_int(ids) - return ( - ensure_platform_int(comp_ids), - obs_group_ids, - ngroups, - ) - - @cache_readonly - def reconstructed_codes(self) -> list[np.ndarray]: - # get unique result indices, and prepend 0 as groupby starts from the first - return [np.r_[0, np.flatnonzero(self.bins[1:] != self.bins[:-1]) + 1]] - - @cache_readonly - def result_index(self) -> Index: - if len(self.binlabels) != 0 and isna(self.binlabels[0]): - return self.binlabels[1:] - - return self.binlabels + return result_index, ids @property def levels(self) -> list[Index]: @@ -1101,21 +1185,25 @@ def names(self) -> list[Hashable]: @property def groupings(self) -> list[grouper.Grouping]: lev = self.binlabels - codes = self.group_info[0] + codes = self.ids labels = lev.take(codes) ping = grouper.Grouping( labels, labels, in_axis=False, level=None, uniques=lev._values ) return [ping] + @property + def observed_grouper(self) -> BinGrouper: + return self + -def _is_indexed_like(obj, axes, axis: AxisInt) -> bool: +def _is_indexed_like(obj, axes) -> bool: if isinstance(obj, Series): if len(axes) > 1: return False - return obj.axes[axis].equals(axes[axis]) + return obj.index.equals(axes[0]) elif isinstance(obj, DataFrame): - return obj.axes[axis].equals(axes[axis]) + return obj.index.equals(axes[0]) return False @@ -1128,39 +1216,31 @@ class DataSplitter(Generic[NDFrameT]): def __init__( self, data: NDFrameT, - labels: npt.NDArray[np.intp], ngroups: int, *, sort_idx: npt.NDArray[np.intp], sorted_ids: npt.NDArray[np.intp], - axis: AxisInt = 0, ) -> None: self.data = data - self.labels = ensure_platform_int(labels) # _should_ already be np.intp self.ngroups = ngroups self._slabels = sorted_ids self._sort_idx = sort_idx - self.axis = axis - assert isinstance(axis, int), axis - def __iter__(self) -> Iterator: - sdata = self._sorted_data - if self.ngroups == 0: # we are inside a generator, rather than raise StopIteration # we merely return signal the end return starts, ends = lib.generate_slices(self._slabels, self.ngroups) - + sdata = self._sorted_data for start, end in zip(starts, ends): yield self._chop(sdata, slice(start, end)) @cache_readonly def _sorted_data(self) -> NDFrameT: - return self.data.take(self._sort_idx, 
axis=self.axis) + return self.data.take(self._sort_idx, axis=0) def _chop(self, sdata, slice_obj: slice) -> NDFrame: raise AbstractMethodError(self) @@ -1178,30 +1258,7 @@ def _chop(self, sdata: Series, slice_obj: slice) -> Series: class FrameSplitter(DataSplitter): def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: # Fastpath equivalent to: - # if self.axis == 0: - # return sdata.iloc[slice_obj] - # else: - # return sdata.iloc[:, slice_obj] - mgr = sdata._mgr.get_slice(slice_obj, axis=1 - self.axis) + # return sdata.iloc[slice_obj] + mgr = sdata._mgr.get_slice(slice_obj, axis=1) df = sdata._constructor_from_mgr(mgr, axes=mgr.axes) return df.__finalize__(sdata, method="groupby") - - -def _get_splitter( - data: NDFrame, - labels: npt.NDArray[np.intp], - ngroups: int, - *, - sort_idx: npt.NDArray[np.intp], - sorted_ids: npt.NDArray[np.intp], - axis: AxisInt = 0, -) -> DataSplitter: - if isinstance(data, Series): - klass: type[DataSplitter] = SeriesSplitter - else: - # i.e. DataFrame - klass = FrameSplitter - - return klass( - data, labels, ngroups, sort_idx=sort_idx, sorted_ids=sorted_ids, axis=axis - ) diff --git a/pandas/core/indexers/__init__.py b/pandas/core/indexers/__init__.py index ba8a4f1d0ee7a..036b32b3feac2 100644 --- a/pandas/core/indexers/__init__.py +++ b/pandas/core/indexers/__init__.py @@ -15,17 +15,17 @@ ) __all__ = [ - "is_valid_positional_slice", + "check_array_indexer", + "check_key_length", + "check_setitem_lengths", + "disallow_ndim_indexing", + "is_empty_indexer", "is_list_like_indexer", "is_scalar_indexer", - "is_empty_indexer", - "check_setitem_lengths", - "validate_indices", - "maybe_convert_indices", + "is_valid_positional_slice", "length_of_indexer", - "disallow_ndim_indexing", + "maybe_convert_indices", "unpack_1tuple", - "check_key_length", - "check_array_indexer", "unpack_tuple_and_ellipses", + "validate_indices", ] diff --git a/pandas/core/indexers/objects.py b/pandas/core/indexers/objects.py index f2db4886a5590..6fc638e85bc5e 100644 --- a/pandas/core/indexers/objects.py +++ b/pandas/core/indexers/objects.py @@ -1,4 +1,5 @@ """Indexer objects for computing start/end window bounds for rolling operations""" + from __future__ import annotations from datetime import timedelta @@ -47,16 +48,32 @@ class BaseIndexer: """ Base class for window bounds calculations. + Parameters + ---------- + index_array : np.ndarray, default None + Array-like structure representing the indices for the data points. + If None, the default indices are assumed. This can be useful for + handling non-uniform indices in data, such as in time series + with irregular timestamps. + window_size : int, default 0 + Size of the moving window. This is the number of observations used + for calculating the statistic. The default is to consider all + observations within the window. + **kwargs + Additional keyword arguments passed to the subclass's methods. + + See Also + -------- + DataFrame.rolling : Provides rolling window calculations on dataframe. + Series.rolling : Provides rolling window calculations on series. + Examples -------- >>> from pandas.api.indexers import BaseIndexer >>> class CustomIndexer(BaseIndexer): ... def get_window_bounds(self, num_values, min_periods, center, closed, step): - ... start = np.empty(num_values, dtype=np.int64) - ... end = np.empty(num_values, dtype=np.int64) - ... for i in range(num_values): - ... start[i] = i - ... end[i] = i + self.window_size + ... start = np.arange(num_values, dtype=np.int64) + ... 
end = np.arange(num_values, dtype=np.int64) + self.window_size ... return start, end >>> df = pd.DataFrame({"values": range(5)}) >>> indexer = CustomIndexer(window_size=2) @@ -114,8 +131,8 @@ def get_window_bounds( if closed in ["left", "neither"]: end -= 1 - end = np.clip(end, 0, num_values) - start = np.clip(start, 0, num_values) + end = np.clip(end, 0, num_values) # type: ignore[assignment] + start = np.clip(start, 0, num_values) # type: ignore[assignment] return start, end @@ -150,6 +167,31 @@ class VariableOffsetWindowIndexer(BaseIndexer): """ Calculate window boundaries based on a non-fixed offset such as a BusinessDay. + Parameters + ---------- + index_array : np.ndarray, default None + Array-like structure specifying the indices for data points. + This parameter is currently not used. + + window_size : int, optional, default 0 + Specifies the number of data points in each window. + This parameter is currently not used. + + index : DatetimeIndex, optional + ``DatetimeIndex`` of the labels of each observation. + + offset : BaseOffset, optional + ``DateOffset`` representing the size of the window. + + **kwargs + Additional keyword arguments passed to the parent class ``BaseIndexer``. + + See Also + -------- + api.indexers.BaseIndexer : Base class for all indexers. + DataFrame.rolling : Rolling window calculations on DataFrames. + offsets : Module providing various time offset classes. + Examples -------- >>> from pandas.api.indexers import VariableOffsetWindowIndexer @@ -298,9 +340,29 @@ class FixedForwardWindowIndexer(BaseIndexer): """ Creates window boundaries for fixed-length windows that include the current row. + Parameters + ---------- + index_array : np.ndarray, default None + Array-like structure representing the indices for the data points. + If None, the default indices are assumed. This can be useful for + handling non-uniform indices in data, such as in time series + with irregular timestamps. + window_size : int, default 0 + Size of the moving window. This is the number of observations used + for calculating the statistic. The default is to consider all + observations within the window. + **kwargs + Additional keyword arguments passed to the subclass's methods. + + See Also + -------- + DataFrame.rolling : Provides rolling window calculations. + api.indexers.VariableWindowIndexer : Calculate window bounds based on + variable-sized windows.
+ Examples -------- - >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}) + >>> df = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]}) >>> df B 0 0.0 @@ -340,7 +402,7 @@ def get_window_bounds( start = np.arange(0, num_values, step, dtype="int64") end = start + self.window_size if self.window_size: - end = np.clip(end, 0, num_values) + end = np.clip(end, 0, num_values) # type: ignore[assignment] return start, end @@ -399,7 +461,7 @@ def get_window_bounds( start_arrays = [] end_arrays = [] window_indices_start = 0 - for key, indices in self.groupby_indices.items(): + for indices in self.groupby_indices.values(): index_array: np.ndarray | None if self.index_array is not None: @@ -416,9 +478,9 @@ def get_window_bounds( ) start = start.astype(np.int64) end = end.astype(np.int64) - assert len(start) == len( - end - ), "these should be equal in length from get_window_bounds" + assert len(start) == len(end), ( + "these should be equal in length from get_window_bounds" + ) # Cannot use groupby_indices as they might not be monotonic with the object # we're rolling over window_indices = np.arange( @@ -426,7 +488,7 @@ def get_window_bounds( ) window_indices_start += len(indices) # Extend as we'll be slicing window like [start, end) - window_indices = np.append(window_indices, [window_indices[-1] + 1]).astype( + window_indices = np.append(window_indices, [window_indices[-1] + 1]).astype( # type: ignore[assignment] np.int64, copy=False ) start_arrays.append(window_indices.take(ensure_platform_int(start))) diff --git a/pandas/core/indexers/utils.py b/pandas/core/indexers/utils.py index 55bb58f3108c3..b089be3469d87 100644 --- a/pandas/core/indexers/utils.py +++ b/pandas/core/indexers/utils.py @@ -1,6 +1,7 @@ """ Low-dependency indexing utilities. """ + from __future__ import annotations from typing import ( @@ -202,7 +203,7 @@ def validate_indices(indices: np.ndarray, n: int) -> None: Examples -------- - >>> validate_indices(np.array([1, 2]), 3) # OK + >>> validate_indices(np.array([1, 2]), 3) # OK >>> validate_indices(np.array([1, -2]), 3) Traceback (most recent call last): @@ -214,7 +215,7 @@ def validate_indices(indices: np.ndarray, n: int) -> None: ... IndexError: indices are out-of-bounds - >>> validate_indices(np.array([-1, -1]), 0) # OK + >>> validate_indices(np.array([-1, -1]), 0) # OK >>> validate_indices(np.array([0, 1]), 0) Traceback (most recent call last): @@ -502,7 +503,7 @@ def check_array_indexer(array: AnyArrayLike, indexer: Any) -> Any: For non-integer/boolean dtypes, an appropriate error is raised: - >>> indexer = np.array([0., 2.], dtype="float64") + >>> indexer = np.array([0.0, 2.0], dtype="float64") >>> pd.api.indexers.check_array_indexer(arr, indexer) Traceback (most recent call last): ... diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index 929c7f4a63f8f..c404323a1168c 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -1,10 +1,12 @@ """ datetimelike delegation """ + from __future__ import annotations from typing import ( TYPE_CHECKING, + NoReturn, cast, ) import warnings @@ -106,18 +108,9 @@ def _delegate_property_get(self, name: str): else: index = self._parent.index # return the result as a Series - result = Series(result, index=index, name=self.name).__finalize__(self._parent) - - # setting this object will show a SettingWithCopyWarning/Error - result._is_copy = ( - "modifications to a property of a datetimelike " - "object are not supported and are discarded. " - "Change values on the original." 
- ) - - return result + return Series(result, index=index, name=self.name).__finalize__(self._parent) - def _delegate_property_set(self, name: str, value, *args, **kwargs): + def _delegate_property_set(self, name: str, value, *args, **kwargs) -> NoReturn: raise ValueError( "modifications to a property of a datetimelike object are not supported. " "Change values on the original." @@ -134,20 +127,25 @@ def _delegate_method(self, name: str, *args, **kwargs): if not is_list_like(result): return result - result = Series(result, index=self._parent.index, name=self.name).__finalize__( + return Series(result, index=self._parent.index, name=self.name).__finalize__( self._parent ) - # setting this object will show a SettingWithCopyWarning/Error - result._is_copy = ( - "modifications to a method of a datetimelike " - "object are not supported and are discarded. " - "Change values on the original." - ) - - return result - +@delegate_names( + delegate=ArrowExtensionArray, + accessors=TimedeltaArray._datetimelike_ops, + typ="property", + accessor_mapping=lambda x: f"_dt_{x}", + raise_on_missing=False, +) +@delegate_names( + delegate=ArrowExtensionArray, + accessors=TimedeltaArray._datetimelike_methods, + typ="method", + accessor_mapping=lambda x: f"_dt_{x}", + raise_on_missing=False, +) @delegate_names( delegate=ArrowExtensionArray, accessors=DatetimeArray._datetimelike_ops, @@ -213,16 +211,20 @@ def _delegate_method(self, name: str, *args, **kwargs): return result - def to_pydatetime(self): - # GH#20306 + def to_pytimedelta(self): + # GH 57463 warnings.warn( - f"The behavior of {type(self).__name__}.to_pydatetime is deprecated, " + f"The behavior of {type(self).__name__}.to_pytimedelta is deprecated, " "in a future version this will return a Series containing python " - "datetime objects instead of an ndarray. To retain the old behavior, " - "call `np.array` on the result", + "datetime.timedelta objects instead of an ndarray. To retain the " + "old behavior, call `np.array` on the result", FutureWarning, stacklevel=find_stack_level(), ) + return cast(ArrowExtensionArray, self._parent.array)._dt_to_pytimedelta() + + def to_pydatetime(self) -> Series: + # GH#20306 return cast(ArrowExtensionArray, self._parent.array)._dt_to_pydatetime() def isocalendar(self) -> DataFrame: @@ -241,6 +243,26 @@ def isocalendar(self) -> DataFrame: ) return iso_calendar_df + @property + def components(self) -> DataFrame: + from pandas import DataFrame + + components_df = DataFrame( + { + col: getattr(self._parent.array, f"_dt_{col}") + for col in [ + "days", + "hours", + "minutes", + "seconds", + "milliseconds", + "microseconds", + "nanoseconds", + ] + } + ) + return components_df + @delegate_names( delegate=DatetimeArray, @@ -298,15 +320,9 @@ class DatetimeProperties(Properties): Raises TypeError if the Series does not contain datetimelike values. """ - def to_pydatetime(self) -> np.ndarray: + def to_pydatetime(self) -> Series: """ - Return the data as an array of :class:`datetime.datetime` objects. - - .. deprecated:: 2.1.0 - - The current behavior of dt.to_pydatetime is deprecated. - In a future version this will return a Series containing python - datetime objects instead of a ndarray. + Return the data as a Series of :class:`datetime.datetime` objects. Timezone information is retained if present. 
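The Arrow-backed ``components`` property added above can be exercised end to end once a duration-typed Series exists; a minimal sketch, assuming pyarrow is installed and this patch is applied (illustrative, not part of the diff):

import pandas as pd
import pyarrow as pa

# Convert a timedelta64 Series to an Arrow duration dtype, then expand each
# value into its day/hour/.../nanosecond parts via the new accessor property.
ser = pd.Series(pd.to_timedelta(["1 days 02:03:04", "5 days"])).astype(
    pd.ArrowDtype(pa.duration("ns"))
)
print(ser.dt.components)
# Expected (roughly):
#    days  hours  minutes  seconds  milliseconds  microseconds  nanoseconds
# 0     1      2        3        4             0             0            0
# 1     5      0        0        0             0             0            0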
@@ -326,41 +342,59 @@ def to_pydatetime(self) -> np.ndarray: Examples -------- - >>> s = pd.Series(pd.date_range('20180310', periods=2)) + >>> s = pd.Series(pd.date_range("20180310", periods=2)) >>> s 0 2018-03-10 1 2018-03-11 dtype: datetime64[ns] >>> s.dt.to_pydatetime() - array([datetime.datetime(2018, 3, 10, 0, 0), - datetime.datetime(2018, 3, 11, 0, 0)], dtype=object) + 0 2018-03-10 00:00:00 + 1 2018-03-11 00:00:00 + dtype: object pandas' nanosecond precision is truncated to microseconds. - >>> s = pd.Series(pd.date_range('20180310', periods=2, freq='ns')) + >>> s = pd.Series(pd.date_range("20180310", periods=2, freq="ns")) >>> s 0 2018-03-10 00:00:00.000000000 1 2018-03-10 00:00:00.000000001 dtype: datetime64[ns] >>> s.dt.to_pydatetime() - array([datetime.datetime(2018, 3, 10, 0, 0), - datetime.datetime(2018, 3, 10, 0, 0)], dtype=object) + 0 2018-03-10 00:00:00 + 1 2018-03-10 00:00:00 + dtype: object """ # GH#20306 - warnings.warn( - f"The behavior of {type(self).__name__}.to_pydatetime is deprecated, " - "in a future version this will return a Series containing python " - "datetime objects instead of an ndarray. To retain the old behavior, " - "call `np.array` on the result", - FutureWarning, - stacklevel=find_stack_level(), - ) - return self._get_values().to_pydatetime() + from pandas import Series + + return Series(self._get_values().to_pydatetime(), dtype=object) @property def freq(self): + """ + Tries to return a string representing a frequency generated by infer_freq. + + Returns None if it can't autodetect the frequency. + + See Also + -------- + Series.dt.to_period : Cast to PeriodArray/PeriodIndex at a particular + frequency. + + Examples + -------- + >>> ser = pd.Series(["2024-01-01", "2024-01-02", "2024-01-03", "2024-01-04"]) + >>> ser = pd.to_datetime(ser) + >>> ser.dt.freq + 'D' + + >>> ser = pd.Series(["2022-01-01", "2024-01-01", "2026-01-01", "2028-01-01"]) + >>> ser = pd.to_datetime(ser) + >>> ser.dt.freq + '2YS-JAN' + """ return self._get_values().inferred_freq def isocalendar(self) -> DataFrame: @@ -447,7 +481,7 @@ def to_pytimedelta(self) -> np.ndarray: Examples -------- - >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit="d")) + >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit="D")) >>> s 0 0 days 1 1 days @@ -461,20 +495,39 @@ def to_pytimedelta(self) -> np.ndarray: datetime.timedelta(days=2), datetime.timedelta(days=3), datetime.timedelta(days=4)], dtype=object) """ + # GH 57463 + warnings.warn( + f"The behavior of {type(self).__name__}.to_pytimedelta is deprecated, " + "in a future version this will return a Series containing python " + "datetime.timedelta objects instead of an ndarray. To retain the " + "old behavior, call `np.array` on the result", + FutureWarning, + stacklevel=find_stack_level(), + ) return self._get_values().to_pytimedelta() @property - def components(self): + def components(self) -> DataFrame: """ Return a Dataframe of the components of the Timedeltas. + Each row of the DataFrame corresponds to a Timedelta in the original + Series and contains the individual components (days, hours, minutes, + seconds, milliseconds, microseconds, nanoseconds) of the Timedelta. + Returns ------- DataFrame + See Also + -------- + TimedeltaIndex.components : Return a DataFrame of the individual resolution + components of the Timedeltas. + Series.dt.total_seconds : Return the total number of seconds in the duration. 
+ Examples -------- - >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit='s')) + >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit="s")) >>> s 0 0 days 00:00:00 1 0 days 00:00:01 @@ -571,6 +624,44 @@ class PeriodProperties(Properties): class CombinedDatetimelikeProperties( DatetimeProperties, TimedeltaProperties, PeriodProperties ): + """ + Accessor object for Series values' datetime-like, timedelta and period properties. + + See Also + -------- + DatetimeIndex : Index of datetime64 data. + + Examples + -------- + >>> dates = pd.Series( + ... ["2024-01-01", "2024-01-15", "2024-02-5"], dtype="datetime64[ns]" + ... ) + >>> dates.dt.day + 0 1 + 1 15 + 2 5 + dtype: int32 + >>> dates.dt.month + 0 1 + 1 1 + 2 2 + dtype: int32 + + >>> dates = pd.Series( + ... ["2024-01-01", "2024-01-15", "2024-02-5"], dtype="datetime64[ns, UTC]" + ... ) + >>> dates.dt.day + 0 1 + 1 15 + 2 5 + dtype: int32 + >>> dates.dt.month + 0 1 + 1 1 + 2 2 + dtype: int32 + """ + def __new__(cls, data: Series): # pyright: ignore[reportInconsistentConstructor] # CombinedDatetimelikeProperties isn't really instantiated. Instead # we need to choose which parent (datetime or timedelta) is @@ -592,7 +683,7 @@ def __new__(cls, data: Series): # pyright: ignore[reportInconsistentConstructor index=orig.index, ) - if isinstance(data.dtype, ArrowDtype) and data.dtype.kind == "M": + if isinstance(data.dtype, ArrowDtype) and data.dtype.kind in "Mm": return ArrowTemporalProperties(data, orig) if lib.is_np_dtype(data.dtype, "M"): return DatetimeProperties(data, orig) diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 560285bd57a22..058e584336905 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -1,6 +1,5 @@ from __future__ import annotations -import textwrap from typing import ( TYPE_CHECKING, cast, @@ -23,6 +22,7 @@ ensure_index, ensure_index_from_sequences, get_unanimous_names, + maybe_sequence_to_range, ) from pandas.core.indexes.category import CategoricalIndex from pandas.core.indexes.datetimes import DatetimeIndex @@ -34,38 +34,29 @@ if TYPE_CHECKING: from pandas._typing import Axis -_sort_msg = textwrap.dedent( - """\ -Sorting because non-concatenation axis is not aligned. A future version -of pandas will change to not sort by default. - -To accept the future behavior, pass 'sort=False'. - -To retain the current behavior and silence the warning, pass 'sort=True'. -""" -) __all__ = [ - "Index", - "MultiIndex", "CategoricalIndex", + "DatetimeIndex", + "Index", "IntervalIndex", - "RangeIndex", "InvalidIndexError", - "TimedeltaIndex", + "MultiIndex", + "NaT", "PeriodIndex", - "DatetimeIndex", + "RangeIndex", + "TimedeltaIndex", "_new_Index", - "NaT", + "all_indexes_same", + "default_index", "ensure_index", "ensure_index_from_sequences", "get_objs_combined_axis", - "union_indexes", "get_unanimous_names", - "all_indexes_same", - "default_index", + "maybe_sequence_to_range", "safe_sort_index", + "union_indexes", ] @@ -74,7 +65,6 @@ def get_objs_combined_axis( intersect: bool = False, axis: Axis = 0, sort: bool = True, - copy: bool = False, ) -> Index: """ Extract combined index: return intersection or union (depending on the @@ -92,15 +82,13 @@ def get_objs_combined_axis( The axis to extract indexes from. sort : bool, default True Whether the result index should come out sorted or not. - copy : bool, default False - If True, return a copy of the combined index. 
Returns ------- Index """ obs_idxes = [obj._get_axis(axis) for obj in objs] - return _get_combined_index(obs_idxes, intersect=intersect, sort=sort, copy=copy) + return _get_combined_index(obs_idxes, intersect=intersect, sort=sort) def _get_distinct_objs(objs: list[Index]) -> list[Index]: @@ -121,7 +109,6 @@ def _get_combined_index( indexes: list[Index], intersect: bool = False, sort: bool = False, - copy: bool = False, ) -> Index: """ Return the union or intersection of indexes. @@ -135,8 +122,6 @@ def _get_combined_index( calculate the union. sort : bool, default False Whether the result index should come out sorted or not. - copy : bool, default False - If True, return a copy of the combined index. Returns ------- @@ -145,7 +130,7 @@ def _get_combined_index( # TODO: handle index names! indexes = _get_distinct_objs(indexes) if len(indexes) == 0: - index = Index([]) + index: Index = default_index(0) elif len(indexes) == 1: index = indexes[0] elif intersect: @@ -158,10 +143,6 @@ def _get_combined_index( if sort: index = safe_sort_index(index) - # GH 29879 - if copy: - index = index.copy() - return index @@ -228,76 +209,28 @@ def union_indexes(indexes, sort: bool | None = True) -> Index: indexes, kind = _sanitize_and_check(indexes) - def _unique_indices(inds, dtype) -> Index: - """ - Concatenate indices and remove duplicates. - - Parameters - ---------- - inds : list of Index or list objects - dtype : dtype to set for the resulting Index - - Returns - ------- - Index - """ - if all(isinstance(ind, Index) for ind in inds): - inds = [ind.astype(dtype, copy=False) for ind in inds] - result = inds[0].unique() - other = inds[1].append(inds[2:]) - diff = other[result.get_indexer_for(other) == -1] - if len(diff): - result = result.append(diff.unique()) - if sort: - result = result.sort_values() - return result - - def conv(i): - if isinstance(i, Index): - i = i.tolist() - return i - - return Index( - lib.fast_unique_multiple_list([conv(i) for i in inds], sort=sort), - dtype=dtype, - ) - - def _find_common_index_dtype(inds): - """ - Finds a common type for the indexes to pass through to resulting index. - - Parameters - ---------- - inds: list of Index or list objects - - Returns - ------- - The common type or None if no indexes were given - """ - dtypes = [idx.dtype for idx in indexes if isinstance(idx, Index)] - if dtypes: - dtype = find_common_type(dtypes) - else: - dtype = None - - return dtype - if kind == "special": result = indexes[0] - dtis = [x for x in indexes if isinstance(x, DatetimeIndex)] - dti_tzs = [x for x in dtis if x.tz is not None] - if len(dti_tzs) not in [0, len(dtis)]: + num_dtis = 0 + num_dti_tzs = 0 + for idx in indexes: + if isinstance(idx, DatetimeIndex): + num_dtis += 1 + if idx.tz is not None: + num_dti_tzs += 1 + if num_dti_tzs not in [0, num_dtis]: # TODO: this behavior is not tested (so may not be desired), # but is kept in order to keep behavior the same when # deprecating union_many # test_frame_from_dict_with_mixed_indexes raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") - if len(dtis) == len(indexes): + if num_dtis == len(indexes): + sort = True result = indexes[0] - elif len(dtis) > 1: + elif num_dtis > 1: # If we have mixed timezones, our casting behavior may depend on # the order of indexes, which we don't want. 
sort = False @@ -312,18 +245,36 @@ def _find_common_index_dtype(inds): return result elif kind == "array": - dtype = _find_common_index_dtype(indexes) - index = indexes[0] - if not all(index.equals(other) for other in indexes[1:]): - index = _unique_indices(indexes, dtype) + if not all_indexes_same(indexes): + dtype = find_common_type([idx.dtype for idx in indexes]) + inds = [ind.astype(dtype, copy=False) for ind in indexes] + index = inds[0].unique() + other = inds[1].append(inds[2:]) + diff = other[index.get_indexer_for(other) == -1] + if len(diff): + index = index.append(diff.unique()) + if sort: + index = index.sort_values() + else: + index = indexes[0] name = get_unanimous_names(*indexes)[0] if name != index.name: index = index.rename(name) return index - else: # kind='list' - dtype = _find_common_index_dtype(indexes) - return _unique_indices(indexes, dtype) + elif kind == "list": + dtypes = [idx.dtype for idx in indexes if isinstance(idx, Index)] + if dtypes: + dtype = find_common_type(dtypes) + else: + dtype = None + all_lists = (idx.tolist() if isinstance(idx, Index) else idx for idx in indexes) + return Index( + lib.fast_unique_multiple_list_gen(all_lists, sort=bool(sort)), + dtype=dtype, + ) + else: + raise ValueError(f"{kind=} must be 'special', 'array' or 'list'.") def _sanitize_and_check(indexes): @@ -347,14 +298,14 @@ def _sanitize_and_check(indexes): sanitized_indexes : list of Index or list objects type : {'list', 'array', 'special'} """ - kinds = list({type(index) for index in indexes}) + kinds = {type(index) for index in indexes} if list in kinds: if len(kinds) > 1: indexes = [ Index(list(x)) if not isinstance(x, Index) else x for x in indexes ] - kinds.remove(list) + kinds -= {list} else: return indexes, "list" diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 88a08dd55f739..9df0787a4560d 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -8,7 +8,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, ClassVar, Literal, NoReturn, @@ -22,8 +21,7 @@ from pandas._config import ( get_option, - using_copy_on_write, - using_pyarrow_string_dtype, + using_string_dtype, ) from pandas._libs import ( @@ -50,17 +48,21 @@ ArrayLike, Axes, Axis, + AxisInt, DropKeep, + Dtype, DtypeObj, F, IgnoreRaise, IndexLabel, + IndexT, JoinHow, Level, NaPosition, ReindexMethod, Self, Shape, + SliceType, npt, ) from pandas.compat.numpy import function as nv @@ -71,8 +73,8 @@ from pandas.util._decorators import ( Appender, cache_readonly, - deprecate_nonkeyword_arguments, doc, + set_module, ) from pandas.util._exceptions import ( find_stack_level, @@ -146,7 +148,7 @@ nanops, ops, ) -from pandas.core.accessor import CachedAccessor +from pandas.core.accessor import Accessor import pandas.core.algorithms as algos from pandas.core.array_algos.putmask import ( setitem_datetimelike_compat, @@ -181,7 +183,6 @@ from pandas.core.indexes.frozen import FrozenList from pandas.core.missing import clean_reindex_fill_method from pandas.core.ops import get_op_result_name -from pandas.core.ops.invalid import make_invalid_op from pandas.core.sorting import ( ensure_key_mapped, get_group_index_sorter, @@ -198,6 +199,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Iterable, Sequence, @@ -317,6 +319,7 @@ def _new_Index(cls, d): return cls.__new__(cls, **d) +@set_module("pandas") class Index(IndexOpsMixin, PandasObject): """ Immutable sequence used for indexing and alignment. 
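The reworked union paths above are easiest to sanity-check through ``concat``, which routes unequal row labels into ``union_indexes``; a minimal sketch (illustrative):

import pandas as pd

# Unequal object indexes hit the "array" branch: cast to a common dtype,
# deduplicate, and sort when `sort` is requested.
a = pd.Series([1, 2], index=["b", "a"])
b = pd.Series([3, 4], index=["a", "c"])
print(pd.concat([a, b], axis=1, sort=True).index)
# Index(['a', 'b', 'c'], dtype='object')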
@@ -331,6 +334,8 @@ class Index(IndexOpsMixin, PandasObject): Parameters ---------- data : array-like (1-dimensional) + An array-like structure containing the data for the index. This could be a + Python list, a NumPy array, or a pandas Series. dtype : str, numpy.dtype, or ExtensionDtype, optional Data type for the output Index. If not specified, this will be inferred from `data`. @@ -362,7 +367,7 @@ class Index(IndexOpsMixin, PandasObject): >>> pd.Index([1, 2, 3]) Index([1, 2, 3], dtype='int64') - >>> pd.Index(list('abc')) + >>> pd.Index(list("abc")) Index(['a', 'b', 'c'], dtype='object') >>> pd.Index([1, 2, 3], dtype="uint8") @@ -465,7 +470,7 @@ def _engine_type( _accessors = {"str"} - str = CachedAccessor("str", StringMethods) + str = Accessor("str", StringMethods) _references = None @@ -493,8 +498,6 @@ def __new__( if not copy and isinstance(data, (ABCSeries, Index)): refs = data._references - is_pandas_object = isinstance(data, (ABCSeries, Index, ExtensionArray)) - # range if isinstance(data, (range, RangeIndex)): result = RangeIndex(start=data, copy=copy, name=name) @@ -506,12 +509,13 @@ def __new__( elif is_ea_or_datetimelike_dtype(dtype): # non-EA dtype indexes have special casting logic, so we punt here - pass + if isinstance(data, (set, frozenset)): + data = list(data) elif is_ea_or_datetimelike_dtype(data_dtype): pass - elif isinstance(data, (np.ndarray, Index, ABCSeries)): + elif isinstance(data, (np.ndarray, ABCMultiIndex)): if isinstance(data, ABCMultiIndex): data = data._values @@ -521,7 +525,9 @@ def __new__( # they are actually ints, e.g. '0' and 0.0 # should not be coerced data = com.asarray_tuplesafe(data, dtype=_dtype_obj) - + elif isinstance(data, (ABCSeries, Index)): + # GH 56244: Avoid potential inference on object types + pass elif is_scalar(data): raise cls._raise_scalar_data_error(data) elif hasattr(data, "__array__"): @@ -574,19 +580,7 @@ def __new__( klass = cls._dtype_to_subclass(arr.dtype) arr = klass._ensure_array(arr, arr.dtype, copy=False) - result = klass._simple_new(arr, name, refs=refs) - if dtype is None and is_pandas_object and data_dtype == np.object_: - if result.dtype != data_dtype: - warnings.warn( - "Dtype inference on a pandas object " - "(Series, Index, ExtensionArray) is deprecated. The Index " - "constructor will keep the original dtype in the future. 
" - "Call `infer_objects` on the result to get the old " - "behavior.", - FutureWarning, - stacklevel=2, - ) - return result # type: ignore[return-value] + return klass._simple_new(arr, name, refs=refs) @classmethod def _ensure_array(cls, data, dtype, copy: bool): @@ -725,7 +719,7 @@ def _format_duplicate_message(self) -> DataFrame: Examples -------- - >>> idx = pd.Index(['a', 'b', 'a']) + >>> idx = pd.Index(["a", "b", "a"]) >>> idx._format_duplicate_message() positions label @@ -812,7 +806,7 @@ def is_(self, other) -> bool: Examples -------- - >>> idx1 = pd.Index(['1', '2', '3']) + >>> idx1 = pd.Index(["1", "2", "3"]) >>> idx1.is_(idx1.view()) True @@ -837,7 +831,8 @@ def _reset_identity(self) -> None: @final def _cleanup(self) -> None: - self._engine.clear_mapping() + if "_engine" in self._cache: + self._engine.clear_mapping() @cache_readonly def _engine( @@ -883,6 +878,8 @@ def _engine( # error: Item "ExtensionArray" of "Union[ExtensionArray, # ndarray[Any, Any]]" has no attribute "_ndarray" [union-attr] target_values = self._data._ndarray # type: ignore[union-attr] + elif is_string_dtype(self.dtype) and not is_object_dtype(self.dtype): + return libindex.StringObjectEngine(target_values, self.dtype.na_value) # type: ignore[union-attr] # error: Argument 1 to "ExtensionEngine" has incompatible type # "ndarray[Any, Any]"; expected "ExtensionArray" @@ -912,11 +909,15 @@ def __len__(self) -> int: """ return len(self._data) - def __array__(self, dtype=None) -> np.ndarray: + def __array__(self, dtype=None, copy=None) -> np.ndarray: """ The array interface, return my values. """ - return np.asarray(self._data, dtype=dtype) + if copy is None: + # Note, that the if branch exists for NumPy 1.x support + return np.asarray(self._data, dtype=dtype) + + return np.array(self._data, dtype=dtype, copy=copy) def __array_ufunc__(self, ufunc: np.ufunc, method: str_t, *inputs, **kwargs): if any(isinstance(other, (ABCSeries, ABCDataFrame)) for other in inputs): @@ -949,6 +950,9 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str_t, *inputs, **kwargs): elif method == "reduce": result = lib.item_from_zerodim(result) return result + elif is_scalar(result): + # e.g. matmul + return result if result.dtype == np.float16: result = result.astype(np.float32) @@ -956,7 +960,7 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str_t, *inputs, **kwargs): return self.__array_wrap__(result) @final - def __array_wrap__(self, result, context=None): + def __array_wrap__(self, result, context=None, return_scalar=False): """ Gets called after a ufunc and other functions e.g. np.split. """ @@ -976,6 +980,10 @@ def dtype(self) -> DtypeObj: """ Return the dtype object of the underlying data. + See Also + -------- + Index.inferred_type: Return a string of the type inferred from the values. + Examples -------- >>> idx = pd.Index([1, 2, 3]) @@ -991,9 +999,16 @@ def ravel(self, order: str_t = "C") -> Self: """ Return a view on self. + Parameters + ---------- + order : {'K', 'A', 'C', 'F'}, default 'C' + Specify the memory layout of the view. This parameter is not + implemented currently. + Returns ------- Index + A view on self. See Also -------- @@ -1001,16 +1016,69 @@ def ravel(self, order: str_t = "C") -> Self: Examples -------- - >>> s = pd.Series([1, 2, 3], index=['a', 'b', 'c']) + >>> s = pd.Series([1, 2, 3], index=["a", "b", "c"]) >>> s.index.ravel() Index(['a', 'b', 'c'], dtype='object') """ return self[:] def view(self, cls=None): + """ + Return a view of the Index with the specified dtype or a new Index instance. 
+ + This method returns a view of the calling Index object if no arguments are + provided. If a dtype is specified through the `cls` argument, it attempts + to return a view of the Index with the specified dtype. Note that viewing + the Index as a different dtype reinterprets the underlying data, which can + lead to unexpected results for non-numeric or incompatible dtype conversions. + + Parameters + ---------- + cls : data-type or ndarray sub-class, optional + Data-type descriptor of the returned view, e.g., float32 or int16. + Omitting it results in the view having the same data-type as `self`. + This argument can also be specified as an ndarray sub-class, + e.g., np.int64 or np.float32 which then specifies the type of + the returned object. + + Returns + ------- + Index or ndarray + A view of the Index. If `cls` is None, the returned object is an Index + view with the same dtype as the calling object. If a numeric `cls` is + specified an ndarray view with the new dtype is returned. + + Raises + ------ + ValueError + If attempting to change to a dtype in a way that is not compatible with + the original dtype's memory layout, for example, viewing an 'int64' Index + as 'str'. + + See Also + -------- + Index.copy : Returns a copy of the Index. + numpy.ndarray.view : Returns a new view of array with the same data. + + Examples + -------- + >>> idx = pd.Index([-1, 0, 1]) + >>> idx.view() + Index([-1, 0, 1], dtype='int64') + + >>> idx.view(np.uint64) + array([18446744073709551615, 0, 1], + dtype=uint64) + + Viewing as 'int32' or 'float32' reinterprets the memory, which may lead to + unexpected behavior: + + >>> idx.view("float32") + array([ nan, nan, 0.e+00, 0.e+00, 1.e-45, 0.e+00], dtype=float32) + """ # we need to see if we are subclassing an # index type here - if cls is not None and not hasattr(cls, "_typ"): + if cls is not None: dtype = cls if isinstance(cls, str): dtype = pandas_dtype(cls) @@ -1027,22 +1095,12 @@ def view(self, cls=None): result = self._data.view(cls) else: - if cls is not None: - warnings.warn( - # GH#55709 - f"Passing a type in {type(self).__name__}.view is deprecated " - "and will raise in a future version. " - "Call view without any argument to retain the old behavior.", - FutureWarning, - stacklevel=find_stack_level(), - ) - result = self._view() if isinstance(result, Index): result._id = self._id return result - def astype(self, dtype, copy: bool = True): + def astype(self, dtype: Dtype, copy: bool = True): """ Create an Index with values cast to dtypes. @@ -1066,12 +1124,18 @@ def astype(self, dtype, copy: bool = True): Index Index with values cast to specified dtype. + See Also + -------- + Index.dtype: Return the dtype object of the underlying data. + Index.dtypes: Return the dtype object of the underlying data. + Index.convert_dtypes: Convert columns to the best possible dtypes. + Examples -------- >>> idx = pd.Index([1, 2, 3]) >>> idx Index([1, 2, 3], dtype='int64') - >>> idx.astype('float') + >>> idx.astype("float") Index([1.0, 2.0, 3.0], dtype='float64') """ if dtype is not None: @@ -1107,9 +1171,7 @@ def astype(self, dtype, copy: bool = True): result._references.add_index_reference(result) return result - _index_shared_docs[ - "take" - ] = """ + _index_shared_docs["take"] = """ Return a new %(klass)s of the values selected by the indices. For internal compatibility with numpy arrays. @@ -1121,9 +1183,21 @@ def astype(self, dtype, copy: bool = True): axis : int, optional The axis over which to select values, always 0. 
allow_fill : bool, default True + How to handle negative values in `indices`. + + * False: negative values in `indices` indicate positional indices + from the right (the default). This is similar to + :func:`numpy.take`. + + * True: negative values in `indices` indicate + missing values. These values are set to `fill_value`. Any other + negative values raise a ``ValueError``. + fill_value : scalar, default None If allow_fill=True and fill_value is not None, indices specified by -1 are regarded as NA. If Index doesn't hold NA, raise ValueError. + **kwargs + Required for compatibility with numpy. Returns ------- @@ -1159,6 +1233,9 @@ def take( indices = ensure_platform_int(indices) allow_fill = self._maybe_disallow_fill(allow_fill, fill_value, indices) + if indices.ndim == 1 and lib.is_range_indexer(indices, len(self)): + return self.copy() + # Note: we discard fill_value and use self._na_value, only relevant # in the case where allow_fill is True and fill_value is not None values = self._values @@ -1196,9 +1273,7 @@ def _maybe_disallow_fill(self, allow_fill: bool, fill_value, indices) -> bool: allow_fill = False return allow_fill - _index_shared_docs[ - "repeat" - ] = """ + _index_shared_docs["repeat"] = """ Repeat elements of a %(klass)s. Returns a new %(klass)s where each element of the current %(klass)s @@ -1262,12 +1337,19 @@ def copy( name : Label, optional Set name for new object. deep : bool, default False + If True, attempts to make a deep copy of the Index. + Else makes a shallow copy. Returns ------- Index Index refer to new object which is a copy of this object. + See Also + -------- + Index.delete: Make new Index with passed location(-s) deleted. + Index.drop: Make new Index with passed list of labels deleted. + Notes ----- In most cases, there should be no functional difference from using @@ -1275,7 +1357,7 @@ def copy( Examples -------- - >>> idx = pd.Index(['a', 'b', 'c']) + >>> idx = pd.Index(["a", "b", "c"]) >>> new_idx = idx.copy() >>> idx is new_idx False @@ -1372,16 +1454,19 @@ def _format_attrs(self) -> list[tuple[str_t, str_t | int | bool | None]]: return attrs @final - def _get_level_names(self) -> Hashable | Sequence[Hashable]: + def _get_level_names(self) -> range | Sequence[Hashable]: """ Return a name or list of names with None replaced by the level number. """ if self._is_multi: - return [ - level if name is None else name for level, name in enumerate(self.names) - ] + return maybe_sequence_to_range( + [ + level if name is None else name + for level, name in enumerate(self.names) + ] + ) else: - return 0 if self.name is None else self.name + return range(1) if self.name is None else [self.name] @final def _mpl_repr(self) -> np.ndarray: @@ -1390,36 +1475,6 @@ def _mpl_repr(self) -> np.ndarray: return cast(np.ndarray, self.values) return self.astype(object, copy=False)._values - def format( - self, - name: bool = False, - formatter: Callable | None = None, - na_rep: str_t = "NaN", - ) -> list[str_t]: - """ - Render a string representation of the Index. - """ - warnings.warn( - # GH#55413 - f"{type(self).__name__}.format is deprecated and will be removed " - "in a future version. 
Convert using index.astype(str) or " - "index.map(formatter) instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - header = [] - if name: - header.append( - pprint_thing(self.name, escape_chars=("\t", "\r", "\n")) - if self.name is not None - else "" - ) - - if formatter is not None: - return header + list(self.map(formatter)) - - return self._format_with_header(header=header, na_rep=na_rep) - _default_na_rep = "NaN" @final @@ -1567,7 +1622,7 @@ def to_series(self, index=None, name: Hashable | None = None) -> Series: Examples -------- - >>> idx = pd.Index(['Ant', 'Bear', 'Cow'], name='animal') + >>> idx = pd.Index(["Ant", "Bear", "Cow"], name="animal") By default, the original index and original name is reused. @@ -1588,7 +1643,7 @@ def to_series(self, index=None, name: Hashable | None = None) -> Series: To override the name of the resulting column, specify ``name``: - >>> idx.to_series(name='zoo') + >>> idx.to_series(name="zoo") animal Ant Ant Bear Bear @@ -1631,7 +1686,7 @@ def to_frame( Examples -------- - >>> idx = pd.Index(['Ant', 'Bear', 'Cow'], name='animal') + >>> idx = pd.Index(["Ant", "Bear", "Cow"], name="animal") >>> idx.to_frame() animal animal @@ -1649,7 +1704,7 @@ def to_frame( To override the name of the resulting column, specify `name`: - >>> idx.to_frame(index=False, name='zoo') + >>> idx.to_frame(index=False, name="zoo") zoo 0 Ant 1 Bear @@ -1658,8 +1713,11 @@ def to_frame( from pandas import DataFrame if name is lib.no_default: - name = self._get_level_names() - result = DataFrame({name: self}, copy=not using_copy_on_write()) + result_name = self._get_level_names() + else: + result_name = Index([name]) # type: ignore[assignment] + result = DataFrame(self, copy=False) + result.columns = result_name if index: result.index = self @@ -1673,9 +1731,20 @@ def name(self) -> Hashable: """ Return Index or MultiIndex name. + Returns + ------- + label (hashable object) + The name of the Index. + + See Also + -------- + Index.set_names: Able to set new names partially and by level. + Index.rename: Able to set new names partially and by level. + Series.name: Corresponding Series property. + Examples -------- - >>> idx = pd.Index([1, 2, 3], name='x') + >>> idx = pd.Index([1, 2, 3], name="x") >>> idx Index([1, 2, 3], dtype='int64', name='x') >>> idx.name @@ -1763,6 +1832,38 @@ def _get_default_index_names( return names def _get_names(self) -> FrozenList: + """ + Get names on index. + + This method returns a FrozenList containing the names of the object. + It's primarily intended for internal use. + + Returns + ------- + FrozenList + A FrozenList containing the object's names, contains None if the object + does not have a name. + + See Also + -------- + Index.name : Index name as a string, or None for MultiIndex. + + Examples + -------- + >>> idx = pd.Index([1, 2, 3], name="x") + >>> idx.names + FrozenList(['x']) + + >>> idx = pd.Index([1, 2, 3], name=("x", "y")) + >>> idx.names + FrozenList([('x', 'y')]) + + If the index does not have a name set: + + >>> idx = pd.Index([1, 2, 3]) + >>> idx.names + FrozenList([None]) + """ return FrozenList((self.name,)) def _set_names(self, values, *, level=None) -> None: @@ -1795,16 +1896,13 @@ def _set_names(self, values, *, level=None) -> None: names = property(fset=_set_names, fget=_get_names) @overload - def set_names(self, names, *, level=..., inplace: Literal[False] = ...) -> Self: - ... + def set_names(self, names, *, level=..., inplace: Literal[False] = ...) -> Self: ... 
@overload - def set_names(self, names, *, level=..., inplace: Literal[True]) -> None: - ... + def set_names(self, names, *, level=..., inplace: Literal[True]) -> None: ... @overload - def set_names(self, names, *, level=..., inplace: bool = ...) -> Self | None: - ... + def set_names(self, names, *, level=..., inplace: bool = ...) -> Self | None: ... def set_names(self, names, *, level=None, inplace: bool = False) -> Self | None: """ @@ -1844,19 +1942,18 @@ def set_names(self, names, *, level=None, inplace: bool = False) -> Self | None: >>> idx = pd.Index([1, 2, 3, 4]) >>> idx Index([1, 2, 3, 4], dtype='int64') - >>> idx.set_names('quarter') + >>> idx.set_names("quarter") Index([1, 2, 3, 4], dtype='int64', name='quarter') - >>> idx = pd.MultiIndex.from_product([['python', 'cobra'], - ... [2018, 2019]]) + >>> idx = pd.MultiIndex.from_product([["python", "cobra"], [2018, 2019]]) >>> idx MultiIndex([('python', 2018), ('python', 2019), ( 'cobra', 2018), ( 'cobra', 2019)], ) - >>> idx = idx.set_names(['kind', 'year']) - >>> idx.set_names('species', level=0) + >>> idx = idx.set_names(["kind", "year"]) + >>> idx.set_names("species", level=0) MultiIndex([('python', 2018), ('python', 2019), ( 'cobra', 2018), @@ -1865,7 +1962,7 @@ def set_names(self, names, *, level=None, inplace: bool = False) -> Self | None: When renaming levels with a dict, levels can not be passed. - >>> idx.set_names({'kind': 'snake'}) + >>> idx.set_names({"kind": "snake"}) MultiIndex([('python', 2018), ('python', 2019), ( 'cobra', 2018), @@ -1912,17 +2009,12 @@ def set_names(self, names, *, level=None, inplace: bool = False) -> Self | None: return None @overload - def rename(self, name, *, inplace: Literal[False] = ...) -> Self: - ... + def rename(self, name, *, inplace: Literal[False] = ...) -> Self: ... @overload - def rename(self, name, *, inplace: Literal[True]) -> None: - ... + def rename(self, name, *, inplace: Literal[True]) -> None: ... - @deprecate_nonkeyword_arguments( - version="3.0", allowed_args=["self", "name"], name="rename" - ) - def rename(self, name, inplace: bool = False) -> Self | None: + def rename(self, name, *, inplace: bool = False) -> Self | None: """ Alter Index or MultiIndex name. @@ -1948,26 +2040,26 @@ def rename(self, name, inplace: bool = False) -> Self | None: Examples -------- - >>> idx = pd.Index(['A', 'C', 'A', 'B'], name='score') - >>> idx.rename('grade') + >>> idx = pd.Index(["A", "C", "A", "B"], name="score") + >>> idx.rename("grade") Index(['A', 'C', 'A', 'B'], dtype='object', name='grade') - >>> idx = pd.MultiIndex.from_product([['python', 'cobra'], - ... [2018, 2019]], - ... names=['kind', 'year']) + >>> idx = pd.MultiIndex.from_product( + ... [["python", "cobra"], [2018, 2019]], names=["kind", "year"] + ... ) >>> idx MultiIndex([('python', 2018), ('python', 2019), ( 'cobra', 2018), ( 'cobra', 2019)], names=['kind', 'year']) - >>> idx.rename(['species', 'year']) + >>> idx.rename(["species", "year"]) MultiIndex([('python', 2018), ('python', 2019), ( 'cobra', 2018), ( 'cobra', 2019)], names=['species', 'year']) - >>> idx.rename('species') + >>> idx.rename("species") Traceback (most recent call last): TypeError: Must pass list-like as `names`. """ @@ -2023,7 +2115,7 @@ def sortlevel( ascending: bool | list[bool] = True, sort_remaining=None, na_position: NaPosition = "first", - ): + ) -> tuple[Self, np.ndarray]: """ For internal compatibility with the Index API. 
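With ``inplace`` now keyword-only in ``rename`` above, positional use raises a ``TypeError``; a quick sketch of the calling convention (illustrative):

import pandas as pd

idx = pd.Index(["A", "C", "A", "B"], name="score")
print(idx.rename("grade").name)    # 'grade'; returns a new Index
idx.rename("grade", inplace=True)  # mutates in place and returns None
print(idx.name)                    # 'grade'
# idx.rename("grade", True)        # TypeError: inplace must be passed by keyword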
@@ -2090,7 +2182,7 @@ def _get_level_values(self, level) -> Index: Examples -------- - >>> idx = pd.Index(list('abc')) + >>> idx = pd.Index(list("abc")) >>> idx Index(['a', 'b', 'c'], dtype='object') @@ -2121,11 +2213,18 @@ def droplevel(self, level: IndexLabel = 0): Returns ------- Index or MultiIndex + Returns an Index or MultiIndex object, depending on the resulting index + after removing the requested level(s). + + See Also + -------- + Index.dropna : Return Index without NA/NaN values. Examples -------- >>> mi = pd.MultiIndex.from_arrays( - ... [[1, 2], [3, 4], [5, 6]], names=['x', 'y', 'z']) + ... [[1, 2], [3, 4], [5, 6]], names=["x", "y", "z"] + ... ) >>> mi MultiIndex([(1, 3, 5), (2, 4, 6)], @@ -2141,18 +2240,18 @@ def droplevel(self, level: IndexLabel = 0): (2, 4)], names=['x', 'y']) - >>> mi.droplevel('z') + >>> mi.droplevel("z") MultiIndex([(1, 3), (2, 4)], names=['x', 'y']) - >>> mi.droplevel(['x', 'y']) + >>> mi.droplevel(["x", "y"]) Index([5, 6], dtype='int64', name='z') """ if not isinstance(level, (tuple, list)): level = [level] - levnums = sorted(self._get_level_number(lev) for lev in level)[::-1] + levnums = sorted((self._get_level_number(lev) for lev in level), reverse=True) return self._drop_level_numbers(levnums) @@ -2333,13 +2432,13 @@ def is_unique(self) -> bool: >>> idx.is_unique True - >>> idx = pd.Index(["Watermelon", "Orange", "Apple", - ... "Watermelon"]).astype("category") + >>> idx = pd.Index(["Watermelon", "Orange", "Apple", "Watermelon"]).astype( + ... "category" + ... ) >>> idx.is_unique False - >>> idx = pd.Index(["Orange", "Apple", - ... "Watermelon"]).astype("category") + >>> idx = pd.Index(["Orange", "Apple", "Watermelon"]).astype("category") >>> idx.is_unique True """ @@ -2370,382 +2469,27 @@ def has_duplicates(self) -> bool: >>> idx.has_duplicates False - >>> idx = pd.Index(["Watermelon", "Orange", "Apple", - ... "Watermelon"]).astype("category") - >>> idx.has_duplicates - True - - >>> idx = pd.Index(["Orange", "Apple", - ... "Watermelon"]).astype("category") - >>> idx.has_duplicates - False - """ - return not self.is_unique - - @final - def is_boolean(self) -> bool: - """ - Check if the Index only consists of booleans. - - .. deprecated:: 2.0.0 - Use `pandas.api.types.is_bool_dtype` instead. - - Returns - ------- - bool - Whether or not the Index only consists of booleans. - - See Also - -------- - is_integer : Check if the Index only consists of integers (deprecated). - is_floating : Check if the Index is a floating type (deprecated). - is_numeric : Check if the Index only consists of numeric data (deprecated). - is_object : Check if the Index is of the object dtype (deprecated). - is_categorical : Check if the Index holds categorical data. - is_interval : Check if the Index holds Interval objects (deprecated). - - Examples - -------- - >>> idx = pd.Index([True, False, True]) - >>> idx.is_boolean() # doctest: +SKIP - True - - >>> idx = pd.Index(["True", "False", "True"]) - >>> idx.is_boolean() # doctest: +SKIP - False - - >>> idx = pd.Index([True, False, "True"]) - >>> idx.is_boolean() # doctest: +SKIP - False - """ - warnings.warn( - f"{type(self).__name__}.is_boolean is deprecated. " - "Use pandas.api.types.is_bool_type instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - return self.inferred_type in ["boolean"] - - @final - def is_integer(self) -> bool: - """ - Check if the Index only consists of integers. - - .. deprecated:: 2.0.0 - Use `pandas.api.types.is_integer_dtype` instead. 
- - Returns - ------- - bool - Whether or not the Index only consists of integers. - - See Also - -------- - is_boolean : Check if the Index only consists of booleans (deprecated). - is_floating : Check if the Index is a floating type (deprecated). - is_numeric : Check if the Index only consists of numeric data (deprecated). - is_object : Check if the Index is of the object dtype. (deprecated). - is_categorical : Check if the Index holds categorical data (deprecated). - is_interval : Check if the Index holds Interval objects (deprecated). - - Examples - -------- - >>> idx = pd.Index([1, 2, 3, 4]) - >>> idx.is_integer() # doctest: +SKIP - True - - >>> idx = pd.Index([1.0, 2.0, 3.0, 4.0]) - >>> idx.is_integer() # doctest: +SKIP - False - - >>> idx = pd.Index(["Apple", "Mango", "Watermelon"]) - >>> idx.is_integer() # doctest: +SKIP - False - """ - warnings.warn( - f"{type(self).__name__}.is_integer is deprecated. " - "Use pandas.api.types.is_integer_dtype instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - return self.inferred_type in ["integer"] - - @final - def is_floating(self) -> bool: - """ - Check if the Index is a floating type. - - .. deprecated:: 2.0.0 - Use `pandas.api.types.is_float_dtype` instead - - The Index may consist of only floats, NaNs, or a mix of floats, - integers, or NaNs. - - Returns - ------- - bool - Whether or not the Index only consists of only consists of floats, NaNs, or - a mix of floats, integers, or NaNs. - - See Also - -------- - is_boolean : Check if the Index only consists of booleans (deprecated). - is_integer : Check if the Index only consists of integers (deprecated). - is_numeric : Check if the Index only consists of numeric data (deprecated). - is_object : Check if the Index is of the object dtype. (deprecated). - is_categorical : Check if the Index holds categorical data (deprecated). - is_interval : Check if the Index holds Interval objects (deprecated). - - Examples - -------- - >>> idx = pd.Index([1.0, 2.0, 3.0, 4.0]) - >>> idx.is_floating() # doctest: +SKIP - True - - >>> idx = pd.Index([1.0, 2.0, np.nan, 4.0]) - >>> idx.is_floating() # doctest: +SKIP - True - - >>> idx = pd.Index([1, 2, 3, 4, np.nan]) - >>> idx.is_floating() # doctest: +SKIP - True - - >>> idx = pd.Index([1, 2, 3, 4]) - >>> idx.is_floating() # doctest: +SKIP - False - """ - warnings.warn( - f"{type(self).__name__}.is_floating is deprecated. " - "Use pandas.api.types.is_float_dtype instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - return self.inferred_type in ["floating", "mixed-integer-float", "integer-na"] - - @final - def is_numeric(self) -> bool: - """ - Check if the Index only consists of numeric data. - - .. deprecated:: 2.0.0 - Use `pandas.api.types.is_numeric_dtype` instead. - - Returns - ------- - bool - Whether or not the Index only consists of numeric data. - - See Also - -------- - is_boolean : Check if the Index only consists of booleans (deprecated). - is_integer : Check if the Index only consists of integers (deprecated). - is_floating : Check if the Index is a floating type (deprecated). - is_object : Check if the Index is of the object dtype. (deprecated). - is_categorical : Check if the Index holds categorical data (deprecated). - is_interval : Check if the Index holds Interval objects (deprecated). 
- - Examples - -------- - >>> idx = pd.Index([1.0, 2.0, 3.0, 4.0]) - >>> idx.is_numeric() # doctest: +SKIP - True - - >>> idx = pd.Index([1, 2, 3, 4.0]) - >>> idx.is_numeric() # doctest: +SKIP - True - - >>> idx = pd.Index([1, 2, 3, 4]) - >>> idx.is_numeric() # doctest: +SKIP - True - - >>> idx = pd.Index([1, 2, 3, 4.0, np.nan]) - >>> idx.is_numeric() # doctest: +SKIP - True - - >>> idx = pd.Index([1, 2, 3, 4.0, np.nan, "Apple"]) - >>> idx.is_numeric() # doctest: +SKIP - False - """ - warnings.warn( - f"{type(self).__name__}.is_numeric is deprecated. " - "Use pandas.api.types.is_any_real_numeric_dtype instead", - FutureWarning, - stacklevel=find_stack_level(), - ) - return self.inferred_type in ["integer", "floating"] - - @final - def is_object(self) -> bool: - """ - Check if the Index is of the object dtype. - - .. deprecated:: 2.0.0 - Use `pandas.api.types.is_object_dtype` instead. - - Returns - ------- - bool - Whether or not the Index is of the object dtype. - - See Also - -------- - is_boolean : Check if the Index only consists of booleans (deprecated). - is_integer : Check if the Index only consists of integers (deprecated). - is_floating : Check if the Index is a floating type (deprecated). - is_numeric : Check if the Index only consists of numeric data (deprecated). - is_categorical : Check if the Index holds categorical data (deprecated). - is_interval : Check if the Index holds Interval objects (deprecated). - - Examples - -------- - >>> idx = pd.Index(["Apple", "Mango", "Watermelon"]) - >>> idx.is_object() # doctest: +SKIP - True - - >>> idx = pd.Index(["Apple", "Mango", 2.0]) - >>> idx.is_object() # doctest: +SKIP - True - - >>> idx = pd.Index(["Watermelon", "Orange", "Apple", - ... "Watermelon"]).astype("category") - >>> idx.is_object() # doctest: +SKIP - False - - >>> idx = pd.Index([1.0, 2.0, 3.0, 4.0]) - >>> idx.is_object() # doctest: +SKIP - False - """ - warnings.warn( - f"{type(self).__name__}.is_object is deprecated." - "Use pandas.api.types.is_object_dtype instead", - FutureWarning, - stacklevel=find_stack_level(), - ) - return is_object_dtype(self.dtype) - - @final - def is_categorical(self) -> bool: - """ - Check if the Index holds categorical data. - - .. deprecated:: 2.0.0 - Use `isinstance(index.dtype, pd.CategoricalDtype)` instead. - - Returns - ------- - bool - True if the Index is categorical. - - See Also - -------- - CategoricalIndex : Index for categorical data. - is_boolean : Check if the Index only consists of booleans (deprecated). - is_integer : Check if the Index only consists of integers (deprecated). - is_floating : Check if the Index is a floating type (deprecated). - is_numeric : Check if the Index only consists of numeric data (deprecated). - is_object : Check if the Index is of the object dtype. (deprecated). - is_interval : Check if the Index holds Interval objects (deprecated). - - Examples - -------- - >>> idx = pd.Index(["Watermelon", "Orange", "Apple", - ... "Watermelon"]).astype("category") - >>> idx.is_categorical() # doctest: +SKIP - True - - >>> idx = pd.Index([1, 3, 5, 7]) - >>> idx.is_categorical() # doctest: +SKIP - False - - >>> s = pd.Series(["Peter", "Victor", "Elisabeth", "Mar"]) - >>> s - 0 Peter - 1 Victor - 2 Elisabeth - 3 Mar - dtype: object - >>> s.index.is_categorical() # doctest: +SKIP - False - """ - warnings.warn( - f"{type(self).__name__}.is_categorical is deprecated." 
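And for ``is_object``/``is_categorical``, the replacements are ``is_object_dtype`` and an ``isinstance`` check against ``pd.CategoricalDtype``. A sketch; the first result assumes the default object-dtype inference for string data:

>>> from pandas.api.types import is_object_dtype
>>> is_object_dtype(pd.Index(["Apple", "Mango"]))
True
>>> isinstance(pd.Index(["a", "b"]).astype("category").dtype, pd.CategoricalDtype)
True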
- "Use pandas.api.types.is_categorical_dtype instead", - FutureWarning, - stacklevel=find_stack_level(), - ) - - return self.inferred_type in ["categorical"] - - @final - def is_interval(self) -> bool: - """ - Check if the Index holds Interval objects. - - .. deprecated:: 2.0.0 - Use `isinstance(index.dtype, pd.IntervalDtype)` instead. - - Returns - ------- - bool - Whether or not the Index holds Interval objects. - - See Also - -------- - IntervalIndex : Index for Interval objects. - is_boolean : Check if the Index only consists of booleans (deprecated). - is_integer : Check if the Index only consists of integers (deprecated). - is_floating : Check if the Index is a floating type (deprecated). - is_numeric : Check if the Index only consists of numeric data (deprecated). - is_object : Check if the Index is of the object dtype. (deprecated). - is_categorical : Check if the Index holds categorical data (deprecated). - - Examples - -------- - >>> idx = pd.Index([pd.Interval(left=0, right=5), - ... pd.Interval(left=5, right=10)]) - >>> idx.is_interval() # doctest: +SKIP - True - - >>> idx = pd.Index([1, 3, 5, 7]) - >>> idx.is_interval() # doctest: +SKIP - False - """ - warnings.warn( - f"{type(self).__name__}.is_interval is deprecated." - "Use pandas.api.types.is_interval_dtype instead", - FutureWarning, - stacklevel=find_stack_level(), - ) - return self.inferred_type in ["interval"] - - @final - def _holds_integer(self) -> bool: - """ - Whether the type is an integer type. - """ - return self.inferred_type in ["integer", "mixed-integer"] - - @final - def holds_integer(self) -> bool: - """ - Whether the type is an integer type. + >>> idx = pd.Index(["Watermelon", "Orange", "Apple", "Watermelon"]).astype( + ... "category" + ... ) + >>> idx.has_duplicates + True - .. deprecated:: 2.0.0 - Use `pandas.api.types.infer_dtype` instead + >>> idx = pd.Index(["Orange", "Apple", "Watermelon"]).astype("category") + >>> idx.has_duplicates + False """ - warnings.warn( - f"{type(self).__name__}.holds_integer is deprecated. " - "Use pandas.api.types.infer_dtype instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - return self._holds_integer() + return not self.is_unique @cache_readonly def inferred_type(self) -> str_t: """ Return a string of the type inferred from the values. + See Also + -------- + Index.dtype : Return the dtype object of the underlying data. + Examples -------- >>> idx = pd.Index([1, 2, 3]) @@ -2825,9 +2569,15 @@ def hasnans(self) -> bool: ------- bool + See Also + -------- + Index.isna : Detect missing values. + Index.dropna : Return Index without NA/NaN values. + Index.fillna : Fill NA/NaN values with the specified value. + Examples -------- - >>> s = pd.Series([1, 2, 3], index=['a', 'b', None]) + >>> s = pd.Series([1, 2, 3], index=["a", "b", None]) >>> s a 1 b 2 @@ -2878,7 +2628,7 @@ def isna(self) -> npt.NDArray[np.bool_]: Empty strings are not considered NA values. None is considered an NA value. - >>> idx = pd.Index(['black', '', 'red', None]) + >>> idx = pd.Index(["black", "", "red", None]) >>> idx Index(['black', '', 'red', None], dtype='object') >>> idx.isna() @@ -2886,11 +2636,12 @@ def isna(self) -> npt.NDArray[np.bool_]: For datetimes, `NaT` (Not a Time) is considered as an NA value. - >>> idx = pd.DatetimeIndex([pd.Timestamp('1940-04-25'), - ... pd.Timestamp(''), None, pd.NaT]) + >>> idx = pd.DatetimeIndex( + ... [pd.Timestamp("1940-04-25"), pd.Timestamp(""), None, pd.NaT] + ... 
) >>> idx DatetimeIndex(['1940-04-25', 'NaT', 'NaT', 'NaT'], - dtype='datetime64[ns]', freq=None) + dtype='datetime64[s]', freq=None) >>> idx.isna() array([False, True, True, True]) """ @@ -2934,7 +2685,7 @@ def notna(self) -> npt.NDArray[np.bool_]: Empty strings are not considered NA values. None is considered a NA value. - >>> idx = pd.Index(['black', '', 'red', None]) + >>> idx = pd.Index(["black", "", "red", None]) >>> idx Index(['black', '', 'red', None], dtype='object') >>> idx.notna() @@ -2944,7 +2695,7 @@ def notna(self) -> npt.NDArray[np.bool_]: notnull = notna - def fillna(self, value=None, downcast=lib.no_default): + def fillna(self, value): """ Fill NA/NaN values with the specified value. @@ -2953,16 +2704,11 @@ def fillna(self, value=None, downcast=lib.no_default): value : scalar Scalar value to use to fill holes (e.g. 0). This value cannot be a list-likes. - downcast : dict, default is None - A dict of item->dtype of what to downcast if possible, - or the string 'infer' which will try to downcast to an appropriate - equal type (e.g. float64 to int64 if possible). - - .. deprecated:: 2.1.0 Returns ------- Index + NA/NaN values replaced with `value`. See Also -------- @@ -2977,28 +2723,13 @@ def fillna(self, value=None, downcast=lib.no_default): """ if not is_scalar(value): raise TypeError(f"'value' must be a scalar, passed: {type(value).__name__}") - if downcast is not lib.no_default: - warnings.warn( - f"The 'downcast' keyword in {type(self).__name__}.fillna is " - "deprecated and will be removed in a future version. " - "It was previously silently ignored.", - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - downcast = None if self.hasnans: result = self.putmask(self._isnan, value) - if downcast is None: - # no need to care metadata other than name - # because it can't have freq if it has NaTs - # _with_infer needed for test_fillna_categorical - return Index._with_infer(result, name=self.name) - raise NotImplementedError( - f"{type(self).__name__}.fillna does not support 'downcast' " - "argument values other than 'None'." - ) + # no need to care metadata other than name + # because it can't have freq if it has NaTs + # _with_infer needed for test_fillna_categorical + return Index._with_infer(result, name=self.name) return self._view() def dropna(self, how: AnyAll = "any") -> Self: @@ -3014,6 +2745,12 @@ def dropna(self, how: AnyAll = "any") -> Self: Returns ------- Index + Returns an Index object after removing NA/NaN values. + + See Also + -------- + Index.fillna : Fill NA/NaN values with the specified value. + Index.isna : Detect missing values. Examples -------- @@ -3047,6 +2784,7 @@ def unique(self, level: Hashable | None = None) -> Self: Returns ------- Index + Unique values in the index. See Also -------- @@ -3082,6 +2820,7 @@ def drop_duplicates(self, *, keep: DropKeep = "first") -> Self: Returns ------- Index + A new Index object with the duplicate values removed. See Also -------- @@ -3094,20 +2833,20 @@ def drop_duplicates(self, *, keep: DropKeep = "first") -> Self: -------- Generate an pandas.Index with duplicate values. - >>> idx = pd.Index(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo']) + >>> idx = pd.Index(["llama", "cow", "llama", "beetle", "llama", "hippo"]) The `keep` parameter controls which duplicate values are removed. The value 'first' keeps the first occurrence for each set of duplicated entries. The default value of keep is 'first'. 
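With the ``downcast`` keyword gone, ``fillna`` simply fills and preserves the existing dtype. A minimal sketch (illustrative, not from the patch):

>>> idx = pd.Index([1.0, np.nan, 3.0])
>>> idx.fillna(0)
Index([1.0, 0.0, 3.0], dtype='float64')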
- >>> idx.drop_duplicates(keep='first') - Index(['lama', 'cow', 'beetle', 'hippo'], dtype='object') + >>> idx.drop_duplicates(keep="first") + Index(['llama', 'cow', 'beetle', 'hippo'], dtype='object') The value 'last' keeps the last occurrence for each set of duplicated entries. - >>> idx.drop_duplicates(keep='last') - Index(['cow', 'beetle', 'lama', 'hippo'], dtype='object') + >>> idx.drop_duplicates(keep="last") + Index(['cow', 'beetle', 'llama', 'hippo'], dtype='object') The value ``False`` discards all sets of duplicated entries. @@ -3141,6 +2880,7 @@ def duplicated(self, keep: DropKeep = "first") -> npt.NDArray[np.bool_]: Returns ------- np.ndarray[bool] + A numpy array of boolean values indicating duplicate index values. See Also -------- @@ -3153,19 +2893,19 @@ def duplicated(self, keep: DropKeep = "first") -> npt.NDArray[np.bool_]: By default, for each set of duplicated values, the first occurrence is set to False and all others to True: - >>> idx = pd.Index(['lama', 'cow', 'lama', 'beetle', 'lama']) + >>> idx = pd.Index(["llama", "cow", "llama", "beetle", "llama"]) >>> idx.duplicated() array([False, False, True, False, True]) which is equivalent to - >>> idx.duplicated(keep='first') + >>> idx.duplicated(keep="first") array([False, False, True, False, True]) By using 'last', the last occurrence of each set of duplicated values is set on False and all others on True: - >>> idx.duplicated(keep='last') + >>> idx.duplicated(keep="last") array([ True, False, True, False, False]) By setting keep on ``False``, all duplicates are True: @@ -3186,14 +2926,12 @@ def __iadd__(self, other): return self + other @final - def __nonzero__(self) -> NoReturn: + def __bool__(self) -> NoReturn: raise ValueError( f"The truth value of a {type(self).__name__} is ambiguous. " "Use a.empty, a.bool(), a.item(), a.any() or a.all()." ) - __bool__ = __nonzero__ - # -------------------------------------------------------------------- # Set Operation Methods @@ -3209,7 +2947,7 @@ def _get_reconciled_name_object(self, other): return self @final - def _validate_sort_keyword(self, sort): + def _validate_sort_keyword(self, sort) -> None: if sort not in [None, False, True]: raise ValueError( "The 'sort' keyword only takes the values of " @@ -3221,7 +2959,7 @@ def _dti_setop_align_tzs(self, other: Index, setop: str_t) -> tuple[Index, Index """ With mismatched timezones, cast both to UTC. """ - # Caller is responsibelf or checking + # Caller is responsible for checking # `self.dtype != other.dtype` if ( isinstance(self, ABCDatetimeIndex) @@ -3236,7 +2974,7 @@ def _dti_setop_align_tzs(self, other: Index, setop: str_t) -> tuple[Index, Index return self, other @final - def union(self, other, sort=None): + def union(self, other, sort: bool | None = None): """ Form the union of two Index objects. @@ -3246,6 +2984,8 @@ def union(self, other, sort=None): Parameters ---------- other : Index or array-like + Index or an array-like object containing elements to form the union + with the original Index. sort : bool or None, default None Whether to sort the resulting Index. @@ -3262,6 +3002,14 @@ def union(self, other, sort=None): Returns ------- Index + Returns a new Index object with all unique elements from both the original + Index and the `other` Index. + + See Also + -------- + Index.unique : Return unique values in the index. + Index.intersection : Form the intersection of two Index objects. + Index.difference : Return a new Index with elements of index not in `other`. 
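As context for the ``union`` signature tightened above (``sort: bool | None = None``), the accepted values differ only in the ordering of the result. A hedged sketch of the ``None`` vs ``False`` cases:

>>> pd.Index([3, 1]).union(pd.Index([2]))
Index([1, 2, 3], dtype='int64')
>>> pd.Index([3, 1]).union(pd.Index([2]), sort=False)
Index([3, 1, 2], dtype='int64')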
Examples -------- @@ -3274,7 +3022,7 @@ def union(self, other, sort=None): Union mismatched dtypes - >>> idx1 = pd.Index(['a', 'b', 'c', 'd']) + >>> idx1 = pd.Index(["a", "b", "c", "d"]) >>> idx2 = pd.Index([1, 2, 3, 4]) >>> idx1.union(idx2) Index(['a', 'b', 'c', 'd', 1, 2, 3, 4], dtype='object') @@ -3382,9 +3130,7 @@ def _union(self, other: Index, sort: bool | None): if ( sort in (None, True) - and self.is_monotonic_increasing - and other.is_monotonic_increasing - and not (self.has_duplicates and other.has_duplicates) + and (self.is_unique or other.is_unique) and self._can_use_libjoin and other._can_use_libjoin ): @@ -3399,7 +3145,7 @@ def _union(self, other: Index, sort: bool | None): # worth making this faster? a very unusual case value_set = set(lvals) - value_list.extend([x for x in rvals if x not in value_set]) + value_list.extend(x for x in rvals if x not in value_set) # If objects are unorderable, we must have object dtype. return np.array(value_list, dtype=object) @@ -3458,6 +3204,8 @@ def intersection(self, other, sort: bool = False): Parameters ---------- other : Index or array-like + An Index or an array-like object containing elements to form the + intersection with the original Index. sort : True, False or None, default False Whether to sort the resulting index. @@ -3469,6 +3217,14 @@ def intersection(self, other, sort: bool = False): Returns ------- Index + Returns a new Index object with elements common to both the original Index + and the `other` Index. + + See Also + -------- + Index.union : Form the union of two Index objects. + Index.difference : Return a new Index with elements of index not in other. + Index.isin : Return a boolean array where the index values are in values. Examples -------- @@ -3536,12 +3292,7 @@ def _intersection(self, other: Index, sort: bool = False): """ intersection specialized to the case with matching dtypes. """ - if ( - self.is_monotonic_increasing - and other.is_monotonic_increasing - and self._can_use_libjoin - and other._can_use_libjoin - ): + if self._can_use_libjoin and other._can_use_libjoin: try: res_indexer, indexer, _ = self._inner_indexer(other) except TypeError: @@ -3600,7 +3351,7 @@ def _intersection_via_get_indexer( return result @final - def difference(self, other, sort=None): + def difference(self, other, sort: bool | None = None): """ Return a new Index with elements of index not in `other`. @@ -3609,6 +3360,8 @@ def difference(self, other, sort=None): Parameters ---------- other : Index or array-like + Index object or an array-like object containing elements to be compared + with the elements of the original Index. sort : bool or None, default None Whether to sort the resulting index. By default, the values are attempted to be sorted, but any TypeError from @@ -3622,6 +3375,14 @@ def difference(self, other, sort=None): Returns ------- Index + Returns a new Index object containing elements that are in the original + Index but not in the `other` Index. + + See Also + -------- + Index.symmetric_difference : Compute the symmetric difference of two Index + objects. + Index.intersection : Form the intersection of two Index objects. 
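The relaxed fast-path guard in ``_union`` above (at least one side unique, with monotonicity folded into ``_can_use_libjoin`` later in this patch) still has to respect the fact that ``union`` keeps duplicates from both sides. A sketch of that multiset behavior (illustrative only):

>>> pd.Index([1, 1, 2]).union(pd.Index([2, 2, 3]))
Index([1, 1, 2, 2, 3], dtype='int64')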
Examples -------- @@ -3663,9 +3424,12 @@ def difference(self, other, sort=None): def _difference(self, other, sort): # overridden by RangeIndex + this = self + if isinstance(self, ABCCategoricalIndex) and self.hasnans and other.hasnans: + this = this.dropna() other = other.unique() - the_diff = self[other.get_indexer_for(self) == -1] - the_diff = the_diff if self.is_unique else the_diff.unique() + the_diff = this[other.get_indexer_for(this) == -1] + the_diff = the_diff if this.is_unique else the_diff.unique() the_diff = _maybe_try_sort(the_diff, sort) return the_diff @@ -3673,14 +3437,22 @@ def _wrap_difference_result(self, other, result): # We will override for MultiIndex to handle empty results return self._wrap_setop_result(other, result) - def symmetric_difference(self, other, result_name=None, sort=None): + def symmetric_difference( + self, + other, + result_name: abc.Hashable | None = None, + sort: bool | None = None, + ): """ Compute the symmetric difference of two Index objects. Parameters ---------- other : Index or array-like + Index or an array-like object with elements to compute the symmetric + difference with the original Index. result_name : str + A string representing the name of the resulting Index, if desired. sort : bool or None, default None Whether to sort the resulting index. By default, the values are attempted to be sorted, but any TypeError from @@ -3694,6 +3466,14 @@ def symmetric_difference(self, other, result_name=None, sort=None): Returns ------- Index + Returns a new Index object containing elements that appear in either the + original Index or the `other` Index, but not both. + + See Also + -------- + Index.difference : Return a new Index with elements of index not in other. + Index.union : Form the union of two Index objects. + Index.intersection : Form the intersection of two Index objects. Notes ----- @@ -3778,23 +3558,35 @@ def get_loc(self, key): Parameters ---------- key : label + The key to check its location if it is present in the index. Returns ------- int if unique index, slice if monotonic index, else mask + Integer location, slice or boolean mask. + + See Also + -------- + Index.get_slice_bound : Calculate slice bound that corresponds to + given label. + Index.get_indexer : Computes indexer and mask for new index given + the current index. + Index.get_non_unique : Returns indexer and masks for new index given + the current index. + Index.get_indexer_for : Returns an indexer even when non-unique. 
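For the ``symmetric_difference`` signature annotated above, a short doctest-style sketch of the documented semantics (elements in either index, but not both):

>>> pd.Index([1, 2, 3, 4]).symmetric_difference(pd.Index([2, 3, 4, 5]))
Index([1, 5], dtype='int64')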
Examples -------- - >>> unique_index = pd.Index(list('abc')) - >>> unique_index.get_loc('b') + >>> unique_index = pd.Index(list("abc")) + >>> unique_index.get_loc("b") 1 - >>> monotonic_index = pd.Index(list('abbc')) - >>> monotonic_index.get_loc('b') + >>> monotonic_index = pd.Index(list("abbc")) + >>> monotonic_index.get_loc("b") slice(1, 3, None) - >>> non_monotonic_index = pd.Index(list('abcb')) - >>> non_monotonic_index.get_loc('b') + >>> non_monotonic_index = pd.Index(list("abcb")) + >>> non_monotonic_index.get_loc("b") array([False, True, False, True]) """ casted_key = self._maybe_cast_indexer(key) @@ -3805,7 +3597,7 @@ def get_loc(self, key): isinstance(casted_key, abc.Iterable) and any(isinstance(x, slice) for x in casted_key) ): - raise InvalidIndexError(key) + raise InvalidIndexError(key) from err raise KeyError(key) from err except TypeError: # If we have a listlike key, _check_indexing_error will raise @@ -3831,6 +3623,7 @@ def get_indexer( Parameters ---------- target : Index + An iterable containing the values to be used for computing indexer. method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional * default: exact matches only. * pad / ffill: find the PREVIOUS index value if no exact match. @@ -3858,6 +3651,12 @@ def get_indexer( positions matches the corresponding target values. Missing values in the target are marked by -1. + See Also + -------- + Index.get_indexer_for : Returns an indexer even when non-unique. + Index.get_non_unique : Returns indexer and masks for new index given + the current index. + Notes ----- Returns -1 for unmatched values, for further explanation see the @@ -3865,8 +3664,8 @@ def get_indexer( Examples -------- - >>> index = pd.Index(['c', 'a', 'b']) - >>> index.get_indexer(['a', 'b', 'x']) + >>> index = pd.Index(["c", "a", "b"]) + >>> index.get_indexer(["a", "b", "x"]) array([ 1, 2, -1]) Notice that the return value is an array of locations in ``index`` @@ -4050,7 +3849,7 @@ def _convert_tolerance(self, tolerance, target: np.ndarray | Index) -> np.ndarra raise ValueError( f"tolerance argument for {type(self).__name__} with dtype {self.dtype} " - f"must be numeric if it is a scalar: {repr(tolerance)}" + f"must be numeric if it is a scalar: {tolerance!r}" ) return tolerance @@ -4097,7 +3896,7 @@ def _get_fill_indexer_searchsorted( """ if limit is not None: raise ValueError( - f"limit argument for {repr(method)} method only well-defined " + f"limit argument for {method!r} method only well-defined " "if index and target are monotonic" ) @@ -4216,25 +4015,6 @@ def _convert_slice_indexer(self, key: slice, kind: Literal["loc", "getitem"]): # TODO(GH#50617): once Series.__[gs]etitem__ is removed we should be able # to simplify this. - if lib.is_np_dtype(self.dtype, "f"): - # We always treat __getitem__ slicing as label-based - # translate to locations - if kind == "getitem" and is_index_slice and not start == stop and step != 0: - # exclude step=0 from the warning because it will raise anyway - # start/stop both None e.g. [:] or [::-1] won't change. - # exclude start==stop since it will be empty either way, or - # will be [:] or [::-1] which won't change - warnings.warn( - # GH#49612 - "The behavior of obj[i:j] with a float-dtype index is " - "deprecated. In a future version, this will be treated as " - "positional instead of label-based. 
For label-based slicing, " - "use obj.loc[i:j] instead", - FutureWarning, - stacklevel=find_stack_level(), - ) - return self.slice_indexer(start, stop, step) - if kind == "getitem": # called from the getitem slicers, validate that we are in fact integers if is_index_slice: @@ -4331,6 +4111,7 @@ def reindex( Parameters ---------- target : an iterable + An iterable containing the values to be used for creating the new index. method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional * default: exact matches only. * pad / ffill: find the PREVIOUS index value if no exact match. @@ -4376,10 +4157,10 @@ def reindex( Examples -------- - >>> idx = pd.Index(['car', 'bike', 'train', 'tractor']) + >>> idx = pd.Index(["car", "bike", "train", "tractor"]) >>> idx Index(['car', 'bike', 'train', 'tractor'], dtype='object') - >>> idx.reindex(['car', 'bike']) + >>> idx.reindex(["car", "bike"]) (Index(['car', 'bike'], dtype='object'), array([0, 1])) """ # GH6552: preserve names when reindexing to non-named target @@ -4387,7 +4168,8 @@ def reindex( preserve_names = not hasattr(target, "name") # GH7774: preserve dtype/tz if target is empty and not an Index. - target = ensure_has_len(target) # target may be an iterator + if is_iterator(target): + target = list(target) if not isinstance(target, Index) and len(target) == 0: if level is not None and self._is_multi: @@ -4434,7 +4216,7 @@ def _wrap_reindex_result(self, target, indexer, preserve_names: bool): target = self._maybe_preserve_names(target, preserve_names) return target - def _maybe_preserve_names(self, target: Index, preserve_names: bool): + def _maybe_preserve_names(self, target: IndexT, preserve_names: bool) -> IndexT: if preserve_names and target.nlevels == 1 and target.name != self.name: target = target.copy(deep=False) target.name = self.name @@ -4524,8 +4306,7 @@ def join( level: Level = ..., return_indexers: Literal[True], sort: bool = ..., - ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: - ... + ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: ... @overload def join( @@ -4536,8 +4317,7 @@ def join( level: Level = ..., return_indexers: Literal[False] = ..., sort: bool = ..., - ) -> Index: - ... + ) -> Index: ... @overload def join( @@ -4548,8 +4328,9 @@ def join( level: Level = ..., return_indexers: bool = ..., sort: bool = ..., - ) -> Index | tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: - ... + ) -> ( + Index | tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None] + ): ... @final @_maybe_return_indexers @@ -4568,9 +4349,12 @@ def join( Parameters ---------- other : Index + The other index on which join is performed. how : {'left', 'right', 'inner', 'outer'} level : int or level name, default None + It is either the integer position or the name of the level. return_indexers : bool, default False + Whether to return the indexers or not for both the index objects. sort : bool, default False Sort the join keys lexicographically in the result Index. If False, the order of the join keys depends on the join type (how keyword). @@ -4578,13 +4362,24 @@ def join( Returns ------- join_index, (left_indexer, right_indexer) + The new index. + + See Also + -------- + DataFrame.join : Join columns with `other` DataFrame either on index + or on a key. + DataFrame.merge : Merge DataFrame or named Series objects with a + database-style join. 
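The ``reindex`` hunk above swaps the ``ensure_has_len`` helper for an explicit ``is_iterator`` check; generator targets are still materialized before use. A hedged sketch of the unchanged behavior:

>>> idx = pd.Index(["car", "bike", "train"])
>>> idx.reindex(iter(["car", "bike"]))
(Index(['car', 'bike'], dtype='object'), array([0, 1]))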
Examples -------- >>> idx1 = pd.Index([1, 2, 3]) >>> idx2 = pd.Index([4, 5, 6]) - >>> idx1.join(idx2, how='outer') + >>> idx1.join(idx2, how="outer") Index([1, 2, 3, 4, 5, 6], dtype='int64') + >>> idx1.join(other=idx2, how="outer", return_indexers=True) + (Index([1, 2, 3, 4, 5, 6], dtype='int64'), + array([ 0, 1, 2, -1, -1, -1]), array([-1, -1, -1, 0, 1, 2])) """ other = ensure_index(other) sort = sort or how == "outer" @@ -4615,38 +4410,12 @@ def join( if level is not None and (self._is_multi or other._is_multi): return self._join_level(other, level, how=how) - lidx: np.ndarray | None - ridx: np.ndarray | None - - if len(other) == 0: - if how in ("left", "outer"): - if sort and not self.is_monotonic_increasing: - lidx = self.argsort() - join_index = self.take(lidx) - else: - lidx = None - join_index = self._view() - ridx = np.broadcast_to(np.intp(-1), len(join_index)) - return join_index, lidx, ridx - elif how in ("right", "inner", "cross"): - join_index = other._view() - lidx = np.array([], dtype=np.intp) - return join_index, lidx, None - - if len(self) == 0: - if how in ("right", "outer"): - if sort and not other.is_monotonic_increasing: - ridx = other.argsort() - join_index = other.take(ridx) - else: - ridx = None - join_index = other._view() - lidx = np.broadcast_to(np.intp(-1), len(join_index)) - return join_index, lidx, ridx - elif how in ("left", "inner", "cross"): - join_index = self._view() - ridx = np.array([], dtype=np.intp) - return join_index, None, ridx + if len(self) == 0 or len(other) == 0: + try: + return self._join_empty(other, how, sort) + except TypeError: + # object dtype; non-comparable objects + pass if self.dtype != other.dtype: dtype = self._find_common_type_compat(other) @@ -4681,6 +4450,32 @@ def join( return self._join_via_get_indexer(other, how, sort) + def _join_empty( + self, other: Index, how: JoinHow, sort: bool + ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: + assert len(self) == 0 or len(other) == 0 + _validate_join_method(how) + + lidx: np.ndarray | None + ridx: np.ndarray | None + + if len(other): + how = cast(JoinHow, {"left": "right", "right": "left"}.get(how, how)) + join_index, ridx, lidx = other._join_empty(self, how, sort) + elif how in ["left", "outer"]: + if sort and not self.is_monotonic_increasing: + lidx = self.argsort() + join_index = self.take(lidx) + else: + lidx = None + join_index = self._view() + ridx = np.broadcast_to(np.intp(-1), len(join_index)) + else: + join_index = other._view() + lidx = np.array([], dtype=np.intp) + ridx = None + return join_index, lidx, ridx + @final def _join_via_get_indexer( self, other: Index, how: JoinHow, sort: bool @@ -4689,11 +4484,23 @@ def _join_via_get_indexer( # uniqueness/monotonicity # Note: at this point we have checked matching dtypes + lindexer: npt.NDArray[np.intp] | None + rindexer: npt.NDArray[np.intp] | None if how == "left": - join_index = self.sort_values() if sort else self + if sort: + join_index, lindexer = self.sort_values(return_indexer=True) + rindexer = other.get_indexer_for(join_index) + return join_index, lindexer, rindexer + else: + join_index = self elif how == "right": - join_index = other.sort_values() if sort else other + if sort: + join_index, rindexer = other.sort_values(return_indexer=True) + lindexer = self.get_indexer_for(join_index) + return join_index, lindexer, rindexer + else: + join_index = other elif how == "inner": join_index = self.intersection(other, sort=sort) elif how == "outer": @@ -4706,6 +4513,10 @@ def _join_via_get_indexer( 
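The empty-index special cases previously inlined in ``join`` are factored into the new ``_join_empty`` helper above. Through the public API the behavior looks like this (illustrative sketch; exact reprs may vary by version):

>>> left = pd.Index([1, 2, 3])
>>> empty = pd.Index([], dtype="int64")
>>> left.join(empty, how="left")
Index([1, 2, 3], dtype='int64')
>>> left.join(empty, how="inner")
Index([], dtype='int64')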
except TypeError: pass + names = other.names if how == "right" else self.names + if join_index.names != names: + join_index = join_index.set_names(names) + if join_index is self: lindexer = None else: @@ -4722,8 +4533,8 @@ def _join_multi(self, other: Index, how: JoinHow): from pandas.core.reshape.merge import restore_dropped_levels_multijoin # figure out join names - self_names_list = list(com.not_none(*self.names)) - other_names_list = list(com.not_none(*other.names)) + self_names_list = list(self.names) + other_names_list = list(other.names) self_names_order = self_names_list.index other_names_order = other_names_list.index self_names = set(self_names_list) @@ -4817,11 +4628,18 @@ def _join_non_unique( left_idx, right_idx = get_join_indexers_non_unique( self._values, other._values, how=how, sort=sort ) - mask = left_idx == -1 - join_idx = self.take(left_idx) - right = other.take(right_idx) - join_index = join_idx.putmask(mask, right) + if how == "right": + join_index = other.take(right_idx) + else: + join_index = self.take(left_idx) + + if how == "outer": + mask = left_idx == -1 + if mask.any(): + right = other.take(right_idx) + join_index = join_index.putmask(mask, right) + if isinstance(join_index, ABCMultiIndex) and how == "outer": # test_join_index_levels join_index = join_index._sort_levels_monotonic() @@ -4976,11 +4794,13 @@ def _get_leaf_sorter(labels: list[np.ndarray]) -> npt.NDArray[np.intp]: ) return join_index, left_indexer, right_indexer - @final def _join_monotonic( self, other: Index, how: JoinHow = "left" ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: - # We only get here with matching dtypes and both monotonic increasing + # We only get here with (caller is responsible for ensuring): + # 1) matching dtypes + # 2) both monotonic increasing + # 3) other.is_unique or self.is_unique assert other.dtype == self.dtype assert self._can_use_libjoin and other._can_use_libjoin @@ -4994,62 +4814,71 @@ def _join_monotonic( ridx: npt.NDArray[np.intp] | None lidx: npt.NDArray[np.intp] | None - if self.is_unique and other.is_unique: - # We can perform much better than the general case - if how == "left": + if how == "left": + if other.is_unique: + # We can perform much better than the general case join_index = self lidx = None ridx = self._left_indexer_unique(other) - elif how == "right": + else: + join_array, lidx, ridx = self._left_indexer(other) + join_index, lidx, ridx = self._wrap_join_result( + join_array, other, lidx, ridx, how + ) + elif how == "right": + if self.is_unique: + # We can perform much better than the general case join_index = other lidx = other._left_indexer_unique(self) ridx = None - elif how == "inner": - join_array, lidx, ridx = self._inner_indexer(other) - join_index = self._wrap_joined_index(join_array, other, lidx, ridx) - elif how == "outer": - join_array, lidx, ridx = self._outer_indexer(other) - join_index = self._wrap_joined_index(join_array, other, lidx, ridx) - else: - if how == "left": - join_array, lidx, ridx = self._left_indexer(other) - elif how == "right": + else: join_array, ridx, lidx = other._left_indexer(self) - elif how == "inner": - join_array, lidx, ridx = self._inner_indexer(other) - elif how == "outer": - join_array, lidx, ridx = self._outer_indexer(other) - - assert lidx is not None - assert ridx is not None - - join_index = self._wrap_joined_index(join_array, other, lidx, ridx) + join_index, lidx, ridx = self._wrap_join_result( + join_array, other, lidx, ridx, how + ) + elif how == "inner": + join_array, 
lidx, ridx = self._inner_indexer(other) + join_index, lidx, ridx = self._wrap_join_result( + join_array, other, lidx, ridx, how + ) + elif how == "outer": + join_array, lidx, ridx = self._outer_indexer(other) + join_index, lidx, ridx = self._wrap_join_result( + join_array, other, lidx, ridx, how + ) lidx = None if lidx is None else ensure_platform_int(lidx) ridx = None if ridx is None else ensure_platform_int(ridx) return join_index, lidx, ridx - def _wrap_joined_index( + def _wrap_join_result( self, joined: ArrayLike, other: Self, - lidx: npt.NDArray[np.intp], - ridx: npt.NDArray[np.intp], - ) -> Self: + lidx: npt.NDArray[np.intp] | None, + ridx: npt.NDArray[np.intp] | None, + how: JoinHow, + ) -> tuple[Self, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: assert other.dtype == self.dtype - if isinstance(self, ABCMultiIndex): - name = self.names if self.names == other.names else None - # error: Incompatible return value type (got "MultiIndex", - # expected "Self") - mask = lidx == -1 - join_idx = self.take(lidx) - right = cast("MultiIndex", other.take(ridx)) - join_index = join_idx.putmask(mask, right)._sort_levels_monotonic() - return join_index.set_names(name) # type: ignore[return-value] + if lidx is not None and lib.is_range_indexer(lidx, len(self)): + lidx = None + if ridx is not None and lib.is_range_indexer(ridx, len(other)): + ridx = None + + # return self or other if possible to maintain cached attributes + if lidx is None: + join_index = self + elif ridx is None: + join_index = other else: - name = get_op_result_name(self, other) - return self._constructor._with_infer(joined, name=name, dtype=self.dtype) + join_index = self._constructor._with_infer(joined, dtype=self.dtype) + + names = other.names if how == "right" else self.names + if join_index.names != names: + join_index = join_index.set_names(names) + + return join_index, lidx, ridx @final @cache_readonly @@ -5062,13 +4891,20 @@ def _can_use_libjoin(self) -> bool: making a copy. If we cannot, this negates the performance benefit of using libjoin. """ + if not self.is_monotonic_increasing: + # The libjoin functions all assume monotonicity. + return False + if type(self) is Index: # excludes EAs, but include masks, we get here with monotonic # values only, meaning no NA return ( isinstance(self.dtype, np.dtype) or isinstance(self._values, (ArrowExtensionArray, BaseMaskedArray)) - or self.dtype == "string[python]" + or ( + isinstance(self.dtype, StringDtype) + and self.dtype.storage == "python" + ) ) # Exclude index types where the conversion to numpy converts to object dtype, # which negates the performance benefit of libjoin @@ -5092,6 +4928,10 @@ def values(self) -> ArrayLike: :meth:`Index.to_numpy`, depending on whether you need a reference to the underlying data or a NumPy array. + .. versionchanged:: 3.0.0 + + The returned array is read-only. 
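The ``.. versionchanged:: 3.0.0`` note above is backed by the ``values`` implementation later in this patch, which hands back a read-only view for ndarray-backed indexes. A minimal sketch of the documented behavior:

>>> idx = pd.Index([1, 2, 3])
>>> idx.values.flags.writeable
False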
+ Returns ------- array: numpy.ndarray or ExtensionArray @@ -5119,13 +4959,11 @@ def values(self) -> ArrayLike: [(0, 1], (1, 2], (2, 3], (3, 4], (4, 5]] Length: 5, dtype: interval[int64, right] """ - if using_copy_on_write(): - data = self._data - if isinstance(data, np.ndarray): - data = data.view() - data.flags.writeable = False - return data - return self._data + data = self._data + if isinstance(data, np.ndarray): + data = data.view() + data.flags.writeable = False + return data @cache_readonly @doc(IndexOpsMixin.array) @@ -5232,8 +5070,9 @@ def _from_join_target(self, result: np.ndarray) -> ArrayLike: def memory_usage(self, deep: bool = False) -> int: result = self._memory_usage(deep=deep) - # include our engine hashtable - result += self._engine.sizeof(deep=deep) + # include our engine hashtable, only if it's already cached + if "_engine" in self._cache: + result += self._engine.sizeof(deep=deep) return result @final @@ -5263,10 +5102,10 @@ def where(self, cond, other=None) -> Index: Examples -------- - >>> idx = pd.Index(['car', 'bike', 'train', 'tractor']) + >>> idx = pd.Index(["car", "bike", "train", "tractor"]) >>> idx Index(['car', 'bike', 'train', 'tractor'], dtype='object') - >>> idx.where(idx.isin(['car', 'train']), 'other') + >>> idx.where(idx.isin(["car", "train"]), "other") Index(['car', 'other', 'train', 'other'], dtype='object') """ if isinstance(self, ABCMultiIndex): @@ -5310,11 +5149,14 @@ def _validate_fill_value(self, value): raise TypeError return value + @cache_readonly def _is_memory_usage_qualified(self) -> bool: """ Return a boolean if we need a qualified .info display. """ - return is_object_dtype(self.dtype) + return is_object_dtype(self.dtype) or ( + is_string_dtype(self.dtype) and self.dtype.storage == "python" # type: ignore[union-attr] + ) def __contains__(self, key: Any) -> bool: """ @@ -5401,12 +5243,9 @@ def __getitem__(self, key): if not isinstance(self.dtype, ExtensionDtype): if len(key) == 0 and len(key) != len(self): - warnings.warn( - "Using a boolean indexer with length 0 on an Index with " - "length greater than 0 is deprecated and will raise in a " - "future version.", - FutureWarning, - stacklevel=find_stack_level(), + raise ValueError( + "The length of the boolean indexer cannot be 0 " + "when the Index has length greater than 0." ) result = getitem(key) @@ -5455,10 +5294,18 @@ def append(self, other: Index | Sequence[Index]) -> Index: Parameters ---------- other : Index or list/tuple of indices + Single Index or a collection of indices, which can be either a list or a + tuple. Returns ------- Index + Returns a new Index object resulting from appending the provided other + indices to the original Index. + + See Also + -------- + Index.insert : Make new Index inserting new item at location. Examples -------- @@ -5498,13 +5345,23 @@ def putmask(self, mask, value) -> Index: """ Return a new Index of the values set with the mask. + Parameters + ---------- + mask : np.ndarray[bool] + Array of booleans denoting where values in the original + data are not ``NA``. + value : scalar + Scalar value to use to fill holes (e.g. 0). + This value cannot be a list-likes. + Returns ------- Index + A new Index of the values set with the mask. See Also -------- - numpy.ndarray.putmask : Changes elements of an array + numpy.putmask : Changes elements of an array based on conditional and input values. 
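Under the ``memory_usage`` change above, the hash-engine's footprint is counted only once the engine has actually been built and cached. A hedged sketch; ``get_loc`` is one way to populate that cache:

>>> idx = pd.Index([1, 2, 3])
>>> base = idx.memory_usage()
>>> idx.get_loc(2)  # building the engine caches it on the Index
1
>>> idx.memory_usage() >= base
True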
Examples @@ -5565,6 +5422,12 @@ def equals(self, other: Any) -> bool: True if "other" is an Index and it has the same elements and order as the calling index; False otherwise. + See Also + -------- + Index.identical: Checks that object attributes and types are also equal. + Index.has_duplicates: Check if the Index has duplicate values. + Index.is_unique: Return if the index has unique values. + Examples -------- >>> idx1 = pd.Index([1, 2, 3]) @@ -5595,10 +5458,10 @@ def equals(self, other: Any) -> bool: The dtype is *not* compared - >>> int64_idx = pd.Index([1, 2, 3], dtype='int64') + >>> int64_idx = pd.Index([1, 2, 3], dtype="int64") >>> int64_idx Index([1, 2, 3], dtype='int64') - >>> uint64_idx = pd.Index([1, 2, 3], dtype='uint64') + >>> uint64_idx = pd.Index([1, 2, 3], dtype="uint64") >>> uint64_idx Index([1, 2, 3], dtype='uint64') >>> int64_idx.equals(uint64_idx) @@ -5616,9 +5479,10 @@ def equals(self, other: Any) -> bool: if ( isinstance(self.dtype, StringDtype) - and self.dtype.storage == "pyarrow_numpy" + and self.dtype.na_value is np.nan and other.dtype != self.dtype ): + # TODO(infer_string) can we avoid this special case? # special case for object behavior return other.equals(self.astype(object)) @@ -5649,21 +5513,32 @@ def identical(self, other) -> bool: """ Similar to equals, but checks that object attributes and types are also equal. + Parameters + ---------- + other : Index + The Index object you want to compare with the current Index object. + Returns ------- bool If two Index objects have equal elements and same type True, otherwise False. + See Also + -------- + Index.equals: Determine if two Index object are equal. + Index.has_duplicates: Check if the Index has duplicate values. + Index.is_unique: Return if the index has unique values. + Examples -------- - >>> idx1 = pd.Index(['1', '2', '3']) - >>> idx2 = pd.Index(['1', '2', '3']) + >>> idx1 = pd.Index(["1", "2", "3"]) + >>> idx2 = pd.Index(["1", "2", "3"]) >>> idx2.identical(idx1) True - >>> idx1 = pd.Index(['1', '2', '3'], name="A") - >>> idx2 = pd.Index(['1', '2', '3'], name="B") + >>> idx1 = pd.Index(["1", "2", "3"], name="A") + >>> idx2 = pd.Index(["1", "2", "3"], name="B") >>> idx2.identical(idx1) False """ @@ -5711,39 +5586,38 @@ def asof(self, label): -------- `Index.asof` returns the latest index label up to the passed label. - >>> idx = pd.Index(['2013-12-31', '2014-01-02', '2014-01-03']) - >>> idx.asof('2014-01-01') + >>> idx = pd.Index(["2013-12-31", "2014-01-02", "2014-01-03"]) + >>> idx.asof("2014-01-01") '2013-12-31' If the label is in the index, the method returns the passed label. - >>> idx.asof('2014-01-02') + >>> idx.asof("2014-01-02") '2014-01-02' If all of the labels in the index are later than the passed label, NaN is returned. - >>> idx.asof('1999-01-02') + >>> idx.asof("1999-01-02") nan If the index is not sorted, an error is raised. - >>> idx_not_sorted = pd.Index(['2013-12-31', '2015-01-02', - ... '2014-01-03']) - >>> idx_not_sorted.asof('2013-12-31') + >>> idx_not_sorted = pd.Index(["2013-12-31", "2015-01-02", "2014-01-03"]) + >>> idx_not_sorted.asof("2013-12-31") Traceback (most recent call last): ValueError: index must be monotonic increasing or decreasing """ self._searchsorted_monotonic(label) # validate sortedness try: loc = self.get_loc(label) - except (KeyError, TypeError): + except (KeyError, TypeError) as err: # KeyError -> No exact match, try for padded # TypeError -> passed e.g. 
non-hashable, fall through to get # the tested exception message indexer = self.get_indexer([label], method="pad") if indexer.ndim > 1 or indexer.size > 1: - raise TypeError("asof requires scalar valued input") + raise TypeError("asof requires scalar valued input") from err loc = indexer.item() if loc == -1: return self._na_value @@ -5790,9 +5664,10 @@ def asof_locs( Examples -------- - >>> idx = pd.date_range('2023-06-01', periods=3, freq='D') - >>> where = pd.DatetimeIndex(['2023-05-30 00:12:00', '2023-06-01 00:00:00', - ... '2023-06-02 23:59:59']) + >>> idx = pd.date_range("2023-06-01", periods=3, freq="D") + >>> where = pd.DatetimeIndex( + ... ["2023-05-30 00:12:00", "2023-06-01 00:00:00", "2023-06-02 23:59:59"] + ... ) >>> mask = np.ones(3, dtype=bool) >>> idx.asof_locs(where, mask) array([-1, 0, 1]) @@ -5807,7 +5682,8 @@ def asof_locs( # types "Union[ExtensionArray, ndarray[Any, Any]]", "str" # TODO: will be fixed when ExtensionArray.searchsorted() is fixed locs = self._values[mask].searchsorted( - where._values, side="right" # type: ignore[call-overload] + where._values, + side="right", # type: ignore[call-overload] ) locs = np.where(locs > 0, locs - 1, 0) @@ -5826,8 +5702,7 @@ def sort_values( ascending: bool = ..., na_position: NaPosition = ..., key: Callable | None = ..., - ) -> Self: - ... + ) -> Self: ... @overload def sort_values( @@ -5837,8 +5712,7 @@ def sort_values( ascending: bool = ..., na_position: NaPosition = ..., key: Callable | None = ..., - ) -> tuple[Self, np.ndarray]: - ... + ) -> tuple[Self, np.ndarray]: ... @overload def sort_values( @@ -5848,14 +5722,11 @@ def sort_values( ascending: bool = ..., na_position: NaPosition = ..., key: Callable | None = ..., - ) -> Self | tuple[Self, np.ndarray]: - ... + ) -> Self | tuple[Self, np.ndarray]: ... - @deprecate_nonkeyword_arguments( - version="3.0", allowed_args=["self"], name="sort_values" - ) def sort_values( self, + *, return_indexer: bool = False, ascending: bool = True, na_position: NaPosition = "last", @@ -5913,17 +5784,14 @@ def sort_values( (Index([1000, 100, 10, 1], dtype='int64'), array([3, 1, 0, 2])) """ if key is None and ( - self.is_monotonic_increasing or self.is_monotonic_decreasing + (ascending and self.is_monotonic_increasing) + or (not ascending and self.is_monotonic_decreasing) ): - reverse = ascending != self.is_monotonic_increasing - sorted_index = self[::-1] if reverse else self.copy() if return_indexer: indexer = np.arange(len(self), dtype=np.intp) - if reverse: - indexer = indexer[::-1] - return sorted_index, indexer + return self.copy(), indexer else: - return sorted_index + return self.copy() # GH 35584. Sort missing values according to na_position kwarg # ignore na_position for MultiIndex @@ -5944,14 +5812,7 @@ def sort_values( else: return sorted_index - @final - def sort(self, *args, **kwargs): - """ - Use sort_values instead. - """ - raise TypeError("cannot sort an Index object in-place, use sort_values instead") - - def shift(self, periods: int = 1, freq=None): + def shift(self, periods: int = 1, freq=None) -> Self: """ Shift index by desired number of time frequency increments. @@ -5986,7 +5847,7 @@ def shift(self, periods: int = 1, freq=None): -------- Put the first 5 month starts of 2011 into an index. 
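The ``sort_values`` fast path above now fires only when the requested order matches the index's existing monotonic direction, and it returns a plain copy plus an ``arange`` indexer. A sketch (illustrative, not from the patch):

>>> idx = pd.Index([3, 2, 1])
>>> idx.sort_values()  # direction mismatch: falls through to the general path
Index([1, 2, 3], dtype='int64')
>>> idx.sort_values(ascending=False, return_indexer=True)
(Index([3, 2, 1], dtype='int64'), array([0, 1, 2]))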
- >>> month_starts = pd.date_range('1/1/2011', periods=5, freq='MS') + >>> month_starts = pd.date_range("1/1/2011", periods=5, freq="MS") >>> month_starts DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01', '2011-04-01', '2011-05-01'], @@ -5994,7 +5855,7 @@ def shift(self, periods: int = 1, freq=None): Shift the index by 10 days. - >>> month_starts.shift(10, freq='D') + >>> month_starts.shift(10, freq="D") DatetimeIndex(['2011-01-11', '2011-02-11', '2011-03-11', '2011-04-11', '2011-05-11'], dtype='datetime64[ns]', freq=None) @@ -6036,7 +5897,7 @@ def argsort(self, *args, **kwargs) -> npt.NDArray[np.intp]: Examples -------- - >>> idx = pd.Index(['b', 'a', 'd', 'c']) + >>> idx = pd.Index(["b", "a", "d", "c"]) >>> idx Index(['b', 'a', 'd', 'c'], dtype='object') @@ -6051,7 +5912,7 @@ def argsort(self, *args, **kwargs) -> npt.NDArray[np.intp]: # by RangeIndex, MultIIndex return self._data.argsort(*args, **kwargs) - def _check_indexing_error(self, key): + def _check_indexing_error(self, key) -> None: if not is_scalar(key): # if key is not a scalar, directly raise an error (the code below # would convert to numpy arrays and raise later any way) - GH29926 @@ -6069,9 +5930,7 @@ def _should_fallback_to_positional(self) -> bool: "complex", } - _index_shared_docs[ - "get_indexer_non_unique" - ] = """ + _index_shared_docs["get_indexer_non_unique"] = """ Compute indexer and mask for new index given the current index. The indexer should be then used as an input to ndarray.take to align the @@ -6080,6 +5939,7 @@ def _should_fallback_to_positional(self) -> bool: Parameters ---------- target : %(target_klass)s + An iterable containing the values to be used for computing indexer. Returns ------- @@ -6091,6 +5951,12 @@ def _should_fallback_to_positional(self) -> bool: An indexer into the target of the values not found. These correspond to the -1 in the indexer array. + See Also + -------- + Index.get_indexer : Computes indexer and mask for new index given + the current index. + Index.get_indexer_for : Returns an indexer even when non-unique. + Examples -------- >>> index = pd.Index(['c', 'b', 'a', 'b', 'b']) @@ -6121,7 +5987,6 @@ def _should_fallback_to_positional(self) -> bool: def get_indexer_non_unique( self, target ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: - target = ensure_index(target) target = self._maybe_cast_listlike_indexer(target) if not self._should_compare(target) and not self._should_partial_index(target): @@ -6166,14 +6031,26 @@ def get_indexer_for(self, target) -> npt.NDArray[np.intp]: This dispatches to get_indexer or get_indexer_non_unique as appropriate. + Parameters + ---------- + target : Index + An iterable containing the values to be used for computing indexer. + Returns ------- np.ndarray[np.intp] List of indices. + See Also + -------- + Index.get_indexer : Computes indexer and mask for new index given + the current index. + Index.get_non_unique : Returns indexer and masks for new index given + the current index. + Examples -------- - >>> idx = pd.Index([np.nan, 'var1', np.nan]) + >>> idx = pd.Index([np.nan, "var1", np.nan]) >>> idx.get_indexer_for([np.nan]) array([0, 2]) """ @@ -6253,20 +6130,17 @@ def _raise_if_missing(self, key, indexer, axis_name: str_t) -> None: @overload def _get_indexer_non_comparable( self, target: Index, method, unique: Literal[True] = ... - ) -> npt.NDArray[np.intp]: - ... + ) -> npt.NDArray[np.intp]: ... 
@overload def _get_indexer_non_comparable( self, target: Index, method, unique: Literal[False] - ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: - ... + ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... @overload def _get_indexer_non_comparable( self, target: Index, method, unique: bool = True - ) -> npt.NDArray[np.intp] | tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: - ... + ) -> npt.NDArray[np.intp] | tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... @final def _get_indexer_non_comparable( @@ -6374,6 +6248,24 @@ def _find_common_type_compat(self, target) -> DtypeObj: """ target_dtype, _ = infer_dtype_from(target) + if using_string_dtype(): + # special case: if left or right is a zero-length RangeIndex or + # Index[object], those can be created by the default empty constructors + # -> for that case ignore this dtype and always return the other + # (https://github.com/pandas-dev/pandas/pull/60797) + from pandas.core.indexes.range import RangeIndex + + if len(self) == 0 and ( + isinstance(self, RangeIndex) or self.dtype == np.object_ + ): + return target_dtype + if ( + isinstance(target, Index) + and len(target) == 0 + and (isinstance(target, RangeIndex) or target_dtype == np.object_) + ): + return self.dtype + # special case: if one dtype is uint64 and the other a signed int, return object # See https://github.com/pandas-dev/pandas/issues/26778 for discussion # Now it's: @@ -6409,7 +6301,11 @@ def _should_compare(self, other: Index) -> bool: return False dtype = _unpack_nested_dtype(other) - return self._is_comparable_dtype(dtype) or is_object_dtype(dtype) + return ( + self._is_comparable_dtype(dtype) + or is_object_dtype(dtype) + or is_string_dtype(dtype) + ) def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: """ @@ -6424,7 +6320,7 @@ def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: return True @final - def groupby(self, values) -> PrettyDict[Hashable, np.ndarray]: + def groupby(self, values) -> PrettyDict[Hashable, Index]: """ Group the index labels by a given array of values. @@ -6469,19 +6365,23 @@ def map(self, mapper, na_action: Literal["ignore"] | None = None): If the function returns a tuple with more than one element a MultiIndex will be returned. + See Also + -------- + Index.where : Replace values where the condition is False. + Examples -------- >>> idx = pd.Index([1, 2, 3]) - >>> idx.map({1: 'a', 2: 'b', 3: 'c'}) + >>> idx.map({1: "a", 2: "b", 3: "c"}) Index(['a', 'b', 'c'], dtype='object') Using `map` with a function: >>> idx = pd.Index([1, 2, 3]) - >>> idx.map('I am a {}'.format) + >>> idx.map("I am a {}".format) Index(['I am a 1', 'I am a 2', 'I am a 3'], dtype='object') - >>> idx = pd.Index(['a', 'b', 'c']) + >>> idx = pd.Index(["a", "b", "c"]) >>> idx.map(lambda x: x.upper()) Index(['A', 'B', 'C'], dtype='object') """ @@ -6536,7 +6436,7 @@ def _transform_index(self, func, *, level=None) -> Index: items = [func(x) for x in self] return Index(items, name=self.name, tupleize_cols=False) - def isin(self, values, level=None) -> npt.NDArray[np.bool_]: + def isin(self, values, level: str_t | int | None = None) -> npt.NDArray[np.bool_]: """ Return a boolean array where the index values are in `values`. 
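The ``groupby`` annotation above is tightened to ``PrettyDict[Hashable, Index]`` because the grouped values are produced with ``Index.take``. A rough sketch of the mapping shape (exact repr may differ by version):

>>> pd.Index(["a", "b", "a"]).groupby(np.array([1, 2, 1]))
{1: Index(['a', 'a'], dtype='object'), 2: Index(['b'], dtype='object')}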
@@ -6576,7 +6476,7 @@ def isin(self, values, level=None) -> npt.NDArray[np.bool_]: Examples -------- - >>> idx = pd.Index([1,2,3]) + >>> idx = pd.Index([1, 2, 3]) >>> idx Index([1, 2, 3], dtype='int64') @@ -6585,9 +6485,9 @@ def isin(self, values, level=None) -> npt.NDArray[np.bool_]: >>> idx.isin([1, 4]) array([ True, False, False]) - >>> midx = pd.MultiIndex.from_arrays([[1,2,3], - ... ['red', 'blue', 'green']], - ... names=('number', 'color')) + >>> midx = pd.MultiIndex.from_arrays( + ... [[1, 2, 3], ["red", "blue", "green"]], names=["number", "color"] + ... ) >>> midx MultiIndex([(1, 'red'), (2, 'blue'), @@ -6597,12 +6497,12 @@ def isin(self, values, level=None) -> npt.NDArray[np.bool_]: Check whether the strings in the 'color' level of the MultiIndex are in a list of colors. - >>> midx.isin(['red', 'orange', 'yellow'], level='color') + >>> midx.isin(["red", "orange", "yellow"], level="color") array([ True, False, False]) To check across the levels of a MultiIndex, pass a list of tuples: - >>> midx.isin([(1, 'red'), (3, 'red')]) + >>> midx.isin([(1, "red"), (3, "red")]) array([ True, False, False]) """ if level is not None: @@ -6632,30 +6532,37 @@ def slice_indexer( end : label, default None If None, defaults to the end. step : int, default None + If None, defaults to 1. Returns ------- slice + A slice object. Raises ------ KeyError : If key does not exist, or key is not unique and index is not ordered. + See Also + -------- + Index.slice_locs : Computes slice locations for input labels. + Index.get_slice_bound : Retrieves slice bound that corresponds to given label. + Notes ----- - This function assumes that the data is sorted, so use at your own peril + This function assumes that the data is sorted, so use at your own peril. Examples -------- This is a method on all index types. For example you can do: - >>> idx = pd.Index(list('abcd')) - >>> idx.slice_indexer(start='b', end='c') + >>> idx = pd.Index(list("abcd")) + >>> idx.slice_indexer(start="b", end="c") slice(1, 3, None) - >>> idx = pd.MultiIndex.from_arrays([list('abcd'), list('efgh')]) - >>> idx.slice_indexer(start='b', end=('c', 'g')) + >>> idx = pd.MultiIndex.from_arrays([list("abcd"), list("efgh")]) + >>> idx.slice_indexer(start="b", end=("c", "g")) slice(1, 3, None) """ start_slice, end_slice = self.slice_locs(start, end, step=step) @@ -6679,7 +6586,16 @@ def _maybe_cast_listlike_indexer(self, target) -> Index: """ Analogue to maybe_cast_indexer for get_indexer instead of get_loc. """ - return ensure_index(target) + target_index = ensure_index(target) + if ( + not hasattr(target, "dtype") + and self.dtype == object + and target_index.dtype == "string" + ): + # If we started with a list-like, avoid inference to string dtype if self + # is object dtype (coercing to string dtype will alter the missing values) + target_index = Index(target, dtype=self.dtype) + return target_index @final def _validate_indexer( @@ -6751,7 +6667,10 @@ def get_slice_bound(self, label, side: Literal["left", "right"]) -> int: Parameters ---------- label : object + The label for which to calculate the slice bound. side : {'left', 'right'} + if 'left' return leftmost position of given label. + if 'right' return one-past-the-rightmost position of given label. 
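_maybe_cast_listlike_indexer above now keeps a plain list-like target at object dtype when the index itself is object, so missing values survive the lookup instead of being altered by string inference. A rough illustration of the behavior the inline comment describes (relevant mainly when the string dtype option is enabled):

    import numpy as np
    import pandas as pd

    idx = pd.Index(["a", np.nan], dtype=object)
    # NaN in a list-like key still matches the NaN stored in the index:
    print(idx.get_indexer_for(["a", np.nan]))  # -> [0 1]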
Returns ------- @@ -6766,16 +6685,16 @@ def get_slice_bound(self, label, side: Literal["left", "right"]) -> int: Examples -------- >>> idx = pd.RangeIndex(5) - >>> idx.get_slice_bound(3, 'left') + >>> idx.get_slice_bound(3, "left") 3 - >>> idx.get_slice_bound(3, 'right') + >>> idx.get_slice_bound(3, "right") 4 If ``label`` is non-unique in the index, an error will be raised. - >>> idx_duplicate = pd.Index(['a', 'b', 'a', 'c', 'd']) - >>> idx_duplicate.get_slice_bound('a', 'left') + >>> idx_duplicate = pd.Index(["a", "b", "a", "c", "d"]) + >>> idx_duplicate.get_slice_bound("a", "left") Traceback (most recent call last): KeyError: Cannot get left slice bound for non-unique label: 'a' """ @@ -6800,7 +6719,7 @@ def get_slice_bound(self, label, side: Literal["left", "right"]) -> int: return self._searchsorted_monotonic(label, side) except ValueError: # raise the original KeyError - raise err + raise err from None if isinstance(slc, np.ndarray): # get_loc may return a boolean array, which @@ -6810,7 +6729,7 @@ def get_slice_bound(self, label, side: Literal["left", "right"]) -> int: if isinstance(slc, np.ndarray): raise KeyError( f"Cannot get {side} slice bound for non-unique " - f"label: {repr(original_label)}" + f"label: {original_label!r}" ) if isinstance(slc, slice): @@ -6824,7 +6743,12 @@ def get_slice_bound(self, label, side: Literal["left", "right"]) -> int: else: return slc - def slice_locs(self, start=None, end=None, step=None) -> tuple[int, int]: + def slice_locs( + self, + start: SliceType = None, + end: SliceType = None, + step: int | None = None, + ) -> tuple[int, int]: """ Compute slice locations for input labels. @@ -6840,6 +6764,8 @@ def slice_locs(self, start=None, end=None, step=None) -> tuple[int, int]: Returns ------- tuple[int, int] + Returns a tuple of two integers representing the slice locations for the + input labels within the index. See Also -------- @@ -6851,9 +6777,13 @@ def slice_locs(self, start=None, end=None, step=None) -> tuple[int, int]: Examples -------- - >>> idx = pd.Index(list('abcd')) - >>> idx.slice_locs(start='b', end='c') + >>> idx = pd.Index(list("abcd")) + >>> idx.slice_locs(start="b", end="c") (1, 3) + + >>> idx = pd.Index(list("bcde")) + >>> idx.slice_locs(start="a", end="c") + (0, 2) """ inc = step is None or step >= 0 @@ -6912,7 +6842,9 @@ def slice_locs(self, start=None, end=None, step=None) -> tuple[int, int]: return start_slice, end_slice - def delete(self, loc) -> Self: + def delete( + self, loc: int | np.integer | list[int] | npt.NDArray[np.integer] + ) -> Self: """ Make new Index with passed location(-s) deleted. @@ -6933,11 +6865,11 @@ def delete(self, loc) -> Self: Examples -------- - >>> idx = pd.Index(['a', 'b', 'c']) + >>> idx = pd.Index(["a", "b", "c"]) >>> idx.delete(1) Index(['a', 'c'], dtype='object') - >>> idx = pd.Index(['a', 'b', 'c']) + >>> idx = pd.Index(["a", "b", "c"]) >>> idx.delete([0, 2]) Index(['b'], dtype='object') """ @@ -6961,16 +6893,24 @@ def insert(self, loc: int, item) -> Index: Parameters ---------- loc : int + The integer location where the new item will be inserted. item : object + The new item to be inserted into the Index. Returns ------- Index + Returns a new Index object resulting from inserting the specified item at + the specified location within the original Index. + + See Also + -------- + Index.append : Append a collection of Indexes together. 
Examples -------- - >>> idx = pd.Index(['a', 'b', 'c']) - >>> idx.insert(1, 'x') + >>> idx = pd.Index(["a", "b", "c"]) + >>> idx.insert(1, "x") Index(['a', 'x', 'b', 'c'], dtype='object') """ item = lib.item_from_zerodim(item) @@ -6979,6 +6919,14 @@ def insert(self, loc: int, item) -> Index: arr = self._values + if using_string_dtype() and len(self) == 0 and self.dtype == np.object_: + # special case: if we are an empty object-dtype Index, also + # take into account the inserted item for the resulting dtype + # (https://github.com/pandas-dev/pandas/pull/60797) + dtype = self._find_common_type_compat(item) + if dtype != self.dtype: + return self.astype(dtype).insert(loc, item) + try: if isinstance(arr, ExtensionArray): res_values = arr.insert(loc, item) @@ -6990,6 +6938,9 @@ def insert(self, loc: int, item) -> Index: # We cannot keep the same dtype, so cast to the (often object) # minimal shared dtype before doing the insert. dtype = self._find_common_type_compat(item) + if dtype == self.dtype: + # EA's might run into recursion errors if loc is invalid + raise return self.astype(dtype).insert(loc, item) if arr.dtype != object or not isinstance( @@ -7008,23 +6959,8 @@ def insert(self, loc: int, item) -> Index: loc = loc if loc >= 0 else loc - 1 new_values[loc] = item - out = Index._with_infer(new_values, name=self.name) - if ( - using_pyarrow_string_dtype() - and is_string_dtype(out.dtype) - and new_values.dtype == object - ): - out = out.astype(new_values.dtype) - if self.dtype == object and out.dtype != object: - # GH#51363 - warnings.warn( - "The behavior of Index.insert with object-dtype is deprecated, " - "in a future version this will return an object-dtype Index " - "instead of inferring a non-object dtype. To retain the old " - "behavior, do `idx.insert(loc, item).infer_objects(copy=False)`", - FutureWarning, - stacklevel=find_stack_level(), - ) + # GH#51363 stopped doing dtype inference here + out = Index(new_values, dtype=new_values.dtype, name=self.name) return out def drop( @@ -7038,6 +6974,8 @@ def drop( Parameters ---------- labels : array-like or scalar + Array-like object or a scalar value, representing the labels to be removed + from the Index. errors : {'ignore', 'raise'}, default 'raise' If 'ignore', suppress error and existing labels are dropped. @@ -7051,10 +6989,15 @@ def drop( KeyError If not all of the labels are found in the selected axis + See Also + -------- + Index.dropna : Return Index without NA/NaN values. + Index.drop_duplicates : Return Index with duplicate values removed. + Examples -------- - >>> idx = pd.Index(['a', 'b', 'c']) - >>> idx.drop(['a']) + >>> idx = pd.Index(["a", "b", "c"]) + >>> idx.drop(["a"]) Index(['b', 'c'], dtype='object') """ if not isinstance(labels, Index): @@ -7131,7 +7074,6 @@ def diff(self, periods: int = 1) -> Index: """ return Index(self.to_series().diff(periods)) - @final def round(self, decimals: int = 0) -> Self: """ Round each value in the Index to the given number of decimals. 
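The GH#51363 hunk above completes a deprecation: Index.insert no longer infers a tighter dtype for object-dtype indexes. A small sketch of the resulting behavior (my example, not from the patch):

    import pandas as pd

    obj = pd.Index(["a", "b"], dtype=object)
    # Object dtype is preserved; previously a FutureWarning announced inference:
    print(obj.insert(1, 1).dtype)  # object
    # Non-object indexes still fall back to a common dtype when needed:
    print(pd.Index([1, 2]).insert(1, "x").dtype)  # object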
@@ -7212,10 +7154,10 @@ def _logical_method(self, other, op): rvalues = extract_array(other, extract_numpy=True, extract_range=True) res_values = ops.logical_op(lvalues, rvalues, op) - return self._construct_result(res_values, name=res_name) + return self._construct_result(res_values, name=res_name, other=other) @final - def _construct_result(self, result, name): + def _construct_result(self, result, name, other): if isinstance(result, tuple): return ( Index(result[0], name=name, dtype=result[0].dtype), @@ -7355,54 +7297,39 @@ def _maybe_disable_logical_methods(self, opname: str_t) -> None: """ raise if this Index subclass does not support any or all. """ - if ( - isinstance(self, ABCMultiIndex) - # TODO(3.0): PeriodArray and DatetimeArray any/all will raise, - # so checking needs_i8_conversion will be unnecessary - or (needs_i8_conversion(self.dtype) and self.dtype.kind != "m") - ): - # This call will raise - make_invalid_op(opname)(self) + if isinstance(self, ABCMultiIndex): + raise TypeError(f"cannot perform {opname} with {type(self).__name__}") @Appender(IndexOpsMixin.argmin.__doc__) - def argmin(self, axis=None, skipna: bool = True, *args, **kwargs) -> int: + def argmin( + self, axis: AxisInt | None = None, skipna: bool = True, *args, **kwargs + ) -> int: nv.validate_argmin(args, kwargs) nv.validate_minmax_axis(axis) if not self._is_multi and self.hasnans: - # Take advantage of cache - mask = self._isnan - if not skipna or mask.all(): - warnings.warn( - f"The behavior of {type(self).__name__}.argmax/argmin " - "with skipna=False and NAs, or with all-NAs is deprecated. " - "In a future version this will raise ValueError.", - FutureWarning, - stacklevel=find_stack_level(), - ) - return -1 + if not skipna: + raise ValueError("Encountered an NA value with skipna=False") + elif self._isnan.all(): + raise ValueError("Encountered all NA values") + return super().argmin(skipna=skipna) @Appender(IndexOpsMixin.argmax.__doc__) - def argmax(self, axis=None, skipna: bool = True, *args, **kwargs) -> int: + def argmax( + self, axis: AxisInt | None = None, skipna: bool = True, *args, **kwargs + ) -> int: nv.validate_argmax(args, kwargs) nv.validate_minmax_axis(axis) if not self._is_multi and self.hasnans: - # Take advantage of cache - mask = self._isnan - if not skipna or mask.all(): - warnings.warn( - f"The behavior of {type(self).__name__}.argmax/argmin " - "with skipna=False and NAs, or with all-NAs is deprecated. " - "In a future version this will raise ValueError.", - FutureWarning, - stacklevel=find_stack_level(), - ) - return -1 + if not skipna: + raise ValueError("Encountered an NA value with skipna=False") + elif self._isnan.all(): + raise ValueError("Encountered all NA values") return super().argmax(skipna=skipna) - def min(self, axis=None, skipna: bool = True, *args, **kwargs): + def min(self, axis: AxisInt | None = None, skipna: bool = True, *args, **kwargs): """ Return the minimum value of the Index. @@ -7432,13 +7359,13 @@ def min(self, axis=None, skipna: bool = True, *args, **kwargs): >>> idx.min() 1 - >>> idx = pd.Index(['c', 'b', 'a']) + >>> idx = pd.Index(["c", "b", "a"]) >>> idx.min() 'a' For a MultiIndex, the minimum is determined lexicographically. 
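The argmin/argmax changes above likewise complete a deprecation: all-NA input, or NAs combined with skipna=False, now raise ValueError instead of returning -1. A quick sketch:

    import numpy as np
    import pandas as pd

    idx = pd.Index([np.nan, 2.0, 1.0])
    print(idx.argmin())  # 2, the NaN is skipped by default
    try:
        idx.argmin(skipna=False)
    except ValueError as err:
        print(err)  # Encountered an NA value with skipna=False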
-        >>> idx = pd.MultiIndex.from_product([('a', 'b'), (2, 1)])
+        >>> idx = pd.MultiIndex.from_product([("a", "b"), (2, 1)])
         >>> idx.min()
         ('a', 1)
         """
@@ -7465,7 +7392,7 @@ def min(self, axis=None, skipna: bool = True, *args, **kwargs):

         return nanops.nanmin(self._values, skipna=skipna)

-    def max(self, axis=None, skipna: bool = True, *args, **kwargs):
+    def max(self, axis: AxisInt | None = None, skipna: bool = True, *args, **kwargs):
         """
         Return the maximum value of the Index.

@@ -7495,13 +7422,13 @@ def max(self, axis=None, skipna: bool = True, *args, **kwargs):
         >>> idx.max()
         3

-        >>> idx = pd.Index(['c', 'b', 'a'])
+        >>> idx = pd.Index(["c", "b", "a"])
         >>> idx.max()
         'c'

         For a MultiIndex, the maximum is determined lexicographically.

-        >>> idx = pd.MultiIndex.from_product([('a', 'b'), (2, 1)])
+        >>> idx = pd.MultiIndex.from_product([("a", "b"), (2, 1)])
         >>> idx.max()
         ('b', 2)
         """
@@ -7537,6 +7464,13 @@ def shape(self) -> Shape:
         """
         Return a tuple of the shape of the underlying data.

+        See Also
+        --------
+        Index.size: Return the number of elements in the underlying data.
+        Index.ndim: Number of dimensions of the underlying data, by definition 1.
+        Index.dtype: Return the dtype object of the underlying data.
+        Index.values: Return an array representing the data in the Index.
+
         Examples
         --------
         >>> idx = pd.Index([1, 2, 3])
@@ -7549,6 +7483,44 @@ def shape(self) -> Shape:
         return (len(self),)


+def maybe_sequence_to_range(sequence) -> Any | range:
+    """
+    Convert a 1D, non-pandas sequence to a range if possible.
+
+    Returns the input if not possible.
+
+    Parameters
+    ----------
+    sequence : 1D sequence
+
+    Returns
+    -------
+    Any : input or range
+    """
+    if isinstance(sequence, (range, ExtensionArray)):
+        return sequence
+    elif len(sequence) == 1 or lib.infer_dtype(sequence, skipna=False) != "integer":
+        return sequence
+    elif isinstance(sequence, (ABCSeries, Index)) and not (
+        isinstance(sequence.dtype, np.dtype) and sequence.dtype.kind == "i"
+    ):
+        return sequence
+    if len(sequence) == 0:
+        return range(0)
+    try:
+        np_sequence = np.asarray(sequence, dtype=np.int64)
+    except OverflowError:
+        return sequence
+    diff = np_sequence[1] - np_sequence[0]
+    if diff == 0:
+        return sequence
+    elif len(sequence) == 2 or lib.is_sequence_range(np_sequence, diff):
+        return range(np_sequence[0], np_sequence[-1] + diff, diff)
+    else:
+        return sequence
+
+
 def ensure_index_from_sequences(sequences, names=None) -> Index:
     """
     Construct an index from sequences of data.
@@ -7567,8 +7539,8 @@ def ensure_index_from_sequences(sequences, names=None) -> Index:

     Examples
     --------
-    >>> ensure_index_from_sequences([[1, 2, 3]], names=["name"])
-    Index([1, 2, 3], dtype='int64', name='name')
+    >>> ensure_index_from_sequences([[1, 2, 4]], names=["name"])
+    Index([1, 2, 4], dtype='int64', name='name')

     >>> ensure_index_from_sequences([["a", "a"], ["a", "b"]], names=["L1", "L2"])
     MultiIndex([('a', 'a'),
@@ -7579,13 +7551,17 @@ def ensure_index_from_sequences(sequences, names=None) -> Index:
     --------
     ensure_index
     """
+    from pandas.core.indexes.api import default_index
     from pandas.core.indexes.multi import MultiIndex

-    if len(sequences) == 1:
+    if len(sequences) == 0:
+        return default_index(0)
+    elif len(sequences) == 1:
         if names is not None:
             names = names[0]
-        return Index(sequences[0], name=names)
+        return Index(maybe_sequence_to_range(sequences[0]), name=names)
     else:
+        # TODO: Apply maybe_sequence_to_range to sequences?
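maybe_sequence_to_range above is a private helper, but its effect is easy to probe directly; a doctest-style sketch (import path as introduced by this patch, outputs checked against the logic shown):

    >>> from pandas.core.indexes.base import maybe_sequence_to_range
    >>> maybe_sequence_to_range([1, 3, 5, 7])  # constant step -> compact range
    range(1, 9, 2)
    >>> maybe_sequence_to_range([1, 3, 6])  # uneven step -> input returned as-is
    [1, 3, 6]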
return MultiIndex.from_arrays(sequences, names=names) @@ -7609,13 +7585,13 @@ def ensure_index(index_like: Axes, copy: bool = False) -> Index: Examples -------- - >>> ensure_index(['a', 'b']) + >>> ensure_index(["a", "b"]) Index(['a', 'b'], dtype='object') - >>> ensure_index([('a', 'a'), ('b', 'c')]) + >>> ensure_index([("a", "a"), ("b", "c")]) Index([('a', 'a'), ('b', 'c')], dtype='object') - >>> ensure_index([['a', 'a'], ['b', 'c']]) + >>> ensure_index([["a", "a"], ["b", "c"]]) MultiIndex([('a', 'b'), ('a', 'c')], ) @@ -7633,7 +7609,7 @@ def ensure_index(index_like: Axes, copy: bool = False) -> Index: index_like = list(index_like) if isinstance(index_like, list): - if type(index_like) is not list: # noqa: E721 + if type(index_like) is not list: # must check for exactly list here because of strict type # check in clean_index_list index_like = list(index_like) @@ -7648,21 +7624,9 @@ def ensure_index(index_like: Axes, copy: bool = False) -> Index: return Index(index_like, copy=copy) -def ensure_has_len(seq): - """ - If seq is an iterator, put its values into a list. - """ - try: - len(seq) - except TypeError: - return list(seq) - else: - return seq - - def trim_front(strings: list[str]) -> list[str]: """ - Trims zeros and decimal points. + Trims leading spaces evenly among all strings. Examples -------- @@ -7674,8 +7638,9 @@ def trim_front(strings: list[str]) -> list[str]: """ if not strings: return strings - while all(strings) and all(x[0] == " " for x in strings): - strings = [x[1:] for x in strings] + smallest_leading_space = min(len(x) - len(x.lstrip()) for x in strings) + if smallest_leading_space > 0: + strings = [x[smallest_leading_space:] for x in strings] return strings @@ -7713,8 +7678,8 @@ def get_unanimous_names(*indexes: Index) -> tuple[Hashable, ...]: list A list representing the unanimous 'names' found. """ - name_tups = [tuple(i.names) for i in indexes] - name_sets = [{*ns} for ns in zip_longest(*name_tups)] + name_tups = (tuple(i.names) for i in indexes) + name_sets = ({*ns} for ns in zip_longest(*name_tups)) names = tuple(ns.pop() if len(ns) == 1 else None for ns in name_sets) return names diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index b307be004ad6e..d20a84449fb85 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -13,6 +13,7 @@ from pandas.util._decorators import ( cache_readonly, doc, + set_module, ) from pandas.core.dtypes.common import is_scalar @@ -76,6 +77,7 @@ Categorical, wrap=True, ) +@set_module("pandas") class CategoricalIndex(NDArrayBackedExtensionIndex): """ Index based on an underlying :class:`Categorical`. @@ -276,22 +278,34 @@ def equals(self, other: object) -> bool: """ Determine if two CategoricalIndex objects contain the same elements. + The order and orderedness of elements matters. The categories matter, + but the order of the categories matters only when ``ordered=True``. + + Parameters + ---------- + other : object + The CategoricalIndex object to compare with. + Returns ------- bool ``True`` if two :class:`pandas.CategoricalIndex` objects have equal elements, ``False`` otherwise. + See Also + -------- + Categorical.equals : Returns True if categorical arrays are equal. 
+
         Examples
         --------
-        >>> ci = pd.CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'])
-        >>> ci2 = pd.CategoricalIndex(pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c']))
+        >>> ci = pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"])
+        >>> ci2 = pd.CategoricalIndex(pd.Categorical(["a", "b", "c", "a", "b", "c"]))
         >>> ci.equals(ci2)
         True

         The order of elements matters.

-        >>> ci3 = pd.CategoricalIndex(['c', 'b', 'a', 'a', 'b', 'c'])
+        >>> ci3 = pd.CategoricalIndex(["c", "b", "a", "a", "b", "c"])
         >>> ci.equals(ci3)
         False

@@ -304,16 +318,17 @@ def equals(self, other: object) -> bool:
         The categories matter, but the order of the categories matters only
         when ``ordered=True``.

-        >>> ci5 = ci.set_categories(['a', 'b', 'c', 'd'])
+        >>> ci5 = ci.set_categories(["a", "b", "c", "d"])
         >>> ci.equals(ci5)
         False

-        >>> ci6 = ci.set_categories(['b', 'c', 'a'])
+        >>> ci6 = ci.set_categories(["b", "c", "a"])
         >>> ci.equals(ci6)
         True
-        >>> ci_ordered = pd.CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'],
-        ...                                  ordered=True)
-        >>> ci2_ordered = ci_ordered.set_categories(['b', 'c', 'a'])
+        >>> ci_ordered = pd.CategoricalIndex(
+        ...     ["a", "b", "c", "a", "b", "c"], ordered=True
+        ... )
+        >>> ci2_ordered = ci_ordered.set_categories(["b", "c", "a"])
         >>> ci_ordered.equals(ci2_ordered)
         False
         """
@@ -364,8 +379,13 @@ def __contains__(self, key: Any) -> bool:
         # if key is a NaN, check if any NaN is in self.
         if is_valid_na_for_dtype(key, self.categories.dtype):
             return self.hasnans
-
-        return contains(self, key, container=self._engine)
+        if self.categories._typ == "rangeindex":
+            container: Index | libindex.IndexEngine | libindex.ExtensionEngine = (
+                self.categories
+            )
+        else:
+            container = self._engine
+        return contains(self, key, container=container)

     def reindex(
         self, target, method=None, level=None, limit: int | None = None, tolerance=None
@@ -445,6 +465,9 @@ def map(self, mapper, na_action: Literal["ignore"] | None = None):
         ----------
         mapper : function, dict, or Series
             Mapping correspondence.
+        na_action : {None, 'ignore'}, default None
+            If 'ignore', propagate NaN values, without passing them to
+            the mapping correspondence.
         Returns
         -------
@@ -462,37 +485,37 @@ def map(self, mapper, na_action: Literal["ignore"] | None = None):

         Examples
         --------
-        >>> idx = pd.CategoricalIndex(['a', 'b', 'c'])
+        >>> idx = pd.CategoricalIndex(["a", "b", "c"])
         >>> idx
         CategoricalIndex(['a', 'b', 'c'], categories=['a', 'b', 'c'],
                           ordered=False, dtype='category')
         >>> idx.map(lambda x: x.upper())
         CategoricalIndex(['A', 'B', 'C'], categories=['A', 'B', 'C'],
                          ordered=False, dtype='category')
-        >>> idx.map({'a': 'first', 'b': 'second', 'c': 'third'})
+        >>> idx.map({"a": "first", "b": "second", "c": "third"})
         CategoricalIndex(['first', 'second', 'third'], categories=['first',
                          'second', 'third'], ordered=False, dtype='category')

         If the mapping is one-to-one the ordering of the categories is
         preserved:

-        >>> idx = pd.CategoricalIndex(['a', 'b', 'c'], ordered=True)
+        >>> idx = pd.CategoricalIndex(["a", "b", "c"], ordered=True)
         >>> idx
         CategoricalIndex(['a', 'b', 'c'], categories=['a', 'b', 'c'],
                          ordered=True, dtype='category')
-        >>> idx.map({'a': 3, 'b': 2, 'c': 1})
+        >>> idx.map({"a": 3, "b": 2, "c": 1})
         CategoricalIndex([3, 2, 1], categories=[3, 2, 1], ordered=True,
                          dtype='category')

         If the mapping is not one-to-one an :class:`~pandas.Index` is returned:

-        >>> idx.map({'a': 'first', 'b': 'second', 'c': 'first'})
+        >>> idx.map({"a": "first", "b": "second", "c": "first"})
         Index(['first', 'second', 'first'], dtype='object')

         If a `dict` is used, all unmapped categories are mapped to `NaN` and
         the result is an :class:`~pandas.Index`:

-        >>> idx.map({'a': 'first', 'b': 'second'})
+        >>> idx.map({"a": "first", "b": "second"})
         Index(['first', 'second', nan], dtype='object')
         """
         mapped = self._values.map(mapper, na_action=na_action)
diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py
index cad8737a987d4..8b316de30662c 100644
--- a/pandas/core/indexes/datetimelike.py
+++ b/pandas/core/indexes/datetimelike.py
@@ -1,6 +1,7 @@
 """
 Base and utility classes for tseries type pandas objects.
 """
+
 from __future__ import annotations

 from abc import (
@@ -10,16 +11,12 @@
 from typing import (
     TYPE_CHECKING,
     Any,
-    Callable,
     cast,
     final,
 )
-import warnings

 import numpy as np

-from pandas._config import using_copy_on_write
-
 from pandas._libs import (
     NaT,
     Timedelta,
@@ -32,7 +29,6 @@
     parsing,
     to_offset,
 )
-from pandas._libs.tslibs.dtypes import freq_to_period_freqstr
 from pandas.compat.numpy import function as nv
 from pandas.errors import (
     InvalidIndexError,
@@ -43,14 +39,16 @@
     cache_readonly,
     doc,
 )
-from pandas.util._exceptions import find_stack_level

 from pandas.core.dtypes.common import (
     is_integer,
     is_list_like,
 )
 from pandas.core.dtypes.concat import concat_compat
-from pandas.core.dtypes.dtypes import CategoricalDtype
+from pandas.core.dtypes.dtypes import (
+    CategoricalDtype,
+    PeriodDtype,
+)

 from pandas.core.arrays import (
     DatetimeArray,
@@ -75,6 +73,7 @@

     from pandas._typing import (
         Axis,
+        JoinHow,
         Self,
         npt,
     )
@@ -98,6 +97,32 @@ def mean(self, *, skipna: bool = True, axis: int | None = 0):

     @property
     def freq(self) -> BaseOffset | None:
+        """
+        Return the frequency object if it is set, otherwise None.
+
+        To learn more about the frequency strings, please see
+        :ref:`this link<timeseries.offset_aliases>`.
+
+        See Also
+        --------
+        DatetimeIndex.freq : Return the frequency object if it is set, otherwise None.
+        PeriodIndex.freq : Return the frequency object if it is set, otherwise None.
+
+        Examples
+        --------
+        >>> datetimeindex = pd.date_range(
+        ...     "2022-02-22 02:22:22", periods=10, tz="America/Chicago", freq="h"
+        ... )
+        >>> datetimeindex
+        DatetimeIndex(['2022-02-22 02:22:22-06:00', '2022-02-22 03:22:22-06:00',
+                       '2022-02-22 04:22:22-06:00', '2022-02-22 05:22:22-06:00',
+                       '2022-02-22 06:22:22-06:00', '2022-02-22 07:22:22-06:00',
+                       '2022-02-22 08:22:22-06:00', '2022-02-22 09:22:22-06:00',
+                       '2022-02-22 10:22:22-06:00', '2022-02-22 11:22:22-06:00'],
+                      dtype='datetime64[ns, America/Chicago]', freq='h')
+        >>> datetimeindex.freq
+        <Hour>
+        """
         return self._data.freq

     @freq.setter
@@ -117,15 +142,14 @@ def freqstr(self) -> str:
         if self._data.freqstr is not None and isinstance(
             self._data, (PeriodArray, PeriodIndex)
         ):
-            freq = freq_to_period_freqstr(self._data.freq.n, self._data.freq.name)
+            freq = PeriodDtype(self._data.freq)._freqstr
             return freq
         else:
             return self._data.freqstr  # type: ignore[return-value]

     @cache_readonly
     @abstractmethod
-    def _resolution_obj(self) -> Resolution:
-        ...
+    def _resolution_obj(self) -> Resolution: ...

     @cache_readonly
     @doc(DatetimeLikeArrayMixin.resolution)
@@ -191,39 +215,6 @@ def _convert_tolerance(self, tolerance, target):
     # Rendering Methods
     _default_na_rep = "NaT"

-    def format(
-        self,
-        name: bool = False,
-        formatter: Callable | None = None,
-        na_rep: str = "NaT",
-        date_format: str | None = None,
-    ) -> list[str]:
-        """
-        Render a string representation of the Index.
-        """
-        warnings.warn(
-            # GH#55413
-            f"{type(self).__name__}.format is deprecated and will be removed "
-            "in a future version. Convert using index.astype(str) or "
-            "index.map(formatter) instead.",
-            FutureWarning,
-            stacklevel=find_stack_level(),
-        )
-        header = []
-        if name:
-            header.append(
-                ibase.pprint_thing(self.name, escape_chars=("\t", "\r", "\n"))
-                if self.name is not None
-                else ""
-            )
-
-        if formatter is not None:
-            return header + list(self.map(formatter))
-
-        return self._format_with_header(
-            header=header, na_rep=na_rep, date_format=date_format
-        )
-
     def _format_with_header(
         self, *, header: list[str], na_rep: str, date_format: str | None = None
     ) -> list[str]:
@@ -272,7 +263,7 @@ def _can_partial_date_slice(self, reso: Resolution) -> bool:
     def _parsed_string_to_bounds(self, reso: Resolution, parsed):
         raise NotImplementedError

-    def _parse_with_reso(self, label: str):
+    def _parse_with_reso(self, label: str) -> tuple[datetime, Resolution]:
         # overridden by TimedeltaIndex
         try:
             if self.freq is None or hasattr(self.freq, "rule_code"):
@@ -294,7 +285,7 @@ def _parse_with_reso(self, label: str):
             reso = Resolution.from_attrname(reso_str)
         return parsed, reso

-    def _get_string_slice(self, key: str):
+    def _get_string_slice(self, key: str) -> slice | npt.NDArray[np.intp]:
         # overridden by TimedeltaIndex
         parsed, reso = self._parse_with_reso(key)
         try:
@@ -450,6 +441,10 @@ def as_unit(self, unit: str) -> Self:
         """
         Convert to a dtype with the given unit resolution.

+        This method is for converting the dtype of a ``DatetimeIndex`` or
+        ``TimedeltaIndex`` to a new dtype with the given unit
+        resolution/precision.
+
         Parameters
         ----------
         unit : {'s', 'ms', 'us', 'ns'}
@@ -457,25 +452,33 @@ def as_unit(self, unit: str) -> Self:
         Returns
         -------
         same type as self
+            Converted to the specified unit.
+
+        See Also
+        --------
+        Timestamp.as_unit : Convert to the given unit.
+        Timedelta.as_unit : Convert to the given unit.
+        DatetimeIndex.as_unit : Convert to the given unit.
+        TimedeltaIndex.as_unit : Convert to the given unit.
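With freqstr now routed through PeriodDtype for period data (see the hunk above), frequency multiples keep their prefix. A doctest-style sketch with illustrative values:

    >>> import pandas as pd
    >>> pd.period_range("2024-01", periods=3, freq="2M").freqstr
    '2M'
    >>> pd.date_range("2024-01-01", periods=3, freq="h").freq
    <Hour>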
Examples -------- For :class:`pandas.DatetimeIndex`: - >>> idx = pd.DatetimeIndex(['2020-01-02 01:02:03.004005006']) + >>> idx = pd.DatetimeIndex(["2020-01-02 01:02:03.004005006"]) >>> idx DatetimeIndex(['2020-01-02 01:02:03.004005006'], dtype='datetime64[ns]', freq=None) - >>> idx.as_unit('s') + >>> idx.as_unit("s") DatetimeIndex(['2020-01-02 01:02:03'], dtype='datetime64[s]', freq=None) For :class:`pandas.TimedeltaIndex`: - >>> tdelta_idx = pd.to_timedelta(['1 day 3 min 2 us 42 ns']) + >>> tdelta_idx = pd.to_timedelta(["1 day 3 min 2 us 42 ns"]) >>> tdelta_idx TimedeltaIndex(['1 days 00:03:00.000002042'], dtype='timedelta64[ns]', freq=None) - >>> tdelta_idx.as_unit('s') + >>> tdelta_idx.as_unit("s") TimedeltaIndex(['1 days 00:03:00'], dtype='timedelta64[s]', freq=None) """ arr = self._data.as_unit(unit) @@ -489,9 +492,8 @@ def _with_freq(self, freq): def values(self) -> np.ndarray: # NB: For Datetime64TZ this is lossy data = self._data._ndarray - if using_copy_on_write(): - data = data.view() - data.flags.writeable = False + data = data.view() + data.flags.writeable = False return data @doc(DatetimeIndexOpsMixin.shift) @@ -533,7 +535,7 @@ def _as_range_index(self) -> RangeIndex: # Convert our i8 representations to RangeIndex # Caller is responsible for checking isinstance(self.freq, Tick) freq = cast(Tick, self.freq) - tick = Timedelta(freq).as_unit("ns")._value + tick = Timedelta(freq).as_unit(self.unit)._value rng = range(self[0]._value, self[-1]._value + tick, tick) return RangeIndex(rng) @@ -546,7 +548,9 @@ def _wrap_range_setop(self, other, res_i8) -> Self: # RangeIndex defaults to step=1, which we don't want. new_freq = self.freq elif isinstance(res_i8, RangeIndex): - new_freq = to_offset(Timedelta(res_i8.step)) + new_freq = to_offset( + Timedelta(res_i8.step, unit=self.unit).as_unit(self.unit) + ) # TODO(GH#41493): we cannot just do # type(self._data)(res_i8.values, dtype=self.dtype, freq=new_freq) @@ -734,13 +738,20 @@ def _get_join_freq(self, other): freq = self.freq return freq - def _wrap_joined_index( - self, joined, other, lidx: npt.NDArray[np.intp], ridx: npt.NDArray[np.intp] - ): + def _wrap_join_result( + self, + joined, + other, + lidx: npt.NDArray[np.intp] | None, + ridx: npt.NDArray[np.intp] | None, + how: JoinHow, + ) -> tuple[Self, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: assert other.dtype == self.dtype, (other.dtype, self.dtype) - result = super()._wrap_joined_index(joined, other, lidx, ridx) - result._data._freq = self._get_join_freq(other) - return result + join_index, lidx, ridx = super()._wrap_join_result( + joined, other, lidx, ridx, how + ) + join_index._data._freq = self._get_join_freq(other) + return join_index, lidx, ridx def _get_engine_target(self) -> np.ndarray: # engine methods and libjoin methods need dt64/td64 values cast to i8 diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index c978abd8c2427..9adbaadbdcdc8 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -6,7 +6,6 @@ import warnings import numpy as np -import pytz from pandas._libs import ( NaT, @@ -27,8 +26,8 @@ from pandas.util._decorators import ( cache_readonly, doc, + set_module, ) -from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import is_scalar from pandas.core.dtypes.dtypes import DatetimeTZDtype @@ -128,6 +127,7 @@ def _new_DatetimeIndex(cls, d): + DatetimeArray._bool_ops, DatetimeArray, ) +@set_module("pandas") class DatetimeIndex(DatetimeTimedeltaMixin): """ 
     Immutable ndarray-like of datetime64 data.
@@ -148,19 +148,8 @@ class DatetimeIndex(DatetimeTimedeltaMixin):
         One of pandas date offset strings or corresponding objects. The string
         'infer' can be passed in order to set the frequency of the index as the
         inferred frequency upon creation.
-    tz : pytz.timezone or dateutil.tz.tzfile or datetime.tzinfo or str
+    tz : zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile, datetime.tzinfo or str
         Set the Timezone of the data.
-    normalize : bool, default False
-        Normalize start/end dates to midnight before generating date range.
-
-        .. deprecated:: 2.1.0
-
-    closed : {'left', 'right'}, optional
-        Set whether to include `start` and `end` that are on the
-        boundary. The default includes boundary points on either end.
-
-        .. deprecated:: 2.1.0
-
     ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise'
         When clocks moved backward due to DST, ambiguous times may arise.
         For example in Central European Time (UTC+01), when going from 03:00
@@ -174,7 +163,7 @@ class DatetimeIndex(DatetimeTimedeltaMixin):
           non-DST time (note that this flag is only applicable for ambiguous
           times)
         - 'NaT' will return NaT where there are ambiguous times
-        - 'raise' will raise an AmbiguousTimeError if there are ambiguous times.
+        - 'raise' will raise a ValueError if there are ambiguous times.
     dayfirst : bool, default False
         If True, parse dates in `data` with the day first order.
     yearfirst : bool, default False
@@ -246,15 +235,15 @@ class DatetimeIndex(DatetimeTimedeltaMixin):

     Notes
     -----
-    To learn more about the frequency strings, please see `this link
-    <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`__.
+    To learn more about the frequency strings, please see
+    :ref:`this link<timeseries.offset_aliases>`.

     Examples
     --------
     >>> idx = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"])
     >>> idx
     DatetimeIndex(['2020-01-01 10:00:00+00:00', '2020-02-01 11:00:00+00:00'],
-    dtype='datetime64[ns, UTC]', freq=None)
+    dtype='datetime64[s, UTC]', freq=None)
     """

     _typ = "datetimeindex"
@@ -276,7 +265,7 @@ def _engine_type(self) -> type[libindex.DatetimeEngine]:
     @doc(DatetimeArray.strftime)
     def strftime(self, date_format) -> Index:
         arr = self._data.strftime(date_format)
-        return Index(arr, name=self.name, dtype=object)
+        return Index(arr, name=self.name, dtype=arr.dtype)

     @doc(DatetimeArray.tz_convert)
     def tz_convert(self, tz) -> Self:
@@ -322,8 +311,6 @@ def __new__(
         data=None,
         freq: Frequency | lib.NoDefault = lib.no_default,
         tz=lib.no_default,
-        normalize: bool | lib.NoDefault = lib.no_default,
-        closed=lib.no_default,
         ambiguous: TimeAmbiguous = "raise",
         dayfirst: bool = False,
         yearfirst: bool = False,
@@ -331,23 +318,6 @@ def __new__(
         copy: bool = False,
         name: Hashable | None = None,
     ) -> Self:
-        if closed is not lib.no_default:
-            # GH#52628
-            warnings.warn(
-                f"The 'closed' keyword in {cls.__name__} construction is "
-                "deprecated and will be removed in a future version.",
-                FutureWarning,
-                stacklevel=find_stack_level(),
-            )
-        if normalize is not lib.no_default:
-            # GH#52628
-            warnings.warn(
-                f"The 'normalize' keyword in {cls.__name__} construction is "
-                "deprecated and will be removed in a future version.",
-                FutureWarning,
-                stacklevel=find_stack_level(),
-            )
-
         if is_scalar(data):
             cls._raise_scalar_data_error(data)

@@ -482,18 +452,35 @@ def snap(self, freq: Frequency = "S") -> DatetimeIndex:
         """
         Snap time stamps to nearest occurring frequency.

+        Parameters
+        ----------
+        freq : str, Timedelta, datetime.timedelta, or DateOffset, default 'S'
+            Frequency strings can have multiples, e.g. '5h'. See
+            :ref:`here <timeseries.offset_aliases>` for a list of
+            frequency aliases.
+
         Returns
         -------
         DatetimeIndex
+            Time stamps to nearest occurring `freq`.
+
+        See Also
+        --------
+        DatetimeIndex.round : Perform round operation on the data to the
+            specified `freq`.
+        DatetimeIndex.floor : Perform floor operation on the data to the
+            specified `freq`.

         Examples
         --------
-        >>> idx = pd.DatetimeIndex(['2023-01-01', '2023-01-02',
-        ...                        '2023-02-01', '2023-02-02'])
+        >>> idx = pd.DatetimeIndex(
+        ...     ["2023-01-01", "2023-01-02", "2023-02-01", "2023-02-02"],
+        ...     dtype="M8[ns]",
+        ... )
         >>> idx
         DatetimeIndex(['2023-01-01', '2023-01-02', '2023-02-01', '2023-02-02'],
         dtype='datetime64[ns]', freq=None)
-        >>> idx.snap('MS')
+        >>> idx.snap("MS")
         DatetimeIndex(['2023-01-01', '2023-01-01', '2023-02-01', '2023-02-01'],
         dtype='datetime64[ns]', freq=None)
         """
@@ -518,7 +505,9 @@ def snap(self, freq: Frequency = "S") -> DatetimeIndex:
     # --------------------------------------------------------------------
     # Indexing Methods

-    def _parsed_string_to_bounds(self, reso: Resolution, parsed: dt.datetime):
+    def _parsed_string_to_bounds(
+        self, reso: Resolution, parsed: dt.datetime
+    ) -> tuple[Timestamp, Timestamp]:
         """
         Calculate datetime bounds for parsed time string and its resolution.

@@ -536,6 +525,8 @@ def _parsed_string_to_bounds(self, reso: Resolution, parsed: dt.datetime):
         freq = OFFSET_TO_PERIOD_FREQSTR.get(reso.attr_abbrev, reso.attr_abbrev)
         per = Period(parsed, freq=freq)
         start, end = per.start_time, per.end_time
+        start = start.as_unit(self.unit)
+        end = end.as_unit(self.unit)

         # GH 24076
         # If an incoming date string contained a UTC offset, need to localize
@@ -555,7 +546,7 @@ def _parsed_string_to_bounds(self, reso: Resolution, parsed: dt.datetime):
         # which localizes parsed.
         return start, end

-    def _parse_with_reso(self, label: str):
+    def _parse_with_reso(self, label: str) -> tuple[Timestamp, Resolution]:
         parsed, reso = super()._parse_with_reso(label)

         parsed = Timestamp(parsed)
@@ -601,7 +592,7 @@ def get_loc(self, key):
         elif isinstance(key, str):
             try:
                 parsed, reso = self._parse_with_reso(key)
-            except (ValueError, pytz.NonExistentTimeError) as err:
+            except ValueError as err:
                 raise KeyError(key) from err
             self._disallow_mismatched_indexing(parsed)

@@ -722,10 +713,13 @@ def indexer_at_time(self, time, asof: bool = False) -> npt.NDArray[np.intp]:
             Time passed in either as object (datetime.time) or as string in
             appropriate format ("%H:%M", "%H%M", "%I:%M%p", "%I%M%p",
             "%H:%M:%S", "%H%M%S", "%I:%M:%S%p", "%I%M%S%p").
+        asof : bool, default False
+            This parameter is currently not supported.

         Returns
         -------
         np.ndarray[np.intp]
+            Index locations of values at given `time` of day.

         See Also
         --------
@@ -735,8 +729,9 @@ def indexer_at_time(self, time, asof: bool = False) -> npt.NDArray[np.intp]:

         Examples
         --------
-        >>> idx = pd.DatetimeIndex(["1/1/2020 10:00", "2/1/2020 11:00",
-        ...                         "3/1/2020 10:00"])
+        >>> idx = pd.DatetimeIndex(
+        ...     ["1/1/2020 10:00", "2/1/2020 11:00", "3/1/2020 10:00"]
+        ... )
         >>> idx.indexer_at_time("10:00")
         array([0, 2])
         """
@@ -770,11 +765,14 @@ def indexer_between_time(
             appropriate format ("%H:%M", "%H%M", "%I:%M%p", "%I%M%p",
             "%H:%M:%S", "%H%M%S", "%I:%M:%S%p","%I%M%S%p").
         include_start : bool, default True
+            Include boundaries; whether to set start bound as closed or open.
         include_end : bool, default True
+            Include boundaries; whether to set end bound as closed or open.

         Returns
         -------
         np.ndarray[np.intp]
+            Index locations of values between particular times of day.
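A short usage sketch for the indexer helpers documented above (my values; both bounds are closed by default via include_start/include_end):

    import datetime as dt
    import pandas as pd

    idx = pd.date_range("2023-01-01 09:00", periods=6, freq="30min")
    # Positions whose time of day falls between 09:30 and 10:30, inclusive:
    print(idx.indexer_between_time(dt.time(9, 30), dt.time(10, 30)))  # [1 2 3]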
         See Also
         --------
@@ -818,6 +816,7 @@ def indexer_between_time(
         return mask.nonzero()[0]


+@set_module("pandas")
 def date_range(
     start=None,
     end=None,
@@ -835,13 +834,15 @@ def date_range(
     Return a fixed frequency DatetimeIndex.

     Returns the range of equally spaced time points (where the difference between any
-    two adjacent points is specified by the given frequency) such that they all
-    satisfy `start <[=] x <[=] end`, where the first one and the last one are, resp.,
-    the first and last time points in that range that fall on the boundary of ``freq``
-    (if given as a frequency string) or that are valid for ``freq`` (if given as a
-    :class:`pandas.tseries.offsets.DateOffset`). (If exactly one of ``start``,
-    ``end``, or ``freq`` is *not* specified, this missing parameter can be computed
-    given ``periods``, the number of timesteps in the range. See the note below.)
+    two adjacent points is specified by the given frequency) such that they fall in the
+    range `[start, end]` , where the first one and the last one are, resp., the first
+    and last time points in that range that fall on the boundary of ``freq`` (if given
+    as a frequency string) or that are valid for ``freq`` (if given as a
+    :class:`pandas.tseries.offsets.DateOffset`). If ``freq`` is positive, the points
+    satisfy `start <[=] x <[=] end`, and if ``freq`` is negative, the points satisfy
+    `end <[=] x <[=] start`. (If exactly one of ``start``, ``end``, or ``freq`` is *not*
+    specified, this missing parameter can be computed given ``periods``, the number of
+    timesteps in the range. See the note below.)

     Parameters
     ----------
@@ -877,6 +878,7 @@ def date_range(
     Returns
     -------
     DatetimeIndex
+        A DatetimeIndex object of the generated dates.

     See Also
     --------
@@ -892,8 +894,8 @@ def date_range(
     ``DatetimeIndex`` will have ``periods`` linearly spaced elements between
     ``start`` and ``end`` (closed on both sides).

-    To learn more about the frequency strings, please see `this link
-    <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`__.
+    To learn more about the frequency strings, please see
+    :ref:`this link<timeseries.offset_aliases>`.

     Examples
     --------
@@ -904,7 +906,7 @@ def date_range(

     Specify `start` and `end`, with the default daily frequency.

-    >>> pd.date_range(start='1/1/2018', end='1/08/2018')
+    >>> pd.date_range(start="1/1/2018", end="1/08/2018")
     DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
                    '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'],
                   dtype='datetime64[ns]', freq='D')
@@ -923,14 +925,14 @@ def date_range(

     Specify `start` and `periods`, the number of periods (days).

-    >>> pd.date_range(start='1/1/2018', periods=8)
+    >>> pd.date_range(start="1/1/2018", periods=8)
     DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
                    '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'],
                   dtype='datetime64[ns]', freq='D')

     Specify `end` and `periods`, the number of periods (days).

-    >>> pd.date_range(end='1/1/2018', periods=8)
+    >>> pd.date_range(end="1/1/2018", periods=8)
     DatetimeIndex(['2017-12-25', '2017-12-26', '2017-12-27', '2017-12-28',
                    '2017-12-29', '2017-12-30', '2017-12-31', '2018-01-01'],
                   dtype='datetime64[ns]', freq='D')

     Specify `start`, `end`, and `periods`; the frequency is generated
     automatically (linearly spaced).
-    >>> pd.date_range(start='2018-04-24', end='2018-04-27', periods=3)
+    >>> pd.date_range(start="2018-04-24", end="2018-04-27", periods=3)
     DatetimeIndex(['2018-04-24 00:00:00', '2018-04-25 12:00:00',
                    '2018-04-27 00:00:00'],
                   dtype='datetime64[ns]', freq=None)

@@ -947,28 +949,28 @@ def date_range(

     Changed the `freq` (frequency) to ``'ME'`` (month end frequency).

-    >>> pd.date_range(start='1/1/2018', periods=5, freq='ME')
+    >>> pd.date_range(start="1/1/2018", periods=5, freq="ME")
     DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31', '2018-04-30',
                    '2018-05-31'],
                   dtype='datetime64[ns]', freq='ME')

     Multiples are allowed

-    >>> pd.date_range(start='1/1/2018', periods=5, freq='3ME')
+    >>> pd.date_range(start="1/1/2018", periods=5, freq="3ME")
     DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31',
                    '2019-01-31'],
                   dtype='datetime64[ns]', freq='3ME')

     `freq` can also be specified as an Offset object.

-    >>> pd.date_range(start='1/1/2018', periods=5, freq=pd.offsets.MonthEnd(3))
+    >>> pd.date_range(start="1/1/2018", periods=5, freq=pd.offsets.MonthEnd(3))
     DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31',
                    '2019-01-31'],
                   dtype='datetime64[ns]', freq='3ME')

     Specify `tz` to set the timezone.

-    >>> pd.date_range(start='1/1/2018', periods=5, tz='Asia/Tokyo')
+    >>> pd.date_range(start="1/1/2018", periods=5, tz="Asia/Tokyo")
     DatetimeIndex(['2018-01-01 00:00:00+09:00', '2018-01-02 00:00:00+09:00',
                    '2018-01-03 00:00:00+09:00', '2018-01-04 00:00:00+09:00',
                    '2018-01-05 00:00:00+09:00'],
@@ -977,20 +979,20 @@ def date_range(
     `inclusive` controls whether to include `start` and `end` that are on the
     boundary. The default, "both", includes boundary points on either end.

-    >>> pd.date_range(start='2017-01-01', end='2017-01-04', inclusive="both")
+    >>> pd.date_range(start="2017-01-01", end="2017-01-04", inclusive="both")
     DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04'],
                   dtype='datetime64[ns]', freq='D')

     Use ``inclusive='left'`` to exclude `end` if it falls on the boundary.

-    >>> pd.date_range(start='2017-01-01', end='2017-01-04', inclusive='left')
+    >>> pd.date_range(start="2017-01-01", end="2017-01-04", inclusive="left")
     DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03'],
                   dtype='datetime64[ns]', freq='D')

     Use ``inclusive='right'`` to exclude `start` if it falls on the boundary, and
     similarly ``inclusive='neither'`` will exclude both `start` and `end`.

-    >>> pd.date_range(start='2017-01-01', end='2017-01-04', inclusive='right')
+    >>> pd.date_range(start="2017-01-01", end="2017-01-04", inclusive="right")
     DatetimeIndex(['2017-01-02', '2017-01-03', '2017-01-04'],
                   dtype='datetime64[ns]', freq='D')

@@ -1019,6 +1021,7 @@ def date_range(
     return DatetimeIndex._simple_new(dtarr, name=name)


+@set_module("pandas")
 def bdate_range(
     start=None,
     end=None,
@@ -1071,6 +1074,13 @@ def bdate_range(
     Returns
     -------
     DatetimeIndex
+        Fixed frequency DatetimeIndex.
+
+    See Also
+    --------
+    date_range : Return a fixed frequency DatetimeIndex.
+    period_range : Return a fixed frequency PeriodIndex.
+    timedelta_range : Return a fixed frequency TimedeltaIndex.

     Notes
     -----
@@ -1079,14 +1089,14 @@ def bdate_range(
     for ``bdate_range``.  Use ``date_range`` if specifying ``freq`` is not
     desired.

-    To learn more about the frequency strings, please see `this link
-    <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`__.
+    To learn more about the frequency strings, please see
+    :ref:`this link<timeseries.offset_aliases>`.

     Examples
     --------
     Note how the two weekend days are skipped in the result.
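The reworded date_range summary above now covers negative frequencies explicitly. A hedged sketch, assuming a pandas version that accepts a descending range as that wording implies:

    import pandas as pd

    # With a negative freq the points satisfy end <[=] x <[=] start:
    rng = pd.date_range(start="2018-01-05", end="2018-01-01", freq="-1D")
    print(list(rng.strftime("%Y-%m-%d")))
    # ['2018-01-05', '2018-01-04', '2018-01-03', '2018-01-02', '2018-01-01']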
- >>> pd.bdate_range(start='1/1/2018', end='1/08/2018') + >>> pd.bdate_range(start="1/1/2018", end="1/08/2018") DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04', '2018-01-05', '2018-01-08'], dtype='datetime64[ns]', freq='B') diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index 61949531f37df..2eeacfb769be4 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -1,11 +1,12 @@ """ Shared methods for Index subclasses backed by ExtensionArray. """ + from __future__ import annotations +from inspect import signature from typing import ( TYPE_CHECKING, - Callable, TypeVar, ) @@ -16,6 +17,8 @@ from pandas.core.indexes.base import Index if TYPE_CHECKING: + from collections.abc import Callable + import numpy as np from pandas._typing import ( @@ -71,7 +74,7 @@ def fget(self): return type(self)._simple_new(result, name=self.name) elif isinstance(result, ABCDataFrame): return result.set_index(self) - return Index(result, name=self.name) + return Index(result, name=self.name, dtype=result.dtype) return result def fset(self, value) -> None: @@ -98,12 +101,13 @@ def method(self, *args, **kwargs): # type: ignore[misc] return type(self)._simple_new(result, name=self.name) elif isinstance(result, ABCDataFrame): return result.set_index(self) - return Index(result, name=self.name) + return Index(result, name=self.name, dtype=result.dtype) return result # error: "property" has no attribute "__name__" method.__name__ = name # type: ignore[attr-defined] method.__doc__ = attr.__doc__ + method.__signature__ = signature(attr) # type: ignore[attr-defined] return method diff --git a/pandas/core/indexes/frozen.py b/pandas/core/indexes/frozen.py index 9d528d34e3684..254bd71ade209 100644 --- a/pandas/core/indexes/frozen.py +++ b/pandas/core/indexes/frozen.py @@ -6,6 +6,7 @@ - .names (FrozenList) """ + from __future__ import annotations from typing import ( @@ -109,10 +110,12 @@ def _disabled(self, *args, **kwargs) -> NoReturn: raise TypeError(f"'{type(self).__name__}' does not support mutable operations.") def __str__(self) -> str: - return pprint_thing(self, quote_strings=True, escape_chars=("\t", "\r", "\n")) + return pprint_thing( + self, quote_strings=True, escape_chars=("\t", "\r", "\n", "'") + ) def __repr__(self) -> str: - return f"{type(self).__name__}({str(self)})" + return f"{type(self).__name__}({self!s})" __setitem__ = __setslice__ = _disabled # type: ignore[assignment] __delitem__ = __delslice__ = _disabled diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 4fcdb87974511..8c40b630e8cfd 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1,4 +1,5 @@ -""" define the IntervalIndex """ +"""define the IntervalIndex""" + from __future__ import annotations from operator import ( @@ -31,6 +32,7 @@ from pandas.util._decorators import ( Appender, cache_readonly, + set_module, ) from pandas.util._exceptions import rewrite_exception @@ -50,6 +52,7 @@ is_number, is_object_dtype, is_scalar, + is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import ( @@ -128,7 +131,7 @@ def _get_next_label(label): elif is_float_dtype(dtype): return np.nextafter(label, np.inf) else: - raise TypeError(f"cannot determine next label for type {repr(type(label))}") + raise TypeError(f"cannot determine next label for type {type(label)!r}") def _get_prev_label(label): @@ -145,7 +148,7 @@ def _get_prev_label(label): elif is_float_dtype(dtype): return np.nextafter(label, -np.inf) 
else: - raise TypeError(f"cannot determine next label for type {repr(type(label))}") + raise TypeError(f"cannot determine next label for type {type(label)!r}") def _new_IntervalIndex(cls, d): @@ -200,6 +203,7 @@ def _new_IntervalIndex(cls, d): IntervalArray, ) @inherit_names(["is_non_overlapping_monotonic", "closed"], IntervalArray, cache=True) +@set_module("pandas") class IntervalIndex(ExtensionIndex): _typ = "intervalindex" @@ -479,7 +483,7 @@ def is_overlapping(self) -> bool: Intervals that share closed endpoints overlap: - >>> index = pd.interval_range(0, 3, closed='both') + >>> index = pd.interval_range(0, 3, closed="both") >>> index IntervalIndex([[0, 1], [1, 2], [2, 3]], dtype='interval[int64, both]') @@ -488,7 +492,7 @@ def is_overlapping(self) -> bool: Intervals that only have an open endpoint in common do not overlap: - >>> index = pd.interval_range(0, 3, closed='left') + >>> index = pd.interval_range(0, 3, closed="left") >>> index IntervalIndex([[0, 1), [1, 2), [2, 3)], dtype='interval[int64, left]') @@ -554,10 +558,7 @@ def _maybe_convert_i8(self, key): left = self._maybe_convert_i8(key.left) right = self._maybe_convert_i8(key.right) constructor = Interval if scalar else IntervalIndex.from_arrays - # error: "object" not callable - return constructor( - left, right, closed=self.closed - ) # type: ignore[operator] + return constructor(left, right, closed=self.closed) if scalar: # Timestamp/Timedelta @@ -622,13 +623,27 @@ def get_loc(self, key) -> int | slice | np.ndarray: """ Get integer location, slice or boolean mask for requested label. + The `get_loc` method is used to retrieve the integer index, a slice for + slicing objects, or a boolean mask indicating the presence of the label + in the `IntervalIndex`. + Parameters ---------- key : label + The value or range to find in the IntervalIndex. Returns ------- int if unique index, slice if monotonic index, else mask + The position or positions found. This could be a single + number, a range, or an array of true/false values + indicating the position(s) of the label. + + See Also + -------- + IntervalIndex.get_indexer_non_unique : Compute indexer and + mask for new index given the current index. + Index.get_loc : Similar method in the base Index class. Examples -------- @@ -699,7 +714,7 @@ def _get_indexer( # left/right get_indexer, compare elementwise, equality -> match indexer = self._get_indexer_unique_sides(target) - elif not is_object_dtype(target.dtype): + elif not (is_object_dtype(target.dtype) or is_string_dtype(target.dtype)): # homogeneous scalar index: use IntervalTree # we should always have self._should_partial_index(target) here target = self._maybe_convert_i8(target) @@ -829,24 +844,155 @@ def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: @cache_readonly def left(self) -> Index: + """ + Return left bounds of the intervals in the IntervalIndex. + + The left bounds of each interval in the IntervalIndex are + returned as an Index. The datatype of the left bounds is the + same as the datatype of the endpoints of the intervals. + + Returns + ------- + Index + An Index containing the left bounds of the intervals. + + See Also + -------- + IntervalIndex.right : Return the right bounds of the intervals + in the IntervalIndex. + IntervalIndex.mid : Return the mid-point of the intervals in + the IntervalIndex. + IntervalIndex.length : Return the length of the intervals in + the IntervalIndex. 
+ + Examples + -------- + >>> iv_idx = pd.IntervalIndex.from_arrays([1, 2, 3], [4, 5, 6], closed="right") + >>> iv_idx.left + Index([1, 2, 3], dtype='int64') + + >>> iv_idx = pd.IntervalIndex.from_tuples( + ... [(1, 4), (2, 5), (3, 6)], closed="left" + ... ) + >>> iv_idx.left + Index([1, 2, 3], dtype='int64') + """ return Index(self._data.left, copy=False) @cache_readonly def right(self) -> Index: + """ + Return right bounds of the intervals in the IntervalIndex. + + The right bounds of each interval in the IntervalIndex are + returned as an Index. The datatype of the right bounds is the + same as the datatype of the endpoints of the intervals. + + Returns + ------- + Index + An Index containing the right bounds of the intervals. + + See Also + -------- + IntervalIndex.left : Return the left bounds of the intervals + in the IntervalIndex. + IntervalIndex.mid : Return the mid-point of the intervals in + the IntervalIndex. + IntervalIndex.length : Return the length of the intervals in + the IntervalIndex. + + Examples + -------- + >>> iv_idx = pd.IntervalIndex.from_arrays([1, 2, 3], [4, 5, 6], closed="right") + >>> iv_idx.right + Index([4, 5, 6], dtype='int64') + + >>> iv_idx = pd.IntervalIndex.from_tuples( + ... [(1, 4), (2, 5), (3, 6)], closed="left" + ... ) + >>> iv_idx.right + Index([4, 5, 6], dtype='int64') + """ return Index(self._data.right, copy=False) @cache_readonly def mid(self) -> Index: + """ + Return the midpoint of each interval in the IntervalIndex as an Index. + + Each midpoint is calculated as the average of the left and right bounds + of each interval. The midpoints are returned as a pandas Index object. + + Returns + ------- + pandas.Index + An Index containing the midpoints of each interval. + + See Also + -------- + IntervalIndex.left : Return the left bounds of the intervals + in the IntervalIndex. + IntervalIndex.right : Return the right bounds of the intervals + in the IntervalIndex. + IntervalIndex.length : Return the length of the intervals in + the IntervalIndex. + + Notes + ----- + The midpoint is the average of the interval bounds, potentially resulting + in a floating-point number even if bounds are integers. The returned Index + will have a dtype that accurately holds the midpoints. This computation is + the same regardless of whether intervals are open or closed. + + Examples + -------- + >>> iv_idx = pd.IntervalIndex.from_arrays([1, 2, 3], [4, 5, 6]) + >>> iv_idx.mid + Index([2.5, 3.5, 4.5], dtype='float64') + + >>> iv_idx = pd.IntervalIndex.from_tuples([(1, 4), (2, 5), (3, 6)]) + >>> iv_idx.mid + Index([2.5, 3.5, 4.5], dtype='float64') + """ return Index(self._data.mid, copy=False) @property def length(self) -> Index: + """ + Calculate the length of each interval in the IntervalIndex. + + This method returns a new Index containing the lengths of each interval + in the IntervalIndex. The length of an interval is defined as the difference + between its end and its start. + + Returns + ------- + Index + An Index containing the lengths of each interval. + + See Also + -------- + Interval.length : Return the length of the Interval. + + Examples + -------- + >>> intervals = pd.IntervalIndex.from_arrays( + ... [1, 2, 3], [4, 5, 6], closed="right" + ... 
)
+        >>> intervals.length
+        Index([3, 3, 3], dtype='int64')
+
+        >>> intervals = pd.IntervalIndex.from_tuples([(1, 5), (6, 10), (11, 15)])
+        >>> intervals.length
+        Index([4, 4, 4], dtype='int64')
+        """
         return Index(self._data.length, copy=False)

     # --------------------------------------------------------------------
     # Set Operations

-    def _intersection(self, other, sort):
+    def _intersection(self, other, sort: bool = False):
         """
         intersection specialized to the case with matching dtypes.
         """
@@ -861,7 +1007,7 @@ def _intersection(self, other, sort):
             # duplicates
             taken = self._intersection_non_unique(other)

-        if sort is None:
+        if sort:
             taken = taken.sort_values()

         return taken
@@ -994,6 +1140,7 @@ def interval_range(
     Returns
     -------
     IntervalIndex
+        Object with a fixed frequency.

     See Also
     --------
@@ -1006,8 +1153,8 @@ def interval_range(
     ``IntervalIndex`` will have ``periods`` linearly spaced elements between
     ``start`` and ``end``, inclusively.

-    To learn more about datetime-like frequency strings, please see `this link
-    <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`__.
+    To learn more about datetime-like frequency strings, please see
+    :ref:`this link<timeseries.offset_aliases>`.

     Examples
     --------
@@ -1019,8 +1166,9 @@ def interval_range(

     Additionally, datetime-like input is also supported.

-    >>> pd.interval_range(start=pd.Timestamp('2017-01-01'),
-    ...                   end=pd.Timestamp('2017-01-04'))
+    >>> pd.interval_range(
+    ...     start=pd.Timestamp("2017-01-01"), end=pd.Timestamp("2017-01-04")
+    ... )
     IntervalIndex([(2017-01-01 00:00:00, 2017-01-02 00:00:00],
                    (2017-01-02 00:00:00, 2017-01-03 00:00:00],
                    (2017-01-03 00:00:00, 2017-01-04 00:00:00]],
@@ -1037,8 +1185,7 @@ def interval_range(
     Similarly, for datetime-like ``start`` and ``end``, the frequency must be
     convertible to a DateOffset.

-    >>> pd.interval_range(start=pd.Timestamp('2017-01-01'),
-    ...                   periods=3, freq='MS')
+    >>> pd.interval_range(start=pd.Timestamp("2017-01-01"), periods=3, freq="MS")
     IntervalIndex([(2017-01-01 00:00:00, 2017-02-01 00:00:00],
                    (2017-02-01 00:00:00, 2017-03-01 00:00:00],
                    (2017-03-01 00:00:00, 2017-04-01 00:00:00]],
@@ -1054,7 +1201,7 @@ def interval_range(
     The ``closed`` parameter specifies which endpoints of the individual
     intervals within the ``IntervalIndex`` are closed.
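A doctest-style sketch of the subtype handling that the updated interval_range implementation (in the hunk that follows) threads through to the result: float inputs keep a float64 subtype instead of being downcast, while integer inputs stay int64. Values are mine, not from the patch.

    >>> import pandas as pd
    >>> pd.interval_range(start=0.0, end=2.0, freq=0.5).dtype
    interval[float64, right]
    >>> pd.interval_range(start=0, end=4).dtype
    interval[int64, right]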
- >>> pd.interval_range(end=5, periods=4, closed='both') + >>> pd.interval_range(end=5, periods=4, closed="both") IntervalIndex([[1, 2], [2, 3], [3, 4], [4, 5]], dtype='interval[int64, both]') """ @@ -1103,9 +1250,23 @@ def interval_range( breaks: np.ndarray | TimedeltaIndex | DatetimeIndex if is_number(endpoint): + dtype: np.dtype = np.dtype("int64") if com.all_not_none(start, end, freq): + if ( + isinstance(start, (float, np.float16)) + or isinstance(end, (float, np.float16)) + or isinstance(freq, (float, np.float16)) + ): + dtype = np.dtype("float64") + elif ( + isinstance(start, (np.integer, np.floating)) + and isinstance(end, (np.integer, np.floating)) + and start.dtype == end.dtype + ): + dtype = start.dtype # 0.1 ensures we capture end breaks = np.arange(start, end + (freq * 0.1), freq) + breaks = maybe_downcast_numeric(breaks, dtype) else: # compute the period/start/end if unspecified (at most one) if periods is None: @@ -1118,14 +1279,7 @@ def interval_range( breaks = np.linspace(start, end, periods) if all(is_integer(x) for x in com.not_none(start, end, freq)): # np.linspace always produces float output - - # error: Argument 1 to "maybe_downcast_numeric" has incompatible type - # "Union[ndarray[Any, Any], TimedeltaIndex, DatetimeIndex]"; - # expected "ndarray[Any, Any]" [ - breaks = maybe_downcast_numeric( - breaks, # type: ignore[arg-type] - np.dtype("int64"), - ) + breaks = maybe_downcast_numeric(breaks, dtype) else: # delegate to the appropriate range function if isinstance(endpoint, Timestamp): @@ -1133,4 +1287,9 @@ def interval_range( else: breaks = timedelta_range(start=start, end=end, periods=periods, freq=freq) - return IntervalIndex.from_breaks(breaks, name=name, closed=closed) + return IntervalIndex.from_breaks( + breaks, + name=name, + closed=closed, + dtype=IntervalDtype(subtype=breaks.dtype, closed=closed), + ) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 2a4e027e2b806..29b34f560ab2e 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1,6 +1,7 @@ from __future__ import annotations from collections.abc import ( + Callable, Collection, Generator, Hashable, @@ -8,11 +9,11 @@ Sequence, ) from functools import wraps +from itertools import zip_longest from sys import getsizeof from typing import ( TYPE_CHECKING, Any, - Callable, Literal, cast, ) @@ -37,6 +38,7 @@ F, IgnoreRaise, IndexLabel, + IndexT, Scalar, Self, Shape, @@ -52,6 +54,7 @@ Appender, cache_readonly, doc, + set_module, ) from pandas.util._exceptions import find_stack_level @@ -65,6 +68,7 @@ is_list_like, is_object_dtype, is_scalar, + is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import ( @@ -107,10 +111,7 @@ lexsort_indexer, ) -from pandas.io.formats.printing import ( - get_adjustment, - pprint_thing, -) +from pandas.io.formats.printing import pprint_thing if TYPE_CHECKING: from pandas import ( @@ -125,84 +126,56 @@ ) -class MultiIndexUIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.UInt64Engine): - """ - This class manages a MultiIndex by mapping label combinations to positive - integers. +class MultiIndexUInt64Engine(libindex.BaseMultiIndexCodesEngine, libindex.UInt64Engine): + """Manages a MultiIndex by mapping label combinations to positive integers. + + The number of possible label combinations must not overflow 64-bit integers.
""" _base = libindex.UInt64Engine + _codes_dtype = "uint64" - def _codes_to_ints(self, codes): - """ - Transform combination(s) of uint64 in one uint64 (each), in a strictly - monotonic way (i.e. respecting the lexicographic order of integer - combinations): see BaseMultiIndexCodesEngine documentation. - Parameters - ---------- - codes : 1- or 2-dimensional array of dtype uint64 - Combinations of integers (one per row) +class MultiIndexUInt32Engine(libindex.BaseMultiIndexCodesEngine, libindex.UInt32Engine): + """Manages a MultiIndex by mapping label combinations to positive integers. - Returns - ------- - scalar or 1-dimensional array, of dtype uint64 - Integer(s) representing one combination (each). - """ - # Shift the representation of each level by the pre-calculated number - # of bits: - codes <<= self.offsets + The number of possible label combinations must not overflow the 32 bits integers. + """ - # Now sum and OR are in fact interchangeable. This is a simple - # composition of the (disjunct) significant bits of each level (i.e. - # each column in "codes") in a single positive integer: - if codes.ndim == 1: - # Single key - return np.bitwise_or.reduce(codes) + _base = libindex.UInt32Engine + _codes_dtype = "uint32" - # Multiple keys - return np.bitwise_or.reduce(codes, axis=1) +class MultiIndexUInt16Engine(libindex.BaseMultiIndexCodesEngine, libindex.UInt16Engine): + """Manages a MultiIndex by mapping label combinations to positive integers. -class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.ObjectEngine): - """ - This class manages those (extreme) cases in which the number of possible - label combinations overflows the 64 bits integers, and uses an ObjectEngine - containing Python integers. + The number of possible label combinations must not overflow the 16 bits integers. """ - _base = libindex.ObjectEngine + _base = libindex.UInt16Engine + _codes_dtype = "uint16" - def _codes_to_ints(self, codes): - """ - Transform combination(s) of uint64 in one Python integer (each), in a - strictly monotonic way (i.e. respecting the lexicographic order of - integer combinations): see BaseMultiIndexCodesEngine documentation. - Parameters - ---------- - codes : 1- or 2-dimensional array of dtype uint64 - Combinations of integers (one per row) +class MultiIndexUInt8Engine(libindex.BaseMultiIndexCodesEngine, libindex.UInt8Engine): + """Manages a MultiIndex by mapping label combinations to positive integers. - Returns - ------- - int, or 1-dimensional array of dtype object - Integer(s) representing one combination (each). - """ - # Shift the representation of each level by the pre-calculated number - # of bits. Since this can overflow uint64, first make sure we are - # working with Python integers: - codes = codes.astype("object") << self.offsets + The number of possible label combinations must not overflow the 8 bits integers. + """ + + _base = libindex.UInt8Engine + _codes_dtype = "uint8" + + +class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.ObjectEngine): + """Manages a MultiIndex by mapping label combinations to positive integers. - # Now sum and OR are in fact interchangeable. This is a simple - # composition of the (disjunct) significant bits of each level (i.e. 
- # each column in "codes") in a single positive integer (per row): - if codes.ndim == 1: - # Single key - return np.bitwise_or.reduce(codes) + This class manages those (extreme) cases in which the number of possible + label combinations overflows the 64 bits integers, and uses an ObjectEngine + containing Python integers. + """ - # Multiple keys - return np.bitwise_or.reduce(codes, axis=1) + _base = libindex.ObjectEngine + _codes_dtype = "object" def names_compat(meth: F) -> F: @@ -224,6 +197,7 @@ def new_meth(self_or_cls, *args, **kwargs): return cast(F, new_meth) +@set_module("pandas") class MultiIndex(Index): """ A multi-level, or hierarchical, index object for pandas objects. @@ -241,6 +215,8 @@ class MultiIndex(Index): Names for each of the index levels. (name is accepted for compat). copy : bool, default False Copy the meta-data. + name : Label + Kept for compatibility with 1-dimensional Index. Should not be used. verify_integrity : bool, default True Check that the levels/codes are consistent and valid. @@ -296,8 +272,8 @@ class MultiIndex(Index): methods :meth:`MultiIndex.from_arrays`, :meth:`MultiIndex.from_product` and :meth:`MultiIndex.from_tuples`. For example (using ``.from_arrays``): - >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] - >>> pd.MultiIndex.from_arrays(arrays, names=('number', 'color')) + >>> arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]] + >>> pd.MultiIndex.from_arrays(arrays, names=("number", "color")) MultiIndex([(1, 'red'), (1, 'blue'), (2, 'red'), @@ -328,7 +304,6 @@ def __new__( codes=None, sortorder=None, names=None, - dtype=None, copy: bool = False, name=None, verify_integrity: bool = True, @@ -369,15 +344,15 @@ def __new__( return result - def _validate_codes(self, level: list, code: list): + def _validate_codes(self, level: Index, code: np.ndarray) -> np.ndarray: """ Reassign code values as -1 if their corresponding levels are NaN. Parameters ---------- - code : list + code : Index Code to reassign. - level : list + level : np.ndarray Level to check for missing values (NaN, NaT, None). Returns @@ -387,10 +362,7 @@ def _validate_codes(self, level: list, code: list): """ null_mask = isna(level) if np.any(null_mask): - # error: Incompatible types in assignment - # (expression has type "ndarray[Any, dtype[Any]]", - # variable has type "List[Any]") - code = np.where(null_mask[code], -1, code) # type: ignore[assignment] + code = np.where(null_mask[code], -1, code) return code def _verify_integrity( @@ -398,7 +370,7 @@ def _verify_integrity( codes: list | None = None, levels: list | None = None, levels_to_verify: list[int] | range | None = None, - ): + ) -> FrozenList: """ Parameters ---------- @@ -505,8 +477,8 @@ def from_arrays( Examples -------- - >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] - >>> pd.MultiIndex.from_arrays(arrays, names=('number', 'color')) + >>> arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]] + >>> pd.MultiIndex.from_arrays(arrays, names=("number", "color")) MultiIndex([(1, 'red'), (1, 'blue'), (2, 'red'), @@ -576,9 +548,8 @@ def from_tuples( Examples -------- - >>> tuples = [(1, 'red'), (1, 'blue'), - ... 
(2, 'red'), (2, 'blue')] - >>> pd.MultiIndex.from_tuples(tuples, names=('number', 'color')) + >>> tuples = [(1, "red"), (1, "blue"), (2, "red"), (2, "blue")] + >>> pd.MultiIndex.from_tuples(tuples, names=("number", "color")) MultiIndex([(1, 'red'), (1, 'blue'), (2, 'red'), @@ -618,7 +589,7 @@ def from_tuples( elif isinstance(tuples, list): arrays = list(lib.to_object_array_tuples(tuples).T) else: - arrs = zip(*tuples) + arrs = zip_longest(*tuples, fillvalue=np.nan) arrays = cast(list[Sequence[Hashable]], arrs) return cls.from_arrays(arrays, sortorder=sortorder, names=names) @@ -658,9 +629,8 @@ def from_product( Examples -------- >>> numbers = [0, 1, 2] - >>> colors = ['green', 'purple'] - >>> pd.MultiIndex.from_product([numbers, colors], - ... names=['number', 'color']) + >>> colors = ["green", "purple"] + >>> pd.MultiIndex.from_product([numbers, colors], names=["number", "color"]) MultiIndex([(0, 'green'), (0, 'purple'), (1, 'green'), @@ -669,7 +639,6 @@ def from_product( (2, 'purple')], names=['number', 'color']) """ - from pandas.core.reshape.util import cartesian_product if not is_list_like(iterables): raise TypeError("Input must be a list / sequence of iterables.") @@ -720,9 +689,10 @@ def from_frame( Examples -------- - >>> df = pd.DataFrame([['HI', 'Temp'], ['HI', 'Precip'], - ... ['NJ', 'Temp'], ['NJ', 'Precip']], - ... columns=['a', 'b']) + >>> df = pd.DataFrame( + ... [["HI", "Temp"], ["HI", "Precip"], ["NJ", "Temp"], ["NJ", "Precip"]], + ... columns=["a", "b"], + ... ) >>> df a b 0 HI Temp @@ -739,7 +709,7 @@ def from_frame( Using explicit names, instead of the column names - >>> pd.MultiIndex.from_frame(df, names=['state', 'observation']) + >>> pd.MultiIndex.from_frame(df, names=["state", "observation"]) MultiIndex([('HI', 'Temp'), ('HI', 'Precip'), ('NJ', 'Temp'), @@ -774,9 +744,9 @@ def _values(self) -> np.ndarray: ): vals = vals.astype(object) - vals = np.array(vals, copy=False) - vals = algos.take_nd(vals, codes, fill_value=index._na_value) - values.append(vals) + array_vals = np.asarray(vals) + array_vals = algos.take_nd(array_vals, codes, fill_value=index._na_value) + values.append(array_vals) arr = lib.fast_zip(values) return arr @@ -805,10 +775,16 @@ def dtypes(self) -> Series: """ Return the dtypes as a Series for the underlying MultiIndex. + See Also + -------- + Index.dtype : Return the dtype object of the underlying data. + Series.dtypes : Return the data type of the underlying Series. + Examples -------- - >>> idx = pd.MultiIndex.from_product([(0, 1, 2), ('green', 'purple')], - ... names=['number', 'color']) + >>> idx = pd.MultiIndex.from_product( + ... [(0, 1, 2), ("green", "purple")], names=["number", "color"] + ... ) >>> idx MultiIndex([(0, 'green'), (0, 'purple'), @@ -824,7 +800,7 @@ def dtypes(self) -> Series: """ from pandas import Series - names = com.fill_missing_names([level.name for level in self.levels]) + names = com.fill_missing_names(self.names) return Series([level.dtype for level in self.levels], index=Index(names)) def __len__(self) -> int: @@ -859,12 +835,19 @@ def levels(self) -> FrozenList: it filters out all rows of the level C, MultiIndex.levels will still return A, B, C. + See Also + -------- + MultiIndex.codes : The codes of the levels in the MultiIndex. + MultiIndex.get_level_values : Return vector of label values for requested + level. + Examples -------- - >>> index = pd.MultiIndex.from_product([['mammal'], - ... ('goat', 'human', 'cat', 'dog')], - ... 
names=['Category', 'Animals']) - >>> leg_num = pd.DataFrame(data=(4, 2, 4, 4), index=index, columns=['Legs']) + >>> index = pd.MultiIndex.from_product( + ... [["mammal"], ("goat", "human", "cat", "dog")], + ... names=["Category", "Animals"], + ... ) + >>> leg_num = pd.DataFrame(data=(4, 2, 4, 4), index=index, columns=["Legs"]) >>> leg_num Legs Category Animals @@ -924,7 +907,7 @@ def _set_levels( new_levels = FrozenList( ensure_index(lev, copy=copy)._view() for lev in levels ) - level_numbers = list(range(len(new_levels))) + level_numbers: range | list[int] = range(len(new_levels)) else: level_numbers = [self._get_level_number(lev) for lev in level] new_levels_list = list(self._levels) @@ -951,6 +934,19 @@ def set_levels( """ Set new levels on MultiIndex. Defaults to returning new index. + The `set_levels` method provides a flexible way to change the levels of a + `MultiIndex`. This is particularly useful when you need to update the + index structure of your DataFrame without altering the data. The method + returns a new `MultiIndex` unless the operation is performed in-place, + ensuring that the original index remains unchanged unless explicitly + modified. + + The method checks the integrity of the new levels against the existing + codes by default, but this can be disabled if you are confident that + your levels are consistent with the underlying data. This can be useful + when you want to perform optimizations or make specific adjustments to + the index levels that do not strictly adhere to the original structure. + Parameters ---------- levels : sequence or list of sequence @@ -963,6 +959,14 @@ def set_levels( Returns ------- MultiIndex + A new `MultiIndex` with the updated levels. + + See Also + -------- + MultiIndex.set_codes : Set new codes on the existing `MultiIndex`. + MultiIndex.remove_unused_levels : Create new MultiIndex from current that + removes unused levels. + Index.set_names : Set Index or MultiIndex name. Examples -------- @@ -973,9 +977,9 @@ def set_levels( ... (2, "one"), ... (2, "two"), ... (3, "one"), - ... (3, "two") + ... (3, "two"), ... ], - ... names=["foo", "bar"] + ... names=["foo", "bar"], ... ) >>> idx MultiIndex([(1, 'one'), @@ -986,7 +990,7 @@ def set_levels( (3, 'two')], names=['foo', 'bar']) - >>> idx.set_levels([['a', 'b', 'c'], [1, 2]]) + >>> idx.set_levels([["a", "b", "c"], [1, 2]]) MultiIndex([('a', 1), ('a', 2), ('b', 1), @@ -994,7 +998,7 @@ def set_levels( ('c', 1), ('c', 2)], names=['foo', 'bar']) - >>> idx.set_levels(['a', 'b', 'c'], level=0) + >>> idx.set_levels(["a", "b", "c"], level=0) MultiIndex([('a', 'one'), ('a', 'two'), ('b', 'one'), @@ -1002,7 +1006,7 @@ def set_levels( ('c', 'one'), ('c', 'two')], names=['foo', 'bar']) - >>> idx.set_levels(['a', 'b'], level='bar') + >>> idx.set_levels(["a", "b"], level="bar") MultiIndex([(1, 'a'), (1, 'b'), (2, 'a'), @@ -1016,7 +1020,7 @@ def set_levels( be stored in the MultiIndex levels, though the values will be truncated in the MultiIndex output. - >>> idx.set_levels([['a', 'b', 'c'], [1, 2, 3, 4]], level=[0, 1]) + >>> idx.set_levels([["a", "b", "c"], [1, 2, 3, 4]], level=[0, 1]) MultiIndex([('a', 1), ('a', 2), ('b', 1), @@ -1024,7 +1028,7 @@ def set_levels( ('c', 1), ('c', 2)], names=['foo', 'bar']) - >>> idx.set_levels([['a', 'b', 'c'], [1, 2, 3, 4]], level=[0, 1]).levels + >>> idx.set_levels([["a", "b", "c"], [1, 2, 3, 4]], level=[0, 1]).levels FrozenList([['a', 'b', 'c'], [1, 2, 3, 4]]) """ @@ -1048,9 +1052,16 @@ def nlevels(self) -> int: """ Integer number of levels in this MultiIndex. 
+ See Also + -------- + MultiIndex.levels : Get the levels of the MultiIndex. + MultiIndex.codes : Get the codes of the MultiIndex. + MultiIndex.from_arrays : Convert arrays to MultiIndex. + MultiIndex.from_tuples : Convert list of tuples to MultiIndex. + Examples -------- - >>> mi = pd.MultiIndex.from_arrays([['a'], ['b'], ['c']]) + >>> mi = pd.MultiIndex.from_arrays([["a"], ["b"], ["c"]]) >>> mi MultiIndex([('a', 'b', 'c')], ) @@ -1062,11 +1073,23 @@ def nlevels(self) -> int: @property def levshape(self) -> Shape: """ - A tuple with the length of each level. + A tuple representing the length of each level in the MultiIndex. + + In a `MultiIndex`, each level can contain multiple unique values. The + `levshape` property provides a quick way to assess the size of each + level by returning a tuple where each entry represents the number of + unique values in that specific level. This is particularly useful in + scenarios where you need to understand the structure and distribution + of your index levels, such as when working with multidimensional data. + + See Also + -------- + MultiIndex.shape : Return a tuple of the shape of the MultiIndex. + MultiIndex.levels : Returns the levels of the MultiIndex. Examples -------- - >>> mi = pd.MultiIndex.from_arrays([['a'], ['b'], ['c']]) + >>> mi = pd.MultiIndex.from_arrays([["a"], ["b"], ["c"]]) >>> mi MultiIndex([('a', 'b', 'c')], ) @@ -1080,6 +1103,29 @@ def levshape(self) -> Shape: @property def codes(self) -> FrozenList: + """ + Codes of the MultiIndex. + + Codes are the position of the index value in the list of level values + for each level. + + Returns + ------- + tuple of numpy.ndarray + The codes of the MultiIndex. Each array in the tuple corresponds + to a level in the MultiIndex. + + See Also + -------- + MultiIndex.set_codes : Set new codes on MultiIndex. + + Examples + -------- + >>> arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]] + >>> mi = pd.MultiIndex.from_arrays(arrays, names=("number", "color")) + >>> mi.codes + FrozenList([[0, 0, 1, 1], [1, 0, 1, 0]]) + """ return self._codes def _set_codes( @@ -1143,6 +1189,12 @@ def set_codes( new index (of same type and class...etc) or None The same type as the caller or None if ``inplace=True``. + See Also + -------- + MultiIndex.set_levels : Set new levels on MultiIndex. + MultiIndex.codes : Get the codes of the levels in the MultiIndex. + MultiIndex.levels : Get the levels of the MultiIndex. + Examples -------- >>> idx = pd.MultiIndex.from_tuples( @@ -1167,7 +1219,7 @@ def set_codes( (2, 'one'), (1, 'two')], names=['foo', 'bar']) - >>> idx.set_codes([0, 0, 1, 1], level='bar') + >>> idx.set_codes([0, 0, 1, 1], level="bar") MultiIndex([(1, 'one'), (1, 'one'), (2, 'two'), @@ -1210,13 +1262,25 @@ def _engine(self): # equivalent to sorting lexicographically the codes themselves. 
Notice # that each level needs to be shifted by the number of bits needed to # represent the _previous_ ones: - offsets = np.concatenate([lev_bits[1:], [0]]).astype("uint64") + offsets = np.concatenate([lev_bits[1:], [0]]) + # Downcast the type if possible, to prevent upcasting when shifting codes: + offsets = offsets.astype(np.min_scalar_type(int(offsets[0]))) # Check the total number of bits needed for our representation: if lev_bits[0] > 64: # The levels would overflow a 64 bit uint - use Python integers: return MultiIndexPyIntEngine(self.levels, self.codes, offsets) - return MultiIndexUIntEngine(self.levels, self.codes, offsets) + if lev_bits[0] > 32: + # The levels would overflow a 32 bit uint - use uint64 + return MultiIndexUInt64Engine(self.levels, self.codes, offsets) + if lev_bits[0] > 16: + # The levels would overflow a 16 bit uint - use uint32 + return MultiIndexUInt32Engine(self.levels, self.codes, offsets) + if lev_bits[0] > 8: + # The levels would overflow an 8 bit uint - use uint16 + return MultiIndexUInt16Engine(self.levels, self.codes, offsets) + # The levels fit in an 8 bit uint - use uint8 + return MultiIndexUInt8Engine(self.levels, self.codes, offsets) # Return type "Callable[..., MultiIndex]" of "_constructor" incompatible with return # type "Type[MultiIndex]" in supertype "Index" @@ -1239,7 +1303,7 @@ def _view(self) -> MultiIndex: verify_integrity=False, ) result._cache = self._cache.copy() - result._cache.pop("levels", None) # GH32669 + result._reset_cache("levels") # GH32669 return result # -------------------------------------------------------------------- @@ -1252,20 +1316,37 @@ def copy( # type: ignore[override] name=None, ) -> Self: """ - Make a copy of this object. + Make a copy of this object. Names, dtype, levels and codes can be passed and \ + will be set on the new copy. - Names, dtype, levels and codes can be passed and will be set on new copy. + The `copy` method provides a mechanism to create a duplicate of an + existing MultiIndex object. This is particularly useful in scenarios where + modifications are required on an index, but the original MultiIndex should + remain unchanged. By specifying the `deep` parameter, users can control + whether the copy should be a deep or shallow copy, providing flexibility + depending on the size and complexity of the MultiIndex. Parameters ---------- names : sequence, optional + Names to set on the new MultiIndex object. deep : bool, default False + If False, the new object will be a shallow copy. If True, a deep copy + will be attempted. Deep copying can be potentially expensive for large + MultiIndex objects. name : Label Kept for compatibility with 1-dimensional Index. Should not be used. Returns ------- MultiIndex + A new MultiIndex object with the specified modifications. + + See Also + -------- + MultiIndex.from_arrays : Convert arrays to MultiIndex. + MultiIndex.from_tuples : Convert list of tuples to MultiIndex. + MultiIndex.from_frame : Convert DataFrame to MultiIndex.
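The engine dispatch above picks the narrowest unsigned-integer engine whose packed code representation fits. A standalone sketch of just that threshold logic, with the engine names taken from this patch (not part of the diff):

    # minimal sketch of the _engine dispatch; total_bits plays the role of lev_bits[0]
    def pick_engine(total_bits: int) -> str:
        if total_bits > 64:
            return "MultiIndexPyIntEngine"  # Python ints cannot overflow
        if total_bits > 32:
            return "MultiIndexUInt64Engine"
        if total_bits > 16:
            return "MultiIndexUInt32Engine"
        if total_bits > 8:
            return "MultiIndexUInt16Engine"
        return "MultiIndexUInt8Engine"

    print(pick_engine(7))   # MultiIndexUInt8Engine
    print(pick_engine(40))  # MultiIndexUInt64Engine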
Notes ----- @@ -1275,7 +1356,7 @@ def copy( # type: ignore[override] Examples -------- - >>> mi = pd.MultiIndex.from_arrays([['a'], ['b'], ['c']]) + >>> mi = pd.MultiIndex.from_arrays([["a"], ["b"], ["c"]]) >>> mi MultiIndex([('a', 'b', 'c')], ) @@ -1304,13 +1385,22 @@ def copy( # type: ignore[override] verify_integrity=False, ) new_index._cache = self._cache.copy() - new_index._cache.pop("levels", None) # GH32669 + new_index._reset_cache("levels") # GH32669 if keep_id: new_index._id = self._id return new_index - def __array__(self, dtype=None) -> np.ndarray: + def __array__(self, dtype=None, copy=None) -> np.ndarray: """the array interface, return my values""" + if copy is False: + # self.values is always a newly constructed array, so raise. + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + if copy is True: + # explicit np.array call to ensure a copy is made and unique objects + # are returned, because self.values is cached + return np.array(self.values, dtype=dtype) return self.values def view(self, cls=None) -> Self: @@ -1332,13 +1422,16 @@ def __contains__(self, key: Any) -> bool: def dtype(self) -> np.dtype: return np.dtype("O") + @cache_readonly def _is_memory_usage_qualified(self) -> bool: """return a boolean if we need a qualified .info display""" - def f(level) -> bool: - return "mixed" in level or "string" in level or "unicode" in level + def f(dtype) -> bool: + return is_object_dtype(dtype) or ( + is_string_dtype(dtype) and dtype.storage == "python" + ) - return any(f(level) for level in self._inferred_type_levels) + return any(f(level.dtype) for level in self.levels) # Cannot determine type of "memory_usage" @doc(Index.memory_usage) # type: ignore[has-type] @@ -1371,8 +1464,9 @@ def _nbytes(self, deep: bool = False) -> int: names_nbytes = sum(getsizeof(i, objsize) for i in self.names) result = level_nbytes + label_nbytes + names_nbytes - # include our engine hashtable - result += self._engine.sizeof(deep=deep) + # include our engine hashtable, only if it's already cached + if "_engine" in self._cache: + result += self._engine.sizeof(deep=deep) return result # -------------------------------------------------------------------- @@ -1382,7 +1476,7 @@ def _formatter_func(self, tup): """ Formats each item in tup according to its level's formatter function. """ - formatter_funcs = [level._formatter_func for level in self.levels] + formatter_funcs = (level._formatter_func for level in self.levels) return tuple(func(val) for func, val in zip(formatter_funcs, tup)) def _get_values_for_csv( @@ -1421,87 +1515,6 @@ def _get_values_for_csv( ) return mi._values - def format( - self, - name: bool | None = None, - formatter: Callable | None = None, - na_rep: str | None = None, - names: bool = False, - space: int = 2, - sparsify=None, - adjoin: bool = True, - ) -> list: - warnings.warn( - # GH#55413 - f"{type(self).__name__}.format is deprecated and will be removed " - "in a future version.
Convert using index.astype(str) or " - "index.map(formatter) instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - - if name is not None: - names = name - - if len(self) == 0: - return [] - - stringified_levels = [] - for lev, level_codes in zip(self.levels, self.codes): - na = na_rep if na_rep is not None else _get_na_rep(lev.dtype) - - if len(lev) > 0: - formatted = lev.take(level_codes).format(formatter=formatter) - - # we have some NA - mask = level_codes == -1 - if mask.any(): - formatted = np.array(formatted, dtype=object) - formatted[mask] = na - formatted = formatted.tolist() - - else: - # weird all NA case - formatted = [ - pprint_thing(na if isna(x) else x, escape_chars=("\t", "\r", "\n")) - for x in algos.take_nd(lev._values, level_codes) - ] - stringified_levels.append(formatted) - - result_levels = [] - for lev, lev_name in zip(stringified_levels, self.names): - level = [] - - if names: - level.append( - pprint_thing(lev_name, escape_chars=("\t", "\r", "\n")) - if lev_name is not None - else "" - ) - - level.extend(np.array(lev, dtype=object)) - result_levels.append(level) - - if sparsify is None: - sparsify = get_option("display.multi_sparse") - - if sparsify: - sentinel: Literal[""] | bool | lib.NoDefault = "" - # GH3547 use value of sparsify as sentinel if it's "Falsey" - assert isinstance(sparsify, bool) or sparsify is lib.no_default - if sparsify in [False, lib.no_default]: - sentinel = sparsify - # little bit of a kludge job for #1217 - result_levels = sparsify_labels( - result_levels, start=int(names), sentinel=sentinel - ) - - if adjoin: - adj = get_adjustment() - return adj.adjoin(space, *result_levels).split("\n") - else: - return result_levels - def _format_multi( self, *, @@ -1571,7 +1584,7 @@ def _format_multi( def _get_names(self) -> FrozenList: return FrozenList(self._names) - def _set_names(self, names, *, level=None, validate: bool = True): + def _set_names(self, names, *, level=None) -> None: """ Set new names on index. Each name has to be a hashable type. @@ -1582,8 +1595,6 @@ def _set_names(self, names, *, level=None, validate: bool = True): level : int, level name, or sequence of int/level names (default None) If the index is a MultiIndex (hierarchical), level(s) to set (None for all levels). Otherwise level must be None - validate : bool, default True - validate that the names match level lengths Raises ------ @@ -1602,18 +1613,17 @@ def _set_names(self, names, *, level=None, validate: bool = True): raise ValueError("Names should be list-like for a MultiIndex") names = list(names) - if validate: - if level is not None and len(names) != len(level): - raise ValueError("Length of names must match length of level.") - if level is None and len(names) != self.nlevels: - raise ValueError( - "Length of names must match number of levels in MultiIndex." - ) + if level is not None and len(names) != len(level): + raise ValueError("Length of names must match length of level.") + if level is None and len(names) != self.nlevels: + raise ValueError( + "Length of names must match number of levels in MultiIndex." + ) if level is None: level = range(self.nlevels) else: - level = [self._get_level_number(lev) for lev in level] + level = (self._get_level_number(lev) for lev in level) # set the name for lev, name in zip(level, names): @@ -1626,8 +1636,9 @@ def _set_names(self, names, *, level=None, validate: bool = True): ) self._names[lev] = name - # If .levels has been accessed, the names in our cache will be stale. 
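The deprecation message in the block removed above points callers at two replacements. A minimal sketch of both on a small MultiIndex (the lambda formatter is illustrative only):

    import pandas as pd

    mi = pd.MultiIndex.from_arrays([[1, 2], ["a", "b"]], names=["num", "let"])

    # map(str) renders each tuple, the closest drop-in for the removed .format()
    print(list(mi.map(str)))                         # ["(1, 'a')", "(2, 'b')"]

    # or pass a custom formatter, mirroring index.map(formatter)
    print(list(mi.map(lambda t: f"{t[0]}-{t[1]}")))  # ['1-a', '2-b']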
- self._reset_cache() + # If .levels has been accessed, the .name of each level in our cache + # will be stale. + self._reset_cache("levels") names = property( fset=_set_names, @@ -1635,10 +1646,22 @@ def _set_names(self, names, *, level=None, validate: bool = True): doc=""" Names of levels in MultiIndex. + This attribute provides access to the names of the levels in a `MultiIndex`. + The names are stored as a `FrozenList`, which is an immutable list-like + container. Each name corresponds to a level in the `MultiIndex`, and can be + used to identify or manipulate the levels individually. + + See Also + -------- + MultiIndex.set_names : Set Index or MultiIndex name. + MultiIndex.rename : Rename specific levels in a MultiIndex. + Index.names : Get names on index. + Examples -------- >>> mi = pd.MultiIndex.from_arrays( - ... [[1, 2], [3, 4], [5, 6]], names=['x', 'y', 'z']) + ... [[1, 2], [3, 4], [5, 6]], names=['x', 'y', 'z'] + ... ) >>> mi MultiIndex([(1, 3, 5), (2, 4, 6)], @@ -1720,11 +1743,6 @@ def is_monotonic_decreasing(self) -> bool: # monotonic decreasing if and only if reverse is monotonic increasing return self[::-1].is_monotonic_increasing - @cache_readonly - def _inferred_type_levels(self) -> list[str]: - """return a list of the inferred types, one for each level""" - return [i.inferred_type for i in self.levels] - @doc(Index.duplicated) def duplicated(self, keep: DropKeep = "first") -> npt.NDArray[np.bool_]: shape = tuple(len(lev) for lev in self.levels) @@ -1736,11 +1754,11 @@ def duplicated(self, keep: DropKeep = "first") -> npt.NDArray[np.bool_]: # (previously declared in base class "IndexOpsMixin") _duplicated = duplicated # type: ignore[misc] - def fillna(self, value=None, downcast=None): + def fillna(self, value): """ fillna is not implemented for MultiIndex """ - raise NotImplementedError("isna is not defined for MultiIndex") + raise NotImplementedError("fillna is not defined for MultiIndex") @doc(Index.dropna) def dropna(self, how: AnyAll = "any") -> MultiIndex: @@ -1786,6 +1804,16 @@ def get_level_values(self, level) -> Index: # type: ignore[override] Return vector of label values for requested level. Length of returned vector is equal to the length of the index. + The `get_level_values` method is a crucial utility for extracting + specific level values from a `MultiIndex`. This function is particularly + useful when working with multi-level data, allowing you to isolate + and manipulate individual levels without having to deal with the + complexity of the entire `MultiIndex` structure. It seamlessly handles + both integer and string-based level access, providing flexibility in + how you can interact with the data. Additionally, this method ensures + that the returned `Index` maintains the integrity of the original data, + even when missing values are present, by appropriately casting the + result to a suitable data type. Parameters ---------- @@ -1799,6 +1827,13 @@ def get_level_values(self, level) -> Index: # type: ignore[override] Values is a level of this MultiIndex converted to a single :class:`Index` (or subclass thereof). + See Also + -------- + MultiIndex : A multi-level, or hierarchical, index object for pandas objects. + Index : Immutable sequence used for indexing and alignment. + MultiIndex.remove_unused_levels : Create new MultiIndex from current that + removes unused levels. 
+ Notes ----- If the level contains missing values, the result may be casted to @@ -1809,14 +1844,14 @@ def get_level_values(self, level) -> Index: # type: ignore[override] -------- Create a MultiIndex: - >>> mi = pd.MultiIndex.from_arrays((list('abc'), list('def'))) - >>> mi.names = ['level_1', 'level_2'] + >>> mi = pd.MultiIndex.from_arrays((list("abc"), list("def"))) + >>> mi.names = ["level_1", "level_2"] Get level values by supplying level as either integer or name: >>> mi.get_level_values(0) Index(['a', 'b', 'c'], dtype='object', name='level_1') - >>> mi.get_level_values('level_2') + >>> mi.get_level_values("level_2") Index(['d', 'e', 'f'], dtype='object', name='level_2') If a level contains missing values, the return type of the level @@ -1869,6 +1904,7 @@ def to_frame( Returns ------- DataFrame + DataFrame representation of the MultiIndex, with levels as columns. See Also -------- @@ -1877,7 +1913,7 @@ def to_frame( Examples -------- - >>> mi = pd.MultiIndex.from_arrays([['a', 'b'], ['c', 'd']]) + >>> mi = pd.MultiIndex.from_arrays([["a", "b"], ["c", "d"]]) >>> mi MultiIndex([('a', 'c'), ('b', 'd')], ) @@ -1895,7 +1931,7 @@ def to_frame( 0 a c 1 b d - >>> df = mi.to_frame(name=['x', 'y']) + >>> df = mi.to_frame(name=["x", "y"]) >>> df x y a c a c @@ -1954,8 +1990,8 @@ def to_flat_index(self) -> Index: # type: ignore[override] Examples -------- >>> index = pd.MultiIndex.from_product( - ... [['foo', 'bar'], ['baz', 'qux']], - ... names=['a', 'b']) + ... [["foo", "bar"], ["baz", "qux"]], names=["a", "b"] + ... ) >>> index.to_flat_index() Index([('foo', 'baz'), ('foo', 'qux'), ('bar', 'baz'), ('bar', 'qux')], @@ -1976,25 +2012,29 @@ def _is_lexsorted(self) -> bool: In the below examples, the first level of the MultiIndex is sorted because a<b, and b<c, so there is no need to look at the next level. - >>> pd.MultiIndex.from_arrays([['a', 'b', 'c'], - ... ['d', 'e', 'f']])._is_lexsorted() + >>> pd.MultiIndex.from_arrays( + ... [["a", "b", "c"], ["d", "e", "f"]] + ... )._is_lexsorted() True - >>> pd.MultiIndex.from_arrays([['a', 'b', 'c'], - ... ['d', 'f', 'e']])._is_lexsorted() + >>> pd.MultiIndex.from_arrays( + ... [["a", "b", "c"], ["d", "f", "e"]] + ... )._is_lexsorted() True In case there is a tie, the lexicographical sorting looks at the next level of the MultiIndex. - >>> pd.MultiIndex.from_arrays([[0, 1, 1], ['a', 'b', 'c']])._is_lexsorted() + >>> pd.MultiIndex.from_arrays([[0, 1, 1], ["a", "b", "c"]])._is_lexsorted() True - >>> pd.MultiIndex.from_arrays([[0, 1, 1], ['a', 'c', 'b']])._is_lexsorted() + >>> pd.MultiIndex.from_arrays([[0, 1, 1], ["a", "c", "b"]])._is_lexsorted() False - >>> pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], - ... ['aa', 'bb', 'aa', 'bb']])._is_lexsorted() + >>> pd.MultiIndex.from_arrays( + ... [["a", "a", "b", "b"], ["aa", "bb", "aa", "bb"]] + ... )._is_lexsorted() True - >>> pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], - ... ['bb', 'aa', 'aa', 'bb']])._is_lexsorted() + >>> pd.MultiIndex.from_arrays( + ... [["a", "a", "b", "b"], ["bb", "aa", "aa", "bb"]] + ... )._is_lexsorted() False """ return self._lexsort_depth == self.nlevels @@ -2031,8 +2071,9 @@ def _sort_levels_monotonic(self, raise_if_incomparable: bool = False) -> MultiIn Examples -------- - >>> mi = pd.MultiIndex(levels=[['a', 'b'], ['bb', 'aa']], - ... codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) + >>> mi = pd.MultiIndex( + ... levels=[["a", "b"], ["bb", "aa"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]] + ...
) >>> mi MultiIndex([('a', 'bb'), ('a', 'aa'), @@ -2089,13 +2130,26 @@ def remove_unused_levels(self) -> MultiIndex: appearance, meaning the same .values and ordering. It will also be .equals() to the original. + The `remove_unused_levels` method is useful in cases where you have a + MultiIndex with hierarchical levels, but some of these levels are no + longer needed due to filtering or subsetting operations. By removing + the unused levels, the resulting MultiIndex becomes more compact and + efficient, which can improve performance in subsequent operations. + Returns ------- MultiIndex + A new MultiIndex with unused levels removed. + + See Also + -------- + MultiIndex.droplevel : Remove specified levels from a MultiIndex. + MultiIndex.reorder_levels : Rearrange levels of a MultiIndex. + MultiIndex.set_levels : Set new levels on a MultiIndex. Examples -------- - >>> mi = pd.MultiIndex.from_product([range(2), list('ab')]) + >>> mi = pd.MultiIndex.from_product([range(2), list("ab")]) >>> mi MultiIndex([(0, 'a'), (0, 'b'), @@ -2247,6 +2301,9 @@ def take( # only fill if we are passing a non-None fill_value allow_fill = self._maybe_disallow_fill(allow_fill, fill_value, indices) + if indices.ndim == 1 and lib.is_range_indexer(indices, len(self)): + return self.copy() + na_value = -1 taken = [lab.take(indices) for lab in self.codes] @@ -2268,18 +2325,31 @@ def append(self, other): """ Append a collection of Index options together. + The `append` method is used to combine multiple `Index` objects into a single + `Index`. This is particularly useful when dealing with multi-level indexing + (MultiIndex) where you might need to concatenate different levels of indices. + The method handles the alignment of the levels and codes of the indices being + appended to ensure consistency in the resulting `MultiIndex`. + Parameters ---------- other : Index or list/tuple of indices + Index or list/tuple of Index objects to be appended. Returns ------- Index The combined index. + See Also + -------- + MultiIndex: A multi-level, or hierarchical, index object for pandas objects. + Index.append : Append a collection of Index options together. + concat : Concatenate pandas objects along a particular axis. + Examples -------- - >>> mi = pd.MultiIndex.from_arrays([['a'], ['b']]) + >>> mi = pd.MultiIndex.from_arrays([["a"], ["b"]]) >>> mi MultiIndex([('a', 'b')], ) @@ -2361,21 +2431,38 @@ def drop( # type: ignore[override] """ Make a new :class:`pandas.MultiIndex` with the passed list of codes deleted. + This method allows for the removal of specified labels from a MultiIndex. + The labels to be removed can be provided as a list of tuples if no level + is specified, or as a list of labels from a specific level if the level + parameter is provided. This can be useful for refining the structure of a + MultiIndex to fit specific requirements. + Parameters ---------- codes : array-like Must be a list of tuples when ``level`` is not specified. level : int or level name, default None + Level from which the labels will be dropped. errors : str, default 'raise' + If 'ignore', suppress error and existing labels are dropped. Returns ------- MultiIndex + A new MultiIndex with the specified labels removed. + + See Also + -------- + MultiIndex.remove_unused_levels : Create new MultiIndex from current that + removes unused levels. + MultiIndex.reorder_levels : Rearrange levels using input order. + MultiIndex.rename : Rename levels in a MultiIndex. 
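``lib.is_range_indexer`` in the ``take`` fast path above is a compiled helper; in pure NumPy the no-op check it performs amounts to the following sketch (the helper name here is hypothetical):

    import numpy as np

    def is_noop_indexer(indices: np.ndarray, n: int) -> bool:
        # an indexer equal to 0, 1, ..., n-1 selects everything in order,
        # so take() can simply return a copy of the index
        return len(indices) == n and bool((indices == np.arange(n)).all())

    print(is_noop_indexer(np.array([0, 1, 2]), 3))  # True
    print(is_noop_indexer(np.array([0, 2, 1]), 3))  # False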
Examples -------- - >>> idx = pd.MultiIndex.from_product([(0, 1, 2), ('green', 'purple')], - ... names=["number", "color"]) + >>> idx = pd.MultiIndex.from_product( + ... [(0, 1, 2), ("green", "purple")], names=["number", "color"] + ... ) >>> idx MultiIndex([(0, 'green'), (0, 'purple'), @@ -2384,7 +2471,7 @@ def drop( # type: ignore[override] (2, 'green'), (2, 'purple')], names=['number', 'color']) - >>> idx.drop([(1, 'green'), (2, 'purple')]) + >>> idx.drop([(1, "green"), (2, "purple")]) MultiIndex([(0, 'green'), (0, 'purple'), (1, 'purple'), @@ -2393,7 +2480,7 @@ def drop( # type: ignore[override] We can also drop from a specific level. - >>> idx.drop('green', level='color') + >>> idx.drop("green", level="color") MultiIndex([(0, 'purple'), (1, 'purple'), (2, 'purple')], @@ -2425,7 +2512,7 @@ def drop( # type: ignore[override] step = loc.step if loc.step is not None else 1 inds.extend(range(loc.start, loc.stop, step)) elif com.is_bool_indexer(loc): - if self._lexsort_depth == 0: + if get_option("performance_warnings") and self._lexsort_depth == 0: warnings.warn( "dropping on a non-lexsorted multi-index " "without a level parameter may impact performance.", @@ -2492,8 +2579,9 @@ def swaplevel(self, i=-2, j=-1) -> MultiIndex: Examples -------- - >>> mi = pd.MultiIndex(levels=[['a', 'b'], ['bb', 'aa']], - ... codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) + >>> mi = pd.MultiIndex( + ... levels=[["a", "b"], ["bb", "aa"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]] + ... ) >>> mi MultiIndex([('a', 'bb'), ('a', 'aa'), @@ -2526,6 +2614,13 @@ def reorder_levels(self, order) -> MultiIndex: """ Rearrange levels using input order. May not drop or duplicate levels. + `reorder_levels` is useful when you need to change the order of levels in + a MultiIndex, such as when reordering levels for hierarchical indexing. It + maintains the integrity of the MultiIndex, ensuring that all existing levels + are present and no levels are duplicated. This method is helpful for aligning + the index structure with other data structures or for optimizing the order + for specific data operations. + Parameters ---------- order : list of int or list of str @@ -2535,10 +2630,17 @@ def reorder_levels(self, order) -> MultiIndex: Returns ------- MultiIndex + A new MultiIndex with levels rearranged according to the specified order. + + See Also + -------- + MultiIndex.swaplevel : Swap two levels of the MultiIndex. + MultiIndex.set_names : Set names for the MultiIndex levels. + DataFrame.reorder_levels : Reorder levels in a DataFrame with a MultiIndex. 
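This hunk (and the matching one in ``get_loc`` further down) gates the lexsort ``PerformanceWarning`` behind a global option. Assuming the option is registered as ``mode.performance_warnings``, callers can opt out like this (a sketch, not part of the patch):

    import pandas as pd

    mi = pd.MultiIndex.from_arrays([[1, 2, 1], ["a", "b", "c"]])  # not lexsorted

    # with the option off, dropping on the non-lexsorted index stays quiet
    with pd.option_context("mode.performance_warnings", False):
        mi.drop([1])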
Examples -------- - >>> mi = pd.MultiIndex.from_arrays([[1, 2], [3, 4]], names=['x', 'y']) + >>> mi = pd.MultiIndex.from_arrays([[1, 2], [3, 4]], names=["x", "y"]) >>> mi MultiIndex([(1, 3), (2, 4)], @@ -2549,7 +2651,7 @@ def reorder_levels(self, order) -> MultiIndex: (4, 2)], names=['y', 'x']) - >>> mi.reorder_levels(order=['y', 'x']) + >>> mi.reorder_levels(order=["y", "x"]) MultiIndex([(3, 1), (4, 2)], names=['y', 'x']) @@ -2574,7 +2676,7 @@ def _reorder_ilevels(self, order) -> MultiIndex: def _recode_for_new_levels( self, new_levels, copy: bool = True - ) -> Generator[np.ndarray, None, None]: + ) -> Generator[np.ndarray]: if len(new_levels) > self.nlevels: raise AssertionError( f"Length of new_levels ({len(new_levels)}) " @@ -2594,9 +2696,9 @@ def _get_codes_for_sorting(self) -> list[Categorical]: a valid valid """ - def cats(level_codes): + def cats(level_codes: np.ndarray) -> np.ndarray: return np.arange( - np.array(level_codes).max() + 1 if len(level_codes) else 0, + level_codes.max() + 1 if len(level_codes) else 0, dtype=level_codes.dtype, ) @@ -2615,8 +2717,15 @@ def sortlevel( """ Sort MultiIndex at the requested level. - The result will respect the original ordering of the associated - factor at that level. + This method is useful when dealing with MultiIndex objects, allowing for + sorting at a specific level of the index. The function preserves the + relative ordering of data within the same level while sorting + the overall MultiIndex. The method provides flexibility with the `ascending` + parameter to define the sort order and with the `sort_remaining` parameter to + control whether the remaining levels should also be sorted. Sorting a + MultiIndex can be crucial when performing operations that require ordered + indices, such as grouping or merging datasets. The `na_position` argument is + important in handling missing values consistently across different levels. Parameters ---------- @@ -2626,7 +2735,9 @@ def sortlevel( ascending : bool, default True False to sort in descending order. Can also be a list to specify a directed ordering. - sort_remaining : sort by the remaining levels after level + sort_remaining : bool, default True + If True, sorts by the remaining levels after sorting by the specified + `level`. na_position : {'first' or 'last'}, default 'first' Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at the end. @@ -2640,6 +2751,13 @@ def sortlevel( indexer : np.ndarray[np.intp] Indices of output values in original index. + See Also + -------- + MultiIndex : A multi-level, or hierarchical, index object for pandas objects. + Index.sort_values : Sort Index values. + DataFrame.sort_index : Sort DataFrame by the index. + Series.sort_index : Sort Series by the index. 
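The expanded ``sortlevel`` summary above mentions ``na_position``; a small demonstration of the two placements on a level containing a missing key (values chosen arbitrarily):

    import numpy as np
    import pandas as pd

    mi = pd.MultiIndex.from_arrays([[1, np.nan, 0], [2, 1, 3]])

    # 'first' puts the NaN key at the front, 'last' at the end
    print(mi.sortlevel(na_position="first")[0])  # the (nan, ...) tuple sorts first
    print(mi.sortlevel(na_position="last")[0])   # the (nan, ...) tuple sorts last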
+ Examples -------- >>> mi = pd.MultiIndex.from_arrays([[0, 0], [2, 1]]) @@ -2673,7 +2791,8 @@ def sortlevel( # error: Item "Hashable" of "Union[Hashable, Sequence[Hashable]]" has # no attribute "__iter__" (not iterable) level = [ - self._get_level_number(lev) for lev in level # type: ignore[union-attr] + self._get_level_number(lev) + for lev in level # type: ignore[union-attr] ] sortorder = None @@ -2722,7 +2841,7 @@ def _wrap_reindex_result(self, target, indexer, preserve_names: bool): target = self._maybe_preserve_names(target, preserve_names) return target - def _maybe_preserve_names(self, target: Index, preserve_names: bool) -> Index: + def _maybe_preserve_names(self, target: IndexT, preserve_names: bool) -> IndexT: if ( preserve_names and target.nlevels == self.nlevels @@ -2823,18 +2942,18 @@ def get_slice_bound( Examples -------- - >>> mi = pd.MultiIndex.from_arrays([list('abbc'), list('gefd')]) + >>> mi = pd.MultiIndex.from_arrays([list("abbc"), list("gefd")]) Get the locations from the leftmost 'b' in the first level until the end of the multiindex: - >>> mi.get_slice_bound('b', side="left") + >>> mi.get_slice_bound("b", side="left") 1 Like above, but if you get the locations from the rightmost 'b' in the first level and 'f' in the second level: - >>> mi.get_slice_bound(('b','f'), side="right") + >>> mi.get_slice_bound(("b", "f"), side="right") 3 See Also @@ -2847,7 +2966,6 @@ def get_slice_bound( label = (label,) return self._partial_tup_index(label, side=side) - # pylint: disable-next=useless-parent-delegation def slice_locs(self, start=None, end=None, step=None) -> tuple[int, int]: """ For an ordered MultiIndex, compute the slice locations for input @@ -2878,19 +2996,20 @@ def slice_locs(self, start=None, end=None, step=None) -> tuple[int, int]: Examples -------- - >>> mi = pd.MultiIndex.from_arrays([list('abbd'), list('deff')], - ... names=['A', 'B']) + >>> mi = pd.MultiIndex.from_arrays( + ... [list("abbd"), list("deff")], names=["A", "B"] + ... ) Get the slice locations from the beginning of 'b' in the first level until the end of the multiindex: - >>> mi.slice_locs(start='b') + >>> mi.slice_locs(start="b") (1, 4) Like above, but stop at the end of 'b' in the first level and 'f' in the second level: - >>> mi.slice_locs(start='b', end=('b', 'f')) + >>> mi.slice_locs(start="b", end=("b", "f")) (1, 3) See Also @@ -2981,14 +3100,19 @@ def _get_loc_single_level_index(self, level_index: Index, key: Hashable) -> int: def get_loc(self, key): """ - Get location for a label or a tuple of labels. + Get location for a label or a tuple of labels. The location is returned \ + as an integer/slice or boolean mask. - The location is returned as an integer/slice or boolean - mask. + This method returns the integer location, slice object, or boolean mask + corresponding to the specified key, which can be a single label or a tuple + of labels. The key represents a position in the MultiIndex, and the location + indicates where the key is found within the index. Parameters ---------- key : label or tuple of labels (one for each level) + A label or tuple of labels that correspond to the levels of the MultiIndex. + The key must match the structure of the MultiIndex. 
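The reworked ``get_slice_bound`` and ``slice_locs`` doctests above describe the same positional window; checked together:

    import pandas as pd

    mi = pd.MultiIndex.from_arrays([list("abbd"), list("deff")], names=["A", "B"])

    # slice_locs returns the half-open window bracketed by the two bounds
    print(mi.get_slice_bound("b", side="left"))   # 1
    print(mi.get_slice_bound("b", side="right"))  # 3
    print(mi.slice_locs(start="b", end="b"))      # (1, 3)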
Returns ------- @@ -3012,12 +3136,12 @@ def get_loc(self, key): Examples -------- - >>> mi = pd.MultiIndex.from_arrays([list('abb'), list('def')]) + >>> mi = pd.MultiIndex.from_arrays([list("abb"), list("def")]) - >>> mi.get_loc('b') + >>> mi.get_loc("b") slice(1, 3, None) - >>> mi.get_loc(('b', 'e')) + >>> mi.get_loc(("b", "e")) 1 """ self._check_indexing_error(key) @@ -3055,7 +3179,7 @@ def _maybe_to_slice(loc): raise KeyError(key) from err except TypeError: # e.g. test_partial_slicing_with_multiindex partial string slicing - loc, _ = self.get_loc_level(key, list(range(self.nlevels))) + loc, _ = self.get_loc_level(key, range(self.nlevels)) return loc # -- partial selection or non-unique index @@ -3082,11 +3206,12 @@ def _maybe_to_slice(loc): if not follow_key: return slice(start, stop) - warnings.warn( - "indexing past lexsort depth may impact performance.", - PerformanceWarning, - stacklevel=find_stack_level(), - ) + if get_option("performance_warnings"): + warnings.warn( + "indexing past lexsort depth may impact performance.", + PerformanceWarning, + stacklevel=find_stack_level(), + ) loc = np.arange(start, stop, dtype=np.intp) @@ -3105,10 +3230,21 @@ def get_loc_level(self, key, level: IndexLabel = 0, drop_level: bool = True): """ Get location and sliced index for requested label(s)/level(s). + The `get_loc_level` method is a more advanced form of `get_loc`, allowing + users to specify not just a label or sequence of labels, but also the level(s) + in which to search. This method is useful when you need to isolate particular + sections of a MultiIndex, either for further analysis or for slicing and + dicing the data. The method provides flexibility in terms of maintaining + or dropping levels from the resulting index based on the `drop_level` + parameter. + Parameters ---------- key : label or sequence of labels + The label(s) for which to get the location. level : int/level name or list thereof, optional + The level(s) in the MultiIndex to consider. If not provided, defaults + to the first level. drop_level : bool, default True If ``False``, the resulting index will not drop any level. @@ -3130,19 +3266,18 @@ def get_loc_level(self, key, level: IndexLabel = 0, drop_level: bool = True): Examples -------- - >>> mi = pd.MultiIndex.from_arrays([list('abb'), list('def')], - ... 
names=['A', 'B']) + >>> mi = pd.MultiIndex.from_arrays([list("abb"), list("def")], names=["A", "B"]) - >>> mi.get_loc_level('b') + >>> mi.get_loc_level("b") (slice(1, 3, None), Index(['e', 'f'], dtype='object', name='B')) - >>> mi.get_loc_level('e', level='B') + >>> mi.get_loc_level("e", level="B") (array([False, True, False]), Index(['b'], dtype='object', name='A')) - >>> mi.get_loc_level(['b', 'e']) + >>> mi.get_loc_level(["b", "e"]) (1, None) """ - if not isinstance(level, (list, tuple)): + if not isinstance(level, (range, list, tuple)): level = self._get_level_number(level) else: level = [self._get_level_number(lev) for lev in level] @@ -3397,7 +3532,7 @@ def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes): locs = (level_codes >= idx.start) & (level_codes < idx.stop) return locs - locs = np.array(level_codes == idx, dtype=bool, copy=False) + locs = np.asarray(level_codes == idx, dtype=bool) if not locs.any(): # The label is present in self.levels[level] but unused: @@ -3441,15 +3576,15 @@ def get_locs(self, seq) -> npt.NDArray[np.intp]: Examples -------- - >>> mi = pd.MultiIndex.from_arrays([list('abb'), list('def')]) + >>> mi = pd.MultiIndex.from_arrays([list("abb"), list("def")]) - >>> mi.get_locs('b') # doctest: +SKIP + >>> mi.get_locs("b") # doctest: +SKIP array([1, 2], dtype=int64) - >>> mi.get_locs([slice(None), ['e', 'f']]) # doctest: +SKIP + >>> mi.get_locs([slice(None), ["e", "f"]]) # doctest: +SKIP array([1, 2], dtype=int64) - >>> mi.get_locs([[True, False, True], slice('e', 'f')]) # doctest: +SKIP + >>> mi.get_locs([[True, False, True], slice("e", "f")]) # doctest: +SKIP array([2], dtype=int64) """ @@ -3487,7 +3622,11 @@ def _to_bool_indexer(indexer) -> npt.NDArray[np.bool_]: "cannot index with a boolean indexer that " "is not the same length as the index" ) + if isinstance(k, (ABCSeries, Index)): + k = k._values lvl_indexer = np.asarray(k) + if indexer is None: + lvl_indexer = lvl_indexer.copy() elif is_list_like(k): # a collection of labels to include from this level (these are or'd) @@ -3626,7 +3765,7 @@ def _reorder_indexer( new_order = key_order_map[self.codes[i][indexer]] elif isinstance(k, slice) and k.step is not None and k.step < 0: # flip order for negative step - new_order = np.arange(n)[::-1][indexer] + new_order = np.arange(n - 1, -1, -1)[indexer] elif isinstance(k, slice) and k.start is None and k.stop is None: # slice(None) should not determine order GH#31330 new_order = np.ones((n,), dtype=np.intp)[indexer] @@ -3655,13 +3794,18 @@ def truncate(self, before=None, after=None) -> MultiIndex: MultiIndex The truncated MultiIndex. + See Also + -------- + DataFrame.truncate : Truncate a DataFrame before and after some index values. + Series.truncate : Truncate a Series before and after some index values. 
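In the ``_reorder_indexer`` hunk above, the reversed ``arange`` is now built directly instead of flipping an ascending one; the two forms are element-for-element identical:

    import numpy as np

    n, indexer = 5, np.array([0, 2, 4])

    old = np.arange(n)[::-1][indexer]        # previous spelling
    new = np.arange(n - 1, -1, -1)[indexer]  # spelling introduced here
    assert (old == new).all()
    print(new)  # [4 2 0]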
+ Examples -------- - >>> mi = pd.MultiIndex.from_arrays([['a', 'b', 'c'], ['x', 'y', 'z']]) + >>> mi = pd.MultiIndex.from_arrays([["a", "b", "c"], ["x", "y", "z"]]) >>> mi MultiIndex([('a', 'x'), ('b', 'y'), ('c', 'z')], ) - >>> mi.truncate(before='a', after='b') + >>> mi.truncate(before="a", after="b") MultiIndex([('a', 'x'), ('b', 'y')], ) """ @@ -3719,31 +3863,16 @@ def equals(self, other: object) -> bool: other_mask = other_codes == -1 if not np.array_equal(self_mask, other_mask): return False - self_codes = self_codes[~self_mask] - self_values = self.levels[i]._values.take(self_codes) - - other_codes = other_codes[~other_mask] - other_values = other.levels[i]._values.take(other_codes) - - # since we use NaT both datetime64 and timedelta64 we can have a - # situation where a level is typed say timedelta64 in self (IOW it - # has other values than NaT) but types datetime64 in other (where - # its all NaT) but these are equivalent - if len(self_values) == 0 and len(other_values) == 0: - continue - - if not isinstance(self_values, np.ndarray): - # i.e. ExtensionArray - if not self_values.equals(other_values): - return False - elif not isinstance(other_values, np.ndarray): - # i.e. other is ExtensionArray - if not other_values.equals(self_values): - return False - else: - if not array_equivalent(self_values, other_values): - return False - + self_level = self.levels[i] + other_level = other.levels[i] + new_codes = recode_for_categories( + other_codes, other_level, self_level, copy=False + ) + if not np.array_equal(self_codes, new_codes): + return False + if not self_level[:0].equals(other_level[:0]): + # e.g. Int64 != int64 + return False return True def equal_levels(self, other: MultiIndex) -> bool: @@ -3955,6 +4084,8 @@ def insert(self, loc: int, item) -> MultiIndex: # other codes lev_loc = len(level) level = level.insert(lev_loc, k) + if isna(level[lev_loc]): # GH 59003, 60388 + lev_loc = -1 else: lev_loc = level.get_loc(k) @@ -4054,8 +4185,6 @@ def sparsify_labels(label_list, start: int = 0, sentinel: object = ""): for i, (p, t) in enumerate(zip(prev, cur)): if i == k - 1: sparse_cur.append(t) - # error: Argument 1 to "append" of "list" has incompatible - # type "list[Any]"; expected "tuple[Any, ...]" result.append(sparse_cur) # type: ignore[arg-type] break @@ -4063,8 +4192,6 @@ def sparsify_labels(label_list, start: int = 0, sentinel: object = ""): sparse_cur.append(sentinel) else: sparse_cur.extend(cur[i:]) - # error: Argument 1 to "append" of "list" has incompatible - # type "list[Any]"; expected "tuple[Any, ...]" result.append(sparse_cur) # type: ignore[arg-type] break @@ -4153,3 +4280,60 @@ def _require_listlike(level, arr, arrname: str): if not is_list_like(arr) or not is_list_like(arr[0]): raise TypeError(f"{arrname} must be list of lists-like") return level, arr + + +def cartesian_product(X: list[np.ndarray]) -> list[np.ndarray]: + """ + Numpy version of itertools.product. + Sometimes faster (for large inputs)... 
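For intuition, what the relocated ``cartesian_product`` helper computes can be phrased with ``itertools.product`` (a hypothetical pure-Python rendering, slower than the NumPy version the patch adds):

    import itertools
    import numpy as np

    X = [list("ABC"), [1, 2]]
    pairs = list(itertools.product(*X))               # all (letter, number) combos
    columns = [np.array(col) for col in zip(*pairs)]  # one output array per input
    print(columns[0])  # ['A' 'A' 'B' 'B' 'C' 'C']
    print(columns[1])  # [1 2 1 2 1 2]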
+ + Parameters + ---------- + X : list-like of list-likes + + Returns + ------- + product : list of ndarrays + + Examples + -------- + >>> cartesian_product([list("ABC"), [1, 2]]) + [array(['A', 'A', 'B', 'B', 'C', 'C'], dtype='<U1'), array([1, 2, 1, 2])] + """ diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py - >>> idx = pd.PeriodIndex.from_fields(year=[2000, 2002], quarter=[1, 3]) + >>> idx = pd.PeriodIndex(data=["2000Q1", "2002Q3"], freq="Q") >>> idx PeriodIndex(['2000Q1', '2002Q3'], dtype='period[Q-DEC]') """ @@ -200,7 +181,7 @@ def _resolution_obj(self) -> Resolution: @doc( PeriodArray.asfreq, - other="pandas.arrays.PeriodArray", + other="arrays.PeriodArray", other_name="PeriodArray", **_shared_doc_kwargs, ) @@ -234,84 +215,29 @@ def second(self) -> Index: def __new__( cls, data=None, - ordinal=None, freq=None, dtype: Dtype | None = None, copy: bool = False, name: Hashable | None = None, - **fields, ) -> Self: - valid_field_set = { - "year", - "month", - "day", - "quarter", - "hour", - "minute", - "second", - } - refs = None if not copy and isinstance(data, (Index, ABCSeries)): refs = data._references - if not set(fields).issubset(valid_field_set): - argument = next(iter(set(fields) - valid_field_set)) - raise TypeError(f"__new__() got an unexpected keyword argument {argument}") - elif len(fields): - # GH#55960 - warnings.warn( - "Constructing PeriodIndex from fields is deprecated. Use " - "PeriodIndex.from_fields instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - - if ordinal is not None: - # GH#55960 - warnings.warn( - "The 'ordinal' keyword in PeriodIndex is deprecated and will " - "be removed in a future version. Use PeriodIndex.from_ordinals " - "instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - name = maybe_extract_name(name, data, cls) - if data is None and ordinal is None: - # range-based. - if not fields: - # test_pickle_compat_construction - cls._raise_scalar_data_error(None) - data = cls.from_fields(**fields, freq=freq)._data - copy = False + freq = validate_dtype_freq(dtype, freq) - elif fields: - if data is not None: - raise ValueError("Cannot pass both data and fields") - raise ValueError("Cannot pass both ordinal and fields") + # PeriodIndex allow PeriodIndex(period_index, freq=different) + # Let's not encourage that kind of behavior in PeriodArray. - else: - freq = validate_dtype_freq(dtype, freq) - - # PeriodIndex allow PeriodIndex(period_index, freq=different) - # Let's not encourage that kind of behavior in PeriodArray. - - if freq and isinstance(data, cls) and data.freq != freq: - # TODO: We can do some of these with no-copy / coercion? - # e.g. D -> 2D seems to be OK - data = data.asfreq(freq) - - if data is None and ordinal is not None: - ordinal = np.asarray(ordinal, dtype=np.int64) - dtype = PeriodDtype(freq) - data = PeriodArray(ordinal, dtype=dtype) - elif data is not None and ordinal is not None: - raise ValueError("Cannot pass both data and ordinal") - else: - # don't pass copy here, since we copy later. - data = period_array(data=data, freq=freq) + if freq and isinstance(data, cls) and data.freq != freq: + # TODO: We can do some of these with no-copy / coercion? + # e.g. D -> 2D seems to be OK + data = data.asfreq(freq) + + # don't pass copy here, since we copy later. + data = period_array(data=data, freq=freq) if copy: data = data.copy() @@ -331,6 +257,43 @@ def from_fields( second=None, freq=None, ) -> Self: + """ + Construct a PeriodIndex from fields (year, month, day, etc.). + + Parameters + ---------- + year : int, array, or Series, default None + Year for the PeriodIndex.
+ quarter : int, array, or Series, default None + Quarter for the PeriodIndex. + month : int, array, or Series, default None + Month for the PeriodIndex. + day : int, array, or Series, default None + Day for the PeriodIndex. + hour : int, array, or Series, default None + Hour for the PeriodIndex. + minute : int, array, or Series, default None + Minute for the PeriodIndex. + second : int, array, or Series, default None + Second for the PeriodIndex. + freq : str or period object, optional + One of pandas period strings or corresponding objects. + + Returns + ------- + PeriodIndex + + See Also + -------- + PeriodIndex.from_ordinals : Construct a PeriodIndex from ordinals. + PeriodIndex.to_timestamp : Cast to DatetimeArray/Index. + + Examples + -------- + >>> idx = pd.PeriodIndex.from_fields(year=[2000, 2002], quarter=[1, 3]) + >>> idx + PeriodIndex(['2000Q1', '2002Q3'], dtype='period[Q-DEC]') + """ fields = { "year": year, "quarter": quarter, @@ -346,6 +309,34 @@ def from_fields( @classmethod def from_ordinals(cls, ordinals, *, freq, name=None) -> Self: + """ + Construct a PeriodIndex from ordinals. + + Parameters + ---------- + ordinals : array-like of int + The period offsets from the proleptic Gregorian epoch. + freq : str or period object + One of pandas period strings or corresponding objects. + name : str, default None + Name of the resulting PeriodIndex. + + Returns + ------- + PeriodIndex + + See Also + -------- + PeriodIndex.from_fields : Construct a PeriodIndex from fields + (year, month, day, etc.). + PeriodIndex.to_timestamp : Cast to DatetimeArray/Index. + + Examples + -------- + >>> idx = pd.PeriodIndex.from_ordinals([-1, 0, 1], freq="Q") + >>> idx + PeriodIndex(['1969Q4', '1970Q1', '1970Q2'], dtype='period[Q-DEC]') + """ ordinals = np.asarray(ordinals, dtype=np.int64) dtype = PeriodDtype(freq) data = PeriodArray._simple_new(ordinals, dtype=dtype) @@ -574,18 +565,26 @@ def period_range( Returns ------- PeriodIndex + A PeriodIndex of fixed frequency periods. + + See Also + -------- + date_range : Returns a fixed frequency DatetimeIndex. + Period : Represents a period of time. + PeriodIndex : Immutable ndarray holding ordinal values indicating regular periods + in time. Notes ----- Of the three parameters: ``start``, ``end``, and ``periods``, exactly two must be specified. - To learn more about the frequency strings, please see `this link - `__. + To learn more about the frequency strings, please see + :ref:`this link`. Examples -------- - >>> pd.period_range(start='2017-01-01', end='2018-01-01', freq='M') + >>> pd.period_range(start="2017-01-01", end="2018-01-01", freq="M") PeriodIndex(['2017-01', '2017-02', '2017-03', '2017-04', '2017-05', '2017-06', '2017-07', '2017-08', '2017-09', '2017-10', '2017-11', '2017-12', '2018-01'], @@ -595,8 +594,11 @@ def period_range( endpoints for a ``PeriodIndex`` with frequency matching that of the ``period_range`` constructor. - >>> pd.period_range(start=pd.Period('2017Q1', freq='Q'), - ... end=pd.Period('2017Q2', freq='Q'), freq='M') + >>> pd.period_range( + ... start=pd.Period("2017Q1", freq="Q"), + ... end=pd.Period("2017Q2", freq="Q"), + ... freq="M", + ... 
) PeriodIndex(['2017-03', '2017-04', '2017-05', '2017-06'], dtype='period[M]') """ diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 62afcf8badb50..2db50bbbdfa37 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -1,6 +1,7 @@ from __future__ import annotations from collections.abc import ( + Callable, Hashable, Iterator, ) @@ -10,7 +11,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, cast, overload, @@ -22,15 +22,15 @@ index as libindex, lib, ) -from pandas._libs.algos import unique_deltas from pandas._libs.lib import no_default from pandas.compat.numpy import function as nv from pandas.util._decorators import ( cache_readonly, - deprecate_nonkeyword_arguments, doc, + set_module, ) +from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import ( ensure_platform_int, ensure_python_int, @@ -44,6 +44,7 @@ from pandas.core import ops import pandas.core.common as com from pandas.core.construction import extract_array +from pandas.core.indexers import check_array_indexer import pandas.core.indexes.base as ibase from pandas.core.indexes.base import ( Index, @@ -55,14 +56,26 @@ from pandas._typing import ( Axis, Dtype, + JoinHow, NaPosition, + NumpySorter, Self, npt, ) + + from pandas import Series + _empty_range = range(0) _dtype_int64 = np.dtype(np.int64) +def min_fitting_element(start: int, step: int, lower_limit: int) -> int: + """Returns the smallest element greater than or equal to the limit""" + no_steps = -(-(lower_limit - start) // abs(step)) + return start + abs(step) * no_steps + + +@set_module("pandas") class RangeIndex(Index): """ Immutable Index implementing a monotonic integer range. @@ -79,7 +92,9 @@ class RangeIndex(Index): start : int (default: 0), range, or other RangeIndex instance If int and "stop" is not given, interpreted as "stop" instead. stop : int (default: 0) + The end value of the range (exclusive). step : int (default: 1) + The step size of the range. dtype : np.int64 Unused, accepted for homogeneity with other index types. copy : bool, default False @@ -175,10 +190,31 @@ def from_range(cls, data: range, name=None, dtype: Dtype | None = None) -> Self: """ Create :class:`pandas.RangeIndex` from a ``range`` object. + This method provides a way to create a :class:`pandas.RangeIndex` directly + from a Python ``range`` object. The resulting :class:`RangeIndex` will have + the same start, stop, and step values as the input ``range`` object. + It is particularly useful for constructing indices in an efficient and + memory-friendly manner. + + Parameters + ---------- + data : range + The range object to be converted into a RangeIndex. + name : str, default None + Name to be stored in the index. + dtype : Dtype or None + Data type for the RangeIndex. If None, the default integer type will + be used. + Returns ------- RangeIndex + See Also + -------- + RangeIndex : Immutable Index implementing a monotonic integer range. + Index : Immutable sequence used for indexing and alignment. + Examples -------- >>> pd.RangeIndex.from_range(range(5)) @@ -190,7 +226,7 @@ def from_range(cls, data: range, name=None, dtype: Dtype | None = None) -> Self: if not isinstance(data, range): raise TypeError( f"{cls.__name__}(...) 
must be called with object coercible to a " - f"range, {repr(data)} was passed" + f"range, {data!r} was passed" ) cls._validate_dtype(dtype) return cls._simple_new(data, name=name) @@ -282,6 +318,16 @@ def start(self) -> int: """ The value of the `start` parameter (``0`` if this was not supplied). + This property returns the starting value of the `RangeIndex`. If the `start` + value is not explicitly provided during the creation of the `RangeIndex`, + it defaults to 0. + + See Also + -------- + RangeIndex : Immutable index implementing a range-based index. + RangeIndex.stop : Returns the stop value of the `RangeIndex`. + RangeIndex.step : Returns the step value of the `RangeIndex`. + Examples -------- >>> idx = pd.RangeIndex(5) @@ -300,6 +346,17 @@ def stop(self) -> int: """ The value of the `stop` parameter. + This property returns the `stop` value of the RangeIndex, which defines the + upper (or lower, in case of negative steps) bound of the index range. The + `stop` value is exclusive, meaning the RangeIndex includes values up to but + not including this value. + + See Also + -------- + RangeIndex : Immutable index representing a range of integers. + RangeIndex.start : The start value of the RangeIndex. + RangeIndex.step : The step size between elements in the RangeIndex. + Examples -------- >>> idx = pd.RangeIndex(5) @@ -317,6 +374,15 @@ def step(self) -> int: """ The value of the `step` parameter (``1`` if this was not supplied). + The ``step`` parameter determines the increment (or decrement in the case + of negative values) between consecutive elements in the ``RangeIndex``. + + See Also + -------- + RangeIndex : Immutable index implementing a range-based index. + RangeIndex.stop : Returns the stop value of the RangeIndex. + RangeIndex.start : Returns the start value of the RangeIndex. 
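The module-level `min_fitting_element` helper hoisted to the top of this file (replacing the `RangeIndex._min_fitting_element` method removed later in this diff) computes, for the arithmetic progression `start, start + step, ...`, the smallest element at or above `lower_limit`, using ceiling division on `abs(step)`. A hedged sketch of what it returns (the helper is private; import path per this diff):

>>> from pandas.core.indexes.range import min_fitting_element
>>> min_fitting_element(0, 3, 7)  # elements 0, 3, 6, 9, ...; first one >= 7
9
>>> min_fitting_element(5, -2, 0)  # abs(step) is used: values 5, 3, 1, ...; smallest >= 0
1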
+ Examples -------- >>> idx = pd.RangeIndex(5) @@ -394,7 +460,7 @@ def __contains__(self, key: Any) -> bool: hash(key) try: key = ensure_python_int(key) - except TypeError: + except (TypeError, OverflowError): return False return key in self._range @@ -470,28 +536,35 @@ def _shallow_copy(self, values, name: Hashable = no_default): if values.dtype.kind == "f": return Index(values, name=name, dtype=np.float64) - # GH 46675 & 43885: If values is equally spaced, return a - # more memory-compact RangeIndex instead of Index with 64-bit dtype - unique_diffs = unique_deltas(values) - if len(unique_diffs) == 1 and unique_diffs[0] != 0: - diff = unique_diffs[0] - new_range = range(values[0], values[-1] + diff, diff) - return type(self)._simple_new(new_range, name=name) - else: - return self._constructor._simple_new(values, name=name) + if values.dtype.kind == "i" and values.ndim == 1: + # GH 46675 & 43885: If values is equally spaced, return a + # more memory-compact RangeIndex instead of Index with 64-bit dtype + if len(values) == 1: + start = values[0] + new_range = range(start, start + self.step, self.step) + return type(self)._simple_new(new_range, name=name) + maybe_range = ibase.maybe_sequence_to_range(values) + if isinstance(maybe_range, range): + return type(self)._simple_new(maybe_range, name=name) + return self._constructor._simple_new(values, name=name) def _view(self) -> Self: result = type(self)._simple_new(self._range, name=self._name) result._cache = self._cache return result + def _wrap_reindex_result(self, target, indexer, preserve_names: bool): + if not isinstance(target, type(self)) and target.dtype.kind == "i": + target = self._shallow_copy(target._values, name=target.name) + return super()._wrap_reindex_result(target, indexer, preserve_names) + @doc(Index.copy) def copy(self, name: Hashable | None = None, deep: bool = False) -> Self: name = self._validate_names(name=name, deep=deep)[0] new_index = self._rename(name=name) return new_index - def _minmax(self, meth: str): + def _minmax(self, meth: Literal["min", "max"]) -> int | float: no_steps = len(self) - 1 if no_steps == -1: return np.nan @@ -500,18 +573,51 @@ def _minmax(self, meth: str): return self.start + self.step * no_steps - def min(self, axis=None, skipna: bool = True, *args, **kwargs) -> int: + def min(self, axis=None, skipna: bool = True, *args, **kwargs) -> int | float: """The minimum value of the RangeIndex""" nv.validate_minmax_axis(axis) nv.validate_min(args, kwargs) return self._minmax("min") - def max(self, axis=None, skipna: bool = True, *args, **kwargs) -> int: + def max(self, axis=None, skipna: bool = True, *args, **kwargs) -> int | float: """The maximum value of the RangeIndex""" nv.validate_minmax_axis(axis) nv.validate_max(args, kwargs) return self._minmax("max") + def _argminmax( + self, + meth: Literal["min", "max"], + axis=None, + skipna: bool = True, + ) -> int: + nv.validate_minmax_axis(axis) + if len(self) == 0: + return getattr(super(), f"arg{meth}")( + axis=axis, + skipna=skipna, + ) + elif meth == "min": + if self.step > 0: + return 0 + else: + return len(self) - 1 + elif meth == "max": + if self.step > 0: + return len(self) - 1 + else: + return 0 + else: + raise ValueError(f"{meth=} must be max or min") + + def argmin(self, axis=None, skipna: bool = True, *args, **kwargs) -> int: + nv.validate_argmin(args, kwargs) + return self._argminmax("min", axis=axis, skipna=skipna) + + def argmax(self, axis=None, skipna: bool = True, *args, **kwargs) -> int: + nv.validate_argmax(args, kwargs) + return 
self._argminmax("max", axis=axis, skipna=skipna) + def argsort(self, *args, **kwargs) -> npt.NDArray[np.intp]: """ Returns the indices that would sort the index and its @@ -529,25 +635,30 @@ def argsort(self, *args, **kwargs) -> npt.NDArray[np.intp]: kwargs.pop("kind", None) # e.g. "mergesort" is irrelevant nv.validate_argsort(args, kwargs) + start, stop, step = None, None, None if self._range.step > 0: - result = np.arange(len(self), dtype=np.intp) + if ascending: + start = len(self) + else: + start, stop, step = len(self) - 1, -1, -1 + elif ascending: + start, stop, step = len(self) - 1, -1, -1 else: - result = np.arange(len(self) - 1, -1, -1, dtype=np.intp) + start = len(self) - if not ascending: - result = result[::-1] - return result + return np.arange(start, stop, step, dtype=np.intp) def factorize( self, sort: bool = False, use_na_sentinel: bool = True, ) -> tuple[npt.NDArray[np.intp], RangeIndex]: - codes = np.arange(len(self), dtype=np.intp) - uniques = self if sort and self.step < 0: - codes = codes[::-1] - uniques = uniques[::-1] + codes = np.arange(len(self) - 1, -1, -1, dtype=np.intp) + uniques = self[::-1] + else: + codes = np.arange(len(self), dtype=np.intp) + uniques = self return codes, uniques def equals(self, other: object) -> bool: @@ -567,8 +678,7 @@ def sort_values( ascending: bool = ..., na_position: NaPosition = ..., key: Callable | None = ..., - ) -> Self: - ... + ) -> Self: ... @overload def sort_values( @@ -578,8 +688,7 @@ def sort_values( ascending: bool = ..., na_position: NaPosition = ..., key: Callable | None = ..., - ) -> tuple[Self, np.ndarray | RangeIndex]: - ... + ) -> tuple[Self, np.ndarray | RangeIndex]: ... @overload def sort_values( @@ -589,14 +698,11 @@ def sort_values( ascending: bool = ..., na_position: NaPosition = ..., key: Callable | None = ..., - ) -> Self | tuple[Self, np.ndarray | RangeIndex]: - ... + ) -> Self | tuple[Self, np.ndarray | RangeIndex]: ... - @deprecate_nonkeyword_arguments( - version="3.0", allowed_args=["self"], name="sort_values" - ) def sort_values( self, + *, return_indexer: bool = False, ascending: bool = True, na_position: NaPosition = "last", @@ -663,26 +769,15 @@ def _intersection(self, other: Index, sort: bool = False): # intersection disregarding the lower bounds tmp_start = first.start + (second.start - first.start) * first.step // gcd * s new_step = first.step * second.step // gcd - new_range = range(tmp_start, int_high, new_step) - new_index = self._simple_new(new_range) # adjust index to limiting interval - new_start = new_index._min_fitting_element(int_low) - new_range = range(new_start, new_index.stop, new_index.step) - new_index = self._simple_new(new_range) - - if (self.step < 0 and other.step < 0) is not (new_index.step < 0): - new_index = new_index[::-1] + new_start = min_fitting_element(tmp_start, new_step, int_low) + new_range = range(new_start, int_high, new_step) - if sort is None: - new_index = new_index.sort_values() + if (self.step < 0 and other.step < 0) is not (new_range.step < 0): + new_range = new_range[::-1] - return new_index - - def _min_fitting_element(self, lower_limit: int) -> int: - """Returns the smallest element greater than or equal to the limit""" - no_steps = -(-(lower_limit - self.start) // abs(self.step)) - return self.start + abs(self.step) * no_steps + return self._simple_new(new_range) def _extended_gcd(self, a: int, b: int) -> tuple[int, int, int]: """ @@ -868,15 +963,15 @@ def _difference(self, other, sort=None): # e.g. 
range(10) and range(0, 10, 3) return super()._difference(other, sort=sort) - new_index = type(self)._simple_new(new_rng, name=res_name) if first is not self._range: - new_index = new_index[::-1] + new_rng = new_rng[::-1] + new_index = type(self)._simple_new(new_rng, name=res_name) return new_index def symmetric_difference( self, other, result_name: Hashable | None = None, sort=None - ): + ) -> Index: if not isinstance(other, RangeIndex) or sort is not None: return super().symmetric_difference(other, result_name, sort) @@ -888,6 +983,48 @@ def symmetric_difference( result = result.rename(result_name) return result + def _join_empty( + self, other: Index, how: JoinHow, sort: bool + ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: + if not isinstance(other, RangeIndex) and other.dtype.kind == "i": + other = self._shallow_copy(other._values, name=other.name) + return super()._join_empty(other, how=how, sort=sort) + + def _join_monotonic( + self, other: Index, how: JoinHow = "left" + ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: + # This currently only gets called for the monotonic increasing case + if not isinstance(other, type(self)): + maybe_ri = self._shallow_copy(other._values, name=other.name) + if not isinstance(maybe_ri, type(self)): + return super()._join_monotonic(other, how=how) + other = maybe_ri + + if self.equals(other): + ret_index = other if how == "right" else self + return ret_index, None, None + + if how == "left": + join_index = self + lidx = None + ridx = other.get_indexer(join_index) + elif how == "right": + join_index = other + lidx = self.get_indexer(join_index) + ridx = None + elif how == "inner": + join_index = self.intersection(other) + lidx = self.get_indexer(join_index) + ridx = other.get_indexer(join_index) + elif how == "outer": + join_index = self.union(other) + lidx = self.get_indexer(join_index) + ridx = other.get_indexer(join_index) + + lidx = None if lidx is None else ensure_platform_int(lidx) + ridx = None if ridx is None else ensure_platform_int(ridx) + return join_index, lidx, ridx + # -------------------------------------------------------------------- # error: Return type "Index" of "delete" incompatible with return type @@ -915,23 +1052,27 @@ def delete(self, loc) -> Index: # type: ignore[override] return super().delete(loc) def insert(self, loc: int, item) -> Index: - if len(self) and (is_integer(item) or is_float(item)): + if is_integer(item) or is_float(item): # We can retain RangeIndex is inserting at the beginning or end, # or right in the middle. - rng = self._range - if loc == 0 and item == self[0] - self.step: - new_rng = range(rng.start - rng.step, rng.stop, rng.step) - return type(self)._simple_new(new_rng, name=self._name) - - elif loc == len(self) and item == self[-1] + self.step: - new_rng = range(rng.start, rng.stop + rng.step, rng.step) - return type(self)._simple_new(new_rng, name=self._name) - - elif len(self) == 2 and item == self[0] + self.step / 2: - # e.g. 
inserting 1 into [0, 2] - step = int(self.step / 2) - new_rng = range(self.start, self.stop, step) + if len(self) == 0 and loc == 0 and is_integer(item): + new_rng = range(item, item + self.step, self.step) return type(self)._simple_new(new_rng, name=self._name) + elif len(self): + rng = self._range + if loc == 0 and item == self[0] - self.step: + new_rng = range(rng.start - rng.step, rng.stop, rng.step) + return type(self)._simple_new(new_rng, name=self._name) + + elif loc == len(self) and item == self[-1] + self.step: + new_rng = range(rng.start, rng.stop + rng.step, rng.step) + return type(self)._simple_new(new_rng, name=self._name) + + elif len(self) == 2 and item == self[0] + self.step / 2: + # e.g. inserting 1 into [0, 2] + step = int(self.step / 2) + new_rng = range(self.start, self.stop, step) + return type(self)._simple_new(new_rng, name=self._name) return super().insert(loc, item) @@ -945,7 +1086,10 @@ def _concat(self, indexes: list[Index], name: Hashable) -> Index: indexes = [RangeIndex(3), RangeIndex(4, 6)] -> Index([0,1,2,4,5], dtype='int64') """ if not all(isinstance(x, RangeIndex) for x in indexes): - return super()._concat(indexes, name) + result = super()._concat(indexes, name) + if result.dtype.kind == "i": + return self._shallow_copy(result._values) + return result elif len(indexes) == 1: return indexes[0] @@ -955,7 +1099,17 @@ def _concat(self, indexes: list[Index], name: Hashable) -> Index: start = step = next_ = None # Filter the empty indexes - non_empty_indexes = [obj for obj in rng_indexes if len(obj)] + non_empty_indexes = [] + all_same_index = True + prev: RangeIndex | None = None + for obj in rng_indexes: + if len(obj): + non_empty_indexes.append(obj) + if all_same_index: + if prev is not None: + all_same_index = prev.equals(obj) + else: + prev = obj for obj in non_empty_indexes: rng = obj._range @@ -968,7 +1122,12 @@ def _concat(self, indexes: list[Index], name: Hashable) -> Index: elif step is None: # First non-empty index had only one element if rng.start == start: - values = np.concatenate([x._values for x in rng_indexes]) + if all_same_index: + values = np.tile( + non_empty_indexes[0]._values, len(non_empty_indexes) + ) + else: + values = np.concatenate([x._values for x in rng_indexes]) result = self._constructor(values) return result.rename(name) @@ -978,9 +1137,13 @@ def _concat(self, indexes: list[Index], name: Hashable) -> Index: next_ is not None and rng.start != next_ ) if non_consecutive: - result = self._constructor( - np.concatenate([x._values for x in rng_indexes]) - ) + if all_same_index: + values = np.tile( + non_empty_indexes[0]._values, len(non_empty_indexes) + ) + else: + values = np.concatenate([x._values for x in rng_indexes]) + result = self._constructor(values) return result.rename(name) if step is not None: @@ -990,11 +1153,13 @@ def _concat(self, indexes: list[Index], name: Hashable) -> Index: # Get the stop value from "next" or alternatively # from the last non-empty index stop = non_empty_indexes[-1].stop if next_ is None else next_ - return RangeIndex(start, stop, step).rename(name) + if len(non_empty_indexes) == 1: + step = non_empty_indexes[0].step + return RangeIndex(start, stop, step, name=name) # Here all "indexes" had 0 length, i.e. were empty. # In this case return an empty range index. - return RangeIndex(0, 0).rename(name) + return RangeIndex(_empty_range, name=name) def __len__(self) -> int: """ @@ -1010,6 +1175,8 @@ def __getitem__(self, key): """ Conserve RangeIndex type for scalar and slice keys. 
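To make the `insert` fast paths above concrete: a `RangeIndex` is preserved when the item extends the range by one step at either end, fills the exact midpoint of a length-2 range, or starts a fresh range from empty (the new branch above). A minimal sketch, assuming pandas is importable:

>>> import pandas as pd
>>> pd.RangeIndex(1, 4).insert(0, 0)  # prepend start - step
RangeIndex(start=0, stop=4, step=1)
>>> pd.RangeIndex(0, 4, 2).insert(1, 1)  # midpoint of a length-2 range
RangeIndex(start=0, stop=4, step=1)
>>> pd.RangeIndex(0).insert(0, 5)  # empty case handled by the new branch
RangeIndex(start=5, stop=6, step=1)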
""" + if key is Ellipsis: + key = slice(None) if isinstance(key, slice): return self._getitem_slice(key) elif is_integer(key): @@ -1027,7 +1194,17 @@ def __getitem__(self, key): "and integer or boolean " "arrays are valid indices" ) - return super().__getitem__(key) + elif com.is_bool_indexer(key): + if isinstance(getattr(key, "dtype", None), ExtensionDtype): + key = key.to_numpy(dtype=bool, na_value=False) + else: + key = np.asarray(key, dtype=bool) + check_array_indexer(self._range, key) # type: ignore[arg-type] + key = np.flatnonzero(key) + try: + return self.take(key) + except (TypeError, ValueError): + return super().__getitem__(key) def _getitem_slice(self, slobj: slice) -> Self: """ @@ -1039,7 +1216,7 @@ def _getitem_slice(self, slobj: slice) -> Self: @unpack_zerodim_and_defer("__floordiv__") def __floordiv__(self, other): if is_integer(other) and other != 0: - if len(self) == 0 or self.start % other == 0 and self.step % other == 0: + if len(self) == 0 or (self.start % other == 0 and self.step % other == 0): start = self.start // other step = self.step // other stop = start + len(self) * step @@ -1063,6 +1240,42 @@ def any(self, *args, **kwargs) -> bool: # -------------------------------------------------------------------- + # error: Return type "RangeIndex | Index" of "round" incompatible with + # return type "RangeIndex" in supertype "Index" + def round(self, decimals: int = 0) -> Self | Index: # type: ignore[override] + """ + Round each value in the Index to the given number of decimals. + + Parameters + ---------- + decimals : int, optional + Number of decimal places to round to. If decimals is negative, + it specifies the number of positions to the left of the decimal point + e.g. ``round(11.0, -1) == 10.0``. + + Returns + ------- + Index or RangeIndex + A new Index with the rounded values. + + Examples + -------- + >>> import pandas as pd + >>> idx = pd.RangeIndex(10, 30, 10) + >>> idx.round(decimals=-1) + RangeIndex(start=10, stop=30, step=10) + >>> idx = pd.RangeIndex(10, 15, 1) + >>> idx.round(decimals=-1) + Index([10, 10, 10, 10, 10], dtype='int64') + """ + if decimals >= 0: + return self.copy() + elif self.start % 10**-decimals == 0 and self.step % 10**-decimals == 0: + # e.g. RangeIndex(10, 30, 10).round(-1) doesn't need rounding + return self.copy() + else: + return super().round(decimals=decimals) + def _cmp_method(self, other, op): if isinstance(other, RangeIndex) and self._range == other._range: # Both are immutable so if ._range attr. 
are equal, shortcut is possible @@ -1143,6 +1356,27 @@ def _arith_method(self, other, op): # test_arithmetic_explicit_conversions return super()._arith_method(other, op) + def __abs__(self) -> Self | Index: + if len(self) == 0 or self.min() >= 0: + return self.copy() + elif self.max() <= 0: + return -self + else: + return super().__abs__() + + def __neg__(self) -> Self: + rng = range(-self.start, -self.stop, -self.step) + return self._simple_new(rng, name=self.name) + + def __pos__(self) -> Self: + return self.copy() + + def __invert__(self) -> Self: + if len(self) == 0: + return self.copy() + rng = range(~self.start, ~self.stop, -self.step) + return self._simple_new(rng, name=self.name) + # error: Return type "Index" of "take" incompatible with return type # "RangeIndex" in supertype "Index" def take( # type: ignore[override] @@ -1152,7 +1386,7 @@ def take( # type: ignore[override] allow_fill: bool = True, fill_value=None, **kwargs, - ) -> Index: + ) -> Self | Index: if kwargs: nv.validate_take((), kwargs) if is_scalar(indices): @@ -1163,7 +1397,7 @@ def take( # type: ignore[override] self._maybe_disallow_fill(allow_fill, fill_value, indices) if len(indices) == 0: - taken = np.array([], dtype=self.dtype) + return type(self)(_empty_range, name=self.name) else: ind_max = indices.max() if ind_max >= len(self): @@ -1183,5 +1417,65 @@ def take( # type: ignore[override] if self.start != 0: taken += self.start - # _constructor so RangeIndex-> Index with an int64 dtype - return self._constructor._simple_new(taken, name=self.name) + return self._shallow_copy(taken, name=self.name) + + def value_counts( + self, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + bins=None, + dropna: bool = True, + ) -> Series: + from pandas import Series + + if bins is not None: + return super().value_counts( + normalize=normalize, + sort=sort, + ascending=ascending, + bins=bins, + dropna=dropna, + ) + name = "proportion" if normalize else "count" + data: npt.NDArray[np.floating] | npt.NDArray[np.signedinteger] = np.ones( + len(self), dtype=np.int64 + ) + if normalize: + data = data / len(self) + return Series(data, index=self.copy(), name=name) + + def searchsorted( # type: ignore[override] + self, + value, + side: Literal["left", "right"] = "left", + sorter: NumpySorter | None = None, + ) -> npt.NDArray[np.intp] | np.intp: + if side not in {"left", "right"} or sorter is not None: + return super().searchsorted(value=value, side=side, sorter=sorter) + + was_scalar = False + if is_scalar(value): + was_scalar = True + array_value = np.array([value]) + else: + array_value = np.asarray(value) + if array_value.dtype.kind not in "iu": + return super().searchsorted(value=value, side=side, sorter=sorter) + + if flip := (self.step < 0): + rng = self._range[::-1] + start = rng.start + step = rng.step + shift = side == "right" + else: + start = self.start + step = self.step + shift = side == "left" + result = (array_value - start - int(shift)) // step + 1 + if flip: + result = len(self) - result + result = np.maximum(np.minimum(result, len(self)), 0) + if was_scalar: + return np.intp(result.item()) + return result.astype(np.intp, copy=False) diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 08a265ba47648..fa3de46621643 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -1,8 +1,8 @@ -""" implement the TimedeltaIndex """ +"""implement the TimedeltaIndex""" + from __future__ import annotations from typing import TYPE_CHECKING 
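Before the diff moves on to timedeltas.py, a note on the unary ops added above: they lean on the integer identity ``~n == -n - 1``, so negating or inverting a range yields another range without materializing any values. A hedged sketch:

>>> import pandas as pd
>>> ~pd.RangeIndex(0, 3)  # ~[0, 1, 2] == [-1, -2, -3]
RangeIndex(start=-1, stop=-4, step=-1)
>>> -pd.RangeIndex(0, 3)
RangeIndex(start=0, stop=-3, step=-1)
>>> abs(pd.RangeIndex(-3, 0))  # max <= 0, so this is just -self
RangeIndex(start=3, stop=0, step=-1)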
-import warnings from pandas._libs import ( index as libindex, @@ -13,8 +13,7 @@ Timedelta, to_offset, ) -from pandas._libs.tslibs.timedeltas import disallow_ambiguous_unit -from pandas.util._exceptions import find_stack_level +from pandas.util._decorators import set_module from pandas.core.dtypes.common import ( is_scalar, @@ -32,6 +31,7 @@ from pandas.core.indexes.extension import inherit_names if TYPE_CHECKING: + from pandas._libs import NaTType from pandas._typing import DtypeObj @@ -51,6 +51,7 @@ ], TimedeltaArray, ) +@set_module("pandas") class TimedeltaIndex(DatetimeTimedeltaMixin): """ Immutable Index of timedelta64 data. @@ -61,12 +62,6 @@ class TimedeltaIndex(DatetimeTimedeltaMixin): ---------- data : array-like (1-dimensional), optional Optional timedelta-like data to construct index with. - unit : {'D', 'h', 'm', 's', 'ms', 'us', 'ns'}, optional - The unit of ``data``. - - .. deprecated:: 2.2.0 - Use ``pd.to_timedelta`` instead. - freq : str or pandas offset object, optional One of pandas date offset strings or corresponding objects. The string ``'infer'`` can be passed in order to set the frequency of the index as @@ -108,18 +103,18 @@ class TimedeltaIndex(DatetimeTimedeltaMixin): Notes ----- - To learn more about the frequency strings, please see `this link - `__. + To learn more about the frequency strings, please see + :ref:`this link`. Examples -------- - >>> pd.TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days']) + >>> pd.TimedeltaIndex(["0 days", "1 days", "2 days", "3 days", "4 days"]) TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], dtype='timedelta64[ns]', freq=None) We can also let pandas infer the frequency when possible. - >>> pd.TimedeltaIndex(np.arange(5) * 24 * 3600 * 1e9, freq='infer') + >>> pd.TimedeltaIndex(np.arange(5) * 24 * 3600 * 1e9, freq="infer") TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], dtype='timedelta64[ns]', freq='D') """ @@ -149,40 +144,16 @@ def _resolution_obj(self) -> Resolution | None: # type: ignore[override] def __new__( cls, data=None, - unit=lib.no_default, freq=lib.no_default, - closed=lib.no_default, dtype=None, copy: bool = False, name=None, ): - if closed is not lib.no_default: - # GH#52628 - warnings.warn( - f"The 'closed' keyword in {cls.__name__} construction is " - "deprecated and will be removed in a future version.", - FutureWarning, - stacklevel=find_stack_level(), - ) - - if unit is not lib.no_default: - # GH#55499 - warnings.warn( - f"The 'unit' keyword in {cls.__name__} construction is " - "deprecated and will be removed in a future version. 
" - "Use pd.to_timedelta instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - unit = None - name = maybe_extract_name(name, data, cls) if is_scalar(data): cls._raise_scalar_data_error(data) - disallow_ambiguous_unit(unit) if dtype is not None: dtype = pandas_dtype(dtype) @@ -209,7 +180,7 @@ def __new__( # - Cases checked above all return/raise before reaching here - # tdarr = TimedeltaArray._from_sequence_not_strict( - data, freq=freq, unit=unit, dtype=dtype, copy=copy + data, freq=freq, unit=None, dtype=dtype, copy=copy ) refs = None if not copy and isinstance(data, (ABCSeries, Index)): @@ -245,7 +216,10 @@ def get_loc(self, key): return Index.get_loc(self, key) - def _parse_with_reso(self, label: str): + # error: Return type "tuple[Timedelta | NaTType, None]" of "_parse_with_reso" + # incompatible with return type "tuple[datetime, Resolution]" in supertype + # "DatetimeIndexOpsMixin" + def _parse_with_reso(self, label: str) -> tuple[Timedelta | NaTType, None]: # type: ignore[override] # the "with_reso" is a no-op for TimedeltaIndex parsed = Timedelta(label) return parsed, None @@ -263,6 +237,7 @@ def inferred_type(self) -> str: return "timedelta64" +@set_module("pandas") def timedelta_range( start=None, end=None, @@ -299,6 +274,12 @@ def timedelta_range( Returns ------- TimedeltaIndex + Fixed frequency, with day as the default. + + See Also + -------- + date_range : Return a fixed frequency DatetimeIndex. + period_range : Return a fixed frequency PeriodIndex. Notes ----- @@ -307,19 +288,19 @@ def timedelta_range( ``TimedeltaIndex`` will have ``periods`` linearly spaced elements between ``start`` and ``end`` (closed on both sides). - To learn more about the frequency strings, please see `this link - `__. + To learn more about the frequency strings, please see + :ref:`this link`. Examples -------- - >>> pd.timedelta_range(start='1 day', periods=4) + >>> pd.timedelta_range(start="1 day", periods=4) TimedeltaIndex(['1 days', '2 days', '3 days', '4 days'], dtype='timedelta64[ns]', freq='D') The ``closed`` parameter specifies which endpoint is included. The default behavior is to include both endpoints. - >>> pd.timedelta_range(start='1 day', periods=4, closed='right') + >>> pd.timedelta_range(start="1 day", periods=4, closed="right") TimedeltaIndex(['2 days', '3 days', '4 days'], dtype='timedelta64[ns]', freq='D') @@ -327,7 +308,7 @@ def timedelta_range( Only fixed frequencies can be passed, non-fixed frequencies such as 'M' (month end) will raise. - >>> pd.timedelta_range(start='1 day', end='2 days', freq='6h') + >>> pd.timedelta_range(start="1 day", end="2 days", freq="6h") TimedeltaIndex(['1 days 00:00:00', '1 days 06:00:00', '1 days 12:00:00', '1 days 18:00:00', '2 days 00:00:00'], dtype='timedelta64[ns]', freq='6h') @@ -335,7 +316,7 @@ def timedelta_range( Specify ``start``, ``end``, and ``periods``; the frequency is generated automatically (linearly spaced). 
- >>> pd.timedelta_range(start='1 day', end='5 days', periods=4) + >>> pd.timedelta_range(start="1 day", end="5 days", periods=4) TimedeltaIndex(['1 days 00:00:00', '2 days 08:00:00', '3 days 16:00:00', '5 days 00:00:00'], dtype='timedelta64[ns]', freq=None) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 4be7e17035128..34a437ba40bd8 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -13,11 +13,6 @@ import numpy as np -from pandas._config import ( - using_copy_on_write, - warn_copy_on_write, -) - from pandas._libs.indexing import NDFrameIndexerBase from pandas._libs.lib import item_from_zerodim from pandas.compat import PYPY @@ -27,12 +22,9 @@ IndexingError, InvalidIndexError, LossySetitemError, - _chained_assignment_msg, - _chained_assignment_warning_msg, - _check_cacher, ) +from pandas.errors.cow import _chained_assignment_msg from pandas.util._decorators import doc -from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.cast import ( can_hold_element, @@ -57,6 +49,7 @@ ABCSeries, ) from pandas.core.dtypes.missing import ( + construct_1d_array_from_inferred_fill_value, infer_fill_value, is_valid_na_for_dtype, isna, @@ -68,7 +61,6 @@ from pandas.core.construction import ( array as pd_array, extract_array, - sanitize_array, ) from pandas.core.indexers import ( check_array_indexer, @@ -121,14 +113,17 @@ class _IndexSlice: Examples -------- - >>> midx = pd.MultiIndex.from_product([['A0','A1'], ['B0','B1','B2','B3']]) - >>> columns = ['foo', 'bar'] - >>> dfmi = pd.DataFrame(np.arange(16).reshape((len(midx), len(columns))), - ... index=midx, columns=columns) + >>> midx = pd.MultiIndex.from_product([["A0", "A1"], ["B0", "B1", "B2", "B3"]]) + >>> columns = ["foo", "bar"] + >>> dfmi = pd.DataFrame( + ... np.arange(16).reshape((len(midx), len(columns))), + ... index=midx, + ... columns=columns, + ... ) Using the default slice command: - >>> dfmi.loc[(slice(None), slice('B0', 'B1')), :] + >>> dfmi.loc[(slice(None), slice("B0", "B1")), :] foo bar A0 B0 0 1 B1 2 3 @@ -138,7 +133,7 @@ class _IndexSlice: Using the IndexSlice class for a more intuitive command: >>> idx = pd.IndexSlice - >>> dfmi.loc[idx[:, 'B0':'B1'], :] + >>> dfmi.loc[idx[:, "B0":"B1"], :] foo bar A0 B0 0 1 B1 2 3 @@ -163,9 +158,9 @@ def iloc(self) -> _iLocIndexer: """ Purely integer-location based indexing for selection by position. - .. deprecated:: 2.2.0 + .. versionchanged:: 3.0 - Returning a tuple from a callable is deprecated. + Callables which return a tuple are deprecated as input. ``.iloc[]`` is primarily integer position based (from ``0`` to ``length-1`` of the axis), but may also be used with a boolean @@ -200,9 +195,11 @@ def iloc(self) -> _iLocIndexer: Examples -------- - >>> mydict = [{'a': 1, 'b': 2, 'c': 3, 'd': 4}, - ... {'a': 100, 'b': 200, 'c': 300, 'd': 400}, - ... {'a': 1000, 'b': 2000, 'c': 3000, 'd': 4000}] + >>> mydict = [ + ... {"a": 1, "b": 2, "c": 3, "d": 4}, + ... {"a": 100, "b": 200, "c": 300, "d": 400}, + ... {"a": 1000, "b": 2000, "c": 3000, "d": 4000}, + ... ] >>> df = pd.DataFrame(mydict) >>> df a b c d @@ -215,7 +212,7 @@ def iloc(self) -> _iLocIndexer: With a scalar integer. >>> type(df.iloc[0]) - + >>> df.iloc[0] a 1 b 2 @@ -229,7 +226,7 @@ def iloc(self) -> _iLocIndexer: a b c d 0 1 2 3 4 >>> type(df.iloc[[0]]) - + >>> df.iloc[[0, 1]] a b c d @@ -268,7 +265,7 @@ def iloc(self) -> _iLocIndexer: With scalar integers. >>> df.iloc[0, 1] - 2 + np.int64(2) With lists of integers. 
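The repeated docstring churn in this file, where a plain ``2`` becomes ``np.int64(2)``, tracks NumPy 2's new scalar repr rather than any behavior change in pandas; scalar lookups return the same values as before, they just print differently in doctests. For instance (assuming NumPy >= 2):

>>> import numpy as np
>>> np.int64(2)
np.int64(2)
>>> int(np.int64(2))  # the underlying value is unchanged
2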
@@ -350,9 +347,11 @@ def loc(self) -> _LocIndexer: -------- **Getting values** - >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]], - ... index=['cobra', 'viper', 'sidewinder'], - ... columns=['max_speed', 'shield']) + >>> df = pd.DataFrame( + ... [[1, 2], [4, 5], [7, 8]], + ... index=["cobra", "viper", "sidewinder"], + ... columns=["max_speed", "shield"], + ... ) >>> df max_speed shield cobra 1 2 @@ -361,27 +360,27 @@ def loc(self) -> _LocIndexer: Single label. Note this returns the row as a Series. - >>> df.loc['viper'] + >>> df.loc["viper"] max_speed 4 shield 5 Name: viper, dtype: int64 List of labels. Note using ``[[]]`` returns a DataFrame. - >>> df.loc[['viper', 'sidewinder']] + >>> df.loc[["viper", "sidewinder"]] max_speed shield viper 4 5 sidewinder 7 8 Single label for row and column - >>> df.loc['cobra', 'shield'] - 2 + >>> df.loc["cobra", "shield"] + np.int64(2) Slice with labels for row and single label for column. As mentioned above, note that both the start and stop of the slice are included. - >>> df.loc['cobra':'viper', 'max_speed'] + >>> df.loc["cobra":"viper", "max_speed"] cobra 1 viper 4 Name: max_speed, dtype: int64 @@ -394,8 +393,9 @@ def loc(self) -> _LocIndexer: Alignable boolean Series: - >>> df.loc[pd.Series([False, True, False], - ... index=['viper', 'sidewinder', 'cobra'])] + >>> df.loc[ + ... pd.Series([False, True, False], index=["viper", "sidewinder", "cobra"]) + ... ] max_speed shield sidewinder 7 8 @@ -409,25 +409,25 @@ def loc(self) -> _LocIndexer: Conditional that returns a boolean Series - >>> df.loc[df['shield'] > 6] + >>> df.loc[df["shield"] > 6] max_speed shield sidewinder 7 8 Conditional that returns a boolean Series with column labels specified - >>> df.loc[df['shield'] > 6, ['max_speed']] + >>> df.loc[df["shield"] > 6, ["max_speed"]] max_speed sidewinder 7 Multiple conditional using ``&`` that returns a boolean Series - >>> df.loc[(df['max_speed'] > 1) & (df['shield'] < 8)] + >>> df.loc[(df["max_speed"] > 1) & (df["shield"] < 8)] max_speed shield viper 4 5 Multiple conditional using ``|`` that returns a boolean Series - >>> df.loc[(df['max_speed'] > 4) | (df['shield'] < 5)] + >>> df.loc[(df["max_speed"] > 4) | (df["shield"] < 5)] max_speed shield cobra 1 2 sidewinder 7 8 @@ -444,7 +444,7 @@ def loc(self) -> _LocIndexer: Callable that returns a boolean Series - >>> df.loc[lambda df: df['shield'] == 8] + >>> df.loc[lambda df: df["shield"] == 8] max_speed shield sidewinder 7 8 @@ -452,7 +452,7 @@ def loc(self) -> _LocIndexer: Set value for all items matching the list of labels - >>> df.loc[['viper', 'sidewinder'], ['shield']] = 50 + >>> df.loc[["viper", "sidewinder"], ["shield"]] = 50 >>> df max_speed shield cobra 1 2 @@ -461,7 +461,7 @@ def loc(self) -> _LocIndexer: Set value for an entire row - >>> df.loc['cobra'] = 10 + >>> df.loc["cobra"] = 10 >>> df max_speed shield cobra 10 10 @@ -470,7 +470,7 @@ def loc(self) -> _LocIndexer: Set value for an entire column - >>> df.loc[:, 'max_speed'] = 30 + >>> df.loc[:, "max_speed"] = 30 >>> df max_speed shield cobra 30 10 @@ -479,7 +479,7 @@ def loc(self) -> _LocIndexer: Set value for rows matching callable condition - >>> df.loc[df['shield'] > 35] = 0 + >>> df.loc[df["shield"] > 35] = 0 >>> df max_speed shield cobra 30 10 @@ -510,8 +510,11 @@ def loc(self) -> _LocIndexer: Another example using integers for the index - >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]], - ... index=[7, 8, 9], columns=['max_speed', 'shield']) + >>> df = pd.DataFrame( + ... [[1, 2], [4, 5], [7, 8]], + ... index=[7, 8, 9], + ... 
columns=["max_speed", "shield"], + ... ) >>> df max_speed shield 7 1 2 @@ -532,14 +535,16 @@ def loc(self) -> _LocIndexer: A number of examples using a DataFrame with a MultiIndex >>> tuples = [ - ... ('cobra', 'mark i'), ('cobra', 'mark ii'), - ... ('sidewinder', 'mark i'), ('sidewinder', 'mark ii'), - ... ('viper', 'mark ii'), ('viper', 'mark iii') + ... ("cobra", "mark i"), + ... ("cobra", "mark ii"), + ... ("sidewinder", "mark i"), + ... ("sidewinder", "mark ii"), + ... ("viper", "mark ii"), + ... ("viper", "mark iii"), ... ] >>> index = pd.MultiIndex.from_tuples(tuples) - >>> values = [[12, 2], [0, 4], [10, 20], - ... [1, 4], [7, 1], [16, 36]] - >>> df = pd.DataFrame(values, columns=['max_speed', 'shield'], index=index) + >>> values = [[12, 2], [0, 4], [10, 20], [1, 4], [7, 1], [16, 36]] + >>> df = pd.DataFrame(values, columns=["max_speed", "shield"], index=index) >>> df max_speed shield cobra mark i 12 2 @@ -551,14 +556,14 @@ def loc(self) -> _LocIndexer: Single label. Note this returns a DataFrame with a single index. - >>> df.loc['cobra'] + >>> df.loc["cobra"] max_speed shield mark i 12 2 mark ii 0 4 Single index tuple. Note this returns a Series. - >>> df.loc[('cobra', 'mark ii')] + >>> df.loc[("cobra", "mark ii")] max_speed 0 shield 4 Name: (cobra, mark ii), dtype: int64 @@ -566,25 +571,25 @@ def loc(self) -> _LocIndexer: Single label for row and column. Similar to passing in a tuple, this returns a Series. - >>> df.loc['cobra', 'mark i'] + >>> df.loc["cobra", "mark i"] max_speed 12 shield 2 Name: (cobra, mark i), dtype: int64 Single tuple. Note using ``[[]]`` returns a DataFrame. - >>> df.loc[[('cobra', 'mark ii')]] + >>> df.loc[[("cobra", "mark ii")]] max_speed shield cobra mark ii 0 4 Single tuple for the index with a single label for the column - >>> df.loc[('cobra', 'mark i'), 'shield'] - 2 + >>> df.loc[("cobra", "mark i"), "shield"] + np.int64(2) Slice from index tuple to single label - >>> df.loc[('cobra', 'mark i'):'viper'] + >>> df.loc[("cobra", "mark i") : "viper"] max_speed shield cobra mark i 12 2 mark ii 0 4 @@ -595,7 +600,7 @@ def loc(self) -> _LocIndexer: Slice from index tuple to index tuple - >>> df.loc[('cobra', 'mark i'):('viper', 'mark ii')] + >>> df.loc[("cobra", "mark i") : ("viper", "mark ii")] max_speed shield cobra mark i 12 2 mark ii 0 4 @@ -647,8 +652,11 @@ def at(self) -> _AtIndexer: Examples -------- - >>> df = pd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], - ... index=[4, 5, 6], columns=['A', 'B', 'C']) + >>> df = pd.DataFrame( + ... [[0, 2, 3], [0, 4, 1], [10, 20, 30]], + ... index=[4, 5, 6], + ... columns=["A", "B", "C"], + ... ) >>> df A B C 4 0 2 3 @@ -657,19 +665,19 @@ def at(self) -> _AtIndexer: Get value at specified row/column pair - >>> df.at[4, 'B'] - 2 + >>> df.at[4, "B"] + np.int64(2) Set value at specified row/column pair - >>> df.at[4, 'B'] = 10 - >>> df.at[4, 'B'] - 10 + >>> df.at[4, "B"] = 10 + >>> df.at[4, "B"] + np.int64(10) Get value within a Series - >>> df.loc[5].at['B'] - 4 + >>> df.loc[5].at["B"] + np.int64(4) """ return _AtIndexer("at", self) @@ -695,8 +703,9 @@ def iat(self) -> _iAtIndexer: Examples -------- - >>> df = pd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], - ... columns=['A', 'B', 'C']) + >>> df = pd.DataFrame( + ... [[0, 2, 3], [0, 4, 1], [10, 20, 30]], columns=["A", "B", "C"] + ... 
) >>> df A B C 0 0 2 3 @@ -706,18 +715,18 @@ def iat(self) -> _iAtIndexer: Get value at specified row/column pair >>> df.iat[1, 2] - 1 + np.int64(1) Set value at specified row/column pair >>> df.iat[1, 2] = 10 >>> df.iat[1, 2] - 10 + np.int64(10) Get value within a series >>> df.loc[0].iat[1] - 2 + np.int64(2) """ return _iAtIndexer("iat", self) @@ -747,7 +756,7 @@ def _get_setitem_indexer(self, key): """ if self.name == "loc": # always holds here bc iloc overrides _get_setitem_indexer - self._ensure_listlike_indexer(key) + self._ensure_listlike_indexer(key, axis=self.axis) if isinstance(key, tuple): for x in key: @@ -801,8 +810,11 @@ def _maybe_mask_setitem_value(self, indexer, value): if is_scalar_indexer(icols, self.ndim - 1) and ndim == 1: # e.g. test_loc_setitem_boolean_mask_allfalse - # test_loc_setitem_ndframe_values_alignment - value = self.obj.iloc._align_series(indexer, value) + if len(newkey) == 0: + value = value.iloc[:0] + else: + # test_loc_setitem_ndframe_values_alignment + value = self.obj.iloc._align_series(indexer, value) indexer = (newkey, icols) elif ( @@ -818,8 +830,11 @@ def _maybe_mask_setitem_value(self, indexer, value): indexer = (newkey, icols) elif ndim == 2 and value.shape[1] == 1: - # test_loc_setitem_ndframe_values_alignment - value = self.obj.iloc._align_frame(indexer, value) + if len(newkey) == 0: + value = value.iloc[:0] + else: + # test_loc_setitem_ndframe_values_alignment + value = self.obj.iloc._align_frame(indexer, value) indexer = (newkey, icols) elif com.is_bool_indexer(indexer): indexer = indexer.nonzero()[0] @@ -844,12 +859,13 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None) -> None: if self.ndim != 2: return - orig_key = key if isinstance(key, tuple) and len(key) > 1: # key may be a tuple if we are .loc # if length of key is > 1 set key to column part - key = key[column_axis] - axis = column_axis + # unless axis is already specified, then go with that + if axis is None: + axis = column_axis + key = key[axis] if ( axis == column_axis @@ -862,7 +878,7 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None) -> None: keys = self.obj.columns.union(key, sort=False) diff = Index(key).difference(self.obj.columns, sort=False) - if len(diff) and com.is_null_slice(orig_key[0]): + if len(diff): # e.g. 
if we are doing df.loc[:, ["A", "B"]] = 7 and "B" # is a new column, add the new columns with dtype=np.void # so that later when we go through setitem_single_column @@ -882,36 +898,28 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None) -> None: @final def __setitem__(self, key, value) -> None: - if not PYPY and using_copy_on_write(): + if not PYPY: if sys.getrefcount(self.obj) <= 2: warnings.warn( _chained_assignment_msg, ChainedAssignmentError, stacklevel=2 ) - elif not PYPY and not using_copy_on_write(): - ctr = sys.getrefcount(self.obj) - ref_count = 2 - if not warn_copy_on_write() and _check_cacher(self.obj): - # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 - ref_count += 1 - if ctr <= ref_count: - warnings.warn( - _chained_assignment_warning_msg, FutureWarning, stacklevel=2 - ) check_dict_or_set_indexers(key) if isinstance(key, tuple): - key = tuple(list(x) if is_iterator(x) else x for x in key) + key = (list(x) if is_iterator(x) else x for x in key) key = tuple(com.apply_if_callable(x, self.obj) for x in key) else: maybe_callable = com.apply_if_callable(key, self.obj) - key = self._check_deprecated_callable_usage(key, maybe_callable) + key = self._raise_callable_usage(key, maybe_callable) indexer = self._get_setitem_indexer(key) self._has_valid_setitem_indexer(key) - iloc = self if self.name == "iloc" else self.obj.iloc + iloc: _iLocIndexer = ( + cast("_iLocIndexer", self) if self.name == "iloc" else self.obj.iloc + ) iloc._setitem_with_indexer(indexer, value, self.name) - def _validate_key(self, key, axis: AxisInt): + def _validate_key(self, key, axis: AxisInt) -> None: """ Ensure that key is valid for current indexer. @@ -967,8 +975,7 @@ def _validate_tuple_indexer(self, key: tuple) -> tuple: self._validate_key(k, i) except ValueError as err: raise ValueError( - "Location based indexing can only have " - f"[{self._valid_types}] types" + f"Location based indexing can only have [{self._valid_types}] types" ) from err return key @@ -1146,7 +1153,7 @@ def _contains_slice(x: object) -> bool: # GH#41369 Loop in reverse order ensures indexing along columns before rows # which selects only necessary blocks which avoids dtype conversion if possible axis = len(tup) - 1 - for key in tup[::-1]: + for key in reversed(tup): if com.is_null_slice(key): axis -= 1 continue @@ -1163,14 +1170,11 @@ def _contains_slice(x: object) -> bool: def _convert_to_indexer(self, key, axis: AxisInt): raise AbstractMethodError(self) - def _check_deprecated_callable_usage(self, key: Any, maybe_callable: T) -> T: + def _raise_callable_usage(self, key: Any, maybe_callable: T) -> T: # GH53533 if self.name == "iloc" and callable(key) and isinstance(maybe_callable, tuple): - warnings.warn( - "Returning a tuple from a callable with iloc " - "is deprecated and will be removed in a future version", - FutureWarning, - stacklevel=find_stack_level(), + raise ValueError( + "Returning a tuple from a callable with iloc is not allowed.", ) return maybe_callable @@ -1178,7 +1182,7 @@ def _check_deprecated_callable_usage(self, key: Any, maybe_callable: T) -> T: def __getitem__(self, key): check_dict_or_set_indexers(key) if type(key) is tuple: - key = tuple(list(x) if is_iterator(x) else x for x in key) + key = (list(x) if is_iterator(x) else x for x in key) key = tuple(com.apply_if_callable(x, self.obj) for x in key) if self._is_scalar_access(key): return self.obj._get_value(*key, takeable=self._takeable) @@ -1188,17 +1192,17 @@ def __getitem__(self, key): axis = self.axis or 0 
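To pin down the `_raise_callable_usage` change above: returning a tuple from a callable passed to ``iloc``, which was deprecated in 2.x, is now an error. A hedged sketch of the new behavior:

>>> import pandas as pd
>>> df = pd.DataFrame({"a": [1, 2]})
>>> df.iloc[lambda df: (0, 0)]
Traceback (most recent call last):
    ...
ValueError: Returning a tuple from a callable with iloc is not allowed.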
maybe_callable = com.apply_if_callable(key, self.obj) - maybe_callable = self._check_deprecated_callable_usage(key, maybe_callable) + maybe_callable = self._raise_callable_usage(key, maybe_callable) return self._getitem_axis(maybe_callable, axis=axis) def _is_scalar_access(self, key: tuple): - raise NotImplementedError() + raise NotImplementedError def _getitem_tuple(self, tup: tuple): raise AbstractMethodError(self) def _getitem_axis(self, key, axis: AxisInt): - raise NotImplementedError() + raise NotImplementedError def _has_valid_setitem_indexer(self, indexer) -> bool: raise AbstractMethodError(self) @@ -1209,7 +1213,7 @@ def _getbool_axis(self, key, axis: AxisInt): labels = self.obj._get_axis(axis) key = check_bool_indexer(labels, key) inds = key.nonzero()[0] - return self.obj._take_with_is_copy(inds, axis=axis) + return self.obj.take(inds, axis=axis) @doc(IndexingMixin.loc) @@ -1225,7 +1229,7 @@ class _LocIndexer(_LocationIndexer): # Key Checks @doc(_LocationIndexer._validate_key) - def _validate_key(self, key, axis: Axis): + def _validate_key(self, key, axis: Axis) -> None: # valid for a collection of labels (we check their presence later) # slice of labels (where start-end in labels) # slice of integers (only if in the labels) @@ -1234,8 +1238,10 @@ def _validate_key(self, key, axis: Axis): if isinstance(key, bool) and not ( is_bool_dtype(ax.dtype) or ax.dtype.name == "boolean" - or isinstance(ax, MultiIndex) - and is_bool_dtype(ax.get_level_values(0).dtype) + or ( + isinstance(ax, MultiIndex) + and is_bool_dtype(ax.get_level_values(0).dtype) + ) ): raise KeyError( f"{key}: boolean label can not be used without a boolean index" @@ -1328,7 +1334,7 @@ def _multi_take(self, tup: tuple): axis: self._get_listlike_indexer(key, axis) for (key, axis) in zip(tup, self.obj._AXIS_ORDERS) } - return self.obj._reindex_with_indexers(d, copy=True, allow_dups=True) + return self.obj._reindex_with_indexers(d, allow_dups=True) # ------------------------------------------------------------------- @@ -1360,7 +1366,7 @@ def _getitem_iterable(self, key, axis: AxisInt): # A collection of keys keyarr, indexer = self._get_listlike_indexer(key, axis) return self.obj._reindex_with_indexers( - {axis: [keyarr, indexer]}, copy=True, allow_dups=True + {axis: [keyarr, indexer]}, allow_dups=True ) def _getitem_tuple(self, tup: tuple): @@ -1572,18 +1578,13 @@ class _iLocIndexer(_LocationIndexer): # ------------------------------------------------------------------- # Key Checks - def _validate_key(self, key, axis: AxisInt): + def _validate_key(self, key, axis: AxisInt) -> None: if com.is_bool_indexer(key): if hasattr(key, "index") and isinstance(key.index, Index): if key.index.inferred_type == "integer": - raise NotImplementedError( - "iLocation based boolean " - "indexing on an integer type " - "is not available" - ) + return raise ValueError( - "iLocation based boolean indexing cannot use " - "an indexable as a mask" + "iLocation based boolean indexing cannot use an indexable as a mask" ) return @@ -1712,7 +1713,7 @@ def _get_list_axis(self, key, axis: AxisInt): `axis` can only be zero. """ try: - return self.obj._take_with_is_copy(key, axis=axis) + return self.obj.take(key, axis=axis) except IndexError as err: # re-raise with different error message, e.g. 
test_getitem_ndarray_3d raise IndexError("positional indexers are out-of-bounds") from err @@ -1765,7 +1766,7 @@ def _get_slice_axis(self, slice_obj: slice, axis: AxisInt): labels._validate_positional_slice(slice_obj) return self.obj._slice(slice_obj, axis=axis) - def _convert_to_indexer(self, key, axis: AxisInt): + def _convert_to_indexer(self, key: T, axis: AxisInt) -> T: """ Much simpler as we only have to deal with our valid types. """ @@ -1783,7 +1784,7 @@ def _get_setitem_indexer(self, key): # ------------------------------------------------------------------- - def _setitem_with_indexer(self, indexer, value, name: str = "iloc"): + def _setitem_with_indexer(self, indexer, value, name: str = "iloc") -> None: """ _setitem_with_indexer is for setting values on a Series/DataFrame using positional indexers. @@ -1806,10 +1807,10 @@ def _setitem_with_indexer(self, indexer, value, name: str = "iloc"): # if there is only one block/type, still have to take split path # unless the block is one-dimensional or it can hold the value - if not take_split_path and len(self.obj._mgr.arrays) and self.ndim > 1: + if not take_split_path and len(self.obj._mgr.blocks) and self.ndim > 1: # in case of dict, keys are indices val = list(value.values()) if isinstance(value, dict) else value - arr = self.obj._mgr.arrays[0] + arr = self.obj._mgr.blocks[0].values take_split_path = not can_hold_element( arr, extract_array(val, extract_numpy=True) ) @@ -1878,12 +1879,9 @@ def _setitem_with_indexer(self, indexer, value, name: str = "iloc"): self.obj[key] = empty_value elif not is_list_like(value): - # Find our empty_value dtype by constructing an array - # from our value and doing a .take on it - arr = sanitize_array(value, Index(range(1)), copy=False) - taker = -1 * np.ones(len(self.obj), dtype=np.intp) - empty_value = algos.take_nd(arr, taker) - self.obj[key] = empty_value + self.obj[key] = construct_1d_array_from_inferred_fill_value( + value, len(self.obj) + ) else: # FIXME: GH#42099#issuecomment-864326014 self.obj[key] = infer_fill_value(value) @@ -1900,15 +1898,7 @@ def _setitem_with_indexer(self, indexer, value, name: str = "iloc"): # just replacing the block manager here # so the object is the same index = self.obj._get_axis(i) - with warnings.catch_warnings(): - # TODO: re-issue this with setitem-specific message? - warnings.filterwarnings( - "ignore", - "The behavior of Index.insert with object-dtype " - "is deprecated", - category=FutureWarning, - ) - labels = index.insert(len(index), key) + labels = index.insert(len(index), key) # We are expanding the Series/DataFrame values to match # the length of thenew index `labels`. 
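As background for the enlargement path above (`index.insert(len(index), key)`, now without the old `FutureWarning` filter): setting a missing label still appends it to the index and expands the values. A minimal sketch:

>>> import pandas as pd
>>> s = pd.Series([1, 2])
>>> s.loc[2] = 3  # missing label: appended via index.insert, then set
>>> s
0    1
1    2
2    3
dtype: int64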
GH#40096 ensure @@ -1920,8 +1910,6 @@ def _setitem_with_indexer(self, indexer, value, name: str = "iloc"): reindexers, allow_dups=True ) self.obj._mgr = new_obj._mgr - self.obj._maybe_update_cacher(clear=True) - self.obj._is_copy = None nindexer.append(labels.get_loc(key)) @@ -2000,8 +1988,7 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str): return self._setitem_with_indexer((pi, info_axis[0]), value[0]) raise ValueError( - "Must have equal len keys and value " - "when setting with an iterable" + "Must have equal len keys and value when setting with an iterable" ) elif lplane_indexer == 0 and len(value) == len(self.obj.index): @@ -2029,8 +2016,7 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str): else: raise ValueError( - "Must have equal len keys and value " - "when setting with an iterable" + "Must have equal len keys and value when setting with an iterable" ) else: @@ -2038,7 +2024,7 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str): for loc in ilocs: self._setitem_single_column(loc, value, pi) - def _setitem_with_indexer_2d_value(self, indexer, value): + def _setitem_with_indexer_2d_value(self, indexer, value) -> None: # We get here with np.ndim(value) == 2, excluding DataFrame, # which goes through _setitem_with_indexer_frame_value pi = indexer[0] @@ -2060,7 +2046,9 @@ def _setitem_with_indexer_2d_value(self, indexer, value): value_col = value_col.tolist() self._setitem_single_column(loc, value_col, pi) - def _setitem_with_indexer_frame_value(self, indexer, value: DataFrame, name: str): + def _setitem_with_indexer_frame_value( + self, indexer, value: DataFrame, name: str + ) -> None: ilocs = self._ensure_iterable_column_indexer(indexer[1]) sub_indexer = list(indexer) @@ -2105,7 +2093,7 @@ def _setitem_with_indexer_frame_value(self, indexer, value: DataFrame, name: str tuple(sub_indexer), value[item], multiindex_indexer, - using_cow=using_copy_on_write(), + using_cow=True, ) else: val = np.nan @@ -2126,7 +2114,7 @@ def _setitem_single_column(self, loc: int, value, plane_indexer) -> None: is_full_setter = com.is_null_slice(pi) or com.is_full_slice(pi, len(self.obj)) - is_null_setter = com.is_empty_slice(pi) or is_array_like(pi) and len(pi) == 0 + is_null_setter = com.is_empty_slice(pi) or (is_array_like(pi) and len(pi) == 0) if is_null_setter: # no-op, don't cast dtype later @@ -2137,18 +2125,42 @@ def _setitem_single_column(self, loc: int, value, plane_indexer) -> None: self.obj._mgr.column_setitem( loc, plane_indexer, value, inplace_only=True ) - except (ValueError, TypeError, LossySetitemError): + except (ValueError, TypeError, LossySetitemError) as exc: # If we're setting an entire column and we can't do it inplace, # then we can use value's dtype (or inferred dtype) # instead of object + dtype = self.obj.dtypes.iloc[loc] + if dtype not in (np.void, object) and not self.obj.empty: + # - Exclude np.void, as that is a special case for expansion. + # We want to raise for + # df = pd.DataFrame({'a': [1, 2]}) + # df.loc[:, 'a'] = .3 + # but not for + # df = pd.DataFrame({'a': [1, 2]}) + # df.loc[:, 'b'] = .3 + # - Exclude `object`, as then no upcasting happens. + # - Exclude empty initial object with enlargement, + # as then there's nothing to be inconsistent with. 
+ raise TypeError( + f"Invalid value '{value}' for dtype '{dtype}'" + ) from exc self.obj.isetitem(loc, value) else: # set value into the column (first attempting to operate inplace, then # falling back to casting if necessary) + dtype = self.obj.dtypes.iloc[loc] + if dtype == np.void: + # This means we're expanding, with multiple columns, e.g. + # df = pd.DataFrame({'A': [1,2,3], 'B': [4,5,6]}) + # df.loc[df.index <= 2, ['F', 'G']] = (1, 'abc') + # Columns F and G will initially be set to np.void. + # Here, we replace those temporary `np.void` columns with + # columns of the appropriate dtype, based on `value`. + self.obj.iloc[:, loc] = construct_1d_array_from_inferred_fill_value( + value, len(self.obj) + ) self.obj._mgr.column_setitem(loc, plane_indexer, value) - self.obj._clear_item_cache() - def _setitem_single_block(self, indexer, value, name: str) -> None: """ _setitem_with_indexer for the case when we have a single Block. @@ -2184,12 +2196,8 @@ def _setitem_single_block(self, indexer, value, name: str) -> None: if isinstance(value, ABCDataFrame) and name != "iloc": value = self._align_frame(indexer, value)._values - # check for chained assignment - self.obj._check_is_chained_assignment_possible() - # actually do the set self.obj._mgr = self.obj._mgr.setitem(indexer=indexer, value=value) - self.obj._maybe_update_cacher(clear=True, inplace=True) def _setitem_with_indexer_missing(self, indexer, value): """ @@ -2201,14 +2209,7 @@ def _setitem_with_indexer_missing(self, indexer, value): # and set inplace if self.ndim == 1: index = self.obj.index - with warnings.catch_warnings(): - # TODO: re-issue this with setitem-specific message? - warnings.filterwarnings( - "ignore", - "The behavior of Index.insert with object-dtype is deprecated", - category=FutureWarning, - ) - new_index = index.insert(len(index), indexer) + new_index = index.insert(len(index), indexer) # we have a coerced indexer, e.g. a float # that matches in an int64 Index, so @@ -2255,7 +2256,6 @@ def _setitem_with_indexer_missing(self, indexer, value): self.obj._mgr = self.obj._constructor( new_values, index=new_index, name=self.obj.name )._mgr - self.obj._maybe_update_cacher(clear=True) elif self.ndim == 2: if not len(self.obj.columns): @@ -2265,7 +2265,7 @@ def _setitem_with_indexer_missing(self, indexer, value): has_dtype = hasattr(value, "dtype") if isinstance(value, ABCSeries): # append a Series - value = value.reindex(index=self.obj.columns, copy=True) + value = value.reindex(index=self.obj.columns) value.name = indexer elif isinstance(value, dict): value = Series( @@ -2295,21 +2295,20 @@ def _setitem_with_indexer_missing(self, indexer, value): if not has_dtype: # i.e. if we already had a Series or ndarray, keep that # dtype. But if we had a list or dict, then do inference - df = df.infer_objects(copy=False) + df = df.infer_objects() self.obj._mgr = df._mgr else: self.obj._mgr = self.obj._append(value)._mgr - self.obj._maybe_update_cacher(clear=True) def _ensure_iterable_column_indexer(self, column_indexer): """ Ensure that our column indexer is something that can be iterated over. 
""" - ilocs: Sequence[int | np.integer] | np.ndarray + ilocs: Sequence[int | np.integer] | np.ndarray | range if is_integer(column_indexer): ilocs = [column_indexer] elif isinstance(column_indexer, slice): - ilocs = np.arange(len(self.obj.columns))[column_indexer] + ilocs = range(len(self.obj.columns))[column_indexer] elif ( isinstance(column_indexer, np.ndarray) and column_indexer.dtype.kind == "b" ): @@ -2346,11 +2345,17 @@ def _align_series( if isinstance(indexer, tuple): # flatten np.ndarray indexers + if ( + len(indexer) == 2 + and isinstance(indexer[1], np.ndarray) + and indexer[1].dtype == np.bool_ + ): + indexer = (indexer[0], np.where(indexer[1])[0]) + def ravel(i): return i.ravel() if isinstance(i, np.ndarray) else i indexer = tuple(map(ravel, indexer)) - aligners = [not com.is_null_slice(idx) for idx in indexer] sum_aligners = sum(aligners) single_aligner = sum_aligners == 1 @@ -2368,12 +2373,15 @@ def ravel(i): # we have a frame, with multiple indexers on both axes; and a # series, so need to broadcast (see GH5206) - if sum_aligners == self.ndim and all(is_sequence(_) for _ in indexer): - ser_values = ser.reindex(obj.axes[0][indexer[0]], copy=True)._values + if all(is_sequence(_) or isinstance(_, slice) for _ in indexer): + ser_values = ser.reindex(obj.axes[0][indexer[0]])._values # single indexer if len(indexer) > 1 and not multiindex_indexer: - len_indexer = len(indexer[1]) + if isinstance(indexer[1], slice): + len_indexer = len(obj.axes[1][indexer[1]]) + else: + len_indexer = len(indexer[1]) ser_values = ( np.tile(ser_values, len_indexer).reshape(len_indexer, -1).T ) @@ -2392,7 +2400,7 @@ def ravel(i): new_ix = Index([new_ix]) else: new_ix = Index(new_ix) - if ser.index.equals(new_ix): + if not len(new_ix) or ser.index.equals(new_ix): if using_cow: return ser return ser._values.copy() @@ -2437,7 +2445,7 @@ def _align_frame(self, indexer, df: DataFrame) -> DataFrame: ax = self.obj.axes[i] if is_sequence(ix) or isinstance(ix, slice): if isinstance(ix, np.ndarray): - ix = ix.ravel() + ix = ix.reshape(-1) if idx is None: idx = ax[ix] elif cols is None: @@ -2739,19 +2747,15 @@ def check_dict_or_set_indexers(key) -> None: """ Check if the indexer is or contains a dict or set, which is no longer allowed. """ - if ( - isinstance(key, set) - or isinstance(key, tuple) - and any(isinstance(x, set) for x in key) + if isinstance(key, set) or ( + isinstance(key, tuple) and any(isinstance(x, set) for x in key) ): raise TypeError( "Passing a set as an indexer is not supported. Use a list instead." ) - if ( - isinstance(key, dict) - or isinstance(key, tuple) - and any(isinstance(x, dict) for x in key) + if isinstance(key, dict) or ( + isinstance(key, tuple) and any(isinstance(x, dict) for x in key) ): raise TypeError( "Passing a dict as an indexer is not supported. Use a list instead." diff --git a/pandas/core/interchange/buffer.py b/pandas/core/interchange/buffer.py index a54e4428bd836..8953360a91c8e 100644 --- a/pandas/core/interchange/buffer.py +++ b/pandas/core/interchange/buffer.py @@ -12,6 +12,7 @@ if TYPE_CHECKING: import numpy as np + import pyarrow as pa class PandasBuffer(Buffer): @@ -23,15 +24,14 @@ def __init__(self, x: np.ndarray, allow_copy: bool = True) -> None: """ Handle only regular columns (= numpy arrays) for now. """ - if not x.strides == (x.dtype.itemsize,): + if x.strides[0] and not x.strides == (x.dtype.itemsize,): # The protocol does not support strided buffers, so a copy is # necessary. If that's not allowed, we need to raise an exception. 
if allow_copy: x = x.copy() else: raise RuntimeError( - "Exports cannot be zero-copy in the case " - "of a non-contiguous buffer" + "Exports cannot be zero-copy in the case of a non-contiguous buffer" ) # Store the numpy array in which the data resides as a private @@ -76,3 +76,60 @@ def __repr__(self) -> str: ) + ")" ) + + +class PandasBufferPyarrow(Buffer): + """ + Data in the buffer is guaranteed to be contiguous in memory. + """ + + def __init__( + self, + buffer: pa.Buffer, + *, + length: int, + ) -> None: + """ + Handle pyarrow chunked arrays. + """ + self._buffer = buffer + self._length = length + + @property + def bufsize(self) -> int: + """ + Buffer size in bytes. + """ + return self._buffer.size + + @property + def ptr(self) -> int: + """ + Pointer to start of the buffer as an integer. + """ + return self._buffer.address + + def __dlpack__(self) -> Any: + """ + Represent this structure as DLPack interface. + """ + raise NotImplementedError + + def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: + """ + Device type and device ID for where the data in the buffer resides. + """ + return (DlpackDeviceType.CPU, None) + + def __repr__(self) -> str: + return ( + "PandasBuffer[pyarrow](" + + str( + { + "bufsize": self.bufsize, + "ptr": self.ptr, + "device": "CPU", + } + ) + + ")" + ) diff --git a/pandas/core/interchange/column.py b/pandas/core/interchange/column.py index acfbc5d9e6c62..c27a9d8141712 100644 --- a/pandas/core/interchange/column.py +++ b/pandas/core/interchange/column.py @@ -1,6 +1,9 @@ from __future__ import annotations -from typing import Any +from typing import ( + TYPE_CHECKING, + Any, +) import numpy as np @@ -9,14 +12,18 @@ from pandas.errors import NoBufferPresent from pandas.util._decorators import cache_readonly -from pandas.core.dtypes.dtypes import ( +from pandas.core.dtypes.dtypes import BaseMaskedDtype + +import pandas as pd +from pandas import ( ArrowDtype, DatetimeTZDtype, ) - -import pandas as pd from pandas.api.types import is_string_dtype -from pandas.core.interchange.buffer import PandasBuffer +from pandas.core.interchange.buffer import ( + PandasBuffer, + PandasBufferPyarrow, +) from pandas.core.interchange.dataframe_protocol import ( Column, ColumnBuffers, @@ -29,6 +36,9 @@ dtype_to_arrow_c_fmt, ) +if TYPE_CHECKING: + from pandas.core.interchange.dataframe_protocol import Buffer + _NP_KINDS = { "i": DtypeKind.INT, "u": DtypeKind.UINT, @@ -76,6 +86,14 @@ def __init__(self, column: pd.Series, allow_copy: bool = True) -> None: Note: doesn't deal with extension arrays yet, just assume a regular Series/ndarray for now. """ + if isinstance(column, pd.DataFrame): + raise TypeError( + "Expected a Series, got a DataFrame. This likely happened " + "because you called __dataframe__ on a DataFrame which, " + "after converting column names to string, resulted in duplicated " + f"names: {column.columns}. Please rename these columns before " + "using the interchange protocol." 
+ ) if not isinstance(column, pd.Series): raise NotImplementedError(f"Columns of type {type(column)} not handled yet") @@ -116,7 +134,7 @@ def dtype(self) -> tuple[DtypeKind, int, str, str]: Endianness.NATIVE, ) elif is_string_dtype(dtype): - if infer_dtype(self._col) == "string": + if infer_dtype(self._col) in ("string", "empty"): return ( DtypeKind.STRING, 8, @@ -143,9 +161,21 @@ def _dtype_from_pandasdtype(self, dtype) -> tuple[DtypeKind, int, str, str]: byteorder = dtype.numpy_dtype.byteorder elif isinstance(dtype, DatetimeTZDtype): byteorder = dtype.base.byteorder # type: ignore[union-attr] + elif isinstance(dtype, BaseMaskedDtype): + byteorder = dtype.numpy_dtype.byteorder else: byteorder = dtype.byteorder + if dtype == "bool[pyarrow]": + # return early to avoid the `* 8` below, as this is a bitmask + # rather than a bytemask + return ( + kind, + dtype.itemsize, # pyright: ignore[reportAttributeAccessIssue] + ArrowCTypes.BOOL, + byteorder, + ) + return kind, dtype.itemsize * 8, dtype_to_arrow_c_fmt(dtype), byteorder @property @@ -179,11 +209,21 @@ def describe_categorical(self): @property def describe_null(self): + if isinstance(self._col.dtype, BaseMaskedDtype): + column_null_dtype = ColumnNullType.USE_BYTEMASK + null_value = 1 + return column_null_dtype, null_value + if isinstance(self._col.dtype, ArrowDtype): + # We already rechunk (if necessary / allowed) upon initialization, so this + # is already single-chunk by the time we get here. + if self._col.array._pa_array.chunks[0].buffers()[0] is None: # type: ignore[attr-defined] + return ColumnNullType.NON_NULLABLE, None + return ColumnNullType.USE_BITMASK, 0 kind = self.dtype[0] try: null, value = _NULL_DESCRIPTION[kind] - except KeyError: - raise NotImplementedError(f"Data type {kind} not yet supported") + except KeyError as err: + raise NotImplementedError(f"Data type {kind} not yet supported") from err return null, value @@ -263,25 +303,47 @@ def get_buffers(self) -> ColumnBuffers: def _get_data_buffer( self, - ) -> tuple[PandasBuffer, Any]: # Any is for self.dtype tuple + ) -> tuple[Buffer, tuple[DtypeKind, int, str, str]]: """ Return the buffer containing the data and the buffer's associated dtype. """ - if self.dtype[0] in ( - DtypeKind.INT, - DtypeKind.UINT, - DtypeKind.FLOAT, - DtypeKind.BOOL, - DtypeKind.DATETIME, - ): + buffer: Buffer + if self.dtype[0] == DtypeKind.DATETIME: # self.dtype[2] is an ArrowCTypes.TIMESTAMP where the tz will make # it longer than 4 characters - if self.dtype[0] == DtypeKind.DATETIME and len(self.dtype[2]) > 4: + if len(self.dtype[2]) > 4: np_arr = self._col.dt.tz_convert(None).to_numpy() else: np_arr = self._col.to_numpy() buffer = PandasBuffer(np_arr, allow_copy=self._allow_copy) + dtype = ( + DtypeKind.INT, + 64, + ArrowCTypes.INT64, + Endianness.NATIVE, + ) + elif self.dtype[0] in ( + DtypeKind.INT, + DtypeKind.UINT, + DtypeKind.FLOAT, + DtypeKind.BOOL, + ): dtype = self.dtype + arr = self._col.array + if isinstance(self._col.dtype, ArrowDtype): + # We already rechunk (if necessary / allowed) upon initialization, so + # this is already single-chunk by the time we get here. 
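Aside: a hedged sketch of the null descriptions the branches above produce, assuming pyarrow is installed (the expected tuples follow from the code in this hunk):

    import pandas as pd

    masked = pd.Series([1, None], dtype="Int64").to_frame("x")
    arrow = pd.Series([1, None], dtype="int64[pyarrow]").to_frame("x")

    masked.__dataframe__().get_column(0).describe_null
    # -> (ColumnNullType.USE_BYTEMASK, 1): boolean byte mask, 1 marks missing
    arrow.__dataframe__().get_column(0).describe_null
    # -> (ColumnNullType.USE_BITMASK, 0): Arrow validity bitmask, 0 marks invalid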
+ arr = arr._pa_array.chunks[0] # type: ignore[attr-defined] + buffer = PandasBufferPyarrow( + arr.buffers()[1], # type: ignore[attr-defined] + length=len(arr), + ) + return buffer, dtype + if isinstance(self._col.dtype, BaseMaskedDtype): + np_arr = arr._data # type: ignore[attr-defined] + else: + np_arr = arr._ndarray # type: ignore[attr-defined] + buffer = PandasBuffer(np_arr, allow_copy=self._allow_copy) elif self.dtype[0] == DtypeKind.CATEGORICAL: codes = self._col.values._codes buffer = PandasBuffer(codes, allow_copy=self._allow_copy) @@ -301,10 +363,12 @@ def _get_data_buffer( buffer = PandasBuffer(np.frombuffer(b, dtype="uint8")) # Define the dtype for the returned buffer + # TODO: this will need correcting + # https://github.com/pandas-dev/pandas/issues/54781 dtype = ( - DtypeKind.STRING, + DtypeKind.UINT, 8, - ArrowCTypes.STRING, + ArrowCTypes.UINT8, Endianness.NATIVE, ) # note: currently only support native endianness else: @@ -312,13 +376,32 @@ def _get_data_buffer( return buffer, dtype - def _get_validity_buffer(self) -> tuple[PandasBuffer, Any]: + def _get_validity_buffer(self) -> tuple[Buffer, Any] | None: """ Return the buffer containing the mask values indicating missing data and the buffer's associated dtype. Raises NoBufferPresent if null representation is not a bit or byte mask. """ null, invalid = self.describe_null + buffer: Buffer + if isinstance(self._col.dtype, ArrowDtype): + # We already rechunk (if necessary / allowed) upon initialization, so this + # is already single-chunk by the time we get here. + arr = self._col.array._pa_array.chunks[0] # type: ignore[attr-defined] + dtype = (DtypeKind.BOOL, 1, ArrowCTypes.BOOL, Endianness.NATIVE) + if arr.buffers()[0] is None: + return None + buffer = PandasBufferPyarrow( + arr.buffers()[0], + length=len(arr), + ) + return buffer, dtype + + if isinstance(self._col.dtype, BaseMaskedDtype): + mask = self._col.array._mask # type: ignore[attr-defined] + buffer = PandasBuffer(mask) + dtype = (DtypeKind.BOOL, 8, ArrowCTypes.BOOL, Endianness.NATIVE) + return buffer, dtype if self.dtype[0] == DtypeKind.STRING: # For now, use byte array as the mask. @@ -344,9 +427,9 @@ def _get_validity_buffer(self) -> tuple[PandasBuffer, Any]: try: msg = f"{_NO_VALIDITY_BUFFER[null]} so does not have a separate mask" - except KeyError: + except KeyError as err: # TODO: implement for other bit/byte masks? - raise NotImplementedError("See self.describe_null") + raise NotImplementedError("See self.describe_null") from err raise NoBufferPresent(msg) diff --git a/pandas/core/interchange/dataframe.py b/pandas/core/interchange/dataframe.py index 4f08b2c2b3a7b..0a116af567e59 100644 --- a/pandas/core/interchange/dataframe.py +++ b/pandas/core/interchange/dataframe.py @@ -5,6 +5,7 @@ from pandas.core.interchange.column import PandasColumn from pandas.core.interchange.dataframe_protocol import DataFrame as DataFrameXchg +from pandas.core.interchange.utils import maybe_rechunk if TYPE_CHECKING: from collections.abc import ( @@ -32,8 +33,12 @@ def __init__(self, df: DataFrame, allow_copy: bool = True) -> None: Constructor - an instance of this (private) class is returned from `pd.DataFrame.__dataframe__`. 
""" - self._df = df + self._df = df.rename(columns=str) self._allow_copy = allow_copy + for i, _col in enumerate(self._df.columns): + rechunked = maybe_rechunk(self._df.iloc[:, i], allow_copy=allow_copy) + if rechunked is not None: + self._df.isetitem(i, rechunked) def __dataframe__( self, nan_as_null: bool = False, allow_copy: bool = True diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index d45ae37890ba7..c2fbef1089d5a 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -2,12 +2,16 @@ import ctypes import re -from typing import Any +from typing import ( + Any, + overload, +) import numpy as np +from pandas._config import using_string_dtype + from pandas.compat._optional import import_optional_dependency -from pandas.errors import SettingWithCopyError import pandas as pd from pandas.core.interchange.dataframe_protocol import ( @@ -34,6 +38,21 @@ def from_dataframe(df, allow_copy: bool = True) -> pd.DataFrame: """ Build a ``pd.DataFrame`` from any DataFrame supporting the interchange protocol. + .. note:: + + For new development, we highly recommend using the Arrow C Data Interface + alongside the Arrow PyCapsule Interface instead of the interchange protocol. + From pandas 3.0 onwards, `from_dataframe` uses the PyCapsule Interface, + only falling back to the interchange protocol if that fails. + + .. warning:: + + Due to severe implementation issues, we recommend only considering using the + interchange protocol in the following cases: + + - converting to pandas: for pandas >= 2.0.3 + - converting from pandas: for pandas >= 3.0.0 + Parameters ---------- df : DataFrameXchg @@ -45,15 +64,23 @@ def from_dataframe(df, allow_copy: bool = True) -> pd.DataFrame: Returns ------- pd.DataFrame + A pandas DataFrame built from the provided interchange + protocol object. + + See Also + -------- + pd.DataFrame : DataFrame class which can be created from various input data + formats, including objects that support the interchange protocol. Examples -------- - >>> df_not_necessarily_pandas = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) + >>> df_not_necessarily_pandas = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) >>> interchange_object = df_not_necessarily_pandas.__dataframe__() >>> interchange_object.column_names() Index(['A', 'B'], dtype='object') - >>> df_pandas = (pd.api.interchange.from_dataframe - ... (interchange_object.select_columns_by_name(['A']))) + >>> df_pandas = pd.api.interchange.from_dataframe( + ... interchange_object.select_columns_by_name(["A"]) + ... ) >>> df_pandas A 0 1 @@ -65,6 +92,18 @@ def from_dataframe(df, allow_copy: bool = True) -> pd.DataFrame: if isinstance(df, pd.DataFrame): return df + if hasattr(df, "__arrow_c_stream__"): + try: + pa = import_optional_dependency("pyarrow", min_version="14.0.0") + except ImportError: + # fallback to _from_dataframe + pass + else: + try: + return pa.table(df).to_pandas(zero_copy_only=not allow_copy) + except pa.ArrowInvalid as e: + raise RuntimeError(e) from e + if not hasattr(df, "__dataframe__"): raise ValueError("`df` does not support __dataframe__") @@ -73,7 +112,7 @@ def from_dataframe(df, allow_copy: bool = True) -> pd.DataFrame: ) -def _from_dataframe(df: DataFrameXchg, allow_copy: bool = True): +def _from_dataframe(df: DataFrameXchg, allow_copy: bool = True) -> pd.DataFrame: """ Build a ``pd.DataFrame`` from the DataFrame interchange object. 
@@ -124,8 +163,6 @@ def protocol_df_chunk_to_pandas(df: DataFrameXchg) -> pd.DataFrame: ------- pd.DataFrame """ - # We need a dict of columns here, with each column being a NumPy array (at - # least for now, deal with non-NumPy dtypes later). columns: dict[str, Any] = {} buffers = [] # hold on to buffers, keeps memory alive for name in df.column_names(): @@ -295,13 +332,14 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: null_pos = None if null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK): - assert buffers["validity"], "Validity buffers cannot be empty for masks" - valid_buff, valid_dtype = buffers["validity"] - null_pos = buffer_to_ndarray( - valid_buff, valid_dtype, offset=col.offset, length=col.size() - ) - if sentinel_val == 0: - null_pos = ~null_pos + validity = buffers["validity"] + if validity is not None: + valid_buff, valid_dtype = validity + null_pos = buffer_to_ndarray( + valid_buff, valid_dtype, offset=col.offset, length=col.size() + ) + if sentinel_val == 0: + null_pos = ~null_pos # Assemble the strings from the code units str_list: list[None | float | str] = [None] * col.size() @@ -323,8 +361,12 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: # Add to our list of strings str_list[i] = string - # Convert the string list to a NumPy array - return np.asarray(str_list, dtype="object"), buffers + if using_string_dtype(): + res = pd.Series(str_list, dtype="str") + else: + res = np.asarray(str_list, dtype="object") # type: ignore[assignment] + + return res, buffers # type: ignore[return-value] def parse_datetime_format_str(format_str, data) -> pd.Series | np.ndarray: @@ -459,12 +501,39 @@ def buffer_to_ndarray( return np.array([], dtype=ctypes_type) +@overload +def set_nulls( + data: np.ndarray, + col: Column, + validity: tuple[Buffer, tuple[DtypeKind, int, str, str]] | None, + allow_modify_inplace: bool = ..., +) -> np.ndarray: ... + + +@overload +def set_nulls( + data: pd.Series, + col: Column, + validity: tuple[Buffer, tuple[DtypeKind, int, str, str]] | None, + allow_modify_inplace: bool = ..., +) -> pd.Series: ... + + +@overload +def set_nulls( + data: np.ndarray | pd.Series, + col: Column, + validity: tuple[Buffer, tuple[DtypeKind, int, str, str]] | None, + allow_modify_inplace: bool = ..., +) -> np.ndarray | pd.Series: ... + + def set_nulls( data: np.ndarray | pd.Series, col: Column, validity: tuple[Buffer, tuple[DtypeKind, int, str, str]] | None, allow_modify_inplace: bool = True, -): +) -> np.ndarray | pd.Series: """ Set null values for the data according to the column null kind. @@ -486,13 +555,14 @@ def set_nulls( np.ndarray or pd.Series Data with the nulls being set. """ + if validity is None: + return data null_kind, sentinel_val = col.describe_null null_pos = None if null_kind == ColumnNullType.USE_SENTINEL: null_pos = pd.Series(data) == sentinel_val elif null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK): - assert validity, "Expected to have a validity buffer for the mask" valid_buff, valid_dtype = validity null_pos = buffer_to_ndarray( valid_buff, valid_dtype, offset=col.offset, length=col.size() @@ -515,9 +585,5 @@ def set_nulls( # cast the `data` to nullable float dtype. data = data.astype(float) data[null_pos] = None - except SettingWithCopyError: - # `SettingWithCopyError` may happen for datetime-like with missing values. 
- data = data.copy() - data[null_pos] = None return data diff --git a/pandas/core/interchange/utils.py b/pandas/core/interchange/utils.py index 4ac063080e62d..035a1f8abdbc5 100644 --- a/pandas/core/interchange/utils.py +++ b/pandas/core/interchange/utils.py @@ -16,6 +16,8 @@ DatetimeTZDtype, ) +import pandas as pd + if typing.TYPE_CHECKING: from pandas._typing import DtypeObj @@ -37,6 +39,7 @@ "float": "f", # float32 "double": "g", # float64 "string": "u", + "large_string": "U", "binary": "z", "time32[s]": "tts", "time32[ms]": "ttm", @@ -132,7 +135,12 @@ def dtype_to_arrow_c_fmt(dtype: DtypeObj) -> str: if format_str is not None: return format_str - if lib.is_np_dtype(dtype, "M"): + if isinstance(dtype, pd.StringDtype): + # TODO(infer_string) this should be LARGE_STRING for pyarrow storage, + # but current tests don't cover this distinction + return ArrowCTypes.STRING + + elif lib.is_np_dtype(dtype, "M"): # Selecting the first char of resolution string: # dtype.str -> '<M8[ns]' -> 'n' resolution = np.datetime_data(dtype)[0][0] @@ -141,6 +149,35 @@ def dtype_to_arrow_c_fmt(dtype: DtypeObj) -> str: elif isinstance(dtype, DatetimeTZDtype): return ArrowCTypes.TIMESTAMP.format(resolution=dtype.unit[0], tz=dtype.tz) + elif isinstance(dtype, pd.BooleanDtype): + return ArrowCTypes.BOOL + raise NotImplementedError( f"Conversion of {dtype} to Arrow C format string is not implemented." ) + + +def maybe_rechunk(series: pd.Series, *, allow_copy: bool) -> pd.Series | None: + """ + Rechunk a multi-chunk pyarrow array into a single-chunk array, if necessary. + + - Returns `None` if the input series is not backed by a multi-chunk pyarrow array + (and so doesn't need rechunking) + - Returns a single-chunk-backed-Series if the input is backed by a multi-chunk + pyarrow array and `allow_copy` is `True`. + - Raises a `RuntimeError` if `allow_copy` is `False` and input is + backed by a multi-chunk pyarrow array. + """ + if not isinstance(series.dtype, pd.ArrowDtype): + return None + chunked_array = series.array._pa_array # type: ignore[attr-defined] + if len(chunked_array.chunks) == 1: + return None + if not allow_copy: + raise RuntimeError( + "Found multi-chunk pyarrow array, but `allow_copy` is False. " + "Please rechunk the array before calling this function, or set " + "`allow_copy=True`."
+ ) + arr = chunked_array.combine_chunks() + return pd.Series(arr, dtype=series.dtype, name=series.name, index=series.index) diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index 2eb413440ba9c..d64c7e33657d4 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -1,12 +1,4 @@ from pandas.core.internals.api import make_block # 2023-09-18 pyarrow uses this -from pandas.core.internals.array_manager import ( - ArrayManager, - SingleArrayManager, -) -from pandas.core.internals.base import ( - DataManager, - SingleDataManager, -) from pandas.core.internals.concat import concatenate_managers from pandas.core.internals.managers import ( BlockManager, @@ -14,17 +6,12 @@ ) __all__ = [ - "Block", # pylint: disable=undefined-all-variable - "DatetimeTZBlock", # pylint: disable=undefined-all-variable - "ExtensionBlock", # pylint: disable=undefined-all-variable - "make_block", - "DataManager", - "ArrayManager", + "Block", # pyright: ignore[reportUnsupportedDunderAll] "BlockManager", - "SingleDataManager", + "ExtensionBlock", # pyright: ignore[reportUnsupportedDunderAll] "SingleBlockManager", - "SingleArrayManager", "concatenate_managers", + "make_block", ] @@ -37,7 +24,7 @@ def __getattr__(name: str): warnings.warn( f"{name} is deprecated and will be removed in a future version. " "Use public APIs instead.", - DeprecationWarning, + FutureWarning, # https://github.com/pandas-dev/pandas/pull/55139#pullrequestreview-1720690758 # on hard-coding stacklevel stacklevel=2, ) @@ -47,39 +34,24 @@ def __getattr__(name: str): return create_block_manager_from_blocks if name in [ - "NumericBlock", - "ObjectBlock", "Block", "ExtensionBlock", - "DatetimeTZBlock", ]: warnings.warn( f"{name} is deprecated and will be removed in a future version.
" "Use public APIs instead.", - DeprecationWarning, + FutureWarning, # https://github.com/pandas-dev/pandas/pull/55139#pullrequestreview-1720690758 # on hard-coding stacklevel stacklevel=2, ) - if name == "NumericBlock": - from pandas.core.internals.blocks import NumericBlock - - return NumericBlock - elif name == "DatetimeTZBlock": - from pandas.core.internals.blocks import DatetimeTZBlock - - return DatetimeTZBlock - elif name == "ExtensionBlock": + if name == "ExtensionBlock": from pandas.core.internals.blocks import ExtensionBlock return ExtensionBlock - elif name == "Block": + else: from pandas.core.internals.blocks import Block return Block - else: - from pandas.core.internals.blocks import ObjectBlock - - return ObjectBlock raise AttributeError(f"module 'pandas.core.internals' has no attribute '{name}'") diff --git a/pandas/core/internals/api.py b/pandas/core/internals/api.py index e5ef44d07061e..04944db2ebd9c 100644 --- a/pandas/core/internals/api.py +++ b/pandas/core/internals/api.py @@ -6,6 +6,7 @@ 2) Use only functions exposed here (or in core.internals) """ + from __future__ import annotations from typing import TYPE_CHECKING @@ -14,15 +15,18 @@ import numpy as np from pandas._libs.internals import BlockPlacement -from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import pandas_dtype from pandas.core.dtypes.dtypes import ( DatetimeTZDtype, + ExtensionDtype, PeriodDtype, ) -from pandas.core.arrays import DatetimeArray +from pandas.core.arrays import ( + DatetimeArray, + TimedeltaArray, +) from pandas.core.construction import extract_array from pandas.core.internals.blocks import ( check_ndim, @@ -33,11 +37,43 @@ ) if TYPE_CHECKING: - from pandas._typing import Dtype + from pandas._typing import ( + ArrayLike, + Dtype, + ) from pandas.core.internals.blocks import Block +def _make_block(values: ArrayLike, placement: np.ndarray) -> Block: + """ + This is an analogue to blocks.new_block(_2d) that ensures: + 1) correct dimension for EAs that support 2D (`ensure_block_shape`), and + 2) correct EA class for datetime64/timedelta64 (`maybe_coerce_values`). + + The input `values` is assumed to be either numpy array or ExtensionArray: + - In case of a numpy array, it is assumed to already be in the expected + shape for Blocks (2D, (cols, rows)). + - In case of an ExtensionArray the input can be 1D, also for EAs that are + internally stored as 2D. + + For the rest no preprocessing or validation is done, except for those dtypes + that are internally stored as EAs but have an exact numpy equivalent (and at + the moment use that numpy dtype), i.e. datetime64/timedelta64. + """ + dtype = values.dtype + klass = get_block_type(dtype) + placement_obj = BlockPlacement(placement) + + if (isinstance(dtype, ExtensionDtype) and dtype._supports_2d) or isinstance( + values, (DatetimeArray, TimedeltaArray) + ): + values = ensure_block_shape(values, ndim=2) + + values = maybe_coerce_values(values) + return klass(values, ndim=2, placement=placement_obj) + + def make_block( values, placement, klass=None, ndim=None, dtype: Dtype | None = None ) -> Block: @@ -53,11 +89,12 @@ def make_block( - Block.__init__ """ warnings.warn( - # GH#40226 + # GH#56815 "make_block is deprecated and will be removed in a future version. 
" - "Use public APIs instead.", + "Use pd.api.internals.create_dataframe_from_blocks or " + "(recommended) higher-level public APIs instead.", DeprecationWarning, - stacklevel=find_stack_level(), + stacklevel=2, ) if dtype is not None: @@ -65,10 +102,7 @@ def make_block( values, dtype = extract_pandas_array(values, dtype, ndim) - from pandas.core.internals.blocks import ( - DatetimeTZBlock, - ExtensionBlock, - ) + from pandas.core.internals.blocks import ExtensionBlock if klass is ExtensionBlock and isinstance(values.dtype, PeriodDtype): # GH-44681 changed PeriodArray to be stored in the 2D @@ -80,16 +114,6 @@ def make_block( dtype = dtype or values.dtype klass = get_block_type(dtype) - elif klass is DatetimeTZBlock and not isinstance(values.dtype, DatetimeTZDtype): - # pyarrow calls get here - values = DatetimeArray._simple_new( - # error: Argument "dtype" to "_simple_new" of "DatetimeArray" has - # incompatible type "Union[ExtensionDtype, dtype[Any], None]"; - # expected "Union[dtype[datetime64], DatetimeTZDtype]" - values, - dtype=dtype, # type: ignore[arg-type] - ) - if not isinstance(placement, BlockPlacement): placement = BlockPlacement(placement) @@ -119,47 +143,3 @@ def maybe_infer_ndim(values, placement: BlockPlacement, ndim: int | None) -> int else: ndim = values.ndim return ndim - - -def __getattr__(name: str): - # GH#55139 - - if name in [ - "Block", - "ExtensionBlock", - "DatetimeTZBlock", - "create_block_manager_from_blocks", - ]: - # GH#33892 - warnings.warn( - f"{name} is deprecated and will be removed in a future version. " - "Use public APIs instead.", - DeprecationWarning, - # https://github.com/pandas-dev/pandas/pull/55139#pullrequestreview-1720690758 - # on hard-coding stacklevel - stacklevel=2, - ) - - if name == "create_block_manager_from_blocks": - from pandas.core.internals.managers import create_block_manager_from_blocks - - return create_block_manager_from_blocks - - elif name == "Block": - from pandas.core.internals.blocks import Block - - return Block - - elif name == "DatetimeTZBlock": - from pandas.core.internals.blocks import DatetimeTZBlock - - return DatetimeTZBlock - - elif name == "ExtensionBlock": - from pandas.core.internals.blocks import ExtensionBlock - - return ExtensionBlock - - raise AttributeError( - f"module 'pandas.core.internals.api' has no attribute '{name}'" - ) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py deleted file mode 100644 index e253f82256a5f..0000000000000 --- a/pandas/core/internals/array_manager.py +++ /dev/null @@ -1,1340 +0,0 @@ -""" -Experimental manager based on storing a collection of 1D arrays -""" -from __future__ import annotations - -import itertools -from typing import ( - TYPE_CHECKING, - Callable, - Literal, -) - -import numpy as np - -from pandas._libs import ( - NaT, - lib, -) - -from pandas.core.dtypes.astype import ( - astype_array, - astype_array_safe, -) -from pandas.core.dtypes.cast import ( - ensure_dtype_can_hold_na, - find_common_type, - infer_dtype_from_scalar, - np_find_common_type, -) -from pandas.core.dtypes.common import ( - ensure_platform_int, - is_datetime64_ns_dtype, - is_integer, - is_numeric_dtype, - is_object_dtype, - is_timedelta64_ns_dtype, -) -from pandas.core.dtypes.dtypes import ExtensionDtype -from pandas.core.dtypes.generic import ( - ABCDataFrame, - ABCSeries, -) -from pandas.core.dtypes.missing import ( - array_equals, - isna, - na_value_for_dtype, -) - -import pandas.core.algorithms as algos -from pandas.core.array_algos.quantile import 
quantile_compat -from pandas.core.array_algos.take import take_1d -from pandas.core.arrays import ( - DatetimeArray, - ExtensionArray, - NumpyExtensionArray, - TimedeltaArray, -) -from pandas.core.construction import ( - ensure_wrapped_if_datetimelike, - extract_array, - sanitize_array, -) -from pandas.core.indexers import ( - maybe_convert_indices, - validate_indices, -) -from pandas.core.indexes.api import ( - Index, - ensure_index, -) -from pandas.core.indexes.base import get_values_for_csv -from pandas.core.internals.base import ( - DataManager, - SingleDataManager, - ensure_np_dtype, - interleaved_dtype, -) -from pandas.core.internals.blocks import ( - BlockPlacement, - ensure_block_shape, - external_values, - extract_pandas_array, - maybe_coerce_values, - new_block, -) -from pandas.core.internals.managers import make_na_array - -if TYPE_CHECKING: - from collections.abc import Hashable - - from pandas._typing import ( - ArrayLike, - AxisInt, - DtypeObj, - QuantileInterpolation, - Self, - npt, - ) - - -class BaseArrayManager(DataManager): - """ - Core internal data structure to implement DataFrame and Series. - - Alternative to the BlockManager, storing a list of 1D arrays instead of - Blocks. - - This is *not* a public API class - - Parameters - ---------- - arrays : Sequence of arrays - axes : Sequence of Index - verify_integrity : bool, default True - - """ - - __slots__ = [ - "_axes", # private attribute, because 'axes' has different order, see below - "arrays", - ] - - arrays: list[np.ndarray | ExtensionArray] - _axes: list[Index] - - def __init__( - self, - arrays: list[np.ndarray | ExtensionArray], - axes: list[Index], - verify_integrity: bool = True, - ) -> None: - raise NotImplementedError - - def make_empty(self, axes=None) -> Self: - """Return an empty ArrayManager with the items axis of len 0 (no columns)""" - if axes is None: - axes = [self.axes[1:], Index([])] - - arrays: list[np.ndarray | ExtensionArray] = [] - return type(self)(arrays, axes) - - @property - def items(self) -> Index: - return self._axes[-1] - - @property - # error: Signature of "axes" incompatible with supertype "DataManager" - def axes(self) -> list[Index]: # type: ignore[override] - # mypy doesn't work to override attribute with property - # see https://github.com/python/mypy/issues/4125 - """Axes is BlockManager-compatible order (columns, rows)""" - return [self._axes[1], self._axes[0]] - - @property - def shape_proper(self) -> tuple[int, ...]: - # this returns (n_rows, n_columns) - return tuple(len(ax) for ax in self._axes) - - @staticmethod - def _normalize_axis(axis: AxisInt) -> int: - # switch axis - axis = 1 if axis == 0 else 0 - return axis - - def set_axis(self, axis: AxisInt, new_labels: Index) -> None: - # Caller is responsible for ensuring we have an Index object. 
- self._validate_set_axis(axis, new_labels) - axis = self._normalize_axis(axis) - self._axes[axis] = new_labels - - def get_dtypes(self) -> npt.NDArray[np.object_]: - return np.array([arr.dtype for arr in self.arrays], dtype="object") - - def add_references(self, mgr: BaseArrayManager) -> None: - """ - Only implemented on the BlockManager level - """ - return - - def __getstate__(self): - return self.arrays, self._axes - - def __setstate__(self, state) -> None: - self.arrays = state[0] - self._axes = state[1] - - def __repr__(self) -> str: - output = type(self).__name__ - output += f"\nIndex: {self._axes[0]}" - if self.ndim == 2: - output += f"\nColumns: {self._axes[1]}" - output += f"\n{len(self.arrays)} arrays:" - for arr in self.arrays: - output += f"\n{arr.dtype}" - return output - - def apply( - self, - f, - align_keys: list[str] | None = None, - **kwargs, - ) -> Self: - """ - Iterate over the arrays, collect and create a new ArrayManager. - - Parameters - ---------- - f : str or callable - Name of the Array method to apply. - align_keys: List[str] or None, default None - **kwargs - Keywords to pass to `f` - - Returns - ------- - ArrayManager - """ - assert "filter" not in kwargs - - align_keys = align_keys or [] - result_arrays: list[ArrayLike] = [] - # fillna: Series/DataFrame is responsible for making sure value is aligned - - aligned_args = {k: kwargs[k] for k in align_keys} - - if f == "apply": - f = kwargs.pop("func") - - for i, arr in enumerate(self.arrays): - if aligned_args: - for k, obj in aligned_args.items(): - if isinstance(obj, (ABCSeries, ABCDataFrame)): - # The caller is responsible for ensuring that - # obj.axes[-1].equals(self.items) - if obj.ndim == 1: - kwargs[k] = obj.iloc[i] - else: - kwargs[k] = obj.iloc[:, i]._values - else: - # otherwise we have an array-like - kwargs[k] = obj[i] - - if callable(f): - applied = f(arr, **kwargs) - else: - applied = getattr(arr, f)(**kwargs) - - result_arrays.append(applied) - - new_axes = self._axes - return type(self)(result_arrays, new_axes) - - def apply_with_block(self, f, align_keys=None, **kwargs) -> Self: - # switch axis to follow BlockManager logic - swap_axis = True - if f == "interpolate": - swap_axis = False - if swap_axis and "axis" in kwargs and self.ndim == 2: - kwargs["axis"] = 1 if kwargs["axis"] == 0 else 0 - - align_keys = align_keys or [] - aligned_args = {k: kwargs[k] for k in align_keys} - - result_arrays = [] - - for i, arr in enumerate(self.arrays): - if aligned_args: - for k, obj in aligned_args.items(): - if isinstance(obj, (ABCSeries, ABCDataFrame)): - # The caller is responsible for ensuring that - # obj.axes[-1].equals(self.items) - if obj.ndim == 1: - if self.ndim == 2: - kwargs[k] = obj.iloc[slice(i, i + 1)]._values - else: - kwargs[k] = obj.iloc[:]._values - else: - kwargs[k] = obj.iloc[:, [i]]._values - else: - # otherwise we have an ndarray - if obj.ndim == 2: - kwargs[k] = obj[[i]] - - if isinstance(arr.dtype, np.dtype) and not isinstance(arr, np.ndarray): - # i.e. TimedeltaArray, DatetimeArray with tz=None. Need to - # convert for the Block constructors. 
- arr = np.asarray(arr) - - arr = maybe_coerce_values(arr) - if self.ndim == 2: - arr = ensure_block_shape(arr, 2) - bp = BlockPlacement(slice(0, 1, 1)) - block = new_block(arr, placement=bp, ndim=2) - else: - bp = BlockPlacement(slice(0, len(self), 1)) - block = new_block(arr, placement=bp, ndim=1) - - applied = getattr(block, f)(**kwargs) - if isinstance(applied, list): - applied = applied[0] - arr = applied.values - if self.ndim == 2 and arr.ndim == 2: - # 2D for np.ndarray or DatetimeArray/TimedeltaArray - assert len(arr) == 1 - # error: No overload variant of "__getitem__" of "ExtensionArray" - # matches argument type "Tuple[int, slice]" - arr = arr[0, :] # type: ignore[call-overload] - result_arrays.append(arr) - - return type(self)(result_arrays, self._axes) - - def setitem(self, indexer, value, warn: bool = True) -> Self: - return self.apply_with_block("setitem", indexer=indexer, value=value) - - def diff(self, n: int) -> Self: - assert self.ndim == 2 # caller ensures - return self.apply(algos.diff, n=n) - - def astype(self, dtype, copy: bool | None = False, errors: str = "raise") -> Self: - if copy is None: - copy = True - - return self.apply(astype_array_safe, dtype=dtype, copy=copy, errors=errors) - - def convert(self, copy: bool | None) -> Self: - if copy is None: - copy = True - - def _convert(arr): - if is_object_dtype(arr.dtype): - # extract NumpyExtensionArray for tests that patch - # NumpyExtensionArray._typ - arr = np.asarray(arr) - result = lib.maybe_convert_objects( - arr, - convert_non_numeric=True, - ) - if result is arr and copy: - return arr.copy() - return result - else: - return arr.copy() if copy else arr - - return self.apply(_convert) - - def get_values_for_csv( - self, *, float_format, date_format, decimal, na_rep: str = "nan", quoting=None - ) -> Self: - return self.apply( - get_values_for_csv, - na_rep=na_rep, - quoting=quoting, - float_format=float_format, - date_format=date_format, - decimal=decimal, - ) - - @property - def any_extension_types(self) -> bool: - """Whether any of the blocks in this manager are extension blocks""" - return False # any(block.is_extension for block in self.blocks) - - @property - def is_view(self) -> bool: - """return a boolean if we are a single block and are a view""" - # TODO what is this used for? - return False - - @property - def is_single_block(self) -> bool: - return len(self.arrays) == 1 - - def _get_data_subset(self, predicate: Callable) -> Self: - indices = [i for i, arr in enumerate(self.arrays) if predicate(arr)] - arrays = [self.arrays[i] for i in indices] - # TODO copy? - # Note: using Index.take ensures we can retain e.g. DatetimeIndex.freq, - # see test_describe_datetime_columns - taker = np.array(indices, dtype="intp") - new_cols = self._axes[1].take(taker) - new_axes = [self._axes[0], new_cols] - return type(self)(arrays, new_axes, verify_integrity=False) - - def get_bool_data(self, copy: bool = False) -> Self: - """ - Select columns that are bool-dtype and object-dtype columns that are all-bool. - - Parameters - ---------- - copy : bool, default False - Whether to copy the blocks - """ - return self._get_data_subset(lambda x: x.dtype == np.dtype(bool)) - - def get_numeric_data(self, copy: bool = False) -> Self: - """ - Select columns that have a numeric dtype. 
- - Parameters - ---------- - copy : bool, default False - Whether to copy the blocks - """ - return self._get_data_subset( - lambda arr: is_numeric_dtype(arr.dtype) - or getattr(arr.dtype, "_is_numeric", False) - ) - - def copy(self, deep: bool | Literal["all"] | None = True) -> Self: - """ - Make deep or shallow copy of ArrayManager - - Parameters - ---------- - deep : bool or string, default True - If False, return shallow copy (do not copy data) - If 'all', copy data and a deep copy of the index - - Returns - ------- - BlockManager - """ - if deep is None: - # ArrayManager does not yet support CoW, so deep=None always means - # deep=True for now - deep = True - - # this preserves the notion of view copying of axes - if deep: - # hit in e.g. tests.io.json.test_pandas - - def copy_func(ax): - return ax.copy(deep=True) if deep == "all" else ax.view() - - new_axes = [copy_func(ax) for ax in self._axes] - else: - new_axes = list(self._axes) - - if deep: - new_arrays = [arr.copy() for arr in self.arrays] - else: - new_arrays = list(self.arrays) - return type(self)(new_arrays, new_axes, verify_integrity=False) - - def reindex_indexer( - self, - new_axis, - indexer, - axis: AxisInt, - fill_value=None, - allow_dups: bool = False, - copy: bool | None = True, - # ignored keywords - only_slice: bool = False, - # ArrayManager specific keywords - use_na_proxy: bool = False, - ) -> Self: - axis = self._normalize_axis(axis) - return self._reindex_indexer( - new_axis, - indexer, - axis, - fill_value, - allow_dups, - copy, - use_na_proxy, - ) - - def _reindex_indexer( - self, - new_axis, - indexer: npt.NDArray[np.intp] | None, - axis: AxisInt, - fill_value=None, - allow_dups: bool = False, - copy: bool | None = True, - use_na_proxy: bool = False, - ) -> Self: - """ - Parameters - ---------- - new_axis : Index - indexer : ndarray[intp] or None - axis : int - fill_value : object, default None - allow_dups : bool, default False - copy : bool, default True - - - pandas-indexer with -1's only. - """ - if copy is None: - # ArrayManager does not yet support CoW, so deep=None always means - # deep=True for now - copy = True - - if indexer is None: - if new_axis is self._axes[axis] and not copy: - return self - - result = self.copy(deep=copy) - result._axes = list(self._axes) - result._axes[axis] = new_axis - return result - - # some axes don't allow reindexing with dups - if not allow_dups: - self._axes[axis]._validate_can_reindex(indexer) - - if axis >= self.ndim: - raise IndexError("Requested axis not found in manager") - - if axis == 1: - new_arrays = [] - for i in indexer: - if i == -1: - arr = self._make_na_array( - fill_value=fill_value, use_na_proxy=use_na_proxy - ) - else: - arr = self.arrays[i] - if copy: - arr = arr.copy() - new_arrays.append(arr) - - else: - validate_indices(indexer, len(self._axes[0])) - indexer = ensure_platform_int(indexer) - mask = indexer == -1 - needs_masking = mask.any() - new_arrays = [ - take_1d( - arr, - indexer, - allow_fill=needs_masking, - fill_value=fill_value, - mask=mask, - # if fill_value is not None else blk.fill_value - ) - for arr in self.arrays - ] - - new_axes = list(self._axes) - new_axes[axis] = new_axis - - return type(self)(new_arrays, new_axes, verify_integrity=False) - - def take( - self, - indexer: npt.NDArray[np.intp], - axis: AxisInt = 1, - verify: bool = True, - ) -> Self: - """ - Take items along any axis. 
- """ - assert isinstance(indexer, np.ndarray), type(indexer) - assert indexer.dtype == np.intp, indexer.dtype - - axis = self._normalize_axis(axis) - - if not indexer.ndim == 1: - raise ValueError("indexer should be 1-dimensional") - - n = self.shape_proper[axis] - indexer = maybe_convert_indices(indexer, n, verify=verify) - - new_labels = self._axes[axis].take(indexer) - return self._reindex_indexer( - new_axis=new_labels, indexer=indexer, axis=axis, allow_dups=True - ) - - def _make_na_array(self, fill_value=None, use_na_proxy: bool = False): - if use_na_proxy: - assert fill_value is None - return NullArrayProxy(self.shape_proper[0]) - - if fill_value is None: - fill_value = np.nan - - dtype, fill_value = infer_dtype_from_scalar(fill_value) - array_values = make_na_array(dtype, self.shape_proper[:1], fill_value) - return array_values - - def _equal_values(self, other) -> bool: - """ - Used in .equals defined in base class. Only check the column values - assuming shape and indexes have already been checked. - """ - for left, right in zip(self.arrays, other.arrays): - if not array_equals(left, right): - return False - return True - - # TODO - # to_dict - - -class ArrayManager(BaseArrayManager): - @property - def ndim(self) -> Literal[2]: - return 2 - - def __init__( - self, - arrays: list[np.ndarray | ExtensionArray], - axes: list[Index], - verify_integrity: bool = True, - ) -> None: - # Note: we are storing the axes in "_axes" in the (row, columns) order - # which contrasts the order how it is stored in BlockManager - self._axes = axes - self.arrays = arrays - - if verify_integrity: - self._axes = [ensure_index(ax) for ax in axes] - arrays = [extract_pandas_array(x, None, 1)[0] for x in arrays] - self.arrays = [maybe_coerce_values(arr) for arr in arrays] - self._verify_integrity() - - def _verify_integrity(self) -> None: - n_rows, n_columns = self.shape_proper - if not len(self.arrays) == n_columns: - raise ValueError( - "Number of passed arrays must equal the size of the column Index: " - f"{len(self.arrays)} arrays vs {n_columns} columns." - ) - for arr in self.arrays: - if not len(arr) == n_rows: - raise ValueError( - "Passed arrays should have the same length as the rows Index: " - f"{len(arr)} vs {n_rows} rows" - ) - if not isinstance(arr, (np.ndarray, ExtensionArray)): - raise ValueError( - "Passed arrays should be np.ndarray or ExtensionArray instances, " - f"got {type(arr)} instead" - ) - if not arr.ndim == 1: - raise ValueError( - "Passed arrays should be 1-dimensional, got array with " - f"{arr.ndim} dimensions instead." - ) - - # -------------------------------------------------------------------- - # Indexing - - def fast_xs(self, loc: int) -> SingleArrayManager: - """ - Return the array corresponding to `frame.iloc[loc]`. 
- - Parameters - ---------- - loc : int - - Returns - ------- - np.ndarray or ExtensionArray - """ - dtype = interleaved_dtype([arr.dtype for arr in self.arrays]) - - values = [arr[loc] for arr in self.arrays] - if isinstance(dtype, ExtensionDtype): - result = dtype.construct_array_type()._from_sequence(values, dtype=dtype) - # for datetime64/timedelta64, the np.ndarray constructor cannot handle pd.NaT - elif is_datetime64_ns_dtype(dtype): - result = DatetimeArray._from_sequence(values, dtype=dtype)._ndarray - elif is_timedelta64_ns_dtype(dtype): - result = TimedeltaArray._from_sequence(values, dtype=dtype)._ndarray - else: - result = np.array(values, dtype=dtype) - return SingleArrayManager([result], [self._axes[1]]) - - def get_slice(self, slobj: slice, axis: AxisInt = 0) -> ArrayManager: - axis = self._normalize_axis(axis) - - if axis == 0: - arrays = [arr[slobj] for arr in self.arrays] - elif axis == 1: - arrays = self.arrays[slobj] - - new_axes = list(self._axes) - new_axes[axis] = new_axes[axis]._getitem_slice(slobj) - - return type(self)(arrays, new_axes, verify_integrity=False) - - def iget(self, i: int) -> SingleArrayManager: - """ - Return the data as a SingleArrayManager. - """ - values = self.arrays[i] - return SingleArrayManager([values], [self._axes[0]]) - - def iget_values(self, i: int) -> ArrayLike: - """ - Return the data for column i as the values (ndarray or ExtensionArray). - """ - return self.arrays[i] - - @property - def column_arrays(self) -> list[ArrayLike]: - """ - Used in the JSON C code to access column arrays. - """ - - return [np.asarray(arr) for arr in self.arrays] - - def iset( - self, - loc: int | slice | np.ndarray, - value: ArrayLike, - inplace: bool = False, - refs=None, - ) -> None: - """ - Set new column(s). - - This changes the ArrayManager in-place, but replaces (an) existing - column(s), not changing column values in-place). - - Parameters - ---------- - loc : integer, slice or boolean mask - Positional location (already bounds checked) - value : np.ndarray or ExtensionArray - inplace : bool, default False - Whether overwrite existing array as opposed to replacing it. - """ - # single column -> single integer index - if lib.is_integer(loc): - # TODO can we avoid needing to unpack this here? 
That means converting - # DataFrame into 1D array when loc is an integer - if isinstance(value, np.ndarray) and value.ndim == 2: - assert value.shape[1] == 1 - value = value[:, 0] - - # TODO we receive a datetime/timedelta64 ndarray from DataFrame._iset_item - # but we should avoid that and pass directly the proper array - value = maybe_coerce_values(value) - - assert isinstance(value, (np.ndarray, ExtensionArray)) - assert value.ndim == 1 - assert len(value) == len(self._axes[0]) - self.arrays[loc] = value - return - - # multiple columns -> convert slice or array to integer indices - elif isinstance(loc, slice): - indices: range | np.ndarray = range( - loc.start if loc.start is not None else 0, - loc.stop if loc.stop is not None else self.shape_proper[1], - loc.step if loc.step is not None else 1, - ) - else: - assert isinstance(loc, np.ndarray) - assert loc.dtype == "bool" - indices = np.nonzero(loc)[0] - - assert value.ndim == 2 - assert value.shape[0] == len(self._axes[0]) - - for value_idx, mgr_idx in enumerate(indices): - # error: No overload variant of "__getitem__" of "ExtensionArray" matches - # argument type "Tuple[slice, int]" - value_arr = value[:, value_idx] # type: ignore[call-overload] - self.arrays[mgr_idx] = value_arr - return - - def column_setitem( - self, loc: int, idx: int | slice | np.ndarray, value, inplace_only: bool = False - ) -> None: - """ - Set values ("setitem") into a single column (not setting the full column). - - This is a method on the ArrayManager level, to avoid creating an - intermediate Series at the DataFrame level (`s = df[loc]; s[idx] = value`) - """ - if not is_integer(loc): - raise TypeError("The column index should be an integer") - arr = self.arrays[loc] - mgr = SingleArrayManager([arr], [self._axes[0]]) - if inplace_only: - mgr.setitem_inplace(idx, value) - else: - new_mgr = mgr.setitem((idx,), value) - # update existing ArrayManager in-place - self.arrays[loc] = new_mgr.arrays[0] - - def insert(self, loc: int, item: Hashable, value: ArrayLike, refs=None) -> None: - """ - Insert item at selected position. - - Parameters - ---------- - loc : int - item : hashable - value : np.ndarray or ExtensionArray - """ - # insert to the axis; this could possibly raise a TypeError - new_axis = self.items.insert(loc, item) - - value = extract_array(value, extract_numpy=True) - if value.ndim == 2: - if value.shape[0] == 1: - # error: No overload variant of "__getitem__" of "ExtensionArray" - # matches argument type "Tuple[int, slice]" - value = value[0, :] # type: ignore[call-overload] - else: - raise ValueError( - f"Expected a 1D array, got an array with shape {value.shape}" - ) - value = maybe_coerce_values(value) - - # TODO self.arrays can be empty - # assert len(value) == len(self.arrays[0]) - - # TODO is this copy needed? - arrays = self.arrays.copy() - arrays.insert(loc, value) - - self.arrays = arrays - self._axes[1] = new_axis - - def idelete(self, indexer) -> ArrayManager: - """ - Delete selected locations in-place (new block and array, same BlockManager) - """ - to_keep = np.ones(self.shape[0], dtype=np.bool_) - to_keep[indexer] = False - - self.arrays = [self.arrays[i] for i in np.nonzero(to_keep)[0]] - self._axes = [self._axes[0], self._axes[1][to_keep]] - return self - - # -------------------------------------------------------------------- - # Array-wise Operation - - def grouped_reduce(self, func: Callable) -> Self: - """ - Apply grouped reduction function columnwise, returning a new ArrayManager. 
- - Parameters - ---------- - func : grouped reduction function - - Returns - ------- - ArrayManager - """ - result_arrays: list[np.ndarray] = [] - result_indices: list[int] = [] - - for i, arr in enumerate(self.arrays): - # grouped_reduce functions all expect 2D arrays - arr = ensure_block_shape(arr, ndim=2) - res = func(arr) - if res.ndim == 2: - # reverse of ensure_block_shape - assert res.shape[0] == 1 - res = res[0] - - result_arrays.append(res) - result_indices.append(i) - - if len(result_arrays) == 0: - nrows = 0 - else: - nrows = result_arrays[0].shape[0] - index = Index(range(nrows)) - - columns = self.items - - # error: Argument 1 to "ArrayManager" has incompatible type "List[ndarray]"; - # expected "List[Union[ndarray, ExtensionArray]]" - return type(self)(result_arrays, [index, columns]) # type: ignore[arg-type] - - def reduce(self, func: Callable) -> Self: - """ - Apply reduction function column-wise, returning a single-row ArrayManager. - - Parameters - ---------- - func : reduction function - - Returns - ------- - ArrayManager - """ - result_arrays: list[np.ndarray] = [] - for i, arr in enumerate(self.arrays): - res = func(arr, axis=0) - - # TODO NaT doesn't preserve dtype, so we need to ensure to create - # a timedelta result array if original was timedelta - # what if datetime results in timedelta? (eg std) - dtype = arr.dtype if res is NaT else None - result_arrays.append( - sanitize_array([res], None, dtype=dtype) # type: ignore[arg-type] - ) - - index = Index._simple_new(np.array([None], dtype=object)) # placeholder - columns = self.items - - # error: Argument 1 to "ArrayManager" has incompatible type "List[ndarray]"; - # expected "List[Union[ndarray, ExtensionArray]]" - new_mgr = type(self)(result_arrays, [index, columns]) # type: ignore[arg-type] - return new_mgr - - def operate_blockwise(self, other: ArrayManager, array_op) -> ArrayManager: - """ - Apply array_op blockwise with another (aligned) BlockManager. - """ - # TODO what if `other` is BlockManager ? - left_arrays = self.arrays - right_arrays = other.arrays - result_arrays = [ - array_op(left, right) for left, right in zip(left_arrays, right_arrays) - ] - return type(self)(result_arrays, self._axes) - - def quantile( - self, - *, - qs: Index, # with dtype float64 - transposed: bool = False, - interpolation: QuantileInterpolation = "linear", - ) -> ArrayManager: - arrs = [ensure_block_shape(x, 2) for x in self.arrays] - new_arrs = [ - quantile_compat(x, np.asarray(qs._values), interpolation) for x in arrs - ] - for i, arr in enumerate(new_arrs): - if arr.ndim == 2: - assert arr.shape[0] == 1, arr.shape - new_arrs[i] = arr[0] - - axes = [qs, self._axes[1]] - return type(self)(new_arrs, axes) - - # ---------------------------------------------------------------- - - def unstack(self, unstacker, fill_value) -> ArrayManager: - """ - Return a BlockManager with all blocks unstacked. - - Parameters - ---------- - unstacker : reshape._Unstacker - fill_value : Any - fill_value for newly introduced missing values. 
- - Returns - ------- - unstacked : BlockManager - """ - indexer, _ = unstacker._indexer_and_to_sort - if unstacker.mask.all(): - new_indexer = indexer - allow_fill = False - new_mask2D = None - needs_masking = None - else: - new_indexer = np.full(unstacker.mask.shape, -1) - new_indexer[unstacker.mask] = indexer - allow_fill = True - # calculating the full mask once and passing it to take_1d is faster - # than letting take_1d calculate it in each repeated call - new_mask2D = (~unstacker.mask).reshape(*unstacker.full_shape) - needs_masking = new_mask2D.any(axis=0) - new_indexer2D = new_indexer.reshape(*unstacker.full_shape) - new_indexer2D = ensure_platform_int(new_indexer2D) - - new_arrays = [] - for arr in self.arrays: - for i in range(unstacker.full_shape[1]): - if allow_fill: - # error: Value of type "Optional[Any]" is not indexable [index] - new_arr = take_1d( - arr, - new_indexer2D[:, i], - allow_fill=needs_masking[i], # type: ignore[index] - fill_value=fill_value, - mask=new_mask2D[:, i], # type: ignore[index] - ) - else: - new_arr = take_1d(arr, new_indexer2D[:, i], allow_fill=False) - new_arrays.append(new_arr) - - new_index = unstacker.new_index - new_columns = unstacker.get_new_columns(self._axes[1]) - new_axes = [new_index, new_columns] - - return type(self)(new_arrays, new_axes, verify_integrity=False) - - def as_array( - self, - dtype=None, - copy: bool = False, - na_value: object = lib.no_default, - ) -> np.ndarray: - """ - Convert the blockmanager data into an numpy array. - - Parameters - ---------- - dtype : object, default None - Data type of the return array. - copy : bool, default False - If True then guarantee that a copy is returned. A value of - False does not guarantee that the underlying data is not - copied. - na_value : object, default lib.no_default - Value to be used as the missing value sentinel. - - Returns - ------- - arr : ndarray - """ - if len(self.arrays) == 0: - empty_arr = np.empty(self.shape, dtype=float) - return empty_arr.transpose() - - # We want to copy when na_value is provided to avoid - # mutating the original object - copy = copy or na_value is not lib.no_default - - if not dtype: - dtype = interleaved_dtype([arr.dtype for arr in self.arrays]) - - dtype = ensure_np_dtype(dtype) - - result = np.empty(self.shape_proper, dtype=dtype) - - for i, arr in enumerate(self.arrays): - arr = arr.astype(dtype, copy=copy) - result[:, i] = arr - - if na_value is not lib.no_default: - result[isna(result)] = na_value - - return result - - @classmethod - def concat_horizontal(cls, mgrs: list[Self], axes: list[Index]) -> Self: - """ - Concatenate uniformly-indexed ArrayManagers horizontally. - """ - # concatting along the columns -> combine reindexed arrays in a single manager - arrays = list(itertools.chain.from_iterable([mgr.arrays for mgr in mgrs])) - new_mgr = cls(arrays, [axes[1], axes[0]], verify_integrity=False) - return new_mgr - - @classmethod - def concat_vertical(cls, mgrs: list[Self], axes: list[Index]) -> Self: - """ - Concatenate uniformly-indexed ArrayManagers vertically. 
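`as_array` above materializes the column store as a single 2D ndarray: resolve a common dtype across the columns, allocate the result, and fill it column by column. A sketch, with `np.result_type` standing in for the internal `interleaved_dtype`:

import numpy as np

def to_2d(arrays, dtype=None):
    # Interleave per-column 1D arrays into one 2D ndarray.
    if dtype is None:
        dtype = np.result_type(*arrays)   # stand-in for interleaved_dtype()
    result = np.empty((len(arrays[0]), len(arrays)), dtype=dtype)
    for i, arr in enumerate(arrays):
        result[:, i] = arr.astype(dtype, copy=False)
    return result

print(to_2d([np.array([1, 2]), np.array([0.5, 1.5])]))  # common dtype: float64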
- """ - # concatting along the rows -> concat the reindexed arrays - # TODO(ArrayManager) doesn't yet preserve the correct dtype - arrays = [ - concat_arrays([mgrs[i].arrays[j] for i in range(len(mgrs))]) - for j in range(len(mgrs[0].arrays)) - ] - new_mgr = cls(arrays, [axes[1], axes[0]], verify_integrity=False) - return new_mgr - - -class SingleArrayManager(BaseArrayManager, SingleDataManager): - __slots__ = [ - "_axes", # private attribute, because 'axes' has different order, see below - "arrays", - ] - - arrays: list[np.ndarray | ExtensionArray] - _axes: list[Index] - - @property - def ndim(self) -> Literal[1]: - return 1 - - def __init__( - self, - arrays: list[np.ndarray | ExtensionArray], - axes: list[Index], - verify_integrity: bool = True, - ) -> None: - self._axes = axes - self.arrays = arrays - - if verify_integrity: - assert len(axes) == 1 - assert len(arrays) == 1 - self._axes = [ensure_index(ax) for ax in self._axes] - arr = arrays[0] - arr = maybe_coerce_values(arr) - arr = extract_pandas_array(arr, None, 1)[0] - self.arrays = [arr] - self._verify_integrity() - - def _verify_integrity(self) -> None: - (n_rows,) = self.shape - assert len(self.arrays) == 1 - arr = self.arrays[0] - assert len(arr) == n_rows - if not arr.ndim == 1: - raise ValueError( - "Passed array should be 1-dimensional, got array with " - f"{arr.ndim} dimensions instead." - ) - - @staticmethod - def _normalize_axis(axis): - return axis - - def make_empty(self, axes=None) -> Self: - """Return an empty ArrayManager with index/array of length 0""" - if axes is None: - axes = [Index([], dtype=object)] - array: np.ndarray = np.array([], dtype=self.dtype) - return type(self)([array], axes) - - @classmethod - def from_array(cls, array, index) -> SingleArrayManager: - return cls([array], [index]) - - # error: Cannot override writeable attribute with read-only property - @property - def axes(self) -> list[Index]: # type: ignore[override] - return self._axes - - @property - def index(self) -> Index: - return self._axes[0] - - @property - def dtype(self): - return self.array.dtype - - def external_values(self): - """The array that Series.values returns""" - return external_values(self.array) - - def internal_values(self): - """The array that Series._values returns""" - return self.array - - def array_values(self): - """The array that Series.array returns""" - arr = self.array - if isinstance(arr, np.ndarray): - arr = NumpyExtensionArray(arr) - return arr - - @property - def _can_hold_na(self) -> bool: - if isinstance(self.array, np.ndarray): - return self.array.dtype.kind not in "iub" - else: - # ExtensionArray - return self.array._can_hold_na - - @property - def is_single_block(self) -> bool: - return True - - def fast_xs(self, loc: int) -> SingleArrayManager: - raise NotImplementedError("Use series._values[loc] instead") - - def get_slice(self, slobj: slice, axis: AxisInt = 0) -> SingleArrayManager: - if axis >= self.ndim: - raise IndexError("Requested axis not found in manager") - - new_array = self.array[slobj] - new_index = self.index._getitem_slice(slobj) - return type(self)([new_array], [new_index], verify_integrity=False) - - def get_rows_with_mask(self, indexer: npt.NDArray[np.bool_]) -> SingleArrayManager: - new_array = self.array[indexer] - new_index = self.index[indexer] - return type(self)([new_array], [new_index]) - - # error: Signature of "apply" incompatible with supertype "BaseArrayManager" - def apply(self, func, **kwargs) -> Self: # type: ignore[override] - if callable(func): - new_array = 
func(self.array, **kwargs) - else: - new_array = getattr(self.array, func)(**kwargs) - return type(self)([new_array], self._axes) - - def setitem(self, indexer, value, warn: bool = True) -> SingleArrayManager: - """ - Set values with indexer. - - For SingleArrayManager, this backs s[indexer] = value - - See `setitem_inplace` for a version that works inplace and doesn't - return a new Manager. - """ - if isinstance(indexer, np.ndarray) and indexer.ndim > self.ndim: - raise ValueError(f"Cannot set values with ndim > {self.ndim}") - return self.apply_with_block("setitem", indexer=indexer, value=value) - - def idelete(self, indexer) -> SingleArrayManager: - """ - Delete selected locations in-place (new array, same ArrayManager) - """ - to_keep = np.ones(self.shape[0], dtype=np.bool_) - to_keep[indexer] = False - - self.arrays = [self.arrays[0][to_keep]] - self._axes = [self._axes[0][to_keep]] - return self - - def _get_data_subset(self, predicate: Callable) -> SingleArrayManager: - # used in get_numeric_data / get_bool_data - if predicate(self.array): - return type(self)(self.arrays, self._axes, verify_integrity=False) - else: - return self.make_empty() - - def set_values(self, values: ArrayLike) -> None: - """ - Set (replace) the values of the SingleArrayManager in place. - - Use at your own risk! This does not check if the passed values are - valid for the current SingleArrayManager (length, dtype, etc). - """ - self.arrays[0] = values - - def to_2d_mgr(self, columns: Index) -> ArrayManager: - """ - Manager analogue of Series.to_frame - """ - arrays = [self.arrays[0]] - axes = [self.axes[0], columns] - - return ArrayManager(arrays, axes, verify_integrity=False) - - -class NullArrayProxy: - """ - Proxy object for an all-NA array. - - Only stores the length of the array, and not the dtype. The dtype - will only be known when actually concatenating (after determining the - common dtype, for which this proxy is ignored). - Using this object avoids that the internals/concat.py needs to determine - the proper dtype and array type. - """ - - ndim = 1 - - def __init__(self, n: int) -> None: - self.n = n - - @property - def shape(self) -> tuple[int]: - return (self.n,) - - def to_array(self, dtype: DtypeObj) -> ArrayLike: - """ - Helper function to create the actual all-NA array from the NullArrayProxy - object. - - Parameters - ---------- - arr : NullArrayProxy - dtype : the dtype for the resulting array - - Returns - ------- - np.ndarray or ExtensionArray - """ - if isinstance(dtype, ExtensionDtype): - empty = dtype.construct_array_type()._from_sequence([], dtype=dtype) - indexer = -np.ones(self.n, dtype=np.intp) - return empty.take(indexer, allow_fill=True) - else: - # when introducing missing values, int becomes float, bool becomes object - dtype = ensure_dtype_can_hold_na(dtype) - fill_value = na_value_for_dtype(dtype) - arr = np.empty(self.n, dtype=dtype) - arr.fill(fill_value) - return ensure_wrapped_if_datetimelike(arr) - - -def concat_arrays(to_concat: list) -> ArrayLike: - """ - Alternative for concat_compat but specialized for use in the ArrayManager. - - Differences: only deals with 1D arrays (no axis keyword), assumes - ensure_wrapped_if_datetimelike and does not skip empty arrays to determine - the dtype. - In addition ensures that all NullArrayProxies get replaced with actual - arrays. 
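`NullArrayProxy` above defers an all-NA column to concatenation time, storing only its length; the actual array is materialized once the common dtype is known. A toy equivalent (`ToyNullProxy` is an illustrative name; the int-to-float and bool-to-object promotions mirror what `ensure_dtype_can_hold_na` does):

import numpy as np

class ToyNullProxy:
    # Stands in for an all-NA column until the common dtype is known.
    def __init__(self, n):
        self.n = n

    def to_array(self, dtype):
        dtype = np.dtype(dtype)
        if dtype.kind in "iu":        # introducing NAs: int becomes float
            dtype = np.dtype("float64")
        elif dtype.kind == "b":       # ... and bool becomes object
            dtype = np.dtype(object)
        arr = np.empty(self.n, dtype=dtype)
        arr.fill(np.nan if dtype.kind == "f" else None)
        return arr

print(ToyNullProxy(2).to_array("float64"))  # [nan nan]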
- - Parameters - ---------- - to_concat : list of arrays - - Returns - ------- - np.ndarray or ExtensionArray - """ - # ignore the all-NA proxies to determine the resulting dtype - to_concat_no_proxy = [x for x in to_concat if not isinstance(x, NullArrayProxy)] - - dtypes = {x.dtype for x in to_concat_no_proxy} - single_dtype = len(dtypes) == 1 - - if single_dtype: - target_dtype = to_concat_no_proxy[0].dtype - elif all(lib.is_np_dtype(x, "iub") for x in dtypes): - # GH#42092 - target_dtype = np_find_common_type(*dtypes) - else: - target_dtype = find_common_type([arr.dtype for arr in to_concat_no_proxy]) - - to_concat = [ - arr.to_array(target_dtype) - if isinstance(arr, NullArrayProxy) - else astype_array(arr, target_dtype, copy=False) - for arr in to_concat - ] - - if isinstance(to_concat[0], ExtensionArray): - cls = type(to_concat[0]) - return cls._concat_same_type(to_concat) - - result = np.concatenate(to_concat) - - # TODO decide on exact behaviour (we shouldn't do this only for empty result) - # see https://github.com/pandas-dev/pandas/issues/39817 - if len(result) == 0: - # all empties -> check for bool to not coerce to float - kinds = {obj.dtype.kind for obj in to_concat_no_proxy} - if len(kinds) != 1: - if "b" in kinds: - result = result.astype(object) - return result diff --git a/pandas/core/internals/base.py b/pandas/core/internals/base.py deleted file mode 100644 index ae91f167205a0..0000000000000 --- a/pandas/core/internals/base.py +++ /dev/null @@ -1,407 +0,0 @@ -""" -Base class for the internal managers. Both BlockManager and ArrayManager -inherit from this class. -""" -from __future__ import annotations - -from typing import ( - TYPE_CHECKING, - Any, - Literal, - cast, - final, -) - -import numpy as np - -from pandas._config import ( - using_copy_on_write, - warn_copy_on_write, -) - -from pandas._libs import ( - algos as libalgos, - lib, -) -from pandas.errors import AbstractMethodError -from pandas.util._validators import validate_bool_kwarg - -from pandas.core.dtypes.cast import ( - find_common_type, - np_can_hold_element, -) -from pandas.core.dtypes.dtypes import ( - ExtensionDtype, - SparseDtype, -) - -from pandas.core.base import PandasObject -from pandas.core.construction import extract_array -from pandas.core.indexes.api import ( - Index, - default_index, -) - -if TYPE_CHECKING: - from pandas._typing import ( - ArrayLike, - AxisInt, - DtypeObj, - Self, - Shape, - ) - - -class _AlreadyWarned: - def __init__(self): - # This class is used on the manager level to the block level to - # ensure that we warn only once. The block method can update the - # warned_already option without returning a value to keep the - # interface consistent. This is only a temporary solution for - # CoW warnings. - self.warned_already = False - - -class DataManager(PandasObject): - # TODO share more methods/attributes - - axes: list[Index] - - @property - def items(self) -> Index: - raise AbstractMethodError(self) - - @final - def __len__(self) -> int: - return len(self.items) - - @property - def ndim(self) -> int: - return len(self.axes) - - @property - def shape(self) -> Shape: - return tuple(len(ax) for ax in self.axes) - - @final - def _validate_set_axis(self, axis: AxisInt, new_labels: Index) -> None: - # Caller is responsible for ensuring we have an Index object. - old_len = len(self.axes[axis]) - new_len = len(new_labels) - - if axis == 1 and len(self.items) == 0: - # If we are setting the index on a DataFrame with no columns, - # it is OK to change the length. 
- pass - - elif new_len != old_len: - raise ValueError( - f"Length mismatch: Expected axis has {old_len} elements, new " - f"values have {new_len} elements" - ) - - def reindex_indexer( - self, - new_axis, - indexer, - axis: AxisInt, - fill_value=None, - allow_dups: bool = False, - copy: bool = True, - only_slice: bool = False, - ) -> Self: - raise AbstractMethodError(self) - - @final - def reindex_axis( - self, - new_index: Index, - axis: AxisInt, - fill_value=None, - only_slice: bool = False, - ) -> Self: - """ - Conform data manager to new index. - """ - new_index, indexer = self.axes[axis].reindex(new_index) - - return self.reindex_indexer( - new_index, - indexer, - axis=axis, - fill_value=fill_value, - copy=False, - only_slice=only_slice, - ) - - def _equal_values(self, other: Self) -> bool: - """ - To be implemented by the subclasses. Only check the column values - assuming shape and indexes have already been checked. - """ - raise AbstractMethodError(self) - - @final - def equals(self, other: object) -> bool: - """ - Implementation for DataFrame.equals - """ - if not isinstance(other, type(self)): - return False - - self_axes, other_axes = self.axes, other.axes - if len(self_axes) != len(other_axes): - return False - if not all(ax1.equals(ax2) for ax1, ax2 in zip(self_axes, other_axes)): - return False - - return self._equal_values(other) - - def apply( - self, - f, - align_keys: list[str] | None = None, - **kwargs, - ) -> Self: - raise AbstractMethodError(self) - - def apply_with_block( - self, - f, - align_keys: list[str] | None = None, - **kwargs, - ) -> Self: - raise AbstractMethodError(self) - - @final - def isna(self, func) -> Self: - return self.apply("apply", func=func) - - @final - def fillna(self, value, limit: int | None, inplace: bool, downcast) -> Self: - if limit is not None: - # Do this validation even if we go through one of the no-op paths - limit = libalgos.validate_limit(None, limit=limit) - - return self.apply_with_block( - "fillna", - value=value, - limit=limit, - inplace=inplace, - downcast=downcast, - using_cow=using_copy_on_write(), - already_warned=_AlreadyWarned(), - ) - - @final - def where(self, other, cond, align: bool) -> Self: - if align: - align_keys = ["other", "cond"] - else: - align_keys = ["cond"] - other = extract_array(other, extract_numpy=True) - - return self.apply_with_block( - "where", - align_keys=align_keys, - other=other, - cond=cond, - using_cow=using_copy_on_write(), - ) - - @final - def putmask(self, mask, new, align: bool = True, warn: bool = True) -> Self: - if align: - align_keys = ["new", "mask"] - else: - align_keys = ["mask"] - new = extract_array(new, extract_numpy=True) - - already_warned = None - if warn_copy_on_write(): - already_warned = _AlreadyWarned() - if not warn: - already_warned.warned_already = True - - return self.apply_with_block( - "putmask", - align_keys=align_keys, - mask=mask, - new=new, - using_cow=using_copy_on_write(), - already_warned=already_warned, - ) - - @final - def round(self, decimals: int, using_cow: bool = False) -> Self: - return self.apply_with_block( - "round", - decimals=decimals, - using_cow=using_cow, - ) - - @final - def replace(self, to_replace, value, inplace: bool) -> Self: - inplace = validate_bool_kwarg(inplace, "inplace") - # NDFrame.replace ensures the not-is_list_likes here - assert not lib.is_list_like(to_replace) - assert not lib.is_list_like(value) - return self.apply_with_block( - "replace", - to_replace=to_replace, - value=value, - inplace=inplace, - 
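The `reindex_axis` shim above delegates to `Index.reindex`, which returns the conformed index together with an integer indexer in which -1 marks labels missing from the old axis; those positions are the ones later filled with `fill_value`. For example:

import pandas as pd

old = pd.Index(["a", "b", "c"])
new_index, indexer = old.reindex(["b", "d"])
print(new_index)  # Index(['b', 'd'], ...)
print(indexer)    # [ 1 -1]  -> 'd' is absent and will be filled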
using_cow=using_copy_on_write(), - already_warned=_AlreadyWarned(), - ) - - @final - def replace_regex(self, **kwargs) -> Self: - return self.apply_with_block( - "_replace_regex", - **kwargs, - using_cow=using_copy_on_write(), - already_warned=_AlreadyWarned(), - ) - - @final - def replace_list( - self, - src_list: list[Any], - dest_list: list[Any], - inplace: bool = False, - regex: bool = False, - ) -> Self: - """do a list replace""" - inplace = validate_bool_kwarg(inplace, "inplace") - - bm = self.apply_with_block( - "replace_list", - src_list=src_list, - dest_list=dest_list, - inplace=inplace, - regex=regex, - using_cow=using_copy_on_write(), - already_warned=_AlreadyWarned(), - ) - bm._consolidate_inplace() - return bm - - def interpolate(self, inplace: bool, **kwargs) -> Self: - return self.apply_with_block( - "interpolate", - inplace=inplace, - **kwargs, - using_cow=using_copy_on_write(), - already_warned=_AlreadyWarned(), - ) - - def pad_or_backfill(self, inplace: bool, **kwargs) -> Self: - return self.apply_with_block( - "pad_or_backfill", - inplace=inplace, - **kwargs, - using_cow=using_copy_on_write(), - already_warned=_AlreadyWarned(), - ) - - def shift(self, periods: int, fill_value) -> Self: - if fill_value is lib.no_default: - fill_value = None - - return self.apply_with_block("shift", periods=periods, fill_value=fill_value) - - # -------------------------------------------------------------------- - # Consolidation: No-ops for all but BlockManager - - def is_consolidated(self) -> bool: - return True - - def consolidate(self) -> Self: - return self - - def _consolidate_inplace(self) -> None: - return - - -class SingleDataManager(DataManager): - @property - def ndim(self) -> Literal[1]: - return 1 - - @final - @property - def array(self) -> ArrayLike: - """ - Quick access to the backing array of the Block or SingleArrayManager. - """ - # error: "SingleDataManager" has no attribute "arrays"; maybe "array" - return self.arrays[0] # type: ignore[attr-defined] - - def setitem_inplace(self, indexer, value, warn: bool = True) -> None: - """ - Set values with indexer. - - For Single[Block/Array]Manager, this backs s[indexer] = value - - This is an inplace version of `setitem()`, mutating the manager/values - in place, not returning a new Manager (and Block), and thus never changing - the dtype. - """ - arr = self.array - - # EAs will do this validation in their own __setitem__ methods. - if isinstance(arr, np.ndarray): - # Note: checking for ndarray instead of np.dtype means we exclude - # dt64/td64, which do their own validation. - value = np_can_hold_element(arr.dtype, value) - - if isinstance(value, np.ndarray) and value.ndim == 1 and len(value) == 1: - # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615 - value = value[0, ...] - - arr[indexer] = value - - def grouped_reduce(self, func): - arr = self.array - res = func(arr) - index = default_index(len(res)) - - mgr = type(self).from_array(res, index) - return mgr - - @classmethod - def from_array(cls, arr: ArrayLike, index: Index): - raise AbstractMethodError(cls) - - -def interleaved_dtype(dtypes: list[DtypeObj]) -> DtypeObj | None: - """ - Find the common dtype for `blocks`. - - Parameters - ---------- - blocks : List[DtypeObj] - - Returns - ------- - dtype : np.dtype, ExtensionDtype, or None - None is returned when `blocks` is empty. 
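`setitem_inplace` above validates the value against the existing NumPy dtype before writing, so an in-place set never silently changes the dtype. A rough analogue, with `np.can_cast` as a loose stand-in for the internal `np_can_hold_element`:

import numpy as np

def setitem_inplace(arr, indexer, value):
    # Refuse values the current dtype cannot represent, instead of upcasting.
    if not np.can_cast(np.asarray(value).dtype, arr.dtype, casting="same_kind"):
        raise TypeError(f"value {value!r} does not fit dtype {arr.dtype}")
    arr[indexer] = value

a = np.array([1, 2, 3])
setitem_inplace(a, 0, 10)      # fine: stays int64
# setitem_inplace(a, 0, 1.5)   # would raise: float does not fit int64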
- """ - if not len(dtypes): - return None - - return find_common_type(dtypes) - - -def ensure_np_dtype(dtype: DtypeObj) -> np.dtype: - # TODO: https://github.com/pandas-dev/pandas/issues/22791 - # Give EAs some input on what happens here. Sparse needs this. - if isinstance(dtype, SparseDtype): - dtype = dtype.subtype - dtype = cast(np.dtype, dtype) - elif isinstance(dtype, ExtensionDtype): - dtype = np.dtype("object") - elif dtype == np.dtype(str): - dtype = np.dtype("object") - return dtype diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 20eff9315bc80..6aa5062b8ed86 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1,11 +1,10 @@ from __future__ import annotations -from functools import wraps +import inspect import re from typing import ( TYPE_CHECKING, Any, - Callable, Literal, cast, final, @@ -15,12 +14,6 @@ import numpy as np -from pandas._config import ( - get_option, - using_copy_on_write, - warn_copy_on_write, -) - from pandas._libs import ( NaT, internals as libinternals, @@ -36,7 +29,6 @@ AxisInt, DtypeBackend, DtypeObj, - F, FillnaOptions, IgnoreRaise, InterpolateOptions, @@ -45,7 +37,10 @@ Shape, npt, ) -from pandas.errors import AbstractMethodError +from pandas.errors import ( + AbstractMethodError, + OutOfBoundsDatetime, +) from pandas.util._decorators import cache_readonly from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_bool_kwarg @@ -59,7 +54,6 @@ can_hold_element, convert_dtypes, find_result_type, - maybe_downcast_to_dtype, np_can_hold_element, ) from pandas.core.dtypes.common import ( @@ -83,6 +77,7 @@ ABCNumpyExtensionArray, ABCSeries, ) +from pandas.core.dtypes.inference import is_re from pandas.core.dtypes.missing import ( is_valid_na_for_dtype, isna, @@ -106,7 +101,6 @@ ) from pandas.core.array_algos.transforms import shift from pandas.core.arrays import ( - Categorical, DatetimeArray, ExtensionArray, IntervalArray, @@ -114,6 +108,7 @@ PeriodArray, TimedeltaArray, ) +from pandas.core.arrays.string_ import StringDtype from pandas.core.base import PandasObject import pandas.core.common as com from pandas.core.computation import expressions @@ -126,6 +121,8 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, + Generator, Iterable, Sequence, ) @@ -137,46 +134,6 @@ _dtype_obj = np.dtype("object") -COW_WARNING_GENERAL_MSG = """\ -Setting a value on a view: behaviour will change in pandas 3.0. -You are mutating a Series or DataFrame object, and currently this mutation will -also have effect on other Series or DataFrame objects that share data with this -object. In pandas 3.0 (with Copy-on-Write), updating one Series or DataFrame object -will never modify another. -""" - - -COW_WARNING_SETITEM_MSG = """\ -Setting a value on a view: behaviour will change in pandas 3.0. -Currently, the mutation will also have effect on the object that shares data -with this object. For example, when setting a value in a Series that was -extracted from a column of a DataFrame, that DataFrame will also be updated: - - ser = df["col"] - ser[0] = 0 <--- in pandas 2, this also updates `df` - -In pandas 3.0 (with Copy-on-Write), updating one Series/DataFrame will never -modify another, and thus in the example above, `df` will not be changed. -""" - - -def maybe_split(meth: F) -> F: - """ - If we have a multi-column block, split and operate block-wise. Otherwise - use the original method. 
- """ - - @wraps(meth) - def newfunc(self, *args, **kwargs) -> list[Block]: - if self.ndim == 1 or self.shape[0] == 1: - return meth(self, *args, **kwargs) - else: - # Split and operate column-by-column - return self.split_and_operate(meth, *args, **kwargs) - - return cast(F, newfunc) - - class Block(PandasObject, libinternals.Block): """ Canonical n-dimensional unit of homogeneous dtype contained in a pandas @@ -198,12 +155,9 @@ class Block(PandasObject, libinternals.Block): def _validate_ndim(self) -> bool: """ We validate dimension for blocks that can hold 2D values, which for now - means numpy dtypes or DatetimeTZDtype. + means numpy dtypes or EA dtypes like DatetimeTZDtype and PeriodDtype. """ - dtype = self.dtype - return not isinstance(dtype, ExtensionDtype) or isinstance( - dtype, DatetimeTZDtype - ) + return not is_1d_only_ea_dtype(self.dtype) @final @cache_readonly @@ -433,20 +387,18 @@ def _split_op_result(self, result: ArrayLike) -> list[Block]: return [nb] @final - def _split(self) -> list[Block]: + def _split(self) -> Generator[Block]: """ Split a block into a list of single-column blocks. """ assert self.ndim == 2 - new_blocks = [] for i, ref_loc in enumerate(self._mgr_locs): vals = self.values[slice(i, i + 1)] bp = BlockPlacement(ref_loc) nb = type(self)(vals, placement=bp, ndim=2, refs=self.refs) - new_blocks.append(nb) - return new_blocks + yield nb @final def split_and_operate(self, func, *args, **kwargs) -> list[Block]: @@ -475,7 +427,7 @@ def split_and_operate(self, func, *args, **kwargs) -> list[Block]: # Up/Down-casting @final - def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block: + def coerce_to_target_dtype(self, other, raise_on_upcast: bool) -> Block: """ coerce the current block to a dtype compat for other we will return a block, possibly object, and not raise @@ -498,8 +450,11 @@ def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block: and is_integer_dtype(self.values.dtype) and isna(other) and other is not NaT + and not ( + isinstance(other, (np.datetime64, np.timedelta64)) and np.isnat(other) + ) ): - warn_on_upcast = False + raise_on_upcast = False elif ( isinstance(other, np.ndarray) and other.ndim == 1 @@ -507,139 +462,42 @@ def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block: and is_float_dtype(other.dtype) and lib.has_only_ints_or_nan(other) ): - warn_on_upcast = False - - if warn_on_upcast: - warnings.warn( - f"Setting an item of incompatible dtype is deprecated " - "and will raise in a future error of pandas. " - f"Value '{other}' has dtype incompatible with {self.values.dtype}, " - "please explicitly cast to a compatible dtype first.", - FutureWarning, - stacklevel=find_stack_level(), - ) + raise_on_upcast = False + + if raise_on_upcast: + raise TypeError(f"Invalid value '{other}' for dtype '{self.values.dtype}'") if self.values.dtype == new_dtype: raise AssertionError( f"Did not expect new dtype {new_dtype} to equal self.dtype " f"{self.values.dtype}. Please report a bug at " "https://github.com/pandas-dev/pandas/issues." ) - return self.astype(new_dtype, copy=False) - - @final - def _maybe_downcast( - self, - blocks: list[Block], - downcast, - using_cow: bool, - caller: str, - ) -> list[Block]: - if downcast is False: - return blocks - - if self.dtype == _dtype_obj: - # TODO: does it matter that self.dtype might not match blocks[i].dtype? 
- # GH#44241 We downcast regardless of the argument; - # respecting 'downcast=None' may be worthwhile at some point, - # but ATM it breaks too much existing code. - # split and convert the blocks - - if caller == "fillna" and get_option("future.no_silent_downcasting"): - return blocks - - nbs = extend_blocks( - [blk.convert(using_cow=using_cow, copy=not using_cow) for blk in blocks] - ) - if caller == "fillna": - if len(nbs) != len(blocks) or not all( - x.dtype == y.dtype for x, y in zip(nbs, blocks) - ): - # GH#54261 - warnings.warn( - "Downcasting object dtype arrays on .fillna, .ffill, .bfill " - "is deprecated and will change in a future version. " - "Call result.infer_objects(copy=False) instead. " - "To opt-in to the future " - "behavior, set " - "`pd.set_option('future.no_silent_downcasting', True)`", - FutureWarning, - stacklevel=find_stack_level(), - ) - - return nbs - - elif downcast is None: - return blocks - elif caller == "where" and get_option("future.no_silent_downcasting") is True: - return blocks - else: - nbs = extend_blocks([b._downcast_2d(downcast, using_cow) for b in blocks]) - - # When _maybe_downcast is called with caller="where", it is either - # a) with downcast=False, which is a no-op (the desired future behavior) - # b) with downcast="infer", which is _not_ passed by the user. - # In the latter case the future behavior is to stop doing inference, - # so we issue a warning if and only if some inference occurred. - if caller == "where": - # GH#53656 - if len(blocks) != len(nbs) or any( - left.dtype != right.dtype for left, right in zip(blocks, nbs) - ): - # In this case _maybe_downcast was _not_ a no-op, so the behavior - # will change, so we issue a warning. - warnings.warn( - "Downcasting behavior in Series and DataFrame methods 'where', " - "'mask', and 'clip' is deprecated. In a future " - "version this will not infer object dtypes or cast all-round " - "floats to integers. Instead call " - "result.infer_objects(copy=False) for object inference, " - "or cast round floats explicitly. To opt-in to the future " - "behavior, set " - "`pd.set_option('future.no_silent_downcasting', True)`", - FutureWarning, - stacklevel=find_stack_level(), - ) - - return nbs - - @final - @maybe_split - def _downcast_2d(self, dtype, using_cow: bool = False) -> list[Block]: - """ - downcast specialized to 2D case post-validation. - - Refactored to allow use of maybe_split. - """ - new_values = maybe_downcast_to_dtype(self.values, dtype=dtype) - new_values = maybe_coerce_values(new_values) - refs = self.refs if new_values is self.values else None - return [self.make_block(new_values, refs=refs)] + try: + return self.astype(new_dtype) + except OutOfBoundsDatetime as err: + # e.g. GH#56419 if self.dtype is a low-resolution dt64 and we try to + # upcast to a higher-resolution dt64, we may have entries that are + # out of bounds for the higher resolution. + # Re-raise with a more informative message. + raise OutOfBoundsDatetime( + f"Incompatible (high-resolution) value for dtype='{self.dtype}'. " + "Explicitly cast before operating." + ) from err @final - def convert( - self, - *, - copy: bool = True, - using_cow: bool = False, - ) -> list[Block]: + def convert(self) -> list[Block]: """ Attempt to coerce any object types to better types. Return a copy of the block (if copy = True). 
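With `raise_on_upcast` above, a setitem that cannot be stored losslessly in the existing dtype now raises instead of warning and upcasting. Under the new behavior, roughly:

import pandas as pd

ser = pd.Series([1, 2, 3])
try:
    ser[0] = "not a number"   # lossy for int64 -> TypeError on the new path
except TypeError as err:
    print(err)                # e.g. "Invalid value 'not a number' for dtype 'int64'"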
""" if not self.is_object: - if not copy and using_cow: - return [self.copy(deep=False)] - return [self.copy()] if copy else [self] + return [self.copy(deep=False)] if self.ndim != 1 and self.shape[0] != 1: - blocks = self.split_and_operate( - Block.convert, copy=copy, using_cow=using_cow - ) + blocks = self.split_and_operate(Block.convert) if all(blk.dtype.kind == "O" for blk in blocks): # Avoid fragmenting the block if convert is a no-op - if using_cow: - return [self.copy(deep=False)] - return [self.copy()] if copy else [self] + return [self.copy(deep=False)] return blocks values = self.values @@ -653,9 +511,10 @@ def convert( convert_non_numeric=True, ) refs = None - if copy and res_values is values: - res_values = values.copy() - elif res_values is values: + if res_values is values or ( + isinstance(res_values, NumpyExtensionArray) + and res_values._ndarray is values + ): refs = self.refs res_values = ensure_block_shape(res_values, self.ndim) @@ -664,8 +523,6 @@ def convert( def convert_dtypes( self, - copy: bool, - using_cow: bool, infer_objects: bool = True, convert_string: bool = True, convert_integer: bool = True, @@ -674,19 +531,21 @@ def convert_dtypes( dtype_backend: DtypeBackend = "numpy_nullable", ) -> list[Block]: if infer_objects and self.is_object: - blks = self.convert(copy=False, using_cow=using_cow) + blks = self.convert() else: blks = [self] if not any( [convert_floating, convert_integer, convert_boolean, convert_string] ): - return [b.copy(deep=copy) for b in blks] + return [b.copy(deep=False) for b in blks] rbs = [] for blk in blks: # Determine dtype column by column - sub_blks = [blk] if blk.ndim == 1 or self.shape[0] == 1 else blk._split() + sub_blks = ( + [blk] if blk.ndim == 1 or self.shape[0] == 1 else list(blk._split()) + ) dtypes = [ convert_dtypes( b.values, @@ -701,11 +560,11 @@ def convert_dtypes( ] if all(dtype == self.dtype for dtype in dtypes): # Avoid block splitting if no dtype changes - rbs.append(blk.copy(deep=copy)) + rbs.append(blk.copy(deep=False)) continue for dtype, b in zip(dtypes, sub_blks): - rbs.append(b.astype(dtype=dtype, copy=copy, squeeze=b.ndim != 1)) + rbs.append(b.astype(dtype=dtype, squeeze=b.ndim != 1)) return rbs # --------------------------------------------------------------------- @@ -720,9 +579,7 @@ def dtype(self) -> DtypeObj: def astype( self, dtype: DtypeObj, - copy: bool = False, errors: IgnoreRaise = "raise", - using_cow: bool = False, squeeze: bool = False, ) -> Block: """ @@ -731,13 +588,9 @@ def astype( Parameters ---------- dtype : np.dtype or ExtensionDtype - copy : bool, default False - copy if indicated errors : str, {'raise', 'ignore'}, default 'raise' - ``raise`` : allow exceptions to be raised - ``ignore`` : suppress exceptions. On error return original object - using_cow: bool, default False - Signaling if copy on write copy logic is used. 
squeeze : bool, default False squeeze values to ndim=1 if only one column is given @@ -751,18 +604,18 @@ def astype( raise ValueError("Can not squeeze with more than one column.") values = values[0, :] # type: ignore[call-overload] - new_values = astype_array_safe(values, dtype, copy=copy, errors=errors) + new_values = astype_array_safe(values, dtype, errors=errors) new_values = maybe_coerce_values(new_values) refs = None - if (using_cow or not copy) and astype_is_view(values.dtype, new_values.dtype): + if astype_is_view(values.dtype, new_values.dtype): refs = self.refs newb = self.make_block(new_values, refs=refs) if newb.shape != self.shape: raise TypeError( - f"cannot set astype for copy = [{copy}] for dtype " + f"cannot set astype for dtype " f"({self.dtype.name} [{self.shape}]) to different shape " f"({newb.dtype.name} [{newb.shape}])" ) @@ -798,21 +651,18 @@ def copy(self, deep: bool = True) -> Self: # --------------------------------------------------------------------- # Copy-on-Write Helpers - @final - def _maybe_copy(self, using_cow: bool, inplace: bool) -> Self: - if using_cow and inplace: + def _maybe_copy(self, inplace: bool) -> Self: + if inplace: deep = self.refs.has_reference() - blk = self.copy(deep=deep) - else: - blk = self if inplace else self.copy() - return blk + return self.copy(deep=deep) + return self.copy() @final - def _get_refs_and_copy(self, using_cow: bool, inplace: bool): + def _get_refs_and_copy(self, inplace: bool): refs = None copy = not inplace if inplace: - if using_cow and self.refs.has_reference(): + if self.refs.has_reference(): copy = True else: refs = self.refs @@ -829,8 +679,6 @@ def replace( inplace: bool = False, # mask may be pre-computed if we're called from replace_list mask: npt.NDArray[np.bool_] | None = None, - using_cow: bool = False, - already_warned=None, ) -> list[Block]: """ replace the to_replace value with value, possible to create new @@ -842,82 +690,32 @@ def replace( # go through replace_list values = self.values - if isinstance(values, Categorical): - # TODO: avoid special-casing - # GH49404 - blk = self._maybe_copy(using_cow, inplace) - values = cast(Categorical, blk.values) - values._replace(to_replace=to_replace, value=value, inplace=True) - return [blk] - if not self._can_hold_element(to_replace): # We cannot hold `to_replace`, so we know immediately that # replacing it is a no-op. # Note: If to_replace were a list, NDFrame.replace would call # replace_list instead of replace. - if using_cow: - return [self.copy(deep=False)] - else: - return [self] if inplace else [self.copy()] + return [self.copy(deep=False)] if mask is None: mask = missing.mask_missing(values, to_replace) if not mask.any(): # Note: we get here with test_replace_extension_other incorrectly # bc _can_hold_element is incorrect. - if using_cow: - return [self.copy(deep=False)] - else: - return [self] if inplace else [self.copy()] + return [self.copy(deep=False)] - elif self._can_hold_element(value): + elif self._can_hold_element(value) or (self.dtype == "string" and is_re(value)): # TODO(CoW): Maybe split here as well into columns where mask has True # and rest? 
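`_maybe_copy` above now copies deeply only when the block's refs show that another object still references the same values, which is the Copy-on-Write contract. A toy version of that decision (`ToyBlock` and its `shared` flag are illustrative stand-ins for pandas' BlockValuesRefs):

class ToyBlock:
    def __init__(self, values, shared):
        self.values, self.shared = values, shared

    def copy(self, deep):
        return ToyBlock(list(self.values) if deep else self.values, shared=False)

def maybe_copy(block, inplace):
    # CoW contract: an in-place op may reuse the values only if nothing
    # else references them; otherwise it must copy first.
    if inplace:
        return block.copy(deep=block.shared)
    return block.copy(deep=True)

b = ToyBlock([1, 2, 3], shared=False)
assert maybe_copy(b, inplace=True).values is b.values  # no reference -> reuse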
- blk = self._maybe_copy(using_cow, inplace) + blk = self._maybe_copy(inplace) putmask_inplace(blk.values, mask, value) - if ( - inplace - and warn_copy_on_write() - and already_warned is not None - and not already_warned.warned_already - ): - if self.refs.has_reference(): - warnings.warn( - COW_WARNING_GENERAL_MSG, - FutureWarning, - stacklevel=find_stack_level(), - ) - already_warned.warned_already = True - - if not (self.is_object and value is None): - # if the user *explicitly* gave None, we keep None, otherwise - # may downcast to NaN - if get_option("future.no_silent_downcasting") is True: - blocks = [blk] - else: - blocks = blk.convert(copy=False, using_cow=using_cow) - if len(blocks) > 1 or blocks[0].dtype != blk.dtype: - warnings.warn( - # GH#54710 - "Downcasting behavior in `replace` is deprecated and " - "will be removed in a future version. To retain the old " - "behavior, explicitly call " - "`result.infer_objects(copy=False)`. " - "To opt-in to the future " - "behavior, set " - "`pd.set_option('future.no_silent_downcasting', True)`", - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - blocks = [blk] - return blocks + return [blk] elif self.ndim == 1 or self.shape[0] == 1: if value is None or value is NA: blk = self.astype(np.dtype(object)) else: - blk = self.coerce_to_target_dtype(value) + blk = self.coerce_to_target_dtype(value, raise_on_upcast=False) return blk.replace( to_replace=to_replace, value=value, @@ -936,7 +734,6 @@ def replace( value=value, inplace=True, mask=mask[i : i + 1], - using_cow=using_cow, ) ) return blocks @@ -948,8 +745,6 @@ def _replace_regex( value, inplace: bool = False, mask=None, - using_cow: bool = False, - already_warned=None, ) -> list[Block]: """ Replace elements by the given value. @@ -964,55 +759,32 @@ def _replace_regex( Perform inplace modification. mask : array-like of bool, optional True indicate corresponding element is ignored. - using_cow: bool, default False - Specifying if copy on write is enabled. Returns ------- List[Block] """ - if not self._can_hold_element(to_replace): + if not is_re(to_replace) and not self._can_hold_element(to_replace): # i.e. only if self.is_object is True, but could in principle include a # String ExtensionBlock - if using_cow: - return [self.copy(deep=False)] - return [self] if inplace else [self.copy()] + return [self.copy(deep=False)] - rx = re.compile(to_replace) + if is_re(to_replace) and self.dtype not in [object, "string"]: + # only object or string dtype can hold strings, and a regex object + # will only match strings + return [self.copy(deep=False)] - block = self._maybe_copy(using_cow, inplace) + if not ( + self._can_hold_element(value) or (self.dtype == "string" and is_re(value)) + ): + block = self.astype(np.dtype(object)) + else: + block = self._maybe_copy(inplace) - replace_regex(block.values, rx, value, mask) + rx = re.compile(to_replace) - if ( - inplace - and warn_copy_on_write() - and already_warned is not None - and not already_warned.warned_already - ): - if self.refs.has_reference(): - warnings.warn( - COW_WARNING_GENERAL_MSG, - FutureWarning, - stacklevel=find_stack_level(), - ) - already_warned.warned_already = True - - nbs = block.convert(copy=False, using_cow=using_cow) - opt = get_option("future.no_silent_downcasting") - if (len(nbs) > 1 or nbs[0].dtype != block.dtype) and not opt: - warnings.warn( - # GH#54710 - "Downcasting behavior in `replace` is deprecated and " - "will be removed in a future version. 
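Stripped of dtype bookkeeping, the `replace` path above is: build a boolean mask of matches, short-circuit when nothing matches, otherwise write `value` through the mask on a copy. A NumPy-only sketch (`mask_missing`/`putmask_inplace` are pandas internals; plain comparison stands in for them):

import numpy as np

def toy_replace(values, to_replace, value):
    mask = values == to_replace      # mask_missing(), minus NA handling
    if not mask.any():
        return values                # no-op: nothing to replace
    out = values.copy()              # CoW: never mutate shared data
    out[mask] = value                # putmask_inplace()
    return out

print(toy_replace(np.array([1, 2, 1]), 1, 9))  # [9 2 9]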
To retain the old " - "behavior, explicitly call `result.infer_objects(copy=False)`. " - "To opt-in to the future " - "behavior, set " - "`pd.set_option('future.no_silent_downcasting', True)`", - FutureWarning, - stacklevel=find_stack_level(), - ) - return nbs + replace_regex(block.values, rx, value, mask) + return [block] @final def replace_list( @@ -1021,31 +793,20 @@ def replace_list( dest_list: Sequence[Any], inplace: bool = False, regex: bool = False, - using_cow: bool = False, - already_warned=None, ) -> list[Block]: """ See BlockManager.replace_list docstring. """ values = self.values - if isinstance(values, Categorical): - # TODO: avoid special-casing - # GH49404 - blk = self._maybe_copy(using_cow, inplace) - values = cast(Categorical, blk.values) - values._replace(to_replace=src_list, value=dest_list, inplace=True) - return [blk] - # Exclude anything that we know we won't contain pairs = [ - (x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x) + (x, y) + for x, y in zip(src_list, dest_list) + if (self._can_hold_element(x) or (self.dtype == "string" and is_re(x))) ] - if not len(pairs): - if using_cow: - return [self.copy(deep=False)] - # shortcut, nothing to replace - return [self] if inplace else [self.copy()] + if not pairs: + return [self.copy(deep=False)] src_len = len(pairs) - 1 @@ -1072,30 +833,11 @@ def replace_list( if inplace: masks = list(masks) - if using_cow: - # Don't set up refs here, otherwise we will think that we have - # references when we check again later - rb = [self] - else: - rb = [self if inplace else self.copy()] + # Don't set up refs here, otherwise we will think that we have + # references when we check again later + rb = [self] - if ( - inplace - and warn_copy_on_write() - and already_warned is not None - and not already_warned.warned_already - ): - if self.refs.has_reference(): - warnings.warn( - COW_WARNING_GENERAL_MSG, - FutureWarning, - stacklevel=find_stack_level(), - ) - already_warned.warned_already = True - - opt = get_option("future.no_silent_downcasting") for i, ((src, dest), mask) in enumerate(zip(pairs, masks)): - convert = i == src_len # only convert once at the end new_rb: list[Block] = [] # GH-39338: _replace_coerce can split a block into @@ -1118,10 +860,9 @@ def replace_list( mask=m, inplace=inplace, regex=regex, - using_cow=using_cow, ) - if using_cow and i != src_len: + if i != src_len: # This is ugly, but we have to get rid of intermediate refs # that did not go out of scope yet, otherwise we will trigger # many unnecessary copies @@ -1130,34 +871,6 @@ def replace_list( b.refs.referenced_blocks.pop( b.refs.referenced_blocks.index(ref) ) - - if ( - not opt - and convert - and blk.is_object - and not all(x is None for x in dest_list) - ): - # GH#44498 avoid unwanted cast-back - nbs = [] - for res_blk in result: - converted = res_blk.convert( - copy=True and not using_cow, using_cow=using_cow - ) - if len(converted) > 1 or converted[0].dtype != res_blk.dtype: - warnings.warn( - # GH#54710 - "Downcasting behavior in `replace` is deprecated " - "and will be removed in a future version. To " - "retain the old behavior, explicitly call " - "`result.infer_objects(copy=False)`. 
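`_replace_regex` above compiles the pattern once and rewrites matching elements; only object and string dtypes can hold strings, hence the early no-op copies for everything else. A minimal element-wise analogue, assuming `rx.sub`-style replacement as in pandas' internal helper:

import re
import numpy as np

def toy_replace_regex(values, pattern, repl):
    rx = re.compile(pattern)
    # Only strings are candidates; everything else passes through untouched.
    return np.array(
        [rx.sub(repl, x) if isinstance(x, str) else x for x in values],
        dtype=object,
    )

print(toy_replace_regex(np.array(["foo1", "bar", 3], dtype=object), r"\d+", ""))
# ['foo' 'bar' 3]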
" - "To opt-in to the future " - "behavior, set " - "`pd.set_option('future.no_silent_downcasting', True)`", - FutureWarning, - stacklevel=find_stack_level(), - ) - nbs.extend(converted) - result = nbs new_rb.extend(result) rb = new_rb return rb @@ -1170,7 +883,6 @@ def _replace_coerce( mask: npt.NDArray[np.bool_], inplace: bool = True, regex: bool = False, - using_cow: bool = False, ) -> list[Block]: """ Replace value corresponding to the given boolean array with another @@ -1205,23 +917,20 @@ def _replace_coerce( # gh-45601, gh-45836, gh-46634 if mask.any(): has_ref = self.refs.has_reference() - nb = self.astype(np.dtype(object), copy=False, using_cow=using_cow) - if (nb is self or using_cow) and not inplace: + nb = self.astype(np.dtype(object)) + if not inplace: nb = nb.copy() - elif inplace and has_ref and nb.refs.has_reference() and using_cow: + elif inplace and has_ref and nb.refs.has_reference(): # no copy in astype and we had refs before nb = nb.copy() putmask_inplace(nb.values, mask, value) return [nb] - if using_cow: - return [self] - return [self] if inplace else [self.copy()] + return [self.copy(deep=False)] return self.replace( to_replace=to_replace, value=value, inplace=inplace, mask=mask, - using_cow=using_cow, ) # --------------------------------------------------------------------- @@ -1366,7 +1075,7 @@ def _unstack( # --------------------------------------------------------------------- - def setitem(self, indexer, value, using_cow: bool = False) -> Block: + def setitem(self, indexer, value) -> Block: """ Attempt self.values[indexer] = value, possibly creating a new array. @@ -1376,8 +1085,6 @@ def setitem(self, indexer, value, using_cow: bool = False) -> Block: The subset of self.values to set value : object The value being set - using_cow: bool, default False - Signaling if CoW is used. Returns ------- @@ -1405,7 +1112,7 @@ def setitem(self, indexer, value, using_cow: bool = False) -> Block: casted = np_can_hold_element(values.dtype, value) except LossySetitemError: # current dtype cannot store value, coerce to common dtype - nb = self.coerce_to_target_dtype(value, warn_on_upcast=True) + nb = self.coerce_to_target_dtype(value, raise_on_upcast=True) return nb.setitem(indexer, value) else: if self.dtype == _dtype_obj: @@ -1416,17 +1123,22 @@ def setitem(self, indexer, value, using_cow: bool = False) -> Block: # test_iloc_setitem_custom_object casted = setitem_datetimelike_compat(values, len(vi), casted) - self = self._maybe_copy(using_cow, inplace=True) + self = self._maybe_copy(inplace=True) values = cast(np.ndarray, self.values.T) if isinstance(casted, np.ndarray) and casted.ndim == 1 and len(casted) == 1: # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615 casted = casted[0, ...] - values[indexer] = casted + try: + values[indexer] = casted + except (TypeError, ValueError) as err: + if is_list_like(casted): + raise ValueError( + "setting an array element with a sequence." 
+ ) from err + raise return self - def putmask( - self, mask, new, using_cow: bool = False, already_warned=None - ) -> list[Block]: + def putmask(self, mask, new) -> list[Block]: """ putmask the data to the block; it is possible that we may create a new dtype of block @@ -1437,7 +1149,6 @@ def putmask( ---------- mask : np.ndarray[bool], SparseArray[bool], or BooleanArray new : a ndarray/object - using_cow: bool, default False Returns ------- @@ -1455,27 +1166,12 @@ def putmask( new = extract_array(new, extract_numpy=True) if noop: - if using_cow: - return [self.copy(deep=False)] - return [self] - - if ( - warn_copy_on_write() - and already_warned is not None - and not already_warned.warned_already - ): - if self.refs.has_reference(): - warnings.warn( - COW_WARNING_GENERAL_MSG, - FutureWarning, - stacklevel=find_stack_level(), - ) - already_warned.warned_already = True + return [self.copy(deep=False)] try: casted = np_can_hold_element(values.dtype, new) - self = self._maybe_copy(using_cow, inplace=True) + self = self._maybe_copy(inplace=True) values = cast(np.ndarray, self.values) putmask_without_repeat(values.T, mask, casted) @@ -1487,32 +1183,29 @@ def putmask( if not is_list_like(new): # using just new[indexer] can't save us the need to cast return self.coerce_to_target_dtype( - new, warn_on_upcast=True + new, raise_on_upcast=True ).putmask(mask, new) else: indexer = mask.nonzero()[0] - nb = self.setitem(indexer, new[indexer], using_cow=using_cow) + nb = self.setitem(indexer, new[indexer]) return [nb] else: is_array = isinstance(new, np.ndarray) res_blocks = [] - nbs = self._split() - for i, nb in enumerate(nbs): + for i, nb in enumerate(self._split()): n = new if is_array: # we have a different value per-column n = new[:, i : i + 1] submask = orig_mask[:, i : i + 1] - rbs = nb.putmask(submask, n, using_cow=using_cow) + rbs = nb.putmask(submask, n) res_blocks.extend(rbs) return res_blocks - def where( - self, other, cond, _downcast: str | bool = "infer", using_cow: bool = False - ) -> list[Block]: + def where(self, other, cond) -> list[Block]: """ evaluate the block; return result block(s) from the result @@ -1520,8 +1213,6 @@ def where( ---------- other : a ndarray/object cond : np.ndarray[bool], SparseArray[bool], or BooleanArray - _downcast : str or None, default "infer" - Private because we only specify it when calling from fillna. Returns ------- @@ -1542,10 +1233,7 @@ def where( icond, noop = validate_putmask(values, ~cond) if noop: - # GH-39595: Always return a copy; short-circuit up/downcasting - if using_cow: - return [self.copy(deep=False)] - return [self.copy()] + return [self.copy(deep=False)] if other is lib.no_default: other = self.fill_value @@ -1563,30 +1251,21 @@ def where( if self.ndim == 1 or self.shape[0] == 1: # no need to split columns - block = self.coerce_to_target_dtype(other) - blocks = block.where(orig_other, cond, using_cow=using_cow) - return self._maybe_downcast( - blocks, downcast=_downcast, using_cow=using_cow, caller="where" - ) + block = self.coerce_to_target_dtype(other, raise_on_upcast=False) + return block.where(orig_other, cond) else: - # since _maybe_downcast would split blocks anyway, we - # can avoid some potential upcast/downcast by splitting - # on the front end. 
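When a multi-column block cannot hold `new`, the `putmask` fallback above splits the block and retries column by column, so each column can upcast (or not) independently. The splitting pattern, reduced to NumPy:

import numpy as np

def columnwise_putmask(values_2d, mask_2d, new_2d):
    # Operate per column; in pandas each column would become its own block.
    cols = []
    for i in range(values_2d.shape[1]):
        col = values_2d[:, i].copy()
        np.putmask(col, mask_2d[:, i], new_2d[:, i])
        cols.append(col)
    return np.column_stack(cols)

vals = np.zeros((3, 2))
mask = np.array([[True, False], [False, True], [False, False]])
print(columnwise_putmask(vals, mask, np.ones((3, 2))))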
is_array = isinstance(other, (np.ndarray, ExtensionArray)) res_blocks = [] - nbs = self._split() - for i, nb in enumerate(nbs): + for i, nb in enumerate(self._split()): oth = other if is_array: # we have a different value per-column oth = other[:, i : i + 1] submask = cond[:, i : i + 1] - rbs = nb.where( - oth, submask, _downcast=_downcast, using_cow=using_cow - ) + rbs = nb.where(oth, submask) res_blocks.extend(rbs) return res_blocks @@ -1634,9 +1313,6 @@ def fillna( value, limit: int | None = None, inplace: bool = False, - downcast=None, - using_cow: bool = False, - already_warned=None, ) -> list[Block]: """ fillna on the block with the value. If we fail, then convert to @@ -1654,95 +1330,43 @@ def fillna( if noop: # we can't process the value, but nothing to do - if inplace: - if using_cow: - return [self.copy(deep=False)] - # Arbitrarily imposing the convention that we ignore downcast - # on no-op when inplace=True - return [self] - else: - # GH#45423 consistent downcasting on no-ops. - nb = self.copy(deep=not using_cow) - nbs = nb._maybe_downcast( - [nb], downcast=downcast, using_cow=using_cow, caller="fillna" - ) - return nbs + return [self.copy(deep=False)] if limit is not None: - mask[mask.cumsum(self.ndim - 1) > limit] = False + mask[mask.cumsum(self.values.ndim - 1) > limit] = False if inplace: - nbs = self.putmask( - mask.T, value, using_cow=using_cow, already_warned=already_warned - ) + nbs = self.putmask(mask.T, value) else: - # without _downcast, we would break - # test_fillna_dtype_conversion_equiv_replace - nbs = self.where(value, ~mask.T, _downcast=False) - - # Note: blk._maybe_downcast vs self._maybe_downcast(nbs) - # makes a difference bc blk may have object dtype, which has - # different behavior in _maybe_downcast. - return extend_blocks( - [ - blk._maybe_downcast( - [blk], downcast=downcast, using_cow=using_cow, caller="fillna" - ) - for blk in nbs - ] - ) + nbs = self.where(value, ~mask.T) + return extend_blocks(nbs) def pad_or_backfill( self, *, method: FillnaOptions, - axis: AxisInt = 0, inplace: bool = False, limit: int | None = None, limit_area: Literal["inside", "outside"] | None = None, - downcast: Literal["infer"] | None = None, - using_cow: bool = False, - already_warned=None, ) -> list[Block]: if not self._can_hold_na: # If there are no NAs, then interpolate is a no-op - if using_cow: - return [self.copy(deep=False)] - return [self] if inplace else [self.copy()] + return [self.copy(deep=False)] - copy, refs = self._get_refs_and_copy(using_cow, inplace) + copy, refs = self._get_refs_and_copy(inplace) # Dispatch to the NumpyExtensionArray method. 
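The `limit` handling in `fillna` above keeps only the first `limit` missing positions along the axis: once the running count of NAs exceeds the limit, the remaining positions are dropped from the fill mask. In one dimension:

import numpy as np

isna = np.array([True, True, True, False, True])
limit = 2
mask = isna.copy()
mask[mask.cumsum() > limit] = False   # same trick as the fillna hunk above
print(mask)                           # [ True  True False False False]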
# We know self.array_values is a NumpyExtensionArray bc EABlock overrides vals = cast(NumpyExtensionArray, self.array_values) - if axis == 1: - vals = vals.T - new_values = vals._pad_or_backfill( + new_values = vals.T._pad_or_backfill( method=method, limit=limit, limit_area=limit_area, copy=copy, - ) - if ( - not copy - and warn_copy_on_write() - and already_warned is not None - and not already_warned.warned_already - ): - if self.refs.has_reference(): - warnings.warn( - COW_WARNING_GENERAL_MSG, - FutureWarning, - stacklevel=find_stack_level(), - ) - already_warned.warned_already = True - if axis == 1: - new_values = new_values.T + ).T data = extract_array(new_values, extract_numpy=True) - - nb = self.make_block_same_class(data, refs=refs) - return nb._maybe_downcast([nb], downcast, using_cow, caller="fillna") + return [self.make_block_same_class(data, refs=refs)] @final def interpolate( @@ -1754,9 +1378,6 @@ def interpolate( limit: int | None = None, limit_direction: Literal["forward", "backward", "both"] = "forward", limit_area: Literal["inside", "outside"] | None = None, - downcast: Literal["infer"] | None = None, - using_cow: bool = False, - already_warned=None, **kwargs, ) -> list[Block]: inplace = validate_bool_kwarg(inplace, "inplace") @@ -1767,20 +1388,14 @@ def interpolate( if not self._can_hold_na: # If there are no NAs, then interpolate is a no-op - if using_cow: - return [self.copy(deep=False)] - return [self] if inplace else [self.copy()] + return [self.copy(deep=False)] - # TODO(3.0): this case will not be reachable once GH#53638 is enforced if self.dtype == _dtype_obj: - # only deal with floats - # bc we already checked that can_hold_na, we don't have int dtype here - # test_interp_basic checks that we make a copy here - if using_cow: - return [self.copy(deep=False)] - return [self] if inplace else [self.copy()] + # GH#53631 + name = {1: "Series", 2: "DataFrame"}[self.ndim] + raise TypeError(f"{name} cannot interpolate with object dtype.") - copy, refs = self._get_refs_and_copy(using_cow, inplace) + copy, refs = self._get_refs_and_copy(inplace) # Dispatch to the EA method. 
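Per the hunk above, interpolating an object-dtype Series or DataFrame now raises (GH#53631) instead of returning a silent no-op copy. Under that new behavior, roughly:

import pandas as pd

ser = pd.Series(["a", None, "c"], dtype=object)
try:
    ser.interpolate()
except TypeError as err:
    print(err)   # "Series cannot interpolate with object dtype."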
new_values = self.array_values.interpolate( @@ -1794,23 +1409,7 @@ def interpolate( **kwargs, ) data = extract_array(new_values, extract_numpy=True) - - if ( - not copy - and warn_copy_on_write() - and already_warned is not None - and not already_warned.warned_already - ): - if self.refs.has_reference(): - warnings.warn( - COW_WARNING_GENERAL_MSG, - FutureWarning, - stacklevel=find_stack_level(), - ) - already_warned.warned_already = True - - nb = self.make_block_same_class(data, refs=refs) - return nb._maybe_downcast([nb], downcast, using_cow, caller="interpolate") + return [self.make_block_same_class(data, refs=refs)] @final def diff(self, n: int) -> list[Block]: @@ -1842,10 +1441,11 @@ def shift(self, periods: int, fill_value: Any = None) -> list[Block]: # error: Argument 1 to "np_can_hold_element" has incompatible type # "Union[dtype[Any], ExtensionDtype]"; expected "dtype[Any]" casted = np_can_hold_element( - self.dtype, fill_value # type: ignore[arg-type] + self.dtype, # type: ignore[arg-type] + fill_value, ) except LossySetitemError: - nb = self.coerce_to_target_dtype(fill_value) + nb = self.coerce_to_target_dtype(fill_value, raise_on_upcast=False) return nb.shift(periods, fill_value=fill_value) else: @@ -1884,11 +1484,11 @@ def quantile( return new_block_2d(result, placement=self._mgr_locs) @final - def round(self, decimals: int, using_cow: bool = False) -> Self: + def round(self, decimals: int) -> Self: """ Rounds the values. If the block is not of an integer or float dtype, nothing happens. - This is consistent with DataFrame.round behavivor. + This is consistent with DataFrame.round behavior. (Note: Series.round would raise) Parameters @@ -1896,26 +1496,19 @@ def round(self, decimals: int, using_cow: bool = False) -> Self: decimals: int, Number of decimal places to round to. Caller is responsible for validating this - using_cow: bool, - Whether Copy on Write is enabled right now """ if not self.is_numeric or self.is_bool: - return self.copy(deep=not using_cow) - refs = None + return self.copy(deep=False) # TODO: round only defined on BaseMaskedArray # Series also does this, so would need to fix both places # error: Item "ExtensionArray" of "Union[ndarray[Any, Any], ExtensionArray]" # has no attribute "round" values = self.values.round(decimals) # type: ignore[union-attr] + + refs = None if values is self.values: - if not using_cow: - # Normally would need to do this before, but - # numpy only returns same array when round operation - # is no-op - # https://github.com/numpy/numpy/blob/486878b37fc7439a3b2b87747f50db9b62fea8eb/numpy/core/src/multiarray/calculation.c#L625-L636 - values = values.copy() - else: - refs = self.refs + refs = self.refs + return self.make_block_same_class(values, refs=refs) # --------------------------------------------------------------------- @@ -2010,7 +1603,7 @@ def shift(self, periods: int, fill_value: Any = None) -> list[Block]: return [self.make_block_same_class(new_values)] @final - def setitem(self, indexer, value, using_cow: bool = False): + def setitem(self, indexer, value): """ Attempt self.values[indexer] = value, possibly creating a new array. @@ -2023,8 +1616,6 @@ def setitem(self, indexer, value, using_cow: bool = False): The subset of self.values to set value : object The value being set - using_cow: bool, default False - Signaling if CoW is used. 
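`shift` above first checks whether `fill_value` fits the block's dtype and coerces to a common dtype only when it does not. A loose NumPy sketch of that order of operations (`np.can_cast`/`np.min_scalar_type` only approximate the internal `np_can_hold_element` check):

import numpy as np

def toy_shift(arr, periods, fill_value):
    # Upcast first if fill_value does not fit, then shift into a fresh array.
    if not np.can_cast(np.min_scalar_type(fill_value), arr.dtype, casting="same_kind"):
        arr = arr.astype(np.result_type(arr.dtype, np.asarray(fill_value).dtype))
    out = np.full(arr.shape, fill_value, dtype=arr.dtype)
    if periods > 0:
        out[periods:] = arr[:-periods]
    elif periods < 0:
        out[:periods] = arr[-periods:]
    else:
        out[:] = arr
    return out

print(toy_shift(np.array([1, 2, 3]), 1, np.nan))  # upcast to float64: [nan 1. 2.]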
Returns ------- @@ -2053,11 +1644,11 @@ def setitem(self, indexer, value, using_cow: bool = False): except (ValueError, TypeError): if isinstance(self.dtype, IntervalDtype): # see TestSetitemFloatIntervalWithIntIntervalValues - nb = self.coerce_to_target_dtype(orig_value, warn_on_upcast=True) + nb = self.coerce_to_target_dtype(orig_value, raise_on_upcast=True) return nb.setitem(orig_indexer, orig_value) elif isinstance(self, NDArrayBackedExtensionBlock): - nb = self.coerce_to_target_dtype(orig_value, warn_on_upcast=True) + nb = self.coerce_to_target_dtype(orig_value, raise_on_upcast=True) return nb.setitem(orig_indexer, orig_value) else: @@ -2067,10 +1658,7 @@ def setitem(self, indexer, value, using_cow: bool = False): return self @final - def where( - self, other, cond, _downcast: str | bool = "infer", using_cow: bool = False - ) -> list[Block]: - # _downcast private bc we only specify it when calling from fillna + def where(self, other, cond) -> list[Block]: arr = self.values.T cond = extract_bool_array(cond) @@ -2087,30 +1675,31 @@ def where( if noop: # GH#44181, GH#45135 # Avoid a) raising for Interval/PeriodDtype and b) unnecessary object upcast - if using_cow: - return [self.copy(deep=False)] - return [self.copy()] + return [self.copy(deep=False)] try: res_values = arr._where(cond, other).T + except OutOfBoundsDatetime: + raise except (ValueError, TypeError): if self.ndim == 1 or self.shape[0] == 1: - if isinstance(self.dtype, IntervalDtype): + if isinstance(self.dtype, (IntervalDtype, StringDtype)): # TestSetitemFloatIntervalWithIntIntervalValues - blk = self.coerce_to_target_dtype(orig_other) - nbs = blk.where(orig_other, orig_cond, using_cow=using_cow) - return self._maybe_downcast( - nbs, downcast=_downcast, using_cow=using_cow, caller="where" - ) + blk = self.coerce_to_target_dtype(orig_other, raise_on_upcast=False) + if ( + self.ndim == 2 + and isinstance(orig_cond, np.ndarray) + and orig_cond.ndim == 1 + and not is_1d_only_ea_dtype(blk.dtype) + ): + orig_cond = orig_cond[:, None] + return blk.where(orig_other, orig_cond) elif isinstance(self, NDArrayBackedExtensionBlock): # NB: not (yet) the same as # isinstance(values, NDArrayBackedExtensionArray) - blk = self.coerce_to_target_dtype(orig_other) - nbs = blk.where(orig_other, orig_cond, using_cow=using_cow) - return self._maybe_downcast( - nbs, downcast=_downcast, using_cow=using_cow, caller="where" - ) + blk = self.coerce_to_target_dtype(orig_other, raise_on_upcast=False) + return blk.where(orig_other, orig_cond) else: raise @@ -2120,15 +1709,14 @@ def where( is_array = isinstance(orig_other, (np.ndarray, ExtensionArray)) res_blocks = [] - nbs = self._split() - for i, nb in enumerate(nbs): + for i, nb in enumerate(self._split()): n = orig_other if is_array: # we have a different value per-column n = orig_other[:, i : i + 1] submask = orig_cond[:, i : i + 1] - rbs = nb.where(n, submask, using_cow=using_cow) + rbs = nb.where(n, submask) res_blocks.extend(rbs) return res_blocks @@ -2136,9 +1724,7 @@ def where( return [nb] @final - def putmask( - self, mask, new, using_cow: bool = False, already_warned=None - ) -> list[Block]: + def putmask(self, mask, new) -> list[Block]: """ See Block.putmask.__doc__ """ @@ -2152,24 +1738,9 @@ def putmask( mask = self._maybe_squeeze_arg(mask) if not mask.any(): - if using_cow: - return [self.copy(deep=False)] - return [self] - - if ( - warn_copy_on_write() - and already_warned is not None - and not already_warned.warned_already - ): - if self.refs.has_reference(): - warnings.warn( - 
COW_WARNING_GENERAL_MSG, - FutureWarning, - stacklevel=find_stack_level(), - ) - already_warned.warned_already = True + return [self.copy(deep=False)] - self = self._maybe_copy(using_cow, inplace=True) + self = self._maybe_copy(inplace=True) values = self.values if values.ndim == 2: values = values.T @@ -2177,18 +1748,20 @@ def putmask( try: # Caller is responsible for ensuring matching lengths values._putmask(mask, new) + except OutOfBoundsDatetime: + raise except (TypeError, ValueError): if self.ndim == 1 or self.shape[0] == 1: if isinstance(self.dtype, IntervalDtype): # Discussion about what we want to support in the general # case GH#39584 - blk = self.coerce_to_target_dtype(orig_new, warn_on_upcast=True) + blk = self.coerce_to_target_dtype(orig_new, raise_on_upcast=True) return blk.putmask(orig_mask, orig_new) elif isinstance(self, NDArrayBackedExtensionBlock): # NB: not (yet) the same as # isinstance(values, NDArrayBackedExtensionArray) - blk = self.coerce_to_target_dtype(orig_new, warn_on_upcast=True) + blk = self.coerce_to_target_dtype(orig_new, raise_on_upcast=True) return blk.putmask(orig_mask, orig_new) else: @@ -2199,8 +1772,7 @@ def putmask( is_array = isinstance(orig_new, (np.ndarray, ExtensionArray)) res_blocks = [] - nbs = self._split() - for i, nb in enumerate(nbs): + for i, nb in enumerate(self._split()): n = orig_new if is_array: # we have a different value per-column @@ -2246,21 +1818,27 @@ def pad_or_backfill( self, *, method: FillnaOptions, - axis: AxisInt = 0, inplace: bool = False, limit: int | None = None, limit_area: Literal["inside", "outside"] | None = None, - downcast: Literal["infer"] | None = None, - using_cow: bool = False, - already_warned=None, ) -> list[Block]: values = self.values - if values.ndim == 2 and axis == 1: + kwargs: dict[str, Any] = {"method": method, "limit": limit} + if "limit_area" in inspect.signature(values._pad_or_backfill).parameters: + kwargs["limit_area"] = limit_area + elif limit_area is not None: + raise NotImplementedError( + f"{type(values).__name__} does not implement limit_area " + "(added in pandas 2.2). 3rd-party ExtensionArray authors " + "need to add this argument to _pad_or_backfill." 
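Note: OutOfBoundsDatetime subclasses ValueError, so without the explicit re-raise added in the putmask/where hunks above it would be swallowed by the coerce-to-target-dtype-and-retry fallback. A hedged sketch of the intended behavior; the exact propagation path is an assumption and may differ across versions:

import datetime
import pandas as pd

ser = pd.Series(pd.to_datetime(["2020-01-01", "2021-01-01"]))
try:
    # a value beyond the datetime64[ns] bounds is expected to propagate
    # rather than silently coercing the block to another dtype
    ser.mask([True, False], datetime.datetime(2500, 1, 1))
except pd.errors.OutOfBoundsDatetime as err:
    print(err)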
+ ) + + if values.ndim == 2: # NDArrayBackedExtensionArray.fillna assumes axis=0 - new_values = values.T._pad_or_backfill(method=method, limit=limit).T + new_values = values.T._pad_or_backfill(**kwargs).T else: - new_values = values._pad_or_backfill(method=method, limit=limit) + new_values = values._pad_or_backfill(**kwargs) return [self.make_block_same_class(new_values)] @@ -2283,34 +1861,28 @@ def fillna( value, limit: int | None = None, inplace: bool = False, - downcast=None, - using_cow: bool = False, - already_warned=None, ) -> list[Block]: - if isinstance(self.dtype, IntervalDtype): + if isinstance(self.dtype, (IntervalDtype, StringDtype)): # Block.fillna handles coercion (test_fillna_interval) + if isinstance(self.dtype, IntervalDtype) and limit is not None: + raise ValueError("limit must be None") return super().fillna( value=value, limit=limit, inplace=inplace, - downcast=downcast, - using_cow=using_cow, - already_warned=already_warned, ) - if using_cow and self._can_hold_na and not self.values._hasna: + if self._can_hold_na and not self.values._hasna: refs = self.refs new_values = self.values else: - copy, refs = self._get_refs_and_copy(using_cow, inplace) + copy, refs = self._get_refs_and_copy(inplace) try: - new_values = self.values.fillna( - value=value, method=None, limit=limit, copy=copy - ) + new_values = self.values.fillna(value=value, limit=limit, copy=copy) except TypeError: # 3rd party EA that has not implemented copy keyword yet refs = None - new_values = self.values.fillna(value=value, method=None, limit=limit) + new_values = self.values.fillna(value=value, limit=limit) # issue the warning *after* retrying, in case the TypeError # was caused by an invalid fill_value warnings.warn( @@ -2323,23 +1895,8 @@ def fillna( DeprecationWarning, stacklevel=find_stack_level(), ) - else: - if ( - not copy - and warn_copy_on_write() - and already_warned is not None - and not already_warned.warned_already - ): - if self.refs.has_reference(): - warnings.warn( - COW_WARNING_GENERAL_MSG, - FutureWarning, - stacklevel=find_stack_level(), - ) - already_warned.warned_already = True - nb = self.make_block_same_class(new_values, refs=refs) - return nb._maybe_downcast([nb], downcast, using_cow=using_cow, caller="fillna") + return [self.make_block_same_class(new_values, refs=refs)] @cache_readonly def shape(self) -> Shape: @@ -2541,7 +2098,7 @@ def _unstack( self.values.take( indices, allow_fill=needs_masking[i], fill_value=fill_value ), - BlockPlacement(place), + BlockPlacement(place), # type: ignore[arg-type] ndim=2, ) for i, (indices, place) in enumerate(zip(new_values, new_placement)) @@ -2575,18 +2132,6 @@ def is_numeric(self) -> bool: # type: ignore[override] return kind in "fciub" -class NumericBlock(NumpyBlock): - # this Block type is kept for backwards-compatibility - # TODO(3.0): delete and remove deprecation in __init__.py. - __slots__ = () - - -class ObjectBlock(NumpyBlock): - # this Block type is kept for backwards-compatibility - # TODO(3.0): delete and remove deprecation in __init__.py. 
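Note: the inspect.signature probe in the pad_or_backfill hunk above is a general pattern for forwarding a newer keyword only to implementations that accept it; a self-contained sketch with hypothetical names (pad, call_pad):

import inspect

def pad(values, method, limit=None):  # hypothetical EA hook without limit_area
    return values

def call_pad(func, values, method, limit=None, limit_area=None):
    kwargs = {"method": method, "limit": limit}
    if "limit_area" in inspect.signature(func).parameters:
        kwargs["limit_area"] = limit_area
    elif limit_area is not None:
        raise NotImplementedError("pad does not implement limit_area")
    return func(values, **kwargs)

print(call_pad(pad, [1, None, 2], method="pad"))  # [1, None, 2]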
- __slots__ = () - - class NDArrayBackedExtensionBlock(EABackedBlock): """ Block backed by an NDArrayBackedExtensionArray @@ -2609,14 +2154,6 @@ class DatetimeLikeBlock(NDArrayBackedExtensionBlock): values: DatetimeArray | TimedeltaArray -class DatetimeTZBlock(DatetimeLikeBlock): - """implement a datetime64 block with a tz attribute""" - - values: DatetimeArray - - __slots__ = () - - # ----------------------------------------------------------------- # Constructor Helpers @@ -2663,7 +2200,7 @@ def get_block_type(dtype: DtypeObj) -> type[Block]: cls : class, subclass of Block """ if isinstance(dtype, DatetimeTZDtype): - return DatetimeTZBlock + return DatetimeLikeBlock elif isinstance(dtype, PeriodDtype): return NDArrayBackedExtensionBlock elif isinstance(dtype, ExtensionDtype): @@ -2681,7 +2218,7 @@ def get_block_type(dtype: DtypeObj) -> type[Block]: def new_block_2d( values: ArrayLike, placement: BlockPlacement, refs: BlockValuesRefs | None = None -): +) -> Block: # new_block specialized to case with # ndim=2 # isinstance(placement, BlockPlacement) @@ -2728,8 +2265,7 @@ def check_ndim(values, placement: BlockPlacement, ndim: int) -> None: if values.ndim > ndim: # Check for both np.ndarray and ExtensionArray raise ValueError( - "Wrong number of dimensions. " - f"values.ndim > ndim [{values.ndim} > {ndim}]" + f"Wrong number of dimensions. values.ndim > ndim [{values.ndim} > {ndim}]" ) if not is_1d_only_ea_dtype(values.dtype): @@ -2820,7 +2356,7 @@ def external_values(values: ArrayLike) -> ArrayLike: # Avoid raising in .astype in casting from dt64tz to dt64 values = values._ndarray - if isinstance(values, np.ndarray) and using_copy_on_write(): + if isinstance(values, np.ndarray): values = values.view() values.flags.writeable = False diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index b2d463a8c6c26..2ee7d3948a70f 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -4,7 +4,6 @@ TYPE_CHECKING, cast, ) -import warnings import numpy as np @@ -16,7 +15,6 @@ ) from pandas._libs.missing import NA from pandas.util._decorators import cache_readonly -from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.cast import ( ensure_dtype_can_hold_na, @@ -24,22 +22,13 @@ ) from pandas.core.dtypes.common import ( is_1d_only_ea_dtype, - is_scalar, needs_i8_conversion, ) from pandas.core.dtypes.concat import concat_compat -from pandas.core.dtypes.dtypes import ( - ExtensionDtype, - SparseDtype, -) -from pandas.core.dtypes.missing import ( - is_valid_na_for_dtype, - isna, - isna_all, -) +from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.missing import is_valid_na_for_dtype from pandas.core.construction import ensure_wrapped_if_datetimelike -from pandas.core.internals.array_manager import ArrayManager from pandas.core.internals.blocks import ( ensure_block_shape, new_block_2d, @@ -50,13 +39,15 @@ ) if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import ( + Generator, + Sequence, + ) from pandas._typing import ( ArrayLike, AxisInt, DtypeObj, - Manager2D, Shape, ) @@ -67,33 +58,9 @@ ) -def _concatenate_array_managers( - mgrs: list[ArrayManager], axes: list[Index], concat_axis: AxisInt -) -> Manager2D: - """ - Concatenate array managers into one. 
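Note: with DatetimeTZBlock deleted above, tz-aware and tz-naive datetimes dispatch to the same block class; a sketch poking at the (private, unstable) internals to confirm:

from pandas import DatetimeTZDtype
from pandas.core.internals.blocks import (  # private API, may change
    DatetimeLikeBlock,
    get_block_type,
)

print(get_block_type(DatetimeTZDtype(tz="UTC")) is DatetimeLikeBlock)  # True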
- - Parameters - ---------- - mgrs_indexers : list of (ArrayManager, {axis: indexer,...}) tuples - axes : list of Index - concat_axis : int - - Returns - ------- - ArrayManager - """ - if concat_axis == 1: - return mgrs[0].concat_vertical(mgrs, axes) - else: - # concatting along the columns -> combine reindexed arrays in a single manager - assert concat_axis == 0 - return mgrs[0].concat_horizontal(mgrs, axes) - - def concatenate_managers( mgrs_indexers, axes: list[Index], concat_axis: AxisInt, copy: bool -) -> Manager2D: +) -> BlockManager: """ Concatenate block managers into one. @@ -111,16 +78,6 @@ def concatenate_managers( needs_copy = copy and concat_axis == 0 - # TODO(ArrayManager) this assumes that all managers are of the same type - if isinstance(mgrs_indexers[0][0], ArrayManager): - mgrs = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers, needs_copy) - # error: Argument 1 to "_concatenate_array_managers" has incompatible - # type "List[BlockManager]"; expected "List[Union[ArrayManager, - # SingleArrayManager, BlockManager, SingleBlockManager]]" - return _concatenate_array_managers( - mgrs, axes, concat_axis # type: ignore[arg-type] - ) - # Assertions disabled for performance # for tup in mgrs_indexers: # # caller is responsible for ensuring this @@ -136,6 +93,7 @@ def concatenate_managers( if first_dtype in [np.float64, np.float32]: # TODO: support more dtypes here. This will be simpler once # JoinUnit.is_na behavior is deprecated. + # (update 2024-04-13 that deprecation has been enforced) if ( all(_is_homogeneous_mgr(mgr, first_dtype) for mgr, _ in mgrs_indexers) and len(mgrs_indexers) > 1 @@ -154,12 +112,10 @@ def concatenate_managers( out.axes = axes return out - concat_plan = _get_combined_plan(mgrs) - blocks = [] values: ArrayLike - for placement, join_units in concat_plan: + for placement, join_units in _get_combined_plan(mgrs): unit = join_units[0] blk = unit.block @@ -221,7 +177,6 @@ def _maybe_reindex_columns_na_proxy( axes[i], indexers[i], axis=i, - copy=False, only_slice=True, # only relevant for i==0 allow_dups=True, use_na_proxy=True, # only relevant for i==0 @@ -295,14 +250,12 @@ def _concat_homogeneous_fastpath( def _get_combined_plan( mgrs: list[BlockManager], -) -> list[tuple[BlockPlacement, list[JoinUnit]]]: - plan = [] - +) -> Generator[tuple[BlockPlacement, list[JoinUnit]]]: max_len = mgrs[0].shape[0] blknos_list = [mgr.blknos for mgr in mgrs] pairs = libinternals.get_concat_blkno_indexers(blknos_list) - for ind, (blknos, bp) in enumerate(pairs): + for blknos, bp in pairs: # assert bp.is_slice_like # assert len(bp) > 0 @@ -314,9 +267,7 @@ def _get_combined_plan( unit = JoinUnit(nb) units_for_bp.append(unit) - plan.append((bp, units_for_bp)) - - return plan + yield bp, units_for_bp def _get_block_for_concat_plan( @@ -352,7 +303,7 @@ def __init__(self, block: Block) -> None: self.block = block def __repr__(self) -> str: - return f"{type(self).__name__}({repr(self.block)})" + return f"{type(self).__name__}({self.block!r})" def _is_valid_na_for(self, dtype: DtypeObj) -> bool: """ @@ -388,41 +339,6 @@ def _is_valid_na_for(self, dtype: DtypeObj) -> bool: @cache_readonly def is_na(self) -> bool: - blk = self.block - if blk.dtype.kind == "V": - return True - - if not blk._can_hold_na: - return False - - values = blk.values - if values.size == 0: - # GH#39122 this case will return False once deprecation is enforced - return True - - if isinstance(values.dtype, SparseDtype): - return False - - if values.ndim == 1: - # TODO(EA2D): no need for special case with 2D EAs - 
val = values[0] - if not is_scalar(val) or not isna(val): - # ideally isna_all would do this short-circuiting - return False - return isna_all(values) - else: - val = values[0][0] - if not is_scalar(val) or not isna(val): - # ideally isna_all would do this short-circuiting - return False - return all(isna_all(row) for row in values) - - @cache_readonly - def is_na_after_size_and_isna_all_deprecation(self) -> bool: - """ - Will self.is_na be True after values.size == 0 deprecation and isna_all - deprecation are enforced? - """ blk = self.block if blk.dtype.kind == "V": return True @@ -458,7 +374,7 @@ def _concatenate_join_units(join_units: list[JoinUnit], copy: bool) -> ArrayLike """ Concatenate values from several join units along axis=1. """ - empty_dtype, empty_dtype_future = _get_empty_dtype(join_units) + empty_dtype = _get_empty_dtype(join_units) has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units) upcasted_na = _dtype_to_na_value(empty_dtype, has_none_blocks) @@ -474,9 +390,7 @@ def _concatenate_join_units(join_units: list[JoinUnit], copy: bool) -> ArrayLike # error: No overload variant of "__getitem__" of "ExtensionArray" matches # argument type "Tuple[int, slice]" to_concat = [ - t - if is_1d_only_ea_dtype(t.dtype) - else t[0, :] # type: ignore[call-overload] + t if is_1d_only_ea_dtype(t.dtype) else t[0, :] # type: ignore[call-overload] for t in to_concat ] concat_values = concat_compat(to_concat, axis=0, ea_compat_axis=True) @@ -485,18 +399,6 @@ def _concatenate_join_units(join_units: list[JoinUnit], copy: bool) -> ArrayLike else: concat_values = concat_compat(to_concat, axis=1) - if empty_dtype != empty_dtype_future: - if empty_dtype == concat_values.dtype: - # GH#39122, GH#40893 - warnings.warn( - "The behavior of DataFrame concatenation with empty or all-NA " - "entries is deprecated. In a future version, this will no longer " - "exclude empty or all-NA columns when determining the result dtypes. " - "To retain the old behavior, exclude the relevant entries before " - "the concat operation.", - FutureWarning, - stacklevel=find_stack_level(), - ) return concat_values @@ -523,7 +425,7 @@ def _dtype_to_na_value(dtype: DtypeObj, has_none_blocks: bool): raise NotImplementedError -def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> tuple[DtypeObj, DtypeObj]: +def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj: """ Return dtype and N/A values to use when concatenating specified units. 
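Note: with the FutureWarning removed above, empty/all-NA entries now count toward the concat result dtype; a hedged sketch of the enforced behavior (GH#40893):

import numpy as np
import pandas as pd

df1 = pd.DataFrame({"a": np.array([np.nan], dtype=object)})
df2 = pd.DataFrame({"a": [1.5]})
# the all-NA object column is no longer excluded, so the common dtype
# is expected to be object rather than float64
print(pd.concat([df1, df2])["a"].dtype)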
@@ -535,38 +437,17 @@ def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> tuple[DtypeObj, DtypeObj """ if lib.dtypes_all_equal([ju.block.dtype for ju in join_units]): empty_dtype = join_units[0].block.dtype - return empty_dtype, empty_dtype + return empty_dtype has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units) dtypes = [unit.block.dtype for unit in join_units if not unit.is_na] - if not len(dtypes): - dtypes = [ - unit.block.dtype for unit in join_units if unit.block.dtype.kind != "V" - ] dtype = find_common_type(dtypes) if has_none_blocks: dtype = ensure_dtype_can_hold_na(dtype) - dtype_future = dtype - if len(dtypes) != len(join_units): - dtypes_future = [ - unit.block.dtype - for unit in join_units - if not unit.is_na_after_size_and_isna_all_deprecation - ] - if not len(dtypes_future): - dtypes_future = [ - unit.block.dtype for unit in join_units if unit.block.dtype.kind != "V" - ] - - if len(dtypes) != len(dtypes_future): - dtype_future = find_common_type(dtypes_future) - if has_none_blocks: - dtype_future = ensure_dtype_can_hold_na(dtype_future) - - return dtype, dtype_future + return dtype def _is_uniform_join_units(join_units: list[JoinUnit]) -> bool: diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 609d2c9a7a285..35de97d570bd3 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -2,6 +2,7 @@ Functions for preparing various inputs passed to the DataFrame or Series constructors before passing them to a BlockManager. """ + from __future__ import annotations from collections import abc @@ -13,7 +14,7 @@ import numpy as np from numpy import ma -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import lib @@ -31,12 +32,14 @@ is_list_like, is_named_tuple, is_object_dtype, + is_scalar, ) from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ( ABCDataFrame, ABCSeries, ) +from pandas.core.dtypes.missing import isna from pandas.core import ( algorithms, @@ -46,7 +49,6 @@ from pandas.core.arrays.string_ import StringDtype from pandas.core.construction import ( array as pd_array, - ensure_wrapped_if_datetimelike, extract_array, range_to_ndarray, sanitize_array, @@ -58,12 +60,9 @@ default_index, ensure_index, get_objs_combined_axis, + maybe_sequence_to_range, union_indexes, ) -from pandas.core.internals.array_manager import ( - ArrayManager, - SingleArrayManager, -) from pandas.core.internals.blocks import ( BlockPlacement, ensure_block_shape, @@ -71,8 +70,6 @@ new_block_2d, ) from pandas.core.internals.managers import ( - BlockManager, - SingleBlockManager, create_block_manager_from_blocks, create_block_manager_from_column_arrays, ) @@ -100,7 +97,6 @@ def arrays_to_mgr( *, dtype: DtypeObj | None = None, verify_integrity: bool = True, - typ: str | None = None, consolidate: bool = True, ) -> Manager: """ @@ -148,14 +144,9 @@ def arrays_to_mgr( # from BlockManager perspective axes = [columns, index] - if typ == "block": - return create_block_manager_from_column_arrays( - arrays, axes, consolidate=consolidate, refs=refs - ) - elif typ == "array": - return ArrayManager(arrays, [index, columns]) - else: - raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'") + return create_block_manager_from_column_arrays( + arrays, axes, consolidate=consolidate, refs=refs + ) def rec_array_to_mgr( @@ -164,7 +155,6 @@ def rec_array_to_mgr( columns, dtype: DtypeObj | 
None, copy: bool, - typ: str, ) -> Manager: """ Extract from a masked rec array and create the manager. @@ -186,59 +176,23 @@ def rec_array_to_mgr( if columns is None: columns = arr_columns - mgr = arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ) + mgr = arrays_to_mgr(arrays, columns, index, dtype=dtype) if copy: mgr = mgr.copy() return mgr -def mgr_to_mgr(mgr, typ: str, copy: bool = True) -> Manager: - """ - Convert to specific type of Manager. Does not copy if the type is already - correct. Does not guarantee a copy otherwise. `copy` keyword only controls - whether conversion from Block->ArrayManager copies the 1D arrays. - """ - new_mgr: Manager - - if typ == "block": - if isinstance(mgr, BlockManager): - new_mgr = mgr - else: - if mgr.ndim == 2: - new_mgr = arrays_to_mgr( - mgr.arrays, mgr.axes[0], mgr.axes[1], typ="block" - ) - else: - new_mgr = SingleBlockManager.from_array(mgr.arrays[0], mgr.index) - elif typ == "array": - if isinstance(mgr, ArrayManager): - new_mgr = mgr - else: - if mgr.ndim == 2: - arrays = [mgr.iget_values(i) for i in range(len(mgr.axes[0]))] - if copy: - arrays = [arr.copy() for arr in arrays] - new_mgr = ArrayManager(arrays, [mgr.axes[1], mgr.axes[0]]) - else: - array = mgr.internal_values() - if copy: - array = array.copy() - new_mgr = SingleArrayManager([array], [mgr.index]) - else: - raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'") - return new_mgr - - # --------------------------------------------------------------------- # DataFrame Constructor Interface def ndarray_to_mgr( - values, index, columns, dtype: DtypeObj | None, copy: bool, typ: str + values, index, columns, dtype: DtypeObj | None, copy: bool ) -> Manager: # used in DataFrame.__init__ # input must be a ndarray, list, Series, Index, ExtensionArray + infer_object = not isinstance(values, (ABCSeries, Index, ExtensionArray)) if isinstance(values, ABCSeries): if columns is None: @@ -253,10 +207,6 @@ def ndarray_to_mgr( if not len(values) and columns is not None and len(columns): values = np.empty((0, 1), dtype=object) - # if the array preparation does a copy -> avoid this for ArrayManager, - # since the copy is done on conversion to 1D arrays - copy_on_sanitize = False if typ == "array" else copy - vdtype = getattr(values, "dtype", None) refs = None if is_1d_only_ea_dtype(vdtype) or is_1d_only_ea_dtype(dtype): @@ -279,7 +229,7 @@ def ndarray_to_mgr( else: columns = ensure_index(columns) - return arrays_to_mgr(values, columns, index, dtype=dtype, typ=typ) + return arrays_to_mgr(values, columns, index, dtype=dtype) elif isinstance(vdtype, ExtensionDtype): # i.e. 
Datetime64TZ, PeriodDtype; cases with is_1d_only_ea_dtype(vdtype) @@ -291,12 +241,10 @@ def ndarray_to_mgr( values = values.reshape(-1, 1) elif isinstance(values, (ABCSeries, Index)): - if not copy_on_sanitize and ( - dtype is None or astype_is_view(values.dtype, dtype) - ): + if not copy and (dtype is None or astype_is_view(values.dtype, dtype)): refs = values._references - if copy_on_sanitize: + if copy: values = values._values.copy() else: values = values._values @@ -305,18 +253,18 @@ def ndarray_to_mgr( elif isinstance(values, (np.ndarray, ExtensionArray)): # drop subclass info - _copy = ( - copy_on_sanitize - if (dtype is None or astype_is_view(values.dtype, dtype)) - else False - ) - values = np.array(values, copy=_copy) + if copy and (dtype is None or astype_is_view(values.dtype, dtype)): + # only force a copy now if copy=True was requested + # and a subsequent `astype` will not already result in a copy + values = np.array(values, copy=True, order="F") + else: + values = np.asarray(values) values = _ensure_2d(values) else: # by definition an array here # the dtypes will be coerced to a single dtype - values = _prep_ndarraylike(values, copy=copy_on_sanitize) + values = _prep_ndarraylike(values, copy=copy) if dtype is not None and values.dtype != dtype: # GH#40110 see similar check inside sanitize_array @@ -324,7 +272,7 @@ def ndarray_to_mgr( values, None, dtype=dtype, - copy=copy_on_sanitize, + copy=copy, allow_2d=True, ) @@ -335,48 +283,26 @@ def ndarray_to_mgr( _check_values_indices_shape_match(values, index, columns) - if typ == "array": - if issubclass(values.dtype.type, str): - values = np.array(values, dtype=object) - - if dtype is None and is_object_dtype(values.dtype): - arrays = [ - ensure_wrapped_if_datetimelike( - maybe_infer_to_datetimelike(values[:, i]) - ) - for i in range(values.shape[1]) - ] - else: - if lib.is_np_dtype(values.dtype, "mM"): - values = ensure_wrapped_if_datetimelike(values) - arrays = [values[:, i] for i in range(values.shape[1])] - - if copy: - arrays = [arr.copy() for arr in arrays] - - return ArrayManager(arrays, [index, columns], verify_integrity=False) - values = values.T # if we don't have a dtype specified, then try to convert objects # on the entire block; this is to convert if we have datetimelike's # embedded in an object type - if dtype is None and is_object_dtype(values.dtype): + if dtype is None and infer_object and is_object_dtype(values.dtype): obj_columns = list(values) maybe_datetime = [maybe_infer_to_datetimelike(x) for x in obj_columns] # don't convert (and copy) the objects if no type inference occurs if any(x is not y for x, y in zip(obj_columns, maybe_datetime)): - dvals_list = [ensure_block_shape(dval, 2) for dval in maybe_datetime] block_values = [ - new_block_2d(dvals_list[n], placement=BlockPlacement(n)) - for n in range(len(dvals_list)) + new_block_2d(ensure_block_shape(dval, 2), placement=BlockPlacement(n)) + for n, dval in enumerate(maybe_datetime) ] else: bp = BlockPlacement(slice(len(columns))) nb = new_block_2d(values, placement=bp, refs=refs) block_values = [nb] - elif dtype is None and values.dtype.kind == "U" and using_pyarrow_string_dtype(): - dtype = StringDtype(storage="pyarrow_numpy") + elif dtype is None and values.dtype.kind == "U" and using_string_dtype(): + dtype = StringDtype(na_value=np.nan) obj_columns = list(values) block_values = [ @@ -426,7 +352,6 @@ def dict_to_mgr( columns, *, dtype: DtypeObj | None = None, - typ: str = "block", copy: bool = True, ) -> Manager: """ @@ -435,72 +360,70 @@ def 
dict_to_mgr( Used in DataFrame.__init__ """ - arrays: Sequence[Any] | Series + arrays: Sequence[Any] if columns is not None: - from pandas.core.series import Series + columns = ensure_index(columns) + arrays = [np.nan] * len(columns) + midxs = set() + data_keys = ensure_index(data.keys()) # type: ignore[arg-type] + data_values = list(data.values()) + + for i, column in enumerate(columns): + try: + idx = data_keys.get_loc(column) + except KeyError: + midxs.add(i) + continue + array = data_values[idx] + arrays[i] = array + if is_scalar(array) and isna(array): + midxs.add(i) - arrays = Series(data, index=columns, dtype=object) - missing = arrays.isna() if index is None: # GH10856 # raise ValueError if only scalars in dict - index = _extract_index(arrays[~missing]) + if midxs: + index = _extract_index( + [array for i, array in enumerate(arrays) if i not in midxs] + ) + else: + index = _extract_index(arrays) else: index = ensure_index(index) # no obvious "empty" int column - if missing.any() and not is_integer_dtype(dtype): - nan_dtype: DtypeObj - - if dtype is not None: - # calling sanitize_array ensures we don't mix-and-match - # NA dtypes - midxs = missing.values.nonzero()[0] - for i in midxs: - arr = sanitize_array(arrays.iat[i], index, dtype=dtype) - arrays.iat[i] = arr - else: - # GH#1783 - nan_dtype = np.dtype("object") - val = construct_1d_arraylike_from_scalar(np.nan, len(index), nan_dtype) - nmissing = missing.sum() - if copy: - rhs = [val] * nmissing - else: - # GH#45369 - rhs = [val.copy() for _ in range(nmissing)] - arrays.loc[missing] = rhs - - arrays = list(arrays) - columns = ensure_index(columns) + if midxs and not is_integer_dtype(dtype): + # GH#1783 + for i in midxs: + arr = construct_1d_arraylike_from_scalar( + arrays[i], + len(index), + dtype if dtype is not None else np.dtype("object"), + ) + arrays[i] = arr else: - keys = list(data.keys()) + keys = maybe_sequence_to_range(list(data.keys())) columns = Index(keys) if keys else default_index(0) arrays = [com.maybe_iterable_to_list(data[k]) for k in keys] if copy: - if typ == "block": - # We only need to copy arrays that will not get consolidated, i.e. - # only EA arrays - arrays = [ - x.copy() - if isinstance(x, ExtensionArray) - else x.copy(deep=True) - if ( - isinstance(x, Index) - or isinstance(x, ABCSeries) - and is_1d_only_ea_dtype(x.dtype) - ) - else x - for x in arrays - ] - else: - # dtype check to exclude e.g. range objects, scalars - arrays = [x.copy() if hasattr(x, "dtype") else x for x in arrays] + # We only need to copy arrays that will not get consolidated, i.e. + # only EA arrays + arrays = [ + x.copy() + if isinstance(x, ExtensionArray) + else x.copy(deep=True) + if ( + isinstance(x, Index) + or (isinstance(x, ABCSeries) and is_1d_only_ea_dtype(x.dtype)) + ) + else x + for x in arrays + ] - return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy) + return arrays_to_mgr(arrays, columns, index, dtype=dtype, consolidate=copy) def nested_data_to_arrays( @@ -604,11 +527,11 @@ def _homogenize( for val in data: if isinstance(val, (ABCSeries, Index)): if dtype is not None: - val = val.astype(dtype, copy=False) + val = val.astype(dtype) if isinstance(val, ABCSeries) and val.index is not index: # Forces alignment. 
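Note: in the rewritten dict_to_mgr above, columns absent from the dict land in midxs and are materialized as all-NA arrays; the long-standing user-facing effect, sketched:

import pandas as pd

df = pd.DataFrame({"a": [1, 2]}, columns=["a", "b"])
print(df["b"].isna().all())  # True: "b" was filled via the midxs bookkeeping
print(df["b"].dtype)         # object by default (GH#1783) when no dtype is given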
No need to copy data since we # are putting it into an ndarray later - val = val.reindex(index, copy=False) + val = val.reindex(index) refs.append(val._references) val = val._values else: @@ -643,7 +566,7 @@ def _extract_index(data) -> Index: if len(data) == 0: return default_index(0) - raw_lengths = [] + raw_lengths = set() indexes: list[list[Hashable] | Index] = [] have_raw_arrays = False @@ -659,7 +582,7 @@ def _extract_index(data) -> Index: indexes.append(list(val.keys())) elif is_list_like(val) and getattr(val, "ndim", 1) == 1: have_raw_arrays = True - raw_lengths.append(len(val)) + raw_lengths.add(len(val)) elif isinstance(val, np.ndarray) and val.ndim > 1: raise ValueError("Per-column arrays must each be 1-dimensional") @@ -672,24 +595,23 @@ def _extract_index(data) -> Index: index = union_indexes(indexes, sort=False) if have_raw_arrays: - lengths = list(set(raw_lengths)) - if len(lengths) > 1: + if len(raw_lengths) > 1: raise ValueError("All arrays must be of the same length") if have_dicts: raise ValueError( "Mixing dicts with non-Series may lead to ambiguous ordering." ) - + raw_length = raw_lengths.pop() if have_series: - if lengths[0] != len(index): + if raw_length != len(index): msg = ( - f"array length {lengths[0]} does not match index " + f"array length {raw_length} does not match index " f"length {len(index)}" ) raise ValueError(msg) else: - index = default_index(lengths[0]) + index = default_index(raw_length) return ensure_index(index) @@ -698,7 +620,7 @@ def reorder_arrays( arrays: list[ArrayLike], arr_columns: Index, columns: Index | None, length: int ) -> tuple[list[ArrayLike], Index]: """ - Pre-emptively (cheaply) reindex arrays with new columns. + Preemptively (cheaply) reindex arrays with new columns. """ # reorder according to the columns if columns is not None: @@ -712,7 +634,7 @@ def reorder_arrays( arr = np.empty(length, dtype=object) arr.fill(np.nan) else: - arr = arrays[k] + arr = arrays[k] # type: ignore[assignment] new_arrays.append(arr) arrays = new_arrays @@ -827,7 +749,8 @@ def to_arrays( elif isinstance(data, np.ndarray) and data.dtype.names is not None: # e.g. recarray - columns = Index(list(data.dtype.names)) + if columns is None: + columns = Index(data.dtype.names) arrays = [data[k] for k in columns] return arrays, columns @@ -919,7 +842,7 @@ def _list_of_dict_to_arrays( # assure that they are of the base dict class and not of derived # classes - data = [d if type(d) is dict else dict(d) for d in data] # noqa: E721 + data = [d if type(d) is dict else dict(d) for d in data] content = lib.dicts_to_array(data, list(columns)) return content, columns @@ -941,7 +864,7 @@ def _finalize_columns_and_data( # GH#26429 do not raise user-facing AssertionError raise ValueError(err) from err - if len(contents) and contents[0].dtype == np.object_: + if contents and contents[0].dtype == np.object_: contents = convert_object_array(contents, dtype=dtype) return contents, columns @@ -984,8 +907,7 @@ def _validate_or_indexify_columns( if not is_mi_list and len(columns) != len(content): # pragma: no cover # caller's responsibility to check for this... raise AssertionError( - f"{len(columns)} columns passed, passed data had " - f"{len(content)} columns" + f"{len(columns)} columns passed, passed data had {len(content)} columns" ) if is_mi_list: # check if nested list column, length of each sub-list should be equal @@ -1042,8 +964,9 @@ def convert(arr): if dtype is None: if arr.dtype == np.dtype("O"): # i.e. 
maybe_convert_objects didn't convert - arr = maybe_infer_to_datetimelike(arr) - if dtype_backend != "numpy" and arr.dtype == np.dtype("O"): + convert_to_nullable_dtype = dtype_backend != "numpy" + arr = maybe_infer_to_datetimelike(arr, convert_to_nullable_dtype) + if convert_to_nullable_dtype and arr.dtype == np.dtype("O"): new_dtype = StringDtype() arr_cls = new_dtype.construct_array_type() arr = arr_cls._from_sequence(arr, dtype=new_dtype) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 3719bf1f77f85..e238bb78bbdfa 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1,27 +1,27 @@ from __future__ import annotations from collections.abc import ( + Callable, Hashable, Sequence, ) import itertools from typing import ( TYPE_CHECKING, - Callable, + Any, Literal, + NoReturn, cast, + final, ) import warnings -import weakref import numpy as np -from pandas._config import ( - using_copy_on_write, - warn_copy_on_write, -) +from pandas._config.config import get_option from pandas._libs import ( + algos as libalgos, internals as libinternals, lib, ) @@ -30,11 +30,19 @@ BlockValuesRefs, ) from pandas._libs.tslibs import Timestamp -from pandas.errors import PerformanceWarning +from pandas.errors import ( + AbstractMethodError, + PerformanceWarning, +) from pandas.util._decorators import cache_readonly from pandas.util._exceptions import find_stack_level +from pandas.util._validators import validate_bool_kwarg -from pandas.core.dtypes.cast import infer_dtype_from_scalar +from pandas.core.dtypes.cast import ( + find_common_type, + infer_dtype_from_scalar, + np_can_hold_element, +) from pandas.core.dtypes.common import ( ensure_platform_int, is_1d_only_ea_dtype, @@ -43,6 +51,7 @@ from pandas.core.dtypes.dtypes import ( DatetimeTZDtype, ExtensionDtype, + SparseDtype, ) from pandas.core.dtypes.generic import ( ABCDataFrame, @@ -54,12 +63,9 @@ ) import pandas.core.algorithms as algos -from pandas.core.arrays import ( - ArrowExtensionArray, - ArrowStringArray, - DatetimeArray, -) +from pandas.core.arrays import DatetimeArray from pandas.core.arrays._mixins import NDArrayBackedExtensionArray +from pandas.core.base import PandasObject from pandas.core.construction import ( ensure_wrapped_if_datetimelike, extract_array, @@ -67,17 +73,10 @@ from pandas.core.indexers import maybe_convert_indices from pandas.core.indexes.api import ( Index, + default_index, ensure_index, ) -from pandas.core.internals.base import ( - DataManager, - SingleDataManager, - ensure_np_dtype, - interleaved_dtype, -) from pandas.core.internals.blocks import ( - COW_WARNING_GENERAL_MSG, - COW_WARNING_SETITEM_MSG, Block, NumpyBlock, ensure_block_shape, @@ -93,6 +92,8 @@ ) if TYPE_CHECKING: + from collections.abc import Generator + from pandas._typing import ( ArrayLike, AxisInt, @@ -106,7 +107,39 @@ from pandas.api.extensions import ExtensionArray -class BaseBlockManager(DataManager): +def interleaved_dtype(dtypes: list[DtypeObj]) -> DtypeObj | None: + """ + Find the common dtype for `dtypes`. + + Parameters + ---------- + dtypes : list[DtypeObj] + + Returns + ------- + dtype : np.dtype, ExtensionDtype, or None + None is returned when `dtypes` is empty. + """ + if not len(dtypes): + return None + + return find_common_type(dtypes) + + +def ensure_np_dtype(dtype: DtypeObj) -> np.dtype: + # TODO: https://github.com/pandas-dev/pandas/issues/22791 + # Give EAs some input on what happens here. Sparse needs this.
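Note: interleaved_dtype (now local to managers.py, above) is what backs full-frame conversions such as DataFrame.to_numpy; a quick behavioral sketch:

import pandas as pd

df = pd.DataFrame({"ints": [1], "floats": [1.5]})
print(df.to_numpy().dtype)  # float64: the common ("interleaved") dtype

mixed = pd.DataFrame({"ints": [1], "strs": ["x"]})
print(mixed.to_numpy().dtype)  # object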
+ if isinstance(dtype, SparseDtype): + dtype = dtype.subtype + dtype = cast(np.dtype, dtype) + elif isinstance(dtype, ExtensionDtype): + dtype = np.dtype("object") + elif dtype == np.dtype(str): + dtype = np.dtype("object") + return dtype + + +class BaseBlockManager(PandasObject): """ Core internal data structure to implement DataFrame, Series, etc. @@ -174,6 +207,14 @@ def ndim(self) -> int: def __init__(self, blocks, axes, verify_integrity: bool = True) -> None: raise NotImplementedError + @final + def __len__(self) -> int: + return len(self.items) + + @property + def shape(self) -> Shape: + return tuple(len(ax) for ax in self.axes) + @classmethod def from_blocks(cls, blocks: list[Block], axes: list[Index]) -> Self: raise NotImplementedError @@ -208,7 +249,7 @@ def blklocs(self) -> npt.NDArray[np.intp]: def make_empty(self, axes=None) -> Self: """return an empty BlockManager with the items axis of len 0""" if axes is None: - axes = [Index([])] + self.axes[1:] + axes = [default_index(0)] + self.axes[1:] # preserve dtype if possible if self.ndim == 1: @@ -222,23 +263,31 @@ def make_empty(self, axes=None) -> Self: blocks = [] return type(self).from_blocks(blocks, axes) - def __nonzero__(self) -> bool: + def __bool__(self) -> bool: return True - # Python3 compat - __bool__ = __nonzero__ - - def _normalize_axis(self, axis: AxisInt) -> int: - # switch axis to follow BlockManager logic - if self.ndim == 2: - axis = 1 if axis == 0 else 0 - return axis - def set_axis(self, axis: AxisInt, new_labels: Index) -> None: # Caller is responsible for ensuring we have an Index object. self._validate_set_axis(axis, new_labels) self.axes[axis] = new_labels + @final + def _validate_set_axis(self, axis: AxisInt, new_labels: Index) -> None: + # Caller is responsible for ensuring we have an Index object. + old_len = len(self.axes[axis]) + new_len = len(new_labels) + + if axis == 1 and len(self.items) == 0: + # If we are setting the index on a DataFrame with no columns, + # it is OK to change the length. + pass + + elif new_len != old_len: + raise ValueError( + f"Length mismatch: Expected axis has {old_len} elements, new " + f"values have {new_len} elements" + ) + @property def is_single_block(self) -> bool: # Assumes we are 2D; overridden by SingleBlockManager @@ -282,8 +331,8 @@ def references_same_values(self, mgr: BaseBlockManager, blkno: int) -> bool: Checks if two blocks from two different block managers reference the same underlying values. """ - ref = weakref.ref(self.blocks[blkno]) - return ref in mgr.blocks[blkno].refs.referenced_blocks + blk = self.blocks[blkno] + return any(blk is ref() for ref in mgr.blocks[blkno].refs.referenced_blocks) def get_dtypes(self) -> npt.NDArray[np.object_]: dtypes = np.array([blk.dtype for blk in self.blocks], dtype=object) @@ -301,6 +350,8 @@ def arrays(self) -> list[ArrayLike]: Warning! The returned arrays don't handle Copy-on-Write, so this should be used with caution (only in read-mode). """ + # TODO: Deprecate, usage in Dask + # https://github.com/dask/dask/blob/484fc3f1136827308db133cd256ba74df7a38d8c/dask/base.py#L1312 return [blk.values for blk in self.blocks] def __repr__(self) -> str: @@ -315,6 +366,29 @@ def __repr__(self) -> str: output += f"\n{block}" return output + def _equal_values(self, other: Self) -> bool: + """ + To be implemented by the subclasses. Only check the column values + assuming shape and indexes have already been checked. 
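Note: _validate_set_axis, inlined above from the deleted internals.base module, is the check that surfaces as the familiar length-mismatch error:

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
try:
    df.index = [0, 1]
except ValueError as err:
    print(err)  # Length mismatch: Expected axis has 3 elements, ...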
+ """ + raise AbstractMethodError(self) + + @final + def equals(self, other: object) -> bool: + """ + Implementation for DataFrame.equals + """ + if not isinstance(other, type(self)): + return False + + self_axes, other_axes = self.axes, other.axes + if len(self_axes) != len(other_axes): + return False + if not all(ax1.equals(ax2) for ax1, ax2 in zip(self_axes, other_axes)): + return False + + return self._equal_values(other) + def apply( self, f, @@ -367,10 +441,108 @@ def apply( out = type(self).from_blocks(result_blocks, self.axes) return out - # Alias so we can share code with ArrayManager - apply_with_block = apply + @final + def isna(self, func) -> Self: + return self.apply("apply", func=func) + + @final + def fillna(self, value, limit: int | None, inplace: bool) -> Self: + if limit is not None: + # Do this validation even if we go through one of the no-op paths + limit = libalgos.validate_limit(None, limit=limit) + + return self.apply( + "fillna", + value=value, + limit=limit, + inplace=inplace, + ) + + @final + def where(self, other, cond, align: bool) -> Self: + if align: + align_keys = ["other", "cond"] + else: + align_keys = ["cond"] + other = extract_array(other, extract_numpy=True) + + return self.apply( + "where", + align_keys=align_keys, + other=other, + cond=cond, + ) + + @final + def putmask(self, mask, new, align: bool = True) -> Self: + if align: + align_keys = ["new", "mask"] + else: + align_keys = ["mask"] + new = extract_array(new, extract_numpy=True) + + return self.apply( + "putmask", + align_keys=align_keys, + mask=mask, + new=new, + ) + + @final + def round(self, decimals: int) -> Self: + return self.apply("round", decimals=decimals) + + @final + def replace(self, to_replace, value, inplace: bool) -> Self: + inplace = validate_bool_kwarg(inplace, "inplace") + # NDFrame.replace ensures the not-is_list_likes here + assert not lib.is_list_like(to_replace) + assert not lib.is_list_like(value) + return self.apply( + "replace", + to_replace=to_replace, + value=value, + inplace=inplace, + ) + + @final + def replace_regex(self, **kwargs) -> Self: + return self.apply("_replace_regex", **kwargs) + + @final + def replace_list( + self, + src_list: list[Any], + dest_list: list[Any], + inplace: bool = False, + regex: bool = False, + ) -> Self: + """do a list replace""" + inplace = validate_bool_kwarg(inplace, "inplace") + + bm = self.apply( + "replace_list", + src_list=src_list, + dest_list=dest_list, + inplace=inplace, + regex=regex, + ) + bm._consolidate_inplace() + return bm + + def interpolate(self, inplace: bool, **kwargs) -> Self: + return self.apply("interpolate", inplace=inplace, **kwargs) + + def pad_or_backfill(self, inplace: bool, **kwargs) -> Self: + return self.apply("pad_or_backfill", inplace=inplace, **kwargs) + + def shift(self, periods: int, fill_value) -> Self: + if fill_value is lib.no_default: + fill_value = None - def setitem(self, indexer, value, warn: bool = True) -> Self: + return self.apply("shift", periods=periods, fill_value=fill_value) + + def setitem(self, indexer, value) -> Self: """ Set values with indexer. 
@@ -379,14 +551,7 @@ def setitem(self, indexer, value, warn: bool = True) -> Self: if isinstance(indexer, np.ndarray) and indexer.ndim > self.ndim: raise ValueError(f"Cannot set values with ndim > {self.ndim}") - if warn and warn_copy_on_write() and not self._has_no_reference(0): - warnings.warn( - COW_WARNING_GENERAL_MSG, - FutureWarning, - stacklevel=find_stack_level(), - ) - - elif using_copy_on_write() and not self._has_no_reference(0): + if not self._has_no_reference(0): # this method is only called if there is a single block -> hardcoded 0 # Split blocks to only copy the columns we want to modify if self.ndim == 2 and isinstance(indexer, tuple): @@ -419,43 +584,14 @@ def diff(self, n: int) -> Self: # only reached with self.ndim == 2 return self.apply("diff", n=n) - def astype(self, dtype, copy: bool | None = False, errors: str = "raise") -> Self: - if copy is None: - if using_copy_on_write(): - copy = False - else: - copy = True - elif using_copy_on_write(): - copy = False + def astype(self, dtype, errors: str = "raise") -> Self: + return self.apply("astype", dtype=dtype, errors=errors) - return self.apply( - "astype", - dtype=dtype, - copy=copy, - errors=errors, - using_cow=using_copy_on_write(), - ) - - def convert(self, copy: bool | None) -> Self: - if copy is None: - if using_copy_on_write(): - copy = False - else: - copy = True - elif using_copy_on_write(): - copy = False - - return self.apply("convert", copy=copy, using_cow=using_copy_on_write()) + def convert(self) -> Self: + return self.apply("convert") def convert_dtypes(self, **kwargs): - if using_copy_on_write(): - copy = False - else: - copy = True - - return self.apply( - "convert_dtypes", copy=copy, using_cow=using_copy_on_write(), **kwargs - ) + return self.apply("convert_dtypes", **kwargs) def get_values_for_csv( self, *, float_format, date_format, decimal, na_rep: str = "nan", quoting=None @@ -488,7 +624,7 @@ def is_view(self) -> bool: # e.g. [ b.values.base is not None for b in self.blocks ] # but then we have the case of possibly some blocks being a view # and some blocks not. setting in theory is possible on the non-view - # blocks w/o causing a SettingWithCopy raise/warn. But this is a bit + # blocks. But this is a bit # complicated return False @@ -510,8 +646,7 @@ def get_bool_data(self) -> Self: new_blocks.append(blk) elif blk.is_object: - nbs = blk._split() - new_blocks.extend(nb for nb in nbs if nb.is_bool) + new_blocks.extend(nb for nb in blk._split() if nb.is_bool) return self._combine(new_blocks) @@ -555,7 +690,7 @@ def _combine(self, blocks: list[Block], index: Index | None = None) -> Self: def nblocks(self) -> int: return len(self.blocks) - def copy(self, deep: bool | None | Literal["all"] = True) -> Self: + def copy(self, deep: bool | Literal["all"] = True) -> Self: """ Make deep or shallow copy of BlockManager @@ -569,14 +704,6 @@ def copy(self, deep: bool | None | Literal["all"] = True) -> Self: ------- BlockManager """ - if deep is None: - if using_copy_on_write(): - # use shallow copy - deep = False - else: - # preserve deep copy for BlockManager with copy=None - deep = True - # this preserves the notion of view copying of axes if deep: # hit in e.g. 
tests.io.json.test_pandas @@ -586,10 +713,7 @@ def copy_func(ax): new_axes = [copy_func(ax) for ax in self.axes] else: - if using_copy_on_write(): - new_axes = [ax.view() for ax in self.axes] - else: - new_axes = list(self.axes) + new_axes = [ax.view() for ax in self.axes] res = self.apply("copy", deep=deep) res.axes = new_axes @@ -605,6 +729,9 @@ def copy_func(ax): res._consolidate_inplace() return res + def is_consolidated(self) -> bool: + return True + def consolidate(self) -> Self: """ Join together blocks having same dtype @@ -621,6 +748,30 @@ def consolidate(self) -> Self: bm._consolidate_inplace() return bm + def _consolidate_inplace(self) -> None: + return + + @final + def reindex_axis( + self, + new_index: Index, + axis: AxisInt, + fill_value=None, + only_slice: bool = False, + ) -> Self: + """ + Conform data manager to new index. + """ + new_index, indexer = self.axes[axis].reindex(new_index) + + return self.reindex_indexer( + new_index, + indexer, + axis=axis, + fill_value=fill_value, + only_slice=only_slice, + ) + def reindex_indexer( self, new_axis: Index, @@ -628,7 +779,6 @@ def reindex_indexer( axis: AxisInt, fill_value=None, allow_dups: bool = False, - copy: bool | None = True, only_slice: bool = False, *, use_na_proxy: bool = False, @@ -641,8 +791,6 @@ def reindex_indexer( axis : int fill_value : object, default None allow_dups : bool, default False - copy : bool or None, default True - If None, regard as False to get shallow copy. only_slice : bool, default False Whether to take views, not copies, along columns. use_na_proxy : bool, default False @@ -650,19 +798,11 @@ def reindex_indexer( pandas-indexer with -1's only. """ - if copy is None: - if using_copy_on_write(): - # use shallow copy - copy = False - else: - # preserve deep copy for BlockManager with copy=None - copy = True - if indexer is None: - if new_axis is self.axes[axis] and not copy: + if new_axis is self.axes[axis]: return self - result = self.copy(deep=copy) + result = self.copy(deep=False) result.axes = list(self.axes) result.axes[axis] = new_axis return result @@ -678,11 +818,13 @@ def reindex_indexer( raise IndexError("Requested axis not found in manager") if axis == 0: - new_blocks = self._slice_take_blocks_ax0( - indexer, - fill_value=fill_value, - only_slice=only_slice, - use_na_proxy=use_na_proxy, + new_blocks = list( + self._slice_take_blocks_ax0( + indexer, + fill_value=fill_value, + only_slice=only_slice, + use_na_proxy=use_na_proxy, + ) ) else: new_blocks = [ @@ -714,7 +856,7 @@ def _slice_take_blocks_ax0( *, use_na_proxy: bool = False, ref_inplace_op: bool = False, - ) -> list[Block]: + ) -> Generator[Block]: """ Slice/take blocks along axis=0. 
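Note: the manager-level setitem a few hunks above now unconditionally takes the Copy-on-Write path, copying (or splitting) blocks that are still referenced elsewhere before writing; the user-level effect, assuming CoW semantics (the default in pandas 3):

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
view = df[:]               # shares blocks with df, tracked via refs
df.iloc[0, 0] = 99         # _has_no_reference is False -> copy, then write
print(view["a"].tolist())  # [1, 2, 3]: the view is untouched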
@@ -732,9 +874,9 @@ def _slice_take_blocks_ax0( ref_inplace_op: bool, default False Don't track refs if True because we operate inplace - Returns - ------- - new_blocks : list of Block + Yields + ------ + Block : New Block """ allow_fill = fill_value is not lib.no_default @@ -749,9 +891,10 @@ def _slice_take_blocks_ax0( # GH#32959 EABlock would fail since we can't make 0-width # TODO(EA2D): special casing unnecessary with 2D EAs if sllen == 0: - return [] + return bp = BlockPlacement(slice(0, sllen)) - return [blk.getitem_block_columns(slobj, new_mgr_locs=bp)] + yield blk.getitem_block_columns(slobj, new_mgr_locs=bp) + return elif not allow_fill or self.ndim == 1: if allow_fill and fill_value is None: fill_value = blk.fill_value @@ -759,25 +902,21 @@ def _slice_take_blocks_ax0( if not allow_fill and only_slice: # GH#33597 slice instead of take, so we get # views instead of copies - blocks = [ - blk.getitem_block_columns( + for i, ml in enumerate(slobj): + yield blk.getitem_block_columns( slice(ml, ml + 1), new_mgr_locs=BlockPlacement(i), ref_inplace_op=ref_inplace_op, ) - for i, ml in enumerate(slobj) - ] - return blocks else: bp = BlockPlacement(slice(0, sllen)) - return [ - blk.take_nd( - slobj, - axis=0, - new_mgr_locs=bp, - fill_value=fill_value, - ) - ] + yield blk.take_nd( + slobj, + axis=0, + new_mgr_locs=bp, + fill_value=fill_value, + ) + return if sl_type == "slice": blknos = self.blknos[slobj] @@ -792,18 +931,15 @@ def _slice_take_blocks_ax0( # When filling blknos, make sure blknos is updated before appending to # blocks list, that way new blkno is exactly len(blocks). - blocks = [] group = not only_slice for blkno, mgr_locs in libinternals.get_blkno_placements(blknos, group=group): if blkno == -1: # If we've got here, fill_value was not lib.no_default - blocks.append( - self._make_na_block( - placement=mgr_locs, - fill_value=fill_value, - use_na_proxy=use_na_proxy, - ) + yield self._make_na_block( + placement=mgr_locs, + fill_value=fill_value, + use_na_proxy=use_na_proxy, ) else: blk = self.blocks[blkno] @@ -814,23 +950,22 @@ def _slice_take_blocks_ax0( # A non-consolidatable block, it's easy, because there's # only one item and each mgr loc is a copy of that single # item. 
- deep = not (only_slice or using_copy_on_write()) + deep = False for mgr_loc in mgr_locs: newblk = blk.copy(deep=deep) newblk.mgr_locs = BlockPlacement(slice(mgr_loc, mgr_loc + 1)) - blocks.append(newblk) + yield newblk else: # GH#32779 to avoid the performance penalty of copying, # we may try to only slice taker = blklocs[mgr_locs.indexer] max_len = max(len(mgr_locs), taker.max() + 1) - if only_slice or using_copy_on_write(): - taker = lib.maybe_indices_to_slice(taker, max_len) + taker = lib.maybe_indices_to_slice(taker, max_len) if isinstance(taker, slice): nb = blk.getitem_block_columns(taker, new_mgr_locs=mgr_locs) - blocks.append(nb) + yield nb elif only_slice: # GH#33597 slice instead of take, so we get # views instead of copies @@ -839,12 +974,10 @@ def _slice_take_blocks_ax0( bp = BlockPlacement(ml) nb = blk.getitem_block_columns(slc, new_mgr_locs=bp) # We have np.shares_memory(nb.values, blk.values) - blocks.append(nb) + yield nb else: nb = blk.take_nd(taker, axis=0, new_mgr_locs=mgr_locs) - blocks.append(nb) - - return blocks + yield nb def _make_na_block( self, placement: BlockPlacement, fill_value=None, use_na_proxy: bool = False @@ -858,8 +991,12 @@ def _make_na_block( nb = NumpyBlock(vals, placement, ndim=2) return nb - if fill_value is None: + if fill_value is None or fill_value is np.nan: fill_value = np.nan + # GH45857 avoid unnecessary upcasting + dtype = interleaved_dtype([blk.dtype for blk in self.blocks]) + if dtype is not None and np.issubdtype(dtype.type, np.floating): + fill_value = dtype.type(fill_value) shape = (len(placement), self.shape[1]) @@ -897,7 +1034,6 @@ def take( indexer=indexer, axis=axis, allow_dups=True, - copy=None, ) @@ -971,7 +1107,9 @@ def fast_xs(self, loc: int) -> SingleBlockManager: if len(self.blocks) == 1: # TODO: this could be wrong if blk.mgr_locs is not slice(None)-like; # is this ruled out in the general case? - result = self.blocks[0].iget((slice(None), loc)) + result: np.ndarray | ExtensionArray = self.blocks[0].iget( + (slice(None), loc) + ) # in the case of a single block, the new block is a view bp = BlockPlacement(slice(0, len(result))) block = new_block( @@ -1141,7 +1279,7 @@ def value_getitem(placement): blk_locs = blklocs[val_locs.indexer] if inplace and blk.should_store(value): # Updating inplace -> check if we need to do Copy-on-Write - if using_copy_on_write() and not self._has_no_reference_block(blkno_l): + if not self._has_no_reference_block(blkno_l): self._iset_split_block( blkno_l, blk_locs, value_getitem(val_locs), refs=refs ) @@ -1160,7 +1298,7 @@ def value_getitem(placement): # Defer setting the new values to enable consolidation self._iset_split_block(blkno_l, blk_locs, refs=refs) - if len(removed_blknos): + if removed_blknos: # Remove blocks & update blknos accordingly is_deleted = np.zeros(self.nblocks, dtype=np.bool_) is_deleted[removed_blknos] = True @@ -1224,7 +1362,7 @@ def _iset_split_block( """Removes columns from a block by splitting the block. Avoids copying the whole block through slicing and updates the manager - after determinint the new block structure. Optionally adds a new block, + after determining the new block structure. Optionally adds a new block, otherwise has to be done by the caller. 
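Note: _make_na_block above now casts the NaN fill to the manager's common float dtype (GH#45857); a hedged sketch of the intended effect, where the resulting dtype is an assumption read off the hunk rather than a documented guarantee:

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": np.array([1.5, 2.5], dtype="float32")})
res = df.reindex(columns=["a", "b"])
print(res["b"].dtype)  # expected float32 rather than an upcast float64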
Parameters @@ -1282,10 +1420,7 @@ def _iset_single( # Caller is responsible for verifying value.shape if inplace and blk.should_store(value): - copy = False - if using_copy_on_write() and not self._has_no_reference_block(blkno): - # perform Copy-on-Write and clear the reference - copy = True + copy = not self._has_no_reference_block(blkno) iloc = self.blklocs[loc] blk.set_inplace(slice(iloc, iloc + 1), value, copy=copy) return @@ -1305,17 +1440,7 @@ def column_setitem( This is a method on the BlockManager level, to avoid creating an intermediate Series at the DataFrame level (`s = df[loc]; s[idx] = value`) """ - needs_to_warn = False - if warn_copy_on_write() and not self._has_no_reference(loc): - if not isinstance( - self.blocks[self.blknos[loc]].values, - (ArrowExtensionArray, ArrowStringArray), - ): - # We might raise if we are in an expansion case, so defer - # warning till we actually updated - needs_to_warn = True - - elif using_copy_on_write() and not self._has_no_reference(loc): + if not self._has_no_reference(loc): blkno = self.blknos[loc] # Split blocks to only copy the column we want to modify blk_loc = self.blklocs[loc] @@ -1338,13 +1463,6 @@ def column_setitem( new_mgr = col_mgr.setitem((idx,), value) self.iset(loc, new_mgr._block.values, inplace=True) - if needs_to_warn: - warnings.warn( - COW_WARNING_GENERAL_MSG, - FutureWarning, - stacklevel=find_stack_level(), - ) - def insert(self, loc: int, item: Hashable, value: ArrayLike, refs=None) -> None: """ Insert item at selected position. @@ -1356,14 +1474,7 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike, refs=None) -> None: value : np.ndarray or ExtensionArray refs : The reference tracking object of the value to set. """ - with warnings.catch_warnings(): - # TODO: re-issue this with setitem-specific message? - warnings.filterwarnings( - "ignore", - "The behavior of Index.insert with object-dtype is deprecated", - category=FutureWarning, - ) - new_axis = self.items.insert(loc, item) + new_axis = self.items.insert(loc, item) if value.ndim == 2: value = value.T @@ -1390,7 +1501,10 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike, refs=None) -> None: self._known_consolidated = False - if sum(not block.is_extension for block in self.blocks) > 100: + if ( + get_option("performance_warnings") + and sum(not block.is_extension for block in self.blocks) > 100 + ): warnings.warn( "DataFrame is highly fragmented. This is usually the result " "of calling `frame.insert` many times, which has poor performance. " @@ -1405,7 +1519,9 @@ def _insert_update_mgr_locs(self, loc) -> None: When inserting a new Block at location 'loc', we increment all of the mgr_locs of blocks above that by one. """ - for blkno, count in _fast_count_smallints(self.blknos[loc:]): + # Faster version of set(arr) for sequences of small numbers + blknos = np.bincount(self.blknos[loc:]).nonzero()[0] + for blkno in blknos: # .620 this way, .326 of which is in increment_above blk = self.blocks[blkno] blk._mgr_locs = blk._mgr_locs.increment_above(loc) @@ -1422,9 +1538,9 @@ def _insert_update_blklocs_and_blknos(self, loc) -> None: self._blklocs = np.append(self._blklocs, 0) self._blknos = np.append(self._blknos, len(self.blocks)) elif loc == 0: - # np.append is a lot faster, let's use it if we can. 
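Note: the np.bincount(...).nonzero()[0] idiom above returns the sorted distinct values of a small-integer array, replacing a Python-level set:

import numpy as np

blknos = np.array([2, 0, 2, 5, 0])
print(np.bincount(blknos).nonzero()[0])  # [0 2 5]
print(sorted(set(blknos.tolist())))      # [0, 2, 5]: same values, slower path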
- self._blklocs = np.append(self._blklocs[::-1], 0)[::-1] - self._blknos = np.append(self._blknos[::-1], len(self.blocks))[::-1] + # As of numpy 1.26.4, np.concatenate faster than np.append + self._blklocs = np.concatenate([[0], self._blklocs]) + self._blknos = np.concatenate([[len(self.blocks)], self._blknos]) else: new_blklocs, new_blknos = libinternals.update_blklocs_and_blknos( self.blklocs, self.blknos, loc, len(self.blocks) @@ -1477,7 +1593,7 @@ def grouped_reduce(self, func: Callable) -> Self: nrows = 0 else: nrows = result_blocks[0].values.shape[-1] - index = Index(range(nrows)) + index = default_index(nrows) return type(self).from_blocks(result_blocks, [self.axes[0], index]) @@ -1615,21 +1731,18 @@ def unstack(self, unstacker, fill_value) -> BlockManager: bm = BlockManager(new_blocks, [new_columns, new_index], verify_integrity=False) return bm - def to_dict(self) -> dict[str, Self]: + def to_iter_dict(self) -> Generator[tuple[str, Self]]: """ - Return a dict of str(dtype) -> BlockManager + Yield a tuple of (str(dtype), BlockManager) Returns ------- - values : a dict of dtype -> BlockManager + values : a tuple of (str(dtype), BlockManager) """ - - bd: dict[str, list[Block]] = {} - for b in self.blocks: - bd.setdefault(str(b.dtype), []).append(b) - - # TODO(EA2D): the combine will be unnecessary with 2D EAs - return {dtype: self._combine(blocks) for dtype, blocks in bd.items()} + key = lambda block: str(block.dtype) + for dtype, blocks in itertools.groupby(sorted(self.blocks, key=key), key=key): + # TODO(EA2D): the combine will be unnecessary with 2D EAs + yield dtype, self._combine(list(blocks)) def as_array( self, @@ -1683,10 +1796,12 @@ def as_array( na_value=na_value, copy=copy, ).reshape(blk.shape) + elif not copy: + arr = np.asarray(blk.values, dtype=dtype) else: arr = np.array(blk.values, dtype=dtype, copy=copy) - if using_copy_on_write() and not copy: + if not copy: arr = arr.view() arr.flags.writeable = False else: @@ -1821,7 +1936,7 @@ def concat_vertical(cls, mgrs: list[Self], axes: list[Index]) -> Self: raise NotImplementedError("This logic lives (for now) in internals.concat") -class SingleBlockManager(BaseBlockManager, SingleDataManager): +class SingleBlockManager(BaseBlockManager): """manage a single block with""" @property @@ -1940,20 +2055,30 @@ def _post_setstate(self) -> None: def _block(self) -> Block: return self.blocks[0] + @final @property - def _blknos(self): + def array(self) -> ArrayLike: + """ + Quick access to the backing array of the Block. + """ + return self.blocks[0].values + + # error: Cannot override writeable attribute with read-only property + @property + def _blknos(self) -> None: # type: ignore[override] """compat with BlockManager""" return None + # error: Cannot override writeable attribute with read-only property @property - def _blklocs(self): + def _blklocs(self) -> None: # type: ignore[override] """compat with BlockManager""" return None def get_rows_with_mask(self, indexer: npt.NDArray[np.bool_]) -> Self: # similar to get_slice, but not restricted to slice indexer blk = self._block - if using_copy_on_write() and len(indexer) > 0 and indexer.all(): + if len(indexer) > 0 and indexer.all(): return type(self)(blk.copy(deep=False), self.index) array = blk.values[indexer] @@ -2017,30 +2142,33 @@ def get_numeric_data(self) -> Self: def _can_hold_na(self) -> bool: return self._block._can_hold_na - def setitem_inplace(self, indexer, value, warn: bool = True) -> None: + def setitem_inplace(self, indexer, value) -> None: """ Set values with indexer. 
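[Editor's note] The manager-level `to_dict` becomes the lazy `to_iter_dict` above, built on sort-then-`itertools.groupby`. The pattern in isolation (plain strings stand in for block dtypes; illustrative only):

```python
import itertools

# groupby only merges *adjacent* equal keys, hence the sort by dtype
# string first; each dtype then comes out exactly once, lazily.
dtypes = ["int64", "float64", "int64", "object"]  # stand-ins for block dtypes
for dtype, group in itertools.groupby(sorted(dtypes)):
    print(dtype, list(group))
```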
- For Single[Block/Array]Manager, this backs s[indexer] = value + For SingleBlockManager, this backs s[indexer] = value This is an inplace version of `setitem()`, mutating the manager/values in place, not returning a new Manager (and Block), and thus never changing the dtype. """ - using_cow = using_copy_on_write() - warn_cow = warn_copy_on_write() - if (using_cow or warn_cow) and not self._has_no_reference(0): - if using_cow: - self.blocks = (self._block.copy(),) - self._cache.clear() - elif warn_cow and warn: - warnings.warn( - COW_WARNING_SETITEM_MSG, - FutureWarning, - stacklevel=find_stack_level(), - ) + if not self._has_no_reference(0): + self.blocks = (self._block.copy(),) + self._reset_cache() - super().setitem_inplace(indexer, value) + arr = self.array + + # EAs will do this validation in their own __setitem__ methods. + if isinstance(arr, np.ndarray): + # Note: checking for ndarray instead of np.dtype means we exclude + # dt64/td64, which do their own validation. + value = np_can_hold_element(arr.dtype, value) + + if isinstance(value, np.ndarray) and value.ndim == 1 and len(value) == 1: + # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615 + value = value[0, ...] + + arr[indexer] = value def idelete(self, indexer) -> SingleBlockManager: """ @@ -2051,7 +2179,7 @@ def idelete(self, indexer) -> SingleBlockManager: nb = self._block.delete(indexer)[0] self.blocks = (nb,) self.axes[0] = self.axes[0].delete(indexer) - self._cache.clear() + self._reset_cache() return self def fast_xs(self, loc): @@ -2086,6 +2214,14 @@ def _equal_values(self, other: Self) -> bool: right = other.blocks[0].values return array_equals(left, right) + def grouped_reduce(self, func): + arr = self.array + res = func(arr) + index = default_index(len(res)) + + mgr = type(self).from_array(res, index) + return mgr + # -------------------------------------------------------------------- # Constructor Helpers @@ -2149,7 +2285,7 @@ def raise_construction_error( block_shape: Shape, axes: list[Index], e: ValueError | None = None, -): +) -> NoReturn: """raise a helpful message about our construction""" passed = tuple(map(int, [tot_items] + list(block_shape))) # Correcting the user facing error message during dataframe construction @@ -2187,7 +2323,7 @@ def _grouping_func(tup: tuple[int, ArrayLike]) -> tuple[int, DtypeObj]: def _form_blocks(arrays: list[ArrayLike], consolidate: bool, refs: list) -> list[Block]: - tuples = list(enumerate(arrays)) + tuples = enumerate(arrays) if not consolidate: return _tuples_to_blocks_no_consolidate(tuples, refs) @@ -2208,7 +2344,7 @@ def _form_blocks(arrays: list[ArrayLike], consolidate: bool, refs: list) -> list if issubclass(dtype.type, (str, bytes)): dtype = np.dtype(object) - values, placement = _stack_arrays(list(tup_block), dtype) + values, placement = _stack_arrays(tup_block, dtype) if is_dtlike: values = ensure_wrapped_if_datetimelike(values) blk = block_type(values, placement=BlockPlacement(placement), ndim=2) @@ -2307,15 +2443,6 @@ def _merge_blocks( return blocks, False -def _fast_count_smallints(arr: npt.NDArray[np.intp]): - """Faster version of set(arr) for sequences of small numbers.""" - counts = np.bincount(arr) - nz = counts.nonzero()[0] - # Note: list(zip(...) 
outperforms list(np.c_[nz, counts[nz]]) here, - # in one benchmark by a factor of 11 - return zip(nz, counts[nz]) - - def _preprocess_slice_or_indexer( slice_or_indexer: slice | np.ndarray, length: int, allow_fill: bool ): @@ -2366,9 +2493,9 @@ def make_na_array(dtype: DtypeObj, shape: Shape, fill_value) -> ArrayLike: else: # NB: we should never get here with dtype integer or bool; # if we did, the missing_arr.fill would cast to gibberish - missing_arr = np.empty(shape, dtype=dtype) - missing_arr.fill(fill_value) + missing_arr_np = np.empty(shape, dtype=dtype) + missing_arr_np.fill(fill_value) if dtype.kind in "mM": - missing_arr = ensure_wrapped_if_datetimelike(missing_arr) - return missing_arr + missing_arr_np = ensure_wrapped_if_datetimelike(missing_arr_np) + return missing_arr_np diff --git a/pandas/core/methods/describe.py b/pandas/core/methods/describe.py index c620bb9d17976..944e28a9b0238 100644 --- a/pandas/core/methods/describe.py +++ b/pandas/core/methods/describe.py @@ -3,6 +3,7 @@ Method NDFrame.describe() delegates actual execution to function describe_ndframe(). """ + from __future__ import annotations from abc import ( @@ -11,13 +12,11 @@ ) from typing import ( TYPE_CHECKING, - Callable, cast, ) import numpy as np -from pandas._libs.tslibs import Timestamp from pandas._typing import ( DtypeObj, NDFrameT, @@ -42,6 +41,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Sequence, ) @@ -173,8 +173,9 @@ def describe(self, percentiles: Sequence[float] | np.ndarray) -> DataFrame: col_names = reorder_columns(ldesc) d = concat( - [x.reindex(col_names, copy=False) for x in ldesc], + [x.reindex(col_names) for x in ldesc], axis=1, + ignore_index=True, sort=False, ) d.columns = data.columns.copy() @@ -228,10 +229,15 @@ def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series: formatted_percentiles = format_percentiles(percentiles) + if len(percentiles) == 0: + quantiles = [] + else: + quantiles = series.quantile(percentiles).tolist() + stat_index = ["count", "mean", "std", "min"] + formatted_percentiles + ["max"] d = ( [series.count(), series.mean(), series.std(), series.min()] - + series.quantile(percentiles).tolist() + + quantiles + [series.max()] ) # GH#48340 - always return float on non-complex numeric data @@ -287,54 +293,6 @@ def describe_categorical_1d( return Series(result, index=names, name=data.name, dtype=dtype) -def describe_timestamp_as_categorical_1d( - data: Series, - percentiles_ignored: Sequence[float], -) -> Series: - """Describe series containing timestamp data treated as categorical. - - Parameters - ---------- - data : Series - Series to be described. - percentiles_ignored : list-like of numbers - Ignored, but in place to unify interface. 
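[Editor's note] `describe_numeric_1d` above now short-circuits an empty percentile list instead of calling `Series.quantile([])`, a case that becomes reachable once `_refine_percentiles` (later in this diff) stops force-appending the median. A runnable sketch of the assembled stats row under that guard:

```python
import numpy as np
import pandas as pd

s = pd.Series([1.0, 2.0, np.nan, 4.0])
percentiles: list[float] = []  # e.g. describe(percentiles=[])
# Mirrors the hunk: skip Series.quantile entirely when there are none.
quantiles = [] if len(percentiles) == 0 else s.quantile(percentiles).tolist()
row = [s.count(), s.mean(), s.std(), s.min(), *quantiles, s.max()]
print(row)  # count, mean, std, min, (no percentiles), max
```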
- """ - names = ["count", "unique"] - objcounts = data.value_counts() - count_unique = len(objcounts[objcounts != 0]) - result: list[float | Timestamp] = [data.count(), count_unique] - dtype = None - if count_unique > 0: - top, freq = objcounts.index[0], objcounts.iloc[0] - tz = data.dt.tz - asint = data.dropna().values.view("i8") - top = Timestamp(top) - if top.tzinfo is not None and tz is not None: - # Don't tz_localize(None) if key is already tz-aware - top = top.tz_convert(tz) - else: - top = top.tz_localize(tz) - names += ["top", "freq", "first", "last"] - result += [ - top, - freq, - Timestamp(asint.min(), tz=tz), - Timestamp(asint.max(), tz=tz), - ] - - # If the DataFrame is empty, set 'top' and 'freq' to None - # to maintain output shape consistency - else: - names += ["top", "freq"] - result += [np.nan, np.nan] - dtype = "object" - - from pandas import Series - - return Series(result, index=names, name=data.name, dtype=dtype) - - def describe_timestamp_1d(data: Series, percentiles: Sequence[float]) -> Series: """Describe series containing datetime64 dtype. @@ -401,10 +359,6 @@ def _refine_percentiles( # get them all to be in [0, 1] validate_percentile(percentiles) - # median should always be included - if 0.5 not in percentiles: - percentiles.append(0.5) - percentiles = np.asarray(percentiles) # sort and check for duplicates diff --git a/pandas/core/methods/selectn.py b/pandas/core/methods/selectn.py index a2f8ca94134b8..59516b16905dc 100644 --- a/pandas/core/methods/selectn.py +++ b/pandas/core/methods/selectn.py @@ -10,6 +10,8 @@ ) from typing import ( TYPE_CHECKING, + Generic, + Literal, cast, final, ) @@ -28,20 +30,34 @@ ) from pandas.core.dtypes.dtypes import BaseMaskedDtype +from pandas.core.indexes.api import default_index + if TYPE_CHECKING: from pandas._typing import ( DtypeObj, IndexLabel, + NDFrameT, ) from pandas import ( DataFrame, + Index, Series, ) +else: + # Generic[...] 
requires a non-str, provide it with a plain TypeVar at + # runtime to avoid circular imports + from pandas._typing import T + + NDFrameT = T + DataFrame = T + Series = T -class SelectN: - def __init__(self, obj, n: int, keep: str) -> None: +class SelectN(Generic[NDFrameT]): + def __init__( + self, obj: NDFrameT, n: int, keep: Literal["first", "last", "all"] + ) -> None: self.obj = obj self.n = n self.keep = keep @@ -49,15 +65,15 @@ def __init__(self, obj, n: int, keep: str) -> None: if self.keep not in ("first", "last", "all"): raise ValueError('keep must be either "first", "last" or "all"') - def compute(self, method: str) -> DataFrame | Series: + def compute(self, method: str) -> NDFrameT: raise NotImplementedError @final - def nlargest(self): + def nlargest(self) -> NDFrameT: return self.compute("nlargest") @final - def nsmallest(self): + def nsmallest(self) -> NDFrameT: return self.compute("nsmallest") @final @@ -72,7 +88,7 @@ def is_valid_dtype_n_method(dtype: DtypeObj) -> bool: return needs_i8_conversion(dtype) -class SelectNSeries(SelectN): +class SelectNSeries(SelectN[Series]): """ Implement n largest/smallest for Series @@ -98,15 +114,25 @@ def compute(self, method: str) -> Series: if n <= 0: return self.obj[[]] - dropped = self.obj.dropna() - nan_index = self.obj.drop(dropped.index) + # Save index and reset to default index to avoid performance impact + # from when index contains duplicates + original_index: Index = self.obj.index + default_index = self.obj.reset_index(drop=True) - # slow method - if n >= len(self.obj): + # Slower method used when taking the full length of the series + # In this case, it is equivalent to a sort. + if n >= len(default_index): ascending = method == "nsmallest" - return self.obj.sort_values(ascending=ascending).head(n) + result = default_index.sort_values(ascending=ascending, kind="stable").head( + n + ) + result.index = original_index.take(result.index) + return result + + # Fast method used in the general case + dropped = default_index.dropna() + nan_index = default_index.drop(dropped.index) - # fast method new_dtype = dropped.dtype # Similar to algorithms._ensure_data @@ -145,7 +171,7 @@ def compute(self, method: str) -> Series: else: kth_val = np.nan (ns,) = np.nonzero(arr <= kth_val) - inds = ns[arr[ns].argsort(kind="mergesort")] + inds = ns[arr[ns].argsort(kind="stable")] if self.keep != "all": inds = inds[:n] @@ -160,10 +186,12 @@ def compute(self, method: str) -> Series: # reverse indices inds = narr - 1 - inds - return concat([dropped.iloc[inds], nan_index]).iloc[:findex] + result = concat([dropped.iloc[inds], nan_index]).iloc[:findex] + result.index = original_index.take(result.index) + return result -class SelectNFrame(SelectN): +class SelectNFrame(SelectN[DataFrame]): """ Implement n largest/smallest for DataFrame @@ -179,7 +207,13 @@ class SelectNFrame(SelectN): nordered : DataFrame """ - def __init__(self, obj: DataFrame, n: int, keep: str, columns: IndexLabel) -> None: + def __init__( + self, + obj: DataFrame, + n: int, + keep: Literal["first", "last", "all"], + columns: IndexLabel, + ) -> None: super().__init__(obj, n, keep) if not is_list_like(columns) or isinstance(columns, tuple): columns = [columns] @@ -189,8 +223,6 @@ def __init__(self, obj: DataFrame, n: int, keep: str, columns: IndexLabel) -> No self.columns = columns def compute(self, method: str) -> DataFrame: - from pandas.core.api import Index - n = self.n frame = self.obj columns = self.columns @@ -199,11 +231,11 @@ def compute(self, method: str) -> DataFrame: dtype = 
frame[column].dtype
            if not self.is_valid_dtype_n_method(dtype):
                raise TypeError(
-                    f"Column {repr(column)} has dtype {dtype}, "
-                    f"cannot use method {repr(method)} with this dtype"
+                    f"Column {column!r} has dtype {dtype}, "
+                    f"cannot use method {method!r} with this dtype"
                )

-        def get_indexer(current_indexer, other_indexer):
+        def get_indexer(current_indexer: Index, other_indexer: Index) -> Index:
             """
             Helper function to concat `current_indexer` and `other_indexer`
             depending on `method`
@@ -217,7 +249,7 @@ def get_indexer(current_indexer, other_indexer):
         original_index = frame.index
         cur_frame = frame = frame.reset_index(drop=True)
         cur_n = n
-        indexer = Index([], dtype=np.int64)
+        indexer: Index = default_index(0)

         for i, column in enumerate(columns):
             # For each column we apply method to cur_frame[column].
@@ -266,4 +298,4 @@ def get_indexer(current_indexer, other_indexer):

         ascending = method == "nsmallest"

-        return frame.sort_values(columns, ascending=ascending, kind="mergesort")
+        return frame.sort_values(columns, ascending=ascending, kind="stable")
diff --git a/pandas/core/methods/to_dict.py b/pandas/core/methods/to_dict.py
index 7bd4851425c3b..aea95e4684573 100644
--- a/pandas/core/methods/to_dict.py
+++ b/pandas/core/methods/to_dict.py
@@ -24,11 +24,34 @@ from pandas.core import common as com
 if TYPE_CHECKING:
+    from collections.abc import Generator
+
     from pandas._typing import MutableMappingT

     from pandas import DataFrame


+def create_data_for_split(
+    df: DataFrame, are_all_object_dtype_cols: bool, object_dtype_indices: list[int]
+) -> Generator[list]:
+    """
+    Simple helper method to create the main output data
+    for ``to_dict(orient="split")``.
+    """
+    if are_all_object_dtype_cols:
+        for tup in df.itertuples(index=False, name=None):
+            yield list(map(maybe_box_native, tup))
+    else:
+        for tup in df.itertuples(index=False, name=None):
+            data = list(tup)
+            if object_dtype_indices:
+                # If we have object_dtype_cols, apply maybe_box_native after
+                # for perf
+                for i in object_dtype_indices:
+                    data[i] = maybe_box_native(data[i])
+            yield data
+
+
 @overload
 def to_dict(
     df: DataFrame,
@@ -36,8 +59,7 @@ def to_dict(
     *,
     into: type[MutableMappingT] | MutableMappingT,
     index: bool = ...,
-) -> MutableMappingT:
-    ...
+) -> MutableMappingT: ...


 @overload
@@ -47,8 +69,7 @@ def to_dict(
     *,
     into: type[MutableMappingT] | MutableMappingT,
     index: bool = ...,
-) -> list[MutableMappingT]:
-    ...
+) -> list[MutableMappingT]: ...


 @overload
@@ -58,8 +79,7 @@ def to_dict(
     *,
     into: type[dict] = ...,
     index: bool = ...,
-) -> dict:
-    ...
+) -> dict: ...


 @overload
@@ -69,8 +89,7 @@ def to_dict(
     *,
     into: type[dict] = ...,
     index: bool = ...,
-) -> list[dict]:
-    ...
+) -> list[dict]: ...


 # error: Incompatible default for argument "into" (default has type "type[dict
@@ -129,14 +148,15 @@ def to_dict(
     Return a collections.abc.MutableMapping object representing the DataFrame.
     The resulting transformation depends on the `orient` parameter.
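[Editor's note] The `SelectNSeries.compute` fast path earlier in this hunk selects the n-th value with a partition and stable-sorts only the survivors. A self-contained sketch of that selection in plain NumPy; pandas' real code additionally handles NaNs, dtype normalization, and the `keep="last"/"all"` modes:

```python
import numpy as np

def nsmallest_indices(arr: np.ndarray, n: int) -> np.ndarray:
    # Partial sort gives the n-th smallest value in O(len(arr)).
    kth_val = np.partition(arr, n - 1)[n - 1]
    # Keep candidates <= kth value; a stable sort of just that small
    # subset preserves first-seen order among ties (keep="first").
    (ns,) = np.nonzero(arr <= kth_val)
    return ns[arr[ns].argsort(kind="stable")][:n]

print(nsmallest_indices(np.array([5.0, 1.0, 4.0, 1.0, 3.0]), 2))  # [1 3]
```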
""" - if not df.columns.is_unique: + if orient != "tight" and not df.columns.is_unique: warnings.warn( "DataFrame columns are not unique, some columns will be omitted.", UserWarning, stacklevel=find_stack_level(), ) # GH16122 - into_c = com.standardize_mapping(into) + # error: Call to untyped function "standardize_mapping" in typed context + into_c = com.standardize_mapping(into) # type: ignore[no-untyped-call] # error: Incompatible types in assignment (expression has type "str", # variable has type "Literal['dict', 'list', 'series', 'split', 'tight', @@ -152,39 +172,38 @@ def to_dict( # GH46470 Return quickly if orient series to avoid creating dtype objects return into_c((k, v) for k, v in df.items()) + if orient == "dict": + return into_c((k, v.to_dict(into=into)) for k, v in df.items()) + box_native_indices = [ i for i, col_dtype in enumerate(df.dtypes.values) if col_dtype == np.dtype(object) or isinstance(col_dtype, ExtensionDtype) ] - box_na_values = [ - lib.no_default if not isinstance(col_dtype, BaseMaskedDtype) else libmissing.NA - for i, col_dtype in enumerate(df.dtypes.values) - ] - are_all_object_dtype_cols = len(box_native_indices) == len(df.dtypes) - if orient == "dict": - return into_c((k, v.to_dict(into=into)) for k, v in df.items()) + are_all_object_dtype_cols = len(box_native_indices) == len(df.dtypes) - elif orient == "list": + if orient == "list": object_dtype_indices_as_set: set[int] = set(box_native_indices) + box_na_values = ( + lib.no_default + if not isinstance(col_dtype, BaseMaskedDtype) + else libmissing.NA + for col_dtype in df.dtypes.values + ) return into_c( ( k, - list( - map( - maybe_box_native, v.to_numpy(na_value=box_na_values[i]).tolist() - ) - ) + list(map(maybe_box_native, v.to_numpy(na_value=box_na_value))) if i in object_dtype_indices_as_set - else v.to_numpy().tolist(), + else list(map(maybe_box_native, v.to_numpy())), ) - for i, (k, v) in enumerate(df.items()) + for i, (box_na_value, (k, v)) in enumerate(zip(box_na_values, df.items())) ) elif orient == "split": - data = df._create_data_for_split_and_tight_to_dict( - are_all_object_dtype_cols, box_native_indices + data = list( + create_data_for_split(df, are_all_object_dtype_cols, box_native_indices) ) return into_c( @@ -196,10 +215,6 @@ def to_dict( ) elif orient == "tight": - data = df._create_data_for_split_and_tight_to_dict( - are_all_object_dtype_cols, box_native_indices - ) - return into_c( ((("index", df.index.tolist()),) if index else ()) + ( @@ -219,11 +234,9 @@ def to_dict( elif orient == "records": columns = df.columns.tolist() if are_all_object_dtype_cols: - rows = ( - dict(zip(columns, row)) for row in df.itertuples(index=False, name=None) - ) return [ - into_c((k, maybe_box_native(v)) for k, v in row.items()) for row in rows + into_c(zip(columns, map(maybe_box_native, row))) + for row in df.itertuples(index=False, name=None) ] else: data = [ @@ -239,7 +252,7 @@ def to_dict( for row in data: for col in object_dtype_cols: row[col] = maybe_box_native(row[col]) - return data + return data # type: ignore[return-value] elif orient == "index": if not df.index.is_unique: @@ -252,24 +265,21 @@ def to_dict( ) elif box_native_indices: object_dtype_indices_as_set = set(box_native_indices) - is_object_dtype_by_index = [ - i in object_dtype_indices_as_set for i in range(len(df.columns)) - ] return into_c( ( t[0], { - columns[i]: maybe_box_native(v) - if is_object_dtype_by_index[i] + column: maybe_box_native(v) + if i in object_dtype_indices_as_set else v - for i, v in enumerate(t[1:]) + for i, 
(column, v) in enumerate(zip(columns, t[1:])) }, ) for t in df.itertuples(name=None) ) else: return into_c( - (t[0], dict(zip(df.columns, t[1:]))) for t in df.itertuples(name=None) + (t[0], dict(zip(columns, t[1:]))) for t in df.itertuples(name=None) ) else: diff --git a/pandas/core/missing.py b/pandas/core/missing.py index d275445983b6f..66609fa870f14 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -1,12 +1,10 @@ """ Routines for filling missing data. """ + from __future__ import annotations -from functools import ( - partial, - wraps, -) +from functools import wraps from typing import ( TYPE_CHECKING, Any, @@ -34,6 +32,7 @@ from pandas.core.dtypes.cast import infer_dtype_from from pandas.core.dtypes.common import ( is_array_like, + is_bool_dtype, is_numeric_dtype, is_numeric_v_string_like, is_object_dtype, @@ -103,21 +102,34 @@ def mask_missing(arr: ArrayLike, values_to_mask) -> npt.NDArray[np.bool_]: # GH 21977 mask = np.zeros(arr.shape, dtype=bool) - for x in nonna: - if is_numeric_v_string_like(arr, x): - # GH#29553 prevent numpy deprecation warnings - pass - else: - if potential_na: - new_mask = np.zeros(arr.shape, dtype=np.bool_) - new_mask[arr_mask] = arr[arr_mask] == x + if ( + is_numeric_dtype(arr.dtype) + and not is_bool_dtype(arr.dtype) + and is_bool_dtype(nonna.dtype) + ): + pass + elif ( + is_bool_dtype(arr.dtype) + and is_numeric_dtype(nonna.dtype) + and not is_bool_dtype(nonna.dtype) + ): + pass + else: + for x in nonna: + if is_numeric_v_string_like(arr, x): + # GH#29553 prevent numpy deprecation warnings + pass else: - new_mask = arr == x + if potential_na: + new_mask = np.zeros(arr.shape, dtype=np.bool_) + new_mask[arr_mask] = arr[arr_mask] == x + else: + new_mask = arr == x - if not isinstance(new_mask, np.ndarray): - # usually BooleanArray - new_mask = new_mask.to_numpy(dtype=bool, na_value=False) - mask |= new_mask + if not isinstance(new_mask, np.ndarray): + # usually BooleanArray + new_mask = new_mask.to_numpy(dtype=bool, na_value=False) + mask |= new_mask if na_mask.any(): mask |= isna(arr) @@ -130,8 +142,7 @@ def clean_fill_method( method: Literal["ffill", "pad", "bfill", "backfill"], *, allow_nearest: Literal[False] = ..., -) -> Literal["pad", "backfill"]: - ... +) -> Literal["pad", "backfill"]: ... @overload @@ -139,8 +150,7 @@ def clean_fill_method( method: Literal["ffill", "pad", "bfill", "backfill", "nearest"], *, allow_nearest: Literal[True], -) -> Literal["pad", "backfill", "nearest"]: - ... +) -> Literal["pad", "backfill", "nearest"]: ... 
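[Editor's note] `mask_missing` above gains an early exit for bool-versus-numeric mismatches that can never compare equal. A sketch of the guard, approximating `is_bool_dtype`/`is_numeric_dtype` with dtype kinds; `trivially_empty_mask` is an illustrative name:

```python
import numpy as np

# bool values never equal numeric non-bool elements (and vice versa), so
# mask_missing can return the all-False mask without the comparison loop.
def trivially_empty_mask(arr: np.ndarray, nonna: np.ndarray) -> bool:
    return (arr.dtype.kind in "iufc" and nonna.dtype == bool) or (
        arr.dtype == bool and nonna.dtype.kind in "iufc"
    )

print(trivially_empty_mask(np.array([0, 1, 2]), np.array([True])))  # True
```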
def clean_fill_method( @@ -231,7 +241,8 @@ def find_valid_index(how: str, is_valid: npt.NDArray[np.bool_]) -> int | None: return None if is_valid.ndim == 2: - is_valid = is_valid.any(axis=1) # reduce axis 1 + # reduce axis 1 + is_valid = is_valid.any(axis=1) # type: ignore[assignment] if how == "first": idxpos = is_valid[::].argmax() @@ -302,9 +313,9 @@ def get_interp_index(method, index: Index) -> Index: # create/use the index if method == "linear": # prior default - from pandas import Index + from pandas import RangeIndex - index = Index(np.arange(len(index))) + index = RangeIndex(len(index)) else: methods = {"index", "values", "nearest", "time"} is_numeric_or_datetime = ( @@ -312,13 +323,17 @@ def get_interp_index(method, index: Index) -> Index: or isinstance(index.dtype, DatetimeTZDtype) or lib.is_np_dtype(index.dtype, "mM") ) - if method not in methods and not is_numeric_or_datetime: - raise ValueError( - "Index column must be numeric or datetime type when " - f"using {method} method other than linear. " - "Try setting a numeric or datetime index column before " - "interpolating." - ) + valid = NP_METHODS + SP_METHODS + if method in valid: + if method not in methods and not is_numeric_or_datetime: + raise ValueError( + "Index column must be numeric or datetime type when " + f"using {method} method other than linear. " + "Try setting a numeric or datetime index column before " + "interpolating." + ) + else: + raise ValueError(f"Can not interpolate with method={method}.") if isna(index).any(): raise NotImplementedError( @@ -338,6 +353,7 @@ def interpolate_2d_inplace( limit_direction: str = "forward", limit_area: str | None = None, fill_value: Any | None = None, + mask=None, **kwargs, ) -> None: """ @@ -385,16 +401,11 @@ def func(yvalues: np.ndarray) -> None: limit_area=limit_area_validated, fill_value=fill_value, bounds_error=False, + mask=mask, **kwargs, ) - # error: Argument 1 to "apply_along_axis" has incompatible type - # "Callable[[ndarray[Any, Any]], None]"; expected "Callable[..., - # Union[_SupportsArray[dtype[]], Sequence[_SupportsArray - # [dtype[]]], Sequence[Sequence[_SupportsArray[dtype[]]]], - # Sequence[Sequence[Sequence[_SupportsArray[dtype[]]]]], - # Sequence[Sequence[Sequence[Sequence[_SupportsArray[dtype[]]]]]]]]" - np.apply_along_axis(func, axis, data) # type: ignore[arg-type] + np.apply_along_axis(func, axis, data) def _index_to_interp_indices(index: Index, method: str) -> np.ndarray: @@ -429,6 +440,7 @@ def _interpolate_1d( fill_value: Any | None = None, bounds_error: bool = False, order: int | None = None, + mask=None, **kwargs, ) -> None: """ @@ -442,8 +454,10 @@ def _interpolate_1d( ----- Fills 'yvalues' in-place. """ - - invalid = isna(yvalues) + if mask is not None: + invalid = mask + else: + invalid = isna(yvalues) valid = ~invalid if not valid.any(): @@ -452,20 +466,20 @@ def _interpolate_1d( if valid.all(): return - # These are sets of index pointers to invalid values... i.e. {0, 1, etc... - all_nans = set(np.flatnonzero(invalid)) + # These index pointers to invalid values... i.e. {0, 1, etc... 
+ all_nans = np.flatnonzero(invalid) first_valid_index = find_valid_index(how="first", is_valid=valid) if first_valid_index is None: # no nan found in start first_valid_index = 0 - start_nans = set(range(first_valid_index)) + start_nans = np.arange(first_valid_index) last_valid_index = find_valid_index(how="last", is_valid=valid) if last_valid_index is None: # no nan found in end last_valid_index = len(yvalues) - end_nans = set(range(1 + last_valid_index, len(valid))) + end_nans = np.arange(1 + last_valid_index, len(valid)) - # Like the sets above, preserve_nans contains indices of invalid values, + # preserve_nans contains indices of invalid values, # but in this case, it is the final set of indices that need to be # preserved as NaN after the interpolation. @@ -474,27 +488,25 @@ def _interpolate_1d( # are more than 'limit' away from the prior non-NaN. # set preserve_nans based on direction using _interp_limit - preserve_nans: list | set if limit_direction == "forward": - preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0)) + preserve_nans = np.union1d(start_nans, _interp_limit(invalid, limit, 0)) elif limit_direction == "backward": - preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit)) + preserve_nans = np.union1d(end_nans, _interp_limit(invalid, 0, limit)) else: # both directions... just use _interp_limit - preserve_nans = set(_interp_limit(invalid, limit, limit)) + preserve_nans = np.unique(_interp_limit(invalid, limit, limit)) # if limit_area is set, add either mid or outside indices # to preserve_nans GH #16284 if limit_area == "inside": # preserve NaNs on the outside - preserve_nans |= start_nans | end_nans + preserve_nans = np.union1d(preserve_nans, start_nans) + preserve_nans = np.union1d(preserve_nans, end_nans) elif limit_area == "outside": # preserve NaNs on the inside - mid_nans = all_nans - start_nans - end_nans - preserve_nans |= mid_nans - - # sort preserve_nans and convert to list - preserve_nans = sorted(preserve_nans) + mid_nans = np.setdiff1d(all_nans, start_nans, assume_unique=True) + mid_nans = np.setdiff1d(mid_nans, end_nans, assume_unique=True) + preserve_nans = np.union1d(preserve_nans, mid_nans) is_datetimelike = yvalues.dtype.kind in "mM" @@ -520,7 +532,10 @@ def _interpolate_1d( **kwargs, ) - if is_datetimelike: + if mask is not None: + mask[:] = False + mask[preserve_nans] = True + elif is_datetimelike: yvalues[preserve_nans] = NaT.value else: yvalues[preserve_nans] = np.nan @@ -593,7 +608,12 @@ def _interpolate_scipy_wrapper( y = y.copy() if not new_x.flags.writeable: new_x = new_x.copy() - terp = alt_methods[method] + terp = alt_methods.get(method, None) + if terp is None: + raise ValueError(f"Can not interpolate with method={method}.") + + # Make sure downcast is not in kwargs for alt methods + kwargs.pop("downcast", None) new_y = terp(x, y, new_x, **kwargs) return new_y @@ -783,58 +803,6 @@ def _cubicspline_interpolate( return P(x) -def _interpolate_with_limit_area( - values: np.ndarray, - method: Literal["pad", "backfill"], - limit: int | None, - limit_area: Literal["inside", "outside"], -) -> None: - """ - Apply interpolation and limit_area logic to values along a to-be-specified axis. - - Parameters - ---------- - values: np.ndarray - Input array. - method: str - Interpolation method. Could be "bfill" or "pad" - limit: int, optional - Index limit on interpolation. - limit_area: {'inside', 'outside'} - Limit area for interpolation. - - Notes - ----- - Modifies values in-place. 
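[Editor's note] `_interpolate_1d` above replaces Python sets with sorted index arrays, so `np.union1d`/`np.setdiff1d` stand in for `|` and `-`. The bookkeeping in miniature (values chosen for illustration):

```python
import numpy as np

invalid = np.array([True, True, False, True, False, True])
all_nans = np.flatnonzero(invalid)   # [0 1 3 5]
start_nans = np.arange(2)            # leading NaN run
end_nans = np.arange(5, 6)           # trailing NaN run
# limit_area="outside": interior NaNs must stay NaN, so collect them with
# array set ops instead of the old Python-set arithmetic.
mid_nans = np.setdiff1d(all_nans, start_nans, assume_unique=True)
mid_nans = np.setdiff1d(mid_nans, end_nans, assume_unique=True)
print(np.union1d(start_nans, mid_nans))  # [0 1 3] -- sorted and unique
```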
- """ - - invalid = isna(values) - is_valid = ~invalid - - if not invalid.all(): - first = find_valid_index(how="first", is_valid=is_valid) - if first is None: - first = 0 - last = find_valid_index(how="last", is_valid=is_valid) - if last is None: - last = len(values) - - pad_or_backfill_inplace( - values, - method=method, - limit=limit, - ) - - if limit_area == "inside": - invalid[first : last + 1] = False - elif limit_area == "outside": - invalid[:first] = invalid[last + 1 :] = False - else: - raise ValueError("limit_area should be 'inside' or 'outside'") - - values[invalid] = np.nan - - def pad_or_backfill_inplace( values: np.ndarray, method: Literal["pad", "backfill"] = "pad", @@ -863,27 +831,6 @@ def pad_or_backfill_inplace( ----- Modifies values in-place. """ - if limit_area is not None: - np.apply_along_axis( - # error: Argument 1 to "apply_along_axis" has incompatible type - # "partial[None]"; expected - # "Callable[..., Union[_SupportsArray[dtype[]], - # Sequence[_SupportsArray[dtype[]]], - # Sequence[Sequence[_SupportsArray[dtype[]]]], - # Sequence[Sequence[Sequence[_SupportsArray[dtype[]]]]], - # Sequence[Sequence[Sequence[Sequence[_ - # SupportsArray[dtype[]]]]]]]]" - partial( # type: ignore[arg-type] - _interpolate_with_limit_area, - method=method, - limit=limit, - limit_area=limit_area, - ), - axis, - values, - ) - return - transf = (lambda x: x) if axis == 0 else (lambda x: x.T) # reshape a 1 dim if needed @@ -897,8 +844,7 @@ def pad_or_backfill_inplace( func = get_fill_func(method, ndim=2) # _pad_2d and _backfill_2d both modify tvalues inplace - func(tvalues, limit=limit) - return + func(tvalues, limit=limit, limit_area=limit_area) def _fillna_prep( @@ -909,7 +855,6 @@ def _fillna_prep( if mask is None: mask = isna(values) - mask = mask.view(np.uint8) return mask @@ -919,16 +864,23 @@ def _datetimelike_compat(func: F) -> F: """ @wraps(func) - def new_func(values, limit: int | None = None, mask=None): + def new_func( + values, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + mask=None, + ): if needs_i8_conversion(values.dtype): if mask is None: # This needs to occur before casting to int64 mask = isna(values) - result, mask = func(values.view("i8"), limit=limit, mask=mask) + result, mask = func( + values.view("i8"), limit=limit, limit_area=limit_area, mask=mask + ) return result.view(values.dtype), mask - return func(values, limit=limit, mask=mask) + return func(values, limit=limit, limit_area=limit_area, mask=mask) return cast(F, new_func) @@ -937,9 +889,12 @@ def new_func(values, limit: int | None = None, mask=None): def _pad_1d( values: np.ndarray, limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, mask: npt.NDArray[np.bool_] | None = None, ) -> tuple[np.ndarray, npt.NDArray[np.bool_]]: mask = _fillna_prep(values, mask) + if limit_area is not None and not mask.all(): + _fill_limit_area_1d(mask, limit_area) algos.pad_inplace(values, mask, limit=limit) return values, mask @@ -948,9 +903,12 @@ def _pad_1d( def _backfill_1d( values: np.ndarray, limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, mask: npt.NDArray[np.bool_] | None = None, ) -> tuple[np.ndarray, npt.NDArray[np.bool_]]: mask = _fillna_prep(values, mask) + if limit_area is not None and not mask.all(): + _fill_limit_area_1d(mask, limit_area) algos.backfill_inplace(values, mask, limit=limit) return values, mask @@ -959,23 +917,28 @@ def _backfill_1d( def _pad_2d( values: np.ndarray, limit: int | None = None, + 
limit_area: Literal["inside", "outside"] | None = None,
+    mask: npt.NDArray[np.bool_] | None = None,
-):
+) -> tuple[np.ndarray, npt.NDArray[np.bool_]]:
     mask = _fillna_prep(values, mask)
+    if limit_area is not None:
+        _fill_limit_area_2d(mask, limit_area)
     if values.size:
         algos.pad_2d_inplace(values, mask, limit=limit)
-    else:
-        # for test coverage
-        pass
     return values, mask


 @_datetimelike_compat
 def _backfill_2d(
-    values, limit: int | None = None, mask: npt.NDArray[np.bool_] | None = None
+    values,
+    limit: int | None = None,
+    limit_area: Literal["inside", "outside"] | None = None,
+    mask: npt.NDArray[np.bool_] | None = None,
 ):
     mask = _fillna_prep(values, mask)
+    if limit_area is not None:
+        _fill_limit_area_2d(mask, limit_area)
     if values.size:
         algos.backfill_2d_inplace(values, mask, limit=limit)
@@ -985,6 +948,63 @@ def _backfill_2d(
     return values, mask


+def _fill_limit_area_1d(
+    mask: npt.NDArray[np.bool_], limit_area: Literal["outside", "inside"]
+) -> None:
+    """Prepare 1d mask for ffill/bfill with limit_area.
+
+    Caller is responsible for checking at least one value of mask is False.
+    When called, mask will no longer faithfully represent whether
+    the corresponding values are NA or not.
+
+    Parameters
+    ----------
+    mask : np.ndarray[bool, ndim=1]
+        Mask representing NA values when filling.
+    limit_area : { "outside", "inside" }
+        Whether to limit filling to outside or inside the outermost non-NA value.
+    """
+    neg_mask = ~mask
+    first = neg_mask.argmax()
+    last = len(neg_mask) - neg_mask[::-1].argmax() - 1
+    if limit_area == "inside":
+        mask[:first] = False
+        mask[last + 1 :] = False
+    elif limit_area == "outside":
+        mask[first + 1 : last] = False
+
+
+def _fill_limit_area_2d(
+    mask: npt.NDArray[np.bool_], limit_area: Literal["outside", "inside"]
+) -> None:
+    """Prepare 2d mask for ffill/bfill with limit_area.
+
+    When called, mask will no longer faithfully represent whether
+    the corresponding values are NA or not.
+
+    Parameters
+    ----------
+    mask : np.ndarray[bool, ndim=2]
+        Mask representing NA values when filling.
+    limit_area : { "outside", "inside" }
+        Whether to limit filling to outside or inside the outermost non-NA value.
+    """
+    neg_mask = ~mask.T
+    if limit_area == "outside":
+        # Identify inside
+        la_mask = (
+            np.maximum.accumulate(neg_mask, axis=0)
+            & np.maximum.accumulate(neg_mask[::-1], axis=0)[::-1]
+        )
+    else:
+        # Identify outside
+        la_mask = (
+            ~np.maximum.accumulate(neg_mask, axis=0)
+            | ~np.maximum.accumulate(neg_mask[::-1], axis=0)[::-1]
+        )
+    mask[la_mask.T] = False
+
+
 _fill_methods = {"pad": _pad_1d, "backfill": _backfill_1d}
@@ -1003,7 +1023,7 @@ def clean_reindex_fill_method(method) -> ReindexMethod | None:
 def _interp_limit(
     invalid: npt.NDArray[np.bool_], fw_limit: int | None, bw_limit: int | None
-):
+) -> np.ndarray:
     """
     Get indexers of values that won't be filled because
     they exceed the limits.
@@ -1028,27 +1048,30 @@ def _interp_limit(
     def _interp_limit(invalid, fw_limit, bw_limit):
         for x in np.where(invalid)[0]:
-            if invalid[max(0, x - fw_limit):x + bw_limit + 1].all():
+            if invalid[max(0, x - fw_limit) : x + bw_limit + 1].all():
                 yield x
     """
     # handle forward first; the backward direction is the same except
     # 1. operate on the reversed array
     # 2.
subtract the returned indices from N - 1 N = len(invalid) - f_idx = set() - b_idx = set() + f_idx = np.array([], dtype=np.int64) + b_idx = np.array([], dtype=np.int64) + assume_unique = True def inner(invalid, limit: int): limit = min(limit, N) - windowed = _rolling_window(invalid, limit + 1).all(1) - idx = set(np.where(windowed)[0] + limit) | set( - np.where((~invalid[: limit + 1]).cumsum() == 0)[0] + windowed = np.lib.stride_tricks.sliding_window_view(invalid, limit + 1).all(1) + idx = np.union1d( + np.where(windowed)[0] + limit, + np.where((~invalid[: limit + 1]).cumsum() == 0)[0], ) return idx if fw_limit is not None: if fw_limit == 0: - f_idx = set(np.where(invalid)[0]) + f_idx = np.where(invalid)[0] + assume_unique = False else: f_idx = inner(invalid, fw_limit) @@ -1058,26 +1081,8 @@ def inner(invalid, limit: int): # just use forwards return f_idx else: - b_idx_inv = list(inner(invalid[::-1], bw_limit)) - b_idx = set(N - 1 - np.asarray(b_idx_inv)) + b_idx = N - 1 - inner(invalid[::-1], bw_limit) if fw_limit == 0: return b_idx - return f_idx & b_idx - - -def _rolling_window(a: npt.NDArray[np.bool_], window: int) -> npt.NDArray[np.bool_]: - """ - [True, True, False, True, False], 2 -> - - [ - [True, True], - [True, False], - [False, True], - [True, False], - ] - """ - # https://stackoverflow.com/a/6811241 - shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) - strides = a.strides + (a.strides[-1],) - return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) + return np.intersect1d(f_idx, b_idx, assume_unique=assume_unique) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 6cb825e9b79a2..25fb6e6181082 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -3,8 +3,8 @@ import functools import itertools from typing import ( + TYPE_CHECKING, Any, - Callable, cast, ) import warnings @@ -31,7 +31,6 @@ npt, ) from pandas.compat._optional import import_optional_dependency -from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( is_complex, @@ -49,6 +48,9 @@ notna, ) +if TYPE_CHECKING: + from collections.abc import Callable + bn = import_optional_dependency("bottleneck", errors="warn") _BOTTLENECK_INSTALLED = bn is not None _USE_BOTTLENECK = False @@ -506,12 +508,12 @@ def nanany( >>> from pandas.core import nanops >>> s = pd.Series([1, 2]) >>> nanops.nanany(s.values) - True + np.True_ >>> from pandas.core import nanops >>> s = pd.Series([np.nan]) >>> nanops.nanany(s.values) - False + np.False_ """ if values.dtype.kind in "iub" and mask is None: # GH#26032 fastpath @@ -521,12 +523,7 @@ def nanany( if values.dtype.kind == "M": # GH#34479 - warnings.warn( - "'any' with datetime64 dtypes is deprecated and will raise in a " - "future version. Use (obj != pd.Timestamp(0)).any() instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) + raise TypeError("datetime64 type does not support operation 'any'") values, _ = _get_values(values, skipna, fill_value=False, mask=mask) @@ -567,12 +564,12 @@ def nanall( >>> from pandas.core import nanops >>> s = pd.Series([1, 2, np.nan]) >>> nanops.nanall(s.values) - True + np.True_ >>> from pandas.core import nanops >>> s = pd.Series([1, 0]) >>> nanops.nanall(s.values) - False + np.False_ """ if values.dtype.kind in "iub" and mask is None: # GH#26032 fastpath @@ -582,12 +579,7 @@ def nanall( if values.dtype.kind == "M": # GH#34479 - warnings.warn( - "'all' with datetime64 dtypes is deprecated and will raise in a " - "future version. 
Use (obj != pd.Timestamp(0)).all() instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) + raise TypeError("datetime64 type does not support operation 'all'") values, _ = _get_values(values, skipna, fill_value=True, mask=mask) @@ -633,7 +625,7 @@ def nansum( >>> from pandas.core import nanops >>> s = pd.Series([1, 2, np.nan]) >>> nanops.nansum(s.values) - 3.0 + np.float64(3.0) """ dtype = values.dtype values, mask = _get_values(values, skipna, fill_value=0, mask=mask) @@ -699,7 +691,7 @@ def nanmean( >>> from pandas.core import nanops >>> s = pd.Series([1, 2, np.nan]) >>> nanops.nanmean(s.values) - 1.5 + np.float64(1.5) """ dtype = values.dtype values, mask = _get_values(values, skipna, fill_value=0, mask=mask) @@ -734,7 +726,9 @@ def nanmean( @bottleneck_switch() -def nanmedian(values, *, axis: AxisInt | None = None, skipna: bool = True, mask=None): +def nanmedian( + values: np.ndarray, *, axis: AxisInt | None = None, skipna: bool = True, mask=None +) -> float | np.ndarray: """ Parameters ---------- @@ -746,7 +740,7 @@ def nanmedian(values, *, axis: AxisInt | None = None, skipna: bool = True, mask= Returns ------- - result : float + result : float | ndarray Unless input is a float array, in which case use the same precision as the input array. @@ -756,13 +750,17 @@ def nanmedian(values, *, axis: AxisInt | None = None, skipna: bool = True, mask= >>> s = pd.Series([1, np.nan, 2, 2]) >>> nanops.nanmedian(s.values) 2.0 + + >>> s = pd.Series([np.nan, np.nan, np.nan]) + >>> nanops.nanmedian(s.values) + nan """ # for floats without mask, the data already uses NaN as missing value # indicator, and `mask` will be calculated from that below -> in those # cases we never need to set NaN to the masked values using_nan_sentinel = values.dtype.kind == "f" and mask is None - def get_median(x, _mask=None): + def get_median(x: np.ndarray, _mask=None): if _mask is None: _mask = notna(x) else: @@ -774,6 +772,7 @@ def get_median(x, _mask=None): warnings.filterwarnings( "ignore", "All-NaN slice encountered", RuntimeWarning ) + warnings.filterwarnings("ignore", "Mean of empty slice", RuntimeWarning) res = np.nanmedian(x[_mask]) return res @@ -797,6 +796,8 @@ def get_median(x, _mask=None): notempty = values.size + res: float | np.ndarray + # an array from a frame if values.ndim > 1 and axis is not None: # there's a non-empty array to apply over otherwise numpy raises @@ -1060,7 +1061,7 @@ def nansem( >>> from pandas.core import nanops >>> s = pd.Series([1, np.nan, 2, 3]) >>> nanops.nansem(s.values) - 0.5773502691896258 + np.float64(0.5773502691896258) """ # This checks if non-numeric-like data is passed with numeric_only=False # and raises a TypeError otherwise @@ -1092,11 +1093,14 @@ def reduction( if values.size == 0: return _na_for_min_count(values, axis) + dtype = values.dtype values, mask = _get_values( values, skipna, fill_value_typ=fill_value_typ, mask=mask ) result = getattr(values, meth)(axis) - result = _maybe_null_out(result, axis, mask, values.shape) + result = _maybe_null_out( + result, axis, mask, values.shape, datetimelike=dtype.kind in "mM" + ) return result return reduction @@ -1132,7 +1136,7 @@ def nanargmax( >>> from pandas.core import nanops >>> arr = np.array([1, 2, 3, np.nan, 4]) >>> nanops.nanargmax(arr) - 4 + np.int64(4) >>> arr = np.array(range(12), dtype=np.float64).reshape(4, 3) >>> arr[2:, 2] = np.nan @@ -1178,7 +1182,7 @@ def nanargmin( >>> from pandas.core import nanops >>> arr = np.array([1, 2, 3, np.nan, 4]) >>> nanops.nanargmin(arr) - 0 + np.int64(0) >>> arr = 
np.array(range(12), dtype=np.float64).reshape(4, 3) >>> arr[2:, 0] = np.nan @@ -1233,7 +1237,7 @@ def nanskew( >>> from pandas.core import nanops >>> s = pd.Series([1, np.nan, 1, 2]) >>> nanops.nanskew(s.values) - 1.7320508075688787 + np.float64(1.7320508075688787) """ mask = _maybe_get_mask(values, skipna, mask) if values.dtype.kind != "f": @@ -1321,7 +1325,7 @@ def nankurt( >>> from pandas.core import nanops >>> s = pd.Series([1, np.nan, 1, 3, 2]) >>> nanops.nankurt(s.values) - -1.2892561983471076 + np.float64(-1.2892561983471076) """ mask = _maybe_get_mask(values, skipna, mask) if values.dtype.kind != "f": @@ -1413,7 +1417,7 @@ def nanprod( >>> from pandas.core import nanops >>> s = pd.Series([1, 2, 3, np.nan]) >>> nanops.nanprod(s.values) - 6.0 + np.float64(6.0) """ mask = _maybe_get_mask(values, skipna, mask) @@ -1439,19 +1443,15 @@ def _maybe_arg_null_out( return result if axis is None or not getattr(result, "ndim", False): - if skipna: - if mask.all(): - return -1 - else: - if mask.any(): - return -1 + if skipna and mask.all(): + raise ValueError("Encountered all NA values") + elif not skipna and mask.any(): + raise ValueError("Encountered an NA value with skipna=False") else: - if skipna: - na_mask = mask.all(axis) - else: - na_mask = mask.any(axis) - if na_mask.any(): - result[na_mask] = -1 + if skipna and mask.all(axis).any(): + raise ValueError("Encountered all NA values") + elif not skipna and mask.any(axis).any(): + raise ValueError("Encountered an NA value with skipna=False") return result @@ -1502,6 +1502,7 @@ def _maybe_null_out( mask: npt.NDArray[np.bool_] | None, shape: tuple[int, ...], min_count: int = 1, + datetimelike: bool = False, ) -> np.ndarray | float | NaTType: """ Returns @@ -1523,7 +1524,10 @@ def _maybe_null_out( null_mask = np.broadcast_to(below_count, new_shape) if np.any(null_mask): - if is_numeric_dtype(result): + if datetimelike: + # GH#60646 For datetimelike, no need to cast to float + result[null_mask] = iNaT + elif is_numeric_dtype(result): if np.iscomplexobj(result): result = result.astype("c16") elif not is_float_dtype(result): diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index ae889a7fdbc24..9f9d69a182f72 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -3,6 +3,7 @@ This is not a public API. """ + from __future__ import annotations from pandas.core.ops.array_ops import ( @@ -65,15 +66,18 @@ __all__ = [ "ARITHMETIC_BINOPS", "arithmetic_op", - "comparison_op", "comp_method_OBJECT_ARRAY", - "invalid_comparison", + "comparison_op", "fill_binop", + "get_array_op", + "get_op_result_name", + "invalid_comparison", "kleene_and", "kleene_or", "kleene_xor", "logical_op", "make_flex_doc", + "maybe_prepare_scalar_for_op", "radd", "rand_", "rdiv", @@ -87,7 +91,4 @@ "rtruediv", "rxor", "unpack_zerodim_and_defer", - "get_op_result_name", - "maybe_prepare_scalar_for_op", - "get_array_op", ] diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 4b762a359d321..3a466b6fc7fc8 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -2,6 +2,7 @@ Functions for arithmetic and comparison operations on NumPy arrays and ExtensionArrays. 
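[Editor's note] `_maybe_arg_null_out` above now raises on all-NA input (or any NA with `skipna=False`) instead of returning the old `-1` sentinel. The new contract, sketched standalone; `nanargmax_strict` is an illustrative name, not the pandas function:

```python
import numpy as np

def nanargmax_strict(values: np.ndarray, skipna: bool = True) -> int:
    mask = np.isnan(values)
    if skipna and mask.all():
        raise ValueError("Encountered all NA values")
    if not skipna and mask.any():
        raise ValueError("Encountered an NA value with skipna=False")
    return int(np.nanargmax(values))

print(nanargmax_strict(np.array([1.0, np.nan, 3.0])))  # 2
```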
""" + from __future__ import annotations import datetime @@ -11,7 +12,6 @@ TYPE_CHECKING, Any, ) -import warnings import numpy as np @@ -28,7 +28,6 @@ is_supported_dtype, is_unitless, ) -from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.cast import ( construct_1d_object_array_from_listlike, @@ -130,7 +129,7 @@ def comp_method_OBJECT_ARRAY(op, x, y): return result.reshape(x.shape) -def _masked_arith_op(x: np.ndarray, y, op): +def _masked_arith_op(x: np.ndarray, y, op) -> np.ndarray: """ If the given arithmetic operation fails, attempt it again on only the non-null elements of the input array(s). @@ -165,7 +164,7 @@ def _masked_arith_op(x: np.ndarray, y, op): else: if not is_scalar(y): raise TypeError( - f"Cannot broadcast np.ndarray with operand of type { type(y) }" + f"Cannot broadcast np.ndarray with operand of type {type(y)}" ) # mask is only meaningful for x @@ -423,15 +422,13 @@ def fill_bool(x, left=None): right = lib.item_from_zerodim(right) if is_list_like(right) and not hasattr(right, "dtype"): # e.g. list, tuple - warnings.warn( + raise TypeError( + # GH#52264 "Logical ops (and, or, xor) between Pandas objects and dtype-less " - "sequences (e.g. list, tuple) are deprecated and will raise in a " - "future version. Wrap the object in a Series, Index, or np.array " + "sequences (e.g. list, tuple) are no longer supported. " + "Wrap the object in a Series, Index, or np.array " "before operating instead.", - FutureWarning, - stacklevel=find_stack_level(), ) - right = construct_1d_object_array_from_listlike(right) # NB: We assume extract_array has already been called on left and right lvalues = ensure_wrapped_if_datetimelike(left) @@ -591,7 +588,7 @@ def maybe_prepare_scalar_for_op(obj, shape: Shape): } -def _bool_arith_check(op, a: np.ndarray, b): +def _bool_arith_check(op, a: np.ndarray, b) -> None: """ In contrast to numpy, pandas raises an error for certain operations with booleans. diff --git a/pandas/core/ops/common.py b/pandas/core/ops/common.py index 559977bacf881..5cbe1c421e05a 100644 --- a/pandas/core/ops/common.py +++ b/pandas/core/ops/common.py @@ -1,13 +1,11 @@ """ Boilerplate functions used in defining binary operations. """ + from __future__ import annotations from functools import wraps -from typing import ( - TYPE_CHECKING, - Callable, -) +from typing import TYPE_CHECKING from pandas._libs.lib import item_from_zerodim from pandas._libs.missing import is_matching_na @@ -18,6 +16,8 @@ ) if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import F @@ -40,7 +40,7 @@ def wrapper(method: F) -> F: return wrapper -def _unpack_zerodim_and_defer(method, name: str): +def _unpack_zerodim_and_defer(method: F, name: str) -> F: """ Boilerplate for pandas conventions in arithmetic and comparison methods. @@ -75,7 +75,9 @@ def new_method(self, other): return method(self, other) - return new_method + # error: Incompatible return value type (got "Callable[[Any, Any], Any]", + # expected "F") + return new_method # type: ignore[return-value] def get_op_result_name(left, right): diff --git a/pandas/core/ops/dispatch.py b/pandas/core/ops/dispatch.py index a939fdd3d041e..ebafc432dd89b 100644 --- a/pandas/core/ops/dispatch.py +++ b/pandas/core/ops/dispatch.py @@ -1,6 +1,7 @@ """ Functions for defining unary operations. 
""" + from __future__ import annotations from typing import ( diff --git a/pandas/core/ops/docstrings.py b/pandas/core/ops/docstrings.py index bd2e532536d84..5ce0a2da86f31 100644 --- a/pandas/core/ops/docstrings.py +++ b/pandas/core/ops/docstrings.py @@ -1,6 +1,7 @@ """ Templating for ops docstrings """ + from __future__ import annotations @@ -375,7 +376,7 @@ def make_flex_doc(op_name: str, typ: str) -> str: "ne": { "op": "!=", "desc": "Not equal to", - "reverse": None, + "reverse": "eq", "series_examples": _ne_example_SERIES, "series_returns": _returns_series, }, @@ -396,14 +397,14 @@ def make_flex_doc(op_name: str, typ: str) -> str: "gt": { "op": ">", "desc": "Greater than", - "reverse": None, + "reverse": "lt", "series_examples": _gt_example_SERIES, "series_returns": _returns_series, }, "ge": { "op": ">=", "desc": "Greater than or equal to", - "reverse": None, + "reverse": "le", "series_examples": _ge_example_SERIES, "series_returns": _returns_series, }, @@ -419,12 +420,12 @@ def make_flex_doc(op_name: str, typ: str) -> str: if reverse_op is not None: _op_descriptions[reverse_op] = _op_descriptions[key].copy() _op_descriptions[reverse_op]["reverse"] = key - _op_descriptions[key][ - "see_also_desc" - ] = f"Reverse of the {_op_descriptions[key]['desc']} operator, {_py_num_ref}" - _op_descriptions[reverse_op][ - "see_also_desc" - ] = f"Element-wise {_op_descriptions[key]['desc']}, {_py_num_ref}" + _op_descriptions[key]["see_also_desc"] = ( + f"Reverse of the {_op_descriptions[key]['desc']} operator, {_py_num_ref}" + ) + _op_descriptions[reverse_op]["see_also_desc"] = ( + f"Element-wise {_op_descriptions[key]['desc']}, {_py_num_ref}" + ) _flex_doc_SERIES = """ Return {desc} of series and other, element-wise (binary operator `{op_name}`). @@ -435,6 +436,7 @@ def make_flex_doc(op_name: str, typ: str) -> str: Parameters ---------- other : Series or scalar value + The second operand in this operation. level : int or name Broadcast across a level, matching Index values on the passed MultiIndex level. @@ -623,6 +625,15 @@ def make_flex_doc(op_name: str, typ: str) -> str: B square 0.0 0.0 pentagon 0.0 0.0 hexagon 0.0 0.0 + +>>> df_pow = pd.DataFrame({{'A': [2, 3, 4, 5], +... 'B': [6, 7, 8, 9]}}) +>>> df_pow.pow(2) + A B +0 4 36 +1 9 49 +2 16 64 +3 25 81 """ _flex_comp_doc_FRAME = """ diff --git a/pandas/core/ops/invalid.py b/pandas/core/ops/invalid.py index e5ae6d359ac22..62aa79a881717 100644 --- a/pandas/core/ops/invalid.py +++ b/pandas/core/ops/invalid.py @@ -1,18 +1,33 @@ """ Templates for invalid operations. """ + from __future__ import annotations import operator -from typing import TYPE_CHECKING +from typing import ( + TYPE_CHECKING, + Any, + NoReturn, +) import numpy as np if TYPE_CHECKING: - from pandas._typing import npt + from collections.abc import Callable + + from pandas._typing import ( + ArrayLike, + Scalar, + npt, + ) -def invalid_comparison(left, right, op) -> npt.NDArray[np.bool_]: +def invalid_comparison( + left: ArrayLike, + right: ArrayLike | list | Scalar, + op: Callable[[Any, Any], bool], +) -> npt.NDArray[np.bool_]: """ If a comparison has mismatched types and is not necessarily meaningful, follow python3 conventions by: @@ -41,7 +56,7 @@ def invalid_comparison(left, right, op) -> npt.NDArray[np.bool_]: return res_values -def make_invalid_op(name: str): +def make_invalid_op(name: str) -> Callable[..., NoReturn]: """ Return a binary method that always raises a TypeError. 
@@ -54,7 +69,7 @@ def make_invalid_op(name: str): invalid_op : function """ - def invalid_op(self, other=None): + def invalid_op(self: object, other: object = None) -> NoReturn: typ = type(self).__name__ raise TypeError(f"cannot perform {name} with this index type: {typ}") diff --git a/pandas/core/ops/mask_ops.py b/pandas/core/ops/mask_ops.py index adc1f63c568bf..86a93e5a3ca2b 100644 --- a/pandas/core/ops/mask_ops.py +++ b/pandas/core/ops/mask_ops.py @@ -1,8 +1,11 @@ """ Ops for masked arrays. """ + from __future__ import annotations +from typing import TYPE_CHECKING + import numpy as np from pandas._libs import ( @@ -10,13 +13,16 @@ missing as libmissing, ) +if TYPE_CHECKING: + from pandas._typing import npt + def kleene_or( left: bool | np.ndarray | libmissing.NAType, right: bool | np.ndarray | libmissing.NAType, left_mask: np.ndarray | None, right_mask: np.ndarray | None, -): +) -> tuple[npt.NDArray[np.bool_], npt.NDArray[np.bool_]]: """ Boolean ``or`` using Kleene logic. @@ -78,7 +84,7 @@ def kleene_xor( right: bool | np.ndarray | libmissing.NAType, left_mask: np.ndarray | None, right_mask: np.ndarray | None, -): +) -> tuple[npt.NDArray[np.bool_], npt.NDArray[np.bool_]]: """ Boolean ``xor`` using Kleene logic. @@ -131,7 +137,7 @@ def kleene_and( right: bool | libmissing.NAType | np.ndarray, left_mask: np.ndarray | None, right_mask: np.ndarray | None, -): +) -> tuple[npt.NDArray[np.bool_], npt.NDArray[np.bool_]]: """ Boolean ``and`` using Kleene logic. @@ -184,6 +190,6 @@ def kleene_and( return result, mask -def raise_for_nan(value, method: str) -> None: +def raise_for_nan(value: object, method: str) -> None: if lib.is_float(value) and np.isnan(value): raise ValueError(f"Cannot perform logical '{method}' with floating NaN") diff --git a/pandas/core/ops/missing.py b/pandas/core/ops/missing.py index fc685935a35fc..a4e9e5305f74d 100644 --- a/pandas/core/ops/missing.py +++ b/pandas/core/ops/missing.py @@ -21,6 +21,7 @@ 3) divmod behavior consistent with 1) and 2). 
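[Editor's note] The `mask_ops` hunks above tighten the Kleene helpers' signatures to return a `(result, mask)` pair of bool arrays. For reference, a sketch of three-valued `or` under that contract; pandas' `kleene_or` also handles scalar and NA operands:

```python
import numpy as np

left = np.array([True, False, False])
left_mask = np.array([False, False, True])    # third entry is NA
right = np.array([False, False, True])
right_mask = np.array([False, True, False])   # second entry is NA

result = left | right
# Kleene logic: the result is unknown only where neither side is
# definitely True.
mask = (left_mask | right_mask) & ~(
    (left & ~left_mask) | (right & ~right_mask)
)
print(result, mask)  # [ True False  True] [False  True False]
```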
""" + from __future__ import annotations import operator @@ -30,7 +31,7 @@ from pandas.core import roperator -def _fill_zeros(result: np.ndarray, x, y): +def _fill_zeros(result: np.ndarray, x, y) -> np.ndarray: """ If this is a reversed op, then flip x,y @@ -89,9 +90,9 @@ def mask_zero_div_zero(x, y, result: np.ndarray) -> np.ndarray: >>> x = np.array([1, 0, -1], dtype=np.int64) >>> x array([ 1, 0, -1]) - >>> y = 0 # int 0; numpy behavior is different with float + >>> y = 0 # int 0; numpy behavior is different with float >>> result = x // y - >>> result # raw numpy result does not fill division by zero + >>> result # raw numpy result does not fill division by zero array([0, 0, 0]) >>> mask_zero_div_zero(x, y, result) array([ inf, nan, -inf]) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 48a5f85e1c388..08e3beef99e60 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -4,11 +4,11 @@ from textwrap import dedent from typing import ( TYPE_CHECKING, - Callable, Literal, cast, final, no_type_check, + overload, ) import warnings @@ -24,46 +24,38 @@ Timestamp, to_offset, ) -from pandas._libs.tslibs.dtypes import freq_to_period_freqstr from pandas._typing import NDFrameT -from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError from pandas.util._decorators import ( Appender, Substitution, doc, ) -from pandas.util._exceptions import ( - find_stack_level, - rewrite_warning, -) +from pandas.util._exceptions import find_stack_level -from pandas.core.dtypes.dtypes import ArrowDtype +from pandas.core.dtypes.dtypes import ( + ArrowDtype, + PeriodDtype, +) from pandas.core.dtypes.generic import ( ABCDataFrame, ABCSeries, ) import pandas.core.algorithms as algos -from pandas.core.apply import ( - ResamplerWindowApply, - warn_alias_replacement, -) +from pandas.core.apply import ResamplerWindowApply from pandas.core.arrays import ArrowExtensionArray from pandas.core.base import ( PandasObject, SelectionMixin, ) -import pandas.core.common as com from pandas.core.generic import ( NDFrame, _shared_docs, ) -from pandas.core.groupby.generic import SeriesGroupBy from pandas.core.groupby.groupby import ( BaseGroupBy, GroupBy, - _apply_groupings_depr, _pipe_template, get_groupby, ) @@ -83,6 +75,7 @@ TimedeltaIndex, timedelta_range, ) +from pandas.core.reshape.concat import concat from pandas.tseries.frequencies import ( is_subperiod, @@ -94,15 +87,22 @@ ) if TYPE_CHECKING: - from collections.abc import Hashable + from collections.abc import ( + Callable, + Hashable, + ) from pandas._typing import ( + Any, AnyArrayLike, Axis, - AxisInt, + Concatenate, + FreqIndexT, Frequency, IndexLabel, InterpolateOptions, + P, + Self, T, TimedeltaConvertibleTypes, TimeGrouperOrigin, @@ -129,9 +129,6 @@ class Resampler(BaseGroupBy, PandasObject): ---------- obj : Series or DataFrame groupby : TimeGrouper - axis : int, default 0 - kind : str or None - 'period', 'timestamp' to override default index treatment Returns ------- @@ -151,11 +148,9 @@ class Resampler(BaseGroupBy, PandasObject): # to the groupby descriptor _attributes = [ "freq", - "axis", "closed", "label", "convention", - "kind", "origin", "offset", ] @@ -164,22 +159,19 @@ def __init__( self, obj: NDFrame, timegrouper: TimeGrouper, - axis: Axis = 0, - kind=None, *, gpr_index: Index, group_keys: bool = False, selection=None, - include_groups: bool = True, + include_groups: bool = False, ) -> None: + if include_groups: + raise ValueError("include_groups=True is no longer allowed.") self._timegrouper = 
timegrouper self.keys = None self.sort = True - self.axis = obj._get_axis_number(axis) - self.kind = kind self.group_keys = group_keys self.as_index = True - self.include_groups = include_groups self.obj, self.ax, self._indexer = self._timegrouper._set_grouper( self._convert_obj(obj), sort=True, gpr_index=gpr_index @@ -254,6 +246,22 @@ def _get_binner(self): bin_grouper = BinGrouper(bins, binlabels, indexer=self._indexer) return binner, bin_grouper + @overload + def pipe( + self, + func: Callable[Concatenate[Self, P], T], + *args: P.args, + **kwargs: P.kwargs, + ) -> T: ... + + @overload + def pipe( + self, + func: tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, + ) -> T: ... + @final @Substitution( klass="Resampler", @@ -278,9 +286,9 @@ def _get_binner(self): @Appender(_pipe_template) def pipe( self, - func: Callable[..., T] | tuple[Callable[..., T], str], - *args, - **kwargs, + func: Callable[Concatenate[Self, P], T] | tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, ) -> T: return super().pipe(func, *args, **kwargs) @@ -370,23 +378,30 @@ def transform(self, arg, *args, **kwargs): ---------- arg : function To apply to each group. Should return a Series with the same index. + *args, **kwargs + Additional arguments and keywords. Returns ------- Series + A Series with the transformed values, maintaining the same index as + the original object. + + See Also + -------- + core.resample.Resampler.apply : Apply a function along each group. + core.resample.Resampler.aggregate : Aggregate using one or more operations + over the specified axis. Examples -------- - >>> s = pd.Series([1, 2], - ... index=pd.date_range('20180101', - ... periods=2, - ... freq='1h')) + >>> s = pd.Series([1, 2], index=pd.date_range("20180101", periods=2, freq="1h")) >>> s 2018-01-01 00:00:00 1 2018-01-01 01:00:00 2 Freq: h, dtype: int64 - >>> resampled = s.resample('15min') + >>> resampled = s.resample("15min") >>> resampled.transform(lambda x: (x - x.mean()) / x.std()) 2018-01-01 00:00:00 NaN 2018-01-01 01:00:00 NaN @@ -396,7 +411,7 @@ def transform(self, arg, *args, **kwargs): arg, *args, **kwargs ) - def _downsample(self, f, **kwargs): + def _downsample(self, how, **kwargs): raise AbstractMethodError(self) def _upsample(self, f, limit: int | None = None, fill_value=None): @@ -426,7 +441,7 @@ def _gotitem(self, key, ndim: int, subset=None): assert subset.ndim == 1 grouped = get_groupby( - subset, by=None, grouper=grouper, axis=self.axis, group_keys=self.group_keys + subset, by=None, grouper=grouper, group_keys=self.group_keys ) return grouped @@ -439,9 +454,7 @@ def _groupby_and_aggregate(self, how, *args, **kwargs): # Excludes `on` column when provided obj = self._obj_with_exclusions - grouped = get_groupby( - obj, by=None, grouper=grouper, axis=self.axis, group_keys=self.group_keys - ) + grouped = get_groupby(obj, by=None, grouper=grouper, group_keys=self.group_keys) try: if callable(how): @@ -459,9 +472,7 @@ def _groupby_and_aggregate(self, how, *args, **kwargs): # a DataFrame column, but aggregate_item_by_item operates column-wise # on Series, raising AttributeError or KeyError # (depending on whether the column lookup uses getattr/__getitem__) - result = _apply( - grouped, how, *args, include_groups=self.include_groups, **kwargs - ) + result = grouped.apply(how, *args, **kwargs) except ValueError as err: if "Must produce aggregated value" in str(err): @@ -473,48 +484,41 @@ def _groupby_and_aggregate(self, how, *args, **kwargs): # we have a non-reducing function # try to evaluate - result = 
_apply( - grouped, how, *args, include_groups=self.include_groups, **kwargs - ) + result = grouped.apply(how, *args, **kwargs) return self._wrap_result(result) @final def _get_resampler_for_grouping( - self, groupby: GroupBy, key, include_groups: bool = True + self, + groupby: GroupBy, + key, ): """ Return the correct class for resampling with groupby. """ return self._resampler_for_grouping( - groupby=groupby, key=key, parent=self, include_groups=include_groups + groupby=groupby, + key=key, + parent=self, ) def _wrap_result(self, result): """ Potentially wrap any results. """ - # GH 47705 - obj = self.obj - if ( - isinstance(result, ABCDataFrame) - and len(result) == 0 - and not isinstance(result.index, PeriodIndex) - ): - result = result.set_index( - _asfreq_compat(obj.index[:0], freq=self.freq), append=True - ) - if isinstance(result, ABCSeries) and self._selection is not None: result.name = self._selection if isinstance(result, ABCSeries) and result.empty: # When index is all NaT, result is empty but index is not + obj = self.obj result.index = _asfreq_compat(obj.index[:0], freq=self.freq) result.name = getattr(obj, "name", None) if self._timegrouper._arrow_dtype is not None: result.index = result.index.astype(self._timegrouper._arrow_dtype) + result.index.name = self.obj.index.name return result @@ -523,6 +527,11 @@ def ffill(self, limit: int | None = None): """ Forward fill the values. + This method fills missing values by propagating the last valid + observation forward, up to the next valid observation. It is commonly + used in time series analysis when resampling data to a higher frequency + (upsampling) and filling gaps in the resampled output. + Parameters ---------- limit : int, optional @@ -530,7 +539,8 @@ def ffill(self, limit: int | None = None): Returns ------- - An upsampled Series. + Series + The resampled data with missing values filled forward. See Also -------- @@ -541,8 +551,12 @@ def ffill(self, limit: int | None = None): -------- Here we only create a ``Series``. - >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( - ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser = pd.Series( + ... [1, 2, 3, 4], + ... index=pd.DatetimeIndex( + ... ["2023-01-01", "2023-01-15", "2023-02-01", "2023-02-15"] + ... ), + ... ) >>> ser 2023-01-01 1 2023-01-15 2 @@ -552,7 +566,7 @@ def ffill(self, limit: int | None = None): Example for ``ffill`` with downsampling (we have fewer dates after resampling): - >>> ser.resample('MS').ffill() + >>> ser.resample("MS").ffill() 2023-01-01 1 2023-02-01 3 Freq: MS, dtype: int64 @@ -560,7 +574,7 @@ def ffill(self, limit: int | None = None): Example for ``ffill`` with upsampling (fill the new dates with the previous value): - >>> ser.resample('W').ffill() + >>> ser.resample("W").ffill() 2023-01-01 1 2023-01-08 1 2023-01-15 2 @@ -574,7 +588,7 @@ def ffill(self, limit: int | None = None): With upsampling and limiting (only fill the first new date with the previous value): - >>> ser.resample('W').ffill(limit=1) + >>> ser.resample("W").ffill(limit=1) 2023-01-01 1.0 2023-01-08 1.0 2023-01-15 2.0 @@ -614,21 +628,18 @@ def nearest(self, limit: int | None = None): See Also -------- - backfill : Backward fill the new missing values in the resampled data. - pad : Forward fill ``NaN`` values. + bfill : Backward fill the new missing values in the resampled data. + ffill : Forward fill ``NaN`` values. Examples -------- - >>> s = pd.Series([1, 2], - ... index=pd.date_range('20180101', - ... periods=2, - ... 
freq='1h')) + >>> s = pd.Series([1, 2], index=pd.date_range("20180101", periods=2, freq="1h")) >>> s 2018-01-01 00:00:00 1 2018-01-01 01:00:00 2 Freq: h, dtype: int64 - >>> s.resample('15min').nearest() + >>> s.resample("15min").nearest() 2018-01-01 00:00:00 1 2018-01-01 00:15:00 1 2018-01-01 00:30:00 2 @@ -638,7 +649,7 @@ def nearest(self, limit: int | None = None): Limit the number of upsampled values imputed by the nearest: - >>> s.resample('15min').nearest(limit=1) + >>> s.resample("15min").nearest(limit=1) 2018-01-01 00:00:00 1.0 2018-01-01 00:15:00 1.0 2018-01-01 00:30:00 NaN @@ -672,9 +683,6 @@ def bfill(self, limit: int | None = None): See Also -------- - bfill : Alias of backfill. - fillna : Fill NaN values using the specified method, which can be - 'backfill'. nearest : Fill NaN values with nearest neighbor starting from center. ffill : Forward fill NaN values. Series.fillna : Fill NaN values in the Series using the @@ -684,21 +692,22 @@ def bfill(self, limit: int | None = None): References ---------- - .. [1] https://en.wikipedia.org/wiki/Imputation_(statistics) + .. [1] https://en.wikipedia.org/wiki/Imputation_%28statistics%29 Examples -------- Resampling a Series: - >>> s = pd.Series([1, 2, 3], - ... index=pd.date_range('20180101', periods=3, freq='h')) + >>> s = pd.Series( + ... [1, 2, 3], index=pd.date_range("20180101", periods=3, freq="h") + ... ) >>> s 2018-01-01 00:00:00 1 2018-01-01 01:00:00 2 2018-01-01 02:00:00 3 Freq: h, dtype: int64 - >>> s.resample('30min').bfill() + >>> s.resample("30min").bfill() 2018-01-01 00:00:00 1 2018-01-01 00:30:00 2 2018-01-01 01:00:00 2 @@ -706,7 +715,7 @@ def bfill(self, limit: int | None = None): 2018-01-01 02:00:00 3 Freq: 30min, dtype: int64 - >>> s.resample('15min').bfill(limit=2) + >>> s.resample("15min").bfill(limit=2) 2018-01-01 00:00:00 1.0 2018-01-01 00:15:00 NaN 2018-01-01 00:30:00 2.0 @@ -720,16 +729,17 @@ def bfill(self, limit: int | None = None): Resampling a DataFrame that has missing values: - >>> df = pd.DataFrame({'a': [2, np.nan, 6], 'b': [1, 3, 5]}, - ... index=pd.date_range('20180101', periods=3, - ... freq='h')) + >>> df = pd.DataFrame( + ... {"a": [2, np.nan, 6], "b": [1, 3, 5]}, + ... index=pd.date_range("20180101", periods=3, freq="h"), + ... ) >>> df a b 2018-01-01 00:00:00 2.0 1 2018-01-01 01:00:00 NaN 3 2018-01-01 02:00:00 6.0 5 - >>> df.resample('30min').bfill() + >>> df.resample("30min").bfill() a b 2018-01-01 00:00:00 2.0 1 2018-01-01 00:30:00 NaN 3 @@ -737,7 +747,7 @@ def bfill(self, limit: int | None = None): 2018-01-01 01:30:00 6.0 5 2018-01-01 02:00:00 6.0 5 - >>> df.resample('15min').bfill(limit=2) + >>> df.resample("15min").bfill(limit=2) a b 2018-01-01 00:00:00 2.0 1.0 2018-01-01 00:15:00 NaN NaN @@ -751,174 +761,6 @@ def bfill(self, limit: int | None = None): """ return self._upsample("bfill", limit=limit) - @final - def fillna(self, method, limit: int | None = None): - """ - Fill missing values introduced by upsampling. - - In statistics, imputation is the process of replacing missing data with - substituted values [1]_. When resampling data, missing values may - appear (e.g., when the resampling frequency is higher than the original - frequency). - - Missing values that existed in the original data will - not be modified. - - Parameters - ---------- - method : {'pad', 'backfill', 'ffill', 'bfill', 'nearest'} - Method to use for filling holes in resampled data - - * 'pad' or 'ffill': use previous valid observation to fill gap - (forward fill). 
- * 'backfill' or 'bfill': use next valid observation to fill gap. - * 'nearest': use nearest valid observation to fill gap. - - limit : int, optional - Limit of how many consecutive missing values to fill. - - Returns - ------- - Series or DataFrame - An upsampled Series or DataFrame with missing values filled. - - See Also - -------- - bfill : Backward fill NaN values in the resampled data. - ffill : Forward fill NaN values in the resampled data. - nearest : Fill NaN values in the resampled data - with nearest neighbor starting from center. - interpolate : Fill NaN values using interpolation. - Series.fillna : Fill NaN values in the Series using the - specified method, which can be 'bfill' and 'ffill'. - DataFrame.fillna : Fill NaN values in the DataFrame using the - specified method, which can be 'bfill' and 'ffill'. - - References - ---------- - .. [1] https://en.wikipedia.org/wiki/Imputation_(statistics) - - Examples - -------- - Resampling a Series: - - >>> s = pd.Series([1, 2, 3], - ... index=pd.date_range('20180101', periods=3, freq='h')) - >>> s - 2018-01-01 00:00:00 1 - 2018-01-01 01:00:00 2 - 2018-01-01 02:00:00 3 - Freq: h, dtype: int64 - - Without filling the missing values you get: - - >>> s.resample("30min").asfreq() - 2018-01-01 00:00:00 1.0 - 2018-01-01 00:30:00 NaN - 2018-01-01 01:00:00 2.0 - 2018-01-01 01:30:00 NaN - 2018-01-01 02:00:00 3.0 - Freq: 30min, dtype: float64 - - >>> s.resample('30min').fillna("backfill") - 2018-01-01 00:00:00 1 - 2018-01-01 00:30:00 2 - 2018-01-01 01:00:00 2 - 2018-01-01 01:30:00 3 - 2018-01-01 02:00:00 3 - Freq: 30min, dtype: int64 - - >>> s.resample('15min').fillna("backfill", limit=2) - 2018-01-01 00:00:00 1.0 - 2018-01-01 00:15:00 NaN - 2018-01-01 00:30:00 2.0 - 2018-01-01 00:45:00 2.0 - 2018-01-01 01:00:00 2.0 - 2018-01-01 01:15:00 NaN - 2018-01-01 01:30:00 3.0 - 2018-01-01 01:45:00 3.0 - 2018-01-01 02:00:00 3.0 - Freq: 15min, dtype: float64 - - >>> s.resample('30min').fillna("pad") - 2018-01-01 00:00:00 1 - 2018-01-01 00:30:00 1 - 2018-01-01 01:00:00 2 - 2018-01-01 01:30:00 2 - 2018-01-01 02:00:00 3 - Freq: 30min, dtype: int64 - - >>> s.resample('30min').fillna("nearest") - 2018-01-01 00:00:00 1 - 2018-01-01 00:30:00 2 - 2018-01-01 01:00:00 2 - 2018-01-01 01:30:00 3 - 2018-01-01 02:00:00 3 - Freq: 30min, dtype: int64 - - Missing values present before the upsampling are not affected. - - >>> sm = pd.Series([1, None, 3], - ... index=pd.date_range('20180101', periods=3, freq='h')) - >>> sm - 2018-01-01 00:00:00 1.0 - 2018-01-01 01:00:00 NaN - 2018-01-01 02:00:00 3.0 - Freq: h, dtype: float64 - - >>> sm.resample('30min').fillna('backfill') - 2018-01-01 00:00:00 1.0 - 2018-01-01 00:30:00 NaN - 2018-01-01 01:00:00 NaN - 2018-01-01 01:30:00 3.0 - 2018-01-01 02:00:00 3.0 - Freq: 30min, dtype: float64 - - >>> sm.resample('30min').fillna('pad') - 2018-01-01 00:00:00 1.0 - 2018-01-01 00:30:00 1.0 - 2018-01-01 01:00:00 NaN - 2018-01-01 01:30:00 NaN - 2018-01-01 02:00:00 3.0 - Freq: 30min, dtype: float64 - - >>> sm.resample('30min').fillna('nearest') - 2018-01-01 00:00:00 1.0 - 2018-01-01 00:30:00 NaN - 2018-01-01 01:00:00 NaN - 2018-01-01 01:30:00 3.0 - 2018-01-01 02:00:00 3.0 - Freq: 30min, dtype: float64 - - DataFrame resampling is done column-wise. All the same options are - available. - - >>> df = pd.DataFrame({'a': [2, np.nan, 6], 'b': [1, 3, 5]}, - ... index=pd.date_range('20180101', periods=3, - ... 
freq='h')) - >>> df - a b - 2018-01-01 00:00:00 2.0 1 - 2018-01-01 01:00:00 NaN 3 - 2018-01-01 02:00:00 6.0 5 - - >>> df.resample('30min').fillna("bfill") - a b - 2018-01-01 00:00:00 2.0 1 - 2018-01-01 00:30:00 NaN 3 - 2018-01-01 01:00:00 NaN 3 - 2018-01-01 01:30:00 6.0 5 - 2018-01-01 02:00:00 6.0 5 - """ - warnings.warn( - f"{type(self).__name__}.fillna is deprecated and will be removed " - "in a future version. Use obj.ffill(), obj.bfill(), " - "or obj.nearest() instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - return self._upsample(method, limit=limit) - @final def interpolate( self, @@ -977,20 +819,6 @@ def interpolate( limit_direction : {{'forward', 'backward', 'both'}}, Optional Consecutive NaNs will be filled in this direction. - If limit is specified: - * If 'method' is 'pad' or 'ffill', 'limit_direction' must be 'forward'. - * If 'method' is 'backfill' or 'bfill', 'limit_direction' must be - 'backwards'. - - If 'limit' is not specified: - * If 'method' is 'backfill' or 'bfill', the default is 'backward' - * else the default is 'forward' - - raises ValueError if `limit_direction` is 'forward' or 'both' and - method is 'backfill' or 'bfill'. - raises ValueError if `limit_direction` is 'backward' or 'both' and - method is 'pad' or 'ffill'. - limit_area : {{`None`, 'inside', 'outside'}}, default None If limit is specified, consecutive NaNs will be filled with this restriction. @@ -1005,7 +833,7 @@ def interpolate( .. deprecated:: 2.1.0 - ``**kwargs`` : optional + **kwargs : optional Keyword arguments to pass on to the interpolating function. Returns @@ -1018,6 +846,8 @@ def interpolate( core.resample.Resampler.asfreq: Return the values at the new freq, essentially a reindex. DataFrame.interpolate: Fill NaN values using an interpolation method. + DataFrame.bfill : Backward fill NaN values in the resampled data. + DataFrame.ffill : Forward fill NaN values. Notes ----- @@ -1039,7 +869,7 @@ def interpolate( 2023-03-01 07:00:04 3 Freq: s, dtype: int64 - Upsample the dataframe to 0.5Hz by providing the period time of 2s. + Downsample the dataframe to 0.5Hz by providing the period time of 2s. >>> series.resample("2s").interpolate("linear") 2023-03-01 07:00:00 1 @@ -1047,7 +877,7 @@ def interpolate( 2023-03-01 07:00:04 3 Freq: 2s, dtype: int64 - Downsample the dataframe to 2Hz by providing the period time of 500ms. + Upsample the dataframe to 2Hz by providing the period time of 500ms. >>> series.resample("500ms").interpolate("linear") 2023-03-01 07:00:00.000 1.0 @@ -1062,30 +892,59 @@ def interpolate( Freq: 500ms, dtype: float64 Internal reindexing with ``asfreq()`` prior to interpolation leads to - an interpolated timeseries on the basis the reindexed timestamps (anchors). - Since not all datapoints from original series become anchors, - it can lead to misleading interpolation results as in the following example: + an interpolated timeseries on the basis of the reindexed timestamps + (anchors). 
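(For illustration: the anchor-preserving interpolation documented in this hunk is easy to check with a small script. This is a hedged sketch, not part of the patch; it assumes a pandas build that includes this change, and the series values are illustrative rather than taken from the docstring example.)

```python
import pandas as pd

# Three observations two seconds apart; values chosen only to make the
# interpolation between anchors visible.
series = pd.Series(
    [1, 2, 3],
    index=pd.DatetimeIndex(
        ["2023-03-01 07:00:00", "2023-03-01 07:00:02", "2023-03-01 07:00:04"]
    ),
)

# 400ms bins do not line up with the 2s observations; with this change the
# original points are still used as interpolation anchors and then dropped
# from the resampled output.
print(series.resample("400ms").interpolate("linear"))

# Grouped frames resampled this way produce a MultiIndex, which this code
# path rejects; the error message added below suggests interpolating per
# group instead:
#   df.groupby(key).apply(lambda g: g.resample("400ms").interpolate("linear"))
```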
All available datapoints from the original + series are kept as anchors, so interpolation also works for resampling + cases that lead to non-aligned timestamps, as in the following example: >>> series.resample("400ms").interpolate("linear") - 2023-03-01 07:00:00.000 1.0 - 2023-03-01 07:00:00.400 1.2 - 2023-03-01 07:00:00.800 1.4 - 2023-03-01 07:00:01.200 1.6 - 2023-03-01 07:00:01.600 1.8 - 2023-03-01 07:00:02.000 2.0 - 2023-03-01 07:00:02.400 2.2 - 2023-03-01 07:00:02.800 2.4 - 2023-03-01 07:00:03.200 2.6 - 2023-03-01 07:00:03.600 2.8 - 2023-03-01 07:00:04.000 3.0 + 2023-03-01 07:00:00.000 1.000000 + 2023-03-01 07:00:00.400 0.333333 + 2023-03-01 07:00:00.800 -0.333333 + 2023-03-01 07:00:01.200 0.000000 + 2023-03-01 07:00:01.600 1.000000 + 2023-03-01 07:00:02.000 2.000000 + 2023-03-01 07:00:02.400 1.666667 + 2023-03-01 07:00:02.800 1.333333 + 2023-03-01 07:00:03.200 1.666667 + 2023-03-01 07:00:03.600 2.333333 + 2023-03-01 07:00:04.000 3.000000 Freq: 400ms, dtype: float64 - Note that the series erroneously increases between two anchors + Note that the series correctly decreases between two anchors ``07:00:00`` and ``07:00:02``. """ assert downcast is lib.no_default # just checking coverage result = self._upsample("asfreq") - return result.interpolate( + + # If the original data has timestamps which are not aligned with the + # target timestamps, we need to add those points back to the data frame + # that is supposed to be interpolated. This does not work with + # PeriodIndex, so we skip this case. GH#21351 + obj = self._selected_obj + is_period_index = isinstance(obj.index, PeriodIndex) + + # Skip this step for PeriodIndex + if not is_period_index: + final_index = result.index + if isinstance(final_index, MultiIndex): + raise NotImplementedError( + "Direct interpolation of MultiIndex data frames is not " + "supported. If you tried to resample and interpolate on a " + "grouped data frame, please use:\n" + "`df.groupby(...).apply(lambda x: x.resample(...)." + "interpolate(...))`" + "\ninstead, as resampling and interpolation has to be " + "performed for each group independently." + ) + + missing_data_points_index = obj.index.difference(final_index) + if len(missing_data_points_index) > 0: + result = concat( + [result, obj.loc[missing_data_points_index]] + ).sort_index() + + result_interpolated = result.interpolate( method=method, axis=axis, limit=limit, @@ -1096,6 +955,18 @@ def interpolate( **kwargs, ) + + # No further steps if the original data has a PeriodIndex + if is_period_index: + return result_interpolated + + # Make sure that original data points which do not align with the + # resampled index are removed + result_interpolated = result_interpolated.loc[final_index] + + # Make sure frequency indexes are preserved + result_interpolated.index = final_index + return result_interpolated + @final def asfreq(self, fill_value=None): """ @@ -1120,15 +991,19 @@ Examples -------- - >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( - ... ['2023-01-01', '2023-01-31', '2023-02-01', '2023-02-28'])) + >>> ser = pd.Series( + ... [1, 2, 3, 4], + ... index=pd.DatetimeIndex( + ... ["2023-01-01", "2023-01-31", "2023-02-01", "2023-02-28"] + ... ), + ... 
) >>> ser 2023-01-01 1 2023-01-31 2 2023-02-01 3 2023-02-28 4 dtype: int64 - >>> ser.resample('MS').asfreq() + >>> ser.resample("MS").asfreq() 2023-01-01 1 2023-02-01 3 Freq: MS, dtype: int64 @@ -1140,12 +1015,14 @@ def sum( self, numeric_only: bool = False, min_count: int = 0, - *args, - **kwargs, ): """ Compute sum of group values. + This method provides a simple way to compute the sum of values within each + resampled group, particularly useful for aggregating time-based data into + daily, monthly, or yearly sums. + Parameters ---------- numeric_only : bool, default False @@ -1164,23 +1041,33 @@ def sum( Series or DataFrame Computed sum of values within each group. + See Also + -------- + core.resample.Resampler.mean : Compute mean of groups, excluding missing values. + core.resample.Resampler.count : Compute count of group, excluding missing + values. + DataFrame.resample : Resample time-series data. + Series.sum : Return the sum of the values over the requested axis. + Examples -------- - >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( - ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser = pd.Series( + ... [1, 2, 3, 4], + ... index=pd.DatetimeIndex( + ... ["2023-01-01", "2023-01-15", "2023-02-01", "2023-02-15"] + ... ), + ... ) >>> ser 2023-01-01 1 2023-01-15 2 2023-02-01 3 2023-02-15 4 dtype: int64 - >>> ser.resample('MS').sum() + >>> ser.resample("MS").sum() 2023-01-01 3 2023-02-01 7 Freq: MS, dtype: int64 """ - maybe_warn_args_and_kwargs(type(self), "sum", args, kwargs) - nv.validate_resampler_func("sum", args, kwargs) return self._downsample("sum", numeric_only=numeric_only, min_count=min_count) @final @@ -1188,8 +1075,6 @@ def prod( self, numeric_only: bool = False, min_count: int = 0, - *args, - **kwargs, ): """ Compute prod of group values. @@ -1212,23 +1097,32 @@ def prod( Series or DataFrame Computed prod of values within each group. + See Also + -------- + core.resample.Resampler.sum : Compute sum of groups, excluding missing values. + core.resample.Resampler.mean : Compute mean of groups, excluding missing values. + core.resample.Resampler.median : Compute median of groups, excluding missing + values. + Examples -------- - >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( - ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser = pd.Series( + ... [1, 2, 3, 4], + ... index=pd.DatetimeIndex( + ... ["2023-01-01", "2023-01-15", "2023-02-01", "2023-02-15"] + ... ), + ... ) >>> ser 2023-01-01 1 2023-01-15 2 2023-02-01 3 2023-02-15 4 dtype: int64 - >>> ser.resample('MS').prod() + >>> ser.resample("MS").prod() 2023-01-01 2 2023-02-01 12 Freq: MS, dtype: int64 """ - maybe_warn_args_and_kwargs(type(self), "prod", args, kwargs) - nv.validate_resampler_func("prod", args, kwargs) return self._downsample("prod", numeric_only=numeric_only, min_count=min_count) @final @@ -1236,34 +1130,54 @@ def min( self, numeric_only: bool = False, min_count: int = 0, - *args, - **kwargs, ): """ Compute min value of group. + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionchanged:: 2.0.0 + + numeric_only no longer accepts ``None``. + + min_count : int, default 0 + The required number of valid values to perform the operation. If fewer + than ``min_count`` non-NA values are present the result will be NA. + Returns ------- Series or DataFrame + Compute the minimum value in the given Series or DataFrame. 
+ + See Also + -------- + core.resample.Resampler.max : Compute max value of group. + core.resample.Resampler.mean : Compute mean of groups, excluding missing values. + core.resample.Resampler.median : Compute median of groups, excluding missing + values. Examples -------- - >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( - ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser = pd.Series( + ... [1, 2, 3, 4], + ... index=pd.DatetimeIndex( + ... ["2023-01-01", "2023-01-15", "2023-02-01", "2023-02-15"] + ... ), + ... ) >>> ser 2023-01-01 1 2023-01-15 2 2023-02-01 3 2023-02-15 4 dtype: int64 - >>> ser.resample('MS').min() + >>> ser.resample("MS").min() 2023-01-01 1 2023-02-01 3 Freq: MS, dtype: int64 """ - - maybe_warn_args_and_kwargs(type(self), "min", args, kwargs) - nv.validate_resampler_func("min", args, kwargs) return self._downsample("min", numeric_only=numeric_only, min_count=min_count) @final @@ -1271,33 +1185,54 @@ def max( self, numeric_only: bool = False, min_count: int = 0, - *args, - **kwargs, ): """ Compute max value of group. + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionchanged:: 2.0.0 + + numeric_only no longer accepts ``None``. + + min_count : int, default 0 + The required number of valid values to perform the operation. If fewer + than ``min_count`` non-NA values are present the result will be NA. + Returns ------- Series or DataFrame + Compute the maximum value in the given Series or DataFrame. + + See Also + -------- + core.resample.Resampler.min : Compute min value of group. + core.resample.Resampler.mean : Compute mean of groups, excluding missing values. + core.resample.Resampler.median : Compute median of groups, excluding missing + values. Examples -------- - >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( - ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser = pd.Series( + ... [1, 2, 3, 4], + ... index=pd.DatetimeIndex( + ... ["2023-01-01", "2023-01-15", "2023-02-01", "2023-02-15"] + ... ), + ... 
) >>> ser 2023-01-01 1 2023-01-15 2 2023-02-01 3 2023-02-15 4 dtype: int64 - >>> ser.resample('MS').max() + >>> ser.resample("MS").max() 2023-01-01 2 2023-02-01 4 Freq: MS, dtype: int64 """ - maybe_warn_args_and_kwargs(type(self), "max", args, kwargs) - nv.validate_resampler_func("max", args, kwargs) return self._downsample("max", numeric_only=numeric_only, min_count=min_count) @final @@ -1306,12 +1241,11 @@ def first( self, numeric_only: bool = False, min_count: int = 0, - *args, - **kwargs, + skipna: bool = True, ): - maybe_warn_args_and_kwargs(type(self), "first", args, kwargs) - nv.validate_resampler_func("first", args, kwargs) - return self._downsample("first", numeric_only=numeric_only, min_count=min_count) + return self._downsample( + "first", numeric_only=numeric_only, min_count=min_count, skipna=skipna + ) @final @doc(GroupBy.last) @@ -1319,26 +1253,66 @@ def last( self, numeric_only: bool = False, min_count: int = 0, - *args, - **kwargs, + skipna: bool = True, ): - maybe_warn_args_and_kwargs(type(self), "last", args, kwargs) - nv.validate_resampler_func("last", args, kwargs) - return self._downsample("last", numeric_only=numeric_only, min_count=min_count) + return self._downsample( + "last", numeric_only=numeric_only, min_count=min_count, skipna=skipna + ) @final - @doc(GroupBy.median) - def median(self, numeric_only: bool = False, *args, **kwargs): - maybe_warn_args_and_kwargs(type(self), "median", args, kwargs) - nv.validate_resampler_func("median", args, kwargs) + def median(self, numeric_only: bool = False): + """ + Compute median of groups, excluding missing values. + + For multiple groupings, the result index will be a MultiIndex + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionchanged:: 2.0.0 + + numeric_only no longer accepts ``None`` and defaults to False. + + Returns + ------- + Series or DataFrame + Median of values within each group. + + See Also + -------- + Series.groupby : Apply a function groupby to a Series. + DataFrame.groupby : Apply a function groupby to each row or column of a + DataFrame. + + Examples + -------- + + >>> ser = pd.Series( + ... [1, 2, 3, 3, 4, 5], + ... index=pd.DatetimeIndex( + ... [ + ... "2023-01-01", + ... "2023-01-10", + ... "2023-01-15", + ... "2023-02-01", + ... "2023-02-10", + ... "2023-02-15", + ... ] + ... ), + ... ) + >>> ser.resample("MS").median() + 2023-01-01 2.0 + 2023-02-01 4.0 + Freq: MS, dtype: float64 + """ return self._downsample("median", numeric_only=numeric_only) @final def mean( self, numeric_only: bool = False, - *args, - **kwargs, ): """ Compute mean of groups, excluding missing values. @@ -1357,24 +1331,36 @@ def mean( DataFrame or Series Mean of values within each group. + See Also + -------- + core.resample.Resampler.median : Compute median of groups, excluding missing + values. + core.resample.Resampler.sum : Compute sum of groups, excluding missing values. + core.resample.Resampler.std : Compute standard deviation of groups, excluding + missing values. + core.resample.Resampler.var : Compute variance of groups, excluding missing + values. + Examples -------- - >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( - ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser = pd.Series( + ... [1, 2, 3, 4], + ... index=pd.DatetimeIndex( + ... ["2023-01-01", "2023-01-15", "2023-02-01", "2023-02-15"] + ... ), + ... 
) >>> ser 2023-01-01 1 2023-01-15 2 2023-02-01 3 2023-02-15 4 dtype: int64 - >>> ser.resample('MS').mean() + >>> ser.resample("MS").mean() 2023-01-01 1.5 2023-02-01 3.5 Freq: MS, dtype: float64 """ - maybe_warn_args_and_kwargs(type(self), "mean", args, kwargs) - nv.validate_resampler_func("mean", args, kwargs) return self._downsample("mean", numeric_only=numeric_only) @final @@ -1382,8 +1368,6 @@ def std( self, ddof: int = 1, numeric_only: bool = False, - *args, - **kwargs, ): """ Compute standard deviation of groups, excluding missing values. @@ -1406,23 +1390,35 @@ def std( DataFrame or Series Standard deviation of values within each group. + See Also + -------- + core.resample.Resampler.mean : Compute mean of groups, excluding missing values. + core.resample.Resampler.median : Compute median of groups, excluding missing + values. + core.resample.Resampler.var : Compute variance of groups, excluding missing + values. + Examples -------- - >>> ser = pd.Series([1, 3, 2, 4, 3, 8], - ... index=pd.DatetimeIndex(['2023-01-01', - ... '2023-01-10', - ... '2023-01-15', - ... '2023-02-01', - ... '2023-02-10', - ... '2023-02-15'])) - >>> ser.resample('MS').std() + >>> ser = pd.Series( + ... [1, 3, 2, 4, 3, 8], + ... index=pd.DatetimeIndex( + ... [ + ... "2023-01-01", + ... "2023-01-10", + ... "2023-01-15", + ... "2023-02-01", + ... "2023-02-10", + ... "2023-02-15", + ... ] + ... ), + ... ) + >>> ser.resample("MS").std() 2023-01-01 1.000000 2023-02-01 2.645751 Freq: MS, dtype: float64 """ - maybe_warn_args_and_kwargs(type(self), "std", args, kwargs) - nv.validate_resampler_func("std", args, kwargs) return self._downsample("std", ddof=ddof, numeric_only=numeric_only) @final @@ -1430,8 +1426,6 @@ def var( self, ddof: int = 1, numeric_only: bool = False, - *args, - **kwargs, ): """ Compute variance of groups, excluding missing values. @@ -1455,53 +1449,103 @@ def var( DataFrame or Series Variance of values within each group. + See Also + -------- + core.resample.Resampler.std : Compute standard deviation of groups, excluding + missing values. + core.resample.Resampler.mean : Compute mean of groups, excluding missing values. + core.resample.Resampler.median : Compute median of groups, excluding missing + values. + Examples -------- - >>> ser = pd.Series([1, 3, 2, 4, 3, 8], - ... index=pd.DatetimeIndex(['2023-01-01', - ... '2023-01-10', - ... '2023-01-15', - ... '2023-02-01', - ... '2023-02-10', - ... '2023-02-15'])) - >>> ser.resample('MS').var() + >>> ser = pd.Series( + ... [1, 3, 2, 4, 3, 8], + ... index=pd.DatetimeIndex( + ... [ + ... "2023-01-01", + ... "2023-01-10", + ... "2023-01-15", + ... "2023-02-01", + ... "2023-02-10", + ... "2023-02-15", + ... ] + ... ), + ... ) + >>> ser.resample("MS").var() 2023-01-01 1.0 2023-02-01 7.0 Freq: MS, dtype: float64 - >>> ser.resample('MS').var(ddof=0) + >>> ser.resample("MS").var(ddof=0) 2023-01-01 0.666667 2023-02-01 4.666667 Freq: MS, dtype: float64 """ - maybe_warn_args_and_kwargs(type(self), "var", args, kwargs) - nv.validate_resampler_func("var", args, kwargs) return self._downsample("var", ddof=ddof, numeric_only=numeric_only) @final - @doc(GroupBy.sem) def sem( self, ddof: int = 1, numeric_only: bool = False, - *args, - **kwargs, ): - maybe_warn_args_and_kwargs(type(self), "sem", args, kwargs) - nv.validate_resampler_func("sem", args, kwargs) + """ + Compute standard error of the mean of groups, excluding missing values. + + For multiple groupings, the result index will be a MultiIndex. 
+ + Parameters + ---------- + ddof : int, default 1 + Degrees of freedom. + + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + + .. versionadded:: 1.5.0 + + .. versionchanged:: 2.0.0 + + numeric_only now defaults to ``False``. + + Returns + ------- + Series or DataFrame + Standard error of the mean of values within each group. + + See Also + -------- + DataFrame.sem : Return unbiased standard error of the mean over requested axis. + Series.sem : Return unbiased standard error of the mean over requested axis. + + Examples + -------- + + >>> ser = pd.Series( + ... [1, 3, 2, 4, 3, 8], + ... index=pd.DatetimeIndex( + ... [ + ... "2023-01-01", + ... "2023-01-10", + ... "2023-01-15", + ... "2023-02-01", + ... "2023-02-10", + ... "2023-02-15", + ... ] + ... ), + ... ) + >>> ser.resample("MS").sem() + 2023-01-01 0.577350 + 2023-02-01 1.527525 + Freq: MS, dtype: float64 + """ return self._downsample("sem", ddof=ddof, numeric_only=numeric_only) @final @doc(GroupBy.ohlc) - def ohlc( - self, - *args, - **kwargs, - ): - maybe_warn_args_and_kwargs(type(self), "ohlc", args, kwargs) - nv.validate_resampler_func("ohlc", args, kwargs) - + def ohlc(self): ax = self.ax obj = self._obj_with_exclusions if len(ax) == 0: @@ -1521,14 +1565,38 @@ def ohlc( return self._downsample("ohlc") @final - @doc(SeriesGroupBy.nunique) - def nunique( - self, - *args, - **kwargs, - ): - maybe_warn_args_and_kwargs(type(self), "nunique", args, kwargs) - nv.validate_resampler_func("nunique", args, kwargs) + def nunique(self): + """ + Return number of unique elements in the group. + + Returns + ------- + Series + Number of unique values within each group. + + See Also + -------- + core.groupby.SeriesGroupBy.nunique : Method nunique for SeriesGroupBy. + + Examples + -------- + >>> ser = pd.Series( + ... [1, 2, 3, 3], + ... index=pd.DatetimeIndex( + ... ["2023-01-01", "2023-01-15", "2023-02-01", "2023-02-15"] + ... ), + ... ) + >>> ser + 2023-01-01 1 + 2023-01-15 2 + 2023-02-01 3 + 2023-02-15 3 + dtype: int64 + >>> ser.resample("MS").nunique() + 2023-01-01 2 + 2023-02-01 1 + Freq: MS, dtype: int64 + """ return self._downsample("nunique") @final @@ -1539,7 +1607,7 @@ def size(self): # If the result is a non-empty DataFrame we stack to get a Series # GH 46826 if isinstance(result, ABCDataFrame) and not result.empty: - result = result.stack(future_stack=True) + result = result.stack() if not len(self.ax): from pandas import Series @@ -1597,19 +1665,25 @@ def quantile(self, q: float | list[float] | AnyArrayLike = 0.5, **kwargs): Examples -------- - >>> ser = pd.Series([1, 3, 2, 4, 3, 8], - ... index=pd.DatetimeIndex(['2023-01-01', - ... '2023-01-10', - ... '2023-01-15', - ... '2023-02-01', - ... '2023-02-10', - ... '2023-02-15'])) - >>> ser.resample('MS').quantile() + >>> ser = pd.Series( + ... [1, 3, 2, 4, 3, 8], + ... index=pd.DatetimeIndex( + ... [ + ... "2023-01-01", + ... "2023-01-10", + ... "2023-01-15", + ... "2023-02-01", + ... "2023-02-10", + ... "2023-02-15", + ... ] + ... ), + ... 
) + >>> ser.resample("MS").quantile() 2023-01-01 2.0 2023-02-01 4.0 Freq: MS, dtype: float64 - >>> ser.resample('MS').quantile(.25) + >>> ser.resample("MS").quantile(0.25) 2023-01-01 1.5 2023-02-01 3.5 Freq: MS, dtype: float64 @@ -1634,7 +1708,6 @@ def __init__( groupby: GroupBy, key=None, selection: IndexLabel | None = None, - include_groups: bool = False, ) -> None: # reached via ._gotitem and _get_resampler_for_grouping @@ -1657,7 +1730,6 @@ def __init__( self.ax = parent.ax self.obj = parent.obj - self.include_groups = include_groups @no_type_check def _apply(self, f, *args, **kwargs): @@ -1674,7 +1746,18 @@ def func(x): return x.apply(f, *args, **kwargs) - result = _apply(self._groupby, func, include_groups=self.include_groups) + result = self._groupby.apply(func) + + # GH 47705 + if ( + isinstance(result, ABCDataFrame) + and len(result) == 0 + and not isinstance(result.index, PeriodIndex) + ): + result = result.set_index( + _asfreq_compat(self.obj.index[:0], freq=self.freq), append=True + ) + return self._wrap_result(result) _upsample = _apply @@ -1725,12 +1808,12 @@ class DatetimeIndexResampler(Resampler): ax: DatetimeIndex @property - def _resampler_for_grouping(self): + def _resampler_for_grouping(self) -> type[DatetimeIndexResamplerGroupby]: return DatetimeIndexResamplerGroupby def _get_binner_for_time(self): # this is how we are actually creating the bins - if self.kind == "period": + if isinstance(self.ax, PeriodIndex): return self._timegrouper._get_time_period_bins(self.ax) return self._timegrouper._get_time_bins(self.ax) @@ -1743,10 +1826,6 @@ def _downsample(self, how, **kwargs): how : string / cython mapped function **kwargs : kw args passed to how function """ - orig_how = how - how = com.get_cython_func(how) or how - if orig_how != how: - warn_alias_replacement(self, orig_how, how) ax = self.ax # Excludes `on` column when provided @@ -1772,12 +1851,7 @@ def _downsample(self, how, **kwargs): # we are downsampling # we want to call the actual grouper method here - if self.axis == 0: - result = obj.groupby(self._grouper).aggregate(how, **kwargs) - else: - # test_resample_axis1 - result = obj.T.groupby(self._grouper).aggregate(how, **kwargs).T - + result = obj.groupby(self._grouper).aggregate(how, **kwargs) return self._wrap_result(result) def _adjust_binner_for_upsample(self, binner): @@ -1802,14 +1876,7 @@ def _upsample(self, method, limit: int | None = None, fill_value=None): Maximum size gap to fill when reindexing fill_value : scalar, default None Value to use for missing values - - See Also - -------- - .fillna: Fill NA/NaN values using the specified method. - """ - if self.axis: - raise AssertionError("axis must be 0") if self._from_selection: raise ValueError( "Upsampling from level= or on= selection " @@ -1844,7 +1911,9 @@ def _wrap_result(self, result): # we may have a different kind that we were asked originally # convert if needed - if self.kind == "period" and not isinstance(result.index, PeriodIndex): + if isinstance(self.ax, PeriodIndex) and not isinstance( + result.index, PeriodIndex + ): if isinstance(result.index, MultiIndex): # GH 24103 - e.g. 
groupby resample if not isinstance(result.index.levels[-1], PeriodIndex): @@ -1885,7 +1954,7 @@ def _resampler_for_grouping(self): return PeriodIndexResamplerGroupby def _get_binner_for_time(self): - if self.kind == "timestamp": + if isinstance(self.ax, DatetimeIndex): return super()._get_binner_for_time() return self._timegrouper._get_period_bins(self.ax) @@ -1902,7 +1971,7 @@ def _convert_obj(self, obj: NDFrameT) -> NDFrameT: raise NotImplementedError(msg) # convert to timestamp - if self.kind == "timestamp": + if isinstance(self.ax, DatetimeIndex): obj = obj.to_timestamp(how=self.convention) return obj @@ -1917,13 +1986,9 @@ def _downsample(self, how, **kwargs): **kwargs : kw args passed to how function """ # we may need to actually resample as if we are timestamps - if self.kind == "timestamp": + if isinstance(self.ax, DatetimeIndex): return super()._downsample(how, **kwargs) - orig_how = how - how = com.get_cython_func(how) or how - if orig_how != how: - warn_alias_replacement(self, orig_how, how) ax = self.ax if is_subperiod(ax.freq, self.freq): @@ -1956,14 +2021,9 @@ def _upsample(self, method, limit: int | None = None, fill_value=None): Maximum size gap to fill when reindexing. fill_value : scalar, default None Value to use for missing values. - - See Also - -------- - .fillna: Fill NA/NaN values using the specified method. - """ # we may need to actually resample as if we are timestamps - if self.kind == "timestamp": + if isinstance(self.ax, DatetimeIndex): return super()._upsample(method, limit=limit, fill_value=fill_value) ax = self.ax @@ -1981,7 +2041,6 @@ def _upsample(self, method, limit: int | None = None, fill_value=None): obj, indexer, new_index, - axis=self.axis, ) return self._wrap_result(new_obj) @@ -2036,12 +2095,12 @@ def _resampler_cls(self): return TimedeltaIndexResampler -def get_resampler(obj: Series | DataFrame, kind=None, **kwds) -> Resampler: +def get_resampler(obj: Series | DataFrame, **kwds) -> Resampler: """ Create a TimeGrouper and return our resampler. 
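(For illustration: with the ``kind`` keyword removed, dispatch in the hunks above follows the index type, and resampling directly on a PeriodIndex warns per GH#53481. A hedged sketch of the supported datetime path, assuming a pandas build with these changes:)

```python
import pandas as pd

pser = pd.Series(
    [1, 2, 3, 4],
    index=pd.period_range("2023-01", periods=4, freq="M"),
)

# Resampling the PeriodIndex directly now emits a FutureWarning (GH#53481);
# converting to timestamps first goes through the plain DatetimeIndex path.
out = pser.to_timestamp().resample("QS").sum()
print(out)
```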
""" tg = TimeGrouper(obj, **kwds) # type: ignore[arg-type] - return tg._get_resampler(obj, kind=kind) + return tg._get_resampler(obj) get_resampler.__doc__ = Resampler.__doc__ @@ -2053,9 +2112,7 @@ def get_resampler_for_grouping( how=None, fill_method=None, limit: int | None = None, - kind=None, on=None, - include_groups: bool = True, **kwargs, ) -> Resampler: """ @@ -2063,10 +2120,8 @@ def get_resampler_for_grouping( """ # .resample uses 'on' similar to how .groupby uses 'key' tg = TimeGrouper(freq=rule, key=on, **kwargs) - resampler = tg._get_resampler(groupby.obj, kind=kind) - return resampler._get_resampler_for_grouping( - groupby=groupby, include_groups=include_groups, key=tg.key - ) + resampler = tg._get_resampler(groupby.obj) + return resampler._get_resampler_for_grouping(groupby=groupby, key=tg.key) class TimeGrouper(Grouper): @@ -2086,7 +2141,6 @@ class TimeGrouper(Grouper): "closed", "label", "how", - "kind", "convention", "origin", "offset", @@ -2102,10 +2156,8 @@ def __init__( closed: Literal["left", "right"] | None = None, label: Literal["left", "right"] | None = None, how: str = "mean", - axis: Axis = 0, fill_method=None, limit: int | None = None, - kind: str | None = None, convention: Literal["start", "end", "e", "s"] | None = None, origin: Literal["epoch", "start", "start_day", "end", "end_day"] | TimestampConvertibleTypes = "start_day", @@ -2123,9 +2175,7 @@ def __init__( raise ValueError(f"Unsupported value {convention} for `convention`") if ( - key is None - and obj is not None - and isinstance(obj.index, PeriodIndex) # type: ignore[attr-defined] + (key is None and obj is not None and isinstance(obj.index, PeriodIndex)) # type: ignore[attr-defined] or ( key is not None and obj is not None @@ -2163,7 +2213,6 @@ def __init__( self.closed = closed self.label = label - self.kind = kind self.convention = convention if convention is not None else "e" self.how = how self.fill_method = fill_method @@ -2199,17 +2248,15 @@ def __init__( # always sort time groupers kwargs["sort"] = True - super().__init__(freq=freq, key=key, axis=axis, **kwargs) + super().__init__(freq=freq, key=key, **kwargs) - def _get_resampler(self, obj: NDFrame, kind=None) -> Resampler: + def _get_resampler(self, obj: NDFrame) -> Resampler: """ Return my resampler or raise if we have an invalid axis. Parameters ---------- obj : Series or DataFrame - kind : string, optional - 'period','timestamp','timedelta' are valid Returns ------- @@ -2225,12 +2272,10 @@ def _get_resampler(self, obj: NDFrame, kind=None) -> Resampler: return DatetimeIndexResampler( obj, timegrouper=self, - kind=kind, - axis=self.axis, group_keys=self.group_keys, gpr_index=ax, ) - elif isinstance(ax, PeriodIndex) or kind == "period": + elif isinstance(ax, PeriodIndex): if isinstance(ax, PeriodIndex): # GH#53481 warnings.warn( @@ -2239,18 +2284,9 @@ def _get_resampler(self, obj: NDFrame, kind=None) -> Resampler: FutureWarning, stacklevel=find_stack_level(), ) - else: - warnings.warn( - "Resampling with kind='period' is deprecated. 
" - "Use datetime paths instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) return PeriodIndexResampler( obj, timegrouper=self, - kind=kind, - axis=self.axis, group_keys=self.group_keys, gpr_index=ax, ) @@ -2258,7 +2294,6 @@ def _get_resampler(self, obj: NDFrame, kind=None) -> Resampler: return TimedeltaIndexResampler( obj, timegrouper=self, - axis=self.axis, group_keys=self.group_keys, gpr_index=ax, ) @@ -2529,18 +2564,27 @@ def _set_grouper( return obj, ax, indexer +@overload def _take_new_index( - obj: NDFrameT, indexer: npt.NDArray[np.intp], new_index: Index, axis: AxisInt = 0 -) -> NDFrameT: + obj: DataFrame, indexer: npt.NDArray[np.intp], new_index: Index +) -> DataFrame: ... + + +@overload +def _take_new_index( + obj: Series, indexer: npt.NDArray[np.intp], new_index: Index +) -> Series: ... + + +def _take_new_index( + obj: DataFrame | Series, + indexer: npt.NDArray[np.intp], + new_index: Index, +) -> DataFrame | Series: if isinstance(obj, ABCSeries): new_values = algos.take_nd(obj._values, indexer) - # error: Incompatible return value type (got "Series", expected "NDFrameT") - return obj._constructor( # type: ignore[return-value] - new_values, index=new_index, name=obj.name - ) + return obj._constructor(new_values, index=new_index, name=obj.name) elif isinstance(obj, ABCDataFrame): - if axis == 1: - raise NotImplementedError("axis 1 is not supported") new_mgr = obj._mgr.reindex_indexer(new_axis=new_index, indexer=indexer, axis=1) return obj._constructor_from_mgr(new_mgr, axes=new_mgr.axes) else: @@ -2611,7 +2655,7 @@ def _get_timestamp_range_edges( ) if isinstance(freq, Day): first = first.tz_localize(index_tz) - last = last.tz_localize(index_tz) + last = last.tz_localize(index_tz, nonexistent="shift_forward") else: first = first.normalize() last = last.normalize() @@ -2806,12 +2850,7 @@ def asfreq( if isinstance(freq, BaseOffset): if hasattr(freq, "_period_dtype_code"): - freq = freq_to_period_freqstr(freq.n, freq.name) - else: - raise ValueError( - f"Invalid offset: '{freq.base}' for converting time series " - f"with PeriodIndex." - ) + freq = PeriodDtype(freq)._freqstr new_obj = obj.copy() new_obj.index = obj.index.asfreq(freq, how=how) @@ -2834,7 +2873,7 @@ def asfreq( return new_obj -def _asfreq_compat(index: DatetimeIndex | PeriodIndex | TimedeltaIndex, freq): +def _asfreq_compat(index: FreqIndexT, freq) -> FreqIndexT: """ Helper to mimic asfreq on (empty) DatetimeIndex and TimedeltaIndex. @@ -2852,7 +2891,6 @@ def _asfreq_compat(index: DatetimeIndex | PeriodIndex | TimedeltaIndex, freq): raise ValueError( "Can only set arbitrary freq for empty DatetimeIndex or TimedeltaIndex" ) - new_index: Index if isinstance(index, PeriodIndex): new_index = index.asfreq(freq=freq) elif isinstance(index, DatetimeIndex): @@ -2862,52 +2900,3 @@ def _asfreq_compat(index: DatetimeIndex | PeriodIndex | TimedeltaIndex, freq): else: # pragma: no cover raise TypeError(type(index)) return new_index - - -def maybe_warn_args_and_kwargs(cls, kernel: str, args, kwargs) -> None: - """ - Warn for deprecation of args and kwargs in resample functions. - - Parameters - ---------- - cls : type - Class to warn about. - kernel : str - Operation name. - args : tuple or None - args passed by user. Will be None if and only if kernel does not have args. - kwargs : dict or None - kwargs passed by user. Will be None if and only if kernel does not have kwargs. 
- """ - warn_args = args is not None and len(args) > 0 - warn_kwargs = kwargs is not None and len(kwargs) > 0 - if warn_args and warn_kwargs: - msg = "args and kwargs" - elif warn_args: - msg = "args" - elif warn_kwargs: - msg = "kwargs" - else: - return - warnings.warn( - f"Passing additional {msg} to {cls.__name__}.{kernel} has " - "no impact on the result and is deprecated. This will " - "raise a TypeError in a future version of pandas.", - category=FutureWarning, - stacklevel=find_stack_level(), - ) - - -def _apply( - grouped: GroupBy, how: Callable, *args, include_groups: bool, **kwargs -) -> DataFrame: - # GH#7155 - rewrite warning to appear as if it came from `.resample` - target_message = "DataFrameGroupBy.apply operated on the grouping columns" - new_message = _apply_groupings_depr.format("DataFrameGroupBy", "resample") - with rewrite_warning( - target_message=target_message, - target_category=FutureWarning, - new_message=new_message, - ): - result = grouped.apply(how, *args, include_groups=include_groups, **kwargs) - return result diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index aacea92611697..5efaf0dc051bd 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -1,12 +1,13 @@ """ Concat routines. """ + from __future__ import annotations from collections import abc +import types from typing import ( TYPE_CHECKING, - Callable, Literal, cast, overload, @@ -15,14 +16,13 @@ import numpy as np -from pandas._config import using_copy_on_write - -from pandas.util._decorators import cache_readonly +from pandas._libs import lib +from pandas.util._decorators import set_module from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( is_bool, - is_iterator, + is_scalar, ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ( @@ -49,6 +49,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Iterable, Mapping, @@ -81,9 +82,8 @@ def concat( names: list[HashableT] | None = ..., verify_integrity: bool = ..., sort: bool = ..., - copy: bool | None = ..., -) -> DataFrame: - ... + copy: bool | lib.NoDefault = ..., +) -> DataFrame: ... @overload @@ -98,9 +98,8 @@ def concat( names: list[HashableT] | None = ..., verify_integrity: bool = ..., sort: bool = ..., - copy: bool | None = ..., -) -> Series: - ... + copy: bool | lib.NoDefault = ..., +) -> Series: ... @overload @@ -115,9 +114,8 @@ def concat( names: list[HashableT] | None = ..., verify_integrity: bool = ..., sort: bool = ..., - copy: bool | None = ..., -) -> DataFrame | Series: - ... + copy: bool | lib.NoDefault = ..., +) -> DataFrame | Series: ... @overload @@ -132,9 +130,8 @@ def concat( names: list[HashableT] | None = ..., verify_integrity: bool = ..., sort: bool = ..., - copy: bool | None = ..., -) -> DataFrame: - ... + copy: bool | lib.NoDefault = ..., +) -> DataFrame: ... @overload @@ -149,11 +146,11 @@ def concat( names: list[HashableT] | None = ..., verify_integrity: bool = ..., sort: bool = ..., - copy: bool | None = ..., -) -> DataFrame | Series: - ... + copy: bool | lib.NoDefault = ..., +) -> DataFrame | Series: ... 
+@set_module("pandas") def concat( objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame], *, @@ -165,7 +162,7 @@ def concat( names: list[HashableT] | None = None, verify_integrity: bool = False, sort: bool = False, - copy: bool | None = None, + copy: bool | lib.NoDefault = lib.no_default, ) -> DataFrame | Series: """ Concatenate pandas objects along a particular axis. @@ -178,8 +175,8 @@ def concat( Parameters ---------- - objs : a sequence or mapping of Series or DataFrame objects - If a mapping is passed, the sorted keys will be used as the `keys` + objs : an iterable or mapping of Series or DataFrame objects + If a mapping is passed, the keys will be used as the `keys` argument, unless it is passed, in which case the values will be selected (see below). Any None objects will be dropped silently unless they are all None in which case a ValueError will be raised. @@ -205,11 +202,27 @@ def concat( Check whether the new concatenated axis contains duplicates. This can be very expensive relative to the actual data concatenation. sort : bool, default False - Sort non-concatenation axis if it is not already aligned. - - copy : bool, default True + Sort non-concatenation axis. One exception to this is when the + non-concatenation axis is a DatetimeIndex and join='outer' and the axis is + not already aligned. In that case, the non-concatenation axis is always + sorted lexicographically. + copy : bool, default False If False, do not copy data unnecessarily. + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` + + .. deprecated:: 3.0.0 + Returns ------- object, type of objs @@ -238,8 +251,8 @@ def concat( -------- Combine two ``Series``. - >>> s1 = pd.Series(['a', 'b']) - >>> s2 = pd.Series(['c', 'd']) + >>> s1 = pd.Series(["a", "b"]) + >>> s2 = pd.Series(["c", "d"]) >>> pd.concat([s1, s2]) 0 a 1 b @@ -260,7 +273,7 @@ def concat( Add a hierarchical index at the outermost level of the data with the ``keys`` option. - >>> pd.concat([s1, s2], keys=['s1', 's2']) + >>> pd.concat([s1, s2], keys=["s1", "s2"]) s1 0 a 1 b s2 0 c @@ -269,8 +282,7 @@ def concat( Label the index keys you create with the ``names`` option. - >>> pd.concat([s1, s2], keys=['s1', 's2'], - ... names=['Series name', 'Row ID']) + >>> pd.concat([s1, s2], keys=["s1", "s2"], names=["Series name", "Row ID"]) Series name Row ID s1 0 a 1 b @@ -280,14 +292,12 @@ def concat( Combine two ``DataFrame`` objects with identical columns. - >>> df1 = pd.DataFrame([['a', 1], ['b', 2]], - ... columns=['letter', 'number']) + >>> df1 = pd.DataFrame([["a", 1], ["b", 2]], columns=["letter", "number"]) >>> df1 letter number 0 a 1 1 b 2 - >>> df2 = pd.DataFrame([['c', 3], ['d', 4]], - ... columns=['letter', 'number']) + >>> df2 = pd.DataFrame([["c", 3], ["d", 4]], columns=["letter", "number"]) >>> df2 letter number 0 c 3 @@ -303,8 +313,9 @@ def concat( and return everything. Columns outside the intersection will be filled with ``NaN`` values. - >>> df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']], - ... columns=['letter', 'number', 'animal']) + >>> df3 = pd.DataFrame( + ... 
[["c", 3, "cat"], ["d", 4, "dog"]], columns=["letter", "number", "animal"] + ... ) >>> df3 letter number animal 0 c 3 cat @@ -330,8 +341,9 @@ def concat( Combine ``DataFrame`` objects horizontally along the x axis by passing in ``axis=1``. - >>> df4 = pd.DataFrame([['bird', 'polly'], ['monkey', 'george']], - ... columns=['animal', 'name']) + >>> df4 = pd.DataFrame( + ... [["bird", "polly"], ["monkey", "george"]], columns=["animal", "name"] + ... ) >>> pd.concat([df1, df4], axis=1) letter number animal name 0 a 1 bird polly @@ -340,11 +352,11 @@ def concat( Prevent the result from including duplicate index values with the ``verify_integrity`` option. - >>> df5 = pd.DataFrame([1], index=['a']) + >>> df5 = pd.DataFrame([1], index=["a"]) >>> df5 0 a 1 - >>> df6 = pd.DataFrame([2], index=['a']) + >>> df6 = pd.DataFrame([2], index=["a"]) >>> df6 0 a 2 @@ -355,11 +367,11 @@ def concat( Append a single row to the end of a ``DataFrame`` object. - >>> df7 = pd.DataFrame({'a': 1, 'b': 2}, index=[0]) + >>> df7 = pd.DataFrame({"a": 1, "b": 2}, index=[0]) >>> df7 a b 0 1 2 - >>> new_row = pd.Series({'a': 3, 'b': 4}) + >>> new_row = pd.Series({"a": 3, "b": 4}) >>> new_row a 3 b 4 @@ -369,413 +381,458 @@ def concat( 0 1 2 1 3 4 """ - if copy is None: - if using_copy_on_write(): - copy = False - else: - copy = True - elif copy and using_copy_on_write(): - copy = False - - op = _Concatenator( - objs, - axis=axis, - ignore_index=ignore_index, - join=join, - keys=keys, - levels=levels, - names=names, - verify_integrity=verify_integrity, - copy=copy, - sort=sort, - ) + if ignore_index and keys is not None: + raise ValueError( + f"Cannot set {ignore_index=} and specify keys. Either should be used." + ) - return op.get_result() + if copy is not lib.no_default: + warnings.warn( + "The copy keyword is deprecated and will be removed in a future " + "version. Copy-on-Write is active in pandas since 3.0 which utilizes " + "a lazy copy mechanism that defers copies until necessary. Use " + ".copy() to make an eager copy if necessary.", + DeprecationWarning, + stacklevel=find_stack_level(), + ) + if join == "outer": + intersect = False + elif join == "inner": + intersect = True + else: # pragma: no cover + raise ValueError( + "Only can inner (intersect) or outer (union) join the other axis" + ) + if not is_bool(sort): + raise ValueError( + f"The 'sort' keyword only accepts boolean values; {sort} was passed." 
+ ) + sort = bool(sort) -class _Concatenator: - """ - Orchestrates a concatenation operation for BlockManagers - """ + objs, keys, ndims = _clean_keys_and_objs(objs, keys) - sort: bool - - def __init__( - self, - objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame], - axis: Axis = 0, - join: str = "outer", - keys: Iterable[Hashable] | None = None, - levels=None, - names: list[HashableT] | None = None, - ignore_index: bool = False, - verify_integrity: bool = False, - copy: bool = True, - sort: bool = False, - ) -> None: - if isinstance(objs, (ABCSeries, ABCDataFrame, str)): - raise TypeError( - "first argument must be an iterable of pandas " - f'objects, you passed an object of type "{type(objs).__name__}"' - ) + # select an object to be our result reference + sample, objs = _get_sample_object(objs, ndims, keys, names, levels, intersect) - if join == "outer": - self.intersect = False - elif join == "inner": - self.intersect = True - else: # pragma: no cover - raise ValueError( - "Only can inner (intersect) or outer (union) join the other axis" - ) + # Standardize axis parameter to int + if sample.ndim == 1: + from pandas import DataFrame - if not is_bool(sort): - raise ValueError( - f"The 'sort' keyword only accepts boolean values; {sort} was passed." - ) - # Incompatible types in assignment (expression has type "Union[bool, bool_]", - # variable has type "bool") - self.sort = sort # type: ignore[assignment] + bm_axis = DataFrame._get_axis_number(axis) + is_frame = False + is_series = True + else: + bm_axis = sample._get_axis_number(axis) + is_frame = True + is_series = False - self.ignore_index = ignore_index - self.verify_integrity = verify_integrity - self.copy = copy + # Need to flip BlockManager axis in the DataFrame special case + bm_axis = sample._get_block_manager_axis(bm_axis) - objs, keys = self._clean_keys_and_objs(objs, keys) + # if we have mixed ndims, then convert to highest ndim + # creating column numbers as needed + if len(ndims) > 1: + objs = _sanitize_mixed_ndim(objs, sample, ignore_index, bm_axis) - # figure out what our result ndim is going to be - ndims = self._get_ndims(objs) - sample, objs = self._get_sample_object(objs, ndims, keys, names, levels) + axis = 1 - bm_axis if is_frame else 0 + names = names or getattr(keys, "names", None) + return _get_result( + objs, + is_series, + bm_axis, + ignore_index, + intersect, + sort, + keys, + levels, + verify_integrity, + names, + axis, + ) - # Standardize axis parameter to int - if sample.ndim == 1: - from pandas import DataFrame - axis = DataFrame._get_axis_number(axis) - self._is_frame = False - self._is_series = True - else: - axis = sample._get_axis_number(axis) - self._is_frame = True - self._is_series = False - - # Need to flip BlockManager axis in the DataFrame special case - axis = sample._get_block_manager_axis(axis) - - # if we have mixed ndims, then convert to highest ndim - # creating column numbers as needed - if len(ndims) > 1: - objs = self._sanitize_mixed_ndim(objs, sample, ignore_index, axis) - - self.objs = objs - - # note: this is the BlockManager axis (since DataFrame is transposed) - self.bm_axis = axis - self.axis = 1 - self.bm_axis if self._is_frame else 0 - self.keys = keys - self.names = names or getattr(keys, "names", None) - self.levels = levels - - def _get_ndims(self, objs: list[Series | DataFrame]) -> set[int]: - # figure out what our result ndim is going to be - ndims = set() - for obj in objs: - if not isinstance(obj, (ABCSeries, ABCDataFrame)): - msg = ( - f"cannot 
concatenate object of type '{type(obj)}'; " - "only Series and DataFrame objs are valid" - ) - raise TypeError(msg) +def _sanitize_mixed_ndim( + objs: list[Series | DataFrame], + sample: Series | DataFrame, + ignore_index: bool, + axis: AxisInt, +) -> list[Series | DataFrame]: + # if we have mixed ndims, then convert to highest ndim + # creating column numbers as needed - ndims.add(obj.ndim) - return ndims + new_objs = [] - def _clean_keys_and_objs( - self, - objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame], - keys, - ) -> tuple[list[Series | DataFrame], Index | None]: - if isinstance(objs, abc.Mapping): - if keys is None: - keys = list(objs.keys()) - objs_list = [objs[k] for k in keys] - else: - objs_list = list(objs) + current_column = 0 + max_ndim = sample.ndim + for obj in objs: + ndim = obj.ndim + if ndim == max_ndim: + pass - if len(objs_list) == 0: - raise ValueError("No objects to concatenate") + elif ndim != max_ndim - 1: + raise ValueError( + "cannot concatenate unaligned mixed dimensional NDFrame objects" + ) - if keys is None: - objs_list = list(com.not_none(*objs_list)) else: - # GH#1649 - clean_keys = [] - clean_objs = [] - if is_iterator(keys): - keys = list(keys) - if len(keys) != len(objs_list): - # GH#43485 - warnings.warn( - "The behavior of pd.concat with len(keys) != len(objs) is " - "deprecated. In a future version this will raise instead of " - "truncating to the smaller of the two sequences", - FutureWarning, - stacklevel=find_stack_level(), - ) - for k, v in zip(keys, objs_list): - if v is None: - continue - clean_keys.append(k) - clean_objs.append(v) - objs_list = clean_objs - - if isinstance(keys, MultiIndex): - # TODO: retain levels? - keys = type(keys).from_tuples(clean_keys, names=keys.names) + name = getattr(obj, "name", None) + rename_columns = False + if ignore_index or name is None: + if axis == 1: + # doing a row-wise concatenation so need everything + # to line up + if name is None: + name = 0 + rename_columns = True + else: + # doing a column-wise concatenation so need series + # to have unique names + if name is None: + rename_columns = True + name = current_column + current_column += 1 + obj = sample._constructor(obj, copy=False) + if isinstance(obj, ABCDataFrame) and rename_columns: + obj.columns = range(name, name + 1, 1) else: - name = getattr(keys, "name", None) - keys = Index(clean_keys, name=name, dtype=getattr(keys, "dtype", None)) + obj = sample._constructor({name: obj}, copy=False) - if len(objs_list) == 0: - raise ValueError("All objects passed were None") + new_objs.append(obj) - return objs_list, keys + return new_objs - def _get_sample_object( - self, - objs: list[Series | DataFrame], - ndims: set[int], - keys, - names, - levels, - ) -> tuple[Series | DataFrame, list[Series | DataFrame]]: - # get the sample - # want the highest ndim that we have, and must be non-empty - # unless all objs are empty - sample: Series | DataFrame | None = None - if len(ndims) > 1: - max_ndim = max(ndims) - for obj in objs: - if obj.ndim == max_ndim and np.sum(obj.shape): - sample = obj - break - else: - # filter out the empties if we have not multi-index possibilities - # note to keep empty Series as it affect to result columns / name - non_empties = [obj for obj in objs if sum(obj.shape) > 0 or obj.ndim == 1] - - if len(non_empties) and ( - keys is None and names is None and levels is None and not self.intersect - ): - objs = non_empties - sample = objs[0] - - if sample is None: - sample = objs[0] - return sample, objs - - def 
_sanitize_mixed_ndim( - self, - objs: list[Series | DataFrame], - sample: Series | DataFrame, - ignore_index: bool, - axis: AxisInt, - ) -> list[Series | DataFrame]: - # if we have mixed ndims, then convert to highest ndim - # creating column numbers as needed - - new_objs = [] - - current_column = 0 - max_ndim = sample.ndim - for obj in objs: - ndim = obj.ndim - if ndim == max_ndim: - pass +def _get_result( + objs: list[Series | DataFrame], + is_series: bool, + bm_axis: AxisInt, + ignore_index: bool, + intersect: bool, + sort: bool, + keys: Iterable[Hashable] | None, + levels, + verify_integrity: bool, + names: list[HashableT] | None, + axis: AxisInt, +): + cons: Callable[..., DataFrame | Series] + sample: DataFrame | Series - elif ndim != max_ndim - 1: - raise ValueError( - "cannot concatenate unaligned mixed dimensional NDFrame objects" - ) + # series only + if is_series: + sample = cast("Series", objs[0]) - else: - name = getattr(obj, "name", None) - if ignore_index or name is None: - if axis == 1: - # doing a row-wise concatenation so need everything - # to line up - name = 0 - else: - # doing a column-wise concatenation so need series - # to have unique names - name = current_column - current_column += 1 + # stack blocks + if bm_axis == 0: + name = com.consensus_name_attr(objs) + cons = sample._constructor - obj = sample._constructor({name: obj}, copy=False) + arrs = [ser._values for ser in objs] - new_objs.append(obj) + res = concat_compat(arrs, axis=0) - return new_objs + if ignore_index: + new_index: Index = default_index(len(res)) + else: + new_index = _get_concat_axis_series( + objs, + ignore_index, + bm_axis, + keys, + levels, + verify_integrity, + names, + ) - def get_result(self): - cons: Callable[..., DataFrame | Series] - sample: DataFrame | Series + mgr = type(sample._mgr).from_array(res, index=new_index) - # series only - if self._is_series: - sample = cast("Series", self.objs[0]) + result = sample._constructor_from_mgr(mgr, axes=mgr.axes) + result._name = name + return result.__finalize__( + types.SimpleNamespace(objs=objs), method="concat" + ) - # stack blocks - if self.bm_axis == 0: - name = com.consensus_name_attr(self.objs) - cons = sample._constructor + # combine as columns in a frame + else: + data = dict(enumerate(objs)) + + # GH28330 Preserves subclassed objects through concat + cons = sample._constructor_expanddim - arrs = [ser._values for ser in self.objs] + index = get_objs_combined_axis( + objs, + axis=objs[0]._get_block_manager_axis(0), + intersect=intersect, + sort=sort, + ) + columns = _get_concat_axis_series( + objs, ignore_index, bm_axis, keys, levels, verify_integrity, names + ) + df = cons(data, index=index, copy=False) + df.columns = columns + return df.__finalize__(types.SimpleNamespace(objs=objs), method="concat") - res = concat_compat(arrs, axis=0) + # combine block managers + else: + sample = cast("DataFrame", objs[0]) + + mgrs_indexers = [] + result_axes = new_axes( + objs, + bm_axis, + intersect, + sort, + keys, + names, + axis, + levels, + verify_integrity, + ignore_index, + ) + for obj in objs: + indexers = {} + for ax, new_labels in enumerate(result_axes): + # ::-1 to convert BlockManager ax to DataFrame ax + if ax == bm_axis: + # Suppress reindexing on concat axis + continue - new_index: Index - if self.ignore_index: - # We can avoid surprisingly-expensive _get_concat_axis - new_index = default_index(len(res)) - else: - new_index = self.new_axes[0] + # 1-ax to convert BlockManager axis to DataFrame axis + obj_labels = obj.axes[1 - ax] + if 
not new_labels.equals(obj_labels): + indexers[ax] = obj_labels.get_indexer(new_labels) - mgr = type(sample._mgr).from_array(res, index=new_index) + mgrs_indexers.append((obj._mgr, indexers)) - result = sample._constructor_from_mgr(mgr, axes=mgr.axes) - result._name = name - return result.__finalize__(self, method="concat") + new_data = concatenate_managers( + mgrs_indexers, result_axes, concat_axis=bm_axis, copy=False + ) - # combine as columns in a frame + out = sample._constructor_from_mgr(new_data, axes=new_data.axes) + return out.__finalize__(types.SimpleNamespace(objs=objs), method="concat") + + +def new_axes( + objs: list[Series | DataFrame], + bm_axis: AxisInt, + intersect: bool, + sort: bool, + keys: Iterable[Hashable] | None, + names: list[HashableT] | None, + axis: AxisInt, + levels, + verify_integrity: bool, + ignore_index: bool, +) -> list[Index]: + """Return the new [index, column] result for concat.""" + return [ + _get_concat_axis_dataframe( + objs, + axis, + ignore_index, + keys, + names, + levels, + verify_integrity, + ) + if i == bm_axis + else get_objs_combined_axis( + objs, + axis=objs[0]._get_block_manager_axis(i), + intersect=intersect, + sort=sort, + ) + for i in range(2) + ] + + +def _get_concat_axis_series( + objs: list[Series | DataFrame], + ignore_index: bool, + bm_axis: AxisInt, + keys: Iterable[Hashable] | None, + levels, + verify_integrity: bool, + names: list[HashableT] | None, +) -> Index: + """Return result concat axis when concatenating Series objects.""" + if ignore_index: + return default_index(len(objs)) + elif bm_axis == 0: + indexes = [x.index for x in objs] + if keys is None: + if levels is not None: + raise ValueError("levels supported only when keys is not None") + concat_axis = _concat_indexes(indexes) + else: + concat_axis = _make_concat_multiindex(indexes, keys, levels, names) + if verify_integrity and not concat_axis.is_unique: + overlap = concat_axis[concat_axis.duplicated()].unique() + raise ValueError(f"Indexes have overlapping values: {overlap}") + return concat_axis + elif keys is None: + result_names: list[Hashable] = [None] * len(objs) + num = 0 + has_names = False + for i, x in enumerate(objs): + if x.ndim != 1: + raise TypeError( + f"Cannot concatenate type 'Series' with " + f"object of type '{type(x).__name__}'" + ) + if x.name is not None: + result_names[i] = x.name + has_names = True else: - data = dict(zip(range(len(self.objs)), self.objs)) + result_names[i] = num + num += 1 + if has_names: + return Index(result_names) + else: + return default_index(len(objs)) + else: + return ensure_index(keys).set_names(names) # type: ignore[arg-type] + + +def _get_concat_axis_dataframe( + objs: list[Series | DataFrame], + axis: AxisInt, + ignore_index: bool, + keys: Iterable[Hashable] | None, + names: list[HashableT] | None, + levels, + verify_integrity: bool, +) -> Index: + """Return result concat axis when concatenating DataFrame objects.""" + indexes_gen = (x.axes[axis] for x in objs) + + if ignore_index: + return default_index(sum(len(i) for i in indexes_gen)) + else: + indexes = list(indexes_gen) - # GH28330 Preserves subclassed objects through concat - cons = sample._constructor_expanddim + if keys is None: + if levels is not None: + raise ValueError("levels supported only when keys is not None") + concat_axis = _concat_indexes(indexes) + else: + concat_axis = _make_concat_multiindex(indexes, keys, levels, names) - index, columns = self.new_axes - df = cons(data, index=index, copy=self.copy) - df.columns = columns - return 
df.__finalize__(self, method="concat") + if verify_integrity and not concat_axis.is_unique: + overlap = concat_axis[concat_axis.duplicated()].unique() + raise ValueError(f"Indexes have overlapping values: {overlap}") - # combine block managers - else: - sample = cast("DataFrame", self.objs[0]) - - mgrs_indexers = [] - for obj in self.objs: - indexers = {} - for ax, new_labels in enumerate(self.new_axes): - # ::-1 to convert BlockManager ax to DataFrame ax - if ax == self.bm_axis: - # Suppress reindexing on concat axis - continue - - # 1-ax to convert BlockManager axis to DataFrame axis - obj_labels = obj.axes[1 - ax] - if not new_labels.equals(obj_labels): - indexers[ax] = obj_labels.get_indexer(new_labels) - - mgrs_indexers.append((obj._mgr, indexers)) - - new_data = concatenate_managers( - mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy - ) - if not self.copy and not using_copy_on_write(): - new_data._consolidate_inplace() + return concat_axis - out = sample._constructor_from_mgr(new_data, axes=new_data.axes) - return out.__finalize__(self, method="concat") - def _get_result_dim(self) -> int: - if self._is_series and self.bm_axis == 1: - return 2 - else: - return self.objs[0].ndim - - @cache_readonly - def new_axes(self) -> list[Index]: - ndim = self._get_result_dim() - return [ - self._get_concat_axis if i == self.bm_axis else self._get_comb_axis(i) - for i in range(ndim) - ] - - def _get_comb_axis(self, i: AxisInt) -> Index: - data_axis = self.objs[0]._get_block_manager_axis(i) - return get_objs_combined_axis( - self.objs, - axis=data_axis, - intersect=self.intersect, - sort=self.sort, - copy=self.copy, +def _clean_keys_and_objs( + objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame], + keys, +) -> tuple[list[Series | DataFrame], Index | None, set[int]]: + """ + Returns + ------- + clean_objs : list[Series | DataFrame] + List of DataFrame and Series with Nones removed. + keys : Index | None + None if keys was None + Index if objs was a Mapping or keys was not None. Filtered where objs was None. + ndim : set[int] + Unique .ndim attribute of obj encountered. + """ + if isinstance(objs, abc.Mapping): + if keys is None: + keys = objs.keys() + objs = [objs[k] for k in keys] + elif isinstance(objs, (ABCSeries, ABCDataFrame)) or is_scalar(objs): + raise TypeError( + "first argument must be an iterable of pandas " + f'objects, you passed an object of type "{type(objs).__name__}"' ) + elif not isinstance(objs, abc.Sized): + objs = list(objs) - @cache_readonly - def _get_concat_axis(self) -> Index: - """ - Return index to be used along concatenation axis. 
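For orientation, a minimal sketch with hypothetical data of the user-facing semantics the new concat-axis helpers above preserve: ``keys`` still builds a MultiIndex along the concatenation axis, and ``verify_integrity`` still rejects overlapping labels.

import pandas as pd

s1 = pd.Series([1, 2], index=["a", "b"])
s2 = pd.Series([3, 4], index=["b", "c"])

# keys -> MultiIndex on the concat axis (_make_concat_multiindex path)
pd.concat([s1, s2], keys=["x", "y"]).index
# MultiIndex([('x', 'a'), ('x', 'b'), ('y', 'b'), ('y', 'c')])

# the duplicate check now lives in _get_concat_axis_series/_dataframe
pd.concat([s1, s2], verify_integrity=True)
# ValueError: Indexes have overlapping values: Index(['b'], dtype='object')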
- """ - if self._is_series: - if self.bm_axis == 0: - indexes = [x.index for x in self.objs] - elif self.ignore_index: - idx = default_index(len(self.objs)) - return idx - elif self.keys is None: - names: list[Hashable] = [None] * len(self.objs) - num = 0 - has_names = False - for i, x in enumerate(self.objs): - if x.ndim != 1: - raise TypeError( - f"Cannot concatenate type 'Series' with " - f"object of type '{type(x).__name__}'" - ) - if x.name is not None: - names[i] = x.name - has_names = True - else: - names[i] = num - num += 1 - if has_names: - return Index(names) - else: - return default_index(len(self.objs)) - else: - return ensure_index(self.keys).set_names(self.names) - else: - indexes = [x.axes[self.axis] for x in self.objs] + if len(objs) == 0: + raise ValueError("No objects to concatenate") - if self.ignore_index: - idx = default_index(sum(len(i) for i in indexes)) - return idx + if keys is not None: + if not isinstance(keys, Index): + keys = Index(keys) + if len(keys) != len(objs): + # GH#43485 + raise ValueError( + f"The length of the keys ({len(keys)}) must match " + f"the length of the objects to concatenate ({len(objs)})" + ) - if self.keys is None: - if self.levels is not None: - raise ValueError("levels supported only when keys is not None") - concat_axis = _concat_indexes(indexes) + # GH#1649 + key_indices = [] + clean_objs = [] + ndims = set() + for i, obj in enumerate(objs): + if obj is None: + continue + elif isinstance(obj, (ABCSeries, ABCDataFrame)): + key_indices.append(i) + clean_objs.append(obj) + ndims.add(obj.ndim) else: - concat_axis = _make_concat_multiindex( - indexes, self.keys, self.levels, self.names + msg = ( + f"cannot concatenate object of type '{type(obj)}'; " + "only Series and DataFrame objs are valid" ) + raise TypeError(msg) + + if keys is not None and len(key_indices) < len(keys): + keys = keys.take(key_indices) + + if len(clean_objs) == 0: + raise ValueError("All objects passed were None") + + return clean_objs, keys, ndims + + +def _get_sample_object( + objs: list[Series | DataFrame], + ndims: set[int], + keys, + names, + levels, + intersect: bool, +) -> tuple[Series | DataFrame, list[Series | DataFrame]]: + # get the sample + # want the highest ndim that we have, and must be non-empty + # unless all objs are empty + if len(ndims) > 1: + max_ndim = max(ndims) + for obj in objs: + if obj.ndim == max_ndim and sum(obj.shape): # type: ignore[arg-type] + return obj, objs + elif keys is None and names is None and levels is None and not intersect: + # filter out the empties if we have not multi-index possibilities + # note to keep empty Series as it affect to result columns / name + if ndims.pop() == 2: + non_empties = [obj for obj in objs if sum(obj.shape)] + else: + non_empties = objs - self._maybe_check_integrity(concat_axis) + if len(non_empties): + return non_empties[0], non_empties - return concat_axis - - def _maybe_check_integrity(self, concat_index: Index): - if self.verify_integrity: - if not concat_index.is_unique: - overlap = concat_index[concat_index.duplicated()].unique() - raise ValueError(f"Indexes have overlapping values: {overlap}") + return objs[0], objs def _concat_indexes(indexes) -> Index: return indexes[0].append(indexes[1:]) +def validate_unique_levels(levels: list[Index]) -> None: + for level in levels: + if not level.is_unique: + raise ValueError(f"Level values not unique: {level.tolist()}") + + def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiIndex: if (levels is None and isinstance(keys[0], 
tuple)) or ( levels is not None and len(levels) > 1 @@ -788,6 +845,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde _, levels = factorize_from_iterables(zipped) else: levels = [ensure_index(x) for x in levels] + validate_unique_levels(levels) else: zipped = [keys] if names is None: @@ -797,12 +855,9 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde levels = [ensure_index(keys).unique()] else: levels = [ensure_index(x) for x in levels] + validate_unique_levels(levels) - for level in levels: - if not level.is_unique: - raise ValueError(f"Level values not unique: {level.tolist()}") - - if not all_indexes_same(indexes) or not all(level.is_unique for level in levels): + if not all_indexes_same(indexes): codes_list = [] # things are potentially different sizes, so compute the exact codes @@ -878,7 +933,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde if isinstance(new_index, MultiIndex): new_levels.extend(new_index.levels) - new_codes.extend([np.tile(lab, kpieces) for lab in new_index.codes]) + new_codes.extend(np.tile(lab, kpieces) for lab in new_index.codes) else: new_levels.append(new_index.unique()) single_codes = new_index.unique().get_indexer(new_index) diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index 3ed67bb7b7c02..ad4a5db441b89 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -6,13 +6,11 @@ Iterable, ) import itertools -from typing import ( - TYPE_CHECKING, - cast, -) +from typing import TYPE_CHECKING import numpy as np +from pandas._libs import missing as libmissing from pandas._libs.sparse import IntIndex from pandas.core.dtypes.common import ( @@ -62,15 +60,18 @@ def get_dummies( data : array-like, Series, or DataFrame Data of which to get dummy indicators. prefix : str, list of str, or dict of str, default None - String to append DataFrame column names. + A string to be prepended to DataFrame column names. Pass a list with length equal to the number of columns when calling get_dummies on a DataFrame. Alternatively, `prefix` can be a dictionary mapping column names to prefixes. - prefix_sep : str, default '_' - If appending prefix, separator/delimiter to use. Or pass a - list or dictionary as with `prefix`. + prefix_sep : str, list of str, or dict of str, default '_' + Should you choose to prepend DataFrame column names with a prefix, this + is the separator/delimiter to use between the two. Alternatively, + `prefix_sep` can be a list with length equal to the number of columns, + or a dictionary mapping column names to separators. dummy_na : bool, default False - Add a column to indicate NaNs, if False NaNs are ignored. + If True, a NaN indicator column will be added even if no NaN values are present. + If False, NA values are encoded as all zero. columns : list-like, default None Column names in the DataFrame to be encoded. If `columns` is None then all the columns with @@ -101,7 +102,7 @@ def get_dummies( Examples -------- - >>> s = pd.Series(list('abca')) + >>> s = pd.Series(list("abca")) >>> pd.get_dummies(s) a b c @@ -110,7 +111,7 @@ def get_dummies( 2 False False True 3 True False False - >>> s1 = ['a', 'b', np.nan] + >>> s1 = ["a", "b", np.nan] >>> pd.get_dummies(s1) a b @@ -124,16 +125,15 @@ def get_dummies( 1 False True False 2 False False True - >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'], - ... 
'C': [1, 2, 3]}) + >>> df = pd.DataFrame({"A": ["a", "b", "a"], "B": ["b", "a", "c"], "C": [1, 2, 3]}) - >>> pd.get_dummies(df, prefix=['col1', 'col2']) + >>> pd.get_dummies(df, prefix=["col1", "col2"]) C col1_a col1_b col2_a col2_b col2_c 0 1 True False False True False 1 2 False True True False False 2 3 True False False False True - >>> pd.get_dummies(pd.Series(list('abcaa'))) + >>> pd.get_dummies(pd.Series(list("abcaa"))) a b c 0 True False False 1 False True False @@ -141,7 +141,7 @@ def get_dummies( 3 True False False 4 True False False - >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True) + >>> pd.get_dummies(pd.Series(list("abcaa")), drop_first=True) b c 0 False False 1 True False @@ -149,7 +149,7 @@ def get_dummies( 3 False False 4 False False - >>> pd.get_dummies(pd.Series(list('abc')), dtype=float) + >>> pd.get_dummies(pd.Series(list("abc")), dtype=float) a b c 0 1.0 0.0 0.0 1 0.0 1.0 0.0 @@ -169,7 +169,7 @@ def get_dummies( data_to_encode = data[columns] # validate prefixes and separator to avoid silently dropping cols - def check_len(item, name: str): + def check_len(item, name: str) -> None: if is_list_like(item): if not len(item) == data_to_encode.shape[1]: len_msg = ( @@ -260,7 +260,7 @@ def _get_dummies_1d( dtype = ArrowDtype(pa.bool_()) # type: ignore[assignment] elif ( isinstance(input_dtype, StringDtype) - and input_dtype.storage != "pyarrow_numpy" + and input_dtype.na_value is libmissing.NA ): dtype = pandas_dtype("boolean") # type: ignore[assignment] else: @@ -340,7 +340,7 @@ def get_empty_frame(data) -> DataFrame: ) sparse_series.append(Series(data=sarr, index=index, name=col, copy=False)) - return concat(sparse_series, axis=1, copy=False) + return concat(sparse_series, axis=1) else: # ensure ndarray layout is column-major @@ -359,7 +359,7 @@ def get_empty_frame(data) -> DataFrame: if drop_first: # remove first GH12042 - dummy_mat = dummy_mat[:, 1:] + dummy_mat = dummy_mat[:, 1:] # type: ignore[assignment] dummy_cols = dummy_cols[1:] return DataFrame(dummy_mat, index=index, columns=dummy_cols, dtype=_dtype) @@ -426,8 +426,7 @@ def from_dummies( Examples -------- - >>> df = pd.DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], - ... "c": [0, 0, 1, 0]}) + >>> df = pd.DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]}) >>> df a b c @@ -442,9 +441,15 @@ def from_dummies( 2 c 3 a - >>> df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0], - ... "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], - ... "col2_c": [0, 0, 1]}) + >>> df = pd.DataFrame( + ... { + ... "col1_a": [1, 0, 1], + ... "col1_b": [0, 1, 0], + ... "col2_a": [0, 1, 0], + ... "col2_b": [1, 0, 0], + ... "col2_c": [0, 0, 1], + ... } + ... ) >>> df col1_a col1_b col2_a col2_b col2_c @@ -458,9 +463,15 @@ def from_dummies( 1 b a 2 a c - >>> df = pd.DataFrame({"col1_a": [1, 0, 0], "col1_b": [0, 1, 0], - ... "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], - ... "col2_c": [0, 0, 0]}) + >>> df = pd.DataFrame( + ... { + ... "col1_a": [1, 0, 0], + ... "col1_b": [0, 1, 0], + ... "col2_a": [0, 1, 0], + ... "col2_b": [1, 0, 0], + ... "col2_c": [0, 0, 0], + ... } + ... 
) >>> df col1_a col1_b col2_a col2_b col2_c @@ -482,19 +493,18 @@ def from_dummies( f"Received 'data' of type: {type(data).__name__}" ) - col_isna_mask = cast(Series, data.isna().any()) + col_isna_mask = data.isna().any() if col_isna_mask.any(): raise ValueError( - "Dummy DataFrame contains NA value in column: " - f"'{col_isna_mask.idxmax()}'" + f"Dummy DataFrame contains NA value in column: '{col_isna_mask.idxmax()}'" ) # index data with a list of all columns that are dummies try: - data_to_decode = data.astype("boolean", copy=False) - except TypeError: - raise TypeError("Passed DataFrame contains non-dummy data") + data_to_decode = data.astype("boolean") + except TypeError as err: + raise TypeError("Passed DataFrame contains non-dummy data") from err # collect prefixes and get lists to slice data for each prefix variables_slice = defaultdict(list) diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index bb1cd0d738dac..f4cb82816bbcf 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -5,17 +5,16 @@ import numpy as np -from pandas.util._decorators import Appender - -from pandas.core.dtypes.common import is_list_like +from pandas.core.dtypes.common import ( + is_iterator, + is_list_like, +) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.missing import notna import pandas.core.algorithms as algos from pandas.core.indexes.api import MultiIndex from pandas.core.reshape.concat import concat -from pandas.core.reshape.util import tile_compat -from pandas.core.shared_docs import _shared_docs from pandas.core.tools.numeric import to_numeric if TYPE_CHECKING: @@ -40,7 +39,6 @@ def ensure_list_vars(arg_vars, variable: str, columns) -> list: return [] -@Appender(_shared_docs["melt"] % {"caller": "pd.melt(df, ", "other": "DataFrame.melt"}) def melt( frame: DataFrame, id_vars=None, @@ -50,6 +48,131 @@ def melt( col_level=None, ignore_index: bool = True, ) -> DataFrame: + """ + Unpivot a DataFrame from wide to long format, optionally leaving identifiers set. + + This function is useful to reshape a DataFrame into a format where one + or more columns are identifier variables (`id_vars`), while all other + columns are considered measured variables (`value_vars`), and are "unpivoted" to + the row axis, leaving just two non-identifier columns, 'variable' and + 'value'. + + Parameters + ---------- + frame : DataFrame + The DataFrame to unpivot. + id_vars : scalar, tuple, list, or ndarray, optional + Column(s) to use as identifier variables. + value_vars : scalar, tuple, list, or ndarray, optional + Column(s) to unpivot. If not specified, uses all columns that + are not set as `id_vars`. + var_name : scalar, tuple, list, or ndarray, optional + Name to use for the 'variable' column. If None it uses + ``frame.columns.name`` or 'variable'. Must be a scalar if columns are a + MultiIndex. + value_name : scalar, default 'value' + Name to use for the 'value' column, can't be an existing column label. + col_level : scalar, optional + If columns are a MultiIndex then use this level to melt. + ignore_index : bool, default True + If True, original index is ignored. If False, the original index is retained. + Index labels will be repeated as necessary. + + Returns + ------- + DataFrame + Unpivoted DataFrame. + + See Also + -------- + DataFrame.melt : Identical method. + pivot_table : Create a spreadsheet-style pivot table as a DataFrame. + DataFrame.pivot : Return reshaped DataFrame organized + by given index / column values. 
+ DataFrame.explode : Explode a DataFrame from list-like + columns to long format. + + Notes + ----- + Reference :ref:`the user guide <reshaping.melt>` for more examples. + + Examples + -------- + >>> df = pd.DataFrame( + ... { + ... "A": {0: "a", 1: "b", 2: "c"}, + ... "B": {0: 1, 1: 3, 2: 5}, + ... "C": {0: 2, 1: 4, 2: 6}, + ... } + ... ) + >>> df + A B C + 0 a 1 2 + 1 b 3 4 + 2 c 5 6 + + >>> pd.melt(df, id_vars=["A"], value_vars=["B"]) + A variable value + 0 a B 1 + 1 b B 3 + 2 c B 5 + + >>> pd.melt(df, id_vars=["A"], value_vars=["B", "C"]) + A variable value + 0 a B 1 + 1 b B 3 + 2 c B 5 + 3 a C 2 + 4 b C 4 + 5 c C 6 + + The names of 'variable' and 'value' columns can be customized: + + >>> pd.melt( + ... df, + ... id_vars=["A"], + ... value_vars=["B"], + ... var_name="myVarname", + ... value_name="myValname", + ... ) + A myVarname myValname + 0 a B 1 + 1 b B 3 + 2 c B 5 + + Original index values can be kept around: + + >>> pd.melt(df, id_vars=["A"], value_vars=["B", "C"], ignore_index=False) + A variable value + 0 a B 1 + 1 b B 3 + 2 c B 5 + 0 a C 2 + 1 b C 4 + 2 c C 6 + + If you have multi-index columns: + + >>> df.columns = [list("ABC"), list("DEF")] + >>> df + A B C + D E F + 0 a 1 2 + 1 b 3 4 + 2 c 5 6 + + >>> pd.melt(df, col_level=0, id_vars=["A"], value_vars=["B"]) + A variable value + 0 a B 1 + 1 b B 3 + 2 c B 5 + + >>> pd.melt(df, id_vars=[("A", "D")], value_vars=[("B", "E")]) + (A, D) variable_0 variable_1 value + 0 a B E 1 + 1 b B E 3 + 2 c B E 5 + """ if value_name in frame.columns: raise ValueError( f"value_name ({value_name}) cannot match an element in " @@ -78,9 +201,9 @@ def melt( if value_vars_was_not_none: frame = frame.iloc[:, algos.unique(idx)] else: - frame = frame.copy() + frame = frame.copy(deep=False) else: - frame = frame.copy() + frame = frame.copy(deep=False) if col_level is not None: # allow list or other? # frame is a copy @@ -97,7 +220,16 @@ def melt( frame.columns.name if frame.columns.name is not None else "variable" ] elif is_list_like(var_name): - raise ValueError(f"{var_name=} must be a scalar.") + if isinstance(frame.columns, MultiIndex): + if is_iterator(var_name): + var_name = list(var_name) + if len(var_name) > len(frame.columns): + raise ValueError( + f"{var_name=} has {len(var_name)} items, " + f"but the dataframe columns only have {len(frame.columns)} levels." + ) + else: + raise ValueError(f"{var_name=} must be a scalar.") else: var_name = [var_name] @@ -123,7 +255,7 @@ def melt( not isinstance(dt, np.dtype) and dt._supports_2d for dt in frame.dtypes ): mdata[value_name] = concat( - [frame.iloc[:, i] for i in range(frame.shape[1])] + [frame.iloc[:, i] for i in range(frame.shape[1])], ignore_index=True ).values else: mdata[value_name] = frame._values.ravel("F") @@ -133,7 +265,8 @@ def melt( result = frame._constructor(mdata, columns=mcolumns) if not ignore_index: - result.index = tile_compat(frame.index, num_cols_adjusted) + taker = np.tile(np.arange(len(frame)), num_cols_adjusted) + result.index = frame.index.take(taker) return result @@ -176,15 +309,21 @@ def lreshape(data: DataFrame, groups: dict, dropna: bool = True) -> DataFrame: Examples -------- - >>> data = pd.DataFrame({'hr1': [514, 573], 'hr2': [545, 526], - ... 'team': ['Red Sox', 'Yankees'], - ... 'year1': [2007, 2007], 'year2': [2008, 2008]}) + >>> data = pd.DataFrame( + ... { + ... "hr1": [514, 573], + ... "hr2": [545, 526], + ... "team": ["Red Sox", "Yankees"], + ... "year1": [2007, 2007], + ... "year2": [2008, 2008], + ... } + ...
) >>> data hr1 hr2 team year1 year2 0 514 545 Red Sox 2007 2008 1 573 526 Yankees 2007 2008 - >>> pd.lreshape(data, {'year': ['year1', 'year2'], 'hr': ['hr1', 'hr2']}) + >>> pd.lreshape(data, {"year": ["year1", "year2"], "hr": ["hr1", "hr2"]}) team year hr 0 Red Sox 2007 514 1 Yankees 2007 573 @@ -290,12 +429,15 @@ def wide_to_long( Examples -------- >>> np.random.seed(123) - >>> df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"}, - ... "A1980" : {0 : "d", 1 : "e", 2 : "f"}, - ... "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7}, - ... "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1}, - ... "X" : dict(zip(range(3), np.random.randn(3))) - ... }) + >>> df = pd.DataFrame( + ... { + ... "A1970": {0: "a", 1: "b", 2: "c"}, + ... "A1980": {0: "d", 1: "e", 2: "f"}, + ... "B1970": {0: 2.5, 1: 1.2, 2: 0.7}, + ... "B1980": {0: 3.2, 1: 1.3, 2: 0.1}, + ... "X": dict(zip(range(3), np.random.randn(3))), + ... } + ... ) >>> df["id"] = df.index >>> df A1970 A1980 B1970 B1980 X id @@ -315,12 +457,14 @@ def wide_to_long( With multiple id columns - >>> df = pd.DataFrame({ - ... 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3], - ... 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3], - ... 'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1], - ... 'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9] - ... }) + >>> df = pd.DataFrame( + ... { + ... "famid": [1, 1, 1, 2, 2, 2, 3, 3, 3], + ... "birth": [1, 2, 3, 1, 2, 3, 1, 2, 3], + ... "ht1": [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1], + ... "ht2": [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9], + ... } + ... ) >>> df famid birth ht1 ht2 0 1 1 2.8 3.4 @@ -332,8 +476,8 @@ def wide_to_long( 6 3 1 2.2 3.3 7 3 2 2.3 3.4 8 3 3 2.1 2.9 - >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age') - >>> l + >>> long_format = pd.wide_to_long(df, stubnames="ht", i=["famid", "birth"], j="age") + >>> long_format ... # doctest: +NORMALIZE_WHITESPACE ht famid birth age @@ -358,9 +502,9 @@ def wide_to_long( Going from long back to wide just takes some creative use of `unstack` - >>> w = l.unstack() - >>> w.columns = w.columns.map('{0[0]}{0[1]}'.format) - >>> w.reset_index() + >>> wide_format = long_format.unstack() + >>> wide_format.columns = wide_format.columns.map("{0[0]}{0[1]}".format) + >>> wide_format.reset_index() famid birth ht1 ht2 0 1 1 2.8 3.4 1 1 2 2.9 3.8 @@ -375,20 +519,23 @@ def wide_to_long( Less wieldy column names are also handled >>> np.random.seed(0) - >>> df = pd.DataFrame({'A(weekly)-2010': np.random.rand(3), - ... 'A(weekly)-2011': np.random.rand(3), - ... 'B(weekly)-2010': np.random.rand(3), - ... 'B(weekly)-2011': np.random.rand(3), - ... 'X' : np.random.randint(3, size=3)}) - >>> df['id'] = df.index - >>> df # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS + >>> df = pd.DataFrame( + ... { + ... "A(weekly)-2010": np.random.rand(3), + ... "A(weekly)-2011": np.random.rand(3), + ... "B(weekly)-2010": np.random.rand(3), + ... "B(weekly)-2011": np.random.rand(3), + ... "X": np.random.randint(3, size=3), + ... } + ... ) + >>> df["id"] = df.index + >>> df # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS A(weekly)-2010 A(weekly)-2011 B(weekly)-2010 B(weekly)-2011 X id 0 0.548814 0.544883 0.437587 0.383442 0 0 1 0.715189 0.423655 0.891773 0.791725 1 1 2 0.602763 0.645894 0.963663 0.528895 1 2 - >>> pd.wide_to_long(df, ['A(weekly)', 'B(weekly)'], i='id', - ... j='year', sep='-') + >>> pd.wide_to_long(df, ["A(weekly)", "B(weekly)"], i="id", j="year", sep="-") ... 
# doctest: +NORMALIZE_WHITESPACE X A(weekly) B(weekly) id year @@ -403,8 +550,13 @@ def wide_to_long( stubnames and pass that list on to wide_to_long >>> stubnames = sorted( - ... set([match[0] for match in df.columns.str.findall( - ... r'[A-B]\(.*\)').values if match != []]) + ... set( + ... [ + ... match[0] + ... for match in df.columns.str.findall(r"[A-B]\(.*\)").values + ... if match != [] + ... ] + ... ) ... ) >>> list(stubnames) ['A(weekly)', 'B(weekly)'] @@ -412,12 +564,14 @@ def wide_to_long( All of the above examples have integers as suffixes. It is possible to have non-integers as suffixes. - >>> df = pd.DataFrame({ - ... 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3], - ... 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3], - ... 'ht_one': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1], - ... 'ht_two': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9] - ... }) + >>> df = pd.DataFrame( + ... { + ... "famid": [1, 1, 1, 2, 2, 2, 3, 3, 3], + ... "birth": [1, 2, 3, 1, 2, 3, 1, 2, 3], + ... "ht_one": [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1], + ... "ht_two": [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9], + ... } + ... ) >>> df famid birth ht_one ht_two 0 1 1 2.8 3.4 @@ -430,9 +584,10 @@ def wide_to_long( 7 3 2 2.3 3.4 8 3 3 2.1 2.9 - >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age', - ... sep='_', suffix=r'\w+') - >>> l + >>> long_format = pd.wide_to_long( + ... df, stubnames="ht", i=["famid", "birth"], j="age", sep="_", suffix=r"\w+" + ... ) + >>> long_format ... # doctest: +NORMALIZE_WHITESPACE ht famid birth age @@ -458,8 +613,7 @@ def wide_to_long( def get_var_names(df, stub: str, sep: str, suffix: str): regex = rf"^{re.escape(stub)}{re.escape(sep)}{suffix}$" - pattern = re.compile(regex) - return df.columns[df.columns.str.match(pattern)] + return df.columns[df.columns.str.match(regex)] def melt_stub(df, stub: str, i, j, value_vars, sep: str): newdf = melt( diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 690e3c2700c6c..68d61da0cf7dd 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1,6 +1,7 @@ """ SQL-style merge routines """ + from __future__ import annotations from collections.abc import ( @@ -39,9 +40,8 @@ ) from pandas.errors import MergeError from pandas.util._decorators import ( - Appender, - Substitution, cache_readonly, + set_module, ) from pandas.util._exceptions import find_stack_level @@ -94,7 +94,6 @@ ensure_wrapped_if_datetimelike, extract_array, ) -from pandas.core.frame import _merge_doc from pandas.core.indexes.api import default_index from pandas.core.sorting import ( get_group_index, @@ -127,13 +126,22 @@ # See https://github.com/pandas-dev/pandas/issues/52451 if np.intc is not np.int32: - _factorizers[np.intc] = libhashtable.Int64Factorizer + if np.dtype(np.intc).itemsize == 4: + _factorizers[np.intc] = libhashtable.Int32Factorizer + else: + _factorizers[np.intc] = libhashtable.Int64Factorizer + +if np.uintc is not np.uint32: + if np.dtype(np.uintc).itemsize == 4: + _factorizers[np.uintc] = libhashtable.UInt32Factorizer + else: + _factorizers[np.uintc] = libhashtable.UInt64Factorizer + _known = (np.ndarray, ExtensionArray, Index, ABCSeries) -@Substitution("\nleft : DataFrame or named Series") -@Appender(_merge_doc, indents=0) +@set_module("pandas") def merge( left: DataFrame | Series, right: DataFrame | Series, @@ -145,11 +153,221 @@ def merge( right_index: bool = False, sort: bool = False, suffixes: Suffixes = ("_x", "_y"), - copy: bool | None = None, + copy: bool | lib.NoDefault = lib.no_default, 
indicator: str | bool = False, validate: str | None = None, ) -> DataFrame: + """ + Merge DataFrame or named Series objects with a database-style join. + + A named Series object is treated as a DataFrame with a single named column. + + The join is done on columns or indexes. If joining columns on + columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes + on indexes or indexes on a column or columns, the index will be passed on. + When performing a cross merge, no column specifications to merge on are + allowed. + + .. warning:: + + If both key columns contain rows where the key is a null value, those + rows will be matched against each other. This is different from usual SQL + join behaviour and can lead to unexpected results. + + Parameters + ---------- + left : DataFrame or named Series + First pandas object to merge. + right : DataFrame or named Series + Second pandas object to merge. + how : {'left', 'right', 'outer', 'inner', 'cross', 'left_anti', 'right_anti'}, + default 'inner' + Type of merge to be performed. + + * left: use only keys from left frame, similar to a SQL left outer join; + preserve key order. + * right: use only keys from right frame, similar to a SQL right outer join; + preserve key order. + * outer: use union of keys from both frames, similar to a SQL full outer + join; sort keys lexicographically. + * inner: use intersection of keys from both frames, similar to a SQL inner + join; preserve the order of the left keys. + * cross: creates the cartesian product from both frames, preserves the order + of the left keys. + * left_anti: use only keys from left frame that are not in right frame, similar + to SQL left anti join; preserve key order. + * right_anti: use only keys from right frame that are not in left frame, similar + to SQL right anti join; preserve key order. + on : label or list + Column or index level names to join on. These must be found in both + DataFrames. If `on` is None and not merging on indexes then this defaults + to the intersection of the columns in both DataFrames. + left_on : label or list, or array-like + Column or index level names to join on in the left DataFrame. Can also + be an array or list of arrays of the length of the left DataFrame. + These arrays are treated as if they are columns. + right_on : label or list, or array-like + Column or index level names to join on in the right DataFrame. Can also + be an array or list of arrays of the length of the right DataFrame. + These arrays are treated as if they are columns. + left_index : bool, default False + Use the index from the left DataFrame as the join key(s). If it is a + MultiIndex, the number of keys in the other DataFrame (either the index + or a number of columns) must match the number of levels. + right_index : bool, default False + Use the index from the right DataFrame as the join key. Same caveats as + left_index. + sort : bool, default False + Sort the join keys lexicographically in the result DataFrame. If False, + the order of the join keys depends on the join type (how keyword). + suffixes : list-like, default is ("_x", "_y") + A length-2 sequence where each element is optionally a string + indicating the suffix to add to overlapping column names in + `left` and `right` respectively. Pass a value of `None` instead + of a string to indicate that the column name from `left` or + `right` should be left as-is, with no suffix. At least one of the + values must not be None. + copy : bool, default False + If False, avoid copy if possible. + + ..
note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` + + .. deprecated:: 3.0.0 + indicator : bool or str, default False + If True, adds a column to the output DataFrame called "_merge" with + information on the source of each row. The column can be given a different + name by providing a string argument. The column will have a Categorical + type with the value of "left_only" for observations whose merge key only + appears in the left DataFrame, "right_only" for observations + whose merge key only appears in the right DataFrame, and "both" + if the observation's merge key is found in both DataFrames. + + validate : str, optional + If specified, checks if merge is of specified type. + + * "one_to_one" or "1:1": check if merge keys are unique in both + left and right datasets. + * "one_to_many" or "1:m": check if merge keys are unique in left + dataset. + * "many_to_one" or "m:1": check if merge keys are unique in right + dataset. + * "many_to_many" or "m:m": allowed, but does not result in checks. + + Returns + ------- + DataFrame + A DataFrame of the two merged objects. + + See Also + -------- + merge_ordered : Merge with optional filling/interpolation. + merge_asof : Merge on nearest keys. + DataFrame.join : Similar method using indices. + + Examples + -------- + >>> df1 = pd.DataFrame( + ... {"lkey": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 5]} + ... ) + >>> df2 = pd.DataFrame( + ... {"rkey": ["foo", "bar", "baz", "foo"], "value": [5, 6, 7, 8]} + ... ) + >>> df1 + lkey value + 0 foo 1 + 1 bar 2 + 2 baz 3 + 3 foo 5 + >>> df2 + rkey value + 0 foo 5 + 1 bar 6 + 2 baz 7 + 3 foo 8 + + Merge df1 and df2 on the lkey and rkey columns. The value columns have + the default suffixes, _x and _y, appended. + + >>> df1.merge(df2, left_on="lkey", right_on="rkey") + lkey value_x rkey value_y + 0 foo 1 foo 5 + 1 foo 1 foo 8 + 2 bar 2 bar 6 + 3 baz 3 baz 7 + 4 foo 5 foo 5 + 5 foo 5 foo 8 + + Merge DataFrames df1 and df2 with specified left and right suffixes + appended to any overlapping columns. + + >>> df1.merge(df2, left_on="lkey", right_on="rkey", suffixes=("_left", "_right")) + lkey value_left rkey value_right + 0 foo 1 foo 5 + 1 foo 1 foo 8 + 2 bar 2 bar 6 + 3 baz 3 baz 7 + 4 foo 5 foo 5 + 5 foo 5 foo 8 + + Merge DataFrames df1 and df2, but raise an exception if the DataFrames have + any overlapping columns. + + >>> df1.merge(df2, left_on="lkey", right_on="rkey", suffixes=(False, False)) + Traceback (most recent call last): + ...
+ ValueError: columns overlap but no suffix specified: + Index(['value'], dtype='object') + + >>> df1 = pd.DataFrame({"a": ["foo", "bar"], "b": [1, 2]}) + >>> df2 = pd.DataFrame({"a": ["foo", "baz"], "c": [3, 4]}) + >>> df1 + a b + 0 foo 1 + 1 bar 2 + >>> df2 + a c + 0 foo 3 + 1 baz 4 + + >>> df1.merge(df2, how="inner", on="a") + a b c + 0 foo 1 3 + + >>> df1.merge(df2, how="left", on="a") + a b c + 0 foo 1 3.0 + 1 bar 2 NaN + + >>> df1 = pd.DataFrame({"left": ["foo", "bar"]}) + >>> df2 = pd.DataFrame({"right": [7, 8]}) + >>> df1 + left + 0 foo + 1 bar + >>> df2 + right + 0 7 + 1 8 + + >>> df1.merge(df2, how="cross") + left right + 0 foo 7 + 1 foo 8 + 2 bar 7 + 3 bar 8 + """ left_df = _validate_operand(left) + left._check_copy_deprecation(copy) right_df = _validate_operand(right) if how == "cross": return _cross_merge( @@ -164,7 +382,6 @@ def merge( suffixes=suffixes, indicator=indicator, validate=validate, - copy=copy, ) else: op = _MergeOperation( @@ -181,7 +398,7 @@ def merge( indicator=indicator, validate=validate, ) - return op.get_result(copy=copy) + return op.get_result() def _cross_merge( @@ -194,7 +411,6 @@ def _cross_merge( right_index: bool = False, sort: bool = False, suffixes: Suffixes = ("_x", "_y"), - copy: bool | None = None, indicator: str | bool = False, validate: str | None = None, ) -> DataFrame: @@ -233,7 +449,6 @@ def _cross_merge( suffixes=suffixes, indicator=indicator, validate=validate, - copy=copy, ) del res[cross_col] return res @@ -264,7 +479,7 @@ def _groupby_and_merge( if all(item in right.columns for item in by): rby = right.groupby(by, sort=False) - for key, lhs in lby._grouper.get_iterator(lby._selected_obj, axis=lby.axis): + for key, lhs in lby._grouper.get_iterator(lby._selected_obj): if rby is None: rhs = right else: @@ -292,10 +507,11 @@ def _groupby_and_merge( from pandas.core.reshape.concat import concat result = concat(pieces, ignore_index=True) - result = result.reindex(columns=pieces[0].columns, copy=False) + result = result.reindex(columns=pieces[0].columns) return result, lby +@set_module("pandas") def merge_ordered( left: DataFrame | Series, right: DataFrame | Series, @@ -317,7 +533,9 @@ def merge_ordered( Parameters ---------- left : DataFrame or named Series + First pandas object to merge. right : DataFrame or named Series + Second pandas object to merge. on : label or list Field names to join on. Must be found in both DataFrames. left_on : label or list, or array-like @@ -367,7 +585,7 @@ def merge_ordered( ... { ... "key": ["a", "c", "e", "a", "c", "e"], ... "lvalue": [1, 2, 3, 1, 2, 3], - ... "group": ["a", "a", "a", "b", "b", "b"] + ... "group": ["a", "a", "a", "b", "b", "b"], ... } ... ) >>> df1 @@ -437,6 +655,7 @@ def _merger(x, y) -> DataFrame: return result +@set_module("pandas") def merge_asof( left: DataFrame | Series, right: DataFrame | Series, @@ -449,7 +668,7 @@ def merge_asof( left_by=None, right_by=None, suffixes: Suffixes = ("_x", "_y"), - tolerance: int | Timedelta | None = None, + tolerance: int | datetime.timedelta | None = None, allow_exact_matches: bool = True, direction: str = "backward", ) -> DataFrame: @@ -475,7 +694,9 @@ def merge_asof( Parameters ---------- left : DataFrame or named Series + First pandas object to merge. right : DataFrame or named Series + Second pandas object to merge. on : label Field name to join on. Must be found in both DataFrames. The data MUST be ordered. Furthermore this must be a numeric column, @@ -498,7 +719,7 @@ def merge_asof( suffixes : 2-length sequence (tuple, list, ...) 
Suffix to apply to overlapping column names in the left and right side, respectively. - tolerance : int or Timedelta, optional, default None + tolerance : int or timedelta, optional, default None Select asof tolerance within this range; must be compatible with the merge index. allow_exact_matches : bool, default True @@ -514,6 +735,7 @@ def merge_asof( Returns ------- DataFrame + A DataFrame of the two merged objects. See Also -------- @@ -598,20 +820,20 @@ def merge_asof( ... pd.Timestamp("2016-05-25 13:30:00.048"), ... pd.Timestamp("2016-05-25 13:30:00.049"), ... pd.Timestamp("2016-05-25 13:30:00.072"), - ... pd.Timestamp("2016-05-25 13:30:00.075") + ... pd.Timestamp("2016-05-25 13:30:00.075"), ... ], ... "ticker": [ - ... "GOOG", - ... "MSFT", - ... "MSFT", - ... "MSFT", - ... "GOOG", - ... "AAPL", - ... "GOOG", - ... "MSFT" - ... ], - ... "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], - ... "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03] + ... "GOOG", + ... "MSFT", + ... "MSFT", + ... "MSFT", + ... "GOOG", + ... "AAPL", + ... "GOOG", + ... "MSFT", + ... ], + ... "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], + ... "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03], ... } ... ) >>> quotes @@ -626,19 +848,19 @@ def merge_asof( 7 2016-05-25 13:30:00.075 MSFT 52.01 52.03 >>> trades = pd.DataFrame( - ... { - ... "time": [ - ... pd.Timestamp("2016-05-25 13:30:00.023"), - ... pd.Timestamp("2016-05-25 13:30:00.038"), - ... pd.Timestamp("2016-05-25 13:30:00.048"), - ... pd.Timestamp("2016-05-25 13:30:00.048"), - ... pd.Timestamp("2016-05-25 13:30:00.048") - ... ], - ... "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], - ... "price": [51.95, 51.95, 720.77, 720.92, 98.0], - ... "quantity": [75, 155, 100, 100, 100] - ... } - ... ) + ... { + ... "time": [ + ... pd.Timestamp("2016-05-25 13:30:00.023"), + ... pd.Timestamp("2016-05-25 13:30:00.038"), + ... pd.Timestamp("2016-05-25 13:30:00.048"), + ... pd.Timestamp("2016-05-25 13:30:00.048"), + ... pd.Timestamp("2016-05-25 13:30:00.048"), + ... ], + ... "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], + ... "price": [51.95, 51.95, 720.77, 720.92, 98.0], + ... "quantity": [75, 155, 100, 100, 100], + ... } + ... ) >>> trades time ticker price quantity 0 2016-05-25 13:30:00.023 MSFT 51.95 75 @@ -679,7 +901,7 @@ def merge_asof( ... on="time", ... by="ticker", ... tolerance=pd.Timedelta("10ms"), - ... allow_exact_matches=False + ... allow_exact_matches=False, ... ) time ticker price quantity bid ask 0 2016-05-25 13:30:00.023 MSFT 51.95 75 NaN NaN @@ -709,7 +931,6 @@ def merge_asof( # TODO: transformations?? 
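Since the annotation change above widens ``tolerance`` to accept a plain ``datetime.timedelta``, a minimal sketch with hypothetical data, equivalent to the ``pd.Timedelta("10ms")`` example in the docstring:

import datetime

import pandas as pd

quotes = pd.DataFrame({"time": pd.to_datetime(["2016-05-25 13:30:00.000"]), "bid": [720.50]})
trades = pd.DataFrame({"time": pd.to_datetime(["2016-05-25 13:30:00.005"]), "price": [720.77]})

# a stdlib timedelta is accepted wherever pd.Timedelta("10ms") was used
pd.merge_asof(trades, quotes, on="time", tolerance=datetime.timedelta(milliseconds=10))
#                      time   price    bid
# 0 2016-05-25 13:30:00.005  720.77  720.5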
-# TODO: only copy DataFrames when modification necessary class _MergeOperation: """ Perform a database (SQL) merge operation between two DataFrame or Series @@ -727,7 +948,6 @@ class _MergeOperation: right_index: bool sort: bool suffixes: Suffixes - copy: bool indicator: str | bool validate: str | None join_names: list[Hashable] @@ -738,7 +958,7 @@ def __init__( self, left: DataFrame | Series, right: DataFrame | Series, - how: JoinHow | Literal["asof"] = "inner", + how: JoinHow | Literal["left_anti", "right_anti", "asof"] = "inner", on: IndexLabel | AnyArrayLike | None = None, left_on: IndexLabel | AnyArrayLike | None = None, right_on: IndexLabel | AnyArrayLike | None = None, @@ -753,7 +973,7 @@ def __init__( _right = _validate_operand(right) self.left = self.orig_left = _left self.right = self.orig_right = _right - self.how = how + self.how, self.anti_join = self._validate_how(how) self.on = com.maybe_make_list(on) @@ -812,6 +1032,37 @@ def __init__( if validate is not None: self._validate_validate_kwd(validate) + @final + def _validate_how( + self, how: JoinHow | Literal["left_anti", "right_anti", "asof"] + ) -> tuple[JoinHow | Literal["asof"], bool]: + """ + Validate the 'how' parameter and return the actual join type and whether + this is an anti join. + """ + # GH 59435: raise when "how" is not a valid Merge type + merge_type = { + "left", + "right", + "inner", + "outer", + "left_anti", + "right_anti", + "cross", + "asof", + } + if how not in merge_type: + raise ValueError( + f"'{how}' is not a valid Merge type: " + f"left, right, inner, outer, left_anti, right_anti, cross, asof" + ) + anti_join = False + if how in {"left_anti", "right_anti"}: + how = how.split("_")[0] # type: ignore[assignment] + anti_join = True + how = cast(JoinHow | Literal["asof"], how) + return how, anti_join + def _maybe_require_matching_dtypes( self, left_join_keys: list[ArrayLike], right_join_keys: list[ArrayLike] ) -> None: @@ -828,7 +1079,6 @@ def _reindex_and_concat( join_index: Index, left_indexer: npt.NDArray[np.intp] | None, right_indexer: npt.NDArray[np.intp] | None, - copy: bool | None, ) -> DataFrame: """ reindex along index and concat along columns. 
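A minimal sketch, with hypothetical frames, of the ``left_anti`` behavior that ``_validate_how`` above admits and ``_handle_anti_join`` below implements: an ordinary left join filtered down to the unmatched left rows.

import pandas as pd

df1 = pd.DataFrame({"k": ["a", "b"], "x": [1, 2]})
df2 = pd.DataFrame({"k": ["b", "c"], "y": [3, 4]})

df1.merge(df2, on="k", how="left_anti")  # keep rows of df1 whose key is absent from df2
#    k  x   y
# 0  a  1 NaN

df1.merge(df2, on="k", how="sideways")  # anything outside the validated set raises
# ValueError: 'sideways' is not a valid Merge type: left, right, inner, outer,
# left_anti, right_anti, cross, asof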
@@ -849,7 +1099,6 @@ def _reindex_and_concat( join_index, left_indexer, axis=1, - copy=False, only_slice=True, allow_dups=True, use_na_proxy=True, @@ -864,7 +1113,6 @@ def _reindex_and_concat( join_index, right_indexer, axis=1, - copy=False, only_slice=True, allow_dups=True, use_na_proxy=True, @@ -876,18 +1124,16 @@ def _reindex_and_concat( left.columns = llabels right.columns = rlabels - result = concat([left, right], axis=1, copy=copy) + result = concat([left, right], axis=1) return result - def get_result(self, copy: bool | None = True) -> DataFrame: + def get_result(self) -> DataFrame: if self.indicator: self.left, self.right = self._indicator_pre_merge(self.left, self.right) join_index, left_indexer, right_indexer = self._get_join_info() - result = self._reindex_and_concat( - join_index, left_indexer, right_indexer, copy=copy - ) + result = self._reindex_and_concat(join_index, left_indexer, right_indexer) result = result.__finalize__(self, method=self._merge_type) if self.indicator: @@ -1187,6 +1433,11 @@ def _get_join_info( n = len(left_ax) if left_indexer is None else len(left_indexer) join_index = default_index(n) + if self.anti_join: + join_index, left_indexer, right_indexer = self._handle_anti_join( + join_index, left_indexer, right_indexer + ) + return join_index, left_indexer, right_indexer @final @@ -1229,6 +1480,48 @@ def _create_join_index( return index.copy() return index.take(indexer) + @final + def _handle_anti_join( + self, + join_index: Index, + left_indexer: npt.NDArray[np.intp] | None, + right_indexer: npt.NDArray[np.intp] | None, + ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: + """ + Handle anti join by returning the correct join index and indexers + + Parameters + ---------- + join_index : Index + join index + left_indexer : np.ndarray[np.intp] or None + left indexer + right_indexer : np.ndarray[np.intp] or None + right indexer + + Returns + ------- + Index, np.ndarray[np.intp] or None, np.ndarray[np.intp] or None + """ + # Make sure indexers are not None + if left_indexer is None: + left_indexer = np.arange(len(self.left)) + if right_indexer is None: + right_indexer = np.arange(len(self.right)) + + assert self.how in {"left", "right"} + if self.how == "left": + # Filter to rows where left keys are not in right keys + filt = right_indexer == -1 + else: + # Filter to rows where right keys are not in left keys + filt = left_indexer == -1 + join_index = join_index[filt] + left_indexer = left_indexer[filt] + right_indexer = right_indexer[filt] + + return join_index, left_indexer, right_indexer + @final def _get_merge_keys( self, @@ -1526,6 +1819,11 @@ def _maybe_coerce_merge_keys(self) -> None: ) or (lk.dtype.kind == "M" and rk.dtype.kind == "M"): # allows datetime with different resolutions continue + # datetime and timedelta not allowed + elif lk.dtype.kind == "M" and rk.dtype.kind == "m": + raise ValueError(msg) + elif lk.dtype.kind == "m" and rk.dtype.kind == "M": + raise ValueError(msg) elif is_object_dtype(lk.dtype) and is_object_dtype(rk.dtype): continue @@ -1575,7 +1873,7 @@ def _validate_left_right_on(self, left_on, right_on): not left_cols.join(common_cols, how="inner").is_unique or not right_cols.join(common_cols, how="inner").is_unique ): - raise MergeError(f"Data columns not unique: {repr(common_cols)}") + raise MergeError(f"Data columns not unique: {common_cols!r}") left_on = right_on = common_cols elif self.on is not None: if left_on is not None or right_on is not None: @@ -1706,9 +2004,9 @@ def get_join_indexers( 
np.ndarray[np.intp] or None Indexer into the right_keys. """ - assert len(left_keys) == len( - right_keys - ), "left_keys and right_keys must be the same length" + assert len(left_keys) == len(right_keys), ( + "left_keys and right_keys must be the same length" + ) # fast-path for empty left/right left_n = len(left_keys[0]) @@ -1785,7 +2083,10 @@ def get_join_indexers_non_unique( np.ndarray[np.intp] Indexer into right. """ - lkey, rkey, count = _factorize_keys(left, right, sort=sort) + lkey, rkey, count = _factorize_keys(left, right, sort=sort, how=how) + if count == -1: + # hash join + return lkey, rkey if how == "left": lidx, ridx = libjoin.left_outer_join(lkey, rkey, count, sort=sort) elif how == "right": @@ -1917,7 +2218,7 @@ def __init__( sort=True, # factorize sorts ) - def get_result(self, copy: bool | None = True) -> DataFrame: + def get_result(self) -> DataFrame: join_index, left_indexer, right_indexer = self._get_join_info() left_join_indexer: npt.NDArray[np.intp] | None @@ -1925,10 +2226,9 @@ def get_result(self, copy: bool | None = True) -> DataFrame: if self.fill_method == "ffill": if left_indexer is None: - raise TypeError("left_indexer cannot be None") - left_indexer = cast("npt.NDArray[np.intp]", left_indexer) - right_indexer = cast("npt.NDArray[np.intp]", right_indexer) - left_join_indexer = libjoin.ffill_indexer(left_indexer) + left_join_indexer = None + else: + left_join_indexer = libjoin.ffill_indexer(left_indexer) if right_indexer is None: right_join_indexer = None else: @@ -1940,7 +2240,7 @@ def get_result(self, copy: bool | None = True) -> DataFrame: raise ValueError("fill_method must be 'ffill' or None") result = self._reindex_and_concat( - join_index, left_join_indexer, right_join_indexer, copy=copy + join_index, left_join_indexer, right_join_indexer ) self._maybe_add_join_keys(result, left_indexer, right_indexer) @@ -2066,8 +2366,8 @@ def _validate_left_right_on(self, left_on, right_on): or is_string_dtype(ro_dtype) ): raise MergeError( - f"Incompatible merge dtype, {repr(ro_dtype)} and " - f"{repr(lo_dtype)}, both sides must have numeric dtype" + f"Incompatible merge dtype, {lo_dtype!r} and " + f"{ro_dtype!r}, both sides must have numeric dtype" ) # add 'by' to our key-list so we can have it in the @@ -2091,7 +2391,7 @@ def _maybe_require_matching_dtypes( ) -> None: # TODO: why do we do this for AsOfMerge but not the others? - def _check_dtype_match(left: ArrayLike, right: ArrayLike, i: int): + def _check_dtype_match(left: ArrayLike, right: ArrayLike, i: int) -> None: if left.dtype != right.dtype: if isinstance(left.dtype, CategoricalDtype) and isinstance( right.dtype, CategoricalDtype @@ -2105,13 +2405,13 @@ def _check_dtype_match(left: ArrayLike, right: ArrayLike, i: int): # later with a ValueError, so we don't *need* to check # for them here. 
msg = ( - f"incompatible merge keys [{i}] {repr(left.dtype)} and " - f"{repr(right.dtype)}, both sides category, but not equal ones" + f"incompatible merge keys [{i}] {left.dtype!r} and " + f"{right.dtype!r}, both sides category, but not equal ones" ) else: msg = ( - f"incompatible merge keys [{i}] {repr(left.dtype)} and " - f"{repr(right.dtype)}, must be the same type" + f"incompatible merge keys [{i}] {left.dtype!r} and " + f"{right.dtype!r}, must be the same type" ) raise MergeError(msg) @@ -2141,7 +2441,7 @@ def _validate_tolerance(self, left_join_keys: list[ArrayLike]) -> None: msg = ( f"incompatible tolerance {self.tolerance}, must be compat " - f"with type {repr(lt.dtype)}" + f"with type {lt.dtype!r}" ) if needs_i8_conversion(lt.dtype) or ( @@ -2306,7 +2606,7 @@ def _get_multiindex_indexer( if sort: rcodes = list(map(np.take, rcodes, index.codes)) else: - i8copy = lambda a: a.astype("i8", subok=False, copy=True) + i8copy = lambda a: a.astype("i8", subok=False) rcodes = list(map(i8copy, index.codes)) # fix right labels if there were any nulls @@ -2392,7 +2692,10 @@ def _left_join_on_index( def _factorize_keys( - lk: ArrayLike, rk: ArrayLike, sort: bool = True + lk: ArrayLike, + rk: ArrayLike, + sort: bool = True, + how: str | None = None, ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: """ Encode left and right keys as enumerated types. @@ -2408,6 +2711,9 @@ def _factorize_keys( sort : bool, defaults to True If True, the encoding is done such that the unique elements in the keys are sorted. + how : str, optional + Used to determine whether we can use hash-join. If not given, keys are + simply factorized. Returns ------- @@ -2416,7 +2722,8 @@ def _factorize_keys( np.ndarray[np.intp] Right (resp. left if called with `key='right'`) labels, as enumerated type. int - Number of unique elements in union of left and right labels. + Number of unique elements in union of left and right labels. -1 if we used + a hash-join.
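For intuition about that -1 return value: when ``how="inner"``, ``sort=False`` and the keys are numeric or boolean, `_factorize_keys` factorizes only the right side and, when its keys turn out to be unique, probes the factorizer's hash table with the left keys directly (see `hash_inner_join` below). A rough pure-Python sketch of the idea; the real implementation lives in Cython, and `naive_hash_inner_join` here is a hypothetical stand-in:

    import numpy as np
    from collections import defaultdict

    def naive_hash_inner_join(left, right):
        # Build a hash table over the right keys, then probe it with the
        # left keys; every hit emits one (left, right) position pair.
        table = defaultdict(list)
        for j, key in enumerate(right):
            table[key].append(j)
        lidx, ridx = [], []
        for i, key in enumerate(left):
            for j in table.get(key, ()):
                lidx.append(i)
                ridx.append(j)
        return np.array(lidx, dtype=np.intp), np.array(ridx, dtype=np.intp)

    lidx, ridx = naive_hash_inner_join([1, 2, 2, 3], [2, 3, 4])
    # lidx -> [1, 2, 3], ridx -> [0, 0, 1]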
See Also -------- @@ -2469,8 +2776,7 @@ def _factorize_keys( elif isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype: if (isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype)) or ( - isinstance(lk.dtype, StringDtype) - and lk.dtype.storage in ["pyarrow", "pyarrow_numpy"] + isinstance(lk.dtype, StringDtype) and lk.dtype.storage == "pyarrow" ): import pyarrow as pa import pyarrow.compute as pc @@ -2483,18 +2789,30 @@ def _factorize_keys( .combine_chunks() .dictionary_encode() ) - length = len(dc.dictionary) llab, rlab, count = ( - pc.fill_null(dc.indices[slice(len_lk)], length) + pc.fill_null(dc.indices[slice(len_lk)], -1) .to_numpy() .astype(np.intp, copy=False), - pc.fill_null(dc.indices[slice(len_lk, None)], length) + pc.fill_null(dc.indices[slice(len_lk, None)], -1) .to_numpy() .astype(np.intp, copy=False), len(dc.dictionary), ) + + if sort: + uniques = dc.dictionary.to_numpy(zero_copy_only=False) + llab, rlab = _sort_labels(uniques, llab, rlab) + if dc.null_count > 0: + lmask = llab == -1 + lany = lmask.any() + rmask = rlab == -1 + rany = rmask.any() + if lany: + np.putmask(llab, lmask, count) + if rany: + np.putmask(rlab, rmask, count) count += 1 return llab, rlab, count @@ -2503,8 +2821,7 @@ def _factorize_keys( isinstance(lk.dtype, ArrowDtype) and ( is_numeric_dtype(lk.dtype.numpy_dtype) - or is_string_dtype(lk.dtype) - and not sort + or (is_string_dtype(lk.dtype) and not sort) ) ): lk, _ = lk._values_for_factorize() @@ -2522,28 +2839,41 @@ def _factorize_keys( klass, lk, rk = _convert_arrays_and_get_rizer_klass(lk, rk) - rizer = klass(max(len(lk), len(rk))) + rizer = klass( + max(len(lk), len(rk)), + uses_mask=isinstance(rk, (BaseMaskedArray, ArrowExtensionArray)), + ) if isinstance(lk, BaseMaskedArray): assert isinstance(rk, BaseMaskedArray) - llab = rizer.factorize(lk._data, mask=lk._mask) - rlab = rizer.factorize(rk._data, mask=rk._mask) + lk_data, lk_mask = lk._data, lk._mask + rk_data, rk_mask = rk._data, rk._mask elif isinstance(lk, ArrowExtensionArray): assert isinstance(rk, ArrowExtensionArray) # we can only get here with numeric dtypes # TODO: Remove when we have a Factorizer for Arrow - llab = rizer.factorize( - lk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=lk.isna() - ) - rlab = rizer.factorize( - rk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=rk.isna() - ) + lk_data = lk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype) + rk_data = rk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype) + lk_mask, rk_mask = lk.isna(), rk.isna() else: # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]], # ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]" - llab = rizer.factorize(lk) # type: ignore[arg-type] - rlab = rizer.factorize(rk) # type: ignore[arg-type] + lk_data, rk_data = lk, rk # type: ignore[assignment] + lk_mask, rk_mask = None, None + + hash_join_available = how == "inner" and not sort and lk.dtype.kind in "iufb" + if hash_join_available: + rlab = rizer.factorize(rk_data, mask=rk_mask) + if rizer.get_count() == len(rlab): + ridx, lidx = rizer.hash_inner_join(lk_data, lk_mask) + return lidx, ridx, -1 + else: + llab = rizer.factorize(lk_data, mask=lk_mask) + else: + llab = rizer.factorize(lk_data, mask=lk_mask) + rlab = rizer.factorize(rk_data, mask=rk_mask) + assert llab.dtype == np.dtype(np.intp), llab.dtype assert rlab.dtype == np.dtype(np.intp), rlab.dtype @@ -2591,9 +2921,7 @@ def _convert_arrays_and_get_rizer_klass( lk = lk.astype(dtype, 
copy=False) rk = rk.astype(dtype, copy=False) if isinstance(lk, BaseMaskedArray): - # Invalid index type "type" for "Dict[Type[object], Type[Factorizer]]"; - # expected type "Type[object]" - klass = _factorizers[lk.dtype.type] # type: ignore[index] + klass = _factorizers[lk.dtype.type] elif isinstance(lk.dtype, ArrowDtype): klass = _factorizers[lk.dtype.numpy_dtype.type] else: diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index b2a915589cba7..ac89f19b80a0f 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -1,25 +1,15 @@ from __future__ import annotations -from collections.abc import ( - Hashable, - Sequence, -) +import itertools from typing import ( TYPE_CHECKING, - Callable, Literal, cast, ) -import warnings import numpy as np from pandas._libs import lib -from pandas.util._decorators import ( - Appender, - Substitution, -) -from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.cast import maybe_downcast_to_dtype from pandas.core.dtypes.common import ( @@ -34,7 +24,6 @@ ) import pandas.core.common as com -from pandas.core.frame import _shared_docs from pandas.core.groupby import Grouper from pandas.core.indexes.api import ( Index, @@ -42,24 +31,25 @@ get_objs_combined_axis, ) from pandas.core.reshape.concat import concat -from pandas.core.reshape.util import cartesian_product from pandas.core.series import Series if TYPE_CHECKING: + from collections.abc import ( + Callable, + Hashable, + ) + from pandas._typing import ( AggFuncType, AggFuncTypeBase, AggFuncTypeDict, IndexLabel, + SequenceNotStr, ) from pandas import DataFrame -# Note: We need to make sure `frame` is imported before `pivot`, otherwise -# _shared_docs['pivot_table'] will not yet exist. TODO: Fix this dependency -@Substitution("\ndata : DataFrame") -@Appender(_shared_docs["pivot_table"], indents=1) def pivot_table( data: DataFrame, values=None, @@ -70,9 +60,183 @@ def pivot_table( margins: bool = False, dropna: bool = True, margins_name: Hashable = "All", - observed: bool | lib.NoDefault = lib.no_default, + observed: bool = True, sort: bool = True, + **kwargs, ) -> DataFrame: + """ + Create a spreadsheet-style pivot table as a DataFrame. + + The levels in the pivot table will be stored in MultiIndex objects + (hierarchical indexes) on the index and columns of the result DataFrame. + + Parameters + ---------- + data : DataFrame + Input pandas DataFrame object. + values : list-like or scalar, optional + Column or columns to aggregate. + index : column, Grouper, array, or sequence of the previous + Keys to group by on the pivot table index. If a list is passed, + it can contain any of the other types (except list). If an array is + passed, it must be the same length as the data and will be used in + the same manner as column values. + columns : column, Grouper, array, or sequence of the previous + Keys to group by on the pivot table column. If a list is passed, + it can contain any of the other types (except list). If an array is + passed, it must be the same length as the data and will be used in + the same manner as column values. + aggfunc : function, list of functions, dict, default "mean" + If a list of functions is passed, the resulting pivot table will have + hierarchical columns whose top level are the function names + (inferred from the function objects themselves). + If a dict is passed, the key is column to aggregate and the value is + function or list of functions. 
If ``margins=True``, aggfunc will be + used to calculate the partial aggregates. + fill_value : scalar, default None + Value to replace missing values with (in the resulting pivot table, + after aggregation). + margins : bool, default False + If ``margins=True``, special ``All`` columns and rows + will be added with partial group aggregates across the categories + on the rows and columns. + dropna : bool, default True + Do not include columns whose entries are all NaN. If True, + + * rows with an NA value in any column will be omitted before computing margins, + * index/column keys containing NA values will be dropped (see ``dropna`` + parameter in :meth:`DataFrame.groupby`). + + margins_name : str, default 'All' + Name of the row / column that will contain the totals + when margins is True. + observed : bool, default True + This only applies if any of the groupers are Categoricals. + If True: only show observed values for categorical groupers. + If False: show all values for categorical groupers. + + .. versionchanged:: 3.0.0 + + The default value is now ``True``. + + sort : bool, default True + Specifies if the result should be sorted. + + .. versionadded:: 1.3.0 + + **kwargs : dict + Optional keyword arguments to pass to ``aggfunc``. + + .. versionadded:: 3.0.0 + + Returns + ------- + DataFrame + An Excel style pivot table. + + See Also + -------- + DataFrame.pivot : Pivot without aggregation that can handle + non-numeric data. + DataFrame.melt : Unpivot a DataFrame from wide to long format, + optionally leaving identifiers set. + wide_to_long : Wide panel to long format. Less flexible but more + user-friendly than melt. + + Notes + ----- + Reference :ref:`the user guide ` for more examples. + + Examples + -------- + >>> df = pd.DataFrame( + ... { + ... "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"], + ... "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"], + ... "C": [ + ... "small", + ... "large", + ... "large", + ... "small", + ... "small", + ... "large", + ... "small", + ... "small", + ... "large", + ... ], + ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], + ... "E": [2, 4, 5, 5, 6, 6, 8, 9, 9], + ... } + ... ) + >>> df A B C D E 0 foo one small 1 2 1 foo one large 2 4 2 foo one large 2 5 3 foo two small 3 5 4 foo two small 3 6 5 bar one large 4 6 6 bar one small 5 8 7 bar two small 6 9 8 bar two large 7 9 + + This first example aggregates values by taking the sum. + + >>> table = pd.pivot_table( + ... df, values="D", index=["A", "B"], columns=["C"], aggfunc="sum" + ... ) + >>> table C large small A B bar one 4.0 5.0 two 7.0 6.0 foo one 4.0 1.0 two NaN 6.0 + + We can also fill missing values using the `fill_value` parameter. + + >>> table = pd.pivot_table( + ... df, values="D", index=["A", "B"], columns=["C"], aggfunc="sum", fill_value=0 + ... ) + >>> table C large small A B bar one 4 5 two 7 6 foo one 4 1 two 0 6 + + The next example aggregates by taking the mean across multiple columns. + + >>> table = pd.pivot_table( + ... df, values=["D", "E"], index=["A", "C"], aggfunc={"D": "mean", "E": "mean"} + ... ) + >>> table D E A C bar large 5.500000 7.500000 small 5.500000 8.500000 foo large 2.000000 4.500000 small 2.333333 4.333333 + + We can also calculate multiple types of aggregations for any given + value column. + + >>> table = pd.pivot_table( + ... df, + ... values=["D", "E"], + ... index=["A", "C"], + ... aggfunc={"D": "mean", "E": ["min", "max", "mean"]}, + ...
) + >>> table + D E + mean max mean min + A C + bar large 5.500000 9 7.500000 6 + small 5.500000 9 8.500000 8 + foo large 2.000000 5 4.500000 4 + small 2.333333 6 4.333333 2 + """ index = _convert_by(index) columns = _convert_by(columns) @@ -92,6 +256,7 @@ def pivot_table( margins_name=margins_name, observed=observed, sort=sort, + kwargs=kwargs, ) pieces.append(_table) keys.append(getattr(func, "__name__", func)) @@ -111,6 +276,7 @@ def pivot_table( margins_name, observed, sort, + kwargs, ) return table.__finalize__(data, method="pivot_table") @@ -125,8 +291,9 @@ def __internal_pivot_table( margins: bool, dropna: bool, margins_name: Hashable, - observed: bool | lib.NoDefault, + observed: bool, sort: bool, + kwargs, ) -> DataFrame: """ Helper of :func:`pandas.pivot_table` for any non-list ``aggfunc``. @@ -168,19 +335,13 @@ def __internal_pivot_table( pass values = list(values) - observed_bool = False if observed is lib.no_default else observed - grouped = data.groupby(keys, observed=observed_bool, sort=sort, dropna=dropna) - if observed is lib.no_default and any( - ping._passed_categorical for ping in grouped._grouper.groupings - ): - warnings.warn( - "The default value of observed=False is deprecated and will change " - "to observed=True in a future version of pandas. Specify " - "observed=False to silence this warning and retain the current behavior", - category=FutureWarning, - stacklevel=find_stack_level(), - ) - agged = grouped.agg(aggfunc) + grouped = data.groupby(keys, observed=observed, sort=sort, dropna=dropna) + if values_passed: + # GH#57876 and GH#61292 + # mypy is not aware `grouped[values]` will always be a DataFrameGroupBy + grouped = grouped[values] # type: ignore[assignment] + + agged = grouped.agg(aggfunc, **kwargs) if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns): agged = agged.dropna(how="all") @@ -204,15 +365,11 @@ def __internal_pivot_table( if not dropna: if isinstance(table.index, MultiIndex): - m = MultiIndex.from_arrays( - cartesian_product(table.index.levels), names=table.index.names - ) + m = MultiIndex.from_product(table.index.levels, names=table.index.names) table = table.reindex(m, axis=0, fill_value=fill_value) if isinstance(table.columns, MultiIndex): - m = MultiIndex.from_arrays( - cartesian_product(table.columns.levels), names=table.columns.names - ) + m = MultiIndex.from_product(table.columns.levels, names=table.columns.names) table = table.reindex(m, axis=1, fill_value=fill_value) if sort is True and isinstance(table, ABCDataFrame): @@ -235,6 +392,7 @@ def __internal_pivot_table( rows=index, cols=columns, aggfunc=aggfunc, + kwargs=kwargs, observed=dropna, margins_name=margins_name, fill_value=fill_value, @@ -260,6 +418,7 @@ def _add_margins( rows, cols, aggfunc, + kwargs, observed: bool, margins_name: Hashable = "All", fill_value=None, @@ -272,7 +431,7 @@ def _add_margins( if margins_name in table.index.get_level_values(level): raise ValueError(msg) - grand_margin = _compute_grand_margin(data, values, aggfunc, margins_name) + grand_margin = _compute_grand_margin(data, values, aggfunc, kwargs, margins_name) if table.ndim == 2: # i.e. 
DataFrame @@ -293,7 +452,15 @@ def _add_margins( elif values: marginal_result_set = _generate_marginal_results( - table, data, values, rows, cols, aggfunc, observed, margins_name + table, + data, + values, + rows, + cols, + aggfunc, + kwargs, + observed, + margins_name, ) if not isinstance(marginal_result_set, tuple): return marginal_result_set @@ -302,7 +469,7 @@ def _add_margins( # no values, and table is a DataFrame assert isinstance(table, ABCDataFrame) marginal_result_set = _generate_marginal_results_without_values( - table, data, rows, cols, aggfunc, observed, margins_name + table, data, rows, cols, aggfunc, kwargs, observed, margins_name ) if not isinstance(marginal_result_set, tuple): return marginal_result_set @@ -339,26 +506,26 @@ def _add_margins( def _compute_grand_margin( - data: DataFrame, values, aggfunc, margins_name: Hashable = "All" + data: DataFrame, values, aggfunc, kwargs, margins_name: Hashable = "All" ): if values: grand_margin = {} for k, v in data[values].items(): try: if isinstance(aggfunc, str): - grand_margin[k] = getattr(v, aggfunc)() + grand_margin[k] = getattr(v, aggfunc)(**kwargs) elif isinstance(aggfunc, dict): if isinstance(aggfunc[k], str): - grand_margin[k] = getattr(v, aggfunc[k])() + grand_margin[k] = getattr(v, aggfunc[k])(**kwargs) else: - grand_margin[k] = aggfunc[k](v) + grand_margin[k] = aggfunc[k](v, **kwargs) else: - grand_margin[k] = aggfunc(v) + grand_margin[k] = aggfunc(v, **kwargs) except TypeError: pass return grand_margin else: - return {margins_name: aggfunc(data.index)} + return {margins_name: aggfunc(data.index, **kwargs)} def _generate_marginal_results( @@ -368,6 +535,7 @@ def _generate_marginal_results( rows, cols, aggfunc, + kwargs, observed: bool, margins_name: Hashable = "All", ): @@ -381,21 +549,28 @@ def _all_key(key): return (key, margins_name) + ("",) * (len(cols) - 1) if len(rows) > 0: - margin = data[rows + values].groupby(rows, observed=observed).agg(aggfunc) + margin = ( + data[rows + values] + .groupby(rows, observed=observed) + .agg(aggfunc, **kwargs) + ) cat_axis = 1 for key, piece in table.T.groupby(level=0, observed=observed): piece = piece.T all_key = _all_key(key) - # we are going to mutate this, so need to copy! - piece = piece.copy() piece[all_key] = margin[key] table_pieces.append(piece) margin_keys.append(all_key) else: - from pandas import DataFrame + margin = ( + data[cols[:1] + values] + .groupby(cols[:1], observed=observed) + .agg(aggfunc, **kwargs) + .T + ) cat_axis = 0 for key, piece in table.groupby(level=0, observed=observed): @@ -404,13 +579,15 @@ def _all_key(key): else: all_key = margins_name table_pieces.append(piece) - # GH31016 this is to calculate margin for each group, and assign - # corresponded key as index - transformed_piece = DataFrame(piece.apply(aggfunc)).T + transformed_piece = margin[key].to_frame().T if isinstance(piece.index, MultiIndex): # We are adding an empty level transformed_piece.index = MultiIndex.from_tuples( - [all_key], names=piece.index.names + [None] + [all_key], + names=piece.index.names + + [ + None, + ], ) else: transformed_piece.index = Index([all_key], name=piece.index.name) @@ -432,11 +609,13 @@ def _all_key(key): margin_keys = table.columns if len(cols) > 0: - row_margin = data[cols + values].groupby(cols, observed=observed).agg(aggfunc) - row_margin = row_margin.stack(future_stack=True) + row_margin = ( + data[cols + values].groupby(cols, observed=observed).agg(aggfunc, **kwargs) + ) + row_margin = row_margin.stack() # GH#26568. 
Use names instead of indices in case of numeric names - new_order_indices = [len(cols)] + list(range(len(cols))) + new_order_indices = itertools.chain([len(cols)], range(len(cols))) new_order_names = [row_margin.index.names[i] for i in new_order_indices] row_margin.index = row_margin.index.reorder_levels(new_order_names) else: @@ -451,6 +630,7 @@ def _generate_marginal_results_without_values( rows, cols, aggfunc, + kwargs, observed: bool, margins_name: Hashable = "All", ): @@ -465,14 +645,16 @@ def _all_key(): return (margins_name,) + ("",) * (len(cols) - 1) if len(rows) > 0: - margin = data.groupby(rows, observed=observed)[rows].apply(aggfunc) + margin = data.groupby(rows, observed=observed)[rows].apply( + aggfunc, **kwargs + ) all_key = _all_key() table[all_key] = margin result = table margin_keys.append(all_key) else: - margin = data.groupby(level=0, axis=0, observed=observed).apply(aggfunc) + margin = data.groupby(level=0, observed=observed).apply(aggfunc, **kwargs) all_key = _all_key() table[all_key] = margin result = table @@ -483,7 +665,9 @@ def _all_key(): margin_keys = table.columns if len(cols): - row_margin = data.groupby(cols, observed=observed)[cols].apply(aggfunc) + row_margin = data.groupby(cols, observed=observed)[cols].apply( + aggfunc, **kwargs + ) else: row_margin = Series(np.nan, index=result.columns) @@ -504,8 +688,6 @@ def _convert_by(by): return by -@Substitution("\ndata : DataFrame") -@Appender(_shared_docs["pivot"], indents=1) def pivot( data: DataFrame, *, @@ -513,16 +695,162 @@ def pivot( index: IndexLabel | lib.NoDefault = lib.no_default, values: IndexLabel | lib.NoDefault = lib.no_default, ) -> DataFrame: + """ + Return reshaped DataFrame organized by given index / column values. + + Reshape data (produce a "pivot" table) based on column values. Uses + unique values from specified `index` / `columns` to form axes of the + resulting DataFrame. This function does not support data + aggregation, multiple values will result in a MultiIndex in the + columns. See the :ref:`User Guide ` for more on reshaping. + + Parameters + ---------- + data : DataFrame + Input pandas DataFrame object. + columns : Hashable or a sequence of the previous + Column to use to make new frame's columns. + index : Hashable or a sequence of the previous, optional + Column to use to make new frame's index. If not given, uses existing index. + values : Hashable or a sequence of the previous, optional + Column(s) to use for populating new frame's values. If not + specified, all remaining columns will be used and the result will + have hierarchically indexed columns. + + Returns + ------- + DataFrame + Returns reshaped DataFrame. + + Raises + ------ + ValueError: + When there are any `index`, `columns` combinations with multiple + values. `DataFrame.pivot_table` when you need to aggregate. + + See Also + -------- + DataFrame.pivot_table : Generalization of pivot that can handle + duplicate values for one index/column pair. + DataFrame.unstack : Pivot based on the index values instead of a + column. + wide_to_long : Wide panel to long format. Less flexible but more + user-friendly than melt. + + Notes + ----- + For finer-tuned control, see hierarchical indexing documentation along + with the related stack/unstack methods. + + Reference :ref:`the user guide ` for more examples. + + Examples + -------- + >>> df = pd.DataFrame( + ... { + ... "foo": ["one", "one", "one", "two", "two", "two"], + ... "bar": ["A", "B", "C", "A", "B", "C"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... 
"zoo": ["x", "y", "z", "q", "w", "t"], + ... } + ... ) + >>> df + foo bar baz zoo + 0 one A 1 x + 1 one B 2 y + 2 one C 3 z + 3 two A 4 q + 4 two B 5 w + 5 two C 6 t + + >>> df.pivot(index="foo", columns="bar", values="baz") + bar A B C + foo + one 1 2 3 + two 4 5 6 + + >>> df.pivot(index="foo", columns="bar")["baz"] + bar A B C + foo + one 1 2 3 + two 4 5 6 + + >>> df.pivot(index="foo", columns="bar", values=["baz", "zoo"]) + baz zoo + bar A B C A B C + foo + one 1 2 3 x y z + two 4 5 6 q w t + + You could also assign a list of column names or a list of index names. + + >>> df = pd.DataFrame( + ... { + ... "lev1": [1, 1, 1, 2, 2, 2], + ... "lev2": [1, 1, 2, 1, 1, 2], + ... "lev3": [1, 2, 1, 2, 1, 2], + ... "lev4": [1, 2, 3, 4, 5, 6], + ... "values": [0, 1, 2, 3, 4, 5], + ... } + ... ) + >>> df + lev1 lev2 lev3 lev4 values + 0 1 1 1 1 0 + 1 1 1 2 2 1 + 2 1 2 1 3 2 + 3 2 1 2 4 3 + 4 2 1 1 5 4 + 5 2 2 2 6 5 + + >>> df.pivot(index="lev1", columns=["lev2", "lev3"], values="values") + lev2 1 2 + lev3 1 2 1 2 + lev1 + 1 0.0 1.0 2.0 NaN + 2 4.0 3.0 NaN 5.0 + + >>> df.pivot(index=["lev1", "lev2"], columns=["lev3"], values="values") + lev3 1 2 + lev1 lev2 + 1 1 0.0 1.0 + 2 2.0 NaN + 2 1 4.0 3.0 + 2 NaN 5.0 + + A ValueError is raised if there are any duplicates. + + >>> df = pd.DataFrame( + ... { + ... "foo": ["one", "one", "two", "two"], + ... "bar": ["A", "A", "B", "C"], + ... "baz": [1, 2, 3, 4], + ... } + ... ) + >>> df + foo bar baz + 0 one A 1 + 1 one A 2 + 2 two B 3 + 3 two C 4 + + Notice that the first two rows are the same for our `index` + and `columns` arguments. + + >>> df.pivot(index="foo", columns="bar", values="baz") + Traceback (most recent call last): + ... + ValueError: Index contains duplicate entries, cannot reshape + """ columns_listlike = com.convert_to_list_like(columns) # If columns is None we will create a MultiIndex level with None as name # which might cause duplicated names because None is the default for # level names - data = data.copy(deep=False) - data.index = data.index.copy() - data.index.names = [ - name if name is not None else lib.no_default for name in data.index.names - ] + if any(name is None for name in data.index.names): + data = data.copy(deep=False) + data.index.names = [ + name if name is not None else lib.no_default for name in data.index.names + ] indexed: DataFrame | Series if values is lib.no_default: @@ -535,7 +863,8 @@ def pivot( # error: Unsupported operand types for + ("List[Any]" and "ExtensionArray") # error: Unsupported left operand type for + ("ExtensionArray") indexed = data.set_index( - cols + columns_listlike, append=append # type: ignore[operator] + cols + columns_listlike, # type: ignore[operator] + append=append, ) else: index_list: list[Index] | list[Series] @@ -558,16 +887,18 @@ def pivot( if is_list_like(values) and not isinstance(values, tuple): # Exclude tuple because it is seen as a single column name - values = cast(Sequence[Hashable], values) indexed = data._constructor( - data[values]._values, index=multiindex, columns=values + data[values]._values, + index=multiindex, + columns=cast("SequenceNotStr", values), ) else: indexed = data._constructor_sliced(data[values]._values, index=multiindex) # error: Argument 1 to "unstack" of "DataFrame" has incompatible type "Union # [List[Any], ExtensionArray, ndarray[Any, Any], Index, Series]"; expected # "Hashable" - result = indexed.unstack(columns_listlike) # type: ignore[arg-type] + # unstack with a MultiIndex returns a DataFrame + result = cast("DataFrame", 
indexed.unstack(columns_listlike)) # type: ignore[arg-type] result.index.names = [ name if name is not lib.no_default else None for name in result.index.names ] @@ -649,14 +980,55 @@ def crosstab( Examples -------- - >>> a = np.array(["foo", "foo", "foo", "foo", "bar", "bar", - ... "bar", "bar", "foo", "foo", "foo"], dtype=object) - >>> b = np.array(["one", "one", "one", "two", "one", "one", - ... "one", "two", "two", "two", "one"], dtype=object) - >>> c = np.array(["dull", "dull", "shiny", "dull", "dull", "shiny", - ... "shiny", "dull", "shiny", "shiny", "shiny"], - ... dtype=object) - >>> pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c']) + >>> a = np.array( + ... [ + ... "foo", + ... "foo", + ... "foo", + ... "foo", + ... "bar", + ... "bar", + ... "bar", + ... "bar", + ... "foo", + ... "foo", + ... "foo", + ... ], + ... dtype=object, + ... ) + >>> b = np.array( + ... [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... "one", + ... ], + ... dtype=object, + ... ) + >>> c = np.array( + ... [ + ... "dull", + ... "dull", + ... "shiny", + ... "dull", + ... "dull", + ... "shiny", + ... "shiny", + ... "dull", + ... "shiny", + ... "shiny", + ... "shiny", + ... ], + ... dtype=object, + ... ) + >>> pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"]) b one two c dull shiny dull shiny a @@ -667,8 +1039,8 @@ def crosstab( shown in the output because dropna is True by default. Set dropna=False to preserve categories with no data. - >>> foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c']) - >>> bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f']) + >>> foo = pd.Categorical(["a", "b"], categories=["a", "b", "c"]) + >>> bar = pd.Categorical(["d", "e"], categories=["d", "e", "f"]) >>> pd.crosstab(foo, bar) col_0 d e row_0 @@ -732,7 +1104,7 @@ def crosstab( margins=margins, margins_name=margins_name, dropna=dropna, - observed=False, + observed=dropna, **kwargs, # type: ignore[arg-type] ) @@ -804,7 +1176,7 @@ def _normalize( elif normalize == "index": index_margin = index_margin / index_margin.sum() - table = table._append(index_margin) + table = table._append(index_margin, ignore_index=True) table = table.fillna(0) table.index = table_index @@ -813,7 +1185,7 @@ def _normalize( index_margin = index_margin / index_margin.sum() index_margin.loc[margins_name] = 1 table = concat([table, column_margin], axis=1) - table = table._append(index_margin) + table = table._append(index_margin, ignore_index=True) table = table.fillna(0) table.index = table_index @@ -828,7 +1200,7 @@ def _normalize( return table -def _get_names(arrs, names, prefix: str = "row"): +def _get_names(arrs, names, prefix: str = "row") -> list: if names is None: names = [] for i, arr in enumerate(arrs): @@ -874,13 +1246,7 @@ def _build_names_mapper( a list of column names with duplicate names replaced by dummy names """ - - def get_duplicates(names): - seen: set = set() - return {name for name in names if name not in seen} - - shared_names = set(rownames).intersection(set(colnames)) - dup_names = get_duplicates(rownames) | get_duplicates(colnames) | shared_names + dup_names = set(rownames) | set(colnames) rownames_mapper = { f"row_{i}": name for i, name in enumerate(rownames) if name in dup_names diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 7a49682d7c57c..d2a838b616426 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -4,11 +4,14 @@ from typing import ( 
TYPE_CHECKING, cast, + overload, ) import warnings import numpy as np +from pandas._config.config import get_option + import pandas._libs.reshape as libreshape from pandas.errors import PerformanceWarning from pandas.util._decorators import cache_readonly @@ -32,13 +35,14 @@ factorize, unique, ) +from pandas.core.arrays._mixins import NDArrayBackedExtensionArray from pandas.core.arrays.categorical import factorize_from_iterable from pandas.core.construction import ensure_wrapped_if_datetimelike from pandas.core.frame import DataFrame from pandas.core.indexes.api import ( Index, MultiIndex, - RangeIndex, + default_index, ) from pandas.core.reshape.concat import concat from pandas.core.series import Series @@ -82,8 +86,9 @@ class _Unstacker: Examples -------- - >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'), - ... ('two', 'a'), ('two', 'b')]) + >>> index = pd.MultiIndex.from_tuples( + ... [("one", "a"), ("one", "b"), ("two", "a"), ("two", "b")] + ... ) >>> s = pd.Series(np.arange(1, 5, dtype=np.int64), index=index) >>> s one a 1 @@ -132,24 +137,24 @@ def __init__( self.removed_level = self.removed_level.take(unique_codes) self.removed_level_full = self.removed_level_full.take(unique_codes) - # Bug fix GH 20601 - # If the data frame is too big, the number of unique index combination - # will cause int32 overflow on windows environments. - # We want to check and raise an warning before this happens - num_rows = np.max([index_level.size for index_level in self.new_index_levels]) - num_columns = self.removed_level.size - - # GH20601: This forces an overflow if the number of cells is too high. - num_cells = num_rows * num_columns - - # GH 26314: Previous ValueError raised was too restrictive for many users. - if num_cells > np.iinfo(np.int32).max: - warnings.warn( - f"The following operation may generate {num_cells} cells " - f"in the resulting pandas object.", - PerformanceWarning, - stacklevel=find_stack_level(), - ) + if get_option("performance_warnings"): + # Bug fix GH 20601 + # If the data frame is too big, the number of unique index combinations + # will cause int32 overflow on Windows environments. + # We want to check and raise a warning before this happens + num_rows = max(index_level.size for index_level in self.new_index_levels) + num_columns = self.removed_level.size + + # GH20601: This forces an overflow if the number of cells is too high. + # GH 26314: Previous ValueError raised was too restrictive for many users.
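+ # For scale: 2**16 rows times 2**16 columns is 2**32 cells, + # which already exceeds np.iinfo(np.int32).max == 2**31 - 1.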
+ num_cells = num_rows * num_columns + if num_cells > np.iinfo(np.int32).max: + warnings.warn( + f"The following operation may generate {num_cells} cells " + f"in the resulting pandas object.", + PerformanceWarning, + stacklevel=find_stack_level(), + ) self._make_selectors() @@ -163,6 +168,9 @@ def _indexer_and_to_sort( v = self.level codes = list(self.index.codes) + if not self.sort: + # Create new codes considering that labels are already sorted + codes = [factorize(code)[0] for code in codes] levs = list(self.index.levels) to_sort = codes[:v] + codes[v + 1 :] + [codes[v]] sizes = tuple(len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]]) @@ -181,14 +189,11 @@ def sorted_labels(self) -> list[np.ndarray]: return to_sort def _make_sorted_values(self, values: np.ndarray) -> np.ndarray: - if self.sort: - indexer, _ = self._indexer_and_to_sort - - sorted_values = algos.take_nd(values, indexer, axis=0) - return sorted_values - return values + indexer, _ = self._indexer_and_to_sort + sorted_values = algos.take_nd(values, indexer, axis=0) + return sorted_values - def _make_selectors(self): + def _make_selectors(self) -> None: new_levels = self.new_index_levels # make the mask @@ -228,20 +233,31 @@ def arange_result(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.bool_]]: return new_values, mask.any(0) # TODO: in all tests we have mask.any(0).all(); can we rely on that? - def get_result(self, values, value_columns, fill_value) -> DataFrame: + def get_result(self, obj, value_columns, fill_value) -> DataFrame: + values = obj._values if values.ndim == 1: values = values[:, np.newaxis] if value_columns is None and values.shape[1] != 1: # pragma: no cover raise ValueError("must pass column labels for multi-column data") - values, _ = self.get_new_values(values, fill_value) + new_values, _ = self.get_new_values(values, fill_value) columns = self.get_new_columns(value_columns) index = self.new_index - return self.constructor( - values, index=index, columns=columns, dtype=values.dtype + result = self.constructor( + new_values, index=index, columns=columns, dtype=new_values.dtype, copy=False ) + if isinstance(values, np.ndarray): + base, new_base = values.base, new_values.base + elif isinstance(values, NDArrayBackedExtensionArray): + base, new_base = values._ndarray.base, new_values._ndarray.base + else: + base, new_base = 1, 2 # type: ignore[assignment] + if base is new_base: + # We can only get here if one of the dimensions is size 1 + result._mgr.add_references(obj._mgr) + return result def get_new_values(self, values, fill_value=None): if values.ndim == 1: @@ -272,21 +288,19 @@ def get_new_values(self, values, fill_value=None): dtype = values.dtype - # if our mask is all True, then we can use our existing dtype - if mask_all: - dtype = values.dtype - new_values = np.empty(result_shape, dtype=dtype) - else: - if isinstance(dtype, ExtensionDtype): - # GH#41875 - # We are assuming that fill_value can be held by this dtype, - # unlike the non-EA case that promotes. - cls = dtype.construct_array_type() - new_values = cls._empty(result_shape, dtype=dtype) + if isinstance(dtype, ExtensionDtype): + # GH#41875 + # We are assuming that fill_value can be held by this dtype, + # unlike the non-EA case that promotes. 
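+ # (the non-EA branch below instead promotes, e.g. maybe_promote + # turns int64 with a NaN fill_value into float64 before allocating)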
+ cls = dtype.construct_array_type() + new_values = cls._empty(result_shape, dtype=dtype) + if not mask_all: new_values[:] = fill_value - else: + else: + if not mask_all: dtype, fill_value = maybe_promote(dtype, fill_value) - new_values = np.empty(result_shape, dtype=dtype) + new_values = np.empty(result_shape, dtype=dtype) + if not mask_all: new_values.fill(fill_value) name = dtype.name @@ -376,9 +390,15 @@ def _repeater(self) -> np.ndarray: return repeater @cache_readonly - def new_index(self) -> MultiIndex: + def new_index(self) -> MultiIndex | Index: # Does not depend on values or value_columns - result_codes = [lab.take(self.compressor) for lab in self.sorted_labels[:-1]] + if self.sort: + labels = self.sorted_labels[:-1] + else: + v = self.level + codes = list(self.index.codes) + labels = codes[:v] + codes[v + 1 :] + result_codes = [lab.take(self.compressor) for lab in labels] # construct the new index if len(self.new_index_levels) == 1: @@ -439,7 +459,7 @@ def _unstack_multiple( ) if isinstance(data, Series): - dummy = data.copy() + dummy = data.copy(deep=False) dummy.index = dummy_index unstacked = dummy.unstack("__placeholder__", fill_value=fill_value, sort=sort) @@ -451,7 +471,11 @@ def _unstack_multiple( result = data while clocs: val = clocs.pop(0) - result = result.unstack(val, fill_value=fill_value, sort=sort) + # error: Incompatible types in assignment (expression has type + # "DataFrame | Series", variable has type "DataFrame") + result = result.unstack( # type: ignore[assignment] + val, fill_value=fill_value, sort=sort + ) clocs = [v if v < val else v - 1 for v in clocs] return result @@ -460,7 +484,9 @@ def _unstack_multiple( dummy_df = data.copy(deep=False) dummy_df.index = dummy_index - unstacked = dummy_df.unstack( + # error: Incompatible types in assignment (expression has type "DataFrame | + # Series", variable has type "DataFrame") + unstacked = dummy_df.unstack( # type: ignore[assignment] "__placeholder__", fill_value=fill_value, sort=sort ) if isinstance(unstacked, Series): @@ -486,7 +512,19 @@ def _unstack_multiple( return unstacked -def unstack(obj: Series | DataFrame, level, fill_value=None, sort: bool = True): +@overload +def unstack(obj: Series, level, fill_value=..., sort: bool = ...) -> DataFrame: ... + + +@overload +def unstack( + obj: Series | DataFrame, level, fill_value=..., sort: bool = ... +) -> Series | DataFrame: ... 
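The overloads encode the shape contract: unstacking a Series always yields a DataFrame, while a DataFrame input can come back as either shape. A quick sketch of the public behavior (plain pandas usage, nothing specific to this patch), including the dtype promotion handled in `get_new_values` above:

    import numpy as np
    import pandas as pd

    idx = pd.MultiIndex.from_product([["one", "two"], ["a", "b"]])
    s = pd.Series(np.arange(4), index=idx)

    df = s.unstack()  # Series input always produces a DataFrame
    #      a  b
    # one  0  1
    # two  2  3

    # Drop one (row, column) combination: the whole block is promoted
    # int64 -> float64 so the NaN fill fits.
    print(s.iloc[:-1].unstack().dtypes)  # a, b -> float64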
+ + +def unstack( + obj: Series | DataFrame, level, fill_value=None, sort: bool = True +) -> Series | DataFrame: if isinstance(level, (tuple, list)): if len(level) != 1: # _unstack_multiple only handles MultiIndexes, @@ -503,7 +541,7 @@ def unstack(obj: Series | DataFrame, level, fill_value=None, sort: bool = True): if isinstance(obj.index, MultiIndex): return _unstack_frame(obj, level, fill_value=fill_value, sort=sort) else: - return obj.T.stack(future_stack=True) + return obj.T.stack() elif not isinstance(obj.index, MultiIndex): # GH 36113 # Give nicer error messages when unstack a Series whose @@ -517,9 +555,7 @@ def unstack(obj: Series | DataFrame, level, fill_value=None, sort: bool = True): unstacker = _Unstacker( obj.index, level=level, constructor=obj._constructor_expanddim, sort=sort ) - return unstacker.get_result( - obj._values, value_columns=None, fill_value=fill_value - ) + return unstacker.get_result(obj, value_columns=None, fill_value=fill_value) def _unstack_frame( @@ -535,7 +571,7 @@ def _unstack_frame( return obj._constructor_from_mgr(mgr, axes=mgr.axes) else: return unstacker.get_result( - obj._values, value_columns=obj.columns, fill_value=fill_value + obj, value_columns=obj.columns, fill_value=fill_value ) @@ -573,10 +609,14 @@ def _unstack_extension_series( # equiv: result.droplevel(level=0, axis=1) # but this avoids an extra copy result.columns = result.columns._drop_level_numbers([0]) - return result + # error: Incompatible return value type (got "DataFrame | Series", expected + # "DataFrame") + return result # type: ignore[return-value] -def stack(frame: DataFrame, level=-1, dropna: bool = True, sort: bool = True): +def stack( + frame: DataFrame, level=-1, dropna: bool = True, sort: bool = True +) -> Series | DataFrame: """ Convert DataFrame to Series with multi-level Index. 
Columns become the second level of the resulting hierarchical index @@ -659,7 +699,9 @@ def stack_multiple(frame: DataFrame, level, dropna: bool = True, sort: bool = Tr if all(lev in frame.columns.names for lev in level): result = frame for lev in level: - result = stack(result, lev, dropna=dropna, sort=sort) + # error: Incompatible types in assignment (expression has type + # "Series | DataFrame", variable has type "DataFrame") + result = stack(result, lev, dropna=dropna, sort=sort) # type: ignore[assignment] # Otherwise, level numbers may change as each successive level is stacked elif all(isinstance(lev, int) for lev in level): @@ -672,7 +714,9 @@ def stack_multiple(frame: DataFrame, level, dropna: bool = True, sort: bool = Tr while level: lev = level.pop(0) - result = stack(result, lev, dropna=dropna, sort=sort) + # error: Incompatible types in assignment (expression has type + # "Series | DataFrame", variable has type "DataFrame") + result = stack(result, lev, dropna=dropna, sort=sort) # type: ignore[assignment] # Decrement all level numbers greater than current, as these # have now shifted down by one level = [v if v <= lev else v - 1 for v in level] @@ -686,15 +730,15 @@ def stack_multiple(frame: DataFrame, level, dropna: bool = True, sort: bool = Tr return result -def _stack_multi_column_index(columns: MultiIndex) -> MultiIndex: +def _stack_multi_column_index(columns: MultiIndex) -> MultiIndex | Index: """Creates a MultiIndex from the first N-1 levels of this MultiIndex.""" if len(columns.levels) <= 2: return columns.levels[0]._rename(name=columns.names[0]) - levs = [ + levs = ( [lev[c] if c >= 0 else None for c in codes] for lev, codes in zip(columns.levels[:-1], columns.codes[:-1]) - ] + ) # Remove duplicate tuples in the MultiIndex. tuples = zip(*levs) @@ -796,7 +840,7 @@ def _convert_level_number(level_num: int, columns: Index): [x._values.astype(dtype, copy=False) for _, x in subset.items()] ) N, K = subset.shape - idx = np.arange(N * K).reshape(K, N).T.ravel() + idx = np.arange(N * K).reshape(K, N).T.reshape(-1) value_slice = value_slice.take(idx) else: value_slice = subset.values @@ -867,7 +911,7 @@ def _reorder_for_extension_array_stack( Examples -------- - >>> arr = np.array(['a', 'b', 'c', 'd', 'e', 'f']) + >>> arr = np.array(["a", "b", "c", "d", "e", "f"]) >>> _reorder_for_extension_array_stack(arr, 2, 3) array(['a', 'c', 'e', 'b', 'd', 'f'], dtype=' Series | DataFrame: if frame.columns.nunique() != len(frame.columns): raise ValueError("Columns with duplicate values are not supported in stack") - - # If we need to drop `level` from columns, it needs to be in descending order - drop_levnums = sorted(level, reverse=True) + if not len(level): + return frame + set_levels = set(level) stack_cols = frame.columns._drop_level_numbers( - [k for k in range(frame.columns.nlevels) if k not in level][::-1] + [k for k in range(frame.columns.nlevels - 1, -1, -1) if k not in set_levels] ) + + result: Series | DataFrame + if not isinstance(frame.columns, MultiIndex): + # GH#58817 Fast path when we're stacking the columns of a non-MultiIndex. + # When columns are homogeneous EAs, we pass through object + # dtype but this is still slightly faster than the normal path. 
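+ # numpy aside: the column-major flatten used below is the same as + # concatenating the columns, e.g. + # np.arange(6).reshape(2, 3).reshape(-1, order="F") -> [0, 3, 1, 4, 2, 5]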
+ if len(frame.columns) > 0 and frame._is_homogeneous_type: + dtype = frame._mgr.blocks[0].dtype + else: + dtype = None + result = frame._constructor_sliced( + frame._values.reshape(-1, order="F"), dtype=dtype + ) + else: + result = stack_reshape(frame, level, set_levels, stack_cols) + + # Construct the correct MultiIndex by combining the frame's index and + # stacked columns. + ratio = 0 if frame.empty else len(result) // len(frame) + + index_levels: list | FrozenList + if isinstance(frame.index, MultiIndex): + index_levels = frame.index.levels + index_codes = list(np.tile(frame.index.codes, (1, ratio))) + else: + codes, uniques = factorize(frame.index, use_na_sentinel=False) + index_levels = [uniques] + index_codes = list(np.tile(codes, (1, ratio))) + if len(level) > 1: # Arrange columns in the order we want to take them, e.g. level=[2, 0, 1] sorter = np.argsort(level) + assert isinstance(stack_cols, MultiIndex) ordered_stack_cols = stack_cols._reorder_ilevels(sorter) else: ordered_stack_cols = stack_cols - - stack_cols_unique = stack_cols.unique() ordered_stack_cols_unique = ordered_stack_cols.unique() + if isinstance(ordered_stack_cols, MultiIndex): + column_levels = ordered_stack_cols.levels + column_codes = ordered_stack_cols.drop_duplicates().codes + else: + column_levels = [ordered_stack_cols_unique] + column_codes = [factorize(ordered_stack_cols_unique, use_na_sentinel=False)[0]] + + # error: Incompatible types in assignment (expression has type "list[ndarray[Any, + # dtype[Any]]]", variable has type "FrozenList") + column_codes = [np.repeat(codes, len(frame)) for codes in column_codes] # type: ignore[assignment] + result.index = MultiIndex( + levels=index_levels + column_levels, + codes=index_codes + column_codes, + names=frame.index.names + list(ordered_stack_cols.names), + verify_integrity=False, + ) + + # sort result, but faster than calling sort_index since we know the order we need + len_df = len(frame) + n_uniques = len(ordered_stack_cols_unique) + indexer = np.arange(n_uniques) + idxs = np.tile(len_df * indexer, len_df) + np.repeat(np.arange(len_df), n_uniques) + result = result.take(idxs) + + # Reshape/rename if needed and dropna + if result.ndim == 2 and frame.columns.nlevels == len(level): + if len(result.columns) == 0: + result = Series(index=result.index) + else: + result = result.iloc[:, 0] + if result.ndim == 1: + result.name = None + + return result + + +def stack_reshape( + frame: DataFrame, level: list[int], set_levels: set[int], stack_cols: Index +) -> Series | DataFrame: + """Reshape the data of a frame for stack. + + This function takes care of most of the work that stack needs to do. Caller + will sort the result once the appropriate index is set. + + Parameters + ---------- + frame : DataFrame + DataFrame that is to be stacked. + level : list of ints + Levels of the columns to stack. + set_levels : set of ints + Same as level, but as a set. + stack_cols : Index + Columns of the result when the DataFrame is stacked. + + Returns + ------- + The data behind the stacked DataFrame. + """ + # non-MultiIndex takes a fast path.
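+ # (plain-Index columns were already handled by the reshape fast path in + # stack_v3, so only MultiIndex columns reach this function)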
+ assert isinstance(frame.columns, MultiIndex) + # If we need to drop `level` from columns, it needs to be in descending order + drop_levnums = sorted(level, reverse=True) # Grab data for each unique index to be stacked buf = [] - for idx in stack_cols_unique: + for idx in stack_cols.unique(): if len(frame.columns) == 1: - data = frame.copy() + data = frame.copy(deep=False) else: # Take the data from frame corresponding to this idx value if len(level) == 1: idx = (idx,) gen = iter(idx) column_indexer = tuple( - next(gen) if k in level else slice(None) + next(gen) if k in set_levels else slice(None) for k in range(frame.columns.nlevels) ) data = frame.loc[:, column_indexer] @@ -923,13 +1058,11 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame: if data.ndim == 1: data.name = 0 else: - data.columns = RangeIndex(len(data.columns)) + data.columns = default_index(len(data.columns)) buf.append(data) - result: Series | DataFrame if len(buf) > 0 and not frame.empty: - result = concat(buf) - ratio = len(result) // len(frame) + result = concat(buf, ignore_index=True) else: # input is empty if len(level) < frame.columns.nlevels: @@ -938,7 +1071,6 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame: else: new_columns = [0] result = DataFrame(columns=new_columns, dtype=frame._values.dtype) - ratio = 0 if len(level) < frame.columns.nlevels: # concat column order may be different from dropping the levels @@ -946,44 +1078,4 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame: if not result.columns.equals(desired_columns): result = result[desired_columns] - # Construct the correct MultiIndex by combining the frame's index and - # stacked columns. - index_levels: list | FrozenList - if isinstance(frame.index, MultiIndex): - index_levels = frame.index.levels - index_codes = list(np.tile(frame.index.codes, (1, ratio))) - else: - codes, uniques = factorize(frame.index, use_na_sentinel=False) - index_levels = [uniques] - index_codes = list(np.tile(codes, (1, ratio))) - if isinstance(stack_cols, MultiIndex): - column_levels = ordered_stack_cols.levels - column_codes = ordered_stack_cols.drop_duplicates().codes - else: - column_levels = [ordered_stack_cols.unique()] - column_codes = [factorize(ordered_stack_cols_unique, use_na_sentinel=False)[0]] - column_codes = [np.repeat(codes, len(frame)) for codes in column_codes] - result.index = MultiIndex( - levels=index_levels + column_levels, - codes=index_codes + column_codes, - names=frame.index.names + list(ordered_stack_cols.names), - verify_integrity=False, - ) - - # sort result, but faster than calling sort_index since we know the order we need - len_df = len(frame) - n_uniques = len(ordered_stack_cols_unique) - indexer = np.arange(n_uniques) - idxs = np.tile(len_df * indexer, len_df) + np.repeat(np.arange(len_df), n_uniques) - result = result.take(idxs) - - # Reshape/rename if needed and dropna - if result.ndim == 2 and frame.columns.nlevels == len(level): - if len(result.columns) == 0: - result = Series(index=result.index) - else: - result = result.iloc[:, 0] - if result.ndim == 1: - result.name = None - return result diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 2b0c6fbb8e3bf..034b861a83f43 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -1,12 +1,12 @@ """ Quantilization functions and related stuff """ + from __future__ import annotations from typing import ( TYPE_CHECKING, Any, - Callable, Literal, ) @@ -43,6 +43,8 @@ from 
pandas.core.arrays.datetimelike import dtype_to_unit if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import ( DtypeObj, IntervalLeftRight, @@ -71,7 +73,7 @@ def cut( Parameters ---------- - x : array-like + x : 1d ndarray or Series The input array to be binned. Must be 1-dimensional. bins : int, sequence of scalars, or IntervalIndex The criteria to bin by. @@ -124,7 +126,7 @@ def cut( Categorical for all other inputs. The values stored within are whatever the type in the sequence is. - * False : returns an ndarray of integers. + * False : returns a 1d ndarray or Series of integers. bins : numpy.ndarray or IntervalIndex. The computed or specified bins. Only returned when `retbins=True`. @@ -140,12 +142,17 @@ def cut( fixed set of values. Series : One-dimensional array with axis labels (including time series). IntervalIndex : Immutable Index implementing an ordered, sliceable set. + numpy.histogram_bin_edges: Function to calculate only the edges of the bins + used by the histogram function. Notes ----- Any NA values will be NA in the result. Out of bounds values will be NA in the resulting Series or Categorical object. + ``numpy.histogram_bin_edges`` can be used along with cut to calculate bins according + to some predefined methods. + Reference :ref:`the user guide ` for more examples. Examples @@ -166,16 +173,14 @@ def cut( Discovers the same bins, but assign them specific labels. Notice that the returned Categorical's categories are `labels` and is ordered. - >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), - ... 3, labels=["bad", "medium", "good"]) + >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, labels=["bad", "medium", "good"]) ['bad', 'good', 'medium', 'medium', 'good', 'bad'] Categories (3, object): ['bad' < 'medium' < 'good'] ``ordered=False`` will result in unordered categories when labels are passed. This parameter can be used to allow non-unique labels: - >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, - ... labels=["B", "A", "B"], ordered=False) + >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, labels=["B", "A", "B"], ordered=False) ['B', 'B', 'A', 'A', 'B', 'B'] Categories (2, object): ['A', 'B'] @@ -186,8 +191,7 @@ def cut( Passing a Series as an input returns a Series with categorical dtype: - >>> s = pd.Series(np.array([2, 4, 6, 8, 10]), - ... index=['a', 'b', 'c', 'd', 'e']) + >>> s = pd.Series(np.array([2, 4, 6, 8, 10]), index=["a", "b", "c", "d", "e"]) >>> pd.cut(s, 3) ... # doctest: +ELLIPSIS a (1.992, 4.667] @@ -201,8 +205,7 @@ def cut( Passing a Series as an input returns a Series with mapping value. It is used to map numerically to intervals based on bins. - >>> s = pd.Series(np.array([2, 4, 6, 8, 10]), - ... index=['a', 'b', 'c', 'd', 'e']) + >>> s = pd.Series(np.array([2, 4, 6, 8, 10]), index=["a", "b", "c", "d", "e"]) >>> pd.cut(s, [0, 2, 4, 6, 8, 10], labels=False, retbins=True, right=False) ... # doctest: +ELLIPSIS (a 1.0 @@ -215,8 +218,14 @@ def cut( Use `drop` optional when bins is not unique - >>> pd.cut(s, [0, 2, 4, 6, 10, 10], labels=False, retbins=True, - ... right=False, duplicates='drop') + >>> pd.cut( + ... s, + ... [0, 2, 4, 6, 10, 10], + ... labels=False, + ... retbins=True, + ... right=False, + ... duplicates="drop", + ... ) ... # doctest: +ELLIPSIS (a 1.0 b 2.0 @@ -235,6 +244,16 @@ def cut( >>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins) [NaN, (0.0, 1.0], NaN, (2.0, 3.0], (4.0, 5.0]] Categories (3, interval[int64, right]): [(0, 1] < (2, 3] < (4, 5]] + + Using np.histogram_bin_edges with cut + + >>> pd.cut( + ... np.array([1, 7, 5, 4]), + ... 
bins=np.histogram_bin_edges(np.array([1, 7, 5, 4]), bins="auto"), + ... ) + ... # doctest: +ELLIPSIS + [NaN, (5.0, 7.0], (3.0, 5.0], (3.0, 5.0]] + Categories (3, interval[float64, right]): [(1.0, 3.0] < (3.0, 5.0] < (5.0, 7.0]] """ # NOTE: this binning code is changed a bit from histogram for var(x) == 0 @@ -286,6 +305,7 @@ def qcut( Parameters ---------- x : 1d ndarray or Series + Input Numpy array or pandas Series object to be discretized. q : int or list-like of float Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles. @@ -310,6 +330,11 @@ def qcut( bins : ndarray of floats Returned only if `retbins` is True. + See Also + -------- + cut : Bin values into discrete intervals. + Series.quantile : Return value at the given quantile. + Notes ----- Out of bounds values will be NA in the resulting Categorical object @@ -333,7 +358,16 @@ def qcut( x_idx = _preprocess_for_cut(x) x_idx, _ = _coerce_to_type(x_idx) - quantiles = np.linspace(0, 1, q + 1) if is_integer(q) else q + if is_integer(q): + quantiles = np.linspace(0, 1, q + 1) + # Round up rather than to nearest if not representable in base 2 + np.putmask( + quantiles, + q * quantiles != np.arange(q + 1), + np.nextafter(quantiles, 1), + ) + else: + quantiles = q bins = x_idx.to_series().dropna().quantile(quantiles) @@ -441,7 +475,7 @@ def _bins_to_cuts( if len(unique_bins) < len(bins) and len(bins) != 2: if duplicates == "raise": raise ValueError( - f"Bin edges must be unique: {repr(bins)}.\n" + f"Bin edges must be unique: {bins!r}.\n" f"You can drop duplicate edges by setting the 'duplicates' kwarg" ) bins = unique_bins @@ -548,7 +582,7 @@ def _format_labels( precision: int, right: bool = True, include_lowest: bool = False, -): +) -> IntervalIndex: """based on the dtype, return our labels""" closed: IntervalLeftRight = "right" if right else "left" diff --git a/pandas/core/reshape/util.py b/pandas/core/reshape/util.py deleted file mode 100644 index 476e3922b6989..0000000000000 --- a/pandas/core/reshape/util.py +++ /dev/null @@ -1,85 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -import numpy as np - -from pandas.core.dtypes.common import is_list_like - -if TYPE_CHECKING: - from pandas._typing import NumpyIndexT - - -def cartesian_product(X) -> list[np.ndarray]: - """ - Numpy version of itertools.product. - Sometimes faster (for large inputs)... - - Parameters - ---------- - X : list-like of list-likes - - Returns - ------- - product : list of ndarrays - - Examples - -------- - >>> cartesian_product([list('ABC'), [1, 2]]) - [array(['A', 'A', 'B', 'B', 'C', 'C'], dtype=' NumpyIndexT: - """ - Index compat for np.tile. - - Notes - ----- - Does not support multi-dimensional `num`. - """ - if isinstance(arr, np.ndarray): - return np.tile(arr, num) - - # Otherwise we have an Index - taker = np.tile(np.arange(len(arr)), num) - return arr.take(taker) diff --git a/pandas/core/roperator.py b/pandas/core/roperator.py index 2f320f4e9c6b9..9ea4bea41cdea 100644 --- a/pandas/core/roperator.py +++ b/pandas/core/roperator.py @@ -2,6 +2,7 @@ Reversed Operations not available in the stdlib operator module. Defining these instead of using lambdas allows us to reference them by name. 
""" + from __future__ import annotations import operator diff --git a/pandas/core/sample.py b/pandas/core/sample.py index eebbed3512c4e..4f12563e3c5e2 100644 --- a/pandas/core/sample.py +++ b/pandas/core/sample.py @@ -1,6 +1,7 @@ """ Module containing utilities for NDFrame.sample() and .GroupBy.sample() """ + from __future__ import annotations from typing import TYPE_CHECKING @@ -122,7 +123,7 @@ def sample( random_state: np.random.RandomState | np.random.Generator, ) -> np.ndarray: """ - Randomly sample `size` indices in `np.arange(obj_len)` + Randomly sample `size` indices in `np.arange(obj_len)`. Parameters ---------- diff --git a/pandas/core/series.py b/pandas/core/series.py index e3b401cd3c88b..5ed094349caaa 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1,14 +1,17 @@ """ Data structure for 1-dimensional cross-sectional and time series data """ + from __future__ import annotations from collections.abc import ( + Callable, Hashable, Iterable, Mapping, Sequence, ) +import functools import operator import sys from textwrap import dedent @@ -16,22 +19,14 @@ IO, TYPE_CHECKING, Any, - Callable, Literal, cast, overload, ) import warnings -import weakref import numpy as np -from pandas._config import ( - using_copy_on_write, - warn_copy_on_write, -) -from pandas._config.config import _get_option - from pandas._libs import ( lib, properties, @@ -45,19 +40,21 @@ from pandas.errors import ( ChainedAssignmentError, InvalidIndexError, +) +from pandas.errors.cow import ( _chained_assignment_method_msg, _chained_assignment_msg, - _chained_assignment_warning_method_msg, - _chained_assignment_warning_msg, - _check_cacher, ) from pandas.util._decorators import ( Appender, Substitution, deprecate_nonkeyword_arguments, doc, + set_module, +) +from pandas.util._exceptions import ( + find_stack_level, ) -from pandas.util._exceptions import find_stack_level from pandas.util._validators import ( validate_ascending, validate_bool_kwarg, @@ -67,11 +64,15 @@ from pandas.core.dtypes.astype import astype_is_view from pandas.core.dtypes.cast import ( LossySetitemError, + construct_1d_arraylike_from_scalar, + find_common_type, + infer_dtype_from, maybe_box_native, maybe_cast_pointwise_result, ) from pandas.core.dtypes.common import ( is_dict_like, + is_float, is_integer, is_iterator, is_list_like, @@ -83,8 +84,12 @@ from pandas.core.dtypes.dtypes import ( CategoricalDtype, ExtensionDtype, + SparseDtype, +) +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCSeries, ) -from pandas.core.dtypes.generic import ABCDataFrame from pandas.core.dtypes.inference import is_hashable from pandas.core.dtypes.missing import ( isna, @@ -97,12 +102,11 @@ algorithms, base, common as com, - missing, nanops, ops, roperator, ) -from pandas.core.accessor import CachedAccessor +from pandas.core.accessor import Accessor from pandas.core.apply import SeriesApply from pandas.core.arrays import ExtensionArray from pandas.core.arrays.arrow import ( @@ -113,6 +117,7 @@ from pandas.core.arrays.sparse import SparseAccessor from pandas.core.arrays.string_ import StringDtype from pandas.core.construction import ( + array as pd_array, extract_array, sanitize_array, ) @@ -132,6 +137,7 @@ PeriodIndex, default_index, ensure_index, + maybe_sequence_to_range, ) import pandas.core.indexes.base as ibase from pandas.core.indexes.multi import maybe_droplevels @@ -139,10 +145,7 @@ check_bool_indexer, check_dict_or_set_indexers, ) -from pandas.core.internals import ( - SingleArrayManager, - SingleBlockManager, -) +from 
pandas.core.internals import SingleBlockManager from pandas.core.methods import selectn from pandas.core.shared_docs import _shared_docs from pandas.core.sorting import ( @@ -179,6 +182,7 @@ IndexKeyFunc, IndexLabel, Level, + ListLike, MutableMappingT, NaPosition, NumpySorter, @@ -188,7 +192,6 @@ Renamer, Scalar, Self, - SingleManager, SortKind, StorageOptions, Suffixes, @@ -221,28 +224,6 @@ Unused.""", } - -def _coerce_method(converter): - """ - Install the scalar coercion methods. - """ - - def wrapper(self): - if len(self) == 1: - warnings.warn( - f"Calling {converter.__name__} on a single element Series is " - "deprecated and will raise a TypeError in the future. " - f"Use {converter.__name__}(ser.iloc[0]) instead", - FutureWarning, - stacklevel=find_stack_level(), - ) - return converter(self.iloc[0]) - raise TypeError(f"cannot convert the series to {converter}") - - wrapper.__name__ = f"__{converter.__name__}__" - return wrapper - - # ---------------------------------------------------------------------- # Series class @@ -252,6 +233,7 @@ def wrapper(self): # error: Cannot override final attribute "size" (previously declared in base # class "NDFrame") # definition in base class "NDFrame" +@set_module("pandas") class Series(base.IndexOpsMixin, NDFrame): # type: ignore[misc] """ One-dimensional ndarray with axis labels (including time series). @@ -270,7 +252,7 @@ class Series(base.IndexOpsMixin, NDFrame): # type: ignore[misc] ---------- data : array-like, Iterable, dict, or scalar value Contains data stored in Series. If data is a dict, argument order is - maintained. + maintained. Unordered sets are not supported. index : array-like or Index (1d) Values must be hashable and have the same length as `data`. Non-unique index values are allowed. Will default to @@ -281,11 +263,17 @@ Data type for the output Series. If not specified, this will be inferred from `data`. See the :ref:`user guide <basics.dtypes>` for more usages. + If ``data`` is a Series, this argument is ignored. name : Hashable, default None The name to give to the Series. copy : bool, default False Copy input data. Only affects Series or 1d ndarray input. See examples. + See Also + -------- + DataFrame : Two-dimensional, size-mutable, potentially heterogeneous tabular data. + Index : Immutable sequence used for indexing and alignment. + Notes ----- Please reference the :ref:`User Guide <basics.series>` for more information. @@ -294,8 +282,8 @@ class Series(base.IndexOpsMixin, NDFrame): # type: ignore[misc] -------- Constructing Series from a dictionary with an Index specified - >>> d = {'a': 1, 'b': 2, 'c': 3} - >>> ser = pd.Series(data=d, index=['a', 'b', 'c']) + >>> d = {"a": 1, "b": 2, "c": 3} + >>> ser = pd.Series(data=d, index=["a", "b", "c"]) >>> ser a 1 b 2 @@ -305,15 +293,15 @@ class Series(base.IndexOpsMixin, NDFrame): # type: ignore[misc] The keys of the dictionary match with the Index values, hence the Index values have no effect. - >>> d = {'a': 1, 'b': 2, 'c': 3} - >>> ser = pd.Series(data=d, index=['x', 'y', 'z']) + >>> d = {"a": 1, "b": 2, "c": 3} + >>> ser = pd.Series(data=d, index=["x", "y", "z"]) >>> ser x NaN y NaN z NaN dtype: float64 - Note that the Index is first build with the keys from the dictionary. + Note that the Index is first built with the keys from the dictionary. After this the Series is reindexed with the given Index values, hence we get all NaN as a result.
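The new "Unordered sets are not supported" sentence documents behavior the constructor already enforces; a quick sketch of both sides of that contract (the exception text is what pandas' construction code raises):

    import pandas as pd

    pd.Series({"a": 1, "b": 2})  # dicts keep insertion order, so they are fine
    try:
        pd.Series({1, 2, 3})     # a set has no defined order
    except TypeError as err:
        print(err)               # Set type is unordered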
@@ -373,7 +361,7 @@ class Series(base.IndexOpsMixin, NDFrame): # type: ignore[misc] base.IndexOpsMixin.hasnans.fget, # type: ignore[attr-defined] doc=base.IndexOpsMixin.hasnans.__doc__, ) - _mgr: SingleManager + _mgr: SingleBlockManager # ---------------------------------------------------------------------- # Constructors @@ -385,21 +373,10 @@ def __init__( dtype: Dtype | None = None, name=None, copy: bool | None = None, - fastpath: bool | lib.NoDefault = lib.no_default, ) -> None: - if fastpath is not lib.no_default: - warnings.warn( - "The 'fastpath' keyword in pd.Series is deprecated and will " - "be removed in a future version.", - DeprecationWarning, - stacklevel=find_stack_level(), - ) - else: - fastpath = False - allow_mgr = False if ( - isinstance(data, (SingleBlockManager, SingleArrayManager)) + isinstance(data, SingleBlockManager) and index is None and dtype is None and (copy is False or copy is None) @@ -413,58 +390,20 @@ def __init__( DeprecationWarning, stacklevel=2, ) - if using_copy_on_write(): - data = data.copy(deep=False) + data = data.copy(deep=False) # GH#33357 called with just the SingleBlockManager NDFrame.__init__(self, data) - if fastpath: - # e.g. from _box_col_values, skip validation of name - object.__setattr__(self, "_name", name) - else: - self.name = name + self.name = name return - is_pandas_object = isinstance(data, (Series, Index, ExtensionArray)) - data_dtype = getattr(data, "dtype", None) - original_dtype = dtype - if isinstance(data, (ExtensionArray, np.ndarray)): - if copy is not False and using_copy_on_write(): + if copy is not False: if dtype is None or astype_is_view(data.dtype, pandas_dtype(dtype)): data = data.copy() if copy is None: copy = False - # we are called internally, so short-circuit - if fastpath: - # data is a ndarray, index is defined - if not isinstance(data, (SingleBlockManager, SingleArrayManager)): - manager = _get_option("mode.data_manager", silent=True) - if manager == "block": - data = SingleBlockManager.from_array(data, index) - elif manager == "array": - data = SingleArrayManager.from_array(data, index) - allow_mgr = True - elif using_copy_on_write() and not copy: - data = data.copy(deep=False) - - if not allow_mgr: - warnings.warn( - f"Passing a {type(data).__name__} to {type(self).__name__} " - "is deprecated and will raise in a future version. 
" - "Use public APIs instead.", - DeprecationWarning, - stacklevel=2, - ) - - if copy: - data = data.copy() - # skips validation of the name - object.__setattr__(self, "_name", name) - NDFrame.__init__(self, data) - return - - if isinstance(data, SingleBlockManager) and using_copy_on_write() and not copy: + if isinstance(data, SingleBlockManager) and not copy: data = data.copy(deep=False) if not allow_mgr: @@ -499,14 +438,9 @@ def __init__( refs = None if isinstance(data, Index): if dtype is not None: - data = data.astype(dtype, copy=False) + data = data.astype(dtype) - if using_copy_on_write(): - refs = data._references - data = data._values - else: - # GH#24096 we need to ensure the index remains immutable - data = data._values.copy() + refs = data._references copy = False elif isinstance(data, np.ndarray): @@ -522,14 +456,14 @@ def __init__( index = data.index data = data._mgr.copy(deep=False) else: - data = data.reindex(index, copy=copy) + data = data.reindex(index) copy = False data = data._mgr - elif is_dict_like(data): + elif isinstance(data, Mapping): data, index = self._init_dict(data, index, dtype) dtype = None copy = False - elif isinstance(data, (SingleBlockManager, SingleArrayManager)): + elif isinstance(data, SingleBlockManager): if index is None: index = data.index elif not data.index.equals(index) or copy: @@ -567,37 +501,21 @@ def __init__( com.require_length_match(data, index) # create/copy the manager - if isinstance(data, (SingleBlockManager, SingleArrayManager)): + if isinstance(data, SingleBlockManager): if dtype is not None: - data = data.astype(dtype=dtype, errors="ignore", copy=copy) + data = data.astype(dtype=dtype) elif copy: data = data.copy() else: data = sanitize_array(data, index, dtype, copy) - - manager = _get_option("mode.data_manager", silent=True) - if manager == "block": - data = SingleBlockManager.from_array(data, index, refs=refs) - elif manager == "array": - data = SingleArrayManager.from_array(data, index) + data = SingleBlockManager.from_array(data, index, refs=refs) NDFrame.__init__(self, data) self.name = name self._set_axis(0, index) - if original_dtype is None and is_pandas_object and data_dtype == np.object_: - if self.dtype != data_dtype: - warnings.warn( - "Dtype inference on a pandas object " - "(Series, Index, ExtensionArray) is deprecated. The Series " - "constructor will keep the original dtype in the future. " - "Call `infer_objects` on the result to get the old behavior.", - FutureWarning, - stacklevel=find_stack_level(), - ) - def _init_dict( - self, data, index: Index | None = None, dtype: DtypeObj | None = None + self, data: Mapping, index: Index | None = None, dtype: DtypeObj | None = None ): """ Derive the "_mgr" and "index" attributes of a new Series from a @@ -617,8 +535,6 @@ def _init_dict( _data : BlockManager for the new Series index : index for the new Series """ - keys: Index | tuple - # Looking for NaN in dict doesn't work ({np.nan : 1}[float('nan')] # raises KeyError), so we iterate the entire dict, and align if data: @@ -626,7 +542,7 @@ def _init_dict( # using generators in effects the performance. # Below is the new way of extracting the keys and values - keys = tuple(data.keys()) + keys = maybe_sequence_to_range(tuple(data.keys())) values = list(data.values()) # Generating list of values- faster way elif index is not None: # fastpath for Series(data=None). 
Just use broadcasting a scalar @@ -644,24 +560,60 @@ # Now we just make sure the order is respected, if any if data and index is not None: - s = s.reindex(index, copy=False) + s = s.reindex(index) return s._mgr, s.index # ---------------------------------------------------------------------- + def __arrow_c_stream__(self, requested_schema=None): + """ + Export the pandas Series as an Arrow C stream PyCapsule. + + This relies on pyarrow to convert the pandas Series to the Arrow + format (and follows the default behavior of ``pyarrow.Array.from_pandas`` + in its handling of the index, i.e. to ignore it). + This conversion is not necessarily zero-copy. + + Parameters + ---------- + requested_schema : PyCapsule, default None + The schema to which the Series should be cast, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. + + Returns + ------- + PyCapsule + """ + pa = import_optional_dependency("pyarrow", min_version="16.0.0") + type = ( + pa.DataType._import_from_c_capsule(requested_schema) + if requested_schema is not None + else None + ) + ca = pa.array(self, type=type) + if not isinstance(ca, pa.ChunkedArray): + ca = pa.chunked_array([ca]) + return ca.__arrow_c_stream__() + + # ---------------------------------------------------------------------- + @property - def _constructor(self) -> Callable[..., Series]: + def _constructor(self) -> type[Series]: return Series def _constructor_from_mgr(self, mgr, axes): - if self._constructor is Series: - # we are pandas.Series (or a subclass that doesn't override _constructor) - ser = Series._from_mgr(mgr, axes=axes) - ser._name = None # caller is responsible for setting real name + ser = Series._from_mgr(mgr, axes=axes) + ser._name = None # caller is responsible for setting real name + + if type(self) is Series: + # This would also work `if self._constructor is Series`, but + # this check is slightly faster, benefiting the most-common case. return ser - else: - assert axes is mgr.axes - return self._constructor(mgr) + + # We assume that the subclass __init__ knows how to handle a + # pd.Series object. + return self._constructor(ser) @property def _constructor_expanddim(self) -> Callable[..., DataFrame]: @@ -673,18 +625,19 @@ def _constructor_expanddim(self) -> Callable[..., DataFrame]: return DataFrame - def _expanddim_from_mgr(self, mgr, axes) -> DataFrame: + def _constructor_expanddim_from_mgr(self, mgr, axes): from pandas.core.frame import DataFrame - return DataFrame._from_mgr(mgr, axes=mgr.axes) + df = DataFrame._from_mgr(mgr, axes=mgr.axes) - def _constructor_expanddim_from_mgr(self, mgr, axes): - from pandas.core.frame import DataFrame + if type(self) is Series: + # This would also work `if self._constructor_expanddim is DataFrame`, + # but this check is slightly faster, benefiting the most-common case. + return df - if self._constructor_expanddim is DataFrame: - return self._expanddim_from_mgr(mgr, axes) - assert axes is mgr.axes - return self._constructor_expanddim(mgr) + + # We assume that the subclass __init__ knows how to handle a + # pd.DataFrame object. + return self._constructor_expanddim(df) # types @property @@ -697,6 +650,13 @@ def dtype(self) -> DtypeObj: """ Return the dtype object of the underlying data. + See Also + -------- + Series.dtypes : Return the dtype object of the underlying data. + Series.astype : Cast a pandas object to a specified dtype ``dtype``. + Series.convert_dtypes : Convert columns to the best possible dtypes using dtypes + supporting pd.NA.
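A usage sketch for the new ``__arrow_c_stream__`` hook above, assuming a pyarrow build that understands the Arrow PyCapsule protocol (>= 16.0.0 per the ``min_version`` used in the method); passing the Series directly to ``pa.chunked_array`` relies on pyarrow's capsule support and is an assumption about the consumer, not pandas code:

    import pandas as pd
    import pyarrow as pa

    ser = pd.Series([1, 2, None], dtype="Int64")
    capsule = ser.__arrow_c_stream__()  # PyCapsule wrapping an ArrowArrayStream
    # Capsule-aware consumers can import the stream directly; pyarrow's own
    # constructors accept stream-exporting objects (assumed capsule support):
    chunked = pa.chunked_array(ser)
    print(chunked.type)  # int64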
+ Examples -------- >>> s = pd.Series([1, 2, 3]) @@ -710,6 +670,10 @@ def dtypes(self) -> DtypeObj: """ Return the dtype object of the underlying data. + See Also + -------- + DataFrame.dtypes : Return the dtypes in the DataFrame. + Examples -------- >>> s = pd.Series([1, 2, 3]) @@ -742,7 +706,7 @@ def name(self) -> Hashable: -------- The Series name can be set initially when calling the constructor. - >>> s = pd.Series([1, 2, 3], dtype=np.int64, name='Numbers') + >>> s = pd.Series([1, 2, 3], dtype=np.int64, name="Numbers") >>> s 0 1 1 2 @@ -757,8 +721,9 @@ def name(self) -> Hashable: The name of a Series within a DataFrame is its column name. - >>> df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], - ... columns=["Odd Numbers", "Even Numbers"]) + >>> df = pd.DataFrame( + ... [[1, 2], [3, 4], [5, 6]], columns=["Odd Numbers", "Even Numbers"] + ... ) >>> df Odd Numbers Even Numbers 0 1 2 @@ -799,17 +764,16 @@ def values(self): >>> pd.Series([1, 2, 3]).values array([1, 2, 3]) - >>> pd.Series(list('aabc')).values + >>> pd.Series(list("aabc")).values array(['a', 'a', 'b', 'c'], dtype=object) - >>> pd.Series(list('aabc')).astype('category').values + >>> pd.Series(list("aabc")).astype("category").values ['a', 'a', 'b', 'c'] Categories (3, object): ['a', 'b', 'c'] Timezone aware datetime data is converted to UTC: - >>> pd.Series(pd.date_range('20130101', periods=3, - ... tz='US/Eastern')).values + >>> pd.Series(pd.date_range("20130101", periods=3, tz="US/Eastern")).values array(['2013-01-01T05:00:00.000000000', '2013-01-02T05:00:00.000000000', '2013-01-03T05:00:00.000000000'], dtype='datetime64[ns]') @@ -851,119 +815,25 @@ def _values(self): return self._mgr.internal_values() @property - def _references(self) -> BlockValuesRefs | None: - if isinstance(self._mgr, SingleArrayManager): - return None + def _references(self) -> BlockValuesRefs: return self._mgr._block.refs - # error: Decorated property not supported - @Appender(base.IndexOpsMixin.array.__doc__) # type: ignore[misc] + @Appender(base.IndexOpsMixin.array.__doc__) # type: ignore[prop-decorator] @property def array(self) -> ExtensionArray: return self._mgr.array_values() - # ops - def ravel(self, order: str = "C") -> ArrayLike: - """ - Return the flattened underlying data as an ndarray or ExtensionArray. - - .. deprecated:: 2.2.0 - Series.ravel is deprecated. The underlying array is already 1D, so - ravel is not necessary. Use :meth:`to_numpy` for conversion to a numpy - array instead. - - Returns - ------- - numpy.ndarray or ExtensionArray - Flattened data of the Series. - - See Also - -------- - numpy.ndarray.ravel : Return a flattened array. - - Examples - -------- - >>> s = pd.Series([1, 2, 3]) - >>> s.ravel() # doctest: +SKIP - array([1, 2, 3]) - """ - warnings.warn( - "Series.ravel is deprecated. The underlying array is already 1D, so " - "ravel is not necessary. Use `to_numpy()` for conversion to a numpy " - "array instead.", - FutureWarning, - stacklevel=2, - ) - arr = self._values.ravel(order=order) - if isinstance(arr, np.ndarray) and using_copy_on_write(): - arr.flags.writeable = False - return arr - def __len__(self) -> int: """ Return the length of the Series. """ return len(self._mgr) - def view(self, dtype: Dtype | None = None) -> Series: - """ - Create a new view of the Series. - - .. deprecated:: 2.2.0 - ``Series.view`` is deprecated and will be removed in a future version. - Use :meth:`Series.astype` as an alternative to change the dtype. 
- - This function will return a new Series with a view of the same - underlying values in memory, optionally reinterpreted with a new data - type. The new data type must preserve the same size in bytes as to not - cause index misalignment. - - Parameters - ---------- - dtype : data type - Data type object or one of their string representations. - - Returns - ------- - Series - A new Series object as a view of the same data in memory. - - See Also - -------- - numpy.ndarray.view : Equivalent numpy function to create a new view of - the same data in memory. - - Notes - ----- - Series are instantiated with ``dtype=float64`` by default. While - ``numpy.ndarray.view()`` will return a view with the same data type as - the original array, ``Series.view()`` (without specified dtype) - will try using ``float64`` and may fail if the original data type size - in bytes is not the same. - - Examples - -------- - Use ``astype`` to change the dtype instead. - """ - warnings.warn( - "Series.view is deprecated and will be removed in a future version. " - "Use ``astype`` as an alternative to change the dtype.", - FutureWarning, - stacklevel=2, - ) - # self.array instead of self._values so we piggyback on NumpyExtensionArray - # implementation - res_values = self.array.view(dtype) - res_ser = self._constructor(res_values, index=self.index, copy=False) - if isinstance(res_ser._mgr, SingleBlockManager): - blk = res_ser._mgr._block - blk.refs = cast("BlockValuesRefs", self._references) - blk.refs.add_reference(blk) - return res_ser.__finalize__(self, method="view") - # ---------------------------------------------------------------------- # NDArray Compat - def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray: + def __array__( + self, dtype: npt.DTypeLike | None = None, copy: bool | None = None + ) -> np.ndarray: """ Return the values as a NumPy array. @@ -976,6 +846,9 @@ def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray: The dtype to use for the resulting NumPy array. By default, the dtype is inferred from the data. + copy : bool or None, optional + See :func:`numpy.asarray`. + Returns ------- numpy.ndarray @@ -997,7 +870,7 @@ def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray: For timezone-aware data, the timezones may be retained with ``dtype='object'`` - >>> tzser = pd.Series(pd.date_range('2000', periods=2, tz="CET")) + >>> tzser = pd.Series(pd.date_range("2000", periods=2, tz="CET")) >>> np.asarray(tzser, dtype="object") array([Timestamp('2000-01-01 00:00:00+0100', tz='CET'), Timestamp('2000-01-02 00:00:00+0100', tz='CET')], @@ -1011,37 +884,21 @@ def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray: dtype='datetime64[ns]') """ values = self._values - arr = np.asarray(values, dtype=dtype) - if using_copy_on_write() and astype_is_view(values.dtype, arr.dtype): + if copy is None: + # Note: branch avoids `copy=None` for NumPy 1.x support + arr = np.asarray(values, dtype=dtype) + else: + arr = np.array(values, dtype=dtype, copy=copy) + + if copy is True: + return arr + if copy is False or astype_is_view(values.dtype, arr.dtype): arr = arr.view() arr.flags.writeable = False return arr # ---------------------------------------------------------------------- - def __column_consortium_standard__(self, *, api_version: str | None = None) -> Any: - """ - Provide entry point to the Consortium DataFrame Standard API. - - This is developed and maintained outside of pandas. 
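The reworked ``__array__`` distinguishes NumPy's three copy modes, and the ``arr.flags.writeable = False`` branch above locks the no-copy paths so a shared buffer cannot be mutated behind the Series. A small sketch of the intended semantics (outputs assumed from that branch logic, under Copy-on-Write):

    import numpy as np
    import pandas as pd

    ser = pd.Series([1, 2, 3])
    view = np.asarray(ser)            # copy=None: may share memory with ser,
    print(view.flags.writeable)       # so it comes back locked (False)
    owned = np.array(ser, copy=True)  # copy=True: always a fresh, writable copy
    owned[0] = 99                     # safe: ser is untouched
    print(ser.iloc[0])                # 1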
- Please report any issues to https://github.com/data-apis/dataframe-api-compat. - """ - dataframe_api_compat = import_optional_dependency("dataframe_api_compat") - return ( - dataframe_api_compat.pandas_standard.convert_to_standard_compliant_column( - self, api_version=api_version - ) - ) - - # ---------------------------------------------------------------------- - # Unary Methods - - # coercion - __float__ = _coerce_method(float) - __int__ = _coerce_method(int) - - # ---------------------------------------------------------------------- - # indexers @property def axes(self) -> list[Index]: @@ -1080,27 +937,15 @@ def __getitem__(self, key): key = com.apply_if_callable(key, self) if key is Ellipsis: - if using_copy_on_write() or warn_copy_on_write(): - return self.copy(deep=False) - return self + return self.copy(deep=False) key_is_scalar = is_scalar(key) if isinstance(key, (list, tuple)): key = unpack_1tuple(key) - if is_integer(key) and self.index._should_fallback_to_positional: - warnings.warn( - # GH#50617 - "Series.__getitem__ treating keys as positions is deprecated. " - "In a future version, integer keys will always be treated " - "as labels (consistent with DataFrame behavior). To access " - "a value by position, use `ser.iloc[pos]`", - FutureWarning, - stacklevel=find_stack_level(), - ) - return self._values[key] - elif key_is_scalar: + # Note: GH#50617 in 3.0 we changed int key to always be treated as + # a label, matching DataFrame behavior. return self._get_value(key) # Convert generator to list before going through hashable part @@ -1145,35 +990,6 @@ def _get_with(self, key): elif isinstance(key, tuple): return self._get_values_tuple(key) - elif not is_list_like(key): - # e.g. scalars that aren't recognized by lib.is_scalar, GH#32684 - return self.loc[key] - - if not isinstance(key, (list, np.ndarray, ExtensionArray, Series, Index)): - key = list(key) - - key_type = lib.infer_dtype(key, skipna=False) - - # Note: The key_type == "boolean" case should be caught by the - # com.is_bool_indexer check in __getitem__ - if key_type == "integer": - # We need to decide whether to treat this as a positional indexer - # (i.e. self.iloc) or label-based (i.e. self.loc) - if not self.index._should_fallback_to_positional: - return self.loc[key] - else: - warnings.warn( - # GH#50617 - "Series.__getitem__ treating keys as positions is deprecated. " - "In a future version, integer keys will always be treated " - "as labels (consistent with DataFrame behavior). 
To access " - "a value by position, use `ser.iloc[pos]`", - FutureWarning, - stacklevel=find_stack_level(), - ) - return self.iloc[key] - - # handle the dup indexing case GH#4246 return self.loc[key] def _get_values_tuple(self, key: tuple): @@ -1193,7 +1009,7 @@ def _get_values_tuple(self, key: tuple): indexer, new_index = self.index.get_loc_level(key) new_ser = self._constructor(self._values[indexer], index=new_index, copy=False) if isinstance(indexer, slice): - new_ser._mgr.add_references(self._mgr) # type: ignore[arg-type] + new_ser._mgr.add_references(self._mgr) return new_ser.__finalize__(self) def _get_rows_with_mask(self, indexer: npt.NDArray[np.bool_]) -> Series: @@ -1235,74 +1051,36 @@ def _get_value(self, label, takeable: bool = False): new_values, index=new_index, name=self.name, copy=False ) if isinstance(loc, slice): - new_ser._mgr.add_references(self._mgr) # type: ignore[arg-type] + new_ser._mgr.add_references(self._mgr) return new_ser.__finalize__(self) else: return self.iloc[loc] def __setitem__(self, key, value) -> None: - warn = True - if not PYPY and using_copy_on_write(): + if not PYPY: if sys.getrefcount(self) <= 3: warnings.warn( _chained_assignment_msg, ChainedAssignmentError, stacklevel=2 ) - elif not PYPY and not using_copy_on_write(): - ctr = sys.getrefcount(self) - ref_count = 3 - if not warn_copy_on_write() and _check_cacher(self): - # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 - ref_count += 1 - if ctr <= ref_count and ( - warn_copy_on_write() - or ( - not warn_copy_on_write() - and self._mgr.blocks[0].refs.has_reference() # type: ignore[union-attr] - ) - ): - warn = False - warnings.warn( - _chained_assignment_warning_msg, FutureWarning, stacklevel=2 - ) check_dict_or_set_indexers(key) key = com.apply_if_callable(key, self) - cacher_needs_updating = self._check_is_chained_assignment_possible() if key is Ellipsis: key = slice(None) if isinstance(key, slice): indexer = self.index._convert_slice_indexer(key, kind="getitem") - return self._set_values(indexer, value, warn=warn) + return self._set_values(indexer, value) try: - self._set_with_engine(key, value, warn=warn) + self._set_with_engine(key, value) except KeyError: # We have a scalar (or for MultiIndex or object-dtype, scalar-like) # key that is not present in self.index. - if is_integer(key): - if not self.index._should_fallback_to_positional: - # GH#33469 - self.loc[key] = value - else: - # positional setter - # can't use _mgr.setitem_inplace yet bc could have *both* - # KeyError and then ValueError, xref GH#45070 - warnings.warn( - # GH#50617 - "Series.__setitem__ treating keys as positions is deprecated. " - "In a future version, integer keys will always be treated " - "as labels (consistent with DataFrame behavior). 
To set " - "a value by position, use `ser.iloc[pos] = value`", - FutureWarning, - stacklevel=find_stack_level(), - ) - self._set_values(key, value) - else: - # GH#12862 adding a new key to the Series - self.loc[key] = value + # GH#12862 adding a new key to the Series + self.loc[key] = value except (TypeError, ValueError, LossySetitemError): # The key was OK, but we cannot set the value losslessly @@ -1337,25 +1115,22 @@ def __setitem__(self, key, value) -> None: # otherwise with listlike other we interpret series[mask] = other # as series[mask] = other[mask] try: - self._where(~key, value, inplace=True, warn=warn) + self._where(~key, value, inplace=True) except InvalidIndexError: # test_where_dups self.iloc[key] = value return else: - self._set_with(key, value, warn=warn) - - if cacher_needs_updating: - self._maybe_update_cacher(inplace=True) + self._set_with(key, value) - def _set_with_engine(self, key, value, warn: bool = True) -> None: + def _set_with_engine(self, key, value) -> None: loc = self.index.get_loc(key) # this is equivalent to self._values[key] = value - self._mgr.setitem_inplace(loc, value, warn=warn) + self._mgr.setitem_inplace(loc, value) - def _set_with(self, key, value, warn: bool = True) -> None: + def _set_with(self, key, value) -> None: # We got here via exception-handling off of InvalidIndexError, so # key should always be listlike at this point. assert not isinstance(key, tuple) @@ -1364,43 +1139,21 @@ def _set_with(self, key, value, warn: bool = True) -> None: # Without this, the call to infer_dtype will consume the generator key = list(key) - if not self.index._should_fallback_to_positional: - # Regardless of the key type, we're treating it as labels - self._set_labels(key, value, warn=warn) - - else: - # Note: key_type == "boolean" should not occur because that - # should be caught by the is_bool_indexer check in __setitem__ - key_type = lib.infer_dtype(key, skipna=False) - - if key_type == "integer": - warnings.warn( - # GH#50617 - "Series.__setitem__ treating keys as positions is deprecated. " - "In a future version, integer keys will always be treated " - "as labels (consistent with DataFrame behavior). 
To set " - "a value by position, use `ser.iloc[pos] = value`", - FutureWarning, - stacklevel=find_stack_level(), - ) - self._set_values(key, value, warn=warn) - else: - self._set_labels(key, value, warn=warn) + self._set_labels(key, value) - def _set_labels(self, key, value, warn: bool = True) -> None: + def _set_labels(self, key, value) -> None: key = com.asarray_tuplesafe(key) indexer: np.ndarray = self.index.get_indexer(key) mask = indexer == -1 if mask.any(): raise KeyError(f"{key[mask]} not in index") - self._set_values(indexer, value, warn=warn) + self._set_values(indexer, value) - def _set_values(self, key, value, warn: bool = True) -> None: + def _set_values(self, key, value) -> None: if isinstance(key, (Index, Series)): key = key._values - self._mgr = self._mgr.setitem(indexer=key, value=value, warn=warn) - self._maybe_update_cacher() + self._mgr = self._mgr.setitem(indexer=key, value=value) def _set_value(self, label, value, takeable: bool = False) -> None: """ @@ -1429,84 +1182,6 @@ def _set_value(self, label, value, takeable: bool = False) -> None: self._set_values(loc, value) - # ---------------------------------------------------------------------- - # Lookup Caching - - @property - def _is_cached(self) -> bool: - """Return boolean indicating if self is cached or not.""" - return getattr(self, "_cacher", None) is not None - - def _get_cacher(self): - """return my cacher or None""" - cacher = getattr(self, "_cacher", None) - if cacher is not None: - cacher = cacher[1]() - return cacher - - def _reset_cacher(self) -> None: - """ - Reset the cacher. - """ - if hasattr(self, "_cacher"): - del self._cacher - - def _set_as_cached(self, item, cacher) -> None: - """ - Set the _cacher attribute on the calling object with a weakref to - cacher. 
- """ - if using_copy_on_write(): - return - self._cacher = (item, weakref.ref(cacher)) - - def _clear_item_cache(self) -> None: - # no-op for Series - pass - - def _check_is_chained_assignment_possible(self) -> bool: - """ - See NDFrame._check_is_chained_assignment_possible.__doc__ - """ - if self._is_view and self._is_cached: - ref = self._get_cacher() - if ref is not None and ref._is_mixed_type: - self._check_setitem_copy(t="referent", force=True) - return True - return super()._check_is_chained_assignment_possible() - - def _maybe_update_cacher( - self, clear: bool = False, verify_is_copy: bool = True, inplace: bool = False - ) -> None: - """ - See NDFrame._maybe_update_cacher.__doc__ - """ - # for CoW, we never want to update the parent DataFrame cache - # if the Series changed, but don't keep track of any cacher - if using_copy_on_write(): - return - cacher = getattr(self, "_cacher", None) - if cacher is not None: - ref: DataFrame = cacher[1]() - - # we are trying to reference a dead referent, hence - # a copy - if ref is None: - del self._cacher - elif len(self) == len(ref) and self.name in ref.columns: - # GH#42530 self.name must be in ref.columns - # to ensure column still in dataframe - # otherwise, either self or ref has swapped in new arrays - ref._maybe_cache_changed(cacher[0], self, inplace=inplace) - else: - # GH#33675 we have swapped in a new array, so parent - # reference to self is now invalid - ref._item_cache.pop(cacher[0], None) - - super()._maybe_update_cacher( - clear=clear, verify_is_copy=verify_is_copy, inplace=inplace - ) - # ---------------------------------------------------------------------- # Unsorted @@ -1538,7 +1213,7 @@ def repeat(self, repeats: int | Sequence[int], axis: None = None) -> Series: Examples -------- - >>> s = pd.Series(['a', 'b', 'c']) + >>> s = pd.Series(["a", "b", "c"]) >>> s 0 a 1 b @@ -1577,8 +1252,7 @@ def reset_index( name: Level = ..., inplace: Literal[False] = ..., allow_duplicates: bool = ..., - ) -> DataFrame: - ... + ) -> DataFrame: ... @overload def reset_index( @@ -1589,8 +1263,7 @@ def reset_index( name: Level = ..., inplace: Literal[False] = ..., allow_duplicates: bool = ..., - ) -> Series: - ... + ) -> Series: ... @overload def reset_index( @@ -1601,8 +1274,7 @@ def reset_index( name: Level = ..., inplace: Literal[True], allow_duplicates: bool = ..., - ) -> None: - ... + ) -> None: ... def reset_index( self, @@ -1654,8 +1326,11 @@ def reset_index( Examples -------- - >>> s = pd.Series([1, 2, 3, 4], name='foo', - ... index=pd.Index(['a', 'b', 'c', 'd'], name='idx')) + >>> s = pd.Series( + ... [1, 2, 3, 4], + ... name="foo", + ... index=pd.Index(["a", "b", "c", "d"], name="idx"), + ... ) Generate a DataFrame with default index. @@ -1668,7 +1343,7 @@ def reset_index( To specify the name of the new column use `name`. - >>> s.reset_index(name='values') + >>> s.reset_index(name="values") idx values 0 a 1 1 b 2 @@ -1687,16 +1362,19 @@ def reset_index( The `level` parameter is interesting for Series with a multi-level index. - >>> arrays = [np.array(['bar', 'bar', 'baz', 'baz']), - ... np.array(['one', 'two', 'one', 'two'])] + >>> arrays = [ + ... np.array(["bar", "bar", "baz", "baz"]), + ... np.array(["one", "two", "one", "two"]), + ... ] >>> s2 = pd.Series( - ... range(4), name='foo', - ... index=pd.MultiIndex.from_arrays(arrays, - ... names=['a', 'b'])) + ... range(4), + ... name="foo", + ... index=pd.MultiIndex.from_arrays(arrays, names=["a", "b"]), + ... ) To remove a specific level from the Index, use `level`. 
- >>> s2.reset_index(level='a') + >>> s2.reset_index(level="a") a foo b one bar 0 @@ -1728,14 +1406,10 @@ def reset_index( if inplace: self.index = new_index - elif using_copy_on_write(): + else: new_ser = self.copy(deep=False) new_ser.index = new_index return new_ser.__finalize__(self, method="reset_index") - else: - return self._constructor( - self._values.copy(), index=new_index, copy=False, dtype=self.dtype - ).__finalize__(self, method="reset_index") elif inplace: raise TypeError( "Cannot reset_index inplace on a Series to create a DataFrame" ) @@ -1762,7 +1436,6 @@ def __repr__(self) -> str: """ Return a string representation for a particular Series. """ - # pylint: disable=invalid-repr-returned repr_params = fmt.get_series_repr_params() return self.to_string(**repr_params) @@ -1770,6 +1443,7 @@ def __repr__(self) -> str: def to_string( self, buf: None = ..., + *, na_rep: str = ..., float_format: str | None = ..., header: bool = ..., @@ -1779,13 +1453,13 @@ def to_string( name=..., max_rows: int | None = ..., min_rows: int | None = ..., - ) -> str: - ... + ) -> str: ... @overload def to_string( self, buf: FilePath | WriteBuffer[str], + *, na_rep: str = ..., float_format: str | None = ..., header: bool = ..., @@ -1795,9 +1469,11 @@ def to_string( name=..., max_rows: int | None = ..., min_rows: int | None = ..., - ) -> None: - ... + ) -> None: ... + @deprecate_nonkeyword_arguments( + version="4.0", allowed_args=["self", "buf"], name="to_string" + ) def to_string( self, buf: FilePath | WriteBuffer[str] | None = None, @@ -1845,6 +1521,13 @@ def to_string( str or None String representation of Series if ``buf=None``, otherwise None. + See Also + -------- + Series.to_dict : Convert Series to dict object. + Series.to_frame : Convert Series to DataFrame object. + Series.to_markdown : Print Series in Markdown-friendly format. + Series.to_timestamp : Cast to DatetimeIndex of Timestamps. + Examples -------- >>> ser = pd.Series([1, 2, 3]).to_string() @@ -1869,7 +1552,7 @@ def to_string( if not isinstance(result, str): raise AssertionError( "result must be of type str, type " - f"of result is {repr(type(result).__name__)}" + f"of result is {type(result).__name__!r}" ) if buf is None: @@ -1882,6 +1565,39 @@ def to_string( f.write(result) return None + @overload + def to_markdown( + self, + buf: None = ..., + *, + mode: str = ..., + index: bool = ..., + storage_options: StorageOptions | None = ..., + **kwargs, + ) -> str: ... + + @overload + def to_markdown( + self, + buf: IO[str], + *, + mode: str = ..., + index: bool = ..., + storage_options: StorageOptions | None = ..., + **kwargs, + ) -> None: ... + + @overload + def to_markdown( + self, + buf: IO[str] | None, + *, + mode: str = ..., + index: bool = ..., + storage_options: StorageOptions | None = ..., + **kwargs, + ) -> str | None: ... + @doc( klass=_shared_doc_kwargs["klass"], storage_options=_shared_docs["storage_options"], @@ -1913,6 +1629,9 @@ def to_string( +----+----------+""" ), ) + @deprecate_nonkeyword_arguments( + version="4.0", allowed_args=["self", "buf"], name="to_markdown" + ) def to_markdown( self, buf: IO[str] | None = None, @@ -1944,6 +1663,11 @@ def to_markdown( str {klass} in Markdown-friendly format. + See Also + -------- + Series.to_frame : Convert Series to DataFrame object. + Series.to_latex : Render Series to LaTeX-formatted table. + Notes ----- Requires the `tabulate <https://pypi.org/project/tabulate>`_ package.
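With the two ``deprecate_nonkeyword_arguments`` decorators above, everything except ``buf`` becomes keyword-only in 4.0; positional calls keep working until then but warn. A sketch:

    import warnings
    import pandas as pd

    ser = pd.Series([1, 2, 3])
    ser.to_string(None, na_rep="NA")  # fine: buf may stay positional
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        ser.to_string(None, "NA")     # na_rep passed positionally
    print(caught[0].category)         # FutureWarning, until removal in 4.0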
@@ -1976,7 +1700,7 @@ def items(self) -> Iterable[tuple[Hashable, Any]]: Examples -------- - >>> s = pd.Series(['A', 'B', 'C']) + >>> s = pd.Series(["A", "B", "C"]) >>> for index, value in s.items(): ... print(f"Index : {index}, Value : {value}") Index : 0, Value : A @@ -1997,6 +1721,10 @@ def keys(self) -> Index: Index Index of the Series. + See Also + -------- + Series.index : The index (axis labels) of the Series. + Examples -------- >>> s = pd.Series([1, 2, 3], index=[0, 1, 2]) @@ -2008,22 +1736,17 @@ def keys(self) -> Index: @overload def to_dict( self, *, into: type[MutableMappingT] | MutableMappingT - ) -> MutableMappingT: - ... + ) -> MutableMappingT: ... @overload - def to_dict(self, *, into: type[dict] = ...) -> dict: - ... + def to_dict(self, *, into: type[dict] = ...) -> dict: ... # error: Incompatible default for argument "into" (default has type "type[ # dict[Any, Any]]", argument has type "type[MutableMappingT] | MutableMappingT") - @deprecate_nonkeyword_arguments( - version="3.0", allowed_args=["self"], name="to_dict" - ) def to_dict( self, - into: type[MutableMappingT] - | MutableMappingT = dict, # type: ignore[assignment] + *, + into: type[MutableMappingT] | MutableMappingT = dict, # type: ignore[assignment] ) -> MutableMappingT: """ Convert Series to {label -> value} dict or dict-like object. @@ -2041,6 +1764,12 @@ def to_dict( collections.abc.MutableMapping Key-value representation of Series. + See Also + -------- + Series.to_list: Converts Series to a list of the values. + Series.to_numpy: Converts Series to NumPy ndarray. + Series.array: ExtensionArray of the data backing this Series. + Examples -------- >>> s = pd.Series([1, 2, 3, 4]) @@ -2078,10 +1807,13 @@ def to_frame(self, name: Hashable = lib.no_default) -> DataFrame: DataFrame DataFrame representation of Series. + See Also + -------- + Series.to_dict : Convert Series to dict object. + Examples -------- - >>> s = pd.Series(["a", "b", "c"], - ... name="vals") + >>> s = pd.Series(["a", "b", "c"], name="vals") >>> s.to_frame() vals 0 a @@ -2114,11 +1846,9 @@ def _set_name( name : str inplace : bool Whether to modify `self` directly or return a copy. 
- deep : bool|None, default None - Whether to do a deep copy, a shallow copy, or Copy on Write(None) """ inplace = validate_bool_kwarg(inplace, "inplace") - ser = self if inplace else self.copy(deep and not using_copy_on_write()) + ser = self if inplace else self.copy(deep=False) ser.name = name return ser @@ -2136,14 +1866,30 @@ def _set_name( Parrot 30.0 Parrot 20.0 Name: Max Speed, dtype: float64 + + We can pass a list of values to group the Series data by custom labels: + >>> ser.groupby(["a", "b", "a", "b"]).mean() a 210.0 b 185.0 Name: Max Speed, dtype: float64 + + Grouping by numeric labels yields similar results: + + >>> ser.groupby([0, 1, 0, 1]).mean() + 0 210.0 + 1 185.0 + Name: Max Speed, dtype: float64 + + We can group by a level of the index: + >>> ser.groupby(level=0).mean() Falcon 370.0 Parrot 25.0 Name: Max Speed, dtype: float64 + + We can group by a condition applied to the Series values: + >>> ser.groupby(ser > 100).mean() Max Speed False 25.0 @@ -2166,11 +1912,16 @@ def _set_name( Parrot Captive 30.0 Wild 20.0 Name: Max Speed, dtype: float64 + >>> ser.groupby(level=0).mean() Animal Falcon 370.0 Parrot 25.0 Name: Max Speed, dtype: float64 + + We can also group by the 'Type' level of the hierarchical index + to get the mean speed for each type: + >>> ser.groupby(level="Type").mean() Type Captive 210.0 @@ -2186,12 +1937,17 @@ def _set_name( b 3 dtype: int64 + To include `NA` values in the group keys, set `dropna=False`: + >>> ser.groupby(level=0, dropna=False).sum() a 3 b 3 NaN 3 dtype: int64 + We can also group by a custom list with NaN values to handle + missing group labels: + >>> arrays = ['Falcon', 'Falcon', 'Parrot', 'Parrot'] >>> ser = pd.Series([390., 350., 30., 20.], index=arrays, name="Max Speed") >>> ser.groupby(["a", "b", "a", np.nan]).mean() a 370.0 b 350.0 NaN 25.0 Name: Max Speed, dtype: float64 @@ -2211,12 +1967,11 @@ def _set_name( def groupby( self, by=None, - axis: Axis = 0, level: IndexLabel | None = None, as_index: bool = True, sort: bool = True, group_keys: bool = True, - observed: bool | lib.NoDefault = lib.no_default, + observed: bool = False, dropna: bool = True, ) -> SeriesGroupBy: from pandas.core.groupby.generic import SeriesGroupBy @@ -2225,12 +1980,10 @@ def groupby( raise TypeError("You have to supply one of 'by' and 'level'") if not as_index: raise TypeError("as_index=False only valid with DataFrame") - axis = self._get_axis_number(axis) return SeriesGroupBy( obj=self, keys=by, - axis=axis, level=level, as_index=as_index, sort=sort, @@ -2282,7 +2035,17 @@ def mode(self, dropna: bool = True) -> Series: Series Modes of the Series in sorted order. - Examples + See Also + -------- + DataFrame.mode : Get the mode(s) of each element along the selected axis. + Series.sum : Sum of the values. + Series.median : Median of the values. + Series.std : Standard deviation of the values. + Series.var : Variance of the values. + Series.min : Minimum value. + Series.max : Maximum value.
+ + Examples -------- >>> s = pd.Series([2, 4, 2, 2, 4, None]) >>> s.mode() 0 2.0 dtype: float64 @@ -2311,7 +2074,7 @@ # TODO: Add option for bins like value_counts() values = self._values if isinstance(values, np.ndarray): - res_values = algorithms.mode(values, dropna=dropna) + res_values, _ = algorithms.mode(values, dropna=dropna) else: res_values = values._mode(dropna=dropna) @@ -2324,7 +2087,7 @@ dtype=self.dtype, ).__finalize__(self, method="mode") - def unique(self) -> ArrayLike: # pylint: disable=useless-parent-delegation + def unique(self) -> ArrayLike: """ Return unique values of Series object. @@ -2362,28 +2125,30 @@ def unique(self) -> ArrayLike: # pylint: disable=useless-parent-delegation Examples -------- - >>> pd.Series([2, 1, 3, 3], name='A').unique() + >>> pd.Series([2, 1, 3, 3], name="A").unique() array([2, 1, 3]) - >>> pd.Series([pd.Timestamp('2016-01-01') for _ in range(3)]).unique() + >>> pd.Series([pd.Timestamp("2016-01-01") for _ in range(3)]).unique() ['2016-01-01 00:00:00'] - Length: 1, dtype: datetime64[ns] + Length: 1, dtype: datetime64[s] - >>> pd.Series([pd.Timestamp('2016-01-01', tz='US/Eastern') - ... for _ in range(3)]).unique() + >>> pd.Series( + ... [pd.Timestamp("2016-01-01", tz="US/Eastern") for _ in range(3)] + ... ).unique() ['2016-01-01 00:00:00-05:00'] - Length: 1, dtype: datetime64[ns, US/Eastern] + Length: 1, dtype: datetime64[s, US/Eastern] A Categorical will return categories in the order of appearance and with the same dtype. - >>> pd.Series(pd.Categorical(list('baabc'))).unique() + >>> pd.Series(pd.Categorical(list("baabc"))).unique() ['b', 'a', 'c'] Categories (3, object): ['a', 'b', 'c'] - >>> pd.Series(pd.Categorical(list('baabc'), categories=list('abc'), - ... ordered=True)).unique() + >>> pd.Series( + ... pd.Categorical(list("baabc"), categories=list("abc"), ordered=True) + ... ).unique() ['b', 'a', 'c'] Categories (3, object): ['a' < 'b' < 'c'] """ @@ -2396,20 +2161,17 @@ def drop_duplicates( keep: DropKeep = ..., inplace: Literal[False] = ..., ignore_index: bool = ..., - ) -> Series: - ... + ) -> Series: ... @overload def drop_duplicates( self, *, keep: DropKeep = ..., inplace: Literal[True], ignore_index: bool = ... - ) -> None: - ... + ) -> None: ... @overload def drop_duplicates( self, *, keep: DropKeep = ..., inplace: bool = ..., ignore_index: bool = ... - ) -> Series | None: - ... + ) -> Series | None: ... def drop_duplicates( self, @@ -2455,8 +2217,9 @@ def drop_duplicates( -------- Generate a Series with duplicated entries. - >>> s = pd.Series(['llama', 'cow', 'llama', 'beetle', 'llama', 'hippo'], - ... name='animal') + >>> s = pd.Series( + ... ["llama", "cow", "llama", "beetle", "llama", "hippo"], name="animal" + ... ) >>> s 0 llama 1 cow 2 llama 3 beetle 4 llama 5 hippo Name: animal, dtype: object - With the 'keep' parameter, the selection behaviour of duplicated values + With the 'keep' parameter, the selection behavior of duplicated values can be changed. The value 'first' keeps the first occurrence for each set of duplicated entries. The default value of keep is 'first'. @@ -2480,7 +2243,7 @@ The value 'last' for parameter 'keep' keeps the last occurrence for each set of duplicated entries.
- >>> s.drop_duplicates(keep='last') + >>> s.drop_duplicates(keep="last") 1 cow 3 beetle 4 llama @@ -2544,7 +2307,7 @@ def duplicated(self, keep: DropKeep = "first") -> Series: By default, for each set of duplicated values, the first occurrence is set on False and all others on True: - >>> animals = pd.Series(['llama', 'cow', 'llama', 'beetle', 'llama']) + >>> animals = pd.Series(["llama", "cow", "llama", "beetle", "llama"]) >>> animals.duplicated() 0 False 1 False @@ -2555,7 +2318,7 @@ def duplicated(self, keep: DropKeep = "first") -> Series: which is equivalent to - >>> animals.duplicated(keep='first') + >>> animals.duplicated(keep="first") 0 False 1 False 2 True @@ -2566,7 +2329,7 @@ def duplicated(self, keep: DropKeep = "first") -> Series: By using 'last', the last occurrence of each set of duplicated values is set on False and all others on True: - >>> animals.duplicated(keep='last') + >>> animals.duplicated(keep="last") 0 True 1 False 2 True @@ -2600,8 +2363,8 @@ def idxmin(self, axis: Axis = 0, skipna: bool = True, *args, **kwargs) -> Hashab axis : {0 or 'index'} Unused. Parameter needed for compatibility with DataFrame. skipna : bool, default True - Exclude NA/null values. If the entire Series is NA, the result - will be NA. + Exclude NA/null values. If the entire Series is NA, or if ``skipna=False`` + and there is an NA value, this method will raise a ``ValueError``. *args, **kwargs Additional arguments and keywords have no effect but might be accepted for compatibility with NumPy. @@ -2633,8 +2396,7 @@ def idxmin(self, axis: Axis = 0, skipna: bool = True, *args, **kwargs) -> Hashab Examples -------- - >>> s = pd.Series(data=[1, None, 4, 1], - ... index=['A', 'B', 'C', 'D']) + >>> s = pd.Series(data=[1, None, 4, 1], index=["A", "B", "C", "D"]) >>> s A 1.0 B NaN @@ -2644,32 +2406,10 @@ def idxmin(self, axis: Axis = 0, skipna: bool = True, *args, **kwargs) -> Hashab >>> s.idxmin() 'A' - - If `skipna` is False and there is an NA value in the data, - the function returns ``nan``. - - >>> s.idxmin(skipna=False) - nan """ axis = self._get_axis_number(axis) - with warnings.catch_warnings(): - # TODO(3.0): this catching/filtering can be removed - # ignore warning produced by argmin since we will issue a different - # warning for idxmin - warnings.simplefilter("ignore") - i = self.argmin(axis, skipna, *args, **kwargs) - - if i == -1: - # GH#43587 give correct NA value for Index. - warnings.warn( - f"The behavior of {type(self).__name__}.idxmin with all-NA " - "values, or any-NA and skipna=False, is deprecated. In a future " - "version this will raise ValueError", - FutureWarning, - stacklevel=find_stack_level(), - ) - return self.index._na_value - return self.index[i] + iloc = self.argmin(axis, skipna, *args, **kwargs) + return self.index[iloc] def idxmax(self, axis: Axis = 0, skipna: bool = True, *args, **kwargs) -> Hashable: """ @@ -2683,8 +2423,8 @@ def idxmax(self, axis: Axis = 0, skipna: bool = True, *args, **kwargs) -> Hashab axis : {0 or 'index'} Unused. Parameter needed for compatibility with DataFrame. skipna : bool, default True - Exclude NA/null values. If the entire Series is NA, the result - will be NA. + Exclude NA/null values. If the entire Series is NA, or if ``skipna=False`` + and there is an NA value, this method will raise a ``ValueError``. *args, **kwargs Additional arguments and keywords have no effect but might be accepted for compatibility with NumPy. 
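Per the docstring change above, ``idxmin``/``idxmax`` no longer fall back to returning the index NA value: all-NA input, or ``skipna=False`` with any NA present, now surfaces the ``ValueError`` from ``argmin``/``argmax``. A sketch of the new contract:

    import numpy as np
    import pandas as pd

    ser = pd.Series([1.0, np.nan, 4.0], index=["A", "B", "C"])
    print(ser.idxmin())           # 'A': NaN is skipped by default
    try:
        ser.idxmin(skipna=False)  # an NA is present, so argmin raises
    except ValueError as err:
        print(err)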
@@ -2716,8 +2456,7 @@ def idxmax(self, axis: Axis = 0, skipna: bool = True, *args, **kwargs) -> Hashab Examples -------- - >>> s = pd.Series(data=[1, None, 4, 3, 4], - ... index=['A', 'B', 'C', 'D', 'E']) + >>> s = pd.Series(data=[1, None, 4, 3, 4], index=["A", "B", "C", "D", "E"]) >>> s A 1.0 B NaN @@ -2728,32 +2467,10 @@ def idxmax(self, axis: Axis = 0, skipna: bool = True, *args, **kwargs) -> Hashab >>> s.idxmax() 'C' - - If `skipna` is False and there is an NA value in the data, - the function returns ``nan``. - - >>> s.idxmax(skipna=False) - nan """ axis = self._get_axis_number(axis) - with warnings.catch_warnings(): - # TODO(3.0): this catching/filtering can be removed - # ignore warning produced by argmax since we will issue a different - # warning for argmax - warnings.simplefilter("ignore") - i = self.argmax(axis, skipna, *args, **kwargs) - - if i == -1: - # GH#43587 give correct NA value for Index. - warnings.warn( - f"The behavior of {type(self).__name__}.idxmax with all-NA " - "values, or any-NA and skipna=False, is deprecated. In a future " - "version this will raise ValueError", - FutureWarning, - stacklevel=find_stack_level(), - ) - return self.index._na_value - return self.index[i] + iloc = self.argmax(axis, skipna, *args, **kwargs) + return self.index[iloc] def round(self, decimals: int = 0, *args, **kwargs) -> Series: """ @@ -2777,45 +2494,49 @@ def round(self, decimals: int = 0, *args, **kwargs) -> Series: -------- numpy.around : Round values of an np.array. DataFrame.round : Round values of a DataFrame. + Series.dt.round : Round values of data to the specified freq. + + Notes + ----- + For values exactly halfway between rounded decimal values, pandas rounds + to the nearest even value (e.g. -0.5 and 0.5 round to 0.0, 1.5 and 2.5 + round to 2.0, etc.). Examples -------- - >>> s = pd.Series([0.1, 1.3, 2.7]) + >>> s = pd.Series([-0.5, 0.1, 2.5, 1.3, 2.7]) >>> s.round() - 0 0.0 - 1 1.0 - 2 3.0 + 0 -0.0 + 1 0.0 + 2 2.0 + 3 1.0 + 4 3.0 dtype: float64 """ nv.validate_round(args, kwargs) - result = self._values.round(decimals) - result = self._constructor(result, index=self.index, copy=False).__finalize__( + new_mgr = self._mgr.round(decimals=decimals) + return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__( self, method="round" ) - return result - @overload def quantile( self, q: float = ..., interpolation: QuantileInterpolation = ... - ) -> float: - ... + ) -> float: ... @overload def quantile( self, q: Sequence[float] | AnyArrayLike, interpolation: QuantileInterpolation = ..., - ) -> Series: - ... + ) -> Series: ... @overload def quantile( self, q: float | Sequence[float] | AnyArrayLike = ..., interpolation: QuantileInterpolation = ..., - ) -> float | Series: - ... + ) -> float | Series: ... def quantile( self, @@ -2855,9 +2576,9 @@ def quantile( Examples -------- >>> s = pd.Series([1, 2, 3, 4]) - >>> s.quantile(.5) + >>> s.quantile(0.5) 2.5 - >>> s.quantile([.25, .5, .75]) + >>> s.quantile([0.25, 0.5, 0.75]) 0.25 1.75 0.50 2.50 0.75 3.25 @@ -2939,8 +2660,8 @@ def corr( >>> def histogram_intersection(a, b): ... v = np.minimum(a, b).sum().round(decimals=1) ... 
return v - >>> s1 = pd.Series([.2, .0, .6, .2]) - >>> s2 = pd.Series([.3, .6, .0, .1]) + >>> s1 = pd.Series([0.2, 0.0, 0.6, 0.2]) + >>> s2 = pd.Series([0.3, 0.6, 0.0, 0.1]) >>> s1.corr(s2, method=histogram_intersection) 0.3 @@ -2950,8 +2671,15 @@ def corr( >>> s2 = pd.Series([1, 2, 3], index=[2, 1, 0]) >>> s1.corr(s2) -1.0 + + If the input is a constant array, the correlation is not defined in this case, + and ``np.nan`` is returned. + + >>> s1 = pd.Series([0.45, 0.45]) + >>> s1.corr(s1) + nan """ # noqa: E501 - this, other = self.align(other, join="inner", copy=False) + this, other = self.align(other, join="inner") if len(this) == 0: return np.nan @@ -3008,7 +2736,7 @@ def cov( >>> s1.cov(s2) -0.01685762652715874 """ - this, other = self.align(other, join="inner", copy=False) + this, other = self.align(other, join="inner") if len(this) == 0: return np.nan this_values = this.to_numpy(dtype=float, na_value=np.nan, copy=False) @@ -3102,6 +2830,9 @@ def diff(self, periods: int = 1) -> Series: -------- {examples} """ + if not lib.is_integer(periods): + if not (is_float(periods) and periods.is_integer()): + raise ValueError("periods must be an integer") result = algorithms.diff(self._values, periods) return self._constructor(result, index=self.index, copy=False).__finalize__( self, method="diff" @@ -3152,7 +2883,7 @@ def autocorr(self, lag: int = 1) -> float: """ return self.corr(cast(Series, self.shift(lag))) - def dot(self, other: AnyArrayLike) -> Series | np.ndarray: + def dot(self, other: AnyArrayLike | DataFrame) -> Series | np.ndarray: """ Compute the dot product between the Series and the columns of other. @@ -3207,8 +2938,8 @@ def dot(self, other: AnyArrayLike) -> Series | np.ndarray: if len(common) > len(self.index) or len(common) > len(other.index): raise ValueError("matrices are not aligned") - left = self.reindex(index=common, copy=False) - right = other.reindex(index=common, copy=False) + left = self.reindex(index=common) + right = other.reindex(index=common) lvals = left.values rvals = right.values else: @@ -3220,8 +2951,9 @@ def dot(self, other: AnyArrayLike) -> Series | np.ndarray: ) if isinstance(other, ABCDataFrame): + common_type = find_common_type([self.dtypes] + list(other.dtypes)) return self._constructor( - np.dot(lvals, rvals), index=other.columns, copy=False + np.dot(lvals, rvals), index=other.columns, copy=False, dtype=common_type ).__finalize__(self, method="dot") elif isinstance(other, Series): return np.dot(lvals, rvals) @@ -3394,12 +3126,12 @@ def combine( Consider 2 Datasets ``s1`` and ``s2`` containing highest clocked speeds of different birds. 
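The new guard in ``diff`` accepts any value that is integral, not only Python ints: an integral float passes the ``periods.is_integer()`` check, while a fractional one is rejected. Sketch:

    import pandas as pd

    ser = pd.Series([1, 3, 6, 10])
    ser.diff(2)        # int: accepted
    ser.diff(2.0)      # integral float: accepted via the is_integer() branch
    try:
        ser.diff(1.5)  # fractional float: rejected
    except ValueError as err:
        print(err)     # periods must be an integer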
- >>> s1 = pd.Series({'falcon': 330.0, 'eagle': 160.0}) + >>> s1 = pd.Series({"falcon": 330.0, "eagle": 160.0}) >>> s1 falcon 330.0 eagle 160.0 dtype: float64 - >>> s2 = pd.Series({'falcon': 345.0, 'eagle': 200.0, 'duck': 30.0}) + >>> s2 = pd.Series({"falcon": 345.0, "eagle": 200.0, "duck": 30.0}) >>> s2 falcon 345.0 eagle 200.0 @@ -3495,8 +3227,8 @@ def combine_first(self, other) -> Series: Null values still persist if the location of that null value does not exist in `other` - >>> s1 = pd.Series({'falcon': np.nan, 'eagle': 160.0}) - >>> s2 = pd.Series({'eagle': 200.0, 'duck': 30.0}) + >>> s1 = pd.Series({"falcon": np.nan, "eagle": 160.0}) + >>> s2 = pd.Series({"eagle": 200.0, "duck": 30.0}) >>> s1.combine_first(s2) duck 30.0 eagle 160.0 @@ -3505,6 +3237,13 @@ def combine_first(self, other) -> Series: """ from pandas.core.reshape.concat import concat + if self.dtype == other.dtype: + if self.index.equals(other.index): + return self.mask(self.isna(), other) + elif self._can_hold_na and not isinstance(self.dtype, SparseDtype): + this, other = self.align(other, join="outer") + return this.mask(this.isna(), other) + new_index = self.index.union(other.index) this = self @@ -3512,13 +3251,14 @@ def combine_first(self, other) -> Series: keep_other = other.index.difference(this.index[notna(this)]) keep_this = this.index.difference(keep_other) - this = this.reindex(keep_this, copy=False) - other = other.reindex(keep_other, copy=False) + this = this.reindex(keep_this) + other = other.reindex(keep_other) if this.dtype.kind == "M" and other.dtype.kind != "M": + # TODO: try to match resos? other = to_datetime(other) combined = concat([this, other]) - combined = combined.reindex(new_index, copy=False) + combined = combined.reindex(new_index) return combined.__finalize__(self, method="combine_first") def update(self, other: Series | Sequence | Mapping) -> None: @@ -3531,6 +3271,13 @@ def update(self, other: Series | Sequence | Mapping) -> None: Parameters ---------- other : Series, or object coercible into Series + Other Series that provides values to update the current Series. + + See Also + -------- + Series.combine : Perform element-wise operation on two Series + using a given function. + Series.transform: Modify a Series using a function. 
Examples -------- @@ -3542,8 +3289,8 @@ def update(self, other: Series | Sequence | Mapping) -> None: 2 6 dtype: int64 - >>> s = pd.Series(['a', 'b', 'c']) - >>> s.update(pd.Series(['d', 'e'], index=[0, 2])) + >>> s = pd.Series(["a", "b", "c"]) + >>> s.update(pd.Series(["d", "e"], index=[0, 2])) >>> s 0 d 1 b @@ -3588,25 +3335,13 @@ def update(self, other: Series | Sequence | Mapping) -> None: 2 3 dtype: int64 """ - if not PYPY and using_copy_on_write(): + if not PYPY: if sys.getrefcount(self) <= REF_COUNT: warnings.warn( _chained_assignment_method_msg, ChainedAssignmentError, stacklevel=2, ) - elif not PYPY and not using_copy_on_write() and self._is_view_after_cow_rules(): - ctr = sys.getrefcount(self) - ref_count = REF_COUNT - if _check_cacher(self): - # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 - ref_count += 1 - if ctr <= ref_count: - warnings.warn( - _chained_assignment_warning_method_msg, - FutureWarning, - stacklevel=2, - ) if not isinstance(other, Series): other = Series(other) @@ -3615,7 +3350,6 @@ def update(self, other: Series | Sequence | Mapping) -> None: mask = notna(other) self._mgr = self._mgr.putmask(mask=mask, new=other) - self._maybe_update_cacher() # ---------------------------------------------------------------------- # Reindexing, sorting @@ -3631,8 +3365,7 @@ def sort_values( na_position: NaPosition = ..., ignore_index: bool = ..., key: ValueKeyFunc = ..., - ) -> Series: - ... + ) -> Series: ... @overload def sort_values( @@ -3645,8 +3378,7 @@ def sort_values( na_position: NaPosition = ..., ignore_index: bool = ..., key: ValueKeyFunc = ..., - ) -> None: - ... + ) -> None: ... @overload def sort_values( @@ -3659,8 +3391,7 @@ def sort_values( na_position: NaPosition = ..., ignore_index: bool = ..., key: ValueKeyFunc = ..., - ) -> Series | None: - ... + ) -> Series | None: ... def sort_values( self, @@ -3724,7 +3455,7 @@ def sort_values( 4 5.0 dtype: float64 - Sort values ascending order (default behaviour) + Sort values ascending order (default behavior) >>> s.sort_values(ascending=True) 1 1.0 @@ -3746,7 +3477,7 @@ def sort_values( Sort values putting NAs first - >>> s.sort_values(na_position='first') + >>> s.sort_values(na_position="first") 0 NaN 1 1.0 2 3.0 @@ -3756,7 +3487,7 @@ def sort_values( Sort a series of strings - >>> s = pd.Series(['z', 'b', 'd', 'a', 'c']) + >>> s = pd.Series(["z", "b", "d", "a", "c"]) >>> s 0 z 1 b @@ -3776,7 +3507,7 @@ def sort_values( Sort using a key function. Your `key` function will be given the ``Series`` of values and should return an array-like. - >>> s = pd.Series(['a', 'B', 'c', 'D', 'e']) + >>> s = pd.Series(["a", "B", "c", "D", "e"]) >>> s.sort_values() 1 B 3 D @@ -3819,13 +3550,6 @@ def sort_values( # Validate the axis parameter self._get_axis_number(axis) - # GH 5856/5853 - if inplace and self._is_cached: - raise ValueError( - "This Series is a view of some other array, to " - "sort in-place you must create a copy" - ) - if is_list_like(ascending): ascending = cast(Sequence[bool], ascending) if len(ascending) != 1: @@ -3849,7 +3573,7 @@ def sort_values( if is_range_indexer(sorted_index, len(sorted_index)): if inplace: return self._update_inplace(self) - return self.copy(deep=None) + return self.copy(deep=False) result = self._constructor( self._values[sorted_index], index=self.index[sorted_index], copy=False @@ -3876,8 +3600,7 @@ def sort_index( sort_remaining: bool = ..., ignore_index: bool = ..., key: IndexKeyFunc = ..., - ) -> None: - ... + ) -> None: ... 
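The refcount check that replaces the old cacher machinery in ``update`` fires exactly on the chained-assignment pattern: under Copy-on-Write, the temporary produced by ``df["a"]`` would absorb the write and then be discarded. A sketch (CPython only; the check is skipped on PyPy):

    import warnings
    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3]})
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        df["a"].update(pd.Series([9], index=[0]))  # update hits a temporary
    print(df["a"].iloc[0])     # still 1: the write never reached df
    print(caught[0].category)  # <class 'pandas.errors.ChainedAssignmentError'>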
@overload def sort_index( @@ -3892,8 +3615,7 @@ def sort_index( sort_remaining: bool = ..., ignore_index: bool = ..., key: IndexKeyFunc = ..., - ) -> Series: - ... + ) -> Series: ... @overload def sort_index( @@ -3908,8 +3630,7 @@ def sort_index( sort_remaining: bool = ..., ignore_index: bool = ..., key: IndexKeyFunc = ..., - ) -> Series | None: - ... + ) -> Series | None: ... def sort_index( self, @@ -3974,7 +3695,7 @@ def sort_index( Examples -------- - >>> s = pd.Series(['a', 'b', 'c', 'd'], index=[3, 2, 1, 4]) + >>> s = pd.Series(["a", "b", "c", "d"], index=[3, 2, 1, 4]) >>> s.sort_index() 1 c 2 b @@ -3994,8 +3715,8 @@ def sort_index( By default NaNs are put at the end, but use `na_position` to place them at the beginning - >>> s = pd.Series(['a', 'b', 'c', 'd'], index=[3, 2, 1, np.nan]) - >>> s.sort_index(na_position='first') + >>> s = pd.Series(["a", "b", "c", "d"], index=[3, 2, 1, np.nan]) + >>> s.sort_index(na_position="first") NaN d 1.0 c 2.0 b @@ -4004,10 +3725,10 @@ def sort_index( Specify index level to sort - >>> arrays = [np.array(['qux', 'qux', 'foo', 'foo', - ... 'baz', 'baz', 'bar', 'bar']), - ... np.array(['two', 'one', 'two', 'one', - ... 'two', 'one', 'two', 'one'])] + >>> arrays = [ + ... np.array(["qux", "qux", "foo", "foo", "baz", "baz", "bar", "bar"]), + ... np.array(["two", "one", "two", "one", "two", "one", "two", "one"]), + ... ] >>> s = pd.Series([1, 2, 3, 4, 5, 6, 7, 8], index=arrays) >>> s.sort_index(level=1) bar one 8 @@ -4035,8 +3756,8 @@ def sort_index( Apply a key function before sorting - >>> s = pd.Series([1, 2, 3, 4], index=['A', 'b', 'C', 'd']) - >>> s.sort_index(key=lambda x : x.str.lower()) + >>> s = pd.Series([1, 2, 3, 4], index=["A", "b", "C", "d"]) + >>> s.sort_index(key=lambda x: x.str.lower()) A 1 b 2 C 3 @@ -4061,6 +3782,7 @@ def argsort( axis: Axis = 0, kind: SortKind = "quicksort", order: None = None, + stable: None = None, ) -> Series: """ Return the integer indices that would sort the Series values. @@ -4077,6 +3799,8 @@ def argsort( information. 'mergesort' and 'stable' are the only stable algorithms. order : None Has no effect but is accepted for compatibility with numpy. + stable : None + Has no effect but is accepted for compatibility with numpy. Returns ------- @@ -4101,25 +3825,7 @@ def argsort( # GH#54257 We allow -1 here so that np.argsort(series) works self._get_axis_number(axis) - values = self._values - mask = isna(values) - - if mask.any(): - # TODO(3.0): once this deprecation is enforced we can call - # self.array.argsort directly, which will close GH#43840 and - # GH#12694 - warnings.warn( - "The behavior of Series.argsort in the presence of NA values is " - "deprecated. In a future version, NA values will be ordered " - "last instead of set to -1.", - FutureWarning, - stacklevel=find_stack_level(), - ) - result = np.full(len(self), -1, dtype=np.intp) - notmask = ~mask - result[notmask] = np.argsort(values[notmask], kind=kind) - else: - result = np.argsort(values, kind=kind) + result = self.array.argsort(kind=kind) res = self._constructor( result, index=self.index, name=self.name, dtype=np.intp, copy=False @@ -4165,11 +3871,18 @@ def nlargest( Examples -------- - >>> countries_population = {"Italy": 59000000, "France": 65000000, - ... "Malta": 434000, "Maldives": 434000, - ... "Brunei": 434000, "Iceland": 337000, - ... "Nauru": 11300, "Tuvalu": 11300, - ... "Anguilla": 11300, "Montserrat": 5200} + >>> countries_population = { + ... "Italy": 59000000, + ... "France": 65000000, + ... "Malta": 434000, + ... 
"Maldives": 434000, + ... "Brunei": 434000, + ... "Iceland": 337000, + ... "Nauru": 11300, + ... "Tuvalu": 11300, + ... "Anguilla": 11300, + ... "Montserrat": 5200, + ... } >>> s = pd.Series(countries_population) >>> s Italy 59000000 @@ -4207,7 +3920,7 @@ def nlargest( Brunei will be kept since it is the last with value 434000 based on the index order. - >>> s.nlargest(3, keep='last') + >>> s.nlargest(3, keep="last") France 65000000 Italy 59000000 Brunei 434000 @@ -4216,7 +3929,7 @@ def nlargest( The `n` largest elements where ``n=3`` with all duplicates kept. Note that the returned Series has five elements due to the three duplicates. - >>> s.nlargest(3, keep='all') + >>> s.nlargest(3, keep="all") France 65000000 Italy 59000000 Malta 434000 @@ -4265,11 +3978,18 @@ def nsmallest( Examples -------- - >>> countries_population = {"Italy": 59000000, "France": 65000000, - ... "Brunei": 434000, "Malta": 434000, - ... "Maldives": 434000, "Iceland": 337000, - ... "Nauru": 11300, "Tuvalu": 11300, - ... "Anguilla": 11300, "Montserrat": 5200} + >>> countries_population = { + ... "Italy": 59000000, + ... "France": 65000000, + ... "Brunei": 434000, + ... "Malta": 434000, + ... "Maldives": 434000, + ... "Iceland": 337000, + ... "Nauru": 11300, + ... "Tuvalu": 11300, + ... "Anguilla": 11300, + ... "Montserrat": 5200, + ... } >>> s = pd.Series(countries_population) >>> s Italy 59000000 @@ -4307,7 +4027,7 @@ def nsmallest( duplicates. Anguilla and Tuvalu will be kept since they are the last with value 11300 based on the index order. - >>> s.nsmallest(3, keep='last') + >>> s.nsmallest(3, keep="last") Montserrat 5200 Anguilla 11300 Tuvalu 11300 @@ -4316,7 +4036,7 @@ def nsmallest( The `n` smallest elements where ``n=3`` with all duplicates kept. Note that the returned Series has four elements due to the three duplicates. - >>> s.nsmallest(3, keep='all') + >>> s.nsmallest(3, keep="all") Montserrat 5200 Nauru 11300 Tuvalu 11300 @@ -4325,26 +4045,44 @@ def nsmallest( """ return selectn.SelectNSeries(self, n=n, keep=keep).nsmallest() - @doc( - klass=_shared_doc_kwargs["klass"], - extra_params=dedent( - """copy : bool, default True - Whether to copy underlying data. + def swaplevel( + self, i: Level = -2, j: Level = -1, copy: bool | lib.NoDefault = lib.no_default + ) -> Series: + """ + Swap levels i and j in a :class:`MultiIndex`. - .. note:: - The `copy` keyword will change behavior in pandas 3.0. - `Copy-on-Write - `__ - will be enabled by default, which means that all methods with a - `copy` keyword will use a lazy copy mechanism to defer the copy and - ignore the `copy` keyword. The `copy` keyword will be removed in a - future version of pandas. + Default is to swap the two innermost levels of the index. + + Parameters + ---------- + i, j : int or str + Levels of the indices to be swapped. Can pass level name as string. + copy : bool, default True + Whether to copy underlying data. + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy + and ignore the `copy` keyword. The `copy` keyword will be + removed in a future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` + + Returns + ------- + Series + Series with levels swapped in MultiIndex. + + See Also + -------- + DataFrame.swaplevel : Swap levels i and j in a :class:`DataFrame`. 
+ Series.reorder_levels : Rearrange index levels using input order. + MultiIndex.swaplevel : Swap levels i and j in a :class:`MultiIndex`. - You can already get the future behavior and improvements through - enabling copy on write ``pd.options.mode.copy_on_write = True``""" - ), - examples=dedent( - """\ Examples -------- >>> s = pd.Series( @@ -4364,7 +4102,7 @@ def nsmallest( In the following example, we will swap the levels of the indices. Here, we will swap the levels column-wise, but levels can be swapped row-wise - in a similar manner. Note that column-wise is the default behaviour. + in a similar manner. Note that column-wise is the default behavior. By not supplying any arguments for i and j, we swap the last and second to last indices. @@ -4394,32 +4132,11 @@ def nsmallest( Geography Final exam February B History Coursework March A Geography Coursework April C - dtype: object""" - ), - ) - def swaplevel( - self, i: Level = -2, j: Level = -1, copy: bool | None = None - ) -> Series: - """ - Swap levels i and j in a :class:`MultiIndex`. - - Default is to swap the two innermost levels of the index. - - Parameters - ---------- - i, j : int or str - Levels of the indices to be swapped. Can pass level name as string. - {extra_params} - - Returns - ------- - {klass} - {klass} with levels swapped in MultiIndex. - - {examples} + dtype: object """ + self._check_copy_deprecation(copy) assert isinstance(self.index, MultiIndex) - result = self.copy(deep=copy and not using_copy_on_write()) + result = self.copy(deep=False) result.index = self.index.swaplevel(i, j) return result @@ -4436,12 +4153,20 @@ def reorder_levels(self, order: Sequence[Level]) -> Series: Returns ------- - type of caller (new object) + Series + Type of caller with index as MultiIndex (new object). + + See Also + -------- + DataFrame.reorder_levels : Rearrange index or column levels using + input ``order``. Examples -------- - >>> arrays = [np.array(["dog", "dog", "cat", "cat", "bird", "bird"]), - ... np.array(["white", "black", "white", "black", "white", "black"])] + >>> arrays = [ + ... np.array(["dog", "dog", "cat", "cat", "bird", "bird"]), + ... np.array(["white", "black", "white", "black", "white", "black"]), + ... ] >>> s = pd.Series([1, 2, 3, 3, 5, 2], index=arrays) >>> s dog white 1 @@ -4463,7 +4188,7 @@ def reorder_levels(self, order: Sequence[Level]) -> Series: if not isinstance(self.index, MultiIndex): # pragma: no cover raise Exception("Can only reorder levels on a hierarchical axis.") - result = self.copy(deep=None) + result = self.copy(deep=False) assert isinstance(result.index, MultiIndex) result.index = result.index.reorder_levels(order) return result @@ -4503,7 +4228,7 @@ def explode(self, ignore_index: bool = False) -> Series: Examples -------- - >>> s = pd.Series([[1, 2, 3], 'foo', [], [3, 4]]) + >>> s = pd.Series([[1, 2, 3], "foo", [], [3, 4]]) >>> s 0 [1, 2, 3] 1 foo @@ -4559,15 +4284,20 @@ def unstack( DataFrame Unstacked Series. + See Also + -------- + DataFrame.unstack : Pivot the MultiIndex of a DataFrame. + Notes ----- Reference :ref:`the user guide ` for more examples. Examples -------- - >>> s = pd.Series([1, 2, 3, 4], - ... index=pd.MultiIndex.from_product([['one', 'two'], - ... ['a', 'b']])) + >>> s = pd.Series( + ... [1, 2, 3, 4], + ... index=pd.MultiIndex.from_product([["one", "two"], ["a", "b"]]), + ... 
) >>> s one a 1 b 2 two a 3 b 4 dtype: int64 @@ -4594,8 +4324,9 @@ def unstack( def map( self, - arg: Callable | Mapping | Series, + func: Callable | Mapping | Series | None = None, na_action: Literal["ignore"] | None = None, + **kwargs, ) -> Series: """ Map values of Series according to an input mapping or function. @@ -4606,11 +4337,16 @@ def map( Parameters ---------- - arg : function, collections.abc.Mapping subclass or Series - Mapping correspondence. + func : function, collections.abc.Mapping subclass or Series + Function or mapping correspondence. na_action : {None, 'ignore'}, default None If 'ignore', propagate NaN values, without passing them to the mapping correspondence. + **kwargs + Additional keyword arguments to pass as keyword arguments to + `func`. + + .. versionadded:: 3.0.0 Returns ------- @@ -4634,7 +4370,7 @@ def map( Examples -------- - >>> s = pd.Series(['cat', 'dog', np.nan, 'rabbit']) + >>> s = pd.Series(["cat", "dog", np.nan, "rabbit"]) >>> s 0 cat 1 dog 2 NaN 3 rabbit dtype: object ``map`` accepts a ``dict`` or a ``Series``. Values that are not found in the ``dict`` are converted to ``NaN``, unless the dict has a default value (e.g. ``defaultdict``): - >>> s.map({'cat': 'kitten', 'dog': 'puppy'}) + >>> s.map({"cat": "kitten", "dog": "puppy"}) 0 kitten 1 puppy 2 NaN 3 NaN dtype: object It also accepts a function: - >>> s.map('I am a {}'.format) + >>> s.map("I am a {}".format) 0 I am a cat 1 I am a dog 2 I am a nan 3 I am a rabbit dtype: object To avoid applying the function to missing values (and keep them as ``NaN``) ``na_action='ignore'`` can be used: - >>> s.map('I am a {}'.format, na_action='ignore') + >>> s.map("I am a {}".format, na_action="ignore") 0 I am a cat 1 I am a dog 2 NaN 3 I am a rabbit dtype: object """ - new_values = self._map_values(arg, na_action=na_action) + if func is None: + if "arg" in kwargs: + # `.map(arg=my_func)` + func = kwargs.pop("arg") + warnings.warn( + "The parameter `arg` has been renamed to `func`, and it " + "will stop being supported in a future version of pandas.", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + raise ValueError("The `func` parameter is required") + + if callable(func): + func = functools.partial(func, **kwargs) + new_values = self._map_values(func, na_action=na_action) return self._constructor(new_values, index=self.index, copy=False).__finalize__( self, method="map" ) @@ -4753,18 +4504,13 @@ def transform( ) -> DataFrame | Series: # Validate axis argument self._get_axis_number(axis) - ser = ( - self.copy(deep=False) - if using_copy_on_write() or warn_copy_on_write() - else self - ) + ser = self.copy(deep=False) result = SeriesApply(ser, func=func, args=args, kwargs=kwargs).transform() return result def apply( self, func: AggFuncType, - convert_dtype: bool | lib.NoDefault = lib.no_default, args: tuple[Any, ...] = (), *, by_row: Literal[False, "compat"] = "compat", @@ -4780,14 +4526,6 @@ def apply( ---------- func : function Python function or NumPy ufunc to apply. - convert_dtype : bool, default True - Try to find better dtype for elementwise function results. If - False, leave as dtype=object. Note that the dtype is always - preserved for some extension array dtypes, such as Categorical. - - .. deprecated:: 2.1.0 - ``convert_dtype`` has been deprecated. Do ``ser.astype(object).apply()`` - instead if you want ``convert_dtype=False``. args : tuple Positional arguments passed to func after the series value. by_row : False or "compat", default "compat" @@ -4826,8 +4564,7 @@ def apply( -------- Create a series with typical summer temperatures for each city.
- >>> s = pd.Series([20, 21, 12], - ... index=['London', 'New York', 'Helsinki']) + >>> s = pd.Series([20, 21, 12], index=["London", "New York", "Helsinki"]) >>> s London 20 New York 21 @@ -4838,7 +4575,7 @@ def apply( argument to ``apply()``. >>> def square(x): - ... return x ** 2 + ... return x**2 >>> s.apply(square) London 400 New York 441 @@ -4848,7 +4585,7 @@ def apply( Square the values by passing an anonymous function as an argument to ``apply()``. - >>> s.apply(lambda x: x ** 2) + >>> s.apply(lambda x: x**2) London 400 New York 441 Helsinki 144 @@ -4892,7 +4629,6 @@ def apply( return SeriesApply( self, func, - convert_dtype=convert_dtype, by_row=by_row, args=args, kwargs=kwargs, @@ -4902,18 +4638,13 @@ def _reindex_indexer( self, new_index: Index | None, indexer: npt.NDArray[np.intp] | None, - copy: bool | None, ) -> Series: # Note: new_index is None iff indexer is None # if not None, indexer is np.intp if indexer is None and ( new_index is None or new_index.names == self.index.names ): - if using_copy_on_write(): - return self.copy(deep=copy) - if copy or copy is None: - return self.copy(deep=copy) - return self + return self.copy(deep=False) new_values = algorithms.take_nd( self._values, indexer, allow_fill=True, fill_value=None @@ -4933,12 +4664,11 @@ def rename( index: Renamer | Hashable | None = ..., *, axis: Axis | None = ..., - copy: bool = ..., + copy: bool | lib.NoDefault = ..., inplace: Literal[True], level: Level | None = ..., errors: IgnoreRaise = ..., - ) -> None: - ... + ) -> Series | None: ... @overload def rename( @@ -4946,32 +4676,18 @@ def rename( index: Renamer | Hashable | None = ..., *, axis: Axis | None = ..., - copy: bool = ..., + copy: bool | lib.NoDefault = ..., inplace: Literal[False] = ..., level: Level | None = ..., errors: IgnoreRaise = ..., - ) -> Series: - ... - - @overload - def rename( - self, - index: Renamer | Hashable | None = ..., - *, - axis: Axis | None = ..., - copy: bool = ..., - inplace: bool = ..., - level: Level | None = ..., - errors: IgnoreRaise = ..., - ) -> Series | None: - ... + ) -> Series: ... def rename( self, index: Renamer | Hashable | None = None, *, axis: Axis | None = None, - copy: bool | None = None, + copy: bool | lib.NoDefault = lib.no_default, inplace: bool = False, level: Level | None = None, errors: IgnoreRaise = "ignore", @@ -4996,7 +4712,7 @@ def rename( attribute. axis : {0 or 'index'} Unused. Parameter needed for compatibility with DataFrame. - copy : bool, default True + copy : bool, default False Also copy underlying data. .. note:: @@ -5010,6 +4726,8 @@ def rename( You can already get the future behavior and improvements through enabling copy on write ``pd.options.mode.copy_on_write = True`` + + .. deprecated:: 3.0.0 inplace : bool, default False Whether to return a new Series. If True the value of copy is ignored. level : int or level name, default None @@ -5021,8 +4739,9 @@ def rename( Returns ------- - Series or None - Series with index labels or name altered or None if ``inplace=True``. + Series + A shallow copy with index labels or name altered, or the same object + if ``inplace=True`` and index is not a dict or callable else None. See Also -------- @@ -5042,7 +4761,7 @@ def rename( 1 2 2 3 Name: my_name, dtype: int64 - >>> s.rename(lambda x: x ** 2) # function, changes labels + >>> s.rename(lambda x: x**2) # function, changes labels 0 1 1 2 4 3 @@ -5053,6 +4772,7 @@ def rename( 5 3 dtype: int64 """ + self._check_copy_deprecation(copy) if axis is not None: # Make sure we raise if an invalid 'axis' is passed. 
axis = self._get_axis_number(axis) @@ -5064,13 +4784,12 @@ def rename( # Hashable], Callable[[Any], Hashable], None]" return super()._rename( index, # type: ignore[arg-type] - copy=copy, inplace=inplace, level=level, errors=errors, ) else: - return self._set_name(index, inplace=inplace, deep=copy) + return self._set_name(index, inplace=inplace) @Appender( """ @@ -5103,7 +4822,7 @@ def set_axis( labels, *, axis: Axis = 0, - copy: bool | None = None, + copy: bool | lib.NoDefault = lib.no_default, ) -> Series: return super().set_axis(labels, axis=axis, copy=copy) @@ -5119,7 +4838,7 @@ def reindex( # type: ignore[override] *, axis: Axis | None = None, method: ReindexMethod | None = None, - copy: bool | None = None, + copy: bool | lib.NoDefault = lib.no_default, level: Level | None = None, fill_value: Scalar | None = None, limit: int | None = None, @@ -5128,11 +4847,11 @@ def reindex( # type: ignore[override] return super().reindex( index=index, method=method, - copy=copy, level=level, fill_value=fill_value, limit=limit, tolerance=tolerance, + copy=copy, ) @overload # type: ignore[override] @@ -5142,10 +4861,9 @@ def rename_axis( *, index=..., axis: Axis = ..., - copy: bool = ..., + copy: bool | lib.NoDefault = ..., inplace: Literal[True], - ) -> None: - ... + ) -> None: ... @overload def rename_axis( @@ -5154,10 +4872,9 @@ def rename_axis( *, index=..., axis: Axis = ..., - copy: bool = ..., + copy: bool | lib.NoDefault = ..., inplace: Literal[False] = ..., - ) -> Self: - ... + ) -> Self: ... @overload def rename_axis( @@ -5166,78 +4883,134 @@ def rename_axis( *, index=..., axis: Axis = ..., - copy: bool = ..., + copy: bool | lib.NoDefault = ..., inplace: bool = ..., - ) -> Self | None: - ... + ) -> Self | None: ... - @doc(NDFrame.rename_axis) def rename_axis( self, mapper: IndexLabel | lib.NoDefault = lib.no_default, *, index=lib.no_default, axis: Axis = 0, - copy: bool = True, + copy: bool | lib.NoDefault = lib.no_default, inplace: bool = False, ) -> Self | None: + """ + Set the name of the axis for the index. + + Parameters + ---------- + mapper : scalar, list-like, optional + Value to set the axis name attribute. + + Use either ``mapper`` and ``axis`` to + specify the axis to target with ``mapper``, or ``index``. + + index : scalar, list-like, dict-like or function, optional + A scalar, list-like, dict-like or functions transformations to + apply to that axis' values. + axis : {0 or 'index'}, default 0 + The axis to rename. For `Series` this parameter is unused and defaults to 0. + copy : bool, default False + Also copy underlying data. + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` + inplace : bool, default False + Modifies the object directly, instead of creating a new Series + or DataFrame. + + Returns + ------- + Series, or None + The same type as the caller or None if ``inplace=True``. + + See Also + -------- + Series.rename : Alter Series index labels or name. + DataFrame.rename : Alter DataFrame index labels or name. + Index.rename : Set new names on index. 
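As the dispatch above shows, ``rename`` still has two shapes even with ``copy`` deprecated: a dict-like or callable relabels the index via ``super()._rename``, while any other hashable sets the Series name via ``_set_name``. A minimal sketch of both, with illustrative data:

>>> s = pd.Series([1, 2, 3])
>>> s.rename("my_name").name
'my_name'
>>> s.rename({0: 10}).index.tolist()
[10, 1, 2]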
+ + Examples + -------- + + >>> s = pd.Series(["dog", "cat", "monkey"]) + >>> s + 0 dog + 1 cat + 2 monkey + dtype: object + >>> s.rename_axis("animal") + animal + 0 dog + 1 cat + 2 monkey + dtype: object + """ return super().rename_axis( mapper=mapper, index=index, axis=axis, - copy=copy, inplace=inplace, + copy=copy, ) @overload def drop( self, - labels: IndexLabel = ..., + labels: IndexLabel | ListLike = ..., *, axis: Axis = ..., - index: IndexLabel = ..., - columns: IndexLabel = ..., + index: IndexLabel | ListLike = ..., + columns: IndexLabel | ListLike = ..., level: Level | None = ..., inplace: Literal[True], errors: IgnoreRaise = ..., - ) -> None: - ... + ) -> None: ... @overload def drop( self, - labels: IndexLabel = ..., + labels: IndexLabel | ListLike = ..., *, axis: Axis = ..., - index: IndexLabel = ..., - columns: IndexLabel = ..., + index: IndexLabel | ListLike = ..., + columns: IndexLabel | ListLike = ..., level: Level | None = ..., inplace: Literal[False] = ..., errors: IgnoreRaise = ..., - ) -> Series: - ... + ) -> Series: ... @overload def drop( self, - labels: IndexLabel = ..., + labels: IndexLabel | ListLike = ..., *, axis: Axis = ..., - index: IndexLabel = ..., - columns: IndexLabel = ..., + index: IndexLabel | ListLike = ..., + columns: IndexLabel | ListLike = ..., level: Level | None = ..., inplace: bool = ..., errors: IgnoreRaise = ..., - ) -> Series | None: - ... + ) -> Series | None: ... def drop( self, - labels: IndexLabel | None = None, + labels: IndexLabel | ListLike = None, *, axis: Axis = 0, - index: IndexLabel | None = None, - columns: IndexLabel | None = None, + index: IndexLabel | ListLike = None, + columns: IndexLabel | ListLike = None, level: Level | None = None, inplace: bool = False, errors: IgnoreRaise = "raise", @@ -5286,27 +5059,26 @@ def drop( Examples -------- - >>> s = pd.Series(data=np.arange(3), index=['A', 'B', 'C']) + >>> s = pd.Series(data=np.arange(3), index=["A", "B", "C"]) >>> s A 0 B 1 C 2 dtype: int64 - Drop labels B en C + Drop labels B and C - >>> s.drop(labels=['B', 'C']) + >>> s.drop(labels=["B", "C"]) A 0 dtype: int64 Drop 2nd level label in MultiIndex Series - >>> midx = pd.MultiIndex(levels=[['llama', 'cow', 'falcon'], - ... ['speed', 'weight', 'length']], - ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], - ... [0, 1, 2, 0, 1, 2, 0, 1, 2]]) - >>> s = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], - ... index=midx) + >>> midx = pd.MultiIndex( + ... levels=[["llama", "cow", "falcon"], ["speed", "weight", "length"]], + ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], + ... ) + >>> s = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx) >>> s llama speed 45.0 weight 200.0 @@ -5319,7 +5091,7 @@ def drop( length 0.3 dtype: float64 - >>> s.drop(labels='weight', level=1) + >>> s.drop(labels="weight", level=1) llama speed 45.0 length 1.2 cow speed 30.0 @@ -5349,7 +5121,13 @@ def pop(self, item: Hashable) -> Any: Returns ------- - Value that is popped from series. + scalar + Value that is popped from series. + + See Also + -------- + Series.drop: Drop specified values from Series. + Series.drop_duplicates: Return Series with duplicate values removed. 
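The ``ListLike`` widening in the ``drop`` overloads above admits array-likes such as ``Index`` objects for ``labels``, not just scalars and lists. A small sketch with hypothetical labels:

>>> s = pd.Series(range(4), index=list("abcd"))
>>> s.drop(pd.Index(["b", "d"]))
a    0
c    2
dtype: int64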
Examples -------- >>> s = pd.Series([1, 2, 3]) >>> s.pop(0) 1 >>> s 1 2 2 3 dtype: int64 @@ -5381,29 +5159,6 @@ def info( show_counts=show_counts, ) - # TODO(3.0): this can be removed once GH#33302 deprecation is enforced - def _replace_single(self, to_replace, method: str, inplace: bool, limit): - """ - Replaces values in a Series using the fill method specified when no - replacement value is given in the replace method - """ - - result = self if inplace else self.copy() - - values = result._values - mask = missing.mask_missing(values, to_replace) - - if isinstance(values, ExtensionArray): - # dispatch to the EA's _pad_mask_inplace method - values._fill_mask_inplace(method, limit, mask) - else: - fill_f = missing.get_fill_func(method) - fill_f(values, limit=limit, mask=mask) - - if inplace: - return - return result - def memory_usage(self, index: bool = True, deep: bool = False) -> int: """ Return the memory usage of the Series. @@ -5488,9 +5243,10 @@ def isin(self, values) -> Series: Examples -------- - >>> s = pd.Series(['llama', 'cow', 'llama', 'beetle', 'llama', - ... 'hippo'], name='animal') - >>> s.isin(['cow', 'llama']) + >>> s = pd.Series( + ... ["llama", "cow", "llama", "beetle", "llama", "hippo"], name="animal" + ... ) + >>> s.isin(["cow", "llama"]) 0 True 1 True 2 True 3 False 4 True 5 False Name: animal, dtype: bool To invert the boolean values, use the ``~`` operator: - >>> ~s.isin(['cow', 'llama']) + >>> ~s.isin(["cow", "llama"]) 0 False 1 False 2 False 3 True 4 False 5 True Name: animal, dtype: bool Passing a single string as ``s.isin('llama')`` will raise an error. Use a list of one element instead: - >>> s.isin(['llama']) + >>> s.isin(["llama"]) 0 True 1 False 2 True 3 False 4 True 5 False Name: animal, dtype: bool Strings and integers are distinct and are therefore not comparable: - >>> pd.Series([1]).isin(['1']) + >>> pd.Series([1]).isin(["1"]) 0 False dtype: bool - >>> pd.Series([1.1]).isin(['1.1']) + >>> pd.Series([1.1]).isin(["1.1"]) 0 False dtype: bool """ @@ -5601,8 +5357,8 @@ def between( `left` and `right` can be any scalar value: - >>> s = pd.Series(['Alice', 'Bob', 'Carol', 'Eve']) - >>> s.between('Anna', 'Daniel') + >>> s = pd.Series(["Alice", "Bob", "Carol", "Eve"]) + >>> s.between("Anna", "Daniel") 0 False 1 True 2 True 3 False dtype: bool @@ -5629,6 +5385,126 @@ def between( return lmask & rmask + def case_when( + self, + caselist: list[ + tuple[ + ArrayLike | Callable[[Series], Series | np.ndarray | Sequence[bool]], + ArrayLike | Scalar | Callable[[Series], Series | np.ndarray], + ], + ], + ) -> Series: + """ + Replace values where the conditions are True. + + .. versionadded:: 2.2.0 + + Parameters + ---------- + caselist : A list of tuples of conditions and expected replacements + Takes the form: ``(condition0, replacement0)``, + ``(condition1, replacement1)``, ... . + ``condition`` should be a 1-D boolean array-like object + or a callable. If ``condition`` is a callable, + it is computed on the Series + and should return a boolean Series or array. + The callable must not change the input Series + (though pandas doesn't check it). ``replacement`` should be a + 1-D array-like object, a scalar or a callable. + If ``replacement`` is a callable, it is computed on the Series + and should return a scalar or Series. The callable + must not change the input Series + (though pandas doesn't check it). + + Returns + ------- + Series + A new Series with values replaced based on the provided conditions. + + See Also + -------- + Series.mask : Replace values where the condition is True.
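Because the parameter description above allows a callable in either slot of a ``caselist`` tuple, a condition can be computed from the Series itself. A minimal sketch with a callable condition and a scalar replacement, using illustrative data:

>>> s = pd.Series([1, 2, 3])
>>> s.case_when([(lambda x: x > 2, 99)])
0     1
1     2
2    99
dtype: int64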
+ + Examples + -------- + >>> c = pd.Series([6, 7, 8, 9], name="c") + >>> a = pd.Series([0, 0, 1, 2]) + >>> b = pd.Series([0, 3, 4, 5]) + + >>> c.case_when( + ... caselist=[ + ... (a.gt(0), a), # condition, replacement + ... (b.gt(0), b), + ... ] + ... ) + 0 6 + 1 3 + 2 1 + 3 2 + Name: c, dtype: int64 + """ + if not isinstance(caselist, list): + raise TypeError( + f"The caselist argument should be a list; instead got {type(caselist)}" + ) + + if not caselist: + raise ValueError( + "provide at least one boolean condition, " + "with a corresponding replacement." + ) + + for num, entry in enumerate(caselist): + if not isinstance(entry, tuple): + raise TypeError( + f"Argument {num} must be a tuple; instead got {type(entry)}." + ) + if len(entry) != 2: + raise ValueError( + f"Argument {num} must have length 2; " + "a condition and replacement; " + f"instead got length {len(entry)}." + ) + caselist = [ + ( + com.apply_if_callable(condition, self), + com.apply_if_callable(replacement, self), + ) + for condition, replacement in caselist + ] + default = self.copy(deep=False) + conditions, replacements = zip(*caselist) + common_dtypes = [infer_dtype_from(arg)[0] for arg in [*replacements, default]] + if len(set(common_dtypes)) > 1: + common_dtype = find_common_type(common_dtypes) + updated_replacements = [] + for condition, replacement in zip(conditions, replacements): + if is_scalar(replacement): + replacement = construct_1d_arraylike_from_scalar( + value=replacement, length=len(condition), dtype=common_dtype + ) + elif isinstance(replacement, ABCSeries): + replacement = replacement.astype(common_dtype) + else: + replacement = pd_array(replacement, dtype=common_dtype) + updated_replacements.append(replacement) + replacements = updated_replacements + default = default.astype(common_dtype) + + counter = range(len(conditions) - 1, -1, -1) + for position, condition, replacement in zip( + counter, reversed(conditions), reversed(replacements) + ): + try: + default = default.mask( + condition, other=replacement, axis=0, inplace=False, level=None + ) + except Exception as error: + raise ValueError( + f"Failed to apply condition{position} and replacement{position}." + ) from error + return default + # error: Cannot determine type of 'isna' @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) # type: ignore[has-type] def isna(self) -> Series: @@ -5663,8 +5539,7 @@ def dropna( inplace: Literal[False] = ..., how: AnyAll | None = ..., ignore_index: bool = ..., - ) -> Series: - ... + ) -> Series: ... @overload def dropna( @@ -5674,8 +5549,7 @@ def dropna( inplace: Literal[True], how: AnyAll | None = ..., ignore_index: bool = ..., - ) -> None: - ... + ) -> None: ... def dropna( self, @@ -5719,7 +5593,7 @@ def dropna( Examples -------- - >>> ser = pd.Series([1., 2., np.nan]) + >>> ser = pd.Series([1.0, 2.0, np.nan]) >>> ser 0 1.0 1 2.0 @@ -5736,7 +5610,7 @@ def dropna( Empty strings are not considered NA values. ``None`` is considered an NA value. 
- >>> ser = pd.Series([np.nan, 2, pd.NaT, '', None, 'I stay']) + >>> ser = pd.Series([np.nan, 2, pd.NaT, "", None, "I stay"]) >>> ser 0 NaN 1 2 @@ -5760,7 +5634,7 @@ def dropna( result = remove_na_arraylike(self) else: if not inplace: - result = self.copy(deep=None) + result = self.copy(deep=False) else: result = self @@ -5779,11 +5653,13 @@ def to_timestamp( self, freq: Frequency | None = None, how: Literal["s", "e", "start", "end"] = "start", - copy: bool | None = None, + copy: bool | lib.NoDefault = lib.no_default, ) -> Series: """ Cast to DatetimeIndex of Timestamps, at *beginning* of period. + This can be changed to the *end* of the period, by specifying `how="e"`. + Parameters ---------- freq : str, default frequency of PeriodIndex @@ -5791,7 +5667,7 @@ def to_timestamp( how : {'s', 'e', 'start', 'end'} Convention for converting period to timestamp; start of period vs. end. - copy : bool, default True + copy : bool, default False Whether or not to return a copy. .. note:: @@ -5806,13 +5682,21 @@ def to_timestamp( You can already get the future behavior and improvements through enabling copy on write ``pd.options.mode.copy_on_write = True`` + .. deprecated:: 3.0.0 + Returns ------- Series with DatetimeIndex + Series with the PeriodIndex cast to DatetimeIndex. - Examples + See Also + -------- + Series.to_period: Inverse method to cast DatetimeIndex to PeriodIndex. + DataFrame.to_timestamp: Equivalent method for DataFrame. + + Examples -------- - >>> idx = pd.PeriodIndex(['2023', '2024', '2025'], freq='Y') + >>> idx = pd.PeriodIndex(["2023", "2024", "2025"], freq="Y") >>> s1 = pd.Series([1, 2, 3], index=idx) >>> s1 2023 1 @@ -5832,22 +5716,27 @@ def to_timestamp( Using `freq` which is the offset that the Timestamps will have >>> s2 = pd.Series([1, 2, 3], index=idx) - >>> s2 = s2.to_timestamp(freq='M') + >>> s2 = s2.to_timestamp(freq="M") >>> s2 2023-01-31 1 2024-01-31 2 2025-01-31 3 Freq: YE-JAN, dtype: int64 """ + self._check_copy_deprecation(copy) if not isinstance(self.index, PeriodIndex): raise TypeError(f"unsupported Type {type(self.index).__name__}") - new_obj = self.copy(deep=copy and not using_copy_on_write()) + new_obj = self.copy(deep=False) new_index = self.index.to_timestamp(freq=freq, how=how) setattr(new_obj, "index", new_index) return new_obj - def to_period(self, freq: str | None = None, copy: bool | None = None) -> Series: + def to_period( + self, + freq: str | None = None, + copy: bool | lib.NoDefault = lib.no_default, + ) -> Series: """ Convert Series from DatetimeIndex to PeriodIndex. @@ -5855,7 +5744,7 @@ def to_period(self, freq: str | None = None, copy: bool | None = None) -> Series ---------- freq : str, default None Frequency associated with the PeriodIndex. - copy : bool, default True + copy : bool, default False Whether or not to return a copy. .. note:: @@ -5870,14 +5759,21 @@ def to_period(self, freq: str | None = None, copy: bool | None = None) -> Series You can already get the future behavior and improvements through enabling copy on write ``pd.options.mode.copy_on_write = True`` + .. deprecated:: 3.0.0 + Returns ------- Series Series with index converted to PeriodIndex. + See Also + -------- + DataFrame.to_period: Equivalent method for DataFrame. + Series.dt.to_period: Convert DateTime column values. 
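The new opening line for ``to_timestamp`` above mentions ``how="e"``; a one-line sketch of the end-of-period anchor it selects (nanosecond resolution assumed):

>>> idx = pd.PeriodIndex(["2023"], freq="Y")
>>> pd.Series([1], index=idx).to_timestamp(how="e").index[0]
Timestamp('2023-12-31 23:59:59.999999999')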
+ Examples -------- - >>> idx = pd.DatetimeIndex(['2023', '2024', '2025']) + >>> idx = pd.DatetimeIndex(["2023", "2024", "2025"]) >>> s = pd.Series([1, 2, 3], index=idx) >>> s = s.to_period() >>> s @@ -5891,10 +5787,11 @@ def to_period(self, freq: str | None = None, copy: bool | None = None) -> Series >>> s.index PeriodIndex(['2023', '2024', '2025'], dtype='period[Y-DEC]') """ + self._check_copy_deprecation(copy) if not isinstance(self.index, DatetimeIndex): raise TypeError(f"unsupported Type {type(self.index).__name__}") - new_obj = self.copy(deep=copy and not using_copy_on_write()) + new_obj = self.copy(deep=False) new_index = self.index.to_period(freq=freq) setattr(new_obj, "index", new_index) return new_obj @@ -5952,13 +5849,13 @@ def to_period(self, freq: str | None = None, copy: bool | None = None) -> Series # ---------------------------------------------------------------------- # Accessor Methods # ---------------------------------------------------------------------- - str = CachedAccessor("str", StringMethods) - dt = CachedAccessor("dt", CombinedDatetimelikeProperties) - cat = CachedAccessor("cat", CategoricalAccessor) - plot = CachedAccessor("plot", pandas.plotting.PlotAccessor) - sparse = CachedAccessor("sparse", SparseAccessor) - struct = CachedAccessor("struct", StructAccessor) - list = CachedAccessor("list", ListAccessor) + str = Accessor("str", StringMethods) + dt = Accessor("dt", CombinedDatetimelikeProperties) + cat = Accessor("cat", CategoricalAccessor) + plot = Accessor("plot", pandas.plotting.PlotAccessor) + sparse = Accessor("sparse", SparseAccessor) + struct = Accessor("struct", StructAccessor) + list = Accessor("list", ListAccessor) # ---------------------------------------------------------------------- # Add plotting methods to Series @@ -5978,7 +5875,7 @@ def _cmp_method(self, other, op): res_values = ops.comparison_op(lvalues, rvalues, op) - return self._construct_result(res_values, name=res_name) + return self._construct_result(res_values, name=res_name, other=other) def _logical_method(self, other, op): res_name = ops.get_op_result_name(self, other) @@ -5988,7 +5885,7 @@ def _logical_method(self, other, op): rvalues = extract_array(other, extract_numpy=True, extract_range=True) res_values = ops.logical_op(lvalues, rvalues, op) - return self._construct_result(res_values, name=res_name) + return self._construct_result(res_values, name=res_name, other=other) def _arith_method(self, other, op): self, other = self._align_for_op(other) @@ -6009,19 +5906,14 @@ def _align_for_op(self, right, align_asobject: bool = False): object, np.bool_, ): - warnings.warn( - "Operation between non boolean Series with different " - "indexes will no longer return a boolean result in " - "a future version. 
Cast both Series to object type " - "to maintain the prior behavior.", - FutureWarning, - stacklevel=find_stack_level(), - ) - # to keep original value's dtype for bool ops - left = left.astype(object) - right = right.astype(object) - - left, right = left.align(right, copy=False) + pass + # GH#52538 no longer cast in these cases + else: + # to keep original value's dtype for bool ops + left = left.astype(object) + right = right.astype(object) + + left, right = left.align(right) return left, right @@ -6047,7 +5939,7 @@ def _binop(self, other: Series, func, level=None, fill_value=None) -> Series: this = self if not self.index.equals(other.index): - this, other = self.align(other, level=level, join="outer", copy=False) + this, other = self.align(other, level=level, join="outer") this_vals, other_vals = ops.fill_binop(this._values, other._values, fill_value) @@ -6055,11 +5947,15 @@ def _binop(self, other: Series, func, level=None, fill_value=None) -> Series: result = func(this_vals, other_vals) name = ops.get_op_result_name(self, other) - out = this._construct_result(result, name) + + out = this._construct_result(result, name, other) return cast(Series, out) def _construct_result( - self, result: ArrayLike | tuple[ArrayLike, ArrayLike], name: Hashable + self, + result: ArrayLike | tuple[ArrayLike, ArrayLike], + name: Hashable, + other: AnyArrayLike | DataFrame, ) -> Series | tuple[Series, Series]: """ Construct an appropriately-labelled Series from the result of an op. @@ -6068,6 +5964,7 @@ def _construct_result( ---------- result : ndarray or ExtensionArray name : Label + other : Series, DataFrame or array-like Returns ------- @@ -6077,8 +5974,8 @@ def _construct_result( if isinstance(result, tuple): # produced by divmod or rdivmod - res1 = self._construct_result(result[0], name=name) - res2 = self._construct_result(result[1], name=name) + res1 = self._construct_result(result[0], name=name, other=other) + res2 = self._construct_result(result[1], name=name, other=other) # GH#33427 assertions to keep mypy happy assert isinstance(res1, Series) @@ -6090,6 +5987,7 @@ def _construct_result( dtype = getattr(result, "dtype", None) out = self._constructor(result, index=self.index, dtype=dtype, copy=False) out = out.__finalize__(self) + out = out.__finalize__(other) # Set the result's name after __finalize__ is called because __finalize__ # would set it back to self.name @@ -6119,7 +6017,6 @@ def _flex_method(self, other, op, *, level=None, fill_value=None, axis: Axis = 0 return op(self, other) - @Appender(ops.make_flex_doc("eq", "series")) def eq( self, other, @@ -6127,6 +6024,63 @@ def eq( fill_value: float | None = None, axis: Axis = 0, ) -> Series: + """ + Return Equal to of series and other, element-wise (binary operator `eq`). + + Equivalent to ``series == other``, but with support to substitute a fill_value + for missing data in either one of the inputs. + + Parameters + ---------- + other : Series or scalar value + The second operand in this operation. + level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level. + fill_value : None or float value, default None (NaN) + Fill existing missing (NaN) values, and any new element needed for + successful Series alignment, with this value before computation. + If data in both corresponding Series locations is missing + the result of filling (at that location) will be missing. + axis : {0 or 'index'} + Unused. Parameter needed for compatibility with DataFrame. 
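The rewritten ``_align_for_op`` above keeps the object-cast branch for boolean inputs, so logical operations on misaligned bool Series still treat missing positions as ``False`` and return a bool result. A sketch of that retained behavior, with illustrative data:

>>> a = pd.Series([True, False])
>>> b = pd.Series([True, True], index=[1, 2])
>>> a & b
0    False
1    False
2    False
dtype: bool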
+ + Returns + ------- + Series + The result of the operation. + + See Also + -------- + Series.ge : Return elementwise Greater than or equal to of series and other. + Series.le : Return elementwise Less than or equal to of series and other. + Series.gt : Return elementwise Greater than of series and other. + Series.lt : Return elementwise Less than of series and other. + + Examples + -------- + >>> a = pd.Series([1, 1, 1, np.nan], index=["a", "b", "c", "d"]) + >>> a + a 1.0 + b 1.0 + c 1.0 + d NaN + dtype: float64 + >>> b = pd.Series([1, np.nan, 1, np.nan], index=["a", "b", "d", "e"]) + >>> b + a 1.0 + b NaN + d 1.0 + e NaN + dtype: float64 + >>> a.eq(b, fill_value=0) + a True + b False + c False + d False + e False + dtype: bool + """ return self._flex_method( other, operator.eq, level=level, fill_value=fill_value, axis=axis ) @@ -6137,8 +6091,68 @@ def ne(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: other, operator.ne, level=level, fill_value=fill_value, axis=axis ) - @Appender(ops.make_flex_doc("le", "series")) def le(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: + """ + Return Less than or equal to of series and other, \ + element-wise (binary operator `le`). + + Equivalent to ``series <= other``, but with support to substitute a + fill_value for missing data in either one of the inputs. + + Parameters + ---------- + other : Series or scalar value + The second operand in this operation. + level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level. + fill_value : None or float value, default None (NaN) + Fill existing missing (NaN) values, and any new element needed for + successful Series alignment, with this value before computation. + If data in both corresponding Series locations is missing + the result of filling (at that location) will be missing. + axis : {0 or 'index'} + Unused. Parameter needed for compatibility with DataFrame. + + Returns + ------- + Series + The result of the operation. + + See Also + -------- + Series.ge : Return elementwise Greater than or equal to of series and other. + Series.lt : Return elementwise Less than of series and other. + Series.gt : Return elementwise Greater than of series and other. + Series.eq : Return elementwise equal to of series and other. + + Examples + -------- + >>> a = pd.Series([1, 1, 1, np.nan, 1], index=['a', 'b', 'c', 'd', 'e']) + >>> a + a 1.0 + b 1.0 + c 1.0 + d NaN + e 1.0 + dtype: float64 + >>> b = pd.Series([0, 1, 2, np.nan, 1], index=['a', 'b', 'c', 'd', 'f']) + >>> b + a 0.0 + b 1.0 + c 2.0 + d NaN + f 1.0 + dtype: float64 + >>> a.le(b, fill_value=0) + a False + b True + c True + d False + e False + f True + dtype: bool + """ return self._flex_method( other, operator.le, level=level, fill_value=fill_value, axis=axis ) @@ -6149,8 +6163,69 @@ def lt(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: other, operator.lt, level=level, fill_value=fill_value, axis=axis ) - @Appender(ops.make_flex_doc("ge", "series")) def ge(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: + """ + Return Greater than or equal to of series and other, \ + element-wise (binary operator `ge`). + + Equivalent to ``series >= other``, but with support to substitute a + fill_value for missing data in either one of the inputs. + + Parameters + ---------- + other : Series or scalar value + The second operand in this operation. + level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level. 
+ fill_value : None or float value, default None (NaN) + Fill existing missing (NaN) values, and any new element needed for + successful Series alignment, with this value before computation. + If data in both corresponding Series locations is missing + the result of filling (at that location) will be missing. + axis : {0 or 'index'} + Unused. Parameter needed for compatibility with DataFrame. + + Returns + ------- + Series + The result of the operation. + + See Also + -------- + Series.gt : Greater than comparison, element-wise. + Series.le : Less than or equal to comparison, element-wise. + Series.lt : Less than comparison, element-wise. + Series.eq : Equal to comparison, element-wise. + Series.ne : Not equal to comparison, element-wise. + + Examples + -------- + >>> a = pd.Series([1, 1, 1, np.nan, 1], index=["a", "b", "c", "d", "e"]) + >>> a + a 1.0 + b 1.0 + c 1.0 + d NaN + e 1.0 + dtype: float64 + >>> b = pd.Series([0, 1, 2, np.nan, 1], index=["a", "b", "c", "d", "f"]) + >>> b + a 0.0 + b 1.0 + c 2.0 + d NaN + f 1.0 + dtype: float64 + >>> a.ge(b, fill_value=0) + a True + b True + c False + d False + e True + f False + dtype: bool + """ return self._flex_method( other, operator.ge, level=level, fill_value=fill_value, axis=axis ) @@ -6161,8 +6236,64 @@ def gt(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: other, operator.gt, level=level, fill_value=fill_value, axis=axis ) - @Appender(ops.make_flex_doc("add", "series")) def add(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: + """ + Return Addition of series and other, element-wise (binary operator `add`). + + Equivalent to ``series + other``, but with support to substitute a fill_value + for missing data in either one of the inputs. + + Parameters + ---------- + other : Series or scalar value + With which to compute the addition. + level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level. + fill_value : None or float value, default None (NaN) + Fill existing missing (NaN) values, and any new element needed for + successful Series alignment, with this value before computation. + If data in both corresponding Series locations is missing + the result of filling (at that location) will be missing. + axis : {0 or 'index'} + Unused. Parameter needed for compatibility with DataFrame. + + Returns + ------- + Series + The result of the operation. + + See Also + -------- + Series.radd : Reverse of the Addition operator, see + `Python documentation + `_ + for more details. + + Examples + -------- + >>> a = pd.Series([1, 1, 1, np.nan], index=["a", "b", "c", "d"]) + >>> a + a 1.0 + b 1.0 + c 1.0 + d NaN + dtype: float64 + >>> b = pd.Series([1, np.nan, 1, np.nan], index=["a", "b", "d", "e"]) + >>> b + a 1.0 + b NaN + d 1.0 + e NaN + dtype: float64 + >>> a.add(b, fill_value=0) + a 2.0 + b 1.0 + c 1.0 + d 1.0 + e NaN + dtype: float64 + """ return self._flex_method( other, operator.add, level=level, fill_value=fill_value, axis=axis ) @@ -6187,7 +6318,6 @@ def rsub(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: other, roperator.rsub, level=level, fill_value=fill_value, axis=axis ) - @Appender(ops.make_flex_doc("mul", "series")) def mul( self, other, @@ -6195,6 +6325,69 @@ def mul( fill_value: float | None = None, axis: Axis = 0, ) -> Series: + """ + Return Multiplication of series and other, element-wise (binary operator `mul`). 
+ + Equivalent to ``series * other``, but with support to substitute + a fill_value for missing data in either one of the inputs. + + Parameters + ---------- + other : Series or scalar value + With which to compute the multiplication. + level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level. + fill_value : None or float value, default None (NaN) + Fill existing missing (NaN) values, and any new element needed for + successful Series alignment, with this value before computation. + If data in both corresponding Series locations is missing + the result of filling (at that location) will be missing. + axis : {0 or 'index'} + Unused. Parameter needed for compatibility with DataFrame. + + Returns + ------- + Series + The result of the operation. + + See Also + -------- + Series.rmul : Reverse of the Multiplication operator, see + `Python documentation + `_ + for more details. + + Examples + -------- + >>> a = pd.Series([1, 1, 1, np.nan], index=["a", "b", "c", "d"]) + >>> a + a 1.0 + b 1.0 + c 1.0 + d NaN + dtype: float64 + >>> b = pd.Series([1, np.nan, 1, np.nan], index=["a", "b", "d", "e"]) + >>> b + a 1.0 + b NaN + d 1.0 + e NaN + dtype: float64 + >>> a.multiply(b, fill_value=0) + a 1.0 + b 0.0 + c 0.0 + d 0.0 + e NaN + dtype: float64 + >>> a.mul(5, fill_value=0) + a 5.0 + b 5.0 + c 5.0 + d 0.0 + dtype: float64 + """ return self._flex_method( other, operator.mul, level=level, fill_value=fill_value, axis=axis ) @@ -6207,8 +6400,65 @@ def rmul(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: other, roperator.rmul, level=level, fill_value=fill_value, axis=axis ) - @Appender(ops.make_flex_doc("truediv", "series")) def truediv(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: + """ + Return Floating division of series and other, \ + element-wise (binary operator `truediv`). + + Equivalent to ``series / other``, but with support to substitute a + fill_value for missing data in either one of the inputs. + + Parameters + ---------- + other : Series or scalar value + Series with which to compute division. + level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level. + fill_value : None or float value, default None (NaN) + Fill existing missing (NaN) values, and any new element needed for + successful Series alignment, with this value before computation. + If data in both corresponding Series locations is missing + the result of filling (at that location) will be missing. + axis : {0 or 'index'} + Unused. Parameter needed for compatibility with DataFrame. + + Returns + ------- + Series + The result of the operation. + + See Also + -------- + Series.rtruediv : Reverse of the Floating division operator, see + `Python documentation + `_ + for more details. 
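Alongside the Series-operand examples that follow, a one-line sketch of the scalar path through the same ``_flex_method`` dispatch:

>>> pd.Series([2.0, 4.0]).truediv(2)
0    1.0
1    2.0
dtype: float64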
+ + Examples + -------- + >>> a = pd.Series([1, 1, 1, np.nan], index=["a", "b", "c", "d"]) + >>> a + a 1.0 + b 1.0 + c 1.0 + d NaN + dtype: float64 + >>> b = pd.Series([1, np.nan, 1, np.nan], index=["a", "b", "d", "e"]) + >>> b + a 1.0 + b NaN + d 1.0 + e NaN + dtype: float64 + >>> a.divide(b, fill_value=0) + a 1.0 + b inf + c inf + d 0.0 + e NaN + dtype: float64 + """ return self._flex_method( other, operator.truediv, level=level, fill_value=fill_value, axis=axis ) @@ -6236,8 +6486,64 @@ def rfloordiv(self, other, level=None, fill_value=None, axis: Axis = 0) -> Serie other, roperator.rfloordiv, level=level, fill_value=fill_value, axis=axis ) - @Appender(ops.make_flex_doc("mod", "series")) def mod(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: + """ + Return Modulo of series and other, element-wise (binary operator `mod`). + + Equivalent to ``series % other``, but with support to substitute a + fill_value for missing data in either one of the inputs. + + Parameters + ---------- + other : Series or scalar value + Series with which to compute modulo. + level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level. + fill_value : None or float value, default None (NaN) + Fill existing missing (NaN) values, and any new element needed for + successful Series alignment, with this value before computation. + If data in both corresponding Series locations is missing + the result of filling (at that location) will be missing. + axis : {0 or 'index'} + Unused. Parameter needed for compatibility with DataFrame. + + Returns + ------- + Series + The result of the operation. + + See Also + -------- + Series.rmod : Reverse of the Modulo operator, see + `Python documentation + `_ + for more details. + + Examples + -------- + >>> a = pd.Series([1, 1, 1, np.nan], index=["a", "b", "c", "d"]) + >>> a + a 1.0 + b 1.0 + c 1.0 + d NaN + dtype: float64 + >>> b = pd.Series([1, np.nan, 1, np.nan], index=["a", "b", "d", "e"]) + >>> b + a 1.0 + b NaN + d 1.0 + e NaN + dtype: float64 + >>> a.mod(b, fill_value=0) + a 0.0 + b NaN + c NaN + d 0.0 + e NaN + dtype: float64 + """ return self._flex_method( other, operator.mod, level=level, fill_value=fill_value, axis=axis ) @@ -6337,6 +6643,7 @@ def any( # type: ignore[override] filter_type="bool", ) + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="all") @Appender(make_doc("all", ndim=1)) def all( self, @@ -6356,7 +6663,7 @@ def all( filter_type="bool", ) - @doc(make_doc("min", ndim=1)) + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="min") def min( self, axis: Axis | None = 0, @@ -6364,9 +6671,70 @@ def min( numeric_only: bool = False, **kwargs, ): - return NDFrame.min(self, axis, skipna, numeric_only, **kwargs) + """ + Return the minimum of the values over the requested axis. + + If you want the *index* of the minimum, use ``idxmin``. + This is the equivalent of the ``numpy.ndarray`` method ``argmin``. + + Parameters + ---------- + axis : {index (0)} + Axis for the function to be applied on. + For `Series` this parameter is unused and defaults to 0. + + For DataFrames, specifying ``axis=None`` will apply the aggregation + across both axes. + + .. versionadded:: 2.0.0 + + skipna : bool, default True + Exclude NA/null values when computing the result. + numeric_only : bool, default False + Include only float, int, boolean columns. + **kwargs + Additional keyword arguments to be passed to the function. 
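Given the ``deprecate_nonkeyword_arguments`` wrapper applied to ``min`` above (positional use is slated to be disallowed in 4.0), options such as ``skipna`` are best passed by keyword. A minimal sketch with illustrative data:

>>> s = pd.Series([1.0, np.nan])
>>> s.min(skipna=False)
nan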
+ + Returns + ------- + scalar or Series (if level specified) + The minimum of the values in the Series. + + See Also + -------- + numpy.min : Equivalent numpy function for arrays. + Series.min : Return the minimum. + Series.max : Return the maximum. + Series.idxmin : Return the index of the minimum. + Series.idxmax : Return the index of the maximum. + DataFrame.min : Return the minimum over the requested axis. + DataFrame.max : Return the maximum over the requested axis. + DataFrame.idxmin : Return the index of the minimum over the requested axis. + DataFrame.idxmax : Return the index of the maximum over the requested axis. - @doc(make_doc("max", ndim=1)) + Examples + -------- + >>> idx = pd.MultiIndex.from_arrays( + ... [["warm", "warm", "cold", "cold"], ["dog", "falcon", "fish", "spider"]], + ... names=["blooded", "animal"], + ... ) + >>> s = pd.Series([4, 2, 0, 8], name="legs", index=idx) + >>> s + blooded animal + warm dog 4 + falcon 2 + cold fish 0 + spider 8 + Name: legs, dtype: int64 + + >>> s.min() + 0 + """ + return NDFrame.min( + self, axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs + ) + + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="max") def max( self, axis: Axis | None = 0, @@ -6374,9 +6742,70 @@ def max( numeric_only: bool = False, **kwargs, ): - return NDFrame.max(self, axis, skipna, numeric_only, **kwargs) + """ + Return the maximum of the values over the requested axis. + + If you want the *index* of the maximum, use ``idxmax``. + This is the equivalent of the ``numpy.ndarray`` method ``argmax``. - @doc(make_doc("sum", ndim=1)) + Parameters + ---------- + axis : {index (0)} + Axis for the function to be applied on. + For `Series` this parameter is unused and defaults to 0. + + For DataFrames, specifying ``axis=None`` will apply the aggregation + across both axes. + + .. versionadded:: 2.0.0 + + skipna : bool, default True + Exclude NA/null values when computing the result. + numeric_only : bool, default False + Include only float, int, boolean columns. + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + scalar or Series (if level specified) + The maximum of the values in the Series. + + See Also + -------- + numpy.max : Equivalent numpy function for arrays. + Series.min : Return the minimum. + Series.max : Return the maximum. + Series.idxmin : Return the index of the minimum. + Series.idxmax : Return the index of the maximum. + DataFrame.min : Return the minimum over the requested axis. + DataFrame.max : Return the maximum over the requested axis. + DataFrame.idxmin : Return the index of the minimum over the requested axis. + DataFrame.idxmax : Return the index of the maximum over the requested axis. + + Examples + -------- + >>> idx = pd.MultiIndex.from_arrays( + ... [["warm", "warm", "cold", "cold"], ["dog", "falcon", "fish", "spider"]], + ... names=["blooded", "animal"], + ... ) + >>> s = pd.Series([4, 2, 0, 8], name="legs", index=idx) + >>> s + blooded animal + warm dog 4 + falcon 2 + cold fish 0 + spider 8 + Name: legs, dtype: int64 + + >>> s.max() + 8 + """ + return NDFrame.max( + self, axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs + ) + + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="sum") def sum( self, axis: Axis | None = None, @@ -6385,8 +6814,99 @@ def sum( min_count: int = 0, **kwargs, ): - return NDFrame.sum(self, axis, skipna, numeric_only, min_count, **kwargs) + """ + Return the sum of the values over the requested axis. 
+
+        This is equivalent to the method ``numpy.sum``.
+
+        Parameters
+        ----------
+        axis : {index (0)}
+            Axis for the function to be applied on.
+            For `Series` this parameter is unused and defaults to 0.
+
+            .. warning::
+
+                The behavior of DataFrame.sum with ``axis=None`` is deprecated,
+                in a future version this will reduce over both axes and return a
+                scalar. To retain the old behavior, pass axis=0 (or do not pass axis).
+
+            .. versionadded:: 2.0.0
+
+        skipna : bool, default True
+            Exclude NA/null values when computing the result.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns. Not implemented for Series.
+
+        min_count : int, default 0
+            The required number of valid values to perform the operation. If fewer than
+            ``min_count`` non-NA values are present the result will be NA.
+        **kwargs
+            Additional keyword arguments to be passed to the function.
+
+        Returns
+        -------
+        scalar or Series (if level specified)
+            Sum of the values for the requested axis.
+
+        See Also
+        --------
+        numpy.sum : Equivalent numpy function for computing sum.
+        Series.mean : Mean of the values.
+        Series.median : Median of the values.
+        Series.std : Standard deviation of the values.
+        Series.var : Variance of the values.
+        Series.min : Minimum value.
+        Series.max : Maximum value.
+
+        Examples
+        --------
+        >>> idx = pd.MultiIndex.from_arrays(
+        ...     [["warm", "warm", "cold", "cold"], ["dog", "falcon", "fish", "spider"]],
+        ...     names=["blooded", "animal"],
+        ... )
+        >>> s = pd.Series([4, 2, 0, 8], name="legs", index=idx)
+        >>> s
+        blooded  animal
+        warm     dog       4
+                 falcon    2
+        cold     fish      0
+                 spider    8
+        Name: legs, dtype: int64
+
+        >>> s.sum()
+        14
+
+        By default, the sum of an empty or all-NA Series is ``0``.
+
+        >>> pd.Series([], dtype="float64").sum()  # min_count=0 is the default
+        0.0
+
+        This can be controlled with the ``min_count`` parameter. For example, if
+        you'd like the sum of an empty series to be NaN, pass ``min_count=1``.
+
+        >>> pd.Series([], dtype="float64").sum(min_count=1)
+        nan
+
+        Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
+        empty series identically.
+
+        >>> pd.Series([np.nan]).sum()
+        0.0
+
+        >>> pd.Series([np.nan]).sum(min_count=1)
+        nan
+        """
+        return NDFrame.sum(
+            self,
+            axis=axis,
+            skipna=skipna,
+            numeric_only=numeric_only,
+            min_count=min_count,
+            **kwargs,
+        )
+
+    @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="prod")
     @doc(make_doc("prod", ndim=1))
     def prod(
         self,
@@ -6396,28 +6916,151 @@ def prod(
         min_count: int = 0,
         **kwargs,
     ):
-        return NDFrame.prod(self, axis, skipna, numeric_only, min_count, **kwargs)
+        return NDFrame.prod(
+            self,
+            axis=axis,
+            skipna=skipna,
+            numeric_only=numeric_only,
+            min_count=min_count,
+            **kwargs,
+        )

-    @doc(make_doc("mean", ndim=1))
+    @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="mean")
     def mean(
         self,
         axis: Axis | None = 0,
         skipna: bool = True,
         numeric_only: bool = False,
         **kwargs,
-    ):
-        return NDFrame.mean(self, axis, skipna, numeric_only, **kwargs)
+    ) -> Any:
+        """
+        Return the mean of the values over the requested axis.
+
+        Parameters
+        ----------
+        axis : {index (0)}
+            Axis for the function to be applied on.
+            For `Series` this parameter is unused and defaults to 0.
+
+            For DataFrames, specifying ``axis=None`` will apply the aggregation
+            across both axes.
+
+            .. versionadded:: 2.0.0
+
+        skipna : bool, default True
+            Exclude NA/null values when computing the result.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        **kwargs
+            Additional keyword arguments to be passed to the function.
+
+        Returns
+        -------
+        scalar or Series (if level specified)
+            Mean of the values for the requested axis.
+
+        See Also
+        --------
+        numpy.mean : Equivalent numpy function for computing the mean.
+        Series.sum : Sum of the values.
+        Series.median : Median of the values.
+        Series.std : Standard deviation of the values.
+        Series.var : Variance of the values.
+        Series.min : Minimum value.
+        Series.max : Maximum value.
+
+        Examples
+        --------
+        >>> s = pd.Series([1, 2, 3])
+        >>> s.mean()
+        2.0
+        """
+        return NDFrame.mean(
+            self, axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs
+        )

-    @doc(make_doc("median", ndim=1))
+    @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="median")
     def median(
         self,
         axis: Axis | None = 0,
         skipna: bool = True,
         numeric_only: bool = False,
         **kwargs,
-    ):
-        return NDFrame.median(self, axis, skipna, numeric_only, **kwargs)
+    ) -> Any:
+        """
+        Return the median of the values over the requested axis.
+
+        Parameters
+        ----------
+        axis : {index (0)}
+            Axis for the function to be applied on.
+            For `Series` this parameter is unused and defaults to 0.
+
+            For DataFrames, specifying ``axis=None`` will apply the aggregation
+            across both axes.
+
+            .. versionadded:: 2.0.0
+
+        skipna : bool, default True
+            Exclude NA/null values when computing the result.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        **kwargs
+            Additional keyword arguments to be passed to the function.
+
+        Returns
+        -------
+        scalar or Series (if level specified)
+            Median of the values for the requested axis.
+
+        See Also
+        --------
+        numpy.median : Equivalent numpy function for computing median.
+        Series.sum : Sum of the values.
+        Series.std : Standard deviation of the values.
+        Series.var : Variance of the values.
+        Series.min : Minimum value.
+        Series.max : Maximum value.
+
+        Examples
+        --------
+        >>> s = pd.Series([1, 2, 3])
+        >>> s.median()
+        2.0
+
+        With a DataFrame
+
+        >>> df = pd.DataFrame({"a": [1, 2], "b": [2, 3]}, index=["tiger", "zebra"])
+        >>> df
+               a   b
+        tiger  1   2
+        zebra  2   3
+        >>> df.median()
+        a   1.5
+        b   2.5
+        dtype: float64
+
+        Using axis=1
+
+        >>> df.median(axis=1)
+        tiger   1.5
+        zebra   2.5
+        dtype: float64
+
+        In this case, `numeric_only` should be set to `True`
+        to avoid getting an error.
+
+        >>> df = pd.DataFrame({"a": [1, 2], "b": ["T", "Z"]}, index=["tiger", "zebra"])
+        >>> df.median(numeric_only=True)
+        a   1.5
+        dtype: float64
+        """
+        return NDFrame.median(
+            self, axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs
+        )
+
+    @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="sem")
     @doc(make_doc("sem", ndim=1))
     def sem(
         self,
@@ -6427,9 +7070,16 @@ def sem(
         numeric_only: bool = False,
         **kwargs,
     ):
-        return NDFrame.sem(self, axis, skipna, ddof, numeric_only, **kwargs)
+        return NDFrame.sem(
+            self,
+            axis=axis,
+            skipna=skipna,
+            ddof=ddof,
+            numeric_only=numeric_only,
+            **kwargs,
+        )

-    @doc(make_doc("var", ndim=1))
+    @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="var")
     def var(
         self,
         axis: Axis | None = None,
@@ -6438,8 +7088,85 @@ def var(
         numeric_only: bool = False,
         **kwargs,
     ):
-        return NDFrame.var(self, axis, skipna, ddof, numeric_only, **kwargs)
+        """
+        Return unbiased variance over requested axis.
+
+        Normalized by N-1 by default. This can be changed using the ddof argument.
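To make the ``numeric_only`` behaviour from the median example concrete, a sketch using the docstring's mixed-dtype frame:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": ["T", "Z"]}, index=["tiger", "zebra"])

    df.median(numeric_only=True)  # only column "a" is aggregated -> a    1.5
    # df.median() without it raises a TypeError, since the string column
    # cannot be reduced.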
+        Parameters
+        ----------
+        axis : {index (0)}
+            For `Series` this parameter is unused and defaults to 0.
+
+            .. warning::
+
+                The behavior of DataFrame.var with ``axis=None`` is deprecated,
+                in a future version this will reduce over both axes and return a
+                scalar. To retain the old behavior, pass axis=0 (or do not pass axis).
+
+        skipna : bool, default True
+            Exclude NA/null values. If an entire row/column is NA, the result
+            will be NA.
+        ddof : int, default 1
+            Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
+            where N represents the number of elements.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns. Not implemented for Series.
+        **kwargs
+            Additional keyword arguments to be passed to the function.
+
+        Returns
+        -------
+        scalar or Series (if level specified)
+            Unbiased variance over requested axis.
+
+        See Also
+        --------
+        numpy.var : Equivalent function in NumPy.
+        Series.std : Returns the standard deviation of the Series.
+        DataFrame.var : Returns the variance of the DataFrame.
+        DataFrame.std : Return standard deviation of the values over
+            the requested axis.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "person_id": [0, 1, 2, 3],
+        ...         "age": [21, 25, 62, 43],
+        ...         "height": [1.61, 1.87, 1.49, 2.01],
+        ...     }
+        ... ).set_index("person_id")
+        >>> df
+                   age  height
+        person_id
+        0           21    1.61
+        1           25    1.87
+        2           62    1.49
+        3           43    2.01
+
+        >>> df.var()
+        age       352.916667
+        height      0.056367
+        dtype: float64
+
+        Alternatively, ``ddof=0`` can be set to normalize by N instead of N-1:
+
+        >>> df.var(ddof=0)
+        age       264.687500
+        height      0.042275
+        dtype: float64
+        """
+        return NDFrame.var(
+            self,
+            axis=axis,
+            skipna=skipna,
+            ddof=ddof,
+            numeric_only=numeric_only,
+            **kwargs,
+        )
+
+    @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="std")
     @doc(make_doc("std", ndim=1))
     def std(
         self,
@@ -6449,8 +7176,16 @@ def std(
         numeric_only: bool = False,
         **kwargs,
     ):
-        return NDFrame.std(self, axis, skipna, ddof, numeric_only, **kwargs)
+        return NDFrame.std(
+            self,
+            axis=axis,
+            skipna=skipna,
+            ddof=ddof,
+            numeric_only=numeric_only,
+            **kwargs,
+        )

+    @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="skew")
     @doc(make_doc("skew", ndim=1))
     def skew(
         self,
@@ -6459,9 +7194,11 @@ def skew(
         numeric_only: bool = False,
         **kwargs,
     ):
-        return NDFrame.skew(self, axis, skipna, numeric_only, **kwargs)
+        return NDFrame.skew(
+            self, axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs
+        )

-    @doc(make_doc("kurt", ndim=1))
+    @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="kurt")
     def kurt(
         self,
         axis: Axis | None = 0,
@@ -6469,23 +7206,73 @@ def kurt(
         numeric_only: bool = False,
         **kwargs,
     ):
-        return NDFrame.kurt(self, axis, skipna, numeric_only, **kwargs)
+        """
+        Return unbiased kurtosis over requested axis.
+
+        Kurtosis obtained using Fisher's definition of
+        kurtosis (kurtosis of normal == 0.0). Normalized by N-1.
+
+        Parameters
+        ----------
+        axis : {index (0)}
+            Axis for the function to be applied on.
+            For `Series` this parameter is unused and defaults to 0.
+
+            For DataFrames, specifying ``axis=None`` will apply the aggregation
+            across both axes.
+
+            .. versionadded:: 2.0.0
+
+        skipna : bool, default True
+            Exclude NA/null values when computing the result.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+
+        **kwargs
+            Additional keyword arguments to be passed to the function.
+
+        Returns
+        -------
+        scalar
+            Unbiased kurtosis.
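A quick check of the ``ddof`` arithmetic from the variance docstring, using the ``age`` values of its example:

    import pandas as pd

    age = pd.Series([21, 25, 62, 43])

    age.var()        # 352.916667 -- divisor N - 1 (ddof=1, the default)
    age.var(ddof=0)  # 264.6875   -- divisor N (population variance)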
+ + See Also + -------- + Series.skew : Return unbiased skew over requested axis. + Series.var : Return unbiased variance over requested axis. + Series.std : Return unbiased standard deviation over requested axis. + + Examples + -------- + >>> s = pd.Series([1, 2, 2, 3], index=["cat", "dog", "dog", "mouse"]) + >>> s + cat 1 + dog 2 + dog 2 + mouse 3 + dtype: int64 + >>> s.kurt() + 1.5 + """ + return NDFrame.kurt( + self, axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs + ) kurtosis = kurt product = prod @doc(make_doc("cummin", ndim=1)) - def cummin(self, axis: Axis | None = None, skipna: bool = True, *args, **kwargs): + def cummin(self, axis: Axis = 0, skipna: bool = True, *args, **kwargs) -> Self: return NDFrame.cummin(self, axis, skipna, *args, **kwargs) @doc(make_doc("cummax", ndim=1)) - def cummax(self, axis: Axis | None = None, skipna: bool = True, *args, **kwargs): + def cummax(self, axis: Axis = 0, skipna: bool = True, *args, **kwargs) -> Self: return NDFrame.cummax(self, axis, skipna, *args, **kwargs) @doc(make_doc("cumsum", ndim=1)) - def cumsum(self, axis: Axis | None = None, skipna: bool = True, *args, **kwargs): + def cumsum(self, axis: Axis = 0, skipna: bool = True, *args, **kwargs) -> Self: return NDFrame.cumsum(self, axis, skipna, *args, **kwargs) @doc(make_doc("cumprod", 1)) - def cumprod(self, axis: Axis | None = None, skipna: bool = True, *args, **kwargs): + def cumprod(self, axis: Axis = 0, skipna: bool = True, *args, **kwargs) -> Self: return NDFrame.cumprod(self, axis, skipna, *args, **kwargs) diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 25f7e7e9f832b..81fa508ae6d23 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -2,9 +2,7 @@ _shared_docs: dict[str, str] = {} -_shared_docs[ - "aggregate" -] = """ +_shared_docs["aggregate"] = """ Aggregate using one or more operations over the specified axis. Parameters @@ -51,11 +49,11 @@ for more details. A passed user-defined-function will be passed a Series for evaluation. + +If ``func`` defines an index relabeling, ``axis`` must be ``0`` or ``index``. {examples}""" -_shared_docs[ - "compare" -] = """ +_shared_docs["compare"] = """ Compare to another {klass} and show the differences. Parameters @@ -67,9 +65,9 @@ Determine which axis to align the comparison on. * 0, or 'index' : Resulting differences are stacked vertically - with rows drawn alternately from self and other. + with rows drawn alternately from self and other. * 1, or 'columns' : Resulting differences are aligned horizontally - with columns drawn alternately from self and other. + with columns drawn alternately from self and other. keep_shape : bool, default False If true, all rows and columns are kept. @@ -85,9 +83,7 @@ .. versionadded:: 1.5.0 """ -_shared_docs[ - "groupby" -] = """ +_shared_docs["groupby"] = """ Group %(klass)s using a mapper or by a Series of columns. A groupby operation involves some combination of splitting the @@ -108,15 +104,6 @@ the values are used as-is to determine the groups. A label or list of labels may be passed to group by the columns in ``self``. Notice that a tuple is interpreted as a (single) key. -axis : {0 or 'index', 1 or 'columns'}, default 0 - Split along rows (0) or columns (1). For `Series` this parameter - is unused and defaults to 0. - - .. deprecated:: 2.1.0 - - Will be removed and behave like axis=0 in a future version. - For ``axis=1``, do ``frame.T.groupby(...)`` instead. 
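The ``observed`` default flip documented above changes what categorical groupers return; a sketch (variable names illustrative):

    import pandas as pd

    cats = pd.Categorical(["a", "a"], categories=["a", "b"])
    ser = pd.Series([1, 2])

    ser.groupby(cats).sum()                  # observed=True default: only "a" (sum 3)
    ser.groupby(cats, observed=False).sum()  # unobserved "b" also shown, with sum 0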
- level : int, level name, or sequence of such, default None If the axis is a MultiIndex (hierarchical), group by a particular level or levels. Do not specify both ``by`` and ``level``. @@ -163,14 +150,14 @@ ``group_keys`` now defaults to ``True``. -observed : bool, default False +observed : bool, default True This only applies if any of the groupers are Categoricals. If True: only show observed values for categorical groupers. If False: show all values for categorical groupers. - .. deprecated:: 2.1.0 + .. versionchanged:: 3.0.0 - The default value will change to True in a future version of pandas. + The default value is now ``True``. dropna : bool, default True If True, and if group keys contain NA values, NA values together @@ -193,127 +180,15 @@ `__ for more detailed usage and examples, including splitting an object into groups, iterating through groups, selecting a group, aggregation, and more. -""" - -_shared_docs[ - "melt" -] = """ -Unpivot a DataFrame from wide to long format, optionally leaving identifiers set. - -This function is useful to massage a DataFrame into a format where one -or more columns are identifier variables (`id_vars`), while all other -columns, considered measured variables (`value_vars`), are "unpivoted" to -the row axis, leaving just two non-identifier columns, 'variable' and -'value'. - -Parameters ----------- -id_vars : scalar, tuple, list, or ndarray, optional - Column(s) to use as identifier variables. -value_vars : scalar, tuple, list, or ndarray, optional - Column(s) to unpivot. If not specified, uses all columns that - are not set as `id_vars`. -var_name : scalar, default None - Name to use for the 'variable' column. If None it uses - ``frame.columns.name`` or 'variable'. -value_name : scalar, default 'value' - Name to use for the 'value' column, can't be an existing column label. -col_level : scalar, optional - If columns are a MultiIndex then use this level to melt. -ignore_index : bool, default True - If True, original index is ignored. If False, the original index is retained. - Index labels will be repeated as necessary. - -Returns -------- -DataFrame - Unpivoted DataFrame. - -See Also --------- -%(other)s : Identical method. -pivot_table : Create a spreadsheet-style pivot table as a DataFrame. -DataFrame.pivot : Return reshaped DataFrame organized - by given index / column values. -DataFrame.explode : Explode a DataFrame from list-like - columns to long format. - -Notes ------ -Reference :ref:`the user guide ` for more examples. -Examples --------- ->>> df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'}, -... 'B': {0: 1, 1: 3, 2: 5}, -... 'C': {0: 2, 1: 4, 2: 6}}) ->>> df - A B C -0 a 1 2 -1 b 3 4 -2 c 5 6 - ->>> %(caller)sid_vars=['A'], value_vars=['B']) - A variable value -0 a B 1 -1 b B 3 -2 c B 5 - ->>> %(caller)sid_vars=['A'], value_vars=['B', 'C']) - A variable value -0 a B 1 -1 b B 3 -2 c B 5 -3 a C 2 -4 b C 4 -5 c C 6 - -The names of 'variable' and 'value' columns can be customized: - ->>> %(caller)sid_vars=['A'], value_vars=['B'], -... 
var_name='myVarname', value_name='myValname') - A myVarname myValname -0 a B 1 -1 b B 3 -2 c B 5 - -Original index values can be kept around: - ->>> %(caller)sid_vars=['A'], value_vars=['B', 'C'], ignore_index=False) - A variable value -0 a B 1 -1 b B 3 -2 c B 5 -0 a C 2 -1 b C 4 -2 c C 6 - -If you have multi-index columns: - ->>> df.columns = [list('ABC'), list('DEF')] ->>> df - A B C - D E F -0 a 1 2 -1 b 3 4 -2 c 5 6 - ->>> %(caller)scol_level=0, id_vars=['A'], value_vars=['B']) - A variable value -0 a B 1 -1 b B 3 -2 c B 5 - ->>> %(caller)sid_vars=[('A', 'D')], value_vars=[('B', 'E')]) - (A, D) variable_0 variable_1 value -0 a B E 1 -1 b B E 3 -2 c B E 5 +The implementation of groupby is hash-based, meaning in particular that +objects that compare as equal will be considered to be in the same group. +An exception to this is that pandas has special handling of NA values: +any NA values will be collapsed to a single group, regardless of how +they compare. See the user guide linked above for more details. """ -_shared_docs[ - "transform" -] = """ +_shared_docs["transform"] = """ Call ``func`` on self producing a {klass} with the same axis shape as self. Parameters @@ -438,9 +313,7 @@ 6 2 n 4 """ -_shared_docs[ - "storage_options" -] = """storage_options : dict, optional +_shared_docs["storage_options"] = """storage_options : dict, optional Extra options that make sense for a particular storage connection, e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value pairs are forwarded to ``urllib.request.Request`` as header options. For other @@ -450,9 +323,7 @@ `_.""" -_shared_docs[ - "compression_options" -] = """compression : str or dict, default 'infer' +_shared_docs["compression_options"] = """compression : str or dict, default 'infer' For on-the-fly compression of the output data. If 'infer' and '%s' is path-like, then detect compression from the following extensions: '.gz', '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2' @@ -471,9 +342,7 @@ .. versionadded:: 1.5.0 Added support for `.tar` files.""" -_shared_docs[ - "decompression_options" -] = """compression : str or dict, default 'infer' +_shared_docs["decompression_options"] = """compression : str or dict, default 'infer' For on-the-fly decompression of on-disk data. If 'infer' and '%s' is path-like, then detect compression from the following extensions: '.gz', '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2' @@ -493,9 +362,7 @@ .. versionadded:: 1.5.0 Added support for `.tar` files.""" -_shared_docs[ - "replace" -] = """ +_shared_docs["replace"] = """ Replace values given in `to_replace` with `value`. Values of the {klass} are replaced with other values dynamically. @@ -564,20 +431,11 @@ filled). Regular expressions, strings and lists or dicts of such objects are also allowed. {inplace} - limit : int, default None - Maximum size gap to forward or backward fill. - - .. deprecated:: 2.1.0 regex : bool or same types as `to_replace`, default False Whether to interpret `to_replace` and/or `value` as regular expressions. Alternatively, this could be a regular expression or a list, dict, or array of regular expressions in which case `to_replace` must be ``None``. - method : {{'pad', 'ffill', 'bfill'}} - The method to use when for replacement, when `to_replace` is a - scalar, list or tuple and `value` is ``None``. - - .. 
deprecated:: 2.1.0 Returns ------- @@ -673,14 +531,6 @@ 3 1 8 d 4 4 9 e - >>> s.replace([1, 2], method='bfill') - 0 3 - 1 3 - 2 3 - 3 4 - 4 5 - dtype: int64 - **dict-like `to_replace`** >>> df.replace({{0: 10, 1: 100}}) @@ -750,7 +600,7 @@ When one uses a dict as the `to_replace` value, it is like the value(s) in the dict are equal to the `value` parameter. ``s.replace({{'a': None}})`` is equivalent to - ``s.replace(to_replace={{'a': None}}, value=None, method=None)``: + ``s.replace(to_replace={{'a': None}}, value=None)``: >>> s.replace({{'a': None}}) 0 10 @@ -760,24 +610,7 @@ 4 None dtype: object - When ``value`` is not explicitly passed and `to_replace` is a scalar, list - or tuple, `replace` uses the method parameter (default 'pad') to do the - replacement. So this is why the 'a' values are being replaced by 10 - in rows 1 and 2 and 'b' in row 4 in this case. - - >>> s.replace('a') - 0 10 - 1 10 - 2 10 - 3 b - 4 b - dtype: object - - .. deprecated:: 2.1.0 - The 'method' parameter and padding behavior are deprecated. - - On the other hand, if ``None`` is explicitly passed for ``value``, it will - be respected: + If ``None`` is explicitly passed for ``value``, it will be respected: >>> s.replace('a', None) 0 10 @@ -816,137 +649,3 @@ 3 3 d e 4 4 e e """ - -_shared_docs[ - "idxmin" -] = """ - Return index of first occurrence of minimum over requested axis. - - NA/null values are excluded. - - Parameters - ---------- - axis : {{0 or 'index', 1 or 'columns'}}, default 0 - The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. - skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. - numeric_only : bool, default {numeric_only_default} - Include only `float`, `int` or `boolean` data. - - .. versionadded:: 1.5.0 - - Returns - ------- - Series - Indexes of minima along the specified axis. - - Raises - ------ - ValueError - * If the row/column is empty - - See Also - -------- - Series.idxmin : Return index of the minimum element. - - Notes - ----- - This method is the DataFrame version of ``ndarray.argmin``. - - Examples - -------- - Consider a dataset containing food consumption in Argentina. - - >>> df = pd.DataFrame({{'consumption': [10.51, 103.11, 55.48], - ... 'co2_emissions': [37.2, 19.66, 1712]}}, - ... index=['Pork', 'Wheat Products', 'Beef']) - - >>> df - consumption co2_emissions - Pork 10.51 37.20 - Wheat Products 103.11 19.66 - Beef 55.48 1712.00 - - By default, it returns the index for the minimum value in each column. - - >>> df.idxmin() - consumption Pork - co2_emissions Wheat Products - dtype: object - - To return the index for the minimum value in each row, use ``axis="columns"``. - - >>> df.idxmin(axis="columns") - Pork consumption - Wheat Products co2_emissions - Beef consumption - dtype: object -""" - -_shared_docs[ - "idxmax" -] = """ - Return index of first occurrence of maximum over requested axis. - - NA/null values are excluded. - - Parameters - ---------- - axis : {{0 or 'index', 1 or 'columns'}}, default 0 - The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. - skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. - numeric_only : bool, default {numeric_only_default} - Include only `float`, `int` or `boolean` data. - - .. versionadded:: 1.5.0 - - Returns - ------- - Series - Indexes of maxima along the specified axis. 
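With the deprecated padding ``method`` removed from ``replace``, the surviving ``None`` idioms shown above behave identically; a sketch (series contents taken from the docstring):

    import pandas as pd

    s = pd.Series([10, "a", "a", "b", "a"])

    s.replace({"a": None})  # dict form: each key maps to its replacement
    s.replace("a", None)    # explicitly passed value=None is respected, same result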
- - Raises - ------ - ValueError - * If the row/column is empty - - See Also - -------- - Series.idxmax : Return index of the maximum element. - - Notes - ----- - This method is the DataFrame version of ``ndarray.argmax``. - - Examples - -------- - Consider a dataset containing food consumption in Argentina. - - >>> df = pd.DataFrame({{'consumption': [10.51, 103.11, 55.48], - ... 'co2_emissions': [37.2, 19.66, 1712]}}, - ... index=['Pork', 'Wheat Products', 'Beef']) - - >>> df - consumption co2_emissions - Pork 10.51 37.20 - Wheat Products 103.11 19.66 - Beef 55.48 1712.00 - - By default, it returns the index for the maximum value in each column. - - >>> df.idxmax() - consumption Wheat Products - co2_emissions Beef - dtype: object - - To return the index for the maximum value in each row, use ``axis="columns"``. - - >>> df.idxmax(axis="columns") - Pork co2_emissions - Wheat Products consumption - Beef co2_emissions - dtype: object -""" diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index a431842218b3b..18983af12976c 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -1,11 +1,10 @@ -""" miscellaneous sorting / groupby utilities """ +"""miscellaneous sorting / groupby utilities""" + from __future__ import annotations -from collections import defaultdict +import itertools from typing import ( TYPE_CHECKING, - Callable, - DefaultDict, cast, ) @@ -32,8 +31,8 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, - Iterable, Sequence, ) @@ -173,8 +172,6 @@ def maybe_lift(lab, size: int) -> tuple[np.ndarray, int]: for i, (lab, size) in enumerate(zip(labels, shape)): labels[i], lshape[i] = maybe_lift(lab, size) - labels = list(labels) - # Iteratively process all the labels in chunks sized so less # than lib.i8max unique int ids will be required for each chunk while True: @@ -336,13 +333,15 @@ def lexsort_indexer( raise ValueError(f"invalid na_position: {na_position}") if isinstance(orders, bool): - orders = [orders] * len(keys) + orders = itertools.repeat(orders, len(keys)) elif orders is None: - orders = [True] * len(keys) + orders = itertools.repeat(True, len(keys)) + else: + orders = reversed(orders) labels = [] - for k, order in zip(keys, orders): + for k, order in zip(reversed(keys), orders): k = ensure_key_mapped(k, key) if codes_given: codes = cast(np.ndarray, k) @@ -363,7 +362,7 @@ def lexsort_indexer( labels.append(codes) - return np.lexsort(labels[::-1]) + return np.lexsort(labels) def nargsort( @@ -477,7 +476,7 @@ def nargminmax(values: ExtensionArray, method: str, axis: AxisInt = 0): zipped = zip(arr_values, mask) else: zipped = zip(arr_values.T, mask.T) - return np.array([_nanargminmax(v, m, func) for v, m in zipped]) + return np.array([_nanargminmax(v, m, func) for v, m in zipped]) # type: ignore[arg-type] return func(arr_values, axis=axis) return _nanargminmax(arr_values, mask, func) @@ -525,13 +524,13 @@ def _ensure_key_mapped_multiindex( if level is not None: if isinstance(level, (str, int)): - sort_levels = [level] + level_iter = [level] else: - sort_levels = level + level_iter = level - sort_levels = [index._get_level_number(lev) for lev in sort_levels] + sort_levels: range | set = {index._get_level_number(lev) for lev in level_iter} else: - sort_levels = list(range(index.nlevels)) # satisfies mypy + sort_levels = range(index.nlevels) mapped = [ ensure_key_mapped(index._get_level_values(level), key) @@ -576,38 +575,21 @@ def ensure_key_mapped( if isinstance( values, Index ): # convert to a new Index subclass, not necessarily the 
same - result = Index(result) + result = Index(result, tupleize_cols=False) else: # try to revert to original type otherwise type_of_values = type(values) # error: Too many arguments for "ExtensionArray" result = type_of_values(result) # type: ignore[call-arg] - except TypeError: + except TypeError as err: raise TypeError( f"User-provided `key` function returned an invalid type {type(result)} \ which could not be converted to {type(values)}." - ) + ) from err return result -def get_flattened_list( - comp_ids: npt.NDArray[np.intp], - ngroups: int, - levels: Iterable[Index], - labels: Iterable[np.ndarray], -) -> list[tuple]: - """Map compressed group id -> key tuple.""" - comp_ids = comp_ids.astype(np.int64, copy=False) - arrays: DefaultDict[int, list[int]] = defaultdict(list) - for labs, level in zip(labels, levels): - table = hashtable.Int64HashTable(ngroups) - table.map_keys_to_values(comp_ids, labs.astype(np.int64, copy=False)) - for i in range(ngroups): - arrays[i].append(level[table.get_item(i)]) - return [tuple(array) for array in arrays.values()] - - def get_indexer_dict( label_list: list[np.ndarray], keys: list[Index] ) -> dict[Hashable, npt.NDArray[np.intp]]: diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 1b7d632c0fa80..81f7441846589 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -5,7 +5,6 @@ import re from typing import ( TYPE_CHECKING, - Callable, Literal, cast, ) @@ -13,6 +12,8 @@ import numpy as np +from pandas._config import get_option + from pandas._libs import lib from pandas._typing import ( AlignJoin, @@ -27,10 +28,13 @@ from pandas.core.dtypes.common import ( ensure_object, is_bool_dtype, + is_extension_array_dtype, is_integer, is_list_like, + is_numeric_dtype, is_object_dtype, is_re, + is_string_dtype, ) from pandas.core.dtypes.dtypes import ( ArrowDtype, @@ -50,10 +54,13 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Iterator, ) + from pandas._typing import NpDtype + from pandas import ( DataFrame, Index, @@ -162,6 +169,16 @@ class StringMethods(NoNewAttributesMixin): Patterned after Python's string methods, with some inspiration from R's stringr package. + Parameters + ---------- + data : Series or Index + The content of the Series or Index. + + See Also + -------- + Series.str : Vectorized string functions for Series. + Index.str : Vectorized string functions for Index. 
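The ``lexsort_indexer`` hunk above relies on ``numpy.lexsort`` treating the *last* key as primary, which is why the keys are now iterated in reverse rather than reversing the built label list afterwards. A small demonstration of that NumPy convention:

    import numpy as np

    primary = np.array([1, 1, 0])
    secondary = np.array([2, 1, 3])

    # The last array passed is the primary sort key.
    np.lexsort((secondary, primary))  # -> array([2, 1, 0])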
+ Examples -------- >>> s = pd.Series(["A_Str_Series"]) @@ -242,7 +259,9 @@ def _validate(data): inferred_dtype = lib.infer_dtype(values, skipna=True) if inferred_dtype not in allowed_types: - raise AttributeError("Can only use .str accessor with string values!") + raise AttributeError( + f"Can only use .str accessor with string values, not {inferred_dtype}" + ) return inferred_dtype def __getitem__(self, key): @@ -259,7 +278,6 @@ def _wrap_result( expand: bool | None = None, fill_value=np.nan, returns_string: bool = True, - returns_bool: bool = False, dtype=None, ): from pandas import ( @@ -321,10 +339,8 @@ def _wrap_result( new_values.append(row) pa_type = result._pa_array.type result = ArrowExtensionArray(pa.array(new_values, type=pa_type)) - if name is not None: - labels = name - else: - labels = range(max_len) + if name is None: + name = range(max_len) result = ( pa.compute.list_flatten(result._pa_array) .to_numpy() @@ -332,7 +348,7 @@ def _wrap_result( ) result = { label: ArrowExtensionArray(pa.array(res)) - for label, res in zip(labels, result.T) + for label, res in zip(name, result.T) } elif is_object_dtype(result): @@ -387,7 +403,9 @@ def cons_row(x): # This is a mess. _dtype: DtypeObj | str | None = dtype vdtype = getattr(result, "dtype", None) - if self._is_string: + if _dtype is not None: + pass + elif self._is_string: if is_bool_dtype(vdtype): _dtype = result.dtype elif returns_string: @@ -538,20 +556,20 @@ def cat( When not passing `others`, all values are concatenated into a single string: - >>> s = pd.Series(['a', 'b', np.nan, 'd']) - >>> s.str.cat(sep=' ') + >>> s = pd.Series(["a", "b", np.nan, "d"]) + >>> s.str.cat(sep=" ") 'a b d' By default, NA values in the Series are ignored. Using `na_rep`, they can be given a representation: - >>> s.str.cat(sep=' ', na_rep='?') + >>> s.str.cat(sep=" ", na_rep="?") 'a b ? d' If `others` is specified, corresponding values are concatenated with the separator. Result will be a Series of strings. - >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',') + >>> s.str.cat(["A", "B", "C", "D"], sep=",") 0 a,A 1 b,B 2 NaN @@ -561,7 +579,7 @@ def cat( Missing values will remain missing in the result, but can again be represented using `na_rep` - >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',', na_rep='-') + >>> s.str.cat(["A", "B", "C", "D"], sep=",", na_rep="-") 0 a,A 1 b,B 2 -,C @@ -571,7 +589,7 @@ def cat( If `sep` is not specified, the values are concatenated without separation. - >>> s.str.cat(['A', 'B', 'C', 'D'], na_rep='-') + >>> s.str.cat(["A", "B", "C", "D"], na_rep="-") 0 aA 1 bB 2 -C @@ -581,15 +599,15 @@ def cat( Series with different indexes can be aligned before concatenation. The `join`-keyword works as in other methods. 
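Index alignment in ``str.cat`` follows the ``join`` keyword exactly as the examples below show; a compact sketch:

    import pandas as pd

    s = pd.Series(["a", "b", "c"])
    t = pd.Series(["d", "e"], index=[1, 2])

    s.str.cat(t, join="left", na_rep="-")  # -> ["a-", "bd", "ce"]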
- >>> t = pd.Series(['d', 'a', 'e', 'c'], index=[3, 0, 4, 2]) - >>> s.str.cat(t, join='left', na_rep='-') + >>> t = pd.Series(["d", "a", "e", "c"], index=[3, 0, 4, 2]) + >>> s.str.cat(t, join="left", na_rep="-") 0 aa 1 b- 2 -c 3 dd dtype: object >>> - >>> s.str.cat(t, join='outer', na_rep='-') + >>> s.str.cat(t, join="outer", na_rep="-") 0 aa 1 b- 2 -c @@ -597,13 +615,13 @@ def cat( 4 -e dtype: object >>> - >>> s.str.cat(t, join='inner', na_rep='-') + >>> s.str.cat(t, join="inner", na_rep="-") 0 aa 2 -c 3 dd dtype: object >>> - >>> s.str.cat(t, join='right', na_rep='-') + >>> s.str.cat(t, join="right", na_rep="-") 3 dd 0 aa 4 -e @@ -661,7 +679,6 @@ def cat( join=(join if join == "inner" else "outer"), keys=range(len(others)), sort=False, - copy=False, ) data, others = data.align(others, join=join) others = [others[x] for x in others] # again list of Series @@ -707,9 +724,7 @@ def cat( out = res_ser.__finalize__(self._orig, method="str_cat") return out - _shared_docs[ - "str_split" - ] = r""" + _shared_docs["str_split"] = r""" Split strings around given separator/delimiter. Splits the string in the Series/Index from the %(side)s, @@ -946,9 +961,7 @@ def rsplit(self, pat=None, *, n=-1, expand: bool = False): result, expand=expand, returns_string=expand, dtype=dtype ) - _shared_docs[ - "str_partition" - ] = """ + _shared_docs["str_partition"] = """ Split the string at the %(side)s occurrence of `sep`. This method splits the string at the %(side)s occurrence of `sep`, @@ -967,6 +980,8 @@ def rsplit(self, pat=None, *, n=-1, expand: bool = False): Returns ------- DataFrame/MultiIndex or Series/Index of objects + Returns appropriate type based on `expand` parameter with strings + split based on the `sep` parameter. See Also -------- @@ -1083,15 +1098,26 @@ def get(self, i): Returns ------- Series or Index + Series or Index where each value is the extracted element from + the corresponding input component. + + See Also + -------- + Series.str.extract : Extract capture groups in the regex as columns + in a DataFrame. Examples -------- - >>> s = pd.Series(["String", - ... (1, 2, 3), - ... ["a", "b", "c"], - ... 123, - ... -456, - ... {1: "Hello", "2": "World"}]) + >>> s = pd.Series( + ... [ + ... "String", + ... (1, 2, 3), + ... ["a", "b", "c"], + ... 123, + ... -456, + ... {1: "Hello", "2": "World"}, + ... ] + ... ) >>> s 0 String 1 (1, 2, 3) @@ -1121,9 +1147,13 @@ def get(self, i): Return element with given key - >>> s = pd.Series([{"name": "Hello", "value": "World"}, - ... {"name": "Goodbye", "value": "Planet"}]) - >>> s.str.get('name') + >>> s = pd.Series( + ... [ + ... {"name": "Hello", "value": "World"}, + ... {"name": "Goodbye", "value": "Planet"}, + ... ] + ... ) + >>> s.str.get("name") 0 Hello 1 Goodbye dtype: object @@ -1170,11 +1200,15 @@ def join(self, sep: str): -------- Example with a list that contains non-string elements. - >>> s = pd.Series([['lion', 'elephant', 'zebra'], - ... [1.1, 2.2, 3.3], - ... ['cat', np.nan, 'dog'], - ... ['cow', 4.5, 'goat'], - ... ['duck', ['swan', 'fish'], 'guppy']]) + >>> s = pd.Series( + ... [ + ... ["lion", "elephant", "zebra"], + ... [1.1, 2.2, 3.3], + ... ["cat", np.nan, "dog"], + ... ["cow", 4.5, "goat"], + ... ["duck", ["swan", "fish"], "guppy"], + ... ] + ... ) >>> s 0 [lion, elephant, zebra] 1 [1.1, 2.2, 3.3] @@ -1186,7 +1220,7 @@ def join(self, sep: str): Join all lists using a '-'. The lists containing object(s) of types other than str will produce a NaN. 
- >>> s.str.join('-') + >>> s.str.join("-") 0 lion-elephant-zebra 1 NaN 2 NaN @@ -1199,7 +1233,12 @@ def join(self, sep: str): @forbid_nonstring_types(["bytes"]) def contains( - self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True + self, + pat, + case: bool = True, + flags: int = 0, + na=lib.no_default, + regex: bool = True, ): r""" Test if pattern or regex is contained within a string of a Series or Index. @@ -1217,8 +1256,9 @@ def contains( Flags to pass through to the re module, e.g. re.IGNORECASE. na : scalar, optional Fill value for missing values. The default depends on dtype of the - array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, - ``pandas.NA`` is used. + array. For object-dtype, ``numpy.nan`` is used. For the nullable + ``StringDtype``, ``pandas.NA`` is used. For the ``"str"`` dtype, + ``False`` is used. regex : bool, default True If True, assumes the pat is a regular expression. @@ -1242,8 +1282,8 @@ def contains( -------- Returning a Series of booleans using only a literal pattern. - >>> s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.nan]) - >>> s1.str.contains('og', regex=False) + >>> s1 = pd.Series(["Mouse", "dog", "house and parrot", "23", np.nan]) + >>> s1.str.contains("og", regex=False) 0 False 1 True 2 False @@ -1253,13 +1293,13 @@ def contains( Returning an Index of booleans using only a literal pattern. - >>> ind = pd.Index(['Mouse', 'dog', 'house and parrot', '23.0', np.nan]) - >>> ind.str.contains('23', regex=False) + >>> ind = pd.Index(["Mouse", "dog", "house and parrot", "23.0", np.nan]) + >>> ind.str.contains("23", regex=False) Index([False, False, False, True, nan], dtype='object') Specifying case sensitivity using `case`. - >>> s1.str.contains('oG', case=True, regex=True) + >>> s1.str.contains("oG", case=True, regex=True) 0 False 1 False 2 False @@ -1271,7 +1311,7 @@ def contains( with `False`. If Series or Index does not contain NaN values the resultant dtype will be `bool`, otherwise, an `object` dtype. - >>> s1.str.contains('og', na=False, regex=True) + >>> s1.str.contains("og", na=False, regex=True) 0 False 1 True 2 False @@ -1281,7 +1321,7 @@ def contains( Returning 'house' or 'dog' when either expression occurs in a string. - >>> s1.str.contains('house|dog', regex=True) + >>> s1.str.contains("house|dog", regex=True) 0 False 1 True 2 True @@ -1292,7 +1332,7 @@ def contains( Ignoring case sensitivity using `flags` with regex. >>> import re - >>> s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True) + >>> s1.str.contains("PARROT", flags=re.IGNORECASE, regex=True) 0 False 1 False 2 True @@ -1302,7 +1342,7 @@ def contains( Returning any digit using regular expression. - >>> s1.str.contains('\\d', regex=True) + >>> s1.str.contains("\\d", regex=True) 0 False 1 False 2 False @@ -1315,8 +1355,8 @@ def contains( return `True`. However, '.0' as a regex matches any character followed by a 0. - >>> s2 = pd.Series(['40', '40.0', '41', '41.0', '35']) - >>> s2.str.contains('.0', regex=True) + >>> s2 = pd.Series(["40", "40.0", "41", "41.0", "35"]) + >>> s2.str.contains(".0", regex=True) 0 True 1 True 2 False @@ -1336,26 +1376,35 @@ def contains( return self._wrap_result(result, fill_value=na, returns_string=False) @forbid_nonstring_types(["bytes"]) - def match(self, pat, case: bool = True, flags: int = 0, na=None): + def match(self, pat: str, case: bool = True, flags: int = 0, na=lib.no_default): """ Determine if each string starts with a match of a regular expression. 
+ Determines whether each string in the Series or Index starts with a + match to a specified regular expression. This function is especially + useful for validating prefixes, such as ensuring that codes, tags, or + identifiers begin with a specific pattern. + Parameters ---------- pat : str - Character sequence or regular expression. + Character sequence. case : bool, default True If True, case sensitive. flags : int, default 0 (no flags) Regex module flags, e.g. re.IGNORECASE. na : scalar, optional Fill value for missing values. The default depends on dtype of the - array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, - ``pandas.NA`` is used. + array. For object-dtype, ``numpy.nan`` is used. For the nullable + ``StringDtype``, ``pandas.NA`` is used. For the ``"str"`` dtype, + ``False`` is used. Returns ------- Series/Index/array of boolean values + A Series, Index, or array of boolean values indicating whether the start + of each string matches the pattern. The result will be of the same type + as the input. See Also -------- @@ -1377,10 +1426,15 @@ def match(self, pat, case: bool = True, flags: int = 0, na=None): return self._wrap_result(result, fill_value=na, returns_string=False) @forbid_nonstring_types(["bytes"]) - def fullmatch(self, pat, case: bool = True, flags: int = 0, na=None): + def fullmatch(self, pat, case: bool = True, flags: int = 0, na=lib.no_default): """ Determine if each string entirely matches a regular expression. + Checks if each string in the Series or Index fully matches the + specified regular expression pattern. This function is useful when the + requirement is for an entire string to conform to a pattern, such as + validating formats like phone numbers or email addresses. + Parameters ---------- pat : str @@ -1391,12 +1445,16 @@ def fullmatch(self, pat, case: bool = True, flags: int = 0, na=None): Regex module flags, e.g. re.IGNORECASE. na : scalar, optional Fill value for missing values. The default depends on dtype of the - array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, - ``pandas.NA`` is used. + array. For object-dtype, ``numpy.nan`` is used. For the nullable + ``StringDtype``, ``pandas.NA`` is used. For the ``"str"`` dtype, + ``False`` is used. Returns ------- Series/Index/array of boolean values + The function returns a Series, Index, or array of boolean values, + where True indicates that the entire string matches the regular + expression pattern and False indicates that it does not. See Also -------- @@ -1407,7 +1465,7 @@ def fullmatch(self, pat, case: bool = True, flags: int = 0, na=None): Examples -------- >>> ser = pd.Series(["cat", "duck", "dove"]) - >>> ser.str.fullmatch(r'd.+') + >>> ser.str.fullmatch(r"d.+") 0 False 1 True 2 True @@ -1419,8 +1477,8 @@ def fullmatch(self, pat, case: bool = True, flags: int = 0, na=None): @forbid_nonstring_types(["bytes"]) def replace( self, - pat: str | re.Pattern, - repl: str | Callable, + pat: str | re.Pattern | dict, + repl: str | Callable | None = None, n: int = -1, case: bool | None = None, flags: int = 0, @@ -1434,11 +1492,14 @@ def replace( Parameters ---------- - pat : str or compiled regex + pat : str, compiled regex, or a dict String can be a character sequence or regular expression. + Dictionary contains pairs of strings to be replaced + along with the updated value. repl : str or callable Replacement string or a callable. The callable is passed the regex match object and must return a replacement string to be used. 
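The distinction the two docstrings above draw, anchoring at the start versus matching the whole string, in a sketch:

    import pandas as pd

    ser = pd.Series(["cat", "catalog"])

    ser.str.match(r"cat")      # [True, True]  -- a prefix match suffices
    ser.str.fullmatch(r"cat")  # [True, False] -- the entire string must match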
+ Must have a value of None if `pat` is a dict See :func:`re.sub`. n : int, default -1 (all) Number of replacements to make from start. @@ -1472,6 +1533,15 @@ def replace( * if `regex` is False and `repl` is a callable or `pat` is a compiled regex * if `pat` is a compiled regex and `case` or `flags` is set + * if `pat` is a dictionary and `repl` is not None. + + See Also + -------- + Series.str.replace : Method to replace occurrences of a substring with another + substring. + Series.str.extract : Extract substrings using a regular expression. + Series.str.findall : Find all occurrences of a pattern or regex in each string. + Series.str.split : Split each string by a specified delimiter or pattern. Notes ----- @@ -1481,12 +1551,21 @@ def replace( Examples -------- + When `pat` is a dictionary, every key in `pat` is replaced + with its corresponding value: + + >>> pd.Series(["A", "B", np.nan]).str.replace(pat={"A": "a", "B": "b"}) + 0 a + 1 b + 2 NaN + dtype: object + When `pat` is a string and `regex` is True, the given `pat` is compiled as a regex. When `repl` is a string, it replaces matching regex patterns as with :meth:`re.sub`. NaN value(s) in the Series are left as is: - >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f.', 'ba', regex=True) + >>> pd.Series(["foo", "fuz", np.nan]).str.replace("f.", "ba", regex=True) 0 bao 1 baz 2 NaN @@ -1495,7 +1574,7 @@ def replace( When `pat` is a string and `regex` is False, every `pat` is replaced with `repl` as with :meth:`str.replace`: - >>> pd.Series(['f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False) + >>> pd.Series(["f.o", "fuz", np.nan]).str.replace("f.", "ba", regex=False) 0 bao 1 fuz 2 NaN @@ -1507,7 +1586,7 @@ def replace( To get the idea: - >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f', repr, regex=True) + >>> pd.Series(["foo", "fuz", np.nan]).str.replace("f", repr, regex=True) 0 oo 1 uz 2 NaN @@ -1516,8 +1595,8 @@ def replace( Reverse every lowercase alphabetic word: >>> repl = lambda m: m.group(0)[::-1] - >>> ser = pd.Series(['foo 123', 'bar baz', np.nan]) - >>> ser.str.replace(r'[a-z]+', repl, regex=True) + >>> ser = pd.Series(["foo 123", "bar baz", np.nan]) + >>> ser.str.replace(r"[a-z]+", repl, regex=True) 0 oof 123 1 rab zab 2 NaN @@ -1526,8 +1605,8 @@ def replace( Using regex groups (extract second group and swap case): >>> pat = r"(?P\w+) (?P\w+) (?P\w+)" - >>> repl = lambda m: m.group('two').swapcase() - >>> ser = pd.Series(['One Two Three', 'Foo Bar Baz']) + >>> repl = lambda m: m.group("two").swapcase() + >>> ser = pd.Series(["One Two Three", "Foo Bar Baz"]) >>> ser.str.replace(pat, repl, regex=True) 0 tWO 1 bAR @@ -1536,15 +1615,18 @@ def replace( Using a compiled regex with flags >>> import re - >>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE) - >>> pd.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar', regex=True) + >>> regex_pat = re.compile(r"FUZ", flags=re.IGNORECASE) + >>> pd.Series(["foo", "fuz", np.nan]).str.replace(regex_pat, "bar", regex=True) 0 foo 1 bar 2 NaN dtype: object """ + if isinstance(pat, dict) and repl is not None: + raise ValueError("repl cannot be used when pat is a dictionary") + # Check whether repl is valid (GH 13438, GH 15055) - if not (isinstance(repl, str) or callable(repl)): + if not isinstance(pat, dict) and not (isinstance(repl, str) or callable(repl)): raise TypeError("repl must be a string or callable") is_compiled_re = is_re(pat) @@ -1564,16 +1646,27 @@ def replace( if case is None: case = True - result = self._data.array._str_replace( - pat, repl, n=n, 
case=case, flags=flags, regex=regex - ) - return self._wrap_result(result) + res_output = self._data + if not isinstance(pat, dict): + pat = {pat: repl} + + for key, value in pat.items(): + result = res_output.array._str_replace( + key, value, n=n, case=case, flags=flags, regex=regex + ) + res_output = self._wrap_result(result) + + return res_output @forbid_nonstring_types(["bytes"]) def repeat(self, repeats): """ Duplicate each string in the Series or Index. + Duplicates each string in the Series or Index, either by applying the + same repeat count to all elements or by using different repeat values + for each element. + Parameters ---------- repeats : int or sequence of int @@ -1585,9 +1678,23 @@ def repeat(self, repeats): Series or Index of repeated string objects specified by input parameter repeats. + See Also + -------- + Series.str.lower : Convert all characters in each string to lowercase. + Series.str.upper : Convert all characters in each string to uppercase. + Series.str.title : Convert each string to title case (capitalizing the first + letter of each word). + Series.str.strip : Remove leading and trailing whitespace from each string. + Series.str.replace : Replace occurrences of a substring with another substring + in each string. + Series.str.ljust : Left-justify each string in the Series/Index by padding with + a specified character. + Series.str.rjust : Right-justify each string in the Series/Index by padding with + a specified character. + Examples -------- - >>> s = pd.Series(['a', 'b', 'c']) + >>> s = pd.Series(["a", "b", "c"]) >>> s 0 a 1 b @@ -1623,6 +1730,12 @@ def pad( """ Pad strings in the Series/Index up to width. + This function pads strings in a Series or Index to a specified width, + filling the extra space with a character of your choice. It provides + flexibility in positioning the padding, allowing it to be added to the + left, right, or both sides. This is useful for formatting strings to + align text or ensure consistent string lengths in data processing. + Parameters ---------- width : int @@ -1662,12 +1775,12 @@ def pad( 1 tiger dtype: object - >>> s.str.pad(width=10, side='right', fillchar='-') + >>> s.str.pad(width=10, side="right", fillchar="-") 0 caribou--- 1 tiger----- dtype: object - >>> s.str.pad(width=10, side='both', fillchar='-') + >>> s.str.pad(width=10, side="both", fillchar="-") 0 -caribou-- 1 --tiger--- dtype: object @@ -1686,9 +1799,7 @@ def pad( result = self._data.array._str_pad(width, side=side, fillchar=fillchar) return self._wrap_result(result) - _shared_docs[ - "str_pad" - ] = """ + _shared_docs["str_pad"] = """ Pad %(side)s side of strings in the Series/Index. Equivalent to :meth:`str.%(method)s`. @@ -1704,6 +1815,18 @@ def pad( Returns ------- Series/Index of objects. + A Series or Index where the strings are modified by :meth:`str.%(method)s`. + + See Also + -------- + Series.str.rjust : Fills the left side of strings with an arbitrary + character. + Series.str.ljust : Fills the right side of strings with an arbitrary + character. + Series.str.center : Fills both sides of strings with an arbitrary + character. + Series.str.zfill : Pad strings in the Series/Index by prepending '0' + character. Examples -------- @@ -1769,6 +1892,7 @@ def zfill(self, width: int): Returns ------- Series/Index of objects. + A Series or Index where the strings are prepended with '0' characters. 
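The dictionary form of ``str.replace`` added in this hunk applies each key/value pair in turn and requires ``repl`` to stay ``None``; a sketch:

    import numpy as np
    import pandas as pd

    s = pd.Series(["A", "B", np.nan])

    s.str.replace(pat={"A": "a", "B": "b"})  # -> ["a", "b", NaN]
    # s.str.replace({"A": "a"}, "x") raises ValueError: repl cannot be used
    # when pat is a dictionary.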
See Also -------- @@ -1788,7 +1912,7 @@ def zfill(self, width: int): Examples -------- - >>> s = pd.Series(['-1', '1', '1000', 10, np.nan]) + >>> s = pd.Series(["-1", "1", "1000", 10, np.nan]) >>> s 0 -1 1 1 @@ -1822,6 +1946,11 @@ def slice(self, start=None, stop=None, step=None): """ Slice substrings from each element in the Series or Index. + Slicing substrings from strings in a Series or Index helps extract + specific portions of data, making it easier to analyze or manipulate + text. This is useful for tasks like parsing structured text fields or + isolating parts of strings with a consistent format. + Parameters ---------- start : int, optional @@ -1898,6 +2027,11 @@ def slice_replace(self, start=None, stop=None, repl=None): """ Replace a positional slice of a string with another value. + This function allows replacing specific parts of a string in a Series + or Index by specifying start and stop positions. It is useful for + modifying substrings in a controlled way, such as updating sections of + text based on their positions or patterns. + Parameters ---------- start : int, optional @@ -1923,7 +2057,7 @@ def slice_replace(self, start=None, stop=None, repl=None): Examples -------- - >>> s = pd.Series(['a', 'ab', 'abc', 'abdc', 'abcde']) + >>> s = pd.Series(["a", "ab", "abc", "abdc", "abcde"]) >>> s 0 a 1 ab @@ -1935,7 +2069,7 @@ def slice_replace(self, start=None, stop=None, repl=None): Specify just `start`, meaning replace `start` until the end of the string with `repl`. - >>> s.str.slice_replace(1, repl='X') + >>> s.str.slice_replace(1, repl="X") 0 aX 1 aX 2 aX @@ -1946,7 +2080,7 @@ def slice_replace(self, start=None, stop=None, repl=None): Specify just `stop`, meaning the start of the string to `stop` is replaced with `repl`, and the rest of the string is included. - >>> s.str.slice_replace(stop=2, repl='X') + >>> s.str.slice_replace(stop=2, repl="X") 0 X 1 X 2 Xc @@ -1958,7 +2092,7 @@ def slice_replace(self, start=None, stop=None, repl=None): replaced with `repl`. Everything before or after `start` and `stop` is included as is. - >>> s.str.slice_replace(start=1, stop=3, repl='X') + >>> s.str.slice_replace(start=1, stop=3, repl="X") 0 aX 1 aX 2 aX @@ -1969,7 +2103,9 @@ def slice_replace(self, start=None, stop=None, repl=None): result = self._data.array._str_slice_replace(start, stop, repl) return self._wrap_result(result) - def decode(self, encoding, errors: str = "strict"): + def decode( + self, encoding, errors: str = "strict", dtype: str | DtypeObj | None = None + ): """ Decode character string in the Series/Index using indicated encoding. @@ -1979,23 +2115,41 @@ def decode(self, encoding, errors: str = "strict"): Parameters ---------- encoding : str + Specifies the encoding to be used. errors : str, optional + Specifies the error handling scheme. + Possible values are those supported by :meth:`bytes.decode`. + dtype : str or dtype, optional + The dtype of the result. When not ``None``, must be either a string or + object dtype. When ``None``, the dtype of the result is determined by + ``pd.options.future.infer_string``. + + .. versionadded:: 2.3.0 Returns ------- Series or Index + A Series or Index with decoded strings. + + See Also + -------- + Series.str.encode : Encodes strings into bytes in a Series/Index. 
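The new ``dtype`` argument on ``decode`` (added in 2.3.0 per this hunk) lets callers opt in to a string dtype for the result; a sketch:

    import pandas as pd

    ser = pd.Series([b"cow", b"123"])

    ser.str.decode("ascii")               # object dtype by default
    ser.str.decode("ascii", dtype="str")  # opt in to the string dtype
    # A non-string dtype such as "int64" raises ValueError per the new check.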
Examples -------- For Series: - >>> ser = pd.Series([b'cow', b'123', b'()']) - >>> ser.str.decode('ascii') + >>> ser = pd.Series([b"cow", b"123", b"()"]) + >>> ser.str.decode("ascii") 0 cow 1 123 2 () dtype: object """ + if dtype is not None and not is_string_dtype(dtype): + raise ValueError(f"dtype must be string or object, got {dtype=}") + if dtype is None and get_option("future.infer_string"): + dtype = "str" # TODO: Add a similar _bytes interface. if encoding in _cpython_optimized_decoders: # CPython optimized implementation @@ -2004,9 +2158,8 @@ def decode(self, encoding, errors: str = "strict"): decoder = codecs.getdecoder(encoding) f = lambda x: decoder(x, errors)[0] arr = self._data.array - # assert isinstance(arr, (StringArray,)) result = arr._str_map(f) - return self._wrap_result(result) + return self._wrap_result(result, dtype=dtype) @forbid_nonstring_types(["bytes"]) def encode(self, encoding, errors: str = "strict"): @@ -2018,16 +2171,24 @@ def encode(self, encoding, errors: str = "strict"): Parameters ---------- encoding : str + Specifies the encoding to be used. errors : str, optional + Specifies the error handling scheme. + Possible values are those supported by :meth:`str.encode`. Returns ------- Series/Index of objects + A Series or Index with strings encoded into bytes. + + See Also + -------- + Series.str.decode : Decodes bytes into strings in a Series/Index. Examples -------- - >>> ser = pd.Series(['cow', '123', '()']) - >>> ser.str.encode(encoding='ascii') + >>> ser = pd.Series(["cow", "123", "()"]) + >>> ser.str.encode(encoding="ascii") 0 b'cow' 1 b'123' 2 b'()' @@ -2036,9 +2197,7 @@ def encode(self, encoding, errors: str = "strict"): result = self._data.array._str_encode(encoding, errors) return self._wrap_result(result, returns_string=False) - _shared_docs[ - "str_strip" - ] = r""" + _shared_docs["str_strip"] = r""" Remove %(position)s characters. Strip whitespaces (including newlines) or a set of specified characters @@ -2056,6 +2215,7 @@ def encode(self, encoding, errors: str = "strict"): Returns ------- Series or Index of object + Series or Index with the strings being stripped from the %(side)s. See Also -------- @@ -2143,9 +2303,7 @@ def rstrip(self, to_strip=None): result = self._data.array._str_rstrip(to_strip) return self._wrap_result(result) - _shared_docs[ - "str_removefix" - ] = r""" + _shared_docs["str_removefix"] = r""" Remove a %(side)s from an object series. If the %(side)s is not present, the original string will be returned. @@ -2208,19 +2366,37 @@ def removesuffix(self, suffix: str): return self._wrap_result(result) @forbid_nonstring_types(["bytes"]) - def wrap(self, width: int, **kwargs): + def wrap( + self, + width: int, + expand_tabs: bool = True, + tabsize: int = 8, + replace_whitespace: bool = True, + drop_whitespace: bool = True, + initial_indent: str = "", + subsequent_indent: str = "", + fix_sentence_endings: bool = False, + break_long_words: bool = True, + break_on_hyphens: bool = True, + max_lines: int | None = None, + placeholder: str = " [...]", + ): r""" Wrap strings in Series/Index at specified line width. This method has the same keyword parameters and defaults as - :class:`textwrap.TextWrapper`. + :class:`textwrap.TextWrapper`. Parameters ---------- - width : int + width : int, optional Maximum line width. expand_tabs : bool, optional If True, tab characters will be expanded to spaces (default: True). 
+ tabsize : int, optional + If expand_tabs is true, then all tab characters in text will be + expanded to zero or more spaces, depending on the current column + and the given tab size (default: 8). replace_whitespace : bool, optional If True, each whitespace character (as defined by string.whitespace) remaining after tab expansion will be replaced by a single space @@ -2228,6 +2404,28 @@ def wrap(self, width: int, **kwargs): drop_whitespace : bool, optional If True, whitespace that, after wrapping, happens to end up at the beginning or end of a line is dropped (default: True). + initial_indent : str, optional + String that will be prepended to the first line of wrapped output. + Counts towards the length of the first line. The empty string is + not indented (default: ''). + subsequent_indent : str, optional + String that will be prepended to all lines of wrapped output except + the first. Counts towards the length of each line except the first + (default: ''). + fix_sentence_endings : bool, optional + If true, TextWrapper attempts to detect sentence endings and ensure + that sentences are always separated by exactly two spaces. This is + generally desired for text in a monospaced font. However, the sentence + detection algorithm is imperfect: it assumes that a sentence ending + consists of a lowercase letter followed by one of '.', '!', or '?', + possibly followed by one of '"' or "'", followed by a space. One + problem with this algorithm is that it is unable to detect the + difference between “Dr.” in `[...] Dr. Frankenstein's monster [...]` + and “Spot.” in `[...] See Spot. See Spot run [...]` + Since the sentence detection algorithm relies on string.lowercase + for the definition of “lowercase letter”, and a convention of using + two spaces after a period to separate sentences on the same line, + it is specific to English-language texts (default: False). break_long_words : bool, optional If True, then words longer than width will be broken in order to ensure that no lines are longer than width. If it is false, long words will @@ -2238,10 +2436,23 @@ def wrap(self, width: int, **kwargs): only whitespaces will be considered as potentially good places for line breaks, but you need to set break_long_words to false if you want truly insecable words (default: True). + max_lines : int, optional + If not None, then the output will contain at most max_lines lines, with + placeholder appearing at the end of the output (default: None). + placeholder : str, optional + String that will appear at the end of the output text if it has been + truncated (default: ' [...]'). Returns ------- Series or Index + A Series or Index where the strings are wrapped at the specified line width. + + See Also + -------- + Series.str.strip : Remove leading and trailing characters in Series/Index. + Series.str.lstrip : Remove leading characters in Series/Index. + Series.str.rstrip : Remove trailing characters in Series/Index. 
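With ``wrap`` now spelling out every :class:`textwrap.TextWrapper` keyword instead of taking ``**kwargs``, the extra knobs can be passed directly; a sketch using ``subsequent_indent`` (behaviour follows ``textwrap``):

    import pandas as pd

    s = pd.Series(["line to be wrapped"])

    s.str.wrap(12)                          # 'line to be\nwrapped'
    s.str.wrap(16, subsequent_indent="  ")  # 'line to be\n  wrapped'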
Notes ----- @@ -2257,17 +2468,34 @@ def wrap(self, width: int, **kwargs): Examples -------- - >>> s = pd.Series(['line to be wrapped', 'another line to be wrapped']) + >>> s = pd.Series(["line to be wrapped", "another line to be wrapped"]) >>> s.str.wrap(12) 0 line to be\nwrapped 1 another line\nto be\nwrapped dtype: object """ - result = self._data.array._str_wrap(width, **kwargs) + result = self._data.array._str_wrap( + width=width, + expand_tabs=expand_tabs, + tabsize=tabsize, + replace_whitespace=replace_whitespace, + drop_whitespace=drop_whitespace, + initial_indent=initial_indent, + subsequent_indent=subsequent_indent, + fix_sentence_endings=fix_sentence_endings, + break_long_words=break_long_words, + break_on_hyphens=break_on_hyphens, + max_lines=max_lines, + placeholder=placeholder, + ) return self._wrap_result(result) @forbid_nonstring_types(["bytes"]) - def get_dummies(self, sep: str = "|"): + def get_dummies( + self, + sep: str = "|", + dtype: NpDtype | None = None, + ): """ Return DataFrame of dummy/indicator variables for Series. @@ -2278,6 +2506,8 @@ def get_dummies(self, sep: str = "|"): ---------- sep : str, default "|" String to split on. + dtype : dtype, default np.int64 + Data type for new columns. Only a single dtype is allowed. Returns ------- @@ -2291,21 +2521,37 @@ def get_dummies(self, sep: str = "|"): Examples -------- - >>> pd.Series(['a|b', 'a', 'a|c']).str.get_dummies() + >>> pd.Series(["a|b", "a", "a|c"]).str.get_dummies() a b c 0 1 1 0 1 1 0 0 2 1 0 1 - >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies() + >>> pd.Series(["a|b", np.nan, "a|c"]).str.get_dummies() a b c 0 1 1 0 1 0 0 0 2 1 0 1 + + >>> pd.Series(["a|b", np.nan, "a|c"]).str.get_dummies(dtype=bool) + a b c + 0 True True False + 1 False False False + 2 True False True """ + from pandas.core.frame import DataFrame + + if dtype is not None and not (is_numeric_dtype(dtype) or is_bool_dtype(dtype)): + raise ValueError("Only numeric or boolean dtypes are supported for 'dtype'") # we need to cast to Series of strings as only that has all # methods available for making the dummies... - result, name = self._data.array._str_get_dummies(sep) + result, name = self._data.array._str_get_dummies(sep, dtype) + if is_extension_array_dtype(dtype): + return self._wrap_result( + DataFrame(result, columns=name, dtype=dtype), + name=name, + returns_string=False, + ) return self._wrap_result( result, name=name, @@ -2318,7 +2564,11 @@ def translate(self, table): """ Map all characters in the string through the given mapping table. - Equivalent to standard :meth:`str.translate`. + This method is equivalent to the standard :meth:`str.translate` + method for strings. It maps each character in the string to a new + character according to the translation table provided. Unmapped + characters are left unchanged, while characters mapped to None + are removed. Parameters ---------- @@ -2331,11 +2581,19 @@ def translate(self, table): Returns ------- Series or Index + A new Series or Index with translated strings. + + See Also + -------- + Series.str.replace : Replace occurrences of pattern/regex in the + Series with some other string. + Index.str.replace : Replace occurrences of pattern/regex in the + Index with some other string. 
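# Sketch of the new `dtype` argument on Series.str.get_dummies from the hunk
# above, assuming a pandas build with this change: only numeric or boolean
# dtypes are accepted, and anything else raises the ValueError added there.
import numpy as np
import pandas as pd

s = pd.Series(["a|b", np.nan, "a|c"])
print(s.str.get_dummies(sep="|", dtype=np.uint8))  # compact integer dummies
try:
    s.str.get_dummies(sep="|", dtype="datetime64[ns]")
except ValueError as err:
    print(err)  # Only numeric or boolean dtypes are supported for 'dtype'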
Examples -------- >>> ser = pd.Series(["El niño", "Françoise"]) - >>> mytable = str.maketrans({'ñ': 'n', 'ç': 'c'}) + >>> mytable = str.maketrans({"ñ": "n", "ç": "c"}) >>> ser.str.translate(mytable) 0 El nino 1 Francoise @@ -2361,8 +2619,6 @@ def count(self, pat, flags: int = 0): flags : int, default 0, meaning no flags Flags for the `re` module. For a complete list, `see here `_. - **kwargs - For compatibility with other string methods. Not used. Returns ------- @@ -2382,8 +2638,8 @@ def count(self, pat, flags: int = 0): Examples -------- - >>> s = pd.Series(['A', 'B', 'Aaba', 'Baca', np.nan, 'CABA', 'cat']) - >>> s.str.count('a') + >>> s = pd.Series(["A", "B", "Aaba", "Baca", np.nan, "CABA", "cat"]) + >>> s.str.count("a") 0 0.0 1 0.0 2 2.0 @@ -2395,8 +2651,8 @@ def count(self, pat, flags: int = 0): Escape ``'$'`` to find the literal dollar sign. - >>> s = pd.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat']) - >>> s.str.count('\\$') + >>> s = pd.Series(["$", "B", "Aab$", "$$ca", "C$B$", "cat"]) + >>> s.str.count("\\$") 0 1 1 0 2 1 @@ -2407,7 +2663,7 @@ def count(self, pat, flags: int = 0): This is also available on Index - >>> pd.Index(['A', 'A', 'Aaba', 'cat']).str.count('a') + >>> pd.Index(["A", "A", "Aaba", "cat"]).str.count("a") Index([0, 0, 2, 1], dtype='int64') """ result = self._data.array._str_count(pat, flags) @@ -2415,7 +2671,7 @@ def count(self, pat, flags: int = 0): @forbid_nonstring_types(["bytes"]) def startswith( - self, pat: str | tuple[str, ...], na: Scalar | None = None + self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default ) -> Series | Index: """ Test if the start of each string element matches a pattern. @@ -2427,10 +2683,11 @@ def startswith( pat : str or tuple[str, ...] Character sequence or tuple of strings. Regular expressions are not accepted. - na : object, default NaN + na : scalar, optional Object shown if element tested is not a string. The default depends on dtype of the array. For object-dtype, ``numpy.nan`` is used. - For ``StringDtype``, ``pandas.NA`` is used. + For the nullable ``StringDtype``, ``pandas.NA`` is used. + For the ``"str"`` dtype, ``False`` is used. Returns ------- @@ -2446,7 +2703,7 @@ def startswith( Examples -------- - >>> s = pd.Series(['bat', 'Bear', 'cat', np.nan]) + >>> s = pd.Series(["bat", "Bear", "cat", np.nan]) >>> s 0 bat 1 Bear @@ -2454,14 +2711,14 @@ def startswith( 3 NaN dtype: object - >>> s.str.startswith('b') + >>> s.str.startswith("b") 0 True 1 False 2 False 3 NaN dtype: object - >>> s.str.startswith(('b', 'B')) + >>> s.str.startswith(("b", "B")) 0 True 1 True 2 False @@ -2470,7 +2727,7 @@ def startswith( Specifying `na` to be `False` instead of `NaN`. - >>> s.str.startswith('b', na=False) + >>> s.str.startswith("b", na=False) 0 True 1 False 2 False @@ -2485,7 +2742,7 @@ def startswith( @forbid_nonstring_types(["bytes"]) def endswith( - self, pat: str | tuple[str, ...], na: Scalar | None = None + self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default ) -> Series | Index: """ Test if the end of each string element matches a pattern. @@ -2497,10 +2754,11 @@ def endswith( pat : str or tuple[str, ...] Character sequence or tuple of strings. Regular expressions are not accepted. - na : object, default NaN + na : scalar, optional Object shown if element tested is not a string. The default depends on dtype of the array. For object-dtype, ``numpy.nan`` is used. - For ``StringDtype``, ``pandas.NA`` is used. + For the nullable ``StringDtype``, ``pandas.NA`` is used. 
+ For the ``"str"`` dtype, ``False`` is used. Returns ------- @@ -2516,7 +2774,7 @@ def endswith( Examples -------- - >>> s = pd.Series(['bat', 'bear', 'caT', np.nan]) + >>> s = pd.Series(["bat", "bear", "caT", np.nan]) >>> s 0 bat 1 bear @@ -2524,14 +2782,14 @@ def endswith( 3 NaN dtype: object - >>> s.str.endswith('t') + >>> s.str.endswith("t") 0 True 1 False 2 False 3 NaN dtype: object - >>> s.str.endswith(('t', 'T')) + >>> s.str.endswith(("t", "T")) 0 True 1 False 2 True @@ -2540,7 +2798,7 @@ def endswith( Specifying `na` to be `False` instead of `NaN`. - >>> s.str.endswith('t', na=False) + >>> s.str.endswith("t", na=False) 0 True 1 False 2 False @@ -2587,11 +2845,11 @@ def findall(self, pat, flags: int = 0): Examples -------- - >>> s = pd.Series(['Lion', 'Monkey', 'Rabbit']) + >>> s = pd.Series(["Lion", "Monkey", "Rabbit"]) The search for the pattern 'Monkey' returns one match: - >>> s.str.findall('Monkey') + >>> s.str.findall("Monkey") 0 [] 1 [Monkey] 2 [] @@ -2600,7 +2858,7 @@ def findall(self, pat, flags: int = 0): On the other hand, the search for the pattern 'MONKEY' doesn't return any match: - >>> s.str.findall('MONKEY') + >>> s.str.findall("MONKEY") 0 [] 1 [] 2 [] @@ -2610,7 +2868,7 @@ def findall(self, pat, flags: int = 0): to find the pattern 'MONKEY' ignoring the case: >>> import re - >>> s.str.findall('MONKEY', flags=re.IGNORECASE) + >>> s.str.findall("MONKEY", flags=re.IGNORECASE) 0 [] 1 [Monkey] 2 [] @@ -2619,7 +2877,7 @@ def findall(self, pat, flags: int = 0): When the pattern matches more than one string in the Series, all matches are returned: - >>> s.str.findall('on') + >>> s.str.findall("on") 0 [on] 1 [on] 2 [] @@ -2628,7 +2886,7 @@ def findall(self, pat, flags: int = 0): Regular expressions are supported too. For instance, the search for all the strings ending with the word 'on' is shown next: - >>> s.str.findall('on$') + >>> s.str.findall("on$") 0 [on] 1 [] 2 [] @@ -2637,7 +2895,7 @@ def findall(self, pat, flags: int = 0): If the pattern is found more than once in the same string, then a list of multiple strings is returned: - >>> s.str.findall('b') + >>> s.str.findall("b") 0 [] 1 [] 2 [b, b] @@ -2690,8 +2948,8 @@ def extract( A pattern with two groups will return a DataFrame with two columns. Non-matches will be NaN. - >>> s = pd.Series(['a1', 'b2', 'c3']) - >>> s.str.extract(r'([ab])(\d)') + >>> s = pd.Series(["a1", "b2", "c3"]) + >>> s.str.extract(r"([ab])(\d)") 0 1 0 a 1 1 b 2 @@ -2699,7 +2957,7 @@ def extract( A pattern may contain optional groups. - >>> s.str.extract(r'([ab])?(\d)') + >>> s.str.extract(r"([ab])?(\d)") 0 1 0 a 1 1 b 2 @@ -2707,7 +2965,7 @@ def extract( Named groups will become column names in the result. - >>> s.str.extract(r'(?P[ab])(?P\d)') + >>> s.str.extract(r"(?P[ab])(?P\d)") letter digit 0 a 1 1 b 2 @@ -2716,7 +2974,7 @@ def extract( A pattern with one group will return a DataFrame with one column if expand=True. - >>> s.str.extract(r'[ab](\d)', expand=True) + >>> s.str.extract(r"[ab](\d)", expand=True) 0 0 1 1 2 @@ -2724,7 +2982,7 @@ def extract( A pattern with one group will return a Series if expand=False. - >>> s.str.extract(r'[ab](\d)', expand=False) + >>> s.str.extract(r"[ab](\d)", expand=False) 0 1 1 2 2 NaN @@ -2852,9 +3110,7 @@ def extractall(self, pat, flags: int = 0) -> DataFrame: # TODO: dispatch return str_extractall(self._orig, pat, flags) - _shared_docs[ - "find" - ] = """ + _shared_docs["find"] = """ Return %(side)s indexes in each strings in the Series/Index. 
Each of returned indexes corresponds to the position where the @@ -2873,6 +3129,9 @@ def extractall(self, pat, flags: int = 0) -> DataFrame: Returns ------- Series or Index of int. + A Series (if the input is a Series) or an Index (if the input is an + Index) of the %(side)s indexes corresponding to the positions where the + substring is found in each string of the input. See Also -------- @@ -2882,9 +3141,9 @@ def extractall(self, pat, flags: int = 0) -> DataFrame: -------- For Series.str.find: - >>> ser = pd.Series(["cow_", "duck_", "do_ve"]) + >>> ser = pd.Series(["_cow_", "duck_", "do_v_e"]) >>> ser.str.find("_") - 0 3 + 0 0 1 4 2 2 dtype: int64 @@ -2949,20 +3208,31 @@ def normalize(self, form): Returns ------- Series/Index of objects + A Series or Index of strings in the same Unicode form specified by `form`. + The returned object retains the same type as the input (Series or Index), + and contains the normalized strings. + + See Also + -------- + Series.str.upper : Convert all characters in each string to uppercase. + Series.str.lower : Convert all characters in each string to lowercase. + Series.str.title : Convert each string to title case (capitalizing the + first letter of each word). + Series.str.strip : Remove leading and trailing whitespace from each string. + Series.str.replace : Replace occurrences of a substring with another substring + in each string. Examples -------- - >>> ser = pd.Series(['ñ']) - >>> ser.str.normalize('NFC') == ser.str.normalize('NFD') + >>> ser = pd.Series(["ñ"]) + >>> ser.str.normalize("NFC") == ser.str.normalize("NFD") 0 False dtype: bool """ result = self._data.array._str_normalize(form) return self._wrap_result(result) - _shared_docs[ - "index" - ] = """ + _shared_docs["index"] = """ Return %(side)s indexes in each string in Series/Index. Each of the returned indexes corresponds to the position where the @@ -2983,6 +3253,8 @@ def normalize(self, form): Returns ------- Series or Index of object + Returns a Series or an Index of the %(side)s indexes + in each string of the input. See Also -------- @@ -3068,12 +3340,9 @@ def len(self): Returns the length (number of characters) in a string. Returns the number of entries for dictionaries, lists or tuples. - >>> s = pd.Series(['dog', - ... '', - ... 5, - ... {'foo' : 'bar'}, - ... [2, 3, 5, 7], - ... ('one', 'two', 'three')]) + >>> s = pd.Series( + ... ["dog", "", 5, {"foo": "bar"}, [2, 3, 5, 7], ("one", "two", "three")] + ... ) >>> s 0 dog 1 @@ -3094,16 +3363,15 @@ def len(self): result = self._data.array._str_len() return self._wrap_result(result, returns_string=False) - _shared_docs[ - "casemethods" - ] = """ + _shared_docs["casemethods"] = """ Convert strings in the Series/Index to %(type)s. %(version)s Equivalent to :meth:`str.%(method)s`. Returns ------- - Series or Index of object + Series or Index of objects + A Series or Index where the strings are modified by :meth:`str.%(method)s`. 
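# Why ser.str.normalize("NFC") != ser.str.normalize("NFD") in the example
# above: NFC composes "ñ" into a single code point while NFD splits it into
# "n" plus a combining tilde. Pure-stdlib illustration of what
# Series.str.normalize maps over each element.
import unicodedata

nfc = unicodedata.normalize("NFC", "ñ")
nfd = unicodedata.normalize("NFD", "ñ")
print(len(nfc), len(nfd))  # 1 2
print(nfc == nfd)          # False, despite rendering identically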
See Also -------- @@ -3166,7 +3434,8 @@ def len(self): # cases: # upper, lower, title, capitalize, swapcase, casefold # boolean: - # isalpha, isnumeric isalnum isdigit isdecimal isspace islower isupper istitle + # isalpha, isnumeric isalnum isdigit isdecimal isspace islower + # isupper istitle isascii # _doc_args holds dict of strings to use in substituting casemethod docs _doc_args: dict[str, dict[str, str]] = {} _doc_args["lower"] = {"type": "lowercase", "method": "lower", "version": ""} @@ -3224,9 +3493,7 @@ def casefold(self): result = self._data.array._str_casefold() return self._wrap_result(result) - _shared_docs[ - "ismethods" - ] = """ + _shared_docs["ismethods"] = """ Check whether all characters in each string are %(type)s. This is equivalent to running the Python string method :meth:`str.%(method)s` for each element of the Series/Index. If a string has zero characters, ``False`` is returned for that check. Returns ------- Series or Index of bool Series or Index of boolean values with the same length as the original Series/Index. - + """ + _shared_docs["isalpha"] = """ See Also -------- - Series.str.isalpha : Check whether all characters are alphabetic. Series.str.isnumeric : Check whether all characters are numeric. Series.str.isalnum : Check whether all characters are alphanumeric. Series.str.isdigit : Check whether all characters are digits. Series.str.isdecimal : Check whether all characters are decimal. Series.str.isspace : Check whether all characters are whitespace. Series.str.islower : Check whether all characters are lowercase. + Series.str.isascii : Check whether all characters are ascii. Series.str.isupper : Check whether all characters are uppercase. Series.str.istitle : Check whether all characters are titlecase. Examples -------- - **Checks for Alphabetic and Numeric Characters** >>> s1 = pd.Series(['one', 'one1', '1', '']) - >>> s1.str.isalpha() 0 True 1 False 2 False 3 False dtype: bool + """ + _shared_docs["isnumeric"] = """ + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. + Series.str.isalnum : Check whether all characters are alphanumeric. + Series.str.isdigit : Check whether all characters are digits. + Series.str.isdecimal : Check whether all characters are decimal. + Series.str.isspace : Check whether all characters are whitespace. + Series.str.islower : Check whether all characters are lowercase. + Series.str.isascii : Check whether all characters are ascii. + Series.str.isupper : Check whether all characters are uppercase. + Series.str.istitle : Check whether all characters are titlecase. + Examples + -------- + The ``s1.str.isnumeric`` method is the same as ``s1.str.isdigit`` but + also includes other characters that can represent quantities such as + unicode fractions. + + >>> s1 = pd.Series(['one', 'one1', '1', '', '³', '⅕']) >>> s1.str.isnumeric() 0 False 1 False 2 True 3 False + 4 True + 5 True dtype: bool + For a string to be considered numeric, all its characters must have a Unicode + numeric property matching :py:meth:`str.isnumeric`. As a consequence, + the following cases are **not** recognized as numeric: + + - **Decimal numbers** (e.g., "1.1"): due to period ``"."`` + - **Negative numbers** (e.g., "-5"): due to minus sign ``"-"`` + - **Scientific notation** (e.g., "1e3"): due to characters like ``"e"`` + + >>> s2 = pd.Series(["1.1", "-5", "1e3"]) + >>> s2.str.isnumeric() + 0 False + 1 False + 2 False + dtype: bool + """ + _shared_docs["isalnum"] = """ + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. + Series.str.isnumeric : Check whether all characters are numeric.
+ Series.str.isdigit : Check whether all characters are digits. + Series.str.isdecimal : Check whether all characters are decimal. + Series.str.isspace : Check whether all characters are whitespace. + Series.str.islower : Check whether all characters are lowercase. + Series.str.isascii : Check whether all characters are ascii. + Series.str.isupper : Check whether all characters are uppercase. + Series.str.istitle : Check whether all characters are titlecase. + + Examples + -------- + >>> s1 = pd.Series(['one', 'one1', '1', '']) >>> s1.str.isalnum() 0 True 1 True @@ -3287,47 +3606,75 @@ def casefold(self): 1 False 2 False dtype: bool + """ + _shared_docs["isdecimal"] = """ + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. + Series.str.isnumeric : Check whether all characters are numeric. + Series.str.isalnum : Check whether all characters are alphanumeric. + Series.str.isdigit : Check whether all characters are digits. + Series.str.isspace : Check whether all characters are whitespace. + Series.str.islower : Check whether all characters are lowercase. + Series.str.isascii : Check whether all characters are ascii. + Series.str.isupper : Check whether all characters are uppercase. + Series.str.istitle : Check whether all characters are titlecase. - **More Detailed Checks for Numeric Characters** - - There are several different but overlapping sets of numeric characters that - can be checked for. + Examples + -------- + The ``s3.str.isdecimal`` method checks for characters used to form + numbers in base 10. >>> s3 = pd.Series(['23', '³', '⅕', '']) - - The ``s3.str.isdecimal`` method checks for characters used to form numbers - in base 10. - >>> s3.str.isdecimal() 0 True 1 False 2 False 3 False dtype: bool + """ + _shared_docs["isdigit"] = """ + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. + Series.str.isnumeric : Check whether all characters are numeric. + Series.str.isalnum : Check whether all characters are alphanumeric. + Series.str.isdecimal : Check whether all characters are decimal. + Series.str.isspace : Check whether all characters are whitespace. + Series.str.islower : Check whether all characters are lowercase. + Series.str.isascii : Check whether all characters are ascii. + Series.str.isupper : Check whether all characters are uppercase. + Series.str.istitle : Check whether all characters are titlecase. - The ``s.str.isdigit`` method is the same as ``s3.str.isdecimal`` but also - includes special digits, like superscripted and subscripted digits in - unicode. + Examples + -------- + Similar to ``str.isdecimal`` but also includes special digits, like + superscripted and subscripted digits in unicode. + >>> s3 = pd.Series(['23', '³', '⅕', '']) >>> s3.str.isdigit() 0 True 1 True 2 False 3 False dtype: bool + """ - The ``s.str.isnumeric`` method is the same as ``s3.str.isdigit`` but also - includes other characters that can represent quantities such as unicode - fractions. - - >>> s3.str.isnumeric() - 0 True - 1 True - 2 True - 3 False - dtype: bool + _shared_docs["isspace"] = """ + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. + Series.str.isnumeric : Check whether all characters are numeric. + Series.str.isalnum : Check whether all characters are alphanumeric. + Series.str.isdigit : Check whether all characters are digits. + Series.str.isdecimal : Check whether all characters are decimal. + Series.str.islower : Check whether all characters are lowercase. 
+ Series.str.isascii : Check whether all characters are ascii. + Series.str.isupper : Check whether all characters are uppercase. + Series.str.istitle : Check whether all characters are titlecase. - **Checks for Whitespace** + Examples + -------- >>> s4 = pd.Series([' ', '\\t\\r\\n ', '']) >>> s4.str.isspace() @@ -3335,30 +3682,77 @@ def casefold(self): 1 True 2 False dtype: bool + """ + _shared_docs["islower"] = """ + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. + Series.str.isnumeric : Check whether all characters are numeric. + Series.str.isalnum : Check whether all characters are alphanumeric. + Series.str.isdigit : Check whether all characters are digits. + Series.str.isdecimal : Check whether all characters are decimal. + Series.str.isspace : Check whether all characters are whitespace. + Series.str.isascii : Check whether all characters are ascii. + Series.str.isupper : Check whether all characters are uppercase. + Series.str.istitle : Check whether all characters are titlecase. - **Checks for Character Case** + Examples + -------- >>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', '']) - >>> s5.str.islower() 0 True 1 False 2 False 3 False dtype: bool + """ + _shared_docs["isupper"] = """ + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. + Series.str.isnumeric : Check whether all characters are numeric. + Series.str.isalnum : Check whether all characters are alphanumeric. + Series.str.isdigit : Check whether all characters are digits. + Series.str.isdecimal : Check whether all characters are decimal. + Series.str.isspace : Check whether all characters are whitespace. + Series.str.islower : Check whether all characters are lowercase. + Series.str.isascii : Check whether all characters are ascii. + Series.str.istitle : Check whether all characters are titlecase. + + Examples + -------- + + >>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', '']) >>> s5.str.isupper() 0 False 1 False 2 True 3 False dtype: bool + """ + _shared_docs["istitle"] = """ + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. + Series.str.isnumeric : Check whether all characters are numeric. + Series.str.isalnum : Check whether all characters are alphanumeric. + Series.str.isdigit : Check whether all characters are digits. + Series.str.isdecimal : Check whether all characters are decimal. + Series.str.isspace : Check whether all characters are whitespace. + Series.str.islower : Check whether all characters are lowercase. + Series.str.isascii : Check whether all characters are ascii. + Series.str.isupper : Check whether all characters are uppercase. + Examples + -------- The ``s5.str.istitle`` method checks for whether all words are in title case (whether only the first letter of each word is capitalized). Words are assumed to be as any sequence of non-numeric characters separated by whitespace characters. + >>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', '']) >>> s5.str.istitle() 0 False 1 True @@ -3366,11 +3760,40 @@ def casefold(self): 3 False dtype: bool """ + _shared_docs["isascii"] = """ + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. + Series.str.isnumeric : Check whether all characters are numeric. + Series.str.isalnum : Check whether all characters are alphanumeric. + Series.str.isdigit : Check whether all characters are digits. + Series.str.isdecimal : Check whether all characters are decimal. 
+ Series.str.isspace : Check whether all characters are whitespace. + Series.str.islower : Check whether all characters are lowercase. + Series.str.istitle : Check whether all characters are titlecase. + Series.str.isupper : Check whether all characters are uppercase. + + Examples + -------- + The ``s5.str.isascii`` method checks whether all characters are ascii + characters, which includes digits 0-9, capital and lowercase letters A-Z, + and some other special characters. + + >>> s5 = pd.Series(['ö', 'see123', 'hello world', '']) + >>> s5.str.isascii() + 0 False + 1 True + 2 True + 3 True + dtype: bool + """ + _doc_args["isalnum"] = {"type": "alphanumeric", "method": "isalnum"} _doc_args["isalpha"] = {"type": "alphabetic", "method": "isalpha"} _doc_args["isdigit"] = {"type": "digits", "method": "isdigit"} _doc_args["isspace"] = {"type": "whitespace", "method": "isspace"} _doc_args["islower"] = {"type": "lowercase", "method": "islower"} + _doc_args["isascii"] = {"type": "ascii", "method": "isascii"} _doc_args["isupper"] = {"type": "uppercase", "method": "isupper"} _doc_args["istitle"] = {"type": "titlecase", "method": "istitle"} _doc_args["isnumeric"] = {"type": "numeric", "method": "isnumeric"} @@ -3378,31 +3801,54 @@ def casefold(self): # force _noarg_wrapper return type with dtype=np.dtype(bool) (GH 29624) isalnum = _map_and_wrap( - "isalnum", docstring=_shared_docs["ismethods"] % _doc_args["isalnum"] + "isalnum", + docstring=_shared_docs["ismethods"] % _doc_args["isalnum"] + + _shared_docs["isalnum"], ) isalpha = _map_and_wrap( - "isalpha", docstring=_shared_docs["ismethods"] % _doc_args["isalpha"] + "isalpha", + docstring=_shared_docs["ismethods"] % _doc_args["isalpha"] + + _shared_docs["isalpha"], ) isdigit = _map_and_wrap( - "isdigit", docstring=_shared_docs["ismethods"] % _doc_args["isdigit"] + "isdigit", + docstring=_shared_docs["ismethods"] % _doc_args["isdigit"] + + _shared_docs["isdigit"], ) isspace = _map_and_wrap( - "isspace", docstring=_shared_docs["ismethods"] % _doc_args["isspace"] + "isspace", + docstring=_shared_docs["ismethods"] % _doc_args["isspace"] + + _shared_docs["isspace"], ) islower = _map_and_wrap( - "islower", docstring=_shared_docs["ismethods"] % _doc_args["islower"] + "islower", + docstring=_shared_docs["ismethods"] % _doc_args["islower"] + + _shared_docs["islower"], + ) + isascii = _map_and_wrap( + "isascii", + docstring=_shared_docs["ismethods"] % _doc_args["isascii"] + + _shared_docs["isascii"], ) isupper = _map_and_wrap( - "isupper", docstring=_shared_docs["ismethods"] % _doc_args["isupper"] + "isupper", + docstring=_shared_docs["ismethods"] % _doc_args["isupper"] + + _shared_docs["isupper"], ) istitle = _map_and_wrap( - "istitle", docstring=_shared_docs["ismethods"] % _doc_args["istitle"] + "istitle", + docstring=_shared_docs["ismethods"] % _doc_args["istitle"] + + _shared_docs["istitle"], ) isnumeric = _map_and_wrap( - "isnumeric", docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"] + "isnumeric", + docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"] + + _shared_docs["isnumeric"], ) isdecimal = _map_and_wrap( - "isdecimal", docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"] + "isdecimal", + docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"] + + _shared_docs["isdecimal"], ) @@ -3487,7 +3933,7 @@ def _get_single_group_name(regex: re.Pattern) -> Hashable: return None -def _get_group_names(regex: re.Pattern) -> list[Hashable]: +def _get_group_names(regex: re.Pattern) -> list[Hashable] | range: """ Get named
groups from compiled regex. @@ -3501,8 +3947,15 @@ def _get_group_names(regex: re.Pattern) -> list[Hashable]: ------- list of column labels """ + rng = range(regex.groups) names = {v: k for k, v in regex.groupindex.items()} - return [names.get(1 + i, i) for i in range(regex.groups)] + if not names: + return rng + result: list[Hashable] = [names.get(1 + i, i) for i in rng] + arr = np.array(result) + if arr.dtype.kind == "i" and lib.is_range_indexer(arr, len(arr)): + return rng + return result def str_extractall(arr, pat, flags: int = 0) -> DataFrame: diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py index 96b0352666b41..78c4f3acbe1aa 100644 --- a/pandas/core/strings/base.py +++ b/pandas/core/strings/base.py @@ -3,19 +3,23 @@ import abc from typing import ( TYPE_CHECKING, - Callable, Literal, ) -import numpy as np +from pandas._libs import lib if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import ( + Callable, + Sequence, + ) import re - from pandas._typing import Scalar - - from pandas import Series + from pandas._typing import ( + NpDtype, + Scalar, + Self, + ) class BaseStringArrayMethods(abc.ABC): @@ -85,7 +89,11 @@ def _str_repeat(self, repeats: int | Sequence[int]): @abc.abstractmethod def _str_match( - self, pat: str, case: bool = True, flags: int = 0, na: Scalar = np.nan + self, + pat: str, + case: bool = True, + flags: int = 0, + na: Scalar | lib.NoDefault = lib.no_default, ): pass @@ -95,7 +103,7 @@ def _str_fullmatch( pat: str | re.Pattern, case: bool = True, flags: int = 0, - na: Scalar = np.nan, + na: Scalar | lib.NoDefault = lib.no_default, ): pass @@ -160,7 +168,7 @@ def _str_wrap(self, width: int, **kwargs): pass @abc.abstractmethod - def _str_get_dummies(self, sep: str = "|"): + def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): pass @abc.abstractmethod @@ -171,6 +179,10 @@ def _str_isalnum(self): def _str_isalpha(self): pass + @abc.abstractmethod + def _str_isascii(self): + pass + @abc.abstractmethod def _str_isdecimal(self): pass @@ -240,11 +252,11 @@ def _str_rstrip(self, to_strip=None): pass @abc.abstractmethod - def _str_removeprefix(self, prefix: str) -> Series: + def _str_removeprefix(self, prefix: str) -> Self: pass @abc.abstractmethod - def _str_removesuffix(self, suffix: str) -> Series: + def _str_removesuffix(self, suffix: str) -> Self: pass @abc.abstractmethod diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 0029beccc40a8..0adb7b51cf2b7 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -5,46 +5,51 @@ import textwrap from typing import ( TYPE_CHECKING, - Callable, Literal, cast, ) import unicodedata +import warnings import numpy as np from pandas._libs import lib import pandas._libs.missing as libmissing import pandas._libs.ops as libops +from pandas.util._exceptions import find_stack_level +from pandas.core.dtypes.common import pandas_dtype from pandas.core.dtypes.missing import isna from pandas.core.strings.base import BaseStringArrayMethods if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import ( + Callable, + Sequence, + ) from pandas._typing import ( NpDtype, Scalar, ) - from pandas import Series - class ObjectStringArrayMixin(BaseStringArrayMethods): """ String Methods operating on object-dtype ndarrays. """ - _str_na_value = np.nan - def __len__(self) -> int: # For typing, _str_map relies on the object being sized. 
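# Standalone sketch of the _get_group_names change in the hunk above: with no
# named groups the column labels collapse to a cheap range(); named groups are
# filled in positionally. Simplified mirror of the diff (the real version also
# collapses an all-positional result back to range via lib.is_range_indexer).
import re

def group_names(regex: re.Pattern):
    rng = range(regex.groups)
    names = {v: k for k, v in regex.groupindex.items()}
    if not names:
        return rng
    return [names.get(1 + i, i) for i in rng]

print(group_names(re.compile(r"(\d)(\w)")))           # range(0, 2)
print(group_names(re.compile(r"(?P<digit>\d)(\w)")))  # ['digit', 1]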
raise NotImplementedError def _str_map( - self, f, na_value=None, dtype: NpDtype | None = None, convert: bool = True + self, + f, + na_value=lib.no_default, + dtype: NpDtype | None = None, + convert: bool = True, ): """ Map a callable over valid elements of the array. @@ -56,7 +61,7 @@ def _str_map( na_value : Scalar, optional The value to set for NA values. Might also be used for the fill value if the callable `f` raises an exception. - This defaults to ``self._str_na_value`` which is ``np.nan`` + This defaults to ``self.dtype.na_value`` which is ``np.nan`` for object-dtype and Categorical and ``pd.NA`` for StringArray. dtype : Dtype, optional The dtype of the result array. @@ -65,8 +70,8 @@ def _str_map( """ if dtype is None: dtype = np.dtype("object") - if na_value is None: - na_value = self._str_na_value + if na_value is lib.no_default: + na_value = self.dtype.na_value # type: ignore[attr-defined] if not len(self): return np.array([], dtype=dtype) @@ -75,7 +80,9 @@ def _str_map( mask = isna(arr) map_convert = convert and not np.all(mask) try: - result = lib.map_infer_mask(arr, f, mask.view(np.uint8), map_convert) + result = lib.map_infer_mask( + arr, f, mask.view(np.uint8), convert=map_convert + ) except (TypeError, AttributeError) as err: # Reraise the exception if callable `f` got wrong number of args. # The user may want to be warned by this, instead of getting NaN @@ -127,7 +134,12 @@ def _str_pad( return self._str_map(f) def _str_contains( - self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True + self, + pat, + case: bool = True, + flags: int = 0, + na=lib.no_default, + regex: bool = True, ): if regex: if not case: @@ -142,14 +154,38 @@ def _str_contains( else: upper_pat = pat.upper() f = lambda x: upper_pat in x.upper() + if na is not lib.no_default and not isna(na) and not isinstance(na, bool): + # GH#59561 + warnings.warn( + "Allowing a non-bool 'na' in obj.str.contains is deprecated " + "and will raise in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) return self._str_map(f, na, dtype=np.dtype("bool")) - def _str_startswith(self, pat, na=None): + def _str_startswith(self, pat, na=lib.no_default): f = lambda x: x.startswith(pat) + if na is not lib.no_default and not isna(na) and not isinstance(na, bool): + # GH#59561 + warnings.warn( + "Allowing a non-bool 'na' in obj.str.startswith is deprecated " + "and will raise in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) return self._str_map(f, na_value=na, dtype=np.dtype(bool)) - def _str_endswith(self, pat, na=None): + def _str_endswith(self, pat, na=lib.no_default): f = lambda x: x.endswith(pat) + if na is not lib.no_default and not isna(na) and not isinstance(na, bool): + # GH#59561 + warnings.warn( + "Allowing a non-bool 'na' in obj.str.endswith is deprecated " + "and will raise in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) return self._str_map(f, na_value=na, dtype=np.dtype(bool)) def _str_replace( @@ -205,13 +241,17 @@ def rep(x, r): np.asarray(repeats, dtype=object), rep, ) - if isinstance(self, BaseStringArray): - # Not going through map, so we have to do this here. - result = type(self)._from_sequence(result, dtype=self.dtype) - return result + if not isinstance(self, BaseStringArray): + return result + # Not going through map, so we have to do this here. 
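# Standalone re-statement of the GH#59561 check added above for contains/
# startswith/endswith: a non-bool, non-missing `na` now triggers a
# FutureWarning. None stands in for the lib.no_default sentinel used in the
# diff; this is a sketch of the condition, not the pandas-internal code path.
import warnings
import pandas as pd

def warn_on_non_bool_na(na) -> None:
    if na is not None and not pd.isna(na) and not isinstance(na, bool):
        warnings.warn(
            "Allowing a non-bool 'na' is deprecated and will raise "
            "in a future version.",
            FutureWarning,
        )

warn_on_non_bool_na(False)      # fine: bool
warn_on_non_bool_na(pd.NA)      # fine: missing-value sentinel
warn_on_non_bool_na("missing")  # warns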
+ return type(self)._from_sequence(result, dtype=self.dtype) def _str_match( - self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None + self, + pat: str, + case: bool = True, + flags: int = 0, + na: Scalar | lib.NoDefault = lib.no_default, ): if not case: flags |= re.IGNORECASE @@ -226,7 +266,7 @@ def _str_fullmatch( pat: str | re.Pattern, case: bool = True, flags: int = 0, - na: Scalar | None = None, + na: Scalar | lib.NoDefault = lib.no_default, ): if not case: flags |= re.IGNORECASE @@ -270,7 +310,7 @@ def f(x): return x.get(i) elif len(x) > i >= -len(x): return x[i] - return self._str_na_value + return self.dtype.na_value # type: ignore[attr-defined] return self._str_map(f) @@ -372,9 +412,11 @@ def _str_wrap(self, width: int, **kwargs): tw = textwrap.TextWrapper(**kwargs) return self._str_map(lambda s: "\n".join(tw.wrap(s))) - def _str_get_dummies(self, sep: str = "|"): + def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): from pandas import Series + if dtype is None: + dtype = np.int64 arr = Series(self).fillna("") try: arr = sep + arr + sep @@ -386,7 +428,13 @@ def _str_get_dummies(self, sep: str = "|"): tags.update(ts) tags2 = sorted(tags - {""}) - dummies = np.empty((len(arr), len(tags2)), dtype=np.int64) + _dtype = pandas_dtype(dtype) + dummies_dtype: NpDtype + if isinstance(_dtype, np.dtype): + dummies_dtype = _dtype + else: + dummies_dtype = np.bool_ + dummies = np.empty((len(arr), len(tags2)), dtype=dummies_dtype, order="F") def _isin(test_elements: str, element: str) -> bool: return element in test_elements @@ -407,6 +455,9 @@ def _str_isalnum(self): def _str_isalpha(self): return self._str_map(str.isalpha, dtype="bool") + def _str_isascii(self): + return self._str_map(str.isascii, dtype="bool") + def _str_isdecimal(self): return self._str_map(str.isdecimal, dtype="bool") @@ -456,24 +507,15 @@ def _str_lstrip(self, to_strip=None): def _str_rstrip(self, to_strip=None): return self._str_map(lambda x: x.rstrip(to_strip)) - def _str_removeprefix(self, prefix: str) -> Series: - # outstanding question on whether to use native methods for users on Python 3.9+ - # https://github.com/pandas-dev/pandas/pull/39226#issuecomment-836719770, - # in which case we could do return self._str_map(str.removeprefix) - - def removeprefix(text: str) -> str: - if text.startswith(prefix): - return text[len(prefix) :] - return text - - return self._str_map(removeprefix) + def _str_removeprefix(self, prefix: str): + return self._str_map(lambda x: x.removeprefix(prefix)) - def _str_removesuffix(self, suffix: str) -> Series: + def _str_removesuffix(self, suffix: str): return self._str_map(lambda x: x.removesuffix(suffix)) def _str_extract(self, pat: str, flags: int = 0, expand: bool = True): regex = re.compile(pat, flags=flags) - na_value = self._str_na_value + na_value = self.dtype.na_value # type: ignore[attr-defined] if not expand: diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 05262c235568d..0a10001a3113f 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -6,7 +6,6 @@ from itertools import islice from typing import ( TYPE_CHECKING, - Callable, TypedDict, Union, cast, @@ -29,6 +28,7 @@ timezones as libtimezones, ) from pandas._libs.tslibs.conversion import cast_from_unit_vectorized +from pandas._libs.tslibs.dtypes import NpyDatetimeUnit from pandas._libs.tslibs.parsing import ( DateParseError, guess_datetime_format, @@ -44,6 +44,7 @@ from pandas.core.dtypes.common import ( ensure_object, 
is_float, + is_float_dtype, is_integer, is_integer_dtype, is_list_like, @@ -76,7 +77,10 @@ from pandas.core.indexes.datetimes import DatetimeIndex if TYPE_CHECKING: - from collections.abc import Hashable + from collections.abc import ( + Callable, + Hashable, + ) from pandas._libs.tslibs.nattype import NaTType from pandas._libs.tslibs.timedeltas import UnitChoices @@ -126,7 +130,7 @@ class FulldatetimeDict(YearMonthDayDict, total=False): def _guess_datetime_format_for_array(arr, dayfirst: bool | None = False) -> str | None: # Try to guess the format based on the first non-NaN element, return None if can't if (first_non_null := tslib.first_non_null(arr)) != -1: - if type(first_non_nan_element := arr[first_non_null]) is str: # noqa: E721 + if type(first_non_nan_element := arr[first_non_null]) is str: # GH#32264 np.str_ object guessed_format = guess_datetime_format( first_non_nan_element, dayfirst=dayfirst @@ -188,9 +192,9 @@ def should_cache( else: check_count = 500 else: - assert ( - 0 <= check_count <= len(arg) - ), "check_count must be in next bounds: [0; len(arg)]" + assert 0 <= check_count <= len(arg), ( + "check_count must be in next bounds: [0; len(arg)]" + ) if check_count == 0: return False @@ -337,7 +341,7 @@ def _convert_listlike_datetimes( unit : str None or string of the frequency of the passed data errors : str - error handing behaviors from to_datetime, 'raise', 'coerce', 'ignore' + error handling behaviors from to_datetime, 'raise', 'coerce' dayfirst : bool dayfirst parsing behavior from to_datetime yearfirst : bool @@ -387,7 +391,6 @@ def _convert_listlike_datetimes( if not is_supported_dtype(arg_dtype): # We go to closest supported reso, i.e. "s" arg = astype_overflowsafe( - # TODO: looks like we incorrectly raise with errors=="ignore" np.asarray(arg), np.dtype("M8[s]"), is_coerce=errors == "coerce", @@ -416,11 +419,8 @@ def _convert_listlike_datetimes( arg, _ = maybe_convert_dtype(arg, copy=False, tz=libtimezones.maybe_get_tz(tz)) except TypeError: if errors == "coerce": - npvalues = np.array(["NaT"], dtype="datetime64[ns]").repeat(len(arg)) + npvalues = np.full(len(arg), np.datetime64("NaT", "ns")) return DatetimeIndex(npvalues, name=name) - elif errors == "ignore": - idx = Index(arg, name=name) - return idx raise arg = ensure_object(arg) @@ -445,7 +445,7 @@ def _convert_listlike_datetimes( # We can take a shortcut since the datetime64 numpy array # is in UTC out_unit = np.datetime_data(result.dtype)[0] - dtype = cast(DatetimeTZDtype, tz_to_dtype(tz_parsed, out_unit)) + dtype = tz_to_dtype(tz_parsed, out_unit) dt64_values = result.view(f"M8[{dtype.unit}]") dta = DatetimeArray._simple_new(dt64_values, dtype=dtype) return DatetimeIndex._simple_new(dta, name=name) @@ -485,7 +485,7 @@ def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index: """ arg = extract_array(arg, extract_numpy=True) - # GH#30050 pass an ndarray to tslib.array_with_unit_to_datetime + # GH#30050 pass an ndarray to tslib.array_to_datetime # because it expects an ndarray argument if isinstance(arg, IntegerArray): arr = arg.astype(f"datetime64[{unit}]") @@ -510,27 +510,28 @@ def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index: with np.errstate(over="raise"): try: arr = cast_from_unit_vectorized(arg, unit=unit) - except OutOfBoundsDatetime: + except OutOfBoundsDatetime as err: if errors != "raise": return _to_datetime_with_unit( arg.astype(object), unit, name, utc, errors ) raise OutOfBoundsDatetime( f"cannot convert input with unit '{unit}'" - ) + ) from err arr =
arr.view("M8[ns]") tz_parsed = None else: arg = arg.astype(object, copy=False) - arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors) - - if errors == "ignore": - # Index constructor _may_ infer to DatetimeIndex - result = Index._with_infer(arr, name=name) - else: - result = DatetimeIndex(arr, name=name) + arr, tz_parsed = tslib.array_to_datetime( + arg, + utc=utc, + errors=errors, + unit_for_numerics=unit, + creso=NpyDatetimeUnit.NPY_FR_ns.value, + ) + result = DatetimeIndex(arr, name=name) if not isinstance(result, DatetimeIndex): return result @@ -629,11 +630,9 @@ def to_datetime( format: str | None = ..., exact: bool = ..., unit: str | None = ..., - infer_datetime_format: bool = ..., origin=..., cache: bool = ..., -) -> Timestamp: - ... +) -> Timestamp: ... @overload @@ -646,11 +645,9 @@ def to_datetime( format: str | None = ..., exact: bool = ..., unit: str | None = ..., - infer_datetime_format: bool = ..., origin=..., cache: bool = ..., -) -> Series: - ... +) -> Series: ... @overload @@ -663,11 +660,9 @@ def to_datetime( format: str | None = ..., exact: bool = ..., unit: str | None = ..., - infer_datetime_format: bool = ..., origin=..., cache: bool = ..., -) -> DatetimeIndex: - ... +) -> DatetimeIndex: ... def to_datetime( @@ -679,7 +674,6 @@ def to_datetime( format: str | None = None, exact: bool | lib.NoDefault = lib.no_default, unit: str | None = None, - infer_datetime_format: lib.NoDefault | bool = lib.no_default, origin: str = "unix", cache: bool = True, ) -> DatetimeIndex | Series | DatetimeScalar | NaTType | None: @@ -696,10 +690,9 @@ def to_datetime( method expects minimally the following columns: :const:`"year"`, :const:`"month"`, :const:`"day"`. The column "year" must be specified in 4-digit format. - errors : {'ignore', 'raise', 'coerce'}, default 'raise' + errors : {'raise', 'coerce'}, default 'raise' - If :const:`'raise'`, then invalid parsing will raise an exception. - If :const:`'coerce'`, then invalid parsing will be set as :const:`NaT`. - - If :const:`'ignore'`, then invalid parsing will return the input. dayfirst : bool, default False Specify a date parse order if `arg` is str or is list-like. If :const:`True`, parses dates with the day first, e.g. :const:`"10/11/12"` @@ -737,14 +730,6 @@ def to_datetime( offsets (typically, daylight savings), see :ref:`Examples ` section for details. - .. warning:: - - In a future version of pandas, parsing datetimes with mixed time - zones will raise an error unless `utc=True`. - Please specify `utc=True` to opt in to the new behaviour - and silence this warning. To create a `Series` with mixed offsets and - `object` dtype, please use `apply` and `datetime.datetime.strptime`. - See also: pandas general documentation about `timezone conversion and localization >> df = pd.DataFrame({'year': [2015, 2016], - ... 'month': [2, 3], - ... 'day': [4, 5]}) + >>> df = pd.DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]}) >>> pd.to_datetime(df) 0 2015-02-04 1 2016-03-05 - dtype: datetime64[ns] + dtype: datetime64[s] Using a unix epoch time - >>> pd.to_datetime(1490195805, unit='s') + >>> pd.to_datetime(1490195805, unit="s") Timestamp('2017-03-22 15:16:45') - >>> pd.to_datetime(1490195805433502912, unit='ns') + >>> pd.to_datetime(1490195805433502912, unit="ns") Timestamp('2017-03-22 15:16:45.433502912') .. warning:: For float arg, precision rounding might happen. To prevent @@ -917,8 +892,7 @@ def to_datetime( Using a non-unix epoch origin - >>> pd.to_datetime([1, 2, 3], unit='D', - ... 
origin=pd.Timestamp('1960-01-01')) + >>> pd.to_datetime([1, 2, 3], unit="D", origin=pd.Timestamp("1960-01-01")) DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'], dtype='datetime64[ns]', freq=None) @@ -926,8 +900,7 @@ def to_datetime( :const:`"%f"` will parse all the way up to nanoseconds. - >>> pd.to_datetime('2018-10-26 12:00:00.0000000011', - ... format='%Y-%m-%d %H:%M:%S.%f') + >>> pd.to_datetime("2018-10-26 12:00:00.0000000011", format="%Y-%m-%d %H:%M:%S.%f") Timestamp('2018-10-26 12:00:00.000000001') **Non-convertible date/times** @@ -935,7 +908,7 @@ def to_datetime( Passing ``errors='coerce'`` will force an out-of-bounds date to :const:`NaT`, in addition to forcing non-dates (or non-parseable dates) to :const:`NaT`. - >>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce') + >>> pd.to_datetime("invalid for Ymd", format="%Y%m%d", errors="coerce") NaT .. _to_datetime_tz_examples: @@ -946,47 +919,48 @@ def to_datetime( - Timezone-naive inputs are converted to timezone-naive :class:`DatetimeIndex`: - >>> pd.to_datetime(['2018-10-26 12:00:00', '2018-10-26 13:00:15']) + >>> pd.to_datetime(["2018-10-26 12:00:00", "2018-10-26 13:00:15"]) DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15'], - dtype='datetime64[ns]', freq=None) + dtype='datetime64[s]', freq=None) - Timezone-aware inputs *with constant time offset* are converted to timezone-aware :class:`DatetimeIndex`: - >>> pd.to_datetime(['2018-10-26 12:00 -0500', '2018-10-26 13:00 -0500']) + >>> pd.to_datetime(["2018-10-26 12:00 -0500", "2018-10-26 13:00 -0500"]) DatetimeIndex(['2018-10-26 12:00:00-05:00', '2018-10-26 13:00:00-05:00'], - dtype='datetime64[ns, UTC-05:00]', freq=None) + dtype='datetime64[s, UTC-05:00]', freq=None) - However, timezone-aware inputs *with mixed time offsets* (for example issued from a timezone with daylight savings, such as Europe/Paris) are **not successfully converted** to a :class:`DatetimeIndex`. - Parsing datetimes with mixed time zones will show a warning unless - `utc=True`. If you specify `utc=False` the warning below will be shown - and a simple :class:`Index` containing :class:`datetime.datetime` - objects will be returned: - - >>> pd.to_datetime(['2020-10-25 02:00 +0200', - ... '2020-10-25 04:00 +0100']) # doctest: +SKIP - FutureWarning: In a future version of pandas, parsing datetimes with mixed - time zones will raise an error unless `utc=True`. Please specify `utc=True` - to opt in to the new behaviour and silence this warning. To create a `Series` - with mixed offsets and `object` dtype, please use `apply` and - `datetime.datetime.strptime`. - Index([2020-10-25 02:00:00+02:00, 2020-10-25 04:00:00+01:00], - dtype='object') - - - A mix of timezone-aware and timezone-naive inputs is also converted to - a simple :class:`Index` containing :class:`datetime.datetime` objects: + Parsing datetimes with mixed time zones will raise a ValueError unless + ``utc=True``: + + >>> pd.to_datetime( + ... ["2020-10-25 02:00 +0200", "2020-10-25 04:00 +0100"] + ... ) # doctest: +SKIP + ValueError: Mixed timezones detected. Pass utc=True in to_datetime + or tz='UTC' in DatetimeIndex to convert to a common timezone. 
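# The workaround spelled out in the surrounding docstring: pass utc=True so
# mixed-offset strings are converted to a single UTC-based DatetimeIndex
# instead of raising. Plain public-API usage, runnable as-is.
import pandas as pd

idx = pd.to_datetime(
    ["2020-10-25 02:00 +0200", "2020-10-25 04:00 +0100"], utc=True
)
print(idx)  # both entries expressed in UTC: 00:00 and 03:00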
+ + - To create a :class:`Series` with mixed offsets and ``object`` dtype, please use + :meth:`Series.apply` and :func:`datetime.datetime.strptime`: + + >>> import datetime as dt + >>> ser = pd.Series(["2020-10-25 02:00 +0200", "2020-10-25 04:00 +0100"]) + >>> ser.apply(lambda x: dt.datetime.strptime(x, "%Y-%m-%d %H:%M %z")) + 0 2020-10-25 02:00:00+02:00 + 1 2020-10-25 04:00:00+01:00 + dtype: object + + - A mix of timezone-aware and timezone-naive inputs will also raise a ValueError + unless ``utc=True``: >>> from datetime import datetime - >>> pd.to_datetime(["2020-01-01 01:00:00-01:00", - ... datetime(2020, 1, 1, 3, 0)]) # doctest: +SKIP - FutureWarning: In a future version of pandas, parsing datetimes with mixed - time zones will raise an error unless `utc=True`. Please specify `utc=True` - to opt in to the new behaviour and silence this warning. To create a `Series` - with mixed offsets and `object` dtype, please use `apply` and - `datetime.datetime.strptime`. - Index([2020-01-01 01:00:00-01:00, 2020-01-01 03:00:00], dtype='object') + >>> pd.to_datetime( + ... ["2020-01-01 01:00:00-01:00", datetime(2020, 1, 1, 3, 0)] + ... ) # doctest: +SKIP + ValueError: Mixed timezones detected. Pass utc=True in to_datetime + or tz='UTC' in DatetimeIndex to convert to a common timezone. | @@ -994,46 +968,26 @@ def to_datetime( - Timezone-naive inputs are *localized* as UTC - >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00'], utc=True) + >>> pd.to_datetime(["2018-10-26 12:00", "2018-10-26 13:00"], utc=True) DatetimeIndex(['2018-10-26 12:00:00+00:00', '2018-10-26 13:00:00+00:00'], - dtype='datetime64[ns, UTC]', freq=None) + dtype='datetime64[s, UTC]', freq=None) - Timezone-aware inputs are *converted* to UTC (the output represents the exact same datetime, but viewed from the UTC time offset `+00:00`). - >>> pd.to_datetime(['2018-10-26 12:00 -0530', '2018-10-26 12:00 -0500'], - ... utc=True) + >>> pd.to_datetime(["2018-10-26 12:00 -0530", "2018-10-26 12:00 -0500"], utc=True) DatetimeIndex(['2018-10-26 17:30:00+00:00', '2018-10-26 17:00:00+00:00'], - dtype='datetime64[ns, UTC]', freq=None) + dtype='datetime64[s, UTC]', freq=None) - Inputs can contain both string or datetime, the above rules still apply - >>> pd.to_datetime(['2018-10-26 12:00', datetime(2020, 1, 1, 18)], utc=True) + >>> pd.to_datetime(["2018-10-26 12:00", datetime(2020, 1, 1, 18)], utc=True) DatetimeIndex(['2018-10-26 12:00:00+00:00', '2020-01-01 18:00:00+00:00'], - dtype='datetime64[ns, UTC]', freq=None) + dtype='datetime64[us, UTC]', freq=None) """ if exact is not lib.no_default and format in {"mixed", "ISO8601"}: raise ValueError("Cannot use 'exact' when 'format' is 'mixed' or 'ISO8601'") - if infer_datetime_format is not lib.no_default: - warnings.warn( - "The argument 'infer_datetime_format' is deprecated and will " - "be removed in a future version. " - "A strict version of it is now the default, see " - "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. " - "You can safely remove this argument.", - stacklevel=find_stack_level(), - ) - if errors == "ignore": - # GH#54467 - warnings.warn( - "errors='ignore' is deprecated and will raise in a future version. 
" - "Use to_datetime without passing `errors` and catch exceptions " - "explicitly instead", - FutureWarning, - stacklevel=find_stack_level(), - ) - if arg is None: return None @@ -1047,9 +1001,8 @@ def to_datetime( dayfirst=dayfirst, yearfirst=yearfirst, errors=errors, - exact=exact, + exact=exact, # type: ignore[arg-type] ) - # pylint: disable-next=used-before-assignment result: Timestamp | NaTType | Series | Index if isinstance(arg, Timestamp): @@ -1134,7 +1087,9 @@ def to_datetime( } -def _assemble_from_unit_mappings(arg, errors: DateTimeErrorChoices, utc: bool): +def _assemble_from_unit_mappings( + arg, errors: DateTimeErrorChoices, utc: bool +) -> Series: """ assemble the unit specified fields from the arg (DataFrame) Return a Series for actual parsing @@ -1142,11 +1097,10 @@ def _assemble_from_unit_mappings(arg, errors: DateTimeErrorChoices, utc: bool): Parameters ---------- arg : DataFrame - errors : {'ignore', 'raise', 'coerce'}, default 'raise' + errors : {'raise', 'coerce'}, default 'raise' - If :const:`'raise'`, then invalid parsing will raise an exception - If :const:`'coerce'`, then invalid parsing will be set as :const:`NaT` - - If :const:`'ignore'`, then invalid parsing will return the input utc : bool Whether to convert/localize timestamps to UTC. @@ -1180,18 +1134,18 @@ def f(value): # we require at least Ymd required = ["year", "month", "day"] - req = sorted(set(required) - set(unit_rev.keys())) + req = set(required) - set(unit_rev.keys()) if len(req): - _required = ",".join(req) + _required = ",".join(sorted(req)) raise ValueError( "to assemble mappings requires at least that " f"[year, month, day] be specified: [{_required}] is missing" ) # keys we don't recognize - excess = sorted(set(unit_rev.keys()) - set(_unit_map.values())) + excess = set(unit_rev.keys()) - set(_unit_map.values()) if len(excess): - _excess = ",".join(excess) + _excess = ",".join(sorted(excess)) raise ValueError( f"extra keys have been passed to the datetime assemblage: [{_excess}]" ) @@ -1200,9 +1154,13 @@ def coerce(values): # we allow coercion to if errors allows values = to_numeric(values, errors=errors) + # prevent prevision issues in case of float32 # GH#60506 + if is_float_dtype(values.dtype): + values = values.astype("float64") + # prevent overflow in case of int8 or int16 if is_integer_dtype(values.dtype): - values = values.astype("int64", copy=False) + values = values.astype("int64") return values values = ( diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 09652a7d8bc92..bc45343d6e2d3 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -4,12 +4,17 @@ TYPE_CHECKING, Literal, ) -import warnings import numpy as np -from pandas._libs import lib -from pandas.util._exceptions import find_stack_level +from pandas._libs import ( + lib, + missing as libmissing, +) +from pandas._libs.tslibs import ( + Timedelta, + Timestamp, +) from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.cast import maybe_downcast_numeric @@ -66,14 +71,10 @@ def to_numeric( ---------- arg : scalar, list, tuple, 1-d array, or Series Argument to be converted. - errors : {'ignore', 'raise', 'coerce'}, default 'raise' + + errors : {'raise', 'coerce'}, default 'raise' - If 'raise', then invalid parsing will raise an exception. - If 'coerce', then invalid parsing will be set as NaN. - - If 'ignore', then invalid parsing will return the input. - - .. versionchanged:: 2.2 - - "ignore" is deprecated. Catch exceptions explicitly instead. 
downcast : str, default None Can be 'integer', 'signed', 'unsigned', or 'float'. @@ -95,14 +96,15 @@ def to_numeric( the dtype it is to be cast to, so if none of the dtypes checked satisfy that specification, no downcasting will be performed on the data. - dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable' + + dtype_backend : {'numpy_nullable', 'pyarrow'} Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). Behaviour is as follows: + (still experimental). If not specified, the default behavior + is to not use nullable data types. If specified, the behavior + is as follows: - * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. + * ``"numpy_nullable"``: returns nullable-dtype-backed object + * ``"pyarrow"``: returns with pyarrow-backed nullable object .. versionadded:: 2.0 @@ -124,24 +126,24 @@ def to_numeric( -------- Take separate series and convert to numeric, coercing when told to - >>> s = pd.Series(['1.0', '2', -3]) + >>> s = pd.Series(["1.0", "2", -3]) >>> pd.to_numeric(s) 0 1.0 1 2.0 2 -3.0 dtype: float64 - >>> pd.to_numeric(s, downcast='float') + >>> pd.to_numeric(s, downcast="float") 0 1.0 1 2.0 2 -3.0 dtype: float32 - >>> pd.to_numeric(s, downcast='signed') + >>> pd.to_numeric(s, downcast="signed") 0 1 1 2 2 -3 dtype: int8 - >>> s = pd.Series(['apple', '1.0', '2', -3]) - >>> pd.to_numeric(s, errors='coerce') + >>> s = pd.Series(["apple", "1.0", "2", -3]) + >>> pd.to_numeric(s, errors="coerce") 0 NaN 1 1.0 2 2.0 @@ -166,17 +168,8 @@ def to_numeric( if downcast not in (None, "integer", "signed", "unsigned", "float"): raise ValueError("invalid downcasting method provided") - if errors not in ("ignore", "raise", "coerce"): + if errors not in ("raise", "coerce"): raise ValueError("invalid error value specified") - if errors == "ignore": - # GH#54467 - warnings.warn( - "errors='ignore' is deprecated and will raise in a future version. 
" - "Use to_numeric without passing `errors` and catch exceptions " - "explicitly instead", - FutureWarning, - stacklevel=find_stack_level(), - ) check_dtype_backend(dtype_backend) @@ -200,6 +193,8 @@ def to_numeric( return float(arg) if is_number(arg): return arg + if isinstance(arg, (Timedelta, Timestamp)): + return arg._value is_scalars = True values = np.array([arg], dtype="O") elif getattr(arg, "ndim", 1) > 1: @@ -207,8 +202,6 @@ def to_numeric( else: values = arg - orig_values = values - # GH33013: for IntegerArray & FloatingArray extract non-null values for casting # save mask to reconstruct the full array after casting mask: npt.NDArray[np.bool_] | None = None @@ -227,30 +220,24 @@ def to_numeric( values = values.view(np.int64) else: values = ensure_object(values) - coerce_numeric = errors not in ("ignore", "raise") - try: - values, new_mask = lib.maybe_convert_numeric( # type: ignore[call-overload] - values, - set(), - coerce_numeric=coerce_numeric, - convert_to_masked_nullable=dtype_backend is not lib.no_default - or isinstance(values_dtype, StringDtype) - and not values_dtype.storage == "pyarrow_numpy", - ) - except (ValueError, TypeError): - if errors == "raise": - raise - values = orig_values + coerce_numeric = errors != "raise" + values, new_mask = lib.maybe_convert_numeric( # type: ignore[call-overload] + values, + set(), + coerce_numeric=coerce_numeric, + convert_to_masked_nullable=dtype_backend is not lib.no_default + or ( + isinstance(values_dtype, StringDtype) + and values_dtype.na_value is libmissing.NA + ), + ) if new_mask is not None: # Remove unnecessary values, is expected later anyway and enables # downcasting values = values[~new_mask] - elif ( - dtype_backend is not lib.no_default - and new_mask is None - or isinstance(values_dtype, StringDtype) - and not values_dtype.storage == "pyarrow_numpy" + elif (dtype_backend is not lib.no_default and new_mask is None) or ( + isinstance(values_dtype, StringDtype) and values_dtype.na_value is libmissing.NA ): new_mask = np.zeros(values.shape, dtype=np.bool_) diff --git a/pandas/core/tools/timedeltas.py b/pandas/core/tools/timedeltas.py index d772c908c4731..8d82a5c213910 100644 --- a/pandas/core/tools/timedeltas.py +++ b/pandas/core/tools/timedeltas.py @@ -1,13 +1,14 @@ """ timedelta support tools """ + from __future__ import annotations from typing import ( TYPE_CHECKING, + Any, overload, ) -import warnings import numpy as np @@ -21,7 +22,6 @@ disallow_ambiguous_unit, parse_timedelta_unit, ) -from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import is_list_like from pandas.core.dtypes.dtypes import ArrowDtype @@ -54,8 +54,7 @@ def to_timedelta( arg: str | float | timedelta, unit: UnitChoices | None = ..., errors: DateTimeErrorChoices = ..., -) -> Timedelta: - ... +) -> Timedelta: ... @overload @@ -63,8 +62,7 @@ def to_timedelta( arg: Series, unit: UnitChoices | None = ..., errors: DateTimeErrorChoices = ..., -) -> Series: - ... +) -> Series: ... @overload @@ -72,8 +70,7 @@ def to_timedelta( arg: list | tuple | range | ArrayLike | Index, unit: UnitChoices | None = ..., errors: DateTimeErrorChoices = ..., -) -> TimedeltaIndex: - ... +) -> TimedeltaIndex: ... def to_timedelta( @@ -89,7 +86,7 @@ def to_timedelta( | Series, unit: UnitChoices | None = None, errors: DateTimeErrorChoices = "raise", -) -> Timedelta | TimedeltaIndex | Series: +) -> Timedelta | TimedeltaIndex | Series | NaTType | Any: """ Convert argument to timedelta. 
@@ -115,23 +112,21 @@ def to_timedelta( * 'W' * 'D' / 'days' / 'day' * 'hours' / 'hour' / 'hr' / 'h' / 'H' - * 'm' / 'minute' / 'min' / 'minutes' / 'T' + * 'm' / 'minute' / 'min' / 'minutes' * 's' / 'seconds' / 'sec' / 'second' / 'S' - * 'ms' / 'milliseconds' / 'millisecond' / 'milli' / 'millis' / 'L' - * 'us' / 'microseconds' / 'microsecond' / 'micro' / 'micros' / 'U' - * 'ns' / 'nanoseconds' / 'nano' / 'nanos' / 'nanosecond' / 'N' + * 'ms' / 'milliseconds' / 'millisecond' / 'milli' / 'millis' + * 'us' / 'microseconds' / 'microsecond' / 'micro' / 'micros' + * 'ns' / 'nanoseconds' / 'nano' / 'nanos' / 'nanosecond' Must not be specified when `arg` contains strings and ``errors="raise"``. .. deprecated:: 2.2.0 - Units 'H', 'T', 'S', 'L', 'U' and 'N' are deprecated and will be removed - in a future version. Please use 'h', 'min', 's', 'ms', 'us', and 'ns' - instead of 'H', 'T', 'S', 'L', 'U' and 'N'. + Units 'H' and 'S' are deprecated and will be removed + in a future version. Please use 'h' and 's' instead. - errors : {'ignore', 'raise', 'coerce'}, default 'raise' + errors : {'raise', 'coerce'}, default 'raise' - If 'raise', then invalid parsing will raise an exception. - If 'coerce', then invalid parsing will be set as NaT. - - If 'ignore', then invalid parsing will return the input. Returns -------
@@ -158,24 +153,24 @@ -------- Parsing a single string to a Timedelta: - >>> pd.to_timedelta('1 days 06:05:01.00003') + >>> pd.to_timedelta("1 days 06:05:01.00003") Timedelta('1 days 06:05:01.000030') - >>> pd.to_timedelta('15.5us') + >>> pd.to_timedelta("15.5us") Timedelta('0 days 00:00:00.000015500') Parsing a list or array of strings: - >>> pd.to_timedelta(['1 days 06:05:01.00003', '15.5us', 'nan']) + >>> pd.to_timedelta(["1 days 06:05:01.00003", "15.5us", "nan"]) TimedeltaIndex(['1 days 06:05:01.000030', '0 days 00:00:00.000015500', NaT], dtype='timedelta64[ns]', freq=None) Converting numbers by specifying the `unit` keyword argument: - >>> pd.to_timedelta(np.arange(5), unit='s') + >>> pd.to_timedelta(np.arange(5), unit="s") TimedeltaIndex(['0 days 00:00:00', '0 days 00:00:01', '0 days 00:00:02', '0 days 00:00:03', '0 days 00:00:04'], dtype='timedelta64[ns]', freq=None) - >>> pd.to_timedelta(np.arange(5), unit='d') + >>> pd.to_timedelta(np.arange(5), unit="D") TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], dtype='timedelta64[ns]', freq=None) """
@@ -183,17 +178,8 @@ def to_timedelta( unit = parse_timedelta_unit(unit) disallow_ambiguous_unit(unit) - if errors not in ("ignore", "raise", "coerce"): - raise ValueError("errors must be one of 'ignore', 'raise', or 'coerce'.") - if errors == "ignore": - # GH#54467 - warnings.warn( - "errors='ignore' is deprecated and will raise in a future version.
" - "Use to_timedelta without passing `errors` and catch exceptions " - "explicitly instead", - FutureWarning, - stacklevel=find_stack_level(), - ) + if errors not in ("raise", "coerce"): + raise ValueError("errors must be one of 'raise', or 'coerce'.") if arg is None: return arg @@ -225,7 +211,7 @@ def to_timedelta( def _coerce_scalar_to_timedelta_type( r, unit: UnitChoices | None = "ns", errors: DateTimeErrorChoices = "raise" -): +) -> Timedelta | NaTType: """Convert string 'r' to a timedelta object.""" result: Timedelta | NaTType @@ -234,9 +220,6 @@ def _coerce_scalar_to_timedelta_type( except ValueError: if errors == "raise": raise - if errors == "ignore": - return r - # coerce result = NaT @@ -252,30 +235,11 @@ def _convert_listlike( """Convert a list of objects to a timedelta index object.""" arg_dtype = getattr(arg, "dtype", None) if isinstance(arg, (list, tuple)) or arg_dtype is None: - # This is needed only to ensure that in the case where we end up - # returning arg (errors == "ignore"), and where the input is a - # generator, we return a useful list-like instead of a - # used-up generator - if not hasattr(arg, "__array__"): - arg = list(arg) arg = np.array(arg, dtype=object) elif isinstance(arg_dtype, ArrowDtype) and arg_dtype.kind == "m": return arg - try: - td64arr = sequence_to_td64ns(arg, unit=unit, errors=errors, copy=False)[0] - except ValueError: - if errors == "ignore": - return arg - else: - # This else-block accounts for the cases when errors='raise' - # and errors='coerce'. If errors == 'raise', these errors - # should be raised. If errors == 'coerce', we shouldn't - # expect any errors to be raised, since all parsing errors - # cause coercion to pd.NaT. However, if an error / bug is - # introduced that causes an Exception to be raised, we would - # like to surface it. - raise + td64arr = sequence_to_td64ns(arg, unit=unit, errors=errors, copy=False)[0] from pandas import TimedeltaIndex diff --git a/pandas/core/tools/times.py b/pandas/core/tools/times.py index d77bcc91df709..f6a610ebc36ab 100644 --- a/pandas/core/tools/times.py +++ b/pandas/core/tools/times.py @@ -5,12 +5,10 @@ time, ) from typing import TYPE_CHECKING -import warnings import numpy as np from pandas._libs.lib import is_list_like -from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.generic import ( ABCIndex, @@ -45,24 +43,16 @@ def to_time( infer_time_format: bool, default False Infer the time format based on the first non-NaN element. If all strings are in the same format, this will speed up conversion. - errors : {'ignore', 'raise', 'coerce'}, default 'raise' + errors : {'raise', 'coerce'}, default 'raise' - If 'raise', then invalid parsing will raise an exception - If 'coerce', then invalid parsing will be set as None - - If 'ignore', then invalid parsing will return the input Returns ------- datetime.time """ - if errors == "ignore": - # GH#54467 - warnings.warn( - "errors='ignore' is deprecated and will raise in a future version. 
" - "Use to_time without passing `errors` and catch exceptions " - "explicitly instead", - FutureWarning, - stacklevel=find_stack_level(), - ) + if errors not in ("raise", "coerce"): + raise ValueError("errors must be one of 'raise', or 'coerce'.") def _convert_listlike(arg, format): if isinstance(arg, (list, tuple)): @@ -90,10 +80,7 @@ def _convert_listlike(arg, format): f"format {format}" ) raise ValueError(msg) from err - if errors == "ignore": - return arg - else: - times.append(None) + times.append(None) else: formats = _time_formats[:] format_found = False @@ -118,8 +105,6 @@ def _convert_listlike(arg, format): times.append(time_object) elif errors == "raise": raise ValueError(f"Cannot convert arg {arg} to a time") - elif errors == "ignore": - return arg else: times.append(None) diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 4933de3212581..e120e69dc27cf 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -1,6 +1,7 @@ """ data hash pandas / numpy objects """ + from __future__ import annotations import itertools @@ -105,7 +106,8 @@ def hash_pandas_object( Returns ------- - Series of uint64, same length as the object + Series of uint64 + Same length as the object. Examples -------- @@ -242,6 +244,7 @@ def hash_array( Parameters ---------- vals : ndarray or ExtensionArray + The input array to hash. encoding : str, default 'utf8' Encoding for data & key when strings. hash_key : str, default _default_hash_key @@ -255,6 +258,11 @@ def hash_array( ndarray[np.uint64, ndim=1] Hashed values, same length as the vals. + See Also + -------- + util.hash_pandas_object : Return a data hash of the Index/Series/DataFrame. + util.hash_tuples : Hash an MultiIndex / listlike-of-tuples efficiently. + Examples -------- >>> pd.util.hash_array(np.array([1, 2, 3])) diff --git a/pandas/core/util/numba_.py b/pandas/core/util/numba_.py index b8d489179338b..d3f00c08e0e2c 100644 --- a/pandas/core/util/numba_.py +++ b/pandas/core/util/numba_.py @@ -1,14 +1,19 @@ """Common utilities for Numba operations""" + from __future__ import annotations -from typing import ( - TYPE_CHECKING, - Callable, -) +import inspect +import types +from typing import TYPE_CHECKING + +import numpy as np from pandas.compat._optional import import_optional_dependency from pandas.errors import NumbaUtilError +if TYPE_CHECKING: + from collections.abc import Callable + GLOBAL_USE_NUMBA: bool = False @@ -24,9 +29,7 @@ def set_use_numba(enable: bool = False) -> None: GLOBAL_USE_NUMBA = enable -def get_jit_arguments( - engine_kwargs: dict[str, bool] | None = None, kwargs: dict | None = None -) -> dict[str, bool]: +def get_jit_arguments(engine_kwargs: dict[str, bool] | None = None) -> dict[str, bool]: """ Return arguments to pass to numba.JIT, falling back on pandas default JIT settings. 
@@ -34,8 +37,6 @@ def get_jit_arguments( ---------- engine_kwargs : dict, default None user passed keyword arguments for numba.JIT - kwargs : dict, default None - user passed keyword arguments to pass into the JITed function Returns ------- @@ -50,11 +51,6 @@ def get_jit_arguments( engine_kwargs = {} nopython = engine_kwargs.get("nopython", True) - if kwargs and nopython: - raise NumbaUtilError( - "numba does not support kwargs with nopython=True: " - "https://github.com/numba/numba/issues/2916" - ) nogil = engine_kwargs.get("nogil", False) parallel = engine_kwargs.get("parallel", False) return {"nopython": nopython, "nogil": nogil, "parallel": parallel} @@ -83,7 +79,72 @@ def jit_user_function(func: Callable) -> Callable: if numba.extending.is_jitted(func): # Don't jit a user passed jitted function numba_func = func + elif getattr(np, func.__name__, False) is func or isinstance( + func, types.BuiltinFunctionType + ): + # Not necessary to jit builtins or np functions + # This will mess up register_jitable + numba_func = func else: numba_func = numba.extending.register_jitable(func) return numba_func + + +_sentinel = object() + + +def prepare_function_arguments( + func: Callable, args: tuple, kwargs: dict, *, num_required_args: int +) -> tuple[tuple, dict]: + """ + Prepare arguments for jitted function. As numba functions do not support kwargs, + we try to move kwargs into args if possible. + + Parameters + ---------- + func : function + User defined function + args : tuple + User input positional arguments + kwargs : dict + User input keyword arguments + num_required_args : int + The number of leading positional arguments we will pass to udf. + These are not supplied by the user. + e.g. for groupby we require "values", "index" as the first two arguments: + `numba_func(group, group_index, *args)`, in this case num_required_args=2. + See :func:`pandas.core.groupby.numba_.generate_numba_agg_func` + + Returns + ------- + tuple[tuple, dict] + args, kwargs + + """ + if not kwargs: + return args, kwargs + + # the udf should have this pattern: def udf(arg1, arg2, ..., *args, **kwargs):... + signature = inspect.signature(func) + arguments = signature.bind(*[_sentinel] * num_required_args, *args, **kwargs) + arguments.apply_defaults() + # Ref: https://peps.python.org/pep-0362/ + # Arguments which could be passed as part of either *args or **kwargs + # will be included only in the BoundArguments.args attribute. + args = arguments.args + kwargs = arguments.kwargs + + if kwargs: + # Note: in case numba supports keyword-only arguments in + # a future version, we should remove this check. But this + # seems unlikely to happen soon. 
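Before the ``raise`` below fires, ``Signature.bind`` has already folded every foldable keyword argument into the positional tuple. A minimal standalone sketch of that trick, not the pandas implementation (the helper and UDF names are illustrative):

import inspect

_SENTINEL = object()

def move_kwargs_to_args(func, args, kwargs, num_required_args):
    # Bind sentinels for the leading arguments the caller supplies itself,
    # then let Signature.bind place every user kwarg positionally.
    bound = inspect.signature(func).bind(
        *[_SENTINEL] * num_required_args, *args, **kwargs
    )
    bound.apply_defaults()
    if bound.kwargs:  # whatever remains here is keyword-only
        raise TypeError("keyword-only arguments are not supported")
    return bound.args[num_required_args:], bound.kwargs

def udf(values, index, power, scale=1.0):  # illustrative UDF
    return (values ** power) * scale

print(move_kwargs_to_args(udf, (), {"power": 2}, num_required_args=2))
# ((2, 1.0), {})

Only keyword-only parameters survive in ``BoundArguments.kwargs``; that is what the check above detects and the ``raise`` below reports.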
+ + raise NumbaUtilError( + "numba does not support keyword-only arguments: " + "https://github.com/numba/numba/issues/2916, " + "https://github.com/numba/numba/issues/6846" + ) + + args = args[num_required_args:] + return args, kwargs
diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index fc8eddca09c84..004a3555f0212 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py
@@ -1,4 +1,5 @@ """Common utility functions for rolling operations""" + from __future__ import annotations from collections import defaultdict
diff --git a/pandas/core/window/doc.py b/pandas/core/window/doc.py index 2a5cbc04921fa..6dbc52a99e70c 100644 --- a/pandas/core/window/doc.py +++ b/pandas/core/window/doc.py
@@ -1,4 +1,5 @@ """Any shareable docstring components for rolling/expanding/ewm""" + from __future__ import annotations from textwrap import dedent
@@ -24,10 +25,10 @@ def create_section_header(header: str) -> str: template_see_also = dedent( """ - pandas.Series.{window_method} : Calling {window_method} with Series data. - pandas.DataFrame.{window_method} : Calling {window_method} with DataFrames. - pandas.Series.{agg_method} : Aggregating {agg_method} for Series. - pandas.DataFrame.{agg_method} : Aggregating {agg_method} for DataFrame.\n + Series.{window_method} : Calling {window_method} with Series data. + DataFrame.{window_method} : Calling {window_method} with DataFrames. + Series.{agg_method} : Aggregating {agg_method} for Series. + DataFrame.{agg_method} : Aggregating {agg_method} for DataFrame.\n """ ).replace("\n", "", 1)
@@ -84,6 +85,63 @@ def create_section_header(header: str) -> str: """ ).replace("\n", "", 1)
+template_pipe = """ +Apply a ``func`` with arguments to this %(klass)s object and return its result. + +Use `.pipe` when you want to improve readability by chaining together +functions that expect Series, DataFrames, GroupBy, Rolling, Expanding or Resampler +objects. +Instead of writing + +>>> h = lambda x, arg2, arg3: x + 1 - arg2 * arg3 +>>> g = lambda x, arg1: x * 5 / arg1 +>>> f = lambda x: x ** 4 +>>> df = pd.DataFrame({'A': [1, 2, 3, 4]}, index=pd.date_range('2012-08-02', periods=4)) +>>> h(g(f(df.rolling('2D')), arg1=1), arg2=2, arg3=3) # doctest: +SKIP + +You can write + +>>> (df.rolling('2D') +... .pipe(f) +... .pipe(g, arg1=1) +... .pipe(h, arg2=2, arg3=3)) # doctest: +SKIP + +which is much more readable. + +Parameters +---------- +func : callable or tuple of (callable, str) + Function to apply to this %(klass)s object or, alternatively, + a `(callable, data_keyword)` tuple where `data_keyword` is a + string indicating the keyword of `callable` that expects the + %(klass)s object. +*args : iterable, optional + Positional arguments passed into `func`. +**kwargs : dict, optional + A dictionary of keyword arguments passed into `func`. + +Returns +------- +object + The result of applying `func` to this %(klass)s object. + +See Also +-------- +Series.pipe : Apply a function with arguments to a series. +DataFrame.pipe : Apply a function with arguments to a dataframe. +apply : Apply function to each group instead of to the + full %(klass)s object.
+ +Notes +----- +See more `here +`_ + +Examples +-------- +%(examples)s +""" + numba_notes = ( "See :ref:`window.numba_engine` and :ref:`enhancingperf.numba` for " "extended documentation and performance considerations for the Numba engine.\n\n" diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 9ebf32d3e536e..73e4de6ea6208 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -56,7 +56,6 @@ if TYPE_CHECKING: from pandas._typing import ( - Axis, TimedeltaConvertibleTypes, npt, ) @@ -135,8 +134,10 @@ class ExponentialMovingWindow(BaseWindow): Provide exponentially weighted (EW) calculations. Exactly one of ``com``, ``span``, ``halflife``, or ``alpha`` must be - provided if ``times`` is not provided. If ``times`` is provided, + provided if ``times`` is not provided. If ``times`` is provided and ``adjust=True``, ``halflife`` and one of ``com``, ``span`` or ``alpha`` may be provided. + If ``times`` is provided and ``adjust=False``, ``halflife`` must be the only + provided decay-specification parameter. Parameters ---------- @@ -204,13 +205,6 @@ class ExponentialMovingWindow(BaseWindow): [:math:`x_0`, None, :math:`x_2`] are :math:`1-\alpha` and :math:`1` if ``adjust=True``, and :math:`1-\alpha` and :math:`\alpha` if ``adjust=False``. - axis : {0, 1}, default 0 - If ``0`` or ``'index'``, calculate across the rows. - - If ``1`` or ``'columns'``, calculate across the columns. - - For `Series` this parameter is unused and defaults to 0. - times : np.ndarray, Series, default None Only applicable to ``mean()``. @@ -234,6 +228,8 @@ class ExponentialMovingWindow(BaseWindow): Returns ------- pandas.api.typing.ExponentialMovingWindow + An instance of ExponentialMovingWindow for further exponentially weighted (EW) + calculations, e.g. using the ``mean`` method. See Also -------- @@ -328,7 +324,6 @@ class ExponentialMovingWindow(BaseWindow): "min_periods", "adjust", "ignore_na", - "axis", "times", "method", ] @@ -343,7 +338,6 @@ def __init__( min_periods: int | None = 0, adjust: bool = True, ignore_na: bool = False, - axis: Axis = 0, times: np.ndarray | NDFrame | None = None, method: str = "single", *, @@ -356,7 +350,6 @@ def __init__( center=False, closed=None, method=method, - axis=axis, selection=selection, ) self.com = com @@ -367,8 +360,6 @@ def __init__( self.ignore_na = ignore_na self.times = times if self.times is not None: - if not self.adjust: - raise NotImplementedError("times is not supported with adjust=False.") times_dtype = getattr(self.times, "dtype", None) if not ( is_datetime64_dtype(times_dtype) @@ -385,6 +376,11 @@ def __init__( # Halflife is no longer applicable when calculating COM # But allow COM to still be calculated if the user passes other decay args if common.count_not_none(self.com, self.span, self.alpha) > 0: + if not self.adjust: + raise NotImplementedError( + "None of com, span, or alpha can be specified if " + "times is provided and adjust=False" + ) self._com = get_center_of_mass(self.com, self.span, None, self.alpha) else: self._com = 1.0 @@ -397,9 +393,7 @@ def __init__( "times is not None." 
) # Without times, points are equally spaced - self._deltas = np.ones( - max(self.obj.shape[self.axis] - 1, 0), dtype=np.float64 - ) + self._deltas = np.ones(max(self.obj.shape[0] - 1, 0), dtype=np.float64) self._com = get_center_of_mass( # error: Argument 3 to "get_center_of_mass" has incompatible type # "Union[float, Any, None, timedelta64, signedinteger[_64Bit]]"; @@ -460,7 +454,6 @@ def online( min_periods=self.min_periods, adjust=self.adjust, ignore_na=self.ignore_na, - axis=self.axis, times=self.times, engine=engine, engine_kwargs=engine_kwargs, @@ -497,7 +490,7 @@ def online( klass="Series/Dataframe", axis="", ) - def aggregate(self, func, *args, **kwargs): + def aggregate(self, func=None, *args, **kwargs): return super().aggregate(func, *args, **kwargs) agg = aggregate @@ -601,6 +594,8 @@ def sum( ): if not self.adjust: raise NotImplementedError("sum is not implemented with adjust=False") + if self.times is not None: + raise NotImplementedError("sum is not implemented with times") if maybe_use_numba(engine): if self.method == "single": func = generate_numba_ewm_func @@ -672,6 +667,8 @@ def std(self, bias: bool = False, numeric_only: bool = False): raise NotImplementedError( f"{type(self).__name__}.std does not implement numeric_only" ) + if self.times is not None: + raise NotImplementedError("std is not implemented with times") return zsqrt(self.var(bias=bias, numeric_only=numeric_only)) @doc( @@ -705,6 +702,8 @@ def std(self, bias: bool = False, numeric_only: bool = False): agg_method="var", ) def var(self, bias: bool = False, numeric_only: bool = False): + if self.times is not None: + raise NotImplementedError("var is not implemented with times") window_func = window_aggregations.ewmcov wfunc = partial( window_func, @@ -767,6 +766,9 @@ def cov( bias: bool = False, numeric_only: bool = False, ): + if self.times is not None: + raise NotImplementedError("cov is not implemented with times") + from pandas import Series self._validate_numeric_only("cov", numeric_only) @@ -851,6 +853,9 @@ def corr( pairwise: bool | None = None, numeric_only: bool = False, ): + if self.times is not None: + raise NotImplementedError("corr is not implemented with times") + from pandas import Series self._validate_numeric_only("corr", numeric_only) @@ -941,7 +946,6 @@ def __init__( min_periods: int | None = 0, adjust: bool = True, ignore_na: bool = False, - axis: Axis = 0, times: np.ndarray | NDFrame | None = None, engine: str = "numba", engine_kwargs: dict[str, bool] | None = None, @@ -961,13 +965,10 @@ def __init__( min_periods=min_periods, adjust=adjust, ignore_na=ignore_na, - axis=axis, times=times, selection=selection, ) - self._mean = EWMMeanState( - self._com, self.adjust, self.ignore_na, self.axis, obj.shape - ) + self._mean = EWMMeanState(self._com, self.adjust, self.ignore_na, obj.shape) if maybe_use_numba(engine): self.engine = engine self.engine_kwargs = engine_kwargs @@ -980,7 +981,7 @@ def reset(self) -> None: """ self._mean.reset() - def aggregate(self, func, *args, **kwargs): + def aggregate(self, func=None, *args, **kwargs): raise NotImplementedError("aggregate is not implemented.") def std(self, bias: bool = False, *args, **kwargs): @@ -1055,7 +1056,7 @@ def mean(self, *args, update=None, update_times=None, **kwargs): if update_times is not None: raise NotImplementedError("update_times is not implemented.") update_deltas = np.ones( - max(self._selected_obj.shape[self.axis - 1] - 1, 0), dtype=np.float64 + max(self._selected_obj.shape[-1] - 1, 0), dtype=np.float64 ) if update is not None: if 
self._mean.last_ewm is None: @@ -1078,7 +1079,7 @@ def mean(self, *args, update=None, update_times=None, **kwargs): result_kwargs["columns"] = self._selected_obj.columns else: result_kwargs["name"] = self._selected_obj.name - np_array = self._selected_obj.astype(np.float64, copy=False).to_numpy() + np_array = self._selected_obj.astype(np.float64).to_numpy() ewma_func = generate_online_numba_ewma_func( **get_jit_arguments(self.engine_kwargs) ) diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index aac10596ffc69..bff3485c9cb86 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -4,12 +4,14 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, + final, + overload, ) from pandas.util._decorators import ( - deprecate_kwarg, + Appender, + Substitution, doc, ) @@ -24,6 +26,7 @@ kwargs_numeric_only, numba_notes, template_header, + template_pipe, template_returns, template_see_also, window_agg_numba_parameters, @@ -35,9 +38,14 @@ ) if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import ( - Axis, + Concatenate, + P, QuantileInterpolation, + Self, + T, WindowingRankType, ) @@ -52,19 +60,15 @@ class Expanding(RollingAndExpandingMixin): """ Provide expanding window calculations. + An expanding window yields the value of an aggregation statistic with all the data + available up to that point in time. + Parameters ---------- min_periods : int, default 1 Minimum number of observations in window required to have a value; otherwise, result is ``np.nan``. - axis : int or str, default 0 - If ``0`` or ``'index'``, roll across the rows. - - If ``1`` or ``'columns'``, roll across the columns. - - For `Series` this parameter is unused and defaults to 0. - method : str {'single', 'table'}, default 'single' Execute the rolling operation per single column or row (``'single'``) or over the entire object (``'table'``). @@ -77,6 +81,8 @@ class Expanding(RollingAndExpandingMixin): Returns ------- pandas.api.typing.Expanding + An instance of Expanding for further expanding window calculations, + e.g. using the ``sum`` method. See Also -------- @@ -119,20 +125,18 @@ class Expanding(RollingAndExpandingMixin): 4 7.0 """ - _attributes: list[str] = ["min_periods", "axis", "method"] + _attributes: list[str] = ["min_periods", "method"] def __init__( self, obj: NDFrame, min_periods: int = 1, - axis: Axis = 0, method: str = "single", selection=None, ) -> None: super().__init__( obj=obj, min_periods=min_periods, - axis=axis, method=method, selection=selection, ) @@ -149,8 +153,8 @@ def _get_window_indexer(self) -> BaseIndexer: """ See Also -------- - pandas.DataFrame.aggregate : Similar DataFrame method. - pandas.Series.aggregate : Similar Series method. + DataFrame.aggregate : Similar DataFrame method. + Series.aggregate : Similar Series method. """ ), examples=dedent( @@ -174,13 +178,15 @@ def _get_window_indexer(self) -> BaseIndexer: klass="Series/Dataframe", axis="", ) - def aggregate(self, func, *args, **kwargs): + def aggregate(self, func=None, *args, **kwargs): return super().aggregate(func, *args, **kwargs) agg = aggregate @doc( template_header, + create_section_header("Parameters"), + kwargs_numeric_only, create_section_header("Returns"), template_returns, create_section_header("See Also"), @@ -246,6 +252,54 @@ def apply( kwargs=kwargs, ) + @overload + def pipe( + self, + func: Callable[Concatenate[Self, P], T], + *args: P.args, + **kwargs: P.kwargs, + ) -> T: ... 
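The two ``overload`` stubs here cover both calling conventions: a plain callable, or a ``(callable, data_keyword)`` tuple that routes the window object to a named keyword. A small sketch of the tuple form, assuming the shared ``common.pipe`` semantics (the helper name and data are illustrative):

import pandas as pd

def spread(cap, win):  # "win" receives the window object via the tuple form
    return (win.max() - win.min()).clip(upper=cap)

s = pd.Series([1.0, 5.0, 2.0, 8.0])
print(s.expanding().pipe((spread, "win"), cap=3.0).tolist())
# [0.0, 3.0, 3.0, 3.0]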
+ + @overload + def pipe( + self, + func: tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, + ) -> T: ... + + @final + @Substitution( + klass="Expanding", + examples=""" + >>> df = pd.DataFrame({'A': [1, 2, 3, 4]}, + ... index=pd.date_range('2012-08-02', periods=4)) + >>> df + A + 2012-08-02 1 + 2012-08-03 2 + 2012-08-04 3 + 2012-08-05 4 + + To get the difference between each expanding window's maximum and minimum + value in one pass, you can do + + >>> df.expanding().pipe(lambda x: x.max() - x.min()) + A + 2012-08-02 0.0 + 2012-08-03 1.0 + 2012-08-04 2.0 + 2012-08-05 3.0""", + ) + @Appender(template_pipe) + def pipe( + self, + func: Callable[Concatenate[Self, P], T] | tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, + ) -> T: + return super().pipe(func, *args, **kwargs) + @doc( template_header, create_section_header("Parameters"),
@@ -672,13 +726,85 @@ def kurt(self, numeric_only: bool = False): @doc( template_header, create_section_header("Parameters"), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + dedent( + """ + GroupBy.first : Similar method for GroupBy objects. + Expanding.last : Method to get the last element in each window.\n + """ + ).replace("\n", "", 1), + create_section_header("Examples"), dedent( """ - quantile : float + The example below will show an expanding calculation with a minimum of + three observations in the window. + + >>> s = pd.Series(range(5)) + >>> s.expanding(3).first() + 0 NaN + 1 NaN + 2 0.0 + 3 0.0 + 4 0.0 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="expanding", + aggregation_description="First (left-most) element of the window", + agg_method="first", + ) + def first(self, numeric_only: bool = False): + return super().first(numeric_only=numeric_only) + + @doc( + template_header, + create_section_header("Parameters"), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + dedent( + """ + GroupBy.last : Similar method for GroupBy objects. + Expanding.first : Method to get the first element in each window.\n + """ + ).replace("\n", "", 1), + create_section_header("Examples"), + dedent( + """ + The example below will show an expanding calculation with a minimum of + three observations in the window. + + >>> s = pd.Series(range(5)) + >>> s.expanding(3).last() + 0 NaN + 1 NaN + 2 2.0 + 3 3.0 + 4 4.0 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="expanding", + aggregation_description="Last (right-most) element of the window", + agg_method="last", + ) + def last(self, numeric_only: bool = False): + return super().last(numeric_only=numeric_only) + + @doc( + template_header, + create_section_header("Parameters"), + dedent( + """ + q : float Quantile to compute. 0 <= quantile <= 1. .. deprecated:: 2.1.0 - This will be renamed to 'q' in a future version. + This was renamed from 'quantile' to 'q' in version 2.1.0. interpolation : {{'linear', 'lower', 'higher', 'midpoint', 'nearest'}} This optional parameter specifies the interpolation method to use, when the desired quantile lies between two data points `i` and `j`:
@@ -714,7 +840,6 @@ def kurt(self, numeric_only: bool = False): aggregation_description="quantile", agg_method="quantile", ) - @deprecate_kwarg(old_arg_name="quantile", new_arg_name="q") def quantile( self, q: float,
@@ -802,6 +927,41 @@ def rank( numeric_only=numeric_only, ) + @doc( + template_header, + ".. 
versionadded:: 3.0.0 \n\n", + create_section_header("Parameters"), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Examples"), + dedent( + """ + >>> s = pd.Series([1, 4, 2, 3, 5, 3]) + >>> s.expanding().nunique() + 0 1.0 + 1 2.0 + 2 3.0 + 3 4.0 + 4 5.0 + 5 5.0 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="expanding", + aggregation_description="nunique", + agg_method="nunique", + ) + def nunique( + self, + numeric_only: bool = False, + ): + return super().nunique( + numeric_only=numeric_only, + ) + @doc( template_header, create_section_header("Parameters"), @@ -873,6 +1033,9 @@ def cov( output will be a MultiIndexed DataFrame in the case of DataFrame inputs. In the case of missing elements, only complete pairwise observations will be used. + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements.\n """ ).replace("\n", "", 1), kwargs_numeric_only, diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py index 9357945e78c63..171d3bc1d1c35 100644 --- a/pandas/core/window/numba_.py +++ b/pandas/core/window/numba_.py @@ -4,7 +4,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, ) import numpy as np @@ -14,6 +13,8 @@ from pandas.core.util.numba_ import jit_user_function if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import Scalar @@ -148,6 +149,9 @@ def ewm( # note that len(deltas) = len(vals) - 1 and deltas[i] # is to be used in conjunction with vals[i+1] old_wt *= old_wt_factor ** deltas[start + j - 1] + if not adjust and com == 1: + # update in case of irregular-interval time series + new_wt = 1.0 - old_wt else: weighted = old_wt_factor * weighted if is_observation: @@ -186,8 +190,7 @@ def generate_numba_table_func( Generate a numba jitted function to apply window calculations table-wise. Func will be passed a M window size x N number of columns array, and - must return a 1 x N number of columns array. Func is intended to operate - row-wise, but the result will be transposed for axis=1. + must return a 1 x N number of columns array. 1. jit the user's function 2. 
Return a rolling apply function with the jitted function inline @@ -228,10 +231,10 @@ def roll_table( stop = end[i] window = values[start:stop] count_nan = np.sum(np.isnan(window), axis=0) - sub_result = numba_func(window, *args) nan_mask = len(window) - count_nan >= minimum_periods + if nan_mask.any(): + result[i, :] = numba_func(window, *args) min_periods_mask[i, :] = nan_mask - result[i, :] = sub_result result = np.where(min_periods_mask, result, np.nan) return result @@ -324,6 +327,9 @@ def ewm_table( # note that len(deltas) = len(vals) - 1 and deltas[i] # is to be used in conjunction with vals[i+1] old_wt[j] *= old_wt_factor ** deltas[i - 1] + if not adjust and com == 1: + # update in case of irregular-interval time series + new_wt = 1.0 - old_wt[j] else: weighted[j] = old_wt_factor * weighted[j] if is_observations[j]: diff --git a/pandas/core/window/online.py b/pandas/core/window/online.py index 29d1f740e021f..72236bf5ccea2 100644 --- a/pandas/core/window/online.py +++ b/pandas/core/window/online.py @@ -87,15 +87,14 @@ def online_ewma( class EWMMeanState: - def __init__(self, com, adjust, ignore_na, axis, shape) -> None: + def __init__(self, com, adjust, ignore_na, shape) -> None: alpha = 1.0 / (1.0 + com) - self.axis = axis self.shape = shape self.adjust = adjust self.ignore_na = ignore_na self.new_wt = 1.0 if adjust else alpha self.old_wt_factor = 1.0 - alpha - self.old_wt = np.ones(self.shape[self.axis - 1]) + self.old_wt = np.ones(self.shape[-1]) self.last_ewm = None def run_ewm(self, weighted_avg, deltas, min_periods, ewm_func): @@ -114,5 +113,5 @@ def run_ewm(self, weighted_avg, deltas, min_periods, ewm_func): return result def reset(self) -> None: - self.old_wt = np.ones(self.shape[self.axis - 1]) + self.old_wt = np.ones(self.shape[-1]) self.last_ewm = None diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index e78bd258c11ff..03534bbee4c58 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -2,6 +2,7 @@ Provide a generic structure to support window functions, similar to how we have a Groupby object. 
""" + from __future__ import annotations import copy @@ -12,9 +13,9 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, - cast, + final, + overload, ) import numpy as np @@ -28,7 +29,8 @@ from pandas.compat._optional import import_optional_dependency from pandas.errors import DataError from pandas.util._decorators import ( - deprecate_kwarg, + Appender, + Substitution, doc, ) @@ -39,6 +41,7 @@ is_numeric_dtype, needs_i8_conversion, ) +from pandas.core.dtypes.dtypes import ArrowDtype from pandas.core.dtypes.generic import ( ABCDataFrame, ABCSeries, @@ -47,7 +50,10 @@ from pandas.core._numba import executor from pandas.core.algorithms import factorize -from pandas.core.apply import ResamplerWindowApply +from pandas.core.apply import ( + ResamplerWindowApply, + reconstruct_func, +) from pandas.core.arrays import ExtensionArray from pandas.core.base import SelectionMixin import pandas.core.common as com @@ -68,6 +74,7 @@ from pandas.core.util.numba_ import ( get_jit_arguments, maybe_use_numba, + prepare_function_arguments, ) from pandas.core.window.common import ( flex_binary_moment, @@ -80,6 +87,7 @@ kwargs_scipy, numba_notes, template_header, + template_pipe, template_returns, template_see_also, window_agg_numba_parameters, @@ -92,6 +100,7 @@ ) if TYPE_CHECKING: + from collections.abc import Callable from collections.abc import ( Hashable, Iterator, @@ -100,10 +109,14 @@ from pandas._typing import ( ArrayLike, - Axis, + Concatenate, NDFrameT, QuantileInterpolation, + P, + Self, + T, WindowingRankType, + npt, ) from pandas import ( @@ -130,7 +143,6 @@ def __init__( min_periods: int | None = None, center: bool | None = False, win_type: str | None = None, - axis: Axis = 0, on: str | Index | None = None, closed: str | None = None, step: int | None = None, @@ -146,15 +158,10 @@ def __init__( self.min_periods = min_periods self.center = center self.win_type = win_type - self.axis = obj._get_axis_number(axis) if axis is not None else None self.method = method self._win_freq_i8: int | None = None if self.on is None: - if self.axis == 0: - self._on = self.obj.index - else: - # i.e. self.axis == 1 - self._on = self.obj.columns + self._on = self.obj.index elif isinstance(self.on, Index): self._on = self.on elif isinstance(self.obj, ABCDataFrame) and self.on in self.obj.columns: @@ -276,15 +283,9 @@ def _create_data(self, obj: NDFrameT, numeric_only: bool = False) -> NDFrameT: """ # filter out the on from the object if self.on is not None and not isinstance(self.on, Index) and obj.ndim == 2: - obj = obj.reindex(columns=obj.columns.difference([self.on]), copy=False) - if obj.ndim > 1 and (numeric_only or self.axis == 1): - # GH: 20649 in case of mixed dtype and axis=1 we have to convert everything - # to float to calculate the complete row at once. We exclude all non-numeric - # dtypes. + obj = obj.reindex(columns=obj.columns.difference([self.on], sort=False)) + if obj.ndim > 1 and numeric_only: obj = self._make_numeric_only(obj) - if self.axis == 1: - obj = obj.astype("float64", copy=False) - obj._mgr = obj._mgr.consolidate() return obj def _gotitem(self, key, ndim, subset=None): @@ -404,11 +405,12 @@ def _insert_on_column(self, result: DataFrame, obj: DataFrame) -> None: result[name] = extra_col @property - def _index_array(self): + def _index_array(self) -> npt.NDArray[np.int64] | None: # TODO: why do we get here with e.g. MultiIndex? 
- if needs_i8_conversion(self._on.dtype): - idx = cast("PeriodIndex | DatetimeIndex | TimedeltaIndex", self._on) - return idx.asi8 + if isinstance(self._on, (PeriodIndex, DatetimeIndex, TimedeltaIndex)): + return self._on.asi8 + elif isinstance(self._on.dtype, ArrowDtype) and self._on.dtype.kind in "mM": + return self._on.to_numpy(dtype=np.int64) return None def _resolve_output(self, out: DataFrame, obj: DataFrame) -> DataFrame: @@ -439,7 +441,7 @@ def _apply_series( self, homogeneous_func: Callable[..., ArrayLike], name: str | None = None ) -> Series: """ - Series version of _apply_blockwise + Series version of _apply_columnwise """ obj = self._create_data(self._selected_obj) @@ -455,7 +457,7 @@ def _apply_series( index = self._slice_axis_for_step(obj.index, result) return obj._constructor(result, index=index, name=obj.name) - def _apply_blockwise( + def _apply_columnwise( self, homogeneous_func: Callable[..., ArrayLike], name: str, @@ -475,9 +477,6 @@ def _apply_blockwise( obj = notna(obj).astype(int) obj._mgr = obj._mgr.consolidate() - if self.axis == 1: - obj = obj.T - taker = [] res_values = [] for i, arr in enumerate(obj._iter_column_arrays()): @@ -503,9 +502,6 @@ def _apply_blockwise( verify_integrity=False, ) - if self.axis == 1: - df = df.T - return self._resolve_output(df, obj) def _apply_tablewise( @@ -521,9 +517,7 @@ def _apply_tablewise( raise ValueError("method='table' not applicable for Series objects.") obj = self._create_data(self._selected_obj, numeric_only) values = self._prep_values(obj.to_numpy()) - values = values.T if self.axis == 1 else values result = homogeneous_func(values) - result = result.T if self.axis == 1 else result index = self._slice_axis_for_step(obj.index, result) columns = ( obj.columns @@ -614,7 +608,7 @@ def calc(x): return result if self.method == "single": - return self._apply_blockwise(homogeneous_func, name, numeric_only) + return self._apply_columnwise(homogeneous_func, name, numeric_only) else: return self._apply_tablewise(homogeneous_func, name, numeric_only) @@ -631,8 +625,6 @@ def _numba_apply( else window_indexer.window_size ) obj = self._create_data(self._selected_obj) - if self.axis == 1: - obj = obj.T values = self._prep_values(obj.to_numpy()) if values.ndim == 1: values = values.reshape(-1, 1) @@ -658,7 +650,6 @@ def _numba_apply( result = aggregator( values.T, start=start, end=end, min_periods=min_periods, **func_kwargs ).T - result = result.T if self.axis == 1 else result index = self._slice_axis_for_step(obj.index, result) if obj.ndim == 1: result = result.squeeze() @@ -669,8 +660,12 @@ def _numba_apply( out = obj._constructor(result, index=index, columns=columns) return self._resolve_output(out, obj) - def aggregate(self, func, *args, **kwargs): + def aggregate(self, func=None, *args, **kwargs): + relabeling, func, columns, order = reconstruct_func(func, **kwargs) result = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg() + if isinstance(result, ABCDataFrame) and relabeling: + result = result.iloc[:, order] + result.columns = columns # type: ignore[union-attr] if result is None: return self.apply(func, raw=False, args=args, kwargs=kwargs) return result @@ -844,7 +839,7 @@ def _apply_pairwise( result_levels = [idx_levels] result_names = [result.index.name] - # 3) Create the resulting index by combining 1) + 2) + # 3) Create the resulting index by combining 1) + 2) result_codes = groupby_codes + result_codes result_levels = groupby_levels + result_levels result_names = self._grouper.names + result_names @@ -886,16 
+881,16 @@ class Window(BaseWindow): Parameters ---------- window : int, timedelta, str, offset, or BaseIndexer subclass - Size of the moving window. + Interval of the moving window. - If an integer, the fixed number of observations used for - each window. + If an integer, the delta between the start and end of each window. + The number of points in the window depends on the ``closed`` argument. If a timedelta, str, or offset, the time period of each window. Each window will be a variable sized based on the observations included in the time-period. This is only valid for datetimelike indexes. - To learn more about the offsets & frequency strings, please see `this link - `__. + To learn more about the offsets & frequency strings, please see + :ref:`this link`. If a BaseIndexer subclass, the window boundaries based on the defined ``get_window_bounds`` method. Additional rolling
@@ -933,38 +928,33 @@ class Window(BaseWindow): Provided integer column is ignored and excluded from result since an integer index is not used to calculate the rolling window. - axis : int or str, default 0 - If ``0`` or ``'index'``, roll across the rows. - - If ``1`` or ``'columns'``, roll across the columns. - - For `Series` this parameter is unused and defaults to 0. + closed : str, default None + Determines the inclusivity of points in the window. - .. deprecated:: 2.1.0 + If ``'right'``, uses the window (first, last] meaning the last point + is included in the calculations. - The axis keyword is deprecated. For ``axis=1``, - transpose the DataFrame first instead. + If ``'left'``, uses the window [first, last) meaning the first point + is included in the calculations. - closed : str, default None - If ``'right'``, the first point in the window is excluded from calculations. + If ``'both'``, uses the window [first, last] meaning all points in + the window are included in the calculations. - If ``'left'``, the last point in the window is excluded from calculations. + If ``'neither'``, uses the window (first, last) meaning the first + and last points in the window are excluded from calculations. - If ``'both'``, the no points in the window are excluded from calculations. - If ``'neither'``, the first and last points in the window are excluded - from calculations. + () and [] refer to open and closed set + notation, respectively. Default ``None`` (``'right'``). step : int, default None - - .. versionadded:: 1.5.0 - Evaluate the window at every ``step`` result, equivalent to slicing as ``[::step]``. ``window`` must be an integer. Using a step argument other than None or 1 will produce a result with a different shape than the input. + .. versionadded:: 1.5.0 + method : str {'single', 'table'}, default 'single' .. versionadded:: 1.3.0
@@ -993,7 +983,7 @@ class Window(BaseWindow): Examples -------- - >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}) + >>> df = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]}) >>> df B 0 0.0
@@ -1016,12 +1006,16 @@ class Window(BaseWindow): Rolling sum with a window span of 2 seconds. - >>> df_time = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, - ... index=[pd.Timestamp('20130101 09:00:00'), - ... pd.Timestamp('20130101 09:00:02'), - ... pd.Timestamp('20130101 09:00:03'), - ... pd.Timestamp('20130101 09:00:05'), - ... pd.Timestamp('20130101 09:00:06')]) + >>> df_time = pd.DataFrame( + ... {"B": [0, 1, 2, np.nan, 4]}, + ... index=[ + ... pd.Timestamp("20130101 09:00:00"), + ... pd.Timestamp("20130101 09:00:02"), + ... pd.Timestamp("20130101 09:00:03"), + ... 
pd.Timestamp("20130101 09:00:05"), + ... pd.Timestamp("20130101 09:00:06"), + ... ], + ... ) >>> df_time B @@ -1031,7 +1025,7 @@ class Window(BaseWindow): 2013-01-01 09:00:05 NaN 2013-01-01 09:00:06 4.0 - >>> df_time.rolling('2s').sum() + >>> df_time.rolling("2s").sum() B 2013-01-01 09:00:00 0.0 2013-01-01 09:00:02 1.0 @@ -1099,7 +1093,7 @@ class Window(BaseWindow): Rolling sum with a window length of 2, using the Scipy ``'gaussian'`` window type. ``std`` is required in the aggregation function. - >>> df.rolling(2, win_type='gaussian').sum(std=3) + >>> df.rolling(2, win_type="gaussian").sum(std=3) B 0 NaN 1 0.986207 @@ -1111,12 +1105,17 @@ class Window(BaseWindow): Rolling sum with a window length of 2 days. - >>> df = pd.DataFrame({ - ... 'A': [pd.to_datetime('2020-01-01'), - ... pd.to_datetime('2020-01-01'), - ... pd.to_datetime('2020-01-02'),], - ... 'B': [1, 2, 3], }, - ... index=pd.date_range('2020', periods=3)) + >>> df = pd.DataFrame( + ... { + ... "A": [ + ... pd.to_datetime("2020-01-01"), + ... pd.to_datetime("2020-01-01"), + ... pd.to_datetime("2020-01-02"), + ... ], + ... "B": [1, 2, 3], + ... }, + ... index=pd.date_range("2020", periods=3), + ... ) >>> df A B @@ -1124,7 +1123,7 @@ class Window(BaseWindow): 2020-01-02 2020-01-01 2 2020-01-03 2020-01-02 3 - >>> df.rolling('2D', on='A').sum() + >>> df.rolling("2D", on="A").sum() A B 2020-01-01 2020-01-01 1.0 2020-01-02 2020-01-01 3.0 @@ -1136,14 +1135,13 @@ class Window(BaseWindow): "min_periods", "center", "win_type", - "axis", "on", "closed", "step", "method", ] - def _validate(self): + def _validate(self) -> None: super()._validate() if not isinstance(self.win_type, str): @@ -1215,7 +1213,7 @@ def homogeneous_func(values: np.ndarray): return values.copy() def calc(x): - additional_nans = np.array([np.nan] * offset) + additional_nans = np.full(offset, np.nan) x = np.concatenate((x, additional_nans)) return func( x, @@ -1232,7 +1230,9 @@ def calc(x): return result - return self._apply_blockwise(homogeneous_func, name, numeric_only)[:: self.step] + return self._apply_columnwise(homogeneous_func, name, numeric_only)[ + :: self.step + ] @doc( _shared_docs["aggregate"], @@ -1240,8 +1240,8 @@ def calc(x): """ See Also -------- - pandas.DataFrame.aggregate : Similar DataFrame method. - pandas.Series.aggregate : Similar Series method. + DataFrame.aggregate : Similar DataFrame method. + Series.aggregate : Similar Series method. """ ), examples=dedent( @@ -1265,7 +1265,7 @@ def calc(x): klass="Series/DataFrame", axis="", ) - def aggregate(self, func, *args, **kwargs): + def aggregate(self, func=None, *args, **kwargs): result = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg() if result is None: # these must apply directly @@ -1377,6 +1377,13 @@ def mean(self, numeric_only: bool = False, **kwargs): @doc( template_header, create_section_header("Parameters"), + dedent( + """ + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. + """ + ).replace("\n", "", 1), kwargs_numeric_only, kwargs_scipy, create_section_header("Returns"), @@ -1419,6 +1426,13 @@ def var(self, ddof: int = 1, numeric_only: bool = False, **kwargs): @doc( template_header, create_section_header("Parameters"), + dedent( + """ + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. 
+ """ + ).replace("\n", "", 1), kwargs_numeric_only, kwargs_scipy, create_section_header("Returns"), @@ -1485,14 +1499,16 @@ def apply( if maybe_use_numba(engine): if raw is False: raise ValueError("raw must be `True` when using the numba engine") - numba_args = args + numba_args, kwargs = prepare_function_arguments( + func, args, kwargs, num_required_args=1 + ) if self.method == "single": apply_func = generate_numba_apply_func( - func, **get_jit_arguments(engine_kwargs, kwargs) + func, **get_jit_arguments(engine_kwargs) ) else: apply_func = generate_numba_table_func( - func, **get_jit_arguments(engine_kwargs, kwargs) + func, **get_jit_arguments(engine_kwargs) ) elif engine in ("cython", None): if engine_kwargs is not None: @@ -1520,7 +1536,7 @@ def _generate_cython_apply_func( window_aggregations.roll_apply, args=args, kwargs=kwargs, - raw=raw, + raw=bool(raw), function=function, ) @@ -1532,6 +1548,30 @@ def apply_func(values, begin, end, min_periods, raw=raw): return apply_func + @overload + def pipe( + self, + func: Callable[Concatenate[Self, P], T], + *args: P.args, + **kwargs: P.kwargs, + ) -> T: ... + + @overload + def pipe( + self, + func: tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, + ) -> T: ... + + def pipe( + self, + func: Callable[Concatenate[Self, P], T] | tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, + ) -> T: + return com.pipe(self, func, *args, **kwargs) + def sum( self, numeric_only: bool = False, @@ -1708,6 +1748,22 @@ def kurt(self, numeric_only: bool = False): numeric_only=numeric_only, ) + def first(self, numeric_only: bool = False): + window_func = window_aggregations.roll_first + return self._apply( + window_func, + name="first", + numeric_only=numeric_only, + ) + + def last(self, numeric_only: bool = False): + window_func = window_aggregations.roll_last + return self._apply( + window_func, + name="last", + numeric_only=numeric_only, + ) + def quantile( self, q: float, @@ -1743,6 +1799,16 @@ def rank( return self._apply(window_func, name="rank", numeric_only=numeric_only) + def nunique( + self, + numeric_only: bool = False, + ): + window_func = partial( + window_aggregations.roll_nunique, + ) + + return self._apply(window_func, name="nunique", numeric_only=numeric_only) + def cov( self, other: DataFrame | Series | None = None, @@ -1854,20 +1920,20 @@ class Rolling(RollingAndExpandingMixin): "min_periods", "center", "win_type", - "axis", "on", "closed", "step", "method", ] - def _validate(self): + def _validate(self) -> None: super()._validate() # we allow rolling on a datetimelike index if ( self.obj.empty or isinstance(self._on, (DatetimeIndex, TimedeltaIndex, PeriodIndex)) + or (isinstance(self._on.dtype, ArrowDtype) and self._on.dtype.kind in "mM") ) and isinstance(self.window, (str, BaseOffset, timedelta)): self._validate_datetimelike_monotonic() @@ -1921,10 +1987,7 @@ def _validate_datetimelike_monotonic(self) -> None: def _raise_monotonic_error(self, msg: str): on = self.on if on is None: - if self.axis == 0: - on = "index" - else: - on = "column" + on = "index" raise ValueError(f"{on} {msg}") @doc( @@ -1933,8 +1996,8 @@ def _raise_monotonic_error(self, msg: str): """ See Also -------- - pandas.Series.rolling : Calling object with Series data. - pandas.DataFrame.rolling : Calling object with DataFrame data. + Series.rolling : Calling object with Series data. + DataFrame.rolling : Calling object with DataFrame data. 
""" ), examples=dedent( @@ -1964,7 +2027,7 @@ def _raise_monotonic_error(self, msg: str): klass="Series/Dataframe", axis="", ) - def aggregate(self, func, *args, **kwargs): + def aggregate(self, func=None, *args, **kwargs): return super().aggregate(func, *args, **kwargs) agg = aggregate @@ -2050,6 +2113,54 @@ def apply( kwargs=kwargs, ) + @overload + def pipe( + self, + func: Callable[Concatenate[Self, P], T], + *args: P.args, + **kwargs: P.kwargs, + ) -> T: ... + + @overload + def pipe( + self, + func: tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, + ) -> T: ... + + @final + @Substitution( + klass="Rolling", + examples=""" + >>> df = pd.DataFrame({'A': [1, 2, 3, 4]}, + ... index=pd.date_range('2012-08-02', periods=4)) + >>> df + A + 2012-08-02 1 + 2012-08-03 2 + 2012-08-04 3 + 2012-08-05 4 + + To get the difference between each rolling 2-day window's maximum and minimum + value in one pass, you can do + + >>> df.rolling('2D').pipe(lambda x: x.max() - x.min()) + A + 2012-08-02 0.0 + 2012-08-03 1.0 + 2012-08-04 1.0 + 2012-08-05 1.0""", + ) + @Appender(template_pipe) + def pipe( + self, + func: Callable[Concatenate[Self, P], T] | tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, + ) -> T: + return super().pipe(func, *args, **kwargs) + @doc( template_header, create_section_header("Parameters"), @@ -2129,7 +2240,19 @@ def sum( template_header, create_section_header("Parameters"), kwargs_numeric_only, + dedent( + """ + *args : iterable, optional + Positional arguments passed into ``func``.\n + """ + ).replace("\n", "", 1), window_agg_numba_parameters(), + dedent( + """ + **kwargs : mapping, optional + A dictionary of keyword arguments passed into ``func``.\n + """ + ).replace("\n", "", 1), create_section_header("Returns"), template_returns, create_section_header("See Also"), @@ -2533,16 +2656,88 @@ def sem(self, ddof: int = 1, numeric_only: bool = False): def kurt(self, numeric_only: bool = False): return super().kurt(numeric_only=numeric_only) + @doc( + template_header, + create_section_header("Parameters"), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + dedent( + """ + GroupBy.first : Similar method for GroupBy objects. + Rolling.last : Method to get the last element in each window.\n + """ + ).replace("\n", "", 1), + create_section_header("Examples"), + dedent( + """ + The example below will show a rolling calculation with a window size of + three. + + >>> s = pd.Series(range(5)) + >>> s.rolling(3).first() + 0 NaN + 1 NaN + 2 0.0 + 3 1.0 + 4 2.0 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="rolling", + aggregation_description="First (left-most) element of the window", + agg_method="first", + ) + def first(self, numeric_only: bool = False): + return super().first(numeric_only=numeric_only) + + @doc( + template_header, + create_section_header("Parameters"), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + dedent( + """ + GroupBy.last : Similar method for GroupBy objects. + Rolling.first : Method to get the first element in each window.\n + """ + ).replace("\n", "", 1), + create_section_header("Examples"), + dedent( + """ + The example below will show a rolling calculation with a window size of + three. 
+ + >>> s = pd.Series(range(5)) + >>> s.rolling(3).last() + 0 NaN + 1 NaN + 2 2.0 + 3 3.0 + 4 4.0 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="rolling", + aggregation_description="Last (right-most) element of the window", + agg_method="last", + ) + def last(self, numeric_only: bool = False): + return super().last(numeric_only=numeric_only) + @doc( template_header, create_section_header("Parameters"), dedent( """ - quantile : float + q : float Quantile to compute. 0 <= quantile <= 1. .. deprecated:: 2.1.0 - This will be renamed to 'q' in a future version. + This was renamed from 'quantile' to 'q' in version 2.1.0. interpolation : {{'linear', 'lower', 'higher', 'midpoint', 'nearest'}} This optional parameter specifies the interpolation method to use, when the desired quantile lies between two data points `i` and `j`: @@ -2583,7 +2778,6 @@ def kurt(self, numeric_only: bool = False): aggregation_description="quantile", agg_method="quantile", ) - @deprecate_kwarg(old_arg_name="quantile", new_arg_name="q") def quantile( self, q: float, @@ -2671,6 +2865,43 @@ def rank( numeric_only=numeric_only, ) + @doc( + template_header, + ".. versionadded:: 3.0.0 \n\n", + create_section_header("Parameters"), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Examples"), + dedent( + """ + >>> s = pd.Series([1, 4, 2, np.nan, 3, 3, 4, 5]) + >>> s.rolling(3).nunique() + 0 NaN + 1 NaN + 2 3.0 + 3 NaN + 4 NaN + 5 NaN + 6 2.0 + 7 3.0 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="rolling", + aggregation_description="nunique", + agg_method="nunique", + ) + def nunique( + self, + numeric_only: bool = False, + ): + return super().nunique( + numeric_only=numeric_only, + ) + @doc( template_header, create_section_header("Parameters"), @@ -2906,7 +3137,7 @@ def _get_window_indexer(self) -> GroupbyIndexer: ) return window_indexer - def _validate_datetimelike_monotonic(self): + def _validate_datetimelike_monotonic(self) -> None: """ Validate that each group in self._on is monotonic """ diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 01094ba36b9dd..2b5bc450e41d6 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -1,6 +1,7 @@ """ Expose public exceptions & warnings """ + from __future__ import annotations import ctypes @@ -19,6 +20,16 @@ class IntCastingNaNError(ValueError): """ Exception raised when converting (``astype``) an array with NaN to an integer type. + This error occurs when attempting to cast a data structure containing non-finite + values (such as NaN or infinity) to an integer data type. Integer types do not + support non-finite values, so such conversions are explicitly disallowed to + prevent silent data corruption or unexpected behavior. + + See Also + -------- + DataFrame.astype : Method to cast a pandas DataFrame object to a specified dtype. + Series.astype : Method to cast a pandas Series object to a specified dtype. + Examples -------- >>> pd.DataFrame(np.array([[1, np.nan], [2, 3]]), dtype="i8") @@ -34,6 +45,11 @@ class NullFrequencyError(ValueError): Particularly ``DatetimeIndex.shift``, ``TimedeltaIndex.shift``, ``PeriodIndex.shift``. + See Also + -------- + Index.shift : Shift values of Index. + Series.shift : Shift values of Series. 
+ Examples -------- >>> df = pd.DatetimeIndex(["2011-01-01 10:00", "2011-01-01"], freq=None) @@ -47,11 +63,17 @@ class PerformanceWarning(Warning): """ Warning raised when there is a possible performance impact. + See Also + -------- + DataFrame.set_index : Set the DataFrame index using existing columns. + DataFrame.loc : Access a group of rows and columns by label(s) \ + or a boolean array. + Examples -------- - >>> df = pd.DataFrame({"jim": [0, 0, 1, 1], - ... "joe": ["x", "x", "z", "y"], - ... "jolie": [1, 2, 3, 4]}) + >>> df = pd.DataFrame( + ... {"jim": [0, 0, 1, 1], "joe": ["x", "x", "z", "y"], "jolie": [1, 2, 3, 4]} + ... ) >>> df = df.set_index(["jim", "joe"]) >>> df jolie @@ -60,7 +82,7 @@ class PerformanceWarning(Warning): x 2 1 z 3 y 4 - >>> df.loc[(1, 'z')] # doctest: +SKIP + >>> df.loc[(1, "z")] # doctest: +SKIP # PerformanceWarning: indexing past lexsort depth may impact performance. df.loc[(1, 'z')] jolie @@ -75,12 +97,17 @@ class UnsupportedFunctionCall(ValueError): For example, ``np.cumsum(groupby_object)``. + See Also + -------- + DataFrame.groupby : Group DataFrame using a mapper or by a Series of columns. + Series.groupby : Group Series using a mapper or by a Series of columns. + core.groupby.GroupBy.cumsum : Compute cumulative sum for each group. + Examples -------- - >>> df = pd.DataFrame({"A": [0, 0, 1, 1], - ... "B": ["x", "x", "z", "y"], - ... "C": [1, 2, 3, 4]} - ... ) + >>> df = pd.DataFrame( + ... {"A": [0, 0, 1, 1], "B": ["x", "x", "z", "y"], "C": [1, 2, 3, 4]} + ... ) >>> np.cumsum(df.groupby(["A"])) Traceback (most recent call last): UnsupportedFunctionCall: numpy operations are not valid with groupby. @@ -94,12 +121,20 @@ class UnsortedIndexError(KeyError): Subclass of `KeyError`. + See Also + -------- + DataFrame.sort_index : Sort a DataFrame by its index. + DataFrame.set_index : Set the DataFrame index using existing columns. + Examples -------- - >>> df = pd.DataFrame({"cat": [0, 0, 1, 1], - ... "color": ["white", "white", "brown", "black"], - ... "lives": [4, 4, 3, 7]}, - ... ) + >>> df = pd.DataFrame( + ... { + ... "cat": [0, 0, 1, 1], + ... "color": ["white", "white", "brown", "black"], + ... "lives": [4, 4, 3, 7], + ... }, + ... ) >>> df = df.set_index(["cat", "color"]) >>> df lives @@ -108,7 +143,7 @@ class UnsortedIndexError(KeyError): white 4 1 brown 3 black 7 - >>> df.loc[(0, "black"):(1, "white")] + >>> df.loc[(0, "black") : (1, "white")] Traceback (most recent call last): UnsortedIndexError: 'Key length (2) was greater than MultiIndex lexsort depth (1)' @@ -133,7 +168,7 @@ class ParserError(ValueError): ... cat,foo,bar ... dog,foo,"baz''' >>> from io import StringIO - >>> pd.read_csv(StringIO(data), skipfooter=1, engine='python') + >>> pd.read_csv(StringIO(data), skipfooter=1, engine="python") Traceback (most recent call last): ParserError: ',' expected after '"'. Error could possibly be due to parsing errors in the skipped footer rows @@ -167,12 +202,15 @@ class DtypeWarning(Warning): This example creates and reads a large CSV file with a column that contains `int` and `str`. - >>> df = pd.DataFrame({'a': (['1'] * 100000 + ['X'] * 100000 + - ... ['1'] * 100000), - ... 'b': ['b'] * 300000}) # doctest: +SKIP - >>> df.to_csv('test.csv', index=False) # doctest: +SKIP - >>> df2 = pd.read_csv('test.csv') # doctest: +SKIP - ... # DtypeWarning: Columns (0) have mixed types + >>> df = pd.DataFrame( + ... { + ... "a": (["1"] * 100000 + ["X"] * 100000 + ["1"] * 100000), + ... "b": ["b"] * 300000, + ... } + ... 
) # doctest: +SKIP + >>> df.to_csv("test.csv", index=False) # doctest: +SKIP + >>> df2 = pd.read_csv("test.csv") # doctest: +SKIP + ... # DtypeWarning: Columns (0: a) have mixed types It is important to notice that ``df2`` will contain both `str` and `int` for the same input, '1'. @@ -189,7 +227,7 @@ class DtypeWarning(Warning): One way to solve this issue is using the `dtype` parameter in the `read_csv` and `read_table` functions to make the conversion explicit: - >>> df2 = pd.read_csv('test.csv', sep=',', dtype={'a': str}) # doctest: +SKIP + >>> df2 = pd.read_csv("test.csv", sep=",", dtype={"a": str}) # doctest: +SKIP No warning was issued. """ @@ -199,6 +237,17 @@ class EmptyDataError(ValueError): """ Exception raised in ``pd.read_csv`` when empty data or header is encountered. + This error is typically encountered when attempting to read an empty file or + an invalid file where no data or headers are present. + + See Also + -------- + read_csv : Read a comma-separated values (CSV) file into DataFrame. + errors.ParserError : Exception that is raised by an error encountered in parsing + file contents. + errors.DtypeWarning : Warning raised when reading different dtypes in a column + from a file. + Examples -------- >>> from io import StringIO @@ -223,7 +272,6 @@ class ParserWarning(Warning): 1. `sep` other than a single character (e.g. regex separators) 2. `skipfooter` higher than 0 - 3. `sep=None` with `delim_whitespace=False` The warning can be avoided by adding `engine='python'` as a parameter in `pd.read_csv` and `pd.read_table` methods. @@ -241,12 +289,12 @@ class ParserWarning(Warning): >>> csv = '''a;b;c ... 1;1,8 ... 1;2,1''' - >>> df = pd.read_csv(io.StringIO(csv), sep='[;,]') # doctest: +SKIP + >>> df = pd.read_csv(io.StringIO(csv), sep="[;,]") # doctest: +SKIP ... # ParserWarning: Falling back to the 'python' engine... Adding `engine='python'` to `pd.read_csv` removes the Warning: - >>> df = pd.read_csv(io.StringIO(csv), sep='[;,]', engine='python') + >>> df = pd.read_csv(io.StringIO(csv), sep="[;,]", engine="python") """ @@ -256,15 +304,26 @@ class MergeError(ValueError): """ Subclass of ``ValueError``. + See Also + -------- + DataFrame.join : For joining DataFrames on their indexes. + merge : For merging two DataFrames on a common set of keys. + Examples -------- - >>> left = pd.DataFrame({"a": ["a", "b", "b", "d"], - ... "b": ["cat", "dog", "weasel", "horse"]}, - ... index=range(4)) - >>> right = pd.DataFrame({"a": ["a", "b", "c", "d"], - ... "c": ["meow", "bark", "chirp", "nay"]}, - ... index=range(4)).set_index("a") - >>> left.join(right, on="a", validate="one_to_one",) + >>> left = pd.DataFrame( + ... {"a": ["a", "b", "b", "d"], "b": ["cat", "dog", "weasel", "horse"]}, + ... index=range(4), + ... ) + >>> right = pd.DataFrame( + ... {"a": ["a", "b", "c", "d"], "c": ["meow", "bark", "chirp", "nay"]}, + ... index=range(4), + ... ).set_index("a") + >>> left.join( + ... right, + ... on="a", + ... validate="one_to_one", + ... ) Traceback (most recent call last): MergeError: Merge keys are not unique in left dataset; not a one-to-one merge """ @@ -274,12 +333,37 @@ class AbstractMethodError(NotImplementedError): """ Raise this error instead of NotImplementedError for abstract methods. + The `AbstractMethodError` is designed for use in classes that follow an abstract + base class pattern. Raising this error in the method ensures that a subclass + must implement the method to provide specific functionality. 
This is useful in a + framework or library where certain methods must be implemented by the user to + ensure correct behavior. + + Parameters + ---------- + class_instance : object + The instance of the class where the abstract method is being called. + methodtype : str, default "method" + A string indicating the type of method that is abstract. + Must be one of {"method", "classmethod", "staticmethod", "property"}. + + See Also + -------- + api.extensions.ExtensionArray + An example of a pandas extension mechanism that requires implementing + specific abstract methods. + NotImplementedError + A built-in exception that can also be used for abstract methods but lacks + the specificity of `AbstractMethodError` in indicating the need for subclass + implementation. + Examples -------- >>> class Foo: ... @classmethod ... def classmethod(cls): ... raise pd.errors.AbstractMethodError(cls, methodtype="classmethod") + ... ... def method(self): ... raise pd.errors.AbstractMethodError(self) >>> test = Foo.classmethod() @@ -312,10 +396,18 @@ class NumbaUtilError(Exception): """ Error raised for unsupported Numba engine routines. + See Also + -------- + DataFrame.groupby : Group DataFrame using a mapper or by a Series of columns. + Series.groupby : Group Series using a mapper or by a Series of columns. + DataFrame.agg : Aggregate using one or more operations over the specified axis. + Series.agg : Aggregate using one or more operations over the specified axis. + Examples -------- - >>> df = pd.DataFrame({"key": ["a", "a", "b", "b"], "data": [1, 2, 3, 4]}, - ... columns=["key", "data"]) + >>> df = pd.DataFrame( + ... {"key": ["a", "a", "b", "b"], "data": [1, 2, 3, 4]}, columns=["key", "data"] + ... ) >>> def incorrect_function(x): ... return sum(x) * 2.7 >>> df.groupby("key").agg(incorrect_function, engine="numba") @@ -329,12 +421,25 @@ class DuplicateLabelError(ValueError): """ Error raised when an operation would introduce duplicate labels. + This error is typically encountered when performing operations on objects + with `allows_duplicate_labels=False` and the operation would result in + duplicate labels in the index. Duplicate labels can lead to ambiguities + in indexing and reduce data integrity. + + See Also + -------- + Series.set_flags : Return a new ``Series`` object with updated flags. + DataFrame.set_flags : Return a new ``DataFrame`` object with updated flags. + Series.reindex : Conform ``Series`` object to new index with optional filling logic. + DataFrame.reindex : Conform ``DataFrame`` object to new index with optional filling + logic. + Examples -------- - >>> s = pd.Series([0, 1, 2], index=['a', 'b', 'c']).set_flags( + >>> s = pd.Series([0, 1, 2], index=["a", "b", "c"]).set_flags( ... allows_duplicate_labels=False ... ) - >>> s.reindex(['a', 'a', 'b']) + >>> s.reindex(["a", "a", "b"]) Traceback (most recent call last): ... DuplicateLabelError: Index has duplicates. @@ -348,11 +453,20 @@ class InvalidIndexError(Exception): """ Exception raised when attempting to use an invalid index key. + This exception is triggered when a user attempts to access or manipulate + data in a pandas DataFrame or Series using an index key that is not valid + for the given object. This may occur in cases such as using a malformed + slice, a mismatched key for a ``MultiIndex``, or attempting to access an index + element that does not exist. + + See Also + -------- + MultiIndex : A multi-level, or hierarchical, index object for pandas objects. 
+ Examples -------- >>> idx = pd.MultiIndex.from_product([["x", "y"], [0, 1]]) - >>> df = pd.DataFrame([[1, 1, 2, 2], - ... [3, 3, 4, 4]], columns=idx) + >>> df = pd.DataFrame([[1, 1, 2, 2], [3, 3, 4, 4]], columns=idx) >>> df x y 0 1 0 1 @@ -366,14 +480,19 @@ class InvalidIndexError(Exception): class DataError(Exception): """ - Exceptionn raised when performing an operation on non-numerical data. + Exception raised when performing an operation on non-numerical data. For example, calling ``ohlc`` on a non-numerical column or a function on a rolling window. + See Also + -------- + Series.rolling : Provide rolling window calculations on Series object. + DataFrame.rolling : Provide rolling window calculations on DataFrame object. + Examples -------- - >>> ser = pd.Series(['a', 'b', 'c']) + >>> ser = pd.Series(["a", "b", "c"]) >>> ser.rolling(2).sum() Traceback (most recent call last): DataError: No numeric types to aggregate @@ -392,66 +511,25 @@ class SpecificationError(Exception): The second way is calling ``agg`` on a Dataframe with duplicated functions names without assigning column name. + See Also + -------- + DataFrame.agg : Aggregate using one or more operations over the specified axis. + Series.agg : Aggregate using one or more operations over the specified axis. + Examples -------- - >>> df = pd.DataFrame({'A': [1, 1, 1, 2, 2], - ... 'B': range(5), - ... 'C': range(5)}) - >>> df.groupby('A').B.agg({'foo': 'count'}) # doctest: +SKIP + >>> df = pd.DataFrame({"A": [1, 1, 1, 2, 2], "B": range(5), "C": range(5)}) + >>> df.groupby("A").B.agg({"foo": "count"}) # doctest: +SKIP ... # SpecificationError: nested renamer is not supported - >>> df.groupby('A').agg({'B': {'foo': ['sum', 'max']}}) # doctest: +SKIP + >>> df.groupby("A").agg({"B": {"foo": ["sum", "max"]}}) # doctest: +SKIP ... # SpecificationError: nested renamer is not supported - >>> df.groupby('A').agg(['min', 'min']) # doctest: +SKIP + >>> df.groupby("A").agg(["min", "min"]) # doctest: +SKIP ... # SpecificationError: nested renamer is not supported """ -class SettingWithCopyError(ValueError): - """ - Exception raised when trying to set on a copied slice from a ``DataFrame``. - - The ``mode.chained_assignment`` needs to be set to set to 'raise.' This can - happen unintentionally when chained indexing. - - For more information on evaluation order, - see :ref:`the user guide`. - - For more information on view vs. copy, - see :ref:`the user guide`. - - Examples - -------- - >>> pd.options.mode.chained_assignment = 'raise' - >>> df = pd.DataFrame({'A': [1, 1, 1, 2, 2]}, columns=['A']) - >>> df.loc[0:3]['A'] = 'a' # doctest: +SKIP - ... # SettingWithCopyError: A value is trying to be set on a copy of a... - """ - - -class SettingWithCopyWarning(Warning): - """ - Warning raised when trying to set on a copied slice from a ``DataFrame``. - - The ``mode.chained_assignment`` needs to be set to set to 'warn.' - 'Warn' is the default option. This can happen unintentionally when - chained indexing. - - For more information on evaluation order, - see :ref:`the user guide`. - - For more information on view vs. copy, - see :ref:`the user guide`. - - Examples - -------- - >>> df = pd.DataFrame({'A': [1, 1, 1, 2, 2]}, columns=['A']) - >>> df.loc[0:3]['A'] = 'a' # doctest: +SKIP - ... # SettingWithCopyWarning: A value is trying to be set on a copy of a... - """ - - class ChainedAssignmentError(Warning): """ Warning raised when trying to set using chained assignment. 
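For context, a minimal sketch (not part of the patch) of the behaviour ``ChainedAssignmentError`` guards against under Copy-on-Write, and the single-step ``.loc`` assignment its message recommends:

import pandas as pd

pd.options.mode.copy_on_write = True
df = pd.DataFrame({"A": [1, 1, 1, 2, 2]})

# Chained assignment: df["A"] is an intermediate object that behaves as a
# copy under Copy-on-Write, so this could never update df and pandas emits
# ChainedAssignmentError (a Warning subclass):
# df["A"][0:3] = 10

# Single-step assignment through .loc updates df itself:
df.loc[0:2, "A"] = 10
print(df)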
@@ -462,94 +540,24 @@ class ChainedAssignmentError(Warning): Copy-on-Write always behaves as a copy. Thus, assigning through a chain can never update the original Series or DataFrame. - For more information on view vs. copy, - see :ref:`the user guide`. + For more information on Copy-on-Write, + see :ref:`the user guide`. + + See Also + -------- + options.mode.copy_on_write : Global setting for enabling or disabling + Copy-on-Write behavior. Examples -------- >>> pd.options.mode.copy_on_write = True - >>> df = pd.DataFrame({'A': [1, 1, 1, 2, 2]}, columns=['A']) - >>> df["A"][0:3] = 10 # doctest: +SKIP + >>> df = pd.DataFrame({"A": [1, 1, 1, 2, 2]}, columns=["A"]) + >>> df["A"][0:3] = 10 # doctest: +SKIP ... # ChainedAssignmentError: ... >>> pd.options.mode.copy_on_write = False """ -_chained_assignment_msg = ( - "A value is trying to be set on a copy of a DataFrame or Series " - "through chained assignment.\n" - "When using the Copy-on-Write mode, such chained assignment never works " - "to update the original DataFrame or Series, because the intermediate " - "object on which we are setting values always behaves as a copy.\n\n" - "Try using '.loc[row_indexer, col_indexer] = value' instead, to perform " - "the assignment in a single step.\n\n" - "See the caveats in the documentation: " - "https://pandas.pydata.org/pandas-docs/stable/user_guide/" - "indexing.html#returning-a-view-versus-a-copy" -) - - -_chained_assignment_method_msg = ( - "A value is trying to be set on a copy of a DataFrame or Series " - "through chained assignment using an inplace method.\n" - "When using the Copy-on-Write mode, such inplace method never works " - "to update the original DataFrame or Series, because the intermediate " - "object on which we are setting values always behaves as a copy.\n\n" - "For example, when doing 'df[col].method(value, inplace=True)', try " - "using 'df.method({col: value}, inplace=True)' instead, to perform " - "the operation inplace on the original object.\n\n" -) - - -_chained_assignment_warning_msg = ( - "ChainedAssignmentError: behaviour will change in pandas 3.0!\n" - "You are setting values through chained assignment. Currently this works " - "in certain cases, but when using Copy-on-Write (which will become the " - "default behaviour in pandas 3.0) this will never work to update the " - "original DataFrame or Series, because the intermediate object on which " - "we are setting values will behave as a copy.\n" - "A typical example is when you are setting values in a column of a " - "DataFrame, like:\n\n" - 'df["col"][row_indexer] = value\n\n' - 'Use `df.loc[row_indexer, "col"] = values` instead, to perform the ' - "assignment in a single step and ensure this keeps updating the original `df`.\n\n" - "See the caveats in the documentation: " - "https://pandas.pydata.org/pandas-docs/stable/user_guide/" - "indexing.html#returning-a-view-versus-a-copy\n" -) - - -_chained_assignment_warning_method_msg = ( - "A value is trying to be set on a copy of a DataFrame or Series " - "through chained assignment using an inplace method.\n" - "The behavior will change in pandas 3.0. 
This inplace method will " - "never work because the intermediate object on which we are setting " - "values always behaves as a copy.\n\n" - "For example, when doing 'df[col].method(value, inplace=True)', try " - "using 'df.method({col: value}, inplace=True)' or " - "df[col] = df[col].method(value) instead, to perform " - "the operation inplace on the original object.\n\n" -) - - -def _check_cacher(obj): - # This is a mess, selection paths that return a view set the _cacher attribute - # on the Series; most of them also set _item_cache which adds 1 to our relevant - # reference count, but iloc does not, so we have to check if we are actually - # in the item cache - if hasattr(obj, "_cacher"): - parent = obj._cacher[1]() - # parent could be dead - if parent is None: - return False - if hasattr(parent, "_item_cache"): - if obj._cacher[0] in parent._item_cache: - # Check if we are actually the item from item_cache, iloc creates a - # new object - return obj is parent._item_cache[obj._cacher[0]] - return False - - class NumExprClobberingError(NameError): """ Exception raised when trying to use a built-in numexpr name as a variable name. @@ -558,13 +566,18 @@ class NumExprClobberingError(NameError): to 'numexpr'. 'numexpr' is the default engine value for these methods if the numexpr package is installed. + See Also + -------- + eval : Evaluate a Python expression as a string using various backends. + DataFrame.query : Query the columns of a DataFrame with a boolean expression. + Examples -------- - >>> df = pd.DataFrame({'abs': [1, 1, 1]}) - >>> df.query("abs > 2") # doctest: +SKIP + >>> df = pd.DataFrame({"abs": [1, 1, 1]}) + >>> df.query("abs > 2") # doctest: +SKIP ... # NumExprClobberingError: Variables in expression "(abs) > (2)" overlap... >>> sin, a = 1, 2 - >>> pd.eval("sin + a", engine='numexpr') # doctest: +SKIP + >>> pd.eval("sin + a", engine="numexpr") # doctest: +SKIP ... # NumExprClobberingError: Variables in expression "(sin) + (a)" overlap... """ @@ -575,19 +588,33 @@ class UndefinedVariableError(NameError): It will also specify whether the undefined variable is local or not. + Parameters + ---------- + name : str + The name of the undefined variable. + is_local : bool or None, optional + Indicates whether the undefined variable is considered a local variable. + If ``True``, the error message specifies it as a local variable. + If ``False`` or ``None``, the variable is treated as a non-local name. + + See Also + -------- + DataFrame.query : Query the columns of a DataFrame with a boolean expression. + DataFrame.eval : Evaluate a string describing operations on DataFrame columns. + Examples -------- - >>> df = pd.DataFrame({'A': [1, 1, 1]}) - >>> df.query("A > x") # doctest: +SKIP + >>> df = pd.DataFrame({"A": [1, 1, 1]}) + >>> df.query("A > x") # doctest: +SKIP ... # UndefinedVariableError: name 'x' is not defined - >>> df.query("A > @y") # doctest: +SKIP + >>> df.query("A > @y") # doctest: +SKIP ... # UndefinedVariableError: local variable 'y' is not defined - >>> pd.eval('x + 1') # doctest: +SKIP + >>> pd.eval("x + 1") # doctest: +SKIP ... # UndefinedVariableError: name 'x' is not defined """ def __init__(self, name: str, is_local: bool | None = None) -> None: - base_msg = f"{repr(name)} is not defined" + base_msg = f"{name!r} is not defined" if is_local: msg = f"local variable {base_msg}" else: @@ -599,19 +626,29 @@ class IndexingError(Exception): """ Exception is raised when trying to index and there is a mismatch in dimensions. 
+ Raised by properties like :attr:`.pandas.DataFrame.iloc` when + an indexer is out of bounds or :attr:`.pandas.DataFrame.loc` when its index is + unalignable to the frame index. + + See Also + -------- + DataFrame.iloc : Purely integer-location based indexing for \ + selection by position. + DataFrame.loc : Access a group of rows and columns by label(s) \ + or a boolean array. + Examples -------- - >>> df = pd.DataFrame({'A': [1, 1, 1]}) - >>> df.loc[..., ..., 'A'] # doctest: +SKIP + >>> df = pd.DataFrame({"A": [1, 1, 1]}) + >>> df.loc[..., ..., "A"] # doctest: +SKIP ... # IndexingError: indexer may only contain one '...' entry - >>> df = pd.DataFrame({'A': [1, 1, 1]}) - >>> df.loc[1, ..., ...] # doctest: +SKIP + >>> df = pd.DataFrame({"A": [1, 1, 1]}) + >>> df.loc[1, ..., ...] # doctest: +SKIP ... # IndexingError: Too many indexers - >>> df[pd.Series([True], dtype=bool)] # doctest: +SKIP + >>> df[pd.Series([True], dtype=bool)] # doctest: +SKIP ... # IndexingError: Unalignable boolean Series provided as indexer... - >>> s = pd.Series(range(2), - ... index = pd.MultiIndex.from_product([["a", "b"], ["c"]])) - >>> s.loc["a", "c", "d"] # doctest: +SKIP + >>> s = pd.Series(range(2), index=pd.MultiIndex.from_product([["a", "b"], ["c"]])) + >>> s.loc["a", "c", "d"] # doctest: +SKIP ... # IndexingError: Too many indexers """ @@ -645,16 +682,24 @@ class CSSWarning(UserWarning): This can be due to the styling not having an equivalent value or because the styling isn't properly formatted. + See Also + -------- + DataFrame.style : Returns a Styler object for applying CSS-like styles. + io.formats.style.Styler : Helps style a DataFrame or Series according to the + data with HTML and CSS. + io.formats.style.Styler.to_excel : Export styled DataFrame to Excel. + io.formats.style.Styler.to_html : Export styled DataFrame to HTML. + Examples -------- - >>> df = pd.DataFrame({'A': [1, 1, 1]}) - >>> df.style.applymap( - ... lambda x: 'background-color: blueGreenRed;' - ... ).to_excel('styled.xlsx') # doctest: +SKIP + >>> df = pd.DataFrame({"A": [1, 1, 1]}) + >>> df.style.map(lambda x: "background-color: blueGreenRed;").to_excel( + ... "styled.xlsx" + ... ) # doctest: +SKIP CSSWarning: Unhandled color format: 'blueGreenRed' - >>> df.style.applymap( - ... lambda x: 'border: 1px solid red red;' - ... ).to_excel('styled.xlsx') # doctest: +SKIP + >>> df.style.map(lambda x: "border: 1px solid red red;").to_excel( + ... "styled.xlsx" + ... ) # doctest: +SKIP CSSWarning: Too many tokens provided to "border" (expected 1-3) """ @@ -663,11 +708,19 @@ class PossibleDataLossError(Exception): """ Exception raised when trying to open an HDFStore file that is already open. + This error is triggered when there is a potential risk of data loss due to + conflicting operations on an HDFStore file. It serves to prevent unintended + overwrites or data corruption by enforcing exclusive access to the file. + + See Also + -------- + HDFStore : Dict-like IO interface for storing pandas objects in PyTables. + HDFStore.open : Open an HDFStore file in the specified mode. + Examples -------- - >>> store = pd.HDFStore('my-store', 'a') # doctest: +SKIP - >>> store.open("w") # doctest: +SKIP - ... # PossibleDataLossError: Re-opening the file [my-store] with mode [a]... + >>> store = pd.HDFStore("my-store", "a") # doctest: +SKIP + >>> store.open("w") # doctest: +SKIP """ @@ -675,11 +728,21 @@ class ClosedFileError(Exception): """ Exception is raised when trying to perform an operation on a closed HDFStore file. 
+ ``ClosedFileError`` is specific to operations on ``HDFStore`` objects. Once an + HDFStore is closed, its resources are no longer available, and any further attempt + to access data or perform file operations will raise this exception. + + See Also + -------- + HDFStore.close : Closes the PyTables file handle. + HDFStore.open : Opens the file in the specified mode. + HDFStore.is_open : Returns a boolean indicating whether the file is open. + Examples -------- - >>> store = pd.HDFStore('my-store', 'a') # doctest: +SKIP - >>> store.close() # doctest: +SKIP - >>> store.keys() # doctest: +SKIP + >>> store = pd.HDFStore("my-store", "a") # doctest: +SKIP + >>> store.close() # doctest: +SKIP + >>> store.keys() # doctest: +SKIP ... # ClosedFileError: my-store file is not open! """ @@ -698,14 +761,20 @@ class AttributeConflictWarning(Warning): name than the existing index on an HDFStore or attempting to append an index with a different frequency than the existing index on an HDFStore. + See Also + -------- + HDFStore : Dict-like IO interface for storing pandas objects in PyTables. + DataFrame.to_hdf : Write the contained data to an HDF5 file using HDFStore. + read_hdf : Read from an HDF5 file into a DataFrame. + Examples -------- - >>> idx1 = pd.Index(['a', 'b'], name='name1') + >>> idx1 = pd.Index(["a", "b"], name="name1") >>> df1 = pd.DataFrame([[1, 2], [3, 4]], index=idx1) - >>> df1.to_hdf('file', 'data', 'w', append=True) # doctest: +SKIP - >>> idx2 = pd.Index(['c', 'd'], name='name2') + >>> df1.to_hdf("file", "data", "w", append=True) # doctest: +SKIP + >>> idx2 = pd.Index(["c", "d"], name="name2") >>> df2 = pd.DataFrame([[5, 6], [7, 8]], index=idx2) - >>> df2.to_hdf('file', 'data', 'a', append=True) # doctest: +SKIP + >>> df2.to_hdf("file", "data", "a", append=True) # doctest: +SKIP AttributeConflictWarning: the [index_name] attribute of the existing index is [name1] which conflicts with the new [name2]... """ @@ -713,14 +782,19 @@ class AttributeConflictWarning(Warning): class DatabaseError(OSError): """ - Error is raised when executing sql with bad syntax or sql that throws an error. + Error is raised when executing SQL with bad syntax or SQL that throws an error. + + Raised by :func:`.pandas.read_sql` when a bad SQL statement is passed in. + + See Also + -------- + read_sql : Read SQL query or database table into a DataFrame. Examples -------- >>> from sqlite3 import connect - >>> conn = connect(':memory:') - >>> pd.read_sql('select * test', conn) # doctest: +SKIP - ... # DatabaseError: Execution failed on sql 'test': near "test": syntax error + >>> conn = connect(":memory:") + >>> pd.read_sql("select * test", conn) # doctest: +SKIP """ @@ -731,11 +805,14 @@ class PossiblePrecisionLoss(Warning): When the column value is outside or equal to the int64 value the column is converted to a float64 dtype. + See Also + -------- + DataFrame.to_stata : Export DataFrame object to Stata dta format. + Examples -------- >>> df = pd.DataFrame({"s": pd.Series([1, 2**53], dtype=np.int64)}) - >>> df.to_stata('test') # doctest: +SKIP - ... # PossiblePrecisionLoss: Column converted from int64 to float64... + >>> df.to_stata("test") # doctest: +SKIP """ @@ -743,11 +820,20 @@ class ValueLabelTypeMismatch(Warning): """ Warning raised by to_stata on a category column that contains non-string values. + When exporting data to Stata format using the `to_stata` method, category columns + must have string values as labels. 
If a category column contains non-string values + (e.g., integers, floats, or other types), this warning is raised to indicate that + the Stata file may not correctly represent the data. + + See Also + -------- + DataFrame.to_stata : Export DataFrame object to Stata dta format. + Series.cat : Accessor for categorical properties of the Series values. + Examples -------- >>> df = pd.DataFrame({"categories": pd.Series(["a", 2], dtype="category")}) - >>> df.to_stata('test') # doctest: +SKIP - ... # ValueLabelTypeMismatch: Stata value labels (pandas categories) must be str... + >>> df.to_stata("test") # doctest: +SKIP """ @@ -758,11 +844,14 @@ class InvalidColumnName(Warning): """ Warning raised by to_stata when a column contains an invalid Stata name. Because the column name is an invalid Stata variable, the name needs to be converted. + See Also + -------- + DataFrame.to_stata : Export DataFrame object to Stata dta format. + Examples -------- >>> df = pd.DataFrame({"0categories": pd.Series([2, 2])}) - >>> df.to_stata('test') # doctest: +SKIP - ... # InvalidColumnName: Not all pandas column names were valid Stata variable... + >>> df.to_stata("test") # doctest: +SKIP """ @@ -770,12 +859,21 @@ class CategoricalConversionWarning(Warning): """ Warning is raised when reading a partially labeled Stata file using an iterator. + This warning helps ensure data integrity and alerts users to potential issues + during the incremental reading of Stata files with labeled data, allowing for + additional checks and adjustments as necessary. + + See Also + -------- + read_stata : Read a Stata file into a DataFrame. + Categorical : Represents a categorical variable in pandas. + Examples -------- >>> from pandas.io.stata import StataReader - >>> with StataReader('dta_file', chunksize=2) as reader: # doctest: +SKIP - ... for i, block in enumerate(reader): - ... print(i, block) + >>> with StataReader("dta_file", chunksize=2) as reader: # doctest: +SKIP + ... for i, block in enumerate(reader): + ... print(i, block) ... # CategoricalConversionWarning: One or more series with value labels... 
""" @@ -809,27 +907,28 @@ class InvalidComparison(Exception): __all__ = [ "AbstractMethodError", "AttributeConflictWarning", + "CSSWarning", "CategoricalConversionWarning", + "ChainedAssignmentError", "ClosedFileError", - "CSSWarning", - "DatabaseError", "DataError", + "DatabaseError", "DtypeWarning", "DuplicateLabelError", "EmptyDataError", "IncompatibilityWarning", + "IndexingError", "IntCastingNaNError", "InvalidColumnName", "InvalidComparison", "InvalidIndexError", "InvalidVersion", - "IndexingError", "LossySetitemError", "MergeError", "NoBufferPresent", "NullFrequencyError", - "NumbaUtilError", "NumExprClobberingError", + "NumbaUtilError", "OptionError", "OutOfBoundsDatetime", "OutOfBoundsTimedelta", @@ -840,8 +939,6 @@ class InvalidComparison(Exception): "PossiblePrecisionLoss", "PyperclipException", "PyperclipWindowsException", - "SettingWithCopyError", - "SettingWithCopyWarning", "SpecificationError", "UndefinedVariableError", "UnsortedIndexError", diff --git a/pandas/errors/cow.py b/pandas/errors/cow.py new file mode 100644 index 0000000000000..4815a0f5328d7 --- /dev/null +++ b/pandas/errors/cow.py @@ -0,0 +1,24 @@ +_chained_assignment_msg = ( + "A value is trying to be set on a copy of a DataFrame or Series " + "through chained assignment.\n" + "When using the Copy-on-Write mode, such chained assignment never works " + "to update the original DataFrame or Series, because the intermediate " + "object on which we are setting values always behaves as a copy.\n\n" + "Try using '.loc[row_indexer, col_indexer] = value' instead, to perform " + "the assignment in a single step.\n\n" + "See the caveats in the documentation: " + "https://pandas.pydata.org/pandas-docs/stable/user_guide/" + "copy_on_write.html" +) + + +_chained_assignment_method_msg = ( + "A value is trying to be set on a copy of a DataFrame or Series " + "through chained assignment using an inplace method.\n" + "When using the Copy-on-Write mode, such inplace method never works " + "to update the original DataFrame or Series, because the intermediate " + "object on which we are setting values always behaves as a copy.\n\n" + "For example, when doing 'df[col].method(value, inplace=True)', try " + "using 'df.method({col: value}, inplace=True)' instead, to perform " + "the operation inplace on the original object.\n\n" +) diff --git a/pandas/io/__init__.py b/pandas/io/__init__.py index c804b81c49e7c..1c7e531debb14 100644 --- a/pandas/io/__init__.py +++ b/pandas/io/__init__.py @@ -1,4 +1,4 @@ -# ruff: noqa: TCH004 +# ruff: noqa: TC004 from typing import TYPE_CHECKING if TYPE_CHECKING: diff --git a/pandas/io/_util.py b/pandas/io/_util.py index 3b2ae5daffdba..6827fbe9c998e 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -1,11 +1,30 @@ from __future__ import annotations -from typing import Callable +from typing import ( + TYPE_CHECKING, + Literal, +) +import numpy as np + +from pandas._config import using_string_dtype + +from pandas._libs import lib +from pandas.compat import ( + pa_version_under18p0, + pa_version_under19p0, +) from pandas.compat._optional import import_optional_dependency import pandas as pd +if TYPE_CHECKING: + from collections.abc import Callable + + import pyarrow + + from pandas._typing import DtypeBackend + def _arrow_dtype_mapping() -> dict: pa = import_optional_dependency("pyarrow") @@ -22,13 +41,53 @@ def _arrow_dtype_mapping() -> dict: pa.string(): pd.StringDtype(), pa.float32(): pd.Float32Dtype(), pa.float64(): pd.Float64Dtype(), + pa.string(): pd.StringDtype(), + pa.large_string(): 
pd.StringDtype(), } -def arrow_string_types_mapper() -> Callable: +def _arrow_string_types_mapper() -> Callable: pa = import_optional_dependency("pyarrow") - return { - pa.string(): pd.StringDtype(storage="pyarrow_numpy"), - pa.large_string(): pd.StringDtype(storage="pyarrow_numpy"), - }.get + mapping = { + pa.string(): pd.StringDtype(na_value=np.nan), + pa.large_string(): pd.StringDtype(na_value=np.nan), + } + if not pa_version_under18p0: + mapping[pa.string_view()] = pd.StringDtype(na_value=np.nan) + + return mapping.get + + +def arrow_table_to_pandas( + table: pyarrow.Table, + dtype_backend: DtypeBackend | Literal["numpy"] | lib.NoDefault = lib.no_default, + null_to_int64: bool = False, + to_pandas_kwargs: dict | None = None, +) -> pd.DataFrame: + pa = import_optional_dependency("pyarrow") + + to_pandas_kwargs = {} if to_pandas_kwargs is None else to_pandas_kwargs + + types_mapper: type[pd.ArrowDtype] | None | Callable + if dtype_backend == "numpy_nullable": + mapping = _arrow_dtype_mapping() + if null_to_int64: + # Modify the default mapping to also map null to Int64 + # (to match other engines - only for CSV parser) + mapping[pa.null()] = pd.Int64Dtype() + types_mapper = mapping.get + elif dtype_backend == "pyarrow": + types_mapper = pd.ArrowDtype + elif using_string_dtype(): + if pa_version_under19p0: + types_mapper = _arrow_string_types_mapper() + else: + types_mapper = None + elif dtype_backend is lib.no_default or dtype_backend == "numpy": + types_mapper = None + else: + raise NotImplementedError + + df = table.to_pandas(types_mapper=types_mapper, **to_pandas_kwargs) + return df diff --git a/pandas/io/api.py b/pandas/io/api.py index 4e8b34a61dfc6..5900c94384384 100644 --- a/pandas/io/api.py +++ b/pandas/io/api.py @@ -9,8 +9,8 @@ read_excel, ) from pandas.io.feather_format import read_feather -from pandas.io.gbq import read_gbq from pandas.io.html import read_html +from pandas.io.iceberg import read_iceberg from pandas.io.json import read_json from pandas.io.orc import read_orc from pandas.io.parquet import read_parquet @@ -46,9 +46,9 @@ "read_excel", "read_feather", "read_fwf", - "read_gbq", "read_hdf", "read_html", + "read_iceberg", "read_json", "read_orc", "read_parquet", diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index a15e37328e9fa..2ed241f0b9bca 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -1,4 +1,5 @@ -""" io on the clipboard """ +"""io on the clipboard""" + from __future__ import annotations from io import StringIO @@ -37,14 +38,15 @@ def read_clipboard( A string or regex delimiter. The default of ``'\\s+'`` denotes one or more whitespace characters. - dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable' + dtype_backend : {'numpy_nullable', 'pyarrow'} Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). Behaviour is as follows: + (still experimental). If not specified, the default behavior + is to not use nullable data types. If specified, the behavior + is as follows: * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. + * ``"pyarrow"``: returns pyarrow-backed nullable + :class:`ArrowDtype` :class:`DataFrame` .. 
versionadded:: 2.0 @@ -64,7 +66,7 @@ def read_clipboard( Examples -------- - >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C']) + >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"]) >>> df.to_clipboard() # doctest: +SKIP >>> pd.read_clipboard() # doctest: +SKIP A B C @@ -112,9 +114,8 @@ def read_clipboard( if index_length != 0: kwargs.setdefault("index_col", list(range(index_length))) - # Edge case where sep is specified to be None, return to default - if sep is None and kwargs.get("delim_whitespace") is None: - sep = r"\s+" + elif not isinstance(sep, str): + raise ValueError(f"{sep=} must be a string") # Regex separator currently only works with python engine. # Default to python if separator is multi-character (regex) diff --git a/pandas/io/common.py b/pandas/io/common.py index 72c9deeb54fc7..1a9e6b472463d 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -1,4 +1,5 @@ """Common IO api utilities""" + from __future__ import annotations from abc import ( @@ -54,10 +55,6 @@ BaseBuffer, ReadCsvBuffer, ) -from pandas.compat import ( - get_bz2_file, - get_lzma_file, -) from pandas.compat._optional import import_optional_dependency from pandas.util._decorators import doc from pandas.util._exceptions import find_stack_level @@ -74,7 +71,7 @@ _VALID_URLS = set(uses_relative + uses_netloc + uses_params) _VALID_URLS.discard("") -_RFC_3986_PATTERN = re.compile(r"^[A-Za-z][A-Za-z0-9+\-+.]*://") +_FSSPEC_URL_PATTERN = re.compile(r"^[A-Za-z][A-Za-z0-9+\-+.]*(::[A-Za-z0-9+\-+.]+)*://") BaseBufferT = TypeVar("BaseBufferT", bound=BaseBuffer) @@ -176,13 +173,11 @@ def is_url(url: object) -> bool: @overload -def _expand_user(filepath_or_buffer: str) -> str: - ... +def _expand_user(filepath_or_buffer: str) -> str: ... @overload -def _expand_user(filepath_or_buffer: BaseBufferT) -> BaseBufferT: - ... +def _expand_user(filepath_or_buffer: BaseBufferT) -> BaseBufferT: ... def _expand_user(filepath_or_buffer: str | BaseBufferT) -> str | BaseBufferT: @@ -234,15 +229,15 @@ def validate_header_arg(header: object) -> None: @overload -def stringify_path(filepath_or_buffer: FilePath, convert_file_like: bool = ...) -> str: - ... +def stringify_path( + filepath_or_buffer: FilePath, convert_file_like: bool = ... +) -> str: ... @overload def stringify_path( filepath_or_buffer: BaseBufferT, convert_file_like: bool = ... -) -> BaseBufferT: - ... +) -> BaseBufferT: ... def stringify_path( @@ -279,14 +274,14 @@ def stringify_path( return _expand_user(filepath_or_buffer) -def urlopen(*args, **kwargs): +def urlopen(*args: Any, **kwargs: Any) -> Any: """ Lazy-import wrapper for stdlib urlopen, as that imports a big chunk of the stdlib. 
""" import urllib.request - return urllib.request.urlopen(*args, **kwargs) + return urllib.request.urlopen(*args, **kwargs) # noqa: TID251 def is_fsspec_url(url: FilePath | BaseBuffer) -> bool: @@ -296,7 +291,7 @@ def is_fsspec_url(url: FilePath | BaseBuffer) -> bool: """ return ( isinstance(url, str) - and bool(_RFC_3986_PATTERN.match(url)) + and bool(_FSSPEC_URL_PATTERN.match(url)) and not url.startswith(("http://", "https://")) ) @@ -318,7 +313,7 @@ def _get_filepath_or_buffer( Parameters ---------- - filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path), + filepath_or_buffer : a url, filepath (str or pathlib.Path), or buffer {compression_options} @@ -362,6 +357,16 @@ def _get_filepath_or_buffer( stacklevel=find_stack_level(), ) + if "a" in mode and compression_method in ["zip", "tar"]: + # GH56778 + warnings.warn( + "zip and tar do not support mode 'a' properly. " + "This combination will result in multiple files with same name " + "being added to the archive.", + RuntimeWarning, + stacklevel=find_stack_level(), + ) + # Use binary mode when converting path-like objects to file-like objects (fsspec) # except when text mode is explicitly requested. The original mode is returned if # fsspec is not used. @@ -579,6 +584,9 @@ def infer_compression( # Infer compression if compression == "infer": # Convert all path types (e.g. pathlib.Path) to strings + if isinstance(filepath_or_buffer, str) and "::" in filepath_or_buffer: + # chained URLs contain :: + filepath_or_buffer = filepath_or_buffer.split("::")[0] filepath_or_buffer = stringify_path(filepath_or_buffer, convert_file_like=True) if not isinstance(filepath_or_buffer, str): # Cannot infer compression of a buffer, assume no compression @@ -627,8 +635,7 @@ def get_handle( is_text: Literal[False], errors: str | None = ..., storage_options: StorageOptions = ..., -) -> IOHandles[bytes]: - ... +) -> IOHandles[bytes]: ... @overload @@ -642,8 +649,7 @@ def get_handle( is_text: Literal[True] = ..., errors: str | None = ..., storage_options: StorageOptions = ..., -) -> IOHandles[str]: - ... +) -> IOHandles[str]: ... @overload @@ -657,8 +663,7 @@ def get_handle( is_text: bool = ..., errors: str | None = ..., storage_options: StorageOptions = ..., -) -> IOHandles[str] | IOHandles[bytes]: - ... +) -> IOHandles[str] | IOHandles[bytes]: ... 
@doc(compression_options=_shared_docs["compression_options"] % "path_or_buf") @@ -778,9 +783,11 @@ def get_handle( # BZ Compression elif compression == "bz2": + import bz2 + # Overload of "BZ2File" to handle pickle protocol 5 # "Union[str, BaseBuffer]", "str", "Dict[str, Any]" - handle = get_bz2_file()( # type: ignore[call-overload] + handle = bz2.BZ2File( # type: ignore[call-overload] handle, mode=ioargs.mode, **compression_args, @@ -792,7 +799,9 @@ def get_handle( # "Union[str, BaseBuffer]"; expected "Union[Union[str, PathLike[str]], # ReadBuffer[bytes], WriteBuffer[bytes]]" handle = _BytesZipFile( - handle, ioargs.mode, **compression_args # type: ignore[arg-type] + handle, # type: ignore[arg-type] + ioargs.mode, + **compression_args, ) if handle.buffer.mode == "r": handles.append(handle) @@ -817,7 +826,8 @@ def get_handle( # type "BaseBuffer"; expected "Union[ReadBuffer[bytes], # WriteBuffer[bytes], None]" handle = _BytesTarFile( - fileobj=handle, **compression_args # type: ignore[arg-type] + fileobj=handle, # type: ignore[arg-type] + **compression_args, ) assert isinstance(handle, _BytesTarFile) if "r" in handle.buffer.mode: @@ -840,8 +850,12 @@ def get_handle( # error: Argument 1 to "LZMAFile" has incompatible type "Union[str, # BaseBuffer]"; expected "Optional[Union[Union[str, bytes, PathLike[str], # PathLike[bytes]], IO[bytes]], None]" - handle = get_lzma_file()( - handle, ioargs.mode, **compression_args # type: ignore[arg-type] + import lzma + + handle = lzma.LZMAFile( + handle, # type: ignore[arg-type] + ioargs.mode, + **compression_args, ) # Zstd Compression @@ -899,10 +913,10 @@ def get_handle( or not hasattr(handle, "seekable") ): handle = _IOWrapper(handle) - # error: Argument 1 to "TextIOWrapper" has incompatible type - # "_IOWrapper"; expected "IO[bytes]" + # error: Value of type variable "_BufferT_co" of "TextIOWrapper" cannot + # be "_IOWrapper | BaseBuffer" [type-var] handle = TextIOWrapper( - handle, # type: ignore[arg-type] + handle, # type: ignore[type-var] encoding=ioargs.encoding, errors=errors, newline="", @@ -948,8 +962,7 @@ class _BufferedWriter(BytesIO, ABC): # type: ignore[misc] buffer = BytesIO() @abstractmethod - def write_to_buffer(self) -> None: - ... + def write_to_buffer(self) -> None: ... 
def close(self) -> None: if self.closed: @@ -972,7 +985,7 @@ def __init__( mode: Literal["r", "a", "w", "x"] = "r", fileobj: ReadBuffer[bytes] | WriteBuffer[bytes] | None = None, archive_name: str | None = None, - **kwargs, + **kwargs: Any, ) -> None: super().__init__() self.archive_name = archive_name @@ -1025,7 +1038,7 @@ def __init__( file: FilePath | ReadBuffer[bytes] | WriteBuffer[bytes], mode: str, archive_name: str | None = None, - **kwargs, + **kwargs: Any, ) -> None: super().__init__() mode = mode.replace("b", "") @@ -1066,7 +1079,7 @@ class _IOWrapper: def __init__(self, buffer: BaseBuffer) -> None: self.buffer = buffer - def __getattr__(self, name: str): + def __getattr__(self, name: str) -> Any: return getattr(self.buffer, name) def readable(self) -> bool: @@ -1097,7 +1110,7 @@ def __init__(self, buffer: StringIO | TextIOBase, encoding: str = "utf-8") -> No # overflow to the front of the bytestring the next time reading is performed self.overflow = b"" - def __getattr__(self, attr: str): + def __getattr__(self, attr: str) -> Any: return getattr(self.buffer, attr) def read(self, n: int | None = -1) -> bytes: @@ -1137,7 +1150,9 @@ def _maybe_memory_map( # expected "BaseBuffer" wrapped = _IOWrapper( mmap.mmap( - handle.fileno(), 0, access=mmap.ACCESS_READ # type: ignore[arg-type] + handle.fileno(), + 0, + access=mmap.ACCESS_READ, # type: ignore[arg-type] ) ) finally: @@ -1221,12 +1236,14 @@ def is_potential_multi_index( bool : Whether or not columns could become a MultiIndex """ if index_col is None or isinstance(index_col, bool): - index_col = [] + index_columns = set() + else: + index_columns = set(index_col) return bool( len(columns) and not isinstance(columns, ABCMultiIndex) - and all(isinstance(c, tuple) for c in columns if c not in list(index_col)) + and all(isinstance(c, tuple) for c in columns if c not in index_columns) ) diff --git a/pandas/io/excel/__init__.py b/pandas/io/excel/__init__.py index 275cbf0148f94..f13d7afa63d84 100644 --- a/pandas/io/excel/__init__.py +++ b/pandas/io/excel/__init__.py @@ -8,7 +8,7 @@ from pandas.io.excel._util import register_writer from pandas.io.excel._xlsxwriter import XlsxWriter as _XlsxWriter -__all__ = ["read_excel", "ExcelWriter", "ExcelFile"] +__all__ = ["ExcelFile", "ExcelWriter", "read_excel"] register_writer(_OpenpyxlWriter) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index bce890c6f73b0..1dc6c1f08b49a 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1,21 +1,21 @@ from __future__ import annotations from collections.abc import ( + Callable, Hashable, Iterable, Mapping, Sequence, ) import datetime +from decimal import Decimal from functools import partial -from io import BytesIO import os from textwrap import fill from typing import ( IO, TYPE_CHECKING, Any, - Callable, Generic, Literal, TypeVar, @@ -44,6 +44,8 @@ from pandas.core.dtypes.common import ( is_bool, + is_decimal, + is_file_like, is_float, is_integer, is_list_like, @@ -77,6 +79,7 @@ DtypeBackend, ExcelWriterIfSheetExists, FilePath, + HashableT, IntStrT, ReadBuffer, Self, @@ -86,7 +89,7 @@ ) _read_excel_doc = ( """ -Read an Excel file into a ``pandas`` ``DataFrame``. +Read an Excel file into a ``DataFrame``. Supports `xls`, `xlsx`, `xlsm`, `xlsb`, `odf`, `ods` and `odt` file extensions read from a local filesystem or URL. 
Supports an option to read @@ -94,7 +97,7 @@ Parameters ---------- -io : str, bytes, ExcelFile, xlrd.Book, path object, or file-like object +io : str, ExcelFile, xlrd.Book, path object, or file-like object Any valid string path is acceptable. The string could be a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is expected. A local file could be: ``file://localhost/path/to/table.xlsx``. @@ -112,7 +115,7 @@ Strings are used for sheet names. Integers are used in zero-indexed sheet positions (chart sheets do not count as a sheet position). Lists of strings/integers are used to request multiple sheets. - Specify ``None`` to get all worksheets. + When ``None``, will return a dictionary containing DataFrames for each sheet. Available cases: @@ -121,7 +124,7 @@ * ``"Sheet1"``: Load sheet with name "Sheet1" * ``[0, 1, "Sheet5"]``: Load first, second and sheet named "Sheet5" as a dict of `DataFrame` - * ``None``: All worksheets. + * ``None``: Returns a dictionary containing DataFrames for each sheet.. header : int, list of int, default 0 Row (0-indexed) to use for the column labels of the parsed @@ -160,36 +163,24 @@ If converters are specified, they will be applied INSTEAD of dtype conversion. If you use ``None``, it will infer the dtype of each column based on the data. -engine : str, default None +engine : {{'openpyxl', 'calamine', 'odf', 'pyxlsb', 'xlrd'}}, default None If io is not a buffer or path, this must be set to identify io. - Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb", "calamine". Engine compatibility : - - ``xlr`` supports old-style Excel files (.xls). - ``openpyxl`` supports newer Excel file formats. - - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt). - - ``pyxlsb`` supports Binary Excel files. - ``calamine`` supports Excel (.xls, .xlsx, .xlsm, .xlsb) and OpenDocument (.ods) file formats. + - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt). + - ``pyxlsb`` supports Binary Excel files. + - ``xlrd`` supports old-style Excel files (.xls). - .. versionchanged:: 1.2.0 - The engine `xlrd `_ - now only supports old-style ``.xls`` files. - When ``engine=None``, the following logic will be - used to determine the engine: - - - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), - then `odf `_ will be used. - - Otherwise if ``path_or_buffer`` is an xls format, - ``xlrd`` will be used. - - Otherwise if ``path_or_buffer`` is in xlsb format, - ``pyxlsb`` will be used. - - .. versionadded:: 1.3.0 - - Otherwise ``openpyxl`` will be used. - - .. versionchanged:: 1.3.0 + When ``engine=None``, the following logic will be used to determine the engine: + - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), + then `odf `_ will be used. + - Otherwise if ``path_or_buffer`` is an xls format, ``xlrd`` will be used. + - Otherwise if ``path_or_buffer`` is in xlsb format, ``pyxlsb`` will be used. + - Otherwise ``openpyxl`` will be used. converters : dict, default None Dict of functions for converting values in certain columns. Keys can either be integers or column labels, values are functions that take one @@ -206,7 +197,7 @@ False otherwise. An example of a valid callable argument would be ``lambda x: x in [0, 2]``. nrows : int, default None - Number of rows to parse. + Number of rows to parse. Does not include header rows. na_values : scalar, str, list-like, or dict, default None Additional strings to recognize as NA/NaN. If dict passed, specific per-column NA values. 
By default the following values are interpreted @@ -251,20 +242,6 @@ For non-standard datetime parsing, use ``pd.to_datetime`` after ``pd.read_excel``. Note: A fast-path exists for iso8601-formatted dates. -date_parser : function, optional - Function to use for converting a sequence of string columns to an array of - datetime instances. The default uses ``dateutil.parser.parser`` to do the - conversion. Pandas will try to call `date_parser` in three different ways, - advancing to the next if an exception occurs: 1) Pass one or more arrays - (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the - string values from the columns defined by `parse_dates` into a single array - and pass that; and 3) call `date_parser` once for each row using one or - more strings (corresponding to the columns defined by `parse_dates`) as - arguments. - - .. deprecated:: 2.0.0 - Use ``date_format`` instead, or read in as ``object`` and then apply - :func:`to_datetime` as-needed. date_format : str or dict of column -> format, default ``None`` If used in conjunction with ``parse_dates``, will parse dates according to this format. For anything more complex, @@ -292,14 +269,15 @@ Rows at the end to skip (0-indexed). {storage_options} -dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable' +dtype_backend : {{'numpy_nullable', 'pyarrow'}} Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). Behaviour is as follows: + (still experimental). If not specified, the default behavior + is to not use nullable data types. If specified, the behavior + is as follows: * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. + * ``"pyarrow"``: returns pyarrow-backed nullable + :class:`ArrowDtype` :class:`DataFrame` .. versionadded:: 2.0 @@ -395,7 +373,7 @@ def read_excel( | str | Sequence[int] | Sequence[str] - | Callable[[str], bool] + | Callable[[HashableT], bool] | None = ..., dtype: DtypeArg | None = ..., engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = ..., @@ -409,7 +387,6 @@ def read_excel( na_filter: bool = ..., verbose: bool = ..., parse_dates: list | dict | bool = ..., - date_parser: Callable | lib.NoDefault = ..., date_format: dict[Hashable, str] | str | None = ..., thousands: str | None = ..., decimal: str = ..., @@ -417,8 +394,7 @@ def read_excel( skipfooter: int = ..., storage_options: StorageOptions = ..., dtype_backend: DtypeBackend | lib.NoDefault = ..., -) -> DataFrame: - ... +) -> DataFrame: ... @overload @@ -434,7 +410,7 @@ def read_excel( | str | Sequence[int] | Sequence[str] - | Callable[[str], bool] + | Callable[[HashableT], bool] | None = ..., dtype: DtypeArg | None = ..., engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = ..., @@ -448,7 +424,6 @@ def read_excel( na_filter: bool = ..., verbose: bool = ..., parse_dates: list | dict | bool = ..., - date_parser: Callable | lib.NoDefault = ..., date_format: dict[Hashable, str] | str | None = ..., thousands: str | None = ..., decimal: str = ..., @@ -456,8 +431,7 @@ def read_excel( skipfooter: int = ..., storage_options: StorageOptions = ..., dtype_backend: DtypeBackend | lib.NoDefault = ..., -) -> dict[IntStrT, DataFrame]: - ... +) -> dict[IntStrT, DataFrame]: ... 
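With the deprecated ``date_parser`` argument now removed in the hunks above, a hedged usage sketch of the ``date_format`` replacement (``report.xlsx`` and the ``when`` column are hypothetical):

import pandas as pd

# Parse the "when" column while reading; a date_format mapping replaces the
# old date_parser callable.
df = pd.read_excel(
    "report.xlsx",
    parse_dates=["when"],
    date_format={"when": "%Y-%m-%d"},
)

# Equivalent post-processing route:
# df = pd.read_excel("report.xlsx")
# df["when"] = pd.to_datetime(df["when"], format="%Y-%m-%d")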
@doc(storage_options=_shared_docs["storage_options"]) @@ -473,7 +447,7 @@ def read_excel( | str | Sequence[int] | Sequence[str] - | Callable[[str], bool] + | Callable[[HashableT], bool] | None = None, dtype: DtypeArg | None = None, engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = None, @@ -487,7 +461,6 @@ def read_excel( na_filter: bool = True, verbose: bool = False, parse_dates: list | dict | bool = False, - date_parser: Callable | lib.NoDefault = lib.no_default, date_format: dict[Hashable, str] | str | None = None, thousands: str | None = None, decimal: str = ".", @@ -534,7 +507,6 @@ def read_excel( na_filter=na_filter, verbose=verbose, parse_dates=parse_dates, - date_parser=date_parser, date_format=date_format, thousands=thousands, decimal=decimal, @@ -564,10 +536,6 @@ def __init__( if engine_kwargs is None: engine_kwargs = {} - # First argument can also be bytes, so create a buffer - if isinstance(filepath_or_buffer, bytes): - filepath_or_buffer = BytesIO(filepath_or_buffer) - self.handles = IOHandles( handle=filepath_or_buffer, compression={"method": None} ) @@ -743,7 +711,6 @@ def parse( na_values=None, verbose: bool = False, parse_dates: list | dict | bool = False, - date_parser: Callable | lib.NoDefault = lib.no_default, date_format: dict[Hashable, str] | str | None = None, thousands: str | None = None, decimal: str = ".", @@ -797,143 +764,191 @@ def parse( output[asheetname] = DataFrame() continue - is_list_header = False - is_len_one_list_header = False - if is_list_like(header): - assert isinstance(header, Sequence) - is_list_header = True - if len(header) == 1: - is_len_one_list_header = True - - if is_len_one_list_header: - header = cast(Sequence[int], header)[0] - - # forward fill and pull out names for MultiIndex column - header_names = None - if header is not None and is_list_like(header): - assert isinstance(header, Sequence) - - header_names = [] - control_row = [True] * len(data[0]) - - for row in header: - if is_integer(skiprows): - assert isinstance(skiprows, int) - row += skiprows - - if row > len(data) - 1: - raise ValueError( - f"header index {row} exceeds maximum index " - f"{len(data) - 1} of data.", - ) - - data[row], control_row = fill_mi_header(data[row], control_row) - - if index_col is not None: - header_name, _ = pop_header_name(data[row], index_col) - header_names.append(header_name) - - # If there is a MultiIndex header and an index then there is also - # a row containing just the index name(s) - has_index_names = False - if is_list_header and not is_len_one_list_header and index_col is not None: - index_col_list: Sequence[int] - if isinstance(index_col, int): - index_col_list = [index_col] - else: - assert isinstance(index_col, Sequence) - index_col_list = index_col - - # We have to handle mi without names. If any of the entries in the data - # columns are not empty, this is a regular row - assert isinstance(header, Sequence) - if len(header) < len(data): - potential_index_names = data[len(header)] - potential_data = [ - x - for i, x in enumerate(potential_index_names) - if not control_row[i] and i not in index_col_list - ] - has_index_names = all(x == "" or x is None for x in potential_data) - - if is_list_like(index_col): - # Forward fill values for MultiIndex index. 
- if header is None: - offset = 0 - elif isinstance(header, int): - offset = 1 + header - else: - offset = 1 + max(header) + output = self._parse_sheet( + data=data, + output=output, + asheetname=asheetname, + header=header, + names=names, + index_col=index_col, + usecols=usecols, + dtype=dtype, + skiprows=skiprows, + nrows=nrows, + true_values=true_values, + false_values=false_values, + na_values=na_values, + parse_dates=parse_dates, + date_format=date_format, + thousands=thousands, + decimal=decimal, + comment=comment, + skipfooter=skipfooter, + dtype_backend=dtype_backend, + **kwds, + ) - # GH34673: if MultiIndex names present and not defined in the header, - # offset needs to be incremented so that forward filling starts - # from the first MI value instead of the name - if has_index_names: - offset += 1 + if last_sheetname is None: + raise ValueError("Sheet name is an empty list") - # Check if we have an empty dataset - # before trying to collect data. - if offset < len(data): - assert isinstance(index_col, Sequence) + if ret_dict: + return output + else: + return output[last_sheetname] + + def _parse_sheet( + self, + data: list, + output: dict, + asheetname: str | int | None = None, + header: int | Sequence[int] | None = 0, + names: SequenceNotStr[Hashable] | range | None = None, + index_col: int | Sequence[int] | None = None, + usecols=None, + dtype: DtypeArg | None = None, + skiprows: Sequence[int] | int | Callable[[int], object] | None = None, + nrows: int | None = None, + true_values: Iterable[Hashable] | None = None, + false_values: Iterable[Hashable] | None = None, + na_values=None, + parse_dates: list | dict | bool = False, + date_format: dict[Hashable, str] | str | None = None, + thousands: str | None = None, + decimal: str = ".", + comment: str | None = None, + skipfooter: int = 0, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, + **kwds, + ): + is_list_header = False + is_len_one_list_header = False + if is_list_like(header): + assert isinstance(header, Sequence) + is_list_header = True + if len(header) == 1: + is_len_one_list_header = True + + if is_len_one_list_header: + header = cast(Sequence[int], header)[0] + + # forward fill and pull out names for MultiIndex column + header_names = None + if header is not None and is_list_like(header): + assert isinstance(header, Sequence) + + header_names = [] + control_row = [True] * len(data[0]) + + for row in header: + if is_integer(skiprows): + assert isinstance(skiprows, int) + row += skiprows + + if row > len(data) - 1: + raise ValueError( + f"header index {row} exceeds maximum index " + f"{len(data) - 1} of data.", + ) - for col in index_col: - last = data[offset][col] + data[row], control_row = fill_mi_header(data[row], control_row) - for row in range(offset + 1, len(data)): - if data[row][col] == "" or data[row][col] is None: - data[row][col] = last - else: - last = data[row][col] + if index_col is not None: + header_name, _ = pop_header_name(data[row], index_col) + header_names.append(header_name) - # GH 12292 : error when read one empty column from excel file - try: - parser = TextParser( - data, - names=names, - header=header, - index_col=index_col, - has_index_names=has_index_names, - dtype=dtype, - true_values=true_values, - false_values=false_values, - skiprows=skiprows, - nrows=nrows, - na_values=na_values, - skip_blank_lines=False, # GH 39808 - parse_dates=parse_dates, - date_parser=date_parser, - date_format=date_format, - thousands=thousands, - decimal=decimal, - comment=comment, - 
skipfooter=skipfooter, - usecols=usecols, - dtype_backend=dtype_backend, - **kwds, + # If there is a MultiIndex header and an index then there is also + # a row containing just the index name(s) + has_index_names = False + if is_list_header and not is_len_one_list_header and index_col is not None: + index_col_set: set[int] + if isinstance(index_col, int): + index_col_set = {index_col} + else: + assert isinstance(index_col, Sequence) + index_col_set = set(index_col) + + # We have to handle mi without names. If any of the entries in the data + # columns are not empty, this is a regular row + assert isinstance(header, Sequence) + if len(header) < len(data): + potential_index_names = data[len(header)] + has_index_names = all( + x == "" or x is None + for i, x in enumerate(potential_index_names) + if not control_row[i] and i not in index_col_set ) - output[asheetname] = parser.read(nrows=nrows) + if is_list_like(index_col): + # Forward fill values for MultiIndex index. + if header is None: + offset = 0 + elif isinstance(header, int): + offset = 1 + header + else: + offset = 1 + max(header) + + # GH34673: if MultiIndex names present and not defined in the header, + # offset needs to be incremented so that forward filling starts + # from the first MI value instead of the name + if has_index_names: + offset += 1 + + # Check if we have an empty dataset + # before trying to collect data. + if offset < len(data): + assert isinstance(index_col, Sequence) + + for col in index_col: + last = data[offset][col] + + for row in range(offset + 1, len(data)): + if data[row][col] == "" or data[row][col] is None: + data[row][col] = last + else: + last = data[row][col] + + # GH 12292 : error when read one empty column from excel file + try: + parser = TextParser( + data, + names=names, + header=header, + index_col=index_col, + has_index_names=has_index_names, + dtype=dtype, + true_values=true_values, + false_values=false_values, + skiprows=skiprows, + nrows=nrows, + na_values=na_values, + skip_blank_lines=False, # GH 39808 + parse_dates=parse_dates, + date_format=date_format, + thousands=thousands, + decimal=decimal, + comment=comment, + skipfooter=skipfooter, + usecols=usecols, + dtype_backend=dtype_backend, + **kwds, + ) - if header_names: - output[asheetname].columns = output[asheetname].columns.set_names( - header_names - ) + output[asheetname] = parser.read(nrows=nrows) - except EmptyDataError: - # No Data, return an empty DataFrame - output[asheetname] = DataFrame() + if header_names: + output[asheetname].columns = output[asheetname].columns.set_names( + header_names + ) - except Exception as err: - err.args = (f"{err.args[0]} (sheet: {asheetname})", *err.args[1:]) - raise err + except EmptyDataError: + # No Data, return an empty DataFrame + output[asheetname] = DataFrame() - if last_sheetname is None: - raise ValueError("Sheet name is an empty list") + except Exception as err: + err.args = (f"{err.args[0]} (sheet: {asheetname})", *err.args[1:]) + raise err - if ret_dict: - return output - else: - return output[last_sheetname] + return output @doc(storage_options=_shared_docs["storage_options"]) @@ -945,9 +960,9 @@ class ExcelWriter(Generic[_WorkbookT]): * `xlsxwriter `__ for xlsx files if xlsxwriter is installed otherwise `openpyxl `__ - * `odswriter `__ for ods files + * `odf `__ for ods files - See ``DataFrame.to_excel`` for typical usage. + See :meth:`DataFrame.to_excel` for typical usage. The writer should be used as a context manager. 
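A minimal sketch of the recommended context-manager usage (the output file name is hypothetical); on exit the handle is saved and closed automatically:

>>> import pandas as pd
>>> df = pd.DataFrame([[1, 2]], columns=["A", "B"])
>>> with pd.ExcelWriter("out.xlsx") as writer:  # doctest: +SKIP
...     df.to_excel(writer, sheet_name="Sheet1")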
Otherwise, call `close()` to save and close any opened file handles. @@ -992,10 +1007,16 @@ class ExcelWriter(Generic[_WorkbookT]): * xlsxwriter: ``xlsxwriter.Workbook(file, **engine_kwargs)`` * openpyxl (write mode): ``openpyxl.Workbook(**engine_kwargs)`` * openpyxl (append mode): ``openpyxl.load_workbook(file, **engine_kwargs)`` - * odswriter: ``odf.opendocument.OpenDocumentSpreadsheet(**engine_kwargs)`` + * odf: ``odf.opendocument.OpenDocumentSpreadsheet(**engine_kwargs)`` .. versionadded:: 1.3.0 + See Also + -------- + read_excel : Read an Excel sheet values (xlsx) file into DataFrame. + read_csv : Read a comma-separated values (csv) file into DataFrame. + read_fwf : Read a table of fixed-width formatted lines into DataFrame. + Notes ----- For compatibility with CSV writers, ExcelWriter serializes lists @@ -1031,7 +1052,7 @@ class ExcelWriter(Generic[_WorkbookT]): >>> with pd.ExcelWriter( ... "path_to_file.xlsx", ... date_format="YYYY-MM-DD", - ... datetime_format="YYYY-MM-DD HH:MM:SS" + ... datetime_format="YYYY-MM-DD HH:MM:SS", ... ) as writer: ... df.to_excel(writer) # doctest: +SKIP @@ -1043,7 +1064,7 @@ class ExcelWriter(Generic[_WorkbookT]): Here, the `if_sheet_exists` parameter can be set to replace a sheet if it already exists: - >>> with ExcelWriter( + >>> with pd.ExcelWriter( ... "path_to_file.xlsx", ... mode="a", ... engine="openpyxl", @@ -1054,7 +1075,8 @@ class ExcelWriter(Generic[_WorkbookT]): You can also write multiple DataFrames to a single sheet. Note that the ``if_sheet_exists`` parameter needs to be set to ``overlay``: - >>> with ExcelWriter("path_to_file.xlsx", + >>> with pd.ExcelWriter( + ... "path_to_file.xlsx", ... mode="a", ... engine="openpyxl", ... if_sheet_exists="overlay", @@ -1084,7 +1106,7 @@ class ExcelWriter(Generic[_WorkbookT]): >>> with pd.ExcelWriter( ... "path_to_file.xlsx", ... engine="xlsxwriter", - ... engine_kwargs={{"options": {{"nan_inf_to_errors": True}}}} + ... engine_kwargs={{"options": {{"nan_inf_to_errors": True}}}}, ... ) as writer: ... df.to_excel(writer) # doctest: +SKIP @@ -1095,7 +1117,7 @@ class ExcelWriter(Generic[_WorkbookT]): ... "path_to_file.xlsx", ... engine="openpyxl", ... mode="a", - ... engine_kwargs={{"keep_vba": True}} + ... engine_kwargs={{"keep_vba": True}}, ... ) as writer: ... 
df.to_excel(writer, sheet_name="Sheet2") # doctest: +SKIP """ @@ -1145,7 +1167,7 @@ def __new__( ext = "xlsx" try: - engine = config.get_option(f"io.excel.{ext}.writer", silent=True) + engine = config.get_option(f"io.excel.{ext}.writer") if engine == "auto": engine = get_default_engine(ext, mode="writer") except KeyError as err: @@ -1328,6 +1350,8 @@ def _value_with_fmt( val = float(val) elif is_bool(val): val = bool(val) + elif is_decimal(val): + val = Decimal(val) elif isinstance(val, datetime.datetime): fmt = self._datetime_format elif isinstance(val, datetime.date): @@ -1337,7 +1361,16 @@ def _value_with_fmt( fmt = "0" else: val = str(val) - + # GH#56954 + # Excel's limitation on cell contents is 32767 characters + # xref https://support.microsoft.com/en-au/office/excel-specifications-and-limits-1672b34d-7043-467e-8e27-269d656771c3 + if len(val) > 32767: + warnings.warn( + f"Cell contents too long ({len(val)}), " + "truncated to 32767 characters", + UserWarning, + stacklevel=find_stack_level(), + ) return val, fmt @classmethod @@ -1374,7 +1407,7 @@ def close(self) -> None: b"\x09\x00\x04\x00\x07\x00\x10\x00", # BIFF2 b"\x09\x02\x06\x00\x00\x00\x10\x00", # BIFF3 b"\x09\x04\x06\x00\x00\x00\x10\x00", # BIFF4 - b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", # Compound File Binary + b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1", # Compound File Binary ) ZIP_SIGNATURE = b"PK\x03\x04" PEEK_SIZE = max(map(len, XLS_SIGNATURES + (ZIP_SIGNATURE,))) @@ -1408,9 +1441,6 @@ def inspect_excel_format( BadZipFile If resulting stream does not have an XLS signature and is not a valid zipfile. """ - if isinstance(content_or_path, bytes): - content_or_path = BytesIO(content_or_path) - with get_handle( content_or_path, "rb", storage_options=storage_options, is_text=False ) as handle: @@ -1431,9 +1461,9 @@ def inspect_excel_format( with zipfile.ZipFile(stream) as zf: # Workaround for some third party files that use forward slashes and # lower case names. - component_names = [ + component_names = { name.replace("\\", "/").lower() for name in zf.namelist() - ] + } if "xl/workbook.xml" in component_names: return "xlsx" @@ -1444,6 +1474,7 @@ def inspect_excel_format( return "zip" +@doc(storage_options=_shared_docs["storage_options"]) class ExcelFile: """ Class for parsing tabular Excel sheets into DataFrame objects. @@ -1452,7 +1483,7 @@ class ExcelFile: Parameters ---------- - path_or_buffer : str, bytes, path object (pathlib.Path or py._path.local.LocalPath), + path_or_buffer : str, bytes, pathlib.Path, A file-like object, xlrd workbook or openpyxl workbook. If a string or path object, expected to be a path to a .xls, .xlsx, .xlsb, .xlsm, .odf, .ods, or .odt file. @@ -1482,22 +1513,30 @@ class ExcelFile: - Otherwise if ``path_or_buffer`` is in xlsb format, `pyxlsb `_ will be used. - .. versionadded:: 1.3.0 + .. versionadded:: 1.3.0 - Otherwise if `openpyxl `_ is installed, then ``openpyxl`` will be used. - Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised. - .. warning:: + .. warning:: - Please do not report issues when using ``xlrd`` to read ``.xlsx`` files. - This is not supported, switch to using ``openpyxl`` instead. + Please do not report issues when using ``xlrd`` to read ``.xlsx`` files. + This is not supported, switch to using ``openpyxl`` instead. + {storage_options} engine_kwargs : dict, optional Arbitrary keyword arguments passed to excel engine. + See Also + -------- + DataFrame.to_excel : Write DataFrame to an Excel file. 
+ DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file. + read_csv : Read a comma-separated values (csv) file into DataFrame. + read_fwf : Read a table of fixed-width formatted lines into DataFrame. + Examples -------- - >>> file = pd.ExcelFile('myfile.xlsx') # doctest: +SKIP + >>> file = pd.ExcelFile("myfile.xlsx") # doctest: +SKIP >>> with pd.ExcelFile("myfile.xls") as xls: # doctest: +SKIP ... df1 = pd.read_excel(xls, "Sheet1") # doctest: +SKIP """ @@ -1529,36 +1568,28 @@ def __init__( if engine is not None and engine not in self._engines: raise ValueError(f"Unknown engine: {engine}") - # First argument can also be bytes, so create a buffer - if isinstance(path_or_buffer, bytes): - path_or_buffer = BytesIO(path_or_buffer) - warnings.warn( - "Passing bytes to 'read_excel' is deprecated and " - "will be removed in a future version. To read from a " - "byte string, wrap it in a `BytesIO` object.", - FutureWarning, - stacklevel=find_stack_level(), - ) - - # Could be a str, ExcelFile, Book, etc. - self.io = path_or_buffer # Always a string self._io = stringify_path(path_or_buffer) - # Determine xlrd version if installed - if import_optional_dependency("xlrd", errors="ignore") is None: - xlrd_version = None - else: - import xlrd - - xlrd_version = Version(get_version(xlrd)) - if engine is None: # Only determine ext if it is needed - ext: str | None - if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book): - ext = "xls" - else: + ext: str | None = None + + if not isinstance( + path_or_buffer, (str, os.PathLike, ExcelFile) + ) and not is_file_like(path_or_buffer): + # GH#56692 - avoid importing xlrd if possible + if import_optional_dependency("xlrd", errors="ignore") is None: + xlrd_version = None + else: + import xlrd + + xlrd_version = Version(get_version(xlrd)) + + if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book): + ext = "xls" + + if ext is None: ext = inspect_excel_format( content_or_path=path_or_buffer, storage_options=storage_options ) @@ -1568,7 +1599,7 @@ def __init__( "an engine manually." ) - engine = config.get_option(f"io.excel.{ext}.reader", silent=True) + engine = config.get_option(f"io.excel.{ext}.reader") if engine == "auto": engine = get_default_engine(ext, mode="reader") @@ -1599,7 +1630,6 @@ def parse( nrows: int | None = None, na_values=None, parse_dates: list | dict | bool = False, - date_parser: Callable | lib.NoDefault = lib.no_default, date_format: str | dict[Hashable, str] | None = None, thousands: str | None = None, comment: str | None = None, @@ -1613,16 +1643,127 @@ def parse( Equivalent to read_excel(ExcelFile, ...) See the read_excel docstring for more info on accepted parameters. + Parameters + ---------- + sheet_name : str, int, list, or None, default 0 + Strings are used for sheet names. Integers are used in zero-indexed + sheet positions (chart sheets do not count as a sheet position). + Lists of strings/integers are used to request multiple sheets. + When ``None``, will return a dictionary containing DataFrames for + each sheet. + header : int, list of int, default 0 + Row (0-indexed) to use for the column labels of the parsed + DataFrame. If a list of integers is passed those row positions will + be combined into a ``MultiIndex``. Use None if there is no header. + names : array-like, default None + List of column names to use. If file contains no header row, + then you should explicitly pass header=None. 
+ index_col : int, str, list of int, default None + Column (0-indexed) to use as the row labels of the DataFrame. + Pass None if there is no such column. If a list is passed, + those columns will be combined into a ``MultiIndex``. If a + subset of data is selected with ``usecols``, index_col + is based on the subset. + + Missing values will be forward filled to allow roundtripping with + ``to_excel`` for ``merged_cells=True``. To avoid forward filling the + missing values use ``set_index`` after reading the data instead of + ``index_col``. + usecols : str, list-like, or callable, default None + * If None, then parse all columns. + * If str, then indicates comma separated list of Excel column letters + and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of + both sides. + * If list of int, then indicates list of column numbers to be parsed + (0-indexed). + * If list of string, then indicates list of column names to be parsed. + * If callable, then evaluate each column name against it and parse the + column if the callable returns ``True``. + + Returns a subset of the columns according to behavior above. + converters : dict, default None + Dict of functions for converting values in certain columns. Keys can + either be integers or column labels, values are functions that take one + input argument, the Excel cell content, and return the transformed + content. + true_values : list, default None + Values to consider as True. + false_values : list, default None + Values to consider as False. + skiprows : list-like, int, or callable, optional + Line numbers to skip (0-indexed) or number of lines to skip (int) at the + start of the file. If callable, the callable function will be evaluated + against the row indices, returning True if the row should be skipped and + False otherwise. An example of a valid callable argument would be ``lambda + x: x in [0, 2]``. + nrows : int, default None + Number of rows to parse. + na_values : scalar, str, list-like, or dict, default None + Additional strings to recognize as NA/NaN. If dict passed, specific + per-column NA values. + parse_dates : bool, list-like, or dict, default False + The behavior is as follows: + + * ``bool``. If True -> try parsing the index. + * ``list`` of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 + each as a separate date column. + * ``list`` of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and + parse as a single date column. + * ``dict``, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call + result 'foo' + + If a column or index contains an unparsable date, the entire column or + index will be returned unaltered as an object data type. If you + don't want to parse some cells as dates, just change their type + in Excel to "Text". For non-standard datetime parsing, use + ``pd.to_datetime`` after ``pd.read_excel``. + + Note: A fast-path exists for iso8601-formatted dates. + date_format : str or dict of column -> format, default ``None`` + If used in conjunction with ``parse_dates``, will parse dates + according to this format. For anything more complex, + please read in as ``object`` and then apply :func:`to_datetime` as-needed. + thousands : str, default None + Thousands separator for parsing string columns to numeric. Note that + this parameter is only necessary for columns stored as TEXT in Excel, + any numeric columns will automatically be parsed, regardless of display + format. + comment : str, default None + Comments out remainder of line.
Pass a character or characters to this + argument to indicate comments in the input file. Any data between the + comment string and the end of the current line is ignored. + skipfooter : int, default 0 + Rows at the end to skip (0-indexed). + dtype_backend : {{'numpy_nullable', 'pyarrow'}} + Back-end data type applied to the resultant :class:`DataFrame` + (still experimental). If not specified, the default behavior + is to not use nullable data types. If specified, the behavior + is as follows: + + * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` + * ``"pyarrow"``: returns pyarrow-backed nullable + :class:`ArrowDtype` :class:`DataFrame` + + .. versionadded:: 2.0 + **kwds : dict, optional + Arbitrary keyword arguments passed to excel engine. + Returns ------- DataFrame or dict of DataFrames DataFrame from the passed in Excel file. + See Also + -------- + read_excel : Read an Excel sheet values (xlsx) file into DataFrame. + read_csv : Read a comma-separated values (csv) file into DataFrame. + read_fwf : Read a table of fixed-width formatted lines into DataFrame. + Examples -------- - >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C']) - >>> df.to_excel('myfile.xlsx') # doctest: +SKIP - >>> file = pd.ExcelFile('myfile.xlsx') # doctest: +SKIP + >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"]) + >>> df.to_excel("myfile.xlsx") # doctest: +SKIP + >>> file = pd.ExcelFile("myfile.xlsx") # doctest: +SKIP >>> file.parse() # doctest: +SKIP """ return self._reader.parse( @@ -1638,7 +1779,6 @@ def parse( nrows=nrows, na_values=na_values, parse_dates=parse_dates, - date_parser=date_parser, date_format=date_format, thousands=thousands, comment=comment, @@ -1649,10 +1789,59 @@ def parse( @property def book(self): + """ + Gets the Excel workbook. + + Workbook is the top-level container for all document information. + + Returns + ------- + Excel Workbook + The workbook object of the type defined by the engine being used. + + See Also + -------- + read_excel : Read an Excel file into a pandas DataFrame. + + Examples + -------- + >>> file = pd.ExcelFile("myfile.xlsx") # doctest: +SKIP + >>> file.book # doctest: +SKIP + + >>> file.book.path # doctest: +SKIP + '/xl/workbook.xml' + >>> file.book.active # doctest: +SKIP + + >>> file.book.sheetnames # doctest: +SKIP + ['Sheet1', 'Sheet2'] + """ return self._reader.book @property def sheet_names(self): + """ + Names of the sheets in the document. + + This is particularly useful for loading a specific sheet into a DataFrame when + you do not know the sheet names beforehand. + + Returns + ------- + list of str + List of sheet names in the document. + + See Also + -------- + ExcelFile.parse : Parse a sheet into a DataFrame. + read_excel : Read an Excel file into a pandas DataFrame. If you know the sheet + names, it may be easier to specify them directly to read_excel. 
+ + Examples + -------- + >>> file = pd.ExcelFile("myfile.xlsx") # doctest: +SKIP + >>> file.sheet_names # doctest: +SKIP + ["Sheet1", "Sheet2"] + """ return self._reader.sheet_names def close(self) -> None: diff --git a/pandas/io/excel/_calamine.py b/pandas/io/excel/_calamine.py index 4f65acf1aa40e..b8994e679d4b1 100644 --- a/pandas/io/excel/_calamine.py +++ b/pandas/io/excel/_calamine.py @@ -75,7 +75,8 @@ def load_workbook( from python_calamine import load_workbook return load_workbook( - filepath_or_buffer, **engine_kwargs # type: ignore[arg-type] + filepath_or_buffer, + **engine_kwargs, ) @property diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 69b514da32857..f79417d11080d 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -122,29 +122,25 @@ def get_sheet_data( table: list[list[Scalar | NaTType]] = [] for sheet_row in sheet_rows: - sheet_cells = [ - x - for x in sheet_row.childNodes - if hasattr(x, "qname") and x.qname in cell_names - ] empty_cells = 0 table_row: list[Scalar | NaTType] = [] - for sheet_cell in sheet_cells: - if sheet_cell.qname == table_cell_name: - value = self._get_cell_value(sheet_cell) - else: - value = self.empty_value - - column_repeat = self._get_column_repeat(sheet_cell) - - # Queue up empty values, writing only if content succeeds them - if value == self.empty_value: - empty_cells += column_repeat - else: - table_row.extend([self.empty_value] * empty_cells) - empty_cells = 0 - table_row.extend([value] * column_repeat) + for sheet_cell in sheet_row.childNodes: + if hasattr(sheet_cell, "qname") and sheet_cell.qname in cell_names: + if sheet_cell.qname == table_cell_name: + value = self._get_cell_value(sheet_cell) + else: + value = self.empty_value + + column_repeat = self._get_column_repeat(sheet_cell) + + # Queue up empty values, writing only if content succeeds them + if value == self.empty_value: + empty_cells += column_repeat + else: + table_row.extend([self.empty_value] * empty_cells) + empty_cells = 0 + table_row.extend([value] * column_repeat) if max_row_len < len(table_row): max_row_len = len(table_row) diff --git a/pandas/io/excel/_odswriter.py b/pandas/io/excel/_odswriter.py index bc7dca2d95b6b..ba4919c9298ed 100644 --- a/pandas/io/excel/_odswriter.py +++ b/pandas/io/excel/_odswriter.py @@ -18,6 +18,8 @@ ) if TYPE_CHECKING: + from odf.opendocument import OpenDocumentSpreadsheet + from pandas._typing import ( ExcelWriterIfSheetExists, FilePath, @@ -32,17 +34,17 @@ class ODSWriter(ExcelWriter): _engine = "odf" _supported_extensions = (".ods",) - def __init__( + def __init__( # pyright: ignore[reportInconsistentConstructor] self, path: FilePath | WriteExcelBuffer | ExcelWriter, engine: str | None = None, date_format: str | None = None, - datetime_format=None, + datetime_format: str | None = None, mode: str = "w", storage_options: StorageOptions | None = None, if_sheet_exists: ExcelWriterIfSheetExists | None = None, engine_kwargs: dict[str, Any] | None = None, - **kwargs, + **kwargs: Any, ) -> None: from odf.opendocument import OpenDocumentSpreadsheet @@ -63,7 +65,7 @@ def __init__( self._style_dict: dict[str, str] = {} @property - def book(self): + def book(self) -> OpenDocumentSpreadsheet: """ Book instance of class odf.opendocument.OpenDocumentSpreadsheet. 
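As a hedged illustration of the now explicitly typed ``book`` attribute above, reached through the public writer with the optional ``odfpy`` engine (output file name hypothetical):

>>> import pandas as pd
>>> df = pd.DataFrame([[1, 2]], columns=["A", "B"])
>>> with pd.ExcelWriter("out.ods", engine="odf") as writer:  # doctest: +SKIP
...     df.to_excel(writer)
...     book = writer.book  # an odf.opendocument.OpenDocumentSpreadsheet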
@@ -149,7 +151,7 @@ def _write_cells( for row_nr in range(max(rows.keys()) + 1): wks.addElement(rows[row_nr]) - def _make_table_cell_attributes(self, cell) -> dict[str, int | str]: + def _make_table_cell_attributes(self, cell: ExcelCell) -> dict[str, int | str]: """Convert cell attributes to OpenDocument attributes Parameters @@ -171,7 +173,7 @@ def _make_table_cell_attributes(self, cell) -> dict[str, int | str]: attributes["numbercolumnsspanned"] = cell.mergeend return attributes - def _make_table_cell(self, cell) -> tuple[object, Any]: + def _make_table_cell(self, cell: ExcelCell) -> tuple[object, Any]: """Convert cell data to an OpenDocument spreadsheet cell Parameters @@ -238,12 +240,10 @@ def _make_table_cell(self, cell) -> tuple[object, Any]: ) @overload - def _process_style(self, style: dict[str, Any]) -> str: - ... + def _process_style(self, style: dict[str, Any]) -> str: ... @overload - def _process_style(self, style: None) -> None: - ... + def _process_style(self, style: None) -> None: ... def _process_style(self, style: dict[str, Any] | None) -> str | None: """Convert a style dictionary to a OpenDocument style sheet @@ -270,7 +270,7 @@ def _process_style(self, style: dict[str, Any] | None) -> str | None: style_key = json.dumps(style) if style_key in self._style_dict: return self._style_dict[style_key] - name = f"pd{len(self._style_dict)+1}" + name = f"pd{len(self._style_dict) + 1}" self._style_dict[style_key] = name odf_style = Style(name=name, family="table-cell") if "font" in style: diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index c546443868a62..3055c68a93cbc 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -26,6 +26,7 @@ if TYPE_CHECKING: from openpyxl import Workbook from openpyxl.descriptors.serialisable import Serialisable + from openpyxl.styles import Fill from pandas._typing import ( ExcelWriterIfSheetExists, @@ -41,7 +42,7 @@ class OpenpyxlWriter(ExcelWriter): _engine = "openpyxl" _supported_extensions = (".xlsx", ".xlsm") - def __init__( + def __init__( # pyright: ignore[reportInconsistentConstructor] self, path: FilePath | WriteExcelBuffer | ExcelWriter, engine: str | None = None, @@ -244,7 +245,7 @@ def _convert_to_stop(cls, stop_seq): return map(cls._convert_to_color, stop_seq) @classmethod - def _convert_to_fill(cls, fill_dict: dict[str, Any]): + def _convert_to_fill(cls, fill_dict: dict[str, Any]) -> Fill: """ Convert ``fill_dict`` to an openpyxl v2 Fill object. diff --git a/pandas/io/excel/_util.py b/pandas/io/excel/_util.py index f7a1fcb8052e3..e7c5d518abaee 100644 --- a/pandas/io/excel/_util.py +++ b/pandas/io/excel/_util.py @@ -1,6 +1,7 @@ from __future__ import annotations from collections.abc import ( + Callable, Hashable, Iterable, MutableMapping, @@ -9,7 +10,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, TypeVar, overload, @@ -143,9 +143,9 @@ def _range2cols(areas: str) -> list[int]: Examples -------- - >>> _range2cols('A:E') + >>> _range2cols("A:E") [0, 1, 2, 3, 4] - >>> _range2cols('A,C,Z:AB') + >>> _range2cols("A,C,Z:AB") [0, 2, 25, 26, 27] """ cols: list[int] = [] @@ -161,23 +161,19 @@ def _range2cols(areas: str) -> list[int]: @overload -def maybe_convert_usecols(usecols: str | list[int]) -> list[int]: - ... +def maybe_convert_usecols(usecols: str | list[int]) -> list[int]: ... @overload -def maybe_convert_usecols(usecols: list[str]) -> list[str]: - ... +def maybe_convert_usecols(usecols: list[str]) -> list[str]: ... 
@overload -def maybe_convert_usecols(usecols: usecols_func) -> usecols_func: - ... +def maybe_convert_usecols(usecols: usecols_func) -> usecols_func: ... @overload -def maybe_convert_usecols(usecols: None) -> None: - ... +def maybe_convert_usecols(usecols: None) -> None: ... def maybe_convert_usecols( @@ -212,13 +208,11 @@ def maybe_convert_usecols( @overload -def validate_freeze_panes(freeze_panes: tuple[int, int]) -> Literal[True]: - ... +def validate_freeze_panes(freeze_panes: tuple[int, int]) -> Literal[True]: ... @overload -def validate_freeze_panes(freeze_panes: None) -> Literal[False]: - ... +def validate_freeze_panes(freeze_panes: None) -> Literal[False]: ... def validate_freeze_panes(freeze_panes: tuple[int, int] | None) -> bool: diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index a444970792e6e..5d39a840336eb 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -128,16 +128,13 @@ def _parse_cell(cell_contents, cell_typ): cell_contents = val return cell_contents - data = [] - nrows = sheet.nrows if file_rows_needed is not None: nrows = min(nrows, file_rows_needed) - for i in range(nrows): - row = [ + return [ + [ _parse_cell(value, typ) for value, typ in zip(sheet.row_values(i), sheet.row_types(i)) ] - data.append(row) - - return data + for i in range(nrows) + ] diff --git a/pandas/io/excel/_xlsxwriter.py b/pandas/io/excel/_xlsxwriter.py index 6eacac8c064fb..4a7b8eee2bfce 100644 --- a/pandas/io/excel/_xlsxwriter.py +++ b/pandas/io/excel/_xlsxwriter.py @@ -93,7 +93,7 @@ class _XlsxStyler: } @classmethod - def convert(cls, style_dict, num_format_str=None): + def convert(cls, style_dict, num_format_str=None) -> dict[str, Any]: """ converts a style_dict to an xlsxwriter format dict @@ -181,7 +181,7 @@ class XlsxWriter(ExcelWriter): _engine = "xlsxwriter" _supported_extensions = (".xlsx",) - def __init__( + def __init__( # pyright: ignore[reportInconsistentConstructor] self, path: FilePath | WriteExcelBuffer | ExcelWriter, engine: str | None = None, diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index d0aaf83b84cb2..565c53f0f3fc5 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -1,23 +1,24 @@ -""" feather-format compat """ +"""feather-format compat""" + from __future__ import annotations from typing import ( TYPE_CHECKING, Any, ) +import warnings -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.util._decorators import doc from pandas.util._validators import check_dtype_backend -import pandas as pd from pandas.core.api import DataFrame from pandas.core.shared_docs import _shared_docs -from pandas.io._util import arrow_string_types_mapper +from pandas.io._util import arrow_table_to_pandas from pandas.io.common import get_handle if TYPE_CHECKING: @@ -77,6 +78,14 @@ def read_feather( """ Load a feather-format object from the file path. + Feather is particularly useful for scenarios that require efficient + serialization and deserialization of tabular data. It supports + schema preservation, making it a reliable choice for use cases + such as sharing data between Python and R, or persisting intermediate + results during data processing pipelines. This method provides additional + flexibility with options for selective column reading, thread parallelism, + and choosing the backend for data types. 
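To ground the new paragraph above, a minimal usage sketch exercising the options it names (column selection, threading, dtype backend); the file name is hypothetical and ``pyarrow`` is assumed installed:

>>> import pandas as pd
>>> df = pd.read_feather(
...     "data.feather",
...     columns=["a", "b"],       # read only these columns
...     use_threads=True,         # parallel deserialization
...     dtype_backend="pyarrow",  # ArrowDtype-backed result
... )  # doctest: +SKIP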
+ Parameters ---------- path : str, path object, or file-like object @@ -90,20 +99,30 @@ def read_feather( Whether to parallelize reading using multiple threads. {storage_options} - dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable' + dtype_backend : {{'numpy_nullable', 'pyarrow'}} Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). Behaviour is as follows: + (still experimental). If not specified, the default behavior + is to not use nullable data types. If specified, the behavior + is as follows: - * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. + * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`. + * ``"pyarrow"``: returns pyarrow-backed nullable + :class:`ArrowDtype` :class:`DataFrame` .. versionadded:: 2.0 Returns ------- type of object stored in file + DataFrame object stored in the file. + + See Also + -------- + read_csv : Read a comma-separated values (csv) file into a pandas DataFrame. + read_excel : Read an Excel file into a pandas DataFrame. + read_spss : Read an SPSS file into a pandas DataFrame. + read_orc : Load an ORC object into a pandas DataFrame. + read_sas : Read SAS file into a pandas DataFrame. Examples -------- @@ -120,24 +139,19 @@ def read_feather( with get_handle( path, "rb", storage_options=storage_options, is_text=False ) as handles: - if dtype_backend is lib.no_default and not using_pyarrow_string_dtype(): - return feather.read_feather( - handles.handle, columns=columns, use_threads=bool(use_threads) - ) + if dtype_backend is lib.no_default and not using_string_dtype(): + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "make_block is deprecated", + DeprecationWarning, + ) + + return feather.read_feather( + handles.handle, columns=columns, use_threads=bool(use_threads) + ) pa_table = feather.read_table( handles.handle, columns=columns, use_threads=bool(use_threads) ) - - if dtype_backend == "numpy_nullable": - from pandas.io._util import _arrow_dtype_mapping - - return pa_table.to_pandas(types_mapper=_arrow_dtype_mapping().get) - - elif dtype_backend == "pyarrow": - return pa_table.to_pandas(types_mapper=pd.ArrowDtype) - - elif using_pyarrow_string_dtype(): - return pa_table.to_pandas(types_mapper=arrow_string_types_mapper()) - else: - raise NotImplementedError + return arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend) diff --git a/pandas/io/formats/__init__.py b/pandas/io/formats/__init__.py index 5e56b1bc7ba43..895669c342f97 100644 --- a/pandas/io/formats/__init__.py +++ b/pandas/io/formats/__init__.py @@ -1,4 +1,4 @@ -# ruff: noqa: TCH004 +# ruff: noqa: TC004 from typing import TYPE_CHECKING if TYPE_CHECKING: diff --git a/pandas/io/formats/console.py b/pandas/io/formats/console.py index 2a6cbe0762903..d76593b41a996 100644 --- a/pandas/io/formats/console.py +++ b/pandas/io/formats/console.py @@ -1,6 +1,7 @@ """ Internal module for console introspection """ + from __future__ import annotations from shutil import get_terminal_size @@ -62,7 +63,7 @@ def in_interactive_session() -> bool: """ from pandas import get_option - def check_main(): + def check_main() -> bool: try: import __main__ as main except ModuleNotFoundError: diff --git a/pandas/io/formats/css.py b/pandas/io/formats/css.py index ccce60c00a9e0..10c970887e03b 100644 --- a/pandas/io/formats/css.py +++ b/pandas/io/formats/css.py @@ -1,13 +1,11 @@ """ Utilities for 
interpreting CSS from Stylers for formatting non-HTML outputs. """ + from __future__ import annotations import re -from typing import ( - TYPE_CHECKING, - Callable, -) +from typing import TYPE_CHECKING import warnings from pandas.errors import CSSWarning @@ -15,6 +13,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Generator, Iterable, Iterator, @@ -35,7 +34,7 @@ def _side_expander(prop_fmt: str) -> Callable: function: Return to call when a 'border(-{side}): {value}' string is encountered """ - def expand(self, prop, value: str) -> Generator[tuple[str, str], None, None]: + def expand(self: CSSResolver, prop: str, value: str) -> Generator[tuple[str, str]]: """ Expand shorthand property into side-specific property (top, right, bottom, left) @@ -80,7 +79,7 @@ def _border_expander(side: str = "") -> Callable: if side != "": side = f"-{side}" - def expand(self, prop, value: str) -> Generator[tuple[str, str], None, None]: + def expand(self: CSSResolver, prop: str, value: str) -> Generator[tuple[str, str]]: """ Expand border into color, style, and width tuples @@ -244,14 +243,17 @@ def __call__( Examples -------- >>> resolve = CSSResolver() - >>> inherited = {'font-family': 'serif', 'font-weight': 'bold'} - >>> out = resolve(''' + >>> inherited = {"font-family": "serif", "font-weight": "bold"} + >>> out = resolve( + ... ''' ... border-color: BLUE RED; ... font-size: 1em; ... font-size: 2em; ... font-weight: normal; ... font-weight: inherit; - ... ''', inherited) + ... ''', + ... inherited, + ... ) >>> sorted(out.items()) # doctest: +NORMALIZE_WHITESPACE [('border-bottom-color', 'blue'), ('border-left-color', 'red'), @@ -339,10 +341,12 @@ def _update_other_units(self, props: dict[str, str]) -> dict[str, str]: ) return props - def size_to_pt(self, in_val, em_pt=None, conversions=UNIT_RATIOS) -> str: - def _error(): + def size_to_pt( + self, in_val: str, em_pt: float | None = None, conversions: dict = UNIT_RATIOS + ) -> str: + def _error() -> str: warnings.warn( - f"Unhandled size: {repr(in_val)}", + f"Unhandled size: {in_val!r}", CSSWarning, stacklevel=find_stack_level(), ) @@ -384,7 +388,7 @@ def _error(): size_fmt = f"{val:f}pt" return size_fmt - def atomize(self, declarations: Iterable) -> Generator[tuple[str, str], None, None]: + def atomize(self, declarations: Iterable) -> Generator[tuple[str, str]]: for prop, value in declarations: prop = prop.lower() value = value.lower() @@ -415,7 +419,7 @@ def parse(self, declarations_str: str) -> Iterator[tuple[str, str]]: yield prop, val else: warnings.warn( - f"Ill-formatted attribute: expected a colon in {repr(decl)}", + f"Ill-formatted attribute: expected a colon in {decl!r}", CSSWarning, stacklevel=find_stack_level(), ) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 50503e862ef43..75bcb51ef4be2 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -320,7 +320,11 @@ def _save_chunk(self, start_i: int, end_i: int) -> None: res = df._get_values_for_csv(**self._number_format) data = list(res._iter_column_arrays()) - ix = self.data_index[slicer]._get_values_for_csv(**self._number_format) + ix = ( + self.data_index[slicer]._get_values_for_csv(**self._number_format) + if self.nlevels != 0 + else np.empty(end_i - start_i) + ) libwriters.write_csv_rows( data, ix, diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index 5fd23cd7d918a..5fde6577e9f95 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -1,9 +1,11 @@ """ Utilities for conversion to 
writer-agnostic Excel representation. """ + from __future__ import annotations from collections.abc import ( + Callable, Hashable, Iterable, Mapping, @@ -15,7 +17,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, cast, ) import warnings @@ -36,6 +37,7 @@ DataFrame, Index, MultiIndex, + Period, PeriodIndex, ) import pandas.core.common as com @@ -47,10 +49,10 @@ CSSWarning, ) from pandas.io.formats.format import get_level_lengths -from pandas.io.formats.printing import pprint_thing if TYPE_CHECKING: from pandas._typing import ( + ExcelWriterMergeCells, FilePath, IndexLabel, StorageOptions, @@ -284,7 +286,9 @@ def build_border( for side in ["top", "right", "bottom", "left"] } - def _border_style(self, style: str | None, width: str | None, color: str | None): + def _border_style( + self, style: str | None, width: str | None, color: str | None + ) -> str | None: # convert styles and widths to openxml, one of: # 'dashDot' # 'dashDotDot' @@ -333,7 +337,7 @@ def _border_style(self, style: str | None, width: str | None, color: str | None) return self.BORDER_STYLE_MAP[style] else: warnings.warn( - f"Unhandled border style format: {repr(style)}", + f"Unhandled border style format: {style!r}", CSSWarning, stacklevel=find_stack_level(), ) @@ -469,7 +473,7 @@ def color_to_excel(self, val: str | None) -> str | None: return self.NAMED_COLORS[val] except KeyError: warnings.warn( - f"Unhandled color format: {repr(val)}", + f"Unhandled color format: {val!r}", CSSWarning, stacklevel=find_stack_level(), ) @@ -520,8 +524,11 @@ class ExcelFormatter: Column label for index column(s) if desired. If None is given, and `header` and `index` are True, then the index names are used. A sequence should be given if the DataFrame uses MultiIndex. - merge_cells : bool, default False - Format MultiIndex and Hierarchical Rows as merged cells. + merge_cells : bool or 'columns', default False + Format MultiIndex column headers and Hierarchical Rows as merged cells + if True. Merge MultiIndex column headers only if 'columns'. + .. versionchanged:: 3.0.0 + Added the 'columns' option. inf_rep : str, default `'inf'` representation for np.inf values (which aren't representable in Excel) A `'-'` sign will be added in front of -inf. 
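The new ``'columns'`` value documented above reaches users through ``DataFrame.to_excel``, which forwards ``merge_cells`` to this formatter. A minimal sketch, assuming pandas 3.0 per the versionchanged note (output file name hypothetical):

>>> import pandas as pd
>>> cols = pd.MultiIndex.from_product([["x"], ["a", "b"]])
>>> df = pd.DataFrame([[1, 2]], columns=cols)
>>> # merge only the MultiIndex column headers, leaving row labels unmerged
>>> df.to_excel("out.xlsx", merge_cells="columns")  # doctest: +SKIP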
@@ -544,7 +551,7 @@ def __init__( header: Sequence[Hashable] | bool = True, index: bool = True, index_label: IndexLabel | None = None, - merge_cells: bool = False, + merge_cells: ExcelWriterMergeCells = False, inf_rep: str = "inf", style_converter: Callable | None = None, ) -> None: @@ -577,22 +584,12 @@ def __init__( self.index = index self.index_label = index_label self.header = header + + if not isinstance(merge_cells, bool) and merge_cells != "columns": + raise ValueError(f"Unexpected value for {merge_cells=}.") self.merge_cells = merge_cells self.inf_rep = inf_rep - @property - def header_style(self) -> dict[str, dict[str, str | bool]]: - return { - "font": {"bold": True}, - "borders": { - "top": "thin", - "right": "thin", - "bottom": "thin", - "left": "thin", - }, - "alignment": {"horizontal": "center", "vertical": "top"}, - } - def _format_value(self, val): if is_scalar(val) and missing.isna(val): val = self.na_rep @@ -623,61 +620,43 @@ def _format_header_mi(self) -> Iterable[ExcelCell]: return columns = self.columns - level_strs = columns._format_multi( - sparsify=self.merge_cells, include_names=False - ) + merge_columns = self.merge_cells in {True, "columns"} + level_strs = columns._format_multi(sparsify=merge_columns, include_names=False) level_lengths = get_level_lengths(level_strs) coloffset = 0 lnum = 0 if self.index and isinstance(self.df.index, MultiIndex): - coloffset = len(self.df.index[0]) - 1 - - if self.merge_cells: - # Format multi-index as a merged cells. - for lnum, name in enumerate(columns.names): - yield ExcelCell( - row=lnum, - col=coloffset, - val=name, - style=self.header_style, - ) + coloffset = self.df.index.nlevels - 1 + + for lnum, name in enumerate(columns.names): + yield ExcelCell( + row=lnum, + col=coloffset, + val=name, + style=None, + ) - for lnum, (spans, levels, level_codes) in enumerate( - zip(level_lengths, columns.levels, columns.codes) - ): - values = levels.take(level_codes) - for i, span_val in spans.items(): - mergestart, mergeend = None, None - if span_val > 1: - mergestart, mergeend = lnum, coloffset + i + span_val - yield CssExcelCell( - row=lnum, - col=coloffset + i + 1, - val=values[i], - style=self.header_style, - css_styles=getattr(self.styler, "ctx_columns", None), - css_row=lnum, - css_col=i, - css_converter=self.style_converter, - mergestart=mergestart, - mergeend=mergeend, - ) - else: - # Format in legacy format with dots to indicate levels. 
- for i, values in enumerate(zip(*level_strs)): - v = ".".join(map(pprint_thing, values)) + for lnum, (spans, levels, level_codes) in enumerate( + zip(level_lengths, columns.levels, columns.codes) + ): + values = levels.take(level_codes) + for i, span_val in spans.items(): + mergestart, mergeend = None, None + if merge_columns and span_val > 1: + mergestart, mergeend = lnum, coloffset + i + span_val yield CssExcelCell( row=lnum, col=coloffset + i + 1, - val=v, - style=self.header_style, + val=values[i], + style=None, css_styles=getattr(self.styler, "ctx_columns", None), css_row=lnum, css_col=i, css_converter=self.style_converter, + mergestart=mergestart, + mergeend=mergeend, ) - self.rowcounter = lnum def _format_header_regular(self) -> Iterable[ExcelCell]: @@ -704,7 +683,7 @@ def _format_header_regular(self) -> Iterable[ExcelCell]: row=self.rowcounter, col=colindex + coloffset, val=colname, - style=self.header_style, + style=None, css_styles=getattr(self.styler, "ctx_columns", None), css_row=0, css_col=colindex, @@ -727,7 +706,7 @@ def _format_header(self) -> Iterable[ExcelCell]: ] * len(self.columns) if functools.reduce(lambda x, y: x and y, (x != "" for x in row)): gen2 = ( - ExcelCell(self.rowcounter, colindex, val, self.header_style) + ExcelCell(self.rowcounter, colindex, val, None) for colindex, val in enumerate(row) ) self.rowcounter += 1 @@ -761,7 +740,7 @@ def _format_regular_rows(self) -> Iterable[ExcelCell]: self.rowcounter += 1 if index_label and self.header is not False: - yield ExcelCell(self.rowcounter - 1, 0, index_label, self.header_style) + yield ExcelCell(self.rowcounter - 1, 0, index_label, None) # write index_values index_values = self.df.index @@ -773,7 +752,7 @@ def _format_regular_rows(self) -> Iterable[ExcelCell]: row=self.rowcounter + idx, col=0, val=idxval, - style=self.header_style, + style=None, css_styles=getattr(self.styler, "ctx_index", None), css_row=idx, css_col=0, @@ -801,17 +780,16 @@ def _format_hierarchical_rows(self) -> Iterable[ExcelCell]: # MultiIndex columns require an extra row # with index names (blank if None) for - # unambiguous round-trip, unless not merging, - # in which case the names all go on one row Issue #11328 - if isinstance(self.columns, MultiIndex) and self.merge_cells: + # unambiguous round-trip, Issue #11328 + if isinstance(self.columns, MultiIndex): self.rowcounter += 1 # if index labels are not empty go ahead and dump if com.any_not_none(*index_labels) and self.header is not False: for cidx, name in enumerate(index_labels): - yield ExcelCell(self.rowcounter - 1, cidx, name, self.header_style) + yield ExcelCell(self.rowcounter - 1, cidx, name, None) - if self.merge_cells: + if self.merge_cells and self.merge_cells != "columns": # Format hierarchical rows as merged cells. level_strs = self.df.index._format_multi( sparsify=True, include_names=False @@ -826,6 +804,9 @@ def _format_hierarchical_rows(self) -> Iterable[ExcelCell]: allow_fill=levels._can_hold_na, fill_value=levels._na_value, ) + # GH#60099 + if isinstance(values[0], Period): + values = values.to_timestamp() for i, span_val in spans.items(): mergestart, mergeend = None, None @@ -836,7 +817,7 @@ def _format_hierarchical_rows(self) -> Iterable[ExcelCell]: row=self.rowcounter + i, col=gcolidx, val=values[i], - style=self.header_style, + style=None, css_styles=getattr(self.styler, "ctx_index", None), css_row=i, css_col=gcolidx, @@ -850,11 +831,15 @@ def _format_hierarchical_rows(self) -> Iterable[ExcelCell]: # Format hierarchical rows with non-merged values. 
for indexcolvals in zip(*self.df.index): for idx, indexcolval in enumerate(indexcolvals): + # GH#60099 + if isinstance(indexcolval, Period): + indexcolval = indexcolval.to_timestamp() + yield CssExcelCell( row=self.rowcounter + idx, col=gcolidx, val=indexcolval, - style=self.header_style, + style=None, css_styles=getattr(self.styler, "ctx_index", None), css_row=idx, css_col=gcolidx, diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 00c7526edfa48..dbfac3b02643f 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -2,9 +2,11 @@ Internal module for formatting output data in csv, html, xml, and latex files. This module also applies to display formatting. """ + from __future__ import annotations from collections.abc import ( + Callable, Generator, Hashable, Mapping, @@ -21,7 +23,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Final, cast, ) @@ -66,7 +67,6 @@ ExtensionArray, TimedeltaArray, ) -from pandas.core.arrays.string_ import StringDtype from pandas.core.base import PandasObject import pandas.core.common as com from pandas.core.indexes.api import ( @@ -77,7 +77,6 @@ ) from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.timedeltas import TimedeltaIndex -from pandas.core.reshape.concat import concat from pandas.io.common import ( check_parent_directory, @@ -244,7 +243,11 @@ def _chk_truncate(self) -> None: series = series.iloc[:max_rows] else: row_num = max_rows // 2 - series = concat((series.iloc[:row_num], series.iloc[-row_num:])) + _len = len(series) + _slice = np.hstack( + [np.arange(row_num), np.arange(_len - row_num, _len)] + ) + series = series.iloc[_slice] self.tr_row_num = row_num else: self.tr_row_num = None @@ -562,7 +565,7 @@ def _initialize_colspace(self, col_space: ColspaceArgType | None) -> ColspaceTyp result = {} elif isinstance(col_space, (int, str)): result = {"": col_space} - result.update({column: col_space for column in self.frame.columns}) + result.update(dict.fromkeys(self.frame.columns, col_space)) elif isinstance(col_space, Mapping): for column in col_space.keys(): if column not in self.frame.columns and column != "": @@ -668,9 +671,9 @@ def _truncate_horizontally(self) -> None: assert self.max_cols_fitted is not None col_num = self.max_cols_fitted // 2 if col_num >= 1: - left = self.tr_frame.iloc[:, :col_num] - right = self.tr_frame.iloc[:, -col_num:] - self.tr_frame = concat((left, right), axis=1) + _len = len(self.tr_frame.columns) + _slice = np.hstack([np.arange(col_num), np.arange(_len - col_num, _len)]) + self.tr_frame = self.tr_frame.iloc[:, _slice] # truncate formatter if isinstance(self.formatters, (list, tuple)): @@ -681,7 +684,7 @@ def _truncate_horizontally(self) -> None: else: col_num = cast(int, self.max_cols) self.tr_frame = self.tr_frame.iloc[:, :col_num] - self.tr_col_num = col_num + self.tr_col_num: int = col_num def _truncate_vertically(self) -> None: """Remove rows, which are not to be displayed. 
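The ``np.hstack`` pattern adopted in the truncation hunks above builds a single positional indexer covering the head and the tail instead of concatenating two slices. A self-contained sketch of the equivalence (names illustrative):

>>> import numpy as np
>>> import pandas as pd
>>> s = pd.Series(range(10))
>>> row_num = 2
>>> positions = np.hstack([np.arange(row_num), np.arange(len(s) - row_num, len(s))])
>>> s.iloc[positions].tolist()  # first two and last two values
[0, 1, 8, 9]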
@@ -780,38 +783,20 @@ def _get_formatted_column_labels(self, frame: DataFrame) -> list[list[str]]: if isinstance(columns, MultiIndex): fmt_columns = columns._format_multi(sparsify=False, include_names=False) - fmt_columns = list(zip(*fmt_columns)) - dtypes = self.frame.dtypes._values - - # if we have a Float level, they don't use leading space at all - restrict_formatting = any(level.is_floating for level in columns.levels) - need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes))) - - def space_format(x, y): - if ( - y not in self.formatters - and need_leadsp[x] - and not restrict_formatting - ): - return " " + y - return y - - str_columns_tuple = list( - zip(*([space_format(x, y) for y in x] for x in fmt_columns)) - ) - if self.sparsify and len(str_columns_tuple): - str_columns_tuple = sparsify_labels(str_columns_tuple) + if self.sparsify and len(fmt_columns): + fmt_columns = sparsify_labels(fmt_columns) - str_columns = [list(x) for x in zip(*str_columns_tuple)] + str_columns = [list(x) for x in zip(*fmt_columns)] else: fmt_columns = columns._format_flat(include_name=False) - dtypes = self.frame.dtypes - need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes))) str_columns = [ - [" " + x if not self._get_formatter(i) and need_leadsp[x] else x] - for i, x in enumerate(fmt_columns) + [ + " " + x + if not self._get_formatter(i) and is_numeric_dtype(dtype) + else x + ] + for i, (x, dtype) in enumerate(zip(fmt_columns, self.frame.dtypes)) ] - # self.str_columns = str_columns return str_columns def _get_formatted_index(self, frame: DataFrame) -> list[str]: @@ -872,7 +857,7 @@ class DataFrameRenderer: - to_csv - to_latex - Called in pandas.core.frame.DataFrame: + Called in pandas.DataFrame: - to_html - to_string @@ -911,9 +896,13 @@ def to_html( ``
<table>`` tag, in addition to the default "dataframe". notebook : {True, False}, optional, default False Whether the generated HTML is for IPython Notebook. - border : int - A ``border=border`` attribute is included in the opening - ``<table>`` tag. Default ``pd.options.display.html.border``. + border : int or bool + When an integer value is provided, it sets the border attribute in + the opening tag, specifying the thickness of the border. + If ``False`` or ``0`` is passed, the border attribute will not + be present in the ``<table>`` tag. + The default value for this parameter is governed by + ``pd.options.display.html.border``. table_id : str, optional A css id is included in the opening `<table>
` tag if specified. render_links : bool, default False @@ -1041,7 +1030,7 @@ def save_to_buffer( @contextmanager def _get_buffer( buf: FilePath | WriteBuffer[str] | None, encoding: str | None = None -) -> Generator[WriteBuffer[str], None, None] | Generator[StringIO, None, None]: +) -> Generator[WriteBuffer[str]] | Generator[StringIO]: """ Context manager to open, yield and close buffer for filenames or Path-like objects, otherwise yield buf unchanged. @@ -1223,17 +1212,11 @@ def _format(x): return "None" elif x is NA: return str(NA) - elif lib.is_float(x) and np.isinf(x): - # TODO(3.0): this will be unreachable when use_inf_as_na - # deprecation is enforced - return str(x) elif x is NaT or isinstance(x, (np.datetime64, np.timedelta64)): return "NaT" return self.na_rep elif isinstance(x, PandasObject): return str(x) - elif isinstance(x, StringDtype): - return repr(x) else: # object dtype return str(formatter(x)) @@ -1346,7 +1329,9 @@ def get_result_as_array(self) -> np.ndarray: the parameters given at initialisation, as a numpy array """ - def format_with_na_rep(values: ArrayLike, formatter: Callable, na_rep: str): + def format_with_na_rep( + values: ArrayLike, formatter: Callable, na_rep: str + ) -> np.ndarray: mask = isna(values) formatted = np.array( [ @@ -1358,7 +1343,7 @@ def format_with_na_rep(values: ArrayLike, formatter: Callable, na_rep: str): def format_complex_with_na_rep( values: ArrayLike, formatter: Callable, na_rep: str - ): + ) -> np.ndarray: real_values = np.real(values).ravel() # type: ignore[arg-type] imag_values = np.imag(values).ravel() # type: ignore[arg-type] real_mask, imag_mask = isna(real_values), isna(imag_values) @@ -1544,7 +1529,7 @@ def _format_strings(self) -> list[str]: def format_percentiles( - percentiles: (np.ndarray | Sequence[float]), + percentiles: np.ndarray | Sequence[float], ) -> list[str]: """ Outputs rounded and formatted percentiles. @@ -1577,6 +1562,9 @@ def format_percentiles( >>> format_percentiles([0, 0.5, 0.02001, 0.5, 0.666666, 0.9999]) ['0%', '50%', '2.0%', '50%', '66.67%', '99.99%'] """ + if len(percentiles) == 0: + return [] + percentiles = np.asarray(percentiles) # It checks for np.nan as well @@ -1768,7 +1756,7 @@ def _trim_zeros_complex(str_complexes: ArrayLike, decimal: str = ".") -> list[st # The split will give [{"", "-"}, "xxx", "+/-", "xxx", "j", ""] # Therefore, the imaginary part is the 4th and 3rd last elements, # and the real part is everything before the imaginary part - trimmed = re.split(r"([j+-])", x) + trimmed = re.split(r"(?<!e)([j+-])", x) @@ ... @@ def set_eng_float_format(accuracy: int = 3, use_eng_prefix: bool = False) -> Non """ Format float representation in DataFrame with SI notation. + Sets the floating-point display format for ``DataFrame`` objects using engineering + notation (SI units), allowing easier readability of values across wide ranges. + Parameters ---------- accuracy : int, default 3 @@ -1955,6 +1946,13 @@ def set_eng_float_format(accuracy: int = 3, use_eng_prefix: bool = False) -> Non Returns ------- None + This method does not return a value. It updates the global display format + for floats in DataFrames. + + See Also + -------- + set_option : Set the value of the specified option or options. + reset_option : Reset one or more options to their default value. Examples -------- diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index 794ce77b3b45e..c4884ef4ce4a9 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -1,6 +1,7 @@ """ Module for formatting output data in HTML.
""" + from __future__ import annotations from textwrap import dedent @@ -194,6 +195,8 @@ def _write_cell( esc = {} rs = pprint_thing(s, escape_chars=esc).strip() + # replace spaces betweens strings with non-breaking spaces + rs = rs.replace(" ", "  ") if self.render_links and is_url(rs): rs_unescaped = pprint_thing(s, escape_chars={}).strip() @@ -238,6 +241,7 @@ def _write_table(self, indent: int = 0) -> None: use_mathjax = get_option("display.html.use_mathjax") if not use_mathjax: _classes.append("tex2jax_ignore") + _classes.append("mathjax_ignore") if self.classes is not None: if isinstance(self.classes, str): self.classes = self.classes.split() diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 552affbd053f2..eb579f7149d44 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -72,7 +72,7 @@ Prints information of all columns: >>> df.info(verbose=True) - + RangeIndex: 5 entries, 0 to 4 Data columns (total 3 columns): # Column Non-Null Count Dtype @@ -87,7 +87,7 @@ information: >>> df.info(verbose=False) - + RangeIndex: 5 entries, 0 to 4 Columns: 3 entries, int_col to float_col dtypes: float64(1), int64(1), object(1) @@ -115,7 +115,7 @@ ... 'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6) ... }) >>> df.info() - + RangeIndex: 1000000 entries, 0 to 999999 Data columns (total 3 columns): # Column Non-Null Count Dtype @@ -127,7 +127,7 @@ memory usage: 22.9+ MB >>> df.info(memory_usage='deep') - + RangeIndex: 1000000 entries, 0 to 999999 Data columns (total 3 columns): # Column Non-Null Count Dtype @@ -165,7 +165,7 @@ >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon'] >>> s = pd.Series(text_values, index=int_values) >>> s.info() - + Index: 5 entries, 1 to 5 Series name: None Non-Null Count Dtype @@ -177,7 +177,7 @@ Prints a summary excluding information about its values: >>> s.info(verbose=False) - + Index: 5 entries, 1 to 5 dtypes: object(1) memory usage: 80.0+ bytes @@ -200,7 +200,7 @@ >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6) >>> s = pd.Series(np.random.choice(['a', 'b', 'c'], 10 ** 6)) >>> s.info() - + RangeIndex: 1000000 entries, 0 to 999999 Series name: None Non-Null Count Dtype @@ -210,7 +210,7 @@ memory usage: 7.6+ MB >>> s.info(memory_usage='deep') - + RangeIndex: 1000000 entries, 0 to 999999 Series name: None Non-Null Count Dtype @@ -226,12 +226,17 @@ Series.describe: Generate descriptive statistics of Series. Series.memory_usage: Memory usage of Series.""" ) +series_max_cols_sub = dedent( + """\ + max_cols : int, optional + Unused, exists only for compatibility with DataFrame.info.""" +) series_sub_kwargs = { "klass": "Series", "type_sub": "", - "max_cols_sub": "", + "max_cols_sub": series_max_cols_sub, "show_counts_sub": show_counts_sub, "examples_sub": series_examples_sub, "see_also_sub": series_see_also_sub, @@ -244,7 +249,7 @@ Print a concise summary of a {klass}. This method prints information about a {klass} including - the index dtype{type_sub}, non-null values and memory usage. + the index dtype{type_sub}, non-NA values and memory usage. 
{version_added_sub}\ Parameters @@ -334,10 +339,10 @@ def _sizeof_fmt(num: float, size_qualifier: str) -> str: Examples -------- - >>> _sizeof_fmt(23028, '') + >>> _sizeof_fmt(23028, "") '22.5 KB' - >>> _sizeof_fmt(23028, '+') + >>> _sizeof_fmt(23028, "+") '22.5+ KB' """ for x in ["bytes", "KB", "MB", "GB", "TB"]: @@ -392,7 +397,7 @@ def dtype_counts(self) -> Mapping[str, int]: @property @abstractmethod - def non_null_counts(self) -> Sequence[int]: + def non_null_counts(self) -> list[int] | Series: """Sequence of non-null counts for all columns or column (if series).""" @property @@ -422,7 +427,7 @@ def size_qualifier(self) -> str: # categories) if ( "object" in self.dtype_counts - or self.data.index._is_memory_usage_qualified() + or self.data.index._is_memory_usage_qualified ): size_qualifier = "+" return size_qualifier @@ -486,7 +491,7 @@ def col_count(self) -> int: return len(self.ids) @property - def non_null_counts(self) -> Sequence[int]: + def non_null_counts(self) -> Series: """Sequence of non-null counts for all columns or column (if series).""" return self.data.count() @@ -546,7 +551,7 @@ def render( printer.to_buffer(buf) @property - def non_null_counts(self) -> Sequence[int]: + def non_null_counts(self) -> list[int]: return [self.data.count()] @property @@ -622,7 +627,7 @@ def __init__( @property def max_rows(self) -> int: """Maximum info rows to be displayed.""" - return get_option("display.max_info_rows", len(self.data) + 1) + return get_option("display.max_info_rows") @property def exceeds_info_cols(self) -> bool: @@ -641,7 +646,7 @@ def col_count(self) -> int: def _initialize_max_cols(self, max_cols: int | None) -> int: if max_cols is None: - return get_option("display.max_info_columns", self.col_count + 1) + return get_option("display.max_info_columns") return max_cols def _initialize_show_counts(self, show_counts: bool | None) -> bool: @@ -750,7 +755,7 @@ def memory_usage_string(self) -> str: return self.info.memory_usage_string @property - def non_null_counts(self) -> Sequence[int]: + def non_null_counts(self) -> list[int] | Series: return self.info.non_null_counts def add_object_type_line(self) -> None: diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index 2cc9368f8846a..ab27321ffe83c 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -1,17 +1,19 @@ """ Printing tools. """ + from __future__ import annotations from collections.abc import ( + Callable, Iterable, Mapping, Sequence, ) import sys from typing import ( + TYPE_CHECKING, Any, - Callable, TypeVar, Union, ) @@ -23,12 +25,14 @@ from pandas.io.formats.console import get_console_size +if TYPE_CHECKING: + from pandas._typing import ListLike EscapeChars = Union[Mapping[str, str], Iterable[str]] _KT = TypeVar("_KT") _VT = TypeVar("_VT") -def adjoin(space: int, *lists: list[str], **kwargs) -> str: +def adjoin(space: int, *lists: list[str], **kwargs: Any) -> str: """ Glues together two sets of strings using the amount of space requested. The idea is to prettify. @@ -97,7 +101,7 @@ def _adj_justify(texts: Iterable[str], max_len: int, mode: str = "right") -> lis def _pprint_seq( - seq: Sequence, _nest_lvl: int = 0, max_seq_items: int | None = None, **kwds + seq: ListLike, _nest_lvl: int = 0, max_seq_items: int | None = None, **kwds: Any ) -> str: """ internal. pprinter for iterables. 
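The ``_pprint_seq`` rewrite above iterates lazily and breaks out once ``max_seq_items`` is reached instead of pre-computing ``len(seq)``. Its behaviour is visible through the public wrapper; a sketch (the ``frozenset`` line shows the newly added branch):

```python
from pandas.io.formats.printing import pprint_thing

# Truncation happens while iterating, so the sequence is never fully consumed.
assert pprint_thing(list(range(10)), max_seq_items=3) == "[0, 1, 2, ...]"

# frozensets now get a dedicated repr instead of plain set braces.
assert pprint_thing(frozenset([1])) == "frozenset({1})"
```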
you should probably use pprint_thing() @@ -107,23 +111,28 @@ def _pprint_seq( """ if isinstance(seq, set): fmt = "{{{body}}}" + elif isinstance(seq, frozenset): + fmt = "frozenset({{{body}}})" else: fmt = "[{body}]" if hasattr(seq, "__setitem__") else "({body})" if max_seq_items is False: - nitems = len(seq) + max_items = None else: - nitems = max_seq_items or get_option("max_seq_items") or len(seq) + max_items = max_seq_items or get_option("max_seq_items") or len(seq) s = iter(seq) # handle sets, no slicing - r = [ - pprint_thing(next(s), _nest_lvl + 1, max_seq_items=max_seq_items, **kwds) - for i in range(min(nitems, len(seq))) - ] + r = [] + max_items_reached = False + for i, item in enumerate(s): + if (max_items is not None) and (i >= max_items): + max_items_reached = True + break + r.append(pprint_thing(item, _nest_lvl + 1, max_seq_items=max_seq_items, **kwds)) body = ", ".join(r) - if nitems < len(seq): + if max_items_reached: body += ", ..." elif isinstance(seq, tuple) and len(seq) == 1: body += "," @@ -132,7 +141,7 @@ def _pprint_seq( def _pprint_dict( - seq: Mapping, _nest_lvl: int = 0, max_seq_items: int | None = None, **kwds + seq: Mapping, _nest_lvl: int = 0, max_seq_items: int | None = None, **kwds: Any ) -> str: """ internal. pprinter for iterables. you should probably use pprint_thing() @@ -163,7 +172,7 @@ def _pprint_dict( def pprint_thing( - thing: Any, + thing: object, _nest_lvl: int = 0, escape_chars: EscapeChars | None = None, default_escapes: bool = False, @@ -180,8 +189,8 @@ def pprint_thing( _nest_lvl : internal use only. pprint_thing() is mutually-recursive with pprint_sequence, this argument is used to keep track of the current nesting level, and limit it. - escape_chars : list or dict, optional - Characters to escape. If a dict is passed the values are the + escape_chars : list[str] or Mapping[str, str], optional + Characters to escape. 
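A sketch of the two ``escape_chars`` forms distinguished here: a list names characters to escape with the built-in backslash table, while a mapping supplies the replacements itself:

```python
from pandas.io.formats.printing import pprint_thing

print(pprint_thing("a\tb", escape_chars=["\t"]))         # a\tb (escaped)
print(pprint_thing("a\tb", escape_chars={"\t": " | "}))  # a | b
```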
If a Mapping is passed the values are the replacements default_escapes : bool, default False Whether the input escape characters replaces or adds to the defaults @@ -196,12 +205,12 @@ def pprint_thing( def as_escaped_string( thing: Any, escape_chars: EscapeChars | None = escape_chars ) -> str: - translate = {"\t": r"\t", "\n": r"\n", "\r": r"\r"} - if isinstance(escape_chars, dict): + translate = {"\t": r"\t", "\n": r"\n", "\r": r"\r", "'": r"\'"} + if isinstance(escape_chars, Mapping): if default_escapes: translate.update(escape_chars) else: - translate = escape_chars + translate = escape_chars # type: ignore[assignment] escape_chars = list(escape_chars.keys()) else: escape_chars = escape_chars or () @@ -213,7 +222,7 @@ def as_escaped_string( if hasattr(thing, "__next__"): return str(thing) - elif isinstance(thing, dict) and _nest_lvl < get_option( + elif isinstance(thing, Mapping) and _nest_lvl < get_option( "display.pprint_nest_depth" ): result = _pprint_dict( @@ -221,7 +230,10 @@ def as_escaped_string( ) elif is_sequence(thing) and _nest_lvl < get_option("display.pprint_nest_depth"): result = _pprint_seq( - thing, + # error: Argument 1 to "_pprint_seq" has incompatible type "object"; + # expected "ExtensionArray | ndarray[Any, Any] | Index | Series | + # SequenceNotStr[Any] | range" + thing, # type: ignore[arg-type] _nest_lvl, escape_chars=escape_chars, quote_strings=quote_strings, @@ -236,7 +248,7 @@ def as_escaped_string( def pprint_thing_encoded( - object, encoding: str = "utf-8", errors: str = "replace" + object: object, encoding: str = "utf-8", errors: str = "replace" ) -> bytes: value = pprint_thing(object) # get unicode representation of object return value.encode(encoding, errors) @@ -248,7 +260,8 @@ def enable_data_resource_formatter(enable: bool) -> None: return from IPython import get_ipython - ip = get_ipython() + # error: Call to untyped function "get_ipython" in typed context + ip = get_ipython() # type: ignore[no-untyped-call] if ip is None: # still not in IPython return @@ -285,7 +298,7 @@ def default_pprint(thing: Any, max_seq_items: int | None = None) -> str: def format_object_summary( - obj, + obj: ListLike, formatter: Callable, is_justify: bool = True, name: str | None = None, @@ -325,8 +338,8 @@ def format_object_summary( if indent_for_name: name_len = len(name) - space1 = f'\n{(" " * (name_len + 1))}' - space2 = f'\n{(" " * (name_len + 2))}' + space1 = f"\n{(' ' * (name_len + 1))}" + space2 = f"\n{(' ' * (name_len + 2))}" else: space1 = "\n" space2 = "\n " # space for the opening '[' @@ -474,7 +487,7 @@ def _justify( Examples -------- - >>> _justify([['a', 'b']], [['abc', 'abcd']]) + >>> _justify([["a", "b"]], [["abc", "abcd"]]) ([(' a', ' b')], [('abc', 'abcd')]) """ combined = head + tail @@ -521,7 +534,7 @@ def justify(self, texts: Any, max_len: int, mode: str = "right") -> list[str]: else: return [x.rjust(max_len) for x in texts] - def adjoin(self, space: int, *lists, **kwargs) -> str: + def adjoin(self, space: int, *lists: Any, **kwargs: Any) -> str: return adjoin(space, *lists, strlen=self.len, justfunc=self.justify, **kwargs) @@ -553,7 +566,7 @@ def justify( self, texts: Iterable[str], max_len: int, mode: str = "right" ) -> list[str]: # re-calculate padding space per str considering East Asian Width - def _get_pad(t): + def _get_pad(t: str) -> int: return max_len - self.len(t) + len(t) if mode == "left": diff --git a/pandas/io/formats/string.py b/pandas/io/formats/string.py index cdad388592717..ca41726de08cf 100644 --- a/pandas/io/formats/string.py +++ 
b/pandas/io/formats/string.py @@ -1,6 +1,7 @@ """ Module for formatting output data in console (to string). """ + from __future__ import annotations from shutil import get_terminal_size diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index b62f7581ac220..0f734a81795c4 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -1,19 +1,17 @@ """ Module for applying conditional formatting to DataFrames and Series. """ + from __future__ import annotations -from contextlib import contextmanager import copy from functools import partial import operator +import textwrap from typing import ( TYPE_CHECKING, - Any, - Callable, overload, ) -import warnings import numpy as np @@ -24,7 +22,6 @@ Substitution, doc, ) -from pandas.util._exceptions import find_stack_level import pandas as pd from pandas import ( @@ -58,7 +55,7 @@ if TYPE_CHECKING: from collections.abc import ( - Generator, + Callable, Hashable, Sequence, ) @@ -66,37 +63,27 @@ from matplotlib.colors import Colormap from pandas._typing import ( + Any, Axis, AxisInt, + Concatenate, + ExcelWriterMergeCells, FilePath, IndexLabel, IntervalClosedType, Level, + P, QuantileInterpolation, Scalar, + Self, StorageOptions, + T, WriteBuffer, WriteExcelBuffer, ) from pandas import ExcelWriter -try: - import matplotlib as mpl - import matplotlib.pyplot as plt - - has_mpl = True -except ImportError: - has_mpl = False - - -@contextmanager -def _mpl(func: Callable) -> Generator[tuple[Any, Any], None, None]: - if has_mpl: - yield plt, mpl - else: - raise ImportError(f"{func.__name__} requires matplotlib.") - #### # Shared Doc Strings @@ -130,6 +117,12 @@ class Styler(StylerRenderer): r""" Helps style a DataFrame or Series according to the data with HTML and CSS. + This class provides methods for styling and formatting a Pandas DataFrame or Series. + The styled output can be rendered as HTML or LaTeX, and it supports CSS-based + styling, allowing users to control colors, font styles, and other visual aspects of + tabular data. It is particularly useful for presenting DataFrame objects in a + Jupyter Notebook environment or when exporting styled tables for reports and + Parameters ---------- data : Series or DataFrame @@ -176,6 +169,7 @@ class Styler(StylerRenderer): escape : str, optional Use 'html' to replace the characters ``&``, ``<``, ``>``, ``'``, and ``"`` in cell display string with HTML-safe sequences. + Use 'latex' to replace the characters ``&``, ``%``, ``$``, ``#``, ``_``, ``{``, ``}``, ``~``, ``^``, and ``\`` in the cell display string with LaTeX-safe sequences. Use 'latex-math' to replace the characters @@ -193,6 +187,8 @@ class Styler(StylerRenderer): Attributes ---------- + index : data.index Index + columns : data.columns Index env : Jinja2 jinja2.Environment template_html : Jinja2 Template template_html_table : Jinja2 Template @@ -207,6 +203,13 @@ class Styler(StylerRenderer): Notes ----- + .. warning:: + + ``Styler`` is primarily intended for use on safe input that you control. + When using ``Styler`` on untrusted, user-provided input to serve HTML, + you should set ``escape="html"`` to prevent security vulnerabilities. + See the Jinja2 documentation on escaping HTML for more. + Most styling will be done by passing style functions into ``Styler.apply`` or ``Styler.map``. 
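The security note added above is directly actionable; a minimal sketch of escaping untrusted cell content before serving the render (assumes jinja2 is installed, as ``Styler`` requires):

```python
import pandas as pd

df = pd.DataFrame({"payload": ["<script>alert(1)</script>"]})

# escape="html" swaps &, <, >, ', " for HTML-safe sequences in the output.
html = df.style.format(escape="html").to_html()
assert "<script>" not in html and "&lt;script&gt;" in html
```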
Style functions should return values with strings containing CSS ``'attr: value'`` that will @@ -227,6 +230,7 @@ class Styler(StylerRenderer): * ``level`` where `k` is the level in a MultiIndex * Column label cells include + * ``col_heading`` * ``col`` where `n` is the numeric position of the column * ``level`` where `k` is the level in a MultiIndex @@ -236,15 +240,17 @@ class Styler(StylerRenderer): * Trimmed cells include ``col_trim`` or ``row_trim``. Any, or all, or these classes can be renamed by using the ``css_class_names`` - argument in ``Styler.set_table_classes``, giving a value such as + argument in ``Styler.set_table_styles``, giving a value such as *{"row": "MY_ROW_CLASS", "col_trim": "", "row_trim": ""}*. Examples -------- - >>> df = pd.DataFrame([[1.0, 2.0, 3.0], [4, 5, 6]], index=['a', 'b'], - ... columns=['A', 'B', 'C']) - >>> pd.io.formats.style.Styler(df, precision=2, - ... caption="My table") # doctest: +SKIP + >>> df = pd.DataFrame( + ... [[1.0, 2.0, 3.0], [4, 5, 6]], index=["a", "b"], columns=["A", "B", "C"] + ... ) + >>> pd.io.formats.style.Styler( + ... df, precision=2, caption="My table" + ... ) # doctest: +SKIP Please see: `Table Visualization <../../user_guide/style.ipynb>`_ for more examples. @@ -310,6 +316,12 @@ def concat(self, other: Styler) -> Styler: Returns ------- Styler + Instance of class with specified Styler appended. + + See Also + -------- + Styler.clear : Reset the ``Styler``, removing any previously applied styles. + Styler.export : Export the styles applied to the current Styler. Notes ----- @@ -339,7 +351,7 @@ def concat(self, other: Styler) -> Styler: keys ``data``, ``row_heading`` and ``row`` will be prepended with ``foot0_``. If more concats are chained, their styles will be prepended with ``foot1_``, ''foot_2'', etc., and if a concatenated style have - another concatanated style, the second style will be prepended with + another concatenated style, the second style will be prepended with ``foot{parent}_foot{child}_``. A common use case is to concatenate user defined functions with @@ -351,9 +363,11 @@ def concat(self, other: Styler) -> Styler: A common use case is adding totals rows, or otherwise, via methods calculated in ``DataFrame.agg``. - >>> df = pd.DataFrame([[4, 6], [1, 9], [3, 4], [5, 5], [9, 6]], - ... columns=["Mike", "Jim"], - ... index=["Mon", "Tue", "Wed", "Thurs", "Fri"]) + >>> df = pd.DataFrame( + ... [[4, 6], [1, 9], [3, 4], [5, 5], [9, 6]], + ... columns=["Mike", "Jim"], + ... index=["Mon", "Tue", "Wed", "Thurs", "Fri"], + ... ) >>> styler = df.style.concat(df.agg(["sum"]).style) # doctest: +SKIP .. figure:: ../../_static/style/footer_simple.png @@ -363,14 +377,16 @@ def concat(self, other: Styler) -> Styler: >>> descriptors = df.agg(["sum", "mean", lambda s: s.dtype]) >>> descriptors.index = ["Total", "Average", "dtype"] - >>> other = (descriptors.style - ... .highlight_max(axis=1, subset=(["Total", "Average"], slice(None))) - ... .format(subset=("Average", slice(None)), precision=2, decimal=",") - ... .map(lambda v: "font-weight: bold;")) - >>> styler = (df.style - ... .highlight_max(color="salmon") - ... .set_table_styles([{"selector": ".foot_row0", - ... "props": "border-top: 1px solid black;"}])) + >>> other = ( + ... descriptors.style.highlight_max( + ... axis=1, subset=(["Total", "Average"], slice(None)) + ... ) + ... .format(subset=("Average", slice(None)), precision=2, decimal=",") + ... .map(lambda v: "font-weight: bold;") + ... ) + >>> styler = df.style.highlight_max(color="salmon").set_table_styles( + ... 
[{"selector": ".foot_row0", "props": "border-top: 1px solid black;"}] + ... ) >>> styler.concat(other) # doctest: +SKIP .. figure:: ../../_static/style/footer_extended.png @@ -378,8 +394,9 @@ def concat(self, other: Styler) -> Styler: When ``other`` has fewer index levels than the original Styler it is possible to extend the index in ``other``, with placeholder levels. - >>> df = pd.DataFrame([[1], [2]], - ... index=pd.MultiIndex.from_product([[0], [1, 2]])) + >>> df = pd.DataFrame( + ... [[1], [2]], index=pd.MultiIndex.from_product([[0], [1, 2]]) + ... ) >>> descriptors = df.agg(["sum"]) >>> descriptors.index = pd.MultiIndex.from_product([[""], descriptors.index]) >>> df.style.concat(descriptors.style) # doctest: +SKIP @@ -415,6 +432,7 @@ def set_tooltips( ttips: DataFrame, props: CSSProperties | None = None, css_class: str | None = None, + as_title_attribute: bool = False, ) -> Styler: """ Set the DataFrame of strings on ``Styler`` generating ``:hover`` tooltips. @@ -438,10 +456,22 @@ def set_tooltips( Name of the tooltip class used in CSS, should conform to HTML standards. Only useful if integrating tooltips with external CSS. If ``None`` uses the internal default value 'pd-t'. + as_title_attribute : bool, default False + Add the tooltip text as title attribute to resultant
    element. If True + then props and css_class arguments are ignored. Returns ------- Styler + Instance of class with DataFrame set for strings on ``Styler`` + generating ``:hover`` tooltips. + + See Also + -------- + Styler.set_table_attributes : Set the table attributes added to the + ```` HTML element. + Styler.set_table_styles : Set the table styles included within the + `` @@ -1368,8 +1536,7 @@ def to_string( max_rows: int | None = ..., max_columns: int | None = ..., delimiter: str = ..., - ) -> None: - ... + ) -> None: ... @overload def to_string( @@ -1382,8 +1549,7 @@ def to_string( max_rows: int | None = ..., max_columns: int | None = ..., delimiter: str = ..., - ) -> str: - ... + ) -> str: ... @Substitution(buf=buffering_args, encoding=encoding_args) def to_string( @@ -1432,9 +1598,13 @@ def to_string( str or None If `buf` is None, returns the result as a string. Otherwise returns `None`. + See Also + -------- + DataFrame.to_string : Render a DataFrame to a console-friendly tabular output. + Examples -------- - >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) + >>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) >>> df.style.to_string() ' A B\\n0 1 3\\n1 2 4\\n' """ @@ -1471,6 +1641,8 @@ def set_td_classes(self, classes: DataFrame) -> Styler: Returns ------- Styler + Instance of class with ``class`` attribute set for ``
    `` + HTML elements. See Also -------- @@ -1487,19 +1659,24 @@ def set_td_classes(self, classes: DataFrame) -> Styler: Examples -------- >>> df = pd.DataFrame(data=[[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"]) - >>> classes = pd.DataFrame([ - ... ["min-val red", "", "blue"], - ... ["red", None, "blue max-val"] - ... ], index=df.index, columns=df.columns) + >>> classes = pd.DataFrame( + ... [["min-val red", "", "blue"], ["red", None, "blue max-val"]], + ... index=df.index, + ... columns=df.columns, + ... ) >>> df.style.set_td_classes(classes) # doctest: +SKIP Using `MultiIndex` columns and a `classes` `DataFrame` as a subset of the underlying, - >>> df = pd.DataFrame([[1,2],[3,4]], index=["a", "b"], - ... columns=[["level0", "level0"], ["level1a", "level1b"]]) - >>> classes = pd.DataFrame(["min-val"], index=["a"], - ... columns=[["level0"],["level1a"]]) + >>> df = pd.DataFrame( + ... [[1, 2], [3, 4]], + ... index=["a", "b"], + ... columns=[["level0", "level0"], ["level1a", "level1b"]], + ... ) + >>> classes = pd.DataFrame( + ... ["min-val"], index=["a"], columns=[["level0"], ["level1a"]] + ... ) >>> df.style.set_td_classes(classes) # doctest: +SKIP Form of the output with new additional css classes, @@ -1580,7 +1757,7 @@ def _update_ctx_header(self, attrs: DataFrame, axis: AxisInt) -> None: for j in attrs.columns: ser = attrs[j] for i, c in ser.items(): - if not c: + if not c or pd.isna(c): continue css_list = maybe_convert_css_to_tuples(c) if axis == 0: @@ -1637,6 +1814,8 @@ def _copy(self, deepcopy: bool = False) -> Styler: "_display_funcs", "_display_funcs_index", "_display_funcs_columns", + "_display_funcs_index_names", + "_display_funcs_column_names", "hidden_rows", "hidden_columns", "ctx", @@ -1669,13 +1848,21 @@ def clear(self) -> None: Returns None. + See Also + -------- + Styler.apply : Apply a CSS-styling function column-wise, row-wise, + or table-wise. + Styler.export : Export the styles applied to the current Styler. + Styler.map : Apply a CSS-styling function elementwise. + Styler.use : Set the styles on the current Styler. + Examples -------- - >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, np.nan]}) + >>> df = pd.DataFrame({"A": [1, 2], "B": [3, np.nan]}) After any added style: - >>> df.style.highlight_null(color='yellow') # doctest: +SKIP + >>> df.style.highlight_null(color="yellow") # doctest: +SKIP Remove it with: @@ -1710,12 +1897,12 @@ def _apply( if not isinstance(result, DataFrame): if not isinstance(result, np.ndarray): raise TypeError( - f"Function {repr(func)} must return a DataFrame or ndarray " + f"Function {func!r} must return a DataFrame or ndarray " f"when passed to `Styler.apply` with axis=None" ) if data.shape != result.shape: raise ValueError( - f"Function {repr(func)} returned ndarray with wrong shape.\n" + f"Function {func!r} returned ndarray with wrong shape.\n" f"Result has shape: {result.shape}\n" f"Expected shape: {data.shape}" ) @@ -1729,12 +1916,12 @@ def _apply( if isinstance(result, Series): raise ValueError( - f"Function {repr(func)} resulted in the apply method collapsing to a " + f"Function {func!r} resulted in the apply method collapsing to a " f"Series.\nUsually, this is the result of the function returning a " f"single value, instead of list-like." 
) msg = ( - f"Function {repr(func)} created invalid {{0}} labels.\nUsually, this is " + f"Function {func!r} created invalid {{0}} labels.\nUsually, this is " f"the result of the function returning a " f"{'Series' if axis is not None else 'DataFrame'} which contains invalid " f"labels, or returning an incorrectly shaped, list-like object which " @@ -1790,6 +1977,7 @@ def apply( Returns ------- Styler + Instance of class with CSS applied to its HTML representation. See Also -------- @@ -1812,22 +2000,22 @@ def apply( >>> def highlight_max(x, color): ... return np.where(x == np.nanmax(x.to_numpy()), f"color: {color};", None) >>> df = pd.DataFrame(np.random.randn(5, 2), columns=["A", "B"]) - >>> df.style.apply(highlight_max, color='red') # doctest: +SKIP - >>> df.style.apply(highlight_max, color='blue', axis=1) # doctest: +SKIP - >>> df.style.apply(highlight_max, color='green', axis=None) # doctest: +SKIP + >>> df.style.apply(highlight_max, color="red") # doctest: +SKIP + >>> df.style.apply(highlight_max, color="blue", axis=1) # doctest: +SKIP + >>> df.style.apply(highlight_max, color="green", axis=None) # doctest: +SKIP Using ``subset`` to restrict application to a single column or multiple columns - >>> df.style.apply(highlight_max, color='red', subset="A") + >>> df.style.apply(highlight_max, color="red", subset="A") ... # doctest: +SKIP - >>> df.style.apply(highlight_max, color='red', subset=["A", "B"]) + >>> df.style.apply(highlight_max, color="red", subset=["A", "B"]) ... # doctest: +SKIP Using a 2d input to ``subset`` to select rows in addition to columns - >>> df.style.apply(highlight_max, color='red', subset=([0, 1, 2], slice(None))) + >>> df.style.apply(highlight_max, color="red", subset=([0, 1, 2], slice(None))) ... # doctest: +SKIP - >>> df.style.apply(highlight_max, color='red', subset=(slice(0, 5, 2), "A")) + >>> df.style.apply(highlight_max, color="red", subset=(slice(0, 5, 2), "A")) ... # doctest: +SKIP Using a function which returns a Series / DataFrame of unequal length but @@ -1841,7 +2029,7 @@ def apply( more details. """ self._todo.append( - (lambda instance: getattr(instance, "_apply"), (func, axis, subset), kwargs) + (lambda instance: instance._apply, (func, axis, subset), kwargs) ) return self @@ -1875,9 +2063,9 @@ def _apply_index( func="take a Series and return a string array of the same length", input_note="the index as a Series, if an Index, or a level of a MultiIndex", output_note="an identically sized array of CSS styles as strings", - var="s", - ret='np.where(s == "B", "background-color: yellow;", "")', - ret2='["background-color: yellow;" if "x" in v else "" for v in s]', + var="label", + ret='np.where(label == "B", "background-color: yellow;", "")', + ret2='["background-color: yellow;" if "x" in v else "" for v in label]', ) def apply_index( self, @@ -1910,6 +2098,7 @@ def apply_index( Returns ------- Styler + Instance of class with CSS applied to its HTML representation. See Also -------- @@ -1927,8 +2116,8 @@ def apply_index( -------- Basic usage to conditionally highlight values in the index. - >>> df = pd.DataFrame([[1,2], [3,4]], index=["A", "B"]) - >>> def color_b(s): + >>> df = pd.DataFrame([[1, 2], [3, 4]], index=["A", "B"]) + >>> def color_b({var}): ... return {ret} >>> df.style.{this}_index(color_b) # doctest: +SKIP @@ -1936,18 +2125,18 @@ def apply_index( Selectively applying to specific levels of MultiIndex columns. 
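The renamed ``label`` variable in the shared docstring above reflects what the elementwise header functions actually receive. A sketch of the pattern (jinja2 assumed installed):

```python
import pandas as pd

df = pd.DataFrame([[1, 2], [3, 4]], index=["A", "B"])

def color_b(label):
    # One header value in, one CSS string (or None) out.
    return "background-color: yellow;" if label == "B" else None

html = df.style.map_index(color_b).to_html()  # axis=0 styles the row index
```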
- >>> midx = pd.MultiIndex.from_product([['ix', 'jy'], [0, 1], ['x3', 'z4']]) + >>> midx = pd.MultiIndex.from_product([["ix", "jy"], [0, 1], ["x3", "z4"]]) >>> df = pd.DataFrame([np.arange(8)], columns=midx) >>> def highlight_x({var}): ... return {ret2} - >>> df.style.{this}_index(highlight_x, axis="columns", level=[0, 2]) - ... # doctest: +SKIP + >>> df.style.{this}_index( + ... highlight_x, axis="columns", level=[0, 2]) # doctest: +SKIP .. figure:: ../../_static/style/appmaphead2.png """ self._todo.append( ( - lambda instance: getattr(instance, "_apply_index"), + lambda instance: instance._apply_index, (func, axis, level, "apply"), kwargs, ) @@ -1963,9 +2152,9 @@ def apply_index( func="take a scalar and return a string", input_note="an index value, if an Index, or a level value of a MultiIndex", output_note="CSS styles as a string", - var="v", - ret='"background-color: yellow;" if v == "B" else None', - ret2='"background-color: yellow;" if "x" in v else None', + var="label", + ret='"background-color: yellow;" if label == "B" else None', + ret2='"background-color: yellow;" if "x" in label else None', ) def map_index( self, @@ -1976,49 +2165,13 @@ def map_index( ) -> Styler: self._todo.append( ( - lambda instance: getattr(instance, "_apply_index"), + lambda instance: instance._apply_index, (func, axis, level, "map"), kwargs, ) ) return self - def applymap_index( - self, - func: Callable, - axis: AxisInt | str = 0, - level: Level | list[Level] | None = None, - **kwargs, - ) -> Styler: - """ - Apply a CSS-styling function to the index or column headers, elementwise. - - .. deprecated:: 2.1.0 - - Styler.applymap_index has been deprecated. Use Styler.map_index instead. - - Parameters - ---------- - func : function - ``func`` should take a scalar and return a string. - axis : {{0, 1, "index", "columns"}} - The headers over which to apply the function. - level : int, str, list, optional - If index is MultiIndex the level(s) over which to apply the function. - **kwargs : dict - Pass along to ``func``. - - Returns - ------- - Styler - """ - warnings.warn( - "Styler.applymap_index has been deprecated. Use Styler.map_index instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - return self.map_index(func, axis, level, **kwargs) - def _map(self, func: Callable, subset: Subset | None = None, **kwargs) -> Styler: func = partial(func, **kwargs) # map doesn't take kwargs? if subset is None: @@ -2046,6 +2199,7 @@ def map(self, func: Callable, subset: Subset | None = None, **kwargs) -> Styler: Returns ------- Styler + Instance of class with CSS-styling function applied elementwise. See Also -------- @@ -2064,60 +2218,29 @@ def map(self, func: Callable, subset: Subset | None = None, **kwargs) -> Styler: >>> def color_negative(v, color): ... return f"color: {color};" if v < 0 else None >>> df = pd.DataFrame(np.random.randn(5, 2), columns=["A", "B"]) - >>> df.style.map(color_negative, color='red') # doctest: +SKIP + >>> df.style.map(color_negative, color="red") # doctest: +SKIP Using ``subset`` to restrict application to a single column or multiple columns - >>> df.style.map(color_negative, color='red', subset="A") - ... # doctest: +SKIP - >>> df.style.map(color_negative, color='red', subset=["A", "B"]) - ... # doctest: +SKIP + >>> df.style.map(color_negative, color="red", subset="A") + ... # doctest: +SKIP + >>> df.style.map(color_negative, color="red", subset=["A", "B"]) + ... 
# doctest: +SKIP Using a 2d input to ``subset`` to select rows in addition to columns - >>> df.style.map(color_negative, color='red', - ... subset=([0,1,2], slice(None))) # doctest: +SKIP - >>> df.style.map(color_negative, color='red', subset=(slice(0,5,2), "A")) - ... # doctest: +SKIP + >>> df.style.map( + ... color_negative, color="red", subset=([0, 1, 2], slice(None)) + ... ) # doctest: +SKIP + >>> df.style.map(color_negative, color="red", subset=(slice(0, 5, 2), "A")) + ... # doctest: +SKIP See `Table Visualization <../../user_guide/style.ipynb>`_ user guide for more details. """ - self._todo.append( - (lambda instance: getattr(instance, "_map"), (func, subset), kwargs) - ) + self._todo.append((lambda instance: instance._map, (func, subset), kwargs)) return self - @Substitution(subset=subset_args) - def applymap( - self, func: Callable, subset: Subset | None = None, **kwargs - ) -> Styler: - """ - Apply a CSS-styling function elementwise. - - .. deprecated:: 2.1.0 - - Styler.applymap has been deprecated. Use Styler.map instead. - - Parameters - ---------- - func : function - ``func`` should take a scalar and return a string. - %(subset)s - **kwargs : dict - Pass along to ``func``. - - Returns - ------- - Styler - """ - warnings.warn( - "Styler.applymap has been deprecated. Use Styler.map instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - return self.map(func, subset, **kwargs) - def set_table_attributes(self, attributes: str) -> Styler: """ Set the table attributes added to the ```` HTML element. @@ -2127,10 +2250,12 @@ def set_table_attributes(self, attributes: str) -> Styler: Parameters ---------- attributes : str + Table attributes to be added to the ``
    `` HTML element. Returns ------- Styler + Instance of class with specified table attributes set. See Also -------- @@ -2157,6 +2282,7 @@ def export(self) -> dict[str, Any]: Returns ------- dict + Contains data-independent (exportable) styles applied to current Styler. See Also -------- @@ -2233,6 +2359,7 @@ def use(self, styles: dict[str, Any]) -> Styler: Returns ------- Styler + Instance of class with defined styler attributes added. See Also -------- @@ -2280,10 +2407,19 @@ def set_uuid(self, uuid: str) -> Styler: Parameters ---------- uuid : str + The uuid to be applied to ``id`` attributes of HTML elements. Returns ------- Styler + Instance of class with specified uuid for `id` attributes set. + + See Also + -------- + Styler.set_caption : Set the text added to a `` @@ -1110,8 +1142,8 @@ def format( Using a ``formatter`` with ``escape`` in 'latex' mode. >>> df = pd.DataFrame([["123"], ["~ ^"], ["$%#"]]) - >>> df.style.format("\\textbf{{{}}}", escape="latex").to_latex() - ... # doctest: +SKIP + >>> df.style.format("\\textbf{{{}}}", + ... escape="latex").to_latex() # doctest: +SKIP \begin{tabular}{ll} & 0 \\ 0 & \textbf{123} \\ @@ -1122,10 +1154,10 @@ def format( Applying ``escape`` in 'latex-math' mode. In the example below we enter math mode using the character ``$``. - >>> df = pd.DataFrame([[r"$\sum_{i=1}^{10} a_i$ a~b $\alpha \ - ... = \frac{\beta}{\zeta^2}$"], ["%#^ $ \$x^2 $"]]) - >>> df.style.format(escape="latex-math").to_latex() - ... # doctest: +SKIP + >>> df = pd.DataFrame([ + ... [r"$\sum_{i=1}^{10} a_i$ a~b $\alpha = \frac{\beta}{\zeta^2}$"], + ... [r"%#^ $ \$x^2 $"]]) + >>> df.style.format(escape="latex-math").to_latex() # doctest: +SKIP \begin{tabular}{ll} & 0 \\ 0 & $\sum_{i=1}^{10} a_i$ a\textasciitilde b $\alpha = \frac{\beta}{\zeta^2}$ \\ @@ -1135,10 +1167,10 @@ def format( We can use the character ``\(`` to enter math mode and the character ``\)`` to close math mode. - >>> df = pd.DataFrame([[r"\(\sum_{i=1}^{10} a_i\) a~b \(\alpha \ - ... = \frac{\beta}{\zeta^2}\)"], ["%#^ \( \$x^2 \)"]]) - >>> df.style.format(escape="latex-math").to_latex() - ... # doctest: +SKIP + >>> df = pd.DataFrame([ + ... [r"\(\sum_{i=1}^{10} a_i\) a~b \(\alpha = \frac{\beta}{\zeta^2}\)"], + ... [r"%#^ \( \$x^2 \)"]]) + >>> df.style.format(escape="latex-math").to_latex() # doctest: +SKIP \begin{tabular}{ll} & 0 \\ 0 & \(\sum_{i=1}^{10} a_i\) a\textasciitilde b \(\alpha @@ -1149,10 +1181,10 @@ def format( If we have in one DataFrame cell a combination of both shorthands for math formulas, the shorthand with the sign ``$`` will be applied. - >>> df = pd.DataFrame([[r"\( x^2 \) $x^2$"], \ + >>> df = pd.DataFrame([ + ... [r"\( x^2 \) $x^2$"], ... [r"$\frac{\beta}{\zeta}$ \(\frac{\beta}{\zeta}\)"]]) - >>> df.style.format(escape="latex-math").to_latex() - ... # doctest: +SKIP + >>> df.style.format(escape="latex-math").to_latex() # doctest: +SKIP \begin{tabular}{ll} & 0 \\ 0 & \textbackslash ( x\textasciicircum 2 \textbackslash ) $x^2$ \\ @@ -1169,7 +1201,7 @@ def format( >>> df = pd.DataFrame({"A": [1, 0, -1]}) >>> pseudo_css = "number-format: 0§[Red](0)§-§@;" >>> filename = "formatted_file.xlsx" - >>> df.style.map(lambda v: pseudo_css).to_excel(filename) # doctest: +SKIP + >>> df.style.map(lambda v: pseudo_css).to_excel(filename) # doctest: +SKIP .. 
figure:: ../../_static/style/format_excel_css.png """ @@ -1193,7 +1225,7 @@ def format( data = self.data.loc[subset] if not isinstance(formatter, dict): - formatter = {col: formatter for col in data.columns} + formatter = dict.fromkeys(data.columns, formatter) cis = self.columns.get_indexer_for(data.columns) ris = self.index.get_indexer_for(data.index) @@ -1262,6 +1294,7 @@ def format_index( Returns ------- Styler + Returns itself for chaining. See Also -------- @@ -1295,7 +1328,7 @@ def format_index( .. warning:: `Styler.format_index` is ignored when using the output format - `Styler.to_excel`, since Excel and Python have inherrently different + `Styler.to_excel`, since Excel and Python have inherently different formatting structures. However, it is possible to use the `number-format` pseudo CSS attribute to force Excel permissible formatting. See documentation for `Styler.format`. @@ -1318,9 +1351,10 @@ def format_index( Using the default ``formatter`` for unspecified levels >>> df = pd.DataFrame([[1, 2, 3]], - ... columns=pd.MultiIndex.from_arrays([["a", "a", "b"],[2, np.nan, 4]])) + ... columns=pd.MultiIndex.from_arrays( + ... [["a", "a", "b"], [2, np.nan, 4]])) >>> df.style.format_index({0: lambda v: v.upper()}, axis=1, precision=1) - ... # doctest: +SKIP + ... # doctest: +SKIP A B 2.0 nan 4.0 0 1 2 3 @@ -1329,7 +1363,7 @@ def format_index( >>> func = lambda s: 'STRING' if isinstance(s, str) else 'FLOAT' >>> df.style.format_index(func, axis=1, na_rep='MISS') - ... # doctest: +SKIP + ... # doctest: +SKIP STRING STRING FLOAT MISS FLOAT 0 1 2 3 @@ -1338,7 +1372,7 @@ def format_index( >>> df = pd.DataFrame([[1, 2, 3]], columns=['"A"', 'A&B', None]) >>> s = df.style.format_index('$ {0}', axis=1, escape="html", na_rep="NA") - ... # doctest: +SKIP + ... # doctest: +SKIP or element. """ - if "display_value" not in kwargs: + if "display_value" not in kwargs or kwargs["display_value"] is None: kwargs["display_value"] = value return { "type": html_element, @@ -1914,18 +2085,18 @@ def maybe_convert_css_to_tuples(style: CSSProperties) -> CSSList: ('border','1px solid red')] """ if isinstance(style, str): - s = style.split(";") - try: - return [ - (x.split(":")[0].strip(), x.split(":")[1].strip()) - for x in s - if x.strip() != "" - ] - except IndexError: + if style and ":" not in style: raise ValueError( "Styles supplied as string must follow CSS rule formats, " f"for example 'attr: val;'. '{style}' was given." ) + s = style.split(";") + return [ + (x.split(":")[0].strip(), ":".join(x.split(":")[1:]).strip()) + for x in s + if x.strip() != "" + ] + return style @@ -1977,6 +2148,11 @@ class Tooltips: tooltips: DataFrame, default empty DataFrame of strings aligned with underlying Styler data for tooltip display. + as_title_attribute: bool, default False + Flag to use title attribute based tooltips (True) or based + tooltips (False). + Add the tooltip text as title attribute to resultant s, return a list of text rows. @@ -478,12 +482,20 @@ def _expand_colspan_rowspan( rows : list of node-like List of s section : the section that the rows belong to (header, body or footer). + remainder: list[tuple[int, str | tuple, int]] | None + Any remainder from the expansion of previous section + overflow: bool + If true, return any partial rows as 'remainder'. If not, use up any + partial rows. True by default. Returns ------- list of list Each returned row is a list of str text, or tuple (text, link) if extract_links is not None. + remainder + Remaining partial rows if any. 
If overflow is False, an empty list + is returned. Notes ----- @@ -492,9 +504,7 @@ def _expand_colspan_rowspan( """ all_texts = [] # list of rows, each a list of str text: str | tuple - remainder: list[ - tuple[int, str | tuple, int] - ] = [] # list of (index, text, nrows) + remainder = remainder if remainder is not None else [] for tr in rows: texts = [] # the output for this row @@ -535,19 +545,20 @@ def _expand_colspan_rowspan( all_texts.append(texts) remainder = next_remainder - # Append rows that only appear because the previous row had non-1 - # rowspan - while remainder: - next_remainder = [] - texts = [] - for prev_i, prev_text, prev_rowspan in remainder: - texts.append(prev_text) - if prev_rowspan > 1: - next_remainder.append((prev_i, prev_text, prev_rowspan - 1)) - all_texts.append(texts) - remainder = next_remainder + if not overflow: + # Append rows that only appear because the previous row had non-1 + # rowspan + while remainder: + next_remainder = [] + texts = [] + for prev_i, prev_text, prev_rowspan in remainder: + texts.append(prev_text) + if prev_rowspan > 1: + next_remainder.append((prev_i, prev_text, prev_rowspan - 1)) + all_texts.append(texts) + remainder = next_remainder - return all_texts + return all_texts, remainder def _handle_hidden_tables(self, tbl_list, attr_name: str): """ @@ -591,14 +602,8 @@ class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser): :class:`pandas.io.html._HtmlFrameParser`. """ - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - from bs4 import SoupStrainer - - self._strainer = SoupStrainer("table") - def _parse_tables(self, document, match, attrs): - element_name = self._strainer.name + element_name = "table" tables = document.find_all(element_name, attrs=attrs) if not tables: raise ValueError("No tables found") @@ -619,7 +624,7 @@ def _parse_tables(self, document, match, attrs): result.append(table) unique_tables.add(table) if not result: - raise ValueError(f"No tables found matching pattern {repr(match.pattern)}") + raise ValueError(f"No tables found matching pattern {match.pattern!r}") return result def _href_getter(self, obj) -> str | None: @@ -629,7 +634,7 @@ def _href_getter(self, obj) -> str | None: def _text_getter(self, obj): return obj.text - def _equals_tag(self, obj, tag): + def _equals_tag(self, obj, tag) -> bool: return obj.name == tag def _parse_td(self, row): @@ -691,7 +696,7 @@ def _build_xpath_expr(attrs) -> str: if "class_" in attrs: attrs["class"] = attrs.pop("class_") - s = " and ".join([f"@{k}={repr(v)}" for k, v in attrs.items()]) + s = " and ".join([f"@{k}={v!r}" for k, v in attrs.items()]) return f"[{s}]" @@ -734,7 +739,7 @@ def _parse_tables(self, document, match, kwargs): # 1. 
check all descendants for the given pattern and only search tables # GH 49929 - xpath_expr = f"//table[.//text()[re:test(., {repr(pattern)})]]" + xpath_expr = f"//table[.//text()[re:test(., {pattern!r})]]" # if any table attributes were given build an xpath expression to # search for them @@ -755,10 +760,10 @@ def _parse_tables(self, document, match, kwargs): if "display:none" in elem.attrib.get("style", "").replace(" ", ""): elem.drop_tree() if not tables: - raise ValueError(f"No tables found matching regex {repr(pattern)}") + raise ValueError(f"No tables found matching regex {pattern!r}") return tables - def _equals_tag(self, obj, tag): + def _equals_tag(self, obj, tag) -> bool: return obj.tag == tag def _build_doc(self): @@ -780,36 +785,26 @@ def _build_doc(self): from lxml.etree import XMLSyntaxError from lxml.html import ( HTMLParser, - fromstring, parse, ) parser = HTMLParser(recover=True, encoding=self.encoding) - try: - if is_url(self.io): - with get_handle( - self.io, "r", storage_options=self.storage_options - ) as f: - r = parse(f.handle, parser=parser) - else: - # try to parse the input in the simplest way - r = parse(self.io, parser=parser) + if is_url(self.io): + with get_handle(self.io, "r", storage_options=self.storage_options) as f: + r = parse(f.handle, parser=parser) + else: + # try to parse the input in the simplest way try: - r = r.getroot() - except AttributeError: - pass - except (UnicodeDecodeError, OSError) as e: - # if the input is a blob of html goop - if not is_url(self.io): - r = fromstring(self.io, parser=parser) - - try: - r = r.getroot() - except AttributeError: - pass - else: - raise e + r = parse(self.io, parser=parser) + except OSError as err: + raise FileNotFoundError( + f"[Errno {errno.ENOENT}] {os.strerror(errno.ENOENT)}: {self.io}" + ) from err + try: + r = r.getroot() + except AttributeError: + pass else: if not hasattr(r, "text_content"): raise XMLSyntaxError("no text parsed from document", 0, 0, 0) @@ -914,7 +909,7 @@ def _parser_dispatch(flavor: HTMLFlavors | None) -> type[_HtmlFrameParser]: valid_parsers = list(_valid_parsers.keys()) if flavor not in valid_parsers: raise ValueError( - f"{repr(flavor)} is not a valid flavor, valid flavors are {valid_parsers}" + f"{flavor!r} is not a valid flavor, valid flavors are {valid_parsers}" ) if flavor in ("bs4", "html5lib"): @@ -938,7 +933,7 @@ def _validate_flavor(flavor): elif isinstance(flavor, abc.Iterable): if not all(isinstance(flav, str) for flav in flavor): raise TypeError( - f"Object of type {repr(type(flavor).__name__)} " + f"Object of type {type(flavor).__name__!r} " f"is not an iterable of strings" ) else: @@ -1059,7 +1054,7 @@ def read_html( io : str, path object, or file-like object String, path object (implementing ``os.PathLike[str]``), or file-like object implementing a string ``read()`` function. - The string can represent a URL or the HTML itself. Note that + The string can represent a URL. Note that lxml only accepts the http, ftp and file url protocols. If you have a URL that starts with ``'https'`` you might try removing the ``'s'``. @@ -1100,13 +1095,13 @@ def read_html( passed to lxml or Beautiful Soup. However, these attributes must be valid HTML table attributes to work correctly. For example, :: - attrs = {{'id': 'table'}} + attrs = {{"id": "table"}} is a valid attribute dictionary because the 'id' HTML tag attribute is a valid HTML attribute for *any* HTML tag as per `this document `__. 
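The ``io`` docstring above now says the string "can represent a URL" and drops "or the HTML itself": literal markup should be wrapped in a buffer rather than passed as a bare string. A sketch (assumes lxml or bs4+html5lib is available for parsing):

```python
from io import StringIO

import pandas as pd

html = "<table><tr><th>a</th></tr><tr><td>1</td></tr></table>"

# Bare strings are treated as paths/URLs; wrap literal HTML in StringIO.
tables = pd.read_html(StringIO(html))
print(tables[0])
```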
:: - attrs = {{'asdf': 'table'}} + attrs = {{"asdf": "table"}} is *not* a valid attribute dictionary because 'asdf' is not a valid HTML attribute even if it is a valid XML attribute. Valid HTML 4.01 @@ -1154,14 +1149,15 @@ def read_html( .. versionadded:: 1.5.0 - dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable' + dtype_backend : {{'numpy_nullable', 'pyarrow'}} Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). Behaviour is as follows: + (still experimental). If not specified, the default behavior + is to not use nullable data types. If specified, the behavior + is as follows: * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. + * ``"pyarrow"``: returns pyarrow-backed nullable + :class:`ArrowDtype` :class:`DataFrame` .. versionadded:: 2.0 @@ -1201,7 +1197,10 @@ def read_html( **after** `skiprows` is applied. This function will *always* return a list of :class:`DataFrame` *or* - it will fail, e.g., it will *not* return an empty list. + it will fail, i.e., it will *not* return an empty list, save for some + rare cases. + It might return an empty list in case of inputs with single row and + ``' in result + + # test 'Max' tooltip added as title attribute and css does not exist + assert "#T_ #T__row0_col2:hover .pd-t {\n visibility: visible;\n}" not in result + assert '#T_ #T__row0_col2 .pd-t::after {\n content: "Max";\n}' not in result + assert 'class="data row0 col2" title="Max">2' in result + + # test Nan, empty string and bad column ignored + assert "#T_ #T__row1_col0:hover .pd-t {\n visibility: visible;\n}" not in result + assert "#T_ #T__row1_col1:hover .pd-t {\n visibility: visible;\n}" not in result + assert "#T_ #T__row0_col1:hover .pd-t {\n visibility: visible;\n}" not in result + assert "#T_ #T__row1_col2:hover .pd-t {\n visibility: visible;\n}" not in result + assert "Bad-Col" not in result + assert 'class="data row0 col1" >1' in result + assert 'class="data row1 col0" >3' in result + assert 'class="data row1 col1" >4' in result + assert 'class="data row1 col2" >5' in result + assert 'class="data row2 col0" >6' in result + assert 'class="data row2 col1" >7' in result + assert 'class="data row2 col2" >8' in result + + +def test_tooltip_render_as_title_with_hidden_index_level(): + df = DataFrame( + data=[[0, 1, 2], [3, 4, 5], [6, 7, 8]], + columns=["A", "B", "C"], + index=MultiIndex.from_arrays( + [["x", "y", "z"], [1, 2, 3], ["aa", "bb", "cc"]], + names=["alpha", "num", "char"], + ), + ) + ttips = DataFrame( + # Test basic reindex and ignoring blank, and hide level 2 (num) from index + data=[["Min", "Max"], [np.nan, ""]], + columns=["A", "C"], + index=MultiIndex.from_arrays( + [["x", "y"], [1, 2], ["aa", "bb"]], names=["alpha", "num", "char"] + ), + ) + styler = Styler(df, uuid_len=0) + styler = styler.hide(axis=0, level=-1, names=True) + # GH 56605 + result = styler.set_tooltips(ttips, as_title_attribute=True).to_html() + + # test css not added + assert "#T_ .pd-t {\n visibility: hidden;\n" not in result + + # test 'Min' tooltip added as title attribute and css does not exist + assert "#T_ #T__row0_col0:hover .pd-t {\n visibility: visible;\n}" not in result + assert '#T_ #T__row0_col0 .pd-t::after {\n content: "Min";\n}' not in result + assert 'class="data row0 col0" title="Min">0' in result + + # test 'Max' tooltip added as title attribute and css does not exist + assert "#T_ #T__row0_col2:hover .pd-t {\n 
visibility: visible;\n}" not in result + assert '#T_ #T__row0_col2 .pd-t::after {\n content: "Max";\n}' not in result + assert 'class="data row0 col2" title="Max">2' in result + + # test Nan, empty string and bad column ignored + assert "#T_ #T__row1_col0:hover .pd-t {\n visibility: visible;\n}" not in result + assert "#T_ #T__row1_col1:hover .pd-t {\n visibility: visible;\n}" not in result + assert "#T_ #T__row0_col1:hover .pd-t {\n visibility: visible;\n}" not in result + assert "#T_ #T__row1_col2:hover .pd-t {\n visibility: visible;\n}" not in result + assert "Bad-Col" not in result + assert 'class="data row0 col1" >1' in result + assert 'class="data row1 col0" >3' in result + assert 'class="data row1 col1" >4' in result + assert 'class="data row1 col2" >5' in result + assert 'class="data row2 col0" >6' in result + assert 'class="data row2 col1" >7' in result + assert 'class="data row2 col2" >8' in result diff --git a/pandas/tests/io/formats/test_css.py b/pandas/tests/io/formats/test_css.py index db436d8283b99..642a562704344 100644 --- a/pandas/tests/io/formats/test_css.py +++ b/pandas/tests/io/formats/test_css.py @@ -38,30 +38,31 @@ def test_css_parse_normalisation(name, norm, abnorm): @pytest.mark.parametrize( - "invalid_css,remainder", + "invalid_css,remainder,msg", [ # No colon - ("hello-world", ""), - ("border-style: solid; hello-world", "border-style: solid"), + ("hello-world", "", "expected a colon"), + ("border-style: solid; hello-world", "border-style: solid", "expected a colon"), ( "border-style: solid; hello-world; font-weight: bold", "border-style: solid; font-weight: bold", + "expected a colon", ), # Unclosed string fail # Invalid size - ("font-size: blah", "font-size: 1em"), - ("font-size: 1a2b", "font-size: 1em"), - ("font-size: 1e5pt", "font-size: 1em"), - ("font-size: 1+6pt", "font-size: 1em"), - ("font-size: 1unknownunit", "font-size: 1em"), - ("font-size: 10", "font-size: 1em"), - ("font-size: 10 pt", "font-size: 1em"), + ("font-size: blah", "font-size: 1em", "Unhandled size"), + ("font-size: 1a2b", "font-size: 1em", "Unhandled size"), + ("font-size: 1e5pt", "font-size: 1em", "Unhandled size"), + ("font-size: 1+6pt", "font-size: 1em", "Unhandled size"), + ("font-size: 1unknownunit", "font-size: 1em", "Unhandled size"), + ("font-size: 10", "font-size: 1em", "Unhandled size"), + ("font-size: 10 pt", "font-size: 1em", "Unhandled size"), # Too many args - ("border-top: 1pt solid red green", "border-top: 1pt solid green"), + ("border-top: 1pt solid red green", "border-top: 1pt solid green", "Too many"), ], ) -def test_css_parse_invalid(invalid_css, remainder): - with tm.assert_produces_warning(CSSWarning): +def test_css_parse_invalid(invalid_css, remainder, msg): + with tm.assert_produces_warning(CSSWarning, match=msg): assert_same_resolution(invalid_css, remainder) @@ -120,7 +121,7 @@ def test_css_side_shorthands(shorthand, expansions): {top: "1pt", right: "4pt", bottom: "2pt", left: "0pt"}, ) - with tm.assert_produces_warning(CSSWarning): + with tm.assert_produces_warning(CSSWarning, match="Could not expand"): assert_resolves(f"{shorthand}: 1pt 1pt 1pt 1pt 1pt", {}) @@ -192,8 +193,7 @@ def test_css_border_shorthands(prop, expected): ( "margin: 1px; margin-top: 2px", "", - "margin-left: 1px; margin-right: 1px; " - "margin-bottom: 1px; margin-top: 2px", + "margin-left: 1px; margin-right: 1px; margin-bottom: 1px; margin-top: 2px", ), ("margin-top: 2px", "margin: 1px", "margin: 1px; margin-top: 2px"), ("margin: 1px", "margin-top: 2px", "margin: 1px"), @@ -243,7 +243,6 @@ def 
test_css_none_absent(style, equiv): ("02.54cm", "72pt"), ("25.4mm", "72pt"), ("101.6q", "72pt"), - ("101.6q", "72pt"), ], ) @pytest.mark.parametrize("relative_to", [None, "16pt"]) # invariant to inherited size diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 0ca29c219b55b..86682e8160762 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -2,16 +2,16 @@ Tests for the file pandas.io.formats.format, *not* tests for general formatting of pandas objects. """ + from datetime import datetime from io import StringIO -from pathlib import Path import re from shutil import get_terminal_size import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas as pd from pandas import ( @@ -32,55 +32,6 @@ import pandas.io.formats.format as fmt -@pytest.fixture(params=["string", "pathlike", "buffer"]) -def filepath_or_buffer_id(request): - """ - A fixture yielding test ids for filepath_or_buffer testing. - """ - return request.param - - -@pytest.fixture -def filepath_or_buffer(filepath_or_buffer_id, tmp_path): - """ - A fixture yielding a string representing a filepath, a path-like object - and a StringIO buffer. Also checks that buffer is not closed. - """ - if filepath_or_buffer_id == "buffer": - buf = StringIO() - yield buf - assert not buf.closed - else: - assert isinstance(tmp_path, Path) - if filepath_or_buffer_id == "pathlike": - yield tmp_path / "foo" - else: - yield str(tmp_path / "foo") - - -@pytest.fixture -def assert_filepath_or_buffer_equals( - filepath_or_buffer, filepath_or_buffer_id, encoding -): - """ - Assertion helper for checking filepath_or_buffer. - """ - if encoding is None: - encoding = "utf-8" - - def _assert_filepath_or_buffer_equals(expected): - if filepath_or_buffer_id == "string": - with open(filepath_or_buffer, encoding=encoding) as f: - result = f.read() - elif filepath_or_buffer_id == "pathlike": - result = filepath_or_buffer.read_text(encoding=encoding) - elif filepath_or_buffer_id == "buffer": - result = filepath_or_buffer.getvalue() - assert result == expected - - return _assert_filepath_or_buffer_equals - - def has_info_repr(df): r = repr(df) c1 = r.split("\n")[0].startswith("\n.. ...\n9 " + def test_repr_truncation_dataframe_attrs(self): + # GH#60455 + df = DataFrame([[0] * 10]) + df.attrs["b"] = DataFrame([]) + with option_context("display.max_columns", 2, "display.show_dimensions", False): + assert repr(df) == " 0 ... 9\n0 0 ... 0" + + def test_repr_truncation_series_with_dataframe_attrs(self): + # GH#60568 + ser = Series([0] * 10) + ser.attrs["b"] = DataFrame([]) + with option_context("display.max_rows", 2, "display.show_dimensions", False): + assert repr(ser) == "0 0\n ..\n9 0\ndtype: int64" + def test_max_colwidth_negative_int_raises(self): # Deprecation enforced from: # https://github.com/pandas-dev/pandas/issues/31532 @@ -417,6 +382,40 @@ def test_repr_min_rows(self): assert ".." not in repr(df) assert ".." 
not in df._repr_html_() + @pytest.mark.parametrize( + "data, format_option, expected_values", + [ + (12345.6789, "{:12.3f}", "12345.679"), + (None, "{:.3f}", "None"), + ("", "{:.2f}", ""), + (112345.6789, "{:6.3f}", "112345.679"), + ("foo foo", None, "foo      foo"), + (" foo", None, "foo"), + ( + "foo foo foo", + None, + "foo foo       foo", + ), # odd no.of spaces + ( + "foo foo foo", + None, + "foo foo    foo", + ), # even no.of spaces + ], + ) + def test_repr_float_formatting_html_output( + self, data, format_option, expected_values + ): + if format_option is not None: + with option_context("display.float_format", format_option.format): + df = DataFrame({"A": [data]}) + html_output = df._repr_html_() + assert expected_values in html_output + else: + df = DataFrame({"A": [data]}) + html_output = df._repr_html_() + assert expected_values in html_output + def test_str_max_colwidth(self): # GH 7856 df = DataFrame( @@ -1396,9 +1395,7 @@ def test_unicode_name_in_footer(self): sf = fmt.SeriesFormatter(s, name="\u05e2\u05d1\u05e8\u05d9\u05ea") sf._get_footer() # should not raise exception - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="Fixup when arrow is default" - ) + @pytest.mark.xfail(using_string_dtype(), reason="Fixup when arrow is default") def test_east_asian_unicode_series(self): # not aligned properly because of east asian width @@ -1773,9 +1770,7 @@ def chck_ncols(self, s): ncolsizes = len({len(line.strip()) for line in lines}) assert ncolsizes == 1 - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="change when arrow is default" - ) + @pytest.mark.xfail(using_string_dtype(), reason="change when arrow is default") def test_format_explicit(self): test_sers = gen_series_formatting() with option_context("display.max_rows", 4, "display.show_dimensions", False): @@ -2258,14 +2253,21 @@ def test_format_percentiles_integer_idx(self): "encoding, data", [(None, "abc"), ("utf-8", "abc"), ("gbk", "造成输出中文显示乱码"), ("foo", "abc")], ) +@pytest.mark.parametrize("filepath_or_buffer_id", ["string", "pathlike", "buffer"]) def test_filepath_or_buffer_arg( method, - filepath_or_buffer, - assert_filepath_or_buffer_equals, + tmp_path, encoding, data, filepath_or_buffer_id, ): + if filepath_or_buffer_id == "buffer": + filepath_or_buffer = StringIO() + elif filepath_or_buffer_id == "pathlike": + filepath_or_buffer = tmp_path / "foo" + else: + filepath_or_buffer = str(tmp_path / "foo") + df = DataFrame([data]) if method in ["to_latex"]: # uses styler implementation pytest.importorskip("jinja2") @@ -2281,7 +2283,17 @@ def test_filepath_or_buffer_arg( else: expected = getattr(df, method)() getattr(df, method)(buf=filepath_or_buffer, encoding=encoding) - assert_filepath_or_buffer_equals(expected) + encoding = encoding or "utf-8" + if filepath_or_buffer_id == "string": + with open(filepath_or_buffer, encoding=encoding) as f: + result = f.read() + elif filepath_or_buffer_id == "pathlike": + result = filepath_or_buffer.read_text(encoding=encoding) + elif filepath_or_buffer_id == "buffer": + result = filepath_or_buffer.getvalue() + assert result == expected + if filepath_or_buffer_id == "buffer": + assert not filepath_or_buffer.closed @pytest.mark.parametrize("method", ["to_string", "to_html", "to_latex"]) diff --git a/pandas/tests/io/formats/test_printing.py b/pandas/tests/io/formats/test_printing.py index acf2bc72c687d..7d154235d2c4a 100644 --- a/pandas/tests/io/formats/test_printing.py +++ b/pandas/tests/io/formats/test_printing.py @@ -1,12 +1,35 @@ # Note! 
This file is aimed specifically at pandas.io.formats.printing utility # functions, not the general printing of pandas objects. +from collections.abc import Mapping import string +import pytest + import pandas._config.config as cf +import pandas as pd + from pandas.io.formats import printing +@pytest.mark.parametrize( + "input_names, expected_names", + [ + (["'a b"], "['\\'a b']"), # Escape leading quote + (["test's b"], "['test\\'s b']"), # Escape apostrophe + (["'test' b"], "['\\'test\\' b']"), # Escape surrounding quotes + (["test b'"], "['test b\\'']"), # Escape single quote + (["test\n' b"], "['test\\n\\' b']"), # Escape quotes, preserve newline + ], +) +def test_formatted_index_names(input_names, expected_names): + # GH#60190 + df = pd.DataFrame({name: [1, 2, 3] for name in input_names}).set_index(input_names) + formatted_names = str(df.index.names) + + assert formatted_names == expected_names + + def test_adjoin(): data = [["a", "b", "c"], ["dd", "ee", "ff"], ["ggg", "hhh", "iii"]] expected = "a dd ggg\nb ee hhh\nc ff iii" @@ -16,6 +39,17 @@ def test_adjoin(): assert adjoined == expected +class MyMapping(Mapping): + def __getitem__(self, key): + return 4 + + def __iter__(self): + return iter(["a", "b"]) + + def __len__(self): + return 2 + + class TestPPrintThing: def test_repr_binary_type(self): letters = string.ascii_letters @@ -42,6 +76,15 @@ def test_repr_obeys_max_seq_limit(self): def test_repr_set(self): assert printing.pprint_thing({1}) == "{1}" + def test_repr_dict(self): + assert printing.pprint_thing({"a": 4, "b": 4}) == "{'a': 4, 'b': 4}" + + def test_repr_mapping(self): + assert printing.pprint_thing(MyMapping()) == "{'a': 4, 'b': 4}" + + def test_repr_frozenset(self): + assert printing.pprint_thing(frozenset([1, 2])) == "frozenset({1, 2})" + class TestFormatBase: def test_adjoin(self): diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 0db49a73621ea..6d762fdeb8d79 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -221,7 +221,7 @@ def test_to_csv_na_rep_nullable_string(self, nullable_string_dtype): def test_to_csv_date_format(self): # GH 10209 df_sec = DataFrame({"A": pd.date_range("20130101", periods=5, freq="s")}) - df_day = DataFrame({"A": pd.date_range("20130101", periods=5, freq="d")}) + df_day = DataFrame({"A": pd.date_range("20130101", periods=5, freq="D")}) expected_rows = [ ",A", @@ -482,10 +482,7 @@ def test_to_csv_string_with_crlf(self): # case 3: CRLF as line terminator # 'lineterminator' should not change inner element expected_crlf = ( - b"int,str_crlf\r\n" - b"1,abc\r\n" - b'2,"d\r\nef"\r\n' - b'3,"g\r\nh\r\n\r\ni"\r\n' + b'int,str_crlf\r\n1,abc\r\n2,"d\r\nef"\r\n3,"g\r\nh\r\n\r\ni"\r\n' ) df.to_csv(path, lineterminator="\r\n", index=False) with open(path, "rb") as f: @@ -744,15 +741,3 @@ def test_to_csv_iterative_compression_buffer(compression): pd.read_csv(buffer, compression=compression, index_col=0), df ) assert not buffer.closed - - -def test_to_csv_pos_args_deprecation(): - # GH-54229 - df = DataFrame({"a": [1, 2, 3]}) - msg = ( - r"Starting with pandas version 3.0 all arguments of to_csv except for the " - r"argument 'path_or_buf' will be keyword-only." 
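The printing tests above hinge on ``pprint_thing`` dispatching on the ``Mapping`` ABC rather than on ``dict`` alone, and on ``frozenset`` getting its own repr. A minimal sketch of the behaviour (``pprint_thing`` is an internal helper, so this is illustration, not public API):

```python
from collections.abc import Mapping

from pandas.io.formats import printing

class MyMapping(Mapping):
    # a non-dict Mapping: rendered like a dict by pprint_thing
    def __getitem__(self, key):
        return 4

    def __iter__(self):
        return iter(["a", "b"])

    def __len__(self):
        return 2

assert printing.pprint_thing({"a": 4, "b": 4}) == "{'a': 4, 'b': 4}"
assert printing.pprint_thing(MyMapping()) == "{'a': 4, 'b': 4}"
assert printing.pprint_thing(frozenset([1, 2])) == "frozenset({1, 2})"
```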
- ) - with tm.assert_produces_warning(FutureWarning, match=msg): - buffer = io.BytesIO() - df.to_csv(buffer, ";") diff --git a/pandas/tests/io/formats/test_to_excel.py b/pandas/tests/io/formats/test_to_excel.py index 927a9f4961f6f..b40201b9ba1e6 100644 --- a/pandas/tests/io/formats/test_to_excel.py +++ b/pandas/tests/io/formats/test_to_excel.py @@ -2,6 +2,7 @@ ExcelFormatter is tested implicitly in pandas/tests/io/excel """ + import string import pytest @@ -324,7 +325,7 @@ def test_css_to_excel_bad_colors(input_color): if input_color is not None: expected["fill"] = {"patternType": "solid"} - with tm.assert_produces_warning(CSSWarning): + with tm.assert_produces_warning(CSSWarning, match="Unhandled color format"): convert = CSSToExcelConverter() assert expected == convert(css) diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index 790ba92f70c40..9c75314b66fa2 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -15,7 +15,6 @@ get_option, option_context, ) -import pandas._testing as tm import pandas.io.formats.format as fmt @@ -95,8 +94,7 @@ def test_to_html_with_column_specific_col_space_raises(): ) msg = ( - "Col_space length\\(\\d+\\) should match " - "DataFrame number of columns\\(\\d+\\)" + "Col_space length\\(\\d+\\) should match DataFrame number of columns\\(\\d+\\)" ) with pytest.raises(ValueError, match=msg): df.to_html(col_space=[30, 40]) @@ -136,13 +134,14 @@ def test_to_html_with_empty_string_label(): @pytest.mark.parametrize( - "df,expected", + "df_data,expected", [ - (DataFrame({"\u03c3": np.arange(10.0)}), "unicode_1"), - (DataFrame({"A": ["\u03c3"]}), "unicode_2"), + ({"\u03c3": np.arange(10.0)}, "unicode_1"), + ({"A": ["\u03c3"]}, "unicode_2"), ], ) -def test_to_html_unicode(df, expected, datapath): +def test_to_html_unicode(df_data, expected, datapath): + df = DataFrame(df_data) expected = expected_html(datapath, expected) result = df.to_html() assert result == expected @@ -934,9 +933,11 @@ def test_repr_html(self, float_frame): def test_repr_html_mathjax(self): df = DataFrame([[1, 2], [3, 4]]) assert "tex2jax_ignore" not in df._repr_html_() + assert "mathjax_ignore" not in df._repr_html_() with option_context("display.html.use_mathjax", False): assert "tex2jax_ignore" in df._repr_html_() + assert "mathjax_ignore" in df._repr_html_() def test_repr_html_wide(self): max_cols = 20 @@ -1164,14 +1165,3 @@ def test_to_html_empty_complex_array(): "
<caption>`` HTML element. + Styler.set_td_classes : Set the ``class`` attribute of ``<td>
    `` HTML elements. + Styler.set_tooltips : Set the DataFrame of strings on ``Styler`` generating + ``:hover`` tooltips. Notes ----- @@ -2293,7 +2429,7 @@ def set_uuid(self, uuid: str) -> Styler: Examples -------- - >>> df = pd.DataFrame([[1, 2], [3, 4]], index=['A', 'B'], columns=['c1', 'c2']) + >>> df = pd.DataFrame([[1, 2], [3, 4]], index=["A", "B"], columns=["c1", "c2"]) You can get the `id` attributes with the following: @@ -2301,8 +2437,8 @@ def set_uuid(self, uuid: str) -> Styler: To add a title to column `c1`, its `id` is T_20a7d_level0_col0: - >>> df.style.set_uuid("T_20a7d_level0_col0") - ... .set_caption("Test") # doctest: +SKIP + >>> df.style.set_uuid("T_20a7d_level0_col0").set_caption("Test") + ... # doctest: +SKIP Please see: `Table visualization <../../user_guide/style.ipynb>`_ for more examples. @@ -2324,10 +2460,18 @@ def set_caption(self, caption: str | tuple | list) -> Styler: Returns ------- Styler + Instance of class with text set for ``
<caption>`` HTML element. + + See Also + -------- + Styler.set_td_classes : Set the ``class`` attribute of ``<td>
    `` HTML elements. + Styler.set_tooltips : Set the DataFrame of strings on ``Styler`` generating + ``:hover`` tooltips. + Styler.set_uuid : Set the uuid applied to ``id`` attributes of HTML elements. Examples -------- - >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) + >>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) >>> df.style.set_caption("test") # doctest: +SKIP Please see: @@ -2370,6 +2514,13 @@ def set_sticky( Returns ------- Styler + Instance of class with CSS set for permanently displaying headers + in scrolling frame. + + See Also + -------- + Styler.set_properties : Set defined CSS-properties to each ```` + HTML element for the given subset. Notes ----- @@ -2383,7 +2534,7 @@ def set_sticky( Examples -------- - >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) + >>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) >>> df.style.set_sticky(axis="index") # doctest: +SKIP Please see: @@ -2443,7 +2594,7 @@ def set_sticky( for i, level in enumerate(levels_): styles.append( { - "selector": f"thead tr:nth-child({level+1}) th", + "selector": f"thead tr:nth-child({level + 1}) th", "props": props + ( f"top:{i * pixel_size}px; height:{pixel_size}px; " @@ -2454,7 +2605,7 @@ def set_sticky( if not all(name is None for name in self.index.names): styles.append( { - "selector": f"thead tr:nth-child({obj.nlevels+1}) th", + "selector": f"thead tr:nth-child({obj.nlevels + 1}) th", "props": props + ( f"top:{(len(levels_)) * pixel_size}px; " @@ -2474,7 +2625,7 @@ def set_sticky( styles.extend( [ { - "selector": f"thead tr th:nth-child({level+1})", + "selector": f"thead tr th:nth-child({level + 1})", "props": props_ + "z-index:3 !important;", }, { @@ -2530,6 +2681,7 @@ def set_table_styles( Returns ------- Styler + Instance of class with specified table styles set. See Also -------- @@ -2544,49 +2696,55 @@ def set_table_styles( .. code-block:: python - css_class_names = {"row_heading": "row_heading", - "col_heading": "col_heading", - "index_name": "index_name", - "col": "col", - "row": "row", - "col_trim": "col_trim", - "row_trim": "row_trim", - "level": "level", - "data": "data", - "blank": "blank", - "foot": "foot"} + css_class_names = { + "row_heading": "row_heading", + "col_heading": "col_heading", + "index_name": "index_name", + "col": "col", + "row": "row", + "col_trim": "col_trim", + "row_trim": "row_trim", + "level": "level", + "data": "data", + "blank": "blank", + "foot": "foot", + } Examples -------- - >>> df = pd.DataFrame(np.random.randn(10, 4), - ... columns=['A', 'B', 'C', 'D']) + >>> df = pd.DataFrame(np.random.randn(10, 4), columns=["A", "B", "C", "D"]) >>> df.style.set_table_styles( - ... [{'selector': 'tr:hover', - ... 'props': [('background-color', 'yellow')]}] + ... [{"selector": "tr:hover", "props": [("background-color", "yellow")]}] ... ) # doctest: +SKIP Or with CSS strings >>> df.style.set_table_styles( - ... [{'selector': 'tr:hover', - ... 'props': 'background-color: yellow; font-size: 1em;'}] + ... [ + ... { + ... "selector": "tr:hover", + ... "props": "background-color: yellow; font-size: 1em;", + ... } + ... ] ... ) # doctest: +SKIP Adding column styling by name - >>> df.style.set_table_styles({ - ... 'A': [{'selector': '', - ... 'props': [('color', 'red')]}], - ... 'B': [{'selector': 'td', - ... 'props': 'color: blue;'}] - ... }, overwrite=False) # doctest: +SKIP + >>> df.style.set_table_styles( + ... { + ... "A": [{"selector": "", "props": [("color", "red")]}], + ... "B": [{"selector": "td", "props": "color: blue;"}], + ... }, + ... overwrite=False, + ... 
) # doctest: +SKIP Adding row styling - >>> df.style.set_table_styles({ - ... 0: [{'selector': 'td:hover', - ... 'props': [('font-size', '25px')]}] - ... }, axis=1, overwrite=False) # doctest: +SKIP + >>> df.style.set_table_styles( + ... {0: [{"selector": "td:hover", "props": [("font-size", "25px")]}]}, + ... axis=1, + ... overwrite=False, + ... ) # doctest: +SKIP See `Table Visualization <../../user_guide/style.ipynb>`_ user guide for more details. @@ -2655,6 +2813,13 @@ def hide( Returns ------- Styler + Instance of class with specified headers/rows/columns hidden from display. + + See Also + -------- + Styler.apply : Apply a CSS-styling function column-wise, row-wise, + or table-wise. + Styler.map : Apply a CSS-styling function elementwise. Notes ----- @@ -2717,7 +2882,7 @@ def hide( -------- Simple application hiding specific rows: - >>> df = pd.DataFrame([[1,2], [3,4], [5,6]], index=["a", "b", "c"]) + >>> df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], index=["a", "b", "c"]) >>> df.style.hide(["a", "b"]) # doctest: +SKIP 0 1 c 5 6 @@ -2725,7 +2890,7 @@ def hide( Hide the index and retain the data values: >>> midx = pd.MultiIndex.from_product([["x", "y"], ["a", "b", "c"]]) - >>> df = pd.DataFrame(np.random.randn(6,6), index=midx, columns=midx) + >>> df = pd.DataFrame(np.random.randn(6, 6), index=midx, columns=midx) >>> df.style.format("{:.1f}").hide() # doctest: +SKIP x y a b c a b c @@ -2739,7 +2904,7 @@ def hide( Hide specific rows in a MultiIndex but retain the index: >>> df.style.format("{:.1f}").hide(subset=(slice(None), ["a", "c"])) - ... # doctest: +SKIP + ... # doctest: +SKIP x y a b c a b c x b 0.7 1.0 1.3 1.5 -0.0 -0.2 @@ -2748,7 +2913,7 @@ def hide( Hide specific rows and the index through chaining: >>> df.style.format("{:.1f}").hide(subset=(slice(None), ["a", "c"])).hide() - ... # doctest: +SKIP + ... # doctest: +SKIP x y a b c a b c 0.7 1.0 1.3 1.5 -0.0 -0.2 @@ -2893,6 +3058,7 @@ def background_gradient( Returns ------- Styler + Instance of class with {name} colored in gradient style. See Also -------- @@ -2915,10 +3081,14 @@ def background_gradient( Examples -------- - >>> df = pd.DataFrame(columns=["City", "Temp (c)", "Rain (mm)", "Wind (m/s)"], - ... data=[["Stockholm", 21.6, 5.0, 3.2], - ... ["Oslo", 22.4, 13.3, 3.1], - ... ["Copenhagen", 24.5, 0.0, 6.7]]) + >>> df = pd.DataFrame( + ... columns=["City", "Temp (c)", "Rain (mm)", "Wind (m/s)"], + ... data=[ + ... ["Stockholm", 21.6, 5.0, 3.2], + ... ["Oslo", 22.4, 13.3, 3.1], + ... ["Copenhagen", 24.5, 0.0, 6.7], + ... ], + ... ) Shading the values column-wise, with ``axis=0``, preselecting numeric columns @@ -2947,14 +3117,14 @@ def background_gradient( Setting a ``gmap`` and applying to all columns with another ``cmap`` >>> df.style.{name}_gradient(axis=0, gmap=df['Temp (c)'], cmap='YlOrRd') - ... # doctest: +SKIP + ... # doctest: +SKIP .. figure:: ../../_static/style/{image_prefix}_gmap.png Setting the gradient map for a dataframe (i.e. ``axis=None``), we need to explicitly state ``subset`` to match the ``gmap`` shape - >>> gmap = np.array([[1,2,3], [2,3,4], [3,4,5]]) + >>> gmap = np.array([[1, 2, 3], [2, 3, 4], [3, 4, 5]]) >>> df.style.{name}_gradient(axis=None, gmap=gmap, ... cmap='YlOrRd', subset=['Temp (c)', 'Rain (mm)', 'Wind (m/s)'] ... 
) # doctest: +SKIP @@ -3026,6 +3196,13 @@ def set_properties(self, subset: Subset | None = None, **kwargs) -> Styler: Returns ------- Styler + Instance of class with CSS-properties set for each ```` HTML element + in the given subset + + See Also + -------- + Styler.set_sticky : Add CSS to permanently display the index or column + headers in a scrolling frame. Notes ----- @@ -3036,7 +3213,7 @@ def set_properties(self, subset: Subset | None = None, **kwargs) -> Styler: -------- >>> df = pd.DataFrame(np.random.randn(10, 4)) >>> df.style.set_properties(color="white", align="right") # doctest: +SKIP - >>> df.style.set_properties(**{'background-color': 'yellow'}) # doctest: +SKIP + >>> df.style.set_properties(**{"background-color": "yellow"}) # doctest: +SKIP See `Table Visualization <../../user_guide/style.ipynb>`_ user guide for more details. @@ -3045,7 +3222,7 @@ def set_properties(self, subset: Subset | None = None, **kwargs) -> Styler: return self.map(lambda x: values, subset=subset) @Substitution(subset=subset_args) - def bar( # pylint: disable=disallowed-name + def bar( self, subset: Subset | None = None, axis: Axis | None = 0, @@ -3123,6 +3300,13 @@ def bar( # pylint: disable=disallowed-name Returns ------- Styler + Contains list-like attribute with bar chart data as formatted CSS. + + See Also + -------- + PlotAccessor.bar : Vertical bar plot. + PlotAccessor.line : Plot Series or DataFrame as lines. + PlotAccessor.pie : Generate a pie plot. Notes ----- @@ -3132,8 +3316,8 @@ def bar( # pylint: disable=disallowed-name Examples -------- - >>> df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]}) - >>> df.style.bar(subset=['A'], color='gray') # doctest: +SKIP + >>> df = pd.DataFrame({"A": [1, 2, 3, 4], "B": [3, 4, 5, 6]}) + >>> df.style.bar(subset=["A"], color="gray") # doctest: +SKIP """ if color is None and cmap is None: color = "#d65f5f" @@ -3201,6 +3385,7 @@ def highlight_null( Returns ------- Styler + Instance of class where null values are highlighted with given style. See Also -------- @@ -3211,8 +3396,8 @@ def highlight_null( Examples -------- - >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, np.nan]}) - >>> df.style.highlight_null(color='yellow') # doctest: +SKIP + >>> df = pd.DataFrame({"A": [1, 2], "B": [3, np.nan]}) + >>> df.style.highlight_null(color="yellow") # doctest: +SKIP Please see: `Table Visualization <../../user_guide/style.ipynb>`_ for more examples. @@ -3255,6 +3440,7 @@ def highlight_max( Returns ------- Styler + Instance of class where max value is highlighted in given style. See Also -------- @@ -3265,8 +3451,8 @@ def highlight_max( Examples -------- - >>> df = pd.DataFrame({'A': [2, 1], 'B': [3, 4]}) - >>> df.style.highlight_max(color='yellow') # doctest: +SKIP + >>> df = pd.DataFrame({"A": [2, 1], "B": [3, 4]}) + >>> df.style.highlight_max(color="yellow") # doctest: +SKIP Please see: `Table Visualization <../../user_guide/style.ipynb>`_ for more examples. @@ -3311,6 +3497,7 @@ def highlight_min( Returns ------- Styler + Instance of class where min value is highlighted in given style. See Also -------- @@ -3321,8 +3508,8 @@ def highlight_min( Examples -------- - >>> df = pd.DataFrame({'A': [2, 1], 'B': [3, 4]}) - >>> df.style.highlight_min(color='yellow') # doctest: +SKIP + >>> df = pd.DataFrame({"A": [2, 1], "B": [3, 4]}) + >>> df.style.highlight_min(color="yellow") # doctest: +SKIP Please see: `Table Visualization <../../user_guide/style.ipynb>`_ for more examples. 
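The additions above are docstring-only (``Returns`` descriptions and ``See Also`` cross-references), so behaviour is unchanged. For reference, a short sketch chaining the documented highlighters, which need no matplotlib, with ``bar``, which imports it only when a ``cmap`` is passed:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"A": [2, 1], "B": [3, np.nan]})
styler = (
    df.style.highlight_max(color="yellow")  # per-column maxima
    .highlight_null(color="red")            # the NaN cell
    .bar(subset=["A"], color="gray")        # in-cell bars via CSS linear-gradient
)
html = styler.to_html()  # all three effects are emitted as inline CSS
```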
@@ -3375,6 +3562,7 @@ def highlight_between( Returns ------- Styler + Instance of class with range highlighted in given style. See Also -------- @@ -3401,11 +3589,13 @@ def highlight_between( -------- Basic usage - >>> df = pd.DataFrame({ - ... 'One': [1.2, 1.6, 1.5], - ... 'Two': [2.9, 2.1, 2.5], - ... 'Three': [3.1, 3.2, 3.8], - ... }) + >>> df = pd.DataFrame( + ... { + ... "One": [1.2, 1.6, 1.5], + ... "Two": [2.9, 2.1, 2.5], + ... "Three": [3.1, 3.2, 3.8], + ... } + ... ) >>> df.style.highlight_between(left=2.1, right=2.9) # doctest: +SKIP .. figure:: ../../_static/style/hbetw_basic.png @@ -3413,23 +3603,29 @@ def highlight_between( Using a range input sequence along an ``axis``, in this case setting a ``left`` and ``right`` for each column individually - >>> df.style.highlight_between(left=[1.4, 2.4, 3.4], right=[1.6, 2.6, 3.6], - ... axis=1, color="#fffd75") # doctest: +SKIP + >>> df.style.highlight_between( + ... left=[1.4, 2.4, 3.4], right=[1.6, 2.6, 3.6], axis=1, color="#fffd75" + ... ) # doctest: +SKIP .. figure:: ../../_static/style/hbetw_seq.png Using ``axis=None`` and providing the ``left`` argument as an array that matches the input DataFrame, with a constant ``right`` - >>> df.style.highlight_between(left=[[2,2,3],[2,2,3],[3,3,3]], right=3.5, - ... axis=None, color="#fffd75") # doctest: +SKIP + >>> df.style.highlight_between( + ... left=[[2, 2, 3], [2, 2, 3], [3, 3, 3]], + ... right=3.5, + ... axis=None, + ... color="#fffd75", + ... ) # doctest: +SKIP .. figure:: ../../_static/style/hbetw_axNone.png Using ``props`` instead of default background coloring - >>> df.style.highlight_between(left=1.5, right=3.5, - ... props='font-weight:bold;color:#e83e8c') # doctest: +SKIP + >>> df.style.highlight_between( + ... left=1.5, right=3.5, props="font-weight:bold;color:#e83e8c" + ... ) # doctest: +SKIP .. figure:: ../../_static/style/hbetw_props.png """ @@ -3487,6 +3683,7 @@ def highlight_quantile( Returns ------- Styler + Instance of class where values in quantile highlighted with given style. See Also -------- @@ -3503,23 +3700,27 @@ def highlight_quantile( -------- Using ``axis=None`` and apply a quantile to all collective data - >>> df = pd.DataFrame(np.arange(10).reshape(2,5) + 1) + >>> df = pd.DataFrame(np.arange(10).reshape(2, 5) + 1) >>> df.style.highlight_quantile(axis=None, q_left=0.8, color="#fffd75") - ... # doctest: +SKIP + ... # doctest: +SKIP .. figure:: ../../_static/style/hq_axNone.png Or highlight quantiles row-wise or column-wise, in this case by row-wise >>> df.style.highlight_quantile(axis=1, q_left=0.8, color="#fffd75") - ... # doctest: +SKIP + ... # doctest: +SKIP .. figure:: ../../_static/style/hq_ax1.png Use ``props`` instead of default background coloring - >>> df.style.highlight_quantile(axis=None, q_left=0.2, q_right=0.8, - ... props='font-weight:bold;color:#e83e8c') # doctest: +SKIP + >>> df.style.highlight_quantile( + ... axis=None, + ... q_left=0.2, + ... q_right=0.8, + ... props="font-weight:bold;color:#e83e8c", + ... ) # doctest: +SKIP .. figure:: ../../_static/style/hq_props.png """ @@ -3588,12 +3789,18 @@ def from_custom_template( Has the correct ``env``,``template_html``, ``template_html_table`` and ``template_html_style`` class attributes set. + See Also + -------- + Styler.export : Export the styles applied to the current Styler. + Styler.use : Set the styles on the current Styler. + Examples -------- >>> from pandas.io.formats.style import Styler - >>> EasyStyler = Styler.from_custom_template("path/to/template", - ... "template.tpl", - ... 
) # doctest: +SKIP + >>> EasyStyler = Styler.from_custom_template( + ... "path/to/template", + ... "template.tpl", + ... ) # doctest: +SKIP >>> df = pd.DataFrame({"A": [1, 2]}) >>> EasyStyler(df) # doctest: +SKIP @@ -3614,7 +3821,28 @@ class MyStyler(cls): # type: ignore[valid-type,misc] return MyStyler - def pipe(self, func: Callable, *args, **kwargs): + @overload + def pipe( + self, + func: Callable[Concatenate[Self, P], T], + *args: P.args, + **kwargs: P.kwargs, + ) -> T: ... + + @overload + def pipe( + self, + func: tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, + ) -> T: ... + + def pipe( + self, + func: Callable[Concatenate[Self, P], T] | tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, + ) -> T: """ Apply ``func(self, *args, **kwargs)``, and return the result. @@ -3654,9 +3882,7 @@ def pipe(self, func: Callable, *args, **kwargs): .. code-block:: python - (df.style.format(precision=3) - .pipe(g, arg1=a) - .pipe(f, arg2=b, arg3=c)) + (df.style.format(precision=3).pipe(g, arg1=a).pipe(f, arg2=b, arg3=c)) In particular, this allows users to define functions that take a styler object, along with other parameters, and return the styler after @@ -3672,10 +3898,10 @@ def pipe(self, func: Callable, *args, **kwargs): can be easily applied to a generic styler in a single ``pipe`` call. >>> def some_highlights(styler, min_color="red", max_color="blue"): - ... styler.highlight_min(color=min_color, axis=None) - ... styler.highlight_max(color=max_color, axis=None) - ... styler.highlight_null() - ... return styler + ... styler.highlight_min(color=min_color, axis=None) + ... styler.highlight_max(color=max_color, axis=None) + ... styler.highlight_null() + ... return styler >>> df = pd.DataFrame([[1, 2, 3, pd.NA], [pd.NA, 4, 5, 6]], dtype="Int64") >>> df.style.pipe(some_highlights, min_color="green") # doctest: +SKIP @@ -3684,9 +3910,11 @@ def pipe(self, func: Callable, *args, **kwargs): Since the method returns a ``Styler`` object it can be chained with other methods as if applying the underlying highlighters directly. - >>> (df.style.format("{:.1f}") - ... .pipe(some_highlights, min_color="green") - ... .highlight_between(left=2, right=5)) # doctest: +SKIP + >>> ( + ... df.style.format("{:.1f}") + ... .pipe(some_highlights, min_color="green") + ... .highlight_between(left=2, right=5) + ... ) # doctest: +SKIP .. figure:: ../../_static/style/df_pipe_hl2.png @@ -3705,8 +3933,9 @@ def pipe(self, func: Callable, *args, **kwargs): >>> def highlight_last_level(styler): ... return styler.apply_index( - ... lambda v: "background-color: pink; color: yellow", axis="columns", - ... level=styler.columns.nlevels-1 + ... lambda v: "background-color: pink; color: yellow", + ... axis="columns", + ... level=styler.columns.nlevels - 1, ... ) # doctest: +SKIP >>> df.columns = pd.MultiIndex.from_product([["A", "B"], ["X", "Y"]]) >>> df.style.pipe(highlight_last_level) # doctest: +SKIP @@ -3723,6 +3952,7 @@ def pipe(self, func: Callable, *args, **kwargs): ... return np.where( ... styler.data.isna().any(), "background-color: red;", "" ... ) + ... ... 
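The ``pipe`` rework above replaces a bare ``Callable`` signature with ``overload``s so that, for the plain-callable form, type checkers validate ``*args``/``**kwargs`` against the callee and infer the return type. A standalone sketch of the same ``Concatenate``/``ParamSpec`` pattern (Python 3.10+; the class and function names here are illustrative, not pandas API):

```python
from collections.abc import Callable
from typing import Concatenate, ParamSpec, TypeVar

P = ParamSpec("P")
T = TypeVar("T")

class Chainable:
    def pipe(
        self,
        func: Callable[Concatenate["Chainable", P], T],
        *args: P.args,
        **kwargs: P.kwargs,
    ) -> T:
        # checkers match *args/**kwargs against func's remaining parameters
        # and type the pipe() result as func's return type T
        return func(self, *args, **kwargs)

def describe(obj: Chainable, label: str) -> str:
    return f"{label}: {type(obj).__name__}"

assert Chainable().pipe(describe, label="x") == "x: Chainable"
```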
return styler.apply_index(dynamic_highlight, axis=1, level=level) >>> df.style.pipe(highlight_header_missing, level=1) # doctest: +SKIP @@ -3771,7 +4001,7 @@ def _validate_apply_axis_arg( f"operations is a Series with 'axis in [0,1]'" ) if isinstance(arg, (Series, DataFrame)): # align indx / cols to data - arg = arg.reindex_like(data, method=None).to_numpy(**dtype) + arg = arg.reindex_like(data).to_numpy(**dtype) else: arg = np.asarray(arg, **dtype) assert isinstance(arg, np.ndarray) # mypy requirement @@ -3794,7 +4024,7 @@ def _background_gradient( vmax: float | None = None, gmap: Sequence | np.ndarray | DataFrame | Series | None = None, text_only: bool = False, -): +) -> list[str] | DataFrame: """ Color background in a range according to the data or a gradient map """ @@ -3803,61 +4033,61 @@ def _background_gradient( else: # else validate gmap against the underlying data gmap = _validate_apply_axis_arg(gmap, "gmap", float, data) - with _mpl(Styler.background_gradient) as (_, _matplotlib): - smin = np.nanmin(gmap) if vmin is None else vmin - smax = np.nanmax(gmap) if vmax is None else vmax - rng = smax - smin - # extend lower / upper bounds, compresses color range - norm = _matplotlib.colors.Normalize(smin - (rng * low), smax + (rng * high)) + smin = np.nanmin(gmap) if vmin is None else vmin + smax = np.nanmax(gmap) if vmax is None else vmax + rng = smax - smin + _matplotlib = import_optional_dependency( + "matplotlib", extra="Styler.background_gradient requires matplotlib." + ) + # extend lower / upper bounds, compresses color range + norm = _matplotlib.colors.Normalize(smin - (rng * low), smax + (rng * high)) + + if cmap is None: + rgbas = _matplotlib.colormaps[_matplotlib.rcParams["image.cmap"]](norm(gmap)) + else: + rgbas = _matplotlib.colormaps.get_cmap(cmap)(norm(gmap)) + + def relative_luminance(rgba) -> float: + """ + Calculate relative luminance of a color. + + The calculation adheres to the W3C standards + (https://www.w3.org/WAI/GL/wiki/Relative_luminance) + + Parameters + ---------- + color : rgb or rgba tuple - if cmap is None: - rgbas = _matplotlib.colormaps[_matplotlib.rcParams["image.cmap"]]( - norm(gmap) + Returns + ------- + float + The relative luminance as a value from 0 to 1 + """ + r, g, b = ( + x / 12.92 if x <= 0.04045 else ((x + 0.055) / 1.055) ** 2.4 + for x in rgba[:3] + ) + return 0.2126 * r + 0.7152 * g + 0.0722 * b + + def css(rgba, text_only) -> str: + if not text_only: + dark = relative_luminance(rgba) < text_color_threshold + text_color = "#f1f1f1" if dark else "#000000" + return ( + f"background-color: {_matplotlib.colors.rgb2hex(rgba)};" + f"color: {text_color};" ) else: - rgbas = _matplotlib.colormaps.get_cmap(cmap)(norm(gmap)) - - def relative_luminance(rgba) -> float: - """ - Calculate relative luminance of a color. 
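``relative_luminance``, now hoisted out of the removed ``_mpl`` context manager, implements the W3C formula: each sRGB channel is linearized, then the channels are combined with perceptual weights. A self-contained check of the same arithmetic:

```python
def relative_luminance(rgba) -> float:
    # linearize sRGB channels, then apply the W3C luminance weights
    r, g, b = (
        x / 12.92 if x <= 0.04045 else ((x + 0.055) / 1.055) ** 2.4
        for x in rgba[:3]
    )
    return 0.2126 * r + 0.7152 * g + 0.0722 * b

assert abs(relative_luminance((1.0, 1.0, 1.0, 1.0)) - 1.0) < 1e-9  # white
assert relative_luminance((0.0, 0.0, 0.0, 1.0)) == 0.0             # black
# background_gradient switches to light "#f1f1f1" text when the cell's
# background luminance falls below text_color_threshold (default 0.408)
```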
- - The calculation adheres to the W3C standards - (https://www.w3.org/WAI/GL/wiki/Relative_luminance) - - Parameters - ---------- - color : rgb or rgba tuple - - Returns - ------- - float - The relative luminance as a value from 0 to 1 - """ - r, g, b = ( - x / 12.92 if x <= 0.04045 else ((x + 0.055) / 1.055) ** 2.4 - for x in rgba[:3] - ) - return 0.2126 * r + 0.7152 * g + 0.0722 * b - - def css(rgba, text_only) -> str: - if not text_only: - dark = relative_luminance(rgba) < text_color_threshold - text_color = "#f1f1f1" if dark else "#000000" - return ( - f"background-color: {_matplotlib.colors.rgb2hex(rgba)};" - f"color: {text_color};" - ) - else: - return f"color: {_matplotlib.colors.rgb2hex(rgba)};" + return f"color: {_matplotlib.colors.rgb2hex(rgba)};" - if data.ndim == 1: - return [css(rgba, text_only) for rgba in rgbas] - else: - return DataFrame( - [[css(rgba, text_only) for rgba in row] for row in rgbas], - index=data.index, - columns=data.columns, - ) + if data.ndim == 1: + return [css(rgba, text_only) for rgba in rgbas] + else: + return DataFrame( + [[css(rgba, text_only) for rgba in row] for row in rgbas], + index=data.index, + columns=data.columns, + ) def _highlight_between( @@ -3990,8 +4220,10 @@ def css_bar(start: float, end: float, color: str) -> str: if end > start: cell_css += "background: linear-gradient(90deg," if start > 0: - cell_css += f" transparent {start*100:.1f}%, {color} {start*100:.1f}%," - cell_css += f" {color} {end*100:.1f}%, transparent {end*100:.1f}%)" + cell_css += ( + f" transparent {start * 100:.1f}%, {color} {start * 100:.1f}%," + ) + cell_css += f" {color} {end * 100:.1f}%, transparent {end * 100:.1f}%)" return cell_css def css_calc(x, left: float, right: float, align: str, color: str | list | tuple): @@ -4095,20 +4327,22 @@ def css_calc(x, left: float, right: float, align: str, color: str | list | tuple rgbas = None if cmap is not None: # use the matplotlib colormap input - with _mpl(Styler.bar) as (_, _matplotlib): - cmap = ( - _matplotlib.colormaps[cmap] - if isinstance(cmap, str) - else cmap # assumed to be a Colormap instance as documented - ) - norm = _matplotlib.colors.Normalize(left, right) - rgbas = cmap(norm(values)) - if data.ndim == 1: - rgbas = [_matplotlib.colors.rgb2hex(rgba) for rgba in rgbas] - else: - rgbas = [ - [_matplotlib.colors.rgb2hex(rgba) for rgba in row] for row in rgbas - ] + _matplotlib = import_optional_dependency( + "matplotlib", extra="Styler.bar requires matplotlib." 
+ ) + cmap = ( + _matplotlib.colormaps[cmap] + if isinstance(cmap, str) + else cmap # assumed to be a Colormap instance as documented + ) + norm = _matplotlib.colors.Normalize(left, right) + rgbas = cmap(norm(values)) + if data.ndim == 1: + rgbas = [_matplotlib.colors.rgb2hex(rgba) for rgba in rgbas] + else: + rgbas = [ + [_matplotlib.colors.rgb2hex(rgba) for rgba in row] for row in rgbas + ] assert isinstance(align, str) # mypy: should now be in [left, right, mid, zero] if data.ndim == 1: diff --git a/pandas/io/formats/style_render.py b/pandas/io/formats/style_render.py index 416b263ba8497..6752c83d5169b 100644 --- a/pandas/io/formats/style_render.py +++ b/pandas/io/formats/style_render.py @@ -1,13 +1,15 @@ from __future__ import annotations from collections import defaultdict -from collections.abc import Sequence +from collections.abc import ( + Callable, + Sequence, +) from functools import partial import re from typing import ( TYPE_CHECKING, Any, - Callable, DefaultDict, Optional, TypedDict, @@ -75,6 +77,7 @@ class StylerRenderer: template_html_table = env.get_template("html_table.tpl") template_html_style = env.get_template("html_style.tpl") template_latex = env.get_template("latex.tpl") + template_typst = env.get_template("typst.tpl") template_string = env.get_template("string.tpl") def __init__( @@ -140,9 +143,15 @@ def __init__( self._display_funcs_index: DefaultDict[ # maps (row, level) -> format func tuple[int, int], Callable[[Any], str] ] = defaultdict(lambda: partial(_default_formatter, precision=precision)) + self._display_funcs_index_names: DefaultDict[ # maps index level -> format func + int, Callable[[Any], str] + ] = defaultdict(lambda: partial(_default_formatter, precision=precision)) self._display_funcs_columns: DefaultDict[ # maps (level, col) -> format func tuple[int, int], Callable[[Any], str] ] = defaultdict(lambda: partial(_default_formatter, precision=precision)) + self._display_funcs_column_names: DefaultDict[ # maps col level -> format func + int, Callable[[Any], str] + ] = defaultdict(lambda: partial(_default_formatter, precision=precision)) def _render( self, @@ -224,6 +233,21 @@ def _render_latex( d.update(kwargs) return self.template_latex.render(**d) + def _render_typst( + self, + sparse_index: bool, + sparse_columns: bool, + max_rows: int | None = None, + max_cols: int | None = None, + **kwargs, + ) -> str: + """ + Render a Styler in typst format + """ + d = self._render(sparse_index, sparse_columns, max_rows, max_cols) + d.update(kwargs) + return self.template_typst.render(**d) + def _render_string( self, sparse_index: bool, @@ -314,9 +338,9 @@ def _translate( max_cols, ) - self.cellstyle_map_columns: DefaultDict[ - tuple[CSSPair, ...], list[str] - ] = defaultdict(list) + self.cellstyle_map_columns: DefaultDict[tuple[CSSPair, ...], list[str]] = ( + defaultdict(list) + ) head = self._translate_header(sparse_cols, max_cols) d.update({"head": head}) @@ -329,9 +353,9 @@ def _translate( self.cellstyle_map: DefaultDict[tuple[CSSPair, ...], list[str]] = defaultdict( list ) - self.cellstyle_map_index: DefaultDict[ - tuple[CSSPair, ...], list[str] - ] = defaultdict(list) + self.cellstyle_map_index: DefaultDict[tuple[CSSPair, ...], list[str]] = ( + defaultdict(list) + ) body: list = self._translate_body(idx_lengths, max_rows, max_cols) d.update({"body": body}) @@ -358,9 +382,11 @@ def _translate( if not get_option("styler.html.mathjax"): table_attr = table_attr or "" if 'class="' in table_attr: - table_attr = table_attr.replace('class="', 'class="tex2jax_ignore ') 
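The new ``typst.tpl`` template renders one bracketed Typst cell per header and body value, with ``columns:`` taken from the length of the first header row. Assuming the public wrapper this hook is paired with (``Styler.to_typst``, added in pandas 3.0 and requiring jinja2), a small sketch:

```python
import pandas as pd

df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
print(df.style.to_typst())
# Roughly (index blank + two columns = 3):
# #table(
#   columns: 3,
#   [], [A], [B],
#   [0], [1], [3],
#   [1], [2], [4],
# )
```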
+ table_attr = table_attr.replace( + 'class="', 'class="tex2jax_ignore mathjax_ignore ' + ) else: - table_attr += ' class="tex2jax_ignore"' + table_attr += ' class="tex2jax_ignore mathjax_ignore"' d.update({"table_attributes": table_attr}) if self.tooltips: @@ -460,6 +486,12 @@ def _generate_col_header_row( ] * (self.index.nlevels - sum(self.hide_index_) - 1) name = self.data.columns.names[r] + + is_display = name is not None and not self.hide_column_names + value = name if is_display else self.css["blank_value"] + display_value = ( + self._display_funcs_column_names[r](value) if is_display else None + ) column_name = [ _element( "th", @@ -468,10 +500,9 @@ def _generate_col_header_row( if name is None else f"{self.css['index_name']} {self.css['level']}{r}" ), - name - if (name is not None and not self.hide_column_names) - else self.css["blank_value"], + value, not all(self.hide_index_), + display_value=display_value, ) ] @@ -553,6 +584,9 @@ def _generate_index_names_row( f"{self.css['index_name']} {self.css['level']}{c}", self.css["blank_value"] if name is None else name, not self.hide_index_[c], + display_value=( + None if name is None else self._display_funcs_index_names[c](name) + ), ) for c, name in enumerate(self.data.index.names) ] @@ -776,9 +810,9 @@ def _generate_body_row( ) if self.cell_ids: - header_element[ - "id" - ] = f"{self.css['level']}{c}_{self.css['row']}{r}" # id is given + header_element["id"] = ( + f"{self.css['level']}{c}_{self.css['row']}{r}" # id is given + ) if ( header_element_visible and (r, c) in self.ctx_index @@ -816,10 +850,7 @@ def _generate_body_row( data_element = _element( "td", - ( - f"{self.css['data']} {self.css['row']}{r} " - f"{self.css['col']}{c}{cls}" - ), + (f"{self.css['data']} {self.css['row']}{r} {self.css['col']}{c}{cls}"), value, data_element_visible, attributes="", @@ -850,7 +881,8 @@ def _translate_latex(self, d: dict, clines: str | None) -> None: or multirow sparsification (so that \multirow and \multicol work correctly). """ index_levels = self.index.nlevels - visible_index_level_n = index_levels - sum(self.hide_index_) + # GH 52218 + visible_index_level_n = max(1, index_levels - sum(self.hide_index_)) d["head"] = [ [ {**col, "cellstyle": self.ctx_columns[r, c - visible_index_level_n]} @@ -890,9 +922,9 @@ def concatenated_visible_rows(obj): row_body_headers = [ { **col, - "display_value": col["display_value"] - if col["is_visible"] - else "", + "display_value": ( + col["display_value"] if col["is_visible"] else "" + ), "cellstyle": self.ctx_index[r, c], } for c, col in enumerate(row[:index_levels]) @@ -938,7 +970,7 @@ def concatenated_visible_rows(obj): idx_len = d["index_lengths"].get((lvl, r), None) if idx_len is not None: # i.e. not a sparsified entry d["clines"][rn + idx_len].append( - f"\\cline{{{lvln+1}-{len(visible_index_levels)+data_len}}}" + f"\\cline{{{lvln + 1}-{len(visible_index_levels) + data_len}}}" # noqa: E501 ) def format( @@ -1005,6 +1037,7 @@ def format( Returns ------- Styler + Returns itself for chaining. See Also -------- @@ -1035,7 +1068,7 @@ def format( When using a ``formatter`` string the dtypes must be compatible, otherwise a `ValueError` will be raised. - When instantiating a Styler, default formatting can be applied be setting the + When instantiating a Styler, default formatting can be applied by setting the ``pandas.options``: - ``styler.format.formatter``: default None. @@ -1047,7 +1080,7 @@ def format( .. 
warning:: `Styler.format` is ignored when using the output format `Styler.to_excel`, - since Excel and Python have inherrently different formatting structures. + since Excel and Python have inherently different formatting structures. However, it is possible to use the `number-format` pseudo CSS attribute to force Excel permissible formatting. See examples. @@ -1063,15 +1096,15 @@ def format( Using a ``formatter`` specification on consistent column dtypes - >>> df.style.format('{:.2f}', na_rep='MISS', subset=[0,1]) # doctest: +SKIP + >>> df.style.format('{:.2f}', na_rep='MISS', subset=[0, 1]) # doctest: +SKIP 0 1 2 0 MISS 1.00 A 1 2.00 MISS 3.000000 Using the default ``formatter`` for unspecified columns - >>> df.style.format({0: '{:.2f}', 1: '£ {:.1f}'}, na_rep='MISS', precision=1) - ... # doctest: +SKIP + >>> df.style.format({0: '{:.2f}', 1: '£ {:.1f}'}, + ... na_rep='MISS', precision=1) # doctest: +SKIP 0 1 2 0 MISS £ 1.0 A 1 2.00 MISS 3.0 @@ -1079,8 +1112,8 @@ def format( Multiple ``na_rep`` or ``precision`` specifications under the default ``formatter``. - >>> (df.style.format(na_rep='MISS', precision=1, subset=[0]) - ... .format(na_rep='PASS', precision=2, subset=[1, 2])) # doctest: +SKIP + >>> (df.style.format(na_rep='MISS', precision=1, subset=[0]).format( + ... na_rep='PASS', precision=2, subset=[1, 2])) # doctest: +SKIP 0 1 2 0 MISS 1.00 A 1 2.0 PASS 3.00 @@ -1088,8 +1121,8 @@ def format( Using a callable ``formatter`` function. >>> func = lambda s: 'STRING' if isinstance(s, str) else 'FLOAT' - >>> df.style.format({0: '{:.1f}', 2: func}, precision=4, na_rep='MISS') - ... # doctest: +SKIP + >>> df.style.format({0: '{:.1f}', 2: func}, + ... precision=4, na_rep='MISS') # doctest: +SKIP 0 1 2 0 MISS 1.0000 STRING 1 2.0 MISS FLOAT @@ -1098,8 +1131,7 @@ def format( >>> df = pd.DataFrame([['
    ', '"A&B"', None]]) >>> s = df.style.format( - ... '{0}', escape="html", na_rep="NA" - ... ) + ... '{0}', escape="html", na_rep="NA") >>> s.to_html() # doctest: +SKIP ...
    <div></div>$ "A" $ A&B NA @@ -1348,7 +1382,7 @@ def format_index( >>> df = pd.DataFrame([[1, 2, 3]], columns=["123", "~", "$%#"]) >>> df.style.format_index("\\textbf{{{}}}", escape="latex", axis=1).to_latex() - ... # doctest: +SKIP + ... # doctest: +SKIP \begin{tabular}{lrrr} {} & {\textbf{123}} & {\textbf{\textasciitilde }} & {\textbf{\$\%\#}} \\ 0 & 1 & 2 & 3 \\ @@ -1377,7 +1411,7 @@ def format_index( return self # clear the formatter / revert to default and avoid looping if not isinstance(formatter, dict): - formatter = {level: formatter for level in levels_} + formatter = dict.fromkeys(levels_, formatter) else: formatter = { obj._get_level_number(level): formatter_ @@ -1425,6 +1459,7 @@ def relabel_index( Returns ------- Styler + Returns itself for chaining. See Also -------- @@ -1449,10 +1484,10 @@ def relabel_index( # relabel first, then hide df = pd.DataFrame({"col": ["a", "b", "c"]}) - df.style.relabel_index(["A", "B", "C"]).hide([0,1]) + df.style.relabel_index(["A", "B", "C"]).hide([0, 1]) # hide first, then relabel df = pd.DataFrame({"col": ["a", "b", "c"]}) - df.style.hide([0,1]).relabel_index(["C"]) + df.style.hide([0, 1]).relabel_index(["C"]) This method should be used, rather than :meth:`Styler.format_index`, in one of the following cases (see examples): @@ -1475,7 +1510,7 @@ def relabel_index( Chaining with pre-hidden elements - >>> df.style.hide([0,1]).relabel_index(["C"]) # doctest: +SKIP + >>> df.style.hide([0, 1]).relabel_index(["C"]) # doctest: +SKIP col C c @@ -1493,9 +1528,11 @@ def relabel_index( 1 5 1 0 6 1 7 - >>> styler.hide((midx.get_level_values(0)==0)|(midx.get_level_values(1)==0)) - ... # doctest: +SKIP - >>> styler.hide(level=[0,1]) # doctest: +SKIP + >>> styler.hide( + ... (midx.get_level_values(0) == 0) | (midx.get_level_values(1) == 0) + ... ) + ... # doctest: +SKIP + >>> styler.hide(level=[0, 1]) # doctest: +SKIP >>> styler.relabel_index(["binary6", "binary7"]) # doctest: +SKIP col binary6 6 @@ -1503,9 +1540,9 @@ def relabel_index( We can also achieve the above by indexing first and then re-labeling - >>> styler = df.loc[[(1,1,0), (1,1,1)]].style - >>> styler.hide(level=[0,1]).relabel_index(["binary6", "binary7"]) - ... # doctest: +SKIP + >>> styler = df.loc[[(1, 1, 0), (1, 1, 1)]].style + >>> styler.hide(level=[0, 1]).relabel_index(["binary6", "binary7"]) + ... # doctest: +SKIP col binary6 6 binary7 7 @@ -1516,9 +1553,9 @@ def relabel_index( brackets if the string if pre-formatted), >>> df = pd.DataFrame({"samples": np.random.rand(10)}) - >>> styler = df.loc[np.random.randint(0,10,3)].style - >>> styler.relabel_index([f"sample{i+1} ({{}})" for i in range(3)]) - ... # doctest: +SKIP + >>> styler = df.loc[np.random.randint(0, 10, 3)].style + >>> styler.relabel_index([f"sample{i + 1} ({{}})" for i in range(3)]) + ... # doctest: +SKIP samples sample1 (5) 0.315811 sample2 (0) 0.495941 @@ -1558,6 +1595,140 @@ def alias_(x, value): return self + def format_index_names( + self, + formatter: ExtFormatter | None = None, + axis: Axis = 0, + level: Level | list[Level] | None = None, + na_rep: str | None = None, + precision: int | None = None, + decimal: str = ".", + thousands: str | None = None, + escape: str | None = None, + hyperlinks: str | None = None, + ) -> StylerRenderer: + r""" + Format the text display value of index names or column names. + + .. versionadded:: 3.0 + + Parameters + ---------- + formatter : str, callable, dict or None + Object to define how values are displayed. See notes. 
+ axis : {0, "index", 1, "columns"} + Whether to apply the formatter to the index or column headers. + level : int, str, list + The level(s) over which to apply the generic formatter. + na_rep : str, optional + Representation for missing values. + If ``na_rep`` is None, no special formatting is applied. + precision : int, optional + Floating point precision to use for display purposes, if not determined by + the specified ``formatter``. + decimal : str, default "." + Character used as decimal separator for floats, complex and integers. + thousands : str, optional, default None + Character used as thousands separator for floats, complex and integers. + escape : str, optional + Use 'html' to replace the characters ``&``, ``<``, ``>``, ``'``, and ``"`` + in cell display string with HTML-safe sequences. + Use 'latex' to replace the characters ``&``, ``%``, ``$``, ``#``, ``_``, + ``{``, ``}``, ``~``, ``^``, and ``\`` in the cell display string with + LaTeX-safe sequences. + Escaping is done before ``formatter``. + hyperlinks : {"html", "latex"}, optional + Convert string patterns containing https://, http://, ftp:// or www. to + HTML tags as clickable URL hyperlinks if "html", or LaTeX \href + commands if "latex". + + Returns + ------- + Styler + Returns itself for chaining. + + Raises + ------ + ValueError + If the `formatter` is a string and the dtypes are incompatible. + + See Also + -------- + Styler.format_index: Format the text display value of index labels + or column headers. + + Notes + ----- + This method has a similar signature to :meth:`Styler.format_index`. Since + `names` are generally label based, and often not numeric, the typical features + expected to be more frequently used here are ``escape`` and ``hyperlinks``. + + .. warning:: + `Styler.format_index_names` is ignored when using the output format + `Styler.to_excel`, since Excel and Python have inherently different + formatting structures. + + Examples + -------- + >>> df = pd.DataFrame( + ... [[1, 2], [3, 4]], + ... index=pd.Index(["a", "b"], name="idx"), + ... ) + >>> df # doctest: +SKIP + 0 1 + idx + a 1 2 + b 3 4 + >>> df.style.format_index_names(lambda x: x.upper(), axis=0) # doctest: +SKIP + 0 1 + IDX + a 1 2 + b 3 4 + """ + axis = self.data._get_axis_number(axis) + if axis == 0: + display_funcs_, obj = self._display_funcs_index_names, self.index + else: + display_funcs_, obj = self._display_funcs_column_names, self.columns + levels_ = refactor_levels(level, obj) + + if all( + ( + formatter is None, + level is None, + precision is None, + decimal == ".", + thousands is None, + na_rep is None, + escape is None, + hyperlinks is None, + ) + ): + display_funcs_.clear() + return self # clear the formatter / revert to default and avoid looping + + if not isinstance(formatter, dict): + formatter = dict.fromkeys(levels_, formatter) + else: + formatter = { + obj._get_level_number(level): formatter_ + for level, formatter_ in formatter.items() + } + + for lvl in levels_: + format_func = _maybe_wrap_formatter( + formatter.get(lvl), + na_rep=na_rep, + precision=precision, + decimal=decimal, + thousands=thousands, + escape=escape, + hyperlinks=hyperlinks, + ) + display_funcs_[lvl] = format_func + + return self + def _element( html_element: str, @@ -1569,7 +1740,7 @@ def _element( """ Template to return container with information for a element. If + True, no CSS is generated and styling effects do not apply. 
Notes ----- @@ -1995,7 +2171,7 @@ class Tooltips: def __init__( self, - css_props: CSSProperties = [ + css_props: CSSProperties = [ # noqa: B006 ("visibility", "hidden"), ("position", "absolute"), ("z-index", 1), @@ -2005,11 +2181,13 @@ def __init__( ], css_name: str = "pd-t", tooltips: DataFrame = DataFrame(), + as_title_attribute: bool = False, ) -> None: self.class_name = css_name self.class_properties = css_props self.tt_data = tooltips self.table_styles: CSSStyles = [] + self.as_title_attribute = as_title_attribute @property def _class_styles(self): @@ -2029,7 +2207,9 @@ def _class_styles(self): } ] - def _pseudo_css(self, uuid: str, name: str, row: int, col: int, text: str): + def _pseudo_css( + self, uuid: str, name: str, row: int, col: int, text: str + ) -> list[CSSDict]: """ For every table data-cell that has a valid tooltip (not None, NaN or empty string) must create two pseudo CSS entries for the specific @@ -2097,35 +2277,53 @@ def _translate(self, styler: StylerRenderer, d: dict): if self.tt_data.empty: return d - name = self.class_name mask = (self.tt_data.isna()) | (self.tt_data.eq("")) # empty string = no ttip - self.table_styles = [ - style - for sublist in [ - self._pseudo_css(styler.uuid, name, i, j, str(self.tt_data.iloc[i, j])) - for i in range(len(self.tt_data.index)) - for j in range(len(self.tt_data.columns)) - if not ( - mask.iloc[i, j] - or i in styler.hidden_rows - or j in styler.hidden_columns - ) + # this conditional adds tooltips via pseudo css and elements. + if not self.as_title_attribute: + name = self.class_name + self.table_styles = [ + style + for sublist in [ + self._pseudo_css( + styler.uuid, name, i, j, str(self.tt_data.iloc[i, j]) + ) + for i in range(len(self.tt_data.index)) + for j in range(len(self.tt_data.columns)) + if not ( + mask.iloc[i, j] + or i in styler.hidden_rows + or j in styler.hidden_columns + ) + ] + for style in sublist ] - for style in sublist - ] - - if self.table_styles: - # add span class to every cell only if at least 1 non-empty tooltip - for row in d["body"]: - for item in row: - if item["type"] == "td": - item["display_value"] = ( - str(item["display_value"]) - + f'' - ) - d["table_styles"].extend(self._class_styles) - d["table_styles"].extend(self.table_styles) + # add span class to every cell since there is at least 1 non-empty tooltip + if self.table_styles: + for row in d["body"]: + for item in row: + if item["type"] == "td": + item["display_value"] = ( + str(item["display_value"]) + + f'' + ) + d["table_styles"].extend(self._class_styles) + d["table_styles"].extend(self.table_styles) + # this conditional adds tooltips as extra "title" attribute on a element + else: + index_offset = self.tt_data.index.nlevels + body = d["body"] + for i in range(len(self.tt_data.index)): + for j in range(len(self.tt_data.columns)): + if ( + not mask.iloc[i, j] + or i in styler.hidden_rows + or j in styler.hidden_columns + ): + row = body[i] + item = row[j + index_offset] + value = self.tt_data.iloc[i, j] + item["attributes"] += f' title="{value}"' return d @@ -2151,10 +2349,12 @@ def _parse_latex_table_styles(table_styles: CSSStyles, selector: str) -> str | N Examples -------- - >>> table_styles = [{'selector': 'foo', 'props': [('attr','value')]}, - ... {'selector': 'bar', 'props': [('attr', 'overwritten')]}, - ... {'selector': 'bar', 'props': [('a1', 'baz'), ('a2', 'ignore')]}] - >>> _parse_latex_table_styles(table_styles, selector='bar') + >>> table_styles = [ + ... {"selector": "foo", "props": [("attr", "value")]}, + ... 
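With ``as_title_attribute=True`` the tooltip text is attached as a plain ``title`` attribute instead of the pseudo-CSS ``:hover`` machinery: lighter HTML, but no control over the tooltip's styling. Assuming the public ``set_tooltips`` forwards the flag, as this PR series has it do:

```python
import pandas as pd

df = pd.DataFrame({"A": [1, 2]})
ttips = pd.DataFrame({"A": ["first cell", None]}, index=df.index)
html = df.style.set_tooltips(ttips, as_title_attribute=True).to_html()
assert 'title="first cell"' in html  # no extra <span>/CSS blocks are emitted
```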
{"selector": "bar", "props": [("attr", "overwritten")]}, + ... {"selector": "bar", "props": [("a1", "baz"), ("a2", "ignore")]}, + ... ] + >>> _parse_latex_table_styles(table_styles, selector="bar") 'baz' Notes @@ -2226,7 +2426,7 @@ def _parse_latex_header_span( r""" Refactor the cell `display_value` if a 'colspan' or 'rowspan' attribute is present. - 'rowspan' and 'colspan' do not occur simultaneouly. If they are detected then + 'rowspan' and 'colspan' do not occur simultaneously. If they are detected then the `display_value` is altered to a LaTeX `multirow` or `multicol` command respectively, with the appropriate cell-span. @@ -2238,8 +2438,8 @@ def _parse_latex_header_span( Examples -------- - >>> cell = {'cellstyle': '', 'display_value':'text', 'attributes': 'colspan="3"'} - >>> _parse_latex_header_span(cell, 't', 'c') + >>> cell = {"cellstyle": "", "display_value": "text", "attributes": 'colspan="3"'} + >>> _parse_latex_header_span(cell, "t", "c") '\\multicolumn{3}{c}{text}' """ display_val = _parse_latex_cell_styles( @@ -2288,12 +2488,12 @@ def _parse_latex_css_conversion(styles: CSSList) -> CSSList: Ignore conversion if tagged with `--latex` option, skipped if no conversion found. """ - def font_weight(value, arg): + def font_weight(value, arg) -> tuple[str, str] | None: if value in ("bold", "bolder"): return "bfseries", f"{arg}" return None - def font_style(value, arg): + def font_style(value, arg) -> tuple[str, str] | None: if value == "italic": return "itshape", f"{arg}" if value == "oblique": @@ -2317,7 +2517,7 @@ def color(value, user_arg, command, comm_arg): if value[0] == "#" and len(value) == 7: # color is hex code return command, f"[HTML]{{{value[1:].upper()}}}{arg}" if value[0] == "#" and len(value) == 4: # color is short hex code - val = f"{value[1].upper()*2}{value[2].upper()*2}{value[3].upper()*2}" + val = f"{value[1].upper() * 2}{value[2].upper() * 2}{value[3].upper() * 2}" return command, f"[HTML]{{{val}}}{arg}" elif value[:3] == "rgb": # color is rgb or rgba r = re.findall("(?<=\\()[0-9\\s%]+(?=,)", value)[0].strip() diff --git a/pandas/io/formats/templates/typst.tpl b/pandas/io/formats/templates/typst.tpl new file mode 100644 index 0000000000000..66de8f31b405e --- /dev/null +++ b/pandas/io/formats/templates/typst.tpl @@ -0,0 +1,12 @@ +#table( + columns: {{ head[0] | length }}, +{% for r in head %} + {% for c in r %}[{% if c["is_visible"] %}{{ c["display_value"] }}{% endif %}],{% if not loop.last %} {% endif%}{% endfor %} + +{% endfor %} + +{% for r in body %} + {% for c in r %}[{% if c["is_visible"] %}{{ c["display_value"] }}{% endif %}],{% if not loop.last %} {% endif%}{% endfor %} + +{% endfor %} +) diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py index f56fca8d7ef44..febf43b9a1018 100644 --- a/pandas/io/formats/xml.py +++ b/pandas/io/formats/xml.py @@ -1,6 +1,7 @@ """ :mod:`pandas.io.formats.xml` is a module for formatting data in XML. 
""" + from __future__ import annotations import codecs @@ -10,7 +11,6 @@ Any, final, ) -import warnings from pandas.errors import AbstractMethodError from pandas.util._decorators import ( @@ -24,10 +24,7 @@ from pandas.core.shared_docs import _shared_docs from pandas.io.common import get_handle -from pandas.io.xml import ( - get_data_from_filepath, - preprocess_data, -) +from pandas.io.xml import get_data_from_filepath if TYPE_CHECKING: from pandas._typing import ( @@ -210,13 +207,7 @@ def _process_dataframe(self) -> dict[int | str, dict[str, Any]]: df = df.reset_index() if self.na_rep is not None: - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - "Downcasting object dtype arrays", - category=FutureWarning, - ) - df = df.fillna(self.na_rep) + df = df.fillna(self.na_rep) return df.to_dict(orient="index") @@ -269,7 +260,7 @@ def _other_namespaces(self) -> dict: nmsp_dict: dict[str, str] = {} if self.namespaces: nmsp_dict = { - f"xmlns{p if p=='' else f':{p}'}": n + f"xmlns{p if p == '' else f':{p}'}": n for p, n in self.namespaces.items() if n != self.prefix_uri[1:-1] } @@ -293,8 +284,8 @@ def _build_attribs(self, d: dict[str, Any], elem_row: Any) -> Any: try: if not isna(d[col]): elem_row.attrib[attr_name] = str(d[col]) - except KeyError: - raise KeyError(f"no valid column, {col}") + except KeyError as err: + raise KeyError(f"no valid column, {col}") from err return elem_row @final @@ -330,8 +321,8 @@ def _build_elems(self, d: dict[str, Any], elem_row: Any) -> None: try: val = None if isna(d[col]) or d[col] == "" else str(d[col]) sub_element_cls(elem_row, elem_name).text = val - except KeyError: - raise KeyError(f"no valid column, {col}") + except KeyError as err: + raise KeyError(f"no valid column, {col}") from err @final def write_output(self) -> str | None: @@ -408,10 +399,12 @@ def _get_prefix_uri(self) -> str: if self.prefix: try: uri = f"{{{self.namespaces[self.prefix]}}}" - except KeyError: - raise KeyError(f"{self.prefix} is not included in namespaces") + except KeyError as err: + raise KeyError( + f"{self.prefix} is not included in namespaces" + ) from err elif "" in self.namespaces: - uri = f'{{{self.namespaces[""]}}}' + uri = f"{{{self.namespaces['']}}}" else: uri = "" @@ -504,10 +497,12 @@ def _get_prefix_uri(self) -> str: if self.prefix: try: uri = f"{{{self.namespaces[self.prefix]}}}" - except KeyError: - raise KeyError(f"{self.prefix} is not included in namespaces") + except KeyError as err: + raise KeyError( + f"{self.prefix} is not included in namespaces" + ) from err elif "" in self.namespaces: - uri = f'{{{self.namespaces[""]}}}' + uri = f"{{{self.namespaces['']}}}" else: uri = "" @@ -544,7 +539,7 @@ def _transform_doc(self) -> bytes: storage_options=self.storage_options, ) - with preprocess_data(handle_data) as xml_data: + with handle_data as xml_data: curr_parser = XMLParser(encoding=self.encoding) if isinstance(xml_data, io.StringIO): diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py deleted file mode 100644 index 350002bf461ff..0000000000000 --- a/pandas/io/gbq.py +++ /dev/null @@ -1,255 +0,0 @@ -""" Google BigQuery support """ -from __future__ import annotations - -from typing import ( - TYPE_CHECKING, - Any, -) -import warnings - -from pandas.compat._optional import import_optional_dependency -from pandas.util._exceptions import find_stack_level - -if TYPE_CHECKING: - import google.auth - - from pandas import DataFrame - - -def _try_import(): - # since pandas is a dependency of pandas-gbq - # we need to import on first use - msg = ( - 
"pandas-gbq is required to load data from Google BigQuery. " - "See the docs: https://pandas-gbq.readthedocs.io." - ) - pandas_gbq = import_optional_dependency("pandas_gbq", extra=msg) - return pandas_gbq - - -def read_gbq( - query: str, - project_id: str | None = None, - index_col: str | None = None, - col_order: list[str] | None = None, - reauth: bool = False, - auth_local_webserver: bool = True, - dialect: str | None = None, - location: str | None = None, - configuration: dict[str, Any] | None = None, - credentials: google.auth.credentials.Credentials | None = None, - use_bqstorage_api: bool | None = None, - max_results: int | None = None, - progress_bar_type: str | None = None, -) -> DataFrame: - """ - Load data from Google BigQuery. - - .. deprecated:: 2.2.0 - - Please use ``pandas_gbq.read_gbq`` instead. - - This function requires the `pandas-gbq package - `__. - - See the `How to authenticate with Google BigQuery - `__ - guide for authentication instructions. - - Parameters - ---------- - query : str - SQL-Like Query to return data values. - project_id : str, optional - Google BigQuery Account project ID. Optional when available from - the environment. - index_col : str, optional - Name of result column to use for index in results DataFrame. - col_order : list(str), optional - List of BigQuery column names in the desired order for results - DataFrame. - reauth : bool, default False - Force Google BigQuery to re-authenticate the user. This is useful - if multiple accounts are used. - auth_local_webserver : bool, default True - Use the `local webserver flow`_ instead of the `console flow`_ - when getting user credentials. - - .. _local webserver flow: - https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server - .. _console flow: - https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console - - *New in version 0.2.0 of pandas-gbq*. - - .. versionchanged:: 1.5.0 - Default value is changed to ``True``. Google has deprecated the - ``auth_local_webserver = False`` `"out of band" (copy-paste) - flow - `_. - dialect : str, default 'legacy' - Note: The default value is changing to 'standard' in a future version. - - SQL syntax dialect to use. Value can be one of: - - ``'legacy'`` - Use BigQuery's legacy SQL dialect. For more information see - `BigQuery Legacy SQL Reference - `__. - ``'standard'`` - Use BigQuery's standard SQL, which is - compliant with the SQL 2011 standard. For more information - see `BigQuery Standard SQL Reference - `__. - location : str, optional - Location where the query job should run. See the `BigQuery locations - documentation - `__ for a - list of available locations. The location must match that of any - datasets used in the query. - - *New in version 0.5.0 of pandas-gbq*. - configuration : dict, optional - Query config parameters for job processing. - For example: - - configuration = {'query': {'useQueryCache': False}} - - For more information see `BigQuery REST API Reference - `__. - credentials : google.auth.credentials.Credentials, optional - Credentials for accessing Google APIs. Use this parameter to override - default credentials, such as to use Compute Engine - :class:`google.auth.compute_engine.Credentials` or Service Account - :class:`google.oauth2.service_account.Credentials` directly. - - *New in version 0.8.0 of pandas-gbq*. 
- use_bqstorage_api : bool, default False - Use the `BigQuery Storage API - `__ to - download query results quickly, but at an increased cost. To use this - API, first `enable it in the Cloud Console - `__. - You must also have the `bigquery.readsessions.create - `__ - permission on the project you are billing queries to. - - This feature requires version 0.10.0 or later of the ``pandas-gbq`` - package. It also requires the ``google-cloud-bigquery-storage`` and - ``fastavro`` packages. - - max_results : int, optional - If set, limit the maximum number of rows to fetch from the query - results. - - progress_bar_type : Optional, str - If set, use the `tqdm `__ library to - display a progress bar while the data downloads. Install the - ``tqdm`` package to use this feature. - - Possible values of ``progress_bar_type`` include: - - ``None`` - No progress bar. - ``'tqdm'`` - Use the :func:`tqdm.tqdm` function to print a progress bar - to :data:`sys.stderr`. - ``'tqdm_notebook'`` - Use the :func:`tqdm.tqdm_notebook` function to display a - progress bar as a Jupyter notebook widget. - ``'tqdm_gui'`` - Use the :func:`tqdm.tqdm_gui` function to display a - progress bar as a graphical dialog box. - - Returns - ------- - df: DataFrame - DataFrame representing results of query. - - See Also - -------- - pandas_gbq.read_gbq : This function in the pandas-gbq library. - DataFrame.to_gbq : Write a DataFrame to Google BigQuery. - - Examples - -------- - Example taken from `Google BigQuery documentation - `_ - - >>> sql = "SELECT name FROM table_name WHERE state = 'TX' LIMIT 100;" - >>> df = pd.read_gbq(sql, dialect="standard") # doctest: +SKIP - >>> project_id = "your-project-id" # doctest: +SKIP - >>> df = pd.read_gbq(sql, - ... project_id=project_id, - ... dialect="standard" - ... ) # doctest: +SKIP - """ - warnings.warn( - "read_gbq is deprecated and will be removed in a future version. " - "Please use pandas_gbq.read_gbq instead: " - "https://pandas-gbq.readthedocs.io/en/latest/api.html#pandas_gbq.read_gbq", - FutureWarning, - stacklevel=find_stack_level(), - ) - pandas_gbq = _try_import() - - kwargs: dict[str, str | bool | int | None] = {} - - # START: new kwargs. Don't populate unless explicitly set. - if use_bqstorage_api is not None: - kwargs["use_bqstorage_api"] = use_bqstorage_api - if max_results is not None: - kwargs["max_results"] = max_results - - kwargs["progress_bar_type"] = progress_bar_type - # END: new kwargs - - return pandas_gbq.read_gbq( - query, - project_id=project_id, - index_col=index_col, - col_order=col_order, - reauth=reauth, - auth_local_webserver=auth_local_webserver, - dialect=dialect, - location=location, - configuration=configuration, - credentials=credentials, - **kwargs, - ) - - -def to_gbq( - dataframe: DataFrame, - destination_table: str, - project_id: str | None = None, - chunksize: int | None = None, - reauth: bool = False, - if_exists: str = "fail", - auth_local_webserver: bool = True, - table_schema: list[dict[str, str]] | None = None, - location: str | None = None, - progress_bar: bool = True, - credentials: google.auth.credentials.Credentials | None = None, -) -> None: - warnings.warn( - "to_gbq is deprecated and will be removed in a future version. 
" - "Please use pandas_gbq.to_gbq instead: " - "https://pandas-gbq.readthedocs.io/en/latest/api.html#pandas_gbq.to_gbq", - FutureWarning, - stacklevel=find_stack_level(), - ) - pandas_gbq = _try_import() - pandas_gbq.to_gbq( - dataframe, - destination_table, - project_id=project_id, - chunksize=chunksize, - reauth=reauth, - if_exists=if_exists, - auth_local_webserver=auth_local_webserver, - table_schema=table_schema, - location=location, - progress_bar=progress_bar, - credentials=credentials, - ) diff --git a/pandas/io/html.py b/pandas/io/html.py index 5d5bf079784be..183af3a03221b 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -7,7 +7,9 @@ from __future__ import annotations from collections import abc +import errno import numbers +import os import re from re import Pattern from typing import ( @@ -15,7 +17,6 @@ Literal, cast, ) -import warnings from pandas._libs import lib from pandas.compat._optional import import_optional_dependency @@ -24,7 +25,6 @@ EmptyDataError, ) from pandas.util._decorators import doc -from pandas.util._exceptions import find_stack_level from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.common import is_list_like @@ -36,10 +36,7 @@ from pandas.core.shared_docs import _shared_docs from pandas.io.common import ( - file_exists, get_handle, - is_file_like, - is_fsspec_url, is_url, stringify_path, validate_header_arg, @@ -134,21 +131,17 @@ def _read( ------- raw_text : str """ - text: str | bytes - if ( - is_url(obj) - or hasattr(obj, "read") - or (isinstance(obj, str) and file_exists(obj)) - ): + try: with get_handle( obj, "r", encoding=encoding, storage_options=storage_options ) as handles: - text = handles.handle.read() - elif isinstance(obj, (str, bytes)): - text = obj - else: - raise TypeError(f"Cannot read object of type '{type(obj).__name__}'") - return text + return handles.handle.read() + except OSError as err: + if not is_url(obj): + raise FileNotFoundError( + f"[Errno {errno.ENOENT}] {os.strerror(errno.ENOENT)}: {obj}" + ) from err + raise class _HtmlFrameParser: @@ -158,7 +151,7 @@ class _HtmlFrameParser: Parameters ---------- io : str or file-like - This can be either a string of raw HTML, a valid URL using the HTTP, + This can be either a string path, a valid URL using the HTTP, FTP, or FILE protocols or a file-like object. match : str or regex @@ -269,7 +262,7 @@ def _attr_getter(self, obj, attr): # Both lxml and BeautifulSoup have the same implementation: return obj.get(attr) - def _href_getter(self, obj): + def _href_getter(self, obj) -> str | None: """ Return a href if the DOM node contains a child or None. 
@@ -392,7 +385,7 @@ def _parse_tables(self, document, match, attrs): """ raise AbstractMethodError(self) - def _equals_tag(self, obj, tag): + def _equals_tag(self, obj, tag) -> bool: """ Return whether an individual DOM node matches a tag @@ -461,15 +454,26 @@ def row_is_all_th(row): while body_rows and row_is_all_th(body_rows[0]): header_rows.append(body_rows.pop(0)) - header = self._expand_colspan_rowspan(header_rows, section="header") - body = self._expand_colspan_rowspan(body_rows, section="body") - footer = self._expand_colspan_rowspan(footer_rows, section="footer") + header, rem = self._expand_colspan_rowspan(header_rows, section="header") + body, rem = self._expand_colspan_rowspan( + body_rows, + section="body", + remainder=rem, + overflow=len(footer_rows) > 0, + ) + footer, _ = self._expand_colspan_rowspan( + footer_rows, section="footer", remainder=rem, overflow=False + ) return header, body, footer def _expand_colspan_rowspan( - self, rows, section: Literal["header", "footer", "body"] - ): + self, + rows, + section: Literal["header", "footer", "body"], + remainder: list[tuple[int, str | tuple, int]] | None = None, + overflow: bool = True, + ) -> tuple[list[list], list[tuple[int, str | tuple, int]]]: """ Given a list of
    `` containing only whitespaces. Examples -------- @@ -1227,22 +1226,6 @@ def read_html( io = stringify_path(io) - if isinstance(io, str) and not any( - [ - is_file_like(io), - file_exists(io), - is_url(io), - is_fsspec_url(io), - ] - ): - warnings.warn( - "Passing literal html to 'read_html' is deprecated and " - "will be removed in a future version. To read from a " - "literal string, wrap it in a 'StringIO' object.", - FutureWarning, - stacklevel=find_stack_level(), - ) - return _parse( flavor=flavor, io=io, diff --git a/pandas/io/iceberg.py b/pandas/io/iceberg.py new file mode 100644 index 0000000000000..8a3e8f5da49b3 --- /dev/null +++ b/pandas/io/iceberg.py @@ -0,0 +1,93 @@ +from typing import ( + Any, +) + +from pandas.compat._optional import import_optional_dependency + +from pandas import DataFrame + + +def read_iceberg( + table_identifier: str, + catalog_name: str | None = None, + catalog_properties: dict[str, Any] | None = None, + row_filter: str | None = None, + selected_fields: tuple[str] | None = None, + case_sensitive: bool = True, + snapshot_id: int | None = None, + limit: int | None = None, + scan_properties: dict[str, Any] | None = None, +) -> DataFrame: + """ + Read an Apache Iceberg table into a pandas DataFrame. + + .. warning:: + + read_iceberg is experimental and may change without warning. + + Parameters + ---------- + table_identifier : str + Table identifier. + catalog_name : str, optional + The name of the catalog. + catalog_properties : dict of {str: str}, optional + The properties that are used next to the catalog configuration. + row_filter : str, optional + A string that describes the desired rows. + selected_fields : tuple of str, optional + A tuple of strings representing the column names to return in the output + dataframe. + case_sensitive : bool, default True + If True column matching is case sensitive. + snapshot_id : int, optional + Snapshot ID to time travel to. By default the table will be scanned as of the + current snapshot ID. + limit : int, optional + An integer representing the number of rows to return in the scan result. + By default all matching rows will be fetched. + scan_properties : dict of {str: obj}, optional + Additional Table properties as a dictionary of string key value pairs to use + for this scan. + + Returns + ------- + DataFrame + DataFrame based on the Iceberg table. + + See Also + -------- + read_parquet : Read a Parquet file. + + Examples + -------- + >>> df = pd.read_iceberg( + ... table_identifier="my_table", + ... catalog_name="my_catalog", + ... catalog_properties={"s3.secret-access-key": "my-secret"}, + ... row_filter="trip_distance >= 10.0", + ... selected_fields=("VendorID", "tpep_pickup_datetime"), + ... 
) # doctest: +SKIP + """ + pyiceberg_catalog = import_optional_dependency("pyiceberg.catalog") + pyiceberg_expressions = import_optional_dependency("pyiceberg.expressions") + + if catalog_properties is None: + catalog_properties = {} + catalog = pyiceberg_catalog.load_catalog(catalog_name, **catalog_properties) + table = catalog.load_table(table_identifier) + if row_filter is None: + row_filter = pyiceberg_expressions.AlwaysTrue() + if selected_fields is None: + selected_fields = ("*",) + if scan_properties is None: + scan_properties = {} + result = table.scan( + row_filter=row_filter, + selected_fields=selected_fields, + case_sensitive=case_sensitive, + snapshot_id=snapshot_id, + options=scan_properties, + limit=limit, + ) + return result.to_pandas() diff --git a/pandas/io/json/__init__.py b/pandas/io/json/__init__.py index 8f4e7a62834b5..39f78e26d6041 100644 --- a/pandas/io/json/__init__.py +++ b/pandas/io/json/__init__.py @@ -7,9 +7,9 @@ from pandas.io.json._table_schema import build_table_schema __all__ = [ - "ujson_dumps", - "ujson_loads", + "build_table_schema", "read_json", "to_json", - "build_table_schema", + "ujson_dumps", + "ujson_loads", ] diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index ed66e46b300f7..6b4f6c05c3123 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -5,19 +5,16 @@ abstractmethod, ) from collections import abc -from io import StringIO from itertools import islice from typing import ( TYPE_CHECKING, Any, - Callable, Generic, Literal, TypeVar, final, overload, ) -import warnings import numpy as np @@ -30,12 +27,12 @@ from pandas.compat._optional import import_optional_dependency from pandas.errors import AbstractMethodError from pandas.util._decorators import doc -from pandas.util._exceptions import find_stack_level from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.common import ( ensure_str, is_string_dtype, + pandas_dtype, ) from pandas.core.dtypes.dtypes import PeriodDtype @@ -52,26 +49,25 @@ from pandas.core.reshape.concat import concat from pandas.core.shared_docs import _shared_docs +from pandas.io._util import arrow_table_to_pandas from pandas.io.common import ( IOHandles, dedup_names, - extension_to_compression, - file_exists, get_handle, - is_fsspec_url, is_potential_multi_index, - is_url, stringify_path, ) from pandas.io.json._normalize import convert_to_line_delimits from pandas.io.json._table_schema import ( build_table_schema, parse_table_schema, + set_default_names, ) from pandas.io.parsers.readers import validate_integer if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Mapping, ) @@ -113,8 +109,7 @@ def to_json( indent: int = ..., storage_options: StorageOptions = ..., mode: Literal["a", "w"] = ..., -) -> None: - ... +) -> None: ... @overload @@ -133,8 +128,7 @@ def to_json( indent: int = ..., storage_options: StorageOptions = ..., mode: Literal["a", "w"] = ..., -) -> str: - ... +) -> str: ... 
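As the implementation above shows, `read_iceberg` is a thin wrapper over pyiceberg: a missing `row_filter` becomes `AlwaysTrue()`, missing `selected_fields` becomes `("*",)`, and the scan result is materialized with `to_pandas()`. A usage sketch in the spirit of the docstring example, with hypothetical catalog and table names; pyiceberg must be installed:

import pandas as pd

df = pd.read_iceberg(
    "my_table",  # hypothetical table identifier
    catalog_name="my_catalog",  # hypothetical catalog
    row_filter="trip_distance >= 10.0",
    selected_fields=("VendorID", "tpep_pickup_datetime"),
    limit=100,
)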
def to_json( @@ -251,11 +245,9 @@ def __init__( self.default_handler = default_handler self.index = index self.indent = indent - - self.is_copy = None self._format_axes() - def _format_axes(self): + def _format_axes(self) -> None: raise AbstractMethodError(self) def write(self) -> str: @@ -287,7 +279,7 @@ def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]: else: return self.obj - def _format_axes(self): + def _format_axes(self) -> None: if not self.obj.index.is_unique and self.orient == "index": raise ValueError(f"Series index must be unique for orient='{self.orient}'") @@ -304,7 +296,7 @@ def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]: obj_to_write = self.obj return obj_to_write - def _format_axes(self): + def _format_axes(self) -> None: """ Try to format axes if they are datelike. """ @@ -364,6 +356,8 @@ def __init__( raise ValueError(msg) self.schema = build_table_schema(obj, index=self.index) + if self.index: + obj = set_default_names(obj) # NotImplemented on a column MultiIndex if obj.ndim == 2 and isinstance(obj.columns, MultiIndex): @@ -372,26 +366,28 @@ def __init__( ) # TODO: Do this timedelta properly in objToJSON.c See GH #15137 - if ( - (obj.ndim == 1) - and (obj.name in set(obj.index.names)) - or len(obj.columns.intersection(obj.index.names)) + if ((obj.ndim == 1) and (obj.name in set(obj.index.names))) or len( + obj.columns.intersection(obj.index.names) ): msg = "Overlapping names between the index and columns" raise ValueError(msg) - obj = obj.copy() timedeltas = obj.select_dtypes(include=["timedelta"]).columns + copied = False if len(timedeltas): + obj = obj.copy() + copied = True obj[timedeltas] = obj[timedeltas].map(lambda x: x.isoformat()) - # Convert PeriodIndex to datetimes before serializing - if isinstance(obj.index.dtype, PeriodDtype): - obj.index = obj.index.to_timestamp() # exclude index from obj if index=False if not self.index: self.obj = obj.reset_index(drop=True) else: + # Convert PeriodIndex to datetimes before serializing + if isinstance(obj.index.dtype, PeriodDtype): + if not copied: + obj = obj.copy(deep=False) + obj.index = obj.index.to_timestamp() self.obj = obj.reset_index(drop=False) self.date_format = "iso" self.orient = "records" @@ -423,8 +419,7 @@ def read_json( storage_options: StorageOptions = ..., dtype_backend: DtypeBackend | lib.NoDefault = ..., engine: JSONEngine = ..., -) -> JsonReader[Literal["frame"]]: - ... +) -> JsonReader[Literal["frame"]]: ... @overload @@ -448,8 +443,7 @@ def read_json( storage_options: StorageOptions = ..., dtype_backend: DtypeBackend | lib.NoDefault = ..., engine: JSONEngine = ..., -) -> JsonReader[Literal["series"]]: - ... +) -> JsonReader[Literal["series"]]: ... @overload @@ -473,8 +467,7 @@ def read_json( storage_options: StorageOptions = ..., dtype_backend: DtypeBackend | lib.NoDefault = ..., engine: JSONEngine = ..., -) -> Series: - ... +) -> Series: ... @overload @@ -498,8 +491,7 @@ def read_json( storage_options: StorageOptions = ..., dtype_backend: DtypeBackend | lib.NoDefault = ..., engine: JSONEngine = ..., -) -> DataFrame: - ... +) -> DataFrame: ... @doc( @@ -530,9 +522,15 @@ def read_json( """ Convert a JSON string to pandas object. + This method reads JSON files or JSON-like data and converts them into pandas + objects. It supports a variety of input formats, including line-delimited JSON, + compressed files, and various data representations (table, records, index-based, + etc.). 
When `chunksize` is specified, an iterator is returned instead of loading + the entire data into memory. + Parameters ---------- - path_or_buf : a valid JSON str, path object or file-like object + path_or_buf : a str path, path object or file-like object Any valid string path is acceptable. The string could be a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is expected. A local file could be: @@ -660,14 +658,15 @@ def read_json( {storage_options} - dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable' + dtype_backend : {{'numpy_nullable', 'pyarrow'}} Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). Behaviour is as follows: + (still experimental). If not specified, the default behavior + is to not use nullable data types. If specified, the behavior + is as follows: * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. + * ``"pyarrow"``: returns pyarrow-backed nullable + :class:`ArrowDtype` :class:`DataFrame` .. versionadded:: 2.0 @@ -717,7 +716,7 @@ def read_json( "data":[["a","b"],["c","d"]]\ }}\ ' - >>> pd.read_json(StringIO(_), orient='split') + >>> pd.read_json(StringIO(_), orient='split') # noqa: F821 col 1 col 2 row 1 a b row 2 c d @@ -727,7 +726,7 @@ def read_json( >>> df.to_json(orient='index') '{{"row 1":{{"col 1":"a","col 2":"b"}},"row 2":{{"col 1":"c","col 2":"d"}}}}' - >>> pd.read_json(StringIO(_), orient='index') + >>> pd.read_json(StringIO(_), orient='index') # noqa: F821 col 1 col 2 row 1 a b row 2 c d @@ -737,7 +736,7 @@ def read_json( >>> df.to_json(orient='records') '[{{"col 1":"a","col 2":"b"}},{{"col 1":"c","col 2":"d"}}]' - >>> pd.read_json(StringIO(_), orient='records') + >>> pd.read_json(StringIO(_), orient='records') # noqa: F821 col 1 col 2 0 a b 1 c d @@ -881,18 +880,6 @@ def __init__( self.nrows = validate_integer("nrows", self.nrows, 0) if not self.lines: raise ValueError("nrows can only be passed if lines=True") - if ( - isinstance(filepath_or_buffer, str) - and not self.lines - and "\n" in filepath_or_buffer - ): - warnings.warn( - "Passing literal json to 'read_json' is deprecated and " - "will be removed in a future version. To read from a " - "literal string, wrap it in a 'StringIO' object.", - FutureWarning, - stacklevel=find_stack_level(), - ) if self.engine == "pyarrow": if not self.lines: raise ValueError( @@ -902,45 +889,22 @@ def __init__( self.data = filepath_or_buffer elif self.engine == "ujson": data = self._get_data_from_filepath(filepath_or_buffer) - self.data = self._preprocess_data(data) - - def _preprocess_data(self, data): - """ - At this point, the data either has a `read` attribute (e.g. a file - object or a StringIO) or is a string that is a JSON document. - - If self.chunksize, we prepare the data for the `__next__` method. - Otherwise, we read it into memory for the `read` method. - """ - if hasattr(data, "read") and not (self.chunksize or self.nrows): - with self: - data = data.read() - if not hasattr(data, "read") and (self.chunksize or self.nrows): - data = StringIO(data) - - return data + # If self.chunksize, we prepare the data for the `__next__` method. + # Otherwise, we read it into memory for the `read` method. + if not (self.chunksize or self.nrows): + with self: + self.data = data.read() + else: + self.data = data def _get_data_from_filepath(self, filepath_or_buffer): """ The function read_json accepts three input types: 1. 
filepath (string-like) 2. file-like object (e.g. open file object, StringIO) - 3. JSON string - - This method turns (1) into (2) to simplify the rest of the processing. - It returns input types (2) and (3) unchanged. - - It raises FileNotFoundError if the input is a string ending in - one of .json, .json.gz, .json.bz2, etc. but no such file exists. """ - # if it is a string but the file does not exist, it might be a JSON string filepath_or_buffer = stringify_path(filepath_or_buffer) - if ( - not isinstance(filepath_or_buffer, str) - or is_url(filepath_or_buffer) - or is_fsspec_url(filepath_or_buffer) - or file_exists(filepath_or_buffer) - ): + try: self.handles = get_handle( filepath_or_buffer, "r", @@ -949,23 +913,11 @@ def _get_data_from_filepath(self, filepath_or_buffer): storage_options=self.storage_options, errors=self.encoding_errors, ) - filepath_or_buffer = self.handles.handle - elif ( - isinstance(filepath_or_buffer, str) - and filepath_or_buffer.lower().endswith( - (".json",) + tuple(f".json{c}" for c in extension_to_compression) - ) - and not file_exists(filepath_or_buffer) - ): - raise FileNotFoundError(f"File {filepath_or_buffer} does not exist") - else: - warnings.warn( - "Passing literal json to 'read_json' is deprecated and " - "will be removed in a future version. To read from a " - "literal string, wrap it in a 'StringIO' object.", - FutureWarning, - stacklevel=find_stack_level(), - ) + except OSError as err: + raise FileNotFoundError( + f"File {filepath_or_buffer} does not exist" + ) from err + filepath_or_buffer = self.handles.handle return filepath_or_buffer def _combine_lines(self, lines) -> str: @@ -973,20 +925,17 @@ def _combine_lines(self, lines) -> str: Combines a list of JSON objects into one JSON object. """ return ( - f'[{",".join([line for line in (line.strip() for line in lines) if line])}]' + f"[{','.join([line for line in (line.strip() for line in lines) if line])}]" ) @overload - def read(self: JsonReader[Literal["frame"]]) -> DataFrame: - ... + def read(self: JsonReader[Literal["frame"]]) -> DataFrame: ... @overload - def read(self: JsonReader[Literal["series"]]) -> Series: - ... + def read(self: JsonReader[Literal["series"]]) -> Series: ... @overload - def read(self: JsonReader[Literal["frame", "series"]]) -> DataFrame | Series: - ... + def read(self: JsonReader[Literal["frame", "series"]]) -> DataFrame | Series: ... def read(self) -> DataFrame | Series: """ @@ -995,42 +944,63 @@ def read(self) -> DataFrame | Series: obj: DataFrame | Series with self: if self.engine == "pyarrow": - pyarrow_json = import_optional_dependency("pyarrow.json") - pa_table = pyarrow_json.read_json(self.data) + obj = self._read_pyarrow() + elif self.engine == "ujson": + obj = self._read_ujson() - mapping: type[ArrowDtype] | None | Callable - if self.dtype_backend == "pyarrow": - mapping = ArrowDtype - elif self.dtype_backend == "numpy_nullable": - from pandas.io._util import _arrow_dtype_mapping + return obj - mapping = _arrow_dtype_mapping().get - else: - mapping = None + def _read_pyarrow(self) -> DataFrame: + """ + Read JSON using the pyarrow engine. 
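# With the literal-JSON fallback removed above, in-memory documents are
# always wrapped explicitly, and nrows/chunksize now stream straight from
# the open handle instead of a re-wrapped StringIO:
from io import StringIO
import pandas as pd

df = pd.read_json(StringIO('{"a": {"0": 1, "1": 2}}'))
first_two = pd.read_json(
    StringIO('{"a": 1}\n{"a": 2}\n{"a": 3}\n'), lines=True, nrows=2
)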
+ """ + pyarrow_json = import_optional_dependency("pyarrow.json") + options = None + + if isinstance(self.dtype, dict): + pa = import_optional_dependency("pyarrow") + fields = [] + for field, dtype in self.dtype.items(): + pd_dtype = pandas_dtype(dtype) + if isinstance(pd_dtype, ArrowDtype): + fields.append((field, pd_dtype.pyarrow_dtype)) + + schema = pa.schema(fields) + options = pyarrow_json.ParseOptions( + explicit_schema=schema, unexpected_field_behavior="infer" + ) - return pa_table.to_pandas(types_mapper=mapping) - elif self.engine == "ujson": - if self.lines: - if self.chunksize: - obj = concat(self) - elif self.nrows: - lines = list(islice(self.data, self.nrows)) - lines_json = self._combine_lines(lines) - obj = self._get_object_parser(lines_json) - else: - data = ensure_str(self.data) - data_lines = data.split("\n") - obj = self._get_object_parser(self._combine_lines(data_lines)) - else: - obj = self._get_object_parser(self.data) - if self.dtype_backend is not lib.no_default: - return obj.convert_dtypes( - infer_objects=False, dtype_backend=self.dtype_backend - ) - else: - return obj - - def _get_object_parser(self, json) -> DataFrame | Series: + pa_table = pyarrow_json.read_json(self.data, parse_options=options) + df = arrow_table_to_pandas(pa_table, dtype_backend=self.dtype_backend) + + return df + + def _read_ujson(self) -> DataFrame | Series: + """ + Read JSON using the ujson engine. + """ + obj: DataFrame | Series + if self.lines: + if self.chunksize: + obj = concat(self) + elif self.nrows: + lines = list(islice(self.data, self.nrows)) + lines_json = self._combine_lines(lines) + obj = self._get_object_parser(lines_json) + else: + data = ensure_str(self.data) + data_lines = data.split("\n") + obj = self._get_object_parser(self._combine_lines(data_lines)) + else: + obj = self._get_object_parser(self.data) + if self.dtype_backend is not lib.no_default: + return obj.convert_dtypes( + infer_objects=False, dtype_backend=self.dtype_backend + ) + else: + return obj + + def _get_object_parser(self, json: str) -> DataFrame | Series: """ Parses a json document into a pandas object. """ @@ -1046,16 +1016,14 @@ def _get_object_parser(self, json) -> DataFrame | Series: "date_unit": self.date_unit, "dtype_backend": self.dtype_backend, } - obj = None if typ == "frame": - obj = FrameParser(json, **kwargs).parse() - - if typ == "series" or obj is None: + return FrameParser(json, **kwargs).parse() + elif typ == "series": if not isinstance(dtype, bool): kwargs["dtype"] = dtype - obj = SeriesParser(json, **kwargs).parse() - - return obj + return SeriesParser(json, **kwargs).parse() + else: + raise ValueError(f"{typ=} must be 'frame' or 'series'.") def close(self) -> None: """ @@ -1071,16 +1039,15 @@ def __iter__(self) -> Self: return self @overload - def __next__(self: JsonReader[Literal["frame"]]) -> DataFrame: - ... + def __next__(self: JsonReader[Literal["frame"]]) -> DataFrame: ... @overload - def __next__(self: JsonReader[Literal["series"]]) -> Series: - ... + def __next__(self: JsonReader[Literal["series"]]) -> Series: ... @overload - def __next__(self: JsonReader[Literal["frame", "series"]]) -> DataFrame | Series: - ... + def __next__( + self: JsonReader[Literal["frame", "series"]], + ) -> DataFrame | Series: ... 
def __next__(self) -> DataFrame | Series: if self.nrows and self.nrows_seen >= self.nrows: @@ -1169,7 +1136,6 @@ def __init__( self.convert_dates = convert_dates self.date_unit = date_unit self.keep_default_dates = keep_default_dates - self.obj: DataFrame | Series | None = None self.dtype_backend = dtype_backend @final @@ -1183,26 +1149,22 @@ def check_keys_split(self, decoded: dict) -> None: raise ValueError(f"JSON data had unexpected key(s): {bad_keys_joined}") @final - def parse(self): - self._parse() + def parse(self) -> DataFrame | Series: + obj = self._parse() - if self.obj is None: - return None if self.convert_axes: - self._convert_axes() - self._try_convert_types() - return self.obj + obj = self._convert_axes(obj) + obj = self._try_convert_types(obj) + return obj - def _parse(self): + def _parse(self) -> DataFrame | Series: raise AbstractMethodError(self) @final - def _convert_axes(self) -> None: + def _convert_axes(self, obj: DataFrame | Series) -> DataFrame | Series: """ Try to convert axes. """ - obj = self.obj - assert obj is not None # for mypy for axis_name in obj._AXIS_ORDERS: ax = obj._get_axis(axis_name) ser = Series(ax, dtype=ax.dtype, copy=False) @@ -1215,9 +1177,10 @@ def _convert_axes(self) -> None: ) if result: new_axis = Index(new_ser, dtype=new_ser.dtype, copy=False) - setattr(self.obj, axis_name, new_axis) + setattr(obj, axis_name, new_axis) + return obj - def _try_convert_types(self): + def _try_convert_types(self, obj): raise AbstractMethodError(self) @final @@ -1232,26 +1195,23 @@ def _try_convert_data( """ Try to parse a Series into a column by inferring dtype. """ + org_data = data # don't try to coerce, unless a force conversion if use_dtypes: if not self.dtype: if all(notna(data)): return data, False - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - "Downcasting object dtype arrays", - category=FutureWarning, - ) - filled = data.fillna(np.nan) + filled = data.fillna(np.nan) return filled, True elif self.dtype is True: pass - else: - # dtype to force + elif not _should_convert_dates( + convert_dates, self.keep_default_dates, name + ): + # convert_dates takes precedence over columns listed in dtypes dtype = ( self.dtype.get(name) if isinstance(self.dtype, dict) else self.dtype ) @@ -1262,10 +1222,11 @@ def _try_convert_data( return data, False if convert_dates: - new_data, result = self._try_convert_to_date(data) - if result: + new_data = self._try_convert_to_date(data) + if new_data is not data: return new_data, True + converted = False if self.dtype_backend is not lib.no_default and not is_axis: # Fall through for conversion later on return data, True @@ -1273,32 +1234,34 @@ def _try_convert_data( # try float try: data = data.astype("float64") + converted = True except (TypeError, ValueError): pass - if data.dtype.kind == "f": - if data.dtype != "float64": - # coerce floats to 64 - try: - data = data.astype("float64") - except (TypeError, ValueError): - pass + if data.dtype.kind == "f" and data.dtype != "float64": + # coerce floats to 64 + try: + data = data.astype("float64") + converted = True + except (TypeError, ValueError): + pass # don't coerce 0-len data if len(data) and data.dtype in ("float", "object"): # coerce ints if we can try: - new_data = data.astype("int64") + new_data = org_data.astype("int64") if (new_data == data).all(): data = new_data + converted = True except (TypeError, ValueError, OverflowError): pass - # coerce ints to 64 - if data.dtype == "int": - # coerce floats to 64 + if data.dtype == "int" and 
data.dtype != "int64": + # coerce ints to 64 try: data = data.astype("int64") + converted = True except (TypeError, ValueError): pass @@ -1307,19 +1270,19 @@ def _try_convert_data( if self.orient == "split": return data, False - return data, True + return data, converted @final - def _try_convert_to_date(self, data: Series) -> tuple[Series, bool]: + def _try_convert_to_date(self, data: Series) -> Series: """ Try to parse a ndarray like into a date column. Try to coerce object in epoch/iso formats and integer/float in epoch - formats. Return a boolean if parsing was successful. + formats. """ # no conversion on empty if not len(data): - return data, False + return data new_data = data @@ -1330,7 +1293,7 @@ def _try_convert_to_date(self, data: Series) -> tuple[Series, bool]: try: new_data = data.astype("int64") except OverflowError: - return data, False + return data except (TypeError, ValueError): pass @@ -1342,64 +1305,45 @@ def _try_convert_to_date(self, data: Series) -> tuple[Series, bool]: | (new_data._values == iNaT) ) if not in_range.all(): - return data, False + return data date_units = (self.date_unit,) if self.date_unit else self._STAMP_UNITS for date_unit in date_units: try: - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - ".*parsing datetimes with mixed time " - "zones will raise an error", - category=FutureWarning, - ) - new_data = to_datetime(new_data, errors="raise", unit=date_unit) + return to_datetime(new_data, errors="raise", unit=date_unit) except (ValueError, OverflowError, TypeError): continue - return new_data, True - return data, False + return data class SeriesParser(Parser): _default_orient = "index" _split_keys = ("name", "index", "data") - obj: Series | None - def _parse(self) -> None: + def _parse(self) -> Series: data = ujson_loads(self.json, precise_float=self.precise_float) if self.orient == "split": decoded = {str(k): v for k, v in data.items()} self.check_keys_split(decoded) - self.obj = Series(**decoded) + return Series(**decoded) else: - self.obj = Series(data) + return Series(data) - def _try_convert_types(self) -> None: - if self.obj is None: - return - obj, result = self._try_convert_data( - "data", self.obj, convert_dates=self.convert_dates - ) - if result: - self.obj = obj + def _try_convert_types(self, obj: Series) -> Series: + obj, _ = self._try_convert_data("data", obj, convert_dates=self.convert_dates) + return obj class FrameParser(Parser): _default_orient = "columns" _split_keys = ("columns", "index", "data") - obj: DataFrame | None - def _parse(self) -> None: + def _parse(self) -> DataFrame: json = self.json orient = self.orient - if orient == "columns": - self.obj = DataFrame( - ujson_loads(json, precise_float=self.precise_float), dtype=None - ) - elif orient == "split": + if orient == "split": decoded = { str(k): v for k, v in ujson_loads(json, precise_float=self.precise_float).items() @@ -1413,90 +1357,61 @@ def _parse(self) -> None: orig_names, is_potential_multi_index(orig_names, None), ) - self.obj = DataFrame(dtype=None, **decoded) + return DataFrame(dtype=None, **decoded) elif orient == "index": - self.obj = DataFrame.from_dict( + return DataFrame.from_dict( ujson_loads(json, precise_float=self.precise_float), dtype=None, orient="index", ) elif orient == "table": - self.obj = parse_table_schema(json, precise_float=self.precise_float) + return parse_table_schema(json, precise_float=self.precise_float) else: - self.obj = DataFrame( + # includes orient == "columns" + return DataFrame( ujson_loads(json, 
precise_float=self.precise_float), dtype=None ) - def _process_converter( - self, - f: Callable[[Hashable, Series], tuple[Series, bool]], - filt: Callable[[Hashable], bool] | None = None, - ) -> None: - """ - Take a conversion function and possibly recreate the frame. - """ - if filt is None: - filt = lambda col: True - - obj = self.obj - assert obj is not None # for mypy - - needs_new_obj = False - new_obj = {} - for i, (col, c) in enumerate(obj.items()): - if filt(col): - new_data, result = f(col, c) - if result: - c = new_data - needs_new_obj = True - new_obj[i] = c - - if needs_new_obj: - # possibly handle dup columns - new_frame = DataFrame(new_obj, index=obj.index) - new_frame.columns = obj.columns - self.obj = new_frame - - def _try_convert_types(self) -> None: - if self.obj is None: - return - if self.convert_dates: - self._try_convert_dates() - - self._process_converter( - lambda col, c: self._try_convert_data(col, c, convert_dates=False) + def _try_convert_types(self, obj: DataFrame) -> DataFrame: + arrays = [] + for col_label, series in obj.items(): + result, _ = self._try_convert_data( + col_label, + series, + convert_dates=_should_convert_dates( + self.convert_dates, + keep_default_dates=self.keep_default_dates, + col=col_label, + ), + ) + arrays.append(result.array) + return DataFrame._from_arrays( + arrays, obj.columns, obj.index, verify_integrity=False ) - def _try_convert_dates(self) -> None: - if self.obj is None: - return - - # our columns to parse - convert_dates_list_bool = self.convert_dates - if isinstance(convert_dates_list_bool, bool): - convert_dates_list_bool = [] - convert_dates = set(convert_dates_list_bool) - - def is_ok(col) -> bool: - """ - Return if this col is ok to try for a date parse. - """ - if col in convert_dates: - return True - if not self.keep_default_dates: - return False - if not isinstance(col, str): - return False - - col_lower = col.lower() - if ( - col_lower.endswith(("_at", "_time")) - or col_lower == "modified" - or col_lower == "date" - or col_lower == "datetime" - or col_lower.startswith("timestamp") - ): - return True - return False - self._process_converter(lambda col, c: self._try_convert_to_date(c), filt=is_ok) +def _should_convert_dates( + convert_dates: bool | list[str], + keep_default_dates: bool, + col: Hashable, +) -> bool: + """ + Return bool whether a DataFrame column should be cast to datetime. 
+ """ + if convert_dates is False: + # convert_dates=True means follow keep_default_dates + return False + elif not isinstance(convert_dates, bool) and col in set(convert_dates): + return True + elif not keep_default_dates: + return False + elif not isinstance(col, str): + return False + col_lower = col.lower() + if ( + col_lower.endswith(("_at", "_time")) + or col_lower in {"modified", "date", "datetime"} + or col_lower.startswith("timestamp") + ): + return True + return False diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index b1e2210f9d894..642408b35ba24 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -11,6 +11,7 @@ TYPE_CHECKING, Any, DefaultDict, + overload, ) import numpy as np @@ -18,7 +19,10 @@ from pandas._libs.writers import convert_json_to_lines import pandas as pd -from pandas import DataFrame +from pandas import ( + DataFrame, + Series, +) if TYPE_CHECKING: from collections.abc import Iterable @@ -42,13 +46,33 @@ def convert_to_line_delimits(s: str) -> str: return convert_json_to_lines(s) +@overload +def nested_to_record( + ds: dict, + prefix: str = ..., + sep: str = ..., + level: int = ..., + max_level: int | None = ..., +) -> dict[str, Any]: ... + + +@overload def nested_to_record( - ds, + ds: list[dict], + prefix: str = ..., + sep: str = ..., + level: int = ..., + max_level: int | None = ..., +) -> list[dict[str, Any]]: ... + + +def nested_to_record( + ds: dict | list[dict], prefix: str = "", sep: str = ".", level: int = 0, max_level: int | None = None, -): +) -> dict[str, Any] | list[dict[str, Any]]: """ A simplified json_normalize @@ -123,7 +147,7 @@ def nested_to_record( return new_ds -def _normalise_json( +def _normalize_json( data: Any, key_string: str, normalized_dict: dict[str, Any], @@ -153,7 +177,7 @@ def _normalise_json( if not key_string: new_key = new_key.removeprefix(separator) - _normalise_json( + _normalize_json( data=value, key_string=new_key, normalized_dict=normalized_dict, @@ -164,7 +188,7 @@ def _normalise_json( return normalized_dict -def _normalise_json_ordered(data: dict[str, Any], separator: str) -> dict[str, Any]: +def _normalize_json_ordered(data: dict[str, Any], separator: str) -> dict[str, Any]: """ Order the top level keys and then recursively go to depth @@ -177,10 +201,10 @@ def _normalise_json_ordered(data: dict[str, Any], separator: str) -> dict[str, A Returns ------- - dict or list of dicts, matching `normalised_json_object` + dict or list of dicts, matching `normalized_json_object` """ top_dict_ = {k: v for k, v in data.items() if not isinstance(v, dict)} - nested_dict_ = _normalise_json( + nested_dict_ = _normalize_json( data={k: v for k, v in data.items() if isinstance(v, dict)}, key_string="", normalized_dict={}, @@ -211,7 +235,7 @@ def _simple_json_normalize( Returns ------- frame : DataFrame - d - dict or list of dicts, matching `normalised_json_object` + d - dict or list of dicts, matching `normalized_json_object` Examples -------- @@ -232,18 +256,18 @@ def _simple_json_normalize( } """ - normalised_json_object = {} + normalized_json_object = {} # expect a dictionary, as most jsons are. 
However, lists are perfectly valid if isinstance(ds, dict): - normalised_json_object = _normalise_json_ordered(data=ds, separator=sep) + normalized_json_object = _normalize_json_ordered(data=ds, separator=sep) elif isinstance(ds, list): - normalised_json_list = [_simple_json_normalize(row, sep=sep) for row in ds] - return normalised_json_list - return normalised_json_object + normalized_json_list = [_simple_json_normalize(row, sep=sep) for row in ds] + return normalized_json_list + return normalized_json_object def json_normalize( - data: dict | list[dict], + data: dict | list[dict] | Series, record_path: str | list | None = None, meta: str | list[str | list[str]] | None = None, meta_prefix: str | None = None, @@ -255,9 +279,13 @@ def json_normalize( """ Normalize semi-structured JSON data into a flat table. + This method is designed to transform semi-structured JSON data, such as nested + dictionaries or lists, into a flat table. This is particularly useful when + handling JSON-like data structures that contain deeply nested fields. + Parameters ---------- - data : dict or list of dicts + data : dict, list of dicts, or Series of dicts Unserialized JSON objects. record_path : str or list of str, default None Path in each object to list of records. If not passed, data will be @@ -265,10 +293,10 @@ def json_normalize( meta : list of paths (str or list of str), default None Fields to use as metadata for each record in resulting table. meta_prefix : str, default None - If True, prefix records with dotted (?) path, e.g. foo.bar.field if + If True, prefix records with dotted path, e.g. foo.bar.field if meta is ['foo', 'bar']. record_prefix : str, default None - If True, prefix records with dotted (?) path, e.g. foo.bar.field if + If True, prefix records with dotted path, e.g. foo.bar.field if path to records is ['foo', 'bar']. errors : {'raise', 'ignore'}, default 'raise' Configures error handling. @@ -286,8 +314,13 @@ def json_normalize( Returns ------- - frame : DataFrame - Normalize semi-structured JSON data into a flat table. + DataFrame + The normalized data, represented as a pandas DataFrame. + + See Also + -------- + DataFrame : Two-dimensional, size-mutable, potentially heterogeneous tabular data. + Series : One-dimensional ndarray with axis labels (including time series). Examples -------- @@ -342,6 +375,26 @@ def json_normalize( 1 NaN Mark Reg 130 60 2 2.0 Faye Raker 130 60 + >>> data = [ + ... { + ... "id": 1, + ... "name": "Cole Volk", + ... "fitness": {"height": 130, "weight": 60}, + ... }, + ... {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}}, + ... { + ... "id": 2, + ... "name": "Faye Raker", + ... "fitness": {"height": 130, "weight": 60}, + ... }, + ... ] + >>> series = pd.Series(data, index=pd.Index(["a", "b", "c"])) + >>> pd.json_normalize(series) + id name fitness.height fitness.weight + a 1.0 Cole Volk 130 60 + b NaN Mark Reg 130 60 + c 2.0 Faye Raker 130 60 + >>> data = [ ... { ... "state": "Florida", @@ -427,11 +480,16 @@ def _pull_records(js: dict[str, Any], spec: list | str) -> list: result = [] else: raise TypeError( - f"{js} has non list value {result} for path {spec}. " - "Must be list or null." 
+ f"Path must contain list or null, " + f"but got {type(result).__name__} at {spec!r}" ) return result + if isinstance(data, Series): + index = data.index + else: + index = None + if isinstance(data, list) and not data: return DataFrame() elif isinstance(data, dict): @@ -454,7 +512,7 @@ def _pull_records(js: dict[str, Any], spec: list | str) -> list: and record_prefix is None and max_level is None ): - return DataFrame(_simple_json_normalize(data, sep=sep)) + return DataFrame(_simple_json_normalize(data, sep=sep), index=index) if record_path is None: if any([isinstance(x, dict) for x in y.values()] for y in data): @@ -466,7 +524,7 @@ def _pull_records(js: dict[str, Any], spec: list | str) -> list: # TODO: handle record value which are lists, at least error # reasonably data = nested_to_record(data, sep=sep, max_level=max_level) - return DataFrame(data) + return DataFrame(data, index=index) elif not isinstance(record_path, list): record_path = [record_path] @@ -537,8 +595,10 @@ def _recursive_extract(data, path, seen_meta, level: int = 0) -> None: if values.ndim > 1: # GH 37782 values = np.empty((len(v),), dtype=object) - for i, v in enumerate(v): - values[i] = v + for i, val in enumerate(v): + values[i] = val result[k] = values.repeat(lengths) + if index is not None: + result.index = index.repeat(lengths) return result diff --git a/pandas/io/json/_table_schema.py b/pandas/io/json/_table_schema.py index 4d9fba72cf173..7879be18b52c9 100644 --- a/pandas/io/json/_table_schema.py +++ b/pandas/io/json/_table_schema.py @@ -3,6 +3,7 @@ https://specs.frictionlessdata.io/table-schema/ """ + from __future__ import annotations from typing import ( @@ -15,7 +16,6 @@ from pandas._libs import lib from pandas._libs.json import ujson_loads from pandas._libs.tslibs import timezones -from pandas._libs.tslibs.dtypes import freq_to_period_freqstr from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.base import _registry as registry @@ -114,7 +114,7 @@ def set_default_names(data): ) return data - data = data.copy() + data = data.copy(deep=False) if data.index.nlevels > 1: data.index.names = com.fill_missing_names(data.index.names) else: @@ -144,11 +144,11 @@ def convert_pandas_type_to_json_field(arr) -> dict[str, JSONSerializable]: field["freq"] = dtype.freq.freqstr elif isinstance(dtype, DatetimeTZDtype): if timezones.is_utc(dtype.tz): - # timezone.utc has no "zone" attr field["tz"] = "UTC" else: - # error: "tzinfo" has no attribute "zone" - field["tz"] = dtype.tz.zone # type: ignore[attr-defined] + zone = timezones.get_timezone(dtype.tz) + if isinstance(zone, str): + field["tz"] = zone elif isinstance(dtype, ExtensionDtype): field["extDtype"] = dtype.name return field @@ -212,8 +212,7 @@ def convert_json_field_to_pandas_type(field) -> str | CategoricalDtype: elif field.get("freq"): # GH#9586 rename frequency M to ME for offsets offset = to_offset(field["freq"]) - freq_n, freq_name = offset.n, offset.name - freq = freq_to_period_freqstr(freq_n, freq_name) + freq = PeriodDtype(offset)._freqstr # GH#47747 using datetime over period to minimize the change surface return f"period[{freq}]" else: @@ -240,9 +239,16 @@ def build_table_schema( """ Create a Table schema from ``data``. + This method is a utility to generate a JSON-serializable schema + representation of a pandas Series or DataFrame, compatible with the + Table Schema specification. It enables structured data to be shared + and validated in various applications, ensuring consistency and + interoperability. 
+ Parameters ---------- - data : Series, DataFrame + data : Series or DataFrame + The input data for which the table schema is to be created. index : bool, default True Whether to include ``data.index`` in the schema. primary_key : bool or None, default True @@ -257,6 +263,12 @@ def build_table_schema( Returns ------- dict + A dictionary representing the Table schema. + + See Also + -------- + DataFrame.to_json : Convert the object to a JSON string. + read_json : Convert a JSON string to pandas object. Notes ----- @@ -276,8 +288,8 @@ def build_table_schema( >>> df = pd.DataFrame( ... {'A': [1, 2, 3], ... 'B': ['a', 'b', 'c'], - ... 'C': pd.date_range('2016-01-01', freq='d', periods=3), - ... }, index=pd.Index(range(3), name='idx')) + ... 'C': pd.date_range('2016-01-01', freq='D', periods=3), + ... }, index=pd.Index(range(3), name='idx')) >>> build_table_schema(df) {'fields': \ [{'name': 'idx', 'type': 'integer'}, \ diff --git a/pandas/io/orc.py b/pandas/io/orc.py index fed9463c38d5d..02e0ec5247e74 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -1,24 +1,21 @@ -""" orc compat """ +"""orc compat""" + from __future__ import annotations import io -from types import ModuleType from typing import ( TYPE_CHECKING, Any, Literal, ) -from pandas._config import using_pyarrow_string_dtype - from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.util._validators import check_dtype_backend -import pandas as pd from pandas.core.indexes.api import default_index -from pandas.io._util import arrow_string_types_mapper +from pandas.io._util import arrow_table_to_pandas from pandas.io.common import ( get_handle, is_fsspec_url, @@ -48,6 +45,13 @@ def read_orc( """ Load an ORC object from the file path, returning a DataFrame. + This method reads an ORC (Optimized Row Columnar) file into a pandas + DataFrame using the `pyarrow.orc` library. ORC is a columnar storage format + that provides efficient compression and fast retrieval for analytical workloads. + It allows reading specific columns, handling different filesystem + types (such as local storage, cloud storage via fsspec, or pyarrow filesystem), + and supports different data type backends, including `numpy_nullable` and `pyarrow`. + Parameters ---------- path : str, path object, or file-like object @@ -61,19 +65,20 @@ def read_orc( Output always follows the ordering of the file and not the columns list. This mirrors the original behaviour of :external+pyarrow:py:meth:`pyarrow.orc.ORCFile.read`. - dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable' + dtype_backend : {'numpy_nullable', 'pyarrow'} Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). Behaviour is as follows: + (still experimental). If not specified, the default behavior + is to not use nullable data types. If specified, the behavior + is as follows: * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. + * ``"pyarrow"``: returns pyarrow-backed nullable + :class:`ArrowDtype` :class:`DataFrame` .. versionadded:: 2.0 filesystem : fsspec or pyarrow filesystem, default None - Filesystem object to use when reading the parquet file. + Filesystem object to use when reading the orc file. .. versionadded:: 2.1.0 @@ -83,6 +88,15 @@ def read_orc( Returns ------- DataFrame + DataFrame based on the ORC file. 
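Since `read_orc` now routes through `arrow_table_to_pandas` (see the hunk below), the nullable backends behave like the other arrow-based readers. A round-trip sketch, assuming pyarrow is installed and the working directory is writable:

import pandas as pd

pd.DataFrame({"a": [1, 2]}).to_orc("example.orc")
df = pd.read_orc("example.orc", dtype_backend="numpy_nullable")
df["a"].dtype  # Int64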
+ + See Also + -------- + read_csv : Read a comma-separated values (csv) file into a pandas DataFrame. + read_excel : Read an Excel file into a pandas DataFrame. + read_spss : Read an SPSS file into a pandas DataFrame. + read_sas : Load a SAS file into a pandas DataFrame. + read_feather : Load a feather-format object into a pandas DataFrame. Notes ----- @@ -98,7 +112,7 @@ def read_orc( -------- >>> result = pd.read_orc("example_pa.orc") # doctest: +SKIP """ - # we require a newer version of pyarrow than we support for parquet + # we require a newer version of pyarrow than we support for orc orc = import_optional_dependency("pyarrow.orc") @@ -117,21 +131,7 @@ def read_orc( pa_table = orc.read_table( source=source, columns=columns, filesystem=filesystem, **kwargs ) - if dtype_backend is not lib.no_default: - if dtype_backend == "pyarrow": - df = pa_table.to_pandas(types_mapper=pd.ArrowDtype) - else: - from pandas.io._util import _arrow_dtype_mapping - - mapping = _arrow_dtype_mapping() - df = pa_table.to_pandas(types_mapper=mapping.get) - return df - else: - if using_pyarrow_string_dtype(): - types_mapper = arrow_string_types_mapper() - else: - types_mapper = None - return pa_table.to_pandas(types_mapper=types_mapper) + return arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend) def to_orc( @@ -218,7 +218,6 @@ def to_orc( if engine != "pyarrow": raise ValueError("engine must be 'pyarrow'") - engine = import_optional_dependency(engine, min_version="10.0.1") pa = import_optional_dependency("pyarrow") orc = import_optional_dependency("pyarrow.orc") @@ -227,10 +226,9 @@ def to_orc( path = io.BytesIO() assert path is not None # For mypy with get_handle(path, "wb", is_text=False) as handles: - assert isinstance(engine, ModuleType) # For mypy try: orc.write_table( - engine.Table.from_pandas(df, preserve_index=index), + pa.Table.from_pandas(df, preserve_index=index), handles.handle, **engine_kwargs, ) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 9570d6f8b26bd..6a5a83088e986 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -1,4 +1,5 @@ -""" parquet compat """ +"""parquet compat""" + from __future__ import annotations import io @@ -9,27 +10,24 @@ Any, Literal, ) -import warnings -from warnings import catch_warnings - -from pandas._config import using_pyarrow_string_dtype -from pandas._config.config import _get_option +from warnings import ( + catch_warnings, + filterwarnings, +) from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.errors import AbstractMethodError from pandas.util._decorators import doc -from pandas.util._exceptions import find_stack_level from pandas.util._validators import check_dtype_backend -import pandas as pd from pandas import ( DataFrame, get_option, ) from pandas.core.shared_docs import _shared_docs -from pandas.io._util import arrow_string_types_mapper +from pandas.io._util import arrow_table_to_pandas from pandas.io.common import ( IOHandles, get_handle, @@ -151,7 +149,7 @@ def validate_dataframe(df: DataFrame) -> None: if not isinstance(df, DataFrame): raise ValueError("to_parquet only supports IO with DataFrames") - def write(self, df: DataFrame, path, compression, **kwargs): + def write(self, df: DataFrame, path, compression, **kwargs) -> None: raise AbstractMethodError(self) def read(self, path, columns=None, **kwargs) -> DataFrame: @@ -241,29 +239,14 @@ def read( path, columns=None, filters=None, - use_nullable_dtypes: bool = False, dtype_backend: DtypeBackend | lib.NoDefault = 
lib.no_default, storage_options: StorageOptions | None = None, filesystem=None, + to_pandas_kwargs: dict[str, Any] | None = None, **kwargs, ) -> DataFrame: kwargs["use_pandas_metadata"] = True - to_pandas_kwargs = {} - if dtype_backend == "numpy_nullable": - from pandas.io._util import _arrow_dtype_mapping - - mapping = _arrow_dtype_mapping() - to_pandas_kwargs["types_mapper"] = mapping.get - elif dtype_backend == "pyarrow": - to_pandas_kwargs["types_mapper"] = pd.ArrowDtype # type: ignore[assignment] - elif using_pyarrow_string_dtype(): - to_pandas_kwargs["types_mapper"] = arrow_string_types_mapper() - - manager = _get_option("mode.data_manager", silent=True) - if manager == "array": - to_pandas_kwargs["split_blocks"] = True # type: ignore[assignment] - path_or_handle, handles, filesystem = _get_path_or_handle( path, filesystem, @@ -278,10 +261,17 @@ def read( filters=filters, **kwargs, ) - result = pa_table.to_pandas(**to_pandas_kwargs) - - if manager == "array": - result = result._as_manager("array", copy=False) + with catch_warnings(): + filterwarnings( + "ignore", + "make_block is deprecated", + DeprecationWarning, + ) + result = arrow_table_to_pandas( + pa_table, + dtype_backend=dtype_backend, + to_pandas_kwargs=to_pandas_kwargs, + ) if pa_table.schema.metadata: if b"PANDAS_ATTRS" in pa_table.schema.metadata: @@ -362,18 +352,13 @@ def read( filters=None, storage_options: StorageOptions | None = None, filesystem=None, + to_pandas_kwargs: dict | None = None, **kwargs, ) -> DataFrame: parquet_kwargs: dict[str, Any] = {} - use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False) dtype_backend = kwargs.pop("dtype_backend", lib.no_default) # We are disabling nullable dtypes for fastparquet pending discussion parquet_kwargs["pandas_nulls"] = False - if use_nullable_dtypes: - raise ValueError( - "The 'use_nullable_dtypes' argument is not supported for the " - "fastparquet engine" - ) if dtype_backend is not lib.no_default: raise ValueError( "The 'dtype_backend' argument is not supported for the " @@ -383,6 +368,10 @@ def read( raise NotImplementedError( "filesystem is not implemented for the fastparquet engine." ) + if to_pandas_kwargs is not None: + raise NotImplementedError( + "to_pandas_kwargs is not implemented for the fastparquet engine." + ) path = stringify_path(path) handles = None if is_fsspec_url(path): @@ -400,7 +389,15 @@ def read( try: parquet_file = self.api.ParquetFile(path, **parquet_kwargs) - return parquet_file.to_pandas(columns=columns, filters=filters, **kwargs) + with catch_warnings(): + filterwarnings( + "ignore", + "make_block is deprecated", + DeprecationWarning, + ) + return parquet_file.to_pandas( + columns=columns, filters=filters, **kwargs + ) finally: if handles is not None: handles.close() @@ -465,7 +462,7 @@ def to_parquet( .. versionadded:: 2.1.0 kwargs - Additional keyword arguments passed to the engine + Additional keyword arguments passed to the engine. Returns ------- @@ -501,15 +498,18 @@ def read_parquet( engine: str = "auto", columns: list[str] | None = None, storage_options: StorageOptions | None = None, - use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, filesystem: Any = None, filters: list[tuple] | list[list[tuple]] | None = None, + to_pandas_kwargs: dict | None = None, **kwargs, ) -> DataFrame: """ Load a parquet object from the file path, returning a DataFrame. 
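The new `to_pandas_kwargs` passthrough gives callers direct control over `pyarrow.Table.to_pandas` (pyarrow engine only; fastparquet raises `NotImplementedError`). A sketch with a hypothetical file name:

import pandas as pd

pd.DataFrame({"ts": pd.to_datetime(["2024-01-01"])}).to_parquet("t.parquet")
df = pd.read_parquet(
    "t.parquet",
    to_pandas_kwargs={"timestamp_as_object": True},  # forwarded to pyarrow
)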
+ The function automatically handles reading the data from a parquet file + and creates a DataFrame with the appropriate structure. + Parameters ---------- path : str, path object or file-like object @@ -539,25 +539,15 @@ def read_parquet( .. versionadded:: 1.3.0 - use_nullable_dtypes : bool, default False - If True, use dtypes that use ``pd.NA`` as missing value indicator - for the resulting DataFrame. (only applicable for the ``pyarrow`` - engine) - As new dtypes are added that support ``pd.NA`` in the future, the - output with this option will change to use those dtypes. - Note: this is an experimental option, and behaviour (e.g. additional - support dtypes) may change without notice. - - .. deprecated:: 2.0 - - dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable' + dtype_backend : {{'numpy_nullable', 'pyarrow'}} Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). Behaviour is as follows: + (still experimental). If not specified, the default behavior + is to not use nullable data types. If specified, the behavior + is as follows: * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. + * ``"pyarrow"``: returns pyarrow-backed nullable + :class:`ArrowDtype` :class:`DataFrame` .. versionadded:: 2.0 @@ -585,12 +575,19 @@ def read_parquet( .. versionadded:: 2.1.0 + to_pandas_kwargs : dict | None, default None + Keyword arguments to pass through to :func:`pyarrow.Table.to_pandas` + when ``engine="pyarrow"``. + + .. versionadded:: 3.0.0 + **kwargs Any additional kwargs are passed to the engine. Returns ------- DataFrame + DataFrame based on parquet file. See Also -------- @@ -598,9 +595,7 @@ def read_parquet( Examples -------- - >>> original_df = pd.DataFrame( - ... {{"foo": range(5), "bar": range(5, 10)}} - ... ) + >>> original_df = pd.DataFrame({{"foo": range(5), "bar": range(5, 10)}}) >>> original_df foo bar 0 0 5 @@ -628,7 +623,7 @@ def read_parquet( 2 7 3 8 4 9 - >>> restored_bar.equals(original_df[['bar']]) + >>> restored_bar.equals(original_df[["bar"]]) True The function uses `kwargs` that are passed directly to the engine. @@ -649,19 +644,6 @@ def read_parquet( """ impl = get_engine(engine) - - if use_nullable_dtypes is not lib.no_default: - msg = ( - "The argument 'use_nullable_dtypes' is deprecated and will be removed " - "in a future version." - ) - if use_nullable_dtypes is True: - msg += ( - "Use dtype_backend='numpy_nullable' instead of use_nullable_dtype=True." 
- ) - warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) - else: - use_nullable_dtypes = False check_dtype_backend(dtype_backend) return impl.read( @@ -669,8 +651,8 @@ def read_parquet( columns=columns, filters=filters, storage_options=storage_options, - use_nullable_dtypes=use_nullable_dtypes, dtype_backend=dtype_backend, filesystem=filesystem, + to_pandas_kwargs=to_pandas_kwargs, **kwargs, ) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 66a7ccacf675b..8cadde1ad6537 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -3,8 +3,6 @@ from typing import TYPE_CHECKING import warnings -from pandas._config import using_pyarrow_string_dtype - from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.errors import ( @@ -16,18 +14,14 @@ from pandas.core.dtypes.common import pandas_dtype from pandas.core.dtypes.inference import is_integer -import pandas as pd -from pandas import DataFrame - -from pandas.io._util import ( - _arrow_dtype_mapping, - arrow_string_types_mapper, -) +from pandas.io._util import arrow_table_to_pandas from pandas.io.parsers.base_parser import ParserBase if TYPE_CHECKING: from pandas._typing import ReadBuffer + from pandas import DataFrame + class ArrowParserWrapper(ParserBase): """ @@ -41,7 +35,7 @@ def __init__(self, src: ReadBuffer[bytes], **kwds) -> None: self._parse_kwds() - def _parse_kwds(self): + def _parse_kwds(self) -> None: """ Validates keywords before passing to pyarrow. """ @@ -99,12 +93,12 @@ def _get_pyarrow_options(self) -> None: if callable(on_bad_lines): self.parse_options["invalid_row_handler"] = on_bad_lines elif on_bad_lines == ParserBase.BadLineHandleMethod.ERROR: - self.parse_options[ - "invalid_row_handler" - ] = None # PyArrow raises an exception by default + self.parse_options["invalid_row_handler"] = ( + None # PyArrow raises an exception by default + ) elif on_bad_lines == ParserBase.BadLineHandleMethod.WARN: - def handle_warning(invalid_row): + def handle_warning(invalid_row) -> str: warnings.warn( f"Expected {invalid_row.expected_columns} columns, but found " f"{invalid_row.actual_columns}: {invalid_row.text}", @@ -171,11 +165,12 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame: # The only way self.names is not the same length as number of cols is # if we have int index_col. We should just pad the names(they will get # removed anyways) to expected length then. 
- self.names = list(range(num_cols - len(self.names))) + self.names + columns_prefix = [str(x) for x in range(num_cols - len(self.names))] + self.names = columns_prefix + self.names multi_index_named = False frame.columns = self.names - # we only need the frame not the names - _, frame = self._do_date_conversions(frame.columns, frame) + + frame = self._do_date_conversions(frame.columns, frame) if self.index_col is not None: index_to_set = self.index_col.copy() for i, item in enumerate(self.index_col): @@ -214,12 +209,12 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame: self.dtype = pandas_dtype(self.dtype) try: frame = frame.astype(self.dtype) - except TypeError as e: + except TypeError as err: # GH#44901 reraise to keep api consistent - raise ValueError(e) + raise ValueError(str(err)) from err return frame - def _validate_usecols(self, usecols): + def _validate_usecols(self, usecols) -> None: if lib.is_list_like(usecols) and not all(isinstance(x, str) for x in usecols): raise ValueError( "The pyarrow engine does not allow 'usecols' to be integer " @@ -247,7 +242,7 @@ def read(self) -> DataFrame: try: convert_options = pyarrow_csv.ConvertOptions(**self.convert_options) - except TypeError: + except TypeError as err: include = self.convert_options.get("include_columns", None) if include is not None: self._validate_usecols(include) @@ -258,7 +253,7 @@ def read(self) -> DataFrame: ): raise TypeError( "The 'pyarrow' engine requires all na_values to be strings" - ) + ) from err raise @@ -287,17 +282,14 @@ def read(self) -> DataFrame: table = table.cast(new_schema) - if dtype_backend == "pyarrow": - frame = table.to_pandas(types_mapper=pd.ArrowDtype) - elif dtype_backend == "numpy_nullable": - # Modify the default mapping to also - # map null to Int64 (to match other engines) - dtype_mapping = _arrow_dtype_mapping() - dtype_mapping[pa.null()] = pd.Int64Dtype() - frame = table.to_pandas(types_mapper=dtype_mapping.get) - elif using_pyarrow_string_dtype(): - frame = table.to_pandas(types_mapper=arrow_string_types_mapper()) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "make_block is deprecated", + DeprecationWarning, + ) + frame = arrow_table_to_pandas( + table, dtype_backend=dtype_backend, null_to_int64=True + ) - else: - frame = table.to_pandas() return self._finalize_pandas_output(frame) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 09f0f2af8e5c6..c283f600eb971 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -3,13 +3,11 @@ from collections import defaultdict from copy import copy import csv -import datetime from enum import Enum import itertools from typing import ( TYPE_CHECKING, Any, - Callable, cast, final, overload, @@ -24,7 +22,6 @@ ) import pandas._libs.ops as libops from pandas._libs.parsers import STR_NA_VALUES -from pandas._libs.tslibs import parsing from pandas.compat._optional import import_optional_dependency from pandas.errors import ( ParserError, @@ -32,45 +29,31 @@ ) from pandas.util._exceptions import find_stack_level -from pandas.core.dtypes.astype import astype_array from pandas.core.dtypes.common import ( - ensure_object, is_bool_dtype, is_dict_like, - is_extension_array_dtype, is_float_dtype, is_integer, is_integer_dtype, is_list_like, is_object_dtype, - is_scalar, is_string_dtype, - pandas_dtype, -) -from pandas.core.dtypes.dtypes import ( - CategoricalDtype, - ExtensionDtype, ) from pandas.core.dtypes.missing import isna from pandas import ( - 
ArrowDtype, DataFrame, DatetimeIndex, StringDtype, - concat, ) from pandas.core import algorithms from pandas.core.arrays import ( ArrowExtensionArray, BaseMaskedArray, BooleanArray, - Categorical, - ExtensionArray, FloatingArray, IntegerArray, ) -from pandas.core.arrays.boolean import BooleanDtype from pandas.core.indexes.api import ( Index, MultiIndex, @@ -84,7 +67,7 @@ if TYPE_CHECKING: from collections.abc import ( - Hashable, + Callable, Iterable, Mapping, Sequence, @@ -93,8 +76,10 @@ from pandas._typing import ( ArrayLike, DtypeArg, - DtypeObj, + Hashable, + HashableT, Scalar, + SequenceT, ) @@ -109,7 +94,6 @@ class BadLineHandleMethod(Enum): keep_default_na: bool dayfirst: bool cache_dates: bool - keep_date_col: bool usecols_dtype: str | None def __init__(self, kwds) -> None: @@ -123,12 +107,17 @@ def __init__(self, kwds) -> None: self.index_names: Sequence[Hashable] | None = None self.col_names: Sequence[Hashable] | None = None - self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False)) - self._parse_date_cols: Iterable = [] + parse_dates = kwds.pop("parse_dates", False) + if parse_dates is None or lib.is_bool(parse_dates): + parse_dates = bool(parse_dates) + elif not isinstance(parse_dates, list): + raise TypeError( + "Only booleans and lists are accepted for the 'parse_dates' parameter" + ) + self.parse_dates: bool | list = parse_dates self.date_parser = kwds.pop("date_parser", lib.no_default) self.date_format = kwds.pop("date_format", None) self.dayfirst = kwds.pop("dayfirst", False) - self.keep_date_col = kwds.pop("keep_date_col", False) self.na_values = kwds.get("na_values") self.na_fvalues = kwds.get("na_fvalues") @@ -143,13 +132,6 @@ def __init__(self, kwds) -> None: self.false_values = kwds.get("false_values") self.cache_dates = kwds.pop("cache_dates", True) - self._date_conv = _make_date_converter( - date_parser=self.date_parser, - date_format=self.date_format, - dayfirst=self.dayfirst, - cache_dates=self.cache_dates, - ) - # validate header options for mi self.header = kwds.get("header") if is_list_like(self.header, allow_sets=False): @@ -172,99 +154,27 @@ def __init__(self, kwds) -> None: and all(map(is_integer, self.index_col)) ): raise ValueError( - "index_col must only contain row numbers " + "index_col must only contain integers of column positions " "when specifying a multi-index header" ) else: self.index_col = list(self.index_col) - self._name_processed = False - self._first_chunk = True - self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"]) + self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) # Fallback to error to pass a sketchy test(test_override_set_noconvert_columns) # Normally, this arg would get pre-processed earlier on self.on_bad_lines = kwds.get("on_bad_lines", self.BadLineHandleMethod.ERROR) - def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> Iterable: - """ - Check if parse_dates are in columns. - - If user has provided names for parse_dates, check if those columns - are available. - - Parameters - ---------- - columns : list - List of names of the dataframe. - - Returns - ------- - The names of the columns which will get parsed later if a dict or list - is given as specification. - - Raises - ------ - ValueError - If column to parse_date is not in dataframe. 
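Since ``parse_dates`` is now validated up front to be a boolean or a flat list, the long-deprecated column-combining specs are rejected at parser construction; an illustrative round trip (CSV content made up):

    import io
    import pandas as pd

    data = io.StringIO("date,value\n2024-01-01,1\n2024-01-02,2\n")
    df = pd.read_csv(data, parse_dates=["date"])  # flat list: accepted
    # A dict spec such as parse_dates={"dt": ["date", "time"]} now fails with:
    # TypeError: Only booleans and lists are accepted for the 'parse_dates' parameter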
- - """ - cols_needed: Iterable - if is_dict_like(self.parse_dates): - cols_needed = itertools.chain(*self.parse_dates.values()) - elif is_list_like(self.parse_dates): - # a column in parse_dates could be represented - # ColReference = Union[int, str] - # DateGroups = List[ColReference] - # ParseDates = Union[DateGroups, List[DateGroups], - # Dict[ColReference, DateGroups]] - cols_needed = itertools.chain.from_iterable( - col if is_list_like(col) and not isinstance(col, tuple) else [col] - for col in self.parse_dates - ) - else: - cols_needed = [] - - cols_needed = list(cols_needed) - - # get only columns that are references using names (str), not by index - missing_cols = ", ".join( - sorted( - { - col - for col in cols_needed - if isinstance(col, str) and col not in columns - } - ) - ) - if missing_cols: - raise ValueError( - f"Missing column provided to 'parse_dates': '{missing_cols}'" - ) - # Convert positions to actual column names - return [ - col if (isinstance(col, str) or col in columns) else columns[col] - for col in cols_needed - ] - def close(self) -> None: pass - @final - @property - def _has_complex_date_col(self) -> bool: - return isinstance(self.parse_dates, dict) or ( - isinstance(self.parse_dates, list) - and len(self.parse_dates) > 0 - and isinstance(self.parse_dates[0], list) - ) - @final def _should_parse_dates(self, i: int) -> bool: - if lib.is_bool(self.parse_dates): - return bool(self.parse_dates) + if isinstance(self.parse_dates, bool): + return self.parse_dates else: if self.index_names is not None: name = self.index_names[i] @@ -350,97 +260,48 @@ def extract(r): @final def _maybe_make_multi_index_columns( self, - columns: Sequence[Hashable], + columns: SequenceT, col_names: Sequence[Hashable] | None = None, - ) -> Sequence[Hashable] | MultiIndex: + ) -> SequenceT | MultiIndex: # possibly create a column mi here if is_potential_multi_index(columns): - list_columns = cast(list[tuple], columns) - return MultiIndex.from_tuples(list_columns, names=col_names) + columns_mi = cast("Sequence[tuple[Hashable, ...]]", columns) + return MultiIndex.from_tuples(columns_mi, names=col_names) return columns @final def _make_index( - self, data, alldata, columns, indexnamerow: list[Scalar] | None = None + self, alldata, columns, indexnamerow: list[Scalar] | None = None ) -> tuple[Index | None, Sequence[Hashable] | MultiIndex]: index: Index | None - if not is_index_col(self.index_col) or not self.index_col: + if isinstance(self.index_col, list) and len(self.index_col): + to_remove = [] + indexes = [] + for idx in self.index_col: + if isinstance(idx, str): + raise ValueError(f"Index {idx} invalid") + to_remove.append(idx) + indexes.append(alldata[idx]) + # remove index items from content and columns, don't pop in + # loop + for i in sorted(to_remove, reverse=True): + alldata.pop(i) + if not self._implicit_index: + columns.pop(i) + index = self._agg_index(indexes) + + # add names for the index + if indexnamerow: + coffset = len(indexnamerow) - len(columns) + index = index.set_names(indexnamerow[:coffset]) + else: index = None - elif not self._has_complex_date_col: - simple_index = self._get_simple_index(alldata, columns) - index = self._agg_index(simple_index) - elif self._has_complex_date_col: - if not self._name_processed: - (self.index_names, _, self.index_col) = self._clean_index_names( - list(columns), self.index_col - ) - self._name_processed = True - date_index = self._get_complex_date_index(data, columns) - index = self._agg_index(date_index, try_parse_dates=False) - - # add 
names for the index - if indexnamerow: - coffset = len(indexnamerow) - len(columns) - assert index is not None - index = index.set_names(indexnamerow[:coffset]) - # maybe create a mi on the columns columns = self._maybe_make_multi_index_columns(columns, self.col_names) return index, columns - @final - def _get_simple_index(self, data, columns): - def ix(col): - if not isinstance(col, str): - return col - raise ValueError(f"Index {col} invalid") - - to_remove = [] - index = [] - for idx in self.index_col: - i = ix(idx) - to_remove.append(i) - index.append(data[i]) - - # remove index items from content and columns, don't pop in - # loop - for i in sorted(to_remove, reverse=True): - data.pop(i) - if not self._implicit_index: - columns.pop(i) - - return index - - @final - def _get_complex_date_index(self, data, col_names): - def _get_name(icol): - if isinstance(icol, str): - return icol - - if col_names is None: - raise ValueError(f"Must supply column order to use {icol!s} as index") - - for i, c in enumerate(col_names): - if i == icol: - return c - - to_remove = [] - index = [] - for idx in self.index_col: - name = _get_name(idx) - to_remove.append(name) - index.append(data[name]) - - # remove index items from content and columns, don't pop in - # loop - for c in sorted(to_remove, reverse=True): - data.pop(c) - col_names.remove(c) - - return index - @final def _clean_mapping(self, mapping): """converts col numbers to names""" @@ -460,15 +321,23 @@ def _clean_mapping(self, mapping): return clean @final - def _agg_index(self, index, try_parse_dates: bool = True) -> Index: + def _agg_index(self, index) -> Index: arrays = [] converters = self._clean_mapping(self.converters) + clean_dtypes = self._clean_mapping(self.dtype) - for i, arr in enumerate(index): - if try_parse_dates and self._should_parse_dates(i): - arr = self._date_conv( + if self.index_names is not None: + names: Iterable = self.index_names + else: + names = itertools.cycle([None]) + for i, (arr, name) in enumerate(zip(index, names)): + if self._should_parse_dates(i): + arr = date_converter( arr, col=self.index_names[i] if self.index_names is not None else None, + dayfirst=self.dayfirst, + cache_dates=self.cache_dates, + date_format=self.date_format, ) if self.na_filter: @@ -482,11 +351,11 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index: assert self.index_names is not None col_name = self.index_names[i] if col_name is not None: - col_na_values, col_na_fvalues = _get_na_values( + col_na_values, col_na_fvalues = get_na_values( col_name, self.na_values, self.na_fvalues, self.keep_default_na ) - - clean_dtypes = self._clean_mapping(self.dtype) + else: + col_na_values, col_na_fvalues = set(), set() cast_type = None index_converter = False @@ -498,105 +367,23 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index: index_converter = converters.get(self.index_names[i]) is not None try_num_bool = not ( - cast_type and is_string_dtype(cast_type) or index_converter + (cast_type and is_string_dtype(cast_type)) or index_converter ) arr, _ = self._infer_types( arr, col_na_values | col_na_fvalues, cast_type is None, try_num_bool ) - arrays.append(arr) - - names = self.index_names - index = ensure_index_from_sequences(arrays, names) - - return index - - @final - def _convert_to_ndarrays( - self, - dct: Mapping, - na_values, - na_fvalues, - verbose: bool = False, - converters=None, - dtypes=None, - ): - result = {} - for c, values in dct.items(): - conv_f = None if converters is None else converters.get(c, None) - 
if isinstance(dtypes, dict): - cast_type = dtypes.get(c, None) + if cast_type is not None: + # Don't perform RangeIndex inference + idx = Index(arr, name=name, dtype=cast_type) else: - # single dtype or None - cast_type = dtypes + idx = ensure_index_from_sequences([arr], [name]) + arrays.append(idx) - if self.na_filter: - col_na_values, col_na_fvalues = _get_na_values( - c, na_values, na_fvalues, self.keep_default_na - ) - else: - col_na_values, col_na_fvalues = set(), set() - - if c in self._parse_date_cols: - # GH#26203 Do not convert columns which get converted to dates - # but replace nans to ensure to_datetime works - mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues) - np.putmask(values, mask, np.nan) - result[c] = values - continue - - if conv_f is not None: - # conv_f applied to data before inference - if cast_type is not None: - warnings.warn( - ( - "Both a converter and dtype were specified " - f"for column {c} - only the converter will be used." - ), - ParserWarning, - stacklevel=find_stack_level(), - ) - - try: - values = lib.map_infer(values, conv_f) - except ValueError: - mask = algorithms.isin(values, list(na_values)).view(np.uint8) - values = lib.map_infer_mask(values, conv_f, mask) - - cvals, na_count = self._infer_types( - values, - set(col_na_values) | col_na_fvalues, - cast_type is None, - try_num_bool=False, - ) - else: - is_ea = is_extension_array_dtype(cast_type) - is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type) - # skip inference if specified dtype is object - # or casting to an EA - try_num_bool = not (cast_type and is_str_or_ea_dtype) - - # general type inference and conversion - cvals, na_count = self._infer_types( - values, - set(col_na_values) | col_na_fvalues, - cast_type is None, - try_num_bool, - ) - - # type specified in dtype param or cast_type is an EA - if cast_type is not None: - cast_type = pandas_dtype(cast_type) - if cast_type and (cvals.dtype != cast_type or is_ea): - if not is_ea and na_count > 0: - if is_bool_dtype(cast_type): - raise ValueError(f"Bool column has NA values in column {c}") - cvals = self._cast_types(cvals, cast_type, c) - - result[c] = cvals - if verbose and na_count: - print(f"Filled {na_count} NA values in column {c!s}") - return result + if len(arrays) == 1: + return arrays[0] + else: + return MultiIndex.from_arrays(arrays) @final def _set_noconvert_dtype_columns( @@ -643,20 +430,9 @@ def _set(x) -> int: return x if isinstance(self.parse_dates, list): + validate_parse_dates_presence(self.parse_dates, names) for val in self.parse_dates: - if isinstance(val, list): - for k in val: - noconvert_columns.add(_set(k)) - else: - noconvert_columns.add(_set(val)) - - elif isinstance(self.parse_dates, dict): - for val in self.parse_dates.values(): - if isinstance(val, list): - for k in val: - noconvert_columns.add(_set(k)) - else: - noconvert_columns.add(_set(val)) + noconvert_columns.add(_set(val)) elif self.parse_dates: if isinstance(self.index_col, list): @@ -780,115 +556,47 @@ def _infer_types( return result, na_count - @final - def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLike: - """ - Cast values to specified type - - Parameters - ---------- - values : ndarray or ExtensionArray - cast_type : np.dtype or ExtensionDtype - dtype to cast values to - column : string - column name - used only for error reporting - - Returns - ------- - converted : ndarray or ExtensionArray - """ - if isinstance(cast_type, CategoricalDtype): - known_cats = cast_type.categories is not None - - if not 
is_object_dtype(values.dtype) and not known_cats: - # TODO: this is for consistency with - # c-parser which parses all categories - # as strings - values = lib.ensure_string_array( - values, skipna=False, convert_na_value=False - ) - - cats = Index(values).unique().dropna() - values = Categorical._from_inferred_categories( - cats, cats.get_indexer(values), cast_type, true_values=self.true_values - ) - - # use the EA's implementation of casting - elif isinstance(cast_type, ExtensionDtype): - array_type = cast_type.construct_array_type() - try: - if isinstance(cast_type, BooleanDtype): - # error: Unexpected keyword argument "true_values" for - # "_from_sequence_of_strings" of "ExtensionArray" - return array_type._from_sequence_of_strings( # type: ignore[call-arg] - values, - dtype=cast_type, - true_values=self.true_values, - false_values=self.false_values, - ) - else: - return array_type._from_sequence_of_strings(values, dtype=cast_type) - except NotImplementedError as err: - raise NotImplementedError( - f"Extension Array: {array_type} must implement " - "_from_sequence_of_strings in order to be used in parser methods" - ) from err - - elif isinstance(values, ExtensionArray): - values = values.astype(cast_type, copy=False) - elif issubclass(cast_type.type, str): - # TODO: why skipna=True here and False above? some tests depend - # on it here, but nothing fails if we change it above - # (as no tests get there as of 2022-12-06) - values = lib.ensure_string_array( - values, skipna=True, convert_na_value=False - ) - else: - try: - values = astype_array(values, cast_type, copy=True) - except ValueError as err: - raise ValueError( - f"Unable to convert column {column} to type {cast_type}" - ) from err - return values - @overload def _do_date_conversions( self, names: Index, data: DataFrame, - ) -> tuple[Sequence[Hashable] | Index, DataFrame]: - ... + ) -> DataFrame: ... @overload def _do_date_conversions( self, names: Sequence[Hashable], data: Mapping[Hashable, ArrayLike], - ) -> tuple[Sequence[Hashable], Mapping[Hashable, ArrayLike]]: - ... + ) -> Mapping[Hashable, ArrayLike]: ... 
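Roughly, the rewritten ``_agg_index`` above now builds one ``Index`` per level and only wraps multiple levels in a ``MultiIndex``; a standalone sketch under the same assumptions (NA filtering, converters, and date parsing omitted; a plain ``Index`` stands in for ``ensure_index_from_sequences``):

    from pandas import Index, MultiIndex

    def agg_index_sketch(arrays, names, dtypes=None):
        levels = []
        for arr, name in zip(arrays, names):
            cast_type = (dtypes or {}).get(name)
            if cast_type is not None:
                # honor an explicit dtype and skip RangeIndex inference
                levels.append(Index(arr, name=name, dtype=cast_type))
            else:
                levels.append(Index(arr, name=name))
        return levels[0] if len(levels) == 1 else MultiIndex.from_arrays(levels)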
@final def _do_date_conversions( self, names: Sequence[Hashable] | Index, data: Mapping[Hashable, ArrayLike] | DataFrame, - ) -> tuple[Sequence[Hashable] | Index, Mapping[Hashable, ArrayLike] | DataFrame]: - # returns data, columns - - if self.parse_dates is not None: - data, names = _process_date_conversion( - data, - self._date_conv, - self.parse_dates, - self.index_col, - self.index_names, - names, - keep_date_col=self.keep_date_col, - dtype_backend=self.dtype_backend, + ) -> Mapping[Hashable, ArrayLike] | DataFrame: + if not isinstance(self.parse_dates, list): + return data + for colspec in self.parse_dates: + if isinstance(colspec, int) and colspec not in data: + colspec = names[colspec] + if (isinstance(self.index_col, list) and colspec in self.index_col) or ( + isinstance(self.index_names, list) and colspec in self.index_names + ): + continue + result = date_converter( + data[colspec], + col=colspec, + dayfirst=self.dayfirst, + cache_dates=self.cache_dates, + date_format=self.date_format, ) + # error: Unsupported target for indexed assignment + # ("Mapping[Hashable, ExtensionArray | ndarray[Any, Any]] | DataFrame") + data[colspec] = result # type: ignore[index] - return names, data + return data @final def _check_data_length( @@ -920,39 +628,8 @@ def _check_data_length( stacklevel=find_stack_level(), ) - @overload - def _evaluate_usecols( - self, - usecols: set[int] | Callable[[Hashable], object], - names: Sequence[Hashable], - ) -> set[int]: - ... - - @overload - def _evaluate_usecols( - self, usecols: set[str], names: Sequence[Hashable] - ) -> set[str]: - ... - @final - def _evaluate_usecols( - self, - usecols: Callable[[Hashable], object] | set[str] | set[int], - names: Sequence[Hashable], - ) -> set[str] | set[int]: - """ - Check whether or not the 'usecols' parameter - is a callable. If so, enumerates the 'names' - parameter and returns a set of indices for - each entry in 'names' that evaluates to True. - If not a callable, returns 'usecols'. - """ - if callable(usecols): - return {i for i, name in enumerate(names) if usecols(name)} - return usecols - - @final - def _validate_usecols_names(self, usecols, names: Sequence): + def _validate_usecols_names(self, usecols: SequenceT, names: Sequence) -> SequenceT: """ Validates that all usecols are present in a given list of names. If not, raise a ValueError that @@ -983,56 +660,6 @@ def _validate_usecols_names(self, usecols, names: Sequence): return usecols - @final - def _validate_usecols_arg(self, usecols): - """ - Validate the 'usecols' parameter. - - Checks whether or not the 'usecols' parameter contains all integers - (column selection by index), strings (column by name) or is a callable. - Raises a ValueError if that is not the case. - - Parameters - ---------- - usecols : list-like, callable, or None - List of columns to use when parsing or a callable that can be used - to filter a list of table columns. - - Returns - ------- - usecols_tuple : tuple - A tuple of (verified_usecols, usecols_dtype). - - 'verified_usecols' is either a set if an array-like is passed in or - 'usecols' if a callable or None is passed in. - - 'usecols_dtype` is the inferred dtype of 'usecols' if an array-like - is passed in or None if a callable or None is passed in. - """ - msg = ( - "'usecols' must either be list-like of all strings, all unicode, " - "all integers or a callable." 
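Each column listed in ``parse_dates`` is routed through the module-level ``date_converter`` introduced further down; a minimal standalone sketch of that conversion, substituting ``np.asarray(..., dtype=str)`` for the internal ``lib.ensure_string_array``:

    import numpy as np
    from pandas import DatetimeIndex, to_datetime

    def date_converter_sketch(values, date_fmt=None, dayfirst=False, cache=True):
        if values.dtype.kind in "Mm":  # already datetime64/timedelta64
            return values
        strs = np.asarray(values, dtype=str)
        try:
            result = to_datetime(
                strs, format=date_fmt, utc=False, dayfirst=dayfirst, cache=cache
            )
        except (ValueError, TypeError):
            return strs  # leave unparseable columns as strings
        if isinstance(result, DatetimeIndex):
            arr = result.to_numpy()
            arr.flags.writeable = True  # hand back a fresh, mutable ndarray
            return arr
        return result._values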
- ) - if usecols is not None: - if callable(usecols): - return usecols, None - - if not is_list_like(usecols): - # see gh-20529 - # - # Ensure it is iterable container but not string. - raise ValueError(msg) - - usecols_dtype = lib.infer_dtype(usecols, skipna=False) - - if usecols_dtype not in ("empty", "integer", "string"): - raise ValueError(msg) - - usecols = set(usecols) - - return usecols, usecols_dtype - return usecols, None - @final def _clean_index_names(self, columns, index_col) -> tuple[list | None, list, list]: if not is_index_col(index_col): @@ -1072,7 +699,9 @@ def _clean_index_names(self, columns, index_col) -> tuple[list | None, list, lis return index_names, columns, index_col @final - def _get_empty_meta(self, columns, dtype: DtypeArg | None = None): + def _get_empty_meta( + self, columns: Sequence[HashableT], dtype: DtypeArg | None = None + ) -> tuple[Index, list[HashableT], dict[HashableT, Series]]: columns = list(columns) index_col = self.index_col @@ -1084,12 +713,11 @@ def _get_empty_meta(self, columns, dtype: DtypeArg | None = None): dtype_dict: defaultdict[Hashable, Any] if not is_dict_like(dtype): # if dtype == None, default will be object. - default_dtype = dtype or object - dtype_dict = defaultdict(lambda: default_dtype) + dtype_dict = defaultdict(lambda: dtype) else: dtype = cast(dict, dtype) dtype_dict = defaultdict( - lambda: object, + lambda: None, {columns[k] if is_integer(k) else k: v for k, v in dtype.items()}, ) @@ -1106,8 +734,14 @@ def _get_empty_meta(self, columns, dtype: DtypeArg | None = None): if (index_col is None or index_col is False) or index_names is None: index = default_index(0) else: - data = [Series([], dtype=dtype_dict[name]) for name in index_names] - index = ensure_index_from_sequences(data, names=index_names) + # TODO: We could return default_index(0) if dtype_dict[name] is None + data = [ + Index([], name=name, dtype=dtype_dict[name]) for name in index_names + ] + if len(data) == 1: + index = data[0] + else: + index = MultiIndex.from_arrays(data) index_col.sort() for i, n in enumerate(index_col): @@ -1120,107 +754,37 @@ def _get_empty_meta(self, columns, dtype: DtypeArg | None = None): return index, columns, col_dict -def _make_date_converter( - date_parser=lib.no_default, +def date_converter( + date_col, + col: Hashable, dayfirst: bool = False, cache_dates: bool = True, date_format: dict[Hashable, str] | str | None = None, ): - if date_parser is not lib.no_default: - warnings.warn( - "The argument 'date_parser' is deprecated and will " - "be removed in a future version. 
" - "Please use 'date_format' instead, or read your data in as 'object' dtype " - "and then call 'to_datetime'.", - FutureWarning, - stacklevel=find_stack_level(), + if date_col.dtype.kind in "Mm": + return date_col + + date_fmt = date_format.get(col) if isinstance(date_format, dict) else date_format + + str_objs = lib.ensure_string_array(np.asarray(date_col)) + try: + result = tools.to_datetime( + str_objs, + format=date_fmt, + utc=False, + dayfirst=dayfirst, + cache=cache_dates, ) - if date_parser is not lib.no_default and date_format is not None: - raise TypeError("Cannot use both 'date_parser' and 'date_format'") - - def unpack_if_single_element(arg): - # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615 - if isinstance(arg, np.ndarray) and arg.ndim == 1 and len(arg) == 1: - return arg[0] - return arg - - def converter(*date_cols, col: Hashable): - if len(date_cols) == 1 and date_cols[0].dtype.kind in "Mm": - return date_cols[0] - - if date_parser is lib.no_default: - strs = parsing.concat_date_cols(date_cols) - date_fmt = ( - date_format.get(col) if isinstance(date_format, dict) else date_format - ) - - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - ".*parsing datetimes with mixed time zones will raise an error", - category=FutureWarning, - ) - str_objs = ensure_object(strs) - try: - result = tools.to_datetime( - str_objs, - format=date_fmt, - utc=False, - dayfirst=dayfirst, - cache=cache_dates, - ) - except (ValueError, TypeError): - # test_usecols_with_parse_dates4 - return str_objs - - if isinstance(result, DatetimeIndex): - arr = result.to_numpy() - arr.flags.writeable = True - return arr - return result._values - else: - try: - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - ".*parsing datetimes with mixed time zones " - "will raise an error", - category=FutureWarning, - ) - pre_parsed = date_parser( - *(unpack_if_single_element(arg) for arg in date_cols) - ) - try: - result = tools.to_datetime( - pre_parsed, - cache=cache_dates, - ) - except (ValueError, TypeError): - # test_read_csv_with_custom_date_parser - result = pre_parsed - if isinstance(result, datetime.datetime): - raise Exception("scalar parser") - return result - except Exception: - # e.g. test_datetime_fractional_seconds - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - ".*parsing datetimes with mixed time zones " - "will raise an error", - category=FutureWarning, - ) - pre_parsed = parsing.try_parse_dates( - parsing.concat_date_cols(date_cols), - parser=date_parser, - ) - try: - return tools.to_datetime(pre_parsed) - except (ValueError, TypeError): - # TODO: not reached in tests 2023-10-27; needed? 
- return pre_parsed + except (ValueError, TypeError): + # test_usecols_with_parse_dates4 + # test_multi_index_parse_dates + return str_objs - return converter + if isinstance(result, DatetimeIndex): + arr = result.to_numpy() + arr.flags.writeable = True + return arr + return result._values parser_defaults = { @@ -1249,14 +813,11 @@ def converter(*date_cols, col: Hashable): "decimal": ".", # 'engine': 'c', "parse_dates": False, - "keep_date_col": False, "dayfirst": False, - "date_parser": lib.no_default, "date_format": None, "usecols": None, # 'iterator': False, "chunksize": None, - "verbose": False, "encoding": None, "compression": None, "skip_blank_lines": True, @@ -1266,129 +827,7 @@ def converter(*date_cols, col: Hashable): } -def _process_date_conversion( - data_dict, - converter: Callable, - parse_spec, - index_col, - index_names, - columns, - keep_date_col: bool = False, - dtype_backend=lib.no_default, -): - def _isindex(colspec): - return (isinstance(index_col, list) and colspec in index_col) or ( - isinstance(index_names, list) and colspec in index_names - ) - - new_cols = [] - new_data = {} - - orig_names = columns - columns = list(columns) - - date_cols = set() - - if parse_spec is None or isinstance(parse_spec, bool): - return data_dict, columns - - if isinstance(parse_spec, list): - # list of column lists - for colspec in parse_spec: - if is_scalar(colspec) or isinstance(colspec, tuple): - if isinstance(colspec, int) and colspec not in data_dict: - colspec = orig_names[colspec] - if _isindex(colspec): - continue - elif dtype_backend == "pyarrow": - import pyarrow as pa - - dtype = data_dict[colspec].dtype - if isinstance(dtype, ArrowDtype) and ( - pa.types.is_timestamp(dtype.pyarrow_dtype) - or pa.types.is_date(dtype.pyarrow_dtype) - ): - continue - - # Pyarrow engine returns Series which we need to convert to - # numpy array before converter, its a no-op for other parsers - data_dict[colspec] = converter( - np.asarray(data_dict[colspec]), col=colspec - ) - else: - new_name, col, old_names = _try_convert_dates( - converter, colspec, data_dict, orig_names - ) - if new_name in data_dict: - raise ValueError(f"New date column already in dict {new_name}") - new_data[new_name] = col - new_cols.append(new_name) - date_cols.update(old_names) - - elif isinstance(parse_spec, dict): - # dict of new name to column list - for new_name, colspec in parse_spec.items(): - if new_name in data_dict: - raise ValueError(f"Date column {new_name} already in dict") - - _, col, old_names = _try_convert_dates( - converter, - colspec, - data_dict, - orig_names, - target_name=new_name, - ) - - new_data[new_name] = col - - # If original column can be converted to date we keep the converted values - # This can only happen if values are from single column - if len(colspec) == 1: - new_data[colspec[0]] = col - - new_cols.append(new_name) - date_cols.update(old_names) - - if isinstance(data_dict, DataFrame): - data_dict = concat([DataFrame(new_data), data_dict], axis=1, copy=False) - else: - data_dict.update(new_data) - new_cols.extend(columns) - - if not keep_date_col: - for c in list(date_cols): - data_dict.pop(c) - new_cols.remove(c) - - return data_dict, new_cols - - -def _try_convert_dates( - parser: Callable, colspec, data_dict, columns, target_name: str | None = None -): - colset = set(columns) - colnames = [] - - for c in colspec: - if c in colset: - colnames.append(c) - elif isinstance(c, int) and c not in columns: - colnames.append(columns[c]) - else: - colnames.append(c) - - new_name: tuple | str - 
if all(isinstance(x, tuple) for x in colnames): - new_name = tuple(map("_".join, zip(*colnames))) - else: - new_name = "_".join([str(x) for x in colnames]) - to_parse = [np.asarray(data_dict[c]) for c in colnames if c in data_dict] - - new_col = parser(*to_parse, col=new_name if target_name is None else target_name) - return new_name, new_col, colnames - - -def _get_na_values(col, na_values, na_fvalues, keep_default_na: bool): +def get_na_values(col, na_values, na_fvalues, keep_default_na: bool): """ Get the NaN values for a given column. @@ -1423,26 +862,130 @@ def _get_na_values(col, na_values, na_fvalues, keep_default_na: bool): return na_values, na_fvalues -def _validate_parse_dates_arg(parse_dates): +def is_index_col(col) -> bool: + return col is not None and col is not False + + +def validate_parse_dates_presence( + parse_dates: bool | list, columns: Sequence[Hashable] +) -> set: """ - Check whether or not the 'parse_dates' parameter - is a non-boolean scalar. Raises a ValueError if - that is the case. + Check if parse_dates are in columns. + + If user has provided names for parse_dates, check if those columns + are available. + + Parameters + ---------- + columns : list + List of names of the dataframe. + + Returns + ------- + The names of the columns which will get parsed later if a list + is given as specification. + + Raises + ------ + ValueError + If column to parse_date is not in dataframe. + + """ + if not isinstance(parse_dates, list): + return set() + + missing = set() + unique_cols = set() + for col in parse_dates: + if isinstance(col, str): + if col not in columns: + missing.add(col) + else: + unique_cols.add(col) + elif col in columns: + unique_cols.add(col) + else: + unique_cols.add(columns[col]) + if missing: + missing_cols = ", ".join(sorted(missing)) + raise ValueError(f"Missing column provided to 'parse_dates': '{missing_cols}'") + return unique_cols + + +def _validate_usecols_arg(usecols): + """ + Validate the 'usecols' parameter. + + Checks whether or not the 'usecols' parameter contains all integers + (column selection by index), strings (column by name) or is a callable. + Raises a ValueError if that is not the case. + + Parameters + ---------- + usecols : list-like, callable, or None + List of columns to use when parsing or a callable that can be used + to filter a list of table columns. + + Returns + ------- + usecols_tuple : tuple + A tuple of (verified_usecols, usecols_dtype). + + 'verified_usecols' is either a set if an array-like is passed in or + 'usecols' if a callable or None is passed in. + + 'usecols_dtype` is the inferred dtype of 'usecols' if an array-like + is passed in or None if a callable or None is passed in. """ msg = ( - "Only booleans, lists, and dictionaries are accepted " - "for the 'parse_dates' parameter" + "'usecols' must either be list-like of all strings, all unicode, " + "all integers or a callable." ) + if usecols is not None: + if callable(usecols): + return usecols, None - if not ( - parse_dates is None - or lib.is_bool(parse_dates) - or isinstance(parse_dates, (list, dict)) - ): - raise TypeError(msg) + if not is_list_like(usecols): + # see gh-20529 + # + # Ensure it is iterable container but not string. 
+ raise ValueError(msg) - return parse_dates + usecols_dtype = lib.infer_dtype(usecols, skipna=False) + if usecols_dtype not in ("empty", "integer", "string"): + raise ValueError(msg) -def is_index_col(col) -> bool: - return col is not None and col is not False + usecols = set(usecols) + + return usecols, usecols_dtype + return usecols, None + + +@overload +def evaluate_callable_usecols( + usecols: Callable[[Hashable], object], + names: Iterable[Hashable], +) -> set[int]: ... + + +@overload +def evaluate_callable_usecols( + usecols: SequenceT, names: Iterable[Hashable] +) -> SequenceT: ... + + +def evaluate_callable_usecols( + usecols: Callable[[Hashable], object] | SequenceT, + names: Iterable[Hashable], +) -> SequenceT | set[int]: + """ + Check whether or not the 'usecols' parameter + is a callable. If so, enumerates the 'names' + parameter and returns a set of indices for + each entry in 'names' that evaluates to True. + If not a callable, returns 'usecols'. + """ + if callable(usecols): + return {i for i, name in enumerate(names) if usecols(name)} + return usecols diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 0cd788c5e5739..aa9f3556c8f62 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -30,7 +30,10 @@ from pandas.io.parsers.base_parser import ( ParserBase, ParserError, + date_converter, + evaluate_callable_usecols, is_index_col, + validate_parse_dates_presence, ) if TYPE_CHECKING: @@ -41,10 +44,12 @@ ) from pandas._typing import ( + AnyArrayLike, ArrayLike, DtypeArg, DtypeObj, ReadCsvBuffer, + SequenceT, ) from pandas import ( @@ -129,7 +134,7 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None: self.orig_names = self.names[:] # type: ignore[has-type] if self.usecols: - usecols = self._evaluate_usecols(self.usecols, self.orig_names) + usecols = evaluate_callable_usecols(self.usecols, self.orig_names) # GH 14671 # assert for mypy, orig_names is List or None, None would error in issubset @@ -158,36 +163,34 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None: ) # error: Cannot determine type of 'names' - self._validate_parse_dates_presence(self.names) # type: ignore[has-type] + validate_parse_dates_presence(self.parse_dates, self.names) # type: ignore[has-type] self._set_noconvert_columns() # error: Cannot determine type of 'names' self.orig_names = self.names # type: ignore[has-type] - if not self._has_complex_date_col: - # error: Cannot determine type of 'index_col' - if self._reader.leading_cols == 0 and is_index_col( - self.index_col # type: ignore[has-type] - ): - self._name_processed = True - ( - index_names, - # error: Cannot determine type of 'names' - self.names, # type: ignore[has-type] - self.index_col, - ) = self._clean_index_names( - # error: Cannot determine type of 'names' - self.names, # type: ignore[has-type] - # error: Cannot determine type of 'index_col' - self.index_col, # type: ignore[has-type] - ) + # error: Cannot determine type of 'index_col' + if self._reader.leading_cols == 0 and is_index_col( + self.index_col # type: ignore[has-type] + ): + ( + index_names, + # error: Cannot determine type of 'names' + self.names, # type: ignore[has-type] + self.index_col, + ) = self._clean_index_names( + # error: Cannot determine type of 'names' + self.names, # type: ignore[has-type] + # error: Cannot determine type of 'index_col' + self.index_col, # type: ignore[has-type] + ) - if self.index_names is None: - self.index_names = index_names + if 
self.index_names is None: + self.index_names = index_names - if self._reader.header is None and not passed_names: - assert self.index_names is not None - self.index_names = [None] * len(self.index_names) + if self._reader.header is None and not passed_names: + assert self.index_names is not None + self.index_names = [None] * len(self.index_names) self._implicit_index = self._reader.leading_cols > 0 @@ -225,7 +228,7 @@ def read( ) -> tuple[ Index | MultiIndex | None, Sequence[Hashable] | MultiIndex, - Mapping[Hashable, ArrayLike], + Mapping[Hashable, AnyArrayLike], ]: index: Index | MultiIndex | None column_names: Sequence[Hashable] | MultiIndex @@ -233,7 +236,7 @@ def read( if self.low_memory: chunks = self._reader.read_low_memory(nrows) # destructive to chunks - data = _concatenate_chunks(chunks) + data = _concatenate_chunks(chunks, self.names) # type: ignore[has-type] else: data = self._reader.read(nrows) @@ -248,12 +251,16 @@ def read( names, dtype=self.dtype, ) - columns = self._maybe_make_multi_index_columns(columns, self.col_names) + # error: Incompatible types in assignment (expression has type + # "list[Hashable] | MultiIndex", variable has type "list[Hashable]") + columns = self._maybe_make_multi_index_columns( # type: ignore[assignment] + columns, self.col_names + ) - if self.usecols is not None: - columns = self._filter_usecols(columns) + columns = _filter_usecols(self.usecols, columns) + columns_set = set(columns) - col_dict = {k: v for k, v in col_dict.items() if k in columns} + col_dict = {k: v for k, v in col_dict.items() if k in columns_set} return index, columns, col_dict @@ -268,9 +275,6 @@ def read( names = self.names # type: ignore[has-type] if self._reader.leading_cols: - if self._has_complex_date_col: - raise NotImplementedError("file structure not yet supported") - # implicit index, no index names arrays = [] @@ -287,13 +291,21 @@ def read( else: values = data.pop(self.index_col[i]) - values = self._maybe_parse_dates(values, i, try_parse_dates=True) + if self._should_parse_dates(i): + values = date_converter( + values, + col=self.index_names[i] + if self.index_names is not None + else None, + dayfirst=self.dayfirst, + cache_dates=self.cache_dates, + date_format=self.date_format, + ) arrays.append(values) index = ensure_index_from_sequences(arrays) - if self.usecols is not None: - names = self._filter_usecols(names) + names = _filter_usecols(self.usecols, names) names = dedup_names(names, is_potential_multi_index(names, self.index_col)) @@ -301,12 +313,10 @@ def read( data_tups = sorted(data.items()) data = {k: v for k, (i, v) in zip(names, data_tups)} - column_names, date_data = self._do_date_conversions(names, data) + date_data = self._do_date_conversions(names, data) # maybe create a mi on the columns - column_names = self._maybe_make_multi_index_columns( - column_names, self.col_names - ) + column_names = self._maybe_make_multi_index_columns(names, self.col_names) else: # rename dict keys @@ -319,8 +329,7 @@ def read( names = list(self.orig_names) names = dedup_names(names, is_potential_multi_index(names, self.index_col)) - if self.usecols is not None: - names = self._filter_usecols(names) + names = _filter_usecols(self.usecols, names) # columns as list alldata = [x[1] for x in data_tups] @@ -329,30 +338,23 @@ def read( data = {k: v for k, (i, v) in zip(names, data_tups)} - names, date_data = self._do_date_conversions(names, data) - index, column_names = self._make_index(date_data, alldata, names) + date_data = self._do_date_conversions(names, data) + index, 
column_names = self._make_index(alldata, names) return index, column_names, date_data - def _filter_usecols(self, names: Sequence[Hashable]) -> Sequence[Hashable]: - # hackish - usecols = self._evaluate_usecols(self.usecols, names) - if usecols is not None and len(names) != len(usecols): - names = [ - name for i, name in enumerate(names) if i in usecols or name in usecols - ] - return names - def _maybe_parse_dates(self, values, index: int, try_parse_dates: bool = True): - if try_parse_dates and self._should_parse_dates(index): - values = self._date_conv( - values, - col=self.index_names[index] if self.index_names is not None else None, - ) - return values +def _filter_usecols(usecols, names: SequenceT) -> SequenceT | list[Hashable]: + # hackish + usecols = evaluate_callable_usecols(usecols, names) + if usecols is not None and len(names) != len(usecols): + return [name for i, name in enumerate(names) if i in usecols or name in usecols] + return names -def _concatenate_chunks(chunks: list[dict[int, ArrayLike]]) -> dict: +def _concatenate_chunks( + chunks: list[dict[int, ArrayLike]], column_names: list[str] +) -> dict: """ Concatenate chunks of data read with low_memory=True. @@ -375,10 +377,12 @@ def _concatenate_chunks(chunks: list[dict[int, ArrayLike]]) -> dict: else: result[name] = concat_compat(arrs) if len(non_cat_dtypes) > 1 and result[name].dtype == np.dtype(object): - warning_columns.append(str(name)) + warning_columns.append(column_names[name]) if warning_columns: - warning_names = ",".join(warning_columns) + warning_names = ", ".join( + [f"{index}: {name}" for index, name in enumerate(warning_columns)] + ) warning_message = " ".join( [ f"Columns ({warning_names}) have mixed types. " @@ -390,7 +394,7 @@ def _concatenate_chunks(chunks: list[dict[int, ArrayLike]]) -> dict: def ensure_dtype_objs( - dtype: DtypeArg | dict[Hashable, DtypeArg] | None + dtype: DtypeArg | dict[Hashable, DtypeArg] | None, ) -> DtypeObj | dict[Hashable, DtypeObj] | None: """ Ensure we have either None, a dtype object, or a dictionary mapping to diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 79e7554a5744c..547d8c1fe3d19 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -4,21 +4,17 @@ abc, defaultdict, ) -from collections.abc import ( - Hashable, - Iterator, - Mapping, - Sequence, -) import csv from io import StringIO import re from typing import ( IO, TYPE_CHECKING, + Any, DefaultDict, Literal, cast, + final, ) import warnings @@ -33,32 +29,61 @@ from pandas.util._decorators import cache_readonly from pandas.util._exceptions import find_stack_level +from pandas.core.dtypes.astype import astype_array from pandas.core.dtypes.common import ( is_bool_dtype, + is_extension_array_dtype, is_integer, is_numeric_dtype, + is_object_dtype, + is_string_dtype, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + ExtensionDtype, ) from pandas.core.dtypes.inference import is_dict_like +from pandas.core import algorithms +from pandas.core.arrays import ( + Categorical, + ExtensionArray, +) +from pandas.core.arrays.boolean import BooleanDtype +from pandas.core.indexes.api import Index + from pandas.io.common import ( dedup_names, is_potential_multi_index, ) from pandas.io.parsers.base_parser import ( ParserBase, + evaluate_callable_usecols, + get_na_values, parser_defaults, + validate_parse_dates_presence, ) if TYPE_CHECKING: + from collections.abc import ( + Hashable, + Iterator, + Mapping, + Sequence, + ) + 
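For reference, ``evaluate_callable_usecols`` and the module-level ``_filter_usecols`` shown above compose like this (names are illustrative):

    names = ["a", "b", "c"]
    usecols = lambda col: col != "b"

    # A callable resolves to the set of positional indices it accepts.
    indices = {i for i, name in enumerate(names) if usecols(name)}  # {0, 2}
    # _filter_usecols then keeps names matching by position or by label.
    kept = [name for i, name in enumerate(names) if i in indices or name in indices]
    assert kept == ["a", "c"]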
from pandas._typing import ( ArrayLike, + DtypeObj, ReadCsvBuffer, Scalar, + T, ) from pandas import ( - Index, MultiIndex, + Series, ) # BOM character (byte order mark) @@ -77,7 +102,7 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None: """ super().__init__(kwds) - self.data: Iterator[str] | None = None + self.data: Iterator[list[str]] | list[list[Scalar]] = [] self.buf: list = [] self.pos = 0 self.line_pos = 0 @@ -103,11 +128,8 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None: self.quoting = kwds["quoting"] self.skip_blank_lines = kwds["skip_blank_lines"] - self.has_index_names = False - if "has_index_names" in kwds: - self.has_index_names = kwds["has_index_names"] - - self.verbose = kwds["verbose"] + # Passed from read_excel + self.has_index_names = kwds.get("has_index_names", False) self.thousands = kwds["thousands"] self.decimal = kwds["decimal"] @@ -116,10 +138,11 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None: # Set self.data to something that can read lines. if isinstance(f, list): - # read_excel: f is a list - self.data = cast(Iterator[str], f) + # read_excel: f is a nested list, can contain non-str + self.data = f else: assert hasattr(f, "readline") + # yields list of str self.data = self._make_reader(f) # Get columns in two steps: infer from data, then @@ -148,19 +171,13 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None: # get popped off for index self.orig_names: list[Hashable] = list(self.columns) - # needs to be cleaned/refactored - # multiple date column thing turning into a real spaghetti factory - - if not self._has_complex_date_col: - (index_names, self.orig_names, self.columns) = self._get_index_name() - self._name_processed = True - if self.index_names is None: - self.index_names = index_names + index_names, self.orig_names, self.columns = self._get_index_name() + if self.index_names is None: + self.index_names = index_names if self._col_indices is None: self._col_indices = list(range(len(self.columns))) - self._parse_date_cols = self._validate_parse_dates_presence(self.columns) self._no_thousands_columns = self._set_no_thousand_columns() if len(self.decimal) != 1: @@ -179,7 +196,7 @@ def num(self) -> re.Pattern: ) return re.compile(regex) - def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]): + def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> Iterator[list[str]]: sep = self.delimiter if sep is None or len(sep) == 1: @@ -246,7 +263,9 @@ def _read(): def read( self, rows: int | None = None ) -> tuple[ - Index | None, Sequence[Hashable] | MultiIndex, Mapping[Hashable, ArrayLike] + Index | None, + Sequence[Hashable] | MultiIndex, + Mapping[Hashable, ArrayLike | Series], ]: try: content = self._get_lines(rows) @@ -260,6 +279,7 @@ def read( # done with first read, next time raise StopIteration self._first_chunk = False + index: Index | None columns: Sequence[Hashable] = list(self.orig_names) if not len(content): # pragma: no cover # DataFrame with the right metadata, even though it's length 0 @@ -279,9 +299,10 @@ def read( return index, conv_columns, col_dict # handle new style for names in index - count_empty_content_vals = count_empty_vals(content[0]) indexnamerow = None - if self.has_index_names and count_empty_content_vals == len(columns): + if self.has_index_names and sum( + int(v == "" or v is None) for v in content[0] + ) == len(columns): indexnamerow = content[0] content = content[1:] @@ -289,11 +310,9 @@ def read( data, columns = self._exclude_implicit_index(alldata) conv_data = 
self._convert_data(data) - columns, conv_data = self._do_date_conversions(columns, conv_data) + conv_data = self._do_date_conversions(columns, conv_data) - index, result_columns = self._make_index( - conv_data, alldata, columns, indexnamerow - ) + index, result_columns = self._make_index(alldata, columns, indexnamerow) return index, result_columns, conv_data @@ -326,7 +345,9 @@ def _exclude_implicit_index( def get_chunk( self, size: int | None = None ) -> tuple[ - Index | None, Sequence[Hashable] | MultiIndex, Mapping[Hashable, ArrayLike] + Index | None, + Sequence[Hashable] | MultiIndex, + Mapping[Hashable, ArrayLike | Series], ]: if size is None: # error: "PythonParser" has no attribute "chunksize" @@ -347,14 +368,15 @@ def _convert_data( if isinstance(self.na_values, dict): for col in self.na_values: - na_value = self.na_values[col] - na_fvalue = self.na_fvalues[col] + if col is not None: + na_value = self.na_values[col] + na_fvalue = self.na_fvalues[col] - if isinstance(col, int) and col not in self.orig_names: - col = self.orig_names[col] + if isinstance(col, int) and col not in self.orig_names: + col = self.orig_names[col] - clean_na_values[col] = na_value - clean_na_fvalues[col] = na_fvalue + clean_na_values[col] = na_value + clean_na_fvalues[col] = na_fvalue else: clean_na_values = self.na_values clean_na_fvalues = self.na_fvalues @@ -363,11 +385,169 @@ def _convert_data( data, clean_na_values, clean_na_fvalues, - self.verbose, clean_conv, clean_dtypes, ) + @final + def _convert_to_ndarrays( + self, + dct: Mapping, + na_values, + na_fvalues, + converters=None, + dtypes=None, + ) -> dict[Any, np.ndarray]: + result = {} + parse_date_cols = validate_parse_dates_presence(self.parse_dates, self.columns) + for c, values in dct.items(): + conv_f = None if converters is None else converters.get(c, None) + if isinstance(dtypes, dict): + cast_type = dtypes.get(c, None) + else: + # single dtype or None + cast_type = dtypes + + if self.na_filter: + col_na_values, col_na_fvalues = get_na_values( + c, na_values, na_fvalues, self.keep_default_na + ) + else: + col_na_values, col_na_fvalues = set(), set() + + if c in parse_date_cols: + # GH#26203 Do not convert columns which get converted to dates + # but replace nans to ensure to_datetime works + mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues) # pyright: ignore[reportArgumentType] + np.putmask(values, mask, np.nan) + result[c] = values + continue + + if conv_f is not None: + # conv_f applied to data before inference + if cast_type is not None: + warnings.warn( + ( + "Both a converter and dtype were specified " + f"for column {c} - only the converter will be used." 
+ ), + ParserWarning, + stacklevel=find_stack_level(), + ) + + try: + values = lib.map_infer(values, conv_f) + except ValueError: + mask = algorithms.isin(values, list(na_values)).view(np.uint8) + values = lib.map_infer_mask(values, conv_f, mask) + + cvals, na_count = self._infer_types( + values, + set(col_na_values) | col_na_fvalues, + cast_type is None, + try_num_bool=False, + ) + else: + is_ea = is_extension_array_dtype(cast_type) + is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type) + # skip inference if specified dtype is object + # or casting to an EA + try_num_bool = not (cast_type and is_str_or_ea_dtype) + + # general type inference and conversion + cvals, na_count = self._infer_types( + values, + set(col_na_values) | col_na_fvalues, + cast_type is None, + try_num_bool, + ) + + # type specified in dtype param or cast_type is an EA + if cast_type is not None: + cast_type = pandas_dtype(cast_type) + if cast_type and (cvals.dtype != cast_type or is_ea): + if not is_ea and na_count > 0: + if is_bool_dtype(cast_type): + raise ValueError(f"Bool column has NA values in column {c}") + cvals = self._cast_types(cvals, cast_type, c) + + result[c] = cvals + return result + + @final + def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLike: + """ + Cast values to specified type + + Parameters + ---------- + values : ndarray or ExtensionArray + cast_type : np.dtype or ExtensionDtype + dtype to cast values to + column : string + column name - used only for error reporting + + Returns + ------- + converted : ndarray or ExtensionArray + """ + if isinstance(cast_type, CategoricalDtype): + known_cats = cast_type.categories is not None + + if not is_object_dtype(values.dtype) and not known_cats: + # TODO: this is for consistency with + # c-parser which parses all categories + # as strings + values = lib.ensure_string_array( + values, skipna=False, convert_na_value=False + ) + + cats = Index(values).unique().dropna() + values = Categorical._from_inferred_categories( + cats, cats.get_indexer(values), cast_type, true_values=self.true_values + ) + + # use the EA's implementation of casting + elif isinstance(cast_type, ExtensionDtype): + array_type = cast_type.construct_array_type() + try: + if isinstance(cast_type, BooleanDtype): + # error: Unexpected keyword argument "true_values" for + # "_from_sequence_of_strings" of "ExtensionArray" + values_str = [str(val) for val in values] + return array_type._from_sequence_of_strings( # type: ignore[call-arg] + values_str, + dtype=cast_type, + true_values=self.true_values, # pyright: ignore[reportCallIssue] + false_values=self.false_values, # pyright: ignore[reportCallIssue] + none_values=self.na_values, # pyright: ignore[reportCallIssue] + ) + else: + return array_type._from_sequence_of_strings(values, dtype=cast_type) + except NotImplementedError as err: + raise NotImplementedError( + f"Extension Array: {array_type} must implement " + "_from_sequence_of_strings in order to be used in parser methods" + ) from err + + elif isinstance(values, ExtensionArray): + values = values.astype(cast_type, copy=False) + elif issubclass(cast_type.type, str): + # TODO: why skipna=True here and False above? 
some tests depend + # on it here, but nothing fails if we change it above + # (as no tests get there as of 2022-12-06) + values = lib.ensure_string_array( + values, skipna=True, convert_na_value=False + ) + else: + try: + values = astype_array(values, cast_type, copy=True) + except ValueError as err: + raise ValueError( + f"Unable to convert column {column} to type {cast_type}" + ) from err + return values + @cache_readonly def _have_mi_columns(self) -> bool: if self.header is None: @@ -415,8 +595,7 @@ def _infer_columns( joi = list(map(str, header[:-1] if have_mi_columns else header)) msg = f"[{','.join(joi)}], len of {len(joi)}, " raise ValueError( - f"Passed header={msg}" - f"but only {self.line_pos} lines in file" + f"Passed header={msg}but only {self.line_pos} lines in file" ) from err # We have an empty file, so check @@ -424,7 +603,7 @@ def _infer_columns( # serve as the 'line' for parsing if have_mi_columns and hr > 0: if clear_buffer: - self._clear_buffer() + self.buf.clear() columns.append([None] * len(columns[-1])) return columns, num_original_columns, unnamed_cols @@ -506,7 +685,7 @@ def _infer_columns( num_original_columns = len(this_columns) if clear_buffer: - self._clear_buffer() + self.buf.clear() first_line: list[Scalar] | None if names is not None: @@ -593,7 +772,7 @@ def _handle_usecols( col_indices: set[int] | list[int] if self.usecols is not None: if callable(self.usecols): - col_indices = self._evaluate_usecols(self.usecols, usecols_key) + col_indices = evaluate_callable_usecols(self.usecols, usecols_key) elif any(isinstance(u, str) for u in self.usecols): if len(columns) > 1: raise ValueError( @@ -689,7 +868,7 @@ def _check_for_bom(self, first_row: list[Scalar]) -> list[Scalar]: new_row_list: list[Scalar] = [new_row] return new_row_list + first_row[1:] - def _is_line_empty(self, line: list[Scalar]) -> bool: + def _is_line_empty(self, line: Sequence[Scalar]) -> bool: """ Check if a line is empty or not. @@ -725,13 +904,11 @@ def _next_line(self) -> list[Scalar]: if ret: line = ret[0] break - except IndexError: - raise StopIteration + except IndexError as err: + raise StopIteration from err else: while self.skipfunc(self.pos): self.pos += 1 - # assert for mypy, data is Iterator[str] or None, would error in next - assert self.data is not None next(self.data) while True: @@ -800,12 +977,10 @@ def _next_iter_line(self, row_num: int) -> list[Scalar] | None: The row number of the line being parsed. 
""" try: - # assert for mypy, data is Iterator[str] or None, would error in next - assert self.data is not None + assert not isinstance(self.data, list) line = next(self.data) - # for mypy - assert isinstance(line, list) - return line + # lie about list[str] vs list[Scalar] to minimize ignores + return line # type: ignore[return-value] except csv.Error as e: if self.on_bad_lines in ( self.BadLineHandleMethod.ERROR, @@ -855,7 +1030,7 @@ def _check_comments(self, lines: list[list[Scalar]]) -> list[list[Scalar]]: ret.append(rl) return ret - def _remove_empty_lines(self, lines: list[list[Scalar]]) -> list[list[Scalar]]: + def _remove_empty_lines(self, lines: list[list[T]]) -> list[list[T]]: """ Iterate through the lines and remove any that are either empty or contain only one whitespace value @@ -876,8 +1051,9 @@ def _remove_empty_lines(self, lines: list[list[Scalar]]) -> list[list[Scalar]]: for line in lines if ( len(line) > 1 - or len(line) == 1 - and (not isinstance(line[0], str) or line[0].strip()) + or ( + len(line) == 1 and (not isinstance(line[0], str) or line[0].strip()) + ) ) ] return ret @@ -917,9 +1093,6 @@ def _check_decimal(self, lines: list[list[Scalar]]) -> list[list[Scalar]]: lines=lines, search=self.decimal, replace="." ) - def _clear_buffer(self) -> None: - self.buf = [] - def _get_index_name( self, ) -> tuple[Sequence[Hashable] | None, list[Hashable], list[Hashable]]: @@ -1030,7 +1203,7 @@ def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]: if callable(self.on_bad_lines): new_l = self.on_bad_lines(_content) if new_l is not None: - content.append(new_l) + content.append(new_l) # pyright: ignore[reportArgumentType] elif self.on_bad_lines in ( self.BadLineHandleMethod.ERROR, self.BadLineHandleMethod.WARN, @@ -1045,8 +1218,7 @@ def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]: for row_num, actual_len in bad_lines: msg = ( - f"Expected {col_len} fields in line {row_num + 1}, saw " - f"{actual_len}" + f"Expected {col_len} fields in line {row_num + 1}, saw {actual_len}" ) if ( self.delimiter @@ -1121,9 +1293,6 @@ def _get_lines(self, rows: int | None = None) -> list[list[Scalar]]: row_ct = 0 offset = self.pos if self.pos is not None else 0 while row_ct < rows: - # assert for mypy, data is Iterator[str] or None, would - # error in next - assert self.data is not None new_row = next(self.data) if not self.skipfunc(offset + row_index): row_ct += 1 @@ -1299,7 +1468,7 @@ def detect_colspecs( shifted[0] = 0 edges = np.where((mask ^ shifted) == 1)[0] edge_pairs = list(zip(edges[::2], edges[1::2])) - return edge_pairs + return edge_pairs # type: ignore[return-value] def __next__(self) -> list[str]: # Argument 1 to "next" has incompatible type "Union[IO[str], @@ -1338,7 +1507,7 @@ def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> FixedWidthReader: self.infer_nrows, ) - def _remove_empty_lines(self, lines: list[list[Scalar]]) -> list[list[Scalar]]: + def _remove_empty_lines(self, lines: list[list[T]]) -> list[list[T]]: """ Returns the list of lines without the empty ones. With fixed-width fields, empty lines become arrays of empty strings. @@ -1352,10 +1521,6 @@ def _remove_empty_lines(self, lines: list[list[Scalar]]) -> list[list[Scalar]]: ] -def count_empty_vals(vals) -> int: - return sum(1 for v in vals if v == "" or v is None) - - def _validate_skipfooter_arg(skipfooter: int) -> int: """ Validate the 'skipfooter' parameter. 
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index a9b41b45aba2f..67193f930b4dc 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -3,6 +3,7 @@ GH#48849 provides a convenient way of deprecating keyword arguments """ + from __future__ import annotations from collections import ( @@ -16,9 +17,8 @@ IO, TYPE_CHECKING, Any, - Callable, + Generic, Literal, - NamedTuple, TypedDict, overload, ) @@ -26,22 +26,22 @@ import numpy as np -from pandas._config import using_copy_on_write - from pandas._libs import lib from pandas._libs.parsers import STR_NA_VALUES from pandas.errors import ( AbstractMethodError, ParserWarning, ) -from pandas.util._decorators import Appender +from pandas.util._decorators import ( + Appender, + set_module, +) from pandas.util._exceptions import find_stack_level from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.common import ( is_file_like, is_float, - is_hashable, is_integer, is_list_like, pandas_dtype, @@ -72,6 +72,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Iterable, Mapping, @@ -85,14 +86,66 @@ DtypeArg, DtypeBackend, FilePath, + HashableT, IndexLabel, ReadCsvBuffer, Self, StorageOptions, + Unpack, UsecolsArgType, ) -_doc_read_csv_and_table = ( - r""" + + class _read_shared(TypedDict, Generic[HashableT], total=False): + # annotations shared between read_csv/fwf/table's overloads + # NOTE: Keep in sync with the annotations of the implementation + sep: str | None | lib.NoDefault + delimiter: str | None | lib.NoDefault + header: int | Sequence[int] | None | Literal["infer"] + names: Sequence[Hashable] | None | lib.NoDefault + index_col: IndexLabel | Literal[False] | None + usecols: UsecolsArgType + dtype: DtypeArg | None + engine: CSVEngine | None + converters: Mapping[HashableT, Callable] | None + true_values: list | None + false_values: list | None + skipinitialspace: bool + skiprows: list[int] | int | Callable[[Hashable], bool] | None + skipfooter: int + nrows: int | None + na_values: ( + Hashable | Iterable[Hashable] | Mapping[Hashable, Iterable[Hashable]] | None + ) + keep_default_na: bool + na_filter: bool + skip_blank_lines: bool + parse_dates: bool | Sequence[Hashable] | None + date_format: str | dict[Hashable, str] | None + dayfirst: bool + cache_dates: bool + compression: CompressionOptions + thousands: str | None + decimal: str + lineterminator: str | None + quotechar: str + quoting: int + doublequote: bool + escapechar: str | None + comment: str | None + encoding: str | None + encoding_errors: str | None + dialect: str | csv.Dialect | None + on_bad_lines: str + low_memory: bool + memory_map: bool + float_precision: Literal["high", "legacy", "round_trip"] | None + storage_options: StorageOptions | None + dtype_backend: DtypeBackend | lib.NoDefault +else: + _read_shared = dict + + +_doc_read_csv_and_table = r""" {summary} Also supports optionally iterating or breaking of the file @@ -138,6 +191,12 @@ parameter ignores commented lines and empty lines if ``skip_blank_lines=True``, so ``header=0`` denotes the first line of data rather than the first line of the file. + + When inferred from the file contents, headers are kept distinct from + each other by renaming duplicate names with a numeric suffix of the form + ``".{{count}}"`` starting from 1, e.g. ``"foo"`` and ``"foo.1"``. + Empty headers are named ``"Unnamed: {{i}}"`` or ``"Unnamed: {{i}}_level_{{level}}"`` + in the case of MultiIndex columns. 
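Not part of the patch: a small illustration of the header-renaming rule documented above, using an in-memory CSV with invented data.

```python
import io

import pandas as pd

# Duplicate header names get a ".{count}" suffix starting from 1; empty
# headers become "Unnamed: {i}", matching the docstring addition above.
buf = io.StringIO("foo,foo,,bar\n1,2,3,4\n")
df = pd.read_csv(buf)
print(list(df.columns))  # ['foo', 'foo.1', 'Unnamed: 2', 'bar']
```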
names : Sequence of Hashable, optional Sequence of column labels to apply. If the file contains a header row, then you should explicitly pass ``header=0`` to override the column names. @@ -212,13 +271,22 @@ Number of lines at bottom of file to skip (Unsupported with ``engine='c'``). nrows : int, optional Number of rows of file to read. Useful for reading pieces of large files. + Refers to the number of data rows in the returned DataFrame, excluding: + + * The header row containing column names. + * Rows before the header row, if ``header=1`` or larger. + + Example usage: + + * To read the first 999,999 (non-header) rows: + ``read_csv(..., nrows=999999)`` + + * To read rows 1,000,000 through 1,999,999: + ``read_csv(..., skiprows=1000000, nrows=999999)`` na_values : Hashable, Iterable of Hashable or dict of {{Hashable : Iterable}}, optional Additional strings to recognize as ``NA``/``NaN``. If ``dict`` passed, specific per-column ``NA`` values. By default the following values are interpreted as - ``NaN``: " """ - + fill('", "'.join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" ") - + """ ". - + ``NaN``: "{na_values_str}". keep_default_na : bool, default True Whether or not to include the default ``NaN`` values when parsing the data. Depending on whether ``na_values`` is passed in, the behavior is as follows: @@ -238,22 +306,15 @@ Detect missing value markers (empty strings and the value of ``na_values``). In data without any ``NA`` values, passing ``na_filter=False`` can improve the performance of reading a large file. -verbose : bool, default False - Indicate number of ``NA`` values placed in non-numeric columns. skip_blank_lines : bool, default True If ``True``, skip over blank lines rather than interpreting as ``NaN`` values. -parse_dates : bool, list of Hashable, list of lists or dict of {{Hashable : list}}, \ -default False +parse_dates : bool, None, list of Hashable, default None The behavior is as follows: - * ``bool``. If ``True`` -> try parsing the index. Note: Automatically set to - ``True`` if ``date_format`` or ``date_parser`` arguments have been passed. + * ``bool``. If ``True`` -> try parsing the index. + * ``None``. Behaves like ``True`` if ``date_format`` is specified. * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3 each as a separate date column. - * ``list`` of ``list``. e.g. If ``[[1, 3]]`` -> combine columns 1 and 3 and parse - as a single date column. Values are joined with a space before parsing. - * ``dict``, e.g. ``{{'foo' : [1, 3]}}`` -> parse columns 1, 3 as date and call - result 'foo'. Values are joined with a space before parsing. If a column or index cannot be represented as an array of ``datetime``, say because of an unparsable value or a mixture of timezones, the column @@ -262,34 +323,8 @@ :func:`~pandas.read_csv`. Note: A fast-path exists for iso8601-formatted dates. -infer_datetime_format : bool, default False - If ``True`` and ``parse_dates`` is enabled, pandas will attempt to infer the - format of the ``datetime`` strings in the columns, and if it can be inferred, - switch to a faster method of parsing them. In some cases this can increase - the parsing speed by 5-10x. - - .. deprecated:: 2.0.0 - A strict version of this argument is now the default, passing it has no effect. - -keep_date_col : bool, default False - If ``True`` and ``parse_dates`` specifies combining multiple columns then - keep the original columns. 
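The ``keep_date_col`` entry removed above and the ``date_parser`` entry removed just below correspond to keywords dropped from the signatures later in this patch. The migration the old deprecation warnings pointed to looks roughly like this (a sketch with invented column names, not code from the patch):

```python
import io

import pandas as pd

buf = io.StringIO("date,time,value\n2024-01-01,12:30:00,7\n")

# Before: parse_dates=[["date", "time"]] combined both columns while parsing.
# After: read the columns as plain strings, then combine with pd.to_datetime.
df = pd.read_csv(buf)
df["datetime"] = pd.to_datetime(df["date"] + " " + df["time"])
df = df.drop(columns=["date", "time"])  # what keep_date_col=False used to do
```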
-date_parser : Callable, optional - Function to use for converting a sequence of string columns to an array of - ``datetime`` instances. The default uses ``dateutil.parser.parser`` to do the - conversion. pandas will try to call ``date_parser`` in three different ways, - advancing to the next if an exception occurs: 1) Pass one or more arrays - (as defined by ``parse_dates``) as arguments; 2) concatenate (row-wise) the - string values from the columns defined by ``parse_dates`` into a single array - and pass that; and 3) call ``date_parser`` once for each row using one or - more strings (corresponding to the columns defined by ``parse_dates``) as - arguments. - - .. deprecated:: 2.0.0 - Use ``date_format`` instead, or read in as ``object`` and then apply - :func:`~pandas.to_datetime` as-needed. date_format : str or dict of column -> format, optional - Format to use for parsing dates when used in conjunction with ``parse_dates``. + Format to use for parsing dates and/or times when used in conjunction with ``parse_dates``. The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See `strftime documentation <https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior>`_ for more information on choices, though note that :const:`"%f"` will parse all the way up to nanoseconds. You can also pass: - "ISO8601", to parse any `ISO8601 <https://en.wikipedia.org/wiki/ISO_8601>`_ - time string (not necessarily in exactly the same format); + time string (not necessarily in exactly the same format); - "mixed", to infer the format for each element individually. This is risky, - and you should probably use it along with `dayfirst`. + and you should probably use it along with `dayfirst`. .. versionadded:: 2.0.0 dayfirst : bool, default False @@ -333,8 +368,7 @@ quotechar : str (length 1), optional Character used to denote the start and end of a quoted item. Quoted items can include the ``delimiter`` and it will be ignored. -quoting : {{0 or csv.QUOTE_MINIMAL, 1 or csv.QUOTE_ALL, 2 or csv.QUOTE_NONNUMERIC, \ -3 or csv.QUOTE_NONE}}, default csv.QUOTE_MINIMAL +quoting : {{0 or csv.QUOTE_MINIMAL, 1 or csv.QUOTE_ALL, 2 or csv.QUOTE_NONNUMERIC, 3 or csv.QUOTE_NONE}}, default csv.QUOTE_MINIMAL Control field quoting behavior per ``csv.QUOTE_*`` constants. Default is ``csv.QUOTE_MINIMAL`` (i.e., 0) which implies that only fields containing special characters are quoted (e.g., characters defined in ``quotechar``, ``delimiter``, @@ -373,39 +407,33 @@ documentation for more details. on_bad_lines : {{'error', 'warn', 'skip'}} or Callable, default 'error' Specifies what to do upon encountering a bad line (a line with too many fields). - Allowed values are : + Allowed values are: - ``'error'``, raise an Exception when a bad line is encountered. - ``'warn'``, raise a warning when a bad line is encountered and skip that line. - ``'skip'``, skip bad lines without raising or warning when they are encountered. + - Callable, function that will process a single bad line. + - With ``engine='python'``, function with signature + ``(bad_line: list[str]) -> list[str] | None``. + ``bad_line`` is a list of strings split by the ``sep``. + If the function returns ``None``, the bad line will be ignored. + If the function returns a new ``list`` of strings with more elements than + expected, a ``ParserWarning`` will be emitted while dropping extra elements. + - With ``engine='pyarrow'``, function with signature + as described in pyarrow documentation: `invalid_row_handler + <https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html#pyarrow.csv.ParseOptions.invalid_row_handler>`_. .. versionadded:: 1.3.0 .. versionadded:: 1.4.0 - - Callable, function with signature - ``(bad_line: list[str]) -> list[str] | None`` that will process a single - bad line. ``bad_line`` is a list of strings split by the ``sep``. - If the function returns ``None``, the bad line will be ignored. - If the function returns a new ``list`` of strings with more elements than - expected, a ``ParserWarning`` will be emitted while dropping extra elements. - Only supported when ``engine='python'`` + Callable .. versionchanged:: 2.2.0 - - Callable, function with signature - as described in `pyarrow documentation - <https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html#pyarrow.csv.ParseOptions.invalid_row_handler>`_ when ``engine='pyarrow'`` - -delim_whitespace : bool, default False - Specifies whether or not whitespace (e.g. ``' '`` or ``'\\t'``) will be - used as the ``sep`` delimiter. Equivalent to setting ``sep='\\s+'``. If this option - is set to ``True``, nothing should be passed in for the ``delimiter`` - parameter. + Callable for ``engine='pyarrow'`` - .. deprecated:: 2.2.0 - Use ``sep="\\s+"`` instead. low_memory : bool, default True Internally process the file in chunks, resulting in lower memory use while parsing, but possibly mixed type inference. To ensure no mixed @@ -425,14 +453,14 @@ {storage_options} -dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable' +dtype_backend : {{'numpy_nullable', 'pyarrow'}} Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). Behaviour is as follows: + (still experimental). If not specified, the default behavior + is to not use nullable data types. If specified, the behavior + is as follows: * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. + * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` :class:`DataFrame` .. versionadded:: 2.0 @@ -451,12 +479,85 @@ Examples -------- >>> pd.{func_name}('data.csv') # doctest: +SKIP -""" -) + Name Value +0 foo 1 +1 bar 2 +2 #baz 3 + +Index and header can be specified via the `index_col` and `header` arguments. + +>>> pd.{func_name}('data.csv', header=None) # doctest: +SKIP + 0 1 +0 Name Value +1 foo 1 +2 bar 2 +3 #baz 3 + +>>> pd.{func_name}('data.csv', index_col='Value') # doctest: +SKIP + Name +Value +1 foo +2 bar +3 #baz + +Column types are inferred but can be explicitly specified using the dtype argument. + +>>> pd.{func_name}('data.csv', dtype={{'Value': float}}) # doctest: +SKIP + Name Value +0 foo 1.0 +1 bar 2.0 +2 #baz 3.0 + +True, False, and NA values, and thousands separators have defaults, +but can be explicitly specified, too. Supply the values you would like +as strings or lists of strings! + +>>> pd.{func_name}('data.csv', na_values=['foo', 'bar']) # doctest: +SKIP + Name Value +0 NaN 1 +1 NaN 2 +2 #baz 3 + +Comment lines in the input file can be skipped using the `comment` argument. + +>>> pd.{func_name}('data.csv', comment='#') # doctest: +SKIP + Name Value +0 foo 1 +1 bar 2 + +By default, columns with dates will be read as ``object`` rather than ``datetime``. + +>>> df = pd.{func_name}('tmp.csv') # doctest: +SKIP + +>>> df # doctest: +SKIP + col 1 col 2 col 3 +0 10 10/04/2018 Sun 15 Jan 2023 +1 20 15/04/2018 Fri 12 May 2023 + +>>> df.dtypes # doctest: +SKIP +col 1 int64 +col 2 object +col 3 object +dtype: object + +Specific columns can be parsed as dates by using the `parse_dates` and +`date_format` arguments. + +>>> df = pd.{func_name}( +... 'tmp.csv', +... parse_dates=[1, 2], +... date_format={{'col 2': '%d/%m/%Y', 'col 3': '%a %d %b %Y'}}, +...
) # doctest: +SKIP + +>>> df.dtypes # doctest: +SKIP +col 1 int64 +col 2 datetime64[ns] +col 3 datetime64[ns] +dtype: object +""" # noqa: E501 class _C_Parser_Defaults(TypedDict): - delim_whitespace: Literal[False] na_filter: Literal[True] low_memory: Literal[True] memory_map: Literal[False] @@ -464,7 +565,6 @@ class _C_Parser_Defaults(TypedDict): _c_parser_defaults: _C_Parser_Defaults = { - "delim_whitespace": False, "na_filter": True, "low_memory": True, "memory_map": False, @@ -490,36 +590,26 @@ class _Fwf_Defaults(TypedDict): "thousands", "memory_map", "dialect", - "delim_whitespace", "quoting", "lineterminator", "converters", "iterator", "dayfirst", - "verbose", "skipinitialspace", "low_memory", } -class _DeprecationConfig(NamedTuple): - default_value: Any - msg: str | None - - @overload -def validate_integer(name: str, val: None, min_val: int = ...) -> None: - ... +def validate_integer(name: str, val: None, min_val: int = ...) -> None: ... @overload -def validate_integer(name: str, val: float, min_val: int = ...) -> int: - ... +def validate_integer(name: str, val: float, min_val: int = ...) -> int: ... @overload -def validate_integer(name: str, val: int | None, min_val: int = ...) -> int | None: - ... +def validate_integer(name: str, val: int | None, min_val: int = ...) -> int | None: ... def validate_integer( @@ -582,13 +672,10 @@ def _read( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], kwds ) -> DataFrame | TextFileReader: """Generic reader of line files.""" - # if we pass a date_parser and parse_dates=False, we should not parse the + # if we pass a date_format and parse_dates=False, we should not parse the # dates GH#44366 if kwds.get("parse_dates", None) is None: - if ( - kwds.get("date_parser", lib.no_default) is lib.no_default - and kwds.get("date_format", None) is None - ): + if kwds.get("date_format", None) is None: kwds["parse_dates"] = False else: kwds["parse_dates"] = True @@ -596,6 +683,14 @@ def _read( # Extract some of the arguments (pass chunksize on). 
iterator = kwds.get("iterator", False) chunksize = kwds.get("chunksize", None) + + # Check type of encoding_errors + errors = kwds.get("encoding_errors", "strict") + if not isinstance(errors, str): + raise ValueError( + f"encoding_errors must be a string, got {type(errors).__name__}" + ) + if kwds.get("engine") == "pyarrow": if iterator: raise ValueError( @@ -624,243 +719,44 @@ def _read( return parser.read(nrows) -# iterator=True -> TextFileReader @overload def read_csv( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], *, - sep: str | None | lib.NoDefault = ..., - delimiter: str | None | lib.NoDefault = ..., - header: int | Sequence[int] | None | Literal["infer"] = ..., - names: Sequence[Hashable] | None | lib.NoDefault = ..., - index_col: IndexLabel | Literal[False] | None = ..., - usecols: UsecolsArgType = ..., - dtype: DtypeArg | None = ..., - engine: CSVEngine | None = ..., - converters: Mapping[Hashable, Callable] | None = ..., - true_values: list | None = ..., - false_values: list | None = ..., - skipinitialspace: bool = ..., - skiprows: list[int] | int | Callable[[Hashable], bool] | None = ..., - skipfooter: int = ..., - nrows: int | None = ..., - na_values: Hashable - | Iterable[Hashable] - | Mapping[Hashable, Iterable[Hashable]] - | None = ..., - na_filter: bool = ..., - verbose: bool | lib.NoDefault = ..., - skip_blank_lines: bool = ..., - parse_dates: bool | Sequence[Hashable] | None = ..., - infer_datetime_format: bool | lib.NoDefault = ..., - keep_date_col: bool | lib.NoDefault = ..., - date_parser: Callable | lib.NoDefault = ..., - date_format: str | dict[Hashable, str] | None = ..., - dayfirst: bool = ..., - cache_dates: bool = ..., iterator: Literal[True], chunksize: int | None = ..., - compression: CompressionOptions = ..., - thousands: str | None = ..., - decimal: str = ..., - lineterminator: str | None = ..., - quotechar: str = ..., - quoting: int = ..., - doublequote: bool = ..., - escapechar: str | None = ..., - comment: str | None = ..., - encoding: str | None = ..., - encoding_errors: str | None = ..., - dialect: str | csv.Dialect | None = ..., - on_bad_lines=..., - delim_whitespace: bool | lib.NoDefault = ..., - low_memory: bool = ..., - memory_map: bool = ..., - float_precision: Literal["high", "legacy"] | None = ..., - storage_options: StorageOptions = ..., - dtype_backend: DtypeBackend | lib.NoDefault = ..., -) -> TextFileReader: - ... - - -# chunksize=int -> TextFileReader + **kwds: Unpack[_read_shared[HashableT]], +) -> TextFileReader: ... 
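The overloads that follow replace the fully spelled-out signatures: each one now lists only the arguments that discriminate the return type and folds everything else into ``**kwds``. Not from the patch, a usage sketch of the ``iterator=True`` case, which is pinned to a ``TextFileReader`` result:

```python
import io

import pandas as pd

buf = io.StringIO("a,b\n1,2\n3,4\n5,6\n")
with pd.read_csv(buf, iterator=True) as reader:  # TextFileReader, not DataFrame
    head = reader.get_chunk(2)  # first two data rows
    tail = reader.read()        # everything that is left
```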
+ + @overload def read_csv( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], *, - sep: str | None | lib.NoDefault = ..., - delimiter: str | None | lib.NoDefault = ..., - header: int | Sequence[int] | None | Literal["infer"] = ..., - names: Sequence[Hashable] | None | lib.NoDefault = ..., - index_col: IndexLabel | Literal[False] | None = ..., - usecols: UsecolsArgType = ..., - dtype: DtypeArg | None = ..., - engine: CSVEngine | None = ..., - converters: Mapping[Hashable, Callable] | None = ..., - true_values: list | None = ..., - false_values: list | None = ..., - skipinitialspace: bool = ..., - skiprows: list[int] | int | Callable[[Hashable], bool] | None = ..., - skipfooter: int = ..., - nrows: int | None = ..., - na_values: Hashable - | Iterable[Hashable] - | Mapping[Hashable, Iterable[Hashable]] - | None = ..., - keep_default_na: bool = ..., - na_filter: bool = ..., - verbose: bool | lib.NoDefault = ..., - skip_blank_lines: bool = ..., - parse_dates: bool | Sequence[Hashable] | None = ..., - infer_datetime_format: bool | lib.NoDefault = ..., - keep_date_col: bool | lib.NoDefault = ..., - date_parser: Callable | lib.NoDefault = ..., - date_format: str | dict[Hashable, str] | None = ..., - dayfirst: bool = ..., - cache_dates: bool = ..., iterator: bool = ..., chunksize: int, - compression: CompressionOptions = ..., - thousands: str | None = ..., - decimal: str = ..., - lineterminator: str | None = ..., - quotechar: str = ..., - quoting: int = ..., - doublequote: bool = ..., - escapechar: str | None = ..., - comment: str | None = ..., - encoding: str | None = ..., - encoding_errors: str | None = ..., - dialect: str | csv.Dialect | None = ..., - on_bad_lines=..., - delim_whitespace: bool | lib.NoDefault = ..., - low_memory: bool = ..., - memory_map: bool = ..., - float_precision: Literal["high", "legacy"] | None = ..., - storage_options: StorageOptions = ..., - dtype_backend: DtypeBackend | lib.NoDefault = ..., -) -> TextFileReader: - ... - - -# default case -> DataFrame + **kwds: Unpack[_read_shared[HashableT]], +) -> TextFileReader: ... 
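Likewise, a concrete ``chunksize`` makes the call lazy regardless of ``iterator`` and is consumed as an iterable of DataFrames; again an illustrative sketch rather than patch code:

```python
import io

import pandas as pd

buf = io.StringIO("a,b\n" + "\n".join(f"{i},{i * i}" for i in range(10)))
total = 0
for chunk in pd.read_csv(buf, chunksize=4):  # each chunk is a DataFrame
    total += int(chunk["a"].sum())           # aggregate without holding all rows
```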
+ + @overload def read_csv( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], *, - sep: str | None | lib.NoDefault = ..., - delimiter: str | None | lib.NoDefault = ..., - header: int | Sequence[int] | None | Literal["infer"] = ..., - names: Sequence[Hashable] | None | lib.NoDefault = ..., - index_col: IndexLabel | Literal[False] | None = ..., - usecols: UsecolsArgType = ..., - dtype: DtypeArg | None = ..., - engine: CSVEngine | None = ..., - converters: Mapping[Hashable, Callable] | None = ..., - true_values: list | None = ..., - false_values: list | None = ..., - skipinitialspace: bool = ..., - skiprows: list[int] | int | Callable[[Hashable], bool] | None = ..., - skipfooter: int = ..., - nrows: int | None = ..., - na_values: Hashable - | Iterable[Hashable] - | Mapping[Hashable, Iterable[Hashable]] - | None = ..., - keep_default_na: bool = ..., - na_filter: bool = ..., - verbose: bool | lib.NoDefault = ..., - skip_blank_lines: bool = ..., - parse_dates: bool | Sequence[Hashable] | None = ..., - infer_datetime_format: bool | lib.NoDefault = ..., - keep_date_col: bool | lib.NoDefault = ..., - date_parser: Callable | lib.NoDefault = ..., - date_format: str | dict[Hashable, str] | None = ..., - dayfirst: bool = ..., - cache_dates: bool = ..., iterator: Literal[False] = ..., chunksize: None = ..., - compression: CompressionOptions = ..., - thousands: str | None = ..., - decimal: str = ..., - lineterminator: str | None = ..., - quotechar: str = ..., - quoting: int = ..., - doublequote: bool = ..., - escapechar: str | None = ..., - comment: str | None = ..., - encoding: str | None = ..., - encoding_errors: str | None = ..., - dialect: str | csv.Dialect | None = ..., - on_bad_lines=..., - delim_whitespace: bool | lib.NoDefault = ..., - low_memory: bool = ..., - memory_map: bool = ..., - float_precision: Literal["high", "legacy"] | None = ..., - storage_options: StorageOptions = ..., - dtype_backend: DtypeBackend | lib.NoDefault = ..., -) -> DataFrame: - ... - - -# Unions -> DataFrame | TextFileReader + **kwds: Unpack[_read_shared[HashableT]], +) -> DataFrame: ... 
+ + @overload def read_csv( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], *, - sep: str | None | lib.NoDefault = ..., - delimiter: str | None | lib.NoDefault = ..., - header: int | Sequence[int] | None | Literal["infer"] = ..., - names: Sequence[Hashable] | None | lib.NoDefault = ..., - index_col: IndexLabel | Literal[False] | None = ..., - usecols: UsecolsArgType = ..., - dtype: DtypeArg | None = ..., - engine: CSVEngine | None = ..., - converters: Mapping[Hashable, Callable] | None = ..., - true_values: list | None = ..., - false_values: list | None = ..., - skipinitialspace: bool = ..., - skiprows: list[int] | int | Callable[[Hashable], bool] | None = ..., - skipfooter: int = ..., - nrows: int | None = ..., - na_values: Hashable - | Iterable[Hashable] - | Mapping[Hashable, Iterable[Hashable]] - | None = ..., - keep_default_na: bool = ..., - na_filter: bool = ..., - verbose: bool | lib.NoDefault = ..., - skip_blank_lines: bool = ..., - parse_dates: bool | Sequence[Hashable] | None = ..., - infer_datetime_format: bool | lib.NoDefault = ..., - keep_date_col: bool | lib.NoDefault = ..., - date_parser: Callable | lib.NoDefault = ..., - date_format: str | dict[Hashable, str] | None = ..., - dayfirst: bool = ..., - cache_dates: bool = ..., iterator: bool = ..., chunksize: int | None = ..., - compression: CompressionOptions = ..., - thousands: str | None = ..., - decimal: str = ..., - lineterminator: str | None = ..., - quotechar: str = ..., - quoting: int = ..., - doublequote: bool = ..., - escapechar: str | None = ..., - comment: str | None = ..., - encoding: str | None = ..., - encoding_errors: str | None = ..., - dialect: str | csv.Dialect | None = ..., - on_bad_lines=..., - delim_whitespace: bool | lib.NoDefault = ..., - low_memory: bool = ..., - memory_map: bool = ..., - float_precision: Literal["high", "legacy"] | None = ..., - storage_options: StorageOptions = ..., - dtype_backend: DtypeBackend | lib.NoDefault = ..., -) -> DataFrame | TextFileReader: - ... + **kwds: Unpack[_read_shared[HashableT]], +) -> DataFrame | TextFileReader: ... 
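The pattern behind these slimmed-down overloads (PEP 692: one ``total=False`` TypedDict of shared keywords spliced into each signature with ``Unpack``) can be reproduced in miniature. This is a standalone sketch with invented names, not pandas code:

```python
from __future__ import annotations

from typing import Literal, TypedDict, overload

from typing_extensions import Unpack  # typing.Unpack on Python 3.11+


class SharedKwargs(TypedDict, total=False):
    # stands in for _read_shared: keywords common to every overload
    sep: str | None
    nrows: int | None


@overload
def read(path: str, *, iterator: Literal[True], **kwds: Unpack[SharedKwargs]) -> list[str]: ...
@overload
def read(path: str, *, iterator: Literal[False] = ..., **kwds: Unpack[SharedKwargs]) -> str: ...
def read(path: str, *, iterator: bool = False, **kwds):
    # Toy body: only the signatures matter. Type checkers pick the overload,
    # and hence the return type, from the literal value of `iterator`.
    with open(path, encoding="utf-8") as f:
        text = f.read()
    return text.splitlines() if iterator else text
```

The shared annotations now live in one place, so a future keyword change touches a single TypedDict instead of four near-identical signatures per reader.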
@Appender( @@ -869,12 +765,16 @@ def read_csv( summary="Read a comma-separated values (csv) file into DataFrame.", see_also_func_name="read_table", see_also_func_summary="Read general delimited file into DataFrame.", + na_values_str=fill( + '", "'.join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" " + ), _default_sep="','", storage_options=_shared_docs["storage_options"], decompression_options=_shared_docs["decompression_options"] % "filepath_or_buffer", ) ) +@set_module("pandas") def read_csv( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], *, @@ -888,7 +788,7 @@ def read_csv( # General Parsing Configuration dtype: DtypeArg | None = None, engine: CSVEngine | None = None, - converters: Mapping[Hashable, Callable] | None = None, + converters: Mapping[HashableT, Callable] | None = None, true_values: list | None = None, false_values: list | None = None, skipinitialspace: bool = False, @@ -902,13 +802,9 @@ def read_csv( | None = None, keep_default_na: bool = True, na_filter: bool = True, - verbose: bool | lib.NoDefault = lib.no_default, skip_blank_lines: bool = True, # Datetime Handling parse_dates: bool | Sequence[Hashable] | None = None, - infer_datetime_format: bool | lib.NoDefault = lib.no_default, - keep_date_col: bool | lib.NoDefault = lib.no_default, - date_parser: Callable | lib.NoDefault = lib.no_default, date_format: str | dict[Hashable, str] | None = None, dayfirst: bool = False, cache_dates: bool = True, @@ -931,78 +827,12 @@ def read_csv( # Error Handling on_bad_lines: str = "error", # Internal - delim_whitespace: bool | lib.NoDefault = lib.no_default, low_memory: bool = _c_parser_defaults["low_memory"], memory_map: bool = False, - float_precision: Literal["high", "legacy"] | None = None, + float_precision: Literal["high", "legacy", "round_trip"] | None = None, storage_options: StorageOptions | None = None, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, ) -> DataFrame | TextFileReader: - if keep_date_col is not lib.no_default: - # GH#55569 - warnings.warn( - "The 'keep_date_col' keyword in pd.read_csv is deprecated and " - "will be removed in a future version. Explicitly remove unwanted " - "columns after parsing instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - keep_date_col = False - - if lib.is_list_like(parse_dates): - # GH#55569 - depr = False - # error: Item "bool" of "bool | Sequence[Hashable] | None" has no - # attribute "__iter__" (not iterable) - if not all(is_hashable(x) for x in parse_dates): # type: ignore[union-attr] - depr = True - elif isinstance(parse_dates, dict) and any( - lib.is_list_like(x) for x in parse_dates.values() - ): - depr = True - if depr: - warnings.warn( - "Support for nested sequences for 'parse_dates' in pd.read_csv " - "is deprecated. Combine the desired columns with pd.to_datetime " - "after parsing instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - - if infer_datetime_format is not lib.no_default: - warnings.warn( - "The argument 'infer_datetime_format' is deprecated and will " - "be removed in a future version. " - "A strict version of it is now the default, see " - "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. " - "You can safely remove this argument.", - FutureWarning, - stacklevel=find_stack_level(), - ) - - if delim_whitespace is not lib.no_default: - # GH#55569 - warnings.warn( - "The 'delim_whitespace' keyword in pd.read_csv is deprecated and " - "will be removed in a future version. 
Use ``sep='\\s+'`` instead", - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - delim_whitespace = False - - if verbose is not lib.no_default: - # GH#55569 - warnings.warn( - "The 'verbose' keyword in pd.read_csv is deprecated and " - "will be removed in a future version.", - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - verbose = False - # locals() should never be modified kwds = locals().copy() del kwds["filepath_or_buffer"] @@ -1011,7 +841,6 @@ def read_csv( kwds_defaults = _refine_defaults_read( dialect, delimiter, - delim_whitespace, engine, sep, on_bad_lines, @@ -1024,232 +853,44 @@ def read_csv( return _read(filepath_or_buffer, kwds) -# iterator=True -> TextFileReader @overload def read_table( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], *, - sep: str | None | lib.NoDefault = ..., - delimiter: str | None | lib.NoDefault = ..., - header: int | Sequence[int] | None | Literal["infer"] = ..., - names: Sequence[Hashable] | None | lib.NoDefault = ..., - index_col: IndexLabel | Literal[False] | None = ..., - usecols: UsecolsArgType = ..., - dtype: DtypeArg | None = ..., - engine: CSVEngine | None = ..., - converters: Mapping[Hashable, Callable] | None = ..., - true_values: list | None = ..., - false_values: list | None = ..., - skipinitialspace: bool = ..., - skiprows: list[int] | int | Callable[[Hashable], bool] | None = ..., - skipfooter: int = ..., - nrows: int | None = ..., - na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., - keep_default_na: bool = ..., - na_filter: bool = ..., - verbose: bool | lib.NoDefault = ..., - skip_blank_lines: bool = ..., - parse_dates: bool | Sequence[Hashable] = ..., - infer_datetime_format: bool | lib.NoDefault = ..., - keep_date_col: bool | lib.NoDefault = ..., - date_parser: Callable | lib.NoDefault = ..., - date_format: str | dict[Hashable, str] | None = ..., - dayfirst: bool = ..., - cache_dates: bool = ..., iterator: Literal[True], chunksize: int | None = ..., - compression: CompressionOptions = ..., - thousands: str | None = ..., - decimal: str = ..., - lineterminator: str | None = ..., - quotechar: str = ..., - quoting: int = ..., - doublequote: bool = ..., - escapechar: str | None = ..., - comment: str | None = ..., - encoding: str | None = ..., - encoding_errors: str | None = ..., - dialect: str | csv.Dialect | None = ..., - on_bad_lines=..., - delim_whitespace: bool = ..., - low_memory: bool = ..., - memory_map: bool = ..., - float_precision: str | None = ..., - storage_options: StorageOptions = ..., - dtype_backend: DtypeBackend | lib.NoDefault = ..., -) -> TextFileReader: - ... - - -# chunksize=int -> TextFileReader + **kwds: Unpack[_read_shared[HashableT]], +) -> TextFileReader: ... 
+ + @overload def read_table( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], *, - sep: str | None | lib.NoDefault = ..., - delimiter: str | None | lib.NoDefault = ..., - header: int | Sequence[int] | None | Literal["infer"] = ..., - names: Sequence[Hashable] | None | lib.NoDefault = ..., - index_col: IndexLabel | Literal[False] | None = ..., - usecols: UsecolsArgType = ..., - dtype: DtypeArg | None = ..., - engine: CSVEngine | None = ..., - converters: Mapping[Hashable, Callable] | None = ..., - true_values: list | None = ..., - false_values: list | None = ..., - skipinitialspace: bool = ..., - skiprows: list[int] | int | Callable[[Hashable], bool] | None = ..., - skipfooter: int = ..., - nrows: int | None = ..., - na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., - keep_default_na: bool = ..., - na_filter: bool = ..., - verbose: bool | lib.NoDefault = ..., - skip_blank_lines: bool = ..., - parse_dates: bool | Sequence[Hashable] = ..., - infer_datetime_format: bool | lib.NoDefault = ..., - keep_date_col: bool | lib.NoDefault = ..., - date_parser: Callable | lib.NoDefault = ..., - date_format: str | dict[Hashable, str] | None = ..., - dayfirst: bool = ..., - cache_dates: bool = ..., iterator: bool = ..., chunksize: int, - compression: CompressionOptions = ..., - thousands: str | None = ..., - decimal: str = ..., - lineterminator: str | None = ..., - quotechar: str = ..., - quoting: int = ..., - doublequote: bool = ..., - escapechar: str | None = ..., - comment: str | None = ..., - encoding: str | None = ..., - encoding_errors: str | None = ..., - dialect: str | csv.Dialect | None = ..., - on_bad_lines=..., - delim_whitespace: bool = ..., - low_memory: bool = ..., - memory_map: bool = ..., - float_precision: str | None = ..., - storage_options: StorageOptions = ..., - dtype_backend: DtypeBackend | lib.NoDefault = ..., -) -> TextFileReader: - ... - - -# default -> DataFrame + **kwds: Unpack[_read_shared[HashableT]], +) -> TextFileReader: ... 
+ + @overload def read_table( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], *, - sep: str | None | lib.NoDefault = ..., - delimiter: str | None | lib.NoDefault = ..., - header: int | Sequence[int] | None | Literal["infer"] = ..., - names: Sequence[Hashable] | None | lib.NoDefault = ..., - index_col: IndexLabel | Literal[False] | None = ..., - usecols: UsecolsArgType = ..., - dtype: DtypeArg | None = ..., - engine: CSVEngine | None = ..., - converters: Mapping[Hashable, Callable] | None = ..., - true_values: list | None = ..., - false_values: list | None = ..., - skipinitialspace: bool = ..., - skiprows: list[int] | int | Callable[[Hashable], bool] | None = ..., - skipfooter: int = ..., - nrows: int | None = ..., - na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., - keep_default_na: bool = ..., - na_filter: bool = ..., - verbose: bool | lib.NoDefault = ..., - skip_blank_lines: bool = ..., - parse_dates: bool | Sequence[Hashable] = ..., - infer_datetime_format: bool | lib.NoDefault = ..., - keep_date_col: bool | lib.NoDefault = ..., - date_parser: Callable | lib.NoDefault = ..., - date_format: str | dict[Hashable, str] | None = ..., - dayfirst: bool = ..., - cache_dates: bool = ..., iterator: Literal[False] = ..., chunksize: None = ..., - compression: CompressionOptions = ..., - thousands: str | None = ..., - decimal: str = ..., - lineterminator: str | None = ..., - quotechar: str = ..., - quoting: int = ..., - doublequote: bool = ..., - escapechar: str | None = ..., - comment: str | None = ..., - encoding: str | None = ..., - encoding_errors: str | None = ..., - dialect: str | csv.Dialect | None = ..., - on_bad_lines=..., - delim_whitespace: bool = ..., - low_memory: bool = ..., - memory_map: bool = ..., - float_precision: str | None = ..., - storage_options: StorageOptions = ..., - dtype_backend: DtypeBackend | lib.NoDefault = ..., -) -> DataFrame: - ... - - -# Unions -> DataFrame | TextFileReader + **kwds: Unpack[_read_shared[HashableT]], +) -> DataFrame: ... 
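``read_table`` receives the identical overload treatment; behaviourally it remains ``read_csv`` with a tab default separator. A one-line reminder (illustrative only):

```python
import io

import pandas as pd

buf = io.StringIO("a\tb\n1\t2\n")
df = pd.read_table(buf)  # same result as pd.read_csv(buf, sep="\t")
```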
+ + @overload def read_table( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], *, - sep: str | None | lib.NoDefault = ..., - delimiter: str | None | lib.NoDefault = ..., - header: int | Sequence[int] | None | Literal["infer"] = ..., - names: Sequence[Hashable] | None | lib.NoDefault = ..., - index_col: IndexLabel | Literal[False] | None = ..., - usecols: UsecolsArgType = ..., - dtype: DtypeArg | None = ..., - engine: CSVEngine | None = ..., - converters: Mapping[Hashable, Callable] | None = ..., - true_values: list | None = ..., - false_values: list | None = ..., - skipinitialspace: bool = ..., - skiprows: list[int] | int | Callable[[Hashable], bool] | None = ..., - skipfooter: int = ..., - nrows: int | None = ..., - na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., - keep_default_na: bool = ..., - na_filter: bool = ..., - verbose: bool | lib.NoDefault = ..., - skip_blank_lines: bool = ..., - parse_dates: bool | Sequence[Hashable] = ..., - infer_datetime_format: bool | lib.NoDefault = ..., - keep_date_col: bool | lib.NoDefault = ..., - date_parser: Callable | lib.NoDefault = ..., - date_format: str | dict[Hashable, str] | None = ..., - dayfirst: bool = ..., - cache_dates: bool = ..., iterator: bool = ..., chunksize: int | None = ..., - compression: CompressionOptions = ..., - thousands: str | None = ..., - decimal: str = ..., - lineterminator: str | None = ..., - quotechar: str = ..., - quoting: int = ..., - doublequote: bool = ..., - escapechar: str | None = ..., - comment: str | None = ..., - encoding: str | None = ..., - encoding_errors: str | None = ..., - dialect: str | csv.Dialect | None = ..., - on_bad_lines=..., - delim_whitespace: bool = ..., - low_memory: bool = ..., - memory_map: bool = ..., - float_precision: str | None = ..., - storage_options: StorageOptions = ..., - dtype_backend: DtypeBackend | lib.NoDefault = ..., -) -> DataFrame | TextFileReader: - ... + **kwds: Unpack[_read_shared[HashableT]], +) -> DataFrame | TextFileReader: ... @Appender( @@ -1260,12 +901,16 @@ def read_table( see_also_func_summary=( "Read a comma-separated values (csv) file into DataFrame." 
), + na_values_str=fill( + '", "'.join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" " + ), _default_sep=r"'\\t' (tab-stop)", storage_options=_shared_docs["storage_options"], decompression_options=_shared_docs["decompression_options"] % "filepath_or_buffer", ) ) +@set_module("pandas") def read_table( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], *, @@ -1279,7 +924,7 @@ def read_table( # General Parsing Configuration dtype: DtypeArg | None = None, engine: CSVEngine | None = None, - converters: Mapping[Hashable, Callable] | None = None, + converters: Mapping[HashableT, Callable] | None = None, true_values: list | None = None, false_values: list | None = None, skipinitialspace: bool = False, @@ -1287,16 +932,15 @@ def read_table( skipfooter: int = 0, nrows: int | None = None, # NA and Missing Data Handling - na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = None, + na_values: Hashable + | Iterable[Hashable] + | Mapping[Hashable, Iterable[Hashable]] + | None = None, keep_default_na: bool = True, na_filter: bool = True, - verbose: bool | lib.NoDefault = lib.no_default, skip_blank_lines: bool = True, # Datetime Handling - parse_dates: bool | Sequence[Hashable] = False, - infer_datetime_format: bool | lib.NoDefault = lib.no_default, - keep_date_col: bool | lib.NoDefault = lib.no_default, - date_parser: Callable | lib.NoDefault = lib.no_default, + parse_dates: bool | Sequence[Hashable] | None = None, date_format: str | dict[Hashable, str] | None = None, dayfirst: bool = False, cache_dates: bool = True, @@ -1319,69 +963,12 @@ def read_table( # Error Handling on_bad_lines: str = "error", # Internal - delim_whitespace: bool | lib.NoDefault = lib.no_default, low_memory: bool = _c_parser_defaults["low_memory"], memory_map: bool = False, - float_precision: str | None = None, + float_precision: Literal["high", "legacy", "round_trip"] | None = None, storage_options: StorageOptions | None = None, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, ) -> DataFrame | TextFileReader: - if keep_date_col is not lib.no_default: - # GH#55569 - warnings.warn( - "The 'keep_date_col' keyword in pd.read_table is deprecated and " - "will be removed in a future version. Explicitly remove unwanted " - "columns after parsing instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - keep_date_col = False - - # error: Item "bool" of "bool | Sequence[Hashable]" has no attribute "__iter__" - if lib.is_list_like(parse_dates) and not all(is_hashable(x) for x in parse_dates): # type: ignore[union-attr] - # GH#55569 - warnings.warn( - "Support for nested sequences for 'parse_dates' in pd.read_table " - "is deprecated. Combine the desired columns with pd.to_datetime " - "after parsing instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - - if infer_datetime_format is not lib.no_default: - warnings.warn( - "The argument 'infer_datetime_format' is deprecated and will " - "be removed in a future version. " - "A strict version of it is now the default, see " - "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. " - "You can safely remove this argument.", - FutureWarning, - stacklevel=find_stack_level(), - ) - - if delim_whitespace is not lib.no_default: - # GH#55569 - warnings.warn( - "The 'delim_whitespace' keyword in pd.read_table is deprecated and " - "will be removed in a future version. 
Use ``sep='\\s+'`` instead", - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - delim_whitespace = False - - if verbose is not lib.no_default: - # GH#55569 - warnings.warn( - "The 'verbose' keyword in pd.read_table is deprecated and " - "will be removed in a future version.", - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - verbose = False - # locals() should never be modified kwds = locals().copy() del kwds["filepath_or_buffer"] @@ -1390,7 +977,6 @@ def read_table( kwds_defaults = _refine_defaults_read( dialect, delimiter, - delim_whitespace, engine, sep, on_bad_lines, @@ -1410,12 +996,10 @@ def read_fwf( colspecs: Sequence[tuple[int, int]] | str | None = ..., widths: Sequence[int] | None = ..., infer_nrows: int = ..., - dtype_backend: DtypeBackend | lib.NoDefault = ..., iterator: Literal[True], chunksize: int | None = ..., - **kwds, -) -> TextFileReader: - ... + **kwds: Unpack[_read_shared[HashableT]], +) -> TextFileReader: ... @overload @@ -1425,12 +1009,10 @@ def read_fwf( colspecs: Sequence[tuple[int, int]] | str | None = ..., widths: Sequence[int] | None = ..., infer_nrows: int = ..., - dtype_backend: DtypeBackend | lib.NoDefault = ..., iterator: bool = ..., chunksize: int, - **kwds, -) -> TextFileReader: - ... + **kwds: Unpack[_read_shared[HashableT]], +) -> TextFileReader: ... @overload @@ -1440,24 +1022,22 @@ def read_fwf( colspecs: Sequence[tuple[int, int]] | str | None = ..., widths: Sequence[int] | None = ..., infer_nrows: int = ..., - dtype_backend: DtypeBackend | lib.NoDefault = ..., iterator: Literal[False] = ..., chunksize: None = ..., - **kwds, -) -> DataFrame: - ... + **kwds: Unpack[_read_shared[HashableT]], +) -> DataFrame: ... +@set_module("pandas") def read_fwf( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], *, colspecs: Sequence[tuple[int, int]] | str | None = "infer", widths: Sequence[int] | None = None, infer_nrows: int = 100, - dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, iterator: bool = False, chunksize: int | None = None, - **kwds, + **kwds: Unpack[_read_shared[HashableT]], ) -> DataFrame | TextFileReader: r""" Read a table of fixed-width formatted lines into DataFrame. @@ -1478,7 +1058,7 @@ def read_fwf( ``file://localhost/path/to/table.csv``. colspecs : list of tuple (int, int) or 'infer'. optional A list of tuples giving the extents of the fixed-width - fields of each line as half-open intervals (i.e., [from, to[ ). + fields of each line as half-open intervals (i.e., [from, to) ). String value 'infer' can be used to instruct the parser to try detecting the column specifications from the first 100 rows of the data which are not being skipped via skiprows (default='infer'). widths : list of int, optional A list of field widths which can be used instead of 'colspecs' if the intervals are contiguous. infer_nrows : int, default 100 The number of rows to consider when letting the parser determine the `colspecs`. - dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable' - Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). Behaviour is as follows: - - * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. - - .. versionadded:: 2.0 - + iterator : bool, default False + Return ``TextFileReader`` object for iteration or getting chunks with + ``get_chunk()``. + chunksize : int, optional + Number of lines to read from the file per chunk. **kwds : optional Optional keyword arguments can be passed to ``TextFileReader``.
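A worked example of the half-open ``colspecs`` intervals documented above (invented data): each ``(from, to)`` pair covers character positions ``from`` up to, but not including, ``to``.

```python
import io

import pandas as pd

buf = io.StringIO("id  name\n001 foo\n002 bar\n")
# (0, 3) covers character positions 0-2 and (4, 8) covers 4-7; position 3,
# a space, belongs to neither field.
df = pd.read_fwf(buf, colspecs=[(0, 3), (4, 8)])
print(df.columns.tolist())  # ['id', 'name']
```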
@@ -1515,7 +1089,7 @@ def read_fwf( Examples -------- - >>> pd.read_fwf('data.csv') # doctest: +SKIP + >>> pd.read_fwf("data.csv") # doctest: +SKIP """ # Check input arguments. if colspecs is None and widths is None: @@ -1536,7 +1110,7 @@ def read_fwf( # GH#40830 # Ensure length of `colspecs` matches length of `names` names = kwds.get("names") - if names is not None: + if names is not None and names is not lib.no_default: if len(names) != len(colspecs) and colspecs != "infer": # need to check len(index_col) as it might contain # unnamed indices, in which case it's name is not required @@ -1547,20 +1121,26 @@ def read_fwf( if not is_list_like(index_col): len_index = 1 else: + # for mypy: handled in the if-branch + assert index_col is not lib.no_default + len_index = len(index_col) if kwds.get("usecols") is None and len(names) + len_index != len(colspecs): # If usecols is used colspec may be longer than names raise ValueError("Length of colspecs must match length of names") - kwds["colspecs"] = colspecs - kwds["infer_nrows"] = infer_nrows - kwds["engine"] = "python-fwf" - kwds["iterator"] = iterator - kwds["chunksize"] = chunksize - - check_dtype_backend(dtype_backend) - kwds["dtype_backend"] = dtype_backend - return _read(filepath_or_buffer, kwds) + check_dtype_backend(kwds.setdefault("dtype_backend", lib.no_default)) + return _read( + filepath_or_buffer, + kwds + | { + "colspecs": colspecs, + "infer_nrows": infer_nrows, + "engine": "python-fwf", + "iterator": iterator, + "chunksize": chunksize, + }, + ) class TextFileReader(abc.Iterator): @@ -1639,8 +1219,7 @@ def _get_options_with_defaults(self, engine: CSVEngine) -> dict[str, Any]: and value != getattr(value, "value", default) ): raise ValueError( - f"The {repr(argname)} option is not supported with the " - f"'pyarrow' engine" + f"The {argname!r} option is not supported with the 'pyarrow' engine" ) options[argname] = value @@ -1656,8 +1235,8 @@ def _get_options_with_defaults(self, engine: CSVEngine) -> dict[str, Any]: pass else: raise ValueError( - f"The {repr(argname)} option is not supported with the " - f"{repr(engine)} engine" + f"The {argname!r} option is not supported with the " + f"{engine!r} engine" ) else: value = default @@ -1679,6 +1258,16 @@ def _check_file_or_buffer(self, f, engine: CSVEngine) -> None: raise ValueError( "The 'python' engine cannot iterate through this file buffer." ) + if hasattr(f, "encoding"): + file_encoding = f.encoding + orig_reader_enc = self.orig_options.get("encoding", None) + any_none = file_encoding is None or orig_reader_enc is None + if file_encoding != orig_reader_enc and not any_none: + file_path = getattr(f, "name", None) + raise ValueError( + f"The specified reader encoding {orig_reader_enc} is different " + f"from the encoding {file_encoding} of file {file_path}." 
+ ) def _clean_options( self, options: dict[str, Any], engine: CSVEngine @@ -1694,17 +1283,10 @@ def _clean_options( engine = "python" sep = options["delimiter"] - delim_whitespace = options["delim_whitespace"] - if sep is None and not delim_whitespace: - if engine in ("c", "pyarrow"): - fallback_reason = ( - f"the '{engine}' engine does not support " - "sep=None with delim_whitespace=False" - ) - engine = "python" - elif sep is not None and len(sep) > 1: + if sep is not None and len(sep) > 1: if engine == "c" and sep == r"\s+": + # delim_whitespace passed on to pandas._libs.parsers.TextReader result["delim_whitespace"] = True del result["delimiter"] elif engine not in ("python", "python-fwf"): @@ -1715,9 +1297,6 @@ def _clean_options( r"different from '\s+' are interpreted as regex)" ) engine = "python" - elif delim_whitespace: - if "python" in engine: - result["delimiter"] = r"\s+" elif sep is not None: encodeable = True encoding = sys.getfilesystemencoding() or "utf-8" @@ -1760,7 +1339,7 @@ def _clean_options( if fallback_reason and result[arg] != _c_parser_defaults.get(arg): raise ValueError( "Falling back to the 'python' engine because " - f"{fallback_reason}, but this causes {repr(arg)} to be " + f"{fallback_reason}, but this causes {arg!r} to be " "ignored as it is not supported by the 'python' engine." ) del result[arg] @@ -1816,12 +1395,11 @@ def _clean_options( if not is_integer(skiprows) and skiprows is not None: # pyarrow expects skiprows to be passed as an integer raise ValueError( - "skiprows argument must be an integer when using " - "engine='pyarrow'" + "skiprows argument must be an integer when using engine='pyarrow'" ) else: if is_integer(skiprows): - skiprows = list(range(skiprows)) + skiprows = range(skiprows) if skiprows is None: skiprows = set() elif not callable(skiprows): @@ -1854,6 +1432,7 @@ def _make_engine( "pyarrow": ArrowParserWrapper, "python-fwf": FixedWidthFieldParser, } + if engine not in mapping: raise ValueError( f"Unknown engine: {engine} (valid options are {mapping.keys()})" @@ -1967,7 +1546,7 @@ def read(self, nrows: int | None = None) -> DataFrame: new_col_dict, columns=columns, index=index, - copy=not using_copy_on_write(), + copy=False, ) self._currow += new_rows @@ -1979,7 +1558,10 @@ def get_chunk(self, size: int | None = None) -> DataFrame: if self.nrows is not None: if self._currow >= self.nrows: raise StopIteration - size = min(size, self.nrows - self._currow) + if size is None: + size = self.nrows - self._currow + else: + size = min(size, self.nrows - self._currow) return self.read(nrows=size) def __enter__(self) -> Self: @@ -2023,10 +1605,6 @@ def TextParser(*args, **kwds) -> TextFileReader: comment : str, optional Comment out remainder of line parse_dates : bool, default False - keep_date_col : bool, default False - date_parser : function, optional - - .. deprecated:: 2.0.0 date_format : str or dict of column -> format, default ``None`` .. 
versionadded:: 2.0.0 @@ -2074,7 +1652,7 @@ def _clean_na_values(na_values, keep_default_na: bool = True, floatify: bool = T if keep_default_na: v = set(v) | STR_NA_VALUES - na_values[k] = v + na_values[k] = _stringify_na_values(v, floatify) na_fvalues = {k: _floatify_na_values(v) for k, v in na_values.items()} else: if not is_list_like(na_values): @@ -2101,7 +1679,7 @@ def _floatify_na_values(na_values): return result -def _stringify_na_values(na_values, floatify: bool): +def _stringify_na_values(na_values, floatify: bool) -> set[str | float]: """return a stringified and numeric for these values""" result: list[str | float] = [] for x in na_values: @@ -2131,7 +1709,6 @@ def _stringify_na_values(na_values, floatify: bool): def _refine_defaults_read( dialect: str | csv.Dialect | None, delimiter: str | None | lib.NoDefault, - delim_whitespace: bool, engine: CSVEngine | None, sep: str | None | lib.NoDefault, on_bad_lines: str | Callable, @@ -2151,14 +1728,6 @@ def _refine_defaults_read( documentation for more details. delimiter : str or object Alias for sep. - delim_whitespace : bool - Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be - used as the sep. Equivalent to setting ``sep='\\s+'``. If this option - is set to True, nothing should be passed in for the ``delimiter`` - parameter. - - .. deprecated:: 2.2.0 - Use ``sep="\\s+"`` instead. engine : {{'c', 'python'}} Parser engine to use. The C engine is faster while the python engine is currently more feature-complete. @@ -2178,12 +1747,6 @@ def _refine_defaults_read( ------- kwds : dict Input parameters with correct values. - - Raises - ------ - ValueError : - If a delimiter was specified with ``sep`` (or ``delimiter``) and - ``delim_whitespace=True``. """ # fix types for sep, delimiter to Union(str, Any) delim_default = defaults["delimiter"] @@ -2214,12 +1777,6 @@ def _refine_defaults_read( if delimiter is None: delimiter = sep - if delim_whitespace and (delimiter is not lib.no_default): - raise ValueError( - "Specified a delimiter with both sep and " - "delim_whitespace=True; you can only specify one." - ) - if delimiter == "\n": raise ValueError( r"Specified \n as separator or delimiter. This forces the python engine " diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 0dae0e7106b69..f0441f583bea2 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -1,4 +1,5 @@ -""" pickle compat """ +"""pickle compat""" + from __future__ import annotations import pickle @@ -8,7 +9,7 @@ ) import warnings -from pandas.compat import pickle_compat as pc +from pandas.compat import pickle_compat from pandas.util._decorators import doc from pandas.core.shared_docs import _shared_docs @@ -78,7 +79,9 @@ def to_pickle( Examples -------- - >>> original_df = pd.DataFrame({{"foo": range(5), "bar": range(5, 10)}}) # doctest: +SKIP + >>> original_df = pd.DataFrame( + ... {{"foo": range(5), "bar": range(5, 10)}} + ... ) # doctest: +SKIP >>> original_df # doctest: +SKIP foo bar 0 0 5 @@ -96,7 +99,7 @@ def to_pickle( 2 2 7 3 3 8 4 4 9 - """ # noqa: E501 + """ if protocol < 0: protocol = pickle.HIGHEST_PROTOCOL @@ -121,7 +124,7 @@ def read_pickle( storage_options: StorageOptions | None = None, ) -> DataFrame | Series: """ - Load pickled pandas object (or any object) from file. + Load pickled pandas object (or any object) from file and return unpickled object. .. 
warning:: @@ -143,7 +146,8 @@ def read_pickle( Returns ------- - same type as object stored in file + object + The unpickled pandas object (or any object) that was stored in file. See Also -------- @@ -155,14 +159,14 @@ def read_pickle( Notes ----- - read_pickle is only guaranteed to be backwards compatible to pandas 0.20.3 + read_pickle is only guaranteed to be backwards compatible to pandas 1.0 provided the object was serialized with to_pickle. Examples -------- >>> original_df = pd.DataFrame( ... {{"foo": range(5), "bar": range(5, 10)}} - ... ) # doctest: +SKIP + ... ) # doctest: +SKIP >>> original_df # doctest: +SKIP foo bar 0 0 5 @@ -181,6 +185,7 @@ def read_pickle( 3 3 8 4 4 9 """ + # TypeError for Cython complaints about object.__new__ vs Tick.__new__ excs_to_catch = (AttributeError, ImportError, ModuleNotFoundError, TypeError) with get_handle( filepath_or_buffer, @@ -191,20 +196,14 @@ def read_pickle( ) as handles: # 1) try standard library Pickle # 2) try pickle_compat (older pandas version) to handle subclass changes - # 3) try pickle_compat with latin-1 encoding upon a UnicodeDecodeError - try: - # TypeError for Cython complaints about object.__new__ vs Tick.__new__ - try: - with warnings.catch_warnings(record=True): - # We want to silence any warnings about, e.g. moved modules. - warnings.simplefilter("ignore", Warning) - return pickle.load(handles.handle) - except excs_to_catch: - # e.g. - # "No module named 'pandas.core.sparse.series'" - # "Can't get attribute '__nat_unpickle' on str: # set the encoding if we need if encoding is None: @@ -303,14 +299,14 @@ def to_hdf( dropna=dropna, ) - path_or_buf = stringify_path(path_or_buf) - if isinstance(path_or_buf, str): + if isinstance(path_or_buf, HDFStore): + f(path_or_buf) + else: + path_or_buf = stringify_path(path_or_buf) with HDFStore( path_or_buf, mode=mode, complevel=complevel, complib=complib ) as store: f(store) - else: - f(path_or_buf) def read_hdf( @@ -365,7 +361,7 @@ def read_hdf( A list of Term (or convertible) objects. start : int, optional Row number to start selection. - stop : int, optional + stop : int, optional Row number to stop selection. columns : list, optional A list of columns names to return. @@ -386,11 +382,18 @@ def read_hdf( DataFrame.to_hdf : Write a HDF file from a DataFrame. HDFStore : Low-level access to HDF files. + Notes + ----- + When ``errors="surrogatepass"``, ``pd.options.future.infer_string`` is true, + and PyArrow is installed, if a UTF-16 surrogate is encountered when decoding + to UTF-8, the resulting dtype will be + ``pd.StringDtype(storage="python", na_value=np.nan)``. 
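A minimal sketch of the fallback the new Notes describe, assuming a pre-existing store.h5 whose stored strings contain lone UTF-16 surrogates and that PyArrow is installed; the file and key names here are hypothetical:

    import pandas as pd

    pd.set_option("future.infer_string", True)
    # With errors="surrogatepass", un-decodable strings do not raise; the
    # affected columns instead come back NumPy-backed rather than
    # pyarrow-backed.
    df = pd.read_hdf("store.h5", "data", errors="surrogatepass")
    print(df.dtypes)  # StringDtype(storage="python", na_value=np.nan) columns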
+ Examples -------- - >>> df = pd.DataFrame([[1, 1.0, 'a']], columns=['x', 'y', 'z']) # doctest: +SKIP - >>> df.to_hdf('./store.h5', 'data') # doctest: +SKIP - >>> reread = pd.read_hdf('./store.h5') # doctest: +SKIP + >>> df = pd.DataFrame([[1, 1.0, "a"]], columns=["x", "y", "z"]) # doctest: +SKIP + >>> df.to_hdf("./store.h5", "data") # doctest: +SKIP + >>> reread = pd.read_hdf("./store.h5") # doctest: +SKIP """ if mode not in ["r", "r+", "a"]: raise ValueError( @@ -531,9 +534,9 @@ class HDFStore: Examples -------- >>> bar = pd.DataFrame(np.random.randn(10, 4)) - >>> store = pd.HDFStore('test.h5') - >>> store['foo'] = bar # write to HDF5 - >>> bar = store['foo'] # retrieve + >>> store = pd.HDFStore("test.h5") + >>> store["foo"] = bar # write to HDF5 + >>> bar = store["foo"] # retrieve >>> store.close() **Create or load HDF5 file in-memory** @@ -543,9 +546,9 @@ class HDFStore: written when closed: >>> bar = pd.DataFrame(np.random.randn(10, 4)) - >>> store = pd.HDFStore('test.h5', driver='H5FD_CORE') - >>> store['foo'] = bar - >>> store.close() # only now, data is written to disk + >>> store = pd.HDFStore("test.h5", driver="H5FD_CORE") + >>> store["foo"] = bar + >>> store.close() # only now, data is written to disk """ _handle: File | None @@ -604,7 +607,7 @@ def __getitem__(self, key: str): def __setitem__(self, key: str, value) -> None: self.put(key, value) - def __delitem__(self, key: str) -> None: + def __delitem__(self, key: str) -> int | None: return self.remove(key) def __getattr__(self, name: str): @@ -667,12 +670,18 @@ def keys(self, include: str = "pandas") -> list[str]: ------ raises ValueError if kind has an illegal value + See Also + -------- + HDFStore.info : Prints detailed information on the store. + HDFStore.get_node : Returns the node with the key. + HDFStore.get_storer : Returns the storer object for a key. + Examples -------- - >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) - >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP - >>> store.put('data', df) # doctest: +SKIP - >>> store.get('data') # doctest: +SKIP + >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + >>> store = pd.HDFStore("store.h5", "w") # doctest: +SKIP + >>> store.put("data", df) # doctest: +SKIP + >>> store.get("data") # doctest: +SKIP >>> print(store.keys()) # doctest: +SKIP ['/data1', '/data2'] >>> store.close() # doctest: +SKIP @@ -790,18 +799,24 @@ def get(self, key: str): Parameters ---------- key : str + Object to retrieve from file. Raises KeyError if not found. Returns ------- object Same type as object stored in file. + See Also + -------- + HDFStore.get_node : Returns the node with the key. + HDFStore.get_storer : Returns the storer object for a key. + Examples -------- - >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) - >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP - >>> store.put('data', df) # doctest: +SKIP - >>> store.get('data') # doctest: +SKIP + >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + >>> store = pd.HDFStore("store.h5", "w") # doctest: +SKIP + >>> store.put("data", df) # doctest: +SKIP + >>> store.get("data") # doctest: +SKIP >>> store.close() # doctest: +SKIP """ with patch_pickle(): @@ -858,19 +873,25 @@ def select( object Retrieved object from file. + See Also + -------- + HDFStore.select_as_coordinates : Returns the selection as an index. + HDFStore.select_column : Returns a single column from the table. + HDFStore.select_as_multiple : Retrieves pandas objects from multiple tables. 
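For orientation, a self-contained sketch of the select path documented above; data_columns=True is needed so that ordinary columns are queryable in the where clause, and the file name is arbitrary:

    import pandas as pd

    df = pd.DataFrame({"A": [1, 3], "B": [2, 4]})
    with pd.HDFStore("store.h5", "w") as store:
        # `where` filtering requires table format and indexed data columns
        store.put("data", df, format="table", data_columns=True)
        subset = store.select("data", where="A > 1")
    print(subset)  # the row(s) where A > 1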
+ Examples -------- - >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) - >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP - >>> store.put('data', df) # doctest: +SKIP - >>> store.get('data') # doctest: +SKIP + >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + >>> store = pd.HDFStore("store.h5", "w") # doctest: +SKIP + >>> store.put("data", df) # doctest: +SKIP + >>> store.get("data") # doctest: +SKIP >>> print(store.keys()) # doctest: +SKIP ['/data1', '/data2'] - >>> store.select('/data1') # doctest: +SKIP + >>> store.select("/data1") # doctest: +SKIP A B 0 1 2 1 3 4 - >>> store.select('/data1', where='columns == A') # doctest: +SKIP + >>> store.select("/data1", where="columns == A") # doctest: +SKIP A 0 1 1 3 @@ -1118,10 +1139,18 @@ def put( """ Store object in HDFStore. + This method writes a pandas DataFrame or Series into an HDF5 file using + either the fixed or table format. The `table` format allows additional + operations like incremental appends and queries but may have performance + trade-offs. The `fixed` format provides faster read/write operations but + does not support appends or queries. + Parameters ---------- key : str + Key of object to store in file. value : {Series, DataFrame} + Value of object to store in file. format : 'fixed(f)|table(t)', default is 'fixed' Format to use when storing object in HDFStore. Value can be one of: @@ -1135,12 +1164,27 @@ def put( Write DataFrame index as a column. append : bool, default False This will force Table format, append the input data to the existing. + complib : default None + This parameter is currently not accepted. + complevel : int, 0-9, default None + Specifies a compression level for data. + A value of 0 or None disables compression. + min_itemsize : int, dict, or None + Dict of columns that specify minimum str sizes. + nan_rep : str + Str to use as str nan representation. data_columns : list of columns or True, default None List of columns to create as data columns, or True to use all columns. See `here `__. encoding : str, default None Provide an encoding for strings. + errors : str, default 'strict' + The error handling scheme to use for encoding errors. + The default is 'strict' meaning that encoding errors raise a + UnicodeEncodeError. Other possible values are 'ignore', 'replace' and + 'xmlcharrefreplace' as well as any other name registered with + codecs.register_error that can handle UnicodeEncodeErrors. track_times : bool, default True Parameter is propagated to 'create_table' method of 'PyTables'. If set to False it enables to have the same h5 files (same hashes) @@ -1148,11 +1192,16 @@ def put( dropna : bool, default False, optional Remove missing values. + See Also + -------- + HDFStore.info : Prints detailed information on the store. + HDFStore.get_storer : Returns the storer object for a key. 
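The fixed-versus-table trade-off spelled out in the new put docstring, as a short sketch (key and file names arbitrary):

    import pandas as pd

    df = pd.DataFrame({"A": [1, 2]})
    with pd.HDFStore("store.h5", "w") as store:
        store.put("fast", df)                   # fixed format: quick I/O only
        store.put("flex", df, format="table")   # table format: append/query
        store.append("flex", pd.DataFrame({"A": [3]}))  # fine
        # store.append("fast", ...) would raise: fixed format cannot append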
+ Examples -------- - >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) - >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP - >>> store.put('data', df) # doctest: +SKIP + >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + >>> store = pd.HDFStore("store.h5", "w") # doctest: +SKIP + >>> store.put("data", df) # doctest: +SKIP """ if format is None: format = get_option("io.hdf.default_format") or "fixed" @@ -1174,7 +1223,7 @@ def put( dropna=dropna, ) - def remove(self, key: str, where=None, start=None, stop=None) -> None: + def remove(self, key: str, where=None, start=None, stop=None) -> int | None: """ Remove pandas object partially by specifying the where condition @@ -1222,14 +1271,12 @@ def remove(self, key: str, where=None, start=None, stop=None) -> None: # remove the node if com.all_none(where, start, stop): s.group._f_remove(recursive=True) + return None # delete from the table - else: - if not s.is_table: - raise ValueError( - "can only remove with where on objects written as tables" - ) - return s.delete(where=where, start=start, stop=stop) + if not s.is_table: + raise ValueError("can only remove with where on objects written as tables") + return s.delete(where=where, start=start, stop=stop) def append( self, @@ -1259,7 +1306,9 @@ def append( Parameters ---------- key : str + Key of object to append. value : {Series, DataFrame} + Value of object to append. format : 'table' is the default Format to use when storing object in HDFStore. Value can be one of: @@ -1267,23 +1316,47 @@ def append( Table format. Write as a PyTables Table structure which may perform worse but allow more flexible operations like searching / selecting subsets of the data. + axes : default None + This parameter is currently not accepted. index : bool, default True Write DataFrame index as a column. - append : bool, default True + append : bool, default True Append the input data to the existing. + complib : default None + This parameter is currently not accepted. + complevel : int, 0-9, default None + Specifies a compression level for data. + A value of 0 or None disables compression. + columns : default None + This parameter is currently not accepted, try data_columns. + min_itemsize : int, dict, or None + Dict of columns that specify minimum str sizes. + nan_rep : str + Str to use as str nan representation. + chunksize : int or None + Size to chunk the writing. + expectedrows : int + Expected TOTAL row size of this table. + dropna : bool, default False, optional + Do not write an ALL nan row to the store settable + by the option 'io.hdf.dropna_table'. data_columns : list of columns, or True, default None List of columns to create as indexed data columns for on-disk queries, or True to use all columns. By default only the axes of the object are indexed. See `here `__. - min_itemsize : dict of columns that specify minimum str sizes - nan_rep : str to use as str nan representation - chunksize : size to chunk the writing - expectedrows : expected TOTAL row size of this table - encoding : default None, provide an encoding for str - dropna : bool, default False, optional - Do not write an ALL nan row to the store settable - by the option 'io.hdf.dropna_table'. + encoding : default None + Provide an encoding for str. + errors : str, default 'strict' + The error handling scheme to use for encoding errors. + The default is 'strict' meaning that encoding errors raise a + UnicodeEncodeError. 
Other possible values are 'ignore', 'replace' and + 'xmlcharrefreplace' as well as any other name registered with + codecs.register_error that can handle UnicodeEncodeErrors. + + See Also + -------- + HDFStore.append_to_multiple : Append to multiple tables. Notes ----- @@ -1292,11 +1365,11 @@ def append( Examples -------- - >>> df1 = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) - >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP - >>> store.put('data', df1, format='table') # doctest: +SKIP - >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=['A', 'B']) - >>> store.append('data', df2) # doctest: +SKIP + >>> df1 = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + >>> store = pd.HDFStore("store.h5", "w") # doctest: +SKIP + >>> store.put("data", df1, format="table") # doctest: +SKIP + >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=["A", "B"]) + >>> store.append("data", df2) # doctest: +SKIP >>> store.close() # doctest: +SKIP A B 0 1 2 @@ -1481,11 +1554,15 @@ def groups(self) -> list: list List of objects. + See Also + -------- + HDFStore.get_node : Returns the node with the key. + Examples -------- - >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) - >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP - >>> store.put('data', df) # doctest: +SKIP + >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + >>> store = pd.HDFStore("store.h5", "w") # doctest: +SKIP + >>> store.put("data", df) # doctest: +SKIP >>> print(store.groups()) # doctest: +SKIP >>> store.close() # doctest: +SKIP [/data (Group) '' @@ -1536,13 +1613,17 @@ def walk(self, where: str = "/") -> Iterator[tuple[str, list[str], list[str]]]: leaves : list Names (strings) of the pandas objects contained in `path`. + See Also + -------- + HDFStore.info : Prints detailed information on the store. + Examples -------- - >>> df1 = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) - >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP - >>> store.put('data', df1, format='table') # doctest: +SKIP - >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=['A', 'B']) - >>> store.append('data', df2) # doctest: +SKIP + >>> df1 = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + >>> store = pd.HDFStore("store.h5", "w") # doctest: +SKIP + >>> store.put("data", df1, format="table") # doctest: +SKIP + >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=["A", "B"]) + >>> store.append("data", df2) # doctest: +SKIP >>> store.close() # doctest: +SKIP >>> for group in store.walk(): # doctest: +SKIP ... print(group) # doctest: +SKIP @@ -1661,24 +1742,33 @@ def info(self) -> str: Returns ------- str + A string containing the pandas class name, the filepath to the HDF5 + file, and all the object keys along with their respective DataFrame shapes. + + See Also + -------- + HDFStore.get_storer : Returns the storer object for a key. 
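Since remove now advertises an int | None return, a sketch of how the two outcomes look in practice; get_storer(...).is_table is used to tell the formats apart, and the key and file names are hypothetical:

    import pandas as pd

    with pd.HDFStore("store.h5", "w") as store:
        store.put("t", pd.DataFrame({"A": [1, 2, 3]}),
                  format="table", data_columns=True)
        print(store.get_storer("t").is_table)  # True
        n = store.remove("t", where="A > 1")   # partial delete: row count
        print(n)                               # 2
        store.remove("t")                      # full delete: returns None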
Examples -------- - >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) - >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP - >>> store.put('data', df) # doctest: +SKIP + >>> df1 = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=["C", "D"]) + >>> store = pd.HDFStore("store.h5", "w") # doctest: +SKIP + >>> store.put("data1", df1) # doctest: +SKIP + >>> store.put("data2", df2) # doctest: +SKIP >>> print(store.info()) # doctest: +SKIP >>> store.close() # doctest: +SKIP File path: store.h5 - /data frame (shape->[2,2]) + /data1 frame (shape->[2,2]) + /data2 frame (shape->[2,2]) """ path = pprint_thing(self._path) output = f"{type(self)}\nFile path: {path}\n" if self.is_open: lkeys = sorted(self.keys()) - if len(lkeys): + if lkeys: keys = [] values = [] @@ -1707,7 +1797,7 @@ def info(self) -> str: # ------------------------------------------------------------------------ # private methods - def _check_if_open(self): + def _check_if_open(self) -> None: if not self.is_open: raise ClosedFileError(f"{self._path} file is not open!") @@ -1735,8 +1825,8 @@ def _create_storer( if value is not None and not isinstance(value, (Series, DataFrame)): raise TypeError("value must be None, Series, or DataFrame") - pt = _ensure_decoded(getattr(group._v_attrs, "pandas_type", None)) - tt = _ensure_decoded(getattr(group._v_attrs, "table_type", None)) + pt = getattr(group._v_attrs, "pandas_type", None) + tt = getattr(group._v_attrs, "table_type", None) # infer the pt from the passed value if pt is None: @@ -1803,7 +1893,7 @@ def _create_storer( "worm": WORMTable, } try: - cls = _TABLE_MAP[tt] + cls = _TABLE_MAP[tt] # type: ignore[index] except KeyError as err: raise TypeError( f"cannot properly create the storer for: [_TABLE_MAP] [group->" @@ -2150,13 +2240,13 @@ def convert( # preventing the original recarry from being free'ed values = values[self.cname].copy() - val_kind = _ensure_decoded(self.kind) + val_kind = self.kind values = _maybe_convert(values, val_kind, encoding, errors) kwargs = {} - kwargs["name"] = _ensure_decoded(self.index_name) + kwargs["name"] = self.index_name if self.freq is not None: - kwargs["freq"] = _ensure_decoded(self.freq) + kwargs["freq"] = self.freq factory: type[Index | DatetimeIndex] = Index if lib.is_np_dtype(values.dtype, "M") or isinstance( @@ -2170,20 +2260,37 @@ def convert( # "Union[Type[Index], Type[DatetimeIndex]]") factory = lambda x, **kwds: PeriodIndex.from_ordinals( # type: ignore[assignment] x, freq=kwds.get("freq", None) - )._rename( - kwds["name"] - ) + )._rename(kwds["name"]) # making an Index instance could throw a number of different errors try: new_pd_index = factory(values, **kwargs) + except UnicodeEncodeError as err: + if ( + errors == "surrogatepass" + and get_option("future.infer_string") + and str(err).endswith("surrogates not allowed") + and HAS_PYARROW + ): + new_pd_index = factory( + values, + dtype=StringDtype(storage="python", na_value=np.nan), + **kwargs, + ) + else: + raise except ValueError: # if the output freq is different that what we recorded, # it should be None (see also 'doc example part 2') if "freq" in kwargs: kwargs["freq"] = None new_pd_index = factory(values, **kwargs) - final_pd_index = _set_tz(new_pd_index, self.tz) + + final_pd_index: Index + if self.tz is not None and isinstance(new_pd_index, DatetimeIndex): + final_pd_index = new_pd_index.tz_localize("UTC").tz_convert(self.tz) + else: + final_pd_index = new_pd_index return final_pd_index, final_pd_index def 
take_data(self): @@ -2217,7 +2324,7 @@ def maybe_set_size(self, min_itemsize=None) -> None: min_itemsize can be an integer or a dict with this columns name with an integer size """ - if _ensure_decoded(self.kind) == "string": + if self.kind == "string": if isinstance(min_itemsize, dict): min_itemsize = min_itemsize.get(self.name) @@ -2238,7 +2345,7 @@ def validate_and_set(self, handler: AppendableTable, append: bool) -> None: def validate_col(self, itemsize=None): """validate this column: return the compared against itemsize""" # validate this column for string truncation (or reset to the max size) - if _ensure_decoded(self.kind) == "string": + if self.kind == "string": c = self.col if c is not None: if itemsize is None: @@ -2568,19 +2675,19 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): assert isinstance(converted, np.ndarray) # for mypy # use the meta if needed - meta = _ensure_decoded(self.meta) + meta = self.meta metadata = self.metadata ordered = self.ordered tz = self.tz assert dtype_name is not None # convert to the correct dtype - dtype = _ensure_decoded(dtype_name) + dtype = dtype_name # reverse converts if dtype.startswith("datetime64"): # recreate with tz if indicated - converted = _set_tz(converted, tz, coerce=True) + converted = _set_tz(converted, tz, dtype) elif dtype == "timedelta64": converted = np.asarray(converted, dtype="m8[ns]") @@ -2625,7 +2732,7 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): converted = converted.astype("O", copy=False) # convert nans / decode - if _ensure_decoded(kind) == "string": + if kind == "string": converted = _unconvert_string_array( converted, nan_rep=nan_rep, encoding=encoding, errors=errors ) @@ -2713,18 +2820,19 @@ def is_old_version(self) -> bool: @property def version(self) -> tuple[int, int, int]: """compute and set our version""" - version = _ensure_decoded(getattr(self.group._v_attrs, "pandas_version", None)) - try: - version = tuple(int(x) for x in version.split(".")) - if len(version) == 2: - version = version + (0,) - except AttributeError: - version = (0, 0, 0) - return version + version = getattr(self.group._v_attrs, "pandas_version", None) + if isinstance(version, str): + version_tup = tuple(int(x) for x in version.split(".")) + if len(version_tup) == 2: + version_tup = version_tup + (0,) + assert len(version_tup) == 3 # needed for mypy + return version_tup + else: + return (0, 0, 0) @property def pandas_type(self): - return _ensure_decoded(getattr(self.group._v_attrs, "pandas_type", None)) + return getattr(self.group._v_attrs, "pandas_type", None) def __repr__(self) -> str: """return a pretty representation of myself""" @@ -2819,7 +2927,7 @@ def read( columns=None, start: int | None = None, stop: int | None = None, - ): + ) -> Series | DataFrame: raise NotImplementedError( "cannot read on an abstract storer: subclasses should implement" ) @@ -2831,7 +2939,7 @@ def write(self, obj, **kwargs) -> None: def delete( self, where=None, start: int | None = None, stop: int | None = None - ) -> None: + ) -> int | None: """ support fully deleting the node in its entirety (only) - where specification must be None @@ -2861,9 +2969,7 @@ def _alias_to_class(self, alias): return self._reverse_index_map.get(alias, Index) def _get_index_factory(self, attrs): - index_class = self._alias_to_class( - _ensure_decoded(getattr(attrs, "index_class", "")) - ) + index_class = self._alias_to_class(getattr(attrs, "index_class", "")) factory: Callable @@ -2899,12 +3005,7 @@ def f(values, 
freq=None, tz=None): factory = TimedeltaIndex if "tz" in attrs: - if isinstance(attrs["tz"], bytes): - # created by python2 - kwargs["tz"] = attrs["tz"].decode("utf-8") - else: - # created by python3 - kwargs["tz"] = attrs["tz"] + kwargs["tz"] = attrs["tz"] assert index_class is DatetimeIndex # just checking return factory, kwargs @@ -2936,9 +3037,9 @@ def set_attrs(self) -> None: def get_attrs(self) -> None: """retrieve our attributes""" self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None)) - self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict")) + self.errors = getattr(self.attrs, "errors", "strict") for n in self.attributes: - setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None))) + setattr(self, n, getattr(self.attrs, n, None)) def write(self, obj, **kwargs) -> None: self.set_attrs() @@ -2954,8 +3055,11 @@ def read_array(self, key: str, start: int | None = None, stop: int | None = None if isinstance(node, tables.VLArray): ret = node[0][start:stop] + dtype = getattr(attrs, "value_type", None) + if dtype is not None: + ret = pd_array(ret, dtype=dtype) else: - dtype = _ensure_decoded(getattr(attrs, "value_type", None)) + dtype = getattr(attrs, "value_type", None) shape = getattr(attrs, "shape", None) if shape is not None: @@ -2967,7 +3071,7 @@ def read_array(self, key: str, start: int | None = None, stop: int | None = None if dtype and dtype.startswith("datetime64"): # reconstruct a timezone if indicated tz = getattr(attrs, "tz", None) - ret = _set_tz(ret, tz, coerce=True) + ret = _set_tz(ret, tz, dtype) elif dtype == "timedelta64": ret = np.asarray(ret, dtype="m8[ns]") @@ -2980,7 +3084,7 @@ def read_array(self, key: str, start: int | None = None, stop: int | None = None def read_index( self, key: str, start: int | None = None, stop: int | None = None ) -> Index: - variety = _ensure_decoded(getattr(self.attrs, f"{key}_variety")) + variety = getattr(self.attrs, f"{key}_variety") if variety == "multi": return self.read_multi_index(key, start=start, stop=stop) @@ -3070,12 +3174,11 @@ def read_index_node( # have written a sentinel. Here we replace it with the original. 
if "shape" in node._v_attrs and np.prod(node._v_attrs.shape) == 0: data = np.empty(node._v_attrs.shape, dtype=node._v_attrs.value_type) - kind = _ensure_decoded(node._v_attrs.kind) + kind = node._v_attrs.kind name = None if "name" in node._v_attrs: name = _ensure_str(node._v_attrs.name) - name = _ensure_decoded(name) attrs = node._v_attrs factory, kwargs = self._get_index_factory(attrs) @@ -3089,12 +3192,29 @@ def read_index_node( **kwargs, ) else: - index = factory( - _unconvert_index( - data, kind, encoding=self.encoding, errors=self.errors - ), - **kwargs, - ) + try: + index = factory( + _unconvert_index( + data, kind, encoding=self.encoding, errors=self.errors + ), + **kwargs, + ) + except UnicodeEncodeError as err: + if ( + self.errors == "surrogatepass" + and get_option("future.infer_string") + and str(err).endswith("surrogates not allowed") + and HAS_PYARROW + ): + index = factory( + _unconvert_index( + data, kind, encoding=self.encoding, errors=self.errors + ), + dtype=StringDtype(storage="python", na_value=np.nan), + **kwargs, + ) + else: + raise index.name = name @@ -3164,7 +3284,7 @@ def write_array( pass elif inferred_type == "string": pass - else: + elif get_option("performance_warnings"): ws = performance_doc % (inferred_type, key, items) warnings.warn(ws, PerformanceWarning, stacklevel=find_stack_level()) @@ -3181,7 +3301,9 @@ def write_array( # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no # attribute "asi8" self._handle.create_array( - self.group, key, value.asi8 # type: ignore[union-attr] + self.group, + key, + value.asi8, # type: ignore[union-attr] ) node = getattr(self.group, key) @@ -3192,6 +3314,11 @@ def write_array( elif lib.is_np_dtype(value.dtype, "m"): self._handle.create_array(self.group, key, value.view("i8")) getattr(self.group, key)._v_attrs.value_type = "timedelta64" + elif isinstance(value, BaseStringArray): + vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom()) + vlarr.append(value.to_numpy()) + node = getattr(self.group, key) + node._v_attrs.value_type = str(value.dtype) elif empty_array: self.write_array_empty(key, value) else: @@ -3207,7 +3334,7 @@ class SeriesFixed(GenericFixed): name: Hashable @property - def shape(self): + def shape(self) -> tuple[int] | None: try: return (len(self.group.values),) except (TypeError, AttributeError): @@ -3223,9 +3350,24 @@ def read( self.validate_read(columns, where) index = self.read_index("index", start=start, stop=stop) values = self.read_array("values", start=start, stop=stop) - result = Series(values, index=index, name=self.name, copy=False) - if using_pyarrow_string_dtype() and is_string_array(values, skipna=True): - result = result.astype("string[pyarrow_numpy]") + try: + result = Series(values, index=index, name=self.name, copy=False) + except UnicodeEncodeError as err: + if ( + self.errors == "surrogatepass" + and get_option("future.infer_string") + and str(err).endswith("surrogates not allowed") + and HAS_PYARROW + ): + result = Series( + values, + index=index, + name=self.name, + copy=False, + dtype=StringDtype(storage="python", na_value=np.nan), + ) + else: + raise return result def write(self, obj, **kwargs) -> None: @@ -3293,28 +3435,23 @@ def read( columns = items[items.get_indexer(blk_items)] df = DataFrame(values.T, columns=columns, index=axes[1], copy=False) - if using_pyarrow_string_dtype() and is_string_array(values, skipna=True): - df = df.astype("string[pyarrow_numpy]") + if ( + using_string_dtype() + and isinstance(values, np.ndarray) + and 
is_string_array(values, skipna=True) + ): + df = df.astype(StringDtype(na_value=np.nan)) dfs.append(df) if len(dfs) > 0: - out = concat(dfs, axis=1, copy=True) - if using_copy_on_write(): - # with CoW, concat ignores the copy keyword. Here, we still want - # to copy to enforce optimized column-major layout - out = out.copy() - out = out.reindex(columns=items, copy=False) - return out + out = concat(dfs, axis=1).copy() + return out.reindex(columns=items) return DataFrame(columns=axes[0], index=axes[1]) def write(self, obj, **kwargs) -> None: super().write(obj, **kwargs) - # TODO(ArrayManager) HDFStore relies on accessing the blocks - if isinstance(obj._mgr, ArrayManager): - obj = obj._as_manager("block") - data = obj._mgr if not data.is_consolidated(): data = data.consolidate() @@ -3443,6 +3580,12 @@ def validate(self, other) -> None: # Value of type "Optional[Any]" is not indexable [index] oax = ov[i] # type: ignore[index] if sax != oax: + if c == "values_axes" and sax.kind != oax.kind: + raise ValueError( + f"Cannot serialize the column [{oax.values[0]}] " + f"because its data contents are not [{sax.kind}] " + f"but [{oax.kind}] object dtype" + ) raise ValueError( f"invalid combination of [{c}] on appending data " f"[{sax}] vs current table [{oax}]" @@ -3540,7 +3683,7 @@ def queryables(self) -> dict[str, Any]: return dict(d1 + d2 + d3) - def index_cols(self): + def index_cols(self) -> list[tuple[Any, Any]]: """return a list of my index cols""" # Note: each `i.cname` below is assured to be a str. return [(i.axis, i.cname) for i in self.index_axes] @@ -3598,7 +3741,7 @@ def get_attrs(self) -> None: self.info = getattr(self.attrs, "info", None) or {} self.nan_rep = getattr(self.attrs, "nan_rep", None) self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None)) - self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict")) + self.errors = getattr(self.attrs, "errors", "strict") self.levels: list[Hashable] = getattr(self.attrs, "levels", None) or [] self.index_axes = [a for a in self.indexables if a.is_an_indexable] self.values_axes = [a for a in self.indexables if not a.is_an_indexable] @@ -3670,7 +3813,7 @@ def indexables(self): dc = set(self.data_columns) base_pos = len(_indexables) - def f(i, c): + def f(i, c: str) -> DataCol: assert isinstance(c, str) klass = DataCol if c in dc: @@ -3836,7 +3979,7 @@ def get_object(cls, obj, transposed: bool): """return the data for this obj""" return obj - def validate_data_columns(self, data_columns, min_itemsize, non_index_axes): + def validate_data_columns(self, data_columns, min_itemsize, non_index_axes) -> list: """ take the input data_columns and min_itemize and create a data columns spec @@ -4065,7 +4208,9 @@ def _create_axes( if isinstance(data_converted.dtype, CategoricalDtype): ordered = data_converted.ordered meta = "category" - metadata = np.array(data_converted.categories, copy=False).ravel() + metadata = np.asarray(data_converted.categories).ravel() + elif isinstance(blk.dtype, StringDtype): + meta = str(blk.dtype) data, dtype_name = _get_data_and_dtype_name(data_converted) @@ -4123,16 +4268,10 @@ def _get_blocks_and_items( data_columns, ): # Helper to clarify non-state-altering parts of _create_axes - - # TODO(ArrayManager) HDFStore relies on accessing the blocks - if isinstance(frame._mgr, ArrayManager): - frame = frame._as_manager("block") - def get_blk_items(mgr): return [mgr.items.take(blk.mgr_locs) for blk in mgr.blocks] mgr = frame._mgr - mgr = cast(BlockManager, mgr) blocks: list[Block] = list(mgr.blocks) 
blk_items: list[Index] = get_blk_items(mgr) @@ -4144,7 +4283,6 @@ def get_blk_items(mgr): axis, axis_labels = new_non_index_axes[0] new_labels = Index(axis_labels).difference(Index(data_columns)) mgr = frame.reindex(new_labels, axis=axis)._mgr - mgr = cast(BlockManager, mgr) blocks = list(mgr.blocks) blk_items = get_blk_items(mgr) @@ -4153,7 +4291,6 @@ def get_blk_items(mgr): # index, so we can infer that (as long as axis==1) we # get a single column back, so a single block. mgr = frame.reindex([c], axis=axis)._mgr - mgr = cast(BlockManager, mgr) blocks.extend(mgr.blocks) blk_items.extend(get_blk_items(mgr)) @@ -4333,7 +4470,9 @@ def read_column( encoding=self.encoding, errors=self.errors, ) - return Series(_set_tz(col_values[1], a.tz), name=column, copy=False) + cvs = col_values[1] + dtype = getattr(self.table.attrs, f"{column}_meta", None) + return Series(cvs, name=column, copy=False, dtype=dtype) raise KeyError(f"column [{column}] not found in the table") @@ -4451,7 +4590,7 @@ def write_data(self, chunksize: int | None, dropna: bool = False) -> None: masks.append(mask.astype("u1", copy=False)) # consolidate masks - if len(masks): + if masks: mask = masks[0] for m in masks[1:]: mask = mask & m @@ -4536,7 +4675,9 @@ def write_data_chunk( self.table.append(rows) self.table.flush() - def delete(self, where=None, start: int | None = None, stop: int | None = None): + def delete( + self, where=None, start: int | None = None, stop: int | None = None + ) -> int | None: # delete all rows (and return the nrows) if where is None or not len(where): if start is None and stop is None: @@ -4569,7 +4710,7 @@ def delete(self, where=None, start: int | None = None, stop: int | None = None): groups = list(diff[diff > 1].index) # 1 group - if not len(groups): + if not groups: groups = [0] # final element @@ -4672,20 +4813,38 @@ def read( if values.ndim == 1 and isinstance(values, np.ndarray): values = values.reshape((1, values.shape[0])) - if isinstance(values, np.ndarray): - df = DataFrame(values.T, columns=cols_, index=index_, copy=False) + if isinstance(values, (np.ndarray, DatetimeArray)): + try: + df = DataFrame(values.T, columns=cols_, index=index_, copy=False) + except UnicodeEncodeError as err: + if ( + self.errors == "surrogatepass" + and get_option("future.infer_string") + and str(err).endswith("surrogates not allowed") + and HAS_PYARROW + ): + df = DataFrame( + values.T, + columns=cols_, + index=index_, + copy=False, + dtype=StringDtype(storage="python", na_value=np.nan), + ) + else: + raise elif isinstance(values, Index): df = DataFrame(values, columns=cols_, index=index_) else: # Categorical df = DataFrame._from_arrays([values], columns=cols_, index=index_) - if not (using_pyarrow_string_dtype() and values.dtype.kind == "O"): + if not (using_string_dtype() and values.dtype.kind == "O"): assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype) - if using_pyarrow_string_dtype() and is_string_array( - values, # type: ignore[arg-type] - skipna=True, - ): - df = df.astype("string[pyarrow_numpy]") + + # If str / string dtype is stored in meta, use that. 
+ for column in cols_: + dtype = getattr(self.table.attrs, f"{column}_meta", None) + if dtype in ["str", "string"]: + df[column] = df[column].astype(dtype) frames.append(df) if len(frames) == 1: @@ -4864,7 +5023,7 @@ def read( columns=None, start: int | None = None, stop: int | None = None, - ): + ) -> DataFrame: df = super().read(where=where, columns=columns, start=start, stop=stop) df = df.set_index(self.levels) @@ -4908,55 +5067,25 @@ def _get_tz(tz: tzinfo) -> str | tzinfo: return zone -@overload -def _set_tz( - values: np.ndarray | Index, tz: str | tzinfo, coerce: bool = False -) -> DatetimeIndex: - ... - - -@overload -def _set_tz(values: np.ndarray | Index, tz: None, coerce: bool = False) -> np.ndarray: - ... - - def _set_tz( - values: np.ndarray | Index, tz: str | tzinfo | None, coerce: bool = False -) -> np.ndarray | DatetimeIndex: + values: npt.NDArray[np.int64], tz: str | tzinfo | None, datetime64_dtype: str +) -> DatetimeArray: """ - coerce the values to a DatetimeIndex if tz is set - preserve the input shape if possible + Coerce the values to a DatetimeArray with appropriate tz. Parameters ---------- - values : ndarray or Index - tz : str or tzinfo - coerce : if we do not have a passed timezone, coerce to M8[ns] ndarray + values : ndarray[int64] + tz : str, tzinfo, or None + datetime64_dtype : str, e.g. "datetime64[ns]", "datetime64[25s]" """ - if isinstance(values, DatetimeIndex): - # If values is tzaware, the tz gets dropped in the values.ravel() - # call below (which returns an ndarray). So we are only non-lossy - # if `tz` matches `values.tz`. - assert values.tz is None or values.tz == tz - if values.tz is not None: - return values - - if tz is not None: - if isinstance(values, DatetimeIndex): - name = values.name - else: - name = None - values = values.ravel() - - tz = _ensure_decoded(tz) - values = DatetimeIndex(values, name=name) - values = values.tz_localize("UTC").tz_convert(tz) - elif coerce: - values = np.asarray(values, dtype="M8[ns]") - - # error: Incompatible return value type (got "Union[ndarray, Index]", - # expected "Union[ndarray, DatetimeIndex]") - return values # type: ignore[return-value] + assert values.dtype == "i8", values.dtype + # Argument "tz" to "tz_to_dtype" has incompatible type "str | tzinfo | None"; + # expected "tzinfo" + unit, _ = np.datetime_data(datetime64_dtype) # parsing dtype: unit, count + dtype = tz_to_dtype(tz=tz, unit=unit) # type: ignore[arg-type] + dta = DatetimeArray._from_sequence(values, dtype=dtype) + return dta def _convert_index(name: str, index: Index, encoding: str, errors: str) -> IndexCol: @@ -5062,6 +5191,9 @@ def _maybe_convert_for_string_atom( errors, columns: list[str], ): + if isinstance(bvalues.dtype, StringDtype): + # "ndarray[Any, Any]" has no attribute "to_numpy" + bvalues = bvalues.to_numpy() # type: ignore[union-attr] if bvalues.dtype != object: return bvalues @@ -5086,6 +5218,9 @@ def _maybe_convert_for_string_atom( data = bvalues.copy() data[mask] = nan_rep + if existing_col and mask.any() and len(nan_rep) > existing_col.itemsize: + raise ValueError("NaN representation is too large for existing column size") + # see if we have a valid string type inferred_type = lib.infer_dtype(data, skipna=False) if inferred_type != "string": @@ -5143,7 +5278,7 @@ def _convert_string_array(data: np.ndarray, encoding: str, errors: str) -> np.nd # encode if needed if len(data): data = ( - Series(data.ravel(), copy=False) + Series(data.ravel(), copy=False, dtype="object") .str.encode(encoding, errors) ._values.reshape(data.shape) 
) @@ -5183,7 +5318,11 @@ def _unconvert_string_array( dtype = f"U{itemsize}" if isinstance(data[0], bytes): - data = Series(data, copy=False).str.decode(encoding, errors=errors)._values + ser = Series(data, copy=False).str.decode( + encoding, errors=errors, dtype="object" + ) + data = ser.to_numpy() + data.flags.writeable = True else: data = data.astype(dtype, copy=False).astype(object, copy=False) @@ -5250,8 +5389,6 @@ def _dtype_to_kind(dtype_str: str) -> str: """ Find the "kind" string describing the given dtype name. """ - dtype_str = _ensure_decoded(dtype_str) - if dtype_str.startswith(("string", "bytes")): kind = "string" elif dtype_str.startswith("float"): @@ -5273,6 +5410,8 @@ def _dtype_to_kind(dtype_str: str) -> str: kind = "integer" elif dtype_str == "object": kind = "object" + elif dtype_str == "str": + kind = "str" else: raise ValueError(f"cannot interpret dtype of [{dtype_str}]") @@ -5361,7 +5500,13 @@ def __init__( if self.terms is not None: self.condition, self.filter = self.terms.evaluate() - def generate(self, where): + @overload + def generate(self, where: dict | list | tuple | str) -> PyTablesExpr: ... + + @overload + def generate(self, where: None) -> None: ... + + def generate(self, where: dict | list | tuple | str | None) -> PyTablesExpr | None: """where can be a : dict,list,tuple,string""" if where is None: return None diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index c5bdfb5541788..792af5ff713a3 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -13,18 +13,17 @@ Reference for binary data compression: http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm """ + from __future__ import annotations -from collections import abc -from datetime import ( - datetime, - timedelta, -) +from datetime import datetime import sys from typing import TYPE_CHECKING import numpy as np +from pandas._config import get_option + from pandas._libs.byteswap import ( read_double_with_byteswap, read_float_with_byteswap, @@ -43,12 +42,11 @@ from pandas import ( DataFrame, Timestamp, - isna, ) from pandas.io.common import get_handle import pandas.io.sas.sas_constants as const -from pandas.io.sas.sasreader import ReaderBase +from pandas.io.sas.sasreader import SASReader if TYPE_CHECKING: from pandas._typing import ( @@ -62,20 +60,6 @@ _sas_origin = Timestamp("1960-01-01") -def _parse_datetime(sas_datetime: float, unit: str): - if isna(sas_datetime): - return pd.NaT - - if unit == "s": - return datetime(1960, 1, 1) + timedelta(seconds=sas_datetime) - - elif unit == "d": - return datetime(1960, 1, 1) + timedelta(days=sas_datetime) - - else: - raise ValueError("unit must be 'd' or 's'") - - def _convert_datetimes(sas_datetimes: pd.Series, unit: str) -> pd.Series: """ Convert to Timestamp if possible, otherwise to datetime.datetime. @@ -133,7 +117,7 @@ def __init__( # SAS7BDAT represents a SAS data file in SAS7BDAT format. -class SAS7BDATReader(ReaderBase, abc.Iterator): +class SAS7BDATReader(SASReader): """ Read SAS files in SAS7BDAT format. @@ -326,7 +310,7 @@ def __next__(self) -> DataFrame: return da # Read a single float of the given width (4 or 8). 
- def _read_float(self, offset: int, width: int): + def _read_float(self, offset: int, width: int) -> float: assert self._cached_page is not None if width == 4: return read_float_with_byteswap( @@ -368,11 +352,6 @@ def _read_bytes(self, offset: int, length: int): raise ValueError("The cached page is too small.") return self._cached_page[offset : offset + length] - def _read_and_convert_header_text(self, offset: int, length: int) -> str | bytes: - return self._convert_header_text( - self._read_bytes(offset, length).rstrip(b"\x00 ") - ) - def _parse_metadata(self) -> None: done = False while not done: @@ -539,7 +518,7 @@ def _process_columntext_subheader(self, offset: int, length: int) -> None: buf = self._read_bytes(offset1, self._lcs) self.creator_proc = buf[0 : self._lcp] if hasattr(self, "creator_proc"): - self.creator_proc = self._convert_header_text(self.creator_proc) + self.creator_proc = self._convert_header_text(self.creator_proc) # pyright: ignore[reportArgumentType] def _process_columnname_subheader(self, offset: int, length: int) -> None: int_len = self._int_length @@ -722,6 +701,7 @@ def _chunk_to_dataframe(self) -> DataFrame: rslt = {} js, jb = 0, 0 + infer_string = get_option("future.infer_string") for j in range(self.column_count): name = self.column_names[j] @@ -738,10 +718,13 @@ def _chunk_to_dataframe(self) -> DataFrame: rslt[name] = pd.Series(self._string_chunk[js, :], index=ix, copy=False) if self.convert_text and (self.encoding is not None): rslt[name] = self._decode_string(rslt[name].str) + if infer_string: + rslt[name] = rslt[name].astype("str") + js += 1 else: self.close() - raise ValueError(f"unknown column type {repr(self._column_types[j])}") + raise ValueError(f"unknown column type {self._column_types[j]!r}") df = DataFrame(rslt, columns=self.column_names, index=ix, copy=False) return df diff --git a/pandas/io/sas/sas_constants.py b/pandas/io/sas/sas_constants.py index 62c17bd03927e..8da7becd76d3b 100644 --- a/pandas/io/sas/sas_constants.py +++ b/pandas/io/sas/sas_constants.py @@ -181,36 +181,36 @@ class SASIndex: subheader_signature_to_index: Final = { - b"\xF7\xF7\xF7\xF7": SASIndex.row_size_index, - b"\x00\x00\x00\x00\xF7\xF7\xF7\xF7": SASIndex.row_size_index, - b"\xF7\xF7\xF7\xF7\x00\x00\x00\x00": SASIndex.row_size_index, - b"\xF7\xF7\xF7\xF7\xFF\xFF\xFB\xFE": SASIndex.row_size_index, - b"\xF6\xF6\xF6\xF6": SASIndex.column_size_index, - b"\x00\x00\x00\x00\xF6\xF6\xF6\xF6": SASIndex.column_size_index, - b"\xF6\xF6\xF6\xF6\x00\x00\x00\x00": SASIndex.column_size_index, - b"\xF6\xF6\xF6\xF6\xFF\xFF\xFB\xFE": SASIndex.column_size_index, - b"\x00\xFC\xFF\xFF": SASIndex.subheader_counts_index, - b"\xFF\xFF\xFC\x00": SASIndex.subheader_counts_index, - b"\x00\xFC\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.subheader_counts_index, - b"\xFF\xFF\xFF\xFF\xFF\xFF\xFC\x00": SASIndex.subheader_counts_index, - b"\xFD\xFF\xFF\xFF": SASIndex.column_text_index, - b"\xFF\xFF\xFF\xFD": SASIndex.column_text_index, - b"\xFD\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_text_index, - b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFD": SASIndex.column_text_index, - b"\xFF\xFF\xFF\xFF": SASIndex.column_name_index, - b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_name_index, - b"\xFC\xFF\xFF\xFF": SASIndex.column_attributes_index, - b"\xFF\xFF\xFF\xFC": SASIndex.column_attributes_index, - b"\xFC\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_attributes_index, - b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFC": SASIndex.column_attributes_index, - b"\xFE\xFB\xFF\xFF": SASIndex.format_and_label_index, - 
b"\xFF\xFF\xFB\xFE": SASIndex.format_and_label_index, - b"\xFE\xFB\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.format_and_label_index, - b"\xFF\xFF\xFF\xFF\xFF\xFF\xFB\xFE": SASIndex.format_and_label_index, - b"\xFE\xFF\xFF\xFF": SASIndex.column_list_index, - b"\xFF\xFF\xFF\xFE": SASIndex.column_list_index, - b"\xFE\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_list_index, - b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFE": SASIndex.column_list_index, + b"\xf7\xf7\xf7\xf7": SASIndex.row_size_index, + b"\x00\x00\x00\x00\xf7\xf7\xf7\xf7": SASIndex.row_size_index, + b"\xf7\xf7\xf7\xf7\x00\x00\x00\x00": SASIndex.row_size_index, + b"\xf7\xf7\xf7\xf7\xff\xff\xfb\xfe": SASIndex.row_size_index, + b"\xf6\xf6\xf6\xf6": SASIndex.column_size_index, + b"\x00\x00\x00\x00\xf6\xf6\xf6\xf6": SASIndex.column_size_index, + b"\xf6\xf6\xf6\xf6\x00\x00\x00\x00": SASIndex.column_size_index, + b"\xf6\xf6\xf6\xf6\xff\xff\xfb\xfe": SASIndex.column_size_index, + b"\x00\xfc\xff\xff": SASIndex.subheader_counts_index, + b"\xff\xff\xfc\x00": SASIndex.subheader_counts_index, + b"\x00\xfc\xff\xff\xff\xff\xff\xff": SASIndex.subheader_counts_index, + b"\xff\xff\xff\xff\xff\xff\xfc\x00": SASIndex.subheader_counts_index, + b"\xfd\xff\xff\xff": SASIndex.column_text_index, + b"\xff\xff\xff\xfd": SASIndex.column_text_index, + b"\xfd\xff\xff\xff\xff\xff\xff\xff": SASIndex.column_text_index, + b"\xff\xff\xff\xff\xff\xff\xff\xfd": SASIndex.column_text_index, + b"\xff\xff\xff\xff": SASIndex.column_name_index, + b"\xff\xff\xff\xff\xff\xff\xff\xff": SASIndex.column_name_index, + b"\xfc\xff\xff\xff": SASIndex.column_attributes_index, + b"\xff\xff\xff\xfc": SASIndex.column_attributes_index, + b"\xfc\xff\xff\xff\xff\xff\xff\xff": SASIndex.column_attributes_index, + b"\xff\xff\xff\xff\xff\xff\xff\xfc": SASIndex.column_attributes_index, + b"\xfe\xfb\xff\xff": SASIndex.format_and_label_index, + b"\xff\xff\xfb\xfe": SASIndex.format_and_label_index, + b"\xfe\xfb\xff\xff\xff\xff\xff\xff": SASIndex.format_and_label_index, + b"\xff\xff\xff\xff\xff\xff\xfb\xfe": SASIndex.format_and_label_index, + b"\xfe\xff\xff\xff": SASIndex.column_list_index, + b"\xff\xff\xff\xfe": SASIndex.column_list_index, + b"\xfe\xff\xff\xff\xff\xff\xff\xff": SASIndex.column_list_index, + b"\xff\xff\xff\xff\xff\xff\xff\xfe": SASIndex.column_list_index, } diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index e68f4789f0a06..a9c45e720fd56 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -7,9 +7,9 @@ https://support.sas.com/content/dam/SAS/support/en/technical-papers/record-layout-of-a-sas-version-5-or-6-data-set-in-sas-transport-xport-format.pdf """ + from __future__ import annotations -from collections import abc from datetime import datetime import struct from typing import TYPE_CHECKING @@ -23,7 +23,7 @@ import pandas as pd from pandas.io.common import get_handle -from pandas.io.sas.sasreader import ReaderBase +from pandas.io.sas.sasreader import SASReader if TYPE_CHECKING: from pandas._typing import ( @@ -33,19 +33,16 @@ ReadBuffer, ) _correct_line1 = ( - "HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!" - "000000000000000000000000000000 " + "HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!000000000000000000000000000000 " ) _correct_header1 = ( "HEADER RECORD*******MEMBER HEADER RECORD!!!!!!!000000000000000001600000000" ) _correct_header2 = ( - "HEADER RECORD*******DSCRPTR HEADER RECORD!!!!!!!" 
- "000000000000000000000000000000 " + "HEADER RECORD*******DSCRPTR HEADER RECORD!!!!!!!000000000000000000000000000000 " ) _correct_obs_header = ( - "HEADER RECORD*******OBS HEADER RECORD!!!!!!!" - "000000000000000000000000000000 " + "HEADER RECORD*******OBS HEADER RECORD!!!!!!!000000000000000000000000000000 " ) _fieldkeys = [ "ntype", @@ -251,7 +248,7 @@ def _parse_float_vec(vec): return ieee -class XportReader(ReaderBase, abc.Iterator): +class XportReader(SASReader): __doc__ = _xport_reader_doc def __init__( @@ -288,7 +285,7 @@ def close(self) -> None: def _get_row(self): return self.filepath_or_buffer.read(80).decode() - def _read_header(self): + def _read_header(self) -> None: self.filepath_or_buffer.seek(0) # read file header diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py index c39313d5dc654..6daf4a24781bd 100644 --- a/pandas/io/sas/sasreader.py +++ b/pandas/io/sas/sasreader.py @@ -1,12 +1,14 @@ """ Read SAS sas7bdat or xport files. """ + from __future__ import annotations from abc import ( ABC, abstractmethod, ) +from collections.abc import Iterator from typing import ( TYPE_CHECKING, overload, @@ -32,18 +34,16 @@ from pandas import DataFrame -class ReaderBase(ABC): +class SASReader(Iterator["DataFrame"], ABC): """ - Protocol for XportReader and SAS7BDATReader classes. + Abstract class for XportReader and SAS7BDATReader. """ @abstractmethod - def read(self, nrows: int | None = None) -> DataFrame: - ... + def read(self, nrows: int | None = None) -> DataFrame: ... @abstractmethod - def close(self) -> None: - ... + def close(self) -> None: ... def __enter__(self) -> Self: return self @@ -67,8 +67,7 @@ def read_sas( chunksize: int = ..., iterator: bool = ..., compression: CompressionOptions = ..., -) -> ReaderBase: - ... +) -> SASReader: ... @overload @@ -81,8 +80,7 @@ def read_sas( chunksize: None = ..., iterator: bool = ..., compression: CompressionOptions = ..., -) -> DataFrame | ReaderBase: - ... +) -> DataFrame | SASReader: ... @doc(decompression_options=_shared_docs["decompression_options"] % "filepath_or_buffer") @@ -95,7 +93,7 @@ def read_sas( chunksize: int | None = None, iterator: bool = False, compression: CompressionOptions = "infer", -) -> DataFrame | ReaderBase: +) -> DataFrame | SASReader: """ Read SAS files stored as either XPORT or SAS7BDAT format files. @@ -122,8 +120,17 @@ def read_sas( Returns ------- - DataFrame if iterator=False and chunksize=None, else SAS7BDATReader - or XportReader + DataFrame, SAS7BDATReader, or XportReader + DataFrame if iterator=False and chunksize=None, else SAS7BDATReader + or XportReader, file format is inferred from file extension. + + See Also + -------- + read_csv : Read a comma-separated values (csv) file into a pandas DataFrame. + read_excel : Read an Excel file into a pandas DataFrame. + read_spss : Read an SPSS file into a pandas DataFrame. + read_orc : Load an ORC object into a pandas DataFrame. + read_feather : Load a feather-format object into a pandas DataFrame. 
Examples -------- @@ -144,10 +151,10 @@ def read_sas( format = "sas7bdat" else: raise ValueError( - f"unable to infer format of SAS file from filename: {repr(fname)}" + f"unable to infer format of SAS file from filename: {fname!r}" ) - reader: ReaderBase + reader: SASReader if format.lower() == "xport": from pandas.io.sas.sas_xport import XportReader diff --git a/pandas/io/spss.py b/pandas/io/spss.py index db31a07df79e6..dfada10c719c9 100644 --- a/pandas/io/spss.py +++ b/pandas/io/spss.py @@ -1,6 +1,9 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import ( + TYPE_CHECKING, + Any, +) from pandas._libs import lib from pandas.compat._optional import import_optional_dependency @@ -24,6 +27,7 @@ def read_spss( usecols: Sequence[str] | None = None, convert_categoricals: bool = True, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, + **kwargs: Any, ) -> DataFrame: """ Load an SPSS file from the file path, returning a DataFrame. @@ -36,20 +40,34 @@ def read_spss( Return a subset of the columns. If None, return all columns. convert_categoricals : bool, default is True Convert categorical columns into pd.Categorical. - dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable' + dtype_backend : {'numpy_nullable', 'pyarrow'} Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). Behaviour is as follows: + (still experimental). If not specified, the default behavior + is to not use nullable data types. If specified, the behavior + is as follows: * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. + * ``"pyarrow"``: returns pyarrow-backed + nullable :class:`ArrowDtype` :class:`DataFrame` .. versionadded:: 2.0 + **kwargs + Additional keyword arguments that can be passed to :func:`pyreadstat.read_sav`. + + .. versionadded:: 3.0 Returns ------- DataFrame + DataFrame based on the SPSS file. + + See Also + -------- + read_csv : Read a comma-separated values (csv) file into a pandas DataFrame. + read_excel : Read an Excel file into a pandas DataFrame. + read_sas : Read an SAS file into a pandas DataFrame. + read_orc : Load an ORC object into a pandas DataFrame. + read_feather : Load a feather-format object into a pandas DataFrame. 
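To show the new **kwargs passthrough, a sketch that forwards a pyreadstat option; row_limit is a pyreadstat.read_sav keyword, and the file and column names are hypothetical:

    import pandas as pd

    # usecols and convert_categoricals are handled by pandas itself;
    # row_limit is forwarded verbatim to pyreadstat.read_sav via **kwargs.
    df = pd.read_spss(
        "survey.sav",
        usecols=["q1", "q2"],
        convert_categoricals=False,
        row_limit=1_000,
    )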
Examples -------- @@ -64,7 +82,10 @@ def read_spss( usecols = list(usecols) # pyreadstat requires a list df, metadata = pyreadstat.read_sav( - stringify_path(path), usecols=usecols, apply_value_formats=convert_categoricals + stringify_path(path), + usecols=usecols, + apply_value_formats=convert_categoricals, + **kwargs, ) df.attrs = metadata.__dict__ if dtype_backend is not lib.no_default: diff --git a/pandas/io/sql.py b/pandas/io/sql.py index b0fa6bc6e90c4..7376843f7e8ff 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -23,7 +23,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, cast, overload, @@ -32,7 +31,7 @@ import numpy as np -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import lib from pandas.compat._optional import import_optional_dependency @@ -46,11 +45,10 @@ from pandas.core.dtypes.common import ( is_dict_like, is_list_like, + is_object_dtype, + is_string_dtype, ) -from pandas.core.dtypes.dtypes import ( - ArrowDtype, - DatetimeTZDtype, -) +from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import isna from pandas import get_option @@ -59,26 +57,31 @@ Series, ) from pandas.core.arrays import ArrowExtensionArray +from pandas.core.arrays.string_ import StringDtype from pandas.core.base import PandasObject import pandas.core.common as com from pandas.core.common import maybe_make_list from pandas.core.internals.construction import convert_object_array from pandas.core.tools.datetimes import to_datetime +from pandas.io._util import arrow_table_to_pandas + if TYPE_CHECKING: from collections.abc import ( + Callable, + Generator, Iterator, Mapping, ) from sqlalchemy import Table from sqlalchemy.sql.expression import ( + Delete, Select, TextClause, ) from pandas._typing import ( - DateTimeErrorChoices, DtypeArg, DtypeBackend, IndexLabel, @@ -110,14 +113,7 @@ def _handle_date_column( # read_sql like functions. # Format can take on custom to_datetime argument values such as # {"errors": "coerce"} or {"dayfirst": True} - error: DateTimeErrorChoices = format.pop("errors", None) or "ignore" - if error == "ignore": - try: - return to_datetime(col, **format) - except (TypeError, ValueError): - # TODO: not reached 2023-10-27; needed? - return col - return to_datetime(col, errors=error, **format) + return to_datetime(col, **format) else: # Allow passing of formatting string for integers # GH17855 @@ -136,7 +132,7 @@ def _handle_date_column( return to_datetime(col, errors="coerce", format=format, utc=utc) -def _parse_date_columns(data_frame, parse_dates): +def _parse_date_columns(data_frame: DataFrame, parse_dates) -> DataFrame: """ Force non-datetime columns to be read as such. Supports both string formatted and integer timestamp columns. 
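With the errors="ignore" path removed above, parse_dates values are now handed straight to to_datetime; a runnable sketch against an in-memory SQLite table (table and column names arbitrary):

    import sqlite3

    import pandas as pd

    conn = sqlite3.connect(":memory:")
    pd.DataFrame({"d": ["2024-01-02", "2024-02-03"]}).to_sql(
        "t", conn, index=False
    )
    # A dict value is unpacked into to_datetime(col, **kwargs)
    df = pd.read_sql("SELECT * FROM t", conn,
                     parse_dates={"d": {"format": "%Y-%m-%d"}})
    print(df.dtypes)  # d is now a datetime64 column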
@@ -164,6 +160,7 @@ def _convert_arrays_to_dataframe( dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", ) -> DataFrame: content = lib.to_object_array_tuples(data) + idx_len = content.shape[0] arrays = convert_object_array( list(content.T), dtype=None, @@ -184,9 +181,9 @@ def _convert_arrays_to_dataframe( result_arrays.append(ArrowExtensionArray(pa_array)) arrays = result_arrays # type: ignore[assignment] if arrays: - df = DataFrame(dict(zip(list(range(len(columns))), arrays))) - df.columns = columns - return df + return DataFrame._from_arrays( + arrays, columns=columns, index=range(idx_len), verify_integrity=False + ) else: return DataFrame(columns=columns) @@ -199,7 +196,7 @@ def _wrap_result( parse_dates=None, dtype: DtypeArg | None = None, dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", -): +) -> DataFrame: """Wrap result set of a SQLAlchemy query in a DataFrame.""" frame = _convert_arrays_to_dataframe(data, columns, coerce_float, dtype_backend) @@ -234,54 +231,22 @@ def _wrap_result_adbc( return df -def execute(sql, con, params=None): - """ - Execute the given SQL query using the provided connection object. - - Parameters - ---------- - sql : string - SQL query to be executed. - con : SQLAlchemy connection or sqlite3 connection - If a DBAPI2 object, only sqlite3 is supported. - params : list or tuple, optional, default: None - List of parameters to pass to execute method. - - Returns - ------- - Results Iterable - """ - warnings.warn( - "`pandas.io.sql.execute` is deprecated and " - "will be removed in the future version.", - FutureWarning, - stacklevel=find_stack_level(), - ) # GH50185 - sqlalchemy = import_optional_dependency("sqlalchemy", errors="ignore") - - if sqlalchemy is not None and isinstance(con, (str, sqlalchemy.engine.Engine)): - raise TypeError("pandas.io.sql.execute requires a connection") # GH50185 - with pandasSQL_builder(con, need_transaction=True) as pandas_sql: - return pandas_sql.execute(sql, params) - - # ----------------------------------------------------------------------------- # -- Read and write to DataFrames @overload -def read_sql_table( +def read_sql_table( # pyright: ignore[reportOverlappingOverload] table_name: str, con, schema=..., index_col: str | list[str] | None = ..., coerce_float=..., - parse_dates: list[str] | dict[str, str] | None = ..., + parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = ..., columns: list[str] | None = ..., chunksize: None = ..., dtype_backend: DtypeBackend | lib.NoDefault = ..., -) -> DataFrame: - ... +) -> DataFrame: ... @overload @@ -291,12 +256,11 @@ def read_sql_table( schema=..., index_col: str | list[str] | None = ..., coerce_float=..., - parse_dates: list[str] | dict[str, str] | None = ..., + parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = ..., columns: list[str] | None = ..., chunksize: int = ..., dtype_backend: DtypeBackend | lib.NoDefault = ..., -) -> Iterator[DataFrame]: - ... +) -> Iterator[DataFrame]: ... 
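The paired overloads above encode the chunksize contract: a plain DataFrame without it, an iterator of DataFrames with it. A sketch using read_sql_query on an in-memory SQLite connection (table name arbitrary):

    import sqlite3

    import pandas as pd

    conn = sqlite3.connect(":memory:")
    pd.DataFrame({"x": range(10)}).to_sql("t", conn, index=False)

    whole = pd.read_sql_query("SELECT * FROM t", conn)  # one DataFrame
    for chunk in pd.read_sql_query("SELECT * FROM t", conn, chunksize=4):
        print(len(chunk))  # 4, 4, 2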
def read_sql_table( @@ -305,7 +269,7 @@ def read_sql_table( schema: str | None = None, index_col: str | list[str] | None = None, coerce_float: bool = True, - parse_dates: list[str] | dict[str, str] | None = None, + parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = None, columns: list[str] | None = None, chunksize: int | None = None, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, @@ -345,14 +309,15 @@ def read_sql_table( chunksize : int, default None If specified, returns an iterator where `chunksize` is the number of rows to include in each chunk. - dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable' + dtype_backend : {'numpy_nullable', 'pyarrow'} Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). Behaviour is as follows: + (still experimental). If not specified, the default behavior + is to not use nullable data types. If specified, the behavior + is as follows: * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. + * ``"pyarrow"``: returns pyarrow-backed nullable + :class:`ArrowDtype` :class:`DataFrame` .. versionadded:: 2.0 @@ -373,7 +338,7 @@ def read_sql_table( Examples -------- - >>> pd.read_sql_table('table_name', 'postgres:///db_name') # doctest:+SKIP + >>> pd.read_sql_table("table_name", "postgres:///db_name") # doctest:+SKIP """ check_dtype_backend(dtype_backend) @@ -402,18 +367,17 @@ def read_sql_table( @overload -def read_sql_query( +def read_sql_query( # pyright: ignore[reportOverlappingOverload] sql, con, index_col: str | list[str] | None = ..., coerce_float=..., params: list[Any] | Mapping[str, Any] | None = ..., - parse_dates: list[str] | dict[str, str] | None = ..., + parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = ..., chunksize: None = ..., dtype: DtypeArg | None = ..., dtype_backend: DtypeBackend | lib.NoDefault = ..., -) -> DataFrame: - ... +) -> DataFrame: ... @overload @@ -423,12 +387,11 @@ def read_sql_query( index_col: str | list[str] | None = ..., coerce_float=..., params: list[Any] | Mapping[str, Any] | None = ..., - parse_dates: list[str] | dict[str, str] | None = ..., + parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = ..., chunksize: int = ..., dtype: DtypeArg | None = ..., dtype_backend: DtypeBackend | lib.NoDefault = ..., -) -> Iterator[DataFrame]: - ... +) -> Iterator[DataFrame]: ... def read_sql_query( @@ -437,7 +400,7 @@ def read_sql_query( index_col: str | list[str] | None = None, coerce_float: bool = True, params: list[Any] | Mapping[str, Any] | None = None, - parse_dates: list[str] | dict[str, str] | None = None, + parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = None, chunksize: int | None = None, dtype: DtypeArg | None = None, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, @@ -484,20 +447,23 @@ def read_sql_query( {'a': np.float64, 'b': np.int32, 'c': 'Int64'}. .. versionadded:: 1.3.0 - dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable' + dtype_backend : {'numpy_nullable', 'pyarrow'} Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). Behaviour is as follows: + (still experimental). If not specified, the default behavior + is to not use nullable data types. If specified, the behavior + is as follows: * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). 
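The reworded `dtype_backend` description can be seen directly: with the keyword omitted, plain NumPy dtypes are used (a NULL forces an integer column to float), while `"numpy_nullable"` opts into NA-aware extension dtypes. A self-contained demo with sqlite3:

```python
import sqlite3

import pandas as pd

con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE t (x INTEGER)")
con.executemany("INSERT INTO t VALUES (?)", [(1,), (None,), (3,)])
con.commit()

default = pd.read_sql_query("SELECT x FROM t", con)
nullable = pd.read_sql_query("SELECT x FROM t", con, dtype_backend="numpy_nullable")
print(default.dtypes["x"])   # float64: the NULL becomes NaN
print(nullable.dtypes["x"])  # Int64: the NULL becomes pd.NA
con.close()
```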
- * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. + * ``"pyarrow"``: returns pyarrow-backed nullable + :class:`ArrowDtype` :class:`DataFrame` .. versionadded:: 2.0 Returns ------- DataFrame or Iterator[DataFrame] + Returns a DataFrame object that contains the result set of the + executed SQL query, in relation to the specified database connection. See Also -------- @@ -513,8 +479,9 @@ def read_sql_query( -------- >>> from sqlalchemy import create_engine # doctest: +SKIP >>> engine = create_engine("sqlite:///database.db") # doctest: +SKIP + >>> sql_query = "SELECT int_column FROM test_data" # doctest: +SKIP >>> with engine.connect() as conn, conn.begin(): # doctest: +SKIP - ... data = pd.read_sql_table("data", conn) # doctest: +SKIP + ... data = pd.read_sql_query(sql_query, conn) # doctest: +SKIP """ check_dtype_backend(dtype_backend) @@ -536,7 +503,7 @@ def read_sql_query( @overload -def read_sql( +def read_sql( # pyright: ignore[reportOverlappingOverload] sql, con, index_col: str | list[str] | None = ..., @@ -547,8 +514,7 @@ def read_sql( chunksize: None = ..., dtype_backend: DtypeBackend | lib.NoDefault = ..., dtype: DtypeArg | None = None, -) -> DataFrame: - ... +) -> DataFrame: ... @overload @@ -563,8 +529,7 @@ def read_sql( chunksize: int = ..., dtype_backend: DtypeBackend | lib.NoDefault = ..., dtype: DtypeArg | None = None, -) -> Iterator[DataFrame]: - ... +) -> Iterator[DataFrame]: ... def read_sql( @@ -626,14 +591,15 @@ def read_sql( chunksize : int, default None If specified, return an iterator where `chunksize` is the number of rows to include in each chunk. - dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable' + dtype_backend : {'numpy_nullable', 'pyarrow'} Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). Behaviour is as follows: + (still experimental). If not specified, the default behavior + is to not use nullable data types. If specified, the behavior + is as follows: * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. + * ``"pyarrow"``: returns pyarrow-backed nullable + :class:`ArrowDtype` :class:`DataFrame` .. versionadded:: 2.0 dtype : Type name or dict of columns @@ -646,12 +612,23 @@ def read_sql( Returns ------- DataFrame or Iterator[DataFrame] + Returns a DataFrame object that contains the result set of the + executed SQL query or an SQL Table based on the provided input, + in relation to the specified database connection. See Also -------- read_sql_table : Read SQL database table into a DataFrame. read_sql_query : Read SQL query into a DataFrame. + Notes + ----- + ``pandas`` does not attempt to sanitize SQL statements; + instead it simply forwards the statement you are executing + to the underlying driver, which may or may not sanitize from there. + Please refer to the underlying driver documentation for any details. + Generally, be wary when accepting statements from arbitrary sources. + Examples -------- Read data from SQL via either a SQL query or a SQL tablename. @@ -659,27 +636,41 @@ def read_sql( providing only the SQL tablename will result in an error. >>> from sqlite3 import connect - >>> conn = connect(':memory:') - >>> df = pd.DataFrame(data=[[0, '10/11/12'], [1, '12/11/10']], - ... columns=['int_column', 'date_column']) - >>> df.to_sql(name='test_data', con=conn) + >>> conn = connect(":memory:") + >>> df = pd.DataFrame( + ... 
data=[[0, "10/11/12"], [1, "12/11/10"]], + ... columns=["int_column", "date_column"], + ... ) + >>> df.to_sql(name="test_data", con=conn) 2 - >>> pd.read_sql('SELECT int_column, date_column FROM test_data', conn) + >>> pd.read_sql("SELECT int_column, date_column FROM test_data", conn) int_column date_column 0 0 10/11/12 1 1 12/11/10 - >>> pd.read_sql('test_data', 'postgres:///db_name') # doctest:+SKIP + >>> pd.read_sql("test_data", "postgres:///db_name") # doctest:+SKIP + + For parameterized query, using ``params`` is recommended over string interpolation. + + >>> from sqlalchemy import text + >>> sql = text( + ... "SELECT int_column, date_column FROM test_data WHERE int_column=:int_val" + ... ) + >>> pd.read_sql(sql, conn, params={"int_val": 1}) # doctest:+SKIP + int_column date_column + 0 1 12/11/10 Apply date parsing to columns through the ``parse_dates`` argument The ``parse_dates`` argument calls ``pd.to_datetime`` on the provided columns. Custom argument values for applying ``pd.to_datetime`` on a column are specified via a dictionary format: - >>> pd.read_sql('SELECT int_column, date_column FROM test_data', - ... conn, - ... parse_dates={"date_column": {"format": "%d/%m/%y"}}) + >>> pd.read_sql( + ... "SELECT int_column, date_column FROM test_data", + ... conn, + ... parse_dates={"date_column": {"format": "%d/%m/%y"}}, + ... ) int_column date_column 0 0 2012-11-10 1 1 2010-11-12 @@ -689,8 +680,8 @@ def read_sql( pandas now supports reading via ADBC drivers >>> from adbc_driver_postgresql import dbapi # doctest:+SKIP - >>> with dbapi.connect('postgres:///db_name') as conn: # doctest:+SKIP - ... pd.read_sql('SELECT int_column FROM test_data', conn) + >>> with dbapi.connect("postgres:///db_name") as conn: # doctest:+SKIP + ... pd.read_sql("SELECT int_column FROM test_data", conn) int_column 0 0 1 1 @@ -748,7 +739,7 @@ def to_sql( name: str, con, schema: str | None = None, - if_exists: Literal["fail", "replace", "append"] = "fail", + if_exists: Literal["fail", "replace", "append", "delete_rows"] = "fail", index: bool = True, index_label: IndexLabel | None = None, chunksize: int | None = None, @@ -760,6 +751,12 @@ def to_sql( """ Write records stored in a DataFrame to a SQL database. + .. warning:: + The pandas library does not attempt to sanitize inputs provided via a to_sql call. + Please refer to the documentation for the underlying database driver to see if it + will properly prevent injection, or alternatively be advised of a security risk when + executing arbitrary commands in a to_sql call. + Parameters ---------- frame : DataFrame, Series @@ -774,10 +771,11 @@ def to_sql( schema : str, optional Name of SQL schema in database to write to (if database flavor supports this). If None, use default schema (default). - if_exists : {'fail', 'replace', 'append'}, default 'fail' + if_exists : {'fail', 'replace', 'append', 'delete_rows'}, default 'fail' - fail: If table exists, do nothing. - replace: If table exists, drop it, recreate it, and insert data. - append: If table exists, insert data. Create if does not exist. + - delete_rows: If a table exists, delete all records and insert data. index : bool, default True Write DataFrame index as a column. 
index_label : str or sequence, optional @@ -828,7 +826,7 @@ def to_sql( `sqlite3 `__ or `SQLAlchemy `__ """ # noqa: E501 - if if_exists not in ("fail", "replace", "append"): + if if_exists not in ("fail", "replace", "append", "delete_rows"): raise ValueError(f"'{if_exists}' is not valid for if_exists") if isinstance(frame, Series): @@ -936,7 +934,7 @@ def __init__( pandas_sql_engine, frame=None, index: bool | str | list[str] | None = True, - if_exists: Literal["fail", "replace", "append"] = "fail", + if_exists: Literal["fail", "replace", "append", "delete_rows"] = "fail", prefix: str = "pandas", index_label=None, schema=None, @@ -984,11 +982,13 @@ def create(self) -> None: if self.exists(): if self.if_exists == "fail": raise ValueError(f"Table '{self.name}' already exists.") - if self.if_exists == "replace": + elif self.if_exists == "replace": self.pd_sql.drop_table(self.name, self.schema) self._execute_create() elif self.if_exists == "append": pass + elif self.if_exists == "delete_rows": + self.pd_sql.delete_rows(self.name, self.schema) else: raise ValueError(f"'{self.if_exists}' is not valid for if_exists") else: @@ -1007,32 +1007,29 @@ def _execute_insert(self, conn, keys: list[str], data_iter) -> int: Each item contains a list of values to be inserted """ data = [dict(zip(keys, row)) for row in data_iter] - result = conn.execute(self.table.insert(), data) + result = self.pd_sql.execute(self.table.insert(), data) return result.rowcount def _execute_insert_multi(self, conn, keys: list[str], data_iter) -> int: """ - Alternative to _execute_insert for DBs support multivalue INSERT. + Alternative to _execute_insert for DBs that support multi-value INSERT. Note: multi-value insert is usually faster for analytics DBs and tables containing a few columns but performance degrades quickly as the number of columns increases. + """ from sqlalchemy import insert data = [dict(zip(keys, row)) for row in data_iter] - stmt = insert(self.table) - # conn.execute is used here to ensure compatibility with Oracle. - # Using stmt.values(data) would produce a multi row insert that - # isn't supported by Oracle. - # see: https://docs.sqlalchemy.org/en/20/core/dml.html#sqlalchemy.sql.expression.Insert.values - result = conn.execute(stmt, data) + stmt = insert(self.table).values(data) + result = self.pd_sql.execute(stmt) return result.rowcount def insert_data(self) -> tuple[list[str], list[np.ndarray]]: if self.index is not None: - temp = self.frame.copy() + temp = self.frame.copy(deep=False) temp.index.names = self.index try: temp.reset_index(inplace=True) @@ -1056,10 +1053,7 @@ def insert_data(self) -> tuple[list[str], list[np.ndarray]]: # GH#53854 to_pydatetime not supported for pyarrow date dtypes d = ser._values.to_numpy(dtype=object) else: - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=FutureWarning) - # GH#52459 to_pydatetime will return Index[object] - d = np.asarray(ser.dt.to_pydatetime(), dtype=object) + d = ser.dt.to_pydatetime()._values else: d = ser._values.to_pydatetime() elif ser.dtype.kind == "m": @@ -1137,7 +1131,7 @@ def _query_iterator( coerce_float: bool = True, parse_dates=None, dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", - ): + ) -> Generator[DataFrame]: """Return generator through chunked result set.""" has_read_data = False with exit_stack: @@ -1333,12 +1327,17 @@ def _harmonize_columns( self.frame[col_name] = _handle_date_column(df_col, utc=utc) elif dtype_backend == "numpy" and col_type is float: # floats support NA, can always convert!
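`_execute_insert_multi` above is what backs `to_sql(method="multi")`: one INSERT statement carrying many VALUES tuples instead of an executemany of single-row statements. Statement size grows with rows times columns, which is why pairing it with `chunksize` matters:

```python
import sqlite3

import pandas as pd

con = sqlite3.connect(":memory:")
df = pd.DataFrame({"a": range(100), "b": range(100)})

# Two multi-row INSERTs of 50 rows each, rather than 100 single-row inserts.
df.to_sql("t", con, index=False, method="multi", chunksize=50)
print(pd.read_sql("SELECT COUNT(*) AS n FROM t", con))
con.close()
```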
- self.frame[col_name] = df_col.astype(col_type, copy=False) - + self.frame[col_name] = df_col.astype(col_type) + elif ( + using_string_dtype() + and is_string_dtype(col_type) + and is_object_dtype(self.frame[col_name]) + ): + self.frame[col_name] = df_col.astype(col_type) elif dtype_backend == "numpy" and len(df_col) == df_col.count(): # No NA values, can convert ints and bools if col_type is np.dtype("int64") or col_type is bool: - self.frame[col_name] = df_col.astype(col_type, copy=False) + self.frame[col_name] = df_col.astype(col_type) except KeyError: pass # this column not in results @@ -1421,6 +1420,7 @@ def _get_dtype(self, sqltype): DateTime, Float, Integer, + String, ) if isinstance(sqltype, Float): @@ -1440,6 +1440,10 @@ def _get_dtype(self, sqltype): return date elif isinstance(sqltype, Boolean): return bool + elif isinstance(sqltype, String): + if using_string_dtype(): + return StringDtype(na_value=np.nan) + return object @@ -1486,7 +1490,7 @@ def to_sql( self, frame, name: str, - if_exists: Literal["fail", "replace", "append"] = "fail", + if_exists: Literal["fail", "replace", "append", "delete_rows"] = "fail", index: bool = True, index_label=None, schema=None, @@ -1514,7 +1518,7 @@ def _create_sql_schema( keys: list[str] | None = None, dtype: DtypeArg | None = None, schema: str | None = None, - ): + ) -> str: pass @@ -1655,12 +1659,20 @@ def run_transaction(self): else: yield self.con - def execute(self, sql: str | Select | TextClause, params=None): + def execute(self, sql: str | Select | TextClause | Delete, params=None): """Simple passthrough to SQLAlchemy connectable""" + from sqlalchemy.exc import SQLAlchemyError + args = [] if params is None else [params] if isinstance(sql, str): - return self.con.exec_driver_sql(sql, *args) - return self.con.execute(sql, *args) + execute_function = self.con.exec_driver_sql + else: + execute_function = self.con.execute + + try: + return execute_function(sql, *args) + except SQLAlchemyError as exc: + raise DatabaseError(f"Execution failed on sql '{sql}': {exc}") from exc def read_table( self, @@ -1704,14 +1716,15 @@ def read_table( chunksize : int, default None If specified, return an iterator where `chunksize` is the number of rows to include in each chunk. - dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable' + dtype_backend : {'numpy_nullable', 'pyarrow'} Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). Behaviour is as follows: + (still experimental). If not specified, the default behavior + is to not use nullable data types. If specified, the behavior + is as follows: * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. + * ``"pyarrow"``: returns pyarrow-backed nullable + :class:`ArrowDtype` :class:`DataFrame` .. 
versionadded:: 2.0 @@ -1749,7 +1762,7 @@ def _query_iterator( parse_dates=None, dtype: DtypeArg | None = None, dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", - ): + ) -> Generator[DataFrame]: """Return generator through chunked result set""" has_read_data = False with exit_stack: @@ -1871,7 +1884,7 @@ def prep_table( self, frame, name: str, - if_exists: Literal["fail", "replace", "append"] = "fail", + if_exists: Literal["fail", "replace", "append", "delete_rows"] = "fail", index: bool | str | list[str] | None = True, index_label=None, schema=None, @@ -1888,7 +1901,7 @@ def prep_table( # Type[str], Type[float], Type[int], Type[complex], Type[bool], # Type[object]]]]"; expected type "Union[ExtensionDtype, str, # dtype[Any], Type[object]]" - dtype = {col_name: dtype for col_name in frame} # type: ignore[misc] + dtype = dict.fromkeys(frame, dtype) # type: ignore[arg-type] else: dtype = cast(dict, dtype) @@ -1948,7 +1961,7 @@ def to_sql( self, frame, name: str, - if_exists: Literal["fail", "replace", "append"] = "fail", + if_exists: Literal["fail", "replace", "append", "delete_rows"] = "fail", index: bool = True, index_label=None, schema: str | None = None, @@ -1966,10 +1979,11 @@ def to_sql( frame : DataFrame name : string Name of SQL table. - if_exists : {'fail', 'replace', 'append'}, default 'fail' + if_exists : {'fail', 'replace', 'append', 'delete_rows'}, default 'fail' - fail: If table exists, do nothing. - replace: If table exists, drop it, recreate it, and insert data. - append: If table exists, insert data. Create if does not exist. + - delete_rows: If a table exists, delete all records and insert data. index : boolean, default True Write DataFrame index as a column. index_label : string or sequence, default None @@ -2066,6 +2080,16 @@ def drop_table(self, table_name: str, schema: str | None = None) -> None: self.get_table(table_name, schema).drop(bind=self.con) self.meta.clear() + def delete_rows(self, table_name: str, schema: str | None = None) -> None: + schema = schema or self.meta.schema + if self.has_table(table_name, schema): + self.meta.reflect( + bind=self.con, only=[table_name], schema=schema, views=True + ) + table = self.get_table(table_name, schema) + self.execute(table.delete()).close() + self.meta.clear() + def _create_sql_schema( self, frame: DataFrame, @@ -2073,7 +2097,7 @@ def _create_sql_schema( keys: list[str] | None = None, dtype: DtypeArg | None = None, schema: str | None = None, - ): + ) -> str: table = SQLTable( table_name, self, @@ -2113,6 +2137,8 @@ def run_transaction(self): self.con.commit() def execute(self, sql: str | Select | TextClause, params=None): + from adbc_driver_manager import Error + if not isinstance(sql, str): raise TypeError("Query must be a string unless using sqlalchemy.") args = [] if params is None else [params] @@ -2120,10 +2146,10 @@ def execute(self, sql: str | Select | TextClause, params=None): try: cur.execute(sql, *args) return cur - except Exception as exc: + except Error as exc: try: self.con.rollback() - except Exception as inner_exc: # pragma: no cover + except Error as inner_exc: # pragma: no cover ex = DatabaseError( f"Execution failed on sql: {sql}\n{exc}\nunable to rollback" ) @@ -2169,14 +2195,15 @@ def read_table( schema of the SQL database object. chunksize : int, default None Raises NotImplementedError - dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable' + dtype_backend : {'numpy_nullable', 'pyarrow'} Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). 
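The SQLAlchemy `delete_rows` added above reflects the table definition and then issues `table.delete()`, i.e. an unfiltered `DELETE FROM`. A standalone sketch of that pattern against the SQLAlchemy 1.4/2.x API:

```python
from sqlalchemy import MetaData, create_engine

engine = create_engine("sqlite://")
with engine.begin() as conn:
    conn.exec_driver_sql("CREATE TABLE t (a INTEGER)")
    conn.exec_driver_sql("INSERT INTO t VALUES (1), (2)")

meta = MetaData()
meta.reflect(bind=engine, only=["t"])        # load the table definition
with engine.begin() as conn:
    conn.execute(meta.tables["t"].delete())  # emits DELETE FROM t

with engine.connect() as conn:
    print(conn.exec_driver_sql("SELECT COUNT(*) FROM t").scalar())  # 0
```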
Behaviour is as follows: + (still experimental). If not specified, the default behavior + is to not use nullable data types. If specified, the behavior + is as follows: * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. + * ``"pyarrow"``: returns pyarrow-backed nullable + :class:`ArrowDtype` :class:`DataFrame` .. versionadded:: 2.0 @@ -2211,23 +2238,9 @@ def read_table( else: stmt = f"SELECT {select_list} FROM {table_name}" - mapping: type[ArrowDtype] | None | Callable - if dtype_backend == "pyarrow": - mapping = ArrowDtype - elif dtype_backend == "numpy_nullable": - from pandas.io._util import _arrow_dtype_mapping - - mapping = _arrow_dtype_mapping().get - elif using_pyarrow_string_dtype(): - from pandas.io._util import arrow_string_types_mapper - - arrow_string_types_mapper() - else: - mapping = None - - with self.con.cursor() as cur: - cur.execute(stmt) - df = cur.fetch_arrow_table().to_pandas(types_mapper=mapping) + with self.execute(stmt) as cur: + pa_table = cur.fetch_arrow_table() + df = arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend) return _wrap_result_adbc( df, @@ -2295,19 +2308,9 @@ def read_query( if chunksize: raise NotImplementedError("'chunksize' is not implemented for ADBC drivers") - mapping: type[ArrowDtype] | None | Callable - if dtype_backend == "pyarrow": - mapping = ArrowDtype - elif dtype_backend == "numpy_nullable": - from pandas.io._util import _arrow_dtype_mapping - - mapping = _arrow_dtype_mapping().get - else: - mapping = None - - with self.con.cursor() as cur: - cur.execute(sql) - df = cur.fetch_arrow_table().to_pandas(types_mapper=mapping) + with self.execute(sql) as cur: + pa_table = cur.fetch_arrow_table() + df = arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend) return _wrap_result_adbc( df, @@ -2322,7 +2325,7 @@ def to_sql( self, frame, name: str, - if_exists: Literal["fail", "replace", "append"] = "fail", + if_exists: Literal["fail", "replace", "append", "delete_rows"] = "fail", index: bool = True, index_label=None, schema: str | None = None, @@ -2344,6 +2347,7 @@ def to_sql( - fail: If table exists, do nothing. - replace: If table exists, drop it, recreate it, and insert data. - append: If table exists, insert data. Create if does not exist. + - delete_rows: If a table exists, delete all records and insert data. index : boolean, default True Write DataFrame index as a column. 
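Both ADBC read paths now funnel the Arrow table through the shared `arrow_table_to_pandas` helper instead of hand-picking a `types_mapper`. For the `"pyarrow"` backend, that conversion boils down to roughly this (a sketch; the real helper also covers the `numpy_nullable` mapping and the string-dtype option):

```python
import pyarrow as pa

import pandas as pd

tbl = pa.table({"x": [1, None, 3]})
df = tbl.to_pandas(types_mapper=pd.ArrowDtype)  # ArrowDtype-backed columns
print(df.dtypes)  # x    int64[pyarrow]
```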
index_label : string or sequence, default None @@ -2361,6 +2365,9 @@ def to_sql( engine : {'auto', 'sqlalchemy'}, default 'auto' Raises NotImplementedError if not set to 'auto' """ + pa = import_optional_dependency("pyarrow") + from adbc_driver_manager import Error + if index_label: raise NotImplementedError( "'index_label' is not implemented for ADBC drivers" @@ -2390,12 +2397,13 @@ def to_sql( if if_exists == "fail": raise ValueError(f"Table '{table_name}' already exists.") elif if_exists == "replace": - with self.con.cursor() as cur: - cur.execute(f"DROP TABLE {table_name}") + sql_statement = f"DROP TABLE {table_name}" + self.execute(sql_statement).close() elif if_exists == "append": mode = "append" - - import pyarrow as pa + elif if_exists == "delete_rows": + mode = "append" + self.delete_rows(name, schema) try: tbl = pa.Table.from_pandas(frame, preserve_index=index) @@ -2403,7 +2411,14 @@ def to_sql( raise ValueError("datatypes not supported") from exc with self.con.cursor() as cur: - total_inserted = cur.adbc_ingest(table_name, tbl, mode=mode) + try: + total_inserted = cur.adbc_ingest( + table_name=name, data=tbl, mode=mode, db_schema_name=schema + ) + except Error as exc: + raise DatabaseError( + f"Failed to insert records on table={name} with {mode=}" + ) from exc self.con.commit() return total_inserted @@ -2426,6 +2441,11 @@ def has_table(self, name: str, schema: str | None = None) -> bool: return False + def delete_rows(self, name: str, schema: str | None = None) -> None: + table_name = f"{schema}.{name}" if schema else name + if self.has_table(name, schema): + self.execute(f"DELETE FROM {table_name}").close() + def _create_sql_schema( self, frame: DataFrame, @@ -2433,7 +2453,7 @@ def _create_sql_schema( keys: list[str] | None = None, dtype: DtypeArg | None = None, schema: str | None = None, - ): + ) -> str: raise NotImplementedError("not implemented for adbc") @@ -2450,7 +2470,7 @@ def _create_sql_schema( } -def _get_unicode_name(name: object): +def _get_unicode_name(name: object) -> str: try: uname = str(name).encode("utf-8", "strict").decode("utf-8") except UnicodeError as err: @@ -2458,7 +2478,7 @@ def _get_unicode_name(name: object): return uname -def _get_valid_sqlite_name(name: object): +def _get_valid_sqlite_name(name: object) -> str: # See https://stackoverflow.com/questions/6514274/how-do-you-escape-strings\ # -for-sqlite-table-column-names-in-python # Ensure the string can be encoded as UTF-8. 
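`_get_valid_sqlite_name` (now annotated above to return `str`) follows the quoting recipe from the linked Stack Overflow answer. In essence (a simplified restatement, not the verbatim pandas source):

```python
def quote_sqlite_identifier(name: object) -> str:
    # Ensure the name round-trips as UTF-8, escape embedded double
    # quotes, and wrap the result so SQLite parses it as an identifier.
    uname = str(name).encode("utf-8", "strict").decode("utf-8")
    return '"' + uname.replace('"', '""') + '"'

print(quote_sqlite_identifier('weird "table" name'))  # "weird ""table"" name"
```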
@@ -2520,9 +2540,9 @@ def sql_schema(self) -> str: return str(";\n".join(self.table)) def _execute_create(self) -> None: - with self.pd_sql.run_transaction() as conn: + with self.pd_sql.run_transaction() as cur: for stmt in self.table: - conn.execute(stmt) + cur.execute(stmt) def insert_statement(self, *, num_rows: int) -> str: names = list(map(str, self.frame.columns)) @@ -2544,8 +2564,13 @@ def insert_statement(self, *, num_rows: int) -> str: return insert_statement def _execute_insert(self, conn, keys, data_iter) -> int: + from sqlite3 import Error + data_list = list(data_iter) - conn.executemany(self.insert_statement(num_rows=1), data_list) + try: + conn.executemany(self.insert_statement(num_rows=1), data_list) + except Error as exc: + raise DatabaseError("Execution failed") from exc return conn.rowcount def _execute_insert_multi(self, conn, keys, data_iter) -> int: @@ -2590,7 +2615,7 @@ def _create_table_setup(self): ] ix_cols = [cname for cname, _, is_index in column_names_and_types if is_index] - if len(ix_cols): + if ix_cols: cnames = "_".join(ix_cols) cnames_br = ",".join([escape(c) for c in ix_cols]) create_stmts.append( @@ -2667,6 +2692,8 @@ def run_transaction(self): cur.close() def execute(self, sql: str | Select | TextClause, params=None): + from sqlite3 import Error + if not isinstance(sql, str): raise TypeError("Query must be a string unless using sqlalchemy.") args = [] if params is None else [params] @@ -2674,10 +2701,10 @@ def execute(self, sql: str | Select | TextClause, params=None): try: cur.execute(sql, *args) return cur - except Exception as exc: + except Error as exc: try: self.con.rollback() - except Exception as inner_exc: # pragma: no cover + except Error as inner_exc: # pragma: no cover ex = DatabaseError( f"Execution failed on sql: {sql}\n{exc}\nunable to rollback" ) @@ -2696,7 +2723,7 @@ def _query_iterator( parse_dates=None, dtype: DtypeArg | None = None, dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", - ): + ) -> Generator[DataFrame]: """Return generator through chunked result set""" has_read_data = False while True: @@ -2793,10 +2820,11 @@ def to_sql( frame: DataFrame name: string Name of SQL table. - if_exists: {'fail', 'replace', 'append'}, default 'fail' + if_exists: {'fail', 'replace', 'append', 'delete_rows'}, default 'fail' fail: If table exists, do nothing. replace: If table exists, drop it, recreate it, and insert data. append: If table exists, insert data. Create if it does not exist. + delete_rows: If a table exists, delete all records and insert data. 
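Both the SQLAlchemy and SQLite paths now surface failures as pandas' `DatabaseError` (wrapping `SQLAlchemyError` and `sqlite3.Error` respectively), so callers can catch a single exception type regardless of driver; the exact message format may vary by version:

```python
import sqlite3

import pandas as pd

con = sqlite3.connect(":memory:")
try:
    pd.read_sql("SELECT * FROM no_such_table", con)
except pd.errors.DatabaseError as exc:
    print(f"caught: {exc}")
con.close()
```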
index : bool, default True Write DataFrame index as a column index_label : string or sequence, default None @@ -2831,7 +2859,7 @@ def to_sql( # Type[str], Type[float], Type[int], Type[complex], Type[bool], # Type[object]]]]"; expected type "Union[ExtensionDtype, str, # dtype[Any], Type[object]]" - dtype = {col_name: dtype for col_name in frame} # type: ignore[misc] + dtype = dict.fromkeys(frame, dtype) # type: ignore[arg-type] else: dtype = cast(dict, dtype) @@ -2870,7 +2898,12 @@ def get_table(self, table_name: str, schema: str | None = None) -> None: def drop_table(self, name: str, schema: str | None = None) -> None: drop_sql = f"DROP TABLE {_get_valid_sqlite_name(name)}" - self.execute(drop_sql) + self.execute(drop_sql).close() + + def delete_rows(self, name: str, schema: str | None = None) -> None: + delete_sql = f"DELETE FROM {_get_valid_sqlite_name(name)}" + if self.has_table(name, schema): + self.execute(delete_sql).close() def _create_sql_schema( self, @@ -2879,7 +2912,7 @@ def _create_sql_schema( keys=None, dtype: DtypeArg | None = None, schema: str | None = None, - ): + ) -> str: table = SQLiteTable( table_name, self, diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 0f097c6059c7c..cd290710ddbaa 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -9,6 +9,7 @@ You can find more information on http://presbrey.mit.edu/PyDTA and https://www.statsmodels.org/devel/ """ + from __future__ import annotations from collections import abc @@ -24,7 +25,6 @@ IO, TYPE_CHECKING, AnyStr, - Callable, Final, cast, ) @@ -47,9 +47,11 @@ ) from pandas.util._exceptions import find_stack_level +from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import ( ensure_object, is_numeric_dtype, + is_string_dtype, ) from pandas.core.dtypes.dtypes import CategoricalDtype @@ -60,10 +62,7 @@ Timestamp, isna, to_datetime, - to_timedelta, ) -from pandas.core.arrays.boolean import BooleanDtype -from pandas.core.arrays.integer import IntegerDtype from pandas.core.frame import DataFrame from pandas.core.indexes.base import Index from pandas.core.indexes.range import RangeIndex @@ -74,6 +73,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Sequence, ) @@ -91,9 +91,9 @@ _version_error = ( "Version of given Stata file is {version}. pandas supports importing " - "versions 105, 108, 111 (Stata 7SE), 113 (Stata 8/9), " - "114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16)," - "and 119 (Stata 15/16, over 32,767 variables)." + "versions 102, 103, 104, 105, 108, 110 (Stata 7), 111 (Stata 7SE), " + "113 (Stata 8/9), 114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), " + "118 (Stata 14/15/16), and 119 (Stata 15/16, over 32,767 variables)." ) _statafile_processing_params1 = """\ @@ -161,7 +161,8 @@ Returns ------- -DataFrame or pandas.api.typing.StataReader +DataFrame, pandas.api.typing.StataReader + If iterator or chunksize, returns StataReader, else DataFrame. See Also -------- @@ -176,7 +177,7 @@ Creating a dummy stata for this example >>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon', 'parrot'], -... 'speed': [350, 18, 361, 15]}}) # doctest: +SKIP +... 
'speed': [350, 18, 361, 15]}}) # doctest: +SKIP >>> df.to_stata('animals.dta') # doctest: +SKIP Read a Stata dta file: @@ -189,7 +190,7 @@ >>> df = pd.DataFrame(values, columns=["i"]) # doctest: +SKIP >>> df.to_stata('filename.dta') # doctest: +SKIP ->>> with pd.read_stata('filename.dta', chunksize=10000) as itr: # doctest: +SKIP +>>> with pd.read_stata('filename.dta', chunksize=10000) as itr: # doctest: +SKIP >>> for chunk in itr: ... # Operate on a single chunk, e.g., chunk.mean() ... pass # doctest: +SKIP @@ -216,7 +217,7 @@ Parameters ---------- path_or_buf : path (string), buffer or path object - string, path object (pathlib.Path or py._path.local.LocalPath) or object + string, pathlib.Path or object implementing a binary read() functions. {_statafile_processing_params1} {_statafile_processing_params2} @@ -232,6 +233,7 @@ stata_epoch: Final = datetime(1960, 1, 1) +unix_epoch: Final = datetime(1970, 1, 1) def _stata_elapsed_date_to_datetime_vec(dates: Series, fmt: str) -> Series: @@ -254,9 +256,9 @@ def _stata_elapsed_date_to_datetime_vec(dates: Series, fmt: str) -> Series: Examples -------- >>> dates = pd.Series([52]) - >>> _stata_elapsed_date_to_datetime_vec(dates , "%tw") + >>> _stata_elapsed_date_to_datetime_vec(dates, "%tw") 0 1961-01-01 - dtype: datetime64[ns] + dtype: datetime64[s] Notes ----- @@ -280,64 +282,43 @@ def _stata_elapsed_date_to_datetime_vec(dates: Series, fmt: str) -> Series: date - ty years since 0000 """ - MIN_YEAR, MAX_YEAR = Timestamp.min.year, Timestamp.max.year - MAX_DAY_DELTA = (Timestamp.max - datetime(1960, 1, 1)).days - MIN_DAY_DELTA = (Timestamp.min - datetime(1960, 1, 1)).days - MIN_MS_DELTA = MIN_DAY_DELTA * 24 * 3600 * 1000 - MAX_MS_DELTA = MAX_DAY_DELTA * 24 * 3600 * 1000 - def convert_year_month_safe(year, month) -> Series: - """ - Convert year and month to datetimes, using pandas vectorized versions - when the date range falls within the range supported by pandas. - Otherwise it falls back to a slower but more robust method - using datetime. - """ - if year.max() < MAX_YEAR and year.min() > MIN_YEAR: - return to_datetime(100 * year + month, format="%Y%m") - else: - index = getattr(year, "index", None) - return Series([datetime(y, m, 1) for y, m in zip(year, month)], index=index) + if fmt.startswith(("%tc", "tc")): + # Delta ms relative to base + td = np.timedelta64(stata_epoch - unix_epoch, "ms") + res = np.array(dates._values, dtype="M8[ms]") + td + return Series(res, index=dates.index) - def convert_year_days_safe(year, days) -> Series: - """ - Converts year (e.g. 1999) and days since the start of the year to a - datetime or datetime64 Series - """ - if year.max() < (MAX_YEAR - 1) and year.min() > MIN_YEAR: - return to_datetime(year, format="%Y") + to_timedelta(days, unit="d") - else: - index = getattr(year, "index", None) - value = [ - datetime(y, 1, 1) + timedelta(days=int(d)) for y, d in zip(year, days) - ] - return Series(value, index=index) - - def convert_delta_safe(base, deltas, unit) -> Series: - """ - Convert base dates and deltas to datetimes, using pandas vectorized - versions if the deltas satisfy restrictions required to be expressed - as dates in pandas. 
- """ - index = getattr(deltas, "index", None) - if unit == "d": - if deltas.max() > MAX_DAY_DELTA or deltas.min() < MIN_DAY_DELTA: - values = [base + timedelta(days=int(d)) for d in deltas] - return Series(values, index=index) - elif unit == "ms": - if deltas.max() > MAX_MS_DELTA or deltas.min() < MIN_MS_DELTA: - values = [ - base + timedelta(microseconds=(int(d) * 1000)) for d in deltas - ] - return Series(values, index=index) - else: - raise ValueError("format not understood") - base = to_datetime(base) - deltas = to_timedelta(deltas, unit=unit) - return base + deltas + elif fmt.startswith(("%td", "td", "%d", "d")): + # Delta days relative to base + td = np.timedelta64(stata_epoch - unix_epoch, "D") + res = np.array(dates._values, dtype="M8[D]") + td + return Series(res, index=dates.index) + + elif fmt.startswith(("%tm", "tm")): + # Delta months relative to base + ordinals = dates + (stata_epoch.year - unix_epoch.year) * 12 + res = np.array(ordinals, dtype="M8[M]").astype("M8[s]") + return Series(res, index=dates.index) + + elif fmt.startswith(("%tq", "tq")): + # Delta quarters relative to base + ordinals = dates + (stata_epoch.year - unix_epoch.year) * 4 + res = np.array(ordinals, dtype="M8[3M]").astype("M8[s]") + return Series(res, index=dates.index) + + elif fmt.startswith(("%th", "th")): + # Delta half-years relative to base + ordinals = dates + (stata_epoch.year - unix_epoch.year) * 2 + res = np.array(ordinals, dtype="M8[6M]").astype("M8[s]") + return Series(res, index=dates.index) + + elif fmt.startswith(("%ty", "ty")): + # Years -- not delta + ordinals = dates - 1970 + res = np.array(ordinals, dtype="M8[Y]").astype("M8[s]") + return Series(res, index=dates.index) - # TODO(non-nano): If/when pandas supports more than datetime64[ns], this - # should be improved to use correct range, e.g. datetime[Y] for yearly bad_locs = np.isnan(dates) has_bad_values = False if bad_locs.any(): @@ -345,11 +326,7 @@ def convert_delta_safe(base, deltas, unit) -> Series: dates._values[bad_locs] = 1.0 # Replace with NaT dates = dates.astype(np.int64) - if fmt.startswith(("%tc", "tc")): # Delta ms relative to base - base = stata_epoch - ms = dates - conv_dates = convert_delta_safe(base, ms, "ms") - elif fmt.startswith(("%tC", "tC")): + if fmt.startswith(("%tC", "tC")): warnings.warn( "Encountered %tC format. Leaving in Stata Internal Format.", stacklevel=find_stack_level(), @@ -358,33 +335,18 @@ def convert_delta_safe(base, deltas, unit) -> Series: if has_bad_values: conv_dates[bad_locs] = NaT return conv_dates - # Delta days relative to base - elif fmt.startswith(("%td", "td", "%d", "d")): - base = stata_epoch - days = dates - conv_dates = convert_delta_safe(base, days, "d") # does not count leap days - 7 days is a week. 
# 52nd week may have more than 7 days elif fmt.startswith(("%tw", "tw")): year = stata_epoch.year + dates // 52 days = (dates % 52) * 7 - conv_dates = convert_year_days_safe(year, days) - elif fmt.startswith(("%tm", "tm")): # Delta months relative to base - year = stata_epoch.year + dates // 12 - month = (dates % 12) + 1 - conv_dates = convert_year_month_safe(year, month) - elif fmt.startswith(("%tq", "tq")): # Delta quarters relative to base - year = stata_epoch.year + dates // 4 - quarter_month = (dates % 4) * 3 + 1 - conv_dates = convert_year_month_safe(year, quarter_month) - elif fmt.startswith(("%th", "th")): # Delta half-years relative to base - year = stata_epoch.year + dates // 2 - month = (dates % 2) * 6 + 1 - conv_dates = convert_year_month_safe(year, month) - elif fmt.startswith(("%ty", "ty")): # Years -- not delta - year = dates - first_month = np.ones_like(dates) - conv_dates = convert_year_month_safe(year, first_month) + per_y = (year - 1970).array.view("Period[Y]") + per_d = per_y.asfreq("D", how="S") + per_d_shifted = per_d + days._values + per_s = per_d_shifted.asfreq("s", how="S") + conv_dates_arr = per_s.view("M8[s]") + conv_dates = Series(conv_dates_arr, index=dates.index) + else: raise ValueError(f"Date fmt {fmt} not understood") @@ -409,24 +371,26 @@ def _datetime_to_stata_elapsed_vec(dates: Series, fmt: str) -> Series: index = dates.index NS_PER_DAY = 24 * 3600 * 1000 * 1000 * 1000 US_PER_DAY = NS_PER_DAY / 1000 + MS_PER_DAY = NS_PER_DAY / 1_000_000 def parse_dates_safe( dates: Series, delta: bool = False, year: bool = False, days: bool = False - ): + ) -> DataFrame: d = {} if lib.is_np_dtype(dates.dtype, "M"): if delta: - time_delta = dates - Timestamp(stata_epoch).as_unit("ns") - d["delta"] = time_delta._values.view(np.int64) // 1000 # microseconds + time_delta = dates.dt.as_unit("ms") - Timestamp(stata_epoch).as_unit( + "ms" + ) + d["delta"] = time_delta._values.view(np.int64) if days or year: date_index = DatetimeIndex(dates) d["year"] = date_index._data.year d["month"] = date_index._data.month if days: - days_in_ns = dates._values.view(np.int64) - to_datetime( - d["year"], format="%Y" - )._values.view(np.int64) - d["days"] = days_in_ns // NS_PER_DAY + year_start = np.asarray(dates).astype("M8[Y]").astype(dates.dtype) + diff = dates - year_start + d["days"] = np.asarray(diff).astype("m8[D]").view("int64") elif infer_dtype(dates, skipna=False) == "datetime": if delta: @@ -466,7 +430,7 @@ def g(x: datetime) -> int: if fmt in ["%tc", "tc"]: d = parse_dates_safe(dates, delta=True) - conv_dates = d.delta / 1000 + conv_dates = d.delta elif fmt in ["%tC", "tC"]: warnings.warn( "Stata Internal Format tC not supported.", @@ -475,7 +439,7 @@ def g(x: datetime) -> int: conv_dates = dates elif fmt in ["%td", "td"]: d = parse_dates_safe(dates, delta=True) - conv_dates = d.delta // US_PER_DAY + conv_dates = d.delta // MS_PER_DAY elif fmt in ["%tw", "tw"]: d = parse_dates_safe(dates, year=True, days=True) conv_dates = 52 * (d.year - stata_epoch.year) + d.days // 7 @@ -591,17 +555,26 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame: for col in data: # Cast from unsupported types to supported types - is_nullable_int = isinstance(data[col].dtype, (IntegerDtype, BooleanDtype)) + is_nullable_int = ( + isinstance(data[col].dtype, ExtensionDtype) + and data[col].dtype.kind in "iub" + ) # We need to find orig_missing before altering data below orig_missing = data[col].isna() if is_nullable_int: - missing_loc = data[col].isna() - if missing_loc.any(): - # Replace with always 
safe value - fv = 0 if isinstance(data[col].dtype, IntegerDtype) else False - data.loc[missing_loc, col] = fv + fv = 0 if data[col].dtype.kind in "iu" else False # Replace with NumPy-compatible column - data[col] = data[col].astype(data[col].dtype.numpy_dtype) + data[col] = data[col].fillna(fv).astype(data[col].dtype.numpy_dtype) + elif isinstance(data[col].dtype, ExtensionDtype): + if getattr(data[col].dtype, "numpy_dtype", None) is not None: + data[col] = data[col].astype(data[col].dtype.numpy_dtype) + elif is_string_dtype(data[col].dtype): + # TODO could avoid converting string dtype to object here, + # but handle string dtype in _encode_strings + data[col] = data[col].astype("object") + # generate_table checks for None values + data.loc[data[col].isna(), col] = None + dtype = data[col].dtype empty_df = data.shape[0] == 0 for c_data in conversion_data: @@ -687,7 +660,7 @@ def __init__( self._prepare_value_labels() - def _prepare_value_labels(self): + def _prepare_value_labels(self) -> None: """Encode value labels.""" self.text_len = 0 @@ -718,12 +691,6 @@ def _prepare_value_labels(self): self.txt.append(category) self.n += 1 - if self.text_len > 32000: - raise ValueError( - "Stata value labels for a single variable must " - "have a combined length less than 32,000 characters." - ) - # Ensure int32 self.off = np.array(offsets, dtype=np.int32) self.val = np.array(values, dtype=np.int32) @@ -1014,6 +981,19 @@ def __init__(self) -> None: np.float64(struct.unpack(" None: # These missing values are the generic '.' in Stata, and are used # to replace nans - self.MISSING_VALUES = { + self.MISSING_VALUES: dict[str, int | np.float32 | np.float64] = { "b": 101, "h": 32741, "l": 2147483621, @@ -1152,11 +1132,9 @@ def __init__( # State variables for the file self._close_file: Callable[[], None] | None = None - self._missing_values = False - self._can_read_value_labels = False self._column_selector_set = False + self._value_label_dict: dict[str, dict[int, str]] = {} self._value_labels_read = False - self._data_read = False self._dtype: np.dtype | None = None self._lines_read = 0 @@ -1214,24 +1192,6 @@ def __exit__( if self._close_file: self._close_file() - def close(self) -> None: - """Close the handle if its open. - - .. deprecated: 2.0.0 - - The close method is not part of the public API. - The only supported way to use StataReader is to use it as a context manager. - """ - warnings.warn( - "The StataReader.close() method is not part of the public API and " - "will be removed in a future version without notice. 
" - "Using StataReader as a context manager is the only supported method.", - FutureWarning, - stacklevel=find_stack_level(), - ) - if self._close_file: - self._close_file() - def _set_encoding(self) -> None: """ Set string encoding which depends on file version @@ -1403,8 +1363,10 @@ def _get_variable_labels(self) -> list[str]: def _get_nobs(self) -> int: if self._format_version >= 118: return self._read_uint64() - else: + elif self._format_version >= 103: return self._read_uint32() + else: + return self._read_uint16() def _get_data_label(self) -> str: if self._format_version >= 118: @@ -1428,7 +1390,7 @@ def _get_time_stamp(self) -> str: elif self._format_version > 104: return self._decode(self._path_or_buf.read(18)) else: - raise ValueError() + raise ValueError def _get_seek_variable_labels(self) -> int: if self._format_version == 117: @@ -1440,13 +1402,28 @@ def _get_seek_variable_labels(self) -> int: elif self._format_version >= 118: return self._read_int64() + 17 else: - raise ValueError() + raise ValueError def _read_old_header(self, first_char: bytes) -> None: self._format_version = int(first_char[0]) - if self._format_version not in [104, 105, 108, 111, 113, 114, 115]: + if self._format_version not in [ + 102, + 103, + 104, + 105, + 108, + 110, + 111, + 113, + 114, + 115, + ]: raise ValueError(_version_error.format(version=self._format_version)) self._set_encoding() + # Note 102 format will have a zero in this header position, so support + # relies on little-endian being set whenever this value isn't one, + # even though for later releases strictly speaking the value should + # be either one or two to be valid self._byteorder = ">" if self._read_int8() == 0x1 else "<" self._filetype = self._read_int8() self._path_or_buf.read(1) # unused @@ -1456,10 +1433,11 @@ def _read_old_header(self, first_char: bytes) -> None: self._data_label = self._get_data_label() - self._time_stamp = self._get_time_stamp() + if self._format_version >= 105: + self._time_stamp = self._get_time_stamp() # descriptors - if self._format_version > 108: + if self._format_version >= 111: typlist = [int(c) for c in self._path_or_buf.read(self._nvar)] else: buf = self._path_or_buf.read(self._nvar) @@ -1554,17 +1532,8 @@ def _decode(self, s: bytes) -> str: ) return s.decode("latin-1") - def _read_value_labels(self) -> None: - self._ensure_open() - if self._value_labels_read: - # Don't read twice - return - if self._format_version <= 108: - # Value labels are not supported in version 108 and earlier. 
- self._value_labels_read = True - self._value_label_dict: dict[str, dict[float, str]] = {} - return - + def _read_new_value_labels(self) -> None: + """Reads value labels with variable length strings (108 and later format)""" if self._format_version >= 117: self._path_or_buf.seek(self._seek_value_labels) else: @@ -1572,9 +1541,6 @@ def _read_value_labels(self) -> None: offset = self._nobs * self._dtype.itemsize self._path_or_buf.seek(self._data_location + offset) - self._value_labels_read = True - self._value_label_dict = {} - while True: if self._format_version >= 117: if self._path_or_buf.read(5) == b" @@ -1582,8 +1548,10 @@ def _read_value_labels(self) -> None: slength = self._path_or_buf.read(4) if not slength: - break # end of value label table (format < 117) - if self._format_version <= 117: + break # end of value label table (format < 117), or end-of-file + if self._format_version == 108: + labname = self._decode(self._path_or_buf.read(9)) + elif self._format_version <= 117: labname = self._decode(self._path_or_buf.read(33)) else: labname = self._decode(self._path_or_buf.read(129)) @@ -1607,8 +1575,45 @@ def _read_value_labels(self) -> None: self._value_label_dict[labname][val[i]] = self._decode( txt[off[i] : end] ) + if self._format_version >= 117: self._path_or_buf.read(6) # + + def _read_old_value_labels(self) -> None: + """Reads value labels with fixed-length strings (105 and earlier format)""" + assert self._dtype is not None + offset = self._nobs * self._dtype.itemsize + self._path_or_buf.seek(self._data_location + offset) + + while True: + if not self._path_or_buf.read(2): + # end-of-file may have been reached, if so stop here + break + + # otherwise back up and read again, taking byteorder into account + self._path_or_buf.seek(-2, os.SEEK_CUR) + n = self._read_uint16() + labname = self._decode(self._path_or_buf.read(9)) + self._path_or_buf.read(1) # padding + codes = np.frombuffer( + self._path_or_buf.read(2 * n), dtype=f"{self._byteorder}i2", count=n + ) + self._value_label_dict[labname] = {} + for i in range(n): + self._value_label_dict[labname][codes[i]] = self._decode( + self._path_or_buf.read(8) + ) + + def _read_value_labels(self) -> None: + self._ensure_open() + if self._value_labels_read: + # Don't read twice + return + + if self._format_version >= 108: + self._read_new_value_labels() + else: + self._read_old_value_labels() self._value_labels_read = True def _read_strls(self) -> None: @@ -1623,14 +1628,13 @@ def _read_strls(self) -> None: v_o = self._read_uint64() else: buf = self._path_or_buf.read(12) - # Only tested on little endian file on little endian machine. + # Only tested on little endian machine. v_size = 2 if self._format_version == 118 else 3 if self._byteorder == "<": buf = buf[0:v_size] + buf[4 : (12 - v_size)] else: - # This path may not be correct, impossible to test - buf = buf[0:v_size] + buf[(4 + v_size) :] - v_o = struct.unpack("Q", buf)[0] + buf = buf[4 - v_size : 4] + buf[(4 + v_size) :] + v_o = struct.unpack(f"{self._byteorder}Q", buf)[0] typ = self._read_uint8() length = self._read_uint32() va = self._path_or_buf.read(length) @@ -1699,8 +1703,6 @@ def read( # StopIteration. If reading the whole thing return an empty # data frame. 
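The `_read_strls` fix above makes the `Q` unpack honor the file's byte order instead of assuming the native one. A toy round-trip shows why the format prefix matters:

```python
import struct

value = 0x0102030405060708
little = struct.pack("<Q", value)
# Unpacking with the wrong byte order silently yields a different number:
print(hex(struct.unpack("<Q", little)[0]))  # 0x102030405060708
print(hex(struct.unpack(">Q", little)[0]))  # 0x807060504030201
```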
if (self._nobs == 0) and nrows == 0: - self._can_read_value_labels = True - self._data_read = True data = DataFrame(columns=self._varlist) # Apply dtypes correctly for i, col in enumerate(data.columns): @@ -1713,7 +1715,6 @@ def read( return data if (self._format_version >= 117) and (not self._value_labels_read): - self._can_read_value_labels = True self._read_strls() # Read data @@ -1736,9 +1737,7 @@ def read( ) self._lines_read += read_lines - if self._lines_read == self._nobs: - self._can_read_value_labels = True - self._data_read = True + # if necessary, swap the byte order to native here if self._byteorder != self._native_byteorder: raw_data = raw_data.byteswap().view(raw_data.dtype.newbyteorder()) @@ -1786,7 +1785,7 @@ def read( i, _stata_elapsed_date_to_datetime_vec(data.iloc[:, i], fmt) ) - if convert_categoricals and self._format_version > 108: + if convert_categoricals: data = self._do_convert_categoricals( data, self._value_label_dict, self._lbllist, order_categoricals ) @@ -1816,15 +1815,31 @@ def read( return data def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFrame: + # missing code for double was different in version 105 and prior + old_missingdouble = float.fromhex("0x1.0p333") + # Check for missing values, and replace if found replacements = {} for i in range(len(data.columns)): fmt = self._typlist[i] - if fmt not in self.VALID_RANGE: - continue + # recode instances of the old missing code to the currently used value + if self._format_version <= 105 and fmt == "d": + data.iloc[:, i] = data.iloc[:, i].replace( + old_missingdouble, self.MISSING_VALUES["d"] + ) + + if self._format_version <= 111: + if fmt not in self.OLD_VALID_RANGE: + continue - fmt = cast(str, fmt) # only strs in VALID_RANGE - nmin, nmax = self.VALID_RANGE[fmt] + fmt = cast(str, fmt) # only strs in OLD_VALID_RANGE + nmin, nmax = self.OLD_VALID_RANGE[fmt] + else: + if fmt not in self.VALID_RANGE: + continue + + fmt = cast(str, fmt) # only strs in VALID_RANGE + nmin, nmax = self.VALID_RANGE[fmt] series = data.iloc[:, i] # appreciably faster to do this with ndarray instead of Series @@ -1839,7 +1854,12 @@ def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFra umissing, umissing_loc = np.unique(series[missing], return_inverse=True) replacement = Series(series, dtype=object) for j, um in enumerate(umissing): - missing_value = StataMissingValue(um) + if self._format_version <= 111: + missing_value = StataMissingValue( + float(self.MISSING_VALUES[fmt]) + ) + else: + missing_value = StataMissingValue(um) loc = missing_loc[umissing_loc == j] replacement.iloc[loc] = missing_value @@ -1848,10 +1868,6 @@ def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFra if dtype not in (np.float32, np.float64): dtype = np.float64 replacement = Series(series, dtype=dtype) - if not replacement._values.flags["WRITEABLE"]: - # only relevant for ArrayManager; construction - # path for BlockManager ensures writeability - replacement = replacement.copy() # Note: operating on ._values is much faster than directly # TODO: can we fix that? 
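The `old_missingdouble` constant above is written as a hex float literal, which pins down the exact value of the pre-106 double missing code: `0x1.0p333` is exactly 2**333.

```python
print(float.fromhex("0x1.0p333"))              # about 1.7498e+100
print(float.fromhex("0x1.0p333") == 2.0**333)  # True
```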
replacement._values[missing] = np.nan @@ -1889,7 +1905,7 @@ def _do_select_columns(self, data: DataFrame, columns: Sequence[str]) -> DataFra fmtlist = [] lbllist = [] for col in columns: - i = data.columns.get_loc(col) + i = data.columns.get_loc(col) # type: ignore[no-untyped-call] dtyplist.append(self._dtyplist[i]) typlist.append(self._typlist[i]) fmtlist.append(self._fmtlist[i]) @@ -1906,7 +1922,7 @@ def _do_select_columns(self, data: DataFrame, columns: Sequence[str]) -> DataFra def _do_convert_categoricals( self, data: DataFrame, - value_label_dict: dict[str, dict[float, str]], + value_label_dict: dict[str, dict[int, str]], lbllist: Sequence[str], order_categoricals: bool, ) -> DataFrame: @@ -1986,15 +2002,28 @@ def data_label(self) -> str: """ Return data label of Stata file. + The data label is a descriptive string associated with the dataset + stored in the Stata file. This property provides access to that + label, if one is present. + + See Also + -------- + io.stata.StataReader.variable_labels : Return a dict associating each variable + name with corresponding label. + DataFrame.to_stata : Export DataFrame object to Stata dta format. + Examples -------- >>> df = pd.DataFrame([(1,)], columns=["variable"]) >>> time_stamp = pd.Timestamp(2000, 2, 29, 14, 21) >>> data_label = "This is a data file." >>> path = "/My_path/filename.dta" - >>> df.to_stata(path, time_stamp=time_stamp, # doctest: +SKIP - ... data_label=data_label, # doctest: +SKIP - ... version=None) # doctest: +SKIP + >>> df.to_stata( + ... path, + ... time_stamp=time_stamp, # doctest: +SKIP + ... data_label=data_label, # doctest: +SKIP + ... version=None, + ... ) # doctest: +SKIP >>> with pd.io.stata.StataReader(path) as reader: # doctest: +SKIP ... print(reader.data_label) # doctest: +SKIP This is a data file. @@ -2014,9 +2043,19 @@ def variable_labels(self) -> dict[str, str]: """ Return a dict associating each variable name with corresponding label. + This method retrieves variable labels from a Stata file. Variable labels are + mappings between variable names and their corresponding descriptive labels + in a Stata dataset. + Returns ------- dict + A python dictionary. + + See Also + -------- + read_stata : Read Stata file into DataFrame. + DataFrame.to_stata : Export DataFrame object to Stata dta format. Examples -------- @@ -2024,8 +2063,12 @@ def variable_labels(self) -> dict[str, str]: >>> time_stamp = pd.Timestamp(2000, 2, 29, 14, 21) >>> path = "/My_path/filename.dta" >>> variable_labels = {"col_1": "This is an example"} - >>> df.to_stata(path, time_stamp=time_stamp, # doctest: +SKIP - ... variable_labels=variable_labels, version=None) # doctest: +SKIP + >>> df.to_stata( + ... path, + ... time_stamp=time_stamp, # doctest: +SKIP + ... variable_labels=variable_labels, + ... version=None, + ... ) # doctest: +SKIP >>> with pd.io.stata.StataReader(path) as reader: # doctest: +SKIP ... print(reader.variable_labels()) # doctest: +SKIP {'index': '', 'col_1': 'This is an example', 'col_2': ''} @@ -2037,13 +2080,23 @@ def variable_labels(self) -> dict[str, str]: self._ensure_open() return dict(zip(self._varlist, self._variable_labels)) - def value_labels(self) -> dict[str, dict[float, str]]: + def value_labels(self) -> dict[str, dict[int, str]]: """ Return a nested dict associating each variable name to its value and label. + This method retrieves the value labels from a Stata file. Value labels are + mappings between the coded values and their corresponding descriptive labels + in a Stata dataset. 
+ Returns ------- dict + A python dictionary. + + See Also + -------- + read_stata : Read Stata file into DataFrame. + DataFrame.to_stata : Export DataFrame object to Stata dta format. Examples -------- @@ -2051,8 +2104,12 @@ def value_labels(self) -> dict[str, dict[float, str]]: >>> time_stamp = pd.Timestamp(2000, 2, 29, 14, 21) >>> path = "/My_path/filename.dta" >>> value_labels = {"col_1": {3: "x"}} - >>> df.to_stata(path, time_stamp=time_stamp, # doctest: +SKIP - ... value_labels=value_labels, version=None) # doctest: +SKIP + >>> df.to_stata( + ... path, + ... time_stamp=time_stamp, # doctest: +SKIP + ... value_labels=value_labels, + ... version=None, + ... ) # doctest: +SKIP >>> with pd.io.stata.StataReader(path) as reader: # doctest: +SKIP ... print(reader.value_labels()) # doctest: +SKIP {'col_1': {3: 'x'}} @@ -2149,15 +2206,15 @@ def _convert_datetime_to_stata_type(fmt: str) -> np.dtype: def _maybe_convert_to_int_keys(convert_dates: dict, varlist: list[Hashable]) -> dict: new_dict = {} - for key in convert_dates: + for key, value in convert_dates.items(): if not convert_dates[key].startswith("%"): # make sure proper fmts - convert_dates[key] = "%" + convert_dates[key] + convert_dates[key] = "%" + value if key in varlist: - new_dict.update({varlist.index(key): convert_dates[key]}) + new_dict[varlist.index(key)] = convert_dates[key] else: if not isinstance(key, int): raise ValueError("convert_dates key must be a column or an integer") - new_dict.update({key: convert_dates[key]}) + new_dict[key] = convert_dates[key] return new_dict @@ -2198,7 +2255,7 @@ def _dtype_to_stata_type(dtype: np.dtype, column: Series) -> int: def _dtype_to_default_stata_fmt( - dtype, column: Series, dta_version: int = 114, force_strl: bool = False + dtype: np.dtype, column: Series, dta_version: int = 114, force_strl: bool = False ) -> str: """ Map numpy dtype to stata's default format for this type. Not terribly @@ -2253,7 +2310,7 @@ class StataWriter(StataParser): Parameters ---------- fname : path (string), buffer or path object - string, path object (pathlib.Path or py._path.local.LocalPath) or + string, pathlib.Path or object implementing a binary write() functions. If using a buffer then the buffer will not be automatically closed after the file is written. 
@@ -2309,19 +2366,19 @@ class StataWriter(StataParser): Examples -------- - >>> data = pd.DataFrame([[1.0, 1]], columns=['a', 'b']) - >>> writer = StataWriter('./data_file.dta', data) + >>> data = pd.DataFrame([[1.0, 1]], columns=["a", "b"]) + >>> writer = StataWriter("./data_file.dta", data) >>> writer.write_file() Directly write a zip file >>> compression = {{"method": "zip", "archive_name": "data_file.dta"}} - >>> writer = StataWriter('./data_file.zip', data, compression=compression) + >>> writer = StataWriter("./data_file.zip", data, compression=compression) >>> writer.write_file() Save a DataFrame with dates >>> from datetime import datetime - >>> data = pd.DataFrame([[datetime(2000,1,1)]], columns=['date']) - >>> writer = StataWriter('./date_data_file.dta', data, {{'date' : 'tw'}}) + >>> data = pd.DataFrame([[datetime(2000, 1, 1)]], columns=["date"]) + >>> writer = StataWriter("./date_data_file.dta", data, {{"date": "tw"}}) >>> writer.write_file() """ @@ -2666,6 +2723,7 @@ def _encode_strings(self) -> None: continue column = self.data[col] dtype = column.dtype + # TODO could also handle string dtype here specifically if dtype.type is np.object_: inferred_dtype = infer_dtype(column, skipna=True) if not ((inferred_dtype == "string") or len(column) == 0): @@ -2690,20 +2748,36 @@ def write_file(self) -> None: """ Export DataFrame object to Stata dta format. + This method writes the contents of a pandas DataFrame to a `.dta` file + compatible with Stata. It includes features for handling value labels, + variable types, and metadata like timestamps and data labels. The output + file can then be read and used in Stata or other compatible statistical + tools. + + See Also + -------- + read_stata : Read Stata file into DataFrame. + DataFrame.to_stata : Export DataFrame object to Stata dta format. + io.stata.StataWriter : A class for writing Stata binary dta files. + Examples -------- - >>> df = pd.DataFrame({"fully_labelled": [1, 2, 3, 3, 1], - ... "partially_labelled": [1.0, 2.0, np.nan, 9.0, np.nan], - ... "Y": [7, 7, 9, 8, 10], - ... "Z": pd.Categorical(["j", "k", "l", "k", "j"]), - ... }) + >>> df = pd.DataFrame( + ... { + ... "fully_labelled": [1, 2, 3, 3, 1], + ... "partially_labelled": [1.0, 2.0, np.nan, 9.0, np.nan], + ... "Y": [7, 7, 9, 8, 10], + ... "Z": pd.Categorical(["j", "k", "l", "k", "j"]), + ... } + ... ) >>> path = "/My_path/filename.dta" - >>> labels = {"fully_labelled": {1: "one", 2: "two", 3: "three"}, - ... "partially_labelled": {1.0: "one", 2.0: "two"}, - ... } - >>> writer = pd.io.stata.StataWriter(path, - ... df, - ... value_labels=labels) # doctest: +SKIP + >>> labels = { + ... "fully_labelled": {1: "one", 2: "two", 3: "three"}, + ... "partially_labelled": {1.0: "one", 2.0: "two"}, + ... } + >>> writer = pd.io.stata.StataWriter( + ... path, df, value_labels=labels + ... 
) # doctest: +SKIP >>> writer.write_file() # doctest: +SKIP >>> df = pd.read_stata(path) # doctest: +SKIP >>> df # doctest: +SKIP @@ -2805,7 +2879,7 @@ def _write_header( # ds_format - just use 114 self._write_bytes(struct.pack("b", 114)) # byteorder - self._write(byteorder == ">" and "\x01" or "\x02") + self._write((byteorder == ">" and "\x01") or "\x02") # filetype self._write("\x01") # unused @@ -2933,13 +3007,7 @@ def _prepare_data(self) -> np.rec.recarray: for i, col in enumerate(data): typ = typlist[i] if typ <= self._max_string_length: - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - "Downcasting object dtype arrays", - category=FutureWarning, - ) - dc = data[col].fillna("") + dc = data[col].fillna("") data[col] = dc.apply(_pad_bytes, args=(typ,)) stype = f"S{typ}" dtypes[col] = stype @@ -3061,6 +3129,8 @@ def __init__( if byteorder is None: byteorder = sys.byteorder self._byteorder = _set_endianness(byteorder) + # Flag whether chosen byteorder matches the system on which we're running + self._native_byteorder = self._byteorder == _set_endianness(sys.byteorder) gso_v_type = "I" # uint32 gso_o_type = "Q" # uint64 @@ -3073,13 +3143,20 @@ def __init__( o_size = 6 else: # version == 119 o_size = 5 - self._o_offet = 2 ** (8 * (8 - o_size)) + if self._native_byteorder: + self._o_offet = 2 ** (8 * (8 - o_size)) + else: + self._o_offet = 2 ** (8 * o_size) self._gso_o_type = gso_o_type self._gso_v_type = gso_v_type def _convert_key(self, key: tuple[int, int]) -> int: v, o = key - return v + self._o_offet * o + if self._native_byteorder: + return v + self._o_offet * o + else: + # v, o will be swapped when applying byteorder + return o + self._o_offet * v def generate_table(self) -> tuple[dict[str, tuple[int, int]], DataFrame]: """ @@ -3119,8 +3196,8 @@ def generate_table(self) -> tuple[dict[str, tuple[int, int]], DataFrame]: for o, (idx, row) in enumerate(selected.iterrows()): for j, (col, v) in enumerate(col_index): val = row[col] - # Allow columns with mixed str and None (GH 23633) - val = "" if val is None else val + # Allow columns with mixed str and None or pd.NA (GH 23633) + val = "" if isna(val) else val key = gso_table.get(val, None) if key is None: # Stata prefers human numbers @@ -3203,7 +3280,7 @@ class StataWriter117(StataWriter): Parameters ---------- fname : path (string), buffer or path object - string, path object (pathlib.Path or py._path.local.LocalPath) or + string, pathlib.Path or object implementing a binary write() functions. If using a buffer then the buffer will not be automatically closed after the file is written. @@ -3263,22 +3340,24 @@ class StataWriter117(StataWriter): Examples -------- - >>> data = pd.DataFrame([[1.0, 1, 'a']], columns=['a', 'b', 'c']) - >>> writer = pd.io.stata.StataWriter117('./data_file.dta', data) + >>> data = pd.DataFrame([[1.0, 1, "a"]], columns=["a", "b", "c"]) + >>> writer = pd.io.stata.StataWriter117("./data_file.dta", data) >>> writer.write_file() Directly write a zip file >>> compression = {"method": "zip", "archive_name": "data_file.dta"} >>> writer = pd.io.stata.StataWriter117( - ... './data_file.zip', data, compression=compression - ... ) + ... "./data_file.zip", data, compression=compression + ... ) >>> writer.write_file() Or with long strings stored in strl format - >>> data = pd.DataFrame([['A relatively long string'], [''], ['']], - ... columns=['strls']) + >>> data = pd.DataFrame( + ... [["A relatively long string"], [""], [""]], columns=["strls"] + ... ) >>> writer = pd.io.stata.StataWriter117( - ... 
'./data_file_with_long_strings.dta', data, convert_strl=['strls']) + ... "./data_file_with_long_strings.dta", data, convert_strl=["strls"] + ... ) >>> writer.write_file() """ @@ -3346,7 +3425,7 @@ def _write_header( # ds_format - 117 bio.write(self._tag(bytes(str(self._dta_version), "utf-8"), "release")) # byteorder - bio.write(self._tag(byteorder == ">" and "MSF" or "LSF", "byteorder")) + bio.write(self._tag((byteorder == ">" and "MSF") or "LSF", "byteorder")) # number of vars, 2 bytes in 117 and 118, 4 byte in 119 nvar_type = "H" if self._dta_version <= 118 else "I" bio.write(self._tag(struct.pack(byteorder + nvar_type, self.nvar), "K")) @@ -3504,7 +3583,7 @@ def _write_characteristics(self) -> None: self._update_map("characteristics") self._write_bytes(self._tag(b"", "characteristics")) - def _write_data(self, records) -> None: + def _write_data(self, records: np.rec.recarray) -> None: self._update_map("data") self._write_bytes(b"") self._write_bytes(records.tobytes()) @@ -3554,7 +3633,9 @@ def _convert_strls(self, data: DataFrame) -> DataFrame: ] if convert_cols: - ssw = StataStrLWriter(data, convert_cols, version=self._dta_version) + ssw = StataStrLWriter( + data, convert_cols, version=self._dta_version, byteorder=self._byteorder + ) tab, new_data = ssw.generate_table() data = new_data self._strl_blob = ssw.generate_blob(tab) @@ -3589,7 +3670,7 @@ class StataWriterUTF8(StataWriter117): Parameters ---------- fname : path (string), buffer or path object - string, path object (pathlib.Path or py._path.local.LocalPath) or + string, pathlib.Path or object implementing a binary write() functions. If using a buffer then the buffer will not be automatically closed after the file is written. @@ -3656,21 +3737,23 @@ class StataWriterUTF8(StataWriter117): Using Unicode data and column names >>> from pandas.io.stata import StataWriterUTF8 - >>> data = pd.DataFrame([[1.0, 1, 'ᴬ']], columns=['a', 'β', 'ĉ']) - >>> writer = StataWriterUTF8('./data_file.dta', data) + >>> data = pd.DataFrame([[1.0, 1, "ᴬ"]], columns=["a", "β", "ĉ"]) + >>> writer = StataWriterUTF8("./data_file.dta", data) >>> writer.write_file() Directly write a zip file >>> compression = {"method": "zip", "archive_name": "data_file.dta"} - >>> writer = StataWriterUTF8('./data_file.zip', data, compression=compression) + >>> writer = StataWriterUTF8("./data_file.zip", data, compression=compression) >>> writer.write_file() Or with long strings stored in strl format - >>> data = pd.DataFrame([['ᴀ relatively long ŝtring'], [''], ['']], - ... columns=['strls']) - >>> writer = StataWriterUTF8('./data_file_with_long_strings.dta', data, - ... convert_strl=['strls']) + >>> data = pd.DataFrame( + ... [["ᴀ relatively long ŝtring"], [""], [""]], columns=["strls"] + ... ) + >>> writer = StataWriterUTF8( + ... "./data_file_with_long_strings.dta", data, convert_strl=["strls"] + ... 
) >>> writer.write_file() """ diff --git a/pandas/io/xml.py b/pandas/io/xml.py index ac497cd266027..0fcf27af42fde 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -9,9 +9,7 @@ from typing import ( TYPE_CHECKING, Any, - Callable, ) -import warnings from pandas._libs import lib from pandas.compat._optional import import_optional_dependency @@ -20,7 +18,6 @@ ParserError, ) from pandas.util._decorators import doc -from pandas.util._exceptions import find_stack_level from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.common import is_list_like @@ -28,10 +25,8 @@ from pandas.core.shared_docs import _shared_docs from pandas.io.common import ( - file_exists, get_handle, infer_compression, - is_file_like, is_fsspec_url, is_url, stringify_path, @@ -39,7 +34,10 @@ from pandas.io.parsers import TextParser if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import ( + Callable, + Sequence, + ) from xml.etree.ElementTree import Element from lxml import etree @@ -176,7 +174,6 @@ def __init__( self.encoding = encoding self.stylesheet = stylesheet self.iterparse = iterparse - self.is_style = None self.compression: CompressionOptions = compression self.storage_options = storage_options @@ -484,12 +481,12 @@ def _validate_path(self) -> list[Any]: if children == [] and attrs == {}: raise ValueError(msg) - except (KeyError, SyntaxError): + except (KeyError, SyntaxError) as err: raise SyntaxError( "You have used an incorrect or unsupported XPath " "expression for etree library or you used an " "undeclared namespace prefix." - ) + ) from err return elems @@ -528,7 +525,7 @@ def _parse_doc( storage_options=self.storage_options, ) - with preprocess_data(handle_data) as xml_data: + with handle_data as xml_data: curr_parser = XMLParser(encoding=self.encoding) document = parse(xml_data, parser=curr_parser) @@ -635,7 +632,7 @@ def _parse_doc( storage_options=self.storage_options, ) - with preprocess_data(handle_data) as xml_data: + with handle_data as xml_data: curr_parser = XMLParser(encoding=self.encoding) if isinstance(xml_data, io.StringIO): @@ -673,44 +670,27 @@ def get_data_from_filepath( encoding: str | None, compression: CompressionOptions, storage_options: StorageOptions, -) -> str | bytes | ReadBuffer[bytes] | ReadBuffer[str]: +): """ Extract raw XML data. - The method accepts three input types: + The method accepts two input types: 1. filepath (string-like) 2. file-like object (e.g. open file object, StringIO) - 3. XML string or bytes - - This method turns (1) into (2) to simplify the rest of the processing. - It returns input types (2) and (3) unchanged. """ - if not isinstance(filepath_or_buffer, bytes): - filepath_or_buffer = stringify_path(filepath_or_buffer) - - if ( - isinstance(filepath_or_buffer, str) - and not filepath_or_buffer.startswith((" io.StringIO | io.BytesIO: @@ -746,12 +726,12 @@ class that build Data Frame and infers specific dtypes. try: with TextParser(nodes, names=tags, **kwargs) as tp: return tp.read() - except ParserError: + except ParserError as err: raise ParserError( "XML document may be too complex for import. " "Try to flatten document and use distinct " "element and attribute names." 
- ) + ) from err def _parse( @@ -790,22 +770,6 @@ def _parse( p: _EtreeFrameParser | _LxmlFrameParser - if isinstance(path_or_buffer, str) and not any( - [ - is_file_like(path_or_buffer), - file_exists(path_or_buffer), - is_url(path_or_buffer), - is_fsspec_url(path_or_buffer), - ] - ): - warnings.warn( - "Passing literal xml to 'read_xml' is deprecated and " - "will be removed in a future version. To read from a " - "literal string, wrap it in a 'StringIO' object.", - FutureWarning, - stacklevel=find_stack_level(), - ) - if parser == "lxml": lxml = import_optional_dependency("lxml.etree", errors="ignore") @@ -894,8 +858,8 @@ def read_xml( ---------- path_or_buffer : str, path object, or file-like object String, path object (implementing ``os.PathLike[str]``), or file-like - object implementing a ``read()`` function. The string can be any valid XML - string or a path. The string can further be a URL. Valid URL schemes + object implementing a ``read()`` function. The string can be a path. + The string can further be a URL. Valid URL schemes include http, ftp, s3, and file. .. deprecated:: 2.1.0 @@ -916,9 +880,7 @@ def read_xml( Note: if XML document uses default namespace denoted as `xmlns=''` without a prefix, you must assign any temporary namespace prefix such as 'doc' to the URI in order to parse - underlying nodes and/or attributes. For example, :: - - namespaces = {{"doc": "https://example.com"}} + underlying nodes and/or attributes. elems_only : bool, optional, default False Parse only the child elements at the specified ``xpath``. By default, @@ -971,7 +933,7 @@ def read_xml( and ability to use XSLT stylesheet are supported. stylesheet : str, path object or file-like object - A URL, file-like object, or a raw string containing an XSLT script. + A URL, file-like object, or a string path containing an XSLT script. This stylesheet should flatten complex, deeply nested XML documents for easier parsing. To use this feature you must have ``lxml`` module installed and specify 'lxml' as ``parser``. The ``xpath`` must @@ -987,9 +949,7 @@ def read_xml( and unlike ``xpath``, descendants do not need to relate to each other but can exist any where in document under the repeating element. This memory- efficient method should be used for very large XML files (500MB, 1GB, or 5GB+). - For example, :: - - iterparse = {{"row_element": ["child_elem", "attr", "grandchild_elem"]}} + For example, ``{{"row_element": ["child_elem", "attr", "grandchild_elem"]}}``. .. versionadded:: 1.5.0 @@ -999,14 +959,15 @@ def read_xml( {storage_options} - dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable' + dtype_backend : {{'numpy_nullable', 'pyarrow'}} Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). Behaviour is as follows: + (still experimental). If not specified, the default behavior + is to not use nullable data types. If specified, the behavior + is as follows: * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. + * ``"pyarrow"``: returns pyarrow-backed nullable + :class:`ArrowDtype` :class:`DataFrame` .. versionadded:: 2.0 @@ -1118,9 +1079,11 @@ def read_xml( ... ... ''' - >>> df = pd.read_xml(StringIO(xml), - ... xpath="//doc:row", - ... namespaces={{"doc": "https://example.com"}}) + >>> df = pd.read_xml( + ... StringIO(xml), + ... xpath="//doc:row", + ... namespaces={{"doc": "https://example.com"}}, + ... 
) >>> df shape degrees sides 0 square 360 4.0 @@ -1147,9 +1110,9 @@ def read_xml( ... ... ''' - >>> df = pd.read_xml(StringIO(xml_data), - ... dtype_backend="numpy_nullable", - ... parse_dates=["e"]) + >>> df = pd.read_xml( + ... StringIO(xml_data), dtype_backend="numpy_nullable", parse_dates=["e"] + ... ) >>> df index a b c d e 0 0 1 2.5 True a 2019-12-31 diff --git a/pandas/meson.build b/pandas/meson.build index 435103a954d86..840ac257bba09 100644 --- a/pandas/meson.build +++ b/pandas/meson.build @@ -1,7 +1,8 @@ -incdir_numpy = run_command(py, - [ - '-c', - ''' +incdir_numpy = run_command( + py, + [ + '-c', + ''' import os import numpy as np try: @@ -12,9 +13,9 @@ try: except Exception: incdir = np.get_include() print(incdir) - ''' - ], - check: true + ''', + ], + check: true, ).stdout().strip() inc_np = include_directories(incdir_numpy) @@ -36,9 +37,9 @@ subdirs_list = [ 'plotting', 'tests', 'tseries', - 'util' + 'util', ] -foreach subdir: subdirs_list +foreach subdir : subdirs_list install_subdir(subdir, install_dir: py.get_install_dir() / 'pandas') endforeach @@ -47,6 +48,6 @@ top_level_py_list = [ '_typing.py', '_version.py', 'conftest.py', - 'testing.py' + 'testing.py', ] py.install_sources(top_level_py_list, subdir: 'pandas') diff --git a/pandas/plotting/__init__.py b/pandas/plotting/__init__.py index 55c861e384d67..837bfaf82ca27 100644 --- a/pandas/plotting/__init__.py +++ b/pandas/plotting/__init__.py @@ -55,6 +55,7 @@ For the discussion about the API see https://github.com/pandas-dev/pandas/issues/26747. """ + from pandas.plotting._core import ( PlotAccessor, boxplot, @@ -79,20 +80,20 @@ __all__ = [ "PlotAccessor", + "andrews_curves", + "autocorrelation_plot", + "bootstrap_plot", "boxplot", "boxplot_frame", "boxplot_frame_groupby", + "deregister_matplotlib_converters", "hist_frame", "hist_series", - "scatter_matrix", - "radviz", - "andrews_curves", - "bootstrap_plot", - "parallel_coordinates", "lag_plot", - "autocorrelation_plot", - "table", + "parallel_coordinates", "plot_params", + "radviz", "register_matplotlib_converters", - "deregister_matplotlib_converters", + "scatter_matrix", + "table", ] diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index cb5598a98d5af..9670b5439c87e 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -3,7 +3,6 @@ import importlib from typing import ( TYPE_CHECKING, - Callable, Literal, ) @@ -27,6 +26,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Sequence, ) @@ -39,11 +39,16 @@ from pandas import ( DataFrame, + Index, Series, ) from pandas.core.groupby.generic import DataFrameGroupBy +def holds_integer(column: Index) -> bool: + return column.inferred_type in {"integer", "mixed-integer"} + + def hist_series( self: Series, by=None, @@ -98,7 +103,7 @@ def hist_series( Returns ------- - matplotlib.AxesSubplot + matplotlib.axes.Axes A histogram plot. See Also @@ -112,7 +117,7 @@ def hist_series( .. plot:: :context: close-figs - >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] + >>> lst = ["a", "a", "a", "b", "b", "b"] >>> ser = pd.Series([1, 2, 2, 4, 6, 6], index=lst) >>> hist = ser.hist() @@ -121,7 +126,7 @@ def hist_series( .. 
plot:: :context: close-figs - >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] + >>> lst = ["a", "a", "a", "b", "b", "b"] >>> ser = pd.Series([1, 2, 2, 4, 6, 6], index=lst) >>> hist = ser.groupby(level=0).hist() """
@@ -227,7 +232,8 @@ def hist_frame( Returns ------- - matplotlib.AxesSubplot or numpy.ndarray of them + matplotlib.Axes or numpy.ndarray of them + Returns an AxesSubplot object or a numpy array of AxesSubplot objects. See Also --------
@@ -241,9 +247,11 @@ def hist_frame( .. plot:: :context: close-figs - >>> data = {'length': [1.5, 0.5, 1.2, 0.9, 3], - ... 'width': [0.7, 0.2, 0.15, 0.2, 1.1]} - >>> index = ['pig', 'rabbit', 'duck', 'chicken', 'horse'] + >>> data = { + ... "length": [1.5, 0.5, 1.2, 0.9, 3], + ... "width": [0.7, 0.2, 0.15, 0.2, 1.1], + ... } + >>> index = ["pig", "rabbit", "duck", "chicken", "horse"] >>> df = pd.DataFrame(data, index=index) >>> hist = df.hist(bins=3) """
@@ -331,7 +339,7 @@ def hist_frame( See Also -------- -pandas.Series.plot.hist: Make a histogram. +Series.plot.hist: Make a histogram. matplotlib.pyplot.boxplot : Matplotlib equivalent plot. Notes -----
@@ -423,7 +431,7 @@ def hist_frame( >>> boxplot = df.boxplot(column=['Col1', 'Col2'], by='X', ... return_type='axes') >>> type(boxplot) - <class 'matplotlib.axes._subplots.AxesSubplot'> + <class 'matplotlib.axes._axes.Axes'> If ``return_type`` is `None`, a NumPy array of axes with the same shape as ``layout`` is returned:
@@ -565,18 +573,23 @@ def boxplot_frame_groupby( Parameters ---------- - grouped : Grouped DataFrame + grouped : DataFrameGroupBy + The grouped DataFrame object over which to create the box plots. subplots : bool * ``False`` - no subplots will be used * ``True`` - create a subplot for each group. - column : column name or list of names, or vector Can be any valid input to groupby. fontsize : float or str - rot : label rotation angle - grid : Setting this to True will show the grid + Font size for the labels. + rot : float + Rotation angle of labels (in degrees) on the x-axis. + grid : bool + Whether to show grid lines on the plot. ax : Matplotlib axis object, default None - figsize : A tuple (width, height) in inches + The axes on which to draw the plots. If None, uses the current axes. + figsize : tuple of (float, float) + The figure size in inches (width, height). layout : tuple (optional) The layout of the plot: (rows, columns). sharex : bool, default False
@@ -594,8 +607,15 @@ def boxplot_frame_groupby( Returns ------- - dict of key/value = group key/DataFrame.boxplot return value - or DataFrame.boxplot return value in case subplots=figures=False + dict or DataFrame.boxplot return value + If ``subplots=True``, returns a dictionary of group keys to the boxplot + return values. If ``subplots=False``, returns the boxplot return value + of a single DataFrame. + + See Also + -------- + DataFrame.boxplot : Create a box plot from a DataFrame. + Series.plot : Plot a Series. Examples --------
@@ -606,10 +626,10 @@ def boxplot_frame_groupby( >>> import itertools >>> tuples = [t for t in itertools.product(range(1000), range(4))] - >>> index = pd.MultiIndex.from_tuples(tuples, names=['lvl0', 'lvl1']) + >>> index = pd.MultiIndex.from_tuples(tuples, names=["lvl0", "lvl1"]) >>> data = np.random.randn(len(index), 4) - >>> df = pd.DataFrame(data, columns=list('ABCD'), index=index) - >>> grouped = df.groupby(level='lvl1') + >>> df = pd.DataFrame(data, columns=list("ABCD"), index=index) + >>> grouped = df.groupby(level="lvl1") >>> grouped.boxplot(rot=45, fontsize=12, figsize=(8, 10)) # doctest: +SKIP The ``subplots=False`` option shows the boxplots in a single figure. 
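As a runnable sketch of the two return shapes the rewritten docstring describes (matplotlib required; the frame here is illustrative):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.randn(12, 2), columns=["A", "B"])
    df["grp"] = ["x", "y", "z"] * 4
    grouped = df.groupby("grp")

    # subplots=True (default): a mapping of group key -> boxplot result
    per_group = grouped.boxplot(subplots=True)

    # subplots=False: a single boxplot result for the combined frame
    combined = grouped.boxplot(subplots=False)
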
@@ -647,6 +667,9 @@ class PlotAccessor(PandasObject): ---------- data : Series or DataFrame The object for which the method is called. + + Attributes + ---------- x : label or position, default None Only used if data is a DataFrame. y : label, position or list of label, positions, default None
@@ -786,6 +809,21 @@ class PlotAccessor(PandasObject): If the backend is not the default matplotlib one, the return value will be the object returned by the backend. + See Also + -------- + matplotlib.pyplot.plot : Plot y versus x as lines and/or markers. + DataFrame.hist : Make a histogram. + DataFrame.boxplot : Make a box plot. + DataFrame.plot.scatter : Make a scatter plot with varying marker + point size and color. + DataFrame.plot.hexbin : Make a hexagonal binning plot of + two variables. + DataFrame.plot.kde : Make Kernel Density Estimate plot using + Gaussian kernels. + DataFrame.plot.area : Make a stacked area plot. + DataFrame.plot.bar : Make a bar plot. + DataFrame.plot.barh : Make a horizontal bar plot. + Notes ----- - See matplotlib documentation online for more on this subject
@@ -802,16 +840,20 @@ class PlotAccessor(PandasObject): :context: close-figs >>> ser = pd.Series([1, 2, 3, 3]) - >>> plot = ser.plot(kind='hist', title="My plot") + >>> plot = ser.plot(kind="hist", title="My plot") For DataFrame: .. plot:: :context: close-figs - >>> df = pd.DataFrame({'length': [1.5, 0.5, 1.2, 0.9, 3], - ... 'width': [0.7, 0.2, 0.15, 0.2, 1.1]}, - ... index=['pig', 'rabbit', 'duck', 'chicken', 'horse']) + >>> df = pd.DataFrame( + ... { + ... "length": [1.5, 0.5, 1.2, 0.9, 3], + ... "width": [0.7, 0.2, 0.15, 0.2, 1.1], + ... }, + ... index=["pig", "rabbit", "duck", "chicken", "horse"], + ... ) >>> plot = df.plot(title="DataFrame Plot") For SeriesGroupBy:
@@ -828,8 +870,7 @@ class PlotAccessor(PandasObject): .. plot:: :context: close-figs - >>> df = pd.DataFrame({"col1" : [1, 2, 3, 4], - ... "col2" : ["A", "B", "A", "B"]}) + >>> df = pd.DataFrame({"col1": [1, 2, 3, 4], "col2": ["A", "B", "A", "B"]}) >>> plot = df.groupby("col2").plot(kind="bar", title="DataFrameGroupBy Plot") """
@@ -921,7 +962,7 @@ def _get_call_args(backend_name: str, data: Series | DataFrame, args, kwargs): if args and isinstance(data, ABCSeries): positional_args = str(args)[1:-1] keyword_args = ", ".join( - [f"{name}={repr(value)}" for (name, _), value in zip(arg_def, args)] + [f"{name}={value!r}" for (name, _), value in zip(arg_def, args)] ) msg = ( "`Series.plot()` should not be called with positional "
@@ -958,14 +999,10 @@ def __call__(self, *args, **kwargs): if kind not in self._all_kinds: raise ValueError( - f"{kind} is not a valid plot kind " - f"Valid plot kinds: {self._all_kinds}" + f"{kind} is not a valid plot kind. Valid plot kinds: {self._all_kinds}" ) - # The original data structured can be transformed before passed to the - # backend. For example, for DataFrame is common to set the index as the - # `x` parameter, and return a Series with the parameter `y` as values. - data = self._parent.copy() + data = self._parent if isinstance(data, ABCSeries): kwargs["reuse_plot"] = True
@@ -982,15 +1019,15 @@ def __call__(self, *args, **kwargs): f"{kind} requires either y column or 'subplots=True'" ) if y is not None: - if is_integer(y) and not data.columns._holds_integer(): + if is_integer(y) and not holds_integer(data.columns): y = data.columns[y] # converted to series actually. 
copy to not modify - data = data[y].copy() + data = data[y].copy(deep=False) data.index.name = y elif isinstance(data, ABCDataFrame): data_cols = data.columns if x is not None: - if is_integer(x) and not data.columns._holds_integer(): + if is_integer(x) and not holds_integer(data.columns): x = data_cols[x] elif not isinstance(data[x], ABCSeries): raise ValueError("x must be a label or position") @@ -999,7 +1036,7 @@ def __call__(self, *args, **kwargs): # check if we have y as int or list of ints int_ylist = is_list_like(y) and all(is_integer(c) for c in y) int_y_arg = is_integer(y) or int_ylist - if int_y_arg and not data.columns._holds_integer(): + if int_y_arg and not holds_integer(data.columns): y = data_cols[y] label_kw = kwargs["label"] if "label" in kwargs else False @@ -1012,14 +1049,15 @@ def __call__(self, *args, **kwargs): except (IndexError, KeyError, TypeError): pass - # don't overwrite - data = data[y].copy() + data = data[y] if isinstance(data, ABCSeries): label_name = label_kw or y data.name = label_name else: - match = is_list_like(label_kw) and len(label_kw) == len(y) + # error: Argument 1 to "len" has incompatible type "Any | bool"; + # expected "Sized" [arg-type] + match = is_list_like(label_kw) and len(label_kw) == len(y) # type: ignore[arg-type] if label_kw and not match: raise ValueError( "label should be list-like and same length as y" @@ -1053,9 +1091,9 @@ def __call__(self, *args, **kwargs): over the years. >>> df = pd.DataFrame({ - ... 'pig': [20, 18, 489, 675, 1776], - ... 'horse': [4, 25, 281, 600, 1900] - ... }, index=[1990, 1997, 2003, 2009, 2014]) + ... 'pig': [20, 18, 489, 675, 1776], + ... 'horse': [4, 25, 281, 600, 1900] + ... }, index=[1990, 1997, 2003, 2009, 2014]) >>> lines = df.plot.line() .. plot:: @@ -1089,7 +1127,11 @@ def __call__(self, *args, **kwargs): @Substitution(kind="line") @Appender(_bar_or_line_doc) def line( - self, x: Hashable | None = None, y: Hashable | None = None, **kwargs + self, + x: Hashable | None = None, + y: Hashable | None = None, + color: str | Sequence[str] | dict | None = None, + **kwargs, ) -> PlotAccessor: """ Plot Series or DataFrame as lines. @@ -1097,6 +1139,8 @@ def line( This function is useful to plot lines using DataFrame's values as coordinates. """ + if color is not None: + kwargs["color"] = color return self(kind="line", x=x, y=y, **kwargs) @Appender( @@ -1114,7 +1158,7 @@ def line( .. plot:: :context: close-figs - >>> df = pd.DataFrame({'lab':['A', 'B', 'C'], 'val':[10, 30, 20]}) + >>> df = pd.DataFrame({'lab': ['A', 'B', 'C'], 'val': [10, 30, 20]}) >>> ax = df.plot.bar(x='lab', y='val', rot=0) Plot a whole dataframe to a bar plot. Each column is assigned a @@ -1177,8 +1221,12 @@ def line( ) @Substitution(kind="bar") @Appender(_bar_or_line_doc) - def bar( # pylint: disable=disallowed-name - self, x: Hashable | None = None, y: Hashable | None = None, **kwargs + def bar( + self, + x: Hashable | None = None, + y: Hashable | None = None, + color: str | Sequence[str] | dict | None = None, + **kwargs, ) -> PlotAccessor: """ Vertical bar plot. @@ -1189,13 +1237,15 @@ def bar( # pylint: disable=disallowed-name axis of the plot shows the specific categories being compared, and the other axis represents a measured value. """ + if color is not None: + kwargs["color"] = color return self(kind="bar", x=x, y=y, **kwargs) @Appender( """ See Also -------- - DataFrame.plot.bar: Vertical bar plot. + DataFrame.plot.bar : Vertical bar plot. DataFrame.plot : Make plots of DataFrame using matplotlib. 
matplotlib.axes.Axes.bar : Plot a vertical bar plot using matplotlib. @@ -1266,7 +1316,11 @@ def bar( # pylint: disable=disallowed-name @Substitution(kind="bar") @Appender(_bar_or_line_doc) def barh( - self, x: Hashable | None = None, y: Hashable | None = None, **kwargs + self, + x: Hashable | None = None, + y: Hashable | None = None, + color: str | Sequence[str] | dict | None = None, + **kwargs, ) -> PlotAccessor: """ Make a horizontal bar plot. @@ -1277,6 +1331,8 @@ def barh( axis of the plot shows the specific categories being compared, and the other axis represents a measured value. """ + if color is not None: + kwargs["color"] = color return self(kind="barh", x=x, y=y, **kwargs) def box(self, by: IndexLabel | None = None, **kwargs) -> PlotAccessor: @@ -1313,6 +1369,7 @@ def box(self, by: IndexLabel | None = None, **kwargs) -> PlotAccessor: Returns ------- :class:`matplotlib.axes.Axes` or numpy.ndarray of them + The matplotlib axes containing the box plot. See Also -------- @@ -1329,7 +1386,7 @@ def box(self, by: IndexLabel | None = None, **kwargs) -> PlotAccessor: :context: close-figs >>> data = np.random.randn(25, 4) - >>> df = pd.DataFrame(data, columns=list('ABCD')) + >>> df = pd.DataFrame(data, columns=list("ABCD")) >>> ax = df.plot.box() You can also generate groupings if you specify the `by` parameter (which @@ -1374,7 +1431,7 @@ def hist( Returns ------- - class:`matplotlib.AxesSubplot` + :class:`matplotlib.axes.Axes` Return a histogram plot. See Also @@ -1392,8 +1449,8 @@ def hist( .. plot:: :context: close-figs - >>> df = pd.DataFrame(np.random.randint(1, 7, 6000), columns=['one']) - >>> df['two'] = df['one'] + np.random.randint(1, 7, 6000) + >>> df = pd.DataFrame(np.random.randint(1, 7, 6000), columns=["one"]) + >>> df["two"] = df["one"] + np.random.randint(1, 7, 6000) >>> ax = df.plot.hist(bins=12, alpha=0.5) A grouped histogram can be generated by providing the parameter `by` (which @@ -1412,6 +1469,7 @@ def kde( self, bw_method: Literal["scott", "silverman"] | float | Callable | None = None, ind: np.ndarray | int | None = None, + weights: np.ndarray | None = None, **kwargs, ) -> PlotAccessor: """ @@ -1437,6 +1495,9 @@ def kde( 1000 equally spaced points are used. If `ind` is a NumPy array, the KDE is evaluated at the points passed. If `ind` is an integer, `ind` number of equally spaced points are used. + weights : NumPy array, optional + Weights of datapoints. This must be the same shape as datapoints. + If None (default), the samples are assumed to be equally weighted. **kwargs Additional keyword arguments are documented in :meth:`DataFrame.plot`. @@ -1444,6 +1505,7 @@ def kde( Returns ------- matplotlib.axes.Axes or numpy.ndarray of them + The matplotlib axes containing the KDE plot. See Also -------- @@ -1491,10 +1553,12 @@ def kde( .. plot:: :context: close-figs - >>> df = pd.DataFrame({ - ... 'x': [1, 2, 2.5, 3, 3.5, 4, 5], - ... 'y': [4, 4, 4.5, 5, 5.5, 6, 6], - ... }) + >>> df = pd.DataFrame( + ... { + ... "x": [1, 2, 2.5, 3, 3.5, 4, 5], + ... "y": [4, 4, 4.5, 5, 5.5, 6, 6], + ... } + ... ) >>> ax = df.plot.kde() A scalar bandwidth can be specified. Using a small bandwidth value can @@ -1519,7 +1583,7 @@ def kde( >>> ax = df.plot.kde(ind=[1, 2, 3, 4, 5, 6]) """ - return self(kind="kde", bw_method=bw_method, ind=ind, **kwargs) + return self(kind="kde", bw_method=bw_method, ind=ind, weights=weights, **kwargs) density = kde @@ -1556,7 +1620,7 @@ def area( See Also -------- - DataFrame.plot : Make plots of DataFrame using matplotlib / pylab. 
+ DataFrame.plot : Make plots of DataFrame using matplotlib. Examples -------- @@ -1565,12 +1629,16 @@ def area( .. plot:: :context: close-figs - >>> df = pd.DataFrame({ - ... 'sales': [3, 2, 3, 9, 10, 6], - ... 'signups': [5, 5, 6, 12, 14, 13], - ... 'visits': [20, 42, 28, 62, 81, 50], - ... }, index=pd.date_range(start='2018/01/01', end='2018/07/01', - ... freq='ME')) + >>> df = pd.DataFrame( + ... { + ... "sales": [3, 2, 3, 9, 10, 6], + ... "signups": [5, 5, 6, 12, 14, 13], + ... "visits": [20, 42, 28, 62, 81, 50], + ... }, + ... index=pd.date_range( + ... start="2018/01/01", end="2018/07/01", freq="ME" + ... ), + ... ) >>> ax = df.plot.area() Area plots are stacked by default. To produce an unstacked plot, @@ -1586,23 +1654,25 @@ def area( .. plot:: :context: close-figs - >>> ax = df.plot.area(y='sales') + >>> ax = df.plot.area(y="sales") Draw with a different `x`: .. plot:: :context: close-figs - >>> df = pd.DataFrame({ - ... 'sales': [3, 2, 3], - ... 'visits': [20, 42, 28], - ... 'day': [1, 2, 3], - ... }) - >>> ax = df.plot.area(x='day') + >>> df = pd.DataFrame( + ... { + ... "sales": [3, 2, 3], + ... "visits": [20, 42, 28], + ... "day": [1, 2, 3], + ... } + ... ) + >>> ax = df.plot.area(x="day") """ return self(kind="area", x=x, y=y, stacked=stacked, **kwargs) - def pie(self, **kwargs) -> PlotAccessor: + def pie(self, y: IndexLabel | None = None, **kwargs) -> PlotAccessor: """ Generate a pie plot. @@ -1639,16 +1709,19 @@ def pie(self, **kwargs) -> PlotAccessor: .. plot:: :context: close-figs - >>> df = pd.DataFrame({'mass': [0.330, 4.87 , 5.97], - ... 'radius': [2439.7, 6051.8, 6378.1]}, - ... index=['Mercury', 'Venus', 'Earth']) - >>> plot = df.plot.pie(y='mass', figsize=(5, 5)) + >>> df = pd.DataFrame( + ... {"mass": [0.330, 4.87, 5.97], "radius": [2439.7, 6051.8, 6378.1]}, + ... index=["Mercury", "Venus", "Earth"], + ... ) + >>> plot = df.plot.pie(y="mass", figsize=(5, 5)) .. plot:: :context: close-figs >>> plot = df.plot.pie(subplots=True, figsize=(11, 6)) """ + if y is not None: + kwargs["y"] = y if ( isinstance(self._parent, ABCDataFrame) and kwargs.get("y", None) is None @@ -1714,6 +1787,7 @@ def scatter( Returns ------- :class:`matplotlib.axes.Axes` or numpy.ndarray of them + The matplotlib axes containing the scatter plot. See Also -------- @@ -1728,22 +1802,26 @@ def scatter( .. plot:: :context: close-figs - >>> df = pd.DataFrame([[5.1, 3.5, 0], [4.9, 3.0, 0], [7.0, 3.2, 1], - ... [6.4, 3.2, 1], [5.9, 3.0, 2]], - ... columns=['length', 'width', 'species']) - >>> ax1 = df.plot.scatter(x='length', - ... y='width', - ... c='DarkBlue') + >>> df = pd.DataFrame( + ... [ + ... [5.1, 3.5, 0], + ... [4.9, 3.0, 0], + ... [7.0, 3.2, 1], + ... [6.4, 3.2, 1], + ... [5.9, 3.0, 2], + ... ], + ... columns=["length", "width", "species"], + ... ) + >>> ax1 = df.plot.scatter(x="length", y="width", c="DarkBlue") And now with the color determined by a column as well. .. plot:: :context: close-figs - >>> ax2 = df.plot.scatter(x='length', - ... y='width', - ... c='species', - ... colormap='viridis') + >>> ax2 = df.plot.scatter( + ... x="length", y="width", c="species", colormap="viridis" + ... ) """ return self(kind="scatter", x=x, y=y, s=s, c=c, **kwargs) @@ -1794,7 +1872,7 @@ def hexbin( Returns ------- - matplotlib.AxesSubplot + matplotlib.Axes The matplotlib ``Axes`` on which the hexbin is plotted. See Also @@ -1812,9 +1890,8 @@ def hexbin( :context: close-figs >>> n = 10000 - >>> df = pd.DataFrame({'x': np.random.randn(n), - ... 
'y': np.random.randn(n)}) - >>> ax = df.plot.hexbin(x='x', y='y', gridsize=20) + >>> df = pd.DataFrame({"x": np.random.randn(n), "y": np.random.randn(n)}) + >>> ax = df.plot.hexbin(x="x", y="y", gridsize=20) The next example uses `C` and `np.sum` as `reduce_C_function`. Note that `'observations'` values ranges from 1 to 5 but the result @@ -1825,17 +1902,21 @@ def hexbin( :context: close-figs >>> n = 500 - >>> df = pd.DataFrame({ - ... 'coord_x': np.random.uniform(-3, 3, size=n), - ... 'coord_y': np.random.uniform(30, 50, size=n), - ... 'observations': np.random.randint(1,5, size=n) - ... }) - >>> ax = df.plot.hexbin(x='coord_x', - ... y='coord_y', - ... C='observations', - ... reduce_C_function=np.sum, - ... gridsize=10, - ... cmap="viridis") + >>> df = pd.DataFrame( + ... { + ... "coord_x": np.random.uniform(-3, 3, size=n), + ... "coord_y": np.random.uniform(30, 50, size=n), + ... "observations": np.random.randint(1, 5, size=n), + ... } + ... ) + >>> ax = df.plot.hexbin( + ... x="coord_x", + ... y="coord_y", + ... C="observations", + ... reduce_C_function=np.sum, + ... gridsize=10, + ... cmap="viridis", + ... ) """ if reduce_C_function is not None: kwargs["reduce_C_function"] = reduce_C_function diff --git a/pandas/plotting/_matplotlib/__init__.py b/pandas/plotting/_matplotlib/__init__.py index 75c61da03795a..ff28868aa0033 100644 --- a/pandas/plotting/_matplotlib/__init__.py +++ b/pandas/plotting/_matplotlib/__init__.py @@ -69,25 +69,25 @@ def plot(data, kind, **kwargs): kwargs["ax"] = getattr(ax, "left_ax", ax) plot_obj = PLOT_CLASSES[kind](data, **kwargs) plot_obj.generate() - plot_obj.draw() + plt.draw_if_interactive() return plot_obj.result __all__ = [ - "plot", - "hist_series", - "hist_frame", - "boxplot", - "boxplot_frame", - "boxplot_frame_groupby", - "table", "andrews_curves", "autocorrelation_plot", "bootstrap_plot", + "boxplot", + "boxplot_frame", + "boxplot_frame_groupby", + "deregister", + "hist_frame", + "hist_series", "lag_plot", "parallel_coordinates", + "plot", "radviz", - "scatter_matrix", "register", - "deregister", + "scatter_matrix", + "table", ] diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index d2b76decaa75d..af77972da8634 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -7,7 +7,7 @@ ) import warnings -from matplotlib.artist import setp +import matplotlib as mpl import numpy as np from pandas._libs import lib @@ -20,6 +20,7 @@ import pandas as pd import pandas.core.common as com +from pandas.util.version import Version from pandas.io.formats.printing import pprint_thing from pandas.plotting._matplotlib.core import ( @@ -54,7 +55,8 @@ def _set_ticklabels(ax: Axes, labels: list[str], is_vertical: bool, **kwargs) -> ticks = ax.get_xticks() if is_vertical else ax.get_yticks() if len(ticks) != len(labels): i, remainder = divmod(len(ticks), len(labels)) - assert remainder == 0, remainder + if Version(mpl.__version__) < Version("3.10"): + assert remainder == 0, remainder labels *= i if is_vertical: ax.set_xticklabels(labels, **kwargs) @@ -82,7 +84,7 @@ def __init__(self, data, return_type: str = "axes", **kwargs) -> None: self.return_type = return_type # Do not call LinePlot.__init__ which may fill nan - MPLPlot.__init__(self, data, **kwargs) # pylint: disable=non-parent-init-called + MPLPlot.__init__(self, data, **kwargs) if self.subplots: # Disable label ax sharing. 
Otherwise, all subplots shows last @@ -121,8 +123,7 @@ def _validate_color_args(self, color, colormap): if colormap is not None: warnings.warn( - "'color' and 'colormap' cannot be used " - "simultaneously. Using 'color'", + "'color' and 'colormap' cannot be used simultaneously. Using 'color'", stacklevel=find_stack_level(), ) @@ -198,10 +199,7 @@ def _make_plot(self, fig: Figure) -> None: else self.data ) - # error: Argument "data" to "_iter_data" of "MPLPlot" has - # incompatible type "object"; expected "DataFrame | - # dict[Hashable, Series | DataFrame]" - for i, (label, y) in enumerate(self._iter_data(data=data)): # type: ignore[arg-type] + for i, (label, y) in enumerate(self._iter_data(data=data)): ax = self._get_ax(i) kwds = self.kwds.copy() @@ -274,13 +272,13 @@ def maybe_color_bp(bp, color_tup, **kwds) -> None: # GH#30346, when users specifying those arguments explicitly, our defaults # for these four kwargs should be overridden; if not, use Pandas settings if not kwds.get("boxprops"): - setp(bp["boxes"], color=color_tup[0], alpha=1) + mpl.artist.setp(bp["boxes"], color=color_tup[0], alpha=1) if not kwds.get("whiskerprops"): - setp(bp["whiskers"], color=color_tup[1], alpha=1) + mpl.artist.setp(bp["whiskers"], color=color_tup[1], alpha=1) if not kwds.get("medianprops"): - setp(bp["medians"], color=color_tup[2], alpha=1) + mpl.artist.setp(bp["medians"], color=color_tup[2], alpha=1) if not kwds.get("capprops"): - setp(bp["caps"], color=color_tup[3], alpha=1) + mpl.artist.setp(bp["caps"], color=color_tup[3], alpha=1) def _grouped_plot_by_column( @@ -311,8 +309,6 @@ def _grouped_plot_by_column( layout=layout, ) - _axes = flatten_axes(axes) - # GH 45465: move the "by" label based on "vert" xlabel, ylabel = kwargs.pop("xlabel", None), kwargs.pop("ylabel", None) if kwargs.get("vert", True): @@ -322,8 +318,7 @@ def _grouped_plot_by_column( ax_values = [] - for i, col in enumerate(columns): - ax = _axes[i] + for ax, col in zip(flatten_axes(axes), columns): gp_col = grouped[col] keys, values = zip(*gp_col) re_plotf = plotf(keys, values, ax, xlabel=xlabel, ylabel=ylabel, **kwargs) @@ -371,8 +366,8 @@ def _get_colors(): # num_colors=3 is required as method maybe_color_bp takes the colors # in positions 0 and 2. 
# if colors not provided, use same defaults as DataFrame.plot.box - result = get_standard_colors(num_colors=3) - result = np.take(result, [0, 0, 2]) + result_list = get_standard_colors(num_colors=3) + result = np.take(result_list, [0, 0, 2]) result = np.append(result, "k") colors = kwds.pop("color", None) @@ -455,7 +450,7 @@ def plot_group(keys, values, ax: Axes, **kwds): if ax is None: rc = {"figure.figsize": figsize} if figsize is not None else {} - with plt.rc_context(rc): + with mpl.rc_context(rc): ax = plt.gca() data = data._get_numeric_data() naxes = len(data.columns) @@ -531,25 +526,18 @@ def boxplot_frame_groupby( figsize=figsize, layout=layout, ) - axes = flatten_axes(axes) - - ret = pd.Series(dtype=object) - - for (key, group), ax in zip(grouped, axes): + data = {} + for (key, group), ax in zip(grouped, flatten_axes(axes)): d = group.boxplot( ax=ax, column=column, fontsize=fontsize, rot=rot, grid=grid, **kwds ) ax.set_title(pprint_thing(key)) - ret.loc[key] = d + data[key] = d + ret = pd.Series(data) maybe_adjust_figure(fig, bottom=0.15, top=0.9, left=0.1, right=0.9, wspace=0.2) else: keys, frames = zip(*grouped) - if grouped.axis == 0: - df = pd.concat(frames, keys=keys, axis=1) - elif len(frames) > 1: - df = frames[0].join(frames[1::]) - else: - df = frames[0] + df = pd.concat(frames, keys=keys, axis=1) # GH 16748, DataFrameGroupby fails when subplots=False and `column` argument # is assigned, and in this case, since `df` here becomes MI after groupby, diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py index 0eb3318ac96c5..774062e0f0412 100644 --- a/pandas/plotting/_matplotlib/converter.py +++ b/pandas/plotting/_matplotlib/converter.py @@ -4,7 +4,6 @@ import datetime as pydt from datetime import ( datetime, - timedelta, tzinfo, ) import functools @@ -15,13 +14,8 @@ ) import warnings +import matplotlib as mpl import matplotlib.dates as mdates -from matplotlib.ticker import ( - AutoLocator, - Formatter, - Locator, -) -from matplotlib.transforms import nonsingular import matplotlib.units as munits import numpy as np @@ -98,7 +92,7 @@ def wrapper(*args, **kwargs): @contextlib.contextmanager -def pandas_converters() -> Generator[None, None, None]: +def pandas_converters() -> Generator[None]: """ Context manager registering pandas' converters for a plot. 
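The ``boxplot_frame_groupby`` rewrite above replaces repeated ``ret.loc[key] = d`` insertions into an empty object-dtype Series with a plain dict that is turned into a Series once; schematically (the values here are stand-ins, not pandas internals):

    import pandas as pd

    results = {}
    for key in ["a", "b", "c"]:
        results[key] = key.upper()  # stand-in for the per-group boxplot result
    ret = pd.Series(results)  # one construction; dict insertion order is kept
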
@@ -175,7 +169,7 @@ def axisinfo(unit, axis) -> munits.AxisInfo | None: if unit != "time": return None - majloc = AutoLocator() + majloc = mpl.ticker.AutoLocator() # pyright: ignore[reportAttributeAccessIssue] majfmt = TimeFormatter(majloc) return munits.AxisInfo(majloc=majloc, majfmt=majfmt, label="time") @@ -185,7 +179,7 @@ def default_units(x, axis) -> str: # time formatter -class TimeFormatter(Formatter): +class TimeFormatter(mpl.ticker.Formatter): # pyright: ignore[reportAttributeAccessIssue] def __init__(self, locs) -> None: self.locs = locs @@ -231,16 +225,20 @@ def __call__(self, x, pos: int | None = 0) -> str: class PeriodConverter(mdates.DateConverter): @staticmethod def convert(values, units, axis): + if not hasattr(axis, "freq"): + raise TypeError("Axis must have `freq` set to convert to Periods") + return PeriodConverter.convert_from_freq(values, axis.freq) + + @staticmethod + def convert_from_freq(values, freq): if is_nested_list_like(values): - values = [PeriodConverter._convert_1d(v, units, axis) for v in values] + values = [PeriodConverter._convert_1d(v, freq) for v in values] else: - values = PeriodConverter._convert_1d(values, units, axis) + values = PeriodConverter._convert_1d(values, freq) return values @staticmethod - def _convert_1d(values, units, axis): - if not hasattr(axis, "freq"): - raise TypeError("Axis must have `freq` set to convert to Periods") + def _convert_1d(values, freq): valid_types = (str, datetime, Period, pydt.date, pydt.time, np.datetime64) with warnings.catch_warnings(): warnings.filterwarnings( @@ -254,17 +252,17 @@ def _convert_1d(values, units, axis): or is_integer(values) or is_float(values) ): - return get_datevalue(values, axis.freq) + return get_datevalue(values, freq) elif isinstance(values, PeriodIndex): - return values.asfreq(axis.freq).asi8 + return values.asfreq(freq).asi8 elif isinstance(values, Index): - return values.map(lambda x: get_datevalue(x, axis.freq)) + return values.map(lambda x: get_datevalue(x, freq)) elif lib.infer_dtype(values, skipna=False) == "period": # https://github.com/pandas-dev/pandas/issues/24304 # convert ndarray[period] -> PeriodIndex - return PeriodIndex(values, freq=axis.freq).asi8 + return PeriodIndex(values, freq=freq).asi8 elif isinstance(values, (list, tuple, np.ndarray, Index)): - return [get_datevalue(x, axis.freq) for x in values] + return [get_datevalue(x, freq) for x in values] return values @@ -431,7 +429,7 @@ def __call__(self): freq = f"{interval}ms" tz = self.tz.tzname(None) st = dmin.replace(tzinfo=None) - ed = dmin.replace(tzinfo=None) + ed = dmax.replace(tzinfo=None) all_dates = date_range(start=st, end=ed, freq=freq, tz=tz).astype(object) try: @@ -460,28 +458,6 @@ def autoscale(self): return self.nonsingular(vmin, vmax) -def _from_ordinal(x, tz: tzinfo | None = None) -> datetime: - ix = int(x) - dt = datetime.fromordinal(ix) - remainder = float(x) - ix - hour, remainder = divmod(24 * remainder, 1) - minute, remainder = divmod(60 * remainder, 1) - second, remainder = divmod(60 * remainder, 1) - microsecond = int(1_000_000 * remainder) - if microsecond < 10: - microsecond = 0 # compensate for rounding errors - dt = datetime( - dt.year, dt.month, dt.day, int(hour), int(minute), int(second), microsecond - ) - if tz is not None: - dt = dt.astimezone(tz) - - if microsecond > 999990: # compensate for rounding errors - dt += timedelta(microseconds=1_000_000 - microsecond) - - return dt - - # Fixed frequency dynamic tick locators and formatters # 
------------------------------------------------------------------------- @@ -555,7 +531,7 @@ def _get_periods_per_ymd(freq: BaseOffset) -> tuple[int, int, int]: ppd = -1 # placeholder for above-day freqs - if dtype_code >= FreqGroup.FR_HR.value: + if dtype_code >= FreqGroup.FR_HR.value: # pyright: ignore[reportAttributeAccessIssue] # error: "BaseOffset" has no attribute "_creso" ppd = periods_per_day(freq._creso) # type: ignore[attr-defined] ppm = 28 * ppd @@ -584,7 +560,8 @@ def _get_periods_per_ymd(freq: BaseOffset) -> tuple[int, int, int]: return ppd, ppm, ppy -def _daily_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: +@functools.cache +def _daily_finder(vmin: float, vmax: float, freq: BaseOffset) -> np.ndarray: # error: "BaseOffset" has no attribute "_period_dtype_code" dtype_code = freq._period_dtype_code # type: ignore[attr-defined] @@ -711,7 +688,7 @@ def _second_finder(label_interval: int) -> None: elif span <= periodsperyear // 4: month_start = _period_break(dates_, "month") info_maj[month_start] = True - if dtype_code < FreqGroup.FR_HR.value: + if dtype_code < FreqGroup.FR_HR.value: # pyright: ignore[reportAttributeAccessIssue] info["min"] = True else: day_start = _period_break(dates_, "day") @@ -783,7 +760,8 @@ def _second_finder(label_interval: int) -> None: return info -def _monthly_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: +@functools.cache +def _monthly_finder(vmin: float, vmax: float, freq: BaseOffset) -> np.ndarray: _, _, periodsperyear = _get_periods_per_ymd(freq) vmin_orig = vmin @@ -854,7 +832,8 @@ def _monthly_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: return info -def _quarterly_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: +@functools.cache +def _quarterly_finder(vmin: float, vmax: float, freq: BaseOffset) -> np.ndarray: _, _, periodsperyear = _get_periods_per_ymd(freq) vmin_orig = vmin (vmin, vmax) = (int(vmin), int(vmax)) @@ -901,7 +880,8 @@ def _quarterly_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: return info -def _annual_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: +@functools.cache +def _annual_finder(vmin: float, vmax: float, freq: BaseOffset) -> np.ndarray: # Note: small difference here vs other finders in adding 1 to vmax (vmin, vmax) = (int(vmin), int(vmax + 1)) span = vmax - vmin + 1 @@ -934,13 +914,13 @@ def get_finder(freq: BaseOffset): return _quarterly_finder elif fgroup == FreqGroup.FR_MTH: return _monthly_finder - elif (dtype_code >= FreqGroup.FR_BUS.value) or fgroup == FreqGroup.FR_WK: + elif (dtype_code >= FreqGroup.FR_BUS.value) or fgroup == FreqGroup.FR_WK: # pyright: ignore[reportAttributeAccessIssue] return _daily_finder else: # pragma: no cover raise NotImplementedError(f"Unsupported frequency: {dtype_code}") -class TimeSeries_DateLocator(Locator): +class TimeSeries_DateLocator(mpl.ticker.Locator): # pyright: ignore[reportAttributeAccessIssue] """ Locates the ticks along an axis controlled by a :class:`Series`. @@ -1021,7 +1001,7 @@ def autoscale(self): if vmin == vmax: vmin -= 1 vmax += 1 - return nonsingular(vmin, vmax) + return mpl.transforms.nonsingular(vmin, vmax) # ------------------------------------------------------------------------- @@ -1029,7 +1009,7 @@ def autoscale(self): # ------------------------------------------------------------------------- -class TimeSeries_DateFormatter(Formatter): +class TimeSeries_DateFormatter(mpl.ticker.Formatter): # pyright: ignore[reportAttributeAccessIssue] """ Formats the ticks along an axis controlled by a :class:`PeriodIndex`. 
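A minimal sketch of why the ``@functools.cache`` decorators added to the finders above work: the finders now take hashable scalars (plus a frequency offset), so repeated redraws of the same window are served from the cache (``slow_finder`` is an illustrative stand-in, not pandas code):

    import functools

    @functools.cache
    def slow_finder(vmin: float, vmax: float) -> tuple[int, int]:
        # stands in for the expensive tick-location computation
        return (int(vmin), int(vmax))

    slow_finder(0.0, 10.0)  # computed
    slow_finder(0.0, 10.0)  # cache hit, no recomputation
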
@@ -1105,7 +1085,7 @@ def __call__(self, x, pos: int | None = 0) -> str: return period.strftime(fmt) -class TimeSeries_TimedeltaFormatter(Formatter): +class TimeSeries_TimedeltaFormatter(mpl.ticker.Formatter): # pyright: ignore[reportAttributeAccessIssue] """ Formats the ticks along an axis controlled by a :class:`TimedeltaIndex`. """ diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 479a5e19dc1c5..1c7e1ab57b2a9 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -55,20 +55,20 @@ from pandas.core.dtypes.missing import isna import pandas.core.common as com -from pandas.core.frame import DataFrame -from pandas.util.version import Version from pandas.io.formats.printing import pprint_thing from pandas.plotting._matplotlib import tools -from pandas.plotting._matplotlib.converter import register_pandas_matplotlib_converters +from pandas.plotting._matplotlib.converter import ( + PeriodConverter, + register_pandas_matplotlib_converters, +) from pandas.plotting._matplotlib.groupby import reconstruct_data_with_by from pandas.plotting._matplotlib.misc import unpack_single_str_list from pandas.plotting._matplotlib.style import get_standard_colors from pandas.plotting._matplotlib.timeseries import ( - decorate_axes, format_dateaxis, maybe_convert_index, - maybe_resample, + prepare_ts_data, use_dynamic_x, ) from pandas.plotting._matplotlib.tools import ( @@ -93,16 +93,22 @@ npt, ) - from pandas import Series + from pandas import ( + DataFrame, + Index, + Series, + ) + + +def holds_integer(column: Index) -> bool: + return column.inferred_type in {"integer", "mixed-integer"} def _color_in_style(style: str) -> bool: """ Check if there is a color letter in the style string. """ - from matplotlib.colors import BASE_COLORS - - return not set(BASE_COLORS).isdisjoint(style) + return not set(mpl.colors.BASE_COLORS).isdisjoint(style) class MPLPlot(ABC): @@ -169,8 +175,6 @@ def __init__( style=None, **kwds, ) -> None: - import matplotlib.pyplot as plt - # if users assign an empty list or tuple, raise `ValueError` # similar to current `df.box` and `df.hist` APIs. 
if by in ([], ()): @@ -180,7 +184,7 @@ def __init__( # Assign the rest of columns into self.columns if by is explicitly defined # while column is not, only need `columns` in hist/box plot when it's DF # TODO: Might deprecate `column` argument in future PR (#28373) - if isinstance(data, DataFrame): + if isinstance(data, ABCDataFrame): if column: self.columns = com.maybe_make_list(column) elif self.by is None: @@ -231,7 +235,7 @@ def __init__( self.rot = self._default_rot if grid is None: - grid = False if secondary_y else plt.rcParams["axes.grid"] + grid = False if secondary_y else mpl.rcParams["axes.grid"] self.grid = grid self.legend = legend @@ -285,12 +289,27 @@ def __init__( self.data = self._ensure_frame(self.data) + from pandas.plotting import plot_params + + self.x_compat = plot_params["x_compat"] + if "x_compat" in self.kwds: + self.x_compat = bool(self.kwds.pop("x_compat")) + + @final + def _is_ts_plot(self) -> bool: + # this is slightly deceptive + return not self.x_compat and self.use_index and self._use_dynamic_x() + + @final + def _use_dynamic_x(self) -> bool: + return use_dynamic_x(self._get_ax(0), self.data.index) + @final @staticmethod def _validate_sharex(sharex: bool | None, ax, by) -> bool: if sharex is None: # if by is defined, subplots are used and sharex should be False - if ax is None and by is None: # pylint: disable=simplifiable-if-statement + if ax is None and by is None: sharex = True else: # if we get an axis, the users should do the visibility @@ -448,7 +467,9 @@ def _validate_color_args(self, color, colormap): ) if self.style is not None: - if is_list_like(self.style): + if isinstance(self.style, dict): + styles = [self.style[col] for col in self.columns if col in self.style] + elif is_list_like(self.style): styles = self.style else: styles = [self.style] @@ -465,7 +486,7 @@ def _validate_color_args(self, color, colormap): @final @staticmethod def _iter_data( - data: DataFrame | dict[Hashable, Series | DataFrame] + data: DataFrame | dict[Hashable, Series | DataFrame], ) -> Iterator[tuple[Hashable, np.ndarray]]: for col, values in data.items(): # This was originally written to use values.values before EAs @@ -490,10 +511,6 @@ def _get_nseries(self, data: Series | DataFrame) -> int: def nseries(self) -> int: return self._get_nseries(self.data) - @final - def draw(self) -> None: - self.plt.draw_if_interactive() - @final def generate(self) -> None: self._compute_plot_data() @@ -547,7 +564,7 @@ def _maybe_right_yaxis(self, ax: Axes, axes_num: int) -> Axes: new_ax.set_yscale("log") elif self.logy == "sym" or self.loglog == "sym": new_ax.set_yscale("symlog") - return new_ax # type: ignore[return-value] + return new_ax @final @cache_readonly @@ -563,6 +580,8 @@ def axes(self) -> Sequence[Axes]: @final @cache_readonly def _axes_and_fig(self) -> tuple[Sequence[Axes], Figure]: + import matplotlib.pyplot as plt + if self.subplots: naxes = ( self.nseries if isinstance(self.subplots, bool) else len(self.subplots) @@ -577,7 +596,7 @@ def _axes_and_fig(self) -> tuple[Sequence[Axes], Figure]: layout_type=self._layout_type, ) elif self.ax is None: - fig = self.plt.figure(figsize=self.figsize) + fig = plt.figure(figsize=self.figsize) axes = fig.add_subplot(111) else: fig = self.ax.get_figure() @@ -585,7 +604,7 @@ def _axes_and_fig(self) -> tuple[Sequence[Axes], Figure]: fig.set_size_inches(self.figsize) axes = self.ax - axes = flatten_axes(axes) + axes = np.fromiter(flatten_axes(axes), dtype=object) if self.logx is True or self.loglog is True: [a.set_xscale("log") for a in axes] 
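A usage sketch for the ``x_compat`` switch wired into ``MPLPlot.__init__`` above: it can be set globally through ``pd.plotting.plot_params`` or passed per call, where it is popped from ``self.kwds`` (matplotlib required):

    import numpy as np
    import pandas as pd

    ts = pd.Series(np.arange(5), index=pd.date_range("2024-01-01", periods=5))

    pd.plotting.plot_params["x_compat"] = True   # global: suppress dynamic x-axis
    ax1 = ts.plot()

    pd.plotting.plot_params["x_compat"] = False
    ax2 = ts.plot(x_compat=True)                 # per-call override
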
@@ -616,8 +635,7 @@ def result(self): # error: Argument 1 to "len" has incompatible type "Union[bool, # Tuple[Any, ...], List[Any], ndarray[Any, Any]]"; expected "Sized" all_sec = ( - is_list_like(self.secondary_y) - and len(self.secondary_y) == self.nseries # type: ignore[arg-type] + is_list_like(self.secondary_y) and len(self.secondary_y) == self.nseries # type: ignore[arg-type] ) if sec_true or all_sec: # if all data is plotted on secondary, return right axes @@ -662,7 +680,7 @@ def _ensure_frame(self, data) -> DataFrame: return data @final - def _compute_plot_data(self): + def _compute_plot_data(self) -> None: data = self.data # GH15079 reconstruct data if by is defined @@ -672,7 +690,7 @@ def _compute_plot_data(self): # GH16953, infer_objects is needed as fallback, for ``Series`` # with ``dtype == object`` - data = data.infer_objects(copy=False) + data = data.infer_objects() include_type = [np.number, "datetime", "datetimetz", "timedelta"] # GH23719, allow plotting boolean @@ -699,7 +717,7 @@ def _compute_plot_data(self): self.data = numeric_data.apply(type(self)._convert_to_ndarray) - def _make_plot(self, fig: Figure): + def _make_plot(self, fig: Figure) -> None: raise AbstractMethodError(self) @final @@ -745,7 +763,7 @@ def _post_plot_logic(self, ax: Axes, data) -> None: """Post process for each axes. Overridden in child classes""" @final - def _adorn_subplots(self, fig: Figure): + def _adorn_subplots(self, fig: Figure) -> None: """Common post process unrelated to data""" if len(self.axes) > 0: all_axes = self._get_subplots(fig) @@ -784,7 +802,13 @@ def _adorn_subplots(self, fig: Figure): if self.title: if self.subplots: if is_list_like(self.title): - if len(self.title) != self.nseries: + if not isinstance(self.subplots, bool): + if len(self.subplots) != len(self.title): + raise ValueError( + f"The number of titles ({len(self.title)}) must equal " + f"the number of subplots ({len(self.subplots)})." 
+ ) + elif len(self.title) != self.nseries: raise ValueError( "The length of `title` must equal the number " "of columns if using `title` of type `list` " @@ -870,10 +894,7 @@ def _make_legend(self) -> None: if leg is not None: title = leg.get_title().get_text() # Replace leg.legend_handles because it misses marker info - if Version(mpl.__version__) < Version("3.7"): - handles = leg.legendHandles - else: - handles = leg.legend_handles + handles = leg.legend_handles labels = [x.get_text() for x in leg.get_texts()] if self.legend: @@ -893,7 +914,13 @@ def _make_legend(self) -> None: elif self.subplots and self.legend: for ax in self.axes: if ax.get_visible(): - ax.legend(loc="best") + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "No artists with labels found to put in legend.", + UserWarning, + ) + ax.legend(loc="best") @final @staticmethod @@ -912,13 +939,6 @@ def _get_ax_legend(ax: Axes): ax = other_ax return ax, leg - @final - @cache_readonly - def plt(self): - import matplotlib.pyplot as plt - - return plt - _need_to_set_index = False @final @@ -1022,7 +1042,7 @@ def _col_idx_to_axis_idx(self, col_idx: int) -> int: return col_idx @final - def _get_ax(self, i: int): + def _get_ax(self, i: int) -> Axes: # get the twinx ax if appropriate if self.subplots: i = self._col_idx_to_axis_idx(i) @@ -1038,7 +1058,7 @@ def _get_ax(self, i: int): return ax @final - def on_right(self, i: int): + def on_right(self, i: int) -> bool: if isinstance(self.secondary_y, bool): return self.secondary_y @@ -1211,16 +1231,11 @@ def _get_errorbars( return errors @final - def _get_subplots(self, fig: Figure): - if Version(mpl.__version__) < Version("3.8"): - from matplotlib.axes import Subplot as Klass - else: - from matplotlib.axes import Axes as Klass - + def _get_subplots(self, fig: Figure) -> list[Axes]: return [ ax for ax in fig.get_axes() - if (isinstance(ax, Klass) and ax.get_subplotspec() is not None) + if (isinstance(ax, mpl.axes.Axes) and ax.get_subplotspec() is not None) ] @final @@ -1247,9 +1262,9 @@ def __init__(self, data, x, y, **kwargs) -> None: MPLPlot.__init__(self, data, **kwargs) if x is None or y is None: raise ValueError(self._kind + " requires an x and y column") - if is_integer(x) and not self.data.columns._holds_integer(): + if is_integer(x) and not holds_integer(self.data.columns): x = self.data.columns[x] - if is_integer(y) and not self.data.columns._holds_integer(): + if is_integer(y) and not holds_integer(self.data.columns): y = self.data.columns[y] self.x = x @@ -1319,14 +1334,24 @@ def __init__( self.norm = norm super().__init__(data, x, y, **kwargs) - if is_integer(c) and not self.data.columns._holds_integer(): + if is_integer(c) and not holds_integer(self.data.columns): c = self.data.columns[c] self.c = c - def _make_plot(self, fig: Figure): + @register_pandas_matplotlib_converters + def _make_plot(self, fig: Figure) -> None: x, y, c, data = self.x, self.y, self.c, self.data ax = self.axes[0] + from pandas import Series + + x_data = data[x] + s = Series(index=x_data) + if use_dynamic_x(ax, s.index): + s = maybe_convert_index(ax, s) + freq, s = prepare_ts_data(s, ax, self.kwds) + x_data = s.index + c_is_column = is_hashable(c) and c in self.data.columns color_by_categorical = c_is_column and isinstance( @@ -1342,8 +1367,24 @@ def _make_plot(self, fig: Figure): label = self.label else: label = None + + # if a list of non-color strings is passed in as c, color points + # by uniqueness of the strings, such same strings get same color + create_colors = not 
self._are_valid_colors(c_values) + if create_colors: + color_mapping = self._get_color_mapping(c_values) + c_values = [color_mapping[s] for s in c_values] + + # build legend for labeling custom colors + ax.legend( + handles=[ + mpl.patches.Circle((0, 0), facecolor=c, label=s) + for s, c in color_mapping.items() + ] + ) + scatter = ax.scatter( - data[x].values, + x_data.values, data[y].values, c=c_values, label=label, @@ -1352,6 +1393,7 @@ def _make_plot(self, fig: Figure): s=self.s, **self.kwds, ) + if cb: cbar_label = c if c_is_column else "" cbar = self._plot_colorbar(ax, fig=fig, label=cbar_label) @@ -1380,7 +1422,7 @@ def _get_c_values(self, color, color_by_categorical: bool, c_is_column: bool): if c is not None and color is not None: raise TypeError("Specify exactly one of `c` and `color`") if c is None and color is None: - c_values = self.plt.rcParams["patch.facecolor"] + c_values = mpl.rcParams["patch.facecolor"] elif color is not None: c_values = color elif color_by_categorical: @@ -1391,6 +1433,30 @@ def _get_c_values(self, color, color_by_categorical: bool, c_is_column: bool): c_values = c return c_values + def _are_valid_colors(self, c_values: Series) -> bool: + # check if c_values contains strings and if these strings are valid mpl colors. + # no need to check numerics as these (and mpl colors) will be validated for us + # in .Axes.scatter._parse_scatter_color_args(...) + unique = np.unique(c_values) + try: + if len(c_values) and all(isinstance(c, str) for c in unique): + mpl.colors.to_rgba_array(unique) + + return True + + except (TypeError, ValueError) as _: + return False + + def _get_color_mapping(self, c_values: Series) -> dict[str, np.ndarray]: + unique = np.unique(c_values) + n_colors = len(unique) + + # passing `None` here will default to :rc:`image.cmap` + cmap = mpl.colormaps.get_cmap(self.colormap) + colors = cmap(np.linspace(0, 1, n_colors)) # RGB tuples + + return dict(zip(unique, colors)) + def _get_norm_and_cmap(self, c_values, color_by_categorical: bool): c = self.c if self.colormap is not None: @@ -1405,12 +1471,10 @@ def _get_norm_and_cmap(self, c_values, color_by_categorical: bool): cmap = None if color_by_categorical and cmap is not None: - from matplotlib import colors - n_cats = len(self.data[c].cat.categories) - cmap = colors.ListedColormap([cmap(i) for i in range(cmap.N)]) + cmap = mpl.colors.ListedColormap([cmap(i) for i in range(cmap.N)]) bounds = np.linspace(0, n_cats, n_cats + 1) - norm = colors.BoundaryNorm(bounds, cmap.N) + norm = mpl.colors.BoundaryNorm(bounds, cmap.N) # TODO: warn that we are ignoring self.norm if user specified it? 
# Doesn't happen in any tests 2023-11-09 else: @@ -1435,7 +1499,7 @@ def _kind(self) -> Literal["hexbin"]: def __init__(self, data, x, y, C=None, *, colorbar: bool = True, **kwargs) -> None: super().__init__(data, x, y, **kwargs) - if is_integer(C) and not self.data.columns._holds_integer(): + if is_integer(C) and not holds_integer(self.data.columns): C = self.data.columns[C] self.C = C @@ -1480,23 +1544,9 @@ def _kind(self) -> Literal["line", "area", "hist", "kde", "box"]: return "line" def __init__(self, data, **kwargs) -> None: - from pandas.plotting import plot_params - MPLPlot.__init__(self, data, **kwargs) if self.stacked: self.data = self.data.fillna(value=0) - self.x_compat = plot_params["x_compat"] - if "x_compat" in self.kwds: - self.x_compat = bool(self.kwds.pop("x_compat")) - - @final - def _is_ts_plot(self) -> bool: - # this is slightly deceptive - return not self.x_compat and self.use_index and self._use_dynamic_x() - - @final - def _use_dynamic_x(self) -> bool: - return use_dynamic_x(self._get_ax(0), self.data) def _make_plot(self, fig: Figure) -> None: if self._is_ts_plot(): @@ -1586,15 +1636,8 @@ def _ts_plot(self, ax: Axes, x, data: Series, style=None, **kwds): # accept x to be consistent with normal plot func, # x is not passed to tsplot as it uses data.index as x coordinate # column_num must be in kwds for stacking purpose - freq, data = maybe_resample(data, ax, kwds) + freq, data = prepare_ts_data(data, ax, kwds) - # Set ax with freq info - decorate_axes(ax, freq) - # digging deeper - if hasattr(ax, "left_ax"): - decorate_axes(ax.left_ax, freq) - if hasattr(ax, "right_ax"): - decorate_axes(ax.right_ax, freq) # TODO #54485 ax._plot_data.append((data, self._kind, kwds)) # type: ignore[attr-defined] @@ -1670,8 +1713,6 @@ def _update_stacker(cls, ax: Axes, stacking_id: int | None, values) -> None: ax._stacker_neg_prior[stacking_id] += values # type: ignore[attr-defined] def _post_plot_logic(self, ax: Axes, data) -> None: - from matplotlib.ticker import FixedLocator - def get_label(i): if is_float(i) and i.is_integer(): i = int(i) @@ -1685,7 +1726,7 @@ def get_label(i): xticklabels = [get_label(x) for x in xticks] # error: Argument 1 to "FixedLocator" has incompatible type "ndarray[Any, # Any]"; expected "Sequence[float]" - ax.xaxis.set_major_locator(FixedLocator(xticks)) # type: ignore[arg-type] + ax.xaxis.set_major_locator(mpl.ticker.FixedLocator(xticks)) # type: ignore[arg-type] ax.set_xticklabels(xticklabels) # If the index is an irregular time series, then by default @@ -1719,13 +1760,7 @@ def _kind(self) -> Literal["area"]: def __init__(self, data, **kwargs) -> None: kwargs.setdefault("stacked", True) - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - "Downcasting object dtype arrays", - category=FutureWarning, - ) - data = data.fillna(value=0) + data = data.fillna(value=0) LinePlot.__init__(self, data, **kwargs) if not self.stacked: @@ -1823,7 +1858,6 @@ def __init__( self.bar_width = width self._align = align self._position = position - self.tick_pos = np.arange(len(data)) if is_list_like(bottom): bottom = np.array(bottom) @@ -1836,6 +1870,16 @@ def __init__( MPLPlot.__init__(self, data, **kwargs) + if self._is_ts_plot(): + self.tick_pos = np.array( + PeriodConverter.convert_from_freq( + self._get_xticks(), + data.index.freq, + ) + ) + else: + self.tick_pos = np.arange(len(data)) + @cache_readonly def ax_pos(self) -> np.ndarray: return self.tick_pos - self.tickoffset @@ -1865,6 +1909,7 @@ def lim_offset(self): # error: Signature of "_plot" 
incompatible with supertype "MPLPlot" @classmethod + @register_pandas_matplotlib_converters def _plot( # type: ignore[override] cls, ax: Axes, @@ -1889,6 +1934,21 @@ def _make_plot(self, fig: Figure) -> None: K = self.nseries data = self.data.fillna(0) + + _stacked_subplots_ind: dict[int, int] = {} + _stacked_subplots_offsets = [] + + self.subplots: list[Any] + + if not isinstance(self.subplots, bool): + if bool(self.subplots) and self.stacked: + for i, sub_plot in enumerate(self.subplots): + if len(sub_plot) <= 1: + continue + for plot in sub_plot: + _stacked_subplots_ind[int(plot)] = i + _stacked_subplots_offsets.append([0, 0]) + for i, (label, y) in enumerate(self._iter_data(data=data)): ax = self._get_ax(i) kwds = self.kwds.copy() @@ -1914,7 +1974,28 @@ def _make_plot(self, fig: Figure) -> None: start = start + self._start_base kwds["align"] = self._align - if self.subplots: + + if i in _stacked_subplots_ind: + offset_index = _stacked_subplots_ind[i] + pos_prior, neg_prior = _stacked_subplots_offsets[offset_index] # type:ignore[assignment] + mask = y >= 0 + start = np.where(mask, pos_prior, neg_prior) + self._start_base + w = self.bar_width / 2 + rect = self._plot( + ax, + self.ax_pos + w, + y, + self.bar_width, + start=start, + label=label, + log=self.log, + **kwds, + ) + pos_new = pos_prior + np.where(mask, y, 0) + neg_new = neg_prior + np.where(mask, 0, y) + _stacked_subplots_offsets[offset_index] = [pos_new, neg_new] + + elif self.subplots: w = self.bar_width / 2 rect = self._plot( ax, @@ -1928,7 +2009,7 @@ def _make_plot(self, fig: Figure) -> None: ) ax.set_title(label) elif self.stacked: - mask = y > 0 + mask = y >= 0 start = np.where(mask, pos_prior, neg_prior) + self._start_base w = self.bar_width / 2 rect = self._plot( @@ -2046,9 +2127,12 @@ def _kind(self) -> Literal["pie"]: _layout_type = "horizontal" - def __init__(self, data, kind=None, **kwargs) -> None: + def __init__(self, data: Series | DataFrame, kind=None, **kwargs) -> None: data = data.fillna(value=0) - if (data < 0).any().any(): + lt_zero = data < 0 + if isinstance(data, ABCDataFrame) and lt_zero.any().any(): + raise ValueError(f"{self._kind} plot doesn't allow negative values") + elif isinstance(data, ABCSeries) and lt_zero.any(): raise ValueError(f"{self._kind} plot doesn't allow negative values") MPLPlot.__init__(self, data, kind=kind, **kwargs) @@ -2077,9 +2161,6 @@ def _make_plot(self, fig: Figure) -> None: for i, (label, y) in enumerate(self._iter_data(data=self.data)): ax = self._get_ax(i) - if label is not None: - label = pprint_thing(label) - ax.set_ylabel(label) kwds = self.kwds.copy() @@ -2101,9 +2182,11 @@ def blank_labeler(label, value): results = ax.pie(y, labels=blabels, **kwds) if kwds.get("autopct", None) is not None: - patches, texts, autotexts = results + # error: Need more than 2 values to unpack (3 expected) + patches, texts, autotexts = results # type: ignore[misc] else: - patches, texts = results + # error: Too many values to unpack (2 expected, 3 provided) + patches, texts = results # type: ignore[misc] autotexts = [] if self.fontsize is not None: diff --git a/pandas/plotting/_matplotlib/groupby.py b/pandas/plotting/_matplotlib/groupby.py index cbb66065a8039..783f79710097c 100644 --- a/pandas/plotting/_matplotlib/groupby.py +++ b/pandas/plotting/_matplotlib/groupby.py @@ -50,10 +50,9 @@ def create_iter_data_given_by( If `by` is assigned: >>> import numpy as np - >>> tuples = [('h1', 'a'), ('h1', 'b'), ('h2', 'a'), ('h2', 'b')] + >>> tuples = [("h1", "a"), ("h1", "b"), ("h2", "a"), ("h2", 
"b")] >>> mi = pd.MultiIndex.from_tuples(tuples) - >>> value = [[1, 3, np.nan, np.nan], - ... [3, 4, np.nan, np.nan], [np.nan, np.nan, 5, 6]] + >>> value = [[1, 3, np.nan, np.nan], [3, 4, np.nan, np.nan], [np.nan, np.nan, 5, 6]] >>> data = pd.DataFrame(value, columns=mi) >>> create_iter_data_given_by(data) {'h1': h1 @@ -106,9 +105,9 @@ def reconstruct_data_with_by( Examples -------- - >>> d = {'h': ['h1', 'h1', 'h2'], 'a': [1, 3, 5], 'b': [3, 4, 6]} + >>> d = {"h": ["h1", "h1", "h2"], "a": [1, 3, 5], "b": [3, 4, 6]} >>> df = pd.DataFrame(d) - >>> reconstruct_data_with_by(df, by='h', cols=['a', 'b']) + >>> reconstruct_data_with_by(df, by="h", cols=["a", "b"]) h1 h2 a b a b 0 1.0 3.0 NaN NaN diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index e610f1adb602c..1a423ad49c294 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -41,7 +41,9 @@ if TYPE_CHECKING: from matplotlib.axes import Axes + from matplotlib.container import BarContainer from matplotlib.figure import Figure + from matplotlib.patches import Polygon from pandas._typing import PlottingOrientation @@ -76,7 +78,7 @@ def __init__( self.xlabel = kwargs.get("xlabel") self.ylabel = kwargs.get("ylabel") # Do not call LinePlot.__init__ which may fill nan - MPLPlot.__init__(self, data, **kwargs) # pylint: disable=non-parent-init-called + MPLPlot.__init__(self, data, **kwargs) self.bins = self._adjust_bins(bins) @@ -92,12 +94,13 @@ def _adjust_bins(self, bins: int | np.ndarray | list[np.ndarray]): def _calculate_bins(self, data: Series | DataFrame, bins) -> np.ndarray: """Calculate bins given data""" - nd_values = data.infer_objects(copy=False)._get_numeric_data() - values = np.ravel(nd_values) + nd_values = data.infer_objects()._get_numeric_data() + values = nd_values.values + if nd_values.ndim == 2: + values = values.reshape(-1) values = values[~isna(values)] - hist, bins = np.histogram(values, bins=bins, range=self._bin_range) - return bins + return np.histogram_bin_edges(values, bins=bins, range=self._bin_range) # error: Signature of "_plot" incompatible with supertype "LinePlot" @classmethod @@ -112,7 +115,8 @@ def _plot( # type: ignore[override] *, bins, **kwds, - ): + # might return a subset from the possible return types of Axes.hist(...)[2]? 
+ ) -> BarContainer | Polygon | list[BarContainer | Polygon]: if column_num == 0: cls._initialize_stacker(ax, stacking_id, len(bins) - 1) @@ -133,10 +137,7 @@ def _make_plot(self, fig: Figure) -> None: if self.by is not None else self.data ) - - # error: Argument "data" to "_iter_data" of "MPLPlot" has incompatible - # type "object"; expected "DataFrame | dict[Hashable, Series | DataFrame]" - for i, (label, y) in enumerate(self._iter_data(data=data)): # type: ignore[arg-type] + for i, (label, y) in enumerate(self._iter_data(data=data)): ax = self._get_ax(i) kwds = self.kwds.copy() @@ -171,7 +172,8 @@ def _make_plot(self, fig: Figure) -> None: if self.by is not None: ax.set_title(pprint_thing(label)) - self._append_legend_handles_labels(artists[0], label) + # error: Value of type "Polygon" is not indexable + self._append_legend_handles_labels(artists[0], label) # type: ignore[index,arg-type] def _make_plot_keywords(self, kwds: dict[str, Any], y: np.ndarray) -> None: """merge BoxPlot/KdePlot properties to passed kwds""" @@ -202,17 +204,13 @@ def _post_plot_logic(self, ax: Axes, data) -> None: # error: Argument 1 to "set_xlabel" of "_AxesBase" has incompatible # type "Hashable"; expected "str" ax.set_xlabel( - "Frequency" - if self.xlabel is None - else self.xlabel # type: ignore[arg-type] + "Frequency" if self.xlabel is None else self.xlabel # type: ignore[arg-type] ) ax.set_ylabel(self.ylabel) # type: ignore[arg-type] else: ax.set_xlabel(self.xlabel) # type: ignore[arg-type] ax.set_ylabel( - "Frequency" - if self.ylabel is None - else self.ylabel # type: ignore[arg-type] + "Frequency" if self.ylabel is None else self.ylabel # type: ignore[arg-type] ) @property @@ -236,7 +234,7 @@ def __init__( self, data, bw_method=None, ind=None, *, weights=None, **kwargs ) -> None: # Do not call LinePlot.__init__ which may fill nan - MPLPlot.__init__(self, data, **kwargs) # pylint: disable=non-parent-init-called + MPLPlot.__init__(self, data, **kwargs) self.bw_method = bw_method self.ind = ind self.weights = weights @@ -268,6 +266,7 @@ def _plot( # type: ignore[override] y: np.ndarray, style=None, bw_method=None, + weights=None, ind=None, column_num=None, stacking_id: int | None = None, @@ -276,7 +275,7 @@ def _plot( # type: ignore[override] from scipy.stats import gaussian_kde y = remove_na_arraylike(y) - gkde = gaussian_kde(y, bw_method=bw_method) + gkde = gaussian_kde(y, bw_method=bw_method, weights=weights) y = gkde.evaluate(ind) lines = MPLPlot._plot(ax, ind, y, style=style, **kwds) @@ -322,10 +321,7 @@ def _grouped_plot( naxes=naxes, figsize=figsize, sharex=sharex, sharey=sharey, ax=ax, layout=layout ) - _axes = flatten_axes(axes) - - for i, (key, group) in enumerate(grouped): - ax = _axes[i] + for ax, (key, group) in zip(flatten_axes(axes), grouped): if numeric_only and isinstance(group, ABCDataFrame): group = group._get_numeric_data() plotf(group, ax, **kwargs) @@ -457,10 +453,8 @@ def hist_series( ax.grid(grid) axes = np.array([ax]) - # error: Argument 1 to "set_ticks_props" has incompatible type "ndarray[Any, - # dtype[Any]]"; expected "Axes | Sequence[Axes]" set_ticks_props( - axes, # type: ignore[arg-type] + axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, @@ -559,12 +553,9 @@ def hist_frame( figsize=figsize, layout=layout, ) - _axes = flatten_axes(axes) - can_set_label = "label" not in kwds - for i, col in enumerate(data.columns): - ax = _axes[i] + for ax, col in zip(flatten_axes(axes), data.columns): if legend and can_set_label: kwds["label"] = col 
ax.hist(data[col].dropna().values, bins=bins, **kwds) diff --git a/pandas/plotting/_matplotlib/misc.py b/pandas/plotting/_matplotlib/misc.py index 1f9212587e05e..4a891ec27e8cb 100644 --- a/pandas/plotting/_matplotlib/misc.py +++ b/pandas/plotting/_matplotlib/misc.py @@ -3,8 +3,7 @@ import random from typing import TYPE_CHECKING -from matplotlib import patches -import matplotlib.lines as mlines +import matplotlib as mpl import numpy as np from pandas.core.dtypes.missing import notna @@ -129,7 +128,7 @@ def scatter_matrix( def _get_marker_compat(marker): - if marker not in mlines.lineMarkers: + if marker not in mpl.lines.lineMarkers: return "o" return marker @@ -190,10 +189,10 @@ def normalize(series): ) ax.legend() - ax.add_patch(patches.Circle((0.0, 0.0), radius=1.0, facecolor="none")) + ax.add_patch(mpl.patches.Circle((0.0, 0.0), radius=1.0, facecolor="none")) for xy, name in zip(s, df.columns): - ax.add_patch(patches.Circle(xy, radius=0.025, facecolor="gray")) + ax.add_patch(mpl.patches.Circle(xy, radius=0.025, facecolor="gray")) if xy[0] < 0.0 and xy[1] < 0.0: ax.text( diff --git a/pandas/plotting/_matplotlib/style.py b/pandas/plotting/_matplotlib/style.py index bf4e4be3bfd82..7cf63c8621392 100644 --- a/pandas/plotting/_matplotlib/style.py +++ b/pandas/plotting/_matplotlib/style.py @@ -3,11 +3,13 @@ from collections.abc import ( Collection, Iterator, + Sequence, ) import itertools from typing import ( TYPE_CHECKING, cast, + overload, ) import warnings @@ -20,18 +22,47 @@ from pandas.core.dtypes.common import is_list_like -import pandas.core.common as com - if TYPE_CHECKING: from matplotlib.colors import Colormap +@overload +def get_standard_colors( + num_colors: int, + colormap: Colormap | None = ..., + color_type: str = ..., + *, + color: dict[str, Color], +) -> dict[str, Color]: ... + + +@overload +def get_standard_colors( + num_colors: int, + colormap: Colormap | None = ..., + color_type: str = ..., + *, + color: Color | Sequence[Color] | None = ..., +) -> list[Color]: ... + + +@overload +def get_standard_colors( + num_colors: int, + colormap: Colormap | None = ..., + color_type: str = ..., + *, + color: dict[str, Color] | Color | Sequence[Color] | None = ..., +) -> dict[str, Color] | list[Color]: ... + + def get_standard_colors( num_colors: int, colormap: Colormap | None = None, color_type: str = "default", - color: dict[str, Color] | Color | Collection[Color] | None = None, -): + *, + color: dict[str, Color] | Color | Sequence[Color] | None = None, +) -> dict[str, Color] | list[Color]: """ Get standard colors based on `colormap`, `color_type` or `color` inputs. 
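Aside on the `@overload` stubs added above: the return type of `get_standard_colors` tracks the type of the `color` keyword (dict in, dict out; anything else, list). A stripped-down sketch of the same pattern; `pick_colors` and its defaults are illustrative names, not pandas API:

from __future__ import annotations

from typing import overload


@overload
def pick_colors(color: dict[str, str]) -> dict[str, str]: ...
@overload
def pick_colors(color: str | list[str] | None = ...) -> list[str]: ...


def pick_colors(
    color: dict[str, str] | str | list[str] | None = None,
) -> dict[str, str] | list[str]:
    # Runtime behavior is a single implementation; the overloads above only
    # teach type checkers the dict-in -> dict-out narrowing.
    if isinstance(color, dict):
        return color
    if color is None:
        return ["C0", "C1"]
    return [color] if isinstance(color, str) else list(color)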
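Looking ahead to the style.py hunk that follows: `_get_default_colors`, `_get_random_colors`, and `_random_color` collapse into inline expressions. Seeding `default_rng` with `num_colors` keeps the random palette deterministic for a given request size without touching global RNG state. A standalone sketch of that call:

import numpy as np

num_colors = 4
colors = np.random.default_rng(num_colors).random((num_colors, 3)).tolist()
# -> four [r, g, b] triples in [0, 1), identical on every call with num_colors=4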
@@ -218,33 +249,17 @@ def _is_floats_color(color: Color | Collection[Color]) -> bool: def _get_colors_from_color_type(color_type: str, num_colors: int) -> list[Color]: """Get colors from user input color type.""" if color_type == "default": - return _get_default_colors(num_colors) + prop_cycle = mpl.rcParams["axes.prop_cycle"] + return [ + c["color"] + for c in itertools.islice(prop_cycle, min(num_colors, len(prop_cycle))) + ] elif color_type == "random": - return _get_random_colors(num_colors) + return np.random.default_rng(num_colors).random((num_colors, 3)).tolist() else: raise ValueError("color_type must be either 'default' or 'random'") -def _get_default_colors(num_colors: int) -> list[Color]: - """Get `num_colors` of default colors from matplotlib rc params.""" - import matplotlib.pyplot as plt - - colors = [c["color"] for c in plt.rcParams["axes.prop_cycle"]] - return colors[0:num_colors] - - -def _get_random_colors(num_colors: int) -> list[Color]: - """Get `num_colors` of random colors.""" - return [_random_color(num) for num in range(num_colors)] - - -def _random_color(column: int) -> list[float]: - """Get a random color represented as a list of length 3""" - # GH17525 use common._random_state to avoid resetting the seed - rs = com.random_state(column) - return rs.rand(3).tolist() - - def _is_single_string_color(color: Color) -> bool: """Check if `color` is a single string color. diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index bf1c0f6346f02..beaf5b6259ef3 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ b/pandas/plotting/_matplotlib/timeseries.py @@ -48,7 +48,6 @@ from pandas._typing import NDFrameT from pandas import ( - DataFrame, DatetimeIndex, Index, PeriodIndex, @@ -205,7 +204,10 @@ def _get_ax_freq(ax: Axes): def _get_period_alias(freq: timedelta | BaseOffset | str) -> str | None: - freqstr = to_offset(freq, is_period=True).rule_code + if isinstance(freq, BaseOffset): + freqstr = freq.name + else: + freqstr = to_offset(freq, is_period=True).rule_code return get_period_alias(freqstr) @@ -228,8 +230,8 @@ def _get_freq(ax: Axes, series: Series): return freq, ax_freq -def use_dynamic_x(ax: Axes, data: DataFrame | Series) -> bool: - freq = _get_index_freq(data.index) +def use_dynamic_x(ax: Axes, index: Index) -> bool: + freq = _get_index_freq(index) ax_freq = _get_ax_freq(ax) if freq is None: # convert irregular if axes has freq info @@ -247,18 +249,15 @@ def use_dynamic_x(ax: Axes, data: DataFrame | Series) -> bool: return False # FIXME: hack this for 0.10.1, creating more technical debt...sigh - if isinstance(data.index, ABCDatetimeIndex): + if isinstance(index, ABCDatetimeIndex): # error: "BaseOffset" has no attribute "_period_dtype_code" freq_str = OFFSET_TO_PERIOD_FREQSTR.get(freq_str, freq_str) - base = to_offset( - freq_str, is_period=True - )._period_dtype_code # type: ignore[attr-defined] - x = data.index + base = to_offset(freq_str, is_period=True)._period_dtype_code # type: ignore[attr-defined] if base <= FreqGroup.FR_DAY.value: - return x[:1].is_normalized - period = Period(x[0], freq_str) + return index[:1].is_normalized + period = Period(index[0], freq_str) assert isinstance(period, Period) - return period.to_timestamp().tz_localize(x.tz) == x[0] + return period.to_timestamp().tz_localize(index.tz) == index[0] return True @@ -309,7 +308,7 @@ def maybe_convert_index(ax: Axes, data: NDFrameT) -> NDFrameT: if isinstance(data.index, ABCDatetimeIndex): data = 
data.tz_localize(None).to_period(freq=freq_str) elif isinstance(data.index, ABCPeriodIndex): - data.index = data.index.asfreq(freq=freq_str) + data.index = data.index.asfreq(freq=freq_str, how="start") return data @@ -332,7 +331,7 @@ def format_dateaxis( default, changing the limits of the x axis will intelligently change the positions of the ticks. """ - from matplotlib import pylab + import matplotlib.pyplot as plt # handle index specific formatting # Note: DatetimeIndex does not use this @@ -364,4 +363,20 @@ def format_dateaxis( else: raise TypeError("index type not supported") - pylab.draw_if_interactive() + plt.draw_if_interactive() + + +def prepare_ts_data( + series: Series, ax: Axes, kwargs: dict[str, Any] +) -> tuple[BaseOffset | str, Series]: + freq, data = maybe_resample(series, ax, kwargs) + + # Set ax with freq info + decorate_axes(ax, freq) + # digging deeper + if hasattr(ax, "left_ax"): + decorate_axes(ax.left_ax, freq) + if hasattr(ax, "right_ax"): + decorate_axes(ax.right_ax, freq) + + return freq, data diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index 898b5b25e7b01..8ee75e7fe553e 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -5,8 +5,7 @@ from typing import TYPE_CHECKING import warnings -from matplotlib import ticker -import matplotlib.table +import matplotlib as mpl import numpy as np from pandas.util._exceptions import find_stack_level @@ -20,8 +19,8 @@ if TYPE_CHECKING: from collections.abc import ( + Generator, Iterable, - Sequence, ) from matplotlib.axes import Axes @@ -57,7 +56,7 @@ def format_date_labels(ax: Axes, rot) -> None: fig = ax.get_figure() if fig is not None: # should always be a Figure but can technically be None - maybe_adjust_figure(fig, bottom=0.2) + maybe_adjust_figure(fig, bottom=0.2) # type: ignore[arg-type] def table( @@ -80,7 +79,7 @@ def table( # error: Argument "cellText" to "table" has incompatible type "ndarray[Any, # Any]"; expected "Sequence[Sequence[str]] | None" - return matplotlib.table.table( + return mpl.table.table( ax, cellText=cellText, # type: ignore[arg-type] rowLabels=rowLabels, @@ -101,13 +100,14 @@ def _get_layout( nrows, ncols = layout if nrows == -1 and ncols > 0: - layout = nrows, ncols = (ceil(nplots / ncols), ncols) + layout = (ceil(nplots / ncols), ncols) elif ncols == -1 and nrows > 0: - layout = nrows, ncols = (nrows, ceil(nplots / nrows)) + layout = (nrows, ceil(nplots / nrows)) elif ncols <= 0 and nrows <= 0: msg = "At least one dimension of layout must be positive" raise ValueError(msg) + nrows, ncols = layout if nrows * ncols < nplots: raise ValueError( f"Layout of {nrows}x{ncols} must be larger than required size {nplots}" @@ -234,7 +234,7 @@ def create_subplots( else: if is_list_like(ax): if squeeze: - ax = flatten_axes(ax) + ax = np.fromiter(flatten_axes(ax), dtype=object) if layout is not None: warnings.warn( "When passing multiple axes, layout keyword is ignored.", @@ -263,7 +263,7 @@ def create_subplots( if squeeze: return fig, ax else: - return fig, flatten_axes(ax) + return fig, np.fromiter(flatten_axes(ax), dtype=object) else: warnings.warn( "To output multiple subplots, the figure containing " @@ -329,10 +329,10 @@ def _remove_labels_from_axis(axis: Axis) -> None: # set_visible will not be effective if # minor axis has NullLocator and NullFormatter (default) - if isinstance(axis.get_minor_locator(), ticker.NullLocator): - axis.set_minor_locator(ticker.AutoLocator()) - if isinstance(axis.get_minor_formatter(), 
ticker.NullFormatter): - axis.set_minor_formatter(ticker.FormatStrFormatter("")) + if isinstance(axis.get_minor_locator(), mpl.ticker.NullLocator): + axis.set_minor_locator(mpl.ticker.AutoLocator()) + if isinstance(axis.get_minor_formatter(), mpl.ticker.NullFormatter): + axis.set_minor_formatter(mpl.ticker.FormatStrFormatter("")) for t in axis.get_minorticklabels(): t.set_visible(False) @@ -442,32 +442,31 @@ def handle_shared_axes( _remove_labels_from_axis(ax.yaxis) -def flatten_axes(axes: Axes | Sequence[Axes]) -> np.ndarray: +def flatten_axes(axes: Axes | Iterable[Axes]) -> Generator[Axes]: if not is_list_like(axes): - return np.array([axes]) + yield axes # type: ignore[misc] elif isinstance(axes, (np.ndarray, ABCIndex)): - return np.asarray(axes).ravel() - return np.array(axes) + yield from np.asarray(axes).reshape(-1) + else: + yield from axes # type: ignore[misc] def set_ticks_props( - axes: Axes | Sequence[Axes], + axes: Axes | Iterable[Axes], xlabelsize: int | None = None, xrot=None, ylabelsize: int | None = None, yrot=None, ): - import matplotlib.pyplot as plt - for ax in flatten_axes(axes): if xlabelsize is not None: - plt.setp(ax.get_xticklabels(), fontsize=xlabelsize) + mpl.artist.setp(ax.get_xticklabels(), fontsize=xlabelsize) # type: ignore[arg-type] if xrot is not None: - plt.setp(ax.get_xticklabels(), rotation=xrot) + mpl.artist.setp(ax.get_xticklabels(), rotation=xrot) # type: ignore[arg-type] if ylabelsize is not None: - plt.setp(ax.get_yticklabels(), fontsize=ylabelsize) + mpl.artist.setp(ax.get_yticklabels(), fontsize=ylabelsize) # type: ignore[arg-type] if yrot is not None: - plt.setp(ax.get_yticklabels(), rotation=yrot) + mpl.artist.setp(ax.get_yticklabels(), rotation=yrot) # type: ignore[arg-type] return axes diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 18db460d388a4..6b2b89ee2618c 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -30,19 +30,33 @@ def table(ax: Axes, data: DataFrame | Series, **kwargs) -> Table: """ Helper function to convert DataFrame and Series to matplotlib.table. + This method provides an easy way to visualize tabular data within a Matplotlib + figure. It automatically extracts index and column labels from the DataFrame + or Series, unless explicitly specified. This function is particularly useful + when displaying summary tables alongside other plots or when creating static + reports. It utilizes the `matplotlib.pyplot.table` backend and allows + customization through various styling options available in Matplotlib. + Parameters ---------- ax : Matplotlib axes object + The axes on which to draw the table. data : DataFrame or Series Data for table contents. **kwargs Keyword arguments to be passed to matplotlib.table.table. If `rowLabels` or `colLabels` is not specified, data index or column - name will be used. + names will be used. Returns ------- matplotlib table object + The created table as a matplotlib Table object. + + See Also + -------- + DataFrame.plot : Make plots of DataFrame using matplotlib. + matplotlib.pyplot.table : Create a table from data in a Matplotlib plot. Examples -------- @@ -51,12 +65,13 @@ def table(ax: Axes, data: DataFrame | Series, **kwargs) -> Table: :context: close-figs >>> import matplotlib.pyplot as plt - >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) - >>> fix, ax = plt.subplots() - >>> ax.axis('off') - (0.0, 1.0, 0.0, 1.0) - >>> table = pd.plotting.table(ax, df, loc='center', - ... 
cellLoc='center', colWidths=list([.2, .2])) + >>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) + >>> fig, ax = plt.subplots() + >>> ax.axis("off") + (np.float64(0.0), np.float64(1.0), np.float64(0.0), np.float64(1.0)) + >>> table = pd.plotting.table( + ... ax, df, loc="center", cellLoc="center", colWidths=[0.2, 0.2] + ... ) """ plot_backend = _get_plot_backend("matplotlib") return plot_backend.table( @@ -92,16 +107,17 @@ def register() -> None: >>> pd.plotting.register_matplotlib_converters() - >>> df = pd.DataFrame({'ts': pd.period_range('2020', periods=2, freq='M'), - ... 'y': [1, 2] - ... }) - >>> plot = df.plot.line(x='ts', y='y') + >>> df = pd.DataFrame( + ... {"ts": pd.period_range("2020", periods=2, freq="M"), "y": [1, 2]} + ... ) + >>> plot = df.plot.line(x="ts", y="y") Unsetting the register manually an error will be raised: - >>> pd.set_option("plotting.matplotlib.register_converters", - ... False) # doctest: +SKIP - >>> df.plot.line(x='ts', y='y') # doctest: +SKIP + >>> pd.set_option( + ... "plotting.matplotlib.register_converters", False + ... ) # doctest: +SKIP + >>> df.plot.line(x="ts", y="y") # doctest: +SKIP Traceback (most recent call last): TypeError: float() argument must be a string or a real number, not 'Period' """ @@ -135,16 +151,17 @@ def deregister() -> None: >>> pd.plotting.register_matplotlib_converters() - >>> df = pd.DataFrame({'ts': pd.period_range('2020', periods=2, freq='M'), - ... 'y': [1, 2] - ... }) - >>> plot = df.plot.line(x='ts', y='y') + >>> df = pd.DataFrame( + ... {"ts": pd.period_range("2020", periods=2, freq="M"), "y": [1, 2]} + ... ) + >>> plot = df.plot.line(x="ts", y="y") Unsetting the register manually an error will be raised: - >>> pd.set_option("plotting.matplotlib.register_converters", - ... False) # doctest: +SKIP - >>> df.plot.line(x='ts', y='y') # doctest: +SKIP + >>> pd.set_option( + ... "plotting.matplotlib.register_converters", False + ... ) # doctest: +SKIP + >>> df.plot.line(x="ts", y="y") # doctest: +SKIP Traceback (most recent call last): TypeError: float() argument must be a string or a real number, not 'Period' """ @@ -168,14 +185,21 @@ def scatter_matrix( """ Draw a matrix of scatter plots. + Each pair of numeric columns in the DataFrame is plotted against each other, + resulting in a matrix of scatter plots. The diagonal plots can display either + histograms or Kernel Density Estimation (KDE) plots for each variable. + Parameters ---------- frame : DataFrame + The data to be plotted. alpha : float, optional Amount of transparency applied. figsize : (float,float), optional A tuple (width, height) in inches. ax : Matplotlib axis object, optional + An existing Matplotlib axis object for the plots. If None, a new axis is + created. grid : bool, optional Setting this to True will show the grid. diagonal : {'hist', 'kde'} @@ -198,13 +222,21 @@ def scatter_matrix( numpy.ndarray A matrix of scatter plots. + See Also + -------- + plotting.parallel_coordinates : Plots parallel coordinates for multivariate data. + plotting.andrews_curves : Generates Andrews curves for visualizing clusters of + multivariate data. + plotting.radviz : Creates a RadViz visualization. + plotting.bootstrap_plot : Visualizes uncertainty in data via bootstrap sampling. + Examples -------- .. 
plot:: :context: close-figs - >>> df = pd.DataFrame(np.random.randn(1000, 4), columns=['A','B','C','D']) + >>> df = pd.DataFrame(np.random.randn(1000, 4), columns=["A", "B", "C", "D"]) >>> pd.plotting.scatter_matrix(df, alpha=0.2) array([[, , , ], @@ -275,10 +307,11 @@ def radviz( Returns ------- :class:`matplotlib.axes.Axes` + The Axes object from Matplotlib. See Also -------- - pandas.plotting.andrews_curves : Plot clustering visualization. + plotting.andrews_curves : Plot clustering visualization. Examples -------- @@ -288,25 +321,25 @@ def radviz( >>> df = pd.DataFrame( ... { - ... 'SepalLength': [6.5, 7.7, 5.1, 5.8, 7.6, 5.0, 5.4, 4.6, 6.7, 4.6], - ... 'SepalWidth': [3.0, 3.8, 3.8, 2.7, 3.0, 2.3, 3.0, 3.2, 3.3, 3.6], - ... 'PetalLength': [5.5, 6.7, 1.9, 5.1, 6.6, 3.3, 4.5, 1.4, 5.7, 1.0], - ... 'PetalWidth': [1.8, 2.2, 0.4, 1.9, 2.1, 1.0, 1.5, 0.2, 2.1, 0.2], - ... 'Category': [ - ... 'virginica', - ... 'virginica', - ... 'setosa', - ... 'virginica', - ... 'virginica', - ... 'versicolor', - ... 'versicolor', - ... 'setosa', - ... 'virginica', - ... 'setosa' - ... ] + ... "SepalLength": [6.5, 7.7, 5.1, 5.8, 7.6, 5.0, 5.4, 4.6, 6.7, 4.6], + ... "SepalWidth": [3.0, 3.8, 3.8, 2.7, 3.0, 2.3, 3.0, 3.2, 3.3, 3.6], + ... "PetalLength": [5.5, 6.7, 1.9, 5.1, 6.6, 3.3, 4.5, 1.4, 5.7, 1.0], + ... "PetalWidth": [1.8, 2.2, 0.4, 1.9, 2.1, 1.0, 1.5, 0.2, 2.1, 0.2], + ... "Category": [ + ... "virginica", + ... "virginica", + ... "setosa", + ... "virginica", + ... "virginica", + ... "versicolor", + ... "versicolor", + ... "setosa", + ... "virginica", + ... "setosa", + ... ], ... } ... ) - >>> pd.plotting.radviz(df, 'Category') # doctest: +SKIP + >>> pd.plotting.radviz(df, "Category") # doctest: +SKIP """ plot_backend = _get_plot_backend("matplotlib") return plot_backend.radviz( @@ -363,6 +396,12 @@ def andrews_curves( Returns ------- :class:`matplotlib.axes.Axes` + The matplotlib Axes object with the plot. + + See Also + -------- + plotting.parallel_coordinates : Plot parallel coordinates chart. + DataFrame.plot : Make plots of Series or DataFrame. Examples -------- @@ -371,10 +410,10 @@ def andrews_curves( :context: close-figs >>> df = pd.read_csv( - ... 'https://raw.githubusercontent.com/pandas-dev/' - ... 'pandas/main/pandas/tests/io/data/csv/iris.csv' - ... ) - >>> pd.plotting.andrews_curves(df, 'Name') # doctest: +SKIP + ... "https://raw.githubusercontent.com/pandas-dev/" + ... "pandas/main/pandas/tests/io/data/csv/iris.csv" + ... ) # doctest: +SKIP + >>> pd.plotting.andrews_curves(df, "Name") # doctest: +SKIP """ plot_backend = _get_plot_backend("matplotlib") return plot_backend.andrews_curves( @@ -428,8 +467,8 @@ def bootstrap_plot( See Also -------- - pandas.DataFrame.plot : Basic plotting for DataFrame objects. - pandas.Series.plot : Basic plotting for Series objects. + DataFrame.plot : Basic plotting for DataFrame objects. + Series.plot : Basic plotting for Series objects. Examples -------- @@ -468,6 +507,7 @@ def parallel_coordinates( Parameters ---------- frame : DataFrame + The DataFrame to be plotted. class_column : str Column name containing class names. cols : list, optional @@ -494,6 +534,13 @@ def parallel_coordinates( Returns ------- matplotlib.axes.Axes + The matplotlib axes containing the parallel coordinates plot. + + See Also + -------- + plotting.andrews_curves : Generate a matplotlib plot for visualizing clusters + of multivariate data. + plotting.radviz : Plot a multidimensional dataset in 2D. 
Examples -------- @@ -502,11 +549,11 @@ def parallel_coordinates( :context: close-figs >>> df = pd.read_csv( - ... 'https://raw.githubusercontent.com/pandas-dev/' - ... 'pandas/main/pandas/tests/io/data/csv/iris.csv' - ... ) + ... "https://raw.githubusercontent.com/pandas-dev/" + ... "pandas/main/pandas/tests/io/data/csv/iris.csv" + ... ) # doctest: +SKIP >>> pd.plotting.parallel_coordinates( - ... df, 'Name', color=('#556270', '#4ECDC4', '#C7F464') + ... df, "Name", color=("#556270", "#4ECDC4", "#C7F464") ... ) # doctest: +SKIP """ plot_backend = _get_plot_backend("matplotlib") @@ -530,6 +577,10 @@ def lag_plot(series: Series, lag: int = 1, ax: Axes | None = None, **kwds) -> Ax """ Lag plot for time series. + A lag plot is a scatter plot of a time series against a lag of itself. It helps + in visualizing the temporal dependence between observations by plotting the values + at time `t` on the x-axis and the values at time `t + lag` on the y-axis. + Parameters ---------- series : Series @@ -544,6 +595,13 @@ def lag_plot(series: Series, lag: int = 1, ax: Axes | None = None, **kwds) -> Ax Returns ------- matplotlib.axes.Axes + The matplotlib Axes object containing the lag plot. + + See Also + -------- + plotting.autocorrelation_plot : Autocorrelation plot for time series. + matplotlib.pyplot.scatter : A scatter plot of y vs. x with varying marker size + and/or color in Matplotlib. Examples -------- @@ -587,6 +645,12 @@ def autocorrelation_plot(series: Series, ax: Axes | None = None, **kwargs) -> Ax Returns ------- matplotlib.axes.Axes + The matplotlib axes containing the autocorrelation plot. + + See Also + -------- + Series.autocorr : Compute the lag-N autocorrelation for a Series. + plotting.lag_plot : Lag plot for time series. Examples -------- @@ -613,6 +677,14 @@ class _Options(dict): the same as the plot function parameters, but is stored in a canonical format that makes it easy to breakdown into groups later. + See Also + -------- + plotting.register_matplotlib_converters : Register pandas formatters and + converters with matplotlib. + plotting.bootstrap_plot : Bootstrap plot on mean, median and mid-range statistics. + plotting.autocorrelation_plot : Autocorrelation plot for time series. + plotting.lag_plot : Lag plot for time series. + Examples -------- @@ -620,10 +692,10 @@ class _Options(dict): :context: close-figs >>> np.random.seed(42) - >>> df = pd.DataFrame({'A': np.random.randn(10), - ... 'B': np.random.randn(10)}, - ... index=pd.date_range("1/1/2000", - ... freq='4MS', periods=10)) + >>> df = pd.DataFrame( + ... {"A": np.random.randn(10), "B": np.random.randn(10)}, + ... index=pd.date_range("1/1/2000", freq="4MS", periods=10), + ... ) >>> with pd.plotting.plot_params.use("x_compat", True): ... _ = df["A"].plot(color="r") ... 
_ = df["B"].plot(color="g") @@ -633,8 +705,7 @@ class _Options(dict): _ALIASES = {"x_compat": "xaxis.compat"} _DEFAULT_KEYS = ["xaxis.compat"] - def __init__(self, deprecated: bool = False) -> None: - self._deprecated = deprecated + def __init__(self) -> None: super().__setitem__("xaxis.compat", False) def __getitem__(self, key): @@ -668,11 +739,11 @@ def reset(self) -> None: # error: Cannot access "__init__" directly self.__init__() # type: ignore[misc] - def _get_canonical_key(self, key): + def _get_canonical_key(self, key: str) -> str: return self._ALIASES.get(key, key) @contextmanager - def use(self, key, value) -> Generator[_Options, None, None]: + def use(self, key, value) -> Generator[_Options]: """ Temporarily set a parameter value using the with statement. Aliasing allowed. diff --git a/pandas/testing.py b/pandas/testing.py index 841b55df48556..433b22bf1107e 100644 --- a/pandas/testing.py +++ b/pandas/testing.py @@ -2,7 +2,6 @@ Public testing utility functions. """ - from pandas._testing import ( assert_extension_array_equal, assert_frame_equal, @@ -13,6 +12,6 @@ __all__ = [ "assert_extension_array_equal", "assert_frame_equal", - "assert_series_equal", "assert_index_equal", + "assert_series_equal", ] diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 60bcb97aaa364..871e977cbe2f8 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -6,6 +6,7 @@ from pandas import api import pandas._testing as tm from pandas.api import ( + executors as api_executors, extensions as api_extensions, indexers as api_indexers, interchange as api_interchange, @@ -133,7 +134,6 @@ class TestPDApi(Base): "show_versions", "timedelta_range", "unique", - "value_counts", "wide_to_long", ] @@ -153,7 +153,6 @@ class TestPDApi(Base): "read_csv", "read_excel", "read_fwf", - "read_gbq", "read_hdf", "read_html", "read_xml", @@ -169,6 +168,7 @@ class TestPDApi(Base): "read_parquet", "read_orc", "read_spss", + "read_iceberg", ] # top-level json funcs @@ -245,11 +245,13 @@ def test_depr(self): class TestApi(Base): allowed_api_dirs = [ + "executors", "types", "extensions", "indexers", "interchange", "typing", + "internals", ] allowed_typing = [ "DataFrameGroupBy", @@ -258,15 +260,18 @@ class TestApi(Base): "ExpandingGroupby", "ExponentialMovingWindow", "ExponentialMovingWindowGroupby", + "FrozenList", "JsonReader", "NaTType", "NAType", + "NoDefault", "PeriodIndexResamplerGroupby", "Resampler", "Rolling", "RollingGroupby", "SeriesGroupBy", "StataReader", + "SASReader", "TimedeltaIndexResamplerGroupby", "TimeGrouper", "Window", @@ -293,7 +298,6 @@ class TestApi(Base): "is_int64_dtype", "is_integer", "is_integer_dtype", - "is_interval", "is_interval_dtype", "is_iterator", "is_list_like", @@ -337,6 +341,7 @@ class TestApi(Base): "ExtensionArray", "ExtensionScalarOpsMixin", ] + allowed_api_executors = ["BaseExecutionEngine"] def test_api(self): self.check(api, self.allowed_api_dirs) @@ -356,6 +361,32 @@ def test_api_indexers(self): def test_api_extensions(self): self.check(api_extensions, self.allowed_api_extensions) + def test_api_executors(self): + self.check(api_executors, self.allowed_api_executors) + + +class TestErrors(Base): + def test_errors(self): + self.check(pd.errors, pd.errors.__all__, ignored=["ctypes", "cow"]) + + +class TestUtil(Base): + def test_util(self): + self.check( + pd.util, + ["hash_array", "hash_pandas_object"], + ignored=[ + "_decorators", + "_test_decorators", + "_exceptions", + "_validators", + "capitalize_first_letter", + "version", + 
"_print_versions", + "_tester", + ], + ) + class TestTesting(Base): funcs = [ @@ -375,9 +406,38 @@ def test_util_in_top_level(self): pd.util.foo -def test_pandas_array_alias(): - msg = "PandasArray has been renamed NumpyExtensionArray" - with tm.assert_produces_warning(FutureWarning, match=msg): - res = pd.arrays.PandasArray - - assert res is pd.arrays.NumpyExtensionArray +def test_set_module(): + assert pd.DataFrame.__module__ == "pandas" + assert pd.CategoricalDtype.__module__ == "pandas" + assert pd.PeriodDtype.__module__ == "pandas" + assert pd.IntervalDtype.__module__ == "pandas" + assert pd.SparseDtype.__module__ == "pandas" + assert pd.ArrowDtype.__module__ == "pandas" + assert pd.StringDtype.__module__ == "pandas" + assert pd.Index.__module__ == "pandas" + assert pd.CategoricalIndex.__module__ == "pandas" + assert pd.DatetimeIndex.__module__ == "pandas" + assert pd.IntervalIndex.__module__ == "pandas" + assert pd.MultiIndex.__module__ == "pandas" + assert pd.PeriodIndex.__module__ == "pandas" + assert pd.RangeIndex.__module__ == "pandas" + assert pd.TimedeltaIndex.__module__ == "pandas" + assert pd.Period.__module__ == "pandas" + assert pd.Timestamp.__module__ == "pandas" + assert pd.Timedelta.__module__ == "pandas" + assert pd.concat.__module__ == "pandas" + assert pd.isna.__module__ == "pandas" + assert pd.notna.__module__ == "pandas" + assert pd.merge.__module__ == "pandas" + assert pd.merge_ordered.__module__ == "pandas" + assert pd.merge_asof.__module__ == "pandas" + assert pd.read_csv.__module__ == "pandas" + assert pd.read_table.__module__ == "pandas" + assert pd.read_fwf.__module__ == "pandas" + assert pd.Series.__module__ == "pandas" + assert pd.date_range.__module__ == "pandas" + assert pd.bdate_range.__module__ == "pandas" + assert pd.timedelta_range.__module__ == "pandas" + assert pd.NamedAgg.__module__ == "pandas" + assert api.typing.SeriesGroupBy.__module__ == "pandas.api.typing" + assert api.typing.DataFrameGroupBy.__module__ == "pandas.api.typing" diff --git a/pandas/tests/api/test_types.py b/pandas/tests/api/test_types.py index fbaa6e7e18bca..bf39370c49d76 100644 --- a/pandas/tests/api/test_types.py +++ b/pandas/tests/api/test_types.py @@ -34,7 +34,6 @@ class TestTypes(Base): "is_timedelta64_ns_dtype", "is_unsigned_integer_dtype", "is_period_dtype", - "is_interval", "is_interval_dtype", "is_re", "is_re_compilable", diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index b7eac6b8f0ea1..dde1158dc7951 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas.compat import is_platform_arm + from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd @@ -15,7 +17,61 @@ date_range, ) import pandas._testing as tm +from pandas.api.executors import BaseExecutionEngine from pandas.tests.frame.common import zip_frames +from pandas.util.version import Version + + +class MockExecutionEngine(BaseExecutionEngine): + """ + Execution Engine to test if the execution engine interface receives and + uses all parameters provided by the user. + + Making this engine work as the default Python engine by calling it, no extra + functionality is implemented here. + + When testing, this will be called when this engine is provided, and then the + same pandas.map and pandas.apply function will be called, but without engine, + executing the default behavior from the python engine. 
+ """ + + def map(data, func, args, kwargs, decorator, skip_na): + kwargs_to_pass = kwargs if isinstance(data, DataFrame) else {} + return data.map( + func, action_na="ignore" if skip_na else False, **kwargs_to_pass + ) + + def apply(data, func, args, kwargs, decorator, axis): + if isinstance(data, Series): + return data.apply(func, convert_dtype=True, args=args, by_row=False) + elif isinstance(data, DataFrame): + return data.apply( + func, + axis=axis, + raw=False, + result_type=None, + args=args, + by_row="compat", + **kwargs, + ) + else: + assert isinstance(data, np.ndarray) + + def wrap_function(func): + # https://github.com/numpy/numpy/issues/8352 + def wrapper(*args, **kwargs): + result = func(*args, **kwargs) + if isinstance(result, str): + result = np.array(result, dtype=object) + return result + + return wrapper + + return np.apply_along_axis(wrap_function(func), axis, data, *args, **kwargs) + + +class MockEngineDecorator: + __pandas_udf__ = MockExecutionEngine @pytest.fixture @@ -32,7 +88,13 @@ def int_frame_const_col(): return df -@pytest.fixture(params=["python", pytest.param("numba", marks=pytest.mark.single_cpu)]) +@pytest.fixture( + params=[ + "python", + pytest.param("numba", marks=pytest.mark.single_cpu), + MockEngineDecorator, + ] +) def engine(request): if request.param == "numba": pytest.importorskip("numba") @@ -63,16 +125,77 @@ def test_apply(float_frame, engine, request): @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("raw", [True, False]) -def test_apply_args(float_frame, axis, raw, engine, request): - if engine == "numba": - mark = pytest.mark.xfail(reason="numba engine doesn't support args") - request.node.add_marker(mark) +@pytest.mark.parametrize("nopython", [True, False]) +def test_apply_args(float_frame, axis, raw, engine, nopython): + numba = pytest.importorskip("numba") + if ( + engine == "numba" + and Version(numba.__version__) == Version("0.61") + and is_platform_arm() + ): + pytest.skip(f"Segfaults on ARM platforms with numba {numba.__version__}") + engine_kwargs = {"nopython": nopython} result = float_frame.apply( - lambda x, y: x + y, axis, args=(1,), raw=raw, engine=engine + lambda x, y: x + y, + axis, + args=(1,), + raw=raw, + engine=engine, + engine_kwargs=engine_kwargs, ) expected = float_frame + 1 tm.assert_frame_equal(result, expected) + # GH:58712 + result = float_frame.apply( + lambda x, a, b: x + a + b, + args=(1,), + b=2, + raw=raw, + engine=engine, + engine_kwargs=engine_kwargs, + ) + expected = float_frame + 3 + tm.assert_frame_equal(result, expected) + + if engine == "numba": + # py signature binding + with pytest.raises(TypeError, match="missing a required argument: 'a'"): + float_frame.apply( + lambda x, a: x + a, + b=2, + raw=raw, + engine=engine, + engine_kwargs=engine_kwargs, + ) + + # keyword-only arguments are not supported in numba + with pytest.raises( + pd.errors.NumbaUtilError, + match="numba does not support keyword-only arguments", + ): + float_frame.apply( + lambda x, a, *, b: x + a + b, + args=(1,), + b=2, + raw=raw, + engine=engine, + engine_kwargs=engine_kwargs, + ) + + with pytest.raises( + pd.errors.NumbaUtilError, + match="numba does not support keyword-only arguments", + ): + float_frame.apply( + lambda *x, b: x[0] + x[1] + b, + args=(1,), + b=2, + raw=raw, + engine=engine, + engine_kwargs=engine_kwargs, + ) + def test_apply_categorical_func(): # GH 9573 @@ -211,7 +334,7 @@ def test_apply_broadcast_scalars(float_frame): def test_apply_broadcast_scalars_axis1(float_frame): result = 
float_frame.apply(np.mean, axis=1, result_type="broadcast") m = float_frame.mean(axis=1) - expected = DataFrame({c: m for c in float_frame.columns}) + expected = DataFrame(dict.fromkeys(float_frame.columns, m)) tm.assert_frame_equal(result, expected) @@ -238,7 +361,7 @@ def test_apply_broadcast_lists_index(float_frame): ) m = list(range(len(float_frame.index))) expected = DataFrame( - {c: m for c in float_frame.columns}, + dict.fromkeys(float_frame.columns, m), dtype="float64", index=float_frame.index, ) @@ -324,18 +447,18 @@ def test_apply_mixed_dtype_corner(): result = df[:0].apply(np.mean, axis=1) # the result here is actually kind of ambiguous, should it be a Series # or a DataFrame? - expected = Series(np.nan, index=pd.Index([], dtype="int64")) + expected = Series(dtype=np.float64) tm.assert_series_equal(result, expected) def test_apply_mixed_dtype_corner_indexing(): df = DataFrame({"A": ["foo"], "B": [1.0]}) result = df.apply(lambda x: x["A"], axis=1) - expected = Series(["foo"], index=[0]) + expected = Series(["foo"], index=range(1)) tm.assert_series_equal(result, expected) result = df.apply(lambda x: x["B"], axis=1) - expected = Series([1.0], index=[0]) + expected = Series([1.0], index=range(1)) tm.assert_series_equal(result, expected) @@ -402,7 +525,7 @@ def test_apply_yield_list(float_frame): def test_apply_reduce_Series(float_frame): float_frame.iloc[::2, float_frame.columns.get_loc("A")] = np.nan - expected = float_frame.mean(1) + expected = float_frame.mean(axis=1) result = float_frame.apply(np.mean, axis=1) tm.assert_series_equal(result, expected) @@ -693,8 +816,9 @@ def test_apply_category_equalness(val): result = df.a.apply(lambda x: x == val) expected = Series( - [np.nan if pd.isnull(x) else x == val for x in df_values], name="a" + [False if pd.isnull(x) else x == val for x in df_values], name="a" ) + # False since behavior of NaN for categorical dtype has been changed (GH 59966) tm.assert_series_equal(result, expected) @@ -993,7 +1117,7 @@ def test_result_type(int_frame_const_col): result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="expand") expected = df.copy() - expected.columns = [0, 1, 2] + expected.columns = range(3) tm.assert_frame_equal(result, expected) @@ -1003,7 +1127,7 @@ def test_result_type_shorter_list(int_frame_const_col): df = int_frame_const_col result = df.apply(lambda x: [1, 2], axis=1, result_type="expand") expected = df[["A", "B"]].copy() - expected.columns = [0, 1] + expected.columns = range(2) tm.assert_frame_equal(result, expected) @@ -1014,12 +1138,21 @@ def test_result_type_broadcast(int_frame_const_col, request, engine): mark = pytest.mark.xfail(reason="numba engine doesn't support list return") request.node.add_marker(mark) df = int_frame_const_col - # broadcast result - result = df.apply( - lambda x: [1, 2, 3], axis=1, result_type="broadcast", engine=engine - ) - expected = df.copy() - tm.assert_frame_equal(result, expected) + if engine is MockEngineDecorator: + with pytest.raises( + NotImplementedError, + match="result_type='broadcast' only implemented for the default engine", + ): + df.apply( + lambda x: [1, 2, 3], axis=1, result_type="broadcast", engine=engine + ) + else: + # broadcast result + result = df.apply( + lambda x: [1, 2, 3], axis=1, result_type="broadcast", engine=engine + ) + expected = df.copy() + tm.assert_frame_equal(result, expected) def test_result_type_broadcast_series_func(int_frame_const_col, engine, request): @@ -1032,14 +1165,27 @@ def test_result_type_broadcast_series_func(int_frame_const_col, engine, request) 
request.node.add_marker(mark) df = int_frame_const_col columns = ["other", "col", "names"] - result = df.apply( - lambda x: Series([1, 2, 3], index=columns), - axis=1, - result_type="broadcast", - engine=engine, - ) - expected = df.copy() - tm.assert_frame_equal(result, expected) + + if engine is MockEngineDecorator: + with pytest.raises( + NotImplementedError, + match="result_type='broadcast' only implemented for the default engine", + ): + df.apply( + lambda x: Series([1, 2, 3], index=columns), + axis=1, + result_type="broadcast", + engine=engine, + ) + else: + result = df.apply( + lambda x: Series([1, 2, 3], index=columns), + axis=1, + result_type="broadcast", + engine=engine, + ) + expected = df.copy() + tm.assert_frame_equal(result, expected) def test_result_type_series_result(int_frame_const_col, engine, request): @@ -1209,7 +1355,7 @@ def test_agg_multiple_mixed_raises(): ) # sorted index - msg = "does not support reduction" + msg = "does not support operation" with pytest.raises(TypeError, match=msg): mdf.agg(["min", "sum"]) @@ -1286,6 +1432,14 @@ def test_agg_reduce(axis, float_frame): tm.assert_frame_equal(result, expected) +def test_named_agg_reduce_axis1_raises(float_frame): + name1, name2 = float_frame.axes[0].unique()[:2].sort_values() + msg = "Named aggregation is not supported when axis=1." + for axis in [1, "columns"]: + with pytest.raises(NotImplementedError, match=msg): + float_frame.agg(row1=(name1, "sum"), row2=(name2, "max"), axis=axis) + + def test_nuiscance_columns(): # GH 15015 df = DataFrame( @@ -1309,7 +1463,7 @@ def test_nuiscance_columns(): ) tm.assert_frame_equal(result, expected) - msg = "does not support reduction" + msg = "does not support operation" with pytest.raises(TypeError, match=msg): df.agg("sum") @@ -1317,7 +1471,7 @@ def test_nuiscance_columns(): expected = Series([6, 6.0, "foobarbaz"], index=["A", "B", "C"]) tm.assert_series_equal(result, expected) - msg = "does not support reduction" + msg = "does not support operation" with pytest.raises(TypeError, match=msg): df.agg(["sum"]) @@ -1487,9 +1641,9 @@ def test_apply_dtype(col): tm.assert_series_equal(result, expected) -def test_apply_mutating(using_array_manager, using_copy_on_write, warn_copy_on_write): +def test_apply_mutating(): # GH#35462 case where applied func pins a new BlockManager to a row - df = DataFrame({"a": range(100), "b": range(100, 200)}) + df = DataFrame({"a": range(10), "b": range(10, 20)}) df_orig = df.copy() def func(row): @@ -1501,17 +1655,10 @@ def func(row): expected = df.copy() expected["a"] += 1 - with tm.assert_cow_warning(warn_copy_on_write): - result = df.apply(func, axis=1) + result = df.apply(func, axis=1) tm.assert_frame_equal(result, expected) - if using_copy_on_write or using_array_manager: - # INFO(CoW) With copy on write, mutating a viewing row doesn't mutate the parent - # INFO(ArrayManager) With BlockManager, the row is a view and mutated in place, - # with ArrayManager the row is not a view, and thus not mutated in place - tm.assert_frame_equal(df, df_orig) - else: - tm.assert_frame_equal(df, result) + tm.assert_frame_equal(df, df_orig) def test_apply_empty_list_reduce(): @@ -1691,18 +1838,14 @@ def foo2(x, b=2, c=0): expected = df + 7 tm.assert_frame_equal(result, expected) - msg = "using .+ in Series.agg cannot aggregate and" - - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.agg([foo1, foo2], 0, 3, c=4) + result = df.agg([foo1, foo2], 0, 3, c=4) expected = DataFrame( [[8, 8], [9, 9], [10, 10]], columns=[["x", "x"], ["foo1", 
"foo2"]] ) tm.assert_frame_equal(result, expected) # TODO: the result below is wrong, should be fixed (GH53325) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.agg({"x": foo1}, 0, 3, c=4) + result = df.agg({"x": foo1}, 0, 3, c=4) expected = DataFrame([2, 3, 4], columns=["x"]) tm.assert_frame_equal(result, expected) @@ -1710,13 +1853,11 @@ def foo2(x, b=2, c=0): def test_agg_std(): df = DataFrame(np.arange(6).reshape(3, 2), columns=["A", "B"]) - with tm.assert_produces_warning(FutureWarning, match="using DataFrame.std"): - result = df.agg(np.std) + result = df.agg(np.std, ddof=1) expected = Series({"A": 2.0, "B": 2.0}, dtype=float) tm.assert_series_equal(result, expected) - with tm.assert_produces_warning(FutureWarning, match="using Series.std"): - result = df.agg([np.std]) + result = df.agg([np.std], ddof=1) expected = DataFrame({"A": 2.0, "B": 2.0}, index=["std"]) tm.assert_frame_equal(result, expected) @@ -1731,3 +1872,9 @@ def test_agg_dist_like_and_nonunique_columns(): result = df.agg({"A": "count"}) expected = df["A"].count() tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("engine_name", ["unknown", 25]) +def test_wrong_engine(engine_name): + with pytest.raises(ValueError, match="Unknown engine "): + DataFrame().apply(lambda x: x, engine=engine_name) diff --git a/pandas/tests/apply/test_frame_apply_relabeling.py b/pandas/tests/apply/test_frame_apply_relabeling.py index 723bdd349c0cb..57c109abba304 100644 --- a/pandas/tests/apply/test_frame_apply_relabeling.py +++ b/pandas/tests/apply/test_frame_apply_relabeling.py @@ -49,24 +49,20 @@ def test_agg_relabel_multi_columns_multi_methods(): def test_agg_relabel_partial_functions(): # GH 26513, test on partial, functools or more complex cases df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) - msg = "using Series.[mean|min]" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.agg(foo=("A", np.mean), bar=("A", "mean"), cat=("A", min)) + result = df.agg(foo=("A", np.mean), bar=("A", "mean"), cat=("A", min)) expected = pd.DataFrame( {"A": [1.5, 1.5, 1.0]}, index=pd.Index(["foo", "bar", "cat"]) ) tm.assert_frame_equal(result, expected) - msg = "using Series.[mean|min|max|sum]" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.agg( - foo=("A", min), - bar=("A", np.min), - cat=("B", max), - dat=("C", "min"), - f=("B", np.sum), - kk=("B", lambda x: min(x)), - ) + result = df.agg( + foo=("A", min), + bar=("A", np.min), + cat=("B", max), + dat=("C", "min"), + f=("B", np.sum), + kk=("B", lambda x: min(x)), + ) expected = pd.DataFrame( { "A": [1.0, 1.0, np.nan, np.nan, np.nan, np.nan], diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py index b5ad1094f5bf5..0503bf9166ec7 100644 --- a/pandas/tests/apply/test_invalid_arg.py +++ b/pandas/tests/apply/test_invalid_arg.py @@ -118,15 +118,15 @@ def test_dict_nested_renaming_depr(method): def test_missing_column(method, func): # GH 40004 obj = DataFrame({"A": [1]}) - match = re.escape("Column(s) ['B'] do not exist") - with pytest.raises(KeyError, match=match): + msg = r"Label\(s\) \['B'\] do not exist" + with pytest.raises(KeyError, match=msg): getattr(obj, method)(func) def test_transform_mixed_column_name_dtypes(): # GH39025 df = DataFrame({"a": ["1"]}) - msg = r"Column\(s\) \[1, 'b'\] do not exist" + msg = r"Label\(s\) \[1, 'b'\] do not exist" with pytest.raises(KeyError, match=msg): df.transform({"a": int, 1: str, "b": int}) @@ -218,11 +218,13 
@@ def transform(row): def test_agg_cython_table_raises_frame(df, func, expected, axis, using_infer_string): # GH 21224 if using_infer_string: - import pyarrow as pa - - expected = (expected, pa.lib.ArrowNotImplementedError) + expected = (expected, NotImplementedError) - msg = "can't multiply sequence by non-int of type 'str'|has no kernel" + msg = ( + "can't multiply sequence by non-int of type 'str'" + "|cannot perform cumprod with type str" # NotImplementedError python backend + "|operation 'cumprod' not supported for dtype 'str'" # TypeError pyarrow + ) warn = None if isinstance(func, str) else FutureWarning with pytest.raises(expected, match=msg): with tm.assert_produces_warning(warn, match="using DataFrame.cumprod"): @@ -251,12 +253,12 @@ def test_agg_cython_table_raises_series(series, func, expected, using_infer_stri if func == "median" or func is np.nanmedian or func is np.median: msg = r"Cannot convert \['a' 'b' 'c'\] to numeric" - if using_infer_string: - import pyarrow as pa - - expected = (expected, pa.lib.ArrowNotImplementedError) + if using_infer_string and func == "cumprod": + expected = (expected, NotImplementedError) - msg = msg + "|does not support|has no kernel" + msg = ( + msg + "|does not support|has no kernel|Cannot perform|cannot perform|operation" + ) warn = None if isinstance(func, str) else FutureWarning with pytest.raises(expected, match=msg): @@ -359,3 +361,15 @@ def test_transform_reducer_raises(all_reductions, frame_or_series, op_wrapper): msg = "Function did not transform" with pytest.raises(ValueError, match=msg): obj.transform(op) + + +def test_transform_missing_labels_raises(): + # GH 58474 + df = DataFrame({"foo": [2, 4, 6], "bar": [1, 2, 3]}, index=["A", "B", "C"]) + msg = r"Label\(s\) \['A', 'B'\] do not exist" + with pytest.raises(KeyError, match=msg): + df.transform({"A": lambda x: x + 2, "B": lambda x: x * 2}, axis=0) + + msg = r"Label\(s\) \['bar', 'foo'\] do not exist" + with pytest.raises(KeyError, match=msg): + df.transform({"foo": lambda x: x + 2, "bar": lambda x: x * 2}, axis=1) diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index 57b81711ddb48..75bc3f5b74b9d 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -1,15 +1,26 @@ import numpy as np import pytest +from pandas.compat import is_platform_arm import pandas.util._test_decorators as td +import pandas as pd from pandas import ( DataFrame, Index, ) import pandas._testing as tm +from pandas.util.version import Version pytestmark = [td.skip_if_no("numba"), pytest.mark.single_cpu] + +numba = pytest.importorskip("numba") +pytestmark.append( + pytest.mark.skipif( + Version(numba.__version__) == Version("0.61") and is_platform_arm(), + reason=f"Segfaults on ARM platforms with numba {numba.__version__}", + ) +) @pytest.fixture(params=[0, 1])
[[1, 2, 3], [1.0, 2.0, 3.0]]) def test_numba_numeric_colnames(colnames): - # Check that numeric column names lower properly and can be indxed on + # Check that numeric column names lower properly and can be indexed on df = DataFrame( np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.int64), columns=colnames ) @@ -100,13 +110,14 @@ def test_numba_nonunique_unsupported(apply_axis): def test_numba_unsupported_dtypes(apply_axis): + pytest.importorskip("pyarrow") f = lambda x: x df = DataFrame({"a": [1, 2], "b": ["a", "b"], "c": [4, 5]}) df["c"] = df["c"].astype("double[pyarrow]") with pytest.raises( ValueError, - match="Column b must have a numeric dtype. Found 'object|string' instead", + match="Column b must have a numeric dtype. Found 'object|str' instead", ): df.apply(f, engine="numba", axis=apply_axis) diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index df24fa08f48e1..9541b0b7495c7 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -75,17 +75,6 @@ def f(x): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("convert_dtype", [True, False]) -def test_apply_convert_dtype_deprecated(convert_dtype): - ser = Series(np.random.default_rng(2).standard_normal(10)) - - def func(x): - return x if x > 0 else np.nan - - with tm.assert_produces_warning(FutureWarning): - ser.apply(func, convert_dtype=convert_dtype, by_row="compat") - - def test_apply_args(): s = Series(["foo,bar"]) @@ -104,12 +93,7 @@ def f(x, a=0, b=0, c=0): return x + a + 10 * b + 100 * c s = Series([1, 2]) - msg = ( - "in Series.agg cannot aggregate and has been deprecated. " - "Use Series.transform to keep behavior unchanged." - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = s.agg(f, 0, *args, **kwargs) + result = s.agg(f, 0, *args, **kwargs) expected = s + increment tm.assert_series_equal(result, expected) @@ -124,13 +108,9 @@ def foo1(x, a=1, c=0): def foo2(x, b=2, c=0): return x + b + c - msg = "using .+ in Series.agg cannot aggregate and" - with tm.assert_produces_warning(FutureWarning, match=msg): - s.agg(foo1, 0, 3, c=4) - with tm.assert_produces_warning(FutureWarning, match=msg): - s.agg([foo1, foo2], 0, 3, c=4) - with tm.assert_produces_warning(FutureWarning, match=msg): - s.agg({"a": foo1, "b": foo2}, 0, 3, c=4) + s.agg(foo1, 0, 3, c=4) + s.agg([foo1, foo2], 0, 3, c=4) + s.agg({"a": foo1, "b": foo2}, 0, 3, c=4) def test_series_apply_map_box_timestamps(by_row): @@ -244,7 +224,7 @@ def test_apply_categorical(by_row, using_infer_string): result = ser.apply(lambda x: "A") exp = Series(["A"] * 7, name="XX", index=list("abcdefg")) tm.assert_series_equal(result, exp) - assert result.dtype == object if not using_infer_string else "string[pyarrow_numpy]" + assert result.dtype == object if not using_infer_string else "str" @pytest.mark.parametrize("series", [["1-1", "1-1", np.nan], ["1-1", "1-2", np.nan]]) @@ -256,10 +236,10 @@ def test_apply_categorical_with_nan_values(series, by_row): with pytest.raises(AttributeError, match=msg): s.apply(lambda x: x.split("-")[0], by_row=by_row) return - - result = s.apply(lambda x: x.split("-")[0], by_row=by_row) + # NaN handling for cat dtype fixed in GH 59966 + result = s.apply(lambda x: x.split("-")[0] if pd.notna(x) else False, by_row=by_row) result = result.astype(object) - expected = Series(["1", "1", np.nan], dtype="category") + expected = Series(["1", "1", False], dtype="category") expected = expected.astype(object) tm.assert_series_equal(result,
expected) @@ -325,7 +305,7 @@ def test_transform(string_series, by_row): @pytest.mark.parametrize("op", series_transform_kernels) def test_transform_partial_failure(op, request): # GH 35964 - if op in ("ffill", "bfill", "pad", "backfill", "shift"): + if op in ("ffill", "bfill", "shift"): request.applymarker( pytest.mark.xfail(reason=f"{op} is successful on any dtype") ) @@ -410,34 +390,26 @@ def test_apply_map_evaluate_lambdas_the_same(string_series, func, by_row): def test_agg_evaluate_lambdas(string_series): # GH53325 - # in the future, the result will be a Series class. - - with tm.assert_produces_warning(FutureWarning): - result = string_series.agg(lambda x: type(x)) - assert isinstance(result, Series) and len(result) == len(string_series) + result = string_series.agg(lambda x: type(x)) + assert result is Series - with tm.assert_produces_warning(FutureWarning): - result = string_series.agg(type) - assert isinstance(result, Series) and len(result) == len(string_series) + result = string_series.agg(type) + assert result is Series @pytest.mark.parametrize("op_name", ["agg", "apply"]) def test_with_nested_series(datetime_series, op_name): - # GH 2316 + # GH 2316 & GH52123 # .agg with a reducer and a transform, what to do - msg = "cannot aggregate" - warning = FutureWarning if op_name == "agg" else None - with tm.assert_produces_warning(warning, match=msg): - # GH52123 - result = getattr(datetime_series, op_name)( - lambda x: Series([x, x**2], index=["x", "x^2"]) - ) - expected = DataFrame({"x": datetime_series, "x^2": datetime_series**2}) - tm.assert_frame_equal(result, expected) - - with tm.assert_produces_warning(FutureWarning, match=msg): - result = datetime_series.agg(lambda x: Series([x, x**2], index=["x", "x^2"])) - tm.assert_frame_equal(result, expected) + result = getattr(datetime_series, op_name)( + lambda x: Series([x, x**2], index=["x", "x^2"]) + ) + if op_name == "apply": + expected = DataFrame({"x": datetime_series, "x^2": datetime_series**2}) + tm.assert_frame_equal(result, expected) + else: + expected = Series([datetime_series, datetime_series**2], index=["x", "x^2"]) + tm.assert_series_equal(result, expected) def test_replicate_describe(string_series): @@ -575,10 +547,7 @@ def test_apply_listlike_reducer(string_series, ops, names, how, kwargs): # GH 39140 expected = Series({name: op(string_series) for name, op in zip(names, ops)}) expected.name = "series" - warn = FutureWarning if how == "agg" else None - msg = f"using Series.[{'|'.join(names)}]" - with tm.assert_produces_warning(warn, match=msg): - result = getattr(string_series, how)(ops, **kwargs) + result = getattr(string_series, how)(ops, **kwargs) tm.assert_series_equal(result, expected) @@ -599,10 +568,7 @@ def test_apply_dictlike_reducer(string_series, ops, how, kwargs, by_row): # GH 39140 expected = Series({name: op(string_series) for name, op in ops.items()}) expected.name = string_series.name - warn = FutureWarning if how == "agg" else None - msg = "using Series.[sum|mean]" - with tm.assert_produces_warning(warn, match=msg): - result = getattr(string_series, how)(ops, **kwargs) + result = getattr(string_series, how)(ops, **kwargs) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/apply/test_series_apply_relabeling.py b/pandas/tests/apply/test_series_apply_relabeling.py index cdfa054f91c9b..c0a285e6eb38c 100644 --- a/pandas/tests/apply/test_series_apply_relabeling.py +++ b/pandas/tests/apply/test_series_apply_relabeling.py @@ -14,12 +14,8 @@ def test_relabel_no_duplicated_method(): expected = 
df["B"].agg({"foo": "min", "bar": "max"}) tm.assert_series_equal(result, expected) - msg = "using Series.[sum|min|max]" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df["B"].agg(foo=sum, bar=min, cat="max") - msg = "using Series.[sum|min|max]" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = df["B"].agg({"foo": sum, "bar": min, "cat": "max"}) + result = df["B"].agg(foo=sum, bar=min, cat="max") + expected = df["B"].agg({"foo": sum, "bar": min, "cat": "max"}) tm.assert_series_equal(result, expected) @@ -32,8 +28,6 @@ def test_relabel_duplicated_method(): expected = pd.Series([6, 6], index=["foo", "bar"], name="A") tm.assert_series_equal(result, expected) - msg = "using Series.min" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df["B"].agg(foo=min, bar="min") + result = df["B"].agg(foo=min, bar="min") expected = pd.Series([1, 1], index=["foo", "bar"], name="B") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py index 17e8322dc40e1..e5a9492630b13 100644 --- a/pandas/tests/apply/test_str.py +++ b/pandas/tests/apply/test_str.py @@ -4,6 +4,10 @@ import numpy as np import pytest +from pandas.compat import ( + WASM, +) + from pandas.core.dtypes.common import is_number from pandas import ( @@ -19,34 +23,24 @@ @pytest.mark.parametrize("func", ["sum", "mean", "min", "max", "std"]) @pytest.mark.parametrize( - "args,kwds", + "kwds", [ - pytest.param([], {}, id="no_args_or_kwds"), - pytest.param([1], {}, id="axis_from_args"), - pytest.param([], {"axis": 1}, id="axis_from_kwds"), - pytest.param([], {"numeric_only": True}, id="optional_kwds"), - pytest.param([1, True], {"numeric_only": True}, id="args_and_kwds"), + pytest.param({}, id="no_kwds"), + pytest.param({"axis": 1}, id="on_axis"), + pytest.param({"numeric_only": True}, id="func_kwds"), + pytest.param({"axis": 1, "numeric_only": True}, id="axis_and_func_kwds"), ], ) @pytest.mark.parametrize("how", ["agg", "apply"]) -def test_apply_with_string_funcs(request, float_frame, func, args, kwds, how): - if len(args) > 1 and how == "agg": - request.applymarker( - pytest.mark.xfail( - raises=TypeError, - reason="agg/apply signature mismatch - agg passes 2nd " - "argument to func", - ) - ) - result = getattr(float_frame, how)(func, *args, **kwds) - expected = getattr(float_frame, func)(*args, **kwds) +def test_apply_with_string_funcs(float_frame, func, kwds, how): + result = getattr(float_frame, how)(func, **kwds) + expected = getattr(float_frame, func)(**kwds) tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("arg", ["sum", "mean", "min", "max", "std"]) -def test_with_string_args(datetime_series, arg): - result = datetime_series.apply(arg) - expected = getattr(datetime_series, arg)() +def test_with_string_args(datetime_series, all_numeric_reductions): + result = datetime_series.apply(all_numeric_reductions) + expected = getattr(datetime_series, all_numeric_reductions)() assert result == expected @@ -64,6 +58,7 @@ def test_apply_np_reducer(op, how): tm.assert_series_equal(result, expected) +@pytest.mark.skipif(WASM, reason="No fp exception support in wasm") @pytest.mark.parametrize( "op", ["abs", "ceil", "cos", "cumsum", "exp", "log", "sqrt", "square"] ) @@ -271,7 +266,7 @@ def test_transform_groupby_kernel_series(request, string_series, op): @pytest.mark.parametrize("op", frame_transform_kernels) -def test_transform_groupby_kernel_frame(request, axis, float_frame, op): +def 
test_transform_groupby_kernel_frame(request, float_frame, op): if op == "ngroup": request.applymarker( pytest.mark.xfail(raises=ValueError, reason="ngroup not valid for NDFrame") @@ -280,40 +275,26 @@ def test_transform_groupby_kernel_frame(request, axis, float_frame, op): # GH 35964 args = [0.0] if op == "fillna" else [] - if axis in (0, "index"): - ones = np.ones(float_frame.shape[0]) - msg = "The 'axis' keyword in DataFrame.groupby is deprecated" - else: - ones = np.ones(float_frame.shape[1]) - msg = "DataFrame.groupby with axis=1 is deprecated" - - with tm.assert_produces_warning(FutureWarning, match=msg): - gb = float_frame.groupby(ones, axis=axis) + ones = np.ones(float_frame.shape[0]) + gb = float_frame.groupby(ones) warn = FutureWarning if op == "fillna" else None op_msg = "DataFrameGroupBy.fillna is deprecated" with tm.assert_produces_warning(warn, match=op_msg): expected = gb.transform(op, *args) - result = float_frame.transform(op, axis, *args) + result = float_frame.transform(op, 0, *args) tm.assert_frame_equal(result, expected) # same thing, but ensuring we have multiple blocks assert "E" not in float_frame.columns float_frame["E"] = float_frame["A"].copy() - assert len(float_frame._mgr.arrays) > 1 + assert len(float_frame._mgr.blocks) > 1 - if axis in (0, "index"): - ones = np.ones(float_frame.shape[0]) - else: - ones = np.ones(float_frame.shape[1]) - with tm.assert_produces_warning(FutureWarning, match=msg): - gb2 = float_frame.groupby(ones, axis=axis) - warn = FutureWarning if op == "fillna" else None - op_msg = "DataFrameGroupBy.fillna is deprecated" - with tm.assert_produces_warning(warn, match=op_msg): - expected2 = gb2.transform(op, *args) - result2 = float_frame.transform(op, axis, *args) + ones = np.ones(float_frame.shape[0]) + gb2 = float_frame.groupby(ones) + expected2 = gb2.transform(op, *args) + result2 = float_frame.transform(op, 0, *args) tm.assert_frame_equal(result2, expected2) diff --git a/pandas/tests/arithmetic/common.py b/pandas/tests/arithmetic/common.py index b608df1554154..0730729e2fd94 100644 --- a/pandas/tests/arithmetic/common.py +++ b/pandas/tests/arithmetic/common.py @@ -1,6 +1,7 @@ """ Assertion helpers for arithmetic tests. """ + import numpy as np import pytest @@ -19,13 +20,16 @@ def assert_cannot_add(left, right, msg="cannot add"): """ - Helper to assert that left and right cannot be added. + Helper function to assert that two objects cannot be added. Parameters ---------- left : object + The first operand. right : object + The second operand. msg : str, default "cannot add" + The error message expected in the TypeError. """ with pytest.raises(TypeError, match=msg): left + right @@ -35,13 +39,17 @@ def assert_cannot_add(left, right, msg="cannot add"): def assert_invalid_addsub_type(left, right, msg=None): """ - Helper to assert that left and right can be neither added nor subtracted. + Helper function to assert that two objects can + neither be added nor subtracted. Parameters ---------- left : object + The first operand. right : object + The second operand. msg : str or None, default None + The error message expected in the TypeError. 
""" with pytest.raises(TypeError, match=msg): left + right diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index dbff88dc6f4f6..26dfcf088e74b 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -5,6 +5,7 @@ datetime, time, timedelta, + timezone, ) from itertools import ( product, @@ -14,11 +15,9 @@ import numpy as np import pytest -import pytz from pandas._libs.tslibs.conversion import localize_pydatetime from pandas._libs.tslibs.offsets import shift_months -from pandas.errors import PerformanceWarning import pandas as pd from pandas import ( @@ -182,12 +181,12 @@ class TestDatetime64SeriesComparison: @pytest.mark.parametrize( "op, expected", [ - (operator.eq, Series([False, False, True])), - (operator.ne, Series([True, True, False])), - (operator.lt, Series([False, False, False])), - (operator.gt, Series([False, False, False])), - (operator.ge, Series([False, False, True])), - (operator.le, Series([False, False, True])), + (operator.eq, [False, False, True]), + (operator.ne, [True, True, False]), + (operator.lt, [False, False, False]), + (operator.gt, [False, False, False]), + (operator.ge, [False, False, True]), + (operator.le, [False, False, True]), ], ) def test_nat_comparisons( @@ -210,7 +209,7 @@ def test_nat_comparisons( result = op(left, right) - tm.assert_series_equal(result, expected) + tm.assert_series_equal(result, Series(expected)) @pytest.mark.parametrize( "data", @@ -390,6 +389,22 @@ def test_dt64_compare_datetime_scalar(self, datetimelike, op, expected): expected = Series(expected, name="A") tm.assert_series_equal(result, expected) + def test_ts_series_numpy_maximum(self): + # GH#50864, test numpy.maximum does not fail + # given a TimeStamp and Series(with dtype datetime64) comparison + ts = Timestamp("2024-07-01") + ts_series = Series( + ["2024-06-01", "2024-07-01", "2024-08-01"], + dtype="datetime64[us]", + ) + + expected = Series( + ["2024-07-01", "2024-07-01", "2024-08-01"], + dtype="datetime64[us]", + ) + + tm.assert_series_equal(expected, np.maximum(ts, ts_series)) + class TestDatetimeIndexComparisons: # TODO: moved from tests.indexes.test_base; parametrize and de-duplicate @@ -1008,14 +1023,16 @@ def test_dt64arr_sub_NaT(self, box_with_array, unit): # ------------------------------------------------------------- # Subtraction of datetime-like array-like - def test_dt64arr_sub_dt64object_array(self, box_with_array, tz_naive_fixture): + def test_dt64arr_sub_dt64object_array( + self, performance_warning, box_with_array, tz_naive_fixture + ): dti = date_range("2016-01-01", periods=3, tz=tz_naive_fixture) expected = dti - dti obj = tm.box_expected(dti, box_with_array) expected = tm.box_expected(expected, box_with_array).astype(object) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): result = obj - obj.astype(object) tm.assert_equal(result, expected) @@ -1079,7 +1096,7 @@ def test_dt64arr_add_dtlike_raises(self, tz_naive_fixture, box_with_array): @pytest.mark.parametrize("freq", ["h", "D", "W", "2ME", "MS", "QE", "B", None]) @pytest.mark.parametrize("dtype", [None, "uint8"]) def test_dt64arr_addsub_intlike( - self, request, dtype, index_or_series_or_array, freq, tz_naive_fixture + self, dtype, index_or_series_or_array, freq, tz_naive_fixture ): # GH#19959, GH#19123, GH#19012 # GH#55860 use index_or_series_or_array instead of box_with_array @@ -1221,7 +1238,6 @@ class TestDatetime64DateOffsetArithmetic: # 
Tick DateOffsets # TODO: parametrize over timezone? - @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) def test_dt64arr_series_add_tick_DateOffset(self, box_with_array, unit): # GH#4532 # operate with pd.offsets @@ -1311,7 +1327,6 @@ def test_dti_add_tick_tzaware(self, tz_aware_fixture, box_with_array): # ------------------------------------------------------------- # RelativeDelta DateOffsets - @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) def test_dt64arr_add_sub_relativedelta_offsets(self, box_with_array, unit): # GH#10699 vec = DatetimeIndex( @@ -1387,7 +1402,6 @@ def test_dt64arr_add_sub_relativedelta_offsets(self, box_with_array, unit): "SemiMonthBegin", "Week", ("Week", {"weekday": 3}), - "Week", ("Week", {"weekday": 6}), "BusinessDay", "BDay", @@ -1422,7 +1436,6 @@ def test_dt64arr_add_sub_relativedelta_offsets(self, box_with_array, unit): ) @pytest.mark.parametrize("normalize", [True, False]) @pytest.mark.parametrize("n", [0, 5]) - @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) @pytest.mark.parametrize("tz", [None, "US/Central"]) def test_dt64arr_add_sub_DateOffsets( self, box_with_array, n, normalize, cls_and_kwargs, unit, tz @@ -1489,16 +1502,15 @@ def test_dt64arr_add_sub_DateOffsets( @pytest.mark.parametrize( "other", [ - np.array([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)]), - np.array([pd.offsets.DateOffset(years=1), pd.offsets.MonthEnd()]), - np.array( # matching offsets - [pd.offsets.DateOffset(years=1), pd.offsets.DateOffset(years=1)] - ), + [pd.offsets.MonthEnd(), pd.offsets.Day(n=2)], + [pd.offsets.DateOffset(years=1), pd.offsets.MonthEnd()], + # matching offsets + [pd.offsets.DateOffset(years=1), pd.offsets.DateOffset(years=1)], ], ) @pytest.mark.parametrize("op", [operator.add, roperator.radd, operator.sub]) def test_dt64arr_add_sub_offset_array( - self, tz_naive_fixture, box_with_array, op, other + self, performance_warning, tz_naive_fixture, box_with_array, op, other ): # GH#18849 # GH#10699 array of offsets @@ -1506,11 +1518,11 @@ def test_dt64arr_add_sub_offset_array( tz = tz_naive_fixture dti = date_range("2017-01-01", periods=2, tz=tz) dtarr = tm.box_expected(dti, box_with_array) - + other = np.array(other) expected = DatetimeIndex([op(dti[n], other[n]) for n in range(len(dti))]) expected = tm.box_expected(expected, box_with_array).astype(object) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): res = op(dtarr, other) tm.assert_equal(res, expected) @@ -1519,7 +1531,7 @@ def test_dt64arr_add_sub_offset_array( if box_with_array is pd.array and op is roperator.radd: # We expect a NumpyExtensionArray, not ndarray[object] here expected = pd.array(expected, dtype=object) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): res = op(dtarr, other) tm.assert_equal(res, expected) @@ -1586,6 +1598,38 @@ def test_dti_add_sub_nonzero_mth_offset( expected = tm.box_expected(expected, box_with_array, False) tm.assert_equal(result, expected) + def test_dt64arr_series_add_DateOffset_with_milli(self): + # GH 57529 + dti = DatetimeIndex( + [ + "2000-01-01 00:00:00.012345678", + "2000-01-31 00:00:00.012345678", + "2000-02-29 00:00:00.012345678", + ], + dtype="datetime64[ns]", + ) + result = dti + DateOffset(milliseconds=4) + expected = DatetimeIndex( + [ + "2000-01-01 00:00:00.016345678", + "2000-01-31 00:00:00.016345678", + "2000-02-29 00:00:00.016345678", + ], + dtype="datetime64[ns]", + ) + tm.assert_index_equal(result, expected) 
+ + result = dti + DateOffset(days=1, milliseconds=4) + expected = DatetimeIndex( + [ + "2000-01-02 00:00:00.016345678", + "2000-02-01 00:00:00.016345678", + "2000-03-01 00:00:00.016345678", + ], + dtype="datetime64[ns]", + ) + tm.assert_index_equal(result, expected) + class TestDatetime64OverflowHandling: # TODO: box + de-duplicate @@ -1842,8 +1886,10 @@ def test_dt64tz_series_sub_dtitz(self): def test_sub_datetime_compat(self, unit): # see GH#14088 - ser = Series([datetime(2016, 8, 23, 12, tzinfo=pytz.utc), NaT]).dt.as_unit(unit) - dt = datetime(2016, 8, 22, 12, tzinfo=pytz.utc) + ser = Series([datetime(2016, 8, 23, 12, tzinfo=timezone.utc), NaT]).dt.as_unit( + unit + ) + dt = datetime(2016, 8, 22, 12, tzinfo=timezone.utc) # The datetime object has "us" so we upcast lower units exp_unit = tm.get_finest_unit(unit, "us") exp = Series([Timedelta("1 days"), NaT]).dt.as_unit(exp_unit) @@ -2308,7 +2354,7 @@ def test_dti_add_series(self, tz_naive_fixture, names): @pytest.mark.parametrize("op", [operator.add, roperator.radd, operator.sub]) def test_dti_addsub_offset_arraylike( - self, tz_naive_fixture, names, op, index_or_series + self, performance_warning, tz_naive_fixture, names, op, index_or_series ): # GH#18849, GH#19744 other_box = index_or_series @@ -2319,7 +2365,7 @@ def test_dti_addsub_offset_arraylike( xbox = get_upcast_box(dti, other) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): res = op(dti, other) expected = DatetimeIndex( @@ -2330,7 +2376,7 @@ def test_dti_addsub_offset_arraylike( @pytest.mark.parametrize("other_box", [pd.Index, np.array]) def test_dti_addsub_object_arraylike( - self, tz_naive_fixture, box_with_array, other_box + self, performance_warning, tz_naive_fixture, box_with_array, other_box ): tz = tz_naive_fixture @@ -2342,21 +2388,20 @@ def test_dti_addsub_object_arraylike( expected = DatetimeIndex(["2017-01-31", "2017-01-06"], tz=tz_naive_fixture) expected = tm.box_expected(expected, xbox).astype(object) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): result = dtarr + other tm.assert_equal(result, expected) expected = DatetimeIndex(["2016-12-31", "2016-12-29"], tz=tz_naive_fixture) expected = tm.box_expected(expected, xbox).astype(object) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): result = dtarr - other tm.assert_equal(result, expected) @pytest.mark.parametrize("years", [-1, 0, 1]) @pytest.mark.parametrize("months", [-2, 0, 2]) -@pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) def test_shift_months(years, months, unit): dti = DatetimeIndex( [ @@ -2376,7 +2421,7 @@ def test_shift_months(years, months, unit): tm.assert_index_equal(actual, expected) -def test_dt64arr_addsub_object_dtype_2d(): +def test_dt64arr_addsub_object_dtype_2d(performance_warning): # block-wise DataFrame operations will require operating on 2D # DatetimeArray/TimedeltaArray, so check that specifically. 
dti = date_range("1994-02-13", freq="2W", periods=4) @@ -2385,14 +2430,14 @@ def test_dt64arr_addsub_object_dtype_2d(): other = np.array([[pd.offsets.Day(n)] for n in range(4)]) assert other.shape == dta.shape - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): result = dta + other - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): expected = (dta[:, 0] + other[:, 0]).reshape(-1, 1) tm.assert_numpy_array_equal(result, expected) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): # Case where we expect to get a TimedeltaArray back result2 = dta - dta.astype(object) diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index d8c1786b6b422..d205569270705 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -37,14 +37,6 @@ def switch_numexpr_min_elements(request, monkeypatch): yield request.param -@pytest.fixture(params=[Index, Series, tm.to_array]) -def box_pandas_1d_array(request): - """ - Fixture to test behavior for Index, Series and tm.to_array classes - """ - return request.param - - @pytest.fixture( params=[ # TODO: add more dtypes here @@ -62,17 +54,6 @@ def numeric_idx(request): return request.param -@pytest.fixture( - params=[Index, Series, tm.to_array, np.array, list], ids=lambda x: x.__name__ -) -def box_1d_array(request): - """ - Fixture to test behavior for Index, Series, tm.to_array, numpy Array and list - classes - """ - return request.param - - def adjust_negative_zero(zero, expected): """ Helper to adjust the expected result if we are dividing by -0.0 @@ -586,16 +567,12 @@ def test_df_div_zero_series_does_not_commute(self): # ------------------------------------------------------------------ # Mod By Zero - def test_df_mod_zero_df(self, using_array_manager): + def test_df_mod_zero_df(self): # GH#3590, modulo as ints df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}) # this is technically wrong, as the integer portion is coerced to float first = Series([0, 0, 0, 0]) - if not using_array_manager: - # INFO(ArrayManager) BlockManager doesn't preserve dtype per column - # while ArrayManager performs op column-wisedoes and thus preserves - # dtype if possible - first = first.astype("float64") + first = first.astype("float64") second = Series([np.nan, np.nan, np.nan, 0]) expected = pd.DataFrame({"first": first, "second": second}) result = df % df @@ -1139,11 +1116,10 @@ def test_ufunc_compat(self, holder, dtype): tm.assert_equal(result, expected) # TODO: add more dtypes - @pytest.mark.parametrize("holder", [Index, Series]) @pytest.mark.parametrize("dtype", [np.int64, np.uint64, np.float64]) - def test_ufunc_coercions(self, holder, dtype): - idx = holder([1, 2, 3, 4, 5], dtype=dtype, name="x") - box = Series if holder is Series else Index + def test_ufunc_coercions(self, index_or_series, dtype): + idx = index_or_series([1, 2, 3, 4, 5], dtype=dtype, name="x") + box = index_or_series result = np.sqrt(idx) assert result.dtype == "f8" and isinstance(result, box) @@ -1475,7 +1451,7 @@ def test_fill_value_inf_masking(): expected = pd.DataFrame( {"A": [np.inf, 1.0, 0.0, 1.0], "B": [0.0, np.nan, 0.0, np.nan]} ) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_index_type=False) def test_dataframe_div_silenced(): @@ -1503,6 +1479,8 @@ def test_dataframe_div_silenced(): 
"data, expected_data", [([0, 1, 2], [0, 2, 4])], ) +@pytest.mark.parametrize("box_pandas_1d_array", [Index, Series, tm.to_array]) +@pytest.mark.parametrize("box_1d_array", [Index, Series, tm.to_array, np.array, list]) def test_integer_array_add_list_like( box_pandas_1d_array, box_1d_array, data, expected_data ): diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py index 4ffd76722286a..bc0f78d3aa01a 100644 --- a/pandas/tests/arithmetic/test_object.py +++ b/pandas/tests/arithmetic/test_object.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype - import pandas.util._test_decorators as td import pandas as pd @@ -303,7 +301,6 @@ def test_iadd_string(self): index += "_x" assert "a_x" in index - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="add doesn't work") def test_add(self): index = pd.Index([str(i) for i in range(10)]) expected = pd.Index(index.values * 2) @@ -318,24 +315,17 @@ def test_add(self): expected = pd.Index(["1a", "1b", "1c"]) tm.assert_index_equal("1" + index, expected) - def test_sub_fail(self, using_infer_string): + def test_sub_fail(self): index = pd.Index([str(i) for i in range(10)]) - if using_infer_string: - import pyarrow as pa - - err = pa.lib.ArrowNotImplementedError - msg = "has no kernel" - else: - err = TypeError - msg = "unsupported operand type|Cannot broadcast" - with pytest.raises(err, match=msg): + msg = "unsupported operand type|Cannot broadcast|sub' not supported" + with pytest.raises(TypeError, match=msg): index - "a" - with pytest.raises(err, match=msg): + with pytest.raises(TypeError, match=msg): index - index - with pytest.raises(err, match=msg): + with pytest.raises(TypeError, match=msg): index - index.tolist() - with pytest.raises(err, match=msg): + with pytest.raises(TypeError, match=msg): index.tolist() - index def test_sub_object(self): diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index 5535fe8ff928d..67762e0b89c73 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -12,7 +12,6 @@ Timestamp, to_offset, ) -from pandas.errors import PerformanceWarning import pandas as pd from pandas import ( @@ -868,7 +867,7 @@ def test_parr_sub_td64array(self, box_with_array, tdi_freq, pi_freq): # operations with array/Index of DateOffset objects @pytest.mark.parametrize("box", [np.array, pd.Index]) - def test_pi_add_offset_array(self, box): + def test_pi_add_offset_array(self, performance_warning, box): # GH#18849 pi = PeriodIndex([Period("2015Q1"), Period("2016Q2")]) offs = box( @@ -879,11 +878,11 @@ def test_pi_add_offset_array(self, box): ) expected = PeriodIndex([Period("2015Q2"), Period("2015Q4")]).astype(object) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): res = pi + offs tm.assert_index_equal(res, expected) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): res2 = offs + pi tm.assert_index_equal(res2, expected) @@ -892,14 +891,14 @@ def test_pi_add_offset_array(self, box): # a PerformanceWarning and _then_ raise a TypeError. 
msg = r"Input cannot be converted to Period\(freq=Q-DEC\)" with pytest.raises(IncompatibleFrequency, match=msg): - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): pi + unanchored with pytest.raises(IncompatibleFrequency, match=msg): - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): unanchored + pi @pytest.mark.parametrize("box", [np.array, pd.Index]) - def test_pi_sub_offset_array(self, box): + def test_pi_sub_offset_array(self, performance_warning, box): # GH#18824 pi = PeriodIndex([Period("2015Q1"), Period("2016Q2")]) other = box( @@ -912,7 +911,7 @@ def test_pi_sub_offset_array(self, box): expected = PeriodIndex([pi[n] - other[n] for n in range(len(pi))]) expected = expected.astype(object) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): res = pi - other tm.assert_index_equal(res, expected) @@ -922,10 +921,10 @@ def test_pi_sub_offset_array(self, box): # a PerformanceWarning and _then_ raise a TypeError. msg = r"Input has different freq=-1M from Period\(freq=Q-DEC\)" with pytest.raises(IncompatibleFrequency, match=msg): - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): pi - anchored with pytest.raises(IncompatibleFrequency, match=msg): - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): anchored - pi def test_pi_add_iadd_int(self, one): @@ -1087,7 +1086,7 @@ def test_parr_add_timedeltalike_minute_gt1(self, three_days, box_with_array): with pytest.raises(TypeError, match=msg): other - rng - @pytest.mark.parametrize("freqstr", ["5ns", "5us", "5ms", "5s", "5min", "5h", "5d"]) + @pytest.mark.parametrize("freqstr", ["5ns", "5us", "5ms", "5s", "5min", "5h", "5D"]) def test_parr_add_timedeltalike_tick_gt1(self, three_days, freqstr, box_with_array): # GH#23031 adding a time-delta-like offset to a PeriodArray that has # tick-like frequency with n != 1 @@ -1329,13 +1328,13 @@ def test_parr_add_sub_index(self): expected = pi - pi tm.assert_index_equal(result, expected) - def test_parr_add_sub_object_array(self): + def test_parr_add_sub_object_array(self, performance_warning): pi = period_range("2000-12-31", periods=3, freq="D") parr = pi.array other = np.array([Timedelta(days=1), pd.offsets.Day(2), 3]) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): result = parr + other expected = PeriodIndex( @@ -1343,7 +1342,7 @@ def test_parr_add_sub_object_array(self): )._data.astype(object) tm.assert_equal(result, expected) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): result = parr - other expected = PeriodIndex(["2000-12-30"] * 3, freq="D")._data.astype(object) @@ -1362,7 +1361,12 @@ def test_period_add_timestamp_raises(self, box_with_array): arr + ts with pytest.raises(TypeError, match=msg): ts + arr - msg = "cannot add PeriodArray and DatetimeArray" + if box_with_array is pd.DataFrame: + # TODO: before implementing resolution-inference we got the same + # message with DataFrame and non-DataFrame. Why did that change? 
+ msg = "cannot add PeriodArray and Timestamp" + else: + msg = "cannot add PeriodArray and DatetimeArray" with pytest.raises(TypeError, match=msg): arr + Series([ts]) with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index 007d1e670e1e0..87e085fb22878 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -8,10 +8,8 @@ import numpy as np import pytest -from pandas.errors import ( - OutOfBoundsDatetime, - PerformanceWarning, -) +from pandas.compat import WASM +from pandas.errors import OutOfBoundsDatetime import pandas as pd from pandas import ( @@ -577,7 +575,9 @@ def test_tda_add_sub_index(self): expected = tdi - tdi tm.assert_index_equal(result, expected) - def test_tda_add_dt64_object_array(self, box_with_array, tz_naive_fixture): + def test_tda_add_dt64_object_array( + self, performance_warning, box_with_array, tz_naive_fixture + ): # Result should be cast back to DatetimeArray box = box_with_array @@ -588,7 +588,7 @@ def test_tda_add_dt64_object_array(self, box_with_array, tz_naive_fixture): obj = tm.box_expected(tdi, box) other = tm.box_expected(dti, box) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): result = obj + other.astype(object) tm.assert_equal(result, other.astype(object)) @@ -1282,7 +1282,9 @@ def test_td64arr_sub_timedeltalike(self, two_hours, box_with_array): # ------------------------------------------------------------------ # __add__/__sub__ with DateOffsets and arrays of DateOffsets - def test_td64arr_add_sub_offset_index(self, names, box_with_array): + def test_td64arr_add_sub_offset_index( + self, performance_warning, names, box_with_array + ): # GH#18849, GH#19744 box = box_with_array exname = get_expected_name(box, names) @@ -1299,22 +1301,22 @@ def test_td64arr_add_sub_offset_index(self, names, box_with_array): ) tdi = tm.box_expected(tdi, box) - expected = tm.box_expected(expected, box).astype(object, copy=False) - expected_sub = tm.box_expected(expected_sub, box).astype(object, copy=False) + expected = tm.box_expected(expected, box).astype(object) + expected_sub = tm.box_expected(expected_sub, box).astype(object) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): res = tdi + other tm.assert_equal(res, expected) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): res2 = other + tdi tm.assert_equal(res2, expected) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): res_sub = tdi - other tm.assert_equal(res_sub, expected_sub) - def test_td64arr_add_sub_offset_array(self, box_with_array): + def test_td64arr_add_sub_offset_array(self, performance_warning, box_with_array): # GH#18849, GH#18824 box = box_with_array tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"]) @@ -1330,20 +1332,22 @@ def test_td64arr_add_sub_offset_array(self, box_with_array): tdi = tm.box_expected(tdi, box) expected = tm.box_expected(expected, box).astype(object) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): res = tdi + other tm.assert_equal(res, expected) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): res2 = other + tdi tm.assert_equal(res2, expected) expected_sub = 
tm.box_expected(expected_sub, box_with_array).astype(object) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): res_sub = tdi - other tm.assert_equal(res_sub, expected_sub) - def test_td64arr_with_offset_series(self, names, box_with_array): + def test_td64arr_with_offset_series( + self, performance_warning, names, box_with_array + ): # GH#18849 box = box_with_array box2 = Series if box in [Index, tm.to_array, pd.array] else box @@ -1358,11 +1362,11 @@ def test_td64arr_with_offset_series(self, names, box_with_array): obj = tm.box_expected(tdi, box) expected_add = tm.box_expected(expected_add, box2).astype(object) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): res = obj + other tm.assert_equal(res, expected_add) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): res2 = other + obj tm.assert_equal(res2, expected_add) @@ -1371,12 +1375,14 @@ def test_td64arr_with_offset_series(self, names, box_with_array): ) expected_sub = tm.box_expected(expected_sub, box2).astype(object) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): res3 = obj - other tm.assert_equal(res3, expected_sub) @pytest.mark.parametrize("obox", [np.array, Index, Series]) - def test_td64arr_addsub_anchored_offset_arraylike(self, obox, box_with_array): + def test_td64arr_addsub_anchored_offset_arraylike( + self, performance_warning, obox, box_with_array + ): # GH#18824 tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"]) tdi = tm.box_expected(tdi, box_with_array) @@ -1387,22 +1393,22 @@ def test_td64arr_addsub_anchored_offset_arraylike(self, obox, box_with_array): # a PerformanceWarning and _then_ raise a TypeError. 
msg = "has incorrect type|cannot add the type MonthEnd" with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): tdi + anchored with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): anchored + tdi with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): tdi - anchored with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): anchored - tdi # ------------------------------------------------------------------ # Unsorted - def test_td64arr_add_sub_object_array(self, box_with_array): + def test_td64arr_add_sub_object_array(self, performance_warning, box_with_array): box = box_with_array xbox = np.ndarray if box is pd.array else box @@ -1411,7 +1417,7 @@ def test_td64arr_add_sub_object_array(self, box_with_array): other = np.array([Timedelta(days=1), offsets.Day(2), Timestamp("2000-01-04")]) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): result = tdarr + other expected = Index( @@ -1422,10 +1428,10 @@ def test_td64arr_add_sub_object_array(self, box_with_array): msg = "unsupported operand type|cannot subtract a datelike" with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): tdarr - other - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): result = other - tdarr expected = Index([Timedelta(0), Timedelta(0), Timestamp("2000-01-01")]) @@ -1454,7 +1460,13 @@ def test_td64arr_mul_int(self, box_with_array): def test_td64arr_mul_tdlike_scalar_raises(self, two_hours, box_with_array): rng = timedelta_range("1 days", "10 days", name="foo") rng = tm.box_expected(rng, box_with_array) - msg = "argument must be an integer|cannot use operands with types dtype" + msg = "|".join( + [ + "argument must be an integer", + "cannot use operands with types dtype", + "Cannot multiply with", + ] + ) with pytest.raises(TypeError, match=msg): rng * two_hours @@ -1736,9 +1748,8 @@ def test_td64_div_object_mixed_result(self, box_with_array): # ------------------------------------------------------------------ # __floordiv__, __rfloordiv__ - def test_td64arr_floordiv_td64arr_with_nat( - self, box_with_array, using_array_manager - ): + @pytest.mark.skipif(WASM, reason="no fp exception support in wasm") + def test_td64arr_floordiv_td64arr_with_nat(self, box_with_array): # GH#35529 box = box_with_array xbox = np.ndarray if box is pd.array else box @@ -1751,11 +1762,6 @@ def test_td64arr_floordiv_td64arr_with_nat( expected = np.array([1.0, 1.0, np.nan], dtype=np.float64) expected = tm.box_expected(expected, xbox) - if box is DataFrame and using_array_manager: - # INFO(ArrayManager) floordiv returns integer, and ArrayManager - # performs ops column-wise and thus preserves int64 dtype for - # columns without missing values - expected[[0, 1]] = expected[[0, 1]].astype("int64") with tm.maybe_produces_warning( RuntimeWarning, box is pd.array, check_stacklevel=False @@ -1813,7 +1819,9 @@ def test_td64arr_floordiv_int(self, box_with_array): # TODO: operations with timedelta-like arrays, numeric arrays, # reversed ops - def 
test_td64arr_mod_tdscalar(self, box_with_array, three_days): + def test_td64arr_mod_tdscalar( + self, performance_warning, box_with_array, three_days + ): tdi = timedelta_range("1 Day", "9 days") tdarr = tm.box_expected(tdi, box_with_array) @@ -1823,15 +1831,15 @@ def test_td64arr_mod_tdscalar(self, box_with_array, three_days): result = tdarr % three_days tm.assert_equal(result, expected) - warn = None if box_with_array is DataFrame and isinstance(three_days, pd.DateOffset): - warn = PerformanceWarning # TODO: making expected be object here a result of DataFrame.__divmod__ # being defined in a naive way that does not dispatch to the underlying # array's __divmod__ expected = expected.astype(object) + else: + performance_warning = False - with tm.assert_produces_warning(warn): + with tm.assert_produces_warning(performance_warning): result = divmod(tdarr, three_days) tm.assert_equal(result[1], expected) @@ -1967,19 +1975,20 @@ def test_td64arr_floordiv_numeric_scalar(self, box_with_array, two): two // tdser @pytest.mark.parametrize( - "vector", - [np.array([20, 30, 40]), Index([20, 30, 40]), Series([20, 30, 40])], - ids=lambda x: type(x).__name__, + "klass", + [np.array, Index, Series], + ids=lambda x: x.__name__, ) def test_td64arr_rmul_numeric_array( self, box_with_array, - vector, + klass, any_real_numpy_dtype, ): # GH#4521 # divide/multiply by integers + vector = klass([20, 30, 40]) tdser = Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") vector = vector.astype(any_real_numpy_dtype) @@ -1997,16 +2006,17 @@ def test_td64arr_rmul_numeric_array( tm.assert_equal(result, expected) @pytest.mark.parametrize( - "vector", - [np.array([20, 30, 40]), Index([20, 30, 40]), Series([20, 30, 40])], - ids=lambda x: type(x).__name__, + "klass", + [np.array, Index, Series], + ids=lambda x: x.__name__, ) def test_td64arr_div_numeric_array( - self, box_with_array, vector, any_real_numpy_dtype + self, box_with_array, klass, any_real_numpy_dtype ): # GH#4521 # divide/multiply by integers + vector = klass([20, 30, 40]) tdser = Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") vector = vector.astype(any_real_numpy_dtype) @@ -2036,12 +2046,7 @@ def test_td64arr_div_numeric_array( if box_with_array is DataFrame: expected = [tdser.iloc[0, n] / vector[n] for n in range(len(vector))] expected = tm.box_expected(expected, xbox).astype(object) - # We specifically expect timedelta64("NaT") here, not pd.NA - msg = "The 'downcast' keyword in fillna" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected[2] = expected[2].fillna( - np.timedelta64("NaT", "ns"), downcast=False - ) + expected[2] = expected[2].fillna(np.timedelta64("NaT", "ns")) else: expected = [tdser[n] / vector[n] for n in range(len(tdser))] expected = [ @@ -2123,9 +2128,7 @@ def test_td64arr_all_nat_div_object_dtype_numeric(self, box_with_array): if box_with_array is not Index: expected = tm.box_expected(expected, box_with_array).astype(object) if box_with_array in [Series, DataFrame]: - msg = "The 'downcast' keyword in fillna is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = expected.fillna(tdnat, downcast=False) # GH#18463 + expected = expected.fillna(tdnat) # GH#18463 result = left / right tm.assert_equal(result, expected) diff --git a/pandas/tests/arrays/boolean/test_arithmetic.py b/pandas/tests/arrays/boolean/test_arithmetic.py index 0c4fcf149eb20..9ff690cdc914d 100644 --- a/pandas/tests/arrays/boolean/test_arithmetic.py +++ b/pandas/tests/arrays/boolean/test_arithmetic.py @@ -90,16 +90,8 
@@ def test_op_int8(left_array, right_array, opname): # ----------------------------------------------------------------------------- -def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string): +def test_error_invalid_values(data, all_arithmetic_operators): # invalid ops - - if using_infer_string: - import pyarrow as pa - - err = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError) - else: - err = TypeError - op = all_arithmetic_operators s = pd.Series(data) ops = getattr(s, op) @@ -109,7 +101,8 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string "did not contain a loop with signature matching types|" "BooleanArray cannot perform the operation|" "not supported for the input types, and the inputs could not be safely coerced " - "to any supported types according to the casting rule ''safe''" + "to any supported types according to the casting rule ''safe''|" + "not supported for dtype" ) with pytest.raises(TypeError, match=msg): ops("foo") @@ -118,9 +111,10 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string r"unsupported operand type\(s\) for", "Concatenation operation is not implemented for NumPy arrays", "has no kernel", + "not supported for dtype", ] ) - with pytest.raises(err, match=msg): + with pytest.raises(TypeError, match=msg): ops(pd.Timestamp("20180101")) # invalid array-likes @@ -133,7 +127,8 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string "not all arguments converted during string formatting", "has no kernel", "not implemented", + "not supported for dtype", ] ) - with pytest.raises(err, match=msg): + with pytest.raises(TypeError, match=msg): ops(pd.Series("foo", index=s.index)) diff --git a/pandas/tests/arrays/boolean/test_astype.py b/pandas/tests/arrays/boolean/test_astype.py index 932e903c0e448..8c2672218f273 100644 --- a/pandas/tests/arrays/boolean/test_astype.py +++ b/pandas/tests/arrays/boolean/test_astype.py @@ -5,7 +5,7 @@ import pandas._testing as tm -def test_astype(): +def test_astype(using_infer_string): # with missing values arr = pd.array([True, False, None], dtype="boolean") @@ -20,8 +20,14 @@ def test_astype(): tm.assert_numpy_array_equal(result, expected) result = arr.astype("str") - expected = np.array(["True", "False", ""], dtype=f"{tm.ENDIAN}U5") - tm.assert_numpy_array_equal(result, expected) + if using_infer_string: + expected = pd.array( + ["True", "False", None], dtype=pd.StringDtype(na_value=np.nan) + ) + tm.assert_extension_array_equal(result, expected) + else: + expected = np.array(["True", "False", ""], dtype=f"{tm.ENDIAN}U5") + tm.assert_numpy_array_equal(result, expected) # no missing values arr = pd.array([True, False, True], dtype="boolean") diff --git a/pandas/tests/arrays/boolean/test_construction.py b/pandas/tests/arrays/boolean/test_construction.py index a5a2dd33940b8..d821c52d3becb 100644 --- a/pandas/tests/arrays/boolean/test_construction.py +++ b/pandas/tests/arrays/boolean/test_construction.py @@ -243,7 +243,7 @@ def test_coerce_to_numpy_array(): def test_to_boolean_array_from_strings(): result = BooleanArray._from_sequence_of_strings( np.array(["True", "False", "1", "1.0", "0", "0.0", np.nan], dtype=object), - dtype="boolean", + dtype=pd.BooleanDtype(), ) expected = BooleanArray( np.array([True, False, True, True, False, False, False]), @@ -255,7 +255,7 @@ def test_to_boolean_array_from_strings(): def test_to_boolean_array_from_strings_invalid_string(): with pytest.raises(ValueError, match="cannot be 
cast"): - BooleanArray._from_sequence_of_strings(["donkey"], dtype="boolean") + BooleanArray._from_sequence_of_strings(["donkey"], dtype=pd.BooleanDtype()) @pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) @@ -308,8 +308,6 @@ def test_to_numpy(box): # converting to int or float without specifying na_value raises with pytest.raises(ValueError, match="cannot convert to 'int64'-dtype"): arr.to_numpy(dtype="int64") - with pytest.raises(ValueError, match="cannot convert to 'float64'-dtype"): - arr.to_numpy(dtype="float64") def test_to_numpy_copy(): diff --git a/pandas/tests/arrays/boolean/test_logical.py b/pandas/tests/arrays/boolean/test_logical.py index 66c117ea3fc66..97a24e0f24756 100644 --- a/pandas/tests/arrays/boolean/test_logical.py +++ b/pandas/tests/arrays/boolean/test_logical.py @@ -60,19 +60,20 @@ def test_eq_mismatched_type(self, other): expected = pd.array([True, True]) tm.assert_extension_array_equal(result, expected) - def test_logical_length_mismatch_raises(self, all_logical_operators): + @pytest.mark.parametrize("other", [[True, False], [True, False, True, False]]) + def test_logical_length_mismatch_raises(self, other, all_logical_operators): op_name = all_logical_operators a = pd.array([True, False, None], dtype="boolean") msg = "Lengths must match" with pytest.raises(ValueError, match=msg): - getattr(a, op_name)([True, False]) + getattr(a, op_name)(other) with pytest.raises(ValueError, match=msg): - getattr(a, op_name)(np.array([True, False])) + getattr(a, op_name)(np.array(other)) with pytest.raises(ValueError, match=msg): - getattr(a, op_name)(pd.array([True, False], dtype="boolean")) + getattr(a, op_name)(pd.array(other, dtype="boolean")) def test_logical_nan_raises(self, all_logical_operators): op_name = all_logical_operators diff --git a/pandas/tests/arrays/boolean/test_reduction.py b/pandas/tests/arrays/boolean/test_reduction.py index dd8c3eda9ed05..7071c6f8844e4 100644 --- a/pandas/tests/arrays/boolean/test_reduction.py +++ b/pandas/tests/arrays/boolean/test_reduction.py @@ -43,7 +43,6 @@ def test_any_all(values, exp_any, exp_all, exp_any_noskip, exp_all_noskip): assert np.all(a.all()) is exp_all -@pytest.mark.parametrize("dropna", [True, False]) def test_reductions_return_types(dropna, data, all_numeric_reductions): op = all_numeric_reductions s = pd.Series(data) diff --git a/pandas/tests/arrays/categorical/conftest.py b/pandas/tests/arrays/categorical/conftest.py deleted file mode 100644 index 37249210f28f4..0000000000000 --- a/pandas/tests/arrays/categorical/conftest.py +++ /dev/null @@ -1,9 +0,0 @@ -import pytest - -from pandas import Categorical - - -@pytest.fixture -def factor(): - """Fixture returning a Categorical object""" - return Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) diff --git a/pandas/tests/arrays/categorical/test_algos.py b/pandas/tests/arrays/categorical/test_algos.py index d4c19a4970135..a7d0becc30dd9 100644 --- a/pandas/tests/arrays/categorical/test_algos.py +++ b/pandas/tests/arrays/categorical/test_algos.py @@ -5,7 +5,6 @@ import pandas._testing as tm -@pytest.mark.parametrize("ordered", [True, False]) @pytest.mark.parametrize("categories", [["b", "a", "c"], ["a", "b", "c", "d"]]) def test_factorize(categories, ordered): cat = pd.Categorical( @@ -87,3 +86,11 @@ def test_diff(): df = ser.to_frame(name="A") with pytest.raises(TypeError, match=msg): df.diff() + + +def test_hash_read_only_categorical(): + # GH#58481 + idx = pd.Index(pd.Index(["a", "b", "c"], dtype="object").values) + cat = 
pd.CategoricalDtype(idx) + arr = pd.Series(["a", "b"], dtype=cat).values + assert hash(arr.dtype) == hash(arr.dtype) diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index c2c53fbc4637e..47fa354e12393 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -97,7 +97,6 @@ def test_min_max_ordered_empty(self, categories, expected, aggregation): "values, categories", [(["a", "b", "c", np.nan], list("cba")), ([1, 2, 3, np.nan], [3, 2, 1])], ) - @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("function", ["min", "max"]) def test_min_max_with_nan(self, values, categories, function, skipna): # GH 25303 @@ -111,7 +110,6 @@ def test_min_max_with_nan(self, values, categories, function, skipna): assert result == expected @pytest.mark.parametrize("function", ["min", "max"]) - @pytest.mark.parametrize("skipna", [True, False]) def test_min_max_only_nan(self, function, skipna): # https://github.com/pandas-dev/pandas/issues/33450 cat = Categorical([np.nan], categories=[1, 2], ordered=True) @@ -296,7 +294,7 @@ def test_nbytes(self): exp = 3 + 3 * 8 # 3 int8s for values + 3 int64s for categories assert cat.nbytes == exp - def test_memory_usage(self): + def test_memory_usage(self, using_infer_string): cat = Categorical([1, 2, 3]) # .categories is an index, so we include the hashtable @@ -304,7 +302,13 @@ def test_memory_usage(self): assert 0 < cat.nbytes <= cat.memory_usage(deep=True) cat = Categorical(["foo", "foo", "bar"]) - assert cat.memory_usage(deep=True) > cat.nbytes + if using_infer_string: + if cat.categories.dtype.storage == "python": + assert cat.memory_usage(deep=True) > cat.nbytes + else: + assert cat.memory_usage(deep=True) >= cat.nbytes + else: + assert cat.memory_usage(deep=True) > cat.nbytes if not PYPY: # sys.getsizeof will call the .memory_usage with diff --git a/pandas/tests/arrays/categorical/test_api.py b/pandas/tests/arrays/categorical/test_api.py index b4215b4a6fe21..2791fd55f54d7 100644 --- a/pandas/tests/arrays/categorical/test_api.py +++ b/pandas/tests/arrays/categorical/test_api.py @@ -18,13 +18,6 @@ class TestCategoricalAPI: - def test_to_list_deprecated(self): - # GH#51254 - cat1 = Categorical(list("acb"), ordered=False) - msg = "Categorical.to_list is deprecated and will be removed" - with tm.assert_produces_warning(FutureWarning, match=msg): - cat1.to_list() - def test_ordered_api(self): # GH 9347 cat1 = Categorical(list("acb"), ordered=False) @@ -291,17 +284,16 @@ def test_set_categories(self): (["a", "b", "c"], ["a", "b"], ["a", "b"]), (["a", "b", "c"], ["a", "b"], ["b", "a"]), (["b", "a", "c"], ["a", "b"], ["a", "b"]), - (["b", "a", "c"], ["a", "b"], ["a", "b"]), + (["b", "a", "c"], ["a", "b"], ["b", "a"]), # Introduce NaNs (["a", "b", "c"], ["a", "b"], ["a"]), (["a", "b", "c"], ["a", "b"], ["b"]), (["b", "a", "c"], ["a", "b"], ["a"]), - (["b", "a", "c"], ["a", "b"], ["a"]), + (["b", "a", "c"], ["a", "b"], ["b"]), # No overlap (["a", "b", "c"], ["a", "b"], ["d", "e"]), ], ) - @pytest.mark.parametrize("ordered", [True, False]) def test_set_categories_many(self, values, categories, new_categories, ordered): c = Categorical(values, categories) expected = Categorical(values, new_categories, ordered) @@ -385,7 +377,8 @@ def test_remove_unused_categories(self): class TestCategoricalAPIWithFactor: - def test_describe(self, factor): + def test_describe(self): + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], 
ordered=True) # string type desc = factor.describe() assert factor.ordered diff --git a/pandas/tests/arrays/categorical/test_astype.py b/pandas/tests/arrays/categorical/test_astype.py index a2a53af6ab1ad..00999d491b242 100644 --- a/pandas/tests/arrays/categorical/test_astype.py +++ b/pandas/tests/arrays/categorical/test_astype.py @@ -81,7 +81,6 @@ def test_astype_str_int_categories_to_nullable_float(self): expected = array(codes, dtype="Float64") / 2 tm.assert_extension_array_equal(res, expected) - @pytest.mark.parametrize("ordered", [True, False]) def test_astype(self, ordered): # string cat = Categorical(list("abbaaccc"), ordered=ordered) @@ -89,7 +88,7 @@ def test_astype(self, ordered): expected = np.array(cat) tm.assert_numpy_array_equal(result, expected) - msg = r"Cannot cast object|string dtype to float64" + msg = r"Cannot cast object|str dtype to float64" with pytest.raises(ValueError, match=msg): cat.astype(float) @@ -108,11 +107,10 @@ def test_astype(self, ordered): tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize("dtype_ordered", [True, False]) - @pytest.mark.parametrize("cat_ordered", [True, False]) - def test_astype_category(self, dtype_ordered, cat_ordered): + def test_astype_category(self, dtype_ordered, ordered): # GH#10696/GH#18593 data = list("abcaacbab") - cat = Categorical(data, categories=list("bac"), ordered=cat_ordered) + cat = Categorical(data, categories=list("bac"), ordered=ordered) # standard categories dtype = CategoricalDtype(ordered=dtype_ordered) diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index 373f1c95463fc..d7eb6800e5d07 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -6,7 +6,9 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype + +from pandas.compat import HAS_PYARROW from pandas.core.dtypes.common import ( is_float_dtype, @@ -24,6 +26,7 @@ IntervalIndex, MultiIndex, NaT, + RangeIndex, Series, Timestamp, date_range, @@ -34,13 +37,6 @@ class TestCategoricalConstructors: - def test_fastpath_deprecated(self): - codes = np.array([1, 2, 3]) - dtype = CategoricalDtype(categories=["a", "b", "c", "d"], ordered=False) - msg = "The 'fastpath' keyword in Categorical is deprecated" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - Categorical(codes, dtype=dtype, fastpath=True) - def test_categorical_from_cat_and_dtype_str_preserve_ordered(self): # GH#49309 we should preserve orderedness in `res` cat = Categorical([3, 1], categories=[3, 2, 1], ordered=True) @@ -437,7 +433,6 @@ def test_constructor_dtype_and_others_raises(self): Categorical(["a", "b"], ordered=False, dtype=dtype) @pytest.mark.parametrize("categories", [None, ["a", "b"], ["a", "c"]]) - @pytest.mark.parametrize("ordered", [True, False]) def test_constructor_str_category(self, categories, ordered): result = Categorical( ["a", "b"], categories=categories, ordered=ordered, dtype="category" @@ -449,7 +444,9 @@ def test_constructor_str_unknown(self): with pytest.raises(ValueError, match="Unknown dtype"): Categorical([1, 2], dtype="foo") - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="Can't be NumPy strings") + @pytest.mark.xfail( + using_string_dtype() and HAS_PYARROW, reason="Can't be NumPy strings" + ) def test_constructor_np_strs(self): # GH#31499 Hashtable.map_locations needs to work on np.str_ objects cat = 
Categorical(["1", "0", "1"], [np.str_("0"), np.str_("1")]) @@ -683,7 +680,6 @@ def test_from_inferred_categories_coerces(self): expected = Categorical([1, 1, 2, np.nan]) tm.assert_categorical_equal(result, expected) - @pytest.mark.parametrize("ordered", [None, True, False]) def test_construction_with_ordered(self, ordered): # GH 9347, 9190 cat = Categorical([0, 1, 2], ordered=ordered) @@ -781,3 +777,17 @@ def test_constructor_preserves_freq(self): result = cat.categories.freq assert expected == result + + @pytest.mark.parametrize( + "values, categories", + [ + [range(5), None], + [range(4), range(5)], + [[0, 1, 2, 3], range(5)], + [[], range(5)], + ], + ) + def test_range_values_preserves_rangeindex_categories(self, values, categories): + result = Categorical(values=values, categories=categories).categories + expected = RangeIndex(range(5)) + tm.assert_index_equal(result, expected, exact=True) diff --git a/pandas/tests/arrays/categorical/test_dtypes.py b/pandas/tests/arrays/categorical/test_dtypes.py index 525663cad1745..ec1d501ddba16 100644 --- a/pandas/tests/arrays/categorical/test_dtypes.py +++ b/pandas/tests/arrays/categorical/test_dtypes.py @@ -73,17 +73,16 @@ def test_set_dtype_new_categories(self): (["a", "b", "c"], ["a", "b"], ["a", "b"]), (["a", "b", "c"], ["a", "b"], ["b", "a"]), (["b", "a", "c"], ["a", "b"], ["a", "b"]), - (["b", "a", "c"], ["a", "b"], ["a", "b"]), + (["b", "a", "c"], ["a", "b"], ["b", "a"]), # Introduce NaNs (["a", "b", "c"], ["a", "b"], ["a"]), (["a", "b", "c"], ["a", "b"], ["b"]), (["b", "a", "c"], ["a", "b"], ["a"]), - (["b", "a", "c"], ["a", "b"], ["a"]), + (["b", "a", "c"], ["a", "b"], ["b"]), # No overlap (["a", "b", "c"], ["a", "b"], ["d", "e"]), ], ) - @pytest.mark.parametrize("ordered", [True, False]) def test_set_dtype_many(self, values, categories, new_categories, ordered): c = Categorical(values, categories) expected = Categorical(values, new_categories, ordered) diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py index 3377c411a7084..33c55b2090bd6 100644 --- a/pandas/tests/arrays/categorical/test_indexing.py +++ b/pandas/tests/arrays/categorical/test_indexing.py @@ -21,7 +21,8 @@ class TestCategoricalIndexingWithFactor: - def test_getitem(self, factor): + def test_getitem(self): + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) assert factor[0] == "a" assert factor[-1] == "c" @@ -31,7 +32,8 @@ def test_getitem(self, factor): subf = factor[np.asarray(factor) == "c"] tm.assert_numpy_array_equal(subf._codes, np.array([2, 2, 2], dtype=np.int8)) - def test_setitem(self, factor): + def test_setitem(self): + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) # int/positional c = factor.copy() c[0] = "b" @@ -49,12 +51,10 @@ def test_setitem(self, factor): tm.assert_categorical_equal(c, expected) - @pytest.mark.parametrize( - "other", - [Categorical(["b", "a"]), Categorical(["b", "a"], categories=["b", "a"])], - ) - def test_setitem_same_but_unordered(self, other): + @pytest.mark.parametrize("categories", [None, ["b", "a"]]) + def test_setitem_same_but_unordered(self, categories): # GH-24142 + other = Categorical(["b", "a"], categories=categories) target = Categorical(["a", "b"], categories=["a", "b"]) mask = np.array([True, False]) target[mask] = other[mask] diff --git a/pandas/tests/arrays/categorical/test_map.py b/pandas/tests/arrays/categorical/test_map.py index 3d41b7cc7094d..585b207c9b241 100644 --- 
a/pandas/tests/arrays/categorical/test_map.py +++ b/pandas/tests/arrays/categorical/test_map.py @@ -10,11 +10,6 @@ import pandas._testing as tm -@pytest.fixture(params=[None, "ignore"]) -def na_action(request): - return request.param - - @pytest.mark.parametrize( "data, categories", [ @@ -139,16 +134,3 @@ def test_map_with_dict_or_series(na_action): result = cat.map(mapper, na_action=na_action) # Order of categories in result can be different tm.assert_categorical_equal(result, expected) - - -def test_map_na_action_no_default_deprecated(): - # GH51645 - cat = Categorical(["a", "b", "c"]) - msg = ( - "The default value of 'ignore' for the `na_action` parameter in " - "pandas.Categorical.map is deprecated and will be " - "changed to 'None' in a future version. Please set na_action to the " - "desired value to avoid seeing this warning" - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - cat.map(lambda x: x) diff --git a/pandas/tests/arrays/categorical/test_missing.py b/pandas/tests/arrays/categorical/test_missing.py index 0eeb01b746088..e3cb9664e19f2 100644 --- a/pandas/tests/arrays/categorical/test_missing.py +++ b/pandas/tests/arrays/categorical/test_missing.py @@ -8,7 +8,6 @@ import pandas as pd from pandas import ( Categorical, - DataFrame, Index, Series, isna, @@ -63,34 +62,6 @@ def test_set_item_nan(self): exp = Categorical([1, np.nan, 3], categories=[1, 2, 3]) tm.assert_categorical_equal(cat, exp) - @pytest.mark.parametrize( - "fillna_kwargs, msg", - [ - ( - {"value": 1, "method": "ffill"}, - "Cannot specify both 'value' and 'method'.", - ), - ({}, "Must specify a fill 'value' or 'method'."), - ({"method": "bad"}, "Invalid fill method. Expecting .* bad"), - ( - {"value": Series([1, 2, 3, 4, "a"])}, - "Cannot setitem on a Categorical with a new category", - ), - ], - ) - def test_fillna_raises(self, fillna_kwargs, msg): - # https://github.com/pandas-dev/pandas/issues/19682 - # https://github.com/pandas-dev/pandas/issues/13628 - cat = Categorical([1, 2, 3, None, None]) - - if len(fillna_kwargs) == 1 and "value" in fillna_kwargs: - err = TypeError - else: - err = ValueError - - with pytest.raises(err, match=msg): - cat.fillna(**fillna_kwargs) - @pytest.mark.parametrize("named", [True, False]) def test_fillna_iterable_category(self, named): # https://github.com/pandas-dev/pandas/issues/21097 @@ -126,60 +97,6 @@ def test_fillna_array(self): tm.assert_categorical_equal(result, expected) assert isna(cat[-1]) # didn't modify original inplace - @pytest.mark.parametrize( - "values, expected", - [ - ([1, 2, 3], np.array([False, False, False])), - ([1, 2, np.nan], np.array([False, False, True])), - ([1, 2, np.inf], np.array([False, False, True])), - ([1, 2, pd.NA], np.array([False, False, True])), - ], - ) - def test_use_inf_as_na(self, values, expected): - # https://github.com/pandas-dev/pandas/issues/33594 - msg = "use_inf_as_na option is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - with pd.option_context("mode.use_inf_as_na", True): - cat = Categorical(values) - result = cat.isna() - tm.assert_numpy_array_equal(result, expected) - - result = Series(cat).isna() - expected = Series(expected) - tm.assert_series_equal(result, expected) - - result = DataFrame(cat).isna() - expected = DataFrame(expected) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "values, expected", - [ - ([1, 2, 3], np.array([False, False, False])), - ([1, 2, np.nan], np.array([False, False, True])), - ([1, 2, np.inf], np.array([False, False, True])), - 
([1, 2, pd.NA], np.array([False, False, True])), - ], - ) - def test_use_inf_as_na_outside_context(self, values, expected): - # https://github.com/pandas-dev/pandas/issues/33594 - # Using isna directly for Categorical will fail in general here - cat = Categorical(values) - - msg = "use_inf_as_na option is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - with pd.option_context("mode.use_inf_as_na", True): - result = isna(cat) - tm.assert_numpy_array_equal(result, expected) - - result = isna(Series(cat)) - expected = Series(expected) - tm.assert_series_equal(result, expected) - - result = isna(DataFrame(cat)) - expected = DataFrame(expected) - tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( "a1, a2, categories", [ @@ -204,7 +121,7 @@ def test_compare_categorical_with_missing(self, a1, a2, categories): @pytest.mark.parametrize( "na_value, dtype", [ - (pd.NaT, "datetime64[ns]"), + (pd.NaT, "datetime64[s]"), (None, "float64"), (np.nan, "float64"), (pd.NA, "float64"), diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index 16b941eab4830..dbc6cc7715744 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -17,7 +17,8 @@ def test_categories_none_comparisons(self): factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) tm.assert_categorical_equal(factor, factor) - def test_comparisons(self, factor): + def test_comparisons(self): + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) result = factor[factor == "a"] expected = factor[np.asarray(factor) == "a"] tm.assert_categorical_equal(result, expected) @@ -306,29 +307,23 @@ def test_comparisons(self, data, reverse, base): with pytest.raises(TypeError, match=msg): a < cat_rev - @pytest.mark.parametrize( - "ctor", - [ - lambda *args, **kwargs: Categorical(*args, **kwargs), - lambda *args, **kwargs: Series(Categorical(*args, **kwargs)), - ], - ) - def test_unordered_different_order_equal(self, ctor): + @pytest.mark.parametrize("box", [lambda x: x, Series]) + def test_unordered_different_order_equal(self, box): # https://github.com/pandas-dev/pandas/issues/16014 - c1 = ctor(["a", "b"], categories=["a", "b"], ordered=False) - c2 = ctor(["a", "b"], categories=["b", "a"], ordered=False) + c1 = box(Categorical(["a", "b"], categories=["a", "b"], ordered=False)) + c2 = box(Categorical(["a", "b"], categories=["b", "a"], ordered=False)) assert (c1 == c2).all() - c1 = ctor(["a", "b"], categories=["a", "b"], ordered=False) - c2 = ctor(["b", "a"], categories=["b", "a"], ordered=False) + c1 = box(Categorical(["a", "b"], categories=["a", "b"], ordered=False)) + c2 = box(Categorical(["b", "a"], categories=["b", "a"], ordered=False)) assert (c1 != c2).all() - c1 = ctor(["a", "a"], categories=["a", "b"], ordered=False) - c2 = ctor(["b", "b"], categories=["b", "a"], ordered=False) + c1 = box(Categorical(["a", "a"], categories=["a", "b"], ordered=False)) + c2 = box(Categorical(["b", "b"], categories=["b", "a"], ordered=False)) assert (c1 != c2).all() - c1 = ctor(["a", "a"], categories=["a", "b"], ordered=False) - c2 = ctor(["a", "b"], categories=["b", "a"], ordered=False) + c1 = box(Categorical(["a", "a"], categories=["a", "b"], ordered=False)) + c2 = box(Categorical(["a", "b"], categories=["b", "a"], ordered=False)) result = c1 == c2 tm.assert_numpy_array_equal(np.array(result), np.array([True, False])) @@ -379,14 +374,14 @@ def 
test_numeric_like_ops(self): # min/max) s = df["value_group"] for op in ["kurt", "skew", "var", "std", "mean", "sum", "median"]: - msg = f"does not support reduction '{op}'" + msg = f"does not support operation '{op}'" with pytest.raises(TypeError, match=msg): getattr(s, op)(numeric_only=False) def test_numeric_like_ops_series(self): # numpy ops s = Series(Categorical([1, 2, 3, 4])) - with pytest.raises(TypeError, match="does not support reduction 'sum'"): + with pytest.raises(TypeError, match="does not support operation 'sum'"): np.sum(s) @pytest.mark.parametrize( diff --git a/pandas/tests/arrays/categorical/test_replace.py b/pandas/tests/arrays/categorical/test_replace.py index 3c677142846d7..7f3e8d3ed6e6e 100644 --- a/pandas/tests/arrays/categorical/test_replace.py +++ b/pandas/tests/arrays/categorical/test_replace.py @@ -6,106 +6,66 @@ @pytest.mark.parametrize( - "to_replace,value,expected,flip_categories", + "to_replace,value,expected", [ # one-to-one - (1, 2, [2, 2, 3], False), - (1, 4, [4, 2, 3], False), - (4, 1, [1, 2, 3], False), - (5, 6, [1, 2, 3], False), + (4, 1, [1, 2, 3]), + (3, 1, [1, 2, 1]), # many-to-one - ([1], 2, [2, 2, 3], False), - ([1, 2], 3, [3, 3, 3], False), - ([1, 2], 4, [4, 4, 3], False), - ((1, 2, 4), 5, [5, 5, 3], False), - ((5, 6), 2, [1, 2, 3], False), - ([1], [2], [2, 2, 3], False), - ([1, 4], [5, 2], [5, 2, 3], False), - # GH49404: overlap between to_replace and value - ([1, 2, 3], [2, 3, 4], [2, 3, 4], False), - # GH50872, GH46884: replace with null - (1, None, [None, 2, 3], False), - (1, pd.NA, [None, 2, 3], False), - # check_categorical sorts categories, which crashes on mixed dtypes - (3, "4", [1, 2, "4"], False), - ([1, 2, "3"], "5", ["5", "5", 3], True), + ((5, 6), 2, [1, 2, 3]), + ((3, 2), 1, [1, 1, 1]), ], ) -@pytest.mark.filterwarnings( - "ignore:.*with CategoricalDtype is deprecated:FutureWarning" -) -def test_replace_categorical_series(to_replace, value, expected, flip_categories): +def test_replace_categorical_series(to_replace, value, expected): # GH 31720 - ser = pd.Series([1, 2, 3], dtype="category") result = ser.replace(to_replace, value) - expected = pd.Series(expected, dtype="category") - ser.replace(to_replace, value, inplace=True) - - if flip_categories: - expected = expected.cat.set_categories(expected.cat.categories[::-1]) - - tm.assert_series_equal(expected, result, check_category_order=False) - tm.assert_series_equal(expected, ser, check_category_order=False) + expected = pd.Series(Categorical(expected, categories=[1, 2, 3])) + tm.assert_series_equal(result, expected) @pytest.mark.parametrize( - "to_replace, value, result, expected_error_msg", + "to_replace,value", [ - ("b", "c", ["a", "c"], "Categorical.categories are different"), - ("c", "d", ["a", "b"], None), - # https://github.com/pandas-dev/pandas/issues/33288 - ("a", "a", ["a", "b"], None), - ("b", None, ["a", None], "Categorical.categories length are different"), + # one-to-one + (3, 5), + # many-to-one + ((3, 2), 5), ], ) -def test_replace_categorical(to_replace, value, result, expected_error_msg): - # GH#26988 - cat = Categorical(["a", "b"]) - expected = Categorical(result) - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" - ) - warn = FutureWarning if expected_error_msg is not None else None - with tm.assert_produces_warning(warn, match=msg): - result = pd.Series(cat, copy=False).replace(to_replace, value)._values +def test_replace_categorical_series_new_category_raises(to_replace, value): + # GH 31720 + ser = pd.Series([1, 
2, 3], dtype="category") + with pytest.raises( + TypeError, match="Cannot setitem on a Categorical with a new category" + ): + ser.replace(to_replace, value) - tm.assert_categorical_equal(result, expected) - if to_replace == "b": # the "c" test is supposed to be unchanged - with pytest.raises(AssertionError, match=expected_error_msg): - # ensure non-inplace call does not affect original - tm.assert_categorical_equal(cat, expected) - ser = pd.Series(cat, copy=False) - with tm.assert_produces_warning(warn, match=msg): - ser.replace(to_replace, value, inplace=True) - tm.assert_categorical_equal(cat, expected) +def test_replace_maintain_ordering(): + # GH51016 + dtype = pd.CategoricalDtype([0, 1, 2], ordered=True) + ser = pd.Series([0, 1, 2], dtype=dtype) + result = ser.replace(0, 2) + expected = pd.Series([2, 1, 2], dtype=dtype) + tm.assert_series_equal(expected, result, check_category_order=True) def test_replace_categorical_ea_dtype(): # GH49404 - cat = Categorical(pd.array(["a", "b"], dtype="string")) - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" + cat = Categorical(pd.array(["a", "b", "c"], dtype="string")) + result = pd.Series(cat).replace(["a", "b"], ["c", "c"])._values + expected = Categorical( + pd.array(["c"] * 3, dtype="string"), + categories=pd.array(["a", "b", "c"], dtype="string"), ) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = pd.Series(cat).replace(["a", "b"], ["c", pd.NA])._values - expected = Categorical(pd.array(["c", pd.NA], dtype="string")) tm.assert_categorical_equal(result, expected) -def test_replace_maintain_ordering(): - # GH51016 - dtype = pd.CategoricalDtype([0, 1, 2], ordered=True) - ser = pd.Series([0, 1, 2], dtype=dtype) - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = ser.replace(0, 2) - expected_dtype = pd.CategoricalDtype([1, 2], ordered=True) - expected = pd.Series([2, 1, 2], dtype=expected_dtype) - tm.assert_series_equal(expected, result, check_category_order=True) +def test_replace_categorical_ea_dtype_different_cats_raises(): + # GH49404 + cat = Categorical(pd.array(["a", "b"], dtype="string")) + with pytest.raises( + TypeError, match="Cannot setitem on a Categorical with a new category" + ): + pd.Series(cat).replace(["a", "b"], ["c", pd.NA]) diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index d6f93fbbd912f..3a2c489920eb0 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas import ( Categorical, @@ -17,11 +17,12 @@ class TestCategoricalReprWithFactor: - def test_print(self, factor, using_infer_string): + def test_print(self, using_infer_string): + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) if using_infer_string: expected = [ "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']", - "Categories (3, string): [a < b < c]", + "Categories (3, str): [a < b < c]", ] else: expected = [ @@ -77,7 +78,7 @@ def test_print_none_width(self): assert exp == repr(a) @pytest.mark.skipif( - using_pyarrow_string_dtype(), + using_string_dtype(), reason="Change once infer_string is set to True by default", ) def test_unicode_print(self): diff --git 
a/pandas/tests/arrays/datetimes/test_constructors.py b/pandas/tests/arrays/datetimes/test_constructors.py index daf4aa3b47f56..74cc3e991bb76 100644 --- a/pandas/tests/arrays/datetimes/test_constructors.py +++ b/pandas/tests/arrays/datetimes/test_constructors.py @@ -16,34 +16,6 @@ def test_from_sequence_invalid_type(self): with pytest.raises(TypeError, match="Cannot create a DatetimeArray"): DatetimeArray._from_sequence(mi, dtype="M8[ns]") - def test_only_1dim_accepted(self): - arr = np.array([0, 1, 2, 3], dtype="M8[h]").astype("M8[ns]") - - depr_msg = "DatetimeArray.__init__ is deprecated" - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - with pytest.raises(ValueError, match="Only 1-dimensional"): - # 3-dim, we allow 2D to sneak in for ops purposes GH#29853 - DatetimeArray(arr.reshape(2, 2, 1)) - - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - with pytest.raises(ValueError, match="Only 1-dimensional"): - # 0-dim - DatetimeArray(arr[[0]].squeeze()) - - def test_freq_validation(self): - # GH#24623 check that invalid instances cannot be created with the - # public constructor - arr = np.arange(5, dtype=np.int64) * 3600 * 10**9 - - msg = ( - "Inferred frequency h from passed values does not " - "conform to passed frequency W-SUN" - ) - depr_msg = "DatetimeArray.__init__ is deprecated" - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - with pytest.raises(ValueError, match=msg): - DatetimeArray(arr, freq="W") - @pytest.mark.parametrize( "meth", [ @@ -56,10 +28,12 @@ def test_mixing_naive_tzaware_raises(self, meth): # GH#24569 arr = np.array([pd.Timestamp("2000"), pd.Timestamp("2000", tz="CET")]) - msg = ( - "Cannot mix tz-aware with tz-naive values|" - "Tz-aware datetime.datetime cannot be converted " - "to datetime64 unless utc=True" + msg = "|".join( + [ + "Cannot mix tz-aware with tz-naive values", + "Tz-aware datetime.datetime cannot be converted " + "to datetime64 unless utc=True", + ] ) for obj in [arr, arr[::-1]]: @@ -76,42 +50,9 @@ def test_from_pandas_array(self): expected = pd.date_range("1970-01-01", periods=5, freq="h")._data tm.assert_datetime_array_equal(result, expected) - def test_mismatched_timezone_raises(self): - depr_msg = "DatetimeArray.__init__ is deprecated" - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - arr = DatetimeArray( - np.array(["2000-01-01T06:00:00"], dtype="M8[ns]"), - dtype=DatetimeTZDtype(tz="US/Central"), - ) - dtype = DatetimeTZDtype(tz="US/Eastern") - msg = r"dtype=datetime64\[ns.*\] does not match data dtype datetime64\[ns.*\]" - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - with pytest.raises(TypeError, match=msg): - DatetimeArray(arr, dtype=dtype) - - # also with mismatched tzawareness - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - with pytest.raises(TypeError, match=msg): - DatetimeArray(arr, dtype=np.dtype("M8[ns]")) - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - with pytest.raises(TypeError, match=msg): - DatetimeArray(arr.tz_localize(None), dtype=arr.dtype) - - def test_non_array_raises(self): - depr_msg = "DatetimeArray.__init__ is deprecated" - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - with pytest.raises(ValueError, match="list"): - DatetimeArray([1, 2, 3]) - def test_bool_dtype_raises(self): arr = np.array([1, 2, 3], dtype="bool") - depr_msg = "DatetimeArray.__init__ is deprecated" - msg = "Unexpected value for 'dtype': 'bool'. 
Must be" - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - with pytest.raises(ValueError, match=msg): - DatetimeArray(arr) - msg = r"dtype bool cannot be converted to datetime64\[ns\]" with pytest.raises(TypeError, match=msg): DatetimeArray._from_sequence(arr, dtype="M8[ns]") @@ -122,50 +63,14 @@ def test_bool_dtype_raises(self): with pytest.raises(TypeError, match=msg): pd.to_datetime(arr) - def test_incorrect_dtype_raises(self): - depr_msg = "DatetimeArray.__init__ is deprecated" - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - with pytest.raises(ValueError, match="Unexpected value for 'dtype'."): - DatetimeArray(np.array([1, 2, 3], dtype="i8"), dtype="category") - - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - with pytest.raises(ValueError, match="Unexpected value for 'dtype'."): - DatetimeArray(np.array([1, 2, 3], dtype="i8"), dtype="m8[s]") - - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - with pytest.raises(ValueError, match="Unexpected value for 'dtype'."): - DatetimeArray(np.array([1, 2, 3], dtype="i8"), dtype="M8[D]") - - def test_mismatched_values_dtype_units(self): - arr = np.array([1, 2, 3], dtype="M8[s]") - dtype = np.dtype("M8[ns]") - msg = "Values resolution does not match dtype." - depr_msg = "DatetimeArray.__init__ is deprecated" - - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - with pytest.raises(ValueError, match=msg): - DatetimeArray(arr, dtype=dtype) - - dtype2 = DatetimeTZDtype(tz="UTC", unit="ns") - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - with pytest.raises(ValueError, match=msg): - DatetimeArray(arr, dtype=dtype2) - - def test_freq_infer_raises(self): - depr_msg = "DatetimeArray.__init__ is deprecated" - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - with pytest.raises(ValueError, match="Frequency inference"): - DatetimeArray(np.array([1, 2, 3], dtype="i8"), freq="infer") - def test_copy(self): data = np.array([1, 2, 3], dtype="M8[ns]") - arr = DatetimeArray._from_sequence(data, copy=False) + arr = DatetimeArray._from_sequence(data, dtype=data.dtype, copy=False) assert arr._ndarray is data - arr = DatetimeArray._from_sequence(data, copy=True) + arr = DatetimeArray._from_sequence(data, dtype=data.dtype, copy=True) assert arr._ndarray is not data - @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) def test_numpy_datetime_unit(self, unit): data = np.array([1, 2, 3], dtype=f"M8[{unit}]") arr = DatetimeArray._from_sequence(data) @@ -223,7 +128,7 @@ def test_2d(self, order): ("s", "ns", "US/Central", "Asia/Kolkata", COARSE_TO_FINE_SAFE), ], ) -def test_from_arrowtest_from_arrow_with_different_units_and_timezones_with_( +def test_from_arrow_with_different_units_and_timezones_with( pa_unit, pd_unit, pa_tz, pd_tz, data ): pa = pytest.importorskip("pyarrow") @@ -233,9 +138,8 @@ def test_from_arrowtest_from_arrow_with_different_units_and_timezones_with_( dtype = DatetimeTZDtype(unit=pd_unit, tz=pd_tz) result = dtype.__from_arrow__(arr) - expected = DatetimeArray._from_sequence( - np.array(data, dtype=f"datetime64[{pa_unit}]").astype(f"datetime64[{pd_unit}]"), - dtype=dtype, + expected = DatetimeArray._from_sequence(data, dtype=f"M8[{pa_unit}, UTC]").astype( + dtype, copy=False ) tm.assert_extension_array_equal(result, expected) @@ -261,7 +165,9 @@ def test_from_arrow_from_empty(unit, tz): dtype = DatetimeTZDtype(unit=unit, tz=tz) result = dtype.__from_arrow__(arr) - expected = DatetimeArray._from_sequence(np.array(data, 
dtype=f"datetime64[{unit}]")) + expected = DatetimeArray._from_sequence( + np.array(data, dtype=f"datetime64[{unit}]"), dtype=np.dtype(f"M8[{unit}]") + ) expected = expected.tz_localize(tz=tz) tm.assert_extension_array_equal(result, expected) @@ -277,7 +183,9 @@ def test_from_arrow_from_integers(): dtype = DatetimeTZDtype(unit="ns", tz="UTC") result = dtype.__from_arrow__(arr) - expected = DatetimeArray._from_sequence(np.array(data, dtype="datetime64[ns]")) + expected = DatetimeArray._from_sequence( + np.array(data, dtype="datetime64[ns]"), dtype=np.dtype("M8[ns]") + ) expected = expected.tz_localize("UTC") tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/arrays/datetimes/test_reductions.py b/pandas/tests/arrays/datetimes/test_reductions.py index a941546b13a56..a58b0f57bf0da 100644 --- a/pandas/tests/arrays/datetimes/test_reductions.py +++ b/pandas/tests/arrays/datetimes/test_reductions.py @@ -10,10 +10,6 @@ class TestReductions: - @pytest.fixture(params=["s", "ms", "us", "ns"]) - def unit(self, request): - return request.param - @pytest.fixture def arr1d(self, tz_naive_fixture): """Fixture returning DatetimeArray with parametrized timezones""" @@ -54,7 +50,6 @@ def test_min_max(self, arr1d, unit): assert result is NaT @pytest.mark.parametrize("tz", [None, "US/Central"]) - @pytest.mark.parametrize("skipna", [True, False]) def test_min_max_empty(self, skipna, tz): dtype = DatetimeTZDtype(tz=tz) if tz is not None else np.dtype("M8[ns]") arr = DatetimeArray._from_sequence([], dtype=dtype) @@ -65,7 +60,6 @@ def test_min_max_empty(self, skipna, tz): assert result is NaT @pytest.mark.parametrize("tz", [None, "US/Central"]) - @pytest.mark.parametrize("skipna", [True, False]) def test_median_empty(self, skipna, tz): dtype = DatetimeTZDtype(tz=tz) if tz is not None else np.dtype("M8[ns]") arr = DatetimeArray._from_sequence([], dtype=dtype) @@ -164,7 +158,6 @@ def test_mean_2d(self): expected = dti.mean() assert result == expected - @pytest.mark.parametrize("skipna", [True, False]) def test_mean_empty(self, arr1d, skipna): arr = arr1d[:0] diff --git a/pandas/tests/arrays/floating/test_arithmetic.py b/pandas/tests/arrays/floating/test_arithmetic.py index ba081bd01062a..009fac4c2f5ed 100644 --- a/pandas/tests/arrays/floating/test_arithmetic.py +++ b/pandas/tests/arrays/floating/test_arithmetic.py @@ -122,18 +122,11 @@ def test_arith_zero_dim_ndarray(other): # ----------------------------------------------------------------------------- -def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string): +def test_error_invalid_values(data, all_arithmetic_operators): op = all_arithmetic_operators s = pd.Series(data) ops = getattr(s, op) - if using_infer_string: - import pyarrow as pa - - errs = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError) - else: - errs = TypeError - # invalid scalars msg = "|".join( [ @@ -149,15 +142,17 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string "Concatenation operation is not implemented for NumPy arrays", "has no kernel", "not implemented", + "not supported for dtype", + "Can only string multiply by an integer", ] ) - with pytest.raises(errs, match=msg): + with pytest.raises(TypeError, match=msg): ops("foo") - with pytest.raises(errs, match=msg): + with pytest.raises(TypeError, match=msg): ops(pd.Timestamp("20180101")) # invalid array-likes - with pytest.raises(errs, match=msg): + with pytest.raises(TypeError, match=msg): ops(pd.Series("foo", index=s.index)) msg = "|".join( @@ 
-178,9 +173,10 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string "cannot subtract DatetimeArray from ndarray", "has no kernel", "not implemented", + "not supported for dtype", ] ) - with pytest.raises(errs, match=msg): + with pytest.raises(TypeError, match=msg): ops(pd.Series(pd.date_range("20180101", periods=len(s)))) diff --git a/pandas/tests/arrays/floating/test_astype.py b/pandas/tests/arrays/floating/test_astype.py index ade3dbd2c99da..752ebe194ffcf 100644 --- a/pandas/tests/arrays/floating/test_astype.py +++ b/pandas/tests/arrays/floating/test_astype.py @@ -63,12 +63,19 @@ def test_astype_to_integer_array(): tm.assert_extension_array_equal(result, expected) -def test_astype_str(): +def test_astype_str(using_infer_string): a = pd.array([0.1, 0.2, None], dtype="Float64") - expected = np.array(["0.1", "0.2", ""], dtype="U32") - tm.assert_numpy_array_equal(a.astype(str), expected) - tm.assert_numpy_array_equal(a.astype("str"), expected) + if using_infer_string: + expected = pd.array(["0.1", "0.2", None], dtype=pd.StringDtype(na_value=np.nan)) + + tm.assert_extension_array_equal(a.astype(str), expected) + tm.assert_extension_array_equal(a.astype("str"), expected) + else: + expected = np.array(["0.1", "0.2", ""], dtype="U32") + + tm.assert_numpy_array_equal(a.astype(str), expected) + tm.assert_numpy_array_equal(a.astype("str"), expected) def test_astype_copy(): diff --git a/pandas/tests/arrays/floating/test_function.py b/pandas/tests/arrays/floating/test_function.py index 40fd66fd049a6..9ea48bfb2413f 100644 --- a/pandas/tests/arrays/floating/test_function.py +++ b/pandas/tests/arrays/floating/test_function.py @@ -127,7 +127,6 @@ def test_value_counts_with_normalize(): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("min_count", [0, 4]) def test_floating_array_sum(skipna, min_count, dtype): arr = pd.array([1, 2, 3, None], dtype=dtype) @@ -171,7 +170,6 @@ def test_preserve_dtypes(op): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("method", ["min", "max"]) def test_floating_array_min_max(skipna, method, dtype): arr = pd.array([0.0, 1.0, None], dtype=dtype) @@ -183,7 +181,6 @@ def test_floating_array_min_max(skipna, method, dtype): assert result is pd.NA -@pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("min_count", [0, 9]) def test_floating_array_prod(skipna, min_count, dtype): arr = pd.array([1.0, 2.0, None], dtype=dtype) diff --git a/pandas/tests/arrays/floating/test_to_numpy.py b/pandas/tests/arrays/floating/test_to_numpy.py index a25ac40cb3e7c..e954cecba417a 100644 --- a/pandas/tests/arrays/floating/test_to_numpy.py +++ b/pandas/tests/arrays/floating/test_to_numpy.py @@ -33,10 +33,10 @@ def test_to_numpy_float(box): tm.assert_numpy_array_equal(result, expected) arr = con([0.1, 0.2, None], dtype="Float64") - with pytest.raises(ValueError, match="cannot convert to 'float64'-dtype"): - result = arr.to_numpy(dtype="float64") + result = arr.to_numpy(dtype="float64") + expected = np.array([0.1, 0.2, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) - # need to explicitly specify na_value result = arr.to_numpy(dtype="float64", na_value=np.nan) expected = np.array([0.1, 0.2, np.nan], dtype="float64") tm.assert_numpy_array_equal(result, expected) @@ -100,7 +100,7 @@ def test_to_numpy_dtype(box, dtype): tm.assert_numpy_array_equal(result, expected) 
-@pytest.mark.parametrize("dtype", ["float64", "float32", "int32", "int64", "bool"]) +@pytest.mark.parametrize("dtype", ["int32", "int64", "bool"]) @pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) def test_to_numpy_na_raises(box, dtype): con = pd.Series if box else pd.array diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py index d979dd445a61a..dee3deeee0f2f 100644 --- a/pandas/tests/arrays/integer/test_arithmetic.py +++ b/pandas/tests/arrays/integer/test_arithmetic.py @@ -172,18 +172,11 @@ def test_numpy_zero_dim_ndarray(other): # ----------------------------------------------------------------------------- -def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string): +def test_error_invalid_values(data, all_arithmetic_operators): op = all_arithmetic_operators s = pd.Series(data) ops = getattr(s, op) - if using_infer_string: - import pyarrow as pa - - errs = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError) - else: - errs = TypeError - # invalid scalars msg = "|".join( [ @@ -197,24 +190,22 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string "Addition/subtraction of integers and integer-arrays with Timestamp", "has no kernel", "not implemented", + "The 'out' kwarg is necessary. Use numpy.strings.multiply without it.", + "not supported for dtype", ] ) - with pytest.raises(errs, match=msg): + with pytest.raises(TypeError, match=msg): ops("foo") - with pytest.raises(errs, match=msg): + with pytest.raises(TypeError, match=msg): ops(pd.Timestamp("20180101")) # invalid array-likes str_ser = pd.Series("foo", index=s.index) # with pytest.raises(TypeError, match=msg): - if ( - all_arithmetic_operators - in [ - "__mul__", - "__rmul__", - ] - and not using_infer_string - ): # (data[~data.isna()] >= 0).all(): + if all_arithmetic_operators in [ + "__mul__", + "__rmul__", + ]: # (data[~data.isna()] >= 0).all(): res = ops(str_ser) expected = pd.Series(["foo" * x for x in data], index=s.index) expected = expected.fillna(np.nan) @@ -223,7 +214,7 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string # more-correct than np.nan here. 
tm.assert_series_equal(res, expected) else: - with pytest.raises(errs, match=msg): + with pytest.raises(TypeError, match=msg): ops(str_ser) msg = "|".join( @@ -238,9 +229,10 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string "cannot subtract DatetimeArray from ndarray", "has no kernel", "not implemented", + "not supported for dtype", ] ) - with pytest.raises(errs, match=msg): + with pytest.raises(TypeError, match=msg): ops(pd.Series(pd.date_range("20180101", periods=len(s)))) diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py index e3848cdfe3aa9..7972ba7b9fb0f 100644 --- a/pandas/tests/arrays/integer/test_dtypes.py +++ b/pandas/tests/arrays/integer/test_dtypes.py @@ -59,7 +59,6 @@ def test_astype_nansafe(): arr.astype("uint32") -@pytest.mark.parametrize("dropna", [True, False]) def test_construct_index(all_data, dropna): # ensure that we do not coerce to different Index dtype or non-index @@ -76,7 +75,6 @@ def test_construct_index(all_data, dropna): tm.assert_index_equal(result, expected) -@pytest.mark.parametrize("dropna", [True, False]) def test_astype_index(all_data, dropna): # as an int/uint index to Index @@ -271,19 +269,26 @@ def test_to_numpy_dtype(dtype, in_series): tm.assert_numpy_array_equal(result, expected) -@pytest.mark.parametrize("dtype", ["float64", "int64", "bool"]) +@pytest.mark.parametrize("dtype", ["int64", "bool"]) def test_to_numpy_na_raises(dtype): a = pd.array([0, 1, None], dtype="Int64") with pytest.raises(ValueError, match=dtype): a.to_numpy(dtype=dtype) -def test_astype_str(): +def test_astype_str(using_infer_string): a = pd.array([1, 2, None], dtype="Int64") - expected = np.array(["1", "2", ""], dtype=f"{tm.ENDIAN}U21") - tm.assert_numpy_array_equal(a.astype(str), expected) - tm.assert_numpy_array_equal(a.astype("str"), expected) + if using_infer_string: + expected = pd.array(["1", "2", None], dtype=pd.StringDtype(na_value=np.nan)) + + tm.assert_extension_array_equal(a.astype(str), expected) + tm.assert_extension_array_equal(a.astype("str"), expected) + else: + expected = np.array(["1", "2", ""], dtype=f"{tm.ENDIAN}U21") + + tm.assert_numpy_array_equal(a.astype(str), expected) + tm.assert_numpy_array_equal(a.astype("str"), expected) def test_astype_boolean(): diff --git a/pandas/tests/arrays/integer/test_function.py b/pandas/tests/arrays/integer/test_function.py index d48b636a98feb..33300fff925f6 100644 --- a/pandas/tests/arrays/integer/test_function.py +++ b/pandas/tests/arrays/integer/test_function.py @@ -141,7 +141,6 @@ def test_value_counts_with_normalize(): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("min_count", [0, 4]) def test_integer_array_sum(skipna, min_count, any_int_ea_dtype): dtype = any_int_ea_dtype @@ -153,7 +152,6 @@ def test_integer_array_sum(skipna, min_count, any_int_ea_dtype): assert result is pd.NA -@pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("method", ["min", "max"]) def test_integer_array_min_max(skipna, method, any_int_ea_dtype): dtype = any_int_ea_dtype @@ -166,7 +164,6 @@ def test_integer_array_min_max(skipna, method, any_int_ea_dtype): assert result is pd.NA -@pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("min_count", [0, 9]) def test_integer_array_prod(skipna, min_count, any_int_ea_dtype): dtype = any_int_ea_dtype diff --git a/pandas/tests/arrays/integer/test_reduction.py b/pandas/tests/arrays/integer/test_reduction.py 
index db04862e4ea07..1c91cd25ba69c 100644 --- a/pandas/tests/arrays/integer/test_reduction.py +++ b/pandas/tests/arrays/integer/test_reduction.py @@ -102,9 +102,7 @@ def test_groupby_reductions(op, expected): ["all", Series([True, True, True], index=["A", "B", "C"], dtype="boolean")], ], ) -def test_mixed_reductions(op, expected, using_infer_string): - if op in ["any", "all"] and using_infer_string: - expected = expected.astype("bool") +def test_mixed_reductions(op, expected): df = DataFrame( { "A": ["a", "b", "b"], diff --git a/pandas/tests/arrays/interval/test_formats.py b/pandas/tests/arrays/interval/test_formats.py index 535efee519374..88c9bf81d718c 100644 --- a/pandas/tests/arrays/interval/test_formats.py +++ b/pandas/tests/arrays/interval/test_formats.py @@ -6,8 +6,6 @@ def test_repr(): arr = IntervalArray.from_tuples([(0, 1), (1, 2)]) result = repr(arr) expected = ( - "<IntervalArray>\n" - "[(0, 1], (1, 2]]\n" - "Length: 2, dtype: interval[int64, right]" + "<IntervalArray>\n[(0, 1], (1, 2]]\nLength: 2, dtype: interval[int64, right]" ) assert result == expected diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index be4b2c3e7e74c..8e13dcf25ceba 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -58,12 +58,11 @@ def test_is_empty(self, constructor, left, right, closed): class TestMethods: - @pytest.mark.parametrize("new_closed", ["left", "right", "both", "neither"]) - def test_set_closed(self, closed, new_closed): + def test_set_closed(self, closed, other_closed): # GH 21670 array = IntervalArray.from_breaks(range(10), closed=closed) - result = array.set_closed(new_closed) - expected = IntervalArray.from_breaks(range(10), closed=new_closed) + result = array.set_closed(other_closed) + expected = IntervalArray.from_breaks(range(10), closed=other_closed) tm.assert_extension_array_equal(result, expected) @pytest.mark.parametrize( @@ -223,9 +222,10 @@ def test_min_max(self, left_right_dtypes, index_or_series_or_array): res = arr_na.max(skipna=False) assert np.isnan(res) - res = arr_na.min(skipna=True) - assert res == MIN - assert type(res) == type(MIN) - res = arr_na.max(skipna=True) - assert res == MAX - assert type(res) == type(MAX) + for kws in [{"skipna": True}, {}]: + res = arr_na.min(**kws) + assert res == MIN + assert type(res) == type(MIN) + res = arr_na.max(**kws) + assert res == MAX + assert type(res) == type(MAX) diff --git a/pandas/tests/arrays/interval/test_overlaps.py b/pandas/tests/arrays/interval/test_overlaps.py index 4853bec51106c..5a48cf024ec0d 100644 --- a/pandas/tests/arrays/interval/test_overlaps.py +++ b/pandas/tests/arrays/interval/test_overlaps.py @@ -1,4 +1,5 @@ """Tests for Interval-Interval operations, such as overlaps, contains, etc.""" + import numpy as np import pytest diff --git a/pandas/tests/arrays/masked/test_arithmetic.py b/pandas/tests/arrays/masked/test_arithmetic.py index f4b571ca627b3..ea018d2da4d26 100644 --- a/pandas/tests/arrays/masked/test_arithmetic.py +++ b/pandas/tests/arrays/masked/test_arithmetic.py @@ -55,15 +55,15 @@ def test_array_scalar_like_equivalence(data, all_arithmetic_operators): scalar_array = pd.array([scalar] * len(data), dtype=data.dtype) # TODO also add len-1 array (np.array([scalar], dtype=data.dtype.numpy_dtype)) - for scalar in [scalar, data.dtype.type(scalar)]: + for val in [scalar, data.dtype.type(scalar)]: if is_bool_not_implemented(data, all_arithmetic_operators): msg = "operator '.*' not implemented for bool dtypes" with 
pytest.raises(NotImplementedError, match=msg): - op(data, scalar) + op(data, val) with pytest.raises(NotImplementedError, match=msg): op(data, scalar_array) else: - result = op(data, scalar) + result = op(data, val) expected = op(data, scalar_array) tm.assert_extension_array_equal(result, expected) @@ -214,13 +214,13 @@ def test_error_len_mismatch(data, all_arithmetic_operators): msg = "operator '.*' not implemented for bool dtypes" err = NotImplementedError - for other in [other, np.array(other)]: + for val in [other, np.array(other)]: with pytest.raises(err, match=msg): - op(data, other) + op(data, val) s = pd.Series(data) with pytest.raises(err, match=msg): - op(s, other) + op(s, val) @pytest.mark.parametrize("op", ["__neg__", "__abs__", "__invert__"]) diff --git a/pandas/tests/arrays/masked/test_arrow_compat.py b/pandas/tests/arrays/masked/test_arrow_compat.py index 7a89656bd5aa0..d99b1118444c9 100644 --- a/pandas/tests/arrays/masked/test_arrow_compat.py +++ b/pandas/tests/arrays/masked/test_arrow_compat.py @@ -8,6 +8,7 @@ "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) + pa = pytest.importorskip("pyarrow") from pandas.core.arrays.arrow._arrow_utils import pyarrow_array_to_numpy_and_mask @@ -161,7 +162,7 @@ def test_pyarrow_array_to_numpy_and_mask(np_dtype_to_arrays): # Add offset to the buffer. offset = b"\x00" * (pa_array.type.bit_width // 8) data_buffer_offset = pa.py_buffer(offset + data_buffer_bytes) - mask_buffer_offset = pa.py_buffer(b"\x0E") + mask_buffer_offset = pa.py_buffer(b"\x0e") pa_array_offset = pa.Array.from_buffers( type=pa_array.type, length=len(pa_array), diff --git a/pandas/tests/arrays/masked/test_function.py b/pandas/tests/arrays/masked/test_function.py index 4c7bd6e293ef4..b4b1761217826 100644 --- a/pandas/tests/arrays/masked/test_function.py +++ b/pandas/tests/arrays/masked/test_function.py @@ -5,6 +5,7 @@ import pandas as pd import pandas._testing as tm +from pandas.core.arrays import BaseMaskedArray arrays = [pd.array([1, 2, 3, None], dtype=dtype) for dtype in tm.ALL_INT_EA_DTYPES] arrays += [ @@ -21,7 +22,7 @@ def data(request): return request.param -@pytest.fixture() +@pytest.fixture def numpy_dtype(data): """ Fixture returning numpy dtype from 'data' input array. 
@@ -55,3 +56,19 @@ def test_tolist(data): result = data.tolist() expected = list(data) tm.assert_equal(result, expected) + + +def test_to_numpy(): + # GH#56991 + + class MyStringArray(BaseMaskedArray): + dtype = pd.StringDtype() + _dtype_cls = pd.StringDtype + _internal_fill_value = pd.NA + + arr = MyStringArray( + values=np.array(["a", "b", "c"]), mask=np.array([False, True, False]) + ) + result = arr.to_numpy() + expected = np.array(["a", pd.NA, "c"]) + tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/arrays/masked/test_indexing.py b/pandas/tests/arrays/masked/test_indexing.py index 28ee451a7ddd7..753d562c87ffa 100644 --- a/pandas/tests/arrays/masked/test_indexing.py +++ b/pandas/tests/arrays/masked/test_indexing.py @@ -8,7 +8,7 @@ class TestSetitemValidation: def _check_setitem_invalid(self, arr, invalid): - msg = f"Invalid value '{str(invalid)}' for dtype {arr.dtype}" + msg = f"Invalid value '{invalid!s}' for dtype '{arr.dtype}'" msg = re.escape(msg) with pytest.raises(TypeError, match=msg): arr[0] = invalid diff --git a/pandas/tests/arrays/masked_shared.py b/pandas/tests/arrays/masked_shared.py index 3e74402263cf9..545b14af2c98b 100644 --- a/pandas/tests/arrays/masked_shared.py +++ b/pandas/tests/arrays/masked_shared.py @@ -1,6 +1,7 @@ """ Tests shared by MaskedArray subclasses. """ + import numpy as np import pytest diff --git a/pandas/tests/arrays/numpy_/test_numpy.py b/pandas/tests/arrays/numpy_/test_numpy.py index 5112ce262f771..620a553d5a731 100644 --- a/pandas/tests/arrays/numpy_/test_numpy.py +++ b/pandas/tests/arrays/numpy_/test_numpy.py @@ -2,6 +2,7 @@ Additional tests for NumpyExtensionArray that aren't covered by the interface tests. """ + import numpy as np import pytest @@ -21,7 +22,7 @@ np.array([True, False], dtype=bool), np.array([0, 1], dtype="datetime64[ns]"), np.array([0, 1], dtype="timedelta64[ns]"), - ] + ], ) def any_numpy_array(request): """ This excludes string and bytes.
""" - return request.param + return request.param.copy() # ---------------------------------------------------------------------------- @@ -322,3 +323,30 @@ def test_factorize_unsigned(): tm.assert_numpy_array_equal(res_codes, exp_codes) tm.assert_extension_array_equal(res_unique, NumpyExtensionArray(exp_unique)) + + +# ---------------------------------------------------------------------------- +# Output formatting + + +def test_array_repr(any_numpy_array): + # GH#61085 + nparray = any_numpy_array + arr = NumpyExtensionArray(nparray) + if nparray.dtype == "object": + values = "['a', 'b']" + elif nparray.dtype == "float64": + values = "[0.0, 1.0]" + elif str(nparray.dtype).startswith("int"): + values = "[0, 1]" + elif nparray.dtype == "complex128": + values = "[0j, (1+2j)]" + elif nparray.dtype == "bool": + values = "[True, False]" + elif nparray.dtype == "datetime64[ns]": + values = "[1970-01-01T00:00:00.000000000, 1970-01-01T00:00:00.000000001]" + elif nparray.dtype == "timedelta64[ns]": + values = "[0 nanoseconds, 1 nanoseconds]" + expected = f"<NumpyExtensionArray>\n{values}\nLength: 2, dtype: {nparray.dtype}" + result = repr(arr) + assert result == expected, f"{result} vs {expected}" diff --git a/pandas/tests/arrays/period/test_constructors.py b/pandas/tests/arrays/period/test_constructors.py index d034162f1b46e..63b0e456c4566 100644 --- a/pandas/tests/arrays/period/test_constructors.py +++ b/pandas/tests/arrays/period/test_constructors.py @@ -135,17 +135,6 @@ def test_from_td64nat_sequence_raises(): pd.DataFrame(arr, dtype=dtype) -def test_freq_deprecated(): - # GH#52462 - data = np.arange(5).astype(np.int64) - msg = "The 'freq' keyword in the PeriodArray constructor is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - res = PeriodArray(data, freq="M") - - expected = PeriodArray(data, dtype="period[M]") - tm.assert_equal(res, expected) - - def test_period_array_from_datetime64(): arr = np.array( ["2020-01-01T00:00:00", "2020-02-02T00:00:00"], dtype="datetime64[ns]" ) diff --git a/pandas/tests/arrays/period/test_reductions.py b/pandas/tests/arrays/period/test_reductions.py index 2889cc786dd71..5b859d86eb6d0 100644 --- a/pandas/tests/arrays/period/test_reductions.py +++ b/pandas/tests/arrays/period/test_reductions.py @@ -1,5 +1,3 @@ -import pytest - import pandas as pd from pandas.core.arrays import period_array @@ -32,7 +30,6 @@ def test_min_max(self): result = arr.max(skipna=False) assert result is pd.NaT - @pytest.mark.parametrize("skipna", [True, False]) def test_min_max_empty(self, skipna): arr = period_array([], freq="D") result = arr.min(skipna=skipna) diff --git a/pandas/tests/arrays/sparse/test_accessor.py b/pandas/tests/arrays/sparse/test_accessor.py index 87eb7bcfa9cee..08bfd5b69fdd9 100644 --- a/pandas/tests/arrays/sparse/test_accessor.py +++ b/pandas/tests/arrays/sparse/test_accessor.py @@ -105,28 +105,36 @@ def test_accessor_raises(self): @pytest.mark.parametrize("format", ["csc", "csr", "coo"]) @pytest.mark.parametrize("labels", [None, list(string.ascii_letters[:10])]) - @pytest.mark.parametrize("dtype", ["float64", "int64"]) + @pytest.mark.parametrize("dtype", [np.complex128, np.float64, np.int64, bool]) def test_from_spmatrix(self, format, labels, dtype): sp_sparse = pytest.importorskip("scipy.sparse") - sp_dtype = SparseDtype(dtype, np.array(0, dtype=dtype).item()) + sp_dtype = SparseDtype(dtype) - mat = sp_sparse.eye(10, format=format, dtype=dtype) - result = pd.DataFrame.sparse.from_spmatrix(mat, index=labels, columns=labels) + sp_mat = sp_sparse.eye(10,
format=format, dtype=dtype) + result = pd.DataFrame.sparse.from_spmatrix(sp_mat, index=labels, columns=labels) + mat = np.eye(10, dtype=dtype) expected = pd.DataFrame( - np.eye(10, dtype=dtype), index=labels, columns=labels + np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value), + index=labels, + columns=labels, ).astype(sp_dtype) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("format", ["csc", "csr", "coo"]) - def test_from_spmatrix_including_explicit_zero(self, format): + @pytest.mark.parametrize("dtype", [np.int64, bool]) + def test_from_spmatrix_including_explicit_zero(self, format, dtype): sp_sparse = pytest.importorskip("scipy.sparse") - mat = sp_sparse.random(10, 2, density=0.5, format=format) - mat.data[0] = 0 - result = pd.DataFrame.sparse.from_spmatrix(mat) - dtype = SparseDtype("float64", 0.0) - expected = pd.DataFrame(mat.todense()).astype(dtype) + sp_dtype = SparseDtype(dtype) + + sp_mat = sp_sparse.random(10, 2, density=0.5, format=format, dtype=dtype) + sp_mat.data[0] = 0 + result = pd.DataFrame.sparse.from_spmatrix(sp_mat) + mat = sp_mat.toarray() + expected = pd.DataFrame( + np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value) + ).astype(sp_dtype) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( @@ -136,41 +144,34 @@ def test_from_spmatrix_including_explicit_zero(self, format): def test_from_spmatrix_columns(self, columns): sp_sparse = pytest.importorskip("scipy.sparse") - dtype = SparseDtype("float64", 0.0) + sp_dtype = SparseDtype(np.float64) - mat = sp_sparse.random(10, 2, density=0.5) - result = pd.DataFrame.sparse.from_spmatrix(mat, columns=columns) - expected = pd.DataFrame(mat.toarray(), columns=columns).astype(dtype) + sp_mat = sp_sparse.random(10, 2, density=0.5) + result = pd.DataFrame.sparse.from_spmatrix(sp_mat, columns=columns) + mat = sp_mat.toarray() + expected = pd.DataFrame( + np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value), + columns=columns, + ).astype(sp_dtype) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( - "colnames", [("A", "B"), (1, 2), (1, pd.NA), (0.1, 0.2), ("x", "x"), (0, 0)] + "columns", [("A", "B"), (1, 2), (1, pd.NA), (0.1, 0.2), ("x", "x"), (0, 0)] ) - def test_to_coo(self, colnames): + @pytest.mark.parametrize("dtype", [np.complex128, np.float64, np.int64, bool]) + def test_to_coo(self, columns, dtype): sp_sparse = pytest.importorskip("scipy.sparse") - df = pd.DataFrame( - {colnames[0]: [0, 1, 0], colnames[1]: [1, 0, 0]}, dtype="Sparse[int64, 0]" - ) - result = df.sparse.to_coo() - expected = sp_sparse.coo_matrix(np.asarray(df)) - assert (result != expected).nnz == 0 + sp_dtype = SparseDtype(dtype) - @pytest.mark.parametrize("fill_value", [1, np.nan]) - def test_to_coo_nonzero_fill_val_raises(self, fill_value): - pytest.importorskip("scipy") - df = pd.DataFrame( - { - "A": SparseArray( - [fill_value, fill_value, fill_value, 2], fill_value=fill_value - ), - "B": SparseArray( - [fill_value, 2, fill_value, fill_value], fill_value=fill_value - ), - } - ) - with pytest.raises(ValueError, match="fill value must be 0"): - df.sparse.to_coo() + expected = sp_sparse.random(10, 2, density=0.5, format="coo", dtype=dtype) + mat = expected.toarray() + result = pd.DataFrame( + np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value), + columns=columns, + dtype=sp_dtype, + ).sparse.to_coo() + assert (result != expected).nnz == 0 def test_to_coo_midx_categorical(self): # GH#50996 @@ -251,3 +252,7 @@ def test_with_column_named_sparse(self): # 
https://github.com/pandas-dev/pandas/issues/30758 df = pd.DataFrame({"sparse": pd.arrays.SparseArray([1, 2])}) assert isinstance(df.sparse, pd.core.arrays.sparse.accessor.SparseFrameAccessor) + + def test_subclassing(self): + df = tm.SubclassedDataFrame({"sparse": pd.arrays.SparseArray([1, 2])}) + assert isinstance(df.sparse.to_dense(), tm.SubclassedDataFrame) diff --git a/pandas/tests/arrays/sparse/test_arithmetics.py b/pandas/tests/arrays/sparse/test_arithmetics.py index ffc93b4e4f176..a47e73d49674d 100644 --- a/pandas/tests/arrays/sparse/test_arithmetics.py +++ b/pandas/tests/arrays/sparse/test_arithmetics.py @@ -433,9 +433,6 @@ def test_ufuncs(ufunc, arr): [ (SparseArray([0, 0, 0]), np.array([0, 1, 2])), (SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), - (SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), - (SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), - (SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), ], ) @pytest.mark.parametrize("ufunc", [np.add, np.greater]) @@ -471,6 +468,19 @@ def test_mismatched_length_cmp_op(cons): left & right +@pytest.mark.parametrize( + "a, b", + [ + ([0, 1, 2], [0, 1, 2, 3]), + ([0, 1, 2, 3], [0, 1, 2]), + ], +) +def test_mismatched_length_arith_op(a, b, all_arithmetic_functions): + op = all_arithmetic_functions + with pytest.raises(AssertionError, match=f"length mismatch: {len(a)} vs. {len(b)}"): + op(SparseArray(a, fill_value=0), np.array(b)) + + @pytest.mark.parametrize("op", ["add", "sub", "mul", "truediv", "floordiv", "pow"]) @pytest.mark.parametrize("fill_value", [np.nan, 3]) def test_binary_operators(op, fill_value): diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 883d6ea3959ff..1b685100e4931 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -4,6 +4,7 @@ import pytest from pandas._libs.sparse import IntIndex +from pandas.compat.numpy import np_version_gt2 import pandas as pd from pandas import ( @@ -52,10 +53,11 @@ def test_set_fill_value(self): arr.fill_value = 2 assert arr.fill_value == 2 - msg = "Allowing arbitrary scalar fill_value in SparseDtype is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): + msg = "fill_value must be a valid value for the SparseDtype.subtype" + with pytest.raises(ValueError, match=msg): + # GH#53043 arr.fill_value = 3.1 - assert arr.fill_value == 3.1 + assert arr.fill_value == 2 arr.fill_value = np.nan assert np.isnan(arr.fill_value) @@ -64,8 +66,9 @@ def test_set_fill_value(self): arr.fill_value = True assert arr.fill_value is True - with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(ValueError, match=msg): arr.fill_value = 0 + assert arr.fill_value is True arr.fill_value = np.nan assert np.isnan(arr.fill_value) @@ -478,3 +481,33 @@ def test_zero_sparse_column(): expected = pd.DataFrame({"A": SparseArray([0, 0]), "B": [1, 3]}, index=[0, 2]) tm.assert_frame_equal(result, expected) + + +def test_array_interface(arr_data, arr): + # https://github.com/pandas-dev/pandas/pull/60046 + result = np.asarray(arr) + tm.assert_numpy_array_equal(result, arr_data) + + # it always gives a copy by default + result_copy1 = np.asarray(arr) + result_copy2 = np.asarray(arr) + assert not np.may_share_memory(result_copy1, result_copy2) + + # or with explicit copy=True + result_copy1 = np.array(arr, copy=True) + result_copy2 = np.array(arr, copy=True) + assert not np.may_share_memory(result_copy1, result_copy2) + + if not np_version_gt2: 
+ # copy=False semantics are only supported in NumPy>=2. + return + + # for sparse arrays, copy=False is never allowed + with pytest.raises(ValueError, match="Unable to avoid copy while creating"): + np.array(arr, copy=False) + + # except when there are actually no sparse filled values + arr2 = SparseArray(np.array([1, 2, 3])) + result_nocopy1 = np.array(arr2, copy=False) + result_nocopy2 = np.array(arr2, copy=False) + assert np.may_share_memory(result_nocopy1, result_nocopy2) diff --git a/pandas/tests/arrays/sparse/test_astype.py b/pandas/tests/arrays/sparse/test_astype.py index 83a507e679d46..e6e4a11a0f5ab 100644 --- a/pandas/tests/arrays/sparse/test_astype.py +++ b/pandas/tests/arrays/sparse/test_astype.py @@ -81,8 +81,8 @@ def test_astype_all(self, any_real_numpy_dtype): ), ( SparseArray([0, 1, 10]), - str, - SparseArray(["0", "1", "10"], dtype=SparseDtype(str, "0")), + np.str_, + SparseArray(["0", "1", "10"], dtype=SparseDtype(np.str_, "0")), ), (SparseArray(["10", "20"]), float, SparseArray([10.0, 20.0])), ( diff --git a/pandas/tests/arrays/sparse/test_constructors.py b/pandas/tests/arrays/sparse/test_constructors.py index 2831c8abdaf13..0bf3ab77e9eed 100644 --- a/pandas/tests/arrays/sparse/test_constructors.py +++ b/pandas/tests/arrays/sparse/test_constructors.py @@ -90,13 +90,13 @@ def test_constructor_warns_when_losing_timezone(self): dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific") expected = SparseArray(np.asarray(dti, dtype="datetime64[ns]")) - - with tm.assert_produces_warning(UserWarning): + msg = "loses timezone information" + with tm.assert_produces_warning(UserWarning, match=msg): result = SparseArray(dti) tm.assert_sp_array_equal(result, expected) - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match=msg): result = SparseArray(pd.Series(dti)) tm.assert_sp_array_equal(result, expected) @@ -144,20 +144,12 @@ def test_constructor_spindex_dtype(self): @pytest.mark.parametrize("sparse_index", [None, IntIndex(1, [0])]) def test_constructor_spindex_dtype_scalar(self, sparse_index): # scalar input - msg = "Constructing SparseArray with scalar data is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - arr = SparseArray(data=1, sparse_index=sparse_index, dtype=None) - exp = SparseArray([1], dtype=None) - tm.assert_sp_array_equal(arr, exp) - assert arr.dtype == SparseDtype(np.int64) - assert arr.fill_value == 0 + msg = "Cannot construct SparseArray from scalar data. 
Pass a sequence instead" + with pytest.raises(TypeError, match=msg): + SparseArray(data=1, sparse_index=sparse_index, dtype=None) - with tm.assert_produces_warning(FutureWarning, match=msg): - arr = SparseArray(data=1, sparse_index=IntIndex(1, [0]), dtype=None) - exp = SparseArray([1], dtype=None) - tm.assert_sp_array_equal(arr, exp) - assert arr.dtype == SparseDtype(np.int64) - assert arr.fill_value == 0 + with pytest.raises(TypeError, match=msg): + SparseArray(data=1, sparse_index=IntIndex(1, [0]), dtype=None) def test_constructor_spindex_dtype_scalar_broadcasts(self): arr = SparseArray( diff --git a/pandas/tests/arrays/sparse/test_dtype.py b/pandas/tests/arrays/sparse/test_dtype.py index 234f4092421e5..6143163735ab8 100644 --- a/pandas/tests/arrays/sparse/test_dtype.py +++ b/pandas/tests/arrays/sparse/test_dtype.py @@ -68,6 +68,14 @@ def test_nans_equal(): assert b == a +def test_nans_not_equal(): + # GH 54770 + a = SparseDtype(float, 0) + b = SparseDtype(float, pd.NA) + assert a != b + assert b != a + + with warnings.catch_warnings(): msg = "Allowing arbitrary scalar fill_value in SparseDtype is deprecated" warnings.filterwarnings("ignore", msg, category=FutureWarning) @@ -76,7 +84,6 @@ def test_nans_equal(): (SparseDtype("float64"), SparseDtype("float32")), (SparseDtype("float64"), SparseDtype("float64", 0)), (SparseDtype("float64"), SparseDtype("datetime64[ns]", np.nan)), - (SparseDtype(int, pd.NaT), SparseDtype(float, pd.NaT)), (SparseDtype("float64"), np.dtype("float64")), ] @@ -99,15 +106,15 @@ def test_construct_from_string_raises(): @pytest.mark.parametrize( "dtype, expected", [ - (SparseDtype(int), True), - (SparseDtype(float), True), - (SparseDtype(bool), True), - (SparseDtype(object), False), - (SparseDtype(str), False), + (int, True), + (float, True), + (bool, True), + (object, False), + (str, False), ], ) def test_is_numeric(dtype, expected): - assert dtype._is_numeric is expected + assert SparseDtype(dtype)._is_numeric is expected def test_str_uses_object(): @@ -177,7 +184,7 @@ def test_construct_from_string_fill_value_raises(string): [ (SparseDtype(int, 0), float, SparseDtype(float, 0.0)), (SparseDtype(int, 1), float, SparseDtype(float, 1.0)), - (SparseDtype(int, 1), str, SparseDtype(object, "1")), + (SparseDtype(int, 1), np.str_, SparseDtype(object, "1")), (SparseDtype(float, 1.5), int, SparseDtype(int, 1)), ], ) diff --git a/pandas/tests/arrays/sparse/test_reductions.py b/pandas/tests/arrays/sparse/test_reductions.py index f44423d5e635c..4171d1213a0dc 100644 --- a/pandas/tests/arrays/sparse/test_reductions.py +++ b/pandas/tests/arrays/sparse/test_reductions.py @@ -126,13 +126,13 @@ def test_sum(self): @pytest.mark.parametrize( "arr", - [np.array([0, 1, np.nan, 1]), np.array([0, 1, 1])], + [[0, 1, np.nan, 1], [0, 1, 1]], ) @pytest.mark.parametrize("fill_value", [0, 1, np.nan]) @pytest.mark.parametrize("min_count, expected", [(3, 2), (4, np.nan)]) def test_sum_min_count(self, arr, fill_value, min_count, expected): # GH#25777 - sparray = SparseArray(arr, fill_value=fill_value) + sparray = SparseArray(np.array(arr), fill_value=fill_value) result = sparray.sum(min_count=min_count) if np.isnan(expected): assert np.isnan(result) @@ -296,11 +296,9 @@ def test_argmax_argmin(self, arr, argmax_expected, argmin_expected): assert argmax_result == argmax_expected assert argmin_result == argmin_expected - @pytest.mark.parametrize( - "arr,method", - [(SparseArray([]), "argmax"), (SparseArray([]), "argmin")], - ) - def test_empty_array(self, arr, method): + 
@pytest.mark.parametrize("method", ["argmax", "argmin"]) + def test_empty_array(self, method): msg = f"attempt to get {method} of an empty sequence" + arr = SparseArray([]) with pytest.raises(ValueError, match=msg): - arr.argmax() if method == "argmax" else arr.argmin() + getattr(arr, method)() diff --git a/pandas/tests/arrays/string_/test_concat.py b/pandas/tests/arrays/string_/test_concat.py new file mode 100644 index 0000000000000..320d700b2b6c3 --- /dev/null +++ b/pandas/tests/arrays/string_/test_concat.py @@ -0,0 +1,73 @@ +import numpy as np +import pytest + +from pandas.compat import HAS_PYARROW + +from pandas.core.dtypes.cast import find_common_type + +import pandas as pd +import pandas._testing as tm +from pandas.util.version import Version + + +@pytest.mark.parametrize( + "to_concat_dtypes, result_dtype", + [ + # same types + ([("pyarrow", pd.NA), ("pyarrow", pd.NA)], ("pyarrow", pd.NA)), + ([("pyarrow", np.nan), ("pyarrow", np.nan)], ("pyarrow", np.nan)), + ([("python", pd.NA), ("python", pd.NA)], ("python", pd.NA)), + ([("python", np.nan), ("python", np.nan)], ("python", np.nan)), + # pyarrow preference + ([("pyarrow", pd.NA), ("python", pd.NA)], ("pyarrow", pd.NA)), + # NA preference + ([("python", pd.NA), ("python", np.nan)], ("python", pd.NA)), + ], +) +def test_concat_series(request, to_concat_dtypes, result_dtype): + if any(storage == "pyarrow" for storage, _ in to_concat_dtypes) and not HAS_PYARROW: + pytest.skip("Could not import 'pyarrow'") + + ser_list = [ + pd.Series(["a", "b", None], dtype=pd.StringDtype(storage, na_value)) + for storage, na_value in to_concat_dtypes + ] + + result = pd.concat(ser_list, ignore_index=True) + expected = pd.Series( + ["a", "b", None, "a", "b", None], dtype=pd.StringDtype(*result_dtype) + ) + tm.assert_series_equal(result, expected) + + # order doesn't matter for result + result = pd.concat(ser_list[::-1], ignore_index=True) + tm.assert_series_equal(result, expected) + + +def test_concat_with_object(string_dtype_arguments): + # _get_common_dtype cannot inspect values, so object dtype with strings still + # results in object dtype + result = pd.concat( + [ + pd.Series(["a", "b", None], dtype=pd.StringDtype(*string_dtype_arguments)), + pd.Series(["a", "b", None], dtype=object), + ] + ) + assert result.dtype == np.dtype("object") + + +def test_concat_with_numpy(string_dtype_arguments): + # common type with a numpy string dtype always preserves the pandas string dtype + dtype = pd.StringDtype(*string_dtype_arguments) + assert find_common_type([dtype, np.dtype("U")]) == dtype + assert find_common_type([np.dtype("U"), dtype]) == dtype + assert find_common_type([dtype, np.dtype("U10")]) == dtype + assert find_common_type([np.dtype("U10"), dtype]) == dtype + + # with any other numpy dtype -> object + assert find_common_type([dtype, np.dtype("S")]) == np.dtype("object") + assert find_common_type([dtype, np.dtype("int64")]) == np.dtype("object") + + if Version(np.__version__) >= Version("2"): + assert find_common_type([dtype, np.dtypes.StringDType()]) == dtype + assert find_common_type([np.dtypes.StringDType(), dtype]) == dtype diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 320bdca60a932..736c0e1782fc0 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -2,34 +2,43 @@ This module tests the functionality of StringArray and ArrowStringArray.
Tests for the str accessors are in pandas/tests/strings/test_string_array.py """ + import operator import numpy as np import pytest -from pandas.compat.pyarrow import pa_version_under12p0 +from pandas._config import using_string_dtype + +from pandas.compat import HAS_PYARROW +from pandas.compat.pyarrow import ( + pa_version_under12p0, + pa_version_under19p0, +) +import pandas.util._test_decorators as td from pandas.core.dtypes.common import is_dtype_equal import pandas as pd import pandas._testing as tm +from pandas.core.arrays.string_ import StringArrayNumpySemantics from pandas.core.arrays.string_arrow import ( ArrowStringArray, ArrowStringArrayNumpySemantics, ) -def na_val(dtype): - if dtype.storage == "pyarrow_numpy": - return np.nan - else: - return pd.NA +@pytest.fixture +def dtype(string_dtype_arguments): + """Fixture giving StringDtype from parametrized storage and na_value arguments""" + storage, na_value = string_dtype_arguments + return pd.StringDtype(storage=storage, na_value=na_value) @pytest.fixture -def dtype(string_storage): - """Fixture giving StringDtype from parametrized 'string_storage'""" - return pd.StringDtype(storage=string_storage) +def dtype2(string_dtype_arguments2): + storage, na_value = string_dtype_arguments2 + return pd.StringDtype(storage=storage, na_value=na_value) @pytest.fixture @@ -38,52 +47,109 @@ def cls(dtype): return dtype.construct_array_type() +def string_dtype_highest_priority(dtype1, dtype2): + if HAS_PYARROW: + DTYPE_HIERARCHY = [ + pd.StringDtype("python", na_value=np.nan), + pd.StringDtype("pyarrow", na_value=np.nan), + pd.StringDtype("python", na_value=pd.NA), + pd.StringDtype("pyarrow", na_value=pd.NA), + ] + else: + DTYPE_HIERARCHY = [ + pd.StringDtype("python", na_value=np.nan), + pd.StringDtype("python", na_value=pd.NA), + ] + + h1 = DTYPE_HIERARCHY.index(dtype1) + h2 = DTYPE_HIERARCHY.index(dtype2) + return DTYPE_HIERARCHY[max(h1, h2)] + + +def test_dtype_constructor(): + pytest.importorskip("pyarrow") + + with tm.assert_produces_warning(FutureWarning): + dtype = pd.StringDtype("pyarrow_numpy") + assert dtype == pd.StringDtype("pyarrow", na_value=np.nan) + + +def test_dtype_equality(): + pytest.importorskip("pyarrow") + + dtype1 = pd.StringDtype("python") + dtype2 = pd.StringDtype("pyarrow") + dtype3 = pd.StringDtype("pyarrow", na_value=np.nan) + + assert dtype1 == pd.StringDtype("python", na_value=pd.NA) + assert dtype1 != dtype2 + assert dtype1 != dtype3 + + assert dtype2 == pd.StringDtype("pyarrow", na_value=pd.NA) + assert dtype2 != dtype1 + assert dtype2 != dtype3 + + assert dtype3 == pd.StringDtype("pyarrow", na_value=np.nan) + assert dtype3 == pd.StringDtype("pyarrow", na_value=float("nan")) + assert dtype3 != dtype1 + assert dtype3 != dtype2 + + def test_repr(dtype): df = pd.DataFrame({"A": pd.array(["a", pd.NA, "b"], dtype=dtype)}) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: expected = " A\n0 a\n1 NaN\n2 b" else: expected = " A\n0 a\n1 <NA>\n2 b" assert repr(df) == expected - if dtype.storage == "pyarrow_numpy": - expected = "0 a\n1 NaN\n2 b\nName: A, dtype: string" + if dtype.na_value is np.nan: + expected = "0 a\n1 NaN\n2 b\nName: A, dtype: str" else: expected = "0 a\n1 <NA>\n2 b\nName: A, dtype: string" assert repr(df.A) == expected - if dtype.storage == "pyarrow": + if dtype.storage == "pyarrow" and dtype.na_value is pd.NA: arr_name = "ArrowStringArray" expected = f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string" - elif dtype.storage == "pyarrow_numpy": + elif dtype.storage == "pyarrow" and
dtype.na_value is np.nan: arr_name = "ArrowStringArrayNumpySemantics" - expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string" + expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: str" + elif dtype.storage == "python" and dtype.na_value is np.nan: + arr_name = "StringArrayNumpySemantics" + expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: str" else: arr_name = "StringArray" expected = f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string" assert repr(df.A.array) == expected +def test_dtype_repr(dtype): + if dtype.storage == "pyarrow": + if dtype.na_value is pd.NA: + assert repr(dtype) == "<StringDtype(storage='pyarrow', na_value=<NA>)>" + else: + assert repr(dtype) == "<StringDtype(storage='pyarrow', na_value=nan)>" + elif dtype.na_value is pd.NA: + assert repr(dtype) == "<StringDtype(na_value=<NA>)>" + else: + assert repr(dtype) == "<StringDtype(na_value=nan)>" + + def test_none_to_nan(cls, dtype): a = cls._from_sequence(["a", None, "b"], dtype=dtype) assert a[1] is not None - assert a[1] is na_val(a.dtype) + assert a[1] is a.dtype.na_value def test_setitem_validates(cls, dtype): arr = cls._from_sequence(["a", "b"], dtype=dtype) - if cls is pd.arrays.StringArray: - msg = "Cannot set non-string value '10' into a StringArray." - else: - msg = "Scalar must be NA or str" + msg = "Invalid value '10' for dtype 'str" with pytest.raises(TypeError, match=msg): arr[0] = 10 - if cls is pd.arrays.StringArray: - msg = "Must provide strings." - else: - msg = "Scalar must be NA or str" + msg = "Invalid value for dtype 'str" with pytest.raises(TypeError, match=msg): arr[:] = np.array([1, 2]) @@ -149,8 +215,8 @@ def test_add(dtype): tm.assert_series_equal(result, expected) -def test_add_2d(dtype, request, arrow_string_storage): - if dtype.storage in arrow_string_storage: +def test_add_2d(dtype, request): + if dtype.storage == "pyarrow": reason = "Failed: DID NOT RAISE <class 'ValueError'>" mark = pytest.mark.xfail(raises=None, reason=reason) request.applymarker(mark) @@ -224,7 +290,7 @@ def test_comparison_methods_scalar(comparison_op, dtype): a = pd.array(["a", None, "c"], dtype=dtype) other = "a" result = getattr(a, op_name)(other) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: expected = np.array([getattr(item, op_name)(other) for item in a]) if comparison_op == operator.ne: expected[1] = True @@ -243,7 +309,7 @@ def test_comparison_methods_scalar_pd_na(comparison_op, dtype): a = pd.array(["a", None, "c"], dtype=dtype) result = getattr(a, op_name)(pd.NA) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: if operator.ne == comparison_op: expected = np.array([True, True, True]) else: @@ -270,7 +336,7 @@ def test_comparison_methods_scalar_not_string(comparison_op, dtype): result = getattr(a, op_name)(other) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: expected_data = { "__eq__": [False, False, False], "__ne__": [True, True, True], @@ -286,13 +352,18 @@ def test_comparison_methods_scalar_not_string(comparison_op, dtype): tm.assert_extension_array_equal(result, expected) -def test_comparison_methods_array(comparison_op, dtype): +def test_comparison_methods_array(comparison_op, dtype, dtype2): op_name = f"__{comparison_op.__name__}__" a = pd.array(["a", None, "c"], dtype=dtype) - other = [None, None, "c"] - result = getattr(a, op_name)(other) - if dtype.storage == "pyarrow_numpy": + other = pd.array([None, None, "c"], dtype=dtype2) + result = comparison_op(a, other) + + # ensure operation is commutative + result2 = comparison_op(other, a) + tm.assert_equal(result, result2) + + if dtype.na_value is np.nan and dtype2.na_value is np.nan: if operator.ne ==
comparison_op: expected = np.array([True, True, False]) else: @@ -300,11 +371,56 @@ def test_comparison_methods_array(comparison_op, dtype): expected[-1] = getattr(other[-1], op_name)(a[-1]) tm.assert_numpy_array_equal(result, expected) - result = getattr(a, op_name)(pd.NA) + else: + max_dtype = string_dtype_highest_priority(dtype, dtype2) + if max_dtype.storage == "python": + expected_dtype = "boolean" + else: + expected_dtype = "bool[pyarrow]" + + expected = np.full(len(a), fill_value=None, dtype="object") + expected[-1] = getattr(other[-1], op_name)(a[-1]) + expected = pd.array(expected, dtype=expected_dtype) + tm.assert_extension_array_equal(result, expected) + + +@td.skip_if_no("pyarrow") +def test_comparison_methods_array_arrow_extension(comparison_op, dtype2): + # Test pd.ArrowDtype(pa.string()) against other string arrays + import pyarrow as pa + + op_name = f"__{comparison_op.__name__}__" + dtype = pd.ArrowDtype(pa.string()) + a = pd.array(["a", None, "c"], dtype=dtype) + other = pd.array([None, None, "c"], dtype=dtype2) + result = comparison_op(a, other) + + # ensure operation is commutative + result2 = comparison_op(other, a) + tm.assert_equal(result, result2) + + expected = pd.array([None, None, True], dtype="bool[pyarrow]") + expected[-1] = getattr(other[-1], op_name)(a[-1]) + tm.assert_extension_array_equal(result, expected) + + +def test_comparison_methods_list(comparison_op, dtype): + op_name = f"__{comparison_op.__name__}__" + + a = pd.array(["a", None, "c"], dtype=dtype) + other = [None, None, "c"] + result = comparison_op(a, other) + + # ensure operation is commutative + result2 = comparison_op(other, a) + tm.assert_equal(result, result2) + + if dtype.na_value is np.nan: if operator.ne == comparison_op: - expected = np.array([True, True, True]) + expected = np.array([True, True, False]) else: expected = np.array([False, False, False]) + expected[-1] = getattr(other[-1], op_name)(a[-1]) tm.assert_numpy_array_equal(result, expected) else: @@ -314,14 +430,12 @@ def test_comparison_methods_array(comparison_op, dtype): expected = pd.array(expected, dtype=expected_dtype) tm.assert_extension_array_equal(result, expected) - result = getattr(a, op_name)(pd.NA) - expected = pd.array([None, None, None], dtype=expected_dtype) - tm.assert_extension_array_equal(result, expected) - def test_constructor_raises(cls): if cls is pd.arrays.StringArray: msg = "StringArray requires a sequence of strings or pandas.NA" + elif cls is StringArrayNumpySemantics: + msg = "StringArrayNumpySemantics requires a sequence of strings or NaN" else: msg = "Unsupported type '<class 'numpy.ndarray'>' for ArrowExtensionArray" @@ -331,7 +445,7 @@ def test_constructor_raises(cls): with pytest.raises(ValueError, match=msg): cls(np.array([])) - if cls is pd.arrays.StringArray: + if cls is pd.arrays.StringArray or cls is StringArrayNumpySemantics: # GH#45057 np.nan and None do NOT raise, as they are considered valid NAs # for string dtype cls(np.array(["a", np.nan], dtype=object)) @@ -372,6 +486,8 @@ def test_from_sequence_no_mutate(copy, cls, dtype): import pyarrow as pa expected = cls(pa.array(na_arr, type=pa.string(), from_pandas=True)) + elif cls is StringArrayNumpySemantics: + expected = cls(nan_arr) else: expected = cls(na_arr) @@ -386,7 +502,7 @@ def test_astype_int(dtype): tm.assert_numpy_array_equal(result, expected) arr = pd.array(["1", pd.NA, "3"], dtype=dtype) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: err = ValueError msg = "cannot convert float NaN to integer" else: @@ -415,16 +531,12 @@ def
test_astype_float(dtype, any_float_dtype): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("skipna", [True, False]) -@pytest.mark.xfail(reason="Not implemented StringArray.sum") def test_reduce(skipna, dtype): arr = pd.Series(["a", "b", "c"], dtype=dtype) result = arr.sum(skipna=skipna) assert result == "abc" -@pytest.mark.parametrize("skipna", [True, False]) -@pytest.mark.xfail(reason="Not implemented StringArray.sum") def test_reduce_missing(skipna, dtype): arr = pd.Series([None, "a", None, "b", "c", None], dtype=dtype) result = arr.sum(skipna=skipna) @@ -435,7 +547,6 @@ def test_reduce_missing(skipna, dtype): @pytest.mark.parametrize("method", ["min", "max"]) -@pytest.mark.parametrize("skipna", [True, False]) def test_min_max(method, skipna, dtype): arr = pd.Series(["a", "b", "c", None], dtype=dtype) result = getattr(arr, method)(skipna=skipna) @@ -443,13 +554,13 @@ def test_min_max(method, skipna, dtype): expected = "a" if method == "min" else "c" assert result == expected else: - assert result is na_val(arr.dtype) + assert result is arr.dtype.na_value @pytest.mark.parametrize("method", ["min", "max"]) @pytest.mark.parametrize("box", [pd.Series, pd.array]) -def test_min_max_numpy(method, box, dtype, request, arrow_string_storage): - if dtype.storage in arrow_string_storage and box is pd.array: +def test_min_max_numpy(method, box, dtype, request): + if dtype.storage == "pyarrow" and box is pd.array: if box is pd.array: reason = "'<=' not supported between instances of 'str' and 'NoneType'" else: @@ -463,7 +574,7 @@ def test_min_max_numpy(method, box, dtype, request, arrow_string_storage): assert result == expected -def test_fillna_args(dtype, arrow_string_storage): +def test_fillna_args(dtype): # GH 37987 arr = pd.array(["a", pd.NA], dtype=dtype) @@ -476,10 +587,7 @@ def test_fillna_args(dtype, arrow_string_storage): expected = pd.array(["a", "b"], dtype=dtype) tm.assert_extension_array_equal(res, expected) - if dtype.storage in arrow_string_storage: - msg = "Invalid value '1' for dtype string" - else: - msg = "Cannot set non-string value '1' into a StringArray." 
+ msg = "Invalid value '1' for dtype 'str" with pytest.raises(TypeError, match=msg): arr.fillna(value=1) @@ -492,7 +600,7 @@ def test_arrow_array(dtype): data = pd.array(["a", "b", "c"], dtype=dtype) arr = pa.array(data) expected = pa.array(list(data), type=pa.large_string(), from_pandas=True) - if dtype.storage in ("pyarrow", "pyarrow_numpy") and pa_version_under12p0: + if dtype.storage == "pyarrow" and pa_version_under12p0: expected = pa.chunked_array(expected) if dtype.storage == "python": expected = pc.cast(expected, pa.string()) @@ -500,17 +608,10 @@ def test_arrow_array(dtype): @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") -def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): +def test_arrow_roundtrip(dtype, string_storage, using_infer_string): # roundtrip possible from arrow 1.0.0 pa = pytest.importorskip("pyarrow") - if using_infer_string and string_storage2 != "pyarrow_numpy": - request.applymarker( - pytest.mark.xfail( - reason="infer_string takes precedence over string storage" - ) - ) - data = pd.array(["a", "b", None], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) @@ -518,29 +619,42 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): assert table.field("a").type == "string" else: assert table.field("a").type == "large_string" - with pd.option_context("string_storage", string_storage2): + with pd.option_context("string_storage", string_storage): result = table.to_pandas() - assert isinstance(result["a"].dtype, pd.StringDtype) - expected = df.astype(f"string[{string_storage2}]") + if dtype.na_value is np.nan and not using_infer_string: + assert result["a"].dtype == "object" + else: + assert isinstance(result["a"].dtype, pd.StringDtype) + expected = df.astype(pd.StringDtype(string_storage, na_value=dtype.na_value)) + if using_infer_string: + expected.columns = expected.columns.astype( + pd.StringDtype(string_storage, na_value=np.nan) + ) + tm.assert_frame_equal(result, expected) + # ensure the missing value is represented by NA and not np.nan or None + assert result.loc[2, "a"] is result["a"].dtype.na_value + + +@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") +def test_arrow_from_string(using_infer_string): + # not roundtrip, but starting with pyarrow table without pandas metadata + pa = pytest.importorskip("pyarrow") + table = pa.table({"a": pa.array(["a", "b", None], type=pa.string())}) + + result = table.to_pandas() + + if using_infer_string and not pa_version_under19p0: + expected = pd.DataFrame({"a": ["a", "b", None]}, dtype="str") + else: + expected = pd.DataFrame({"a": ["a", "b", None]}, dtype="object") tm.assert_frame_equal(result, expected) - # ensure the missing value is represented by NA and not np.nan or None - assert result.loc[2, "a"] is na_val(result["a"].dtype) @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") -def test_arrow_load_from_zero_chunks( - dtype, string_storage2, request, using_infer_string -): +def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string): # GH-41040 pa = pytest.importorskip("pyarrow") - if using_infer_string and string_storage2 != "pyarrow_numpy": - request.applymarker( - pytest.mark.xfail( - reason="infer_string takes precedence over string storage" - ) - ) - data = pd.array([], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) @@ -550,18 +664,26 @@ def test_arrow_load_from_zero_chunks( assert table.field("a").type == "large_string" 
# Instantiate the same table with no chunks at all table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema) - with pd.option_context("string_storage", string_storage2): + with pd.option_context("string_storage", string_storage): result = table.to_pandas() - assert isinstance(result["a"].dtype, pd.StringDtype) - expected = df.astype(f"string[{string_storage2}]") - tm.assert_frame_equal(result, expected) + + if dtype.na_value is np.nan and not using_string_dtype(): + assert result["a"].dtype == "object" + else: + assert isinstance(result["a"].dtype, pd.StringDtype) + expected = df.astype(pd.StringDtype(string_storage, na_value=dtype.na_value)) + if using_infer_string: + expected.columns = expected.columns.astype( + pd.StringDtype(string_storage, na_value=np.nan) + ) + tm.assert_frame_equal(result, expected) def test_value_counts_na(dtype): - if getattr(dtype, "storage", "") == "pyarrow": - exp_dtype = "int64[pyarrow]" - elif getattr(dtype, "storage", "") == "pyarrow_numpy": + if dtype.na_value is np.nan: exp_dtype = "int64" + elif dtype.storage == "pyarrow": + exp_dtype = "int64[pyarrow]" else: exp_dtype = "Int64" arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype) @@ -575,10 +697,10 @@ def test_value_counts_na(dtype): def test_value_counts_with_normalize(dtype): - if getattr(dtype, "storage", "") == "pyarrow": - exp_dtype = "double[pyarrow]" - elif getattr(dtype, "storage", "") == "pyarrow_numpy": + if dtype.na_value is np.nan: exp_dtype = np.float64 + elif dtype.storage == "pyarrow": + exp_dtype = "double[pyarrow]" else: exp_dtype = "Float64" ser = pd.Series(["a", "b", "a", pd.NA], dtype=dtype) @@ -587,35 +709,23 @@ def test_value_counts_with_normalize(dtype): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize( - "values, expected", - [ - (["a", "b", "c"], np.array([False, False, False])), - (["a", "b", None], np.array([False, False, True])), - ], -) -def test_use_inf_as_na(values, expected, dtype): - # https://github.com/pandas-dev/pandas/issues/33655 - values = pd.array(values, dtype=dtype) - msg = "use_inf_as_na option is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - with pd.option_context("mode.use_inf_as_na", True): - result = values.isna() - tm.assert_numpy_array_equal(result, expected) - - result = pd.Series(values).isna() - expected = pd.Series(expected) - tm.assert_series_equal(result, expected) - - result = pd.DataFrame(values).isna() - expected = pd.DataFrame(expected) - tm.assert_frame_equal(result, expected) +def test_value_counts_sort_false(dtype): + if dtype.na_value is np.nan: + exp_dtype = "int64" + elif dtype.storage == "pyarrow": + exp_dtype = "int64[pyarrow]" + else: + exp_dtype = "Int64" + ser = pd.Series(["a", "b", "c", "b"], dtype=dtype) + result = ser.value_counts(sort=False) + expected = pd.Series([1, 2, 1], index=ser[:3], dtype=exp_dtype, name="count") + tm.assert_series_equal(result, expected) -def test_memory_usage(dtype, arrow_string_storage): +def test_memory_usage(dtype): # GH 33963 - if dtype.storage in arrow_string_storage: + if dtype.storage == "pyarrow": pytest.skip(f"not applicable for {dtype.storage}") series = pd.Series(["a", "b", "c"], dtype=dtype) @@ -635,7 +745,7 @@ def test_astype_from_float_dtype(float_dtype, dtype): def test_to_numpy_returns_pdna_default(dtype): arr = pd.array(["a", pd.NA, "b"], dtype=dtype) result = np.array(arr) - expected = np.array(["a", na_val(dtype), "b"], dtype=object) + expected = np.array(["a", dtype.na_value, "b"], dtype=object) 
tm.assert_numpy_array_equal(result, expected) @@ -666,6 +776,35 @@ def test_isin(dtype, fixed_now_ts): expected = pd.Series([True, False, False]) tm.assert_series_equal(result, expected) + result = s.isin([fixed_now_ts]) + expected = pd.Series([False, False, False]) + tm.assert_series_equal(result, expected) + + +def test_isin_string_array(dtype, dtype2): + s = pd.Series(["a", "b", None], dtype=dtype) + + result = s.isin(pd.array(["a", "c"], dtype=dtype2)) + expected = pd.Series([True, False, False]) + tm.assert_series_equal(result, expected) + + result = s.isin(pd.array(["a", None], dtype=dtype2)) + expected = pd.Series([True, False, True]) + tm.assert_series_equal(result, expected) + + +def test_isin_arrow_string_array(dtype): + pa = pytest.importorskip("pyarrow") + s = pd.Series(["a", "b", None], dtype=dtype) + + result = s.isin(pd.array(["a", "c"], dtype=pd.ArrowDtype(pa.string()))) + expected = pd.Series([True, False, False]) + tm.assert_series_equal(result, expected) + + result = s.isin(pd.array(["a", None], dtype=pd.ArrowDtype(pa.string()))) + expected = pd.Series([True, False, True]) + tm.assert_series_equal(result, expected) + def test_setitem_scalar_with_mask_validation(dtype): # https://github.com/pandas-dev/pandas/issues/47628 @@ -675,14 +814,11 @@ def test_setitem_scalar_with_mask_validation(dtype): mask = np.array([False, True, False]) ser[mask] = None - assert ser.array[1] is na_val(ser.dtype) + assert ser.array[1] is ser.dtype.na_value # for other non-string we should also raise an error ser = pd.Series(["a", "b", "c"], dtype=dtype) - if type(ser.array) is pd.arrays.StringArray: - msg = "Cannot set non-string value" - else: - msg = "Scalar must be NA or str" + msg = "Invalid value '1' for dtype 'str" with pytest.raises(TypeError, match=msg): ser[mask] = 1 @@ -701,3 +837,9 @@ def test_tolist(dtype): result = arr.tolist() expected = vals tm.assert_equal(result, expected) + + +def test_string_array_view_type_error(): + arr = pd.array(["a", "b", "c"], dtype="string") + with pytest.raises(TypeError, match="Cannot change data-type for string array."): + arr.view("i8") diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index d7811b6fed883..e6103da5021bb 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -26,15 +26,18 @@ def test_eq_all_na(): tm.assert_extension_array_equal(result, expected) -def test_config(string_storage, request, using_infer_string): - if using_infer_string and string_storage != "pyarrow_numpy": - request.applymarker(pytest.mark.xfail(reason="infer string takes precedence")) +def test_config(string_storage, using_infer_string): + # with the default string_storage setting + # always "python" at the moment + assert StringDtype().storage == "python" + with pd.option_context("string_storage", string_storage): assert StringDtype().storage == string_storage result = pd.array(["a", "b"]) assert result.dtype.storage == string_storage - dtype = StringDtype(string_storage) + # pd.array(..) 
by default always returns the NA-variant + dtype = StringDtype(string_storage, na_value=pd.NA) expected = dtype.construct_array_type()._from_sequence(["a", "b"], dtype=dtype) tm.assert_equal(result, expected) @@ -46,18 +49,18 @@ def test_config_bad_storage_raises(): @pytest.mark.parametrize("chunked", [True, False]) -@pytest.mark.parametrize("array", ["numpy", "pyarrow"]) -def test_constructor_not_string_type_raises(array, chunked, arrow_string_storage): +@pytest.mark.parametrize("array_lib", ["numpy", "pyarrow"]) +def test_constructor_not_string_type_raises(array_lib, chunked): pa = pytest.importorskip("pyarrow") - array = pa if array in arrow_string_storage else np + array_lib = pa if array_lib == "pyarrow" else np - arr = array.array([1, 2, 3]) + arr = array_lib.array([1, 2, 3]) if chunked: - if array is np: + if array_lib is np: pytest.skip("chunked not applicable to numpy array") arr = pa.chunked_array(arr) - if array is np: + if array_lib is np: msg = "Unsupported type '<class 'numpy.ndarray'>' for ArrowExtensionArray" else: msg = re.escape( @@ -82,19 +85,32 @@ def test_constructor_not_string_type_value_dictionary_raises(chunked): ArrowStringArray(arr) -@pytest.mark.xfail( - reason="dict conversion does not seem to be implemented for large string in arrow" -) +@pytest.mark.parametrize("string_type", ["string", "large_string"]) @pytest.mark.parametrize("chunked", [True, False]) -def test_constructor_valid_string_type_value_dictionary(chunked): +def test_constructor_valid_string_type_value_dictionary(string_type, chunked): pa = pytest.importorskip("pyarrow") - arr = pa.array(["1", "2", "3"], pa.large_string()).dictionary_encode() + arr = pa.array(["1", "2", "3"], getattr(pa, string_type)()).dictionary_encode() + if chunked: + arr = pa.chunked_array(arr) + + arr = ArrowStringArray(arr) + # dictionary type gets converted to a dense large string array + assert pa.types.is_large_string(arr._pa_array.type) + + +@pytest.mark.parametrize("chunked", [True, False]) +def test_constructor_valid_string_view(chunked): + # requires pyarrow>=18 for casting string_view to string + pa = pytest.importorskip("pyarrow", minversion="18") + + arr = pa.array(["1", "2", "3"], pa.string_view()) if chunked: arr = pa.chunked_array(arr) arr = ArrowStringArray(arr) - assert pa.types.is_string(arr._pa_array.type.value_type) + # string_view type gets converted to a dense large string array + assert pa.types.is_large_string(arr._pa_array.type) def test_constructor_from_list(): @@ -220,29 +236,30 @@ def test_setitem_invalid_indexer_raises(): arr = ArrowStringArray(pa.array(list("abcde"))) - with pytest.raises(IndexError, match=None): + with tm.external_error_raised(IndexError): arr[5] = "foo" - with pytest.raises(IndexError, match=None): + with tm.external_error_raised(IndexError): arr[-6] = "foo" - with pytest.raises(IndexError, match=None): + with tm.external_error_raised(IndexError): arr[[0, 5]] = "foo" - with pytest.raises(IndexError, match=None): + with tm.external_error_raised(IndexError): arr[[0, -6]] = "foo" - with pytest.raises(IndexError, match=None): + with tm.external_error_raised(IndexError): arr[[True, True, False]] = "foo" - with pytest.raises(ValueError, match=None): + with tm.external_error_raised(ValueError): arr[[0, 1]] = ["foo", "bar", "baz"] -@pytest.mark.parametrize("dtype", ["string[pyarrow]", "string[pyarrow_numpy]"]) -def test_pickle_roundtrip(dtype): +@pytest.mark.parametrize("na_value", [pd.NA, np.nan]) +def test_pickle_roundtrip(na_value): # GH 42600 pytest.importorskip("pyarrow") + dtype = StringDtype("pyarrow",
na_value=na_value) expected = pd.Series(range(10), dtype=dtype) expected_sliced = expected.head(2) full_pickled = pickle.dumps(expected) @@ -260,6 +277,6 @@ def test_pickle_roundtrip(dtype): def test_string_dtype_error_message(): # GH#55051 pytest.importorskip("pyarrow") - msg = "Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'." + msg = "Storage must be 'python' or 'pyarrow'." with pytest.raises(ValueError, match=msg): StringDtype("bla") diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 96263f498935b..3c0ef1e4d928b 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -1,10 +1,11 @@ import datetime import decimal -import re +import zoneinfo import numpy as np import pytest -import pytz + +from pandas._config import using_string_dtype import pandas as pd import pandas._testing as tm @@ -29,17 +30,15 @@ ) -@pytest.mark.parametrize("dtype_unit", ["M8[h]", "M8[m]", "m8[h]", "M8[m]"]) +@pytest.mark.parametrize("dtype_unit", ["M8[h]", "M8[m]", "m8[h]"]) def test_dt64_array(dtype_unit): - # PR 53817 + # GH#53817 dtype_var = np.dtype(dtype_unit) msg = ( r"datetime64 and timedelta64 dtype resolutions other than " - r"'s', 'ms', 'us', and 'ns' are deprecated. " - r"In future releases passing unsupported resolutions will " - r"raise an exception." + r"'s', 'ms', 'us', and 'ns' are no longer supported." ) - with tm.assert_produces_warning(FutureWarning, match=re.escape(msg)): + with pytest.raises(ValueError, match=msg): pd.array([], dtype=dtype_var) @@ -128,7 +127,7 @@ def test_dt64_array(dtype_unit): ( pd.DatetimeIndex(["2000", "2001"]), None, - DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"), + DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[s]"), ), ( ["2000", "2001"], @@ -216,6 +215,15 @@ def test_dt64_array(dtype_unit): .construct_array_type() ._from_sequence(["a", None], dtype=pd.StringDtype()), ), + ( + ["a", None], + "str", + pd.StringDtype(na_value=np.nan) + .construct_array_type() + ._from_sequence(["a", None], dtype=pd.StringDtype(na_value=np.nan)) + if using_string_dtype() + else NumpyExtensionArray(np.array(["a", "None"])), + ), ( ["a", None], pd.StringDtype(), @@ -223,6 +231,29 @@ def test_dt64_array(dtype_unit): .construct_array_type() ._from_sequence(["a", None], dtype=pd.StringDtype()), ), + ( + ["a", None], + pd.StringDtype(na_value=np.nan), + pd.StringDtype(na_value=np.nan) + .construct_array_type() + ._from_sequence(["a", None], dtype=pd.StringDtype(na_value=np.nan)), + ), + ( + # numpy array with string dtype + np.array(["a", "b"], dtype=str), + pd.StringDtype(), + pd.StringDtype() + .construct_array_type() + ._from_sequence(["a", "b"], dtype=pd.StringDtype()), + ), + ( + # numpy array with string dtype + np.array(["a", "b"], dtype=str), + pd.StringDtype(na_value=np.nan), + pd.StringDtype(na_value=np.nan) + .construct_array_type() + ._from_sequence(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), + ), # Boolean ( [True, None], @@ -250,6 +281,14 @@ def test_dt64_array(dtype_unit): "category", pd.Categorical([pd.Period("2000", "D"), pd.Period("2001", "D")]), ), + # Complex + ( + np.array([complex(1), complex(2)], dtype=np.complex128), + None, + NumpyExtensionArray( + np.array([complex(1), complex(2)], dtype=np.complex128) + ), + ), ], ) def test_array(data, dtype, expected): @@ -272,9 +311,6 @@ def test_array_copy(): assert tm.shares_memory(a, b) -cet = pytz.timezone("CET") - - @pytest.mark.parametrize( "data, expected", [ @@ -288,11 +324,11 @@ def test_array_copy(): # 
datetime ( [pd.Timestamp("2000"), pd.Timestamp("2001")], - DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"), + DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[s]"), ), ( [datetime.datetime(2000, 1, 1), datetime.datetime(2001, 1, 1)], - DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"), + DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[us]"), ), ( np.array([1, 2], dtype="M8[ns]"), @@ -308,16 +344,23 @@ def test_array_copy(): ( [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2001", tz="CET")], DatetimeArray._from_sequence( - ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET", unit="ns") + ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET", unit="s") ), ), ( [ - datetime.datetime(2000, 1, 1, tzinfo=cet), - datetime.datetime(2001, 1, 1, tzinfo=cet), + datetime.datetime( + 2000, 1, 1, tzinfo=zoneinfo.ZoneInfo("Europe/Berlin") + ), + datetime.datetime( + 2001, 1, 1, tzinfo=zoneinfo.ZoneInfo("Europe/Berlin") + ), ], DatetimeArray._from_sequence( - ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz=cet, unit="ns") + ["2000", "2001"], + dtype=pd.DatetimeTZDtype( + tz=zoneinfo.ZoneInfo("Europe/Berlin"), unit="us" + ), ), ), # timedelta @@ -327,11 +370,15 @@ def test_array_copy(): ), ( np.array([1, 2], dtype="m8[ns]"), - TimedeltaArray._from_sequence(np.array([1, 2], dtype="m8[ns]")), + TimedeltaArray._from_sequence( + np.array([1, 2], dtype="m8[ns]"), dtype=np.dtype("m8[ns]") + ), ), ( np.array([1, 2], dtype="m8[us]"), - TimedeltaArray._from_sequence(np.array([1, 2], dtype="m8[us]")), + TimedeltaArray._from_sequence( + np.array([1, 2], dtype="m8[us]"), dtype=np.dtype("m8[us]") + ), ), # integer ([1, 2], IntegerArray._from_sequence([1, 2], dtype="Int64")), @@ -367,6 +414,13 @@ def test_array_copy(): .construct_array_type() ._from_sequence(["a", None], dtype=pd.StringDtype()), ), + ( + # numpy array with string dtype + np.array(["a", "b"], dtype=str), + pd.StringDtype() + .construct_array_type() + ._from_sequence(["a", "b"], dtype=pd.StringDtype()), + ), # Boolean ([True, False], BooleanArray._from_sequence([True, False], dtype="boolean")), ([True, None], BooleanArray._from_sequence([True, None], dtype="boolean")), diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 82524ea115019..d1ef29b0bf8a0 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -11,7 +11,10 @@ OutOfBoundsDatetime, Timestamp, ) -from pandas._libs.tslibs.dtypes import freq_to_period_freqstr +from pandas._libs.tslibs import to_offset +from pandas.compat.numpy import np_version_gt2 + +from pandas.core.dtypes.dtypes import PeriodDtype import pandas as pd from pandas import ( @@ -51,7 +54,7 @@ def period_index(freqstr): warnings.filterwarnings( "ignore", message="Period with BDay freq", category=FutureWarning ) - freqstr = freq_to_period_freqstr(1, freqstr) + freqstr = PeriodDtype(to_offset(freqstr))._freqstr pi = pd.period_range(start=Timestamp("2000-01-01"), periods=100, freq=freqstr) return pi @@ -244,7 +247,7 @@ def test_scalar_from_string(self, arr1d): assert result == arr1d[0] def test_reduce_invalid(self, arr1d): - msg = "does not support reduction 'not a method'" + msg = "does not support operation 'not a method'" with pytest.raises(TypeError, match=msg): arr1d._reduce("not a method") @@ -254,7 +257,8 @@ def test_fillna_method_doesnt_change_orig(self, method): if self.array_cls is PeriodArray: arr = self.array_cls(data, dtype="period[D]") else: - arr = self.array_cls._from_sequence(data) + 
dtype = "M8[ns]" if self.array_cls is DatetimeArray else "m8[ns]" + arr = self.array_cls._from_sequence(data, dtype=np.dtype(dtype)) arr[4] = NaT fill_value = arr[3] if method == "pad" else arr[5] @@ -270,7 +274,8 @@ def test_searchsorted(self): if self.array_cls is PeriodArray: arr = self.array_cls(data, dtype="period[D]") else: - arr = self.array_cls._from_sequence(data) + dtype = "M8[ns]" if self.array_cls is DatetimeArray else "m8[ns]" + arr = self.array_cls._from_sequence(data, dtype=np.dtype(dtype)) # scalar result = arr.searchsorted(arr[1]) @@ -638,13 +643,14 @@ def test_round(self, arr1d): def test_array_interface(self, datetime_index): arr = datetime_index._data + copy_false = None if np_version_gt2 else False # default asarray gives the same underlying data (for tz naive) result = np.asarray(arr) expected = arr._ndarray assert result is expected tm.assert_numpy_array_equal(result, expected) - result = np.array(arr, copy=False) + result = np.array(arr, copy=copy_false) assert result is expected tm.assert_numpy_array_equal(result, expected) @@ -653,11 +659,13 @@ def test_array_interface(self, datetime_index): expected = arr._ndarray assert result is expected tm.assert_numpy_array_equal(result, expected) - result = np.array(arr, dtype="datetime64[ns]", copy=False) + result = np.array(arr, dtype="datetime64[ns]", copy=copy_false) assert result is expected tm.assert_numpy_array_equal(result, expected) result = np.array(arr, dtype="datetime64[ns]") - assert result is not expected + if not np_version_gt2: + # TODO: GH 57739 + assert result is not expected tm.assert_numpy_array_equal(result, expected) # to object dtype @@ -696,6 +704,7 @@ def test_array_tz(self, arr1d): # GH#23524 arr = arr1d dti = self.index_cls(arr1d) + copy_false = None if np_version_gt2 else False expected = dti.asi8.view("M8[ns]") result = np.array(arr, dtype="M8[ns]") @@ -704,17 +713,18 @@ def test_array_tz(self, arr1d): result = np.array(arr, dtype="datetime64[ns]") tm.assert_numpy_array_equal(result, expected) - # check that we are not making copies when setting copy=False - result = np.array(arr, dtype="M8[ns]", copy=False) + # check that we are not making copies when setting copy=copy_false + result = np.array(arr, dtype="M8[ns]", copy=copy_false) assert result.base is expected.base assert result.base is not None - result = np.array(arr, dtype="datetime64[ns]", copy=False) + result = np.array(arr, dtype="datetime64[ns]", copy=copy_false) assert result.base is expected.base assert result.base is not None def test_array_i8_dtype(self, arr1d): arr = arr1d dti = self.index_cls(arr1d) + copy_false = None if np_version_gt2 else False expected = dti.asi8 result = np.array(arr, dtype="i8") @@ -723,18 +733,18 @@ def test_array_i8_dtype(self, arr1d): result = np.array(arr, dtype=np.int64) tm.assert_numpy_array_equal(result, expected) - # check that we are still making copies when setting copy=False - result = np.array(arr, dtype="i8", copy=False) + # check that we are still making copies when setting copy=copy_false + result = np.array(arr, dtype="i8", copy=copy_false) assert result.base is not expected.base assert result.base is None def test_from_array_keeps_base(self): # Ensure that DatetimeArray._ndarray.base isn't lost. 
arr = np.array(["2000-01-01", "2000-01-02"], dtype="M8[ns]") - dta = DatetimeArray._from_sequence(arr) + dta = DatetimeArray._from_sequence(arr, dtype=arr.dtype) assert dta._ndarray is arr - dta = DatetimeArray._from_sequence(arr[:0]) + dta = DatetimeArray._from_sequence(arr[:0], dtype=arr.dtype) assert dta._ndarray.base is arr def test_from_dti(self, arr1d): @@ -761,7 +771,7 @@ def test_to_period(self, datetime_index, freqstr): dti = datetime_index arr = dti._data - freqstr = freq_to_period_freqstr(1, freqstr) + freqstr = PeriodDtype(to_offset(freqstr))._freqstr expected = dti.to_period(freq=freqstr) result = arr.to_period(freq=freqstr) assert isinstance(result, PeriodArray) @@ -772,7 +782,7 @@ def test_to_period_2d(self, arr1d): arr2d = arr1d.reshape(1, -1) warn = None if arr1d.tz is None else UserWarning - with tm.assert_produces_warning(warn): + with tm.assert_produces_warning(warn, match="will drop timezone information"): result = arr2d.to_period("D") expected = arr1d.to_period("D").reshape(1, -1) tm.assert_period_array_equal(result, expected) @@ -880,20 +890,24 @@ def test_concat_same_type_different_freq(self, unit): tm.assert_datetime_array_equal(result, expected) - def test_strftime(self, arr1d): + def test_strftime(self, arr1d, using_infer_string): arr = arr1d result = arr.strftime("%Y %b") expected = np.array([ts.strftime("%Y %b") for ts in arr], dtype=object) - tm.assert_numpy_array_equal(result, expected) + if using_infer_string: + expected = pd.array(expected, dtype=pd.StringDtype(na_value=np.nan)) + tm.assert_equal(result, expected) - def test_strftime_nat(self): + def test_strftime_nat(self, using_infer_string): # GH 29578 arr = DatetimeIndex(["2019-01-01", NaT])._data result = arr.strftime("%Y-%m-%d") expected = np.array(["2019-01-01", np.nan], dtype=object) - tm.assert_numpy_array_equal(result, expected) + if using_infer_string: + expected = pd.array(expected, dtype=pd.StringDtype(na_value=np.nan)) + tm.assert_equal(result, expected) class TestTimedeltaArray(SharedTests): @@ -950,13 +964,14 @@ def test_int_properties(self, timedelta_index, propname): def test_array_interface(self, timedelta_index): arr = timedelta_index._data + copy_false = None if np_version_gt2 else False # default asarray gives the same underlying data result = np.asarray(arr) expected = arr._ndarray assert result is expected tm.assert_numpy_array_equal(result, expected) - result = np.array(arr, copy=False) + result = np.array(arr, copy=copy_false) assert result is expected tm.assert_numpy_array_equal(result, expected) @@ -965,11 +980,13 @@ def test_array_interface(self, timedelta_index): expected = arr._ndarray assert result is expected tm.assert_numpy_array_equal(result, expected) - result = np.array(arr, dtype="timedelta64[ns]", copy=False) + result = np.array(arr, dtype="timedelta64[ns]", copy=copy_false) assert result is expected tm.assert_numpy_array_equal(result, expected) result = np.array(arr, dtype="timedelta64[ns]") - assert result is not expected + if not np_version_gt2: + # TODO: GH 57739 + assert result is not expected tm.assert_numpy_array_equal(result, expected) # to object dtype @@ -1135,9 +1152,17 @@ def test_array_interface(self, arr1d): result = np.asarray(arr, dtype=object) tm.assert_numpy_array_equal(result, expected) + # to int64 gives the underlying representation result = np.asarray(arr, dtype="int64") tm.assert_numpy_array_equal(result, arr.asi8) + result2 = np.asarray(arr, dtype="int64") + assert np.may_share_memory(result, result2) + + result_copy1 = np.array(arr, 
dtype="int64", copy=True) + result_copy2 = np.array(arr, dtype="int64", copy=True) + assert not np.may_share_memory(result_copy1, result_copy2) + # to other dtypes msg = r"float\(\) argument must be a string or a( real)? number, not 'Period'" with pytest.raises(TypeError, match=msg): @@ -1147,20 +1172,24 @@ def test_array_interface(self, arr1d): expected = np.asarray(arr).astype("S20") tm.assert_numpy_array_equal(result, expected) - def test_strftime(self, arr1d): + def test_strftime(self, arr1d, using_infer_string): arr = arr1d result = arr.strftime("%Y") expected = np.array([per.strftime("%Y") for per in arr], dtype=object) - tm.assert_numpy_array_equal(result, expected) + if using_infer_string: + expected = pd.array(expected, dtype=pd.StringDtype(na_value=np.nan)) + tm.assert_equal(result, expected) - def test_strftime_nat(self): + def test_strftime_nat(self, using_infer_string): # GH 29578 arr = PeriodArray(PeriodIndex(["2019-01-01", NaT], dtype="period[D]")) result = arr.strftime("%Y-%m-%d") expected = np.array(["2019-01-01", np.nan], dtype=object) - tm.assert_numpy_array_equal(result, expected) + if using_infer_string: + expected = pd.array(expected, dtype=pd.StringDtype(na_value=np.nan)) + tm.assert_equal(result, expected) @pytest.mark.parametrize( @@ -1313,12 +1342,6 @@ def test_from_pandas_array(dtype): cls = {"M8[ns]": DatetimeArray, "m8[ns]": TimedeltaArray}[dtype] - depr_msg = f"{cls.__name__}.__init__ is deprecated" - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - result = cls(arr) - expected = cls(data) - tm.assert_extension_array_equal(result, expected) - result = cls._from_sequence(arr, dtype=dtype) expected = cls._from_sequence(data, dtype=dtype) tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 9a576be10d5ca..87505b1b22fc4 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -1,17 +1,12 @@ """ Tests for DatetimeArray """ + from __future__ import annotations from datetime import timedelta import operator -try: - from zoneinfo import ZoneInfo -except ImportError: - # Cannot assign to a type - ZoneInfo = None # type: ignore[misc, assignment] - import numpy as np import pytest @@ -504,7 +499,7 @@ def test_value_counts_preserves_tz(self): @pytest.mark.parametrize("method", ["pad", "backfill"]) def test_fillna_preserves_tz(self, method): dti = pd.date_range("2000-01-01", periods=5, freq="D", tz="US/Central") - arr = DatetimeArray._from_sequence(dti, copy=True) + arr = DatetimeArray._from_sequence(dti, dtype=dti.dtype, copy=True) arr[2] = pd.NaT fill_val = dti[1] if method == "pad" else dti[3] @@ -670,7 +665,9 @@ def test_shift_fill_value(self): dti = pd.date_range("2016-01-01", periods=3) dta = dti._data - expected = DatetimeArray._from_sequence(np.roll(dta._ndarray, 1)) + expected = DatetimeArray._from_sequence( + np.roll(dta._ndarray, 1), dtype=dti.dtype + ) fv = dta[-1] for fill_value in [fv, fv.to_pydatetime(), fv.to_datetime64()]: @@ -723,27 +720,24 @@ def test_tz_localize_t2d(self): roundtrip = expected.tz_localize("US/Pacific") tm.assert_datetime_array_equal(roundtrip, dta) - easts = ["US/Eastern", "dateutil/US/Eastern"] - if ZoneInfo is not None: - try: - tz = ZoneInfo("US/Eastern") - except KeyError: - # no tzdata - pass - else: - # Argument 1 to "append" of "list" has incompatible type "ZoneInfo"; - # expected "str" - easts.append(tz) # type: ignore[arg-type] - - @pytest.mark.parametrize("tz", easts) + 
@pytest.mark.parametrize( + "tz", ["US/Eastern", "dateutil/US/Eastern", "pytz/US/Eastern"] + ) def test_iter_zoneinfo_fold(self, tz): # GH#49684 + if tz.startswith("pytz/"): + pytz = pytest.importorskip("pytz") + tz = pytz.timezone(tz.removeprefix("pytz/")) utc_vals = np.array( [1320552000, 1320555600, 1320559200, 1320562800], dtype=np.int64 ) utc_vals *= 1_000_000_000 - dta = DatetimeArray._from_sequence(utc_vals).tz_localize("UTC").tz_convert(tz) + dta = ( + DatetimeArray._from_sequence(utc_vals, dtype=np.dtype("M8[ns]")) + .tz_localize("UTC") + .tz_convert(tz) + ) left = dta[2] right = list(dta)[2] @@ -763,27 +757,71 @@ def test_iter_zoneinfo_fold(self, tz): assert left.utcoffset() == right2.utcoffset() @pytest.mark.parametrize( - "freq, freq_depr", + "freq", + ["2M", "2SM", "2sm", "2Q", "2Q-SEP", "1Y", "2Y-MAR", "2m", "2q-sep", "2y"], + ) + def test_date_range_frequency_M_Q_Y_raises(self, freq): + msg = f"Invalid frequency: {freq}" + + with pytest.raises(ValueError, match=msg): + pd.date_range("1/1/2000", periods=4, freq=freq) + + @pytest.mark.parametrize("freq_depr", ["2MIN", "2nS", "2Us"]) + def test_date_range_uppercase_frequency_deprecated(self, freq_depr): + # GH#9586, GH#54939 + depr_msg = ( + f"'{freq_depr[1:]}' is deprecated and will be removed in a " + f"future version, please use '{freq_depr.lower()[1:]}' instead." + ) + + expected = pd.date_range("1/1/2000", periods=4, freq=freq_depr.lower()) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + result = pd.date_range("1/1/2000", periods=4, freq=freq_depr) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "freq", [ - ("2ME", "2M"), - ("2QE", "2Q"), - ("2QE-SEP", "2Q-SEP"), - ("1YE", "1Y"), - ("2YE-MAR", "2Y-MAR"), - ("1YE", "1A"), - ("2YE-MAR", "2A-MAR"), + "2ye-mar", + "2ys", + "2qe", + "2qs-feb", + "2bqs", + "2sms", + "2bms", + "2cbme", + "2me", ], ) - def test_date_range_frequency_M_Q_Y_A_deprecated(self, freq, freq_depr): - # GH#9586, GH#54275 - depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed " - f"in a future version, please use '{freq[1:]}' instead." 
+ def test_date_range_lowercase_frequency_raises(self, freq): + msg = f"Invalid frequency: {freq}" + + with pytest.raises(ValueError, match=msg): + pd.date_range("1/1/2000", periods=4, freq=freq) - expected = pd.date_range("1/1/2000", periods=4, freq=freq) + def test_date_range_lowercase_frequency_deprecated(self): + # GH#9586, GH#54939 + depr_msg = "'w' is deprecated and will be removed in a future version" + + expected = pd.date_range("1/1/2000", periods=4, freq="2W") with tm.assert_produces_warning(FutureWarning, match=depr_msg): - result = pd.date_range("1/1/2000", periods=4, freq=freq_depr) + result = pd.date_range("1/1/2000", periods=4, freq="2w") tm.assert_index_equal(result, expected) + @pytest.mark.parametrize("freq", ["1A", "2A-MAR", "2a-mar"]) + def test_date_range_frequency_A_raises(self, freq): + msg = f"Invalid frequency: {freq}" + + with pytest.raises(ValueError, match=msg): + pd.date_range("1/1/2000", periods=4, freq=freq) + + @pytest.mark.parametrize("freq", ["2H", "2CBH", "2S"]) + def test_date_range_uppercase_frequency_raises(self, freq): + msg = f"Invalid frequency: {freq}" + + with pytest.raises(ValueError, match=msg): + pd.date_range("1/1/2000", periods=4, freq=freq) + def test_factorize_sort_without_freq(): dta = DatetimeArray._from_sequence([0, 2, 1], dtype="M8[ns]") diff --git a/pandas/tests/arrays/test_ndarray_backed.py b/pandas/tests/arrays/test_ndarray_backed.py index 1fe7cc9b03e8a..2af59a03a5b3e 100644 --- a/pandas/tests/arrays/test_ndarray_backed.py +++ b/pandas/tests/arrays/test_ndarray_backed.py @@ -1,6 +1,7 @@ """ Tests for subclasses of NDArrayBackedExtensionArray """ + import numpy as np from pandas import ( diff --git a/pandas/tests/arrays/test_timedeltas.py b/pandas/tests/arrays/test_timedeltas.py index a3f15467feb14..fb7c7afdc6ff9 100644 --- a/pandas/tests/arrays/test_timedeltas.py +++ b/pandas/tests/arrays/test_timedeltas.py @@ -194,18 +194,17 @@ def test_add_timedeltaarraylike(self, tda): class TestTimedeltaArray: - @pytest.mark.parametrize("dtype", [int, np.int32, np.int64, "uint32", "uint64"]) - def test_astype_int(self, dtype): + def test_astype_int(self, any_int_numpy_dtype): arr = TimedeltaArray._from_sequence( [Timedelta("1h"), Timedelta("2h")], dtype="m8[ns]" ) - if np.dtype(dtype) != np.int64: + if np.dtype(any_int_numpy_dtype) != np.int64: with pytest.raises(TypeError, match=r"Do obj.astype\('int64'\)"): - arr.astype(dtype) + arr.astype(any_int_numpy_dtype) return - result = arr.astype(dtype) + result = arr.astype(any_int_numpy_dtype) expected = arr._ndarray.view("i8") tm.assert_numpy_array_equal(result, expected) @@ -264,10 +263,10 @@ def test_searchsorted_invalid_types(self, other, index): class TestUnaryOps: def test_abs(self): vals = np.array([-3600 * 10**9, "NaT", 7200 * 10**9], dtype="m8[ns]") - arr = TimedeltaArray._from_sequence(vals) + arr = TimedeltaArray._from_sequence(vals, dtype=vals.dtype) evals = np.array([3600 * 10**9, "NaT", 7200 * 10**9], dtype="m8[ns]") - expected = TimedeltaArray._from_sequence(evals) + expected = TimedeltaArray._from_sequence(evals, dtype=evals.dtype) result = abs(arr) tm.assert_timedelta_array_equal(result, expected) @@ -277,7 +276,7 @@ def test_abs(self): def test_pos(self): vals = np.array([-3600 * 10**9, "NaT", 7200 * 10**9], dtype="m8[ns]") - arr = TimedeltaArray._from_sequence(vals) + arr = TimedeltaArray._from_sequence(vals, dtype=vals.dtype) result = +arr tm.assert_timedelta_array_equal(result, arr) @@ -289,7 +288,7 @@ def test_pos(self): def test_neg(self): vals = np.array([-3600 * 10**9, "NaT", 
7200 * 10**9], dtype="m8[ns]") - arr = TimedeltaArray._from_sequence(vals) + arr = TimedeltaArray._from_sequence(vals, dtype=vals.dtype) evals = np.array([3600 * 10**9, "NaT", -7200 * 10**9], dtype="m8[ns]") expected = TimedeltaArray._from_sequence(evals) diff --git a/pandas/tests/arrays/timedeltas/test_constructors.py b/pandas/tests/arrays/timedeltas/test_constructors.py index 91b6f7fa222f9..ee29f505fd7b1 100644 --- a/pandas/tests/arrays/timedeltas/test_constructors.py +++ b/pandas/tests/arrays/timedeltas/test_constructors.py @@ -1,45 +1,10 @@ import numpy as np import pytest -import pandas._testing as tm from pandas.core.arrays import TimedeltaArray class TestTimedeltaArrayConstructor: - def test_only_1dim_accepted(self): - # GH#25282 - arr = np.array([0, 1, 2, 3], dtype="m8[h]").astype("m8[ns]") - - depr_msg = "TimedeltaArray.__init__ is deprecated" - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - with pytest.raises(ValueError, match="Only 1-dimensional"): - # 3-dim, we allow 2D to sneak in for ops purposes GH#29853 - TimedeltaArray(arr.reshape(2, 2, 1)) - - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - with pytest.raises(ValueError, match="Only 1-dimensional"): - # 0-dim - TimedeltaArray(arr[[0]].squeeze()) - - def test_freq_validation(self): - # ensure that the public constructor cannot create an invalid instance - arr = np.array([0, 0, 1], dtype=np.int64) * 3600 * 10**9 - - msg = ( - "Inferred frequency None from passed values does not " - "conform to passed frequency D" - ) - depr_msg = "TimedeltaArray.__init__ is deprecated" - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - with pytest.raises(ValueError, match=msg): - TimedeltaArray(arr.view("timedelta64[ns]"), freq="D") - - def test_non_array_raises(self): - depr_msg = "TimedeltaArray.__init__ is deprecated" - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - with pytest.raises(ValueError, match="list"): - TimedeltaArray([1, 2, 3]) - def test_other_type_raises(self): msg = r"dtype bool cannot be converted to timedelta64\[ns\]" with pytest.raises(TypeError, match=msg): @@ -78,16 +43,6 @@ def test_incorrect_dtype_raises(self): np.array([1, 2, 3], dtype="i8"), dtype=np.dtype("m8[Y]") ) - def test_mismatched_values_dtype_units(self): - arr = np.array([1, 2, 3], dtype="m8[s]") - dtype = np.dtype("m8[ns]") - msg = r"Values resolution does not match dtype" - depr_msg = "TimedeltaArray.__init__ is deprecated" - - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - with pytest.raises(ValueError, match=msg): - TimedeltaArray(arr, dtype=dtype) - def test_copy(self): data = np.array([1, 2, 3], dtype="m8[ns]") arr = TimedeltaArray._from_sequence(data, copy=False) diff --git a/pandas/tests/arrays/timedeltas/test_reductions.py b/pandas/tests/arrays/timedeltas/test_reductions.py index 991dbf41c8087..3565f2e8690e2 100644 --- a/pandas/tests/arrays/timedeltas/test_reductions.py +++ b/pandas/tests/arrays/timedeltas/test_reductions.py @@ -10,7 +10,6 @@ class TestReductions: @pytest.mark.parametrize("name", ["std", "min", "max", "median", "mean"]) - @pytest.mark.parametrize("skipna", [True, False]) def test_reductions_empty(self, name, skipna): tdi = pd.TimedeltaIndex([]) arr = tdi.array @@ -21,7 +20,6 @@ def test_reductions_empty(self, name, skipna): result = getattr(arr, name)(skipna=skipna) assert result is pd.NaT - @pytest.mark.parametrize("skipna", [True, False]) def test_sum_empty(self, skipna): tdi = pd.TimedeltaIndex([]) arr = tdi.array diff --git 
a/pandas/tests/base/test_constructors.py b/pandas/tests/base/test_constructors.py index f3ac60f672ee1..dffd2009ef373 100644 --- a/pandas/tests/base/test_constructors.py +++ b/pandas/tests/base/test_constructors.py @@ -146,10 +146,12 @@ def test_constructor_datetime_outofbound( # No dtype specified (dtype inference) # datetime64[non-ns] raise error, other cases result in object dtype # and preserve original data - if a.dtype.kind == "M": + result = constructor(a) + if a.dtype.kind == "M" or isinstance(a[0], np.datetime64): # Can't fit in nanosecond bounds -> get the nearest supported unit - result = constructor(a) assert result.dtype == "M8[s]" + elif isinstance(a[0], datetime): + assert result.dtype == "M8[us]", result.dtype else: result = constructor(a) if using_infer_string and "object-string" in request.node.callspec.id: @@ -177,3 +179,14 @@ def test_constructor_datetime_nonns(self, constructor): arr.flags.writeable = False result = constructor(arr) tm.assert_equal(result, expected) + + def test_constructor_from_dict_keys(self, constructor, using_infer_string): + # https://github.com/pandas-dev/pandas/issues/60343 + d = {"a": 1, "b": 2} + result = constructor(d.keys(), dtype="str") + if using_infer_string: + assert result.dtype == "str" + else: + assert result.dtype == "object" + expected = constructor(list(d.keys()), dtype="str") + tm.assert_equal(result, expected) diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index fe0f1f1454a55..e3a821519c638 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -1,6 +1,9 @@ import numpy as np import pytest +from pandas.compat import HAS_PYARROW +from pandas.compat.numpy import np_version_gt2 + from pandas.core.dtypes.dtypes import DatetimeTZDtype import pandas as pd @@ -20,6 +23,7 @@ SparseArray, TimedeltaArray, ) +from pandas.core.arrays.string_ import StringArrayNumpySemantics from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics @@ -218,7 +222,9 @@ def test_iter_box_period(self): ) def test_values_consistent(arr, expected_type, dtype, using_infer_string): if using_infer_string and dtype == "object": - expected_type = ArrowStringArrayNumpySemantics + expected_type = ( + ArrowStringArrayNumpySemantics if HAS_PYARROW else StringArrayNumpySemantics + ) l_values = Series(arr)._values r_values = pd.Index(arr)._values assert type(l_values) is expected_type @@ -270,7 +276,7 @@ def test_numpy_array_all_dtypes(any_numpy_dtype): ), ], ) -def test_array(arr, attr, index_or_series, request): +def test_array(arr, attr, index_or_series): box = index_or_series result = box(arr, copy=False).array @@ -290,24 +296,27 @@ def test_array_multiindex_raises(): @pytest.mark.parametrize( - "arr, expected", + "arr, expected, zero_copy", [ - (np.array([1, 2], dtype=np.int64), np.array([1, 2], dtype=np.int64)), - (pd.Categorical(["a", "b"]), np.array(["a", "b"], dtype=object)), + (np.array([1, 2], dtype=np.int64), np.array([1, 2], dtype=np.int64), True), + (pd.Categorical(["a", "b"]), np.array(["a", "b"], dtype=object), False), ( pd.core.arrays.period_array(["2000", "2001"], freq="D"), np.array([pd.Period("2000", freq="D"), pd.Period("2001", freq="D")]), + False, ), - (pd.array([0, np.nan], dtype="Int64"), np.array([0, np.nan])), + (pd.array([0, np.nan], dtype="Int64"), np.array([0, np.nan]), False), ( IntervalArray.from_breaks([0, 1, 2]), np.array([pd.Interval(0, 1), pd.Interval(1, 2)], dtype=object), + False, ), - (SparseArray([0, 1]), np.array([0, 1], 
dtype=np.int64)), + (SparseArray([0, 1]), np.array([0, 1], dtype=np.int64), False), # tz-naive datetime ( DatetimeArray._from_sequence(np.array(["2000", "2001"], dtype="M8[ns]")), np.array(["2000", "2001"], dtype="M8[ns]"), + True, ), # tz-aware stays tz`-aware ( @@ -322,13 +331,16 @@ def test_array_multiindex_raises(): Timestamp("2000-01-02", tz="US/Central"), ] ), + False, ), # Timedelta ( TimedeltaArray._from_sequence( - np.array([0, 3600000000000], dtype="i8").view("m8[ns]") + np.array([0, 3600000000000], dtype="i8").view("m8[ns]"), + dtype=np.dtype("m8[ns]"), ), np.array([0, 3600000000000], dtype="m8[ns]"), + True, ), # GH#26406 tz is preserved in Categorical[dt64tz] ( @@ -339,10 +351,11 @@ def test_array_multiindex_raises(): Timestamp("2016-01-02", tz="US/Pacific"), ] ), + False, ), ], ) -def test_to_numpy(arr, expected, index_or_series_or_array, request): +def test_to_numpy(arr, expected, zero_copy, index_or_series_or_array): box = index_or_series_or_array with tm.assert_produces_warning(None): @@ -354,6 +367,28 @@ def test_to_numpy(arr, expected, index_or_series_or_array, request): result = np.asarray(thing) tm.assert_numpy_array_equal(result, expected) + # Additionally, we check the `copy=` semantics for array/asarray + # (these are implemented by us via `__array__`). + result_cp1 = np.array(thing, copy=True) + result_cp2 = np.array(thing, copy=True) + # When called with `copy=True` NumPy/we should ensure a copy was made + assert not np.may_share_memory(result_cp1, result_cp2) + + if not np_version_gt2: + # copy=False semantics are only supported in NumPy>=2. + return + + if not zero_copy: + with pytest.raises(ValueError, match="Unable to avoid copy while creating"): + # An error is always acceptable for `copy=False` + np.array(thing, copy=False) + + else: + result_nocopy1 = np.array(thing, copy=False) + result_nocopy2 = np.array(thing, copy=False) + # If copy=False was given, these must share the same data + assert np.may_share_memory(result_nocopy1, result_nocopy2) + @pytest.mark.parametrize("as_series", [True, False]) @pytest.mark.parametrize( @@ -366,13 +401,13 @@ def test_to_numpy_copy(arr, as_series, using_infer_string): # no copy by default result = obj.to_numpy() - if using_infer_string and arr.dtype == object: + if using_infer_string and arr.dtype == object and obj.dtype.storage == "pyarrow": assert np.shares_memory(arr, result) is False else: assert np.shares_memory(arr, result) is True result = obj.to_numpy(copy=False) - if using_infer_string and arr.dtype == object: + if using_infer_string and arr.dtype == object and obj.dtype.storage == "pyarrow": assert np.shares_memory(arr, result) is False else: assert np.shares_memory(arr, result) is True @@ -383,7 +418,7 @@ def test_to_numpy_copy(arr, as_series, using_infer_string): @pytest.mark.parametrize("as_series", [True, False]) -def test_to_numpy_dtype(as_series, unit): +def test_to_numpy_dtype(as_series): tz = "US/Eastern" obj = pd.DatetimeIndex(["2000", "2001"], tz=tz) if as_series: @@ -412,7 +447,7 @@ def test_to_numpy_dtype(as_series, unit): [Timestamp("2000"), Timestamp("2000"), pd.NaT], None, Timestamp("2000"), - [np.datetime64("2000-01-01T00:00:00.000000000")] * 3, + [np.datetime64("2000-01-01T00:00:00", "s")] * 3, ), ], ) @@ -454,7 +489,7 @@ def test_to_numpy_na_value_numpy_dtype( [(0, Timestamp("2021")), (0, Timestamp("2022")), (1, Timestamp("2000"))], None, Timestamp("2000"), - [np.datetime64("2000-01-01T00:00:00.000000000")] * 3, + [np.datetime64("2000-01-01T00:00:00", "s")] * 3, ), ], ) @@ -499,22 +534,23 @@ 
def test_to_numpy_dataframe_na_value(data, dtype, na_value): @pytest.mark.parametrize( - "data, expected", + "data, expected_data", [ ( {"a": pd.array([1, 2, None])}, - np.array([[1.0], [2.0], [np.nan]], dtype=float), + [[1.0], [2.0], [np.nan]], ), ( {"a": [1, 2, 3], "b": [1, 2, 3]}, - np.array([[1, 1], [2, 2], [3, 3]], dtype=float), + [[1, 1], [2, 2], [3, 3]], ), ], ) -def test_to_numpy_dataframe_single_block(data, expected): +def test_to_numpy_dataframe_single_block(data, expected_data): # https://github.com/pandas-dev/pandas/issues/33820 df = pd.DataFrame(data) result = df.to_numpy(dtype=float, na_value=np.nan) + expected = np.array(expected_data, dtype=float) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/base/test_fillna.py b/pandas/tests/base/test_fillna.py index 7300d3013305a..8c56bcc169d8e 100644 --- a/pandas/tests/base/test_fillna.py +++ b/pandas/tests/base/test_fillna.py @@ -16,7 +16,7 @@ def test_fillna(index_or_series_obj): obj = index_or_series_obj if isinstance(obj, MultiIndex): - msg = "isna is not defined for MultiIndex" + msg = "fillna is not defined for MultiIndex" with pytest.raises(NotImplementedError, match=msg): obj.fillna(0) return diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index 65e234e799353..7819b7b75f065 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.compat import PYPY @@ -17,7 +17,6 @@ Index, Series, ) -import pandas._testing as tm def test_isnull_notnull_docstrings(): @@ -83,7 +82,7 @@ def test_ndarray_compat_properties(index_or_series_obj): @pytest.mark.skipif( - PYPY or using_pyarrow_string_dtype(), + PYPY or using_string_dtype(), reason="not relevant for PyPy doesn't work properly for arrow strings", ) def test_memory_usage(index_or_series_memory_obj): @@ -130,9 +129,13 @@ def test_memory_usage_components_series(series_with_simple_index): assert total_usage == non_index_usage + index_usage -@pytest.mark.parametrize("dtype", tm.NARROW_NP_DTYPES) -def test_memory_usage_components_narrow_series(dtype): - series = Series(range(5), dtype=dtype, index=[f"i-{i}" for i in range(5)], name="a") +def test_memory_usage_components_narrow_series(any_real_numpy_dtype): + series = Series( + range(5), + dtype=any_real_numpy_dtype, + index=[f"i-{i}" for i in range(5)], + name="a", + ) total_usage = series.memory_usage(index=True) non_index_usage = series.memory_usage(index=False) index_usage = series.index.memory_usage() @@ -180,9 +183,7 @@ def test_access_by_position(index_flat): assert index[-1] == index[size - 1] msg = f"index {size} is out of bounds for axis 0 with size {size}" - if is_dtype_equal(index.dtype, "string[pyarrow]") or is_dtype_equal( - index.dtype, "string[pyarrow_numpy]" - ): + if isinstance(index.dtype, pd.StringDtype) and index.dtype.storage == "pyarrow": msg = "index out of bounds" with pytest.raises(IndexError, match=msg): index[size] diff --git a/pandas/tests/base/test_unique.py b/pandas/tests/base/test_unique.py index d3fe144f70cfc..7f094db6ea524 100644 --- a/pandas/tests/base/test_unique.py +++ b/pandas/tests/base/test_unique.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype - import pandas as pd import pandas._testing as tm from pandas.tests.base.common import allow_na_ops @@ -100,12 +98,11 @@ def test_nunique_null(null_obj, 
index_or_series_obj): @pytest.mark.single_cpu -@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="decoding fails") def test_unique_bad_unicode(index_or_series): # regression test for #34550 uval = "\ud83d" # smiley emoji - obj = index_or_series([uval] * 2) + obj = index_or_series([uval] * 2, dtype=object) result = obj.unique() if isinstance(obj, pd.Index): @@ -116,7 +113,6 @@ def test_unique_bad_unicode(index_or_series): tm.assert_numpy_array_equal(result, expected) -@pytest.mark.parametrize("dropna", [True, False]) def test_nunique_dropna(dropna): # GH37566 ser = pd.Series(["yes", "yes", pd.NA, np.nan, None, pd.NaT]) diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index 2729666398877..bcb31829a201f 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -47,11 +47,6 @@ def test_value_counts(index_or_series_obj): # i.e IntegerDtype expected = expected.astype("Int64") - # TODO(GH#32514): Order of entries with the same count is inconsistent - # on CI (gh-32449) - if obj.duplicated().any(): - result = result.sort_index() - expected = expected.sort_index() tm.assert_series_equal(result, expected) @@ -89,11 +84,6 @@ def test_value_counts_null(null_obj, index_or_series_obj): expected.index.name = obj.name result = obj.value_counts() - if obj.duplicated().any(): - # TODO(GH#32514): - # Order of entries with the same count is inconsistent on CI (gh-32449) - expected = expected.sort_index() - result = result.sort_index() if not isinstance(result.dtype, np.dtype): if getattr(obj.dtype, "storage", "") == "pyarrow": @@ -106,11 +96,8 @@ def test_value_counts_null(null_obj, index_or_series_obj): expected[null_obj] = 3 result = obj.value_counts(dropna=False) - if obj.duplicated().any(): - # TODO(GH#32514): - # Order of entries with the same count is inconsistent on CI (gh-32449) - expected = expected.sort_index() - result = result.sort_index() + expected = expected.sort_index() + result = result.sort_index() tm.assert_series_equal(result, expected) @@ -127,7 +114,7 @@ def test_value_counts_inferred(index_or_series, using_infer_string): else: exp = np.unique(np.array(s_values, dtype=np.object_)) if using_infer_string: - exp = array(exp) + exp = array(exp, dtype="str") tm.assert_equal(s.unique(), exp) assert s.nunique() == 4 @@ -205,7 +192,7 @@ def test_value_counts_bins(index_or_series, using_infer_string): else: exp = np.array(["a", "b", np.nan, "d"], dtype=object) if using_infer_string: - exp = array(exp) + exp = array(exp, dtype="str") tm.assert_equal(s.unique(), exp) assert s.nunique() == 3 @@ -329,7 +316,6 @@ def test_value_counts_timedelta64(index_or_series, unit): tm.assert_series_equal(result2, expected_s) -@pytest.mark.parametrize("dropna", [True, False]) def test_value_counts_with_nan(dropna, index_or_series): # GH31944 klass = index_or_series @@ -348,9 +334,8 @@ def test_value_counts_object_inference_deprecated(): dti = pd.date_range("2016-01-01", periods=3, tz="UTC") idx = dti.astype(object) - msg = "The behavior of value_counts with object-dtype is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - res = idx.value_counts() + res = idx.value_counts() exp = dti.value_counts() + exp.index = exp.index.astype(object) tm.assert_series_equal(res, exp) diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 17630f14b08c7..9fed65faee896 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -48,7 +48,6 
@@ ) from pandas.core.computation.ops import ( ARITH_OPS_SYMS, - SPECIAL_CASE_ARITH_OPS_SYMS, _binary_math_ops, _binary_ops_dict, _unary_math_ops, @@ -141,8 +140,8 @@ class TestEval: def test_complex_cmp_ops(self, cmp1, cmp2, binop, lhs, rhs, engine, parser): if parser == "python" and binop in ["and", "or"]: msg = "'BoolOp' nodes are not implemented" + ex = f"(lhs {cmp1} rhs) {binop} (lhs {cmp2} rhs)" with pytest.raises(NotImplementedError, match=msg): - ex = f"(lhs {cmp1} rhs) {binop} (lhs {cmp2} rhs)" pd.eval(ex, engine=engine, parser=parser) return @@ -161,9 +160,8 @@ def test_simple_cmp_ops(self, cmp_op, lhs, rhs, engine, parser): if parser == "python" and cmp_op in ["in", "not in"]: msg = "'(In|NotIn)' nodes are not implemented" - + ex = f"lhs {cmp_op} rhs" with pytest.raises(NotImplementedError, match=msg): - ex = f"lhs {cmp_op} rhs" pd.eval(ex, engine=engine, parser=parser) return @@ -193,8 +191,8 @@ def test_simple_cmp_ops(self, cmp_op, lhs, rhs, engine, parser): def test_compound_invert_op(self, op, lhs, rhs, request, engine, parser): if parser == "python" and op in ["in", "not in"]: msg = "'(In|NotIn)' nodes are not implemented" + ex = f"~(lhs {op} rhs)" with pytest.raises(NotImplementedError, match=msg): - ex = f"~(lhs {op} rhs)" pd.eval(ex, engine=engine, parser=parser) return @@ -267,7 +265,7 @@ def test_chained_cmp_op(self, cmp1, cmp2, lhs, midhs, rhs, engine, parser): tm.assert_almost_equal(result, expected) @pytest.mark.parametrize( - "arith1", sorted(set(ARITH_OPS_SYMS).difference(SPECIAL_CASE_ARITH_OPS_SYMS)) + "arith1", sorted(set(ARITH_OPS_SYMS).difference({"**", "//", "%"})) ) def test_binary_arith_ops(self, arith1, lhs, rhs, engine, parser): ex = f"lhs {arith1} rhs" @@ -523,14 +521,15 @@ def test_series_negate(self, engine, parser): "lhs", [ # Float - DataFrame(np.random.default_rng(2).standard_normal((5, 2))), + np.random.default_rng(2).standard_normal((5, 2)), # Int - DataFrame(np.random.default_rng(2).integers(5, size=(5, 2))), + np.random.default_rng(2).integers(5, size=(5, 2)), # bool doesn't work with numexpr but works elsewhere - DataFrame(np.random.default_rng(2).standard_normal((5, 2)) > 0.5), + np.array([True, False, True, False, True], dtype=np.bool_), ], ) def test_frame_pos(self, lhs, engine, parser): + lhs = DataFrame(lhs) expr = "+lhs" expect = lhs @@ -541,14 +540,15 @@ def test_frame_pos(self, lhs, engine, parser): "lhs", [ # Float - Series(np.random.default_rng(2).standard_normal(5)), + np.random.default_rng(2).standard_normal(5), # Int - Series(np.random.default_rng(2).integers(5, size=5)), + np.random.default_rng(2).integers(5, size=5), # bool doesn't work with numexpr but works elsewhere - Series(np.random.default_rng(2).standard_normal(5) > 0.5), + np.array([True, False, True, False, True], dtype=np.bool_), ], ) def test_series_pos(self, lhs, engine, parser): + lhs = Series(lhs) expr = "+lhs" expect = lhs @@ -606,11 +606,10 @@ def test_unary_in_array(self): ) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @pytest.mark.parametrize("expr", ["x < -0.1", "-5 > x"]) - def test_float_comparison_bin_op(self, dtype, expr): + def test_float_comparison_bin_op(self, float_numpy_dtype, expr): # GH 16363 - df = DataFrame({"x": np.array([0], dtype=dtype)}) + df = DataFrame({"x": np.array([0], dtype=float_numpy_dtype)}) res = df.eval(expr) assert res.values == np.array([False]) @@ -738,6 +737,17 @@ def test_and_logic_string_match(self): assert pd.eval(f"{event.str.match('hello').a}") assert 
pd.eval(f"{event.str.match('hello').a and event.str.match('hello').a}") + def test_eval_keep_name(self, engine, parser): + df = Series([2, 15, 28], name="a").to_frame() + res = df.eval("a + a", engine=engine, parser=parser) + expected = Series([4, 30, 56], name="a") + tm.assert_series_equal(expected, res) + + def test_eval_unmatching_names(self, engine, parser): + variable_name = Series([42], name="series_name") + res = pd.eval("variable_name + 0", engine=engine, parser=parser) + tm.assert_series_equal(variable_name, res) + # ------------------------------------- # gh-12388: Typecasting rules consistency with python @@ -747,16 +757,26 @@ class TestTypeCasting: @pytest.mark.parametrize("op", ["+", "-", "*", "**", "/"]) # maybe someday... numexpr has too many upcasting rules now # chain(*(np.core.sctypes[x] for x in ['uint', 'int', 'float'])) - @pytest.mark.parametrize("dt", [np.float32, np.float64]) @pytest.mark.parametrize("left_right", [("df", "3"), ("3", "df")]) - def test_binop_typecasting(self, engine, parser, op, dt, left_right): - df = DataFrame(np.random.default_rng(2).standard_normal((5, 3)), dtype=dt) + def test_binop_typecasting( + self, engine, parser, op, complex_or_float_dtype, left_right, request + ): + # GH#21374 + dtype = complex_or_float_dtype + df = DataFrame(np.random.default_rng(2).standard_normal((5, 3)), dtype=dtype) left, right = left_right s = f"{left} {op} {right}" res = pd.eval(s, engine=engine, parser=parser) - assert df.values.dtype == dt - assert res.values.dtype == dt - tm.assert_frame_equal(res, eval(s)) + if dtype == "complex64" and engine == "numexpr": + mark = pytest.mark.xfail( + reason="numexpr issue with complex that are upcast " + "to complex 128 " + "https://github.com/pydata/numexpr/issues/492" + ) + request.applymarker(mark) + assert df.values.dtype == dtype + assert res.values.dtype == dtype + tm.assert_frame_equal(res, eval(s), check_exact=False) # ------------------------------------- @@ -1004,15 +1024,18 @@ def test_complex_series_frame_alignment( assert res.shape == expected.shape tm.assert_frame_equal(res, expected) - def test_performance_warning_for_poor_alignment(self, engine, parser): + def test_performance_warning_for_poor_alignment( + self, performance_warning, engine, parser + ): df = DataFrame(np.random.default_rng(2).standard_normal((1000, 10))) s = Series(np.random.default_rng(2).standard_normal(10000)) - if engine == "numexpr": + if engine == "numexpr" and performance_warning: seen = PerformanceWarning else: seen = False - with tm.assert_produces_warning(seen): + msg = "Alignment difference on axis 1 is larger than an order of magnitude" + with tm.assert_produces_warning(seen, match=msg): pd.eval("df + s", engine=engine, parser=parser) s = Series(np.random.default_rng(2).standard_normal(1000)) @@ -1029,15 +1052,15 @@ def test_performance_warning_for_poor_alignment(self, engine, parser): is_python_engine = engine == "python" - if not is_python_engine: + if not is_python_engine and performance_warning: wrn = PerformanceWarning else: wrn = False - with tm.assert_produces_warning(wrn) as w: + with tm.assert_produces_warning(wrn, match=msg) as w: pd.eval("df + s", engine=engine, parser=parser) - if not is_python_engine: + if not is_python_engine and performance_warning: assert len(w) == 1 msg = str(w[0].message) logged = np.log10(s.size - df.shape[1]) @@ -1266,14 +1289,12 @@ def test_assignment_explicit(self): expected["c"] = expected["a"] + expected["b"] tm.assert_frame_equal(df, expected) - def test_column_in(self): + def 
test_column_in(self, engine): # GH 11235 df = DataFrame({"a": [11], "b": [-32]}) - result = df.eval("a in [11, -32]") - expected = Series([True]) - # TODO: 2022-01-29: Name check failed with numexpr 2.7.3 in CI - # but cannot reproduce locally - tm.assert_series_equal(result, expected, check_names=False) + result = df.eval("a in [11, -32]", engine=engine) + expected = Series([True], name="a") + tm.assert_series_equal(result, expected) @pytest.mark.xfail(reason="Unknown: Omitted test_ in name prior.") def test_assignment_not_inplace(self): @@ -1289,7 +1310,7 @@ def test_assignment_not_inplace(self): expected["c"] = expected["a"] + expected["b"] tm.assert_frame_equal(df, expected) - def test_multi_line_expression(self, warn_copy_on_write): + def test_multi_line_expression(self): # GH 11149 df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) expected = df.copy() @@ -1502,7 +1523,7 @@ def test_date_boolean(self, engine, parser): parser=parser, ) expec = df.dates1 < "20130101" - tm.assert_series_equal(res, expec, check_names=False) + tm.assert_series_equal(res, expec) def test_simple_in_ops(self, engine, parser): if parser != "python": @@ -1607,22 +1628,20 @@ def eval(self, *args, **kwargs): kwargs["level"] = kwargs.pop("level", 0) + 1 return pd.eval(*args, **kwargs) - @pytest.mark.skipif( - not NUMEXPR_INSTALLED, reason="Unary ops only implemented for numexpr" - ) + @pytest.mark.filterwarnings("ignore::RuntimeWarning") @pytest.mark.parametrize("fn", _unary_math_ops) - def test_unary_functions(self, fn): + def test_unary_functions(self, fn, engine, parser): df = DataFrame({"a": np.random.default_rng(2).standard_normal(10)}) a = df.a expr = f"{fn}(a)" - got = self.eval(expr) + got = self.eval(expr, engine=engine, parser=parser) with np.errstate(all="ignore"): expect = getattr(np, fn)(a) - tm.assert_series_equal(got, expect, check_names=False) + tm.assert_series_equal(got, expect) @pytest.mark.parametrize("fn", _binary_math_ops) - def test_binary_functions(self, fn): + def test_binary_functions(self, fn, engine, parser): df = DataFrame( { "a": np.random.default_rng(2).standard_normal(10), @@ -1633,10 +1652,10 @@ def test_binary_functions(self, fn): b = df.b expr = f"{fn}(a, b)" - got = self.eval(expr) + got = self.eval(expr, engine=engine, parser=parser) with np.errstate(all="ignore"): expect = getattr(np, fn)(a, b) - tm.assert_almost_equal(got, expect, check_names=False) + tm.assert_almost_equal(got, expect) def test_df_use_case(self, engine, parser): df = DataFrame( @@ -1652,8 +1671,8 @@ def test_df_use_case(self, engine, parser): inplace=True, ) got = df.e - expect = np.arctan2(np.sin(df.a), df.b) - tm.assert_series_equal(got, expect, check_names=False) + expect = np.arctan2(np.sin(df.a), df.b).rename("e") + tm.assert_series_equal(got, expect) def test_df_arithmetic_subexpression(self, engine, parser): df = DataFrame( @@ -1664,8 +1683,8 @@ def test_df_arithmetic_subexpression(self, engine, parser): ) df.eval("e = sin(a + b)", engine=engine, parser=parser, inplace=True) got = df.e - expect = np.sin(df.a + df.b) - tm.assert_series_equal(got, expect, check_names=False) + expect = np.sin(df.a + df.b).rename("e") + tm.assert_series_equal(got, expect) @pytest.mark.parametrize( "dtype, expect_dtype", @@ -1689,10 +1708,10 @@ def test_result_types(self, dtype, expect_dtype, engine, parser): assert df.a.dtype == dtype df.eval("b = sin(a)", engine=engine, parser=parser, inplace=True) got = df.b - expect = np.sin(df.a) + expect = np.sin(df.a).rename("b") assert expect.dtype == got.dtype assert expect_dtype 
== got.dtype - tm.assert_series_equal(got, expect, check_names=False) + tm.assert_series_equal(got, expect) def test_undefined_func(self, engine, parser): df = DataFrame({"a": np.random.default_rng(2).standard_normal(10)}) @@ -1781,7 +1800,7 @@ def test_numexpr_option_incompatible_op(): {"A": [True, False, True, False, None, None], "B": [1, 2, 3, 4, 5, 6]} ) result = df.query("A.isnull()") - expected = DataFrame({"A": [None, None], "B": [5, 6]}, index=[4, 5]) + expected = DataFrame({"A": [None, None], "B": [5, 6]}, index=range(4, 6)) tm.assert_frame_equal(result, expected) @@ -1897,10 +1916,6 @@ def test_equals_various(other): df = DataFrame({"A": ["a", "b", "c"]}, dtype=object) result = df.eval(f"A == {other}") expected = Series([False, False, False], name="A") - if USE_NUMEXPR: - # https://github.com/pandas-dev/pandas/issues/10239 - # lose name with numexpr engine. Remove when that's fixed. - expected.name = None tm.assert_series_equal(result, expected) @@ -1963,29 +1978,52 @@ def test_eval_no_support_column_name(request, column): tm.assert_frame_equal(result, expected) -def test_set_inplace(using_copy_on_write, warn_copy_on_write): +def test_set_inplace(): # https://github.com/pandas-dev/pandas/issues/47449 # Ensure we don't only update the DataFrame inplace, but also the actual # column values, such that references to this column also get updated df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) result_view = df[:] ser = df["A"] - with tm.assert_cow_warning(warn_copy_on_write): - df.eval("A = B + C", inplace=True) + df.eval("A = B + C", inplace=True) expected = DataFrame({"A": [11, 13, 15], "B": [4, 5, 6], "C": [7, 8, 9]}) tm.assert_frame_equal(df, expected) - if not using_copy_on_write: - tm.assert_series_equal(ser, expected["A"]) - tm.assert_series_equal(result_view["A"], expected["A"]) - else: - expected = Series([1, 2, 3], name="A") - tm.assert_series_equal(ser, expected) - tm.assert_series_equal(result_view["A"], expected) + expected = Series([1, 2, 3], name="A") + tm.assert_series_equal(ser, expected) + tm.assert_series_equal(result_view["A"], expected) -class TestValidate: - @pytest.mark.parametrize("value", [1, "True", [1, 2, 3], 5.0]) - def test_validate_bool_args(self, value): - msg = 'For argument "inplace" expected type bool, received type' - with pytest.raises(ValueError, match=msg): - pd.eval("2+2", inplace=value) +@pytest.mark.parametrize("value", [1, "True", [1, 2, 3], 5.0]) +def test_validate_bool_args(value): + msg = 'For argument "inplace" expected type bool, received type' + with pytest.raises(ValueError, match=msg): + pd.eval("2+2", inplace=value) + + +@td.skip_if_no("numexpr") +def test_eval_float_div_numexpr(): + # GH 59736 + result = pd.eval("1 / 2", engine="numexpr") + expected = 0.5 + assert result == expected + + +def test_method_calls_on_binop(): + # GH 61175 + x = Series([1, 2, 3, 5]) + y = Series([2, 3, 4]) + + # Method call on binary operation result + result = pd.eval("(x + y).dropna()") + expected = (x + y).dropna() + tm.assert_series_equal(result, expected) + + # Test with other binary operations + result = pd.eval("(x * y).dropna()") + expected = (x * y).dropna() + tm.assert_series_equal(result, expected) + + # Test with method chaining + result = pd.eval("(x + y).dropna().reset_index(drop=True)") + expected = (x + y).dropna().reset_index(drop=True) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/config/test_config.py b/pandas/tests/config/test_config.py index f49ae94242399..aaf6178866ecd 100644 --- 
a/pandas/tests/config/test_config.py +++ b/pandas/tests/config/test_config.py @@ -123,9 +123,11 @@ def test_case_insensitive(self): msg = r"No such keys\(s\): 'no_such_option'" with pytest.raises(OptionError, match=msg): cf.get_option("no_such_option") - cf.deprecate_option("KanBan") - assert cf._is_deprecated("kAnBaN") + cf.deprecate_option("KanBan") + msg = "'kanban' is deprecated, please refrain from using it." + with pytest.raises(FutureWarning, match=msg): + cf.get_option("kAnBaN") def test_get_option(self): cf.register_option("a", 1, "doc") @@ -225,7 +227,6 @@ def test_validation(self): validator = cf.is_one_of_factory([None, cf.is_callable]) cf.register_option("b", lambda: None, "doc", validator=validator) - # pylint: disable-next=consider-using-f-string cf.set_option("b", "%.1f".format) # Formatter is callable cf.set_option("b", None) # Formatter is none (default) with pytest.raises(ValueError, match="Value must be a callable"): @@ -268,7 +269,6 @@ def test_deprecate_option(self): # we can deprecate non-existent options cf.deprecate_option("foo") - assert cf._is_deprecated("foo") with tm.assert_produces_warning(FutureWarning, match="deprecated"): with pytest.raises(KeyError, match="No such keys.s.: 'foo'"): cf.get_option("foo") @@ -395,7 +395,7 @@ def f3(key): assert cf.get_option("a") == 500 cf.reset_option("a") - assert options.a == cf.get_option("a", 0) + assert options.a == cf.get_option("a") msg = "You can only set the value of existing options" with pytest.raises(OptionError, match=msg): diff --git a/pandas/tests/config/test_localization.py b/pandas/tests/config/test_localization.py index 3907f557d1075..b9a0a44bf8c89 100644 --- a/pandas/tests/config/test_localization.py +++ b/pandas/tests/config/test_localization.py @@ -15,7 +15,6 @@ import pandas as pd _all_locales = get_locales() -_current_locale = locale.setlocale(locale.LC_ALL) # getlocale() is wrong, see GH#46595 # Don't run any of these tests if we have no locales. pytestmark = pytest.mark.skipif(not _all_locales, reason="Need locales") @@ -93,7 +92,7 @@ def test_can_set_locale_invalid_get(monkeypatch): # but a subsequent getlocale() raises a ValueError. 
def mock_get_locale(): - raise ValueError() + raise ValueError with monkeypatch.context() as m: m.setattr(locale, "getlocale", mock_get_locale) diff --git a/pandas/tests/copy_view/index/test_datetimeindex.py b/pandas/tests/copy_view/index/test_datetimeindex.py index b023297c9549d..6194ea8b122c9 100644 --- a/pandas/tests/copy_view/index/test_datetimeindex.py +++ b/pandas/tests/copy_view/index/test_datetimeindex.py @@ -13,57 +13,44 @@ ) -@pytest.mark.parametrize( - "cons", - [ - lambda x: DatetimeIndex(x), - lambda x: DatetimeIndex(DatetimeIndex(x)), - ], -) -def test_datetimeindex(using_copy_on_write, cons): +@pytest.mark.parametrize("box", [lambda x: x, DatetimeIndex]) +def test_datetimeindex(box): dt = date_range("2019-12-31", periods=3, freq="D") ser = Series(dt) - idx = cons(ser) + idx = box(DatetimeIndex(ser)) expected = idx.copy(deep=True) ser.iloc[0] = Timestamp("2020-12-31") - if using_copy_on_write: - tm.assert_index_equal(idx, expected) + tm.assert_index_equal(idx, expected) -def test_datetimeindex_tz_convert(using_copy_on_write): +def test_datetimeindex_tz_convert(): dt = date_range("2019-12-31", periods=3, freq="D", tz="Europe/Berlin") ser = Series(dt) idx = DatetimeIndex(ser).tz_convert("US/Eastern") expected = idx.copy(deep=True) ser.iloc[0] = Timestamp("2020-12-31", tz="Europe/Berlin") - if using_copy_on_write: - tm.assert_index_equal(idx, expected) + tm.assert_index_equal(idx, expected) -def test_datetimeindex_tz_localize(using_copy_on_write): +def test_datetimeindex_tz_localize(): dt = date_range("2019-12-31", periods=3, freq="D") ser = Series(dt) idx = DatetimeIndex(ser).tz_localize("Europe/Berlin") expected = idx.copy(deep=True) ser.iloc[0] = Timestamp("2020-12-31") - if using_copy_on_write: - tm.assert_index_equal(idx, expected) + tm.assert_index_equal(idx, expected) -def test_datetimeindex_isocalendar(using_copy_on_write): +def test_datetimeindex_isocalendar(): dt = date_range("2019-12-31", periods=3, freq="D") ser = Series(dt) df = DatetimeIndex(ser).isocalendar() expected = df.index.copy(deep=True) ser.iloc[0] = Timestamp("2020-12-31") - if using_copy_on_write: - tm.assert_index_equal(df.index, expected) + tm.assert_index_equal(df.index, expected) -def test_index_values(using_copy_on_write): +def test_index_values(): idx = date_range("2019-12-31", periods=3, freq="D") result = idx.values - if using_copy_on_write: - assert result.flags.writeable is False - else: - assert result.flags.writeable is True + assert result.flags.writeable is False diff --git a/pandas/tests/copy_view/index/test_index.py b/pandas/tests/copy_view/index/test_index.py index 49d756cf32d34..e51f5658cf437 100644 --- a/pandas/tests/copy_view/index/test_index.py +++ b/pandas/tests/copy_view/index/test_index.py @@ -10,7 +10,7 @@ from pandas.tests.copy_view.util import get_array -def index_view(index_data=[1, 2]): +def index_view(index_data): df = DataFrame({"a": index_data, "b": 1.5}) view = df[:] df = df.set_index("a", drop=True) @@ -19,19 +19,15 @@ def index_view(index_data=[1, 2]): return idx, view -def test_set_index_update_column(using_copy_on_write, warn_copy_on_write): +def test_set_index_update_column(): df = DataFrame({"a": [1, 2], "b": 1}) df = df.set_index("a", drop=False) expected = df.index.copy(deep=True) - with tm.assert_cow_warning(warn_copy_on_write): - df.iloc[0, 0] = 100 - if using_copy_on_write: - tm.assert_index_equal(df.index, expected) - else: - tm.assert_index_equal(df.index, Index([100, 2], name="a")) + df.iloc[0, 0] = 100 + tm.assert_index_equal(df.index, expected) -def 
test_set_index_drop_update_column(using_copy_on_write): +def test_set_index_drop_update_column(): df = DataFrame({"a": [1, 2], "b": 1.5}) view = df[:] df = df.set_index("a", drop=True) @@ -40,60 +36,44 @@ def test_set_index_drop_update_column(using_copy_on_write): tm.assert_index_equal(df.index, expected) -def test_set_index_series(using_copy_on_write, warn_copy_on_write): +def test_set_index_series(): df = DataFrame({"a": [1, 2], "b": 1.5}) ser = Series([10, 11]) df = df.set_index(ser) expected = df.index.copy(deep=True) - with tm.assert_cow_warning(warn_copy_on_write): - ser.iloc[0] = 100 - if using_copy_on_write: - tm.assert_index_equal(df.index, expected) - else: - tm.assert_index_equal(df.index, Index([100, 11])) + ser.iloc[0] = 100 + tm.assert_index_equal(df.index, expected) -def test_assign_index_as_series(using_copy_on_write, warn_copy_on_write): +def test_assign_index_as_series(): df = DataFrame({"a": [1, 2], "b": 1.5}) ser = Series([10, 11]) df.index = ser expected = df.index.copy(deep=True) - with tm.assert_cow_warning(warn_copy_on_write): - ser.iloc[0] = 100 - if using_copy_on_write: - tm.assert_index_equal(df.index, expected) - else: - tm.assert_index_equal(df.index, Index([100, 11])) + ser.iloc[0] = 100 + tm.assert_index_equal(df.index, expected) -def test_assign_index_as_index(using_copy_on_write, warn_copy_on_write): +def test_assign_index_as_index(): df = DataFrame({"a": [1, 2], "b": 1.5}) ser = Series([10, 11]) rhs_index = Index(ser) df.index = rhs_index rhs_index = None # overwrite to clear reference expected = df.index.copy(deep=True) - with tm.assert_cow_warning(warn_copy_on_write): - ser.iloc[0] = 100 - if using_copy_on_write: - tm.assert_index_equal(df.index, expected) - else: - tm.assert_index_equal(df.index, Index([100, 11])) + ser.iloc[0] = 100 + tm.assert_index_equal(df.index, expected) -def test_index_from_series(using_copy_on_write, warn_copy_on_write): +def test_index_from_series(): ser = Series([1, 2]) idx = Index(ser) expected = idx.copy(deep=True) - with tm.assert_cow_warning(warn_copy_on_write): - ser.iloc[0] = 100 - if using_copy_on_write: - tm.assert_index_equal(idx, expected) - else: - tm.assert_index_equal(idx, Index([100, 2])) + ser.iloc[0] = 100 + tm.assert_index_equal(idx, expected) -def test_index_from_series_copy(using_copy_on_write): +def test_index_from_series_copy(): ser = Series([1, 2]) idx = Index(ser, copy=True) # noqa: F841 arr = get_array(ser) @@ -101,17 +81,13 @@ def test_index_from_series_copy(using_copy_on_write): assert np.shares_memory(get_array(ser), arr) -def test_index_from_index(using_copy_on_write, warn_copy_on_write): +def test_index_from_index(): ser = Series([1, 2]) idx = Index(ser) idx = Index(idx) expected = idx.copy(deep=True) - with tm.assert_cow_warning(warn_copy_on_write): - ser.iloc[0] = 100 - if using_copy_on_write: - tm.assert_index_equal(idx, expected) - else: - tm.assert_index_equal(idx, Index([100, 2])) + ser.iloc[0] = 100 + tm.assert_index_equal(idx, expected) @pytest.mark.parametrize( @@ -141,44 +117,36 @@ def test_index_from_index(using_copy_on_write, warn_copy_on_write): "astype", ], ) -def test_index_ops(using_copy_on_write, func, request): - idx, view_ = index_view() +def test_index_ops(func, request): + idx, view_ = index_view([1, 2]) expected = idx.copy(deep=True) if "astype" in request.node.callspec.id: expected = expected.astype("Int64") idx = func(idx) view_.iloc[0, 0] = 100 - if using_copy_on_write: - tm.assert_index_equal(idx, expected, check_names=False) + tm.assert_index_equal(idx, expected, 
check_names=False) -def test_infer_objects(using_copy_on_write): +def test_infer_objects(): idx, view_ = index_view(["a", "b"]) expected = idx.copy(deep=True) idx = idx.infer_objects(copy=False) view_.iloc[0, 0] = "aaaa" - if using_copy_on_write: - tm.assert_index_equal(idx, expected, check_names=False) + tm.assert_index_equal(idx, expected, check_names=False) -def test_index_to_frame(using_copy_on_write): +def test_index_to_frame(): idx = Index([1, 2, 3], name="a") expected = idx.copy(deep=True) df = idx.to_frame() - if using_copy_on_write: - assert np.shares_memory(get_array(df, "a"), idx._values) - assert not df._mgr._has_no_reference(0) - else: - assert not np.shares_memory(get_array(df, "a"), idx._values) + assert np.shares_memory(get_array(df, "a"), idx._values) + assert not df._mgr._has_no_reference(0) df.iloc[0, 0] = 100 tm.assert_index_equal(idx, expected) -def test_index_values(using_copy_on_write): +def test_index_values(): idx = Index([1, 2, 3]) result = idx.values - if using_copy_on_write: - assert result.flags.writeable is False - else: - assert result.flags.writeable is True + assert result.flags.writeable is False diff --git a/pandas/tests/copy_view/index/test_periodindex.py b/pandas/tests/copy_view/index/test_periodindex.py index b80ce1d3d838f..2887b191038d2 100644 --- a/pandas/tests/copy_view/index/test_periodindex.py +++ b/pandas/tests/copy_view/index/test_periodindex.py @@ -13,18 +13,11 @@ ) -@pytest.mark.parametrize( - "cons", - [ - lambda x: PeriodIndex(x), - lambda x: PeriodIndex(PeriodIndex(x)), - ], -) -def test_periodindex(using_copy_on_write, cons): +@pytest.mark.parametrize("box", [lambda x: x, PeriodIndex]) +def test_periodindex(box): dt = period_range("2019-12-31", periods=3, freq="D") ser = Series(dt) - idx = cons(ser) + idx = box(PeriodIndex(ser)) expected = idx.copy(deep=True) ser.iloc[0] = Period("2020-12-31") - if using_copy_on_write: - tm.assert_index_equal(idx, expected) + tm.assert_index_equal(idx, expected) diff --git a/pandas/tests/copy_view/index/test_timedeltaindex.py b/pandas/tests/copy_view/index/test_timedeltaindex.py index 5b9832093fded..6984df86b00e3 100644 --- a/pandas/tests/copy_view/index/test_timedeltaindex.py +++ b/pandas/tests/copy_view/index/test_timedeltaindex.py @@ -20,11 +20,10 @@ lambda x: TimedeltaIndex(TimedeltaIndex(x)), ], ) -def test_timedeltaindex(using_copy_on_write, cons): +def test_timedeltaindex(cons): dt = timedelta_range("1 day", periods=3) ser = Series(dt) idx = cons(ser) expected = idx.copy(deep=True) ser.iloc[0] = Timedelta("5 days") - if using_copy_on_write: - tm.assert_index_equal(idx, expected) + tm.assert_index_equal(idx, expected) diff --git a/pandas/tests/copy_view/test_array.py b/pandas/tests/copy_view/test_array.py index 9a3f83e0293f5..2b3ef9201d918 100644 --- a/pandas/tests/copy_view/test_array.py +++ b/pandas/tests/copy_view/test_array.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.compat.numpy import np_version_gt2 + from pandas import ( DataFrame, Series, @@ -15,92 +17,82 @@ @pytest.mark.parametrize( "method", - [lambda ser: ser.values, lambda ser: np.asarray(ser)], - ids=["values", "asarray"], + [ + lambda ser: ser.values, + lambda ser: np.asarray(ser), + lambda ser: np.array(ser, copy=False), + ], + ids=["values", "asarray", "array"], ) -def test_series_values(using_copy_on_write, method): +def test_series_values(method): ser = Series([1, 2, 3], name="name") ser_orig = ser.copy() arr = method(ser) - if using_copy_on_write: - # .values still gives a view but is read-only - assert 
np.shares_memory(arr, get_array(ser, "name")) - assert arr.flags.writeable is False - - # mutating series through arr therefore doesn't work - with pytest.raises(ValueError, match="read-only"): - arr[0] = 0 - tm.assert_series_equal(ser, ser_orig) - - # mutating the series itself still works - ser.iloc[0] = 0 - assert ser.values[0] == 0 - else: - assert arr.flags.writeable is True + # .values still gives a view but is read-only + assert np.shares_memory(arr, get_array(ser, "name")) + assert arr.flags.writeable is False + + # mutating series through arr therefore doesn't work + with pytest.raises(ValueError, match="read-only"): arr[0] = 0 - assert ser.iloc[0] == 0 + tm.assert_series_equal(ser, ser_orig) + + # mutating the series itself still works + ser.iloc[0] = 0 + assert ser.values[0] == 0 @pytest.mark.parametrize( "method", - [lambda df: df.values, lambda df: np.asarray(df)], - ids=["values", "asarray"], + [ + lambda df: df.values, + lambda df: np.asarray(df), + lambda ser: np.array(ser, copy=False), + ], + ids=["values", "asarray", "array"], ) -def test_dataframe_values(using_copy_on_write, using_array_manager, method): +def test_dataframe_values(method): df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) df_orig = df.copy() arr = method(df) - if using_copy_on_write: - # .values still gives a view but is read-only - assert np.shares_memory(arr, get_array(df, "a")) - assert arr.flags.writeable is False - - # mutating series through arr therefore doesn't work - with pytest.raises(ValueError, match="read-only"): - arr[0, 0] = 0 - tm.assert_frame_equal(df, df_orig) - - # mutating the series itself still works - df.iloc[0, 0] = 0 - assert df.values[0, 0] == 0 - else: - assert arr.flags.writeable is True + # .values still gives a view but is read-only + assert np.shares_memory(arr, get_array(df, "a")) + assert arr.flags.writeable is False + + # mutating series through arr therefore doesn't work + with pytest.raises(ValueError, match="read-only"): arr[0, 0] = 0 - if not using_array_manager: - assert df.iloc[0, 0] == 0 - else: - tm.assert_frame_equal(df, df_orig) + tm.assert_frame_equal(df, df_orig) + # mutating the series itself still works + df.iloc[0, 0] = 0 + assert df.values[0, 0] == 0 -def test_series_to_numpy(using_copy_on_write): + +def test_series_to_numpy(): ser = Series([1, 2, 3], name="name") ser_orig = ser.copy() # default: copy=False, no dtype or NAs arr = ser.to_numpy() - if using_copy_on_write: - # to_numpy still gives a view but is read-only - assert np.shares_memory(arr, get_array(ser, "name")) - assert arr.flags.writeable is False - - # mutating series through arr therefore doesn't work - with pytest.raises(ValueError, match="read-only"): - arr[0] = 0 - tm.assert_series_equal(ser, ser_orig) - - # mutating the series itself still works - ser.iloc[0] = 0 - assert ser.values[0] == 0 - else: - assert arr.flags.writeable is True + # to_numpy still gives a view but is read-only + assert np.shares_memory(arr, get_array(ser, "name")) + assert arr.flags.writeable is False + + # mutating series through arr therefore doesn't work + with pytest.raises(ValueError, match="read-only"): arr[0] = 0 - assert ser.iloc[0] == 0 + tm.assert_series_equal(ser, ser_orig) + + # mutating the series itself still works + ser.iloc[0] = 0 + assert ser.values[0] == 0 - # specify copy=False gives a writeable array + # specify copy=True gives a writeable array ser = Series([1, 2, 3], name="name") arr = ser.to_numpy(copy=True) assert not np.shares_memory(arr, get_array(ser, "name")) @@ -113,59 +105,33 @@ def 
test_series_to_numpy(using_copy_on_write): assert arr.flags.writeable is True -@pytest.mark.parametrize("order", ["F", "C"]) -def test_ravel_read_only(using_copy_on_write, order): - ser = Series([1, 2, 3]) - with tm.assert_produces_warning(FutureWarning, match="is deprecated"): - arr = ser.ravel(order=order) - if using_copy_on_write: - assert arr.flags.writeable is False - assert np.shares_memory(get_array(ser), arr) - - -def test_series_array_ea_dtypes(using_copy_on_write): +def test_series_array_ea_dtypes(): ser = Series([1, 2, 3], dtype="Int64") arr = np.asarray(ser, dtype="int64") assert np.shares_memory(arr, get_array(ser)) - if using_copy_on_write: - assert arr.flags.writeable is False - else: - assert arr.flags.writeable is True + assert arr.flags.writeable is False arr = np.asarray(ser) assert np.shares_memory(arr, get_array(ser)) - if using_copy_on_write: - assert arr.flags.writeable is False - else: - assert arr.flags.writeable is True + assert arr.flags.writeable is False -def test_dataframe_array_ea_dtypes(using_copy_on_write): +def test_dataframe_array_ea_dtypes(): df = DataFrame({"a": [1, 2, 3]}, dtype="Int64") arr = np.asarray(df, dtype="int64") assert np.shares_memory(arr, get_array(df, "a")) - if using_copy_on_write: - assert arr.flags.writeable is False - else: - assert arr.flags.writeable is True + assert arr.flags.writeable is False arr = np.asarray(df) assert np.shares_memory(arr, get_array(df, "a")) - if using_copy_on_write: - assert arr.flags.writeable is False - else: - assert arr.flags.writeable is True + assert arr.flags.writeable is False -def test_dataframe_array_string_dtype(using_copy_on_write, using_array_manager): +def test_dataframe_array_string_dtype(): df = DataFrame({"a": ["a", "b"]}, dtype="string") arr = np.asarray(df) - if not using_array_manager: - assert np.shares_memory(arr, get_array(df, "a")) - if using_copy_on_write: - assert arr.flags.writeable is False - else: - assert arr.flags.writeable is True + assert np.shares_memory(arr, get_array(df, "a")) + assert arr.flags.writeable is False def test_dataframe_multiple_numpy_dtypes(): @@ -174,14 +140,28 @@ def test_dataframe_multiple_numpy_dtypes(): assert not np.shares_memory(arr, get_array(df, "a")) assert arr.flags.writeable is True + if np_version_gt2: + # copy=False semantics are only supported in NumPy>=2. 
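# --- editor's illustration (not part of the patch) --------------------------
# NumPy 2.0 redefined copy=False in np.array(): it now means "never copy" and
# raises ValueError when a copy cannot be avoided, where NumPy 1.x would copy
# silently; np.asarray() keeps the "copy only if needed" behavior. A minimal
# sketch of the semantics this branch exercises (assumes numpy>=2 installed;
# on older NumPy the ValueError is not raised):

import numpy as np
import pandas as pd

mixed = pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]})  # two dtype blocks
np.asarray(mixed)  # fine: performs the unavoidable copy
try:
    np.array(mixed, copy=False)  # NumPy >= 2 refuses to copy here
except ValueError as err:
    assert "Unable to avoid copy" in str(err)
# -----------------------------------------------------------------------------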
+ + with pytest.raises(ValueError, match="Unable to avoid copy while creating"): + arr = np.array(df, copy=False) + + arr = np.array(df, copy=True) + assert arr.flags.writeable is True + + +def test_dataframe_single_block_copy_true(): + # the copy=False/None cases are tested above in test_dataframe_values + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + arr = np.array(df, copy=True) + assert not np.shares_memory(arr, get_array(df, "a")) + assert arr.flags.writeable is True + -def test_values_is_ea(using_copy_on_write): +def test_values_is_ea(): df = DataFrame({"a": date_range("2012-01-01", periods=3)}) arr = np.asarray(df) - if using_copy_on_write: - assert arr.flags.writeable is False - else: - assert arr.flags.writeable is True + assert arr.flags.writeable is False def test_empty_dataframe(): diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index d462ce3d3187d..91f5badeb9728 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -3,10 +3,9 @@ import numpy as np import pytest +from pandas.compat import HAS_PYARROW from pandas.compat.pyarrow import pa_version_under12p0 -import pandas.util._test_decorators as td -import pandas as pd from pandas import ( DataFrame, Series, @@ -17,22 +16,17 @@ from pandas.tests.copy_view.util import get_array -def test_astype_single_dtype(using_copy_on_write): +def test_astype_single_dtype(): df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": 1.5}) df_orig = df.copy() df2 = df.astype("float64") - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - else: - assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c")) - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) # mutating df2 triggers a copy-on-write for that column/block df2.iloc[0, 2] = 5.5 - if using_copy_on_write: - assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c")) tm.assert_frame_equal(df, df_orig) # mutating parent also doesn't update result @@ -43,22 +37,17 @@ def test_astype_single_dtype(using_copy_on_write): @pytest.mark.parametrize("dtype", ["int64", "Int64"]) @pytest.mark.parametrize("new_dtype", ["int64", "Int64", "int64[pyarrow]"]) -def test_astype_avoids_copy(using_copy_on_write, dtype, new_dtype): +def test_astype_avoids_copy(dtype, new_dtype): if new_dtype == "int64[pyarrow]": pytest.importorskip("pyarrow") df = DataFrame({"a": [1, 2, 3]}, dtype=dtype) df_orig = df.copy() df2 = df.astype(new_dtype) - - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - else: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) # mutating df2 triggers a copy-on-write for that column/block df2.iloc[0, 0] = 10 - if using_copy_on_write: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) tm.assert_frame_equal(df, df_orig) # mutating parent also doesn't update result @@ -68,7 +57,7 @@ def test_astype_avoids_copy(using_copy_on_write, dtype, new_dtype): @pytest.mark.parametrize("dtype", ["float64", "int32", "Int32", "int32[pyarrow]"]) -def 
test_astype_different_target_dtype(using_copy_on_write, dtype): +def test_astype_different_target_dtype(dtype): if dtype == "int32[pyarrow]": pytest.importorskip("pyarrow") df = DataFrame({"a": [1, 2, 3]}) @@ -76,8 +65,7 @@ def test_astype_different_target_dtype(using_copy_on_write, dtype): df2 = df.astype(dtype) assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - if using_copy_on_write: - assert df2._mgr._has_no_reference(0) + assert df2._mgr._has_no_reference(0) df2.iloc[0, 0] = 5 tm.assert_frame_equal(df, df_orig) @@ -88,26 +76,20 @@ def test_astype_different_target_dtype(using_copy_on_write, dtype): tm.assert_frame_equal(df2, df_orig.astype(dtype)) -@td.skip_array_manager_invalid_test def test_astype_numpy_to_ea(): ser = Series([1, 2, 3]) - with pd.option_context("mode.copy_on_write", True): - result = ser.astype("Int64") + result = ser.astype("Int64") assert np.shares_memory(get_array(ser), get_array(result)) @pytest.mark.parametrize( "dtype, new_dtype", [("object", "string"), ("string", "object")] ) -def test_astype_string_and_object(using_copy_on_write, dtype, new_dtype): +def test_astype_string_and_object(dtype, new_dtype): df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype) df_orig = df.copy() df2 = df.astype(new_dtype) - - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - else: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) df2.iloc[0, 0] = "x" tm.assert_frame_equal(df, df_orig) @@ -116,23 +98,18 @@ def test_astype_string_and_object(using_copy_on_write, dtype, new_dtype): @pytest.mark.parametrize( "dtype, new_dtype", [("object", "string"), ("string", "object")] ) -def test_astype_string_and_object_update_original( - using_copy_on_write, dtype, new_dtype -): +def test_astype_string_and_object_update_original(dtype, new_dtype): df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype) df2 = df.astype(new_dtype) df_orig = df2.copy() - - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - else: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) df.iloc[0, 0] = "x" tm.assert_frame_equal(df2, df_orig) -def test_astype_string_copy_on_pickle_roundrip(): +def test_astype_str_copy_on_pickle_roundrip(): + # TODO(infer_string) this test can be removed after 3.0 (once str is the default) # https://github.com/pandas-dev/pandas/issues/54654 # ensure_string_array may alter array inplace base = Series(np.array([(1, 2), None, 1], dtype="object")) @@ -141,63 +118,72 @@ def test_astype_string_copy_on_pickle_roundrip(): tm.assert_series_equal(base, base_copy) -def test_astype_dict_dtypes(using_copy_on_write): +def test_astype_string_copy_on_pickle_roundrip(any_string_dtype): + # https://github.com/pandas-dev/pandas/issues/54654 + # ensure_string_array may alter array inplace + base = Series(np.array([(1, 2), None, 1], dtype="object")) + base_copy = pickle.loads(pickle.dumps(base)) + base_copy.astype(any_string_dtype) + tm.assert_series_equal(base, base_copy) + + +def test_astype_string_read_only_on_pickle_roundrip(any_string_dtype): + # https://github.com/pandas-dev/pandas/issues/54654 + # ensure_string_array may alter read-only array inplace + base = Series(np.array([(1, 2), None, 1], dtype="object")) + base_copy = pickle.loads(pickle.dumps(base)) + base_copy._values.flags.writeable = False + base_copy.astype(any_string_dtype) + 
tm.assert_series_equal(base, base_copy) + + +def test_astype_dict_dtypes(): df = DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6], "c": Series([1.5, 1.5, 1.5], dtype="float64")} ) df_orig = df.copy() df2 = df.astype({"a": "float64", "c": "float64"}) - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) - assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - else: - assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c")) - assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) # mutating df2 triggers a copy-on-write for that column/block df2.iloc[0, 2] = 5.5 - if using_copy_on_write: - assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c")) df2.iloc[0, 1] = 10 - if using_copy_on_write: - assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) tm.assert_frame_equal(df, df_orig) -def test_astype_different_datetime_resos(using_copy_on_write): +def test_astype_different_datetime_resos(): df = DataFrame({"a": date_range("2019-12-31", periods=2, freq="D")}) result = df.astype("datetime64[ms]") assert not np.shares_memory(get_array(df, "a"), get_array(result, "a")) - if using_copy_on_write: - assert result._mgr._has_no_reference(0) + assert result._mgr._has_no_reference(0) -def test_astype_different_timezones(using_copy_on_write): +def test_astype_different_timezones(): df = DataFrame( {"a": date_range("2019-12-31", periods=5, freq="D", tz="US/Pacific")} ) result = df.astype("datetime64[ns, Europe/Berlin]") - if using_copy_on_write: - assert not result._mgr._has_no_reference(0) - assert np.shares_memory(get_array(df, "a"), get_array(result, "a")) + assert not result._mgr._has_no_reference(0) + assert np.shares_memory(get_array(df, "a"), get_array(result, "a")) -def test_astype_different_timezones_different_reso(using_copy_on_write): +def test_astype_different_timezones_different_reso(): df = DataFrame( {"a": date_range("2019-12-31", periods=5, freq="D", tz="US/Pacific")} ) result = df.astype("datetime64[ms, Europe/Berlin]") - if using_copy_on_write: - assert result._mgr._has_no_reference(0) - assert not np.shares_memory(get_array(df, "a"), get_array(result, "a")) + assert result._mgr._has_no_reference(0) + assert not np.shares_memory(get_array(df, "a"), get_array(result, "a")) -def test_astype_arrow_timestamp(using_copy_on_write): +def test_astype_arrow_timestamp(): pytest.importorskip("pyarrow") df = DataFrame( { @@ -209,19 +195,16 @@ def test_astype_arrow_timestamp(using_copy_on_write): dtype="M8[ns]", ) result = df.astype("timestamp[ns][pyarrow]") - if using_copy_on_write: - assert not result._mgr._has_no_reference(0) - if pa_version_under12p0: - assert not np.shares_memory( - get_array(df, "a"), get_array(result, "a")._pa_array - ) - else: - assert np.shares_memory( - get_array(df, "a"), get_array(result, "a")._pa_array - ) - - -def test_convert_dtypes_infer_objects(using_copy_on_write): + assert not result._mgr._has_no_reference(0) + if pa_version_under12p0: + assert not np.shares_memory( + get_array(df, "a"), 
get_array(result, "a")._pa_array + ) + else: + assert np.shares_memory(get_array(df, "a"), get_array(result, "a")._pa_array) + + +def test_convert_dtypes_infer_objects(): ser = Series(["a", "b", "c"]) ser_orig = ser.copy() result = ser.convert_dtypes( @@ -231,30 +214,25 @@ def test_convert_dtypes_infer_objects(using_copy_on_write): convert_string=False, ) - if using_copy_on_write: - assert np.shares_memory(get_array(ser), get_array(result)) - else: - assert not np.shares_memory(get_array(ser), get_array(result)) - + assert tm.shares_memory(get_array(ser), get_array(result)) result.iloc[0] = "x" tm.assert_series_equal(ser, ser_orig) -def test_convert_dtypes(using_copy_on_write): +def test_convert_dtypes(using_infer_string): df = DataFrame({"a": ["a", "b"], "b": [1, 2], "c": [1.5, 2.5], "d": [True, False]}) df_orig = df.copy() df2 = df.convert_dtypes() - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - assert np.shares_memory(get_array(df2, "d"), get_array(df, "d")) - assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) - assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + if using_infer_string and HAS_PYARROW: + # TODO the default nullable string dtype still uses python storage + # this should be changed to pyarrow if installed + assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a")) else: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) - assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c")) - assert not np.shares_memory(get_array(df2, "d"), get_array(df, "d")) - + assert tm.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert tm.shares_memory(get_array(df2, "d"), get_array(df, "d")) + assert tm.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert tm.shares_memory(get_array(df2, "c"), get_array(df, "c")) df2.iloc[0, 0] = "x" + df2.iloc[0, 1] = 10 tm.assert_frame_equal(df, df_orig) diff --git a/pandas/tests/copy_view/test_chained_assignment_deprecation.py b/pandas/tests/copy_view/test_chained_assignment_deprecation.py index 80e38380ed27c..4aef69a6fde98 100644 --- a/pandas/tests/copy_view/test_chained_assignment_deprecation.py +++ b/pandas/tests/copy_view/test_chained_assignment_deprecation.py @@ -1,111 +1,33 @@ import numpy as np import pytest -from pandas.errors import ( - ChainedAssignmentError, - SettingWithCopyWarning, -) +from pandas.errors import ChainedAssignmentError -from pandas import ( - DataFrame, - option_context, -) +from pandas import DataFrame import pandas._testing as tm -def test_methods_iloc_warn(using_copy_on_write): - if not using_copy_on_write: - df = DataFrame({"a": [1, 2, 3], "b": 1}) - with tm.assert_cow_warning(match="A value"): - df.iloc[:, 0].replace(1, 5, inplace=True) - - with tm.assert_cow_warning(match="A value"): - df.iloc[:, 0].fillna(1, inplace=True) - - with tm.assert_cow_warning(match="A value"): - df.iloc[:, 0].interpolate(inplace=True) - - with tm.assert_cow_warning(match="A value"): - df.iloc[:, 0].ffill(inplace=True) - - with tm.assert_cow_warning(match="A value"): - df.iloc[:, 0].bfill(inplace=True) - - -@pytest.mark.parametrize( - "func, args", - [ - ("replace", (4, 5)), - ("fillna", (1,)), - ("interpolate", ()), - ("bfill", ()), - ("ffill", ()), - ], -) -def test_methods_iloc_getitem_item_cache(func, args, using_copy_on_write): - # ensure we don't incorrectly raise chained assignment warning because - # of the item cache / iloc not 
setting the item cache - df_orig = DataFrame({"a": [1, 2, 3], "b": 1}) - - df = df_orig.copy() - ser = df.iloc[:, 0] - getattr(ser, func)(*args, inplace=True) - - # parent that holds item_cache is dead, so don't increase ref count - df = df_orig.copy() - ser = df.copy()["a"] - getattr(ser, func)(*args, inplace=True) - - df = df_orig.copy() - df["a"] # populate the item_cache - ser = df.iloc[:, 0] # iloc creates a new object - getattr(ser, func)(*args, inplace=True) - - df = df_orig.copy() - df["a"] # populate the item_cache - ser = df["a"] - getattr(ser, func)(*args, inplace=True) - - df = df_orig.copy() - df["a"] # populate the item_cache - if using_copy_on_write: - with tm.raises_chained_assignment_error(): - df["a"].fillna(0, inplace=True) - else: - with tm.assert_cow_warning(match="A value"): - df["a"].fillna(0, inplace=True) - - -# TODO(CoW-warn) expand the cases @pytest.mark.parametrize( "indexer", [0, [0, 1], slice(0, 2), np.array([True, False, True])] ) -def test_series_setitem(indexer, using_copy_on_write, warn_copy_on_write): +def test_series_setitem(indexer): # ensure we only get a single warning for those typical cases of chained # assignment df = DataFrame({"a": [1, 2, 3], "b": 1}) # using custom check instead of tm.assert_produces_warning because that doesn't # fail if multiple warnings are raised - with pytest.warns() as record: + with pytest.warns() as record: # noqa: TID251 df["a"][indexer] = 0 assert len(record) == 1 - if using_copy_on_write: - assert record[0].category == ChainedAssignmentError - else: - assert record[0].category == FutureWarning - assert "ChainedAssignmentError" in record[0].message.args[0] + assert record[0].category == ChainedAssignmentError -@pytest.mark.filterwarnings("ignore::pandas.errors.SettingWithCopyWarning") @pytest.mark.parametrize( "indexer", ["a", ["a", "b"], slice(0, 2), np.array([True, False, True])] ) -def test_frame_setitem(indexer, using_copy_on_write): +def test_frame_setitem(indexer): df = DataFrame({"a": [1, 2, 3, 4, 5], "b": 1}) - extra_warnings = () if using_copy_on_write else (SettingWithCopyWarning,) - - with option_context("chained_assignment", "warn"): - with tm.raises_chained_assignment_error(extra_warnings=extra_warnings): - df[0:3][indexer] = 10 + with tm.raises_chained_assignment_error(): + df[0:3][indexer] = 10 diff --git a/pandas/tests/copy_view/test_clip.py b/pandas/tests/copy_view/test_clip.py index 7c87646424e2f..56df33db6d416 100644 --- a/pandas/tests/copy_view/test_clip.py +++ b/pandas/tests/copy_view/test_clip.py @@ -1,34 +1,24 @@ import numpy as np -from pandas import ( - DataFrame, - option_context, -) +from pandas import DataFrame import pandas._testing as tm from pandas.tests.copy_view.util import get_array -def test_clip_inplace_reference(using_copy_on_write, warn_copy_on_write): +def test_clip_inplace_reference(): df = DataFrame({"a": [1.5, 2, 3]}) df_copy = df.copy() arr_a = get_array(df, "a") view = df[:] - if warn_copy_on_write: - with tm.assert_cow_warning(): - df.clip(lower=2, inplace=True) - else: - df.clip(lower=2, inplace=True) - - if using_copy_on_write: - assert not np.shares_memory(get_array(df, "a"), arr_a) - assert df._mgr._has_no_reference(0) - assert view._mgr._has_no_reference(0) - tm.assert_frame_equal(df_copy, view) - else: - assert np.shares_memory(get_array(df, "a"), arr_a) - - -def test_clip_inplace_reference_no_op(using_copy_on_write): + df.clip(lower=2, inplace=True) + + assert not np.shares_memory(get_array(df, "a"), arr_a) + assert df._mgr._has_no_reference(0) + assert 
view._mgr._has_no_reference(0) + tm.assert_frame_equal(df_copy, view) + + +def test_clip_inplace_reference_no_op(): df = DataFrame({"a": [1.5, 2, 3]}) df_copy = df.copy() arr_a = get_array(df, "a") @@ -37,65 +27,46 @@ def test_clip_inplace_reference_no_op(using_copy_on_write): assert np.shares_memory(get_array(df, "a"), arr_a) - if using_copy_on_write: - assert not df._mgr._has_no_reference(0) - assert not view._mgr._has_no_reference(0) - tm.assert_frame_equal(df_copy, view) + assert not df._mgr._has_no_reference(0) + assert not view._mgr._has_no_reference(0) + tm.assert_frame_equal(df_copy, view) -def test_clip_inplace(using_copy_on_write): +def test_clip_inplace(): df = DataFrame({"a": [1.5, 2, 3]}) arr_a = get_array(df, "a") df.clip(lower=2, inplace=True) assert np.shares_memory(get_array(df, "a"), arr_a) - - if using_copy_on_write: - assert df._mgr._has_no_reference(0) + assert df._mgr._has_no_reference(0) -def test_clip(using_copy_on_write): +def test_clip(): df = DataFrame({"a": [1.5, 2, 3]}) df_orig = df.copy() df2 = df.clip(lower=2) assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - if using_copy_on_write: - assert df._mgr._has_no_reference(0) + assert df._mgr._has_no_reference(0) tm.assert_frame_equal(df_orig, df) -def test_clip_no_op(using_copy_on_write): +def test_clip_no_op(): df = DataFrame({"a": [1.5, 2, 3]}) df2 = df.clip(lower=0) - if using_copy_on_write: - assert not df._mgr._has_no_reference(0) - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - else: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert not df._mgr._has_no_reference(0) + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) -def test_clip_chained_inplace(using_copy_on_write): +def test_clip_chained_inplace(): df = DataFrame({"a": [1, 4, 2], "b": 1}) df_orig = df.copy() - if using_copy_on_write: - with tm.raises_chained_assignment_error(): - df["a"].clip(1, 2, inplace=True) - tm.assert_frame_equal(df, df_orig) - - with tm.raises_chained_assignment_error(): - df[["a"]].clip(1, 2, inplace=True) - tm.assert_frame_equal(df, df_orig) - else: - with tm.assert_produces_warning(FutureWarning, match="inplace method"): - df["a"].clip(1, 2, inplace=True) - - with tm.assert_produces_warning(None): - with option_context("mode.chained_assignment", None): - df[["a"]].clip(1, 2, inplace=True) - - with tm.assert_produces_warning(None): - with option_context("mode.chained_assignment", None): - df[df["a"] > 1].clip(1, 2, inplace=True) + with tm.raises_chained_assignment_error(): + df["a"].clip(1, 2, inplace=True) + tm.assert_frame_equal(df, df_orig) + + with tm.raises_chained_assignment_error(): + df[["a"]].clip(1, 2, inplace=True) + tm.assert_frame_equal(df, df_orig) diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py index 1aa458a625028..4e4df4ba3cf22 100644 --- a/pandas/tests/copy_view/test_constructors.py +++ b/pandas/tests/copy_view/test_constructors.py @@ -21,7 +21,7 @@ @pytest.mark.parametrize("dtype", [None, "int64"]) -def test_series_from_series(dtype, using_copy_on_write, warn_copy_on_write): +def test_series_from_series(dtype): # Case: constructing a Series from another Series object follows CoW rules: # a new object is returned and thus mutations are not propagated ser = Series([1, 2, 3], name="name") @@ -32,38 +32,23 @@ def test_series_from_series(dtype, using_copy_on_write, warn_copy_on_write): # the shallow copy still shares memory assert np.shares_memory(get_array(ser), 
get_array(result)) - if using_copy_on_write: - assert result._mgr.blocks[0].refs.has_reference() + assert result._mgr.blocks[0].refs.has_reference() - if using_copy_on_write: - # mutating new series copy doesn't mutate original - result.iloc[0] = 0 - assert ser.iloc[0] == 1 - # mutating triggered a copy-on-write -> no longer shares memory - assert not np.shares_memory(get_array(ser), get_array(result)) - else: - # mutating shallow copy does mutate original - with tm.assert_cow_warning(warn_copy_on_write): - result.iloc[0] = 0 - assert ser.iloc[0] == 0 - # and still shares memory - assert np.shares_memory(get_array(ser), get_array(result)) + # mutating new series copy doesn't mutate original + result.iloc[0] = 0 + assert ser.iloc[0] == 1 + # mutating triggered a copy-on-write -> no longer shares memory + assert not np.shares_memory(get_array(ser), get_array(result)) # the same when modifying the parent result = Series(ser, dtype=dtype) - if using_copy_on_write: - # mutating original doesn't mutate new series - ser.iloc[0] = 0 - assert result.iloc[0] == 1 - else: - # mutating original does mutate shallow copy - with tm.assert_cow_warning(warn_copy_on_write): - ser.iloc[0] = 0 - assert result.iloc[0] == 0 + # mutating original doesn't mutate new series + ser.iloc[0] = 0 + assert result.iloc[0] == 1 -def test_series_from_series_with_reindex(using_copy_on_write, warn_copy_on_write): +def test_series_from_series_with_reindex(): # Case: constructing a Series from another Series with specifying an index # that potentially requires a reindex of the values ser = Series([1, 2, 3], name="name") @@ -78,50 +63,33 @@ def test_series_from_series_with_reindex(using_copy_on_write, warn_copy_on_write ]: result = Series(ser, index=index) assert np.shares_memory(ser.values, result.values) - with tm.assert_cow_warning(warn_copy_on_write): - result.iloc[0] = 0 - if using_copy_on_write: - assert ser.iloc[0] == 1 - else: - assert ser.iloc[0] == 0 + result.iloc[0] = 0 + assert ser.iloc[0] == 1 # ensure that if an actual reindex is needed, we don't have any refs # (mutating the result wouldn't trigger CoW) result = Series(ser, index=[0, 1, 2, 3]) assert not np.shares_memory(ser.values, result.values) - if using_copy_on_write: - assert not result._mgr.blocks[0].refs.has_reference() + assert not result._mgr.blocks[0].refs.has_reference() -@pytest.mark.parametrize("fastpath", [False, True]) @pytest.mark.parametrize("dtype", [None, "int64"]) @pytest.mark.parametrize("idx", [None, pd.RangeIndex(start=0, stop=3, step=1)]) @pytest.mark.parametrize( "arr", [np.array([1, 2, 3], dtype="int64"), pd.array([1, 2, 3], dtype="Int64")] ) -def test_series_from_array(using_copy_on_write, idx, dtype, fastpath, arr): - if idx is None or dtype is not None: - fastpath = False - msg = "The 'fastpath' keyword in pd.Series is deprecated" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - ser = Series(arr, dtype=dtype, index=idx, fastpath=fastpath) +def test_series_from_array(idx, dtype, arr): + ser = Series(arr, dtype=dtype, index=idx) ser_orig = ser.copy() data = getattr(arr, "_data", arr) - if using_copy_on_write: - assert not np.shares_memory(get_array(ser), data) - else: - assert np.shares_memory(get_array(ser), data) + assert not np.shares_memory(get_array(ser), data) arr[0] = 100 - if using_copy_on_write: - tm.assert_series_equal(ser, ser_orig) - else: - expected = Series([100, 2, 3], dtype=dtype if dtype is not None else arr.dtype) - tm.assert_series_equal(ser, expected) + tm.assert_series_equal(ser, ser_orig) 
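# --- editor's illustration (not part of the patch) --------------------------
# The constructor tests above pin down the Copy-on-Write default behavior:
# building a Series from a NumPy array no longer keeps a writable reference to
# it, so later writes to the array do not leak into the Series. A standalone
# sketch, assuming a pandas version with CoW enabled by default (3.0+):

import numpy as np
import pandas as pd

arr = np.array([1, 2, 3], dtype="int64")
ser = pd.Series(arr)
arr[0] = 100                 # mutate the source array after construction
assert ser.iloc[0] == 1      # the Series kept its own copy of the data
assert not np.shares_memory(ser.to_numpy(), arr)
# -----------------------------------------------------------------------------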
@pytest.mark.parametrize("copy", [True, False, None]) -def test_series_from_array_different_dtype(using_copy_on_write, copy): +def test_series_from_array_different_dtype(copy): arr = np.array([1, 2, 3], dtype="int64") ser = Series(arr, dtype="int32", copy=copy) assert not np.shares_memory(get_array(ser), arr) @@ -136,63 +104,34 @@ def test_series_from_array_different_dtype(using_copy_on_write, copy): TimedeltaIndex([Timedelta("1 days"), Timedelta("2 days")]), ], ) -def test_series_from_index(using_copy_on_write, idx): +def test_series_from_index(idx): ser = Series(idx) expected = idx.copy(deep=True) - if using_copy_on_write: - assert np.shares_memory(get_array(ser), get_array(idx)) - assert not ser._mgr._has_no_reference(0) - else: - assert not np.shares_memory(get_array(ser), get_array(idx)) + assert np.shares_memory(get_array(ser), get_array(idx)) + assert not ser._mgr._has_no_reference(0) ser.iloc[0] = ser.iloc[1] tm.assert_index_equal(idx, expected) -def test_series_from_index_different_dtypes(using_copy_on_write): +def test_series_from_index_different_dtypes(): idx = Index([1, 2, 3], dtype="int64") ser = Series(idx, dtype="int32") assert not np.shares_memory(get_array(ser), get_array(idx)) - if using_copy_on_write: - assert ser._mgr._has_no_reference(0) - - -@pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") -@pytest.mark.parametrize("fastpath", [False, True]) -@pytest.mark.parametrize("dtype", [None, "int64"]) -@pytest.mark.parametrize("idx", [None, pd.RangeIndex(start=0, stop=3, step=1)]) -def test_series_from_block_manager(using_copy_on_write, idx, dtype, fastpath): - ser = Series([1, 2, 3], dtype="int64") - ser_orig = ser.copy() - msg = "The 'fastpath' keyword in pd.Series is deprecated" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - ser2 = Series(ser._mgr, dtype=dtype, fastpath=fastpath, index=idx) - assert np.shares_memory(get_array(ser), get_array(ser2)) - if using_copy_on_write: - assert not ser2._mgr._has_no_reference(0) - - ser2.iloc[0] = 100 - if using_copy_on_write: - tm.assert_series_equal(ser, ser_orig) - else: - expected = Series([100, 2, 3]) - tm.assert_series_equal(ser, expected) + assert ser._mgr._has_no_reference(0) -def test_series_from_block_manager_different_dtype(using_copy_on_write): +def test_series_from_block_manager_different_dtype(): ser = Series([1, 2, 3], dtype="int64") msg = "Passing a SingleBlockManager to Series" with tm.assert_produces_warning(DeprecationWarning, match=msg): ser2 = Series(ser._mgr, dtype="int32") assert not np.shares_memory(get_array(ser), get_array(ser2)) - if using_copy_on_write: - assert ser2._mgr._has_no_reference(0) + assert ser2._mgr._has_no_reference(0) @pytest.mark.parametrize("use_mgr", [True, False]) @pytest.mark.parametrize("columns", [None, ["a"]]) -def test_dataframe_constructor_mgr_or_df( - using_copy_on_write, warn_copy_on_write, columns, use_mgr -): +def test_dataframe_constructor_mgr_or_df(columns, use_mgr): df = DataFrame({"a": [1, 2, 3]}) df_orig = df.copy() @@ -207,23 +146,16 @@ def test_dataframe_constructor_mgr_or_df( new_df = DataFrame(data) assert np.shares_memory(get_array(df, "a"), get_array(new_df, "a")) - with tm.assert_cow_warning(warn_copy_on_write and not use_mgr): - new_df.iloc[0] = 100 + new_df.iloc[0] = 100 - if using_copy_on_write: - assert not np.shares_memory(get_array(df, "a"), get_array(new_df, "a")) - tm.assert_frame_equal(df, df_orig) - else: - assert np.shares_memory(get_array(df, "a"), get_array(new_df, "a")) - tm.assert_frame_equal(df, new_df) 
+ assert not np.shares_memory(get_array(df, "a"), get_array(new_df, "a")) + tm.assert_frame_equal(df, df_orig) @pytest.mark.parametrize("dtype", [None, "int64", "Int64"]) @pytest.mark.parametrize("index", [None, [0, 1, 2]]) @pytest.mark.parametrize("columns", [None, ["a", "b"], ["a", "b", "c"]]) -def test_dataframe_from_dict_of_series( - request, using_copy_on_write, warn_copy_on_write, columns, index, dtype -): +def test_dataframe_from_dict_of_series(columns, index, dtype): # Case: constructing a DataFrame from Series objects with copy=False # has to do a lazy following CoW rules # (the default for DataFrame(dict) is still to copy to ensure consolidation) @@ -242,13 +174,9 @@ def test_dataframe_from_dict_of_series( assert np.shares_memory(get_array(result, "a"), get_array(s1)) # mutating the new dataframe doesn't mutate original - with tm.assert_cow_warning(warn_copy_on_write): - result.iloc[0, 0] = 10 - if using_copy_on_write: - assert not np.shares_memory(get_array(result, "a"), get_array(s1)) - tm.assert_series_equal(s1, s1_orig) - else: - assert s1.iloc[0] == 10 + result.iloc[0, 0] = 10 + assert not np.shares_memory(get_array(result, "a"), get_array(s1)) + tm.assert_series_equal(s1, s1_orig) # the same when modifying the parent series s1 = Series([1, 2, 3]) @@ -256,13 +184,9 @@ def test_dataframe_from_dict_of_series( result = DataFrame( {"a": s1, "b": s2}, index=index, columns=columns, dtype=dtype, copy=False ) - with tm.assert_cow_warning(warn_copy_on_write): - s1.iloc[0] = 10 - if using_copy_on_write: - assert not np.shares_memory(get_array(result, "a"), get_array(s1)) - tm.assert_frame_equal(result, expected) - else: - assert result.iloc[0, 0] == 10 + s1.iloc[0] = 10 + assert not np.shares_memory(get_array(result, "a"), get_array(s1)) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype", [None, "int64"]) @@ -283,42 +207,33 @@ def test_dataframe_from_dict_of_series_with_reindex(dtype): assert np.shares_memory(arr_before, arr_after) -@pytest.mark.parametrize("cons", [Series, Index]) @pytest.mark.parametrize( - "data, dtype", [([1, 2], None), ([1, 2], "int64"), (["a", "b"], None)] + "data, dtype", [([1, 2], None), ([1, 2], "int64"), (["a", "b"], object)] ) -def test_dataframe_from_series_or_index( - using_copy_on_write, warn_copy_on_write, data, dtype, cons -): - obj = cons(data, dtype=dtype) +def test_dataframe_from_series_or_index(data, dtype, index_or_series): + obj = index_or_series(data, dtype=dtype) obj_orig = obj.copy() df = DataFrame(obj, dtype=dtype) assert np.shares_memory(get_array(obj), get_array(df, 0)) - if using_copy_on_write: - assert not df._mgr._has_no_reference(0) + assert not df._mgr._has_no_reference(0) - with tm.assert_cow_warning(warn_copy_on_write): - df.iloc[0, 0] = data[-1] - if using_copy_on_write: - tm.assert_equal(obj, obj_orig) + df.iloc[0, 0] = data[-1] + tm.assert_equal(obj, obj_orig) -@pytest.mark.parametrize("cons", [Series, Index]) -def test_dataframe_from_series_or_index_different_dtype(using_copy_on_write, cons): - obj = cons([1, 2], dtype="int64") +def test_dataframe_from_series_or_index_different_dtype(index_or_series): + obj = index_or_series([1, 2], dtype="int64") df = DataFrame(obj, dtype="int32") assert not np.shares_memory(get_array(obj), get_array(df, 0)) - if using_copy_on_write: - assert df._mgr._has_no_reference(0) + assert df._mgr._has_no_reference(0) -def test_dataframe_from_series_infer_datetime(using_copy_on_write): +def test_dataframe_from_series_dont_infer_datetime(): ser = Series([Timestamp("2019-12-31"), 
Timestamp("2020-12-31")], dtype=object) - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - df = DataFrame(ser) - assert not np.shares_memory(get_array(ser), get_array(df, 0)) - if using_copy_on_write: - assert df._mgr._has_no_reference(0) + df = DataFrame(ser) + assert df.dtypes.iloc[0] == np.dtype(object) + assert np.shares_memory(get_array(ser), get_array(df, 0)) + assert not df._mgr._has_no_reference(0) @pytest.mark.parametrize("index", [None, [0, 1, 2]]) @@ -339,44 +254,22 @@ def test_dataframe_from_dict_of_series_with_dtype(index): @pytest.mark.parametrize("copy", [False, None, True]) -def test_frame_from_numpy_array(using_copy_on_write, copy, using_array_manager): +def test_frame_from_numpy_array(copy): arr = np.array([[1, 2], [3, 4]]) df = DataFrame(arr, copy=copy) - if ( - using_copy_on_write - and copy is not False - or copy is True - or (using_array_manager and copy is None) - ): + if copy is not False or copy is True: assert not np.shares_memory(get_array(df, 0), arr) else: assert np.shares_memory(get_array(df, 0), arr) -def test_dataframe_from_records_with_dataframe(using_copy_on_write, warn_copy_on_write): - df = DataFrame({"a": [1, 2, 3]}) - df_orig = df.copy() - with tm.assert_produces_warning(FutureWarning): - df2 = DataFrame.from_records(df) - if using_copy_on_write: - assert not df._mgr._has_no_reference(0) - assert np.shares_memory(get_array(df, "a"), get_array(df2, "a")) - with tm.assert_cow_warning(warn_copy_on_write): - df2.iloc[0, 0] = 100 - if using_copy_on_write: - tm.assert_frame_equal(df, df_orig) - else: - tm.assert_frame_equal(df, df2) - - -def test_frame_from_dict_of_index(using_copy_on_write): +def test_frame_from_dict_of_index(): idx = Index([1, 2, 3]) expected = idx.copy(deep=True) df = DataFrame({"a": idx}, copy=False) assert np.shares_memory(get_array(df, "a"), idx._values) - if using_copy_on_write: - assert not df._mgr._has_no_reference(0) + assert not df._mgr._has_no_reference(0) - df.iloc[0, 0] = 100 - tm.assert_index_equal(idx, expected) + df.iloc[0, 0] = 100 + tm.assert_index_equal(idx, expected) diff --git a/pandas/tests/copy_view/test_copy_deprecation.py b/pandas/tests/copy_view/test_copy_deprecation.py new file mode 100644 index 0000000000000..8ee37213b92ab --- /dev/null +++ b/pandas/tests/copy_view/test_copy_deprecation.py @@ -0,0 +1,89 @@ +import pytest + +import pandas as pd +from pandas import ( + concat, + merge, +) +import pandas._testing as tm + + +@pytest.mark.parametrize( + "meth, kwargs", + [ + ("truncate", {}), + ("tz_convert", {"tz": "UTC"}), + ("tz_localize", {"tz": "UTC"}), + ("infer_objects", {}), + ("astype", {"dtype": "float64"}), + ("reindex", {"index": [2, 0, 1]}), + ("transpose", {}), + ("set_axis", {"labels": [1, 2, 3]}), + ("rename", {"index": {1: 2}}), + ("set_flags", {}), + ("to_period", {}), + ("to_timestamp", {}), + ("swaplevel", {"i": 0, "j": 1}), + ], +) +def test_copy_deprecation(meth, kwargs): + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": 1}) + + if meth in ("tz_convert", "tz_localize", "to_period"): + tz = None if meth in ("tz_localize", "to_period") else "US/Eastern" + df.index = pd.date_range("2020-01-01", freq="D", periods=len(df), tz=tz) + elif meth == "to_timestamp": + df.index = pd.period_range("2020-01-01", freq="D", periods=len(df)) + elif meth == "swaplevel": + df = df.set_index(["b", "c"]) + + if meth != "swaplevel": + with tm.assert_produces_warning(DeprecationWarning, match="copy"): + getattr(df, meth)(copy=False, **kwargs) + + if meth != "transpose": + with 
tm.assert_produces_warning(DeprecationWarning, match="copy"): + getattr(df.a, meth)(copy=False, **kwargs) + + +def test_copy_deprecation_reindex_like_align(): + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + # Somehow the stack level check is incorrect here + with tm.assert_produces_warning( + DeprecationWarning, match="copy", check_stacklevel=False + ): + df.reindex_like(df, copy=False) + + with tm.assert_produces_warning( + DeprecationWarning, match="copy", check_stacklevel=False + ): + df.a.reindex_like(df.a, copy=False) + + with tm.assert_produces_warning( + DeprecationWarning, match="copy", check_stacklevel=False + ): + df.align(df, copy=False) + + with tm.assert_produces_warning( + DeprecationWarning, match="copy", check_stacklevel=False + ): + df.a.align(df.a, copy=False) + + +def test_copy_deprecation_merge_concat(): + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + + with tm.assert_produces_warning( + DeprecationWarning, match="copy", check_stacklevel=False + ): + df.merge(df, copy=False) + + with tm.assert_produces_warning( + DeprecationWarning, match="copy", check_stacklevel=False + ): + merge(df, df, copy=False) + + with tm.assert_produces_warning( + DeprecationWarning, match="copy", check_stacklevel=False + ): + concat([df, df], copy=False) diff --git a/pandas/tests/copy_view/test_core_functionalities.py b/pandas/tests/copy_view/test_core_functionalities.py index 8dc80c5cc0e0e..ad16bafdf0ee4 100644 --- a/pandas/tests/copy_view/test_core_functionalities.py +++ b/pandas/tests/copy_view/test_core_functionalities.py @@ -6,18 +6,17 @@ from pandas.tests.copy_view.util import get_array -def test_assigning_to_same_variable_removes_references(using_copy_on_write): +def test_assigning_to_same_variable_removes_references(): df = DataFrame({"a": [1, 2, 3]}) df = df.reset_index() - if using_copy_on_write: - assert df._mgr._has_no_reference(1) + assert df._mgr._has_no_reference(1) arr = get_array(df, "a") df.iloc[0, 1] = 100 # Write into a assert np.shares_memory(arr, get_array(df, "a")) -def test_setitem_dont_track_unnecessary_references(using_copy_on_write): +def test_setitem_dont_track_unnecessary_references(): df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 1}) df["b"] = 100 @@ -28,46 +27,38 @@ def test_setitem_dont_track_unnecessary_references(using_copy_on_write): assert np.shares_memory(arr, get_array(df, "a")) -def test_setitem_with_view_copies(using_copy_on_write, warn_copy_on_write): +def test_setitem_with_view_copies(): df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 1}) view = df[:] expected = df.copy() df["b"] = 100 arr = get_array(df, "a") - with tm.assert_cow_warning(warn_copy_on_write): - df.iloc[0, 0] = 100 # Check that we correctly track reference - if using_copy_on_write: - assert not np.shares_memory(arr, get_array(df, "a")) - tm.assert_frame_equal(view, expected) + df.iloc[0, 0] = 100 # Check that we correctly track reference + assert not np.shares_memory(arr, get_array(df, "a")) + tm.assert_frame_equal(view, expected) -def test_setitem_with_view_invalidated_does_not_copy( - using_copy_on_write, warn_copy_on_write, request -): +def test_setitem_with_view_invalidated_does_not_copy(request): df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 1}) view = df[:] df["b"] = 100 arr = get_array(df, "a") view = None # noqa: F841 - # TODO(CoW-warn) false positive? 
-> block gets split because of `df["b"] = 100` + # TODO(CoW) block gets split because of `df["b"] = 100` # which introduces additional refs, even when those of `view` go out of scopes - with tm.assert_cow_warning(warn_copy_on_write): - df.iloc[0, 0] = 100 - if using_copy_on_write: - # Setitem split the block. Since the old block shared data with view - # all the new blocks are referencing view and each other. When view - # goes out of scope, they don't share data with any other block, - # so we should not trigger a copy - mark = pytest.mark.xfail( - reason="blk.delete does not track references correctly" - ) - request.applymarker(mark) - assert np.shares_memory(arr, get_array(df, "a")) - - -def test_out_of_scope(using_copy_on_write): + df.iloc[0, 0] = 100 + # Setitem split the block. Since the old block shared data with view + # all the new blocks are referencing view and each other. When view + # goes out of scope, they don't share data with any other block, + # so we should not trigger a copy + mark = pytest.mark.xfail(reason="blk.delete does not track references correctly") + request.applymarker(mark) + assert np.shares_memory(arr, get_array(df, "a")) + + +def test_out_of_scope(): def func(): df = DataFrame({"a": [1, 2], "b": 1.5, "c": 1}) # create some subset @@ -75,32 +66,28 @@ def func(): return result result = func() - if using_copy_on_write: - assert not result._mgr.blocks[0].refs.has_reference() - assert not result._mgr.blocks[1].refs.has_reference() + assert not result._mgr.blocks[0].refs.has_reference() + assert not result._mgr.blocks[1].refs.has_reference() -def test_delete(using_copy_on_write): +def test_delete(): df = DataFrame( np.random.default_rng(2).standard_normal((4, 3)), columns=["a", "b", "c"] ) del df["b"] - if using_copy_on_write: - assert not df._mgr.blocks[0].refs.has_reference() - assert not df._mgr.blocks[1].refs.has_reference() + assert not df._mgr.blocks[0].refs.has_reference() + assert not df._mgr.blocks[1].refs.has_reference() df = df[["a"]] - if using_copy_on_write: - assert not df._mgr.blocks[0].refs.has_reference() + assert not df._mgr.blocks[0].refs.has_reference() -def test_delete_reference(using_copy_on_write): +def test_delete_reference(): df = DataFrame( np.random.default_rng(2).standard_normal((4, 3)), columns=["a", "b", "c"] ) x = df[:] del df["b"] - if using_copy_on_write: - assert df._mgr.blocks[0].refs.has_reference() - assert df._mgr.blocks[1].refs.has_reference() - assert x._mgr.blocks[0].refs.has_reference() + assert df._mgr.blocks[0].refs.has_reference() + assert df._mgr.blocks[1].refs.has_reference() + assert x._mgr.blocks[0].refs.has_reference() diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py index 56e4b186350f2..32fea794975b6 100644 --- a/pandas/tests/copy_view/test_functions.py +++ b/pandas/tests/copy_view/test_functions.py @@ -1,6 +1,10 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + +from pandas.compat import HAS_PYARROW + from pandas import ( DataFrame, Index, @@ -12,189 +16,141 @@ from pandas.tests.copy_view.util import get_array -def test_concat_frames(using_copy_on_write): - df = DataFrame({"b": ["a"] * 3}) - df2 = DataFrame({"a": ["a"] * 3}) +def test_concat_frames(): + df = DataFrame({"b": ["a"] * 3}, dtype=object) + df2 = DataFrame({"a": ["a"] * 3}, dtype=object) df_orig = df.copy() result = concat([df, df2], axis=1) - if using_copy_on_write: - assert np.shares_memory(get_array(result, "b"), get_array(df, "b")) - assert 
np.shares_memory(get_array(result, "a"), get_array(df2, "a")) - else: - assert not np.shares_memory(get_array(result, "b"), get_array(df, "b")) - assert not np.shares_memory(get_array(result, "a"), get_array(df2, "a")) + assert np.shares_memory(get_array(result, "b"), get_array(df, "b")) + assert np.shares_memory(get_array(result, "a"), get_array(df2, "a")) result.iloc[0, 0] = "d" - if using_copy_on_write: - assert not np.shares_memory(get_array(result, "b"), get_array(df, "b")) - assert np.shares_memory(get_array(result, "a"), get_array(df2, "a")) + assert not np.shares_memory(get_array(result, "b"), get_array(df, "b")) + assert np.shares_memory(get_array(result, "a"), get_array(df2, "a")) result.iloc[0, 1] = "d" - if using_copy_on_write: - assert not np.shares_memory(get_array(result, "a"), get_array(df2, "a")) + assert not np.shares_memory(get_array(result, "a"), get_array(df2, "a")) tm.assert_frame_equal(df, df_orig) -def test_concat_frames_updating_input(using_copy_on_write): - df = DataFrame({"b": ["a"] * 3}) - df2 = DataFrame({"a": ["a"] * 3}) +def test_concat_frames_updating_input(): + df = DataFrame({"b": ["a"] * 3}, dtype=object) + df2 = DataFrame({"a": ["a"] * 3}, dtype=object) result = concat([df, df2], axis=1) - if using_copy_on_write: - assert np.shares_memory(get_array(result, "b"), get_array(df, "b")) - assert np.shares_memory(get_array(result, "a"), get_array(df2, "a")) - else: - assert not np.shares_memory(get_array(result, "b"), get_array(df, "b")) - assert not np.shares_memory(get_array(result, "a"), get_array(df2, "a")) + assert np.shares_memory(get_array(result, "b"), get_array(df, "b")) + assert np.shares_memory(get_array(result, "a"), get_array(df2, "a")) expected = result.copy() df.iloc[0, 0] = "d" - if using_copy_on_write: - assert not np.shares_memory(get_array(result, "b"), get_array(df, "b")) - assert np.shares_memory(get_array(result, "a"), get_array(df2, "a")) + assert not np.shares_memory(get_array(result, "b"), get_array(df, "b")) + assert np.shares_memory(get_array(result, "a"), get_array(df2, "a")) df2.iloc[0, 0] = "d" - if using_copy_on_write: - assert not np.shares_memory(get_array(result, "a"), get_array(df2, "a")) + assert not np.shares_memory(get_array(result, "a"), get_array(df2, "a")) tm.assert_frame_equal(result, expected) -def test_concat_series(using_copy_on_write): +def test_concat_series(): ser = Series([1, 2], name="a") ser2 = Series([3, 4], name="b") ser_orig = ser.copy() ser2_orig = ser2.copy() result = concat([ser, ser2], axis=1) - if using_copy_on_write: - assert np.shares_memory(get_array(result, "a"), ser.values) - assert np.shares_memory(get_array(result, "b"), ser2.values) - else: - assert not np.shares_memory(get_array(result, "a"), ser.values) - assert not np.shares_memory(get_array(result, "b"), ser2.values) + assert np.shares_memory(get_array(result, "a"), ser.values) + assert np.shares_memory(get_array(result, "b"), ser2.values) result.iloc[0, 0] = 100 - if using_copy_on_write: - assert not np.shares_memory(get_array(result, "a"), ser.values) - assert np.shares_memory(get_array(result, "b"), ser2.values) + assert not np.shares_memory(get_array(result, "a"), ser.values) + assert np.shares_memory(get_array(result, "b"), ser2.values) result.iloc[0, 1] = 1000 - if using_copy_on_write: - assert not np.shares_memory(get_array(result, "b"), ser2.values) + assert not np.shares_memory(get_array(result, "b"), ser2.values) tm.assert_series_equal(ser, ser_orig) tm.assert_series_equal(ser2, ser2_orig) -def 
test_concat_frames_chained(using_copy_on_write): +def test_concat_frames_chained(): df1 = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) df2 = DataFrame({"c": [4, 5, 6]}) df3 = DataFrame({"d": [4, 5, 6]}) result = concat([concat([df1, df2], axis=1), df3], axis=1) expected = result.copy() - if using_copy_on_write: - assert np.shares_memory(get_array(result, "a"), get_array(df1, "a")) - assert np.shares_memory(get_array(result, "c"), get_array(df2, "c")) - assert np.shares_memory(get_array(result, "d"), get_array(df3, "d")) - else: - assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a")) - assert not np.shares_memory(get_array(result, "c"), get_array(df2, "c")) - assert not np.shares_memory(get_array(result, "d"), get_array(df3, "d")) + assert np.shares_memory(get_array(result, "a"), get_array(df1, "a")) + assert np.shares_memory(get_array(result, "c"), get_array(df2, "c")) + assert np.shares_memory(get_array(result, "d"), get_array(df3, "d")) df1.iloc[0, 0] = 100 - if using_copy_on_write: - assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a")) + assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a")) tm.assert_frame_equal(result, expected) -def test_concat_series_chained(using_copy_on_write): +def test_concat_series_chained(): ser1 = Series([1, 2, 3], name="a") ser2 = Series([4, 5, 6], name="c") ser3 = Series([4, 5, 6], name="d") result = concat([concat([ser1, ser2], axis=1), ser3], axis=1) expected = result.copy() - if using_copy_on_write: - assert np.shares_memory(get_array(result, "a"), get_array(ser1, "a")) - assert np.shares_memory(get_array(result, "c"), get_array(ser2, "c")) - assert np.shares_memory(get_array(result, "d"), get_array(ser3, "d")) - else: - assert not np.shares_memory(get_array(result, "a"), get_array(ser1, "a")) - assert not np.shares_memory(get_array(result, "c"), get_array(ser2, "c")) - assert not np.shares_memory(get_array(result, "d"), get_array(ser3, "d")) + assert np.shares_memory(get_array(result, "a"), get_array(ser1, "a")) + assert np.shares_memory(get_array(result, "c"), get_array(ser2, "c")) + assert np.shares_memory(get_array(result, "d"), get_array(ser3, "d")) ser1.iloc[0] = 100 - if using_copy_on_write: - assert not np.shares_memory(get_array(result, "a"), get_array(ser1, "a")) + assert not np.shares_memory(get_array(result, "a"), get_array(ser1, "a")) tm.assert_frame_equal(result, expected) -def test_concat_series_updating_input(using_copy_on_write): +def test_concat_series_updating_input(): ser = Series([1, 2], name="a") ser2 = Series([3, 4], name="b") expected = DataFrame({"a": [1, 2], "b": [3, 4]}) result = concat([ser, ser2], axis=1) - if using_copy_on_write: - assert np.shares_memory(get_array(result, "a"), get_array(ser, "a")) - assert np.shares_memory(get_array(result, "b"), get_array(ser2, "b")) - else: - assert not np.shares_memory(get_array(result, "a"), get_array(ser, "a")) - assert not np.shares_memory(get_array(result, "b"), get_array(ser2, "b")) + assert np.shares_memory(get_array(result, "a"), get_array(ser, "a")) + assert np.shares_memory(get_array(result, "b"), get_array(ser2, "b")) ser.iloc[0] = 100 - if using_copy_on_write: - assert not np.shares_memory(get_array(result, "a"), get_array(ser, "a")) - assert np.shares_memory(get_array(result, "b"), get_array(ser2, "b")) + assert not np.shares_memory(get_array(result, "a"), get_array(ser, "a")) + assert np.shares_memory(get_array(result, "b"), get_array(ser2, "b")) tm.assert_frame_equal(result, expected) ser2.iloc[0] = 1000 - if 
using_copy_on_write: - assert not np.shares_memory(get_array(result, "b"), get_array(ser2, "b")) + assert not np.shares_memory(get_array(result, "b"), get_array(ser2, "b")) tm.assert_frame_equal(result, expected) -def test_concat_mixed_series_frame(using_copy_on_write): +def test_concat_mixed_series_frame(): df = DataFrame({"a": [1, 2, 3], "c": 1}) ser = Series([4, 5, 6], name="d") result = concat([df, ser], axis=1) expected = result.copy() - if using_copy_on_write: - assert np.shares_memory(get_array(result, "a"), get_array(df, "a")) - assert np.shares_memory(get_array(result, "c"), get_array(df, "c")) - assert np.shares_memory(get_array(result, "d"), get_array(ser, "d")) - else: - assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) - assert not np.shares_memory(get_array(result, "c"), get_array(df, "c")) - assert not np.shares_memory(get_array(result, "d"), get_array(ser, "d")) + assert np.shares_memory(get_array(result, "a"), get_array(df, "a")) + assert np.shares_memory(get_array(result, "c"), get_array(df, "c")) + assert np.shares_memory(get_array(result, "d"), get_array(ser, "d")) ser.iloc[0] = 100 - if using_copy_on_write: - assert not np.shares_memory(get_array(result, "d"), get_array(ser, "d")) + assert not np.shares_memory(get_array(result, "d"), get_array(ser, "d")) df.iloc[0, 0] = 100 - if using_copy_on_write: - assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) + assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("copy", [True, None, False]) -def test_concat_copy_keyword(using_copy_on_write, copy): +def test_concat_copy_keyword(): df = DataFrame({"a": [1, 2]}) df2 = DataFrame({"b": [1.5, 2.5]}) - result = concat([df, df2], axis=1, copy=copy) + result = concat([df, df2], axis=1) - if using_copy_on_write or copy is False: - assert np.shares_memory(get_array(df, "a"), get_array(result, "a")) - assert np.shares_memory(get_array(df2, "b"), get_array(result, "b")) - else: - assert not np.shares_memory(get_array(df, "a"), get_array(result, "a")) - assert not np.shares_memory(get_array(df2, "b"), get_array(result, "b")) + assert np.shares_memory(get_array(df, "a"), get_array(result, "a")) + assert np.shares_memory(get_array(df2, "b"), get_array(result, "b")) @pytest.mark.parametrize( @@ -204,36 +160,30 @@ def test_concat_copy_keyword(using_copy_on_write, copy): lambda df1, df2, **kwargs: merge(df1, df2, **kwargs), ], ) -def test_merge_on_key(using_copy_on_write, func): - df1 = DataFrame({"key": ["a", "b", "c"], "a": [1, 2, 3]}) - df2 = DataFrame({"key": ["a", "b", "c"], "b": [4, 5, 6]}) +def test_merge_on_key(func): + df1 = DataFrame({"key": Series(["a", "b", "c"], dtype=object), "a": [1, 2, 3]}) + df2 = DataFrame({"key": Series(["a", "b", "c"], dtype=object), "b": [4, 5, 6]}) df1_orig = df1.copy() df2_orig = df2.copy() result = func(df1, df2, on="key") - if using_copy_on_write: - assert np.shares_memory(get_array(result, "a"), get_array(df1, "a")) - assert np.shares_memory(get_array(result, "b"), get_array(df2, "b")) - assert np.shares_memory(get_array(result, "key"), get_array(df1, "key")) - assert not np.shares_memory(get_array(result, "key"), get_array(df2, "key")) - else: - assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a")) - assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b")) + assert np.shares_memory(get_array(result, "a"), get_array(df1, "a")) + assert np.shares_memory(get_array(result, "b"), get_array(df2, 
"b")) + assert np.shares_memory(get_array(result, "key"), get_array(df1, "key")) + assert not np.shares_memory(get_array(result, "key"), get_array(df2, "key")) result.iloc[0, 1] = 0 - if using_copy_on_write: - assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a")) - assert np.shares_memory(get_array(result, "b"), get_array(df2, "b")) + assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a")) + assert np.shares_memory(get_array(result, "b"), get_array(df2, "b")) result.iloc[0, 2] = 0 - if using_copy_on_write: - assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b")) + assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b")) tm.assert_frame_equal(df1, df1_orig) tm.assert_frame_equal(df2, df2_orig) -def test_merge_on_index(using_copy_on_write): +def test_merge_on_index(): df1 = DataFrame({"a": [1, 2, 3]}) df2 = DataFrame({"b": [4, 5, 6]}) df1_orig = df1.copy() @@ -241,21 +191,15 @@ def test_merge_on_index(using_copy_on_write): result = merge(df1, df2, left_index=True, right_index=True) - if using_copy_on_write: - assert np.shares_memory(get_array(result, "a"), get_array(df1, "a")) - assert np.shares_memory(get_array(result, "b"), get_array(df2, "b")) - else: - assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a")) - assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b")) + assert np.shares_memory(get_array(result, "a"), get_array(df1, "a")) + assert np.shares_memory(get_array(result, "b"), get_array(df2, "b")) result.iloc[0, 0] = 0 - if using_copy_on_write: - assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a")) - assert np.shares_memory(get_array(result, "b"), get_array(df2, "b")) + assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a")) + assert np.shares_memory(get_array(result, "b"), get_array(df2, "b")) result.iloc[0, 1] = 0 - if using_copy_on_write: - assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b")) + assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b")) tm.assert_frame_equal(df1, df1_orig) tm.assert_frame_equal(df2, df2_orig) @@ -267,54 +211,49 @@ def test_merge_on_index(using_copy_on_write): (lambda df1, df2, **kwargs: merge(df1, df2, on="key", **kwargs), "left"), ], ) -def test_merge_on_key_enlarging_one(using_copy_on_write, func, how): - df1 = DataFrame({"key": ["a", "b", "c"], "a": [1, 2, 3]}) - df2 = DataFrame({"key": ["a", "b"], "b": [4, 5]}) +def test_merge_on_key_enlarging_one(func, how): + df1 = DataFrame({"key": Series(["a", "b", "c"], dtype=object), "a": [1, 2, 3]}) + df2 = DataFrame({"key": Series(["a", "b"], dtype=object), "b": [4, 5]}) df1_orig = df1.copy() df2_orig = df2.copy() result = func(df1, df2, how=how) - if using_copy_on_write: - assert np.shares_memory(get_array(result, "a"), get_array(df1, "a")) - assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b")) - assert df2._mgr._has_no_reference(1) - assert df2._mgr._has_no_reference(0) - assert np.shares_memory(get_array(result, "key"), get_array(df1, "key")) is ( - how == "left" - ) - assert not np.shares_memory(get_array(result, "key"), get_array(df2, "key")) - else: - assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a")) - assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b")) + assert np.shares_memory(get_array(result, "a"), get_array(df1, "a")) + assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b")) + assert df2._mgr._has_no_reference(1) + assert 
@@ -267,54 +211,49 @@ def test_merge_on_index(using_copy_on_write):
        (lambda df1, df2, **kwargs: merge(df1, df2, on="key", **kwargs), "left"),
    ],
)
-def test_merge_on_key_enlarging_one(using_copy_on_write, func, how):
-    df1 = DataFrame({"key": ["a", "b", "c"], "a": [1, 2, 3]})
-    df2 = DataFrame({"key": ["a", "b"], "b": [4, 5]})
+def test_merge_on_key_enlarging_one(func, how):
+    df1 = DataFrame({"key": Series(["a", "b", "c"], dtype=object), "a": [1, 2, 3]})
+    df2 = DataFrame({"key": Series(["a", "b"], dtype=object), "b": [4, 5]})
    df1_orig = df1.copy()
    df2_orig = df2.copy()

    result = func(df1, df2, how=how)

-    if using_copy_on_write:
-        assert np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
-        assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
-        assert df2._mgr._has_no_reference(1)
-        assert df2._mgr._has_no_reference(0)
-        assert np.shares_memory(get_array(result, "key"), get_array(df1, "key")) is (
-            how == "left"
-        )
-        assert not np.shares_memory(get_array(result, "key"), get_array(df2, "key"))
-    else:
-        assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
-        assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
+    assert np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
+    assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
+    assert df2._mgr._has_no_reference(1)
+    assert df2._mgr._has_no_reference(0)
+    assert np.shares_memory(get_array(result, "key"), get_array(df1, "key")) is (
+        how == "left"
+    )
+    assert not np.shares_memory(get_array(result, "key"), get_array(df2, "key"))

    if how == "left":
        result.iloc[0, 1] = 0
    else:
        result.iloc[0, 2] = 0
-    if using_copy_on_write:
-        assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
+    assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
    tm.assert_frame_equal(df1, df1_orig)
    tm.assert_frame_equal(df2, df2_orig)


-@pytest.mark.parametrize("copy", [True, None, False])
-def test_merge_copy_keyword(using_copy_on_write, copy):
+def test_merge_copy_keyword():
    df = DataFrame({"a": [1, 2]})
    df2 = DataFrame({"b": [3, 4.5]})

-    result = df.merge(df2, copy=copy, left_index=True, right_index=True)
+    result = df.merge(df2, left_index=True, right_index=True)

-    if using_copy_on_write or copy is False:
-        assert np.shares_memory(get_array(df, "a"), get_array(result, "a"))
-        assert np.shares_memory(get_array(df2, "b"), get_array(result, "b"))
-    else:
-        assert not np.shares_memory(get_array(df, "a"), get_array(result, "a"))
-        assert not np.shares_memory(get_array(df2, "b"), get_array(result, "b"))
+    assert np.shares_memory(get_array(df, "a"), get_array(result, "a"))
+    assert np.shares_memory(get_array(df2, "b"), get_array(result, "b"))
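merge behaves the same way, which is why the copy keyword (and its parametrization) could be dropped entirely. A sketch under the same assumptions as above:

import numpy as np
import pandas as pd

left = pd.DataFrame({"a": [1, 2]})
right = pd.DataFrame({"b": [3.0, 4.5]})

result = left.merge(right, left_index=True, right_index=True)
# no eager copy: both result columns still point at the input buffers
assert np.shares_memory(result["a"].to_numpy(), left["a"].to_numpy())
assert np.shares_memory(result["b"].to_numpy(), right["b"].to_numpy())

result.iloc[0, 0] = 0  # mutating one column detaches only that column
assert not np.shares_memory(result["a"].to_numpy(), left["a"].to_numpy())
assert np.shares_memory(result["b"].to_numpy(), right["b"].to_numpy())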
-def test_join_on_key(using_copy_on_write):
-    df_index = Index(["a", "b", "c"], name="key")
+@pytest.mark.xfail(
+    using_string_dtype() and HAS_PYARROW,
+    reason="TODO(infer_string); result.index infers str dtype while both "
+    "df1 and df2 index are object.",
+)
+def test_join_on_key():
+    df_index = Index(["a", "b", "c"], name="key", dtype=object)
    df1 = DataFrame({"a": [1, 2, 3]}, index=df_index.copy(deep=True))
    df2 = DataFrame({"b": [4, 5, 6]}, index=df_index.copy(deep=True))
@@ -324,30 +263,24 @@ def test_join_on_key(using_copy_on_write):

    result = df1.join(df2, on="key")

-    if using_copy_on_write:
-        assert np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
-        assert np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
-        assert np.shares_memory(get_array(result.index), get_array(df1.index))
-        assert not np.shares_memory(get_array(result.index), get_array(df2.index))
-    else:
-        assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
-        assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
+    assert np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
+    assert np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
+    assert np.shares_memory(get_array(result.index), get_array(df1.index))
+    assert not np.shares_memory(get_array(result.index), get_array(df2.index))

    result.iloc[0, 0] = 0
-    if using_copy_on_write:
-        assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
-        assert np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
+    assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
+    assert np.shares_memory(get_array(result, "b"), get_array(df2, "b"))

    result.iloc[0, 1] = 0
-    if using_copy_on_write:
-        assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
+    assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b"))

    tm.assert_frame_equal(df1, df1_orig)
    tm.assert_frame_equal(df2, df2_orig)


-def test_join_multiple_dataframes_on_key(using_copy_on_write):
-    df_index = Index(["a", "b", "c"], name="key")
+def test_join_multiple_dataframes_on_key():
+    df_index = Index(["a", "b", "c"], name="key", dtype=object)
    df1 = DataFrame({"a": [1, 2, 3]}, index=df_index.copy(deep=True))
    dfs_list = [
@@ -360,36 +293,24 @@ def test_join_multiple_dataframes_on_key(using_copy_on_write):

    result = df1.join(dfs_list)

-    if using_copy_on_write:
-        assert np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
-        assert np.shares_memory(get_array(result, "b"), get_array(dfs_list[0], "b"))
-        assert np.shares_memory(get_array(result, "c"), get_array(dfs_list[1], "c"))
-        assert np.shares_memory(get_array(result.index), get_array(df1.index))
-        assert not np.shares_memory(
-            get_array(result.index), get_array(dfs_list[0].index)
-        )
-        assert not np.shares_memory(
-            get_array(result.index), get_array(dfs_list[1].index)
-        )
-    else:
-        assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
-        assert not np.shares_memory(get_array(result, "b"), get_array(dfs_list[0], "b"))
-        assert not np.shares_memory(get_array(result, "c"), get_array(dfs_list[1], "c"))
+    assert np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
+    assert np.shares_memory(get_array(result, "b"), get_array(dfs_list[0], "b"))
+    assert np.shares_memory(get_array(result, "c"), get_array(dfs_list[1], "c"))
+    assert np.shares_memory(get_array(result.index), get_array(df1.index))
+    assert not np.shares_memory(get_array(result.index), get_array(dfs_list[0].index))
+    assert not np.shares_memory(get_array(result.index), get_array(dfs_list[1].index))

    result.iloc[0, 0] = 0
-    if using_copy_on_write:
-        assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
-        assert np.shares_memory(get_array(result, "b"), get_array(dfs_list[0], "b"))
-        assert np.shares_memory(get_array(result, "c"), get_array(dfs_list[1], "c"))
+    assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
+    assert np.shares_memory(get_array(result, "b"), get_array(dfs_list[0], "b"))
+    assert np.shares_memory(get_array(result, "c"), get_array(dfs_list[1], "c"))

    result.iloc[0, 1] = 0
-    if using_copy_on_write:
-        assert not np.shares_memory(get_array(result, "b"), get_array(dfs_list[0], "b"))
-        assert np.shares_memory(get_array(result, "c"), get_array(dfs_list[1], "c"))
+    assert not np.shares_memory(get_array(result, "b"), get_array(dfs_list[0], "b"))
+    assert np.shares_memory(get_array(result, "c"), get_array(dfs_list[1], "c"))

    result.iloc[0, 2] = 0
-    if using_copy_on_write:
-        assert not np.shares_memory(get_array(result, "c"), get_array(dfs_list[1], "c"))
+    assert not np.shares_memory(get_array(result, "c"), get_array(dfs_list[1], "c"))

    tm.assert_frame_equal(df1, df1_orig)
    for df, df_orig in zip(dfs_list, dfs_list_orig):
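DataFrame.join follows the same lazy-copy pattern as merge; a sketch (same assumptions; with identical indexes the internal reindex is a no-op, so the columns should still share memory until written to):

import numpy as np
import pandas as pd

df1 = pd.DataFrame({"x": [1, 2, 3]})
df2 = pd.DataFrame({"y": [4, 5, 6]})

result = df1.join(df2)  # aligns on the (identical) default index
assert np.shares_memory(result["x"].to_numpy(), df1["x"].to_numpy())
assert np.shares_memory(result["y"].to_numpy(), df2["y"].to_numpy())

result.iloc[0, 0] = 0
assert not np.shares_memory(result["x"].to_numpy(), df1["x"].to_numpy())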
diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py
index 6f3850ab64daa..37a21e1098e78 100644
--- a/pandas/tests/copy_view/test_indexing.py
+++ b/pandas/tests/copy_view/test_indexing.py
@@ -1,8 +1,6 @@
import numpy as np
import pytest

-from pandas.errors import SettingWithCopyWarning
-
from pandas.core.dtypes.common import is_float_dtype

import pandas as pd
@@ -50,7 +48,7 @@ def make_series(*args, **kwargs):
# Indexing operations taking subset + modifying the subset/parent


-def test_subset_column_selection(backend, using_copy_on_write):
+def test_subset_column_selection(backend):
    # Case: taking a subset of the columns of a DataFrame
    # + afterwards modifying the subset
    _, DataFrame, _ = backend
@@ -59,17 +57,10 @@ def test_subset_column_selection(backend, using_copy_on_write):

    subset = df[["a", "c"]]

-    if using_copy_on_write:
-        # the subset shares memory ...
-        assert np.shares_memory(get_array(subset, "a"), get_array(df, "a"))
-        # ... but uses CoW when being modified
-        subset.iloc[0, 0] = 0
-    else:
-        assert not np.shares_memory(get_array(subset, "a"), get_array(df, "a"))
-        # INFO this no longer raise warning since pandas 1.4
-        # with pd.option_context("chained_assignment", "warn"):
-        #     with tm.assert_produces_warning(SettingWithCopyWarning):
-        subset.iloc[0, 0] = 0
+    # the subset shares memory ...
+    assert np.shares_memory(get_array(subset, "a"), get_array(df, "a"))
+    # ... but uses CoW when being modified
+    subset.iloc[0, 0] = 0

    assert not np.shares_memory(get_array(subset, "a"), get_array(df, "a"))
@@ -78,7 +69,7 @@ def test_subset_column_selection(backend, using_copy_on_write):
    tm.assert_frame_equal(df, df_orig)


-def test_subset_column_selection_modify_parent(backend, using_copy_on_write):
+def test_subset_column_selection_modify_parent(backend):
    # Case: taking a subset of the columns of a DataFrame
    # + afterwards modifying the parent
    _, DataFrame, _ = backend
@@ -86,22 +77,20 @@

    subset = df[["a", "c"]]

-    if using_copy_on_write:
-        # the subset shares memory ...
-        assert np.shares_memory(get_array(subset, "a"), get_array(df, "a"))
-        # ... but parent uses CoW parent when it is modified
+    # the subset shares memory ...
+    assert np.shares_memory(get_array(subset, "a"), get_array(df, "a"))
+    # ... but the parent uses CoW when it is modified
    df.iloc[0, 0] = 0

    assert not np.shares_memory(get_array(subset, "a"), get_array(df, "a"))
-    if using_copy_on_write:
-        # different column/block still shares memory
-        assert np.shares_memory(get_array(subset, "c"), get_array(df, "c"))
+    # different column/block still shares memory
+    assert np.shares_memory(get_array(subset, "c"), get_array(df, "c"))

    expected = DataFrame({"a": [1, 2, 3], "c": [0.1, 0.2, 0.3]})
    tm.assert_frame_equal(subset, expected)
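The pattern asserted throughout this file, in one self-contained sketch (same assumptions as above): a column subset is a lazy copy of its parent, and a write on either side detaches only the object being written to.

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]})
subset = df[["a"]]
# the subset initially shares the parent's buffer ...
assert np.shares_memory(subset["a"].to_numpy(), df["a"].to_numpy())

df.iloc[0, 0] = 100  # ... and the parent copies "a" before writing
assert not np.shares_memory(subset["a"].to_numpy(), df["a"].to_numpy())
assert subset["a"].iloc[0] == 1  # the subset never sees the write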
-def test_subset_row_slice(backend, using_copy_on_write, warn_copy_on_write):
+def test_subset_row_slice(backend):
    # Case: taking a subset of the rows of a DataFrame using a slice
    # + afterwards modifying the subset
    _, DataFrame, _ = backend
@@ -113,42 +102,24 @@ def test_subset_row_slice(backend, using_copy_on_write, warn_copy_on_write):

    assert np.shares_memory(get_array(subset, "a"), get_array(df, "a"))

-    if using_copy_on_write:
-        subset.iloc[0, 0] = 0
-        assert not np.shares_memory(get_array(subset, "a"), get_array(df, "a"))
-
-    else:
-        # INFO this no longer raise warning since pandas 1.4
-        # with pd.option_context("chained_assignment", "warn"):
-        #     with tm.assert_produces_warning(SettingWithCopyWarning):
-        with tm.assert_cow_warning(warn_copy_on_write):
-            subset.iloc[0, 0] = 0
+    subset.iloc[0, 0] = 0
+    assert not np.shares_memory(get_array(subset, "a"), get_array(df, "a"))

    subset._mgr._verify_integrity()

    expected = DataFrame({"a": [0, 3], "b": [5, 6], "c": [0.2, 0.3]}, index=range(1, 3))
    tm.assert_frame_equal(subset, expected)

-    if using_copy_on_write:
-        # original parent dataframe is not modified (CoW)
-        tm.assert_frame_equal(df, df_orig)
-    else:
-        # original parent dataframe is actually updated
-        df_orig.iloc[1, 0] = 0
-        tm.assert_frame_equal(df, df_orig)
+    # original parent dataframe is not modified (CoW)
+    tm.assert_frame_equal(df, df_orig)


@pytest.mark.parametrize(
    "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
)
-def test_subset_column_slice(
-    backend, using_copy_on_write, warn_copy_on_write, using_array_manager, dtype
-):
+def test_subset_column_slice(backend, dtype):
    # Case: taking a subset of the columns of a DataFrame using a slice
    # + afterwards modifying the subset
    dtype_backend, DataFrame, _ = backend
-    single_block = (
-        dtype == "int64" and dtype_backend == "numpy"
-    ) and not using_array_manager
    df = DataFrame(
        {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)}
    )
@@ -157,30 +128,16 @@ def test_subset_column_slice(

    subset = df.iloc[:, 1:]
    subset._mgr._verify_integrity()

-    if using_copy_on_write:
-        assert np.shares_memory(get_array(subset, "b"), get_array(df, "b"))
+    assert np.shares_memory(get_array(subset, "b"), get_array(df, "b"))

-        subset.iloc[0, 0] = 0
-        assert not np.shares_memory(get_array(subset, "b"), get_array(df, "b"))
-    elif warn_copy_on_write:
-        with tm.assert_cow_warning(single_block):
-            subset.iloc[0, 0] = 0
-    else:
-        # we only get a warning in case of a single block
-        warn = SettingWithCopyWarning if single_block else None
-        with pd.option_context("chained_assignment", "warn"):
-            with tm.assert_produces_warning(warn):
-                subset.iloc[0, 0] = 0
+    subset.iloc[0, 0] = 0
+    assert not np.shares_memory(get_array(subset, "b"), get_array(df, "b"))

    expected = DataFrame({"b": [0, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)})
    tm.assert_frame_equal(subset, expected)
    # original parent dataframe is not modified (also not for BlockManager case,
    # except for single block)
-    if not using_copy_on_write and (using_array_manager or single_block):
-        df_orig.iloc[0, 1] = 0
-        tm.assert_frame_equal(df, df_orig)
-    else:
-        tm.assert_frame_equal(df, df_orig)
+    tm.assert_frame_equal(df, df_orig)


@pytest.mark.parametrize(
@@ -201,9 +158,6 @@ def test_subset_loc_rows_columns(
    dtype,
    row_indexer,
    column_indexer,
-    using_array_manager,
-    using_copy_on_write,
-    warn_copy_on_write,
):
    # Case: taking a subset of the rows+columns of a DataFrame using .loc
    # + afterwards modifying the subset
@@ -219,31 +173,13 @@

    subset = df.loc[row_indexer, column_indexer]

-    # a few corner cases _do_ actually modify the parent (with both row and column
-    # slice, and in case of ArrayManager or BlockManager with single block)
-    mutate_parent = (
-        isinstance(row_indexer, slice)
-        and isinstance(column_indexer, slice)
-        and (
-            using_array_manager
-            or (
-                dtype == "int64"
-                and dtype_backend == "numpy"
-                and not using_copy_on_write
-            )
-        )
-    )
-
-    # modifying the subset never modifies the parent
-    with tm.assert_cow_warning(warn_copy_on_write and mutate_parent):
-        subset.iloc[0, 0] = 0
+    subset.iloc[0, 0] = 0

    expected = DataFrame(
        {"b": [0, 6], "c": np.array([8, 9], dtype=dtype)}, index=range(1, 3)
    )
    tm.assert_frame_equal(subset, expected)

-    if mutate_parent:
-        df_orig.iloc[1, 1] = 0
    tm.assert_frame_equal(df, df_orig)
@@ -265,9 +201,6 @@ def test_subset_iloc_rows_columns(
    dtype,
    row_indexer,
    column_indexer,
-    using_array_manager,
-    using_copy_on_write,
-    warn_copy_on_write,
):
    # Case: taking a subset of the rows+columns of a DataFrame using .iloc
    # + afterwards modifying the subset
@@ -283,31 +216,13 @@

    subset = df.iloc[row_indexer, column_indexer]

-    # a few corner cases _do_ actually modify the parent (with both row and column
-    # slice, and in case of ArrayManager or BlockManager with single block)
-    mutate_parent = (
-        isinstance(row_indexer, slice)
-        and isinstance(column_indexer, slice)
-        and (
-            using_array_manager
-            or (
-                dtype == "int64"
-                and dtype_backend == "numpy"
-                and not using_copy_on_write
-            )
-        )
-    )
-
-    # modifying the subset never modifies the parent
-    with tm.assert_cow_warning(warn_copy_on_write and mutate_parent):
-        subset.iloc[0, 0] = 0
+    subset.iloc[0, 0] = 0

    expected = DataFrame(
        {"b": [0, 6], "c": np.array([8, 9], dtype=dtype)}, index=range(1, 3)
    )
    tm.assert_frame_equal(subset, expected)

-    if mutate_parent:
-        df_orig.iloc[1, 1] = 0
    tm.assert_frame_equal(df, df_orig)


@@ -316,9 +231,7 @@ def test_subset_iloc_rows_columns(
    [slice(0, 2), np.array([True, True, False]), np.array([0, 1])],
    ids=["slice", "mask", "array"],
)
-def test_subset_set_with_row_indexer(
-    backend, indexer_si, indexer, using_copy_on_write, warn_copy_on_write
-):
+def test_subset_set_with_row_indexer(backend, indexer_si, indexer):
    # Case: setting values with a row indexer on a viewing subset
    # subset[indexer] = value and subset.iloc[indexer] = value
    _, DataFrame, _ = backend
@@ -333,32 +246,17 @@
    ):
        pytest.skip("setitem with labels selects on columns")

-    if using_copy_on_write:
-        indexer_si(subset)[indexer] = 0
-    elif warn_copy_on_write:
-        with tm.assert_cow_warning():
-            indexer_si(subset)[indexer] = 0
-    else:
-        # INFO iloc no longer raises warning since pandas 1.4
-        warn = SettingWithCopyWarning if indexer_si is tm.setitem else None
-        with pd.option_context("chained_assignment", "warn"):
-            with tm.assert_produces_warning(warn):
-                indexer_si(subset)[indexer] = 0
+    indexer_si(subset)[indexer] = 0

    expected = DataFrame(
        {"a": [0, 0, 4], "b": [0, 0, 7], "c": [0.0, 0.0, 0.4]}, index=range(1, 4)
    )
    tm.assert_frame_equal(subset, expected)

-    if using_copy_on_write:
-        # original parent dataframe is not modified (CoW)
-        tm.assert_frame_equal(df, df_orig)
-    else:
-        # original parent dataframe is actually updated
-        df_orig[1:3] = 0
-        tm.assert_frame_equal(df, df_orig)
+    # original parent dataframe is not modified (CoW)
+    tm.assert_frame_equal(df, df_orig)
pd.option_context("chained_assignment", "warn"): - with tm.assert_produces_warning(SettingWithCopyWarning): - subset["a"] = arr - + subset["a"] = arr subset._mgr._verify_integrity() expected = DataFrame( {"a": [10, 11], "b": [5, 6], "c": [0.2, 0.3]}, index=range(1, 3) @@ -421,9 +298,7 @@ def test_subset_set_column(backend, using_copy_on_write, warn_copy_on_write): @pytest.mark.parametrize( "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] ) -def test_subset_set_column_with_loc( - backend, using_copy_on_write, warn_copy_on_write, using_array_manager, dtype -): +def test_subset_set_column_with_loc(backend, dtype): # Case: setting a single column with loc on a viewing subset # -> subset.loc[:, col] = value _, DataFrame, _ = backend @@ -433,18 +308,7 @@ def test_subset_set_column_with_loc( df_orig = df.copy() subset = df[1:3] - if using_copy_on_write: - subset.loc[:, "a"] = np.array([10, 11], dtype="int64") - elif warn_copy_on_write: - with tm.assert_cow_warning(): - subset.loc[:, "a"] = np.array([10, 11], dtype="int64") - else: - with pd.option_context("chained_assignment", "warn"): - with tm.assert_produces_warning( - None, - raise_on_extra_warnings=not using_array_manager, - ): - subset.loc[:, "a"] = np.array([10, 11], dtype="int64") + subset.loc[:, "a"] = np.array([10, 11], dtype="int64") subset._mgr._verify_integrity() expected = DataFrame( @@ -452,18 +316,11 @@ def test_subset_set_column_with_loc( index=range(1, 3), ) tm.assert_frame_equal(subset, expected) - if using_copy_on_write: - # original parent dataframe is not modified (CoW) - tm.assert_frame_equal(df, df_orig) - else: - # original parent dataframe is actually updated - df_orig.loc[1:3, "a"] = np.array([10, 11], dtype="int64") - tm.assert_frame_equal(df, df_orig) + # original parent dataframe is not modified (CoW) + tm.assert_frame_equal(df, df_orig) -def test_subset_set_column_with_loc2( - backend, using_copy_on_write, warn_copy_on_write, using_array_manager -): +def test_subset_set_column_with_loc2(backend): # Case: setting a single column with loc on a viewing subset # -> subset.loc[:, col] = value # separate test for case of DataFrame of a single column -> takes a separate @@ -473,35 +330,19 @@ def test_subset_set_column_with_loc2( df_orig = df.copy() subset = df[1:3] - if using_copy_on_write: - subset.loc[:, "a"] = 0 - elif warn_copy_on_write: - with tm.assert_cow_warning(): - subset.loc[:, "a"] = 0 - else: - with pd.option_context("chained_assignment", "warn"): - with tm.assert_produces_warning( - None, - raise_on_extra_warnings=not using_array_manager, - ): - subset.loc[:, "a"] = 0 + subset.loc[:, "a"] = 0 subset._mgr._verify_integrity() expected = DataFrame({"a": [0, 0]}, index=range(1, 3)) tm.assert_frame_equal(subset, expected) - if using_copy_on_write: - # original parent dataframe is not modified (CoW) - tm.assert_frame_equal(df, df_orig) - else: - # original parent dataframe is actually updated - df_orig.loc[1:3, "a"] = 0 - tm.assert_frame_equal(df, df_orig) + # original parent dataframe is not modified (CoW) + tm.assert_frame_equal(df, df_orig) @pytest.mark.parametrize( "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] ) -def test_subset_set_columns(backend, using_copy_on_write, warn_copy_on_write, dtype): +def test_subset_set_columns(backend, dtype): # Case: setting multiple columns on a viewing subset # -> subset[[col1, col2]] = value dtype_backend, DataFrame, _ = backend @@ -511,17 +352,11 @@ def test_subset_set_columns(backend, using_copy_on_write, warn_copy_on_write, dt 
-def test_subset_set_column(backend, using_copy_on_write, warn_copy_on_write):
+def test_subset_set_column(backend):
    # Case: setting a single column on a viewing subset -> subset[col] = value
    dtype_backend, DataFrame, _ = backend
    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
@@ -403,13 +286,7 @@
    else:
        arr = pd.array([10, 11], dtype="Int64")

-    if using_copy_on_write or warn_copy_on_write:
-        subset["a"] = arr
-    else:
-        with pd.option_context("chained_assignment", "warn"):
-            with tm.assert_produces_warning(SettingWithCopyWarning):
-                subset["a"] = arr
-
+    subset["a"] = arr
    subset._mgr._verify_integrity()
    expected = DataFrame(
        {"a": [10, 11], "b": [5, 6], "c": [0.2, 0.3]}, index=range(1, 3)
    )
@@ -421,9 +298,7 @@
@pytest.mark.parametrize(
    "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
)
-def test_subset_set_column_with_loc(
-    backend, using_copy_on_write, warn_copy_on_write, using_array_manager, dtype
-):
+def test_subset_set_column_with_loc(backend, dtype):
    # Case: setting a single column with loc on a viewing subset
    # -> subset.loc[:, col] = value
    _, DataFrame, _ = backend
@@ -433,18 +308,7 @@
    df_orig = df.copy()
    subset = df[1:3]

-    if using_copy_on_write:
-        subset.loc[:, "a"] = np.array([10, 11], dtype="int64")
-    elif warn_copy_on_write:
-        with tm.assert_cow_warning():
-            subset.loc[:, "a"] = np.array([10, 11], dtype="int64")
-    else:
-        with pd.option_context("chained_assignment", "warn"):
-            with tm.assert_produces_warning(
-                None,
-                raise_on_extra_warnings=not using_array_manager,
-            ):
-                subset.loc[:, "a"] = np.array([10, 11], dtype="int64")
+    subset.loc[:, "a"] = np.array([10, 11], dtype="int64")

    subset._mgr._verify_integrity()
    expected = DataFrame(
@@ -452,18 +316,11 @@
        index=range(1, 3),
    )
    tm.assert_frame_equal(subset, expected)

-    if using_copy_on_write:
-        # original parent dataframe is not modified (CoW)
-        tm.assert_frame_equal(df, df_orig)
-    else:
-        # original parent dataframe is actually updated
-        df_orig.loc[1:3, "a"] = np.array([10, 11], dtype="int64")
-        tm.assert_frame_equal(df, df_orig)
+    # original parent dataframe is not modified (CoW)
+    tm.assert_frame_equal(df, df_orig)


-def test_subset_set_column_with_loc2(
-    backend, using_copy_on_write, warn_copy_on_write, using_array_manager
-):
+def test_subset_set_column_with_loc2(backend):
    # Case: setting a single column with loc on a viewing subset
    # -> subset.loc[:, col] = value
    # separate test for case of DataFrame of a single column -> takes a separate
@@ -473,35 +330,19 @@
    df_orig = df.copy()
    subset = df[1:3]

-    if using_copy_on_write:
-        subset.loc[:, "a"] = 0
-    elif warn_copy_on_write:
-        with tm.assert_cow_warning():
-            subset.loc[:, "a"] = 0
-    else:
-        with pd.option_context("chained_assignment", "warn"):
-            with tm.assert_produces_warning(
-                None,
-                raise_on_extra_warnings=not using_array_manager,
-            ):
-                subset.loc[:, "a"] = 0
+    subset.loc[:, "a"] = 0

    subset._mgr._verify_integrity()
    expected = DataFrame({"a": [0, 0]}, index=range(1, 3))
    tm.assert_frame_equal(subset, expected)

-    if using_copy_on_write:
-        # original parent dataframe is not modified (CoW)
-        tm.assert_frame_equal(df, df_orig)
-    else:
-        # original parent dataframe is actually updated
-        df_orig.loc[1:3, "a"] = 0
-        tm.assert_frame_equal(df, df_orig)
+    # original parent dataframe is not modified (CoW)
+    tm.assert_frame_equal(df, df_orig)
@pytest.mark.parametrize(
    "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
)
-def test_subset_set_columns(backend, using_copy_on_write, warn_copy_on_write, dtype):
+def test_subset_set_columns(backend, dtype):
    # Case: setting multiple columns on a viewing subset
    # -> subset[[col1, col2]] = value
    dtype_backend, DataFrame, _ = backend
@@ -511,17 +352,11 @@ def test_subset_set_columns(backend, using_copy_on_write, warn_copy_on_write, dt
    df_orig = df.copy()
    subset = df[1:3]

-    if using_copy_on_write or warn_copy_on_write:
-        subset[["a", "c"]] = 0
-    else:
-        with pd.option_context("chained_assignment", "warn"):
-            with tm.assert_produces_warning(SettingWithCopyWarning):
-                subset[["a", "c"]] = 0
+    subset[["a", "c"]] = 0

    subset._mgr._verify_integrity()
-    if using_copy_on_write:
-        # first and third column should certainly have no references anymore
-        assert all(subset._mgr._has_no_reference(i) for i in [0, 2])
+    # first and third column should certainly have no references anymore
+    assert all(subset._mgr._has_no_reference(i) for i in [0, 2])
    expected = DataFrame({"a": [0, 0], "b": [5, 6], "c": [0, 0]}, index=range(1, 3))
    if dtype_backend == "nullable":
        # there is not yet a global option, so overriding a column by setting a scalar
@@ -538,9 +373,7 @@
    [slice("a", "b"), np.array([True, True, False]), ["a", "b"]],
    ids=["slice", "mask", "array"],
)
-def test_subset_set_with_column_indexer(
-    backend, indexer, using_copy_on_write, warn_copy_on_write
-):
+def test_subset_set_with_column_indexer(backend, indexer):
    # Case: setting multiple columns with a column indexer on a viewing subset
    # -> subset.loc[:, [col1, col2]] = value
    _, DataFrame, _ = backend
@@ -548,28 +381,12 @@
    df_orig = df.copy()
    subset = df[1:3]

-    if using_copy_on_write:
-        subset.loc[:, indexer] = 0
-    elif warn_copy_on_write:
-        with tm.assert_cow_warning():
-            subset.loc[:, indexer] = 0
-    else:
-        with pd.option_context("chained_assignment", "warn"):
-            # As of 2.0, this setitem attempts (successfully) to set values
-            # inplace, so the assignment is not chained.
-            subset.loc[:, indexer] = 0
+    subset.loc[:, indexer] = 0

    subset._mgr._verify_integrity()
    expected = DataFrame({"a": [0, 0], "b": [0.0, 0.0], "c": [5, 6]}, index=range(1, 3))
    tm.assert_frame_equal(subset, expected)

-    if using_copy_on_write:
-        tm.assert_frame_equal(df, df_orig)
-    else:
-        # pre-2.0, in the mixed case with BlockManager, only column "a"
-        # would be mutated in the parent frame. this changed with the
-        # enforcement of GH#45333
-        df_orig.loc[1:2, ["a", "b"]] = 0
-        tm.assert_frame_equal(df, df_orig)
+    tm.assert_frame_equal(df, df_orig)
@@ -599,9 +416,6 @@ def test_subset_chained_getitem(
    backend,
    method,
    dtype,
-    using_copy_on_write,
-    using_array_manager,
-    warn_copy_on_write,
):
    # Case: creating a subset using multiple, chained getitem calls using views
    # still needs to guarantee proper CoW behaviour
@@ -611,48 +425,23 @@
    )
    df_orig = df.copy()

-    # when not using CoW, it depends on whether we have a single block or not
-    # and whether we are slicing the columns -> in that case we have a view
-    test_callspec = request.node.callspec.id
-    if not using_array_manager:
-        subset_is_view = test_callspec in (
-            "numpy-single-block-column-iloc-slice",
-            "numpy-single-block-column-loc-slice",
-        )
-    else:
-        # with ArrayManager, it doesn't matter whether we have
-        # single vs mixed block or numpy vs nullable dtypes
-        subset_is_view = test_callspec.endswith(
-            ("column-iloc-slice", "column-loc-slice")
-        )
-
    # modify subset -> don't modify parent
    subset = method(df)
-    with tm.assert_cow_warning(warn_copy_on_write and subset_is_view):
-        subset.iloc[0, 0] = 0
-    if using_copy_on_write or (not subset_is_view):
-        tm.assert_frame_equal(df, df_orig)
-    else:
-        assert df.iloc[0, 0] == 0
+    subset.iloc[0, 0] = 0
+    tm.assert_frame_equal(df, df_orig)

    # modify parent -> don't modify subset
    subset = method(df)
-    with tm.assert_cow_warning(warn_copy_on_write and subset_is_view):
-        df.iloc[0, 0] = 0
+    df.iloc[0, 0] = 0
    expected = DataFrame({"a": [1, 2], "b": [4, 5]})
-    if using_copy_on_write or not subset_is_view:
-        tm.assert_frame_equal(subset, expected)
-    else:
-        assert subset.iloc[0, 0] == 0
+    tm.assert_frame_equal(subset, expected)


@pytest.mark.parametrize(
    "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
)
-def test_subset_chained_getitem_column(
-    backend, dtype, using_copy_on_write, warn_copy_on_write
-):
+def test_subset_chained_getitem_column(backend, dtype):
    # Case: creating a subset using multiple, chained getitem calls using views
    # still needs to guarantee proper CoW behaviour
    dtype_backend, DataFrame, Series = backend
@@ -663,24 +452,14 @@

    # modify subset -> don't modify parent
    subset = df[:]["a"][0:2]
-    df._clear_item_cache()
-    with tm.assert_cow_warning(warn_copy_on_write):
-        subset.iloc[0] = 0
-    if using_copy_on_write:
-        tm.assert_frame_equal(df, df_orig)
-    else:
-        assert df.iloc[0, 0] == 0
+    subset.iloc[0] = 0
+    tm.assert_frame_equal(df, df_orig)

    # modify parent -> don't modify subset
    subset = df[:]["a"][0:2]
-    df._clear_item_cache()
-    with tm.assert_cow_warning(warn_copy_on_write):
-        df.iloc[0, 0] = 0
+    df.iloc[0, 0] = 0
    expected = Series([1, 2], name="a")
-    if using_copy_on_write:
-        tm.assert_series_equal(subset, expected)
-    else:
-        assert subset.iloc[0] == 0
+    tm.assert_series_equal(subset, expected)
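Chained getitem calls are the classic way such writes used to leak into the parent; each link in the chain is now CoW-protected. A sketch under the same assumptions:

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [4, 5]})
df_orig = df.copy()

subset = df[:]["a"][0:2]  # three chained views
subset.iloc[0] = 0         # mutating the end of the chain ...
assert df.equals(df_orig)  # ... never reaches the parent

subset = df[:]["a"][0:2]
df.iloc[0, 0] = 100               # and mutating the parent ...
assert subset.tolist() == [1, 2]  # ... leaves the subset untouched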
@@ -697,9 +476,7 @@ def test_subset_chained_getitem_column(
    ],
    ids=["getitem", "iloc", "loc", "long-chain"],
)
-def test_subset_chained_getitem_series(
-    backend, method, using_copy_on_write, warn_copy_on_write
-):
+def test_subset_chained_getitem_series(backend, method):
    # Case: creating a subset using multiple, chained getitem calls using views
    # still needs to guarantee proper CoW behaviour
    _, _, Series = backend
@@ -708,49 +485,31 @@

    # modify subset -> don't modify parent
    subset = method(s)
-    with tm.assert_cow_warning(warn_copy_on_write):
-        subset.iloc[0] = 0
-    if using_copy_on_write:
-        tm.assert_series_equal(s, s_orig)
-    else:
-        assert s.iloc[0] == 0
+    subset.iloc[0] = 0
+    tm.assert_series_equal(s, s_orig)

    # modify parent -> don't modify subset
    subset = s.iloc[0:3].iloc[0:2]
-    with tm.assert_cow_warning(warn_copy_on_write):
-        s.iloc[0] = 0
+    s.iloc[0] = 0
    expected = Series([1, 2], index=["a", "b"])
-    if using_copy_on_write:
-        tm.assert_series_equal(subset, expected)
-    else:
-        assert subset.iloc[0] == 0
+    tm.assert_series_equal(subset, expected)


-def test_subset_chained_single_block_row(
-    using_copy_on_write, using_array_manager, warn_copy_on_write
-):
+def test_subset_chained_single_block_row():
    # not parametrizing this for dtype backend, since this explicitly tests single block
    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})
    df_orig = df.copy()

    # modify subset -> don't modify parent
    subset = df[:].iloc[0].iloc[0:2]
-    with tm.assert_cow_warning(warn_copy_on_write):
-        subset.iloc[0] = 0
-    if using_copy_on_write or using_array_manager:
-        tm.assert_frame_equal(df, df_orig)
-    else:
-        assert df.iloc[0, 0] == 0
+    subset.iloc[0] = 0
+    tm.assert_frame_equal(df, df_orig)

    # modify parent -> don't modify subset
    subset = df[:].iloc[0].iloc[0:2]
-    with tm.assert_cow_warning(warn_copy_on_write):
-        df.iloc[0, 0] = 0
+    df.iloc[0, 0] = 0
    expected = Series([1, 4], index=["a", "b"], name=0)
-    if using_copy_on_write or using_array_manager:
-        tm.assert_series_equal(subset, expected)
-    else:
-        assert subset.iloc[0] == 0
+    tm.assert_series_equal(subset, expected)


@@ -764,7 +523,7 @@ def test_subset_chained_single_block_row(
    ],
    ids=["getitem", "loc", "loc-rows", "iloc", "iloc-rows"],
)
-def test_null_slice(backend, method, using_copy_on_write, warn_copy_on_write):
+def test_null_slice(backend, method):
    # Case: also all variants of indexing with a null slice (:) should return
    # new objects to ensure we correctly use CoW for the results
    dtype_backend, DataFrame, _ = backend
@@ -777,12 +536,8 @@
    assert df2 is not df

    # and those trigger CoW when mutated
-    with tm.assert_cow_warning(warn_copy_on_write):
-        df2.iloc[0, 0] = 0
-    if using_copy_on_write:
-        tm.assert_frame_equal(df, df_orig)
-    else:
-        assert df.iloc[0, 0] == 0
+    df2.iloc[0, 0] = 0
+    tm.assert_frame_equal(df, df_orig)


@@ -794,7 +549,7 @@
    ],
    ids=["getitem", "loc", "iloc"],
)
-def test_null_slice_series(backend, method, using_copy_on_write, warn_copy_on_write):
+def test_null_slice_series(backend, method):
    _, _, Series = backend
    s = Series([1, 2, 3], index=["a", "b", "c"])
    s_orig = s.copy()
@@ -805,12 +560,8 @@
    assert s2 is not s

    # and those trigger CoW when mutated
-    with tm.assert_cow_warning(warn_copy_on_write):
-        s2.iloc[0] = 0
-    if using_copy_on_write:
-        tm.assert_series_equal(s, s_orig)
-    else:
-        assert s.iloc[0] == 0
+    s2.iloc[0] = 0
+    tm.assert_series_equal(s, s_orig)

    # TODO add more tests modifying the parent


@@ -820,7 +571,7 @@
# Series -- Indexing operations taking subset + modifying the subset/parent


-def test_series_getitem_slice(backend, using_copy_on_write, warn_copy_on_write):
+def test_series_getitem_slice(backend):
    # Case: taking a slice of a Series + afterwards modifying the subset
    _, _, Series = backend
    s = Series([1, 2, 3], index=["a", "b", "c"])
@@ -829,24 +580,18 @@

    subset = s[:]
    assert np.shares_memory(get_array(subset), get_array(s))

-    with tm.assert_cow_warning(warn_copy_on_write):
-        subset.iloc[0] = 0
+    subset.iloc[0] = 0

-    if using_copy_on_write:
-        assert not np.shares_memory(get_array(subset), get_array(s))
+    assert not np.shares_memory(get_array(subset), get_array(s))

    expected = Series([0, 2, 3], index=["a", "b", "c"])
    tm.assert_series_equal(subset, expected)

-    if using_copy_on_write:
-        # original parent series is not modified (CoW)
-        tm.assert_series_equal(s, s_orig)
-    else:
-        # original parent series is actually updated
-        assert s.iloc[0] == 0
+    # original parent series is not modified (CoW)
+    tm.assert_series_equal(s, s_orig)
-def test_series_getitem_ellipsis(using_copy_on_write, warn_copy_on_write):
+def test_series_getitem_ellipsis():
    # Case: taking a view of a Series using Ellipsis + afterwards modifying the subset
    s = Series([1, 2, 3])
    s_orig = s.copy()
@@ -854,21 +599,15 @@

    subset = s[...]
    assert np.shares_memory(get_array(subset), get_array(s))

-    with tm.assert_cow_warning(warn_copy_on_write):
-        subset.iloc[0] = 0
+    subset.iloc[0] = 0

-    if using_copy_on_write:
-        assert not np.shares_memory(get_array(subset), get_array(s))
+    assert not np.shares_memory(get_array(subset), get_array(s))

    expected = Series([0, 2, 3])
    tm.assert_series_equal(subset, expected)

-    if using_copy_on_write:
-        # original parent series is not modified (CoW)
-        tm.assert_series_equal(s, s_orig)
-    else:
-        # original parent series is actually updated
-        assert s.iloc[0] == 0
+    # original parent series is not modified (CoW)
+    tm.assert_series_equal(s, s_orig)


@@ -876,43 +615,35 @@
    [slice(0, 2), np.array([True, True, False]), np.array([0, 1])],
    ids=["slice", "mask", "array"],
)
-def test_series_subset_set_with_indexer(
-    backend, indexer_si, indexer, using_copy_on_write, warn_copy_on_write
-):
+def test_series_subset_set_with_indexer(backend, indexer_si, indexer):
    # Case: setting values in a viewing Series with an indexer
    _, _, Series = backend
    s = Series([1, 2, 3], index=["a", "b", "c"])
    s_orig = s.copy()
    subset = s[:]

-    warn = None
-    msg = "Series.__setitem__ treating keys as positions is deprecated"
    if (
        indexer_si is tm.setitem
        and isinstance(indexer, np.ndarray)
        and indexer.dtype.kind == "i"
    ):
-        warn = FutureWarning
-    if warn_copy_on_write:
-        with tm.assert_cow_warning(raise_on_extra_warnings=warn is not None):
-            indexer_si(subset)[indexer] = 0
-    else:
-        with tm.assert_produces_warning(warn, match=msg):
+        # In 3.0 we treat integers as always-labels
+        with pytest.raises(KeyError):
            indexer_si(subset)[indexer] = 0
+        return
+
+    indexer_si(subset)[indexer] = 0
    expected = Series([0, 0, 3], index=["a", "b", "c"])
    tm.assert_series_equal(subset, expected)

-    if using_copy_on_write:
-        tm.assert_series_equal(s, s_orig)
-    else:
-        tm.assert_series_equal(s, expected)
+    tm.assert_series_equal(s, s_orig)


# -----------------------------------------------------------------------------
# del operator


-def test_del_frame(backend, using_copy_on_write, warn_copy_on_write):
+def test_del_frame(backend):
    # Case: deleting a column with `del` on a viewing child dataframe should
    # not modify parent + update the references
    dtype_backend, DataFrame, _ = backend
@@ -929,18 +660,13 @@
    tm.assert_frame_equal(df2, df_orig[["a", "c"]])
    df2._mgr._verify_integrity()

-    with tm.assert_cow_warning(warn_copy_on_write and dtype_backend == "numpy"):
-        df.loc[0, "b"] = 200
+    df.loc[0, "b"] = 200
    assert np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
    df_orig = df.copy()

-    with tm.assert_cow_warning(warn_copy_on_write):
-        df2.loc[0, "a"] = 100
-    if using_copy_on_write:
-        # modifying child after deleting a column still doesn't update parent
-        tm.assert_frame_equal(df, df_orig)
-    else:
-        assert df.loc[0, "a"] == 100
+    df2.loc[0, "a"] = 100
+    # modifying child after deleting a column still doesn't update parent
+    tm.assert_frame_equal(df, df_orig)


def test_del_series(backend):
@@ -967,9 +693,7 @@
# Accessing column as Series


-def test_column_as_series(
-    backend, using_copy_on_write, warn_copy_on_write, using_array_manager
-):
+def test_column_as_series(backend):
    # Case: selecting a single column now also uses Copy-on-Write
    dtype_backend, DataFrame, Series = backend
    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
@@ -978,34 +702,17 @@

    s = df["a"]

    assert np.shares_memory(get_array(s, "a"), get_array(df, "a"))
-
-    if using_copy_on_write or using_array_manager:
-        s[0] = 0
-    else:
-        if warn_copy_on_write:
-            with tm.assert_cow_warning():
-                s[0] = 0
-        else:
-            warn = SettingWithCopyWarning if dtype_backend == "numpy" else None
-            with pd.option_context("chained_assignment", "warn"):
-                with tm.assert_produces_warning(warn):
-                    s[0] = 0
+    s[0] = 0

    expected = Series([0, 2, 3], name="a")
    tm.assert_series_equal(s, expected)

-    if using_copy_on_write:
-        # assert not np.shares_memory(s.values, get_array(df, "a"))
-        tm.assert_frame_equal(df, df_orig)
-        # ensure cached series on getitem is not the changed series
-        tm.assert_series_equal(df["a"], df_orig["a"])
-    else:
-        df_orig.iloc[0, 0] = 0
-        tm.assert_frame_equal(df, df_orig)
+    # assert not np.shares_memory(s.values, get_array(df, "a"))
+    tm.assert_frame_equal(df, df_orig)
+    # ensure cached series on getitem is not the changed series
+    tm.assert_series_equal(df["a"], df_orig["a"])
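Selecting a single column gives the same guarantee, and without the old item cache every selection hands out a fresh object. A sketch under the same assumptions:

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
s = df["a"]    # a lazy view on the column
s.iloc[0] = 0  # CoW detaches the Series instead of writing through
assert df.loc[0, "a"] == 1
# no item cache: repeated selection returns independent objects
assert df["a"] is not df["a"]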
-def test_column_as_series_set_with_upcast(
-    backend, using_copy_on_write, using_array_manager, warn_copy_on_write
-):
+def test_column_as_series_set_with_upcast(backend):
    # Case: selecting a single column now also uses Copy-on-Write -> when
    # setting a value causes an upcast, we don't need to update the parent
    # DataFrame through the cache mechanism
@@ -1015,37 +722,16 @@

    s = df["a"]
    if dtype_backend == "nullable":
-        with tm.assert_cow_warning(warn_copy_on_write):
-            with pytest.raises(TypeError, match="Invalid value"):
-                s[0] = "foo"
-        expected = Series([1, 2, 3], name="a")
-    elif using_copy_on_write or warn_copy_on_write or using_array_manager:
-        # TODO(CoW-warn) assert the FutureWarning for CoW is also raised
-        with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"):
+        with pytest.raises(TypeError, match="Invalid value"):
            s[0] = "foo"
-        expected = Series(["foo", 2, 3], dtype=object, name="a")
-    else:
-        with pd.option_context("chained_assignment", "warn"):
-            msg = "|".join(
-                [
-                    "A value is trying to be set on a copy of a slice from a DataFrame",
-                    "Setting an item of incompatible dtype is deprecated",
-                ]
-            )
-            with tm.assert_produces_warning(
-                (SettingWithCopyWarning, FutureWarning), match=msg
-            ):
-                s[0] = "foo"
-        expected = Series(["foo", 2, 3], dtype=object, name="a")
-
-    tm.assert_series_equal(s, expected)
-    if using_copy_on_write:
+        expected = Series([1, 2, 3], name="a")
+        tm.assert_series_equal(s, expected)
        tm.assert_frame_equal(df, df_orig)
        # ensure cached series on getitem is not the changed series
        tm.assert_series_equal(df["a"], df_orig["a"])
    else:
-        df_orig["a"] = expected
-        tm.assert_frame_equal(df, df_orig)
+        with pytest.raises(TypeError, match="Invalid value"):
+            s[0] = "foo"


@@ -1057,14 +743,7 @@ def test_column_as_series_set_with_upcast(
    ],
    ids=["getitem", "loc", "iloc"],
)
-def test_column_as_series_no_item_cache(
-    request,
-    backend,
-    method,
-    using_copy_on_write,
-    warn_copy_on_write,
-    using_array_manager,
-):
+def test_column_as_series_no_item_cache(request, backend, method):
    # Case: selecting a single column (which now also uses Copy-on-Write to protect
    # the view) should always give a new object (i.e. not make use of a cache)
    dtype_backend, DataFrame, _ = backend
@@ -1074,34 +753,18 @@

    s1 = method(df)
    s2 = method(df)

-    is_iloc = "iloc" in request.node.name
-    if using_copy_on_write or warn_copy_on_write or is_iloc:
-        assert s1 is not s2
-    else:
-        assert s1 is s2
+    assert s1 is not s2

-    if using_copy_on_write or using_array_manager:
-        s1.iloc[0] = 0
-    elif warn_copy_on_write:
-        with tm.assert_cow_warning():
-            s1.iloc[0] = 0
-    else:
-        warn = SettingWithCopyWarning if dtype_backend == "numpy" else None
-        with pd.option_context("chained_assignment", "warn"):
-            with tm.assert_produces_warning(warn):
-                s1.iloc[0] = 0
+    s1.iloc[0] = 0

-    if using_copy_on_write:
-        tm.assert_series_equal(s2, df_orig["a"])
-        tm.assert_frame_equal(df, df_orig)
-    else:
-        assert s2.iloc[0] == 0
+    tm.assert_series_equal(s2, df_orig["a"])
+    tm.assert_frame_equal(df, df_orig)


# TODO add tests for other indexing methods on the Series


-def test_dataframe_add_column_from_series(backend, using_copy_on_write):
+def test_dataframe_add_column_from_series(backend):
    # Case: adding a new column to a DataFrame from an existing column/series
    # -> delays copy under CoW
    _, DataFrame, Series = backend
@@ -1109,10 +772,7 @@
    s = Series([10, 11, 12])

    df["new"] = s
-    if using_copy_on_write:
-        assert np.shares_memory(get_array(df, "new"), get_array(s))
-    else:
-        assert not np.shares_memory(get_array(df, "new"), get_array(s))
+    assert np.shares_memory(get_array(df, "new"), get_array(s))

    # editing series -> doesn't modify column in frame
    s[0] = 0
@@ -1135,106 +795,89 @@
@pytest.mark.parametrize(
    "col", [[0.1, 0.2, 0.3], [7, 8, 9]], ids=["mixed-block", "single-block"]
)
-def test_set_value_copy_only_necessary_column(
-    using_copy_on_write, warn_copy_on_write, indexer_func, indexer, val, col
-):
+def test_set_value_copy_only_necessary_column(indexer_func, indexer, val, col):
    # When setting inplace, only copy column that is modified instead of the whole
    # block (by splitting the block)
    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": col})
    df_orig = df.copy()
    view = df[:]

-    if val == "a" and indexer[0] != slice(None):
-        with tm.assert_produces_warning(
-            FutureWarning, match="Setting an item of incompatible dtype is deprecated"
-        ):
+    if val == "a":
+        with pytest.raises(TypeError, match="Invalid value"):
            indexer_func(df)[indexer] = val
    else:
-        with tm.assert_cow_warning(warn_copy_on_write and val == 100):
-            indexer_func(df)[indexer] = val
+        indexer_func(df)[indexer] = val

-    if using_copy_on_write:
        assert np.shares_memory(get_array(df, "b"), get_array(view, "b"))
        assert not np.shares_memory(get_array(df, "a"), get_array(view, "a"))
        tm.assert_frame_equal(view, df_orig)
-    else:
-        assert np.shares_memory(get_array(df, "c"), get_array(view, "c"))
-        if val == "a":
-            assert not np.shares_memory(get_array(df, "a"), get_array(view, "a"))
-        else:
-            assert np.shares_memory(get_array(df, "a"), get_array(view, "a"))


-def test_series_midx_slice(using_copy_on_write, warn_copy_on_write):
+def test_series_midx_slice():
    ser = Series([1, 2, 3], index=pd.MultiIndex.from_arrays([[1, 1, 2], [3, 4, 5]]))
    ser_orig = ser.copy()
    result = ser[1]
    assert np.shares_memory(get_array(ser), get_array(result))
-    with tm.assert_cow_warning(warn_copy_on_write):
-        result.iloc[0] = 100
-    if using_copy_on_write:
-        tm.assert_series_equal(ser, ser_orig)
-    else:
-        expected = Series(
-            [100, 2, 3], index=pd.MultiIndex.from_arrays([[1, 1, 2], [3, 4, 5]])
-        )
-        tm.assert_series_equal(ser, expected)
+    result.iloc[0] = 100
+    tm.assert_series_equal(ser, ser_orig)


-def test_getitem_midx_slice(
-    using_copy_on_write, warn_copy_on_write, using_array_manager
-):
+def test_getitem_midx_slice():
    df = DataFrame({("a", "x"): [1, 2], ("a", "y"): 1, ("b", "x"): 2})
    df_orig = df.copy()
    new_df = df[("a",)]

-    if using_copy_on_write:
-        assert not new_df._mgr._has_no_reference(0)
+    assert not new_df._mgr._has_no_reference(0)

-    if not using_array_manager:
-        assert np.shares_memory(get_array(df, ("a", "x")), get_array(new_df, "x"))
-    if using_copy_on_write:
-        new_df.iloc[0, 0] = 100
-        tm.assert_frame_equal(df_orig, df)
-    else:
-        if warn_copy_on_write:
-            with tm.assert_cow_warning():
-                new_df.iloc[0, 0] = 100
-        else:
-            with pd.option_context("chained_assignment", "warn"):
-                with tm.assert_produces_warning(SettingWithCopyWarning):
-                    new_df.iloc[0, 0] = 100
-        assert df.iloc[0, 0] == 100
+    assert np.shares_memory(get_array(df, ("a", "x")), get_array(new_df, "x"))
+    new_df.iloc[0, 0] = 100
+    tm.assert_frame_equal(df_orig, df)


-def test_series_midx_tuples_slice(using_copy_on_write, warn_copy_on_write):
+def test_series_midx_tuples_slice():
    ser = Series(
        [1, 2, 3],
        index=pd.MultiIndex.from_tuples([((1, 2), 3), ((1, 2), 4), ((2, 3), 4)]),
    )
    result = ser[(1, 2)]
    assert np.shares_memory(get_array(ser), get_array(result))
-    with tm.assert_cow_warning(warn_copy_on_write):
-        result.iloc[0] = 100
-    if using_copy_on_write:
-        expected = Series(
-            [1, 2, 3],
-            index=pd.MultiIndex.from_tuples([((1, 2), 3), ((1, 2), 4), ((2, 3), 4)]),
-        )
-        tm.assert_series_equal(ser, expected)
+    result.iloc[0] = 100
+    expected = Series(
+        [1, 2, 3],
+        index=pd.MultiIndex.from_tuples([((1, 2), 3), ((1, 2), 4), ((2, 3), 4)]),
+    )
+    tm.assert_series_equal(ser, expected)


+def test_midx_read_only_bool_indexer():
+    # GH#56635
+    def mklbl(prefix, n):
+        return [f"{prefix}{i}" for i in range(n)]

-def test_loc_enlarging_with_dataframe(using_copy_on_write):
+    idx = pd.MultiIndex.from_product(
+        [mklbl("A", 4), mklbl("B", 2), mklbl("C", 4), mklbl("D", 2)]
+    )
+    cols = pd.MultiIndex.from_tuples(
+        [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")], names=["lvl0", "lvl1"]
+    )
+    df = DataFrame(1, index=idx, columns=cols).sort_index().sort_index(axis=1)
+
+    mask = df[("a", "foo")] == 1
+    expected_mask = mask.copy()
+    result = df.loc[pd.IndexSlice[mask, :, ["C1", "C3"]], :]
+    expected = df.loc[pd.IndexSlice[:, :, ["C1", "C3"]], :]
+    tm.assert_frame_equal(result, expected)
+    tm.assert_series_equal(mask, expected_mask)
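The MultiIndex cases reduce to the same rule: level selection returns a view, and the first write detaches it. A sketch under the same assumptions:

import pandas as pd

midx = pd.MultiIndex.from_arrays([[1, 1, 2], [3, 4, 5]])
ser = pd.Series([1, 2, 3], index=midx)

result = ser[1]       # select first-level label 1 -> a view
result.iloc[0] = 100  # the write detaches the result
assert ser.tolist() == [1, 2, 3]  # parent Series is unchanged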
+
+
+def test_loc_enlarging_with_dataframe():
    df = DataFrame({"a": [1, 2, 3]})
    rhs = DataFrame({"b": [1, 2, 3], "c": [4, 5, 6]})
    rhs_orig = rhs.copy()
    df.loc[:, ["b", "c"]] = rhs
-    if using_copy_on_write:
-        assert np.shares_memory(get_array(df, "b"), get_array(rhs, "b"))
-        assert np.shares_memory(get_array(df, "c"), get_array(rhs, "c"))
-        assert not df._mgr._has_no_reference(1)
-    else:
-        assert not np.shares_memory(get_array(df, "b"), get_array(rhs, "b"))
+    assert np.shares_memory(get_array(df, "b"), get_array(rhs, "b"))
+    assert np.shares_memory(get_array(df, "c"), get_array(rhs, "c"))
+    assert not df._mgr._has_no_reference(1)

    df.iloc[0, 1] = 100
    tm.assert_frame_equal(rhs, rhs_orig)
diff --git a/pandas/tests/copy_view/test_internals.py b/pandas/tests/copy_view/test_internals.py
index a727331307d7e..b7baf01ecc36e 100644
--- a/pandas/tests/copy_view/test_internals.py
+++ b/pandas/tests/copy_view/test_internals.py
@@ -1,16 +1,15 @@
import numpy as np
import pytest

-import pandas.util._test_decorators as td
-
-import pandas as pd
-from pandas import DataFrame
+from pandas import (
+    DataFrame,
+    Series,
+)
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array


-@td.skip_array_manager_invalid_test
-def test_consolidate(using_copy_on_write):
+def test_consolidate():
    # create unconsolidated DataFrame
    df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]})
    df["c"] = [4, 5, 6]
@@ -39,43 +38,11 @@
    assert not df._mgr.blocks[2].refs.has_reference()

    # and modifying subset still doesn't modify parent
-    if using_copy_on_write:
-        subset.iloc[0, 1] = 0.0
-        assert not df._mgr.blocks[1].refs.has_reference()
-        assert df.loc[0, "b"] == 0.1
-
-
-@pytest.mark.single_cpu
-@td.skip_array_manager_invalid_test
-def test_switch_options():
-    # ensure we can switch the value of the option within one session
-    # (assuming data is constructed after switching)
-
-    # using the option_context to ensure we set back to global option value
-    # after running the test
-    with pd.option_context("mode.copy_on_write", False):
-        df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]})
-        subset = df[:]
-        subset.iloc[0, 0] = 0
-        # df updated with CoW disabled
-        assert df.iloc[0, 0] == 0
-
-    pd.options.mode.copy_on_write = True
-    df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]})
-    subset = df[:]
-    subset.iloc[0, 0] = 0
-    # df not updated with CoW enabled
-    assert df.iloc[0, 0] == 1
-
-    pd.options.mode.copy_on_write = False
-    df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]})
-    subset = df[:]
-    subset.iloc[0, 0] = 0
-    # df updated with CoW disabled
-    assert df.iloc[0, 0] == 0
-
-
-@td.skip_array_manager_invalid_test
+    subset.iloc[0, 1] = 0.0
+    assert not df._mgr.blocks[1].refs.has_reference()
+    assert df.loc[0, "b"] == 0.1
+
+
@pytest.mark.parametrize("dtype", [np.intp, np.int8])
@pytest.mark.parametrize(
    "locs, arr",
@@ -88,10 +55,9 @@ def test_switch_options():
        ([0, 1, 2], np.array([[-1, -2, -3], [-4, -5, -6], [-4, -5, -6]]).T),
        ([1, 2], np.array([[-1, -2, -3], [-4, -5, -6]]).T),
        ([1, 3], np.array([[-1, -2, -3], [-4, -5, -6]]).T),
-        ([1, 3], np.array([[-1, -2, -3], [-4, -5, -6]]).T),
    ],
)
-def test_iset_splits_blocks_inplace(using_copy_on_write, locs, arr, dtype):
+def test_iset_splits_blocks_inplace(locs, arr, dtype):
    # Nothing currently calls iset with
    # more than 1 loc with inplace=True (only happens with inplace=False)
    # but ensure that it works
@@ -102,23 +68,18 @@ def test_iset_splits_blocks_inplace(using_copy_on_write, locs, arr, dtype):
            "c": [7, 8, 9],
            "d": [10, 11, 12],
            "e": [13, 14, 15],
-            "f": ["a", "b", "c"],
+            "f": Series(["a", "b", "c"], dtype=object),
        },
    )
    arr = arr.astype(dtype)
    df_orig = df.copy()
-    df2 = df.copy(deep=None)  # Trigger a CoW (if enabled, otherwise makes copy)
+    df2 = df.copy(deep=False)  # Trigger a CoW (if enabled, otherwise makes copy)
    df2._mgr.iset(locs, arr, inplace=True)

    tm.assert_frame_equal(df, df_orig)
-
-    if using_copy_on_write:
-        for i, col in enumerate(df.columns):
-            if i not in locs:
-                assert np.shares_memory(get_array(df, col), get_array(df2, col))
-    else:
-        for col in df.columns:
-            assert not np.shares_memory(get_array(df, col), get_array(df2, col))
+    for i, col in enumerate(df.columns):
+        if i not in locs:
+            assert np.shares_memory(get_array(df, col), get_array(df2, col))


def test_exponential_backoff():
diff --git a/pandas/tests/copy_view/test_interp_fillna.py b/pandas/tests/copy_view/test_interp_fillna.py
index ddc5879a56d54..6bcda0ef2c35a 100644
--- a/pandas/tests/copy_view/test_interp_fillna.py
+++ b/pandas/tests/copy_view/test_interp_fillna.py
@@ -3,60 +3,48 @@
from pandas import (
    NA,
-    ArrowDtype,
    DataFrame,
    Interval,
    NaT,
    Series,
    Timestamp,
    interval_range,
-    option_context,
)
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array


@pytest.mark.parametrize("method", ["pad", "nearest", "linear"])
-def test_interpolate_no_op(using_copy_on_write, method):
+def test_interpolate_no_op(method):
    df = DataFrame({"a": [1, 2]})
    df_orig = df.copy()

-    warn = None
    if method == "pad":
-        warn = FutureWarning
-    msg = "DataFrame.interpolate with method=pad is deprecated"
-    with tm.assert_produces_warning(warn, match=msg):
+        msg = f"Can not interpolate with method={method}"
+        with pytest.raises(ValueError, match=msg):
+            df.interpolate(method=method)
+    else:
        result = df.interpolate(method=method)
-
-    if using_copy_on_write:
        assert np.shares_memory(get_array(result, "a"), get_array(df, "a"))
-    else:
-        assert not np.shares_memory(get_array(result, "a"), get_array(df, "a"))

-    result.iloc[0, 0] = 100
+        result.iloc[0, 0] = 100

-    if using_copy_on_write:
        assert not np.shares_memory(get_array(result, "a"), get_array(df, "a"))

-    tm.assert_frame_equal(df, df_orig)
+        tm.assert_frame_equal(df, df_orig)
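For the fill/interpolate family, a no-op call now returns a lazy copy rather than duplicating the data. A sketch under the same assumptions as above:

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, 2.0]})  # nothing to interpolate
result = df.interpolate(method="linear")
assert np.shares_memory(result["a"].to_numpy(), df["a"].to_numpy())

result.iloc[0, 0] = 100.0  # the copy happens on first write
assert df.loc[0, "a"] == 1.0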
@pytest.mark.parametrize("func", ["ffill", "bfill"])
-def test_interp_fill_functions(using_copy_on_write, func):
+def test_interp_fill_functions(func):
    # Check that these takes the same code paths as interpolate
    df = DataFrame({"a": [1, 2]})
    df_orig = df.copy()

    result = getattr(df, func)()

-    if using_copy_on_write:
-        assert np.shares_memory(get_array(result, "a"), get_array(df, "a"))
-    else:
-        assert not np.shares_memory(get_array(result, "a"), get_array(df, "a"))
-
+    assert np.shares_memory(get_array(result, "a"), get_array(df, "a"))
    result.iloc[0, 0] = 100

-    if using_copy_on_write:
-        assert not np.shares_memory(get_array(result, "a"), get_array(df, "a"))
+    assert not np.shares_memory(get_array(result, "a"), get_array(df, "a"))
    tm.assert_frame_equal(df, df_orig)


@@ -64,220 +52,161 @@
@pytest.mark.parametrize(
    "vals", [[1, np.nan, 2], [Timestamp("2019-12-31"), NaT, Timestamp("2020-12-31")]]
)
-def test_interpolate_triggers_copy(using_copy_on_write, vals, func):
+def test_interpolate_triggers_copy(vals, func):
    df = DataFrame({"a": vals})
    result = getattr(df, func)()

    assert not np.shares_memory(get_array(result, "a"), get_array(df, "a"))
-    if using_copy_on_write:
-        # Check that we don't have references when triggering a copy
-        assert result._mgr._has_no_reference(0)
+    # Check that we don't have references when triggering a copy
+    assert result._mgr._has_no_reference(0)


@pytest.mark.parametrize(
    "vals", [[1, np.nan, 2], [Timestamp("2019-12-31"), NaT, Timestamp("2020-12-31")]]
)
-def test_interpolate_inplace_no_reference_no_copy(using_copy_on_write, vals):
+def test_interpolate_inplace_no_reference_no_copy(vals):
    df = DataFrame({"a": vals})
    arr = get_array(df, "a")
    df.interpolate(method="linear", inplace=True)

    assert np.shares_memory(arr, get_array(df, "a"))
-    if using_copy_on_write:
-        # Check that we don't have references when triggering a copy
-        assert df._mgr._has_no_reference(0)
+    # Check that we don't have references when triggering a copy
+    assert df._mgr._has_no_reference(0)


@pytest.mark.parametrize(
    "vals", [[1, np.nan, 2], [Timestamp("2019-12-31"), NaT, Timestamp("2020-12-31")]]
)
-def test_interpolate_inplace_with_refs(using_copy_on_write, vals, warn_copy_on_write):
+def test_interpolate_inplace_with_refs(vals):
    df = DataFrame({"a": [1, np.nan, 2]})
    df_orig = df.copy()
    arr = get_array(df, "a")
    view = df[:]
-    with tm.assert_cow_warning(warn_copy_on_write):
-        df.interpolate(method="linear", inplace=True)
-
-    if using_copy_on_write:
-        # Check that copy was triggered in interpolate and that we don't
-        # have any references left
-        assert not np.shares_memory(arr, get_array(df, "a"))
-        tm.assert_frame_equal(df_orig, view)
-        assert df._mgr._has_no_reference(0)
-        assert view._mgr._has_no_reference(0)
-    else:
-        assert np.shares_memory(arr, get_array(df, "a"))
+    df.interpolate(method="linear", inplace=True)
+    # Check that copy was triggered in interpolate and that we don't
+    # have any references left
+    assert not np.shares_memory(arr, get_array(df, "a"))
+    tm.assert_frame_equal(df_orig, view)
+    assert df._mgr._has_no_reference(0)
+    assert view._mgr._has_no_reference(0)


@pytest.mark.parametrize("func", ["ffill", "bfill"])
@pytest.mark.parametrize("dtype", ["float64", "Float64"])
-def test_interp_fill_functions_inplace(
-    using_copy_on_write, func, warn_copy_on_write, dtype
-):
+def test_interp_fill_functions_inplace(func, dtype):
    # Check that these takes the same code paths as interpolate
    df = DataFrame({"a": [1, np.nan, 2]}, dtype=dtype)
    df_orig = df.copy()
    arr = get_array(df, "a")
    view = df[:]

-    with tm.assert_cow_warning(warn_copy_on_write and dtype == "float64"):
-        getattr(df, func)(inplace=True)
+    getattr(df, func)(inplace=True)

-    if using_copy_on_write:
-        # Check that copy was triggered in interpolate and that we don't
-        # have any references left
-        assert not np.shares_memory(arr, get_array(df, "a"))
-        tm.assert_frame_equal(df_orig, view)
-        assert df._mgr._has_no_reference(0)
-        assert view._mgr._has_no_reference(0)
-    else:
-        assert np.shares_memory(arr, get_array(df, "a")) is (dtype == "float64")
+    # Check that copy was triggered in interpolate and that we don't
+    # have any references left
+    assert not np.shares_memory(arr, get_array(df, "a"))
+    tm.assert_frame_equal(df_orig, view)
+    assert df._mgr._has_no_reference(0)
+    assert view._mgr._has_no_reference(0)


-def test_interpolate_cleaned_fill_method(using_copy_on_write):
-    # Check that "method is set to None" case works correctly
+def test_interpolate_cannot_with_object_dtype():
    df = DataFrame({"a": ["a", np.nan, "c"], "b": 1})
-    df_orig = df.copy()
-
-    msg = "DataFrame.interpolate with object dtype"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        result = df.interpolate(method="linear")
+    df["a"] = df["a"].astype(object)

-    if using_copy_on_write:
-        assert np.shares_memory(get_array(result, "a"), get_array(df, "a"))
-    else:
-        assert not np.shares_memory(get_array(result, "a"), get_array(df, "a"))
-
-    result.iloc[0, 0] = Timestamp("2021-12-31")
-
-    if using_copy_on_write:
-        assert not np.shares_memory(get_array(result, "a"), get_array(df, "a"))
-        tm.assert_frame_equal(df, df_orig)
+    msg = "DataFrame cannot interpolate with object dtype"
+    with pytest.raises(TypeError, match=msg):
+        df.interpolate()
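And for the inplace variants: when another object still references the buffer, the inplace operation copies first so the reference holder is unharmed. A sketch under the same assumptions:

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, np.nan, 2.0]})
view = df[:]  # a second reference to the same buffer

df.interpolate(method="linear", inplace=True)
assert df.loc[1, "a"] == 1.5        # df was filled in place ...
assert view["a"].isna().sum() == 1  # ... but the view kept the NaN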
df.interpolate(method="linear") + df["a"] = df["a"].astype(object) - if using_copy_on_write: - assert np.shares_memory(get_array(result, "a"), get_array(df, "a")) - else: - assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) - - result.iloc[0, 0] = Timestamp("2021-12-31") - - if using_copy_on_write: - assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) - tm.assert_frame_equal(df, df_orig) + msg = "DataFrame cannot interpolate with object dtype" + with pytest.raises(TypeError, match=msg): + df.interpolate() -def test_interpolate_object_convert_no_op(using_copy_on_write): +def test_interpolate_object_convert_no_op(): df = DataFrame({"a": ["a", "b", "c"], "b": 1}) + df["a"] = df["a"].astype(object) arr_a = get_array(df, "a") - msg = "DataFrame.interpolate with method=pad is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - df.interpolate(method="pad", inplace=True) # Now CoW makes a copy, it should not! - if using_copy_on_write: - assert df._mgr._has_no_reference(0) - assert np.shares_memory(arr_a, get_array(df, "a")) - - -def test_interpolate_object_convert_copies(using_copy_on_write): - df = DataFrame({"a": Series([1, 2], dtype=object), "b": 1}) - arr_a = get_array(df, "a") - msg = "DataFrame.interpolate with method=pad is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - df.interpolate(method="pad", inplace=True) - - if using_copy_on_write: - assert df._mgr._has_no_reference(0) - assert not np.shares_memory(arr_a, get_array(df, "a")) + assert df._mgr._has_no_reference(0) + assert np.shares_memory(arr_a, get_array(df, "a")) -def test_interpolate_downcast(using_copy_on_write): +def test_interpolate_object_convert_copies(): df = DataFrame({"a": [1, np.nan, 2.5], "b": 1}) arr_a = get_array(df, "a") - msg = "DataFrame.interpolate with method=pad is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): + msg = "Can not interpolate with method=pad" + with pytest.raises(ValueError, match=msg): df.interpolate(method="pad", inplace=True, downcast="infer") - if using_copy_on_write: - assert df._mgr._has_no_reference(0) + assert df._mgr._has_no_reference(0) assert np.shares_memory(arr_a, get_array(df, "a")) -def test_interpolate_downcast_reference_triggers_copy(using_copy_on_write): +def test_interpolate_downcast_reference_triggers_copy(): df = DataFrame({"a": [1, np.nan, 2.5], "b": 1}) df_orig = df.copy() arr_a = get_array(df, "a") view = df[:] - msg = "DataFrame.interpolate with method=pad is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - df.interpolate(method="pad", inplace=True, downcast="infer") - if using_copy_on_write: + msg = "Can not interpolate with method=pad" + with pytest.raises(ValueError, match=msg): + df.interpolate(method="pad", inplace=True, downcast="infer") assert df._mgr._has_no_reference(0) assert not np.shares_memory(arr_a, get_array(df, "a")) - tm.assert_frame_equal(df_orig, view) - else: - tm.assert_frame_equal(df, view) + tm.assert_frame_equal(df_orig, view) -def test_fillna(using_copy_on_write): + +def test_fillna(): df = DataFrame({"a": [1.5, np.nan], "b": 1}) df_orig = df.copy() df2 = df.fillna(5.5) - if using_copy_on_write: - assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) - else: - assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b")) - + assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) df2.iloc[0, 1] = 100 tm.assert_frame_equal(df_orig, df) -def test_fillna_dict(using_copy_on_write): +def 
test_fillna_dict(): df = DataFrame({"a": [1.5, np.nan], "b": 1}) df_orig = df.copy() df2 = df.fillna({"a": 100.5}) - if using_copy_on_write: - assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) - assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) - else: - assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) df2.iloc[0, 1] = 100 tm.assert_frame_equal(df_orig, df) -@pytest.mark.parametrize("downcast", [None, False]) -def test_fillna_inplace(using_copy_on_write, downcast): +def test_fillna_inplace(): df = DataFrame({"a": [1.5, np.nan], "b": 1}) arr_a = get_array(df, "a") arr_b = get_array(df, "b") - msg = "The 'downcast' keyword in fillna is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - df.fillna(5.5, inplace=True, downcast=downcast) + df.fillna(5.5, inplace=True) assert np.shares_memory(get_array(df, "a"), arr_a) assert np.shares_memory(get_array(df, "b"), arr_b) - if using_copy_on_write: - assert df._mgr._has_no_reference(0) - assert df._mgr._has_no_reference(1) + assert df._mgr._has_no_reference(0) + assert df._mgr._has_no_reference(1) -def test_fillna_inplace_reference(using_copy_on_write, warn_copy_on_write): +def test_fillna_inplace_reference(): df = DataFrame({"a": [1.5, np.nan], "b": 1}) df_orig = df.copy() arr_a = get_array(df, "a") arr_b = get_array(df, "b") view = df[:] - with tm.assert_cow_warning(warn_copy_on_write): - df.fillna(5.5, inplace=True) - if using_copy_on_write: - assert not np.shares_memory(get_array(df, "a"), arr_a) - assert np.shares_memory(get_array(df, "b"), arr_b) - assert view._mgr._has_no_reference(0) - assert df._mgr._has_no_reference(0) - tm.assert_frame_equal(view, df_orig) - else: - assert np.shares_memory(get_array(df, "a"), arr_a) - assert np.shares_memory(get_array(df, "b"), arr_b) + df.fillna(5.5, inplace=True) + assert not np.shares_memory(get_array(df, "a"), arr_a) + assert np.shares_memory(get_array(df, "b"), arr_b) + assert view._mgr._has_no_reference(0) + assert df._mgr._has_no_reference(0) + tm.assert_frame_equal(view, df_orig) expected = DataFrame({"a": [1.5, 5.5], "b": 1}) tm.assert_frame_equal(df, expected) -def test_fillna_interval_inplace_reference(using_copy_on_write, warn_copy_on_write): +def test_fillna_interval_inplace_reference(): # Set dtype explicitly to avoid implicit cast when setting nan ser = Series( interval_range(start=0, end=5), name="a", dtype="interval[float64, right]" @@ -286,147 +215,86 @@ def test_fillna_interval_inplace_reference(using_copy_on_write, warn_copy_on_wri ser_orig = ser.copy() view = ser[:] - with tm.assert_cow_warning(warn_copy_on_write): - ser.fillna(value=Interval(left=0, right=5), inplace=True) - - if using_copy_on_write: - assert not np.shares_memory( - get_array(ser, "a").left.values, get_array(view, "a").left.values - ) - tm.assert_series_equal(view, ser_orig) - else: - assert np.shares_memory( - get_array(ser, "a").left.values, get_array(view, "a").left.values - ) + ser.fillna(value=Interval(left=0, right=5), inplace=True) + + assert not np.shares_memory( + get_array(ser, "a").left.values, get_array(view, "a").left.values + ) + tm.assert_series_equal(view, ser_orig) -def test_fillna_series_empty_arg(using_copy_on_write): +def test_fillna_series_empty_arg(): ser = Series([1, np.nan, 2]) ser_orig = ser.copy() result = ser.fillna({}) - - if using_copy_on_write: - assert 
np.shares_memory(get_array(ser), get_array(result)) - else: - assert not np.shares_memory(get_array(ser), get_array(result)) + assert np.shares_memory(get_array(ser), get_array(result)) ser.iloc[0] = 100.5 tm.assert_series_equal(ser_orig, result) -def test_fillna_series_empty_arg_inplace(using_copy_on_write): +def test_fillna_series_empty_arg_inplace(): ser = Series([1, np.nan, 2]) arr = get_array(ser) ser.fillna({}, inplace=True) assert np.shares_memory(get_array(ser), arr) - if using_copy_on_write: - assert ser._mgr._has_no_reference(0) + assert ser._mgr._has_no_reference(0) -def test_fillna_ea_noop_shares_memory( - using_copy_on_write, any_numeric_ea_and_arrow_dtype -): +def test_fillna_ea_noop_shares_memory(any_numeric_ea_and_arrow_dtype): df = DataFrame({"a": [1, NA, 3], "b": 1}, dtype=any_numeric_ea_and_arrow_dtype) df_orig = df.copy() df2 = df.fillna(100) assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) - if using_copy_on_write: - assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) - assert not df2._mgr._has_no_reference(1) - elif isinstance(df.dtypes.iloc[0], ArrowDtype): - # arrow is immutable, so no-ops do not need to copy underlying array - assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) - else: - assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b")) - + assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + assert not df2._mgr._has_no_reference(1) tm.assert_frame_equal(df_orig, df) df2.iloc[0, 1] = 100 - if using_copy_on_write: - assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b")) - assert df2._mgr._has_no_reference(1) - assert df._mgr._has_no_reference(1) + assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + assert df2._mgr._has_no_reference(1) + assert df._mgr._has_no_reference(1) tm.assert_frame_equal(df_orig, df) -def test_fillna_inplace_ea_noop_shares_memory( - using_copy_on_write, warn_copy_on_write, any_numeric_ea_and_arrow_dtype -): +def test_fillna_inplace_ea_noop_shares_memory(any_numeric_ea_and_arrow_dtype): df = DataFrame({"a": [1, NA, 3], "b": 1}, dtype=any_numeric_ea_and_arrow_dtype) df_orig = df.copy() view = df[:] - with tm.assert_cow_warning(warn_copy_on_write): - df.fillna(100, inplace=True) - - if isinstance(df["a"].dtype, ArrowDtype) or using_copy_on_write: - assert not np.shares_memory(get_array(df, "a"), get_array(view, "a")) - else: - # MaskedArray can actually respect inplace=True - assert np.shares_memory(get_array(df, "a"), get_array(view, "a")) + df.fillna(100, inplace=True) + assert not np.shares_memory(get_array(df, "a"), get_array(view, "a")) assert np.shares_memory(get_array(df, "b"), get_array(view, "b")) - if using_copy_on_write: - assert not df._mgr._has_no_reference(1) - assert not view._mgr._has_no_reference(1) - - with tm.assert_cow_warning( - warn_copy_on_write and "pyarrow" not in any_numeric_ea_and_arrow_dtype - ): - df.iloc[0, 1] = 100 - if isinstance(df["a"].dtype, ArrowDtype) or using_copy_on_write: - tm.assert_frame_equal(df_orig, view) - else: - # we actually have a view - tm.assert_frame_equal(df, view) + assert not df._mgr._has_no_reference(1) + assert not view._mgr._has_no_reference(1) + df.iloc[0, 1] = 100 + tm.assert_frame_equal(df_orig, view) -def test_fillna_chained_assignment(using_copy_on_write): + +def test_fillna_chained_assignment(): df = DataFrame({"a": [1, np.nan, 2], "b": 1}) df_orig = df.copy() - if using_copy_on_write: - with tm.raises_chained_assignment_error(): - df["a"].fillna(100, inplace=True) - 
tm.assert_frame_equal(df, df_orig) - - with tm.raises_chained_assignment_error(): - df[["a"]].fillna(100, inplace=True) - tm.assert_frame_equal(df, df_orig) - else: - with tm.assert_produces_warning(None): - with option_context("mode.chained_assignment", None): - df[["a"]].fillna(100, inplace=True) - - with tm.assert_produces_warning(None): - with option_context("mode.chained_assignment", None): - df[df.a > 5].fillna(100, inplace=True) + with tm.raises_chained_assignment_error(): + df["a"].fillna(100, inplace=True) + tm.assert_frame_equal(df, df_orig) - with tm.assert_produces_warning(FutureWarning, match="inplace method"): - df["a"].fillna(100, inplace=True) + with tm.raises_chained_assignment_error(): + df[["a"]].fillna(100, inplace=True) + tm.assert_frame_equal(df, df_orig) @pytest.mark.parametrize("func", ["interpolate", "ffill", "bfill"]) -def test_interpolate_chained_assignment(using_copy_on_write, func): +def test_interpolate_chained_assignment(func): df = DataFrame({"a": [1, np.nan, 2], "b": 1}) df_orig = df.copy() - if using_copy_on_write: - with tm.raises_chained_assignment_error(): - getattr(df["a"], func)(inplace=True) - tm.assert_frame_equal(df, df_orig) - - with tm.raises_chained_assignment_error(): - getattr(df[["a"]], func)(inplace=True) - tm.assert_frame_equal(df, df_orig) - else: - with tm.assert_produces_warning(FutureWarning, match="inplace method"): - getattr(df["a"], func)(inplace=True) - - with tm.assert_produces_warning(None): - with option_context("mode.chained_assignment", None): - getattr(df[["a"]], func)(inplace=True) + with tm.raises_chained_assignment_error(): + getattr(df["a"], func)(inplace=True) + tm.assert_frame_equal(df, df_orig) - with tm.assert_produces_warning(None): - with option_context("mode.chained_assignment", None): - getattr(df[df["a"] > 1], func)(inplace=True) + with tm.raises_chained_assignment_error(): + getattr(df[["a"]], func)(inplace=True) + tm.assert_frame_equal(df, df_orig) diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 862aebdc70a9d..250697c91ff13 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas.errors import SettingWithCopyWarning +from pandas.compat import HAS_PYARROW import pandas as pd from pandas import ( @@ -12,14 +12,13 @@ Series, Timestamp, date_range, - option_context, period_range, ) import pandas._testing as tm from pandas.tests.copy_view.util import get_array -def test_copy(using_copy_on_write): +def test_copy(): df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_copy = df.copy() @@ -31,52 +30,39 @@ def test_copy(using_copy_on_write): # the deep copy doesn't share memory assert not np.shares_memory(get_array(df_copy, "a"), get_array(df, "a")) - if using_copy_on_write: - assert not df_copy._mgr.blocks[0].refs.has_reference() - assert not df_copy._mgr.blocks[1].refs.has_reference() + assert not df_copy._mgr.blocks[0].refs.has_reference() + assert not df_copy._mgr.blocks[1].refs.has_reference() # mutating copy doesn't mutate original df_copy.iloc[0, 0] = 0 assert df.iloc[0, 0] == 1 -def test_copy_shallow(using_copy_on_write, warn_copy_on_write): +def test_copy_shallow(): df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_copy = df.copy(deep=False) # the shallow copy also makes a shallow copy of the index - if using_copy_on_write: - assert df_copy.index is not df.index - assert df_copy.columns is not df.columns - assert 
df_copy.index.is_(df.index) - assert df_copy.columns.is_(df.columns) - else: - assert df_copy.index is df.index - assert df_copy.columns is df.columns + assert df_copy.index is not df.index + assert df_copy.columns is not df.columns + assert df_copy.index.is_(df.index) + assert df_copy.columns.is_(df.columns) # the shallow copy still shares memory assert np.shares_memory(get_array(df_copy, "a"), get_array(df, "a")) - if using_copy_on_write: - assert df_copy._mgr.blocks[0].refs.has_reference() - assert df_copy._mgr.blocks[1].refs.has_reference() - - if using_copy_on_write: - # mutating shallow copy doesn't mutate original - df_copy.iloc[0, 0] = 0 - assert df.iloc[0, 0] == 1 - # mutating triggered a copy-on-write -> no longer shares memory - assert not np.shares_memory(get_array(df_copy, "a"), get_array(df, "a")) - # but still shares memory for the other columns/blocks - assert np.shares_memory(get_array(df_copy, "c"), get_array(df, "c")) - else: - # mutating shallow copy does mutate original - with tm.assert_cow_warning(warn_copy_on_write): - df_copy.iloc[0, 0] = 0 - assert df.iloc[0, 0] == 0 - # and still shares memory - assert np.shares_memory(get_array(df_copy, "a"), get_array(df, "a")) + assert df_copy._mgr.blocks[0].refs.has_reference() + assert df_copy._mgr.blocks[1].refs.has_reference() + # mutating shallow copy doesn't mutate original + df_copy.iloc[0, 0] = 0 + assert df.iloc[0, 0] == 1 + # mutating triggered a copy-on-write -> no longer shares memory + assert not np.shares_memory(get_array(df_copy, "a"), get_array(df, "a")) + # but still shares memory for the other columns/blocks + assert np.shares_memory(get_array(df_copy, "c"), get_array(df, "c")) + +@pytest.mark.filterwarnings("ignore::DeprecationWarning") @pytest.mark.parametrize("copy", [True, None, False]) @pytest.mark.parametrize( "method", @@ -90,7 +76,6 @@ def test_copy_shallow(using_copy_on_write, warn_copy_on_write): lambda df, copy: df.rename_axis(columns="test", copy=copy), lambda df, copy: df.astype({"b": "int64"}, copy=copy), # lambda df, copy: df.swaplevel(0, 0, copy=copy), - lambda df, copy: df.swapaxes(0, 0, copy=copy), lambda df, copy: df.truncate(0, 5, copy=copy), lambda df, copy: df.infer_objects(copy=copy), lambda df, copy: df.to_timestamp(copy=copy), @@ -109,7 +94,6 @@ def test_copy_shallow(using_copy_on_write, warn_copy_on_write): "rename_axis1", "astype", # "swaplevel", # only series - "swapaxes", "truncate", "infer_objects", "to_timestamp", @@ -119,9 +103,7 @@ def test_copy_shallow(using_copy_on_write, warn_copy_on_write): "set_flags", ], ) -def test_methods_copy_keyword( - request, method, copy, using_copy_on_write, using_array_manager -): +def test_methods_copy_keyword(request, method, copy): index = None if "to_timestamp" in request.node.callspec.id: index = period_range("2012-01-01", freq="D", periods=3) @@ -133,27 +115,11 @@ def test_methods_copy_keyword( index = date_range("2012-01-01", freq="D", periods=3, tz="Europe/Brussels") df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}, index=index) - - if "swapaxes" in request.node.callspec.id: - msg = "'DataFrame.swapaxes' is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - df2 = method(df, copy=copy) - else: - df2 = method(df, copy=copy) - - share_memory = using_copy_on_write or copy is False - - if request.node.callspec.id.startswith("reindex-"): - # TODO copy=False without CoW still returns a copy in this case - if not using_copy_on_write and not using_array_manager and copy is False: - share_memory = False 
- - if share_memory: - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - else: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + df2 = method(df, copy=copy) + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) +@pytest.mark.filterwarnings("ignore::DeprecationWarning") @pytest.mark.parametrize("copy", [True, None, False]) @pytest.mark.parametrize( "method", @@ -167,7 +133,6 @@ def test_methods_copy_keyword( lambda ser, copy: ser.rename_axis(index="test", copy=copy), lambda ser, copy: ser.astype("int64", copy=copy), lambda ser, copy: ser.swaplevel(0, 1, copy=copy), - lambda ser, copy: ser.swapaxes(0, 0, copy=copy), lambda ser, copy: ser.truncate(0, 5, copy=copy), lambda ser, copy: ser.infer_objects(copy=copy), lambda ser, copy: ser.to_timestamp(copy=copy), @@ -186,7 +151,6 @@ def test_methods_copy_keyword( "rename_axis0", "astype", "swaplevel", - "swapaxes", "truncate", "infer_objects", "to_timestamp", @@ -196,7 +160,7 @@ def test_methods_copy_keyword( "set_flags", ], ) -def test_methods_series_copy_keyword(request, method, copy, using_copy_on_write): +def test_methods_series_copy_keyword(request, method, copy): index = None if "to_timestamp" in request.node.callspec.id: index = period_range("2012-01-01", freq="D", periods=3) @@ -210,40 +174,15 @@ def test_methods_series_copy_keyword(request, method, copy, using_copy_on_write) index = MultiIndex.from_arrays([[1, 2, 3], [4, 5, 6]]) ser = Series([1, 2, 3], index=index) - - if "swapaxes" in request.node.callspec.id: - msg = "'Series.swapaxes' is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - ser2 = method(ser, copy=copy) - else: - ser2 = method(ser, copy=copy) - - share_memory = using_copy_on_write or copy is False - - if share_memory: - assert np.shares_memory(get_array(ser2), get_array(ser)) - else: - assert not np.shares_memory(get_array(ser2), get_array(ser)) - - -@pytest.mark.parametrize("copy", [True, None, False]) -def test_transpose_copy_keyword(using_copy_on_write, copy, using_array_manager): - df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - result = df.transpose(copy=copy) - share_memory = using_copy_on_write or copy is False or copy is None - share_memory = share_memory and not using_array_manager - - if share_memory: - assert np.shares_memory(get_array(df, "a"), get_array(result, 0)) - else: - assert not np.shares_memory(get_array(df, "a"), get_array(result, 0)) + ser2 = method(ser, copy=copy) + assert np.shares_memory(get_array(ser2), get_array(ser)) # ----------------------------------------------------------------------------- # DataFrame methods returning new DataFrame using shallow copy -def test_reset_index(using_copy_on_write): +def test_reset_index(): # Case: resetting the index (i.e. 
adding a new column) + mutating the # resulting dataframe df = DataFrame( @@ -253,72 +192,72 @@ def test_reset_index(using_copy_on_write): df2 = df.reset_index() df2._mgr._verify_integrity() - if using_copy_on_write: - # still shares memory (df2 is a shallow copy) - assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) - assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + # still shares memory (df2 is a shallow copy) + assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) # mutating df2 triggers a copy-on-write for that column / block df2.iloc[0, 2] = 0 assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) tm.assert_frame_equal(df, df_orig) @pytest.mark.parametrize("index", [pd.RangeIndex(0, 2), Index([1, 2])]) -def test_reset_index_series_drop(using_copy_on_write, index): +def test_reset_index_series_drop(index): ser = Series([1, 2], index=index) ser_orig = ser.copy() ser2 = ser.reset_index(drop=True) - if using_copy_on_write: - assert np.shares_memory(get_array(ser), get_array(ser2)) - assert not ser._mgr._has_no_reference(0) - else: - assert not np.shares_memory(get_array(ser), get_array(ser2)) + assert np.shares_memory(get_array(ser), get_array(ser2)) + assert not ser._mgr._has_no_reference(0) ser2.iloc[0] = 100 tm.assert_series_equal(ser, ser_orig) -def test_rename_columns(using_copy_on_write): +def test_groupby_column_index_in_references(): + df = DataFrame( + {"A": ["a", "b", "c", "d"], "B": [1, 2, 3, 4], "C": ["a", "a", "b", "b"]} + ) + df = df.set_index("A") + key = df["C"] + result = df.groupby(key, observed=True).sum() + expected = df.groupby("C", observed=True).sum() + tm.assert_frame_equal(result, expected) + + +def test_rename_columns(): # Case: renaming columns returns a new dataframe # + afterwards modifying the result df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() df2 = df.rename(columns=str.upper) - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "A"), get_array(df, "a")) + assert np.shares_memory(get_array(df2, "A"), get_array(df, "a")) df2.iloc[0, 0] = 0 assert not np.shares_memory(get_array(df2, "A"), get_array(df, "a")) - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "C"), get_array(df, "c")) + assert np.shares_memory(get_array(df2, "C"), get_array(df, "c")) expected = DataFrame({"A": [0, 2, 3], "B": [4, 5, 6], "C": [0.1, 0.2, 0.3]}) tm.assert_frame_equal(df2, expected) tm.assert_frame_equal(df, df_orig) -def test_rename_columns_modify_parent(using_copy_on_write): +def test_rename_columns_modify_parent(): # Case: renaming columns returns a new dataframe # + afterwards modifying the original (parent) dataframe df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df2 = df.rename(columns=str.upper) df2_orig = df2.copy() - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "A"), get_array(df, "a")) - else: - assert not np.shares_memory(get_array(df2, "A"), get_array(df, "a")) + assert np.shares_memory(get_array(df2, "A"), get_array(df, "a")) df.iloc[0, 0] = 0 assert not np.shares_memory(get_array(df2, "A"), get_array(df, "a")) - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "C"), get_array(df, "c")) + assert np.shares_memory(get_array(df2, "C"), get_array(df, "c")) 
expected = DataFrame({"a": [0, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) tm.assert_frame_equal(df, expected) tm.assert_frame_equal(df2, df2_orig) -def test_pipe(using_copy_on_write): +def test_pipe(): df = DataFrame({"a": [1, 2, 3], "b": 1.5}) df_orig = df.copy() @@ -331,18 +270,12 @@ def testfunc(df): # mutating df2 triggers a copy-on-write for that column df2.iloc[0, 0] = 0 - if using_copy_on_write: - tm.assert_frame_equal(df, df_orig) - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - else: - expected = DataFrame({"a": [0, 2, 3], "b": 1.5}) - tm.assert_frame_equal(df, expected) - - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) -def test_pipe_modify_df(using_copy_on_write): +def test_pipe_modify_df(): df = DataFrame({"a": [1, 2, 3], "b": 1.5}) df_orig = df.copy() @@ -354,34 +287,24 @@ def testfunc(df): assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) - if using_copy_on_write: - tm.assert_frame_equal(df, df_orig) - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - else: - expected = DataFrame({"a": [100, 2, 3], "b": 1.5}) - tm.assert_frame_equal(df, expected) - - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) -def test_reindex_columns(using_copy_on_write): +def test_reindex_columns(): # Case: reindexing the column returns a new dataframe # + afterwards modifying the result df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() df2 = df.reindex(columns=["a", "c"]) - if using_copy_on_write: - # still shares memory (df2 is a shallow copy) - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - else: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + # still shares memory (df2 is a shallow copy) + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) # mutating df2 triggers a copy-on-write for that column df2.iloc[0, 0] = 0 assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) tm.assert_frame_equal(df, df_orig) @@ -395,46 +318,37 @@ def test_reindex_columns(using_copy_on_write): ], ids=["identical", "view", "copy", "values"], ) -def test_reindex_rows(index, using_copy_on_write): +def test_reindex_rows(index): # Case: reindexing the rows with an index that matches the current index # can use a shallow copy df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() df2 = df.reindex(index=index(df.index)) - if using_copy_on_write: - # still shares memory (df2 is a shallow copy) - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - else: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + # still shares memory (df2 is a shallow copy) + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) # mutating df2 triggers a copy-on-write for that column df2.iloc[0, 0] = 0 assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + assert 
np.shares_memory(get_array(df2, "c"), get_array(df, "c")) tm.assert_frame_equal(df, df_orig) -def test_drop_on_column(using_copy_on_write): +def test_drop_on_column(): df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() df2 = df.drop(columns="a") df2._mgr._verify_integrity() - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) - assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) - else: - assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) - assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) df2.iloc[0, 0] = 0 assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) tm.assert_frame_equal(df, df_orig) -def test_select_dtypes(using_copy_on_write): +def test_select_dtypes(): # Case: selecting columns using `select_dtypes()` returns a new dataframe # + afterwards modifying the result df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) @@ -442,40 +356,31 @@ def test_select_dtypes(using_copy_on_write): df2 = df.select_dtypes("int64") df2._mgr._verify_integrity() - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - else: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) # mutating df2 triggers a copy-on-write for that column/block df2.iloc[0, 0] = 0 - if using_copy_on_write: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) tm.assert_frame_equal(df, df_orig) @pytest.mark.parametrize( "filter_kwargs", [{"items": ["a"]}, {"like": "a"}, {"regex": "a"}] ) -def test_filter(using_copy_on_write, filter_kwargs): +def test_filter(filter_kwargs): # Case: selecting columns using `filter()` returns a new dataframe # + afterwards modifying the result df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() df2 = df.filter(**filter_kwargs) - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - else: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) # mutating df2 triggers a copy-on-write for that column/block - if using_copy_on_write: - df2.iloc[0, 0] = 0 - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + df2.iloc[0, 0] = 0 + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) tm.assert_frame_equal(df, df_orig) -def test_shift_no_op(using_copy_on_write): +def test_shift_no_op(): df = DataFrame( [[1, 2], [3, 4], [5, 6]], index=date_range("2020-01-01", "2020-01-03"), @@ -483,20 +388,15 @@ def test_shift_no_op(using_copy_on_write): ) df_orig = df.copy() df2 = df.shift(periods=0) - - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - else: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) df.iloc[0, 0] = 0 - if using_copy_on_write: - assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) - assert 
np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) tm.assert_frame_equal(df2, df_orig) -def test_shift_index(using_copy_on_write): +def test_shift_index(): df = DataFrame( [[1, 2], [3, 4], [5, 6]], index=date_range("2020-01-01", "2020-01-03"), @@ -507,7 +407,7 @@ def test_shift_index(using_copy_on_write): assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) -def test_shift_rows_freq(using_copy_on_write): +def test_shift_rows_freq(): df = DataFrame( [[1, 2], [3, 4], [5, 6]], index=date_range("2020-01-01", "2020-01-03"), @@ -517,38 +417,31 @@ def test_shift_rows_freq(using_copy_on_write): df_orig.index = date_range("2020-01-02", "2020-01-04") df2 = df.shift(periods=1, freq="1D") - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - else: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) df.iloc[0, 0] = 0 - if using_copy_on_write: - assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) tm.assert_frame_equal(df2, df_orig) -def test_shift_columns(using_copy_on_write, warn_copy_on_write): +def test_shift_columns(): df = DataFrame( [[1, 2], [3, 4], [5, 6]], columns=date_range("2020-01-01", "2020-01-02") ) df2 = df.shift(periods=1, axis=1) assert np.shares_memory(get_array(df2, "2020-01-02"), get_array(df, "2020-01-01")) - with tm.assert_cow_warning(warn_copy_on_write): - df.iloc[0, 0] = 0 - if using_copy_on_write: - assert not np.shares_memory( - get_array(df2, "2020-01-02"), get_array(df, "2020-01-01") - ) - expected = DataFrame( - [[np.nan, 1], [np.nan, 3], [np.nan, 5]], - columns=date_range("2020-01-01", "2020-01-02"), - ) - tm.assert_frame_equal(df2, expected) + df.iloc[0, 0] = 0 + assert not np.shares_memory( + get_array(df2, "2020-01-02"), get_array(df, "2020-01-01") + ) + expected = DataFrame( + [[np.nan, 1], [np.nan, 3], [np.nan, 5]], + columns=date_range("2020-01-01", "2020-01-02"), + ) + tm.assert_frame_equal(df2, expected) -def test_pop(using_copy_on_write, warn_copy_on_write): +def test_pop(): df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() view_original = df[:] @@ -557,17 +450,11 @@ def test_pop(using_copy_on_write, warn_copy_on_write): assert np.shares_memory(result.values, get_array(view_original, "a")) assert np.shares_memory(get_array(df, "b"), get_array(view_original, "b")) - if using_copy_on_write: - result.iloc[0] = 0 - assert not np.shares_memory(result.values, get_array(view_original, "a")) - with tm.assert_cow_warning(warn_copy_on_write): - df.iloc[0, 0] = 0 - if using_copy_on_write: - assert not np.shares_memory(get_array(df, "b"), get_array(view_original, "b")) - tm.assert_frame_equal(view_original, df_orig) - else: - expected = DataFrame({"a": [1, 2, 3], "b": [0, 5, 6], "c": [0.1, 0.2, 0.3]}) - tm.assert_frame_equal(view_original, expected) + result.iloc[0] = 0 + assert not np.shares_memory(result.values, get_array(view_original, "a")) + df.iloc[0, 0] = 0 + assert not np.shares_memory(get_array(df, "b"), get_array(view_original, "b")) + tm.assert_frame_equal(view_original, df_orig) @pytest.mark.parametrize( @@ -578,81 +465,68 @@ def test_pop(using_copy_on_write, warn_copy_on_write): lambda x, y: x.align(y.a.iloc[slice(0, 1)], axis=1), ], ) -def 
test_align_frame(using_copy_on_write, func): +def test_align_frame(func): df = DataFrame({"a": [1, 2, 3], "b": "a"}) df_orig = df.copy() df_changed = df[["b", "a"]].copy() df2, _ = func(df, df_changed) - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - else: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) df2.iloc[0, 0] = 0 - if using_copy_on_write: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) tm.assert_frame_equal(df, df_orig) -def test_align_series(using_copy_on_write): +def test_align_series(): ser = Series([1, 2]) ser_orig = ser.copy() ser_other = ser.copy() ser2, ser_other_result = ser.align(ser_other) - if using_copy_on_write: - assert np.shares_memory(ser2.values, ser.values) - assert np.shares_memory(ser_other_result.values, ser_other.values) - else: - assert not np.shares_memory(ser2.values, ser.values) - assert not np.shares_memory(ser_other_result.values, ser_other.values) - + assert np.shares_memory(ser2.values, ser.values) + assert np.shares_memory(ser_other_result.values, ser_other.values) ser2.iloc[0] = 0 ser_other_result.iloc[0] = 0 - if using_copy_on_write: - assert not np.shares_memory(ser2.values, ser.values) - assert not np.shares_memory(ser_other_result.values, ser_other.values) + assert not np.shares_memory(ser2.values, ser.values) + assert not np.shares_memory(ser_other_result.values, ser_other.values) tm.assert_series_equal(ser, ser_orig) tm.assert_series_equal(ser_other, ser_orig) -def test_align_copy_false(using_copy_on_write): +def test_align_copy_false(): df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) df_orig = df.copy() - df2, df3 = df.align(df, copy=False) + df2, df3 = df.align(df) assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) assert np.shares_memory(get_array(df, "a"), get_array(df2, "a")) - if using_copy_on_write: - df2.loc[0, "a"] = 0 - tm.assert_frame_equal(df, df_orig) # Original is unchanged + df2.loc[0, "a"] = 0 + tm.assert_frame_equal(df, df_orig) # Original is unchanged - df3.loc[0, "a"] = 0 - tm.assert_frame_equal(df, df_orig) # Original is unchanged + df3.loc[0, "a"] = 0 + tm.assert_frame_equal(df, df_orig) # Original is unchanged -def test_align_with_series_copy_false(using_copy_on_write): +def test_align_with_series_copy_false(): df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) ser = Series([1, 2, 3], name="x") ser_orig = ser.copy() df_orig = df.copy() - df2, ser2 = df.align(ser, copy=False, axis=0) + df2, ser2 = df.align(ser, axis=0) assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) assert np.shares_memory(get_array(df, "a"), get_array(df2, "a")) assert np.shares_memory(get_array(ser, "x"), get_array(ser2, "x")) - if using_copy_on_write: - df2.loc[0, "a"] = 0 - tm.assert_frame_equal(df, df_orig) # Original is unchanged + df2.loc[0, "a"] = 0 + tm.assert_frame_equal(df, df_orig) # Original is unchanged - ser2.loc[0] = 0 - tm.assert_series_equal(ser, ser_orig) # Original is unchanged + ser2.loc[0] = 0 + tm.assert_series_equal(ser, ser_orig) # Original is unchanged -def test_to_frame(using_copy_on_write, warn_copy_on_write): +def test_to_frame(): # Case: converting a Series to a DataFrame with to_frame ser = Series([1, 2, 3]) ser_orig = ser.copy() @@ -662,79 +536,17 @@ def test_to_frame(using_copy_on_write, warn_copy_on_write): # currently this always returns a "view" assert 
np.shares_memory(ser.values, get_array(df, 0)) - with tm.assert_cow_warning(warn_copy_on_write): - df.iloc[0, 0] = 0 + df.iloc[0, 0] = 0 - if using_copy_on_write: - # mutating df triggers a copy-on-write for that column - assert not np.shares_memory(ser.values, get_array(df, 0)) - tm.assert_series_equal(ser, ser_orig) - else: - # but currently select_dtypes() actually returns a view -> mutates parent - expected = ser_orig.copy() - expected.iloc[0] = 0 - tm.assert_series_equal(ser, expected) + # mutating df triggers a copy-on-write for that column + assert not np.shares_memory(ser.values, get_array(df, 0)) + tm.assert_series_equal(ser, ser_orig) # modify original series -> don't modify dataframe df = ser[:].to_frame() - with tm.assert_cow_warning(warn_copy_on_write): - ser.iloc[0] = 0 + ser.iloc[0] = 0 - if using_copy_on_write: - tm.assert_frame_equal(df, ser_orig.to_frame()) - else: - expected = ser_orig.copy().to_frame() - expected.iloc[0, 0] = 0 - tm.assert_frame_equal(df, expected) - - -@pytest.mark.parametrize("ax", ["index", "columns"]) -def test_swapaxes_noop(using_copy_on_write, ax): - df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - df_orig = df.copy() - msg = "'DataFrame.swapaxes' is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - df2 = df.swapaxes(ax, ax) - - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - else: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - - # mutating df2 triggers a copy-on-write for that column/block - df2.iloc[0, 0] = 0 - if using_copy_on_write: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - tm.assert_frame_equal(df, df_orig) - - -def test_swapaxes_single_block(using_copy_on_write): - df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=["x", "y", "z"]) - df_orig = df.copy() - msg = "'DataFrame.swapaxes' is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - df2 = df.swapaxes("index", "columns") - - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "x"), get_array(df, "a")) - else: - assert not np.shares_memory(get_array(df2, "x"), get_array(df, "a")) - - # mutating df2 triggers a copy-on-write for that column/block - df2.iloc[0, 0] = 0 - if using_copy_on_write: - assert not np.shares_memory(get_array(df2, "x"), get_array(df, "a")) - tm.assert_frame_equal(df, df_orig) - - -def test_swapaxes_read_only_array(): - df = DataFrame({"a": [1, 2], "b": 3}) - msg = "'DataFrame.swapaxes' is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - df = df.swapaxes(axis1="index", axis2="columns") - df.iloc[0, 0] = 100 - expected = DataFrame({0: [100, 3], 1: [2, 3]}, index=["a", "b"]) - tm.assert_frame_equal(df, expected) + tm.assert_frame_equal(df, ser_orig.to_frame()) @pytest.mark.parametrize( @@ -747,39 +559,29 @@ def test_swapaxes_read_only_array(): ], ids=["shallow-copy", "reset_index", "rename", "select_dtypes"], ) -def test_chained_methods(request, method, idx, using_copy_on_write, warn_copy_on_write): +def test_chained_methods(request, method, idx): df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() - # when not using CoW, only the copy() variant actually gives a view - df2_is_view = not using_copy_on_write and request.node.callspec.id == "shallow-copy" - # modify df2 -> don't modify df df2 = method(df) - with tm.assert_cow_warning(warn_copy_on_write and df2_is_view): - df2.iloc[0, idx] = 0 - if not df2_is_view: - tm.assert_frame_equal(df, 
df_orig) + df2.iloc[0, idx] = 0 + tm.assert_frame_equal(df, df_orig) # modify df -> don't modify df2 df2 = method(df) - with tm.assert_cow_warning(warn_copy_on_write and df2_is_view): - df.iloc[0, 0] = 0 - if not df2_is_view: - tm.assert_frame_equal(df2.iloc[:, idx:], df_orig) + df.iloc[0, 0] = 0 + tm.assert_frame_equal(df2.iloc[:, idx:], df_orig) @pytest.mark.parametrize("obj", [Series([1, 2], name="a"), DataFrame({"a": [1, 2]})]) -def test_to_timestamp(using_copy_on_write, obj): +def test_to_timestamp(obj): obj.index = Index([Period("2012-1-1", freq="D"), Period("2012-1-2", freq="D")]) obj_orig = obj.copy() obj2 = obj.to_timestamp() - if using_copy_on_write: - assert np.shares_memory(get_array(obj2, "a"), get_array(obj, "a")) - else: - assert not np.shares_memory(get_array(obj2, "a"), get_array(obj, "a")) + assert np.shares_memory(get_array(obj2, "a"), get_array(obj, "a")) # mutating obj2 triggers a copy-on-write for that column / block obj2.iloc[0] = 0 @@ -788,16 +590,13 @@ def test_to_timestamp(using_copy_on_write, obj): @pytest.mark.parametrize("obj", [Series([1, 2], name="a"), DataFrame({"a": [1, 2]})]) -def test_to_period(using_copy_on_write, obj): +def test_to_period(obj): obj.index = Index([Timestamp("2019-12-31"), Timestamp("2020-12-31")]) obj_orig = obj.copy() obj2 = obj.to_period(freq="Y") - if using_copy_on_write: - assert np.shares_memory(get_array(obj2, "a"), get_array(obj, "a")) - else: - assert not np.shares_memory(get_array(obj2, "a"), get_array(obj, "a")) + assert np.shares_memory(get_array(obj2, "a"), get_array(obj, "a")) # mutating obj2 triggers a copy-on-write for that column / block obj2.iloc[0] = 0 @@ -805,16 +604,13 @@ def test_to_period(using_copy_on_write, obj): tm.assert_equal(obj, obj_orig) -def test_set_index(using_copy_on_write): +def test_set_index(): # GH 49473 df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() df2 = df.set_index("a") - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) - else: - assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) # mutating df2 triggers a copy-on-write for that column / block df2.iloc[0, 1] = 0 @@ -831,20 +627,18 @@ def test_set_index_mutating_parent_does_not_mutate_index(): tm.assert_frame_equal(result, expected) -def test_add_prefix(using_copy_on_write): +def test_add_prefix(): # GH 49473 df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() df2 = df.add_prefix("CoW_") - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "CoW_a"), get_array(df, "a")) + assert np.shares_memory(get_array(df2, "CoW_a"), get_array(df, "a")) df2.iloc[0, 0] = 0 assert not np.shares_memory(get_array(df2, "CoW_a"), get_array(df, "a")) - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "CoW_c"), get_array(df, "c")) + assert np.shares_memory(get_array(df2, "CoW_c"), get_array(df, "c")) expected = DataFrame( {"CoW_a": [0, 2, 3], "CoW_b": [4, 5, 6], "CoW_c": [0.1, 0.2, 0.3]} ) @@ -852,17 +646,15 @@ def test_add_prefix(using_copy_on_write): tm.assert_frame_equal(df, df_orig) -def test_add_suffix(using_copy_on_write): +def test_add_suffix(): # GH 49473 df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() df2 = df.add_suffix("_CoW") - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "a_CoW"), get_array(df, "a")) + assert np.shares_memory(get_array(df2, "a_CoW"), 
get_array(df, "a")) df2.iloc[0, 0] = 0 assert not np.shares_memory(get_array(df2, "a_CoW"), get_array(df, "a")) - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "c_CoW"), get_array(df, "c")) + assert np.shares_memory(get_array(df2, "c_CoW"), get_array(df, "c")) expected = DataFrame( {"a_CoW": [0, 2, 3], "b_CoW": [4, 5, 6], "c_CoW": [0.1, 0.2, 0.3]} ) @@ -871,36 +663,27 @@ def test_add_suffix(using_copy_on_write): @pytest.mark.parametrize("axis, val", [(0, 5.5), (1, np.nan)]) -def test_dropna(using_copy_on_write, axis, val): +def test_dropna(axis, val): df = DataFrame({"a": [1, 2, 3], "b": [4, val, 6], "c": "d"}) df_orig = df.copy() df2 = df.dropna(axis=axis) - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - else: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) df2.iloc[0, 0] = 0 - if using_copy_on_write: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) tm.assert_frame_equal(df, df_orig) @pytest.mark.parametrize("val", [5, 5.5]) -def test_dropna_series(using_copy_on_write, val): +def test_dropna_series(val): ser = Series([1, val, 4]) ser_orig = ser.copy() ser2 = ser.dropna() - - if using_copy_on_write: - assert np.shares_memory(ser2.values, ser.values) - else: - assert not np.shares_memory(ser2.values, ser.values) + assert np.shares_memory(ser2.values, ser.values) ser2.iloc[0] = 0 - if using_copy_on_write: - assert not np.shares_memory(ser2.values, ser.values) + assert not np.shares_memory(ser2.values, ser.values) tm.assert_series_equal(ser, ser_orig) @@ -913,63 +696,54 @@ def test_dropna_series(using_copy_on_write, val): lambda df: df.tail(3), ], ) -def test_head_tail(method, using_copy_on_write, warn_copy_on_write): +def test_head_tail(method): df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) df_orig = df.copy() df2 = method(df) df2._mgr._verify_integrity() - if using_copy_on_write: - # We are explicitly deviating for CoW here to make an eager copy (avoids - # tracking references for very cheap ops) - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + # We are explicitly deviating for CoW here to make an eager copy (avoids + # tracking references for very cheap ops) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) # modify df2 to trigger CoW for that block - with tm.assert_cow_warning(warn_copy_on_write): - df2.iloc[0, 0] = 0 - if using_copy_on_write: - assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - else: - # without CoW enabled, head and tail return views. Mutating df2 also mutates df. 
- assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) - with tm.assert_cow_warning(warn_copy_on_write): - df2.iloc[0, 0] = 1 + df2.iloc[0, 0] = 0 + assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) tm.assert_frame_equal(df, df_orig) -def test_infer_objects(using_copy_on_write): - df = DataFrame({"a": [1, 2], "b": "c", "c": 1, "d": "x"}) +def test_infer_objects(using_infer_string): + df = DataFrame( + {"a": [1, 2], "b": Series(["x", "y"], dtype=object), "c": 1, "d": "x"} + ) df_orig = df.copy() df2 = df.infer_objects() - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) - + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + if using_infer_string and HAS_PYARROW: + assert not tm.shares_memory(get_array(df2, "b"), get_array(df, "b")) else: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) df2.iloc[0, 0] = 0 df2.iloc[0, 1] = "d" - if using_copy_on_write: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) tm.assert_frame_equal(df, df_orig) -def test_infer_objects_no_reference(using_copy_on_write): +def test_infer_objects_no_reference(using_infer_string): df = DataFrame( { "a": [1, 2], - "b": "c", + "b": Series(["x", "y"], dtype=object), "c": 1, "d": Series( [Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype="object" ), - "e": "b", + "e": Series(["z", "w"], dtype=object), } ) df = df.infer_objects() @@ -981,18 +755,23 @@ def test_infer_objects_no_reference(using_copy_on_write): df.iloc[0, 0] = 0 df.iloc[0, 1] = "d" df.iloc[0, 3] = Timestamp("2018-12-31") - if using_copy_on_write: - assert np.shares_memory(arr_a, get_array(df, "a")) + assert np.shares_memory(arr_a, get_array(df, "a")) + if using_infer_string and HAS_PYARROW: + # note that the underlying memory of arr_b has been copied anyway + # because of the assignment, but the EA is updated inplace so still + # appears to share memory + assert tm.shares_memory(arr_b, get_array(df, "b")) + else: # TODO(CoW): Block splitting causes references here assert not np.shares_memory(arr_b, get_array(df, "b")) - assert np.shares_memory(arr_d, get_array(df, "d")) + assert np.shares_memory(arr_d, get_array(df, "d")) -def test_infer_objects_reference(using_copy_on_write): +def test_infer_objects_reference(): df = DataFrame( { "a": [1, 2], - "b": "c", + "b": Series(["x", "y"], dtype=object), "c": 1, "d": Series( [Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype="object" @@ -1009,10 +788,9 @@ def test_infer_objects_reference(using_copy_on_write): df.iloc[0, 0] = 0 df.iloc[0, 1] = "d" df.iloc[0, 3] = Timestamp("2018-12-31") - if using_copy_on_write: - assert not np.shares_memory(arr_a, get_array(df, "a")) - assert not np.shares_memory(arr_b, get_array(df, "b")) - assert np.shares_memory(arr_d, get_array(df, "d")) + assert not np.shares_memory(arr_a, get_array(df, "a")) + assert not np.shares_memory(arr_b, get_array(df, "b")) + assert np.shares_memory(arr_d, get_array(df, "d")) @pytest.mark.parametrize( "kwargs", [ @@ -1022,103 +800,76 @@ def
test_infer_objects_reference(using_copy_on_write): {"before": 0, "after": 1, "axis": 0}, ], ) -def test_truncate(using_copy_on_write, kwargs): +def test_truncate(kwargs): df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 2}) df_orig = df.copy() df2 = df.truncate(**kwargs) df2._mgr._verify_integrity() - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - else: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) df2.iloc[0, 0] = 0 - if using_copy_on_write: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) tm.assert_frame_equal(df, df_orig) @pytest.mark.parametrize("method", ["assign", "drop_duplicates"]) -def test_assign_drop_duplicates(using_copy_on_write, method): +def test_assign_drop_duplicates(method): df = DataFrame({"a": [1, 2, 3]}) df_orig = df.copy() df2 = getattr(df, method)() df2._mgr._verify_integrity() - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - else: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) df2.iloc[0, 0] = 0 - if using_copy_on_write: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) tm.assert_frame_equal(df, df_orig) @pytest.mark.parametrize("obj", [Series([1, 2]), DataFrame({"a": [1, 2]})]) -def test_take(using_copy_on_write, obj): +def test_take(obj): # Check that no copy is made when we take all rows in original order obj_orig = obj.copy() obj2 = obj.take([0, 1]) - - if using_copy_on_write: - assert np.shares_memory(obj2.values, obj.values) - else: - assert not np.shares_memory(obj2.values, obj.values) + assert np.shares_memory(obj2.values, obj.values) obj2.iloc[0] = 0 - if using_copy_on_write: - assert not np.shares_memory(obj2.values, obj.values) + assert not np.shares_memory(obj2.values, obj.values) tm.assert_equal(obj, obj_orig) @pytest.mark.parametrize("obj", [Series([1, 2]), DataFrame({"a": [1, 2]})]) -def test_between_time(using_copy_on_write, obj): +def test_between_time(obj): obj.index = date_range("2018-04-09", periods=2, freq="1D20min") obj_orig = obj.copy() obj2 = obj.between_time("0:00", "1:00") - - if using_copy_on_write: - assert np.shares_memory(obj2.values, obj.values) - else: - assert not np.shares_memory(obj2.values, obj.values) + assert np.shares_memory(obj2.values, obj.values) obj2.iloc[0] = 0 - if using_copy_on_write: - assert not np.shares_memory(obj2.values, obj.values) + assert not np.shares_memory(obj2.values, obj.values) tm.assert_equal(obj, obj_orig) -def test_reindex_like(using_copy_on_write): +def test_reindex_like(): df = DataFrame({"a": [1, 2], "b": "a"}) other = DataFrame({"b": "a", "a": [1, 2]}) df_orig = df.copy() df2 = df.reindex_like(other) - - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - else: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) df2.iloc[0, 1] = 0 - if using_copy_on_write: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) tm.assert_frame_equal(df, df_orig) -def test_sort_index(using_copy_on_write): +def test_sort_index(): # GH 49473 ser = Series([1, 2, 
3]) ser_orig = ser.copy() ser2 = ser.sort_index() - - if using_copy_on_write: - assert np.shares_memory(ser.values, ser2.values) - else: - assert not np.shares_memory(ser.values, ser2.values) + assert np.shares_memory(ser.values, ser2.values) # mutating ser triggers a copy-on-write for the column / block ser2.iloc[0] = 0 @@ -1130,14 +881,10 @@ def test_sort_index(using_copy_on_write): "obj, kwargs", [(Series([1, 2, 3], name="a"), {}), (DataFrame({"a": [1, 2, 3]}), {"by": "a"})], ) -def test_sort_values(using_copy_on_write, obj, kwargs): +def test_sort_values(obj, kwargs): obj_orig = obj.copy() obj2 = obj.sort_values(**kwargs) - - if using_copy_on_write: - assert np.shares_memory(get_array(obj2, "a"), get_array(obj, "a")) - else: - assert not np.shares_memory(get_array(obj2, "a"), get_array(obj, "a")) + assert np.shares_memory(get_array(obj2, "a"), get_array(obj, "a")) # mutating df triggers a copy-on-write for the column / block obj2.iloc[0] = 0 @@ -1149,7 +896,7 @@ def test_sort_values(using_copy_on_write, obj, kwargs): "obj, kwargs", [(Series([1, 2, 3], name="a"), {}), (DataFrame({"a": [1, 2, 3]}), {"by": "a"})], ) -def test_sort_values_inplace(using_copy_on_write, obj, kwargs, warn_copy_on_write): +def test_sort_values_inplace(obj, kwargs): obj_orig = obj.copy() view = obj[:] obj.sort_values(inplace=True, **kwargs) @@ -1157,107 +904,80 @@ def test_sort_values_inplace(using_copy_on_write, obj, kwargs, warn_copy_on_writ assert np.shares_memory(get_array(obj, "a"), get_array(view, "a")) # mutating obj triggers a copy-on-write for the column / block - with tm.assert_cow_warning(warn_copy_on_write): - obj.iloc[0] = 0 - if using_copy_on_write: - assert not np.shares_memory(get_array(obj, "a"), get_array(view, "a")) - tm.assert_equal(view, obj_orig) - else: - assert np.shares_memory(get_array(obj, "a"), get_array(view, "a")) + obj.iloc[0] = 0 + assert not np.shares_memory(get_array(obj, "a"), get_array(view, "a")) + tm.assert_equal(view, obj_orig) @pytest.mark.parametrize("decimals", [-1, 0, 1]) -def test_round(using_copy_on_write, warn_copy_on_write, decimals): +def test_round(decimals): df = DataFrame({"a": [1, 2], "b": "c"}) df_orig = df.copy() df2 = df.round(decimals=decimals) - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) - # TODO: Make inplace by using out parameter of ndarray.round? - if decimals >= 0: - # Ensure lazy copy if no-op - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - else: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert tm.shares_memory(get_array(df2, "b"), get_array(df, "b")) + # TODO: Make inplace by using out parameter of ndarray.round? 
+ if decimals >= 0: + # Ensure lazy copy if no-op + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) else: - assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) df2.iloc[0, 1] = "d" df2.iloc[0, 0] = 4 - if using_copy_on_write: - assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) tm.assert_frame_equal(df, df_orig) -def test_reorder_levels(using_copy_on_write): +def test_reorder_levels(): index = MultiIndex.from_tuples( [(1, 1), (1, 2), (2, 1), (2, 2)], names=["one", "two"] ) df = DataFrame({"a": [1, 2, 3, 4]}, index=index) df_orig = df.copy() df2 = df.reorder_levels(order=["two", "one"]) - - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - else: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) df2.iloc[0, 0] = 0 - if using_copy_on_write: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) tm.assert_frame_equal(df, df_orig) -def test_series_reorder_levels(using_copy_on_write): +def test_series_reorder_levels(): index = MultiIndex.from_tuples( [(1, 1), (1, 2), (2, 1), (2, 2)], names=["one", "two"] ) ser = Series([1, 2, 3, 4], index=index) ser_orig = ser.copy() ser2 = ser.reorder_levels(order=["two", "one"]) - - if using_copy_on_write: - assert np.shares_memory(ser2.values, ser.values) - else: - assert not np.shares_memory(ser2.values, ser.values) + assert np.shares_memory(ser2.values, ser.values) ser2.iloc[0] = 0 - if using_copy_on_write: - assert not np.shares_memory(ser2.values, ser.values) + assert not np.shares_memory(ser2.values, ser.values) tm.assert_series_equal(ser, ser_orig) @pytest.mark.parametrize("obj", [Series([1, 2, 3]), DataFrame({"a": [1, 2, 3]})]) -def test_swaplevel(using_copy_on_write, obj): +def test_swaplevel(obj): index = MultiIndex.from_tuples([(1, 1), (1, 2), (2, 1)], names=["one", "two"]) obj.index = index obj_orig = obj.copy() obj2 = obj.swaplevel() - - if using_copy_on_write: - assert np.shares_memory(obj2.values, obj.values) - else: - assert not np.shares_memory(obj2.values, obj.values) + assert np.shares_memory(obj2.values, obj.values) obj2.iloc[0] = 0 - if using_copy_on_write: - assert not np.shares_memory(obj2.values, obj.values) + assert not np.shares_memory(obj2.values, obj.values) tm.assert_equal(obj, obj_orig) -def test_frame_set_axis(using_copy_on_write): +def test_frame_set_axis(): # GH 49473 df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() df2 = df.set_axis(["a", "b", "c"], axis="index") - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - else: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) # mutating df2 triggers a copy-on-write for that column / block df2.iloc[0, 0] = 0 @@ -1265,16 +985,12 @@ def test_frame_set_axis(using_copy_on_write): tm.assert_frame_equal(df, df_orig) -def test_series_set_axis(using_copy_on_write): +def test_series_set_axis(): # GH 49473 ser = Series([1, 2, 3]) ser_orig = ser.copy() ser2 = ser.set_axis(["a", "b", 
"c"], axis="index") - - if using_copy_on_write: - assert np.shares_memory(ser, ser2) - else: - assert not np.shares_memory(ser, ser2) + assert np.shares_memory(ser, ser2) # mutating ser triggers a copy-on-write for the column / block ser2.iloc[0] = 0 @@ -1282,7 +998,7 @@ def test_series_set_axis(using_copy_on_write): tm.assert_series_equal(ser, ser_orig) -def test_set_flags(using_copy_on_write, warn_copy_on_write): +def test_set_flags(): ser = Series([1, 2, 3]) ser_orig = ser.copy() ser2 = ser.set_flags(allows_duplicate_labels=False) @@ -1290,49 +1006,34 @@ def test_set_flags(using_copy_on_write, warn_copy_on_write): assert np.shares_memory(ser, ser2) # mutating ser triggers a copy-on-write for the column / block - with tm.assert_cow_warning(warn_copy_on_write): - ser2.iloc[0] = 0 - if using_copy_on_write: - assert not np.shares_memory(ser2, ser) - tm.assert_series_equal(ser, ser_orig) - else: - assert np.shares_memory(ser2, ser) - expected = Series([0, 2, 3]) - tm.assert_series_equal(ser, expected) + ser2.iloc[0] = 0 + assert not np.shares_memory(ser2, ser) + tm.assert_series_equal(ser, ser_orig) @pytest.mark.parametrize("kwargs", [{"mapper": "test"}, {"index": "test"}]) -def test_rename_axis(using_copy_on_write, kwargs): +def test_rename_axis(kwargs): df = DataFrame({"a": [1, 2, 3, 4]}, index=Index([1, 2, 3, 4], name="a")) df_orig = df.copy() df2 = df.rename_axis(**kwargs) - - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - else: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) df2.iloc[0, 0] = 0 - if using_copy_on_write: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) tm.assert_frame_equal(df, df_orig) @pytest.mark.parametrize( "func, tz", [("tz_convert", "Europe/Berlin"), ("tz_localize", None)] ) -def test_tz_convert_localize(using_copy_on_write, func, tz): +def test_tz_convert_localize(func, tz): # GH 49473 ser = Series( [1, 2], index=date_range(start="2014-08-01 09:00", freq="h", periods=2, tz=tz) ) ser_orig = ser.copy() ser2 = getattr(ser, func)("US/Central") - - if using_copy_on_write: - assert np.shares_memory(ser.values, ser2.values) - else: - assert not np.shares_memory(ser.values, ser2.values) + assert np.shares_memory(ser.values, ser2.values) # mutating ser triggers a copy-on-write for the column / block ser2.iloc[0] = 0 @@ -1340,31 +1041,26 @@ def test_tz_convert_localize(using_copy_on_write, func, tz): tm.assert_series_equal(ser, ser_orig) -def test_droplevel(using_copy_on_write): +def test_droplevel(): # GH 49473 index = MultiIndex.from_tuples([(1, 1), (1, 2), (2, 1)], names=["one", "two"]) df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, index=index) df_orig = df.copy() df2 = df.droplevel(0) - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - else: - assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c")) - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) # mutating df2 triggers a copy-on-write for that column / block df2.iloc[0, 0] = 0 assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - if using_copy_on_write: - assert 
np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) tm.assert_frame_equal(df, df_orig) -def test_squeeze(using_copy_on_write, warn_copy_on_write): +def test_squeeze(): df = DataFrame({"a": [1, 2, 3]}) df_orig = df.copy() series = df.squeeze() @@ -1373,18 +1069,12 @@ def test_squeeze(using_copy_on_write, warn_copy_on_write): assert np.shares_memory(series.values, get_array(df, "a")) # mutating squeezed df triggers a copy-on-write for that column/block - with tm.assert_cow_warning(warn_copy_on_write): - series.iloc[0] = 0 - if using_copy_on_write: - assert not np.shares_memory(series.values, get_array(df, "a")) - tm.assert_frame_equal(df, df_orig) - else: - # Without CoW the original will be modified - assert np.shares_memory(series.values, get_array(df, "a")) - assert df.loc[0, "a"] == 0 + series.iloc[0] = 0 + assert not np.shares_memory(series.values, get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) -def test_items(using_copy_on_write, warn_copy_on_write): +def test_items(): df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) df_orig = df.copy() @@ -1395,74 +1085,52 @@ def test_items(using_copy_on_write, warn_copy_on_write): assert np.shares_memory(get_array(ser, name), get_array(df, name)) # mutating df triggers a copy-on-write for that column / block - with tm.assert_cow_warning(warn_copy_on_write): - ser.iloc[0] = 0 + ser.iloc[0] = 0 - if using_copy_on_write: - assert not np.shares_memory(get_array(ser, name), get_array(df, name)) - tm.assert_frame_equal(df, df_orig) - else: - # Original frame will be modified - assert df.loc[0, name] == 0 + assert not np.shares_memory(get_array(ser, name), get_array(df, name)) + tm.assert_frame_equal(df, df_orig) @pytest.mark.parametrize("dtype", ["int64", "Int64"]) -def test_putmask(using_copy_on_write, dtype, warn_copy_on_write): +def test_putmask(dtype): df = DataFrame({"a": [1, 2], "b": 1, "c": 2}, dtype=dtype) view = df[:] df_orig = df.copy() - with tm.assert_cow_warning(warn_copy_on_write): - df[df == df] = 5 + df[df == df] = 5 - if using_copy_on_write: - assert not np.shares_memory(get_array(view, "a"), get_array(df, "a")) - tm.assert_frame_equal(view, df_orig) - else: - # Without CoW the original will be modified - assert np.shares_memory(get_array(view, "a"), get_array(df, "a")) - assert view.iloc[0, 0] == 5 + assert not np.shares_memory(get_array(view, "a"), get_array(df, "a")) + tm.assert_frame_equal(view, df_orig) @pytest.mark.parametrize("dtype", ["int64", "Int64"]) -def test_putmask_no_reference(using_copy_on_write, dtype): +def test_putmask_no_reference(dtype): df = DataFrame({"a": [1, 2], "b": 1, "c": 2}, dtype=dtype) arr_a = get_array(df, "a") df[df == df] = 5 - - if using_copy_on_write: - assert np.shares_memory(arr_a, get_array(df, "a")) + assert np.shares_memory(arr_a, get_array(df, "a")) @pytest.mark.parametrize("dtype", ["float64", "Float64"]) -def test_putmask_aligns_rhs_no_reference(using_copy_on_write, dtype): +def test_putmask_aligns_rhs_no_reference(dtype): df = DataFrame({"a": [1.5, 2], "b": 1.5}, dtype=dtype) arr_a = get_array(df, "a") df[df == df] = DataFrame({"a": [5.5, 5]}) + assert np.shares_memory(arr_a, get_array(df, "a")) - if using_copy_on_write: - assert np.shares_memory(arr_a, get_array(df, "a")) - -@pytest.mark.parametrize( - "val, exp, warn", [(5.5, True, FutureWarning), (5, False, None)] -) -def test_putmask_dont_copy_some_blocks( - using_copy_on_write, val, exp, warn, warn_copy_on_write -): 
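# ----------------------------------------------------------------------
# [Editor's aside, not part of the patch] The putmask hunks around here
# all assert the same reference-tracking rule: a masked setitem such as
# df[mask] = value may write into the existing buffer only when no other
# pandas object references it; if a view is alive, CoW copies first and
# the view keeps the old data. A hypothetical sketch (pandas 3.x CoW
# semantics; relies on CPython refcounting so the temporary Series from
# df["a"] drops its reference immediately):
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1, 2]})
arr = df["a"].to_numpy()
df[df == df] = 5  # no live references -> updated in place
assert np.shares_memory(arr, df["a"].to_numpy())

df = pd.DataFrame({"a": [1, 2]})
view = df[:]  # a second object now references the buffer
df[df == df] = 5  # copy-on-write: df gets fresh data
assert not np.shares_memory(view["a"].to_numpy(), df["a"].to_numpy())
assert view["a"].tolist() == [1, 2]  # the view still sees the old values
# ----------------------------------------------------------------------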
+@pytest.mark.parametrize("val, exp, raises", [(5.5, True, True), (5, False, False)]) +def test_putmask_dont_copy_some_blocks(val, exp, raises: bool): df = DataFrame({"a": [1, 2], "b": 1, "c": 1.5}) view = df[:] df_orig = df.copy() indexer = DataFrame( [[True, False, False], [True, False, False]], columns=list("abc") ) - if warn_copy_on_write: - with tm.assert_cow_warning(): + if raises: + with pytest.raises(TypeError, match="Invalid value"): df[indexer] = val else: - with tm.assert_produces_warning(warn, match="incompatible dtype"): - df[indexer] = val - - if using_copy_on_write: + df[indexer] = val assert not np.shares_memory(get_array(view, "a"), get_array(df, "a")) # TODO(CoW): Could split blocks to avoid copying the whole block assert np.shares_memory(get_array(view, "b"), get_array(df, "b")) is exp @@ -1470,11 +1138,6 @@ def test_putmask_dont_copy_some_blocks( assert df._mgr._has_no_reference(1) is not exp assert not df._mgr._has_no_reference(2) tm.assert_frame_equal(view, df_orig) - elif val == 5: - # Without CoW the original will be modified, the other case upcasts, e.g. copy - assert np.shares_memory(get_array(view, "a"), get_array(df, "a")) - assert np.shares_memory(get_array(view, "c"), get_array(df, "c")) - assert view.iloc[0, 0] == 5 @pytest.mark.parametrize("dtype", ["int64", "Int64"]) @@ -1485,20 +1148,15 @@ def test_putmask_dont_copy_some_blocks( lambda ser: ser.mask(ser <= 0, 10), ], ) -def test_where_mask_noop(using_copy_on_write, dtype, func): +def test_where_mask_noop(dtype, func): ser = Series([1, 2, 3], dtype=dtype) ser_orig = ser.copy() result = func(ser) - - if using_copy_on_write: - assert np.shares_memory(get_array(ser), get_array(result)) - else: - assert not np.shares_memory(get_array(ser), get_array(result)) + assert np.shares_memory(get_array(ser), get_array(result)) result.iloc[0] = 10 - if using_copy_on_write: - assert not np.shares_memory(get_array(ser), get_array(result)) + assert not np.shares_memory(get_array(ser), get_array(result)) tm.assert_series_equal(ser, ser_orig) @@ -1510,7 +1168,7 @@ def test_where_mask_noop(using_copy_on_write, dtype, func): lambda ser: ser.mask(ser >= 0, 10), ], ) -def test_where_mask(using_copy_on_write, dtype, func): +def test_where_mask(dtype, func): ser = Series([1, 2, 3], dtype=dtype) ser_orig = ser.copy() @@ -1528,61 +1186,40 @@ def test_where_mask(using_copy_on_write, dtype, func): lambda df, val: df.mask(df >= 0, val), ], ) -def test_where_mask_noop_on_single_column(using_copy_on_write, dtype, val, func): +def test_where_mask_noop_on_single_column(dtype, val, func): df = DataFrame({"a": [1, 2, 3], "b": [-4, -5, -6]}, dtype=dtype) df_orig = df.copy() result = func(df, val) - - if using_copy_on_write: - assert np.shares_memory(get_array(df, "b"), get_array(result, "b")) - assert not np.shares_memory(get_array(df, "a"), get_array(result, "a")) - else: - assert not np.shares_memory(get_array(df, "b"), get_array(result, "b")) + assert np.shares_memory(get_array(df, "b"), get_array(result, "b")) + assert not np.shares_memory(get_array(df, "a"), get_array(result, "a")) result.iloc[0, 1] = 10 - if using_copy_on_write: - assert not np.shares_memory(get_array(df, "b"), get_array(result, "b")) + assert not np.shares_memory(get_array(df, "b"), get_array(result, "b")) tm.assert_frame_equal(df, df_orig) @pytest.mark.parametrize("func", ["mask", "where"]) -def test_chained_where_mask(using_copy_on_write, func): +def test_chained_where_mask(func): df = DataFrame({"a": [1, 4, 2], "b": 1}) df_orig = df.copy() - if using_copy_on_write: - 
with tm.raises_chained_assignment_error(): - getattr(df["a"], func)(df["a"] > 2, 5, inplace=True) - tm.assert_frame_equal(df, df_orig) - - with tm.raises_chained_assignment_error(): - getattr(df[["a"]], func)(df["a"] > 2, 5, inplace=True) - tm.assert_frame_equal(df, df_orig) - else: - with tm.assert_produces_warning(FutureWarning, match="inplace method"): - getattr(df["a"], func)(df["a"] > 2, 5, inplace=True) - - with tm.assert_produces_warning(None): - with option_context("mode.chained_assignment", None): - getattr(df[["a"]], func)(df["a"] > 2, 5, inplace=True) + with tm.raises_chained_assignment_error(): + getattr(df["a"], func)(df["a"] > 2, 5, inplace=True) + tm.assert_frame_equal(df, df_orig) - with tm.assert_produces_warning(None): - with option_context("mode.chained_assignment", None): - getattr(df[df["a"] > 1], func)(df["a"] > 2, 5, inplace=True) + with tm.raises_chained_assignment_error(): + getattr(df[["a"]], func)(df["a"] > 2, 5, inplace=True) + tm.assert_frame_equal(df, df_orig) -def test_asfreq_noop(using_copy_on_write): +def test_asfreq_noop(): df = DataFrame( {"a": [0.0, None, 2.0, 3.0]}, index=date_range("1/1/2000", periods=4, freq="min"), ) df_orig = df.copy() df2 = df.asfreq(freq="min") - - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - else: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) # mutating df2 triggers a copy-on-write for that column / block df2.iloc[0, 0] = 0 @@ -1591,68 +1228,50 @@ def test_asfreq_noop(using_copy_on_write): tm.assert_frame_equal(df, df_orig) -def test_iterrows(using_copy_on_write): +def test_iterrows(): df = DataFrame({"a": 0, "b": 1}, index=[1, 2, 3]) df_orig = df.copy() for _, sub in df.iterrows(): sub.iloc[0] = 100 - if using_copy_on_write: - tm.assert_frame_equal(df, df_orig) + tm.assert_frame_equal(df, df_orig) -def test_interpolate_creates_copy(using_copy_on_write, warn_copy_on_write): +def test_interpolate_creates_copy(): # GH#51126 df = DataFrame({"a": [1.5, np.nan, 3]}) view = df[:] expected = df.copy() - with tm.assert_cow_warning(warn_copy_on_write): - df.ffill(inplace=True) - with tm.assert_cow_warning(warn_copy_on_write): - df.iloc[0, 0] = 100.5 - - if using_copy_on_write: - tm.assert_frame_equal(view, expected) - else: - expected = DataFrame({"a": [100.5, 1.5, 3]}) - tm.assert_frame_equal(view, expected) + df.ffill(inplace=True) + df.iloc[0, 0] = 100.5 + tm.assert_frame_equal(view, expected) -def test_isetitem(using_copy_on_write): +def test_isetitem(): df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) df_orig = df.copy() - df2 = df.copy(deep=None) # Trigger a CoW + df2 = df.copy(deep=False) # Trigger a CoW df2.isetitem(1, np.array([-1, -2, -3])) # This is inplace - - if using_copy_on_write: - assert np.shares_memory(get_array(df, "c"), get_array(df2, "c")) - assert np.shares_memory(get_array(df, "a"), get_array(df2, "a")) - else: - assert not np.shares_memory(get_array(df, "c"), get_array(df2, "c")) - assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + assert np.shares_memory(get_array(df, "c"), get_array(df2, "c")) + assert np.shares_memory(get_array(df, "a"), get_array(df2, "a")) df2.loc[0, "a"] = 0 tm.assert_frame_equal(df, df_orig) # Original is unchanged - - if using_copy_on_write: - assert np.shares_memory(get_array(df, "c"), get_array(df2, "c")) - else: - assert not np.shares_memory(get_array(df, "c"), get_array(df2, "c")) + assert 
np.shares_memory(get_array(df, "c"), get_array(df2, "c")) @pytest.mark.parametrize( "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] ) -def test_isetitem_series(using_copy_on_write, dtype): +def test_isetitem_series(dtype): df = DataFrame({"a": [1, 2, 3], "b": np.array([4, 5, 6], dtype=dtype)}) ser = Series([7, 8, 9]) ser_orig = ser.copy() df.isetitem(0, ser) - if using_copy_on_write: - assert np.shares_memory(get_array(df, "a"), get_array(ser)) - assert not df._mgr._has_no_reference(0) + assert np.shares_memory(get_array(df, "a"), get_array(ser)) + assert not df._mgr._has_no_reference(0) # mutating dataframe doesn't update series df.loc[0, "a"] = 0 @@ -1668,17 +1287,13 @@ def test_isetitem_series(using_copy_on_write, dtype): tm.assert_frame_equal(df, expected) -def test_isetitem_frame(using_copy_on_write): +def test_isetitem_frame(): df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 2}) rhs = DataFrame({"a": [4, 5, 6], "b": 2}) df.isetitem([0, 1], rhs) - if using_copy_on_write: - assert np.shares_memory(get_array(df, "a"), get_array(rhs, "a")) - assert np.shares_memory(get_array(df, "b"), get_array(rhs, "b")) - assert not df._mgr._has_no_reference(0) - else: - assert not np.shares_memory(get_array(df, "a"), get_array(rhs, "a")) - assert not np.shares_memory(get_array(df, "b"), get_array(rhs, "b")) + assert np.shares_memory(get_array(df, "a"), get_array(rhs, "a")) + assert np.shares_memory(get_array(df, "b"), get_array(rhs, "b")) + assert not df._mgr._has_no_reference(0) expected = df.copy() rhs.iloc[0, 0] = 100 rhs.iloc[0, 1] = 100 @@ -1686,43 +1301,24 @@ def test_isetitem_frame(using_copy_on_write): @pytest.mark.parametrize("key", ["a", ["a"]]) -def test_get(using_copy_on_write, warn_copy_on_write, key): +def test_get(key): df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) df_orig = df.copy() result = df.get(key) - if using_copy_on_write: - assert np.shares_memory(get_array(result, "a"), get_array(df, "a")) - result.iloc[0] = 0 - assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) - tm.assert_frame_equal(df, df_orig) - else: - # for non-CoW it depends on whether we got a Series or DataFrame if it - # is a view or copy or triggers a warning or not - if warn_copy_on_write: - warn = FutureWarning if isinstance(key, str) else None - else: - warn = SettingWithCopyWarning if isinstance(key, list) else None - with option_context("chained_assignment", "warn"): - with tm.assert_produces_warning(warn): - result.iloc[0] = 0 - - if isinstance(key, list): - tm.assert_frame_equal(df, df_orig) - else: - assert df.iloc[0, 0] == 0 + assert np.shares_memory(get_array(result, "a"), get_array(df, "a")) + result.iloc[0] = 0 + assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) @pytest.mark.parametrize("axis, key", [(0, 0), (1, "a")]) @pytest.mark.parametrize( "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] ) -def test_xs( - using_copy_on_write, warn_copy_on_write, using_array_manager, axis, key, dtype -): - single_block = (dtype == "int64") and not using_array_manager - is_view = single_block or (using_array_manager and axis == 1) +def test_xs(axis, key, dtype): + single_block = dtype == "int64" df = DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)} ) @@ -1732,30 +1328,16 @@ def test_xs( if axis == 1 or single_block: assert np.shares_memory(get_array(df, "a"), get_array(result)) - elif using_copy_on_write: - assert result._mgr._has_no_reference(0) - - if 
using_copy_on_write or (is_view and not warn_copy_on_write): - result.iloc[0] = 0 - elif warn_copy_on_write: - with tm.assert_cow_warning(single_block or axis == 1): - result.iloc[0] = 0 else: - with option_context("chained_assignment", "warn"): - with tm.assert_produces_warning(SettingWithCopyWarning): - result.iloc[0] = 0 + assert result._mgr._has_no_reference(0) - if using_copy_on_write or (not single_block and axis == 0): - tm.assert_frame_equal(df, df_orig) - else: - assert df.iloc[0, 0] == 0 + result.iloc[0] = 0 + tm.assert_frame_equal(df, df_orig) @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("key, level", [("l1", 0), (2, 1)]) -def test_xs_multiindex( - using_copy_on_write, warn_copy_on_write, using_array_manager, key, level, axis -): +def test_xs_multiindex(key, level, axis): arr = np.arange(18).reshape(6, 3) index = MultiIndex.from_product([["l1", "l2"], [1, 2, 3]], names=["lev1", "lev2"]) df = DataFrame(arr, index=index, columns=list("abc")) @@ -1769,155 +1351,104 @@ def test_xs_multiindex( assert np.shares_memory( get_array(df, df.columns[0]), get_array(result, result.columns[0]) ) - - if warn_copy_on_write: - warn = FutureWarning if level == 0 else None - elif not using_copy_on_write and not using_array_manager: - warn = SettingWithCopyWarning - else: - warn = None - with option_context("chained_assignment", "warn"): - with tm.assert_produces_warning(warn): - result.iloc[0, 0] = 0 + result.iloc[0, 0] = 0 tm.assert_frame_equal(df, df_orig) -def test_update_frame(using_copy_on_write, warn_copy_on_write): +def test_update_frame(): df1 = DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]}) df2 = DataFrame({"b": [100.0]}, index=[1]) df1_orig = df1.copy() view = df1[:] - - # TODO(CoW) better warning message? - with tm.assert_cow_warning(warn_copy_on_write): - df1.update(df2) + df1.update(df2) expected = DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 100.0, 6.0]}) tm.assert_frame_equal(df1, expected) - if using_copy_on_write: - # df1 is updated, but its view not - tm.assert_frame_equal(view, df1_orig) - assert np.shares_memory(get_array(df1, "a"), get_array(view, "a")) - assert not np.shares_memory(get_array(df1, "b"), get_array(view, "b")) - else: - tm.assert_frame_equal(view, expected) + # df1 is updated, but its view not + tm.assert_frame_equal(view, df1_orig) + assert np.shares_memory(get_array(df1, "a"), get_array(view, "a")) + assert not np.shares_memory(get_array(df1, "b"), get_array(view, "b")) -def test_update_series(using_copy_on_write, warn_copy_on_write): +def test_update_series(): ser1 = Series([1.0, 2.0, 3.0]) ser2 = Series([100.0], index=[1]) ser1_orig = ser1.copy() view = ser1[:] - if warn_copy_on_write: - with tm.assert_cow_warning(): - ser1.update(ser2) - else: - ser1.update(ser2) + ser1.update(ser2) expected = Series([1.0, 100.0, 3.0]) tm.assert_series_equal(ser1, expected) - if using_copy_on_write: - # ser1 is updated, but its view not - tm.assert_series_equal(view, ser1_orig) - else: - tm.assert_series_equal(view, expected) + # ser1 is updated, but its view not + tm.assert_series_equal(view, ser1_orig) -def test_update_chained_assignment(using_copy_on_write): +def test_update_chained_assignment(): df = DataFrame({"a": [1, 2, 3]}) ser2 = Series([100.0], index=[1]) df_orig = df.copy() - if using_copy_on_write: - with tm.raises_chained_assignment_error(): - df["a"].update(ser2) - tm.assert_frame_equal(df, df_orig) - - with tm.raises_chained_assignment_error(): - df[["a"]].update(ser2.to_frame()) - tm.assert_frame_equal(df, df_orig) - else: - with 
tm.assert_produces_warning(FutureWarning, match="inplace method"): - df["a"].update(ser2) - - with tm.assert_produces_warning(None): - with option_context("mode.chained_assignment", None): - df[["a"]].update(ser2.to_frame()) + with tm.raises_chained_assignment_error(): + df["a"].update(ser2) + tm.assert_frame_equal(df, df_orig) - with tm.assert_produces_warning(None): - with option_context("mode.chained_assignment", None): - df[df["a"] > 1].update(ser2.to_frame()) + with tm.raises_chained_assignment_error(): + df[["a"]].update(ser2.to_frame()) + tm.assert_frame_equal(df, df_orig) -def test_inplace_arithmetic_series(using_copy_on_write): +def test_inplace_arithmetic_series(): ser = Series([1, 2, 3]) ser_orig = ser.copy() data = get_array(ser) ser *= 2 - if using_copy_on_write: - # https://github.com/pandas-dev/pandas/pull/55745 - # changed to NOT update inplace because there is no benefit (actual - # operation already done non-inplace). This was only for the optics - # of updating the backing array inplace, but we no longer want to make - # that guarantee - assert not np.shares_memory(get_array(ser), data) - tm.assert_numpy_array_equal(data, get_array(ser_orig)) - else: - assert np.shares_memory(get_array(ser), data) - tm.assert_numpy_array_equal(data, get_array(ser)) + # https://github.com/pandas-dev/pandas/pull/55745 + # changed to NOT update inplace because there is no benefit (actual + # operation already done non-inplace). This was only for the optics + # of updating the backing array inplace, but we no longer want to make + # that guarantee + assert not np.shares_memory(get_array(ser), data) + tm.assert_numpy_array_equal(data, get_array(ser_orig)) -def test_inplace_arithmetic_series_with_reference( - using_copy_on_write, warn_copy_on_write -): +def test_inplace_arithmetic_series_with_reference(): ser = Series([1, 2, 3]) ser_orig = ser.copy() view = ser[:] - with tm.assert_cow_warning(warn_copy_on_write): - ser *= 2 - if using_copy_on_write: - assert not np.shares_memory(get_array(ser), get_array(view)) - tm.assert_series_equal(ser_orig, view) - else: - assert np.shares_memory(get_array(ser), get_array(view)) + ser *= 2 + assert not np.shares_memory(get_array(ser), get_array(view)) + tm.assert_series_equal(ser_orig, view) -@pytest.mark.parametrize("copy", [True, False]) -def test_transpose(using_copy_on_write, copy, using_array_manager): +def test_transpose(): df = DataFrame({"a": [1, 2, 3], "b": 1}) df_orig = df.copy() - result = df.transpose(copy=copy) - - if not copy and not using_array_manager or using_copy_on_write: - assert np.shares_memory(get_array(df, "a"), get_array(result, 0)) - else: - assert not np.shares_memory(get_array(df, "a"), get_array(result, 0)) + result = df.transpose() + assert np.shares_memory(get_array(df, "a"), get_array(result, 0)) result.iloc[0, 0] = 100 - if using_copy_on_write: - tm.assert_frame_equal(df, df_orig) + tm.assert_frame_equal(df, df_orig) -def test_transpose_different_dtypes(using_copy_on_write): +def test_transpose_different_dtypes(): df = DataFrame({"a": [1, 2, 3], "b": 1.5}) df_orig = df.copy() result = df.T assert not np.shares_memory(get_array(df, "a"), get_array(result, 0)) result.iloc[0, 0] = 100 - if using_copy_on_write: - tm.assert_frame_equal(df, df_orig) + tm.assert_frame_equal(df, df_orig) -def test_transpose_ea_single_column(using_copy_on_write): +def test_transpose_ea_single_column(): df = DataFrame({"a": [1, 2, 3]}, dtype="Int64") result = df.T assert not np.shares_memory(get_array(df, "a"), get_array(result, 0)) -def 
test_transform_frame(using_copy_on_write, warn_copy_on_write): +def test_transform_frame(): df = DataFrame({"a": [1, 2, 3], "b": 1}) df_orig = df.copy() @@ -1925,13 +1456,11 @@ def func(ser): ser.iloc[0] = 100 return ser - with tm.assert_cow_warning(warn_copy_on_write): - df.transform(func) - if using_copy_on_write: - tm.assert_frame_equal(df, df_orig) + df.transform(func) + tm.assert_frame_equal(df, df_orig) -def test_transform_series(using_copy_on_write, warn_copy_on_write): +def test_transform_series(): ser = Series([1, 2, 3]) ser_orig = ser.copy() @@ -1939,10 +1468,8 @@ def func(ser): ser.iloc[0] = 100 return ser - with tm.assert_cow_warning(warn_copy_on_write): - ser.transform(func) - if using_copy_on_write: - tm.assert_series_equal(ser, ser_orig) + ser.transform(func) + tm.assert_series_equal(ser, ser_orig) def test_count_read_only_array(): @@ -1953,56 +1480,30 @@ def test_count_read_only_array(): tm.assert_series_equal(result, expected) -def test_series_view(using_copy_on_write, warn_copy_on_write): - ser = Series([1, 2, 3]) - ser_orig = ser.copy() - - with tm.assert_produces_warning(FutureWarning, match="is deprecated"): - ser2 = ser.view() - assert np.shares_memory(get_array(ser), get_array(ser2)) - if using_copy_on_write: - assert not ser2._mgr._has_no_reference(0) - - with tm.assert_cow_warning(warn_copy_on_write): - ser2.iloc[0] = 100 - - if using_copy_on_write: - tm.assert_series_equal(ser_orig, ser) - else: - expected = Series([100, 2, 3]) - tm.assert_series_equal(ser, expected) - - -def test_insert_series(using_copy_on_write): +def test_insert_series(): df = DataFrame({"a": [1, 2, 3]}) ser = Series([1, 2, 3]) ser_orig = ser.copy() df.insert(loc=1, value=ser, column="b") - if using_copy_on_write: - assert np.shares_memory(get_array(ser), get_array(df, "b")) - assert not df._mgr._has_no_reference(1) - else: - assert not np.shares_memory(get_array(ser), get_array(df, "b")) + assert np.shares_memory(get_array(ser), get_array(df, "b")) + assert not df._mgr._has_no_reference(1) df.iloc[0, 1] = 100 tm.assert_series_equal(ser, ser_orig) -def test_eval(using_copy_on_write): +def test_eval(): df = DataFrame({"a": [1, 2, 3], "b": 1}) df_orig = df.copy() result = df.eval("c = a+b") - if using_copy_on_write: - assert np.shares_memory(get_array(df, "a"), get_array(result, "a")) - else: - assert not np.shares_memory(get_array(df, "a"), get_array(result, "a")) + assert np.shares_memory(get_array(df, "a"), get_array(result, "a")) result.iloc[0, 0] = 100 tm.assert_frame_equal(df, df_orig) -def test_eval_inplace(using_copy_on_write, warn_copy_on_write): +def test_eval_inplace(): df = DataFrame({"a": [1, 2, 3], "b": 1}) df_orig = df.copy() df_view = df[:] @@ -2010,13 +1511,11 @@ def test_eval_inplace(using_copy_on_write, warn_copy_on_write): df.eval("c = a+b", inplace=True) assert np.shares_memory(get_array(df, "a"), get_array(df_view, "a")) - with tm.assert_cow_warning(warn_copy_on_write): - df.iloc[0, 0] = 100 - if using_copy_on_write: - tm.assert_frame_equal(df_view, df_orig) + df.iloc[0, 0] = 100 + tm.assert_frame_equal(df_view, df_orig) -def test_apply_modify_row(using_copy_on_write, warn_copy_on_write): +def test_apply_modify_row(): # Case: applying a function on each row as a Series object, where the # function mutates the row object (which needs to trigger CoW if row is a view) df = DataFrame({"A": [1, 2], "B": [3, 4]}) @@ -2026,13 +1525,9 @@ def transform(row): row["B"] = 100 return row - with tm.assert_cow_warning(warn_copy_on_write): - df.apply(transform, axis=1) + 
df.apply(transform, axis=1) - if using_copy_on_write: - tm.assert_frame_equal(df, df_orig) - else: - assert df.loc[0, "B"] == 100 + tm.assert_frame_equal(df, df_orig) # row Series is a copy df = DataFrame({"A": [1, 2], "B": ["b", "c"]}) diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index 6d16bc3083883..d4838a5e68ab8 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -4,7 +4,6 @@ from pandas import ( Categorical, DataFrame, - option_context, ) import pandas._testing as tm from pandas.tests.copy_view.util import get_array @@ -25,21 +24,19 @@ # 1 ], ) -def test_replace(using_copy_on_write, replace_kwargs): - df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["foo", "bar", "baz"]}) +def test_replace(replace_kwargs): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() df_replaced = df.replace(**replace_kwargs) - if using_copy_on_write: - if (df_replaced["b"] == df["b"]).all(): - assert np.shares_memory(get_array(df_replaced, "b"), get_array(df, "b")) - assert np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c")) + if (df_replaced["b"] == df["b"]).all(): + assert np.shares_memory(get_array(df_replaced, "b"), get_array(df, "b")) + assert tm.shares_memory(get_array(df_replaced, "c"), get_array(df, "c")) # mutating squeezed df triggers a copy-on-write for that column/block df_replaced.loc[0, "c"] = -1 - if using_copy_on_write: - assert not np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c")) + assert not np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c")) if "a" in replace_kwargs["to_replace"]: arr = get_array(df_replaced, "a") @@ -48,259 +45,177 @@ def test_replace(using_copy_on_write, replace_kwargs): tm.assert_frame_equal(df, df_orig) -def test_replace_regex_inplace_refs(using_copy_on_write, warn_copy_on_write): +def test_replace_regex_inplace_refs(): df = DataFrame({"a": ["aaa", "bbb"]}) df_orig = df.copy() view = df[:] arr = get_array(df, "a") - with tm.assert_cow_warning(warn_copy_on_write): - df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True) - if using_copy_on_write: - assert not np.shares_memory(arr, get_array(df, "a")) - assert df._mgr._has_no_reference(0) - tm.assert_frame_equal(view, df_orig) - else: - assert np.shares_memory(arr, get_array(df, "a")) + df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True) + assert not np.shares_memory(arr, get_array(df, "a")) + assert df._mgr._has_no_reference(0) + tm.assert_frame_equal(view, df_orig) -def test_replace_regex_inplace(using_copy_on_write): +def test_replace_regex_inplace(): df = DataFrame({"a": ["aaa", "bbb"]}) arr = get_array(df, "a") df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True) - if using_copy_on_write: - assert df._mgr._has_no_reference(0) - assert np.shares_memory(arr, get_array(df, "a")) + assert df._mgr._has_no_reference(0) + assert tm.shares_memory(arr, get_array(df, "a")) df_orig = df.copy() df2 = df.replace(to_replace=r"^b.*$", value="new", regex=True) tm.assert_frame_equal(df_orig, df) - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a")) -def test_replace_regex_inplace_no_op(using_copy_on_write): +def test_replace_regex_inplace_no_op(): df = DataFrame({"a": [1, 2]}) arr = get_array(df, "a") df.replace(to_replace=r"^a.$", value="new", inplace=True, regex=True) - if using_copy_on_write: - assert 
df._mgr._has_no_reference(0) + assert df._mgr._has_no_reference(0) assert np.shares_memory(arr, get_array(df, "a")) df_orig = df.copy() df2 = df.replace(to_replace=r"^x.$", value="new", regex=True) tm.assert_frame_equal(df_orig, df) - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - else: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) -def test_replace_mask_all_false_second_block(using_copy_on_write): +def test_replace_mask_all_false_second_block(): df = DataFrame({"a": [1.5, 2, 3], "b": 100.5, "c": 1, "d": 2}) df_orig = df.copy() df2 = df.replace(to_replace=1.5, value=55.5) - if using_copy_on_write: - # TODO: Block splitting would allow us to avoid copying b - assert np.shares_memory(get_array(df, "c"), get_array(df2, "c")) - assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) - - else: - assert not np.shares_memory(get_array(df, "c"), get_array(df2, "c")) - assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + # TODO: Block splitting would allow us to avoid copying b + assert np.shares_memory(get_array(df, "c"), get_array(df2, "c")) + assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) df2.loc[0, "c"] = 1 tm.assert_frame_equal(df, df_orig) # Original is unchanged - if using_copy_on_write: - assert not np.shares_memory(get_array(df, "c"), get_array(df2, "c")) - # TODO: This should split and not copy the whole block - # assert np.shares_memory(get_array(df, "d"), get_array(df2, "d")) + assert not np.shares_memory(get_array(df, "c"), get_array(df2, "c")) + assert np.shares_memory(get_array(df, "d"), get_array(df2, "d")) -def test_replace_coerce_single_column(using_copy_on_write, using_array_manager): +def test_replace_coerce_single_column(): df = DataFrame({"a": [1.5, 2, 3], "b": 100.5}) df_orig = df.copy() df2 = df.replace(to_replace=1.5, value="a") + assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) - if using_copy_on_write: - assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) - assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) - - elif not using_array_manager: - assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) - assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) - - if using_copy_on_write: - df2.loc[0, "b"] = 0.5 - tm.assert_frame_equal(df, df_orig) # Original is unchanged - assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + df2.loc[0, "b"] = 0.5 + tm.assert_frame_equal(df, df_orig) # Original is unchanged + assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b")) -def test_replace_to_replace_wrong_dtype(using_copy_on_write): +def test_replace_to_replace_wrong_dtype(): df = DataFrame({"a": [1.5, 2, 3], "b": 100.5}) df_orig = df.copy() df2 = df.replace(to_replace="xxx", value=1.5) - if using_copy_on_write: - assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) - assert np.shares_memory(get_array(df, "a"), get_array(df2, "a")) - - else: - assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b")) - assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + assert np.shares_memory(get_array(df, "a"), get_array(df2, "a")) df2.loc[0, "b"] = 0.5 tm.assert_frame_equal(df, df_orig) # Original is unchanged + assert not 
np.shares_memory(get_array(df, "b"), get_array(df2, "b")) - if using_copy_on_write: - assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b")) - -def test_replace_list_categorical(using_copy_on_write): +def test_replace_list_categorical(): df = DataFrame({"a": ["a", "b", "c"]}, dtype="category") arr = get_array(df, "a") - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - df.replace(["c"], value="a", inplace=True) + + df.replace(["c"], value="a", inplace=True) assert np.shares_memory(arr.codes, get_array(df, "a").codes) - if using_copy_on_write: - assert df._mgr._has_no_reference(0) + assert df._mgr._has_no_reference(0) df_orig = df.copy() - with tm.assert_produces_warning(FutureWarning, match=msg): - df2 = df.replace(["b"], value="a") + df.replace(["b"], value="a") + df2 = df.apply(lambda x: x.cat.rename_categories({"b": "d"})) assert not np.shares_memory(arr.codes, get_array(df2, "a").codes) tm.assert_frame_equal(df, df_orig) -def test_replace_list_inplace_refs_categorical(using_copy_on_write): +def test_replace_list_inplace_refs_categorical(): df = DataFrame({"a": ["a", "b", "c"]}, dtype="category") view = df[:] df_orig = df.copy() - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - df.replace(["c"], value="a", inplace=True) - if using_copy_on_write: - assert not np.shares_memory( - get_array(view, "a").codes, get_array(df, "a").codes - ) - tm.assert_frame_equal(df_orig, view) - else: - # This could be inplace - assert not np.shares_memory( - get_array(view, "a").codes, get_array(df, "a").codes - ) + df.replace(["c"], value="a", inplace=True) + tm.assert_frame_equal(df_orig, view) @pytest.mark.parametrize("to_replace", [1.5, [1.5], []]) -def test_replace_inplace(using_copy_on_write, to_replace): +def test_replace_inplace(to_replace): df = DataFrame({"a": [1.5, 2, 3]}) arr_a = get_array(df, "a") df.replace(to_replace=1.5, value=15.5, inplace=True) assert np.shares_memory(get_array(df, "a"), arr_a) - if using_copy_on_write: - assert df._mgr._has_no_reference(0) + assert df._mgr._has_no_reference(0) @pytest.mark.parametrize("to_replace", [1.5, [1.5]]) -def test_replace_inplace_reference(using_copy_on_write, to_replace, warn_copy_on_write): +def test_replace_inplace_reference(to_replace): df = DataFrame({"a": [1.5, 2, 3]}) arr_a = get_array(df, "a") view = df[:] - with tm.assert_cow_warning(warn_copy_on_write): - df.replace(to_replace=to_replace, value=15.5, inplace=True) + df.replace(to_replace=to_replace, value=15.5, inplace=True) - if using_copy_on_write: - assert not np.shares_memory(get_array(df, "a"), arr_a) - assert df._mgr._has_no_reference(0) - assert view._mgr._has_no_reference(0) - else: - assert np.shares_memory(get_array(df, "a"), arr_a) + assert not np.shares_memory(get_array(df, "a"), arr_a) + assert df._mgr._has_no_reference(0) + assert view._mgr._has_no_reference(0) @pytest.mark.parametrize("to_replace", ["a", 100.5]) -def test_replace_inplace_reference_no_op(using_copy_on_write, to_replace): +def test_replace_inplace_reference_no_op(to_replace): df = DataFrame({"a": [1.5, 2, 3]}) arr_a = get_array(df, "a") view = df[:] df.replace(to_replace=to_replace, value=15.5, inplace=True) assert np.shares_memory(get_array(df, "a"), arr_a) - if using_copy_on_write: - assert not df._mgr._has_no_reference(0) - assert not 
view._mgr._has_no_reference(0) + assert not df._mgr._has_no_reference(0) + assert not view._mgr._has_no_reference(0) @pytest.mark.parametrize("to_replace", [1, [1]]) -@pytest.mark.parametrize("val", [1, 1.5]) -def test_replace_categorical_inplace_reference(using_copy_on_write, val, to_replace): +def test_replace_categorical_inplace_reference(to_replace): df = DataFrame({"a": Categorical([1, 2, 3])}) df_orig = df.copy() arr_a = get_array(df, "a") view = df[:] - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" - ) - warn = FutureWarning if val == 1.5 else None - with tm.assert_produces_warning(warn, match=msg): - df.replace(to_replace=to_replace, value=val, inplace=True) - - if using_copy_on_write: - assert not np.shares_memory(get_array(df, "a").codes, arr_a.codes) - assert df._mgr._has_no_reference(0) - assert view._mgr._has_no_reference(0) - tm.assert_frame_equal(view, df_orig) - else: - assert np.shares_memory(get_array(df, "a").codes, arr_a.codes) - - -@pytest.mark.parametrize("val", [1, 1.5]) -def test_replace_categorical_inplace(using_copy_on_write, val): + df.replace(to_replace=to_replace, value=1, inplace=True) + assert not np.shares_memory(get_array(df, "a").codes, arr_a.codes) + assert df._mgr._has_no_reference(0) + assert view._mgr._has_no_reference(0) + tm.assert_frame_equal(view, df_orig) + + +def test_replace_categorical_inplace(): df = DataFrame({"a": Categorical([1, 2, 3])}) arr_a = get_array(df, "a") - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" - ) - warn = FutureWarning if val == 1.5 else None - with tm.assert_produces_warning(warn, match=msg): - df.replace(to_replace=1, value=val, inplace=True) + df.replace(to_replace=1, value=1, inplace=True) assert np.shares_memory(get_array(df, "a").codes, arr_a.codes) - if using_copy_on_write: - assert df._mgr._has_no_reference(0) + assert df._mgr._has_no_reference(0) - expected = DataFrame({"a": Categorical([val, 2, 3])}) + expected = DataFrame({"a": Categorical([1, 2, 3])}) tm.assert_frame_equal(df, expected) -@pytest.mark.parametrize("val", [1, 1.5]) -def test_replace_categorical(using_copy_on_write, val): +def test_replace_categorical(): df = DataFrame({"a": Categorical([1, 2, 3])}) df_orig = df.copy() - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" - ) - warn = FutureWarning if val == 1.5 else None - with tm.assert_produces_warning(warn, match=msg): - df2 = df.replace(to_replace=1, value=val) - - if using_copy_on_write: - assert df._mgr._has_no_reference(0) - assert df2._mgr._has_no_reference(0) + df2 = df.replace(to_replace=1, value=1) + + assert df._mgr._has_no_reference(0) + assert df2._mgr._has_no_reference(0) assert not np.shares_memory(get_array(df, "a").codes, get_array(df2, "a").codes) tm.assert_frame_equal(df, df_orig) @@ -310,72 +225,52 @@ def test_replace_categorical(using_copy_on_write, val): @pytest.mark.parametrize("method", ["where", "mask"]) -def test_masking_inplace(using_copy_on_write, method, warn_copy_on_write): +def test_masking_inplace(method): df = DataFrame({"a": [1.5, 2, 3]}) df_orig = df.copy() arr_a = get_array(df, "a") view = df[:] method = getattr(df, method) - if warn_copy_on_write: - with tm.assert_cow_warning(): - method(df["a"] > 1.6, -1, inplace=True) - else: - method(df["a"] > 1.6, -1, inplace=True) - - if using_copy_on_write: - assert not np.shares_memory(get_array(df, "a"), arr_a) - assert df._mgr._has_no_reference(0) - assert 
view._mgr._has_no_reference(0) - tm.assert_frame_equal(view, df_orig) - else: - assert np.shares_memory(get_array(df, "a"), arr_a) - - -def test_replace_empty_list(using_copy_on_write): + method(df["a"] > 1.6, -1, inplace=True) + + assert not np.shares_memory(get_array(df, "a"), arr_a) + assert df._mgr._has_no_reference(0) + assert view._mgr._has_no_reference(0) + tm.assert_frame_equal(view, df_orig) + + +def test_replace_empty_list(): df = DataFrame({"a": [1, 2]}) df2 = df.replace([], []) - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - assert not df._mgr._has_no_reference(0) - else: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert not df._mgr._has_no_reference(0) arr_a = get_array(df, "a") df.replace([], []) - if using_copy_on_write: - assert np.shares_memory(get_array(df, "a"), arr_a) - assert not df._mgr._has_no_reference(0) - assert not df2._mgr._has_no_reference(0) + assert np.shares_memory(get_array(df, "a"), arr_a) + assert not df._mgr._has_no_reference(0) + assert not df2._mgr._has_no_reference(0) @pytest.mark.parametrize("value", ["d", None]) -def test_replace_object_list_inplace(using_copy_on_write, value): - df = DataFrame({"a": ["a", "b", "c"]}) +def test_replace_object_list_inplace(value): + df = DataFrame({"a": ["a", "b", "c"]}, dtype=object) arr = get_array(df, "a") df.replace(["c"], value, inplace=True) - if using_copy_on_write or value is None: - assert np.shares_memory(arr, get_array(df, "a")) - else: - # This could be inplace - assert not np.shares_memory(arr, get_array(df, "a")) - if using_copy_on_write: - assert df._mgr._has_no_reference(0) + assert np.shares_memory(arr, get_array(df, "a")) + assert df._mgr._has_no_reference(0) -def test_replace_list_multiple_elements_inplace(using_copy_on_write): +def test_replace_list_multiple_elements_inplace(): df = DataFrame({"a": [1, 2, 3]}) arr = get_array(df, "a") df.replace([1, 2], 4, inplace=True) - if using_copy_on_write: - assert np.shares_memory(arr, get_array(df, "a")) - assert df._mgr._has_no_reference(0) - else: - assert np.shares_memory(arr, get_array(df, "a")) + assert np.shares_memory(arr, get_array(df, "a")) + assert df._mgr._has_no_reference(0) -def test_replace_list_none(using_copy_on_write): +def test_replace_list_none(): df = DataFrame({"a": ["a", "b", "c"]}) df_orig = df.copy() @@ -384,76 +279,61 @@ def test_replace_list_none(using_copy_on_write): assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + # replace multiple values that don't actually replace anything with None + # https://github.com/pandas-dev/pandas/issues/59770 + df3 = df.replace(["d", "e", "f"], value=None) + tm.assert_frame_equal(df3, df_orig) + assert tm.shares_memory(get_array(df, "a"), get_array(df3, "a")) -def test_replace_list_none_inplace_refs(using_copy_on_write, warn_copy_on_write): + +def test_replace_list_none_inplace_refs(): df = DataFrame({"a": ["a", "b", "c"]}) arr = get_array(df, "a") df_orig = df.copy() view = df[:] - with tm.assert_cow_warning(warn_copy_on_write): - df.replace(["a"], value=None, inplace=True) - if using_copy_on_write: - assert df._mgr._has_no_reference(0) - assert not np.shares_memory(arr, get_array(df, "a")) - tm.assert_frame_equal(df_orig, view) - else: - assert np.shares_memory(arr, get_array(df, "a")) + df.replace(["a"], value=None, inplace=True) + assert df._mgr._has_no_reference(0) + assert not np.shares_memory(arr, get_array(df, "a")) + 
tm.assert_frame_equal(df_orig, view) -def test_replace_columnwise_no_op_inplace(using_copy_on_write): +def test_replace_columnwise_no_op_inplace(): df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) view = df[:] df_orig = df.copy() df.replace({"a": 10}, 100, inplace=True) - if using_copy_on_write: - assert np.shares_memory(get_array(view, "a"), get_array(df, "a")) - df.iloc[0, 0] = 100 - tm.assert_frame_equal(view, df_orig) + assert np.shares_memory(get_array(view, "a"), get_array(df, "a")) + df.iloc[0, 0] = 100 + tm.assert_frame_equal(view, df_orig) -def test_replace_columnwise_no_op(using_copy_on_write): +def test_replace_columnwise_no_op(): df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) df_orig = df.copy() df2 = df.replace({"a": 10}, 100) - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) df2.iloc[0, 0] = 100 tm.assert_frame_equal(df, df_orig) -def test_replace_chained_assignment(using_copy_on_write): +def test_replace_chained_assignment(): df = DataFrame({"a": [1, np.nan, 2], "b": 1}) df_orig = df.copy() - if using_copy_on_write: - with tm.raises_chained_assignment_error(): - df["a"].replace(1, 100, inplace=True) - tm.assert_frame_equal(df, df_orig) - - with tm.raises_chained_assignment_error(): - df[["a"]].replace(1, 100, inplace=True) - tm.assert_frame_equal(df, df_orig) - else: - with tm.assert_produces_warning(None): - with option_context("mode.chained_assignment", None): - df[["a"]].replace(1, 100, inplace=True) - - with tm.assert_produces_warning(None): - with option_context("mode.chained_assignment", None): - df[df.a > 5].replace(1, 100, inplace=True) + with tm.raises_chained_assignment_error(): + df["a"].replace(1, 100, inplace=True) + tm.assert_frame_equal(df, df_orig) - with tm.assert_produces_warning(FutureWarning, match="inplace method"): - df["a"].replace(1, 100, inplace=True) + with tm.raises_chained_assignment_error(): + df[["a"]].replace(1, 100, inplace=True) + tm.assert_frame_equal(df, df_orig) -def test_replace_listlike(using_copy_on_write): +def test_replace_listlike(): df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) df_orig = df.copy() result = df.replace([200, 201], [11, 11]) - if using_copy_on_write: - assert np.shares_memory(get_array(result, "a"), get_array(df, "a")) - else: - assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) + assert np.shares_memory(get_array(result, "a"), get_array(df, "a")) result.iloc[0, 0] = 100 tm.assert_frame_equal(df, df) @@ -463,7 +343,7 @@ def test_replace_listlike(using_copy_on_write): tm.assert_frame_equal(df, df_orig) -def test_replace_listlike_inplace(using_copy_on_write, warn_copy_on_write): +def test_replace_listlike_inplace(): df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) arr = get_array(df, "a") df.replace([200, 2], [10, 11], inplace=True) @@ -471,11 +351,6 @@ def test_replace_listlike_inplace(using_copy_on_write, warn_copy_on_write): view = df[:] df_orig = df.copy() - with tm.assert_cow_warning(warn_copy_on_write): - df.replace([200, 3], [10, 11], inplace=True) - if using_copy_on_write: - assert not np.shares_memory(get_array(df, "a"), arr) - tm.assert_frame_equal(view, df_orig) - else: - assert np.shares_memory(get_array(df, "a"), arr) - tm.assert_frame_equal(df, view) + df.replace([200, 3], [10, 11], inplace=True) + assert not np.shares_memory(get_array(df, "a"), arr) + tm.assert_frame_equal(view, df_orig) diff --git a/pandas/tests/copy_view/test_setitem.py 
b/pandas/tests/copy_view/test_setitem.py index bc3b939734534..2f28e9826c7a1 100644 --- a/pandas/tests/copy_view/test_setitem.py +++ b/pandas/tests/copy_view/test_setitem.py @@ -28,7 +28,7 @@ def test_set_column_with_array(): tm.assert_series_equal(df["c"], Series([1, 2, 3], name="c")) -def test_set_column_with_series(using_copy_on_write): +def test_set_column_with_series(): # Case: setting a series as a new column (df[col] = s) copies that data # (with delayed copy with CoW) df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -36,11 +36,7 @@ def test_set_column_with_series(using_copy_on_write): df["c"] = ser - if using_copy_on_write: - assert np.shares_memory(get_array(df, "c"), get_array(ser)) - else: - # the series data is copied - assert not np.shares_memory(get_array(df, "c"), get_array(ser)) + assert np.shares_memory(get_array(df, "c"), get_array(ser)) # and modifying the series does not modify the DataFrame ser.iloc[0] = 0 @@ -48,7 +44,7 @@ def test_set_column_with_series(using_copy_on_write): tm.assert_series_equal(df["c"], Series([1, 2, 3], name="c")) -def test_set_column_with_index(using_copy_on_write): +def test_set_column_with_index(): # Case: setting an index as a new column (df[col] = idx) copies that data df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) idx = Index([1, 2, 3]) @@ -66,7 +62,7 @@ def test_set_column_with_index(using_copy_on_write): assert not np.shares_memory(get_array(df, "d"), arr) -def test_set_columns_with_dataframe(using_copy_on_write): +def test_set_columns_with_dataframe(): # Case: setting a DataFrame as new columns copies that data # (with delayed copy with CoW) df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -74,18 +70,13 @@ def test_set_columns_with_dataframe(using_copy_on_write): df[["c", "d"]] = df2 - if using_copy_on_write: - assert np.shares_memory(get_array(df, "c"), get_array(df2, "c")) - else: - # the data is copied - assert not np.shares_memory(get_array(df, "c"), get_array(df2, "c")) - + assert np.shares_memory(get_array(df, "c"), get_array(df2, "c")) # and modifying the set DataFrame does not modify the original DataFrame df2.iloc[0, 0] = 0 tm.assert_series_equal(df["c"], Series([7, 8, 9], name="c")) -def test_setitem_series_no_copy(using_copy_on_write): +def test_setitem_series_no_copy(): # Case: setting a Series as column into a DataFrame can delay copying that data df = DataFrame({"a": [1, 2, 3]}) rhs = Series([4, 5, 6]) @@ -93,42 +84,39 @@ def test_setitem_series_no_copy(using_copy_on_write): # adding a new column df["b"] = rhs - if using_copy_on_write: - assert np.shares_memory(get_array(rhs), get_array(df, "b")) + assert np.shares_memory(get_array(rhs), get_array(df, "b")) df.iloc[0, 1] = 100 tm.assert_series_equal(rhs, rhs_orig) -def test_setitem_series_no_copy_single_block(using_copy_on_write): +def test_setitem_series_no_copy_single_block(): # Overwriting an existing column that is a single block df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) rhs = Series([4, 5, 6]) rhs_orig = rhs.copy() df["a"] = rhs - if using_copy_on_write: - assert np.shares_memory(get_array(rhs), get_array(df, "a")) + assert np.shares_memory(get_array(rhs), get_array(df, "a")) df.iloc[0, 0] = 100 tm.assert_series_equal(rhs, rhs_orig) -def test_setitem_series_no_copy_split_block(using_copy_on_write): +def test_setitem_series_no_copy_split_block(): # Overwriting an existing column that is part of a larger block df = DataFrame({"a": [1, 2, 3], "b": 1}) rhs = Series([4, 5, 6]) rhs_orig = rhs.copy() df["b"] = rhs - if using_copy_on_write: - assert 
np.shares_memory(get_array(rhs), get_array(df, "b")) + assert np.shares_memory(get_array(rhs), get_array(df, "b")) df.iloc[0, 1] = 100 tm.assert_series_equal(rhs, rhs_orig) -def test_setitem_series_column_midx_broadcasting(using_copy_on_write): +def test_setitem_series_column_midx_broadcasting(): # Setting a Series to multiple columns will repeat the data # (currently copying the data eagerly) df = DataFrame( @@ -138,11 +126,10 @@ def test_setitem_series_column_midx_broadcasting(using_copy_on_write): rhs = Series([10, 11]) df["a"] = rhs assert not np.shares_memory(get_array(rhs), df._get_column_array(0)) - if using_copy_on_write: - assert df._mgr._has_no_reference(0) + assert df._mgr._has_no_reference(0) -def test_set_column_with_inplace_operator(using_copy_on_write, warn_copy_on_write): +def test_set_column_with_inplace_operator(): df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) # this should not raise any warning @@ -152,5 +139,4 @@ def test_set_column_with_inplace_operator(using_copy_on_writ # when it is not in a chain, then it should produce a warning df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) ser = df["a"] - with tm.assert_cow_warning(warn_copy_on_write): - ser += 1 + ser += 1 diff --git a/pandas/tests/dtypes/cast/test_construct_ndarray.py b/pandas/tests/dtypes/cast/test_construct_ndarray.py index ab468c81124bc..6b9b2dfda6e8b 100644 --- a/pandas/tests/dtypes/cast/test_construct_ndarray.py +++ b/pandas/tests/dtypes/cast/test_construct_ndarray.py @@ -21,7 +21,7 @@ def test_construct_1d_ndarray_preserving_na( ): result = sanitize_array(values, index=None, dtype=dtype) if using_infer_string and expected.dtype == object and dtype is None: - tm.assert_extension_array_equal(result, pd.array(expected)) + tm.assert_extension_array_equal(result, pd.array(expected, dtype="str")) else: tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/dtypes/cast/test_downcast.py b/pandas/tests/dtypes/cast/test_downcast.py index 9430ba2c478ae..69200b2e5fc96 100644 --- a/pandas/tests/dtypes/cast/test_downcast.py +++ b/pandas/tests/dtypes/cast/test_downcast.py @@ -33,9 +33,9 @@ ( # This is a judgement call, but we do _not_ downcast Decimal # objects - np.array([decimal.Decimal(0.0)]), + np.array([decimal.Decimal("0.0")]), "int64", - np.array([decimal.Decimal(0.0)]), + np.array([decimal.Decimal("0.0")]), ), ( # GH#45837 diff --git a/pandas/tests/dtypes/cast/test_maybe_box_native.py b/pandas/tests/dtypes/cast/test_maybe_box_native.py index 3f62f31dac219..151586962d517 100644 --- a/pandas/tests/dtypes/cast/test_maybe_box_native.py +++ b/pandas/tests/dtypes/cast/test_maybe_box_native.py @@ -17,7 +17,7 @@ "obj,expected_dtype", [ (b"\x00\x10", bytes), - (int(4), int), + (4, int), (np.uint(4), int), (np.int32(-4), int), (np.uint8(4), int), diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index c34c97b6e4f04..d30fa9fc2ea0f 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -3,6 +3,7 @@ import numpy as np import pytest +from pandas.compat import HAS_PYARROW import pandas.util._test_decorators as td from pandas.core.dtypes.astype import astype_array @@ -21,6 +22,7 @@ import pandas._testing as tm from pandas.api.types import pandas_dtype from pandas.arrays import SparseArray +from pandas.util.version import Version # EA & Actual Dtypes @@ -787,15 +789,84 @@ def test_validate_allhashable(): def test_pandas_dtype_numpy_warning(): # GH#51523 - with tm.assert_produces_warning( DeprecationWarning, -
check_stacklevel=False, - match="Converting `np.integer` or `np.signedinteger` to a dtype is deprecated", - ): + if Version(np.__version__) < Version("2.3.0.dev0"): + ctx = tm.assert_produces_warning( + DeprecationWarning, + check_stacklevel=False, + match=( + "Converting `np.integer` or `np.signedinteger` to a dtype is deprecated" + ), + ) + else: + ctx = tm.external_error_raised(TypeError) + + with ctx: pandas_dtype(np.integer) def test_pandas_dtype_ea_not_instance(): # GH 31356 GH 54592 - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="without any arguments"): assert pandas_dtype(CategoricalDtype) == CategoricalDtype() + + +def test_pandas_dtype_string_dtypes(string_storage): + with pd.option_context("future.infer_string", True): + # with the default string_storage setting + result = pandas_dtype("str") + assert result == pd.StringDtype( + "pyarrow" if HAS_PYARROW else "python", na_value=np.nan + ) + + with pd.option_context("future.infer_string", True): + # with the default string_storage setting + result = pandas_dtype(str) + assert result == pd.StringDtype( + "pyarrow" if HAS_PYARROW else "python", na_value=np.nan + ) + + with pd.option_context("future.infer_string", True): + with pd.option_context("string_storage", string_storage): + result = pandas_dtype("str") + assert result == pd.StringDtype(string_storage, na_value=np.nan) + + with pd.option_context("future.infer_string", True): + with pd.option_context("string_storage", string_storage): + result = pandas_dtype(str) + assert result == pd.StringDtype(string_storage, na_value=np.nan) + + with pd.option_context("future.infer_string", False): + with pd.option_context("string_storage", string_storage): + result = pandas_dtype("str") + assert result == np.dtype("U") + + with pd.option_context("string_storage", string_storage): + result = pandas_dtype("string") + assert result == pd.StringDtype(string_storage, na_value=pd.NA) + + +def test_pandas_dtype_string_dtype_alias_with_storage(): + with pytest.raises(TypeError, match="not understood"): + pandas_dtype("str[python]") + + with pytest.raises(TypeError, match="not understood"): + pandas_dtype("str[pyarrow]") + + result = pandas_dtype("string[python]") + assert result == pd.StringDtype("python", na_value=pd.NA) + + if HAS_PYARROW: + result = pandas_dtype("string[pyarrow]") + assert result == pd.StringDtype("pyarrow", na_value=pd.NA) + else: + with pytest.raises( + ImportError, match="required for PyArrow backed StringArray" + ): + pandas_dtype("string[pyarrow]") + + +@td.skip_if_installed("pyarrow") +def test_construct_from_string_without_pyarrow_installed(): + # GH 57928 + with pytest.raises(ImportError, match="pyarrow>=10.0.1 is required"): + pd.Series([-1.5, 0.2, None], dtype="float32[pyarrow]") diff --git a/pandas/tests/dtypes/test_concat.py b/pandas/tests/dtypes/test_concat.py index 97718386dabb7..571e12d0c3303 100644 --- a/pandas/tests/dtypes/test_concat.py +++ b/pandas/tests/dtypes/test_concat.py @@ -12,22 +12,18 @@ def test_concat_mismatched_categoricals_with_empty(): ser1 = Series(["a", "b", "c"], dtype="category") ser2 = Series([], dtype="category") - msg = "The behavior of array concatenation with empty entries is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = _concat.concat_compat([ser1._values, ser2._values]) - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = pd.concat([ser1, ser2])._values - tm.assert_categorical_equal(result, expected) + result = 
_concat.concat_compat([ser1._values, ser2._values]) + expected = pd.concat([ser1, ser2])._values + tm.assert_numpy_array_equal(result, expected) -@pytest.mark.parametrize("copy", [True, False]) -def test_concat_single_dataframe_tz_aware(copy): +def test_concat_single_dataframe_tz_aware(): # https://github.com/pandas-dev/pandas/issues/25257 df = pd.DataFrame( {"timestamp": [pd.Timestamp("2020-04-08 09:00:00.709949+0000", tz="UTC")]} ) expected = df.copy() - result = pd.concat([df], copy=copy) + result = pd.concat([df]) tm.assert_frame_equal(result, expected) @@ -49,3 +45,22 @@ def test_concat_periodarray_2d(): with pytest.raises(ValueError, match=msg): _concat.concat_compat([arr[:2], arr[2:]], axis=1) + + +def test_concat_series_between_empty_and_tzaware_series(using_infer_string): + tzaware_time = pd.Timestamp("2020-01-01T00:00:00+00:00") + ser1 = Series(index=[tzaware_time], data=0, dtype=float) + ser2 = Series(dtype=float) + + result = pd.concat([ser1, ser2], axis=1) + expected = pd.DataFrame( + data=[ + (0.0, None), + ], + index=[tzaware_time] + if using_infer_string + else pd.Index([tzaware_time], dtype=object), + columns=[0, 1], + dtype=float, + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 0dad0b05303ad..621217a8c9317 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -3,7 +3,6 @@ import numpy as np import pytest -import pytz from pandas._libs.tslibs.dtypes import NpyDatetimeUnit @@ -391,8 +390,9 @@ def test_empty(self): def test_tz_standardize(self): # GH 24713 + pytz = pytest.importorskip("pytz") tz = pytz.timezone("US/Eastern") - dr = date_range("2013-01-01", periods=3, tz="US/Eastern") + dr = date_range("2013-01-01", periods=3, tz=tz) dtype = DatetimeTZDtype("ns", dr.tz) assert dtype.tz == tz dtype = DatetimeTZDtype("ns", dr[0].tz) @@ -445,12 +445,12 @@ def test_construction(self): def test_cannot_use_custom_businessday(self): # GH#52534 - msg = "CustomBusinessDay is not supported as period frequency" + msg = "C is not supported as period frequency" + msg1 = " is not supported as period frequency" msg2 = r"PeriodDtype\[B\] is deprecated" - with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=msg2): - PeriodDtype("C") - with pytest.raises(TypeError, match=msg): + with pytest.raises(ValueError, match=msg): + PeriodDtype("C") + with pytest.raises(ValueError, match=msg1): with tm.assert_produces_warning(FutureWarning, match=msg2): PeriodDtype(pd.offsets.CustomBusinessDay()) @@ -660,8 +660,7 @@ def test_construction_generic(self, subtype): def test_construction_not_supported(self, subtype): # GH 19016 msg = ( - "category, object, and string subtypes are not supported " - "for IntervalDtype" + "category, object, and string subtypes are not supported for IntervalDtype" ) with pytest.raises(TypeError, match=msg): IntervalDtype(subtype) @@ -959,25 +958,24 @@ def test_same_categories_different_order(self): c2 = CategoricalDtype(["b", "a"], ordered=True) assert c1 is not c2 - @pytest.mark.parametrize("ordered1", [True, False, None]) @pytest.mark.parametrize("ordered2", [True, False, None]) - def test_categorical_equality(self, ordered1, ordered2): + def test_categorical_equality(self, ordered, ordered2): # same categories, same order # any combination of None/False are equal # True/True is the only combination with True that are equal - c1 = CategoricalDtype(list("abc"), ordered1) + c1 = CategoricalDtype(list("abc"), 
ordered) c2 = CategoricalDtype(list("abc"), ordered2) result = c1 == c2 - expected = bool(ordered1) is bool(ordered2) + expected = bool(ordered) is bool(ordered2) assert result is expected # same categories, different order # any combination of None/False are equal (order doesn't matter) # any combination with True are not equal (different order of cats) - c1 = CategoricalDtype(list("abc"), ordered1) + c1 = CategoricalDtype(list("abc"), ordered) c2 = CategoricalDtype(list("cab"), ordered2) result = c1 == c2 - expected = (bool(ordered1) is False) and (bool(ordered2) is False) + expected = (bool(ordered) is False) and (bool(ordered2) is False) assert result is expected # different categories @@ -985,9 +983,9 @@ def test_categorical_equality(self, ordered1, ordered2): assert c1 != c2 # none categories - c1 = CategoricalDtype(list("abc"), ordered1) + c1 = CategoricalDtype(list("abc"), ordered) c2 = CategoricalDtype(None, ordered2) - c3 = CategoricalDtype(None, ordered1) + c3 = CategoricalDtype(None, ordered) assert c1 != c2 assert c2 != c1 assert c2 == c3 @@ -1059,7 +1057,7 @@ def test_str_vs_repr(self, ordered, using_infer_string): c1 = CategoricalDtype(["a", "b"], ordered=ordered) assert str(c1) == "category" # Py2 will have unicode prefixes - dtype = "string" if using_infer_string else "object" + dtype = "str" if using_infer_string else "object" pat = ( r"CategoricalDtype\(categories=\[.*\], ordered={ordered}, " rf"categories_dtype={dtype}\)" @@ -1232,3 +1230,24 @@ def test_multi_column_dtype_assignment(): df["b"] = 0 tm.assert_frame_equal(df, expected) + + +def test_loc_setitem_empty_labels_no_dtype_conversion(): + # GH 29707 + + df = pd.DataFrame({"a": [2, 3]}) + expected = df.copy() + assert df.a.dtype == "int64" + df.loc[[]] = 0.1 + + assert df.a.dtype == "int64" + tm.assert_frame_equal(df, expected) + + +def test_categorical_nan_no_dtype_conversion(): + # GH 43996 + + df = pd.DataFrame({"a": Categorical([np.nan], [1]), "b": [1]}) + expected = pd.DataFrame({"a": Categorical([1], [1]), "b": [1]}) + df.loc[0, "a"] = np.array([1]) + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/dtypes/test_generic.py b/pandas/tests/dtypes/test_generic.py index 02c827853b29d..2b90886a8d070 100644 --- a/pandas/tests/dtypes/test_generic.py +++ b/pandas/tests/dtypes/test_generic.py @@ -20,8 +20,8 @@ class TestABCClasses: df = pd.DataFrame({"names": ["a", "b", "c"]}, index=multi_index) sparse_array = pd.arrays.SparseArray(np.random.default_rng(2).standard_normal(10)) - datetime_array = pd.core.arrays.DatetimeArray._from_sequence(datetime_index) - timedelta_array = pd.core.arrays.TimedeltaArray._from_sequence(timedelta_index) + datetime_array = datetime_index.array + timedelta_array = timedelta_index.array abc_pairs = [ ("ABCMultiIndex", multi_index), @@ -124,7 +124,7 @@ def test_setattr_warnings(): # this should not raise a warning df.two.not_an_index = [1, 2] - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="doesn't allow columns"): # warn when setting column to nonexistent name df.four = df.two + 2 assert df.four.sum() > df.two.sum() diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 49eb06c299886..db98751324ebc 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -3,6 +3,7 @@ related to inference and not otherwise tested in types/test_common.py """ + import collections from collections import namedtuple from collections.abc import Iterator @@ -11,6 
+12,7 @@ datetime, time, timedelta, + timezone, ) from decimal import Decimal from fractions import Fraction @@ -26,7 +28,6 @@ import numpy as np import pytest -import pytz from pandas._libs import ( lib, @@ -63,7 +64,6 @@ Index, Interval, Period, - PeriodIndex, Series, Timedelta, TimedeltaIndex, @@ -112,8 +112,8 @@ def it_outer(): def __len__(self) -> int: return len(self._values) - def __array__(self, t=None): - return np.asarray(self._values, dtype=t) + def __array__(self, dtype=None, copy=None): + return np.asarray(self._values, dtype=dtype) @property def ndim(self): @@ -240,8 +240,7 @@ def test_is_list_like_generic(): # is_list_like was yielding false positives for Generic classes in python 3.11 T = TypeVar("T") - class MyDataFrame(DataFrame, Generic[T]): - ... + class MyDataFrame(DataFrame, Generic[T]): ... tstc = MyDataFrame[int] tst = MyDataFrame[int]({"x": [1, 2, 3]}) @@ -656,9 +655,7 @@ def test_convert_numeric_uint64_nan_values( arr = np.array([2**63, 2**63 + 1], dtype=object) na_values = {2**63} - expected = ( - np.array([np.nan, 2**63 + 1], dtype=float) if coerce else arr.copy() - ) + expected = np.array([np.nan, 2**63 + 1], dtype=float) if coerce else arr.copy() result = lib.maybe_convert_numeric( arr, na_values, @@ -833,7 +830,11 @@ def test_maybe_convert_objects_datetime_overflow_safe(self, dtype): out = lib.maybe_convert_objects(arr, convert_non_numeric=True) # no OutOfBoundsDatetime/OutOfBoundsTimedeltas - tm.assert_numpy_array_equal(out, arr) + if dtype == "datetime64[ns]": + expected = np.array(["2363-10-04"], dtype="M8[us]") + else: + expected = arr + tm.assert_numpy_array_equal(out, expected) def test_maybe_convert_objects_mixed_datetimes(self): ts = Timestamp("now") @@ -939,9 +940,9 @@ def test_maybe_convert_objects_bool_nan(self): def test_maybe_convert_objects_nullable_boolean(self): # GH50047 arr = np.array([True, False], dtype=object) - exp = np.array([True, False]) + exp = BooleanArray._from_sequence([True, False], dtype="boolean") out = lib.maybe_convert_objects(arr, convert_to_nullable_dtype=True) - tm.assert_numpy_array_equal(out, exp) + tm.assert_extension_array_equal(out, exp) arr = np.array([True, False, pd.NaT], dtype=object) exp = np.array([True, False, pd.NaT], dtype=object) @@ -1021,7 +1022,7 @@ def test_maybe_convert_objects_itemsize(self, data0, data1): def test_mixed_dtypes_remain_object_array(self): # GH14956 - arr = np.array([datetime(2015, 1, 1, tzinfo=pytz.utc), 1], dtype=object) + arr = np.array([datetime(2015, 1, 1, tzinfo=timezone.utc), 1], dtype=object) result = lib.maybe_convert_objects(arr, convert_non_numeric=True) tm.assert_numpy_array_equal(result, arr) @@ -1052,7 +1053,6 @@ def test_inferred_dtype_fixture(self, any_skipna_inferred_dtype): # make sure the inferred dtype of the fixture is as requested assert inferred_dtype == lib.infer_dtype(values, skipna=True) - @pytest.mark.parametrize("skipna", [True, False]) def test_length_zero(self, skipna): result = lib.infer_dtype(np.array([], dtype="i4"), skipna=skipna) assert result == "integer" @@ -1081,15 +1081,15 @@ def test_integers(self): @pytest.mark.parametrize( "arr, skipna", [ - (np.array([1, 2, np.nan, np.nan, 3], dtype="O"), False), - (np.array([1, 2, np.nan, np.nan, 3], dtype="O"), True), - (np.array([1, 2, 3, np.int64(4), np.int32(5), np.nan], dtype="O"), False), - (np.array([1, 2, 3, np.int64(4), np.int32(5), np.nan], dtype="O"), True), + ([1, 2, np.nan, np.nan, 3], False), + ([1, 2, np.nan, np.nan, 3], True), + ([1, 2, 3, np.int64(4), np.int32(5), np.nan], False), + ([1, 2, 3, 
np.int64(4), np.int32(5), np.nan], True), ], ) def test_integer_na(self, arr, skipna): # GH 27392 - result = lib.infer_dtype(arr, skipna=skipna) + result = lib.infer_dtype(np.array(arr, dtype="O"), skipna=skipna) expected = "integer" if skipna else "integer-na" assert result == expected @@ -1164,7 +1164,6 @@ def test_decimals(self): assert result == "decimal" # complex is compatible with nan, so skipna has no effect - @pytest.mark.parametrize("skipna", [True, False]) def test_complex(self, skipna): # gets cast to complex on array construction arr = np.array([1.0, 2.0, 1 + 1j]) @@ -1291,13 +1290,13 @@ def test_infer_dtype_mixed_integer(self): @pytest.mark.parametrize( "arr", [ - np.array([Timestamp("2011-01-01"), Timestamp("2011-01-02")]), - np.array([datetime(2011, 1, 1), datetime(2012, 2, 1)]), - np.array([datetime(2011, 1, 1), Timestamp("2011-01-02")]), + [Timestamp("2011-01-01"), Timestamp("2011-01-02")], + [datetime(2011, 1, 1), datetime(2012, 2, 1)], + [datetime(2011, 1, 1), Timestamp("2011-01-02")], ], ) def test_infer_dtype_datetime(self, arr): - assert lib.infer_dtype(arr, skipna=True) == "datetime" + assert lib.infer_dtype(np.array(arr), skipna=True) == "datetime" @pytest.mark.parametrize("na_value", [pd.NaT, np.nan]) @pytest.mark.parametrize( @@ -1343,9 +1342,8 @@ def test_infer_dtype_period(self): arr = np.array([Period("2011-01", freq="D"), Period("2011-02", freq="M")]) assert lib.infer_dtype(arr, skipna=True) == "mixed" - @pytest.mark.parametrize("klass", [pd.array, Series, Index]) - @pytest.mark.parametrize("skipna", [True, False]) - def test_infer_dtype_period_array(self, klass, skipna): + def test_infer_dtype_period_array(self, index_or_series_or_array, skipna): + klass = index_or_series_or_array # https://github.com/pandas-dev/pandas/issues/23553 values = klass( [ @@ -1534,7 +1532,6 @@ def test_date(self): [pd.NaT, date(2020, 1, 1)], ], ) - @pytest.mark.parametrize("skipna", [True, False]) def test_infer_dtype_date_order_invariant(self, values, skipna): # https://github.com/pandas-dev/pandas/issues/33741 result = lib.infer_dtype(values, skipna=skipna) @@ -1585,6 +1582,31 @@ def test_is_string_array(self): ) assert not lib.is_string_array(np.array([1, 2])) + @pytest.mark.parametrize( + "func", + [ + "is_bool_array", + "is_date_array", + "is_datetime_array", + "is_datetime64_array", + "is_float_array", + "is_integer_array", + "is_interval_array", + "is_string_array", + "is_time_array", + "is_timedelta_or_timedelta64_array", + ], + ) + def test_is_dtype_array_empty_obj(self, func): + # https://github.com/pandas-dev/pandas/pull/60796 + func = getattr(lib, func) + + arr = np.empty((2, 0), dtype=object) + assert not func(arr) + + arr = np.empty((0, 2), dtype=object) + assert not func(arr) + def test_to_object_array_tuples(self): r = (5, 6) values = [r] @@ -1623,25 +1645,6 @@ def test_to_object_array_width(self): out = lib.to_object_array(rows, min_width=5) tm.assert_numpy_array_equal(out, expected) - def test_is_period(self): - # GH#55264 - msg = "is_period is deprecated and will be removed in a future version" - with tm.assert_produces_warning(FutureWarning, match=msg): - assert lib.is_period(Period("2011-01", freq="M")) - assert not lib.is_period(PeriodIndex(["2011-01"], freq="M")) - assert not lib.is_period(Timestamp("2011-01")) - assert not lib.is_period(1) - assert not lib.is_period(np.nan) - - def test_is_interval(self): - # GH#55264 - msg = "is_interval is deprecated and will be removed in a future version" - item = Interval(1, 2) - with 
tm.assert_produces_warning(FutureWarning, match=msg): - assert lib.is_interval(item) - assert not lib.is_interval(pd.IntervalIndex([item])) - assert not lib.is_interval(pd.IntervalIndex([item])._engine) - def test_categorical(self): # GH 8974 arr = Categorical(list("abc")) @@ -1705,21 +1708,19 @@ def test_interval_mismatched_subtype(self): arr = np.array([first, flt_interval], dtype=object) assert lib.infer_dtype(arr, skipna=False) == "interval" - @pytest.mark.parametrize("klass", [pd.array, Series]) - @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("data", [["a", "b", "c"], ["a", "b", pd.NA]]) - def test_string_dtype(self, data, skipna, klass, nullable_string_dtype): + def test_string_dtype( + self, data, skipna, index_or_series_or_array, nullable_string_dtype + ): # StringArray - val = klass(data, dtype=nullable_string_dtype) + val = index_or_series_or_array(data, dtype=nullable_string_dtype) inferred = lib.infer_dtype(val, skipna=skipna) assert inferred == "string" - @pytest.mark.parametrize("klass", [pd.array, Series]) - @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("data", [[True, False, True], [True, False, pd.NA]]) - def test_boolean_dtype(self, data, skipna, klass): + def test_boolean_dtype(self, data, skipna, index_or_series_or_array): # BooleanArray - val = klass(data, dtype="boolean") + val = index_or_series_or_array(data, dtype="boolean") inferred = lib.infer_dtype(val, skipna=skipna) assert inferred == "boolean" @@ -1910,14 +1911,15 @@ def test_is_scalar_numpy_array_scalars(self): @pytest.mark.parametrize( "zerodim", [ - np.array(1), - np.array("foobar"), - np.array(np.datetime64("2014-01-01")), - np.array(np.timedelta64(1, "h")), - np.array(np.datetime64("NaT")), + 1, + "foobar", + np.datetime64("2014-01-01"), + np.timedelta64(1, "h"), + np.datetime64("NaT"), ], ) def test_is_scalar_numpy_zerodim_arrays(self, zerodim): + zerodim = np.array(zerodim) assert not is_scalar(zerodim) assert is_scalar(lib.item_from_zerodim(zerodim)) @@ -1948,7 +1950,7 @@ def test_is_scalar_pandas_containers(self): assert not is_scalar(pd.array([1, 2, 3])) def test_is_scalar_number(self): - # Number() is not recognied by PyNumber_Check, so by extension + # Number() is not recognized by PyNumber_Check, so by extension # is not recognized by is_scalar, but instances of non-abstract # subclasses are. 
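# Aside: a minimal sketch (not from this patch) of the is_scalar behavior the
# comment above describes; it assumes only pandas and the stdlib.
#
#     import numbers
#     from fractions import Fraction
#
#     from pandas.api.types import is_scalar
#
#     assert not is_scalar(numbers.Number())  # bare Number() fails PyNumber_Check
#     assert is_scalar(Fraction(3, 5))        # concrete numeric subclass passes
#     assert is_scalar(1.5)                   # plain Python scalars pass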
@@ -1983,9 +1985,12 @@ def test_nan_to_nat_conversions(): @pytest.mark.filterwarnings("ignore::PendingDeprecationWarning") +@pytest.mark.parametrize("spmatrix", ["bsr", "coo", "csc", "csr", "dia", "dok", "lil"]) def test_is_scipy_sparse(spmatrix): - pytest.importorskip("scipy") - assert is_scipy_sparse(spmatrix([[0, 1]])) + sparse = pytest.importorskip("scipy.sparse") + + klass = getattr(sparse, spmatrix + "_matrix") + assert is_scipy_sparse(klass([[0, 1]])) assert not is_scipy_sparse(np.array([1])) diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index e1f8d8eca2537..a5b22ac30d820 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -1,15 +1,11 @@ -from contextlib import nullcontext from datetime import datetime from decimal import Decimal import numpy as np import pytest -from pandas._config import config as cf - from pandas._libs import missing as libmissing from pandas._libs.tslibs import iNaT -from pandas.compat.numpy import np_version_gte1p25 from pandas.core.dtypes.common import ( is_float, @@ -54,25 +50,6 @@ def test_notna_notnull(notna_f): assert not notna_f(None) assert not notna_f(np.nan) - msg = "use_inf_as_na option is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - with cf.option_context("mode.use_inf_as_na", False): - assert notna_f(np.inf) - assert notna_f(-np.inf) - - arr = np.array([1.5, np.inf, 3.5, -np.inf]) - result = notna_f(arr) - assert result.all() - - with tm.assert_produces_warning(FutureWarning, match=msg): - with cf.option_context("mode.use_inf_as_na", True): - assert not notna_f(np.inf) - assert not notna_f(-np.inf) - - arr = np.array([1.5, np.inf, 3.5, -np.inf]) - result = notna_f(arr) - assert result.sum() == 2 - @pytest.mark.parametrize("null_func", [notna, notnull, isna, isnull]) @pytest.mark.parametrize( @@ -88,10 +65,7 @@ def test_notna_notnull(notna_f): ], ) def test_null_check_is_series(null_func, ser): - msg = "use_inf_as_na option is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - with cf.option_context("mode.use_inf_as_na", False): - assert isinstance(null_func(ser), Series) + assert isinstance(null_func(ser), Series) class TestIsNA: @@ -232,10 +206,7 @@ def test_isna_old_datetimelike(self): objs = [dta, dta.tz_localize("US/Eastern"), dta - dta, dta.to_period("D")] for obj in objs: - msg = "use_inf_as_na option is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - with cf.option_context("mode.use_inf_as_na", True): - result = isna(obj) + result = isna(obj) tm.assert_numpy_array_equal(result, expected) @@ -350,7 +321,7 @@ def test_period(self): def test_decimal(self): # scalars GH#23530 - a = Decimal(1.0) + a = Decimal("1.0") assert isna(a) is False assert notna(a) is True @@ -485,15 +456,7 @@ def test_array_equivalent_dti(dtype_equal): ) def test_array_equivalent_series(val): arr = np.array([1, 2]) - msg = "elementwise comparison failed" - cm = ( - # stacklevel is chosen to make sense when called from .equals - tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False) - if isinstance(val, str) and not np_version_gte1p25 - else nullcontext() - ) - with cm: - assert not array_equivalent(Series([arr, arr]), Series([arr, val])) + assert not array_equivalent(Series([arr, arr]), Series([arr, val])) def test_array_equivalent_array_mismatched_shape(): @@ -531,7 +494,6 @@ def test_array_equivalent_different_dtype_but_equal(): (fix_now, fix_utcnow), (fix_now.to_datetime64(), 
fix_utcnow), (fix_now.to_pydatetime(), fix_utcnow), - (fix_now, fix_utcnow), (fix_now.to_datetime64(), fix_utcnow.to_pydatetime()), (fix_now.to_pydatetime(), fix_utcnow.to_pydatetime()), ], @@ -725,6 +687,9 @@ def test_array_equivalent_index_with_tuples(): ("f2", np.nan), ("f4", np.nan), ("f8", np.nan), + # Complex + ("c8", np.nan), + ("c16", np.nan), # Object ("O", np.nan), # Interval @@ -744,23 +709,17 @@ class TestNAObj: def _check_behavior(self, arr, expected): result = libmissing.isnaobj(arr) tm.assert_numpy_array_equal(result, expected) - result = libmissing.isnaobj(arr, inf_as_na=True) - tm.assert_numpy_array_equal(result, expected) arr = np.atleast_2d(arr) expected = np.atleast_2d(expected) result = libmissing.isnaobj(arr) tm.assert_numpy_array_equal(result, expected) - result = libmissing.isnaobj(arr, inf_as_na=True) - tm.assert_numpy_array_equal(result, expected) # Test fortran order arr = arr.copy(order="F") result = libmissing.isnaobj(arr) tm.assert_numpy_array_equal(result, expected) - result = libmissing.isnaobj(arr, inf_as_na=True) - tm.assert_numpy_array_equal(result, expected) def test_basic(self): arr = np.array([1, None, "foo", -5.1, NaT, np.nan]) @@ -810,8 +769,8 @@ def test_empty_like(self): np.datetime64("NaT"), np.timedelta64("NaT"), ] - + [np.datetime64("NaT", unit) for unit in m8_units] - + [np.timedelta64("NaT", unit) for unit in m8_units] + + [np.datetime64("NaT", unit) for unit in m8_units] # type: ignore[call-overload] + + [np.timedelta64("NaT", unit) for unit in m8_units] # type: ignore[call-overload] ) inf_vals = [ @@ -843,7 +802,8 @@ def test_empty_like(self): class TestLibMissing: @pytest.mark.parametrize("func", [libmissing.checknull, isna]) @pytest.mark.parametrize( - "value", na_vals + sometimes_na_vals # type: ignore[operator] + "value", + na_vals + sometimes_na_vals, # type: ignore[operator] ) def test_checknull_na_vals(self, func, value): assert func(value) @@ -864,22 +824,15 @@ def test_checknull_never_na_vals(self, func, value): assert not func(value) @pytest.mark.parametrize( - "value", na_vals + sometimes_na_vals # type: ignore[operator] + "value", + na_vals + sometimes_na_vals, # type: ignore[operator] ) def test_checknull_old_na_vals(self, value): - assert libmissing.checknull(value, inf_as_na=True) - - @pytest.mark.parametrize("value", inf_vals) - def test_checknull_old_inf_vals(self, value): - assert libmissing.checknull(value, inf_as_na=True) + assert libmissing.checknull(value) @pytest.mark.parametrize("value", int_na_vals) def test_checknull_old_intna_vals(self, value): - assert not libmissing.checknull(value, inf_as_na=True) - - @pytest.mark.parametrize("value", int_na_vals) - def test_checknull_old_never_na_vals(self, value): - assert not libmissing.checknull(value, inf_as_na=True) + assert not libmissing.checknull(value) def test_is_matching_na(self, nulls_fixture, nulls_fixture2): left = nulls_fixture diff --git a/pandas/tests/extension/array_with_attr/array.py b/pandas/tests/extension/array_with_attr/array.py index d0249d9af8098..4f65424ece145 100644 --- a/pandas/tests/extension/array_with_attr/array.py +++ b/pandas/tests/extension/array_with_attr/array.py @@ -2,6 +2,7 @@ Test extension array that has custom attribute information (not stored on the dtype). 
""" + from __future__ import annotations import numbers @@ -49,7 +50,10 @@ def __init__(self, values, attr=None) -> None: @classmethod def _from_sequence(cls, scalars, *, dtype=None, copy=False): - data = np.array(scalars, dtype="float64", copy=copy) + if not copy: + data = np.asarray(scalars, dtype="float64") + else: + data = np.array(scalars, dtype="float64", copy=copy) return cls(data) def __getitem__(self, item): diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py index 6efaa95aef1b5..2abf739d2428d 100644 --- a/pandas/tests/extension/base/__init__.py +++ b/pandas/tests/extension/base/__init__.py @@ -34,6 +34,7 @@ class TestMyDtype(BaseDtypeTests): wherever the test requires it. You're free to implement additional tests. """ + from pandas.tests.extension.base.accumulate import BaseAccumulateTests from pandas.tests.extension.base.casting import BaseCastingTests from pandas.tests.extension.base.constructors import BaseConstructorsTests @@ -63,9 +64,7 @@ class TestMyDtype(BaseDtypeTests): # One test class that you can inherit as an alternative to inheriting all the # test classes above. -# Note 1) this excludes Dim2CompatTests and NDArrayBacked2DTests. -# Note 2) this uses BaseReduceTests and and _not_ BaseBooleanReduceTests, -# BaseNoReduceTests, or BaseNumericReduceTests +# Note this excludes Dim2CompatTests and NDArrayBacked2DTests. class ExtensionTests( BaseAccumulateTests, BaseCastingTests, @@ -88,44 +87,3 @@ class ExtensionTests( Dim2CompatTests, ): pass - - -def __getattr__(name: str): - import warnings - - if name == "BaseNoReduceTests": - warnings.warn( - "BaseNoReduceTests is deprecated and will be removed in a " - "future version. Use BaseReduceTests and override " - "`_supports_reduction` instead.", - FutureWarning, - ) - from pandas.tests.extension.base.reduce import BaseNoReduceTests - - return BaseNoReduceTests - - elif name == "BaseNumericReduceTests": - warnings.warn( - "BaseNumericReduceTests is deprecated and will be removed in a " - "future version. Use BaseReduceTests and override " - "`_supports_reduction` instead.", - FutureWarning, - ) - from pandas.tests.extension.base.reduce import BaseNumericReduceTests - - return BaseNumericReduceTests - - elif name == "BaseBooleanReduceTests": - warnings.warn( - "BaseBooleanReduceTests is deprecated and will be removed in a " - "future version. Use BaseReduceTests and override " - "`_supports_reduction` instead.", - FutureWarning, - ) - from pandas.tests.extension.base.reduce import BaseBooleanReduceTests - - return BaseBooleanReduceTests - - raise AttributeError( - f"module 'pandas.tests.extension.base' has no attribute '{name}'" - ) diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py index 9a41a3a582c4a..9a2f186c2a00b 100644 --- a/pandas/tests/extension/base/accumulate.py +++ b/pandas/tests/extension/base/accumulate.py @@ -18,8 +18,9 @@ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: def check_accumulate(self, ser: pd.Series, op_name: str, skipna: bool): try: alt = ser.astype("float64") - except TypeError: - # e.g. Period can't be cast to float64 + except (TypeError, ValueError): + # e.g. 
Period can't be cast to float64 (TypeError) + # String can't be cast to float64 (ValueError) alt = ser.astype(object) result = getattr(ser, op_name)(skipna=skipna) diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index 2bfe801c48a77..8e3f21e1a4f56 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -30,8 +30,9 @@ def test_astype_object_frame(self, all_data): blk = result._mgr.blocks[0] assert isinstance(blk, NumpyBlock), type(blk) assert blk.is_object - assert isinstance(result._mgr.arrays[0], np.ndarray) - assert result._mgr.arrays[0].dtype == np.dtype(object) + arr = result._mgr.blocks[0].values + assert isinstance(arr, np.ndarray) + assert arr.dtype == np.dtype(object) # check that we can compare the dtypes comp = result.dtypes == df.dtypes @@ -43,8 +44,8 @@ def test_tolist(self, data): assert result == expected def test_astype_str(self, data): - result = pd.Series(data[:5]).astype(str) - expected = pd.Series([str(x) for x in data[:5]], dtype=str) + result = pd.Series(data[:2]).astype(str) + expected = pd.Series([str(x) for x in data[:2]], dtype=str) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/extension/base/constructors.py b/pandas/tests/extension/base/constructors.py index c32a6a6a115ac..639dc874c9fb9 100644 --- a/pandas/tests/extension/base/constructors.py +++ b/pandas/tests/extension/base/constructors.py @@ -69,7 +69,7 @@ def test_dataframe_constructor_from_dict(self, data, from_series): assert result.shape == (len(data), 1) if hasattr(result._mgr, "blocks"): assert isinstance(result._mgr.blocks[0], EABackedBlock) - assert isinstance(result._mgr.arrays[0], ExtensionArray) + assert isinstance(result._mgr.blocks[0].values, ExtensionArray) def test_dataframe_from_series(self, data): result = pd.DataFrame(pd.Series(data)) @@ -77,7 +77,7 @@ def test_dataframe_from_series(self, data): assert result.shape == (len(data), 1) if hasattr(result._mgr, "blocks"): assert isinstance(result._mgr.blocks[0], EABackedBlock) - assert isinstance(result._mgr.arrays[0], ExtensionArray) + assert isinstance(result._mgr.blocks[0].values, ExtensionArray) def test_series_given_mismatched_index_raises(self, data): msg = r"Length of values \(3\) does not match length of index \(5\)" diff --git a/pandas/tests/extension/base/dim2.py b/pandas/tests/extension/base/dim2.py index 132cda5a94ed0..8c7d8ff491cd3 100644 --- a/pandas/tests/extension/base/dim2.py +++ b/pandas/tests/extension/base/dim2.py @@ -1,6 +1,7 @@ """ Tests for 2D compatibility. 
""" + import numpy as np import pytest @@ -90,9 +91,9 @@ def test_reshape(self, data): assert arr2d.shape == (data.size, 1) assert len(arr2d) == len(data) - with pytest.raises(ValueError): + with tm.external_error_raised(ValueError): data.reshape((data.size, 2)) - with pytest.raises(ValueError): + with tm.external_error_raised(ValueError): data.reshape(data.size, 2) def test_getitem_2d(self, data): diff --git a/pandas/tests/extension/base/dtype.py b/pandas/tests/extension/base/dtype.py index c7b768f6e3c88..38478ed3c40ae 100644 --- a/pandas/tests/extension/base/dtype.py +++ b/pandas/tests/extension/base/dtype.py @@ -85,7 +85,7 @@ def test_str(self, dtype): def test_eq(self, dtype): assert dtype == dtype.name - assert dtype != "anonther_type" + assert dtype != "another_type" def test_construct_from_string_own_name(self, dtype): result = dtype.construct_from_string(dtype.name) diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 5f0c1b960a475..1f3680bf67e90 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -139,8 +139,8 @@ def test_getitem_invalid(self, data): "index out of bounds", # pyarrow "Out of bounds access", # Sparse f"loc must be an integer between -{ub} and {ub}", # Sparse - f"index {ub+1} is out of bounds for axis 0 with size {ub}", - f"index -{ub+1} is out of bounds for axis 0 with size {ub}", + f"index {ub + 1} is out of bounds for axis 0 with size {ub}", + f"index -{ub + 1} is out of bounds for axis 0 with size {ub}", ] ) with pytest.raises(IndexError, match=msg): @@ -329,11 +329,10 @@ def test_get(self, data): result = s.get("Z") assert result is None - msg = "Series.__getitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - assert s.get(4) == s.iloc[4] - assert s.get(-1) == s.iloc[-1] - assert s.get(len(s)) is None + # As of 3.0, getitem with int keys treats them as labels + assert s.get(4) is None + assert s.get(-1) is None + assert s.get(len(s)) is None # GH 21257 s = pd.Series(data) @@ -394,7 +393,7 @@ def test_take_non_na_fill_value(self, data_missing): tm.assert_extension_array_equal(result, expected) def test_take_pandas_style_negative_raises(self, data, na_value): - with pytest.raises(ValueError, match=""): + with tm.external_error_raised(ValueError): data.take([0, -2], fill_value=na_value, allow_fill=True) @pytest.mark.parametrize("allow_fill", [True, False]) @@ -409,7 +408,7 @@ def test_take_series(self, data): result = s.take([0, -1]) expected = pd.Series( data._from_sequence([data[0], data[len(data) - 1]], dtype=s.dtype), - index=[0, len(data) - 1], + index=range(0, 198, 99), ) tm.assert_series_equal(result, expected) @@ -429,7 +428,8 @@ def test_reindex(self, data, na_value): result = s.reindex([n, n + 1]) expected = pd.Series( - data._from_sequence([na_value, na_value], dtype=s.dtype), index=[n, n + 1] + data._from_sequence([na_value, na_value], dtype=s.dtype), + index=range(n, n + 2, 1), ) tm.assert_series_equal(result, expected) @@ -451,7 +451,7 @@ def test_loc_len1(self, data): df = pd.DataFrame({"A": data}) res = df.loc[[0], "A"] assert res.ndim == 1 - assert res._mgr.arrays[0].ndim == 1 + assert res._mgr.blocks[0].ndim == 1 if hasattr(res._mgr, "blocks"): assert res._mgr._block.ndim == 1 diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index 75628ea177fc2..60cade97ab528 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py 
@@ -113,14 +113,10 @@ def test_groupby_extension_transform(self, data_for_grouping): def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - df.groupby("B", group_keys=False).apply(groupby_apply_op) - df.groupby("B", group_keys=False).A.apply(groupby_apply_op) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - df.groupby("A", group_keys=False).apply(groupby_apply_op) - df.groupby("A", group_keys=False).B.apply(groupby_apply_op) + df.groupby("B", group_keys=False, observed=False).apply(groupby_apply_op) + df.groupby("B", group_keys=False, observed=False).A.apply(groupby_apply_op) + df.groupby("A", group_keys=False, observed=False).apply(groupby_apply_op) + df.groupby("A", group_keys=False, observed=False).B.apply(groupby_apply_op) def test_groupby_apply_identity(self, data_for_grouping): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) @@ -162,8 +158,10 @@ def test_in_numeric_groupby(self, data_for_grouping): msg = "|".join( [ - # period/datetime + # period "does not support sum operations", + # datetime + "does not support operation 'sum'", # all others re.escape(f"agg function failed [how->sum,dtype->{dtype}"), ] diff --git a/pandas/tests/extension/base/index.py b/pandas/tests/extension/base/index.py index 72c4ebfb5d84a..e7bfebec92287 100644 --- a/pandas/tests/extension/base/index.py +++ b/pandas/tests/extension/base/index.py @@ -1,6 +1,7 @@ """ Tests for Indexes backed by arbitrary ExtensionArrays. """ + import pandas as pd diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index 6683c87e2b8fc..79eb64b5a654f 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.compat.numpy import np_version_gt2 + from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import is_extension_array_dtype from pandas.core.dtypes.dtypes import ExtensionDtype @@ -71,6 +73,25 @@ def test_array_interface(self, data): expected = construct_1d_object_array_from_listlike(list(data)) tm.assert_numpy_array_equal(result, expected) + def test_array_interface_copy(self, data): + result_copy1 = np.array(data, copy=True) + result_copy2 = np.array(data, copy=True) + assert not np.may_share_memory(result_copy1, result_copy2) + + if not np_version_gt2: + # copy=False semantics are only supported in NumPy>=2. 
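# Aside: a minimal sketch (not from this patch) of the NumPy >= 2 copy
# semantics this test relies on. With copy=False, np.array must either reuse
# the input's memory or raise ValueError; NumPy < 2 instead treated
# copy=False as "copy only if needed".
#
#     import numpy as np
#
#     base = np.arange(3)
#     view = np.array(base, copy=False)  # ndarray input: buffer is reused
#     assert np.shares_memory(base, view)
#
#     np.array([1, 2, 3], copy=False)  # list input needs a copy: raises ValueError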
+ return + + try: + result_nocopy1 = np.array(data, copy=False) + except ValueError: + # An error is always acceptable for `copy=False` + return + + result_nocopy2 = np.array(data, copy=False) + # If copy=False was given and did not raise, these must share the same data + assert np.may_share_memory(result_nocopy1, result_nocopy2) + def test_is_extension_array_dtype(self, data): assert is_extension_array_dtype(data) assert is_extension_array_dtype(data.dtype) diff --git a/pandas/tests/extension/base/io.py b/pandas/tests/extension/base/io.py index 3a6f2eb5ba8b1..f7367fceeb52f 100644 --- a/pandas/tests/extension/base/io.py +++ b/pandas/tests/extension/base/io.py @@ -13,7 +13,7 @@ class BaseParsingTests: def test_EA_types(self, engine, data, request): if isinstance(data.dtype, pd.CategoricalDtype): # in parsers.pyx _convert_with_dtype there is special-casing for - # Categorical that pre-empts _from_sequence_of_strings + # Categorical that preempts _from_sequence_of_strings pass elif isinstance(data.dtype, pd.core.dtypes.dtypes.NumpyEADtype): # These get unwrapped internally so are treated as numpy dtypes diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index c803a8113b4a4..fd9fec0cb490c 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -66,14 +66,14 @@ def test_value_counts_with_normalize(self, data): expected = pd.Series(0.0, index=result.index, name="proportion") expected[result > 0] = 1 / len(values) - if getattr(data.dtype, "storage", "") == "pyarrow" or isinstance( + if isinstance(data.dtype, pd.StringDtype) and data.dtype.na_value is np.nan: + # TODO: avoid special-casing + expected = expected.astype("float64") + elif getattr(data.dtype, "storage", "") == "pyarrow" or isinstance( data.dtype, pd.ArrowDtype ): # TODO: avoid special-casing expected = expected.astype("double[pyarrow]") - elif getattr(data.dtype, "storage", "") == "pyarrow_numpy": - # TODO: avoid special-casing - expected = expected.astype("float64") elif na_value_for_dtype(data.dtype) is pd.NA: # TODO(GH#44692): avoid special-casing expected = expected.astype("Float64") @@ -116,10 +116,8 @@ def test_argsort_missing_array(self, data_missing_for_sorting): tm.assert_numpy_array_equal(result, expected) def test_argsort_missing(self, data_missing_for_sorting): - msg = "The behavior of Series.argsort in the presence of NA values" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = pd.Series(data_missing_for_sorting).argsort() - expected = pd.Series(np.array([1, -1, 0], dtype=np.intp)) + result = pd.Series(data_missing_for_sorting).argsort() + expected = pd.Series(np.array([2, 0, 1], dtype=np.intp)) tm.assert_series_equal(result, expected) def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting, na_value): @@ -169,8 +167,8 @@ def test_argmin_argmax_all_na(self, method, data, na_value): ("idxmin", True, 2), ("argmax", True, 0), ("argmin", True, 2), - ("idxmax", False, np.nan), - ("idxmin", False, np.nan), + ("idxmax", False, -1), + ("idxmin", False, -1), ("argmax", False, -1), ("argmin", False, -1), ], @@ -179,26 +177,22 @@ def test_argreduce_series( self, data_missing_for_sorting, op_name, skipna, expected ): # data_missing_for_sorting -> [B, NA, A] with A < B and NA missing. 
- warn = None - msg = "The behavior of Series.argmax/argmin" - if op_name.startswith("arg") and expected == -1: - warn = FutureWarning - if op_name.startswith("idx") and np.isnan(expected): - warn = FutureWarning - msg = f"The behavior of Series.{op_name}" ser = pd.Series(data_missing_for_sorting) - with tm.assert_produces_warning(warn, match=msg): + if expected == -1: + with pytest.raises(ValueError, match="Encountered an NA value"): + getattr(ser, op_name)(skipna=skipna) + else: result = getattr(ser, op_name)(skipna=skipna) - tm.assert_almost_equal(result, expected) + tm.assert_almost_equal(result, expected) def test_argmax_argmin_no_skipna_notimplemented(self, data_missing_for_sorting): # GH#38733 data = data_missing_for_sorting - with pytest.raises(NotImplementedError, match=""): + with pytest.raises(ValueError, match="Encountered an NA value"): data.argmin(skipna=False) - with pytest.raises(NotImplementedError, match=""): + with pytest.raises(ValueError, match="Encountered an NA value"): data.argmax(skipna=False) @pytest.mark.parametrize( @@ -303,6 +297,20 @@ def test_factorize_empty(self, data): tm.assert_numpy_array_equal(codes, expected_codes) tm.assert_extension_array_equal(uniques, expected_uniques) + def test_fillna_limit_frame(self, data_missing): + # GH#58001 + df = pd.DataFrame({"A": data_missing.take([0, 1, 0, 1])}) + expected = pd.DataFrame({"A": data_missing.take([1, 1, 0, 1])}) + result = df.fillna(value=data_missing[1], limit=1) + tm.assert_frame_equal(result, expected) + + def test_fillna_limit_series(self, data_missing): + # GH#58001 + ser = pd.Series(data_missing.take([0, 1, 0, 1])) + expected = pd.Series(data_missing.take([1, 1, 0, 1])) + result = ser.fillna(value=data_missing[1], limit=1) + tm.assert_series_equal(result, expected) + def test_fillna_copy_frame(self, data_missing): arr = data_missing.take([1, 1]) df = pd.DataFrame({"A": arr}) @@ -541,7 +549,7 @@ def _test_searchsorted_bool_dtypes(self, data_for_sorting, as_series): dtype = data_for_sorting.dtype data_for_sorting = pd.array([True, False], dtype=dtype) b, a = data_for_sorting - arr = type(data_for_sorting)._from_sequence([a, b]) + arr = type(data_for_sorting)._from_sequence([a, b], dtype=dtype) if as_series: arr = pd.Series(arr) diff --git a/pandas/tests/extension/base/missing.py b/pandas/tests/extension/base/missing.py index ffb7a24b4b390..cee565d4f7c1e 100644 --- a/pandas/tests/extension/base/missing.py +++ b/pandas/tests/extension/base/missing.py @@ -27,7 +27,9 @@ def test_isna_returns_copy(self, data_missing, na_func): expected = result.copy() mask = getattr(result, na_func)() if isinstance(mask.dtype, pd.SparseDtype): + # TODO: GH 57739 mask = np.array(mask) + mask.flags.writeable = True mask[:] = True tm.assert_series_equal(result, expected) @@ -68,21 +70,43 @@ def test_fillna_scalar(self, data_missing): expected = data_missing.fillna(valid) tm.assert_extension_array_equal(result, expected) - @pytest.mark.filterwarnings( - "ignore:Series.fillna with 'method' is deprecated:FutureWarning" - ) + def test_fillna_with_none(self, data_missing): + # GH#57723 + result = data_missing.fillna(None) + expected = data_missing + tm.assert_extension_array_equal(result, expected) + def test_fillna_limit_pad(self, data_missing): arr = data_missing.take([1, 0, 0, 0, 1]) result = pd.Series(arr).ffill(limit=2) expected = pd.Series(data_missing.take([1, 1, 1, 0, 1])) tm.assert_series_equal(result, expected) - @pytest.mark.filterwarnings( - "ignore:Series.fillna with 'method' is deprecated:FutureWarning" + 
@pytest.mark.parametrize( + "limit_area, input_ilocs, expected_ilocs", + [ + ("outside", [1, 0, 0, 0, 1], [1, 0, 0, 0, 1]), + ("outside", [1, 0, 1, 0, 1], [1, 0, 1, 0, 1]), + ("outside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 1]), + ("outside", [0, 1, 0, 1, 0], [0, 1, 0, 1, 1]), + ("inside", [1, 0, 0, 0, 1], [1, 1, 1, 1, 1]), + ("inside", [1, 0, 1, 0, 1], [1, 1, 1, 1, 1]), + ("inside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 0]), + ("inside", [0, 1, 0, 1, 0], [0, 1, 1, 1, 0]), + ], ) + def test_ffill_limit_area( + self, data_missing, limit_area, input_ilocs, expected_ilocs + ): + # GH#56616 + arr = data_missing.take(input_ilocs) + result = pd.Series(arr).ffill(limit_area=limit_area) + expected = pd.Series(data_missing.take(expected_ilocs)) + tm.assert_series_equal(result, expected) + def test_fillna_limit_backfill(self, data_missing): arr = data_missing.take([1, 0, 0, 0, 1]) - result = pd.Series(arr).fillna(method="backfill", limit=2) + result = pd.Series(arr).bfill(limit=2) expected = pd.Series(data_missing.take([1, 0, 1, 1, 1])) tm.assert_series_equal(result, expected) @@ -155,12 +179,3 @@ def test_fillna_fill_other(self, data): expected = pd.DataFrame({"A": data, "B": [0.0] * len(result)}) tm.assert_frame_equal(result, expected) - - def test_use_inf_as_na_no_effect(self, data_missing): - ser = pd.Series(data_missing) - expected = ser.isna() - msg = "use_inf_as_na option is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - with pd.option_context("mode.use_inf_as_na", True): - result = ser.isna() - tm.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py index 5cd66d8a874c7..222ff42d45052 100644 --- a/pandas/tests/extension/base/ops.py +++ b/pandas/tests/extension/base/ops.py @@ -5,8 +5,6 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype - from pandas.core.dtypes.common import is_string_dtype import pandas as pd @@ -22,7 +20,7 @@ class BaseOpsUtil: def _get_expected_exception( self, op_name: str, obj, other - ) -> type[Exception] | None: + ) -> type[Exception] | tuple[type[Exception], ...] 
| None: # Find the Exception, if any we expect to raise calling # obj.__op_name__(other) @@ -37,14 +35,6 @@ def _get_expected_exception( else: result = self.frame_scalar_exc - if using_pyarrow_string_dtype() and result is not None: - import pyarrow as pa - - result = ( # type: ignore[assignment] - result, - pa.lib.ArrowNotImplementedError, - NotImplementedError, - ) return result def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py index 6ea1b3a6fbe9d..4b3431d938f96 100644 --- a/pandas/tests/extension/base/reduce.py +++ b/pandas/tests/extension/base/reduce.py @@ -4,7 +4,6 @@ import pandas as pd import pandas._testing as tm -from pandas.api.types import is_numeric_dtype class BaseReduceTests: @@ -57,7 +56,7 @@ def check_reduce_frame(self, ser: pd.Series, op_name: str, skipna: bool): arr = ser.array df = pd.DataFrame({"a": arr}) - kwargs = {"ddof": 1} if op_name in ["var", "std"] else {} + kwargs = {"ddof": 1} if op_name in ["var", "std", "sem"] else {} cmp_dtype = self._get_expected_reduction_dtype(arr, op_name, skipna) @@ -86,7 +85,7 @@ def test_reduce_series_boolean(self, data, all_boolean_reductions, skipna): # TODO: the message being checked here isn't actually checking anything msg = ( "[Cc]annot perform|Categorical is not ordered for operation|" - "does not support reduction|" + "does not support operation|" ) with pytest.raises(TypeError, match=msg): @@ -105,7 +104,7 @@ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): # TODO: the message being checked here isn't actually checking anything msg = ( "[Cc]annot perform|Categorical is not ordered for operation|" - "does not support reduction|" + "does not support operation|" ) with pytest.raises(TypeError, match=msg): @@ -119,35 +118,11 @@ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): def test_reduce_frame(self, data, all_numeric_reductions, skipna): op_name = all_numeric_reductions ser = pd.Series(data) - if not is_numeric_dtype(ser.dtype): - pytest.skip(f"{ser.dtype} is not numeric dtype") - if op_name in ["count", "kurt", "sem"]: + if op_name == "count": pytest.skip(f"{op_name} not an array method") if not self._supports_reduction(ser, op_name): pytest.skip(f"Reduction {op_name} not supported for this dtype") self.check_reduce_frame(ser, op_name, skipna) - - -# TODO(3.0): remove BaseNoReduceTests, BaseNumericReduceTests, -# BaseBooleanReduceTests -class BaseNoReduceTests(BaseReduceTests): - """we don't define any reductions""" - - -class BaseNumericReduceTests(BaseReduceTests): - # For backward compatibility only, this only runs the numeric reductions - def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: - if op_name in ["any", "all"]: - pytest.skip("These are tested in BaseBooleanReduceTests") - return True - - -class BaseBooleanReduceTests(BaseReduceTests): - # For backward compatibility only, this only runs the numeric reductions - def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: - if op_name not in ["any", "all"]: - pytest.skip("These are tested in BaseNumericReduceTests") - return True diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index 4550e3b055cfe..a760cbc3995b3 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas.core.dtypes.dtypes import NumpyEADtype + import 
pandas as pd import pandas._testing as tm from pandas.api.extensions import ExtensionArray @@ -29,12 +31,12 @@ def test_concat(self, data, in_frame): assert dtype == data.dtype if hasattr(result._mgr, "blocks"): assert isinstance(result._mgr.blocks[0], EABackedBlock) - assert isinstance(result._mgr.arrays[0], ExtensionArray) + assert isinstance(result._mgr.blocks[0].values, ExtensionArray) @pytest.mark.parametrize("in_frame", [True, False]) def test_concat_all_na_block(self, data_missing, in_frame): - valid_block = pd.Series(data_missing.take([1, 1]), index=[0, 1]) - na_block = pd.Series(data_missing.take([0, 0]), index=[2, 3]) + valid_block = pd.Series(data_missing.take([1, 1]), index=range(2)) + na_block = pd.Series(data_missing.take([0, 0]), index=range(2, 4)) if in_frame: valid_block = pd.DataFrame({"a": valid_block}) na_block = pd.DataFrame({"a": na_block}) @@ -106,7 +108,7 @@ def test_concat_extension_arrays_copy_false(self, data, na_value): "B": data[3:7], } ) - result = pd.concat([df1, df2], axis=1, copy=False) + result = pd.concat([df1, df2], axis=1) tm.assert_frame_equal(result, expected) def test_concat_with_reindex(self, data): @@ -266,7 +268,13 @@ def test_stack(self, data, columns, future_stack): expected = expected.astype(object) if isinstance(expected, pd.Series): - assert result.dtype == df.iloc[:, 0].dtype + if future_stack and isinstance(data.dtype, NumpyEADtype): + # GH#58817 future_stack=True constructs the result specifying the dtype + # using the dtype of the input; we thus get the underlying + # NumPy dtype as the result instead of the NumpyExtensionArray + assert result.dtype == df.iloc[:, 0].to_numpy().dtype + else: + assert result.dtype == df.iloc[:, 0].dtype else: assert all(result.dtypes == df.iloc[:, 0].dtype) diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index ca19845041e23..1d613ced2c03f 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -202,6 +202,22 @@ def test_setitem_integer_array(self, data, idx, box_in_series): arr[idx] = arr[0] tm.assert_equal(arr, expected) + @pytest.mark.parametrize( + "idx", + [[0, 0, 1], pd.array([0, 0, 1], dtype="Int64"), np.array([0, 0, 1])], + ids=["list", "integer-array", "numpy-array"], + ) + def test_setitem_integer_array_with_repeats(self, data, idx, box_in_series): + arr = data[:5].copy() + expected = data.take([2, 3, 2, 3, 4]) + + if box_in_series: + arr = pd.Series(arr) + expected = pd.Series(expected) + + arr[idx] = [arr[2], arr[2], arr[3]] + tm.assert_equal(arr, expected) + @pytest.mark.parametrize( "idx, box_in_series", [ @@ -210,7 +226,8 @@ def test_setitem_integer_array(self, data, idx, box_in_series): [0, 1, 2, pd.NA], True, marks=pytest.mark.xfail(reason="GH-31948") ), (pd.array([0, 1, 2, pd.NA], dtype="Int64"), False), - (pd.array([0, 1, 2, pd.NA], dtype="Int64"), False), + # TODO: change False to True? 
+ (pd.array([0, 1, 2, pd.NA], dtype="Int64"), False), # noqa: PT014 ], ids=["list-False", "list-True", "integer-array-False", "integer-array-True"], ) @@ -333,7 +350,7 @@ def test_setitem_loc_iloc_slice(self, data): def test_setitem_slice_mismatch_length_raises(self, data): arr = data[:5] - with pytest.raises(ValueError): + with tm.external_error_raised(ValueError): arr[:1] = arr[:2] def test_setitem_slice_array(self, data): @@ -343,7 +360,7 @@ def test_setitem_slice_array(self, data): def test_setitem_scalar_key_sequence_raise(self, data): arr = data[:5].copy() - with pytest.raises(ValueError): + with tm.external_error_raised(ValueError): arr[0] = arr[[0, 1]] def test_setitem_preserves_views(self, data): @@ -357,7 +374,7 @@ def test_setitem_preserves_views(self, data): def test_setitem_with_expansion_dataframe_column(self, data, full_indexer): # https://github.com/pandas-dev/pandas/issues/32395 - df = expected = pd.DataFrame({0: pd.Series(data)}) + df = expected = pd.DataFrame(pd.Series(data)) result = pd.DataFrame(index=df.index) key = full_indexer(df) @@ -397,14 +414,6 @@ def test_setitem_series(self, data, full_indexer): def test_setitem_frame_2d_values(self, data): # GH#44514 df = pd.DataFrame({"A": data}) - - # Avoiding using_array_manager fixture - # https://github.com/pandas-dev/pandas/pull/44514#discussion_r754002410 - using_array_manager = isinstance(df._mgr, pd.core.internals.ArrayManager) - using_copy_on_write = pd.options.mode.copy_on_write - - blk_data = df._mgr.arrays[0] - orig = df.copy() df.iloc[:] = df.copy() @@ -415,10 +424,6 @@ def test_setitem_frame_2d_values(self, data): df.iloc[:] = df.values tm.assert_frame_equal(df, orig) - if not using_array_manager and not using_copy_on_write: - # GH#33457 Check that this setting occurred in-place - # FIXME(ArrayManager): this should work there too - assert df._mgr.arrays[0] is blk_data df.iloc[:-1] = df.values[:-1] tm.assert_frame_equal(df, orig) diff --git a/pandas/tests/extension/conftest.py b/pandas/tests/extension/conftest.py index c5b1295ee4a7d..97fb5a0bc5066 100644 --- a/pandas/tests/extension/conftest.py +++ b/pandas/tests/extension/conftest.py @@ -2,12 +2,7 @@ import pytest -from pandas._config.config import _get_option - -from pandas import ( - Series, - options, -) +from pandas import Series @pytest.fixture @@ -194,7 +189,7 @@ def use_numpy(request): def fillna_method(request): """ Parametrized fixture giving method parameters 'ffill' and 'bfill' for - Series.fillna(method=) testing. + Series. testing. """ return request.param @@ -217,14 +212,3 @@ def invalid_scalar(data): If the array can hold any item (i.e. object dtype), then use pytest.skip. """ return object.__new__(object) - - -@pytest.fixture -def using_copy_on_write() -> bool: - """ - Fixture to check if Copy-on-Write is enabled. 
- """ - return ( - options.mode.copy_on_write is True - and _get_option("mode.data_manager", silent=True) == "block" - ) diff --git a/pandas/tests/extension/date/array.py b/pandas/tests/extension/date/array.py index 2306f5974ba18..0c51570189a7c 100644 --- a/pandas/tests/extension/date/array.py +++ b/pandas/tests/extension/date/array.py @@ -113,7 +113,7 @@ def __init__( # error: "object_" object is not iterable obj = np.char.split(dates, sep="-") - for (i,), (y, m, d) in np.ndenumerate(obj): # type: ignore[misc] + for (i,), (y, m, d) in np.ndenumerate(obj): self._year[i] = int(y) self._month[i] = int(m) self._day[i] = int(d) diff --git a/pandas/tests/extension/decimal/__init__.py b/pandas/tests/extension/decimal/__init__.py index 34727b43a7b0f..47b1c7c57a47a 100644 --- a/pandas/tests/extension/decimal/__init__.py +++ b/pandas/tests/extension/decimal/__init__.py @@ -5,4 +5,4 @@ to_decimal, ) -__all__ = ["DecimalArray", "DecimalDtype", "to_decimal", "make_data"] +__all__ = ["DecimalArray", "DecimalDtype", "make_data", "to_decimal"] diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 521c1ff0b96bc..2ee6a73ec4054 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -101,7 +101,7 @@ def _from_sequence(cls, scalars, *, dtype=None, copy=False): return cls(scalars) @classmethod - def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): + def _from_sequence_of_strings(cls, strings, *, dtype: ExtensionDtype, copy=False): return cls._from_sequence( [decimal.Decimal(x) for x in strings], dtype=dtype, copy=copy ) @@ -125,7 +125,6 @@ def to_numpy( return result def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): - # if not all( isinstance(t, self._HANDLED_TYPES + (DecimalArray,)) for t in inputs ): @@ -287,17 +286,10 @@ def value_counts(self, dropna: bool = True): return value_counts(self.to_numpy(), dropna=dropna) # We override fillna here to simulate a 3rd party EA that has done so. This - # lets us test the deprecation telling authors to implement _pad_or_backfill - # Simulate a 3rd-party EA that has not yet updated to include a "copy" + # lets us test a 3rd-party EA that has not yet updated to include a "copy" # keyword in its fillna method. - # error: Signature of "fillna" incompatible with supertype "ExtensionArray" - def fillna( # type: ignore[override] - self, - value=None, - method=None, - limit: int | None = None, - ): - return super().fillna(value=value, method=method, limit=limit, copy=True) + def fillna(self, value=None, limit=None): + return super().fillna(value=value, limit=limit, copy=True) def to_decimal(values, context=None): diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index b3c57ee49a724..e0b35b7450303 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -68,10 +68,12 @@ def data_for_grouping(): class TestDecimalArray(base.ExtensionTests): def _get_expected_exception( self, op_name: str, obj, other - ) -> type[Exception] | None: + ) -> type[Exception] | tuple[type[Exception], ...] 
| None: return None def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: + if op_name in ["kurt", "sem"]: + return False return True def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): @@ -137,65 +139,6 @@ def test_fillna_frame(self, data_missing): ): super().test_fillna_frame(data_missing) - def test_fillna_limit_pad(self, data_missing): - msg = "ExtensionArray.fillna 'method' keyword is deprecated" - with tm.assert_produces_warning( - DeprecationWarning, - match=msg, - check_stacklevel=False, - raise_on_extra_warnings=False, - ): - super().test_fillna_limit_pad(data_missing) - - msg = "The 'method' keyword in DecimalArray.fillna is deprecated" - with tm.assert_produces_warning( - FutureWarning, - match=msg, - check_stacklevel=False, - raise_on_extra_warnings=False, - ): - super().test_fillna_limit_pad(data_missing) - - def test_fillna_limit_backfill(self, data_missing): - msg = "Series.fillna with 'method' is deprecated" - with tm.assert_produces_warning( - FutureWarning, - match=msg, - check_stacklevel=False, - raise_on_extra_warnings=False, - ): - super().test_fillna_limit_backfill(data_missing) - - msg = "ExtensionArray.fillna 'method' keyword is deprecated" - with tm.assert_produces_warning( - DeprecationWarning, - match=msg, - check_stacklevel=False, - raise_on_extra_warnings=False, - ): - super().test_fillna_limit_backfill(data_missing) - - msg = "The 'method' keyword in DecimalArray.fillna is deprecated" - with tm.assert_produces_warning( - FutureWarning, - match=msg, - check_stacklevel=False, - raise_on_extra_warnings=False, - ): - super().test_fillna_limit_backfill(data_missing) - - def test_fillna_no_op_returns_copy(self, data): - msg = "|".join( - [ - "ExtensionArray.fillna 'method' keyword is deprecated", - "The 'method' keyword in DecimalArray.fillna is deprecated", - ] - ) - with tm.assert_produces_warning( - (FutureWarning, DeprecationWarning), match=msg, check_stacklevel=False - ): - super().test_fillna_no_op_returns_copy(data) - def test_fillna_series(self, data_missing): msg = "ExtensionArray.fillna added a 'copy' keyword" with tm.assert_produces_warning( @@ -203,32 +146,32 @@ def test_fillna_series(self, data_missing): ): super().test_fillna_series(data_missing) - def test_fillna_series_method(self, data_missing, fillna_method): - msg = "|".join( - [ - "ExtensionArray.fillna 'method' keyword is deprecated", - "The 'method' keyword in DecimalArray.fillna is deprecated", - ] - ) - with tm.assert_produces_warning( - (FutureWarning, DeprecationWarning), match=msg, check_stacklevel=False - ): - super().test_fillna_series_method(data_missing, fillna_method) + def test_fillna_with_none(self, data_missing): + # GH#57723 + # EAs that don't have special logic for None will raise, unlike pandas' + # which interpret None as the NA value for the dtype. 
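+        # The base test's fillna(None) ends up coercing None via
+        # decimal.Decimal(None), which raises the TypeError matched below.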
+ msg = "conversion from NoneType to Decimal is not supported" + with pytest.raises(TypeError, match=msg): + super().test_fillna_with_none(data_missing) - def test_fillna_copy_frame(self, data_missing, using_copy_on_write): - warn = DeprecationWarning if not using_copy_on_write else None + def test_fillna_limit_frame(self, data_missing): + # GH#58001 msg = "ExtensionArray.fillna added a 'copy' keyword" - with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): - super().test_fillna_copy_frame(data_missing) + with tm.assert_produces_warning( + DeprecationWarning, match=msg, check_stacklevel=False + ): + super().test_fillna_limit_frame(data_missing) - def test_fillna_copy_series(self, data_missing, using_copy_on_write): - warn = DeprecationWarning if not using_copy_on_write else None + def test_fillna_limit_series(self, data_missing): + # GH#58001 msg = "ExtensionArray.fillna added a 'copy' keyword" - with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): - super().test_fillna_copy_series(data_missing) + with tm.assert_produces_warning( + DeprecationWarning, match=msg, check_stacklevel=False + ): + super().test_fillna_limit_series(data_missing) @pytest.mark.parametrize("dropna", [True, False]) - def test_value_counts(self, all_data, dropna, request): + def test_value_counts(self, all_data, dropna): all_data = all_data[:10] if dropna: other = np.array(all_data[~all_data.isna()]) @@ -303,8 +246,7 @@ def test_dataframe_constructor_with_dtype(): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("frame", [True, False]) -def test_astype_dispatches(frame): +def test_astype_dispatches(frame_or_series): # This is a dtype-specific test that ensures Series[decimal].astype # gets all the way through to ExtensionArray.astype # Designing a reliable smoke test that works for arbitrary data types @@ -313,12 +255,11 @@ def test_astype_dispatches(frame): ctx = decimal.Context() ctx.prec = 5 - if frame: - data = data.to_frame() + data = frame_or_series(data) result = data.astype(DecimalDtype(ctx)) - if frame: + if frame_or_series is pd.DataFrame: result = result["a"] assert result.dtype.context.prec == ctx.prec @@ -526,12 +467,11 @@ def test_to_numpy_keyword(): tm.assert_numpy_array_equal(result, expected) -def test_array_copy_on_write(using_copy_on_write): +def test_array_copy_on_write(): df = pd.DataFrame({"a": [decimal.Decimal(2), decimal.Decimal(3)]}, dtype="object") df2 = df.astype(DecimalDtype()) df.iloc[0, 0] = 0 - if using_copy_on_write: - expected = pd.DataFrame( - {"a": [decimal.Decimal(2), decimal.Decimal(3)]}, dtype=DecimalDtype() - ) - tm.assert_equal(df2.values, expected.values) + expected = pd.DataFrame( + {"a": [decimal.Decimal(2), decimal.Decimal(3)]}, dtype=DecimalDtype() + ) + tm.assert_equal(df2.values, expected.values) diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index d3d9dcc4a4712..b110911bda400 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -11,6 +11,7 @@ in that case. We *want* the dictionaries to be treated as scalars, so we hack around pandas by using UserDicts. """ + from __future__ import annotations from collections import ( @@ -146,13 +147,21 @@ def __eq__(self, other): def __ne__(self, other): return NotImplemented - def __array__(self, dtype=None): + def __array__(self, dtype=None, copy=None): + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." 
+ ) + if dtype is None: dtype = object if dtype == object: # on py38 builds it looks like numpy is inferring to a non-1D array return construct_1d_object_array_from_listlike(list(self)) - return np.asarray(self.data, dtype=dtype) + if copy is None: + # Note: branch avoids `copy=None` for NumPy 1.x support + return np.asarray(self.data, dtype=dtype) + return np.asarray(self.data, dtype=dtype, copy=copy) @property def nbytes(self) -> int: @@ -167,8 +176,7 @@ def take(self, indexer, allow_fill=False, fill_value=None): # an ndarary. indexer = np.asarray(indexer) msg = ( - "Index is out of bounds or cannot do a " - "non-empty take from an empty array." + "Index is out of bounds or cannot do a non-empty take from an empty array." ) if allow_fill: @@ -207,11 +215,12 @@ def astype(self, dtype, copy=True): return self.copy() return self elif isinstance(dtype, StringDtype): - value = self.astype(str) # numpy doesn't like nested dicts arr_cls = dtype.construct_array_type() - return arr_cls._from_sequence(value, dtype=dtype, copy=False) - - return np.array([dict(x) for x in self], dtype=dtype, copy=copy) + return arr_cls._from_sequence(self, dtype=dtype, copy=False) + elif not copy: + return np.asarray([dict(x) for x in self], dtype=dtype) + else: + return np.array([dict(x) for x in self], dtype=dtype, copy=copy) def unique(self): # Parent method doesn't work since np.array will try to infer @@ -235,6 +244,10 @@ def _values_for_argsort(self): frozen = [tuple(x.items()) for x in self] return construct_1d_object_array_from_listlike(frozen) + def _pad_or_backfill(self, *, method, limit=None, copy=True): + # GH#56616 - test EA method without limit_area argument + return super()._pad_or_backfill(method=method, limit=limit, copy=copy) + def make_data(): # TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 7686bc5abb44c..4bc9562f1895d 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -127,7 +127,7 @@ def test_series_constructor_scalar_with_index(self, data, dtype): @pytest.mark.xfail(reason="Different definitions of NA") def test_stack(self): """ - The test does .astype(object).stack(future_stack=True). If we happen to have + The test does .astype(object).stack(). If we happen to have any missing values in `data`, then we'll end up with different rows since we consider `{}` NA, but `.astype(object)` doesn't. """ @@ -149,6 +149,46 @@ def test_fillna_frame(self): """We treat dictionaries as a mapping in fillna, not a scalar.""" super().test_fillna_frame() + def test_fillna_with_none(self, data_missing): + # GH#57723 + # EAs that don't have special logic for None will raise, unlike pandas' + # which interpret None as the NA value for the dtype. 
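+        # For JSONArray the fill itself succeeds, but the result differs from
+        # what the base test constructs, so its assertion fails instead.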
+        with pytest.raises(AssertionError):
+            super().test_fillna_with_none(data_missing)
+
+    @pytest.mark.xfail(reason="fill value is a dictionary, takes incorrect code path")
+    def test_fillna_limit_frame(self, data_missing):
+        # GH#58001
+        super().test_fillna_limit_frame(data_missing)
+
+    @pytest.mark.xfail(reason="fill value is a dictionary, takes incorrect code path")
+    def test_fillna_limit_series(self, data_missing):
+        # GH#58001
+        super().test_fillna_limit_series(data_missing)
+
+    @pytest.mark.parametrize(
+        "limit_area, input_ilocs, expected_ilocs",
+        [
+            ("outside", [1, 0, 0, 0, 1], [1, 0, 0, 0, 1]),
+            ("outside", [1, 0, 1, 0, 1], [1, 0, 1, 0, 1]),
+            ("outside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 1]),
+            ("outside", [0, 1, 0, 1, 0], [0, 1, 0, 1, 1]),
+            ("inside", [1, 0, 0, 0, 1], [1, 1, 1, 1, 1]),
+            ("inside", [1, 0, 1, 0, 1], [1, 1, 1, 1, 1]),
+            ("inside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 0]),
+            ("inside", [0, 1, 0, 1, 0], [0, 1, 1, 1, 0]),
+        ],
+    )
+    def test_ffill_limit_area(
+        self, data_missing, limit_area, input_ilocs, expected_ilocs
+    ):
+        # GH#56616
+        msg = "JSONArray does not implement limit_area"
+        with pytest.raises(NotImplementedError, match=msg):
+            super().test_ffill_limit_area(
+                data_missing, limit_area, input_ilocs, expected_ilocs
+            )
+
     @unhashable
     def test_value_counts(self, all_data, dropna):
         super().test_value_counts(all_data, dropna)
@@ -162,18 +202,6 @@ def test_sort_values_frame(self):
         # TODO (EA.factorize): see if _values_for_factorize allows this.
         super().test_sort_values_frame()

-    @pytest.mark.parametrize("ascending", [True, False])
-    def test_sort_values(self, data_for_sorting, ascending, sort_by_key):
-        super().test_sort_values(data_for_sorting, ascending, sort_by_key)
-
-    @pytest.mark.parametrize("ascending", [True, False])
-    def test_sort_values_missing(
-        self, data_missing_for_sorting, ascending, sort_by_key
-    ):
-        super().test_sort_values_missing(
-            data_missing_for_sorting, ascending, sort_by_key
-        )
-
     @pytest.mark.xfail(reason="combine for JSONArray not supported")
     def test_combine_le(self, data_repeated):
         super().test_combine_le(data_repeated)
@@ -206,12 +234,8 @@ def test_equals(self, data, na_value, as_series):
     def test_fillna_copy_frame(self, data_missing):
         super().test_fillna_copy_frame(data_missing)

-    def test_equals_same_data_different_object(
-        self, data, using_copy_on_write, request
-    ):
-        if using_copy_on_write:
-            mark = pytest.mark.xfail(reason="Fails with CoW")
-            request.applymarker(mark)
+    @pytest.mark.xfail(reason="Fails with CoW")
+    def test_equals_same_data_different_object(self, data):
         super().test_equals_same_data_different_object(data)

     @pytest.mark.xfail(reason="failing on np.array(self, dtype=str)")
@@ -344,7 +368,7 @@ def test_setitem_integer_array(self, data, idx, box_in_series, request):
                 [0, 1, 2, pd.NA], True, marks=pytest.mark.xfail(reason="GH-31948")
             ),
             (pd.array([0, 1, 2, pd.NA], dtype="Int64"), False),
-            (pd.array([0, 1, 2, pd.NA], dtype="Int64"), False),
+            (pd.array([0, 1, 2, pd.NA], dtype="Int64"), True),
         ],
         ids=["list-False", "list-True", "integer-array-False", "integer-array-True"],
     )
diff --git a/pandas/tests/extension/list/array.py b/pandas/tests/extension/list/array.py
index f07585c0aec10..8b4728c7d6292 100644
--- a/pandas/tests/extension/list/array.py
+++ b/pandas/tests/extension/list/array.py
@@ -3,6 +3,7 @@
 The ListArray stores an ndarray of lists.
 """
+
 from __future__ import annotations

 import numbers
@@ -80,8 +81,7 @@ def take(self, indexer, allow_fill=False, fill_value=None):
         # an ndarary.
indexer = np.asarray(indexer) msg = ( - "Index is out of bounds or cannot do a " - "non-empty take from an empty array." + "Index is out of bounds or cannot do a non-empty take from an empty array." ) if allow_fill: @@ -115,7 +115,10 @@ def astype(self, dtype, copy=True): elif is_string_dtype(dtype) and not is_object_dtype(dtype): # numpy has problems with astype(str) for nested elements return np.array([str(x) for x in self.data], dtype=dtype) - return np.array(self.data, dtype=dtype, copy=copy) + elif not copy: + return np.asarray(self.data, dtype=dtype) + else: + return np.array(self.data, dtype=dtype, copy=copy) @classmethod def _concat_same_type(cls, to_concat): diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 3b03272f18203..fc5930ebcd8ac 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -10,6 +10,7 @@ classes (if they are relevant for the extension interface for all dtypes), or be added to the array-specific tests in `pandas/tests/arrays/`. """ + from __future__ import annotations from datetime import ( @@ -26,6 +27,7 @@ import operator import pickle import re +import sys import numpy as np import pytest @@ -40,8 +42,9 @@ pa_version_under11p0, pa_version_under13p0, pa_version_under14p0, + pa_version_under19p0, + pa_version_under20p0, ) -import pandas.util._test_decorators as td from pandas.core.dtypes.dtypes import ( ArrowDtype, @@ -53,6 +56,7 @@ from pandas.api.extensions import no_default from pandas.api.types import ( is_bool_dtype, + is_datetime64_any_dtype, is_float_dtype, is_integer_dtype, is_numeric_dtype, @@ -64,7 +68,10 @@ pa = pytest.importorskip("pyarrow") -from pandas.core.arrays.arrow.array import ArrowExtensionArray +from pandas.core.arrays.arrow.array import ( + ArrowExtensionArray, + get_unit_from_pa_dtype, +) from pandas.core.arrays.arrow.extension_types import ArrowPeriodType @@ -286,7 +293,7 @@ def test_map(self, data_missing, na_action): expected = data_missing.to_numpy() tm.assert_numpy_array_equal(result, expected) - def test_astype_str(self, data, request): + def test_astype_str(self, data, request, using_infer_string): pa_dtype = data.dtype.pyarrow_dtype if pa.types.is_binary(pa_dtype): request.applymarker( @@ -294,9 +301,10 @@ def test_astype_str(self, data, request): reason=f"For {pa_dtype} .astype(str) decodes.", ) ) - elif ( - pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None - ) or pa.types.is_duration(pa_dtype): + elif not using_infer_string and ( + (pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None) + or pa.types.is_duration(pa_dtype) + ): request.applymarker( pytest.mark.xfail( reason="pd.Timestamp/pd.Timedelta repr different from numpy repr", @@ -304,25 +312,6 @@ def test_astype_str(self, data, request): ) super().test_astype_str(data) - @pytest.mark.parametrize( - "nullable_string_dtype", - [ - "string[python]", - pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), - ], - ) - def test_astype_string(self, data, nullable_string_dtype, request): - pa_dtype = data.dtype.pyarrow_dtype - if ( - pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None - ) or pa.types.is_duration(pa_dtype): - request.applymarker( - pytest.mark.xfail( - reason="pd.Timestamp/pd.Timedelta repr different from numpy repr", - ) - ) - super().test_astype_string(data, nullable_string_dtype) - def test_from_dtype(self, data, request): pa_dtype = data.dtype.pyarrow_dtype if pa.types.is_string(pa_dtype) or pa.types.is_decimal(pa_dtype): @@ -352,10 +341,9 @@ def 
test_from_sequence_pa_array(self, data): assert isinstance(result._pa_array, pa.ChunkedArray) def test_from_sequence_pa_array_notimplemented(self, request): + dtype = ArrowDtype(pa.month_day_nano_interval()) with pytest.raises(NotImplementedError, match="Converting strings to"): - ArrowExtensionArray._from_sequence_of_strings( - ["12-1"], dtype=pa.month_day_nano_interval() - ) + ArrowExtensionArray._from_sequence_of_strings(["12-1"], dtype=dtype) def test_from_sequence_of_strings_pa_array(self, data, request): pa_dtype = data.dtype.pyarrow_dtype @@ -407,13 +395,12 @@ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: # attribute "pyarrow_dtype" pa_type = ser.dtype.pyarrow_dtype # type: ignore[union-attr] - if ( - pa.types.is_string(pa_type) - or pa.types.is_binary(pa_type) - or pa.types.is_decimal(pa_type) - ): + if pa.types.is_binary(pa_type) or pa.types.is_decimal(pa_type): if op_name in ["cumsum", "cumprod", "cummax", "cummin"]: return False + elif pa.types.is_string(pa_type): + if op_name == "cumprod": + return False elif pa.types.is_boolean(pa_type): if op_name in ["cumprod", "cummax", "cummin"]: return False @@ -428,6 +415,12 @@ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: def test_accumulate_series(self, data, all_numeric_accumulations, skipna, request): pa_type = data.dtype.pyarrow_dtype op_name = all_numeric_accumulations + + if pa.types.is_string(pa_type) and op_name in ["cumsum", "cummin", "cummax"]: + # https://github.com/pandas-dev/pandas/pull/60633 + # Doesn't fit test structure, tested in series/test_cumulative.py instead. + return + ser = pd.Series(data) if not self._supports_accumulation(ser, op_name): @@ -455,42 +448,31 @@ def test_accumulate_series(self, data, all_numeric_accumulations, skipna, reques request.applymarker( pytest.mark.xfail( reason=f"{all_numeric_accumulations} not implemented for {pa_type}", - raises=NotImplementedError, + raises=TypeError, ) ) self.check_accumulate(ser, op_name, skipna) def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: + if op_name == "kurt" or (pa_version_under20p0 and op_name == "skew"): + return False + dtype = ser.dtype # error: Item "dtype[Any]" of "dtype[Any] | ExtensionDtype" has # no attribute "pyarrow_dtype" pa_dtype = dtype.pyarrow_dtype # type: ignore[union-attr] - if pa.types.is_temporal(pa_dtype) and op_name in [ - "sum", - "var", - "skew", - "kurt", - "prod", - ]: + if pa.types.is_temporal(pa_dtype) and op_name in ["sum", "var", "prod", "skew"]: if pa.types.is_duration(pa_dtype) and op_name in ["sum"]: # summing timedeltas is one case that *is* well-defined pass else: return False + elif pa.types.is_binary(pa_dtype) and op_name in ["sum", "skew"]: + return False elif ( pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype) - ) and op_name in [ - "sum", - "mean", - "median", - "prod", - "std", - "sem", - "var", - "skew", - "kurt", - ]: + ) and op_name in ["mean", "median", "prod", "std", "sem", "var", "skew"]: return False if ( @@ -503,6 +485,16 @@ def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: # behavior which does not support this. 
return False + if pa.types.is_boolean(pa_dtype) and op_name in [ + "median", + "std", + "var", + "skew", + "kurt", + "sem", + ]: + return False + return True def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): @@ -526,32 +518,6 @@ def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): expected = getattr(alt, op_name)(skipna=skipna) tm.assert_almost_equal(result, expected) - @pytest.mark.parametrize("skipna", [True, False]) - def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna, request): - dtype = data.dtype - pa_dtype = dtype.pyarrow_dtype - - xfail_mark = pytest.mark.xfail( - raises=TypeError, - reason=( - f"{all_numeric_reductions} is not implemented in " - f"pyarrow={pa.__version__} for {pa_dtype}" - ), - ) - if all_numeric_reductions in {"skew", "kurt"} and ( - dtype._is_numeric or dtype.kind == "b" - ): - request.applymarker(xfail_mark) - - elif pa.types.is_boolean(pa_dtype) and all_numeric_reductions in { - "sem", - "std", - "var", - "median", - }: - request.applymarker(xfail_mark) - super().test_reduce_series_numeric(data, all_numeric_reductions, skipna) - @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_series_boolean( self, data, all_boolean_reductions, skipna, na_value, request @@ -572,15 +538,34 @@ def test_reduce_series_boolean( return super().test_reduce_series_boolean(data, all_boolean_reductions, skipna) def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): + pa_type = arr._pa_array.type + if op_name in ["max", "min"]: cmp_dtype = arr.dtype + elif pa.types.is_temporal(pa_type): + if op_name in ["std", "sem"]: + if pa.types.is_duration(pa_type): + cmp_dtype = arr.dtype + elif pa.types.is_date(pa_type): + cmp_dtype = ArrowDtype(pa.duration("s")) + elif pa.types.is_time(pa_type): + unit = get_unit_from_pa_dtype(pa_type) + cmp_dtype = ArrowDtype(pa.duration(unit)) + else: + cmp_dtype = ArrowDtype(pa.duration(pa_type.unit)) + else: + cmp_dtype = arr.dtype elif arr.dtype.name == "decimal128(7, 3)[pyarrow]": - if op_name not in ["median", "var", "std"]: + if op_name not in ["median", "var", "std", "sem", "skew"]: cmp_dtype = arr.dtype else: cmp_dtype = "float64[pyarrow]" - elif op_name in ["median", "var", "std", "mean", "skew"]: + elif op_name in ["median", "var", "std", "mean", "skew", "sem"]: cmp_dtype = "float64[pyarrow]" + elif op_name in ["sum", "prod"] and pa.types.is_boolean(pa_type): + cmp_dtype = "uint64[pyarrow]" + elif op_name == "sum" and pa.types.is_string(pa_type): + cmp_dtype = arr.dtype else: cmp_dtype = { "i": "int64[pyarrow]", @@ -589,13 +574,40 @@ def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): }[arr.dtype.kind] return cmp_dtype + @pytest.mark.filterwarnings("ignore::RuntimeWarning") + @pytest.mark.parametrize("skipna", [True, False]) + def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna, request): + if ( + not pa_version_under20p0 + and skipna + and all_numeric_reductions == "skew" + and ( + pa.types.is_integer(data.dtype.pyarrow_dtype) + or pa.types.is_floating(data.dtype.pyarrow_dtype) + ) + ): + request.applymarker( + pytest.mark.xfail( + reason="https://github.com/apache/arrow/issues/45733", + ) + ) + return super().test_reduce_series_numeric(data, all_numeric_reductions, skipna) + @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_frame(self, data, all_numeric_reductions, skipna, request): op_name = all_numeric_reductions - if op_name == "skew": + if op_name == "skew" and pa_version_under20p0: if 
data.dtype._is_numeric: mark = pytest.mark.xfail(reason="skew not implemented") request.applymarker(mark) + elif ( + op_name in ["std", "sem"] + and pa.types.is_date64(data._pa_array.type) + and skipna + ): + # overflow + mark = pytest.mark.xfail(reason="Cannot cast") + request.applymarker(mark) return super().test_reduce_frame(data, all_numeric_reductions, skipna) @pytest.mark.parametrize("typ", ["int64", "uint64", "float64"]) @@ -604,26 +616,6 @@ def test_median_not_approximate(self, typ): result = pd.Series([1, 2], dtype=f"{typ}[pyarrow]").median() assert result == 1.5 - def test_in_numeric_groupby(self, data_for_grouping): - dtype = data_for_grouping.dtype - if is_string_dtype(dtype): - df = pd.DataFrame( - { - "A": [1, 1, 2, 2, 3, 3, 1, 4], - "B": data_for_grouping, - "C": [1, 1, 1, 1, 1, 1, 1, 1], - } - ) - - expected = pd.Index(["C"]) - msg = re.escape(f"agg function failed [how->sum,dtype->{dtype}") - with pytest.raises(TypeError, match=msg): - df.groupby("A").sum() - result = df.groupby("A").sum(numeric_only=True).columns - tm.assert_index_equal(result, expected) - else: - super().test_in_numeric_groupby(data_for_grouping) - def test_construct_from_string_own_name(self, dtype, request): pa_dtype = dtype.pyarrow_dtype if pa.types.is_decimal(pa_dtype): @@ -704,10 +696,6 @@ def test_fillna_no_op_returns_copy(self, data): assert result is not data tm.assert_extension_array_equal(result, data) - result = data.fillna(method="backfill") - assert result is not data - tm.assert_extension_array_equal(result, data) - @pytest.mark.xfail( reason="GH 45419: pyarrow.ChunkedArray does not support views", run=False ) @@ -800,8 +788,6 @@ def test_value_counts_returns_pyarrow_int64(self, data): _combine_le_expected_dtype = "bool[pyarrow]" - divmod_exc = NotImplementedError - def get_op_from_name(self, op_name): short_opname = op_name.strip("_") if short_opname == "rtruediv": @@ -929,16 +915,15 @@ def _is_temporal_supported(self, opname, pa_dtype): ) ) and pa.types.is_duration(pa_dtype) - or opname in ("__sub__", "__rsub__") - and pa.types.is_temporal(pa_dtype) - ) + ) or (opname in ("__sub__", "__rsub__") and pa.types.is_temporal(pa_dtype)) def _get_expected_exception( self, op_name: str, obj, other - ) -> type[Exception] | None: + ) -> type[Exception] | tuple[type[Exception], ...] | None: if op_name in ("__divmod__", "__rdivmod__"): - return self.divmod_exc + return (NotImplementedError, TypeError) + exc: type[Exception] | tuple[type[Exception], ...] | None dtype = tm.get_dtype(obj) # error: Item "dtype[Any]" of "dtype[Any] | ExtensionDtype" has no # attribute "pyarrow_dtype" @@ -949,7 +934,7 @@ def _get_expected_exception( "__mod__", "__rmod__", }: - exc = NotImplementedError + exc = (NotImplementedError, TypeError) elif arrow_temporal_supported: exc = None elif op_name in ["__add__", "__radd__"] and ( @@ -961,10 +946,7 @@ def _get_expected_exception( or pa.types.is_integer(pa_dtype) or pa.types.is_decimal(pa_dtype) ): - # TODO: in many of these cases, e.g. non-duration temporal, - # these will *never* be allowed. Would it make more sense to - # re-raise as TypeError, more consistent with non-pyarrow cases? 
-            exc = pa.ArrowNotImplementedError
+            exc = TypeError
         else:
             exc = None
         return exc
@@ -996,8 +978,7 @@ def _get_arith_xfail_marker(self, opname, pa_dtype):
             mark = pytest.mark.xfail(
                 raises=TypeError,
                 reason=(
-                    f"{opname} not supported between"
-                    f"pd.NA and {pa_dtype} Python scalar"
+                    f"{opname} not supported between pd.NA and {pa_dtype} Python scalar"
                 ),
             )
         elif opname == "__rfloordiv__" and (
@@ -1020,14 +1001,6 @@ def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request)
         if all_arithmetic_operators == "__rmod__" and pa.types.is_binary(pa_dtype):
             pytest.skip("Skip testing Python string formatting")
-        elif all_arithmetic_operators in ("__rmul__", "__mul__") and (
-            pa.types.is_binary(pa_dtype) or pa.types.is_string(pa_dtype)
-        ):
-            request.applymarker(
-                pytest.mark.xfail(
-                    raises=TypeError, reason="Can only string multiply by an integer."
-                )
-            )

         mark = self._get_arith_xfail_marker(all_arithmetic_operators, pa_dtype)
         if mark is not None:
@@ -1042,14 +1015,6 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request):
             pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype)
         ):
             pytest.skip("Skip testing Python string formatting")
-        elif all_arithmetic_operators in ("__rmul__", "__mul__") and (
-            pa.types.is_binary(pa_dtype) or pa.types.is_string(pa_dtype)
-        ):
-            request.applymarker(
-                pytest.mark.xfail(
-                    raises=TypeError, reason="Can only string multiply by an integer."
-                )
-            )

         mark = self._get_arith_xfail_marker(all_arithmetic_operators, pa_dtype)
         if mark is not None:
@@ -1073,14 +1038,6 @@ def test_arith_series_with_array(self, data, all_arithmetic_operators, request):
                 ),
             )
         )
-        elif all_arithmetic_operators in ("__rmul__", "__mul__") and (
-            pa.types.is_binary(pa_dtype) or pa.types.is_string(pa_dtype)
-        ):
-            request.applymarker(
-                pytest.mark.xfail(
-                    raises=TypeError, reason="Can only string multiply by an integer."
- ) - ) mark = self._get_arith_xfail_marker(all_arithmetic_operators, pa_dtype) if mark is not None: @@ -1531,6 +1488,14 @@ def test_is_unsigned_integer_dtype(data): assert not is_unsigned_integer_dtype(data) +def test_is_datetime64_any_dtype(data): + pa_type = data.dtype.pyarrow_dtype + if pa.types.is_timestamp(pa_type) or pa.types.is_date(pa_type): + assert is_datetime64_any_dtype(data) + else: + assert not is_datetime64_any_dtype(data) + + def test_is_float_dtype(data): pa_type = data.dtype.pyarrow_dtype if pa.types.is_floating(pa_type): @@ -1700,7 +1665,7 @@ def test_from_arrow_respecting_given_dtype(): def test_from_arrow_respecting_given_dtype_unsafe(): array = pa.array([1.5, 2.5], type=pa.float64()) - with pytest.raises(pa.ArrowInvalid, match="Float value 1.5 was truncated"): + with tm.external_error_raised(pa.ArrowInvalid): array.to_pandas(types_mapper={pa.float64(): ArrowDtype(pa.int64())}.get) @@ -1868,6 +1833,17 @@ def test_str_replace_negative_n(): expected = pd.Series(["bc", ""], dtype=ArrowDtype(pa.string())) tm.assert_series_equal(expected, actual) + # Same bug for pyarrow-backed StringArray GH#59628 + ser2 = ser.astype(pd.StringDtype(storage="pyarrow")) + actual2 = ser2.str.replace("a", "", -3, True) + expected2 = expected.astype(ser2.dtype) + tm.assert_series_equal(expected2, actual2) + + ser3 = ser.astype(pd.StringDtype(storage="pyarrow", na_value=np.nan)) + actual3 = ser3.str.replace("a", "", -3, True) + expected3 = expected.astype(ser3.dtype) + tm.assert_series_equal(expected3, actual3) + def test_str_repeat_unsupported(): ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string())) @@ -1903,29 +1879,39 @@ def test_str_match(pat, case, na, exp): @pytest.mark.parametrize( "pat, case, na, exp", [ - ["abc", False, None, [True, None]], - ["Abc", True, None, [False, None]], - ["bc", True, None, [False, None]], - ["ab", False, True, [True, True]], - ["a[a-z]{2}", False, None, [True, None]], - ["A[a-z]{1}", True, None, [False, None]], + ["abc", False, None, [True, True, False, None]], + ["Abc", True, None, [False, False, False, None]], + ["bc", True, None, [False, False, False, None]], + ["ab", False, None, [True, True, False, None]], + ["a[a-z]{2}", False, None, [True, True, False, None]], + ["A[a-z]{1}", True, None, [False, False, False, None]], + # GH Issue: #56652 + ["abc$", False, None, [True, False, False, None]], + ["abc\\$", False, None, [False, True, False, None]], + ["Abc$", True, None, [False, False, False, None]], + ["Abc\\$", True, None, [False, False, False, None]], ], ) def test_str_fullmatch(pat, case, na, exp): - ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string())) + ser = pd.Series(["abc", "abc$", "$abc", None], dtype=ArrowDtype(pa.string())) result = ser.str.match(pat, case=case, na=na) expected = pd.Series(exp, dtype=ArrowDtype(pa.bool_())) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( - "sub, start, end, exp, exp_typ", - [["ab", 0, None, [0, None], pa.int32()], ["bc", 1, 3, [1, None], pa.int64()]], + "sub, start, end, exp, exp_type", + [ + ["ab", 0, None, [0, None], pa.int32()], + ["bc", 1, 3, [1, None], pa.int64()], + ["ab", 1, 3, [-1, None], pa.int64()], + ["ab", -3, -3, [-1, None], pa.int64()], + ], ) -def test_str_find(sub, start, end, exp, exp_typ): +def test_str_find(sub, start, end, exp, exp_type): ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string())) result = ser.str.find(sub, start=start, end=end) - expected = pd.Series(exp, dtype=ArrowDtype(exp_typ)) + expected = pd.Series(exp, dtype=ArrowDtype(exp_type)) 
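+    # like str.find on object dtype, "not found" is encoded as -1, not as NA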
tm.assert_series_equal(result, expected) @@ -1937,10 +1923,56 @@ def test_str_find_negative_start(): tm.assert_series_equal(result, expected) -def test_str_find_notimplemented(): +def test_str_find_no_end(): ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string())) - with pytest.raises(NotImplementedError, match="find not implemented"): - ser.str.find("ab", start=1) + result = ser.str.find("ab", start=1) + expected = pd.Series([-1, None], dtype="int64[pyarrow]") + tm.assert_series_equal(result, expected) + + +def test_str_find_negative_start_negative_end(): + # GH 56791 + ser = pd.Series(["abcdefg", None], dtype=ArrowDtype(pa.string())) + result = ser.str.find(sub="d", start=-6, end=-3) + expected = pd.Series([3, None], dtype=ArrowDtype(pa.int64())) + tm.assert_series_equal(result, expected) + + +def test_str_find_large_start(): + # GH 56791 + ser = pd.Series(["abcdefg", None], dtype=ArrowDtype(pa.string())) + result = ser.str.find(sub="d", start=16) + expected = pd.Series([-1, None], dtype=ArrowDtype(pa.int64())) + tm.assert_series_equal(result, expected) + + +@pytest.mark.skipif( + pa_version_under13p0, reason="https://github.com/apache/arrow/issues/36311" +) +@pytest.mark.parametrize("start", [-15, -3, 0, 1, 15, None]) +@pytest.mark.parametrize("end", [-15, -1, 0, 3, 15, None]) +@pytest.mark.parametrize("sub", ["", "az", "abce", "a", "caa"]) +def test_str_find_e2e(start, end, sub): + s = pd.Series( + ["abcaadef", "abc", "abcdeddefgj8292", "ab", "a", ""], + dtype=ArrowDtype(pa.string()), + ) + object_series = s.astype(pd.StringDtype(storage="python")) + result = s.str.find(sub, start, end) + expected = object_series.str.find(sub, start, end).astype(result.dtype) + tm.assert_series_equal(result, expected) + + arrow_str_series = s.astype(pd.StringDtype(storage="pyarrow")) + result2 = arrow_str_series.str.find(sub, start, end).astype(result.dtype) + tm.assert_series_equal(result2, expected) + + +def test_str_find_negative_start_negative_end_no_match(): + # GH 56791 + ser = pd.Series(["abcdefg", None], dtype=ArrowDtype(pa.string())) + result = ser.str.find(sub="d", start=-3, end=-6) + expected = pd.Series([-1, None], dtype=ArrowDtype(pa.int64())) + tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -1984,6 +2016,7 @@ def test_str_join_string_type(): [None, 2, None, ["ab", None]], [None, 2, 1, ["ab", None]], [1, 3, 1, ["bc", None]], + (None, None, -1, ["dcba", None]), ], ) def test_str_slice(start, stop, step, exp): @@ -2101,14 +2134,21 @@ def test_str_removeprefix(val): @pytest.mark.parametrize( "encoding, exp", [ - ["utf8", b"abc"], - ["utf32", b"\xff\xfe\x00\x00a\x00\x00\x00b\x00\x00\x00c\x00\x00\x00"], + ("utf8", {"little": b"abc", "big": "abc"}), + ( + "utf32", + { + "little": b"\xff\xfe\x00\x00a\x00\x00\x00b\x00\x00\x00c\x00\x00\x00", + "big": b"\x00\x00\xfe\xff\x00\x00\x00a\x00\x00\x00b\x00\x00\x00c", + }, + ), ], + ids=["utf8", "utf32"], ) def test_str_encode(errors, encoding, exp): ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string())) result = ser.str.encode(encoding, errors) - expected = pd.Series([exp, None], dtype=ArrowDtype(pa.binary())) + expected = pd.Series([exp[sys.byteorder], None], dtype=ArrowDtype(pa.binary())) tm.assert_series_equal(result, expected) @@ -2189,9 +2229,11 @@ def test_str_partition(): ser = pd.Series(["abcba", None], dtype=ArrowDtype(pa.string())) result = ser.str.partition("b") expected = pd.DataFrame( - [["a", "b", "cba"], [None, None, None]], dtype=ArrowDtype(pa.string()) + [["a", "b", "cba"], [None, None, None]], + 
dtype=ArrowDtype(pa.string()), + columns=pd.RangeIndex(3), ) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_column_type=True) result = ser.str.partition("b", expand=False) expected = pd.Series(ArrowExtensionArray(pa.array([["a", "b", "cba"], None]))) @@ -2199,9 +2241,11 @@ def test_str_partition(): result = ser.str.rpartition("b") expected = pd.DataFrame( - [["abc", "b", "a"], [None, None, None]], dtype=ArrowDtype(pa.string()) + [["abc", "b", "a"], [None, None, None]], + dtype=ArrowDtype(pa.string()), + columns=pd.RangeIndex(3), ) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_column_type=True) result = ser.str.rpartition("b", expand=False) expected = pd.Series(ArrowExtensionArray(pa.array([["abc", "b", "a"], None]))) @@ -2330,7 +2374,8 @@ def test_duration_from_strings_with_nat(unit): # GH51175 strings = ["1000", "NaT"] pa_type = pa.duration(unit) - result = ArrowExtensionArray._from_sequence_of_strings(strings, dtype=pa_type) + dtype = ArrowDtype(pa_type) + result = ArrowExtensionArray._from_sequence_of_strings(strings, dtype=dtype) expected = ArrowExtensionArray(pa.array([1000, None], type=pa_type)) tm.assert_extension_array_equal(result, expected) @@ -2357,13 +2402,13 @@ def test_unsupported_dt(data): ["hour", 3], ["minute", 4], ["is_leap_year", False], - ["microsecond", 5], + ["microsecond", 2000], ["month", 1], ["nanosecond", 6], ["quarter", 1], ["second", 7], ["date", date(2023, 1, 2)], - ["time", time(3, 4, 7, 5)], + ["time", time(3, 4, 7, 2000)], ], ) def test_dt_properties(prop, expected): @@ -2376,7 +2421,7 @@ def test_dt_properties(prop, expected): hour=3, minute=4, second=7, - microsecond=5, + microsecond=2000, nanosecond=6, ), None, @@ -2393,6 +2438,28 @@ def test_dt_properties(prop, expected): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("microsecond", [2000, 5, 0]) +def test_dt_microsecond(microsecond): + # GH 59183 + ser = pd.Series( + [ + pd.Timestamp( + year=2024, + month=7, + day=7, + second=5, + microsecond=microsecond, + nanosecond=6, + ), + None, + ], + dtype=ArrowDtype(pa.timestamp("ns")), + ) + result = ser.dt.microsecond + expected = pd.Series([microsecond, None], dtype="int64[pyarrow]") + tm.assert_series_equal(result, expected) + + def test_dt_is_month_start_end(): ser = pd.Series( [ @@ -2597,18 +2664,13 @@ def test_dt_to_pydatetime(): # GH 51859 data = [datetime(2022, 1, 1), datetime(2023, 1, 1)] ser = pd.Series(data, dtype=ArrowDtype(pa.timestamp("ns"))) + result = ser.dt.to_pydatetime() + expected = pd.Series(data, dtype=object) + tm.assert_series_equal(result, expected) + assert all(type(expected.iloc[i]) is datetime for i in range(len(expected))) - msg = "The behavior of ArrowTemporalProperties.to_pydatetime is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = ser.dt.to_pydatetime() - expected = np.array(data, dtype=object) - tm.assert_numpy_array_equal(result, expected) - assert all(type(res) is datetime for res in result) - - msg = "The behavior of DatetimeProperties.to_pydatetime is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = ser.astype("datetime64[ns]").dt.to_pydatetime() - tm.assert_numpy_array_equal(result, expected) + expected = ser.astype("datetime64[ns]").dt.to_pydatetime() + tm.assert_series_equal(result, expected) @pytest.mark.parametrize("date_type", [32, 64]) @@ -2618,10 +2680,8 @@ def test_dt_to_pydatetime_date_error(date_type): [date(2022, 12, 31)], 
dtype=ArrowDtype(getattr(pa, f"date{date_type}")()), ) - msg = "The behavior of ArrowTemporalProperties.to_pydatetime is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - with pytest.raises(ValueError, match="to_pydatetime cannot be called with"): - ser.dt.to_pydatetime() + with pytest.raises(ValueError, match="to_pydatetime cannot be called with"): + ser.dt.to_pydatetime() def test_dt_tz_localize_unsupported_tz_options(): @@ -2723,6 +2783,140 @@ def test_dt_tz_convert(unit): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("dtype", ["timestamp[ms][pyarrow]", "duration[ms][pyarrow]"]) +def test_as_unit(dtype): + # GH 52284 + ser = pd.Series([1000, None], dtype=dtype) + result = ser.dt.as_unit("ns") + expected = ser.astype(dtype.replace("ms", "ns")) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "prop, expected", + [ + ["days", 1], + ["seconds", 2], + ["microseconds", 3], + ["nanoseconds", 4], + ], +) +def test_dt_timedelta_properties(prop, expected): + # GH 52284 + ser = pd.Series( + [ + pd.Timedelta( + days=1, + seconds=2, + microseconds=3, + nanoseconds=4, + ), + None, + ], + dtype=ArrowDtype(pa.duration("ns")), + ) + result = getattr(ser.dt, prop) + expected = pd.Series( + ArrowExtensionArray(pa.array([expected, None], type=pa.int32())) + ) + tm.assert_series_equal(result, expected) + + +def test_dt_timedelta_total_seconds(): + # GH 52284 + ser = pd.Series( + [ + pd.Timedelta( + days=1, + seconds=2, + microseconds=3, + nanoseconds=4, + ), + None, + ], + dtype=ArrowDtype(pa.duration("ns")), + ) + result = ser.dt.total_seconds() + expected = pd.Series( + ArrowExtensionArray(pa.array([86402.000003, None], type=pa.float64())) + ) + tm.assert_series_equal(result, expected) + + +def test_dt_to_pytimedelta(): + # GH 52284 + data = [timedelta(1, 2, 3), timedelta(1, 2, 4)] + ser = pd.Series(data, dtype=ArrowDtype(pa.duration("ns"))) + + msg = "The behavior of ArrowTemporalProperties.to_pytimedelta is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ser.dt.to_pytimedelta() + expected = np.array(data, dtype=object) + tm.assert_numpy_array_equal(result, expected) + assert all(type(res) is timedelta for res in result) + + msg = "The behavior of TimedeltaProperties.to_pytimedelta is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = ser.astype("timedelta64[ns]").dt.to_pytimedelta() + tm.assert_numpy_array_equal(result, expected) + + +def test_dt_components(): + # GH 52284 + ser = pd.Series( + [ + pd.Timedelta( + days=1, + seconds=2, + microseconds=3, + nanoseconds=4, + ), + None, + ], + dtype=ArrowDtype(pa.duration("ns")), + ) + result = ser.dt.components + expected = pd.DataFrame( + [[1, 0, 0, 2, 0, 3, 4], [None, None, None, None, None, None, None]], + columns=[ + "days", + "hours", + "minutes", + "seconds", + "milliseconds", + "microseconds", + "nanoseconds", + ], + dtype="int32[pyarrow]", + ) + tm.assert_frame_equal(result, expected) + + +def test_dt_components_large_values(): + ser = pd.Series( + [ + pd.Timedelta("365 days 23:59:59.999000"), + None, + ], + dtype=ArrowDtype(pa.duration("ns")), + ) + result = ser.dt.components + expected = pd.DataFrame( + [[365, 23, 59, 59, 999, 0, 0], [None, None, None, None, None, None, None]], + columns=[ + "days", + "hours", + "minutes", + "seconds", + "milliseconds", + "microseconds", + "nanoseconds", + ], + dtype="int32[pyarrow]", + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("skipna", 
[True, False]) def test_boolean_reduce_series_all_null(all_boolean_reductions, skipna): # GH51624 @@ -2744,13 +2938,14 @@ def test_from_sequence_of_strings_boolean(): [True] * len(true_strings) + [False] * len(false_strings) + [None] * len(nulls) ) - result = ArrowExtensionArray._from_sequence_of_strings(strings, dtype=pa.bool_()) + dtype = ArrowDtype(pa.bool_()) + result = ArrowExtensionArray._from_sequence_of_strings(strings, dtype=dtype) expected = pd.array(bools, dtype="boolean[pyarrow]") tm.assert_extension_array_equal(result, expected) strings = ["True", "foo"] with pytest.raises(pa.ArrowInvalid, match="Failed to parse"): - ArrowExtensionArray._from_sequence_of_strings(strings, dtype=pa.bool_()) + ArrowExtensionArray._from_sequence_of_strings(strings, dtype=dtype) def test_concat_empty_arrow_backed_series(dtype): @@ -3093,6 +3288,30 @@ def test_pow_missing_operand(): tm.assert_series_equal(result, expected) +@pytest.mark.skipif( + pa_version_under11p0, reason="Decimal128 to string cast implemented in pyarrow 11" +) +def test_decimal_parse_raises(): + # GH 56984 + ser = pd.Series(["1.2345"], dtype=ArrowDtype(pa.string())) + with pytest.raises( + pa.lib.ArrowInvalid, match="Rescaling Decimal128 value would cause data loss" + ): + ser.astype(ArrowDtype(pa.decimal128(1, 0))) + + +@pytest.mark.skipif( + pa_version_under11p0, reason="Decimal128 to string cast implemented in pyarrow 11" +) +def test_decimal_parse_succeeds(): + # GH 56984 + ser = pd.Series(["1.2345"], dtype=ArrowDtype(pa.string())) + dtype = ArrowDtype(pa.decimal128(5, 4)) + result = ser.astype(dtype) + expected = pd.Series([Decimal("1.2345")], dtype=dtype) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("pa_type", tm.TIMEDELTA_PYARROW_DTYPES) def test_duration_fillna_numpy(pa_type): # GH 54707 @@ -3124,6 +3343,34 @@ def test_factorize_chunked_dictionary(): tm.assert_index_equal(res_uniques, exp_uniques) +def test_factorize_dictionary_with_na(): + # GH#60567 + arr = pd.array( + ["a1", pd.NA], dtype=ArrowDtype(pa.dictionary(pa.int32(), pa.utf8())) + ) + indices, uniques = arr.factorize(use_na_sentinel=False) + expected_indices = np.array([0, 1], dtype=np.intp) + expected_uniques = pd.array(["a1", None], dtype=ArrowDtype(pa.string())) + tm.assert_numpy_array_equal(indices, expected_indices) + tm.assert_extension_array_equal(uniques, expected_uniques) + + +def test_dictionary_astype_categorical(): + # GH#56672 + arrs = [ + pa.array(np.array(["a", "x", "c", "a"])).dictionary_encode(), + pa.array(np.array(["a", "d", "c"])).dictionary_encode(), + ] + ser = pd.Series(ArrowExtensionArray(pa.chunked_array(arrs))) + result = ser.astype("category") + categories = pd.Index(["a", "x", "c", "d"], dtype=ArrowDtype(pa.string())) + expected = pd.Series( + ["a", "x", "c", "a", "a", "d", "c"], + dtype=pd.CategoricalDtype(categories=categories), + ) + tm.assert_series_equal(result, expected) + + def test_arrow_floordiv(): # GH 55561 a = pd.Series([-7], dtype="int64[pyarrow]") @@ -3133,16 +3380,124 @@ def test_arrow_floordiv(): tm.assert_series_equal(result, expected) +def test_arrow_floordiv_large_values(): + # GH 56645 + a = pd.Series([1425801600000000000], dtype="int64[pyarrow]") + expected = pd.Series([1425801600000], dtype="int64[pyarrow]") + result = a // 1_000_000 + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["int64[pyarrow]", "uint64[pyarrow]"]) +def test_arrow_floordiv_large_integral_result(dtype): + # GH 56676 + a = pd.Series([18014398509481983], dtype=dtype) + result = a // 
1 + tm.assert_series_equal(result, a) + + +@pytest.mark.parametrize("pa_type", tm.SIGNED_INT_PYARROW_DTYPES) +def test_arrow_floordiv_larger_divisor(pa_type): + # GH 56676 + dtype = ArrowDtype(pa_type) + a = pd.Series([-23], dtype=dtype) + result = a // 24 + expected = pd.Series([-1], dtype=dtype) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("pa_type", tm.SIGNED_INT_PYARROW_DTYPES) +def test_arrow_floordiv_integral_invalid(pa_type): + # GH 56676 + min_value = np.iinfo(pa_type.to_pandas_dtype()).min + a = pd.Series([min_value], dtype=ArrowDtype(pa_type)) + with pytest.raises(pa.lib.ArrowInvalid, match="overflow|not in range"): + a // -1 + with pytest.raises(pa.lib.ArrowInvalid, match="divide by zero"): + a // 0 + + +@pytest.mark.parametrize("dtype", tm.FLOAT_PYARROW_DTYPES_STR_REPR) +def test_arrow_floordiv_floating_0_divisor(dtype): + # GH 56676 + a = pd.Series([2], dtype=dtype) + result = a // 0 + expected = pd.Series([float("inf")], dtype=dtype) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["float64", "datetime64[ns]", "timedelta64[ns]"]) +def test_astype_int_with_null_to_numpy_dtype(dtype): + # GH 57093 + ser = pd.Series([1, None], dtype="int64[pyarrow]") + result = ser.astype(dtype) + expected = pd.Series([1, None], dtype=dtype) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("pa_type", tm.ALL_INT_PYARROW_DTYPES) +def test_arrow_integral_floordiv_large_values(pa_type): + # GH 56676 + max_value = np.iinfo(pa_type.to_pandas_dtype()).max + dtype = ArrowDtype(pa_type) + a = pd.Series([max_value], dtype=dtype) + b = pd.Series([1], dtype=dtype) + result = a // b + tm.assert_series_equal(result, a) + + +@pytest.mark.parametrize("dtype", ["int64[pyarrow]", "uint64[pyarrow]"]) +def test_arrow_true_division_large_divisor(dtype): + # GH 56706 + a = pd.Series([0], dtype=dtype) + b = pd.Series([18014398509481983], dtype=dtype) + expected = pd.Series([0], dtype="float64[pyarrow]") + result = a / b + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["int64[pyarrow]", "uint64[pyarrow]"]) +def test_arrow_floor_division_large_divisor(dtype): + # GH 56706 + a = pd.Series([0], dtype=dtype) + b = pd.Series([18014398509481983], dtype=dtype) + expected = pd.Series([0], dtype=dtype) + result = a // b + tm.assert_series_equal(result, expected) + + def test_string_to_datetime_parsing_cast(): # GH 56266 string_dates = ["2020-01-01 04:30:00", "2020-01-02 00:00:00", "2020-01-03 00:00:00"] - result = pd.Series(string_dates, dtype="timestamp[ns][pyarrow]") + result = pd.Series(string_dates, dtype="timestamp[s][pyarrow]") expected = pd.Series( ArrowExtensionArray(pa.array(pd.to_datetime(string_dates), from_pandas=True)) ) tm.assert_series_equal(result, expected) +@pytest.mark.skipif( + pa_version_under13p0, reason="pairwise_diff_checked not implemented in pyarrow" +) +def test_interpolate_not_numeric(data): + if not data.dtype._is_numeric: + ser = pd.Series(data) + msg = re.escape(f"Cannot interpolate with {ser.dtype} dtype") + with pytest.raises(TypeError, match=msg): + pd.Series(data).interpolate() + + +@pytest.mark.skipif( + pa_version_under13p0, reason="pairwise_diff_checked not implemented in pyarrow" +) +@pytest.mark.parametrize("dtype", ["int64[pyarrow]", "float64[pyarrow]"]) +def test_interpolate_linear(dtype): + ser = pd.Series([None, 1, 2, None, 4, None], dtype=dtype) + result = ser.interpolate() + expected = pd.Series([None, 1, 2, 3, 4, None], dtype=dtype) + 
tm.assert_series_equal(result, expected) + + def test_string_to_time_parsing_cast(): # GH 56463 string_times = ["11:41:43.076160"] @@ -3153,9 +3508,59 @@ def test_string_to_time_parsing_cast(): tm.assert_series_equal(result, expected) +def test_to_numpy_float(): + # GH#56267 + ser = pd.Series([32, 40, None], dtype="float[pyarrow]") + result = ser.astype("float64") + expected = pd.Series([32, 40, np.nan], dtype="float64") + tm.assert_series_equal(result, expected) + + def test_to_numpy_timestamp_to_int(): # GH 55997 ser = pd.Series(["2020-01-01 04:30:00"], dtype="timestamp[ns][pyarrow]") result = ser.to_numpy(dtype=np.int64) expected = np.array([1577853000000000000]) tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize("arrow_type", [pa.large_string(), pa.string()]) +def test_cast_dictionary_different_value_dtype(arrow_type): + df = pd.DataFrame({"a": ["x", "y"]}, dtype="string[pyarrow]") + data_type = ArrowDtype(pa.dictionary(pa.int32(), arrow_type)) + result = df.astype({"a": data_type}) + assert result.dtypes.iloc[0] == data_type + + +def test_map_numeric_na_action(): + ser = pd.Series([32, 40, None], dtype="int64[pyarrow]") + result = ser.map(lambda x: 42, na_action="ignore") + expected = pd.Series([42.0, 42.0, np.nan], dtype="float64") + tm.assert_series_equal(result, expected) + + +def test_categorical_from_arrow_dictionary(): + # GH 60563 + df = pd.DataFrame( + {"A": ["a1", "a2"]}, dtype=ArrowDtype(pa.dictionary(pa.int32(), pa.utf8())) + ) + result = df.value_counts(dropna=False) + expected = pd.Series( + [1, 1], + index=pd.MultiIndex.from_arrays( + [pd.Index(["a1", "a2"], dtype=ArrowDtype(pa.string()), name="A")] + ), + name="count", + dtype="int64", + ) + tm.assert_series_equal(result, expected) + + +@pytest.mark.skipif( + pa_version_under19p0, reason="pa.json_ was introduced in pyarrow v19.0" +) +def test_arrow_json_type(): + # GH 60958 + dtype = ArrowDtype(pa.json_(pa.string())) + result = dtype.type + assert result == str diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 6f33b18b19c51..8f8af607585df 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -13,12 +13,13 @@ be added to the array-specific tests in `pandas/tests/arrays/`. """ + import string import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas as pd from pandas import Categorical @@ -75,11 +76,6 @@ def data_for_grouping(): class TestCategorical(base.ExtensionTests): - @pytest.mark.xfail(reason="Memory usage doesn't match") - def test_memory_usage(self, data): - # TODO: Is this deliberate? - super().test_memory_usage(data) - def test_contains(self, data, data_missing): # GH-37867 # na value handling in Categorical.__contains__ is deprecated. @@ -103,7 +99,7 @@ def test_contains(self, data, data_missing): continue assert na_value_obj not in data # this section suffers from super method - if not using_pyarrow_string_dtype(): + if not using_string_dtype(): assert na_value_obj in data_missing def test_empty(self, dtype): @@ -122,10 +118,6 @@ def test_getitem_scalar(self, data): # to break things by changing. 
super().test_getitem_scalar(data) - @pytest.mark.xfail(reason="Unobserved categories included") - def test_value_counts(self, all_data, dropna): - return super().test_value_counts(all_data, dropna) - def test_combine_add(self, data_repeated): # GH 20825 # When adding categoricals in combine, result is a string diff --git a/pandas/tests/extension/test_common.py b/pandas/tests/extension/test_common.py index 3d8523f344d46..5eda0f00f54ca 100644 --- a/pandas/tests/extension/test_common.py +++ b/pandas/tests/extension/test_common.py @@ -17,7 +17,7 @@ class DummyArray(ExtensionArray): def __init__(self, data) -> None: self.data = data - def __array__(self, dtype): + def __array__(self, dtype=None, copy=None): return self.data @property @@ -30,8 +30,10 @@ def astype(self, dtype, copy=True): if copy: return type(self)(self.data) return self - - return np.array(self, dtype=dtype, copy=copy) + elif not copy: + return np.asarray(self, dtype=dtype) + else: + return np.array(self, dtype=dtype, copy=copy) class TestExtensionArrayDtype: diff --git a/pandas/tests/extension/test_datetime.py b/pandas/tests/extension/test_datetime.py index 7f70957007dad..356d5352f41f4 100644 --- a/pandas/tests/extension/test_datetime.py +++ b/pandas/tests/extension/test_datetime.py @@ -13,6 +13,7 @@ be added to the array-specific tests in `pandas/tests/arrays/`. """ + import numpy as np import pytest @@ -24,9 +25,9 @@ from pandas.tests.extension import base -@pytest.fixture(params=["US/Central"]) -def dtype(request): - return DatetimeTZDtype(unit="ns", tz=request.param) +@pytest.fixture +def dtype(): + return DatetimeTZDtype(unit="ns", tz="US/Central") @pytest.fixture @@ -94,6 +95,11 @@ def _get_expected_exception(self, op_name, obj, other): return None return super()._get_expected_exception(op_name, obj, other) + def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): + if op_name == "std": + return "timedelta64[ns]" + return arr.dtype + def _supports_accumulation(self, ser, op_name: str) -> bool: return op_name in ["cummin", "cummax"] @@ -103,10 +109,8 @@ def _supports_reduction(self, obj, op_name: str) -> bool: @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_series_boolean(self, data, all_boolean_reductions, skipna): meth = all_boolean_reductions - msg = f"'{meth}' with datetime64 dtypes is deprecated and will raise in" - with tm.assert_produces_warning( - FutureWarning, match=msg, check_stacklevel=False - ): + msg = f"datetime64 type does not support operation '{meth}'" + with pytest.raises(TypeError, match=msg): super().test_reduce_series_boolean(data, all_boolean_reductions, skipna) def test_series_constructor(self, data): diff --git a/pandas/tests/extension/test_extension.py b/pandas/tests/extension/test_extension.py index 1ed626cd51080..456f4863b1c31 100644 --- a/pandas/tests/extension/test_extension.py +++ b/pandas/tests/extension/test_extension.py @@ -1,6 +1,7 @@ """ Tests for behavior if an author does *not* implement EA methods. """ + import numpy as np import pytest diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index 98dd1c5cb615f..011bf0b2016b2 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -13,6 +13,7 @@ be added to the array-specific tests in `pandas/tests/arrays/`. 
""" + from __future__ import annotations from typing import TYPE_CHECKING @@ -83,6 +84,16 @@ class TestIntervalArray(base.ExtensionTests): def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: return op_name in ["min", "max"] + def test_fillna_limit_frame(self, data_missing): + # GH#58001 + with pytest.raises(ValueError, match="limit must be None"): + super().test_fillna_limit_frame(data_missing) + + def test_fillna_limit_series(self, data_missing): + # GH#58001 + with pytest.raises(ValueError, match="limit must be None"): + super().test_fillna_limit_frame(data_missing) + @pytest.mark.xfail( reason="Raises with incorrect message bc it disallows *all* listlikes " "instead of just wrong-length listlikes" @@ -90,6 +101,31 @@ def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: def test_fillna_length_mismatch(self, data_missing): super().test_fillna_length_mismatch(data_missing) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + def test_hash_pandas_object(self, data): + super().test_hash_pandas_object(data) + + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + def test_hash_pandas_object_works(self, data, as_frame): + super().test_hash_pandas_object_works(data, as_frame) + + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + @pytest.mark.parametrize("engine", ["c", "python"]) + def test_EA_types(self, engine, data, request): + super().test_EA_types(engine, data, request) + + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + def test_astype_str(self, data): + super().test_astype_str(data) + # TODO: either belongs in tests.arrays.interval or move into base tests. def test_fillna_non_scalar_raises(data_missing): diff --git a/pandas/tests/extension/test_masked.py b/pandas/tests/extension/test_masked.py index 3efc561d6a125..3b9079d06e231 100644 --- a/pandas/tests/extension/test_masked.py +++ b/pandas/tests/extension/test_masked.py @@ -13,7 +13,6 @@ be added to the array-specific tests in `pandas/tests/arrays/`. 
""" -import warnings import numpy as np import pytest @@ -179,6 +178,15 @@ def test_map(self, data_missing, na_action): expected = data_missing.to_numpy() tm.assert_numpy_array_equal(result, expected) + def test_map_na_action_ignore(self, data_missing_for_sorting): + zero = data_missing_for_sorting[2] + result = data_missing_for_sorting.map(lambda x: zero, na_action="ignore") + if data_missing_for_sorting.dtype.kind == "b": + expected = np.array([False, pd.NA, False], dtype=object) + else: + expected = np.array([zero, np.nan, zero]) + tm.assert_numpy_array_equal(result, expected) + def _get_expected_exception(self, op_name, obj, other): try: dtype = tm.get_dtype(obj) @@ -205,13 +213,7 @@ def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): if sdtype.kind in "iu": if op_name in ("__rtruediv__", "__truediv__", "__div__"): - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - "Downcasting object dtype arrays", - category=FutureWarning, - ) - filled = expected.fillna(np.nan) + filled = expected.fillna(np.nan) expected = filled.astype("Float64") else: # combine method result in 'biggest' (int64) dtype @@ -299,7 +301,7 @@ def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): if is_float_dtype(arr.dtype): cmp_dtype = arr.dtype.name - elif op_name in ["mean", "median", "var", "std", "skew"]: + elif op_name in ["mean", "median", "var", "std", "skew", "kurt", "sem"]: cmp_dtype = "Float64" elif op_name in ["max", "min"]: cmp_dtype = arr.dtype.name @@ -321,9 +323,7 @@ def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): else "UInt64" ) elif arr.dtype.kind == "b": - if op_name in ["mean", "median", "var", "std", "skew"]: - cmp_dtype = "Float64" - elif op_name in ["min", "max"]: + if op_name in ["min", "max"]: cmp_dtype = "boolean" elif op_name in ["sum", "prod"]: cmp_dtype = ( diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index aaf49f53ba02b..79cfb736941d6 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -15,6 +15,7 @@ Note: we do not bother with base.BaseIndexTests because NumpyExtensionArray will never be held in an Index. """ + import numpy as np import pytest @@ -204,6 +205,18 @@ def test_shift_fill_value(self, data): # np.array shape inference. Shift implementation fails. super().test_shift_fill_value(data) + @skip_nested + def test_fillna_limit_frame(self, data_missing): + # GH#58001 + # The "scalar" for this array isn't a scalar. + super().test_fillna_limit_frame(data_missing) + + @skip_nested + def test_fillna_limit_series(self, data_missing): + # GH#58001 + # The "scalar" for this array isn't a scalar. + super().test_fillna_limit_series(data_missing) + @skip_nested def test_fillna_copy_frame(self, data_missing): # The "scalar" for this array isn't a scalar. 
@@ -374,19 +387,6 @@ def test_setitem_mask(self, data, mask, box_in_series): def test_setitem_integer_array(self, data, idx, box_in_series): super().test_setitem_integer_array(data, idx, box_in_series) - @pytest.mark.parametrize( - "idx, box_in_series", - [ - ([0, 1, 2, pd.NA], False), - pytest.param([0, 1, 2, pd.NA], True, marks=pytest.mark.xfail), - (pd.array([0, 1, 2, pd.NA], dtype="Int64"), False), - (pd.array([0, 1, 2, pd.NA], dtype="Int64"), False), - ], - ids=["list-False", "list-True", "integer-array-False", "integer-array-True"], - ) - def test_setitem_integer_with_missing_raises(self, data, idx, box_in_series): - super().test_setitem_integer_with_missing_raises(data, idx, box_in_series) - @skip_nested def test_setitem_slice(self, data, box_in_series): super().test_setitem_slice(data, box_in_series) @@ -421,16 +421,6 @@ def test_index_from_listlike_with_dtype(self, data): def test_EA_types(self, engine, data, request): super().test_EA_types(engine, data, request) - @pytest.mark.xfail(reason="Expect NumpyEA, get np.ndarray") - def test_compare_array(self, data, comparison_op): - super().test_compare_array(data, comparison_op) - - def test_compare_scalar(self, data, comparison_op, request): - if data.dtype.kind == "f" or comparison_op.__name__ in ["eq", "ne"]: - mark = pytest.mark.xfail(reason="Expect NumpyEA, get np.ndarray") - request.applymarker(mark) - super().test_compare_scalar(data, comparison_op) - class Test2DCompat(base.NDArrayBacked2DTests): pass diff --git a/pandas/tests/extension/test_period.py b/pandas/tests/extension/test_period.py index 2d1d213322bac..142bad6db4f95 100644 --- a/pandas/tests/extension/test_period.py +++ b/pandas/tests/extension/test_period.py @@ -13,6 +13,7 @@ be added to the array-specific tests in `pandas/tests/arrays/`. 
""" + from __future__ import annotations from typing import TYPE_CHECKING diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 4039a5d01f372..b7685a61d4937 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -17,8 +17,6 @@ import numpy as np import pytest -from pandas.errors import PerformanceWarning - import pandas as pd from pandas import SparseDtype import pandas._testing as tm @@ -70,7 +68,7 @@ def gen(count): for _ in range(count): yield SparseArray(make_data(request.param), fill_value=request.param) - yield gen + return gen @pytest.fixture(params=[0, np.nan]) @@ -236,16 +234,7 @@ def test_isna(self, data_missing): expected = SparseArray([False, False], fill_value=False, dtype=expected_dtype) tm.assert_equal(sarr.isna(), expected) - def test_fillna_limit_backfill(self, data_missing): - warns = (PerformanceWarning, FutureWarning) - with tm.assert_produces_warning(warns, check_stacklevel=False): - super().test_fillna_limit_backfill(data_missing) - def test_fillna_no_op_returns_copy(self, data, request): - if np.isnan(data.fill_value): - request.applymarker( - pytest.mark.xfail(reason="returns array with different fill value") - ) super().test_fillna_no_op_returns_copy(data) @pytest.mark.xfail(reason="Unsupported") @@ -275,9 +264,19 @@ def test_fillna_frame(self, data_missing): tm.assert_frame_equal(result, expected) + def test_fillna_limit_frame(self, data_missing): + # GH#58001 + with pytest.raises(ValueError, match="limit must be None"): + super().test_fillna_limit_frame(data_missing) + + def test_fillna_limit_series(self, data_missing): + # GH#58001 + with pytest.raises(ValueError, match="limit must be None"): + super().test_fillna_limit_frame(data_missing) + _combine_le_expected_dtype = "Sparse[bool]" - def test_fillna_copy_frame(self, data_missing, using_copy_on_write): + def test_fillna_copy_frame(self, data_missing): arr = data_missing.take([1, 1]) df = pd.DataFrame({"A": arr}, copy=False) @@ -285,24 +284,17 @@ def test_fillna_copy_frame(self, data_missing, using_copy_on_write): result = df.fillna(filled_val) if hasattr(df._mgr, "blocks"): - if using_copy_on_write: - assert df.values.base is result.values.base - else: - assert df.values.base is not result.values.base + assert df.values.base is result.values.base assert df.A._values.to_dense() is arr.to_dense() - def test_fillna_copy_series(self, data_missing, using_copy_on_write): + def test_fillna_copy_series(self, data_missing): arr = data_missing.take([1, 1]) ser = pd.Series(arr, copy=False) filled_val = ser[0] result = ser.fillna(filled_val) - if using_copy_on_write: - assert ser._values is result._values - - else: - assert ser._values is not result._values + assert ser._values is result._values assert ser._values.to_dense() is arr.to_dense() @pytest.mark.xfail(reason="Not Applicable") @@ -331,8 +323,8 @@ def test_where_series(self, data, na_value): expected = pd.Series(cls._from_sequence([a, b, b, b], dtype=data.dtype)) tm.assert_series_equal(result, expected) - def test_searchsorted(self, data_for_sorting, as_series): - with tm.assert_produces_warning(PerformanceWarning, check_stacklevel=False): + def test_searchsorted(self, performance_warning, data_for_sorting, as_series): + with tm.assert_produces_warning(performance_warning, check_stacklevel=False): super().test_searchsorted(data_for_sorting, as_series) def test_shift_0_periods(self, data): @@ -348,11 +340,16 @@ def test_argmin_argmax_all_na(self, method, data, na_value): 
self._check_unsupported(data) super().test_argmin_argmax_all_na(method, data, na_value) + @pytest.mark.fails_arm_wheels @pytest.mark.parametrize("box", [pd.array, pd.Series, pd.DataFrame]) def test_equals(self, data, na_value, as_series, box): self._check_unsupported(data) super().test_equals(data, na_value, as_series, box) + @pytest.mark.fails_arm_wheels + def test_equals_same_data_different_object(self, data): + super().test_equals_same_data_different_object(data) + @pytest.mark.parametrize( "func, na_action, expected", [ @@ -409,6 +406,8 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): "rmul", "floordiv", "rfloordiv", + "truediv", + "rtruediv", "pow", "mod", "rmod", diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 2d5a134f8560a..96c014f549056 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -13,6 +13,7 @@ be added to the array-specific tests in `pandas/tests/arrays/`. """ + from __future__ import annotations import string @@ -21,11 +22,16 @@ import numpy as np import pytest +from pandas.compat import HAS_PYARROW + +from pandas.core.dtypes.base import StorageExtensionDtype + import pandas as pd import pandas._testing as tm from pandas.api.types import is_string_dtype from pandas.core.arrays import ArrowStringArray from pandas.core.arrays.string_ import StringDtype +from pandas.tests.arrays.string_.test_string import string_dtype_highest_priority from pandas.tests.extension import base @@ -52,8 +58,9 @@ def chunked(request): @pytest.fixture -def dtype(string_storage): - return StringDtype(storage=string_storage) +def dtype(string_dtype_arguments): + storage, na_value = string_dtype_arguments + return StringDtype(storage=storage, na_value=na_value) @pytest.fixture @@ -95,16 +102,36 @@ def data_for_grouping(dtype, chunked): class TestStringArray(base.ExtensionTests): def test_eq_with_str(self, dtype): - assert dtype == f"string[{dtype.storage}]" super().test_eq_with_str(dtype) + if dtype.na_value is pd.NA: + # only the NA-variant supports parametrized string alias + assert dtype == f"string[{dtype.storage}]" + elif dtype.storage == "pyarrow": + with tm.assert_produces_warning(FutureWarning): + assert dtype == "string[pyarrow_numpy]" + def test_is_not_string_type(self, dtype): # Different from BaseDtypeTests.test_is_not_string_type # because StringDtype is a string type assert is_string_dtype(dtype) - def test_view(self, data, request, arrow_string_storage): - if data.dtype.storage in arrow_string_storage: + def test_is_dtype_from_name(self, dtype, using_infer_string): + if dtype.na_value is np.nan and not using_infer_string: + result = type(dtype).is_dtype(dtype.name) + assert result is False + else: + super().test_is_dtype_from_name(dtype) + + def test_construct_from_string_own_name(self, dtype, using_infer_string): + if dtype.na_value is np.nan and not using_infer_string: + with pytest.raises(TypeError, match="Cannot construct a 'StringDtype'"): + dtype.construct_from_string(dtype.name) + else: + super().test_construct_from_string_own_name(dtype) + + def test_view(self, data): + if data.dtype.storage == "pyarrow": pytest.skip(reason="2D support not implemented for ArrowStringArray") super().test_view(data) @@ -112,13 +139,13 @@ def test_from_dtype(self, data): # base test uses string representation of dtype pass - def test_transpose(self, data, request, arrow_string_storage): - if data.dtype.storage in arrow_string_storage: + def test_transpose(self, data): + 
if data.dtype.storage == "pyarrow": pytest.skip(reason="2D support not implemented for ArrowStringArray") super().test_transpose(data) - def test_setitem_preserves_views(self, data, request, arrow_string_storage): - if data.dtype.storage in arrow_string_storage: + def test_setitem_preserves_views(self, data): + if data.dtype.storage == "pyarrow": pytest.skip(reason="2D support not implemented for ArrowStringArray") super().test_setitem_preserves_views(data) @@ -135,37 +162,17 @@ def test_fillna_no_op_returns_copy(self, data): assert result is not data tm.assert_extension_array_equal(result, data) - result = data.fillna(method="backfill") - assert result is not data - tm.assert_extension_array_equal(result, data) - def _get_expected_exception( self, op_name: str, obj, other - ) -> type[Exception] | None: - if op_name in ["__divmod__", "__rdivmod__"]: - if isinstance(obj, pd.Series) and cast( - StringDtype, tm.get_dtype(obj) - ).storage in [ - "pyarrow", - "pyarrow_numpy", - ]: - # TODO: re-raise as TypeError? - return NotImplementedError - elif isinstance(other, pd.Series) and cast( - StringDtype, tm.get_dtype(other) - ).storage in [ - "pyarrow", - "pyarrow_numpy", - ]: - # TODO: re-raise as TypeError? - return NotImplementedError - return TypeError - elif op_name in ["__mod__", "__rmod__", "__pow__", "__rpow__"]: - if cast(StringDtype, tm.get_dtype(obj)).storage in [ - "pyarrow", - "pyarrow_numpy", - ]: - return NotImplementedError + ) -> type[Exception] | tuple[type[Exception], ...] | None: + if op_name in [ + "__mod__", + "__rmod__", + "__divmod__", + "__rdivmod__", + "__pow__", + "__rpow__", + ]: return TypeError elif op_name in ["__mul__", "__rmul__"]: # Can only multiply strings by integers @@ -178,33 +185,31 @@ def _get_expected_exception( "__sub__", "__rsub__", ]: - if cast(StringDtype, tm.get_dtype(obj)).storage in [ - "pyarrow", - "pyarrow_numpy", - ]: - import pyarrow as pa - - # TODO: better to re-raise as TypeError? 
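A sketch of the unified expectation in the rewritten `_get_expected_exception` above: modulo, divmod, and power ops on string data are expected to surface as `TypeError` for every storage backend, rather than backend-specific `NotImplementedError`s.

```python
import pandas as pd
import pytest

ser = pd.Series(["a", "b"], dtype="string")
with pytest.raises(TypeError):
    ser**2  # strings cannot be exponentiated, whatever the storage
```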
- return pa.ArrowNotImplementedError return TypeError return None def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: - return ( - op_name in ["min", "max"] - or ser.dtype.storage == "pyarrow_numpy" # type: ignore[union-attr] + return op_name in ["min", "max", "sum"] or ( + ser.dtype.na_value is np.nan # type: ignore[union-attr] and op_name in ("any", "all") ) + def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: + assert isinstance(ser.dtype, StorageExtensionDtype) + return op_name in ["cummin", "cummax", "cumsum"] + def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): dtype = cast(StringDtype, tm.get_dtype(obj)) if op_name in ["__add__", "__radd__"]: cast_to = dtype - elif dtype.storage == "pyarrow": - cast_to = "boolean[pyarrow]" # type: ignore[assignment] - elif dtype.storage == "pyarrow_numpy": + dtype_other = tm.get_dtype(other) if not isinstance(other, str) else None + if isinstance(dtype_other, StringDtype): + cast_to = string_dtype_highest_priority(dtype, dtype_other) + elif dtype.na_value is np.nan: cast_to = np.bool_ # type: ignore[assignment] + elif dtype.storage == "pyarrow": + cast_to = "bool[pyarrow]" # type: ignore[assignment] else: cast_to = "boolean" # type: ignore[assignment] return pointwise_result.astype(cast_to) @@ -213,10 +218,39 @@ def test_compare_scalar(self, data, comparison_op): ser = pd.Series(data) self._compare_other(ser, data, comparison_op, "abc") - @pytest.mark.filterwarnings("ignore:Falling back:pandas.errors.PerformanceWarning") def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op) + def test_combine_add(self, data_repeated, using_infer_string, request): + dtype = next(data_repeated(1)).dtype + if using_infer_string and ( + (dtype.na_value is pd.NA) and dtype.storage == "python" + ): + mark = pytest.mark.xfail( + reason="The pointwise operation result will be inferred to " + "string[nan, pyarrow], which does not match the input dtype" + ) + request.applymarker(mark) + super().test_combine_add(data_repeated) + + def test_arith_series_with_array( + self, data, all_arithmetic_operators, using_infer_string, request + ): + dtype = data.dtype + if ( + using_infer_string + and all_arithmetic_operators == "__radd__" + and dtype.na_value is pd.NA + and (HAS_PYARROW or dtype.storage == "pyarrow") + ): + # TODO(infer_string) + mark = pytest.mark.xfail( + reason="The pointwise operation result will be inferred to " + "string[nan, pyarrow], which does not match the input dtype" + ) + request.applymarker(mark) + super().test_arith_series_with_array(data, all_arithmetic_operators) + class Test2DCompat(base.Dim2CompatTests): @pytest.fixture(autouse=True) diff --git a/pandas/tests/frame/conftest.py b/pandas/tests/frame/conftest.py index e07024b2e2a09..b3140bad8276b 100644 --- a/pandas/tests/frame/conftest.py +++ b/pandas/tests/frame/conftest.py @@ -17,9 +17,9 @@ def datetime_frame() -> DataFrame: Columns are ['A', 'B', 'C', 'D'] """ return DataFrame( - np.random.default_rng(2).standard_normal((100, 4)), - columns=Index(list("ABCD"), dtype=object), - index=date_range("2000-01-01", periods=100, freq="B"), + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD")), + index=date_range("2000-01-01", periods=10, freq="B"), ) @@ -33,7 +33,7 @@ def float_string_frame(): df = DataFrame( np.random.default_rng(2).standard_normal((30, 4)), index=Index([f"foo_{i}" for i in range(30)], dtype=object), - 
columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), ) df["foo"] = "bar" return df diff --git a/pandas/tests/frame/constructors/test_from_dict.py b/pandas/tests/frame/constructors/test_from_dict.py index 60a8e688b3b8a..845174bbf600e 100644 --- a/pandas/tests/frame/constructors/test_from_dict.py +++ b/pandas/tests/frame/constructors/test_from_dict.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype - from pandas import ( DataFrame, Index, @@ -44,9 +42,6 @@ def test_constructor_single_row(self): ) tm.assert_frame_equal(result, expected) - @pytest.mark.skipif( - using_pyarrow_string_dtype(), reason="columns inferring logic broken" - ) def test_constructor_list_of_series(self): data = [ OrderedDict([["a", 1.5], ["b", 3.0], ["c", 4.0]]), diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index 3622571f1365d..1d4a2c0075e3e 100644 --- a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -1,12 +1,14 @@ from collections.abc import Iterator -from datetime import datetime +from datetime import ( + datetime, + timezone, +) from decimal import Decimal import numpy as np import pytest -import pytz -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.compat import is_platform_little_endian @@ -17,19 +19,16 @@ Interval, RangeIndex, Series, - date_range, ) import pandas._testing as tm class TestFromRecords: def test_from_records_dt64tz_frame(self): - # GH#51162 don't lose tz when calling from_records with DataFrame input - dti = date_range("2016-01-01", periods=10, tz="US/Pacific") - df = DataFrame({i: dti for i in range(4)}) - with tm.assert_produces_warning(FutureWarning): - res = DataFrame.from_records(df) - tm.assert_frame_equal(res, df) + # GH#51697 + df = DataFrame({"a": [1, 2, 3]}) + with pytest.raises(TypeError, match="not supported"): + DataFrame.from_records(df) def test_from_records_with_datetimes(self): # this may fail on certain platforms because of a numpy issue @@ -42,7 +41,7 @@ def test_from_records_with_datetimes(self): expected = DataFrame({"EXPIRY": [datetime(2005, 3, 1, 0, 0), None]}) arrdata = [np.array([datetime(2005, 3, 1, 0, 0), None])] - dtypes = [("EXPIRY", " None: @@ -279,7 +245,7 @@ def test_from_records_series_categorical_index(self): tm.assert_frame_equal(frame, expected) def test_frame_from_records_utc(self): - rec = {"datum": 1.5, "begin_time": datetime(2006, 4, 27, tzinfo=pytz.utc)} + rec = {"datum": 1.5, "begin_time": datetime(2006, 4, 27, tzinfo=timezone.utc)} # it works DataFrame.from_records([rec], index="begin_time") @@ -503,3 +469,26 @@ def test_from_records_empty2(self): alt = DataFrame(arr) tm.assert_frame_equal(alt, expected) + + def test_from_records_structured_array(self): + # GH 59717 + data = np.array( + [ + ("John", 25, "New York", 50000), + ("Jane", 30, "San Francisco", 75000), + ("Bob", 35, "Chicago", 65000), + ("Alice", 28, "Los Angeles", 60000), + ], + dtype=[("name", "U10"), ("age", "i4"), ("city", "U15"), ("salary", "i4")], + ) + + actual_result = DataFrame.from_records(data, columns=["name", "salary", "city"]) + + modified_data = { + "name": ["John", "Jane", "Bob", "Alice"], + "salary": np.array([50000, 75000, 65000, 60000], dtype="int32"), + "city": ["New York", "San Francisco", "Chicago", "Los Angeles"], + } + expected_result = DataFrame(modified_data) + + 
tm.assert_frame_equal(actual_result, expected_result) diff --git a/pandas/tests/frame/indexing/test_coercion.py b/pandas/tests/frame/indexing/test_coercion.py index ba0d8613b6228..472bfb7772a80 100644 --- a/pandas/tests/frame/indexing/test_coercion.py +++ b/pandas/tests/frame/indexing/test_coercion.py @@ -4,6 +4,7 @@ For the most part, these should be multi-column DataFrames, otherwise we would share the tests with Series. """ + import numpy as np import pytest @@ -48,35 +49,19 @@ def test_loc_setitem_multiindex_columns(self, consolidate): def test_37477(): # fixed by GH#45121 orig = DataFrame({"A": [1, 2, 3], "B": [3, 4, 5]}) - expected = DataFrame({"A": [1, 2, 3], "B": [3, 1.2, 5]}) df = orig.copy() - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): df.at[1, "B"] = 1.2 - tm.assert_frame_equal(df, expected) - df = orig.copy() - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): df.loc[1, "B"] = 1.2 - tm.assert_frame_equal(df, expected) - df = orig.copy() - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): df.iat[1, 1] = 1.2 - tm.assert_frame_equal(df, expected) - df = orig.copy() - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): df.iloc[1, 1] = 1.2 - tm.assert_frame_equal(df, expected) def test_6942(indexer_al): @@ -106,19 +91,11 @@ def test_26395(indexer_al): expected = DataFrame({"D": [0, 0, 2]}, index=["A", "B", "C"], dtype=np.int64) tm.assert_frame_equal(df, expected) - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): indexer_al(df)["C", "D"] = 44.5 - expected = DataFrame({"D": [0, 0, 44.5]}, index=["A", "B", "C"], dtype=np.float64) - tm.assert_frame_equal(df, expected) - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): indexer_al(df)["C", "D"] = "hello" - expected = DataFrame({"D": [0, 0, "hello"]}, index=["A", "B", "C"], dtype=object) - tm.assert_frame_equal(df, expected) @pytest.mark.xfail(reason="unwanted upcast") diff --git a/pandas/tests/frame/indexing/test_get.py b/pandas/tests/frame/indexing/test_get.py index 5f2651eec683c..75bad0ec1f159 100644 --- a/pandas/tests/frame/indexing/test_get.py +++ b/pandas/tests/frame/indexing/test_get.py @@ -15,13 +15,13 @@ def test_get(self, float_frame): ) @pytest.mark.parametrize( - "df", + "columns, index", [ - DataFrame(), - DataFrame(columns=list("AB")), - DataFrame(columns=list("AB"), index=range(3)), + [None, None], + [list("AB"), None], + [list("AB"), range(3)], ], ) - def test_get_none(self, df): + def test_get_none(self, columns, index): # see gh-5652 - assert df.get(None) is None + assert DataFrame(columns=columns, index=index).get(None) is None diff --git a/pandas/tests/frame/indexing/test_getitem.py b/pandas/tests/frame/indexing/test_getitem.py index a36b0c0e850b3..25d6e06a4c675 100644 --- a/pandas/tests/frame/indexing/test_getitem.py +++ b/pandas/tests/frame/indexing/test_getitem.py @@ -391,19 +391,13 @@ def test_getitem_empty_frame_with_boolean(self): df2 = df[df > 0] tm.assert_frame_equal(df, 
df2) - def test_getitem_returns_view_when_column_is_unique_in_df( - self, using_copy_on_write, warn_copy_on_write - ): + def test_getitem_returns_view_when_column_is_unique_in_df(self): # GH#45316 df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"]) df_orig = df.copy() view = df["b"] - with tm.assert_cow_warning(warn_copy_on_write): - view.loc[:] = 100 - if using_copy_on_write: - expected = df_orig - else: - expected = DataFrame([[1, 2, 100], [4, 5, 100]], columns=["a", "a", "b"]) + view.loc[:] = 100 + expected = df_orig tm.assert_frame_equal(df, expected) def test_getitem_frozenset_unique_in_column(self): diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 97e7ae15c6c63..0c99b08cb30c4 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -10,12 +10,7 @@ import pytest from pandas._libs import iNaT -from pandas.errors import ( - InvalidIndexError, - PerformanceWarning, - SettingWithCopyError, -) -import pandas.util._test_decorators as td +from pandas.errors import InvalidIndexError from pandas.core.dtypes.common import is_integer @@ -30,7 +25,6 @@ Timestamp, date_range, isna, - notna, to_datetime, ) import pandas._testing as tm @@ -123,7 +117,7 @@ def test_setitem_list2(self): def test_getitem_boolean(self, mixed_float_frame, mixed_int_frame, datetime_frame): # boolean indexing - d = datetime_frame.index[10] + d = datetime_frame.index[len(datetime_frame) // 2] indexer = datetime_frame.index > d indexer_obj = indexer.astype(object) @@ -137,7 +131,7 @@ def test_getitem_boolean(self, mixed_float_frame, mixed_int_frame, datetime_fram subframe_obj = datetime_frame[indexer_obj] tm.assert_frame_equal(subframe_obj, subframe) - with pytest.raises(ValueError, match="Boolean array expected"): + with pytest.raises(TypeError, match="Boolean array expected"): datetime_frame[datetime_frame] # test that Series work @@ -150,7 +144,7 @@ def test_getitem_boolean(self, mixed_float_frame, mixed_int_frame, datetime_fram # we are producing a warning that since the passed boolean # key is not the same as the given index, we will reindex # not sure this is really necessary - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="will be reindexed"): indexer_obj = indexer_obj.reindex(datetime_frame.index[::-1]) subframe_obj = datetime_frame[indexer_obj] tm.assert_frame_equal(subframe_obj, subframe) @@ -288,9 +282,7 @@ def test_setattr_column(self): df.foobar = 5 assert (df.foobar == 5).all() - def test_setitem( - self, float_frame, using_copy_on_write, warn_copy_on_write, using_infer_string - ): + def test_setitem(self, float_frame, using_infer_string): # not sure what else to do here series = float_frame["A"][::2] float_frame["col5"] = series @@ -325,16 +317,10 @@ def test_setitem( # so raise/warn smaller = float_frame[:2] - msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame" - if using_copy_on_write or warn_copy_on_write: - # With CoW, adding a new column doesn't raise a warning - smaller["col10"] = ["1", "2"] - else: - with pytest.raises(SettingWithCopyError, match=msg): - smaller["col10"] = ["1", "2"] + smaller["col10"] = ["1", "2"] if using_infer_string: - assert smaller["col10"].dtype == "string" + assert smaller["col10"].dtype == "str" else: assert smaller["col10"].dtype == np.object_ assert (smaller["col10"] == ["1", "2"]).all() @@ -469,13 +455,13 @@ def test_setitem_corner(self, float_frame, using_infer_string): 
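A sketch of the copy-on-write semantics the rewritten getitem test above asserts (pandas 3.x behavior): writing through a column selection no longer mutates the parent frame.

```python
import pandas as pd

df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"])
view = df["b"]
view.loc[:] = 100  # under copy-on-write this writes into a lazily copied Series
assert df["b"].tolist() == [3, 6]  # the parent keeps its original values
```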
del dm["foo"] dm["foo"] = "bar" if using_infer_string: - assert dm["foo"].dtype == "string" + assert dm["foo"].dtype == "str" else: assert dm["foo"].dtype == np.object_ dm["coercible"] = ["1", "2", "3"] if using_infer_string: - assert dm["coercible"].dtype == "string" + assert dm["coercible"].dtype == "str" else: assert dm["coercible"].dtype == np.object_ @@ -511,21 +497,20 @@ def test_setitem_ambig(self, using_infer_string): dm[2] = uncoercable_series assert len(dm.columns) == 3 if using_infer_string: - assert dm[2].dtype == "string" + assert dm[2].dtype == "str" else: assert dm[2].dtype == np.object_ - def test_setitem_None(self, float_frame, using_infer_string): + def test_setitem_None(self, float_frame): # GH #766 float_frame[None] = float_frame["A"] - key = None if not using_infer_string else np.nan tm.assert_series_equal( float_frame.iloc[:, -1], float_frame["A"], check_names=False ) tm.assert_series_equal( - float_frame.loc[:, key], float_frame["A"], check_names=False + float_frame.loc[:, None], float_frame["A"], check_names=False ) - tm.assert_series_equal(float_frame[key], float_frame["A"], check_names=False) + tm.assert_series_equal(float_frame[None], float_frame["A"], check_names=False) def test_loc_setitem_boolean_mask_allfalse(self): # GH 9596 @@ -537,6 +522,16 @@ def test_loc_setitem_boolean_mask_allfalse(self): result.loc[result.b.isna(), "a"] = result.a.copy() tm.assert_frame_equal(result, df) + def test_getitem_slice_empty(self): + df = DataFrame([[1]], columns=MultiIndex.from_product([["A"], ["a"]])) + result = df[:] + + expected = DataFrame([[1]], columns=MultiIndex.from_product([["A"], ["a"]])) + + tm.assert_frame_equal(result, expected) + # Ensure df[:] returns a view of df, not the same object + assert result is not df + def test_getitem_fancy_slice_integers_step(self): df = DataFrame(np.random.default_rng(2).standard_normal((10, 5))) @@ -574,10 +569,7 @@ def test_getitem_setitem_integer_slice_keyerrors(self): with pytest.raises(KeyError, match=r"^3$"): df2.loc[3:11] = 0 - @td.skip_array_manager_invalid_test # already covered in test_iloc_col_slice_view - def test_fancy_getitem_slice_mixed( - self, float_frame, float_string_frame, using_copy_on_write, warn_copy_on_write - ): + def test_fancy_getitem_slice_mixed(self, float_frame, float_string_frame): sliced = float_string_frame.iloc[:, -3:] assert sliced["D"].dtype == np.float64 @@ -588,15 +580,8 @@ def test_fancy_getitem_slice_mixed( assert np.shares_memory(sliced["C"]._values, float_frame["C"]._values) - with tm.assert_cow_warning(warn_copy_on_write): - sliced.loc[:, "C"] = 4.0 - if not using_copy_on_write: - assert (float_frame["C"] == 4).all() - - # with the enforcement of GH#45333 in 2.0, this remains a view - np.shares_memory(sliced["C"]._values, float_frame["C"]._values) - else: - tm.assert_frame_equal(float_frame, original) + sliced.loc[:, "C"] = 4.0 + tm.assert_frame_equal(float_frame, original) def test_getitem_setitem_non_ix_labels(self): df = DataFrame(range(20), index=date_range("2020-01-01", periods=20)) @@ -640,7 +625,6 @@ def test_getitem_fancy_scalar(self, float_frame): for idx in f.index[::5]: assert ix[idx, col] == ts[idx] - @td.skip_array_manager_invalid_test # TODO(ArrayManager) rewrite not using .values def test_setitem_fancy_scalar(self, float_frame): f = float_frame expected = float_frame.copy() @@ -680,7 +664,6 @@ def test_getitem_fancy_boolean(self, float_frame): expected = f.reindex(index=f.index[boolvec], columns=["C", "D"]) tm.assert_frame_equal(result, expected) - 
@td.skip_array_manager_invalid_test # TODO(ArrayManager) rewrite not using .values def test_setitem_fancy_boolean(self, float_frame): # from 2d, set with booleans frame = float_frame.copy() @@ -739,7 +722,15 @@ def test_getitem_setitem_boolean_multi(self): expected.loc[[0, 2], [1]] = 5 tm.assert_frame_equal(df, expected) - def test_getitem_setitem_float_labels(self, using_array_manager): + def test_getitem_float_label_positional(self): + # GH 53338 + index = Index([1.5, 2]) + df = DataFrame(range(2), index=index) + result = df[1:2] + expected = DataFrame([1], index=[2.0]) + tm.assert_frame_equal(result, expected) + + def test_getitem_setitem_float_labels(self): index = Index([1.5, 2, 3, 4, 5]) df = DataFrame(np.random.default_rng(2).standard_normal((5, 5)), index=index) @@ -763,12 +754,6 @@ def test_getitem_setitem_float_labels(self, using_array_manager): expected = df.iloc[0:2] tm.assert_frame_equal(result, expected) - expected = df.iloc[0:2] - msg = r"The behavior of obj\[i:j\] with a float-dtype index" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df[1:2] - tm.assert_frame_equal(result, expected) - # #2727 index = Index([1.0, 2.5, 3.5, 4.5, 5.0]) df = DataFrame(np.random.default_rng(2).standard_normal((5, 5)), index=index) @@ -846,13 +831,8 @@ def test_setitem_single_column_mixed_datetime(self): tm.assert_series_equal(result, expected) # GH#16674 iNaT is treated as an integer when given by the user - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): df.loc["b", "timestamp"] = iNaT - assert not isna(df.loc["b", "timestamp"]) - assert df["timestamp"].dtype == np.object_ - assert df.loc["b", "timestamp"] == iNaT # allow this syntax (as of GH#3216) df.loc["c", "timestamp"] = np.nan @@ -864,35 +844,11 @@ def test_setitem_single_column_mixed_datetime(self): def test_setitem_mixed_datetime(self): # GH 9336 - expected = DataFrame( - { - "a": [0, 0, 0, 0, 13, 14], - "b": [ - datetime(2012, 1, 1), - 1, - "x", - "y", - datetime(2013, 1, 1), - datetime(2014, 1, 1), - ], - } - ) df = DataFrame(0, columns=list("ab"), index=range(6)) df["b"] = pd.NaT df.loc[0, "b"] = datetime(2012, 1, 1) - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): df.loc[1, "b"] = 1 - df.loc[[2, 3], "b"] = "x", "y" - A = np.array( - [ - [13, np.datetime64("2013-01-01T00:00:00")], - [14, np.datetime64("2014-01-01T00:00:00")], - ] - ) - df.loc[[4, 5], ["a", "b"]] = A - tm.assert_frame_equal(df, expected) def test_setitem_frame_float(self, float_frame): piece = float_frame.loc[float_frame.index[:2], ["A", "B"]] @@ -949,6 +905,11 @@ def test_setitem_frame_upcast(self): # needs upcasting df = DataFrame([[1, 2, "foo"], [3, 4, "bar"]], columns=["A", "B", "C"]) df2 = df.copy() + with pytest.raises(TypeError, match="Invalid value"): + df2.loc[:, ["A", "B"]] = df.loc[:, ["A", "B"]] + 0.5 + # Manually upcast so we can add .5 + df = df.astype({"A": "float64", "B": "float64"}) + df2 = df2.astype({"A": "float64", "B": "float64"}) df2.loc[:, ["A", "B"]] = df.loc[:, ["A", "B"]] + 0.5 expected = df.reindex(columns=["A", "B"]) expected += 0.5 @@ -1029,17 +990,17 @@ def test_single_element_ix_dont_upcast(self, float_frame): result = df.loc[0, "b"] assert is_integer(result) - expected = Series([666], [0], name="b") + expected = Series([666], index=range(1), name="b") result = df.loc[[0], "b"] 
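A sketch of the pandas 3.x enforcement these rewritten setitem tests rely on: assigning a scalar that does not fit the column dtype raises `TypeError` up front instead of warning and upcasting.

```python
import pandas as pd
import pytest

df = pd.DataFrame({"A": [1, 2, 3]})  # int64 column
with pytest.raises(TypeError, match="Invalid value"):
    df.loc[1, "A"] = "hello"  # formerly a FutureWarning plus upcast to object
```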
tm.assert_series_equal(result, expected) - def test_iloc_callable_tuple_return_value(self): - # GH53769 + def test_iloc_callable_tuple_return_value_raises(self): + # GH53769: Enforced pandas 3.0 df = DataFrame(np.arange(40).reshape(10, 4), index=range(0, 20, 2)) - msg = "callable with iloc" - with tm.assert_produces_warning(FutureWarning, match=msg): + msg = "Returning a tuple from" + with pytest.raises(ValueError, match=msg): df.iloc[lambda _: (0,)] - with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(ValueError, match=msg): df.iloc[lambda _: (0,)] = 1 def test_iloc_row(self): @@ -1065,7 +1026,7 @@ def test_iloc_row(self): expected = df.reindex(df.index[[1, 2, 4, 6]]) tm.assert_frame_equal(result, expected) - def test_iloc_row_slice_view(self, using_copy_on_write, warn_copy_on_write): + def test_iloc_row_slice_view(self): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), index=range(0, 20, 2) ) @@ -1078,13 +1039,7 @@ def test_iloc_row_slice_view(self, using_copy_on_write, warn_copy_on_write): assert np.shares_memory(df[2], subset[2]) exp_col = original[2].copy() - with tm.assert_cow_warning(warn_copy_on_write): - subset.loc[:, 2] = 0.0 - if not using_copy_on_write: - exp_col._values[4:8] = 0.0 - - # With the enforcement of GH#45333 in 2.0, this remains a view - assert np.shares_memory(df[2], subset[2]) + subset.loc[:, 2] = 0.0 tm.assert_series_equal(df[2], exp_col) def test_iloc_col(self): @@ -1110,35 +1065,20 @@ def test_iloc_col(self): expected = df.reindex(columns=df.columns[[1, 2, 4, 6]]) tm.assert_frame_equal(result, expected) - def test_iloc_col_slice_view( - self, using_array_manager, using_copy_on_write, warn_copy_on_write - ): + def test_iloc_col_slice_view(self): df = DataFrame( np.random.default_rng(2).standard_normal((4, 10)), columns=range(0, 20, 2) ) original = df.copy() subset = df.iloc[:, slice(4, 8)] - if not using_array_manager and not using_copy_on_write: - # verify slice is view - assert np.shares_memory(df[8]._values, subset[8]._values) - - with tm.assert_cow_warning(warn_copy_on_write): - subset.loc[:, 8] = 0.0 - - assert (df[8] == 0).all() - - # with the enforcement of GH#45333 in 2.0, this remains a view - assert np.shares_memory(df[8]._values, subset[8]._values) - else: - if using_copy_on_write: - # verify slice is view - assert np.shares_memory(df[8]._values, subset[8]._values) - subset[8] = 0.0 - # subset changed - assert (subset[8] == 0).all() - # but df itself did not change (setitem replaces full column) - tm.assert_frame_equal(df, original) + # verify slice is view + assert np.shares_memory(df[8]._values, subset[8]._values) + subset[8] = 0.0 + # subset changed + assert (subset[8] == 0).all() + # but df itself did not change (setitem replaces full column) + tm.assert_frame_equal(df, original) def test_loc_duplicates(self): # gh-17105 @@ -1243,7 +1183,7 @@ def test_getitem_boolean_indexing_mixed(self): tm.assert_frame_equal(df2, expected) df["foo"] = "test" - msg = "not supported between instances|unorderable types" + msg = "not supported between instances|unorderable types|Invalid comparison" with pytest.raises(TypeError, match=msg): df[df > 0.3] = 1 @@ -1252,7 +1192,7 @@ def test_type_error_multiindex(self): # See gh-12218 mi = MultiIndex.from_product([["x", "y"], [0, 1]], names=[None, "c"]) dg = DataFrame( - [[1, 1, 2, 2], [3, 3, 4, 4]], columns=mi, index=Index([0, 1], name="i") + [[1, 1, 2, 2], [3, 3, 4, 4]], columns=mi, index=Index(range(2), name="i") ) with pytest.raises(InvalidIndexError, match="slice"): 
dg[:, 0] @@ -1331,7 +1271,7 @@ def test_setting_mismatched_na_into_nullable_fails( r"timedelta64\[ns\] cannot be converted to (Floating|Integer)Dtype", r"datetime64\[ns\] cannot be converted to (Floating|Integer)Dtype", "'values' contains non-numeric NA", - r"Invalid value '.*' for dtype (U?Int|Float)\d{1,2}", + r"Invalid value '.*' for dtype '(U?Int|Float)\d{1,2}'", ] ) with pytest.raises(TypeError, match=msg): @@ -1387,27 +1327,22 @@ def test_loc_expand_empty_frame_keep_midx_names(self): tm.assert_frame_equal(df, expected) @pytest.mark.parametrize( - "val, idxr, warn", + "val, idxr", [ - ("x", "a", None), # TODO: this should warn as well - ("x", ["a"], None), # TODO: this should warn as well - (1, "a", None), # TODO: this should warn as well - (1, ["a"], FutureWarning), + ("x", "a"), + ("x", ["a"]), + (1, "a"), + (1, ["a"]), ], ) - def test_loc_setitem_rhs_frame(self, idxr, val, warn): + def test_loc_setitem_rhs_frame(self, idxr, val): # GH#47578 df = DataFrame({"a": [1, 2]}) - with tm.assert_produces_warning( - warn, match="Setting an item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): df.loc[:, idxr] = DataFrame({"a": [val, 11]}, index=[1, 2]) - expected = DataFrame({"a": [np.nan, val]}) - tm.assert_frame_equal(df, expected) - @td.skip_array_manager_invalid_test - def test_iloc_setitem_enlarge_no_warning(self, warn_copy_on_write): + def test_iloc_setitem_enlarge_no_warning(self): # GH#47381 df = DataFrame(columns=["a", "b"]) expected = df.copy() @@ -1516,7 +1451,7 @@ def test_iloc_ea_series_indexer(self): indexer = Series([0, 1], dtype="Int64") row_indexer = Series([1], dtype="Int64") result = df.iloc[row_indexer, indexer] - expected = DataFrame([[5, 6]], index=[1]) + expected = DataFrame([[5, 6]], index=range(1, 2)) tm.assert_frame_equal(result, expected) result = df.iloc[row_indexer.values, indexer.values] @@ -1534,7 +1469,7 @@ def test_iloc_ea_series_indexer_with_na(self): @pytest.mark.parametrize("indexer", [True, (True,)]) @pytest.mark.parametrize("dtype", [bool, "boolean"]) - def test_loc_bool_multiindex(self, dtype, indexer): + def test_loc_bool_multiindex(self, performance_warning, dtype, indexer): # GH#47687 midx = MultiIndex.from_arrays( [ @@ -1544,7 +1479,7 @@ def test_loc_bool_multiindex(self, dtype, indexer): names=["a", "b"], ) df = DataFrame({"c": [1, 2, 3, 4]}, index=midx) - with tm.maybe_produces_warning(PerformanceWarning, isinstance(indexer, tuple)): + with tm.maybe_produces_warning(performance_warning, isinstance(indexer, tuple)): result = df.loc[indexer] expected = DataFrame( {"c": [1, 2]}, index=Index([True, False], name="b", dtype=dtype) @@ -1580,6 +1515,16 @@ def test_setitem_value_coercing_dtypes(self, indexer, idx): expected = DataFrame([[1, np.nan], [2, np.nan], ["3", np.nan]], dtype=object) tm.assert_frame_equal(df, expected) + def test_big_endian_support_selecting_columns(self): + # GH#57457 + columns = ["a"] + data = [np.array([1, 2], dtype=">f8")] + df = DataFrame(dict(zip(columns, data))) + result = df[df.columns] + dfexp = DataFrame({"a": [1, 2]}, dtype=">f8") + expected = dfexp[dfexp.columns] + tm.assert_frame_equal(result, expected) + class TestDataFrameIndexingUInt64: def test_setitem(self): @@ -1603,21 +1548,12 @@ def test_setitem(self): # With NaN: because uint64 has no NaN element, # the column should be cast to object. 
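A sketch of the behavior `test_setting_mismatched_na_into_nullable_fails` above checks (assuming a recent pandas): NA markers from other dtype families do not coerce into a nullable integer column.

```python
import numpy as np
import pandas as pd
import pytest

ser = pd.Series([1, 2, 3], dtype="Int64")
with pytest.raises(TypeError):
    ser[0] = np.datetime64("NaT")  # a datetime NaT is not a valid Int64 NA
```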
df2 = df.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df2.iloc[1, 1] = pd.NaT df2.iloc[1, 2] = pd.NaT - result = df2["B"] - tm.assert_series_equal(notna(result), Series([True, False, True], name="B")) - tm.assert_series_equal( - df2.dtypes, - Series( - [np.dtype("uint64"), np.dtype("O"), np.dtype("O")], - index=["A", "B", "C"], - ), - ) -def test_object_casting_indexing_wraps_datetimelike(using_array_manager): +def test_object_casting_indexing_wraps_datetimelike(): # GH#31649, check the indexing methods all the way down the stack df = DataFrame( { @@ -1639,10 +1575,6 @@ def test_object_casting_indexing_wraps_datetimelike(using_array_manager): assert isinstance(ser.values[1], Timestamp) assert isinstance(ser.values[2], pd.Timedelta) - if using_array_manager: - # remainder of the test checking BlockManager internals - return - mgr = df._mgr mgr._rebuild_blknos_and_blklocs() arr = mgr.fast_xs(0).array @@ -1673,25 +1605,6 @@ def orig(self): orig = DataFrame({"cats": cats, "values": values}, index=idx) return orig - @pytest.fixture - def exp_single_row(self): - # The expected values if we change a single row - cats1 = Categorical(["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"]) - idx1 = Index(["h", "i", "j", "k", "l", "m", "n"]) - values1 = [1, 1, 2, 1, 1, 1, 1] - exp_single_row = DataFrame({"cats": cats1, "values": values1}, index=idx1) - return exp_single_row - - @pytest.fixture - def exp_multi_row(self): - # assign multiple rows (mixed values) (-> array) -> exp_multi_row - # changed multiple rows - cats2 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) - idx2 = Index(["h", "i", "j", "k", "l", "m", "n"]) - values2 = [1, 1, 2, 2, 1, 1, 1] - exp_multi_row = DataFrame({"cats": cats2, "values": values2}, index=idx2) - return exp_multi_row - @pytest.fixture def exp_parts_cats_col(self): # changed part of the cats column @@ -1712,21 +1625,25 @@ def exp_single_cats_value(self): ) return exp_single_cats_value - @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc]) - def test_loc_iloc_setitem_list_of_lists(self, orig, exp_multi_row, indexer): + def test_loc_iloc_setitem_list_of_lists(self, orig, indexer_li): # - assign multiple rows (mixed values) -> exp_multi_row df = orig.copy() key = slice(2, 4) - if indexer is tm.loc: + if indexer_li is tm.loc: key = slice("j", "k") - indexer(df)[key, :] = [["b", 2], ["b", 2]] + indexer_li(df)[key, :] = [["b", 2], ["b", 2]] + + cats2 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) + idx2 = Index(["h", "i", "j", "k", "l", "m", "n"]) + values2 = [1, 1, 2, 2, 1, 1, 1] + exp_multi_row = DataFrame({"cats": cats2, "values": values2}, index=idx2) tm.assert_frame_equal(df, exp_multi_row) df = orig.copy() with pytest.raises(TypeError, match=msg1): - indexer(df)[key, :] = [["c", 2], ["c", 2]] + indexer_li(df)[key, :] = [["c", 2], ["c", 2]] @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc, tm.at, tm.iat]) def test_loc_iloc_at_iat_setitem_single_value_in_categories( @@ -1747,55 +1664,54 @@ def test_loc_iloc_at_iat_setitem_single_value_in_categories( with pytest.raises(TypeError, match=msg1): indexer(df)[key] = "c" - @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc]) def test_loc_iloc_setitem_mask_single_value_in_categories( - self, orig, exp_single_cats_value, indexer + self, orig, exp_single_cats_value, indexer_li ): # mask with single True df = orig.copy() mask = df.index == "j" key = 0 - if indexer is tm.loc: 
+ if indexer_li is tm.loc: key = df.columns[key] - indexer(df)[mask, key] = "b" + indexer_li(df)[mask, key] = "b" tm.assert_frame_equal(df, exp_single_cats_value) - @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc]) - def test_loc_iloc_setitem_full_row_non_categorical_rhs( - self, orig, exp_single_row, indexer - ): + def test_loc_iloc_setitem_full_row_non_categorical_rhs(self, orig, indexer_li): # - assign a complete row (mixed values) -> exp_single_row df = orig.copy() key = 2 - if indexer is tm.loc: + if indexer_li is tm.loc: key = df.index[2] # not categorical dtype, but "b" _is_ among the categories for df["cat"] - indexer(df)[key, :] = ["b", 2] + indexer_li(df)[key, :] = ["b", 2] + cats1 = Categorical(["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"]) + idx1 = Index(["h", "i", "j", "k", "l", "m", "n"]) + values1 = [1, 1, 2, 1, 1, 1, 1] + exp_single_row = DataFrame({"cats": cats1, "values": values1}, index=idx1) tm.assert_frame_equal(df, exp_single_row) # "c" is not among the categories for df["cat"] with pytest.raises(TypeError, match=msg1): - indexer(df)[key, :] = ["c", 2] + indexer_li(df)[key, :] = ["c", 2] - @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc]) def test_loc_iloc_setitem_partial_col_categorical_rhs( - self, orig, exp_parts_cats_col, indexer + self, orig, exp_parts_cats_col, indexer_li ): # assign a part of a column with dtype == categorical -> # exp_parts_cats_col df = orig.copy() key = (slice(2, 4), 0) - if indexer is tm.loc: + if indexer_li is tm.loc: key = (slice("j", "k"), df.columns[0]) # same categories as we currently have in df["cats"] compat = Categorical(["b", "b"], categories=["a", "b"]) - indexer(df)[key] = compat + indexer_li(df)[key] = compat tm.assert_frame_equal(df, exp_parts_cats_col) # categories do not match df["cat"]'s, but "b" is among them @@ -1803,32 +1719,31 @@ def test_loc_iloc_setitem_partial_col_categorical_rhs( with pytest.raises(TypeError, match=msg2): # different categories but holdable values # -> not sure if this should fail or pass - indexer(df)[key] = semi_compat + indexer_li(df)[key] = semi_compat # categories do not match df["cat"]'s, and "c" is not among them incompat = Categorical(list("cc"), categories=list("abc")) with pytest.raises(TypeError, match=msg2): # different values - indexer(df)[key] = incompat + indexer_li(df)[key] = incompat - @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc]) def test_loc_iloc_setitem_non_categorical_rhs( - self, orig, exp_parts_cats_col, indexer + self, orig, exp_parts_cats_col, indexer_li ): # assign a part of a column with dtype != categorical -> exp_parts_cats_col df = orig.copy() key = (slice(2, 4), 0) - if indexer is tm.loc: + if indexer_li is tm.loc: key = (slice("j", "k"), df.columns[0]) # "b" is among the categories for df["cat"] - indexer(df)[key] = ["b", "b"] + indexer_li(df)[key] = ["b", "b"] tm.assert_frame_equal(df, exp_parts_cats_col) # "c" not part of the categories with pytest.raises(TypeError, match=msg1): - indexer(df)[key] = ["c", "c"] + indexer_li(df)[key] = ["c", "c"] @pytest.mark.parametrize("indexer", [tm.getitem, tm.loc, tm.iloc]) def test_getitem_preserve_object_index_with_dates(self, indexer): @@ -1939,13 +1854,11 @@ def test_adding_new_conditional_column() -> None: ("dtype", "infer_string"), [ (object, False), - ("string[pyarrow_numpy]", True), + (pd.StringDtype(na_value=np.nan), True), ], ) def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None: # https://github.com/pandas-dev/pandas/issues/56204 - 
pytest.importorskip("pyarrow") - df = DataFrame({"a": [1, 2], "b": [3, 4]}) with pd.option_context("future.infer_string", infer_string): df.loc[df["a"] == 1, "c"] = "1" @@ -1957,36 +1870,42 @@ def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None: def test_add_new_column_infer_string(): # GH#55366 - pytest.importorskip("pyarrow") df = DataFrame({"x": [1]}) with pd.option_context("future.infer_string", True): df.loc[df["x"] == 1, "y"] = "1" expected = DataFrame( - {"x": [1], "y": Series(["1"], dtype="string[pyarrow_numpy]")}, - columns=Index(["x", "y"], dtype=object), + {"x": [1], "y": Series(["1"], dtype=pd.StringDtype(na_value=np.nan))}, + columns=Index(["x", "y"], dtype="str"), ) tm.assert_frame_equal(df, expected) class TestSetitemValidation: # This is adapted from pandas/tests/arrays/masked/test_indexing.py - # but checks for warnings instead of errors. - def _check_setitem_invalid(self, df, invalid, indexer, warn): - msg = "Setting an item of incompatible dtype is deprecated" - msg = re.escape(msg) - + def _check_setitem_invalid(self, df, invalid, indexer): orig_df = df.copy() # iloc - with tm.assert_produces_warning(warn, match=msg): + with pytest.raises(TypeError, match="Invalid value"): df.iloc[indexer, 0] = invalid df = orig_df.copy() # loc - with tm.assert_produces_warning(warn, match=msg): + with pytest.raises(TypeError, match="Invalid value"): df.loc[indexer, "a"] = invalid df = orig_df.copy() + def _check_setitem_valid(self, df, value, indexer): + orig_df = df.copy() + + # iloc + df.iloc[indexer, 0] = value + df = orig_df.copy() + + # loc + df.loc[indexer, "a"] = value + df = orig_df.copy() + _invalid_scalars = [ 1 + 2j, "True", @@ -1996,7 +1915,7 @@ def _check_setitem_invalid(self, df, invalid, indexer, warn): np.datetime64("NaT"), np.timedelta64("NaT"), ] - _indexers = [0, [0], slice(0, 1), [True, False, False]] + _indexers = [0, [0], slice(0, 1), [True, False, False], slice(None, None, None)] @pytest.mark.parametrize( "invalid", _invalid_scalars + [1, 1.0, np.int64(1), np.float64(1)] @@ -2004,20 +1923,19 @@ def _check_setitem_invalid(self, df, invalid, indexer, warn): @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_bool(self, invalid, indexer): df = DataFrame({"a": [True, False, False]}, dtype="bool") - self._check_setitem_invalid(df, invalid, indexer, FutureWarning) + self._check_setitem_invalid(df, invalid, indexer) @pytest.mark.parametrize("invalid", _invalid_scalars + [True, 1.5, np.float64(1.5)]) @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_int(self, invalid, any_int_numpy_dtype, indexer): df = DataFrame({"a": [1, 2, 3]}, dtype=any_int_numpy_dtype) - if isna(invalid) and invalid is not pd.NaT: - warn = None + if isna(invalid) and invalid is not pd.NaT and not np.isnat(invalid): + self._check_setitem_valid(df, invalid, indexer) else: - warn = FutureWarning - self._check_setitem_invalid(df, invalid, indexer, warn) + self._check_setitem_invalid(df, invalid, indexer) @pytest.mark.parametrize("invalid", _invalid_scalars + [True]) @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_float(self, invalid, float_numpy_dtype, indexer): df = DataFrame({"a": [1, 2, None]}, dtype=float_numpy_dtype) - self._check_setitem_invalid(df, invalid, indexer, FutureWarning) + self._check_setitem_invalid(df, invalid, indexer) diff --git a/pandas/tests/frame/indexing/test_insert.py b/pandas/tests/frame/indexing/test_insert.py index 7e702bdc993bd..b530cb98ef46c 100644 
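A sketch of the opt-in inference the conditional-column tests above rely on (assumes a pandas version exposing the `future.infer_string` option): a newly assigned string column gets the NaN-backed string dtype rather than object.

```python
import pandas as pd

with pd.option_context("future.infer_string", True):
    df = pd.DataFrame({"x": [1]})
    df.loc[df["x"] == 1, "y"] = "1"
    assert df["y"].dtype == "str"  # NaN-variant StringDtype, not object
```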
--- a/pandas/tests/frame/indexing/test_insert.py +++ b/pandas/tests/frame/indexing/test_insert.py @@ -3,6 +3,7 @@ confused with tests with "insert" in their names that are really testing __setitem__. """ + import numpy as np import pytest @@ -71,28 +72,18 @@ def test_insert_with_columns_dups(self): ) tm.assert_frame_equal(df, exp) - def test_insert_item_cache(self, using_array_manager, using_copy_on_write): + def test_insert_item_cache(self, performance_warning): df = DataFrame(np.random.default_rng(2).standard_normal((4, 3))) ser = df[0] - - if using_array_manager: - expected_warning = None - else: - # with BlockManager warn about high fragmentation of single dtype - expected_warning = PerformanceWarning + expected_warning = PerformanceWarning if performance_warning else None with tm.assert_produces_warning(expected_warning): for n in range(100): df[n + 3] = df[1] * n - if using_copy_on_write: - ser.iloc[0] = 99 - assert df.iloc[0, 0] == df[0][0] - assert df.iloc[0, 0] != 99 - else: - ser.values[0] = 99 - assert df.iloc[0, 0] == df[0][0] - assert df.iloc[0, 0] == 99 + ser.iloc[0] = 99 + assert df.iloc[0, 0] == df[0][0] + assert df.iloc[0, 0] != 99 def test_insert_EA_no_warning(self): # PerformanceWarning about fragmented frame should not be raised when diff --git a/pandas/tests/frame/indexing/test_mask.py b/pandas/tests/frame/indexing/test_mask.py index 264e27c9c122e..ac6f0a1ac0f73 100644 --- a/pandas/tests/frame/indexing/test_mask.py +++ b/pandas/tests/frame/indexing/test_mask.py @@ -122,7 +122,7 @@ def test_mask_stringdtype(frame_or_series): def test_mask_where_dtype_timedelta(): # https://github.com/pandas-dev/pandas/issues/39548 - df = DataFrame([Timedelta(i, unit="d") for i in range(5)]) + df = DataFrame([Timedelta(i, unit="D") for i in range(5)]) expected = DataFrame(np.full(5, np.nan, dtype="timedelta64[ns]")) tm.assert_frame_equal(df.mask(df.notna()), expected) @@ -130,7 +130,7 @@ def test_mask_where_dtype_timedelta(): expected = DataFrame( [np.nan, np.nan, np.nan, Timedelta("3 day"), Timedelta("4 day")] ) - tm.assert_frame_equal(df.where(df > Timedelta(2, unit="d")), expected) + tm.assert_frame_equal(df.where(df > Timedelta(2, unit="D")), expected) def test_mask_return_dtype(): diff --git a/pandas/tests/frame/indexing/test_set_value.py b/pandas/tests/frame/indexing/test_set_value.py index ce771280bc264..124505f440e6c 100644 --- a/pandas/tests/frame/indexing/test_set_value.py +++ b/pandas/tests/frame/indexing/test_set_value.py @@ -1,4 +1,5 @@ import numpy as np +import pytest from pandas.core.dtypes.common import is_float_dtype @@ -6,7 +7,6 @@ DataFrame, isna, ) -import pandas._testing as tm class TestSetValue: @@ -28,7 +28,7 @@ def test_set_value_resize(self, float_frame, using_infer_string): res = float_frame.copy() res._set_value("foobar", "baz", "sam") if using_infer_string: - assert res["baz"].dtype == "string" + assert res["baz"].dtype == "str" else: assert res["baz"].dtype == np.object_ res = float_frame.copy() @@ -40,11 +40,8 @@ def test_set_value_resize(self, float_frame, using_infer_string): assert is_float_dtype(res["baz"]) assert isna(res["baz"].drop(["foobar"])).all() - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): res._set_value("foobar", "baz", "sam") - assert res.loc["foobar", "baz"] == "sam" def test_set_value_with_index_dtype_change(self): df_orig = DataFrame( diff --git a/pandas/tests/frame/indexing/test_setitem.py 
b/pandas/tests/frame/indexing/test_setitem.py index e802a56ecbc81..20dd7b0c4d3e7 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -3,8 +3,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas.core.dtypes.base import _registry as ea_registry from pandas.core.dtypes.common import is_object_dtype from pandas.core.dtypes.dtypes import ( @@ -43,7 +41,7 @@ class TestDataFrameSetItem: def test_setitem_str_subclass(self): # GH#37366 class mystring(str): - pass + __slots__ = () data = ["2020-10-22 01:21:00+00:00"] index = DatetimeIndex(data) @@ -146,14 +144,32 @@ def test_setitem_different_dtype(self): ) tm.assert_series_equal(result, expected) - def test_setitem_empty_columns(self): - # GH 13522 + def test_setitem_overwrite_index(self): + # GH 13522 - assign the index as a column and then overwrite the values + # -> should not affect the index df = DataFrame(index=["A", "B", "C"]) df["X"] = df.index df["X"] = ["x", "y", "z"] - exp = DataFrame(data={"X": ["x", "y", "z"]}, index=["A", "B", "C"]) + exp = DataFrame( + data={"X": ["x", "y", "z"]}, index=["A", "B", "C"], columns=["X"] + ) tm.assert_frame_equal(df, exp) + def test_setitem_empty_columns(self): + # Starting from an empty DataFrame and setting a column should result + # in a default string dtype for the columns' Index + # https://github.com/pandas-dev/pandas/issues/60338 + + df = DataFrame() + df["foo"] = [1, 2, 3] + expected = DataFrame({"foo": [1, 2, 3]}) + tm.assert_frame_equal(df, expected) + + df = DataFrame(columns=Index([])) + df["foo"] = [1, 2, 3] + expected = DataFrame({"foo": [1, 2, 3]}) + tm.assert_frame_equal(df, expected) + def test_setitem_dt64_index_empty_columns(self): rng = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s") df = DataFrame(index=np.arange(len(rng))) @@ -164,10 +180,10 @@ def test_setitem_dt64_index_empty_columns(self): def test_setitem_timestamp_empty_columns(self): # GH#19843 df = DataFrame(index=range(3)) - df["now"] = Timestamp("20130101", tz="UTC").as_unit("ns") + df["now"] = Timestamp("20130101", tz="UTC") expected = DataFrame( - [[Timestamp("20130101", tz="UTC")]] * 3, index=[0, 1, 2], columns=["now"] + [[Timestamp("20130101", tz="UTC")]] * 3, index=range(3), columns=["now"] ) tm.assert_frame_equal(df, expected) @@ -206,7 +222,7 @@ def test_setitem_period_preserves_dtype(self): result = DataFrame([]) result["a"] = data - expected = DataFrame({"a": data}) + expected = DataFrame({"a": data}, columns=["a"]) tm.assert_frame_equal(result, expected) @@ -326,7 +342,7 @@ def test_frame_setitem_existing_datetime64_col_other_units(self, unit): df["dates"] = vals assert (df["dates"].values == ex_vals).all() - def test_setitem_dt64tz(self, timezone_frame, using_copy_on_write): + def test_setitem_dt64tz(self, timezone_frame): df = timezone_frame idx = df["B"].rename("foo") @@ -342,15 +358,12 @@ def test_setitem_dt64tz(self, timezone_frame, using_copy_on_write): # assert that A & C are not sharing the same base (e.g. 
they # are copies) # Note: This does not hold with Copy on Write (because of lazy copying) - v1 = df._mgr.arrays[1] - v2 = df._mgr.arrays[2] + v1 = df._mgr.blocks[1].values + v2 = df._mgr.blocks[2].values tm.assert_extension_array_equal(v1, v2) v1base = v1._ndarray.base v2base = v2._ndarray.base - if not using_copy_on_write: - assert v1base is None or (id(v1base) != id(v2base)) - else: - assert id(v1base) == id(v2base) + assert id(v1base) == id(v2base) # with nan df2 = df.copy() @@ -675,7 +688,7 @@ def test_setitem_iloc_two_dimensional_generator(self): def test_setitem_dtypes_bytes_type_to_object(self): # GH 20734 index = Series(name="id", dtype="S24") - df = DataFrame(index=index) + df = DataFrame(index=index, columns=Index([], dtype="str")) df["a"] = Series(name="a", index=index, dtype=np.uint32) df["b"] = Series(name="b", index=index, dtype="S64") df["c"] = Series(name="c", index=index, dtype="S64") @@ -704,8 +717,6 @@ def test_setitem_ea_dtype_rhs_series(self): expected = DataFrame({"a": [1, 2]}, dtype="Int64") tm.assert_frame_equal(df, expected) - # TODO(ArrayManager) set column with 2d column array, see #44788 - @td.skip_array_manager_not_yet_implemented def test_setitem_npmatrix_2d(self): # GH#42376 # for use-case df["x"] = sparse.random((10, 10)).mean(axis=1) @@ -714,11 +725,14 @@ def test_setitem_npmatrix_2d(self): ) a = np.ones((10, 1)) - df = DataFrame(index=np.arange(10)) + df = DataFrame(index=np.arange(10), columns=Index([], dtype="str")) df["np-array"] = a # Instantiation of `np.matrix` gives PendingDeprecationWarning - with tm.assert_produces_warning(PendingDeprecationWarning): + with tm.assert_produces_warning( + PendingDeprecationWarning, + match="matrix subclass is not the recommended way to represent matrices", + ): df["np-matrix"] = np.matrix(a) tm.assert_frame_equal(df, expected) @@ -786,20 +800,18 @@ def test_loc_setitem_ea_dtype(self): df.iloc[:, 0] = Series([11], dtype="Int64") tm.assert_frame_equal(df, expected) - def test_setitem_object_inferring(self): + def test_setitem_index_object_dtype_not_inferring(self): # GH#56102 idx = Index([Timestamp("2019-12-31")], dtype=object) df = DataFrame({"a": [1]}) - with tm.assert_produces_warning(FutureWarning, match="infer"): - df.loc[:, "b"] = idx - with tm.assert_produces_warning(FutureWarning, match="infer"): - df["c"] = idx + df.loc[:, "b"] = idx + df["c"] = idx expected = DataFrame( { "a": [1], - "b": Series([Timestamp("2019-12-31")], dtype="datetime64[ns]"), - "c": Series([Timestamp("2019-12-31")], dtype="datetime64[ns]"), + "b": idx, + "c": idx, } ) tm.assert_frame_equal(df, expected) @@ -844,11 +856,12 @@ def test_setitem_object_array_of_tzaware_datetimes(self, idx, expected): # object array of datetimes with a tz df["B"] = idx.to_pydatetime() result = df["B"] + expected = expected.dt.as_unit("us") tm.assert_series_equal(result, expected) class TestDataFrameSetItemWithExpansion: - def test_setitem_listlike_views(self, using_copy_on_write, warn_copy_on_write): + def test_setitem_listlike_views(self): # GH#38148 df = DataFrame({"a": [1, 2, 3], "b": [4, 4, 6]}) @@ -859,13 +872,9 @@ def test_setitem_listlike_views(self, using_copy_on_write, warn_copy_on_write): df[["c", "d"]] = np.array([[0.1, 0.2], [0.3, 0.4], [0.4, 0.5]]) # edit in place the first column to check view semantics - with tm.assert_cow_warning(warn_copy_on_write): - df.iloc[0, 0] = 100 + df.iloc[0, 0] = 100 - if using_copy_on_write: - expected = Series([1, 2, 3], name="a") - else: - expected = Series([100, 2, 3], name="a") + expected = Series([1, 2, 3], 
name="a") tm.assert_series_equal(ser, expected) def test_setitem_string_column_numpy_dtype_raising(self): @@ -875,7 +884,7 @@ def test_setitem_string_column_numpy_dtype_raising(self): expected = DataFrame([[1, 2, 5], [3, 4, 6]], columns=[0, 1, "0 - Name"]) tm.assert_frame_equal(df, expected) - def test_setitem_empty_df_duplicate_columns(self, using_copy_on_write): + def test_setitem_empty_df_duplicate_columns(self): # GH#38521 df = DataFrame(columns=["a", "b", "b"], dtype="float64") df.loc[:, "a"] = list(range(2)) @@ -1004,13 +1013,12 @@ def test_setitem_slice_position(self): expected = DataFrame(arr) tm.assert_frame_equal(df, expected) - @pytest.mark.parametrize("indexer", [tm.setitem, tm.iloc]) @pytest.mark.parametrize("box", [Series, np.array, list, pd.array]) @pytest.mark.parametrize("n", [1, 2, 3]) - def test_setitem_slice_indexer_broadcasting_rhs(self, n, box, indexer): + def test_setitem_slice_indexer_broadcasting_rhs(self, n, box, indexer_si): # GH#40440 df = DataFrame([[1, 3, 5]] + [[2, 4, 6]] * n, columns=["a", "b", "c"]) - indexer(df)[1:] = box([10, 11, 12]) + indexer_si(df)[1:] = box([10, 11, 12]) expected = DataFrame([[1, 3, 5]] + [[10, 11, 12]] * n, columns=["a", "b", "c"]) tm.assert_frame_equal(df, expected) @@ -1023,15 +1031,14 @@ def test_setitem_list_indexer_broadcasting_rhs(self, n, box): expected = DataFrame([[1, 3, 5]] + [[10, 11, 12]] * n, columns=["a", "b", "c"]) tm.assert_frame_equal(df, expected) - @pytest.mark.parametrize("indexer", [tm.setitem, tm.iloc]) @pytest.mark.parametrize("box", [Series, np.array, list, pd.array]) @pytest.mark.parametrize("n", [1, 2, 3]) - def test_setitem_slice_broadcasting_rhs_mixed_dtypes(self, n, box, indexer): + def test_setitem_slice_broadcasting_rhs_mixed_dtypes(self, n, box, indexer_si): # GH#40440 df = DataFrame( [[1, 3, 5], ["x", "y", "z"]] + [[2, 4, 6]] * n, columns=["a", "b", "c"] ) - indexer(df)[1:] = box([10, 11, 12]) + indexer_si(df)[1:] = box([10, 11, 12]) expected = DataFrame( [[1, 3, 5]] + [[10, 11, 12]] * (n + 1), columns=["a", "b", "c"], @@ -1063,7 +1070,6 @@ def inc(x): class TestDataFrameSetItemBooleanMask: - @td.skip_array_manager_invalid_test # TODO(ArrayManager) rewrite not using .values @pytest.mark.parametrize( "mask_type", [lambda df: df > np.abs(df) / 2, lambda df: (df > np.abs(df) / 2).values], @@ -1110,13 +1116,12 @@ def test_setitem_loc_only_false_indexer_dtype_changed(self, box): df.loc[indexer, ["b"]] = 9 tm.assert_frame_equal(df, expected) - @pytest.mark.parametrize("indexer", [tm.setitem, tm.loc]) - def test_setitem_boolean_mask_aligning(self, indexer): + def test_setitem_boolean_mask_aligning(self, indexer_sl): # GH#39931 df = DataFrame({"a": [1, 4, 2, 3], "b": [5, 6, 7, 8]}) expected = df.copy() mask = df["a"] >= 3 - indexer(df)[mask] = indexer(df)[mask].sort_values("a") + indexer_sl(df)[mask] = indexer_sl(df)[mask].sort_values("a") tm.assert_frame_equal(df, expected) def test_setitem_mask_categorical(self): @@ -1208,9 +1213,7 @@ def test_setitem_always_copy(self, float_frame): assert notna(s[5:10]).all() @pytest.mark.parametrize("consolidate", [True, False]) - def test_setitem_partial_column_inplace( - self, consolidate, using_array_manager, using_copy_on_write - ): + def test_setitem_partial_column_inplace(self, consolidate): # This setting should be in-place, regardless of whether frame is # single-block or multi-block # GH#304 this used to be incorrectly not-inplace, in which case @@ -1220,25 +1223,17 @@ def test_setitem_partial_column_inplace( {"x": [1.1, 2.1, 3.1, 4.1], "y": [5.1, 6.1, 7.1, 
8.1]}, index=[0, 1, 2, 3] ) df.insert(2, "z", np.nan) - if not using_array_manager: - if consolidate: - df._consolidate_inplace() - assert len(df._mgr.blocks) == 1 - else: - assert len(df._mgr.blocks) == 2 - - zvals = df["z"]._values + if consolidate: + df._consolidate_inplace() + assert len(df._mgr.blocks) == 1 + else: + assert len(df._mgr.blocks) == 2 df.loc[2:, "z"] = 42 expected = Series([np.nan, np.nan, 42, 42], index=df.index, name="z") tm.assert_series_equal(df["z"], expected) - # check setting occurred in-place - if not using_copy_on_write: - tm.assert_numpy_array_equal(zvals, expected.values) - assert np.shares_memory(zvals, df["z"]._values) - def test_setitem_duplicate_columns_not_inplace(self): # GH#39510 cols = ["A", "B"] * 2 @@ -1254,7 +1249,7 @@ def test_setitem_duplicate_columns_not_inplace(self): @pytest.mark.parametrize( "value", [1, np.array([[1], [1]], dtype="int64"), [[1], [1]]] ) - def test_setitem_same_dtype_not_inplace(self, value, using_array_manager): + def test_setitem_same_dtype_not_inplace(self, value): # GH#39510 cols = ["A", "B"] df = DataFrame(0, index=[0, 1], columns=cols) @@ -1310,10 +1305,7 @@ def test_setitem_not_operating_inplace(self, value, set_value, indexer): df[indexer] = set_value tm.assert_frame_equal(view, expected) - @td.skip_array_manager_invalid_test - def test_setitem_column_update_inplace( - self, using_copy_on_write, warn_copy_on_write - ): + def test_setitem_column_update_inplace(self): # https://github.com/pandas-dev/pandas/issues/47172 labels = [f"c{i}" for i in range(10)] @@ -1323,12 +1315,8 @@ def test_setitem_column_update_inplace( with tm.raises_chained_assignment_error(): for label in df.columns: df[label][label] = 1 - if not using_copy_on_write: - # diagonal values all updated - assert np.all(values[np.arange(10), np.arange(10)] == 1) - else: - # original dataframe not updated - assert np.all(values[np.arange(10), np.arange(10)] == 0) + # original dataframe not updated + assert np.all(values[np.arange(10), np.arange(10)] == 0) def test_setitem_column_frame_as_category(self): # GH31581 @@ -1381,3 +1369,33 @@ def test_frame_setitem_empty_dataframe(self): index=dti[:0], ) tm.assert_frame_equal(df, expected) + + +def test_full_setter_loc_incompatible_dtype(): + # https://github.com/pandas-dev/pandas/issues/55791 + df = DataFrame({"a": [1, 2]}) + with pytest.raises(TypeError, match="Invalid value"): + df.loc[:, "a"] = True + + with pytest.raises(TypeError, match="Invalid value"): + df.loc[:, "a"] = {0: 3.5, 1: 4.5} + + df.loc[:, "a"] = {0: 3, 1: 4} + expected = DataFrame({"a": [3, 4]}) + tm.assert_frame_equal(df, expected) + + +def test_setitem_partial_row_multiple_columns(): + # https://github.com/pandas-dev/pandas/issues/56503 + df = DataFrame({"A": [1, 2, 3], "B": [4.0, 5, 6]}) + # should not warn + df.loc[df.index <= 1, ["F", "G"]] = (1, "abc") + expected = DataFrame( + { + "A": [1, 2, 3], + "B": [4.0, 5, 6], + "F": [1.0, 1, float("nan")], + "G": ["abc", "abc", float("nan")], + } + ) + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/indexing/test_take.py b/pandas/tests/frame/indexing/test_take.py index 8c17231440917..56a7bfd003083 100644 --- a/pandas/tests/frame/indexing/test_take.py +++ b/pandas/tests/frame/indexing/test_take.py @@ -4,14 +4,14 @@ class TestDataFrameTake: - def test_take_slices_deprecated(self, float_frame): + def test_take_slices_not_supported(self, float_frame): # GH#51539 df = float_frame slc = slice(0, 4, 1) - with tm.assert_produces_warning(FutureWarning): + with pytest.raises(TypeError, 
match="slice"): df.take(slc, axis=0) - with tm.assert_produces_warning(FutureWarning): + with pytest.raises(TypeError, match="slice"): df.take(slc, axis=1) def test_take(self, float_frame): diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index 3d36d0471f02f..d6570fcda2ee8 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -63,7 +63,10 @@ def _check_get(df, cond, check_dtypes=True): # check getting df = where_frame if df is float_string_frame: - msg = "'>' not supported between instances of 'str' and 'int'" + msg = ( + "'>' not supported between instances of 'str' and 'int'" + "|Invalid comparison" + ) with pytest.raises(TypeError, match=msg): df > 0 return @@ -96,7 +99,6 @@ def test_where_upcasting(self): tm.assert_series_equal(result, expected) - @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") def test_where_alignment(self, where_frame, float_string_frame): # aligning def _check_align(df, cond, other, check_dtypes=True): @@ -128,7 +130,10 @@ def _check_align(df, cond, other, check_dtypes=True): df = where_frame if df is float_string_frame: - msg = "'>' not supported between instances of 'str' and 'int'" + msg = ( + "'>' not supported between instances of 'str' and 'int'" + "|Invalid comparison" + ) with pytest.raises(TypeError, match=msg): df > 0 return @@ -171,13 +176,12 @@ def test_where_invalid(self): with pytest.raises(ValueError, match=msg): df.mask(0) - @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") def test_where_set(self, where_frame, float_string_frame, mixed_int_frame): # where inplace def _check_set(df, cond, check_dtypes=True): dfi = df.copy() - econd = cond.reindex_like(df).fillna(True).infer_objects(copy=False) + econd = cond.reindex_like(df).fillna(True).infer_objects() expected = dfi.mask(~econd) return_value = dfi.where(cond, np.nan, inplace=True) @@ -193,7 +197,10 @@ def _check_set(df, cond, check_dtypes=True): df = where_frame if df is float_string_frame: - msg = "'>' not supported between instances of 'str' and 'int'" + msg = ( + "'>' not supported between instances of 'str' and 'int'" + "|Invalid comparison" + ) with pytest.raises(TypeError, match=msg): df > 0 return @@ -250,7 +257,7 @@ def test_where_invalid_input_single(self, cond): df = DataFrame({"a": [1, 2, 3]}) msg = "Boolean array expected for the condition" - with pytest.raises(ValueError, match=msg): + with pytest.raises(TypeError, match=msg): df.where(cond) @pytest.mark.parametrize( @@ -272,7 +279,7 @@ def test_where_invalid_input_multiple(self, cond): df = DataFrame({"a": [1, 2, 3], "b": [2, 2, 2]}) msg = "Boolean array expected for the condition" - with pytest.raises(ValueError, match=msg): + with pytest.raises(TypeError, match=msg): df.where(cond) def test_where_dataframe_col_match(self): @@ -357,10 +364,9 @@ def test_where_bug_transposition(self): expected = a.copy() expected[~do_not_replace] = b + expected[[0, 1]] = expected[[0, 1]].astype("float64") - msg = "Downcasting behavior in Series and DataFrame methods 'where'" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = a.where(do_not_replace, b) + result = a.where(do_not_replace, b) tm.assert_frame_equal(result, expected) a = DataFrame({0: [4, 6], 1: [1, 0]}) @@ -369,9 +375,9 @@ def test_where_bug_transposition(self): expected = a.copy() expected[~do_not_replace] = b + expected[1] = expected[1].astype("float64") - with 
tm.assert_produces_warning(FutureWarning, match=msg): - result = a.where(do_not_replace, b) + result = a.where(do_not_replace, b) tm.assert_frame_equal(result, expected) def test_where_datetime(self): @@ -516,26 +522,15 @@ def test_where_axis_with_upcast(self): tm.assert_frame_equal(result, expected) result = df.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): - return_value = result.where(mask, ser, axis="index", inplace=True) - assert return_value is None - tm.assert_frame_equal(result, expected) + with pytest.raises(TypeError, match="Invalid value"): + result.where(mask, ser, axis="index", inplace=True) expected = DataFrame([[0, np.nan], [0, np.nan]]) result = df.where(mask, ser, axis="columns") tm.assert_frame_equal(result, expected) - expected = DataFrame( - { - 0: np.array([0, 0], dtype="int64"), - 1: np.array([np.nan, np.nan], dtype="float64"), - } - ) - result = df.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): - return_value = result.where(mask, ser, axis="columns", inplace=True) - assert return_value is None - tm.assert_frame_equal(result, expected) + with pytest.raises(TypeError, match="Invalid value"): + df.where(mask, ser, axis="columns", inplace=True) def test_where_axis_multiple_dtypes(self): # Multiple dtypes (=> multiple Blocks) @@ -587,15 +582,10 @@ def test_where_axis_multiple_dtypes(self): result = df.where(mask, d1, axis="index") tm.assert_frame_equal(result, expected) result = df.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): - return_value = result.where(mask, d1, inplace=True) - assert return_value is None - tm.assert_frame_equal(result, expected) - result = df.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): + result.where(mask, d1, inplace=True) + with pytest.raises(TypeError, match="Invalid value"): return_value = result.where(mask, d1, inplace=True, axis="index") - assert return_value is None - tm.assert_frame_equal(result, expected) d2 = df.copy().drop(1, axis=1) expected = df.copy() @@ -738,22 +728,12 @@ def test_where_interval_noop(self): def test_where_interval_fullop_downcast(self, frame_or_series): # GH#45768 obj = frame_or_series([pd.Interval(0, 0)] * 2) - other = frame_or_series([1.0, 2.0]) - - msg = "Downcasting behavior in Series and DataFrame methods 'where'" - with tm.assert_produces_warning(FutureWarning, match=msg): - res = obj.where(~obj.notna(), other) - - # since all entries are being changed, we will downcast result - # from object to ints (not floats) - tm.assert_equal(res, other.astype(np.int64)) + other = frame_or_series([1.0, 2.0], dtype=object) + res = obj.where(~obj.notna(), other) + tm.assert_equal(res, other) - # unlike where, Block.putmask does not downcast - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): obj.mask(obj.notna(), other, inplace=True) - tm.assert_equal(obj, other.astype(object)) @pytest.mark.parametrize( "dtype", @@ -761,15 +741,13 @@ def test_where_interval_fullop_downcast(self, frame_or_series): "timedelta64[ns]", "datetime64[ns]", "datetime64[ns, Asia/Tokyo]", - "Period[D]", ], ) def test_where_datetimelike_noop(self, dtype): # GH#45135, analogue to GH#44181 for Period don't raise on no-op # For td64/dt64/dt64tz we already don't raise, but also are # checking that we don't unnecessarily upcast to object. 
- with tm.assert_produces_warning(FutureWarning, match="is deprecated"): - ser = Series(np.arange(3) * 10**9, dtype=np.int64).view(dtype) + ser = Series(np.arange(3) * 10**9, dtype=np.int64).astype(dtype) df = ser.to_frame() mask = np.array([False, False, False]) @@ -786,20 +764,9 @@ def test_where_datetimelike_noop(self, dtype): res4 = df.mask(mask2, "foo") tm.assert_frame_equal(res4, df) - # opposite case where we are replacing *all* values -> we downcast - # from object dtype # GH#45768 - msg = "Downcasting behavior in Series and DataFrame methods 'where'" - with tm.assert_produces_warning(FutureWarning, match=msg): - res5 = df.where(mask2, 4) - expected = DataFrame(4, index=df.index, columns=df.columns) - tm.assert_frame_equal(res5, expected) - # unlike where, Block.putmask does not downcast - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): df.mask(~mask2, 4, inplace=True) - tm.assert_frame_equal(df, expected.astype(object)) def test_where_int_downcasting_deprecated(): @@ -953,11 +920,8 @@ def test_where_period_invalid_na(frame_or_series, as_cat, request): result = obj.mask(mask, tdnat) tm.assert_equal(result, expected) - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): obj.mask(mask, tdnat, inplace=True) - tm.assert_equal(obj, expected) def test_where_nullable_invalid_na(frame_or_series, any_numeric_ea_dtype): @@ -967,7 +931,7 @@ def test_where_nullable_invalid_na(frame_or_series, any_numeric_ea_dtype): mask = np.array([True, True, False], ndmin=obj.ndim).T - msg = r"Invalid value '.*' for dtype (U?Int|Float)\d{1,2}" + msg = r"Invalid value '.*' for dtype '(U?Int|Float)\d{1,2}'" for null in tm.NP_NAT_OBJECTS + [pd.NaT]: # NaT is an NA value that we should *not* cast to pd.NA dtype @@ -993,16 +957,9 @@ def test_where_downcast_to_td64(): mask = np.array([False, False, False]) td = pd.Timedelta(days=1) - - msg = "Downcasting behavior in Series and DataFrame methods 'where'" - with tm.assert_produces_warning(FutureWarning, match=msg): - res = ser.where(mask, td) expected = Series([td, td, td], dtype="m8[ns]") - tm.assert_series_equal(res, expected) - with pd.option_context("future.no_silent_downcasting", True): - with tm.assert_produces_warning(None, match=msg): - res2 = ser.where(mask, td) + res2 = ser.where(mask, td) expected2 = expected.astype(object) tm.assert_series_equal(res2, expected2) @@ -1038,13 +995,6 @@ def test_where_dt64_2d(): mask = np.asarray(df.isna()).copy() mask[:, 1] = True - # setting all of one column, none of the other - expected = DataFrame({"A": other[:, 0], "B": dta[:, 1]}) - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): - _check_where_equivalences(df, mask, other, expected) - # setting part of one column, none of the other mask[1, 0] = True expected = DataFrame( @@ -1053,9 +1003,7 @@ def test_where_dt64_2d(): "B": dta[:, 1], } ) - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): _check_where_equivalences(df, mask, other, expected) # setting nothing in either column @@ -1077,13 +1025,9 @@ def test_where_producing_ea_cond_for_np_dtype(): @pytest.mark.parametrize( "replacement", [0.001, True, "snake", None, datetime(2022, 5, 4)] ) -def test_where_int_overflow(replacement, using_infer_string, 
request): +def test_where_int_overflow(replacement): # GH 31687 df = DataFrame([[1.0, 2e25, "nine"], [np.nan, 0.1, None]]) - if using_infer_string and replacement not in (None, "snake"): - request.node.add_marker( - pytest.mark.xfail(reason="Can't set non-string into string column") - ) result = df.where(pd.notnull(df), replacement) expected = DataFrame([[1.0, 2e25, "nine"], [replacement, 0.1, replacement]]) diff --git a/pandas/tests/frame/indexing/test_xs.py b/pandas/tests/frame/indexing/test_xs.py index be809e3a17c8e..54733129b4d47 100644 --- a/pandas/tests/frame/indexing/test_xs.py +++ b/pandas/tests/frame/indexing/test_xs.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas.errors import SettingWithCopyError - from pandas import ( DataFrame, Index, @@ -36,10 +34,7 @@ def four_level_index_dataframe(): class TestXS: - def test_xs( - self, float_frame, datetime_frame, using_copy_on_write, warn_copy_on_write - ): - float_frame_orig = float_frame.copy() + def test_xs(self, float_frame): idx = float_frame.index[5] xs = float_frame.xs(idx) for item, value in xs.items(): @@ -48,6 +43,7 @@ def test_xs( else: assert value == float_frame[item][idx] + def test_xs_mixed(self): # mixed-type xs test_data = {"A": {"1": 1, "2": 2}, "B": {"1": "1", "2": "2", "3": "3"}} frame = DataFrame(test_data) @@ -56,11 +52,14 @@ def test_xs( assert xs["A"] == 1 assert xs["B"] == "1" + def test_xs_dt_error(self, datetime_frame): with pytest.raises( KeyError, match=re.escape("Timestamp('1999-12-31 00:00:00')") ): datetime_frame.xs(datetime_frame.index[0] - BDay()) + def test_xs_other(self, float_frame): + float_frame_orig = float_frame.copy() # xs get column series = float_frame.xs("A", axis=1) expected = float_frame["A"] @@ -68,18 +67,14 @@ def test_xs( # view is returned if possible series = float_frame.xs("A", axis=1) - with tm.assert_cow_warning(warn_copy_on_write): - series[:] = 5 - if using_copy_on_write: - # but with CoW the view shouldn't propagate mutations - tm.assert_series_equal(float_frame["A"], float_frame_orig["A"]) - assert not (expected == 5).all() - else: - assert (expected == 5).all() + series[:] = 5 + # The view shouldn't propagate mutations + tm.assert_series_equal(float_frame["A"], float_frame_orig["A"]) + assert not (expected == 5).all() def test_xs_corner(self): # pathological mixed-type reordering case - df = DataFrame(index=[0]) + df = DataFrame(index=[0], columns=Index([], dtype="str")) df["A"] = 1.0 df["B"] = "foo" df["C"] = 2.0 @@ -122,30 +117,16 @@ def test_xs_keep_level(self): result = df.xs((2008, "sat"), level=["year", "day"], drop_level=False) tm.assert_frame_equal(result, expected) - def test_xs_view( - self, using_array_manager, using_copy_on_write, warn_copy_on_write - ): + def test_xs_view(self): # in 0.14 this will return a view if possible a copy otherwise, but # this is numpy dependent dm = DataFrame(np.arange(20.0).reshape(4, 5), index=range(4), columns=range(5)) df_orig = dm.copy() - if using_copy_on_write: - with tm.raises_chained_assignment_error(): - dm.xs(2)[:] = 20 - tm.assert_frame_equal(dm, df_orig) - elif using_array_manager: - # INFO(ArrayManager) with ArrayManager getting a row as a view is - # not possible - msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame" - with pytest.raises(SettingWithCopyError, match=msg): - dm.xs(2)[:] = 20 - assert not (dm.xs(2) == 20).any() - else: - with tm.raises_chained_assignment_error(): - dm.xs(2)[:] = 20 - assert (dm.xs(2) == 20).all() + with tm.raises_chained_assignment_error(): + dm.xs(2)[:] 
= 20 + tm.assert_frame_equal(dm, df_orig) class TestXSWithMultiIndex: @@ -203,43 +184,22 @@ def test_xs_level_eq_2(self): result = df.xs("c", level=2) tm.assert_frame_equal(result, expected) - def test_xs_setting_with_copy_error( - self, - multiindex_dataframe_random_data, - using_copy_on_write, - warn_copy_on_write, - ): + def test_xs_setting_with_copy_error(self, multiindex_dataframe_random_data): # this is a copy in 0.14 df = multiindex_dataframe_random_data df_orig = df.copy() result = df.xs("two", level="second") - if using_copy_on_write or warn_copy_on_write: - result[:] = 10 - else: - # setting this will give a SettingWithCopyError - # as we are trying to write a view - msg = "A value is trying to be set on a copy of a slice from a DataFrame" - with pytest.raises(SettingWithCopyError, match=msg): - result[:] = 10 + result[:] = 10 tm.assert_frame_equal(df, df_orig) - def test_xs_setting_with_copy_error_multiple( - self, four_level_index_dataframe, using_copy_on_write, warn_copy_on_write - ): + def test_xs_setting_with_copy_error_multiple(self, four_level_index_dataframe): # this is a copy in 0.14 df = four_level_index_dataframe df_orig = df.copy() result = df.xs(("a", 4), level=["one", "four"]) - if using_copy_on_write or warn_copy_on_write: - result[:] = 10 - else: - # setting this will give a SettingWithCopyError - # as we are trying to write a view - msg = "A value is trying to be set on a copy of a slice from a DataFrame" - with pytest.raises(SettingWithCopyError, match=msg): - result[:] = 10 + result[:] = 10 tm.assert_frame_equal(df, df_orig) @pytest.mark.parametrize("key, level", [("one", "second"), (["one"], ["second"])]) @@ -400,40 +360,23 @@ def test_xs_droplevel_false(self): expected = DataFrame({"a": [1]}) tm.assert_frame_equal(result, expected) - def test_xs_droplevel_false_view( - self, using_array_manager, using_copy_on_write, warn_copy_on_write - ): + def test_xs_droplevel_false_view(self): # GH#37832 df = DataFrame([[1, 2, 3]], columns=Index(["a", "b", "c"])) result = df.xs("a", axis=1, drop_level=False) # check that result still views the same data as df assert np.shares_memory(result.iloc[:, 0]._values, df.iloc[:, 0]._values) - with tm.assert_cow_warning(warn_copy_on_write): - df.iloc[0, 0] = 2 - if using_copy_on_write: - # with copy on write the subset is never modified - expected = DataFrame({"a": [1]}) - else: - # modifying original df also modifies result when having a single block - expected = DataFrame({"a": [2]}) + df.iloc[0, 0] = 2 + # The subset is never modified + expected = DataFrame({"a": [1]}) tm.assert_frame_equal(result, expected) - # with mixed dataframe, modifying the parent doesn't modify result - # TODO the "split" path behaves differently here as with single block df = DataFrame([[1, 2.5, "a"]], columns=Index(["a", "b", "c"])) result = df.xs("a", axis=1, drop_level=False) df.iloc[0, 0] = 2 - if using_copy_on_write: - # with copy on write the subset is never modified - expected = DataFrame({"a": [1]}) - elif using_array_manager: - # Here the behavior is consistent - expected = DataFrame({"a": [2]}) - else: - # FIXME: iloc does not update the array inplace using - # "split" path - expected = DataFrame({"a": [1]}) + # The subset is never modified + expected = DataFrame({"a": [1]}) tm.assert_frame_equal(result, expected) def test_xs_list_indexer_droplevel_false(self): diff --git a/pandas/tests/frame/methods/test_align.py b/pandas/tests/frame/methods/test_align.py index 5a9c47866dae8..2677b7332a65a 100644 --- a/pandas/tests/frame/methods/test_align.py 
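A hedged sketch of the copy-on-write semantics the xs() tests above now assert (assumes CoW-enabled pandas, as on main; not part of the patch):

# Sketch: under copy-on-write, a cross-section returned by xs() is detached
# from its parent, so writing into it leaves the parent frame unchanged.
import numpy as np
import pandas as pd

dm = pd.DataFrame(np.arange(20.0).reshape(4, 5))
row = dm.xs(2)
row[:] = 20
assert not (dm.loc[2] == 20).any()  # parent DataFrame is unmodified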
+++ b/pandas/tests/frame/methods/test_align.py @@ -14,14 +14,6 @@ class TestDataFrameAlign: - def test_align_asfreq_method_raises(self): - df = DataFrame({"A": [1, np.nan, 2]}) - msg = "Invalid fill method" - msg2 = "The 'method', 'limit', and 'fill_axis' keywords" - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=msg2): - df.align(df.iloc[::-1], method="asfreq") - def test_frame_align_aware(self): idx1 = date_range("2001", periods=5, freq="h", tz="US/Eastern") idx2 = date_range("2001", periods=5, freq="2h", tz="US/Eastern") @@ -48,15 +40,12 @@ def test_frame_align_aware(self): assert new1.index.tz is timezone.utc assert new2.index.tz is timezone.utc - def test_align_float(self, float_frame, using_copy_on_write): + def test_align_float(self, float_frame): af, bf = float_frame.align(float_frame) assert af._mgr is not float_frame._mgr - af, bf = float_frame.align(float_frame, copy=False) - if not using_copy_on_write: - assert af._mgr is float_frame._mgr - else: - assert af._mgr is not float_frame._mgr + af, bf = float_frame.align(float_frame) + assert af._mgr is not float_frame._mgr # axis = 0 other = float_frame.iloc[:-5, :3] @@ -91,34 +80,6 @@ def test_align_float(self, float_frame, using_copy_on_write): af, bf = float_frame.align(other, join="inner", axis=1) tm.assert_index_equal(bf.columns, other.columns) - msg = ( - "The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align " - "are deprecated" - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - af, bf = float_frame.align(other, join="inner", axis=1, method="pad") - tm.assert_index_equal(bf.columns, other.columns) - - msg = ( - "The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align " - "are deprecated" - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - af, bf = float_frame.align( - other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=None - ) - tm.assert_index_equal(bf.index, Index([]).astype(bf.index.dtype)) - - msg = ( - "The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align " - "are deprecated" - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - af, bf = float_frame.align( - other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0 - ) - tm.assert_index_equal(bf.index, Index([]).astype(bf.index.dtype)) - # Try to align DataFrame to Series along bad axis msg = "No axis named 2 for object type DataFrame" with pytest.raises(ValueError, match=msg): @@ -134,16 +95,6 @@ def test_align_frame_with_series(self, float_frame): tm.assert_index_equal(right.index, float_frame.index) assert isinstance(right, Series) - msg = "The 'broadcast_axis' keyword in DataFrame.align is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - left, right = float_frame.align(s, broadcast_axis=1) - tm.assert_index_equal(left.index, float_frame.index) - expected = {c: s for c in float_frame.columns} - expected = DataFrame( - expected, index=float_frame.index, columns=float_frame.columns - ) - tm.assert_frame_equal(right, expected) - def test_align_series_condition(self): # see gh-9558 df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -155,54 +106,19 @@ def test_align_series_condition(self): expected = DataFrame({"a": [0, 2, 0], "b": [0, 5, 0]}) tm.assert_frame_equal(result, expected) - def test_align_int(self, int_frame): - # test other non-float types - other = DataFrame(index=range(5), columns=["A", "B", "C"]) - - msg = ( - "The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align " - 
"are deprecated" - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - af, bf = int_frame.align(other, join="inner", axis=1, method="pad") - tm.assert_index_equal(bf.columns, other.columns) - - def test_align_mixed_type(self, float_string_frame): - msg = ( - "The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align " - "are deprecated" - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - af, bf = float_string_frame.align( - float_string_frame, join="inner", axis=1, method="pad" - ) - tm.assert_index_equal(bf.columns, float_string_frame.columns) - def test_align_mixed_float(self, mixed_float_frame): # mixed floats/ints other = DataFrame(index=range(5), columns=["A", "B", "C"]) - - msg = ( - "The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align " - "are deprecated" + af, bf = mixed_float_frame.align( + other.iloc[:, 0], join="inner", axis=1, fill_value=0 ) - with tm.assert_produces_warning(FutureWarning, match=msg): - af, bf = mixed_float_frame.align( - other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0 - ) tm.assert_index_equal(bf.index, Index([])) def test_align_mixed_int(self, mixed_int_frame): other = DataFrame(index=range(5), columns=["A", "B", "C"]) - - msg = ( - "The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align " - "are deprecated" + af, bf = mixed_int_frame.align( + other.iloc[:, 0], join="inner", axis=1, fill_value=0 ) - with tm.assert_produces_warning(FutureWarning, match=msg): - af, bf = mixed_int_frame.align( - other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0 - ) tm.assert_index_equal(bf.index, Index([])) @pytest.mark.parametrize( @@ -392,68 +308,6 @@ def test_missing_axis_specification_exception(self): with pytest.raises(ValueError, match=r"axis=0 or 1"): df.align(series) - @pytest.mark.parametrize("method", ["pad", "bfill"]) - @pytest.mark.parametrize("axis", [0, 1, None]) - @pytest.mark.parametrize("fill_axis", [0, 1]) - @pytest.mark.parametrize("how", ["inner", "outer", "left", "right"]) - @pytest.mark.parametrize( - "left_slice", - [ - [slice(4), slice(10)], - [slice(0), slice(0)], - ], - ) - @pytest.mark.parametrize( - "right_slice", - [ - [slice(2, None), slice(6, None)], - [slice(0), slice(0)], - ], - ) - @pytest.mark.parametrize("limit", [1, None]) - def test_align_fill_method( - self, how, method, axis, fill_axis, float_frame, left_slice, right_slice, limit - ): - frame = float_frame - left = frame.iloc[left_slice[0], left_slice[1]] - right = frame.iloc[right_slice[0], right_slice[1]] - - msg = ( - "The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align " - "are deprecated" - ) - - with tm.assert_produces_warning(FutureWarning, match=msg): - aa, ab = left.align( - right, - axis=axis, - join=how, - method=method, - limit=limit, - fill_axis=fill_axis, - ) - - join_index, join_columns = None, None - - ea, eb = left, right - if axis is None or axis == 0: - join_index = left.index.join(right.index, how=how) - ea = ea.reindex(index=join_index) - eb = eb.reindex(index=join_index) - - if axis is None or axis == 1: - join_columns = left.columns.join(right.columns, how=how) - ea = ea.reindex(columns=join_columns) - eb = eb.reindex(columns=join_columns) - - msg = "DataFrame.fillna with 'method' is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - ea = ea.fillna(axis=fill_axis, method=method, limit=limit) - eb = eb.fillna(axis=fill_axis, method=method, limit=limit) - - tm.assert_frame_equal(aa, ea) - tm.assert_frame_equal(ab, eb) - def 
test_align_series_check_copy(self): # GH# df = DataFrame({0: [1, 2]}) diff --git a/pandas/tests/frame/methods/test_asfreq.py b/pandas/tests/frame/methods/test_asfreq.py index ef72ca1ac86b9..1c3c41e2e0299 100644 --- a/pandas/tests/frame/methods/test_asfreq.py +++ b/pandas/tests/frame/methods/test_asfreq.py @@ -8,6 +8,7 @@ from pandas import ( DataFrame, DatetimeIndex, + PeriodIndex, Series, date_range, period_range, @@ -19,10 +20,6 @@ class TestAsFreq: - @pytest.fixture(params=["s", "ms", "us", "ns"]) - def unit(self, request): - return request.param - def test_asfreq2(self, frame_or_series): ts = frame_or_series( [0.0, 1.0, 2.0], @@ -239,25 +236,61 @@ def test_asfreq_2ME(self, freq, freq_half): "freq, freq_depr", [ ("2ME", "2M"), + ("2ME", "2m"), ("2QE", "2Q"), ("2QE-SEP", "2Q-SEP"), ("1BQE", "1BQ"), ("2BQE-SEP", "2BQ-SEP"), - ("1YE", "1Y"), + ("2BQE-SEP", "2bq-sep"), + ("1YE", "1y"), ("2YE-MAR", "2Y-MAR"), - ("1YE", "1A"), - ("2YE-MAR", "2A-MAR"), - ("2BYE-MAR", "2BA-MAR"), ], ) - def test_asfreq_frequency_M_Q_Y_A_deprecated(self, freq, freq_depr): - # GH#9586, #55978 - depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed " - f"in a future version, please use '{freq[1:]}' instead." + def test_asfreq_frequency_M_Q_Y_raises(self, freq, freq_depr): + msg = f"Invalid frequency: {freq_depr}" index = date_range("1/1/2000", periods=4, freq=f"{freq[1:]}") df = DataFrame({"s": Series([0.0, 1.0, 2.0, 3.0], index=index)}) - expected = df.asfreq(freq=freq) - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - result = df.asfreq(freq=freq_depr) - tm.assert_frame_equal(result, expected) + with pytest.raises(ValueError, match=msg): + df.asfreq(freq=freq_depr) + + @pytest.mark.parametrize( + "freq, error_msg", + [ + ( + "2MS", + "Invalid frequency: 2MS", + ), + ( + offsets.MonthBegin(), + r"\<MonthBegin\> is not supported as period frequency", + ), + ( + offsets.DateOffset(months=2), + r"\<DateOffset: months=2\> is not supported as period frequency", + ), + ], + ) + def test_asfreq_unsupported_freq(self, freq, error_msg): + # https://github.com/pandas-dev/pandas/issues/56718 + index = PeriodIndex(["2020-01-01", "2021-01-01"], freq="M") + df = DataFrame({"a": Series([0, 1], index=index)}) + + with pytest.raises(ValueError, match=error_msg): + df.asfreq(freq=freq) + + @pytest.mark.parametrize( + "freq, freq_depr", + [ + ("2YE", "2A"), + ("2BYE-MAR", "2BA-MAR"), + ], + ) + def test_asfreq_frequency_A_BA_raises(self, freq, freq_depr): + msg = f"Invalid frequency: {freq_depr}" + + index = date_range("1/1/2000", periods=4, freq=freq) + df = DataFrame({"s": Series([0.0, 1.0, 2.0, 3.0], index=index)}) + + with pytest.raises(ValueError, match=msg): + df.asfreq(freq=freq_depr) diff --git a/pandas/tests/frame/methods/test_asof.py b/pandas/tests/frame/methods/test_asof.py index 4a8adf89b3aef..c510ef78d03aa 100644 --- a/pandas/tests/frame/methods/test_asof.py +++ b/pandas/tests/frame/methods/test_asof.py @@ -36,18 +36,18 @@ def test_basic(self, date_range_frame): dates = date_range("1/1/1990", periods=N * 3, freq="25s") result = df.asof(dates) - assert result.notna().all(1).all() + assert result.notna().all(axis=1).all() lb = df.index[14] ub = df.index[30] dates = list(dates) result = df.asof(dates) - assert result.notna().all(1).all() + assert result.notna().all(axis=1).all() mask = (result.index >= lb) & (result.index < ub) rs = result[mask] - assert (rs == 14).all(1).all() + assert (rs == 14).all(axis=1).all() def test_subset(self, date_range_frame): N = 10 @@ -163,19 +163,6 @@ def test_time_zone_aware_index(self, stamp,
expected): result = df.asof(stamp) tm.assert_series_equal(result, expected) - def test_is_copy(self, date_range_frame): - # GH-27357, GH-30784: ensure the result of asof is an actual copy and - # doesn't track the parent dataframe / doesn't give SettingWithCopy warnings - df = date_range_frame.astype({"A": "float"}) - N = 50 - df.loc[df.index[15:30], "A"] = np.nan - dates = date_range("1/1/1990", periods=N * 3, freq="25s") - - result = df.asof(dates) - - with tm.assert_produces_warning(None): - result["C"] = 1 - def test_asof_periodindex_mismatched_freq(self): N = 50 rng = period_range("1/1/1990", periods=N, freq="h") diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 5a1e3cd786f84..eb1ee4e7b2970 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td import pandas as pd @@ -122,11 +124,11 @@ def test_astype_with_exclude_string(self, float_frame): def test_astype_with_view_float(self, float_frame): # this is the only real reason to do it this way tf = np.round(float_frame).astype(np.int32) - tf.astype(np.float32, copy=False) + tf.astype(np.float32) # TODO(wesm): verification? tf = float_frame.astype(np.float64) - tf.astype(np.int64, copy=False) + tf.astype(np.int64) def test_astype_with_view_mixed_float(self, mixed_float_frame): tf = mixed_float_frame.reindex(columns=["A", "B", "C"]) @@ -134,9 +136,8 @@ def test_astype_with_view_mixed_float(self, mixed_float_frame): tf.astype(np.int64) tf.astype(np.float32) - @pytest.mark.parametrize("dtype", [np.int32, np.int64]) @pytest.mark.parametrize("val", [np.nan, np.inf]) - def test_astype_cast_nan_inf_int(self, val, dtype): + def test_astype_cast_nan_inf_int(self, val, any_int_numpy_dtype): # see GH#14265 # # Check NaN and inf --> raise error when converting to int. 
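The non-finite-to-integer cast checked here reduces to the following hedged sketch (the match string assumes pandas' IntCastingNaNError message, "Cannot convert non-finite values (NA or inf) to integer"):

# Sketch: casting NaN or inf to any integer dtype raises (GH#14265);
# IntCastingNaNError is a ValueError subclass, so this match works.
import numpy as np
import pandas as pd
import pytest

df = pd.DataFrame([np.nan])
with pytest.raises(ValueError, match="Cannot convert non-finite values"):
    df.astype(np.int64)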
@@ -144,13 +145,13 @@ def test_astype_cast_nan_inf_int(self, val, dtype): df = DataFrame([val]) with pytest.raises(ValueError, match=msg): - df.astype(dtype) + df.astype(any_int_numpy_dtype) def test_astype_str(self): # see GH#9757 a = Series(date_range("2010-01-04", periods=5)) b = Series(date_range("3/6/2012 00:00", periods=5, tz="US/Eastern")) - c = Series([Timedelta(x, unit="d") for x in range(5)]) + c = Series([Timedelta(x, unit="D") for x in range(5)]) d = Series(range(5)) e = Series([0.0, 0.2, 0.4, 0.6, 0.8]) @@ -167,21 +168,21 @@ def test_astype_str(self): "d": list(map(str, d._values)), "e": list(map(str, e._values)), }, - dtype="object", + dtype="str", ) tm.assert_frame_equal(result, expected) - def test_astype_str_float(self): + def test_astype_str_float(self, using_infer_string): # see GH#11302 result = DataFrame([np.nan]).astype(str) - expected = DataFrame(["nan"], dtype="object") + expected = DataFrame([np.nan if using_infer_string else "nan"], dtype="str") tm.assert_frame_equal(result, expected) result = DataFrame([1.12345678901234567890]).astype(str) val = "1.1234567890123457" - expected = DataFrame([val], dtype="object") + expected = DataFrame([val], dtype="str") tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype_class", [dict, Series]) @@ -200,7 +201,7 @@ def test_astype_dict_like(self, dtype_class): expected = DataFrame( { "a": a, - "b": Series(["0", "1", "2", "3", "4"], dtype="object"), + "b": Series(["0", "1", "2", "3", "4"], dtype="str"), "c": c, "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float32"), } @@ -261,9 +262,9 @@ def test_astype_duplicate_col(self): a2 = Series([0, 1, 2, 3, 4], name="a") df = concat([a1, b, a2], axis=1) - result = df.astype(str) + result = df.astype("str") a1_str = Series(["1", "2", "3", "4", "5"], dtype="str", name="a") - b_str = Series(["0.1", "0.2", "0.4", "0.6", "0.8"], dtype=str, name="b") + b_str = Series(["0.1", "0.2", "0.4", "0.6", "0.8"], dtype="str", name="b") a2_str = Series(["0", "1", "2", "3", "4"], dtype="str", name="a") expected = concat([a1_str, b_str, a2_str], axis=1) tm.assert_frame_equal(result, expected) @@ -283,7 +284,7 @@ def test_astype_duplicate_col_series_arg(self): result = df.astype(dtypes) expected = DataFrame( { - 0: Series(vals[:, 0].astype(str), dtype=object), + 0: Series(vals[:, 0].astype(str), dtype="str"), 1: vals[:, 1], 2: pd.array(vals[:, 2], dtype="Float64"), 3: vals[:, 3], @@ -323,9 +324,9 @@ def test_astype_categoricaldtype_class_raises(self, cls): with pytest.raises(TypeError, match=xpr): df["A"].astype(cls) - @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"]) - def test_astype_extension_dtypes(self, dtype): + def test_astype_extension_dtypes(self, any_int_ea_dtype): # GH#22578 + dtype = any_int_ea_dtype df = DataFrame([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], columns=["a", "b"]) expected1 = DataFrame( @@ -348,9 +349,9 @@ def test_astype_extension_dtypes(self, dtype): tm.assert_frame_equal(df.astype(dtype), expected1) tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1) - @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"]) - def test_astype_extension_dtypes_1d(self, dtype): + def test_astype_extension_dtypes_1d(self, any_int_ea_dtype): # GH#22578 + dtype = any_int_ea_dtype df = DataFrame({"a": [1.0, 2.0, 3.0]}) expected1 = DataFrame({"a": pd.array([1, 2, 3], dtype=dtype)}) @@ -433,14 +434,13 @@ def test_astype_from_datetimelike_to_object(self, dtype, unit): else: assert result.iloc[0, 0] == Timedelta(1, unit=unit) - 
@pytest.mark.parametrize("arr_dtype", [np.int64, np.float64]) @pytest.mark.parametrize("dtype", ["M8", "m8"]) @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) - def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit): + def test_astype_to_datetimelike_unit(self, any_real_numpy_dtype, dtype, unit): # tests all units from numeric origination # GH#19223 / GH#12425 dtype = f"{dtype}[{unit}]" - arr = np.array([[1, 2, 3]], dtype=arr_dtype) + arr = np.array([[1, 2, 3]], dtype=any_real_numpy_dtype) df = DataFrame(arr) result = df.astype(dtype) expected = DataFrame(arr.astype(dtype)) @@ -498,11 +498,10 @@ def test_astype_to_datetime_unit(self, unit): assert exp_dta.dtype == dtype tm.assert_extension_array_equal(res_dta, exp_dta) - @pytest.mark.parametrize("unit", ["ns"]) - def test_astype_to_timedelta_unit_ns(self, unit): + def test_astype_to_timedelta_unit_ns(self): # preserver the timedelta conversion # GH#19223 - dtype = f"m8[{unit}]" + dtype = "m8[ns]" arr = np.array([[1, 2, 3]], dtype=dtype) df = DataFrame(arr) result = df.astype(dtype) @@ -556,27 +555,11 @@ def test_astype_to_incorrect_datetimelike(self, unit): other = f"m8[{unit}]" df = DataFrame(np.array([[1, 2, 3]], dtype=dtype)) - msg = "|".join( - [ - # BlockManager path - rf"Cannot cast DatetimeArray to dtype timedelta64\[{unit}\]", - # ArrayManager path - "cannot astype a datetimelike from " - rf"\[datetime64\[ns\]\] to \[timedelta64\[{unit}\]\]", - ] - ) + msg = rf"Cannot cast DatetimeArray to dtype timedelta64\[{unit}\]" with pytest.raises(TypeError, match=msg): df.astype(other) - msg = "|".join( - [ - # BlockManager path - rf"Cannot cast TimedeltaArray to dtype datetime64\[{unit}\]", - # ArrayManager path - "cannot astype a timedelta from " - rf"\[timedelta64\[ns\]\] to \[datetime64\[{unit}\]\]", - ] - ) + msg = rf"Cannot cast TimedeltaArray to dtype datetime64\[{unit}\]" df = DataFrame(np.array([[1, 2, 3]], dtype=other)) with pytest.raises(TypeError, match=msg): df.astype(dtype) @@ -664,9 +647,10 @@ def test_astype_dt64tz(self, timezone_frame): # dt64tz->dt64 deprecated timezone_frame.astype("datetime64[ns]") - def test_astype_dt64tz_to_str(self, timezone_frame): + def test_astype_dt64tz_to_str(self, timezone_frame, using_infer_string): # str formatting result = timezone_frame.astype(str) + na_value = np.nan if using_infer_string else "NaT" expected = DataFrame( [ [ @@ -674,7 +658,7 @@ def test_astype_dt64tz_to_str(self, timezone_frame): "2013-01-01 00:00:00-05:00", "2013-01-01 00:00:00+01:00", ], - ["2013-01-02", "NaT", "NaT"], + ["2013-01-02", na_value, na_value], [ "2013-01-03", "2013-01-03 00:00:00-05:00", @@ -682,7 +666,7 @@ def test_astype_dt64tz_to_str(self, timezone_frame): ], ], columns=timezone_frame.columns, - dtype="object", + dtype="str", ) tm.assert_frame_equal(result, expected) @@ -734,8 +718,12 @@ def test_astype_ignores_errors_for_extension_dtypes(self, data, dtype, errors): df.astype(float, errors=errors) def test_astype_tz_conversion(self): - # GH 35973 - val = {"tz": date_range("2020-08-30", freq="d", periods=2, tz="Europe/London")} + # GH 35973, GH#58998 + msg = "'d' is deprecated and will be removed in a future version." 
+ with tm.assert_produces_warning(FutureWarning, match=msg): + val = { + "tz": date_range("2020-08-30", freq="d", periods=2, tz="Europe/London") + } df = DataFrame(val) result = df.astype({"tz": "datetime64[ns, Europe/Berlin]"}) @@ -746,7 +734,7 @@ def test_astype_tz_conversion(self): @pytest.mark.parametrize("tz", ["UTC", "Europe/Berlin"]) def test_astype_tz_object_conversion(self, tz): # GH 35973 - val = {"tz": date_range("2020-08-30", freq="d", periods=2, tz="Europe/London")} + val = {"tz": date_range("2020-08-30", freq="D", periods=2, tz="Europe/London")} expected = DataFrame(val) # convert expected to object dtype from other tz str (independently tested) @@ -757,6 +745,7 @@ def test_astype_tz_object_conversion(self, tz): result = result.astype({"tz": "datetime64[ns, Europe/London]"}) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) GH#60639") def test_astype_dt64_to_string( self, frame_or_series, tz_naive_fixture, using_infer_string ): @@ -870,7 +859,7 @@ class IntegerArrayNoCopy(pd.core.arrays.IntegerArray): # GH 42501 def copy(self): - assert False + raise NotImplementedError class Int16DtypeNoCopy(pd.Int16Dtype): @@ -884,7 +873,7 @@ def construct_array_type(cls): def test_frame_astype_no_copy(): # GH 42501 df = DataFrame({"a": [1, 4, None, 5], "b": [6, 7, 8, 9]}, dtype=object) - result = df.astype({"a": Int16DtypeNoCopy()}, copy=False) + result = df.astype({"a": Int16DtypeNoCopy()}) assert result.a.dtype == pd.Int16Dtype() assert np.shares_memory(df.b.values, result.b.values) @@ -895,7 +884,7 @@ def test_astype_copies(dtype): # GH#50984 pytest.importorskip("pyarrow") df = DataFrame({"a": [1, 2, 3]}, dtype=dtype) - result = df.astype("int64[pyarrow]", copy=True) + result = df.astype("int64[pyarrow]") df.iloc[0, 0] = 100 expected = DataFrame({"a": [1, 2, 3]}, dtype="int64[pyarrow]") tm.assert_frame_equal(result, expected) @@ -907,5 +896,14 @@ def test_astype_to_string_not_modifying_input(string_storage, val): df = DataFrame({"a": ["a", "b", val]}) expected = df.copy() with option_context("mode.string_storage", string_storage): - df.astype("string", copy=False) + df.astype("string") + tm.assert_frame_equal(df, expected) + + +@pytest.mark.parametrize("val", [None, 1, 1.5, np.nan, NaT]) +def test_astype_to_string_dtype_not_modifying_input(any_string_dtype, val): + # GH#51073 - variant of the above test with explicit dtype instances + df = DataFrame({"a": ["a", "b", val]}) + expected = df.copy() + df.astype(any_string_dtype) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/methods/test_at_time.py b/pandas/tests/frame/methods/test_at_time.py index 4c1434bd66aff..b69db80dee446 100644 --- a/pandas/tests/frame/methods/test_at_time.py +++ b/pandas/tests/frame/methods/test_at_time.py @@ -1,8 +1,11 @@ -from datetime import time +from datetime import ( + time, + timezone, +) +import zoneinfo import numpy as np import pytest -import pytz from pandas._libs.tslibs import timezones @@ -65,7 +68,7 @@ def test_at_time_nonexistent(self, frame_or_series): assert len(rs) == 0 @pytest.mark.parametrize( - "hour", ["1:00", "1:00AM", time(1), time(1, tzinfo=pytz.UTC)] + "hour", ["1:00", "1:00AM", time(1), time(1, tzinfo=timezone.utc)] ) def test_at_time_errors(self, hour): # GH#24043 @@ -83,7 +86,7 @@ def test_at_time_tz(self): # GH#24043 dti = date_range("2018", periods=3, freq="h", tz="US/Pacific") df = DataFrame(list(range(len(dti))), index=dti) - result = df.at_time(time(4, tzinfo=pytz.timezone("US/Eastern"))) + result = 
df.at_time(time(4, tzinfo=zoneinfo.ZoneInfo("US/Eastern"))) expected = df.iloc[1:2] tm.assert_frame_equal(result, expected) @@ -95,10 +98,9 @@ def test_at_time_raises(self, frame_or_series): with pytest.raises(TypeError, match=msg): # index is not a DatetimeIndex obj.at_time("00:00") - @pytest.mark.parametrize("axis", ["index", "columns", 0, 1]) def test_at_time_axis(self, axis): # issue 8839 - rng = date_range("1/1/2000", "1/5/2000", freq="5min") + rng = date_range("1/1/2000", "1/2/2000", freq="5min") ts = DataFrame(np.random.default_rng(2).standard_normal((len(rng), len(rng)))) ts.index, ts.columns = rng, rng diff --git a/pandas/tests/frame/methods/test_clip.py b/pandas/tests/frame/methods/test_clip.py index f783a388d7517..62a97271d208b 100644 --- a/pandas/tests/frame/methods/test_clip.py +++ b/pandas/tests/frame/methods/test_clip.py @@ -155,13 +155,13 @@ def test_clip_with_na_args(self, float_frame): # GH#19992 and adjusted in GH#40420 df = DataFrame({"col_0": [1, 2, 3], "col_1": [4, 5, 6], "col_2": [7, 8, 9]}) - msg = "Downcasting behavior in Series and DataFrame methods 'where'" - # TODO: avoid this warning here? seems like we should never be upcasting - # in the first place? - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.clip(lower=[4, 5, np.nan], axis=0) + result = df.clip(lower=[4, 5, np.nan], axis=0) expected = DataFrame( - {"col_0": [4, 5, 3], "col_1": [4, 5, 6], "col_2": [7, 8, 9]} + { + "col_0": Series([4, 5, 3], dtype="float"), + "col_1": [4, 5, 6], + "col_2": [7, 8, 9], + } ) tm.assert_frame_equal(result, expected) @@ -175,9 +175,10 @@ def test_clip_with_na_args(self, float_frame): data = {"col_0": [9, -3, 0, -1, 5], "col_1": [-2, -7, 6, 8, -5]} df = DataFrame(data) t = Series([2, -4, np.nan, 6, 3]) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.clip(lower=t, axis=0) - expected = DataFrame({"col_0": [9, -3, 0, 6, 5], "col_1": [2, -4, 6, 8, 3]}) + result = df.clip(lower=t, axis=0) + expected = DataFrame( + {"col_0": [9, -3, 0, 6, 5], "col_1": [2, -4, 6, 8, 3]}, dtype="float" + ) tm.assert_frame_equal(result, expected) def test_clip_int_data_with_float_bound(self): diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 8aeab5dacd8b4..1e594043510ea 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -30,7 +30,7 @@ def test_combine_first_mixed(self): combined = f.combine_first(g) tm.assert_frame_equal(combined, exp) - def test_combine_first(self, float_frame, using_infer_string): + def test_combine_first(self, float_frame): # disjoint head, tail = float_frame[:5], float_frame[5:] @@ -76,9 +76,7 @@ def test_combine_first(self, float_frame, using_infer_string): tm.assert_series_equal(combined["A"].reindex(g.index), g["A"]) # corner cases - warning = FutureWarning if using_infer_string else None - with tm.assert_produces_warning(warning, match="empty entries"): - comb = float_frame.combine_first(DataFrame()) + comb = float_frame.combine_first(DataFrame()) tm.assert_frame_equal(comb, float_frame) comb = DataFrame().combine_first(float_frame) @@ -199,7 +197,7 @@ def test_combine_first_align_nan(self): # GH 7509 (not fixed) dfa = DataFrame([[pd.Timestamp("2011-01-01"), 2]], columns=["a", "b"]) dfb = DataFrame([[4], [5]], columns=["b"]) - assert dfa["a"].dtype == "datetime64[ns]" + assert dfa["a"].dtype == "datetime64[s]" assert dfa["b"].dtype == "int64" res = dfa.combine_first(dfb) @@ -208,7 +206,7 @@ 
def test_combine_first_align_nan(self): columns=["a", "b"], ) tm.assert_frame_equal(res, exp) - assert res["a"].dtype == "datetime64[ns]" + assert res["a"].dtype == "datetime64[s]" # TODO: this must be int64 assert res["b"].dtype == "int64" @@ -226,13 +224,13 @@ def test_combine_first_timezone(self, unit): df1 = DataFrame( columns=["UTCdatetime", "abc"], data=data1, - index=pd.date_range("20140627", periods=1), + index=pd.date_range("20140627", periods=1, unit=unit), ) data2 = pd.to_datetime("20121212 12:12").tz_localize("UTC").as_unit(unit) df2 = DataFrame( columns=["UTCdatetime", "xyz"], data=data2, - index=pd.date_range("20140628", periods=1), + index=pd.date_range("20140628", periods=1, unit=unit), ) res = df2[["UTCdatetime"]].combine_first(df1) exp = DataFrame( @@ -244,7 +242,7 @@ def test_combine_first_timezone(self, unit): "abc": [pd.Timestamp("2010-01-01 01:01:00", tz="UTC"), pd.NaT], }, columns=["UTCdatetime", "abc"], - index=pd.date_range("20140627", periods=2, freq="D"), + index=pd.date_range("20140627", periods=2, freq="D", unit=unit), dtype=f"datetime64[{unit}, UTC]", ) assert res["UTCdatetime"].dtype == f"datetime64[{unit}, UTC]" @@ -288,18 +286,17 @@ def test_combine_first_timezone3(self, unit): exp = DataFrame({"DATE": exp_dts}, index=[1, 2, 3, 4, 5, 7]) tm.assert_frame_equal(res, exp) - # FIXME: parametrizing over unit breaks on non-nano - def test_combine_first_timezone4(self): + def test_combine_first_timezone4(self, unit): # different tz - dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="US/Eastern") + dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="US/Eastern", unit=unit) df1 = DataFrame({"DATE": dts1}) - dts2 = pd.date_range("2015-01-03", "2015-01-05") + dts2 = pd.date_range("2015-01-03", "2015-01-05", unit=unit) df2 = DataFrame({"DATE": dts2}) # if df1 doesn't have NaN, keep its dtype res = df1.combine_first(df2) tm.assert_frame_equal(res, df1) - assert res["DATE"].dtype == "datetime64[ns, US/Eastern]" + assert res["DATE"].dtype == f"datetime64[{unit}, US/Eastern]" def test_combine_first_timezone5(self, unit): dts1 = pd.date_range("2015-01-01", "2015-01-02", tz="US/Eastern", unit=unit) @@ -383,7 +380,7 @@ def test_combine_first_with_asymmetric_other(self, val): df2 = DataFrame({"isBool": [True]}) res = df1.combine_first(df2) - exp = DataFrame({"isBool": [True], "isNum": [val]}) + exp = DataFrame({"isNum": [val], "isBool": [True]}) tm.assert_frame_equal(res, exp) @@ -420,7 +417,11 @@ def test_combine_first_timestamp_bug(scalar1, scalar2, nulls_fixture): common_dtype = find_common_type([frame.dtypes["b"], other.dtypes["b"]]) - if is_dtype_equal(common_dtype, "object") or frame.dtypes["b"] == other.dtypes["b"]: + if ( + is_dtype_equal(common_dtype, "object") + or frame.dtypes["b"] == other.dtypes["b"] + or frame.dtypes["b"].kind == frame.dtypes["b"].kind == "M" + ): val = scalar1 else: val = na_value @@ -554,3 +555,13 @@ def test_combine_first_empty_columns(): result = left.combine_first(right) expected = DataFrame(columns=["a", "b", "c"]) tm.assert_frame_equal(result, expected) + + +def test_combine_first_preserve_column_order(): + # GH#60427 + df1 = DataFrame({"B": [1, 2, 3], "A": [4, None, 6]}) + df2 = DataFrame({"A": [5]}, index=[1]) + + result = df1.combine_first(df2) + expected = DataFrame({"B": [1, 2, 3], "A": [4.0, 5.0, 6.0]}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_compare.py b/pandas/tests/frame/methods/test_compare.py index a4d0a7068a3a6..2ffc3f933e246 100644 --- 
a/pandas/tests/frame/methods/test_compare.py +++ b/pandas/tests/frame/methods/test_compare.py @@ -21,7 +21,7 @@ def test_compare_axis(align_axis): result = df.compare(df2, align_axis=align_axis) if align_axis in (1, "columns"): - indices = pd.Index([0, 2]) + indices = pd.RangeIndex(0, 4, 2) columns = pd.MultiIndex.from_product([["col1", "col3"], ["self", "other"]]) expected = pd.DataFrame( [["a", "c", np.nan, np.nan], [np.nan, np.nan, 3.0, 4.0]], @@ -29,7 +29,7 @@ def test_compare_axis(align_axis): columns=columns, ) else: - indices = pd.MultiIndex.from_product([[0, 2], ["self", "other"]]) + indices = pd.MultiIndex.from_product([range(0, 4, 2), ["self", "other"]]) columns = pd.Index(["col1", "col3"]) expected = pd.DataFrame( [["a", np.nan], ["c", np.nan], [np.nan, 3.0], [np.nan, 4.0]], @@ -60,7 +60,7 @@ def test_compare_various_formats(keep_shape, keep_equal): result = df.compare(df2, keep_shape=keep_shape, keep_equal=keep_equal) if keep_shape: - indices = pd.Index([0, 1, 2]) + indices = pd.RangeIndex(3) columns = pd.MultiIndex.from_product( [["col1", "col2", "col3"], ["self", "other"]] ) @@ -85,7 +85,7 @@ def test_compare_various_formats(keep_shape, keep_equal): columns=columns, ) else: - indices = pd.Index([0, 2]) + indices = pd.RangeIndex(0, 4, 2) columns = pd.MultiIndex.from_product([["col1", "col3"], ["self", "other"]]) expected = pd.DataFrame( [["a", "c", 1.0, 1.0], ["c", "c", 3.0, 4.0]], index=indices, columns=columns @@ -168,25 +168,25 @@ def test_compare_multi_index(align_axis): tm.assert_frame_equal(result, expected) -def test_compare_unaligned_objects(): - # test DataFrames with different indices +def test_compare_different_indices(): msg = ( r"Can only compare identically-labeled \(both index and columns\) DataFrame " "objects" ) + df1 = pd.DataFrame([1, 2, 3], index=["a", "b", "c"]) + df2 = pd.DataFrame([1, 2, 3], index=["a", "b", "d"]) with pytest.raises(ValueError, match=msg): - df1 = pd.DataFrame([1, 2, 3], index=["a", "b", "c"]) - df2 = pd.DataFrame([1, 2, 3], index=["a", "b", "d"]) df1.compare(df2) - # test DataFrames with different shapes + +def test_compare_different_shapes(): msg = ( r"Can only compare identically-labeled \(both index and columns\) DataFrame " "objects" ) + df1 = pd.DataFrame(np.ones((3, 3))) + df2 = pd.DataFrame(np.zeros((2, 1))) with pytest.raises(ValueError, match=msg): - df1 = pd.DataFrame(np.ones((3, 3))) - df2 = pd.DataFrame(np.zeros((2, 1))) df1.compare(df2) @@ -203,6 +203,7 @@ def test_compare_result_names(): }, ) result = df1.compare(df2, result_names=("left", "right")) + result.index = pd.Index([0, 2]) expected = pd.DataFrame( { ("col1", "left"): {0: "a", 2: np.nan}, diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index 521d2cb14ac6a..ab847e2f8e81e 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd import pandas._testing as tm @@ -11,13 +13,9 @@ class TestConvertDtypes: @pytest.mark.parametrize( "convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")] ) - def test_convert_dtypes( - self, convert_integer, expected, string_storage, using_infer_string - ): + def test_convert_dtypes(self, convert_integer, expected, string_storage): # Specific types are tested in tests/series/test_dtypes.py # Just check that it works for DataFrame here - if using_infer_string: - string_storage = 
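# Aside: the expected indices in the test_compare.py hunks above switch to
# RangeIndex because pandas 3.x preserves RangeIndex through DataFrame.compare
# instead of materializing an integer Index. A minimal sketch, assuming the
# 3.x behavior these updated expectations assert:
import pandas as pd

df = pd.DataFrame({"col1": ["a", "b", "c", "d"], "col2": [1.0, 2.0, 3.0, 4.0]})
df2 = df.copy()
df2.loc[0, "col1"] = "x"
df2.loc[2, "col2"] = 99.0
print(df.compare(df2).index)  # RangeIndex(start=0, stop=4, step=2), not Index([0, 2])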
"pyarrow_numpy" df = pd.DataFrame( { "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")), @@ -39,6 +37,19 @@ def test_convert_empty(self): empty_df = pd.DataFrame() tm.assert_frame_equal(empty_df, empty_df.convert_dtypes()) + @td.skip_if_no("pyarrow") + def test_convert_empty_categorical_to_pyarrow(self): + # GH#59934 + df = pd.DataFrame( + { + "A": pd.Categorical([None] * 5), + "B": pd.Categorical([None] * 5, categories=["B1", "B2"]), + } + ) + converted = df.convert_dtypes(dtype_backend="pyarrow") + expected = df + tm.assert_frame_equal(converted, expected) + def test_convert_dtypes_retain_column_names(self): # GH#41435 df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) @@ -200,3 +211,18 @@ def test_convert_dtypes_from_arrow(self): result = df.convert_dtypes() expected = df.astype({"a": "string[python]"}) tm.assert_frame_equal(result, expected) + + def test_convert_dtype_pyarrow_timezone_preserve(self): + # GH 60237 + pytest.importorskip("pyarrow") + df = pd.DataFrame( + { + "timestamps": pd.Series( + pd.to_datetime(range(5), utc=True, unit="h"), + dtype="timestamp[ns, tz=UTC][pyarrow]", + ) + } + ) + result = df.convert_dtypes(dtype_backend="pyarrow") + expected = df.copy() + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_copy.py b/pandas/tests/frame/methods/test_copy.py index e7901ed363106..5b72a84320c52 100644 --- a/pandas/tests/frame/methods/test_copy.py +++ b/pandas/tests/frame/methods/test_copy.py @@ -1,10 +1,7 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas import DataFrame -import pandas._testing as tm class TestCopy: @@ -18,25 +15,6 @@ def test_copy_index_name_checking(self, float_frame, attr): getattr(cp, attr).name = "foo" assert getattr(float_frame, attr).name is None - @td.skip_copy_on_write_invalid_test - def test_copy_cache(self): - # GH#31784 _item_cache not cleared on copy causes incorrect reads after updates - df = DataFrame({"a": [1]}) - - df["x"] = [0] - df["a"] - - df.copy() - - df["a"].values[0] = -1 - - tm.assert_frame_equal(df, DataFrame({"a": [-1], "x": [0]})) - - df["y"] = [0] - - assert df["a"].values[0] == -1 - tm.assert_frame_equal(df, DataFrame({"a": [-1], "x": [0], "y": [0]})) - def test_copy(self, float_frame, float_string_frame): cop = float_frame.copy() cop["E"] = cop["A"] @@ -46,7 +24,6 @@ def test_copy(self, float_frame, float_string_frame): copy = float_string_frame.copy() assert copy._mgr is not float_string_frame._mgr - @td.skip_array_manager_invalid_test def test_copy_consolidates(self): # GH#42477 df = DataFrame( diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 04a08c8b9bc52..304638a3a7dcf 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -207,27 +207,19 @@ def test_corr_nullable_integer(self, nullable_column, other_column, method): expected = DataFrame(np.ones((2, 2)), columns=["a", "b"], index=["a", "b"]) tm.assert_frame_equal(result, expected) - def test_corr_item_cache(self, using_copy_on_write, warn_copy_on_write): + def test_corr_item_cache(self): # Check that corr does not lead to incorrect entries in item_cache df = DataFrame({"A": range(10)}) df["B"] = range(10)[::-1] ser = df["A"] # populate item_cache - assert len(df._mgr.arrays) == 2 # i.e. 
2 blocks + assert len(df._mgr.blocks) == 2 _ = df.corr(numeric_only=True) - if using_copy_on_write: - ser.iloc[0] = 99 - assert df.loc[0, "A"] == 0 - else: - # Check that the corr didn't break link between ser and df - ser.values[0] = 99 - assert df.loc[0, "A"] == 99 - if not warn_copy_on_write: - assert df["A"] is ser - assert df.values[0, 0] == 99 + ser.iloc[0] = 99 + assert df.loc[0, "A"] == 0 @pytest.mark.parametrize("length", [2, 20, 200, 2000]) def test_corr_for_constant_columns(self, length): @@ -293,7 +285,7 @@ def test_corrwith(self, datetime_frame, dtype): b = datetime_frame.add(noise, axis=0) # make sure order does not matter - b = b.reindex(columns=b.columns[::-1], index=b.index[::-1][10:]) + b = b.reindex(columns=b.columns[::-1], index=b.index[::-1][len(a) // 2 :]) del b["B"] colcorr = a.corrwith(b, axis=0) @@ -309,7 +301,7 @@ def test_corrwith(self, datetime_frame, dtype): dropped = a.corrwith(b, axis=1, drop=True) assert a.index[-1] not in dropped.index - # non time-series data + def test_corrwith_non_timeseries_data(self): index = ["a", "b", "c", "d", "e"] columns = ["one", "two", "three", "four"] df1 = DataFrame( @@ -339,9 +331,8 @@ def test_corrwith_with_objects(self, using_infer_string): df2["obj"] = "bar" if using_infer_string: - import pyarrow as pa - - with pytest.raises(pa.lib.ArrowNotImplementedError, match="has no kernel"): + msg = "Cannot perform reduction 'mean' with string dtype" + with pytest.raises(TypeError, match=msg): df1.corrwith(df2) else: with pytest.raises(TypeError, match="Could not convert"): @@ -363,8 +354,8 @@ def test_corrwith_series(self, datetime_frame): tm.assert_series_equal(result, expected) def test_corrwith_matches_corrcoef(self): - df1 = DataFrame(np.arange(10000), columns=["a"]) - df2 = DataFrame(np.arange(10000) ** 2, columns=["a"]) + df1 = DataFrame(np.arange(100), columns=["a"]) + df2 = DataFrame(np.arange(100) ** 2, columns=["a"]) c1 = df1.corrwith(df2)["a"] c2 = np.corrcoef(df1["a"], df2["a"])[0][1] @@ -469,3 +460,50 @@ def test_corrwith_spearman_with_tied_data(self): result = df_bool.corrwith(ser_bool) expected = Series([0.57735, 0.57735], index=["A", "B"]) tm.assert_series_equal(result, expected) + + def test_corrwith_min_periods_method(self): + # GH#9490 + pytest.importorskip("scipy") + df1 = DataFrame( + { + "A": [1, np.nan, 7, 8], + "B": [False, True, True, False], + "C": [10, 4, 9, 3], + } + ) + df2 = df1[["B", "C"]] + result = (df1 + 1).corrwith(df2.B, method="spearman", min_periods=2) + expected = Series([0.0, 1.0, 0.0], index=["A", "B", "C"]) + tm.assert_series_equal(result, expected) + + def test_corrwith_min_periods_boolean(self): + # GH#9490 + df_bool = DataFrame( + {"A": [True, True, False, False], "B": [True, False, False, True]} + ) + ser_bool = Series([True, True, False, True]) + result = df_bool.corrwith(ser_bool, min_periods=3) + expected = Series([0.57735, 0.57735], index=["A", "B"]) + tm.assert_series_equal(result, expected) + + def test_corr_within_bounds(self): + df1 = DataFrame({"x": [0, 1], "y": [1.35951, 1.3595100000000007]}) + result1 = df1.corr().max().max() + expected1 = 1.0 + tm.assert_equal(result1, expected1) + + rng = np.random.default_rng(seed=42) + df2 = DataFrame(rng.random((100, 4))) + corr_matrix = df2.corr() + assert corr_matrix.min().min() >= -1.0 + assert corr_matrix.max().max() <= 1.0 + + def test_cov_with_missing_values(self): + df = DataFrame({"A": [1, 2, None, 4], "B": [2, 4, None, 9]}) + expected = DataFrame( + {"A": [2.333333, 5.500000], "B": [5.5, 13.0]}, index=["A", "B"] + ) + result1 = 
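# Aside: the rewritten item-cache assertions above reflect Copy-on-Write, the
# only mode in pandas 3.x: a column extracted from a frame no longer writes
# back into its parent. A minimal sketch, assuming pandas 3.x:
import pandas as pd

df = pd.DataFrame({"A": range(10)})
ser = df["A"]
ser.iloc[0] = 99
print(df.loc[0, "A"])  # 0 under CoW; the old write-back behavior gave 99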
df.cov() + result2 = df.dropna().cov() + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, expected) diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py index 5beb09940acf3..50656ca85e90a 100644 --- a/pandas/tests/frame/methods/test_describe.py +++ b/pandas/tests/frame/methods/test_describe.py @@ -318,9 +318,7 @@ def test_describe_tz_values2(self): "max", "std", ] - expected = pd.concat([s1_, s2_], axis=1, keys=["s1", "s2"]).reindex( - idx, copy=False - ) + expected = pd.concat([s1_, s2_], axis=1, keys=["s1", "s2"]).reindex(idx) result = df.describe(include="all") tm.assert_frame_equal(result, expected) @@ -415,3 +413,44 @@ def test_describe_exclude_pa_dtype(self): dtype=pd.ArrowDtype(pa.float64()), ) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("percentiles", [None, [], [0.2]]) + def test_refine_percentiles(self, percentiles): + """ + Test that the percentiles are returned correctly depending on the `percentiles` + argument. + - The default behavior is to return the 25th, 50th, and 75 percentiles + - If `percentiles` is an empty list, no percentiles are returned + - If `percentiles` is a non-empty list, only those percentiles are returned + """ + # GH#60550 + df = DataFrame({"a": np.arange(0, 10, 1)}) + + result = df.describe(percentiles=percentiles) + + if percentiles is None: + percentiles = [0.25, 0.5, 0.75] + + expected = DataFrame( + [ + len(df.a), + df.a.mean(), + df.a.std(), + df.a.min(), + *[df.a.quantile(p) for p in percentiles], + df.a.max(), + ], + index=pd.Index( + [ + "count", + "mean", + "std", + "min", + *[f"{p:.0%}" for p in percentiles], + "max", + ] + ), + columns=["a"], + ) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_dot.py b/pandas/tests/frame/methods/test_dot.py index 3e01f67c8794b..b365ceb2ab61c 100644 --- a/pandas/tests/frame/methods/test_dot.py +++ b/pandas/tests/frame/methods/test_dot.py @@ -153,3 +153,19 @@ def test_arrow_dtype(dtype, exp_dtype): expected = DataFrame([[1, 2], [3, 4], [5, 6]], dtype=exp_dtype) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "dtype,exp_dtype", + [("Float32", "Float64"), ("Int16", "Int32"), ("float[pyarrow]", "double[pyarrow]")], +) +def test_arrow_dtype_series(dtype, exp_dtype): + pytest.importorskip("pyarrow") + + cols = ["a", "b"] + series_a = Series([1, 2], index=cols, dtype="int32") + df_b = DataFrame([[1, 0], [0, 1]], index=cols, dtype=dtype) + result = series_a.dot(df_b) + expected = Series([1, 2], dtype=exp_dtype) + + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_drop.py b/pandas/tests/frame/methods/test_drop.py index 06cd51b43a0aa..d9668ce46c943 100644 --- a/pandas/tests/frame/methods/test_drop.py +++ b/pandas/tests/frame/methods/test_drop.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas.errors import PerformanceWarning - import pandas as pd from pandas import ( DataFrame, @@ -167,7 +165,7 @@ def test_drop(self): assert return_value is None tm.assert_frame_equal(df, expected) - def test_drop_multiindex_not_lexsorted(self): + def test_drop_multiindex_not_lexsorted(self, performance_warning): # GH#11640 # define the lexsorted version @@ -188,7 +186,7 @@ def test_drop_multiindex_not_lexsorted(self): assert not not_lexsorted_df.columns._is_lexsorted() expected = lexsorted_df.drop("a", axis=1).astype(float) - with tm.assert_produces_warning(PerformanceWarning): + with 
tm.assert_produces_warning(performance_warning): result = not_lexsorted_df.drop("a", axis=1) tm.assert_frame_equal(result, expected) @@ -232,15 +230,13 @@ def test_drop_api_equivalence(self): with pytest.raises(ValueError, match=msg): df.drop(axis=1) - data = [[1, 2, 3], [1, 2, 3]] - @pytest.mark.parametrize( "actual", [ - DataFrame(data=data, index=["a", "a"]), - DataFrame(data=data, index=["a", "b"]), - DataFrame(data=data, index=["a", "b"]).set_index([0, 1]), - DataFrame(data=data, index=["a", "a"]).set_index([0, 1]), + DataFrame([[1, 2, 3], [1, 2, 3]], index=["a", "a"]), + DataFrame([[1, 2, 3], [1, 2, 3]], index=["a", "b"]), + DataFrame([[1, 2, 3], [1, 2, 3]], index=["a", "b"]).set_index([0, 1]), + DataFrame([[1, 2, 3], [1, 2, 3]], index=["a", "a"]).set_index([0, 1]), ], ) def test_raise_on_drop_duplicate_index(self, actual): diff --git a/pandas/tests/frame/methods/test_drop_duplicates.py b/pandas/tests/frame/methods/test_drop_duplicates.py index 6bea97b2cf189..7feb3b6fd816d 100644 --- a/pandas/tests/frame/methods/test_drop_duplicates.py +++ b/pandas/tests/frame/methods/test_drop_duplicates.py @@ -411,10 +411,15 @@ def test_drop_duplicates_inplace(): @pytest.mark.parametrize( "origin_dict, output_dict, ignore_index, output_index", [ - ({"A": [2, 2, 3]}, {"A": [2, 3]}, True, [0, 1]), - ({"A": [2, 2, 3]}, {"A": [2, 3]}, False, [0, 2]), - ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, True, [0, 1]), - ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, False, [0, 2]), + ({"A": [2, 2, 3]}, {"A": [2, 3]}, True, range(2)), + ({"A": [2, 2, 3]}, {"A": [2, 3]}, False, range(0, 4, 2)), + ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, True, range(2)), + ( + {"A": [2, 2, 3], "B": [2, 2, 4]}, + {"A": [2, 3], "B": [2, 4]}, + False, + range(0, 4, 2), + ), ], ) def test_drop_duplicates_ignore_index( @@ -471,3 +476,41 @@ def test_drop_duplicates_non_boolean_ignore_index(arg): msg = '^For argument "ignore_index" expected type bool, received type .*.$' with pytest.raises(ValueError, match=msg): df.drop_duplicates(ignore_index=arg) + + +def test_drop_duplicates_set(): + # GH#59237 + df = DataFrame( + { + "AAA": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": [1, 1, 2, 2, 2, 2, 1, 2], + "D": range(8), + } + ) + # single column + result = df.drop_duplicates({"AAA"}) + expected = df[:2] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates({"AAA"}, keep="last") + expected = df.loc[[6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates({"AAA"}, keep=False) + expected = df.loc[[]] + tm.assert_frame_equal(result, expected) + assert len(result) == 0 + + # multi column + expected = df.loc[[0, 1, 2, 3]] + result = df.drop_duplicates({"AAA", "B"}) + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates({"AAA", "B"}, keep="last") + expected = df.loc[[0, 5, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates({"AAA", "B"}, keep=False) + expected = df.loc[[0]] + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_dropna.py b/pandas/tests/frame/methods/test_dropna.py index 7899b4aeac3fd..11893d7fac1a4 100644 --- a/pandas/tests/frame/methods/test_dropna.py +++ b/pandas/tests/frame/methods/test_dropna.py @@ -195,7 +195,7 @@ def test_dropna_tz_aware_datetime(self): # Ex2 df = DataFrame({"Time": [dt1, None, np.nan, dt2]}) result = df.dropna(axis=0) - expected = DataFrame([dt1, 
dt2], columns=["Time"], index=[0, 3]) + expected = DataFrame([dt1, dt2], columns=["Time"], index=range(0, 6, 3)) tm.assert_frame_equal(result, expected) def test_dropna_categorical_interval_index(self): @@ -233,7 +233,7 @@ def test_set_single_column_subset(self): # GH 41021 df = DataFrame({"A": [1, 2, 3], "B": list("abc"), "C": [4, np.nan, 5]}) expected = DataFrame( - {"A": [1, 3], "B": list("ac"), "C": [4.0, 5.0]}, index=[0, 2] + {"A": [1, 3], "B": list("ac"), "C": [4.0, 5.0]}, index=range(0, 4, 2) ) result = df.dropna(subset="C") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_dtypes.py b/pandas/tests/frame/methods/test_dtypes.py index ab632ac17318e..bf01ec73cf72b 100644 --- a/pandas/tests/frame/methods/test_dtypes.py +++ b/pandas/tests/frame/methods/test_dtypes.py @@ -10,7 +10,6 @@ DataFrame, Series, date_range, - option_context, ) import pandas._testing as tm @@ -95,14 +94,6 @@ def test_dtypes_gh8722(self, float_string_frame): ) tm.assert_series_equal(result, expected) - # compat, GH 8722 - msg = "use_inf_as_na option is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - with option_context("use_inf_as_na", True): - df = DataFrame([[1]]) - result = df.dtypes - tm.assert_series_equal(result, Series({0: np.dtype("int64")})) - def test_dtypes_timedeltas(self): df = DataFrame( { @@ -146,8 +137,5 @@ def test_frame_apply_np_array_return_type(self, using_infer_string): # GH 35517 df = DataFrame([["foo"]]) result = df.apply(lambda col: np.array("bar")) - if using_infer_string: - expected = Series([np.array(["bar"])]) - else: - expected = Series(["bar"]) + expected = Series(np.array("bar")) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_equals.py b/pandas/tests/frame/methods/test_equals.py index d0b9d96cafa0d..88b3fec02182b 100644 --- a/pandas/tests/frame/methods/test_equals.py +++ b/pandas/tests/frame/methods/test_equals.py @@ -14,11 +14,11 @@ def test_dataframe_not_equal(self): df2 = DataFrame({"a": ["s", "d"], "b": [1, 2]}) assert df1.equals(df2) is False - def test_equals_different_blocks(self, using_array_manager, using_infer_string): + def test_equals_different_blocks(self, using_infer_string): # GH#9330 df0 = DataFrame({"A": ["x", "y"], "B": [1, 2], "C": ["w", "z"]}) df1 = df0.reset_index()[["A", "B", "C"]] - if not using_array_manager and not using_infer_string: + if not using_infer_string: # this assert verifies that the above operations have # induced a block rearrangement assert df0._mgr.blocks[0].dtype != df1._mgr.blocks[0].dtype diff --git a/pandas/tests/frame/methods/test_explode.py b/pandas/tests/frame/methods/test_explode.py index 5cd54db62d783..876ad5539d603 100644 --- a/pandas/tests/frame/methods/test_explode.py +++ b/pandas/tests/frame/methods/test_explode.py @@ -38,10 +38,6 @@ def test_error(): [], "column must be nonempty", ), - ( - list("AC"), - "columns must have matching element counts", - ), ], ) def test_error_multi_columns(input_subset, error_message): @@ -214,7 +210,7 @@ def test_ignore_index(): df = pd.DataFrame({"id": range(0, 20, 10), "values": [list("ab"), list("cd")]}) result = df.explode("values", ignore_index=True) expected = pd.DataFrame( - {"id": [0, 0, 10, 10], "values": list("abcd")}, index=[0, 1, 2, 3] + {"id": [0, 0, 10, 10], "values": list("abcd")}, index=range(4) ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index 6757669351c5c..8915d6f205d65 100644 --- 
a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -1,9 +1,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype - -import pandas.util._test_decorators as td +from pandas.errors import OutOfBoundsDatetime from pandas import ( Categorical, @@ -22,18 +20,14 @@ class TestFillNA: - def test_fillna_dict_inplace_nonunique_columns( - self, using_copy_on_write, warn_copy_on_write - ): + def test_fillna_dict_inplace_nonunique_columns(self): df = DataFrame( {"A": [np.nan] * 3, "B": [NaT, Timestamp(1), NaT], "C": [np.nan, "foo", 2]} ) df.columns = ["A", "A", "A"] orig = df[:] - # TODO(CoW-warn) better warning message - with tm.assert_cow_warning(warn_copy_on_write): - df.fillna({"A": 2}, inplace=True) + df.fillna({"A": 2}, inplace=True) # The first and third columns can be set inplace, while the second cannot. expected = DataFrame( @@ -41,31 +35,19 @@ def test_fillna_dict_inplace_nonunique_columns( ) expected.columns = ["A", "A", "A"] tm.assert_frame_equal(df, expected) - - # TODO: what's the expected/desired behavior with CoW? - if not using_copy_on_write: - assert tm.shares_memory(df.iloc[:, 0], orig.iloc[:, 0]) assert not tm.shares_memory(df.iloc[:, 1], orig.iloc[:, 1]) - if not using_copy_on_write: - assert tm.shares_memory(df.iloc[:, 2], orig.iloc[:, 2]) - @td.skip_array_manager_not_yet_implemented - def test_fillna_on_column_view(self, using_copy_on_write): + def test_fillna_on_column_view(self): # GH#46149 avoid unnecessary copies arr = np.full((40, 50), np.nan) df = DataFrame(arr, copy=False) - if using_copy_on_write: - with tm.raises_chained_assignment_error(): - df[0].fillna(-1, inplace=True) - assert np.isnan(arr[:, 0]).all() - else: - with tm.assert_produces_warning(FutureWarning, match="inplace method"): - df[0].fillna(-1, inplace=True) - assert (arr[:, 0] == -1).all() + with tm.raises_chained_assignment_error(): + df[0].fillna(-1, inplace=True) + assert np.isnan(arr[:, 0]).all() # i.e. 
we didn't create a new 49-column block - assert len(df._mgr.arrays) == 1 + assert len(df._mgr.blocks) == 1 assert np.shares_memory(df.values, arr) def test_fillna_datetime(self, datetime_frame): @@ -76,31 +58,27 @@ def test_fillna_datetime(self, datetime_frame): zero_filled = datetime_frame.fillna(0) assert (zero_filled.loc[zero_filled.index[:5], "A"] == 0).all() - msg = "DataFrame.fillna with 'method' is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - padded = datetime_frame.fillna(method="pad") + padded = datetime_frame.ffill() assert np.isnan(padded.loc[padded.index[:5], "A"]).all() - assert ( - padded.loc[padded.index[-5:], "A"] == padded.loc[padded.index[-5], "A"] - ).all() - msg = "Must specify a fill 'value' or 'method'" - with pytest.raises(ValueError, match=msg): + msg = r"missing 1 required positional argument: 'value'" + with pytest.raises(TypeError, match=msg): datetime_frame.fillna() - msg = "Cannot specify both 'value' and 'method'" - with pytest.raises(ValueError, match=msg): - datetime_frame.fillna(5, method="ffill") - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string") - def test_fillna_mixed_type(self, float_string_frame): + def test_fillna_mixed_type(self, float_string_frame, using_infer_string): mf = float_string_frame mf.loc[mf.index[5:20], "foo"] = np.nan mf.loc[mf.index[-10:], "A"] = np.nan - # TODO: make stronger assertion here, GH 25640 - mf.fillna(value=0) - msg = "DataFrame.fillna with 'method' is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - mf.fillna(method="pad") + + result = mf.ffill() + assert ( + result.loc[result.index[-10:], "A"] == result.loc[result.index[-11], "A"] + ).all() + assert (result.loc[result.index[5:20], "foo"] == "bar").all() + + result = mf.fillna(value=0) + assert (result.loc[result.index[-10:], "A"] == 0).all() + assert (result.loc[result.index[5:20], "foo"] == 0).all() def test_fillna_mixed_float(self, mixed_float_frame): # mixed numeric (but no float16) @@ -108,44 +86,24 @@ def test_fillna_mixed_float(self, mixed_float_frame): mf.loc[mf.index[-10:], "A"] = np.nan result = mf.fillna(value=0) _check_mixed_float(result, dtype={"C": None}) - - msg = "DataFrame.fillna with 'method' is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = mf.fillna(method="pad") + result = mf.ffill() _check_mixed_float(result, dtype={"C": None}) - def test_fillna_empty(self, using_copy_on_write): - if using_copy_on_write: - pytest.skip("condition is unnecessary complex and is deprecated anyway") - # empty frame (GH#2778) - df = DataFrame(columns=["x"]) - for m in ["pad", "backfill"]: - msg = "Series.fillna with 'method' is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - df.x.fillna(method=m, inplace=True) - df.x.fillna(method=m) - - def test_fillna_different_dtype(self, using_infer_string): + def test_fillna_different_dtype(self): # with different dtype (GH#3386) df = DataFrame( [["a", "a", np.nan, "a"], ["b", "b", np.nan, "b"], ["c", "c", np.nan, "c"]] ) - if using_infer_string: - with tm.assert_produces_warning(FutureWarning, match="Downcasting"): - result = df.fillna({2: "foo"}) - else: - result = df.fillna({2: "foo"}) + result = df.fillna({2: "foo"}) expected = DataFrame( [["a", "a", "foo", "a"], ["b", "b", "foo", "b"], ["c", "c", "foo", "c"]] ) + # column is originally float (all-NaN) -> filling with string gives object dtype + expected[2] = expected[2].astype("object") tm.assert_frame_equal(result, expected) - if 
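# Aside: the churn throughout test_fillna.py tracks the pandas 3.x removal of
# fillna(method=...) and of the 'downcast' keyword; ffill/bfill are the
# replacements. A minimal sketch, assuming pandas 3.x:
import numpy as np
import pandas as pd

df = pd.DataFrame({"A": [1.0, np.nan, 3.0]})
print(df.ffill())         # replaces df.fillna(method="pad")
print(df.bfill(limit=1))  # replaces df.fillna(method="backfill", limit=1)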
using_infer_string: - with tm.assert_produces_warning(FutureWarning, match="Downcasting"): - return_value = df.fillna({2: "foo"}, inplace=True) - else: - return_value = df.fillna({2: "foo"}, inplace=True) + return_value = df.fillna({2: "foo"}, inplace=True) tm.assert_frame_equal(df, expected) assert return_value is None @@ -188,9 +146,7 @@ def test_fillna_tzaware(self): ] } ) - msg = "DataFrame.fillna with 'method' is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - res = df.fillna(method="pad") + res = df.ffill() tm.assert_frame_equal(res, exp) df = DataFrame({"A": [NaT, Timestamp("2012-11-11 00:00:00+01:00")]}) @@ -202,9 +158,7 @@ def test_fillna_tzaware(self): ] } ) - msg = "DataFrame.fillna with 'method' is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - res = df.fillna(method="bfill") + res = df.bfill() tm.assert_frame_equal(res, exp) def test_fillna_tzaware_different_column(self): @@ -216,9 +170,7 @@ def test_fillna_tzaware_different_column(self): "B": [1, 2, np.nan, np.nan], } ) - msg = "DataFrame.fillna with 'method' is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.fillna(method="pad") + result = df.ffill() expected = DataFrame( { "A": date_range("20130101", periods=4, tz="US/Eastern"), @@ -249,9 +201,7 @@ def test_na_actions_categorical(self): with pytest.raises(TypeError, match=msg): df.fillna(value={"cats": 4, "vals": "c"}) - msg = "DataFrame.fillna with 'method' is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - res = df.fillna(method="pad") + res = df.ffill() tm.assert_frame_equal(res, df_exp_fill) # dropna @@ -307,57 +257,12 @@ def test_fillna_categorical_nan(self): df = DataFrame({"a": Categorical(idx)}) tm.assert_frame_equal(df.fillna(value=NaT), df) - def test_fillna_downcast(self): - # GH#15277 - # infer int64 from float64 - df = DataFrame({"a": [1.0, np.nan]}) - msg = "The 'downcast' keyword in fillna is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.fillna(0, downcast="infer") - expected = DataFrame({"a": [1, 0]}) - tm.assert_frame_equal(result, expected) - - # infer int64 from float64 when fillna value is a dict - df = DataFrame({"a": [1.0, np.nan]}) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.fillna({"a": 0}, downcast="infer") - expected = DataFrame({"a": [1, 0]}) - tm.assert_frame_equal(result, expected) - - def test_fillna_downcast_false(self, frame_or_series): - # GH#45603 preserve object dtype with downcast=False + def test_fillna_no_downcast(self, frame_or_series): + # GH#45603 preserve object dtype obj = frame_or_series([1, 2, 3], dtype="object") - msg = "The 'downcast' keyword in fillna" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = obj.fillna("", downcast=False) + result = obj.fillna("") tm.assert_equal(result, obj) - def test_fillna_downcast_noop(self, frame_or_series): - # GH#45423 - # Two relevant paths: - # 1) not _can_hold_na (e.g. 
integer) - # 2) _can_hold_na + noop + not can_hold_element - - obj = frame_or_series([1, 2, 3], dtype=np.int64) - - msg = "The 'downcast' keyword in fillna" - with tm.assert_produces_warning(FutureWarning, match=msg): - # GH#40988 - res = obj.fillna("foo", downcast=np.dtype(np.int32)) - expected = obj.astype(np.int32) - tm.assert_equal(res, expected) - - obj2 = obj.astype(np.float64) - with tm.assert_produces_warning(FutureWarning, match=msg): - res2 = obj2.fillna("foo", downcast="infer") - expected2 = obj # get back int64 - tm.assert_equal(res2, expected2) - - with tm.assert_produces_warning(FutureWarning, match=msg): - # GH#40988 - res3 = obj2.fillna("foo", downcast=np.dtype(np.int32)) - tm.assert_equal(res3, expected) - @pytest.mark.parametrize("columns", [["A", "A", "B"], ["A", "A"]]) def test_fillna_dictlike_value_duplicate_colnames(self, columns): # GH#43476 @@ -369,27 +274,22 @@ def test_fillna_dictlike_value_duplicate_colnames(self, columns): expected["A"] = 0.0 tm.assert_frame_equal(result, expected) - def test_fillna_dtype_conversion(self, using_infer_string): + def test_fillna_dtype_conversion(self): # make sure that fillna on an empty frame works df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5]) result = df.dtypes expected = Series([np.dtype("object")] * 5, index=[1, 2, 3, 4, 5]) tm.assert_series_equal(result, expected) - - msg = "Downcasting object dtype arrays" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.fillna(1) - expected = DataFrame(1, index=["A", "B", "C"], columns=[1, 2, 3, 4, 5]) + result = df.fillna(1) + expected = DataFrame( + 1, index=["A", "B", "C"], columns=[1, 2, 3, 4, 5], dtype=object + ) tm.assert_frame_equal(result, expected) # empty block df = DataFrame(index=range(3), columns=["A", "B"], dtype="float64") - if using_infer_string: - with tm.assert_produces_warning(FutureWarning, match="Downcasting"): - result = df.fillna("nan") - else: - result = df.fillna("nan") - expected = DataFrame("nan", index=range(3), columns=["A", "B"]) + result = df.fillna("nan") + expected = DataFrame("nan", dtype="object", index=range(3), columns=["A", "B"]) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("val", ["", 1, np.nan, 1.0]) @@ -447,19 +347,14 @@ def test_ffill(self, datetime_frame): datetime_frame.loc[datetime_frame.index[:5], "A"] = np.nan datetime_frame.loc[datetime_frame.index[-5:], "A"] = np.nan - msg = "DataFrame.fillna with 'method' is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - alt = datetime_frame.fillna(method="ffill") + alt = datetime_frame.ffill() tm.assert_frame_equal(datetime_frame.ffill(), alt) def test_bfill(self, datetime_frame): datetime_frame.loc[datetime_frame.index[:5], "A"] = np.nan datetime_frame.loc[datetime_frame.index[-5:], "A"] = np.nan - msg = "DataFrame.fillna with 'method' is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - alt = datetime_frame.fillna(method="bfill") - + alt = datetime_frame.bfill() tm.assert_frame_equal(datetime_frame.bfill(), alt) def test_frame_pad_backfill_limit(self): @@ -468,16 +363,13 @@ def test_frame_pad_backfill_limit(self): result = df[:2].reindex(index, method="pad", limit=5) - msg = "DataFrame.fillna with 'method' is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = df[:2].reindex(index).fillna(method="pad") + expected = df[:2].reindex(index).ffill() expected.iloc[-3:] = np.nan tm.assert_frame_equal(result, expected) result = df[-2:].reindex(index, 
method="backfill", limit=5) - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = df[-2:].reindex(index).fillna(method="backfill") + expected = df[-2:].reindex(index).bfill() expected.iloc[:3] = np.nan tm.assert_frame_equal(result, expected) @@ -486,21 +378,16 @@ def test_frame_fillna_limit(self): df = DataFrame(np.random.default_rng(2).standard_normal((10, 4)), index=index) result = df[:2].reindex(index) - msg = "DataFrame.fillna with 'method' is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = result.fillna(method="pad", limit=5) + result = result.ffill(limit=5) - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = df[:2].reindex(index).fillna(method="pad") + expected = df[:2].reindex(index).ffill() expected.iloc[-3:] = np.nan tm.assert_frame_equal(result, expected) result = df[-2:].reindex(index) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = result.fillna(method="backfill", limit=5) + result = result.bfill(limit=5) - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = df[-2:].reindex(index).fillna(method="backfill") + expected = df[-2:].reindex(index).bfill() expected.iloc[:3] = np.nan tm.assert_frame_equal(result, expected) @@ -544,13 +431,10 @@ def test_fillna_inplace(self): df.loc[:4, 1] = np.nan df.loc[-4:, 3] = np.nan - msg = "DataFrame.fillna with 'method' is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = df.fillna(method="ffill") + expected = df.ffill() assert expected is not df - with tm.assert_produces_warning(FutureWarning, match=msg): - df.fillna(method="ffill", inplace=True) + df.ffill(inplace=True) tm.assert_frame_equal(df, expected) def test_fillna_dict_series(self): @@ -579,7 +463,7 @@ def test_fillna_dict_series(self): # disable this for now with pytest.raises(NotImplementedError, match="column by column"): - df.fillna(df.max(1), axis=1) + df.fillna(df.max(axis=1), axis=1) def test_fillna_dataframe(self): # GH#8377 @@ -621,24 +505,15 @@ def test_fillna_columns(self): arr[:, ::2] = np.nan df = DataFrame(arr) - msg = "DataFrame.fillna with 'method' is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.fillna(method="ffill", axis=1) - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = df.T.fillna(method="pad").T + result = df.ffill(axis=1) + expected = df.T.ffill().T tm.assert_frame_equal(result, expected) df.insert(6, "foo", 5) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.fillna(method="ffill", axis=1) - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = df.astype(float).fillna(method="ffill", axis=1) + result = df.ffill(axis=1) + expected = df.astype(float).ffill(axis=1) tm.assert_frame_equal(result, expected) - def test_fillna_invalid_method(self, float_frame): - with pytest.raises(ValueError, match="ffil"): - float_frame.fillna(method="ffil") - def test_fillna_invalid_value(self, float_frame): # list msg = '"value" parameter must be a scalar or dict, but you passed a "{}"' @@ -659,32 +534,13 @@ def test_fillna_col_reordering(self): cols = ["COL." 
+ str(i) for i in range(5, 0, -1)] data = np.random.default_rng(2).random((20, 5)) df = DataFrame(index=range(20), columns=cols, data=data) - msg = "DataFrame.fillna with 'method' is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - filled = df.fillna(method="ffill") + filled = df.ffill() assert df.columns.tolist() == filled.columns.tolist() - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string") - def test_fill_corner(self, float_frame, float_string_frame): - mf = float_string_frame - mf.loc[mf.index[5:20], "foo"] = np.nan - mf.loc[mf.index[-10:], "A"] = np.nan - - filled = float_string_frame.fillna(value=0) - assert (filled.loc[filled.index[5:20], "foo"] == 0).all() - del float_string_frame["foo"] - - float_frame.reindex(columns=[]).fillna(value=0) - - def test_fillna_downcast_dict(self): - # GH#40809 - df = DataFrame({"col1": [1, np.nan]}) - - msg = "The 'downcast' keyword in fillna" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.fillna({"col1": 2}, downcast={"col1": "int64"}) - expected = DataFrame({"col1": [1, 2]}) - tm.assert_frame_equal(result, expected) + def test_fill_empty(self, float_frame): + df = float_frame.reindex(columns=[]) + result = df.fillna(value=0) + tm.assert_frame_equal(result, df) def test_fillna_with_columns_and_limit(self): # GH40989 @@ -752,23 +608,16 @@ def test_fillna_inplace_with_columns_limit_and_value(self): df.fillna(axis=1, value=100, limit=1, inplace=True) tm.assert_frame_equal(df, expected) - @td.skip_array_manager_invalid_test @pytest.mark.parametrize("val", [-1, {"x": -1, "y": -1}]) - def test_inplace_dict_update_view( - self, val, using_copy_on_write, warn_copy_on_write - ): + def test_inplace_dict_update_view(self, val): # GH#47188 df = DataFrame({"x": [np.nan, 2], "y": [np.nan, 2]}) df_orig = df.copy() result_view = df[:] - with tm.assert_cow_warning(warn_copy_on_write): - df.fillna(val, inplace=True) + df.fillna(val, inplace=True) expected = DataFrame({"x": [-1, 2.0], "y": [-1.0, 2]}) tm.assert_frame_equal(df, expected) - if using_copy_on_write: - tm.assert_frame_equal(result_view, df_orig) - else: - tm.assert_frame_equal(result_view, expected) + tm.assert_frame_equal(result_view, df_orig) def test_single_block_df_with_horizontal_axis(self): # GH 47713 @@ -843,60 +692,38 @@ def test_fillna_nones_inplace(): [[None, None], [None, None]], columns=["A", "B"], ) - msg = "Downcasting object dtype arrays" - with tm.assert_produces_warning(FutureWarning, match=msg): - df.fillna(value={"A": 1, "B": 2}, inplace=True) + df.fillna(value={"A": 1, "B": 2}, inplace=True) - expected = DataFrame([[1, 2], [1, 2]], columns=["A", "B"]) + expected = DataFrame([[1, 2], [1, 2]], columns=["A", "B"], dtype=object) tm.assert_frame_equal(df, expected) -@pytest.mark.parametrize("func", ["pad", "backfill"]) -def test_pad_backfill_deprecated(func): - # GH#33396 - df = DataFrame({"a": [1, 2, 3]}) - with tm.assert_produces_warning(FutureWarning): - getattr(df, func)() - - @pytest.mark.parametrize( "data, expected_data, method, kwargs", ( - pytest.param( + ( [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], [np.nan, np.nan, 3.0, 3.0, 3.0, 3.0, 7.0, np.nan, np.nan], "ffill", {"limit_area": "inside"}, - marks=pytest.mark.xfail( - reason="GH#41813 - limit_area applied to the wrong axis" - ), ), - pytest.param( + ( [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], [np.nan, np.nan, 3.0, 3.0, np.nan, np.nan, 7.0, np.nan, np.nan], "ffill", {"limit_area": "inside", "limit": 1}, - 
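# Aside: the xfail marks come off the parametrized cases above because
# limit_area is now honored by ffill/bfill directly (GH#41813). A minimal
# sketch, assuming pandas >= 2.2:
import numpy as np
import pandas as pd

df = pd.DataFrame([np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, np.nan, np.nan])
print(df.ffill(limit_area="inside")[0].tolist())
# [nan, nan, 3.0, 3.0, 3.0, 3.0, 7.0, nan, nan] -- only interior gaps are filled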
marks=pytest.mark.xfail( - reason="GH#41813 - limit_area applied to the wrong axis" - ), ), - pytest.param( + ( [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], [np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, 7.0], "ffill", {"limit_area": "outside"}, - marks=pytest.mark.xfail( - reason="GH#41813 - limit_area applied to the wrong axis" - ), ), - pytest.param( + ( [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], [np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, np.nan], "ffill", {"limit_area": "outside", "limit": 1}, - marks=pytest.mark.xfail( - reason="GH#41813 - limit_area applied to the wrong axis" - ), ), ( [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], @@ -910,41 +737,29 @@ def test_pad_backfill_deprecated(func): "ffill", {"limit_area": "outside", "limit": 1}, ), - pytest.param( + ( [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], [np.nan, np.nan, 3.0, 7.0, 7.0, 7.0, 7.0, np.nan, np.nan], "bfill", {"limit_area": "inside"}, - marks=pytest.mark.xfail( - reason="GH#41813 - limit_area applied to the wrong axis" - ), ), - pytest.param( + ( [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], [np.nan, np.nan, 3.0, np.nan, np.nan, 7.0, 7.0, np.nan, np.nan], "bfill", {"limit_area": "inside", "limit": 1}, - marks=pytest.mark.xfail( - reason="GH#41813 - limit_area applied to the wrong axis" - ), ), - pytest.param( + ( [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], [3.0, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, np.nan, np.nan], "bfill", {"limit_area": "outside"}, - marks=pytest.mark.xfail( - reason="GH#41813 - limit_area applied to the wrong axis" - ), ), - pytest.param( + ( [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], [np.nan, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, np.nan, np.nan], "bfill", {"limit_area": "outside", "limit": 1}, - marks=pytest.mark.xfail( - reason="GH#41813 - limit_area applied to the wrong axis" - ), ), ), ) @@ -954,3 +769,29 @@ def test_ffill_bfill_limit_area(data, expected_data, method, kwargs): expected = DataFrame(expected_data) result = getattr(df, method)(**kwargs) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("test_frame", [True, False]) +@pytest.mark.parametrize("dtype", ["float", "object"]) +def test_fillna_with_none_object(test_frame, dtype): + # GH#57723 + obj = Series([1, np.nan, 3], dtype=dtype) + if test_frame: + obj = obj.to_frame() + result = obj.fillna(value=None) + expected = Series([1, None, 3], dtype=dtype) + if test_frame: + expected = expected.to_frame() + tm.assert_equal(result, expected) + + +def test_fillna_out_of_bounds_datetime(): + # GH#61208 + df = DataFrame( + {"datetime": date_range("1/1/2011", periods=3, freq="h"), "value": [1, 2, 3]} + ) + df.iloc[0, 0] = None + + msg = "Cannot cast 0001-01-01 00:00:00 to unit='ns' without overflow" + with pytest.raises(OutOfBoundsDatetime, match=msg): + df.fillna(Timestamp("0001-01-01")) diff --git a/pandas/tests/frame/methods/test_filter.py b/pandas/tests/frame/methods/test_filter.py index 9d5e6876bb08c..dc84e2adf1239 100644 --- a/pandas/tests/frame/methods/test_filter.py +++ b/pandas/tests/frame/methods/test_filter.py @@ -98,23 +98,23 @@ def test_filter_regex_search(self, float_frame): tm.assert_frame_equal(result, exp) @pytest.mark.parametrize( - "name,expected", + "name,expected_data", [ - ("a", DataFrame({"a": [1, 2]})), - ("a", DataFrame({"a": [1, 2]})), - ("あ", DataFrame({"あ": [3, 4]})), + ("a", {"a": [1, 2]}), + ("あ", {"あ": [3, 4]}), ], ) - def 
test_filter_unicode(self, name, expected): + def test_filter_unicode(self, name, expected_data): # GH13101 df = DataFrame({"a": [1, 2], "あ": [3, 4]}) + expected = DataFrame(expected_data) tm.assert_frame_equal(df.filter(like=name), expected) tm.assert_frame_equal(df.filter(regex=name), expected) - @pytest.mark.parametrize("name", ["a", "a"]) - def test_filter_bytestring(self, name): + def test_filter_bytestring(self): # GH13101 + name = "a" df = DataFrame({b"a": [1, 2], b"b": [3, 4]}) expected = DataFrame({b"a": [1, 2]}) diff --git a/pandas/tests/frame/methods/test_first_and_last.py b/pandas/tests/frame/methods/test_first_and_last.py deleted file mode 100644 index 212e56442ee07..0000000000000 --- a/pandas/tests/frame/methods/test_first_and_last.py +++ /dev/null @@ -1,143 +0,0 @@ -""" -Note: includes tests for `last` -""" -import numpy as np -import pytest - -import pandas as pd -from pandas import ( - DataFrame, - Index, - bdate_range, - date_range, -) -import pandas._testing as tm - -deprecated_msg = "first is deprecated" -last_deprecated_msg = "last is deprecated" - - -class TestFirst: - def test_first_subset(self, frame_or_series): - ts = DataFrame( - np.random.default_rng(2).standard_normal((100, 4)), - columns=Index(list("ABCD"), dtype=object), - index=date_range("2000-01-01", periods=100, freq="12h"), - ) - ts = tm.get_obj(ts, frame_or_series) - with tm.assert_produces_warning(FutureWarning, match=deprecated_msg): - result = ts.first("10d") - assert len(result) == 20 - - ts = DataFrame( - np.random.default_rng(2).standard_normal((100, 4)), - columns=Index(list("ABCD"), dtype=object), - index=date_range("2000-01-01", periods=100, freq="D"), - ) - ts = tm.get_obj(ts, frame_or_series) - with tm.assert_produces_warning(FutureWarning, match=deprecated_msg): - result = ts.first("10d") - assert len(result) == 10 - - with tm.assert_produces_warning(FutureWarning, match=deprecated_msg): - result = ts.first("3ME") - expected = ts[:"3/31/2000"] - tm.assert_equal(result, expected) - - with tm.assert_produces_warning(FutureWarning, match=deprecated_msg): - result = ts.first("21D") - expected = ts[:21] - tm.assert_equal(result, expected) - - with tm.assert_produces_warning(FutureWarning, match=deprecated_msg): - result = ts[:0].first("3ME") - tm.assert_equal(result, ts[:0]) - - def test_first_last_raises(self, frame_or_series): - # GH#20725 - obj = DataFrame([[1, 2, 3], [4, 5, 6]]) - obj = tm.get_obj(obj, frame_or_series) - - msg = "'first' only supports a DatetimeIndex index" - with tm.assert_produces_warning( - FutureWarning, match=deprecated_msg - ), pytest.raises( - TypeError, match=msg - ): # index is not a DatetimeIndex - obj.first("1D") - - msg = "'last' only supports a DatetimeIndex index" - with tm.assert_produces_warning( - FutureWarning, match=last_deprecated_msg - ), pytest.raises( - TypeError, match=msg - ): # index is not a DatetimeIndex - obj.last("1D") - - def test_last_subset(self, frame_or_series): - ts = DataFrame( - np.random.default_rng(2).standard_normal((100, 4)), - columns=Index(list("ABCD"), dtype=object), - index=date_range("2000-01-01", periods=100, freq="12h"), - ) - ts = tm.get_obj(ts, frame_or_series) - with tm.assert_produces_warning(FutureWarning, match=last_deprecated_msg): - result = ts.last("10d") - assert len(result) == 20 - - ts = DataFrame( - np.random.default_rng(2).standard_normal((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=date_range("2000-01-01", periods=30, freq="D"), - ) - ts = tm.get_obj(ts, frame_or_series) - with 
tm.assert_produces_warning(FutureWarning, match=last_deprecated_msg): - result = ts.last("10d") - assert len(result) == 10 - - with tm.assert_produces_warning(FutureWarning, match=last_deprecated_msg): - result = ts.last("21D") - expected = ts["2000-01-10":] - tm.assert_equal(result, expected) - - with tm.assert_produces_warning(FutureWarning, match=last_deprecated_msg): - result = ts.last("21D") - expected = ts[-21:] - tm.assert_equal(result, expected) - - with tm.assert_produces_warning(FutureWarning, match=last_deprecated_msg): - result = ts[:0].last("3ME") - tm.assert_equal(result, ts[:0]) - - @pytest.mark.parametrize("start, periods", [("2010-03-31", 1), ("2010-03-30", 2)]) - def test_first_with_first_day_last_of_month(self, frame_or_series, start, periods): - # GH#29623 - x = frame_or_series([1] * 100, index=bdate_range(start, periods=100)) - with tm.assert_produces_warning(FutureWarning, match=deprecated_msg): - result = x.first("1ME") - expected = frame_or_series( - [1] * periods, index=bdate_range(start, periods=periods) - ) - tm.assert_equal(result, expected) - - def test_first_with_first_day_end_of_frq_n_greater_one(self, frame_or_series): - # GH#29623 - x = frame_or_series([1] * 100, index=bdate_range("2010-03-31", periods=100)) - with tm.assert_produces_warning(FutureWarning, match=deprecated_msg): - result = x.first("2ME") - expected = frame_or_series( - [1] * 23, index=bdate_range("2010-03-31", "2010-04-30") - ) - tm.assert_equal(result, expected) - - def test_empty_not_input(self): - # GH#51032 - df = DataFrame(index=pd.DatetimeIndex([])) - with tm.assert_produces_warning(FutureWarning, match=last_deprecated_msg): - result = df.last(offset=1) - - with tm.assert_produces_warning(FutureWarning, match=deprecated_msg): - result = df.first(offset=1) - - tm.assert_frame_equal(df, result) - assert df is not result diff --git a/pandas/tests/frame/methods/test_first_valid_index.py b/pandas/tests/frame/methods/test_first_valid_index.py index 2e27f1aa71700..5855be2373ae2 100644 --- a/pandas/tests/frame/methods/test_first_valid_index.py +++ b/pandas/tests/frame/methods/test_first_valid_index.py @@ -1,6 +1,7 @@ """ Includes test for last_valid_index. 
""" + import numpy as np import pytest diff --git a/pandas/tests/frame/methods/test_get_numeric_data.py b/pandas/tests/frame/methods/test_get_numeric_data.py index c5d32d56d03c1..6d097e75f6703 100644 --- a/pandas/tests/frame/methods/test_get_numeric_data.py +++ b/pandas/tests/frame/methods/test_get_numeric_data.py @@ -33,7 +33,9 @@ def test_get_numeric_data(self, using_infer_string): [ np.dtype("float64"), np.dtype("int64"), - np.dtype(objectname) if not using_infer_string else "string", + np.dtype(objectname) + if not using_infer_string + else pd.StringDtype(na_value=np.nan), np.dtype(datetime64name), ], index=["a", "b", "c", "f"], diff --git a/pandas/tests/frame/methods/test_infer_objects.py b/pandas/tests/frame/methods/test_infer_objects.py index a824a615b5c29..c7cdcd177403b 100644 --- a/pandas/tests/frame/methods/test_infer_objects.py +++ b/pandas/tests/frame/methods/test_infer_objects.py @@ -25,7 +25,7 @@ def test_infer_objects(self): assert df["a"].dtype == "int64" assert df["b"].dtype == "float64" - assert df["c"].dtype == "M8[ns]" + assert df["c"].dtype == "M8[us]" assert df["d"].dtype == "object" expected = DataFrame( diff --git a/pandas/tests/frame/methods/test_info.py b/pandas/tests/frame/methods/test_info.py index fcb7677f03f27..de6737ec3bc39 100644 --- a/pandas/tests/frame/methods/test_info.py +++ b/pandas/tests/frame/methods/test_info.py @@ -8,19 +8,23 @@ import pytest from pandas.compat import ( + HAS_PYARROW, IS64, PYPY, + is_platform_arm, ) from pandas import ( CategoricalIndex, DataFrame, + Index, MultiIndex, Series, date_range, option_context, ) import pandas._testing as tm +from pandas.util.version import Version @pytest.fixture @@ -40,7 +44,7 @@ def test_info_empty(): result = buf.getvalue() expected = textwrap.dedent( """\ - + RangeIndex: 0 entries Empty DataFrame\n""" ) @@ -208,7 +212,7 @@ def test_info_memory(): bytes = float(df.memory_usage().sum()) expected = textwrap.dedent( f"""\ - + RangeIndex: 2 entries, 0 to 1 Data columns (total 1 columns): # Column Non-Null Count Dtype @@ -360,7 +364,7 @@ def test_info_memory_usage(): df = DataFrame(data) df.columns = dtypes - df_with_object_index = DataFrame({"a": [1]}, index=["foo"]) + df_with_object_index = DataFrame({"a": [1]}, index=Index(["foo"], dtype=object)) df_with_object_index.info(buf=buf, memory_usage=True) res = buf.getvalue().splitlines() assert re.match(r"memory usage: [^+]+\+", res[-1]) @@ -398,25 +402,25 @@ def test_info_memory_usage(): @pytest.mark.skipif(PYPY, reason="on PyPy deep=True doesn't change result") def test_info_memory_usage_deep_not_pypy(): - df_with_object_index = DataFrame({"a": [1]}, index=["foo"]) + df_with_object_index = DataFrame({"a": [1]}, index=Index(["foo"], dtype=object)) assert ( df_with_object_index.memory_usage(index=True, deep=True).sum() > df_with_object_index.memory_usage(index=True).sum() ) - df_object = DataFrame({"a": ["a"]}) + df_object = DataFrame({"a": Series(["a"], dtype=object)}) assert df_object.memory_usage(deep=True).sum() > df_object.memory_usage().sum() @pytest.mark.xfail(not PYPY, reason="on PyPy deep=True does not change result") def test_info_memory_usage_deep_pypy(): - df_with_object_index = DataFrame({"a": [1]}, index=["foo"]) + df_with_object_index = DataFrame({"a": [1]}, index=Index(["foo"], dtype=object)) assert ( df_with_object_index.memory_usage(index=True, deep=True).sum() == df_with_object_index.memory_usage(index=True).sum() ) - df_object = DataFrame({"a": ["a"]}) + df_object = DataFrame({"a": Series(["a"], dtype=object)}) assert 
df_object.memory_usage(deep=True).sum() == df_object.memory_usage().sum() @@ -432,17 +436,25 @@ def test_usage_via_getsizeof(): assert abs(diff) < 100 -def test_info_memory_usage_qualified(): +def test_info_memory_usage_qualified(using_infer_string): buf = StringIO() df = DataFrame(1, columns=list("ab"), index=[1, 2, 3]) df.info(buf=buf) assert "+" not in buf.getvalue() buf = StringIO() - df = DataFrame(1, columns=list("ab"), index=list("ABC")) + df = DataFrame(1, columns=list("ab"), index=Index(list("ABC"), dtype=object)) df.info(buf=buf) assert "+" in buf.getvalue() + buf = StringIO() + df = DataFrame(1, columns=list("ab"), index=Index(list("ABC"), dtype="str")) + df.info(buf=buf) + if using_infer_string and HAS_PYARROW: + assert "+" not in buf.getvalue() + else: + assert "+" in buf.getvalue() + buf = StringIO() df = DataFrame( 1, columns=list("ab"), index=MultiIndex.from_product([range(3), range(3)]) @@ -455,7 +467,10 @@ def test_info_memory_usage_qualified(): 1, columns=list("ab"), index=MultiIndex.from_product([range(3), ["foo", "bar"]]) ) df.info(buf=buf) - assert "+" in buf.getvalue() + if using_infer_string and HAS_PYARROW: + assert "+" not in buf.getvalue() + else: + assert "+" in buf.getvalue() def test_info_memory_usage_bug_on_multiindex(): @@ -493,15 +508,15 @@ def test_info_categorical(): @pytest.mark.xfail(not IS64, reason="GH 36579: fail on 32-bit system") -def test_info_int_columns(): +def test_info_int_columns(using_infer_string): # GH#37245 df = DataFrame({1: [1, 2], 2: [2, 3]}, index=["A", "B"]) buf = StringIO() df.info(show_counts=True, buf=buf) result = buf.getvalue() expected = textwrap.dedent( - """\ - + f"""\ + Index: 2 entries, A to B Data columns (total 2 columns): # Column Non-Null Count Dtype @@ -509,25 +524,31 @@ def test_info_int_columns(): 0 1 2 non-null int64 1 2 2 non-null int64 dtypes: int64(2) - memory usage: 48.0+ bytes + memory usage: {"50.0" if using_infer_string and HAS_PYARROW else "48.0+"} bytes """ ) assert result == expected -def test_memory_usage_empty_no_warning(): +def test_memory_usage_empty_no_warning(using_infer_string): # GH#50066 df = DataFrame(index=["a", "b"]) with tm.assert_produces_warning(None): result = df.memory_usage() - expected = Series(16 if IS64 else 8, index=["Index"]) + if using_infer_string and HAS_PYARROW: + value = 18 + else: + value = 16 if IS64 else 8 + expected = Series(value, index=["Index"]) tm.assert_series_equal(result, expected) @pytest.mark.single_cpu def test_info_compute_numba(): # GH#51922 - pytest.importorskip("numba") + numba = pytest.importorskip("numba") + if Version(numba.__version__) == Version("0.61") and is_platform_arm(): + pytest.skip(f"Segfaults on ARM platforms with numba {numba.__version__}") df = DataFrame([[1, 2], [3, 4]]) with option_context("compute.use_numba", True): diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index e0641fcb65bd3..09d1cc9a479b2 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -1,9 +1,8 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype -from pandas.errors import ChainedAssignmentError import pandas.util._test_decorators as td from pandas import ( @@ -52,12 +51,8 @@ def test_interpolate_datetimelike_values(self, frame_or_series): expected_td = frame_or_series(orig - orig[0]) tm.assert_equal(res_td, expected_td) - def test_interpolate_inplace(self, 
frame_or_series, using_array_manager, request): + def test_interpolate_inplace(self, frame_or_series, request): # GH#44749 - if using_array_manager and frame_or_series is DataFrame: - mark = pytest.mark.xfail(reason=".values-based in-place check is invalid") - request.applymarker(mark) - obj = frame_or_series([1, np.nan, 2]) orig = obj.values @@ -69,10 +64,7 @@ def test_interpolate_inplace(self, frame_or_series, using_array_manager, request assert np.shares_memory(orig, obj.values) assert orig.squeeze()[1] == 1.5 - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="interpolate doesn't work for string" - ) - def test_interp_basic(self, using_copy_on_write): + def test_interp_basic(self, using_infer_string): df = DataFrame( { "A": [1, 2, np.nan, 4], @@ -81,40 +73,22 @@ def test_interp_basic(self, using_copy_on_write): "D": list("abcd"), } ) - expected = DataFrame( - { - "A": [1.0, 2.0, 3.0, 4.0], - "B": [1.0, 4.0, 9.0, 9.0], - "C": [1, 2, 3, 5], - "D": list("abcd"), - } - ) - msg = "DataFrame.interpolate with object dtype" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.interpolate() - tm.assert_frame_equal(result, expected) + dtype = "str" if using_infer_string else "object" + msg = f"[Cc]annot interpolate with {dtype} dtype" + with pytest.raises(TypeError, match=msg): + df.interpolate() - # check we didn't operate inplace GH#45791 cvalues = df["C"]._values dvalues = df["D"].values - if using_copy_on_write: - assert np.shares_memory(cvalues, result["C"]._values) - assert np.shares_memory(dvalues, result["D"]._values) - else: - assert not np.shares_memory(cvalues, result["C"]._values) - assert not np.shares_memory(dvalues, result["D"]._values) - - with tm.assert_produces_warning(FutureWarning, match=msg): - res = df.interpolate(inplace=True) - assert res is None - tm.assert_frame_equal(df, expected) + with pytest.raises(TypeError, match=msg): + df.interpolate(inplace=True) # check we DID operate inplace - assert np.shares_memory(df["C"]._values, cvalues) - assert np.shares_memory(df["D"]._values, dvalues) + assert tm.shares_memory(df["C"]._values, cvalues) + assert tm.shares_memory(df["D"]._values, dvalues) @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="interpolate doesn't work for string" + using_string_dtype(), reason="interpolate doesn't work for string" ) def test_interp_basic_with_non_range_index(self, using_infer_string): df = DataFrame( @@ -126,14 +100,16 @@ def test_interp_basic_with_non_range_index(self, using_infer_string): } ) - msg = "DataFrame.interpolate with object dtype" - warning = FutureWarning if not using_infer_string else None - with tm.assert_produces_warning(warning, match=msg): + msg = "DataFrame cannot interpolate with object dtype" + if not using_infer_string: + with pytest.raises(TypeError, match=msg): + df.set_index("C").interpolate() + else: result = df.set_index("C").interpolate() - expected = df.set_index("C") - expected.loc[3, "A"] = 3 - expected.loc[5, "B"] = 9 - tm.assert_frame_equal(result, expected) + expected = df.set_index("C") + expected.loc[3, "A"] = 2.66667 + expected.loc[5, "B"] = 9 + tm.assert_frame_equal(result, expected) def test_interp_empty(self): # https://github.com/pandas-dev/pandas/issues/35598 @@ -151,13 +127,7 @@ def test_interp_bad_method(self): "C": [1, 2, 3, 5], } ) - msg = ( - r"method must be one of \['linear', 'time', 'index', 'values', " - r"'nearest', 'zero', 'slinear', 'quadratic', 'cubic', " - r"'barycentric', 'krogh', 'spline', 'polynomial', " - r"'from_derivatives', 
'piecewise_polynomial', 'pchip', 'akima', " - r"'cubicspline'\]. Got 'not_a_method' instead." - ) + msg = "Can not interpolate with method=not_a_method" with pytest.raises(ValueError, match=msg): df.interpolate(method="not_a_method") @@ -175,33 +145,6 @@ def test_interp_combo(self): expected = Series([1.0, 2.0, 3.0, 4.0], name="A") tm.assert_series_equal(result, expected) - msg = "The 'downcast' keyword in Series.interpolate is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df["A"].interpolate(downcast="infer") - expected = Series([1, 2, 3, 4], name="A") - tm.assert_series_equal(result, expected) - - def test_inerpolate_invalid_downcast(self): - # GH#53103 - df = DataFrame( - { - "A": [1.0, 2.0, np.nan, 4.0], - "B": [1, 4, 9, np.nan], - "C": [1, 2, 3, 5], - "D": list("abcd"), - } - ) - - msg = "downcast must be either None or 'infer'" - msg2 = "The 'downcast' keyword in DataFrame.interpolate is deprecated" - msg3 = "The 'downcast' keyword in Series.interpolate is deprecated" - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=msg2): - df.interpolate(downcast="int64") - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=msg3): - df["A"].interpolate(downcast="int64") - def test_interp_nan_idx(self): df = DataFrame({"A": [1, 2, np.nan, 4], "B": [np.nan, 2, 3, 4]}) df = df.set_index("A") @@ -262,11 +205,6 @@ def test_interp_alt_scipy(self): expected.loc[5, "A"] = 6 tm.assert_frame_equal(result, expected) - msg = "The 'downcast' keyword in DataFrame.interpolate is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.interpolate(method="barycentric", downcast="infer") - tm.assert_frame_equal(result, expected.astype(np.int64)) - result = df.interpolate(method="krogh") expectedk = df.copy() expectedk["A"] = expected["A"] @@ -356,57 +294,26 @@ def test_interp_raise_on_only_mixed(self, axis): "E": [1, 2, 3, 4], } ) - msg = ( - "Cannot interpolate with all object-dtype columns " - "in the DataFrame. Try setting at least one " - "column to a numeric dtype." - ) + msg = "DataFrame cannot interpolate with object dtype" with pytest.raises(TypeError, match=msg): df.astype("object").interpolate(axis=axis) def test_interp_raise_on_all_object_dtype(self): # GH 22985 df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, dtype="object") - msg = ( - "Cannot interpolate with all object-dtype columns " - "in the DataFrame. Try setting at least one " - "column to a numeric dtype." 
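# The interpolate hunks around this point replace the old verbose messages
# (and deprecation warnings) with short, enforced errors. A minimal sketch of
# the behavior the updated tests assert -- this assumes a pandas build that
# includes this diff (3.x semantics) and is an illustration, not part of the
# patch:
import numpy as np
import pandas as pd

df = pd.DataFrame({"A": [1.0, 2.0, np.nan, 4.0], "D": list("abcd")})
try:
    df.interpolate()  # frame contains a non-numeric column
except TypeError as exc:
    print(exc)  # "...cannot interpolate with object dtype" (or "str dtype")
try:
    df[["A"]].interpolate(method="not_a_method")
except ValueError as exc:
    print(exc)  # "Can not interpolate with method=not_a_method"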
- ) + msg = "DataFrame cannot interpolate with object dtype" with pytest.raises(TypeError, match=msg): df.interpolate() - def test_interp_inplace(self, using_copy_on_write): + def test_interp_inplace(self): df = DataFrame({"a": [1.0, 2.0, np.nan, 4.0]}) - expected = DataFrame({"a": [1.0, 2.0, 3.0, 4.0]}) - expected_cow = df.copy() + expected = df.copy() result = df.copy() - if using_copy_on_write: - with tm.raises_chained_assignment_error(): - return_value = result["a"].interpolate(inplace=True) - assert return_value is None - tm.assert_frame_equal(result, expected_cow) - else: - with tm.assert_produces_warning(FutureWarning, match="inplace method"): - return_value = result["a"].interpolate(inplace=True) - assert return_value is None - tm.assert_frame_equal(result, expected) - - result = df.copy() - msg = "The 'downcast' keyword in Series.interpolate is deprecated" - - if using_copy_on_write: - with tm.assert_produces_warning( - (FutureWarning, ChainedAssignmentError), match=msg - ): - return_value = result["a"].interpolate(inplace=True, downcast="infer") - assert return_value is None - tm.assert_frame_equal(result, expected_cow) - else: - with tm.assert_produces_warning(FutureWarning, match=msg): - return_value = result["a"].interpolate(inplace=True, downcast="infer") - assert return_value is None - tm.assert_frame_equal(result, expected.astype("int64")) + with tm.raises_chained_assignment_error(): + return_value = result["a"].interpolate(inplace=True) + assert return_value is None + tm.assert_frame_equal(result, expected) def test_interp_inplace_row(self): # GH 10395 @@ -436,15 +343,11 @@ def test_interp_ignore_all_good(self): "D": np.array([1.0, 2.0, 3.0, 4.0], dtype="float64"), } ) - - msg = "The 'downcast' keyword in DataFrame.interpolate is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.interpolate(downcast=None) + result = df.interpolate() tm.assert_frame_equal(result, expected) # all good - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df[["B", "D"]].interpolate(downcast=None) + result = df[["B", "D"]].interpolate() tm.assert_frame_equal(result, df[["B", "D"]]) def test_interp_time_inplace_axis(self): @@ -474,14 +377,8 @@ def test_interp_string_axis(self, axis_name, axis_number): @pytest.mark.parametrize("multiblock", [True, False]) @pytest.mark.parametrize("method", ["ffill", "bfill", "pad"]) - def test_interp_fillna_methods( - self, request, axis, multiblock, method, using_array_manager - ): + def test_interp_fillna_methods(self, axis, multiblock, method): # GH 12918 - if using_array_manager and axis in (1, "columns"): - # TODO(ArrayManager) support axis=1 - td.mark_array_manager_not_yet_implemented(request) - df = DataFrame( { "A": [1.0, 2.0, 3.0, 4.0, np.nan, 5.0], @@ -493,12 +390,9 @@ def test_interp_fillna_methods( df["D"] = np.nan df["E"] = 1.0 - method2 = method if method != "pad" else "ffill" - expected = getattr(df, method2)(axis=axis) - msg = f"DataFrame.interpolate with method={method} is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.interpolate(method=method, axis=axis) - tm.assert_frame_equal(result, expected) + msg = f"Can not interpolate with method={method}" + with pytest.raises(ValueError, match=msg): + df.interpolate(method=method, axis=axis) def test_interpolate_empty_df(self): # GH#53199 @@ -508,8 +402,41 @@ def test_interpolate_empty_df(self): assert result is None tm.assert_frame_equal(df, expected) - def test_interpolate_ea_raise(self): + def 
test_interpolate_ea(self, any_int_ea_dtype): # GH#55347 - df = DataFrame({"a": [1, None, 2]}, dtype="Int64") - with pytest.raises(NotImplementedError, match="does not implement"): - df.interpolate() + df = DataFrame({"a": [1, None, None, None, 3]}, dtype=any_int_ea_dtype) + orig = df.copy() + result = df.interpolate(limit=2) + expected = DataFrame({"a": [1, 1.5, 2.0, None, 3]}, dtype="Float64") + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(df, orig) + + @pytest.mark.parametrize( + "dtype", + [ + "Float64", + "Float32", + pytest.param("float32[pyarrow]", marks=td.skip_if_no("pyarrow")), + pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow")), + ], + ) + def test_interpolate_ea_float(self, dtype): + # GH#55347 + df = DataFrame({"a": [1, None, None, None, 3]}, dtype=dtype) + orig = df.copy() + result = df.interpolate(limit=2) + expected = DataFrame({"a": [1, 1.5, 2.0, None, 3]}, dtype=dtype) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(df, orig) + + @pytest.mark.parametrize( + "dtype", + ["int64", "uint64", "int32", "int16", "int8", "uint32", "uint16", "uint8"], + ) + def test_interpolate_arrow(self, dtype): + # GH#55347 + pytest.importorskip("pyarrow") + df = DataFrame({"a": [1, None, None, None, 3]}, dtype=dtype + "[pyarrow]") + result = df.interpolate(limit=2) + expected = DataFrame({"a": [1, 1.5, 2.0, None, 3]}, dtype="float64[pyarrow]") + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_is_homogeneous_dtype.py b/pandas/tests/frame/methods/test_is_homogeneous_dtype.py index 1fe28cb8eb856..086986702d24f 100644 --- a/pandas/tests/frame/methods/test_is_homogeneous_dtype.py +++ b/pandas/tests/frame/methods/test_is_homogeneous_dtype.py @@ -1,16 +1,11 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas import ( Categorical, DataFrame, ) -# _is_homogeneous_type always returns True for ArrayManager -pytestmark = td.skip_array_manager_invalid_test - @pytest.mark.parametrize( "data, expected", diff --git a/pandas/tests/frame/methods/test_join.py b/pandas/tests/frame/methods/test_join.py index 735f6c50ab739..aaa9485cab580 100644 --- a/pandas/tests/frame/methods/test_join.py +++ b/pandas/tests/frame/methods/test_join.py @@ -1,4 +1,6 @@ from datetime import datetime +import re +import zoneinfo import numpy as np import pytest @@ -17,25 +19,6 @@ from pandas.core.reshape.concat import concat -@pytest.fixture -def frame_with_period_index(): - return DataFrame( - data=np.arange(20).reshape(4, 5), - columns=list("abcde"), - index=period_range(start="2000", freq="Y", periods=4), - ) - - -@pytest.fixture -def left(): - return DataFrame({"a": [20, 10, 0]}, index=[2, 1, 0]) - - -@pytest.fixture -def right(): - return DataFrame({"b": [300, 100, 200]}, index=[3, 1, 2]) - - @pytest.fixture def left_no_dup(): return DataFrame( @@ -112,7 +95,9 @@ def right_w_dups(right_no_dup): ), ], ) -def test_join(left, right, how, sort, expected): +def test_join(how, sort, expected): + left = DataFrame({"a": [20, 10, 0]}, index=[2, 1, 0]) + right = DataFrame({"b": [300, 100, 200]}, index=[3, 1, 2]) result = left.join(right, how=how, sort=sort, validate="1:1") tm.assert_frame_equal(result, expected) @@ -292,7 +277,21 @@ def test_join_index(float_frame): tm.assert_index_equal(joined.index, float_frame.index.sort_values()) tm.assert_index_equal(joined.columns, expected_columns) - with pytest.raises(ValueError, match="join method"): + # left anti + joined = f.join(f2, how="left_anti") + 
tm.assert_index_equal(joined.index, float_frame.index[:5]) + tm.assert_index_equal(joined.columns, expected_columns) + + # right anti + joined = f.join(f2, how="right_anti") + tm.assert_index_equal(joined.index, float_frame.index[10:][::-1]) + tm.assert_index_equal(joined.columns, expected_columns) + + join_msg = ( + "'foo' is not a valid Merge type: left, right, inner, outer, " + "left_anti, right_anti, cross, asof" + ) + with pytest.raises(ValueError, match=re.escape(join_msg)): f.join(f2, how="foo") # corner case - overlapping columns @@ -347,7 +346,12 @@ def test_join_overlap(float_frame): tm.assert_frame_equal(joined, expected.loc[:, joined.columns]) -def test_join_period_index(frame_with_period_index): +def test_join_period_index(): + frame_with_period_index = DataFrame( + data=np.arange(20).reshape(4, 5), + columns=list("abcde"), + index=period_range(start="2000", freq="Y", periods=4), + ) other = frame_with_period_index.rename(columns=lambda key: f"{key}{key}") joined_values = np.concatenate([frame_with_period_index.values] * 2, axis=1) @@ -391,8 +395,8 @@ def test_join_list_series(float_frame): tm.assert_frame_equal(result, float_frame) -@pytest.mark.parametrize("sort_kw", [True, False]) -def test_suppress_future_warning_with_sort_kw(sort_kw): +def test_suppress_future_warning_with_sort_kw(sort): + sort_kw = sort a = DataFrame({"col1": [1, 2]}, index=["c", "a"]) b = DataFrame({"col2": [4, 5]}, index=["b", "a"]) @@ -555,17 +559,14 @@ def test_merge_join_different_levels_raises(self): df1.join(df2, on="a") def test_frame_join_tzaware(self): + tz = zoneinfo.ZoneInfo("US/Central") test1 = DataFrame( np.zeros((6, 3)), - index=date_range( - "2012-11-15 00:00:00", periods=6, freq="100ms", tz="US/Central" - ), + index=date_range("2012-11-15 00:00:00", periods=6, freq="100ms", tz=tz), ) test2 = DataFrame( np.zeros((3, 3)), - index=date_range( - "2012-11-15 00:00:00", periods=3, freq="250ms", tz="US/Central" - ), + index=date_range("2012-11-15 00:00:00", periods=3, freq="250ms", tz=tz), columns=range(3, 6), ) @@ -573,4 +574,4 @@ def test_frame_join_tzaware(self): expected = test1.index.union(test2.index) tm.assert_index_equal(result.index, expected) - assert result.index.tz.zone == "US/Central" + assert result.index.tz.key == "US/Central" diff --git a/pandas/tests/frame/methods/test_map.py b/pandas/tests/frame/methods/test_map.py index 03681c3df844e..9850de14b2092 100644 --- a/pandas/tests/frame/methods/test_map.py +++ b/pandas/tests/frame/methods/test_map.py @@ -33,7 +33,6 @@ def test_map_float_object_conversion(val): assert result == object -@pytest.mark.parametrize("na_action", [None, "ignore"]) def test_map_keeps_dtype(na_action): # GH52219 arr = Series(["a", np.nan, "b"]) @@ -159,14 +158,15 @@ def test_map_box(): tm.assert_frame_equal(result, expected) -def test_frame_map_dont_convert_datetime64(): - df = DataFrame({"x1": [datetime(1996, 1, 1)]}) +def test_frame_map_dont_convert_datetime64(unit): + dtype = f"M8[{unit}]" + df = DataFrame({"x1": [datetime(1996, 1, 1)]}, dtype=dtype) df = df.map(lambda x: x + BDay()) df = df.map(lambda x: x + BDay()) result = df.x1.dtype - assert result == "M8[ns]" + assert result == dtype def test_map_function_runs_once(): @@ -206,11 +206,3 @@ def test_map_invalid_na_action(float_frame): # GH 23803 with pytest.raises(ValueError, match="na_action must be .*Got 'abc'"): float_frame.map(lambda x: len(str(x)), na_action="abc") - - -def test_applymap_deprecated(): - # GH52353 - df = DataFrame({"a": [1, 2, 3]}) - msg = "DataFrame.applymap has been deprecated. 
Use DataFrame.map instead." - with tm.assert_produces_warning(FutureWarning, match=msg): - df.applymap(lambda x: x) diff --git a/pandas/tests/frame/methods/test_nlargest.py b/pandas/tests/frame/methods/test_nlargest.py index 3ba893501914a..08b7128e6ec11 100644 --- a/pandas/tests/frame/methods/test_nlargest.py +++ b/pandas/tests/frame/methods/test_nlargest.py @@ -2,6 +2,7 @@ Note: for naming purposes, most tests are title with as e.g. "test_nlargest_foo" but are implicitly also testing nsmallest_foo. """ + from string import ascii_lowercase import numpy as np @@ -12,25 +13,6 @@ from pandas.util.version import Version -@pytest.fixture -def df_duplicates(): - return pd.DataFrame( - {"a": [1, 2, 3, 4, 4], "b": [1, 1, 1, 1, 1], "c": [0, 1, 2, 5, 4]}, - index=[0, 0, 1, 1, 1], - ) - - -@pytest.fixture -def df_strings(): - return pd.DataFrame( - { - "a": np.random.default_rng(2).permutation(10), - "b": list(ascii_lowercase[:10]), - "c": np.random.default_rng(2).permutation(10).astype("float64"), - } - ) - - @pytest.fixture def df_main_dtypes(): return pd.DataFrame( @@ -81,12 +63,18 @@ class TestNLargestNSmallest: ], ) @pytest.mark.parametrize("n", range(1, 11)) - def test_nlargest_n(self, df_strings, nselect_method, n, order): + def test_nlargest_n(self, nselect_method, n, order): # GH#10393 - df = df_strings + df = pd.DataFrame( + { + "a": np.random.default_rng(2).permutation(10), + "b": list(ascii_lowercase[:10]), + "c": np.random.default_rng(2).permutation(10).astype("float64"), + } + ) if "b" in order: error_msg = ( - f"Column 'b' has dtype (object|string), " + f"Column 'b' has dtype (object|str), " f"cannot use method '{nselect_method}' with this dtype" ) with pytest.raises(TypeError, match=error_msg): @@ -94,6 +82,7 @@ def test_nlargest_n(self, df_strings, nselect_method, n, order): else: ascending = nselect_method == "nsmallest" result = getattr(df, nselect_method)(n, order) + result.index = pd.Index(list(result.index)) expected = df.sort_values(order, ascending=ascending).head(n) tm.assert_frame_equal(result, expected) @@ -144,7 +133,7 @@ def test_nlargest_n_identical_values(self): df = pd.DataFrame({"a": [1] * 5, "b": [1, 2, 3, 4, 5]}) result = df.nlargest(3, "a") - expected = pd.DataFrame({"a": [1] * 3, "b": [1, 2, 3]}, index=[0, 1, 2]) + expected = pd.DataFrame({"a": [1] * 3, "b": [1, 2, 3]}, index=range(3)) tm.assert_frame_equal(result, expected) result = df.nsmallest(3, "a") @@ -156,18 +145,21 @@ def test_nlargest_n_identical_values(self): [["a", "b", "c"], ["c", "b", "a"], ["a"], ["b"], ["a", "b"], ["c", "b"]], ) @pytest.mark.parametrize("n", range(1, 6)) - def test_nlargest_n_duplicate_index(self, df_duplicates, n, order, request): + def test_nlargest_n_duplicate_index(self, n, order, request): # GH#13412 - df = df_duplicates + df = pd.DataFrame( + {"a": [1, 2, 3, 4, 4], "b": [1, 1, 1, 1, 1], "c": [0, 1, 2, 5, 4]}, + index=[0, 0, 1, 1, 1], + ) result = df.nsmallest(n, order) - expected = df.sort_values(order).head(n) + expected = df.sort_values(order, kind="stable").head(n) tm.assert_frame_equal(result, expected) result = df.nlargest(n, order) - expected = df.sort_values(order, ascending=False).head(n) + expected = df.sort_values(order, ascending=False, kind="stable").head(n) if Version(np.__version__) >= Version("1.25") and ( - (order == ["a"] and n in (1, 2, 3, 4)) or (order == ["a", "b"]) and n == 5 + (order == ["a"] and n in (1, 2, 3, 4)) or ((order == ["a", "b"]) and n == 5) ): request.applymarker( pytest.mark.xfail( @@ -188,18 +180,20 @@ def 
test_nlargest_duplicate_keep_all_ties(self): result = df.nlargest(4, "a", keep="all") expected = pd.DataFrame( { - "a": {0: 5, 1: 4, 2: 4, 4: 3, 5: 3, 6: 3, 7: 3}, - "b": {0: 10, 1: 9, 2: 8, 4: 5, 5: 50, 6: 10, 7: 20}, - } + "a": [5, 4, 4, 3, 3, 3, 3], + "b": [10, 9, 8, 5, 50, 10, 20], + }, + index=[0, 1, 2, 4, 5, 6, 7], ) tm.assert_frame_equal(result, expected) result = df.nsmallest(2, "a", keep="all") expected = pd.DataFrame( { - "a": {3: 2, 4: 3, 5: 3, 6: 3, 7: 3}, - "b": {3: 7, 4: 5, 5: 50, 6: 10, 7: 20}, - } + "a": [2, 3, 3, 3, 3], + "b": [7, 5, 50, 10, 20], + }, + index=range(3, 8), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_pct_change.py b/pandas/tests/frame/methods/test_pct_change.py index 92b66e12d4356..7d4197577228e 100644 --- a/pandas/tests/frame/methods/test_pct_change.py +++ b/pandas/tests/frame/methods/test_pct_change.py @@ -10,30 +10,17 @@ class TestDataFramePctChange: @pytest.mark.parametrize( - "periods, fill_method, limit, exp", + "periods, exp", [ - (1, "ffill", None, [np.nan, np.nan, np.nan, 1, 1, 1.5, 0, 0]), - (1, "ffill", 1, [np.nan, np.nan, np.nan, 1, 1, 1.5, 0, np.nan]), - (1, "bfill", None, [np.nan, 0, 0, 1, 1, 1.5, np.nan, np.nan]), - (1, "bfill", 1, [np.nan, np.nan, 0, 1, 1, 1.5, np.nan, np.nan]), - (-1, "ffill", None, [np.nan, np.nan, -0.5, -0.5, -0.6, 0, 0, np.nan]), - (-1, "ffill", 1, [np.nan, np.nan, -0.5, -0.5, -0.6, 0, np.nan, np.nan]), - (-1, "bfill", None, [0, 0, -0.5, -0.5, -0.6, np.nan, np.nan, np.nan]), - (-1, "bfill", 1, [np.nan, 0, -0.5, -0.5, -0.6, np.nan, np.nan, np.nan]), + (1, [np.nan, np.nan, np.nan, 1, 1, 1.5, np.nan, np.nan]), + (-1, [np.nan, np.nan, -0.5, -0.5, -0.6, np.nan, np.nan, np.nan]), ], ) - def test_pct_change_with_nas( - self, periods, fill_method, limit, exp, frame_or_series - ): + def test_pct_change_with_nas(self, periods, exp, frame_or_series): vals = [np.nan, np.nan, 1, 2, 4, 10, np.nan, np.nan] obj = frame_or_series(vals) - msg = ( - "The 'fill_method' keyword being not None and the 'limit' keyword in " - f"{type(obj).__name__}.pct_change are deprecated" - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - res = obj.pct_change(periods=periods, fill_method=fill_method, limit=limit) + res = obj.pct_change(periods=periods) tm.assert_equal(res, frame_or_series(exp)) def test_pct_change_numeric(self): @@ -45,40 +32,28 @@ def test_pct_change_numeric(self): pnl.iat[1, 1] = np.nan pnl.iat[2, 3] = 60 - msg = ( - "The 'fill_method' keyword being not None and the 'limit' keyword in " - "DataFrame.pct_change are deprecated" - ) - for axis in range(2): - expected = pnl.ffill(axis=axis) / pnl.ffill(axis=axis).shift(axis=axis) - 1 - - with tm.assert_produces_warning(FutureWarning, match=msg): - result = pnl.pct_change(axis=axis, fill_method="pad") + expected = pnl / pnl.shift(axis=axis) - 1 + result = pnl.pct_change(axis=axis) tm.assert_frame_equal(result, expected) def test_pct_change(self, datetime_frame): - msg = ( - "The 'fill_method' keyword being not None and the 'limit' keyword in " - "DataFrame.pct_change are deprecated" - ) - - rs = datetime_frame.pct_change(fill_method=None) + rs = datetime_frame.pct_change() tm.assert_frame_equal(rs, datetime_frame / datetime_frame.shift(1) - 1) rs = datetime_frame.pct_change(2) filled = datetime_frame.ffill() tm.assert_frame_equal(rs, filled / filled.shift(2) - 1) - with tm.assert_produces_warning(FutureWarning, match=msg): - rs = datetime_frame.pct_change(fill_method="bfill", limit=1) - filled = datetime_frame.bfill(limit=1) - 
tm.assert_frame_equal(rs, filled / filled.shift(1) - 1) + rs = datetime_frame.pct_change() + tm.assert_frame_equal(rs, datetime_frame / datetime_frame.shift(1) - 1) rs = datetime_frame.pct_change(freq="5D") - filled = datetime_frame.ffill() tm.assert_frame_equal( - rs, (filled / filled.shift(freq="5D") - 1).reindex_like(filled) + rs, + (datetime_frame / datetime_frame.shift(freq="5D") - 1).reindex_like( + datetime_frame + ), ) def test_pct_change_shift_over_nas(self): @@ -86,75 +61,45 @@ def test_pct_change_shift_over_nas(self): df = DataFrame({"a": s, "b": s}) - msg = "The default fill_method='pad' in DataFrame.pct_change is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - chg = df.pct_change() - - expected = Series([np.nan, 0.5, 0.0, 2.5 / 1.5 - 1, 0.2]) + chg = df.pct_change() + expected = Series([np.nan, 0.5, np.nan, np.nan, 0.2]) edf = DataFrame({"a": expected, "b": expected}) tm.assert_frame_equal(chg, edf) @pytest.mark.parametrize( - "freq, periods, fill_method, limit", + "freq, periods", [ - ("5B", 5, None, None), - ("3B", 3, None, None), - ("3B", 3, "bfill", None), - ("7B", 7, "pad", 1), - ("7B", 7, "bfill", 3), - ("14B", 14, None, None), + ("5B", 5), + ("3B", 3), + ("14B", 14), ], ) def test_pct_change_periods_freq( - self, datetime_frame, freq, periods, fill_method, limit + self, + datetime_frame, + freq, + periods, ): - msg = ( - "The 'fill_method' keyword being not None and the 'limit' keyword in " - "DataFrame.pct_change are deprecated" - ) - # GH#7292 - with tm.assert_produces_warning(FutureWarning, match=msg): - rs_freq = datetime_frame.pct_change( - freq=freq, fill_method=fill_method, limit=limit - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - rs_periods = datetime_frame.pct_change( - periods, fill_method=fill_method, limit=limit - ) + rs_freq = datetime_frame.pct_change(freq=freq) + rs_periods = datetime_frame.pct_change(periods) tm.assert_frame_equal(rs_freq, rs_periods) empty_ts = DataFrame(index=datetime_frame.index, columns=datetime_frame.columns) - with tm.assert_produces_warning(FutureWarning, match=msg): - rs_freq = empty_ts.pct_change( - freq=freq, fill_method=fill_method, limit=limit - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - rs_periods = empty_ts.pct_change( - periods, fill_method=fill_method, limit=limit - ) + rs_freq = empty_ts.pct_change(freq=freq) + rs_periods = empty_ts.pct_change(periods) tm.assert_frame_equal(rs_freq, rs_periods) -@pytest.mark.parametrize("fill_method", ["pad", "ffill", None]) -def test_pct_change_with_duplicated_indices(fill_method): +def test_pct_change_with_duplicated_indices(): # GH30463 data = DataFrame( {0: [np.nan, 1, 2, 3, 9, 18], 1: [0, 1, np.nan, 3, 9, 18]}, index=["a", "b"] * 3 ) - warn = None if fill_method is None else FutureWarning - msg = ( - "The 'fill_method' keyword being not None and the 'limit' keyword in " - "DataFrame.pct_change are deprecated" - ) - with tm.assert_produces_warning(warn, match=msg): - result = data.pct_change(fill_method=fill_method) + result = data.pct_change() - if fill_method is None: - second_column = [np.nan, np.inf, np.nan, np.nan, 2.0, 1.0] - else: - second_column = [np.nan, np.inf, 0.0, 2.0, 2.0, 1.0] + second_column = [np.nan, np.inf, np.nan, np.nan, 2.0, 1.0] expected = DataFrame( {0: [np.nan, np.nan, 1.0, 0.5, 2.0, 1.0], 1: second_column}, index=["a", "b"] * 3, @@ -162,7 +107,7 @@ def test_pct_change_with_duplicated_indices(fill_method): tm.assert_frame_equal(result, expected) -def 
test_pct_change_none_beginning_no_warning(): +def test_pct_change_none_beginning(): # GH#54481 df = DataFrame( [ diff --git a/pandas/tests/frame/methods/test_pop.py b/pandas/tests/frame/methods/test_pop.py index 3eb058015cd3d..617f0c3a27885 100644 --- a/pandas/tests/frame/methods/test_pop.py +++ b/pandas/tests/frame/methods/test_pop.py @@ -9,7 +9,7 @@ class TestDataFramePop: - def test_pop(self, float_frame, warn_copy_on_write): + def test_pop(self, float_frame): float_frame.columns.name = "baz" float_frame.pop("A") @@ -23,8 +23,7 @@ def test_pop(self, float_frame, warn_copy_on_write): # gh-10912: inplace ops cause caching issue a = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"], index=["X", "Y"]) b = a.pop("B") - with tm.assert_cow_warning(warn_copy_on_write): - b += 1 + b += 1 # original frame expected = DataFrame([[1, 3], [4, 6]], columns=["A", "C"], index=["X", "Y"]) diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index 0f27eae1a3bfc..d7baac7264a1d 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -45,9 +45,7 @@ def test_quantile_sparse(self, df, expected): expected = expected.astype("Sparse[float]") tm.assert_series_equal(result, expected) - def test_quantile( - self, datetime_frame, interp_method, using_array_manager, request - ): + def test_quantile(self, datetime_frame, interp_method, request): interpolation, method = interp_method df = datetime_frame result = df.quantile( @@ -63,11 +61,6 @@ def test_quantile( tm.assert_series_equal(result, expected) else: tm.assert_index_equal(result.index, expected.index) - request.applymarker( - pytest.mark.xfail( - using_array_manager, reason="Name set incorrectly for arraymanager" - ) - ) assert result.name == expected.name result = df.quantile( @@ -83,11 +76,6 @@ def test_quantile( tm.assert_series_equal(result, expected) else: tm.assert_index_equal(result.index, expected.index) - request.applymarker( - pytest.mark.xfail( - using_array_manager, reason="Name set incorrectly for arraymanager" - ) - ) assert result.name == expected.name def test_empty(self, interp_method): @@ -97,7 +85,7 @@ def test_empty(self, interp_method): ) assert np.isnan(q["x"]) and np.isnan(q["y"]) - def test_non_numeric_exclusion(self, interp_method, request, using_array_manager): + def test_non_numeric_exclusion(self, interp_method, request): interpolation, method = interp_method df = DataFrame({"col1": ["A", "A", "B", "B"], "col2": [1, 2, 3, 4]}) rs = df.quantile( @@ -106,11 +94,9 @@ def test_non_numeric_exclusion(self, interp_method, request, using_array_manager xp = df.median(numeric_only=True).rename(0.5) if interpolation == "nearest": xp = (xp + 0.5).astype(np.int64) - if method == "table" and using_array_manager: - request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) tm.assert_series_equal(rs, xp) - def test_axis(self, interp_method, request, using_array_manager): + def test_axis(self, interp_method): # axis interpolation, method = interp_method df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) @@ -118,8 +104,6 @@ def test_axis(self, interp_method, request, using_array_manager): expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5) if interpolation == "nearest": expected = expected.astype(np.int64) - if method == "table" and using_array_manager: - request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) tm.assert_series_equal(result, expected) result = df.quantile( @@ -134,7 
+118,7 @@ def test_axis(self, interp_method, request, using_array_manager): expected = expected.astype(np.int64) tm.assert_frame_equal(result, expected, check_index_type=True) - def test_axis_numeric_only_true(self, interp_method, request, using_array_manager): + def test_axis_numeric_only_true(self, interp_method): # We may want to break API in the future to change this # so that we exclude non-numeric along the same axis # See GH #7312 @@ -143,14 +127,12 @@ def test_axis_numeric_only_true(self, interp_method, request, using_array_manage result = df.quantile( 0.5, axis=1, numeric_only=True, interpolation=interpolation, method=method ) - expected = Series([3.0, 4.0], index=[0, 1], name=0.5) + expected = Series([3.0, 4.0], index=range(2), name=0.5) if interpolation == "nearest": expected = expected.astype(np.int64) - if method == "table" and using_array_manager: - request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) tm.assert_series_equal(result, expected) - def test_quantile_date_range(self, interp_method, request, using_array_manager): + def test_quantile_date_range(self, interp_method): # GH 2460 interpolation, method = interp_method dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific") @@ -163,12 +145,10 @@ def test_quantile_date_range(self, interp_method, request, using_array_manager): expected = Series( ["2016-01-02 00:00:00"], name=0.5, dtype="datetime64[ns, US/Pacific]" ) - if method == "table" and using_array_manager: - request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) tm.assert_series_equal(result, expected) - def test_quantile_axis_mixed(self, interp_method, request, using_array_manager): + def test_quantile_axis_mixed(self, interp_method): # mixed on axis=1 interpolation, method = interp_method df = DataFrame( @@ -185,8 +165,6 @@ def test_quantile_axis_mixed(self, interp_method, request, using_array_manager): expected = Series([1.5, 2.5, 3.5], name=0.5) if interpolation == "nearest": expected -= 0.5 - if method == "table" and using_array_manager: - request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) tm.assert_series_equal(result, expected) # must raise @@ -194,11 +172,9 @@ def test_quantile_axis_mixed(self, interp_method, request, using_array_manager): with pytest.raises(TypeError, match=msg): df.quantile(0.5, axis=1, numeric_only=False) - def test_quantile_axis_parameter(self, interp_method, request, using_array_manager): + def test_quantile_axis_parameter(self, interp_method): # GH 9543/9544 interpolation, method = interp_method - if method == "table" and using_array_manager: - request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) result = df.quantile(0.5, axis=0, interpolation=interpolation, method=method) @@ -312,7 +288,7 @@ def test_quantile_interpolation_int(self, int_frame): assert q1["A"] == np.percentile(df["A"], 10) tm.assert_series_equal(q, q1) - def test_quantile_multi(self, interp_method, request, using_array_manager): + def test_quantile_multi(self, interp_method): interpolation, method = interp_method df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["a", "b", "c"]) result = df.quantile([0.25, 0.5], interpolation=interpolation, method=method) @@ -323,11 +299,9 @@ def test_quantile_multi(self, interp_method, request, using_array_manager): ) if interpolation == "nearest": expected = expected.astype(np.int64) - if method == "table" and using_array_manager: - 
request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) tm.assert_frame_equal(result, expected) - def test_quantile_multi_axis_1(self, interp_method, request, using_array_manager): + def test_quantile_multi_axis_1(self, interp_method): interpolation, method = interp_method df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["a", "b", "c"]) result = df.quantile( @@ -338,8 +312,6 @@ def test_quantile_multi_axis_1(self, interp_method, request, using_array_manager ) if interpolation == "nearest": expected = expected.astype(np.int64) - if method == "table" and using_array_manager: - request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) tm.assert_frame_equal(result, expected) def test_quantile_multi_empty(self, interp_method): @@ -398,11 +370,11 @@ def test_quantile_datetime(self, unit): # empty when numeric_only=True result = df[["a", "c"]].quantile(0.5, numeric_only=True) - expected = Series([], index=[], dtype=np.float64, name=0.5) + expected = Series([], index=Index([], dtype="str"), dtype=np.float64, name=0.5) tm.assert_series_equal(result, expected) result = df[["a", "c"]].quantile([0.5], numeric_only=True) - expected = DataFrame(index=[0.5], columns=[]) + expected = DataFrame(index=[0.5], columns=Index([], dtype="str")) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( @@ -422,7 +394,7 @@ def test_quantile_dt64_empty(self, dtype, interp_method): res = df.quantile( 0.5, axis=1, numeric_only=False, interpolation=interpolation, method=method ) - expected = Series([], index=[], name=0.5, dtype=dtype) + expected = Series([], index=Index([], dtype="str"), name=0.5, dtype=dtype) tm.assert_series_equal(res, expected) # no columns in result, so no dtype preservation @@ -433,7 +405,7 @@ def test_quantile_dt64_empty(self, dtype, interp_method): interpolation=interpolation, method=method, ) - expected = DataFrame(index=[0.5], columns=[]) + expected = DataFrame(index=[0.5], columns=Index([], dtype="str")) tm.assert_frame_equal(res, expected) @pytest.mark.parametrize("invalid", [-1, 2, [0.5, -1], [0.5, 2]]) @@ -443,10 +415,8 @@ def test_quantile_invalid(self, invalid, datetime_frame, interp_method): with pytest.raises(ValueError, match=msg): datetime_frame.quantile(invalid, interpolation=interpolation, method=method) - def test_quantile_box(self, interp_method, request, using_array_manager): + def test_quantile_box(self, interp_method): interpolation, method = interp_method - if method == "table" and using_array_manager: - request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) df = DataFrame( { "A": [ @@ -574,10 +544,8 @@ def test_quantile_box_nat(self): ) tm.assert_frame_equal(res, exp) - def test_quantile_nan(self, interp_method, request, using_array_manager): + def test_quantile_nan(self, interp_method): interpolation, method = interp_method - if method == "table" and using_array_manager: - request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) # GH 14357 - float block where some cols have missing values df = DataFrame({"a": np.arange(1, 6.0), "b": np.arange(1, 6.0)}) df.iloc[-1, 1] = np.nan @@ -621,10 +589,8 @@ def test_quantile_nan(self, interp_method, request, using_array_manager): exp = DataFrame({"a": [3.0, 4.0], "b": [np.nan, np.nan]}, index=[0.5, 0.75]) tm.assert_frame_equal(res, exp) - def test_quantile_nat(self, interp_method, request, using_array_manager, unit): + def test_quantile_nat(self, interp_method, unit): interpolation, method = interp_method - if method == "table" and 
using_array_manager: - request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) # full NaT column df = DataFrame({"a": [pd.NaT, pd.NaT, pd.NaT]}, dtype=f"M8[{unit}]") @@ -689,11 +655,11 @@ def test_quantile_empty_no_rows_floats(self, interp_method): tm.assert_frame_equal(res, exp) res = df.quantile(0.5, axis=1, interpolation=interpolation, method=method) - exp = Series([], index=[], dtype="float64", name=0.5) + exp = Series([], index=Index([], dtype="str"), dtype="float64", name=0.5) tm.assert_series_equal(res, exp) res = df.quantile([0.5], axis=1, interpolation=interpolation, method=method) - exp = DataFrame(columns=[], index=[0.5]) + exp = DataFrame(columns=Index([], dtype="str"), index=[0.5]) tm.assert_frame_equal(res, exp) def test_quantile_empty_no_rows_ints(self, interp_method): @@ -725,9 +691,7 @@ def test_quantile_empty_no_rows_dt64(self, interp_method): exp = exp.astype(object) if interpolation == "nearest": # GH#18463 TODO: would we prefer NaTs here? - msg = "The 'downcast' keyword in fillna is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - exp = exp.fillna(np.nan, downcast=False) + exp = exp.fillna(np.nan) tm.assert_series_equal(res, exp) # both dt64tz @@ -746,20 +710,18 @@ def test_quantile_empty_no_columns(self, interp_method): result = df.quantile( 0.5, numeric_only=True, interpolation=interpolation, method=method ) - expected = Series([], index=[], name=0.5, dtype=np.float64) + expected = Series([], name=0.5, dtype=np.float64) expected.index.name = "captain tightpants" tm.assert_series_equal(result, expected) result = df.quantile( [0.5], numeric_only=True, interpolation=interpolation, method=method ) - expected = DataFrame([], index=[0.5], columns=[]) + expected = DataFrame([], index=[0.5]) expected.columns.name = "captain tightpants" tm.assert_frame_equal(result, expected) - def test_quantile_item_cache( - self, using_array_manager, interp_method, using_copy_on_write - ): + def test_quantile_item_cache(self, interp_method): # previous behavior incorrect retained an invalid _item_cache entry interpolation, method = interp_method df = DataFrame( @@ -767,19 +729,13 @@ def test_quantile_item_cache( ) df["D"] = df["A"] * 2 ser = df["A"] - if not using_array_manager: - assert len(df._mgr.blocks) == 2 + assert len(df._mgr.blocks) == 2 df.quantile(numeric_only=False, interpolation=interpolation, method=method) - if using_copy_on_write: - ser.iloc[0] = 99 - assert df.iloc[0, 0] == df["A"][0] - assert df.iloc[0, 0] != 99 - else: - ser.values[0] = 99 - assert df.iloc[0, 0] == df["A"][0] - assert df.iloc[0, 0] == 99 + ser.iloc[0] = 99 + assert df.iloc[0, 0] == df["A"][0] + assert df.iloc[0, 0] != 99 def test_invalid_method(self): with pytest.raises(ValueError, match="Invalid method: foo"): @@ -927,7 +883,10 @@ def test_empty_numeric(self, dtype, expected_data, expected_index, axis): df = DataFrame(columns=["a", "b"], dtype=dtype) result = df.quantile(0.5, axis=axis) expected = Series( - expected_data, name=0.5, index=Index(expected_index), dtype="float64" + expected_data, + name=0.5, + index=Index(expected_index, dtype="str"), + dtype="float64", ) tm.assert_series_equal(result, expected) @@ -945,7 +904,10 @@ def test_empty_datelike( df = DataFrame(columns=["a", "b"], dtype=dtype) result = df.quantile(0.5, axis=axis, numeric_only=False) expected = Series( - expected_data, name=0.5, index=Index(expected_index), dtype=expected_dtype + expected_data, + name=0.5, + index=Index(expected_index, dtype="str"), + dtype=expected_dtype, ) 
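# A quick sketch of the index-dtype change running through these quantile
# hunks: when numeric_only=True drops every column, the empty result now
# keeps the frame's column index, which is string-dtyped under the new
# string-inference default rather than a bare object Index. Assumes a pandas
# build with this diff applied; illustration only:
import pandas as pd

df = pd.DataFrame({"a": pd.to_datetime(["2020-01-01"]), "c": ["x"]})
res = df.quantile(0.5, numeric_only=True)
print(len(res), res.index.dtype)  # 0 entries, str-dtype index (was object)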
tm.assert_series_equal(result, expected) @@ -967,6 +929,18 @@ def test_datelike_numeric_only(self, expected_data, expected_index, axis): ) result = df[["a", "c"]].quantile(0.5, axis=axis, numeric_only=True) expected = Series( - expected_data, name=0.5, index=Index(expected_index), dtype=np.float64 + expected_data, + name=0.5, + index=Index(expected_index, dtype="str" if axis == 0 else "int64"), + dtype=np.float64, ) tm.assert_series_equal(result, expected) + + +def test_multi_quantile_numeric_only_retains_columns(): + df = DataFrame(list("abc")) + result = df.quantile([0.5, 0.7], numeric_only=True) + expected = DataFrame(index=[0.5, 0.7]) + tm.assert_frame_equal( + result, expected, check_index_type=True, check_column_type=True + ) diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index 8d7a0b373f5f8..6c6c208ee0c78 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -31,13 +31,6 @@ class TestRank: "dense": np.array([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3]), } - @pytest.fixture(params=["average", "min", "max", "first", "dense"]) - def method(self, request): - """ - Fixture for trying all rank methods - """ - return request.param - def test_rank(self, float_frame): sp_stats = pytest.importorskip("scipy.stats") @@ -225,8 +218,7 @@ def test_rank_axis(self): tm.assert_frame_equal(df.rank(axis=1), df.rank(axis="columns")) @pytest.mark.parametrize("ax", [0, 1]) - @pytest.mark.parametrize("m", ["average", "min", "max", "first", "dense"]) - def test_rank_methods_frame(self, ax, m): + def test_rank_methods_frame(self, ax, rank_method): sp_stats = pytest.importorskip("scipy.stats") xs = np.random.default_rng(2).integers(0, 21, (100, 26)) @@ -236,16 +228,19 @@ def test_rank_methods_frame(self, ax, m): for vals in [xs, xs + 1e6, xs * 1e-6]: df = DataFrame(vals, columns=cols) - result = df.rank(axis=ax, method=m) + result = df.rank(axis=ax, method=rank_method) sprank = np.apply_along_axis( - sp_stats.rankdata, ax, vals, m if m != "first" else "ordinal" + sp_stats.rankdata, + ax, + vals, + rank_method if rank_method != "first" else "ordinal", ) sprank = sprank.astype(np.float64) expected = DataFrame(sprank, columns=cols).astype("float64") tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype", ["O", "f8", "i8"]) - def test_rank_descending(self, method, dtype): + def test_rank_descending(self, rank_method, dtype): if "i" in dtype: df = self.df.dropna().astype(dtype) else: @@ -255,18 +250,18 @@ def test_rank_descending(self, method, dtype): expected = (df.max() - df).rank() tm.assert_frame_equal(res, expected) - expected = (df.max() - df).rank(method=method) + expected = (df.max() - df).rank(method=rank_method) if dtype != "O": - res2 = df.rank(method=method, ascending=False, numeric_only=True) + res2 = df.rank(method=rank_method, ascending=False, numeric_only=True) tm.assert_frame_equal(res2, expected) - res3 = df.rank(method=method, ascending=False, numeric_only=False) + res3 = df.rank(method=rank_method, ascending=False, numeric_only=False) tm.assert_frame_equal(res3, expected) @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("dtype", [None, object]) - def test_rank_2d_tie_methods(self, method, axis, dtype): + def test_rank_2d_tie_methods(self, rank_method, axis, dtype): df = self.df def _check2d(df, expected, method="average", axis=0): @@ -276,14 +271,14 @@ def _check2d(df, expected, method="average", axis=0): df = df.T exp_df = exp_df.T - result = df.rank(method=method, 
axis=axis) + result = df.rank(method=rank_method, axis=axis) tm.assert_frame_equal(result, exp_df) frame = df if dtype is None else df.astype(dtype) - _check2d(frame, self.results[method], method=method, axis=axis) + _check2d(frame, self.results[rank_method], method=rank_method, axis=axis) @pytest.mark.parametrize( - "method,exp", + "rank_method,exp", [ ("dense", [[1.0, 1.0, 1.0], [1.0, 0.5, 2.0 / 3], [1.0, 0.5, 1.0 / 3]]), ( @@ -312,11 +307,11 @@ def _check2d(df, expected, method="average", axis=0): ), ], ) - def test_rank_pct_true(self, method, exp): + def test_rank_pct_true(self, rank_method, exp): # see gh-15630. df = DataFrame([[2012, 66, 3], [2012, 65, 2], [2012, 65, 1]]) - result = df.rank(method=method, pct=True) + result = df.rank(method=rank_method, pct=True) expected = DataFrame(exp) tm.assert_frame_equal(result, expected) @@ -324,9 +319,7 @@ def test_rank_pct_true(self, method, exp): @pytest.mark.single_cpu def test_pct_max_many_rows(self): # GH 18271 - df = DataFrame( - {"A": np.arange(2**24 + 1), "B": np.arange(2**24 + 1, 0, -1)} - ) + df = DataFrame({"A": np.arange(2**24 + 1), "B": np.arange(2**24 + 1, 0, -1)}) result = df.rank(pct=True).max() assert (result == 1).all() @@ -400,7 +393,7 @@ def test_rank_inf_and_nan(self, contents, dtype, frame_or_series): # Insert nans at random positions if underlying dtype has missing # value. Then adjust the expected order by adding nans accordingly # This is for testing whether rank calculation is affected - # when values are interwined with nan values. + # when values are intertwined with nan values. values = np.array(contents, dtype=dtype) exp_order = np.array(range(len(values)), dtype="float64") + 1.0 if dtype in dtype_na_map: @@ -454,10 +447,10 @@ def test_rank_both_inf(self): ], ) def test_rank_inf_nans_na_option( - self, frame_or_series, method, na_option, ascending, expected + self, frame_or_series, rank_method, na_option, ascending, expected ): obj = frame_or_series([np.inf, np.nan, -np.inf]) - result = obj.rank(method=method, na_option=na_option, ascending=ascending) + result = obj.rank(method=rank_method, na_option=na_option, ascending=ascending) expected = frame_or_series(expected) tm.assert_equal(result, expected) @@ -470,14 +463,10 @@ def test_rank_inf_nans_na_option( ("top", False, [2.0, 3.0, 1.0, 4.0]), ], ) - def test_rank_object_first( - self, frame_or_series, na_option, ascending, expected, using_infer_string - ): + def test_rank_object_first(self, frame_or_series, na_option, ascending, expected): obj = frame_or_series(["foo", "foo", None, "foo"]) result = obj.rank(method="first", na_option=na_option, ascending=ascending) expected = frame_or_series(expected) - if using_infer_string and isinstance(obj, Series): - expected = expected.astype("uint64") tm.assert_equal(result, expected) @pytest.mark.parametrize( @@ -497,14 +486,15 @@ def test_rank_mixed_axis_zero(self, data, expected): result = df.rank(numeric_only=True) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - "dtype, exp_dtype", - [("string[pyarrow]", "Int64"), ("string[pyarrow_numpy]", "float64")], - ) - def test_rank_string_dtype(self, dtype, exp_dtype): + def test_rank_string_dtype(self, string_dtype_no_object): # GH#55362 - pytest.importorskip("pyarrow") - obj = Series(["foo", "foo", None, "foo"], dtype=dtype) + obj = Series(["foo", "foo", None, "foo"], dtype=string_dtype_no_object) result = obj.rank(method="first") + exp_dtype = ( + "Float64" if string_dtype_no_object == "string[pyarrow]" else "float64" + ) + if 
string_dtype_no_object.storage == "python": + # TODO nullable string[python] should also return nullable Int64 + exp_dtype = "float64" expected = Series([1, 2, None, 3], dtype=exp_dtype) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index d862e14ce86cb..37adc31fb0f4d 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ b/pandas/tests/frame/methods/test_reindex.py @@ -13,7 +13,6 @@ is_platform_windows, ) from pandas.compat.numpy import np_version_gt2 -import pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -136,7 +135,6 @@ class TestDataFrameSelectReindex: reason="Passes int32 values to DatetimeArray in make_na_array on " "windows, 32bit linux builds", ) - @td.skip_array_manager_not_yet_implemented def test_reindex_tzaware_fill_value(self): # GH#52586 df = DataFrame([[1]]) @@ -161,44 +159,6 @@ def test_reindex_tzaware_fill_value(self): expected[1] = expected[1].astype(res.dtypes[1]) tm.assert_frame_equal(res, expected) - def test_reindex_copies(self): - # based on asv time_reindex_axis1 - N = 10 - df = DataFrame(np.random.default_rng(2).standard_normal((N * 10, N))) - cols = np.arange(N) - np.random.default_rng(2).shuffle(cols) - - result = df.reindex(columns=cols, copy=True) - assert not np.shares_memory(result[0]._values, df[0]._values) - - # pass both columns and index - result2 = df.reindex(columns=cols, index=df.index, copy=True) - assert not np.shares_memory(result2[0]._values, df[0]._values) - - def test_reindex_copies_ea(self, using_copy_on_write): - # https://github.com/pandas-dev/pandas/pull/51197 - # also ensure to honor copy keyword for ExtensionDtypes - N = 10 - df = DataFrame( - np.random.default_rng(2).standard_normal((N * 10, N)), dtype="Float64" - ) - cols = np.arange(N) - np.random.default_rng(2).shuffle(cols) - - result = df.reindex(columns=cols, copy=True) - if using_copy_on_write: - assert np.shares_memory(result[0].array._data, df[0].array._data) - else: - assert not np.shares_memory(result[0].array._data, df[0].array._data) - - # pass both columns and index - result2 = df.reindex(columns=cols, index=df.index, copy=True) - if using_copy_on_write: - assert np.shares_memory(result2[0].array._data, df[0].array._data) - else: - assert not np.shares_memory(result2[0].array._data, df[0].array._data) - - @td.skip_array_manager_not_yet_implemented def test_reindex_date_fill_value(self): # passing date to dt64 is deprecated; enforced in 2.0 to cast to object arr = date_range("2016-01-01", periods=6).values.reshape(3, 2) @@ -455,19 +415,16 @@ def f(val): ("mid",), ("mid", "btm"), ("mid", "btm", "top"), - ("mid",), ("mid", "top"), ("mid", "top", "btm"), ("btm",), ("btm", "mid"), ("btm", "mid", "top"), - ("btm",), ("btm", "top"), ("btm", "top", "mid"), ("top",), ("top", "mid"), ("top", "mid", "btm"), - ("top",), ("top", "btm"), ("top", "btm", "mid"), ], @@ -608,7 +565,7 @@ def test_reindex_sparse(self): ) tm.assert_frame_equal(result, expected) - def test_reindex(self, float_frame, using_copy_on_write): + def test_reindex(self, float_frame): datetime_series = Series( np.arange(30, dtype=np.float64), index=date_range("2020-01-01", periods=30) ) @@ -647,13 +604,8 @@ def test_reindex(self, float_frame, using_copy_on_write): tm.assert_index_equal(series.index, nonContigFrame.index) # corner cases - - # Same index, copies values but not index if copy=False - newFrame = float_frame.reindex(float_frame.index, copy=False) - if using_copy_on_write: - assert 
newFrame.index.is_(float_frame.index) - else: - assert newFrame.index is float_frame.index + newFrame = float_frame.reindex(float_frame.index) + assert newFrame.index.is_(float_frame.index) # length zero newFrame = float_frame.reindex([]) @@ -802,7 +754,10 @@ def test_reindex_axes(self): index=[datetime(2012, 1, 1), datetime(2012, 1, 2), datetime(2012, 1, 3)], columns=["a", "b", "c"], ) - time_freq = date_range("2012-01-01", "2012-01-03", freq="d") + + msg = "'d' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + time_freq = date_range("2012-01-01", "2012-01-03", freq="d") some_cols = ["a", "b"] index_freq = df.reindex(index=time_freq).index.freq @@ -1033,6 +988,12 @@ def test_reindex_with_nans(self): expected = df.iloc[[1]] tm.assert_frame_equal(result, expected) + def test_reindex_without_upcasting(self): + # GH45857 + df = DataFrame(np.zeros((10, 10), dtype=np.float32)) + result = df.reindex(columns=np.arange(5, 15)) + assert result.dtypes.eq(np.float32).all() + def test_reindex_multi(self): df = DataFrame(np.random.default_rng(2).standard_normal((3, 3))) @@ -1225,13 +1186,7 @@ def test_reindex_empty_frame(self, kwargs): expected = DataFrame({"a": [np.nan] * 3}, index=idx, dtype=object) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - "src_idx", - [ - Index([]), - CategoricalIndex([]), - ], - ) + @pytest.mark.parametrize("src_idx", [Index, CategoricalIndex]) @pytest.mark.parametrize( "cat_idx", [ @@ -1246,7 +1201,7 @@ def test_reindex_empty_frame(self, kwargs): ], ) def test_reindex_empty(self, src_idx, cat_idx): - df = DataFrame(columns=src_idx, index=["K"], dtype="f8") + df = DataFrame(columns=src_idx([]), index=["K"], dtype="f8") result = df.reindex(columns=cat_idx) expected = DataFrame(index=["K"], columns=cat_idx, dtype="f8") @@ -1287,36 +1242,14 @@ def test_reindex_datetimelike_to_object(self, dtype): assert res.iloc[-1, 1] is fv tm.assert_frame_equal(res, expected) - @pytest.mark.parametrize( - "index_df,index_res,index_exp", - [ - ( - CategoricalIndex([], categories=["A"]), - Index(["A"]), - Index(["A"]), - ), - ( - CategoricalIndex([], categories=["A"]), - Index(["B"]), - Index(["B"]), - ), - ( - CategoricalIndex([], categories=["A"]), - CategoricalIndex(["A"]), - CategoricalIndex(["A"]), - ), - ( - CategoricalIndex([], categories=["A"]), - CategoricalIndex(["B"]), - CategoricalIndex(["B"]), - ), - ], - ) - def test_reindex_not_category(self, index_df, index_res, index_exp): + @pytest.mark.parametrize("klass", [Index, CategoricalIndex]) + @pytest.mark.parametrize("data", ["A", "B"]) + def test_reindex_not_category(self, klass, data): # GH#28690 - df = DataFrame(index=index_df) - result = df.reindex(index=index_res) - expected = DataFrame(index=index_exp) + df = DataFrame(index=CategoricalIndex([], categories=["A"])) + idx = klass([data]) + result = df.reindex(index=idx) + expected = DataFrame(index=idx) tm.assert_frame_equal(result, expected) def test_invalid_method(self): diff --git a/pandas/tests/frame/methods/test_reindex_like.py b/pandas/tests/frame/methods/test_reindex_like.py index ce68ec28eec3d..03968dcbb6314 100644 --- a/pandas/tests/frame/methods/test_reindex_like.py +++ b/pandas/tests/frame/methods/test_reindex_like.py @@ -22,9 +22,11 @@ def test_reindex_like(self, float_frame): def test_reindex_like_methods(self, method, expected_values): df = DataFrame({"x": list(range(5))}) - result = df.reindex_like(df, method=method, tolerance=0) + with 
tm.assert_produces_warning(FutureWarning): + result = df.reindex_like(df, method=method, tolerance=0) tm.assert_frame_equal(df, result) - result = df.reindex_like(df, method=method, tolerance=[0, 0, 0, 0]) + with tm.assert_produces_warning(FutureWarning): + result = df.reindex_like(df, method=method, tolerance=[0, 0, 0, 0]) tm.assert_frame_equal(df, result) def test_reindex_like_subclass(self): diff --git a/pandas/tests/frame/methods/test_rename.py b/pandas/tests/frame/methods/test_rename.py index c3bc96b44c807..6153a168476d4 100644 --- a/pandas/tests/frame/methods/test_rename.py +++ b/pandas/tests/frame/methods/test_rename.py @@ -164,17 +164,13 @@ def test_rename_multiindex(self): renamed = df.rename(index={"foo1": "foo3", "bar2": "bar3"}, level=0) tm.assert_index_equal(renamed.index, new_index) - def test_rename_nocopy(self, float_frame, using_copy_on_write, warn_copy_on_write): - renamed = float_frame.rename(columns={"C": "foo"}, copy=False) + def test_rename_nocopy(self, float_frame): + renamed = float_frame.rename(columns={"C": "foo"}) assert np.shares_memory(renamed["foo"]._values, float_frame["C"]._values) - with tm.assert_cow_warning(warn_copy_on_write): - renamed.loc[:, "foo"] = 1.0 - if using_copy_on_write: - assert not (float_frame["C"] == 1.0).all() - else: - assert (float_frame["C"] == 1.0).all() + renamed.loc[:, "foo"] = 1.0 + assert not (float_frame["C"] == 1.0).all() def test_rename_inplace(self, float_frame): float_frame.rename(columns={"C": "foo"}) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 8bfa98042eb07..9e302dc5f94ee 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -30,9 +28,6 @@ def mix_abc() -> dict[str, list[float | str]]: class TestDataFrameReplace: - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) def test_replace_inplace(self, datetime_frame, float_string_frame): datetime_frame.loc[datetime_frame.index[:5], "A"] = np.nan datetime_frame.loc[datetime_frame.index[-5:], "A"] = np.nan @@ -48,7 +43,9 @@ def test_replace_inplace(self, datetime_frame, float_string_frame): mf.iloc[-10:, mf.columns.get_loc("A")] = np.nan result = float_string_frame.replace(np.nan, 0) - expected = float_string_frame.fillna(value=0) + expected = float_string_frame.copy() + expected["foo"] = expected["foo"].astype(object) + expected = expected.fillna(value=0) tm.assert_frame_equal(result, expected) tsframe = datetime_frame.copy() @@ -289,61 +286,43 @@ def test_regex_replace_dict_nested_non_first_character( # GH 25259 dtype = any_string_dtype df = DataFrame({"first": ["abc", "bca", "cab"]}, dtype=dtype) - if using_infer_string and any_string_dtype == "object": - with tm.assert_produces_warning(FutureWarning, match="Downcasting"): - result = df.replace({"a": "."}, regex=True) - expected = DataFrame({"first": [".bc", "bc.", "c.b"]}) - - else: - result = df.replace({"a": "."}, regex=True) - expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype) + result = df.replace({"a": "."}, regex=True) + expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) def test_regex_replace_dict_nested_gh4115(self): - df = DataFrame({"Type": 
["Q", "T", "Q", "Q", "T"], "tmp": 2}) - expected = DataFrame({"Type": [0, 1, 0, 0, 1], "tmp": 2}) - msg = "Downcasting behavior in `replace`" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.replace({"Type": {"Q": 0, "T": 1}}) + df = DataFrame( + {"Type": Series(["Q", "T", "Q", "Q", "T"], dtype=object), "tmp": 2} + ) + expected = DataFrame({"Type": Series([0, 1, 0, 0, 1], dtype=object), "tmp": 2}) + result = df.replace({"Type": {"Q": 0, "T": 1}}) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) def test_regex_replace_list_to_scalar(self, mix_abc): df = DataFrame(mix_abc) expec = DataFrame( { "a": mix_abc["a"], - "b": np.array([np.nan] * 4), + "b": Series([np.nan] * 4, dtype="str"), "c": [np.nan, np.nan, np.nan, "d"], } ) - msg = "Downcasting behavior in `replace`" - with tm.assert_produces_warning(FutureWarning, match=msg): - res = df.replace([r"\s*\.\s*", "a|b"], np.nan, regex=True) + + res = df.replace([r"\s*\.\s*", "a|b"], np.nan, regex=True) res2 = df.copy() res3 = df.copy() - with tm.assert_produces_warning(FutureWarning, match=msg): - return_value = res2.replace( - [r"\s*\.\s*", "a|b"], np.nan, regex=True, inplace=True - ) + return_value = res2.replace( + [r"\s*\.\s*", "a|b"], np.nan, regex=True, inplace=True + ) assert return_value is None - with tm.assert_produces_warning(FutureWarning, match=msg): - return_value = res3.replace( - regex=[r"\s*\.\s*", "a|b"], value=np.nan, inplace=True - ) + return_value = res3.replace( + regex=[r"\s*\.\s*", "a|b"], value=np.nan, inplace=True + ) assert return_value is None tm.assert_frame_equal(res, expec) tm.assert_frame_equal(res2, expec) tm.assert_frame_equal(res3, expec) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) def test_regex_replace_str_to_numeric(self, mix_abc): # what happens when you try to replace a numeric value with a regex? 
df = DataFrame(mix_abc) @@ -355,13 +334,11 @@ def test_regex_replace_str_to_numeric(self, mix_abc): return_value = res3.replace(regex=r"\s*\.\s*", value=0, inplace=True) assert return_value is None expec = DataFrame({"a": mix_abc["a"], "b": ["a", "b", 0, 0], "c": mix_abc["c"]}) + expec["c"] = expec["c"].astype(object) tm.assert_frame_equal(res, expec) tm.assert_frame_equal(res2, expec) tm.assert_frame_equal(res3, expec) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) def test_regex_replace_regex_list_to_numeric(self, mix_abc): df = DataFrame(mix_abc) res = df.replace([r"\s*\.\s*", "b"], 0, regex=True) @@ -452,19 +429,7 @@ def test_regex_replace_string_types( # GH-41333, GH-35977 dtype = any_string_dtype obj = frame_or_series(data, dtype=dtype) - if using_infer_string and any_string_dtype == "object": - if len(to_replace) > 1 and isinstance(obj, DataFrame): - request.node.add_marker( - pytest.mark.xfail( - reason="object input array that gets downcasted raises on " - "second pass" - ) - ) - with tm.assert_produces_warning(FutureWarning, match="Downcasting"): - result = obj.replace(to_replace, regex=True) - dtype = "string[pyarrow_numpy]" - else: - result = obj.replace(to_replace, regex=True) + result = obj.replace(to_replace, regex=True) expected = frame_or_series(expected, dtype=dtype) tm.assert_equal(result, expected) @@ -566,37 +531,37 @@ def test_replace_series_dict(self): result = df.replace(s, df.mean()) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) - def test_replace_convert(self): - # gh 3907 - df = DataFrame([["foo", "bar", "bah"], ["bar", "foo", "bah"]]) + def test_replace_convert(self, any_string_dtype): + # gh 3907 (pandas >= 3.0 no longer converts dtypes) + df = DataFrame( + [["foo", "bar", "bah"], ["bar", "foo", "bah"]], dtype=any_string_dtype + ) m = {"foo": 1, "bar": 2, "bah": 3} - msg = "Downcasting behavior in `replace` " - with tm.assert_produces_warning(FutureWarning, match=msg): - rep = df.replace(m) - expec = Series([np.int64] * 3) - res = rep.dtypes - tm.assert_series_equal(expec, res) - - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + rep = df.replace(m) + assert (rep.dtypes == object).all() + def test_replace_mixed(self, float_string_frame): mf = float_string_frame mf.iloc[5:20, mf.columns.get_loc("foo")] = np.nan mf.iloc[-10:, mf.columns.get_loc("A")] = np.nan result = float_string_frame.replace(np.nan, -18) - expected = float_string_frame.fillna(value=-18) + expected = float_string_frame.copy() + expected["foo"] = expected["foo"].astype(object) + expected = expected.fillna(value=-18) tm.assert_frame_equal(result, expected) - tm.assert_frame_equal(result.replace(-18, np.nan), float_string_frame) + expected2 = float_string_frame.copy() + expected2["foo"] = expected2["foo"].astype(object) + tm.assert_frame_equal(result.replace(-18, np.nan), expected2) result = float_string_frame.replace(np.nan, -1e8) - expected = float_string_frame.fillna(value=-1e8) + expected = float_string_frame.copy() + expected["foo"] = expected["foo"].astype(object) + expected = expected.fillna(value=-1e8) tm.assert_frame_equal(result, expected) - tm.assert_frame_equal(result.replace(-1e8, np.nan), float_string_frame) + expected2 = float_string_frame.copy() + expected2["foo"] = expected2["foo"].astype(object) + tm.assert_frame_equal(result.replace(-1e8, np.nan), expected2) def 
test_replace_mixed_int_block_upcasting(self): # int block upcasting @@ -638,7 +603,7 @@ def test_replace_mixed_int_block_splitting(self): result = df.replace(0, 0.5) tm.assert_frame_equal(result, expected) - def test_replace_mixed2(self, using_infer_string): + def test_replace_mixed2(self): # to object block upcasting df = DataFrame( { @@ -657,15 +622,11 @@ def test_replace_mixed2(self, using_infer_string): expected = DataFrame( { - "A": Series(["foo", "bar"]), + "A": Series(["foo", "bar"], dtype="object"), "B": Series([0, "foo"], dtype="object"), } ) - if using_infer_string: - with tm.assert_produces_warning(FutureWarning, match="Downcasting"): - result = df.replace([1, 2], ["foo", "bar"]) - else: - result = df.replace([1, 2], ["foo", "bar"]) + result = df.replace([1, 2], ["foo", "bar"]) tm.assert_frame_equal(result, expected) def test_replace_mixed3(self): @@ -751,6 +712,13 @@ def test_replace_with_None_keeps_categorical(self): ) tm.assert_frame_equal(result, expected) + def test_replace_all_NA(self): + # GH#60688 + df = DataFrame({"ticker": ["#1234#"], "name": [None]}) + result = df.replace({col: {r"^#": "$"} for col in df.columns}, regex=True) + expected = DataFrame({"ticker": ["$1234#"], "name": [None]}) + tm.assert_frame_equal(result, expected) + def test_replace_value_is_none(self, datetime_frame): orig_value = datetime_frame.iloc[0, 0] orig2 = datetime_frame.iloc[1, 0] @@ -784,11 +752,6 @@ def test_replace_for_new_dtypes(self, datetime_frame): tsframe.loc[tsframe.index[:5], "A"] = np.nan tsframe.loc[tsframe.index[-5:], "A"] = np.nan tsframe.loc[tsframe.index[:5], "B"] = np.nan - msg = "DataFrame.fillna with 'method' is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - # TODO: what is this even testing? - result = tsframe.fillna(method="bfill") - tm.assert_frame_equal(result, tsframe.fillna(method="bfill")) @pytest.mark.parametrize( "frame, to_replace, value, expected", @@ -841,13 +804,6 @@ def test_replace_for_new_dtypes(self, datetime_frame): "bar", DataFrame({"dt": [datetime(3017, 12, 20)], "str": ["bar"]}), ), - # GH 36782 - ( - DataFrame({"dt": [datetime(2920, 10, 1)]}), - datetime(2920, 10, 1), - datetime(2020, 10, 1), - DataFrame({"dt": [datetime(2020, 10, 1)]}), - ), ( DataFrame( { @@ -898,12 +854,7 @@ def test_replace_for_new_dtypes(self, datetime_frame): ], ) def test_replace_dtypes(self, frame, to_replace, value, expected): - warn = None - if isinstance(to_replace, datetime) and to_replace.year == 2920: - warn = FutureWarning - msg = "Downcasting behavior in `replace` " - with tm.assert_produces_warning(warn, match=msg): - result = frame.replace(to_replace, value) + result = frame.replace(to_replace, value) tm.assert_frame_equal(result, expected) def test_replace_input_formats_listlike(self): @@ -946,9 +897,6 @@ def test_replace_input_formats_listlike(self): with pytest.raises(ValueError, match=msg): df.replace(to_rep, values[1:]) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) def test_replace_input_formats_scalar(self): df = DataFrame( {"A": [np.nan, 0, np.inf], "B": [0, 2, 5], "C": ["", "asdf", "fd"]} @@ -977,10 +925,7 @@ def test_replace_limit(self): # TODO pass - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) - def test_replace_dict_no_regex(self): + def test_replace_dict_no_regex(self, any_string_dtype): answer = Series( { 0: "Strongly Agree", @@ -988,7 +933,8 @@ def test_replace_dict_no_regex(self): 2: "Neutral", 3: "Disagree", 4: "Strongly 
Disagree", - } + }, + dtype=any_string_dtype, ) weights = { "Agree": 4, @@ -997,16 +943,11 @@ def test_replace_dict_no_regex(self): "Strongly Agree": 5, "Strongly Disagree": 1, } - expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1}) - msg = "Downcasting behavior in `replace` " - with tm.assert_produces_warning(FutureWarning, match=msg): - result = answer.replace(weights) + expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1}, dtype=object) + result = answer.replace(weights) tm.assert_series_equal(result, expected) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) - def test_replace_series_no_regex(self): + def test_replace_series_no_regex(self, any_string_dtype): answer = Series( { 0: "Strongly Agree", @@ -1014,7 +955,8 @@ def test_replace_series_no_regex(self): 2: "Neutral", 3: "Disagree", 4: "Strongly Disagree", - } + }, + dtype=any_string_dtype, ) weights = Series( { @@ -1025,10 +967,8 @@ def test_replace_series_no_regex(self): "Strongly Disagree": 1, } ) - expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1}) - msg = "Downcasting behavior in `replace` " - with tm.assert_produces_warning(FutureWarning, match=msg): - result = answer.replace(weights) + expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1}, dtype=object) + result = answer.replace(weights) tm.assert_series_equal(result, expected) def test_replace_dict_tuple_list_ordering_remains_the_same(self): @@ -1112,94 +1052,17 @@ def test_nested_dict_overlapping_keys_replace_str(self): expected = df.replace({"a": dict(zip(astr, bstr))}) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) - def test_replace_swapping_bug(self, using_infer_string): + def test_replace_swapping_bug(self): df = DataFrame({"a": [True, False, True]}) res = df.replace({"a": {True: "Y", False: "N"}}) - expect = DataFrame({"a": ["Y", "N", "Y"]}) + expect = DataFrame({"a": ["Y", "N", "Y"]}, dtype=object) tm.assert_frame_equal(res, expect) df = DataFrame({"a": [0, 1, 0]}) res = df.replace({"a": {0: "Y", 1: "N"}}) - expect = DataFrame({"a": ["Y", "N", "Y"]}) + expect = DataFrame({"a": ["Y", "N", "Y"]}, dtype=object) tm.assert_frame_equal(res, expect) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) - def test_replace_period(self): - d = { - "fname": { - "out_augmented_AUG_2011.json": pd.Period(year=2011, month=8, freq="M"), - "out_augmented_JAN_2011.json": pd.Period(year=2011, month=1, freq="M"), - "out_augmented_MAY_2012.json": pd.Period(year=2012, month=5, freq="M"), - "out_augmented_SUBSIDY_WEEK.json": pd.Period( - year=2011, month=4, freq="M" - ), - "out_augmented_AUG_2012.json": pd.Period(year=2012, month=8, freq="M"), - "out_augmented_MAY_2011.json": pd.Period(year=2011, month=5, freq="M"), - "out_augmented_SEP_2013.json": pd.Period(year=2013, month=9, freq="M"), - } - } - - df = DataFrame( - [ - "out_augmented_AUG_2012.json", - "out_augmented_SEP_2013.json", - "out_augmented_SUBSIDY_WEEK.json", - "out_augmented_MAY_2012.json", - "out_augmented_MAY_2011.json", - "out_augmented_AUG_2011.json", - "out_augmented_JAN_2011.json", - ], - columns=["fname"], - ) - assert set(df.fname.values) == set(d["fname"].keys()) - - expected = DataFrame({"fname": [d["fname"][k] for k in df.fname.values]}) - assert expected.dtypes.iloc[0] == "Period[M]" - msg = "Downcasting behavior in `replace` " - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.replace(d) - tm.assert_frame_equal(result, expected) - 
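Throughout this file the expected FutureWarning for "Downcasting behavior in `replace`" disappears: `replace` on an object column now keeps object dtype instead of downcasting the result. A minimal sketch of the behavior the updated tests pin down, assuming pandas 3.x semantics (this snippet is illustrative, not part of the diff):

import pandas as pd

# Replacing strings with integers in an object column no longer
# downcasts the column to int64; it stays object dtype.
df = pd.DataFrame({"Type": pd.Series(["Q", "T", "Q"], dtype=object)})
result = df.replace({"Type": {"Q": 0, "T": 1}})
assert result["Type"].dtype == object
assert result["Type"].tolist() == [0, 1, 0]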
- @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) - def test_replace_datetime(self): - d = { - "fname": { - "out_augmented_AUG_2011.json": Timestamp("2011-08"), - "out_augmented_JAN_2011.json": Timestamp("2011-01"), - "out_augmented_MAY_2012.json": Timestamp("2012-05"), - "out_augmented_SUBSIDY_WEEK.json": Timestamp("2011-04"), - "out_augmented_AUG_2012.json": Timestamp("2012-08"), - "out_augmented_MAY_2011.json": Timestamp("2011-05"), - "out_augmented_SEP_2013.json": Timestamp("2013-09"), - } - } - - df = DataFrame( - [ - "out_augmented_AUG_2012.json", - "out_augmented_SEP_2013.json", - "out_augmented_SUBSIDY_WEEK.json", - "out_augmented_MAY_2012.json", - "out_augmented_MAY_2011.json", - "out_augmented_AUG_2011.json", - "out_augmented_JAN_2011.json", - ], - columns=["fname"], - ) - assert set(df.fname.values) == set(d["fname"].keys()) - expected = DataFrame({"fname": [d["fname"][k] for k in df.fname.values]}) - msg = "Downcasting behavior in `replace` " - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.replace(d) - tm.assert_frame_equal(result, expected) - def test_replace_datetimetz(self): # GH 11326 # behaving poorly when presented with a datetime64[ns, tz] @@ -1293,80 +1156,6 @@ def test_replace_with_empty_dictlike(self, mix_abc): tm.assert_frame_equal(df, df.replace({"b": {}})) tm.assert_frame_equal(df, df.replace(Series({"b": {}}))) - @pytest.mark.parametrize( - "to_replace, method, expected", - [ - (0, "bfill", {"A": [1, 1, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}), - ( - np.nan, - "bfill", - {"A": [0, 1, 2], "B": [5.0, 7.0, 7.0], "C": ["a", "b", "c"]}, - ), - ("d", "ffill", {"A": [0, 1, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}), - ( - [0, 2], - "bfill", - {"A": [1, 1, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}, - ), - ( - [1, 2], - "pad", - {"A": [0, 0, 0], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}, - ), - ( - (1, 2), - "bfill", - {"A": [0, 2, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}, - ), - ( - ["b", "c"], - "ffill", - {"A": [0, 1, 2], "B": [5, np.nan, 7], "C": ["a", "a", "a"]}, - ), - ], - ) - def test_replace_method(self, to_replace, method, expected): - # GH 19632 - df = DataFrame({"A": [0, 1, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}) - - msg = "The 'method' keyword in DataFrame.replace is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.replace(to_replace=to_replace, value=None, method=method) - expected = DataFrame(expected) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "replace_dict, final_data", - [({"a": 1, "b": 1}, [[3, 3], [2, 2]]), ({"a": 1, "b": 2}, [[3, 1], [2, 3]])], - ) - def test_categorical_replace_with_dict(self, replace_dict, final_data): - # GH 26988 - df = DataFrame([[1, 1], [2, 2]], columns=["a", "b"], dtype="category") - - final_data = np.array(final_data) - - a = pd.Categorical(final_data[:, 0], categories=[3, 2]) - - ex_cat = [3, 2] if replace_dict["b"] == 1 else [1, 3] - b = pd.Categorical(final_data[:, 1], categories=ex_cat) - - expected = DataFrame({"a": a, "b": b}) - msg2 = "with CategoricalDtype is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg2): - result = df.replace(replace_dict, 3) - tm.assert_frame_equal(result, expected) - msg = ( - r"Attributes of DataFrame.iloc\[:, 0\] \(column name=\"a\"\) are " - "different" - ) - with pytest.raises(AssertionError, match=msg): - # ensure non-inplace call does not affect original - tm.assert_frame_equal(df, expected) - with 
tm.assert_produces_warning(FutureWarning, match=msg2): - return_value = df.replace(replace_dict, 3, inplace=True) - assert return_value is None - tm.assert_frame_equal(df, expected) - @pytest.mark.parametrize( "df, to_replace, exp", [ @@ -1393,9 +1182,6 @@ def test_replace_commutative(self, df, to_replace, exp): result = df.replace(to_replace) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) @pytest.mark.parametrize( "replacer", [ @@ -1408,11 +1194,9 @@ def test_replace_commutative(self, df, to_replace, exp): ) def test_replace_replacer_dtype(self, replacer): # GH26632 - df = DataFrame(["a"]) - msg = "Downcasting behavior in `replace` " - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.replace({"a": replacer, "b": replacer}) - expected = DataFrame([replacer]) + df = DataFrame(["a"], dtype=object) + result = df.replace({"a": replacer, "b": replacer}) + expected = DataFrame([replacer], dtype=object) tm.assert_frame_equal(result, expected) def test_replace_after_convert_dtypes(self): @@ -1430,13 +1214,8 @@ def test_replace_invalid_to_replace(self): r"Expecting 'to_replace' to be either a scalar, array-like, " r"dict or None, got invalid type.*" ) - msg2 = ( - "DataFrame.replace without 'value' and with non-dict-like " - "'to_replace' is deprecated" - ) with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=msg2): - df.replace(lambda x: x.strip()) + df.replace(lambda x: x.strip()) @pytest.mark.parametrize("dtype", ["float", "float64", "int64", "Int64", "boolean"]) @pytest.mark.parametrize("value", [np.nan, pd.NA]) @@ -1471,6 +1250,30 @@ def test_replace_ea_ignore_float(self, frame_or_series, value): result = obj.replace(1.0, 0.0) tm.assert_equal(expected, result) + @pytest.mark.parametrize( + "replace_dict, final_data", + [({"a": 1, "b": 1}, [[2, 2], [2, 2]]), ({"a": 1, "b": 2}, [[2, 1], [2, 2]])], + ) + def test_categorical_replace_with_dict(self, replace_dict, final_data): + # GH 26988 + df = DataFrame([[1, 1], [2, 2]], columns=["a", "b"], dtype="category") + + final_data = np.array(final_data) + + a = pd.Categorical(final_data[:, 0], categories=[1, 2]) + b = pd.Categorical(final_data[:, 1], categories=[1, 2]) + + expected = DataFrame({"a": a, "b": b}) + result = df.replace(replace_dict, 2) + tm.assert_frame_equal(result, expected) + msg = r"DataFrame.iloc\[:, 0\] \(column name=\"a\"\) are " "different" + with pytest.raises(AssertionError, match=msg): + # ensure non-inplace call does not affect original + tm.assert_frame_equal(df, expected) + return_value = df.replace(replace_dict, 2, inplace=True) + assert return_value is None + tm.assert_frame_equal(df, expected) + def test_replace_value_category_type(self): """ Test for #23305: to ensure category dtypes are maintained @@ -1516,15 +1319,17 @@ def test_replace_value_category_type(self): ) # replace values in input dataframe - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" + input_df = input_df.apply( + lambda x: x.astype("category").cat.rename_categories({"d": "z"}) + ) + input_df = input_df.apply( + lambda x: x.astype("category").cat.rename_categories({"obj1": "obj9"}) + ) + result = input_df.apply( + lambda x: x.astype("category").cat.rename_categories({"cat2": "catX"}) ) - with tm.assert_produces_warning(FutureWarning, match=msg): - input_df = input_df.replace("d", "z") - input_df = input_df.replace("obj1", "obj9") - result = 
input_df.replace("cat2", "catX") + result = result.astype({"col1": "int64", "col3": "float64", "col5": "str"}) tm.assert_frame_equal(result, expected) def test_replace_dict_category_type(self): @@ -1549,12 +1354,11 @@ def test_replace_dict_category_type(self): ) # replace values in input dataframe using a dict - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" + result = input_df.apply( + lambda x: x.cat.rename_categories( + {"a": "z", "obj1": "obj9", "cat1": "catX"} + ) ) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = input_df.replace({"a": "z", "obj1": "obj9", "cat1": "catX"}) tm.assert_frame_equal(result, expected) @@ -1566,13 +1370,11 @@ def test_replace_with_compiled_regex(self): expected = DataFrame(["z", "b", "c"]) tm.assert_frame_equal(result, expected) - def test_replace_intervals(self, using_infer_string): + def test_replace_intervals(self): # https://github.com/pandas-dev/pandas/issues/35931 df = DataFrame({"a": [pd.Interval(0, 1), pd.Interval(0, 1)]}) - warning = FutureWarning if using_infer_string else None - with tm.assert_produces_warning(warning, match="Downcasting"): - result = df.replace({"a": {pd.Interval(0, 1): "x"}}) - expected = DataFrame({"a": ["x", "x"]}) + result = df.replace({"a": {pd.Interval(0, 1): "x"}}) + expected = DataFrame({"a": ["x", "x"]}, dtype=object) tm.assert_frame_equal(result, expected) def test_replace_unicode(self): @@ -1672,23 +1474,25 @@ def test_regex_replace_scalar( expected.loc[expected["a"] == ".", "a"] = expected_replace_val tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) @pytest.mark.parametrize("regex", [False, True]) - def test_replace_regex_dtype_frame(self, regex): + @pytest.mark.parametrize("value", [1, "1"]) + def test_replace_regex_dtype_frame(self, regex, value): # GH-48644 df1 = DataFrame({"A": ["0"], "B": ["0"]}) - expected_df1 = DataFrame({"A": [1], "B": [1]}) - msg = "Downcasting behavior in `replace`" - with tm.assert_produces_warning(FutureWarning, match=msg): - result_df1 = df1.replace(to_replace="0", value=1, regex=regex) + # When value is an integer, coerce result to object. + # When value is a string, infer the correct string dtype. 
+ dtype = object if value == 1 else None + + expected_df1 = DataFrame({"A": [value], "B": [value]}, dtype=dtype) + result_df1 = df1.replace(to_replace="0", value=value, regex=regex) tm.assert_frame_equal(result_df1, expected_df1) df2 = DataFrame({"A": ["0"], "B": ["1"]}) - expected_df2 = DataFrame({"A": [1], "B": ["1"]}) - with tm.assert_produces_warning(FutureWarning, match=msg): - result_df2 = df2.replace(to_replace="0", value=1, regex=regex) + if regex: + expected_df2 = DataFrame({"A": [value], "B": ["1"]}, dtype=dtype) + else: + expected_df2 = DataFrame({"A": Series([value], dtype=dtype), "B": ["1"]}) + result_df2 = df2.replace(to_replace="0", value=value, regex=regex) tm.assert_frame_equal(result_df2, expected_df2) def test_replace_with_value_also_being_replaced(self): diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index fbf36dbc4fb02..80227c0462329 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -27,7 +27,7 @@ import pandas._testing as tm -@pytest.fixture() +@pytest.fixture def multiindex_df(): levels = [["A", ""], ["B", "b"]] return DataFrame([[0, 2], [1, 3]], columns=MultiIndex.from_tuples(levels)) @@ -106,7 +106,7 @@ def test_reset_index_with_intervals(self): tm.assert_frame_equal(result2, original) def test_reset_index(self, float_frame): - stacked = float_frame.stack(future_stack=True)[::2] + stacked = float_frame.stack()[::2] stacked = DataFrame({"foo": stacked, "bar": stacked}) names = ["first", "second"] @@ -232,9 +232,7 @@ def test_reset_index_level_missing(self, idx_lev): def test_reset_index_right_dtype(self): time = np.arange(0.0, 10, np.sqrt(2) / 2) - s1 = Series( - (9.81 * time**2) / 2, index=Index(time, name="time"), name="speed" - ) + s1 = Series((9.81 * time**2) / 2, index=Index(time, name="time"), name="speed") df = DataFrame(s1) reset = s1.reset_index() @@ -602,8 +600,8 @@ def test_reset_index_with_drop( {"a": [pd.NaT, Timestamp("2020-01-01")], "b": [1, 2], "x": [11, 12]}, ), ( - [(pd.NaT, 1), (pd.Timedelta(123, "d"), 2)], - {"a": [pd.NaT, pd.Timedelta(123, "d")], "b": [1, 2], "x": [11, 12]}, + [(pd.NaT, 1), (pd.Timedelta(123, "D"), 2)], + {"a": [pd.NaT, pd.Timedelta(123, "D")], "b": [1, 2], "x": [11, 12]}, ), ], ) @@ -661,7 +659,7 @@ def test_reset_index_dtypes_on_empty_frame_with_multiindex( idx = MultiIndex.from_product([[0, 1], [0.5, 1.0], array]) result = DataFrame(index=idx)[:0].reset_index().dtypes if using_infer_string and dtype == object: - dtype = "string" + dtype = pd.StringDtype(na_value=np.nan) expected = Series({"level_0": np.int64, "level_1": np.float64, "level_2": dtype}) tm.assert_series_equal(result, expected) @@ -694,7 +692,7 @@ def test_reset_index_empty_frame_with_datetime64_multiindex_from_groupby( expected["c3"] = expected["c3"].astype("datetime64[ns]") expected["c1"] = expected["c1"].astype("float64") if using_infer_string: - expected["c2"] = expected["c2"].astype("string[pyarrow_numpy]") + expected["c2"] = expected["c2"].astype("str") tm.assert_frame_equal(result, expected) @@ -741,7 +739,7 @@ def test_reset_index_rename(float_frame): def test_reset_index_rename_multiindex(float_frame): # GH 6878 - stacked_df = float_frame.stack(future_stack=True)[::2] + stacked_df = float_frame.stack()[::2] stacked_df = DataFrame({"foo": stacked_df, "bar": stacked_df}) names = ["first", "second"] @@ -755,7 +753,7 @@ def test_reset_index_rename_multiindex(float_frame): def test_errorreset_index_rename(float_frame): # GH 6878 - 
stacked_df = float_frame.stack(future_stack=True)[::2] + stacked_df = float_frame.stack()[::2] stacked_df = DataFrame({"first": stacked_df, "second": stacked_df}) with pytest.raises( @@ -780,3 +778,34 @@ def test_reset_index_false_index_name(): result_frame.reset_index() expected_frame = DataFrame(range(5, 10), RangeIndex(range(5), name=False)) tm.assert_frame_equal(result_frame, expected_frame) + + +@pytest.mark.parametrize("columns", [None, Index([])]) +def test_reset_index_with_empty_frame(columns): + # Currently empty DataFrame has RangeIndex or object dtype Index, but when + # resetting the index we still want to end up with the default string dtype + # https://github.com/pandas-dev/pandas/issues/60338 + + index = Index([], name="foo") + df = DataFrame(index=index, columns=columns) + result = df.reset_index() + expected = DataFrame(columns=["foo"]) + tm.assert_frame_equal(result, expected) + + index = Index([1, 2, 3], name="foo") + df = DataFrame(index=index, columns=columns) + result = df.reset_index() + expected = DataFrame({"foo": [1, 2, 3]}) + tm.assert_frame_equal(result, expected) + + index = MultiIndex.from_tuples([], names=["foo", "bar"]) + df = DataFrame(index=index, columns=columns) + result = df.reset_index() + expected = DataFrame(columns=["foo", "bar"]) + tm.assert_frame_equal(result, expected) + + index = MultiIndex.from_tuples([(1, 2), (2, 3)], names=["foo", "bar"]) + df = DataFrame(index=index, columns=columns) + result = df.reset_index() + expected = DataFrame({"foo": [1, 2], "bar": [2, 3]}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_sample.py b/pandas/tests/frame/methods/test_sample.py index 6b3459fbdc035..a9d56cbfd2b46 100644 --- a/pandas/tests/frame/methods/test_sample.py +++ b/pandas/tests/frame/methods/test_sample.py @@ -111,12 +111,10 @@ def test_sample_invalid_weight_lengths(self, obj): obj.sample(n=3, weights=[0, 1]) with pytest.raises(ValueError, match=msg): - bad_weights = [0.5] * 11 - obj.sample(n=3, weights=bad_weights) + obj.sample(n=3, weights=[0.5] * 11) with pytest.raises(ValueError, match="Fewer non-zero entries in p than size"): - bad_weight_series = Series([0, 0, 0.2]) - obj.sample(n=4, weights=bad_weight_series) + obj.sample(n=4, weights=Series([0, 0, 0.2])) def test_sample_negative_weights(self, obj): # Check won't accept negative weights @@ -200,8 +198,7 @@ def test_sample_upsampling_without_replacement(self, frame_or_series): obj = tm.get_obj(obj, frame_or_series) msg = ( - "Replace has to be set to `True` when " - "upsampling the population `frac` > 1." + "Replace has to be set to `True` when upsampling the population `frac` > 1." 
) with pytest.raises(ValueError, match=msg): obj.sample(frac=2, replace=False) @@ -335,7 +332,7 @@ def test_sample_aligns_weights_with_frame(self): def test_sample_is_copy(self): # GH#27357, GH#30784: ensure the result of sample is an actual copy and - # doesn't track the parent dataframe / doesn't give SettingWithCopy warnings + # doesn't track the parent dataframe df = DataFrame( np.random.default_rng(2).standard_normal((10, 3)), columns=["a", "b", "c"] ) diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py index 47c479faed1ef..0354e9df3d168 100644 --- a/pandas/tests/frame/methods/test_select_dtypes.py +++ b/pandas/tests/frame/methods/test_select_dtypes.py @@ -32,7 +32,7 @@ def __init__(self, data, dtype) -> None: self.data = data self._dtype = dtype - def __array__(self, dtype): + def __array__(self, dtype=None, copy=None): return self.data @property @@ -50,7 +50,7 @@ def copy(self): class TestSelectDtypes: - def test_select_dtypes_include_using_list_like(self): + def test_select_dtypes_include_using_list_like(self, using_infer_string): df = DataFrame( { "a": list("abc"), @@ -94,6 +94,14 @@ def test_select_dtypes_include_using_list_like(self): with pytest.raises(NotImplementedError, match=r"^$"): df.select_dtypes(include=["period"]) + if using_infer_string: + ri = df.select_dtypes(include=["str"]) + ei = df[["a"]] + tm.assert_frame_equal(ri, ei) + + ri = df.select_dtypes(include=[str]) + tm.assert_frame_equal(ri, ei) + def test_select_dtypes_exclude_using_list_like(self): df = DataFrame( { @@ -151,7 +159,7 @@ def test_select_dtypes_exclude_include_int(self, include): expected = df[["b", "c", "e"]] tm.assert_frame_equal(result, expected) - def test_select_dtypes_include_using_scalars(self): + def test_select_dtypes_include_using_scalars(self, using_infer_string): df = DataFrame( { "a": list("abc"), @@ -187,6 +195,11 @@ def test_select_dtypes_include_using_scalars(self): with pytest.raises(NotImplementedError, match=r"^$"): df.select_dtypes(include="period") + if using_infer_string: + ri = df.select_dtypes(include="str") + ei = df[["a"]] + tm.assert_frame_equal(ri, ei) + def test_select_dtypes_exclude_using_scalars(self): df = DataFrame( { @@ -347,7 +360,10 @@ def test_select_dtypes_datetime_with_tz(self): @pytest.mark.parametrize("dtype", [str, "str", np.bytes_, "S1", np.str_, "U1"]) @pytest.mark.parametrize("arg", ["include", "exclude"]) - def test_select_dtypes_str_raises(self, dtype, arg): + def test_select_dtypes_str_raises(self, dtype, arg, using_infer_string): + if using_infer_string and (dtype == "str" or dtype is str): + # this is tested below + pytest.skip("Selecting string columns works with future strings") df = DataFrame( { "a": list("abc"), diff --git a/pandas/tests/frame/methods/test_set_axis.py b/pandas/tests/frame/methods/test_set_axis.py index 8d249bc7b7fa4..7b75bcf4f348d 100644 --- a/pandas/tests/frame/methods/test_set_axis.py +++ b/pandas/tests/frame/methods/test_set_axis.py @@ -21,7 +21,7 @@ def test_set_axis(self, obj): result = obj.set_axis(new_index, axis=0) tm.assert_equal(expected, result) - def test_set_axis_copy(self, obj, using_copy_on_write): + def test_set_axis_copy(self, obj): # Test copy keyword GH#47932 new_index = list("abcd")[: len(obj)] @@ -29,20 +29,7 @@ def test_set_axis_copy(self, obj, using_copy_on_write): expected = obj.copy() expected.index = new_index - result = obj.set_axis(new_index, axis=0, copy=True) - tm.assert_equal(expected, result) - assert result is not obj - # check we DID 
make a copy - if not using_copy_on_write: - if obj.ndim == 1: - assert not tm.shares_memory(result, obj) - else: - assert not any( - tm.shares_memory(result.iloc[:, i], obj.iloc[:, i]) - for i in range(obj.shape[1]) - ) - - result = obj.set_axis(new_index, axis=0, copy=False) + result = obj.set_axis(new_index, axis=0) tm.assert_equal(expected, result) assert result is not obj # check we did NOT make a copy @@ -54,29 +41,19 @@ for i in range(obj.shape[1]) ) - # copy defaults to True result = obj.set_axis(new_index, axis=0) tm.assert_equal(expected, result) assert result is not obj - if using_copy_on_write: - # check we DID NOT make a copy - if obj.ndim == 1: - assert tm.shares_memory(result, obj) - else: - assert any( - tm.shares_memory(result.iloc[:, i], obj.iloc[:, i]) - for i in range(obj.shape[1]) - ) - # check we DID make a copy - elif obj.ndim == 1: - assert not tm.shares_memory(result, obj) + # check we DID NOT make a copy + if obj.ndim == 1: + assert tm.shares_memory(result, obj) else: - assert not any( + assert any( tm.shares_memory(result.iloc[:, i], obj.iloc[:, i]) for i in range(obj.shape[1]) ) - res = obj.set_axis(new_index, copy=False) + res = obj.set_axis(new_index) tm.assert_equal(expected, res) # check we did NOT make a copy if res.ndim == 1: @@ -116,7 +93,7 @@ def test_set_axis_setattr_index_wrong_length(self, obj): # wrong length msg = ( f"Length mismatch: Expected axis has {len(obj)} elements, " - f"new values have {len(obj)-1} elements" + f"new values have {len(obj) - 1} elements" ) with pytest.raises(ValueError, match=msg): obj.index = np.arange(len(obj) - 1) diff --git a/pandas/tests/frame/methods/test_set_index.py b/pandas/tests/frame/methods/test_set_index.py index 5724f79b82578..198cab0e91eab 100644 --- a/pandas/tests/frame/methods/test_set_index.py +++ b/pandas/tests/frame/methods/test_set_index.py @@ -148,7 +148,7 @@ def test_set_index_dst(self): def test_set_index(self, float_string_frame): df = float_string_frame - idx = Index(np.arange(len(df))[::-1]) + idx = Index(np.arange(len(df) - 1, -1, -1, dtype=np.int64)) df = df.set_index(idx) tm.assert_index_equal(df.index, idx) @@ -577,8 +577,8 @@ def test_set_index_raise_keys(self, frame_of_index_cols, drop, append): @pytest.mark.parametrize("append", [True, False]) @pytest.mark.parametrize("drop", [True, False]) - @pytest.mark.parametrize("box", [set], ids=["set"]) - def test_set_index_raise_on_type(self, frame_of_index_cols, box, drop, append): + def test_set_index_raise_on_type(self, frame_of_index_cols, drop, append): + box = set df = frame_of_index_cols msg = 'The parameter "keys" may be a column key, .*' @@ -628,7 +628,7 @@ def __init__(self, name, color) -> None: self.color = color def __str__(self) -> str: - return f"<Thing {repr(self.name)}>" + return f"<Thing {self.name!r}>" # necessary for pretty KeyError __repr__ = __str__ @@ -706,7 +706,7 @@ def __init__(self, name, color) -> None: self.color = color def __str__(self) -> str: - return f"<Thing {repr(self.name)}>" + return f"<Thing {self.name!r}>" thing1 = Thing("One", "red") thing2 = Thing("Two", "blue") diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index b21aa2d687682..b52240c208493 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -1,8 +1,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd from pandas import ( CategoricalIndex, @@ -32,23 +30,20 @@ def test_shift_axis1_with_valid_fill_value_one_array(self): expected2 =
DataFrame([12345] * 5, dtype="Float64") tm.assert_frame_equal(res2, expected2) - def test_shift_deprecate_freq_and_fill_value(self, frame_or_series): + def test_shift_disallow_freq_and_fill_value(self, frame_or_series): # Can't pass both! obj = frame_or_series( np.random.default_rng(2).standard_normal(5), index=date_range("1/1/2000", periods=5, freq="h"), ) - msg = ( - "Passing a 'freq' together with a 'fill_value' silently ignores the " - "fill_value" - ) - with tm.assert_produces_warning(FutureWarning, match=msg): + msg = "Passing a 'freq' together with a 'fill_value'" + with pytest.raises(ValueError, match=msg): obj.shift(1, fill_value=1, freq="h") if frame_or_series is DataFrame: obj.columns = date_range("1/1/2000", periods=1, freq="h") - with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(ValueError, match=msg): obj.shift(1, axis=1, fill_value=1, freq="h") @pytest.mark.parametrize( @@ -325,7 +320,7 @@ def test_shift_categorical1(self, frame_or_series): def get_cat_values(ndframe): # For Series we could just do ._values; for DataFrame # we may be able to do this if we ever have 2D Categoricals - return ndframe._mgr.arrays[0] + return ndframe._mgr.blocks[0].values cat = get_cat_values(obj) @@ -423,13 +418,12 @@ def test_shift_duplicate_columns(self): tm.assert_frame_equal(shifted[0], shifted[1]) tm.assert_frame_equal(shifted[0], shifted[2]) - def test_shift_axis1_multiple_blocks(self, using_array_manager): + def test_shift_axis1_multiple_blocks(self): # GH#35488 df1 = DataFrame(np.random.default_rng(2).integers(1000, size=(5, 3))) df2 = DataFrame(np.random.default_rng(2).integers(1000, size=(5, 2))) df3 = pd.concat([df1, df2], axis=1) - if not using_array_manager: - assert len(df3._mgr.blocks) == 2 + assert len(df3._mgr.blocks) == 2 result = df3.shift(2, axis=1) @@ -449,8 +443,7 @@ def test_shift_axis1_multiple_blocks(self, using_array_manager): # Case with periods < 0 # rebuild df3 because `take` call above consolidated df3 = pd.concat([df1, df2], axis=1) - if not using_array_manager: - assert len(df3._mgr.blocks) == 2 + assert len(df3._mgr.blocks) == 2 result = df3.shift(-2, axis=1) expected = df3.take([2, 3, 4, -1, -1], axis=1) @@ -466,7 +459,6 @@ def test_shift_axis1_multiple_blocks(self, using_array_manager): tm.assert_frame_equal(result, expected) - @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) axis=1 support def test_shift_axis1_multiple_blocks_with_int_fill(self): # GH#42719 rng = np.random.default_rng(2) @@ -568,7 +560,7 @@ def test_shift_dt64values_int_fill_deprecated(self): # same thing but not consolidated; pre-2.0 we got different behavior df3 = DataFrame({"A": ser}) df3["B"] = ser - assert len(df3._mgr.arrays) == 2 + assert len(df3._mgr.blocks) == 2 result = df3.shift(1, axis=1, fill_value=0) tm.assert_frame_equal(result, expected) @@ -629,7 +621,7 @@ def test_shift_dt64values_axis1_invalid_fill(self, vals, as_cat): # same thing but not consolidated df3 = DataFrame({"A": ser}) df3["B"] = ser - assert len(df3._mgr.arrays) == 2 + assert len(df3._mgr.blocks) == 2 result = df3.shift(-1, axis=1, fill_value="foo") tm.assert_frame_equal(result, expected) @@ -722,13 +714,6 @@ def test_shift_with_iterable_freq_and_fill_value(self): df.shift(1, freq="h"), ) - msg = ( - "Passing a 'freq' together with a 'fill_value' silently ignores the " - "fill_value" - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - df.shift([1, 2], fill_value=1, freq="h") - def test_shift_with_iterable_check_other_arguments(self): # GH#44424 data = 
{"a": [1, 2], "b": [4, 5]} @@ -756,3 +741,28 @@ def test_shift_with_iterable_check_other_arguments(self): msg = "Cannot specify `suffix` if `periods` is an int." with pytest.raises(ValueError, match=msg): df.shift(1, suffix="fails") + + def test_shift_axis_one_empty(self): + # GH#57301 + df = DataFrame() + result = df.shift(1, axis=1) + tm.assert_frame_equal(result, df) + + def test_shift_with_offsets_freq_empty(self): + # GH#60102 + dates = date_range("2020-01-01", periods=3, freq="D") + offset = offsets.Day() + shifted_dates = dates + offset + df = DataFrame(index=dates) + df_shifted = DataFrame(index=shifted_dates) + result = df.shift(freq=offset) + tm.assert_frame_equal(result, df_shifted) + + def test_series_shift_interval_preserves_closed(self): + # GH#60389 + ser = Series( + [pd.Interval(1, 2, closed="right"), pd.Interval(2, 3, closed="right")] + ) + result = ser.shift(1) + expected = Series([np.nan, pd.Interval(1, 2, closed="right")]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index 49e292057e4dc..1a631e760208a 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -773,18 +773,6 @@ def test_sort_index_ascending_bad_value_raises(self, ascending): with pytest.raises(ValueError, match=match): df.sort_index(axis=0, ascending=ascending, na_position="first") - def test_sort_index_use_inf_as_na(self): - # GH 29687 - expected = DataFrame( - {"col1": [1, 2, 3], "col2": [3, 4, 5]}, - index=pd.date_range("2020", periods=3), - ) - msg = "use_inf_as_na option is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - with pd.option_context("mode.use_inf_as_na", True): - result = expected.sort_index() - tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( "ascending", [(True, False), [True, False]], @@ -927,7 +915,6 @@ def test_sort_index_na_position(self): result = df.sort_index(level=[0, 1], na_position="last") tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("ascending", [True, False]) def test_sort_index_multiindex_sort_remaining(self, ascending): # GH #24247 df = DataFrame( @@ -1002,3 +989,44 @@ def test_axis_columns_ignore_index(): result = df.sort_index(axis="columns", ignore_index=True) expected = DataFrame([[2, 1]]) tm.assert_frame_equal(result, expected) + + +def test_axis_columns_ignore_index_ascending_false(): + # GH 57293 + df = DataFrame( + { + "b": [1.0, 3.0, np.nan], + "a": [1, 4, 3], + 1: ["a", "b", "c"], + "e": [3, 1, 4], + "d": [1, 2, 8], + } + ).set_index(["b", "a", 1]) + result = df.sort_index(axis="columns", ignore_index=True, ascending=False) + expected = df.copy() + expected.columns = RangeIndex(2) + tm.assert_frame_equal(result, expected) + + +def test_sort_index_stable_sort(): + # GH 57151 + df = DataFrame( + data=[ + (Timestamp("2024-01-30 13:00:00"), 13.0), + (Timestamp("2024-01-30 13:00:00"), 13.1), + (Timestamp("2024-01-30 12:00:00"), 12.0), + (Timestamp("2024-01-30 12:00:00"), 12.1), + ], + columns=["dt", "value"], + ).set_index(["dt"]) + result = df.sort_index(level="dt", kind="stable") + expected = DataFrame( + data=[ + (Timestamp("2024-01-30 12:00:00"), 12.0), + (Timestamp("2024-01-30 12:00:00"), 12.1), + (Timestamp("2024-01-30 13:00:00"), 13.0), + (Timestamp("2024-01-30 13:00:00"), 13.1), + ], + columns=["dt", "value"], + ).set_index(["dt"]) + tm.assert_frame_equal(result, expected)
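The new test_sort_index_stable_sort above encodes the GH 57151 guarantee that sort_index(kind="stable") preserves the input order of rows whose index keys compare equal. A minimal illustrative sketch of that guarantee, assuming current pandas semantics (not taken from the diff):

import pandas as pd

df = pd.DataFrame(
    {"value": [13.0, 13.1, 12.0, 12.1]},
    index=pd.to_datetime(["2024-01-30 13:00"] * 2 + ["2024-01-30 12:00"] * 2),
)
result = df.sort_index(kind="stable")
# The two 12:00 rows and the two 13:00 rows each keep their relative
# input order after sorting.
assert result["value"].tolist() == [12.0, 12.1, 13.0, 13.1]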
diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index f2f02058a534e..e728526519e9d 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -170,7 +170,7 @@ def test_sort_values_multicolumn_uint64(self): "a": pd.Series([18446637057563306014, 1162265347240853609]), "b": pd.Series([1, 2]), }, - index=pd.Index([1, 0]), + index=range(1, -1, -1), ) tm.assert_frame_equal(result, expected) @@ -331,21 +331,15 @@ def test_sort_values_datetimes(self): df2 = df.sort_values(by=["C", "B"]) tm.assert_frame_equal(df1, df2) - def test_sort_values_frame_column_inplace_sort_exception( - self, float_frame, using_copy_on_write - ): + def test_sort_values_frame_column_inplace_sort_exception(self, float_frame): s = float_frame["A"] float_frame_orig = float_frame.copy() - if using_copy_on_write: - # INFO(CoW) Series is a new object, so can be changed inplace - # without modifying original dataframe - s.sort_values(inplace=True) - tm.assert_series_equal(s, float_frame_orig["A"].sort_values()) - # column in dataframe is not changed - tm.assert_frame_equal(float_frame, float_frame_orig) - else: - with pytest.raises(ValueError, match="This Series is a view"): - s.sort_values(inplace=True) + # INFO(CoW) Series is a new object, so can be changed inplace + # without modifying original dataframe + s.sort_values(inplace=True) + tm.assert_series_equal(s, float_frame_orig["A"].sort_values()) + # column in dataframe is not changed + tm.assert_frame_equal(float_frame, float_frame_orig) cp = s.copy() cp.sort_values() # it works! @@ -366,7 +360,7 @@ def test_sort_values_nat_values_in_int_column(self): df_reversed = DataFrame( {"int": int_values[::-1], "float": float_values[::-1]}, columns=["int", "float"], - index=[1, 0], + index=range(1, -1, -1), ) # NaT is not a "na" for int64 columns, so na_position must not @@ -391,7 +385,7 @@ def test_sort_values_nat_values_in_int_column(self): df_reversed = DataFrame( {"datetime": [NaT, Timestamp("2016-01-01")], "float": float_values[::-1]}, columns=["datetime", "float"], - index=[1, 0], + index=range(1, -1, -1), ) df_sorted = df.sort_values(["datetime", "float"], na_position="first") @@ -546,19 +540,19 @@ def test_sort_values_na_position_with_categories_raises(self): @pytest.mark.parametrize( "original_dict, sorted_dict, ignore_index, output_index", [ - ({"A": [1, 2, 3]}, {"A": [3, 2, 1]}, True, [0, 1, 2]), - ({"A": [1, 2, 3]}, {"A": [3, 2, 1]}, False, [2, 1, 0]), + ({"A": [1, 2, 3]}, {"A": [3, 2, 1]}, True, range(3)), + ({"A": [1, 2, 3]}, {"A": [3, 2, 1]}, False, range(2, -1, -1)), ( {"A": [1, 2, 3], "B": [2, 3, 4]}, {"A": [3, 2, 1], "B": [4, 3, 2]}, True, - [0, 1, 2], + range(3), ), ( {"A": [1, 2, 3], "B": [2, 3, 4]}, {"A": [3, 2, 1], "B": [4, 3, 2]}, False, - [2, 1, 0], + range(2, -1, -1), ), ], ) @@ -598,26 +592,20 @@ def test_sort_values_nat_na_position_default(self): result = expected.sort_values(["A", "date"]) tm.assert_frame_equal(result, expected) - def test_sort_values_item_cache(self, using_array_manager, using_copy_on_write): + def test_sort_values_item_cache(self): # previous behavior incorrectly retained an invalid _item_cache entry df = DataFrame( np.random.default_rng(2).standard_normal((4, 3)), columns=["A", "B", "C"] ) df["D"] = df["A"] * 2 ser = df["A"] - if not using_array_manager: - assert len(df._mgr.blocks) == 2 + assert len(df._mgr.blocks) == 2 df.sort_values(by="A") - if using_copy_on_write: - ser.iloc[0] = 99 - assert df.iloc[0, 0] == df["A"][0] - assert df.iloc[0, 0] != 99 - else: - ser.values[0] = 99 -
assert df.iloc[0, 0] == df["A"][0] - assert df.iloc[0, 0] == 99 + ser.iloc[0] = 99 + assert df.iloc[0, 0] == df["A"][0] + assert df.iloc[0, 0] != 99 def test_sort_values_reshaping(self): # GH 39426 @@ -849,11 +837,6 @@ def sort_names(request): return request.param -@pytest.fixture(params=[True, False]) -def ascending(request): - return request.param - - class TestSortValuesLevelAsStr: def test_sort_index_level_and_column_label( self, df_none, df_idx, sort_names, ascending, request @@ -927,7 +910,6 @@ def test_sort_values_validate_ascending_for_value_error(self): with pytest.raises(ValueError, match=msg): df.sort_values(by="D", ascending="False") - @pytest.mark.parametrize("ascending", [False, 0, 1, True]) def test_sort_values_validate_ascending_functional(self, ascending): df = DataFrame({"D": [23, 7, 21]}) indexer = df["D"].argsort().values diff --git a/pandas/tests/frame/methods/test_swapaxes.py b/pandas/tests/frame/methods/test_swapaxes.py deleted file mode 100644 index 53a4691d48b1c..0000000000000 --- a/pandas/tests/frame/methods/test_swapaxes.py +++ /dev/null @@ -1,37 +0,0 @@ -import numpy as np -import pytest - -from pandas import DataFrame -import pandas._testing as tm - - -class TestSwapAxes: - def test_swapaxes(self): - df = DataFrame(np.random.default_rng(2).standard_normal((10, 5))) - msg = "'DataFrame.swapaxes' is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - tm.assert_frame_equal(df.T, df.swapaxes(0, 1)) - tm.assert_frame_equal(df.T, df.swapaxes(1, 0)) - - def test_swapaxes_noop(self): - df = DataFrame(np.random.default_rng(2).standard_normal((10, 5))) - msg = "'DataFrame.swapaxes' is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - tm.assert_frame_equal(df, df.swapaxes(0, 0)) - - def test_swapaxes_invalid_axis(self): - df = DataFrame(np.random.default_rng(2).standard_normal((10, 5))) - msg = "'DataFrame.swapaxes' is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - msg = "No axis named 2 for object type DataFrame" - with pytest.raises(ValueError, match=msg): - df.swapaxes(2, 5) - - def test_round_empty_not_input(self): - # GH#51032 - df = DataFrame({"a": [1, 2]}) - msg = "'DataFrame.swapaxes' is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.swapaxes("index", "index") - tm.assert_frame_equal(df, result) - assert df is not result diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 250567eafc670..9eafc69013ffe 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -33,125 +33,132 @@ def read_csv(self, path, **kwargs): return read_csv(path, **params) - def test_to_csv_from_csv1(self, float_frame, datetime_frame): - with tm.ensure_clean("__tmp_to_csv_from_csv1__") as path: - float_frame.iloc[:5, float_frame.columns.get_loc("A")] = np.nan - - float_frame.to_csv(path) - float_frame.to_csv(path, columns=["A", "B"]) - float_frame.to_csv(path, header=False) - float_frame.to_csv(path, index=False) - - # test roundtrip - # freq does not roundtrip - datetime_frame.index = datetime_frame.index._with_freq(None) - datetime_frame.to_csv(path) - recons = self.read_csv(path, parse_dates=True) - tm.assert_frame_equal(datetime_frame, recons) - - datetime_frame.to_csv(path, index_label="index") - recons = self.read_csv(path, index_col=None, parse_dates=True) - - assert len(recons.columns) == len(datetime_frame.columns) + 1 - - # no index - datetime_frame.to_csv(path, 
index=False) - recons = self.read_csv(path, index_col=None, parse_dates=True) - tm.assert_almost_equal(datetime_frame.values, recons.values) - - # corner case - dm = DataFrame( - { - "s1": Series(range(3), index=np.arange(3, dtype=np.int64)), - "s2": Series(range(2), index=np.arange(2, dtype=np.int64)), - } - ) - dm.to_csv(path) + def test_to_csv_from_csv1(self, temp_file, float_frame): + path = str(temp_file) + float_frame.iloc[:5, float_frame.columns.get_loc("A")] = np.nan + + float_frame.to_csv(path) + float_frame.to_csv(path, columns=["A", "B"]) + float_frame.to_csv(path, header=False) + float_frame.to_csv(path, index=False) + + def test_to_csv_from_csv1_datetime(self, temp_file, datetime_frame): + path = str(temp_file) + # test roundtrip + # freq does not roundtrip + datetime_frame.index = datetime_frame.index._with_freq(None) + datetime_frame.to_csv(path) + recons = self.read_csv(path, parse_dates=True) + expected = datetime_frame.copy() + expected.index = expected.index.as_unit("s") + tm.assert_frame_equal(expected, recons) + + datetime_frame.to_csv(path, index_label="index") + recons = self.read_csv(path, index_col=None, parse_dates=True) + + assert len(recons.columns) == len(datetime_frame.columns) + 1 + + # no index + datetime_frame.to_csv(path, index=False) + recons = self.read_csv(path, index_col=None, parse_dates=True) + tm.assert_almost_equal(datetime_frame.values, recons.values) + + def test_to_csv_from_csv1_corner_case(self, temp_file): + path = str(temp_file) + dm = DataFrame( + { + "s1": Series(range(3), index=np.arange(3, dtype=np.int64)), + "s2": Series(range(2), index=np.arange(2, dtype=np.int64)), + } + ) + dm.to_csv(path) - recons = self.read_csv(path) - tm.assert_frame_equal(dm, recons) + recons = self.read_csv(path) + tm.assert_frame_equal(dm, recons) - def test_to_csv_from_csv2(self, float_frame): - with tm.ensure_clean("__tmp_to_csv_from_csv2__") as path: - # duplicate index - df = DataFrame( - np.random.default_rng(2).standard_normal((3, 3)), - index=["a", "a", "b"], - columns=["x", "y", "z"], - ) - df.to_csv(path) - result = self.read_csv(path) - tm.assert_frame_equal(result, df) + def test_to_csv_from_csv2(self, temp_file, float_frame): + path = str(temp_file) + # duplicate index + df = DataFrame( + np.random.default_rng(2).standard_normal((3, 3)), + index=["a", "a", "b"], + columns=["x", "y", "z"], + ) + df.to_csv(path) + result = self.read_csv(path) + tm.assert_frame_equal(result, df) - midx = MultiIndex.from_tuples([("A", 1, 2), ("A", 1, 2), ("B", 1, 2)]) - df = DataFrame( - np.random.default_rng(2).standard_normal((3, 3)), - index=midx, - columns=["x", "y", "z"], - ) + midx = MultiIndex.from_tuples([("A", 1, 2), ("A", 1, 2), ("B", 1, 2)]) + df = DataFrame( + np.random.default_rng(2).standard_normal((3, 3)), + index=midx, + columns=["x", "y", "z"], + ) - df.to_csv(path) - result = self.read_csv(path, index_col=[0, 1, 2], parse_dates=False) - tm.assert_frame_equal(result, df, check_names=False) - - # column aliases - col_aliases = Index(["AA", "X", "Y", "Z"]) - float_frame.to_csv(path, header=col_aliases) - - rs = self.read_csv(path) - xp = float_frame.copy() - xp.columns = col_aliases - tm.assert_frame_equal(xp, rs) - - msg = "Writing 4 cols but got 2 aliases" - with pytest.raises(ValueError, match=msg): - float_frame.to_csv(path, header=["AA", "X"]) - - def test_to_csv_from_csv3(self): - with tm.ensure_clean("__tmp_to_csv_from_csv3__") as path: - df1 = DataFrame(np.random.default_rng(2).standard_normal((3, 1))) - df2 = 
             DataFrame(np.random.default_rng(2).standard_normal((3, 1)))
-
-            df1.to_csv(path)
-            df2.to_csv(path, mode="a", header=False)
-            xp = pd.concat([df1, df2])
-            rs = read_csv(path, index_col=0)
-            rs.columns = [int(label) for label in rs.columns]
-            xp.columns = [int(label) for label in xp.columns]
-            tm.assert_frame_equal(xp, rs)
-
-    def test_to_csv_from_csv4(self):
-        with tm.ensure_clean("__tmp_to_csv_from_csv4__") as path:
-            # GH 10833 (TimedeltaIndex formatting)
-            dt = pd.Timedelta(seconds=1)
-            df = DataFrame(
-                {"dt_data": [i * dt for i in range(3)]},
-                index=Index([i * dt for i in range(3)], name="dt_index"),
-            )
-            df.to_csv(path)
+        df.to_csv(path)
+        result = self.read_csv(path, index_col=[0, 1, 2], parse_dates=False)
+        tm.assert_frame_equal(result, df, check_names=False)
+
+        # column aliases
+        col_aliases = Index(["AA", "X", "Y", "Z"])
+        float_frame.to_csv(path, header=col_aliases)
+
+        rs = self.read_csv(path)
+        xp = float_frame.copy()
+        xp.columns = col_aliases
+        tm.assert_frame_equal(xp, rs)
+
+        msg = "Writing 4 cols but got 2 aliases"
+        with pytest.raises(ValueError, match=msg):
+            float_frame.to_csv(path, header=["AA", "X"])
+
+    def test_to_csv_from_csv3(self, temp_file):
+        path = str(temp_file)
+        df1 = DataFrame(np.random.default_rng(2).standard_normal((3, 1)))
+        df2 = DataFrame(np.random.default_rng(2).standard_normal((3, 1)))
+
+        df1.to_csv(path)
+        df2.to_csv(path, mode="a", header=False)
+        xp = pd.concat([df1, df2])
+        rs = read_csv(path, index_col=0)
+        rs.columns = [int(label) for label in rs.columns]
+        xp.columns = [int(label) for label in xp.columns]
+        tm.assert_frame_equal(xp, rs)
+
+    def test_to_csv_from_csv4(self, temp_file):
+        path = str(temp_file)
+        # GH 10833 (TimedeltaIndex formatting)
+        dt = pd.Timedelta(seconds=1)
+        df = DataFrame(
+            {"dt_data": [i * dt for i in range(3)]},
+            index=Index([i * dt for i in range(3)], name="dt_index"),
+        )
+        df.to_csv(path)
 
-            result = read_csv(path, index_col="dt_index")
-            result.index = pd.to_timedelta(result.index)
-            result["dt_data"] = pd.to_timedelta(result["dt_data"])
+        result = read_csv(path, index_col="dt_index")
+        result.index = pd.to_timedelta(result.index)
+        result["dt_data"] = pd.to_timedelta(result["dt_data"])
 
-            tm.assert_frame_equal(df, result, check_index_type=True)
+        tm.assert_frame_equal(df, result, check_index_type=True)
 
-    def test_to_csv_from_csv5(self, timezone_frame):
+    def test_to_csv_from_csv5(self, temp_file, timezone_frame):
         # tz, 8260
-        with tm.ensure_clean("__tmp_to_csv_from_csv5__") as path:
-            timezone_frame.to_csv(path)
-            result = read_csv(path, index_col=0, parse_dates=["A"])
-
-            converter = (
-                lambda c: to_datetime(result[c])
-                .dt.tz_convert("UTC")
-                .dt.tz_convert(timezone_frame[c].dt.tz)
-            )
-            result["B"] = converter("B")
-            result["C"] = converter("C")
-            tm.assert_frame_equal(result, timezone_frame)
+        path = str(temp_file)
+        timezone_frame.to_csv(path)
+        result = read_csv(path, index_col=0, parse_dates=["A"])
+
+        converter = (
+            lambda c: to_datetime(result[c])
+            .dt.tz_convert("UTC")
+            .dt.tz_convert(timezone_frame[c].dt.tz)
+            .dt.as_unit("ns")
+        )
+        result["B"] = converter("B")
+        result["C"] = converter("C")
+        result["A"] = result["A"].dt.as_unit("ns")
+        tm.assert_frame_equal(result, timezone_frame)
 
-    def test_to_csv_cols_reordering(self):
+    def test_to_csv_cols_reordering(self, temp_file):
         # GH3454
         chunksize = 5
         N = int(chunksize * 2.5)
@@ -164,14 +171,14 @@ def test_to_csv_cols_reordering(self):
         cs = df.columns
         cols = [cs[2], cs[0]]
 
-        with tm.ensure_clean() as path:
-            df.to_csv(path, columns=cols, chunksize=chunksize)
-            rs_c = read_csv(path, index_col=0)
+        path = str(temp_file)
+        df.to_csv(path, columns=cols, chunksize=chunksize)
+        rs_c = read_csv(path, index_col=0)
 
         tm.assert_frame_equal(df[cols], rs_c, check_names=False)
 
     @pytest.mark.parametrize("cols", [None, ["b", "a"]])
-    def test_to_csv_new_dupe_cols(self, cols):
+    def test_to_csv_new_dupe_cols(self, temp_file, cols):
         chunksize = 5
         N = int(chunksize * 2.5)
@@ -181,34 +188,34 @@ def test_to_csv_new_dupe_cols(self, cols):
             index=Index([f"i-{i}" for i in range(N)], name="a"),
             columns=["a", "a", "b"],
         )
-        with tm.ensure_clean() as path:
-            df.to_csv(path, columns=cols, chunksize=chunksize)
-            rs_c = read_csv(path, index_col=0)
-
-            # we wrote them in a different order
-            # so compare them in that order
-            if cols is not None:
-                if df.columns.is_unique:
-                    rs_c.columns = cols
-                else:
-                    indexer, missing = df.columns.get_indexer_non_unique(cols)
-                    rs_c.columns = df.columns.take(indexer)
-
-                for c in cols:
-                    obj_df = df[c]
-                    obj_rs = rs_c[c]
-                    if isinstance(obj_df, Series):
-                        tm.assert_series_equal(obj_df, obj_rs)
-                    else:
-                        tm.assert_frame_equal(obj_df, obj_rs, check_names=False)
-
-            # wrote in the same order
+        path = str(temp_file)
+        df.to_csv(path, columns=cols, chunksize=chunksize)
+        rs_c = read_csv(path, index_col=0)
+
+        # we wrote them in a different order
+        # so compare them in that order
+        if cols is not None:
+            if df.columns.is_unique:
+                rs_c.columns = cols
             else:
-                rs_c.columns = df.columns
-                tm.assert_frame_equal(df, rs_c, check_names=False)
+                indexer, missing = df.columns.get_indexer_non_unique(cols)
+                rs_c.columns = df.columns.take(indexer)
+
+            for c in cols:
+                obj_df = df[c]
+                obj_rs = rs_c[c]
+                if isinstance(obj_df, Series):
+                    tm.assert_series_equal(obj_df, obj_rs)
+                else:
+                    tm.assert_frame_equal(obj_df, obj_rs, check_names=False)
+
+        # wrote in the same order
+        else:
+            rs_c.columns = df.columns
+            tm.assert_frame_equal(df, rs_c, check_names=False)
 
     @pytest.mark.slow
-    def test_to_csv_dtnat(self):
+    def test_to_csv_dtnat(self, temp_file):
         # GH3437
         def make_dtnat_arr(n, nnat=None):
             if nnat is None:
@@ -226,12 +233,16 @@ def make_dtnat_arr(n, nnat=None):
         s1 = make_dtnat_arr(chunksize + 5)
         s2 = make_dtnat_arr(chunksize + 5, 0)
 
-        with tm.ensure_clean("1.csv") as pth:
-            df = DataFrame({"a": s1, "b": s2})
-            df.to_csv(pth, chunksize=chunksize)
+        path = str(temp_file)
+        df = DataFrame({"a": s1, "b": s2})
+        df.to_csv(path, chunksize=chunksize)
+
+        result = self.read_csv(path).apply(to_datetime)
 
-            recons = self.read_csv(pth).apply(to_datetime)
-            tm.assert_frame_equal(df, recons, check_names=False)
+        expected = df[:]
+        expected["a"] = expected["a"].astype("M8[s]")
+        expected["b"] = expected["b"].astype("M8[s]")
+        tm.assert_frame_equal(result, expected, check_names=False)
 
     def _return_result_expected(
         self,
@@ -349,6 +360,7 @@ def test_to_csv_nrows(self, nrows):
             columns=Index(list("abcd"), dtype=object),
         )
         result, expected = self._return_result_expected(df, 1000, "dt", "s")
+        expected.index = expected.index.astype("M8[ns]")
         tm.assert_frame_equal(result, expected, check_names=False)
 
     @pytest.mark.slow
@@ -378,6 +390,10 @@ def test_to_csv_idx_types(self, nrows, r_idx_type, c_idx_type, ncols):
             r_idx_type,
             c_idx_type,
         )
+        if r_idx_type in ["dt", "p"]:
+            expected.index = expected.index.astype("M8[ns]")
+        if c_idx_type in ["dt", "p"]:
+            expected.columns = expected.columns.astype("M8[ns]")
         tm.assert_frame_equal(result, expected, check_names=False)
 
     @pytest.mark.slow
@@ -426,7 +442,7 @@ def test_to_csv_chunksize(self):
         rows = chunksize // 2 + 1
         df = DataFrame(
             np.ones((rows, 2)),
-            columns=Index(list("ab"), dtype=object),
+            columns=Index(list("ab")),
            index=MultiIndex.from_arrays([range(rows) for _ in range(2)]),
         )
         result, expected = self._return_result_expected(df, chunksize, rnlvl=2)
@@ -460,47 +476,47 @@ def test_to_csv_params(self, nrows, df_params, func_params, ncols):
                 for _ in range(df_params["c_idx_nlevels"])
             )
         else:
-            columns = Index([f"i-{i}" for i in range(ncols)], dtype=object)
+            columns = Index([f"i-{i}" for i in range(ncols)])
         df = DataFrame(np.ones((nrows, ncols)), index=index, columns=columns)
         result, expected = self._return_result_expected(df, 1000, **func_params)
         tm.assert_frame_equal(result, expected, check_names=False)
 
-    def test_to_csv_from_csv_w_some_infs(self, float_frame):
+    def test_to_csv_from_csv_w_some_infs(self, temp_file, float_frame):
         # test roundtrip with inf, -inf, nan, as full columns and mix
         float_frame["G"] = np.nan
         f = lambda x: [np.inf, np.nan][np.random.default_rng(2).random() < 0.5]
         float_frame["h"] = float_frame.index.map(f)
 
-        with tm.ensure_clean() as path:
-            float_frame.to_csv(path)
-            recons = self.read_csv(path)
+        path = str(temp_file)
+        float_frame.to_csv(path)
+        recons = self.read_csv(path)
 
-            tm.assert_frame_equal(float_frame, recons)
-            tm.assert_frame_equal(np.isinf(float_frame), np.isinf(recons))
+        tm.assert_frame_equal(float_frame, recons)
+        tm.assert_frame_equal(np.isinf(float_frame), np.isinf(recons))
 
-    def test_to_csv_from_csv_w_all_infs(self, float_frame):
+    def test_to_csv_from_csv_w_all_infs(self, temp_file, float_frame):
         # test roundtrip with inf, -inf, nan, as full columns and mix
         float_frame["E"] = np.inf
         float_frame["F"] = -np.inf
 
-        with tm.ensure_clean() as path:
-            float_frame.to_csv(path)
-            recons = self.read_csv(path)
+        path = str(temp_file)
+        float_frame.to_csv(path)
+        recons = self.read_csv(path)
 
-            tm.assert_frame_equal(float_frame, recons)
-            tm.assert_frame_equal(np.isinf(float_frame), np.isinf(recons))
+        tm.assert_frame_equal(float_frame, recons)
+        tm.assert_frame_equal(np.isinf(float_frame), np.isinf(recons))
 
-    def test_to_csv_no_index(self):
+    def test_to_csv_no_index(self, temp_file):
         # GH 3624, after appending columns, to_csv fails
-        with tm.ensure_clean("__tmp_to_csv_no_index__") as path:
-            df = DataFrame({"c1": [1, 2, 3], "c2": [4, 5, 6]})
-            df.to_csv(path, index=False)
-            result = read_csv(path)
-            tm.assert_frame_equal(df, result)
-            df["c3"] = Series([7, 8, 9], dtype="int64")
-            df.to_csv(path, index=False)
-            result = read_csv(path)
-            tm.assert_frame_equal(df, result)
+        path = str(temp_file)
+        df = DataFrame({"c1": [1, 2, 3], "c2": [4, 5, 6]})
+        df.to_csv(path, index=False)
+        result = read_csv(path)
+        tm.assert_frame_equal(df, result)
+        df["c3"] = Series([7, 8, 9], dtype="int64")
+        df.to_csv(path, index=False)
+        result = read_csv(path)
+        tm.assert_frame_equal(df, result)
 
     def test_to_csv_with_mix_columns(self):
         # gh-11637: incorrect output when a mix of integer and string column
@@ -510,74 +526,74 @@ def test_to_csv_with_mix_columns(self):
         df["test"] = "txt"
         assert df.to_csv() == df.to_csv(columns=[0, 1, "test"])
 
-    def test_to_csv_headers(self):
+    def test_to_csv_headers(self, temp_file):
         # GH6186, the presence or absence of `index` incorrectly
         # causes to_csv to have different header semantics.
         from_df = DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
         to_df = DataFrame([[1, 2], [3, 4]], columns=["X", "Y"])
-        with tm.ensure_clean("__tmp_to_csv_headers__") as path:
-            from_df.to_csv(path, header=["X", "Y"])
-            recons = self.read_csv(path)
+        path = str(temp_file)
+        from_df.to_csv(path, header=["X", "Y"])
+        recons = self.read_csv(path)
 
-            tm.assert_frame_equal(to_df, recons)
+        tm.assert_frame_equal(to_df, recons)
 
-            from_df.to_csv(path, index=False, header=["X", "Y"])
-            recons = self.read_csv(path)
+        from_df.to_csv(path, index=False, header=["X", "Y"])
+        recons = self.read_csv(path)
 
-            return_value = recons.reset_index(inplace=True)
-            assert return_value is None
-            tm.assert_frame_equal(to_df, recons)
+        return_value = recons.reset_index(inplace=True)
+        assert return_value is None
+        tm.assert_frame_equal(to_df, recons)
 
-    def test_to_csv_multiindex(self, float_frame, datetime_frame):
+    def test_to_csv_multiindex(self, temp_file, float_frame, datetime_frame):
         frame = float_frame
         old_index = frame.index
         arrays = np.arange(len(old_index) * 2, dtype=np.int64).reshape(2, -1)
         new_index = MultiIndex.from_arrays(arrays, names=["first", "second"])
         frame.index = new_index
 
-        with tm.ensure_clean("__tmp_to_csv_multiindex__") as path:
-            frame.to_csv(path, header=False)
-            frame.to_csv(path, columns=["A", "B"])
+        path = str(temp_file)
+        frame.to_csv(path, header=False)
+        frame.to_csv(path, columns=["A", "B"])
 
-            # round trip
-            frame.to_csv(path)
+        # round trip
+        frame.to_csv(path)
 
-            df = self.read_csv(path, index_col=[0, 1], parse_dates=False)
+        df = self.read_csv(path, index_col=[0, 1], parse_dates=False)
 
-            # TODO to_csv drops column name
-            tm.assert_frame_equal(frame, df, check_names=False)
-            assert frame.index.names == df.index.names
+        # TODO to_csv drops column name
+        tm.assert_frame_equal(frame, df, check_names=False)
+        assert frame.index.names == df.index.names
 
-            # needed if setUp becomes a class method
-            float_frame.index = old_index
+        # needed if setUp becomes a class method
+        float_frame.index = old_index
 
-            # try multiindex with dates
-            tsframe = datetime_frame
-            old_index = tsframe.index
-            new_index = [old_index, np.arange(len(old_index), dtype=np.int64)]
-            tsframe.index = MultiIndex.from_arrays(new_index)
+        # try multiindex with dates
+        tsframe = datetime_frame
+        old_index = tsframe.index
+        new_index = [old_index, np.arange(len(old_index), dtype=np.int64)]
+        tsframe.index = MultiIndex.from_arrays(new_index)
 
-            tsframe.to_csv(path, index_label=["time", "foo"])
-            with tm.assert_produces_warning(
-                UserWarning, match="Could not infer format"
-            ):
-                recons = self.read_csv(path, index_col=[0, 1], parse_dates=True)
+        tsframe.to_csv(path, index_label=["time", "foo"])
+        with tm.assert_produces_warning(UserWarning, match="Could not infer format"):
+            recons = self.read_csv(path, index_col=[0, 1], parse_dates=True)
 
-            # TODO to_csv drops column name
-            tm.assert_frame_equal(tsframe, recons, check_names=False)
+        # TODO to_csv drops column name
+        expected = tsframe.copy()
+        expected.index = MultiIndex.from_arrays([old_index.as_unit("s"), new_index[1]])
+        tm.assert_frame_equal(recons, expected, check_names=False)
 
-            # do not load index
-            tsframe.to_csv(path)
-            recons = self.read_csv(path, index_col=None)
-            assert len(recons.columns) == len(tsframe.columns) + 2
+        # do not load index
+        tsframe.to_csv(path)
+        recons = self.read_csv(path, index_col=None)
+        assert len(recons.columns) == len(tsframe.columns) + 2
 
-            # no index
-            tsframe.to_csv(path, index=False)
-            recons = self.read_csv(path, index_col=None)
-            tm.assert_almost_equal(recons.values, datetime_frame.values)
+        # no index
+        tsframe.to_csv(path, index=False)
+        recons = self.read_csv(path, index_col=None)
+        tm.assert_almost_equal(recons.values, datetime_frame.values)
 
-            # needed if setUp becomes class method
-            datetime_frame.index = old_index
+        # needed if setUp becomes class method
+        datetime_frame.index = old_index
 
         with tm.ensure_clean("__tmp_to_csv_multiindex__") as path:
             # GH3571, GH1651, GH3141
@@ -682,46 +698,43 @@ def _make_frame(names=None):
         tm.assert_index_equal(recons.columns, exp.columns)
         assert len(recons) == 0
 
-    def test_to_csv_interval_index(self, using_infer_string):
+    def test_to_csv_interval_index(self, temp_file, using_infer_string):
         # GH 28210
         df = DataFrame({"A": list("abc"), "B": range(3)}, index=pd.interval_range(0, 3))
 
-        with tm.ensure_clean("__tmp_to_csv_interval_index__.csv") as path:
-            df.to_csv(path)
-            result = self.read_csv(path, index_col=0)
+        path = str(temp_file)
+        df.to_csv(path)
+        result = self.read_csv(path, index_col=0)
 
-            # can't roundtrip intervalindex via read_csv so check string repr (GH 23595)
-            expected = df.copy()
-            if using_infer_string:
-                expected.index = expected.index.astype("string[pyarrow_numpy]")
-            else:
-                expected.index = expected.index.astype(str)
+        # can't roundtrip intervalindex via read_csv so check string repr (GH 23595)
+        expected = df.copy()
+        expected.index = expected.index.astype("str")
 
-            tm.assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
-    def test_to_csv_float32_nanrep(self):
+    def test_to_csv_float32_nanrep(self, temp_file):
         df = DataFrame(
             np.random.default_rng(2).standard_normal((1, 4)).astype(np.float32)
         )
         df[1] = np.nan
 
-        with tm.ensure_clean("__tmp_to_csv_float32_nanrep__.csv") as path:
-            df.to_csv(path, na_rep=999)
+        path = str(temp_file)
+        df.to_csv(path, na_rep=999)
 
-            with open(path, encoding="utf-8") as f:
-                lines = f.readlines()
-                assert lines[1].split(",")[2] == "999"
+        with open(path, encoding="utf-8") as f:
+            lines = f.readlines()
+            assert lines[1].split(",")[2] == "999"
 
-    def test_to_csv_withcommas(self):
+    def test_to_csv_withcommas(self, temp_file):
         # Commas inside fields should be correctly escaped when saving as CSV.
df = DataFrame({"A": [1, 2, 3], "B": ["5,6", "7,8", "9,0"]}) - with tm.ensure_clean("__tmp_to_csv_withcommas__.csv") as path: - df.to_csv(path) - df2 = self.read_csv(path) - tm.assert_frame_equal(df2, df) + path = str(temp_file) + df.to_csv(path) + df2 = self.read_csv(path) + tm.assert_frame_equal(df2, df) - def test_to_csv_mixed(self): + def test_to_csv_mixed(self, temp_file): def create_cols(name): return [f"{name}{i:03d}" for i in range(5)] @@ -737,10 +750,10 @@ def create_cols(name): ) df_bool = DataFrame(True, index=df_float.index, columns=create_cols("bool")) df_object = DataFrame( - "foo", index=df_float.index, columns=create_cols("object") + "foo", index=df_float.index, columns=create_cols("object"), dtype="object" ) df_dt = DataFrame( - Timestamp("20010101").as_unit("ns"), + Timestamp("20010101"), index=df_float.index, columns=create_cols("date"), ) @@ -762,25 +775,23 @@ def create_cols(name): for c in create_cols(n): dtypes[c] = dtype - with tm.ensure_clean() as filename: - df.to_csv(filename) - rs = read_csv( - filename, index_col=0, dtype=dtypes, parse_dates=create_cols("date") - ) - tm.assert_frame_equal(rs, df) + path = str(temp_file) + df.to_csv(path) + rs = read_csv(path, index_col=0, dtype=dtypes, parse_dates=create_cols("date")) + tm.assert_frame_equal(rs, df) - def test_to_csv_dups_cols(self): + def test_to_csv_dups_cols(self, temp_file): df = DataFrame( np.random.default_rng(2).standard_normal((1000, 30)), columns=list(range(15)) + list(range(15)), dtype="float64", ) - with tm.ensure_clean() as filename: - df.to_csv(filename) # single dtype, fine - result = read_csv(filename, index_col=0) - result.columns = df.columns - tm.assert_frame_equal(result, df) + path = str(temp_file) + df.to_csv(path) # single dtype, fine + result = read_csv(path, index_col=0) + result.columns = df.columns + tm.assert_frame_equal(result, df) df_float = DataFrame( np.random.default_rng(2).standard_normal((1000, 3)), dtype="float64" @@ -790,9 +801,7 @@ def test_to_csv_dups_cols(self): ) df_bool = DataFrame(True, index=df_float.index, columns=range(3)) df_object = DataFrame("foo", index=df_float.index, columns=range(3)) - df_dt = DataFrame( - Timestamp("20010101").as_unit("ns"), index=df_float.index, columns=range(3) - ) + df_dt = DataFrame(Timestamp("20010101"), index=df_float.index, columns=range(3)) df = pd.concat( [df_float, df_int, df_bool, df_object, df_dt], axis=1, ignore_index=True ) @@ -810,36 +819,36 @@ def test_to_csv_dups_cols(self): result.columns = df.columns tm.assert_frame_equal(result, df) - def test_to_csv_dups_cols2(self): + def test_to_csv_dups_cols2(self, temp_file): # GH3457 df = DataFrame( np.ones((5, 3)), index=Index([f"i-{i}" for i in range(5)], name="foo"), - columns=Index(["a", "a", "b"], dtype=object), + columns=Index(["a", "a", "b"]), ) - with tm.ensure_clean() as filename: - df.to_csv(filename) + path = str(temp_file) + df.to_csv(path) - # read_csv will rename the dups columns - result = read_csv(filename, index_col=0) - result = result.rename(columns={"a.1": "a"}) - tm.assert_frame_equal(result, df) + # read_csv will rename the dups columns + result = read_csv(path, index_col=0) + result = result.rename(columns={"a.1": "a"}) + tm.assert_frame_equal(result, df) @pytest.mark.parametrize("chunksize", [10000, 50000, 100000]) - def test_to_csv_chunking(self, chunksize): + def test_to_csv_chunking(self, chunksize, temp_file): aa = DataFrame({"A": range(100000)}) aa["B"] = aa.A + 1.0 aa["C"] = aa.A + 2.0 aa["D"] = aa.A + 3.0 - with tm.ensure_clean() as filename: - 
aa.to_csv(filename, chunksize=chunksize) - rs = read_csv(filename, index_col=0) - tm.assert_frame_equal(rs, aa) + path = str(temp_file) + aa.to_csv(path, chunksize=chunksize) + rs = read_csv(path, index_col=0) + tm.assert_frame_equal(rs, aa) @pytest.mark.slow - def test_to_csv_wide_frame_formatting(self, monkeypatch): + def test_to_csv_wide_frame_formatting(self, temp_file, monkeypatch): # Issue #8621 chunksize = 100 df = DataFrame( @@ -847,35 +856,35 @@ def test_to_csv_wide_frame_formatting(self, monkeypatch): columns=None, index=None, ) - with tm.ensure_clean() as filename: - with monkeypatch.context() as m: - m.setattr("pandas.io.formats.csvs._DEFAULT_CHUNKSIZE_CELLS", chunksize) - df.to_csv(filename, header=False, index=False) - rs = read_csv(filename, header=None) + path = str(temp_file) + with monkeypatch.context() as m: + m.setattr("pandas.io.formats.csvs._DEFAULT_CHUNKSIZE_CELLS", chunksize) + df.to_csv(path, header=False, index=False) + rs = read_csv(path, header=None) tm.assert_frame_equal(rs, df) - def test_to_csv_bug(self): + def test_to_csv_bug(self, temp_file): f1 = StringIO("a,1.0\nb,2.0") df = self.read_csv(f1, header=None) newdf = DataFrame({"t": df[df.columns[0]]}) - with tm.ensure_clean() as path: - newdf.to_csv(path) + path = str(temp_file) + newdf.to_csv(path) - recons = read_csv(path, index_col=0) - # don't check_names as t != 1 - tm.assert_frame_equal(recons, newdf, check_names=False) + recons = read_csv(path, index_col=0) + # don't check_names as t != 1 + tm.assert_frame_equal(recons, newdf, check_names=False) - def test_to_csv_unicode(self): + def test_to_csv_unicode(self, temp_file): df = DataFrame({"c/\u03c3": [1, 2, 3]}) - with tm.ensure_clean() as path: - df.to_csv(path, encoding="UTF-8") - df2 = read_csv(path, index_col=0, encoding="UTF-8") - tm.assert_frame_equal(df, df2) + path = str(temp_file) + df.to_csv(path, encoding="UTF-8") + df2 = read_csv(path, index_col=0, encoding="UTF-8") + tm.assert_frame_equal(df, df2) - df.to_csv(path, encoding="UTF-8", index=False) - df2 = read_csv(path, index_col=None, encoding="UTF-8") - tm.assert_frame_equal(df, df2) + df.to_csv(path, encoding="UTF-8", index=False) + df2 = read_csv(path, index_col=None, encoding="UTF-8") + tm.assert_frame_equal(df, df2) def test_to_csv_unicode_index_col(self): buf = StringIO("") @@ -898,23 +907,23 @@ def test_to_csv_stringio(self, float_frame): recons = read_csv(buf, index_col=0) tm.assert_frame_equal(recons, float_frame) - def test_to_csv_float_format(self): + def test_to_csv_float_format(self, temp_file): df = DataFrame( [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], index=["A", "B"], columns=["X", "Y", "Z"], ) - with tm.ensure_clean() as filename: - df.to_csv(filename, float_format="%.2f") + path = str(temp_file) + df.to_csv(path, float_format="%.2f") - rs = read_csv(filename, index_col=0) - xp = DataFrame( - [[0.12, 0.23, 0.57], [12.32, 123123.20, 321321.20]], - index=["A", "B"], - columns=["X", "Y", "Z"], - ) - tm.assert_frame_equal(rs, xp) + rs = read_csv(path, index_col=0) + xp = DataFrame( + [[0.12, 0.23, 0.57], [12.32, 123123.20, 321321.20]], + index=["A", "B"], + columns=["X", "Y", "Z"], + ) + tm.assert_frame_equal(rs, xp) def test_to_csv_float_format_over_decimal(self): # GH#47436 @@ -961,43 +970,50 @@ def test_to_csv_index_no_leading_comma(self): expected = tm.convert_rows_list_to_csv_str(expected_rows) assert buf.getvalue() == expected - def test_to_csv_lineterminators(self): + def test_to_csv_lineterminators(self, temp_file): # see gh-20353 df = 
DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["one", "two", "three"]) - with tm.ensure_clean() as path: - # case 1: CRLF as line terminator - df.to_csv(path, lineterminator="\r\n") - expected = b",A,B\r\none,1,4\r\ntwo,2,5\r\nthree,3,6\r\n" + path = str(temp_file) + # case 1: CRLF as line terminator + df.to_csv(path, lineterminator="\r\n") + expected = b",A,B\r\none,1,4\r\ntwo,2,5\r\nthree,3,6\r\n" - with open(path, mode="rb") as f: - assert f.read() == expected + with open(path, mode="rb") as f: + assert f.read() == expected - with tm.ensure_clean() as path: - # case 2: LF as line terminator - df.to_csv(path, lineterminator="\n") - expected = b",A,B\none,1,4\ntwo,2,5\nthree,3,6\n" + def test_to_csv_lineterminators2(self, temp_file): + # see gh-20353 + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["one", "two", "three"]) - with open(path, mode="rb") as f: - assert f.read() == expected + path = str(temp_file) + # case 2: LF as line terminator + df.to_csv(path, lineterminator="\n") + expected = b",A,B\none,1,4\ntwo,2,5\nthree,3,6\n" - with tm.ensure_clean() as path: - # case 3: The default line terminator(=os.linesep)(gh-21406) - df.to_csv(path) - os_linesep = os.linesep.encode("utf-8") - expected = ( - b",A,B" - + os_linesep - + b"one,1,4" - + os_linesep - + b"two,2,5" - + os_linesep - + b"three,3,6" - + os_linesep - ) + with open(path, mode="rb") as f: + assert f.read() == expected - with open(path, mode="rb") as f: - assert f.read() == expected + def test_to_csv_lineterminators3(self, temp_file): + # see gh-20353 + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["one", "two", "three"]) + path = str(temp_file) + # case 3: The default line terminator(=os.linesep)(gh-21406) + df.to_csv(path) + os_linesep = os.linesep.encode("utf-8") + expected = ( + b",A,B" + + os_linesep + + b"one,1,4" + + os_linesep + + b"two,2,5" + + os_linesep + + b"three,3,6" + + os_linesep + ) + + with open(path, mode="rb") as f: + assert f.read() == expected def test_to_csv_from_csv_categorical(self): # CSV with categoricals should result in the same output @@ -1055,130 +1071,139 @@ def test_to_csv_path_is_none(self, float_frame): ), ], ) - def test_to_csv_compression(self, df, encoding, compression): - with tm.ensure_clean() as filename: - df.to_csv(filename, compression=compression, encoding=encoding) - # test the round trip - to_csv -> read_csv - result = read_csv( - filename, compression=compression, index_col=0, encoding=encoding - ) - tm.assert_frame_equal(df, result) - - # test the round trip using file handle - to_csv -> read_csv - with get_handle( - filename, "w", compression=compression, encoding=encoding - ) as handles: - df.to_csv(handles.handle, encoding=encoding) - assert not handles.handle.closed - - result = read_csv( - filename, - compression=compression, - encoding=encoding, - index_col=0, - ).squeeze("columns") - tm.assert_frame_equal(df, result) - - # explicitly make sure file is compressed - with tm.decompress_file(filename, compression) as fh: - text = fh.read().decode(encoding or "utf8") - for col in df.columns: - assert col in text - - with tm.decompress_file(filename, compression) as fh: - tm.assert_frame_equal(df, read_csv(fh, index_col=0, encoding=encoding)) - - def test_to_csv_date_format(self, datetime_frame): - with tm.ensure_clean("__tmp_to_csv_date_format__") as path: - dt_index = datetime_frame.index - datetime_frame = DataFrame( - {"A": dt_index, "B": dt_index.shift(1)}, index=dt_index - ) - datetime_frame.to_csv(path, date_format="%Y%m%d") + def 
test_to_csv_compression(self, temp_file, df, encoding, compression): + path = str(temp_file) + df.to_csv(path, compression=compression, encoding=encoding) + # test the round trip - to_csv -> read_csv + result = read_csv(path, compression=compression, index_col=0, encoding=encoding) + tm.assert_frame_equal(df, result) + + # test the round trip using file handle - to_csv -> read_csv + with get_handle( + path, "w", compression=compression, encoding=encoding + ) as handles: + df.to_csv(handles.handle, encoding=encoding) + assert not handles.handle.closed + + result = read_csv( + path, + compression=compression, + encoding=encoding, + index_col=0, + ).squeeze("columns") + tm.assert_frame_equal(df, result) + + # explicitly make sure file is compressed + with tm.decompress_file(path, compression) as fh: + text = fh.read().decode(encoding or "utf8") + for col in df.columns: + assert col in text + + with tm.decompress_file(path, compression) as fh: + tm.assert_frame_equal(df, read_csv(fh, index_col=0, encoding=encoding)) + + def test_to_csv_date_format(self, temp_file, datetime_frame): + path = str(temp_file) + dt_index = datetime_frame.index + datetime_frame = DataFrame( + {"A": dt_index, "B": dt_index.shift(1)}, index=dt_index + ) + datetime_frame.to_csv(path, date_format="%Y%m%d") - # Check that the data was put in the specified format - test = read_csv(path, index_col=0) + # Check that the data was put in the specified format + test = read_csv(path, index_col=0) - datetime_frame_int = datetime_frame.map(lambda x: int(x.strftime("%Y%m%d"))) - datetime_frame_int.index = datetime_frame_int.index.map( - lambda x: int(x.strftime("%Y%m%d")) - ) + datetime_frame_int = datetime_frame.map(lambda x: int(x.strftime("%Y%m%d"))) + datetime_frame_int.index = datetime_frame_int.index.map( + lambda x: int(x.strftime("%Y%m%d")) + ) - tm.assert_frame_equal(test, datetime_frame_int) + tm.assert_frame_equal(test, datetime_frame_int) - datetime_frame.to_csv(path, date_format="%Y-%m-%d") + datetime_frame.to_csv(path, date_format="%Y-%m-%d") - # Check that the data was put in the specified format - test = read_csv(path, index_col=0) - datetime_frame_str = datetime_frame.map(lambda x: x.strftime("%Y-%m-%d")) - datetime_frame_str.index = datetime_frame_str.index.map( - lambda x: x.strftime("%Y-%m-%d") - ) + # Check that the data was put in the specified format + test = read_csv(path, index_col=0) + datetime_frame_str = datetime_frame.map(lambda x: x.strftime("%Y-%m-%d")) + datetime_frame_str.index = datetime_frame_str.index.map( + lambda x: x.strftime("%Y-%m-%d") + ) - tm.assert_frame_equal(test, datetime_frame_str) + tm.assert_frame_equal(test, datetime_frame_str) - # Check that columns get converted - datetime_frame_columns = datetime_frame.T - datetime_frame_columns.to_csv(path, date_format="%Y%m%d") + # Check that columns get converted + datetime_frame_columns = datetime_frame.T + datetime_frame_columns.to_csv(path, date_format="%Y%m%d") - test = read_csv(path, index_col=0) + test = read_csv(path, index_col=0) - datetime_frame_columns = datetime_frame_columns.map( - lambda x: int(x.strftime("%Y%m%d")) - ) - # Columns don't get converted to ints by read_csv - datetime_frame_columns.columns = datetime_frame_columns.columns.map( - lambda x: x.strftime("%Y%m%d") - ) + datetime_frame_columns = datetime_frame_columns.map( + lambda x: int(x.strftime("%Y%m%d")) + ) + # Columns don't get converted to ints by read_csv + datetime_frame_columns.columns = datetime_frame_columns.columns.map( + lambda x: x.strftime("%Y%m%d") + ) 
- tm.assert_frame_equal(test, datetime_frame_columns) + tm.assert_frame_equal(test, datetime_frame_columns) - # test NaTs - nat_index = to_datetime( - ["NaT"] * 10 + ["2000-01-01", "2000-01-01", "2000-01-01"] - ) - nat_frame = DataFrame({"A": nat_index}, index=nat_index) - nat_frame.to_csv(path, date_format="%Y-%m-%d") + # test NaTs + nat_index = to_datetime( + ["NaT"] * 10 + ["2000-01-01", "2000-01-01", "2000-01-01"] + ) + nat_frame = DataFrame({"A": nat_index}, index=nat_index) + nat_frame.to_csv(path, date_format="%Y-%m-%d") - test = read_csv(path, parse_dates=[0, 1], index_col=0) + test = read_csv(path, parse_dates=[0, 1], index_col=0) - tm.assert_frame_equal(test, nat_frame) + tm.assert_frame_equal(test, nat_frame) @pytest.mark.parametrize("td", [pd.Timedelta(0), pd.Timedelta("10s")]) - def test_to_csv_with_dst_transitions(self, td): - with tm.ensure_clean("csv_date_format_with_dst") as path: - # make sure we are not failing on transitions - times = date_range( - "2013-10-26 23:00", - "2013-10-27 01:00", - tz="Europe/London", - freq="h", - ambiguous="infer", - ) - i = times + td - i = i._with_freq(None) # freq is not preserved by read_csv - time_range = np.array(range(len(i)), dtype="int64") - df = DataFrame({"A": time_range}, index=i) - df.to_csv(path, index=True) - # we have to reconvert the index as we - # don't parse the tz's - result = read_csv(path, index_col=0) - result.index = to_datetime(result.index, utc=True).tz_convert( - "Europe/London" - ) - tm.assert_frame_equal(result, df) + def test_to_csv_with_dst_transitions(self, td, temp_file): + path = str(temp_file) + # make sure we are not failing on transitions + times = date_range( + "2013-10-26 23:00", + "2013-10-27 01:00", + tz="Europe/London", + freq="h", + ambiguous="infer", + ) + i = times + td + i = i._with_freq(None) # freq is not preserved by read_csv + time_range = np.array(range(len(i)), dtype="int64") + df = DataFrame({"A": time_range}, index=i) + df.to_csv(path, index=True) + # we have to reconvert the index as we + # don't parse the tz's + result = read_csv(path, index_col=0) + result.index = ( + to_datetime(result.index, utc=True) + .tz_convert("Europe/London") + .as_unit("ns") + ) + tm.assert_frame_equal(result, df) - def test_to_csv_with_dst_transitions_with_pickle(self): + @pytest.mark.parametrize( + "start,end", + [ + ["2015-03-29", "2015-03-30"], + ["2015-10-25", "2015-10-26"], + ], + ) + def test_to_csv_with_dst_transitions_with_pickle(self, start, end, temp_file): # GH11619 - idx = date_range("2015-01-01", "2015-12-31", freq="h", tz="Europe/Paris") + idx = date_range(start, end, freq="h", tz="Europe/Paris") idx = idx._with_freq(None) # freq does not round-trip idx._data._freq = None # otherwise there is trouble on unpickle df = DataFrame({"values": 1, "idx": idx}, index=idx) with tm.ensure_clean("csv_date_format_with_dst") as path: df.to_csv(path, index=True) result = read_csv(path, index_col=0) - result.index = to_datetime(result.index, utc=True).tz_convert( - "Europe/Paris" + result.index = ( + to_datetime(result.index, utc=True) + .tz_convert("Europe/Paris") + .as_unit("ns") ) result["idx"] = to_datetime(result["idx"], utc=True).astype( "datetime64[ns, Europe/Paris]" @@ -1188,10 +1213,10 @@ def test_to_csv_with_dst_transitions_with_pickle(self): # assert working df.astype(str) - with tm.ensure_clean("csv_date_format_with_dst") as path: - df.to_pickle(path) - result = pd.read_pickle(path) - tm.assert_frame_equal(result, df) + path = str(temp_file) + df.to_pickle(path) + result = pd.read_pickle(path) + 
tm.assert_frame_equal(result, df) def test_to_csv_quoting(self): df = DataFrame( @@ -1344,15 +1369,17 @@ def test_to_csv_single_level_multi_index(self): result = df.to_csv(lineterminator="\n") tm.assert_almost_equal(result, expected) - def test_gz_lineend(self): + def test_gz_lineend(self, tmp_path): # GH 25311 df = DataFrame({"a": [1, 2]}) expected_rows = ["a", "1", "2"] expected = tm.convert_rows_list_to_csv_str(expected_rows) - with tm.ensure_clean("__test_gz_lineend.csv.gz") as path: - df.to_csv(path, index=False) - with tm.decompress_file(path, compression="gzip") as f: - result = f.read().decode("utf-8") + file_path = tmp_path / "__test_gz_lineend.csv.gz" + file_path.touch() + path = str(file_path) + df.to_csv(path, index=False) + with tm.decompress_file(path, compression="gzip") as f: + result = f.read().decode("utf-8") assert result == expected @@ -1404,3 +1431,22 @@ def test_to_csv_categorical_and_interval(self): expected_rows = [",a", '0,"[2020-01-01 00:00:00, 2020-01-02 00:00:00]"'] expected = tm.convert_rows_list_to_csv_str(expected_rows) assert result == expected + + def test_to_csv_warn_when_zip_tar_and_append_mode(self, tmp_path): + # GH57875 + df = DataFrame({"a": [1, 2, 3]}) + msg = ( + "zip and tar do not support mode 'a' properly. This combination will " + "result in multiple files with same name being added to the archive" + ) + zip_path = tmp_path / "test.zip" + tar_path = tmp_path / "test.tar" + with tm.assert_produces_warning( + RuntimeWarning, match=msg, raise_on_extra_warnings=False + ): + df.to_csv(zip_path, mode="a") + + with tm.assert_produces_warning( + RuntimeWarning, match=msg, raise_on_extra_warnings=False + ): + df.to_csv(tar_path, mode="a") diff --git a/pandas/tests/frame/methods/test_to_dict.py b/pandas/tests/frame/methods/test_to_dict.py index 61f0ad30b4519..c43d947b4877e 100644 --- a/pandas/tests/frame/methods/test_to_dict.py +++ b/pandas/tests/frame/methods/test_to_dict.py @@ -2,18 +2,23 @@ OrderedDict, defaultdict, ) -from datetime import datetime +from datetime import ( + datetime, + timezone, +) import numpy as np import pytest -import pytz from pandas import ( NA, DataFrame, Index, + Interval, MultiIndex, + Period, Series, + Timedelta, Timestamp, ) import pandas._testing as tm @@ -163,7 +168,7 @@ def test_to_dict_not_unique_warning(self): # GH#16927: When converting to a dict, if a column has a non-unique name # it will be dropped, throwing a warning. 
         df = DataFrame([[1, 2, 3]], columns=["a", "a", "b"])
-        with tm.assert_produces_warning(UserWarning):
+        with tm.assert_produces_warning(UserWarning, match="columns will be omitted"):
             df.to_dict()
 
     @pytest.mark.filterwarnings("ignore::UserWarning")
@@ -206,15 +211,15 @@ def test_to_dict_tz(self):
         # GH#18372 When converting to dict with orient='records' columns of
         # datetime that are tz-aware were not converted to required arrays
         data = [
-            (datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=pytz.utc),),
-            (datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=pytz.utc),),
+            (datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=timezone.utc),),
+            (datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=timezone.utc),),
         ]
         df = DataFrame(list(data), columns=["d"])
 
         result = df.to_dict(orient="records")
         expected = [
-            {"d": Timestamp("2017-11-18 21:53:00.219225+0000", tz=pytz.utc)},
-            {"d": Timestamp("2017-11-18 22:06:30.061810+0000", tz=pytz.utc)},
+            {"d": Timestamp("2017-11-18 21:53:00.219225+0000", tz=timezone.utc)},
+            {"d": Timestamp("2017-11-18 22:06:30.061810+0000", tz=timezone.utc)},
         ]
         tm.assert_dict_equal(result[0], expected[0])
         tm.assert_dict_equal(result[1], expected[1])
@@ -510,12 +515,27 @@ def test_to_dict_masked_native_python(self):
         result = df.to_dict(orient="records")
         assert isinstance(result[0]["a"], int)
 
-    def test_to_dict_pos_args_deprecation(self):
-        # GH-54229
-        df = DataFrame({"a": [1, 2, 3]})
-        msg = (
-            r"Starting with pandas version 3.0 all arguments of to_dict except for the "
-            r"argument 'orient' will be keyword-only."
-        )
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            df.to_dict("records", {})
+    def test_to_dict_tight_no_warning_with_duplicate_column(self):
+        # GH#58281
+        df = DataFrame([[1, 2], [3, 4], [5, 6]], columns=["A", "A"])
+        with tm.assert_produces_warning(None):
+            result = df.to_dict(orient="tight")
+        expected = {
+            "index": [0, 1, 2],
+            "columns": ["A", "A"],
+            "data": [[1, 2], [3, 4], [5, 6]],
+            "index_names": [None],
+            "column_names": [None],
+        }
+        assert result == expected
+
+
+@pytest.mark.parametrize(
+    "val", [Timestamp(2020, 1, 1), Timedelta(1), Period("2020"), Interval(1, 2)]
+)
+def test_to_dict_list_pd_scalars(val):
+    # GH 54824
+    df = DataFrame({"a": [val]})
+    result = df.to_dict(orient="list")
+    expected = {"a": [val]}
+    assert result == expected
diff --git a/pandas/tests/frame/methods/test_to_dict_of_blocks.py b/pandas/tests/frame/methods/test_to_dict_of_blocks.py
index f64cfd5fe6a2d..4f621b4643b70 100644
--- a/pandas/tests/frame/methods/test_to_dict_of_blocks.py
+++ b/pandas/tests/frame/methods/test_to_dict_of_blocks.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pytest
 
-import pandas.util._test_decorators as td
+from pandas._config import using_string_dtype
 
 from pandas import (
     DataFrame,
@@ -10,12 +10,9 @@
 import pandas._testing as tm
 from pandas.core.arrays import NumpyExtensionArray
 
-pytestmark = td.skip_array_manager_invalid_test
-
 
 class TestToDictOfBlocks:
-    @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning")
-    def test_no_copy_blocks(self, float_frame, using_copy_on_write):
+    def test_no_copy_blocks(self, float_frame):
         # GH#9607
         df = DataFrame(float_frame, copy=True)
         column = df.columns[0]
@@ -27,15 +24,11 @@ def test_no_copy_blocks(self, float_frame, using_copy_on_write):
             _last_df = _df
             if column in _df:
                 _df.loc[:, column] = _df[column] + 1
-
-        if not using_copy_on_write:
-            # make sure we did change the original DataFrame
-            assert _last_df is not None and _last_df[column].equals(df[column])
-        else:
-            assert _last_df is not None and not _last_df[column].equals(df[column])
+        assert _last_df is not None and not _last_df[column].equals(df[column])
 
 
-def test_to_dict_of_blocks_item_cache(using_copy_on_write, warn_copy_on_write):
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+def test_to_dict_of_blocks_item_cache():
     # Calling to_dict_of_blocks should not poison item_cache
     df = DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]})
     df["c"] = NumpyExtensionArray(np.array([1, 2, None, 3], dtype=object))
@@ -46,20 +39,8 @@ def test_to_dict_of_blocks_item_cache(using_copy_on_write, warn_copy_on_write):
 
     df._to_dict_of_blocks()
 
-    if using_copy_on_write:
-        with pytest.raises(ValueError, match="read-only"):
-            ser.values[0] = "foo"
-    elif warn_copy_on_write:
+    with pytest.raises(ValueError, match="read-only"):
         ser.values[0] = "foo"
-        assert df.loc[0, "b"] == "foo"
-        # with warning mode, the item cache is disabled
-        assert df["b"] is not ser
-    else:
-        # Check that the to_dict_of_blocks didn't break link between ser and df
-        ser.values[0] = "foo"
-        assert df.loc[0, "b"] == "foo"
-
-        assert df["b"] is ser
 
 
 def test_set_change_dtype_slice():
diff --git a/pandas/tests/frame/methods/test_to_numpy.py b/pandas/tests/frame/methods/test_to_numpy.py
index bdb9b2c055061..36088cceb13f1 100644
--- a/pandas/tests/frame/methods/test_to_numpy.py
+++ b/pandas/tests/frame/methods/test_to_numpy.py
@@ -1,6 +1,5 @@
 import numpy as np
-
-import pandas.util._test_decorators as td
+import pytest
 
 from pandas import (
     DataFrame,
@@ -22,25 +21,20 @@ def test_to_numpy_dtype(self):
         result = df.to_numpy(dtype="int64")
         tm.assert_numpy_array_equal(result, expected)
 
-    @td.skip_array_manager_invalid_test
-    def test_to_numpy_copy(self, using_copy_on_write):
+    def test_to_numpy_copy(self):
         arr = np.random.default_rng(2).standard_normal((4, 3))
         df = DataFrame(arr)
-        if using_copy_on_write:
-            assert df.values.base is not arr
-            assert df.to_numpy(copy=False).base is df.values.base
-        else:
-            assert df.values.base is arr
-            assert df.to_numpy(copy=False).base is arr
+        assert df.values.base is not arr
+        assert df.to_numpy(copy=False).base is df.values.base
         assert df.to_numpy(copy=True).base is not arr
 
         # we still don't want a copy when na_value=np.nan is passed,
         # and that can be respected because we are already numpy-float
-        if using_copy_on_write:
-            assert df.to_numpy(copy=False).base is df.values.base
-        else:
-            assert df.to_numpy(copy=False, na_value=np.nan).base is arr
+        assert df.to_numpy(copy=False).base is df.values.base
 
+    @pytest.mark.filterwarnings(
+        "ignore:invalid value encountered in cast:RuntimeWarning"
+    )
     def test_to_numpy_mixed_dtype_to_str(self):
         # https://github.com/pandas-dev/pandas/issues/35455
         df = DataFrame([[Timestamp("2020-01-01 00:00:00"), 100.0]])
diff --git a/pandas/tests/frame/methods/test_transpose.py b/pandas/tests/frame/methods/test_transpose.py
index d0caa071fae1c..1b7b30ac40363 100644
--- a/pandas/tests/frame/methods/test_transpose.py
+++ b/pandas/tests/frame/methods/test_transpose.py
@@ -1,8 +1,7 @@
 import numpy as np
 import pytest
 
-import pandas.util._test_decorators as td
-
+import pandas as pd
 from pandas import (
     DataFrame,
     DatetimeIndex,
@@ -26,6 +25,7 @@ def test_transpose_td64_intervals(self):
         df = DataFrame(ii)
 
         result = df.T
+        result.columns = Index(list(range(len(ii))))
         expected = DataFrame({i: ii[i : i + 1] for i in range(len(ii))})
         tm.assert_frame_equal(result, expected)
 
@@ -126,18 +126,12 @@ def test_transpose_mixed(self):
         for col, s in mixed_T.items():
             assert s.dtype == np.object_
 
-    @td.skip_array_manager_invalid_test
-    def test_transpose_get_view(self, float_frame, using_copy_on_write):
+    def test_transpose_get_view(self, float_frame):
         dft = float_frame.T
         dft.iloc[:, 5:10] = 5
+        assert (float_frame.values[5:10] != 5).all()
 
-        if using_copy_on_write:
-            assert (float_frame.values[5:10] != 5).all()
-        else:
-            assert (float_frame.values[5:10] == 5).all()
-
-    @td.skip_array_manager_invalid_test
-    def test_transpose_get_view_dt64tzget_view(self, using_copy_on_write):
+    def test_transpose_get_view_dt64tzget_view(self):
         dti = date_range("2016-01-01", periods=6, tz="US/Pacific")
         arr = dti._data.reshape(3, 2)
         df = DataFrame(arr)
@@ -147,10 +141,7 @@ def test_transpose_get_view_dt64tzget_view(self):
         assert result._mgr.nblocks == 1
 
         rtrip = result._mgr.blocks[0].values
-        if using_copy_on_write:
-            assert np.shares_memory(df._mgr.blocks[0].values._ndarray, rtrip._ndarray)
-        else:
-            assert np.shares_memory(arr._ndarray, rtrip._ndarray)
+        assert np.shares_memory(df._mgr.blocks[0].values._ndarray, rtrip._ndarray)
 
     def test_transpose_not_inferring_dt(self):
         # GH#51546
@@ -163,7 +154,6 @@ def test_transpose_not_inferring_dt(self):
         result = df.T
         expected = DataFrame(
             [[Timestamp("2019-12-31"), Timestamp("2019-12-31")]],
-            columns=[0, 1],
             index=["a"],
             dtype=object,
         )
@@ -185,8 +175,23 @@ def test_transpose_not_inferring_dt_mixed_blocks(self):
                 [Timestamp("2019-12-31"), Timestamp("2019-12-31")],
                 [Timestamp("2019-12-31"), Timestamp("2019-12-31")],
             ],
-            columns=[0, 1],
             index=["a", "b"],
             dtype=object,
         )
         tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("dtype1", ["Int64", "Float64"])
+    @pytest.mark.parametrize("dtype2", ["Int64", "Float64"])
+    def test_transpose(self, dtype1, dtype2):
+        # GH#57315 - transpose should have F contiguous blocks
+        df = DataFrame(
+            {
+                "a": pd.array([1, 1, 2], dtype=dtype1),
+                "b": pd.array([3, 4, 5], dtype=dtype2),
+            }
+        )
+        result = df.T
+        for blk in result._mgr.blocks:
+            # When dtypes are unequal, we get NumPy object array
+            data = blk.values._data if dtype1 == dtype2 else blk.values
+            assert data.flags["F_CONTIGUOUS"]
diff --git a/pandas/tests/frame/methods/test_truncate.py b/pandas/tests/frame/methods/test_truncate.py
index 12077952c2e03..f28f811148c5d 100644
--- a/pandas/tests/frame/methods/test_truncate.py
+++ b/pandas/tests/frame/methods/test_truncate.py
@@ -60,7 +60,7 @@ def test_truncate(self, datetime_frame, frame_or_series):
         truncated = ts.truncate(before=ts.index[-1] + ts.index.freq)
         assert len(truncated) == 0
 
-        msg = "Truncate: 2000-01-06 00:00:00 must be after 2000-05-16 00:00:00"
+        msg = "Truncate: 2000-01-06 00:00:00 must be after 2000-01-11 00:00:00"
         with pytest.raises(ValueError, match=msg):
             ts.truncate(
                 before=ts.index[-1] - ts.index.freq, after=ts.index[0] + ts.index.freq
diff --git a/pandas/tests/frame/methods/test_tz_convert.py b/pandas/tests/frame/methods/test_tz_convert.py
index bcb8e423980fd..5ee4021102f22 100644
--- a/pandas/tests/frame/methods/test_tz_convert.py
+++ b/pandas/tests/frame/methods/test_tz_convert.py
@@ -1,3 +1,5 @@
+import zoneinfo
+
 import numpy as np
 import pytest
 
@@ -13,28 +15,34 @@
 
 class TestTZConvert:
     def test_tz_convert(self, frame_or_series):
-        rng = date_range("1/1/2011", periods=200, freq="D", tz="US/Eastern")
+        rng = date_range(
+            "1/1/2011", periods=200, freq="D", tz=zoneinfo.ZoneInfo("US/Eastern")
+        )
 
         obj = DataFrame({"a": 1}, index=rng)
         obj = tm.get_obj(obj, frame_or_series)
 
-        result = obj.tz_convert("Europe/Berlin")
-        expected = DataFrame({"a": 1}, rng.tz_convert("Europe/Berlin"))
+        berlin = zoneinfo.ZoneInfo("Europe/Berlin")
+        result = obj.tz_convert(berlin)
+        expected = DataFrame({"a": 1}, rng.tz_convert(berlin))
         expected = tm.get_obj(expected, frame_or_series)
 
-        assert result.index.tz.zone == "Europe/Berlin"
+        assert result.index.tz.key == "Europe/Berlin"
         tm.assert_equal(result, expected)
 
     def test_tz_convert_axis1(self):
-        rng = date_range("1/1/2011", periods=200, freq="D", tz="US/Eastern")
+        rng = date_range(
+            "1/1/2011", periods=200, freq="D", tz=zoneinfo.ZoneInfo("US/Eastern")
+        )
 
         obj = DataFrame({"a": 1}, index=rng)
 
         obj = obj.T
-        result = obj.tz_convert("Europe/Berlin", axis=1)
-        assert result.columns.tz.zone == "Europe/Berlin"
+        berlin = zoneinfo.ZoneInfo("Europe/Berlin")
+        result = obj.tz_convert(berlin, axis=1)
+        assert result.columns.tz.key == "Europe/Berlin"
 
-        expected = DataFrame({"a": 1}, rng.tz_convert("Europe/Berlin"))
+        expected = DataFrame({"a": 1}, rng.tz_convert(berlin))
 
         tm.assert_equal(result, expected.T)
 
@@ -98,32 +106,33 @@ def test_tz_convert_and_localize(self, fn):
             tm.assert_index_equal(df3.index.levels[1], l1_expected)
             assert not df3.index.levels[1].equals(l1)
 
-        # Bad Inputs
-
+    @pytest.mark.parametrize("fn", ["tz_localize", "tz_convert"])
+    def test_tz_convert_and_localize_bad_input(self, fn):
+        int_idx = Index(range(5))
+        l0 = date_range("20140701", periods=5, freq="D")
         # Not DatetimeIndex / PeriodIndex
+        df = DataFrame(index=int_idx)
         with pytest.raises(TypeError, match="DatetimeIndex"):
-            df = DataFrame(index=int_idx)
             getattr(df, fn)("US/Pacific")
 
         # Not DatetimeIndex / PeriodIndex
+        df = DataFrame(np.ones(5), MultiIndex.from_arrays([int_idx, l0]))
        with pytest.raises(TypeError, match="DatetimeIndex"):
-            df = DataFrame(np.ones(5), MultiIndex.from_arrays([int_idx, l0]))
             getattr(df, fn)("US/Pacific", level=0)
 
         # Invalid level
+        df = DataFrame(index=l0)
         with pytest.raises(ValueError, match="not valid"):
-            df = DataFrame(index=l0)
             getattr(df, fn)("US/Pacific", level=1)
 
-    @pytest.mark.parametrize("copy", [True, False])
-    def test_tz_convert_copy_inplace_mutate(self, copy, frame_or_series):
+    def test_tz_convert_copy_inplace_mutate(self, frame_or_series):
         # GH#6326
         obj = frame_or_series(
             np.arange(0, 5),
             index=date_range("20131027", periods=5, freq="h", tz="Europe/Berlin"),
         )
         orig = obj.copy()
-        result = obj.tz_convert("UTC", copy=copy)
+        result = obj.tz_convert("UTC")
         expected = frame_or_series(np.arange(0, 5), index=obj.index.tz_convert("UTC"))
         tm.assert_equal(result, expected)
         tm.assert_equal(obj, orig)
diff --git a/pandas/tests/frame/methods/test_tz_localize.py b/pandas/tests/frame/methods/test_tz_localize.py
index b167afc17f484..c0f5a90a9d2ee 100644
--- a/pandas/tests/frame/methods/test_tz_localize.py
+++ b/pandas/tests/frame/methods/test_tz_localize.py
@@ -50,14 +50,13 @@ def test_tz_localize_naive(self, frame_or_series):
         with pytest.raises(TypeError, match="Already tz-aware"):
             ts.tz_localize("US/Eastern")
 
-    @pytest.mark.parametrize("copy", [True, False])
-    def test_tz_localize_copy_inplace_mutate(self, copy, frame_or_series):
+    def test_tz_localize_copy_inplace_mutate(self, frame_or_series):
         # GH#6326
         obj = frame_or_series(
             np.arange(0, 5), index=date_range("20131027", periods=5, freq="1h", tz=None)
         )
         orig = obj.copy()
-        result = obj.tz_localize("UTC", copy=copy)
+        result = obj.tz_localize("UTC")
         expected = frame_or_series(
             np.arange(0, 5),
             index=date_range("20131027", periods=5, freq="1h", tz="UTC"),
diff --git a/pandas/tests/frame/methods/test_update.py b/pandas/tests/frame/methods/test_update.py
index 7c7a0d23ff75f..ea63b2264d4f6 100644
--- a/pandas/tests/frame/methods/test_update.py
+++ b/pandas/tests/frame/methods/test_update.py
@@ -1,8 +1,6 @@
 import numpy as np
 import pytest
 
-import pandas.util._test_decorators as td
-
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -140,59 +138,33 @@ def test_update_datetime_tz(self):
         expected = DataFrame([pd.Timestamp("2019", tz="UTC")])
         tm.assert_frame_equal(result, expected)
 
-    def test_update_datetime_tz_in_place(self, using_copy_on_write, warn_copy_on_write):
+    def test_update_datetime_tz_in_place(self):
         # https://github.com/pandas-dev/pandas/issues/56227
         result = DataFrame([pd.Timestamp("2019", tz="UTC")])
         orig = result.copy()
         view = result[:]
-        with tm.assert_produces_warning(
-            FutureWarning if warn_copy_on_write else None, match="Setting a value"
-        ):
-            result.update(result + pd.Timedelta(days=1))
+        result.update(result + pd.Timedelta(days=1))
         expected = DataFrame([pd.Timestamp("2019-01-02", tz="UTC")])
         tm.assert_frame_equal(result, expected)
-        if not using_copy_on_write:
-            tm.assert_frame_equal(view, expected)
-        else:
-            tm.assert_frame_equal(view, orig)
+        tm.assert_frame_equal(view, orig)
 
-    def test_update_with_different_dtype(self, using_copy_on_write):
+    def test_update_with_different_dtype(self):
         # GH#3217
         df = DataFrame({"a": [1, 3], "b": [np.nan, 2]})
         df["c"] = np.nan
-        if using_copy_on_write:
+        with pytest.raises(TypeError, match="Invalid value"):
             df.update({"c": Series(["foo"], index=[0])})
-        else:
-            with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"):
-                df["c"].update(Series(["foo"], index=[0]))
-
-        expected = DataFrame(
-            {
-                "a": [1, 3],
-                "b": [np.nan, 2],
-                "c": Series(["foo", np.nan], dtype="object"),
-            }
-        )
-        tm.assert_frame_equal(df, expected)
 
-    @td.skip_array_manager_invalid_test
-    def test_update_modify_view(
-        self, using_copy_on_write, warn_copy_on_write, using_infer_string
-    ):
+    def test_update_modify_view(self, using_infer_string):
         # GH#47188
         df = DataFrame({"A": ["1", np.nan], "B": ["100", np.nan]})
         df2 = DataFrame({"A": ["a", "x"], "B": ["100", "200"]})
         df2_orig = df2.copy()
         result_view = df2[:]
-        # TODO(CoW-warn) better warning message
-        with tm.assert_cow_warning(warn_copy_on_write):
-            df2.update(df)
+        df2.update(df)
         expected = DataFrame({"A": ["1", "x"], "B": ["100", "200"]})
         tm.assert_frame_equal(df2, expected)
-        if using_copy_on_write or using_infer_string:
-            tm.assert_frame_equal(result_view, df2_orig)
-        else:
-            tm.assert_frame_equal(result_view, expected)
+        tm.assert_frame_equal(result_view, df2_orig)
 
     def test_update_dt_column_with_NaT_create_column(self):
         # GH#16713
@@ -203,3 +175,55 @@ def test_update_dt_column_with_NaT_create_column(self):
             {"A": [1.0, 3.0], "B": [pd.NaT, pd.to_datetime("2016-01-01")]}
         )
         tm.assert_frame_equal(df, expected)
+
+    @pytest.mark.parametrize(
+        "value_df, value_other, dtype",
+        [
+            (True, False, bool),
+            (1, 2, int),
+            (1.0, 2.0, float),
+            (1.0 + 1j, 2.0 + 2j, complex),
+            (np.uint64(1), np.uint(2), np.dtype("ubyte")),
+            (np.uint64(1), np.uint(2), np.dtype("intc")),
+            ("a", "b", pd.StringDtype()),
+            (
+                pd.to_timedelta("1 ms"),
+                pd.to_timedelta("2 ms"),
+                np.dtype("timedelta64[ns]"),
+            ),
+            (
+                np.datetime64("2000-01-01T00:00:00"),
+                np.datetime64("2000-01-02T00:00:00"),
+                np.dtype("datetime64[ns]"),
+            ),
+        ],
+    )
+    def test_update_preserve_dtype(self, value_df, value_other, dtype):
+        # GH#55509
+        df = DataFrame({"a": [value_df] * 2}, index=[1, 2], dtype=dtype)
+        other = DataFrame({"a": [value_other]}, index=[1], dtype=dtype)
+        expected = DataFrame({"a": [value_other, value_df]}, index=[1, 2], dtype=dtype)
+        df.update(other)
+        tm.assert_frame_equal(df, expected)
+
+    def test_update_raises_on_duplicate_argument_index(self):
+        # GH#55509
+        df = DataFrame({"a": [1, 1]}, index=[1, 2])
+        other = DataFrame({"a": [2, 3]}, index=[1, 1])
+        with pytest.raises(ValueError, match="duplicate index"):
+            df.update(other)
+
+    def test_update_raises_without_intersection(self):
+        # GH#55509
+        df = DataFrame({"a": [1]}, index=[1])
+        other = DataFrame({"a": [2]}, index=[2])
+        with pytest.raises(ValueError, match="no intersection"):
+            df.update(other)
+
+    def test_update_on_duplicate_frame_unique_argument_index(self):
+        # GH#55509
+        df = DataFrame({"a": [1, 1, 1]}, index=[1, 1, 2], dtype=np.dtype("intc"))
+        other = DataFrame({"a": [2, 3]}, index=[1, 2], dtype=np.dtype("intc"))
+        expected = DataFrame({"a": [2, 2, 3]}, index=[1, 1, 2], dtype=np.dtype("intc"))
+        df.update(other)
+        tm.assert_frame_equal(df, expected)
diff --git a/pandas/tests/frame/methods/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py
index 4136d641ef67f..43db234267f21 100644
--- a/pandas/tests/frame/methods/test_value_counts.py
+++ b/pandas/tests/frame/methods/test_value_counts.py
@@ -124,7 +124,7 @@ def test_data_frame_value_counts_dropna_true(nulls_fixture):
     expected = pd.Series(
         data=[1, 1],
         index=pd.MultiIndex.from_arrays(
-            [("Beth", "John"), ("Louise", "Smith")], names=["first_name", "middle_name"]
+            [("John", "Beth"), ("Smith", "Louise")], names=["first_name", "middle_name"]
         ),
         name="count",
     )
@@ -149,7 +149,7 @@ def test_data_frame_value_counts_dropna_false(nulls_fixture):
             pd.Index(["Anne", "Beth", "John"]),
             pd.Index(["Louise", "Smith", np.nan]),
         ],
-        codes=[[0, 1, 2, 2], [2, 0, 1, 2]],
+        codes=[[2, 0, 2, 1], [1, 2, 2, 0]],
         names=["first_name", "middle_name"],
     ),
     name="count",
diff --git a/pandas/tests/frame/methods/test_values.py b/pandas/tests/frame/methods/test_values.py
index bbca4ee1b88b1..2de2053bb705f 100644
--- a/pandas/tests/frame/methods/test_values.py
+++ b/pandas/tests/frame/methods/test_values.py
@@ -1,8 +1,6 @@
 import numpy as np
 import pytest
 
-import pandas.util._test_decorators as td
-
 from pandas import (
     DataFrame,
     NaT,
@@ -15,15 +13,10 @@
 
 
 class TestDataFrameValues:
-    @td.skip_array_manager_invalid_test
-    def test_values(self, float_frame, using_copy_on_write):
-        if using_copy_on_write:
-            with pytest.raises(ValueError, match="read-only"):
-                float_frame.values[:, 0] = 5.0
-            assert (float_frame.values[:, 0] != 5).all()
-        else:
+    def test_values(self, float_frame):
+        with pytest.raises(ValueError, match="read-only"):
             float_frame.values[:, 0] = 5.0
-            assert (float_frame.values[:, 0] == 5).all()
+        assert (float_frame.values[:, 0] != 5).all()
 
     def test_more_values(self, float_string_frame):
         values = float_string_frame.values
@@ -231,36 +224,26 @@ def test_values_lcd(self, mixed_float_frame, mixed_int_frame):
 
 
 class TestPrivateValues:
-    @td.skip_array_manager_invalid_test
-    def test_private_values_dt64tz(self, using_copy_on_write):
+    def test_private_values_dt64tz(self):
         dta = date_range("2000", periods=4, tz="US/Central")._data.reshape(-1, 1)
 
         df = DataFrame(dta, columns=["A"])
         tm.assert_equal(df._values, dta)
 
-        if using_copy_on_write:
-            assert not np.shares_memory(df._values._ndarray, dta._ndarray)
-        else:
-            # we have a view
-            assert np.shares_memory(df._values._ndarray, dta._ndarray)
+        assert not np.shares_memory(df._values._ndarray, dta._ndarray)
 
         # TimedeltaArray
         tda = dta - dta
         df2 = df - df
         tm.assert_equal(df2._values, tda)
 
-    @td.skip_array_manager_invalid_test
-    def test_private_values_dt64tz_multicol(self, using_copy_on_write):
+    def test_private_values_dt64tz_multicol(self):
         dta = date_range("2000", periods=8, tz="US/Central")._data.reshape(-1, 2)
 
         df = DataFrame(dta, columns=["A", "B"])
         tm.assert_equal(df._values, dta)
 
-        if using_copy_on_write:
-            assert not np.shares_memory(df._values._ndarray, dta._ndarray)
-        else:
-            # we have a view
-            assert np.shares_memory(df._values._ndarray, dta._ndarray)
+        assert not np.shares_memory(df._values._ndarray, dta._ndarray)
 
         # TimedeltaArray
         tda = dta - dta
@@ -273,7 +256,7 @@ def test_private_values_dt64_multiblock(self):
         df = DataFrame({"A": dta[:4]}, copy=False)
         df["B"] = dta[4:]
 
-        assert len(df._mgr.arrays) == 2
+        assert len(df._mgr.blocks) == 2
 
         result = df._values
         expected = dta.reshape(2, 4).T
diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py
index c68171ab254c7..b4c16b94fcf8b 100644
--- a/pandas/tests/frame/test_alter_axes.py
+++ b/pandas/tests/frame/test_alter_axes.py
@@ -1,6 +1,7 @@
-from datetime import datetime
-
-import pytz
+from datetime import (
+    datetime,
+    timezone,
+)
 
 from pandas import DataFrame
 import pandas._testing as tm
@@ -13,7 +14,7 @@ def test_set_axis_setattr_index(self):
         # GH 6785
         # set the index manually
 
-        df = DataFrame([{"ts": datetime(2014, 4, 1, tzinfo=pytz.utc), "foo": 1}])
+        df = DataFrame([{"ts": datetime(2014, 4, 1, tzinfo=timezone.utc), "foo": 1}])
         expected = df.set_index("ts")
         df.index = df["ts"]
         df.pop("ts")
diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py
index c7b444045a0f2..2b0bf1b0576f9 100644
--- a/pandas/tests/frame/test_api.py
+++ b/pandas/tests/frame/test_api.py
@@ -5,9 +5,11 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_pyarrow_string_dtype
+from pandas._config import using_string_dtype
 from pandas._config.config import option_context
 
+from pandas.compat import HAS_PYARROW
+
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -85,7 +87,7 @@ def test_tab_completion(self):
             assert isinstance(df.__getitem__("A"), DataFrame)
 
     def test_display_max_dir_items(self):
-        # display.max_dir_items increaes the number of columns that are in __dir__.
+        # display.max_dir_items increases the number of columns that are in __dir__.
         columns = ["a" + str(i) for i in range(420)]
         values = [range(420), range(420)]
         df = DataFrame(values, columns=columns)
@@ -113,7 +115,9 @@ def test_not_hashable(self):
         with pytest.raises(TypeError, match=msg):
             hash(empty_frame)
 
-    @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="surrogates not allowed")
+    @pytest.mark.xfail(
+        using_string_dtype() and HAS_PYARROW, reason="surrogates not allowed"
+    )
     def test_column_name_contains_unicode_surrogate(self):
         # GH 25509
         colname = "\ud83d"
@@ -325,8 +329,6 @@ def test_set_flags(
         self,
         allows_duplicate_labels,
         frame_or_series,
-        using_copy_on_write,
-        warn_copy_on_write,
     ):
         obj = DataFrame({"A": [1, 2]})
         key = (0, 0)
@@ -354,20 +356,11 @@ def test_set_flags(
         else:
             assert np.may_share_memory(obj["A"].values, result["A"].values)
 
-        with tm.assert_cow_warning(warn_copy_on_write):
-            result.iloc[key] = 0
-        if using_copy_on_write:
-            assert obj.iloc[key] == 1
-        else:
-            assert obj.iloc[key] == 0
-            # set back to 1 for test below
-            with tm.assert_cow_warning(warn_copy_on_write):
-                result.iloc[key] = 1
+        result.iloc[key] = 0
+        assert obj.iloc[key] == 1
 
         # Now we do copy.
-        result = obj.set_flags(
-            copy=True, allows_duplicate_labels=allows_duplicate_labels
-        )
+        result = obj.set_flags(allows_duplicate_labels=allows_duplicate_labels)
         result.iloc[key] = 10
         assert obj.iloc[key] == 1
 
@@ -383,10 +376,5 @@ def test_constructor_expanddim(self):
 
     def test_inspect_getmembers(self):
         # GH38740
-        pytest.importorskip("jinja2")
         df = DataFrame()
-        msg = "DataFrame._data is deprecated"
-        with tm.assert_produces_warning(
-            DeprecationWarning, match=msg, check_stacklevel=False
-        ):
-            inspect.getmembers(df)
+        inspect.getmembers(df)
diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py
index 42ce658701355..bc69ec388bf0c 100644
--- a/pandas/tests/frame/test_arithmetic.py
+++ b/pandas/tests/frame/test_arithmetic.py
@@ -11,9 +11,7 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_pyarrow_string_dtype
-
-import pandas.util._test_decorators as td
+from pandas.compat import HAS_PYARROW
 
 import pandas as pd
 from pandas import (
@@ -59,7 +57,7 @@ def __init__(self, value, dtype) -> None:
         self.value = value
         self.dtype = np.dtype(dtype)
 
-    def __array__(self):
+    def __array__(self, dtype=None, copy=None):
         return np.array(self.value, dtype=self.dtype)
 
     def __str__(self) -> str:
@@ -253,9 +251,6 @@ def test_timestamp_compare(self, left, right):
                 with pytest.raises(TypeError, match=msg):
                     right_f(pd.Timestamp("nat"), df)
 
-    @pytest.mark.xfail(
-        using_pyarrow_string_dtype(), reason="can't compare string and int"
-    )
     def test_mixed_comparison(self):
         # GH#13128, GH#22163 != datetime64 vs non-dt64 should be False,
         # not raise TypeError
@@ -306,8 +301,7 @@ def test_df_string_comparison(self):
 
 class TestFrameFlexComparisons:
     # TODO: test_bool_flex_frame needs a better name
-    @pytest.mark.parametrize("op", ["eq", "ne", "gt", "lt", "ge", "le"])
-    def test_bool_flex_frame(self, op):
+    def test_bool_flex_frame(self, comparison_op):
         data = np.random.default_rng(2).standard_normal((5, 3))
         other_data = np.random.default_rng(2).standard_normal((5, 3))
         df = DataFrame(data)
@@ -317,8 +311,8 @@ def test_bool_flex_frame(self, op):
         # DataFrame
         assert df.eq(df).values.all()
         assert not df.ne(df).values.any()
-        f = getattr(df, op)
-        o = getattr(operator, op)
+        f = getattr(df, comparison_op.__name__)
+        o = comparison_op
         # No NAs
         tm.assert_frame_equal(f(other), o(df, other))
         # Unaligned
@@ -461,25 +455,23 @@ def test_flex_comparison_nat(self):
         result = df.ne(pd.NaT)
         assert result.iloc[0, 0].item() is True
 
-    @pytest.mark.parametrize("opname", ["eq", "ne", "gt", "lt", "ge", "le"])
-    def test_df_flex_cmp_constant_return_types(self, opname):
+    def test_df_flex_cmp_constant_return_types(self, comparison_op):
         # GH 15077, non-empty DataFrame
         df = DataFrame({"x": [1, 2, 3], "y": [1.0, 2.0, 3.0]})
         const = 2
 
-        result = getattr(df, opname)(const).dtypes.value_counts()
+        result = getattr(df, comparison_op.__name__)(const).dtypes.value_counts()
         tm.assert_series_equal(
             result, Series([2], index=[np.dtype(bool)], name="count")
         )
 
-    @pytest.mark.parametrize("opname", ["eq", "ne", "gt", "lt", "ge", "le"])
-    def test_df_flex_cmp_constant_return_types_empty(self, opname):
+    def test_df_flex_cmp_constant_return_types_empty(self, comparison_op):
         # GH 15077 empty DataFrame
         df = DataFrame({"x": [1, 2, 3], "y": [1.0, 2.0, 3.0]})
         const = 2
 
         empty = df.iloc[:0]
-        result = getattr(empty, opname)(const).dtypes.value_counts()
+        result = getattr(empty, comparison_op.__name__)(const).dtypes.value_counts()
         tm.assert_series_equal(
             result, Series([2], index=[np.dtype(bool)], name="count")
         )
@@ -666,11 +658,12 @@ def test_arith_flex_series(self, simple_frame):
         tm.assert_frame_equal(df.div(row), df / row)
         tm.assert_frame_equal(df.div(col, axis=0), (df.T / col).T)
 
-    @pytest.mark.parametrize("dtype", ["int64", "float64"])
-    def test_arith_flex_series_broadcasting(self, dtype):
+    def test_arith_flex_series_broadcasting(self, any_real_numpy_dtype):
         # broadcasting issue in GH 7325
-        df = DataFrame(np.arange(3 * 2).reshape((3, 2)), dtype=dtype)
+        df = DataFrame(np.arange(3 * 2).reshape((3, 2)), dtype=any_real_numpy_dtype)
         expected = DataFrame([[np.nan, np.inf], [1.0, 1.5], [1.0, 1.25]])
+        if any_real_numpy_dtype == "float32":
+            expected = expected.astype(any_real_numpy_dtype)
         result = df.div(df[0], axis="index")
         tm.assert_frame_equal(result, expected)
 
@@ -839,6 +832,43 @@ def test_frame_multiindex_operations_part_align(self):
 
         tm.assert_frame_equal(result, expected)
 
+    def test_frame_multiindex_operations_part_align_axis1(self):
+        # GH#61009 Test DataFrame-Series arithmetic operation
+        # with partly aligned MultiIndex and axis = 1
+        df = DataFrame(
+            [[1, 2, 3], [3, 4, 5]],
+            index=[2010, 2020],
+            columns=MultiIndex.from_tuples(
+                [
+                    ("a", "b", 0),
+                    ("a", "b", 1),
+                    ("a", "c", 2),
+                ],
+                names=["scen", "mod", "id"],
+            ),
+        )
+
+        series = Series(
+            [0.4],
+            index=MultiIndex.from_product([["b"], ["a"]], names=["mod", "scen"]),
+        )
+
+        expected = DataFrame(
+            [[1.4, 2.4, np.nan], [3.4, 4.4, np.nan]],
+            index=[2010, 2020],
+            columns=MultiIndex.from_tuples(
+                [
+                    ("a", "b", 0),
+                    ("a", "b", 1),
+                    ("a", "c", 2),
+                ],
+                names=["scen", "mod", "id"],
+            ),
+        )
+        result = df.add(series, axis=1)
+
+        tm.assert_frame_equal(result, expected)
+
 
 class TestFrameArithmetic:
     def test_td64_op_nat_casting(self):
@@ -894,15 +924,10 @@ def test_df_add_2d_array_collike_broadcasts(self):
         tm.assert_frame_equal(result, expected)
 
     def test_df_arith_2d_array_rowlike_broadcasts(
-        self, request, all_arithmetic_operators, using_array_manager
+        self, request, all_arithmetic_operators
    ):
         # GH#23000
         opname = all_arithmetic_operators
-
-        if using_array_manager and opname in ("__rmod__", "__rfloordiv__"):
-            # TODO(ArrayManager) decide on dtypes
-            td.mark_array_manager_not_yet_implemented(request)
-
         arr = np.arange(6).reshape(3, 2)
         df = DataFrame(arr, columns=[True, False], index=["A", "B", "C"])
 
@@ -921,15 +946,10 @@ def test_df_arith_2d_array_rowlike_broadcasts(
         tm.assert_frame_equal(result, expected)
 
     def test_df_arith_2d_array_collike_broadcasts(
-        self, request, all_arithmetic_operators, using_array_manager
+        self, request, all_arithmetic_operators
     ):
         # GH#23000
         opname = all_arithmetic_operators
-
-        if using_array_manager and opname in ("__rmod__", "__rfloordiv__"):
-            # TODO(ArrayManager) decide on dtypes
-            td.mark_array_manager_not_yet_implemented(request)
-
         arr = np.arange(6).reshape(3, 2)
         df = DataFrame(arr, columns=[True, False], index=["A", "B", "C"])
 
@@ -1111,7 +1131,7 @@ def test_binop_other(self, op, value, dtype, switch_numexpr_min_elements):
                 and expr.USE_NUMEXPR
                 and switch_numexpr_min_elements == 0
             ):
-                warn = UserWarning  # "evaluating in Python space because ..."
+            warn = UserWarning
         else:
             msg = (
                 f"cannot perform __{op.__name__}__ with this "
@@ -1119,17 +1139,16 @@
             )

             with pytest.raises(TypeError, match=msg):
-                with tm.assert_produces_warning(warn):
+                with tm.assert_produces_warning(warn, match="evaluating in Python"):
                     op(df, elem.value)

         elif (op, dtype) in skip:
             if op in [operator.add, operator.mul]:
                 if expr.USE_NUMEXPR and switch_numexpr_min_elements == 0:
-                    # "evaluating in Python space because ..."
                     warn = UserWarning
                 else:
                     warn = None

-                with tm.assert_produces_warning(warn):
+                with tm.assert_produces_warning(warn, match="evaluating in Python"):
                     op(df, elem.value)

             else:
@@ -1253,9 +1272,8 @@ def test_operators_none_as_na(self, op):

         # since filling converts dtypes from object, changed expected to be
         # object
-        msg = "Downcasting object dtype arrays"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            filled = df.fillna(np.nan)
+
+        filled = df.fillna(np.nan)
         result = op(df, 3)
         expected = op(filled, 3).astype(object)
         expected[pd.isna(expected)] = np.nan
@@ -1266,14 +1284,10 @@ def test_operators_none_as_na(self, op):
         expected[pd.isna(expected)] = np.nan
         tm.assert_frame_equal(result, expected)

-        msg = "Downcasting object dtype arrays"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            result = op(df, df.fillna(7))
+        result = op(df, df.fillna(7))
         tm.assert_frame_equal(result, expected)

-        msg = "Downcasting object dtype arrays"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            result = op(df.fillna(7), df)
+        result = op(df.fillna(7), df)
         tm.assert_frame_equal(result, expected)

     @pytest.mark.parametrize("op,res", [("__eq__", False), ("__ne__", True)])
@@ -1572,7 +1586,12 @@ def test_strings_to_numbers_comparisons_raises(self, compare_operators_no_eq_ne):
         )
         f = getattr(operator, compare_operators_no_eq_ne)

-        msg = "'[<>]=?' not supported between instances of 'str' and 'int'"
+        msg = "|".join(
+            [
+                "'[<>]=?' not supported between instances of 'str' and 'int'",
+                "Invalid comparison between dtype=str and int",
+            ]
+        )
         with pytest.raises(TypeError, match=msg):
             f(df, 0)
@@ -2020,23 +2039,16 @@ def test_arith_list_of_arraylike_raise(to_add):
         to_add + df


-def test_inplace_arithmetic_series_update(using_copy_on_write, warn_copy_on_write):
+def test_inplace_arithmetic_series_update():
     # https://github.com/pandas-dev/pandas/issues/36373
     df = DataFrame({"A": [1, 2, 3]})
     df_orig = df.copy()
     series = df["A"]
     vals = series._values

-    with tm.assert_cow_warning(warn_copy_on_write):
-        series += 1
-    if using_copy_on_write:
-        assert series._values is not vals
-        tm.assert_frame_equal(df, df_orig)
-    else:
-        assert series._values is vals
-
-        expected = DataFrame({"A": [2, 3, 4]})
-        tm.assert_frame_equal(df, expected)
+    series += 1
+    assert series._values is not vals
+    tm.assert_frame_equal(df, df_orig)


 def test_arithmetic_multiindex_align():
@@ -2058,6 +2070,51 @@ def test_arithmetic_multiindex_align():
     tm.assert_frame_equal(result, expected)


+def test_arithmetic_multiindex_column_align():
+    # GH#60498
+    df1 = DataFrame(
+        data=100,
+        columns=MultiIndex.from_product(
+            [["1A", "1B"], ["2A", "2B"]], names=["Lev1", "Lev2"]
+        ),
+        index=["C1", "C2"],
+    )
+    df2 = DataFrame(
+        data=np.array([[0.1, 0.25], [0.2, 0.45]]),
+        columns=MultiIndex.from_product([["1A", "1B"]], names=["Lev1"]),
+        index=["C1", "C2"],
+    )
+    expected = DataFrame(
+        data=np.array([[10.0, 10.0, 25.0, 25.0], [20.0, 20.0, 45.0, 45.0]]),
+        columns=MultiIndex.from_product(
+            [["1A", "1B"], ["2A", "2B"]], names=["Lev1", "Lev2"]
+        ),
+        index=["C1", "C2"],
+    )
+    result = df1 * df2
+    tm.assert_frame_equal(result, expected)
+
+
+def test_arithmetic_multiindex_column_align_with_fillvalue():
+    # GH#60903
+    df1 = DataFrame(
+        data=[[1.0, 2.0]],
+        columns=MultiIndex.from_tuples([("A", "one"), ("A", "two")]),
+    )
+    df2 = DataFrame(
+        data=[[3.0, 4.0]],
+        columns=MultiIndex.from_tuples([("B", "one"), ("B", "two")]),
+    )
+    expected = DataFrame(
+        data=[[1.0, 2.0, 3.0, 4.0]],
+        columns=MultiIndex.from_tuples(
+            [("A", "one"), ("A", "two"), ("B", "one"), ("B", "two")]
+        ),
+    )
+    result = df1.add(df2, fill_value=0)
+    tm.assert_frame_equal(result, expected)
+
+
 def test_bool_frame_mult_float():
     # GH 18549
     df = DataFrame(True, list("ab"), list("cd"))
@@ -2126,11 +2183,19 @@ def test_enum_column_equality():
     tm.assert_series_equal(result, expected)


-def test_mixed_col_index_dtype():
+def test_mixed_col_index_dtype(using_infer_string):
     # GH 47382
     df1 = DataFrame(columns=list("abc"), data=1.0, index=[0])
     df2 = DataFrame(columns=list("abc"), data=0.0, index=[0])
     df1.columns = df2.columns.astype("string")
     result = df1 + df2
     expected = DataFrame(columns=list("abc"), data=1.0, index=[0])
+    if using_infer_string:
+        # df2.columns.dtype will be "str" instead of object,
+        # so the aligned result will be "string", not object
+        if HAS_PYARROW:
+            dtype = "string[pyarrow]"
+        else:
+            dtype = "string"
+        expected.columns = expected.columns.astype(dtype)
     tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/frame/test_arrow_interface.py b/pandas/tests/frame/test_arrow_interface.py
new file mode 100644
index 0000000000000..b36b6b5ffe0cc
--- /dev/null
+++ b/pandas/tests/frame/test_arrow_interface.py
@@ -0,0 +1,47 @@
+import ctypes
+
+import pytest
+
+import pandas.util._test_decorators as td
+
+import pandas as pd
+
+pa = pytest.importorskip("pyarrow")
+
+
+@td.skip_if_no("pyarrow", min_version="14.0")
+def test_dataframe_arrow_interface(using_infer_string):
+    df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
+
+    capsule = df.__arrow_c_stream__()
+    assert (
+        ctypes.pythonapi.PyCapsule_IsValid(
+            ctypes.py_object(capsule), b"arrow_array_stream"
+        )
+        == 1
+    )
+
+    table = pa.table(df)
+    string_type = pa.large_string() if using_infer_string else pa.string()
+    expected = pa.table({"a": [1, 2, 3], "b": pa.array(["a", "b", "c"], string_type)})
+    assert table.equals(expected)
+
+    schema = pa.schema([("a", pa.int8()), ("b", pa.string())])
+    table = pa.table(df, schema=schema)
+    expected = expected.cast(schema)
+    assert table.equals(expected)
+
+
+@td.skip_if_no("pyarrow", min_version="15.0")
+def test_dataframe_to_arrow(using_infer_string):
+    df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
+
+    table = pa.RecordBatchReader.from_stream(df).read_all()
+    string_type = pa.large_string() if using_infer_string else pa.string()
+    expected = pa.table({"a": [1, 2, 3], "b": pa.array(["a", "b", "c"], string_type)})
+    assert table.equals(expected)
+
+    schema = pa.schema([("a", pa.int8()), ("b", pa.string())])
+    table = pa.RecordBatchReader.from_stream(df, schema=schema).read_all()
+    expected = expected.cast(schema)
+    assert table.equals(expected)
diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py
index 712494ef15f97..6fdbfac8f4e0a 100644
--- a/pandas/tests/frame/test_block_internals.py
+++ b/pandas/tests/frame/test_block_internals.py
@@ -7,9 +7,6 @@
 import numpy as np
 import pytest

-from pandas.errors import PerformanceWarning
-import pandas.util._test_decorators as td
-
 import pandas as pd
 from pandas import (
     Categorical,
@@ -26,11 +23,6 @@
 # structure

-# TODO(ArrayManager) check which of those tests need to be rewritten to test the
-# equivalent for ArrayManager
-pytestmark = td.skip_array_manager_invalid_test
-
-
 class TestDataFrameBlockInternals:
     def test_setitem_invalidates_datetime_index_freq(self):
         # GH#24096 altering a datetime64tz column inplace invalidates the
@@ -87,25 +79,10 @@ def test_consolidate_inplace(self, float_frame):
         for letter in range(ord("A"), ord("Z")):
             float_frame[chr(letter)] = chr(letter)

-    def test_modify_values(self, float_frame, using_copy_on_write):
-        if using_copy_on_write:
-            with pytest.raises(ValueError, match="read-only"):
-                float_frame.values[5] = 5
-            assert (float_frame.values[5] != 5).all()
-            return
-
-        float_frame.values[5] = 5
-        assert (float_frame.values[5] == 5).all()
-
-        # unconsolidated
-        float_frame["E"] = 7.0
-        col = float_frame["E"]
-        float_frame.values[6] = 6
-        # as of 2.0 .values does not consolidate, so subsequent calls to .values
-        # does not share data
-        assert not (float_frame.values[6] == 6).all()
-
-        assert (col == 7).all()
+    def test_modify_values(self, float_frame):
+        with pytest.raises(ValueError, match="read-only"):
+            float_frame.values[5] = 5
+        assert (float_frame.values[5] != 5).all()

     def test_boolean_set_uncons(self, float_frame):
         float_frame["E"] = 7.0
@@ -184,19 +161,6 @@ def test_constructor_with_convert(self):
         tm.assert_series_equal(result, expected)

     def test_construction_with_mixed(self, float_string_frame, using_infer_string):
-        # test construction edge cases with mixed types
-
-        # f7u12, this does not work without extensive workaround
-        data = [
-            [datetime(2001, 1, 5), np.nan, datetime(2001, 1, 2)],
-            [datetime(2000, 1, 2), datetime(2000, 1, 3), datetime(2000, 1, 1)],
-        ]
-        df = DataFrame(data)
-
-        # check dtypes
-        result = df.dtypes
-        expected = Series({"datetime64[us]": 3})
-
         # mixed-type frames
         float_string_frame["datetime"] = datetime.now()
         float_string_frame["timedelta"] = timedelta(days=1, seconds=1)
@@ -206,7 +170,9 @@ def test_construction_with_mixed(self, float_string_frame, using_infer_string):
         expected = Series(
             [np.dtype("float64")] * 4
             + [
-                np.dtype("object") if not using_infer_string else "string",
+                np.dtype("object")
+                if not using_infer_string
+                else pd.StringDtype(na_value=np.nan),
                 np.dtype("datetime64[us]"),
                 np.dtype("timedelta64[us]"),
             ],
@@ -218,8 +184,7 @@ def test_construction_with_conversions(self):
         # convert from a numpy array of non-ns timedelta64; as of 2.0 this does
         # *not* convert
         arr = np.array([1, 2, 3], dtype="timedelta64[s]")
-        df = DataFrame(index=range(3))
-        df["A"] = arr
+        df = DataFrame({"A": arr})
         expected = DataFrame(
             {"A": pd.timedelta_range("00:00:01", periods=3, freq="s")}, index=range(3)
         )
@@ -237,11 +202,11 @@ def test_construction_with_conversions(self):
         assert expected.dtypes["dt1"] == "M8[s]"
         assert expected.dtypes["dt2"] == "M8[s]"

-        df = DataFrame(index=range(3))
-        df["dt1"] = np.datetime64("2013-01-01")
-        df["dt2"] = np.array(
+        dt1 = np.datetime64("2013-01-01")
+        dt2 = np.array(
             ["2013-01-01", "2013-01-02", "2013-01-03"], dtype="datetime64[D]"
         )
+        df = DataFrame({"dt1": dt1, "dt2": dt2})

         # df['dt3'] = np.array(['2013-01-01 00:00:01','2013-01-01
         # 00:00:02','2013-01-01 00:00:03'],dtype='datetime64[s]')
@@ -268,24 +233,23 @@ def f(dtype):
         f("float64")

         # 10822
-        msg = "^Unknown datetime string format, unable to parse: aa, at position 0$"
+        msg = "^Unknown datetime string format, unable to parse: aa$"
         with pytest.raises(ValueError, match=msg):
             f("M8[ns]")

-    def test_pickle(self, float_string_frame, timezone_frame):
-        empty_frame = DataFrame()
-
+    def test_pickle_float_string_frame(self, float_string_frame):
         unpickled = tm.round_trip_pickle(float_string_frame)
         tm.assert_frame_equal(float_string_frame, unpickled)

         # buglet
         float_string_frame._mgr.ndim

-        # empty
+    def test_pickle_empty(self):
+        empty_frame = DataFrame()
         unpickled = tm.round_trip_pickle(empty_frame)
         repr(unpickled)

-        # tz frame
+    def test_pickle_empty_tz_frame(self, timezone_frame):
         unpickled = tm.round_trip_pickle(timezone_frame)
         tm.assert_frame_equal(timezone_frame, unpickled)

@@ -338,7 +302,7 @@ def test_is_mixed_type(self, float_frame, float_string_frame):
         assert not float_frame._is_mixed_type
         assert float_string_frame._is_mixed_type

-    def test_stale_cached_series_bug_473(self, using_copy_on_write, warn_copy_on_write):
+    def test_stale_cached_series_bug_473(self):
         # this is chained, but ok
         with option_context("chained_assignment", None):
             Y = DataFrame(
@@ -353,30 +317,23 @@ def test_stale_cached_series_bug_473(self):
             repr(Y)
             Y.sum()
             Y["g"].sum()
-            if using_copy_on_write:
-                assert not pd.isna(Y["g"]["c"])
-            else:
-                assert pd.isna(Y["g"]["c"])
+            assert not pd.isna(Y["g"]["c"])

-    @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning")
-    def test_strange_column_corruption_issue(self, using_copy_on_write):
+    def test_strange_column_corruption_issue(self, performance_warning):
         # TODO(wesm): Unclear how exactly this is related to internal matters
         df = DataFrame(index=[0, 1])
         df[0] = np.nan
         wasCol = {}

         with tm.assert_produces_warning(
-            PerformanceWarning, raise_on_extra_warnings=False
+            performance_warning, raise_on_extra_warnings=False
         ):
             for i, dt in enumerate(df.index):
                 for col in range(100, 200):
                     if col not in wasCol:
                         wasCol[col] = 1
                         df[col] = np.nan
-                    if using_copy_on_write:
-                        df.loc[dt, col] = i
-                    else:
-                        df[col][dt] = i
+                    df.loc[dt, col] = i

         myid = 100
@@ -414,33 +371,29 @@ def test_add_column_with_pandas_array(self):
     tm.assert_frame_equal(df, df2)


-def test_update_inplace_sets_valid_block_values(using_copy_on_write):
+def test_update_inplace_sets_valid_block_values():
     # https://github.com/pandas-dev/pandas/issues/33457
     df = DataFrame({"a": Series([1, 2, None], dtype="category")})

     # inplace update of a single column
-    if using_copy_on_write:
-        with tm.raises_chained_assignment_error():
-            df["a"].fillna(1, inplace=True)
-    else:
-        with tm.assert_produces_warning(FutureWarning, match="inplace method"):
-            df["a"].fillna(1, inplace=True)
+    with tm.raises_chained_assignment_error():
+        df["a"].fillna(1, inplace=True)

     # check we haven't put a Series into any block.values
     assert isinstance(df._mgr.blocks[0].values, Categorical)

-    if not using_copy_on_write:
-        # smoketest for OP bug from GH#35731
-        assert df.isnull().sum().sum() == 0
-

 def test_nonconsolidated_item_cache_take():
     # https://github.com/pandas-dev/pandas/issues/35521

     # create non-consolidated dataframe with object dtype columns
-    df = DataFrame()
-    df["col1"] = Series(["a"], dtype=object)
+    df = DataFrame(
+        {
+            "col1": Series(["a"], dtype=object),
+        }
+    )
     df["col2"] = Series([0], dtype=object)
+    assert not df._mgr.is_consolidated()

     # access column (item cache)
     df["col1"] == "A"
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
index 6e818d79d5ba8..2426c89dbcff5 100644
--- a/pandas/tests/frame/test_constructors.py
+++ b/pandas/tests/frame/test_constructors.py
@@ -14,18 +14,16 @@
 )
 import functools
 import re
+import zoneinfo

 import numpy as np
 from numpy import ma
 from numpy.ma import mrecords
 import pytest
-import pytz
-
-from pandas._config import using_pyarrow_string_dtype

 from pandas._libs import lib
+from pandas.compat.numpy import np_version_gt2
 from pandas.errors import IntCastingNaNError
-import pandas.util._test_decorators as td

 from pandas.core.dtypes.common import is_integer_dtype
 from pandas.core.dtypes.dtypes import (
@@ -81,19 +79,18 @@ def test_constructor_from_ndarray_with_str_dtype(self):
         # with an array of strings each of which is e.g. "[0 1 2]"
         arr = np.arange(12).reshape(4, 3)
         df = DataFrame(arr, dtype=str)
-        expected = DataFrame(arr.astype(str), dtype=object)
+        expected = DataFrame(arr.astype(str), dtype="str")
         tm.assert_frame_equal(df, expected)

-    def test_constructor_from_2d_datetimearray(self, using_array_manager):
+    def test_constructor_from_2d_datetimearray(self):
         dti = date_range("2016-01-01", periods=6, tz="US/Pacific")
         dta = dti._data.reshape(3, 2)

         df = DataFrame(dta)
         expected = DataFrame({0: dta[:, 0], 1: dta[:, 1]})
         tm.assert_frame_equal(df, expected)
-        if not using_array_manager:
-            # GH#44724 big performance hit if we de-consolidate
-            assert len(df._mgr.blocks) == 1
+        # GH#44724 big performance hit if we de-consolidate
+        assert len(df._mgr.blocks) == 1

     def test_constructor_dict_with_tzaware_scalar(self):
         # GH#42505
@@ -102,7 +99,7 @@ def test_constructor_dict_with_tzaware_scalar(self):

         df = DataFrame({"dt": dt}, index=[0])
         expected = DataFrame({"dt": [dt]})
-        tm.assert_frame_equal(df, expected)
+        tm.assert_frame_equal(df, expected, check_index_type=False)

         # Non-homogeneous
         df = DataFrame({"dt": dt, "value": [1]})
@@ -122,7 +119,7 @@ def test_construct_ndarray_with_nas_and_int_dtype(self):

     def test_construct_from_list_of_datetimes(self):
         df = DataFrame([datetime.now(), datetime.now()])
-        assert df[0].dtype == np.dtype("M8[ns]")
+        assert df[0].dtype == np.dtype("M8[us]")

     def test_constructor_from_tzaware_datetimeindex(self):
         # don't cast a DatetimeIndex WITH a tz, leave as object
@@ -181,24 +178,24 @@ def test_datetimelike_values_with_object_dtype(self, kind, frame_or_series):
             arr = arr[:, 0]

         obj = frame_or_series(arr, dtype=object)
-        assert obj._mgr.arrays[0].dtype == object
-        assert isinstance(obj._mgr.arrays[0].ravel()[0], scalar_type)
+        assert obj._mgr.blocks[0].values.dtype == object
+        assert isinstance(obj._mgr.blocks[0].values.ravel()[0], scalar_type)

         # go through a different path in internals.construction
         obj = frame_or_series(frame_or_series(arr), dtype=object)
-        assert obj._mgr.arrays[0].dtype == object
-        assert isinstance(obj._mgr.arrays[0].ravel()[0], scalar_type)
+        assert obj._mgr.blocks[0].values.dtype == object
+        assert isinstance(obj._mgr.blocks[0].values.ravel()[0], scalar_type)

         obj = frame_or_series(frame_or_series(arr), dtype=NumpyEADtype(object))
-        assert obj._mgr.arrays[0].dtype == object
-        assert isinstance(obj._mgr.arrays[0].ravel()[0], scalar_type)
+        assert obj._mgr.blocks[0].values.dtype == object
+        assert isinstance(obj._mgr.blocks[0].values.ravel()[0], scalar_type)

         if frame_or_series is DataFrame:
             # other paths through internals.construction
             sers = [Series(x) for x in arr]
             obj = frame_or_series(sers, dtype=object)
-            assert obj._mgr.arrays[0].dtype == object
-            assert isinstance(obj._mgr.arrays[0].ravel()[0], scalar_type)
+            assert obj._mgr.blocks[0].values.dtype == object
+            assert isinstance(obj._mgr.blocks[0].values.ravel()[0], scalar_type)

     def test_series_with_name_not_matching_column(self):
         # GH#9232
@@ -236,16 +233,9 @@ def test_empty_constructor(self, constructor):
         assert len(result.columns) == 0
         tm.assert_frame_equal(result, expected)

-    @pytest.mark.parametrize(
-        "constructor",
-        [
-            lambda: DataFrame({}),
-            lambda: DataFrame(data={}),
-        ],
-    )
-    def test_empty_constructor_object_index(self, constructor):
+    def test_empty_constructor_object_index(self):
         expected = DataFrame(index=RangeIndex(0), columns=RangeIndex(0))
-        result = constructor()
+        result = DataFrame({})
         assert len(result.index) == 0
         assert len(result.columns) == 0
         tm.assert_frame_equal(result, expected, check_index_type=True)
@@ -264,7 +254,7 @@ def test_emptylike_constructor(self, emptylike, expected_index, expected_columns
         tm.assert_frame_equal(result, expected)

     def test_constructor_mixed(self, float_string_frame, using_infer_string):
-        dtype = "string" if using_infer_string else np.object_
+        dtype = "str" if using_infer_string else np.object_
         assert float_string_frame["foo"].dtype == dtype

     def test_constructor_cast_failure(self):
@@ -296,49 +286,49 @@ def test_constructor_dtype_copy(self):
         new_df["col1"] = 200.0
         assert orig_df["col1"][0] == 1.0

-    def test_constructor_dtype_nocast_view_dataframe(
-        self, using_copy_on_write, warn_copy_on_write
-    ):
+    def test_constructor_dtype_nocast_view_dataframe(self):
         df = DataFrame([[1, 2]])
         should_be_view = DataFrame(df, dtype=df[0].dtype)
-        if using_copy_on_write:
-            should_be_view.iloc[0, 0] = 99
-            assert df.values[0, 0] == 1
-        else:
-            with tm.assert_cow_warning(warn_copy_on_write):
-                should_be_view.iloc[0, 0] = 99
-            assert df.values[0, 0] == 99
+        should_be_view.iloc[0, 0] = 99
+        assert df.values[0, 0] == 1

-    def test_constructor_dtype_nocast_view_2d_array(
-        self, using_array_manager, using_copy_on_write, warn_copy_on_write
-    ):
+    def test_constructor_dtype_nocast_view_2d_array(self):
         df = DataFrame([[1, 2], [3, 4]], dtype="int64")
-        if not using_array_manager and not using_copy_on_write:
-            should_be_view = DataFrame(df.values, dtype=df[0].dtype)
-            # TODO(CoW-warn) this should warn
-            # with tm.assert_cow_warning(warn_copy_on_write):
-            should_be_view.iloc[0, 0] = 97
-            assert df.values[0, 0] == 97
-        else:
-            # INFO(ArrayManager) DataFrame(ndarray) doesn't necessarily preserve
-            # a view on the array to ensure contiguous 1D arrays
-            df2 = DataFrame(df.values, dtype=df[0].dtype)
-            assert df2._mgr.arrays[0].flags.c_contiguous
-
-    @td.skip_array_manager_invalid_test
-    @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="conversion copies")
-    def test_1d_object_array_does_not_copy(self):
+        df2 = DataFrame(df.values, dtype=df[0].dtype)
+        assert df2._mgr.blocks[0].values.flags.c_contiguous
+
+    def test_1d_object_array_does_not_copy(self, using_infer_string):
         # https://github.com/pandas-dev/pandas/issues/39272
         arr = np.array(["a", "b"], dtype="object")
         df = DataFrame(arr, copy=False)
+        if using_infer_string:
+            if df[0].dtype.storage == "pyarrow":
+                # object dtype strings are converted to arrow memory,
+                # no numpy arrays to compare
+                pass
+            else:
+                assert np.shares_memory(df[0].to_numpy(), arr)
+        else:
+            assert np.shares_memory(df.values, arr)
+
+        df = DataFrame(arr, dtype=object, copy=False)
         assert np.shares_memory(df.values, arr)

-    @td.skip_array_manager_invalid_test
-    @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="conversion copies")
-    def test_2d_object_array_does_not_copy(self):
+    def test_2d_object_array_does_not_copy(self, using_infer_string):
         # https://github.com/pandas-dev/pandas/issues/39272
         arr = np.array([["a", "b"], ["c", "d"]], dtype="object")
         df = DataFrame(arr, copy=False)
+        if using_infer_string:
+            if df[0].dtype.storage == "pyarrow":
+                # object dtype strings are converted to arrow memory,
+                # no numpy arrays to compare
+                pass
+            else:
+                assert np.shares_memory(df[0].to_numpy(), arr)
+        else:
+            assert np.shares_memory(df.values, arr)
+
+        df = DataFrame(arr, dtype=object, copy=False)
         assert np.shares_memory(df.values, arr)

     def test_constructor_dtype_list_data(self):
@@ -594,7 +584,7 @@ def test_constructor_invalid_items_unused(self, scalar):
         expected = DataFrame(columns=["b"])
         tm.assert_frame_equal(result, expected)

-    @pytest.mark.parametrize("value", [2, np.nan, None, float("nan")])
+    @pytest.mark.parametrize("value", [4, np.nan, None, float("nan")])
     def test_constructor_dict_nan_key(self, value):
         # GH 18455
         cols = [1, value, 3]
@@ -788,7 +778,7 @@ def test_constructor_dict_cast(self, using_infer_string):
         frame = DataFrame(test_data)
         assert len(frame) == 3
-        assert frame["B"].dtype == np.object_ if not using_infer_string else "string"
+        assert frame["B"].dtype == np.object_ if not using_infer_string else "str"
         assert frame["A"].dtype == np.float64

     def test_constructor_dict_cast2(self):
@@ -880,16 +870,18 @@ def create_data(constructor):

         expected = DataFrame(
             [
-                {0: 0, 1: None, 2: None, 3: None},
-                {0: None, 1: 2, 2: None, 3: None},
-                {0: None, 1: None, 2: 4, 3: None},
-                {0: None, 1: None, 2: None, 3: 6},
+                [0, None, None, None],
+                [None, 2, None, None],
+                [None, None, 4, None],
+                [None, None, None, 6],
             ],
             index=[Timestamp(dt) for dt in dates_as_str],
         )

         result_datetime64 = DataFrame(data_datetime64)
         result_datetime = DataFrame(data_datetime)
+        assert result_datetime.index.unit == "us"
+        result_datetime.index = result_datetime.index.as_unit("s")
         result_Timestamp = DataFrame(data_Timestamp)
         tm.assert_frame_equal(result_datetime64, expected)
         tm.assert_frame_equal(result_datetime, expected)
@@ -959,7 +951,7 @@ def test_constructor_dict_extension_scalar(self, ea_scalar_and_dtype):
     )
     def test_constructor_extension_scalar_data(self, data, dtype):
         # GH 34832
-        df = DataFrame(index=[0, 1], columns=["a", "b"], data=data)
+        df = DataFrame(index=range(2), columns=["a", "b"], data=data)

         assert df["a"].dtype == dtype
         assert df["b"].dtype == dtype
@@ -1208,7 +1200,7 @@ def test_constructor_scalar_inference(self, using_infer_string):
         assert df["bool"].dtype == np.bool_
         assert df["float"].dtype == np.float64
         assert df["complex"].dtype == np.complex128
-        assert df["object"].dtype == np.object_ if not using_infer_string else "string"
+        assert df["object"].dtype == np.object_ if not using_infer_string else "str"

     def test_constructor_arrays_and_scalars(self):
         df = DataFrame({"a": np.random.default_rng(2).standard_normal(10), "b": True})
@@ -1291,11 +1283,11 @@ def test_constructor_list_of_lists(self, using_infer_string):
         # GH #484
         df = DataFrame(data=[[1, "a"], [2, "b"]], columns=["num", "str"])
         assert is_integer_dtype(df["num"])
-        assert df["str"].dtype == np.object_ if not using_infer_string else "string"
+        assert df["str"].dtype == np.object_ if not using_infer_string else "str"

         # GH 4851
         # list of 0-dim ndarrays
-        expected = DataFrame({0: np.arange(10)})
+        expected = DataFrame(np.arange(10))
         data = [np.array(x) for x in range(10)]
         result = DataFrame(data)
         tm.assert_frame_equal(result, expected)
@@ -1347,12 +1339,12 @@ def test_constructor_unequal_length_nested_list_column(self):
             [[Timestamp("2021-01-01")]],
             [{"x": Timestamp("2021-01-01")}],
             {"x": [Timestamp("2021-01-01")]},
-            {"x": Timestamp("2021-01-01").as_unit("ns")},
+            {"x": Timestamp("2021-01-01")},
         ],
     )
     def test_constructor_one_element_data_list(self, data):
         # GH#42810
-        result = DataFrame(data, index=[0, 1, 2], columns=["x"])
+        result = DataFrame(data, index=range(3), columns=["x"])
         expected = DataFrame({"x": [Timestamp("2021-01-01")] * 3})
         tm.assert_frame_equal(result, expected)
@@ -1559,7 +1551,7 @@ def test_constructor_mixed_type_rows(self):
         "tuples,lists",
         [
             ((), []),
-            ((()), []),
+            (((),), [[]]),
             (((), ()), [(), ()]),
             (((), ()), [[], []]),
             (([], []), [[], []]),
@@ -1659,7 +1651,7 @@ def test_constructor_Series_named(self):
         s = Series(arr, index=range(3, 13))
         df = DataFrame(s)
         expected = DataFrame({0: s})
-        tm.assert_frame_equal(df, expected)
+        tm.assert_frame_equal(df, expected, check_column_type=False)

         msg = r"Shape of passed values is \(10, 1\), indices imply \(10, 2\)"
         with pytest.raises(ValueError, match=msg):
@@ -1678,7 +1670,7 @@ def test_constructor_Series_named(self):

         # this is a bit non-intuitive here; the series collapse down to arrays
         df = DataFrame([arr, s1]).T
-        expected = DataFrame({1: s1, 0: arr}, columns=[0, 1])
+        expected = DataFrame({1: s1, 0: arr}, columns=range(2))
         tm.assert_frame_equal(df, expected)

     def test_constructor_Series_named_and_columns(self):
@@ -1791,12 +1783,18 @@ def test_constructor_column_duplicates(self):

         tm.assert_frame_equal(idf, edf)

-    def test_constructor_empty_with_string_dtype(self):
+    def test_constructor_empty_with_string_dtype(self, using_infer_string):
         # GH 9428
         expected = DataFrame(index=[0, 1], columns=[0, 1], dtype=object)
+        expected_str = DataFrame(
+            index=[0, 1], columns=[0, 1], dtype=pd.StringDtype(na_value=np.nan)
+        )

         df = DataFrame(index=[0, 1], columns=[0, 1], dtype=str)
-        tm.assert_frame_equal(df, expected)
+        if using_infer_string:
+            tm.assert_frame_equal(df, expected_str)
+        else:
+            tm.assert_frame_equal(df, expected)
         df = DataFrame(index=[0, 1], columns=[0, 1], dtype=np.str_)
         tm.assert_frame_equal(df, expected)
         df = DataFrame(index=[0, 1], columns=[0, 1], dtype="U5")
@@ -1859,7 +1857,12 @@ def test_constructor_with_datetimes(self, using_infer_string):
         result = df.dtypes
         expected = Series(
             [np.dtype("int64")]
-            + [np.dtype(objectname) if not using_infer_string else "string"] * 2
+            + [
+                np.dtype(objectname)
+                if not using_infer_string
+                else pd.StringDtype(na_value=np.nan)
+            ]
+            * 2
             + [np.dtype("M8[s]"), np.dtype("M8[us]")],
             index=list("ABCDE"),
         )
@@ -1881,7 +1884,11 @@ def test_constructor_with_datetimes(self, using_infer_string):
         expected = Series(
             [np.dtype("float64")]
             + [np.dtype("int64")]
-            + [np.dtype("object") if not using_infer_string else "string"]
+            + [
+                np.dtype("object")
+                if not using_infer_string
+                else pd.StringDtype(na_value=np.nan)
+            ]
             + [np.dtype("float64")]
             + [np.dtype(intname)],
             index=["a", "b", "c", floatname, intname],
@@ -1903,7 +1910,11 @@ def test_constructor_with_datetimes(self, using_infer_string):
         expected = Series(
             [np.dtype("float64")]
             + [np.dtype("int64")]
-            + [np.dtype("object") if not using_infer_string else "string"]
+            + [
+                np.dtype("object")
+                if not using_infer_string
+                else pd.StringDtype(na_value=np.nan)
+            ]
             + [np.dtype("float64")]
             + [np.dtype(intname)],
             index=["a", "b", "c", floatname, intname],
@@ -1915,7 +1926,7 @@ def test_constructor_with_datetimes1(self):
         ind = date_range(start="2000-01-01", freq="D", periods=10)
         datetimes = [ts.to_pydatetime() for ts in ind]
         datetime_s = Series(datetimes)
-        assert datetime_s.dtype == "M8[ns]"
+        assert datetime_s.dtype == "M8[us]"

     def test_constructor_with_datetimes2(self):
         # GH 2810
@@ -1926,7 +1937,7 @@ def test_constructor_with_datetimes2(self):
         df["dates"] = dates
         result = df.dtypes
         expected = Series(
-            [np.dtype("datetime64[ns]"), np.dtype("object")],
+            [np.dtype("datetime64[us]"), np.dtype("object")],
             index=["datetimes", "dates"],
         )
         tm.assert_series_equal(result, expected)
@@ -1934,8 +1945,7 @@ def test_constructor_with_datetimes3(self):
         # GH 7594
         # don't coerce tz-aware
-        tz = pytz.timezone("US/Eastern")
-        dt = tz.localize(datetime(2012, 1, 1))
+        dt = datetime(2012, 1, 1, tzinfo=zoneinfo.ZoneInfo("US/Eastern"))

         df = DataFrame({"End Date": dt}, index=[0])
         assert df.iat[0, 0] == dt
@@ -1946,7 +1956,7 @@ def test_constructor_with_datetimes3(self):
         df = DataFrame([{"End Date": dt}])
         assert df.iat[0, 0] == dt
         tm.assert_series_equal(
-            df.dtypes, Series({"End Date": "datetime64[ns, US/Eastern]"}, dtype=object)
+            df.dtypes, Series({"End Date": "datetime64[us, US/Eastern]"}, dtype=object)
         )

     def test_constructor_with_datetimes4(self):
@@ -1999,7 +2009,14 @@ def test_constructor_with_datetimes6(self):
     def test_constructor_datetimes_with_nulls(self, arr):
         # gh-15869, GH#11220
         result = DataFrame(arr).dtypes
-        expected = Series([np.dtype("datetime64[ns]")])
+        unit = "ns"
+        if isinstance(arr, np.ndarray):
+            # inferred from a pydatetime object
+            unit = "us"
+        elif not any(isinstance(x, np.datetime64) for y in arr for x in y):
+            # TODO: it is not clear why this condition leads to different behavior
+            unit = "s"
+        expected = Series([np.dtype(f"datetime64[{unit}]")])
         tm.assert_series_equal(result, expected)

     @pytest.mark.parametrize("order", ["K", "A", "C", "F"])
@@ -2122,8 +2139,10 @@ def test_constructor_for_list_with_dtypes(self, using_infer_string):
             [
                 np.dtype("int64"),
                 np.dtype("float64"),
-                np.dtype("object") if not using_infer_string else "string",
-                np.dtype("datetime64[ns]"),
+                np.dtype("object")
+                if not using_infer_string
+                else pd.StringDtype(na_value=np.nan),
+                np.dtype("datetime64[us]"),
                 np.dtype("float64"),
             ],
             index=list("abcde"),
@@ -2147,35 +2166,15 @@ def test_constructor_frame_shallow_copy(self, float_frame):
         cop.index = np.arange(len(cop))
         tm.assert_frame_equal(float_frame, orig)

-    def test_constructor_ndarray_copy(
-        self, float_frame, using_array_manager, using_copy_on_write
-    ):
-        if not using_array_manager:
-            arr = float_frame.values.copy()
-            df = DataFrame(arr)
-
-            arr[5] = 5
-            if using_copy_on_write:
-                assert not (df.values[5] == 5).all()
-            else:
-                assert (df.values[5] == 5).all()
+    def test_constructor_ndarray_copy(self, float_frame):
+        arr = float_frame.values.copy()
+        df = DataFrame(arr)

-            df = DataFrame(arr, copy=True)
-            arr[6] = 6
-            assert not (df.values[6] == 6).all()
-        else:
-            arr = float_frame.values.copy()
-            # default: copy to ensure contiguous arrays
-            df = DataFrame(arr)
-            assert df._mgr.arrays[0].flags.c_contiguous
-            arr[0, 0] = 100
-            assert df.iloc[0, 0] != 100
-
-            # manually specify copy=False
-            df = DataFrame(arr, copy=False)
-            assert not df._mgr.arrays[0].flags.c_contiguous
-            arr[0, 0] = 1000
-            assert df.iloc[0, 0] == 1000
+        arr[5] = 5
+        assert not (df.values[5] == 5).all()

+        df = DataFrame(arr, copy=True)
+        arr[6] = 6
+        assert not (df.values[6] == 6).all()

     def test_constructor_series_copy(self, float_frame):
         series = float_frame._series
@@ -2328,15 +2327,10 @@ def test_check_dtype_empty_numeric_column(self, dtype):
     @pytest.mark.parametrize(
         "dtype", tm.STRING_DTYPES + tm.BYTES_DTYPES + tm.OBJECT_DTYPES
     )
-    def test_check_dtype_empty_string_column(self, request, dtype, using_array_manager):
+    def test_check_dtype_empty_string_column(self, dtype):
         # GH24386: Ensure dtypes are set correctly for an empty DataFrame.
         # Empty DataFrame is generated via dictionary data with non-overlapping columns.
data = DataFrame({"a": [1, 2]}, columns=["b"], dtype=dtype) - - if using_array_manager and dtype in tm.BYTES_DTYPES: - # TODO(ArrayManager) astype to bytes dtypes does not yet give object dtype - td.mark_array_manager_not_yet_implemented(request) - assert data.b.dtype.name == "object" def test_to_frame_with_falsey_names(self): @@ -2407,6 +2401,9 @@ def test_construct_with_two_categoricalindex_series(self): ) tm.assert_frame_equal(result, expected) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) def test_constructor_series_nonexact_categoricalindex(self): # GH 42424 ser = Series(range(100)) @@ -2451,7 +2448,7 @@ class DatetimeSubclass(datetime): pass data = DataFrame({"datetime": [DatetimeSubclass(2020, 1, 1, 1, 1)]}) - assert data.datetime.dtype == "datetime64[ns]" + assert data.datetime.dtype == "datetime64[us]" def test_with_mismatched_index_length_raises(self): # GH#33437 @@ -2511,21 +2508,10 @@ def test_constructor_list_str_na(self, string_dtype): @pytest.mark.parametrize("copy", [False, True]) def test_dict_nocopy( self, - request, copy, any_numeric_ea_dtype, any_numpy_dtype, - using_array_manager, - using_copy_on_write, ): - if ( - using_array_manager - and not copy - and any_numpy_dtype not in tm.STRING_DTYPES + tm.BYTES_DTYPES - ): - # TODO(ArrayManager) properly honor copy keyword for dict input - td.mark_array_manager_not_yet_implemented(request) - a = np.array([1, 2], dtype=any_numpy_dtype) b = np.array([3, 4], dtype=any_numpy_dtype) if b.dtype.kind in ["S", "U"]: @@ -2546,13 +2532,11 @@ def get_base(obj): raise TypeError def check_views(c_only: bool = False): - # written to work for either BlockManager or ArrayManager - # Check that the underlying data behind df["c"] is still `c` # after setting with iloc. Since we don't know which entry in - # df._mgr.arrays corresponds to df["c"], we just check that exactly + # df._mgr.blocks corresponds to df["c"], we just check that exactly # one of these arrays is `c`. GH#38939 - assert sum(x is c for x in df._mgr.arrays) == 1 + assert sum(x.values is c for x in df._mgr.blocks) == 1 if c_only: # If we ever stop consolidating in setitem_with_indexer, # this will become unnecessary. 
@@ -2560,17 +2544,17 @@ def check_views(c_only: bool = False):

             assert (
                 sum(
-                    get_base(x) is a
-                    for x in df._mgr.arrays
-                    if isinstance(x.dtype, np.dtype)
+                    get_base(x.values) is a
+                    for x in df._mgr.blocks
+                    if isinstance(x.values.dtype, np.dtype)
                 )
                 == 1
             )
             assert (
                 sum(
-                    get_base(x) is b
-                    for x in df._mgr.arrays
-                    if isinstance(x.dtype, np.dtype)
+                    get_base(x.values) is b
+                    for x in df._mgr.blocks
+                    if isinstance(x.values.dtype, np.dtype)
                 )
                 == 1
             )
@@ -2580,11 +2564,13 @@ def check_views(c_only: bool = False):
         check_views()

         # TODO: most of the rest of this test belongs in indexing tests
-        if lib.is_np_dtype(df.dtypes.iloc[0], "fciuO"):
-            warn = None
+        should_raise = not lib.is_np_dtype(df.dtypes.iloc[0], "fciuO")
+        if should_raise:
+            with pytest.raises(TypeError, match="Invalid value"):
+                df.iloc[0, 0] = 0
+                df.iloc[0, 1] = 0
+            return
         else:
-            warn = FutureWarning
-        with tm.assert_produces_warning(warn, match="incompatible dtype"):
             df.iloc[0, 0] = 0
             df.iloc[0, 1] = 0
         if not copy:
@@ -2594,9 +2580,6 @@ def check_views(c_only: bool = False):
             # view, so we have to check in the other direction
             df.iloc[:, 2] = pd.array([45, 46], dtype=c.dtype)
             assert df.dtypes.iloc[2] == c.dtype
-            if not copy and not using_copy_on_write:
-                check_views(True)
-
         if copy:
             if a.dtype.kind == "M":
                 assert a[0] == a.dtype.type(1, "ns")
@@ -2606,12 +2589,6 @@ def check_views(c_only: bool = False):
             assert b[0] == b.dtype.type(3)
             # FIXME(GH#35417): enable after GH#35417
             assert c[0] == c_orig[0]  # i.e. df.iloc[0, 2]=45 did *not* update c
-        elif not using_copy_on_write:
-            # TODO: we can call check_views if we stop consolidating
-            # in setitem_with_indexer
-            assert c[0] == 45  # i.e. df.iloc[0, 2]=45 *did* update c
-            # TODO: we can check b[0] == 0 if we stop consolidating in
-            # setitem_with_indexer (except for datetimelike?)

     def test_construct_from_dict_ea_series(self):
         # GH#53744 - default of copy=True should also apply for Series with
@@ -2703,8 +2680,7 @@ def test_construct_with_strings_and_none(self):

     def test_frame_string_inference(self):
         # GH#54430
-        pytest.importorskip("pyarrow")
-        dtype = "string[pyarrow_numpy]"
+        dtype = pd.StringDtype(na_value=np.nan)
         expected = DataFrame(
             {"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype)
         )
@@ -2738,8 +2714,7 @@ def test_frame_string_inference(self):

     def test_frame_string_inference_array_string_dtype(self):
         # GH#54496
-        pytest.importorskip("pyarrow")
-        dtype = "string[pyarrow_numpy]"
+        dtype = pd.StringDtype(na_value=np.nan)
         expected = DataFrame(
             {"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype)
         )
@@ -2763,28 +2738,61 @@ def test_frame_string_inference_array_string_dtype(self):

     def test_frame_string_inference_block_dim(self):
         # GH#55363
-        pytest.importorskip("pyarrow")
         with pd.option_context("future.infer_string", True):
             df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]]))
         assert df._mgr.blocks[0].ndim == 2

-    def test_inference_on_pandas_objects(self):
+    @pytest.mark.parametrize("klass", [Series, Index])
+    def test_inference_on_pandas_objects(self, klass):
         # GH#56012
-        idx = Index([Timestamp("2019-12-31")], dtype=object)
-        with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
-            result = DataFrame(idx, columns=["a"])
-        assert result.dtypes.iloc[0] != np.object_
-
-        result = DataFrame({"a": idx})
+        obj = klass([Timestamp("2019-12-31")], dtype=object)
+        result = DataFrame(obj, columns=["a"])
         assert result.dtypes.iloc[0] == np.object_

-        ser = Series([Timestamp("2019-12-31")], dtype=object)
-
-        with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
-            result = DataFrame(ser, columns=["a"])
-        assert result.dtypes.iloc[0] != np.object_
-
-        result = DataFrame({"a": ser})
+        result = DataFrame({"a": obj})
         assert result.dtypes.iloc[0] == np.object_

+    def test_dict_keys_returns_rangeindex(self):
+        result = DataFrame({0: [1], 1: [2]}).columns
+        expected = RangeIndex(2)
+        tm.assert_index_equal(result, expected, exact=True)
+
+    @pytest.mark.parametrize(
+        "cons", [Series, Index, DatetimeIndex, DataFrame, pd.array, pd.to_datetime]
+    )
+    def test_construction_datetime_resolution_inference(self, cons):
+        ts = Timestamp(2999, 1, 1)
+        ts2 = ts.tz_localize("US/Pacific")
+
+        obj = cons([ts])
+        res_dtype = tm.get_dtype(obj)
+        assert res_dtype == "M8[us]", res_dtype
+
+        obj2 = cons([ts2])
+        res_dtype2 = tm.get_dtype(obj2)
+        assert res_dtype2 == "M8[us, US/Pacific]", res_dtype2
+
+    def test_construction_nan_value_timedelta64_dtype(self):
+        # GH#60064
+        result = DataFrame([None, 1], dtype="timedelta64[ns]")
+        expected = DataFrame(
+            ["NaT", "0 days 00:00:00.000000001"], dtype="timedelta64[ns]"
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_dataframe_from_array_like_with_name_attribute(self):
+        # GH#61443
+        class DummyArray(np.ndarray):
+            def __new__(cls, input_array):
+                obj = np.asarray(input_array).view(cls)
+                obj.name = "foo"
+                return obj
+
+        dummy = DummyArray(np.eye(3))
+        df = DataFrame(dummy)
+        expected = DataFrame(np.eye(3))
+        tm.assert_frame_equal(df, expected)
+

 class TestDataFrameConstructorIndexInference:
     def test_frame_from_dict_of_series_overlapping_monthly_period_indexes(self):
@@ -2857,7 +2865,7 @@ def test_dict_data_arrow_column_expansion(self, key_val, col_vals, col_type):
         )
         result = DataFrame({key_val: [1, 2]}, columns=cols)
         expected = DataFrame([[1, np.nan], [2, np.nan]], columns=cols)
-        expected.iloc[:, 1] = expected.iloc[:, 1].astype(object)
+        expected.isetitem(1, expected.iloc[:, 1].astype(object))
         tm.assert_frame_equal(result, expected)
@@ -2916,8 +2924,8 @@ def test_construction_preserves_tzaware_dtypes(self, tz):
             [
                 np.dtype("datetime64[ns]"),
                 DatetimeTZDtype(tz=tz),
-                np.dtype("datetime64[ns]"),
-                DatetimeTZDtype(tz=tz),
+                np.dtype("datetime64[us]"),
+                DatetimeTZDtype(tz=tz, unit="us"),
             ],
             index=["dr", "dr_tz", "datetimes_naive", "datetimes_with_tz"],
         )
@@ -3014,7 +3022,8 @@ def test_frame_timeseries_column(self):
                     Timestamp("20130101T10:01:00", tz="US/Eastern"),
                     Timestamp("20130101T10:02:00", tz="US/Eastern"),
                 ]
-            }
+            },
+            dtype="M8[ns, US/Eastern]",
         )
         tm.assert_frame_equal(result, expected)

@@ -3067,9 +3076,9 @@ def test_from_tzaware_mixed_object_array(self):
         res = DataFrame(arr, columns=["A", "B", "C"])

         expected_dtypes = [
-            "datetime64[ns]",
-            "datetime64[ns, US/Eastern]",
-            "datetime64[ns, CET]",
+            "datetime64[s]",
+            "datetime64[s, US/Eastern]",
+            "datetime64[s, CET]",
         ]
         assert (res.dtypes == expected_dtypes).all()

@@ -3097,7 +3106,7 @@ def test_construction_from_ndarray_datetimelike(self):
         # constructed from 2D ndarray
         arr = np.arange(0, 12, dtype="datetime64[ns]").reshape(4, 3)
         df = DataFrame(arr)
-        assert all(isinstance(arr, DatetimeArray) for arr in df._mgr.arrays)
+        assert all(isinstance(block.values, DatetimeArray) for block in df._mgr.blocks)

     def test_construction_from_ndarray_with_eadtype_mismatched_columns(self):
         arr = np.random.default_rng(2).standard_normal((10, 2))
@@ -3118,6 +3127,29 @@ def test_columns_indexes_raise_on_sets(self):
         with pytest.raises(ValueError, match="columns cannot be a set"):
             DataFrame(data, columns={"a", "b", "c"})

+    def test_from_dict_with_columns_na_scalar(self):
+        result = DataFrame({"a": pd.NaT}, columns=["a"], index=range(2))
+        expected = DataFrame({"a": Series([pd.NaT, pd.NaT])})
+        tm.assert_frame_equal(result, expected)
+
+    # TODO: make this not cast to object in pandas 3.0
+    @pytest.mark.skipif(
+        not np_version_gt2, reason="StringDType only available in numpy 2 and above"
+    )
+    @pytest.mark.parametrize(
+        "data",
+        [
+            {"a": ["a", "b", "c"], "b": [1.0, 2.0, 3.0], "c": ["d", "e", "f"]},
+        ],
+    )
+    def test_np_string_array_object_cast(self, data):
+        from numpy.dtypes import StringDType
+
+        data["a"] = np.array(data["a"], dtype=StringDType())
+        res = DataFrame(data)
+        assert res["a"].dtype == np.object_
+        assert (res["a"] == data["a"]).all()
+

 def get1(obj):
     # TODO: make a helper in tm?
     if isinstance(obj, Series):
@@ -3200,14 +3232,6 @@ def test_from_out_of_bounds_ns_datetime(
         self, constructor, cls, request, box, frame_or_series
     ):
         # scalar that won't fit in nanosecond dt64, but will fit in microsecond
-        if box is list or (frame_or_series is Series and box is dict):
-            mark = pytest.mark.xfail(
-                reason="Timestamp constructor has been updated to cast dt64 to "
-                "non-nano, but DatetimeArray._from_sequence has not",
-                strict=True,
-            )
-            request.applymarker(mark)
-
         scalar = datetime(9999, 1, 1)
         exp_dtype = "M8[us]"  # pydatetime objects default to this reso
diff --git a/pandas/tests/frame/test_cumulative.py b/pandas/tests/frame/test_cumulative.py
index 5bd9c42612315..ab217e1b1332a 100644
--- a/pandas/tests/frame/test_cumulative.py
+++ b/pandas/tests/frame/test_cumulative.py
@@ -12,6 +12,7 @@
 from pandas import (
     DataFrame,
     Series,
+    Timestamp,
 )
 import pandas._testing as tm

@@ -46,20 +47,23 @@ def test_cumprod_smoke(self, datetime_frame):
             df.cumprod(0)
             df.cumprod(1)

-    @pytest.mark.parametrize("method", ["cumsum", "cumprod", "cummin", "cummax"])
-    def test_cumulative_ops_match_series_apply(self, datetime_frame, method):
+    def test_cumulative_ops_match_series_apply(
+        self, datetime_frame, all_numeric_accumulations
+    ):
         datetime_frame.iloc[5:10, 0] = np.nan
         datetime_frame.iloc[10:15, 1] = np.nan
         datetime_frame.iloc[15:, 2] = np.nan

         # axis = 0
-        result = getattr(datetime_frame, method)()
-        expected = datetime_frame.apply(getattr(Series, method))
+        result = getattr(datetime_frame, all_numeric_accumulations)()
+        expected = datetime_frame.apply(getattr(Series, all_numeric_accumulations))
         tm.assert_frame_equal(result, expected)

         # axis = 1
-        result = getattr(datetime_frame, method)(axis=1)
-        expected = datetime_frame.apply(getattr(Series, method), axis=1)
+        result = getattr(datetime_frame, all_numeric_accumulations)(axis=1)
+        expected = datetime_frame.apply(
+            getattr(Series, all_numeric_accumulations), axis=1
+        )
         tm.assert_frame_equal(result, expected)

         # fix issue TODO: GH ref?
@@ -79,3 +83,25 @@ def test_cumsum_preserve_dtypes(self):
             }
         )
         tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("method", ["cumsum", "cumprod", "cummin", "cummax"])
+    @pytest.mark.parametrize("axis", [0, 1])
+    def test_numeric_only_flag(self, method, axis):
+        df = DataFrame(
+            {
+                "int": [1, 2, 3],
+                "bool": [True, False, False],
+                "string": ["a", "b", "c"],
+                "float": [1.0, 3.5, 4.0],
+                "datetime": [
+                    Timestamp(2018, 1, 1),
+                    Timestamp(2019, 1, 1),
+                    Timestamp(2020, 1, 1),
+                ],
+            }
+        )
+        df_numeric_only = df.drop(["string", "datetime"], axis=1)
+
+        result = getattr(df, method)(axis=axis, numeric_only=True)
+        expected = getattr(df_numeric_only, method)(axis)
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/frame/test_logical_ops.py b/pandas/tests/frame/test_logical_ops.py
index 16ca3a202f1e0..fb43578744eb2 100644
--- a/pandas/tests/frame/test_logical_ops.py
+++ b/pandas/tests/frame/test_logical_ops.py
@@ -107,15 +107,12 @@ def test_logical_ops_invalid(self, using_infer_string):
         df1 = DataFrame("foo", index=[1], columns=["A"])
         df2 = DataFrame(True, index=[1], columns=["A"])
-        msg = re.escape("unsupported operand type(s) for |: 'str' and 'bool'")
-        if using_infer_string:
-            import pyarrow as pa
-
-            with pytest.raises(pa.lib.ArrowNotImplementedError, match="|has no kernel"):
-                df1 | df2
+        if using_infer_string and df1["A"].dtype.storage == "pyarrow":
+            msg = "operation 'or_' not supported for dtype 'str'"
         else:
-            with pytest.raises(TypeError, match=msg):
-                df1 | df2
+            msg = re.escape("unsupported operand type(s) for |: 'str' and 'bool'")
+        with pytest.raises(TypeError, match=msg):
+            df1 | df2

     def test_logical_operators(self):
         def _check_bin_op(op):
@@ -157,7 +154,6 @@ def _check_unary_op(op):
         _check_unary_op(operator.inv)  # TODO: belongs elsewhere

-    @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning")
     def test_logical_with_nas(self):
         d = DataFrame({"a": [np.nan, False], "b": [True, True]})

@@ -171,10 +167,7 @@ def test_logical_with_nas(self):
         result = d["a"].fillna(False) | d["b"]
         expected = Series([True, True])
         tm.assert_series_equal(result, expected)
-
-        msg = "The 'downcast' keyword in fillna is deprecated"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            result = d["a"].fillna(False, downcast=False) | d["b"]
+        result = d["a"].fillna(False) | d["b"]
         expected = Series([True, True])
         tm.assert_series_equal(result, expected)

diff --git a/pandas/tests/frame/test_nonunique_indexes.py b/pandas/tests/frame/test_nonunique_indexes.py
index 34f172e900ab7..1e9aa2325e880 100644
--- a/pandas/tests/frame/test_nonunique_indexes.py
+++ b/pandas/tests/frame/test_nonunique_indexes.py
@@ -284,7 +284,7 @@ def test_multi_dtype2(self):
         expected = DataFrame([[1, 2, "foo", "bar"]], columns=["a", "a.1", "a.2", "a.3"])
         tm.assert_frame_equal(df, expected)

-    def test_dups_across_blocks(self, using_array_manager):
+    def test_dups_across_blocks(self):
         # dups across blocks
         df_float = DataFrame(
             np.random.default_rng(2).standard_normal((10, 3)), dtype="float64"
@@ -299,9 +299,8 @@ def test_dups_across_blocks(self):
         )
         df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1)

-        if not using_array_manager:
-            assert len(df._mgr.blknos) == len(df.columns)
-            assert len(df._mgr.blklocs) == len(df.columns)
+        assert len(df._mgr.blknos) == len(df.columns)
+        assert len(df._mgr.blklocs) == len(df.columns)

         # testing iloc
         for i in range(len(df.columns)):
diff --git a/pandas/tests/frame/test_npfuncs.py b/pandas/tests/frame/test_npfuncs.py
index afb53bf2de93a..e9a241202d156 100644
--- a/pandas/tests/frame/test_npfuncs.py
+++ b/pandas/tests/frame/test_npfuncs.py
@@ -1,6 +1,7 @@
 """
 Tests for np.foo applied to DataFrame, not necessarily ufuncs.
 """
+
 import numpy as np

 from pandas import (
@@ -27,22 +28,16 @@ def test_np_sqrt(self, float_frame):

         tm.assert_frame_equal(result, float_frame.apply(np.sqrt))

-    def test_sum_deprecated_axis_behavior(self):
-        # GH#52042 deprecated behavior of df.sum(axis=None), which gets
+    def test_sum_axis_behavior(self):
+        # GH#52042 df.sum(axis=None) now reduces over both axes, which gets
         # called when we do np.sum(df)
         arr = np.random.default_rng(2).standard_normal((4, 3))
         df = DataFrame(arr)

-        msg = "The behavior of DataFrame.sum with axis=None is deprecated"
-        with tm.assert_produces_warning(
-            FutureWarning, match=msg, check_stacklevel=False
-        ):
-            res = np.sum(df)
-
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            expected = df.sum(axis=None)
-        tm.assert_series_equal(res, expected)
+        res = np.sum(df)
+        expected = df.to_numpy().sum(axis=None)
+        assert res == expected

     def test_np_ravel(self):
         # GH26247
diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py
index a498296e09c52..f93105498ac79 100644
--- a/pandas/tests/frame/test_query_eval.py
+++ b/pandas/tests/frame/test_query_eval.py
@@ -1,4 +1,5 @@
 import operator
+from tokenize import TokenError

 import numpy as np
 import pytest
@@ -58,26 +59,26 @@ def test_query_default(self, df, expected1, expected2):
         result = df.query("A>0")
         tm.assert_frame_equal(result, expected1)
         result = df.eval("A+1")
-        tm.assert_series_equal(result, expected2, check_names=False)
+        tm.assert_series_equal(result, expected2)

     def test_query_None(self, df, expected1, expected2):
         result = df.query("A>0", engine=None)
         tm.assert_frame_equal(result, expected1)
         result = df.eval("A+1", engine=None)
-        tm.assert_series_equal(result, expected2, check_names=False)
+        tm.assert_series_equal(result, expected2)

     def test_query_python(self, df, expected1, expected2):
         result = df.query("A>0", engine="python")
         tm.assert_frame_equal(result, expected1)
         result = df.eval("A+1", engine="python")
-        tm.assert_series_equal(result, expected2, check_names=False)
+        tm.assert_series_equal(result, expected2)

     def test_query_numexpr(self, df, expected1, expected2):
         if NUMEXPR_INSTALLED:
             result = df.query("A>0", engine="numexpr")
             tm.assert_frame_equal(result, expected1)
             result = df.eval("A+1", engine="numexpr")
-            tm.assert_series_equal(result, expected2, check_names=False)
+            tm.assert_series_equal(result, expected2)
         else:
             msg = (
                 r"'numexpr' is not installed or an unsupported version. "
@@ -158,6 +159,25 @@ def test_query_empty_string(self):
         with pytest.raises(ValueError, match=msg):
             df.query("")

+    def test_query_duplicate_column_name(self, engine, parser):
+        df = DataFrame(
+            {
+                "A": range(3),
+                "B": range(3),
+                "C": range(3)
+            }
+        ).rename(columns={"B": "A"})
+
+        res = df.query('C == 1', engine=engine, parser=parser)
+
+        expect = DataFrame(
+            [[1, 1, 1]],
+            columns=["A", "A", "C"],
+            index=[1]
+        )
+
+        tm.assert_frame_equal(res, expect)
+
     def test_eval_resolvers_as_list(self):
         # GH 14095
         df = DataFrame(
@@ -188,6 +208,39 @@ def test_eval_object_dtype_binop(self):
         expected = DataFrame({"a1": ["Y", "N"], "c": [True, False]})
         tm.assert_frame_equal(res, expected)

+    def test_using_numpy(self, engine, parser):
+        # GH 58041
+        skip_if_no_pandas_parser(parser)
+        df = Series([0.2, 1.5, 2.8], name="a").to_frame()
+        res = df.eval("@np.floor(a)", engine=engine, parser=parser)
+        expected = np.floor(df["a"])
+        tm.assert_series_equal(expected, res)
+
+    def test_eval_simple(self, engine, parser):
+        df = Series([0.2, 1.5, 2.8], name="a").to_frame()
+        res = df.eval("a", engine=engine, parser=parser)
+        expected = df["a"]
+        tm.assert_series_equal(expected, res)
+
+    def test_extension_array_eval(self, engine, parser, request):
+        # GH#58748
+        if engine == "numexpr":
+            mark = pytest.mark.xfail(
+                reason="numexpr does not support extension array dtypes"
+            )
+            request.applymarker(mark)
+        df = DataFrame({"a": pd.array([1, 2, 3]), "b": pd.array([4, 5, 6])})
+        result = df.eval("a / b", engine=engine, parser=parser)
+        expected = Series(pd.array([0.25, 0.40, 0.50]))
+        tm.assert_series_equal(result, expected)
+
+    def test_complex_eval(self, engine, parser):
+        # GH#21374
+        df = DataFrame({"a": [1 + 2j], "b": [1 + 1j]})
+        result = df.eval("a/b", engine=engine, parser=parser)
+        expected = Series([1.5 + 0.5j])
+        tm.assert_series_equal(result, expected)
+

 class TestDataFrameQueryWithMultiIndex:
     def test_query_with_named_multiindex(self, parser, engine):
@@ -730,7 +783,7 @@ def test_check_tz_aware_index_query(self, tz_aware_fixture):
         # https://github.com/pandas-dev/pandas/issues/29463
         tz = tz_aware_fixture
         df_index = date_range(
-            start="2019-01-01", freq="1d", periods=10, tz=tz, name="time"
+            start="2019-01-01", freq="1D", periods=10, tz=tz, name="time"
         )
         expected = DataFrame(index=df_index)
         df = DataFrame(index=df_index)
@@ -949,8 +1002,8 @@ def test_str_query_method(self, parser, engine):
             ops = 2 * ([eq] + [ne])
             msg = r"'(Not)?In' nodes are not implemented"

-            for lhs, op, rhs in zip(lhs, ops, rhs):
-                ex = f"{lhs} {op} {rhs}"
+            for lh, op_, rh in zip(lhs, ops, rhs):
+                ex = f"{lh} {op_} {rh}"
                 with pytest.raises(NotImplementedError, match=msg):
                     df.query(
                         ex,
@@ -990,8 +1043,8 @@ def test_str_list_query_method(self, parser, engine):
             ops = 2 * ([eq] + [ne])
             msg = r"'(Not)?In' nodes are not implemented"

-            for lhs, op, rhs in zip(lhs, ops, rhs):
-                ex = f"{lhs} {op} {rhs}"
+            for lh, ops_, rh in zip(lhs, ops, rhs):
+                ex = f"{lh} {ops_} {rh}"
                 with pytest.raises(NotImplementedError, match=msg):
                     df.query(ex, engine=engine, parser=parser)
         else:
@@ -1035,7 +1088,7 @@ def test_query_with_string_columns(self, parser, engine):
             with pytest.raises(NotImplementedError, match=msg):
                 df.query("a in b and c < d", parser=parser, engine=engine)

-    def test_object_array_eq_ne(self, parser, engine, using_infer_string):
+    def test_object_array_eq_ne(self, parser, engine):
         df = DataFrame(
             {
                 "a": list("aaaabbbbcccc"),
                 "b": list("aabbccddeeff"),
                 "c": np.random.default_rng(2).integers(5, size=12),
                 "d": np.random.default_rng(2).integers(9, size=12),
             }
         )
-        warning = RuntimeWarning if using_infer_string and engine == "numexpr" else None
-        with tm.assert_produces_warning(warning):
-            res = df.query("a == b", parser=parser, engine=engine)
+        res = df.query("a == b", parser=parser, engine=engine)
         exp = df[df.a == df.b]
         tm.assert_frame_equal(res, exp)

-        with tm.assert_produces_warning(warning):
-            res = df.query("a != b", parser=parser, engine=engine)
+        res = df.query("a != b", parser=parser, engine=engine)
         exp = df[df.a != df.b]
         tm.assert_frame_equal(res, exp)
@@ -1091,15 +1141,13 @@ def test_query_with_nested_special_character(self, parser, engine):
         ],
     )
     def test_query_lex_compare_strings(
-        self, parser, engine, op, func, using_infer_string
+        self, parser, engine, op, func
     ):
         a = Series(np.random.default_rng(2).choice(list("abcde"), 20))
         b = Series(np.arange(a.size))
         df = DataFrame({"X": a, "Y": b})

-        warning = RuntimeWarning if using_infer_string and engine == "numexpr" else None
-        with tm.assert_produces_warning(warning):
-            res = df.query(f'X {op} "d"', engine=engine, parser=parser)
+        res = df.query(f'X {op} "d"', engine=engine, parser=parser)
         expected = df[func(df.X, "d")]
         tm.assert_frame_equal(res, expected)

@@ -1144,6 +1192,7 @@ def test_query_string_null_elements(self, in_list):
         df_expected = DataFrame({"a": expected}, dtype="string")
         df_expected.index = df_expected.index.astype("int64")
         df = DataFrame({"a": in_list}, dtype="string")
+        df.index = Index(list(df.index), dtype=df.index.dtype)
         res1 = df.query("a == 'asdf'", parser=parser, engine=engine)
         res2 = df[df["a"] == "asdf"]
         res3 = df.query("a <= 'asdf'", parser=parser, engine=engine)
@@ -1187,7 +1236,7 @@ def df(self):
         by backticks. The last two columns cannot be escaped by backticks
         and should raise a ValueError.
         """
-        yield DataFrame(
+        return DataFrame(
             {
                 "A": [1, 2, 3],
                 "B B": [3, 2, 1],
@@ -1209,6 +1258,8 @@ def df(self):
                 "it's": [6, 3, 1],
                 "that's": [9, 1, 8],
                 "☺": [8, 7, 6],
+                "xy （z)": [1, 2, 3],  # noqa: RUF001
+                "xy （z\\uff09": [4, 5, 6],  # noqa: RUF001
                 "foo#bar": [2, 4, 5],
                 1: [5, 7, 9],
             }
@@ -1294,6 +1345,11 @@ def test_start_with_spaces(self, df):
         expect = df[" A"] + df[" "]
         tm.assert_series_equal(res, expect)

+    def test_ints(self, df):
+        res = df.query("`1` == 7")
+        expect = df[df[1] == 7]
+        tm.assert_frame_equal(res, expect)
+
     def test_lots_of_operators_string(self, df):
         res = df.query("` &^ :!€$?(} > <++*'' ` > 4")
         expect = df[df[" &^ :!€$?(} > <++*'' "] > 4]
@@ -1304,20 +1360,155 @@ def test_missing_attribute(self, df):
         with pytest.raises(AttributeError, match=message):
             df.eval("@pd.thing")

-    def test_failing_quote(self, df):
-        msg = r"(Could not convert ).*( to a valid Python identifier.)"
-        with pytest.raises(SyntaxError, match=msg):
-            df.query("`it's` > `that's`")
+    def test_quote(self, df):
+        res = df.query("`it's` > `that's`")
+        expect = df[df["it's"] > df["that's"]]
+        tm.assert_frame_equal(res, expect)

-    def test_failing_character_outside_range(self, df):
-        msg = r"(Could not convert ).*( to a valid Python identifier.)"
-        with pytest.raises(SyntaxError, match=msg):
-            df.query("`☺` > 4")
+    def test_character_outside_range_smiley(self, df):
+        res = df.query("`☺` > 4")
+        expect = df[df["☺"] > 4]
+        tm.assert_frame_equal(res, expect)

-    def test_failing_hashtag(self, df):
-        msg = "Failed to parse backticks"
-        with pytest.raises(SyntaxError, match=msg):
-            df.query("`foo#bar` > 4")
+    def test_character_outside_range_2_byte_parens(self, df):
+        # GH 49633
+        res = df.query("`xy （z)` == 2")  # noqa: RUF001
+        expect = df[df["xy （z)"] == 2]  # noqa: RUF001
+        tm.assert_frame_equal(res, expect)
+
+    def test_character_outside_range_and_actual_backslash(self, df):
+        # GH 49633
+        res = df.query("`xy （z\\uff09` == 2")  # noqa: RUF001
+        expect = df[df["xy \uff08z\\uff09"] == 2]
+        tm.assert_frame_equal(res, expect)
+
+    def test_hashtag(self, df):
+        res = df.query("`foo#bar` > 4")
+        expect = df[df["foo#bar"] > 4]
+        tm.assert_frame_equal(res, expect)
+
+    def test_expr_with_column_name_with_hashtag_character(self):
+        # GH 59285
+        df = DataFrame((1, 2, 3), columns=["a#"])
+        result = df.query("`a#` < 2")
+        expected = df[df["a#"] < 2]
+        tm.assert_frame_equal(result, expected)
+
+    def test_expr_with_comment(self):
+        # GH 59285
+        df = DataFrame((1, 2, 3), columns=["a#"])
+        result = df.query("`a#` < 2  # This is a comment")
+        expected = df[df["a#"] < 2]
+        tm.assert_frame_equal(result, expected)
+
+    def test_expr_with_column_name_with_backtick_and_hash(self):
+        # GH 59285
+        df = DataFrame((1, 2, 3), columns=["a`#b"])
+        result = df.query("`a``#b` < 2")
+        expected = df[df["a`#b"] < 2]
+        tm.assert_frame_equal(result, expected)
+
+    def test_expr_with_column_name_with_backtick(self):
+        # GH 59285
+        df = DataFrame({"a`b": (1, 2, 3), "ab": (4, 5, 6)})
+        result = df.query("`a``b` < 2")  # noqa
+        # Note: Formatting checks may wrongly consider the above ``inline code``.
+ expected = df[df["a`b"] < 2] + tm.assert_frame_equal(result, expected) + + def test_expr_with_string_with_backticks(self): + # GH 59285 + df = DataFrame(("`", "`````", "``````````"), columns=["#backticks"]) + result = df.query("'```' < `#backticks`") + expected = df["```" < df["#backticks"]] + tm.assert_frame_equal(result, expected) + + def test_expr_with_string_with_backticked_substring_same_as_column_name(self): + # GH 59285 + df = DataFrame(("`", "`````", "``````````"), columns=["#backticks"]) + result = df.query("'`#backticks`' < `#backticks`") + expected = df["`#backticks`" < df["#backticks"]] + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "col1,col2,expr", + [ + ("it's", "that's", "`it's` < `that's`"), + ('it"s', 'that"s', '`it"s` < `that"s`'), + ("it's", 'that\'s "nice"', "`it's` < `that's \"nice\"`"), + ("it's", "that's #cool", "`it's` < `that's #cool` # This is a comment"), + ], + ) + def test_expr_with_column_names_with_special_characters(self, col1, col2, expr): + # GH 59285 + df = DataFrame( + [ + {col1: 1, col2: 2}, + {col1: 3, col2: 4}, + {col1: -1, col2: -2}, + {col1: -3, col2: -4}, + ] + ) + result = df.query(expr) + expected = df[df[col1] < df[col2]] + tm.assert_frame_equal(result, expected) + + def test_expr_with_no_backticks(self): + # GH 59285 + df = DataFrame(("aaa", "vvv", "zzz"), columns=["column_name"]) + result = df.query("'value' < column_name") + expected = df["value" < df["column_name"]] + tm.assert_frame_equal(result, expected) + + def test_expr_with_no_quotes_and_backtick_is_unmatched(self): + # GH 59285 + df = DataFrame((1, 5, 10), columns=["column-name"]) + with pytest.raises((SyntaxError, TokenError), match="invalid syntax"): + df.query("5 < `column-name") + + def test_expr_with_no_quotes_and_backtick_is_matched(self): + # GH 59285 + df = DataFrame((1, 5, 10), columns=["column-name"]) + result = df.query("5 < `column-name`") + expected = df[5 < df["column-name"]] + tm.assert_frame_equal(result, expected) + + def test_expr_with_backtick_opened_before_quote_and_backtick_is_unmatched(self): + # GH 59285 + df = DataFrame((1, 5, 10), columns=["It's"]) + with pytest.raises( + (SyntaxError, TokenError), match="unterminated string literal" + ): + df.query("5 < `It's") + + def test_expr_with_backtick_opened_before_quote_and_backtick_is_matched(self): + # GH 59285 + df = DataFrame((1, 5, 10), columns=["It's"]) + result = df.query("5 < `It's`") + expected = df[5 < df["It's"]] + tm.assert_frame_equal(result, expected) + + def test_expr_with_quote_opened_before_backtick_and_quote_is_unmatched(self): + # GH 59285 + df = DataFrame(("aaa", "vvv", "zzz"), columns=["column-name"]) + with pytest.raises( + (SyntaxError, TokenError), match="unterminated string literal" + ): + df.query("`column-name` < 'It`s that\\'s \"quote\" #hash") + + def test_expr_with_quote_opened_before_backtick_and_quote_is_matched_at_end(self): + # GH 59285 + df = DataFrame(("aaa", "vvv", "zzz"), columns=["column-name"]) + result = df.query("`column-name` < 'It`s that\\'s \"quote\" #hash'") + expected = df[df["column-name"] < 'It`s that\'s "quote" #hash'] + tm.assert_frame_equal(result, expected) + + def test_expr_with_quote_opened_before_backtick_and_quote_is_matched_in_mid(self): + # GH 59285 + df = DataFrame(("aaa", "vvv", "zzz"), columns=["column-name"]) + result = df.query("'It`s that\\'s \"quote\" #hash' < `column-name`") + expected = df['It`s that\'s "quote" #hash' < df["column-name"]] + tm.assert_frame_equal(result, expected) def test_call_non_named_expression(self, 
df): """ @@ -1386,12 +1577,12 @@ def test_query_ea_dtypes(self, dtype): if dtype == "int64[pyarrow]": pytest.importorskip("pyarrow") # GH#50261 - df = DataFrame({"a": Series([1, 2], dtype=dtype)}) + df = DataFrame({"a": [1, 2]}, dtype=dtype) ref = {2} # noqa: F841 warning = RuntimeWarning if dtype == "Int64" and NUMEXPR_INSTALLED else None with tm.assert_produces_warning(warning): result = df.query("a in @ref") - expected = DataFrame({"a": Series([2], dtype=dtype, index=[1])}) + expected = DataFrame({"a": [2]}, index=range(1, 2), dtype=dtype) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("engine", ["python", "numexpr"]) @@ -1410,8 +1601,16 @@ def test_query_ea_equality_comparison(self, dtype, engine): result = df.query("A == B", engine=engine) expected = DataFrame( { - "A": Series([1, 2], dtype="Int64", index=[0, 2]), - "B": Series([1, 2], dtype=dtype, index=[0, 2]), + "A": Series([1, 2], dtype="Int64", index=range(0, 4, 2)), + "B": Series([1, 2], dtype=dtype, index=range(0, 4, 2)), } ) tm.assert_frame_equal(result, expected) + + def test_all_nat_in_object(self): + # GH#57068 + now = pd.Timestamp.now("UTC") # noqa: F841 + df = DataFrame({"a": pd.to_datetime([None, None], utc=True)}, dtype=object) + result = df.query("a > @now") + expected = DataFrame({"a": []}, dtype=object) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 66145c32c18d7..127f0fc50a747 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype - from pandas.compat import ( IS64, is_platform_windows, @@ -245,17 +243,11 @@ class TestDataFrameAnalytics: pytest.param("kurt", marks=td.skip_if_no("scipy")), ], ) - def test_stat_op_api_float_string_frame( - self, float_string_frame, axis, opname, using_infer_string - ): - if ( - (opname in ("sum", "min", "max") and axis == 0) - or opname - in ( - "count", - "nunique", - ) - ) and not (using_infer_string and opname == "sum"): + def test_stat_op_api_float_string_frame(self, float_string_frame, axis, opname): + if (opname in ("sum", "min", "max") and axis == 0) or opname in ( + "count", + "nunique", + ): getattr(float_string_frame, opname)(axis=axis) else: if opname in ["var", "std", "sem", "skew", "kurt"]: @@ -282,10 +274,11 @@ def test_stat_op_api_float_string_frame( msg = "'[><]=' not supported between instances of 'float' and 'str'" elif opname == "median": msg = re.compile( - r"Cannot convert \[.*\] to numeric|does not support", flags=re.S + r"Cannot convert \[.*\] to numeric|does not support|Cannot perform", + flags=re.S, ) if not isinstance(msg, re.Pattern): - msg = msg + "|does not support" + msg = msg + "|does not support|Cannot perform reduction" with pytest.raises(TypeError, match=msg): getattr(float_string_frame, opname)(axis=axis) if opname != "nunique": @@ -447,26 +440,16 @@ def test_mixed_ops(self, op): "could not convert", "can't multiply sequence by non-int", "does not support", + "Cannot perform", ] ) with pytest.raises(TypeError, match=msg): getattr(df, op)() with pd.option_context("use_bottleneck", False): - msg = "|".join( - [ - "Could not convert", - "could not convert", - "can't multiply sequence by non-int", - "does not support", - ] - ) with pytest.raises(TypeError, match=msg): getattr(df, op)() - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="sum doesn't work for arrow strings" - ) def 
test_reduce_mixed_frame(self): # GH 6806 df = DataFrame( @@ -490,10 +473,8 @@ def test_nunique(self): tm.assert_series_equal( df.nunique(dropna=False), Series({"A": 1, "B": 3, "C": 3}) ) - tm.assert_series_equal(df.nunique(axis=1), Series({0: 1, 1: 2, 2: 2})) - tm.assert_series_equal( - df.nunique(axis=1, dropna=False), Series({0: 1, 1: 3, 2: 2}) - ) + tm.assert_series_equal(df.nunique(axis=1), Series([1, 2, 2])) + tm.assert_series_equal(df.nunique(axis=1, dropna=False), Series([1, 3, 2])) @pytest.mark.parametrize("tz", [None, "UTC"]) def test_mean_mixed_datetime_numeric(self, tz): @@ -534,7 +515,7 @@ def test_mean_mixed_string_decimal(self): df = DataFrame(d) with pytest.raises( - TypeError, match="unsupported operand type|does not support" + TypeError, match="unsupported operand type|does not support|Cannot perform" ): df.mean() result = df[["A", "C"]].mean() @@ -629,7 +610,7 @@ def test_sem(self, datetime_frame): "A": [12], "B": [10.0], "C": [np.nan], - "D": np.array([np.nan], dtype=object), + "D": Series([np.nan], dtype="str"), "E": Categorical([np.nan], categories=["a"]), "F": DatetimeIndex([pd.NaT], dtype="M8[ns]"), "G": to_timedelta([pd.NaT]), @@ -671,7 +652,7 @@ def test_mode_dropna(self, dropna, expected): "A": [12, 12, 19, 11], "B": [10, 10, np.nan, 3], "C": [1, np.nan, np.nan, np.nan], - "D": Series([np.nan, np.nan, "a", np.nan], dtype=object), + "D": Series([np.nan, np.nan, "a", np.nan], dtype="str"), "E": Categorical([np.nan, np.nan, "a", np.nan]), "F": DatetimeIndex(["NaT", "2000-01-02", "NaT", "NaT"], dtype="M8[ns]"), "G": to_timedelta(["1 days", "nan", "nan", "nan"]), @@ -691,24 +672,16 @@ def test_mode_dropna(self, dropna, expected): expected = DataFrame(expected) tm.assert_frame_equal(result, expected) - def test_mode_sortwarning(self, using_infer_string): - # Check for the warning that is raised when the mode - # results cannot be sorted - + def test_mode_sort_with_na(self, using_infer_string): df = DataFrame({"A": [np.nan, np.nan, "a", "a"]}) expected = DataFrame({"A": ["a", np.nan]}) - - warning = None if using_infer_string else UserWarning - with tm.assert_produces_warning(warning): - result = df.mode(dropna=False) - result = result.sort_values(by="A").reset_index(drop=True) - + result = df.mode(dropna=False) tm.assert_frame_equal(result, expected) def test_mode_empty_df(self): df = DataFrame([], columns=["a", "b"]) + expected = df.copy() result = df.mode() - expected = DataFrame([], columns=["a", "b"], index=Index([], dtype=np.int64)) tm.assert_frame_equal(result, expected) def test_operators_timedelta64(self): @@ -769,11 +742,11 @@ def test_operators_timedelta64(self): # excludes non-numeric result = mixed.min(axis=1, numeric_only=True) - expected = Series([1, 1, 1.0], index=[0, 1, 2]) + expected = Series([1, 1, 1.0]) tm.assert_series_equal(result, expected) # works when only those columns are selected - result = mixed[["A", "B"]].min(1) + result = mixed[["A", "B"]].min(axis=1) expected = Series([timedelta(days=-1)] * 3) tm.assert_series_equal(result, expected) @@ -817,17 +790,8 @@ def test_std_timedelta64_skipna_false(self): @pytest.mark.parametrize( "values", [["2022-01-01", "2022-01-02", pd.NaT, "2022-01-03"], 4 * [pd.NaT]] ) - def test_std_datetime64_with_nat( - self, values, skipna, using_array_manager, request, unit - ): + def test_std_datetime64_with_nat(self, values, skipna, request, unit): # GH#51335 - if using_array_manager and ( - not skipna or all(value is pd.NaT for value in values) - ): - mark = pytest.mark.xfail( - reason="GH#51446: Incorrect type 
inference on NaT in reduction result" - ) - request.applymarker(mark) dti = to_datetime(values).as_unit(unit) df = DataFrame({"a": dti}) result = df.std(skipna=skipna) @@ -841,8 +805,8 @@ def test_std_datetime64_with_nat( def test_sum_corner(self): empty_frame = DataFrame() - axis0 = empty_frame.sum(0) - axis1 = empty_frame.sum(1) + axis0 = empty_frame.sum(axis=0) + axis1 = empty_frame.sum(axis=1) assert isinstance(axis0, Series) assert isinstance(axis1, Series) assert len(axis0) == 0 @@ -871,6 +835,16 @@ def test_axis_1_empty(self, all_reductions, index): expected = Series([], index=index, dtype=expected_dtype) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("min_count", [0, 1]) + def test_axis_1_sum_na(self, string_dtype_no_object, skipna, min_count): + # https://github.com/pandas-dev/pandas/issues/60229 + dtype = string_dtype_no_object + df = DataFrame({"a": [pd.NA]}, dtype=dtype) + result = df.sum(axis=1, skipna=skipna, min_count=min_count) + value = "" if skipna and min_count == 0 else pd.NA + expected = Series([value], dtype=dtype) + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("method, unit", [("sum", 0), ("prod", 1)]) @pytest.mark.parametrize("numeric_only", [None, True, False]) def test_sum_prod_nanops(self, method, unit, numeric_only): @@ -976,20 +950,20 @@ def test_sum_object(self, float_frame): def test_sum_bool(self, float_frame): # ensure this works, bug report bools = np.isnan(float_frame) - bools.sum(1) - bools.sum(0) + bools.sum(axis=1) + bools.sum(axis=0) def test_sum_mixed_datetime(self): # GH#30886 df = DataFrame({"A": date_range("2000", periods=4), "B": [1, 2, 3, 4]}).reindex( [2, 3, 4] ) - with pytest.raises(TypeError, match="does not support reduction 'sum'"): + with pytest.raises(TypeError, match="does not support operation 'sum'"): df.sum() def test_mean_corner(self, float_frame, float_string_frame): # unit test when have object data - msg = "Could not convert|does not support" + msg = "Could not convert|does not support|Cannot perform" with pytest.raises(TypeError, match=msg): float_string_frame.mean(axis=0) @@ -999,7 +973,7 @@ def test_mean_corner(self, float_frame, float_string_frame): # take mean of boolean column float_frame["bool"] = float_frame["A"] > 0 - means = float_frame.mean(0) + means = float_frame.mean(axis=0) assert means["bool"] == float_frame["bool"].values.mean() def test_mean_datetimelike(self): @@ -1052,13 +1026,13 @@ def test_mean_extensionarray_numeric_only_true(self): def test_stats_mixed_type(self, float_string_frame): with pytest.raises(TypeError, match="could not convert"): - float_string_frame.std(1) + float_string_frame.std(axis=1) with pytest.raises(TypeError, match="could not convert"): - float_string_frame.var(1) + float_string_frame.var(axis=1) with pytest.raises(TypeError, match="unsupported operand type"): - float_string_frame.mean(1) + float_string_frame.mean(axis=1) with pytest.raises(TypeError, match="could not convert"): - float_string_frame.skew(1) + float_string_frame.skew(axis=1) def test_sum_bools(self): df = DataFrame(index=range(1), columns=range(10)) @@ -1068,25 +1042,26 @@ def test_sum_bools(self): # ---------------------------------------------------------------------- # Index of max / min - @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("axis", [0, 1]) def test_idxmin(self, float_frame, int_frame, skipna, axis): frame = float_frame frame.iloc[5:10] = np.nan frame.iloc[15:20, -2:] = np.nan for df in [frame, int_frame]: - warn = None - if skipna is 
False or axis == 1: - warn = None if df is int_frame else FutureWarning - msg = "The behavior of DataFrame.idxmin with all-NA values" - with tm.assert_produces_warning(warn, match=msg): + if (not skipna or axis == 1) and df is not int_frame: + if skipna: + msg = "Encountered all NA values" + else: + msg = "Encountered an NA value" + with pytest.raises(ValueError, match=msg): + df.idxmin(axis=axis, skipna=skipna) + with pytest.raises(ValueError, match=msg): + df.idxmin(axis=axis, skipna=skipna) + else: result = df.idxmin(axis=axis, skipna=skipna) - - msg2 = "The behavior of Series.idxmin" - with tm.assert_produces_warning(warn, match=msg2): expected = df.apply(Series.idxmin, axis=axis, skipna=skipna) - expected = expected.astype(df.index.dtype) - tm.assert_series_equal(result, expected) + expected = expected.astype(df.index.dtype) + tm.assert_series_equal(result, expected) @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") @@ -1117,23 +1092,23 @@ def test_idxmin_axis_2(self, float_frame): with pytest.raises(ValueError, match=msg): frame.idxmin(axis=2) - @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("axis", [0, 1]) def test_idxmax(self, float_frame, int_frame, skipna, axis): frame = float_frame frame.iloc[5:10] = np.nan frame.iloc[15:20, -2:] = np.nan for df in [frame, int_frame]: - warn = None - if skipna is False or axis == 1: - warn = None if df is int_frame else FutureWarning - msg = "The behavior of DataFrame.idxmax with all-NA values" - with tm.assert_produces_warning(warn, match=msg): - result = df.idxmax(axis=axis, skipna=skipna) - - msg2 = "The behavior of Series.idxmax" - with tm.assert_produces_warning(warn, match=msg2): - expected = df.apply(Series.idxmax, axis=axis, skipna=skipna) + if (skipna is False or axis == 1) and df is frame: + if skipna: + msg = "Encountered all NA values" + else: + msg = "Encountered an NA value" + with pytest.raises(ValueError, match=msg): + df.idxmax(axis=axis, skipna=skipna) + return + + result = df.idxmax(axis=axis, skipna=skipna) + expected = df.apply(Series.idxmax, axis=axis, skipna=skipna) expected = expected.astype(df.index.dtype) tm.assert_series_equal(result, expected) @@ -1191,27 +1166,24 @@ def test_idxmax_axis_2(self, float_frame): def test_idxmax_mixed_dtype(self): # don't cast to object, which would raise in nanops dti = date_range("2016-01-01", periods=3) - - # Copying dti is needed for ArrayManager otherwise when we set - # df.loc[0, 3] = pd.NaT below it edits dti - df = DataFrame({1: [0, 2, 1], 2: range(3)[::-1], 3: dti.copy(deep=True)}) + df = DataFrame({1: [0, 2, 1], 2: range(3)[::-1], 3: dti}) result = df.idxmax() - expected = Series([1, 0, 2], index=[1, 2, 3]) + expected = Series([1, 0, 2], index=range(1, 4)) tm.assert_series_equal(result, expected) result = df.idxmin() - expected = Series([0, 2, 0], index=[1, 2, 3]) + expected = Series([0, 2, 0], index=range(1, 4)) tm.assert_series_equal(result, expected) # with NaTs df.loc[0, 3] = pd.NaT result = df.idxmax() - expected = Series([1, 0, 2], index=[1, 2, 3]) + expected = Series([1, 0, 2], index=range(1, 4)) tm.assert_series_equal(result, expected) result = df.idxmin() - expected = Series([0, 2, 1], index=[1, 2, 3]) + expected = Series([0, 2, 1], index=range(1, 4)) tm.assert_series_equal(result, expected) # with multi-column dt64 block @@ -1219,11 +1191,11 @@ def test_idxmax_mixed_dtype(self): df._consolidate_inplace() result = df.idxmax() - expected = Series([1, 0, 2, 0], index=[1, 
2, 3, 4]) + expected = Series([1, 0, 2, 0], index=range(1, 5)) tm.assert_series_equal(result, expected) result = df.idxmin() - expected = Series([0, 2, 1, 2], index=[1, 2, 3, 4]) + expected = Series([0, 2, 1, 2], index=range(1, 5)) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -1266,29 +1238,29 @@ def test_idxmax_dt64_multicolumn_axis1(self): # ---------------------------------------------------------------------- # Logical reductions - @pytest.mark.parametrize("opname", ["any", "all"]) @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("bool_only", [False, True]) - def test_any_all_mixed_float(self, opname, axis, bool_only, float_string_frame): + def test_any_all_mixed_float( + self, all_boolean_reductions, axis, bool_only, float_string_frame + ): # make sure op works on mixed-type frame mixed = float_string_frame mixed["_bool_"] = np.random.default_rng(2).standard_normal(len(mixed)) > 0.5 - getattr(mixed, opname)(axis=axis, bool_only=bool_only) + getattr(mixed, all_boolean_reductions)(axis=axis, bool_only=bool_only) - @pytest.mark.parametrize("opname", ["any", "all"]) @pytest.mark.parametrize("axis", [0, 1]) - def test_any_all_bool_with_na(self, opname, axis, bool_frame_with_na): - getattr(bool_frame_with_na, opname)(axis=axis, bool_only=False) + def test_any_all_bool_with_na( + self, all_boolean_reductions, axis, bool_frame_with_na + ): + getattr(bool_frame_with_na, all_boolean_reductions)(axis=axis, bool_only=False) - @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") - @pytest.mark.parametrize("opname", ["any", "all"]) - def test_any_all_bool_frame(self, opname, bool_frame_with_na): + def test_any_all_bool_frame(self, all_boolean_reductions, bool_frame_with_na): # GH#12863: numpy gives back non-boolean data for object type # so fill NaNs to compare with pandas behavior frame = bool_frame_with_na.fillna(True) - alternative = getattr(np, opname) - f = getattr(frame, opname) + alternative = getattr(np, all_boolean_reductions) + f = getattr(frame, all_boolean_reductions) def skipna_wrapper(x): nona = x.dropna().values @@ -1317,9 +1289,9 @@ def wrapper(x): # all NA case all_na = frame * np.nan - r0 = getattr(all_na, opname)(axis=0) - r1 = getattr(all_na, opname)(axis=1) - if opname == "any": + r0 = getattr(all_na, all_boolean_reductions)(axis=0) + r1 = getattr(all_na, all_boolean_reductions)(axis=1) + if all_boolean_reductions == "any": assert not r0.any() assert not r1.any() else: @@ -1342,11 +1314,11 @@ def test_any_all_extra(self): result = df[["A", "B"]].any(axis=1, bool_only=True) tm.assert_series_equal(result, expected) - result = df.all(1) + result = df.all(axis=1) expected = Series([True, False, False], index=["a", "b", "c"]) tm.assert_series_equal(result, expected) - result = df.all(1, bool_only=True) + result = df.all(axis=1, bool_only=True) tm.assert_series_equal(result, expected) # Axis is None @@ -1360,11 +1332,7 @@ def test_any_all_extra(self): assert result is True @pytest.mark.parametrize("axis", [0, 1]) - @pytest.mark.parametrize("bool_agg_func", ["any", "all"]) - @pytest.mark.parametrize("skipna", [True, False]) - def test_any_all_object_dtype( - self, axis, bool_agg_func, skipna, using_infer_string - ): + def test_any_all_object_dtype(self, axis, all_boolean_reductions, skipna): # GH#35450 df = DataFrame( data=[ @@ -1374,19 +1342,10 @@ def test_any_all_object_dtype( [np.nan, np.nan, "5", np.nan], ] ) - if using_infer_string: - # na in object is True while in string pyarrow numpy it's false - val = 
not axis == 0 and not skipna and bool_agg_func == "all" - else: - val = True - result = getattr(df, bool_agg_func)(axis=axis, skipna=skipna) - expected = Series([True, True, val, True]) + result = getattr(df, all_boolean_reductions)(axis=axis, skipna=skipna) + expected = Series([True, True, True, True]) tm.assert_series_equal(result, expected) - # GH#50947 deprecates this but it is not emitting a warning in some builds. - @pytest.mark.filterwarnings( - "ignore:'any' with datetime64 dtypes is deprecated.*:FutureWarning" - ) def test_any_datetime(self): # GH 23070 float_data = [1, np.nan, 3, np.nan] @@ -1398,10 +1357,9 @@ def test_any_datetime(self): ] df = DataFrame({"A": float_data, "B": datetime_data}) - result = df.any(axis=1) - - expected = Series([True, True, True, False]) - tm.assert_series_equal(result, expected) + msg = "datetime64 type does not support operation 'any'" + with pytest.raises(TypeError, match=msg): + df.any(axis=1) def test_any_all_bool_only(self): # GH 25101 @@ -1484,32 +1442,32 @@ def test_any_all_np_func(self, func, data, expected): if any(isinstance(x, CategoricalDtype) for x in data.dtypes): with pytest.raises( - TypeError, match="dtype category does not support reduction" + TypeError, match=".* dtype category does not support operation" ): func(data) # method version with pytest.raises( - TypeError, match="dtype category does not support reduction" + TypeError, match=".* dtype category does not support operation" ): getattr(DataFrame(data), func.__name__)(axis=None) - else: - msg = "'(any|all)' with datetime64 dtypes is deprecated" - if data.dtypes.apply(lambda x: x.kind == "M").any(): - warn = FutureWarning - else: - warn = None + if data.dtypes.apply(lambda x: x.kind == "M").any(): + # GH#34479 + msg = "datetime64 type does not support operation '(any|all)'" + with pytest.raises(TypeError, match=msg): + func(data) + + # method version + with pytest.raises(TypeError, match=msg): + getattr(DataFrame(data), func.__name__)(axis=None) - with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): - # GH#34479 - result = func(data) + elif data.dtypes.apply(lambda x: x != "category").any(): + result = func(data) assert isinstance(result, np.bool_) assert result.item() is expected # method version - with tm.assert_produces_warning(warn, match=msg): - # GH#34479 - result = getattr(DataFrame(data), func.__name__)(axis=None) + result = getattr(DataFrame(data), func.__name__)(axis=None) assert isinstance(result, np.bool_) assert result.item() is expected @@ -1596,6 +1554,44 @@ def test_min_max_dt64_with_NaT(self): exp = Series([pd.NaT], index=["foo"]) tm.assert_series_equal(res, exp) + def test_min_max_dt64_with_NaT_precision(self): + # GH#60646 Make sure the reduction doesn't cast input timestamps to + # float and lose precision. + df = DataFrame( + {"foo": [pd.NaT, pd.NaT, Timestamp("2012-05-01 09:20:00.123456789")]}, + dtype="datetime64[ns]", + ) + + res = df.min(axis=1) + exp = df.foo.rename(None) + tm.assert_series_equal(res, exp) + + res = df.max(axis=1) + exp = df.foo.rename(None) + tm.assert_series_equal(res, exp) + + def test_min_max_td64_with_NaT_precision(self): + # GH#60646 Make sure the reduction doesn't cast input timedeltas to + # float and lose precision. 
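+ # (float64 has a 53-bit significand, so nanosecond counts above
+ # 2**53, roughly 104 days, cannot round-trip through float exactly;
+ # the 10000-day value below would lose its trailing nanoseconds
+ # under such a cast.)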
+ df = DataFrame( + { + "foo": [ + pd.NaT, + pd.NaT, + to_timedelta("10000 days 06:05:01.123456789"), + ], + }, + dtype="timedelta64[ns]", + ) + + res = df.min(axis=1) + exp = df.foo.rename(None) + tm.assert_series_equal(res, exp) + + res = df.max(axis=1) + exp = df.foo.rename(None) + tm.assert_series_equal(res, exp) + def test_min_max_dt64_with_NaT_skipna_false(self, request, tz_naive_fixture): # GH#36907 tz = tz_naive_fixture @@ -1710,7 +1706,6 @@ def test_reductions_skipna_none_raises( with pytest.raises(ValueError, match=msg): getattr(obj, all_reductions)(skipna=None) - @td.skip_array_manager_invalid_test def test_reduction_timestamp_smallest_unit(self): # GH#52524 df = DataFrame( @@ -1729,7 +1724,6 @@ def test_reduction_timestamp_smallest_unit(self): ) tm.assert_series_equal(result, expected) - @td.skip_array_manager_not_yet_implemented def test_reduction_timedelta_smallest_unit(self): # GH#52524 df = DataFrame( @@ -1748,27 +1742,26 @@ def test_reduction_timedelta_smallest_unit(self): class TestNuisanceColumns: - @pytest.mark.parametrize("method", ["any", "all"]) - def test_any_all_categorical_dtype_nuisance_column(self, method): + def test_any_all_categorical_dtype_nuisance_column(self, all_boolean_reductions): # GH#36076 DataFrame should match Series behavior ser = Series([0, 1], dtype="category", name="A") df = ser.to_frame() # Double-check the Series behavior is to raise - with pytest.raises(TypeError, match="does not support reduction"): - getattr(ser, method)() + with pytest.raises(TypeError, match="does not support operation"): + getattr(ser, all_boolean_reductions)() - with pytest.raises(TypeError, match="does not support reduction"): - getattr(np, method)(ser) + with pytest.raises(TypeError, match="does not support operation"): + getattr(np, all_boolean_reductions)(ser) - with pytest.raises(TypeError, match="does not support reduction"): - getattr(df, method)(bool_only=False) + with pytest.raises(TypeError, match="does not support operation"): + getattr(df, all_boolean_reductions)(bool_only=False) - with pytest.raises(TypeError, match="does not support reduction"): - getattr(df, method)(bool_only=None) + with pytest.raises(TypeError, match="does not support operation"): + getattr(df, all_boolean_reductions)(bool_only=None) - with pytest.raises(TypeError, match="does not support reduction"): - getattr(np, method)(df, axis=0) + with pytest.raises(TypeError, match="does not support operation"): + getattr(np, all_boolean_reductions)(df, axis=0) def test_median_categorical_dtype_nuisance_column(self): # GH#21020 DataFrame.median should match Series.median @@ -1776,22 +1769,22 @@ def test_median_categorical_dtype_nuisance_column(self): ser = df["A"] # Double-check the Series behavior is to raise - with pytest.raises(TypeError, match="does not support reduction"): + with pytest.raises(TypeError, match="does not support operation"): ser.median() - with pytest.raises(TypeError, match="does not support reduction"): + with pytest.raises(TypeError, match="does not support operation"): df.median(numeric_only=False) - with pytest.raises(TypeError, match="does not support reduction"): + with pytest.raises(TypeError, match="does not support operation"): df.median() # same thing, but with an additional non-categorical column df["B"] = df["A"].astype(int) - with pytest.raises(TypeError, match="does not support reduction"): + with pytest.raises(TypeError, match="does not support operation"): df.median(numeric_only=False) - with pytest.raises(TypeError, match="does not support reduction"): + 
with pytest.raises(TypeError, match="does not support operation"): df.median() # TODO: np.median(df, axis=0) gives np.array([2.0, 2.0]) instead @@ -1850,7 +1843,7 @@ def test_df_empty_min_count_0(self, opname, dtype, exp_value, exp_dtype): df = DataFrame({0: [], 1: []}, dtype=dtype) result = getattr(df, opname)(min_count=0) - expected = Series([exp_value, exp_value], dtype=exp_dtype) + expected = Series([exp_value, exp_value], dtype=exp_dtype, index=range(2)) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -1873,7 +1866,7 @@ def test_df_empty_min_count_1(self, opname, dtype, exp_dtype): df = DataFrame({0: [], 1: []}, dtype=dtype) result = getattr(df, opname)(min_count=1) - expected = Series([np.nan, np.nan], dtype=exp_dtype) + expected = Series([np.nan, np.nan], dtype=exp_dtype, index=Index([0, 1])) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -1881,7 +1874,6 @@ def test_df_empty_min_count_1(self, opname, dtype, exp_dtype): [ ("sum", "Int8", 0, ("Int32" if is_windows_np2_or_is32 else "Int64")), ("prod", "Int8", 1, ("Int32" if is_windows_np2_or_is32 else "Int64")), - ("prod", "Int8", 1, ("Int32" if is_windows_np2_or_is32 else "Int64")), ("sum", "Int64", 0, "Int64"), ("prod", "Int64", 1, "Int64"), ("sum", "UInt8", 0, ("UInt32" if is_windows_np2_or_is32 else "UInt64")), @@ -1897,7 +1889,7 @@ def test_df_empty_nullable_min_count_0(self, opname, dtype, exp_value, exp_dtype df = DataFrame({0: [], 1: []}, dtype=dtype) result = getattr(df, opname)(min_count=0) - expected = Series([exp_value, exp_value], dtype=exp_dtype) + expected = Series([exp_value, exp_value], dtype=exp_dtype, index=Index([0, 1])) tm.assert_series_equal(result, expected) # TODO: why does min_count=1 impact the resulting Windows dtype @@ -1922,18 +1914,12 @@ def test_df_empty_nullable_min_count_1(self, opname, dtype, exp_dtype): df = DataFrame({0: [], 1: []}, dtype=dtype) result = getattr(df, opname)(min_count=1) - expected = Series([pd.NA, pd.NA], dtype=exp_dtype) + expected = Series([pd.NA, pd.NA], dtype=exp_dtype, index=Index([0, 1])) tm.assert_series_equal(result, expected) -def test_sum_timedelta64_skipna_false(using_array_manager, request): +def test_sum_timedelta64_skipna_false(): # GH#17235 - if using_array_manager: - mark = pytest.mark.xfail( - reason="Incorrect type inference on NaT in reduction result" - ) - request.applymarker(mark) - arr = np.arange(8).astype(np.int64).view("m8[s]").reshape(4, 2) arr[-1, -1] = "Nat" @@ -1960,9 +1946,6 @@ def test_sum_timedelta64_skipna_false(using_array_manager, request): tm.assert_series_equal(result, expected) -@pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="sum doesn't work with arrow strings" -) def test_mixed_frame_with_integer_sum(): # https://github.com/pandas-dev/pandas/issues/34520 df = DataFrame([["a", 1]], columns=list("ab")) @@ -1992,7 +1975,7 @@ def test_minmax_extensionarray(method, numeric_only): def test_frame_mixed_numeric_object_with_timestamp(ts_value): # GH 13912 df = DataFrame({"a": [1], "b": [1.1], "c": ["foo"], "d": [ts_value]}) - with pytest.raises(TypeError, match="does not support reduction"): + with pytest.raises(TypeError, match="does not support operation|Cannot perform"): df.sum() @@ -2143,15 +2126,16 @@ def test_numeric_ea_axis_1(method, skipna, min_count, any_numeric_ea_dtype): if method in ("prod", "product", "sum"): kwargs["min_count"] = min_count - warn = None - msg = None if not skipna and method in ("idxmax", "idxmin"): - warn = FutureWarning + # GH#57745 - EAs use groupby for axis=1 
which still needs a proper deprecation. msg = f"The behavior of DataFrame.{method} with all-NA values" - with tm.assert_produces_warning(warn, match=msg): - result = getattr(df, method)(axis=1, **kwargs) - with tm.assert_produces_warning(warn, match=msg): - expected = getattr(expected_df, method)(axis=1, **kwargs) + with tm.assert_produces_warning(FutureWarning, match=msg): + getattr(df, method)(axis=1, **kwargs) + with pytest.raises(ValueError, match="Encountered an NA value"): + getattr(expected_df, method)(axis=1, **kwargs) + return + result = getattr(df, method)(axis=1, **kwargs) + expected = getattr(expected_df, method)(axis=1, **kwargs) if method not in ("idxmax", "idxmin"): expected = expected.astype(expected_dtype) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/test_repr.py b/pandas/tests/frame/test_repr.py index 776007fb9691d..73628424725e5 100644 --- a/pandas/tests/frame/test_repr.py +++ b/pandas/tests/frame/test_repr.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype - from pandas import ( NA, Categorical, @@ -38,10 +36,10 @@ def test_repr_should_return_str(self): index1 = ["\u03c3", "\u03c4", "\u03c5", "\u03c6"] cols = ["\u03c8"] df = DataFrame(data, columns=cols, index=index1) - assert type(df.__repr__()) is str # noqa: E721 + assert type(df.__repr__()) is str ser = df[cols[0]] - assert type(ser.__repr__()) is str # noqa: E721 + assert type(ser.__repr__()) is str def test_repr_bytes_61_lines(self): # GH#12857 @@ -176,7 +174,6 @@ def test_repr_mixed_big(self): repr(biggie) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="/r in") def test_repr(self): # columns but no index no_index = DataFrame(columns=[0, 1, 3]) @@ -425,38 +422,18 @@ def test_to_records_with_na_record(self): result = repr(df) assert result == expected - def test_to_records_with_inf_as_na_record(self): - # GH 48526 - expected = """ NaN inf record -0 inf b [0, inf, b] -1 NaN NaN [1, nan, nan] -2 e f [2, e, f]""" - msg = "use_inf_as_na option is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - with option_context("use_inf_as_na", True): - df = DataFrame( - [[np.inf, "b"], [np.nan, np.nan], ["e", "f"]], - columns=[np.nan, np.inf], - ) - df["record"] = df[[np.nan, np.inf]].to_records() - result = repr(df) - assert result == expected - def test_to_records_with_inf_record(self): # GH 48526 expected = """ NaN inf record 0 inf b [0, inf, b] 1 NaN NaN [1, nan, nan] 2 e f [2, e, f]""" - msg = "use_inf_as_na option is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - with option_context("use_inf_as_na", False): - df = DataFrame( - [[np.inf, "b"], [np.nan, np.nan], ["e", "f"]], - columns=[np.nan, np.inf], - ) - df["record"] = df[[np.nan, np.inf]].to_records() - result = repr(df) + df = DataFrame( + [[np.inf, "b"], [np.nan, np.nan], ["e", "f"]], + columns=[np.nan, np.inf], + ) + df["record"] = df[[np.nan, np.inf]].to_records() + result = repr(df) assert result == expected def test_masked_ea_with_formatter(self): @@ -518,4 +495,4 @@ def test_repr_with_complex_nans(data, output, as_frame): else: reprs = [f"{i} {val}" for i, val in enumerate(output)] expected = "\n".join(reprs) + "\ndtype: complex128" - assert str(obj) == expected, f"\n{str(obj)}\n\n{expected}" + assert str(obj) == expected, f"\n{obj!s}\n\n{expected}" diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 6e1e743eb60de..22fdfd3a01408 100644 --- 
a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -6,7 +6,6 @@ import pytest from pandas._libs import lib -from pandas.errors import PerformanceWarning import pandas as pd from pandas import ( @@ -28,6 +27,9 @@ def future_stack(request): class TestDataFrameReshape: + @pytest.mark.filterwarnings( + "ignore:The previous implementation of stack is deprecated" + ) def test_stack_unstack(self, float_frame, future_stack): df = float_frame.copy() df[:] = np.arange(np.prod(df.shape)).reshape(df.shape) @@ -72,13 +74,12 @@ def test_stack_mixed_level(self, future_stack): expected = expected[["a", "b"]] tm.assert_frame_equal(result, expected) - def test_unstack_not_consolidated(self, using_array_manager): + def test_unstack_not_consolidated(self): # Gh#34708 df = DataFrame({"x": [1, 2, np.nan], "y": [3.0, 4, np.nan]}) df2 = df[["x"]] df2["y"] = df["y"] - if not using_array_manager: - assert len(df2._mgr.blocks) == 2 + assert len(df2._mgr.blocks) == 2 res = df2.unstack() expected = df.unstack() @@ -655,7 +656,11 @@ def test_unstack_dtypes(self, using_infer_string): df2["D"] = "foo" df3 = df2.unstack("B") result = df3.dtypes - dtype = "string" if using_infer_string else np.dtype("object") + dtype = ( + pd.StringDtype(na_value=np.nan) + if using_infer_string + else np.dtype("object") + ) expected = Series( [np.dtype("float64")] * 2 + [dtype] * 2, index=MultiIndex.from_arrays( @@ -713,13 +718,13 @@ def test_unstack_unused_levels(self): df = DataFrame([[1, 0]] * 3, index=idx) result = df.unstack() - exp_col = MultiIndex.from_product([[0, 1], ["A", "B", "C"]]) + exp_col = MultiIndex.from_product([range(2), ["A", "B", "C"]]) expected = DataFrame([[1, 1, 1, 0, 0, 0]], index=["a"], columns=exp_col) tm.assert_frame_equal(result, expected) assert (result.columns.levels[1] == idx.levels[1]).all() # Unused items on both levels - levels = [[0, 1, 7], [0, 1, 2, 3]] + levels = [range(3), range(4)] codes = [[0, 0, 1, 1], [0, 2, 0, 2]] idx = MultiIndex(levels, codes) block = np.arange(4).reshape(2, 2) @@ -751,7 +756,7 @@ def test_unstack_unused_levels_mixed_with_nan( result = df.unstack(level=level) exp_data = np.zeros(18) * np.nan exp_data[idces] = data - cols = MultiIndex.from_product([[0, 1], col_level]) + cols = MultiIndex.from_product([range(2), col_level]) expected = DataFrame(exp_data.reshape(3, 6), index=idx_level, columns=cols) tm.assert_frame_equal(result, expected) @@ -969,7 +974,7 @@ def test_unstack_nan_index2(self): right = DataFrame(vals, columns=cols, index=idx) tm.assert_frame_equal(left, right) - def test_unstack_nan_index3(self, using_array_manager): + def test_unstack_nan_index3(self): # GH7401 df = DataFrame( { @@ -991,10 +996,6 @@ def test_unstack_nan_index3(self, using_array_manager): ) right = DataFrame(vals, columns=cols, index=idx) - if using_array_manager: - # INFO(ArrayManager) with ArrayManager preserve dtype where possible - cols = right.columns[[1, 2, 3, 5]] - right[cols] = right[cols].astype(df["C"].dtype) tm.assert_frame_equal(left, right) def test_unstack_nan_index4(self): @@ -1070,7 +1071,7 @@ def test_stack_datetime_column_multiIndex(self, future_stack): with tm.assert_produces_warning(warn, match=msg): result = df.stack(future_stack=future_stack) - eidx = MultiIndex.from_product([(0, 1, 2, 3), ("B",)]) + eidx = MultiIndex.from_product([range(4), ("B",)]) ecols = MultiIndex.from_tuples([(t, "A")]) expected = DataFrame([1, 2, 3, 4], index=eidx, columns=ecols) tm.assert_frame_equal(result, expected) @@ -1153,7 +1154,7 @@ def 
test_stack_full_multiIndex(self, future_stack): expected = DataFrame( [[0, 2], [1, np.nan], [3, 5], [4, np.nan]], index=MultiIndex( - levels=[[0, 1], ["u", "x", "y", "z"]], + levels=[range(2), ["u", "x", "y", "z"]], codes=[[0, 0, 1, 1], [1, 3, 1, 3]], names=[None, "Lower"], ), @@ -1162,6 +1163,9 @@ def test_stack_full_multiIndex(self, future_stack): expected["B"] = expected["B"].astype(df.dtypes.iloc[0]) tm.assert_frame_equal(result, expected) + @pytest.mark.filterwarnings( + "ignore:The previous implementation of stack is deprecated" + ) @pytest.mark.parametrize("ordered", [False, True]) def test_stack_preserve_categorical_dtype(self, ordered, future_stack): # GH13854 @@ -1201,17 +1205,20 @@ def test_stack_multi_preserve_categorical_dtype( s_cidx = pd.CategoricalIndex(labels, ordered=ordered) expected_data = sorted(data) if future_stack else data expected = Series( - expected_data, index=MultiIndex.from_product([[0], s_cidx, cidx2]) + expected_data, index=MultiIndex.from_product([range(1), s_cidx, cidx2]) ) tm.assert_series_equal(result, expected) + @pytest.mark.filterwarnings( + "ignore:The previous implementation of stack is deprecated" + ) def test_stack_preserve_categorical_dtype_values(self, future_stack): # GH-23077 cat = pd.Categorical(["a", "a", "b", "c"]) df = DataFrame({"A": cat, "B": cat}) result = df.stack(future_stack=future_stack) - index = MultiIndex.from_product([[0, 1, 2, 3], ["A", "B"]]) + index = MultiIndex.from_product([range(4), ["A", "B"]]) expected = Series( pd.Categorical(["a", "a", "a", "a", "b", "b", "c", "c"]), index=index ) @@ -1220,18 +1227,17 @@ def test_stack_preserve_categorical_dtype_values(self, future_stack): @pytest.mark.filterwarnings( "ignore:The previous implementation of stack is deprecated" ) - @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") @pytest.mark.parametrize( - "index, columns", + "index", [ - ([0, 0, 1, 1], MultiIndex.from_product([[1, 2], ["a", "b"]])), - ([0, 0, 2, 3], MultiIndex.from_product([[1, 2], ["a", "b"]])), - ([0, 1, 2, 3], MultiIndex.from_product([[1, 2], ["a", "b"]])), + [0, 0, 1, 1], + [0, 0, 2, 3], + [0, 1, 2, 3], ], ) - def test_stack_multi_columns_non_unique_index(self, index, columns, future_stack): + def test_stack_multi_columns_non_unique_index(self, index, future_stack): # GH-28301 - + columns = MultiIndex.from_product([[1, 2], ["a", "b"]]) df = DataFrame(index=index, columns=columns).fillna(1) stacked = df.stack(future_stack=future_stack) new_index = MultiIndex.from_tuples(stacked.index.to_numpy()) @@ -1296,7 +1302,7 @@ def test_unstack_mixed_extension_types(self, level): @pytest.mark.parametrize("level", [0, "baz"]) def test_unstack_swaplevel_sortlevel(self, level): # GH 20994 - mi = MultiIndex.from_product([[0], ["d", "c"]], names=["bar", "baz"]) + mi = MultiIndex.from_product([range(1), ["d", "c"]], names=["bar", "baz"]) df = DataFrame([[0, 2], [1, 3]], index=mi, columns=["B", "A"]) df.columns.name = "foo" @@ -1319,10 +1325,27 @@ def test_unstack_sort_false(frame_or_series, dtype): [("two", "z", "b"), ("two", "y", "a"), ("one", "z", "b"), ("one", "y", "a")] ) obj = frame_or_series(np.arange(1.0, 5.0), index=index, dtype=dtype) + + result = obj.unstack(level=0, sort=False) + + if frame_or_series is DataFrame: + expected_columns = MultiIndex.from_tuples([(0, "two"), (0, "one")]) + else: + expected_columns = ["two", "one"] + expected = DataFrame( + [[1.0, 3.0], [2.0, 4.0]], + index=MultiIndex.from_tuples([("z", "b"), ("y", "a")]), + columns=expected_columns, + dtype=dtype, + ) + 
tm.assert_frame_equal(result, expected) + result = obj.unstack(level=-1, sort=False) if frame_or_series is DataFrame: - expected_columns = MultiIndex.from_tuples([(0, "b"), (0, "a")]) + expected_columns = MultiIndex( + levels=[range(1), ["b", "a"]], codes=[[0, 0], [0, 1]] + ) else: expected_columns = ["b", "a"] expected = DataFrame( @@ -1338,7 +1361,9 @@ def test_unstack_sort_false(frame_or_series, dtype): result = obj.unstack(level=[1, 2], sort=False) if frame_or_series is DataFrame: - expected_columns = MultiIndex.from_tuples([(0, "z", "b"), (0, "y", "a")]) + expected_columns = MultiIndex( + levels=[range(1), ["z", "y"], ["b", "a"]], codes=[[0, 0], [0, 1], [0, 1]] + ) else: expected_columns = MultiIndex.from_tuples([("z", "b"), ("y", "a")]) expected = DataFrame( @@ -1398,6 +1423,7 @@ def test_unstack_timezone_aware_values(): tm.assert_frame_equal(result, expected) +@pytest.mark.filterwarnings("ignore:The previous implementation of stack is deprecated") def test_stack_timezone_aware_values(future_stack): # GH 19420 ts = date_range(freq="D", start="20180101", end="20180103", tz="America/New_York") @@ -1414,7 +1440,7 @@ def test_stack_timezone_aware_values(future_stack): @pytest.mark.parametrize("dropna", [True, False, lib.no_default]) def test_stack_empty_frame(dropna, future_stack): # GH 36113 - levels = [np.array([], dtype=np.int64), np.array([], dtype=np.int64)] + levels = [pd.RangeIndex(0), pd.RangeIndex(0)] expected = Series(dtype=np.float64, index=MultiIndex(levels=levels, codes=[[], []])) if future_stack and dropna is not lib.no_default: with pytest.raises(ValueError, match="dropna must be unspecified"): @@ -1426,6 +1452,25 @@ def test_stack_empty_frame(dropna, future_stack): tm.assert_series_equal(result, expected) +@pytest.mark.filterwarnings("ignore:The previous implementation of stack is deprecated") +@pytest.mark.parametrize("dropna", [True, False, lib.no_default]) +def test_stack_empty_level(dropna, future_stack, int_frame): + # GH 60740 + if future_stack and dropna is not lib.no_default: + with pytest.raises(ValueError, match="dropna must be unspecified"): + DataFrame(dtype=np.int64).stack(dropna=dropna, future_stack=future_stack) + else: + expected = int_frame + result = int_frame.copy().stack( + level=[], dropna=dropna, future_stack=future_stack + ) + tm.assert_frame_equal(result, expected) + + expected = DataFrame() + result = DataFrame().stack(level=[], dropna=dropna, future_stack=future_stack) + tm.assert_frame_equal(result, expected) + + @pytest.mark.filterwarnings("ignore:The previous implementation of stack is deprecated") @pytest.mark.parametrize("dropna", [True, False, lib.no_default]) @pytest.mark.parametrize("fill_value", [None, 0]) @@ -1492,13 +1537,15 @@ def test_stack_positional_level_duplicate_column_names(future_stack): result = df.stack(0, future_stack=future_stack) new_columns = Index(["y", "z"], name="a") - new_index = MultiIndex.from_tuples([(0, "x"), (0, "y")], names=[None, "a"]) + new_index = MultiIndex( + levels=[range(1), ["x", "y"]], codes=[[0, 0], [0, 1]], names=[None, "a"] + ) expected = DataFrame([[1, 1], [1, 1]], index=new_index, columns=new_columns) tm.assert_frame_equal(result, expected) -def test_unstack_non_slice_like_blocks(using_array_manager): +def test_unstack_non_slice_like_blocks(): # Case where the mgr_locs of a DataFrame's underlying blocks are not slice-like mi = MultiIndex.from_product([range(5), ["A", "B", "C"]]) @@ -1511,8 +1558,7 @@ def test_unstack_non_slice_like_blocks(using_array_manager): }, index=mi, ) - if not 
using_array_manager: - assert any(not x.mgr_locs.is_slice_like for x in df._mgr.blocks) + assert any(not x.mgr_locs.is_slice_like for x in df._mgr.blocks) res = df.unstack() @@ -1725,12 +1771,14 @@ def test_stack(self, multiindex_year_month_day_dataframe_random_data, future_sta expected = ymd.unstack(0).stack(0, future_stack=future_stack) tm.assert_equal(result, expected) + @pytest.mark.filterwarnings( + "ignore:The previous implementation of stack is deprecated" + ) @pytest.mark.parametrize( - "idx, columns, exp_idx", + "idx, exp_idx", [ [ list("abab"), - ["1st", "2nd", "1st"], MultiIndex( levels=[["a", "b"], ["1st", "2nd"]], codes=[np.tile(np.arange(2).repeat(3), 2), np.tile([0, 1, 0], 4)], @@ -1738,7 +1786,6 @@ def test_stack(self, multiindex_year_month_day_dataframe_random_data, future_sta ], [ MultiIndex.from_tuples((("a", 2), ("b", 1), ("a", 1), ("b", 2))), - ["1st", "2nd", "1st"], MultiIndex( levels=[["a", "b"], [1, 2], ["1st", "2nd"]], codes=[ @@ -1750,12 +1797,12 @@ def test_stack(self, multiindex_year_month_day_dataframe_random_data, future_sta ], ], ) - def test_stack_duplicate_index(self, idx, columns, exp_idx, future_stack): + def test_stack_duplicate_index(self, idx, exp_idx, future_stack): # GH10417 df = DataFrame( np.arange(12).reshape(4, 3), index=idx, - columns=columns, + columns=["1st", "2nd", "1st"], ) if future_stack: msg = "Columns with duplicate values are not supported in stack" @@ -1813,6 +1860,9 @@ def test_stack_mixed_dtype(self, multiindex_dataframe_random_data, future_stack) assert result.name is None assert stacked["bar"].dtype == np.float64 + @pytest.mark.filterwarnings( + "ignore:The previous implementation of stack is deprecated" + ) def test_unstack_bug(self, future_stack): df = DataFrame( { @@ -1824,10 +1874,7 @@ def test_unstack_bug(self, future_stack): } ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby(["state", "exp", "barcode", "v"]).apply(len) - + result = df.groupby(["state", "exp", "barcode", "v"]).apply(len) unstacked = result.unstack() restacked = unstacked.stack(future_stack=future_stack) tm.assert_series_equal(restacked, result.reindex(restacked.index).astype(float)) @@ -1847,6 +1894,9 @@ def test_stack_unstack_preserve_names( restacked = unstacked.stack(future_stack=future_stack) assert restacked.index.names == frame.index.names + @pytest.mark.filterwarnings( + "ignore:The previous implementation of stack is deprecated" + ) @pytest.mark.parametrize("method", ["stack", "unstack"]) def test_stack_unstack_wrong_level_name( self, method, multiindex_dataframe_random_data, future_stack @@ -2075,7 +2125,7 @@ def test_unstack_period_frame(self): @pytest.mark.filterwarnings( "ignore:The previous implementation of stack is deprecated" ) - def test_stack_multiple_bug(self, future_stack): + def test_stack_multiple_bug(self, future_stack, using_infer_string): # bug when some uniques are not present in the data GH#3170 id_col = ([1] * 3) + ([2] * 3) name = (["a"] * 3) + (["b"] * 3) @@ -2087,6 +2137,8 @@ def test_stack_multiple_bug(self, future_stack): multi.columns.name = "Params" unst = multi.unstack("ID") msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): unst.resample("W-THU").mean() down = unst.resample("W-THU").mean(numeric_only=True) @@ -2175,7 +2227,9 @@ def test_unstack_unobserved_keys(self, future_stack): 
tm.assert_frame_equal(recons, df) @pytest.mark.slow - def test_unstack_number_of_levels_larger_than_int32(self, monkeypatch): + def test_unstack_number_of_levels_larger_than_int32( + self, performance_warning, monkeypatch + ): # GH#20601 # GH 26314: Change ValueError to PerformanceWarning @@ -2192,7 +2246,7 @@ def __init__(self, *args, **kwargs) -> None: index=[np.arange(2**16), np.arange(2**16)], ) msg = "The following operation may generate" - with tm.assert_produces_warning(PerformanceWarning, match=msg): + with tm.assert_produces_warning(performance_warning, match=msg): with pytest.raises(Exception, match="Don't compute final result."): df.unstack() @@ -2207,7 +2261,6 @@ def __init__(self, *args, **kwargs) -> None: ), ) @pytest.mark.parametrize("stack_lev", range(2)) - @pytest.mark.parametrize("sort", [True, False]) def test_stack_order_with_unsorted_levels( self, levels, stack_lev, sort, future_stack ): @@ -2293,7 +2346,7 @@ def test_stack_unstack_unordered_multiindex(self, future_stack): ) expected = DataFrame( [["a0", "b0"], ["a1", "b1"], ["a2", "b2"], ["a3", "b3"], ["a4", "b4"]], - index=[0, 1, 2, 3, 4], + index=range(5), columns=MultiIndex.from_tuples( [("a", "x"), ("b", "x")], names=["first", "second"] ), @@ -2317,6 +2370,9 @@ def test_unstack_preserve_types( ) assert unstacked["F", 1].dtype == np.float64 + @pytest.mark.filterwarnings( + "ignore:The previous implementation of stack is deprecated" + ) def test_unstack_group_index_overflow(self, future_stack): codes = np.tile(np.arange(500), 2) level = np.arange(500) @@ -2354,7 +2410,7 @@ def test_unstack_group_index_overflow(self, future_stack): result = s.unstack(4) assert result.shape == (500, 2) - def test_unstack_with_missing_int_cast_to_float(self, using_array_manager): + def test_unstack_with_missing_int_cast_to_float(self): # https://github.com/pandas-dev/pandas/issues/37115 df = DataFrame( { @@ -2366,8 +2422,7 @@ def test_unstack_with_missing_int_cast_to_float(self, using_array_manager): # add another int column to get 2 blocks df["is_"] = 1 - if not using_array_manager: - assert len(df._mgr.blocks) == 2 + assert len(df._mgr.blocks) == 2 result = df.unstack("b") result[("is_", "ca")] = result[("is_", "ca")].fillna(0) @@ -2380,10 +2435,6 @@ def test_unstack_with_missing_int_cast_to_float(self, using_array_manager): names=[None, "b"], ), ) - if using_array_manager: - # INFO(ArrayManager) with ArrayManager preserve dtype where possible - expected[("v", "cb")] = expected[("v", "cb")].astype("int64") - expected[("is_", "cb")] = expected[("is_", "cb")].astype("int64") tm.assert_frame_equal(result, expected) def test_unstack_with_level_has_nan(self): @@ -2497,7 +2548,7 @@ def test_multi_level_stack_categorical(self, future_stack): ] ), ) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_index_type=False) @pytest.mark.filterwarnings( "ignore:The previous implementation of stack is deprecated" @@ -2624,6 +2675,7 @@ def test_unstack_mixed_level_names(self): tm.assert_frame_equal(result, expected) +@pytest.mark.filterwarnings("ignore:The previous implementation of stack is deprecated") def test_stack_tuple_columns(future_stack): # GH#54948 - test stack when the input has a non-MultiIndex with tuples df = DataFrame( @@ -2633,7 +2685,7 @@ def test_stack_tuple_columns(future_stack): expected = Series( [1, 2, 3, 4, 5, 6, 7, 8, 9], index=MultiIndex( - levels=[[0, 1, 2], [("a", 1), ("a", 2), ("b", 1)]], + levels=[range(3), [("a", 1), ("a", 2), ("b", 1)]], codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 
0, 1, 2, 0, 1, 2]], ), ) @@ -2657,7 +2709,7 @@ def test_stack_preserves_na(dtype, na_value, test_multiindex): else: index = Index([na_value], dtype=dtype) df = DataFrame({"a": [1]}, index=index) - result = df.stack(future_stack=True) + result = df.stack() if test_multiindex: expected_index = MultiIndex.from_arrays( diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index ef78ae62cb4d6..cbd563a03b908 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -15,17 +15,18 @@ ) -@pytest.fixture() -def gpd_style_subclass_df(): - class SubclassedDataFrame(DataFrame): - @property - def _constructor(self): - return SubclassedDataFrame - - return SubclassedDataFrame({"a": [1, 2, 3]}) - - class TestDataFrameSubclassing: + def test_no_warning_on_mgr(self): + # GH#57032 + df = tm.SubclassedDataFrame( + {"X": [1, 2, 3], "Y": [1, 2, 3]}, index=["a", "b", "c"] + ) + with tm.assert_produces_warning(None): + # df.isna() goes through _constructor_from_mgr, which we want to + # *not* pass a Manager to __init__ + df.isna() + df["X"].isna() + def test_frame_subclassing_and_slicing(self): # Subclass frame and ensure it returns the right class on slicing it # In reference to PR 9632 @@ -218,7 +219,7 @@ def test_subclass_stack(self): columns=["X", "Y", "Z"], ) - res = df.stack(future_stack=True) + res = df.stack() exp = tm.SubclassedSeries( [1, 2, 3, 4, 5, 6, 7, 8, 9], index=[list("aaabbbccc"), list("XYZXYZXYZ")] ) @@ -255,10 +256,10 @@ def test_subclass_stack_multi(self): columns=Index(["W", "X"], name="www"), ) - res = df.stack(future_stack=True) + res = df.stack() tm.assert_frame_equal(res, exp) - res = df.stack("yyy", future_stack=True) + res = df.stack("yyy") tm.assert_frame_equal(res, exp) exp = tm.SubclassedDataFrame( @@ -279,7 +280,7 @@ def test_subclass_stack_multi(self): columns=Index(["y", "z"], name="yyy"), ) - res = df.stack("www", future_stack=True) + res = df.stack("www") tm.assert_frame_equal(res, exp) def test_subclass_stack_multi_mixed(self): @@ -317,10 +318,10 @@ def test_subclass_stack_multi_mixed(self): columns=Index(["W", "X"], name="www"), ) - res = df.stack(future_stack=True) + res = df.stack() tm.assert_frame_equal(res, exp) - res = df.stack("yyy", future_stack=True) + res = df.stack("yyy") tm.assert_frame_equal(res, exp) exp = tm.SubclassedDataFrame( @@ -341,7 +342,7 @@ def test_subclass_stack_multi_mixed(self): columns=Index(["y", "z"], name="yyy"), ) - res = df.stack("www", future_stack=True) + res = df.stack("www") tm.assert_frame_equal(res, exp) def test_subclass_unstack(self): @@ -710,14 +711,21 @@ def test_idxmax_preserves_subclass(self): result = df.idxmax() assert isinstance(result, tm.SubclassedSeries) - def test_convert_dtypes_preserves_subclass(self, gpd_style_subclass_df): + def test_convert_dtypes_preserves_subclass(self): # GH 43668 df = tm.SubclassedDataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) result = df.convert_dtypes() assert isinstance(result, tm.SubclassedDataFrame) - result = gpd_style_subclass_df.convert_dtypes() - assert isinstance(result, type(gpd_style_subclass_df)) + def test_convert_dtypes_preserves_subclass_with_constructor(self): + class SubclassedDataFrame(DataFrame): + @property + def _constructor(self): + return SubclassedDataFrame + + df = SubclassedDataFrame({"a": [1, 2, 3]}) + result = df.convert_dtypes() + assert isinstance(result, SubclassedDataFrame) def test_astype_preserves_subclass(self): # GH#40810 @@ -734,18 +742,6 @@ def test_equals_subclass(self): assert
df1.equals(df2) assert df2.equals(df1) - def test_replace_list_method(self): - # https://github.com/pandas-dev/pandas/pull/46018 - df = tm.SubclassedDataFrame({"A": [0, 1, 2]}) - msg = "The 'method' keyword in SubclassedDataFrame.replace is deprecated" - with tm.assert_produces_warning( - FutureWarning, match=msg, raise_on_extra_warnings=False - ): - result = df.replace([1, 2], method="ffill") - expected = tm.SubclassedDataFrame({"A": [0, 0, 0]}) - assert isinstance(result, tm.SubclassedDataFrame) - tm.assert_frame_equal(result, expected) - class MySubclassWithMetadata(DataFrame): _metadata = ["my_metadata"] @@ -773,6 +769,13 @@ def test_constructor_with_metadata(): assert isinstance(subset, MySubclassWithMetadata) +def test_constructor_with_metadata_from_records(): + # GH#57008 + df = MySubclassWithMetadata.from_records([{"a": 1, "b": 2}]) + assert df.my_metadata is None + assert type(df) is MySubclassWithMetadata + + class SimpleDataFrameSubClass(DataFrame): """A subclass of DataFrame that does not define a constructor.""" diff --git a/pandas/tests/frame/test_ufunc.py b/pandas/tests/frame/test_ufunc.py index 88c62da2b0a73..092e65dd4b431 100644 --- a/pandas/tests/frame/test_ufunc.py +++ b/pandas/tests/frame/test_ufunc.py @@ -66,14 +66,14 @@ def test_binary_input_dispatch_binop(dtype): [ (np.add, 1, [2, 3, 4, 5]), ( - partial(np.add, where=[[False, True], [True, False]]), + partial(np.add, where=[[False, True], [True, False]]), # type: ignore[misc] np.array([[1, 1], [1, 1]]), [0, 3, 4, 0], ), (np.power, np.array([[1, 1], [2, 2]]), [1, 2, 9, 16]), (np.subtract, 2, [-1, 0, 1, 2]), ( - partial(np.negative, where=np.array([[False, True], [True, False]])), + partial(np.negative, where=np.array([[False, True], [True, False]])), # type: ignore[misc] None, [0, -2, -3, 0], ), @@ -245,6 +245,7 @@ def test_alignment_deprecation_enforced(): np.add(s2, df1) +@pytest.mark.single_cpu def test_alignment_deprecation_many_inputs_enforced(): # Enforced in 2.0 # https://github.com/pandas-dev/pandas/issues/39184 diff --git a/pandas/tests/frame/test_unary.py b/pandas/tests/frame/test_unary.py index 850c92013694f..652f52bd226af 100644 --- a/pandas/tests/frame/test_unary.py +++ b/pandas/tests/frame/test_unary.py @@ -13,17 +13,16 @@ class TestDataFrameUnaryOperators: # __pos__, __neg__, __invert__ @pytest.mark.parametrize( - "df,expected", + "df_data,expected_data", [ - (pd.DataFrame({"a": [-1, 1]}), pd.DataFrame({"a": [1, -1]})), - (pd.DataFrame({"a": [False, True]}), pd.DataFrame({"a": [True, False]})), - ( - pd.DataFrame({"a": pd.Series(pd.to_timedelta([-1, 1]))}), - pd.DataFrame({"a": pd.Series(pd.to_timedelta([1, -1]))}), - ), + ([-1, 1], [1, -1]), + ([False, True], [True, False]), + (pd.to_timedelta([-1, 1]), pd.to_timedelta([1, -1])), ], ) - def test_neg_numeric(self, df, expected): + def test_neg_numeric(self, df_data, expected_data): + df = pd.DataFrame({"a": df_data}) + expected = pd.DataFrame({"a": expected_data}) tm.assert_frame_equal(-df, expected) tm.assert_series_equal(-df["a"], expected["a"]) @@ -42,31 +41,23 @@ def test_neg_object(self, df, expected): tm.assert_series_equal(-df["a"], expected["a"]) @pytest.mark.parametrize( - "df", + "df_data", [ - pd.DataFrame({"a": ["a", "b"]}), - pd.DataFrame({"a": pd.to_datetime(["2017-01-22", "1970-01-01"])}), + ["a", "b"], + pd.to_datetime(["2017-01-22", "1970-01-01"]), ], ) - def test_neg_raises(self, df, using_infer_string): + def test_neg_raises(self, df_data, using_infer_string): + df = pd.DataFrame({"a": df_data}) msg = ( "bad operand type for unary -: 
'str'|" - r"bad operand type for unary -: 'DatetimeArray'" + r"bad operand type for unary -: 'DatetimeArray'|" + "unary '-' not supported for dtype" ) - if using_infer_string and df.dtypes.iloc[0] == "string": - import pyarrow as pa - - msg = "has no kernel" - with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): - (-df) - with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): - (-df["a"]) - - else: - with pytest.raises(TypeError, match=msg): - (-df) - with pytest.raises(TypeError, match=msg): - (-df["a"]) + with pytest.raises(TypeError, match=msg): + (-df) + with pytest.raises(TypeError, match=msg): + (-df["a"]) def test_invert(self, float_frame): df = float_frame @@ -102,44 +93,36 @@ def test_invert_empty_not_input(self): assert df is not result @pytest.mark.parametrize( - "df", + "df_data", [ - pd.DataFrame({"a": [-1, 1]}), - pd.DataFrame({"a": [False, True]}), - pd.DataFrame({"a": pd.Series(pd.to_timedelta([-1, 1]))}), + [-1, 1], + [False, True], + pd.to_timedelta([-1, 1]), ], ) - def test_pos_numeric(self, df): + def test_pos_numeric(self, df_data): # GH#16073 + df = pd.DataFrame({"a": df_data}) tm.assert_frame_equal(+df, df) tm.assert_series_equal(+df["a"], df["a"]) @pytest.mark.parametrize( - "df", + "df_data", [ - pd.DataFrame({"a": np.array([-1, 2], dtype=object)}), - pd.DataFrame({"a": [Decimal("-1.0"), Decimal("2.0")]}), + np.array([-1, 2], dtype=object), + [Decimal("-1.0"), Decimal("2.0")], ], ) - def test_pos_object(self, df): + def test_pos_object(self, df_data): # GH#21380 + df = pd.DataFrame({"a": df_data}) tm.assert_frame_equal(+df, df) tm.assert_series_equal(+df["a"], df["a"]) - @pytest.mark.parametrize( - "df", - [ - pytest.param( - pd.DataFrame({"a": ["a", "b"]}), - # filterwarnings removable once min numpy version is 1.25 - marks=[ - pytest.mark.filterwarnings("ignore:Applying:DeprecationWarning") - ], - ), - ], - ) - def test_pos_object_raises(self, df): + @pytest.mark.filterwarnings("ignore:Applying:DeprecationWarning") + def test_pos_object_raises(self): # GH#21380 + df = pd.DataFrame({"a": ["a", "b"]}) if np_version_gte1p25: with pytest.raises( TypeError, match=r"^bad operand type for unary \+: \'str\'$" @@ -148,10 +131,8 @@ def test_pos_object_raises(self, df): else: tm.assert_series_equal(+df["a"], df["a"]) - @pytest.mark.parametrize( - "df", [pd.DataFrame({"a": pd.to_datetime(["2017-01-22", "1970-01-01"])})] - ) - def test_pos_raises(self, df): + def test_pos_raises(self): + df = pd.DataFrame({"a": pd.to_datetime(["2017-01-22", "1970-01-01"])}) msg = r"bad operand type for unary \+: 'DatetimeArray'" with pytest.raises(TypeError, match=msg): (+df) diff --git a/pandas/tests/frame/test_validate.py b/pandas/tests/frame/test_validate.py index e99e0a6863848..fdeecba29a617 100644 --- a/pandas/tests/frame/test_validate.py +++ b/pandas/tests/frame/test_validate.py @@ -3,11 +3,6 @@ from pandas.core.frame import DataFrame -@pytest.fixture -def dataframe(): - return DataFrame({"a": [1, 2], "b": [3, 4]}) - - class TestDataFrameValidate: """Tests for error handling related to data types of method arguments.""" @@ -24,7 +19,8 @@ class TestDataFrameValidate: ], ) @pytest.mark.parametrize("inplace", [1, "True", [1, 2, 3], 5.0]) - def test_validate_bool_args(self, dataframe, func, inplace): + def test_validate_bool_args(self, func, inplace): + dataframe = DataFrame({"a": [1, 2], "b": [3, 4]}) msg = 'For argument "inplace" expected type bool' kwargs = {"inplace": inplace} diff --git a/pandas/tests/generic/test_duplicate_labels.py 
b/pandas/tests/generic/test_duplicate_labels.py index f54db07824daf..cfa3cabbc1747 100644 --- a/pandas/tests/generic/test_duplicate_labels.py +++ b/pandas/tests/generic/test_duplicate_labels.py @@ -1,4 +1,5 @@ """Tests dealing with the NDFrame.allows_duplicates.""" + import operator import numpy as np @@ -45,12 +46,11 @@ def test_preserved_series(self, func): s = pd.Series([0, 1], index=["a", "b"]).set_flags(allows_duplicate_labels=False) assert func(s).flags.allows_duplicate_labels is False - @pytest.mark.parametrize( - "other", [pd.Series(0, index=["a", "b", "c"]), pd.Series(0, index=["a", "b"])] - ) + @pytest.mark.parametrize("index", [["a", "b", "c"], ["a", "b"]]) # TODO: frame @not_implemented - def test_align(self, other): + def test_align(self, index): + other = pd.Series(0, index=index) s = pd.Series([0, 1], index=["a", "b"]).set_flags(allows_duplicate_labels=False) a, b = s.align(other) assert a.flags.allows_duplicate_labels is False @@ -90,18 +90,6 @@ def test_preserve_getitem(self): assert df.loc[[0]].flags.allows_duplicate_labels is False assert df.loc[0, ["A"]].flags.allows_duplicate_labels is False - def test_ndframe_getitem_caching_issue( - self, request, using_copy_on_write, warn_copy_on_write - ): - if not (using_copy_on_write or warn_copy_on_write): - request.applymarker(pytest.mark.xfail(reason="Unclear behavior.")) - # NDFrame.__getitem__ will cache the first df['A']. May need to - # invalidate that cache? Update the cached entries? - df = pd.DataFrame({"A": [0]}).set_flags(allows_duplicate_labels=False) - assert df["A"].flags.allows_duplicate_labels is False - df.flags.allows_duplicate_labels = True - assert df["A"].flags.allows_duplicate_labels is True - @pytest.mark.parametrize( "objs, kwargs", [ @@ -298,23 +286,15 @@ def test_getitem_raises(self, getter, target): with pytest.raises(pd.errors.DuplicateLabelError, match=msg): getter(target) - @pytest.mark.parametrize( - "objs, kwargs", - [ - ( - [ - pd.Series(1, index=[0, 1], name="a"), - pd.Series(2, index=[0, 1], name="a"), - ], - {"axis": 1}, - ) - ], - ) - def test_concat_raises(self, objs, kwargs): + def test_concat_raises(self): + objs = [ + pd.Series(1, index=[0, 1], name="a"), + pd.Series(2, index=[0, 1], name="a"), + ] objs = [x.set_flags(allows_duplicate_labels=False) for x in objs] msg = "Index has duplicates." with pytest.raises(pd.errors.DuplicateLabelError, match=msg): - pd.concat(objs, **kwargs) + pd.concat(objs, axis=1) @not_implemented def test_merge_raises(self): diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index 866e9e203ffe3..4b841b54c488b 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -1,6 +1,7 @@ """ An exhaustive list of pandas methods exercising NDFrame.__finalize__. """ + import operator import re @@ -8,7 +9,6 @@ import pytest import pandas as pd -import pandas._testing as tm # TODO: # * Binary methods (mul, div, etc.) 
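# A minimal, hedged sketch of the NDFrame.__finalize__ contract that the
# test_finalize.py module above exercises: .attrs set on an input object
# are expected to propagate onto the result of most NDFrame-returning
# methods. Illustrative sketch only, not part of the patch; names chosen
# here are arbitrary.
import pandas as pd

df = pd.DataFrame({"A": [1, 2]})
df.attrs = {"a": 1}

result = df.reset_index()        # reset_index routes through __finalize__
assert result.attrs == {"a": 1}  # metadata survives the method call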
@@ -90,7 +90,6 @@ (pd.DataFrame, frame_data, operator.methodcaller("rename", columns={"A": "a"})), (pd.DataFrame, frame_data, operator.methodcaller("rename", index=lambda x: x)), (pd.DataFrame, frame_data, operator.methodcaller("fillna", "A")), - (pd.DataFrame, frame_data, operator.methodcaller("fillna", method="ffill")), (pd.DataFrame, frame_data, operator.methodcaller("set_index", "A")), (pd.DataFrame, frame_data, operator.methodcaller("reset_index")), (pd.DataFrame, frame_data, operator.methodcaller("isna")), @@ -303,16 +302,6 @@ ({"A": [1, 1, 1, 1]}, pd.date_range("2000", periods=4)), operator.methodcaller("between_time", "12:00", "13:00"), ), - ( - pd.Series, - (1, pd.date_range("2000", periods=4)), - operator.methodcaller("last", "3D"), - ), - ( - pd.DataFrame, - ({"A": [1, 1, 1, 1]}, pd.date_range("2000", periods=4)), - operator.methodcaller("last", "3D"), - ), (pd.Series, ([1, 2],), operator.methodcaller("rank")), (pd.DataFrame, frame_data, operator.methodcaller("rank")), (pd.Series, ([1, 2],), operator.methodcaller("where", np.array([True, False]))), @@ -386,18 +375,7 @@ def idfn(x): return str(x) -@pytest.fixture(params=_all_methods, ids=lambda x: idfn(x[-1])) -def ndframe_method(request): - """ - An NDFrame method returning an NDFrame. - """ - return request.param - - -@pytest.mark.filterwarnings( - "ignore:DataFrame.fillna with 'method' is deprecated:FutureWarning", - "ignore:last is deprecated:FutureWarning", -) +@pytest.mark.parametrize("ndframe_method", _all_methods, ids=lambda x: idfn(x[-1])) def test_finalize_called(ndframe_method): cls, init_args, method = ndframe_method ndframe = cls(*init_args) @@ -408,39 +386,6 @@ def test_finalize_called(ndframe_method): assert result.attrs == {"a": 1} -@pytest.mark.parametrize( - "data", - [ - pd.Series(1, pd.date_range("2000", periods=4)), - pd.DataFrame({"A": [1, 1, 1, 1]}, pd.date_range("2000", periods=4)), - ], -) -def test_finalize_first(data): - deprecated_msg = "first is deprecated" - - data.attrs = {"a": 1} - with tm.assert_produces_warning(FutureWarning, match=deprecated_msg): - result = data.first("3D") - assert result.attrs == {"a": 1} - - -@pytest.mark.parametrize( - "data", - [ - pd.Series(1, pd.date_range("2000", periods=4)), - pd.DataFrame({"A": [1, 1, 1, 1]}, pd.date_range("2000", periods=4)), - ], -) -def test_finalize_last(data): - # GH 53710 - deprecated_msg = "last is deprecated" - - data.attrs = {"a": 1} - with tm.assert_produces_warning(FutureWarning, match=deprecated_msg): - result = data.last("3D") - assert result.attrs == {"a": 1} - - @not_implemented_mark def test_finalize_called_eval_numexpr(): pytest.importorskip("numexpr") @@ -482,53 +427,6 @@ def test_binops(request, args, annotate, all_binary_operators): if annotate == "right" and isinstance(right, int): pytest.skip("right is an int and doesn't support .attrs") - if not (isinstance(left, int) or isinstance(right, int)) and annotate != "both": - if not all_binary_operators.__name__.startswith("r"): - if annotate == "right" and isinstance(left, type(right)): - request.applymarker( - pytest.mark.xfail( - reason=f"{all_binary_operators} doesn't work when right has " - f"attrs and both are {type(left)}" - ) - ) - if not isinstance(left, type(right)): - if annotate == "left" and isinstance(left, pd.Series): - request.applymarker( - pytest.mark.xfail( - reason=f"{all_binary_operators} doesn't work when the " - "objects are different Series has attrs" - ) - ) - elif annotate == "right" and isinstance(right, pd.Series): - request.applymarker( - 
pytest.mark.xfail( - reason=f"{all_binary_operators} doesn't work when the " - "objects are different Series has attrs" - ) - ) - else: - if annotate == "left" and isinstance(left, type(right)): - request.applymarker( - pytest.mark.xfail( - reason=f"{all_binary_operators} doesn't work when left has " - f"attrs and both are {type(left)}" - ) - ) - if not isinstance(left, type(right)): - if annotate == "right" and isinstance(right, pd.Series): - request.applymarker( - pytest.mark.xfail( - reason=f"{all_binary_operators} doesn't work when the " - "objects are different Series has attrs" - ) - ) - elif annotate == "left" and isinstance(left, pd.Series): - request.applymarker( - pytest.mark.xfail( - reason=f"{all_binary_operators} doesn't work when the " - "objects are different Series has attrs" - ) - ) if annotate in {"left", "both"} and not isinstance(left, int): left.attrs = {"a": 1} if annotate in {"right", "both"} and not isinstance(right, int): @@ -544,14 +442,26 @@ def test_binops(request, args, annotate, all_binary_operators): ] if is_cmp and isinstance(left, pd.DataFrame) and isinstance(right, pd.Series): # in 2.0 silent alignment on comparisons was removed xref GH#28759 - left, right = left.align(right, axis=1, copy=False) + left, right = left.align(right, axis=1) elif is_cmp and isinstance(left, pd.Series) and isinstance(right, pd.DataFrame): - right, left = right.align(left, axis=1, copy=False) + right, left = right.align(left, axis=1) result = all_binary_operators(left, right) assert result.attrs == {"a": 1} +@pytest.mark.parametrize("left", [pd.Series, pd.DataFrame]) +@pytest.mark.parametrize("right", [pd.Series, pd.DataFrame]) +def test_attrs_binary_operations(all_binary_operators, left, right): + # GH 51607 + attrs = {"a": 1} + left = left([1]) + left.attrs = attrs + right = right([2]) + assert all_binary_operators(left, right).attrs == attrs + assert all_binary_operators(right, left).attrs == attrs + + # ---------------------------------------------------------------------------- # Accessors @@ -699,7 +609,7 @@ def test_timedelta_methods(method): operator.methodcaller("add_categories", ["c"]), operator.methodcaller("as_ordered"), operator.methodcaller("as_unordered"), - lambda x: getattr(x, "codes"), + lambda x: x.codes, operator.methodcaller("remove_categories", "a"), operator.methodcaller("remove_unused_categories"), operator.methodcaller("rename_categories", {"a": "A", "b": "B"}), diff --git a/pandas/tests/generic/test_frame.py b/pandas/tests/generic/test_frame.py index fc7aa9e7b2c46..1d0f491529b56 100644 --- a/pandas/tests/generic/test_frame.py +++ b/pandas/tests/generic/test_frame.py @@ -46,28 +46,11 @@ def test_set_axis_name_mi(self, func): assert result.index.names == [None, None] def test_nonzero_single_element(self): - # allow single item via bool method - msg_warn = ( - "DataFrame.bool is now deprecated and will be removed " - "in future version of pandas" - ) - df = DataFrame([[True]]) - df1 = DataFrame([[False]]) - with tm.assert_produces_warning(FutureWarning, match=msg_warn): - assert df.bool() - - with tm.assert_produces_warning(FutureWarning, match=msg_warn): - assert not df1.bool() - df = DataFrame([[False, False]]) msg_err = "The truth value of a DataFrame is ambiguous" with pytest.raises(ValueError, match=msg_err): bool(df) - with tm.assert_produces_warning(FutureWarning, match=msg_warn): - with pytest.raises(ValueError, match=msg_err): - df.bool() - def test_metadata_propagation_indiv_groupby(self): # groupby df = DataFrame( diff --git 
a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index 6564e381af0ea..b591b1b1092d4 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -93,8 +93,7 @@ def test_get_numeric_data(self, frame_or_series): if isinstance(o, DataFrame): # preserve columns dtype expected.columns = o.columns[:0] - # https://github.com/pandas-dev/pandas/issues/50862 - tm.assert_equal(result.reset_index(drop=True), expected) + tm.assert_equal(result, expected) # get the bool data arr = np.array([True, True, False, True]) @@ -102,6 +101,11 @@ def test_get_numeric_data(self, frame_or_series): result = o._get_numeric_data() tm.assert_equal(result, o) + def test_get_bool_data_empty_preserve_index(self): + expected = Series([], dtype="bool") + result = expected._get_bool_data() + tm.assert_series_equal(result, expected, check_index_type=True) + def test_nonzero(self, frame_or_series): # GH 4633 # look at the boolean/nonzero behavior for objects @@ -232,11 +236,8 @@ def test_size_compat(self, frame_or_series): def test_split_compat(self, frame_or_series): # xref GH8846 o = construct(frame_or_series, shape=10) - with tm.assert_produces_warning( - FutureWarning, match=".swapaxes' is deprecated", check_stacklevel=False - ): - assert len(np.array_split(o, 5)) == 5 - assert len(np.array_split(o, 2)) == 2 + assert len(np.array_split(o, 5)) == 5 + assert len(np.array_split(o, 2)) == 2 # See gh-12301 def test_stat_unexpected_keyword(self, frame_or_series): @@ -306,13 +307,6 @@ def test_copy_and_deepcopy(self, frame_or_series, shape, func): assert obj_copy is not obj tm.assert_equal(obj_copy, obj) - def test_data_deprecated(self, frame_or_series): - obj = frame_or_series() - msg = "(Series|DataFrame)._data is deprecated" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - mgr = obj._data - assert mgr is obj._mgr - class TestNDFrame: # tests that don't fit elsewhere @@ -493,12 +487,3 @@ def test_flags_identity(self, frame_or_series): assert obj.flags is obj.flags obj2 = obj.copy() assert obj2.flags is not obj.flags - - def test_bool_dep(self) -> None: - # GH-51749 - msg_warn = ( - "DataFrame.bool is now deprecated and will be removed " - "in future version of pandas" - ) - with tm.assert_produces_warning(FutureWarning, match=msg_warn): - DataFrame({"col": [False]}).bool() diff --git a/pandas/tests/generic/test_label_or_level_utils.py b/pandas/tests/generic/test_label_or_level_utils.py index 97be46f716d7d..80c24c647bc11 100644 --- a/pandas/tests/generic/test_label_or_level_utils.py +++ b/pandas/tests/generic/test_label_or_level_utils.py @@ -34,15 +34,6 @@ def df_ambig(df): return df -@pytest.fixture -def df_duplabels(df): - """DataFrame with level 'L1' and labels 'L2', 'L3', and 'L2'""" - df = df.set_index(["L1"]) - df = pd.concat([df, df["L2"]], axis=1) - - return df - - # Test is label/level reference # ============================= def get_labels_levels(df_levels): @@ -229,7 +220,9 @@ def test_get_label_or_level_values_df_ambig(df_ambig, axis): assert_label_values(df_ambig, ["L3"], axis=axis) -def test_get_label_or_level_values_df_duplabels(df_duplabels, axis): +def test_get_label_or_level_values_df_duplabels(df, axis): + df = df.set_index(["L1"]) + df_duplabels = pd.concat([df, df["L2"]], axis=1) axis = df_duplabels._get_axis_number(axis) # Transpose frame if axis == 1 if axis == 1: diff --git a/pandas/tests/generic/test_series.py b/pandas/tests/generic/test_series.py index 3648961eb3808..7dcdcd96cce51 100644 --- 
a/pandas/tests/generic/test_series.py +++ b/pandas/tests/generic/test_series.py @@ -39,19 +39,6 @@ def test_get_bool_data_preserve_dtype(self): result = ser._get_bool_data() tm.assert_series_equal(result, ser) - def test_nonzero_single_element(self): - # allow single item via bool method - msg_warn = ( - "Series.bool is now deprecated and will be removed " - "in future version of pandas" - ) - ser = Series([True]) - ser1 = Series([False]) - with tm.assert_produces_warning(FutureWarning, match=msg_warn): - assert ser.bool() - with tm.assert_produces_warning(FutureWarning, match=msg_warn): - assert not ser1.bool() - @pytest.mark.parametrize("data", [np.nan, pd.NaT, True, False]) def test_nonzero_single_element_raise_1(self, data): # single item nan to raise @@ -61,48 +48,21 @@ def test_nonzero_single_element_raise_1(self, data): with pytest.raises(ValueError, match=msg): bool(series) - @pytest.mark.parametrize("data", [np.nan, pd.NaT]) - def test_nonzero_single_element_raise_2(self, data): - msg_warn = ( - "Series.bool is now deprecated and will be removed " - "in future version of pandas" - ) - msg_err = "bool cannot act on a non-boolean single element Series" - series = Series([data]) - with tm.assert_produces_warning(FutureWarning, match=msg_warn): - with pytest.raises(ValueError, match=msg_err): - series.bool() - @pytest.mark.parametrize("data", [(True, True), (False, False)]) def test_nonzero_multiple_element_raise(self, data): # multiple bool are still an error - msg_warn = ( - "Series.bool is now deprecated and will be removed " - "in future version of pandas" - ) msg_err = "The truth value of a Series is ambiguous" series = Series([data]) with pytest.raises(ValueError, match=msg_err): bool(series) - with tm.assert_produces_warning(FutureWarning, match=msg_warn): - with pytest.raises(ValueError, match=msg_err): - series.bool() @pytest.mark.parametrize("data", [1, 0, "a", 0.0]) def test_nonbool_single_element_raise(self, data): # single non-bool are an error - msg_warn = ( - "Series.bool is now deprecated and will be removed " - "in future version of pandas" - ) msg_err1 = "The truth value of a Series is ambiguous" - msg_err2 = "bool cannot act on a non-boolean single element Series" series = Series([data]) with pytest.raises(ValueError, match=msg_err1): bool(series) - with tm.assert_produces_warning(FutureWarning, match=msg_warn): - with pytest.raises(ValueError, match=msg_err2): - series.bool() def test_metadata_propagation_indiv_resample(self): # resample diff --git a/pandas/tests/generic/test_to_xarray.py b/pandas/tests/generic/test_to_xarray.py index e0d79c3f15282..8917e4e3f3854 100644 --- a/pandas/tests/generic/test_to_xarray.py +++ b/pandas/tests/generic/test_to_xarray.py @@ -6,11 +6,13 @@ DataFrame, MultiIndex, Series, + StringDtype, date_range, ) import pandas._testing as tm +from pandas.util.version import Version -pytest.importorskip("xarray") +xarray = pytest.importorskip("xarray") class TestDataFrameToXArray: @@ -29,71 +31,77 @@ def df(self): } ) - def test_to_xarray_index_types(self, index_flat, df, using_infer_string): + def test_to_xarray_index_types(self, index_flat, df, request): index = index_flat # MultiIndex is tested in test_to_xarray_with_multiindex if len(index) == 0: pytest.skip("Test doesn't make sense for empty index") - - from xarray import Dataset + elif Version(xarray.__version__) <= Version("2024.9.0"): + request.applymarker( + pytest.mark.xfail( + reason="Categorical column not preserved.", + ) + ) df.index = index[:4] df.index.name = "foo" 
df.columns.name = "bar" result = df.to_xarray() - assert result.dims["foo"] == 4 + assert result.sizes["foo"] == 4 assert len(result.coords) == 1 assert len(result.data_vars) == 8 tm.assert_almost_equal(list(result.coords.keys()), ["foo"]) - assert isinstance(result, Dataset) + assert isinstance(result, xarray.Dataset) # idempotency # datetimes w/tz are preserved # column names are lost expected = df.copy() - expected["f"] = expected["f"].astype( - object if not using_infer_string else "string[pyarrow_numpy]" - ) expected.columns.name = None tm.assert_frame_equal(result.to_dataframe(), expected) def test_to_xarray_empty(self, df): - from xarray import Dataset - df.index.name = "foo" result = df[0:0].to_xarray() - assert result.dims["foo"] == 0 - assert isinstance(result, Dataset) + assert result.sizes["foo"] == 0 + assert isinstance(result, xarray.Dataset) def test_to_xarray_with_multiindex(self, df, using_infer_string): - from xarray import Dataset - # MultiIndex df.index = MultiIndex.from_product([["a"], range(4)], names=["one", "two"]) result = df.to_xarray() - assert result.dims["one"] == 1 - assert result.dims["two"] == 4 + assert result.sizes["one"] == 1 + assert result.sizes["two"] == 4 assert len(result.coords) == 2 assert len(result.data_vars) == 8 tm.assert_almost_equal(list(result.coords.keys()), ["one", "two"]) - assert isinstance(result, Dataset) + assert isinstance(result, xarray.Dataset) result = result.to_dataframe() expected = df.copy() expected["f"] = expected["f"].astype( - object if not using_infer_string else "string[pyarrow_numpy]" + object if not using_infer_string else "str" ) expected.columns.name = None tm.assert_frame_equal(result, expected) class TestSeriesToXArray: - def test_to_xarray_index_types(self, index_flat): + def test_to_xarray_index_types(self, index_flat, request): index = index_flat + if ( + isinstance(index.dtype, StringDtype) + and index.dtype.storage == "pyarrow" + and Version(xarray.__version__) > Version("2024.9.0") + ): + request.applymarker( + pytest.mark.xfail( + reason="xarray calling reshape of ArrowExtensionArray", + raises=NotImplementedError, + ) + ) # MultiIndex is tested in test_to_xarray_with_multiindex - from xarray import DataArray - ser = Series(range(len(index)), index=index, dtype="int64") ser.index.name = "foo" result = ser.to_xarray() @@ -101,30 +109,26 @@ def test_to_xarray_index_types(self, index_flat): assert len(result) == len(index) assert len(result.coords) == 1 tm.assert_almost_equal(list(result.coords.keys()), ["foo"]) - assert isinstance(result, DataArray) + assert isinstance(result, xarray.DataArray) # idempotency tm.assert_series_equal(result.to_series(), ser) def test_to_xarray_empty(self): - from xarray import DataArray - ser = Series([], dtype=object) ser.index.name = "foo" result = ser.to_xarray() assert len(result) == 0 assert len(result.coords) == 1 tm.assert_almost_equal(list(result.coords.keys()), ["foo"]) - assert isinstance(result, DataArray) + assert isinstance(result, xarray.DataArray) def test_to_xarray_with_multiindex(self): - from xarray import DataArray - mi = MultiIndex.from_product([["a", "b"], range(3)], names=["one", "two"]) ser = Series(range(6), dtype="int64", index=mi) result = ser.to_xarray() assert len(result) == 2 tm.assert_almost_equal(list(result.coords.keys()), ["one", "two"]) - assert isinstance(result, DataArray) + assert isinstance(result, xarray.DataArray) res = result.to_series() tm.assert_series_equal(res, ser) diff --git a/pandas/tests/groupby/__init__.py 
b/pandas/tests/groupby/__init__.py index 446d9da437771..79046cd7ed415 100644 --- a/pandas/tests/groupby/__init__.py +++ b/pandas/tests/groupby/__init__.py @@ -2,7 +2,7 @@ def get_groupby_method_args(name, obj): """ Get required arguments for a groupby method. - When parametrizing a test over groupby methods (e.g. "sum", "mean", "fillna"), + When parametrizing a test over groupby methods (e.g. "sum", "mean"), it is often the case that arguments are required for certain methods. Parameters @@ -16,7 +16,7 @@ def get_groupby_method_args(name, obj): ------- A tuple of required arguments for the method. """ - if name in ("nth", "fillna", "take"): + if name in ("nth", "take"): return (0,) if name == "quantile": return (0.5,) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 6223a153df358..b7e6e55739c17 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1,10 +1,10 @@ """ test .agg behavior / note that .apply is tested generally in test_groupby.py """ + import datetime import functools from functools import partial -import re import numpy as np import pytest @@ -62,6 +62,32 @@ def test_agg_ser_multi_key(df): tm.assert_series_equal(results, expected) +def test_agg_with_missing_values(): + # GH#58810 + missing_df = DataFrame( + { + "nan": [np.nan, np.nan, np.nan, np.nan], + "na": [pd.NA, pd.NA, pd.NA, pd.NA], + "nat": [pd.NaT, pd.NaT, pd.NaT, pd.NaT], + "none": [None, None, None, None], + "values": [1, 2, 3, 4], + } + ) + + result = missing_df.agg(x=("nan", "min"), y=("na", "min"), z=("values", "sum")) + + expected = DataFrame( + { + "nan": [np.nan, np.nan, np.nan], + "na": [np.nan, np.nan, np.nan], + "values": [np.nan, np.nan, 10.0], + }, + index=["x", "y", "z"], + ) + + tm.assert_frame_equal(result, expected) + + def test_groupby_aggregation_mixed_dtype(): # GH 6212 expected = DataFrame( @@ -111,28 +137,6 @@ def test_groupby_aggregation_mixed_dtype(): tm.assert_frame_equal(result, expected) -def test_groupby_aggregation_multi_level_column(): - # GH 29772 - lst = [ - [True, True, True, False], - [True, False, np.nan, False], - [True, True, np.nan, False], - [True, True, np.nan, False], - ] - df = DataFrame( - data=lst, - columns=MultiIndex.from_tuples([("A", 0), ("A", 1), ("B", 0), ("B", 1)]), - ) - - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - gb = df.groupby(level=1, axis=1) - result = gb.sum(numeric_only=False) - expected = DataFrame({0: [2.0, True, True, True], 1: [1, 0, 1, 1]}) - - tm.assert_frame_equal(result, expected) - - def test_agg_apply_corner(ts, tsframe): # nothing to group, all NA grouped = ts.groupby(ts * np.nan, group_keys=False) @@ -154,12 +158,38 @@ def test_agg_apply_corner(ts, tsframe): tm.assert_frame_equal(grouped.sum(), exp_df) tm.assert_frame_equal(grouped.agg("sum"), exp_df) - msg = "The behavior of DataFrame.sum with axis=None is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False): - res = grouped.apply(np.sum) + res = grouped.apply(np.sum, axis=0) + exp_df = exp_df.reset_index(drop=True) tm.assert_frame_equal(res, exp_df) +def test_with_na_groups(any_real_numpy_dtype): + index = Index(np.arange(10)) + values = Series(np.ones(10), index, dtype=any_real_numpy_dtype) + labels = Series( + [np.nan, "foo", "bar", "bar", np.nan, np.nan, "bar", "bar", np.nan, "foo"], + index=index, + ) + + # this SHOULD be an int + grouped = 
values.groupby(labels) + agged = grouped.agg(len) + expected = Series([4, 2], index=["bar", "foo"]) + + tm.assert_series_equal(agged, expected, check_dtype=False) + + # assert issubclass(agged.dtype.type, np.integer) + + # explicitly return a float from my function + def f(x): + return float(len(x)) + + agged = grouped.agg(f) + expected = Series([4.0, 2.0], index=["bar", "foo"]) + + tm.assert_series_equal(agged, expected) + + def test_agg_grouping_is_list_tuple(ts): df = DataFrame( np.random.default_rng(2).standard_normal((30, 4)), @@ -236,77 +266,9 @@ def test_std_masked_dtype(any_numeric_ea_dtype): def test_agg_str_with_kwarg_axis_1_raises(df, reduction_func): gb = df.groupby(level=0) - warn_msg = f"DataFrameGroupBy.{reduction_func} with axis=1 is deprecated" - if reduction_func in ("idxmax", "idxmin"): - error = TypeError - msg = "'[<>]' not supported between instances of 'float' and 'str'" - warn = FutureWarning - else: - error = ValueError - msg = f"Operation {reduction_func} does not support axis=1" - warn = None - with pytest.raises(error, match=msg): - with tm.assert_produces_warning(warn, match=warn_msg): - gb.agg(reduction_func, axis=1) - - -@pytest.mark.parametrize( - "func, expected, dtype, result_dtype_dict", - [ - ("sum", [5, 7, 9], "int64", {}), - ("std", [4.5**0.5] * 3, int, {"i": float, "j": float, "k": float}), - ("var", [4.5] * 3, int, {"i": float, "j": float, "k": float}), - ("sum", [5, 7, 9], "Int64", {"j": "int64"}), - ("std", [4.5**0.5] * 3, "Int64", {"i": float, "j": float, "k": float}), - ("var", [4.5] * 3, "Int64", {"i": "float64", "j": "float64", "k": "float64"}), - ], -) -def test_multiindex_groupby_mixed_cols_axis1(func, expected, dtype, result_dtype_dict): - # GH#43209 - df = DataFrame( - [[1, 2, 3, 4, 5, 6]] * 3, - columns=MultiIndex.from_product([["a", "b"], ["i", "j", "k"]]), - ).astype({("a", "j"): dtype, ("b", "j"): dtype}) - - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - gb = df.groupby(level=1, axis=1) - result = gb.agg(func) - expected = DataFrame([expected] * 3, columns=["i", "j", "k"]).astype( - result_dtype_dict - ) - - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "func, expected_data, result_dtype_dict", - [ - ("sum", [[2, 4], [10, 12], [18, 20]], {10: "int64", 20: "int64"}), - # std should ideally return Int64 / Float64 #43330 - ("std", [[2**0.5] * 2] * 3, "float64"), - ("var", [[2] * 2] * 3, {10: "float64", 20: "float64"}), - ], -) -def test_groupby_mixed_cols_axis1(func, expected_data, result_dtype_dict): - # GH#43209 - df = DataFrame( - np.arange(12).reshape(3, 4), - index=Index([0, 1, 0], name="y"), - columns=Index([10, 20, 10, 20], name="x"), - dtype="int64", - ).astype({10: "Int64"}) - - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - gb = df.groupby("x", axis=1) - result = gb.agg(func) - expected = DataFrame( - data=expected_data, - index=Index([0, 1, 0], name="y"), - columns=Index([10, 20], name="x"), - ).astype(result_dtype_dict) - tm.assert_frame_equal(result, expected) + msg = f"Operation {reduction_func} does not support axis=1" + with pytest.raises(ValueError, match=msg): + gb.agg(reduction_func, axis=1) def test_aggregate_item_by_item(df): @@ -337,7 +299,7 @@ def test_wrap_agg_out(three_group): grouped = three_group.groupby(["A", "B"]) def func(ser): - if ser.dtype == object: + if ser.dtype == object or ser.dtype == "string": raise TypeError("Test error message") return 
ser.sum() @@ -352,9 +314,7 @@ def func(ser): def test_agg_multiple_functions_maintain_order(df): # GH #610 funcs = [("mean", np.mean), ("max", np.max), ("min", np.min)] - msg = "is currently using SeriesGroupBy.mean" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("A")["C"].agg(funcs) + result = df.groupby("A")["C"].agg(funcs) exp_cols = Index(["mean", "max", "min"]) tm.assert_index_equal(result.columns, exp_cols) @@ -466,8 +426,8 @@ def numpystd(x): # this uses column selection & renaming msg = r"nested renamer is not supported" + d = {"C": "mean", "D": {"foo": "mean", "bar": "std"}} with pytest.raises(SpecificationError, match=msg): - d = {"C": "mean", "D": {"foo": "mean", "bar": "std"}} grouped.aggregate(d) # But without renaming, these functions are OK @@ -882,8 +842,8 @@ def test_agg_relabel_other_raises(self): def test_missing_raises(self): df = DataFrame({"A": [0, 1], "B": [1, 2]}) - match = re.escape("Column(s) ['C'] do not exist") - with pytest.raises(KeyError, match=match): + msg = r"Label\(s\) \['C'\] do not exist" + with pytest.raises(KeyError, match=msg): df.groupby("A").agg(c=("C", "sum")) def test_agg_namedtuple(self): @@ -944,11 +904,9 @@ def test_agg_relabel_multiindex_column( expected = DataFrame({"a_max": [1, 3]}, index=idx) tm.assert_frame_equal(result, expected) - msg = "is currently using SeriesGroupBy.mean" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby(("x", "group")).agg( - col_1=agg_col1, col_2=agg_col2, col_3=agg_col3 - ) + result = df.groupby(("x", "group")).agg( + col_1=agg_col1, col_2=agg_col2, col_3=agg_col3 + ) expected = DataFrame( {"col_1": agg_result1, "col_2": agg_result2, "col_3": agg_result3}, index=idx ) @@ -1049,6 +1007,65 @@ def test_grouby_agg_loses_results_with_as_index_false_relabel_multiindex(): tm.assert_frame_equal(result, expected) +def test_groupby_as_index_agg(df): + grouped = df.groupby("A", as_index=False) + + # single-key + + result = grouped[["C", "D"]].agg("mean") + expected = grouped.mean(numeric_only=True) + tm.assert_frame_equal(result, expected) + + result2 = grouped.agg({"C": "mean", "D": "sum"}) + expected2 = grouped.mean(numeric_only=True) + expected2["D"] = grouped.sum()["D"] + tm.assert_frame_equal(result2, expected2) + + grouped = df.groupby("A", as_index=True) + + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + grouped["C"].agg({"Q": "sum"}) + + # multi-key + + grouped = df.groupby(["A", "B"], as_index=False) + + result = grouped.agg("mean") + expected = grouped.mean() + tm.assert_frame_equal(result, expected) + + result2 = grouped.agg({"C": "mean", "D": "sum"}) + expected2 = grouped.mean() + expected2["D"] = grouped.sum()["D"] + tm.assert_frame_equal(result2, expected2) + + expected3 = grouped["C"].sum() + expected3 = DataFrame(expected3).rename(columns={"C": "Q"}) + msg = "nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + grouped["C"].agg({"Q": "sum"}) + + # GH7115 & GH8112 & GH8582 + df = DataFrame( + np.random.default_rng(2).integers(0, 100, (50, 3)), + columns=["jim", "joe", "jolie"], + ) + ts = Series(np.random.default_rng(2).integers(5, 10, 50), name="jim") + + gr = df.groupby(ts) + gr.nth(0) # invokes set_selection_from_grouper internally + + for attr in ["mean", "max", "count", "idxmax", "cumsum", "all"]: + gr = df.groupby(ts, as_index=False) + left = getattr(gr, attr)() + + gr = df.groupby(ts.values, as_index=True) + right = getattr(gr, 
attr)().reset_index(drop=True) + + tm.assert_frame_equal(left, right) + + @pytest.mark.parametrize( "func", [lambda s: s.mean(), lambda s: np.mean(s), lambda s: np.nanmean(s)] ) @@ -1109,7 +1126,7 @@ def test_aggregate_mixed_types(): expected = DataFrame( expected_data, index=Index([2, "group 1"], dtype="object", name="grouping"), - columns=Index(["X", "Y", "Z"], dtype="object"), + columns=Index(["X", "Y", "Z"]), ) tm.assert_frame_equal(result, expected) @@ -1188,9 +1205,7 @@ def test_agg_with_one_lambda(self): # check pd.NameAgg case result1 = df.groupby(by="kind").agg( - height_sqr_min=pd.NamedAgg( - column="height", aggfunc=lambda x: np.min(x**2) - ), + height_sqr_min=pd.NamedAgg(column="height", aggfunc=lambda x: np.min(x**2)), height_max=pd.NamedAgg(column="height", aggfunc="max"), weight_max=pd.NamedAgg(column="weight", aggfunc="max"), ) @@ -1245,9 +1260,7 @@ def test_agg_multiple_lambda(self): # check pd.NamedAgg case result2 = df.groupby(by="kind").agg( - height_sqr_min=pd.NamedAgg( - column="height", aggfunc=lambda x: np.min(x**2) - ), + height_sqr_min=pd.NamedAgg(column="height", aggfunc=lambda x: np.min(x**2)), height_max=pd.NamedAgg(column="height", aggfunc="max"), weight_max=pd.NamedAgg(column="weight", aggfunc="max"), height_max_2=pd.NamedAgg(column="height", aggfunc=lambda x: np.max(x)), @@ -1256,6 +1269,25 @@ def test_agg_multiple_lambda(self): tm.assert_frame_equal(result2, expected) +def test_pass_args_kwargs_duplicate_columns(tsframe, as_index): + # go through _aggregate_frame with self.axis == 0 and duplicate columns + tsframe.columns = ["A", "B", "A", "C"] + gb = tsframe.groupby(lambda x: x.month, as_index=as_index) + + res = gb.agg(np.percentile, 80, axis=0) + + ex_data = { + 1: tsframe[tsframe.index.month == 1].quantile(0.8), + 2: tsframe[tsframe.index.month == 2].quantile(0.8), + } + expected = DataFrame(ex_data).T + if not as_index: + expected.insert(0, "index", [1, 2]) + expected.index = Index(range(2)) + + tm.assert_frame_equal(res, expected) + + def test_groupby_get_by_index(): # GH 33439 df = DataFrame({"A": ["S", "W", "W"], "B": [1.0, 1.0, 2.0]}) @@ -1513,19 +1545,6 @@ def test_groupby_complex_raises(func): data.groupby(data.index % 2).agg(func) -@pytest.mark.parametrize( - "func", [["min"], ["mean", "max"], {"b": "sum"}, {"b": "prod", "c": "median"}] -) -def test_multi_axis_1_raises(func): - # GH#46995 - df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5], "c": [6, 7, 8]}) - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - gb = df.groupby("a", axis=1) - with pytest.raises(NotImplementedError, match="axis other than 0 is not supported"): - gb.agg(func) - - @pytest.mark.parametrize( "test, constant", [ @@ -1670,3 +1689,121 @@ def func(x): msg = "length must not be 0" with pytest.raises(ValueError, match=msg): df.groupby("A", observed=False).agg(func) + + +def test_groupby_aggregation_duplicate_columns_single_dict_value(): + # GH#55041 + df = DataFrame( + [[1, 2, 3, 4], [1, 3, 4, 5], [2, 4, 5, 6]], + columns=["a", "b", "c", "c"], + ) + gb = df.groupby("a") + result = gb.agg({"c": "sum"}) + + expected = DataFrame( + [[7, 9], [5, 6]], columns=["c", "c"], index=Index([1, 2], name="a") + ) + tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregation_duplicate_columns_multiple_dict_values(): + # GH#55041 + df = DataFrame( + [[1, 2, 3, 4], [1, 3, 4, 5], [2, 4, 5, 6]], + columns=["a", "b", "c", "c"], + ) + gb = df.groupby("a") + result = gb.agg({"c": ["sum", "min", "max", "min"]}) + + expected 
= DataFrame( + [[7, 3, 4, 3, 9, 4, 5, 4], [5, 5, 5, 5, 6, 6, 6, 6]], + columns=MultiIndex( + levels=[["c"], ["sum", "min", "max"]], + codes=[[0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 2, 1, 0, 1, 2, 1]], + ), + index=Index([1, 2], name="a"), + ) + tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregation_duplicate_columns_some_empty_result(): + # GH#55041 + df = DataFrame( + [ + [1, 9843, 43, 54, 7867], + [2, 940, 9, -34, 44], + [1, -34, -546, -549358, 0], + [2, 244, -33, -100, 44], + ], + columns=["a", "b", "b", "c", "c"], + ) + gb = df.groupby("a") + result = gb.agg({"b": [], "c": ["var"]}) + + expected = DataFrame( + [[1.509268e11, 30944844.5], [2.178000e03, 0.0]], + columns=MultiIndex(levels=[["c"], ["var"]], codes=[[0, 0], [0, 0]]), + index=Index([1, 2], name="a"), + ) + tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregation_multi_index_duplicate_columns(): + # GH#55041 + df = DataFrame( + [ + [1, -9843, 43, 54, 7867], + [2, 940, 9, -34, 44], + [1, -34, 546, -549358, 0], + [2, 244, -33, -100, 44], + ], + columns=MultiIndex( + levels=[["level1.1", "level1.2"], ["level2.1", "level2.2"]], + codes=[[0, 0, 0, 1, 1], [0, 1, 1, 0, 1]], + ), + index=MultiIndex( + levels=[["level1.1", "level1.2"], ["level2.1", "level2.2"]], + codes=[[0, 0, 0, 1], [0, 1, 1, 0]], + ), + ) + gb = df.groupby(level=0) + result = gb.agg({("level1.1", "level2.2"): "min"}) + + expected = DataFrame( + [[-9843, 9], [244, -33]], + columns=MultiIndex(levels=[["level1.1"], ["level2.2"]], codes=[[0, 0], [0, 0]]), + index=Index(["level1.1", "level1.2"]), + ) + tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregation_func_list_multi_index_duplicate_columns(): + # GH#55041 + df = DataFrame( + [ + [1, -9843, 43, 54, 7867], + [2, 940, 9, -34, 44], + [1, -34, 546, -549358, 0], + [2, 244, -33, -100, 44], + ], + columns=MultiIndex( + levels=[["level1.1", "level1.2"], ["level2.1", "level2.2"]], + codes=[[0, 0, 0, 1, 1], [0, 1, 1, 0, 1]], + ), + index=MultiIndex( + levels=[["level1.1", "level1.2"], ["level2.1", "level2.2"]], + codes=[[0, 0, 0, 1], [0, 1, 1, 0]], + ), + ) + gb = df.groupby(level=0) + result = gb.agg({("level1.1", "level2.2"): ["min", "max"]}) + + expected = DataFrame( + [[-9843, 940, 9, 546], [244, 244, -33, -33]], + columns=MultiIndex( + levels=[["level1.1"], ["level2.2"], ["min", "max"]], + codes=[[0, 0, 0, 0], [0, 0, 0, 0], [0, 1, 0, 1]], + ), + index=Index(["level1.1", "level1.2"]), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 5c99882cef6d2..a706ea795a0e2 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -21,7 +21,6 @@ bdate_range, ) import pandas._testing as tm -import pandas.core.common as com @pytest.mark.parametrize( @@ -68,7 +67,7 @@ def test_cythonized_aggers(op_name): expd = {} for (cat1, cat2), group in grouped: expd.setdefault(cat1, {})[cat2] = op(group["C"]) - exp = DataFrame(expd).T.stack(future_stack=True) + exp = DataFrame(expd).T.stack() exp.index.names = ["A", "B"] exp.name = "C" @@ -85,10 +84,8 @@ def test_cython_agg_boolean(): } ) result = frame.groupby("a")["b"].mean() - msg = "using SeriesGroupBy.mean" - with tm.assert_produces_warning(FutureWarning, match=msg): - # GH#53425 - expected = frame.groupby("a")["b"].agg(np.mean) + # GH#53425 + expected = frame.groupby("a")["b"].agg(np.mean) tm.assert_series_equal(result, expected) @@ -108,7 +105,9 @@ def 
test_cython_agg_nothing_to_agg(): result = frame[["b"]].groupby(frame["a"]).mean(numeric_only=True) expected = DataFrame( - [], index=frame["a"].sort_values().drop_duplicates(), columns=[] + [], + index=frame["a"].sort_values().drop_duplicates(), + columns=Index([], dtype="str"), ) tm.assert_frame_equal(result, expected) @@ -126,21 +125,6 @@ def test_cython_agg_nothing_to_agg_with_dates(): frame.groupby("b").dates.mean(numeric_only=True) -def test_cython_agg_frame_columns(): - # #2113 - df = DataFrame({"x": [1, 2, 3], "y": [3, 4, 5]}) - - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - df.groupby(level=0, axis="columns").mean() - with tm.assert_produces_warning(FutureWarning, match=msg): - df.groupby(level=0, axis="columns").mean() - with tm.assert_produces_warning(FutureWarning, match=msg): - df.groupby(level=0, axis="columns").mean() - with tm.assert_produces_warning(FutureWarning, match=msg): - df.groupby(level=0, axis="columns").mean() - - def test_cython_agg_return_dict(): # GH 16741 df = DataFrame( @@ -163,14 +147,11 @@ def test_cython_agg_return_dict(): def test_cython_fail_agg(): dr = bdate_range("1/1/2000", periods=50) - ts = Series(["A", "B", "C", "D", "E"] * 10, index=dr) + ts = Series(["A", "B", "C", "D", "E"] * 10, dtype=object, index=dr) grouped = ts.groupby(lambda x: x.month) summed = grouped.sum() - msg = "using SeriesGroupBy.sum" - with tm.assert_produces_warning(FutureWarning, match=msg): - # GH#53425 - expected = grouped.agg(np.sum) + expected = grouped.agg(np.sum).astype(object) tm.assert_series_equal(summed, expected) @@ -191,13 +172,12 @@ def test_cython_fail_agg(): def test__cython_agg_general(op, targop): df = DataFrame(np.random.default_rng(2).standard_normal(1000)) labels = np.random.default_rng(2).integers(0, 50, size=1000).astype(float) + kwargs = {"ddof": 1} if op == "var" else {} + if op not in ["first", "last"]: + kwargs["axis"] = 0 result = df.groupby(labels)._cython_agg_general(op, alt=None, numeric_only=True) - warn = FutureWarning if targop in com._cython_table else None - msg = f"using DataFrameGroupBy.{op}" - with tm.assert_produces_warning(warn, match=msg): - # GH#53425 - expected = df.groupby(labels).agg(targop) + expected = df.groupby(labels).agg(targop, **kwargs) tm.assert_frame_equal(result, expected) @@ -307,7 +287,7 @@ def test_read_only_buffer_source_agg(agg): "species": ["setosa", "setosa", "setosa", "setosa", "setosa"], } ) - df._mgr.arrays[0].flags.writeable = False + df._mgr.blocks[0].values.flags.writeable = False result = df.groupby(["species"]).agg({"sepal_length": agg}) expected = df.copy().groupby(["species"]).agg({"sepal_length": agg}) diff --git a/pandas/tests/groupby/aggregate/test_numba.py b/pandas/tests/groupby/aggregate/test_numba.py index ee694129f7118..afddc90fdd055 100644 --- a/pandas/tests/groupby/aggregate/test_numba.py +++ b/pandas/tests/groupby/aggregate/test_numba.py @@ -1,6 +1,7 @@ import numpy as np import pytest +from pandas.compat import is_platform_arm from pandas.errors import NumbaUtilError from pandas import ( @@ -11,8 +12,17 @@ option_context, ) import pandas._testing as tm +from pandas.util.version import Version -pytestmark = pytest.mark.single_cpu +pytestmark = [pytest.mark.single_cpu] + +numba = pytest.importorskip("numba") +pytestmark.append( + pytest.mark.skipif( + Version(numba.__version__) == Version("0.61") and is_platform_arm(), + reason=f"Segfaults on ARM platforms with numba {numba.__version__}", + ) +) def 
test_correct_function_signature(): @@ -35,26 +45,49 @@ def incorrect_function(x): def test_check_nopython_kwargs(): pytest.importorskip("numba") - def incorrect_function(values, index): - return sum(values) * 2.7 + def incorrect_function(values, index, *, a): + return sum(values) * 2.7 + a + + def correct_function(values, index, a): + return sum(values) * 2.7 + a data = DataFrame( {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=["key", "data"], ) + expected = data.groupby("key").sum() * 2.7 + + # py signature binding + with pytest.raises( + TypeError, match="missing a required (keyword-only argument|argument): 'a'" + ): + data.groupby("key").agg(incorrect_function, engine="numba", b=1) + with pytest.raises(TypeError, match="missing a required argument: 'a'"): + data.groupby("key").agg(correct_function, engine="numba", b=1) + + with pytest.raises( + TypeError, match="missing a required (keyword-only argument|argument): 'a'" + ): + data.groupby("key")["data"].agg(incorrect_function, engine="numba", b=1) + with pytest.raises(TypeError, match="missing a required argument: 'a'"): + data.groupby("key")["data"].agg(correct_function, engine="numba", b=1) + + # numba signature check after binding with pytest.raises(NumbaUtilError, match="numba does not support"): data.groupby("key").agg(incorrect_function, engine="numba", a=1) + actual = data.groupby("key").agg(correct_function, engine="numba", a=1) + tm.assert_frame_equal(expected + 1, actual) with pytest.raises(NumbaUtilError, match="numba does not support"): data.groupby("key")["data"].agg(incorrect_function, engine="numba", a=1) + actual = data.groupby("key")["data"].agg(correct_function, engine="numba", a=1) + tm.assert_series_equal(expected["data"] + 1, actual) @pytest.mark.filterwarnings("ignore") # Filter warnings when parallel=True and the function can't be parallelized by Numba @pytest.mark.parametrize("jit", [True, False]) -@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"]) -@pytest.mark.parametrize("as_index", [True, False]) -def test_numba_vs_cython(jit, pandas_obj, nogil, parallel, nopython, as_index): +def test_numba_vs_cython(jit, frame_or_series, nogil, parallel, nopython, as_index): pytest.importorskip("numba") def func_numba(values, index): @@ -71,7 +104,7 @@ def func_numba(values, index): ) engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} grouped = data.groupby(0, as_index=as_index) - if pandas_obj == "Series": + if frame_or_series is Series: grouped = grouped[1] result = grouped.agg(func_numba, engine="numba", engine_kwargs=engine_kwargs) @@ -83,8 +116,7 @@ def func_numba(values, index): @pytest.mark.filterwarnings("ignore") # Filter warnings when parallel=True and the function can't be parallelized by Numba @pytest.mark.parametrize("jit", [True, False]) -@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"]) -def test_cache(jit, pandas_obj, nogil, parallel, nopython): +def test_cache(jit, frame_or_series, nogil, parallel, nopython): # Test that the functions are cached correctly if we switch functions pytest.importorskip("numba") @@ -105,7 +137,7 @@ def func_2(values, index): ) engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} grouped = data.groupby(0) - if pandas_obj == "Series": + if frame_or_series is Series: grouped = grouped[1] result = grouped.agg(func_1, engine="numba", engine_kwargs=engine_kwargs) @@ -164,6 +196,23 @@ def test_multifunc_numba_vs_cython_frame(agg_kwargs): tm.assert_frame_equal(result, expected) 
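# A minimal sketch of the engine="numba" aggregation contract tested above,
# assuming the optional numba dependency is installed: the user function
# receives (values, index), where values is the group's NumPy array, and any
# extra aggregation arguments must bind to named parameters of the function.
# Illustrative sketch only; scaled_sum and factor are made-up names.
import pandas as pd

def scaled_sum(values, index, factor):
    # compiled by numba in nopython mode, so stick to plain NumPy operations
    return values.sum() * factor

df = pd.DataFrame({"key": ["a", "a", "b"], "data": [1.0, 2.0, 3.0]})
res = df.groupby("key")["data"].agg(scaled_sum, engine="numba", factor=2.0)
# equivalent to 2.0 * df.groupby("key")["data"].sum()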
+@pytest.mark.parametrize("func", ["sum", "mean", "var", "std", "min", "max"]) +def test_multifunc_numba_vs_cython_frame_noskipna(func): + pytest.importorskip("numba") + data = DataFrame( + { + 0: ["a", "a", "b", "b", "a"], + 1: [1.0, np.nan, 3.0, 4.0, 5.0], + 2: [1, 2, 3, 4, 5], + }, + columns=[0, 1, 2], + ) + grouped = data.groupby(0) + result = grouped.agg(func, skipna=False, engine="numba") + expected = grouped.agg(func, skipna=False, engine="cython") + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "agg_kwargs,expected_func", [ diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 0596193c137e1..1c016143d50c3 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -209,7 +209,7 @@ def test_aggregate_api_consistency(): expected = pd.concat([c_mean, c_sum, d_mean, d_sum], axis=1) expected.columns = MultiIndex.from_product([["C", "D"], ["mean", "sum"]]) - msg = r"Column\(s\) \['r', 'r2'\] do not exist" + msg = r"Label\(s\) \['r', 'r2'\] do not exist" with pytest.raises(KeyError, match=msg): grouped[["D", "C"]].agg({"r": "sum", "r2": "mean"}) @@ -224,7 +224,7 @@ def test_agg_dict_renaming_deprecation(): {"B": {"foo": ["sum", "max"]}, "C": {"bar": ["count", "min"]}} ) - msg = r"Column\(s\) \['ma'\] do not exist" + msg = r"Label\(s\) \['ma'\] do not exist" with pytest.raises(KeyError, match=msg): df.groupby("A")[["B", "C"]].agg({"ma": "max"}) @@ -355,7 +355,8 @@ def test_series_agg_multi_pure_python(): ) def bad(x): - assert len(x.values.base) > 0 + if isinstance(x.values, np.ndarray): + assert len(x.values.base) > 0 return "foo" result = data.groupby(["A", "B"]).agg(bad) @@ -410,10 +411,7 @@ def __call__(self, x): expected = df.groupby("foo").agg("sum") for ecall in equiv_callables: - warn = FutureWarning if ecall is sum or ecall is np.sum else None - msg = "using DataFrameGroupBy.sum" - with tm.assert_produces_warning(warn, match=msg): - result = df.groupby("foo").agg(ecall) + result = df.groupby("foo").agg(ecall) tm.assert_frame_equal(result, expected) @@ -501,17 +499,13 @@ def test_agg_timezone_round_trip(): assert ts == grouped.first()["B"].iloc[0] # GH#27110 applying iloc should return a DataFrame - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1] + assert ts == grouped.apply(lambda x: x.iloc[0])["B"].iloc[0] ts = df["B"].iloc[2] assert ts == grouped.last()["B"].iloc[0] # GH#27110 applying iloc should return a DataFrame - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1] + assert ts == grouped.apply(lambda x: x.iloc[-1])["B"].iloc[0] def test_sum_uint64_overflow(): @@ -540,46 +534,44 @@ def test_sum_uint64_overflow(): @pytest.mark.parametrize( - "structure, expected", + "structure, cast_as", [ - (tuple, DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}})), - (list, DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}})), - ( - lambda x: tuple(x), - DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}}), - ), - ( - lambda x: list(x), - DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}}), - ), + (tuple, tuple), + (list, list), + (lambda x: tuple(x), tuple), + (lambda x: list(x), list), ], ) -def test_agg_structs_dataframe(structure, expected): +def 
test_agg_structs_dataframe(structure, cast_as): df = DataFrame( {"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]} ) result = df.groupby(["A", "B"]).aggregate(structure) + expected = DataFrame( + {"C": {(1, 1): cast_as([1, 1, 1]), (3, 4): cast_as([3, 4, 4])}} + ) expected.index.names = ["A", "B"] tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( - "structure, expected", + "structure, cast_as", [ - (tuple, Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")), - (list, Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")), - (lambda x: tuple(x), Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")), - (lambda x: list(x), Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")), + (tuple, tuple), + (list, list), + (lambda x: tuple(x), tuple), + (lambda x: list(x), list), ], ) -def test_agg_structs_series(structure, expected): +def test_agg_structs_series(structure, cast_as): # Issue #18079 df = DataFrame( {"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]} ) result = df.groupby("A")["C"].aggregate(structure) + expected = Series([cast_as([1, 1, 1]), cast_as([3, 4, 4])], index=[1, 3], name="C") expected.index.name = "A" tm.assert_series_equal(result, expected) @@ -589,9 +581,7 @@ def test_agg_category_nansum(observed): df = DataFrame( {"A": pd.Categorical(["a", "a", "b"], categories=categories), "B": [1, 2, 3]} ) - msg = "using SeriesGroupBy.sum" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("A", observed=observed).B.agg(np.nansum) + result = df.groupby("A", observed=observed).B.agg(np.nansum) expected = Series( [3, 3, 0], index=pd.CategoricalIndex(["a", "b", "c"], categories=categories, name="A"), diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py index dce3f072ed903..2745f7c2b8d0f 100644 --- a/pandas/tests/groupby/conftest.py +++ b/pandas/tests/groupby/conftest.py @@ -13,26 +13,6 @@ ) -@pytest.fixture(params=[True, False]) -def sort(request): - return request.param - - -@pytest.fixture(params=[True, False]) -def as_index(request): - return request.param - - -@pytest.fixture(params=[True, False]) -def dropna(request): - return request.param - - -@pytest.fixture(params=[True, False]) -def observed(request): - return request.param - - @pytest.fixture def df(): return DataFrame( @@ -112,7 +92,7 @@ def three_group(): ) -@pytest.fixture() +@pytest.fixture def slice_test_df(): data = [ [0, "a", "a0_at_0"], @@ -128,7 +108,7 @@ def slice_test_df(): return df.set_index("Index") -@pytest.fixture() +@pytest.fixture def slice_test_grouped(slice_test_df): return slice_test_df.groupby("Group", as_index=False) @@ -153,28 +133,6 @@ def groupby_func(request): return request.param -@pytest.fixture(params=[True, False]) -def parallel(request): - """parallel keyword argument for numba.jit""" - return request.param - - -# Can parameterize nogil & nopython over True | False, but limiting per -# https://github.com/pandas-dev/pandas/pull/41971#issuecomment-860607472 - - -@pytest.fixture(params=[False]) -def nogil(request): - """nogil keyword argument for numba.jit""" - return request.param - - -@pytest.fixture(params=[True]) -def nopython(request): - """nopython keyword argument for numba.jit""" - return request.param - - @pytest.fixture( params=[ ("mean", {}), diff --git a/pandas/tests/groupby/methods/test_corrwith.py b/pandas/tests/groupby/methods/test_corrwith.py deleted file mode 100644 index 53e8bdc4534dc..0000000000000 --- a/pandas/tests/groupby/methods/test_corrwith.py +++ 
/dev/null @@ -1,24 +0,0 @@ -import numpy as np - -from pandas import ( - DataFrame, - Index, - Series, -) -import pandas._testing as tm - - -def test_corrwith_with_1_axis(): - # GH 47723 - df = DataFrame({"a": [1, 1, 2], "b": [3, 7, 4]}) - gb = df.groupby("a") - - msg = "DataFrameGroupBy.corrwith with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = gb.corrwith(df, axis=1) - index = Index( - data=[(1, 0), (1, 1), (1, 2), (2, 2), (2, 0), (2, 1)], - name=("a", None), - ) - expected = Series([np.nan] * 6, index=index) - tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/methods/test_describe.py b/pandas/tests/groupby/methods/test_describe.py index a2440e09dfc02..72bdfebc5eeb7 100644 --- a/pandas/tests/groupby/methods/test_describe.py +++ b/pandas/tests/groupby/methods/test_describe.py @@ -35,7 +35,7 @@ def test_series_describe_single(): ) grouped = ts.groupby(lambda x: x.month) result = grouped.apply(lambda x: x.describe()) - expected = grouped.describe().stack(future_stack=True) + expected = grouped.describe().stack() tm.assert_series_equal(result, expected) @@ -79,7 +79,7 @@ def test_frame_describe_multikey(tsframe): group = grouped[col].describe() # GH 17464 - Remove duplicate MultiIndex levels group_col = MultiIndex( - levels=[[col], group.columns], + levels=[Index([col], dtype=tsframe.columns.dtype), group.columns], codes=[[0] * len(group.columns), range(len(group.columns))], ) group = DataFrame(group.values, columns=group_col, index=group.index) @@ -87,35 +87,25 @@ def test_frame_describe_multikey(tsframe): expected = pd.concat(desc_groups, axis=1) tm.assert_frame_equal(result, expected) - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1) - result = groupedT.describe() - expected = tsframe.describe().T - # reverting the change from https://github.com/pandas-dev/pandas/pull/35441/ - expected.index = MultiIndex( - levels=[[0, 1], expected.index], - codes=[[0, 0, 1, 1], range(len(expected.index))], - ) - tm.assert_frame_equal(result, expected) - def test_frame_describe_tupleindex(): # GH 14848 - regression from 0.19.0 to 0.19.1 - df1 = DataFrame( + name = "k" + df = DataFrame( { "x": [1, 2, 3, 4, 5] * 3, - "y": [10, 20, 30, 40, 50] * 3, - "z": [100, 200, 300, 400, 500] * 3, + name: [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5, } ) - df1["k"] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5 - df2 = df1.rename(columns={"k": "key"}) - msg = "Names should be list-like for a MultiIndex" - with pytest.raises(ValueError, match=msg): - df1.groupby("k").describe() - with pytest.raises(ValueError, match=msg): - df2.groupby("key").describe() + result = df.groupby(name).describe() + expected = DataFrame( + [[5.0, 3.0, 1.581139, 1.0, 2.0, 3.0, 4.0, 5.0]] * 3, + index=Index([(0, 0, 1), (0, 1, 0), (1, 0, 0)], tupleize_cols=False, name=name), + columns=MultiIndex.from_arrays( + [["x"] * 8, ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]] + ), + ) + tm.assert_frame_equal(result, expected) def test_frame_describe_unstacked_format(): @@ -149,7 +139,6 @@ def test_frame_describe_unstacked_format(): "indexing past lexsort depth may impact performance:" "pandas.errors.PerformanceWarning" ) -@pytest.mark.parametrize("as_index", [True, False]) @pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]]) def test_describe_with_duplicate_output_column_names(as_index, keys): # GH 35314 @@ -213,63 +202,48 @@ def 
test_describe_duplicate_columns(): gb = df.groupby(df[1]) result = gb.describe(percentiles=[]) - columns = ["count", "mean", "std", "min", "50%", "max"] + columns = ["count", "mean", "std", "min", "max"] frames = [ - DataFrame([[1.0, val, np.nan, val, val, val]], index=[1], columns=columns) + DataFrame([[1.0, val, np.nan, val, val]], index=[1], columns=columns) for val in (0.0, 2.0, 3.0) ] expected = pd.concat(frames, axis=1) expected.columns = MultiIndex( levels=[[0, 2], columns], - codes=[6 * [0] + 6 * [1] + 6 * [0], 3 * list(range(6))], + codes=[5 * [0] + 5 * [1] + 5 * [0], 3 * list(range(5))], ) expected.index.names = [1] tm.assert_frame_equal(result, expected) -class TestGroupByNonCythonPaths: +def test_describe_non_cython_paths(): # GH#5610 non-cython calls should not include the grouper # Tests for code not expected to go through cython paths. + df = DataFrame( + [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]], + columns=["A", "B", "C"], + ) + gb = df.groupby("A") + expected_index = Index([1, 3], name="A") + expected_col = MultiIndex( + levels=[["B"], ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]], + codes=[[0] * 8, list(range(8))], + ) + expected = DataFrame( + [ + [1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0], + [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], + ], + index=expected_index, + columns=expected_col, + ) + result = gb.describe() + tm.assert_frame_equal(result, expected) - @pytest.fixture - def df(self): - df = DataFrame( - [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]], - columns=["A", "B", "C"], - ) - return df - - @pytest.fixture - def gb(self, df): - gb = df.groupby("A") - return gb - - @pytest.fixture - def gni(self, df): - gni = df.groupby("A", as_index=False) - return gni - - def test_describe(self, df, gb, gni): - # describe - expected_index = Index([1, 3], name="A") - expected_col = MultiIndex( - levels=[["B"], ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]], - codes=[[0] * 8, list(range(8))], - ) - expected = DataFrame( - [ - [1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0], - [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], - ], - index=expected_index, - columns=expected_col, - ) - result = gb.describe() - tm.assert_frame_equal(result, expected) - - expected = expected.reset_index() - result = gni.describe() - tm.assert_frame_equal(result, expected) + gni = df.groupby("A", as_index=False) + expected = expected.reset_index() + result = gni.describe() + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype", [int, float, object]) @@ -293,5 +267,5 @@ def test_groupby_empty_dataset(dtype, kwargs): result = df.iloc[:0].groupby("A").B.describe(**kwargs) expected = df.groupby("A").B.describe(**kwargs).reset_index(drop=True).iloc[:0] - expected.index = Index([]) + expected.index = Index([], dtype=df.columns.dtype) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/methods/test_groupby_shift_diff.py b/pandas/tests/groupby/methods/test_groupby_shift_diff.py index 94e672d4892fe..1256046d81949 100644 --- a/pandas/tests/groupby/methods/test_groupby_shift_diff.py +++ b/pandas/tests/groupby/methods/test_groupby_shift_diff.py @@ -63,7 +63,7 @@ def test_group_shift_with_fill_value(): def test_group_shift_lose_timezone(): # GH 30134 - now_dt = Timestamp.utcnow().as_unit("ns") + now_dt = Timestamp.now("UTC").as_unit("ns") df = DataFrame({"a": [1, 1], "date": now_dt}) result = df.groupby("a").shift(0).iloc[0] expected = Series({"date": now_dt}, name=result.name) @@ -167,14 
+167,12 @@ def test_shift_periods_freq():
     tm.assert_frame_equal(result, expected)


-def test_shift_deprecate_freq_and_fill_value():
+def test_shift_disallow_freq_and_fill_value():
     # GH 53832
     data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]}
     df = DataFrame(data, index=date_range(start="20100101", periods=6))
-    msg = (
-        "Passing a 'freq' together with a 'fill_value' silently ignores the fill_value"
-    )
-    with tm.assert_produces_warning(FutureWarning, match=msg):
+    msg = "Passing a 'freq' together with a 'fill_value'"
+    with pytest.raises(ValueError, match=msg):
         df.groupby(df.index).shift(periods=-2, freq="D", fill_value="1")
@@ -247,9 +245,6 @@ def test_group_shift_with_multiple_periods_and_both_fill_and_freq_deprecated():
         {"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]},
         index=date_range("1/1/2000", periods=5, freq="h"),
     )
-    msg = (
-        "Passing a 'freq' together with a 'fill_value' silently ignores the "
-        "fill_value"
-    )
-    with tm.assert_produces_warning(FutureWarning, match=msg):
+    msg = "Passing a 'freq' together with a 'fill_value'"
+    with pytest.raises(ValueError, match=msg):
         df.groupby("b")[["a"]].shift([1, 2], fill_value=1, freq="h")
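Both hunks above replace an assert_produces_warning(FutureWarning, ...) block with pytest.raises(ValueError, ...): combining freq with fill_value in a groupby shift, which previously warned and silently ignored the fill_value, is now a hard error. A hedged sketch of the behavior the new assertions expect (the frame is illustrative):

    import pandas as pd

    df = pd.DataFrame(
        {"a": [1, 2, 3, 4], "b": [0, 0, 1, 1]},
        index=pd.date_range("2010-01-01", periods=4),
    )

    df.groupby("b").shift(periods=1, freq="D")      # ok: shifts the index
    df.groupby("b").shift(periods=1, fill_value=0)  # ok: fills vacated rows

    # Passing both now raises instead of warning:
    try:
        df.groupby("b").shift(periods=1, freq="D", fill_value=0)
    except ValueError as err:
        print(err)  # Passing a 'freq' together with a 'fill_value' ...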
diff --git a/pandas/tests/groupby/methods/test_kurt.py b/pandas/tests/groupby/methods/test_kurt.py
new file mode 100644
index 0000000000000..21b7c50c3c5aa
--- /dev/null
+++ b/pandas/tests/groupby/methods/test_kurt.py
@@ -0,0 +1,90 @@
+import numpy as np
+import pytest
+
+import pandas.util._test_decorators as td
+
+import pandas as pd
+import pandas._testing as tm
+
+
+def test_groupby_kurt_equivalence():
+    # GH#40139
+    # Test that the groupby kurt method (which uses libgroupby.group_kurt)
+    # matches the results of operating group-by-group (which uses nanops.nankurt)
+    nrows = 1000
+    ngroups = 3
+    ncols = 2
+    nan_frac = 0.05
+
+    arr = np.random.default_rng(2).standard_normal((nrows, ncols))
+    arr[np.random.default_rng(2).random(nrows) < nan_frac] = np.nan
+
+    df = pd.DataFrame(arr)
+    grps = np.random.default_rng(2).integers(0, ngroups, size=nrows)
+    gb = df.groupby(grps)
+
+    result = gb.kurt()
+
+    grpwise = [grp.kurt().to_frame(i).T for i, grp in gb]
+    expected = pd.concat(grpwise, axis=0)
+    expected.index = expected.index.astype("int64")  # 32bit builds
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow")),
+        "Float64",
+    ],
+)
+def test_groupby_kurt_arrow_float64(dtype):
+    # GH#40139
+    # Test groupby.kurt() with float64[pyarrow] and Float64 dtypes
+    df = pd.DataFrame(
+        {
+            "x": [1.0, np.nan, 3.2, 4.8, 2.3, 1.9, 8.9],
+            "y": [1.6, 3.3, 3.2, 6.8, 1.3, 2.9, 9.0],
+        },
+        dtype=dtype,
+    )
+    gb = df.groupby(by=lambda x: 0)
+
+    result = gb.kurt()
+    expected = pd.DataFrame({"x": [2.1644713], "y": [0.1513969]}, dtype=dtype)
+    tm.assert_almost_equal(result, expected)
+
+
+def test_groupby_kurt_noskipna():
+    # GH#40139
+    # Test groupby.kurt() with skipna = False
+    df = pd.DataFrame(
+        {
+            "x": [1.0, np.nan, 3.2, 4.8, 2.3, 1.9, 8.9],
+            "y": [1.6, 3.3, 3.2, 6.8, 1.3, 2.9, 9.0],
+        }
+    )
+    gb = df.groupby(by=lambda x: 0)
+
+    result = gb.kurt(skipna=False)
+    expected = pd.DataFrame({"x": [np.nan], "y": [0.1513969]})
+    tm.assert_almost_equal(result, expected)
+
+
+def test_groupby_kurt_all_ones():
+    # GH#40139
+    # Test groupby.kurt() with constant values
+    df = pd.DataFrame(
+        {
+            "x": [1.0] * 10,
+        }
+    )
+    gb = df.groupby(by=lambda x: 0)
+
+    result = gb.kurt(skipna=False)
+    expected = pd.DataFrame(
+        {
+            "x": [0.0],  # Same behavior as pd.DataFrame.kurt()
+        }
+    )
+    tm.assert_almost_equal(result, expected)
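The new module covers DataFrameGroupBy.kurt(), which is expected to behave like applying DataFrame.kurt() group by group, including on nullable and pyarrow-backed floats. A short usage sketch under that assumption (data chosen arbitrarily; kurtosis needs at least four non-NaN values per group for a finite result):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        {
            "key": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
            "x": [1.0, 2.0, 3.0, 9.0, np.nan, 1.0, 5.0, 2.0, 8.0, 3.0],
        }
    )
    gb = df.groupby("key")

    gb.kurt()              # NaNs dropped per group, like DataFrame.kurt()
    gb.kurt(skipna=False)  # any NaN in a group makes that group's kurt NaN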
diff --git a/pandas/tests/groupby/methods/test_nlargest_nsmallest.py b/pandas/tests/groupby/methods/test_nlargest_nsmallest.py
index bf983f04a3f3f..1225c325c4c23 100644
--- a/pandas/tests/groupby/methods/test_nlargest_nsmallest.py
+++ b/pandas/tests/groupby/methods/test_nlargest_nsmallest.py
@@ -99,17 +99,16 @@ def test_nsmallest():
     [([0, 1, 2, 3], [0, 0, 1, 1]), ([0], [0])],
 )
 @pytest.mark.parametrize("dtype", [None, *tm.ALL_INT_NUMPY_DTYPES])
-@pytest.mark.parametrize("method", ["nlargest", "nsmallest"])
-def test_nlargest_and_smallest_noop(data, groups, dtype, method):
+def test_nlargest_and_smallest_noop(data, groups, dtype, nselect_method):
     # GH 15272, GH 16345, GH 29129
     # Test nlargest/smallest when it results in a noop,
     # i.e. input is sorted and group size <= n
     if dtype is not None:
         data = np.array(data, dtype=dtype)
-    if method == "nlargest":
+    if nselect_method == "nlargest":
         data = list(reversed(data))
     ser = Series(data, name="a")
-    result = getattr(ser.groupby(groups), method)(n=2)
+    result = getattr(ser.groupby(groups), nselect_method)(n=2)
     expidx = np.array(groups, dtype=int) if isinstance(groups, list) else groups
     expected = Series(data, index=MultiIndex.from_arrays([expidx, ser.index]), name="a")
     tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/groupby/methods/test_nth.py b/pandas/tests/groupby/methods/test_nth.py
index a8ed9e9d52021..d9c8706ec9202 100644
--- a/pandas/tests/groupby/methods/test_nth.py
+++ b/pandas/tests/groupby/methods/test_nth.py
@@ -529,7 +529,6 @@ def test_nth_multi_index_as_expected():
     ],
 )
 @pytest.mark.parametrize("columns", [None, [], ["A"], ["B"], ["A", "B"]])
-@pytest.mark.parametrize("as_index", [True, False])
 def test_groupby_head_tail(op, n, expected_rows, columns, as_index):
     df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])
     g = df.groupby("A", as_index=as_index)
@@ -541,32 +540,6 @@ def test_groupby_head_tail(op, n, expected_rows, columns, as_index):
     tm.assert_frame_equal(result, expected)


-@pytest.mark.parametrize(
-    "op, n, expected_cols",
-    [
-        ("head", -1, [0]),
-        ("head", 0, []),
-        ("head", 1, [0, 2]),
-        ("head", 7, [0, 1, 2]),
-        ("tail", -1, [1]),
-        ("tail", 0, []),
-        ("tail", 1, [1, 2]),
-        ("tail", 7, [0, 1, 2]),
-    ],
-)
-def test_groupby_head_tail_axis_1(op, n, expected_cols):
-    # GH 9772
-    df = DataFrame(
-        [[1, 2, 3], [1, 4, 5], [2, 6, 7], [3, 8, 9]], columns=["A", "B", "C"]
-    )
-    msg = "DataFrame.groupby with axis=1 is deprecated"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        g = df.groupby([0, 0, 1], axis=1)
-    expected = df.iloc[:, expected_cols]
-    result = getattr(g, op)(n)
-    tm.assert_frame_equal(result, expected)
-
-
 def test_group_selection_cache():
     # GH 12839 nth, head, and tail should return same result consistently
     df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])
@@ -707,10 +680,11 @@ def test_first_multi_key_groupby_categorical():
 @pytest.mark.parametrize("method", ["first", "last", "nth"])
 def test_groupby_last_first_nth_with_none(method, nulls_fixture):
     # GH29645
-    expected = Series(["y"])
+    expected = Series(["y"], dtype=object)
     data = Series(
         [nulls_fixture, nulls_fixture, nulls_fixture, "y", nulls_fixture],
         index=[0, 0, 0, 0, 0],
+        dtype=object,
     ).groupby(level=0)

     if method == "nth":
@@ -774,24 +748,6 @@ def test_np_ints(slice_test_df, slice_test_grouped):
     tm.assert_frame_equal(result, expected)


-def test_groupby_nth_with_column_axis():
-    # GH43926
-    df = DataFrame(
-        [
-            [4, 5, 6],
-            [8, 8, 7],
-        ],
-        index=["z", "y"],
-        columns=["C", "B", "A"],
-    )
-    msg = "DataFrame.groupby with axis=1 is deprecated"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        gb = df.groupby(df.iloc[1], axis=1)
-    result = gb.nth(0)
-    expected = df.iloc[:, [0, 2]]
-    tm.assert_frame_equal(result, expected)
-
-
 def test_groupby_nth_interval():
     # GH#24205
     idx_result = MultiIndex(
@@ -815,35 +771,6 @@ def test_groupby_nth_interval():
     tm.assert_frame_equal(result, expected)


-@pytest.mark.parametrize(
-    "start, stop, expected_values, expected_columns",
-    [
-        (None, None, [0, 1, 2, 3, 4], list("ABCDE")),
-        (None, 1, [0, 3], list("AD")),
-        (None, 9, [0, 1, 2, 3, 4], list("ABCDE")),
-        (None, -1, [0, 1, 3], list("ABD")),
-        (1, None, [1, 2, 4], list("BCE")),
-        (1, -1, [1], list("B")),
-        (-1, None, [2, 4], list("CE")),
-        (-1, 2, [4], list("E")),
-    ],
-)
-@pytest.mark.parametrize("method", ["call", "index"])
-def test_nth_slices_with_column_axis(
-    start, stop, expected_values, expected_columns, method
-):
-    df = DataFrame([range(5)], columns=[list("ABCDE")])
-    msg = "DataFrame.groupby with axis=1 is deprecated"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        gb = df.groupby([5, 5, 5, 6, 6], axis=1)
-    result = {
-        "call": lambda start, stop: gb.nth(slice(start, stop)),
-        "index": lambda start, stop: gb.nth[start:stop],
-    }[method](start, stop)
-    expected = DataFrame([expected_values], columns=[expected_columns])
-    tm.assert_frame_equal(result, expected)
-
-
 @pytest.mark.filterwarnings(
     "ignore:invalid value encountered in remainder:RuntimeWarning"
 )
diff --git a/pandas/tests/groupby/methods/test_quantile.py b/pandas/tests/groupby/methods/test_quantile.py
index 361a8c27fbf9d..28cb25b515ed2 100644
--- a/pandas/tests/groupby/methods/test_quantile.py
+++ b/pandas/tests/groupby/methods/test_quantile.py
@@ -38,19 +38,7 @@
     ],
 )
 @pytest.mark.parametrize("q", [0, 0.25, 0.5, 0.75, 1])
-def test_quantile(interpolation, a_vals, b_vals, q, request):
-    if (
-        interpolation == "nearest"
-        and q == 0.5
-        and isinstance(b_vals, list)
-        and b_vals == [4, 3, 2, 1]
-    ):
-        request.applymarker(
-            pytest.mark.xfail(
-                reason="Unclear numpy expectation for nearest "
-                "result with equidistant data"
-            )
-        )
+def test_quantile(interpolation, a_vals, b_vals, q):
     all_vals = pd.concat([pd.Series(a_vals), pd.Series(b_vals)])

     a_expected = pd.Series(a_vals).quantile(q, interpolation=interpolation)
@@ -171,7 +159,8 @@ def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby,

 def test_quantile_raises():
     df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"])

-    with pytest.raises(TypeError, match="cannot be performed against 'object' dtypes"):
+    msg = "dtype '(object|str)' does not support operation 'quantile'"
+    with pytest.raises(TypeError, match=msg):
         df.groupby("key").quantile()


@@ -259,9 +248,8 @@ def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only):
         expected = df.groupby("a")[["b"]].quantile(q)
         tm.assert_frame_equal(result, expected)
     else:
-        with pytest.raises(
-            TypeError, match="'quantile' cannot be performed against 'object' dtypes!"
- ): + msg = "dtype '.*' does not support operation 'quantile'" + with pytest.raises(TypeError, match=msg): df.groupby("a").quantile(q, numeric_only=numeric_only) @@ -389,32 +377,6 @@ def test_groupby_timedelta_quantile(): tm.assert_frame_equal(result, expected) -def test_columns_groupby_quantile(): - # GH 33795 - df = DataFrame( - np.arange(12).reshape(3, -1), - index=list("XYZ"), - columns=pd.Series(list("ABAB"), name="col"), - ) - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - gb = df.groupby("col", axis=1) - result = gb.quantile(q=[0.8, 0.2]) - expected = DataFrame( - [ - [1.6, 0.4, 2.6, 1.4], - [5.6, 4.4, 6.6, 5.4], - [9.6, 8.4, 10.6, 9.4], - ], - index=list("XYZ"), - columns=pd.MultiIndex.from_tuples( - [("A", 0.8), ("A", 0.2), ("B", 0.8), ("B", 0.2)], names=["col", None] - ), - ) - - tm.assert_frame_equal(result, expected) - - def test_timestamp_groupby_quantile(unit): # GH 33168 dti = pd.date_range( diff --git a/pandas/tests/groupby/methods/test_rank.py b/pandas/tests/groupby/methods/test_rank.py index a3b7da3fa836c..1fb878cbc1111 100644 --- a/pandas/tests/groupby/methods/test_rank.py +++ b/pandas/tests/groupby/methods/test_rank.py @@ -480,19 +480,17 @@ def test_rank_avg_even_vals(dtype, upper): tm.assert_frame_equal(result, exp_df) -@pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"]) -@pytest.mark.parametrize("ascending", [True, False]) @pytest.mark.parametrize("na_option", ["keep", "top", "bottom"]) @pytest.mark.parametrize("pct", [True, False]) @pytest.mark.parametrize( "vals", [["bar", "bar", "foo", "bar", "baz"], ["bar", np.nan, "foo", np.nan, "baz"]] ) -def test_rank_object_dtype(ties_method, ascending, na_option, pct, vals): +def test_rank_object_dtype(rank_method, ascending, na_option, pct, vals): df = DataFrame({"key": ["foo"] * 5, "val": vals}) mask = df["val"].isna() gb = df.groupby("key") - res = gb.rank(method=ties_method, ascending=ascending, na_option=na_option, pct=pct) + res = gb.rank(method=rank_method, ascending=ascending, na_option=na_option, pct=pct) # construct our expected by using numeric values with the same ordering if mask.any(): @@ -502,15 +500,13 @@ def test_rank_object_dtype(ties_method, ascending, na_option, pct, vals): gb2 = df2.groupby("key") alt = gb2.rank( - method=ties_method, ascending=ascending, na_option=na_option, pct=pct + method=rank_method, ascending=ascending, na_option=na_option, pct=pct ) tm.assert_frame_equal(res, alt) @pytest.mark.parametrize("na_option", [True, "bad", 1]) -@pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"]) -@pytest.mark.parametrize("ascending", [True, False]) @pytest.mark.parametrize("pct", [True, False]) @pytest.mark.parametrize( "vals", @@ -520,13 +516,13 @@ def test_rank_object_dtype(ties_method, ascending, na_option, pct, vals): [1, np.nan, 2, np.nan, 3], ], ) -def test_rank_naoption_raises(ties_method, ascending, na_option, pct, vals): +def test_rank_naoption_raises(rank_method, ascending, na_option, pct, vals): df = DataFrame({"key": ["foo"] * 5, "val": vals}) msg = "na_option must be one of 'keep', 'top', or 'bottom'" with pytest.raises(ValueError, match=msg): df.groupby("key").rank( - method=ties_method, ascending=ascending, na_option=na_option, pct=pct + method=rank_method, ascending=ascending, na_option=na_option, pct=pct ) @@ -609,80 +605,6 @@ def test_rank_pct_equal_values_on_group_transition(use_nan): tm.assert_series_equal(result, expected) -def test_rank_multiindex(): - 
# GH27721 - df = concat( - { - "a": DataFrame({"col1": [3, 4], "col2": [1, 2]}), - "b": DataFrame({"col3": [5, 6], "col4": [7, 8]}), - }, - axis=1, - ) - - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - gb = df.groupby(level=0, axis=1) - msg = "DataFrameGroupBy.rank with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = gb.rank(axis=1) - - expected = concat( - [ - df["a"].rank(axis=1), - df["b"].rank(axis=1), - ], - axis=1, - keys=["a", "b"], - ) - tm.assert_frame_equal(result, expected) - - -def test_groupby_axis0_rank_axis1(): - # GH#41320 - df = DataFrame( - {0: [1, 3, 5, 7], 1: [2, 4, 6, 8], 2: [1.5, 3.5, 5.5, 7.5]}, - index=["a", "a", "b", "b"], - ) - msg = "The 'axis' keyword in DataFrame.groupby is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - gb = df.groupby(level=0, axis=0) - - msg = "DataFrameGroupBy.rank with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - res = gb.rank(axis=1) - - # This should match what we get when "manually" operating group-by-group - expected = concat([df.loc["a"].rank(axis=1), df.loc["b"].rank(axis=1)], axis=0) - tm.assert_frame_equal(res, expected) - - # check that we haven't accidentally written a case that coincidentally - # matches rank(axis=0) - msg = "The 'axis' keyword in DataFrameGroupBy.rank" - with tm.assert_produces_warning(FutureWarning, match=msg): - alt = gb.rank(axis=0) - assert not alt.equals(expected) - - -def test_groupby_axis0_cummax_axis1(): - # case where groupby axis is 0 and axis keyword in transform is 1 - - # df has mixed dtype -> multiple blocks - df = DataFrame( - {0: [1, 3, 5, 7], 1: [2, 4, 6, 8], 2: [1.5, 3.5, 5.5, 7.5]}, - index=["a", "a", "b", "b"], - ) - msg = "The 'axis' keyword in DataFrame.groupby is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - gb = df.groupby(level=0, axis=0) - - msg = "DataFrameGroupBy.cummax with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - cmax = gb.cummax(axis=1) - expected = df[[0, 1]].astype(np.float64) - expected[2] = expected[1] - tm.assert_frame_equal(cmax, expected) - - def test_non_unique_index(): # GH 16577 df = DataFrame( diff --git a/pandas/tests/groupby/methods/test_size.py b/pandas/tests/groupby/methods/test_size.py index 93a4e743d0d71..6664563bd2272 100644 --- a/pandas/tests/groupby/methods/test_size.py +++ b/pandas/tests/groupby/methods/test_size.py @@ -1,10 +1,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - -from pandas.core.dtypes.common import is_integer_dtype - from pandas import ( DataFrame, Index, @@ -22,36 +18,7 @@ def test_size(df, by): assert result[key] == len(group) -@pytest.mark.parametrize( - "by", - [ - [0, 0, 0, 0], - [0, 1, 1, 1], - [1, 0, 1, 1], - [0, None, None, None], - pytest.param([None, None, None, None], marks=pytest.mark.xfail), - ], -) -def test_size_axis_1(df, axis_1, by, sort, dropna): - # GH#45715 - counts = {key: sum(value == key for value in by) for key in dict.fromkeys(by)} - if dropna: - counts = {key: value for key, value in counts.items() if key is not None} - expected = Series(counts, dtype="int64") - if sort: - expected = expected.sort_index() - if is_integer_dtype(expected.index.dtype) and not any(x is None for x in by): - expected.index = expected.index.astype(int) - - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, 
match=msg): - grouped = df.groupby(by=by, axis=axis_1, sort=sort, dropna=dropna) - result = grouped.size() - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("by", ["A", "B", ["A", "B"]]) -@pytest.mark.parametrize("sort", [True, False]) def test_size_sort(sort, by): df = DataFrame(np.random.default_rng(2).choice(20, (1000, 3)), columns=list("ABC")) left = df.groupby(by=by, sort=sort).size() @@ -83,7 +50,6 @@ def test_size_period_index(): tm.assert_series_equal(result, ser) -@pytest.mark.parametrize("as_index", [True, False]) def test_size_on_categorical(as_index): df = DataFrame([[1, 1], [2, 2]], columns=["A", "B"]) df["A"] = df["A"].astype("category") @@ -108,22 +74,16 @@ def test_size_series_masked_type_returns_Int64(dtype): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize( - "dtype", - [ - object, - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), - ], -) -def test_size_strings(dtype): +def test_size_strings(any_string_dtype, using_infer_string): # GH#55627 + dtype = any_string_dtype df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype) result = df.groupby("a")["b"].size() exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64" + exp_index_dtype = "str" if using_infer_string and dtype == "object" else dtype expected = Series( [2, 1], - index=Index(["a", "b"], name="a", dtype=dtype), + index=Index(["a", "b"], name="a", dtype=exp_index_dtype), name="b", dtype=exp_dtype, ) diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index 2fa79c815d282..1050f8154572a 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -4,12 +4,9 @@ and proper parameter handling """ - import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas import ( Categorical, CategoricalIndex, @@ -76,9 +73,6 @@ def seed_df(seed_nans, n, m): @pytest.mark.parametrize("bins", [None, [0, 5]], ids=repr) @pytest.mark.parametrize("isort", [True, False]) @pytest.mark.parametrize("normalize, name", [(True, "proportion"), (False, "count")]) -@pytest.mark.parametrize("sort", [True, False]) -@pytest.mark.parametrize("ascending", [True, False]) -@pytest.mark.parametrize("dropna", [True, False]) def test_series_groupby_value_counts( seed_nans, num_rows, @@ -235,14 +229,6 @@ def education_df(): ) -def test_axis(education_df): - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - gp = education_df.groupby("country", axis=1) - with pytest.raises(NotImplementedError, match="axis"): - gp.value_counts() - - def test_bad_subset(education_df): gp = education_df.groupby("country") with pytest.raises(ValueError, match="subset"): @@ -269,10 +255,10 @@ def test_basic(education_df, request): index=MultiIndex.from_tuples( [ ("FR", "male", "low"), - ("FR", "female", "high"), ("FR", "male", "medium"), - ("US", "female", "high"), + ("FR", "female", "high"), ("US", "male", "low"), + ("US", "female", "high"), ], names=["country", "gender", "education"], ), @@ -295,10 +281,18 @@ def _frame_value_counts(df, keys, normalize, sort, ascending): (True, False), ], ) -@pytest.mark.parametrize("as_index", [True, False]) @pytest.mark.parametrize("frame", [True, False]) def test_against_frame_and_seriesgroupby( - education_df, groupby, normalize, name, sort, ascending, as_index, frame, request + 
education_df,
+    groupby,
+    normalize,
+    name,
+    sort,
+    ascending,
+    as_index,
+    frame,
+    request,
+    using_infer_string,
 ):
     # test all parameters:
     # - Use column, array or function as by= parameter
     # - Whether or not to normalize
     # - Whether or not to sort and how
     # - Whether or not to use the groupby as an index
     # - 3-way compare against:
     #   - apply with :meth:`~DataFrame.value_counts`
     #   - SeriesGroupBy with :meth:`~SeriesGroupBy.value_counts`
@@ -330,25 +324,19 @@ def test_against_frame_and_seriesgroupby(
     )
     if frame:
         # compare against apply with DataFrame value_counts
-        warn = FutureWarning if groupby == "column" else None
-        msg = "DataFrameGroupBy.apply operated on the grouping columns"
-        with tm.assert_produces_warning(warn, match=msg):
-            expected = gp.apply(
-                _frame_value_counts, ["gender", "education"], normalize, sort, ascending
-            )
+        expected = gp.apply(
+            _frame_value_counts, ["gender", "education"], normalize, sort, ascending
+        )

         if as_index:
             tm.assert_series_equal(result, expected)
         else:
             name = "proportion" if normalize else "count"
             expected = expected.reset_index().rename({0: name}, axis=1)
-            if groupby == "column":
-                expected = expected.rename({"level_0": "country"}, axis=1)
-                expected["country"] = np.where(expected["country"], "US", "FR")
-            elif groupby == "function":
-                expected["level_0"] = expected["level_0"] == 1
+            if groupby in ["array", "function"] and (not as_index and frame):
+                expected.insert(loc=0, column="level_0", value=result["level_0"])
             else:
-                expected["level_0"] = np.where(expected["level_0"], "US", "FR")
+                expected.insert(loc=0, column="country", value=result["country"])
             tm.assert_frame_equal(result, expected)
     else:
         # compare against SeriesGroupBy value_counts
@@ -362,24 +350,24 @@ def test_against_frame_and_seriesgroupby(
             index_frame["gender"] = index_frame["both"].str.split("-").str.get(0)
             index_frame["education"] = index_frame["both"].str.split("-").str.get(1)
             del index_frame["both"]
-            index_frame = index_frame.rename({0: None}, axis=1)
-            expected.index = MultiIndex.from_frame(index_frame)
+            index_frame2 = index_frame.rename({0: None}, axis=1)
+            expected.index = MultiIndex.from_frame(index_frame2)
+
+            if index_frame2.columns.isna()[0]:
+                # with using_infer_string, the columns in index_frame are string
+                # dtype, which makes the rename({0: None}) above use np.nan
+                # instead of None, so we need to set None more explicitly.
+ expected.index.names = [None] + expected.index.names[1:] tm.assert_series_equal(result, expected) else: expected.insert(1, "gender", expected["both"].str.split("-").str.get(0)) expected.insert(2, "education", expected["both"].str.split("-").str.get(1)) + if using_infer_string: + expected = expected.astype({"gender": "str", "education": "str"}) del expected["both"] tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize( - "dtype", - [ - object, - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), - ], -) @pytest.mark.parametrize("normalize", [True, False]) @pytest.mark.parametrize( "sort, ascending, expected_rows, expected_count, expected_group_size", @@ -397,8 +385,10 @@ def test_compound( expected_rows, expected_count, expected_group_size, - dtype, + any_string_dtype, + using_infer_string, ): + dtype = any_string_dtype education_df = education_df.astype(dtype) education_df.columns = education_df.columns.astype(dtype) # Multiple groupby keys and as_index=False @@ -415,20 +405,18 @@ def test_compound( expected["proportion"] = expected_count expected["proportion"] /= expected_group_size if dtype == "string[pyarrow]": + # TODO(nullable) also string[python] should return nullable dtypes expected["proportion"] = expected["proportion"].convert_dtypes() else: expected["count"] = expected_count if dtype == "string[pyarrow]": expected["count"] = expected["count"].convert_dtypes() - tm.assert_frame_equal(result, expected) - + if using_infer_string and dtype == object: + expected = expected.astype( + {"country": "str", "gender": "str", "education": "str"} + ) -@pytest.fixture -def animals_df(): - return DataFrame( - {"key": [1, 1, 1, 1], "num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, - index=["falcon", "dog", "cat", "ant"], - ) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( @@ -448,10 +436,14 @@ def animals_df(): ], ) def test_data_frame_value_counts( - animals_df, sort, ascending, normalize, name, expected_data, expected_index + sort, ascending, normalize, name, expected_data, expected_index ): # 3-way compare with :meth:`~DataFrame.value_counts` # Tests from frame/methods/test_value_counts.py + animals_df = DataFrame( + {"key": [1, 1, 1, 1], "num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, + index=["falcon", "dog", "cat", "ant"], + ) result_frame = animals_df.value_counts( sort=sort, ascending=ascending, normalize=normalize ) @@ -471,35 +463,22 @@ def test_data_frame_value_counts( tm.assert_series_equal(result_frame_groupby, expected) -@pytest.fixture -def nulls_df(): - n = np.nan - return DataFrame( - { - "A": [1, 1, n, 4, n, 6, 6, 6, 6], - "B": [1, 1, 3, n, n, 6, 6, 6, 6], - "C": [1, 2, 3, 4, 5, 6, n, 8, n], - "D": [1, 2, 3, 4, 5, 6, 7, n, n], - } - ) - - @pytest.mark.parametrize( "group_dropna, count_dropna, expected_rows, expected_values", [ ( False, False, - [0, 1, 3, 5, 7, 6, 8, 2, 4], + [0, 1, 3, 5, 6, 7, 8, 2, 4], [0.5, 0.5, 1.0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0], ), (False, True, [0, 1, 3, 5, 2, 4], [0.5, 0.5, 1.0, 1.0, 1.0, 1.0]), - (True, False, [0, 1, 5, 7, 6, 8], [0.5, 0.5, 0.25, 0.25, 0.25, 0.25]), + (True, False, [0, 1, 5, 6, 7, 8], [0.5, 0.5, 0.25, 0.25, 0.25, 0.25]), (True, True, [0, 1, 5], [0.5, 0.5, 1.0]), ], ) def test_dropna_combinations( - nulls_df, group_dropna, count_dropna, expected_rows, expected_values, request + group_dropna, count_dropna, expected_rows, expected_values, request ): if Version(np.__version__) >= Version("1.25") and not 
group_dropna: request.applymarker( @@ -511,6 +490,14 @@ def test_dropna_combinations( strict=False, ) ) + nulls_df = DataFrame( + { + "A": [1, 1, np.nan, 4, np.nan, 6, 6, 6, 6], + "B": [1, 1, 3, np.nan, np.nan, 6, 6, 6, 6], + "C": [1, 2, 3, 4, 5, 6, np.nan, 8, np.nan], + "D": [1, 2, 3, 4, 5, 6, 7, np.nan, np.nan], + } + ) gp = nulls_df.groupby(["A", "B"], dropna=group_dropna) result = gp.value_counts(normalize=True, sort=True, dropna=count_dropna) columns = DataFrame() @@ -521,17 +508,6 @@ def test_dropna_combinations( tm.assert_series_equal(result, expected) -@pytest.fixture -def names_with_nulls_df(nulls_fixture): - return DataFrame( - { - "key": [1, 1, 1, 1], - "first_name": ["John", "Anne", "John", "Beth"], - "middle_name": ["Smith", nulls_fixture, nulls_fixture, "Louise"], - }, - ) - - @pytest.mark.parametrize( "dropna, expected_data, expected_index", [ @@ -539,7 +515,7 @@ def names_with_nulls_df(nulls_fixture): True, [1, 1], MultiIndex.from_arrays( - [(1, 1), ("Beth", "John"), ("Louise", "Smith")], + [(1, 1), ("John", "Beth"), ("Smith", "Louise")], names=["key", "first_name", "middle_name"], ), ), @@ -552,7 +528,7 @@ def names_with_nulls_df(nulls_fixture): Index(["Anne", "Beth", "John"]), Index(["Louise", "Smith", np.nan]), ], - codes=[[0, 0, 0, 0], [0, 1, 2, 2], [2, 0, 1, 2]], + codes=[[0, 0, 0, 0], [2, 0, 2, 1], [1, 2, 2, 0]], names=["key", "first_name", "middle_name"], ), ), @@ -560,11 +536,18 @@ def names_with_nulls_df(nulls_fixture): ) @pytest.mark.parametrize("normalize, name", [(False, "count"), (True, "proportion")]) def test_data_frame_value_counts_dropna( - names_with_nulls_df, dropna, normalize, name, expected_data, expected_index + nulls_fixture, dropna, normalize, name, expected_data, expected_index ): # GH 41334 # 3-way compare with :meth:`~DataFrame.value_counts` # Tests with nulls from frame/methods/test_value_counts.py + names_with_nulls_df = DataFrame( + { + "key": [1, 1, 1, 1], + "first_name": ["John", "Anne", "John", "Beth"], + "middle_name": ["Smith", nulls_fixture, nulls_fixture, "Louise"], + }, + ) result_frame = names_with_nulls_df.value_counts(dropna=dropna, normalize=normalize) expected = Series( data=expected_data, @@ -583,7 +566,6 @@ def test_data_frame_value_counts_dropna( tm.assert_series_equal(result_frame_groupby, expected) -@pytest.mark.parametrize("as_index", [False, True]) @pytest.mark.parametrize("observed", [False, True]) @pytest.mark.parametrize( "normalize, name, expected_data", @@ -624,17 +606,17 @@ def test_categorical_single_grouper_with_only_observed_categories( expected_index = MultiIndex.from_tuples( [ ("FR", "male", "low"), - ("FR", "female", "high"), ("FR", "male", "medium"), + ("FR", "female", "high"), + ("FR", "male", "high"), ("FR", "female", "low"), ("FR", "female", "medium"), - ("FR", "male", "high"), - ("US", "female", "high"), ("US", "male", "low"), + ("US", "female", "high"), + ("US", "male", "medium"), + ("US", "male", "high"), ("US", "female", "low"), ("US", "female", "medium"), - ("US", "male", "high"), - ("US", "male", "medium"), ], names=["country", "gender", "education"], ) @@ -693,7 +675,6 @@ def assert_categorical_single_grouper( tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("as_index", [True, False]) @pytest.mark.parametrize( "normalize, name, expected_data", [ @@ -727,17 +708,17 @@ def test_categorical_single_grouper_observed_true( expected_index = [ ("FR", "male", "low"), - ("FR", "female", "high"), ("FR", "male", "medium"), + ("FR", "female", "high"), + ("FR", "male", "high"), ("FR", "female", 
"low"), ("FR", "female", "medium"), - ("FR", "male", "high"), - ("US", "female", "high"), ("US", "male", "low"), + ("US", "female", "high"), + ("US", "male", "medium"), + ("US", "male", "high"), ("US", "female", "low"), ("US", "female", "medium"), - ("US", "male", "high"), - ("US", "male", "medium"), ] assert_categorical_single_grouper( @@ -751,7 +732,6 @@ def test_categorical_single_grouper_observed_true( ) -@pytest.mark.parametrize("as_index", [True, False]) @pytest.mark.parametrize( "normalize, name, expected_data", [ @@ -808,23 +788,23 @@ def test_categorical_single_grouper_observed_false( expected_index = [ ("FR", "male", "low"), - ("FR", "female", "high"), ("FR", "male", "medium"), + ("FR", "female", "high"), + ("FR", "male", "high"), ("FR", "female", "low"), ("FR", "female", "medium"), - ("FR", "male", "high"), - ("US", "female", "high"), ("US", "male", "low"), + ("US", "female", "high"), + ("US", "male", "medium"), + ("US", "male", "high"), ("US", "female", "low"), ("US", "female", "medium"), - ("US", "male", "high"), - ("US", "male", "medium"), - ("ASIA", "female", "high"), - ("ASIA", "female", "low"), - ("ASIA", "female", "medium"), - ("ASIA", "male", "high"), ("ASIA", "male", "low"), ("ASIA", "male", "medium"), + ("ASIA", "male", "high"), + ("ASIA", "female", "low"), + ("ASIA", "female", "medium"), + ("ASIA", "female", "high"), ] assert_categorical_single_grouper( @@ -838,7 +818,6 @@ def test_categorical_single_grouper_observed_false( ) -@pytest.mark.parametrize("as_index", [True, False]) @pytest.mark.parametrize( "observed, expected_index", [ @@ -855,8 +834,8 @@ def test_categorical_single_grouper_observed_false( ("US", "high", "male"), ("US", "low", "male"), ("US", "low", "female"), - ("US", "medium", "female"), ("US", "medium", "male"), + ("US", "medium", "female"), ], ), ( @@ -924,7 +903,6 @@ def test_categorical_multiple_groupers( tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("as_index", [False, True]) @pytest.mark.parametrize("observed", [False, True]) @pytest.mark.parametrize( "normalize, name, expected_data", @@ -968,17 +946,17 @@ def test_categorical_non_groupers( expected_index = [ ("FR", "male", "low"), - ("FR", "female", "high"), ("FR", "male", "medium"), + ("FR", "female", "high"), + ("FR", "male", "high"), ("FR", "female", "low"), ("FR", "female", "medium"), - ("FR", "male", "high"), - ("US", "female", "high"), ("US", "male", "low"), + ("US", "female", "high"), + ("US", "male", "medium"), + ("US", "male", "high"), ("US", "female", "low"), ("US", "female", "medium"), - ("US", "male", "high"), - ("US", "male", "medium"), ] expected_series = Series( data=expected_data, @@ -1033,7 +1011,6 @@ def test_mixed_groupings(normalize, expected_label, expected_values): ("level", list("abcd") + ["level_1"], ["a", None, "d", "b", "c", "level_1"]), ], ) -@pytest.mark.parametrize("as_index", [False, True]) def test_column_label_duplicates(test, columns, expected_names, as_index): # GH 44992 # Test for duplicate input column labels and generated duplicate labels @@ -1198,7 +1175,7 @@ def test_value_counts_sort(sort, vc_sort, normalize): if sort and vc_sort: taker = [0, 1, 2] elif sort and not vc_sort: - taker = [0, 1, 2] + taker = [1, 0, 2] elif not sort and vc_sort: taker = [0, 2, 1] else: @@ -1235,7 +1212,29 @@ def test_value_counts_sort_categorical(sort, vc_sort, normalize): elif not sort and vc_sort: taker = [0, 2, 1, 3] else: - taker = [2, 3, 0, 1] + taker = [2, 1, 0, 3] expected = expected.take(taker) tm.assert_series_equal(result, expected) + + 
+@pytest.mark.parametrize("groupby_sort", [True, False]) +def test_value_counts_all_na(sort, dropna, groupby_sort): + # GH#59989 + df = DataFrame({"a": [2, 1, 1], "b": np.nan}) + gb = df.groupby("a", sort=groupby_sort) + result = gb.value_counts(sort=sort, dropna=dropna) + + kwargs = {"levels": [[1, 2], [np.nan]], "names": ["a", "b"]} + if dropna: + data = [] + index = MultiIndex(codes=[[], []], **kwargs) + elif not groupby_sort and not sort: + data = [1, 2] + index = MultiIndex(codes=[[1, 0], [0, 0]], **kwargs) + else: + data = [2, 1] + index = MultiIndex(codes=[[0, 1], [0, 0]], **kwargs) + expected = Series(data, index=index, dtype="int64", name="count") + + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_all_methods.py b/pandas/tests/groupby/test_all_methods.py index ad35bec70f668..4625c5c27a803 100644 --- a/pandas/tests/groupby/test_all_methods.py +++ b/pandas/tests/groupby/test_all_methods.py @@ -22,12 +22,15 @@ def test_multiindex_group_all_columns_when_empty(groupby_func): # GH 32464 df = DataFrame({"a": [], "b": [], "c": []}).set_index(["a", "b", "c"]) - gb = df.groupby(["a", "b", "c"], group_keys=False) + gb = df.groupby(["a", "b", "c"], group_keys=True) method = getattr(gb, groupby_func) args = get_groupby_method_args(groupby_func, df) - - warn = FutureWarning if groupby_func == "fillna" else None - warn_msg = "DataFrameGroupBy.fillna is deprecated" + if groupby_func == "corrwith": + warn = FutureWarning + warn_msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + warn_msg = "" with tm.assert_produces_warning(warn, match=warn_msg): result = method(*args).index expected = df.index @@ -42,18 +45,12 @@ def test_duplicate_columns(request, groupby_func, as_index): df = DataFrame([[1, 3, 6], [1, 4, 7], [2, 5, 8]], columns=list("abb")) args = get_groupby_method_args(groupby_func, df) gb = df.groupby("a", as_index=as_index) - warn = FutureWarning if groupby_func == "fillna" else None - warn_msg = "DataFrameGroupBy.fillna is deprecated" - with tm.assert_produces_warning(warn, match=warn_msg): - result = getattr(gb, groupby_func)(*args) + result = getattr(gb, groupby_func)(*args) expected_df = df.set_axis(["a", "b", "c"], axis=1) expected_args = get_groupby_method_args(groupby_func, expected_df) expected_gb = expected_df.groupby("a", as_index=as_index) - warn = FutureWarning if groupby_func == "fillna" else None - warn_msg = "DataFrameGroupBy.fillna is deprecated" - with tm.assert_produces_warning(warn, match=warn_msg): - expected = getattr(expected_gb, groupby_func)(*expected_args) + expected = getattr(expected_gb, groupby_func)(*expected_args) if groupby_func not in ("size", "ngroup", "cumcount"): expected = expected.rename(columns={"c": "b"}) tm.assert_equal(result, expected) @@ -74,8 +71,12 @@ def test_dup_labels_output_shape(groupby_func, idx): grp_by = df.groupby([0]) args = get_groupby_method_args(groupby_func, df) - warn = FutureWarning if groupby_func == "fillna" else None - warn_msg = "DataFrameGroupBy.fillna is deprecated" + if groupby_func == "corrwith": + warn = FutureWarning + warn_msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + warn_msg = "" with tm.assert_produces_warning(warn, match=warn_msg): result = getattr(grp_by, groupby_func)(*args) diff --git a/pandas/tests/groupby/test_api.py b/pandas/tests/groupby/test_api.py index 5c5982954de2f..215e627abb018 100644 --- a/pandas/tests/groupby/test_api.py +++ b/pandas/tests/groupby/test_api.py @@ -68,20 +68,19 @@ def 
test_tab_completion(multiindex_dataframe_random_data): "tail", "resample", "cummin", - "fillna", "cumsum", "cumcount", "ngroup", "all", "shift", "skew", + "kurt", "take", "pct_change", "any", "corr", "corrwith", "cov", - "dtypes", "ndim", "diff", "idxmax", @@ -149,7 +148,7 @@ def test_all_methods_categorized(multiindex_dataframe_random_data): def test_frame_consistency(groupby_func): # GH#48028 if groupby_func in ("first", "last"): - msg = "first and last are entirely different between frame and groupby" + msg = "first and last don't exist for DataFrame anymore" pytest.skip(reason=msg) if groupby_func in ("cumcount", "ngroup"): @@ -175,27 +174,29 @@ def test_frame_consistency(groupby_func): elif groupby_func in ("nunique",): exclude_expected = {"axis"} elif groupby_func in ("max", "min"): - exclude_expected = {"axis", "kwargs", "skipna"} + exclude_expected = {"axis", "kwargs"} exclude_result = {"min_count", "engine", "engine_kwargs"} - elif groupby_func in ("mean", "std", "sum", "var"): - exclude_expected = {"axis", "kwargs", "skipna"} + elif groupby_func in ("sum", "mean", "std", "var"): + exclude_expected = {"axis", "kwargs"} exclude_result = {"engine", "engine_kwargs"} elif groupby_func in ("median", "prod", "sem"): - exclude_expected = {"axis", "kwargs", "skipna"} - elif groupby_func in ("backfill", "bfill", "ffill", "pad"): - exclude_expected = {"downcast", "inplace", "axis", "limit_area"} + exclude_expected = {"axis", "kwargs"} + elif groupby_func in ("bfill", "ffill"): + exclude_expected = {"inplace", "axis", "limit_area"} elif groupby_func in ("cummax", "cummin"): - exclude_expected = {"skipna", "args"} - exclude_result = {"numeric_only"} + exclude_expected = {"axis", "skipna", "args"} elif groupby_func in ("cumprod", "cumsum"): - exclude_expected = {"skipna"} + exclude_expected = {"axis", "skipna"} elif groupby_func in ("pct_change",): exclude_expected = {"kwargs"} - exclude_result = {"axis"} elif groupby_func in ("rank",): exclude_expected = {"numeric_only"} elif groupby_func in ("quantile",): exclude_expected = {"method", "axis"} + elif groupby_func in ["corrwith"]: + exclude_expected = {"min_periods"} + if groupby_func not in ["pct_change", "size"]: + exclude_expected |= {"axis"} # Ensure excluded arguments are actually in the signatures assert result & exclude_result == exclude_result @@ -209,7 +210,8 @@ def test_frame_consistency(groupby_func): def test_series_consistency(request, groupby_func): # GH#48028 if groupby_func in ("first", "last"): - pytest.skip("first and last are entirely different between Series and groupby") + msg = "first and last don't exist for Series anymore" + pytest.skip(msg) if groupby_func in ("cumcount", "corrwith", "ngroup"): assert not hasattr(Series, groupby_func) @@ -229,32 +231,39 @@ def test_series_consistency(request, groupby_func): exclude_expected, exclude_result = set(), set() if groupby_func in ("any", "all"): exclude_expected = {"kwargs", "bool_only", "axis"} - elif groupby_func in ("diff",): - exclude_result = {"axis"} elif groupby_func in ("max", "min"): - exclude_expected = {"axis", "kwargs", "skipna"} + exclude_expected = {"axis", "kwargs"} exclude_result = {"min_count", "engine", "engine_kwargs"} - elif groupby_func in ("mean", "std", "sum", "var"): - exclude_expected = {"axis", "kwargs", "skipna"} + elif groupby_func in ("sum", "mean", "std", "var"): + exclude_expected = {"axis", "kwargs"} exclude_result = {"engine", "engine_kwargs"} elif groupby_func in ("median", "prod", "sem"): - exclude_expected = {"axis", "kwargs", "skipna"} - 
elif groupby_func in ("backfill", "bfill", "ffill", "pad"): - exclude_expected = {"downcast", "inplace", "axis", "limit_area"} + exclude_expected = {"axis", "kwargs"} + elif groupby_func in ("bfill", "ffill"): + exclude_expected = {"inplace", "axis", "limit_area"} elif groupby_func in ("cummax", "cummin"): exclude_expected = {"skipna", "args"} exclude_result = {"numeric_only"} elif groupby_func in ("cumprod", "cumsum"): exclude_expected = {"skipna"} + exclude_result = {"numeric_only"} elif groupby_func in ("pct_change",): exclude_expected = {"kwargs"} - exclude_result = {"axis"} elif groupby_func in ("rank",): exclude_expected = {"numeric_only"} elif groupby_func in ("idxmin", "idxmax"): exclude_expected = {"args", "kwargs"} elif groupby_func in ("quantile",): exclude_result = {"numeric_only"} + if groupby_func not in [ + "diff", + "pct_change", + "count", + "nunique", + "quantile", + "size", + ]: + exclude_expected |= {"axis"} # Ensure excluded arguments are actually in the signatures assert result & exclude_result == exclude_result diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 34b6e7c4cde5f..5bf16ee9ad0b8 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -27,12 +27,9 @@ def test_apply_func_that_appends_group_to_list_without_copy(): def store(group): groups.append(group) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - df.groupby("index").apply(store) - expected_value = DataFrame( - {"index": [0] * 10, 0: [1] * 10}, index=pd.RangeIndex(0, 100, 10) - ) + df.groupby("index").apply(store) + expected_value = DataFrame({0: [1] * 10}, index=pd.RangeIndex(0, 100, 10)) + expected_value.columns = expected_value.columns.astype(object) tm.assert_frame_equal(groups[0], expected_value) @@ -77,7 +74,7 @@ def test_apply_index_date(using_infer_string): tm.assert_frame_equal(result, expected) -def test_apply_index_date_object(using_infer_string): +def test_apply_index_date_object(): # GH 5789 # don't auto coerce dates ts = [ @@ -109,53 +106,12 @@ def test_apply_index_date_object(using_infer_string): 1.40750, 1.40649, ] - dtype = "string[pyarrow_numpy]" if using_infer_string else object - exp_idx = Index( - ["2011-05-16", "2011-05-17", "2011-05-18"], dtype=dtype, name="date" - ) + exp_idx = Index(["2011-05-16", "2011-05-17", "2011-05-18"], name="date") expected = Series(["00:00", "02:00", "02:00"], index=exp_idx) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("date", group_keys=False).apply( - lambda x: x["time"][x["value"].idxmax()] - ) + result = df.groupby("date").apply(lambda x: x["time"][x["value"].idxmax()]) tm.assert_series_equal(result, expected) -def test_apply_trivial(using_infer_string): - # GH 20066 - # trivial apply: ignore input and return a constant dataframe. 
- df = DataFrame( - {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, - columns=["key", "data"], - ) - dtype = "string" if using_infer_string else "object" - expected = pd.concat([df.iloc[1:], df.iloc[1:]], axis=1, keys=["float64", dtype]) - - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - gb = df.groupby([str(x) for x in df.dtypes], axis=1) - result = gb.apply(lambda x: df.iloc[1:]) - - tm.assert_frame_equal(result, expected) - - -def test_apply_trivial_fail(using_infer_string): - # GH 20066 - df = DataFrame( - {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, - columns=["key", "data"], - ) - dtype = "string" if using_infer_string else "object" - expected = pd.concat([df, df], axis=1, keys=["float64", dtype]) - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - gb = df.groupby([str(x) for x in df.dtypes], axis=1, group_keys=True) - result = gb.apply(lambda x: df) - - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( "df, group_names", [ @@ -226,9 +182,7 @@ def f_constant_df(group): for func in [f_copy, f_nocopy, f_scalar, f_none, f_constant_df]: del names[:] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - df.groupby("a", group_keys=False).apply(func) + df.groupby("a").apply(func) assert names == group_names @@ -246,11 +200,9 @@ def test_group_apply_once_per_group2(capsys): index=["0", "2", "4", "6", "8", "10", "12", "14"], ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - df.groupby("group_by_column", group_keys=False).apply( - lambda df: print("function_called") - ) + df.groupby("group_by_column", group_keys=False).apply( + lambda df: print("function_called") + ) result = capsys.readouterr().out.count("function_called") # If `groupby` behaves unexpectedly, this test will break @@ -270,15 +222,27 @@ def slow(group): def fast(group): return group.copy() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - fast_df = df.groupby("A", group_keys=False).apply(fast) - with tm.assert_produces_warning(FutureWarning, match=msg): - slow_df = df.groupby("A", group_keys=False).apply(slow) - + fast_df = df.groupby("A", group_keys=False).apply(fast) + slow_df = df.groupby("A", group_keys=False).apply(slow) tm.assert_frame_equal(fast_df, slow_df) +def test_apply_fast_slow_identical_index(): + # GH#44803 + df = DataFrame( + { + "name": ["Alice", "Bob", "Carl"], + "age": [20, 21, 20], + } + ).set_index("name") + + grp_by_same_value = df.groupby(["age"], group_keys=False).apply(lambda group: group) + grp_by_copy = df.groupby(["age"], group_keys=False).apply( + lambda group: group.copy() + ) + tm.assert_frame_equal(grp_by_same_value, grp_by_copy) + + @pytest.mark.parametrize( "func", [ @@ -295,11 +259,8 @@ def test_groupby_apply_identity_maybecopy_index_identical(func): # transparent to the user df = DataFrame({"g": [1, 2, 2, 2], "a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) - - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("g", group_keys=False).apply(func) - tm.assert_frame_equal(result, df) + result = df.groupby("g", group_keys=False).apply(func) + tm.assert_frame_equal(result, df[["a", 
"b"]]) def test_apply_with_mixed_dtype(): @@ -310,19 +271,19 @@ def test_apply_with_mixed_dtype(): "foo2": ["one", "two", "two", "three", "one", "two"], } ) - result = df.apply(lambda x: x, axis=1).dtypes - expected = df.dtypes - tm.assert_series_equal(result, expected) + result = df.apply(lambda x: x, axis=1) + expected = df + tm.assert_frame_equal(result, expected) # GH 3610 incorrect dtype conversion with as_index=False df = DataFrame({"c1": [1, 2, 6, 6, 8]}) df["c2"] = df.c1 / 2.0 - result1 = df.groupby("c2").mean().reset_index().c2 - result2 = df.groupby("c2", as_index=False).mean().c2 - tm.assert_series_equal(result1, result2) + result1 = df.groupby("c2").mean().reset_index() + result2 = df.groupby("c2", as_index=False).mean() + tm.assert_frame_equal(result1, result2) -def test_groupby_as_index_apply(): +def test_groupby_as_index_apply(as_index): # GH #4648 and #3417 df = DataFrame( { @@ -331,36 +292,41 @@ def test_groupby_as_index_apply(): "time": range(6), } ) + gb = df.groupby("user_id", as_index=as_index) - g_as = df.groupby("user_id", as_index=True) - g_not_as = df.groupby("user_id", as_index=False) - - res_as = g_as.head(2).index - res_not_as = g_not_as.head(2).index - exp = Index([0, 1, 2, 4]) - tm.assert_index_equal(res_as, exp) - tm.assert_index_equal(res_not_as, exp) - - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - res_as_apply = g_as.apply(lambda x: x.head(2)).index - with tm.assert_produces_warning(FutureWarning, match=msg): - res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index + expected = DataFrame( + { + "item_id": ["b", "b", "a", "a"], + "user_id": [1, 2, 1, 3], + "time": [0, 1, 2, 4], + }, + index=[0, 1, 2, 4], + ) + result = gb.head(2) + tm.assert_frame_equal(result, expected) # apply doesn't maintain the original ordering # changed in GH5610 as the as_index=False returns a MI here - exp_not_as_apply = MultiIndex.from_tuples([(0, 0), (0, 2), (1, 1), (2, 4)]) - tp = [(1, 0), (1, 2), (2, 1), (3, 4)] - exp_as_apply = MultiIndex.from_tuples(tp, names=["user_id", None]) + if as_index: + tp = [(1, 0), (1, 2), (2, 1), (3, 4)] + index = MultiIndex.from_tuples(tp, names=["user_id", None]) + else: + index = Index([0, 2, 1, 4]) + expected = DataFrame( + { + "item_id": list("baba"), + "time": [0, 2, 1, 4], + }, + index=index, + ) + result = gb.apply(lambda x: x.head(2)) + tm.assert_frame_equal(result, expected) - tm.assert_index_equal(res_as_apply, exp_as_apply) - tm.assert_index_equal(res_not_as_apply, exp_not_as_apply) +def test_groupby_as_index_apply_str(): ind = Index(list("abcde")) df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - res = df.groupby(0, as_index=False, group_keys=False).apply(lambda x: x).index + res = df.groupby(0, as_index=False, group_keys=False).apply(lambda x: x).index tm.assert_index_equal(res, ind) @@ -389,19 +355,13 @@ def desc3(group): # weirdo return result - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = grouped.apply(desc) + result = grouped.apply(desc) assert result.index.names == ("A", "B", "stat") - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result2 = grouped.apply(desc2) + result2 = grouped.apply(desc2) assert result2.index.names == ("A", 
"B", "stat") - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result3 = grouped.apply(desc3) + result3 = grouped.apply(desc3) assert result3.index.names == ("A", "B", None) @@ -413,8 +373,8 @@ def f(piece): {"value": piece, "demeaned": piece - piece.mean(), "logged": logged} ) - dr = bdate_range("1/1/2000", periods=100) - ts = Series(np.random.default_rng(2).standard_normal(100), index=dr) + dr = bdate_range("1/1/2000", periods=10) + ts = Series(np.random.default_rng(2).standard_normal(10), index=dr) grouped = ts.groupby(lambda x: x.month, group_keys=False) result = grouped.apply(f) @@ -431,9 +391,7 @@ def test_apply_series_yield_constant(df): def test_apply_frame_yield_constant(df): # GH13568 - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby(["A", "B"]).apply(len) + result = df.groupby(["A", "B"]).apply(len) assert isinstance(result, Series) assert result.name is None @@ -444,9 +402,7 @@ def test_apply_frame_yield_constant(df): def test_apply_frame_to_series(df): grouped = df.groupby(["A", "B"]) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = grouped.apply(len) + result = grouped.apply(len) expected = grouped.count()["C"] tm.assert_index_equal(result.index, expected.index) tm.assert_numpy_array_equal(result.values, expected.values) @@ -455,9 +411,7 @@ def test_apply_frame_to_series(df): def test_apply_frame_not_as_index_column_name(df): # GH 35964 - path within _wrap_applied_output not hit by a test grouped = df.groupby(["A", "B"], as_index=False) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = grouped.apply(len) + result = grouped.apply(len) expected = grouped.count().rename(columns={"C": np.nan}).drop(columns="D") # TODO(GH#34306): Use assert_frame_equal when column name is not np.nan tm.assert_index_equal(result.index, expected.index) @@ -480,9 +434,7 @@ def trans2(group): } ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("A").apply(trans) + result = df.groupby("A").apply(trans) exp = df.groupby("A")["C"].apply(trans2) tm.assert_series_equal(result, exp, check_names=False) assert result.name == "C" @@ -511,10 +463,8 @@ def test_apply_chunk_view(group_keys): # Low level tinkering could be unsafe, make sure not df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("key", group_keys=group_keys).apply(lambda x: x.iloc[:2]) - expected = df.take([0, 1, 3, 4, 6, 7]) + result = df.groupby("key", group_keys=group_keys).apply(lambda x: x.iloc[:2]) + expected = df[["value"]].take([0, 1, 3, 4, 6, 7]) if group_keys: expected.index = MultiIndex.from_arrays( [[1, 1, 2, 2, 3, 3], expected.index], names=["key", None] @@ -534,9 +484,7 @@ def test_apply_no_name_column_conflict(): # it works! 
#2605 grouped = df.groupby(["name", "name2"]) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - grouped.apply(lambda x: x.sort_values("value", inplace=True)) + grouped.apply(lambda x: x.sort_values("value", inplace=True)) def test_apply_typecast_fail(): @@ -553,11 +501,9 @@ def f(group): group["v2"] = (v - v.min()) / (v.max() - v.min()) return group - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("d", group_keys=False).apply(f) + result = df.groupby("d", group_keys=False).apply(f) - expected = df.copy() + expected = df[["c", "v"]] expected["v2"] = np.tile([0.0, 0.5, 1], 2) tm.assert_frame_equal(result, expected) @@ -579,13 +525,10 @@ def f(group): group["v2"] = (v - v.min()) / (v.max() - v.min()) return group - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("d", group_keys=False).apply(f) + result = df.groupby("d", group_keys=False).apply(f) - expected = df.copy() + expected = df[["c", "v"]] expected["v2"] = np.tile([0.0, 0.5, 1], 2) - tm.assert_frame_equal(result, expected) @@ -619,11 +562,8 @@ def filt2(x): else: return x[x.category == "c"] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = data.groupby("id_field").apply(filt1) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = data.groupby("id_field").apply(filt2) + expected = data.groupby("id_field").apply(filt1) + result = data.groupby("id_field").apply(filt2) tm.assert_frame_equal(result, expected) @@ -636,19 +576,11 @@ def test_apply_with_duplicated_non_sorted_axis(test_series): if test_series: ser = df.set_index("Y")["X"] result = ser.groupby(level=0, group_keys=False).apply(lambda x: x) - - # not expecting the order to remain the same for duplicated axis - result = result.sort_index() - expected = ser.sort_index() + expected = ser tm.assert_series_equal(result, expected) else: - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("Y", group_keys=False).apply(lambda x: x) - - # not expecting the order to remain the same for duplicated axis - result = result.sort_values("Y") - expected = df.sort_values("Y") + result = df.groupby("Y", group_keys=False).apply(lambda x: x) + expected = df[["X"]] tm.assert_frame_equal(result, expected) @@ -673,13 +605,13 @@ def reindex_helper(x): def test_apply_corner_cases(): # #535, can't use sliding iterator - N = 1000 + N = 10 labels = np.random.default_rng(2).integers(0, 100, size=N) df = DataFrame( { "key": labels, "value1": np.random.default_rng(2).standard_normal(N), - "value2": ["foo", "bar", "baz", "qux"] * (N // 4), + "value2": ["foo", "bar", "baz", "qux", "a"] * (N // 5), } ) @@ -689,9 +621,7 @@ def f(g): g["value3"] = g["value1"] * 2 return g - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = grouped.apply(f) + result = grouped.apply(f) assert "value3" in result @@ -705,15 +635,13 @@ def test_apply_numeric_coercion_when_datetime(): df = DataFrame( {"Number": [1, 2], "Date": ["2017-03-02"] * 2, "Str": ["foo", "inf"]} ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with 
tm.assert_produces_warning(FutureWarning, match=msg): - expected = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) + expected = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) df.Date = pd.to_datetime(df.Date) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) + result = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) tm.assert_series_equal(result["Str"], expected["Str"]) + +def test_apply_numeric_coercion_when_datetime_getitem(): # GH 15421 df = DataFrame( {"A": [10, 20, 30], "B": ["foo", "3", "4"], "T": [pd.Timestamp("12:31:22")] * 3} @@ -722,13 +650,13 @@ def test_apply_numeric_coercion_when_datetime(): def get_B(g): return g.iloc[0][["B"]] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("A").apply(get_B)["B"] + result = df.groupby("A").apply(get_B)["B"] expected = df.B expected.index = df.A tm.assert_series_equal(result, expected) + +def test_apply_numeric_coercion_when_datetime_with_nat(): # GH 14423 def predictions(tool): out = Series(index=["p1", "p2", "useTime"], dtype=object) @@ -749,11 +677,8 @@ def predictions(tool): ) df2 = df1.copy() df2.oTime = pd.to_datetime(df2.oTime) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = df1.groupby("Key").apply(predictions).p1 - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df2.groupby("Key").apply(predictions).p1 + expected = df1.groupby("Key").apply(predictions).p1 + result = df2.groupby("Key").apply(predictions).p1 tm.assert_series_equal(expected, result) @@ -768,13 +693,11 @@ def test_apply_aggregating_timedelta_and_datetime(): } ) df["time_delta_zero"] = df.datetime - df.datetime - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("clientid").apply( - lambda ddf: Series( - {"clientid_age": ddf.time_delta_zero.min(), "date": ddf.datetime.min()} - ) + result = df.groupby("clientid").apply( + lambda ddf: Series( + {"clientid_age": ddf.time_delta_zero.min(), "date": ddf.datetime.min()} ) + ) expected = DataFrame( { "clientid": ["A", "B", "C"], @@ -817,17 +740,13 @@ def func_with_no_date(batch): def func_with_date(batch): return Series({"b": datetime(2015, 1, 1), "c": 2}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - dfg_no_conversion = df.groupby(by=["a"]).apply(func_with_no_date) + dfg_no_conversion = df.groupby(by=["a"]).apply(func_with_no_date) dfg_no_conversion_expected = DataFrame({"c": 2}, index=[1]) dfg_no_conversion_expected.index.name = "a" - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - dfg_conversion = df.groupby(by=["a"]).apply(func_with_date) + dfg_conversion = df.groupby(by=["a"]).apply(func_with_date) dfg_conversion_expected = DataFrame( - {"b": pd.Timestamp(2015, 1, 1).as_unit("ns"), "c": 2}, index=[1] + {"b": pd.Timestamp(2015, 1, 1), "c": 2}, index=[1] ) dfg_conversion_expected.index.name = "a" @@ -869,43 +788,46 @@ def test_groupby_apply_all_none(): def test_func(x): pass - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = 
test_df.groupby("groups").apply(test_func) - expected = DataFrame() + result = test_df.groupby("groups").apply(test_func) + expected = DataFrame(columns=["random_vars"], dtype="int64") tm.assert_frame_equal(result, expected) -def test_groupby_apply_none_first(): +@pytest.mark.parametrize( + "in_data, out_idx, out_data", + [ + [ + {"groups": [1, 1, 1, 2], "vars": [0, 1, 2, 3]}, + [[1, 1], [0, 2]], + {"vars": [0, 2]}, + ], + [ + {"groups": [1, 2, 2, 2], "vars": [0, 1, 2, 3]}, + [[2, 2], [1, 3]], + {"vars": [1, 3]}, + ], + ], +) +def test_groupby_apply_none_first(in_data, out_idx, out_data): # GH 12824. Tests if apply returns None first. - test_df1 = DataFrame({"groups": [1, 1, 1, 2], "vars": [0, 1, 2, 3]}) - test_df2 = DataFrame({"groups": [1, 2, 2, 2], "vars": [0, 1, 2, 3]}) + test_df1 = DataFrame(in_data) def test_func(x): if x.shape[0] < 2: return None return x.iloc[[0, -1]] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result1 = test_df1.groupby("groups").apply(test_func) - with tm.assert_produces_warning(FutureWarning, match=msg): - result2 = test_df2.groupby("groups").apply(test_func) - index1 = MultiIndex.from_arrays([[1, 1], [0, 2]], names=["groups", None]) - index2 = MultiIndex.from_arrays([[2, 2], [1, 3]], names=["groups", None]) - expected1 = DataFrame({"groups": [1, 1], "vars": [0, 2]}, index=index1) - expected2 = DataFrame({"groups": [2, 2], "vars": [1, 3]}, index=index2) + result1 = test_df1.groupby("groups").apply(test_func) + index1 = MultiIndex.from_arrays(out_idx, names=["groups", None]) + expected1 = DataFrame(out_data, index=index1) tm.assert_frame_equal(result1, expected1) - tm.assert_frame_equal(result2, expected2) def test_groupby_apply_return_empty_chunk(): # GH 22221: apply filter which returns some empty groups df = DataFrame({"value": [0, 1], "group": ["filled", "empty"]}) groups = df.groupby("group") - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = groups.apply(lambda group: group[group.value != 1]["value"]) + result = groups.apply(lambda group: group[group.value != 1]["value"]) expected = Series( [0], name="value", @@ -916,25 +838,21 @@ def test_groupby_apply_return_empty_chunk(): tm.assert_series_equal(result, expected) -def test_apply_with_mixed_types(): +@pytest.mark.parametrize("meth", ["apply", "transform"]) +def test_apply_with_mixed_types(meth): # gh-20949 df = DataFrame({"A": "a a b".split(), "B": [1, 2, 3], "C": [4, 6, 5]}) g = df.groupby("A", group_keys=False) - result = g.transform(lambda x: x / x.sum()) + result = getattr(g, meth)(lambda x: x / x.sum()) expected = DataFrame({"B": [1 / 3.0, 2 / 3.0, 1], "C": [0.4, 0.6, 1.0]}) tm.assert_frame_equal(result, expected) - result = g.apply(lambda x: x / x.sum()) - tm.assert_frame_equal(result, expected) - def test_func_returns_object(): # GH 28652 df = DataFrame({"a": [1, 2]}, index=Index([1, 2])) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("a").apply(lambda g: g.index) + result = df.groupby("a").apply(lambda g: g.index) expected = Series([Index([1]), Index([2])], index=Index([1, 2], name="a")) tm.assert_series_equal(result, expected) @@ -944,19 +862,16 @@ def test_func_returns_object(): "group_column_dtlike", [datetime.today(), datetime.today().date(), datetime.today().time()], ) -def test_apply_datetime_issue(group_column_dtlike, 
using_infer_string): +def test_apply_datetime_issue(group_column_dtlike): # GH-28247 # groupby-apply throws an error if one of the columns in the DataFrame # is a datetime object and the column labels are different from # standard int values in range(len(num_columns)) df = DataFrame({"a": ["foo"], "b": [group_column_dtlike]}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42])) + result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42])) - dtype = "string" if using_infer_string else "object" - expected = DataFrame(["spam"], Index(["foo"], dtype=dtype, name="a"), columns=[42]) + expected = DataFrame(["spam"], Index(["foo"], dtype="str", name="a"), columns=[42]) tm.assert_frame_equal(result, expected) @@ -991,9 +906,7 @@ def test_apply_series_return_dataframe_groups(): def most_common_values(df): return Series({c: s.value_counts().index[0] for c, s in df.items()}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = tdf.groupby("day").apply(most_common_values)["userId"] + result = tdf.groupby("day").apply(most_common_values)["userId"] expected = Series( ["17661101"], index=pd.DatetimeIndex(["2015-02-24"], name="day"), name="userId" ) @@ -1034,13 +947,11 @@ def test_groupby_apply_datetime_result_dtypes(using_infer_string): ], columns=["observation", "color", "mood", "intensity", "score"], ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes - dtype = "string" if using_infer_string else object + result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes + dtype = pd.StringDtype(na_value=np.nan) if using_infer_string else object expected = Series( - [np.dtype("datetime64[ns]"), dtype, dtype, np.int64, dtype], - index=["observation", "color", "mood", "intensity", "score"], + [np.dtype("datetime64[us]"), dtype, np.int64, dtype], + index=["observation", "mood", "intensity", "score"], ) tm.assert_series_equal(result, expected) @@ -1057,10 +968,8 @@ def test_groupby_apply_datetime_result_dtypes(using_infer_string): def test_apply_index_has_complex_internals(index): # GH 31248 df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("group", group_keys=False).apply(lambda x: x) - tm.assert_frame_equal(result, df) + result = df.groupby("group", group_keys=False).apply(lambda x: x) + tm.assert_frame_equal(result, df[["value"]]) @pytest.mark.parametrize( @@ -1082,9 +991,7 @@ def test_apply_index_has_complex_internals(index): def test_apply_function_returns_non_pandas_non_scalar(function, expected_values): # GH 31441 df = DataFrame(["A", "A", "B", "B"], columns=["groups"]) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("groups").apply(function) + result = df.groupby("groups").apply(function) expected = Series(expected_values, index=Index(["A", "B"], name="groups")) tm.assert_series_equal(result, expected) @@ -1096,9 +1003,7 @@ def fct(group): df = DataFrame({"A": ["a", "a", "b", "none"], "B": [1, 2, 3, np.nan]}) - msg = "DataFrameGroupBy.apply operated on the 
grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("A").apply(fct) + result = df.groupby("A").apply(fct) expected = Series( [[1.0, 2.0], [3.0], [np.nan]], index=Index(["a", "b", "none"], name="A") ) @@ -1109,9 +1014,7 @@ def fct(group): def test_apply_function_index_return(function): # GH: 22541 df = DataFrame([1, 2, 2, 2, 1, 2, 3, 1, 3, 1], columns=["id"]) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("id").apply(function) + result = df.groupby("id").apply(function) expected = Series( [Index([0, 4, 7, 9]), Index([1, 2, 3, 5]), Index([6, 8])], index=Index([1, 2, 3], name="id"), @@ -1139,7 +1042,7 @@ def test_apply_function_with_indexing_return_column(): @pytest.mark.parametrize( "udf", - [(lambda x: x.copy()), (lambda x: x.copy().rename(lambda y: y + 1))], + [lambda x: x.copy(), lambda x: x.copy().rename(lambda y: y + 1)], ) @pytest.mark.parametrize("group_keys", [True, False]) def test_apply_result_type(group_keys, udf): @@ -1147,9 +1050,7 @@ def test_apply_result_type(group_keys, udf): # We'd like to control whether the group keys end up in the index # regardless of whether the UDF happens to be a transform. df = DataFrame({"A": ["a", "b"], "B": [1, 2]}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - df_result = df.groupby("A", group_keys=group_keys).apply(udf) + df_result = df.groupby("A", group_keys=group_keys).apply(udf) series_result = df.B.groupby(df.A, group_keys=group_keys).apply(udf) if group_keys: @@ -1164,11 +1065,8 @@ def test_result_order_group_keys_false(): # GH 34998 # apply result order should not depend on whether index is the same or just equal df = DataFrame({"A": [2, 1, 2], "B": [1, 2, 3]}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("A", group_keys=False).apply(lambda x: x) - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = df.groupby("A", group_keys=False).apply(lambda x: x.copy()) + result = df.groupby("A", group_keys=False).apply(lambda x: x) + expected = df.groupby("A", group_keys=False).apply(lambda x: x.copy()) tm.assert_frame_equal(result, expected) @@ -1180,15 +1078,8 @@ def test_apply_with_timezones_aware(): df1 = DataFrame({"x": list(range(2)) * 3, "y": range(6), "t": index_no_tz}) df2 = DataFrame({"x": list(range(2)) * 3, "y": range(6), "t": index_tz}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result1 = df1.groupby("x", group_keys=False).apply( - lambda df: df[["x", "y"]].copy() - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - result2 = df2.groupby("x", group_keys=False).apply( - lambda df: df[["x", "y"]].copy() - ) + result1 = df1.groupby("x", group_keys=False).apply(lambda df: df[["y"]].copy()) + result2 = df2.groupby("x", group_keys=False).apply(lambda df: df[["y"]].copy()) tm.assert_frame_equal(result1, result2) @@ -1205,23 +1096,27 @@ def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func): ) expected = DataFrame( - {"a": [264, 297], "b": [15, 6], "c": [150, 60]}, + {"b": [15, 6], "c": [150, 60]}, index=Index([88, 99], name="a"), ) # Check output when no other methods are called before .apply() grp = df.groupby(by="a") - msg = "The behavior of DataFrame.sum with 
axis=None is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False): - result = grp.apply(sum) + result = grp.apply(np.sum, axis=0) tm.assert_frame_equal(result, expected) # Check output when another method is called before .apply() grp = df.groupby(by="a") args = get_groupby_method_args(reduction_func, df) - _ = getattr(grp, reduction_func)(*args) - with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False): - result = grp.apply(sum) + if reduction_func == "corrwith": + warn = FutureWarning + msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + msg = "" + with tm.assert_produces_warning(warn, match=msg): + _ = getattr(grp, reduction_func)(*args) + result = grp.apply(np.sum, axis=0) tm.assert_frame_equal(result, expected) @@ -1243,46 +1138,18 @@ def test_apply_with_date_in_multiindex_does_not_convert_to_timestamp(): ) grp = df.groupby(["A", "B"]) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = grp.apply(lambda x: x.head(1)) + result = grp.apply(lambda x: x.head(1)) expected = df.iloc[[0, 2, 3]] expected = expected.reset_index() expected.index = MultiIndex.from_frame(expected[["A", "B", "idx"]]) - expected = expected.drop(columns="idx") + expected = expected.drop(columns=["A", "B", "idx"]) tm.assert_frame_equal(result, expected) for val in result.index.levels[1]: assert type(val) is date -def test_apply_by_cols_equals_apply_by_rows_transposed(): - # GH 16646 - # Operating on the columns, or transposing and operating on the rows - # should give the same result. There was previously a bug where the - # by_rows operation would work fine, but by_cols would throw a ValueError - - df = DataFrame( - np.random.default_rng(2).random([6, 4]), - columns=MultiIndex.from_product([["A", "B"], [1, 2]]), - ) - - msg = "The 'axis' keyword in DataFrame.groupby is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - gb = df.T.groupby(axis=0, level=0) - by_rows = gb.apply(lambda x: x.droplevel(axis=0, level=0)) - - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - gb2 = df.groupby(axis=1, level=0) - by_cols = gb2.apply(lambda x: x.droplevel(axis=1, level=0)) - - tm.assert_frame_equal(by_cols, by_rows.T) - tm.assert_frame_equal(by_cols, df) - - -@pytest.mark.parametrize("dropna", [True, False]) def test_apply_dropna_with_indexed_same(dropna): # GH 38227 # GH#43205 @@ -1293,10 +1160,8 @@ def test_apply_dropna_with_indexed_same(dropna): }, index=list("xxyxz"), ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("group", dropna=dropna, group_keys=False).apply(lambda x: x) - expected = df.dropna() if dropna else df.iloc[[0, 3, 1, 2, 4]] + result = df.groupby("group", dropna=dropna, group_keys=False).apply(lambda x: x) + expected = df.dropna()[["col"]] if dropna else df[["col"]].iloc[[0, 3, 1, 2, 4]] tm.assert_frame_equal(result, expected) @@ -1320,9 +1185,7 @@ def test_apply_dropna_with_indexed_same(dropna): def test_apply_as_index_constant_lambda(as_index, expected): # GH 13217 df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 1, 2, 2], "c": [1, 1, 1, 1]}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby(["a", "b"], as_index=as_index).apply(lambda x: 1) + result = 
df.groupby(["a", "b"], as_index=as_index).apply(lambda x: 1) tm.assert_equal(result, expected) @@ -1332,9 +1195,7 @@ def test_sort_index_groups(): {"A": [1, 2, 3, 4, 5], "B": [6, 7, 8, 9, 0], "C": [1, 1, 1, 2, 2]}, index=range(5), ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("C").apply(lambda x: x.A.sort_index()) + result = df.groupby("C").apply(lambda x: x.A.sort_index()) expected = Series( range(1, 6), index=MultiIndex.from_tuples( @@ -1354,12 +1215,10 @@ def test_positional_slice_groups_datetimelike(): "let": list("abcde"), } ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = expected.groupby( - [expected.let, expected.date.dt.date], group_keys=False - ).apply(lambda x: x.iloc[0:]) - tm.assert_frame_equal(result, expected) + result = expected.groupby( + [expected.let, expected.date.dt.date], group_keys=False + ).apply(lambda x: x.iloc[0:]) + tm.assert_frame_equal(result, expected[["date", "vals"]]) def test_groupby_apply_shape_cache_safety(): @@ -1394,39 +1253,33 @@ def test_groupby_apply_to_series_name(): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("dropna", [True, False]) def test_apply_na(dropna): # GH#28984 df = DataFrame( {"grp": [1, 1, 2, 2], "y": [1, 0, 2, 5], "z": [1, 2, np.nan, np.nan]} ) dfgrp = df.groupby("grp", dropna=dropna) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = dfgrp.apply(lambda grp_df: grp_df.nlargest(1, "z")) - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = dfgrp.apply(lambda x: x.sort_values("z", ascending=False).head(1)) + result = dfgrp.apply(lambda grp_df: grp_df.nlargest(1, "z")) + expected = dfgrp.apply(lambda x: x.sort_values("z", ascending=False).head(1)) tm.assert_frame_equal(result, expected) def test_apply_empty_string_nan_coerce_bug(): # GH#24903 - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = ( - DataFrame( - { - "a": [1, 1, 2, 2], - "b": ["", "", "", ""], - "c": pd.to_datetime([1, 2, 3, 4], unit="s"), - } - ) - .groupby(["a", "b"]) - .apply(lambda df: df.iloc[-1]) + result = ( + DataFrame( + { + "a": [1, 1, 2, 2], + "b": ["", "", "", ""], + "c": pd.to_datetime([1, 2, 3, 4], unit="s"), + } ) + .groupby(["a", "b"]) + .apply(lambda df: df.iloc[-1]) + ) expected = DataFrame( - [[1, "", pd.to_datetime(2, unit="s")], [2, "", pd.to_datetime(4, unit="s")]], - columns=["a", "b", "c"], + [[pd.to_datetime(2, unit="s")], [pd.to_datetime(4, unit="s")]], + columns=["c"], index=MultiIndex.from_tuples([(1, ""), (2, "")], names=["a", "b"]), ) tm.assert_frame_equal(result, expected) @@ -1448,11 +1301,9 @@ def test_apply_index_key_error_bug(index_values): }, index=Index(["a2", "a3", "aa"], name="a"), ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = result.groupby("a").apply( - lambda df: Series([df["b"].mean()], index=["b_mean"]) - ) + result = result.groupby("a").apply( + lambda df: Series([df["b"].mean()], index=["b_mean"]) + ) tm.assert_frame_equal(result, expected) @@ -1499,10 +1350,9 @@ def test_apply_index_key_error_bug(index_values): ) def test_apply_nonmonotonic_float_index(arg, idx): # GH 34455 - expected = DataFrame({"col": arg}, index=idx) - msg = 
"DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = expected.groupby("col", group_keys=False).apply(lambda x: x) + df = DataFrame({"grp": arg, "col": arg}, index=idx) + result = df.groupby("grp", group_keys=False).apply(lambda x: x) + expected = df[["col"]] tm.assert_frame_equal(result, expected) @@ -1531,6 +1381,7 @@ def test_result_name_when_one_group(name): ("apply", lambda gb: gb.values[-1]), ("apply", lambda gb: gb["b"].iloc[0]), ("agg", "skew"), + ("agg", "kurt"), ("agg", "prod"), ("agg", "sum"), ], @@ -1539,7 +1390,7 @@ def test_empty_df(method, op): # GH 47985 empty_df = DataFrame({"a": [], "b": []}) gb = empty_df.groupby("a", group_keys=True) - group = getattr(gb, "b") + group = gb.b result = getattr(group, method)(op) expected = Series( @@ -1549,58 +1400,117 @@ def test_empty_df(method, op): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("include_groups", [True, False]) -def test_include_groups(include_groups): +def test_include_groups(): # GH#7155 df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]}) gb = df.groupby("a") - warn = FutureWarning if include_groups else None - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(warn, match=msg): - result = gb.apply(lambda x: x.sum(), include_groups=include_groups) - expected = DataFrame({"a": [2, 2], "b": [7, 5]}, index=Index([1, 2], name="a")) - if not include_groups: - expected = expected[["b"]] - tm.assert_frame_equal(result, expected) + with pytest.raises(ValueError, match="include_groups=True is no longer allowed"): + gb.apply(lambda x: x.sum(), include_groups=True) -@pytest.mark.parametrize("f", [max, min, sum]) -@pytest.mark.parametrize("keys", ["jim", ["jim", "joe"]]) # Single key # Multi-key -def test_builtins_apply(keys, f): - # see gh-8155 - rs = np.random.default_rng(2) - df = DataFrame(rs.integers(1, 7, (10, 2)), columns=["jim", "joe"]) - df["jolie"] = rs.standard_normal(10) +@pytest.mark.parametrize("func, value", [(max, 2), (min, 1), (sum, 3)]) +def test_builtins_apply(func, value): + # GH#8155, GH#53974 + # Builtins act as e.g. 
sum(group), which sums the column labels of group + df = DataFrame({0: [1, 1, 2], 1: [3, 4, 5], 2: [3, 4, 5]}) + gb = df.groupby(0) + result = gb.apply(func) - gb = df.groupby(keys) + expected = Series([value, value], index=Index([1, 2], name=0)) + tm.assert_series_equal(result, expected) - fname = f.__name__ - warn = None if f is not sum else FutureWarning - msg = "The behavior of DataFrame.sum with axis=None is deprecated" - with tm.assert_produces_warning( - warn, match=msg, check_stacklevel=False, raise_on_extra_warnings=False - ): - # Also warns on deprecation GH#53425 - result = gb.apply(f) - ngroups = len(df.drop_duplicates(subset=keys)) +def test_inconsistent_return_type(): + # GH5592 + # inconsistent return type + df = DataFrame( + { + "A": ["Tiger", "Tiger", "Tiger", "Lamb", "Lamb", "Pony", "Pony"], + "B": Series(np.arange(7), dtype="int64"), + "C": pd.date_range("20130101", periods=7), + } + ) - assert_msg = f"invalid frame shape: {result.shape} (expected ({ngroups}, 3))" - assert result.shape == (ngroups, 3), assert_msg + def f_0(grp): + return grp.iloc[0] - npfunc = lambda x: getattr(np, fname)(x, axis=0) # numpy's equivalent function - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = gb.apply(npfunc) + expected = df.groupby("A").first()[["B"]] + result = df.groupby("A").apply(f_0)[["B"]] tm.assert_frame_equal(result, expected) - with tm.assert_produces_warning(FutureWarning, match=msg): - expected2 = gb.apply(lambda x: npfunc(x)) - tm.assert_frame_equal(result, expected2) + def f_1(grp): + if grp.name == "Tiger": + return None + return grp.iloc[0] + + result = df.groupby("A").apply(f_1)[["B"]] + e = expected.copy() + e.loc["Tiger"] = np.nan + tm.assert_frame_equal(result, e) + + def f_2(grp): + if grp.name == "Pony": + return None + return grp.iloc[0] - if f != sum: - expected = gb.agg(fname).reset_index() - expected.set_index(keys, inplace=True, drop=False) - tm.assert_frame_equal(result, expected, check_dtype=False) + result = df.groupby("A").apply(f_2)[["B"]] + e = expected.copy() + e.loc["Pony"] = np.nan + tm.assert_frame_equal(result, e) - tm.assert_series_equal(getattr(result, fname)(axis=0), getattr(df, fname)(axis=0)) + # 5592 revisited, with datetimes + def f_3(grp): + if grp.name == "Pony": + return None + return grp.iloc[0] + + result = df.groupby("A").apply(f_3)[["C"]] + e = df.groupby("A").first()[["C"]] + e.loc["Pony"] = pd.NaT + tm.assert_frame_equal(result, e) + + # scalar outputs + def f_4(grp): + if grp.name == "Pony": + return None + return grp.iloc[0].loc["C"] + + result = df.groupby("A").apply(f_4) + e = df.groupby("A").first()["C"].copy() + e.loc["Pony"] = np.nan + e.name = None + tm.assert_series_equal(result, e) + + +def test_nonreducer_nonstransform(): + # GH3380, GH60619 + # Was originally testing mutating in a UDF; now kept as an example + # of using apply with a nonreducer and nontransformer. 
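Note on the comment in test_builtins_apply above: iterating a DataFrame yields its column labels, so a builtin reducer applied to a group reduces over the labels, not the values. A quick illustration (mine, not from the patch):

    import pandas as pd

    # A group as apply sees it: grouping column 0 removed, columns 1 and 2 remain.
    group = pd.DataFrame({1: [3, 4], 2: [3, 4]})
    assert list(group) == [1, 2]  # iterating a DataFrame yields column labels
    assert sum(group) == 3        # 1 + 2, matching the parametrized value for sum
    assert max(group) == 2 and min(group) == 1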
+ df = DataFrame( + { + "cat1": ["a"] * 8 + ["b"] * 6, + "cat2": ["c"] * 2 + + ["d"] * 2 + + ["e"] * 2 + + ["f"] * 2 + + ["c"] * 2 + + ["d"] * 2 + + ["e"] * 2, + "val": np.random.default_rng(2).integers(100, size=14), + } + ) + + def f(x): + x = x.copy() + x["rank"] = x.val.rank(method="min") + return x.groupby("cat2")["rank"].min() + + expected = DataFrame( + { + "cat1": list("aaaabbb"), + "cat2": list("cdefcde"), + "rank": [3.0, 2.0, 5.0, 1.0, 2.0, 4.0, 1.0], + } + ).set_index(["cat1", "cat2"])["rank"] + result = df.groupby("cat1").apply(f) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_apply_mutate.py b/pandas/tests/groupby/test_apply_mutate.py deleted file mode 100644 index 09d5e06bf6ddd..0000000000000 --- a/pandas/tests/groupby/test_apply_mutate.py +++ /dev/null @@ -1,161 +0,0 @@ -import numpy as np - -import pandas as pd -import pandas._testing as tm - - -def test_group_by_copy(): - # GH#44803 - df = pd.DataFrame( - { - "name": ["Alice", "Bob", "Carl"], - "age": [20, 21, 20], - } - ).set_index("name") - - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - grp_by_same_value = df.groupby(["age"], group_keys=False).apply( - lambda group: group - ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - grp_by_copy = df.groupby(["age"], group_keys=False).apply( - lambda group: group.copy() - ) - tm.assert_frame_equal(grp_by_same_value, grp_by_copy) - - -def test_mutate_groups(): - # GH3380 - - df = pd.DataFrame( - { - "cat1": ["a"] * 8 + ["b"] * 6, - "cat2": ["c"] * 2 - + ["d"] * 2 - + ["e"] * 2 - + ["f"] * 2 - + ["c"] * 2 - + ["d"] * 2 - + ["e"] * 2, - "cat3": [f"g{x}" for x in range(1, 15)], - "val": np.random.default_rng(2).integers(100, size=14), - } - ) - - def f_copy(x): - x = x.copy() - x["rank"] = x.val.rank(method="min") - return x.groupby("cat2")["rank"].min() - - def f_no_copy(x): - x["rank"] = x.val.rank(method="min") - return x.groupby("cat2")["rank"].min() - - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - grpby_copy = df.groupby("cat1").apply(f_copy) - with tm.assert_produces_warning(FutureWarning, match=msg): - grpby_no_copy = df.groupby("cat1").apply(f_no_copy) - tm.assert_series_equal(grpby_copy, grpby_no_copy) - - -def test_no_mutate_but_looks_like(): - # GH 8467 - # first show's mutation indicator - # second does not, but should yield the same results - df = pd.DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) - - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result1 = df.groupby("key", group_keys=True).apply(lambda x: x[:].key) - with tm.assert_produces_warning(FutureWarning, match=msg): - result2 = df.groupby("key", group_keys=True).apply(lambda x: x.key) - tm.assert_series_equal(result1, result2) - - -def test_apply_function_with_indexing(): - # GH: 33058 - df = pd.DataFrame( - {"col1": ["A", "A", "A", "B", "B", "B"], "col2": [1, 2, 3, 4, 5, 6]} - ) - - def fn(x): - x.loc[x.index[-1], "col2"] = 0 - return x.col2 - - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby(["col1"], as_index=False).apply(fn) - expected = pd.Series( - [1, 2, 0, 4, 5, 0], - index=pd.MultiIndex.from_tuples( - [(0, 0), (0, 1), (0, 2), (1, 3), 
(1, 4), (1, 5)] - ), - name="col2", - ) - tm.assert_series_equal(result, expected) - - -def test_apply_mutate_columns_multiindex(): - # GH 12652 - df = pd.DataFrame( - { - ("C", "julian"): [1, 2, 3], - ("B", "geoffrey"): [1, 2, 3], - ("A", "julian"): [1, 2, 3], - ("B", "julian"): [1, 2, 3], - ("A", "geoffrey"): [1, 2, 3], - ("C", "geoffrey"): [1, 2, 3], - }, - columns=pd.MultiIndex.from_tuples( - [ - ("A", "julian"), - ("A", "geoffrey"), - ("B", "julian"), - ("B", "geoffrey"), - ("C", "julian"), - ("C", "geoffrey"), - ] - ), - ) - - def add_column(grouped): - name = grouped.columns[0][1] - grouped["sum", name] = grouped.sum(axis=1) - return grouped - - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - gb = df.groupby(level=1, axis=1) - result = gb.apply(add_column) - expected = pd.DataFrame( - [ - [1, 1, 1, 3, 1, 1, 1, 3], - [2, 2, 2, 6, 2, 2, 2, 6], - [ - 3, - 3, - 3, - 9, - 3, - 3, - 3, - 9, - ], - ], - columns=pd.MultiIndex.from_tuples( - [ - ("geoffrey", "A", "geoffrey"), - ("geoffrey", "B", "geoffrey"), - ("geoffrey", "C", "geoffrey"), - ("geoffrey", "sum", "geoffrey"), - ("julian", "A", "julian"), - ("julian", "B", "julian"), - ("julian", "C", "julian"), - ("julian", "sum", "julian"), - ] - ), - ) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py index 49b2e621b7adc..07d52308e308a 100644 --- a/pandas/tests/groupby/test_bin_groupby.py +++ b/pandas/tests/groupby/test_bin_groupby.py @@ -2,7 +2,6 @@ import pytest from pandas._libs import lib -import pandas.util._test_decorators as td import pandas as pd import pandas._testing as tm @@ -22,7 +21,7 @@ def cumsum_max(x): "func", [ cumsum_max, - pytest.param(assert_block_lengths, marks=td.skip_array_manager_invalid_test), + assert_block_lengths, ], ) def test_mgr_locs_updated(func): @@ -42,24 +41,27 @@ def test_mgr_locs_updated(func): "binner,closed,expected", [ ( - np.array([0, 3, 6, 9], dtype=np.int64), + [0, 3, 6, 9], "left", - np.array([2, 5, 6], dtype=np.int64), + [2, 5, 6], ), ( - np.array([0, 3, 6, 9], dtype=np.int64), + [0, 3, 6, 9], "right", - np.array([3, 6, 6], dtype=np.int64), + [3, 6, 6], ), - (np.array([0, 3, 6], dtype=np.int64), "left", np.array([2, 5], dtype=np.int64)), + ([0, 3, 6], "left", [2, 5]), ( - np.array([0, 3, 6], dtype=np.int64), + [0, 3, 6], "right", - np.array([3, 6], dtype=np.int64), + [3, 6], ), ], ) def test_generate_bins(binner, closed, expected): values = np.array([1, 2, 3, 4, 5, 6], dtype=np.int64) - result = lib.generate_bins_dt64(values, binner, closed=closed) + result = lib.generate_bins_dt64( + values, np.array(binner, dtype=np.int64), closed=closed + ) + expected = np.array(expected, dtype=np.int64) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 7a91601bf688f..cae3013642739 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -42,8 +42,8 @@ def f(a): # These expected values can be used across several tests (i.e. they are # the same for SeriesGroupBy and DataFrameGroupBy) but they should only be # hardcoded in one place. 
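Note: the new expected values in the entries just below follow the standard empty-reduction identities, which plain Python already exhibits (illustration, not part of the patch):

    import math

    assert all([]) is True     # "all" over no rows is vacuously True
    assert any([]) is False    # "any" over no rows is False
    assert math.prod([]) == 1  # empty product is the multiplicative identity
    assert sum([]) == 0        # empty sum is the additive identity, matching "sum": 0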
- "all": np.nan, - "any": np.nan, + "all": True, + "any": False, "count": 0, "corrwith": np.nan, "first": np.nan, @@ -56,11 +56,12 @@ def f(a): "min": np.nan, "nth": np.nan, "nunique": 0, - "prod": np.nan, + "prod": 1, "quantile": np.nan, "sem": np.nan, "size": 0, "skew": np.nan, + "kurt": np.nan, "std": np.nan, "sum": 0, "var": np.nan, @@ -82,7 +83,7 @@ def get_stats(group): assert result.index.names[0] == "C" -def test_basic(using_infer_string): # TODO: split this test +def test_basic(): cats = Categorical( ["a", "a", "a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c", "d"], @@ -95,17 +96,20 @@ def test_basic(using_infer_string): # TODO: split this test result = data.groupby("b", observed=False).mean() tm.assert_frame_equal(result, expected) + +def test_basic_single_grouper(): cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True) cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True) df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) - # single grouper gb = df.groupby("A", observed=False) exp_idx = CategoricalIndex(["a", "b", "z"], name="A", ordered=True) expected = DataFrame({"values": Series([3, 7, 0], index=exp_idx)}) result = gb.sum(numeric_only=True) tm.assert_frame_equal(result, expected) + +def test_basic_string(using_infer_string): # GH 8623 x = DataFrame( [[1, "John P. Doe"], [2, "Jane Dove"], [1, "John P. Doe"]], @@ -124,33 +128,26 @@ def test_basic(using_infer_string): # TODO: split this test def f(x): return x.drop_duplicates("person_name").iloc[0] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = g.apply(f) - expected = x.iloc[[0, 1]].copy() + result = g.apply(f) + expected = x[["person_name"]].iloc[[0, 1]] expected.index = Index([1, 2], name="person_id") - dtype = "string[pyarrow_numpy]" if using_infer_string else object + dtype = "str" if using_infer_string else object expected["person_name"] = expected["person_name"].astype(dtype) tm.assert_frame_equal(result, expected) + +def test_basic_monotonic(): # GH 9921 - # Monotonic df = DataFrame({"a": [5, 15, 25]}) c = pd.cut(df.a, bins=[0, 10, 20, 30, 40]) - msg = "using SeriesGroupBy.sum" - with tm.assert_produces_warning(FutureWarning, match=msg): - # GH#53425 - result = df.a.groupby(c, observed=False).transform(sum) + result = df.a.groupby(c, observed=False).transform(sum) tm.assert_series_equal(result, df["a"]) tm.assert_series_equal( df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df["a"] ) - msg = "using DataFrameGroupBy.sum" - with tm.assert_produces_warning(FutureWarning, match=msg): - # GH#53425 - result = df.groupby(c, observed=False).transform(sum) + result = df.groupby(c, observed=False).transform(sum) expected = df[["a"]] tm.assert_frame_equal(result, expected) @@ -159,10 +156,7 @@ def f(x): tm.assert_frame_equal(result, df[["a"]]) result2 = gbc.transform(lambda xs: np.max(xs, axis=0)) - msg = "using DataFrameGroupBy.max" - with tm.assert_produces_warning(FutureWarning, match=msg): - # GH#53425 - result3 = gbc.transform(max) + result3 = gbc.transform(max) result4 = gbc.transform(np.maximum.reduce) result5 = gbc.transform(lambda xs: np.maximum.reduce(xs)) tm.assert_frame_equal(result2, df[["a"]], check_dtype=False) @@ -174,23 +168,18 @@ def f(x): tm.assert_series_equal(df.a.groupby(c, observed=False).filter(np.all), df["a"]) tm.assert_frame_equal(df.groupby(c, observed=False).filter(np.all), df) - # Non-monotonic + +def 
test_basic_non_monotonic(): df = DataFrame({"a": [5, 15, 25, -5]}) c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40]) - msg = "using SeriesGroupBy.sum" - with tm.assert_produces_warning(FutureWarning, match=msg): - # GH#53425 - result = df.a.groupby(c, observed=False).transform(sum) + result = df.a.groupby(c, observed=False).transform(sum) tm.assert_series_equal(result, df["a"]) tm.assert_series_equal( df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df["a"] ) - msg = "using DataFrameGroupBy.sum" - with tm.assert_produces_warning(FutureWarning, match=msg): - # GH#53425 - result = df.groupby(c, observed=False).transform(sum) + result = df.groupby(c, observed=False).transform(sum) expected = df[["a"]] tm.assert_frame_equal(result, expected) @@ -198,6 +187,8 @@ def f(x): df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df[["a"]] ) + +def test_basic_cut_grouping(): # GH 9603 df = DataFrame({"a": [1, 0, 0, 0]}) c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list("abcd"))) @@ -208,13 +199,14 @@ def f(x): expected.index.name = "a" tm.assert_series_equal(result, expected) - # more basic + +def test_more_basic(): levels = ["foo", "bar", "baz", "qux"] - codes = np.random.default_rng(2).integers(0, 4, size=100) + codes = np.random.default_rng(2).integers(0, 4, size=10) cats = Categorical.from_codes(codes, levels, ordered=True) - data = DataFrame(np.random.default_rng(2).standard_normal((100, 4))) + data = DataFrame(np.random.default_rng(2).standard_normal((10, 4))) result = data.groupby(cats, observed=False).mean() @@ -240,13 +232,9 @@ def f(x): # GH 10460 expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True) exp = CategoricalIndex(expc) - tm.assert_index_equal( - (desc_result.stack(future_stack=True).index.get_level_values(0)), exp - ) + tm.assert_index_equal(desc_result.stack().index.get_level_values(0), exp) exp = Index(["count", "mean", "std", "min", "25%", "50%", "75%", "max"] * 4) - tm.assert_index_equal( - (desc_result.stack(future_stack=True).index.get_level_values(1)), exp - ) + tm.assert_index_equal(desc_result.stack().index.get_level_values(1), exp) def test_level_get_group(observed): @@ -271,11 +259,7 @@ def test_level_get_group(observed): names=["Index1", "Index2"], ), ) - msg = "you will need to pass a length-1 tuple" - with tm.assert_produces_warning(FutureWarning, match=msg): - # GH#25971 - warn when not passing a length-1 tuple - result = g.get_group("a") - + result = g.get_group(("a",)) tm.assert_frame_equal(result, expected) @@ -323,22 +307,17 @@ def test_apply(ordered): result = grouped.mean() tm.assert_frame_equal(result, expected) - msg = "using DataFrameGroupBy.mean" - with tm.assert_produces_warning(FutureWarning, match=msg): - # GH#53425 - result = grouped.agg(np.mean) + result = grouped.agg(np.mean) tm.assert_frame_equal(result, expected) # but for transform we should still get back the original index idx = MultiIndex.from_arrays([missing, dense], names=["missing", "dense"]) expected = Series(1, index=idx) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = grouped.apply(lambda x: 1) + result = grouped.apply(lambda x: 1) tm.assert_series_equal(result, expected) -def test_observed(observed): +def test_observed(request, using_infer_string, observed): # multiple groupers, don't re-expand the output space # of the grouper # gh-14942 (implement) @@ -346,6 +325,10 @@ def test_observed(observed): # gh-8138 (back-compat) # gh-8869 + if 
using_infer_string and not observed: + # TODO(infer_string) this fails with filling the string column with 0 + request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) + cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True) cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True) df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) @@ -378,6 +361,8 @@ def test_observed(observed): tm.assert_frame_equal(result, expected) + +def test_observed_single_column(observed): # https://github.com/pandas-dev/pandas/issues/8138 d = { "cat": Categorical( @@ -388,7 +373,6 @@ def test_observed(observed): } df = DataFrame(d) - # Grouping on a single column groups_single_key = df.groupby("cat", observed=observed) result = groups_single_key.mean() @@ -404,7 +388,17 @@ def test_observed(observed): tm.assert_frame_equal(result, expected) - # Grouping on two columns + +def test_observed_two_columns(observed): + # https://github.com/pandas-dev/pandas/issues/8138 + d = { + "cat": Categorical( + ["a", "b", "a", "b"], categories=["a", "b", "c"], ordered=True + ), + "ints": [1, 1, 2, 2], + "val": [10, 20, 30, 40], + } + df = DataFrame(d) groups_double_key = df.groupby(["cat", "ints"], observed=observed) result = groups_double_key.agg("mean") expected = DataFrame( @@ -430,6 +424,8 @@ def test_observed(observed): expected = df[(df.cat == c) & (df.ints == i)] tm.assert_frame_equal(result, expected) + +def test_observed_with_as_index(observed): # gh-8869 # with as_index d = { @@ -510,6 +506,23 @@ def test_observed_groups(observed): tm.assert_dict_equal(result, expected) +def test_groups_na_category(dropna, observed): + # https://github.com/pandas-dev/pandas/issues/61356 + df = DataFrame( + {"cat": Categorical(["a", np.nan, "a"], categories=list("adb"))}, + index=list("xyz"), + ) + g = df.groupby("cat", observed=observed, dropna=dropna) + + result = g.groups + expected = {"a": Index(["x", "z"])} + if not dropna: + expected |= {np.nan: Index(["y"])} + if not observed: + expected |= {"b": Index([]), "d": Index([])} + tm.assert_dict_equal(result, expected) + + @pytest.mark.parametrize( "keys, expected_values, expected_index_levels", [ @@ -617,8 +630,6 @@ def test_dataframe_categorical_with_nan(observed): @pytest.mark.parametrize("ordered", [True, False]) -@pytest.mark.parametrize("observed", [True, False]) -@pytest.mark.parametrize("sort", [True, False]) def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort): # GH 25871: Fix groupby sorting on ordered Categoricals # GH 25167: Groupby with observed=True doesn't sort @@ -648,17 +659,17 @@ def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort): f"for (ordered={ordered}, observed={observed}, sort={sort})\n" f"Result:\n{result}" ) - assert False, msg + pytest.fail(msg) def test_datetime(): # GH9049: ensure backward compatibility levels = pd.date_range("2014-01-01", periods=4) - codes = np.random.default_rng(2).integers(0, 4, size=100) + codes = np.random.default_rng(2).integers(0, 4, size=10) cats = Categorical.from_codes(codes, levels, ordered=True) - data = DataFrame(np.random.default_rng(2).standard_normal((100, 4))) + data = DataFrame(np.random.default_rng(2).standard_normal((10, 4))) result = data.groupby(cats, observed=False).mean() expected = data.groupby(np.asarray(cats), observed=False).mean() @@ -685,13 +696,9 @@ def test_datetime(): # GH 10460 expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True) exp = CategoricalIndex(expc) - 
tm.assert_index_equal( - (desc_result.stack(future_stack=True).index.get_level_values(0)), exp - ) + tm.assert_index_equal((desc_result.stack().index.get_level_values(0)), exp) exp = Index(["count", "mean", "std", "min", "25%", "50%", "75%", "max"] * 4) - tm.assert_index_equal( - (desc_result.stack(future_stack=True).index.get_level_values(1)), exp - ) + tm.assert_index_equal((desc_result.stack().index.get_level_values(1)), exp) def test_categorical_index(): @@ -729,10 +736,8 @@ def test_describe_categorical_columns(): df = DataFrame(np.random.default_rng(2).standard_normal((20, 4)), columns=cats) result = df.groupby([1, 2, 3, 4] * 5).describe() - tm.assert_index_equal(result.stack(future_stack=True).columns, cats) - tm.assert_categorical_equal( - result.stack(future_stack=True).columns.values, cats.values - ) + tm.assert_index_equal(result.stack().columns, cats) + tm.assert_categorical_equal(result.stack().columns.values, cats.values) def test_unstack_categorical(): @@ -808,24 +813,27 @@ def test_as_index(): # function grouper f = lambda r: df.loc[r, "A"] - msg = "A grouping .* was excluded from the result" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby(["cat", f], as_index=False, observed=True).sum() + result = df.groupby(["cat", f], as_index=False, observed=True).sum() expected = DataFrame( { "cat": Categorical([1, 2], categories=df.cat.cat.categories), + "level_1": [10, 11], "A": [10, 22], "B": [101, 205], }, - columns=["cat", "A", "B"], ) tm.assert_frame_equal(result, expected) # another not in-axis grouper (conflicting names in index) s = Series(["a", "b", "b"], name="cat") - msg = "A grouping .* was excluded from the result" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby(["cat", s], as_index=False, observed=True).sum() + result = df.groupby(["cat", s], as_index=False, observed=True).sum() + expected = DataFrame( + { + "cat": ["a", "b"], + "A": [10, 22], + "B": [101, 205], + }, + ) tm.assert_frame_equal(result, expected) # is original index dropped? 
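Note: with the "A grouping ... was excluded from the result" warning enforced, not-in-axis groupers are now kept in as_index=False output, and an unnamed grouper surfaces as a "level_1" column, as the new expected frame in test_as_index above shows. A hedged sketch with hypothetical data:

    import pandas as pd

    df = pd.DataFrame({"cat": ["x", "x", "y"], "A": [1, 2, 3]})  # hypothetical data
    out = df.groupby(["cat", lambda i: i % 2], as_index=False).sum()
    # The callable grouper (mapped over the index) is no longer dropped; its
    # keys appear as an automatically named "level_1" column in the result.
    assert "level_1" in out.columns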
@@ -862,7 +870,10 @@ def test_preserve_categories(): df.groupby("A", sort=False, observed=False).first().index, nosort_index ) - # ordered=False + +def test_preserve_categories_ordered_false(): + # GH-13179 + categories = list("abc") df = DataFrame({"A": Categorical(list("ba"), categories=categories, ordered=False)}) sort_index = CategoricalIndex(categories, categories, ordered=False, name="A") # GH#48749 - don't change order of categories @@ -876,7 +887,8 @@ def test_preserve_categories(): ) -def test_preserve_categorical_dtype(): +@pytest.mark.parametrize("col", ["C1", "C2"]) +def test_preserve_categorical_dtype(col): # GH13743, GH13854 df = DataFrame( { @@ -895,18 +907,15 @@ def test_preserve_categorical_dtype(): "C2": Categorical(list("bac"), categories=list("bac"), ordered=True), } ) - for col in ["C1", "C2"]: - result1 = df.groupby(by=col, as_index=False, observed=False).mean( - numeric_only=True - ) - result2 = ( - df.groupby(by=col, as_index=True, observed=False) - .mean(numeric_only=True) - .reset_index() - ) - expected = exp_full.reindex(columns=result1.columns) - tm.assert_frame_equal(result1, expected) - tm.assert_frame_equal(result2, expected) + result1 = df.groupby(by=col, as_index=False, observed=False).mean(numeric_only=True) + result2 = ( + df.groupby(by=col, as_index=True, observed=False) + .mean(numeric_only=True) + .reset_index() + ) + expected = exp_full.reindex(columns=result1.columns) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, expected) @pytest.mark.parametrize( @@ -961,6 +970,8 @@ def test_categorical_no_compress(): ) tm.assert_series_equal(result, exp) + +def test_categorical_no_compress_string(): cats = Categorical( ["a", "a", "a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c", "d"], @@ -995,8 +1006,8 @@ def test_sort(): # has a sorted x axis # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar') - df = DataFrame({"value": np.random.default_rng(2).integers(0, 10000, 100)}) - labels = [f"{i} - {i+499}" for i in range(0, 10000, 500)] + df = DataFrame({"value": np.random.default_rng(2).integers(0, 10000, 10)}) + labels = [f"{i} - {i + 499}" for i in range(0, 10000, 500)] cat_labels = Categorical(labels, labels) df = df.sort_values(by=["value"], ascending=True) @@ -1256,10 +1267,7 @@ def test_seriesgroupby_observed_true(df_cat, operation): expected = Series(data=[2, 4, 1, 3], index=index, name="C").sort_index() grouped = df_cat.groupby(["A", "B"], observed=True)["C"] - msg = "using np.sum" if operation == "apply" else "using SeriesGroupBy.sum" - with tm.assert_produces_warning(FutureWarning, match=msg): - # GH#53425 - result = getattr(grouped, operation)(sum) + result = getattr(grouped, operation)(sum) tm.assert_series_equal(result, expected) @@ -1276,16 +1284,9 @@ def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation): names=["A", "B"], ).sortlevel() - expected = Series(data=[2, 4, np.nan, 1, np.nan, 3], index=index, name="C") - if operation == "agg": - msg = "The 'downcast' keyword in fillna is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = expected.fillna(0, downcast="infer") + expected = Series(data=[2, 4, 0, 1, 0, 3], index=index, name="C") grouped = df_cat.groupby(["A", "B"], observed=observed)["C"] - msg = "using SeriesGroupBy.sum" if operation == "agg" else "using np.sum" - with tm.assert_produces_warning(FutureWarning, match=msg): - # GH#53425 - result = getattr(grouped, operation)(sum) + result = getattr(grouped, operation)(sum) 
tm.assert_series_equal(result, expected) @@ -1349,22 +1350,6 @@ def test_groupby_categorical_series_dataframe_consistent(df_cat): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("code", [([1, 0, 0]), ([0, 0, 0])]) -def test_groupby_categorical_axis_1(code): - # GH 13420 - df = DataFrame({"a": [1, 2, 3, 4], "b": [-1, -2, -3, -4], "c": [5, 6, 7, 8]}) - cat = Categorical.from_codes(code, categories=list("abc")) - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - gb = df.groupby(cat, axis=1, observed=False) - result = gb.mean() - msg = "The 'axis' keyword in DataFrame.groupby is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - gb2 = df.T.groupby(cat, axis=0, observed=False) - expected = gb2.mean().T - tm.assert_frame_equal(result, expected) - - def test_groupby_cat_preserves_structure(observed, ordered): # GH 28787 df = DataFrame( @@ -1386,11 +1371,7 @@ def test_get_nonexistent_category(): # Accessing a Category that is not in the dataframe df = DataFrame({"var": ["a", "a", "b", "b"], "val": range(4)}) with pytest.raises(KeyError, match="'vau'"): - df.groupby("var").apply( - lambda rows: DataFrame( - {"var": [rows.iloc[-1]["var"]], "val": [rows.iloc[-1]["vau"]]} - ) - ) + df.groupby("var").apply(lambda rows: DataFrame({"val": [rows.iloc[-1]["vau"]]})) def test_series_groupby_on_2_categoricals_unobserved(reduction_func, observed): @@ -1469,18 +1450,21 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( result = agg(*args) - zero_or_nan = _results_for_groupbys_with_missing_categories[reduction_func] + missing_fillin = _results_for_groupbys_with_missing_categories[reduction_func] for idx in unobserved: val = result.loc[idx] - assert (pd.isna(zero_or_nan) and pd.isna(val)) or (val == zero_or_nan) + assert (pd.isna(missing_fillin) and pd.isna(val)) or (val == missing_fillin) # If we expect unobserved values to be zero, we also expect the dtype to be int. # Except for .sum(). If the observed categories sum to dtype=float (i.e. their # sums have decimals), then the zeros for the missing categories should also be # floats. 
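As an aside, the dtype rule that comment describes is easy to see in isolation; a small illustrative sketch (hypothetical data):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        {"cat": pd.Categorical(["a", "a"], categories=["a", "b"]), "val": [1.5, 2.5]}
    )
    gb = df.groupby("cat", observed=False)["val"]

    # Counting ops fill unobserved categories with an integer 0 ...
    assert np.issubdtype(gb.size().dtype, np.integer)
    # ... while sum() keeps the dtype of the observed data, so the
    # fill value for the unobserved "b" is 0.0 here.
    assert gb.sum().dtype == np.float64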
- if zero_or_nan == 0 and reduction_func != "sum": - assert np.issubdtype(result.dtype, np.integer) + if missing_fillin == 0: + if reduction_func in ["count", "nunique", "size"]: + assert np.issubdtype(result.dtype, np.integer) + else: + assert reduction_func in ["sum", "any"] def test_dataframe_groupby_on_2_categoricals_when_observed_is_true(reduction_func): @@ -1503,7 +1487,14 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_true(reduction_fun df_grp = df.groupby(["cat_1", "cat_2"], observed=True) args = get_groupby_method_args(reduction_func, df) - res = getattr(df_grp, reduction_func)(*args) + if reduction_func == "corrwith": + warn = FutureWarning + warn_msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + warn_msg = "" + with tm.assert_produces_warning(warn, match=warn_msg): + res = getattr(df_grp, reduction_func)(*args) for cat in unobserved_cats: assert cat not in res.index @@ -1542,7 +1533,14 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false( getattr(df_grp, reduction_func)(*args) return - res = getattr(df_grp, reduction_func)(*args) + if reduction_func == "corrwith": + warn = FutureWarning + warn_msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + warn_msg = "" + with tm.assert_produces_warning(warn, match=warn_msg): + res = getattr(df_grp, reduction_func)(*args) expected = _results_for_groupbys_with_missing_categories[reduction_func] @@ -1737,10 +1735,7 @@ def test_categorical_transform(): categories=["Waiting", "OnTheWay", "Delivered"], ordered=True ) df["status"] = df["status"].astype(delivery_status_type) - msg = "using SeriesGroupBy.max" - with tm.assert_produces_warning(FutureWarning, match=msg): - # GH#53425 - df["last_status"] = df.groupby("package_id")["status"].transform(max) + df["last_status"] = df.groupby("package_id")["status"].transform(max) result = df.copy() expected = DataFrame( @@ -1755,21 +1750,17 @@ def test_categorical_transform(): "Waiting", ], "last_status": [ - "Delivered", - "Delivered", - "Delivered", - "OnTheWay", - "OnTheWay", + "Waiting", + "Waiting", + "Waiting", + "Waiting", + "Waiting", "Waiting", ], } ) expected["status"] = expected["status"].astype(delivery_status_type) - - # .transform(max) should preserve ordered categoricals - expected["last_status"] = expected["last_status"].astype(delivery_status_type) - tm.assert_frame_equal(result, expected) @@ -1911,7 +1902,7 @@ def test_category_order_reducer( request, as_index, sort, observed, reduction_func, index_kind, ordered ): # GH#48749 - if reduction_func == "corrwith" and not as_index: + if reduction_func == "corrwith" and not as_index and index_kind != "single": msg = "GH#49950 - corrwith with as_index=False may not have grouping column" request.applymarker(pytest.mark.xfail(reason=msg)) elif index_kind != "range" and not as_index: @@ -1941,8 +1932,14 @@ def test_category_order_reducer( ): getattr(gb, reduction_func)(*args) return - - op_result = getattr(gb, reduction_func)(*args) + if reduction_func == "corrwith": + warn = FutureWarning + warn_msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + warn_msg = "" + with tm.assert_produces_warning(warn, match=warn_msg): + op_result = getattr(gb, reduction_func)(*args) if as_index: result = op_result.index.get_level_values("a").categories else: @@ -1976,10 +1973,7 @@ def test_category_order_transformer( df = df.set_index(keys) args = get_groupby_method_args(transformation_func, df) gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed) - 
warn = FutureWarning if transformation_func == "fillna" else None - msg = "DataFrameGroupBy.fillna is deprecated" - with tm.assert_produces_warning(warn, match=msg): - op_result = getattr(gb, transformation_func)(*args) + op_result = getattr(gb, transformation_func)(*args) result = op_result.index.get_level_values("a").categories expected = Index([1, 4, 3, 2]) tm.assert_index_equal(result, expected) @@ -2050,10 +2044,7 @@ def test_category_order_apply(as_index, sort, observed, method, index_kind, orde df["a2"] = df["a"] df = df.set_index(keys) gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed) - warn = FutureWarning if method == "apply" and index_kind == "range" else None - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(warn, match=msg): - op_result = getattr(gb, method)(lambda x: x.sum(numeric_only=True)) + op_result = getattr(gb, method)(lambda x: x.sum(numeric_only=True)) if (method == "transform" or not as_index) and index_kind == "range": result = op_result["a"].cat.categories else: @@ -2106,18 +2097,6 @@ def test_many_categories(as_index, sort, index_kind, ordered): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("cat_columns", ["a", "b", ["a", "b"]]) -@pytest.mark.parametrize("keys", ["a", "b", ["a", "b"]]) -def test_groupby_default_depr(cat_columns, keys): - # GH#43999 - df = DataFrame({"a": [1, 1, 2, 3], "b": [4, 5, 6, 7]}) - df[cat_columns] = df[cat_columns].astype("category") - msg = "The default of observed=False is deprecated" - klass = FutureWarning if set(cat_columns) & set(keys) else None - with tm.assert_produces_warning(klass, match=msg): - df.groupby(keys) - - @pytest.mark.parametrize("test_series", [True, False]) @pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]]) def test_agg_list(request, as_index, observed, reduction_func, test_series, keys): @@ -2128,15 +2107,6 @@ def test_agg_list(request, as_index, observed, reduction_func, test_series, keys elif reduction_func == "corrwith": msg = "GH#32293: attempts to call SeriesGroupBy.corrwith" request.applymarker(pytest.mark.xfail(reason=msg)) - elif ( - reduction_func == "nunique" - and not test_series - and len(keys) != 1 - and not observed - and not as_index - ): - msg = "GH#52848 - raises a ValueError" - request.applymarker(pytest.mark.xfail(reason=msg)) df = DataFrame({"a1": [0, 0, 1], "a2": [2, 3, 3], "b": [4, 5, 6]}) df = df.astype({"a1": "category", "a2": "category"}) diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py index 16d7fe61b90ad..679f7eb7f7f11 100644 --- a/pandas/tests/groupby/test_counting.py +++ b/pandas/tests/groupby/test_counting.py @@ -289,9 +289,7 @@ def test_count(): for key in ["1st", "2nd", ["1st", "2nd"]]: left = df.groupby(key).count() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1) + right = df.groupby(key).apply(DataFrame.count) tm.assert_frame_equal(left, right) @@ -321,19 +319,22 @@ def test_count_object(): expected = Series([3, 3], index=Index([2, 3], name="c"), name="a") tm.assert_series_equal(result, expected) + +def test_count_object_nan(): df = DataFrame({"a": ["a", np.nan, np.nan] + ["b"] * 3, "c": [2] * 3 + [3] * 3}) result = df.groupby("c").a.count() expected = Series([1, 3], index=Index([2, 3], name="c"), name="a") tm.assert_series_equal(result, expected) -def test_count_cross_type(): 
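The split-out test_count_object_nan above pins down that count() excludes missing values group by group; condensed, with the same data as the test:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        {"a": ["a", np.nan, np.nan, "b", "b", "b"], "c": [2, 2, 2, 3, 3, 3]}
    )
    result = df.groupby("c")["a"].count()
    assert list(result) == [1, 3]  # one non-missing "a" for c=2, three for c=3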
+@pytest.mark.parametrize("typ", ["object", "float32"]) +def test_count_cross_type(typ): # GH8169 # Set float64 dtype to avoid upcast when setting nan below vals = np.hstack( ( - np.random.default_rng(2).integers(0, 5, (100, 2)), - np.random.default_rng(2).integers(0, 2, (100, 2)), + np.random.default_rng(2).integers(0, 5, (10, 2)), + np.random.default_rng(2).integers(0, 2, (10, 2)), ) ).astype("float64") @@ -341,11 +342,10 @@ def test_count_cross_type(): df[df == 2] = np.nan expected = df.groupby(["c", "d"]).count() - for t in ["float32", "object"]: - df["a"] = df["a"].astype(t) - df["b"] = df["b"].astype(t) - result = df.groupby(["c", "d"]).count() - tm.assert_frame_equal(result, expected) + df["a"] = df["a"].astype(typ) + df["b"] = df["b"].astype(typ) + result = df.groupby(["c", "d"]).count() + tm.assert_frame_equal(result, expected) def test_lower_int_prec_count(): diff --git a/pandas/tests/groupby/test_cumulative.py b/pandas/tests/groupby/test_cumulative.py index 1bdbef6d50c4c..b0a0414c1feb2 100644 --- a/pandas/tests/groupby/test_cumulative.py +++ b/pandas/tests/groupby/test_cumulative.py @@ -94,21 +94,28 @@ def test_groupby_cumprod_nan_influences_other_columns(): def test_cummin(dtypes_for_minmax): dtype = dtypes_for_minmax[0] - min_val = dtypes_for_minmax[1] # GH 15048 base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) expected_mins = [3, 3, 3, 2, 2, 2, 2, 1] df = base_df.astype(dtype) - expected = DataFrame({"B": expected_mins}).astype(dtype) result = df.groupby("A").cummin() tm.assert_frame_equal(result, expected) result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame() tm.assert_frame_equal(result, expected) - # Test w/ min value for dtype + +def test_cummin_min_value_for_dtype(dtypes_for_minmax): + dtype = dtypes_for_minmax[0] + min_val = dtypes_for_minmax[1] + + # GH 15048 + base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) + expected_mins = [3, 3, 3, 2, 2, 2, 2, 1] + expected = DataFrame({"B": expected_mins}).astype(dtype) + df = base_df.astype(dtype) df.loc[[2, 6], "B"] = min_val df.loc[[1, 5], "B"] = min_val + 1 expected.loc[[2, 3, 6, 7], "B"] = min_val @@ -120,8 +127,10 @@ def test_cummin(dtypes_for_minmax): ) tm.assert_frame_equal(result, expected, check_exact=True) - # Test nan in some values + +def test_cummin_nan_in_some_values(dtypes_for_minmax): # Explicit cast to float to avoid implicit cast when setting nan + base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) base_df = base_df.astype({"B": "float"}) base_df.loc[[0, 2, 4, 6], "B"] = np.nan expected = DataFrame({"B": [np.nan, 4, np.nan, 2, np.nan, 3, np.nan, 1]}) @@ -132,6 +141,8 @@ def test_cummin(dtypes_for_minmax): ) tm.assert_frame_equal(result, expected) + +def test_cummin_datetime(): # GH 15561 df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])}) expected = Series(pd.to_datetime("2001"), index=[0], name="b") @@ -139,6 +150,8 @@ def test_cummin(dtypes_for_minmax): result = df.groupby("a")["b"].cummin() tm.assert_series_equal(expected, result) + +def test_cummin_getattr_series(): # GH 15635 df = DataFrame({"a": [1, 2, 1], "b": [1, 2, 2]}) result = df.groupby("a").b.cummin() @@ -163,7 +176,6 @@ def test_cummin_max_all_nan_column(method, dtype): def test_cummax(dtypes_for_minmax): dtype = dtypes_for_minmax[0] - max_val = dtypes_for_minmax[2] # GH 15048 base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) @@ -177,8 +189,18 @@ def 
test_cummax(dtypes_for_minmax): result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame() tm.assert_frame_equal(result, expected) - # Test w/ max value for dtype + +def test_cummax_max_value_for_dtype(dtypes_for_minmax): + dtype = dtypes_for_minmax[0] + max_val = dtypes_for_minmax[2] + + # GH 15048 + base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) + expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3] + + df = base_df.astype(dtype) df.loc[[2, 6], "B"] = max_val + expected = DataFrame({"B": expected_maxs}).astype(dtype) expected.loc[[2, 3, 6, 7], "B"] = max_val result = df.groupby("A").cummax() tm.assert_frame_equal(result, expected) @@ -187,8 +209,11 @@ def test_cummax(dtypes_for_minmax): ) tm.assert_frame_equal(result, expected) + +def test_cummax_nan_in_some_values(dtypes_for_minmax): # Test nan in some values # Explicit cast to float to avoid implicit cast when setting nan + base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) base_df = base_df.astype({"B": "float"}) base_df.loc[[0, 2, 4, 6], "B"] = np.nan expected = DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]}) @@ -199,6 +224,8 @@ def test_cummax(dtypes_for_minmax): result = df.groupby("a")["b"].cummax() tm.assert_series_equal(expected, result) + +def test_cummax_getattr_series(): # GH 15635 df = DataFrame({"a": [1, 2, 1], "b": [2, 1, 1]}) result = df.groupby("a").b.cummax() @@ -292,28 +321,12 @@ def test_nullable_int_not_cast_as_float(method, dtype, val): tm.assert_frame_equal(result, expected) -def test_cython_api2(): +def test_cython_api2(as_index): # this takes the fast apply path # cumsum (GH5614) + # GH 5755 - cumsum is a transformer and should ignore as_index df = DataFrame([[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=["A", "B", "C"]) expected = DataFrame([[2, np.nan], [np.nan, 9], [4, 9]], columns=["B", "C"]) - result = df.groupby("A").cumsum() - tm.assert_frame_equal(result, expected) - - # GH 5755 - cumsum is a transformer and should ignore as_index - result = df.groupby("A", as_index=False).cumsum() - tm.assert_frame_equal(result, expected) - - # GH 13994 - msg = "DataFrameGroupBy.cumsum with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("A").cumsum(axis=1) - expected = df.cumsum(axis=1) - tm.assert_frame_equal(result, expected) - - msg = "DataFrameGroupBy.cumprod with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("A").cumprod(axis=1) - expected = df.cumprod(axis=1) + result = df.groupby("A", as_index=as_index).cumsum() tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_filters.py b/pandas/tests/groupby/test_filters.py index 309c4b7b57e84..4fe3aac629513 100644 --- a/pandas/tests/groupby/test_filters.py +++ b/pandas/tests/groupby/test_filters.py @@ -85,6 +85,9 @@ def test_filter_out_no_groups(): grouped = s.groupby(grouper) filtered = grouped.filter(lambda x: x.mean() > 0) tm.assert_series_equal(filtered, s) + + +def test_filter_out_no_groups_dataframe(): df = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()}) grouper = df["A"].apply(lambda x: x % 2) grouped = df.groupby(grouper) @@ -100,6 +103,9 @@ def 
test_filter_out_all_groups_in_df(): expected = DataFrame({"a": [np.nan] * 3, "b": [np.nan] * 3}) tm.assert_frame_equal(expected, res) + +def test_filter_out_all_groups_in_df_dropna_true(): + # GH12768 df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]}) res = df.groupby("a") res = res.filter(lambda x: x["b"].sum() > 5, dropna=True) @@ -121,19 +127,6 @@ def raise_if_sum_is_zero(x): grouped.filter(raise_if_sum_is_zero) -def test_filter_with_axis_in_groupby(): - # issue 11041 - index = pd.MultiIndex.from_product([range(10), [0, 1]]) - data = DataFrame(np.arange(100).reshape(-1, 20), columns=index, dtype="int64") - - msg = "DataFrame.groupby with axis=1" - with tm.assert_produces_warning(FutureWarning, match=msg): - gb = data.groupby(level=0, axis=1) - result = gb.filter(lambda x: x.iloc[0, 0] > 10) - expected = data.iloc[:, 12:20] - tm.assert_frame_equal(result, expected) - - def test_filter_bad_shapes(): df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)}) s = df["B"] @@ -192,7 +185,7 @@ def test_filter_pdna_is_false(): def test_filter_against_workaround_ints(): # Series of ints - s = Series(np.random.default_rng(2).integers(0, 100, 100)) + s = Series(np.random.default_rng(2).integers(0, 100, 10)) grouper = s.apply(lambda x: np.round(x, -1)) grouped = s.groupby(grouper) f = lambda x: x.mean() > 10 @@ -204,7 +197,7 @@ def test_filter_against_workaround_ints(): def test_filter_against_workaround_floats(): # Series of floats - s = 100 * Series(np.random.default_rng(2).random(100)) + s = 100 * Series(np.random.default_rng(2).random(10)) grouper = s.apply(lambda x: np.round(x, -1)) grouped = s.groupby(grouper) f = lambda x: x.mean() > 10 @@ -216,13 +209,13 @@ def test_filter_against_workaround_floats(): def test_filter_against_workaround_dataframe(): # Set up DataFrame of ints, floats, strings. letters = np.array(list(ascii_lowercase)) - N = 100 + N = 10 random_letters = letters.take( np.random.default_rng(2).integers(0, 26, N, dtype=int) ) df = DataFrame( { - "ints": Series(np.random.default_rng(2).integers(0, 100, N)), + "ints": Series(np.random.default_rng(2).integers(0, 10, N)), "floats": N / 10 * Series(np.random.default_rng(2).random(N)), "letters": Series(random_letters), } @@ -230,32 +223,32 @@ def test_filter_against_workaround_dataframe(): # Group by ints; filter on floats. grouped = df.groupby("ints") - old_way = df[grouped.floats.transform(lambda x: x.mean() > N / 20).astype("bool")] - new_way = grouped.filter(lambda x: x["floats"].mean() > N / 20) + old_way = df[grouped.floats.transform(lambda x: x.mean() > N / 2).astype("bool")] + new_way = grouped.filter(lambda x: x["floats"].mean() > N / 2) tm.assert_frame_equal(new_way, old_way) # Group by floats (rounded); filter on strings. grouper = df.floats.apply(lambda x: np.round(x, -1)) grouped = df.groupby(grouper) - old_way = df[grouped.letters.transform(lambda x: len(x) < N / 10).astype("bool")] - new_way = grouped.filter(lambda x: len(x.letters) < N / 10) + old_way = df[grouped.letters.transform(lambda x: len(x) < N / 2).astype("bool")] + new_way = grouped.filter(lambda x: len(x.letters) < N / 2) tm.assert_frame_equal(new_way, old_way) # Group by strings; filter on ints. 
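Each of these old_way/new_way comparisons checks the same identity: filter(pred) must equal boolean row selection through a broadcast transform. Stripped down to a sketch with hypothetical data:

    import pandas as pd

    df = pd.DataFrame({"key": ["x", "y", "y"], "ints": [1, 9, 9]})
    grouped = df.groupby("key")

    old_way = df[grouped["ints"].transform(lambda x: x.mean() > 5).astype(bool)]
    new_way = grouped.filter(lambda x: x["ints"].mean() > 5)
    # old_way and new_way select the same rows (here, the "y" group)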
grouped = df.groupby("letters") - old_way = df[grouped.ints.transform(lambda x: x.mean() > N / 20).astype("bool")] - new_way = grouped.filter(lambda x: x["ints"].mean() > N / 20) + old_way = df[grouped.ints.transform(lambda x: x.mean() > N / 2).astype("bool")] + new_way = grouped.filter(lambda x: x["ints"].mean() > N / 2) tm.assert_frame_equal(new_way, old_way) def test_filter_using_len(): - # BUG GH4447 + # GH 4447 df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)}) grouped = df.groupby("B") actual = grouped.filter(lambda x: len(x) > 2) expected = DataFrame( {"A": np.arange(2, 6), "B": list("bbbb"), "C": np.arange(2, 6)}, - index=np.arange(2, 6, dtype=np.int64), + index=range(2, 6), ) tm.assert_frame_equal(actual, expected) @@ -263,11 +256,13 @@ def test_filter_using_len(): expected = df.loc[[]] tm.assert_frame_equal(actual, expected) - # Series have always worked properly, but we'll test anyway. - s = df["B"] + +def test_filter_using_len_series(): + # GH 4447 + s = Series(list("aabbbbcc"), name="B") grouped = s.groupby(s) actual = grouped.filter(lambda x: len(x) > 2) - expected = Series(4 * ["b"], index=np.arange(2, 6, dtype=np.int64), name="B") + expected = Series(4 * ["b"], index=range(2, 6), name="B") tm.assert_series_equal(actual, expected) actual = grouped.filter(lambda x: len(x) > 4) @@ -275,10 +270,14 @@ def test_filter_using_len(): tm.assert_series_equal(actual, expected) -def test_filter_maintains_ordering(): - # Simple case: index is sequential. #4621 +@pytest.mark.parametrize( + "index", [range(8), range(7, -1, -1), [0, 2, 1, 3, 4, 6, 5, 7]] +) +def test_filter_maintains_ordering(index): + # GH 4621 df = DataFrame( - {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]} + {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}, + index=index, ) s = df["pid"] grouped = df.groupby("tag") @@ -291,33 +290,6 @@ def test_filter_maintains_ordering(): expected = s.iloc[[1, 2, 4, 7]] tm.assert_series_equal(actual, expected) - # Now index is sequentially decreasing. - df.index = np.arange(len(df) - 1, -1, -1) - s = df["pid"] - grouped = df.groupby("tag") - actual = grouped.filter(lambda x: len(x) > 1) - expected = df.iloc[[1, 2, 4, 7]] - tm.assert_frame_equal(actual, expected) - - grouped = s.groupby(df["tag"]) - actual = grouped.filter(lambda x: len(x) > 1) - expected = s.iloc[[1, 2, 4, 7]] - tm.assert_series_equal(actual, expected) - - # Index is shuffled. 
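The index parametrization above absorbs the removed variants below: whatever the index order (increasing, decreasing, shuffled), filter returns the surviving rows in their original positions. A sketch, assuming a toy frame:

    import pandas as pd

    df = pd.DataFrame(
        {"pid": [1, 1, 2], "tag": [23, 45, 45]},
        index=[7, 3, 5],  # deliberately non-monotonic
    )
    result = df.groupby("tag").filter(lambda x: len(x) > 1)
    assert list(result.index) == [3, 5]  # original labels and order preserved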
- SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3] - df.index = df.index[SHUFFLED] - s = df["pid"] - grouped = df.groupby("tag") - actual = grouped.filter(lambda x: len(x) > 1) - expected = df.iloc[[1, 2, 4, 7]] - tm.assert_frame_equal(actual, expected) - - grouped = s.groupby(df["tag"]) - actual = grouped.filter(lambda x: len(x) > 1) - expected = s.iloc[[1, 2, 4, 7]] - tm.assert_series_equal(actual, expected) - def test_filter_multiple_timestamp(): # GH 10114 diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 4c903e691add1..4b1f23c1f755e 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -6,14 +6,9 @@ import numpy as np import pytest -from pandas.errors import ( - PerformanceWarning, - SpecificationError, -) +from pandas.errors import SpecificationError import pandas.util._test_decorators as td -from pandas.core.dtypes.common import is_string_dtype - import pandas as pd from pandas import ( Categorical, @@ -39,103 +34,10 @@ def test_repr(): # GH18203 result = repr(Grouper(key="A", level="B")) - expected = "Grouper(key='A', level='B', axis=0, sort=False, dropna=True)" + expected = "Grouper(key='A', level='B', sort=False, dropna=True)" assert result == expected -def test_groupby_std_datetimelike(warn_copy_on_write): - # GH#48481 - tdi = pd.timedelta_range("1 Day", periods=10000) - ser = Series(tdi) - ser[::5] *= 2 # get different std for different groups - - df = ser.to_frame("A").copy() - - df["B"] = ser + Timestamp(0) - df["C"] = ser + Timestamp(0, tz="UTC") - df.iloc[-1] = pd.NaT # last group includes NaTs - - gb = df.groupby(list(range(5)) * 2000) - - result = gb.std() - - # Note: this does not _exactly_ match what we would get if we did - # [gb.get_group(i).std() for i in gb.groups] - # but it _does_ match the floating point error we get doing the - # same operation on int64 data xref GH#51332 - td1 = Timedelta("2887 days 11:21:02.326710176") - td4 = Timedelta("2886 days 00:42:34.664668096") - exp_ser = Series([td1 * 2, td1, td1, td1, td4], index=np.arange(5)) - expected = DataFrame({"A": exp_ser, "B": exp_ser, "C": exp_ser}) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("dtype", ["int64", "int32", "float64", "float32"]) -def test_basic_aggregations(dtype): - data = Series(np.arange(9) // 3, index=np.arange(9), dtype=dtype) - - index = np.arange(9) - np.random.default_rng(2).shuffle(index) - data = data.reindex(index) - - grouped = data.groupby(lambda x: x // 3, group_keys=False) - - for k, v in grouped: - assert len(v) == 3 - - msg = "using SeriesGroupBy.mean" - with tm.assert_produces_warning(FutureWarning, match=msg): - agged = grouped.aggregate(np.mean) - assert agged[1] == 1 - - msg = "using SeriesGroupBy.mean" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = grouped.agg(np.mean) - tm.assert_series_equal(agged, expected) # shorthand - tm.assert_series_equal(agged, grouped.mean()) - result = grouped.sum() - msg = "using SeriesGroupBy.sum" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = grouped.agg(np.sum) - tm.assert_series_equal(result, expected) - - expected = grouped.apply(lambda x: x * x.sum()) - transformed = grouped.transform(lambda x: x * x.sum()) - assert transformed[7] == 12 - tm.assert_series_equal(transformed, expected) - - value_grouped = data.groupby(data) - msg = "using SeriesGroupBy.mean" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = value_grouped.aggregate(np.mean) - 
tm.assert_series_equal(result, agged, check_index_type=False) - - # complex agg - msg = "using SeriesGroupBy.[mean|std]" - with tm.assert_produces_warning(FutureWarning, match=msg): - agged = grouped.aggregate([np.mean, np.std]) - - msg = r"nested renamer is not supported" - with pytest.raises(SpecificationError, match=msg): - grouped.aggregate({"one": np.mean, "two": np.std}) - - group_constants = {0: 10, 1: 20, 2: 30} - msg = ( - "Pinning the groupby key to each group in SeriesGroupBy.agg is deprecated, " - "and cases that relied on it will raise in a future version" - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - # GH#41090 - agged = grouped.agg(lambda x: group_constants[x.name] + x.mean()) - assert agged[1] == 21 - - # corner cases - msg = "Must produce aggregated value" - # exception raised is type Exception - with pytest.raises(Exception, match=msg): - grouped.aggregate(lambda x: x * 2) - - def test_groupby_nonobject_dtype(multiindex_dataframe_random_data): key = multiindex_dataframe_random_data.index.codes[0] grouped = multiindex_dataframe_random_data.groupby(key) @@ -162,87 +64,13 @@ def test_groupby_nonobject_dtype_mixed(): def max_value(group): return group.loc[group["value"].idxmax()] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - applied = df.groupby("A").apply(max_value) + applied = df.groupby("A").apply(max_value) result = applied.dtypes - expected = df.dtypes + expected = df.drop(columns="A").dtypes tm.assert_series_equal(result, expected) -def test_inconsistent_return_type(): - # GH5592 - # inconsistent return type - df = DataFrame( - { - "A": ["Tiger", "Tiger", "Tiger", "Lamb", "Lamb", "Pony", "Pony"], - "B": Series(np.arange(7), dtype="int64"), - "C": date_range("20130101", periods=7), - } - ) - - def f_0(grp): - return grp.iloc[0] - - expected = df.groupby("A").first()[["B"]] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("A").apply(f_0)[["B"]] - tm.assert_frame_equal(result, expected) - - def f_1(grp): - if grp.name == "Tiger": - return None - return grp.iloc[0] - - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("A").apply(f_1)[["B"]] - e = expected.copy() - e.loc["Tiger"] = np.nan - tm.assert_frame_equal(result, e) - - def f_2(grp): - if grp.name == "Pony": - return None - return grp.iloc[0] - - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("A").apply(f_2)[["B"]] - e = expected.copy() - e.loc["Pony"] = np.nan - tm.assert_frame_equal(result, e) - - # 5592 revisited, with datetimes - def f_3(grp): - if grp.name == "Pony": - return None - return grp.iloc[0] - - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("A").apply(f_3)[["C"]] - e = df.groupby("A").first()[["C"]] - e.loc["Pony"] = pd.NaT - tm.assert_frame_equal(result, e) - - # scalar outputs - def f_4(grp): - if grp.name == "Pony": - return None - return grp.iloc[0].loc["C"] - - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("A").apply(f_4) - e = df.groupby("A").first()["C"].copy() - e.loc["Pony"] = np.nan - e.name = None - 
tm.assert_series_equal(result, e) - - -def test_pass_args_kwargs(ts, tsframe): +def test_pass_args_kwargs(ts): def f(x, q=None, axis=0): return np.percentile(x, q, axis=axis) @@ -268,54 +96,31 @@ def f(x, q=None, axis=0): tm.assert_series_equal(apply_result, agg_expected) tm.assert_series_equal(trans_result, trans_expected) - # DataFrame - for as_index in [True, False]: - df_grouped = tsframe.groupby(lambda x: x.month, as_index=as_index) - warn = None if as_index else FutureWarning - msg = "A grouping .* was excluded from the result" - with tm.assert_produces_warning(warn, match=msg): - agg_result = df_grouped.agg(np.percentile, 80, axis=0) - with tm.assert_produces_warning(warn, match=msg): - apply_result = df_grouped.apply(DataFrame.quantile, 0.8) - with tm.assert_produces_warning(warn, match=msg): - expected = df_grouped.quantile(0.8) - tm.assert_frame_equal(apply_result, expected, check_names=False) - tm.assert_frame_equal(agg_result, expected) - - apply_result = df_grouped.apply(DataFrame.quantile, [0.4, 0.8]) - with tm.assert_produces_warning(warn, match=msg): - expected_seq = df_grouped.quantile([0.4, 0.8]) - tm.assert_frame_equal(apply_result, expected_seq, check_names=False) - - with tm.assert_produces_warning(warn, match=msg): - agg_result = df_grouped.agg(f, q=80) - with tm.assert_produces_warning(warn, match=msg): - apply_result = df_grouped.apply(DataFrame.quantile, q=0.8) - tm.assert_frame_equal(agg_result, expected) - tm.assert_frame_equal(apply_result, expected, check_names=False) - - -@pytest.mark.parametrize("as_index", [True, False]) -def test_pass_args_kwargs_duplicate_columns(tsframe, as_index): - # go through _aggregate_frame with self.axis == 0 and duplicate columns - tsframe.columns = ["A", "B", "A", "C"] - gb = tsframe.groupby(lambda x: x.month, as_index=as_index) - - warn = None if as_index else FutureWarning - msg = "A grouping .* was excluded from the result" - with tm.assert_produces_warning(warn, match=msg): - res = gb.agg(np.percentile, 80, axis=0) - - ex_data = { - 1: tsframe[tsframe.index.month == 1].quantile(0.8), - 2: tsframe[tsframe.index.month == 2].quantile(0.8), - } - expected = DataFrame(ex_data).T + +def test_pass_args_kwargs_dataframe(tsframe, as_index): + def f(x, q=None, axis=0): + return np.percentile(x, q, axis=axis) + + df_grouped = tsframe.groupby(lambda x: x.month, as_index=as_index) + agg_result = df_grouped.agg(np.percentile, 80, axis=0) + apply_result = df_grouped.apply(DataFrame.quantile, 0.8) + expected = df_grouped.quantile(0.8) + tm.assert_frame_equal(apply_result, expected, check_names=False) + tm.assert_frame_equal(agg_result, expected) + + apply_result = df_grouped.apply(DataFrame.quantile, [0.4, 0.8]) + expected_seq = df_grouped.quantile([0.4, 0.8]) if not as_index: - # TODO: try to get this more consistent? 
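The consolidated test_pass_args_kwargs_dataframe above relies on agg and apply forwarding extra positional and keyword arguments to the supplied function; in isolation (a sketch on toy data):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"g": [1, 1, 2, 2], "v": [1.0, 3.0, 5.0, 7.0]})
    gb = df.groupby("g")["v"]

    # Both spellings hand the extra argument through to the callable:
    a = gb.agg(np.percentile, 80)      # positional, as in agg(np.percentile, 80, axis=0)
    b = gb.apply(np.percentile, q=80)  # keyword
    # a and b hold the same per-group percentiles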
- expected.index = Index(range(2)) + # apply treats the op as a transform; .quantile knows it's a reduction + apply_result.index = range(4) + apply_result.insert(loc=0, column="level_0", value=[1, 1, 2, 2]) + apply_result.insert(loc=1, column="level_1", value=[0.4, 0.8, 0.4, 0.8]) + tm.assert_frame_equal(apply_result, expected_seq, check_names=False) - tm.assert_frame_equal(res, expected) + agg_result = df_grouped.agg(f, q=80) + apply_result = df_grouped.apply(DataFrame.quantile, q=0.8) + tm.assert_frame_equal(agg_result, expected) + tm.assert_frame_equal(apply_result, expected, check_names=False) def test_len(): @@ -337,7 +142,39 @@ def test_len_nan_group(): df = DataFrame({"a": [np.nan] * 3, "b": [1, 2, 3]}) assert len(df.groupby("a")) == 0 assert len(df.groupby("b")) == 3 - assert len(df.groupby(["a", "b"])) == 3 + assert len(df.groupby(["a", "b"])) == 0 + + +def test_groupby_timedelta_median(): + # issue 57926 + expected = Series(data=Timedelta("1D"), index=["foo"]) + df = DataFrame({"label": ["foo", "foo"], "timedelta": [pd.NaT, Timedelta("1D")]}) + gb = df.groupby("label")["timedelta"] + actual = gb.median() + tm.assert_series_equal(actual, expected, check_names=False) + + +@pytest.mark.parametrize("keys", [["a"], ["a", "b"]]) +def test_len_categorical(dropna, observed, keys): + # GH#57595 + df = DataFrame( + { + "a": Categorical([1, 1, 2, np.nan], categories=[1, 2, 3]), + "b": Categorical([1, 1, 2, np.nan], categories=[1, 2, 3]), + "c": 1, + } + ) + gb = df.groupby(keys, observed=observed, dropna=dropna) + result = len(gb) + if observed and dropna: + expected = 2 + elif observed and not dropna: + expected = 3 + elif len(keys) == 1: + expected = 3 if dropna else 4 + else: + expected = 9 if dropna else 16 + assert result == expected, f"{result} vs {expected}" def test_basic_regression(): @@ -351,36 +188,6 @@ def test_basic_regression(): grouped.mean() -@pytest.mark.parametrize( - "dtype", ["float64", "float32", "int64", "int32", "int16", "int8"] -) -def test_with_na_groups(dtype): - index = Index(np.arange(10)) - values = Series(np.ones(10), index, dtype=dtype) - labels = Series( - [np.nan, "foo", "bar", "bar", np.nan, np.nan, "bar", "bar", np.nan, "foo"], - index=index, - ) - - # this SHOULD be an int - grouped = values.groupby(labels) - agged = grouped.agg(len) - expected = Series([4, 2], index=["bar", "foo"]) - - tm.assert_series_equal(agged, expected, check_dtype=False) - - # assert issubclass(agged.dtype.type, np.integer) - - # explicitly return a float from my function - def f(x): - return float(len(x)) - - agged = grouped.agg(f) - expected = Series([4.0, 2.0], index=["bar", "foo"]) - - tm.assert_series_equal(agged, expected) - - def test_indices_concatenation_order(): # GH 2808 @@ -417,14 +224,9 @@ def f3(x): df2 = DataFrame({"a": [3, 2, 2, 2], "b": range(4), "c": range(5, 9)}) - depr_msg = "The behavior of array concatenation with empty entries is deprecated" - # correct result - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result1 = df.groupby("a").apply(f1) - with tm.assert_produces_warning(FutureWarning, match=msg): - result2 = df2.groupby("a").apply(f1) + result1 = df.groupby("a").apply(f1) + result2 = df2.groupby("a").apply(f1) tm.assert_frame_equal(result1, result2) # should fail (not the same number of levels) @@ -438,8 +240,7 @@ def f3(x): with pytest.raises(AssertionError, match=msg): df.groupby("a").apply(f3) with pytest.raises(AssertionError, match=msg): - with 
tm.assert_produces_warning(FutureWarning, match=depr_msg): - df2.groupby("a").apply(f3) + df2.groupby("a").apply(f3) def test_attr_wrapper(ts): @@ -463,7 +264,7 @@ def test_attr_wrapper(ts): # make sure raises error msg = "'SeriesGroupBy' object has no attribute 'foo'" with pytest.raises(AttributeError, match=msg): - getattr(grouped, "foo") + grouped.foo def test_frame_groupby(tsframe): @@ -506,29 +307,6 @@ def test_frame_groupby(tsframe): assert (samething == v).all() -def test_frame_groupby_columns(tsframe): - mapping = {"A": 0, "B": 0, "C": 1, "D": 1} - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - grouped = tsframe.groupby(mapping, axis=1) - - # aggregate - aggregated = grouped.aggregate("mean") - assert len(aggregated) == len(tsframe) - assert len(aggregated.columns) == 2 - - # transform - tf = lambda x: x - x.mean() - msg = "The 'axis' keyword in DataFrame.groupby is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - groupedT = tsframe.T.groupby(mapping, axis=0) - tm.assert_frame_equal(groupedT.transform(tf).T, grouped.transform(tf)) - - # iterate - for k, v in grouped: - assert len(v.columns) == 2 - - def test_frame_set_name_single(df): grouped = df.groupby("A") @@ -640,7 +418,7 @@ def test_frame_multi_key_function_list(): tm.assert_frame_equal(agged, expected) -def test_frame_multi_key_function_list_partial_failure(): +def test_frame_multi_key_function_list_partial_failure(using_infer_string): data = DataFrame( { "A": [ @@ -691,6 +469,8 @@ def test_frame_multi_key_function_list_partial_failure(): grouped = data.groupby(["A", "B"]) funcs = ["mean", "std"] msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): grouped.agg(funcs) @@ -738,22 +518,10 @@ def test_as_index_select_column(): result = df.groupby("A", as_index=False, group_keys=True)["B"].apply( lambda x: x.cumsum() ) - expected = Series( - [2, 6, 6], name="B", index=MultiIndex.from_tuples([(0, 0), (0, 1), (1, 2)]) - ) + expected = Series([2, 6, 6], name="B", index=range(3)) tm.assert_series_equal(result, expected) -def test_obj_arg_get_group_deprecated(): - depr_msg = "obj is deprecated" - - df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]}) - expected = df.iloc[df.groupby("b").indices.get(4)] - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - result = df.groupby("b").get_group(4, obj=df) - tm.assert_frame_equal(result, expected) - - def test_groupby_as_index_select_column_sum_empty_df(): # GH 35246 df = DataFrame(columns=Index(["A", "B", "C"], name="alpha")) @@ -765,73 +533,6 @@ def test_groupby_as_index_select_column_sum_empty_df(): tm.assert_frame_equal(left, expected) -def test_groupby_as_index_agg(df): - grouped = df.groupby("A", as_index=False) - - # single-key - - result = grouped[["C", "D"]].agg("mean") - expected = grouped.mean(numeric_only=True) - tm.assert_frame_equal(result, expected) - - result2 = grouped.agg({"C": "mean", "D": "sum"}) - expected2 = grouped.mean(numeric_only=True) - expected2["D"] = grouped.sum()["D"] - tm.assert_frame_equal(result2, expected2) - - grouped = df.groupby("A", as_index=True) - - msg = r"nested renamer is not supported" - with pytest.raises(SpecificationError, match=msg): - grouped["C"].agg({"Q": "sum"}) - - # multi-key - - grouped = df.groupby(["A", "B"], as_index=False) - - result = grouped.agg("mean") - expected = grouped.mean() - 
tm.assert_frame_equal(result, expected) - - result2 = grouped.agg({"C": "mean", "D": "sum"}) - expected2 = grouped.mean() - expected2["D"] = grouped.sum()["D"] - tm.assert_frame_equal(result2, expected2) - - expected3 = grouped["C"].sum() - expected3 = DataFrame(expected3).rename(columns={"C": "Q"}) - msg = "Passing a dictionary to SeriesGroupBy.agg is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result3 = grouped["C"].agg({"Q": "sum"}) - tm.assert_frame_equal(result3, expected3) - - # GH7115 & GH8112 & GH8582 - df = DataFrame( - np.random.default_rng(2).integers(0, 100, (50, 3)), - columns=["jim", "joe", "jolie"], - ) - ts = Series(np.random.default_rng(2).integers(5, 10, 50), name="jim") - - gr = df.groupby(ts) - gr.nth(0) # invokes set_selection_from_grouper internally - - msg = "The behavior of DataFrame.sum with axis=None is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False): - res = gr.apply(sum) - with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False): - alt = df.groupby(ts).apply(sum) - tm.assert_frame_equal(res, alt) - - for attr in ["mean", "max", "count", "idxmax", "cumsum", "all"]: - gr = df.groupby(ts, as_index=False) - left = getattr(gr, attr)() - - gr = df.groupby(ts.values, as_index=True) - right = getattr(gr, attr)().reset_index(drop=True) - - tm.assert_frame_equal(left, right) - - def test_ops_not_as_index(reduction_func): # GH 10355, 21090 # Using as_index=False should not modify grouped column @@ -933,18 +634,6 @@ def test_groupby_as_index_series_scalar(df): tm.assert_frame_equal(result, expected) -def test_groupby_as_index_corner(df, ts): - msg = "as_index=False only valid with DataFrame" - with pytest.raises(TypeError, match=msg): - ts.groupby(lambda x: x.weekday(), as_index=False) - - msg = "as_index=False only valid for axis=0" - depr_msg = "DataFrame.groupby with axis=1 is deprecated" - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - df.groupby(lambda x: x.lower(), as_index=False, axis=1) - - def test_groupby_multiple_key(): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), @@ -955,19 +644,6 @@ def test_groupby_multiple_key(): agged = grouped.sum() tm.assert_almost_equal(df.values, agged.values) - depr_msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - grouped = df.T.groupby( - [lambda x: x.year, lambda x: x.month, lambda x: x.day], axis=1 - ) - - agged = grouped.agg(lambda x: x.sum()) - tm.assert_index_equal(agged.index, df.columns) - tm.assert_almost_equal(df.T.values, agged.values) - - agged = grouped.agg(lambda x: x.sum()) - tm.assert_almost_equal(df.T.values, agged.values) - def test_groupby_multi_corner(df): # test that having an all-NA column doesn't mess you up @@ -981,9 +657,11 @@ def test_groupby_multi_corner(df): tm.assert_frame_equal(agged, expected) -def test_raises_on_nuisance(df): +def test_raises_on_nuisance(df, using_infer_string): grouped = df.groupby("A") msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): grouped.agg("mean") with pytest.raises(TypeError, match=msg): @@ -992,20 +670,12 @@ def test_raises_on_nuisance(df): df = df.loc[:, ["A", "C", "D"]] df["E"] = datetime.now() grouped = df.groupby("A") - msg = "datetime64 type does not support sum operations" + msg = 
"datetime64 type does not support operation 'sum'" with pytest.raises(TypeError, match=msg): grouped.agg("sum") with pytest.raises(TypeError, match=msg): grouped.sum() - # won't work with axis = 1 - depr_msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - grouped = df.groupby({"A": 0, "C": 0, "D": 1, "E": 1}, axis=1) - msg = "does not support reduction 'sum'" - with pytest.raises(TypeError, match=msg): - grouped.agg(lambda x: x.sum(0, numeric_only=False)) - @pytest.mark.parametrize( "agg_function", @@ -1026,7 +696,7 @@ def test_keep_nuisance_agg(df, agg_function): ["sum", "mean", "prod", "std", "var", "sem", "median"], ) @pytest.mark.parametrize("numeric_only", [True, False]) -def test_omit_nuisance_agg(df, agg_function, numeric_only): +def test_omit_nuisance_agg(df, agg_function, numeric_only, using_infer_string): # GH 38774, GH 38815 grouped = df.groupby("A") @@ -1034,7 +704,10 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only): if agg_function in no_drop_nuisance and not numeric_only: # Added numeric_only as part of GH#46560; these do not drop nuisance # columns when numeric_only is False - if agg_function in ("std", "sem"): + if using_infer_string: + msg = f"dtype 'str' does not support operation '{agg_function}'" + klass = TypeError + elif agg_function in ("std", "sem"): klass = ValueError msg = "could not convert string to float: 'one'" else: @@ -1055,16 +728,24 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only): tm.assert_frame_equal(result, expected) -def test_raise_on_nuisance_python_single(df): +def test_raise_on_nuisance_python_single(df, using_infer_string): # GH 38815 grouped = df.groupby("A") - with pytest.raises(ValueError, match="could not convert"): + + err = ValueError + msg = "could not convert" + if using_infer_string: + err = TypeError + msg = "dtype 'str' does not support operation 'skew'" + with pytest.raises(err, match=msg): grouped.skew() -def test_raise_on_nuisance_python_multiple(three_group): +def test_raise_on_nuisance_python_multiple(three_group, using_infer_string): grouped = three_group.groupby(["A", "B"]) msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): grouped.agg("mean") with pytest.raises(TypeError, match=msg): @@ -1102,12 +783,16 @@ def test_nonsense_func(): df.groupby(lambda x: x + "foo") -def test_wrap_aggregated_output_multindex(multiindex_dataframe_random_data): +def test_wrap_aggregated_output_multindex( + multiindex_dataframe_random_data, using_infer_string +): df = multiindex_dataframe_random_data.T df["baz", "two"] = "peekaboo" keys = [np.array([0, 0, 1]), np.array([0, 0, 1])] msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): df.groupby(keys).agg("mean") agged = df.drop(columns=("baz", "two")).groupby(keys).agg("mean") @@ -1158,7 +843,7 @@ def test_groupby_level_mapper(multiindex_dataframe_random_data): def test_groupby_level_nonmulti(): # GH 1313, GH 13901 s = Series([1, 2, 3, 10, 4, 5, 20, 6], Index([1, 2, 3, 1, 4, 5, 2, 6], name="foo")) - expected = Series([11, 22, 3, 4, 5, 6], Index(range(1, 7), name="foo")) + expected = Series([11, 22, 3, 4, 5, 6], Index(list(range(1, 7)), name="foo")) result = s.groupby(level=0).sum() tm.assert_series_equal(result, expected) @@ -1190,7 +875,7 @@ def 
test_groupby_level_nonmulti(): def test_groupby_complex(): # GH 12902 a = Series(data=np.arange(4) * (1 + 2j), index=[0, 0, 1, 1]) - expected = Series((1 + 2j, 5 + 10j)) + expected = Series((1 + 2j, 5 + 10j), index=Index([0, 1])) result = a.groupby(level=0).sum() tm.assert_series_equal(result, expected) @@ -1214,7 +899,7 @@ def test_groupby_complex_mean(): tm.assert_frame_equal(result, expected) -def test_groupby_complex_numbers(using_infer_string): +def test_groupby_complex_numbers(): # GH 17927 df = DataFrame( [ @@ -1223,11 +908,10 @@ def test_groupby_complex_numbers(using_infer_string): {"a": 4, "b": 1}, ] ) - dtype = "string[pyarrow_numpy]" if using_infer_string else object expected = DataFrame( np.array([1, 1, 1], dtype=np.int64), index=Index([(1 + 1j), (1 + 2j), (1 + 0j)], name="b"), - columns=Index(["a"], dtype=dtype), + columns=Index(["a"]), ) result = df.groupby("b", sort=False).count() tm.assert_frame_equal(result, expected) @@ -1273,24 +957,12 @@ def test_groupby_with_hier_columns(): result = df.groupby(level=0).mean() tm.assert_index_equal(result.columns, columns) - depr_msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - gb = df.groupby(level=0, axis=1) - result = gb.mean() - tm.assert_index_equal(result.index, df.index) - result = df.groupby(level=0).agg("mean") tm.assert_index_equal(result.columns, columns) result = df.groupby(level=0).apply(lambda x: x.mean()) tm.assert_index_equal(result.columns, columns) - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - gb = df.groupby(level=0, axis=1) - result = gb.agg(lambda x: x.mean(1)) - tm.assert_index_equal(result.columns, Index(["A", "B"])) - tm.assert_index_equal(result.index, df.index) - # add a nuisance column sorted_columns, _ = columns.sortlevel(0) df["A", "foo"] = "bar" @@ -1300,8 +972,10 @@ def test_groupby_with_hier_columns(): def test_grouping_ndarray(df): grouped = df.groupby(df["A"].values) + grouped2 = df.groupby(df["A"].rename(None)) + result = grouped.sum() - expected = df.groupby(df["A"].rename(None)).sum() + expected = grouped2.sum() tm.assert_frame_equal(result, expected) @@ -1374,17 +1048,13 @@ def summarize_random_name(df): # Provide a different name for each Series. In this case, groupby # should not attempt to propagate the Series name since they are # inconsistent. 
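In other words: when every per-group Series carries the same name, that name becomes the columns name of the combined result, while inconsistent names propagate nothing. A sketch with a hypothetical helper:

    import pandas as pd

    df = pd.DataFrame({"g": ["a", "a", "b"], "v": [1.0, 2.0, 3.0]})

    def summarize(group, name=None):
        return pd.Series({"count": len(group), "mean": group["v"].mean()}, name=name)

    metrics = df.groupby("g").apply(summarize, "metrics")
    assert metrics.columns.name == "metrics"  # the shared name is propagated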
- return Series({"count": 1, "mean": 2, "omissions": 3}, name=df.iloc[0]["A"]) + return Series({"count": 1, "mean": 2, "omissions": 3}, name=df.iloc[0]["C"]) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - metrics = df.groupby("A").apply(summarize) + metrics = df.groupby("A").apply(summarize) assert metrics.columns.name is None - with tm.assert_produces_warning(FutureWarning, match=msg): - metrics = df.groupby("A").apply(summarize, "metrics") + metrics = df.groupby("A").apply(summarize, "metrics") assert metrics.columns.name == "metrics" - with tm.assert_produces_warning(FutureWarning, match=msg): - metrics = df.groupby("A").apply(summarize_random_name) + metrics = df.groupby("A").apply(summarize_random_name) assert metrics.columns.name is None @@ -1547,7 +1217,10 @@ def test_groupby_nat_exclude(): ) grouped = df.groupby("dt") - expected = [Index([1, 7]), Index([3, 5])] + expected = [ + RangeIndex(start=1, stop=13, step=6), + RangeIndex(start=3, stop=7, step=2), + ] keys = sorted(grouped.groups.keys()) assert len(keys) == 2 for k, e in zip(keys, expected): @@ -1577,7 +1250,7 @@ def test_groupby_nat_exclude(): {"nan": [np.nan, np.nan, np.nan], "nat": [pd.NaT, pd.NaT, pd.NaT]} ) assert nan_df["nan"].dtype == "float64" - assert nan_df["nat"].dtype == "datetime64[ns]" + assert nan_df["nat"].dtype == "datetime64[s]" for key in ["nan", "nat"]: grouped = nan_df.groupby(key) @@ -1677,10 +1350,8 @@ def test_dont_clobber_name_column(): {"key": ["a", "a", "a", "b", "b", "b"], "name": ["foo", "bar", "baz"] * 2} ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("key", group_keys=False).apply(lambda x: x) - tm.assert_frame_equal(result, df) + result = df.groupby("key", group_keys=False).apply(lambda x: x) + tm.assert_frame_equal(result, df[["name"]]) def test_skip_group_keys(): @@ -1742,18 +1413,14 @@ def g(group): @pytest.mark.parametrize("grouper", ["A", ["A", "B"]]) -def test_set_group_name(df, grouper, using_infer_string): +def test_set_group_name(df, grouper): def f(group): assert group.name is not None return group def freduce(group): assert group.name is not None - if using_infer_string and grouper == "A" and is_string_dtype(group.dtype): - with pytest.raises(TypeError, match="does not support"): - group.sum() - else: - return group.sum() + return group.sum() def freducex(x): return freduce(x) @@ -1761,9 +1428,7 @@ def freducex(x): grouped = df.groupby(grouper, group_keys=False) # make sure all these work - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - grouped.apply(f) + grouped.apply(f) grouped.aggregate(freduce) grouped.aggregate({"C": freduce, "D": freduce}) grouped.transform(f) @@ -1784,10 +1449,7 @@ def f(group): names.append(group.name) return group.copy() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - df.groupby("a", sort=False, group_keys=False).apply(f) - + df.groupby("a", sort=False, group_keys=False).apply(f) expected_names = [0, 1, 2] assert names == expected_names @@ -1797,8 +1459,8 @@ def test_no_dummy_key_names(df): result = df.groupby(df["A"].values).sum() assert result.index.name is None - result = df.groupby([df["A"].values, df["B"].values]).sum() - assert result.index.names == (None, None) + result2 = df.groupby([df["A"].values, 
df["B"].values]).sum() + assert result2.index.names == (None, None) def test_groupby_sort_multiindex_series(): @@ -1870,7 +1532,7 @@ def test_groupby_multiindex_missing_pair(): tm.assert_frame_equal(res, exp) -def test_groupby_multiindex_not_lexsorted(): +def test_groupby_multiindex_not_lexsorted(performance_warning): # GH 11640 # define the lexsorted version @@ -1891,7 +1553,7 @@ def test_groupby_multiindex_not_lexsorted(): assert not not_lexsorted_df.columns._is_lexsorted() expected = lexsorted_df.groupby("a").mean() - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): result = not_lexsorted_df.groupby("a").mean() tm.assert_frame_equal(expected, result) @@ -1992,9 +1654,7 @@ def test_groupby_preserves_sort(sort_column, group_column): def test_sort(x): tm.assert_frame_equal(x, x.sort_values(by=sort_column)) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - g.apply(test_sort) + g.apply(test_sort) def test_pivot_table_values_key_error(): @@ -2048,11 +1708,9 @@ def test_pivot_table_values_key_error(): ) @pytest.mark.parametrize("method", ["attr", "agg", "apply"]) @pytest.mark.parametrize( - "op", ["idxmax", "idxmin", "min", "max", "sum", "prod", "skew"] + "op", ["idxmax", "idxmin", "min", "max", "sum", "prod", "skew", "kurt"] ) -def test_empty_groupby( - columns, keys, values, method, op, using_array_manager, dropna, using_infer_string -): +def test_empty_groupby(columns, keys, values, method, op, dropna, using_infer_string): # GH8093 & GH26411 override_dtype = None @@ -2094,7 +1752,7 @@ def get_categorical_invalid_expected(): idx = Index(lev, name=keys[0]) if using_infer_string: - columns = Index([], dtype="string[pyarrow_numpy]") + columns = Index([], dtype="str") else: columns = [] expected = DataFrame([], columns=columns, index=idx) @@ -2103,6 +1761,7 @@ def get_categorical_invalid_expected(): is_per = isinstance(df.dtypes.iloc[0], pd.PeriodDtype) is_dt64 = df.dtypes.iloc[0].kind == "M" is_cat = isinstance(values, Categorical) + is_str = isinstance(df.dtypes.iloc[0], pd.StringDtype) if ( isinstance(values, Categorical) @@ -2125,26 +1784,28 @@ def get_categorical_invalid_expected(): tm.assert_equal(result, expected) return - if op in ["prod", "sum", "skew"]: + if op in ["prod", "sum", "skew", "kurt"]: # ops that require more than just ordered-ness - if is_dt64 or is_cat or is_per: + if is_dt64 or is_cat or is_per or (is_str and op != "sum"): # GH#41291 # datetime64 -> prod and sum are invalid if is_dt64: msg = "datetime64 type does not support" elif is_per: msg = "Period type does not support" + elif is_str: + msg = f"dtype 'str' does not support operation '{op}'" else: msg = "category type does not support" - if op == "skew": - msg = "|".join([msg, "does not support reduction 'skew'"]) + if op in ["skew", "kurt"]: + msg = "|".join([msg, f"does not support operation '{op}'"]) with pytest.raises(TypeError, match=msg): get_result() if not isinstance(columns, list): # i.e. 
SeriesGroupBy return - elif op == "skew": + elif op in ["skew", "kurt"]: # TODO: test the numeric_only=True case return else: @@ -2179,10 +1840,8 @@ def test_empty_groupby_apply_nonunique_columns(): df[3] = df[3].astype(np.int64) df.columns = [0, 1, 2, 0] gb = df.groupby(df[1], group_keys=False) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - res = gb.apply(lambda x: x) - assert (res.dtypes == df.dtypes).all() + res = gb.apply(lambda x: x) + assert (res.dtypes == df.drop(columns=1).dtypes).all() def test_tuple_as_grouping(): @@ -2294,32 +1953,31 @@ def test_groupby_groups_in_BaseGrouper(): assert result.groups == expected.groups -@pytest.mark.parametrize("group_name", ["x", ["x"]]) -def test_groupby_axis_1(group_name): - # GH 27614 - df = DataFrame( - np.arange(12).reshape(3, 4), index=[0, 1, 0], columns=[10, 20, 10, 20] - ) - df.index.name = "y" - df.columns.name = "x" - - depr_msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - gb = df.groupby(group_name, axis=1) +def test_groups_sort_dropna(sort, dropna): + # GH#56966, GH#56851 + df = DataFrame([[2.0, 1.0], [np.nan, 4.0], [0.0, 3.0]]) + keys = [(2.0, 1.0), (np.nan, 4.0), (0.0, 3.0)] + values = [ + RangeIndex(0, 1), + RangeIndex(1, 2), + RangeIndex(2, 3), + ] + if sort: + taker = [2, 0] if dropna else [2, 0, 1] + else: + taker = [0, 2] if dropna else [0, 1, 2] + expected = {keys[idx]: values[idx] for idx in taker} - results = gb.sum() - expected = df.T.groupby(group_name).sum().T - tm.assert_frame_equal(results, expected) + gb = df.groupby([0, 1], sort=sort, dropna=dropna) + result = gb.groups - # test on MI column - iterables = [["bar", "baz", "foo"], ["one", "two"]] - mi = MultiIndex.from_product(iterables=iterables, names=["x", "x1"]) - df = DataFrame(np.arange(18).reshape(3, 6), index=[0, 1, 0], columns=mi) - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - gb = df.groupby(group_name, axis=1) - results = gb.sum() - expected = df.T.groupby(group_name).sum().T - tm.assert_frame_equal(results, expected) + for result_key, expected_key in zip(result.keys(), expected.keys()): + # Compare as NumPy arrays to handle np.nan + result_key = np.array(result_key) + expected_key = np.array(expected_key) + tm.assert_numpy_array_equal(result_key, expected_key) + for result_value, expected_value in zip(result.values(), expected.values()): + tm.assert_index_equal(result_value, expected_value) @pytest.mark.parametrize( @@ -2418,77 +2076,39 @@ def test_group_on_empty_multiindex(transformation_func, request): df["col_3"] = df["col_3"].astype(int) df["col_4"] = df["col_4"].astype(int) df = df.set_index(["col_1", "col_2"]) - if transformation_func == "fillna": - args = ("ffill",) - else: - args = () - warn = FutureWarning if transformation_func == "fillna" else None - warn_msg = "DataFrameGroupBy.fillna is deprecated" - with tm.assert_produces_warning(warn, match=warn_msg): - result = df.iloc[:0].groupby(["col_1"]).transform(transformation_func, *args) - with tm.assert_produces_warning(warn, match=warn_msg): - expected = df.groupby(["col_1"]).transform(transformation_func, *args).iloc[:0] + result = df.iloc[:0].groupby(["col_1"]).transform(transformation_func) + expected = df.groupby(["col_1"]).transform(transformation_func).iloc[:0] if transformation_func in ("diff", "shift"): expected = expected.astype(int) tm.assert_equal(result, expected) - warn_msg = "SeriesGroupBy.fillna is deprecated" - 
with tm.assert_produces_warning(warn, match=warn_msg): - result = ( - df["col_3"] - .iloc[:0] - .groupby(["col_1"]) - .transform(transformation_func, *args) - ) - warn_msg = "SeriesGroupBy.fillna is deprecated" - with tm.assert_produces_warning(warn, match=warn_msg): - expected = ( - df["col_3"] - .groupby(["col_1"]) - .transform(transformation_func, *args) - .iloc[:0] - ) + result = df["col_3"].iloc[:0].groupby(["col_1"]).transform(transformation_func) + expected = df["col_3"].groupby(["col_1"]).transform(transformation_func).iloc[:0] if transformation_func in ("diff", "shift"): expected = expected.astype(int) tm.assert_equal(result, expected) -def test_groupby_crash_on_nunique(axis): +def test_groupby_crash_on_nunique(): # Fix following 30253 dti = date_range("2016-01-01", periods=2, name="foo") df = DataFrame({("A", "B"): [1, 2], ("A", "C"): [1, 3], ("D", "B"): [0, 0]}) df.columns.names = ("bar", "baz") df.index = dti - axis_number = df._get_axis_number(axis) - if not axis_number: - df = df.T - msg = "The 'axis' keyword in DataFrame.groupby is deprecated" - else: - msg = "DataFrame.groupby with axis=1 is deprecated" - - with tm.assert_produces_warning(FutureWarning, match=msg): - gb = df.groupby(axis=axis_number, level=0) + df = df.T + gb = df.groupby(level=0) result = gb.nunique() expected = DataFrame({"A": [1, 2], "D": [1, 1]}, index=dti) expected.columns.name = "bar" - if not axis_number: - expected = expected.T + expected = expected.T tm.assert_frame_equal(result, expected) - if axis_number == 0: - # same thing, but empty columns - with tm.assert_produces_warning(FutureWarning, match=msg): - gb2 = df[[]].groupby(axis=axis_number, level=0) - exp = expected[[]] - else: - # same thing, but empty rows - with tm.assert_produces_warning(FutureWarning, match=msg): - gb2 = df.loc[[]].groupby(axis=axis_number, level=0) - # default for empty when we can't infer a dtype is float64 - exp = expected.loc[[]].astype(np.float64) + # same thing, but empty columns + gb2 = df[[]].groupby(level=0) + exp = expected[[]] res = gb2.nunique() tm.assert_frame_equal(res, exp) @@ -2564,17 +2184,6 @@ def test_subsetting_columns_keeps_attrs(klass, attr, value): assert getattr(result, attr) == getattr(expected, attr) -def test_subsetting_columns_axis_1(): - # GH 37725 - df = DataFrame({"A": [1], "B": [2], "C": [3]}) - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - g = df.groupby([0, 0, 1], axis=1) - match = "Cannot subset columns when using axis=1" - with pytest.raises(ValueError, match=match): - g[["A", "B"]].sum() - - @pytest.mark.parametrize("func", ["sum", "any", "shift"]) def test_groupby_column_index_name_lost(func): # GH: 29764 groupby loses index sometimes @@ -2699,7 +2308,7 @@ def test_groupby_empty_multi_column(as_index, numeric_only): result = gb.sum(numeric_only=numeric_only) if as_index: index = MultiIndex([[], []], [[], []], names=["A", "B"]) - columns = ["C"] if not numeric_only else [] + columns = ["C"] if not numeric_only else Index([], dtype="str") else: index = RangeIndex(0) columns = ["A", "B", "C"] if not numeric_only else ["A", "B"] @@ -2717,7 +2326,7 @@ def test_groupby_aggregation_non_numeric_dtype(): { "v": [[1, 1], [10, 20]], }, - index=Index(["M", "W"], dtype="object", name="MW"), + index=Index(["M", "W"], name="MW"), ) gb = df.groupby(by=["MW"]) @@ -2816,25 +2425,20 @@ def test_rolling_wrong_param_min_period(): test_df = DataFrame([name_l, val_l]).T test_df.columns = ["name", "val"] - result_error_msg = r"__init__\(\) 
got an unexpected keyword argument 'min_period'" + result_error_msg = ( + r"^[a-zA-Z._]*\(\) got an unexpected keyword argument 'min_period'" + ) with pytest.raises(TypeError, match=result_error_msg): test_df.groupby("name")["val"].rolling(window=2, min_period=1).sum() -@pytest.mark.parametrize( - "dtype", - [ - object, - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - ], -) -def test_by_column_values_with_same_starting_value(dtype): +def test_by_column_values_with_same_starting_value(any_string_dtype): # GH29635 df = DataFrame( { "Name": ["Thomas", "Thomas", "Thomas John"], "Credit": [1200, 1300, 900], - "Mood": Series(["sad", "happy", "happy"], dtype=dtype), + "Mood": Series(["sad", "happy", "happy"], dtype=any_string_dtype), } ) aggregate_details = {"Mood": Series.mode, "Credit": "sum"} @@ -2862,11 +2466,13 @@ def test_groupby_none_in_first_mi_level(): tm.assert_series_equal(result, expected) -def test_groupby_none_column_name(): +def test_groupby_none_column_name(using_infer_string): # GH#47348 df = DataFrame({None: [1, 1, 2, 2], "b": [1, 1, 2, 3], "c": [4, 5, 6, 7]}) - result = df.groupby(by=[None]).sum() - expected = DataFrame({"b": [2, 5], "c": [9, 13]}, index=Index([1, 2], name=None)) + by = [np.nan] if using_infer_string else [None] + gb = df.groupby(by=by) + result = gb.sum() + expected = DataFrame({"b": [2, 5], "c": [9, 13]}, index=Index([1, 2], name=by[0])) tm.assert_frame_equal(result, expected) @@ -2904,19 +2510,14 @@ def test_groupby_string_dtype(): @pytest.mark.parametrize( "level_arg, multiindex", [([0], False), ((0,), False), ([0], True), ((0,), True)] ) -def test_single_element_listlike_level_grouping_deprecation(level_arg, multiindex): +def test_single_element_listlike_level_grouping(level_arg, multiindex): # GH 51583 df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}, index=["x", "y"]) if multiindex: df = df.set_index(["a", "b"]) - depr_msg = ( - "Creating a Groupby object with a length-1 list-like " - "level parameter will yield indexes as tuples in a future version. " - "To keep indexes as scalars, create Groupby objects with " - "a scalar level parameter instead." 
- ) - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - [key for key, _ in df.groupby(level=level_arg)] + result = [key for key, _ in df.groupby(level=level_arg)] + expected = [(1,), (2,)] if multiindex else [("x",), ("y",)] + assert result == expected @pytest.mark.parametrize("func", ["sum", "cumsum", "cumprod", "prod"]) @@ -3040,7 +2641,9 @@ def test_groupby_method_drop_na(method): Series(["a", "b", "c"], name="A") ) else: - expected = DataFrame({"A": ["a", "b", "c"], "B": [0, 2, 4]}, index=[0, 2, 4]) + expected = DataFrame( + {"A": ["a", "b", "c"], "B": [0, 2, 4]}, index=range(0, 6, 2) + ) tm.assert_frame_equal(result, expected) @@ -3089,7 +2692,7 @@ def test_obj_with_exclusions_duplicate_columns(): def test_groupby_numeric_only_std_no_result(numeric_only): # GH 51080 dicts_non_numeric = [{"a": "foo", "b": "bar"}, {"a": "car", "b": "dar"}] - df = DataFrame(dicts_non_numeric) + df = DataFrame(dicts_non_numeric, dtype=object) dfgb = df.groupby("a", as_index=False, sort=False) if numeric_only: @@ -3148,10 +2751,14 @@ def test_grouping_with_categorical_interval_columns(): def test_groupby_sum_on_nan_should_return_nan(bug_var): # GH 24196 df = DataFrame({"A": [bug_var, bug_var, bug_var, np.nan]}) + if isinstance(bug_var, str): + df = df.astype(object) dfgb = df.groupby(lambda x: x) result = dfgb.sum(min_count=1) - expected_df = DataFrame([bug_var, bug_var, bug_var, None], columns=["A"]) + expected_df = DataFrame( + [bug_var, bug_var, bug_var, None], columns=["A"], dtype=df["A"].dtype + ) tm.assert_frame_equal(result, expected_df) @@ -3203,12 +2810,6 @@ def test_groupby_selection_other_methods(df): g_exp = df[["C"]].groupby(df["A"]) # methods which aren't just .foo() - warn_msg = "DataFrameGroupBy.fillna is deprecated" - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - tm.assert_frame_equal(g.fillna(0), g_exp.fillna(0)) - msg = "DataFrameGroupBy.dtypes is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - tm.assert_frame_equal(g.dtypes, g_exp.dtypes) tm.assert_frame_equal(g.apply(lambda x: x.sum()), g_exp.apply(lambda x: x.sum())) tm.assert_frame_equal(g.resample("D").mean(), g_exp.resample("D").mean()) @@ -3261,22 +2862,18 @@ def test_groupby_series_with_datetimeindex_month_name(): "kwarg, value, name, warn", [ ("by", "a", 1, None), - ("by", ["a"], 1, FutureWarning), ("by", ["a"], (1,), None), ("level", 0, 1, None), - ("level", [0], 1, FutureWarning), ("level", [0], (1,), None), ], ) -def test_depr_get_group_len_1_list_likes(test_series, kwarg, value, name, warn): +def test_get_group_len_1_list_likes(test_series, kwarg, value, name, warn): # GH#25971 obj = DataFrame({"b": [3, 4, 5]}, index=Index([1, 1, 2], name="a")) if test_series: obj = obj["b"] gb = obj.groupby(**{kwarg: value}) - msg = "you will need to pass a length-1 tuple" - with tm.assert_produces_warning(warn, match=msg): - result = gb.get_group(name) + result = gb.get_group(name) if test_series: expected = Series([3, 4], index=Index([1, 1], name="a"), name="b") else: @@ -3292,29 +2889,6 @@ def test_groupby_ngroup_with_nan(): tm.assert_series_equal(result, expected) -def test_get_group_axis_1(): - # GH#54858 - df = DataFrame( - { - "col1": [0, 3, 2, 3], - "col2": [4, 1, 6, 7], - "col3": [3, 8, 2, 10], - "col4": [1, 13, 6, 15], - "col5": [-4, 5, 6, -7], - } - ) - with tm.assert_produces_warning(FutureWarning, match="deprecated"): - grouped = df.groupby(axis=1, by=[1, 2, 3, 2, 1]) - result = grouped.get_group(1) - expected = DataFrame( - { - "col1": [0, 3, 2, 3], - "col5": [-4, 
5, 6, -7], - } - ) - tm.assert_frame_equal(result, expected) - - def test_groupby_ffill_with_duplicated_index(): # GH#43412 df = DataFrame({"a": [1, 2, 3, 4, np.nan, np.nan]}, index=[0, 1, 2, 0, 1, 2]) @@ -3342,3 +2916,83 @@ def test_decimal_na_sort(test_series): result = gb._grouper.result_index expected = Index([Decimal(1), None], name="key") tm.assert_index_equal(result, expected) + + +def test_groupby_dropna_with_nunique_unique(): + # GH#42016 + df = [[1, 1, 1, "A"], [1, None, 1, "A"], [1, None, 2, "A"], [1, None, 3, "A"]] + df_dropna = DataFrame(df, columns=["a", "b", "c", "partner"]) + result = df_dropna.groupby(["a", "b", "c"], dropna=False).agg( + {"partner": ["nunique", "unique"]} + ) + + index = MultiIndex.from_tuples( + [(1, 1.0, 1), (1, np.nan, 1), (1, np.nan, 2), (1, np.nan, 3)], + names=["a", "b", "c"], + ) + columns = MultiIndex.from_tuples([("partner", "nunique"), ("partner", "unique")]) + expected = DataFrame( + [(1, ["A"]), (1, ["A"]), (1, ["A"]), (1, ["A"])], index=index, columns=columns + ) + + tm.assert_frame_equal(result, expected) + + +def test_groupby_agg_namedagg_with_duplicate_columns(): + # GH#58446 + df = DataFrame( + { + "col1": [2, 1, 1, 0, 2, 0], + "col2": [4, 5, 36, 7, 4, 5], + "col3": [3.1, 8.0, 12, 10, 4, 1.1], + "col4": [17, 3, 16, 15, 5, 6], + "col5": [-1, 3, -1, 3, -2, -1], + } + ) + + result = df.groupby(by=["col1", "col1", "col2"], as_index=False).agg( + new_col=pd.NamedAgg(column="col1", aggfunc="min"), + new_col1=pd.NamedAgg(column="col1", aggfunc="max"), + new_col2=pd.NamedAgg(column="col2", aggfunc="count"), + ) + + expected = DataFrame( + { + "col1": [0, 0, 1, 1, 2], + "col2": [5, 7, 5, 36, 4], + "new_col": [0, 0, 1, 1, 2], + "new_col1": [0, 0, 1, 1, 2], + "new_col2": [1, 1, 1, 1, 2], + } + ) + + tm.assert_frame_equal(result, expected) + + +def test_groupby_multi_index_codes(): + # GH#54347 + df = DataFrame( + {"A": [1, 2, 3, 4], "B": [1, float("nan"), 2, float("nan")], "C": [2, 4, 6, 8]} + ) + df_grouped = df.groupby(["A", "B"], dropna=False).sum() + + index = df_grouped.index + tm.assert_index_equal(index, MultiIndex.from_frame(index.to_frame())) + + +def test_groupby_datetime_with_nat(): + # GH#35202 + df = DataFrame( + { + "a": [ + to_datetime("2019-02-12"), + to_datetime("2019-02-12"), + to_datetime("2019-02-13"), + pd.NaT, + ], + "b": [1, 2, 3, 4], + } + ) + grouped = df.groupby("a", dropna=False) + result = len(grouped) + assert result == 3 diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 73638eba0a3b3..8c4ab42b7be7a 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -123,7 +123,7 @@ def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs): df = pd.DataFrame(df_list, columns=["a", "b", "c", "d"]) grouped = df.groupby("a", dropna=dropna).sum() - expected = pd.DataFrame(outputs, index=pd.Index(idx, dtype="object", name="a")) + expected = pd.DataFrame(outputs, index=pd.Index(idx, name="a")) tm.assert_frame_equal(grouped, expected) @@ -167,7 +167,6 @@ def test_groupby_dropna_series_by(dropna, expected): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("dropna", (False, True)) def test_grouper_dropna_propagation(dropna): # GH 36604 df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}) @@ -324,9 +323,7 @@ def test_groupby_apply_with_dropna_for_multi_index(dropna, data, selected_data, df = pd.DataFrame(data) gb = df.groupby("groups", dropna=dropna) - msg = "DataFrameGroupBy.apply operated 
on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = gb.apply(lambda grp: pd.DataFrame({"values": range(len(grp))})) + result = gb.apply(lambda grp: pd.DataFrame({"values": range(len(grp))})) mi_tuples = tuple(zip(data["groups"], selected_data["values"])) mi = pd.MultiIndex.from_tuples(mi_tuples, names=["groups", None]) @@ -411,7 +408,6 @@ def test_groupby_drop_nan_with_multi_index(): "UInt64", "Int64", "Float32", - "Int64", "Float64", "category", "string", @@ -422,7 +418,7 @@ def test_groupby_drop_nan_with_multi_index(): ), ), "datetime64[ns]", - "period[d]", + "period[D]", "Sparse[float]", ], ) @@ -439,7 +435,7 @@ def test_no_sort_keep_na(sequence_index, dtype, test_series, as_index): # Unique values to use for grouper, depends on dtype if dtype in ("string", "string[pyarrow]"): uniques = {"x": "x", "y": "y", "z": pd.NA} - elif dtype in ("datetime64[ns]", "period[d]"): + elif dtype in ("datetime64[ns]", "period[D]"): uniques = {"x": "2016-01-01", "y": "2017-01-01", "z": pd.NA} else: uniques = {"x": 1, "y": 2, "z": np.nan} @@ -545,7 +541,14 @@ def test_categorical_reducers(reduction_func, observed, sort, as_index, index_ki return gb_filled = df_filled.groupby(keys, observed=observed, sort=sort, as_index=True) - expected = getattr(gb_filled, reduction_func)(*args_filled).reset_index() + if reduction_func == "corrwith": + warn = FutureWarning + msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + msg = "" + with tm.assert_produces_warning(warn, match=msg): + expected = getattr(gb_filled, reduction_func)(*args_filled).reset_index() expected["x"] = expected["x"].cat.remove_categories([4]) if index_kind == "multi": expected["x2"] = expected["x2"].cat.remove_categories([4]) @@ -554,11 +557,6 @@ def test_categorical_reducers(reduction_func, observed, sort, as_index, index_ki expected = expected.set_index(["x", "x2"]) else: expected = expected.set_index("x") - elif index_kind != "range" and reduction_func != "size": - # size, unlike other methods, has the desired behavior in GH#49519 - expected = expected.drop(columns="x") - if index_kind == "multi": - expected = expected.drop(columns="x2") if reduction_func in ("idxmax", "idxmin") and index_kind != "range": # expected was computed with a RangeIndex; need to translate to index values values = expected["y"].values.tolist() @@ -574,11 +572,12 @@ def test_categorical_reducers(reduction_func, observed, sort, as_index, index_ki if as_index: expected = expected["size"].rename(None) - if as_index or index_kind == "range" or reduction_func == "size": - warn = None - else: + if reduction_func == "corrwith": warn = FutureWarning - msg = "A grouping .* was excluded from the result" + msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + msg = "" with tm.assert_produces_warning(warn, match=msg): result = getattr(gb_keepna, reduction_func)(*args) @@ -586,14 +585,8 @@ def test_categorical_reducers(reduction_func, observed, sort, as_index, index_ki tm.assert_equal(result, expected) -def test_categorical_transformers( - request, transformation_func, observed, sort, as_index -): +def test_categorical_transformers(transformation_func, observed, sort, as_index): # GH#36327 - if transformation_func == "fillna": - msg = "GH#49651 fillna may incorrectly reorders results when dropna=False" - request.applymarker(pytest.mark.xfail(reason=msg, strict=False)) - values = np.append(np.random.default_rng(2).choice([1, 2, None], size=19), None) df = pd.DataFrame( {"x": pd.Categorical(values, 
categories=[1, 2, 3]), "y": range(20)} @@ -623,12 +616,7 @@ def test_categorical_transformers( ) gb_dropna = df.groupby("x", dropna=True, observed=observed, sort=sort) - msg = "The default fill_method='ffill' in DataFrameGroupBy.pct_change is deprecated" - if transformation_func == "pct_change": - with tm.assert_produces_warning(FutureWarning, match=msg): - result = getattr(gb_keepna, "pct_change")(*args) - else: - result = getattr(gb_keepna, transformation_func)(*args) + result = getattr(gb_keepna, transformation_func)(*args) expected = getattr(gb_dropna, transformation_func)(*args) for iloc, value in zip( diff --git a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py index bf809bd5db437..3ee9c9ea0c7fd 100644 --- a/pandas/tests/groupby/test_groupby_subclass.py +++ b/pandas/tests/groupby/test_groupby_subclass.py @@ -36,11 +36,11 @@ def test_groupby_preserves_subclass(obj, groupby_func): args = get_groupby_method_args(groupby_func, obj) - warn = FutureWarning if groupby_func == "fillna" else None - msg = f"{type(grouped).__name__}.fillna is deprecated" - with tm.assert_produces_warning(warn, match=msg, raise_on_extra_warnings=False): + warn = FutureWarning if groupby_func == "corrwith" else None + msg = f"{type(grouped).__name__}.corrwith is deprecated" + with tm.assert_produces_warning(warn, match=msg): result1 = getattr(grouped, groupby_func)(*args) - with tm.assert_produces_warning(warn, match=msg, raise_on_extra_warnings=False): + with tm.assert_produces_warning(warn, match=msg): result2 = grouped.agg(groupby_func, *args) # Reduction or transformation kernels should preserve type @@ -69,16 +69,20 @@ def test_groupby_preserves_metadata(): def func(group): assert isinstance(group, tm.SubclassedDataFrame) assert hasattr(group, "testattr") + assert group.testattr == "hello" return group.testattr - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning( - FutureWarning, match=msg, raise_on_extra_warnings=False - ): - result = custom_df.groupby("c").apply(func) + result = custom_df.groupby("c").apply(func) expected = tm.SubclassedSeries(["hello"] * 3, index=Index([7, 8, 9], name="c")) tm.assert_series_equal(result, expected) + result = custom_df.groupby("c").apply(func) + tm.assert_series_equal(result, expected) + + # https://github.com/pandas-dev/pandas/pull/56761 + result = custom_df.groupby("c")[["a", "b"]].apply(func) + tm.assert_series_equal(result, expected) + def func2(group): assert isinstance(group, tm.SubclassedSeries) assert hasattr(group, "testattr") @@ -98,7 +102,7 @@ def test_groupby_resample_preserves_subclass(obj): df = obj( { - "Buyer": "Carl Carl Carl Carl Joe Carl".split(), + "Buyer": Series("Carl Carl Carl Carl Joe Carl".split(), dtype=object), "Quantity": [18, 3, 5, 1, 9, 3], "Date": [ datetime(2013, 9, 1, 13, 0), @@ -113,9 +117,5 @@ def test_groupby_resample_preserves_subclass(obj): df = df.set_index("Date") # Confirm groupby.resample() preserves dataframe type - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning( - FutureWarning, match=msg, raise_on_extra_warnings=False - ): - result = df.groupby("Buyer").resample("5D").sum() + result = df.groupby("Buyer").resample("5D").sum() assert isinstance(result, obj) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 363ff883385db..53e9c53efebf7 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -1,6 
+1,7 @@ """ test where we are determining what we are grouping, or getting groups """ + from datetime import ( date, timedelta, @@ -9,6 +10,8 @@ import numpy as np import pytest +from pandas.errors import SpecificationError + import pandas as pd from pandas import ( CategoricalIndex, @@ -230,18 +233,7 @@ def test_grouper_creation_bug(self): result = g.sum() tm.assert_frame_equal(result, expected) - msg = "Grouper axis keyword is deprecated and will be removed" - with tm.assert_produces_warning(FutureWarning, match=msg): - gpr = Grouper(key="A", axis=0) - g = df.groupby(gpr) - result = g.sum() - tm.assert_frame_equal(result, expected) - - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = g.apply(lambda x: x.sum()) - expected["A"] = [0, 2, 4] - expected = expected.loc[:, ["A", "B"]] + result = g.apply(lambda x: x.sum()) tm.assert_frame_equal(result, expected) def test_grouper_creation_bug2(self): @@ -386,22 +378,14 @@ def test_groupby_categorical_index_and_columns(self, observed): [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2]], int ) cat_columns = CategoricalIndex(columns, categories=categories, ordered=True) - df = DataFrame(data=data, columns=cat_columns) - depr_msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - result = df.groupby(axis=1, level=0, observed=observed).sum() expected_data = np.array([[4, 2], [4, 2], [4, 2], [4, 2], [4, 2]], int) expected_columns = CategoricalIndex( categories, categories=categories, ordered=True ) - expected = DataFrame(data=expected_data, columns=expected_columns) - tm.assert_frame_equal(result, expected) # test transposed version df = DataFrame(data.T, index=cat_columns) - msg = "The 'axis' keyword in DataFrame.groupby is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby(axis=0, level=0, observed=observed).sum() + result = df.groupby(level=0, observed=observed).sum() expected = DataFrame(data=expected_data.T, index=expected_columns) tm.assert_frame_equal(result, expected) @@ -429,9 +413,7 @@ def test_grouper_getting_correct_binner(self): def test_grouper_iter(self, df): gb = df.groupby("A") - msg = "DataFrameGroupBy.grouper is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - grouper = gb.grouper + grouper = gb._grouper result = sorted(grouper) expected = ["bar", "foo"] assert result == expected @@ -443,9 +425,7 @@ def test_empty_groups(self, df): def test_groupby_grouper(self, df): grouped = df.groupby("A") - msg = "DataFrameGroupBy.grouper is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - grouper = grouped.grouper + grouper = grouped._grouper result = df.groupby(grouper).mean(numeric_only=True) expected = grouped.mean(numeric_only=True) tm.assert_frame_equal(result, expected) @@ -525,22 +505,10 @@ def test_groupby_with_datetime_key(self): assert len(gb.groups.keys()) == 4 def test_grouping_error_on_multidim_input(self, df): - msg = "Grouper for '' not 1-dimensional" + msg = "Grouper for '' not 1-dimensional" with pytest.raises(ValueError, match=msg): Grouping(df.index, df[["A", "A"]]) - def test_multiindex_passthru(self): - # GH 7997 - # regression from 0.14.1 - df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - df.columns = MultiIndex.from_tuples([(0, 1), (1, 1), (2, 1)]) - - depr_msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, 
match=depr_msg): - gb = df.groupby(axis=1, level=[0, 1]) - result = gb.first() - tm.assert_frame_equal(result, df) - def test_multiindex_negative_level(self, multiindex_dataframe_random_data): # GH 13901 result = multiindex_dataframe_random_data.groupby(level=-1).sum() @@ -561,12 +529,10 @@ def test_multiindex_negative_level(self, multiindex_dataframe_random_data): ).sum() tm.assert_frame_equal(result, expected) - def test_multifunc_select_col_integer_cols(self, df): + def test_agg_with_dict_raises(self, df): df.columns = np.arange(len(df.columns)) - - # it works! - msg = "Passing a dictionary to SeriesGroupBy.agg is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): + msg = "nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): df.groupby(1, as_index=False)[2].agg({"Q": np.mean}) def test_multiindex_columns_empty_level(self): @@ -575,31 +541,38 @@ def test_multiindex_columns_empty_level(self): df = DataFrame([[1, "A"]], columns=midx) + msg = "`groups` by one element list returns scalar is deprecated" grouped = df.groupby("to filter").groups assert grouped["A"] == [0] - grouped = df.groupby([("to filter", "")]).groups + with tm.assert_produces_warning(FutureWarning, match=msg): + grouped = df.groupby([("to filter", "")]).groups assert grouped["A"] == [0] df = DataFrame([[1, "A"], [2, "B"]], columns=midx) expected = df.groupby("to filter").groups - result = df.groupby([("to filter", "")]).groups + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby([("to filter", "")]).groups assert result == expected df = DataFrame([[1, "A"], [2, "A"]], columns=midx) expected = df.groupby("to filter").groups - result = df.groupby([("to filter", "")]).groups + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby([("to filter", "")]).groups tm.assert_dict_equal(result, expected) def test_groupby_multiindex_tuple(self): - # GH 17979 + # GH 17979, GH#59179 df = DataFrame( [[1, 2, 3, 4], [3, 4, 5, 6], [1, 4, 2, 3]], columns=MultiIndex.from_arrays([["a", "b", "b", "c"], [1, 1, 2, 2]]), ) - expected = df.groupby([("b", 1)]).groups + + msg = "`groups` by one element list returns scalar is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.groupby([("b", 1)]).groups result = df.groupby(("b", 1)).groups tm.assert_dict_equal(expected, result) @@ -609,17 +582,21 @@ def test_groupby_multiindex_tuple(self): [["a", "b", "b", "c"], ["d", "d", "e", "e"]] ), ) - expected = df2.groupby([("b", "d")]).groups + + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df2.groupby([("b", "d")]).groups result = df.groupby(("b", 1)).groups tm.assert_dict_equal(expected, result) df3 = DataFrame(df.values, columns=[("a", "d"), ("b", "d"), ("b", "e"), "c"]) - expected = df3.groupby([("b", "d")]).groups + + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df3.groupby([("b", "d")]).groups result = df.groupby(("b", 1)).groups tm.assert_dict_equal(expected, result) def test_groupby_multiindex_partial_indexing_equivalence(self): - # GH 17977 + # GH 17977, GH#59179 df = DataFrame( [[1, 2, 3, 4], [3, 4, 5, 6], [1, 4, 2, 3]], columns=MultiIndex.from_arrays([["a", "b", "b", "c"], [1, 1, 2, 2]]), @@ -645,11 +622,12 @@ def test_groupby_multiindex_partial_indexing_equivalence(self): result_max = df.groupby([("a", 1)])["b"].max() tm.assert_frame_equal(expected_max, result_max) - expected_groups = df.groupby([("a", 1)])[[("b", 1), ("b", 2)]].groups - 
result_groups = df.groupby([("a", 1)])["b"].groups + msg = "`groups` by one element list returns scalar is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected_groups = df.groupby([("a", 1)])[[("b", 1), ("b", 2)]].groups + result_groups = df.groupby([("a", 1)])["b"].groups tm.assert_dict_equal(expected_groups, result_groups) - @pytest.mark.parametrize("sort", [True, False]) def test_groupby_level(self, sort, multiindex_dataframe_random_data, df): # GH 17537 frame = multiindex_dataframe_random_data @@ -678,37 +656,21 @@ def test_groupby_level(self, sort, multiindex_dataframe_random_data, df): tm.assert_frame_equal(result0, expected0) tm.assert_frame_equal(result1, expected1) - # axis=1 - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result0 = frame.T.groupby(level=0, axis=1, sort=sort).sum() - result1 = frame.T.groupby(level=1, axis=1, sort=sort).sum() - tm.assert_frame_equal(result0, expected0.T) - tm.assert_frame_equal(result1, expected1.T) - # raise exception for non-MultiIndex msg = "level > 0 or level < -1 only valid with MultiIndex" with pytest.raises(ValueError, match=msg): df.groupby(level=1) - def test_groupby_level_index_names(self, axis): + def test_groupby_level_index_names(self): # GH4014 this used to raise ValueError since 'exp'>1 (in py2) df = DataFrame({"exp": ["A"] * 3 + ["B"] * 3, "var1": range(6)}).set_index( "exp" ) - if axis in (1, "columns"): - df = df.T - depr_msg = "DataFrame.groupby with axis=1 is deprecated" - else: - depr_msg = "The 'axis' keyword in DataFrame.groupby is deprecated" - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - df.groupby(level="exp", axis=axis) - msg = f"level name foo is not the name of the {df._get_axis_name(axis)}" + df.groupby(level="exp") + msg = "level name foo is not the name of the index" with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - df.groupby(level="foo", axis=axis) + df.groupby(level="foo") - @pytest.mark.parametrize("sort", [True, False]) def test_groupby_level_with_nas(self, sort): # GH 17537 index = MultiIndex( @@ -756,7 +718,7 @@ def test_level_preserve_order(self, sort, labels, multiindex_dataframe_random_da # GH 17537 grouped = multiindex_dataframe_random_data.groupby(level=0, sort=sort) exp_labels = np.array(labels, np.intp) - tm.assert_almost_equal(grouped._grouper.codes[0], exp_labels) + tm.assert_almost_equal(grouped._grouper.ids, exp_labels) def test_grouping_labels(self, multiindex_dataframe_random_data): grouped = multiindex_dataframe_random_data.groupby( @@ -766,15 +728,18 @@ def test_grouping_labels(self, multiindex_dataframe_random_data): tm.assert_almost_equal(grouped._grouper.codes[0], exp_labels) def test_list_grouper_with_nat(self): - # GH 14715 + # GH 14715, GH#59179 df = DataFrame({"date": date_range("1/1/2011", periods=365, freq="D")}) df.iloc[-1] = pd.NaT grouper = Grouper(key="date", freq="YS") + msg = "`groups` by one element list returns scalar is deprecated" # Grouper in a list grouping - result = df.groupby([grouper]) + gb = df.groupby([grouper]) expected = {Timestamp("2011-01-01"): Index(list(range(364)))} - tm.assert_dict_equal(result.groups, expected) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = gb.groups + tm.assert_dict_equal(result, expected) # Test case without a list result = df.groupby(grouper) @@ -808,10 +773,21 @@ def test_evaluate_with_empty_groups(self, func, expected): # (not testing 
other agg fns, because they return # different index objects. df = DataFrame({1: [], 2: []}) - g = df.groupby(1, group_keys=False) + g = df.groupby(1, group_keys=True) result = getattr(g[2], func)(lambda x: x) tm.assert_series_equal(result, expected) + def test_groupby_apply_empty_with_group_keys_false(self): + # GH#60471 + # test applying empty groups with group_keys False + # (not testing other agg fns, because they return + # different index objects.) + df = DataFrame({"A": [], "B": [], "C": []}) + g = df.groupby("A", group_keys=False) + result = g.apply(lambda x: x / x.sum()) + expected = DataFrame({"B": [], "C": []}, index=None) + tm.assert_frame_equal(result, expected) + def test_groupby_empty(self): # https://github.com/pandas-dev/pandas/issues/27190 s = Series([], name="name", dtype="float64") @@ -824,20 +800,14 @@ def test_groupby_empty(self): # check group properties assert len(gr._grouper.groupings) == 1 tm.assert_numpy_array_equal( - gr._grouper.group_info[0], np.array([], dtype=np.dtype(np.intp)) - ) - - tm.assert_numpy_array_equal( - gr._grouper.group_info[1], np.array([], dtype=np.dtype(np.intp)) + gr._grouper.ids, np.array([], dtype=np.dtype(np.intp)) ) - assert gr._grouper.group_info[2] == 0 + assert gr._grouper.ngroups == 0 # check name gb = s.groupby(s) - msg = "SeriesGroupBy.grouper is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - grouper = gb.grouper + grouper = gb._grouper result = grouper.names expected = ["name"] assert result == expected @@ -851,7 +821,7 @@ def test_groupby_level_index_value_all_na(self): expected = DataFrame( data=[], index=MultiIndex( - levels=[Index(["x"], dtype="object"), Index([], dtype="float64")], + levels=[Index(["x"], dtype="str"), Index([], dtype="float64")], codes=[[], []], names=["A", "B"], ), @@ -875,6 +845,33 @@ def test_groupby_multiindex_level_empty(self): ) tm.assert_frame_equal(result, expected) + def test_groupby_tuple_keys_handle_multiindex(self): + # https://github.com/pandas-dev/pandas/issues/21340 + df = DataFrame( + { + "num1": [0, 8, 9, 4, 3, 3, 5, 9, 3, 6], + "num2": [3, 8, 6, 4, 9, 2, 1, 7, 0, 9], + "num3": [6, 5, 7, 8, 5, 1, 1, 10, 7, 8], + "category_tuple": [ + (0, 1), + (0, 1), + (0, 1), + (0, 4), + (2, 3), + (2, 3), + (2, 3), + (2, 3), + (5,), + (6,), + ], + "category_string": list("aaabbbbcde"), + } + ) + expected = df.sort_values(by=["category_tuple", "num1"]) + result = df.groupby("category_tuple").apply(lambda x: x.sort_values(by="num1")) + expected = expected[result.columns] + tm.assert_frame_equal(result.reset_index(drop=True), expected) + # get_group # -------------------------------- @@ -990,7 +987,9 @@ def test_groupby_with_single_column(self): df = DataFrame({"a": list("abssbab")}) tm.assert_frame_equal(df.groupby("a").get_group("a"), df.iloc[[0, 5]]) # GH 13530 - exp = DataFrame(index=Index(["a", "b", "s"], name="a"), columns=[]) + exp = DataFrame( + index=Index(["a", "b", "s"], name="a"), columns=Index([], dtype="str") + ) tm.assert_frame_equal(df.groupby("a").count(), exp) tm.assert_frame_equal(df.groupby("a").sum(), exp) @@ -1018,17 +1017,20 @@ def test_gb_key_len_equal_axis_len(self): class TestIteration: def test_groups(self, df): grouped = df.groupby(["A"]) - groups = grouped.groups - assert groups is grouped.groups # caching works + msg = "`groups` by one element list returns scalar is deprecated" - for k, v in grouped.groups.items(): + with tm.assert_produces_warning(FutureWarning, match=msg): + groups = grouped.groups + assert groups is grouped.groups # caching works + + 
for k, v in groups.items(): assert (df.loc[v]["A"] == k).all() grouped = df.groupby(["A", "B"]) groups = grouped.groups assert groups is grouped.groups # caching works - for k, v in grouped.groups.items(): + for k, v in groups.items(): assert (df.loc[v]["A"] == k[0]).all() assert (df.loc[v]["B"] == k[1]).all() @@ -1097,18 +1099,9 @@ def test_multi_iter_frame(self, three_group): grouped = df.groupby(["k1", "k2"]) # calling `dict` on a DataFrameGroupBy leads to a TypeError, # we need to use a dictionary comprehension here - # pylint: disable-next=unnecessary-comprehension groups = {key: gp for key, gp in grouped} # noqa: C416 assert len(groups) == 2 - # axis = 1 - three_levels = three_group.groupby(["A", "B", "C"]).mean() - depr_msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - grouped = three_levels.T.groupby(axis=1, level=(1, 2)) - for key, group in grouped: - pass - def test_dictify(self, df): dict(iter(df.groupby("A"))) dict(iter(df.groupby(["A", "B"]))) @@ -1187,50 +1180,18 @@ def test_grouping_by_key_is_in_axis(): assert not gb._grouper.groupings[0].in_axis assert gb._grouper.groupings[1].in_axis - # Currently only in-axis groupings are including in the result when as_index=False; - # This is likely to change in the future. - msg = "A grouping .* was excluded from the result" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = gb.sum() - expected = DataFrame({"b": [1, 2], "c": [7, 5]}) + result = gb.sum() + expected = DataFrame({"a": [1, 2], "b": [1, 2], "c": [7, 5]}) tm.assert_frame_equal(result, expected) -def test_grouper_groups(): - # GH#51182 check Grouper.groups does not raise AttributeError - df = DataFrame({"a": [1, 2, 3], "b": 1}) - grper = Grouper(key="a") - gb = df.groupby(grper) - - msg = "Use GroupBy.groups instead" - with tm.assert_produces_warning(FutureWarning, match=msg): - res = grper.groups - assert res is gb.groups - - msg = "Use GroupBy.grouper instead" - with tm.assert_produces_warning(FutureWarning, match=msg): - res = grper.grouper - assert res is gb._grouper - - msg = "Grouper.obj is deprecated and will be removed" - with tm.assert_produces_warning(FutureWarning, match=msg): - res = grper.obj - assert res is gb.obj - - msg = "Use Resampler.ax instead" - with tm.assert_produces_warning(FutureWarning, match=msg): - grper.ax - - msg = "Grouper.indexer is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - grper.indexer - - -@pytest.mark.parametrize("attr", ["group_index", "result_index", "group_arraylike"]) -def test_depr_grouping_attrs(attr): - # GH#56148 - df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]}) - gb = df.groupby("a") - msg = f"{attr} is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - getattr(gb._grouper.groupings[0], attr) +def test_groupby_any_with_timedelta(): + # GH#59712 + df = DataFrame({"value": [pd.Timedelta(1), pd.NaT]}) + + result = df.groupby(np.array([0, 1], dtype=np.int64))["value"].any() + + expected = Series({0: True, 1: False}, name="value", dtype=bool) + expected.index = expected.index.astype(np.int64) + + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_index_as_string.py b/pandas/tests/groupby/test_index_as_string.py index 4aaf3de9a23b2..743db7e70b14b 100644 --- a/pandas/tests/groupby/test_index_as_string.py +++ b/pandas/tests/groupby/test_index_as_string.py @@ -5,38 +5,6 @@ import pandas._testing as tm -@pytest.fixture(params=[["inner"], ["inner", "outer"]]) -def 
frame(request): - levels = request.param - df = pd.DataFrame( - { - "outer": ["a", "a", "a", "b", "b", "b"], - "inner": [1, 2, 3, 1, 2, 3], - "A": np.arange(6), - "B": ["one", "one", "two", "two", "one", "one"], - } - ) - if levels: - df = df.set_index(levels) - - return df - - -@pytest.fixture() -def series(): - df = pd.DataFrame( - { - "outer": ["a", "a", "a", "b", "b", "b"], - "inner": [1, 2, 3, 1, 2, 3], - "A": np.arange(6), - "B": ["one", "one", "two", "two", "one", "one"], - } - ) - s = df.set_index(["outer", "inner", "B"])["A"] - - return s - - @pytest.mark.parametrize( "key_strs,groupers", [ @@ -46,7 +14,17 @@ def series(): (["inner", "B"], [pd.Grouper(level="inner"), "B"]), # Index and column ], ) -def test_grouper_index_level_as_string(frame, key_strs, groupers): +@pytest.mark.parametrize("levels", [["inner"], ["inner", "outer"]]) +def test_grouper_index_level_as_string(levels, key_strs, groupers): + frame = pd.DataFrame( + { + "outer": ["a", "a", "a", "b", "b", "b"], + "inner": [1, 2, 3, 1, 2, 3], + "A": np.arange(6), + "B": ["one", "one", "two", "two", "one", "one"], + } + ) + frame = frame.set_index(levels) if "B" not in key_strs or "outer" in frame.columns: result = frame.groupby(key_strs).mean(numeric_only=True) expected = frame.groupby(groupers).mean(numeric_only=True) @@ -71,8 +49,17 @@ def test_grouper_index_level_as_string(frame, key_strs, groupers): ["B", "outer", "inner"], ], ) -def test_grouper_index_level_as_string_series(series, levels): +def test_grouper_index_level_as_string_series(levels): # Compute expected result + df = pd.DataFrame( + { + "outer": ["a", "a", "a", "b", "b", "b"], + "inner": [1, 2, 3, 1, 2, 3], + "A": np.arange(6), + "B": ["one", "one", "two", "two", "one", "one"], + } + ) + series = df.set_index(["outer", "inner", "B"])["A"] if isinstance(levels, list): groupers = [pd.Grouper(level=lv) for lv in levels] else: diff --git a/pandas/tests/groupby/test_indexing.py b/pandas/tests/groupby/test_indexing.py index 664c52babac13..a3d3f509e186a 100644 --- a/pandas/tests/groupby/test_indexing.py +++ b/pandas/tests/groupby/test_indexing.py @@ -118,15 +118,26 @@ def test_doc_examples(): tm.assert_frame_equal(result, expected) -@pytest.fixture() -def multiindex_data(): +def test_multiindex(): + # Test the multiindex mentioned as the use-case in the documentation + + def _make_df_from_data(data): + rows = {} + for date in data: + for level in data[date]: + rows[(date, level[0])] = {"A": level[1], "B": level[2]} + + df = pd.DataFrame.from_dict(rows, orient="index") + df.index.names = ("Date", "Item") + return df + rng = np.random.default_rng(2) ndates = 100 nitems = 20 dates = pd.date_range("20130101", periods=ndates, freq="D") items = [f"item {i}" for i in range(nitems)] - data = {} + multiindex_data = {} for date in dates: nitems_for_date = nitems - rng.integers(0, 12) levels = [ @@ -134,28 +145,12 @@ def multiindex_data(): for item in items[:nitems_for_date] ] levels.sort(key=lambda x: x[1]) - data[date] = levels - - return data - - -def _make_df_from_data(data): - rows = {} - for date in data: - for level in data[date]: - rows[(date, level[0])] = {"A": level[1], "B": level[2]} - - df = pd.DataFrame.from_dict(rows, orient="index") - df.index.names = ("Date", "Item") - return df - + multiindex_data[date] = levels -def test_multiindex(multiindex_data): - # Test the multiindex mentioned as the use-case in the documentation df = _make_df_from_data(multiindex_data) result = df.groupby("Date", as_index=False).nth(slice(3, -3)) - sliced = {date: 
multiindex_data[date][3:-3] for date in multiindex_data} + sliced = {date: values[3:-3] for date, values in multiindex_data.items()} expected = _make_df_from_data(sliced) tm.assert_frame_equal(result, expected) @@ -271,24 +266,6 @@ def test_step(step): tm.assert_frame_equal(result, expected) -@pytest.fixture() -def column_group_df(): - return pd.DataFrame( - [[0, 1, 2, 3, 4, 5, 6], [0, 0, 1, 0, 1, 0, 2]], - columns=["A", "B", "C", "D", "E", "F", "G"], - ) - - -def test_column_axis(column_group_df): - msg = "DataFrame.groupby with axis=1" - with tm.assert_produces_warning(FutureWarning, match=msg): - g = column_group_df.groupby(column_group_df.iloc[1], axis=1) - result = g._positional_selector[1:-1] - expected = column_group_df.iloc[:, [1, 3]] - - tm.assert_frame_equal(result, expected) - - def test_columns_on_iter(): # GitHub issue #44821 df = pd.DataFrame({k: range(10) for k in "ABC"}) diff --git a/pandas/tests/groupby/test_missing.py b/pandas/tests/groupby/test_missing.py index 3180a92be1236..2b590c50371e9 100644 --- a/pandas/tests/groupby/test_missing.py +++ b/pandas/tests/groupby/test_missing.py @@ -5,7 +5,6 @@ from pandas import ( DataFrame, Index, - date_range, ) import pandas._testing as tm @@ -36,80 +35,7 @@ def test_groupby_fill_duplicate_column_names(func): tm.assert_frame_equal(result, expected) -def test_ffill_missing_arguments(): - # GH 14955 - df = DataFrame({"a": [1, 2], "b": [1, 1]}) - msg = "DataFrameGroupBy.fillna is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - with pytest.raises(ValueError, match="Must specify a fill"): - df.groupby("b").fillna() - - -@pytest.mark.parametrize( - "method, expected", [("ffill", [None, "a", "a"]), ("bfill", ["a", "a", None])] -) -def test_fillna_with_string_dtype(method, expected): - # GH 40250 - df = DataFrame({"a": pd.array([None, "a", None], dtype="string"), "b": [0, 0, 0]}) - grp = df.groupby("b") - msg = "DataFrameGroupBy.fillna is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = grp.fillna(method=method) - expected = DataFrame({"a": pd.array(expected, dtype="string")}) - tm.assert_frame_equal(result, expected) - - -def test_fill_consistency(): - # GH9221 - # pass thru keyword arguments to the generated wrapper - # are set if the passed kw is None (only) - df = DataFrame( - index=pd.MultiIndex.from_product( - [["value1", "value2"], date_range("2014-01-01", "2014-01-06")] - ), - columns=Index(["1", "2"], name="id"), - ) - df["1"] = [ - np.nan, - 1, - np.nan, - np.nan, - 11, - np.nan, - np.nan, - 2, - np.nan, - np.nan, - 22, - np.nan, - ] - df["2"] = [ - np.nan, - 3, - np.nan, - np.nan, - 33, - np.nan, - np.nan, - 4, - np.nan, - np.nan, - 44, - np.nan, - ] - - msg = "The 'axis' keyword in DataFrame.groupby is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = df.groupby(level=0, axis=0).fillna(method="ffill") - - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.T.groupby(level=0, axis=1).fillna(method="ffill").T - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("method", ["ffill", "bfill"]) -@pytest.mark.parametrize("dropna", [True, False]) @pytest.mark.parametrize("has_nan_group", [True, False]) def test_ffill_handles_nan_groups(dropna, method, has_nan_group): # GH 34725 diff --git a/pandas/tests/groupby/test_numba.py b/pandas/tests/groupby/test_numba.py index ee7d342472493..082319d8479f0 100644 --- a/pandas/tests/groupby/test_numba.py +++ 
b/pandas/tests/groupby/test_numba.py @@ -1,15 +1,24 @@ import pytest +from pandas.compat import is_platform_arm + from pandas import ( DataFrame, Series, option_context, ) import pandas._testing as tm +from pandas.util.version import Version -pytestmark = pytest.mark.single_cpu +pytestmark = [pytest.mark.single_cpu] -pytest.importorskip("numba") +numba = pytest.importorskip("numba") +pytestmark.append( + pytest.mark.skipif( + Version(numba.__version__) == Version("0.61") and is_platform_arm(), + reason=f"Segfaults on ARM platforms with numba {numba.__version__}", + ) +) @pytest.mark.filterwarnings("ignore") @@ -61,13 +70,6 @@ def test_as_index_false_unsupported(self, numba_supported_reductions): with pytest.raises(NotImplementedError, match="as_index=False"): getattr(gb, func)(engine="numba", **kwargs) - def test_axis_1_unsupported(self, numba_supported_reductions): - func, kwargs = numba_supported_reductions - df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)}) - gb = df.groupby("a", axis=1) - with pytest.raises(NotImplementedError, match="axis=1"): - getattr(gb, func)(engine="numba", **kwargs) - def test_no_engine_doesnt_raise(self): # GH55520 df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)}) diff --git a/pandas/tests/groupby/test_numeric_only.py b/pandas/tests/groupby/test_numeric_only.py index ff4685b1e412d..99a88a5d8fe7c 100644 --- a/pandas/tests/groupby/test_numeric_only.py +++ b/pandas/tests/groupby/test_numeric_only.py @@ -1,6 +1,5 @@ import re -import numpy as np import pytest from pandas._libs import lib @@ -29,7 +28,8 @@ def df(self): "group": [1, 1, 2], "int": [1, 2, 3], "float": [4.0, 5.0, 6.0], - "string": list("abc"), + "string": Series(["a", "b", "c"], dtype="str"), + "object": Series(["a", "b", "c"], dtype=object), "category_string": Series(list("abc")).astype("category"), "category_int": [7, 8, 9], "datetime": date_range("20130101", periods=3), @@ -41,6 +41,7 @@ def df(self): "int", "float", "string", + "object", "category_string", "category_int", "datetime", @@ -113,6 +114,7 @@ def test_first_last(self, df, method): "int", "float", "string", + "object", "category_string", "category_int", "datetime", @@ -160,7 +162,9 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): # object dtypes for transformations are not implemented in Cython and # have no Python fallback - exception = NotImplementedError if method.startswith("cum") else TypeError + exception = ( + (NotImplementedError, TypeError) if method.startswith("cum") else TypeError + ) if method in ("min", "max", "cummin", "cummax", "cumsum", "cumprod"): # The methods default to numeric_only=False and raise TypeError @@ -171,6 +175,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): re.escape(f"agg function failed [how->{method},dtype->object]"), # cumsum/cummin/cummax/cumprod "function is not implemented for this dtype", + f"dtype 'str' does not support operation '{method}'", ] ) with pytest.raises(exception, match=msg): @@ -181,6 +186,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): "category type does not support sum operations", re.escape(f"agg function failed [how->{method},dtype->object]"), re.escape(f"agg function failed [how->{method},dtype->string]"), + f"dtype 'str' does not support operation '{method}'", ] ) with pytest.raises(exception, match=msg): @@ -198,6 +204,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): f"Cannot perform {method} with non-ordered Categorical", 
re.escape(f"agg function failed [how->{method},dtype->object]"), re.escape(f"agg function failed [how->{method},dtype->string]"), + f"dtype 'str' does not support operation '{method}'", ] ) with pytest.raises(exception, match=msg): @@ -207,94 +214,6 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): tm.assert_index_equal(result.columns, expected_columns) -@pytest.mark.parametrize("numeric_only", [True, False, None]) -def test_axis1_numeric_only(request, groupby_func, numeric_only, using_infer_string): - if groupby_func in ("idxmax", "idxmin"): - pytest.skip("idxmax and idx_min tested in test_idxmin_idxmax_axis1") - if groupby_func in ("corrwith", "skew"): - msg = "GH#47723 groupby.corrwith and skew do not correctly implement axis=1" - request.applymarker(pytest.mark.xfail(reason=msg)) - - df = DataFrame( - np.random.default_rng(2).standard_normal((10, 4)), columns=["A", "B", "C", "D"] - ) - df["E"] = "x" - groups = [1, 2, 3, 1, 2, 3, 1, 2, 3, 4] - gb = df.groupby(groups) - method = getattr(gb, groupby_func) - args = get_groupby_method_args(groupby_func, df) - kwargs = {"axis": 1} - if numeric_only is not None: - # when numeric_only is None we don't pass any argument - kwargs["numeric_only"] = numeric_only - - # Functions without numeric_only and axis args - no_args = ("cumprod", "cumsum", "diff", "fillna", "pct_change", "rank", "shift") - # Functions with axis args - has_axis = ( - "cumprod", - "cumsum", - "diff", - "pct_change", - "rank", - "shift", - "cummax", - "cummin", - "idxmin", - "idxmax", - "fillna", - ) - warn_msg = f"DataFrameGroupBy.{groupby_func} with axis=1 is deprecated" - if numeric_only is not None and groupby_func in no_args: - msg = "got an unexpected keyword argument 'numeric_only'" - if groupby_func in ["cumprod", "cumsum"]: - with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - method(*args, **kwargs) - else: - with pytest.raises(TypeError, match=msg): - method(*args, **kwargs) - elif groupby_func not in has_axis: - msg = "got an unexpected keyword argument 'axis'" - with pytest.raises(TypeError, match=msg): - method(*args, **kwargs) - # fillna and shift are successful even on object dtypes - elif (numeric_only is None or not numeric_only) and groupby_func not in ( - "fillna", - "shift", - ): - msgs = ( - # cummax, cummin, rank - "not supported between instances of", - # cumprod - "can't multiply sequence by non-int of type 'float'", - # cumsum, diff, pct_change - "unsupported operand type", - "has no kernel", - ) - if using_infer_string: - import pyarrow as pa - - errs = (TypeError, pa.lib.ArrowNotImplementedError) - else: - errs = TypeError - with pytest.raises(errs, match=f"({'|'.join(msgs)})"): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - method(*args, **kwargs) - else: - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - result = method(*args, **kwargs) - - df_expected = df.drop(columns="E").T if numeric_only else df.T - expected = getattr(df_expected, groupby_func)(*args).T - if groupby_func == "shift" and not numeric_only: - # shift with axis=1 leaves the leftmost column as numeric - # but transposing for expected gives us object dtype - expected = expected.astype(float) - - tm.assert_equal(result, expected) - - @pytest.mark.parametrize( "kernel, has_arg", [ @@ -310,7 +229,6 @@ def test_axis1_numeric_only(request, groupby_func, numeric_only, using_infer_str ("cumsum", True), ("diff", False), ("ffill", False), - ("fillna", False), ("first", 
True), ("idxmax", True), ("idxmin", True), @@ -326,6 +244,7 @@ def test_axis1_numeric_only(request, groupby_func, numeric_only, using_infer_str ("quantile", True), ("sem", True), ("skew", True), + ("kurt", True), ("std", True), ("sum", True), ("var", True), @@ -346,21 +265,25 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys): method = getattr(gb, kernel) if has_arg and numeric_only is True: # Cases where b does not appear in the result - result = method(*args, **kwargs) + if kernel == "corrwith": + warn = FutureWarning + msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + msg = "" + with tm.assert_produces_warning(warn, match=msg): + result = method(*args, **kwargs) assert "b" not in result.columns elif ( # kernels that work on any dtype and have numeric_only arg kernel in ("first", "last") or ( # kernels that work on any dtype and don't have numeric_only arg - kernel in ("any", "all", "bfill", "ffill", "fillna", "nth", "nunique") + kernel in ("any", "all", "bfill", "ffill", "nth", "nunique") and numeric_only is lib.no_default ) ): - warn = FutureWarning if kernel == "fillna" else None - msg = "DataFrameGroupBy.fillna is deprecated" - with tm.assert_produces_warning(warn, match=msg): - result = method(*args, **kwargs) + result = method(*args, **kwargs) assert "b" in result.columns elif has_arg: assert numeric_only is not True @@ -374,19 +297,27 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys): [ "not allowed for this dtype", "cannot be performed against 'object' dtypes", - # On PY39 message is "a number"; on PY310 and after is "a real number" - "must be a string or a.* number", + "must be a string or a real number", "unsupported operand type", "function is not implemented for this dtype", re.escape(f"agg function failed [how->{kernel},dtype->object]"), ] ) - if kernel == "idxmin": + if kernel == "quantile": + msg = "dtype 'object' does not support operation 'quantile'" + elif kernel == "idxmin": msg = "'<' not supported between instances of 'type' and 'type'" elif kernel == "idxmax": msg = "'>' not supported between instances of 'type' and 'type'" with pytest.raises(exception, match=msg): - method(*args, **kwargs) + if kernel == "corrwith": + warn = FutureWarning + msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + msg = "" + with tm.assert_produces_warning(warn, match=msg): + method(*args, **kwargs) elif not has_arg and numeric_only is not lib.no_default: with pytest.raises( TypeError, match="got an unexpected keyword argument 'numeric_only'" @@ -400,7 +331,6 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys): method(*args, **kwargs) -@pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") @pytest.mark.parametrize("dtype", [bool, int, float, object]) def test_deprecate_numeric_only_series(dtype, groupby_func, request): # GH#46560 @@ -449,27 +379,21 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request): "max", "prod", "skew", + "kurt", ) # Test default behavior; kernels that fail may be enabled in the future but kernels # that succeed should not be allowed to fail (without deprecation, at least) if groupby_func in fails_on_numeric_object and dtype is object: if groupby_func == "quantile": - msg = "cannot be performed against 'object' dtypes" + msg = "dtype 'object' does not support operation 'quantile'" else: msg = "is not supported for object dtype" - warn = FutureWarning if groupby_func == "fillna" else None - warn_msg = "DataFrameGroupBy.fillna is deprecated" - with 
tm.assert_produces_warning(warn, match=warn_msg): - with pytest.raises(TypeError, match=msg): - method(*args) + with pytest.raises(TypeError, match=msg): + method(*args) elif dtype is object: - warn = FutureWarning if groupby_func == "fillna" else None - warn_msg = "SeriesGroupBy.fillna is deprecated" - with tm.assert_produces_warning(warn, match=warn_msg): - result = method(*args) - with tm.assert_produces_warning(warn, match=warn_msg): - expected = expected_method(*args) + result = method(*args) + expected = expected_method(*args) if groupby_func in obj_result: expected = expected.astype(object) tm.assert_series_equal(result, expected) @@ -485,6 +409,7 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request): "quantile", "sem", "skew", + "kurt", "std", "sum", "var", @@ -509,12 +434,10 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request): with pytest.raises(TypeError, match=msg): method(*args, numeric_only=True) elif dtype == bool and groupby_func == "quantile": - msg = "Allowing bool dtype in SeriesGroupBy.quantile" - with tm.assert_produces_warning(FutureWarning, match=msg): + msg = "Cannot use quantile with bool dtype" + with pytest.raises(TypeError, match=msg): # GH#51424 - result = method(*args, numeric_only=True) - expected = method(*args, numeric_only=False) - tm.assert_series_equal(result, expected) + method(*args, numeric_only=False) else: result = method(*args, numeric_only=True) expected = method(*args, numeric_only=False) diff --git a/pandas/tests/groupby/test_pipe.py b/pandas/tests/groupby/test_pipe.py index 7d5c1625b8ab4..ee59a93695bcf 100644 --- a/pandas/tests/groupby/test_pipe.py +++ b/pandas/tests/groupby/test_pipe.py @@ -35,7 +35,7 @@ def square(srs): # NDFrame.pipe methods result = df.groupby("A").pipe(f).pipe(square) - index = Index(["bar", "foo"], dtype="object", name="A") + index = Index(["bar", "foo"], name="A") expected = pd.Series([3.749306591013693, 6.717707873081384], name="B", index=index) tm.assert_series_equal(expected, result) diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 0b451ce73db89..864b9e5d55991 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -67,19 +67,6 @@ def df_with_datetime_col(): return df -@pytest.fixture -def df_with_timedelta_col(): - df = DataFrame( - { - "a": [1, 1, 1, 1, 1, 2, 2, 2, 2], - "b": [3, 3, 4, 4, 4, 4, 4, 3, 3], - "c": range(9), - "d": datetime.timedelta(days=1), - } - ) - return df - - @pytest.fixture def df_with_cat_col(): df = DataFrame( @@ -99,7 +86,7 @@ def df_with_cat_col(): def _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=""): warn_klass = None if warn_msg == "" else FutureWarning - with tm.assert_produces_warning(warn_klass, match=warn_msg): + with tm.assert_produces_warning(warn_klass, match=warn_msg, check_stacklevel=False): if klass is None: if how == "method": getattr(gb, groupby_func)(*args) @@ -119,7 +106,7 @@ def _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=""): @pytest.mark.parametrize("how", ["method", "agg", "transform"]) def test_groupby_raises_string( - how, by, groupby_series, groupby_func, df_with_string_col + how, by, groupby_series, groupby_func, df_with_string_col, using_infer_string ): df = df_with_string_col args = get_groupby_method_args(groupby_func, df) @@ -157,7 +144,6 @@ def test_groupby_raises_string( ), "diff": (TypeError, "unsupported operand type"), "ffill": (None, ""), - "fillna": (None, ""), "first": (None, ""), "idxmax": (None, 
""), "idxmin": (None, ""), @@ -179,12 +165,13 @@ def test_groupby_raises_string( TypeError, re.escape("agg function failed [how->prod,dtype->object]"), ), - "quantile": (TypeError, "cannot be performed against 'object' dtypes!"), + "quantile": (TypeError, "dtype 'object' does not support operation 'quantile'"), "rank": (None, ""), "sem": (ValueError, "could not convert string to float"), "shift": (None, ""), "size": (None, ""), "skew": (ValueError, "could not convert string to float"), + "kurt": (ValueError, "could not convert string to float"), "std": (ValueError, "could not convert string to float"), "sum": (None, ""), "var": ( @@ -193,9 +180,40 @@ def test_groupby_raises_string( ), }[groupby_func] - if groupby_func == "fillna": - kind = "Series" if groupby_series else "DataFrame" - warn_msg = f"{kind}GroupBy.fillna is deprecated" + if using_infer_string: + if groupby_func in [ + "prod", + "mean", + "median", + "cumsum", + "cumprod", + "std", + "sem", + "var", + "skew", + "kurt", + "quantile", + ]: + msg = f"dtype 'str' does not support operation '{groupby_func}'" + if groupby_func in ["sem", "std", "skew", "kurt"]: + # The object-dtype raises ValueError when trying to convert to numeric. + klass = TypeError + elif groupby_func == "pct_change" and df["d"].dtype.storage == "pyarrow": + # This doesn't go through EA._groupby_op so the message isn't controlled + # there. + msg = "operation 'truediv' not supported for dtype 'str' with dtype 'str'" + elif groupby_func == "diff" and df["d"].dtype.storage == "pyarrow": + # This doesn't go through EA._groupby_op so the message isn't controlled + # there. + msg = "operation 'sub' not supported for dtype 'str' with dtype 'str'" + + elif groupby_func in ["cummin", "cummax"]: + msg = msg.replace("object", "str") + elif groupby_func == "corrwith": + msg = "Cannot perform reduction 'mean' with string dtype" + + if groupby_func == "corrwith": + warn_msg = "DataFrameGroupBy.corrwith is deprecated" else: warn_msg = "" _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg) @@ -219,7 +237,12 @@ def func(x): @pytest.mark.parametrize("how", ["agg", "transform"]) @pytest.mark.parametrize("groupby_func_np", [np.sum, np.mean]) def test_groupby_raises_string_np( - how, by, groupby_series, groupby_func_np, df_with_string_col + how, + by, + groupby_series, + groupby_func_np, + df_with_string_col, + using_infer_string, ): # GH#50749 df = df_with_string_col @@ -232,15 +255,17 @@ def test_groupby_raises_string_np( np.sum: (None, ""), np.mean: ( TypeError, - re.escape("agg function failed [how->mean,dtype->object]"), + "Could not convert string .* to numeric|" + "Cannot perform reduction 'mean' with string dtype", ), }[groupby_func_np] - if groupby_series: - warn_msg = "using SeriesGroupBy.[sum|mean]" - else: - warn_msg = "using DataFrameGroupBy.[sum|mean]" - _call_and_check(klass, msg, how, gb, groupby_func_np, (), warn_msg=warn_msg) + if using_infer_string: + if groupby_func_np is np.mean: + klass = TypeError + msg = f"Cannot perform reduction '{groupby_func_np.__name__}' with string dtype" + + _call_and_check(klass, msg, how, gb, groupby_func_np, ()) @pytest.mark.parametrize("how", ["method", "agg", "transform"]) @@ -259,19 +284,18 @@ def test_groupby_raises_datetime( return klass, msg = { - "all": (None, ""), - "any": (None, ""), + "all": (TypeError, "'all' with datetime64 dtypes is no longer supported"), + "any": (TypeError, "'any' with datetime64 dtypes is no longer supported"), "bfill": (None, ""), "corrwith": (TypeError, "cannot perform __mul__ with 
this index type"), "count": (None, ""), "cumcount": (None, ""), "cummax": (None, ""), "cummin": (None, ""), - "cumprod": (TypeError, "datetime64 type does not support cumprod operations"), - "cumsum": (TypeError, "datetime64 type does not support cumsum operations"), + "cumprod": (TypeError, "datetime64 type does not support operation 'cumprod'"), + "cumsum": (TypeError, "datetime64 type does not support operation 'cumsum'"), "diff": (None, ""), "ffill": (None, ""), - "fillna": (None, ""), "first": (None, ""), "idxmax": (None, ""), "idxmin": (None, ""), @@ -283,7 +307,7 @@ def test_groupby_raises_datetime( "ngroup": (None, ""), "nunique": (None, ""), "pct_change": (TypeError, "cannot perform __truediv__ with this index type"), - "prod": (TypeError, "datetime64 type does not support prod"), + "prod": (TypeError, "datetime64 type does not support operation 'prod'"), "quantile": (None, ""), "rank": (None, ""), "sem": (None, ""), @@ -293,21 +317,27 @@ def test_groupby_raises_datetime( "shift": (None, ""), "size": (None, ""), "skew": ( TypeError, "|".join( [ - r"dtype datetime64\[ns\] does not support reduction", - "datetime64 type does not support skew operations", + r"dtype datetime64\[ns\] does not support operation", + "datetime64 type does not support operation 'skew'", + ] + ), + ), + "kurt": ( + TypeError, + "|".join( + [ + r"dtype datetime64\[ns\] does not support operation", + "datetime64 type does not support operation 'kurt'", ] ), ), "std": (None, ""), - "sum": (TypeError, "datetime64 type does not support sum operations"), - "var": (TypeError, "datetime64 type does not support var operations"), + "sum": (TypeError, "datetime64 type does not support operation 'sum'"), + "var": (TypeError, "datetime64 type does not support operation 'var'"), }[groupby_func] - if groupby_func in ["any", "all"]: - warn_msg = f"'{groupby_func}' with datetime64 dtypes is deprecated" - elif groupby_func == "fillna": - kind = "Series" if groupby_series else "DataFrame" - warn_msg = f"{kind}GroupBy.fillna is deprecated" + if groupby_func == "corrwith": + warn_msg = "DataFrameGroupBy.corrwith is deprecated" else: warn_msg = "" _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=warn_msg) @@ -341,20 +371,25 @@ def test_groupby_raises_datetime_np( gb = gb["d"] klass, msg = { - np.sum: (TypeError, "datetime64 type does not support sum operations"), + np.sum: ( + TypeError, + re.escape("datetime64[us] does not support operation 'sum'"), + ), np.mean: (None, ""), }[groupby_func_np] - - if groupby_series: - warn_msg = "using SeriesGroupBy.[sum|mean]" - else: - warn_msg = "using DataFrameGroupBy.[sum|mean]" - _call_and_check(klass, msg, how, gb, groupby_func_np, (), warn_msg=warn_msg) + _call_and_check(klass, msg, how, gb, groupby_func_np, ()) -@pytest.mark.parametrize("func", ["prod", "cumprod", "skew", "var"]) -def test_groupby_raises_timedelta(func, df_with_timedelta_col): - df = df_with_timedelta_col +@pytest.mark.parametrize("func", ["prod", "cumprod", "skew", "kurt", "var"]) +def test_groupby_raises_timedelta(func): + df = DataFrame( + { + "a": [1, 1, 1, 1, 1, 2, 2, 2, 2], + "b": [3, 3, 4, 4, 4, 4, 4, 3, 3], + "c": range(9), + "d": datetime.timedelta(days=1), + } + ) gb = df.groupby(by="a") _call_and_check( @@ -369,7 +404,7 @@ def test_groupby_raises_timedelta(func, df_with_timedelta_col): @pytest.mark.parametrize("how", ["method", "agg", "transform"]) def test_groupby_raises_category( - how, by, groupby_series, groupby_func, using_copy_on_write, df_with_cat_col + how, by, groupby_series, groupby_func, df_with_cat_col ): # GH#50749 df =
df_with_cat_col @@ -422,13 +457,6 @@ def test_groupby_raises_category( r"unsupported operand type\(s\) for -: 'Categorical' and 'Categorical'", ), "ffill": (None, ""), - "fillna": ( - TypeError, - r"Cannot setitem on a Categorical with a new category \(0\), " - "set the categories first", - ) - if not using_copy_on_write - else (None, ""), # no-op with CoW "first": (None, ""), "idxmax": (None, ""), "idxmin": (None, ""), @@ -438,7 +466,7 @@ def test_groupby_raises_category( TypeError, "|".join( [ - "'Categorical' .* does not support reduction 'mean'", + "'Categorical' .* does not support operation 'mean'", "category dtype does not support aggregation 'mean'", ] ), @@ -447,7 +475,7 @@ def test_groupby_raises_category( TypeError, "|".join( [ - "'Categorical' .* does not support reduction 'median'", + "'Categorical' .* does not support operation 'median'", "category dtype does not support aggregation 'median'", ] ), @@ -466,7 +494,7 @@ def test_groupby_raises_category( TypeError, "|".join( [ - "'Categorical' .* does not support reduction 'sem'", + "'Categorical' .* does not support operation 'sem'", "category dtype does not support aggregation 'sem'", ] ), @@ -477,16 +505,25 @@ def test_groupby_raises_category( TypeError, "|".join( [ - "dtype category does not support reduction 'skew'", + "dtype category does not support operation 'skew'", "category type does not support skew operations", ] ), ), + "kurt": ( + TypeError, + "|".join( + [ + "dtype category does not support operation 'kurt'", + "category type does not support kurt operations", + ] + ), + ), "std": ( TypeError, "|".join( [ - "'Categorical' .* does not support reduction 'std'", + "'Categorical' .* does not support operation 'std'", "category dtype does not support aggregation 'std'", ] ), @@ -496,16 +533,15 @@ def test_groupby_raises_category( TypeError, "|".join( [ - "'Categorical' .* does not support reduction 'var'", + "'Categorical' .* does not support operation 'var'", "category dtype does not support aggregation 'var'", ] ), ), }[groupby_func] - if groupby_func == "fillna": - kind = "Series" if groupby_series else "DataFrame" - warn_msg = f"{kind}GroupBy.fillna is deprecated" + if groupby_func == "corrwith": + warn_msg = "DataFrameGroupBy.corrwith is deprecated" else: warn_msg = "" _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg) @@ -540,20 +576,18 @@ def test_groupby_raises_category_np( gb = gb["d"] klass, msg = { - np.sum: (TypeError, "category type does not support sum operations"), + np.sum: (TypeError, "dtype category does not support operation 'sum'"), np.mean: ( TypeError, - "category dtype does not support aggregation 'mean'", + "dtype category does not support operation 'mean'", ), }[groupby_func_np] - - if groupby_series: - warn_msg = "using SeriesGroupBy.[sum|mean]" - else: - warn_msg = "using DataFrameGroupBy.[sum|mean]" - _call_and_check(klass, msg, how, gb, groupby_func_np, (), warn_msg=warn_msg) + _call_and_check(klass, msg, how, gb, groupby_func_np, ()) +@pytest.mark.filterwarnings( + "ignore:`groups` by one element list returns scalar is deprecated" +) @pytest.mark.parametrize("how", ["method", "agg", "transform"]) def test_groupby_raises_category_on_category( how, @@ -561,7 +595,6 @@ def test_groupby_raises_category_on_category( groupby_series, groupby_func, observed, - using_copy_on_write, df_with_cat_col, ): # GH#50749 @@ -582,16 +615,6 @@ def test_groupby_raises_category_on_category( return empty_groups = not observed and any(group.empty for group in gb.groups.values()) - if ( - not 
observed - and how != "transform" - and isinstance(by, list) - and isinstance(by[0], str) - and by == ["a", "b"] - ): - assert not empty_groups - # TODO: empty_groups should be true due to unobserved categorical combinations - empty_groups = True if how == "transform": # empty groups will be ignored empty_groups = False @@ -632,13 +655,6 @@ def test_groupby_raises_category_on_category( ), "diff": (TypeError, "unsupported operand type"), "ffill": (None, ""), - "fillna": ( - TypeError, - r"Cannot setitem on a Categorical with a new category \(0\), " - "set the categories first", - ) - if not using_copy_on_write - else (None, ""), # no-op with CoW "first": (None, ""), "idxmax": (ValueError, "empty group due to unobserved categories") if empty_groups @@ -661,7 +677,7 @@ def test_groupby_raises_category_on_category( TypeError, "|".join( [ - "'Categorical' .* does not support reduction 'sem'", + "'Categorical' .* does not support operation 'sem'", "category dtype does not support aggregation 'sem'", ] ), @@ -673,7 +689,16 @@ def test_groupby_raises_category_on_category( "|".join( [ "category type does not support skew operations", - "dtype category does not support reduction 'skew'", + "dtype category does not support operation 'skew'", + ] + ), + ), + "kurt": ( + TypeError, + "|".join( + [ + "category type does not support kurt operations", + "dtype category does not support operation 'kurt'", ] ), ), @@ -681,7 +706,7 @@ def test_groupby_raises_category_on_category( TypeError, "|".join( [ - "'Categorical' .* does not support reduction 'std'", + "'Categorical' .* does not support operation 'std'", "category dtype does not support aggregation 'std'", ] ), @@ -691,26 +716,15 @@ def test_groupby_raises_category_on_category( TypeError, "|".join( [ - "'Categorical' .* does not support reduction 'var'", + "'Categorical' .* does not support operation 'var'", "category dtype does not support aggregation 'var'", ] ), ), }[groupby_func] - if groupby_func == "fillna": - kind = "Series" if groupby_series else "DataFrame" - warn_msg = f"{kind}GroupBy.fillna is deprecated" + if groupby_func == "corrwith": + warn_msg = "DataFrameGroupBy.corrwith is deprecated" else: warn_msg = "" _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg) - - -def test_subsetting_columns_axis_1_raises(): - # GH 35443 - df = DataFrame({"a": [1], "b": [2], "c": [3]}) - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - gb = df.groupby("a", axis=1) - with pytest.raises(ValueError, match="Cannot subset columns when using axis=1"): - gb["b"] diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 425079f943aba..014558bbf4bba 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -7,6 +7,9 @@ from pandas._libs.tslibs import iNaT +from pandas.core.dtypes.common import pandas_dtype +from pandas.core.dtypes.missing import na_value_for_dtype + import pandas as pd from pandas import ( DataFrame, @@ -17,10 +20,59 @@ isna, ) import pandas._testing as tm +from pandas.tests.groupby import get_groupby_method_args from pandas.util import _test_decorators as td -@pytest.mark.parametrize("agg_func", ["any", "all"]) +@pytest.mark.parametrize("dtype", ["int64", "int32", "float64", "float32"]) +def test_basic_aggregations(dtype): + data = Series(np.arange(9) // 3, index=np.arange(9), dtype=dtype) + + index = np.arange(9) + np.random.default_rng(2).shuffle(index) + data = 
data.reindex(index) + + grouped = data.groupby(lambda x: x // 3, group_keys=False) + + for k, v in grouped: + assert len(v) == 3 + + agged = grouped.aggregate(np.mean) + assert agged[1] == 1 + + expected = grouped.agg(np.mean) + tm.assert_series_equal(agged, expected) # shorthand + tm.assert_series_equal(agged, grouped.mean()) + result = grouped.sum() + expected = grouped.agg(np.sum) + if dtype == "int32": + # NumPy's sum returns int64 + expected = expected.astype("int32") + tm.assert_series_equal(result, expected) + + expected = grouped.apply(lambda x: x * x.sum()) + transformed = grouped.transform(lambda x: x * x.sum()) + assert transformed[7] == 12 + tm.assert_series_equal(transformed, expected) + + value_grouped = data.groupby(data) + result = value_grouped.aggregate(np.mean) + tm.assert_series_equal(result, agged, check_index_type=False) + + # complex agg + agged = grouped.aggregate([np.mean, np.std]) + + msg = r"nested renamer is not supported" + with pytest.raises(pd.errors.SpecificationError, match=msg): + grouped.aggregate({"one": np.mean, "two": np.std}) + + # corner cases + msg = "Must produce aggregated value" + # exception raised is type Exception + with pytest.raises(Exception, match=msg): + grouped.aggregate(lambda x: x * 2) + + @pytest.mark.parametrize( "vals", [ @@ -39,20 +91,20 @@ [np.nan, np.nan, np.nan], ], ) -def test_groupby_bool_aggs(skipna, agg_func, vals): +def test_groupby_bool_aggs(skipna, all_boolean_reductions, vals): df = DataFrame({"key": ["a"] * 3 + ["b"] * 3, "val": vals * 2}) # Figure out expectation using Python builtin - exp = getattr(builtins, agg_func)(vals) + exp = getattr(builtins, all_boolean_reductions)(vals) # edge case for missing data with skipna and 'any' - if skipna and all(isna(vals)) and agg_func == "any": + if skipna and all(isna(vals)) and all_boolean_reductions == "any": exp = False expected = DataFrame( [exp] * 2, columns=["val"], index=pd.Index(["a", "b"], name="key") ) - result = getattr(df.groupby("key"), agg_func)(skipna=skipna) + result = getattr(df.groupby("key"), all_boolean_reductions)(skipna=skipna) tm.assert_frame_equal(result, expected) @@ -69,18 +121,16 @@ def test_any(): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) -def test_bool_aggs_dup_column_labels(bool_agg_func): +def test_bool_aggs_dup_column_labels(all_boolean_reductions): # GH#21668 df = DataFrame([[True, True]], columns=["a", "a"]) grp_by = df.groupby([0]) - result = getattr(grp_by, bool_agg_func)() + result = getattr(grp_by, all_boolean_reductions)() expected = df.set_axis(np.array([0])) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) @pytest.mark.parametrize( "data", [ @@ -92,16 +142,16 @@ def test_bool_aggs_dup_column_labels(bool_agg_func): [True, pd.NA, False], ], ) -def test_masked_kleene_logic(bool_agg_func, skipna, data): +def test_masked_kleene_logic(all_boolean_reductions, skipna, data): # GH#37506 ser = Series(data, dtype="boolean") # The result should match aggregating on the whole series. 
Correctness # there is verified in test_reductions.py::test_any_all_boolean_kleene_logic - expected_data = getattr(ser, bool_agg_func)(skipna=skipna) + expected_data = getattr(ser, all_boolean_reductions)(skipna=skipna) expected = Series(expected_data, index=np.array([0]), dtype="boolean") - result = ser.groupby([0, 0, 0]).agg(bool_agg_func, skipna=skipna) + result = ser.groupby([0, 0, 0]).agg(all_boolean_reductions, skipna=skipna) tm.assert_series_equal(result, expected) @@ -146,17 +196,18 @@ def test_masked_mixed_types(dtype1, dtype2, exp_col1, exp_col2): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) @pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"]) -def test_masked_bool_aggs_skipna(bool_agg_func, dtype, skipna, frame_or_series): +def test_masked_bool_aggs_skipna( + all_boolean_reductions, dtype, skipna, frame_or_series +): # GH#40585 obj = frame_or_series([pd.NA, 1], dtype=dtype) expected_res = True - if not skipna and bool_agg_func == "all": + if not skipna and all_boolean_reductions == "all": expected_res = pd.NA expected = frame_or_series([expected_res], index=np.array([1]), dtype="boolean") - result = obj.groupby([1, 1]).agg(bool_agg_func, skipna=skipna) + result = obj.groupby([1, 1]).agg(all_boolean_reductions, skipna=skipna) tm.assert_equal(result, expected) @@ -177,24 +228,82 @@ def test_object_type_missing_vals(bool_agg_func, data, expected_res, frame_or_se tm.assert_equal(result, expected) -@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) -def test_object_NA_raises_with_skipna_false(bool_agg_func): +def test_object_NA_raises_with_skipna_false(all_boolean_reductions): # GH#37501 ser = Series([pd.NA], dtype=object) with pytest.raises(TypeError, match="boolean value of NA is ambiguous"): - ser.groupby([1]).agg(bool_agg_func, skipna=False) + ser.groupby([1]).agg(all_boolean_reductions, skipna=False) -@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) -def test_empty(frame_or_series, bool_agg_func): +def test_empty(frame_or_series, all_boolean_reductions): # GH 45231 kwargs = {"columns": ["a"]} if frame_or_series is DataFrame else {"name": "a"} obj = frame_or_series(**kwargs, dtype=object) - result = getattr(obj.groupby(obj.index), bool_agg_func)() + result = getattr(obj.groupby(obj.index), all_boolean_reductions)() expected = frame_or_series(**kwargs, dtype=bool) tm.assert_equal(result, expected) +@pytest.mark.parametrize("how", ["idxmin", "idxmax"]) +def test_idxmin_idxmax_extremes(how, any_real_numpy_dtype): + # GH#57040 + if any_real_numpy_dtype is int or any_real_numpy_dtype is float: + # No need to test + return + info = np.iinfo if "int" in any_real_numpy_dtype else np.finfo + min_value = info(any_real_numpy_dtype).min + max_value = info(any_real_numpy_dtype).max + df = DataFrame( + {"a": [2, 1, 1, 2], "b": [min_value, max_value, max_value, min_value]}, + dtype=any_real_numpy_dtype, + ) + gb = df.groupby("a") + result = getattr(gb, how)() + expected = DataFrame( + {"b": [1, 0]}, index=pd.Index([1, 2], name="a", dtype=any_real_numpy_dtype) + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("how", ["idxmin", "idxmax"]) +def test_idxmin_idxmax_extremes_skipna(skipna, how, float_numpy_dtype): + # GH#57040 + min_value = np.finfo(float_numpy_dtype).min + max_value = np.finfo(float_numpy_dtype).max + df = DataFrame( + { + "a": Series(np.repeat(range(1, 6), repeats=2), dtype="intp"), + "b": Series( + [ + np.nan, + min_value, + np.nan, + max_value, + min_value, + np.nan, + 
max_value, + np.nan, + np.nan, + np.nan, + ], + dtype=float_numpy_dtype, + ), + }, + ) + gb = df.groupby("a") + + if not skipna: + msg = f"DataFrameGroupBy.{how} with skipna=False" + with pytest.raises(ValueError, match=msg): + getattr(gb, how)(skipna=skipna) + return + result = getattr(gb, how)(skipna=skipna) + expected = DataFrame( + {"b": [1, 3, 4, 6, np.nan]}, index=pd.Index(range(1, 6), name="a", dtype="intp") + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "func, values", [ @@ -265,30 +374,32 @@ def test_groupby_non_arithmetic_agg_int_like_precision(method, data): tm.assert_frame_equal(result, expected) -def test_idxmin_idxmax_axis1(): +@pytest.mark.parametrize("how", ["first", "last"]) +def test_first_last_skipna(any_real_nullable_dtype, sort, skipna, how): + # GH#57019 + na_value = na_value_for_dtype(pandas_dtype(any_real_nullable_dtype)) df = DataFrame( - np.random.default_rng(2).standard_normal((10, 4)), columns=["A", "B", "C", "D"] + { + "a": [2, 1, 1, 2, 3, 3], + "b": [na_value, 3.0, na_value, 4.0, np.nan, np.nan], + "c": [na_value, 3.0, na_value, 4.0, np.nan, np.nan], + }, + dtype=any_real_nullable_dtype, ) - df["A"] = [1, 2, 3, 1, 2, 3, 1, 2, 3, 4] - - gb = df.groupby("A") - - warn_msg = "DataFrameGroupBy.idxmax with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - res = gb.idxmax(axis=1) - - alt = df.iloc[:, 1:].idxmax(axis=1) - indexer = res.index.get_level_values(1) - - tm.assert_series_equal(alt[indexer], res.droplevel("A")) - - df["E"] = date_range("2016-01-01", periods=10) - gb2 = df.groupby("A") - - msg = "'>' not supported between instances of 'Timestamp' and 'float'" - with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - gb2.idxmax(axis=1) + gb = df.groupby("a", sort=sort) + method = getattr(gb, how) + result = method(skipna=skipna) + + ilocs = { + ("first", True): [3, 1, 4], + ("first", False): [0, 1, 4], + ("last", True): [3, 1, 5], + ("last", False): [3, 2, 5], + }[how, skipna] + expected = df.iloc[ilocs].set_index("a") + if sort: + expected = expected.sort_index() + tm.assert_frame_equal(result, expected) def test_groupby_mean_no_overflow(): @@ -312,6 +423,239 @@ def test_mean_on_timedelta(): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize( + "values, dtype, result_dtype", + [ + ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "float64", "float64"), + ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "Float64", "Float64"), + ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "Int64", "Float64"), + ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "timedelta64[ns]", "timedelta64[ns]"), + ( + pd.to_datetime( + [ + "2019-05-09", + pd.NaT, + "2019-05-11", + "2019-05-12", + "2019-05-13", + "2019-05-14", + "2019-05-15", + "2019-05-16", + "2019-05-17", + "2019-05-18", + ] + ), + "datetime64[ns]", + "datetime64[ns]", + ), + ], +) +def test_mean_skipna(values, dtype, result_dtype, skipna): + # GH#15675 + df = DataFrame( + { + "val": values, + "cat": ["A", "B"] * 5, + } + ).astype({"val": dtype}) + # We need to recast the expected values to the result_dtype because + # Series.mean() changes the dtype to float64/object depending on the input dtype + expected = ( + df.groupby("cat")["val"] + .apply(lambda x: x.mean(skipna=skipna)) + .astype(result_dtype) + ) + result = df.groupby("cat")["val"].mean(skipna=skipna) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "values, dtype", + [ + ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "float64"), + ([0, 1, np.nan, 3, 
4, 5, 6, 7, 8, 9], "Float64"), + ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "Int64"), + ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "timedelta64[ns]"), + ], +) +def test_sum_skipna(values, dtype, skipna): + # GH#15675 + df = DataFrame( + { + "val": values, + "cat": ["A", "B"] * 5, + } + ).astype({"val": dtype}) + # We need to recast the expected values to the original dtype because + # Series.sum() changes the dtype + expected = ( + df.groupby("cat")["val"].apply(lambda x: x.sum(skipna=skipna)).astype(dtype) + ) + result = df.groupby("cat")["val"].sum(skipna=skipna) + tm.assert_series_equal(result, expected) + + +def test_sum_skipna_object(skipna): + # GH#15675 + df = DataFrame( + { + "val": ["a", "b", np.nan, "d", "e", "f", "g", "h", "i", "j"], + "cat": ["A", "B"] * 5, + } + ).astype({"val": object}) + if skipna: + expected = Series( + ["aegi", "bdfhj"], index=pd.Index(["A", "B"], name="cat"), name="val" + ).astype(object) + else: + expected = Series( + [np.nan, "bdfhj"], index=pd.Index(["A", "B"], name="cat"), name="val" + ).astype(object) + result = df.groupby("cat")["val"].sum(skipna=skipna) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "func, values, dtype, result_dtype", + [ + ("prod", [0, 1, 3, np.nan, 4, 5, 6, 7, -8, 9], "float64", "float64"), + ("prod", [0, -1, 3, 4, 5, np.nan, 6, 7, 8, 9], "Float64", "Float64"), + ("prod", [0, 1, 3, -4, 5, 6, 7, -8, np.nan, 9], "Int64", "Int64"), + ("prod", [np.nan] * 10, "float64", "float64"), + ("prod", [np.nan] * 10, "Float64", "Float64"), + ("prod", [np.nan] * 10, "Int64", "Int64"), + ("var", [0, -1, 3, 4, np.nan, 5, 6, 7, 8, 9], "float64", "float64"), + ("var", [0, 1, 3, -4, 5, 6, 7, -8, 9, np.nan], "Float64", "Float64"), + ("var", [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9], "Int64", "Float64"), + ("var", [np.nan] * 10, "float64", "float64"), + ("var", [np.nan] * 10, "Float64", "Float64"), + ("var", [np.nan] * 10, "Int64", "Float64"), + ("std", [0, 1, 3, -4, 5, 6, 7, -8, np.nan, 9], "float64", "float64"), + ("std", [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9], "Float64", "Float64"), + ("std", [0, 1, 3, -4, 5, 6, 7, -8, 9, np.nan], "Int64", "Float64"), + ("std", [np.nan] * 10, "float64", "float64"), + ("std", [np.nan] * 10, "Float64", "Float64"), + ("std", [np.nan] * 10, "Int64", "Float64"), + ("sem", [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9], "float64", "float64"), + ("sem", [0, 1, 3, -4, 5, 6, 7, -8, np.nan, 9], "Float64", "Float64"), + ("sem", [0, -1, 3, 4, 5, -6, 7, 8, 9, np.nan], "Int64", "Float64"), + ("sem", [np.nan] * 10, "float64", "float64"), + ("sem", [np.nan] * 10, "Float64", "Float64"), + ("sem", [np.nan] * 10, "Int64", "Float64"), + ("min", [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9], "float64", "float64"), + ("min", [0, 1, 3, -4, 5, 6, 7, -8, np.nan, 9], "Float64", "Float64"), + ("min", [0, -1, 3, 4, 5, -6, 7, 8, 9, np.nan], "Int64", "Int64"), + ( + "min", + [0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], + "timedelta64[ns]", + "timedelta64[ns]", + ), + ( + "min", + pd.to_datetime( + [ + "2019-05-09", + pd.NaT, + "2019-05-11", + "2019-05-12", + "2019-05-13", + "2019-05-14", + "2019-05-15", + "2019-05-16", + "2019-05-17", + "2019-05-18", + ] + ), + "datetime64[ns]", + "datetime64[ns]", + ), + ("min", [np.nan] * 10, "float64", "float64"), + ("min", [np.nan] * 10, "Float64", "Float64"), + ("min", [np.nan] * 10, "Int64", "Int64"), + ("max", [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9], "float64", "float64"), + ("max", [0, 1, 3, -4, 5, 6, 7, -8, np.nan, 9], "Float64", "Float64"), + ("max", [0, -1, 3, 4, 5, -6, 7, 8, 9, np.nan], "Int64", "Int64"), + ( + 
"max", + [0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], + "timedelta64[ns]", + "timedelta64[ns]", + ), + ( + "max", + pd.to_datetime( + [ + "2019-05-09", + pd.NaT, + "2019-05-11", + "2019-05-12", + "2019-05-13", + "2019-05-14", + "2019-05-15", + "2019-05-16", + "2019-05-17", + "2019-05-18", + ] + ), + "datetime64[ns]", + "datetime64[ns]", + ), + ("max", [np.nan] * 10, "float64", "float64"), + ("max", [np.nan] * 10, "Float64", "Float64"), + ("max", [np.nan] * 10, "Int64", "Int64"), + ("median", [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9], "float64", "float64"), + ("median", [0, 1, 3, -4, 5, 6, 7, -8, np.nan, 9], "Float64", "Float64"), + ("median", [0, -1, 3, 4, 5, -6, 7, 8, 9, np.nan], "Int64", "Float64"), + ( + "median", + [0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], + "timedelta64[ns]", + "timedelta64[ns]", + ), + ( + "median", + pd.to_datetime( + [ + "2019-05-09", + pd.NaT, + "2019-05-11", + "2019-05-12", + "2019-05-13", + "2019-05-14", + "2019-05-15", + "2019-05-16", + "2019-05-17", + "2019-05-18", + ] + ), + "datetime64[ns]", + "datetime64[ns]", + ), + ("median", [np.nan] * 10, "float64", "float64"), + ("median", [np.nan] * 10, "Float64", "Float64"), + ("median", [np.nan] * 10, "Int64", "Float64"), + ], +) +def test_multifunc_skipna(func, values, dtype, result_dtype, skipna): + # GH#15675 + df = DataFrame( + { + "val": values, + "cat": ["A", "B"] * 5, + } + ).astype({"val": dtype}) + # We need to recast the expected values to the result_dtype as some operations + # change the dtype + expected = ( + df.groupby("cat")["val"] + .apply(lambda x: getattr(x, func)(skipna=skipna)) + .astype(result_dtype) + ) + result = getattr(df.groupby("cat")["val"], func)(skipna=skipna) + tm.assert_series_equal(result, expected) + + def test_cython_median(): arr = np.random.default_rng(2).standard_normal(1000) arr[::2] = np.nan @@ -321,15 +665,11 @@ def test_cython_median(): labels[::17] = np.nan result = df.groupby(labels).median() - msg = "using DataFrameGroupBy.median" - with tm.assert_produces_warning(FutureWarning, match=msg): - exp = df.groupby(labels).agg(np.nanmedian) + exp = df.groupby(labels).agg(np.nanmedian) tm.assert_frame_equal(result, exp) df = DataFrame(np.random.default_rng(2).standard_normal((1000, 5))) - msg = "using DataFrameGroupBy.median" - with tm.assert_produces_warning(FutureWarning, match=msg): - rs = df.groupby(labels).agg(np.median) + rs = df.groupby(labels).agg(np.median) xp = df.groupby(labels).median() tm.assert_frame_equal(rs, xp) @@ -362,7 +702,7 @@ def test_max_min_non_numeric(): assert "ss" in result -def test_max_min_object_multiple_columns(using_array_manager): +def test_max_min_object_multiple_columns(using_infer_string): # GH#41111 case where the aggregation is valid for some columns but not # others; we split object blocks column-wise, consistent with # DataFrame._reduce @@ -375,8 +715,7 @@ def test_max_min_object_multiple_columns(using_array_manager): } ) df._consolidate_inplace() # should already be consolidate, but double-check - if not using_array_manager: - assert len(df._mgr.blocks) == 2 + assert len(df._mgr.blocks) == 3 if using_infer_string else 2 gb = df.groupby("A") @@ -606,10 +945,9 @@ def test_groupby_min_max_categorical(func): @pytest.mark.parametrize("func", ["min", "max"]) -def test_min_empty_string_dtype(func): +def test_min_empty_string_dtype(func, string_dtype_no_object): # GH#55619 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" + dtype = string_dtype_no_object df = DataFrame({"a": ["a"], "b": "a", "c": "a"}, dtype=dtype).iloc[:0] result = 
getattr(df.groupby("a"), func)() expected = DataFrame( @@ -618,6 +956,98 @@ def test_min_empty_string_dtype(func): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("min_count", [0, 1]) +@pytest.mark.parametrize("test_series", [True, False]) +def test_string_dtype_all_na( + string_dtype_no_object, reduction_func, skipna, min_count, test_series +): + # https://github.com/pandas-dev/pandas/issues/60985 + if reduction_func == "corrwith": + # corrwith is deprecated. + return + + dtype = string_dtype_no_object + + if reduction_func in [ + "any", + "all", + "idxmin", + "idxmax", + "mean", + "median", + "std", + "var", + ]: + kwargs = {"skipna": skipna} + elif reduction_func in ["kurt"]: + kwargs = {"min_count": min_count} + elif reduction_func in ["count", "nunique", "quantile", "sem", "size"]: + kwargs = {} + else: + kwargs = {"skipna": skipna, "min_count": min_count} + + expected_dtype, expected_value = dtype, pd.NA + if reduction_func in ["all", "any"]: + expected_dtype = "bool" + # TODO: For skipna=False, bool(pd.NA) raises; should groupby raise too? + expected_value = not skipna if reduction_func == "any" else True + elif reduction_func in ["count", "nunique", "size"]: + # TODO: Should be more consistent - return Int64 when dtype.na_value is pd.NA? + if ( + test_series + and reduction_func == "size" + and dtype.storage == "pyarrow" + and dtype.na_value is pd.NA + ): + expected_dtype = "Int64" + else: + expected_dtype = "int64" + expected_value = 1 if reduction_func == "size" else 0 + elif reduction_func in ["idxmin", "idxmax"]: + expected_dtype, expected_value = "float64", np.nan + elif not skipna or min_count > 0: + expected_value = pd.NA + elif reduction_func == "sum": + # https://github.com/pandas-dev/pandas/pull/60936 + expected_value = "" + + df = DataFrame({"a": ["x"], "b": [pd.NA]}, dtype=dtype) + obj = df["b"] if test_series else df + args = get_groupby_method_args(reduction_func, obj) + gb = obj.groupby(df["a"]) + method = getattr(gb, reduction_func) + + if reduction_func in [ + "mean", + "median", + "kurt", + "prod", + "quantile", + "sem", + "skew", + "std", + "var", + ]: + msg = f"dtype '{dtype}' does not support operation '{reduction_func}'" + with pytest.raises(TypeError, match=msg): + method(*args, **kwargs) + return + elif reduction_func in ["idxmin", "idxmax"] and not skipna: + msg = f"{reduction_func} with skipna=False encountered an NA value."
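+ # An all-NA group leaves idxmin/idxmax with no valid row to point at: + # e.g. Series([pd.NA], dtype="string").groupby(["x"]).idxmin(skipna=False) + # raises this ValueError, while the skipna=True path above expects NaN.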
+ with pytest.raises(ValueError, match=msg): + method(*args, **kwargs) + return + + result = method(*args, **kwargs) + index = pd.Index(["x"], name="a", dtype=dtype) + if test_series or reduction_func == "size": + name = None if not test_series and reduction_func == "size" else "b" + expected = Series(expected_value, index=index, dtype=expected_dtype, name=name) + else: + expected = DataFrame({"b": expected_value}, index=index, dtype=expected_dtype) + tm.assert_equal(result, expected) + + def test_max_nan_bug(): df = DataFrame( { @@ -639,9 +1069,6 @@ def test_max_nan_bug(): @pytest.mark.slow -@pytest.mark.parametrize("sort", [False, True]) -@pytest.mark.parametrize("dropna", [False, True]) -@pytest.mark.parametrize("as_index", [True, False]) @pytest.mark.parametrize("with_nan", [True, False]) @pytest.mark.parametrize("keys", [["joe"], ["joe", "jim"]]) def test_series_groupby_nunique(sort, dropna, as_index, with_nan, keys): @@ -828,15 +1255,11 @@ def test_intercept_builtin_sum(): s = Series([1.0, 2.0, np.nan, 3.0]) grouped = s.groupby([0, 1, 2, 2]) - msg = "using SeriesGroupBy.sum" - with tm.assert_produces_warning(FutureWarning, match=msg): - # GH#53425 - result = grouped.agg(builtins.sum) - msg = "using np.sum" - with tm.assert_produces_warning(FutureWarning, match=msg): - # GH#53425 - result2 = grouped.apply(builtins.sum) - expected = grouped.sum() + # GH#53425 + result = grouped.agg(builtins.sum) + # GH#53425 + result2 = grouped.apply(builtins.sum) + expected = Series([1.0, 2.0, np.nan], index=np.array([0, 1, 2])) tm.assert_series_equal(result, expected) tm.assert_series_equal(result2, expected) @@ -884,7 +1307,7 @@ def test_groupby_sum_timedelta_with_nat(): df = DataFrame( { "a": [1, 1, 2, 2], - "b": [pd.Timedelta("1d"), pd.Timedelta("2d"), pd.Timedelta("3d"), pd.NaT], + "b": [pd.Timedelta("1D"), pd.Timedelta("2D"), pd.Timedelta("3D"), pd.NaT], } ) td3 = pd.Timedelta(days=3) @@ -971,10 +1394,8 @@ def test_ops_general(op, targop): labels = np.random.default_rng(2).integers(0, 50, size=1000).astype(float) result = getattr(df.groupby(labels), op)() - warn = None if op in ("first", "last", "count", "sem") else FutureWarning - msg = f"using DataFrameGroupBy.{op}" - with tm.assert_produces_warning(warn, match=msg): - expected = df.groupby(labels).agg(targop) + kwargs = {"ddof": 1, "axis": 0} if op in ["std", "var"] else {} + expected = df.groupby(labels).agg(targop, **kwargs) tm.assert_frame_equal(result, expected) @@ -1019,43 +1440,32 @@ def test_apply_to_nullable_integer_returns_float(values, function): "median", "mean", "skew", + "kurt", "std", "var", "sem", ], ) -@pytest.mark.parametrize("axis", [0, 1]) -@pytest.mark.parametrize("skipna", [True, False]) -@pytest.mark.parametrize("sort", [True, False]) -def test_regression_allowlist_methods(op, axis, skipna, sort): +def test_regression_allowlist_methods(op, skipna, sort): # GH6944 # GH 17537 # explicitly test the allowlist methods - raw_frame = DataFrame([0]) - if axis == 0: - frame = raw_frame - msg = "The 'axis' keyword in DataFrame.groupby is deprecated and will be" - else: - frame = raw_frame.T - msg = "DataFrame.groupby with axis=1 is deprecated" + frame = DataFrame([0]) - with tm.assert_produces_warning(FutureWarning, match=msg): - grouped = frame.groupby(level=0, axis=axis, sort=sort) + grouped = frame.groupby(level=0, sort=sort) - if op == "skew": - # skew has skipna + if op in ["skew", "kurt", "sum", "mean"]: + # skew, kurt, sum, mean have skipna result = getattr(grouped, op)(skipna=skipna) - expected = 
frame.groupby(level=0).apply( - lambda h: getattr(h, op)(axis=axis, skipna=skipna) - ) + expected = frame.groupby(level=0).apply(lambda h: getattr(h, op)(skipna=skipna)) if sort: - expected = expected.sort_index(axis=axis) + expected = expected.sort_index() tm.assert_frame_equal(result, expected) else: result = getattr(grouped, op)() - expected = frame.groupby(level=0).apply(lambda h: getattr(h, op)(axis=axis)) + expected = frame.groupby(level=0).apply(lambda h: getattr(h, op)()) if sort: - expected = expected.sort_index(axis=axis) + expected = expected.sort_index() tm.assert_frame_equal(result, expected) @@ -1081,3 +1491,30 @@ def test_groupby_prod_with_int64_dtype(): result = df.groupby(["A"]).prod().reset_index() expected = DataFrame({"A": [1], "B": [180970905912331920]}, dtype="int64") tm.assert_frame_equal(result, expected) + + +def test_groupby_std_datetimelike(): + # GH#48481 + tdi = pd.timedelta_range("1 Day", periods=10000) + ser = Series(tdi) + ser[::5] *= 2 # get different std for different groups + + df = ser.to_frame("A").copy() + + df["B"] = ser + Timestamp(0) + df["C"] = ser + Timestamp(0, tz="UTC") + df.iloc[-1] = pd.NaT # last group includes NaTs + + gb = df.groupby(list(range(5)) * 2000) + + result = gb.std() + + # Note: this does not _exactly_ match what we would get if we did + # [gb.get_group(i).std() for i in gb.groups] + # but it _does_ match the floating point error we get doing the + # same operation on int64 data xref GH#51332 + td1 = pd.Timedelta("2887 days 11:21:02.326710176") + td4 = pd.Timedelta("2886 days 00:42:34.664668096") + exp_ser = Series([td1 * 2, td1, td1, td1, td4], index=np.arange(5)) + expected = DataFrame({"A": exp_ser, "B": exp_ser, "C": exp_ser}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index d357a65e79796..550efe9187fe8 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -1,14 +1,17 @@ """ test with the TimeGrouper / grouping with datetimes """ + from datetime import ( datetime, timedelta, + timezone, ) import numpy as np import pytest -import pytz + +from pandas._config import using_string_dtype import pandas as pd from pandas import ( @@ -67,12 +70,15 @@ def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper): gb = df.groupby(tdg) # check we're testing the case we're interested in - assert len(gb._grouper.result_index) != len(gb._grouper.group_keys_seq) + assert len(gb._grouper.result_index) != len(gb._grouper.codes) return gb class TestGroupBy: + # TODO(infer_string) resample sum introduces 0's + # https://github.com/pandas-dev/pandas/issues/60229 + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_with_timegrouper(self): # GH 4161 # TimeGrouper requires a sorted index @@ -477,12 +483,8 @@ def test_timegrouper_apply_return_type_series(self): def sumfunc_series(x): return Series([x["value"].sum()], ("sum",)) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = df.groupby(Grouper(key="date")).apply(sumfunc_series) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_series) + expected = df.groupby(Grouper(key="date")).apply(sumfunc_series) + result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_series) 
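+ # DataFrameGroupBy.apply now excludes the grouping columns from the frame + # passed to the function, so these calls are expected to run without the + # "operated on the grouping columns" FutureWarning.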
tm.assert_frame_equal( result.reset_index(drop=True), expected.reset_index(drop=True) ) @@ -498,11 +500,8 @@ def test_timegrouper_apply_return_type_value(self): def sumfunc_value(x): return x.value.sum() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = df.groupby(Grouper(key="date")).apply(sumfunc_value) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_value) + expected = df.groupby(Grouper(key="date")).apply(sumfunc_value) + result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_value) tm.assert_series_equal( result.reset_index(drop=True), expected.reset_index(drop=True) ) @@ -773,12 +772,12 @@ def test_groupby_with_timezone_selection(self): def test_timezone_info(self): # see gh-11682: Timezone info lost when broadcasting # scalar datetime to DataFrame - - df = DataFrame({"a": [1], "b": [datetime.now(pytz.utc)]}) - assert df["b"][0].tzinfo == pytz.utc + utc = timezone.utc + df = DataFrame({"a": [1], "b": [datetime.now(utc)]}) + assert df["b"][0].tzinfo == utc df = DataFrame({"a": [1, 2, 3]}) - df["b"] = datetime.now(pytz.utc) - assert df["b"][0].tzinfo == pytz.utc + df["b"] = datetime.now(utc) + assert df["b"][0].tzinfo == utc def test_datetime_count(self): df = DataFrame( @@ -925,12 +924,10 @@ def test_groupby_apply_timegrouper_with_nat_apply_squeeze( # check that we will go through the singular_series path # in _wrap_applied_output_series assert gb.ngroups == 1 - assert gb._selected_obj._get_axis(gb.axis).nlevels == 1 + assert gb._selected_obj.index.nlevels == 1 # function that returns a Series - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - res = gb.apply(lambda x: x["Quantity"] * 2) + res = gb.apply(lambda x: x["Quantity"] * 2) dti = Index([Timestamp("2013-12-31")], dtype=df["Date"].dtype, name="Date") expected = DataFrame( diff --git a/pandas/tests/groupby/transform/test_numba.py b/pandas/tests/groupby/transform/test_numba.py index 61fcc930f116a..e19b7592f75b3 100644 --- a/pandas/tests/groupby/transform/test_numba.py +++ b/pandas/tests/groupby/transform/test_numba.py @@ -1,6 +1,7 @@ import numpy as np import pytest +from pandas.compat import is_platform_arm from pandas.errors import NumbaUtilError from pandas import ( @@ -9,8 +10,17 @@ option_context, ) import pandas._testing as tm +from pandas.util.version import Version -pytestmark = pytest.mark.single_cpu +pytestmark = [pytest.mark.single_cpu] + +numba = pytest.importorskip("numba") +pytestmark.append( + pytest.mark.skipif( + Version(numba.__version__) == Version("0.61") and is_platform_arm(), + reason=f"Segfaults on ARM platforms with numba {numba.__version__}", + ) +) def test_correct_function_signature(): @@ -33,26 +43,49 @@ def incorrect_function(x): def test_check_nopython_kwargs(): pytest.importorskip("numba") - def incorrect_function(values, index): - return values + 1 + def incorrect_function(values, index, *, a): + return values + a + + def correct_function(values, index, a): + return values + a data = DataFrame( {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=["key", "data"], ) + # py signature binding + with pytest.raises( + TypeError, match="missing a required (keyword-only argument|argument): 'a'" + ): + data.groupby("key").transform(incorrect_function, engine="numba", b=1) + with pytest.raises(TypeError, match="missing a 
required argument: 'a'"): + data.groupby("key").transform(correct_function, engine="numba", b=1) + + with pytest.raises( + TypeError, match="missing a required (keyword-only argument|argument): 'a'" + ): + data.groupby("key")["data"].transform(incorrect_function, engine="numba", b=1) + with pytest.raises(TypeError, match="missing a required argument: 'a'"): + data.groupby("key")["data"].transform(correct_function, engine="numba", b=1) + + # numba signature check after binding with pytest.raises(NumbaUtilError, match="numba does not support"): data.groupby("key").transform(incorrect_function, engine="numba", a=1) + actual = data.groupby("key").transform(correct_function, engine="numba", a=1) + tm.assert_frame_equal(data[["data"]] + 1, actual) with pytest.raises(NumbaUtilError, match="numba does not support"): data.groupby("key")["data"].transform(incorrect_function, engine="numba", a=1) + actual = data.groupby("key")["data"].transform( + correct_function, engine="numba", a=1 + ) + tm.assert_series_equal(data["data"] + 1, actual) @pytest.mark.filterwarnings("ignore") # Filter warnings when parallel=True and the function can't be parallelized by Numba @pytest.mark.parametrize("jit", [True, False]) -@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"]) -@pytest.mark.parametrize("as_index", [True, False]) -def test_numba_vs_cython(jit, pandas_obj, nogil, parallel, nopython, as_index): +def test_numba_vs_cython(jit, frame_or_series, nogil, parallel, nopython, as_index): pytest.importorskip("numba") def func(values, index): @@ -69,7 +102,7 @@ def func(values, index): ) engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} grouped = data.groupby(0, as_index=as_index) - if pandas_obj == "Series": + if frame_or_series is Series: grouped = grouped[1] result = grouped.transform(func, engine="numba", engine_kwargs=engine_kwargs) @@ -81,8 +114,7 @@ def func(values, index): @pytest.mark.filterwarnings("ignore") # Filter warnings when parallel=True and the function can't be parallelized by Numba @pytest.mark.parametrize("jit", [True, False]) -@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"]) -def test_cache(jit, pandas_obj, nogil, parallel, nopython): +def test_cache(jit, frame_or_series, nogil, parallel, nopython): # Test that the functions are cached correctly if we switch functions pytest.importorskip("numba") @@ -103,7 +135,7 @@ def func_2(values, index): ) engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} grouped = data.groupby(0) - if pandas_obj == "Series": + if frame_or_series is Series: grouped = grouped[1] result = grouped.transform(func_1, engine="numba", engine_kwargs=engine_kwargs) @@ -184,10 +216,25 @@ def f(values, index): df = DataFrame({"group": ["A", "A", "B"], "v": [4, 5, 6]}, index=[-1, -2, -3]) result = df.groupby("group").transform(f, engine="numba") - expected = DataFrame([-4.0, -3.0, -2.0], columns=["v"], index=[-1, -2, -3]) + expected = DataFrame([-2.0, -3.0, -4.0], columns=["v"], index=[-1, -2, -3]) tm.assert_frame_equal(result, expected) +def test_index_order_consistency_preserved(): + # GH 57069 + pytest.importorskip("numba") + + def f(values, index): + return values + + df = DataFrame( + {"vals": [0.0, 1.0, 2.0, 3.0], "group": [0, 1, 0, 1]}, index=range(3, -1, -1) + ) + result = df.groupby("group")["vals"].transform(f, engine="numba") + expected = Series([0.0, 1.0, 2.0, 3.0], index=range(3, -1, -1), name="vals") + tm.assert_series_equal(result, expected) + + def test_engine_kwargs_not_cached(): # If the 
user passes a different set of engine_kwargs don't return the same # jitted function diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index a2ecd6c65db60..fecd20fd6cece 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1,4 +1,5 @@ -""" test with the .transform """ +"""test with the .transform""" + import numpy as np import pytest @@ -78,9 +79,7 @@ def demean(arr): # GH 9700 df = DataFrame({"a": range(5, 10), "b": range(5)}) - msg = "using DataFrameGroupBy.max" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("a").transform(max) + result = df.groupby("a").transform(max) expected = DataFrame({"b": range(5)}) tm.assert_frame_equal(result, expected) @@ -88,8 +87,8 @@ def demean(arr): def test_transform_fast(): df = DataFrame( { - "id": np.arange(100000) / 3, - "val": np.random.default_rng(2).standard_normal(100000), + "id": np.arange(10) / 3, + "val": np.random.default_rng(2).standard_normal(10), } ) @@ -98,9 +97,7 @@ def test_transform_fast(): values = np.repeat(grp.mean().values, ensure_platform_int(grp.count().values)) expected = Series(values, index=df.index, name="val") - msg = "using SeriesGroupBy.mean" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = grp.transform(np.mean) + result = grp.transform(np.mean) tm.assert_series_equal(result, expected) result = grp.transform("mean") @@ -151,18 +148,14 @@ def test_transform_fast3(): def test_transform_broadcast(tsframe, ts): grouped = ts.groupby(lambda x: x.month) - msg = "using SeriesGroupBy.mean" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = grouped.transform(np.mean) + result = grouped.transform(np.mean) tm.assert_index_equal(result.index, ts.index) for _, gp in grouped: assert_fp_equal(result.reindex(gp.index), gp.mean()) grouped = tsframe.groupby(lambda x: x.month) - msg = "using DataFrameGroupBy.mean" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = grouped.transform(np.mean) + result = grouped.transform(np.mean) tm.assert_index_equal(result.index, tsframe.index) for _, gp in grouped: agged = gp.mean(axis=0) @@ -170,69 +163,9 @@ def test_transform_broadcast(tsframe, ts): for col in tsframe: assert_fp_equal(res[col], agged[col]) - # group columns - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - grouped = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1) - msg = "using DataFrameGroupBy.mean" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = grouped.transform(np.mean) - tm.assert_index_equal(result.index, tsframe.index) - tm.assert_index_equal(result.columns, tsframe.columns) - for _, gp in grouped: - agged = gp.mean(1) - res = result.reindex(columns=gp.columns) - for idx in gp.index: - assert_fp_equal(res.xs(idx), agged[idx]) - - -def test_transform_axis_1(request, transformation_func): - # GH 36308 - - df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}, index=["x", "y"]) - args = get_groupby_method_args(transformation_func, df) - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - gb = df.groupby([0, 0, 1], axis=1) - warn = FutureWarning if transformation_func == "fillna" else None - msg = "DataFrameGroupBy.fillna is deprecated" - with tm.assert_produces_warning(warn, match=msg): - result = gb.transform(transformation_func, *args) - msg = 
"DataFrameGroupBy.fillna is deprecated" - with tm.assert_produces_warning(warn, match=msg): - expected = df.T.groupby([0, 0, 1]).transform(transformation_func, *args).T - - if transformation_func in ["diff", "shift"]: - # Result contains nans, so transpose coerces to float - expected["b"] = expected["b"].astype("int64") - - # cumcount returns Series; the rest are DataFrame - tm.assert_equal(result, expected) - - -def test_transform_axis_1_reducer(request, reduction_func): - # GH#45715 - if reduction_func in ( - "corrwith", - "ngroup", - "nth", - ): - marker = pytest.mark.xfail(reason="transform incorrectly fails - GH#45986") - request.applymarker(marker) - - df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}, index=["x", "y"]) - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - gb = df.groupby([0, 0, 1], axis=1) - - result = gb.transform(reduction_func) - expected = df.T.groupby([0, 0, 1]).transform(reduction_func).T - tm.assert_equal(result, expected) - def test_transform_axis_ts(tsframe): - # make sure that we are setting the axes - # correctly when on axis=0 or 1 + # make sure that we are setting the axes correctly # in the presence of a non-monotonic indexer # GH12713 @@ -252,14 +185,6 @@ def test_transform_axis_ts(tsframe): expected = grouped.apply(lambda x: x - x.mean(axis=0)) tm.assert_frame_equal(result, expected) - ts = ts.T - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - grouped = ts.groupby(lambda x: x.weekday(), axis=1, group_keys=False) - result = ts - grouped.transform("mean") - expected = grouped.apply(lambda x: (x.T - x.mean(1)).T) - tm.assert_frame_equal(result, expected) - # non-monotonic ts = tso.iloc[[1, 0] + list(range(2, len(base)))] grouped = ts.groupby(lambda x: x.weekday(), group_keys=False) @@ -267,14 +192,6 @@ def test_transform_axis_ts(tsframe): expected = grouped.apply(lambda x: x - x.mean(axis=0)) tm.assert_frame_equal(result, expected) - ts = ts.T - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - grouped = ts.groupby(lambda x: x.weekday(), axis=1, group_keys=False) - result = ts - grouped.transform("mean") - expected = grouped.apply(lambda x: (x.T - x.mean(1)).T) - tm.assert_frame_equal(result, expected) - def test_transform_dtype(): # GH 9807 @@ -385,12 +302,8 @@ def test_transform_casting(): def test_transform_multiple(ts): grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) - grouped.transform(lambda x: x * 2) - - msg = "using SeriesGroupBy.mean" - with tm.assert_produces_warning(FutureWarning, match=msg): - grouped.transform(np.mean) + grouped.transform(np.mean) def test_dispatch_transform(tsframe): @@ -398,33 +311,12 @@ def test_dispatch_transform(tsframe): grouped = df.groupby(lambda x: x.month) - msg = "DataFrameGroupBy.fillna is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - filled = grouped.fillna(method="pad") - msg = "Series.fillna with 'method' is deprecated" - fillit = lambda x: x.fillna(method="pad") - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = df.groupby(lambda x: x.month).transform(fillit) + filled = grouped.ffill() + fillit = lambda x: x.ffill() + expected = df.groupby(lambda x: x.month).transform(fillit) tm.assert_frame_equal(filled, expected) -def test_transform_fillna_null(): - df = DataFrame( - { - "price": [10, 10, 20, 20, 30, 30], - "color": [10, 10, 20, 20, 30, 30], - 
"cost": (100, 200, 300, 400, 500, 600), - } - ) - msg = "DataFrameGroupBy.fillna is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - with pytest.raises(ValueError, match="Must specify a fill 'value' or 'method'"): - df.groupby(["price"]).transform("fillna") - with tm.assert_produces_warning(FutureWarning, match=msg): - with pytest.raises(ValueError, match="Must specify a fill 'value' or 'method'"): - df.groupby(["price"]).fillna() - - def test_transform_transformation_func(transformation_func): # GH 30918 df = DataFrame( @@ -437,9 +329,6 @@ def test_transform_transformation_func(transformation_func): if transformation_func == "cumcount": test_op = lambda x: x.transform("cumcount") mock_op = lambda x: Series(range(len(x)), x.index) - elif transformation_func == "fillna": - test_op = lambda x: x.transform("fillna", value=0) - mock_op = lambda x: x.fillna(value=0) elif transformation_func == "ngroup": test_op = lambda x: x.transform("ngroup") counter = -1 @@ -453,31 +342,12 @@ def mock_op(x): test_op = lambda x: x.transform(transformation_func) mock_op = lambda x: getattr(x, transformation_func)() - if transformation_func == "pct_change": - msg = "The default fill_method='pad' in DataFrame.pct_change is deprecated" - groupby_msg = ( - "The default fill_method='ffill' in DataFrameGroupBy.pct_change " - "is deprecated" - ) - warn = FutureWarning - groupby_warn = FutureWarning - elif transformation_func == "fillna": - msg = "" - groupby_msg = "DataFrameGroupBy.fillna is deprecated" - warn = None - groupby_warn = FutureWarning - else: - msg = groupby_msg = "" - warn = groupby_warn = None - - with tm.assert_produces_warning(groupby_warn, match=groupby_msg): - result = test_op(df.groupby("A")) + result = test_op(df.groupby("A")) # pass the group in same order as iterating `for ... 
in df.groupby(...)` # but reorder to match df's index since this is a transform groups = [df[["B"]].iloc[4:6], df[["B"]].iloc[6:], df[["B"]].iloc[:4]] - with tm.assert_produces_warning(warn, match=msg): - expected = concat([mock_op(g) for g in groups]).sort_index() + expected = concat([mock_op(g) for g in groups]).sort_index() # sort_index does not preserve the freq expected = expected.set_axis(df.index) @@ -497,7 +367,7 @@ def test_transform_select_columns(df): tm.assert_frame_equal(result, expected) -def test_transform_nuisance_raises(df): +def test_transform_nuisance_raises(df, using_infer_string): # case that goes through _transform_item_by_item df.columns = ["A", "B", "B", "D"] @@ -507,24 +377,23 @@ def test_transform_nuisance_raises(df): grouped = df.groupby("A") gbc = grouped["B"] - with pytest.raises(TypeError, match="Could not convert"): + msg = "Could not convert" + if using_infer_string: + msg = "Cannot perform reduction 'mean' with string dtype" + with pytest.raises(TypeError, match=msg): gbc.transform(lambda x: np.mean(x)) - with pytest.raises(TypeError, match="Could not convert"): + with pytest.raises(TypeError, match=msg): df.groupby("A").transform(lambda x: np.mean(x)) def test_transform_function_aliases(df): result = df.groupby("A").transform("mean", numeric_only=True) - msg = "using DataFrameGroupBy.mean" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = df.groupby("A")[["C", "D"]].transform(np.mean) + expected = df.groupby("A")[["C", "D"]].transform(np.mean) tm.assert_frame_equal(result, expected) result = df.groupby("A")["C"].transform("mean") - msg = "using SeriesGroupBy.mean" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = df.groupby("A")["C"].transform(np.mean) + expected = df.groupby("A")["C"].transform(np.mean) tm.assert_series_equal(result, expected) @@ -544,22 +413,19 @@ def test_series_fast_transform_date(): tm.assert_series_equal(result, expected) -def test_transform_length(): +@pytest.mark.parametrize("func", [lambda x: np.nansum(x), sum]) +def test_transform_length(func): # GH 9697 df = DataFrame({"col1": [1, 1, 2, 2], "col2": [1, 2, 3, np.nan]}) - expected = Series([3.0] * 4) - - def nsum(x): - return np.nansum(x) - - msg = "using DataFrameGroupBy.sum" - with tm.assert_produces_warning(FutureWarning, match=msg): - results = [ - df.groupby("col1").transform(sum)["col2"], - df.groupby("col1")["col2"].transform(sum), - df.groupby("col1").transform(nsum)["col2"], - df.groupby("col1")["col2"].transform(nsum), - ] + if func is sum: + expected = Series([3.0, 3.0, np.nan, np.nan]) + else: + expected = Series([3.0] * 4) + + results = [ + df.groupby("col1").transform(func)["col2"], + df.groupby("col1")["col2"].transform(func), + ] for result in results: tm.assert_series_equal(result, expected, check_names=False) @@ -571,15 +437,12 @@ def test_transform_coercion(): df = DataFrame({"A": ["a", "a", "b", "b"], "B": [0, 1, 3, 4]}) g = df.groupby("A") - msg = "using DataFrameGroupBy.mean" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = g.transform(np.mean) - + expected = g.transform(np.mean) result = g.transform(lambda x: np.mean(x, axis=0)) tm.assert_frame_equal(result, expected) -def test_groupby_transform_with_int(): +def test_groupby_transform_with_int(using_infer_string): # GH 3740, make sure that we might upcast on item-by-item transform # floats @@ -609,8 +472,11 @@ def test_groupby_transform_with_int(): "D": "foo", } ) + msg = "Could not convert" + if using_infer_string: + msg = "Cannot 
perform reduction 'mean' with string dtype" with np.errstate(all="ignore"): - with pytest.raises(TypeError, match="Could not convert"): + with pytest.raises(TypeError, match=msg): df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) result = df.groupby("A")[["B", "C"]].transform( lambda x: (x - x.mean()) / x.std() @@ -622,7 +488,7 @@ def test_groupby_transform_with_int(): s = Series([2, 3, 4, 10, 5, -1]) df = DataFrame({"A": [1, 1, 1, 2, 2, 2], "B": 1, "C": s, "D": "foo"}) with np.errstate(all="ignore"): - with pytest.raises(TypeError, match="Could not convert"): + with pytest.raises(TypeError, match=msg): df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) result = df.groupby("A")[["B", "C"]].transform( lambda x: (x - x.mean()) / x.std() @@ -644,9 +510,7 @@ def test_groupby_transform_with_int(): def test_groupby_transform_with_nan_group(): # GH 9941 df = DataFrame({"a": range(10), "b": [1, 1, 2, 3, np.nan, 4, 4, 5, 5, 5]}) - msg = "using SeriesGroupBy.max" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby(df.b)["a"].transform(max) + result = df.groupby(df.b)["a"].transform(max) expected = Series([1.0, 1.0, 2.0, 3.0, np.nan, 6.0, 6.0, 9.0, 9.0, 9.0], name="a") tm.assert_series_equal(result, expected) @@ -667,17 +531,14 @@ def f(group): return group[:1] grouped = df.groupby("c") - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = grouped.apply(f) + result = grouped.apply(f) assert result["d"].dtype == np.float64 # this is by definition a mutating operation! - with pd.option_context("mode.chained_assignment", None): - for key, group in grouped: - res = f(group) - tm.assert_frame_equal(res, result.loc[key]) + for key, group in grouped: + res = f(group.drop(columns="c")) + tm.assert_frame_equal(res, result.loc[key]) @pytest.mark.parametrize( @@ -706,7 +567,6 @@ def test_cython_transform_series(op, args, targop): @pytest.mark.parametrize("op", ["cumprod", "cumsum"]) -@pytest.mark.parametrize("skipna", [False, True]) @pytest.mark.parametrize( "input, exp", [ @@ -823,22 +683,14 @@ def test_cython_transform_frame(request, op, args, targop, df_fix, gb_target): f = gb[["float", "float_missing"]].apply(targop) expected = concat([f, i], axis=1) else: - if op != "shift" or not isinstance(gb_target.get("by"), (str, list)): - warn = None - else: - warn = FutureWarning - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(warn, match=msg): - expected = gb.apply(targop) + expected = gb.apply(targop) expected = expected.sort_index(axis=1) if op == "shift": - depr_msg = "The 'downcast' keyword in fillna is deprecated" - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - expected["string_missing"] = expected["string_missing"].fillna( - np.nan, downcast=False - ) - expected["string"] = expected["string"].fillna(np.nan, downcast=False) + expected["string_missing"] = expected["string_missing"].fillna(np.nan) + by = gb_target.get("by") + if not isinstance(by, (str, list)) or (by != "string" and "string" not in by): + expected["string"] = expected["string"].fillna(np.nan) result = gb[expected.columns].transform(op, *args).sort_index(axis=1) tm.assert_frame_equal(result, expected) @@ -894,8 +746,10 @@ def test_cython_transform_frame_column( msg = "|".join( [ "does not support .* operations", + "does not support operation", ".* is not supported for object dtype", "is not implemented for this dtype", + ".* is not 
supported for str dtype", ] ) with pytest.raises(TypeError, match=msg): @@ -906,9 +760,7 @@ def test_cython_transform_frame_column( expected = gb[c].apply(targop) expected.name = c if c in ["string_missing", "string"]: - depr_msg = "The 'downcast' keyword in fillna is deprecated" - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - expected = expected.fillna(np.nan, downcast=False) + expected = expected.fillna(np.nan) res = gb[c].transform(op, *args) tm.assert_series_equal(expected, res) @@ -916,38 +768,6 @@ def test_cython_transform_frame_column( tm.assert_series_equal(expected, res2) -def test_transform_with_non_scalar_group(): - # GH 10165 - cols = MultiIndex.from_tuples( - [ - ("syn", "A"), - ("foo", "A"), - ("non", "A"), - ("syn", "C"), - ("foo", "C"), - ("non", "C"), - ("syn", "T"), - ("foo", "T"), - ("non", "T"), - ("syn", "G"), - ("foo", "G"), - ("non", "G"), - ] - ) - df = DataFrame( - np.random.default_rng(2).integers(1, 10, (4, 12)), - columns=cols, - index=["A", "C", "G", "T"], - ) - - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - gb = df.groupby(axis=1, level=1) - msg = "transform must return a scalar value for each group.*" - with pytest.raises(ValueError, match=msg): - gb.transform(lambda z: z.div(z.sum(axis=1), axis=0)) - - @pytest.mark.parametrize( "cols,expected", [ @@ -1078,9 +898,7 @@ def test_pad_stable_sorting(fill_method): ], ) @pytest.mark.parametrize("periods", [1, -1]) -@pytest.mark.parametrize("fill_method", ["ffill", "bfill", None]) -@pytest.mark.parametrize("limit", [None, 1]) -def test_pct_change(frame_or_series, freq, periods, fill_method, limit): +def test_pct_change(frame_or_series, freq, periods): # GH 21200, 21621, 30463 vals = [3, np.nan, np.nan, np.nan, 1, 2, 4, 10, np.nan, 4] keys = ["a", "b"] @@ -1088,8 +906,6 @@ def test_pct_change(frame_or_series, freq, periods, fill_method, limit): df = DataFrame({"key": key_v, "vals": vals * 2}) df_g = df - if fill_method is not None: - df_g = getattr(df.groupby("key"), fill_method)(limit=limit) grp = df_g.groupby(df.key) expected = grp["vals"].obj / grp["vals"].shift(periods) - 1 @@ -1101,14 +917,7 @@ def test_pct_change(frame_or_series, freq, periods, fill_method, limit): else: expected = expected.to_frame("vals") - msg = ( - "The 'fill_method' keyword being not None and the 'limit' keyword in " - f"{type(gb).__name__}.pct_change are deprecated" - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = gb.pct_change( - periods=periods, fill_method=fill_method, limit=limit, freq=freq - ) + result = gb.pct_change(periods=periods, freq=freq) tm.assert_equal(result, expected) @@ -1156,9 +965,7 @@ def test_any_all_np_func(func): exp = Series([True, np.nan, True], name="val") - msg = "using SeriesGroupBy.[any|all]" - with tm.assert_produces_warning(FutureWarning, match=msg): - res = df.groupby("key")["val"].transform(func) + res = df.groupby("key")["val"].transform(func) tm.assert_series_equal(res, exp) @@ -1188,10 +995,7 @@ def test_groupby_transform_timezone_column(func): # GH 24198 ts = pd.to_datetime("now", utc=True).tz_convert("Asia/Singapore") result = DataFrame({"end_time": [ts], "id": [1]}) - warn = FutureWarning if not isinstance(func, str) else None - msg = "using SeriesGroupBy.[min|max]" - with tm.assert_produces_warning(warn, match=msg): - result["max_end_time"] = result.groupby("id").end_time.transform(func) + result["max_end_time"] = result.groupby("id").end_time.transform(func) expected = 
DataFrame([[ts, 1, ts]], columns=["end_time", "id", "max_end_time"]) tm.assert_frame_equal(result, expected) @@ -1224,14 +1028,14 @@ def test_groupby_transform_dtype(): df = DataFrame({"a": [1], "val": [1.35]}) result = df["val"].transform(lambda x: x.map(lambda y: f"+{y}")) - expected1 = Series(["+1.35"], name="val", dtype="object") + expected1 = Series(["+1.35"], name="val") tm.assert_series_equal(result, expected1) result = df.groupby("a")["val"].transform(lambda x: x.map(lambda y: f"+{y}")) tm.assert_series_equal(result, expected1) result = df.groupby("a")["val"].transform(lambda x: x.map(lambda y: f"+({y})")) - expected2 = Series(["+(1.35)"], name="val", dtype="object") + expected2 = Series(["+(1.35)"], name="val") tm.assert_series_equal(result, expected2) df["val"] = df["val"].astype(object) @@ -1239,15 +1043,14 @@ def test_groupby_transform_dtype(): tm.assert_series_equal(result, expected1) -@pytest.mark.parametrize("func", ["cumsum", "cumprod", "cummin", "cummax"]) -def test_transform_absent_categories(func): +def test_transform_absent_categories(all_numeric_accumulations): # GH 16771 # cython transforms with more groups than rows x_vals = [1] x_cats = range(2) y = [1] df = DataFrame({"x": Categorical(x_vals, x_cats), "y": y}) - result = getattr(df.y.groupby(df.x, observed=False), func)() + result = getattr(df.y.groupby(df.x, observed=False), all_numeric_accumulations)() expected = df.y tm.assert_series_equal(result, expected) @@ -1285,13 +1088,13 @@ def test_transform_agg_by_name(request, reduction_func, frame_or_series): func = reduction_func obj = DataFrame( - {"a": [0, 0, 0, 1, 1, 1], "b": range(6)}, - index=["A", "B", "C", "D", "E", "F"], + {"a": [0, 0, 0, 0, 1, 1, 1, 1], "b": range(8)}, + index=["A", "B", "C", "D", "E", "F", "G", "H"], ) if frame_or_series is Series: obj = obj["a"] - g = obj.groupby(np.repeat([0, 1], 3)) + g = obj.groupby(np.repeat([0, 1], 4)) if func == "corrwith" and isinstance(obj, Series): # GH#32293 # TODO: implement SeriesGroupBy.corrwith @@ -1299,7 +1102,14 @@ def test_transform_agg_by_name(request, reduction_func, frame_or_series): return args = get_groupby_method_args(reduction_func, obj) - result = g.transform(func, *args) + if func == "corrwith": + warn = FutureWarning + msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + msg = "" + with tm.assert_produces_warning(warn, match=msg): + result = g.transform(func, *args) # this is the *definition* of a transformation tm.assert_index_equal(result.index, obj.index) @@ -1309,7 +1119,7 @@ def test_transform_agg_by_name(request, reduction_func, frame_or_series): tm.assert_index_equal(result.columns, obj.columns) # verify that values were broadcasted across each group - assert len(set(DataFrame(result).iloc[-3:, -1])) == 1 + assert len(set(DataFrame(result).iloc[-4:, -1])) == 1 def test_transform_lambda_with_datetimetz(): @@ -1353,7 +1163,7 @@ def func(grp): # Check that the fastpath raises, see _transform_general obj = gb._obj_with_exclusions - gen = gb._grouper.get_iterator(obj, axis=gb.axis) + gen = gb._grouper.get_iterator(obj) fast_path, slow_path = gb._define_paths(func) _, group = next(gen) @@ -1427,9 +1237,9 @@ def test_categorical_and_not_categorical_key(observed): tm.assert_frame_equal(result, expected_explicit) # Series case - result = df_with_categorical.groupby(["A", "C"], observed=observed)["B"].transform( - "sum" - ) + gb = df_with_categorical.groupby(["A", "C"], observed=observed) + gbp = gb["B"] + result = gbp.transform("sum") expected = 
df_without_categorical.groupby(["A", "C"])["B"].transform("sum") tm.assert_series_equal(result, expected) expected_explicit = Series([4, 2, 4], name="B") @@ -1527,7 +1337,7 @@ def test_null_group_str_reducer(request, dropna, reduction_func): tm.assert_equal(result, expected) -def test_null_group_str_transformer(request, dropna, transformation_func): +def test_null_group_str_transformer(dropna, transformation_func): # GH 17093 df = DataFrame({"A": [1, 1, np.nan], "B": [1, 2, 2]}, index=[1, 2, 3]) args = get_groupby_method_args(transformation_func, df) @@ -1552,21 +1362,7 @@ def test_null_group_str_transformer(request, dropna, transformation_func): # ngroup/cumcount always returns a Series as it counts the groups, not values expected = expected["B"].rename(None) - if transformation_func == "pct_change" and not dropna: - warn = FutureWarning - msg = ( - "The default fill_method='ffill' in DataFrameGroupBy.pct_change " - "is deprecated" - ) - elif transformation_func == "fillna": - warn = FutureWarning - msg = "DataFrameGroupBy.fillna is deprecated" - else: - warn = None - msg = "" - with tm.assert_produces_warning(warn, match=msg): - result = gb.transform(transformation_func, *args) - + result = gb.transform(transformation_func, *args) tm.assert_equal(result, expected) @@ -1631,11 +1427,7 @@ def test_null_group_str_transformer_series(dropna, transformation_func): dtype = object if transformation_func in ("any", "all") else None buffer.append(Series([np.nan], index=[3], dtype=dtype)) expected = concat(buffer) - - warn = FutureWarning if transformation_func == "fillna" else None - msg = "SeriesGroupBy.fillna is deprecated" - with tm.assert_produces_warning(warn, match=msg): - result = gb.transform(transformation_func, *args) + result = gb.transform(transformation_func, *args) tm.assert_equal(result, expected) @@ -1677,8 +1469,12 @@ def test_as_index_no_change(keys, df, groupby_func): args = get_groupby_method_args(groupby_func, df) gb_as_index_true = df.groupby(keys, as_index=True) gb_as_index_false = df.groupby(keys, as_index=False) - warn = FutureWarning if groupby_func == "fillna" else None - msg = "DataFrameGroupBy.fillna is deprecated" + if groupby_func == "corrwith": + warn = FutureWarning + msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + msg = "" with tm.assert_produces_warning(warn, match=msg): result = gb_as_index_true.transform(groupby_func, *args) with tm.assert_produces_warning(warn, match=msg): @@ -1692,11 +1488,109 @@ def test_idxmin_idxmax_transform_args(how, skipna, numeric_only): # GH#55268 - ensure *args are passed through when calling transform df = DataFrame({"a": [1, 1, 1, 2], "b": [3.0, 4.0, np.nan, 6.0], "c": list("abcd")}) gb = df.groupby("a") - msg = f"'axis' keyword in DataFrameGroupBy.{how} is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = gb.transform(how, 0, skipna, numeric_only) - warn = None if skipna else FutureWarning - msg = f"The behavior of DataFrameGroupBy.{how} with .* any-NA and skipna=False" - with tm.assert_produces_warning(warn, match=msg): + if skipna: + result = gb.transform(how, skipna, numeric_only) expected = gb.transform(how, skipna=skipna, numeric_only=numeric_only) + tm.assert_frame_equal(result, expected) + else: + msg = f"DataFrameGroupBy.{how} with skipna=False encountered an NA value" + with pytest.raises(ValueError, match=msg): + gb.transform(how, skipna, numeric_only) + + +def test_transform_sum_one_column_no_matching_labels(): + df = DataFrame({"X": [1.0]}) + series = 
Series(["Y"]) + result = df.groupby(series, as_index=False).transform("sum") + expected = DataFrame({"X": [1.0]}) tm.assert_frame_equal(result, expected) + + +def test_transform_sum_no_matching_labels(): + df = DataFrame({"X": [1.0, -93204, 4935]}) + series = Series(["A", "B", "C"]) + + result = df.groupby(series, as_index=False).transform("sum") + expected = DataFrame({"X": [1.0, -93204, 4935]}) + tm.assert_frame_equal(result, expected) + + +def test_transform_sum_one_column_with_matching_labels(): + df = DataFrame({"X": [1.0, -93204, 4935]}) + series = Series(["A", "B", "A"]) + + result = df.groupby(series, as_index=False).transform("sum") + expected = DataFrame({"X": [4936.0, -93204, 4936.0]}) + tm.assert_frame_equal(result, expected) + + +def test_transform_sum_one_column_with_missing_labels(): + df = DataFrame({"X": [1.0, -93204, 4935]}) + series = Series(["A", "C"]) + + result = df.groupby(series, as_index=False).transform("sum") + expected = DataFrame({"X": [1.0, -93204, np.nan]}) + tm.assert_frame_equal(result, expected) + + +def test_transform_sum_one_column_with_matching_labels_and_missing_labels(): + df = DataFrame({"X": [1.0, -93204, 4935]}) + series = Series(["A", "A"]) + + result = df.groupby(series, as_index=False).transform("sum") + expected = DataFrame({"X": [-93203.0, -93203.0, np.nan]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["int32", "float32"]) +def test_min_one_unobserved_category_no_type_coercion(dtype): + # GH#58084 + df = DataFrame({"A": Categorical([1, 1, 2], categories=[1, 2, 3]), "B": [3, 4, 5]}) + df["B"] = df["B"].astype(dtype) + gb = df.groupby("A", observed=False) + result = gb.transform("min") + + expected = DataFrame({"B": [3, 3, 5]}, dtype=dtype) + tm.assert_frame_equal(expected, result) + + +def test_min_all_empty_data_no_type_coercion(): + # GH#58084 + df = DataFrame( + { + "X": Categorical( + [], + categories=[1, "randomcat", 100], + ), + "Y": [], + } + ) + df["Y"] = df["Y"].astype("int32") + + gb = df.groupby("X", observed=False) + result = gb.transform("min") + + expected = DataFrame({"Y": []}, dtype="int32") + tm.assert_frame_equal(expected, result) + + +def test_min_one_dim_no_type_coercion(): + # GH#58084 + df = DataFrame({"Y": [9435, -5465765, 5055, 0, 954960]}) + df["Y"] = df["Y"].astype("int32") + categories = Categorical([1, 2, 2, 5, 1], categories=[1, 2, 3, 4, 5]) + + gb = df.groupby(categories, observed=False) + result = gb.transform("min") + + expected = DataFrame({"Y": [9435, -5465765, -5465765, 0, 9435]}, dtype="int32") + tm.assert_frame_equal(expected, result) + + +def test_nan_in_cumsum_group_label(): + # GH#58811 + df = DataFrame({"A": [1, None], "B": [2, 3]}, dtype="Int16") + gb = df.groupby("A")["B"] + result = gb.cumsum() + expected = Series([2, None], dtype="Int16", name="B") + tm.assert_series_equal(expected, result) diff --git a/pandas/tests/indexes/base_class/test_constructors.py b/pandas/tests/indexes/base_class/test_constructors.py index fd5176a28565e..0896b97e8a40e 100644 --- a/pandas/tests/indexes/base_class/test_constructors.py +++ b/pandas/tests/indexes/base_class/test_constructors.py @@ -47,9 +47,7 @@ def test_construct_empty_tuples(self, tuple_list): def test_index_string_inference(self): # GH#54430 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" - expected = Index(["a", "b"], dtype=dtype) + expected = Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)) with pd.option_context("future.infer_string", True): ser = Index(["a", "b"]) tm.assert_index_equal(ser, 
expected) @@ -59,15 +57,15 @@ def test_index_string_inference(self): ser = Index(["a", 1]) tm.assert_index_equal(ser, expected) - def test_inference_on_pandas_objects(self): + @pytest.mark.parametrize("klass", [Series, Index]) + def test_inference_on_pandas_objects(self, klass): # GH#56012 - idx = Index([pd.Timestamp("2019-12-31")], dtype=object) - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - result = Index(idx) - assert result.dtype != np.object_ + obj = klass([pd.Timestamp("2019-12-31")], dtype=object) + result = Index(obj) + assert result.dtype == np.object_ - ser = Series([pd.Timestamp("2019-12-31")], dtype=object) - - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - result = Index(ser) - assert result.dtype != np.object_ + def test_constructor_not_read_only(self): + # GH#57130 + ser = Series([1, 2], dtype=object) + idx = Index(ser) + assert idx._values.flags.writeable diff --git a/pandas/tests/indexes/base_class/test_formats.py b/pandas/tests/indexes/base_class/test_formats.py index f30b578cfcf56..260b4203a4f04 100644 --- a/pandas/tests/indexes/base_class/test_formats.py +++ b/pandas/tests/indexes/base_class/test_formats.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas._config.config as cf from pandas import Index @@ -16,7 +16,7 @@ def test_repr_is_valid_construction_code(self): res = eval(repr(idx)) tm.assert_index_equal(res, idx) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr different") + @pytest.mark.xfail(using_string_dtype(), reason="repr different") @pytest.mark.parametrize( "index,expected", [ @@ -81,7 +81,7 @@ def test_string_index_repr(self, index, expected): result = repr(index) assert result == expected - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr different") + @pytest.mark.xfail(using_string_dtype(), reason="repr different") @pytest.mark.parametrize( "index,expected", [ @@ -144,20 +144,6 @@ def test_summary_bug(self): def test_index_repr_bool_nan(self): # GH32146 arr = Index([True, False, np.nan], dtype=object) - msg = "Index.format is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - exp1 = arr.format() - out1 = ["True", "False", "NaN"] - assert out1 == exp1 - exp2 = repr(arr) out2 = "Index([True, False, nan], dtype='object')" assert out2 == exp2 - - def test_format_different_scalar_lengths(self): - # GH#35439 - idx = Index(["aaaaaaaaa", "b"]) - expected = ["aaaaaaaaa", "b"] - msg = r"Index\.format is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - assert idx.format() == expected diff --git a/pandas/tests/indexes/base_class/test_reshape.py b/pandas/tests/indexes/base_class/test_reshape.py index 814a6a516904b..d4932712de1bb 100644 --- a/pandas/tests/indexes/base_class/test_reshape.py +++ b/pandas/tests/indexes/base_class/test_reshape.py @@ -1,6 +1,7 @@ """ Tests for ndarray-like method on the base Index class """ + import numpy as np import pytest @@ -33,7 +34,7 @@ def test_insert(self): # test empty null_index = Index([]) - tm.assert_index_equal(Index(["a"], dtype=object), null_index.insert(0, "a")) + tm.assert_index_equal(Index(["a"]), null_index.insert(0, "a")) def test_insert_missing(self, nulls_fixture, using_infer_string): # GH#22295 @@ -56,12 +57,11 @@ def test_insert_datetime_into_object(self, loc, val): tm.assert_index_equal(result, expected) assert type(expected[2]) is type(val) - def 
test_insert_none_into_string_numpy(self): + def test_insert_none_into_string_numpy(self, string_dtype_no_object): # GH#55365 - pytest.importorskip("pyarrow") - index = Index(["a", "b", "c"], dtype="string[pyarrow_numpy]") + index = Index(["a", "b", "c"], dtype=string_dtype_no_object) result = index.insert(-1, None) - expected = Index(["a", "b", None, "c"], dtype="string[pyarrow_numpy]") + expected = Index(["a", "b", None, "c"], dtype=string_dtype_no_object) tm.assert_index_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/base_class/test_setops.py b/pandas/tests/indexes/base_class/test_setops.py index 3ef3f3ad4d3a2..d57df82b2358c 100644 --- a/pandas/tests/indexes/base_class/test_setops.py +++ b/pandas/tests/indexes/base_class/test_setops.py @@ -84,13 +84,13 @@ def test_union_sort_other_incomparable(self): # https://github.com/pandas-dev/pandas/issues/24959 idx = Index([1, pd.Timestamp("2000")]) # default (sort=None) - with tm.assert_produces_warning(RuntimeWarning): + with tm.assert_produces_warning(RuntimeWarning, match="not supported between"): result = idx.union(idx[:1]) tm.assert_index_equal(result, idx) # sort=None - with tm.assert_produces_warning(RuntimeWarning): + with tm.assert_produces_warning(RuntimeWarning, match="not supported between"): result = idx.union(idx[:1], sort=None) tm.assert_index_equal(result, idx) @@ -149,13 +149,13 @@ def test_intersection_str_dates(self, sort): @pytest.mark.parametrize( "index2,expected_arr", - [(Index(["B", "D"]), ["B"]), (Index(["B", "D", "A"]), ["A", "B"])], + [(["B", "D"], ["B"]), (["B", "D", "A"], ["A", "B"])], ) def test_intersection_non_monotonic_non_unique(self, index2, expected_arr, sort): # non-monotonic non-unique index1 = Index(["A", "B", "A", "C"]) expected = Index(expected_arr) - result = index1.intersection(index2, sort=sort) + result = index1.intersection(Index(index2), sort=sort) if sort is None: expected = expected.sort_values() tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index 03a298a13dc2b..d9c9fdc62b0bc 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import index as libindex from pandas._libs.arrays import NDArrayBacked @@ -21,6 +21,9 @@ class TestCategoricalIndex: @pytest.fixture def simple_index(self) -> CategoricalIndex: + """ + Fixture that provides a CategoricalIndex. 
+ """ return CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=False) def test_can_hold_identifiers(self): @@ -196,7 +199,7 @@ def test_unique(self, data, categories, expected_data, ordered): expected = CategoricalIndex(expected_data, dtype=dtype) tm.assert_index_equal(idx.unique(), expected) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr doesn't roundtrip") + @pytest.mark.xfail(using_string_dtype(), reason="repr doesn't roundtrip") def test_repr_roundtrip(self): ci = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True) str(ci) @@ -392,3 +395,10 @@ def test_remove_maintains_order(self): ["a", "b", np.nan, "d", "d", "a"], categories=list("dba"), ordered=True ), ) + + +def test_contains_rangeindex_categories_no_engine(): + ci = CategoricalIndex(range(3)) + assert 2 in ci + assert 5 not in ci + assert "_engine" not in ci._cache diff --git a/pandas/tests/indexes/categorical/test_formats.py b/pandas/tests/indexes/categorical/test_formats.py index 522ca1bc2afde..b1361b3e8106e 100644 --- a/pandas/tests/indexes/categorical/test_formats.py +++ b/pandas/tests/indexes/categorical/test_formats.py @@ -1,25 +1,17 @@ """ Tests for CategoricalIndex.__repr__ and related methods. """ + import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas._config.config as cf from pandas import CategoricalIndex -import pandas._testing as tm class TestCategoricalIndexRepr: - def test_format_different_scalar_lengths(self): - # GH#35439 - idx = CategoricalIndex(["aaaaaaaaa", "b"]) - expected = ["aaaaaaaaa", "b"] - msg = r"CategoricalIndex\.format is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - assert idx.format() == expected - - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr different") + @pytest.mark.xfail(using_string_dtype(), reason="repr different") def test_string_categorical_index_repr(self): # short idx = CategoricalIndex(["a", "bb", "ccc"]) diff --git a/pandas/tests/indexes/categorical/test_indexing.py b/pandas/tests/indexes/categorical/test_indexing.py index 49eb79da616e7..25232075a07d9 100644 --- a/pandas/tests/indexes/categorical/test_indexing.py +++ b/pandas/tests/indexes/categorical/test_indexing.py @@ -64,8 +64,7 @@ def test_take_fill_value(self): tm.assert_categorical_equal(result.values, expected.values) msg = ( - "When allow_fill=True and fill_value is not None, " - "all indices must be >= -1" + "When allow_fill=True and fill_value is not None, all indices must be >= -1" ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) @@ -103,8 +102,7 @@ def test_take_fill_value_datetime(self): tm.assert_index_equal(result, expected) msg = ( - "When allow_fill=True and fill_value is not None, " - "all indices must be >= -1" + "When allow_fill=True and fill_value is not None, all indices must be >= -1" ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) diff --git a/pandas/tests/indexes/categorical/test_setops.py b/pandas/tests/indexes/categorical/test_setops.py new file mode 100644 index 0000000000000..2e87b90efd54c --- /dev/null +++ b/pandas/tests/indexes/categorical/test_setops.py @@ -0,0 +1,18 @@ +import numpy as np +import pytest + +from pandas import ( + CategoricalIndex, + Index, +) +import pandas._testing as tm + + +@pytest.mark.parametrize("na_value", [None, np.nan]) +def test_difference_with_na(na_value): + # GH 57318 + ci = CategoricalIndex(["a", "b", "c", None]) + 
other = Index(["c", na_value]) + result = ci.difference(other) + expected = CategoricalIndex(["a", "b"], categories=["a", "b", "c"]) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/conftest.py b/pandas/tests/indexes/conftest.py index bfb7acdcf4812..61f6be000cee8 100644 --- a/pandas/tests/indexes/conftest.py +++ b/pandas/tests/indexes/conftest.py @@ -15,8 +15,7 @@ def sort(request): Caution: Don't confuse this one with the "sort" fixture used - for DataFrame.append or concat. That one has - parameters [True, False]. + for concat. That one has parameters [True, False]. We can't combine them as sort=True is not permitted in the Index setops methods. diff --git a/pandas/tests/indexes/datetimelike_/test_drop_duplicates.py b/pandas/tests/indexes/datetimelike_/test_drop_duplicates.py index 61a79c4ceabf9..c2d76c0bcc8bd 100644 --- a/pandas/tests/indexes/datetimelike_/test_drop_duplicates.py +++ b/pandas/tests/indexes/datetimelike_/test_drop_duplicates.py @@ -70,20 +70,32 @@ def test_drop_duplicates(self, keep, expected, index, idx): class TestDropDuplicatesPeriodIndex(DropDuplicates): @pytest.fixture(params=["D", "3D", "h", "2h", "min", "2min", "s", "3s"]) def freq(self, request): + """ + Fixture to test for different frequencies for PeriodIndex. + """ return request.param @pytest.fixture def idx(self, freq): + """ + Fixture to get PeriodIndex for 10 periods for different frequencies. + """ return period_range("2011-01-01", periods=10, freq=freq, name="idx") class TestDropDuplicatesDatetimeIndex(DropDuplicates): @pytest.fixture def idx(self, freq_sample): + """ + Fixture to get DatetimeIndex for 10 periods for different frequencies. + """ return date_range("2011-01-01", freq=freq_sample, periods=10, name="idx") class TestDropDuplicatesTimedeltaIndex(DropDuplicates): @pytest.fixture def idx(self, freq_sample): + """ + Fixture to get TimedeltaIndex for 10 periods for different frequencies. 
+ """ return timedelta_range("1 day", periods=10, freq=freq_sample, name="idx") diff --git a/pandas/tests/indexes/datetimelike_/test_equals.py b/pandas/tests/indexes/datetimelike_/test_equals.py index fc9fbd33d0d28..df9182b2dd1c2 100644 --- a/pandas/tests/indexes/datetimelike_/test_equals.py +++ b/pandas/tests/indexes/datetimelike_/test_equals.py @@ -1,6 +1,7 @@ """ Tests shared for DatetimeIndex/TimedeltaIndex/PeriodIndex """ + from datetime import ( datetime, timedelta, @@ -51,6 +52,7 @@ def test_not_equals_misc_strs(self, index): class TestPeriodIndexEquals(EqualsTests): @pytest.fixture def index(self): + """Fixture for creating a PeriodIndex for use in equality tests.""" return period_range("2013-01-01", periods=5, freq="D") # TODO: de-duplicate with other test_equals2 methods @@ -90,6 +92,7 @@ def test_equals2(self, freq): class TestDatetimeIndexEquals(EqualsTests): @pytest.fixture def index(self): + """Fixture for creating a DatetimeIndex for use in equality tests.""" return date_range("2013-01-01", periods=5) def test_equals2(self): @@ -142,6 +145,7 @@ def test_not_equals_bday(self, freq): class TestTimedeltaIndexEquals(EqualsTests): @pytest.fixture def index(self): + """Fixture for creating a TimedeltaIndex for use in equality tests.""" return timedelta_range("1 day", periods=10) def test_equals2(self): diff --git a/pandas/tests/indexes/datetimelike_/test_indexing.py b/pandas/tests/indexes/datetimelike_/test_indexing.py index ee7128601256a..7b2c81aaf17de 100644 --- a/pandas/tests/indexes/datetimelike_/test_indexing.py +++ b/pandas/tests/indexes/datetimelike_/test_indexing.py @@ -19,7 +19,7 @@ @pytest.mark.parametrize("ldtype", dtlike_dtypes) @pytest.mark.parametrize("rdtype", dtlike_dtypes) def test_get_indexer_non_unique_wrong_dtype(ldtype, rdtype): - vals = np.tile(3600 * 10**9 * np.arange(3), 2) + vals = np.tile(3600 * 10**9 * np.arange(3, dtype=np.int64), 2) def construct(dtype): if dtype is dtlike_dtypes[-1]: diff --git a/pandas/tests/indexes/datetimelike_/test_nat.py b/pandas/tests/indexes/datetimelike_/test_nat.py index 50cf29d016355..3dd0ce1cbd637 100644 --- a/pandas/tests/indexes/datetimelike_/test_nat.py +++ b/pandas/tests/indexes/datetimelike_/test_nat.py @@ -10,44 +10,33 @@ import pandas._testing as tm -class NATests: - def test_nat(self, index_without_na): - empty_index = index_without_na[:0] - - index_with_na = index_without_na.copy(deep=True) - index_with_na._data[1] = NaT - - assert empty_index._na_value is NaT - assert index_with_na._na_value is NaT - assert index_without_na._na_value is NaT - - idx = index_without_na - assert idx._can_hold_na - - tm.assert_numpy_array_equal(idx._isnan, np.array([False, False])) - assert idx.hasnans is False - - idx = index_with_na - assert idx._can_hold_na - - tm.assert_numpy_array_equal(idx._isnan, np.array([False, True])) - assert idx.hasnans is True +@pytest.mark.parametrize( + "index_without_na", + [ + TimedeltaIndex(["1 days", "2 days"]), + PeriodIndex(["2011-01-01", "2011-01-02"], freq="D"), + DatetimeIndex(["2011-01-01", "2011-01-02"]), + DatetimeIndex(["2011-01-01", "2011-01-02"], tz="UTC"), + ], +) +def test_nat(index_without_na): + empty_index = index_without_na[:0] + index_with_na = index_without_na.copy(deep=True) + index_with_na._data[1] = NaT -class TestDatetimeIndexNA(NATests): - @pytest.fixture - def index_without_na(self, tz_naive_fixture): - tz = tz_naive_fixture - return DatetimeIndex(["2011-01-01", "2011-01-02"], tz=tz) + assert empty_index._na_value is NaT + assert index_with_na._na_value is NaT + assert 
index_without_na._na_value is NaT + idx = index_without_na + assert idx._can_hold_na -class TestTimedeltaIndexNA(NATests): - @pytest.fixture - def index_without_na(self): - return TimedeltaIndex(["1 days", "2 days"]) + tm.assert_numpy_array_equal(idx._isnan, np.array([False, False])) + assert idx.hasnans is False + idx = index_with_na + assert idx._can_hold_na -class TestPeriodIndexNA(NATests): - @pytest.fixture - def index_without_na(self): - return PeriodIndex(["2011-01-01", "2011-01-02"], freq="D") + tm.assert_numpy_array_equal(idx._isnan, np.array([False, True])) + assert idx.hasnans is True diff --git a/pandas/tests/indexes/datetimelike_/test_sort_values.py b/pandas/tests/indexes/datetimelike_/test_sort_values.py index a2c349c8b0ef6..bfef0faebeebf 100644 --- a/pandas/tests/indexes/datetimelike_/test_sort_values.py +++ b/pandas/tests/indexes/datetimelike_/test_sort_values.py @@ -185,10 +185,6 @@ def test_sort_values_without_freq_timedeltaindex(self): @pytest.mark.parametrize( "index_dates,expected_dates", [ - ( - ["2011-01-01", "2011-01-03", "2011-01-05", "2011-01-02", "2011-01-01"], - ["2011-01-01", "2011-01-01", "2011-01-02", "2011-01-03", "2011-01-05"], - ), ( ["2011-01-01", "2011-01-03", "2011-01-05", "2011-01-02", "2011-01-01"], ["2011-01-01", "2011-01-01", "2011-01-02", "2011-01-03", "2011-01-05"], diff --git a/pandas/tests/indexes/datetimes/methods/test_astype.py b/pandas/tests/indexes/datetimes/methods/test_astype.py index c0bc6601769b1..62be8903da206 100644 --- a/pandas/tests/indexes/datetimes/methods/test_astype.py +++ b/pandas/tests/indexes/datetimes/methods/test_astype.py @@ -3,7 +3,6 @@ import dateutil import numpy as np import pytest -import pytz import pandas as pd from pandas import ( @@ -102,13 +101,16 @@ def test_astype_tznaive_to_tzaware(self): # dt64->dt64tz deprecated idx._data.astype("datetime64[ns, US/Eastern]") - def test_astype_str_nat(self): + def test_astype_str_nat(self, using_infer_string): # GH 13149, GH 13209 # verify that we are returning NaT as a string (and not unicode) idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.nan]) result = idx.astype(str) - expected = Index(["2016-05-16", "NaT", "NaT", "NaT"], dtype=object) + if using_infer_string: + expected = Index(["2016-05-16", None, None, None], dtype="str") + else: + expected = Index(["2016-05-16", "NaT", "NaT", "NaT"], dtype=object) tm.assert_index_equal(result, expected) def test_astype_str(self): @@ -118,7 +120,7 @@ def test_astype_str(self): expected = Index( ["2012-01-01", "2012-01-02", "2012-01-03", "2012-01-04"], name="test_name", - dtype=object, + dtype="str", ) tm.assert_index_equal(result, expected) @@ -133,7 +135,7 @@ def test_astype_str_tz_and_name(self): "2012-01-03 00:00:00-05:00", ], name="test_name", - dtype=object, + dtype="str", ) tm.assert_index_equal(result, expected) @@ -144,7 +146,7 @@ def test_astype_str_freq_and_name(self): expected = Index( ["2011-01-01 00:00:00", "2011-01-01 01:00:00", "2011-01-01 02:00:00"], name="test_name", - dtype=object, + dtype="str", ) tm.assert_index_equal(result, expected) @@ -156,7 +158,7 @@ def test_astype_str_freq_and_tz(self): result = dti.astype(str) expected = Index( ["2012-03-06 00:00:00+00:00", "2012-03-06 01:00:00+00:00"], - dtype=object, + dtype="str", name="test_name", ) tm.assert_index_equal(result, expected) @@ -251,6 +253,8 @@ def _check_rng(rng): _check_rng(rng_utc) def test_index_convert_to_datetime_array_explicit_pytz(self): + pytz = pytest.importorskip("pytz") + def _check_rng(rng): converted = rng.to_pydatetime() assert 
isinstance(converted, np.ndarray) diff --git a/pandas/tests/indexes/datetimes/methods/test_insert.py b/pandas/tests/indexes/datetimes/methods/test_insert.py index ebfe490e0e067..4a5b7bcc1a86f 100644 --- a/pandas/tests/indexes/datetimes/methods/test_insert.py +++ b/pandas/tests/indexes/datetimes/methods/test_insert.py @@ -1,8 +1,8 @@ from datetime import datetime +import zoneinfo import numpy as np import pytest -import pytz from pandas import ( NA, @@ -133,49 +133,59 @@ def test_insert3(self, unit): assert result.name == expected.name assert result.freq is None - def test_insert4(self, unit): - for tz in ["US/Pacific", "Asia/Singapore"]: - idx = date_range( - "1/1/2000 09:00", periods=6, freq="h", tz=tz, name="idx", unit=unit - ) - # preserve freq - expected = date_range( - "1/1/2000 09:00", periods=7, freq="h", tz=tz, name="idx", unit=unit - ) - for d in [ - Timestamp("2000-01-01 15:00", tz=tz), - pytz.timezone(tz).localize(datetime(2000, 1, 1, 15)), - ]: - result = idx.insert(6, d) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert result.freq == expected.freq - assert result.tz == expected.tz - - expected = DatetimeIndex( - [ - "2000-01-01 09:00", - "2000-01-01 10:00", - "2000-01-01 11:00", - "2000-01-01 12:00", - "2000-01-01 13:00", - "2000-01-01 14:00", - "2000-01-01 10:00", - ], - name="idx", - tz=tz, - freq=None, - ).as_unit(unit) - # reset freq to None - for d in [ - Timestamp("2000-01-01 10:00", tz=tz), - pytz.timezone(tz).localize(datetime(2000, 1, 1, 10)), - ]: - result = idx.insert(6, d) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert result.tz == expected.tz - assert result.freq is None + @pytest.mark.parametrize("tz", ["US/Pacific", "Asia/Singapore"]) + @pytest.mark.parametrize( + "to_ts", + [lambda x: x, lambda x: x.to_pydatetime()], + ids=["Timestamp", "datetime"], + ) + def test_insert4(self, unit, tz, to_ts): + idx = date_range( + "1/1/2000 09:00", periods=6, freq="h", tz=tz, name="idx", unit=unit + ) + # preserve freq + expected = date_range( + "1/1/2000 09:00", periods=7, freq="h", tz=tz, name="idx", unit=unit + ) + tz = zoneinfo.ZoneInfo(tz) + d = to_ts(Timestamp("2000-01-01 15:00", tz=tz)) + result = idx.insert(6, d) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + assert result.tz == expected.tz + + @pytest.mark.parametrize("tz", ["US/Pacific", "Asia/Singapore"]) + @pytest.mark.parametrize( + "to_ts", + [lambda x: x, lambda x: x.to_pydatetime()], + ids=["Timestamp", "datetime"], + ) + def test_insert4_no_freq(self, unit, tz, to_ts): + idx = date_range( + "1/1/2000 09:00", periods=6, freq="h", tz=tz, name="idx", unit=unit + ) + expected = DatetimeIndex( + [ + "2000-01-01 09:00", + "2000-01-01 10:00", + "2000-01-01 11:00", + "2000-01-01 12:00", + "2000-01-01 13:00", + "2000-01-01 14:00", + "2000-01-01 10:00", + ], + name="idx", + tz=tz, + freq=None, + ).as_unit(unit) + # reset freq to None + d = to_ts(Timestamp("2000-01-01 10:00", tz=tz)) + result = idx.insert(6, d) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.tz == expected.tz + assert result.freq is None # TODO: also changes DataFrame.__setitem__ with expansion def test_insert_mismatched_tzawareness(self): @@ -214,7 +224,7 @@ def test_insert_mismatched_tz(self): assert expected.dtype == idx.dtype tm.assert_index_equal(result, expected) - item = datetime(2000, 1, 4, tzinfo=pytz.timezone("US/Eastern")) + item = 
datetime(2000, 1, 4, tzinfo=zoneinfo.ZoneInfo("US/Eastern")) result = idx.insert(3, item) expected = Index( list(idx[:3]) + [item.astimezone(idx.tzinfo)] + list(idx[3:]), diff --git a/pandas/tests/indexes/datetimes/methods/test_normalize.py b/pandas/tests/indexes/datetimes/methods/test_normalize.py index 74711f67e6446..0ce28d60b53b9 100644 --- a/pandas/tests/indexes/datetimes/methods/test_normalize.py +++ b/pandas/tests/indexes/datetimes/methods/test_normalize.py @@ -2,6 +2,7 @@ import numpy as np import pytest +from pandas.compat import WASM import pandas.util._test_decorators as td from pandas import ( @@ -70,6 +71,9 @@ def test_normalize_tz(self): assert not rng.is_normalized @td.skip_if_windows + @pytest.mark.skipif( + WASM, reason="tzset is available only on Unix-like systems, not WASM" + ) @pytest.mark.parametrize( "timezone", [ diff --git a/pandas/tests/indexes/datetimes/methods/test_resolution.py b/pandas/tests/indexes/datetimes/methods/test_resolution.py index 8399fafbbaff2..42c3ab0617b7c 100644 --- a/pandas/tests/indexes/datetimes/methods/test_resolution.py +++ b/pandas/tests/indexes/datetimes/methods/test_resolution.py @@ -1,7 +1,10 @@ from dateutil.tz import tzlocal import pytest -from pandas.compat import IS64 +from pandas.compat import ( + IS64, + WASM, +) from pandas import date_range @@ -20,9 +23,10 @@ ("us", "microsecond"), ], ) +@pytest.mark.skipif(WASM, reason="OverflowError received on WASM") def test_dti_resolution(request, tz_naive_fixture, freq, expected): tz = tz_naive_fixture - if freq == "YE" and not IS64 and isinstance(tz, tzlocal): + if freq == "YE" and ((not IS64) or WASM) and isinstance(tz, tzlocal): request.applymarker( pytest.mark.xfail(reason="OverflowError inside tzlocal past 2038") ) diff --git a/pandas/tests/indexes/datetimes/methods/test_round.py b/pandas/tests/indexes/datetimes/methods/test_round.py index cde4a3a65804d..b023542ba0a4c 100644 --- a/pandas/tests/indexes/datetimes/methods/test_round.py +++ b/pandas/tests/indexes/datetimes/methods/test_round.py @@ -216,6 +216,6 @@ def test_round_int64(self, start, index_freq, periods, round_freq): assert (mod == 0).all(), f"round not a {round_freq} multiple" assert (diff <= unit // 2).all(), "round error" if unit % 2 == 0: - assert ( - result.asi8[diff == unit // 2] % 2 == 0 - ).all(), "round half to even error" + assert (result.asi8[diff == unit // 2] % 2 == 0).all(), ( + "round half to even error" + ) diff --git a/pandas/tests/indexes/datetimes/methods/test_shift.py b/pandas/tests/indexes/datetimes/methods/test_shift.py index d8bdcc2a17685..a202627550cd2 100644 --- a/pandas/tests/indexes/datetimes/methods/test_shift.py +++ b/pandas/tests/indexes/datetimes/methods/test_shift.py @@ -1,7 +1,7 @@ from datetime import datetime +import zoneinfo import pytest -import pytz from pandas.errors import NullFrequencyError @@ -13,8 +13,6 @@ ) import pandas._testing as tm -START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) - class TestDatetimeIndexShift: # ------------------------------------------------------------- @@ -122,24 +120,28 @@ def test_dti_shift_across_dst(self, unit): ) def test_dti_shift_near_midnight(self, shift, result_time, unit): # GH 8616 - dt = datetime(2014, 11, 14, 0) - dt_est = pytz.timezone("EST").localize(dt) + tz = zoneinfo.ZoneInfo("US/Eastern") + dt_est = datetime(2014, 11, 14, 0, tzinfo=tz) idx = DatetimeIndex([dt_est]).as_unit(unit) ser = Series(data=[1], index=idx) result = ser.shift(shift, freq="h") - exp_index = DatetimeIndex([result_time], tz="EST").as_unit(unit) + exp_index = 
DatetimeIndex([result_time], tz=tz).as_unit(unit) expected = Series(1, index=exp_index) tm.assert_series_equal(result, expected) def test_shift_periods(self, unit): # GH#22458 : argument 'n' was deprecated in favor of 'periods' - idx = date_range(start=START, end=END, periods=3, unit=unit) + idx = date_range( + start=datetime(2009, 1, 1), end=datetime(2010, 1, 1), periods=3, unit=unit + ) tm.assert_index_equal(idx.shift(periods=0), idx) tm.assert_index_equal(idx.shift(0), idx) @pytest.mark.parametrize("freq", ["B", "C"]) def test_shift_bday(self, freq, unit): - rng = date_range(START, END, freq=freq, unit=unit) + rng = date_range( + datetime(2009, 1, 1), datetime(2010, 1, 1), freq=freq, unit=unit + ) shifted = rng.shift(5) assert shifted[0] == rng[5] assert shifted.freq == rng.freq @@ -152,13 +154,23 @@ def test_shift_bday(self, freq, unit): assert shifted[0] == rng[0] assert shifted.freq == rng.freq - def test_shift_bmonth(self, unit): - rng = date_range(START, END, freq=pd.offsets.BMonthEnd(), unit=unit) + def test_shift_bmonth(self, performance_warning, unit): + rng = date_range( + datetime(2009, 1, 1), + datetime(2010, 1, 1), + freq=pd.offsets.BMonthEnd(), + unit=unit, + ) shifted = rng.shift(1, freq=pd.offsets.BDay()) assert shifted[0] == rng[0] + pd.offsets.BDay() - rng = date_range(START, END, freq=pd.offsets.BMonthEnd(), unit=unit) - with tm.assert_produces_warning(pd.errors.PerformanceWarning): + rng = date_range( + datetime(2009, 1, 1), + datetime(2010, 1, 1), + freq=pd.offsets.BMonthEnd(), + unit=unit, + ) + with tm.assert_produces_warning(performance_warning): shifted = rng.shift(1, freq=pd.offsets.CDay()) assert shifted[0] == rng[0] + pd.offsets.CDay() diff --git a/pandas/tests/indexes/datetimes/methods/test_snap.py b/pandas/tests/indexes/datetimes/methods/test_snap.py index 7064e9e7993f8..a3c06ac6257cf 100644 --- a/pandas/tests/indexes/datetimes/methods/test_snap.py +++ b/pandas/tests/indexes/datetimes/methods/test_snap.py @@ -7,9 +7,10 @@ import pandas._testing as tm +@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") +@pytest.mark.filterwarnings("ignore:Period with BDay freq:FutureWarning") @pytest.mark.parametrize("tz", [None, "Asia/Shanghai", "Europe/Berlin"]) @pytest.mark.parametrize("name", [None, "my_dti"]) -@pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) def test_dti_snap(name, tz, unit): dti = DatetimeIndex( [ @@ -28,7 +29,9 @@ def test_dti_snap(name, tz, unit): dti = dti.as_unit(unit) result = dti.snap(freq="W-MON") - expected = date_range("12/31/2001", "1/7/2002", name=name, tz=tz, freq="w-mon") + msg = "'w-mon' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = date_range("12/31/2001", "1/7/2002", name=name, tz=tz, freq="w-mon") expected = expected.repeat([3, 4]) expected = expected.as_unit(unit) tm.assert_index_equal(result, expected) @@ -38,7 +41,9 @@ def test_dti_snap(name, tz, unit): result = dti.snap(freq="B") - expected = date_range("1/1/2002", "1/7/2002", name=name, tz=tz, freq="b") + msg = "'b' is deprecated and will be removed in a future version." 
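# --- Illustrative sketch (editor's addition, not part of the diff) ---
# tm.assert_produces_warning, used throughout these tests, is a context
# manager that fails unless a warning of the given category is emitted
# with a message matching the `match` regex; passing None as the
# category instead asserts that no warning is emitted at all.
import warnings

import pandas._testing as tm

with tm.assert_produces_warning(FutureWarning, match="deprecated"):
    warnings.warn("'b' is deprecated and will be removed", FutureWarning)
# ---------------------------------------------------------------------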
+ with tm.assert_produces_warning(FutureWarning, match=msg): + expected = date_range("1/1/2002", "1/7/2002", name=name, tz=tz, freq="b") expected = expected.repeat([1, 1, 1, 2, 2]) expected = expected.as_unit(unit) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/methods/test_to_period.py b/pandas/tests/indexes/datetimes/methods/test_to_period.py index 42a3f3b0f7b42..cd4a142dd5b30 100644 --- a/pandas/tests/indexes/datetimes/methods/test_to_period.py +++ b/pandas/tests/indexes/datetimes/methods/test_to_period.py @@ -1,7 +1,8 @@ +from datetime import timezone + import dateutil.tz from dateutil.tz import tzlocal import pytest -import pytz from pandas._libs.tslibs.ccalendar import MONTHS from pandas._libs.tslibs.offsets import MonthEnd @@ -90,43 +91,14 @@ def test_dti_to_period_2monthish(self, freq_offset, freq_period): tm.assert_index_equal(pi, period_range("2020-01", "2020-05", freq=freq_period)) @pytest.mark.parametrize( - "freq, freq_depr", - [ - ("2ME", "2M"), - ("2QE", "2Q"), - ("2QE-SEP", "2Q-SEP"), - ("1YE", "1Y"), - ("2YE-MAR", "2Y-MAR"), - ("1YE", "1A"), - ("2YE-MAR", "2A-MAR"), - ], - ) - def test_to_period_frequency_M_Q_Y_A_deprecated(self, freq, freq_depr): - # GH#9586 - msg = f"'{freq_depr[1:]}' is deprecated and will be removed " - f"in a future version, please use '{freq[1:]}' instead." - - rng = date_range("01-Jan-2012", periods=8, freq=freq) - prng = rng.to_period() - with tm.assert_produces_warning(FutureWarning, match=msg): - assert prng.freq == freq_depr - - @pytest.mark.parametrize( - "freq, freq_depr", - [ - ("2BQE-SEP", "2BQ-SEP"), - ("2BYE-MAR", "2BY-MAR"), - ], + "freq", ["2ME", "1me", "2QE", "2QE-SEP", "1YE", "ye", "2YE-MAR"] ) - def test_to_period_frequency_BQ_BY_deprecated(self, freq, freq_depr): - # GH#9586 - msg = f"'{freq_depr[1:]}' is deprecated and will be removed " - f"in a future version, please use '{freq[1:]}' instead." 
+ def test_to_period_frequency_M_Q_Y_raises(self, freq): + msg = f"Invalid frequency: {freq}" - rng = date_range("01-Jan-2012", periods=8, freq=freq) - prng = rng.to_period() - with tm.assert_produces_warning(FutureWarning, match=msg): - prng.freq == freq_depr + rng = date_range("01-Jan-2012", periods=8, freq="ME") + with pytest.raises(ValueError, match=msg): + rng.to_period(freq) def test_to_period_infer(self): # https://github.com/pandas-dev/pandas/issues/33358 @@ -136,10 +108,10 @@ def test_to_period_infer(self): freq="5min", ) - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="drop timezone info"): pi1 = rng.to_period("5min") - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="drop timezone info"): pi2 = rng.to_period() tm.assert_index_equal(pi1, pi2) @@ -162,8 +134,7 @@ def test_to_period_millisecond(self): ] ) - with tm.assert_produces_warning(UserWarning): - # warning that timezone info will be lost + with tm.assert_produces_warning(UserWarning, match="drop timezone info"): period = index.to_period(freq="ms") assert 2 == len(period) assert period[0] == Period("2007-01-01 10:11:12.123Z", "ms") @@ -177,8 +148,7 @@ def test_to_period_microsecond(self): ] ) - with tm.assert_produces_warning(UserWarning): - # warning that timezone info will be lost + with tm.assert_produces_warning(UserWarning, match="drop timezone info"): period = index.to_period(freq="us") assert 2 == len(period) assert period[0] == Period("2007-01-01 10:11:12.123456Z", "us") @@ -186,15 +156,18 @@ def test_to_period_microsecond(self): @pytest.mark.parametrize( "tz", - ["US/Eastern", pytz.utc, tzlocal(), "dateutil/US/Eastern", dateutil.tz.tzutc()], + [ + "US/Eastern", + timezone.utc, + tzlocal(), + "dateutil/US/Eastern", + dateutil.tz.tzutc(), + ], ) def test_to_period_tz(self, tz): ts = date_range("1/1/2000", "2/1/2000", tz=tz) - with tm.assert_produces_warning(UserWarning): - # GH#21333 warning that timezone info will be lost - # filter warning about freq deprecation - + with tm.assert_produces_warning(UserWarning, match="drop timezone info"): result = ts.to_period()[0] expected = ts[0].to_period(ts.freq) @@ -202,8 +175,7 @@ def test_to_period_tz(self, tz): expected = date_range("1/1/2000", "2/1/2000").to_period() - with tm.assert_produces_warning(UserWarning): - # GH#21333 warning that timezone info will be lost + with tm.assert_produces_warning(UserWarning, match="drop timezone info"): result = ts.to_period(ts.freq) tm.assert_index_equal(result, expected) @@ -212,7 +184,7 @@ def test_to_period_tz(self, tz): def test_to_period_tz_utc_offset_consistency(self, tz): # GH#22905 ts = date_range("1/1/2000", "2/1/2000", tz="Etc/GMT-1") - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="drop timezone info"): result = ts.to_period()[0] expected = ts[0].to_period(ts.freq) assert result == expected @@ -233,10 +205,16 @@ def test_to_period_nofreq(self): assert idx.freqstr is None tm.assert_index_equal(idx.to_period(), expected) - @pytest.mark.parametrize("freq", ["2BMS", "1SME-15"]) + @pytest.mark.parametrize("freq", ["2BME", "SME-15", "2BMS"]) def test_to_period_offsets_not_supported(self, freq): # GH#56243 - msg = f"{freq[1:]} is not supported as period frequency" + msg = "|".join( + [ + f"Invalid frequency: {freq}", + f"{freq} is not supported as period frequency", + ] + ) + ts = date_range("1/1/2012", periods=4, freq=freq) - with pytest.raises(TypeError, match=msg): + with 
pytest.raises(ValueError, match=msg): ts.to_period() diff --git a/pandas/tests/indexes/datetimes/methods/test_to_series.py b/pandas/tests/indexes/datetimes/methods/test_to_series.py index 0c397c8ab2cd3..cd67775b7a5fc 100644 --- a/pandas/tests/indexes/datetimes/methods/test_to_series.py +++ b/pandas/tests/indexes/datetimes/methods/test_to_series.py @@ -13,6 +13,6 @@ def test_to_series(self): idx = naive.tz_localize("US/Pacific") expected = Series(np.array(idx.tolist(), dtype="object"), name="B") - result = idx.to_series(index=[0, 1]) + result = idx.to_series(index=range(2)) assert expected.dtype == idx.dtype tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/methods/test_tz_convert.py b/pandas/tests/indexes/datetimes/methods/test_tz_convert.py index b2cf488ac8313..9eabb742b93a4 100644 --- a/pandas/tests/indexes/datetimes/methods/test_tz_convert.py +++ b/pandas/tests/indexes/datetimes/methods/test_tz_convert.py @@ -4,7 +4,6 @@ from dateutil.tz import gettz import numpy as np import pytest -import pytz from pandas._libs.tslibs import timezones @@ -260,11 +259,14 @@ def test_dti_tz_convert_tzlocal(self): [ "US/Eastern", "dateutil/US/Eastern", - pytz.timezone("US/Eastern"), + "pytz/US/Eastern", gettz("US/Eastern"), ], ) def test_dti_tz_convert_utc_to_local_no_modify(self, tz): + if isinstance(tz, str) and tz.startswith("pytz/"): + pytz = pytest.importorskip("pytz") + tz = pytz.timezone(tz.removeprefix("pytz/")) rng = date_range("3/11/2012", "3/12/2012", freq="h", tz="utc") rng_eastern = rng.tz_convert(tz) diff --git a/pandas/tests/indexes/datetimes/methods/test_tz_localize.py b/pandas/tests/indexes/datetimes/methods/test_tz_localize.py index ad7769c6b9671..78a79ac7d1546 100644 --- a/pandas/tests/indexes/datetimes/methods/test_tz_localize.py +++ b/pandas/tests/indexes/datetimes/methods/test_tz_localize.py @@ -1,13 +1,14 @@ from datetime import ( datetime, timedelta, + timezone, ) +from zoneinfo import ZoneInfo import dateutil.tz from dateutil.tz import gettz import numpy as np import pytest -import pytz from pandas import ( DatetimeIndex, @@ -19,22 +20,13 @@ ) import pandas._testing as tm -try: - from zoneinfo import ZoneInfo -except ImportError: - # Cannot assign to a type [misc] - ZoneInfo = None # type: ignore[misc, assignment] - -easts = [pytz.timezone("US/Eastern"), gettz("US/Eastern")] -if ZoneInfo is not None: - try: - tz = ZoneInfo("US/Eastern") - except KeyError: - # no tzdata - pass - else: - easts.append(tz) +@pytest.fixture(params=["pytz/US/Eastern", gettz("US/Eastern"), ZoneInfo("US/Eastern")]) +def tz(request): + if isinstance(request.param, str) and request.param.startswith("pytz/"): + pytz = pytest.importorskip("pytz") + return pytz.timezone(request.param.removeprefix("pytz/")) + return request.param class TestTZLocalize: @@ -76,10 +68,10 @@ def test_dti_tz_localize_nonexistent_raise_coerce(self): times = ["2015-03-08 01:00", "2015-03-08 02:00", "2015-03-08 03:00"] index = DatetimeIndex(times) tz = "US/Eastern" - with pytest.raises(pytz.NonExistentTimeError, match="|".join(times)): + with pytest.raises(ValueError, match="|".join(times)): index.tz_localize(tz=tz) - with pytest.raises(pytz.NonExistentTimeError, match="|".join(times)): + with pytest.raises(ValueError, match="|".join(times)): index.tz_localize(tz=tz, nonexistent="raise") result = index.tz_localize(tz=tz, nonexistent="NaT") @@ -88,15 +80,13 @@ def test_dti_tz_localize_nonexistent_raise_coerce(self): expected = dti.tz_convert("US/Eastern") tm.assert_index_equal(result, expected) - 
@pytest.mark.parametrize("tz", easts) def test_dti_tz_localize_ambiguous_infer(self, tz): # November 6, 2011, fall back, repeat 2 AM hour # With no repeated hours, we cannot infer the transition dr = date_range(datetime(2011, 11, 6, 0), periods=5, freq=offsets.Hour()) - with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"): + with pytest.raises(ValueError, match="Cannot infer dst time"): dr.tz_localize(tz) - @pytest.mark.parametrize("tz", easts) def test_dti_tz_localize_ambiguous_infer2(self, tz, unit): # With repeated hours, we can infer the transition dr = date_range( @@ -116,7 +106,6 @@ def test_dti_tz_localize_ambiguous_infer2(self, tz, unit): result2 = DatetimeIndex(times, tz=tz, ambiguous="infer").as_unit(unit) tm.assert_index_equal(result2, expected) - @pytest.mark.parametrize("tz", easts) def test_dti_tz_localize_ambiguous_infer3(self, tz): # When there is no dst transition, nothing special happens dr = date_range(datetime(2011, 6, 1, 0), periods=10, freq=offsets.Hour()) @@ -124,11 +113,10 @@ def test_dti_tz_localize_ambiguous_infer3(self, tz): localized_infer = dr.tz_localize(tz, ambiguous="infer") tm.assert_index_equal(localized, localized_infer) - @pytest.mark.parametrize("tz", easts) def test_dti_tz_localize_ambiguous_times(self, tz): # March 13, 2011, spring forward, skip from 2 AM to 3 AM dr = date_range(datetime(2011, 3, 13, 1, 30), periods=3, freq=offsets.Hour()) - with pytest.raises(pytz.NonExistentTimeError, match="2011-03-13 02:30:00"): + with pytest.raises(ValueError, match="2011-03-13 02:30:00"): dr.tz_localize(tz) # after dst transition, it works @@ -138,12 +126,12 @@ def test_dti_tz_localize_ambiguous_times(self, tz): # November 6, 2011, fall back, repeat 2 AM hour dr = date_range(datetime(2011, 11, 6, 1, 30), periods=3, freq=offsets.Hour()) - with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"): + with pytest.raises(ValueError, match="Cannot infer dst time"): dr.tz_localize(tz) # UTC is OK dr = date_range( - datetime(2011, 3, 13), periods=48, freq=offsets.Minute(30), tz=pytz.utc + datetime(2011, 3, 13), periods=48, freq=offsets.Minute(30), tz=timezone.utc ) @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) @@ -174,22 +162,13 @@ def test_dti_tz_localize(self, prefix): tm.assert_numpy_array_equal(dti3.values, dti_utc.values) dti = date_range(start="11/6/2011 1:59", end="11/6/2011 2:00", freq="ms") - with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"): + with pytest.raises(ValueError, match="Cannot infer dst time"): dti.tz_localize(tzstr) dti = date_range(start="3/13/2011 1:59", end="3/13/2011 2:00", freq="ms") - with pytest.raises(pytz.NonExistentTimeError, match="2011-03-13 02:00:00"): + with pytest.raises(ValueError, match="2011-03-13 02:00:00"): dti.tz_localize(tzstr) - @pytest.mark.parametrize( - "tz", - [ - "US/Eastern", - "dateutil/US/Eastern", - pytz.timezone("US/Eastern"), - gettz("US/Eastern"), - ], - ) def test_dti_tz_localize_utc_conversion(self, tz): # Localizing to time zone should: # 1) check for DST ambiguities @@ -204,7 +183,7 @@ def test_dti_tz_localize_utc_conversion(self, tz): # DST ambiguity, this should fail rng = date_range("3/11/2012", "3/12/2012", freq="30min") # Is this really how it should fail?? 
- with pytest.raises(pytz.NonExistentTimeError, match="2012-03-11 02:00:00"): + with pytest.raises(ValueError, match="2012-03-11 02:00:00"): rng.tz_localize(tz) def test_dti_tz_localize_roundtrip(self, tz_aware_fixture): @@ -245,7 +224,6 @@ def test_dti_tz_localize_tzlocal(self): dti2 = dti.tz_localize(None) tm.assert_numpy_array_equal(dti2.asi8 - offset, dti.asi8) - @pytest.mark.parametrize("tz", easts) def test_dti_tz_localize_ambiguous_nat(self, tz): times = [ "11/06/2011 00:00", @@ -270,7 +248,6 @@ def test_dti_tz_localize_ambiguous_nat(self, tz): # right is datetime64[ns, tzfile('/usr/share/zoneinfo/US/Eastern')] tm.assert_numpy_array_equal(di_test.values, localized.values) - @pytest.mark.parametrize("tz", easts) def test_dti_tz_localize_ambiguous_flags(self, tz, unit): # November 6, 2011, fall back, repeat 2 AM hour @@ -321,8 +298,7 @@ def test_dti_tz_localize_ambiguous_flags(self, tz, unit): dr = dr.append(dr) tm.assert_index_equal(dr, localized) - @pytest.mark.parametrize("tz", easts) - def test_dti_tz_localize_ambiguous_flags2(self, tz, unit): + def test_dti_tz_localize_ambiguous_flags2(self, tz): # When there is no dst transition, nothing special happens dr = date_range(datetime(2011, 6, 1, 0), periods=10, freq=offsets.Hour()) is_dst = np.array([1] * 10) @@ -332,8 +308,8 @@ def test_dti_tz_localize_ambiguous_flags2(self, tz, unit): def test_dti_tz_localize_bdate_range(self): dr = bdate_range("1/1/2009", "1/1/2010") - dr_utc = bdate_range("1/1/2009", "1/1/2010", tz=pytz.utc) - localized = dr.tz_localize(pytz.utc) + dr_utc = bdate_range("1/1/2009", "1/1/2010", tz=timezone.utc) + localized = dr.tz_localize(timezone.utc) tm.assert_index_equal(dr_utc, localized) @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/datetimes/test_arithmetic.py b/pandas/tests/indexes/datetimes/test_arithmetic.py index 3a7c418b27de6..bac849301d1f7 100644 --- a/pandas/tests/indexes/datetimes/test_arithmetic.py +++ b/pandas/tests/indexes/datetimes/test_arithmetic.py @@ -1,5 +1,5 @@ # Arithmetic tests specific to DatetimeIndex are generally about `freq` -# rentention or inference. Other arithmetic tests belong in +# retention or inference. 
Other arithmetic tests belong in # tests/arithmetic/test_datetime64.py import pytest diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index 2abbcf6688833..c418b2a18008b 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -7,16 +7,15 @@ ) from functools import partial from operator import attrgetter +import zoneinfo import dateutil import dateutil.tz from dateutil.tz import gettz import numpy as np import pytest -import pytz from pandas._libs.tslibs import ( - OutOfBoundsDatetime, astype_overflowsafe, timezones, ) @@ -35,18 +34,6 @@ class TestDatetimeIndex: - def test_closed_deprecated(self): - # GH#52628 - msg = "The 'closed' keyword" - with tm.assert_produces_warning(FutureWarning, match=msg): - DatetimeIndex([], closed=True) - - def test_normalize_deprecated(self): - # GH#52628 - msg = "The 'normalize' keyword" - with tm.assert_produces_warning(FutureWarning, match=msg): - DatetimeIndex([], normalize=True) - def test_from_dt64_unsupported_unit(self): # GH#49292 val = np.datetime64(1, "D") @@ -135,7 +122,7 @@ def test_dti_with_period_data_raises(self): to_datetime(period_array(data)) def test_dti_with_timedelta64_data_raises(self): - # GH#23675 deprecated, enforrced in GH#29794 + # GH#23675 deprecated, enforced in GH#29794 data = np.array([0], dtype="m8[ns]") msg = r"timedelta64\[ns\] cannot be converted to datetime64" with pytest.raises(TypeError, match=msg): @@ -165,7 +152,9 @@ def test_construction_caching(self): df = pd.DataFrame( { "dt": date_range("20130101", periods=3), - "dttz": date_range("20130101", periods=3, tz="US/Eastern"), + "dttz": date_range( + "20130101", periods=3, tz=zoneinfo.ZoneInfo("US/Eastern") + ), "dt_with_null": [ Timestamp("20130101"), pd.NaT, @@ -174,7 +163,7 @@ def test_construction_caching(self): "dtns": date_range("20130101", periods=3, freq="ns"), } ) - assert df.dttz.dtype.tz.zone == "US/Eastern" + assert df.dttz.dtype.tz.key == "US/Eastern" @pytest.mark.parametrize( "kwargs", @@ -211,7 +200,11 @@ def test_construction_with_alt_tz_localize(self, kwargs, tz_aware_fixture): # incompat tz/dtype msg = "cannot supply both a tz and a dtype with a tz" with pytest.raises(ValueError, match=msg): - DatetimeIndex(i.tz_localize(None).asi8, dtype=i.dtype, tz="US/Pacific") + DatetimeIndex( + i.tz_localize(None).asi8, + dtype=i.dtype, + tz=zoneinfo.ZoneInfo("US/Hawaii"), + ) def test_construction_index_with_mixed_timezones(self): # gh-11488: no tz results in DatetimeIndex @@ -296,11 +289,9 @@ def test_construction_index_with_mixed_timezones(self): tm.assert_index_equal(result, exp, exact=True) assert not isinstance(result, DatetimeIndex) - msg = "DatetimeIndex has mixed timezones" - msg_depr = "parsing datetimes with mixed time zones will raise an error" - with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=msg_depr): - DatetimeIndex(["2013-11-02 22:00-05:00", "2013-11-03 22:00-06:00"]) + msg = "Mixed timezones detected. 
Pass utc=True in to_datetime" + with pytest.raises(ValueError, match=msg): + DatetimeIndex(["2013-11-02 22:00-05:00", "2013-11-03 22:00-06:00"]) # length = 1 result = Index([Timestamp("2011-01-01")], name="idx") @@ -533,7 +524,7 @@ def test_construction_dti_with_mixed_timezones(self): Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), Timestamp("2011-01-02 10:00", tz="US/Eastern"), ], - dtype="M8[ns, US/Eastern]", + dtype="M8[s, US/Eastern]", name="idx", ) tm.assert_index_equal(dti, expected) @@ -555,31 +546,25 @@ def test_construction_outofbounds(self): datetime(5000, 1, 1), datetime(6000, 1, 1), ] - exp = Index(dates, dtype=object) - # coerces to object - tm.assert_index_equal(Index(dates), exp) + exp = Index(dates, dtype="M8[us]") + res = Index(dates) + tm.assert_index_equal(res, exp) - msg = "^Out of bounds nanosecond timestamp: 3000-01-01 00:00:00, at position 0$" - with pytest.raises(OutOfBoundsDatetime, match=msg): - # can't create DatetimeIndex - DatetimeIndex(dates) + DatetimeIndex(dates) @pytest.mark.parametrize("data", [["1400-01-01"], [datetime(1400, 1, 1)]]) def test_dti_date_out_of_range(self, data): # GH#1475 - msg = ( - "^Out of bounds nanosecond timestamp: " - "1400-01-01( 00:00:00)?, at position 0$" - ) - with pytest.raises(OutOfBoundsDatetime, match=msg): - DatetimeIndex(data) + DatetimeIndex(data) def test_construction_with_ndarray(self): # GH 5152 dates = [datetime(2013, 10, 7), datetime(2013, 10, 8), datetime(2013, 10, 9)] data = DatetimeIndex(dates, freq=offsets.BDay()).values result = DatetimeIndex(data, freq=offsets.BDay()) - expected = DatetimeIndex(["2013-10-07", "2013-10-08", "2013-10-09"], freq="B") + expected = DatetimeIndex( + ["2013-10-07", "2013-10-08", "2013-10-09"], dtype="M8[us]", freq="B" + ) tm.assert_index_equal(result, expected) def test_integer_values_and_tz_interpreted_as_utc(self): @@ -617,7 +602,7 @@ def test_constructor_coverage(self): expected = DatetimeIndex(strings.astype("O")) tm.assert_index_equal(result, expected) - from_ints = DatetimeIndex(expected.asi8) + from_ints = DatetimeIndex(expected.as_unit("ns").asi8).as_unit("s") tm.assert_index_equal(from_ints, expected) # string with NaT @@ -626,7 +611,7 @@ def test_constructor_coverage(self): expected = DatetimeIndex(strings.astype("O")) tm.assert_index_equal(result, expected) - from_ints = DatetimeIndex(expected.asi8) + from_ints = DatetimeIndex(expected.as_unit("ns").asi8).as_unit("s") tm.assert_index_equal(from_ints, expected) # non-conforming @@ -757,14 +742,14 @@ def test_disallow_setting_tz(self): dti = DatetimeIndex(["2010"], tz="UTC") msg = "Cannot directly set timezone" with pytest.raises(AttributeError, match=msg): - dti.tz = pytz.timezone("US/Pacific") + dti.tz = zoneinfo.ZoneInfo("US/Pacific") @pytest.mark.parametrize( "tz", [ None, "America/Los_Angeles", - pytz.timezone("America/Los_Angeles"), + zoneinfo.ZoneInfo("America/Los_Angeles"), Timestamp("2000", tz="America/Los_Angeles").tz, ], ) @@ -779,13 +764,15 @@ def test_constructor_start_end_with_tz(self, tz): freq="D", ) tm.assert_index_equal(result, expected) - # Especially assert that the timezone is consistent for pytz - assert pytz.timezone("America/Los_Angeles") is result.tz + # Especially assert that the timezone is consistent for zoneinfo + assert zoneinfo.ZoneInfo("America/Los_Angeles") is result.tz @pytest.mark.parametrize("tz", ["US/Pacific", "US/Eastern", "Asia/Tokyo"]) def test_constructor_with_non_normalized_pytz(self, tz): # GH 18595 - non_norm_tz = Timestamp("2010", tz=tz).tz + pytz = pytest.importorskip("pytz") + 
tz_in = pytz.timezone(tz) + non_norm_tz = Timestamp("2010", tz=tz_in).tz result = DatetimeIndex(["2010"], tz=non_norm_tz) assert pytz.timezone(tz) is result.tz @@ -795,8 +782,10 @@ def test_constructor_timestamp_near_dst(self): Timestamp("2016-10-30 03:00:00+0300", tz="Europe/Helsinki"), Timestamp("2016-10-30 03:00:00+0200", tz="Europe/Helsinki"), ] - result = DatetimeIndex(ts) - expected = DatetimeIndex([ts[0].to_pydatetime(), ts[1].to_pydatetime()]) + result = DatetimeIndex(ts).as_unit("ns") + expected = DatetimeIndex( + [ts[0].to_pydatetime(), ts[1].to_pydatetime()] + ).as_unit("ns") tm.assert_index_equal(result, expected) @pytest.mark.parametrize("klass", [Index, DatetimeIndex]) @@ -839,7 +828,7 @@ def test_construction_from_replaced_timestamps_with_dst(self): "2005-06-01 00:00:00", ], tz="Australia/Melbourne", - ) + ).as_unit("ns") tm.assert_index_equal(result, expected) def test_construction_with_tz_and_tz_aware_dti(self): @@ -851,8 +840,8 @@ def test_construction_with_tz_and_tz_aware_dti(self): def test_construction_with_nat_and_tzlocal(self): tz = dateutil.tz.tzlocal() - result = DatetimeIndex(["2018", "NaT"], tz=tz) - expected = DatetimeIndex([Timestamp("2018", tz=tz), pd.NaT]) + result = DatetimeIndex(["2018", "NaT"], tz=tz).as_unit("ns") + expected = DatetimeIndex([Timestamp("2018", tz=tz), pd.NaT]).as_unit("ns") tm.assert_index_equal(result, expected) def test_constructor_with_ambiguous_keyword_arg(self): @@ -895,7 +884,7 @@ def test_constructor_with_nonexistent_keyword_arg(self, warsaw): Timestamp("2015-03-29 03:00:00+02:00", tz=timezone), Timestamp("2015-03-29 04:00:00+02:00", tz=timezone), ] - ) + ).as_unit("ns") tm.assert_index_equal(result, expected) @@ -907,7 +896,7 @@ def test_constructor_with_nonexistent_keyword_arg(self, warsaw): Timestamp("2015-03-29 01:00:00+01:00", tz=timezone), Timestamp("2015-03-29 03:00:00+02:00", tz=timezone), ] - ) + ).as_unit("ns") tm.assert_index_equal(result, expected) @@ -933,7 +922,9 @@ def test_index_constructor_with_numpy_object_array_and_timestamp_tz_with_nan(sel expected = DatetimeIndex([Timestamp("2019", tz="UTC"), pd.NaT]) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("tz", [pytz.timezone("US/Eastern"), gettz("US/Eastern")]) + @pytest.mark.parametrize( + "tz", [zoneinfo.ZoneInfo("US/Eastern"), gettz("US/Eastern")] + ) def test_dti_from_tzaware_datetime(self, tz): d = [datetime(2012, 8, 19, tzinfo=tz)] @@ -948,13 +939,16 @@ def test_dti_tz_constructors(self, tzstr): arr = ["11/10/2005 08:00:00", "11/10/2005 09:00:00"] idx1 = to_datetime(arr).tz_localize(tzstr) - idx2 = date_range(start="2005-11-10 08:00:00", freq="h", periods=2, tz=tzstr) + idx2 = date_range( + start="2005-11-10 08:00:00", freq="h", periods=2, tz=tzstr, unit="s" + ) idx2 = idx2._with_freq(None) # the others all have freq=None - idx3 = DatetimeIndex(arr, tz=tzstr) - idx4 = DatetimeIndex(np.array(arr), tz=tzstr) + idx3 = DatetimeIndex(arr, tz=tzstr).as_unit("s") + idx4 = DatetimeIndex(np.array(arr), tz=tzstr).as_unit("s") - for other in [idx2, idx3, idx4]: - tm.assert_index_equal(idx1, other) + tm.assert_index_equal(idx1, idx2) + tm.assert_index_equal(idx1, idx3) + tm.assert_index_equal(idx1, idx4) def test_dti_construction_idempotent(self, unit): rng = date_range( @@ -979,7 +973,7 @@ def test_dti_convert_datetime_list(self, tzstr): @pytest.mark.parametrize( "tz", [ - pytz.timezone("US/Eastern"), + "pytz/US/Eastern", gettz("US/Eastern"), ], ) @@ -988,6 +982,9 @@ def test_dti_convert_datetime_list(self, tzstr): def 
test_dti_ambiguous_matches_timestamp(self, tz, use_str, box_cls, request): # GH#47471 check that we get the same raising behavior in the DTI # constructor and Timestamp constructor + if isinstance(tz, str) and tz.startswith("pytz/"): + pytz = pytest.importorskip("pytz") + tz = pytz.timezone(tz.removeprefix("pytz/")) dtstr = "2013-11-03 01:59:59.999999" item = dtstr if not use_str: @@ -1003,7 +1000,7 @@ def test_dti_ambiguous_matches_timestamp(self, tz, use_str, box_cls, request): mark = pytest.mark.xfail(reason="We implicitly get fold=0.") request.applymarker(mark) - with pytest.raises(pytz.AmbiguousTimeError, match=dtstr): + with pytest.raises(ValueError, match=dtstr): box_cls(item, tz=tz) @pytest.mark.parametrize("tz", [None, "UTC", "US/Pacific"]) @@ -1033,24 +1030,29 @@ def test_dti_constructor_with_non_nano_dtype(self, tz): result2 = DatetimeIndex(np.array(vals, dtype=object), dtype=dtype) tm.assert_index_equal(result2, expected) - def test_dti_constructor_with_non_nano_now_today(self): + def test_dti_constructor_with_non_nano_now_today(self, request): # GH#55756 now = Timestamp.now() today = Timestamp.today() result = DatetimeIndex(["now", "today"], dtype="M8[s]") assert result.dtype == "M8[s]" - # result may not exactly match [now, today] so we'll test it up to a tolerance. - # (it *may* match exactly due to rounding) - tolerance = pd.Timedelta(microseconds=1) - diff0 = result[0] - now.as_unit("s") - assert diff0 >= pd.Timedelta(0) - assert diff0 < tolerance - diff1 = result[1] - today.as_unit("s") - assert diff1 >= pd.Timedelta(0) - assert diff1 < tolerance + diff0 = result[0] - now.as_unit("s") + diff1 = result[1] - today.as_unit("s") + assert diff1 >= pd.Timedelta(0), f"The difference is {diff1}" + assert diff0 >= pd.Timedelta(0), f"The difference is {diff0}" + + # result may not exactly match [now, today] so we'll test it up to a tolerance. 
+ # (it *may* match exactly due to rounding) + # GH 57535 + request.applymarker( + pytest.mark.xfail( + reason="result may not exactly match [now, today]", strict=False + ) + ) + tolerance = pd.Timedelta(seconds=1) + assert diff0 < tolerance, f"The difference is {diff0}" + assert diff1 < tolerance, f"The difference is {diff1}" def test_dti_constructor_object_float_matches_float_dtype(self): # GH#55780 @@ -1196,9 +1198,9 @@ def test_dti_constructor_object_dtype_dayfirst_yearfirst_with_tz(self): yfirst = Timestamp(2005, 10, 16, tz="US/Pacific") result1 = DatetimeIndex([val], tz="US/Pacific", dayfirst=True) - expected1 = DatetimeIndex([dfirst]) + expected1 = DatetimeIndex([dfirst]).as_unit("s") tm.assert_index_equal(result1, expected1) result2 = DatetimeIndex([val], tz="US/Pacific", yearfirst=True) - expected2 = DatetimeIndex([yfirst]) + expected2 = DatetimeIndex([yfirst]).as_unit("s") tm.assert_index_equal(result2, expected2) diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index 44dd64e162413..1ec936f830768 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -11,8 +11,6 @@ import numpy as np import pytest -import pytz -from pytz import timezone from pandas._libs.tslibs import timezones from pandas._libs.tslibs.offsets import ( @@ -97,6 +95,7 @@ def test_date_range_timestamp_equiv_dateutil(self): assert ts == stamp def test_date_range_timestamp_equiv_explicit_pytz(self): + pytz = pytest.importorskip("pytz") rng = date_range("20090415", "20090519", tz=pytz.timezone("US/Eastern")) stamp = rng[0] @@ -135,35 +134,21 @@ def test_date_range_name(self): assert idx.name == "TEST" def test_date_range_invalid_periods(self): - msg = "periods must be a number, got foo" + msg = "periods must be an integer, got foo" with pytest.raises(TypeError, match=msg): date_range(start="1/1/2000", periods="foo", freq="D") def test_date_range_fractional_period(self): - msg = "Non-integer 'periods' in pd.date_range, pd.timedelta_range" - with tm.assert_produces_warning(FutureWarning, match=msg): - rng = date_range("1/1/2000", periods=10.5) - exp = date_range("1/1/2000", periods=10) - tm.assert_index_equal(rng, exp) + msg = "periods must be an integer" + with pytest.raises(TypeError, match=msg): + date_range("1/1/2000", periods=10.5) - @pytest.mark.parametrize( - "freq,freq_depr", - [ - ("2ME", "2M"), - ("2SME", "2SM"), - ("2BQE", "2BQ"), - ("2BYE", "2BY"), - ], - ) - def test_date_range_frequency_M_SM_BQ_BY_deprecated(self, freq, freq_depr): - # GH#52064 - depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed " - f"in a future version, please use '{freq[1:]}' instead." 
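+ # GH#52064: the "M"/"SM"/"BQ"/"BY" aliases are removed rather than merely deprecated, + # so uppercase and lowercase spellings alike now raise "Invalid frequency".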
+ @pytest.mark.parametrize("freq", ["2M", "1m", "2SM", "2BQ", "1bq", "2BY"]) + def test_date_range_frequency_M_SM_BQ_BY_raises(self, freq): + msg = f"Invalid frequency: {freq}" - expected = date_range("1/1/2000", periods=4, freq=freq) - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - result = date_range("1/1/2000", periods=4, freq=freq_depr) - tm.assert_index_equal(result, expected) + with pytest.raises(ValueError, match=msg): + date_range("1/1/2000", periods=4, freq=freq) def test_date_range_tuple_freq_raises(self): # GH#34703 @@ -504,7 +489,8 @@ def test_range_bug(self, unit): def test_range_tz_pytz(self): # see gh-2906 - tz = timezone("US/Eastern") + pytz = pytest.importorskip("pytz") + tz = pytz.timezone("US/Eastern") start = tz.localize(datetime(2011, 1, 1)) end = tz.localize(datetime(2011, 1, 3)) @@ -526,23 +512,21 @@ def test_range_tz_pytz(self): @pytest.mark.parametrize( "start, end", [ - [ - Timestamp(datetime(2014, 3, 6), tz="US/Eastern"), - Timestamp(datetime(2014, 3, 12), tz="US/Eastern"), - ], - [ - Timestamp(datetime(2013, 11, 1), tz="US/Eastern"), - Timestamp(datetime(2013, 11, 6), tz="US/Eastern"), - ], + [datetime(2014, 3, 6), datetime(2014, 3, 12)], + [datetime(2013, 11, 1), datetime(2013, 11, 6)], ], ) def test_range_tz_dst_straddle_pytz(self, start, end): + pytz = pytest.importorskip("pytz") + tz = pytz.timezone("US/Eastern") + start = Timestamp(start, tz=tz) + end = Timestamp(end, tz=tz) dr = date_range(start, end, freq="D") assert dr[0] == start assert dr[-1] == end assert np.all(dr.hour == 0) - dr = date_range(start, end, freq="D", tz="US/Eastern") + dr = date_range(start, end, freq="D", tz=tz) assert dr[0] == start assert dr[-1] == end assert np.all(dr.hour == 0) @@ -551,7 +535,7 @@ def test_range_tz_dst_straddle_pytz(self, start, end): start.replace(tzinfo=None), end.replace(tzinfo=None), freq="D", - tz="US/Eastern", + tz=tz, ) assert dr[0] == start assert dr[-1] == end @@ -776,59 +760,57 @@ def test_freq_dateoffset_with_relateivedelta_nanos(self): ) tm.assert_index_equal(result, expected) + @pytest.mark.parametrize("freq", ["2T", "2L", "1l", "1U", "2N", "2n"]) + def test_frequency_H_T_S_L_U_N_raises(self, freq): + msg = f"Invalid frequency: {freq}" + with pytest.raises(ValueError, match=msg): + date_range("1/1/2000", periods=2, freq=freq) + @pytest.mark.parametrize( - "freq,freq_depr", - [ - ("h", "H"), - ("2min", "2T"), - ("1s", "1S"), - ("2ms", "2L"), - ("1us", "1U"), - ("2ns", "2N"), - ], + "freq_depr", ["m", "bm", "CBM", "SM", "BQ", "q-feb", "y-may", "Y-MAY"] ) - def test_frequencies_H_T_S_L_U_N_deprecated(self, freq, freq_depr): - # GH#52536 + def test_frequency_raises(self, freq_depr): + msg = f"Invalid frequency: {freq_depr}" + + with pytest.raises(ValueError, match=msg): + date_range("1/1/2000", periods=2, freq=freq_depr) + + def test_date_range_bday(self): + sdate = datetime(1999, 12, 25) + idx = date_range(start=sdate, freq="1B", periods=20) + assert len(idx) == 20 + assert idx[0] == sdate + 0 * offsets.BDay() + assert idx.freq == "B" + + @pytest.mark.parametrize("freq", ["200A", "2A-MAY"]) + def test_frequency_A_raises(self, freq): freq_msg = re.split("[0-9]*", freq, maxsplit=1)[1] - freq_depr_msg = re.split("[0-9]*", freq_depr, maxsplit=1)[1] - msg = ( - f"'{freq_depr_msg}' is deprecated and will be removed in a future version, " - ) - f"please use '{freq_msg}' instead" + msg = f"Invalid frequency: {freq_msg}" - expected = date_range("1/1/2000", periods=2, freq=freq) - with tm.assert_produces_warning(FutureWarning, match=msg): - 
result = date_range("1/1/2000", periods=2, freq=freq_depr) - tm.assert_index_equal(result, expected) + with pytest.raises(ValueError, match=msg): + date_range("1/1/2000", periods=2, freq=freq) @pytest.mark.parametrize( "freq,freq_depr", [ - ("200YE", "200A"), - ("YE", "Y"), - ("2YE-MAY", "2A-MAY"), - ("YE-MAY", "Y-MAY"), + ("2W", "2w"), + ("2W-WED", "2w-wed"), + ("2B", "2b"), + ("2D", "2d"), + ("2C", "2c"), ], ) - def test_frequencies_A_deprecated_Y_renamed(self, freq, freq_depr): - # GH#9586, GH#54275 - freq_msg = re.split("[0-9]*", freq, maxsplit=1)[1] - freq_depr_msg = re.split("[0-9]*", freq_depr, maxsplit=1)[1] - msg = f"'{freq_depr_msg}' is deprecated and will be removed " - f"in a future version, please use '{freq_msg}' instead." + def test_date_range_depr_lowercase_frequency(self, freq, freq_depr): + # GH#58998 + depr_msg = ( + f"'{freq_depr[1:]}' is deprecated and will be removed in a future version." + ) - expected = date_range("1/1/2000", periods=2, freq=freq) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = date_range("1/1/2000", periods=2, freq=freq_depr) + expected = date_range("1/1/2000", periods=4, freq=freq) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + result = date_range("1/1/2000", periods=4, freq=freq_depr) tm.assert_index_equal(result, expected) - def test_date_range_bday(self): - sdate = datetime(1999, 12, 25) - idx = date_range(start=sdate, freq="1B", periods=20) - assert len(idx) == 20 - assert idx[0] == sdate + 0 * offsets.BDay() - assert idx.freq == "B" - class TestDateRangeTZ: """Tests for date_range with timezones""" @@ -899,7 +881,7 @@ def test_date_range_ambiguous_endpoint(self, tz): # construction with an ambiguous end-point # GH#11626 - with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"): + with pytest.raises(ValueError, match="Cannot infer dst time"): date_range( "2013-10-26 23:00", "2013-10-27 01:00", tz="Europe/London", freq="h" ) @@ -923,7 +905,7 @@ def test_date_range_ambiguous_endpoint(self, tz): def test_date_range_nonexistent_endpoint(self, tz, option, expected): # construction with an nonexistent end-point - with pytest.raises(pytz.NonExistentTimeError, match="2019-03-10 02:00:00"): + with pytest.raises(ValueError, match="2019-03-10 02:00:00"): date_range( "2019-03-10 00:00", "2019-03-10 02:00", tz="US/Pacific", freq="h" ) @@ -1048,7 +1030,7 @@ def test_constructor(self): bdate_range(START, periods=20, freq=BDay()) bdate_range(end=START, periods=20, freq=BDay()) - msg = "periods must be a number, got B" + msg = "periods must be an integer, got B" with pytest.raises(TypeError, match=msg): date_range("2011-1-1", "2012-1-1", "B") @@ -1093,12 +1075,11 @@ def test_daterange_bug_456(self): result = rng1.union(rng2) assert isinstance(result, DatetimeIndex) - @pytest.mark.parametrize("inclusive", ["left", "right", "neither", "both"]) - def test_bdays_and_open_boundaries(self, inclusive): + def test_bdays_and_open_boundaries(self, inclusive_endpoints_fixture): # GH 6673 start = "2018-07-21" # Saturday end = "2018-07-29" # Sunday - result = date_range(start, end, freq="B", inclusive=inclusive) + result = date_range(start, end, freq="B", inclusive=inclusive_endpoints_fixture) bday_start = "2018-07-23" # Monday bday_end = "2018-07-27" # Friday @@ -1127,7 +1108,7 @@ def test_constructor(self): bdate_range(START, periods=20, freq=CDay()) bdate_range(end=START, periods=20, freq=CDay()) - msg = "periods must be a number, got C" + msg = "periods must be an integer, got C" with 
pytest.raises(TypeError, match=msg): date_range("2011-1-1", "2012-1-1", "C") @@ -1278,6 +1259,24 @@ def test_range_with_timezone_and_custombusinessday(self, start, period, expected expected = DatetimeIndex(expected).as_unit("ns") tm.assert_index_equal(result, expected) + def test_data_range_custombusinessday_partial_time(self, unit): + # GH#57456 + offset = offsets.CustomBusinessDay(weekmask="Sun Mon Tue") + start = datetime(2024, 2, 6, 23) + # end datetime is partial and not in the offset + end = datetime(2024, 2, 14, 14) + result = date_range(start, end, freq=offset, unit=unit) + expected = DatetimeIndex( + [ + "2024-02-06 23:00:00", + "2024-02-11 23:00:00", + "2024-02-12 23:00:00", + "2024-02-13 23:00:00", + ], + dtype=f"M8[{unit}]", + ) + tm.assert_index_equal(result, expected) + class TestDateRangeNonNano: def test_date_range_reso_validation(self): @@ -1708,3 +1707,33 @@ def test_date_range_freqstr_matches_offset(self, freqstr, offset): idx2 = date_range(start=sdate, end=edate, freq=offset) assert len(idx1) == len(idx2) assert idx1.freq == idx2.freq + + def test_date_range_partial_day_year_end(self, unit): + # GH#56134 + rng = date_range( + start="2021-12-31 00:00:01", + end="2023-10-31 00:00:00", + freq="YE", + unit=unit, + ) + exp = DatetimeIndex( + ["2021-12-31 00:00:01", "2022-12-31 00:00:01"], + dtype=f"M8[{unit}]", + freq="YE", + ) + tm.assert_index_equal(rng, exp) + + def test_date_range_negative_freq_year_end_inbounds(self, unit): + # GH#56147 + rng = date_range( + start="2023-10-31 00:00:00", + end="2021-10-31 00:00:00", + freq="-1YE", + unit=unit, + ) + exp = DatetimeIndex( + ["2022-12-31 00:00:00", "2021-12-31 00:00:00"], + dtype=f"M8[{unit}]", + freq="-1YE", + ) + tm.assert_index_equal(rng, exp) diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index f7fc64d4b0163..04334a1d8d0c8 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -1,6 +1,5 @@ import datetime as dt from datetime import date -import re import numpy as np import pytest @@ -134,83 +133,23 @@ def test_asarray_tz_aware(self): tm.assert_numpy_array_equal(result, expected) - def test_CBH_deprecated(self): - msg = "'CBH' is deprecated and will be removed in a future version." - - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = date_range( - dt.datetime(2022, 12, 11), dt.datetime(2022, 12, 13), freq="CBH" - ) - result = DatetimeIndex( - [ - "2022-12-12 09:00:00", - "2022-12-12 10:00:00", - "2022-12-12 11:00:00", - "2022-12-12 12:00:00", - "2022-12-12 13:00:00", - "2022-12-12 14:00:00", - "2022-12-12 15:00:00", - "2022-12-12 16:00:00", - ], - dtype="datetime64[ns]", - freq="cbh", - ) + @pytest.mark.parametrize("freq", ["2H", "2BH", "2S"]) + def test_CBH_raises(self, freq): + msg = f"Invalid frequency: {freq}" - tm.assert_index_equal(result, expected) - - @pytest.mark.parametrize( - "freq_depr, expected_values, expected_freq", - [ - ( - "AS-AUG", - ["2021-08-01", "2022-08-01", "2023-08-01"], - "YS-AUG", - ), - ( - "1BAS-MAY", - ["2021-05-03", "2022-05-02", "2023-05-01"], - "1BYS-MAY", - ), - ], - ) - def test_AS_BAS_deprecated(self, freq_depr, expected_values, expected_freq): - # GH#55479 - freq_msg = re.split("[0-9]*", freq_depr, maxsplit=1)[1] - msg = f"'{freq_msg}' is deprecated and will be removed in a future version." 
- - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = date_range( - dt.datetime(2020, 12, 1), dt.datetime(2023, 12, 1), freq=freq_depr - ) - result = DatetimeIndex( - expected_values, - dtype="datetime64[ns]", - freq=expected_freq, - ) + with pytest.raises(ValueError, match=msg): + date_range(dt.datetime(2022, 12, 11), dt.datetime(2022, 12, 13), freq=freq) - tm.assert_index_equal(result, expected) - - @pytest.mark.parametrize( - "freq, expected_values, freq_depr", - [ - ("2BYE-MAR", ["2016-03-31"], "2BA-MAR"), - ("2BYE-JUN", ["2016-06-30"], "2BY-JUN"), - ("2BME", ["2016-02-29", "2016-04-29", "2016-06-30"], "2BM"), - ("2BQE", ["2016-03-31"], "2BQ"), - ("1BQE-MAR", ["2016-03-31", "2016-06-30"], "1BQ-MAR"), - ], - ) - def test_BM_BQ_BY_deprecated(self, freq, expected_values, freq_depr): - # GH#52064 - msg = f"'{freq_depr[1:]}' is deprecated and will be removed " - f"in a future version, please use '{freq[1:]}' instead." - - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = date_range(start="2016-02-21", end="2016-08-21", freq=freq_depr) - result = DatetimeIndex( - data=expected_values, - dtype="datetime64[ns]", - freq=freq, - ) + @pytest.mark.parametrize("freq", ["2BM", "1bm", "2BQ", "1BQ-MAR", "2BY-JUN", "1by"]) + def test_BM_BQ_BY_raises(self, freq): + msg = f"Invalid frequency: {freq}" + + with pytest.raises(ValueError, match=msg): + date_range(start="2016-02-21", end="2016-08-21", freq=freq) + + @pytest.mark.parametrize("freq", ["2BA-MAR", "1BAS-MAY", "2AS-AUG"]) + def test_BA_BAS_raises(self, freq): + msg = f"Invalid frequency: {freq}" - tm.assert_index_equal(result, expected) + with pytest.raises(ValueError, match=msg): + date_range(start="2016-02-21", end="2016-08-21", freq=freq) diff --git a/pandas/tests/indexes/datetimes/test_formats.py b/pandas/tests/indexes/datetimes/test_formats.py index b52eed8c509c6..f4e0a63043335 100644 --- a/pandas/tests/indexes/datetimes/test_formats.py +++ b/pandas/tests/indexes/datetimes/test_formats.py @@ -1,9 +1,11 @@ -from datetime import datetime +from datetime import ( + datetime, + timezone, +) import dateutil.tz import numpy as np import pytest -import pytz import pandas as pd from pandas import ( @@ -14,11 +16,6 @@ import pandas._testing as tm -@pytest.fixture(params=["s", "ms", "us", "ns"]) -def unit(request): - return request.param - - def test_get_values_for_csv(): index = pd.date_range(freq="1D", periods=3, start="2017-01-01") @@ -208,12 +205,7 @@ def test_dti_representation_to_series(self, unit): exp3 = "0 2011-01-01\n1 2011-01-02\ndtype: datetime64[ns]" - exp4 = ( - "0 2011-01-01\n" - "1 2011-01-02\n" - "2 2011-01-03\n" - "dtype: datetime64[ns]" - ) + exp4 = "0 2011-01-01\n1 2011-01-02\n2 2011-01-03\ndtype: datetime64[ns]" exp5 = ( "0 2011-01-01 09:00:00+09:00\n" @@ -229,11 +221,7 @@ def test_dti_representation_to_series(self, unit): "dtype: datetime64[ns, US/Eastern]" ) - exp7 = ( - "0 2011-01-01 09:00:00\n" - "1 2011-01-02 10:15:00\n" - "dtype: datetime64[ns]" - ) + exp7 = "0 2011-01-01 09:00:00\n1 2011-01-02 10:15:00\ndtype: datetime64[ns]" with pd.option_context("display.width", 300): for idx, expected in zip( @@ -281,7 +269,7 @@ def test_dti_summary(self): result = idx._summary() assert result == expected - @pytest.mark.parametrize("tz", [None, pytz.utc, dateutil.tz.tzutc()]) + @pytest.mark.parametrize("tz", [None, timezone.utc, dateutil.tz.tzutc()]) @pytest.mark.parametrize("freq", ["B", "C"]) def test_dti_business_repr_etc_smoke(self, tz, freq): # only really care that it works @@ -291,66 
+279,3 @@ def test_dti_business_repr_etc_smoke(self, tz, freq): repr(dti) dti._summary() dti[2:2]._summary() - - -class TestFormat: - def test_format(self): - # GH#35439 - idx = pd.date_range("20130101", periods=5) - expected = [f"{x:%Y-%m-%d}" for x in idx] - msg = r"DatetimeIndex\.format is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - assert idx.format() == expected - - def test_format_with_name_time_info(self): - # bug I fixed 12/20/2011 - dates = pd.date_range("2011-01-01 04:00:00", periods=10, name="something") - - msg = "DatetimeIndex.format is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - formatted = dates.format(name=True) - assert formatted[0] == "something" - - def test_format_datetime_with_time(self): - dti = DatetimeIndex([datetime(2012, 2, 7), datetime(2012, 2, 7, 23)]) - - msg = "DatetimeIndex.format is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = dti.format() - expected = ["2012-02-07 00:00:00", "2012-02-07 23:00:00"] - assert len(result) == 2 - assert result == expected - - def test_format_datetime(self): - msg = "DatetimeIndex.format is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - formatted = pd.to_datetime([datetime(2003, 1, 1, 12), NaT]).format() - assert formatted[0] == "2003-01-01 12:00:00" - assert formatted[1] == "NaT" - - def test_format_date(self): - msg = "DatetimeIndex.format is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - formatted = pd.to_datetime([datetime(2003, 1, 1), NaT]).format() - assert formatted[0] == "2003-01-01" - assert formatted[1] == "NaT" - - def test_format_date_tz(self): - dti = pd.to_datetime([datetime(2013, 1, 1)], utc=True) - msg = "DatetimeIndex.format is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - formatted = dti.format() - assert formatted[0] == "2013-01-01 00:00:00+00:00" - - dti = pd.to_datetime([datetime(2013, 1, 1), NaT], utc=True) - with tm.assert_produces_warning(FutureWarning, match=msg): - formatted = dti.format() - assert formatted[0] == "2013-01-01 00:00:00+00:00" - - def test_format_date_explicit_date_format(self): - dti = pd.to_datetime([datetime(2003, 2, 1), NaT]) - msg = "DatetimeIndex.format is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - formatted = dti.format(date_format="%m-%d-%Y", na_rep="UT") - assert formatted[0] == "02-01-2003" - assert formatted[1] == "UT" diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index bfbcdcff51ee6..c44345273466c 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -338,8 +338,7 @@ def test_take_fill_value(self): tm.assert_index_equal(result, expected) msg = ( - "When allow_fill=True and fill_value is not None, " - "all indices must be >= -1" + "When allow_fill=True and fill_value is not None, all indices must be >= -1" ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) @@ -375,8 +374,7 @@ def test_take_fill_value_with_timezone(self): tm.assert_index_equal(result, expected) msg = ( - "When allow_fill=True and fill_value is not None, " - "all indices must be >= -1" + "When allow_fill=True and fill_value is not None, all indices must be >= -1" ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) diff --git a/pandas/tests/indexes/datetimes/test_iter.py 
b/pandas/tests/indexes/datetimes/test_iter.py index a006ed79f27ba..dbd233eed908f 100644 --- a/pandas/tests/indexes/datetimes/test_iter.py +++ b/pandas/tests/indexes/datetimes/test_iter.py @@ -20,7 +20,7 @@ def test_iteration_preserves_nanoseconds(self, tz): ["2018-02-08 15:00:00.168456358", "2018-02-08 15:00:00.168456359"], tz=tz ) for i, ts in enumerate(index): - assert ts == index[i] # pylint: disable=unnecessary-list-index-lookup + assert ts == index[i] def test_iter_readonly(self): # GH#28055 ints_to_pydatetime with readonly array @@ -35,7 +35,7 @@ def test_iteration_preserves_tz(self): for i, ts in enumerate(index): result = ts - expected = index[i] # pylint: disable=unnecessary-list-index-lookup + expected = index[i] assert result == expected def test_iteration_preserves_tz2(self): @@ -45,7 +45,7 @@ def test_iteration_preserves_tz2(self): for i, ts in enumerate(index): result = ts - expected = index[i] # pylint: disable=unnecessary-list-index-lookup + expected = index[i] assert result._repr_base == expected._repr_base assert result == expected @@ -56,7 +56,7 @@ def test_iteration_preserves_tz3(self): ) for i, ts in enumerate(index): result = ts - expected = index[i] # pylint: disable=unnecessary-list-index-lookup + expected = index[i] assert result._repr_base == expected._repr_base assert result == expected diff --git a/pandas/tests/indexes/datetimes/test_join.py b/pandas/tests/indexes/datetimes/test_join.py index d0ac32939296c..abf6809d67f9c 100644 --- a/pandas/tests/indexes/datetimes/test_join.py +++ b/pandas/tests/indexes/datetimes/test_join.py @@ -70,13 +70,17 @@ def test_join_utc_convert(self, join_type): assert isinstance(result, DatetimeIndex) assert result.tz is timezone.utc - def test_datetimeindex_union_join_empty(self, sort): + def test_datetimeindex_union_join_empty(self, sort, using_infer_string): dti = date_range(start="1/1/2001", end="2/1/2001", freq="D") empty = Index([]) result = dti.union(empty, sort=sort) - expected = dti.astype("O") - tm.assert_index_equal(result, expected) + if using_infer_string: + assert isinstance(result, DatetimeIndex) + tm.assert_index_equal(result, dti) + else: + expected = dti.astype("O") + tm.assert_index_equal(result, expected) result = dti.join(empty) assert isinstance(result, DatetimeIndex) diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index 5db0aa5cf510f..bac9548b932c1 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -10,8 +10,6 @@ ) import pandas._testing as tm -START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) - class TestDatetimeIndexOps: def test_infer_freq(self, freq_sample): @@ -26,6 +24,7 @@ def test_infer_freq(self, freq_sample): class TestBusinessDatetimeIndex: @pytest.fixture def rng(self, freq): + START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) return bdate_range(START, END, freq=freq) def test_comparison(self, rng): diff --git a/pandas/tests/indexes/datetimes/test_partial_slicing.py b/pandas/tests/indexes/datetimes/test_partial_slicing.py index 0ebb88afb6c86..94175a56f1c4a 100644 --- a/pandas/tests/indexes/datetimes/test_partial_slicing.py +++ b/pandas/tests/indexes/datetimes/test_partial_slicing.py @@ -1,4 +1,4 @@ -""" test partial slicing on Series/Frame """ +"""test partial slicing on Series/Frame""" from datetime import datetime @@ -35,7 +35,7 @@ def test_string_index_series_name_converted(self): def test_stringified_slice_with_tz(self): # GH#2658 start = "2013-01-07" - idx = 
date_range(start=start, freq="1d", periods=10, tz="US/Eastern") + idx = date_range(start=start, freq="1D", periods=10, tz="US/Eastern") df = DataFrame(np.arange(10), index=idx) df["2013-01-14 23:44:34.437768-05:00":] # no exception here @@ -236,7 +236,7 @@ def test_partial_slice_second_precision(self): rng = date_range( start=datetime(2005, 1, 1, 0, 0, 59, microsecond=999990), periods=20, - freq="US", + freq="us", ) s = Series(np.arange(20), rng) @@ -450,8 +450,8 @@ def test_getitem_with_datestring_with_UTC_offset(self, start, end): with pytest.raises(ValueError, match="Both dates must"): df[start : end[:-4] + "1:00"] + df = df.tz_localize(None) with pytest.raises(ValueError, match="The index must be timezone"): - df = df.tz_localize(None) df[start:end] def test_slice_reduce_to_series(self): diff --git a/pandas/tests/indexes/datetimes/test_scalar_compat.py b/pandas/tests/indexes/datetimes/test_scalar_compat.py index e93fc0e2a4e2e..eb472b099fb1f 100644 --- a/pandas/tests/indexes/datetimes/test_scalar_compat.py +++ b/pandas/tests/indexes/datetimes/test_scalar_compat.py @@ -11,6 +11,8 @@ import locale import unicodedata +from hypothesis import given +import hypothesis.strategies as st import numpy as np import pytest @@ -135,7 +137,8 @@ def test_dti_hour_tzaware(self, prefix): # GH#12806 # error: Unsupported operand types for + ("List[None]" and "List[str]") @pytest.mark.parametrize( - "time_locale", [None] + tm.get_locales() # type: ignore[operator] + "time_locale", + [None] + tm.get_locales(), # type: ignore[operator] ) def test_day_name_month_name(self, time_locale): # Test Monday -> Sunday and January -> December, in that sequence @@ -327,3 +330,122 @@ def test_dti_is_month_start_custom(self): msg = "Custom business days is not supported by is_month_start" with pytest.raises(ValueError, match=msg): dti.is_month_start + + @pytest.mark.parametrize( + "timestamp, freq, periods, expected_values", + [ + ("2017-12-01", "MS", 3, np.array([False, True, False])), + ("2017-12-01", "QS", 3, np.array([True, False, False])), + ("2017-12-01", "YS", 3, np.array([True, True, True])), + ], + ) + def test_dti_dr_is_year_start(self, timestamp, freq, periods, expected_values): + # GH57377 + result = date_range(timestamp, freq=freq, periods=periods).is_year_start + tm.assert_numpy_array_equal(result, expected_values) + + @pytest.mark.parametrize( + "timestamp, freq, periods, expected_values", + [ + ("2017-12-01", "ME", 3, np.array([True, False, False])), + ("2017-12-01", "QE", 3, np.array([True, False, False])), + ("2017-12-01", "YE", 3, np.array([True, True, True])), + ], + ) + def test_dti_dr_is_year_end(self, timestamp, freq, periods, expected_values): + # GH57377 + result = date_range(timestamp, freq=freq, periods=periods).is_year_end + tm.assert_numpy_array_equal(result, expected_values) + + @pytest.mark.parametrize( + "timestamp, freq, periods, expected_values", + [ + ("2017-12-01", "MS", 3, np.array([False, True, False])), + ("2017-12-01", "QS", 3, np.array([True, True, True])), + ("2017-12-01", "YS", 3, np.array([True, True, True])), + ], + ) + def test_dti_dr_is_quarter_start(self, timestamp, freq, periods, expected_values): + # GH57377 + result = date_range(timestamp, freq=freq, periods=periods).is_quarter_start + tm.assert_numpy_array_equal(result, expected_values) + + @pytest.mark.parametrize( + "timestamp, freq, periods, expected_values", + [ + ("2017-12-01", "ME", 3, np.array([True, False, False])), + ("2017-12-01", "QE", 3, np.array([True, True, True])), + ("2017-12-01", "YE", 3, 
np.array([True, True, True])), + ], + ) + def test_dti_dr_is_quarter_end(self, timestamp, freq, periods, expected_values): + # GH57377 + result = date_range(timestamp, freq=freq, periods=periods).is_quarter_end + tm.assert_numpy_array_equal(result, expected_values) + + @pytest.mark.parametrize( + "timestamp, freq, periods, expected_values", + [ + ("2017-12-01", "MS", 3, np.array([True, True, True])), + ("2017-12-01", "QS", 3, np.array([True, True, True])), + ("2017-12-01", "YS", 3, np.array([True, True, True])), + ], + ) + def test_dti_dr_is_month_start(self, timestamp, freq, periods, expected_values): + # GH57377 + result = date_range(timestamp, freq=freq, periods=periods).is_month_start + tm.assert_numpy_array_equal(result, expected_values) + + @pytest.mark.parametrize( + "timestamp, freq, periods, expected_values", + [ + ("2017-12-01", "ME", 3, np.array([True, True, True])), + ("2017-12-01", "QE", 3, np.array([True, True, True])), + ("2017-12-01", "YE", 3, np.array([True, True, True])), + ], + ) + def test_dti_dr_is_month_end(self, timestamp, freq, periods, expected_values): + # GH57377 + result = date_range(timestamp, freq=freq, periods=periods).is_month_end + tm.assert_numpy_array_equal(result, expected_values) + + def test_dti_is_year_quarter_start_doubledigit_freq(self): + # GH#58523 + dr = date_range("2017-01-01", periods=2, freq="10YS") + assert all(dr.is_year_start) + + dr = date_range("2017-01-01", periods=2, freq="10QS") + assert all(dr.is_quarter_start) + + def test_dti_is_year_start_freq_custom_business_day_with_digit(self): + # GH#58664 + dr = date_range("2020-01-01", periods=2, freq="2C") + msg = "Custom business days is not supported by is_year_start" + with pytest.raises(ValueError, match=msg): + dr.is_year_start + + @pytest.mark.parametrize("freq", ["3BMS", offsets.BusinessMonthBegin(3)]) + def test_dti_is_year_quarter_start_freq_business_month_begin(self, freq): + # GH#58729 + dr = date_range("2020-01-01", periods=5, freq=freq) + result = [x.is_year_start for x in dr] + assert result == [True, False, False, False, True] + + dr = date_range("2020-01-01", periods=4, freq=freq) + result = [x.is_quarter_start for x in dr] + assert all(dr.is_quarter_start) + + +@given( + dt=st.datetimes(min_value=datetime(1960, 1, 1), max_value=datetime(1980, 1, 1)), + n=st.integers(min_value=1, max_value=10), + freq=st.sampled_from(["MS", "QS", "YS"]), +) +@pytest.mark.slow +def test_against_scalar_parametric(freq, dt, n): + # https://github.com/pandas-dev/pandas/issues/49606 + freq = f"{n}{freq}" + d = date_range(dt, periods=3, freq=freq) + result = list(d.is_year_start) + expected = [x.is_year_start for x in d] + assert result == expected diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index fc3a1d4721841..7ef6efad0ff6f 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -1,11 +1,11 @@ from datetime import ( datetime, + timedelta, timezone, ) import numpy as np import pytest -import pytz import pandas.util._test_decorators as td @@ -560,6 +560,7 @@ def test_intersection_list(self): tm.assert_index_equal(res, idx) def test_month_range_union_tz_pytz(self, sort): + pytz = pytest.importorskip("pytz") tz = pytz.timezone("US/Eastern") early_start = datetime(2011, 1, 1) @@ -648,7 +649,7 @@ def test_intersection_bug(self): assert result.freq == b.freq @pytest.mark.parametrize( - "tz", [None, "UTC", "Europe/Berlin", pytz.FixedOffset(-60)] + "tz", [None, "UTC", "Europe/Berlin", 
timezone(timedelta(hours=-1))] ) def test_intersection_dst_transition(self, tz): # GH 46702: Europe/Berlin has DST transition @@ -664,3 +665,32 @@ def test_intersection_dst_transition(self, tz): result = index1.union(index2) expected = date_range("2021-10-28", periods=6, freq="D", tz="Europe/London") tm.assert_index_equal(result, expected) + + +def test_union_non_nano_rangelike(): + # GH 59036 + l1 = DatetimeIndex( + ["2024-05-11", "2024-05-12"], dtype="datetime64[us]", name="Date", freq="D" + ) + l2 = DatetimeIndex(["2024-05-13"], dtype="datetime64[us]", name="Date", freq="D") + result = l1.union(l2) + expected = DatetimeIndex( + ["2024-05-11", "2024-05-12", "2024-05-13"], + dtype="datetime64[us]", + name="Date", + freq="D", + ) + tm.assert_index_equal(result, expected) + + +def test_intersection_non_nano_rangelike(): + # GH 59271 + l1 = date_range("2024-01-01", "2024-01-03", unit="s") + l2 = date_range("2024-01-02", "2024-01-04", unit="s") + result = l1.intersection(l2) + expected = DatetimeIndex( + ["2024-01-02", "2024-01-03"], + dtype="datetime64[s]", + freq="D", + ) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py index daa5b346eb4ec..8d9340818b511 100644 --- a/pandas/tests/indexes/datetimes/test_timezones.py +++ b/pandas/tests/indexes/datetimes/test_timezones.py @@ -1,17 +1,18 @@ """ Tests for DatetimeIndex timezone-related methods """ + from datetime import ( datetime, timedelta, timezone, tzinfo, ) +import zoneinfo from dateutil.tz import gettz import numpy as np import pytest -import pytz from pandas._libs.tslibs import ( conversion, @@ -199,21 +200,22 @@ def test_utc_box_timestamp_and_localize(self, tzstr): # right tzinfo rng = date_range("3/13/2012", "3/14/2012", freq="h", tz="utc") rng_eastern = rng.tz_convert(tzstr) - # test not valid for dateutil timezones. 
- # assert 'EDT' in repr(rng_eastern[0].tzinfo) - assert "EDT" in repr(rng_eastern[0].tzinfo) or "tzfile" in repr( - rng_eastern[0].tzinfo - ) - - @pytest.mark.parametrize("tz", [pytz.timezone("US/Central"), gettz("US/Central")]) + if "dateutil" in tzstr: + assert "EDT" in repr(rng_eastern[0].tzinfo) or "tzfile" in repr( + rng_eastern[0].tzinfo + ) + + @pytest.mark.parametrize( + "tz", [zoneinfo.ZoneInfo("US/Central"), gettz("US/Central")] + ) def test_with_tz(self, tz): # just want it to work - start = datetime(2011, 3, 12, tzinfo=pytz.utc) + start = datetime(2011, 3, 12, tzinfo=timezone.utc) dr = bdate_range(start, periods=50, freq=pd.offsets.Hour()) - assert dr.tz is pytz.utc + assert dr.tz is timezone.utc # DateRange with naive datetimes - dr = bdate_range("1/1/2005", "1/1/2009", tz=pytz.utc) + dr = bdate_range("1/1/2005", "1/1/2009", tz=timezone.utc) dr = bdate_range("1/1/2005", "1/1/2009", tz=tz) # normalized @@ -230,13 +232,16 @@ def test_with_tz(self, tz): # datetimes with tzinfo set dr = bdate_range( - datetime(2005, 1, 1, tzinfo=pytz.utc), datetime(2009, 1, 1, tzinfo=pytz.utc) + datetime(2005, 1, 1, tzinfo=timezone.utc), + datetime(2009, 1, 1, tzinfo=timezone.utc), ) msg = "Start and end cannot both be tz-aware with different timezones" with pytest.raises(Exception, match=msg): - bdate_range(datetime(2005, 1, 1, tzinfo=pytz.utc), "1/1/2009", tz=tz) + bdate_range(datetime(2005, 1, 1, tzinfo=timezone.utc), "1/1/2009", tz=tz) - @pytest.mark.parametrize("tz", [pytz.timezone("US/Eastern"), gettz("US/Eastern")]) + @pytest.mark.parametrize( + "tz", [zoneinfo.ZoneInfo("US/Eastern"), gettz("US/Eastern")] + ) def test_dti_convert_tz_aware_datetime_datetime(self, tz): # GH#1581 dates = [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)] diff --git a/pandas/tests/indexes/interval/test_astype.py b/pandas/tests/indexes/interval/test_astype.py index 59c555b9644a1..dde5f38074efb 100644 --- a/pandas/tests/indexes/interval/test_astype.py +++ b/pandas/tests/indexes/interval/test_astype.py @@ -186,6 +186,12 @@ def test_subtype_datetimelike(self, index, subtype): with pytest.raises(TypeError, match=msg): index.astype(dtype) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + def test_astype_category(self, index): + super().test_astype_category(index) + class TestDatetimelikeSubtype(AstypeTests): """Tests specific to IntervalIndex with datetime-like subtype""" diff --git a/pandas/tests/indexes/interval/test_constructors.py b/pandas/tests/indexes/interval/test_constructors.py index 778c07b46e57c..90423149658ab 100644 --- a/pandas/tests/indexes/interval/test_constructors.py +++ b/pandas/tests/indexes/interval/test_constructors.py @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas.core.dtypes.common import is_unsigned_integer_dtype from pandas.core.dtypes.dtypes import IntervalDtype @@ -23,11 +25,6 @@ import pandas.core.common as com -@pytest.fixture(params=[None, "foo"]) -def name(request): - return request.param - - class ConstructorTests: """ Common tests for all variations of IntervalIndex construction. Input data @@ -35,24 +32,23 @@ class ConstructorTests: get_kwargs_from_breaks to the expected format. 
""" - @pytest.fixture( - params=[ + @pytest.mark.parametrize( + "breaks_and_expected_subtype", + [ ([3, 14, 15, 92, 653], np.int64), (np.arange(10, dtype="int64"), np.int64), (Index(np.arange(-10, 11, dtype=np.int64)), np.int64), (Index(np.arange(10, 31, dtype=np.uint64)), np.uint64), (Index(np.arange(20, 30, 0.5), dtype=np.float64), np.float64), - (date_range("20180101", periods=10), "=2. + return + + # for MultiIndex, copy=False is never allowed + with pytest.raises(ValueError, match="Unable to avoid copy while creating"): + np.array(idx, copy=False) + + def test_to_frame(): tuples = [(1, "one"), (1, "two"), (2, "one"), (2, "two")] @@ -148,6 +185,13 @@ def test_to_frame_duplicate_labels(): tm.assert_frame_equal(result, expected) +def test_to_frame_column_rangeindex(): + mi = MultiIndex.from_arrays([[1, 2], ["a", "b"]]) + result = mi.to_frame().columns + expected = RangeIndex(2) + tm.assert_index_equal(result, expected, exact=True) + + def test_to_flat_index(idx): expected = pd.Index( ( diff --git a/pandas/tests/indexes/multi/test_drop.py b/pandas/tests/indexes/multi/test_drop.py index 99c8ebb1e57b2..b83680be6a5ce 100644 --- a/pandas/tests/indexes/multi/test_drop.py +++ b/pandas/tests/indexes/multi/test_drop.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas.errors import PerformanceWarning - import pandas as pd from pandas import ( Index, @@ -121,7 +119,7 @@ def test_droplevel_list(): index[:2].droplevel(["one", "four"]) -def test_drop_not_lexsorted(): +def test_drop_not_lexsorted(performance_warning): # GH 12078 # define the lexsorted version of the multi-index @@ -140,7 +138,7 @@ def test_drop_not_lexsorted(): # compare the results tm.assert_index_equal(lexsorted_mi, not_lexsorted_mi) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): tm.assert_index_equal(lexsorted_mi.drop("a"), not_lexsorted_mi.drop("a")) diff --git a/pandas/tests/indexes/multi/test_duplicates.py b/pandas/tests/indexes/multi/test_duplicates.py index 6c6d9022b1af3..1bbeedac3fb10 100644 --- a/pandas/tests/indexes/multi/test_duplicates.py +++ b/pandas/tests/indexes/multi/test_duplicates.py @@ -243,13 +243,14 @@ def f(a): @pytest.mark.parametrize( "keep, expected", [ - ("first", np.array([False, False, False, True, True, False])), - ("last", np.array([False, True, True, False, False, False])), - (False, np.array([False, True, True, True, True, False])), + ("first", [False, False, False, True, True, False]), + ("last", [False, True, True, False, False, False]), + (False, [False, True, True, True, True, False]), ], ) def test_duplicated(idx_dup, keep, expected): result = idx_dup.duplicated(keep=keep) + expected = np.array(expected) tm.assert_numpy_array_equal(result, expected) @@ -319,14 +320,7 @@ def test_duplicated_drop_duplicates(): tm.assert_index_equal(idx.drop_duplicates(keep=False), expected) -@pytest.mark.parametrize( - "dtype", - [ - np.complex64, - np.complex128, - ], -) -def test_duplicated_series_complex_numbers(dtype): +def test_duplicated_series_complex_numbers(complex_dtype): # GH 17927 expected = Series( [False, False, False, True, False, False, False, True, False, True], @@ -345,7 +339,7 @@ def test_duplicated_series_complex_numbers(dtype): np.nan, np.nan + np.nan * 1j, ], - dtype=dtype, + dtype=complex_dtype, ).duplicated() tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/multi/test_formats.py b/pandas/tests/indexes/multi/test_formats.py index 52ff3109128f2..6ea42349bd04a 100644 --- 
a/pandas/tests/indexes/multi/test_formats.py +++ b/pandas/tests/indexes/multi/test_formats.py @@ -6,48 +6,6 @@ Index, MultiIndex, ) -import pandas._testing as tm - - -def test_format(idx): - msg = "MultiIndex.format is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - idx.format() - idx[:0].format() - - -def test_format_integer_names(): - index = MultiIndex( - levels=[[0, 1], [0, 1]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]], names=[0, 1] - ) - msg = "MultiIndex.format is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - index.format(names=True) - - -def test_format_sparse_config(idx): - # GH1538 - msg = "MultiIndex.format is deprecated" - with pd.option_context("display.multi_sparse", False): - with tm.assert_produces_warning(FutureWarning, match=msg): - result = idx.format() - assert result[1] == "foo two" - - -def test_format_sparse_display(): - index = MultiIndex( - levels=[[0, 1], [0, 1], [0, 1], [0]], - codes=[ - [0, 0, 0, 1, 1, 1], - [0, 0, 1, 0, 0, 1], - [0, 1, 0, 0, 1, 0], - [0, 0, 0, 0, 0, 0], - ], - ) - msg = "MultiIndex.format is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = index.format() - assert result[3] == "1 0 0 0" def test_repr_with_unicode_data(): diff --git a/pandas/tests/indexes/multi/test_get_level_values.py b/pandas/tests/indexes/multi/test_get_level_values.py index 28c77e78924cb..4db74a716c514 100644 --- a/pandas/tests/indexes/multi/test_get_level_values.py +++ b/pandas/tests/indexes/multi/test_get_level_values.py @@ -122,3 +122,12 @@ def test_values_loses_freq_of_underlying_index(): midx.values assert idx.freq is not None tm.assert_index_equal(idx, expected) + + +def test_get_level_values_gets_frequency_correctly(): + # GH#57949 GH#58327 + datetime_index = date_range(start=pd.to_datetime("1/1/2018"), periods=4, freq="YS") + other_index = ["A"] + multi_index = MultiIndex.from_product([datetime_index, other_index]) + + assert multi_index.get_level_values(0).freq == datetime_index.freq diff --git a/pandas/tests/indexes/multi/test_get_set.py b/pandas/tests/indexes/multi/test_get_set.py index 6eeaeb6711d03..7f292aacc39ca 100644 --- a/pandas/tests/indexes/multi/test_get_set.py +++ b/pandas/tests/indexes/multi/test_get_set.py @@ -41,7 +41,7 @@ def test_get_dtypes(using_infer_string): names=["int", "string", "dt"], ) - exp = "object" if not using_infer_string else "string" + exp = "object" if not using_infer_string else pd.StringDtype(na_value=np.nan) expected = pd.Series( { "int": np.dtype("int64"), @@ -61,7 +61,7 @@ def test_get_dtypes_no_level_name(using_infer_string): pd.date_range("20200101", periods=2, tz="UTC"), ], ) - exp = "object" if not using_infer_string else "string" + exp = "object" if not using_infer_string else pd.StringDtype(na_value=np.nan) expected = pd.Series( { "level_0": np.dtype("int64"), @@ -82,7 +82,7 @@ def test_get_dtypes_duplicate_level_names(using_infer_string): ], names=["A", "A", "A"], ).dtypes - exp = "object" if not using_infer_string else "string" + exp = "object" if not using_infer_string else pd.StringDtype(na_value=np.nan) expected = pd.Series( [np.dtype("int64"), exp, DatetimeTZDtype(tz="utc")], index=["A", "A", "A"], @@ -332,10 +332,8 @@ def test_set_value_keeps_names(): index=idx, ) df = df.sort_index() - assert df._is_copy is None assert df.index.names == ("Name", "Number") df.at[("grethe", "4"), "one"] = 99.34 - assert df._is_copy is None assert df.index.names == ("Name", "Number") diff --git a/pandas/tests/indexes/multi/test_indexing.py 
b/pandas/tests/indexes/multi/test_indexing.py index 5e2d3c23da645..f098690be2afa 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -1,3 +1,4 @@ +from collections import namedtuple from datetime import timedelta import re @@ -5,10 +6,7 @@ import pytest from pandas._libs import index as libindex -from pandas.errors import ( - InvalidIndexError, - PerformanceWarning, -) +from pandas.errors import InvalidIndexError import pandas as pd from pandas import ( @@ -43,12 +41,12 @@ def test_slice_locs(self): columns=Index(list("ABCD"), dtype=object), index=date_range("2000-01-01", periods=50, freq="B"), ) - stacked = df.stack(future_stack=True) + stacked = df.stack() idx = stacked.index slob = slice(*idx.slice_locs(df.index[5], df.index[15])) sliced = stacked[slob] - expected = df[5:16].stack(future_stack=True) + expected = df[5:16].stack() tm.assert_almost_equal(sliced.values, expected.values) slob = slice( @@ -58,7 +56,7 @@ def test_slice_locs(self): ) ) sliced = stacked[slob] - expected = df[6:15].stack(future_stack=True) + expected = df[6:15].stack() tm.assert_almost_equal(sliced.values, expected.values) def test_slice_locs_with_type_mismatch(self): @@ -67,7 +65,7 @@ def test_slice_locs_with_type_mismatch(self): columns=Index(list("ABCD"), dtype=object), index=date_range("2000-01-01", periods=10, freq="B"), ) - stacked = df.stack(future_stack=True) + stacked = df.stack() idx = stacked.index with pytest.raises(TypeError, match="^Level type mismatch"): idx.slice_locs((1, 3)) @@ -78,7 +76,7 @@ def test_slice_locs_with_type_mismatch(self): index=Index([f"i-{i}" for i in range(5)], name="a"), columns=Index([f"i-{i}" for i in range(5)], name="a"), ) - stacked = df.stack(future_stack=True) + stacked = df.stack() idx = stacked.index with pytest.raises(TypeError, match="^Level type mismatch"): idx.slice_locs(timedelta(seconds=30)) @@ -261,8 +259,7 @@ def test_get_indexer(self): def test_get_indexer_nearest(self): midx = MultiIndex.from_tuples([("a", 1), ("b", 2)]) msg = ( - "method='nearest' not implemented yet for MultiIndex; " - "see GitHub issue 9365" + "method='nearest' not implemented yet for MultiIndex; see GitHub issue 9365" ) with pytest.raises(NotImplementedError, match=msg): midx.get_indexer(["a"], method="nearest") @@ -560,27 +557,26 @@ def test_getitem_group_select(idx): assert sorted_idx.get_loc("foo") == slice(0, 2) -@pytest.mark.parametrize("ind1", [[True] * 5, Index([True] * 5)]) -@pytest.mark.parametrize( - "ind2", - [[True, False, True, False, False], Index([True, False, True, False, False])], -) -def test_getitem_bool_index_all(ind1, ind2): +@pytest.mark.parametrize("box", [list, Index]) +def test_getitem_bool_index_all(box): # GH#22533 + ind1 = box([True] * 5) idx = MultiIndex.from_tuples([(10, 1), (20, 2), (30, 3), (40, 4), (50, 5)]) tm.assert_index_equal(idx[ind1], idx) + ind2 = box([True, False, True, False, False]) expected = MultiIndex.from_tuples([(10, 1), (30, 3)]) tm.assert_index_equal(idx[ind2], expected) -@pytest.mark.parametrize("ind1", [[True], Index([True])]) -@pytest.mark.parametrize("ind2", [[False], Index([False])]) -def test_getitem_bool_index_single(ind1, ind2): +@pytest.mark.parametrize("box", [list, Index]) +def test_getitem_bool_index_single(box): # GH#22533 + ind1 = box([True]) idx = MultiIndex.from_tuples([(10, 1)]) tm.assert_index_equal(idx[ind1], idx) + ind2 = box([False]) expected = MultiIndex( levels=[np.array([], dtype=np.int64), np.array([], dtype=np.int64)], codes=[[], []], @@ -750,7 +746,7 @@ def 
test_get_loc_duplicates2(self):
         assert index.get_loc("D") == slice(0, 3)
 
-    def test_get_loc_past_lexsort_depth(self):
+    def test_get_loc_past_lexsort_depth(self, performance_warning):
         # GH#30053
         idx = MultiIndex(
             levels=[["a"], [0, 7], [1]],
@@ -760,7 +756,7 @@ def test_get_loc_past_lexsort_depth(self):
         )
         key = ("a", 7)
 
-        with tm.assert_produces_warning(PerformanceWarning):
+        with tm.assert_produces_warning(performance_warning):
             # PerformanceWarning: indexing past lexsort depth may impact performance
             result = idx.get_loc(key)
 
@@ -923,30 +919,41 @@ def test_slice_indexer_with_missing_value(index_arr, expected, start_idx, end_id
     assert result == expected
 
 
-def test_pyint_engine():
+@pytest.mark.parametrize(
+    "N, expected_dtype",
+    [
+        (1, "uint8"),  # 2*4*N = 8
+        (2, "uint16"),  # 2*4*N = 16
+        (4, "uint32"),  # 2*4*N = 32
+        (8, "uint64"),  # 2*4*N = 64
+        (10, "object"),  # 2*4*N = 80
+    ],
+)
+def test_pyint_engine(N, expected_dtype):
     # GH#18519 : when combinations of codes cannot be represented in 64
     # bits, the index underlying the MultiIndex engine works with Python
     # integers, rather than uint64.
-    N = 5
     keys = [
         tuple(arr)
         for arr in [
-            [0] * 10 * N,
-            [1] * 10 * N,
-            [2] * 10 * N,
-            [np.nan] * N + [2] * 9 * N,
-            [0] * N + [2] * 9 * N,
-            [np.nan] * N + [2] * 8 * N + [0] * N,
+            [0] * 4 * N,
+            [1] * 4 * N,
+            [np.nan] * N + [0] * 3 * N,
+            [0] * N + [1] * 3 * N,
+            [np.nan] * N + [1] * 2 * N + [0] * N,
        ]
     ]
-    # Each level contains 4 elements (including NaN), so it is represented
-    # in 2 bits, for a total of 2*N*10 = 100 > 64 bits. If we were using a
-    # 64 bit engine and truncating the first levels, the fourth and fifth
-    # keys would collide; if truncating the last levels, the fifth and
-    # sixth; if rotating bits rather than shifting, the third and fifth.
+    # Each level contains 3 elements (NaN, 0, 1) and is represented in 2 bits,
+    # storing 4 possible values (0=notfound, 1=NaN, 2=0, 3=1). With 4*N levels
+    # that is 2*4*N bits in total, e.g. 80 > 64 bits for N=10.
+    # If we were using a 64 bit engine and truncating the first levels, the
+    # fourth and fifth keys would collide; if truncating the last levels, the
+    # fifth and sixth; if rotating bits rather than shifting, the third and fifth.
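# A minimal sketch of the sizing rule the comment above describes — an
# illustrative aside, where `expected_engine_dtype` is a hypothetical helper,
# not pandas' actual engine code. Each of the 4*N levels needs 2 bits, and the
# engine falls back to Python integers ("object") once the packed width
# exceeds 64 bits, which is exactly the pattern in the parametrized cases:

def expected_engine_dtype(N: int) -> str:
    bits = 2 * 4 * N  # 2 bits per level, 4*N levels
    for width in (8, 16, 32, 64):
        if bits <= width:
            return f"uint{width}"
    return "object"

assert expected_engine_dtype(1) == "uint8"    # 8 bits: smallest packed width
assert expected_engine_dtype(8) == "uint64"   # 64 bits still fit
assert expected_engine_dtype(10) == "object"  # 80 bits overflow uint64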
+
+    index = MultiIndex.from_tuples(keys)
+    assert index._engine.values.dtype == expected_dtype
 
     for idx, key_value in enumerate(keys):
-        index = MultiIndex.from_tuples(keys)
         assert index.get_loc(key_value) == idx
 
         expected = np.arange(idx + 1, dtype=np.intp)
@@ -956,7 +963,7 @@ def test_pyint_engine():
     # With missing key:
     idces = range(len(keys))
     expected = np.array([-1] + list(idces), dtype=np.intp)
-    missing = tuple([0, 1] * 5 * N)
+    missing = tuple([0, 1, 0, 1] * N)
     result = index.get_indexer([missing] + [keys[i] for i in idces])
     tm.assert_numpy_array_equal(result, expected)
 
@@ -999,3 +1006,26 @@ def test_get_indexer_for_multiindex_with_nans(nulls_fixture):
     result = idx1.get_indexer(idx2)
     expected = np.array([-1, 1], dtype=np.intp)
     tm.assert_numpy_array_equal(result, expected)
+
+
+def test_get_loc_namedtuple_behaves_like_tuple():
+    # GH57922
+    NamedIndex = namedtuple("NamedIndex", ("a", "b"))
+    multi_idx = MultiIndex.from_tuples(
+        [NamedIndex("i1", "i2"), NamedIndex("i3", "i4"), NamedIndex("i5", "i6")]
+    )
+    for idx in (multi_idx, multi_idx.to_flat_index()):
+        assert idx.get_loc(NamedIndex("i1", "i2")) == 0
+        assert idx.get_loc(NamedIndex("i3", "i4")) == 1
+        assert idx.get_loc(NamedIndex("i5", "i6")) == 2
+        assert idx.get_loc(("i1", "i2")) == 0
+        assert idx.get_loc(("i3", "i4")) == 1
+        assert idx.get_loc(("i5", "i6")) == 2
+    multi_idx = MultiIndex.from_tuples([("i1", "i2"), ("i3", "i4"), ("i5", "i6")])
+    for idx in (multi_idx, multi_idx.to_flat_index()):
+        assert idx.get_loc(NamedIndex("i1", "i2")) == 0
+        assert idx.get_loc(NamedIndex("i3", "i4")) == 1
+        assert idx.get_loc(NamedIndex("i5", "i6")) == 2
+        assert idx.get_loc(("i1", "i2")) == 0
+        assert idx.get_loc(("i3", "i4")) == 1
+        assert idx.get_loc(("i5", "i6")) == 2
diff --git a/pandas/tests/indexes/multi/test_integrity.py b/pandas/tests/indexes/multi/test_integrity.py
index d956747cbc859..d570e911bf584 100644
--- a/pandas/tests/indexes/multi/test_integrity.py
+++ b/pandas/tests/indexes/multi/test_integrity.py
@@ -245,7 +245,7 @@ def test_rangeindex_fallback_coercion_bug():
     df1 = pd.DataFrame(np.arange(100).reshape((10, 10)))
     df2 = pd.DataFrame(np.arange(100).reshape((10, 10)))
     df = pd.concat(
-        {"df1": df1.stack(future_stack=True), "df2": df2.stack(future_stack=True)},
+        {"df1": df1.stack(), "df2": df2.stack()},
         axis=1,
     )
     df.index.names = ["fizz", "buzz"]
diff --git a/pandas/tests/indexes/multi/test_isin.py b/pandas/tests/indexes/multi/test_isin.py
index 68fdf25359f1b..92ac2468d5993 100644
--- a/pandas/tests/indexes/multi/test_isin.py
+++ b/pandas/tests/indexes/multi/test_isin.py
@@ -75,15 +75,16 @@ def test_isin_level_kwarg():
 @pytest.mark.parametrize(
     "labels,expected,level",
     [
-        ([("b", np.nan)], np.array([False, False, True]), None),
-        ([np.nan, "a"], np.array([True, True, False]), 0),
-        (["d", np.nan], np.array([False, True, True]), 1),
+        ([("b", np.nan)], [False, False, True], None),
+        ([np.nan, "a"], [True, True, False], 0),
+        (["d", np.nan], [False, True, True], 1),
     ],
 )
 def test_isin_multi_index_with_missing_value(labels, expected, level):
     # GH 19132
     midx = MultiIndex.from_arrays([[np.nan, "a", "b"], ["c", "d", np.nan]])
     result = midx.isin(labels, level=level)
+    expected = np.array(expected)
     tm.assert_numpy_array_equal(result, expected)
 
diff --git a/pandas/tests/indexes/multi/test_join.py b/pandas/tests/indexes/multi/test_join.py
index edd0feaaa1159..2be6bba475af7 100644
--- a/pandas/tests/indexes/multi/test_join.py
+++ b/pandas/tests/indexes/multi/test_join.py
@@ -12,10 +12,9 @@
 import pandas._testing as tm
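# A minimal sketch of the behaviour the new `ridx is None` branch below
# accounts for (a standalone illustration on plain Index objects, assuming
# nothing beyond public pandas API; not code from this diff):
# `Index.join(..., return_indexers=True)` returns None instead of an indexer
# array for any side that needs no reindexing.

import numpy as np
import pandas as pd

left = pd.Index([1, 2, 3])
right = pd.Index([2, 3, 4])
joined, lidx, ridx = left.join(right, how="left", return_indexers=True)
assert lidx is None  # `joined` is simply `left`, so no left indexer is needed
assert (ridx == np.array([-1, 0, 1])).all()  # -1 marks labels absent from `right`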
-@pytest.mark.parametrize( - "other", [Index(["three", "one", "two"]), Index(["one"]), Index(["one", "three"])] -) +@pytest.mark.parametrize("other", [["three", "one", "two"], ["one"], ["one", "three"]]) def test_join_level(idx, other, join_type): + other = Index(other) join_index, lidx, ridx = other.join( idx, how=join_type, level="second", return_indexers=True ) @@ -36,7 +35,10 @@ def test_join_level(idx, other, join_type): assert join_index.equals(join_index2) tm.assert_numpy_array_equal(lidx, lidx2) - tm.assert_numpy_array_equal(ridx, ridx2) + if ridx is None: + assert ridx == ridx2 + else: + tm.assert_numpy_array_equal(ridx, ridx2) tm.assert_numpy_array_equal(join_index2.values, exp_values) @@ -258,7 +260,7 @@ def test_join_dtypes_all_nan(any_numeric_ea_dtype): def test_join_index_levels(): # GH#53093 - midx = midx = MultiIndex.from_tuples([("a", "2019-02-01"), ("a", "2019-02-01")]) + midx = MultiIndex.from_tuples([("a", "2019-02-01"), ("a", "2019-02-01")]) midx2 = MultiIndex.from_tuples([("a", "2019-01-31")]) result = midx.join(midx2, how="outer") expected = MultiIndex.from_tuples( diff --git a/pandas/tests/indexes/multi/test_missing.py b/pandas/tests/indexes/multi/test_missing.py index 14ffc42fb4b59..41cfa093ae53c 100644 --- a/pandas/tests/indexes/multi/test_missing.py +++ b/pandas/tests/indexes/multi/test_missing.py @@ -8,7 +8,7 @@ def test_fillna(idx): # GH 11343 - msg = "isna is not defined for MultiIndex" + msg = "fillna is not defined for MultiIndex" with pytest.raises(NotImplementedError, match=msg): idx.fillna(idx[0]) diff --git a/pandas/tests/indexes/multi/test_reshape.py b/pandas/tests/indexes/multi/test_reshape.py index 06dbb33aadf97..cc3dadc6bb61c 100644 --- a/pandas/tests/indexes/multi/test_reshape.py +++ b/pandas/tests/indexes/multi/test_reshape.py @@ -1,8 +1,8 @@ from datetime import datetime +import zoneinfo import numpy as np import pytest -import pytz import pandas as pd from pandas import ( @@ -114,11 +114,11 @@ def test_append_index(): result = idx1.append(midx_lv2) # see gh-7112 - tz = pytz.timezone("Asia/Tokyo") + tz = zoneinfo.ZoneInfo("Asia/Tokyo") expected_tuples = [ - (1.1, tz.localize(datetime(2011, 1, 1))), - (1.2, tz.localize(datetime(2011, 1, 2))), - (1.3, tz.localize(datetime(2011, 1, 3))), + (1.1, datetime(2011, 1, 1, tzinfo=tz)), + (1.2, datetime(2011, 1, 2, tzinfo=tz)), + (1.3, datetime(2011, 1, 3, tzinfo=tz)), ] expected = Index([1.1, 1.2, 1.3] + expected_tuples) tm.assert_index_equal(result, expected) @@ -138,9 +138,9 @@ def test_append_index(): expected = Index._simple_new( np.array( [ - (1.1, tz.localize(datetime(2011, 1, 1)), "A"), - (1.2, tz.localize(datetime(2011, 1, 2)), "B"), - (1.3, tz.localize(datetime(2011, 1, 3)), "C"), + (1.1, datetime(2011, 1, 1, tzinfo=tz), "A"), + (1.2, datetime(2011, 1, 2, tzinfo=tz), "B"), + (1.3, datetime(2011, 1, 3, tzinfo=tz), "C"), ] + expected_tuples, dtype=object, diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index 0abb56ecf9de7..f7544cf62e5fa 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -382,7 +382,7 @@ def test_union_sort_other_incomparable(): idx = MultiIndex.from_product([[1, pd.Timestamp("2000")], ["a", "b"]]) # default, sort=None - with tm.assert_produces_warning(RuntimeWarning): + with tm.assert_produces_warning(RuntimeWarning, match="are unorderable"): result = idx.union(idx[:1]) tm.assert_index_equal(result, idx) @@ -711,17 +711,11 @@ def test_intersection_lexsort_depth(levels1, levels2, 
codes1, codes2, names): "a", [pd.Categorical(["a", "b"], categories=["a", "b"]), ["a", "b"]], ) -@pytest.mark.parametrize( - "b", - [ - pd.Categorical(["a", "b"], categories=["b", "a"], ordered=True), - pd.Categorical(["a", "b"], categories=["b", "a"]), - ], -) -def test_intersection_with_non_lex_sorted_categories(a, b): +@pytest.mark.parametrize("b_ordered", [True, False]) +def test_intersection_with_non_lex_sorted_categories(a, b_ordered): # GH#49974 other = ["1", "2"] - + b = pd.Categorical(["a", "b"], categories=["b", "a"], ordered=b_ordered) df1 = DataFrame({"x": a, "y": other}) df2 = DataFrame({"x": b, "y": other}) @@ -763,7 +757,7 @@ def test_union_with_na_when_constructing_dataframe(): series1 = Series( (1,), index=MultiIndex.from_arrays( - [Series([None], dtype="string"), Series([None], dtype="string")] + [Series([None], dtype="str"), Series([None], dtype="str")] ), ) series2 = Series((10, 20), index=MultiIndex.from_tuples(((None, None), ("a", "b")))) diff --git a/pandas/tests/indexes/multi/test_sorting.py b/pandas/tests/indexes/multi/test_sorting.py index b4dcef71dcf50..3d21ee8a57716 100644 --- a/pandas/tests/indexes/multi/test_sorting.py +++ b/pandas/tests/indexes/multi/test_sorting.py @@ -1,10 +1,7 @@ import numpy as np import pytest -from pandas.errors import ( - PerformanceWarning, - UnsortedIndexError, -) +from pandas.errors import UnsortedIndexError from pandas import ( CategoricalIndex, @@ -135,7 +132,7 @@ def test_unsortedindex(): df.loc(axis=0)["q", :] -def test_unsortedindex_doc_examples(): +def test_unsortedindex_doc_examples(performance_warning): # https://pandas.pydata.org/pandas-docs/stable/advanced.html#sorting-a-multiindex dfm = DataFrame( { @@ -146,12 +143,12 @@ def test_unsortedindex_doc_examples(): ) dfm = dfm.set_index(["jim", "joe"]) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): dfm.loc[(1, "z")] msg = r"Key length \(2\) was greater than MultiIndex lexsort depth \(1\)" with pytest.raises(UnsortedIndexError, match=msg): - dfm.loc[(0, "y"):(1, "z")] + dfm.loc[(0, "y") : (1, "z")] assert not dfm.index._is_lexsorted() assert dfm.index._lexsort_depth == 1 @@ -159,7 +156,7 @@ def test_unsortedindex_doc_examples(): # sort it dfm = dfm.sort_index() dfm.loc[(1, "z")] - dfm.loc[(0, "y"):(1, "z")] + dfm.loc[(0, "y") : (1, "z")] assert dfm.index._is_lexsorted() assert dfm.index._lexsort_depth == 2 diff --git a/pandas/tests/reshape/test_util.py b/pandas/tests/indexes/multi/test_util.py similarity index 73% rename from pandas/tests/reshape/test_util.py rename to pandas/tests/indexes/multi/test_util.py index 4d0be7464cb3d..68792ce53f04e 100644 --- a/pandas/tests/reshape/test_util.py +++ b/pandas/tests/indexes/multi/test_util.py @@ -6,7 +6,7 @@ date_range, ) import pandas._testing as tm -from pandas.core.reshape.util import cartesian_product +from pandas.core.indexes.multi import cartesian_product class TestCartesianProduct: @@ -28,22 +28,6 @@ def test_datetimeindex(self): tm.assert_index_equal(result1, expected1) tm.assert_index_equal(result2, expected2) - def test_tzaware_retained(self): - x = date_range("2000-01-01", periods=2, tz="US/Pacific") - y = np.array([3, 4]) - result1, result2 = cartesian_product([x, y]) - - expected = x.repeat(2) - tm.assert_index_equal(result1, expected) - - def test_tzaware_retained_categorical(self): - x = date_range("2000-01-01", periods=2, tz="US/Pacific").astype("category") - y = np.array([3, 4]) - result1, result2 = cartesian_product([x, y]) - - expected = x.repeat(2) - 
tm.assert_index_equal(result1, expected)
-
     @pytest.mark.parametrize("x, y", [[[], []], [[0, 1], []], [[], ["a", "b", "c"]]])
     def test_empty(self, x, y):
         # product of empty factors
@@ -72,8 +56,8 @@ def test_exceed_product_space(self):
         # GH31355: raise useful error when product space is too large
         msg = "Product space too large to allocate arrays!"
+        dims = [np.arange(0, 22, dtype=np.int16) for i in range(12)] + [
+            (np.arange(15128, dtype=np.int16)),
+        ]
         with pytest.raises(ValueError, match=msg):
-            dims = [np.arange(0, 22, dtype=np.int16) for i in range(12)] + [
-                (np.arange(15128, dtype=np.int16)),
-            ]
             cartesian_product(X=dims)
 
diff --git a/pandas/tests/indexes/numeric/test_indexing.py b/pandas/tests/indexes/numeric/test_indexing.py
index cd28d519313ed..3c1b98d57b2a0 100644
--- a/pandas/tests/indexes/numeric/test_indexing.py
+++ b/pandas/tests/indexes/numeric/test_indexing.py
@@ -17,13 +17,6 @@
 )
 
 
-@pytest.fixture
-def index_large():
-    # large values used in Index[uint64] tests where no compat needed with Int64/Float64
-    large = [2**63, 2**63 + 10, 2**63 + 15, 2**63 + 20, 2**63 + 25]
-    return Index(large, dtype=np.uint64)
-
-
 class TestGetLoc:
     def test_get_loc(self):
         index = Index([0, 1, 2])
@@ -117,16 +110,16 @@ def test_get_indexer(self):
     @pytest.mark.parametrize(
         "expected,method",
         [
-            (np.array([-1, 0, 0, 1, 1], dtype=np.intp), "pad"),
-            (np.array([-1, 0, 0, 1, 1], dtype=np.intp), "ffill"),
-            (np.array([0, 0, 1, 1, 2], dtype=np.intp), "backfill"),
-            (np.array([0, 0, 1, 1, 2], dtype=np.intp), "bfill"),
+            ([-1, 0, 0, 1, 1], "pad"),
+            ([-1, 0, 0, 1, 1], "ffill"),
+            ([0, 0, 1, 1, 2], "backfill"),
+            ([0, 0, 1, 1, 2], "bfill"),
         ],
     )
     def test_get_indexer_methods(self, reverse, expected, method):
         index1 = Index([1, 2, 3, 4, 5])
         index2 = Index([2, 4, 6])
-
+        expected = np.array(expected, dtype=np.intp)
         if reverse:
             index1 = index1[::-1]
             expected = expected[::-1]
@@ -173,12 +166,11 @@ def test_get_indexer_nearest(self, method, tolerance, indexer, expected):
     @pytest.mark.parametrize("listtype", [list, tuple, Series, np.array])
     @pytest.mark.parametrize(
         "tolerance, expected",
-        list(
-            zip(
-                [[0.3, 0.3, 0.1], [0.2, 0.1, 0.1], [0.1, 0.5, 0.5]],
-                [[0, 2, -1], [0, -1, -1], [-1, 2, 9]],
-            )
-        ),
+        [
+            [[0.3, 0.3, 0.1], [0, 2, -1]],
+            [[0.2, 0.1, 0.1], [0, -1, -1]],
+            [[0.1, 0.5, 0.5], [-1, 2, 9]],
+        ],
     )
     def test_get_indexer_nearest_listlike_tolerance(
         self, tolerance, expected, listtype
     ):
@@ -303,7 +295,11 @@ def test_get_indexer_int64(self):
         expected = np.array([0, 1, 1, 2, 2, 3, 3, 4, 4, 5], dtype=np.intp)
         tm.assert_numpy_array_equal(indexer, expected)
 
-    def test_get_indexer_uint64(self, index_large):
+    def test_get_indexer_uint64(self):
+        index_large = Index(
+            [2**63, 2**63 + 10, 2**63 + 15, 2**63 + 20, 2**63 + 25],
+            dtype=np.uint64,
+        )
         target = Index(np.arange(10).astype("uint64") * 5 + 2**63)
         indexer = index_large.get_indexer(target)
         expected = np.array([0, -1, 1, 2, 3, 4, -1, -1, -1, -1], dtype=np.intp)
@@ -406,8 +402,9 @@ def test_get_indexer_arrow_dictionary_target(self):
         tm.assert_numpy_array_equal(result, expected)
 
         result_1, result_2 = idx.get_indexer_non_unique(target)
-        expected_1, expected_2 = np.array([0, -1], dtype=np.int64), np.array(
-            [1], dtype=np.int64
+        expected_1, expected_2 = (
+            np.array([0, -1], dtype=np.int64),
+            np.array([1], dtype=np.int64),
         )
         tm.assert_numpy_array_equal(result_1, expected_1)
         tm.assert_numpy_array_equal(result_2, expected_2)
@@ -482,8 +479,7 @@ def test_take_fill_value_float64(self):
         tm.assert_index_equal(result, expected)
 
         msg = (
-            "When allow_fill=True 
and fill_value is not None, " - "all indices must be >= -1" + "When allow_fill=True and fill_value is not None, all indices must be >= -1" ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) diff --git a/pandas/tests/indexes/numeric/test_join.py b/pandas/tests/indexes/numeric/test_join.py index 918d505216735..4737af987a04e 100644 --- a/pandas/tests/indexes/numeric/test_join.py +++ b/pandas/tests/indexes/numeric/test_join.py @@ -21,20 +21,14 @@ def test_join_non_unique(self): tm.assert_numpy_array_equal(ridx, exp_ridx) def test_join_inner(self): - index = Index(range(0, 20, 2), dtype=np.int64) - other = Index([7, 12, 25, 1, 2, 5], dtype=np.int64) - other_mono = Index([1, 2, 5, 7, 12, 25], dtype=np.int64) + index = Index(range(0, 20, 2), dtype=np.int64, name="lhs") + other = Index([7, 12, 25, 1, 2, 5], dtype=np.int64, name="rhs") + other_mono = Index([1, 2, 5, 7, 12, 25], dtype=np.int64, name="rhs") # not monotonic res, lidx, ridx = index.join(other, how="inner", return_indexers=True) - # no guarantee of sortedness, so sort for comparison purposes - ind = res.argsort() - res = res.take(ind) - lidx = lidx.take(ind) - ridx = ridx.take(ind) - - eres = Index([2, 12], dtype=np.int64) + eres = Index([2, 12], dtype=np.int64, name="lhs") elidx = np.array([1, 6], dtype=np.intp) eridx = np.array([4, 1], dtype=np.intp) @@ -46,7 +40,7 @@ def test_join_inner(self): # monotonic res, lidx, ridx = index.join(other_mono, how="inner", return_indexers=True) - res2 = index.intersection(other_mono) + res2 = index.intersection(other_mono).set_names(["lhs"]) tm.assert_index_equal(res, res2) elidx = np.array([1, 6], dtype=np.intp) @@ -57,9 +51,9 @@ def test_join_inner(self): tm.assert_numpy_array_equal(ridx, eridx) def test_join_left(self): - index = Index(range(0, 20, 2), dtype=np.int64) - other = Index([7, 12, 25, 1, 2, 5], dtype=np.int64) - other_mono = Index([1, 2, 5, 7, 12, 25], dtype=np.int64) + index = Index(range(0, 20, 2), dtype=np.int64, name="lhs") + other = Index([7, 12, 25, 1, 2, 5], dtype=np.int64, name="rhs") + other_mono = Index([1, 2, 5, 7, 12, 25], dtype=np.int64, name="rhs") # not monotonic res, lidx, ridx = index.join(other, how="left", return_indexers=True) @@ -80,10 +74,10 @@ def test_join_left(self): tm.assert_numpy_array_equal(ridx, eridx) # non-unique - idx = Index([1, 1, 2, 5]) - idx2 = Index([1, 2, 5, 7, 9]) + idx = Index([1, 1, 2, 5], name="rhs") + idx2 = Index([1, 2, 5, 7, 9], name="lhs") res, lidx, ridx = idx2.join(idx, how="left", return_indexers=True) - eres = Index([1, 1, 2, 5, 7, 9]) # 1 is in idx2, so it should be x2 + eres = Index([1, 1, 2, 5, 7, 9], name="lhs") # 1 is in idx2, so it should be x2 eridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp) elidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp) tm.assert_index_equal(res, eres) @@ -91,9 +85,9 @@ def test_join_left(self): tm.assert_numpy_array_equal(ridx, eridx) def test_join_right(self): - index = Index(range(0, 20, 2), dtype=np.int64) - other = Index([7, 12, 25, 1, 2, 5], dtype=np.int64) - other_mono = Index([1, 2, 5, 7, 12, 25], dtype=np.int64) + index = Index(range(0, 20, 2), dtype=np.int64, name="lhs") + other = Index([7, 12, 25, 1, 2, 5], dtype=np.int64, name="rhs") + other_mono = Index([1, 2, 5, 7, 12, 25], dtype=np.int64, name="rhs") # not monotonic res, lidx, ridx = index.join(other, how="right", return_indexers=True) @@ -115,10 +109,10 @@ def test_join_right(self): assert ridx is None # non-unique - idx = Index([1, 1, 2, 5]) - idx2 = Index([1, 2, 5, 7, 9]) + idx = Index([1, 
1, 2, 5], name="lhs") + idx2 = Index([1, 2, 5, 7, 9], name="rhs") res, lidx, ridx = idx.join(idx2, how="right", return_indexers=True) - eres = Index([1, 1, 2, 5, 7, 9]) # 1 is in idx2, so it should be x2 + eres = Index([1, 1, 2, 5, 7, 9], name="rhs") # 1 is in idx2, so it should be x2 elidx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp) eridx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp) tm.assert_index_equal(res, eres) @@ -313,15 +307,11 @@ def test_join_right(self, index_large): tm.assert_numpy_array_equal(ridx, eridx) def test_join_non_int_index(self, index_large): - other = Index( - 2**63 + np.array([1, 5, 7, 10, 20], dtype="uint64"), dtype=object - ) + other = Index(2**63 + np.array([1, 5, 7, 10, 20], dtype="uint64"), dtype=object) outer = index_large.join(other, how="outer") outer2 = other.join(index_large, how="outer") - expected = Index( - 2**63 + np.array([0, 1, 5, 7, 10, 15, 20, 25], dtype="uint64") - ) + expected = Index(2**63 + np.array([0, 1, 5, 7, 10, 15, 20, 25], dtype="uint64")) tm.assert_index_equal(outer, outer2) tm.assert_index_equal(outer, expected) @@ -353,9 +343,7 @@ def test_join_outer(self, index_large): noidx_res = index_large.join(other, how="outer") tm.assert_index_equal(res, noidx_res) - eres = Index( - 2**63 + np.array([0, 1, 2, 7, 10, 12, 15, 20, 25], dtype="uint64") - ) + eres = Index(2**63 + np.array([0, 1, 2, 7, 10, 12, 15, 20, 25], dtype="uint64")) elidx = np.array([0, -1, -1, -1, 1, -1, 2, 3, 4], dtype=np.intp) eridx = np.array([-1, 3, 4, 0, 5, 1, -1, -1, 2], dtype=np.intp) diff --git a/pandas/tests/indexes/numeric/test_numeric.py b/pandas/tests/indexes/numeric/test_numeric.py index 4fd807e1827dd..8dde5febe810d 100644 --- a/pandas/tests/indexes/numeric/test_numeric.py +++ b/pandas/tests/indexes/numeric/test_numeric.py @@ -15,12 +15,16 @@ def dtype(self, request): return request.param @pytest.fixture - def simple_index(self, dtype): - values = np.arange(5, dtype=dtype) - return Index(values) + def mixed_index(self, dtype): + return Index([1.5, 2, 3, 4, 5], dtype=dtype) + + @pytest.fixture + def float_index(self, dtype): + return Index([0.0, 2.5, 5.0, 7.5, 10.0], dtype=dtype) - @pytest.fixture( - params=[ + @pytest.mark.parametrize( + "index_data", + [ [1.5, 2, 3, 4, 5], [0.0, 2.5, 5.0, 7.5, 10.0], [5, 4, 3, 2, 1.5], @@ -28,18 +32,8 @@ def simple_index(self, dtype): ], ids=["mixed", "float", "mixed_dec", "float_dec"], ) - def index(self, request, dtype): - return Index(request.param, dtype=dtype) - - @pytest.fixture - def mixed_index(self, dtype): - return Index([1.5, 2, 3, 4, 5], dtype=dtype) - - @pytest.fixture - def float_index(self, dtype): - return Index([0.0, 2.5, 5.0, 7.5, 10.0], dtype=dtype) - - def test_repr_roundtrip(self, index): + def test_repr_roundtrip(self, index_data, dtype): + index = Index(index_data, dtype=dtype) tm.assert_index_equal(eval(repr(index)), index, exact=True) def check_coerce(self, a, b, is_float_index=True): @@ -227,8 +221,8 @@ def test_fillna_float64(self): exp = Index([1.0, "obj", 3.0], name="x") tm.assert_index_equal(idx.fillna("obj"), exp, exact=True) - def test_logical_compat(self, simple_index): - idx = simple_index + def test_logical_compat(self, dtype): + idx = Index(np.arange(5, dtype=dtype)) assert idx.all() == idx.values.all() assert idx.any() == idx.values.any() @@ -237,9 +231,9 @@ def test_logical_compat(self, simple_index): class TestNumericInt: - @pytest.fixture(params=[np.int64, np.int32, np.int16, np.int8, np.uint64]) - def dtype(self, request): - return request.param + @pytest.fixture + def dtype(self, 
any_int_numpy_dtype): + return np.dtype(any_int_numpy_dtype) @pytest.fixture def simple_index(self, dtype): @@ -318,8 +312,11 @@ def test_cant_or_shouldnt_cast(self, dtype): def test_view_index(self, simple_index): index = simple_index - msg = "Passing a type in .*Index.view is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): + msg = ( + "Cannot change data-type for array of references.|" + "Cannot change data-type for object array.|" + ) + with pytest.raises(TypeError, match=msg): index.view(Index) def test_prevent_casting(self, simple_index): @@ -400,7 +397,7 @@ def test_constructor_corner(self, dtype): # preventing casting arr = np.array([1, "2", 3, "4"], dtype=object) - msg = "Trying to coerce float values to integers" + msg = "Trying to coerce object values to integers" with pytest.raises(ValueError, match=msg): index_cls(arr, dtype=dtype) diff --git a/pandas/tests/indexes/numeric/test_setops.py b/pandas/tests/indexes/numeric/test_setops.py index 376b51dd98bb1..5d3981dbf93d0 100644 --- a/pandas/tests/indexes/numeric/test_setops.py +++ b/pandas/tests/indexes/numeric/test_setops.py @@ -41,7 +41,7 @@ def test_intersection(self): other = Index([1, 2, 3, 4, 5]) result = index.intersection(other) - expected = Index(np.sort(np.intersect1d(index.values, other.values))) + expected = Index(range(1, 5)) tm.assert_index_equal(result, expected) result = other.intersection(index) @@ -96,7 +96,11 @@ def test_float64_index_difference(self): result = string_index.difference(float_index) tm.assert_index_equal(result, string_index) - def test_intersection_uint64_outside_int64_range(self, index_large): + def test_intersection_uint64_outside_int64_range(self): + index_large = Index( + [2**63, 2**63 + 10, 2**63 + 15, 2**63 + 20, 2**63 + 25], + dtype=np.uint64, + ) other = Index([2**63, 2**63 + 5, 2**63 + 10, 2**63 + 15, 2**63 + 20]) result = index_large.intersection(other) expected = Index(np.sort(np.intersect1d(index_large.values, other.values))) @@ -109,13 +113,14 @@ def test_intersection_uint64_outside_int64_range(self, index_large): tm.assert_index_equal(result, expected) @pytest.mark.parametrize( - "index2,keeps_name", + "index2_name,keeps_name", [ - (Index([4, 7, 6, 5, 3], name="index"), True), - (Index([4, 7, 6, 5, 3], name="other"), False), + ("index", True), + ("other", False), ], ) - def test_intersection_monotonic(self, index2, keeps_name, sort): + def test_intersection_monotonic(self, index2_name, keeps_name, sort): + index2 = Index([4, 7, 6, 5, 3], name=index2_name) index1 = Index([5, 3, 2, 4, 1], name="index") expected = Index([5, 3, 4]) diff --git a/pandas/tests/indexes/object/test_astype.py b/pandas/tests/indexes/object/test_astype.py index 9c1ef302c5b51..7e0de138aacfb 100644 --- a/pandas/tests/indexes/object/test_astype.py +++ b/pandas/tests/indexes/object/test_astype.py @@ -3,25 +3,7 @@ from pandas import ( Index, NaT, - Series, ) -import pandas._testing as tm - - -def test_astype_str_from_bytes(): - # https://github.com/pandas-dev/pandas/issues/38607 - # GH#49658 pre-2.0 Index called .values.astype(str) here, which effectively - # did a .decode() on the bytes object. 
In 2.0 we go through - # ensure_string_array which does f"{val}" - idx = Index(["あ", b"a"], dtype="object") - result = idx.astype(str) - expected = Index(["あ", "a"], dtype="object") - tm.assert_index_equal(result, expected) - - # while we're here, check that Series.astype behaves the same - result = Series(idx).astype(str) - expected = Series(expected, dtype=object) - tm.assert_series_equal(result, expected) def test_astype_invalid_nas_to_tdt64_raises(): diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index ebf9dac715f8d..2c5968314e5cf 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -3,13 +3,8 @@ import numpy as np import pytest -from pandas._libs.missing import ( - NA, - is_matching_na, -) -import pandas.util._test_decorators as td +from pandas._libs.missing import is_matching_na -import pandas as pd from pandas import Index import pandas._testing as tm @@ -18,46 +13,36 @@ class TestGetIndexer: @pytest.mark.parametrize( "method,expected", [ - ("pad", np.array([-1, 0, 1, 1], dtype=np.intp)), - ("backfill", np.array([0, 0, 1, -1], dtype=np.intp)), + ("pad", [-1, 0, 1, 1]), + ("backfill", [0, 0, 1, -1]), ], ) def test_get_indexer_strings(self, method, expected): - index = Index(["b", "c"]) + expected = np.array(expected, dtype=np.intp) + index = Index(["b", "c"], dtype=object) actual = index.get_indexer(["a", "b", "c", "d"], method=method) tm.assert_numpy_array_equal(actual, expected) - def test_get_indexer_strings_raises(self, using_infer_string): - index = Index(["b", "c"]) + def test_get_indexer_strings_raises(self): + index = Index(["b", "c"], dtype=object) - if using_infer_string: - import pyarrow as pa - - msg = "has no kernel" - with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): - index.get_indexer(["a", "b", "c", "d"], method="nearest") - - with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): - index.get_indexer(["a", "b", "c", "d"], method="pad", tolerance=2) - - with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): - index.get_indexer( - ["a", "b", "c", "d"], method="pad", tolerance=[2, 2, 2, 2] - ) - - else: - msg = r"unsupported operand type\(s\) for -: 'str' and 'str'" - with pytest.raises(TypeError, match=msg): - index.get_indexer(["a", "b", "c", "d"], method="nearest") + msg = "|".join( + [ + "operation 'sub' not supported for dtype 'str'", + r"unsupported operand type\(s\) for -: 'str' and 'str'", + ] + ) + with pytest.raises(TypeError, match=msg): + index.get_indexer(["a", "b", "c", "d"], method="nearest") - with pytest.raises(TypeError, match=msg): - index.get_indexer(["a", "b", "c", "d"], method="pad", tolerance=2) + with pytest.raises(TypeError, match=msg): + index.get_indexer(["a", "b", "c", "d"], method="pad", tolerance=2) - with pytest.raises(TypeError, match=msg): - index.get_indexer( - ["a", "b", "c", "d"], method="pad", tolerance=[2, 2, 2, 2] - ) + with pytest.raises(TypeError, match=msg): + index.get_indexer( + ["a", "b", "c", "d"], method="pad", tolerance=[2, 2, 2, 2] + ) def test_get_indexer_with_NA_values( self, unique_nulls_fixture, unique_nulls_fixture2 @@ -77,15 +62,20 @@ def test_get_indexer_with_NA_values( expected = np.array([0, 1, -1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) + def test_get_indexer_infer_string_missing_values(self): + # ensure the passed list is not cast to string but to object so that + # the None value is matched in the index + # 
https://github.com/pandas-dev/pandas/issues/55834 + idx = Index(["a", "b", None], dtype="object") + result = idx.get_indexer([None, "x"]) + expected = np.array([2, -1], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + class TestGetIndexerNonUnique: - def test_get_indexer_non_unique_nas( - self, nulls_fixture, request, using_infer_string - ): + def test_get_indexer_non_unique_nas(self, nulls_fixture): # even though this isn't non-unique, this should still work - if using_infer_string and (nulls_fixture is None or nulls_fixture is NA): - request.applymarker(pytest.mark.xfail(reason="NAs are cast to NaN")) - index = Index(["a", "b", nulls_fixture]) + index = Index(["a", "b", nulls_fixture], dtype=object) indexer, missing = index.get_indexer_non_unique([nulls_fixture]) expected_indexer = np.array([2], dtype=np.intp) @@ -94,7 +84,7 @@ def test_get_indexer_non_unique_nas( tm.assert_numpy_array_equal(missing, expected_missing) # actually non-unique - index = Index(["a", nulls_fixture, "b", nulls_fixture]) + index = Index(["a", nulls_fixture, "b", nulls_fixture], dtype=object) indexer, missing = index.get_indexer_non_unique([nulls_fixture]) expected_indexer = np.array([1, 3], dtype=np.intp) @@ -103,10 +93,10 @@ def test_get_indexer_non_unique_nas( # matching-but-not-identical nans if is_matching_na(nulls_fixture, float("NaN")): - index = Index(["a", float("NaN"), "b", float("NaN")]) + index = Index(["a", float("NaN"), "b", float("NaN")], dtype=object) match_but_not_identical = True elif is_matching_na(nulls_fixture, Decimal("NaN")): - index = Index(["a", Decimal("NaN"), "b", Decimal("NaN")]) + index = Index(["a", Decimal("NaN"), "b", Decimal("NaN")], dtype=object) match_but_not_identical = True else: match_but_not_identical = False @@ -167,67 +157,3 @@ def test_get_indexer_non_unique_np_nats(self, np_nat_fixture, np_nat_fixture2): expected_indexer = np.array([1, 3], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected_indexer) tm.assert_numpy_array_equal(missing, expected_missing) - - -class TestSliceLocs: - @pytest.mark.parametrize( - "dtype", - [ - "object", - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - ], - ) - @pytest.mark.parametrize( - "in_slice,expected", - [ - # error: Slice index must be an integer or None - (pd.IndexSlice[::-1], "yxdcb"), - (pd.IndexSlice["b":"y":-1], ""), # type: ignore[misc] - (pd.IndexSlice["b"::-1], "b"), # type: ignore[misc] - (pd.IndexSlice[:"b":-1], "yxdcb"), # type: ignore[misc] - (pd.IndexSlice[:"y":-1], "y"), # type: ignore[misc] - (pd.IndexSlice["y"::-1], "yxdcb"), # type: ignore[misc] - (pd.IndexSlice["y"::-4], "yb"), # type: ignore[misc] - # absent labels - (pd.IndexSlice[:"a":-1], "yxdcb"), # type: ignore[misc] - (pd.IndexSlice[:"a":-2], "ydb"), # type: ignore[misc] - (pd.IndexSlice["z"::-1], "yxdcb"), # type: ignore[misc] - (pd.IndexSlice["z"::-3], "yc"), # type: ignore[misc] - (pd.IndexSlice["m"::-1], "dcb"), # type: ignore[misc] - (pd.IndexSlice[:"m":-1], "yx"), # type: ignore[misc] - (pd.IndexSlice["a":"a":-1], ""), # type: ignore[misc] - (pd.IndexSlice["z":"z":-1], ""), # type: ignore[misc] - (pd.IndexSlice["m":"m":-1], ""), # type: ignore[misc] - ], - ) - def test_slice_locs_negative_step(self, in_slice, expected, dtype): - index = Index(list("bcdxy"), dtype=dtype) - - s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, in_slice.step) - result = index[s_start : s_stop : in_slice.step] - expected = Index(list(expected), dtype=dtype) - tm.assert_index_equal(result, expected) - - 
@td.skip_if_no("pyarrow") - def test_slice_locs_negative_step_oob(self): - index = Index(list("bcdxy"), dtype="string[pyarrow_numpy]") - - result = index[-10:5:1] - tm.assert_index_equal(result, index) - - result = index[4:-10:-1] - expected = Index(list("yxdcb"), dtype="string[pyarrow_numpy]") - tm.assert_index_equal(result, expected) - - def test_slice_locs_dup(self): - index = Index(["a", "a", "b", "c", "d", "d"]) - assert index.slice_locs("a", "d") == (0, 6) - assert index.slice_locs(end="d") == (0, 6) - assert index.slice_locs("a", "c") == (0, 4) - assert index.slice_locs("b", "d") == (2, 6) - - index2 = index[::-1] - assert index2.slice_locs("d", "a") == (0, 6) - assert index2.slice_locs(end="a") == (0, 6) - assert index2.slice_locs("d", "b") == (0, 4) - assert index2.slice_locs("c", "a") == (2, 6) diff --git a/pandas/tests/indexes/period/methods/test_asfreq.py b/pandas/tests/indexes/period/methods/test_asfreq.py index ed078a3e8fb8b..8fca53c28a036 100644 --- a/pandas/tests/indexes/period/methods/test_asfreq.py +++ b/pandas/tests/indexes/period/methods/test_asfreq.py @@ -1,3 +1,5 @@ +import re + import pytest from pandas import ( @@ -7,6 +9,8 @@ ) import pandas._testing as tm +from pandas.tseries import offsets + class TestPeriodIndex: def test_asfreq(self): @@ -136,3 +140,43 @@ def test_asfreq_with_different_n(self): excepted = Series([1, 2], index=PeriodIndex(["2020-02", "2020-04"], freq="M")) tm.assert_series_equal(result, excepted) + + @pytest.mark.parametrize( + "freq", + [ + "2BMS", + "2YS-MAR", + "2bh", + offsets.MonthBegin(2), + offsets.BusinessMonthEnd(2), + ], + ) + def test_pi_asfreq_not_supported_frequency(self, freq): + # GH#55785, GH#56945 + msg = "|".join( + [ + f"Invalid frequency: {freq}", + re.escape(f"{freq} is not supported as period frequency"), + "bh is not supported as period frequency", + ] + ) + + pi = PeriodIndex(["2020-01-01", "2021-01-01"], freq="M") + with pytest.raises(ValueError, match=msg): + pi.asfreq(freq=freq) + + @pytest.mark.parametrize( + "freq", + [ + "2BME", + "2YE-MAR", + "2QE", + ], + ) + def test_pi_asfreq_invalid_frequency(self, freq): + # GH#55785 + msg = f"Invalid frequency: {freq}" + + pi = PeriodIndex(["2020-01-01", "2021-01-01"], freq="M") + with pytest.raises(ValueError, match=msg): + pi.asfreq(freq=freq) diff --git a/pandas/tests/indexes/period/methods/test_astype.py b/pandas/tests/indexes/period/methods/test_astype.py index d545bfd2fae0f..af3c2667f51b4 100644 --- a/pandas/tests/indexes/period/methods/test_astype.py +++ b/pandas/tests/indexes/period/methods/test_astype.py @@ -22,7 +22,7 @@ def test_astype_raises(self, dtype): with pytest.raises(TypeError, match=msg): idx.astype(dtype) - def test_astype_conversion(self): + def test_astype_conversion(self, using_infer_string): # GH#13149, GH#13209 idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.nan], freq="D", name="idx") @@ -41,7 +41,12 @@ def test_astype_conversion(self): tm.assert_index_equal(result, expected) result = idx.astype(str) - expected = Index([str(x) for x in idx], name="idx", dtype=object) + if using_infer_string: + expected = Index( + [str(x) if x is not NaT else None for x in idx], name="idx", dtype="str" + ) + else: + expected = Index([str(x) for x in idx], name="idx", dtype=object) tm.assert_index_equal(result, expected) idx = period_range("1990", "2009", freq="Y", name="idx") diff --git a/pandas/tests/indexes/period/methods/test_to_timestamp.py b/pandas/tests/indexes/period/methods/test_to_timestamp.py index 3867f9e3245dc..4fe429ce71ee4 100644 --- 
a/pandas/tests/indexes/period/methods/test_to_timestamp.py +++ b/pandas/tests/indexes/period/methods/test_to_timestamp.py @@ -140,3 +140,10 @@ def test_to_timestamp_1703(self): result = index.to_timestamp() assert result[0] == Timestamp("1/1/2012") + + +def test_ms_to_timestamp_error_message(): + # https://github.com/pandas-dev/pandas/issues/58974#issuecomment-2164265446 + ix = period_range("2000", periods=3, freq="M") + with pytest.raises(ValueError, match="for Period, please use 'M' instead of 'MS'"): + ix.to_timestamp("MS") diff --git a/pandas/tests/indexes/period/test_constructors.py b/pandas/tests/indexes/period/test_constructors.py index 387dc47c48d20..be07a71b283fd 100644 --- a/pandas/tests/indexes/period/test_constructors.py +++ b/pandas/tests/indexes/period/test_constructors.py @@ -26,11 +26,14 @@ class TestPeriodIndexDisallowedFreqs: ("2M", "2ME"), ("2Q-MAR", "2QE-MAR"), ("2Y-FEB", "2YE-FEB"), + ("2M", "2me"), + ("2Q-MAR", "2qe-MAR"), + ("2Y-FEB", "2yE-feb"), ], ) - def test_period_index_frequency_ME_error_message(self, freq, freq_depr): + def test_period_index_offsets_frequency_error_message(self, freq, freq_depr): # GH#52064 - msg = f"for Period, please use '{freq[1:]}' instead of '{freq_depr[1:]}'" + msg = f"Invalid frequency: {freq_depr}" with pytest.raises(ValueError, match=msg): PeriodIndex(["2020-01-01", "2020-01-02"], freq=freq_depr) @@ -38,55 +41,71 @@ def test_period_index_frequency_ME_error_message(self, freq, freq_depr): with pytest.raises(ValueError, match=msg): period_range(start="2020-01-01", end="2020-01-02", freq=freq_depr) - @pytest.mark.parametrize("freq_depr", ["2SME", "2CBME", "2BYE"]) - def test_period_index_frequency_invalid_freq(self, freq_depr): + @pytest.mark.parametrize( + "freq", + ["2SME", "2sme", "2BYE", "2Bye", "2CBME"], + ) + def test_period_index_frequency_invalid_freq(self, freq): # GH#9586 - msg = f"Invalid frequency: {freq_depr[1:]}" + msg = f"Invalid frequency: {freq}" + + with pytest.raises(ValueError, match=msg): + period_range("2020-01", "2020-05", freq=freq) + with pytest.raises(ValueError, match=msg): + PeriodIndex(["2020-01", "2020-05"], freq=freq) + + @pytest.mark.parametrize("freq", ["2BQE-SEP", "2BYE-MAR", "2BME"]) + def test_period_index_from_datetime_index_invalid_freq(self, freq): + # GH#56899 + msg = f"Invalid frequency: {freq}" + + rng = date_range("01-Jan-2012", periods=8, freq=freq) + with pytest.raises(ValueError, match=msg): + rng.to_period() + + @pytest.mark.parametrize("freq_depr", ["2T", "1l", "2U", "n"]) + def test_period_index_T_L_U_N_raises(self, freq_depr): + # GH#9586 + msg = f"Invalid frequency: {freq_depr}" with pytest.raises(ValueError, match=msg): period_range("2020-01", "2020-05", freq=freq_depr) with pytest.raises(ValueError, match=msg): PeriodIndex(["2020-01", "2020-05"], freq=freq_depr) + @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") + @pytest.mark.filterwarnings("ignore:Period with BDay freq:FutureWarning") + @pytest.mark.parametrize( + "freq,freq_depr", + [("2W", "2w"), ("2W-FRI", "2w-fri"), ("2D", "2d"), ("2B", "2b")], + ) + def test_period_index_depr_lowercase_frequency(self, freq, freq_depr): + # GH#58998 + msg = ( + f"'{freq_depr[1:]}' is deprecated and will be removed in a future version." 
+ ) -class TestPeriodIndex: - def test_from_ordinals(self): - Period(ordinal=-1000, freq="Y") - Period(ordinal=0, freq="Y") - - msg = "The 'ordinal' keyword in PeriodIndex is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): - idx1 = PeriodIndex(ordinal=[-1, 0, 1], freq="Y") - with tm.assert_produces_warning(FutureWarning, match=msg): - idx2 = PeriodIndex(ordinal=np.array([-1, 0, 1]), freq="Y") - tm.assert_index_equal(idx1, idx2) + result = PeriodIndex(["2020-01-01", "2020-01-02"], freq=freq_depr) - alt1 = PeriodIndex.from_ordinals([-1, 0, 1], freq="Y") - tm.assert_index_equal(alt1, idx1) + expected = PeriodIndex(["2020-01-01", "2020-01-02"], freq=freq) + tm.assert_index_equal(result, expected) - alt2 = PeriodIndex.from_ordinals(np.array([-1, 0, 1]), freq="Y") - tm.assert_index_equal(alt2, idx2) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = period_range(start="2020-01-01", end="2020-01-02", freq=freq_depr) - def test_keyword_mismatch(self): - # GH#55961 we should get exactly one of data/ordinals/**fields - per = Period("2016-01-01", "D") - depr_msg1 = "The 'ordinal' keyword in PeriodIndex is deprecated" - depr_msg2 = "Constructing PeriodIndex from fields is deprecated" + expected = period_range(start="2020-01-01", end="2020-01-02", freq=freq) + tm.assert_index_equal(result, expected) - err_msg1 = "Cannot pass both data and ordinal" - with pytest.raises(ValueError, match=err_msg1): - with tm.assert_produces_warning(FutureWarning, match=depr_msg1): - PeriodIndex(data=[per], ordinal=[per.ordinal], freq=per.freq) - err_msg2 = "Cannot pass both data and fields" - with pytest.raises(ValueError, match=err_msg2): - with tm.assert_produces_warning(FutureWarning, match=depr_msg2): - PeriodIndex(data=[per], year=[per.year], freq=per.freq) +class TestPeriodIndex: + def test_from_ordinals(self): + Period(ordinal=-1000, freq="Y") + Period(ordinal=0, freq="Y") - err_msg3 = "Cannot pass both ordinal and fields" - with pytest.raises(ValueError, match=err_msg3): - with tm.assert_produces_warning(FutureWarning, match=depr_msg2): - PeriodIndex(ordinal=[per.ordinal], year=[per.year], freq=per.freq) + idx1 = PeriodIndex.from_ordinals(ordinals=[-1, 0, 1], freq="Y") + idx2 = PeriodIndex.from_ordinals(ordinals=np.array([-1, 0, 1]), freq="Y") + tm.assert_index_equal(idx1, idx2) def test_construction_base_constructor(self): # GH 13664 @@ -146,18 +165,14 @@ def test_constructor_field_arrays(self): years = np.arange(1990, 2010).repeat(4)[2:-2] quarters = np.tile(np.arange(1, 5), 20)[2:-2] - depr_msg = "Constructing PeriodIndex from fields is deprecated" - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - index = PeriodIndex(year=years, quarter=quarters, freq="Q-DEC") + index = PeriodIndex.from_fields(year=years, quarter=quarters, freq="Q-DEC") expected = period_range("1990Q3", "2009Q2", freq="Q-DEC") tm.assert_index_equal(index, expected) - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - index2 = PeriodIndex(year=years, quarter=quarters, freq="2Q-DEC") + index2 = PeriodIndex.from_fields(year=years, quarter=quarters, freq="2Q-DEC") tm.assert_numpy_array_equal(index.asi8, index2.asi8) - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - index = PeriodIndex(year=years, quarter=quarters) + index = PeriodIndex.from_fields(year=years, quarter=quarters) tm.assert_index_equal(index, expected) years = [2007, 2007, 2007] @@ -165,16 +180,13 @@ def test_constructor_field_arrays(self): msg = "Mismatched Period array lengths" with 
pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - PeriodIndex(year=years, month=months, freq="M") + PeriodIndex.from_fields(year=years, month=months, freq="M") with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - PeriodIndex(year=years, month=months, freq="2M") + PeriodIndex.from_fields(year=years, month=months, freq="2M") years = [2007, 2007, 2007] months = [1, 2, 3] - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - idx = PeriodIndex(year=years, month=months, freq="M") + idx = PeriodIndex.from_fields(year=years, month=months, freq="M") exp = period_range("2007-01", periods=3, freq="M") tm.assert_index_equal(idx, exp) @@ -198,32 +210,22 @@ def test_constructor_nano(self): def test_constructor_arrays_negative_year(self): years = np.arange(1960, 2000, dtype=np.int64).repeat(4) quarters = np.tile(np.array([1, 2, 3, 4], dtype=np.int64), 40) - - msg = "Constructing PeriodIndex from fields is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - pindex = PeriodIndex(year=years, quarter=quarters) + pindex = PeriodIndex.from_fields(year=years, quarter=quarters) tm.assert_index_equal(pindex.year, Index(years)) tm.assert_index_equal(pindex.quarter, Index(quarters)) - alt = PeriodIndex.from_fields(year=years, quarter=quarters) - tm.assert_index_equal(alt, pindex) - def test_constructor_invalid_quarters(self): - depr_msg = "Constructing PeriodIndex from fields is deprecated" msg = "Quarter must be 1 <= q <= 4" with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - PeriodIndex( - year=range(2000, 2004), quarter=list(range(4)), freq="Q-DEC" - ) + PeriodIndex.from_fields( + year=range(2000, 2004), quarter=list(range(4)), freq="Q-DEC" + ) def test_period_range_fractional_period(self): - msg = "Non-integer 'periods' in pd.date_range, pd.timedelta_range" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = period_range("2007-01", periods=10.5, freq="M") - exp = period_range("2007-01", periods=10, freq="M") - tm.assert_index_equal(result, exp) + msg = "periods must be an integer, got 10.5" + with pytest.raises(TypeError, match=msg): + period_range("2007-01", periods=10.5, freq="M") def test_constructor_with_without_freq(self): # GH53687 @@ -422,9 +424,7 @@ def test_constructor_floats(self, floats): def test_constructor_year_and_quarter(self): year = Series([2001, 2002, 2003]) quarter = year - 2000 - msg = "Constructing PeriodIndex from fields is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - idx = PeriodIndex(year=year, quarter=quarter) + idx = PeriodIndex.from_fields(year=year, quarter=quarter) strs = [f"{t[0]:d}Q{t[1]:d}" for t in zip(quarter, year)] lops = list(map(Period, strs)) p = PeriodIndex(lops) @@ -538,7 +538,9 @@ def test_period_range_length(self): assert i1.freq == end_intv.freq assert i1[-1] == end_intv - end_intv = Period("2006-12-31", "1w") + msg = "'w' is deprecated and will be removed in a future version." 
+ with tm.assert_produces_warning(FutureWarning, match=msg): + end_intv = Period("2006-12-31", "1w") i2 = period_range(end=end_intv, periods=10) assert len(i1) == len(i2) assert (i1 == i2).all() @@ -567,7 +569,7 @@ def test_mixed_freq_raises(self): with tm.assert_produces_warning(FutureWarning, match=msg): end_intv = Period("2005-05-01", "B") - vals = [end_intv, Period("2006-12-31", "w")] + vals = [end_intv, Period("2006-12-31", "W")] msg = r"Input has different freq=W-SUN from PeriodIndex\(freq=B\)" depr_msg = r"PeriodDtype\[B\] is deprecated" with pytest.raises(IncompatibleFrequency, match=msg): diff --git a/pandas/tests/indexes/period/test_formats.py b/pandas/tests/indexes/period/test_formats.py index 81c79f7d18f2f..dc95e19523842 100644 --- a/pandas/tests/indexes/period/test_formats.py +++ b/pandas/tests/indexes/period/test_formats.py @@ -1,10 +1,3 @@ -from contextlib import nullcontext -from datetime import ( - datetime, - time, -) -import locale - import numpy as np import pytest @@ -16,13 +9,6 @@ import pandas._testing as tm -def get_local_am_pm(): - """Return the AM and PM strings returned by strftime in current locale.""" - am_local = time(1).strftime("%p") - pm_local = time(13).strftime("%p") - return am_local, pm_local - - def test_get_values_for_csv(): index = PeriodIndex(["2017-01-01", "2017-01-02", "2017-01-03"], freq="D") @@ -56,15 +42,6 @@ def test_get_values_for_csv(): class TestPeriodIndexRendering: - def test_format_empty(self): - # GH#35712 - empty_idx = PeriodIndex([], freq="Y") - msg = r"PeriodIndex\.format is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - assert empty_idx.format() == [] - with tm.assert_produces_warning(FutureWarning, match=msg): - assert empty_idx.format(name=True) == [""] - @pytest.mark.parametrize("method", ["__repr__", "__str__"]) def test_representation(self, method): # GH#7601 @@ -86,8 +63,7 @@ def test_representation(self, method): exp3 = "PeriodIndex(['2011-01-01', '2011-01-02'], dtype='period[D]')" exp4 = ( - "PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'], " - "dtype='period[D]')" + "PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'], dtype='period[D]')" ) exp5 = "PeriodIndex(['2011', '2012', '2013'], dtype='period[Y-DEC]')" @@ -215,136 +191,3 @@ def test_summary(self): ): result = idx._summary() assert result == expected - - -class TestPeriodIndexFormat: - def test_period_format_and_strftime_default(self): - per = PeriodIndex([datetime(2003, 1, 1, 12), None], freq="h") - - # Default formatting - msg = "PeriodIndex.format is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - formatted = per.format() - assert formatted[0] == "2003-01-01 12:00" # default: minutes not shown - assert formatted[1] == "NaT" - # format is equivalent to strftime(None)... 
- assert formatted[0] == per.strftime(None)[0] - assert per.strftime(None)[1] is np.nan # ...except for NaTs - - # Same test with nanoseconds freq - per = pd.period_range("2003-01-01 12:01:01.123456789", periods=2, freq="ns") - with tm.assert_produces_warning(FutureWarning, match=msg): - formatted = per.format() - assert (formatted == per.strftime(None)).all() - assert formatted[0] == "2003-01-01 12:01:01.123456789" - assert formatted[1] == "2003-01-01 12:01:01.123456790" - - def test_period_custom(self): - # GH#46252 custom formatting directives %l (ms) and %u (us) - msg = "PeriodIndex.format is deprecated" - - # 3 digits - per = pd.period_range("2003-01-01 12:01:01.123", periods=2, freq="ms") - with tm.assert_produces_warning(FutureWarning, match=msg): - formatted = per.format(date_format="%y %I:%M:%S (ms=%l us=%u ns=%n)") - assert formatted[0] == "03 12:01:01 (ms=123 us=123000 ns=123000000)" - assert formatted[1] == "03 12:01:01 (ms=124 us=124000 ns=124000000)" - - # 6 digits - per = pd.period_range("2003-01-01 12:01:01.123456", periods=2, freq="us") - with tm.assert_produces_warning(FutureWarning, match=msg): - formatted = per.format(date_format="%y %I:%M:%S (ms=%l us=%u ns=%n)") - assert formatted[0] == "03 12:01:01 (ms=123 us=123456 ns=123456000)" - assert formatted[1] == "03 12:01:01 (ms=123 us=123457 ns=123457000)" - - # 9 digits - per = pd.period_range("2003-01-01 12:01:01.123456789", periods=2, freq="ns") - with tm.assert_produces_warning(FutureWarning, match=msg): - formatted = per.format(date_format="%y %I:%M:%S (ms=%l us=%u ns=%n)") - assert formatted[0] == "03 12:01:01 (ms=123 us=123456 ns=123456789)" - assert formatted[1] == "03 12:01:01 (ms=123 us=123456 ns=123456790)" - - def test_period_tz(self): - # Formatting periods created from a datetime with timezone. - msg = r"PeriodIndex\.format is deprecated" - # This timestamp is in 2013 in Europe/Paris but is 2012 in UTC - dt = pd.to_datetime(["2013-01-01 00:00:00+01:00"], utc=True) - - # Converting to a period looses the timezone information - # Since tz is currently set as utc, we'll see 2012 - with tm.assert_produces_warning(UserWarning, match="will drop timezone"): - per = dt.to_period(freq="h") - with tm.assert_produces_warning(FutureWarning, match=msg): - assert per.format()[0] == "2012-12-31 23:00" - - # If tz is currently set as paris before conversion, we'll see 2013 - dt = dt.tz_convert("Europe/Paris") - with tm.assert_produces_warning(UserWarning, match="will drop timezone"): - per = dt.to_period(freq="h") - with tm.assert_produces_warning(FutureWarning, match=msg): - assert per.format()[0] == "2013-01-01 00:00" - - @pytest.mark.parametrize( - "locale_str", - [ - pytest.param(None, id=str(locale.getlocale())), - "it_IT.utf8", - "it_IT", # Note: encoding will be 'ISO8859-1' - "zh_CN.utf8", - "zh_CN", # Note: encoding will be 'gb2312' - ], - ) - def test_period_non_ascii_fmt(self, locale_str): - # GH#46468 non-ascii char in input format string leads to wrong output - - # Skip if locale cannot be set - if locale_str is not None and not tm.can_set_locale(locale_str, locale.LC_ALL): - pytest.skip(f"Skipping as locale '{locale_str}' cannot be set on host.") - - # Change locale temporarily for this test. 
- with tm.set_locale(locale_str, locale.LC_ALL) if locale_str else nullcontext(): - # Scalar - per = pd.Period("2018-03-11 13:00", freq="h") - assert per.strftime("%y é") == "18 é" - - # Index - per = pd.period_range("2003-01-01 01:00:00", periods=2, freq="12h") - msg = "PeriodIndex.format is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - formatted = per.format(date_format="%y é") - assert formatted[0] == "03 é" - assert formatted[1] == "03 é" - - @pytest.mark.parametrize( - "locale_str", - [ - pytest.param(None, id=str(locale.getlocale())), - "it_IT.utf8", - "it_IT", # Note: encoding will be 'ISO8859-1' - "zh_CN.utf8", - "zh_CN", # Note: encoding will be 'gb2312' - ], - ) - def test_period_custom_locale_directive(self, locale_str): - # GH#46319 locale-specific directive leads to non-utf8 c strftime char* result - - # Skip if locale cannot be set - if locale_str is not None and not tm.can_set_locale(locale_str, locale.LC_ALL): - pytest.skip(f"Skipping as locale '{locale_str}' cannot be set on host.") - - # Change locale temporarily for this test. - with tm.set_locale(locale_str, locale.LC_ALL) if locale_str else nullcontext(): - # Get locale-specific reference - am_local, pm_local = get_local_am_pm() - - # Scalar - per = pd.Period("2018-03-11 13:00", freq="h") - assert per.strftime("%p") == pm_local - - # Index - per = pd.period_range("2003-01-01 01:00:00", periods=2, freq="12h") - msg = "PeriodIndex.format is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - formatted = per.format(date_format="%y %I:%M:%S%p") - assert formatted[0] == f"03 01:00:00{am_local}" - assert formatted[1] == f"03 01:00:00{pm_local}" diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py index 2683e25eda618..00e8262ddfa4c 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ b/pandas/tests/indexes/period/test_indexing.py @@ -700,8 +700,7 @@ def test_take_fill_value(self): tm.assert_index_equal(result, expected) msg = ( - "When allow_fill=True and fill_value is not None, " - "all indices must be >= -1" + "When allow_fill=True and fill_value is not None, all indices must be >= -1" ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) diff --git a/pandas/tests/indexes/period/test_partial_slicing.py b/pandas/tests/indexes/period/test_partial_slicing.py index 4fab12f195dc0..8d173d850583f 100644 --- a/pandas/tests/indexes/period/test_partial_slicing.py +++ b/pandas/tests/indexes/period/test_partial_slicing.py @@ -12,9 +12,7 @@ class TestPeriodIndex: - def test_getitem_periodindex_duplicates_string_slice( - self, using_copy_on_write, warn_copy_on_write - ): + def test_getitem_periodindex_duplicates_string_slice(self): # monotonic idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq="Y-JUN") ts = Series(np.random.default_rng(2).standard_normal(len(idx)), index=idx) @@ -23,12 +21,8 @@ def test_getitem_periodindex_duplicates_string_slice( result = ts["2007"] expected = ts[1:3] tm.assert_series_equal(result, expected) - with tm.assert_cow_warning(warn_copy_on_write): - result[:] = 1 - if using_copy_on_write: - tm.assert_series_equal(ts, original) - else: - assert (ts[1:3] == 1).all() + result[:] = 1 + tm.assert_series_equal(ts, original) # not monotonic idx = PeriodIndex([2000, 2007, 2007, 2009, 2007], freq="Y-JUN") diff --git a/pandas/tests/indexes/period/test_period_range.py b/pandas/tests/indexes/period/test_period_range.py index 2543b49089948..85af43e7d2e5e 100644 --- 
a/pandas/tests/indexes/period/test_period_range.py +++ b/pandas/tests/indexes/period/test_period_range.py @@ -70,7 +70,7 @@ def test_start_end_non_nat(self): def test_periods_requires_integer(self): # invalid periods param - msg = "periods must be a number, got foo" + msg = "periods must be an integer, got foo" with pytest.raises(TypeError, match=msg): period_range(start="2017Q1", periods="foo") @@ -181,8 +181,8 @@ def test_construction_from_period(self): def test_mismatched_start_end_freq_raises(self): depr_msg = "Period with BDay freq is deprecated" - end_w = Period("2006-12-31", "1w") + end_w = Period("2006-12-31", "1W") with tm.assert_produces_warning(FutureWarning, match=depr_msg): start_b = Period("02-Apr-2005", "B") end_b = Period("2005-05-01", "B") @@ -203,19 +203,39 @@ def test_constructor_U(self): with pytest.raises(ValueError, match="Invalid frequency: X"): period_range("2007-1-1", periods=500, freq="X") - def test_H_deprecated_from_time_series(self): - # GH#52536 - msg = "'H' is deprecated and will be removed in a future version." + @pytest.mark.parametrize("freq_depr", ["2MIN", "2US", "2NS"]) + def test_uppercase_freq_deprecated_from_time_series(self, freq_depr): + # GH#52536, GH#54939 + msg = ( + f"'{freq_depr[1:]}' is deprecated and will be removed in a " + f"future version, please use '{freq_depr.lower()[1:]}' instead." + ) + with tm.assert_produces_warning(FutureWarning, match=msg): - period_range(freq="2H", start="1/1/2001", end="12/1/2009") + period_range("2020-01-01 00:00:00 00:00", periods=2, freq=freq_depr) + + @pytest.mark.parametrize("freq", ["2m", "2q-sep", "2y", "2H", "2S"]) + def test_incorrect_case_freq_from_time_series_raises(self, freq): + # GH#52536, GH#54939, GH#59143 + msg = f"Invalid frequency: {freq}" + + with pytest.raises(ValueError, match=msg): + period_range(freq=freq, start="1/1/2001", end="12/1/2009") + + @pytest.mark.parametrize("freq", ["2A", "2a", "2A-AUG", "2A-aug"]) + def test_A_raises_from_time_series(self, freq): + msg = f"Invalid frequency: {freq}" - @pytest.mark.parametrize("freq_depr", ["2A", "A-DEC", "200A-AUG"]) - def test_a_deprecated_from_time_series(self, freq_depr): - # GH#52536 - freq_msg = freq_depr[freq_depr.index("A") :] + with pytest.raises(ValueError, match=msg): + period_range(freq=freq, start="1/1/2001", end="12/1/2009") + + @pytest.mark.parametrize("freq", ["2w"]) + def test_lowercase_freq_from_time_series_deprecated(self, freq): + # GH#52536, GH#54939 msg = ( - f"'{freq_msg}' is deprecated and will be removed in a future version, " - f"please use 'Y{freq_msg[1:]}' instead." + f"'{freq[1:]}' is deprecated and will be removed in a " + f"future version, please use '{freq.upper()[1:]}' instead." 
) + with tm.assert_produces_warning(FutureWarning, match=msg): - period_range(freq=freq_depr, start="1/1/2001", end="12/1/2009") + period_range(freq=freq, start="1/1/2001", end="12/1/2009") diff --git a/pandas/tests/indexes/ranges/test_join.py b/pandas/tests/indexes/ranges/test_join.py index 682b5c8def9ff..09db30b1d4c51 100644 --- a/pandas/tests/indexes/ranges/test_join.py +++ b/pandas/tests/indexes/ranges/test_join.py @@ -1,4 +1,5 @@ import numpy as np +import pytest from pandas import ( Index, @@ -175,3 +176,60 @@ def test_join_self(self, join_type): index = RangeIndex(start=0, stop=20, step=2) joined = index.join(index, how=join_type) assert index is joined + + +@pytest.mark.parametrize( + "left, right, expected, expected_lidx, expected_ridx, how", + [ + [RangeIndex(2), RangeIndex(3), RangeIndex(2), None, [0, 1], "left"], + [RangeIndex(2), RangeIndex(2), RangeIndex(2), None, None, "left"], + [RangeIndex(2), RangeIndex(20, 22), RangeIndex(2), None, [-1, -1], "left"], + [RangeIndex(2), RangeIndex(3), RangeIndex(3), [0, 1, -1], None, "right"], + [RangeIndex(2), RangeIndex(2), RangeIndex(2), None, None, "right"], + [ + RangeIndex(2), + RangeIndex(20, 22), + RangeIndex(20, 22), + [-1, -1], + None, + "right", + ], + [RangeIndex(2), RangeIndex(3), RangeIndex(2), [0, 1], [0, 1], "inner"], + [RangeIndex(2), RangeIndex(2), RangeIndex(2), None, None, "inner"], + [RangeIndex(2), RangeIndex(1, 3), RangeIndex(1, 2), [1], [0], "inner"], + [RangeIndex(2), RangeIndex(3), RangeIndex(3), [0, 1, -1], [0, 1, 2], "outer"], + [RangeIndex(2), RangeIndex(2), RangeIndex(2), None, None, "outer"], + [ + RangeIndex(2), + RangeIndex(2, 4), + RangeIndex(4), + [0, 1, -1, -1], + [-1, -1, 0, 1], + "outer", + ], + [RangeIndex(2), RangeIndex(0), RangeIndex(2), None, [-1, -1], "left"], + [RangeIndex(2), RangeIndex(0), RangeIndex(0), [], None, "right"], + [RangeIndex(2), RangeIndex(0), RangeIndex(0), [], None, "inner"], + [RangeIndex(2), RangeIndex(0), RangeIndex(2), None, [-1, -1], "outer"], + ], +) +@pytest.mark.parametrize( + "right_type", [RangeIndex, lambda x: Index(list(x), dtype=x.dtype)] +) +def test_join_preserves_rangeindex( + left, right, expected, expected_lidx, expected_ridx, how, right_type +): + result, lidx, ridx = left.join(right_type(right), how=how, return_indexers=True) + tm.assert_index_equal(result, expected, exact=True) + + if expected_lidx is None: + assert lidx is expected_lidx + else: + exp_lidx = np.array(expected_lidx, dtype=np.intp) + tm.assert_numpy_array_equal(lidx, exp_lidx) + + if expected_ridx is None: + assert ridx is expected_ridx + else: + exp_ridx = np.array(expected_ridx, dtype=np.intp) + tm.assert_numpy_array_equal(ridx, exp_ridx) diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index 06e19eeca6766..1f9df30d60c11 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -9,6 +9,7 @@ RangeIndex, ) import pandas._testing as tm +from pandas.core.indexes.range import min_fitting_element class TestRangeIndex: @@ -199,11 +200,6 @@ def test_view(self): i_view = i.view("i8") tm.assert_numpy_array_equal(i.values, i_view) - msg = "Passing a type in RangeIndex.view is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - i_view = i.view(RangeIndex) - tm.assert_index_equal(i, i_view) - def test_dtype(self, simple_index): index = simple_index assert index.dtype == np.int64 @@ -242,11 +238,6 @@ def test_cache(self): pass assert idx._cache == {} - msg = "RangeIndex.format is 
deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - idx.format() - assert idx._cache == {} - df = pd.DataFrame({"a": range(10)}, index=idx) # df.__repr__ should not populate index cache @@ -384,8 +375,11 @@ def test_cant_or_shouldnt_cast(self, start, stop, step): def test_view_index(self, simple_index): index = simple_index - msg = "Passing a type in RangeIndex.view is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): + msg = ( + "Cannot change data-type for array of references.|" + "Cannot change data-type for object array.|" + ) + with pytest.raises(TypeError, match=msg): index.view(Index) def test_prevent_casting(self, simple_index): @@ -424,21 +418,21 @@ def test_extended_gcd(self, simple_index): assert 2 == result[0] def test_min_fitting_element(self): - result = RangeIndex(0, 20, 2)._min_fitting_element(1) + result = min_fitting_element(0, 2, 1) assert 2 == result - result = RangeIndex(1, 6)._min_fitting_element(1) + result = min_fitting_element(1, 1, 1) assert 1 == result - result = RangeIndex(18, -2, -2)._min_fitting_element(1) + result = min_fitting_element(18, -2, 1) assert 2 == result - result = RangeIndex(5, 0, -1)._min_fitting_element(1) + result = min_fitting_element(5, -1, 1) assert 1 == result big_num = 500000000000000000000000 - result = RangeIndex(5, big_num * 2, 1)._min_fitting_element(big_num) + result = min_fitting_element(5, 1, big_num) assert big_num == result def test_slice_specialised(self, simple_index): @@ -570,15 +564,6 @@ def test_engineless_lookup(self): assert "_engine" not in idx._cache - def test_format_empty(self): - # GH35712 - empty_idx = RangeIndex(0) - msg = r"RangeIndex\.format is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - assert empty_idx.format() == [] - with tm.assert_produces_warning(FutureWarning, match=msg): - assert empty_idx.format(name=True) == [""] - @pytest.mark.parametrize( "ri", [ @@ -620,3 +605,305 @@ def test_range_index_rsub_by_const(self): result = 3 - RangeIndex(0, 4, 1) expected = RangeIndex(3, -1, -1) tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize( + "rng, decimals", + [ + [range(5), 0], + [range(5), 2], + [range(10, 30, 10), -1], + [range(30, 10, -10), -1], + ], +) +def test_range_round_returns_rangeindex(rng, decimals): + ri = RangeIndex(rng) + expected = ri.copy() + result = ri.round(decimals=decimals) + tm.assert_index_equal(result, expected, exact=True) + + +@pytest.mark.parametrize( + "rng, decimals", + [ + [range(10, 30, 1), -1], + [range(30, 10, -1), -1], + [range(11, 14), -10], + ], +) +def test_range_round_returns_index(rng, decimals): + ri = RangeIndex(rng) + expected = Index(list(rng)).round(decimals=decimals) + result = ri.round(decimals=decimals) + tm.assert_index_equal(result, expected, exact=True) + + +def test_reindex_1_value_returns_rangeindex(): + ri = RangeIndex(0, 10, 2, name="foo") + result, result_indexer = ri.reindex([2]) + expected = RangeIndex(2, 4, 2, name="foo") + tm.assert_index_equal(result, expected, exact=True) + + expected_indexer = np.array([1], dtype=np.intp) + tm.assert_numpy_array_equal(result_indexer, expected_indexer) + + +def test_reindex_empty_returns_rangeindex(): + ri = RangeIndex(0, 10, 2, name="foo") + result, result_indexer = ri.reindex([]) + expected = RangeIndex(0, 0, 2, name="foo") + tm.assert_index_equal(result, expected, exact=True) + + expected_indexer = np.array([], dtype=np.intp) + tm.assert_numpy_array_equal(result_indexer, expected_indexer) + + +def test_insert_empty_0_loc(): + 
ri = RangeIndex(0, step=10, name="foo") + result = ri.insert(0, 5) + expected = RangeIndex(5, 15, 10, name="foo") + tm.assert_index_equal(result, expected, exact=True) + + +def test_append_non_rangeindex_return_rangeindex(): + ri = RangeIndex(1) + result = ri.append(Index([1])) + expected = RangeIndex(2) + tm.assert_index_equal(result, expected, exact=True) + + +def test_append_non_rangeindex_return_index(): + ri = RangeIndex(1) + result = ri.append(Index([1, 3, 4])) + expected = Index([0, 1, 3, 4]) + tm.assert_index_equal(result, expected, exact=True) + + +def test_reindex_returns_rangeindex(): + ri = RangeIndex(2, name="foo") + result, result_indexer = ri.reindex([1, 2, 3]) + expected = RangeIndex(1, 4, name="foo") + tm.assert_index_equal(result, expected, exact=True) + + expected_indexer = np.array([1, -1, -1], dtype=np.intp) + tm.assert_numpy_array_equal(result_indexer, expected_indexer) + + +def test_reindex_returns_index(): + ri = RangeIndex(4, name="foo") + result, result_indexer = ri.reindex([0, 1, 3]) + expected = Index([0, 1, 3], name="foo") + tm.assert_index_equal(result, expected, exact=True) + + expected_indexer = np.array([0, 1, 3], dtype=np.intp) + tm.assert_numpy_array_equal(result_indexer, expected_indexer) + + +def test_take_return_rangeindex(): + ri = RangeIndex(5, name="foo") + result = ri.take([]) + expected = RangeIndex(0, name="foo") + tm.assert_index_equal(result, expected, exact=True) + + result = ri.take([3, 4]) + expected = RangeIndex(3, 5, name="foo") + tm.assert_index_equal(result, expected, exact=True) + + +@pytest.mark.parametrize( + "rng, exp_rng", + [ + [range(5), range(3, 4)], + [range(0, -10, -2), range(-6, -8, -2)], + [range(0, 10, 2), range(6, 8, 2)], + ], +) +def test_take_1_value_returns_rangeindex(rng, exp_rng): + ri = RangeIndex(rng, name="foo") + result = ri.take([3]) + expected = RangeIndex(exp_rng, name="foo") + tm.assert_index_equal(result, expected, exact=True) + + +def test_append_one_nonempty_preserve_step(): + expected = RangeIndex(0, -1, -1) + result = RangeIndex(0).append([expected]) + tm.assert_index_equal(result, expected, exact=True) + + +def test_getitem_boolmask_all_true(): + ri = RangeIndex(3, name="foo") + expected = ri.copy() + result = ri[[True] * 3] + tm.assert_index_equal(result, expected, exact=True) + + +def test_getitem_boolmask_all_false(): + ri = RangeIndex(3, name="foo") + result = ri[[False] * 3] + expected = RangeIndex(0, name="foo") + tm.assert_index_equal(result, expected, exact=True) + + +def test_getitem_boolmask_returns_rangeindex(): + ri = RangeIndex(3, name="foo") + result = ri[[False, True, True]] + expected = RangeIndex(1, 3, name="foo") + tm.assert_index_equal(result, expected, exact=True) + + result = ri[[True, False, True]] + expected = RangeIndex(0, 3, 2, name="foo") + tm.assert_index_equal(result, expected, exact=True) + + +def test_getitem_boolmask_returns_index(): + ri = RangeIndex(4, name="foo") + result = ri[[True, True, False, True]] + expected = Index([0, 1, 3], name="foo") + tm.assert_index_equal(result, expected) + + +def test_getitem_boolmask_wrong_length(): + ri = RangeIndex(4, name="foo") + with pytest.raises(IndexError, match="Boolean index has wrong length"): + ri[[True]] + + +def test_pos_returns_rangeindex(): + ri = RangeIndex(2, name="foo") + expected = ri.copy() + result = +ri + tm.assert_index_equal(result, expected, exact=True) + + +def test_neg_returns_rangeindex(): + ri = RangeIndex(2, name="foo") + result = -ri + expected = RangeIndex(0, -2, -1, name="foo") + 
tm.assert_index_equal(result, expected, exact=True) + + ri = RangeIndex(-2, 2, name="foo") + result = -ri + expected = RangeIndex(2, -2, -1, name="foo") + tm.assert_index_equal(result, expected, exact=True) + + +@pytest.mark.parametrize( + "rng, exp_rng", + [ + [range(0), range(0)], + [range(10), range(10)], + [range(-2, 1, 1), range(2, -1, -1)], + [range(0, -10, -1), range(0, 10, 1)], + ], +) +def test_abs_returns_rangeindex(rng, exp_rng): + ri = RangeIndex(rng, name="foo") + expected = RangeIndex(exp_rng, name="foo") + result = abs(ri) + tm.assert_index_equal(result, expected, exact=True) + + +def test_abs_returns_index(): + ri = RangeIndex(-2, 2, name="foo") + result = abs(ri) + expected = Index([2, 1, 0, 1], name="foo") + tm.assert_index_equal(result, expected, exact=True) + + +@pytest.mark.parametrize( + "rng", + [ + range(0), + range(5), + range(0, -5, -1), + range(-2, 2, 1), + range(2, -2, -2), + range(0, 5, 2), + ], +) +def test_invert_returns_rangeindex(rng): + ri = RangeIndex(rng, name="foo") + result = ~ri + assert isinstance(result, RangeIndex) + expected = ~Index(list(rng), name="foo") + tm.assert_index_equal(result, expected, exact=False) + + +@pytest.mark.parametrize( + "rng", + [ + range(0, 5, 1), + range(0, 5, 2), + range(10, 15, 1), + range(10, 5, -1), + range(10, 5, -2), + range(5, 0, -1), + ], +) +@pytest.mark.parametrize("meth", ["argmax", "argmin"]) +def test_arg_min_max(rng, meth): + ri = RangeIndex(rng) + idx = Index(list(rng)) + assert getattr(ri, meth)() == getattr(idx, meth)() + + +@pytest.mark.parametrize("meth", ["argmin", "argmax"]) +def test_empty_argmin_argmax_raises(meth): + with pytest.raises(ValueError, match=f"attempt to get {meth} of an empty sequence"): + getattr(RangeIndex(0), meth)() + + +def test_getitem_integers_return_rangeindex(): + result = RangeIndex(0, 10, 2, name="foo")[[0, -1]] + expected = RangeIndex(start=0, stop=16, step=8, name="foo") + tm.assert_index_equal(result, expected, exact=True) + + result = RangeIndex(0, 10, 2, name="foo")[[3]] + expected = RangeIndex(start=6, stop=8, step=2, name="foo") + tm.assert_index_equal(result, expected, exact=True) + + +def test_getitem_empty_return_rangeindex(): + result = RangeIndex(0, 10, 2, name="foo")[[]] + expected = RangeIndex(start=0, stop=0, step=1, name="foo") + tm.assert_index_equal(result, expected, exact=True) + + +def test_getitem_integers_return_index(): + result = RangeIndex(0, 10, 2, name="foo")[[0, 1, -1]] + expected = Index([0, 2, 8], dtype="int64", name="foo") + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize("normalize", [True, False]) +@pytest.mark.parametrize( + "rng", + [ + range(3), + range(0), + range(0, 3, 2), + range(3, -3, -2), + ], +) +def test_value_counts(sort, dropna, ascending, normalize, rng): + ri = RangeIndex(rng, name="A") + result = ri.value_counts( + normalize=normalize, sort=sort, ascending=ascending, dropna=dropna + ) + expected = Index(list(rng), name="A").value_counts( + normalize=normalize, sort=sort, ascending=ascending, dropna=dropna + ) + tm.assert_series_equal(result, expected, check_index_type=False) + + +@pytest.mark.parametrize("side", ["left", "right"]) +@pytest.mark.parametrize("value", [0, -5, 5, -3, np.array([-5, -3, 0, 5])]) +def test_searchsorted(side, value): + ri = RangeIndex(-3, 3, 2) + result = ri.searchsorted(value=value, side=side) + expected = Index(list(ri)).searchsorted(value=value, side=side) + if isinstance(value, int): + assert result == expected + else: + tm.assert_numpy_array_equal(result, expected) diff --git 
a/pandas/tests/indexes/ranges/test_setops.py b/pandas/tests/indexes/ranges/test_setops.py index d417b8b743dc5..ac24ff828cb8f 100644 --- a/pandas/tests/indexes/ranges/test_setops.py +++ b/pandas/tests/indexes/ranges/test_setops.py @@ -93,12 +93,12 @@ def test_intersection(self, sort): # GH 17296: intersect two decreasing RangeIndexes first = RangeIndex(10, -2, -2) other = RangeIndex(5, -4, -1) - expected = first.astype(int).intersection(other.astype(int), sort=sort) - result = first.intersection(other, sort=sort).astype(int) + expected = RangeIndex(start=4, stop=-2, step=-2) + result = first.intersection(other, sort=sort) tm.assert_index_equal(result, expected) # reversed - result = other.intersection(first, sort=sort).astype(int) + result = other.intersection(first, sort=sort) tm.assert_index_equal(result, expected) index = RangeIndex(5, name="foo") diff --git a/pandas/tests/indexes/string/__init__.py b/pandas/tests/indexes/string/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/indexes/string/test_astype.py b/pandas/tests/indexes/string/test_astype.py new file mode 100644 index 0000000000000..0349d85f23167 --- /dev/null +++ b/pandas/tests/indexes/string/test_astype.py @@ -0,0 +1,21 @@ +from pandas import ( + Index, + Series, +) +import pandas._testing as tm + + +def test_astype_str_from_bytes(): + # https://github.com/pandas-dev/pandas/issues/38607 + # GH#49658 pre-2.0 Index called .values.astype(str) here, which effectively + # did a .decode() on the bytes object. In 2.0 we go through + # ensure_string_array which does f"{val}" + idx = Index(["あ", b"a"], dtype="object") + result = idx.astype(str) + expected = Index(["あ", "a"], dtype="str") + tm.assert_index_equal(result, expected) + + # while we're here, check that Series.astype behaves the same + result = Series(idx).astype(str) + expected = Series(expected, dtype="str") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/string/test_indexing.py b/pandas/tests/indexes/string/test_indexing.py new file mode 100644 index 0000000000000..648ee47ddc34c --- /dev/null +++ b/pandas/tests/indexes/string/test_indexing.py @@ -0,0 +1,199 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import Index +import pandas._testing as tm + + +def _isnan(val): + try: + return val is not pd.NA and np.isnan(val) + except TypeError: + return False + + +def _equivalent_na(dtype, null): + if dtype.na_value is pd.NA and null is pd.NA: + return True + elif _isnan(dtype.na_value) and _isnan(null): + return True + else: + return False + + +class TestGetLoc: + def test_get_loc(self, any_string_dtype): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + assert index.get_loc("b") == 1 + + def test_get_loc_raises(self, any_string_dtype): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + with pytest.raises(KeyError, match="d"): + index.get_loc("d") + + def test_get_loc_invalid_value(self, any_string_dtype): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + with pytest.raises(KeyError, match="1"): + index.get_loc(1) + + def test_get_loc_non_unique(self, any_string_dtype): + index = Index(["a", "b", "a"], dtype=any_string_dtype) + result = index.get_loc("a") + expected = np.array([True, False, True]) + tm.assert_numpy_array_equal(result, expected) + + def test_get_loc_non_missing(self, any_string_dtype, nulls_fixture): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + with pytest.raises(KeyError): + index.get_loc(nulls_fixture) + + def 
test_get_loc_missing(self, any_string_dtype, nulls_fixture): + index = Index(["a", "b", nulls_fixture], dtype=any_string_dtype) + assert index.get_loc(nulls_fixture) == 2 + + +class TestGetIndexer: + @pytest.mark.parametrize( + "method,expected", + [ + ("pad", [-1, 0, 1, 1]), + ("backfill", [0, 0, 1, -1]), + ], + ) + def test_get_indexer_strings(self, any_string_dtype, method, expected): + expected = np.array(expected, dtype=np.intp) + index = Index(["b", "c"], dtype=any_string_dtype) + actual = index.get_indexer(["a", "b", "c", "d"], method=method) + + tm.assert_numpy_array_equal(actual, expected) + + def test_get_indexer_strings_raises(self, any_string_dtype): + index = Index(["b", "c"], dtype=any_string_dtype) + + msg = "|".join( + [ + "operation 'sub' not supported for dtype 'str", + r"unsupported operand type\(s\) for -: 'str' and 'str'", + ] + ) + with pytest.raises(TypeError, match=msg): + index.get_indexer(["a", "b", "c", "d"], method="nearest") + + with pytest.raises(TypeError, match=msg): + index.get_indexer(["a", "b", "c", "d"], method="pad", tolerance=2) + + with pytest.raises(TypeError, match=msg): + index.get_indexer( + ["a", "b", "c", "d"], method="pad", tolerance=[2, 2, 2, 2] + ) + + @pytest.mark.parametrize("null", [None, np.nan, float("nan"), pd.NA]) + def test_get_indexer_missing(self, any_string_dtype, null, using_infer_string): + # NaT and Decimal("NaN") from null_fixture are not supported for string dtype + index = Index(["a", "b", null], dtype=any_string_dtype) + result = index.get_indexer(["a", null, "c"]) + if using_infer_string: + expected = np.array([0, 2, -1], dtype=np.intp) + elif any_string_dtype == "string" and not _equivalent_na( + any_string_dtype, null + ): + expected = np.array([0, -1, -1], dtype=np.intp) + else: + expected = np.array([0, 2, -1], dtype=np.intp) + + tm.assert_numpy_array_equal(result, expected) + + +class TestGetIndexerNonUnique: + @pytest.mark.parametrize("null", [None, np.nan, float("nan"), pd.NA]) + def test_get_indexer_non_unique_nas( + self, any_string_dtype, null, using_infer_string + ): + index = Index(["a", "b", null], dtype=any_string_dtype) + indexer, missing = index.get_indexer_non_unique(["a", null]) + + if using_infer_string: + expected_indexer = np.array([0, 2], dtype=np.intp) + expected_missing = np.array([], dtype=np.intp) + elif any_string_dtype == "string" and not _equivalent_na( + any_string_dtype, null + ): + expected_indexer = np.array([0, -1], dtype=np.intp) + expected_missing = np.array([1], dtype=np.intp) + else: + expected_indexer = np.array([0, 2], dtype=np.intp) + expected_missing = np.array([], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected_indexer) + tm.assert_numpy_array_equal(missing, expected_missing) + + # actually non-unique + index = Index(["a", null, "b", null], dtype=any_string_dtype) + indexer, missing = index.get_indexer_non_unique(["a", null]) + + if using_infer_string: + expected_indexer = np.array([0, 1, 3], dtype=np.intp) + elif any_string_dtype == "string" and not _equivalent_na( + any_string_dtype, null + ): + pass + else: + expected_indexer = np.array([0, 1, 3], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected_indexer) + tm.assert_numpy_array_equal(missing, expected_missing) + + +class TestSliceLocs: + @pytest.mark.parametrize( + "in_slice,expected", + [ + # error: Slice index must be an integer or None + (pd.IndexSlice[::-1], "yxdcb"), + (pd.IndexSlice["b":"y":-1], ""), # type: ignore[misc] + (pd.IndexSlice["b"::-1], "b"), # type: ignore[misc] + 
(pd.IndexSlice[:"b":-1], "yxdcb"), # type: ignore[misc] + (pd.IndexSlice[:"y":-1], "y"), # type: ignore[misc] + (pd.IndexSlice["y"::-1], "yxdcb"), # type: ignore[misc] + (pd.IndexSlice["y"::-4], "yb"), # type: ignore[misc] + # absent labels + (pd.IndexSlice[:"a":-1], "yxdcb"), # type: ignore[misc] + (pd.IndexSlice[:"a":-2], "ydb"), # type: ignore[misc] + (pd.IndexSlice["z"::-1], "yxdcb"), # type: ignore[misc] + (pd.IndexSlice["z"::-3], "yc"), # type: ignore[misc] + (pd.IndexSlice["m"::-1], "dcb"), # type: ignore[misc] + (pd.IndexSlice[:"m":-1], "yx"), # type: ignore[misc] + (pd.IndexSlice["a":"a":-1], ""), # type: ignore[misc] + (pd.IndexSlice["z":"z":-1], ""), # type: ignore[misc] + (pd.IndexSlice["m":"m":-1], ""), # type: ignore[misc] + ], + ) + def test_slice_locs_negative_step(self, in_slice, expected, any_string_dtype): + index = Index(list("bcdxy"), dtype=any_string_dtype) + + s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, in_slice.step) + result = index[s_start : s_stop : in_slice.step] + expected = Index(list(expected), dtype=any_string_dtype) + tm.assert_index_equal(result, expected) + + def test_slice_locs_negative_step_oob(self, any_string_dtype): + index = Index(list("bcdxy"), dtype=any_string_dtype) + + result = index[-10:5:1] + tm.assert_index_equal(result, index) + + result = index[4:-10:-1] + expected = Index(list("yxdcb"), dtype=any_string_dtype) + tm.assert_index_equal(result, expected) + + def test_slice_locs_dup(self, any_string_dtype): + index = Index(["a", "a", "b", "c", "d", "d"], dtype=any_string_dtype) + assert index.slice_locs("a", "d") == (0, 6) + assert index.slice_locs(end="d") == (0, 6) + assert index.slice_locs("a", "c") == (0, 4) + assert index.slice_locs("b", "d") == (2, 6) + + index2 = index[::-1] + assert index2.slice_locs("d", "a") == (0, 6) + assert index2.slice_locs(end="a") == (0, 6) + assert index2.slice_locs("d", "b") == (0, 4) + assert index2.slice_locs("c", "a") == (2, 6) diff --git a/pandas/tests/indexes/test_any_index.py b/pandas/tests/indexes/test_any_index.py index 10204cfb78e89..a4c18732ef258 100644 --- a/pandas/tests/indexes/test_any_index.py +++ b/pandas/tests/indexes/test_any_index.py @@ -1,6 +1,7 @@ """ Tests that can be parametrized over _any_ Index object. """ + import re import numpy as np @@ -21,12 +22,6 @@ def test_boolean_context_compat(index): bool(index) -def test_sort(index): - msg = "cannot sort an Index object in-place, use sort_values instead" - with pytest.raises(TypeError, match=msg): - index.sort() - - def test_hash_error(index): with pytest.raises(TypeError, match=f"unhashable type: '{type(index).__name__}'"): hash(index) @@ -45,7 +40,7 @@ def test_map_identity_mapping(index, request): # GH#12766 result = index.map(lambda x: x) - if index.dtype == object and result.dtype == bool: + if index.dtype == object and (result.dtype == bool or result.dtype == "string"): assert (index == result).all() # TODO: could work that into the 'exact="equiv"'? return # FIXME: doesn't belong in this file anymore! 
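
Illustrative sketch (not part of the patch): the test_map_identity_mapping hunk above now also accepts a "string" result dtype because, with pandas' string inference enabled, an identity map over an object-dtype Index of strings re-infers the result dtype, so only elementwise value equality is guaranteed. A minimal repro, assuming pandas >= 2.1 where the "future.infer_string" option exists:

    import pandas as pd

    pd.set_option("future.infer_string", True)  # assumption: pandas >= 2.1

    idx = pd.Index(["a", "b"], dtype=object)
    result = idx.map(lambda x: x)

    # The result dtype may be re-inferred as a string dtype rather than
    # object, so the dtype need not round-trip through map()...
    print(idx.dtype, result.dtype)

    # ...but the values still compare equal elementwise, which is what the
    # updated test asserts.
    assert (idx == result).all()
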
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 666d92064c86c..5b75bd9afd6df 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -71,15 +71,15 @@ def test_constructor_casting(self, index): tm.assert_contains_all(arr, new_index) tm.assert_index_equal(index, new_index) - @pytest.mark.parametrize("index", ["string"], indirect=True) - def test_constructor_copy(self, index, using_infer_string): + def test_constructor_copy(self, using_infer_string): + index = Index(list("abc"), name="name") arr = np.array(index) new_index = Index(arr, copy=True, name="name") assert isinstance(new_index, Index) assert new_index.name == "name" if using_infer_string: tm.assert_extension_array_equal( - new_index.values, pd.array(arr, dtype="string[pyarrow_numpy]") + new_index.values, pd.array(arr, dtype="str") ) else: tm.assert_numpy_array_equal(arr, new_index.values) @@ -104,16 +104,9 @@ def test_constructor_copy(self, index, using_infer_string): ) def test_constructor_from_index_dtlike(self, cast_as_obj, index): if cast_as_obj: - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - result = Index(index.astype(object)) - else: - result = Index(index) - - tm.assert_index_equal(result, index) - - if isinstance(index, DatetimeIndex): - assert result.tz == index.tz - if cast_as_obj: + result = Index(index.astype(object)) + assert result.dtype == np.dtype(object) + if isinstance(index, DatetimeIndex): # GH#23524 check that Index(dti, dtype=object) does not # incorrectly raise ValueError, and that nanoseconds are not # dropped @@ -121,6 +114,10 @@ def test_constructor_from_index_dtlike(self, cast_as_obj, index): result = Index(index, dtype=object) assert result.dtype == np.object_ assert list(result) == list(index) + else: + result = Index(index) + + tm.assert_index_equal(result, index) @pytest.mark.parametrize( "index,has_tz", @@ -160,7 +157,7 @@ def test_constructor_from_frame_series_freq(self, using_infer_string): df = DataFrame(np.random.default_rng(2).random((5, 3))) df["date"] = dts result = DatetimeIndex(df["date"], freq="MS") - dtype = object if not using_infer_string else "string" + dtype = object if not using_infer_string else "str" assert df["date"].dtype == dtype expected.name = "date" tm.assert_index_equal(result, expected) @@ -186,7 +183,7 @@ def test_constructor_int_dtype_nan(self): "klass,dtype,na_val", [ (Index, np.float64, np.nan), - (DatetimeIndex, "datetime64[ns]", pd.NaT), + (DatetimeIndex, "datetime64[s]", pd.NaT), ], ) def test_index_ctor_infer_nan_nat(self, klass, dtype, na_val): @@ -354,11 +351,12 @@ def test_view_with_args_object_array_raises(self, index): msg = "When changing to a larger dtype" with pytest.raises(ValueError, match=msg): index.view("i8") - elif index.dtype == "string": - with pytest.raises(NotImplementedError, match="i8"): - index.view("i8") else: - msg = "Cannot change data-type for object array" + msg = ( + r"Cannot change data-type for array of references\.|" + r"Cannot change data-type for object array\.|" + r"Cannot change data-type for array of strings\.|" + ) with pytest.raises(TypeError, match=msg): index.view("i8") @@ -478,7 +476,7 @@ def test_empty_fancy(self, index, dtype, request, using_infer_string): assert index[[]].identical(empty_index) if dtype == np.bool_: - with tm.assert_produces_warning(FutureWarning, match="is deprecated"): + with pytest.raises(ValueError, match="length of the boolean indexer"): assert index[empty_arr].identical(empty_index) else: assert 
index[empty_arr].identical(empty_index) @@ -636,9 +634,7 @@ def test_append_empty_preserve_name(self, name, expected): left = Index([], name="foo") right = Index([1, 2, 3], name=name) - msg = "The behavior of array concatenation with empty entries is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = left.append(right) + result = left.append(right) assert result.name == expected @pytest.mark.parametrize( @@ -686,47 +682,11 @@ def test_is_object(self, index, expected, using_infer_string): def test_summary(self, index): index._summary() - def test_format_bug(self): - # GH 14626 - # windows has different precision on datetime.datetime.now (it doesn't - # include us since the default for Timestamp shows these but Index - # formatting does not we are skipping) - now = datetime.now() - msg = r"Index\.format is deprecated" - - if not str(now).endswith("000"): - index = Index([now]) - with tm.assert_produces_warning(FutureWarning, match=msg): - formatted = index.format() - expected = [str(index[0])] - assert formatted == expected - - with tm.assert_produces_warning(FutureWarning, match=msg): - Index([]).format() - - @pytest.mark.parametrize("vals", [[1, 2.0 + 3.0j, 4.0], ["a", "b", "c"]]) - def test_format_missing(self, vals, nulls_fixture): - # 2845 - vals = list(vals) # Copy for each iteration - vals.append(nulls_fixture) - index = Index(vals, dtype=object) - # TODO: case with complex dtype? - - msg = r"Index\.format is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - formatted = index.format() - null_repr = "NaN" if isinstance(nulls_fixture, float) else str(nulls_fixture) - expected = [str(index[0]), str(index[1]), str(index[2]), null_repr] - - assert formatted == expected - assert index[3] is nulls_fixture - - @pytest.mark.parametrize("op", ["any", "all"]) - def test_logical_compat(self, op, simple_index): + def test_logical_compat(self, all_boolean_reductions, simple_index): index = simple_index - left = getattr(index, op)() - assert left == getattr(index.values, op)() - right = getattr(index.to_series(), op)() + left = getattr(index, all_boolean_reductions)() + assert left == getattr(index.values, all_boolean_reductions)() + right = getattr(index.to_series(), all_boolean_reductions)() # left might not match right exactly in e.g. 
string cases
         # because we use np.any/all instead of .any/all
         assert bool(left) == bool(right)
@@ -845,12 +805,14 @@ def test_is_monotonic_incomparable(self, attr):
     @pytest.mark.parametrize(
         "index,expected",
         [
-            (Index(["qux", "baz", "foo", "bar"]), np.array([False, False, True, True])),
-            (Index([]), np.array([], dtype=bool)),  # empty
+            (["qux", "baz", "foo", "bar"], [False, False, True, True]),
+            ([], []),  # empty
         ],
     )
     def test_isin(self, values, index, expected):
+        index = Index(index)
         result = index.isin(values)
+        expected = np.array(expected, dtype=bool)
         tm.assert_numpy_array_equal(result, expected)

     def test_isin_nan_common_object(
@@ -898,7 +860,7 @@ def test_isin_nan_common_float64(self, nulls_fixture, float_numpy_dtype):
         # and 2) that with an NaN we do not have .isin(nulls_fixture)
         msg = (
             r"float\(\) argument must be a string or a (real )?number, "
-            f"not {repr(type(nulls_fixture).__name__)}"
+            f"not {type(nulls_fixture).__name__!r}"
         )
         with pytest.raises(TypeError, match=msg):
             Index([1.0, nulls_fixture], dtype=dtype)
@@ -919,11 +881,12 @@ def test_isin_nan_common_float64(self, nulls_fixture, float_numpy_dtype):
     @pytest.mark.parametrize(
         "index",
         [
-            Index(["qux", "baz", "foo", "bar"]),
-            Index([1.0, 2.0, 3.0, 4.0], dtype=np.float64),
+            ["qux", "baz", "foo", "bar"],
+            np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float64),
         ],
     )
     def test_isin_level_kwarg(self, level, index):
+        index = Index(index)
         values = index.tolist()[-2:] + ["nonexisting"]

         expected = np.array([False, False, True, True])
@@ -957,10 +920,9 @@ def test_isin_empty(self, empty):
         result = index.isin(empty)
         tm.assert_numpy_array_equal(expected, result)

-    @td.skip_if_no("pyarrow")
-    def test_isin_arrow_string_null(self):
+    def test_isin_string_null(self, string_dtype_no_object):
         # GH#55821
-        index = Index(["a", "b"], dtype="string[pyarrow_numpy]")
+        index = Index(["a", "b"], dtype=string_dtype_no_object)
         result = index.isin([None])
         expected = np.array([False, False])
         tm.assert_numpy_array_equal(result, expected)
@@ -996,6 +958,31 @@ def test_slice_keep_name(self):
         index = Index(["a", "b"], name="asdf")
         assert index.name == index[1:].name

+    def test_slice_is_unique(self):
+        # GH 57911
+        index = Index([1, 1, 2, 3, 4])
+        assert not index.is_unique
+        filtered_index = index[2:].copy()
+        assert filtered_index.is_unique
+
+    def test_slice_is_monotonic(self):
+        """Test that is_monotonic_decreasing is correct on slices."""
+        # GH 57911
+        index = Index([1, 2, 3, 3])
+        assert not index.is_monotonic_decreasing
+
+        filtered_index = index[2:].copy()
+        assert filtered_index.is_monotonic_decreasing
+        assert filtered_index.is_monotonic_increasing
+
+        filtered_index = index[1:].copy()
+        assert not filtered_index.is_monotonic_decreasing
+        assert filtered_index.is_monotonic_increasing
+
+        filtered_index = index[:].copy()
+        assert not filtered_index.is_monotonic_decreasing
+        assert filtered_index.is_monotonic_increasing
+
     @pytest.mark.parametrize(
         "index",
         [
@@ -1079,10 +1066,11 @@ def test_str_bool_series_indexing(self):
         tm.assert_series_equal(result, expected)

     @pytest.mark.parametrize(
-        "index,expected", [(Index(list("abcd")), True), (Index(range(4)), False)]
+        "index,expected", [(list("abcd"), True), (range(4), False)]
     )
     def test_tab_completion(self, index, expected):
         # GH 9910
+        index = Index(index)
         result = "str" in dir(index)
         assert result == expected
@@ -1096,10 +1084,10 @@ def test_outer_join_sort(self):
         left_index = Index(np.random.default_rng(2).permutation(15))
         right_index = date_range("2020-01-01", periods=10)

-        with
tm.assert_produces_warning(RuntimeWarning): + with tm.assert_produces_warning(RuntimeWarning, match="not supported between"): result = left_index.join(right_index, how="outer") - with tm.assert_produces_warning(RuntimeWarning): + with tm.assert_produces_warning(RuntimeWarning, match="not supported between"): expected = left_index.astype(object).union(right_index.astype(object)) tm.assert_index_equal(result, expected) @@ -1124,8 +1112,7 @@ def test_take_fill_value(self): def test_take_fill_value_none_raises(self): index = Index(list("ABC"), name="xxx") msg = ( - "When allow_fill=True and fill_value is not None, " - "all indices must be >= -1" + "When allow_fill=True and fill_value is not None, all indices must be >= -1" ) with pytest.raises(ValueError, match=msg): @@ -1165,15 +1152,11 @@ def test_reindex_preserves_type_if_target_is_empty_list_or_array(self, labels): index = Index(list("abc")) assert index.reindex(labels)[0].dtype.type == index.dtype.type - @pytest.mark.parametrize( - "labels,dtype", - [ - (DatetimeIndex([]), np.datetime64), - ], - ) - def test_reindex_doesnt_preserve_type_if_target_is_empty_index(self, labels, dtype): + def test_reindex_doesnt_preserve_type_if_target_is_empty_index(self): # GH7774 index = Index(list("abc")) + labels = DatetimeIndex([]) + dtype = np.datetime64 assert index.reindex(labels)[0].dtype.type == dtype def test_reindex_doesnt_preserve_type_if_target_is_empty_index_numeric( @@ -1550,8 +1533,10 @@ class TestIndexUtils: @pytest.mark.parametrize( "data, names, expected", [ - ([[1, 2, 3]], None, Index([1, 2, 3])), - ([[1, 2, 3]], ["name"], Index([1, 2, 3], name="name")), + ([[1, 2, 4]], None, Index([1, 2, 4])), + ([[1, 2, 4]], ["name"], Index([1, 2, 4], name="name")), + ([[1, 2, 3]], None, RangeIndex(1, 4)), + ([[1, 2, 3]], ["name"], RangeIndex(1, 4, name="name")), ( [["a", "a"], ["c", "d"]], None, @@ -1566,7 +1551,7 @@ class TestIndexUtils: ) def test_ensure_index_from_sequences(self, data, names, expected): result = ensure_index_from_sequences(data, names) - tm.assert_index_equal(result, expected) + tm.assert_index_equal(result, expected, exact=True) def test_ensure_index_mixed_closed_intervals(self): # GH27172 @@ -1594,7 +1579,7 @@ def test_ensure_index_uint64(self): def test_get_combined_index(self): result = _get_combined_index([]) - expected = Index([]) + expected = RangeIndex(0) tm.assert_index_equal(result, expected) @@ -1636,7 +1621,7 @@ def test_generated_op_names(opname, index): partial(DatetimeIndex, data=["2020-01-01"]), partial(PeriodIndex, data=["2020-01-01"]), partial(TimedeltaIndex, data=["1 day"]), - partial(RangeIndex, data=range(1)), + partial(RangeIndex, start=range(1)), partial(IntervalIndex, data=[pd.Interval(0, 1)]), partial(Index, data=["a"], dtype=object), partial(MultiIndex, levels=[1], codes=[0]), @@ -1722,3 +1707,13 @@ def test_nan_comparison_same_object(op): result = op(idx, idx.copy()) tm.assert_numpy_array_equal(result, expected) + + +@td.skip_if_no("pyarrow") +def test_is_monotonic_pyarrow_list_type(): + # GH 57333 + import pyarrow as pa + + idx = Index([[1], [2, 3]], dtype=pd.ArrowDtype(pa.list_(pa.int64()))) + assert not idx.is_monotonic_increasing + assert not idx.is_monotonic_decreasing diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 412a59d15307d..bf16554871efc 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -3,6 +3,7 @@ any index subclass except for MultiIndex. 
Makes use of the `index_flat` fixture defined in pandas/conftest.py. """ + from copy import ( copy, deepcopy, @@ -32,7 +33,7 @@ class TestCommon: @pytest.mark.parametrize("name", [None, "new_name"]) - def test_to_frame(self, name, index_flat, using_copy_on_write): + def test_to_frame(self, name, index_flat): # see GH#15230, GH#22580 idx = index_flat @@ -46,8 +47,6 @@ def test_to_frame(self, name, index_flat, using_copy_on_write): assert df.index is idx assert len(df.columns) == 1 assert df.columns[0] == idx_name - if not using_copy_on_write: - assert df[idx_name].values is not idx.values df = idx.to_frame(index=False, name=idx_name) assert df.index is not idx @@ -224,7 +223,9 @@ def test_unique(self, index_flat): pass result = idx.unique() - tm.assert_index_equal(result, idx_unique) + tm.assert_index_equal( + result, idx_unique, exact=not isinstance(index, RangeIndex) + ) # nans: if not index._can_hold_na: @@ -271,9 +272,7 @@ def test_searchsorted_monotonic(self, index_flat, request): # all values are the same, expected_right should be length expected_right = len(index) - # test _searchsorted_monotonic in all cases - # test searchsorted only for increasing - if index.is_monotonic_increasing: + if index.is_monotonic_increasing or index.is_monotonic_decreasing: ssm_left = index._searchsorted_monotonic(value, side="left") assert expected_left == ssm_left @@ -285,13 +284,6 @@ def test_searchsorted_monotonic(self, index_flat, request): ss_right = index.searchsorted(value, side="right") assert expected_right == ss_right - - elif index.is_monotonic_decreasing: - ssm_left = index._searchsorted_monotonic(value, side="left") - assert expected_left == ssm_left - - ssm_right = index._searchsorted_monotonic(value, side="right") - assert expected_right == ssm_right else: # non-monotonic should raise. 
msg = "index must be monotonic increasing or decreasing" @@ -480,6 +472,17 @@ def test_sort_values_with_missing(index_with_missing, na_position, request): tm.assert_index_equal(result, expected) +def test_sort_values_natsort_key(): + # GH#56081 + def split_convert(s): + return tuple(int(x) for x in s.split(".")) + + idx = pd.Index(["1.9", "2.0", "1.11", "1.10"]) + expected = pd.Index(["1.9", "1.10", "1.11", "2.0"]) + result = idx.sort_values(key=lambda x: tuple(map(split_convert, x))) + tm.assert_index_equal(result, expected) + + def test_ndarray_compat_properties(index): if isinstance(index, PeriodIndex) and not IS64: pytest.skip("Overflow") @@ -500,3 +503,26 @@ def test_ndarray_compat_properties(index): # test for validity idx.nbytes idx.values.nbytes + + +def test_compare_read_only_array(): + # GH#57130 + arr = np.array([], dtype=object) + arr.flags.writeable = False + idx = pd.Index(arr) + result = idx > 69 + assert result.dtype == bool + + +def test_to_frame_column_rangeindex(): + idx = pd.Index([1]) + result = idx.to_frame().columns + expected = RangeIndex(1) + tm.assert_index_equal(result, expected, exact=True) + + +def test_to_frame_name_tuple_multiindex(): + idx = pd.Index([1]) + result = idx.to_frame(name=(1, 2)) + expected = pd.DataFrame([1], columns=MultiIndex.from_arrays([[1], [2]]), index=idx) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexes/test_datetimelike.py b/pandas/tests/indexes/test_datetimelike.py index 21a686e8bc05b..e45d11e6286e2 100644 --- a/pandas/tests/indexes/test_datetimelike.py +++ b/pandas/tests/indexes/test_datetimelike.py @@ -1,4 +1,4 @@ -""" generic datetimelike tests """ +"""generic datetimelike tests""" import numpy as np import pytest @@ -85,15 +85,15 @@ def test_str(self, simple_index): def test_view(self, simple_index): idx = simple_index - idx_view = idx.view("i8") result = type(simple_index)(idx) tm.assert_index_equal(result, idx) - msg = "Passing a type in .*Index.view is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - idx_view = idx.view(type(simple_index)) - result = type(simple_index)(idx) - tm.assert_index_equal(result, idx_view) + msg = ( + "Cannot change data-type for array of references.|" + "Cannot change data-type for object array.|" + ) + with pytest.raises(TypeError, match=msg): + idx.view(type(simple_index)) def test_map_callable(self, simple_index): index = simple_index @@ -162,7 +162,6 @@ def test_where_cast_str(self, simple_index): result = index.where(mask, ["foo"]) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) def test_diff(self, unit): # GH 55080 dti = pd.to_datetime([10, 20, 30], unit=unit).as_unit(unit) diff --git a/pandas/tests/indexes/test_index_new.py b/pandas/tests/indexes/test_index_new.py index 72641077c90fe..dd228e6b713b5 100644 --- a/pandas/tests/indexes/test_index_new.py +++ b/pandas/tests/indexes/test_index_new.py @@ -1,6 +1,7 @@ """ Tests for the Index constructor conducting inference. 
""" + from datetime import ( datetime, timedelta, @@ -60,16 +61,16 @@ def test_infer_nat(self, val): values = [NaT, val] idx = Index(values) - assert idx.dtype == "datetime64[ns]" and idx.isna().all() + assert idx.dtype == "datetime64[s]" and idx.isna().all() idx = Index(values[::-1]) - assert idx.dtype == "datetime64[ns]" and idx.isna().all() + assert idx.dtype == "datetime64[s]" and idx.isna().all() idx = Index(np.array(values, dtype=object)) - assert idx.dtype == "datetime64[ns]" and idx.isna().all() + assert idx.dtype == "datetime64[s]" and idx.isna().all() idx = Index(np.array(values, dtype=object)[::-1]) - assert idx.dtype == "datetime64[ns]" and idx.isna().all() + assert idx.dtype == "datetime64[s]" and idx.isna().all() @pytest.mark.parametrize("na_value", [None, np.nan]) @pytest.mark.parametrize("vtype", [list, tuple, iter]) @@ -80,14 +81,10 @@ def test_construction_list_tuples_nan(self, na_value, vtype): expected = MultiIndex.from_tuples(values) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize( - "dtype", - [int, "int64", "int32", "int16", "int8", "uint64", "uint32", "uint16", "uint8"], - ) - def test_constructor_int_dtype_float(self, dtype): + def test_constructor_int_dtype_float(self, any_int_numpy_dtype): # GH#18400 - expected = Index([0, 1, 2, 3], dtype=dtype) - result = Index([0.0, 1.0, 2.0, 3.0], dtype=dtype) + expected = Index([0, 1, 2, 3], dtype=any_int_numpy_dtype) + result = Index([0.0, 1.0, 2.0, 3.0], dtype=any_int_numpy_dtype) tm.assert_index_equal(result, expected) @pytest.mark.parametrize("cast_index", [True, False]) @@ -141,29 +138,25 @@ def test_constructor_infer_nat_dt_like( ) expected = klass([NaT, NaT]) + if dtype[0] == "d": + # we infer all-NaT as second resolution + expected = expected.astype("M8[ns]") assert expected.dtype == dtype data = [ctor] data.insert(pos, nulls_fixture) - warn = None if nulls_fixture is NA: expected = Index([NA, NaT]) mark = pytest.mark.xfail(reason="Broken with np.NaT ctor; see GH 31884") request.applymarker(mark) - # GH#35942 numpy will emit a DeprecationWarning within the - # assert_index_equal calls. Since we can't do anything - # about it until GH#31884 is fixed, we suppress that warning. 
- warn = DeprecationWarning result = Index(data) - with tm.assert_produces_warning(warn): - tm.assert_index_equal(result, expected) + tm.assert_index_equal(result, expected) result = Index(np.array(data, dtype=object)) - with tm.assert_produces_warning(warn): - tm.assert_index_equal(result, expected) + tm.assert_index_equal(result, expected) @pytest.mark.parametrize("swap_objs", [True, False]) def test_constructor_mixed_nat_objs_infers_object(self, swap_objs): @@ -332,11 +325,12 @@ def test_constructor_dtypes_to_categorical(self, vals): @pytest.mark.parametrize( "vals", [ - Index(np.array([np.datetime64("2011-01-01"), np.datetime64("2011-01-02")])), - Index([datetime(2011, 1, 1), datetime(2011, 1, 2)]), + np.array([np.datetime64("2011-01-01"), np.datetime64("2011-01-02")]), + [datetime(2011, 1, 1), datetime(2011, 1, 2)], ], ) def test_constructor_dtypes_to_datetime(self, cast_index, vals): + vals = Index(vals) if cast_index: index = Index(vals, dtype=object) assert isinstance(index, Index) @@ -413,7 +407,7 @@ class ArrayLike: def __init__(self, array) -> None: self.array = array - def __array__(self, dtype=None) -> np.ndarray: + def __array__(self, dtype=None, copy=None) -> np.ndarray: return self.array expected = Index(array) @@ -425,8 +419,7 @@ class TestIndexConstructionErrors: def test_constructor_overflow_int64(self): # see GH#15832 msg = ( - "The elements provided in the data cannot " - "all be casted to the dtype int64" + "The elements provided in the data cannot all be casted to the dtype int64" ) with pytest.raises(OverflowError, match=msg): Index([np.iinfo(np.uint64).max - 1], dtype="int64") diff --git a/pandas/tests/indexes/test_indexing.py b/pandas/tests/indexes/test_indexing.py index 1ea47f636ac9b..1bbffcee3b671 100644 --- a/pandas/tests/indexes/test_indexing.py +++ b/pandas/tests/indexes/test_indexing.py @@ -14,6 +14,7 @@ The corresponding tests.indexes.[index_type].test_indexing files contain tests for the corresponding methods specific to those Index subclasses. 
""" + import numpy as np import pytest @@ -92,15 +93,16 @@ class TestContains: @pytest.mark.parametrize( "index,val", [ - (Index([0, 1, 2]), 2), - (Index([0, 1, "2"]), "2"), - (Index([0, 1, 2, np.inf, 4]), 4), - (Index([0, 1, 2, np.nan, 4]), 4), - (Index([0, 1, 2, np.inf]), np.inf), - (Index([0, 1, 2, np.nan]), np.nan), + ([0, 1, 2], 2), + ([0, 1, "2"], "2"), + ([0, 1, 2, np.inf, 4], 4), + ([0, 1, 2, np.nan, 4], 4), + ([0, 1, 2, np.inf], np.inf), + ([0, 1, 2, np.nan], np.nan), ], ) def test_index_contains(self, index, val): + index = Index(index) assert val in index @pytest.mark.parametrize( @@ -123,18 +125,16 @@ def test_index_contains(self, index, val): def test_index_not_contains(self, index, val): assert val not in index - @pytest.mark.parametrize( - "index,val", [(Index([0, 1, "2"]), 0), (Index([0, 1, "2"]), "2")] - ) - def test_mixed_index_contains(self, index, val): + @pytest.mark.parametrize("val", [0, "2"]) + def test_mixed_index_contains(self, val): # GH#19860 + index = Index([0, 1, "2"]) assert val in index - @pytest.mark.parametrize( - "index,val", [(Index([0, 1, "2"]), "1"), (Index([0, 1, "2"]), 2)] - ) + @pytest.mark.parametrize("val", ["1", 2]) def test_mixed_index_not_contains(self, index, val): # GH#19860 + index = Index([0, 1, "2"]) assert val not in index def test_contains_with_float_index(self, any_real_numpy_dtype): @@ -303,12 +303,10 @@ def test_putmask_with_wrong_mask(self, index): index.putmask("foo", fill) -@pytest.mark.parametrize( - "idx", [Index([1, 2, 3]), Index([0.1, 0.2, 0.3]), Index(["a", "b", "c"])] -) +@pytest.mark.parametrize("idx", [[1, 2, 3], [0.1, 0.2, 0.3], ["a", "b", "c"]]) def test_getitem_deprecated_float(idx): # https://github.com/pandas-dev/pandas/issues/34191 - + idx = Index(idx) msg = "Indexing with a float is no longer supported" with pytest.raises(IndexError, match=msg): idx[1.0] diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index 1787379b0faee..5f36b8c3f5dbf 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype - from pandas._libs.tslibs import Timestamp from pandas.core.dtypes.common import ( @@ -27,6 +25,7 @@ PeriodIndex, RangeIndex, Series, + StringDtype, TimedeltaIndex, isna, period_range, @@ -86,6 +85,7 @@ def test_pickle_compat_construction(self, simple_index): r"kind, None was passed", r"__new__\(\) missing 1 required positional argument: 'data'", r"__new__\(\) takes at least 2 arguments \(1 given\)", + r"'NoneType' object is not iterable", ] ) with pytest.raises(TypeError, match=msg): @@ -221,12 +221,7 @@ def test_logical_compat(self, simple_index): assert idx.any() == idx._values.any() assert idx.any() == idx.to_series().any() else: - msg = "cannot perform (any|all)" - if isinstance(idx, IntervalIndex): - msg = ( - r"'IntervalArray' with dtype interval\[.*\] does " - "not support reduction '(any|all)'" - ) + msg = "does not support operation '(any|all)'" with pytest.raises(TypeError, match=msg): idx.all() with pytest.raises(TypeError, match=msg): @@ -261,7 +256,7 @@ def test_ensure_copied_data(self, index): "RangeIndex cannot be initialized from data, " "MultiIndex and CategoricalIndex are tested separately" ) - elif index.dtype == object and index.inferred_type == "boolean": + elif index.dtype == object and index.inferred_type in ["boolean", "string"]: init_kwargs["dtype"] = index.dtype index_type = type(index) @@ -275,9 +270,7 @@ def 
test_ensure_copied_data(self, index): if isinstance(index, PeriodIndex): # .values an object array of Period, thus copied - depr_msg = "The 'ordinal' keyword in PeriodIndex is deprecated" - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - result = index_type(ordinal=index.asi8, copy=False, **init_kwargs) + result = index_type.from_ordinals(ordinals=index.asi8, **init_kwargs) tm.assert_numpy_array_equal(index.asi8, result.asi8, check_same="same") elif isinstance(index, IntervalIndex): # checked in test_interval.py @@ -295,12 +288,17 @@ def test_ensure_copied_data(self, index): tm.assert_numpy_array_equal( index._values._mask, result._values._mask, check_same="same" ) - elif index.dtype == "string[python]": + elif ( + isinstance(index.dtype, StringDtype) and index.dtype.storage == "python" + ): assert np.shares_memory(index._values._ndarray, result._values._ndarray) tm.assert_numpy_array_equal( index._values._ndarray, result._values._ndarray, check_same="same" ) - elif index.dtype in ("string[pyarrow]", "string[pyarrow_numpy]"): + elif ( + isinstance(index.dtype, StringDtype) + and index.dtype.storage == "pyarrow" + ): assert tm.shares_memory(result._values, index._values) else: raise NotImplementedError(index.dtype) @@ -332,6 +330,30 @@ def test_memory_usage(self, index): if index.inferred_type == "object": assert result3 > result2 + def test_memory_usage_doesnt_trigger_engine(self, index): + index._cache.clear() + assert "_engine" not in index._cache + + res_without_engine = index.memory_usage() + assert "_engine" not in index._cache + + # explicitly load and cache the engine + _ = index._engine + assert "_engine" in index._cache + + res_with_engine = index.memory_usage() + + # the empty engine doesn't affect the result even when initialized with values, + # because engine.sizeof() doesn't consider the content of engine.values + assert res_with_engine == res_without_engine + + if len(index) == 0: + assert res_without_engine == 0 + assert res_with_engine == 0 + else: + assert res_without_engine > 0 + assert res_with_engine > 0 + def test_argsort(self, index): if isinstance(index, CategoricalIndex): pytest.skip(f"{type(self).__name__} separately tested") @@ -410,26 +432,16 @@ def test_where(self, listlike_box, simple_index): tm.assert_index_equal(result, expected) def test_insert_base(self, index): + # GH#51363 trimmed = index[1:4] if not len(index): pytest.skip("Not applicable for empty index") - # test 0th element - warn = None - if index.dtype == object and index.inferred_type == "boolean": - # GH#51363 - warn = FutureWarning - msg = "The behavior of Index.insert with object-dtype is deprecated" - with tm.assert_produces_warning(warn, match=msg): - result = trimmed.insert(0, index[0]) + result = trimmed.insert(0, index[0]) assert index[0:4].equals(result) - @pytest.mark.skipif( - using_pyarrow_string_dtype(), - reason="completely different behavior, tested elsewher", - ) - def test_insert_out_of_bounds(self, index): + def test_insert_out_of_bounds(self, index, using_infer_string): # TypeError/IndexError matches what np.insert raises in these cases if len(index) > 0: @@ -441,6 +453,14 @@ def test_insert_out_of_bounds(self, index): msg = "index (0|0.5) is out of bounds for axis 0 with size 0" else: msg = "slice indices must be integers or None or have an __index__ method" + + if using_infer_string: + if index.dtype == "string" or index.dtype == "category": + msg = "loc must be an integer between" + elif index.dtype == "object" and len(index) == 0: + msg = "loc must be an 
integer between" + err = TypeError + with pytest.raises(err, match=msg): index.insert(0.5, "foo") @@ -567,29 +587,6 @@ def test_equals_op(self, simple_index): tm.assert_numpy_array_equal(index_a == item, expected3) tm.assert_series_equal(series_a == item, Series(expected3)) - def test_format(self, simple_index): - # GH35439 - if is_numeric_dtype(simple_index.dtype) or isinstance( - simple_index, DatetimeIndex - ): - pytest.skip("Tested elsewhere.") - idx = simple_index - expected = [str(x) for x in idx] - msg = r"Index\.format is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - assert idx.format() == expected - - def test_format_empty(self, simple_index): - # GH35712 - if isinstance(simple_index, (PeriodIndex, RangeIndex)): - pytest.skip("Tested elsewhere") - empty_idx = type(simple_index)([]) - msg = r"Index\.format is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - assert empty_idx.format() == [] - with tm.assert_produces_warning(FutureWarning, match=msg): - assert empty_idx.format(name=True) == [""] - def test_fillna(self, index): # GH 11343 if len(index) == 0: @@ -600,7 +597,7 @@ def test_fillna(self, index): pytest.skip(f"Not relevant for Index with {index.dtype}") elif isinstance(index, MultiIndex): idx = index.copy(deep=True) - msg = "isna is not defined for MultiIndex" + msg = "fillna is not defined for MultiIndex" with pytest.raises(NotImplementedError, match=msg): idx.fillna(idx[0]) else: @@ -620,13 +617,6 @@ def test_fillna(self, index): idx = type(index)(values) - msg = "does not support 'downcast'" - msg2 = r"The 'downcast' keyword in .*Index\.fillna is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg2): - with pytest.raises(NotImplementedError, match=msg): - # For now at least, we only raise if there are NAs present - idx.fillna(idx[0], downcast="infer") - expected = np.array([False] * len(idx), dtype=bool) expected[1] = True tm.assert_numpy_array_equal(idx._isnan, expected) @@ -840,8 +830,9 @@ def test_append_preserves_dtype(self, simple_index): result = index.append(index) assert result.dtype == index.dtype - tm.assert_index_equal(result[:N], index, check_exact=True) - tm.assert_index_equal(result[N:], index, check_exact=True) + + tm.assert_index_equal(result[:N], index, exact=False, check_exact=True) + tm.assert_index_equal(result[N:], index, exact=False, check_exact=True) alt = index.take(list(range(N)) * 2) tm.assert_index_equal(result, alt, check_exact=True) @@ -859,78 +850,16 @@ def test_inv(self, simple_index, using_infer_string): tm.assert_series_equal(res2, Series(expected)) else: if idx.dtype.kind == "f": - err = TypeError msg = "ufunc 'invert' not supported for the input types" - elif using_infer_string and idx.dtype == "string": - import pyarrow as pa - - err = pa.lib.ArrowNotImplementedError - msg = "has no kernel" else: - err = TypeError - msg = "bad operand" - with pytest.raises(err, match=msg): + msg = "bad operand|__invert__ is not supported for string dtype" + with pytest.raises(TypeError, match=msg): ~idx # check that we get the same behavior with Series - with pytest.raises(err, match=msg): + with pytest.raises(TypeError, match=msg): ~Series(idx) - def test_is_boolean_is_deprecated(self, simple_index): - # GH50042 - idx = simple_index - with tm.assert_produces_warning(FutureWarning): - idx.is_boolean() - - def test_is_floating_is_deprecated(self, simple_index): - # GH50042 - idx = simple_index - with tm.assert_produces_warning(FutureWarning): - idx.is_floating() - - def 
test_is_integer_is_deprecated(self, simple_index): - # GH50042 - idx = simple_index - with tm.assert_produces_warning(FutureWarning): - idx.is_integer() - - def test_holds_integer_deprecated(self, simple_index): - # GH50243 - idx = simple_index - msg = f"{type(idx).__name__}.holds_integer is deprecated. " - with tm.assert_produces_warning(FutureWarning, match=msg): - idx.holds_integer() - - def test_is_numeric_is_deprecated(self, simple_index): - # GH50042 - idx = simple_index - with tm.assert_produces_warning( - FutureWarning, - match=f"{type(idx).__name__}.is_numeric is deprecated. ", - ): - idx.is_numeric() - - def test_is_categorical_is_deprecated(self, simple_index): - # GH50042 - idx = simple_index - with tm.assert_produces_warning( - FutureWarning, - match=r"Use pandas\.api\.types\.is_categorical_dtype instead", - ): - idx.is_categorical() - - def test_is_interval_is_deprecated(self, simple_index): - # GH50042 - idx = simple_index - with tm.assert_produces_warning(FutureWarning): - idx.is_interval() - - def test_is_object_is_deprecated(self, simple_index): - # GH50042 - idx = simple_index - with tm.assert_produces_warning(FutureWarning): - idx.is_object() - class TestNumericBase: @pytest.fixture( @@ -977,21 +906,13 @@ def test_view(self, simple_index): idx_view = idx.view(dtype) tm.assert_index_equal(idx, index_cls(idx_view, name="Foo"), exact=True) - msg = "Passing a type in .*Index.view is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - idx_view = idx.view(index_cls) - tm.assert_index_equal(idx, index_cls(idx_view, name="Foo"), exact=True) - - def test_format(self, simple_index): - # GH35439 - if isinstance(simple_index, DatetimeIndex): - pytest.skip("Tested elsewhere") - idx = simple_index - max_width = max(len(str(x)) for x in idx) - expected = [str(x).ljust(max_width) for x in idx] - msg = r"Index\.format is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - assert idx.format() == expected + msg = ( + "Cannot change data-type for array of references.|" + "Cannot change data-type for object array.|" + ) + with pytest.raises(TypeError, match=msg): + # GH#55709 + idx.view(index_cls) def test_insert_non_na(self, simple_index): # GH#43921 inserting an element that we know we can hold should diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 4a6982cf98670..7cc74f4b3405c 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -2,6 +2,7 @@ The tests in this package are to ensure the proper resultant dtypes of set operations. 
""" + from datetime import datetime import operator @@ -56,6 +57,11 @@ def any_dtype_for_small_pos_integer_indexes(request): return request.param +@pytest.fixture +def index_flat2(index_flat): + return index_flat + + def test_union_same_types(index): # Union with a non-unique, non-monotonic index raises error # Only needed for bool index factory @@ -240,9 +246,6 @@ def test_intersection_base(self, index): with pytest.raises(TypeError, match=msg): first.intersection([1, 2, 3]) - @pytest.mark.filterwarnings( - "ignore:Falling back on a non-pyarrow:pandas.errors.PerformanceWarning" - ) @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") def test_union_base(self, index): index = index.unique() @@ -270,9 +273,6 @@ def test_union_base(self, index): first.union([1, 2, 3]) @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") - @pytest.mark.filterwarnings( - "ignore:Falling back on a non-pyarrow:pandas.errors.PerformanceWarning" - ) def test_difference_base(self, sort, index): first = index[2:] second = index[:4] @@ -299,10 +299,13 @@ def test_difference_base(self, sort, index): first.difference([1, 2, 3], sort) @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") - @pytest.mark.filterwarnings( - "ignore:Falling back on a non-pyarrow:pandas.errors.PerformanceWarning" - ) - def test_symmetric_difference(self, index): + def test_symmetric_difference(self, index, using_infer_string, request): + if ( + using_infer_string + and index.dtype == "object" + and index.inferred_type == "string" + ): + request.applymarker(pytest.mark.xfail(reason="TODO: infer_string")) if isinstance(index, CategoricalIndex): pytest.skip(f"Not relevant for {type(index).__name__}") if len(index) < 2: @@ -522,14 +525,12 @@ def test_intersection_difference_match_empty(self, index, sort): tm.assert_index_equal(inter, diff, exact=True) +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning") @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") -@pytest.mark.filterwarnings( - "ignore:Falling back on a non-pyarrow:pandas.errors.PerformanceWarning" -) @pytest.mark.parametrize( "method", ["intersection", "union", "difference", "symmetric_difference"] ) -def test_setop_with_categorical(index_flat, sort, method): +def test_setop_with_categorical(index_flat, sort, method, using_infer_string): # MultiIndex tested separately in tests.indexes.multi.test_setops index = index_flat @@ -538,10 +539,22 @@ def test_setop_with_categorical(index_flat, sort, method): result = getattr(index, method)(other, sort=sort) expected = getattr(index, method)(index, sort=sort) + if ( + using_infer_string + and index.empty + and method in ("union", "symmetric_difference") + ): + expected = expected.astype("category") tm.assert_index_equal(result, expected, exact=exact) result = getattr(index, method)(other[:5], sort=sort) expected = getattr(index, method)(index[:5], sort=sort) + if ( + using_infer_string + and index.empty + and method in ("union", "symmetric_difference") + ): + expected = expected.astype("category") tm.assert_index_equal(result, expected, exact=exact) @@ -716,14 +729,15 @@ def test_intersection(self, index, sort): assert inter is first @pytest.mark.parametrize( - "index2,keeps_name", + "index2_name,keeps_name", [ - (Index([3, 4, 5, 6, 7], name="index"), True), # preserve same name - (Index([3, 4, 5, 6, 7], name="other"), False), # drop diff names - (Index([3, 4, 5, 6, 7]), False), + ("index", 
True), # preserve same name + ("other", False), # drop diff names + (None, False), ], ) - def test_intersection_name_preservation(self, index2, keeps_name, sort): + def test_intersection_name_preservation(self, index2_name, keeps_name, sort): + index2 = Index([3, 4, 5, 6, 7], name=index2_name) index1 = Index([1, 2, 3, 4, 5], name="index") expected = Index([3, 4, 5]) result = index1.intersection(index2, sort) @@ -875,7 +889,7 @@ def test_difference_incomparable(self, opname): b = Index([2, Timestamp("1999"), 1]) op = operator.methodcaller(opname, b) - with tm.assert_produces_warning(RuntimeWarning): + with tm.assert_produces_warning(RuntimeWarning, match="not supported between"): # sort=None, the default result = op(a) expected = Index([3, Timestamp("2000"), 2, Timestamp("1999")]) @@ -910,11 +924,13 @@ def test_symmetric_difference_mi(self, sort): @pytest.mark.parametrize( "index2,expected", [ - (Index([0, 1, np.nan]), Index([2.0, 3.0, 0.0])), - (Index([0, 1]), Index([np.nan, 2.0, 3.0, 0.0])), + ([0, 1, np.nan], [2.0, 3.0, 0.0]), + ([0, 1], [np.nan, 2.0, 3.0, 0.0]), ], ) def test_symmetric_difference_missing(self, index2, expected, sort): + index2 = Index(index2) + expected = Index(expected) # GH#13514 change: {nan} - {nan} == {} # (GH#6444, sorting of nans, is no longer an issue) index1 = Index([1, np.nan, 2, 3]) diff --git a/pandas/tests/indexes/test_subclass.py b/pandas/tests/indexes/test_subclass.py index c3287e1ddcddc..a8ba8c3090cf2 100644 --- a/pandas/tests/indexes/test_subclass.py +++ b/pandas/tests/indexes/test_subclass.py @@ -1,6 +1,7 @@ """ Tests involving custom Index subclasses """ + import numpy as np from pandas import ( diff --git a/pandas/tests/indexes/timedeltas/methods/test_astype.py b/pandas/tests/indexes/timedeltas/methods/test_astype.py index 311f2b5c9aa59..5166cadae499e 100644 --- a/pandas/tests/indexes/timedeltas/methods/test_astype.py +++ b/pandas/tests/indexes/timedeltas/methods/test_astype.py @@ -44,7 +44,7 @@ def test_astype_object_with_nat(self): tm.assert_index_equal(result, expected) assert idx.tolist() == expected_list - def test_astype(self): + def test_astype(self, using_infer_string): # GH 13149, GH 13209 idx = TimedeltaIndex([1e14, "NaT", NaT, np.nan], name="idx") @@ -61,7 +61,12 @@ def test_astype(self): tm.assert_index_equal(result, expected) result = idx.astype(str) - expected = Index([str(x) for x in idx], name="idx", dtype=object) + if using_infer_string: + expected = Index( + [str(x) if x is not NaT else None for x in idx], name="idx", dtype="str" + ) + else: + expected = Index([str(x) for x in idx], name="idx", dtype=object) tm.assert_index_equal(result, expected) rng = timedelta_range("1 days", periods=10) diff --git a/pandas/tests/indexes/timedeltas/methods/test_shift.py b/pandas/tests/indexes/timedeltas/methods/test_shift.py index a0986d1496881..9bbf06dc51a0c 100644 --- a/pandas/tests/indexes/timedeltas/methods/test_shift.py +++ b/pandas/tests/indexes/timedeltas/methods/test_shift.py @@ -37,7 +37,7 @@ def test_tdi_shift_minutes(self): def test_tdi_shift_int(self): # GH#8083 - tdi = pd.to_timedelta(range(5), unit="d") + tdi = pd.to_timedelta(range(5), unit="D") trange = tdi._with_freq("infer") + pd.offsets.Hour(1) result = trange.shift(1) expected = TimedeltaIndex( @@ -54,7 +54,7 @@ def test_tdi_shift_int(self): def test_tdi_shift_nonstandard_freq(self): # GH#8083 - tdi = pd.to_timedelta(range(5), unit="d") + tdi = pd.to_timedelta(range(5), unit="D") trange = tdi._with_freq("infer") + pd.offsets.Hour(1) result = trange.shift(3, freq="2D 1s") 
expected = TimedeltaIndex( diff --git a/pandas/tests/indexes/timedeltas/test_constructors.py b/pandas/tests/indexes/timedeltas/test_constructors.py index 0510700bb64d7..ace0ab7990138 100644 --- a/pandas/tests/indexes/timedeltas/test_constructors.py +++ b/pandas/tests/indexes/timedeltas/test_constructors.py @@ -15,12 +15,6 @@ class TestTimedeltaIndex: - def test_closed_deprecated(self): - # GH#52628 - msg = "The 'closed' keyword" - with tm.assert_produces_warning(FutureWarning, match=msg): - TimedeltaIndex([], closed=True) - def test_array_of_dt64_nat_raises(self): # GH#39462 nat = np.datetime64("NaT", "ns") @@ -36,14 +30,6 @@ def test_array_of_dt64_nat_raises(self): with pytest.raises(TypeError, match=msg): to_timedelta(arr) - @pytest.mark.parametrize("unit", ["Y", "y", "M"]) - def test_unit_m_y_raises(self, unit): - msg = "Units 'M', 'Y', and 'y' are no longer supported" - depr_msg = "The 'unit' keyword in TimedeltaIndex construction is deprecated" - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - TimedeltaIndex([1, 3, 7], unit) - def test_int64_nocopy(self): # GH#23539 check that a copy isn't made when we pass int64 data # and copy=False @@ -70,7 +56,6 @@ def test_infer_from_tdi_mismatch(self): # has one and it does not match the `freq` input tdi = timedelta_range("1 second", periods=100, freq="1s") - depr_msg = "TimedeltaArray.__init__ is deprecated" msg = ( "Inferred frequency .* from passed values does " "not conform to passed frequency" @@ -78,18 +63,9 @@ def test_infer_from_tdi_mismatch(self): with pytest.raises(ValueError, match=msg): TimedeltaIndex(tdi, freq="D") - with pytest.raises(ValueError, match=msg): - # GH#23789 - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - TimedeltaArray(tdi, freq="D") - with pytest.raises(ValueError, match=msg): TimedeltaIndex(tdi._data, freq="D") - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - TimedeltaArray(tdi._data, freq="D") - def test_dt64_data_invalid(self): # GH#23539 # passing tz-aware DatetimeIndex raises, naive or ndarray[datetime64] @@ -138,9 +114,6 @@ def test_construction_base_constructor(self): tm.assert_index_equal(pd.Index(arr), TimedeltaIndex(arr)) tm.assert_index_equal(pd.Index(np.array(arr)), TimedeltaIndex(np.array(arr))) - @pytest.mark.filterwarnings( - "ignore:The 'unit' keyword in TimedeltaIndex construction:FutureWarning" - ) def test_constructor(self): expected = TimedeltaIndex( [ @@ -162,22 +135,6 @@ def test_constructor(self): ) tm.assert_index_equal(result, expected) - expected = TimedeltaIndex( - ["0 days 00:00:00", "0 days 00:00:01", "0 days 00:00:02"] - ) - result = TimedeltaIndex(range(3), unit="s") - tm.assert_index_equal(result, expected) - expected = TimedeltaIndex( - ["0 days 00:00:00", "0 days 00:00:05", "0 days 00:00:09"] - ) - result = TimedeltaIndex([0, 5, 9], unit="s") - tm.assert_index_equal(result, expected) - expected = TimedeltaIndex( - ["0 days 00:00:00.400", "0 days 00:00:00.450", "0 days 00:00:01.200"] - ) - result = TimedeltaIndex([400, 450, 1200], unit="ms") - tm.assert_index_equal(result, expected) - def test_constructor_iso(self): # GH #21877 expected = timedelta_range("1s", periods=9, freq="s") @@ -186,14 +143,12 @@ def test_constructor_iso(self): tm.assert_index_equal(result, expected) def test_timedelta_range_fractional_period(self): - msg = "Non-integer 'periods' in pd.date_range, pd.timedelta_range" - with tm.assert_produces_warning(FutureWarning, 
match=msg): - rng = timedelta_range("1 days", periods=10.5) - exp = timedelta_range("1 days", periods=10) - tm.assert_index_equal(rng, exp) + msg = "periods must be an integer" + with pytest.raises(TypeError, match=msg): + timedelta_range("1 days", periods=10.5) def test_constructor_coverage(self): - msg = "periods must be a number, got foo" + msg = "periods must be an integer, got foo" with pytest.raises(TypeError, match=msg): timedelta_range(start="1 days", periods="foo", freq="D") @@ -213,7 +168,7 @@ def test_constructor_coverage(self): # NumPy string array strings = np.array(["1 days", "2 days", "3 days"]) result = TimedeltaIndex(strings) - expected = to_timedelta([1, 2, 3], unit="d") + expected = to_timedelta([1, 2, 3], unit="D") tm.assert_index_equal(result, expected) from_ints = TimedeltaIndex(expected.asi8) @@ -273,11 +228,6 @@ def test_explicit_none_freq(self): result = TimedeltaIndex(tdi._data, freq=None) assert result.freq is None - msg = "TimedeltaArray.__init__ is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - tda = TimedeltaArray(tdi, freq=None) - assert tda.freq is None - def test_from_categorical(self): tdi = timedelta_range(1, periods=5) @@ -289,3 +239,28 @@ def test_from_categorical(self): ci = pd.CategoricalIndex(tdi) result = TimedeltaIndex(ci) tm.assert_index_equal(result, tdi) + + @pytest.mark.parametrize( + "unit,unit_depr", + [ + ("W", "w"), + ("D", "d"), + ("min", "MIN"), + ("s", "S"), + ("h", "H"), + ("ms", "MS"), + ("us", "US"), + ], + ) + def test_unit_deprecated(self, unit, unit_depr): + # GH#52536, GH#59051 + msg = f"'{unit_depr}' is deprecated and will be removed in a future version." + + expected = TimedeltaIndex([f"1{unit}", f"2{unit}"]) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = TimedeltaIndex([f"1{unit_depr}", f"2{unit_depr}"]) + tm.assert_index_equal(result, expected) + + with tm.assert_produces_warning(FutureWarning, match=msg): + tdi = to_timedelta([1, 2], unit=unit_depr) + tm.assert_index_equal(tdi, expected) diff --git a/pandas/tests/indexes/timedeltas/test_delete.py b/pandas/tests/indexes/timedeltas/test_delete.py index 6e6f54702ce1a..f49af7cd0befd 100644 --- a/pandas/tests/indexes/timedeltas/test_delete.py +++ b/pandas/tests/indexes/timedeltas/test_delete.py @@ -44,7 +44,7 @@ def test_delete_slice(self): # reset freq to None expected_3_5 = TimedeltaIndex( - ["1 d", "2 d", "3 d", "7 d", "8 d", "9 d", "10d"], freq=None, name="idx" + ["1 D", "2 D", "3 D", "7 D", "8 D", "9 D", "10D"], freq=None, name="idx" ) cases = { diff --git a/pandas/tests/indexes/timedeltas/test_indexing.py b/pandas/tests/indexes/timedeltas/test_indexing.py index 397f9d9e18331..426083cb6b67c 100644 --- a/pandas/tests/indexes/timedeltas/test_indexing.py +++ b/pandas/tests/indexes/timedeltas/test_indexing.py @@ -20,8 +20,10 @@ class TestGetItem: def test_getitem_slice_keeps_name(self): - # GH#4226 - tdi = timedelta_range("1d", "5d", freq="h", name="timebucket") + # GH#4226, GH#59051 + msg = "'d' is deprecated and will be removed in a future version." 
+ with tm.assert_produces_warning(FutureWarning, match=msg): + tdi = timedelta_range("1d", "5d", freq="h", name="timebucket") assert tdi[1:].name == tdi.name def test_getitem(self): @@ -230,7 +232,7 @@ def test_take_invalid_kwargs(self): def test_take_equiv_getitem(self): tds = ["1day 02:00:00", "1 day 04:00:00", "1 day 10:00:00"] - idx = timedelta_range(start="1d", end="2d", freq="h", name="idx") + idx = timedelta_range(start="1D", end="2D", freq="h", name="idx") expected = TimedeltaIndex(tds, freq=None, name="idx") taken1 = idx.take([2, 4, 10]) @@ -260,8 +262,7 @@ def test_take_fill_value(self): tm.assert_index_equal(result, expected) msg = ( - "When allow_fill=True and fill_value is not None, " - "all indices must be >= -1" + "When allow_fill=True and fill_value is not None, all indices must be >= -1" ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) @@ -337,8 +338,10 @@ def test_contains_nonunique(self): def test_contains(self): # Checking for any NaT-like objects - # GH#13603 - td = to_timedelta(range(5), unit="d") + offsets.Hour(1) + # GH#13603, GH#59051 + msg = "'d' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + td = to_timedelta(range(5), unit="d") + offsets.Hour(1) for v in [NaT, None, float("nan"), np.nan]: assert v not in td diff --git a/pandas/tests/indexes/timedeltas/test_scalar_compat.py b/pandas/tests/indexes/timedeltas/test_scalar_compat.py index 9f0552f8baa90..9a00c556dc515 100644 --- a/pandas/tests/indexes/timedeltas/test_scalar_compat.py +++ b/pandas/tests/indexes/timedeltas/test_scalar_compat.py @@ -103,30 +103,34 @@ def test_round(self): t1c = TimedeltaIndex(np.array([1, 1, 1], "m8[D]")).as_unit("ns") # note that negative times round DOWN! so don't give whole numbers - for freq, s1, s2 in [ - ("ns", t1, t2), - ("us", t1, t2), - ( - "ms", - t1a, - TimedeltaIndex( - ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"] + msg = "'d' is deprecated and will be removed in a future version." + + with tm.assert_produces_warning(FutureWarning, match=msg): + for freq, s1, s2 in [ + ("ns", t1, t2), + ("us", t1, t2), + ( + "ms", + t1a, + TimedeltaIndex( + ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"] + ), ), - ), - ( - "s", - t1a, - TimedeltaIndex( - ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"] + ( + "s", + t1a, + TimedeltaIndex( + ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"] + ), ), - ), - ("12min", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"])), - ("h", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"])), - ("d", t1c, -1 * t1c), - ]: - r1 = t1.round(freq) + ("12min", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"])), + ("h", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"])), + ("d", t1c, -1 * t1c), + ]: + r1 = t1.round(freq) + r2 = t2.round(freq) + tm.assert_index_equal(r1, s1) - r2 = t2.round(freq) tm.assert_index_equal(r2, s2) def test_components(self): diff --git a/pandas/tests/indexes/timedeltas/test_setops.py b/pandas/tests/indexes/timedeltas/test_setops.py index fce10d9176d74..ae88caf18fdae 100644 --- a/pandas/tests/indexes/timedeltas/test_setops.py +++ b/pandas/tests/indexes/timedeltas/test_setops.py @@ -42,7 +42,10 @@ def test_union_sort_false(self): tm.assert_index_equal(result, expected) def test_union_coverage(self): - idx = TimedeltaIndex(["3d", "1d", "2d"]) + # GH#59051 + msg = "'d' is deprecated and will be removed in a future version." 
+ with tm.assert_produces_warning(FutureWarning, match=msg): + idx = TimedeltaIndex(["3d", "1d", "2d"]) ordered = TimedeltaIndex(idx.sort_values(), freq="infer") result = ordered.union(idx) tm.assert_index_equal(result, ordered) @@ -70,7 +73,7 @@ def test_union_bug_1745(self): tm.assert_index_equal(result, exp) def test_union_bug_4564(self): - left = timedelta_range("1 day", "30d") + left = timedelta_range("1 day", "30D") right = left + pd.offsets.Minute(15) result = left.union(right) diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index 3120066741ffa..2066be8976e7f 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -51,9 +51,9 @@ def test_fields(self): s = Series(rng) s[1] = np.nan - tm.assert_series_equal(s.dt.days, Series([1, np.nan], index=[0, 1])) + tm.assert_series_equal(s.dt.days, Series([1, np.nan], index=range(2))) tm.assert_series_equal( - s.dt.seconds, Series([10 * 3600 + 11 * 60 + 12, np.nan], index=[0, 1]) + s.dt.seconds, Series([10 * 3600 + 11 * 60 + 12, np.nan], index=range(2)) ) # preserve name (GH15589) diff --git a/pandas/tests/indexes/timedeltas/test_timedelta_range.py b/pandas/tests/indexes/timedeltas/test_timedelta_range.py index f22bdb7a90516..6f3d29fb4240a 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta_range.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta_range.py @@ -3,7 +3,6 @@ from pandas import ( Timedelta, - TimedeltaIndex, timedelta_range, to_timedelta, ) @@ -43,32 +42,24 @@ def test_timedelta_range(self): result = timedelta_range("0 days", freq="30min", periods=50) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize( - "depr_unit, unit", - [ - ("H", "hour"), - ("T", "minute"), - ("t", "minute"), - ("S", "second"), - ("L", "millisecond"), - ("l", "millisecond"), - ("U", "microsecond"), - ("u", "microsecond"), - ("N", "nanosecond"), - ("n", "nanosecond"), - ], - ) - def test_timedelta_units_H_T_S_L_U_N_deprecated(self, depr_unit, unit): + @pytest.mark.parametrize("depr_unit, unit", [("H", "hour"), ("S", "second")]) + def test_timedelta_units_H_S_deprecated(self, depr_unit, unit): # GH#52536 depr_msg = ( f"'{depr_unit}' is deprecated and will be removed in a future version." ) - expected = to_timedelta(np.arange(5), unit=unit) with tm.assert_produces_warning(FutureWarning, match=depr_msg): result = to_timedelta(np.arange(5), unit=depr_unit) tm.assert_index_equal(result, expected) + @pytest.mark.parametrize("unit", ["T", "t", "L", "l", "U", "u", "N", "n"]) + def test_timedelta_unit_T_L_U_N_raises(self, unit): + msg = f"invalid unit abbreviation: {unit}" + + with pytest.raises(ValueError, match=msg): + to_timedelta(np.arange(5), unit=unit) + @pytest.mark.parametrize( "periods, freq", [(3, "2D"), (5, "D"), (6, "19h12min"), (7, "16h"), (9, "12h")] ) @@ -78,15 +69,18 @@ def test_linspace_behavior(self, periods, freq): expected = timedelta_range(start="0 days", end="4 days", freq=freq) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("msg_freq, freq", [("H", "19H12min"), ("T", "19h12T")]) - def test_timedelta_range_H_T_deprecated(self, freq, msg_freq): + def test_timedelta_range_H_raises(self): # GH#52536 - msg = f"'{msg_freq}' is deprecated and will be removed in a future version." 
+ msg = "Invalid frequency: H" - result = timedelta_range(start="0 days", end="4 days", periods=6) - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = timedelta_range(start="0 days", end="4 days", freq=freq) - tm.assert_index_equal(result, expected) + with pytest.raises(ValueError, match=msg): + timedelta_range(start="0 days", end="4 days", freq="19H12min") + + def test_timedelta_range_T_raises(self): + msg = "Invalid frequency: T" + + with pytest.raises(ValueError, match=msg): + timedelta_range(start="0 days", end="4 days", freq="19h12T") def test_errors(self): # not enough params @@ -134,40 +128,27 @@ def test_timedelta_range_infer_freq(self): assert result.freq is None @pytest.mark.parametrize( - "freq_depr, start, end, expected_values, expected_freq", + "freq_depr, start, end", [ ( - "3.5S", + "3.5l", "05:03:01", "05:03:10", - ["0 days 05:03:01", "0 days 05:03:04.500000", "0 days 05:03:08"], - "3500ms", ), ( "2.5T", "5 hours", "5 hours 8 minutes", - [ - "0 days 05:00:00", - "0 days 05:02:30", - "0 days 05:05:00", - "0 days 05:07:30", - ], - "150s", + ), + ( + "3.5S", + "05:03:01", + "05:03:10", ), ], ) - def test_timedelta_range_deprecated_freq( - self, freq_depr, start, end, expected_values, expected_freq - ): - # GH#52536 - msg = ( - f"'{freq_depr[-1]}' is deprecated and will be removed in a future version." - ) - - with tm.assert_produces_warning(FutureWarning, match=msg): - result = timedelta_range(start=start, end=end, freq=freq_depr) - expected = TimedeltaIndex( - expected_values, dtype="timedelta64[ns]", freq=expected_freq - ) - tm.assert_index_equal(result, expected) + def test_timedelta_range_removed_freq(self, freq_depr, start, end): + # GH#59143 + msg = f"Invalid frequency: {freq_depr}" + with pytest.raises(ValueError, match=msg): + timedelta_range(start=start, end=end, freq=freq_depr) diff --git a/pandas/tests/indexing/common.py b/pandas/tests/indexing/common.py index 2af76f69a4300..a33fb1e6979ec 100644 --- a/pandas/tests/indexing/common.py +++ b/pandas/tests/indexing/common.py @@ -1,4 +1,5 @@ -""" common utilities """ +"""common utilities""" + from __future__ import annotations from typing import ( diff --git a/pandas/tests/indexing/conftest.py b/pandas/tests/indexing/conftest.py deleted file mode 100644 index 4184c6a0047cc..0000000000000 --- a/pandas/tests/indexing/conftest.py +++ /dev/null @@ -1,127 +0,0 @@ -import numpy as np -import pytest - -from pandas import ( - DataFrame, - Index, - MultiIndex, - Series, - date_range, -) - - -@pytest.fixture -def series_ints(): - return Series(np.random.default_rng(2).random(4), index=np.arange(0, 8, 2)) - - -@pytest.fixture -def frame_ints(): - return DataFrame( - np.random.default_rng(2).standard_normal((4, 4)), - index=np.arange(0, 8, 2), - columns=np.arange(0, 12, 3), - ) - - -@pytest.fixture -def series_uints(): - return Series( - np.random.default_rng(2).random(4), - index=Index(np.arange(0, 8, 2, dtype=np.uint64)), - ) - - -@pytest.fixture -def frame_uints(): - return DataFrame( - np.random.default_rng(2).standard_normal((4, 4)), - index=Index(range(0, 8, 2), dtype=np.uint64), - columns=Index(range(0, 12, 3), dtype=np.uint64), - ) - - -@pytest.fixture -def series_labels(): - return Series(np.random.default_rng(2).standard_normal(4), index=list("abcd")) - - -@pytest.fixture -def frame_labels(): - return DataFrame( - np.random.default_rng(2).standard_normal((4, 4)), - index=list("abcd"), - columns=list("ABCD"), - ) - - -@pytest.fixture -def series_ts(): - return Series( - 
np.random.default_rng(2).standard_normal(4), - index=date_range("20130101", periods=4), - ) - - -@pytest.fixture -def frame_ts(): - return DataFrame( - np.random.default_rng(2).standard_normal((4, 4)), - index=date_range("20130101", periods=4), - ) - - -@pytest.fixture -def series_floats(): - return Series( - np.random.default_rng(2).random(4), - index=Index(range(0, 8, 2), dtype=np.float64), - ) - - -@pytest.fixture -def frame_floats(): - return DataFrame( - np.random.default_rng(2).standard_normal((4, 4)), - index=Index(range(0, 8, 2), dtype=np.float64), - columns=Index(range(0, 12, 3), dtype=np.float64), - ) - - -@pytest.fixture -def series_mixed(): - return Series(np.random.default_rng(2).standard_normal(4), index=[2, 4, "null", 8]) - - -@pytest.fixture -def frame_mixed(): - return DataFrame( - np.random.default_rng(2).standard_normal((4, 4)), index=[2, 4, "null", 8] - ) - - -@pytest.fixture -def frame_empty(): - return DataFrame() - - -@pytest.fixture -def series_empty(): - return Series(dtype=object) - - -@pytest.fixture -def frame_multi(): - return DataFrame( - np.random.default_rng(2).standard_normal((4, 4)), - index=MultiIndex.from_product([[1, 2], [3, 4]]), - columns=MultiIndex.from_product([[5, 6], [7, 8]]), - ) - - -@pytest.fixture -def series_multi(): - return Series( - np.random.default_rng(2).random(4), - index=MultiIndex.from_product([[1, 2], [3, 4]]), - ) diff --git a/pandas/tests/indexing/interval/test_interval.py b/pandas/tests/indexing/interval/test_interval.py index cabfee9aa040a..6bcebefa6c696 100644 --- a/pandas/tests/indexing/interval/test_interval.py +++ b/pandas/tests/indexing/interval/test_interval.py @@ -2,7 +2,7 @@ import pytest from pandas._libs import index as libindex -from pandas.compat import IS64 +from pandas.compat import WASM import pandas as pd from pandas import ( @@ -210,11 +210,8 @@ def test_mi_intervalindex_slicing_with_scalar(self): expected = Series([1, 6, 2, 8, 7], index=expected_index, name="value") tm.assert_series_equal(result, expected) - @pytest.mark.xfail(not IS64, reason="GH 23440") - @pytest.mark.parametrize( - "base", - [101, 1010], - ) + @pytest.mark.xfail(WASM, reason="GH 23440") + @pytest.mark.parametrize("base", [101, 1010]) def test_reindex_behavior_with_interval_index(self, base): # GH 51826 diff --git a/pandas/tests/indexing/interval/test_interval_new.py b/pandas/tests/indexing/interval/test_interval_new.py index 283921a23e368..051dc7b98f2aa 100644 --- a/pandas/tests/indexing/interval/test_interval_new.py +++ b/pandas/tests/indexing/interval/test_interval_new.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from pandas.compat import IS64 +from pandas.compat import WASM from pandas import ( Index, @@ -17,6 +17,9 @@ class TestIntervalIndex: @pytest.fixture def series_with_interval_index(self): + """ + Fixture providing a Series with an IntervalIndex. 
+ """ return Series(np.arange(5), IntervalIndex.from_breaks(np.arange(6))) def test_loc_with_interval(self, series_with_interval_index, indexer_sl): @@ -211,7 +214,7 @@ def test_loc_getitem_missing_key_error_message( obj.loc[[4, 5, 6]] -@pytest.mark.xfail(not IS64, reason="GH 23440") +@pytest.mark.xfail(WASM, reason="GH 23440") @pytest.mark.parametrize( "intervals", [ diff --git a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py index 0dd1a56890fee..c7ed21a2cc001 100644 --- a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py +++ b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py @@ -1,19 +1,17 @@ import numpy as np -import pytest from pandas._libs import index as libindex -from pandas.errors import SettingWithCopyError -import pandas.util._test_decorators as td from pandas import ( DataFrame, MultiIndex, + RangeIndex, Series, ) import pandas._testing as tm -def test_detect_chained_assignment(using_copy_on_write, warn_copy_on_write): +def test_detect_chained_assignment(): # Inplace ops, originally from: # https://stackoverflow.com/questions/20508968/series-fillna-in-a-multiindex-dataframe-does-not-fill-is-this-a-bug a = [12, 23] @@ -30,21 +28,11 @@ def test_detect_chained_assignment(using_copy_on_write, warn_copy_on_write): multiind = MultiIndex.from_tuples(tuples, names=["part", "side"]) zed = DataFrame(events, index=["a", "b"], columns=multiind) - if using_copy_on_write: - with tm.raises_chained_assignment_error(): - zed["eyes"]["right"].fillna(value=555, inplace=True) - elif warn_copy_on_write: - with tm.assert_produces_warning(None): - zed["eyes"]["right"].fillna(value=555, inplace=True) - else: - msg = "A value is trying to be set on a copy of a slice from a DataFrame" - with pytest.raises(SettingWithCopyError, match=msg): - with tm.assert_produces_warning(None): - zed["eyes"]["right"].fillna(value=555, inplace=True) - - -@td.skip_array_manager_invalid_test # with ArrayManager df.loc[0] is not a view -def test_cache_updating(using_copy_on_write, warn_copy_on_write): + with tm.raises_chained_assignment_error(): + zed["eyes"]["right"].fillna(value=555, inplace=True) + + +def test_cache_updating(): # 5216 # make sure that we don't try to set a dead cache a = np.random.default_rng(2).random((10, 3)) @@ -60,11 +48,7 @@ def test_cache_updating(using_copy_on_write, warn_copy_on_write): with tm.raises_chained_assignment_error(): df.loc[0]["z"].iloc[0] = 1.0 - if using_copy_on_write: - assert df.loc[(0, 0), "z"] == df_original.loc[0, "z"] - else: - result = df.loc[(0, 0), "z"] - assert result == 1 + assert df.loc[(0, 0), "z"] == df_original.loc[0, "z"] # correct setting df.loc[(0, 0), "z"] = 2 @@ -85,3 +69,19 @@ def test_indexer_caching(monkeypatch): s[s == 0] = 1 expected = Series(np.ones(size_cutoff), index=index) tm.assert_series_equal(s, expected) + + +def test_set_names_only_clears_level_cache(): + mi = MultiIndex.from_arrays([range(4), range(4)], names=["a", "b"]) + mi.dtypes + mi.is_monotonic_increasing + mi._engine + mi.levels + old_cache_keys = sorted(mi._cache.keys()) + assert old_cache_keys == ["_engine", "dtypes", "is_monotonic_increasing", "levels"] + mi.names = ["A", "B"] + new_cache_keys = sorted(mi._cache.keys()) + assert new_cache_keys == ["_engine", "dtypes", "is_monotonic_increasing"] + new_levels = mi.levels + tm.assert_index_equal(new_levels[0], RangeIndex(4, name="A")) + tm.assert_index_equal(new_levels[1], RangeIndex(4, name="B")) diff --git 
a/pandas/tests/indexing/multiindex/test_loc.py b/pandas/tests/indexing/multiindex/test_loc.py index 5508153322adb..1d3258ab18a61 100644 --- a/pandas/tests/indexing/multiindex/test_loc.py +++ b/pandas/tests/indexing/multiindex/test_loc.py @@ -1,10 +1,7 @@ import numpy as np import pytest -from pandas.errors import ( - IndexingError, - PerformanceWarning, -) +from pandas.errors import IndexingError import pandas as pd from pandas import ( @@ -16,14 +13,6 @@ import pandas._testing as tm -@pytest.fixture -def single_level_multiindex(): - """single level MultiIndex""" - return MultiIndex( - levels=[["foo", "bar", "baz", "qux"]], codes=[[0, 1, 2, 3]], names=["first"] - ) - - @pytest.fixture def frame_random_data_integer_multi_index(): levels = [[0, 1], [0, 1, 2]] @@ -44,7 +33,7 @@ def test_loc_setitem_frame_with_multiindex(self, multiindex_dataframe_random_dat df.loc[("bar", "two"), 1] = 7 assert df.loc[("bar", "two"), 1] == 7 - def test_loc_getitem_general(self, any_real_numpy_dtype): + def test_loc_getitem_general(self, performance_warning, any_real_numpy_dtype): # GH#2817 dtype = any_real_numpy_dtype data = { @@ -57,8 +46,7 @@ def test_loc_getitem_general(self, any_real_numpy_dtype): df = df.set_index(keys=["col", "num"]) key = 4.0, 12 - # emits a PerformanceWarning, ok - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): tm.assert_frame_equal(df.loc[key], df.iloc[2:]) # this is ok @@ -117,7 +105,7 @@ def test_loc_getitem_series(self): empty = Series(data=[], dtype=np.float64) expected = Series( [], - index=MultiIndex(levels=index.levels, codes=[[], []], dtype=np.float64), + index=MultiIndex(levels=index.levels, codes=[[], []]), dtype=np.float64, ) result = x.loc[empty] @@ -141,7 +129,7 @@ def test_loc_getitem_array(self): empty = np.array([]) expected = Series( [], - index=MultiIndex(levels=index.levels, codes=[[], []], dtype=np.float64), + index=MultiIndex(levels=index.levels, codes=[[], []]), dtype="float64", ) result = x.loc[empty] @@ -277,8 +265,10 @@ def test_loc_multiindex_incomplete(self): result = s.loc[2:4:2, "a":"c"] tm.assert_series_equal(result, expected) - def test_get_loc_single_level(self, single_level_multiindex): - single_level = single_level_multiindex + def test_get_loc_single_level(self): + single_level = MultiIndex( + levels=[["foo", "bar", "baz", "qux"]], codes=[[0, 1, 2, 3]], names=["first"] + ) s = Series( np.random.default_rng(2).standard_normal(len(single_level)), index=single_level, @@ -391,6 +381,29 @@ def test_multiindex_setitem_columns_enlarging(self, indexer, exp_value): ) tm.assert_frame_equal(df, expected) + def test_multiindex_setitem_axis_set(self): + # GH#58116 + dates = pd.date_range("2001-01-01", freq="D", periods=2) + ids = ["i1", "i2", "i3"] + index = MultiIndex.from_product([dates, ids], names=["date", "identifier"]) + df = DataFrame(0.0, index=index, columns=["A", "B"]) + df.loc(axis=0)["2001-01-01", ["i1", "i3"]] = None + + expected = DataFrame( + [ + [None, None], + [0.0, 0.0], + [None, None], + [0.0, 0.0], + [0.0, 0.0], + [0.0, 0.0], + ], + index=index, + columns=["A", "B"], + ) + + tm.assert_frame_equal(df, expected) + def test_sorted_multiindex_after_union(self): # GH#44752 midx = MultiIndex.from_product( @@ -588,7 +601,7 @@ def test_loc_nan_multiindex(using_infer_string): np.ones((1, 4)), index=Index( [np.nan], - dtype="object" if not using_infer_string else "string[pyarrow_numpy]", + dtype="object" if not using_infer_string else "str", name="u3", ), columns=Index(["d1", "d2", "d3", 
"d4"]), diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index 36cc8316ea5ff..7140ad7d1e9f5 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -2,7 +2,6 @@ import pytest import pandas._libs.index as libindex -from pandas.errors import PerformanceWarning import pandas as pd from pandas import ( @@ -17,7 +16,7 @@ class TestMultiIndexBasic: - def test_multiindex_perf_warn(self): + def test_multiindex_perf_warn(self, performance_warning): df = DataFrame( { "jim": [0, 0, 1, 1], @@ -26,11 +25,11 @@ def test_multiindex_perf_warn(self): } ).set_index(["jim", "joe"]) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): df.loc[(1, "z")] df = df.iloc[[2, 1, 3, 0]] - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): df.loc[(0,)] @pytest.mark.parametrize("offset", [-5, 5]) @@ -233,3 +232,20 @@ def test_multiindex_from_tuples_with_nan(self): [("a", "b", "c"), (np.nan, np.nan, np.nan), ("d", "", "")] ) tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("operation", ["div", "mul", "add", "sub"]) + def test_groupyby_rename_categories_operation_with_multiindex(self, operation): + # GH#51500 + data = DataFrame( + [["C", "B", "B"], ["B", "A", "A"], ["B", "A", "B"]], columns=["0", "1", "2"] + ) + data["0"] = data["0"].astype("category") + data["0"] = data["0"].cat.rename_categories({"C": "B", "B": "C"}) + + a = data.groupby(by=["0", "1"])["2"].value_counts() + b = data.groupby(by=["0", "1"]).size() + + result = getattr(a, operation)(b) + expected = getattr(a, operation)(b.sort_index(ascending=False)) + + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexing/multiindex/test_partial.py b/pandas/tests/indexing/multiindex/test_partial.py index fdf88b2a97e46..dbfabf7666d25 100644 --- a/pandas/tests/indexing/multiindex/test_partial.py +++ b/pandas/tests/indexing/multiindex/test_partial.py @@ -1,8 +1,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas import ( DataFrame, DatetimeIndex, @@ -95,7 +93,7 @@ def test_fancy_slice_partial( tm.assert_frame_equal(result, expected) ymd = multiindex_year_month_day_dataframe_random_data - result = ymd.loc[(2000, 2):(2000, 4)] + result = ymd.loc[(2000, 2) : (2000, 4)] lev = ymd.index.codes[1] expected = ymd[(lev >= 1) & (lev <= 3)] tm.assert_frame_equal(result, expected) @@ -118,14 +116,9 @@ def test_getitem_partial_column_select(self): with pytest.raises(KeyError, match=r"\('a', 'foo'\)"): df.loc[("a", "foo"), :] - # TODO(ArrayManager) rewrite test to not use .values - # exp.loc[2000, 4].values[:] select multiple columns -> .values is not a view - @td.skip_array_manager_invalid_test def test_partial_set( self, multiindex_year_month_day_dataframe_random_data, - using_copy_on_write, - warn_copy_on_write, ): # GH #397 ymd = multiindex_year_month_day_dataframe_random_data @@ -135,13 +128,9 @@ def test_partial_set( exp.iloc[65:85] = 0 tm.assert_frame_equal(df, exp) - if using_copy_on_write: - with tm.raises_chained_assignment_error(): - df["A"].loc[2000, 4] = 1 - df.loc[(2000, 4), "A"] = 1 - else: - with tm.raises_chained_assignment_error(): - df["A"].loc[2000, 4] = 1 + with tm.raises_chained_assignment_error(): + df["A"].loc[2000, 4] = 1 + df.loc[(2000, 4), "A"] = 1 exp.iloc[65:85, 0] = 1 tm.assert_frame_equal(df, exp) @@ -152,10 +141,7 @@ def 
test_partial_set( # this works...for now with tm.raises_chained_assignment_error(): df["A"].iloc[14] = 5 - if using_copy_on_write: - assert df["A"].iloc[14] == exp["A"].iloc[14] - else: - assert df["A"].iloc[14] == 5 + assert df["A"].iloc[14] == exp["A"].iloc[14] @pytest.mark.parametrize("dtype", [int, float]) def test_getitem_intkey_leading_level( diff --git a/pandas/tests/indexing/multiindex/test_setitem.py b/pandas/tests/indexing/multiindex/test_setitem.py index 53ad4d6b41687..d732cb4d7fbbc 100644 --- a/pandas/tests/indexing/multiindex/test_setitem.py +++ b/pandas/tests/indexing/multiindex/test_setitem.py @@ -1,9 +1,6 @@ import numpy as np import pytest -from pandas.errors import SettingWithCopyError -import pandas.util._test_decorators as td - import pandas as pd from pandas import ( DataFrame, @@ -126,9 +123,6 @@ def test_setitem_multiindex3(self): expected=copy, ) - # TODO(ArrayManager) df.loc["bar"] *= 2 doesn't raise an error but results in - # all NaNs -> doesn't work in the "split" path (also for BlockManager actually) - @td.skip_array_manager_not_yet_implemented def test_multiindex_setitem(self): # GH 3738 # setting with a multi-index right hand side @@ -200,9 +194,7 @@ def test_multiindex_assignment(self): df.loc[4, "d"] = arr tm.assert_series_equal(df.loc[4, "d"], Series(arr, index=[8, 10], name="d")) - def test_multiindex_assignment_single_dtype( - self, using_copy_on_write, warn_copy_on_write - ): + def test_multiindex_assignment_single_dtype(self): # GH3777 part 2b # single dtype arr = np.array([0.0, 1.0]) @@ -213,32 +205,22 @@ def test_multiindex_assignment_single_dtype( index=[[4, 4, 8], [8, 10, 12]], dtype=np.int64, ) - view = df["c"].iloc[:2].values # arr can be losslessly cast to int, so this setitem is inplace - # INFO(CoW-warn) this does not warn because we directly took .values - # above, so no reference to a pandas object is alive for `view` df.loc[4, "c"] = arr exp = Series(arr, index=[8, 10], name="c", dtype="int64") result = df.loc[4, "c"] tm.assert_series_equal(result, exp) - # extra check for inplace-ness - if not using_copy_on_write: - tm.assert_numpy_array_equal(view, exp.values) - # arr + 0.5 cannot be cast losslessly to int, so we upcast - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): df.loc[4, "c"] = arr + 0.5 - result = df.loc[4, "c"] - exp = exp + 0.5 - tm.assert_series_equal(result, exp) + # Upcast so that we can add .5 + df = df.astype({"c": "float64"}) + df.loc[4, "c"] = arr + 0.5 # scalar ok - with tm.assert_cow_warning(warn_copy_on_write): - df.loc[4, "c"] = 10 + df.loc[4, "c"] = 10 exp = Series(10, index=[8, 10], name="c", dtype="float64") tm.assert_series_equal(df.loc[4, "c"], exp) @@ -252,8 +234,7 @@ def test_multiindex_assignment_single_dtype( # But with a length-1 listlike column indexer this behaves like # `df.loc[4, "c"] = 0 - with tm.assert_cow_warning(warn_copy_on_write): - df.loc[4, ["c"]] = [0] + df.loc[4, ["c"]] = [0] assert (df.loc[4, "c"] == 0).all() def test_groupby_example(self): @@ -278,20 +259,16 @@ def test_groupby_example(self): new_vals = np.arange(df2.shape[0]) df.loc[name, "new_col"] = new_vals - def test_series_setitem( - self, multiindex_year_month_day_dataframe_random_data, warn_copy_on_write - ): + def test_series_setitem(self, multiindex_year_month_day_dataframe_random_data): ymd = multiindex_year_month_day_dataframe_random_data s = ymd["A"] - with tm.assert_cow_warning(warn_copy_on_write): - s[2000, 3] = np.nan + s[2000, 
3] = np.nan assert isna(s.values[42:65]).all() assert notna(s.values[:42]).all() assert notna(s.values[65:]).all() - with tm.assert_cow_warning(warn_copy_on_write): - s[2000, 3, 10] = np.nan + s[2000, 3, 10] = np.nan assert isna(s.iloc[49]) with pytest.raises(KeyError, match="49"): @@ -426,9 +403,7 @@ def test_setitem_change_dtype(self, multiindex_dataframe_random_data): reindexed = dft.reindex(columns=[("foo", "two")]) tm.assert_series_equal(reindexed["foo", "two"], s > s.median()) - def test_set_column_scalar_with_loc( - self, multiindex_dataframe_random_data, using_copy_on_write, warn_copy_on_write - ): + def test_set_column_scalar_with_loc(self, multiindex_dataframe_random_data): frame = multiindex_dataframe_random_data subset = frame.index[[1, 4, 5]] @@ -437,13 +412,9 @@ def test_set_column_scalar_with_loc( frame_original = frame.copy() col = frame["B"] - with tm.assert_cow_warning(warn_copy_on_write): - col[subset] = 97 - if using_copy_on_write: - # chained setitem doesn't work with CoW - tm.assert_frame_equal(frame, frame_original) - else: - assert (frame.loc[subset, "B"] == 97).all() + col[subset] = 97 + # chained setitem doesn't work with CoW + tm.assert_frame_equal(frame, frame_original) def test_nonunique_assignment_1750(self): df = DataFrame( @@ -520,55 +491,30 @@ def test_setitem_enlargement_keep_index_names(self): tm.assert_frame_equal(df, expected) -@td.skip_array_manager_invalid_test # df["foo"] select multiple columns -> .values -# is not a view -def test_frame_setitem_view_direct( - multiindex_dataframe_random_data, using_copy_on_write -): +def test_frame_setitem_view_direct(multiindex_dataframe_random_data): # this works because we are modifying the underlying array # really a no-no df = multiindex_dataframe_random_data.T - if using_copy_on_write: - with pytest.raises(ValueError, match="read-only"): - df["foo"].values[:] = 0 - assert (df["foo"].values != 0).all() - else: + with pytest.raises(ValueError, match="read-only"): df["foo"].values[:] = 0 - assert (df["foo"].values == 0).all() + assert (df["foo"].values != 0).all() -def test_frame_setitem_copy_raises( - multiindex_dataframe_random_data, using_copy_on_write, warn_copy_on_write -): +def test_frame_setitem_copy_raises(multiindex_dataframe_random_data): # will raise/warn as its chained assignment df = multiindex_dataframe_random_data.T - if using_copy_on_write or warn_copy_on_write: - with tm.raises_chained_assignment_error(): - df["foo"]["one"] = 2 - else: - msg = "A value is trying to be set on a copy of a slice from a DataFrame" - with pytest.raises(SettingWithCopyError, match=msg): - with tm.raises_chained_assignment_error(): - df["foo"]["one"] = 2 - - -def test_frame_setitem_copy_no_write( - multiindex_dataframe_random_data, using_copy_on_write, warn_copy_on_write -): + with tm.raises_chained_assignment_error(): + df["foo"]["one"] = 2 + + +def test_frame_setitem_copy_no_write(multiindex_dataframe_random_data): frame = multiindex_dataframe_random_data.T expected = frame df = frame.copy() - if using_copy_on_write or warn_copy_on_write: - with tm.raises_chained_assignment_error(): - df["foo"]["one"] = 2 - else: - msg = "A value is trying to be set on a copy of a slice from a DataFrame" - with pytest.raises(SettingWithCopyError, match=msg): - with tm.raises_chained_assignment_error(): - df["foo"]["one"] = 2 - - result = df - tm.assert_frame_equal(result, expected) + with tm.raises_chained_assignment_error(): + df["foo"]["one"] = 2 + + tm.assert_frame_equal(df, expected) def test_frame_setitem_partial_multiindex(): 
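
The rewritten chained-assignment tests above (test_partial_set, test_frame_setitem_copy_raises, test_frame_setitem_copy_no_write) all encode the same Copy-on-Write contract: an assignment through a chained lookup such as df["foo"]["one"] = 2 only ever touches a temporary object, so the parent DataFrame is left unchanged. The following is a minimal, self-contained sketch of that contract; it is illustrative only, not part of this patch, and assumes a pandas build with Copy-on-Write enabled (the default in pandas 3.x), which is what these tests now take for granted.

    import warnings

    import pandas as pd

    df = pd.DataFrame({"foo": [1.0, 2.0], "bar": [3.0, 4.0]})
    original = df.copy()

    with warnings.catch_warnings():
        # Under Copy-on-Write, pandas emits a ChainedAssignmentError warning
        # telling you this assignment can never update `df`.
        warnings.simplefilter("ignore")
        df["foo"][0] = 99.0  # writes to a temporary copy only

    # The parent frame is untouched, which is exactly what the updated tests
    # assert via tm.raises_chained_assignment_error().
    pd.testing.assert_frame_equal(df, original)
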
diff --git a/pandas/tests/indexing/multiindex/test_slice.py b/pandas/tests/indexing/multiindex/test_slice.py index cef3dca054758..7f298e9bdd375 100644 --- a/pandas/tests/indexing/multiindex/test_slice.py +++ b/pandas/tests/indexing/multiindex/test_slice.py @@ -700,21 +700,23 @@ def test_multiindex_label_slicing_with_negative_step(self): tm.assert_indexing_slices_equivalent(ser, SLC[::-1], SLC[::-1]) tm.assert_indexing_slices_equivalent(ser, SLC["d"::-1], SLC[15::-1]) - tm.assert_indexing_slices_equivalent(ser, SLC[("d",)::-1], SLC[15::-1]) + tm.assert_indexing_slices_equivalent(ser, SLC[("d",) :: -1], SLC[15::-1]) tm.assert_indexing_slices_equivalent(ser, SLC[:"d":-1], SLC[:11:-1]) - tm.assert_indexing_slices_equivalent(ser, SLC[:("d",):-1], SLC[:11:-1]) + tm.assert_indexing_slices_equivalent(ser, SLC[: ("d",) : -1], SLC[:11:-1]) tm.assert_indexing_slices_equivalent(ser, SLC["d":"b":-1], SLC[15:3:-1]) - tm.assert_indexing_slices_equivalent(ser, SLC[("d",):"b":-1], SLC[15:3:-1]) - tm.assert_indexing_slices_equivalent(ser, SLC["d":("b",):-1], SLC[15:3:-1]) - tm.assert_indexing_slices_equivalent(ser, SLC[("d",):("b",):-1], SLC[15:3:-1]) + tm.assert_indexing_slices_equivalent(ser, SLC[("d",) : "b" : -1], SLC[15:3:-1]) + tm.assert_indexing_slices_equivalent(ser, SLC["d" : ("b",) : -1], SLC[15:3:-1]) + tm.assert_indexing_slices_equivalent( + ser, SLC[("d",) : ("b",) : -1], SLC[15:3:-1] + ) tm.assert_indexing_slices_equivalent(ser, SLC["b":"d":-1], SLC[:0]) - tm.assert_indexing_slices_equivalent(ser, SLC[("c", 2)::-1], SLC[10::-1]) - tm.assert_indexing_slices_equivalent(ser, SLC[:("c", 2):-1], SLC[:9:-1]) + tm.assert_indexing_slices_equivalent(ser, SLC[("c", 2) :: -1], SLC[10::-1]) + tm.assert_indexing_slices_equivalent(ser, SLC[: ("c", 2) : -1], SLC[:9:-1]) tm.assert_indexing_slices_equivalent( - ser, SLC[("e", 0):("c", 2):-1], SLC[16:9:-1] + ser, SLC[("e", 0) : ("c", 2) : -1], SLC[16:9:-1] ) def test_multiindex_slice_first_level(self): diff --git a/pandas/tests/indexing/test_at.py b/pandas/tests/indexing/test_at.py index d78694018749c..e80acc230a320 100644 --- a/pandas/tests/indexing/test_at.py +++ b/pandas/tests/indexing/test_at.py @@ -13,7 +13,6 @@ CategoricalIndex, DataFrame, DatetimeIndex, - Index, MultiIndex, Series, Timestamp, @@ -24,12 +23,8 @@ def test_at_timezone(): # https://github.com/pandas-dev/pandas/issues/33544 result = DataFrame({"foo": [datetime(2000, 1, 1)]}) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): result.at[0, "foo"] = datetime(2000, 1, 2, tzinfo=timezone.utc) - expected = DataFrame( - {"foo": [datetime(2000, 1, 2, tzinfo=timezone.utc)]}, dtype=object - ) - tm.assert_frame_equal(result, expected) def test_selection_methods_of_assigned_col(): @@ -71,11 +66,7 @@ def test_at_setitem_item_cache_cleared(self): df.at[0, "x"] = 4 df.at[0, "cost"] = 789 - expected = DataFrame( - {"x": [4], "cost": 789}, - index=[0], - columns=Index(["x", "cost"], dtype=object), - ) + expected = DataFrame({"x": [4], "cost": 789}, index=[0]) tm.assert_frame_equal(df, expected) # And in particular, check that the _item_cache has updated correctly. 
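
The test_at_timezone change above reflects the stricter setitem behavior: assigning a value that cannot be held by the column's dtype now raises a TypeError instead of upcasting the column to object with a FutureWarning. Below is a short illustrative sketch, not part of this patch; it assumes a pandas version with the new behavior, and the error text is inferred from the test's match="Invalid value" pattern.

    from datetime import datetime, timezone

    import pandas as pd

    df = pd.DataFrame({"foo": [datetime(2000, 1, 1)]})  # tz-naive datetime64 column

    try:
        # A tz-aware timestamp does not fit a tz-naive datetime64 column;
        # pandas no longer silently upcasts the column to object dtype.
        df.at[0, "foo"] = datetime(2000, 1, 2, tzinfo=timezone.utc)
    except TypeError as err:
        print(err)  # message contains "Invalid value", per the test's match=
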
@@ -136,7 +127,11 @@ def test_at_datetime_index(self, row): class TestAtSetItemWithExpansion: def test_at_setitem_expansion_series_dt64tz_value(self, tz_naive_fixture): # GH#25506 - ts = Timestamp("2017-08-05 00:00:00+0100", tz=tz_naive_fixture) + ts = ( + Timestamp("2017-08-05 00:00:00+0100", tz=tz_naive_fixture) + if tz_naive_fixture is not None + else Timestamp("2017-08-05 00:00:00+0100") + ) result = Series(ts) result.at[1] = ts expected = Series([ts, ts]) diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 1b58f8e8b9831..c9f29b2cb55fe 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -511,13 +511,13 @@ def test_loc_and_at_with_categorical_index(self): # pandas scalars [Interval(1, 4), Interval(4, 6), Interval(6, 9)], [Timestamp(2019, 1, 1), Timestamp(2019, 2, 1), Timestamp(2019, 3, 1)], - [Timedelta(1, "d"), Timedelta(2, "d"), Timedelta(3, "D")], + [Timedelta(1, "D"), Timedelta(2, "D"), Timedelta(3, "D")], # pandas Integer arrays *(pd.array([1, 2, 3], dtype=dtype) for dtype in tm.ALL_INT_EA_DTYPES), # other pandas arrays pd.IntervalIndex.from_breaks([1, 4, 6, 9]).array, pd.date_range("2019-01-01", periods=3).array, - pd.timedelta_range(start="1d", periods=3).array, + pd.timedelta_range(start="1D", periods=3).array, ], ) def test_loc_getitem_with_non_string_categories(self, idx_values, ordered): diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index b97df376ac47f..64d8068fa9291 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -3,12 +3,6 @@ import numpy as np import pytest -from pandas.errors import ( - SettingWithCopyError, - SettingWithCopyWarning, -) -import pandas.util._test_decorators as td - import pandas as pd from pandas import ( DataFrame, @@ -23,17 +17,8 @@ msg = "A value is trying to be set on a copy of a slice from a DataFrame" -def random_text(nobs=100): - # Construct a DataFrame where each row is a random slice from 'letters' - idxs = np.random.default_rng(2).integers(len(ascii_letters), size=(nobs, 2)) - idxs.sort(axis=1) - strings = [ascii_letters[x[0] : x[1]] for x in idxs] - - return DataFrame(strings, columns=["letters"]) - - class TestCaching: - def test_slice_consolidate_invalidate_item_cache(self, using_copy_on_write): + def test_slice_consolidate_invalidate_item_cache(self): # this is chained assignment, but will 'work' with option_context("chained_assignment", None): # #3970 @@ -48,12 +33,7 @@ def test_slice_consolidate_invalidate_item_cache(self, using_copy_on_write): # Assignment to wrong series with tm.raises_chained_assignment_error(): df["bb"].iloc[0] = 0.17 - df._clear_item_cache() - if not using_copy_on_write: - tm.assert_almost_equal(df["bb"][0], 0.17) - else: - # with ArrayManager, parent is not mutated with chained assignment - tm.assert_almost_equal(df["bb"][0], 2.2) + tm.assert_almost_equal(df["bb"][0], 2.2) @pytest.mark.parametrize("do_ref", [True, False]) def test_setitem_cache_updating(self, do_ref): @@ -72,9 +52,7 @@ def test_setitem_cache_updating(self, do_ref): assert df.loc[0, "c"] == 0.0 assert df.loc[7, "c"] == 1.0 - def test_setitem_cache_updating_slices( - self, using_copy_on_write, warn_copy_on_write - ): + def test_setitem_cache_updating_slices(self): # GH 7084 # not updating cache on series setting with slices expected = DataFrame( @@ -98,17 +76,11 @@ def test_setitem_cache_updating_slices( 
out_original = out.copy() for ix, row in df.iterrows(): v = out[row["C"]][six:eix] + row["D"] - with tm.raises_chained_assignment_error( - (ix == 0) or warn_copy_on_write or using_copy_on_write - ): + with tm.raises_chained_assignment_error(): out[row["C"]][six:eix] = v - if not using_copy_on_write: - tm.assert_frame_equal(out, expected) - tm.assert_series_equal(out["A"], expected["A"]) - else: - tm.assert_frame_equal(out, out_original) - tm.assert_series_equal(out["A"], out_original["A"]) + tm.assert_frame_equal(out, out_original) + tm.assert_series_equal(out["A"], out_original["A"]) out = DataFrame({"A": [0, 0, 0]}, index=date_range("5/7/2014", "5/9/2014")) for ix, row in df.iterrows(): @@ -117,71 +89,50 @@ def test_setitem_cache_updating_slices( tm.assert_frame_equal(out, expected) tm.assert_series_equal(out["A"], expected["A"]) - def test_altering_series_clears_parent_cache( - self, using_copy_on_write, warn_copy_on_write - ): + def test_altering_series_clears_parent_cache(self): # GH #33675 df = DataFrame([[1, 2], [3, 4]], index=["a", "b"], columns=["A", "B"]) ser = df["A"] - if using_copy_on_write or warn_copy_on_write: - assert "A" not in df._item_cache - else: - assert "A" in df._item_cache - # Adding a new entry to ser swaps in a new array, so "A" needs to # be removed from df._item_cache ser["c"] = 5 assert len(ser) == 3 - assert "A" not in df._item_cache assert df["A"] is not ser assert len(df["A"]) == 2 class TestChaining: - def test_setitem_chained_setfault(self, using_copy_on_write): + def test_setitem_chained_setfault(self): # GH6026 data = ["right", "left", "left", "left", "right", "left", "timeout"] - mdata = ["right", "left", "left", "left", "right", "left", "none"] df = DataFrame({"response": np.array(data)}) mask = df.response == "timeout" with tm.raises_chained_assignment_error(): df.response[mask] = "none" - if using_copy_on_write: - tm.assert_frame_equal(df, DataFrame({"response": data})) - else: - tm.assert_frame_equal(df, DataFrame({"response": mdata})) + tm.assert_frame_equal(df, DataFrame({"response": data})) recarray = np.rec.fromarrays([data], names=["response"]) df = DataFrame(recarray) mask = df.response == "timeout" with tm.raises_chained_assignment_error(): df.response[mask] = "none" - if using_copy_on_write: - tm.assert_frame_equal(df, DataFrame({"response": data})) - else: - tm.assert_frame_equal(df, DataFrame({"response": mdata})) + tm.assert_frame_equal(df, DataFrame({"response": data})) df = DataFrame({"response": data, "response1": data}) df_original = df.copy() mask = df.response == "timeout" with tm.raises_chained_assignment_error(): df.response[mask] = "none" - if using_copy_on_write: - tm.assert_frame_equal(df, df_original) - else: - tm.assert_frame_equal(df, DataFrame({"response": mdata, "response1": data})) + tm.assert_frame_equal(df, df_original) # GH 6056 expected = DataFrame({"A": [np.nan, "bar", "bah", "foo", "bar"]}) df = DataFrame({"A": np.array(["foo", "bar", "bah", "foo", "bar"])}) with tm.raises_chained_assignment_error(): df["A"].iloc[0] = np.nan - if using_copy_on_write: - expected = DataFrame({"A": ["foo", "bar", "bah", "foo", "bar"]}) - else: - expected = DataFrame({"A": [np.nan, "bar", "bah", "foo", "bar"]}) + expected = DataFrame({"A": ["foo", "bar", "bah", "foo", "bar"]}) result = df.head() tm.assert_frame_equal(result, expected) @@ -192,29 +143,22 @@ def test_setitem_chained_setfault(self, using_copy_on_write): tm.assert_frame_equal(result, expected) @pytest.mark.arm_slow - def test_detect_chained_assignment(self, 
using_copy_on_write): + def test_detect_chained_assignment(self): with option_context("chained_assignment", "raise"): # work with the chain - expected = DataFrame([[-5, 1], [-6, 3]], columns=list("AB")) df = DataFrame( np.arange(4).reshape(2, 2), columns=list("AB"), dtype="int64" ) df_original = df.copy() - assert df._is_copy is None with tm.raises_chained_assignment_error(): df["A"][0] = -5 with tm.raises_chained_assignment_error(): df["A"][1] = -6 - if using_copy_on_write: - tm.assert_frame_equal(df, df_original) - else: - tm.assert_frame_equal(df, expected) + tm.assert_frame_equal(df, df_original) @pytest.mark.arm_slow - def test_detect_chained_assignment_raises( - self, using_array_manager, using_copy_on_write, warn_copy_on_write - ): + def test_detect_chained_assignment_raises(self): # test with the chaining df = DataFrame( { @@ -223,42 +167,14 @@ def test_detect_chained_assignment_raises( } ) df_original = df.copy() - assert df._is_copy is None - - if using_copy_on_write: - with tm.raises_chained_assignment_error(): - df["A"][0] = -5 - with tm.raises_chained_assignment_error(): - df["A"][1] = -6 - tm.assert_frame_equal(df, df_original) - elif warn_copy_on_write: - with tm.raises_chained_assignment_error(): - df["A"][0] = -5 - with tm.raises_chained_assignment_error(): - df["A"][1] = np.nan - elif not using_array_manager: - with pytest.raises(SettingWithCopyError, match=msg): - with tm.raises_chained_assignment_error(): - df["A"][0] = -5 - - with pytest.raises(SettingWithCopyError, match=msg): - with tm.raises_chained_assignment_error(): - df["A"][1] = np.nan - - assert df["A"]._is_copy is None - else: - # INFO(ArrayManager) for ArrayManager it doesn't matter that it's - # a mixed dataframe + with tm.raises_chained_assignment_error(): df["A"][0] = -5 + with tm.raises_chained_assignment_error(): df["A"][1] = -6 - expected = DataFrame([[-5, 2], [-6, 3]], columns=list("AB")) - expected["B"] = expected["B"].astype("float64") - tm.assert_frame_equal(df, expected) + tm.assert_frame_equal(df, df_original) @pytest.mark.arm_slow - def test_detect_chained_assignment_fails( - self, using_copy_on_write, warn_copy_on_write - ): + def test_detect_chained_assignment_fails(self): # Using a copy (the chain), fails df = DataFrame( { @@ -267,17 +183,11 @@ def test_detect_chained_assignment_fails( } ) - if using_copy_on_write or warn_copy_on_write: - with tm.raises_chained_assignment_error(): - df.loc[0]["A"] = -5 - else: - with pytest.raises(SettingWithCopyError, match=msg): - df.loc[0]["A"] = -5 + with tm.raises_chained_assignment_error(): + df.loc[0]["A"] = -5 @pytest.mark.arm_slow - def test_detect_chained_assignment_doc_example( - self, using_copy_on_write, warn_copy_on_write - ): + def test_detect_chained_assignment_doc_example(self): # Doc example df = DataFrame( { @@ -285,127 +195,43 @@ def test_detect_chained_assignment_doc_example( "c": Series(range(7), dtype="int64"), } ) - assert df._is_copy is None indexer = df.a.str.startswith("o") - if using_copy_on_write or warn_copy_on_write: - with tm.raises_chained_assignment_error(): - df[indexer]["c"] = 42 - else: - with pytest.raises(SettingWithCopyError, match=msg): - df[indexer]["c"] = 42 + with tm.raises_chained_assignment_error(): + df[indexer]["c"] = 42 @pytest.mark.arm_slow - def test_detect_chained_assignment_object_dtype( - self, using_array_manager, using_copy_on_write, warn_copy_on_write - ): - expected = DataFrame({"A": [111, "bbb", "ccc"], "B": [1, 2, 3]}) + def test_detect_chained_assignment_object_dtype(self): df = DataFrame( {"A": 
Series(["aaa", "bbb", "ccc"], dtype=object), "B": [1, 2, 3]} ) df_original = df.copy() - if not using_copy_on_write and not warn_copy_on_write: - with pytest.raises(SettingWithCopyError, match=msg): - df.loc[0]["A"] = 111 - - if using_copy_on_write: - with tm.raises_chained_assignment_error(): - df["A"][0] = 111 - tm.assert_frame_equal(df, df_original) - elif warn_copy_on_write: - with tm.raises_chained_assignment_error(): - df["A"][0] = 111 - tm.assert_frame_equal(df, expected) - elif not using_array_manager: - with pytest.raises(SettingWithCopyError, match=msg): - with tm.raises_chained_assignment_error(): - df["A"][0] = 111 - - df.loc[0, "A"] = 111 - tm.assert_frame_equal(df, expected) - else: - # INFO(ArrayManager) for ArrayManager it doesn't matter that it's - # a mixed dataframe + with tm.raises_chained_assignment_error(): df["A"][0] = 111 - tm.assert_frame_equal(df, expected) + tm.assert_frame_equal(df, df_original) @pytest.mark.arm_slow - def test_detect_chained_assignment_is_copy_pickle(self): + def test_detect_chained_assignment_is_copy_pickle(self, temp_file): # gh-5475: Make sure that is_copy is picked up reconstruction df = DataFrame({"A": [1, 2]}) - assert df._is_copy is None - - with tm.ensure_clean("__tmp__pickle") as path: - df.to_pickle(path) - df2 = pd.read_pickle(path) - df2["B"] = df2["A"] - df2["B"] = df2["A"] - - @pytest.mark.arm_slow - def test_detect_chained_assignment_setting_entire_column(self): - # gh-5597: a spurious raise as we are setting the entire column here - - df = random_text(100000) - - # Always a copy - x = df.iloc[[0, 1, 2]] - assert x._is_copy is not None - - x = df.iloc[[0, 1, 2, 4]] - assert x._is_copy is not None - - # Explicitly copy - indexer = df.letters.apply(lambda x: len(x) > 10) - df = df.loc[indexer].copy() - - assert df._is_copy is None - df["letters"] = df["letters"].apply(str.lower) - @pytest.mark.arm_slow - def test_detect_chained_assignment_implicit_take(self): - # Implicitly take - df = random_text(100000) - indexer = df.letters.apply(lambda x: len(x) > 10) - df = df.loc[indexer] - - assert df._is_copy is not None - df["letters"] = df["letters"].apply(str.lower) - - @pytest.mark.arm_slow - def test_detect_chained_assignment_implicit_take2( - self, using_copy_on_write, warn_copy_on_write - ): - if using_copy_on_write or warn_copy_on_write: - pytest.skip("_is_copy is not always set for CoW") - # Implicitly take 2 - df = random_text(100000) - indexer = df.letters.apply(lambda x: len(x) > 10) - - df = df.loc[indexer] - assert df._is_copy is not None - df.loc[:, "letters"] = df["letters"].apply(str.lower) - - # with the enforcement of #45333 in 2.0, the .loc[:, letters] setting - # is inplace, so df._is_copy remains non-None. 
- assert df._is_copy is not None - - df["letters"] = df["letters"].apply(str.lower) - assert df._is_copy is None + path = str(temp_file) + df.to_pickle(path) + df2 = pd.read_pickle(path) + df2["B"] = df2["A"] + df2["B"] = df2["A"] @pytest.mark.arm_slow def test_detect_chained_assignment_str(self): - df = random_text(100000) + idxs = np.random.default_rng(2).integers(len(ascii_letters), size=(100, 2)) + idxs.sort(axis=1) + strings = [ascii_letters[x[0] : x[1]] for x in idxs] + + df = DataFrame(strings, columns=["letters"]) indexer = df.letters.apply(lambda x: len(x) > 10) df.loc[indexer, "letters"] = df.loc[indexer, "letters"].apply(str.lower) - @pytest.mark.arm_slow - def test_detect_chained_assignment_is_copy(self): - # an identical take, so no copy - df = DataFrame({"a": [1]}).dropna() - assert df._is_copy is None - df["a"] += 1 - @pytest.mark.arm_slow def test_detect_chained_assignment_sorting(self): df = DataFrame(np.random.default_rng(2).standard_normal((10, 4))) @@ -430,31 +256,18 @@ def test_detect_chained_assignment_false_positives(self): str(df) @pytest.mark.arm_slow - def test_detect_chained_assignment_undefined_column( - self, using_copy_on_write, warn_copy_on_write - ): + def test_detect_chained_assignment_undefined_column(self): # from SO: # https://stackoverflow.com/questions/24054495/potential-bug-setting-value-for-undefined-column-using-iloc df = DataFrame(np.arange(0, 9), columns=["count"]) df["group"] = "b" df_original = df.copy() - - if using_copy_on_write: - with tm.raises_chained_assignment_error(): - df.iloc[0:5]["group"] = "a" - tm.assert_frame_equal(df, df_original) - elif warn_copy_on_write: - with tm.raises_chained_assignment_error(): - df.iloc[0:5]["group"] = "a" - else: - with pytest.raises(SettingWithCopyError, match=msg): - with tm.raises_chained_assignment_error(): - df.iloc[0:5]["group"] = "a" + with tm.raises_chained_assignment_error(): + df.iloc[0:5]["group"] = "a" + tm.assert_frame_equal(df, df_original) @pytest.mark.arm_slow - def test_detect_chained_assignment_changing_dtype( - self, using_array_manager, using_copy_on_write, warn_copy_on_write - ): + def test_detect_chained_assignment_changing_dtype(self): # Mixed type setting but same dtype & changing dtype df = DataFrame( { @@ -466,53 +279,26 @@ def test_detect_chained_assignment_changing_dtype( ) df_original = df.copy() - if using_copy_on_write or warn_copy_on_write: - with tm.raises_chained_assignment_error(): - df.loc[2]["D"] = "foo" + with tm.raises_chained_assignment_error(): + df.loc[2]["D"] = "foo" + with tm.raises_chained_assignment_error(): + df.loc[2]["C"] = "foo" + tm.assert_frame_equal(df, df_original) + # TODO: Use tm.raises_chained_assignment_error() when PDEP-6 is enforced + with pytest.raises(TypeError, match="Invalid value"): with tm.raises_chained_assignment_error(): - df.loc[2]["C"] = "foo" - tm.assert_frame_equal(df, df_original) - with tm.raises_chained_assignment_error(extra_warnings=(FutureWarning,)): - df["C"][2] = "foo" - if using_copy_on_write: - tm.assert_frame_equal(df, df_original) - else: - assert df.loc[2, "C"] == "foo" - else: - with pytest.raises(SettingWithCopyError, match=msg): - df.loc[2]["D"] = "foo" - - with pytest.raises(SettingWithCopyError, match=msg): - df.loc[2]["C"] = "foo" - - if not using_array_manager: - with pytest.raises(SettingWithCopyError, match=msg): - with tm.raises_chained_assignment_error(): - df["C"][2] = "foo" - else: - # INFO(ArrayManager) for ArrayManager it doesn't matter if it's - # changing the dtype or not df["C"][2] = "foo" - assert 
df.loc[2, "C"] == "foo" - def test_setting_with_copy_bug(self, using_copy_on_write, warn_copy_on_write): + def test_setting_with_copy_bug(self): # operating on a copy df = DataFrame( {"a": list(range(4)), "b": list("ab.."), "c": ["a", "b", np.nan, "d"]} ) df_original = df.copy() mask = pd.isna(df.c) - - if using_copy_on_write: - with tm.raises_chained_assignment_error(): - df[["c"]][mask] = df[["b"]][mask] - tm.assert_frame_equal(df, df_original) - elif warn_copy_on_write: - with tm.raises_chained_assignment_error(): - df[["c"]][mask] = df[["b"]][mask] - else: - with pytest.raises(SettingWithCopyError, match=msg): - df[["c"]][mask] = df[["b"]][mask] + with tm.raises_chained_assignment_error(): + df[["c"]][mask] = df[["b"]][mask] + tm.assert_frame_equal(df, df_original) def test_setting_with_copy_bug_no_warning(self): # invalid warning as we are returning a new object @@ -523,43 +309,20 @@ def test_setting_with_copy_bug_no_warning(self): # this should not raise df2["y"] = ["g", "h", "i"] - def test_detect_chained_assignment_warnings_errors( - self, using_copy_on_write, warn_copy_on_write - ): + def test_detect_chained_assignment_warnings_errors(self): df = DataFrame({"A": ["aaa", "bbb", "ccc"], "B": [1, 2, 3]}) - if using_copy_on_write or warn_copy_on_write: - with tm.raises_chained_assignment_error(): - df.loc[0]["A"] = 111 - return - - with option_context("chained_assignment", "warn"): - with tm.assert_produces_warning(SettingWithCopyWarning): - df.loc[0]["A"] = 111 - - with option_context("chained_assignment", "raise"): - with pytest.raises(SettingWithCopyError, match=msg): - df.loc[0]["A"] = 111 + with tm.raises_chained_assignment_error(): + df.loc[0]["A"] = 111 @pytest.mark.parametrize("rhs", [3, DataFrame({0: [1, 2, 3, 4]})]) - def test_detect_chained_assignment_warning_stacklevel( - self, rhs, using_copy_on_write, warn_copy_on_write - ): + def test_detect_chained_assignment_warning_stacklevel(self, rhs): # GH#42570 df = DataFrame(np.arange(25).reshape(5, 5)) df_original = df.copy() chained = df.loc[:3] - with option_context("chained_assignment", "warn"): - if not using_copy_on_write and not warn_copy_on_write: - with tm.assert_produces_warning(SettingWithCopyWarning) as t: - chained[2] = rhs - assert t[0].filename == __file__ - else: - # INFO(CoW) no warning, and original dataframe not changed - chained[2] = rhs - tm.assert_frame_equal(df, df_original) - - # TODO(ArrayManager) fast_xs with array-like scalars is not yet working - @td.skip_array_manager_not_yet_implemented + chained[2] = rhs + tm.assert_frame_equal(df, df_original) + def test_chained_getitem_with_lists(self): # GH6394 # Regression in chained getitem indexing with embedded list-like from @@ -588,7 +351,7 @@ def test_cache_updating(self): assert "Hello Friend" in df["A"].index assert "Hello Friend" in df["B"].index - def test_cache_updating2(self, using_copy_on_write): + def test_cache_updating2(self): # 10264 df = DataFrame( np.zeros((5, 5), dtype="int64"), @@ -597,26 +360,11 @@ def test_cache_updating2(self, using_copy_on_write): ) df["f"] = 0 df_orig = df.copy() - if using_copy_on_write: - with pytest.raises(ValueError, match="read-only"): - df.f.values[3] = 1 - tm.assert_frame_equal(df, df_orig) - return - - df.f.values[3] = 1 - - df.f.values[3] = 2 - expected = DataFrame( - np.zeros((5, 6), dtype="int64"), - columns=["a", "b", "c", "d", "e", "f"], - index=range(5), - ) - expected.at[3, "f"] = 2 - tm.assert_frame_equal(df, expected) - expected = Series([0, 0, 0, 2, 0], name="f") - tm.assert_series_equal(df.f, 
expected) + with pytest.raises(ValueError, match="read-only"): + df.f.values[3] = 1 + tm.assert_frame_equal(df, df_orig) - def test_iloc_setitem_chained_assignment(self, using_copy_on_write): + def test_iloc_setitem_chained_assignment(self): # GH#3970 with option_context("chained_assignment", None): df = DataFrame({"aa": range(5), "bb": [2.2] * 5}) @@ -633,10 +381,7 @@ def test_iloc_setitem_chained_assignment(self, using_copy_on_write): with tm.raises_chained_assignment_error(): df["bb"].iloc[0] = 0.15 - if not using_copy_on_write: - assert df["bb"].iloc[0] == 0.15 - else: - assert df["bb"].iloc[0] == 2.2 + assert df["bb"].iloc[0] == 2.2 def test_getitem_loc_assignment_slice_state(self): # GH 13569 diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 0e32399b131c3..d5002a47c3447 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -9,8 +9,6 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype - from pandas.compat import ( IS64, is_platform_windows, @@ -117,16 +115,8 @@ def test_setitem_index_object(self, val, exp_dtype): obj = pd.Series([1, 2, 3, 4], index=pd.Index(list("abcd"), dtype=object)) assert obj.index.dtype == object - if exp_dtype is IndexError: - temp = obj.copy() - warn_msg = "Series.__setitem__ treating keys as positions is deprecated" - msg = "index 5 is out of bounds for axis 0 with size 4" - with pytest.raises(exp_dtype, match=msg): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - temp[5] = 5 - else: - exp_index = pd.Index(list("abcd") + [val], dtype=object) - self._assert_setitem_index_conversion(obj, val, exp_index, exp_dtype) + exp_index = pd.Index(list("abcd") + [val], dtype=object) + self._assert_setitem_index_conversion(obj, val, exp_index, exp_dtype) @pytest.mark.parametrize( "val,exp_dtype", [(5, np.int64), (1.1, np.float64), ("x", object)] @@ -606,7 +596,7 @@ def test_fillna_complex128(self, index_or_series, fill_val, fill_dtype): @pytest.mark.parametrize( "fill_val,fill_dtype", [ - (pd.Timestamp("2012-01-01"), "datetime64[ns]"), + (pd.Timestamp("2012-01-01"), "datetime64[s]"), (pd.Timestamp("2012-01-01", tz="US/Eastern"), object), (1, object), ("x", object), @@ -623,7 +613,7 @@ def test_fillna_datetime(self, index_or_series, fill_val, fill_dtype): pd.Timestamp("2011-01-04"), ] ) - assert obj.dtype == "datetime64[ns]" + assert obj.dtype == "datetime64[s]" exp = klass( [ @@ -638,10 +628,10 @@ def test_fillna_datetime(self, index_or_series, fill_val, fill_dtype): @pytest.mark.parametrize( "fill_val,fill_dtype", [ - (pd.Timestamp("2012-01-01", tz="US/Eastern"), "datetime64[ns, US/Eastern]"), + (pd.Timestamp("2012-01-01", tz="US/Eastern"), "datetime64[s, US/Eastern]"), (pd.Timestamp("2012-01-01"), object), # pre-2.0 with a mismatched tz we would get object result - (pd.Timestamp("2012-01-01", tz="Asia/Tokyo"), "datetime64[ns, US/Eastern]"), + (pd.Timestamp("2012-01-01", tz="Asia/Tokyo"), "datetime64[s, US/Eastern]"), (1, object), ("x", object), ], @@ -658,7 +648,7 @@ def test_fillna_datetime64tz(self, index_or_series, fill_val, fill_dtype): pd.Timestamp("2011-01-04", tz=tz), ] ) - assert obj.dtype == "datetime64[ns, US/Eastern]" + assert obj.dtype == "datetime64[s, US/Eastern]" if getattr(fill_val, "tz", None) is None: fv = fill_val @@ -833,11 +823,10 @@ def replacer(self, how, from_key, to_key): raise ValueError return replacer - # Expected needs adjustment for the infer string option, seems to work as expecetd - 
@pytest.mark.skipif(using_pyarrow_string_dtype(), reason="TODO: test is to complex") def test_replace_series(self, how, to_key, from_key, replacer): index = pd.Index([3, 4], name="xxx") obj = pd.Series(self.rep[from_key], index=index, name="yyy") + obj = obj.astype(from_key) assert obj.dtype == from_key if from_key.startswith("datetime") and to_key.startswith("datetime"): @@ -858,20 +847,9 @@ def test_replace_series(self, how, to_key, from_key, replacer): else: exp = pd.Series(self.rep[to_key], index=index, name="yyy") - assert exp.dtype == to_key - msg = "Downcasting behavior in `replace`" - warn = FutureWarning - if ( - exp.dtype == obj.dtype - or exp.dtype == object - or (exp.dtype.kind in "iufc" and obj.dtype.kind in "iufc") - ): - warn = None - with tm.assert_produces_warning(warn, match=msg): - result = obj.replace(replacer) - - tm.assert_series_equal(result, exp) + result = obj.replace(replacer) + tm.assert_series_equal(result, exp, check_dtype=False) @pytest.mark.parametrize( "to_key", @@ -885,7 +863,7 @@ def test_replace_series_datetime_tz( self, how, to_key, from_key, replacer, using_infer_string ): index = pd.Index([3, 4], name="xyz") - obj = pd.Series(self.rep[from_key], index=index, name="yyy") + obj = pd.Series(self.rep[from_key], index=index, name="yyy").dt.as_unit("ns") assert obj.dtype == from_key exp = pd.Series(self.rep[to_key], index=index, name="yyy") @@ -894,12 +872,8 @@ def test_replace_series_datetime_tz( else: assert exp.dtype == to_key - msg = "Downcasting behavior in `replace`" - warn = FutureWarning if exp.dtype != object else None - with tm.assert_produces_warning(warn, match=msg): - result = obj.replace(replacer) - - tm.assert_series_equal(result, exp) + result = obj.replace(replacer) + tm.assert_series_equal(result, exp, check_dtype=False) @pytest.mark.parametrize( "to_key", @@ -913,27 +887,20 @@ def test_replace_series_datetime_tz( ) def test_replace_series_datetime_datetime(self, how, to_key, from_key, replacer): index = pd.Index([3, 4], name="xyz") - obj = pd.Series(self.rep[from_key], index=index, name="yyy") + obj = pd.Series(self.rep[from_key], index=index, name="yyy").dt.as_unit("ns") assert obj.dtype == from_key exp = pd.Series(self.rep[to_key], index=index, name="yyy") - warn = FutureWarning if isinstance(obj.dtype, pd.DatetimeTZDtype) and isinstance( exp.dtype, pd.DatetimeTZDtype ): # with mismatched tzs, we retain the original dtype as of 2.0 exp = exp.astype(obj.dtype) - warn = None - else: - assert exp.dtype == to_key - if to_key == from_key: - warn = None - - msg = "Downcasting behavior in `replace`" - with tm.assert_produces_warning(warn, match=msg): - result = obj.replace(replacer) + elif to_key == from_key: + exp = exp.dt.as_unit("ns") - tm.assert_series_equal(result, exp) + result = obj.replace(replacer) + tm.assert_series_equal(result, exp, check_dtype=False) @pytest.mark.xfail(reason="Test not implemented") def test_replace_series_period(self): diff --git a/pandas/tests/indexing/test_floats.py b/pandas/tests/indexing/test_floats.py index 1fe431e12f2a1..f9b9e8a6c7c28 100644 --- a/pandas/tests/indexing/test_floats.py +++ b/pandas/tests/indexing/test_floats.py @@ -87,11 +87,11 @@ def test_scalar_non_numeric(self, index, frame_or_series, indexer_sl): ], ) def test_scalar_non_numeric_series_fallback(self, index): - # fallsback to position selection, series only + # starting in 3.0, integer keys are always treated as labels, no longer + # fall back to positional. 
s = Series(np.arange(len(index)), index=index) - msg = "Series.__getitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(KeyError, match="3"): s[3] with pytest.raises(KeyError, match="^3.0$"): s[3.0] @@ -118,12 +118,9 @@ def test_scalar_with_mixed(self, indexer_sl): indexer_sl(s3)[1.0] if indexer_sl is not tm.loc: - # __getitem__ falls back to positional - msg = "Series.__getitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = s3[1] - expected = 2 - assert result == expected + # as of 3.0, __getitem__ no longer falls back to positional + with pytest.raises(KeyError, match="^1$"): + s3[1] with pytest.raises(KeyError, match=r"^1\.0$"): indexer_sl(s3)[1.0] @@ -491,24 +488,12 @@ def test_floating_misc(self, indexer_sl): for fancy_idx in [[5, 0], np.array([5, 0])]: tm.assert_series_equal(indexer_sl(s)[fancy_idx], expected) - warn = FutureWarning if indexer_sl is tm.setitem else None - msg = r"The behavior of obj\[i:j\] with a float-dtype index" - # all should return the same as we are slicing 'the same' - with tm.assert_produces_warning(warn, match=msg): - result1 = indexer_sl(s)[2:5] result2 = indexer_sl(s)[2.0:5.0] result3 = indexer_sl(s)[2.0:5] result4 = indexer_sl(s)[2.1:5] - tm.assert_series_equal(result1, result2) - tm.assert_series_equal(result1, result3) - tm.assert_series_equal(result1, result4) - - expected = Series([1, 2], index=[2.5, 5.0]) - with tm.assert_produces_warning(warn, match=msg): - result = indexer_sl(s)[2:5] - - tm.assert_series_equal(result, expected) + tm.assert_series_equal(result2, result3) + tm.assert_series_equal(result2, result4) # list selection result1 = indexer_sl(s)[[0.0, 5, 10]] diff --git a/pandas/tests/indexing/test_iat.py b/pandas/tests/indexing/test_iat.py index 5b8c4f2d4b9b9..bb9252a50eb2a 100644 --- a/pandas/tests/indexing/test_iat.py +++ b/pandas/tests/indexing/test_iat.py @@ -5,7 +5,6 @@ Series, period_range, ) -import pandas._testing as tm def test_iat(float_frame): @@ -29,25 +28,3 @@ def test_iat_getitem_series_with_period_index(): expected = ser[index[0]] result = ser.iat[0] assert expected == result - - -def test_iat_setitem_item_cache_cleared( - indexer_ial, using_copy_on_write, warn_copy_on_write -): - # GH#45684 - data = {"x": np.arange(8, dtype=np.int64), "y": np.int64(0)} - df = DataFrame(data).copy() - ser = df["y"] - - # previously this iat setting would split the block and fail to clear - # the item_cache. 
- with tm.assert_cow_warning(warn_copy_on_write): - indexer_ial(df)[7, 0] = 9999 - - with tm.assert_cow_warning(warn_copy_on_write): - indexer_ial(df)[7, 1] = 1234 - - assert df.iat[7, 1] == 1234 - if not using_copy_on_write: - assert ser.iloc[-1] == 1234 - assert df.iloc[-1, -1] == 1234 diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 409eca42f404b..3be69617cad43 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -1,4 +1,4 @@ -""" test positional based indexing with iloc """ +"""test positional based indexing with iloc""" from datetime import datetime import re @@ -7,10 +7,8 @@ import pytest from pandas.errors import IndexingError -import pandas.util._test_decorators as td from pandas import ( - NA, Categorical, CategoricalDtype, DataFrame, @@ -39,13 +37,18 @@ class TestiLoc: @pytest.mark.parametrize("key", [2, -1, [0, 1, 2]]) - @pytest.mark.parametrize("kind", ["series", "frame"]) @pytest.mark.parametrize( - "col", - ["labels", "mixed", "ts", "floats", "empty"], + "index", + [ + Index(list("abcd"), dtype=object), + Index([2, 4, "null", 8], dtype=object), + date_range("20130101", periods=4), + Index(range(0, 8, 2), dtype=np.float64), + Index([]), + ], ) - def test_iloc_getitem_int_and_list_int(self, key, kind, col, request): - obj = request.getfixturevalue(f"{kind}_{col}") + def test_iloc_getitem_int_and_list_int(self, key, frame_or_series, index, request): + obj = frame_or_series(range(len(index)), index=index) check_indexing_smoketest_or_raises( obj, "iloc", @@ -71,23 +74,20 @@ class TestiLocBaseIndependent: np.asarray([0, 1, 2]), ], ) - @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc]) - def test_iloc_setitem_fullcol_categorical(self, indexer, key, using_array_manager): + def test_iloc_setitem_fullcol_categorical(self, indexer_li, key): frame = DataFrame({0: range(3)}, dtype=object) cat = Categorical(["alpha", "beta", "gamma"]) - if not using_array_manager: - assert frame._mgr.blocks[0]._can_hold_element(cat) + assert frame._mgr.blocks[0]._can_hold_element(cat) df = frame.copy() orig_vals = df.values - indexer(df)[key, 0] = cat + indexer_li(df)[key, 0] = cat expected = DataFrame({0: cat}).astype(object) - if not using_array_manager: - assert np.shares_memory(df[0].values, orig_vals) + assert np.shares_memory(df[0].values, orig_vals) tm.assert_frame_equal(df, expected) @@ -100,12 +100,11 @@ def test_iloc_setitem_fullcol_categorical(self, indexer, key, using_array_manage # we retain the object dtype. 
frame = DataFrame({0: np.array([0, 1, 2], dtype=object), 1: range(3)}) df = frame.copy() - indexer(df)[key, 0] = cat + indexer_li(df)[key, 0] = cat expected = DataFrame({0: Series(cat.astype(object), dtype=object), 1: range(3)}) tm.assert_frame_equal(df, expected) - @pytest.mark.parametrize("box", [array, Series]) - def test_iloc_setitem_ea_inplace(self, frame_or_series, box, using_copy_on_write): + def test_iloc_setitem_ea_inplace(self, frame_or_series, index_or_series_or_array): # GH#38952 Case with not setting a full column # IntegerArray without NAs arr = array([1, 2, 3, 4]) @@ -114,23 +113,20 @@ def test_iloc_setitem_ea_inplace(self, frame_or_series, box, using_copy_on_write if frame_or_series is Series: values = obj.values else: - values = obj._mgr.arrays[0] + values = obj._mgr.blocks[0].values if frame_or_series is Series: - obj.iloc[:2] = box(arr[2:]) + obj.iloc[:2] = index_or_series_or_array(arr[2:]) else: - obj.iloc[:2, 0] = box(arr[2:]) + obj.iloc[:2, 0] = index_or_series_or_array(arr[2:]) expected = frame_or_series(np.array([3, 4, 3, 4], dtype="i8")) tm.assert_equal(obj, expected) # Check that we are actually in-place if frame_or_series is Series: - if using_copy_on_write: - assert obj.values is not values - assert np.shares_memory(obj.values, values) - else: - assert obj.values is values + assert obj.values is not values + assert np.shares_memory(obj.values, values) else: assert np.shares_memory(obj[0].values, values) @@ -426,7 +422,7 @@ def test_iloc_getitem_slice_dups(self): tm.assert_frame_equal(df.iloc[10:, :2], df2) tm.assert_frame_equal(df.iloc[10:, 2:], df1) - def test_iloc_setitem(self, warn_copy_on_write): + def test_iloc_setitem(self): df = DataFrame( np.random.default_rng(2).standard_normal((4, 4)), index=np.arange(0, 8, 2), @@ -520,9 +516,7 @@ def test_iloc_setitem_dups(self): df.iloc[[1, 0], [0, 1]] = df.iloc[[1, 0], [0, 1]].reset_index(drop=True) tm.assert_frame_equal(df, expected) - def test_iloc_setitem_frame_duplicate_columns_multiple_blocks( - self, using_array_manager - ): + def test_iloc_setitem_frame_duplicate_columns_multiple_blocks(self): # Same as the "assign back to self" check in test_iloc_setitem_dups # but on a DataFrame with multiple blocks df = DataFrame([[0, 1], [2, 3]], columns=["B", "B"]) @@ -530,14 +524,12 @@ def test_iloc_setitem_frame_duplicate_columns_multiple_blocks( # setting float values that can be held by existing integer arrays # is inplace df.iloc[:, 0] = df.iloc[:, 0].astype("f8") - if not using_array_manager: - assert len(df._mgr.blocks) == 1 + assert len(df._mgr.blocks) == 1 # if the assigned values cannot be held by existing integer arrays, - # we cast - df.iloc[:, 0] = df.iloc[:, 0] + 0.5 - if not using_array_manager: - assert len(df._mgr.blocks) == 2 + # we raise + with pytest.raises(TypeError, match="Invalid value"): + df.iloc[:, 0] = df.iloc[:, 0] + 0.5 expected = df.copy() @@ -632,7 +624,7 @@ def test_iloc_getitem_labelled_frame(self): with pytest.raises(ValueError, match=msg): df.iloc["j", "D"] - def test_iloc_getitem_doc_issue(self, using_array_manager): + def test_iloc_getitem_doc_issue(self): # multi axis slicing issue with single block # surfaced in GH 6059 @@ -662,8 +654,7 @@ def test_iloc_getitem_doc_issue(self, using_array_manager): columns = list(range(0, 8, 2)) df = DataFrame(arr, index=index, columns=columns) - if not using_array_manager: - df._mgr.blocks[0].mgr_locs + df._mgr.blocks[0].mgr_locs result = df.iloc[1:5, 2:4] expected = DataFrame(arr[1:5, 2:4], index=index[1:5], columns=columns[2:4]) 
tm.assert_frame_equal(result, expected) @@ -735,15 +726,16 @@ def test_iloc_setitem_with_scalar_index(self, indexer, value): @pytest.mark.filterwarnings("ignore::UserWarning") def test_iloc_mask(self): - # GH 3631, iloc with a mask (of a series) should raise + # GH 60994, iloc with a mask (of a series) should return accordingly df = DataFrame(list(range(5)), index=list("ABCDE"), columns=["a"]) mask = df.a % 2 == 0 msg = "iLocation based boolean indexing cannot use an indexable as a mask" with pytest.raises(ValueError, match=msg): df.iloc[mask] + mask.index = range(len(mask)) - msg = "iLocation based boolean indexing on an integer type is not available" - with pytest.raises(NotImplementedError, match=msg): + msg = "Unalignable boolean Series provided as indexer" + with pytest.raises(IndexingError, match=msg): df.iloc[mask] # ndarray ok @@ -762,19 +754,13 @@ def test_iloc_mask(self): (None, ".iloc"): "0b1100", ("index", ""): "0b11", ("index", ".loc"): "0b11", - ("index", ".iloc"): ( - "iLocation based boolean indexing cannot use an indexable as a mask" - ), - ("locs", ""): "Unalignable boolean Series provided as indexer " - "(index of the boolean Series and of the indexed " - "object do not match).", - ("locs", ".loc"): "Unalignable boolean Series provided as indexer " - "(index of the boolean Series and of the " - "indexed object do not match).", - ("locs", ".iloc"): ( - "iLocation based boolean indexing on an " - "integer type is not available" - ), + ( + "index", + ".iloc", + ): "iLocation based boolean indexing cannot use an indexable as a mask", + ("locs", ""): "Unalignable boolean Series provided as indexer", + ("locs", ".loc"): "Unalignable boolean Series provided as indexer", + ("locs", ".iloc"): "Unalignable boolean Series provided as indexer", } # UserWarnings from reindex of a boolean mask @@ -790,18 +776,52 @@ def test_iloc_mask(self): else: accessor = df answer = str(bin(accessor[mask]["nums"].sum())) - except (ValueError, IndexingError, NotImplementedError) as err: + except (ValueError, IndexingError) as err: answer = str(err) key = ( idx, method, ) - r = expected.get(key) - if r != answer: - raise AssertionError( - f"[{key}] does not match [{answer}], received [{r}]" + expected_result = expected.get(key) + + # Fix the assertion to check for substring match + if ( + idx is None or (idx == "index" and method != ".iloc") + ) and "0b" in expected_result: + # For successful numeric results, exact match is needed + assert expected_result == answer, ( + f"[{key}] does not match [{answer}]" ) + else: + # For error messages, substring match is sufficient + assert expected_result in answer, f"[{key}] not found in [{answer}]" + + def test_iloc_with_numpy_bool_array(self): + df = DataFrame(list(range(5)), index=list("ABCDE"), columns=["a"]) + result = df.iloc[np.array([True, False, True, False, True], dtype=bool)] + expected = DataFrame({"a": [0, 2, 4]}, index=["A", "C", "E"]) + tm.assert_frame_equal(result, expected) + + def test_iloc_series_mask_with_index_mismatch_raises(self): + df = DataFrame(list(range(5)), index=list("ABCDE"), columns=["a"]) + mask = df.a % 2 == 0 + msg = "Unalignable boolean Series provided as indexer" + with pytest.raises(IndexingError, match=msg): + df.iloc[Series([True] * len(mask), dtype=bool)] + + def test_iloc_series_mask_all_true(self): + df = DataFrame(list(range(5)), columns=["a"]) + mask = Series([True] * len(df), dtype=bool) + result = df.iloc[mask] + tm.assert_frame_equal(result, df) + + def test_iloc_series_mask_alternate_true(self): + df = 
DataFrame(list(range(5)), columns=["a"]) + mask = Series([True, False, True, False, True], dtype=bool) + result = df.iloc[mask] + expected = DataFrame({"a": [0, 2, 4]}, index=[0, 2, 4]) + tm.assert_frame_equal(result, expected) def test_iloc_non_unique_indexing(self): # GH 4017, non-unique indexing (on the axis) @@ -845,9 +865,7 @@ def test_iloc_empty_list_indexer_is_ok(self): df.iloc[[]], df.iloc[:0, :], check_index_type=True, check_column_type=True ) - def test_identity_slice_returns_new_object( - self, using_copy_on_write, warn_copy_on_write - ): + def test_identity_slice_returns_new_object(self): # GH13873 original_df = DataFrame({"a": [1, 2, 3]}) sliced_df = original_df.iloc[:] @@ -858,25 +876,17 @@ def test_identity_slice_returns_new_object( # Setting using .loc[:, "a"] sets inplace so alters both sliced and orig # depending on CoW - with tm.assert_cow_warning(warn_copy_on_write): - original_df.loc[:, "a"] = [4, 4, 4] - if using_copy_on_write: - assert (sliced_df["a"] == [1, 2, 3]).all() - else: - assert (sliced_df["a"] == 4).all() + original_df.loc[:, "a"] = [4, 4, 4] + assert (sliced_df["a"] == [1, 2, 3]).all() original_series = Series([1, 2, 3, 4, 5, 6]) sliced_series = original_series.iloc[:] assert sliced_series is not original_series # should also be a shallow copy - with tm.assert_cow_warning(warn_copy_on_write): - original_series[:3] = [7, 8, 9] - if using_copy_on_write: - # shallow copy not updated (CoW) - assert all(sliced_series[:3] == [1, 2, 3]) - else: - assert all(sliced_series[:3] == [7, 8, 9]) + original_series[:3] = [7, 8, 9] + # shallow copy not updated (CoW) + assert all(sliced_series[:3] == [1, 2, 3]) def test_indexing_zerodim_np_array(self): # GH24919 @@ -987,8 +997,7 @@ def test_iloc_setitem_empty_frame_raises_with_3d_ndarray(self): with pytest.raises(ValueError, match=msg): obj.iloc[nd3] = 0 - @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc]) - def test_iloc_getitem_read_only_values(self, indexer): + def test_iloc_getitem_read_only_values(self, indexer_li): # GH#10043 this is fundamentally a test for iloc, but test loc while # we're here rw_array = np.eye(10) @@ -998,10 +1007,12 @@ def test_iloc_getitem_read_only_values(self, indexer): ro_array.setflags(write=False) ro_df = DataFrame(ro_array) - tm.assert_frame_equal(indexer(rw_df)[[1, 2, 3]], indexer(ro_df)[[1, 2, 3]]) - tm.assert_frame_equal(indexer(rw_df)[[1]], indexer(ro_df)[[1]]) - tm.assert_series_equal(indexer(rw_df)[1], indexer(ro_df)[1]) - tm.assert_frame_equal(indexer(rw_df)[1:3], indexer(ro_df)[1:3]) + tm.assert_frame_equal( + indexer_li(rw_df)[[1, 2, 3]], indexer_li(ro_df)[[1, 2, 3]] + ) + tm.assert_frame_equal(indexer_li(rw_df)[[1]], indexer_li(ro_df)[[1]]) + tm.assert_series_equal(indexer_li(rw_df)[1], indexer_li(ro_df)[1]) + tm.assert_frame_equal(indexer_li(rw_df)[1:3], indexer_li(ro_df)[1:3]) def test_iloc_getitem_readonly_key(self): # GH#17192 iloc with read-only array raising TypeError @@ -1200,7 +1211,6 @@ def test_iloc_setitem_2d_ndarray_into_ea_block(self): expected = DataFrame({"status": ["a", "a", "c"]}, dtype=df["status"].dtype) tm.assert_frame_equal(df, expected) - @td.skip_array_manager_not_yet_implemented def test_iloc_getitem_int_single_ea_block_view(self): # GH#45241 # TODO: make an extension interface test for this? 
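The mask tests added above pin down .iloc's boolean-indexer rules: a numpy bool array is applied positionally, while a boolean Series must be alignable with the indexed object's index. A minimal sketch under the same pandas >= 3.0 assumption (names are illustrative):

    # .iloc takes a positional numpy bool mask; a misaligned bool Series raises.
    import numpy as np
    import pandas as pd
    import pytest
    from pandas.errors import IndexingError

    df = pd.DataFrame({"a": range(5)}, index=list("ABCDE"))
    result = df.iloc[np.array([True, False, True, False, True])]
    assert list(result["a"]) == [0, 2, 4]

    mask = pd.Series([True] * 5)                   # RangeIndex, unlike df's index
    with pytest.raises(IndexingError, match="Unalignable boolean Series"):
        df.iloc[mask]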
@@ -1215,29 +1225,31 @@ def test_iloc_getitem_int_single_ea_block_view(self): arr[2] = arr[-1] assert ser[0] == arr[-1] - def test_iloc_setitem_multicolumn_to_datetime(self): + def test_iloc_setitem_multicolumn_to_datetime(self, using_infer_string): # GH#20511 df = DataFrame({"A": ["2022-01-01", "2022-01-02"], "B": ["2021", "2022"]}) - df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])}) - expected = DataFrame( - { - "A": [ - Timestamp("2021-01-01 00:00:00"), - Timestamp("2022-01-01 00:00:00"), - ], - "B": ["2021", "2022"], - } - ) - tm.assert_frame_equal(df, expected, check_dtype=False) + if using_infer_string: + with pytest.raises(TypeError, match="Invalid value"): + df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])}) + else: + df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])}) + expected = DataFrame( + { + "A": [ + Timestamp("2021-01-01 00:00:00"), + Timestamp("2022-01-01 00:00:00"), + ], + "B": ["2021", "2022"], + } + ) + tm.assert_frame_equal(df, expected, check_dtype=False) class TestILocErrors: # NB: this test should work for _any_ Series we can pass as # series_with_simple_index - def test_iloc_float_raises( - self, series_with_simple_index, frame_or_series, warn_copy_on_write - ): + def test_iloc_float_raises(self, series_with_simple_index, frame_or_series): # GH#4892 # float_indexers should raise exceptions # on appropriate Index types & accessors @@ -1254,10 +1266,7 @@ def test_iloc_float_raises( obj.iloc[3.0] with pytest.raises(IndexError, match=_slice_iloc_msg): - with tm.assert_cow_warning( - warn_copy_on_write and frame_or_series is DataFrame - ): - obj.iloc[3.0] = 0 + obj.iloc[3.0] = 0 def test_iloc_getitem_setitem_fancy_exceptions(self, float_frame): with pytest.raises(IndexingError, match="Too many indexers"): @@ -1425,7 +1434,7 @@ def test_frame_iloc_setitem_callable(self): class TestILocSeries: - def test_iloc(self, using_copy_on_write, warn_copy_on_write): + def test_iloc(self): ser = Series( np.random.default_rng(2).standard_normal(10), index=list(range(0, 20, 2)) ) @@ -1444,12 +1453,8 @@ def test_iloc(self, using_copy_on_write, warn_copy_on_write): # test slice is a view with tm.assert_produces_warning(None): # GH#45324 make sure we aren't giving a spurious FutureWarning - with tm.assert_cow_warning(warn_copy_on_write): - result[:] = 0 - if using_copy_on_write: - tm.assert_series_equal(ser, ser_original) - else: - assert (ser.iloc[1:3] == 0).all() + result[:] = 0 + tm.assert_series_equal(ser, ser_original) # list of integers result = ser.iloc[[0, 2, 3, 4, 5]] @@ -1471,6 +1476,5 @@ def test_iloc_setitem_pure_position_based(self): def test_iloc_nullable_int64_size_1_nan(self): # GH 31861 result = DataFrame({"a": ["test"], "b": [np.nan]}) - result.loc[:, "b"] = result.loc[:, "b"].astype("Int64") - expected = DataFrame({"a": ["test"], "b": array([NA], dtype="Int64")}) - tm.assert_frame_equal(result, expected) + with pytest.raises(TypeError, match="Invalid value"): + result.loc[:, "b"] = result.loc[:, "b"].astype("Int64") diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 57f45f867254d..113eb6c2b2c31 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -1,4 +1,4 @@ -""" test fancy indexing & misc """ +"""test fancy indexing & misc""" import array from datetime import datetime @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype - from pandas.errors import IndexingError from 
pandas.core.dtypes.common import ( @@ -77,9 +75,7 @@ def test_setitem_ndarray_1d_2(self): "ignore:Series.__getitem__ treating keys as positions is deprecated:" "FutureWarning" ) - def test_getitem_ndarray_3d( - self, index, frame_or_series, indexer_sli, using_array_manager - ): + def test_getitem_ndarray_3d(self, index, frame_or_series, indexer_sli): # GH 25567 obj = gen_obj(frame_or_series, index) idxr = indexer_sli(obj) @@ -88,12 +84,8 @@ def test_getitem_ndarray_3d( msgs = [] if frame_or_series is Series and indexer_sli in [tm.setitem, tm.iloc]: msgs.append(r"Wrong number of dimensions. values.ndim > ndim \[3 > 1\]") - if using_array_manager: - msgs.append("Passed array should be 1-dimensional") if frame_or_series is Series or indexer_sli is tm.iloc: msgs.append(r"Buffer has wrong number of dimensions \(expected 1, got 3\)") - if using_array_manager: - msgs.append("indexer should be 1-dimensional") if indexer_sli is tm.loc or ( frame_or_series is Series and indexer_sli is tm.setitem ): @@ -186,14 +178,8 @@ def test_setitem_dtype_upcast(self): df["c"] = np.nan assert df["c"].dtype == np.float64 - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): df.loc[0, "c"] = "foo" - expected = DataFrame( - {"a": [1, 3], "b": [np.nan, 2], "c": Series(["foo", np.nan], dtype=object)} - ) - tm.assert_frame_equal(df, expected) @pytest.mark.parametrize("val", [3.14, "wxyz"]) def test_setitem_dtype_upcast2(self, val): @@ -205,19 +191,8 @@ def test_setitem_dtype_upcast2(self, val): ) left = df.copy() - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): left.loc["a", "bar"] = val - right = DataFrame( - [[0, val, 2], [3, 4, 5]], - index=list("ab"), - columns=["foo", "bar", "baz"], - ) - - tm.assert_frame_equal(left, right) - assert is_integer_dtype(left["foo"]) - assert is_integer_dtype(left["baz"]) def test_setitem_dtype_upcast3(self): left = DataFrame( @@ -225,21 +200,9 @@ def test_setitem_dtype_upcast3(self): index=list("ab"), columns=["foo", "bar", "baz"], ) - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): left.loc["a", "bar"] = "wxyz" - right = DataFrame( - [[0, "wxyz", 0.2], [0.3, 0.4, 0.5]], - index=list("ab"), - columns=["foo", "bar", "baz"], - ) - - tm.assert_frame_equal(left, right) - assert is_float_dtype(left["foo"]) - assert is_float_dtype(left["baz"]) - def test_dups_fancy_indexing(self): # GH 3455 @@ -294,7 +257,7 @@ def test_dups_fancy_indexing_only_missing_label(self, using_infer_string): with pytest.raises( KeyError, match=re.escape( - "\"None of [Index(['E'], dtype='string')] are in the [index]\"" + "\"None of [Index(['E'], dtype='str')] are in the [index]\"" ), ): dfnu.loc[["E"]] @@ -461,9 +424,6 @@ def test_set_index_nan(self): ) tm.assert_frame_equal(result, df) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't multiply arrow strings" - ) def test_multi_assign(self): # GH 3626, an assignment of a sub-df to a df # set float64 to avoid upcast when setting nan @@ -571,6 +531,7 @@ def test_astype_assignment(self, using_infer_string): df_orig = DataFrame( [["1", "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) + df_orig[list("ABCDG")] = df_orig[list("ABCDG")].astype(object) df = df_orig.copy() @@ -580,9 +541,9 @@ def test_astype_assignment(self, using_infer_string): expected = 
DataFrame( [[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) - if not using_infer_string: - expected["A"] = expected["A"].astype(object) - expected["B"] = expected["B"].astype(object) + expected[list("CDG")] = expected[list("CDG")].astype(object) + expected["A"] = expected["A"].astype(object) + expected["B"] = expected["B"].astype(object) tm.assert_frame_equal(df, expected) # GH5702 (loc) @@ -591,18 +552,16 @@ def test_astype_assignment(self, using_infer_string): expected = DataFrame( [[1, "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) - if not using_infer_string: - expected["A"] = expected["A"].astype(object) + expected[list("ABCDG")] = expected[list("ABCDG")].astype(object) tm.assert_frame_equal(df, expected) df = df_orig.copy() + df.loc[:, ["B", "C"]] = df.loc[:, ["B", "C"]].astype(np.int64) expected = DataFrame( [["1", 2, 3, ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) - if not using_infer_string: - expected["B"] = expected["B"].astype(object) - expected["C"] = expected["C"].astype(object) + expected[list("ABCDG")] = expected[list("ABCDG")].astype(object) tm.assert_frame_equal(df, expected) def test_astype_assignment_full_replacements(self): @@ -689,7 +648,6 @@ def test_loc_setitem_fullindex_views(self): df.loc[df.index] = df.loc[df.index] tm.assert_frame_equal(df, df2) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't set int into string") def test_rhs_alignment(self): # GH8258, tests that both rows & columns are aligned to what is # assigned to. covers both uniform data-type & multi-type cases @@ -714,7 +672,7 @@ def run_tests(df, rhs, right_loc, right_iloc): cols = ["jim", "joe", "jolie", "joline"] df = DataFrame(xs, columns=cols, index=list("abcde"), dtype="int64") - # right hand side; permute the indices and multiplpy by -2 + # right hand side; permute the indices and multiply by -2 rhs = -2 * df.iloc[3:0:-1, 2:0:-1] # expected `right` result; just multiply by -2 @@ -734,7 +692,7 @@ def run_tests(df, rhs, right_loc, right_iloc): frame["jolie"] = frame["jolie"].map(lambda x: f"@{x}") right_iloc["joe"] = [1.0, "@-28", "@-20", "@-12", 17.0] right_iloc["jolie"] = ["@2", -26.0, -18.0, -10.0, "@18"] - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): run_tests(df, rhs, right_loc, right_iloc) @pytest.mark.parametrize( @@ -786,10 +744,10 @@ def test_loc_range_in_series_indexing(self, size): # GH 11652 s = Series(index=range(size), dtype=np.float64) s.loc[range(1)] = 42 - tm.assert_series_equal(s.loc[range(1)], Series(42.0, index=[0])) + tm.assert_series_equal(s.loc[range(1)], Series(42.0, index=range(1))) s.loc[range(2)] = 43 - tm.assert_series_equal(s.loc[range(2)], Series(43.0, index=[0, 1])) + tm.assert_series_equal(s.loc[range(2)], Series(43.0, index=range(2))) def test_partial_boolean_frame_indexing(self): # GH 17170 @@ -805,6 +763,27 @@ def test_partial_boolean_frame_indexing(self): ) tm.assert_frame_equal(result, expected) + def test_period_column_slicing(self): + # GH#60273 The transpose operation creates a single 5x1 block of PeriodDtype + # Make sure it is reindexed correctly + df = DataFrame( + pd.period_range("2021-01-01", periods=5, freq="D"), + columns=["A"], + ).T + result = df[[0, 1, 2]] + expected = DataFrame( + [ + [ + pd.Period("2021-01-01", freq="D"), + pd.Period("2021-01-02", freq="D"), + pd.Period("2021-01-03", freq="D"), + ] + ], + index=["A"], + columns=[0, 1, 2], + ) + tm.assert_frame_equal(result, expected) + def 
test_no_reference_cycle(self): df = DataFrame({"a": [0, 1], "b": [2, 3]}) for name in ("loc", "iloc", "at", "iat"): diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index fb0adc56c401b..8838fc7eed2f7 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1,5 +1,7 @@ -""" test label based indexing with loc """ +"""test label based indexing with loc""" + from collections import namedtuple +import contextlib from datetime import ( date, datetime, @@ -12,12 +14,8 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype - from pandas._libs import index as libindex -from pandas.compat.numpy import np_version_gt2 from pandas.errors import IndexingError -import pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -63,122 +61,122 @@ def test_not_change_nan_loc(series, new_series, expected_ser): class TestLoc: - def test_none_values_on_string_columns(self): + def test_none_values_on_string_columns(self, using_infer_string): # Issue #32218 - df = DataFrame(["1", "2", None], columns=["a"], dtype="str") - + df = DataFrame(["1", "2", None], columns=["a"], dtype=object) assert df.loc[2, "a"] is None - @pytest.mark.parametrize("kind", ["series", "frame"]) - def test_loc_getitem_int(self, kind, request): + df = DataFrame(["1", "2", None], columns=["a"], dtype="str") + if using_infer_string: + assert np.isnan(df.loc[2, "a"]) + else: + assert df.loc[2, "a"] is None + + def test_loc_getitem_int(self, frame_or_series): # int label - obj = request.getfixturevalue(f"{kind}_labels") + obj = frame_or_series(range(3), index=Index(list("abc"), dtype=object)) check_indexing_smoketest_or_raises(obj, "loc", 2, fails=KeyError) - @pytest.mark.parametrize("kind", ["series", "frame"]) - def test_loc_getitem_label(self, kind, request): + def test_loc_getitem_label(self, frame_or_series): # label - obj = request.getfixturevalue(f"{kind}_empty") + obj = frame_or_series() check_indexing_smoketest_or_raises(obj, "loc", "c", fails=KeyError) + @pytest.mark.parametrize("key", ["f", 20]) @pytest.mark.parametrize( - "key, typs, axes", + "index", [ - ["f", ["ints", "uints", "labels", "mixed", "ts"], None], - ["f", ["floats"], None], - [20, ["ints", "uints", "mixed"], None], - [20, ["labels"], None], - [20, ["ts"], 0], - [20, ["floats"], 0], + Index(list("abcd"), dtype=object), + Index([2, 4, "null", 8], dtype=object), + date_range("20130101", periods=4), + Index(range(0, 8, 2), dtype=np.float64), + Index([]), ], ) - @pytest.mark.parametrize("kind", ["series", "frame"]) - def test_loc_getitem_label_out_of_range(self, key, typs, axes, kind, request): - for typ in typs: - obj = request.getfixturevalue(f"{kind}_{typ}") - # out of range label - check_indexing_smoketest_or_raises( - obj, "loc", key, axes=axes, fails=KeyError - ) + def test_loc_getitem_label_out_of_range(self, key, index, frame_or_series): + obj = frame_or_series(range(len(index)), index=index) + # out of range label + check_indexing_smoketest_or_raises(obj, "loc", key, fails=KeyError) + + @pytest.mark.parametrize("key", [[0, 1, 2], [1, 3.0, "A"]]) + @pytest.mark.parametrize("dtype", [np.int64, np.uint64, np.float64]) + def test_loc_getitem_label_list(self, key, dtype, frame_or_series): + obj = frame_or_series(range(3), index=Index([0, 1, 2], dtype=dtype)) + # list of labels + check_indexing_smoketest_or_raises(obj, "loc", key, fails=KeyError) @pytest.mark.parametrize( - "key, typs", + "index", [ - [[0, 1, 2], ["ints", "uints", "floats"]], - [[1, 
3.0, "A"], ["ints", "uints", "floats"]], + None, + Index([0, 1, 2], dtype=np.int64), + Index([0, 1, 2], dtype=np.uint64), + Index([0, 1, 2], dtype=np.float64), + MultiIndex.from_arrays([range(3), range(3)]), ], ) - @pytest.mark.parametrize("kind", ["series", "frame"]) - def test_loc_getitem_label_list(self, key, typs, kind, request): - for typ in typs: - obj = request.getfixturevalue(f"{kind}_{typ}") - # list of labels - check_indexing_smoketest_or_raises(obj, "loc", key, fails=KeyError) - @pytest.mark.parametrize( - "key, typs, axes", - [ - [[0, 1, 2], ["empty"], None], - [[0, 2, 10], ["ints", "uints", "floats"], 0], - [[3, 6, 7], ["ints", "uints", "floats"], 1], - # GH 17758 - MultiIndex and missing keys - [[(1, 3), (1, 4), (2, 5)], ["multi"], 0], - ], + "key", [[0, 1, 2], [0, 2, 10], [3, 6, 7], [(1, 3), (1, 4), (2, 5)]] ) - @pytest.mark.parametrize("kind", ["series", "frame"]) - def test_loc_getitem_label_list_with_missing(self, key, typs, axes, kind, request): - for typ in typs: - obj = request.getfixturevalue(f"{kind}_{typ}") - check_indexing_smoketest_or_raises( - obj, "loc", key, axes=axes, fails=KeyError - ) + def test_loc_getitem_label_list_with_missing(self, key, index, frame_or_series): + if index is None: + obj = frame_or_series() + else: + obj = frame_or_series(range(len(index)), index=index) + check_indexing_smoketest_or_raises(obj, "loc", key, fails=KeyError) - @pytest.mark.parametrize("typs", ["ints", "uints"]) - @pytest.mark.parametrize("kind", ["series", "frame"]) - def test_loc_getitem_label_list_fails(self, typs, kind, request): + @pytest.mark.parametrize("dtype", [np.int64, np.uint64]) + def test_loc_getitem_label_list_fails(self, dtype, frame_or_series): # fails - obj = request.getfixturevalue(f"{kind}_{typs}") + obj = frame_or_series(range(3), Index([0, 1, 2], dtype=dtype)) check_indexing_smoketest_or_raises( obj, "loc", [20, 30, 40], axes=1, fails=KeyError ) - def test_loc_getitem_label_array_like(self): - # TODO: test something? 
- # array like - pass - - @pytest.mark.parametrize("kind", ["series", "frame"]) - def test_loc_getitem_bool(self, kind, request): - obj = request.getfixturevalue(f"{kind}_empty") + def test_loc_getitem_bool(self, frame_or_series): + obj = frame_or_series() # boolean indexers b = [True, False, True, False] check_indexing_smoketest_or_raises(obj, "loc", b, fails=IndexError) @pytest.mark.parametrize( - "slc, typs, axes, fails", + "slc, indexes, axes, fails", [ [ slice(1, 3), - ["labels", "mixed", "empty", "ts", "floats"], + [ + Index(list("abcd"), dtype=object), + Index([2, 4, "null", 8], dtype=object), + None, + date_range("20130101", periods=4), + Index(range(0, 12, 3), dtype=np.float64), + ], None, TypeError, ], - [slice("20130102", "20130104"), ["ts"], 1, TypeError], - [slice(2, 8), ["mixed"], 0, TypeError], - [slice(2, 8), ["mixed"], 1, KeyError], - [slice(2, 4, 2), ["mixed"], 0, TypeError], + [ + slice("20130102", "20130104"), + [date_range("20130101", periods=4)], + 1, + TypeError, + ], + [slice(2, 8), [Index([2, 4, "null", 8], dtype=object)], 0, TypeError], + [slice(2, 8), [Index([2, 4, "null", 8], dtype=object)], 1, KeyError], + [slice(2, 4, 2), [Index([2, 4, "null", 8], dtype=object)], 0, TypeError], ], ) - @pytest.mark.parametrize("kind", ["series", "frame"]) - def test_loc_getitem_label_slice(self, slc, typs, axes, fails, kind, request): + def test_loc_getitem_label_slice(self, slc, indexes, axes, fails, frame_or_series): # label slices (with ints) # real label slices # GH 14316 - for typ in typs: - obj = request.getfixturevalue(f"{kind}_{typ}") + for index in indexes: + if index is None: + obj = frame_or_series() + else: + obj = frame_or_series(range(len(index)), index=index) check_indexing_smoketest_or_raises( obj, "loc", @@ -388,12 +386,8 @@ def test_loc_setitem_slice(self): df2 = DataFrame({"a": [0, 1, 1], "b": [100, 200, 300]}, dtype="uint64") ix = df1["a"] == 1 newb2 = df2.loc[ix, "b"] - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): df1.loc[ix, "b"] = newb2 - expected = DataFrame({"a": [0, 1, 1], "b": [100, 200, 300]}, dtype="uint64") - tm.assert_frame_equal(df2, expected) def test_loc_setitem_dtype(self): # GH31340 @@ -514,7 +508,7 @@ def test_loc_getitem_list_with_fail(self): s.loc[[2]] - msg = f"\"None of [Index([3], dtype='{np.dtype(int)}')] are in the [index]" + msg = "None of [RangeIndex(start=3, stop=4, step=1)] are in the [index]" with pytest.raises(KeyError, match=re.escape(msg)): s.loc[[3]] @@ -577,50 +571,31 @@ def frame_for_consistency(self): def test_loc_setitem_consistency(self, frame_for_consistency, val): # GH 6149 # coerce similarly for setitem and loc when rows have a null-slice - expected = DataFrame( - { - "date": Series(0, index=range(5), dtype=np.int64), - "val": Series(range(5), dtype=np.int64), - } - ) df = frame_for_consistency.copy() - df.loc[:, "date"] = val - tm.assert_frame_equal(df, expected) + with pytest.raises(TypeError, match="Invalid value"): + df.loc[:, "date"] = val def test_loc_setitem_consistency_dt64_to_str(self, frame_for_consistency): # GH 6149 # coerce similarly for setitem and loc when rows have a null-slice - expected = DataFrame( - { - "date": Series("foo", index=range(5)), - "val": Series(range(5), dtype=np.int64), - } - ) df = frame_for_consistency.copy() - df.loc[:, "date"] = "foo" - tm.assert_frame_equal(df, expected) + with pytest.raises(TypeError, match="Invalid value"): + df.loc[:, "date"] = "foo" def 
test_loc_setitem_consistency_dt64_to_float(self, frame_for_consistency): # GH 6149 # coerce similarly for setitem and loc when rows have a null-slice - expected = DataFrame( - { - "date": Series(1.0, index=range(5)), - "val": Series(range(5), dtype=np.int64), - } - ) df = frame_for_consistency.copy() - df.loc[:, "date"] = 1.0 - tm.assert_frame_equal(df, expected) + with pytest.raises(TypeError, match="Invalid value"): + df.loc[:, "date"] = 1.0 def test_loc_setitem_consistency_single_row(self): # GH 15494 # setting on frame with single row df = DataFrame({"date": Series([Timestamp("20180101")])}) - df.loc[:, "date"] = "string" - expected = DataFrame({"date": Series(["string"])}) - tm.assert_frame_equal(df, expected) + with pytest.raises(TypeError, match="Invalid value"): + df.loc[:, "date"] = "string" def test_loc_setitem_consistency_empty(self): # empty (essentially noops) @@ -638,7 +613,7 @@ def test_loc_setitem_consistency_empty(self): expected["x"] = expected["x"].astype(np.int64) tm.assert_frame_equal(df, expected) - def test_loc_setitem_consistency_slice_column_len(self): + def test_loc_setitem_consistency_slice_column_len(self, using_infer_string): # .loc[:,column] setting with slice == len of the column # GH10408 levels = [ @@ -662,13 +637,24 @@ def test_loc_setitem_consistency_slice_column_len(self): ] df = DataFrame(values, index=mi, columns=cols) - df.loc[:, ("Respondent", "StartDate")] = to_datetime( - df.loc[:, ("Respondent", "StartDate")] - ) - df.loc[:, ("Respondent", "EndDate")] = to_datetime( - df.loc[:, ("Respondent", "EndDate")] - ) - df = df.infer_objects(copy=False) + ctx = contextlib.nullcontext() + if using_infer_string: + ctx = pytest.raises(TypeError, match="Invalid value") + + with ctx: + df.loc[:, ("Respondent", "StartDate")] = to_datetime( + df.loc[:, ("Respondent", "StartDate")] + ) + with ctx: + df.loc[:, ("Respondent", "EndDate")] = to_datetime( + df.loc[:, ("Respondent", "EndDate")] + ) + + if using_infer_string: + # infer-objects won't infer stuff anymore + return + + df = df.infer_objects() # Adding a new key df.loc[:, ("Respondent", "Duration")] = ( @@ -678,14 +664,10 @@ def test_loc_setitem_consistency_slice_column_len(self): # timedelta64[m] -> float, so this cannot be done inplace, so # no warning - df.loc[:, ("Respondent", "Duration")] = df.loc[ - :, ("Respondent", "Duration") - ] / Timedelta(60_000_000_000) - - expected = Series( - [23.0, 12.0, 14.0, 36.0], index=df.index, name=("Respondent", "Duration") - ) - tm.assert_series_equal(df[("Respondent", "Duration")], expected) + with pytest.raises(TypeError, match="Invalid value"): + df.loc[:, ("Respondent", "Duration")] = df.loc[ + :, ("Respondent", "Duration") + ] / Timedelta(60_000_000_000) @pytest.mark.parametrize("unit", ["Y", "M", "D", "h", "m", "s", "ms", "us"]) def test_loc_assign_non_ns_datetime(self, unit): @@ -711,7 +693,7 @@ def test_loc_modify_datetime(self): {"date": [1485264372711, 1485265925110, 1540215845888, 1540282121025]} ) - df["date_dt"] = to_datetime(df["date"], unit="ms", cache=True) + df["date_dt"] = to_datetime(df["date"], unit="ms", cache=True).dt.as_unit("ms") df.loc[:, "date_dt_cp"] = df.loc[:, "date_dt"] df.loc[[2, 3], "date_dt_cp"] = df.loc[[2, 3], "date_dt"] @@ -784,9 +766,9 @@ def test_loc_setitem_empty_frame(self): # is inplace, so that dtype is retained sera = Series(val1, index=keys1, dtype=np.float64) serb = Series(val2, index=keys2) - expected = DataFrame( - {"A": sera, "B": serb}, columns=Index(["A", "B"], dtype=object) - ).reindex(index=index) + expected = 
DataFrame({"A": sera, "B": serb}, columns=Index(["A", "B"])).reindex( + index=index + ) tm.assert_frame_equal(df, expected) def test_loc_setitem_frame(self): @@ -833,8 +815,7 @@ def test_loc_setitem_frame_mixed_labels(self): df.loc[0, [1, 2]] = [5, 6] tm.assert_frame_equal(df, expected) - @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") - def test_loc_setitem_frame_multiples(self, warn_copy_on_write): + def test_loc_setitem_frame_multiples(self): # multiple setting df = DataFrame( {"A": ["foo", "bar", "baz"], "B": Series(range(3), dtype=np.int64)} @@ -866,6 +847,7 @@ def test_loc_setitem_frame_multiples(self, warn_copy_on_write): "val": Series([0, 1, 0, 1, 2], dtype=np.int64), } ) + expected["date"] = expected["date"].astype("M8[ns]") rhs = df.loc[0:2] rhs.index = df.index[2:5] df.loc[2:4] = rhs @@ -984,7 +966,7 @@ def test_setitem_new_key_tz(self, indexer_sl): to_datetime(42).tz_localize("UTC"), to_datetime(666).tz_localize("UTC"), ] - expected = Series(vals, index=Index(["foo", "bar"], dtype=object)) + expected = Series(vals, index=Index(["foo", "bar"])) ser = Series(dtype=object) indexer_sl(ser)["foo"] = vals[0] @@ -1091,9 +1073,7 @@ def test_loc_empty_list_indexer_is_ok(self): df.loc[[]], df.iloc[:0, :], check_index_type=True, check_column_type=True ) - def test_identity_slice_returns_new_object( - self, using_copy_on_write, warn_copy_on_write - ): + def test_identity_slice_returns_new_object(self): # GH13873 original_df = DataFrame({"a": [1, 2, 3]}) @@ -1107,19 +1087,12 @@ def test_identity_slice_returns_new_object( # Setting using .loc[:, "a"] sets inplace so alters both sliced and orig # depending on CoW - with tm.assert_cow_warning(warn_copy_on_write): - original_df.loc[:, "a"] = [4, 4, 4] - if using_copy_on_write: - assert (sliced_df["a"] == [1, 2, 3]).all() - else: - assert (sliced_df["a"] == 4).all() + original_df.loc[:, "a"] = [4, 4, 4] + assert (sliced_df["a"] == [1, 2, 3]).all() # These should not return copies df = DataFrame(np.random.default_rng(2).standard_normal((10, 4))) - if using_copy_on_write or warn_copy_on_write: - assert df[0] is not df.loc[:, 0] - else: - assert df[0] is df.loc[:, 0] + assert df[0] is not df.loc[:, 0] # Same tests for Series original_series = Series([1, 2, 3, 4, 5, 6]) @@ -1127,19 +1100,11 @@ def test_identity_slice_returns_new_object( assert sliced_series is not original_series assert original_series[:] is not original_series - with tm.assert_cow_warning(warn_copy_on_write): - original_series[:3] = [7, 8, 9] - if using_copy_on_write: - assert all(sliced_series[:3] == [1, 2, 3]) - else: - assert all(sliced_series[:3] == [7, 8, 9]) + original_series[:3] = [7, 8, 9] + assert all(sliced_series[:3] == [1, 2, 3]) - def test_loc_copy_vs_view(self, request, using_copy_on_write): + def test_loc_copy_vs_view(self, request): # GH 15631 - - if not using_copy_on_write: - mark = pytest.mark.xfail(reason="accidental fix reverted - GH37497") - request.applymarker(mark) x = DataFrame(zip(range(3), range(3)), columns=["a", "b"]) y = x.copy() @@ -1223,20 +1188,11 @@ def test_loc_setitem_empty_append_raises(self): data = [1, 2] df = DataFrame(columns=["x", "y"]) df.index = df.index.astype(np.int64) - msg = ( - rf"None of \[Index\(\[0, 1\], dtype='{np.dtype(int)}'\)\] " - r"are in the \[index\]" - ) + msg = r"None of .*Index.* are in the \[index\]" with pytest.raises(KeyError, match=msg): df.loc[[0, 1], "x"] = data - msg = "|".join( - [ - "cannot copy sequence with size 2 to array axis with dimension 0", - r"could not broadcast input 
array from shape \(2,\) into shape \(0,\)", - "Must have equal len keys and value when setting with an iterable", - ] - ) + msg = "setting an array element with a sequence." with pytest.raises(ValueError, match=msg): df.loc[0:2, "x"] = data @@ -1263,20 +1219,23 @@ def test_loc_reverse_assignment(self): tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't set int into string") - def test_loc_setitem_str_to_small_float_conversion_type(self): + def test_loc_setitem_str_to_small_float_conversion_type(self, using_infer_string): # GH#20388 col_data = [str(np.random.default_rng(2).random() * 1e-12) for _ in range(5)] result = DataFrame(col_data, columns=["A"]) - expected = DataFrame(col_data, columns=["A"], dtype=object) + expected = DataFrame(col_data, columns=["A"]) tm.assert_frame_equal(result, expected) # assigning with loc/iloc attempts to set the values inplace, which # in this case is successful - result.loc[result.index, "A"] = [float(x) for x in col_data] - expected = DataFrame(col_data, columns=["A"], dtype=float).astype(object) - tm.assert_frame_equal(result, expected) + if using_infer_string: + with pytest.raises(TypeError, match="Invalid value"): + result.loc[result.index, "A"] = [float(x) for x in col_data] + else: + result.loc[result.index, "A"] = [float(x) for x in col_data] + expected = DataFrame(col_data, columns=["A"], dtype=float).astype(object) + tm.assert_frame_equal(result, expected) # assigning the entire column using __setitem__ swaps in the new array # GH#??? @@ -1307,7 +1266,7 @@ def test_loc_getitem_time_object(self, frame_or_series): tm.assert_equal(result, expected) @pytest.mark.parametrize("spmatrix_t", ["coo_matrix", "csc_matrix", "csr_matrix"]) - @pytest.mark.parametrize("dtype", [np.int64, np.float64, complex]) + @pytest.mark.parametrize("dtype", [np.complex128, np.float64, np.int64, bool]) def test_loc_getitem_range_from_spmatrix(self, spmatrix_t, dtype): sp_sparse = pytest.importorskip("scipy.sparse") @@ -1322,13 +1281,13 @@ def test_loc_getitem_range_from_spmatrix(self, spmatrix_t, dtype): # regression test for GH#34526 itr_idx = range(2, rows) - result = df.loc[itr_idx].values + result = np.nan_to_num(df.loc[itr_idx].values) expected = spmatrix.toarray()[itr_idx] tm.assert_numpy_array_equal(result, expected) # regression test for GH#34540 result = df.loc[itr_idx].dtypes.values - expected = np.full(cols, SparseDtype(dtype, fill_value=0)) + expected = np.full(cols, SparseDtype(dtype)) tm.assert_numpy_array_equal(result, expected) def test_loc_getitem_listlike_all_retains_sparse(self): @@ -1340,18 +1299,16 @@ def test_loc_getitem_sparse_frame(self): # GH34687 sp_sparse = pytest.importorskip("scipy.sparse") - df = DataFrame.sparse.from_spmatrix(sp_sparse.eye(5)) + df = DataFrame.sparse.from_spmatrix(sp_sparse.eye(5, dtype=np.int64)) result = df.loc[range(2)] expected = DataFrame( - [[1.0, 0.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0, 0.0]], - dtype=SparseDtype("float64", 0.0), + [[1, 0, 0, 0, 0], [0, 1, 0, 0, 0]], + dtype=SparseDtype(np.int64), ) tm.assert_frame_equal(result, expected) result = df.loc[range(2)].loc[range(1)] - expected = DataFrame( - [[1.0, 0.0, 0.0, 0.0, 0.0]], dtype=SparseDtype("float64", 0.0) - ) + expected = DataFrame([[1, 0, 0, 0, 0]], dtype=SparseDtype(np.int64)) tm.assert_frame_equal(result, expected) def test_loc_getitem_sparse_series(self): @@ -1389,14 +1346,12 @@ def test_loc_getitem_timedelta_0seconds(self): result = df.loc["0s":, :] tm.assert_frame_equal(result, expected) - 
@pytest.mark.parametrize( - "val,expected", [(2**63 - 1, Series([1])), (2**63, Series([2]))] - ) + @pytest.mark.parametrize("val,expected", [(2**63 - 1, 1), (2**63, 2)]) def test_loc_getitem_uint64_scalar(self, val, expected): # see GH#19399 df = DataFrame([1, 2], index=[2**63 - 1, 2**63]) result = df.loc[val] - + expected = Series([expected]) expected.name = val tm.assert_series_equal(result, expected) @@ -1441,13 +1396,9 @@ def test_loc_setitem_categorical_values_partial_column_slice(self): # Assigning a Category to parts of a int/... column uses the values of # the Categorical df = DataFrame({"a": [1, 1, 1, 1, 1], "b": list("aaaaa")}) - exp = DataFrame({"a": [1, "b", "b", 1, 1], "b": list("aabba")}) - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): df.loc[1:2, "a"] = Categorical(["b", "b"], categories=["a", "b"]) df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"]) - tm.assert_frame_equal(df, exp) def test_loc_setitem_single_row_categorical(self, using_infer_string): # GH#25495 @@ -1460,7 +1411,7 @@ def test_loc_setitem_single_row_categorical(self, using_infer_string): result = df["Alpha"] expected = Series(categories, index=df.index, name="Alpha").astype( - object if not using_infer_string else "string[pyarrow_numpy]" + object if not using_infer_string else "str" ) tm.assert_series_equal(result, expected) @@ -1474,9 +1425,8 @@ def test_loc_setitem_datetime_coercion(self): df.loc[0:1, "c"] = np.datetime64("2008-08-08") assert Timestamp("2008-08-08") == df.loc[0, "c"] assert Timestamp("2008-08-08") == df.loc[1, "c"] - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.loc[2, "c"] = date(2005, 5, 5) - assert Timestamp("2005-05-05").date() == df.loc[2, "c"] @pytest.mark.parametrize("idxer", ["var", ["var"]]) def test_loc_setitem_datetimeindex_tz(self, idxer, tz_naive_fixture): @@ -1487,10 +1437,15 @@ def test_loc_setitem_datetimeindex_tz(self, idxer, tz_naive_fixture): # if result started off with object dtype, then the .loc.__setitem__ # below would retain object dtype result = DataFrame(index=idx, columns=["var"], dtype=np.float64) - result.loc[:, idxer] = expected - tm.assert_frame_equal(result, expected) + if idxer == "var": + with pytest.raises(TypeError, match="Invalid value"): + result.loc[:, idxer] = expected + else: + # See https://github.com/pandas-dev/pandas/issues/56223 + result.loc[:, idxer] = expected + tm.assert_frame_equal(result, expected) - def test_loc_setitem_time_key(self, using_array_manager): + def test_loc_setitem_time_key(self): index = date_range("2012-01-01", "2012-01-05", freq="30min") df = DataFrame( np.random.default_rng(2).standard_normal((len(index), 5)), index=index @@ -1505,9 +1460,6 @@ def test_loc_setitem_time_key(self, using_array_manager): result = result.loc[akey] expected = df.loc[akey].copy() expected.loc[:] = 0 - if using_array_manager: - # TODO(ArrayManager) we are still overwriting columns - expected = expected.astype(float) tm.assert_frame_equal(result, expected) result = df.copy() @@ -1520,9 +1472,6 @@ def test_loc_setitem_time_key(self, using_array_manager): result = result.loc[bkey] expected = df.loc[bkey].copy() expected.loc[:] = 0 - if using_array_manager: - # TODO(ArrayManager) we are still overwriting columns - expected = expected.astype(float) tm.assert_frame_equal(result, expected) result = df.copy() @@ -1566,16 +1515,10 @@ def 
test_loc_setitem_2d_to_1d_raises(self): # float64 dtype to avoid upcast when trying to set float data ser = Series(range(2), dtype="float64") - msg = "|".join( - [ - r"shape mismatch: value array of shape \(2,2\)", - r"cannot reshape array of size 4 into shape \(2,\)", - ] - ) + msg = "setting an array element with a sequence." with pytest.raises(ValueError, match=msg): ser.loc[range(2)] = data - msg = r"could not broadcast input array from shape \(2,2\) into shape \(2,?\)" with pytest.raises(ValueError, match=msg): ser.loc[:] = data @@ -1637,7 +1580,7 @@ def test_loc_setitem_single_column_mixed(self, using_infer_string): df.loc[df.index[::2], "str"] = np.nan expected = Series( [np.nan, "qux", np.nan, "qux", np.nan], - dtype=object if not using_infer_string else "string[pyarrow_numpy]", + dtype=object if not using_infer_string else "str", ).values tm.assert_almost_equal(df["str"].values, expected) @@ -1646,16 +1589,8 @@ def test_loc_setitem_cast2(self): # dtype conversion on setting df = DataFrame(np.random.default_rng(2).random((30, 3)), columns=tuple("ABC")) df["event"] = np.nan - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): df.loc[10, "event"] = "foo" - result = df.dtypes - expected = Series( - [np.dtype("float64")] * 3 + [np.dtype("object")], - index=["A", "B", "C", "event"], - ) - tm.assert_series_equal(result, expected) def test_loc_setitem_cast3(self): # Test that data type is preserved . GH#5782 @@ -1688,10 +1623,10 @@ def test_loc_setitem_numpy_frame_categorical_value(self): class TestLocWithEllipsis: - @pytest.fixture(params=[tm.loc, tm.iloc]) - def indexer(self, request): + @pytest.fixture + def indexer(self, indexer_li): # Test iloc while we're here - return request.param + return indexer_li @pytest.fixture def obj(self, series_with_simple_index, frame_or_series): @@ -1830,7 +1765,7 @@ def test_loc_setitem_multiindex_slice(self): ) result = Series([1, 1, 1, 1, 1, 1, 1, 1], index=index) - result.loc[("baz", "one"):("foo", "two")] = 100 + result.loc[("baz", "one") : ("foo", "two")] = 100 expected = Series([1, 1, 100, 100, 100, 100, 1, 1], index=index) @@ -1851,7 +1786,7 @@ def test_loc_getitem_datetime_string_with_datetimeindex(self): result = df.loc[["2010-01-01", "2010-01-05"], ["a", "b"]] expected = DataFrame( {"a": [0, 4], "b": [0, 4]}, - index=DatetimeIndex(["2010-01-01", "2010-01-05"]), + index=DatetimeIndex(["2010-01-01", "2010-01-05"]).as_unit("ns"), ) tm.assert_frame_equal(result, expected) @@ -2011,7 +1946,7 @@ def test_loc_setitem_empty_series(self): # partially set with an empty object series ser = Series(dtype=object) ser.loc[1] = 1 - tm.assert_series_equal(ser, Series([1], index=[1])) + tm.assert_series_equal(ser, Series([1], index=range(1, 2))) ser.loc[3] = 3 tm.assert_series_equal(ser, Series([1, 3], index=[1, 3])) @@ -2021,7 +1956,7 @@ def test_loc_setitem_empty_series_float(self): # partially set with an empty object series ser = Series(dtype=object) ser.loc[1] = 1.0 - tm.assert_series_equal(ser, Series([1.0], index=[1])) + tm.assert_series_equal(ser, Series([1.0], index=range(1, 2))) ser.loc[3] = 3.0 tm.assert_series_equal(ser, Series([1.0, 3.0], index=[1, 3])) @@ -2031,15 +1966,11 @@ def test_loc_setitem_empty_series_str_idx(self): # partially set with an empty object series ser = Series(dtype=object) ser.loc["foo"] = 1 - tm.assert_series_equal(ser, Series([1], index=Index(["foo"], dtype=object))) + tm.assert_series_equal(ser, Series([1], index=Index(["foo"]))) 
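# Editorial sketch, not part of the patch: the dominant change in the hunks above is
# that silent upcasting on .loc setitem is gone. A value the column dtype cannot hold
# now raises TypeError("Invalid value ...") instead of upcasting under a
# "incompatible dtype" FutureWarning. Minimal illustration, assuming pandas>=3.0
# semantics; the uint8 value is illustrative and mirrors test_loc_setitem_uint8_upcast:
import pandas as pd
import pytest

ser = pd.Series([1, 2, 3], dtype="uint8")
with pytest.raises(TypeError, match="Invalid value"):
    ser.loc[0] = 300  # 300 cannot be held by uint8; no silent upcast to uint16 anymore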
ser.loc["bar"] = 3 - tm.assert_series_equal( - ser, Series([1, 3], index=Index(["foo", "bar"], dtype=object)) - ) + tm.assert_series_equal(ser, Series([1, 3], index=Index(["foo", "bar"]))) ser.loc[3] = 4 - tm.assert_series_equal( - ser, Series([1, 3, 4], index=Index(["foo", "bar", 3], dtype=object)) - ) + tm.assert_series_equal(ser, Series([1, 3, 4], index=Index(["foo", "bar", 3]))) def test_loc_setitem_incremental_with_dst(self): # GH#20724 @@ -2061,8 +1992,8 @@ def test_loc_setitem_incremental_with_dst(self): ], ids=["self", "to_datetime64", "to_pydatetime", "np.datetime64"], ) - def test_loc_setitem_datetime_keys_cast(self, conv): - # GH#9516 + def test_loc_setitem_datetime_keys_cast(self, conv, using_infer_string): + # GH#9516, GH#51363 changed in 3.0 to not cast on Index.insert dt1 = Timestamp("20130101 09:00:00") dt2 = Timestamp("20130101 10:00:00") df = DataFrame() @@ -2071,8 +2002,10 @@ def test_loc_setitem_datetime_keys_cast(self, conv): expected = DataFrame( {"one": [100.0, 200.0]}, - index=[dt1, dt2], - columns=Index(["one"], dtype=object), + index=Index( + [conv(dt1), conv(dt2)], dtype=None if using_infer_string else object + ), + columns=Index(["one"]), ) tm.assert_frame_equal(df, expected) @@ -2119,7 +2052,7 @@ def test_setitem_with_expansion(self): expected = Series([v[0].tz_convert("UTC"), df.loc[1, "time"]], name="time") tm.assert_series_equal(df2.time, expected) - v = df.loc[df.new_col == "new", "time"] + Timedelta("1s") + v = df.loc[df.new_col == "new", "time"] + Timedelta("1s").as_unit("s") df.loc[df.new_col == "new", "time"] = v tm.assert_series_equal(df.loc[df.new_col == "new", "time"], v) @@ -2144,7 +2077,7 @@ def test_loc_setitem_with_expansion_nonunique_index(self, index): N = len(index) arr = np.arange(N).astype(np.int64) - orig = DataFrame(arr, index=index, columns=[0]) + orig = DataFrame(arr, index=index) # key that will requiring object-dtype casting in the index key = "kapow" @@ -2157,7 +2090,7 @@ def test_loc_setitem_with_expansion_nonunique_index(self, index): else: assert exp_index[-1] == key exp_data = np.arange(N + 1).astype(np.float64) - expected = DataFrame(exp_data, index=exp_index, columns=[0]) + expected = DataFrame(exp_data, index=exp_index) # Add new row, but no new columns df = orig.copy() @@ -2180,12 +2113,11 @@ def test_loc_setitem_with_expansion_nonunique_index(self, index): ) tm.assert_frame_equal(df, expected) - @pytest.mark.parametrize( - "dtype", ["Int32", "Int64", "UInt32", "UInt64", "Float32", "Float64"] - ) - def test_loc_setitem_with_expansion_preserves_nullable_int(self, dtype): + def test_loc_setitem_with_expansion_preserves_nullable_int( + self, any_numeric_ea_dtype + ): # GH#42099 - ser = Series([0, 1, 2, 3], dtype=dtype) + ser = Series([0, 1, 2, 3], dtype=any_numeric_ea_dtype) df = DataFrame({"data": ser}) result = DataFrame(index=df.index) @@ -2646,7 +2578,6 @@ def test_loc_setitem_mask_td64_series_value(self): assert expected == result tm.assert_frame_equal(df, df_copy) - @td.skip_array_manager_invalid_test # TODO(ArrayManager) rewrite not using .values def test_loc_setitem_boolean_and_column(self, float_frame): expected = float_frame.copy() mask = float_frame["A"] > 0 @@ -2658,9 +2589,7 @@ def test_loc_setitem_boolean_and_column(self, float_frame): expected = DataFrame(values, index=expected.index, columns=expected.columns) tm.assert_frame_equal(float_frame, expected) - def test_loc_setitem_ndframe_values_alignment( - self, using_copy_on_write, warn_copy_on_write - ): + def test_loc_setitem_ndframe_values_alignment(self): # 
GH#45501 df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) df.loc[[False, False, True], ["a"]] = DataFrame( @@ -2683,12 +2612,8 @@ def test_loc_setitem_ndframe_values_alignment( df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) df_orig = df.copy() ser = df["a"] - with tm.assert_cow_warning(warn_copy_on_write): - ser.loc[[False, False, True]] = Series([10, 11, 12], index=[2, 1, 0]) - if using_copy_on_write: - tm.assert_frame_equal(df, df_orig) - else: - tm.assert_frame_equal(df, expected) + ser.loc[[False, False, True]] = Series([10, 11, 12], index=[2, 1, 0]) + tm.assert_frame_equal(df, df_orig) def test_loc_indexer_empty_broadcast(self): # GH#51450 @@ -2855,7 +2780,7 @@ def test_loc_axis_1_slice(): index=tuple("ABCDEFGHIJ"), columns=MultiIndex.from_tuples(cols), ) - result = df.loc(axis=1)[(2014, 9):(2015, 8)] + result = df.loc(axis=1)[(2014, 9) : (2015, 8)] expected = DataFrame( np.ones((10, 4)), index=tuple("ABCDEFGHIJ"), @@ -3018,20 +2943,9 @@ def test_loc_setitem_uint8_upcast(value): # GH#26049 df = DataFrame([1, 2, 3, 4], columns=["col1"], dtype="uint8") - with tm.assert_produces_warning(FutureWarning, match="item of incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.loc[2, "col1"] = value # value that can't be held in uint8 - if np_version_gt2 and isinstance(value, np.int16): - # Note, result type of uint8 + int16 is int16 - # in numpy < 2, though, numpy would inspect the - # value and see that it could fit in an uint16, resulting in a uint16 - dtype = "int16" - else: - dtype = "uint16" - - expected = DataFrame([1, 2, 300, 4], columns=["col1"], dtype=dtype) - tm.assert_frame_equal(df, expected) - @pytest.mark.parametrize( "fill_val,exp_dtype", @@ -3321,7 +3235,6 @@ def test_loc_assign_dict_to_row(self, dtype): tm.assert_frame_equal(df, expected) - @td.skip_array_manager_invalid_test def test_loc_setitem_dict_timedelta_multiple_set(self): # GH 16309 result = DataFrame(columns=["time", "value"]) @@ -3355,3 +3268,110 @@ def test_getitem_loc_str_periodindex(self): index = pd.period_range(start="2000", periods=20, freq="B") series = Series(range(20), index=index) assert series.loc["2000-01-14"] == 9 + + def test_loc_nonunique_masked_index(self): + # GH 57027 + ids = list(range(11)) + index = Index(ids * 1000, dtype="Int64") + df = DataFrame({"val": np.arange(len(index), dtype=np.intp)}, index=index) + result = df.loc[ids] + expected = DataFrame( + {"val": index.argsort(kind="stable").astype(np.intp)}, + index=Index(np.array(ids).repeat(1000), dtype="Int64"), + ) + tm.assert_frame_equal(result, expected) + + def test_loc_index_alignment_for_series(self): + # GH #56024 + df = DataFrame({"a": [1, 2], "b": [3, 4]}) + other = Series([200, 999], index=[1, 0]) + df.loc[:, "a"] = other + expected = DataFrame({"a": [999, 200], "b": [3, 4]}) + tm.assert_frame_equal(expected, df) + + def test_loc_reindexing_of_empty_index(self): + # GH 57735 + df = DataFrame(index=[1, 1, 2, 2], data=["1", "1", "2", "2"]) + df.loc[Series([False] * 4, index=df.index, name=0), 0] = df[0] + expected = DataFrame(index=[1, 1, 2, 2], data=["1", "1", "2", "2"]) + tm.assert_frame_equal(df, expected) + + @pytest.mark.parametrize( + "df, row_index, col_index, expected_df", + [ + [ + DataFrame([[1, 2, 3], [4, 5, 6]], columns=list("ABC")), + slice(0, 3), + ["A", "B", "C"], + DataFrame([[10, 10, 10], [20, 20, 20]], columns=list("ABC")), + ], + [ + DataFrame([[1, 2, 3], [4, 5, 6]], columns=list("ABC")), + slice(None), + ["A", "B", "C"], + DataFrame([[10, 10, 10], [20, 20, 20]], columns=list("ABC")), + 
], + [ + DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=list("ABC")), + [True, True, True], + ["A", "B", "C"], + DataFrame( + [[10, 10, 10], [20, 20, 20], [30, 30, 30]], columns=list("ABC") + ), + ], + [ + DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=list("ABC")), + slice(0, 4), + ["A", "B", "C"], + DataFrame( + [[10, 10, 10], [20, 20, 20], [30, 30, 30]], columns=list("ABC") + ), + ], + [ + DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=list("ABC")), + slice(None), + slice("A", "C"), + DataFrame( + [[10, 10, 10], [20, 20, 20], [30, 30, 30]], columns=list("ABC") + ), + ], + [ + DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=list("ABC")), + slice(None), + Series( + { + "A": True, + "C": False, + "B": True, + } + ), + DataFrame([[10, 10, 3], [20, 20, 6], [30, 30, 9]], columns=list("ABC")), + ], + ], + ) + def test_loc_set_series_to_multiple_columns( + self, df, row_index, col_index, expected_df + ): + # GH 59933 + df.loc[row_index, col_index] = Series([10, 20, 30]) + tm.assert_frame_equal(df, expected_df) + + def test_loc_setitem_matching_index(self): + # GH 25548 + s = Series(0.0, index=list("abcd")) + s1 = Series(1.0, index=list("ab")) + s2 = Series(2.0, index=list("xy")) + + # Test matching indices + s.loc[["a", "b"]] = s1 + + result = s[["a", "b"]] + expected = s1 + tm.assert_series_equal(result, expected) + + # Test unmatched indices + s.loc[["a", "b"]] = s2 + + result = s[["a", "b"]] + expected = Series([np.nan, np.nan], index=["a", "b"]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexing/test_na_indexing.py b/pandas/tests/indexing/test_na_indexing.py index 5364cfe852430..d4ad350a64e4d 100644 --- a/pandas/tests/indexing/test_na_indexing.py +++ b/pandas/tests/indexing/test_na_indexing.py @@ -54,7 +54,6 @@ def test_series_mask_boolean(values, dtype, mask, indexer_class, frame): msg = "iLocation based boolean indexing cannot use an indexable as a mask" with pytest.raises(ValueError, match=msg): result = obj.iloc[mask] - tm.assert_equal(result, expected) else: result = obj.iloc[mask] tm.assert_equal(result, expected) diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index ca551024b4c1f..6f20d0e4e7cbf 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -30,7 +30,7 @@ def test_empty_frame_setitem_index_name_retained(self): expected = DataFrame( {"series": [1.23] * 4}, index=pd.RangeIndex(4, name="df_index"), - columns=Index(["series"], dtype=object), + columns=Index(["series"]), ) tm.assert_frame_equal(df, expected) @@ -43,7 +43,7 @@ def test_empty_frame_setitem_index_name_inherited(self): expected = DataFrame( {"series": [1.23] * 4}, index=pd.RangeIndex(4, name="series_index"), - columns=Index(["series"], dtype=object), + columns=Index(["series"]), ) tm.assert_frame_equal(df, expected) @@ -96,9 +96,7 @@ def test_partial_set_empty_frame2(self): # these work as they don't really change # anything but the index # GH#5632 - expected = DataFrame( - columns=Index(["foo"], dtype=object), index=Index([], dtype="object") - ) + expected = DataFrame(columns=Index(["foo"]), index=Index([], dtype="object")) df = DataFrame(index=Index([], dtype="object")) df["foo"] = Series([], dtype="object") @@ -116,9 +114,7 @@ def test_partial_set_empty_frame2(self): tm.assert_frame_equal(df, expected) def test_partial_set_empty_frame3(self): - expected = DataFrame( - columns=Index(["foo"], dtype=object), index=Index([], dtype="int64") - ) + expected = 
DataFrame(columns=Index(["foo"]), index=Index([], dtype="int64")) expected["foo"] = expected["foo"].astype("float64") df = DataFrame(index=Index([], dtype="int64")) @@ -135,9 +131,7 @@ def test_partial_set_empty_frame4(self): df = DataFrame(index=Index([], dtype="int64")) df["foo"] = range(len(df)) - expected = DataFrame( - columns=Index(["foo"], dtype=object), index=Index([], dtype="int64") - ) + expected = DataFrame(columns=Index(["foo"]), index=Index([], dtype="int64")) # range is int-dtype-like, so we get int64 dtype expected["foo"] = expected["foo"].astype("int64") tm.assert_frame_equal(df, expected) @@ -210,7 +204,7 @@ def test_partial_set_empty_frame_empty_copy_assignment(self): df = DataFrame(index=[0]) df = df.copy() df["a"] = 0 - expected = DataFrame(0, index=[0], columns=Index(["a"], dtype=object)) + expected = DataFrame(0, index=[0], columns=Index(["a"])) tm.assert_frame_equal(df, expected) def test_partial_set_empty_frame_empty_consistencies(self, using_infer_string): @@ -227,7 +221,7 @@ def test_partial_set_empty_frame_empty_consistencies(self, using_infer_string): { "x": Series( ["1", "2"], - dtype=object if not using_infer_string else "string[pyarrow_numpy]", + dtype=object if not using_infer_string else "str", ), "y": Series([np.nan, np.nan], dtype=object), } @@ -279,7 +273,7 @@ def test_partial_setting(self): s.iat[3] = 5.0 @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") - def test_partial_setting_frame(self, using_array_manager): + def test_partial_setting_frame(self): df_orig = DataFrame( np.arange(6).reshape(3, 2), columns=["A", "B"], dtype="int64" ) @@ -292,8 +286,6 @@ def test_partial_setting_frame(self, using_array_manager): df.iloc[4, 2] = 5.0 msg = "index 2 is out of bounds for axis 0 with size 2" - if using_array_manager: - msg = "list index out of range" with pytest.raises(IndexError, match=msg): df.iat[4, 2] = 5.0 @@ -582,7 +574,7 @@ def test_partial_set_invalid(self): ], ), ( - date_range(start="2000", periods=20, freq="D"), + date_range(start="2000", periods=20, freq="D", unit="s"), ["2000-01-04", "2000-01-08", "2000-01-12"], [ Timestamp("2000-01-04"), diff --git a/pandas/tests/indexing/test_scalar.py b/pandas/tests/indexing/test_scalar.py index 29e3dc0aebe95..730fe584d7f07 100644 --- a/pandas/tests/indexing/test_scalar.py +++ b/pandas/tests/indexing/test_scalar.py @@ -1,4 +1,5 @@ -""" test scalar indexing, including at and iat """ +"""test scalar indexing, including at and iat""" + from datetime import ( datetime, timedelta, @@ -10,6 +11,7 @@ from pandas import ( DataFrame, + Index, Series, Timedelta, Timestamp, @@ -32,29 +34,42 @@ def generate_indices(f, values=False): class TestScalar: - @pytest.mark.parametrize("kind", ["series", "frame"]) - @pytest.mark.parametrize("col", ["ints", "uints"]) - def test_iat_set_ints(self, kind, col, request): - f = request.getfixturevalue(f"{kind}_{col}") + @pytest.mark.parametrize("dtype", [np.int64, np.uint64]) + def test_iat_set_ints(self, dtype, frame_or_series): + f = frame_or_series(range(3), index=Index([0, 1, 2], dtype=dtype)) indices = generate_indices(f, True) for i in indices: f.iat[i] = 1 expected = f.values[i] tm.assert_almost_equal(expected, 1) - @pytest.mark.parametrize("kind", ["series", "frame"]) - @pytest.mark.parametrize("col", ["labels", "ts", "floats"]) - def test_iat_set_other(self, kind, col, request): - f = request.getfixturevalue(f"{kind}_{col}") + @pytest.mark.parametrize( + "index", + [ + Index(list("abcd"), dtype=object), + date_range("20130101", periods=4), + 
Index(range(0, 8, 2), dtype=np.float64), + ], + ) + def test_iat_set_other(self, index, frame_or_series): + f = frame_or_series(range(len(index)), index=index) msg = "iAt based indexing can only have integer indexers" + idx = next(generate_indices(f, False)) with pytest.raises(ValueError, match=msg): - idx = next(generate_indices(f, False)) f.iat[idx] = 1 - @pytest.mark.parametrize("kind", ["series", "frame"]) - @pytest.mark.parametrize("col", ["ints", "uints", "labels", "ts", "floats"]) - def test_at_set_ints_other(self, kind, col, request): - f = request.getfixturevalue(f"{kind}_{col}") + @pytest.mark.parametrize( + "index", + [ + Index(list("abcd"), dtype=object), + date_range("20130101", periods=4), + Index(range(0, 8, 2), dtype=np.float64), + Index(range(0, 8, 2), dtype=np.uint64), + Index(range(0, 8, 2), dtype=np.int64), + ], + ) + def test_at_set_ints_other(self, index, frame_or_series): + f = frame_or_series(range(len(index)), index=index) indices = generate_indices(f, False) for i in indices: f.at[i] = 1 diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 15c2b8d000b37..a41d7dec8b496 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -1,4 +1,7 @@ -from datetime import datetime +from datetime import ( + datetime, + timezone, +) import numpy as np import pytest @@ -8,7 +11,6 @@ is_ci_environment, is_platform_windows, ) -from pandas.compat.numpy import np_version_lt1p23 import pandas as pd import pandas._testing as tm @@ -21,29 +23,12 @@ from pandas.core.interchange.utils import ArrowCTypes -@pytest.fixture -def data_categorical(): - return { +@pytest.mark.parametrize("data", [("ordered", True), ("unordered", False)]) +def test_categorical_dtype(data): + data_categorical = { "ordered": pd.Categorical(list("testdata") * 30, ordered=True), "unordered": pd.Categorical(list("testdata") * 30, ordered=False), } - - -@pytest.fixture -def string_data(): - return { - "separator data": [ - "abC|DeF,Hik", - "234,3245.67", - "gSaf,qWer|Gre", - "asd3,4sad|", - np.nan, - ] - } - - -@pytest.mark.parametrize("data", [("ordered", True), ("unordered", False)]) -def test_categorical_dtype(data, data_categorical): df = pd.DataFrame({"A": (data_categorical[data[0]])}) col = df.__dataframe__().get_column_by_name("A") @@ -179,8 +164,6 @@ def test_missing_from_masked(): } ) - df2 = df.__dataframe__() - rng = np.random.default_rng(2) dict_null = {col: rng.integers(low=0, high=len(df)) for col in df.columns} for col, num_nulls in dict_null.items(): @@ -231,7 +214,16 @@ def test_mixed_missing(): assert df2.get_column_by_name(col_name).null_count == 2 -def test_string(string_data): +def test_string(): + string_data = { + "separator data": [ + "abC|DeF,Hik", + "234,3245.67", + "gSaf,qWer|Gre", + "asd3,4sad|", + np.nan, + ] + } test_str_data = string_data["separator data"] + [""] df = pd.DataFrame({"A": test_str_data}) col = df.__dataframe__().get_column_by_name("A") @@ -268,7 +260,6 @@ def test_datetime(): tm.assert_frame_equal(df, from_dataframe(df.__dataframe__())) -@pytest.mark.skipif(np_version_lt1p23, reason="Numpy > 1.23 required") def test_categorical_to_numpy_dlpack(): # https://github.com/pandas-dev/pandas/issues/48393 df = pd.DataFrame({"A": pd.Categorical(["a", "b", "a"])}) @@ -287,7 +278,7 @@ def test_empty_pyarrow(data): expected = pd.DataFrame(data) arrow_df = pa_from_dataframe(expected) result = from_dataframe(arrow_df) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, 
check_column_type=False) def test_multi_chunk_pyarrow() -> None: @@ -297,14 +288,57 @@ def test_multi_chunk_pyarrow() -> None: table = pa.table([n_legs], names=names) with pytest.raises( RuntimeError, - match="To join chunks a copy is required which is " - "forbidden by allow_copy=False", + match="Cannot do zero copy conversion into multi-column DataFrame block", ): pd.api.interchange.from_dataframe(table, allow_copy=False) +def test_multi_chunk_column() -> None: + pytest.importorskip("pyarrow", "11.0.0") + ser = pd.Series([1, 2, None], dtype="Int64[pyarrow]") + df = pd.concat([ser, ser], ignore_index=True).to_frame("a") + df_orig = df.copy() + with pytest.raises( + RuntimeError, match="Found multi-chunk pyarrow array, but `allow_copy` is False" + ): + pd.api.interchange.from_dataframe(df.__dataframe__(allow_copy=False)) + result = pd.api.interchange.from_dataframe(df.__dataframe__(allow_copy=True)) + # Interchange protocol defaults to creating numpy-backed columns, so currently this + # is 'float64'. + expected = pd.DataFrame({"a": [1.0, 2.0, None, 1.0, 2.0, None]}, dtype="float64") + tm.assert_frame_equal(result, expected) + + # Check that the rechunking we did didn't modify the original DataFrame. + tm.assert_frame_equal(df, df_orig) + assert len(df["a"].array._pa_array.chunks) == 2 + assert len(df_orig["a"].array._pa_array.chunks) == 2 + + +def test_timestamp_ns_pyarrow(): + # GH 56712 + pytest.importorskip("pyarrow", "11.0.0") + timestamp_args = { + "year": 2000, + "month": 1, + "day": 1, + "hour": 1, + "minute": 1, + "second": 1, + } + df = pd.Series( + [datetime(**timestamp_args)], + dtype="timestamp[ns][pyarrow]", + name="col0", + ).to_frame() + + dfi = df.__dataframe__() + result = pd.api.interchange.from_dataframe(dfi)["col0"].item() + + expected = pd.Timestamp(**timestamp_args) + assert result == expected + + @pytest.mark.parametrize("tz", ["UTC", "US/Pacific"]) -@pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) def test_datetimetzdtype(tz, unit): # GH 54239 tz_data = ( @@ -362,3 +396,256 @@ def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None: interchange.get_column_by_name = lambda _: column monkeypatch.setattr(df, "__dataframe__", lambda allow_copy: interchange) pd.api.interchange.from_dataframe(df) + + +def test_empty_string_column(): + # https://github.com/pandas-dev/pandas/issues/56703 + df = pd.DataFrame({"a": []}, dtype=str) + df2 = df.__dataframe__() + result = pd.api.interchange.from_dataframe(df2) + tm.assert_frame_equal(df, result) + + +def test_large_string(): + # GH#56702 + pytest.importorskip("pyarrow") + df = pd.DataFrame({"a": ["x"]}, dtype="large_string[pyarrow]") + result = pd.api.interchange.from_dataframe(df.__dataframe__()) + expected = pd.DataFrame({"a": ["x"]}, dtype="str") + tm.assert_frame_equal(result, expected) + + +def test_non_str_names(): + # https://github.com/pandas-dev/pandas/issues/56701 + df = pd.Series([1, 2, 3], name=0).to_frame() + names = df.__dataframe__().column_names() + assert names == ["0"] + + +def test_non_str_names_w_duplicates(): + # https://github.com/pandas-dev/pandas/issues/56701 + df = pd.DataFrame({"0": [1, 2, 3], 0: [4, 5, 6]}) + dfi = df.__dataframe__() + with pytest.raises( + TypeError, + match=( + "Expected a Series, got a DataFrame. This likely happened because you " + "called __dataframe__ on a DataFrame which, after converting column " + r"names to string, resulted in duplicated names: Index\(\['0', '0'\], " + r"dtype='(str|object)'\). 
Please rename these columns before using the " + "interchange protocol." + ), + ): + pd.api.interchange.from_dataframe(dfi, allow_copy=False) + + +@pytest.mark.parametrize( + ("data", "dtype", "expected_dtype"), + [ + ([1, 2, None], "Int64", "int64"), + ([1, 2, None], "Int64[pyarrow]", "int64"), + ([1, 2, None], "Int8", "int8"), + ([1, 2, None], "Int8[pyarrow]", "int8"), + ( + [1, 2, None], + "UInt64", + "uint64", + ), + ( + [1, 2, None], + "UInt64[pyarrow]", + "uint64", + ), + ([1.0, 2.25, None], "Float32", "float32"), + ([1.0, 2.25, None], "Float32[pyarrow]", "float32"), + ([True, False, None], "boolean", "bool"), + ([True, False, None], "boolean[pyarrow]", "bool"), + (["much ado", "about", None], pd.StringDtype(na_value=np.nan), "large_string"), + (["much ado", "about", None], "string[pyarrow]", "large_string"), + ( + [datetime(2020, 1, 1), datetime(2020, 1, 2), None], + "timestamp[ns][pyarrow]", + "timestamp[ns]", + ), + ( + [datetime(2020, 1, 1), datetime(2020, 1, 2), None], + "timestamp[us][pyarrow]", + "timestamp[us]", + ), + ( + [ + datetime(2020, 1, 1, tzinfo=timezone.utc), + datetime(2020, 1, 2, tzinfo=timezone.utc), + None, + ], + "timestamp[us, Asia/Kathmandu][pyarrow]", + "timestamp[us, tz=Asia/Kathmandu]", + ), + ], +) +def test_pandas_nullable_with_missing_values( + data: list, dtype: str, expected_dtype: str +) -> None: + # https://github.com/pandas-dev/pandas/issues/57643 + # https://github.com/pandas-dev/pandas/issues/57664 + pa = pytest.importorskip("pyarrow", "11.0.0") + import pyarrow.interchange as pai + + if expected_dtype == "timestamp[us, tz=Asia/Kathmandu]": + expected_dtype = pa.timestamp("us", "Asia/Kathmandu") + + df = pd.DataFrame({"a": data}, dtype=dtype) + result = pai.from_dataframe(df.__dataframe__())["a"] + assert result.type == expected_dtype + assert result[0].as_py() == data[0] + assert result[1].as_py() == data[1] + assert result[2].as_py() is None + + +@pytest.mark.parametrize( + ("data", "dtype", "expected_dtype"), + [ + ([1, 2, 3], "Int64", "int64"), + ([1, 2, 3], "Int64[pyarrow]", "int64"), + ([1, 2, 3], "Int8", "int8"), + ([1, 2, 3], "Int8[pyarrow]", "int8"), + ( + [1, 2, 3], + "UInt64", + "uint64", + ), + ( + [1, 2, 3], + "UInt64[pyarrow]", + "uint64", + ), + ([1.0, 2.25, 5.0], "Float32", "float32"), + ([1.0, 2.25, 5.0], "Float32[pyarrow]", "float32"), + ([True, False, False], "boolean", "bool"), + ([True, False, False], "boolean[pyarrow]", "bool"), + ( + ["much ado", "about", "nothing"], + pd.StringDtype(na_value=np.nan), + "large_string", + ), + (["much ado", "about", "nothing"], "string[pyarrow]", "large_string"), + ( + [datetime(2020, 1, 1), datetime(2020, 1, 2), datetime(2020, 1, 3)], + "timestamp[ns][pyarrow]", + "timestamp[ns]", + ), + ( + [datetime(2020, 1, 1), datetime(2020, 1, 2), datetime(2020, 1, 3)], + "timestamp[us][pyarrow]", + "timestamp[us]", + ), + ( + [ + datetime(2020, 1, 1, tzinfo=timezone.utc), + datetime(2020, 1, 2, tzinfo=timezone.utc), + datetime(2020, 1, 3, tzinfo=timezone.utc), + ], + "timestamp[us, Asia/Kathmandu][pyarrow]", + "timestamp[us, tz=Asia/Kathmandu]", + ), + ], +) +def test_pandas_nullable_without_missing_values( + data: list, dtype: str, expected_dtype: str +) -> None: + # https://github.com/pandas-dev/pandas/issues/57643 + pa = pytest.importorskip("pyarrow", "11.0.0") + import pyarrow.interchange as pai + + if expected_dtype == "timestamp[us, tz=Asia/Kathmandu]": + expected_dtype = pa.timestamp("us", "Asia/Kathmandu") + + df = pd.DataFrame({"a": data}, dtype=dtype) + result = 
pai.from_dataframe(df.__dataframe__())["a"] + assert result.type == expected_dtype + assert result[0].as_py() == data[0] + assert result[1].as_py() == data[1] + assert result[2].as_py() == data[2] + + +def test_string_validity_buffer() -> None: + # https://github.com/pandas-dev/pandas/issues/57761 + pytest.importorskip("pyarrow", "11.0.0") + df = pd.DataFrame({"a": ["x"]}, dtype="large_string[pyarrow]") + result = df.__dataframe__().get_column_by_name("a").get_buffers()["validity"] + assert result is None + + +def test_string_validity_buffer_no_missing() -> None: + # https://github.com/pandas-dev/pandas/issues/57762 + pytest.importorskip("pyarrow", "11.0.0") + df = pd.DataFrame({"a": ["x", None]}, dtype="large_string[pyarrow]") + validity = df.__dataframe__().get_column_by_name("a").get_buffers()["validity"] + assert validity is not None + result = validity[1] + expected = (DtypeKind.BOOL, 1, ArrowCTypes.BOOL, "=") + assert result == expected + + +def test_empty_dataframe(): + # https://github.com/pandas-dev/pandas/issues/56700 + df = pd.DataFrame({"a": []}, dtype="int8") + dfi = df.__dataframe__() + result = pd.api.interchange.from_dataframe(dfi, allow_copy=False) + expected = pd.DataFrame({"a": []}, dtype="int8") + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + ("data", "expected_dtype", "expected_buffer_dtype"), + [ + ( + pd.Series(["a", "b", "a"], dtype="category"), + (DtypeKind.CATEGORICAL, 8, "c", "="), + (DtypeKind.INT, 8, "c", "|"), + ), + ( + pd.Series( + [datetime(2022, 1, 1), datetime(2022, 1, 2), datetime(2022, 1, 3)], + dtype="M8[ns]", + ), + (DtypeKind.DATETIME, 64, "tsn:", "="), + (DtypeKind.INT, 64, ArrowCTypes.INT64, "="), + ), + ( + pd.Series(["a", "bc", None]), + (DtypeKind.STRING, 8, ArrowCTypes.STRING, "="), + (DtypeKind.UINT, 8, ArrowCTypes.UINT8, "="), + ), + ( + pd.Series([1, 2, 3]), + (DtypeKind.INT, 64, ArrowCTypes.INT64, "="), + (DtypeKind.INT, 64, ArrowCTypes.INT64, "="), + ), + ( + pd.Series([1.5, 2, 3]), + (DtypeKind.FLOAT, 64, ArrowCTypes.FLOAT64, "="), + (DtypeKind.FLOAT, 64, ArrowCTypes.FLOAT64, "="), + ), + ], +) +def test_buffer_dtype_categorical( + data: pd.Series, + expected_dtype: tuple[DtypeKind, int, str, str], + expected_buffer_dtype: tuple[DtypeKind, int, str, str], +) -> None: + # https://github.com/pandas-dev/pandas/issues/54781 + df = pd.DataFrame({"data": data}) + dfi = df.__dataframe__() + col = dfi.get_column_by_name("data") + assert col.dtype == expected_dtype + assert col.get_buffers()["data"][1] == expected_buffer_dtype + + +def test_from_dataframe_list_dtype(): + pa = pytest.importorskip("pyarrow", "14.0.0") + data = {"a": [[1, 2], [4, 5, 6]]} + tbl = pa.table(data) + result = from_dataframe(tbl) + expected = pd.DataFrame(data) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/interchange/test_spec_conformance.py b/pandas/tests/interchange/test_spec_conformance.py index 7c02379c11853..55e42ed2023cd 100644 --- a/pandas/tests/interchange/test_spec_conformance.py +++ b/pandas/tests/interchange/test_spec_conformance.py @@ -2,6 +2,7 @@ A verbatim copy (vendored) of the spec tests. 
Taken from https://github.com/data-apis/dataframe-api """ + import ctypes import math diff --git a/pandas/tests/internals/test_api.py b/pandas/tests/internals/test_api.py index f816cef38b9ab..fc222f6987466 100644 --- a/pandas/tests/internals/test_api.py +++ b/pandas/tests/internals/test_api.py @@ -3,10 +3,14 @@ in core.internals """ +import datetime + +import numpy as np import pytest import pandas as pd import pandas._testing as tm +from pandas.api.internals import create_dataframe_from_blocks from pandas.core import internals from pandas.core.internals import api @@ -23,19 +27,13 @@ def test_namespace(): "concat", "managers", "construction", - "array_manager", - "base", "api", "ops", ] expected = [ "make_block", - "DataManager", - "ArrayManager", "BlockManager", - "SingleDataManager", "SingleBlockManager", - "SingleArrayManager", "concatenate_managers", ] @@ -46,28 +44,21 @@ def test_namespace(): @pytest.mark.parametrize( "name", [ - "NumericBlock", - "ObjectBlock", "Block", "ExtensionBlock", - "DatetimeTZBlock", ], ) def test_deprecations(name): # GH#55139 msg = f"{name} is deprecated.* Use public APIs instead" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): getattr(internals, name) - if name not in ["NumericBlock", "ObjectBlock"]: - # NumericBlock and ObjectBlock are not in the internals.api namespace - with tm.assert_produces_warning(DeprecationWarning, match=msg): - getattr(api, name) - def test_make_block_2d_with_dti(): # GH#41168 dti = pd.date_range("2012", periods=3, tz="UTC") + msg = "make_block is deprecated" with tm.assert_produces_warning(DeprecationWarning, match=msg): blk = api.make_block(dti, placement=[0]) @@ -84,5 +75,93 @@ def test_create_block_manager_from_blocks_deprecated(): "create_block_manager_from_blocks is deprecated and will be " "removed in a future version. 
Use public APIs instead" ) - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): internals.create_block_manager_from_blocks + + +def test_create_dataframe_from_blocks(float_frame): + block = float_frame._mgr.blocks[0] + index = float_frame.index.copy() + columns = float_frame.columns.copy() + + result = create_dataframe_from_blocks( + [(block.values, block.mgr_locs.as_array)], index=index, columns=columns + ) + tm.assert_frame_equal(result, float_frame) + + +def test_create_dataframe_from_blocks_types(): + df = pd.DataFrame( + { + "int": list(range(1, 4)), + "uint": np.arange(3, 6).astype("uint8"), + "float": [2.0, np.nan, 3.0], + "bool": np.array([True, False, True]), + "boolean": pd.array([True, False, None], dtype="boolean"), + "string": list("abc"), + "datetime": pd.date_range("20130101", periods=3), + "datetimetz": pd.date_range("20130101", periods=3).tz_localize( + "Europe/Brussels" + ), + "timedelta": pd.timedelta_range("1 day", periods=3), + "period": pd.period_range("2012-01-01", periods=3, freq="D"), + "categorical": pd.Categorical(["a", "b", "a"]), + "interval": pd.IntervalIndex.from_tuples([(0, 1), (1, 2), (3, 4)]), + } + ) + + result = create_dataframe_from_blocks( + [(block.values, block.mgr_locs.as_array) for block in df._mgr.blocks], + index=df.index, + columns=df.columns, + ) + tm.assert_frame_equal(result, df) + + +def test_create_dataframe_from_blocks_datetimelike(): + # extension dtypes that have an exact matching numpy dtype can also be + # be passed as a numpy array + index, columns = pd.RangeIndex(3), pd.Index(["a", "b", "c", "d"]) + + block_array1 = np.arange( + datetime.datetime(2020, 1, 1), + datetime.datetime(2020, 1, 7), + step=datetime.timedelta(1), + ).reshape((2, 3)) + block_array2 = np.arange( + datetime.timedelta(1), datetime.timedelta(7), step=datetime.timedelta(1) + ).reshape((2, 3)) + result = create_dataframe_from_blocks( + [(block_array1, np.array([0, 2])), (block_array2, np.array([1, 3]))], + index=index, + columns=columns, + ) + expected = pd.DataFrame( + { + "a": pd.date_range("2020-01-01", periods=3, unit="us"), + "b": pd.timedelta_range("1 days", periods=3, unit="us"), + "c": pd.date_range("2020-01-04", periods=3, unit="us"), + "d": pd.timedelta_range("4 days", periods=3, unit="us"), + } + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "array", + [ + pd.date_range("2020-01-01", periods=3), + pd.date_range("2020-01-01", periods=3, tz="UTC"), + pd.period_range("2012-01-01", periods=3, freq="D"), + pd.timedelta_range("1 day", periods=3), + ], +) +def test_create_dataframe_from_blocks_1dEA(array): + # ExtensionArrays can be passed as 1D even if stored under the hood as 2D + df = pd.DataFrame({"a": array}) + + block = df._mgr.blocks[0] + result = create_dataframe_from_blocks( + [(block.values[0], block.mgr_locs.as_array)], index=df.index, columns=df.columns + ) + tm.assert_frame_equal(result, df) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 2265522bc7ecb..ac8ac0766f04d 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -10,7 +10,6 @@ from pandas._libs.internals import BlockPlacement from pandas.compat import IS64 -import pandas.util._test_decorators as td from pandas.core.dtypes.common import is_scalar @@ -44,10 +43,6 @@ new_block, ) -# this file contains BlockManager specific tests -# TODO(ArrayManager) factor out interleave_dtype tests 
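# Editorial sketch of the new public constructor exercised in test_api.py above,
# assuming pandas>=3.0 where pandas.api.internals.create_dataframe_from_blocks exists.
# The round-trip mirrors test_create_dataframe_from_blocks, using an illustrative
# two-column float frame rather than the float_frame fixture:
import pandas as pd
import pandas._testing as tm
from pandas.api.internals import create_dataframe_from_blocks

df = pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]})
block = df._mgr.blocks[0]  # a single float64 block holds both columns
result = create_dataframe_from_blocks(
    [(block.values, block.mgr_locs.as_array)],  # list of (values, placement) pairs
    index=df.index,
    columns=df.columns,
)
tm.assert_frame_equal(result, df)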
-pytestmark = td.skip_array_manager_invalid_test - @pytest.fixture(params=[new_block, make_block]) def block_maker(request): @@ -201,7 +196,7 @@ def create_mgr(descr, item_shape=None): * components with same DTYPE_ID are combined into single block * to force multiple blocks with same dtype, use '-SUFFIX':: - 'a:f8-1; b:f8-2; c:f8-foobar' + "a:f8-1; b:f8-2; c:f8-foobar" """ if item_shape is None: @@ -352,7 +347,7 @@ def test_split(self): # GH#37799 values = np.random.default_rng(2).standard_normal((3, 4)) blk = new_block(values, placement=BlockPlacement([3, 1, 6]), ndim=2) - result = blk._split() + result = list(blk._split()) # check that we get views, not copies values[:] = -9999 @@ -386,8 +381,8 @@ def test_duplicate_ref_loc_failure(self): msg = "Gaps in blk ref_locs" + mgr = BlockManager(blocks, axes) with pytest.raises(AssertionError, match=msg): - mgr = BlockManager(blocks, axes) mgr._rebuild_blknos_and_blklocs() blocks[0].mgr_locs = BlockPlacement(np.array([0])) @@ -617,7 +612,7 @@ def _compare(old_mgr, new_mgr): # noops mgr = create_mgr("f: i8; g: f8") - new_mgr = mgr.convert(copy=True) + new_mgr = mgr.convert() _compare(mgr, new_mgr) # convert @@ -625,8 +620,8 @@ def _compare(old_mgr, new_mgr): mgr.iset(0, np.array(["1"] * N, dtype=np.object_)) mgr.iset(1, np.array(["2."] * N, dtype=np.object_)) mgr.iset(2, np.array(["foo."] * N, dtype=np.object_)) - new_mgr = mgr.convert(copy=True) - dtype = "string[pyarrow_numpy]" if using_infer_string else np.object_ + new_mgr = mgr.convert() + dtype = "str" if using_infer_string else np.object_ assert new_mgr.iget(0).dtype == dtype assert new_mgr.iget(1).dtype == dtype assert new_mgr.iget(2).dtype == dtype @@ -639,7 +634,7 @@ def _compare(old_mgr, new_mgr): mgr.iset(0, np.array(["1"] * N, dtype=np.object_)) mgr.iset(1, np.array(["2."] * N, dtype=np.object_)) mgr.iset(2, np.array(["foo."] * N, dtype=np.object_)) - new_mgr = mgr.convert(copy=True) + new_mgr = mgr.convert() assert new_mgr.iget(0).dtype == dtype assert new_mgr.iget(1).dtype == dtype assert new_mgr.iget(2).dtype == dtype @@ -758,7 +753,7 @@ def test_reindex_items(self): mgr.iget(3).internal_values(), reindexed.iget(3).internal_values() ) - def test_get_numeric_data(self, using_copy_on_write): + def test_get_numeric_data(self): mgr = create_mgr( "int: int; float: float; complex: complex;" "str: object; bool: bool; obj: object; dt: datetime", @@ -779,18 +774,12 @@ def test_get_numeric_data(self, using_copy_on_write): np.array([100.0, 200.0, 300.0]), inplace=True, ) - if using_copy_on_write: - tm.assert_almost_equal( - mgr.iget(mgr.items.get_loc("float")).internal_values(), - np.array([1.0, 1.0, 1.0]), - ) - else: - tm.assert_almost_equal( - mgr.iget(mgr.items.get_loc("float")).internal_values(), - np.array([100.0, 200.0, 300.0]), - ) + tm.assert_almost_equal( + mgr.iget(mgr.items.get_loc("float")).internal_values(), + np.array([1.0, 1.0, 1.0]), + ) - def test_get_bool_data(self, using_copy_on_write): + def test_get_bool_data(self): mgr = create_mgr( "int: int; float: float; complex: complex;" "str: object; bool: bool; obj: object; dt: datetime", @@ -806,16 +795,10 @@ def test_get_bool_data(self, using_copy_on_write): ) bools.iset(0, np.array([True, False, True]), inplace=True) - if using_copy_on_write: - tm.assert_numpy_array_equal( - mgr.iget(mgr.items.get_loc("bool")).internal_values(), - np.array([True, True, True]), - ) - else: - tm.assert_numpy_array_equal( - mgr.iget(mgr.items.get_loc("bool")).internal_values(), - np.array([True, False, True]), - ) + tm.assert_numpy_array_equal( 
+ mgr.iget(mgr.items.get_loc("bool")).internal_values(), + np.array([True, True, True]), + ) def test_unicode_repr_doesnt_raise(self): repr(create_mgr("b,\u05d0: object")) @@ -1163,7 +1146,6 @@ def test_array_to_slice_conversion(self, arr, slc): [-1], [-1, -2, -3], [-10], - [-1], [-1, 0, 1, 2], [-2, 0, 2, 4], [1, 0, -1], @@ -1298,19 +1280,19 @@ def test_interval_can_hold_element(self, dtype, element): # `elem` to not have the same length as `arr` ii2 = IntervalIndex.from_breaks(arr[:-1], closed="neither") elem = element(ii2) - with tm.assert_produces_warning(FutureWarning): + with pytest.raises(TypeError, match="Invalid value"): self.check_series_setitem(elem, ii, False) assert not blk._can_hold_element(elem) ii3 = IntervalIndex.from_breaks([Timestamp(1), Timestamp(3), Timestamp(4)]) elem = element(ii3) - with tm.assert_produces_warning(FutureWarning): + with pytest.raises(TypeError, match="Invalid value"): self.check_series_setitem(elem, ii, False) assert not blk._can_hold_element(elem) ii4 = IntervalIndex.from_breaks([Timedelta(1), Timedelta(3), Timedelta(4)]) elem = element(ii4) - with tm.assert_produces_warning(FutureWarning): + with pytest.raises(TypeError, match="Invalid value"): self.check_series_setitem(elem, ii, False) assert not blk._can_hold_element(elem) @@ -1330,14 +1312,24 @@ def test_period_can_hold_element(self, element): # `elem` to not have the same length as `arr` pi2 = pi.asfreq("D")[:-1] elem = element(pi2) - with tm.assert_produces_warning(FutureWarning): + with pytest.raises(TypeError, match="Invalid value"): self.check_series_setitem(elem, pi, False) dti = pi.to_timestamp("s")[:-1] elem = element(dti) - with tm.assert_produces_warning(FutureWarning): + with pytest.raises(TypeError, match="Invalid value"): self.check_series_setitem(elem, pi, False) + def test_period_reindex_axis(self): + # GH#60273 Test reindexing of block with PeriodDtype + pi = period_range("2020", periods=5, freq="Y") + blk = new_block(pi._data.reshape(5, 1), BlockPlacement(slice(5)), ndim=2) + mgr = BlockManager(blocks=(blk,), axes=[Index(np.arange(5)), Index(["a"])]) + reindexed = mgr.reindex_axis(Index([0, 2, 4]), axis=0) + result = DataFrame._from_mgr(reindexed, axes=reindexed.axes) + expected = DataFrame([[pi[0], pi[2], pi[4]]], columns=[0, 2, 4], index=["a"]) + tm.assert_frame_equal(result, expected) + def check_can_hold_element(self, obj, elem, inplace: bool): blk = obj._mgr.blocks[0] if inplace: @@ -1383,8 +1375,8 @@ def test_validate_ndim(): values = np.array([1.0, 2.0]) placement = BlockPlacement(slice(2)) msg = r"Wrong number of dimensions. 
values.ndim != ndim \[1 != 2\]" - depr_msg = "make_block is deprecated" + depr_msg = "make_block is deprecated" with pytest.raises(ValueError, match=msg): with tm.assert_produces_warning(DeprecationWarning, match=depr_msg): make_block(values, placement, ndim=2) @@ -1402,11 +1394,11 @@ def test_make_block_no_pandas_array(block_maker): # https://github.com/pandas-dev/pandas/pull/24866 arr = pd.arrays.NumpyExtensionArray(np.array([1, 2])) - warn = None if block_maker is not make_block else DeprecationWarning - msg = "make_block is deprecated and will be removed in a future version" + depr_msg = "make_block is deprecated" + warn = DeprecationWarning if block_maker is make_block else None # NumpyExtensionArray, no dtype - with tm.assert_produces_warning(warn, match=msg): + with tm.assert_produces_warning(warn, match=depr_msg): result = block_maker(arr, BlockPlacement(slice(len(arr))), ndim=arr.ndim) assert result.dtype.kind in ["i", "u"] @@ -1415,14 +1407,14 @@ def test_make_block_no_pandas_array(block_maker): assert result.is_extension is False # NumpyExtensionArray, NumpyEADtype - with tm.assert_produces_warning(warn, match=msg): + with tm.assert_produces_warning(warn, match=depr_msg): result = block_maker(arr, slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim) assert result.dtype.kind in ["i", "u"] assert result.is_extension is False - # new_block no longer taked dtype keyword + # new_block no longer accepts dtype keyword # ndarray, NumpyEADtype - with tm.assert_produces_warning(warn, match=msg): + with tm.assert_produces_warning(warn, match=depr_msg): result = block_maker( arr.to_numpy(), slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim ) diff --git a/pandas/tests/internals/test_managers.py b/pandas/tests/internals/test_managers.py deleted file mode 100644 index f40362c299717..0000000000000 --- a/pandas/tests/internals/test_managers.py +++ /dev/null @@ -1,103 +0,0 @@ -""" -Testing interaction between the different managers (BlockManager, ArrayManager) -""" -import os -import subprocess -import sys - -import pytest - -from pandas.core.dtypes.missing import array_equivalent - -import pandas as pd -import pandas._testing as tm -from pandas.core.internals import ( - ArrayManager, - BlockManager, - SingleArrayManager, - SingleBlockManager, -) - - -def test_dataframe_creation(): - msg = "data_manager option is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - with pd.option_context("mode.data_manager", "block"): - df_block = pd.DataFrame( - {"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": [4, 5, 6]} - ) - assert isinstance(df_block._mgr, BlockManager) - - with tm.assert_produces_warning(FutureWarning, match=msg): - with pd.option_context("mode.data_manager", "array"): - df_array = pd.DataFrame( - {"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": [4, 5, 6]} - ) - assert isinstance(df_array._mgr, ArrayManager) - - # also ensure both are seen as equal - tm.assert_frame_equal(df_block, df_array) - - # conversion from one manager to the other - result = df_block._as_manager("block") - assert isinstance(result._mgr, BlockManager) - result = df_block._as_manager("array") - assert isinstance(result._mgr, ArrayManager) - tm.assert_frame_equal(result, df_block) - assert all( - array_equivalent(left, right) - for left, right in zip(result._mgr.arrays, df_array._mgr.arrays) - ) - - result = df_array._as_manager("array") - assert isinstance(result._mgr, ArrayManager) - result = df_array._as_manager("block") - assert isinstance(result._mgr, BlockManager) - tm.assert_frame_equal(result, df_array) - 
assert len(result._mgr.blocks) == 2 - - -def test_series_creation(): - msg = "data_manager option is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - with pd.option_context("mode.data_manager", "block"): - s_block = pd.Series([1, 2, 3], name="A", index=["a", "b", "c"]) - assert isinstance(s_block._mgr, SingleBlockManager) - - with tm.assert_produces_warning(FutureWarning, match=msg): - with pd.option_context("mode.data_manager", "array"): - s_array = pd.Series([1, 2, 3], name="A", index=["a", "b", "c"]) - assert isinstance(s_array._mgr, SingleArrayManager) - - # also ensure both are seen as equal - tm.assert_series_equal(s_block, s_array) - - # conversion from one manager to the other - result = s_block._as_manager("block") - assert isinstance(result._mgr, SingleBlockManager) - result = s_block._as_manager("array") - assert isinstance(result._mgr, SingleArrayManager) - tm.assert_series_equal(result, s_block) - - result = s_array._as_manager("array") - assert isinstance(result._mgr, SingleArrayManager) - result = s_array._as_manager("block") - assert isinstance(result._mgr, SingleBlockManager) - tm.assert_series_equal(result, s_array) - - -@pytest.mark.single_cpu -@pytest.mark.parametrize("manager", ["block", "array"]) -def test_array_manager_depr_env_var(manager): - # GH#55043 - test_env = os.environ.copy() - test_env["PANDAS_DATA_MANAGER"] = manager - response = subprocess.run( - [sys.executable, "-c", "import pandas"], - capture_output=True, - env=test_env, - check=True, - ) - msg = "FutureWarning: The env variable PANDAS_DATA_MANAGER is set" - stderr_msg = response.stderr.decode("utf-8") - assert msg in stderr_msg, stderr_msg diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index ab6cacc4cc860..a5ddda9d66e7a 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -67,14 +67,13 @@ def s3_base(worker_id, monkeypatch): monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "foobar_secret") if is_ci_environment(): if is_platform_arm() or is_platform_mac() or is_platform_windows(): - # NOT RUN on Windows/macOS/ARM, only Ubuntu + # NOT RUN on Windows/macOS, only Ubuntu # - subprocess in CI can cause timeouts # - GitHub Actions do not support # container services for the above OSs - # - CircleCI will probably hit the Docker rate pull limit pytest.skip( - "S3 tests do not have a corresponding service in " - "Windows, macOS or ARM platforms" + "S3 tests do not have a corresponding service on " + "Windows or macOS platforms" ) else: # set in .github/workflows/unit-tests.yml @@ -224,19 +223,3 @@ def compression_format(request): @pytest.fixture(params=_compression_formats_params) def compression_ext(request): return request.param[0] - - -@pytest.fixture( - params=[ - "python", - pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")), - ] -) -def string_storage(request): - """ - Parametrized fixture for pd.options.mode.string_storage. 
- - * 'python' - * 'pyarrow' - """ - return request.param diff --git a/pandas/tests/io/data/excel/test_none_type.xlsx b/pandas/tests/io/data/excel/test_none_type.xlsx new file mode 100644 index 0000000000000..38aaf72ddfc8f Binary files /dev/null and b/pandas/tests/io/data/excel/test_none_type.xlsx differ diff --git a/pandas/tests/io/data/legacy_hdf/datetimetz_object.h5 b/pandas/tests/io/data/legacy_hdf/datetimetz_object.h5 deleted file mode 100644 index 8cb4eda470398..0000000000000 Binary files a/pandas/tests/io/data/legacy_hdf/datetimetz_object.h5 and /dev/null differ diff --git a/pandas/tests/io/data/legacy_hdf/gh26443.h5 b/pandas/tests/io/data/legacy_hdf/gh26443.h5 deleted file mode 100644 index 45aa64324530f..0000000000000 Binary files a/pandas/tests/io/data/legacy_hdf/gh26443.h5 and /dev/null differ diff --git a/pandas/tests/io/data/legacy_hdf/legacy_table_fixed_datetime_py2.h5 b/pandas/tests/io/data/legacy_hdf/legacy_table_fixed_datetime_py2.h5 deleted file mode 100644 index 18cfae15a3a78..0000000000000 Binary files a/pandas/tests/io/data/legacy_hdf/legacy_table_fixed_datetime_py2.h5 and /dev/null differ diff --git a/pandas/tests/io/data/legacy_hdf/legacy_table_fixed_py2.h5 b/pandas/tests/io/data/legacy_hdf/legacy_table_fixed_py2.h5 deleted file mode 100644 index 540251d9fae86..0000000000000 Binary files a/pandas/tests/io/data/legacy_hdf/legacy_table_fixed_py2.h5 and /dev/null differ diff --git a/pandas/tests/io/data/legacy_hdf/legacy_table_py2.h5 b/pandas/tests/io/data/legacy_hdf/legacy_table_py2.h5 deleted file mode 100644 index 3863d714a315b..0000000000000 Binary files a/pandas/tests/io/data/legacy_hdf/legacy_table_py2.h5 and /dev/null differ diff --git a/pandas/tests/io/data/legacy_hdf/periodindex_0.20.1_x86_64_darwin_2.7.13.h5 b/pandas/tests/io/data/legacy_hdf/periodindex_0.20.1_x86_64_darwin_2.7.13.h5 deleted file mode 100644 index 6fb92d3c564bd..0000000000000 Binary files a/pandas/tests/io/data/legacy_hdf/periodindex_0.20.1_x86_64_darwin_2.7.13.h5 and /dev/null differ diff --git a/pandas/tests/io/data/legacy_msgpack/0.20.3/0.20.3_x86_64_darwin_3.5.2.msgpack b/pandas/tests/io/data/legacy_msgpack/0.20.3/0.20.3_x86_64_darwin_3.5.2.msgpack deleted file mode 100644 index 7a546a82ae766..0000000000000 Binary files a/pandas/tests/io/data/legacy_msgpack/0.20.3/0.20.3_x86_64_darwin_3.5.2.msgpack and /dev/null differ diff --git a/pandas/tests/io/data/pickle/test_mi_py27.pkl b/pandas/tests/io/data/pickle/test_mi_py27.pkl deleted file mode 100644 index 89021dd828108..0000000000000 Binary files a/pandas/tests/io/data/pickle/test_mi_py27.pkl and /dev/null differ diff --git a/pandas/tests/io/data/pickle/test_py27.pkl b/pandas/tests/io/data/pickle/test_py27.pkl deleted file mode 100644 index 5308b864bc0c7..0000000000000 Binary files a/pandas/tests/io/data/pickle/test_py27.pkl and /dev/null differ diff --git a/pandas/tests/io/data/stata/stata-compat-102.dta b/pandas/tests/io/data/stata/stata-compat-102.dta new file mode 100644 index 0000000000000..424b767b0011c Binary files /dev/null and b/pandas/tests/io/data/stata/stata-compat-102.dta differ diff --git a/pandas/tests/io/data/stata/stata-compat-103.dta b/pandas/tests/io/data/stata/stata-compat-103.dta new file mode 100644 index 0000000000000..adfeb6c672333 Binary files /dev/null and b/pandas/tests/io/data/stata/stata-compat-103.dta differ diff --git a/pandas/tests/io/data/stata/stata-compat-104.dta b/pandas/tests/io/data/stata/stata-compat-104.dta new file mode 100644 index 0000000000000..9bc3659afd31c Binary files /dev/null and 
b/pandas/tests/io/data/stata/stata-compat-104.dta differ diff --git a/pandas/tests/io/data/stata/stata-compat-110.dta b/pandas/tests/io/data/stata/stata-compat-110.dta new file mode 100644 index 0000000000000..68e591aba829a Binary files /dev/null and b/pandas/tests/io/data/stata/stata-compat-110.dta differ diff --git a/pandas/tests/io/data/stata/stata-compat-be-103.dta b/pandas/tests/io/data/stata/stata-compat-be-103.dta new file mode 100644 index 0000000000000..0e2ef231f91c0 Binary files /dev/null and b/pandas/tests/io/data/stata/stata-compat-be-103.dta differ diff --git a/pandas/tests/io/data/stata/stata-compat-be-104.dta b/pandas/tests/io/data/stata/stata-compat-be-104.dta new file mode 100644 index 0000000000000..98185d8ce27dc Binary files /dev/null and b/pandas/tests/io/data/stata/stata-compat-be-104.dta differ diff --git a/pandas/tests/io/data/stata/stata-compat-be-105.dta b/pandas/tests/io/data/stata/stata-compat-be-105.dta new file mode 100644 index 0000000000000..af75548c840d4 Binary files /dev/null and b/pandas/tests/io/data/stata/stata-compat-be-105.dta differ diff --git a/pandas/tests/io/data/stata/stata-compat-be-108.dta b/pandas/tests/io/data/stata/stata-compat-be-108.dta new file mode 100644 index 0000000000000..e3e5d85fca4ad Binary files /dev/null and b/pandas/tests/io/data/stata/stata-compat-be-108.dta differ diff --git a/pandas/tests/io/data/stata/stata-compat-be-110.dta b/pandas/tests/io/data/stata/stata-compat-be-110.dta new file mode 100644 index 0000000000000..0936be478028c Binary files /dev/null and b/pandas/tests/io/data/stata/stata-compat-be-110.dta differ diff --git a/pandas/tests/io/data/stata/stata-compat-be-111.dta b/pandas/tests/io/data/stata/stata-compat-be-111.dta new file mode 100644 index 0000000000000..197decdcf0c2d Binary files /dev/null and b/pandas/tests/io/data/stata/stata-compat-be-111.dta differ diff --git a/pandas/tests/io/data/stata/stata-compat-be-113.dta b/pandas/tests/io/data/stata/stata-compat-be-113.dta new file mode 100644 index 0000000000000..c69c32106114f Binary files /dev/null and b/pandas/tests/io/data/stata/stata-compat-be-113.dta differ diff --git a/pandas/tests/io/data/stata/stata-compat-be-114.dta b/pandas/tests/io/data/stata/stata-compat-be-114.dta new file mode 100644 index 0000000000000..222bdb2b62784 Binary files /dev/null and b/pandas/tests/io/data/stata/stata-compat-be-114.dta differ diff --git a/pandas/tests/io/data/stata/stata-compat-be-118.dta b/pandas/tests/io/data/stata/stata-compat-be-118.dta new file mode 100644 index 0000000000000..0a5df1b321c2d Binary files /dev/null and b/pandas/tests/io/data/stata/stata-compat-be-118.dta differ diff --git a/pandas/tests/io/data/stata/stata10_115.dta b/pandas/tests/io/data/stata/stata10_115.dta index b917dde5ad47d..bdca3b9b340c1 100644 Binary files a/pandas/tests/io/data/stata/stata10_115.dta and b/pandas/tests/io/data/stata/stata10_115.dta differ diff --git a/pandas/tests/io/data/stata/stata12_118.dta b/pandas/tests/io/data/stata/stata12_118.dta new file mode 100644 index 0000000000000..87c6d1f063150 Binary files /dev/null and b/pandas/tests/io/data/stata/stata12_118.dta differ diff --git a/pandas/tests/io/data/stata/stata12_119.dta b/pandas/tests/io/data/stata/stata12_119.dta new file mode 100644 index 0000000000000..fa63f0135738e Binary files /dev/null and b/pandas/tests/io/data/stata/stata12_119.dta differ diff --git a/pandas/tests/io/data/stata/stata12_be_117.dta b/pandas/tests/io/data/stata/stata12_be_117.dta new file mode 100644 index 0000000000000..7f84d15fb76d0 Binary files 
/dev/null and b/pandas/tests/io/data/stata/stata12_be_117.dta differ diff --git a/pandas/tests/io/data/stata/stata12_be_118.dta b/pandas/tests/io/data/stata/stata12_be_118.dta new file mode 100644 index 0000000000000..9ed6f39b0f9b5 Binary files /dev/null and b/pandas/tests/io/data/stata/stata12_be_118.dta differ diff --git a/pandas/tests/io/data/stata/stata12_be_119.dta b/pandas/tests/io/data/stata/stata12_be_119.dta new file mode 100644 index 0000000000000..3c9736d0f3af3 Binary files /dev/null and b/pandas/tests/io/data/stata/stata12_be_119.dta differ diff --git a/pandas/tests/io/data/stata/stata14_119.dta b/pandas/tests/io/data/stata/stata14_119.dta new file mode 100644 index 0000000000000..e64353213b1c9 Binary files /dev/null and b/pandas/tests/io/data/stata/stata14_119.dta differ diff --git a/pandas/tests/io/data/stata/stata14_be_118.dta b/pandas/tests/io/data/stata/stata14_be_118.dta new file mode 100644 index 0000000000000..584ec0984c49e Binary files /dev/null and b/pandas/tests/io/data/stata/stata14_be_118.dta differ diff --git a/pandas/tests/io/data/stata/stata14_be_119.dta b/pandas/tests/io/data/stata/stata14_be_119.dta new file mode 100644 index 0000000000000..09d08f7e992ea Binary files /dev/null and b/pandas/tests/io/data/stata/stata14_be_119.dta differ diff --git a/pandas/tests/io/data/stata/stata16_119.dta b/pandas/tests/io/data/stata/stata16_119.dta new file mode 100644 index 0000000000000..d03c489d4342d Binary files /dev/null and b/pandas/tests/io/data/stata/stata16_119.dta differ diff --git a/pandas/tests/io/data/stata/stata16_be_118.dta b/pandas/tests/io/data/stata/stata16_be_118.dta new file mode 100644 index 0000000000000..bae769c038820 Binary files /dev/null and b/pandas/tests/io/data/stata/stata16_be_118.dta differ diff --git a/pandas/tests/io/data/stata/stata16_be_119.dta b/pandas/tests/io/data/stata/stata16_be_119.dta new file mode 100644 index 0000000000000..e928a9713715d Binary files /dev/null and b/pandas/tests/io/data/stata/stata16_be_119.dta differ diff --git a/pandas/tests/io/data/stata/stata1_102.dta b/pandas/tests/io/data/stata/stata1_102.dta new file mode 100644 index 0000000000000..d0ca1b2a8c02d Binary files /dev/null and b/pandas/tests/io/data/stata/stata1_102.dta differ diff --git a/pandas/tests/io/data/stata/stata1_103.dta b/pandas/tests/io/data/stata/stata1_103.dta new file mode 100644 index 0000000000000..98072ba6bd4fc Binary files /dev/null and b/pandas/tests/io/data/stata/stata1_103.dta differ diff --git a/pandas/tests/io/data/stata/stata1_104.dta b/pandas/tests/io/data/stata/stata1_104.dta new file mode 100644 index 0000000000000..a46aeb9128ecf Binary files /dev/null and b/pandas/tests/io/data/stata/stata1_104.dta differ diff --git a/pandas/tests/io/data/stata/stata1_105.dta b/pandas/tests/io/data/stata/stata1_105.dta new file mode 100644 index 0000000000000..ba2c463486dbf Binary files /dev/null and b/pandas/tests/io/data/stata/stata1_105.dta differ diff --git a/pandas/tests/io/data/stata/stata1_108.dta b/pandas/tests/io/data/stata/stata1_108.dta new file mode 100644 index 0000000000000..6c948b4490589 Binary files /dev/null and b/pandas/tests/io/data/stata/stata1_108.dta differ diff --git a/pandas/tests/io/data/stata/stata1_110.dta b/pandas/tests/io/data/stata/stata1_110.dta new file mode 100644 index 0000000000000..c9e2ca72dbd4e Binary files /dev/null and b/pandas/tests/io/data/stata/stata1_110.dta differ diff --git a/pandas/tests/io/data/stata/stata1_111.dta b/pandas/tests/io/data/stata/stata1_111.dta new file mode 100644 index 
0000000000000..21370d3027458 Binary files /dev/null and b/pandas/tests/io/data/stata/stata1_111.dta differ diff --git a/pandas/tests/io/data/stata/stata1_113.dta b/pandas/tests/io/data/stata/stata1_113.dta new file mode 100644 index 0000000000000..6fcf55f0406e9 Binary files /dev/null and b/pandas/tests/io/data/stata/stata1_113.dta differ diff --git a/pandas/tests/io/data/stata/stata1_115.dta b/pandas/tests/io/data/stata/stata1_115.dta new file mode 100644 index 0000000000000..2e5258da49c3c Binary files /dev/null and b/pandas/tests/io/data/stata/stata1_115.dta differ diff --git a/pandas/tests/io/data/stata/stata1_118.dta b/pandas/tests/io/data/stata/stata1_118.dta new file mode 100644 index 0000000000000..26d7beccb745c Binary files /dev/null and b/pandas/tests/io/data/stata/stata1_118.dta differ diff --git a/pandas/tests/io/data/stata/stata1_119.dta b/pandas/tests/io/data/stata/stata1_119.dta new file mode 100644 index 0000000000000..284daa78bf6db Binary files /dev/null and b/pandas/tests/io/data/stata/stata1_119.dta differ diff --git a/pandas/tests/io/data/stata/stata4_102.dta b/pandas/tests/io/data/stata/stata4_102.dta new file mode 100644 index 0000000000000..669fe06c2b492 Binary files /dev/null and b/pandas/tests/io/data/stata/stata4_102.dta differ diff --git a/pandas/tests/io/data/stata/stata4_103.dta b/pandas/tests/io/data/stata/stata4_103.dta new file mode 100644 index 0000000000000..3c63935e63df9 Binary files /dev/null and b/pandas/tests/io/data/stata/stata4_103.dta differ diff --git a/pandas/tests/io/data/stata/stata4_104.dta b/pandas/tests/io/data/stata/stata4_104.dta new file mode 100644 index 0000000000000..c2517355ebff1 Binary files /dev/null and b/pandas/tests/io/data/stata/stata4_104.dta differ diff --git a/pandas/tests/io/data/stata/stata4_105.dta b/pandas/tests/io/data/stata/stata4_105.dta new file mode 100644 index 0000000000000..f804c315b344b Binary files /dev/null and b/pandas/tests/io/data/stata/stata4_105.dta differ diff --git a/pandas/tests/io/data/stata/stata4_108.dta b/pandas/tests/io/data/stata/stata4_108.dta new file mode 100644 index 0000000000000..e78c24b319e47 Binary files /dev/null and b/pandas/tests/io/data/stata/stata4_108.dta differ diff --git a/pandas/tests/io/data/stata/stata4_110.dta b/pandas/tests/io/data/stata/stata4_110.dta new file mode 100644 index 0000000000000..3ea01040448b0 Binary files /dev/null and b/pandas/tests/io/data/stata/stata4_110.dta differ diff --git a/pandas/tests/io/data/stata/stata4_111.dta b/pandas/tests/io/data/stata/stata4_111.dta new file mode 100644 index 0000000000000..b69034174fcfe Binary files /dev/null and b/pandas/tests/io/data/stata/stata4_111.dta differ diff --git a/pandas/tests/io/data/stata/stata4_114.dta b/pandas/tests/io/data/stata/stata4_114.dta index c5d7de8b42295..f58cdb215332e 100644 Binary files a/pandas/tests/io/data/stata/stata4_114.dta and b/pandas/tests/io/data/stata/stata4_114.dta differ diff --git a/pandas/tests/io/data/stata/stata8_102.dta b/pandas/tests/io/data/stata/stata8_102.dta new file mode 100644 index 0000000000000..5d3a4fb171e9c Binary files /dev/null and b/pandas/tests/io/data/stata/stata8_102.dta differ diff --git a/pandas/tests/io/data/stata/stata8_103.dta b/pandas/tests/io/data/stata/stata8_103.dta new file mode 100644 index 0000000000000..623a21e37650f Binary files /dev/null and b/pandas/tests/io/data/stata/stata8_103.dta differ diff --git a/pandas/tests/io/data/stata/stata8_104.dta b/pandas/tests/io/data/stata/stata8_104.dta new file mode 100644 index 0000000000000..df79d6a8af230 Binary 
files /dev/null and b/pandas/tests/io/data/stata/stata8_104.dta differ diff --git a/pandas/tests/io/data/stata/stata8_105.dta b/pandas/tests/io/data/stata/stata8_105.dta new file mode 100644 index 0000000000000..cf01463a83d81 Binary files /dev/null and b/pandas/tests/io/data/stata/stata8_105.dta differ diff --git a/pandas/tests/io/data/stata/stata8_108.dta b/pandas/tests/io/data/stata/stata8_108.dta new file mode 100644 index 0000000000000..962f7f4331fb3 Binary files /dev/null and b/pandas/tests/io/data/stata/stata8_108.dta differ diff --git a/pandas/tests/io/data/stata/stata8_110.dta b/pandas/tests/io/data/stata/stata8_110.dta new file mode 100644 index 0000000000000..a7fe9a3b7e639 Binary files /dev/null and b/pandas/tests/io/data/stata/stata8_110.dta differ diff --git a/pandas/tests/io/data/stata/stata8_111.dta b/pandas/tests/io/data/stata/stata8_111.dta new file mode 100644 index 0000000000000..cb96ac0e0f5d3 Binary files /dev/null and b/pandas/tests/io/data/stata/stata8_111.dta differ diff --git a/pandas/tests/io/data/stata/stata9_115.dta b/pandas/tests/io/data/stata/stata9_115.dta index 5ad6cd6a2c8ff..1b5c0042bebbe 100644 Binary files a/pandas/tests/io/data/stata/stata9_115.dta and b/pandas/tests/io/data/stata/stata9_115.dta differ diff --git a/pandas/tests/io/data/stata/stata_int_validranges_102.dta b/pandas/tests/io/data/stata/stata_int_validranges_102.dta new file mode 100644 index 0000000000000..69de2e2f7f91d Binary files /dev/null and b/pandas/tests/io/data/stata/stata_int_validranges_102.dta differ diff --git a/pandas/tests/io/data/stata/stata_int_validranges_103.dta b/pandas/tests/io/data/stata/stata_int_validranges_103.dta new file mode 100644 index 0000000000000..71f03873808e2 Binary files /dev/null and b/pandas/tests/io/data/stata/stata_int_validranges_103.dta differ diff --git a/pandas/tests/io/data/stata/stata_int_validranges_104.dta b/pandas/tests/io/data/stata/stata_int_validranges_104.dta new file mode 100644 index 0000000000000..f6dff2a6b42d9 Binary files /dev/null and b/pandas/tests/io/data/stata/stata_int_validranges_104.dta differ diff --git a/pandas/tests/io/data/stata/stata_int_validranges_105.dta b/pandas/tests/io/data/stata/stata_int_validranges_105.dta new file mode 100644 index 0000000000000..d0a7ad0f01d16 Binary files /dev/null and b/pandas/tests/io/data/stata/stata_int_validranges_105.dta differ diff --git a/pandas/tests/io/data/stata/stata_int_validranges_108.dta b/pandas/tests/io/data/stata/stata_int_validranges_108.dta new file mode 100644 index 0000000000000..47b715bce21ef Binary files /dev/null and b/pandas/tests/io/data/stata/stata_int_validranges_108.dta differ diff --git a/pandas/tests/io/data/stata/stata_int_validranges_110.dta b/pandas/tests/io/data/stata/stata_int_validranges_110.dta new file mode 100644 index 0000000000000..2fe5dee018f4e Binary files /dev/null and b/pandas/tests/io/data/stata/stata_int_validranges_110.dta differ diff --git a/pandas/tests/io/data/stata/stata_int_validranges_111.dta b/pandas/tests/io/data/stata/stata_int_validranges_111.dta new file mode 100644 index 0000000000000..07052d824f132 Binary files /dev/null and b/pandas/tests/io/data/stata/stata_int_validranges_111.dta differ diff --git a/pandas/tests/io/data/stata/stata_int_validranges_113.dta b/pandas/tests/io/data/stata/stata_int_validranges_113.dta new file mode 100644 index 0000000000000..4060c1c88ea12 Binary files /dev/null and b/pandas/tests/io/data/stata/stata_int_validranges_113.dta differ diff --git a/pandas/tests/io/data/stata/stata_int_validranges_114.dta 
b/pandas/tests/io/data/stata/stata_int_validranges_114.dta new file mode 100644 index 0000000000000..71c22366e9b1a Binary files /dev/null and b/pandas/tests/io/data/stata/stata_int_validranges_114.dta differ diff --git a/pandas/tests/io/data/stata/stata_int_validranges_115.dta b/pandas/tests/io/data/stata/stata_int_validranges_115.dta new file mode 100644 index 0000000000000..80e1dc8670b38 Binary files /dev/null and b/pandas/tests/io/data/stata/stata_int_validranges_115.dta differ diff --git a/pandas/tests/io/data/stata/stata_int_validranges_117.dta b/pandas/tests/io/data/stata/stata_int_validranges_117.dta new file mode 100644 index 0000000000000..c220037941f4f Binary files /dev/null and b/pandas/tests/io/data/stata/stata_int_validranges_117.dta differ diff --git a/pandas/tests/io/data/stata/stata_int_validranges_118.dta b/pandas/tests/io/data/stata/stata_int_validranges_118.dta new file mode 100644 index 0000000000000..4bbd823bff63e Binary files /dev/null and b/pandas/tests/io/data/stata/stata_int_validranges_118.dta differ diff --git a/pandas/tests/io/data/stata/stata_int_validranges_119.dta b/pandas/tests/io/data/stata/stata_int_validranges_119.dta new file mode 100644 index 0000000000000..6bd9bbde1d22d Binary files /dev/null and b/pandas/tests/io/data/stata/stata_int_validranges_119.dta differ diff --git a/pandas/tests/io/data/tar/test-csv.tar b/pandas/tests/io/data/tar/test-csv.tar new file mode 100644 index 0000000000000..c3b3091348426 Binary files /dev/null and b/pandas/tests/io/data/tar/test-csv.tar differ diff --git a/pandas/tests/io/excel/test_odswriter.py b/pandas/tests/io/excel/test_odswriter.py index 271353a173d2a..7843bb59f97cf 100644 --- a/pandas/tests/io/excel/test_odswriter.py +++ b/pandas/tests/io/excel/test_odswriter.py @@ -3,11 +3,11 @@ datetime, ) import re +import uuid import pytest import pandas as pd -import pandas._testing as tm from pandas.io.excel import ExcelWriter @@ -19,42 +19,46 @@ def ext(): return ".ods" -def test_write_append_mode_raises(ext): +@pytest.fixture +def tmp_excel(ext, tmp_path): + tmp = tmp_path / f"{uuid.uuid4()}{ext}" + tmp.touch() + return str(tmp) + + +def test_write_append_mode_raises(tmp_excel): msg = "Append mode is not supported with odf!" 
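
Note: the hunks in this file, and in the other Excel test modules further down, all make the same mechanical change: the `tm.ensure_clean(ext)` context manager is replaced by a `tmp_excel` fixture built on pytest's `tmp_path`, so each test gets an isolated file that pytest deletes afterwards, and the `with` nesting around the file path can be flattened. A minimal sketch of the pattern, lifted from these hunks but shown standalone — the hard-coded ".xlsx" extension and the `test_roundtrip` name are illustrative (the real fixtures derive the extension from the parametrized `ext`/`read_ext` fixtures), and the round trip assumes openpyxl is installed:

    import uuid

    import pytest

    import pandas as pd
    from pandas.testing import assert_frame_equal


    @pytest.fixture
    def tmp_excel(tmp_path):
        # tmp_path is a per-test directory that pytest removes afterwards,
        # so the explicit cleanup tm.ensure_clean() performed is unnecessary.
        # uuid4() keeps names unique if a test ever needs a second file.
        tmp = tmp_path / f"{uuid.uuid4()}.xlsx"
        tmp.touch()
        return str(tmp)


    def test_roundtrip(tmp_excel):
        # The fixture hands the test a plain path string, so no `with` block
        # wraps the write/read code the way tm.ensure_clean() did.
        df = pd.DataFrame({"a": [1, 2, 3]})
        df.to_excel(tmp_excel, index=False)
        assert_frame_equal(pd.read_excel(tmp_excel), df)

Flattening those `with` blocks is what produces the large but behavior-neutral re-indentation in the hunks that follow.
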
- with tm.ensure_clean(ext) as f: - with pytest.raises(ValueError, match=msg): - ExcelWriter(f, engine="odf", mode="a") + with pytest.raises(ValueError, match=msg): + ExcelWriter(tmp_excel, engine="odf", mode="a") @pytest.mark.parametrize("engine_kwargs", [None, {"kwarg": 1}]) -def test_engine_kwargs(ext, engine_kwargs): +def test_engine_kwargs(tmp_excel, engine_kwargs): # GH 42286 # GH 43445 # test for error: OpenDocumentSpreadsheet does not accept any arguments - with tm.ensure_clean(ext) as f: - if engine_kwargs is not None: - error = re.escape( - "OpenDocumentSpreadsheet() got an unexpected keyword argument 'kwarg'" - ) - with pytest.raises( - TypeError, - match=error, - ): - ExcelWriter(f, engine="odf", engine_kwargs=engine_kwargs) - else: - with ExcelWriter(f, engine="odf", engine_kwargs=engine_kwargs) as _: - pass - - -def test_book_and_sheets_consistent(ext): + if engine_kwargs is not None: + error = re.escape( + "OpenDocumentSpreadsheet() got an unexpected keyword argument 'kwarg'" + ) + with pytest.raises( + TypeError, + match=error, + ): + ExcelWriter(tmp_excel, engine="odf", engine_kwargs=engine_kwargs) + else: + with ExcelWriter(tmp_excel, engine="odf", engine_kwargs=engine_kwargs) as _: + pass + + +def test_book_and_sheets_consistent(tmp_excel): # GH#45687 - Ensure sheets is updated if user modifies book - with tm.ensure_clean(ext) as f: - with ExcelWriter(f) as writer: - assert writer.sheets == {} - table = odf.table.Table(name="test_name") - writer.book.spreadsheet.addElement(table) - assert writer.sheets == {"test_name": table} + with ExcelWriter(tmp_excel) as writer: + assert writer.sheets == {} + table = odf.table.Table(name="test_name") + writer.book.spreadsheet.addElement(table) + assert writer.sheets == {"test_name": table} @pytest.mark.parametrize( @@ -73,7 +77,9 @@ def test_book_and_sheets_consistent(ext): (date(2010, 10, 10), "date", "date-value", "2010-10-10"), ], ) -def test_cell_value_type(ext, value, cell_value_type, cell_value_attribute, cell_value): +def test_cell_value_type( + tmp_excel, value, cell_value_type, cell_value_attribute, cell_value +): # GH#54994 ODS: cell attributes should follow specification # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#refTable13 from odf.namespaces import OFFICENS @@ -84,18 +90,17 @@ def test_cell_value_type(ext, value, cell_value_type, cell_value_attribute, cell table_cell_name = TableCell().qname - with tm.ensure_clean(ext) as f: - pd.DataFrame([[value]]).to_excel(f, header=False, index=False) - - with pd.ExcelFile(f) as wb: - sheet = wb._reader.get_sheet_by_index(0) - sheet_rows = sheet.getElementsByType(TableRow) - sheet_cells = [ - x - for x in sheet_rows[0].childNodes - if hasattr(x, "qname") and x.qname == table_cell_name - ] - - cell = sheet_cells[0] - assert cell.attributes.get((OFFICENS, "value-type")) == cell_value_type - assert cell.attributes.get((OFFICENS, cell_value_attribute)) == cell_value + pd.DataFrame([[value]]).to_excel(tmp_excel, header=False, index=False) + + with pd.ExcelFile(tmp_excel) as wb: + sheet = wb._reader.get_sheet_by_index(0) + sheet_rows = sheet.getElementsByType(TableRow) + sheet_cells = [ + x + for x in sheet_rows[0].childNodes + if hasattr(x, "qname") and x.qname == table_cell_name + ] + + cell = sheet_cells[0] + assert cell.attributes.get((OFFICENS, "value-type")) == cell_value_type + assert cell.attributes.get((OFFICENS, cell_value_attribute)) == cell_value diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index 
2df9ec9e53516..5b4bbb9e686d3 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -1,6 +1,7 @@ import contextlib from pathlib import Path import re +import uuid import numpy as np import pytest @@ -23,6 +24,13 @@ def ext(): return ".xlsx" +@pytest.fixture +def tmp_excel(ext, tmp_path): + tmp = tmp_path / f"{uuid.uuid4()}{ext}" + tmp.touch() + return str(tmp) + + def test_to_excel_styleconverter(): from openpyxl import styles @@ -56,7 +64,7 @@ def test_to_excel_styleconverter(): assert kw["protection"] == protection -def test_write_cells_merge_styled(ext): +def test_write_cells_merge_styled(tmp_excel): from pandas.io.formats.excel import ExcelCell sheet_name = "merge_styled" @@ -78,72 +86,73 @@ def test_write_cells_merge_styled(ext): ) ] - with tm.ensure_clean(ext) as path: - with _OpenpyxlWriter(path) as writer: - writer._write_cells(initial_cells, sheet_name=sheet_name) - writer._write_cells(merge_cells, sheet_name=sheet_name) + with _OpenpyxlWriter(tmp_excel) as writer: + writer._write_cells(initial_cells, sheet_name=sheet_name) + writer._write_cells(merge_cells, sheet_name=sheet_name) - wks = writer.sheets[sheet_name] - xcell_b1 = wks["B1"] - xcell_a2 = wks["A2"] - assert xcell_b1.font == openpyxl_sty_merged - assert xcell_a2.font == openpyxl_sty_merged + wks = writer.sheets[sheet_name] + xcell_b1 = wks["B1"] + xcell_a2 = wks["A2"] + assert xcell_b1.font == openpyxl_sty_merged + assert xcell_a2.font == openpyxl_sty_merged @pytest.mark.parametrize("iso_dates", [True, False]) -def test_engine_kwargs_write(ext, iso_dates): +def test_engine_kwargs_write(tmp_excel, iso_dates): # GH 42286 GH 43445 engine_kwargs = {"iso_dates": iso_dates} - with tm.ensure_clean(ext) as f: - with ExcelWriter(f, engine="openpyxl", engine_kwargs=engine_kwargs) as writer: - assert writer.book.iso_dates == iso_dates - # ExcelWriter won't allow us to close without writing something - DataFrame().to_excel(writer) + with ExcelWriter( + tmp_excel, engine="openpyxl", engine_kwargs=engine_kwargs + ) as writer: + assert writer.book.iso_dates == iso_dates + # ExcelWriter won't allow us to close without writing something + DataFrame().to_excel(writer) -def test_engine_kwargs_append_invalid(ext): +def test_engine_kwargs_append_invalid(tmp_excel): # GH 43445 # test whether an invalid engine kwargs actually raises - with tm.ensure_clean(ext) as f: - DataFrame(["hello", "world"]).to_excel(f) - with pytest.raises( - TypeError, - match=re.escape( - "load_workbook() got an unexpected keyword argument 'apple_banana'" - ), - ): - with ExcelWriter( - f, engine="openpyxl", mode="a", engine_kwargs={"apple_banana": "fruit"} - ) as writer: - # ExcelWriter needs us to write something to close properly - DataFrame(["good"]).to_excel(writer, sheet_name="Sheet2") + DataFrame(["hello", "world"]).to_excel(tmp_excel) + with pytest.raises( + TypeError, + match=re.escape( + "load_workbook() got an unexpected keyword argument 'apple_banana'" + ), + ): + with ExcelWriter( + tmp_excel, + engine="openpyxl", + mode="a", + engine_kwargs={"apple_banana": "fruit"}, + ) as writer: + # ExcelWriter needs us to write something to close properly + DataFrame(["good"]).to_excel(writer, sheet_name="Sheet2") @pytest.mark.parametrize("data_only, expected", [(True, 0), (False, "=1+1")]) -def test_engine_kwargs_append_data_only(ext, data_only, expected): +def test_engine_kwargs_append_data_only(tmp_excel, data_only, expected): # GH 43445 # tests whether the data_only engine_kwarg actually works well for # openpyxl's 
load_workbook - with tm.ensure_clean(ext) as f: - DataFrame(["=1+1"]).to_excel(f) - with ExcelWriter( - f, engine="openpyxl", mode="a", engine_kwargs={"data_only": data_only} - ) as writer: - assert writer.sheets["Sheet1"]["B2"].value == expected - # ExcelWriter needs us to writer something to close properly? - DataFrame().to_excel(writer, sheet_name="Sheet2") - - # ensure that data_only also works for reading - # and that formulas/values roundtrip - assert ( - pd.read_excel( - f, - sheet_name="Sheet1", - engine="openpyxl", - engine_kwargs={"data_only": data_only}, - ).iloc[0, 1] - == expected - ) + DataFrame(["=1+1"]).to_excel(tmp_excel) + with ExcelWriter( + tmp_excel, engine="openpyxl", mode="a", engine_kwargs={"data_only": data_only} + ) as writer: + assert writer.sheets["Sheet1"]["B2"].value == expected + # ExcelWriter needs us to writer something to close properly? + DataFrame().to_excel(writer, sheet_name="Sheet2") + + # ensure that data_only also works for reading + # and that formulas/values roundtrip + assert ( + pd.read_excel( + tmp_excel, + sheet_name="Sheet1", + engine="openpyxl", + engine_kwargs={"data_only": data_only}, + ).iloc[0, 1] + == expected + ) @pytest.mark.parametrize("kwarg_name", ["read_only", "data_only"]) @@ -162,26 +171,25 @@ def test_engine_kwargs_append_reader(datapath, ext, kwarg_name, kwarg_value): @pytest.mark.parametrize( "mode,expected", [("w", ["baz"]), ("a", ["foo", "bar", "baz"])] ) -def test_write_append_mode(ext, mode, expected): +def test_write_append_mode(tmp_excel, mode, expected): df = DataFrame([1], columns=["baz"]) - with tm.ensure_clean(ext) as f: - wb = openpyxl.Workbook() - wb.worksheets[0].title = "foo" - wb.worksheets[0]["A1"].value = "foo" - wb.create_sheet("bar") - wb.worksheets[1]["A1"].value = "bar" - wb.save(f) + wb = openpyxl.Workbook() + wb.worksheets[0].title = "foo" + wb.worksheets[0]["A1"].value = "foo" + wb.create_sheet("bar") + wb.worksheets[1]["A1"].value = "bar" + wb.save(tmp_excel) - with ExcelWriter(f, engine="openpyxl", mode=mode) as writer: - df.to_excel(writer, sheet_name="baz", index=False) + with ExcelWriter(tmp_excel, engine="openpyxl", mode=mode) as writer: + df.to_excel(writer, sheet_name="baz", index=False) - with contextlib.closing(openpyxl.load_workbook(f)) as wb2: - result = [sheet.title for sheet in wb2.worksheets] - assert result == expected + with contextlib.closing(openpyxl.load_workbook(tmp_excel)) as wb2: + result = [sheet.title for sheet in wb2.worksheets] + assert result == expected - for index, cell_value in enumerate(expected): - assert wb2.worksheets[index]["A1"].value == cell_value + for index, cell_value in enumerate(expected): + assert wb2.worksheets[index]["A1"].value == cell_value @pytest.mark.parametrize( @@ -192,26 +200,25 @@ def test_write_append_mode(ext, mode, expected): ("overlay", 1, ["pear", "banana"]), ], ) -def test_if_sheet_exists_append_modes(ext, if_sheet_exists, num_sheets, expected): +def test_if_sheet_exists_append_modes(tmp_excel, if_sheet_exists, num_sheets, expected): # GH 40230 df1 = DataFrame({"fruit": ["apple", "banana"]}) df2 = DataFrame({"fruit": ["pear"]}) - with tm.ensure_clean(ext) as f: - df1.to_excel(f, engine="openpyxl", sheet_name="foo", index=False) - with ExcelWriter( - f, engine="openpyxl", mode="a", if_sheet_exists=if_sheet_exists - ) as writer: - df2.to_excel(writer, sheet_name="foo", index=False) + df1.to_excel(tmp_excel, engine="openpyxl", sheet_name="foo", index=False) + with ExcelWriter( + tmp_excel, engine="openpyxl", mode="a", 
if_sheet_exists=if_sheet_exists + ) as writer: + df2.to_excel(writer, sheet_name="foo", index=False) - with contextlib.closing(openpyxl.load_workbook(f)) as wb: - assert len(wb.sheetnames) == num_sheets - assert wb.sheetnames[0] == "foo" - result = pd.read_excel(wb, "foo", engine="openpyxl") - assert list(result["fruit"]) == expected - if len(wb.sheetnames) == 2: - result = pd.read_excel(wb, wb.sheetnames[1], engine="openpyxl") - tm.assert_frame_equal(result, df2) + with contextlib.closing(openpyxl.load_workbook(tmp_excel)) as wb: + assert len(wb.sheetnames) == num_sheets + assert wb.sheetnames[0] == "foo" + result = pd.read_excel(wb, "foo", engine="openpyxl") + assert list(result["fruit"]) == expected + if len(wb.sheetnames) == 2: + result = pd.read_excel(wb, wb.sheetnames[1], engine="openpyxl") + tm.assert_frame_equal(result, df2) @pytest.mark.parametrize( @@ -223,28 +230,29 @@ def test_if_sheet_exists_append_modes(ext, if_sheet_exists, num_sheets, expected (1, 1, ["hello", "world"], ["goodbye", "poop"]), ], ) -def test_append_overlay_startrow_startcol(ext, startrow, startcol, greeting, goodbye): +def test_append_overlay_startrow_startcol( + tmp_excel, startrow, startcol, greeting, goodbye +): df1 = DataFrame({"greeting": ["hello", "world"], "goodbye": ["goodbye", "people"]}) df2 = DataFrame(["poop"]) - with tm.ensure_clean(ext) as f: - df1.to_excel(f, engine="openpyxl", sheet_name="poo", index=False) - with ExcelWriter( - f, engine="openpyxl", mode="a", if_sheet_exists="overlay" - ) as writer: - # use startrow+1 because we don't have a header - df2.to_excel( - writer, - index=False, - header=False, - startrow=startrow + 1, - startcol=startcol, - sheet_name="poo", - ) + df1.to_excel(tmp_excel, engine="openpyxl", sheet_name="poo", index=False) + with ExcelWriter( + tmp_excel, engine="openpyxl", mode="a", if_sheet_exists="overlay" + ) as writer: + # use startrow+1 because we don't have a header + df2.to_excel( + writer, + index=False, + header=False, + startrow=startrow + 1, + startcol=startcol, + sheet_name="poo", + ) - result = pd.read_excel(f, sheet_name="poo", engine="openpyxl") - expected = DataFrame({"greeting": greeting, "goodbye": goodbye}) - tm.assert_frame_equal(result, expected) + result = pd.read_excel(tmp_excel, sheet_name="poo", engine="openpyxl") + expected = DataFrame({"greeting": greeting, "goodbye": goodbye}) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( @@ -265,29 +273,27 @@ def test_append_overlay_startrow_startcol(ext, startrow, startcol, greeting, goo ), ], ) -def test_if_sheet_exists_raises(ext, if_sheet_exists, msg): +def test_if_sheet_exists_raises(tmp_excel, if_sheet_exists, msg): # GH 40230 df = DataFrame({"fruit": ["pear"]}) - with tm.ensure_clean(ext) as f: - with pytest.raises(ValueError, match=re.escape(msg)): - df.to_excel(f, sheet_name="foo", engine="openpyxl") - with ExcelWriter( - f, engine="openpyxl", mode="a", if_sheet_exists=if_sheet_exists - ) as writer: - df.to_excel(writer, sheet_name="foo") + df.to_excel(tmp_excel, sheet_name="foo", engine="openpyxl") + with pytest.raises(ValueError, match=re.escape(msg)): + with ExcelWriter( + tmp_excel, engine="openpyxl", mode="a", if_sheet_exists=if_sheet_exists + ) as writer: + df.to_excel(writer, sheet_name="foo") -def test_to_excel_with_openpyxl_engine(ext): +def test_to_excel_with_openpyxl_engine(tmp_excel): # GH 29854 - with tm.ensure_clean(ext) as filename: - df1 = DataFrame({"A": np.linspace(1, 10, 10)}) - df2 = DataFrame({"B": np.linspace(1, 20, 10)}) - df = pd.concat([df1, df2], 
axis=1) - styled = df.style.map( - lambda val: f"color: {'red' if val < 0 else 'black'}" - ).highlight_max() + df1 = DataFrame({"A": np.linspace(1, 10, 10)}) + df2 = DataFrame({"B": np.linspace(1, 20, 10)}) + df = pd.concat([df1, df2], axis=1) + styled = df.style.map( + lambda val: f"color: {'red' if val < 0 else 'black'}" + ).highlight_max() - styled.to_excel(filename, engine="openpyxl") + styled.to_excel(tmp_excel, engine="openpyxl") @pytest.mark.parametrize("read_only", [True, False]) @@ -337,25 +343,24 @@ def test_read_with_bad_dimension( tm.assert_frame_equal(result, expected) -def test_append_mode_file(ext): +def test_append_mode_file(tmp_excel): # GH 39576 df = DataFrame() - with tm.ensure_clean(ext) as f: - df.to_excel(f, engine="openpyxl") + df.to_excel(tmp_excel, engine="openpyxl") - with ExcelWriter( - f, mode="a", engine="openpyxl", if_sheet_exists="new" - ) as writer: - df.to_excel(writer) + with ExcelWriter( + tmp_excel, mode="a", engine="openpyxl", if_sheet_exists="new" + ) as writer: + df.to_excel(writer) - # make sure that zip files are not concatenated by making sure that - # "docProps/app.xml" only occurs twice in the file - data = Path(f).read_bytes() - first = data.find(b"docProps/app.xml") - second = data.find(b"docProps/app.xml", first + 1) - third = data.find(b"docProps/app.xml", second + 1) - assert second != -1 and third == -1 + # make sure that zip files are not concatenated by making sure that + # "docProps/app.xml" only occurs twice in the file + data = Path(tmp_excel).read_bytes() + first = data.find(b"docProps/app.xml") + second = data.find(b"docProps/app.xml", first + 1) + third = data.find(b"docProps/app.xml", second + 1) + assert second != -1 and third == -1 # When read_only is None, use read_excel instead of a workbook @@ -396,13 +401,12 @@ def test_read_empty_with_blank_row(datapath, ext, read_only): tm.assert_frame_equal(result, expected) -def test_book_and_sheets_consistent(ext): +def test_book_and_sheets_consistent(tmp_excel): # GH#45687 - Ensure sheets is updated if user modifies book - with tm.ensure_clean(ext) as f: - with ExcelWriter(f, engine="openpyxl") as writer: - assert writer.sheets == {} - sheet = writer.book.create_sheet("test_name", 0) - assert writer.sheets == {"test_name": sheet} + with ExcelWriter(tmp_excel, engine="openpyxl") as writer: + assert writer.sheets == {} + sheet = writer.book.create_sheet("test_name", 0) + assert writer.sheets == {"test_name": sheet} def test_ints_spelled_with_decimals(datapath, ext): diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 15712f36da4ca..140cf39b26556 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -11,13 +11,12 @@ import platform import re from urllib.error import URLError +import uuid from zipfile import BadZipFile import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype - import pandas.util._test_decorators as td import pandas as pd @@ -29,10 +28,6 @@ read_csv, ) import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - StringArray, -) read_ext_params = [".xls", ".xlsx", ".xlsm", ".xlsb", ".ods"] engine_params = [ @@ -122,6 +117,13 @@ def read_ext(engine_and_read_ext): return read_ext +@pytest.fixture +def tmp_excel(read_ext, tmp_path): + tmp = tmp_path / f"{uuid.uuid4()}{read_ext}" + tmp.touch() + return str(tmp) + + @pytest.fixture def df_ref(datapath): """ @@ -133,10 +135,13 @@ def df_ref(datapath): def get_exp_unit(read_ext: str, engine: 
str | None) -> str: - return "ns" + unit = "us" + if (read_ext == ".ods") ^ (engine == "calamine"): + unit = "s" + return unit -def adjust_expected(expected: DataFrame, read_ext: str, engine: str) -> None: +def adjust_expected(expected: DataFrame, read_ext: str, engine: str | None) -> None: expected.index.name = None unit = get_exp_unit(read_ext, engine) # error: "Index" has no attribute "as_unit" @@ -153,6 +158,36 @@ def xfail_datetimes_with_pyxlsb(engine, request): class TestReaders: + @pytest.mark.parametrize("col", [[True, None, False], [True], [True, False]]) + def test_read_excel_type_check(self, col, tmp_excel, read_ext): + # GH 58159 + if read_ext in (".xlsb", ".xls"): + pytest.skip(f"No engine for filetype: '{read_ext}'") + df = DataFrame({"bool_column": col}, dtype="boolean") + df.to_excel(tmp_excel, index=False) + df2 = pd.read_excel(tmp_excel, dtype={"bool_column": "boolean"}) + tm.assert_frame_equal(df, df2) + + def test_pass_none_type(self, datapath): + # GH 58159 + f_path = datapath("io", "data", "excel", "test_none_type.xlsx") + + with pd.ExcelFile(f_path) as excel: + parsed = pd.read_excel( + excel, + sheet_name="Sheet1", + keep_default_na=True, + na_values=["nan", "None", "abcd"], + dtype="boolean", + engine="openpyxl", + ) + expected = DataFrame( + {"Test": [True, None, False, None, False, None, True]}, + dtype="boolean", + ) + + tm.assert_frame_equal(parsed, expected) + @pytest.fixture(autouse=True) def cd_and_set_engine(self, engine, datapath, monkeypatch): """ @@ -550,7 +585,7 @@ def test_reader_dtype(self, read_ext): expected["a"] = expected["a"].astype("float64") expected["b"] = expected["b"].astype("float32") - expected["c"] = Series(["001", "002", "003", "004"], dtype=object) + expected["c"] = Series(["001", "002", "003", "004"], dtype="str") tm.assert_frame_equal(actual, expected) msg = "Unable to convert column d to type int64" @@ -562,25 +597,21 @@ def test_reader_dtype(self, read_ext): [ ( None, - DataFrame( - { - "a": [1, 2, 3, 4], - "b": [2.5, 3.5, 4.5, 5.5], - "c": [1, 2, 3, 4], - "d": [1.0, 2.0, np.nan, 4.0], - } - ), + { + "a": [1, 2, 3, 4], + "b": [2.5, 3.5, 4.5, 5.5], + "c": [1, 2, 3, 4], + "d": [1.0, 2.0, np.nan, 4.0], + }, ), ( {"a": "float64", "b": "float32", "c": str, "d": str}, - DataFrame( - { - "a": Series([1, 2, 3, 4], dtype="float64"), - "b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"), - "c": Series(["001", "002", "003", "004"], dtype=object), - "d": Series(["1", "2", np.nan, "4"], dtype=object), - } - ), + { + "a": Series([1, 2, 3, 4], dtype="float64"), + "b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"), + "c": Series(["001", "002", "003", "004"], dtype="str"), + "d": Series(["1", "2", np.nan, "4"], dtype="str"), + }, ), ], ) @@ -589,9 +620,10 @@ def test_reader_dtype_str(self, read_ext, dtype, expected): basename = "testdtype" actual = pd.read_excel(basename + read_ext, dtype=dtype) + expected = DataFrame(expected) tm.assert_frame_equal(actual, expected) - def test_dtype_backend(self, read_ext, dtype_backend, engine): + def test_dtype_backend(self, read_ext, dtype_backend, engine, tmp_excel): # GH#36712 if read_ext in (".xlsb", ".xls"): pytest.skip(f"No engine for filetype: '{read_ext}'") @@ -610,11 +642,10 @@ def test_dtype_backend(self, read_ext, dtype_backend, engine): "j": Series([pd.NA, pd.NA], dtype="Int64"), } ) - with tm.ensure_clean(read_ext) as file_path: - df.to_excel(file_path, sheet_name="test", index=False) - result = pd.read_excel( - file_path, sheet_name="test", dtype_backend=dtype_backend - ) + df.to_excel(tmp_excel, 
sheet_name="test", index=False) + result = pd.read_excel( + tmp_excel, sheet_name="test", dtype_backend=dtype_backend + ) if dtype_backend == "pyarrow": import pyarrow as pa @@ -639,60 +670,48 @@ def test_dtype_backend(self, read_ext, dtype_backend, engine): tm.assert_frame_equal(result, expected) - def test_dtype_backend_and_dtype(self, read_ext): + def test_dtype_backend_and_dtype(self, read_ext, tmp_excel): # GH#36712 if read_ext in (".xlsb", ".xls"): pytest.skip(f"No engine for filetype: '{read_ext}'") df = DataFrame({"a": [np.nan, 1.0], "b": [2.5, np.nan]}) - with tm.ensure_clean(read_ext) as file_path: - df.to_excel(file_path, sheet_name="test", index=False) - result = pd.read_excel( - file_path, - sheet_name="test", - dtype_backend="numpy_nullable", - dtype="float64", - ) + df.to_excel(tmp_excel, sheet_name="test", index=False) + result = pd.read_excel( + tmp_excel, + sheet_name="test", + dtype_backend="numpy_nullable", + dtype="float64", + ) tm.assert_frame_equal(result, df) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="infer_string takes precedence" - ) - def test_dtype_backend_string(self, read_ext, string_storage): + def test_dtype_backend_string(self, read_ext, string_storage, tmp_excel): # GH#36712 if read_ext in (".xlsb", ".xls"): pytest.skip(f"No engine for filetype: '{read_ext}'") - pa = pytest.importorskip("pyarrow") + df = DataFrame( + { + "a": np.array(["a", "b"], dtype=np.object_), + "b": np.array(["x", pd.NA], dtype=np.object_), + } + ) + df.to_excel(tmp_excel, sheet_name="test", index=False) with pd.option_context("mode.string_storage", string_storage): - df = DataFrame( - { - "a": np.array(["a", "b"], dtype=np.object_), - "b": np.array(["x", pd.NA], dtype=np.object_), - } + result = pd.read_excel( + tmp_excel, sheet_name="test", dtype_backend="numpy_nullable" ) - with tm.ensure_clean(read_ext) as file_path: - df.to_excel(file_path, sheet_name="test", index=False) - result = pd.read_excel( - file_path, sheet_name="test", dtype_backend="numpy_nullable" - ) - if string_storage == "python": - expected = DataFrame( - { - "a": StringArray(np.array(["a", "b"], dtype=np.object_)), - "b": StringArray(np.array(["x", pd.NA], dtype=np.object_)), - } - ) - else: - expected = DataFrame( - { - "a": ArrowStringArray(pa.array(["a", "b"])), - "b": ArrowStringArray(pa.array(["x", None])), - } - ) - tm.assert_frame_equal(result, expected) + expected = DataFrame( + { + "a": Series(["a", "b"], dtype=pd.StringDtype(string_storage)), + "b": Series(["x", None], dtype=pd.StringDtype(string_storage)), + } + ) + # the storage of the str columns' Index is also affected by the + # string_storage setting -> ignore that for checking the result + tm.assert_frame_equal(result, expected, check_column_type=False) @pytest.mark.parametrize("dtypes, exp_value", [({}, 1), ({"a.1": "int64"}, 1)]) def test_dtype_mangle_dup_cols(self, read_ext, dtypes, exp_value): @@ -891,8 +910,7 @@ def test_corrupt_bytes_raises(self, engine): error = XLRDError msg = ( - "Unsupported format, or corrupt file: Expected BOF " - "record; found b'foo'" + "Unsupported format, or corrupt file: Expected BOF record; found b'foo'" ) elif engine == "calamine": from python_calamine import CalamineError @@ -969,19 +987,6 @@ def test_read_from_pathlib_path(self, read_ext): tm.assert_frame_equal(expected, actual) - @td.skip_if_no("py.path") - def test_read_from_py_localpath(self, read_ext): - # GH12655 - from py.path import local as LocalPath - - str_path = os.path.join("test1" + read_ext) - expected = 
pd.read_excel(str_path, sheet_name="Sheet1", index_col=0) - - path_obj = LocalPath().join("test1" + read_ext) - actual = pd.read_excel(path_obj, sheet_name="Sheet1", index_col=0) - - tm.assert_frame_equal(expected, actual) - def test_close_from_py_localpath(self, read_ext): # GH31467 str_path = os.path.join("test1" + read_ext) @@ -1076,7 +1081,7 @@ def test_read_excel_multiindex(self, request, engine, read_ext): tm.assert_frame_equal(actual, expected) # "mi_column_name" sheet - expected.index = list(range(4)) + expected.index = range(4) expected.columns = mi.set_names(["c1", "c2"]) actual = pd.read_excel( mi_file, sheet_name="mi_column_name", header=[0, 1], index_col=0 @@ -1128,7 +1133,6 @@ def test_read_excel_multiindex_blank_after_name( mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]], names=["c1", "c2"]) unit = get_exp_unit(read_ext, engine) - expected = DataFrame( [ [1, 2.5, pd.Timestamp("2015-01-01"), True], @@ -1464,17 +1468,10 @@ def test_euro_decimal_format(self, read_ext): class TestExcelFileRead: - def test_deprecate_bytes_input(self, engine, read_ext): + def test_raises_bytes_input(self, engine, read_ext): # GH 53830 - msg = ( - "Passing bytes to 'read_excel' is deprecated and " - "will be removed in a future version. To read from a " - "byte string, wrap it in a `BytesIO` object." - ) - - with tm.assert_produces_warning( - FutureWarning, match=msg, raise_on_extra_warnings=False - ): + msg = "Expected file path name or file-like object" + with pytest.raises(TypeError, match=msg): with open("test1" + read_ext, "rb") as f: pd.read_excel(f.read(), engine=engine) @@ -1693,6 +1690,7 @@ def test_read_datetime_multiindex(self, request, engine, read_ext): actual = pd.read_excel(excel, header=[0, 1], index_col=0, engine=engine) unit = get_exp_unit(read_ext, engine) + dti = pd.DatetimeIndex(["2020-02-29", "2020-03-01"], dtype=f"M8[{unit}]") expected_column_index = MultiIndex.from_arrays( [dti[:1], dti[1:]], @@ -1724,7 +1722,7 @@ def test_ignore_chartsheets(self, request, engine, read_ext): with pd.ExcelFile("chartsheet" + read_ext) as excel: assert excel.sheet_names == ["Sheet1"] - def test_corrupt_files_closed(self, engine, read_ext): + def test_corrupt_files_closed(self, engine, tmp_excel): # GH41778 errors = (BadZipFile,) if engine is None: @@ -1738,10 +1736,9 @@ def test_corrupt_files_closed(self, engine, read_ext): errors = (CalamineError,) - with tm.ensure_clean(f"corrupt{read_ext}") as file: - Path(file).write_text("corrupt", encoding="utf-8") - with tm.assert_produces_warning(False): - try: - pd.ExcelFile(file, engine=engine) - except errors: - pass + Path(tmp_excel).write_text("corrupt", encoding="utf-8") + with tm.assert_produces_warning(False): + try: + pd.ExcelFile(tmp_excel, engine=engine) + except errors: + pass diff --git a/pandas/tests/io/excel/test_style.py b/pandas/tests/io/excel/test_style.py index 3ca8637885639..0e13b2f94ed58 100644 --- a/pandas/tests/io/excel/test_style.py +++ b/pandas/tests/io/excel/test_style.py @@ -1,5 +1,6 @@ import contextlib import time +import uuid import numpy as np import pytest @@ -8,6 +9,9 @@ from pandas import ( DataFrame, + MultiIndex, + Timestamp, + period_range, read_excel, ) import pandas._testing as tm @@ -21,6 +25,13 @@ # 'template' file, but this needs the import error to delayed until render time. 
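
Note: the new style tests added below (`test_styler_default_values`, `test_styler_custom_style`) verify formatting by reopening the written file with openpyxl rather than through pandas. A standalone sketch of that inspection pattern, assuming openpyxl is available — the file name is illustrative, and the asserted defaults mirror what these tests expect on pandas main (GH 54154); on older pandas versions, default header styling may make them fail:

    import openpyxl
    import pandas as pd

    # Write a small frame through the openpyxl engine, then reopen the
    # workbook directly to inspect the formatting pandas produced.
    df = pd.DataFrame([{"A": 1, "B": 2, "C": 3}])
    df.to_excel("styled.xlsx", sheet_name="custom", engine="openpyxl")

    wb = openpyxl.load_workbook("styled.xlsx")
    cell = wb["custom"].cell(1, 1)  # openpyxl uses 1-based row/column indexes

    # With no Styler applied, cells keep openpyxl's defaults: no bold font,
    # unset alignment, and borders without an explicit color.
    assert cell.font.bold is False
    assert cell.alignment.horizontal is None
    assert cell.border.bottom.color is None
    wb.close()

Checking through `openpyxl.load_workbook` instead of `pd.read_excel` is deliberate: it sees cell styling, which a DataFrame round trip would discard.
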
+@pytest.fixture +def tmp_excel(tmp_path): + tmp = tmp_path / f"{uuid.uuid4()}.xlsx" + tmp.touch() + return str(tmp) + + def assert_equal_cell_styles(cell1, cell2): # TODO: should find a better way to check equality assert cell1.alignment.__dict__ == cell2.alignment.__dict__ @@ -31,26 +42,43 @@ def assert_equal_cell_styles(cell1, cell2): assert cell1.protection.__dict__ == cell2.protection.__dict__ -@pytest.mark.parametrize( - "engine", - ["xlsxwriter", "openpyxl"], -) -def test_styler_to_excel_unstyled(engine): +def test_styler_default_values(tmp_excel): + # GH 54154 + openpyxl = pytest.importorskip("openpyxl") + df = DataFrame([{"A": 1, "B": 2, "C": 3}, {"A": 1, "B": 2, "C": 3}]) + + with ExcelWriter(tmp_excel, engine="openpyxl") as writer: + df.to_excel(writer, sheet_name="custom") + + with contextlib.closing(openpyxl.load_workbook(tmp_excel)) as wb: + # Check font, spacing, indentation + assert wb["custom"].cell(1, 1).font.bold is False + assert wb["custom"].cell(1, 1).alignment.horizontal is None + assert wb["custom"].cell(1, 1).alignment.vertical is None + + # Check border + assert wb["custom"].cell(1, 1).border.bottom.color is None + assert wb["custom"].cell(1, 1).border.top.color is None + assert wb["custom"].cell(1, 1).border.left.color is None + assert wb["custom"].cell(1, 1).border.right.color is None + + +@pytest.mark.parametrize("engine", ["xlsxwriter", "openpyxl"]) +def test_styler_to_excel_unstyled(engine, tmp_excel): # compare DataFrame.to_excel and Styler.to_excel when no styles applied pytest.importorskip(engine) df = DataFrame(np.random.default_rng(2).standard_normal((2, 2))) - with tm.ensure_clean(".xlsx") as path: - with ExcelWriter(path, engine=engine) as writer: - df.to_excel(writer, sheet_name="dataframe") - df.style.to_excel(writer, sheet_name="unstyled") + with ExcelWriter(tmp_excel, engine=engine) as writer: + df.to_excel(writer, sheet_name="dataframe") + df.style.to_excel(writer, sheet_name="unstyled") - openpyxl = pytest.importorskip("openpyxl") # test loading only with openpyxl - with contextlib.closing(openpyxl.load_workbook(path)) as wb: - for col1, col2 in zip(wb["dataframe"].columns, wb["unstyled"].columns): - assert len(col1) == len(col2) - for cell1, cell2 in zip(col1, col2): - assert cell1.value == cell2.value - assert_equal_cell_styles(cell1, cell2) + openpyxl = pytest.importorskip("openpyxl") # test loading only with openpyxl + with contextlib.closing(openpyxl.load_workbook(tmp_excel)) as wb: + for col1, col2 in zip(wb["dataframe"].columns, wb["unstyled"].columns): + assert len(col1) == len(col2) + for cell1, cell2 in zip(col1, col2): + assert cell1.value == cell2.value + assert_equal_cell_styles(cell1, cell2) shared_style_params = [ @@ -123,43 +151,65 @@ def test_styler_to_excel_unstyled(engine): ] -@pytest.mark.parametrize( - "engine", - ["xlsxwriter", "openpyxl"], -) +def test_styler_custom_style(tmp_excel): + # GH 54154 + css_style = "background-color: #111222" + openpyxl = pytest.importorskip("openpyxl") + df = DataFrame([{"A": 1, "B": 2}, {"A": 1, "B": 2}]) + + with ExcelWriter(tmp_excel, engine="openpyxl") as writer: + styler = df.style.map(lambda x: css_style) + styler.to_excel(writer, sheet_name="custom", index=False) + + with contextlib.closing(openpyxl.load_workbook(tmp_excel)) as wb: + # Check font, spacing, indentation + assert wb["custom"].cell(1, 1).font.bold is False + assert wb["custom"].cell(1, 1).alignment.horizontal is None + assert wb["custom"].cell(1, 1).alignment.vertical is None + + # Check border + assert wb["custom"].cell(1, 
1).border.bottom.color is None + assert wb["custom"].cell(1, 1).border.top.color is None + assert wb["custom"].cell(1, 1).border.left.color is None + assert wb["custom"].cell(1, 1).border.right.color is None + + # Check background color + assert wb["custom"].cell(2, 1).fill.fgColor.index == "00111222" + assert wb["custom"].cell(3, 1).fill.fgColor.index == "00111222" + assert wb["custom"].cell(2, 2).fill.fgColor.index == "00111222" + assert wb["custom"].cell(3, 2).fill.fgColor.index == "00111222" + + +@pytest.mark.parametrize("engine", ["xlsxwriter", "openpyxl"]) @pytest.mark.parametrize("css, attrs, expected", shared_style_params) -def test_styler_to_excel_basic(engine, css, attrs, expected): +def test_styler_to_excel_basic(engine, css, attrs, expected, tmp_excel): pytest.importorskip(engine) df = DataFrame(np.random.default_rng(2).standard_normal((1, 1))) styler = df.style.map(lambda x: css) - with tm.ensure_clean(".xlsx") as path: - with ExcelWriter(path, engine=engine) as writer: - df.to_excel(writer, sheet_name="dataframe") - styler.to_excel(writer, sheet_name="styled") - - openpyxl = pytest.importorskip("openpyxl") # test loading only with openpyxl - with contextlib.closing(openpyxl.load_workbook(path)) as wb: - # test unstyled data cell does not have expected styles - # test styled cell has expected styles - u_cell, s_cell = wb["dataframe"].cell(2, 2), wb["styled"].cell(2, 2) - for attr in attrs: - u_cell, s_cell = getattr(u_cell, attr, None), getattr(s_cell, attr) - - if isinstance(expected, dict): - assert u_cell is None or u_cell != expected[engine] - assert s_cell == expected[engine] - else: - assert u_cell is None or u_cell != expected - assert s_cell == expected - - -@pytest.mark.parametrize( - "engine", - ["xlsxwriter", "openpyxl"], -) + with ExcelWriter(tmp_excel, engine=engine) as writer: + df.to_excel(writer, sheet_name="dataframe") + styler.to_excel(writer, sheet_name="styled") + + openpyxl = pytest.importorskip("openpyxl") # test loading only with openpyxl + with contextlib.closing(openpyxl.load_workbook(tmp_excel)) as wb: + # test unstyled data cell does not have expected styles + # test styled cell has expected styles + u_cell, s_cell = wb["dataframe"].cell(2, 2), wb["styled"].cell(2, 2) + for attr in attrs: + u_cell, s_cell = getattr(u_cell, attr, None), getattr(s_cell, attr) + + if isinstance(expected, dict): + assert u_cell is None or u_cell != expected[engine] + assert s_cell == expected[engine] + else: + assert u_cell is None or u_cell != expected + assert s_cell == expected + + +@pytest.mark.parametrize("engine", ["xlsxwriter", "openpyxl"]) @pytest.mark.parametrize("css, attrs, expected", shared_style_params) -def test_styler_to_excel_basic_indexes(engine, css, attrs, expected): +def test_styler_to_excel_basic_indexes(engine, css, attrs, expected, tmp_excel): pytest.importorskip(engine) df = DataFrame(np.random.default_rng(2).standard_normal((1, 1))) @@ -172,31 +222,30 @@ def test_styler_to_excel_basic_indexes(engine, css, attrs, expected): null_styler.map_index(lambda x: "null: css;", axis=0) null_styler.map_index(lambda x: "null: css;", axis=1) - with tm.ensure_clean(".xlsx") as path: - with ExcelWriter(path, engine=engine) as writer: - null_styler.to_excel(writer, sheet_name="null_styled") - styler.to_excel(writer, sheet_name="styled") - - openpyxl = pytest.importorskip("openpyxl") # test loading only with openpyxl - with contextlib.closing(openpyxl.load_workbook(path)) as wb: - # test null styled index cells does not have expected styles - # test styled cell 
has expected styles - ui_cell, si_cell = wb["null_styled"].cell(2, 1), wb["styled"].cell(2, 1) - uc_cell, sc_cell = wb["null_styled"].cell(1, 2), wb["styled"].cell(1, 2) - for attr in attrs: - ui_cell, si_cell = getattr(ui_cell, attr, None), getattr(si_cell, attr) - uc_cell, sc_cell = getattr(uc_cell, attr, None), getattr(sc_cell, attr) - - if isinstance(expected, dict): - assert ui_cell is None or ui_cell != expected[engine] - assert si_cell == expected[engine] - assert uc_cell is None or uc_cell != expected[engine] - assert sc_cell == expected[engine] - else: - assert ui_cell is None or ui_cell != expected - assert si_cell == expected - assert uc_cell is None or uc_cell != expected - assert sc_cell == expected + with ExcelWriter(tmp_excel, engine=engine) as writer: + null_styler.to_excel(writer, sheet_name="null_styled") + styler.to_excel(writer, sheet_name="styled") + + openpyxl = pytest.importorskip("openpyxl") # test loading only with openpyxl + with contextlib.closing(openpyxl.load_workbook(tmp_excel)) as wb: + # test null styled index cells does not have expected styles + # test styled cell has expected styles + ui_cell, si_cell = wb["null_styled"].cell(2, 1), wb["styled"].cell(2, 1) + uc_cell, sc_cell = wb["null_styled"].cell(1, 2), wb["styled"].cell(1, 2) + for attr in attrs: + ui_cell, si_cell = getattr(ui_cell, attr, None), getattr(si_cell, attr) + uc_cell, sc_cell = getattr(uc_cell, attr, None), getattr(sc_cell, attr) + + if isinstance(expected, dict): + assert ui_cell is None or ui_cell != expected[engine] + assert si_cell == expected[engine] + assert uc_cell is None or uc_cell != expected[engine] + assert sc_cell == expected[engine] + else: + assert ui_cell is None or ui_cell != expected + assert si_cell == expected + assert uc_cell is None or uc_cell != expected + assert sc_cell == expected # From https://openpyxl.readthedocs.io/en/stable/api/openpyxl.styles.borders.html @@ -219,12 +268,9 @@ def test_styler_to_excel_basic_indexes(engine, css, attrs, expected): ] -@pytest.mark.parametrize( - "engine", - ["xlsxwriter", "openpyxl"], -) +@pytest.mark.parametrize("engine", ["xlsxwriter", "openpyxl"]) @pytest.mark.parametrize("border_style", excel_border_styles) -def test_styler_to_excel_border_style(engine, border_style): +def test_styler_to_excel_border_style(engine, border_style, tmp_excel): css = f"border-left: {border_style} black thin" attrs = ["border", "left", "style"] expected = border_style @@ -233,28 +279,27 @@ def test_styler_to_excel_border_style(engine, border_style): df = DataFrame(np.random.default_rng(2).standard_normal((1, 1))) styler = df.style.map(lambda x: css) - with tm.ensure_clean(".xlsx") as path: - with ExcelWriter(path, engine=engine) as writer: - df.to_excel(writer, sheet_name="dataframe") - styler.to_excel(writer, sheet_name="styled") + with ExcelWriter(tmp_excel, engine=engine) as writer: + df.to_excel(writer, sheet_name="dataframe") + styler.to_excel(writer, sheet_name="styled") - openpyxl = pytest.importorskip("openpyxl") # test loading only with openpyxl - with contextlib.closing(openpyxl.load_workbook(path)) as wb: - # test unstyled data cell does not have expected styles - # test styled cell has expected styles - u_cell, s_cell = wb["dataframe"].cell(2, 2), wb["styled"].cell(2, 2) - for attr in attrs: - u_cell, s_cell = getattr(u_cell, attr, None), getattr(s_cell, attr) + openpyxl = pytest.importorskip("openpyxl") # test loading only with openpyxl + with contextlib.closing(openpyxl.load_workbook(tmp_excel)) as wb: + # test unstyled data cell does 
not have expected styles + # test styled cell has expected styles + u_cell, s_cell = wb["dataframe"].cell(2, 2), wb["styled"].cell(2, 2) + for attr in attrs: + u_cell, s_cell = getattr(u_cell, attr, None), getattr(s_cell, attr) - if isinstance(expected, dict): - assert u_cell is None or u_cell != expected[engine] - assert s_cell == expected[engine] - else: - assert u_cell is None or u_cell != expected - assert s_cell == expected + if isinstance(expected, dict): + assert u_cell is None or u_cell != expected[engine] + assert s_cell == expected[engine] + else: + assert u_cell is None or u_cell != expected + assert s_cell == expected -def test_styler_custom_converter(): +def test_styler_custom_converter(tmp_excel): openpyxl = pytest.importorskip("openpyxl") def custom_converter(css): @@ -262,14 +307,13 @@ def custom_converter(css): df = DataFrame(np.random.default_rng(2).standard_normal((1, 1))) styler = df.style.map(lambda x: "color: #888999") - with tm.ensure_clean(".xlsx") as path: - with ExcelWriter(path, engine="openpyxl") as writer: - ExcelFormatter(styler, style_converter=custom_converter).write( - writer, sheet_name="custom" - ) + with ExcelWriter(tmp_excel, engine="openpyxl") as writer: + ExcelFormatter(styler, style_converter=custom_converter).write( + writer, sheet_name="custom" + ) - with contextlib.closing(openpyxl.load_workbook(path)) as wb: - assert wb["custom"].cell(2, 2).font.color.value == "00111222" + with contextlib.closing(openpyxl.load_workbook(tmp_excel)) as wb: + assert wb["custom"].cell(2, 2).font.color.value == "00111222" @pytest.mark.single_cpu @@ -292,3 +336,26 @@ def test_styler_to_s3(s3_public_bucket, s3so): f"s3://{mock_bucket_name}/{target_file}", index_col=0, storage_options=s3so ) tm.assert_frame_equal(result, df) + + +@pytest.mark.parametrize("merge_cells", [True, False, "columns"]) +def test_format_hierarchical_rows_periodindex(merge_cells): + # GH#60099 + df = DataFrame( + {"A": [1, 2]}, + index=MultiIndex.from_arrays( + [ + period_range(start="2006-10-06", end="2006-10-07", freq="D"), + ["X", "Y"], + ], + names=["date", "category"], + ), + ) + formatter = ExcelFormatter(df, merge_cells=merge_cells) + formatted_cells = formatter._format_hierarchical_rows() + + for cell in formatted_cells: + if cell.row != 0 and cell.col == 0: + assert isinstance(cell.val, Timestamp), ( + "Period should be converted to Timestamp" + ) diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 8c003723c1c71..ced4feb9e7eb9 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -3,15 +3,16 @@ datetime, timedelta, ) +from decimal import Decimal from functools import partial from io import BytesIO import os import re +import uuid import numpy as np import pytest -from pandas.compat._constants import PY310 from pandas.compat._optional import import_optional_dependency import pandas.util._test_decorators as td @@ -22,6 +23,7 @@ MultiIndex, date_range, option_context, + period_range, ) import pandas._testing as tm @@ -36,7 +38,9 @@ def get_exp_unit(path: str) -> str: - return "ns" + if path.endswith(".ods"): + return "s" + return "us" @pytest.fixture @@ -47,18 +51,19 @@ def frame(float_frame): return float_frame[:10] -@pytest.fixture(params=[True, False]) +@pytest.fixture(params=[True, False, "columns"]) def merge_cells(request): return request.param @pytest.fixture -def path(ext): +def tmp_excel(ext, tmp_path): """ Fixture to open file for use in each test case. 
""" - with tm.ensure_clean(ext) as file_path: - yield file_path + tmp = tmp_path / f"{uuid.uuid4()}{ext}" + tmp.touch() + return str(tmp) @pytest.fixture @@ -90,66 +95,61 @@ def set_engine(engine, ext): class TestRoundTrip: @pytest.mark.parametrize( "header,expected", - [(None, DataFrame([np.nan] * 4)), (0, DataFrame({"Unnamed: 0": [np.nan] * 3}))], + [(None, [np.nan] * 4), (0, {"Unnamed: 0": [np.nan] * 3})], ) - def test_read_one_empty_col_no_header(self, ext, header, expected): + def test_read_one_empty_col_no_header(self, tmp_excel, header, expected): # xref gh-12292 filename = "no_header" df = DataFrame([["", 1, 100], ["", 2, 200], ["", 3, 300], ["", 4, 400]]) - with tm.ensure_clean(ext) as path: - df.to_excel(path, sheet_name=filename, index=False, header=False) - result = pd.read_excel( - path, sheet_name=filename, usecols=[0], header=header - ) - + df.to_excel(tmp_excel, sheet_name=filename, index=False, header=False) + result = pd.read_excel( + tmp_excel, sheet_name=filename, usecols=[0], header=header + ) + expected = DataFrame(expected) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( - "header,expected", - [(None, DataFrame([0] + [np.nan] * 4)), (0, DataFrame([np.nan] * 4))], + "header,expected_extra", + [(None, [0]), (0, [])], ) - def test_read_one_empty_col_with_header(self, ext, header, expected): + def test_read_one_empty_col_with_header(self, tmp_excel, header, expected_extra): filename = "with_header" df = DataFrame([["", 1, 100], ["", 2, 200], ["", 3, 300], ["", 4, 400]]) - with tm.ensure_clean(ext) as path: - df.to_excel(path, sheet_name="with_header", index=False, header=True) - result = pd.read_excel( - path, sheet_name=filename, usecols=[0], header=header - ) - + df.to_excel(tmp_excel, sheet_name="with_header", index=False, header=True) + result = pd.read_excel( + tmp_excel, sheet_name=filename, usecols=[0], header=header + ) + expected = DataFrame(expected_extra + [np.nan] * 4) tm.assert_frame_equal(result, expected) - def test_set_column_names_in_parameter(self, ext): + def test_set_column_names_in_parameter(self, tmp_excel): # GH 12870 : pass down column names associated with # keyword argument names refdf = DataFrame([[1, "foo"], [2, "bar"], [3, "baz"]], columns=["a", "b"]) - with tm.ensure_clean(ext) as pth: - with ExcelWriter(pth) as writer: - refdf.to_excel( - writer, sheet_name="Data_no_head", header=False, index=False - ) - refdf.to_excel(writer, sheet_name="Data_with_head", index=False) + with ExcelWriter(tmp_excel) as writer: + refdf.to_excel(writer, sheet_name="Data_no_head", header=False, index=False) + refdf.to_excel(writer, sheet_name="Data_with_head", index=False) - refdf.columns = ["A", "B"] + refdf.columns = ["A", "B"] - with ExcelFile(pth) as reader: - xlsdf_no_head = pd.read_excel( - reader, sheet_name="Data_no_head", header=None, names=["A", "B"] - ) - xlsdf_with_head = pd.read_excel( - reader, - sheet_name="Data_with_head", - index_col=None, - names=["A", "B"], - ) + with ExcelFile(tmp_excel) as reader: + xlsdf_no_head = pd.read_excel( + reader, sheet_name="Data_no_head", header=None, names=["A", "B"] + ) + xlsdf_with_head = pd.read_excel( + reader, + sheet_name="Data_with_head", + index_col=None, + names=["A", "B"], + ) - tm.assert_frame_equal(xlsdf_no_head, refdf) - tm.assert_frame_equal(xlsdf_with_head, refdf) + tm.assert_frame_equal(xlsdf_no_head, refdf) + tm.assert_frame_equal(xlsdf_with_head, refdf) - def test_creating_and_reading_multiple_sheets(self, ext): + def test_creating_and_reading_multiple_sheets(self, tmp_excel): 
# see gh-9450 # # Test reading multiple sheets, from a runtime @@ -163,124 +163,126 @@ def tdf(col_sheet_name): dfs = [tdf(s) for s in sheets] dfs = dict(zip(sheets, dfs)) - with tm.ensure_clean(ext) as pth: - with ExcelWriter(pth) as ew: - for sheetname, df in dfs.items(): - df.to_excel(ew, sheet_name=sheetname) + with ExcelWriter(tmp_excel) as ew: + for sheetname, df in dfs.items(): + df.to_excel(ew, sheet_name=sheetname) - dfs_returned = pd.read_excel(pth, sheet_name=sheets, index_col=0) + dfs_returned = pd.read_excel(tmp_excel, sheet_name=sheets, index_col=0) - for s in sheets: - tm.assert_frame_equal(dfs[s], dfs_returned[s]) + for s in sheets: + tm.assert_frame_equal(dfs[s], dfs_returned[s]) - def test_read_excel_multiindex_empty_level(self, ext): + def test_read_excel_multiindex_empty_level(self, tmp_excel): # see gh-12453 - with tm.ensure_clean(ext) as path: - df = DataFrame( - { - ("One", "x"): {0: 1}, - ("Two", "X"): {0: 3}, - ("Two", "Y"): {0: 7}, - ("Zero", ""): {0: 0}, - } - ) + df = DataFrame( + { + ("One", "x"): {0: 1}, + ("Two", "X"): {0: 3}, + ("Two", "Y"): {0: 7}, + ("Zero", ""): {0: 0}, + } + ) - expected = DataFrame( - { - ("One", "x"): {0: 1}, - ("Two", "X"): {0: 3}, - ("Two", "Y"): {0: 7}, - ("Zero", "Unnamed: 4_level_1"): {0: 0}, - } - ) + expected = DataFrame( + { + ("One", "x"): {0: 1}, + ("Two", "X"): {0: 3}, + ("Two", "Y"): {0: 7}, + ("Zero", "Unnamed: 4_level_1"): {0: 0}, + } + ) - df.to_excel(path) - actual = pd.read_excel(path, header=[0, 1], index_col=0) - tm.assert_frame_equal(actual, expected) + df.to_excel(tmp_excel) + actual = pd.read_excel(tmp_excel, header=[0, 1], index_col=0) + tm.assert_frame_equal(actual, expected) - df = DataFrame( - { - ("Beg", ""): {0: 0}, - ("Middle", "x"): {0: 1}, - ("Tail", "X"): {0: 3}, - ("Tail", "Y"): {0: 7}, - } - ) + df = DataFrame( + { + ("Beg", ""): {0: 0}, + ("Middle", "x"): {0: 1}, + ("Tail", "X"): {0: 3}, + ("Tail", "Y"): {0: 7}, + } + ) - expected = DataFrame( - { - ("Beg", "Unnamed: 1_level_1"): {0: 0}, - ("Middle", "x"): {0: 1}, - ("Tail", "X"): {0: 3}, - ("Tail", "Y"): {0: 7}, - } - ) + expected = DataFrame( + { + ("Beg", "Unnamed: 1_level_1"): {0: 0}, + ("Middle", "x"): {0: 1}, + ("Tail", "X"): {0: 3}, + ("Tail", "Y"): {0: 7}, + } + ) - df.to_excel(path) - actual = pd.read_excel(path, header=[0, 1], index_col=0) - tm.assert_frame_equal(actual, expected) + df.to_excel(tmp_excel) + actual = pd.read_excel(tmp_excel, header=[0, 1], index_col=0) + tm.assert_frame_equal(actual, expected) @pytest.mark.parametrize("c_idx_names", ["a", None]) @pytest.mark.parametrize("r_idx_names", ["b", None]) @pytest.mark.parametrize("c_idx_levels", [1, 3]) @pytest.mark.parametrize("r_idx_levels", [1, 3]) def test_excel_multindex_roundtrip( - self, ext, c_idx_names, r_idx_names, c_idx_levels, r_idx_levels, request + self, + tmp_excel, + c_idx_names, + r_idx_names, + c_idx_levels, + r_idx_levels, ): # see gh-4679 - with tm.ensure_clean(ext) as pth: - # Empty name case current read in as - # unnamed levels, not Nones. - check_names = bool(r_idx_names) or r_idx_levels <= 1 + # Empty name case current read in as + # unnamed levels, not Nones. 
+ check_names = bool(r_idx_names) or r_idx_levels <= 1 - if c_idx_levels == 1: - columns = Index(list("abcde")) - else: - columns = MultiIndex.from_arrays( - [range(5) for _ in range(c_idx_levels)], - names=[f"{c_idx_names}-{i}" for i in range(c_idx_levels)], - ) - if r_idx_levels == 1: - index = Index(list("ghijk")) - else: - index = MultiIndex.from_arrays( - [range(5) for _ in range(r_idx_levels)], - names=[f"{r_idx_names}-{i}" for i in range(r_idx_levels)], - ) - df = DataFrame( - 1.1 * np.ones((5, 5)), - columns=columns, - index=index, + if c_idx_levels == 1: + columns = Index(list("abcde")) + else: + columns = MultiIndex.from_arrays( + [range(5) for _ in range(c_idx_levels)], + names=[f"{c_idx_names}-{i}" for i in range(c_idx_levels)], ) - df.to_excel(pth) - - act = pd.read_excel( - pth, - index_col=list(range(r_idx_levels)), - header=list(range(c_idx_levels)), + if r_idx_levels == 1: + index = Index(list("ghijk")) + else: + index = MultiIndex.from_arrays( + [range(5) for _ in range(r_idx_levels)], + names=[f"{r_idx_names}-{i}" for i in range(r_idx_levels)], ) - tm.assert_frame_equal(df, act, check_names=check_names) + df = DataFrame( + 1.1 * np.ones((5, 5)), + columns=columns, + index=index, + ) + df.to_excel(tmp_excel) - df.iloc[0, :] = np.nan - df.to_excel(pth) + act = pd.read_excel( + tmp_excel, + index_col=list(range(r_idx_levels)), + header=list(range(c_idx_levels)), + ) + tm.assert_frame_equal(df, act, check_names=check_names) - act = pd.read_excel( - pth, - index_col=list(range(r_idx_levels)), - header=list(range(c_idx_levels)), - ) - tm.assert_frame_equal(df, act, check_names=check_names) - - df.iloc[-1, :] = np.nan - df.to_excel(pth) - act = pd.read_excel( - pth, - index_col=list(range(r_idx_levels)), - header=list(range(c_idx_levels)), - ) - tm.assert_frame_equal(df, act, check_names=check_names) + df.iloc[0, :] = np.nan + df.to_excel(tmp_excel) + + act = pd.read_excel( + tmp_excel, + index_col=list(range(r_idx_levels)), + header=list(range(c_idx_levels)), + ) + tm.assert_frame_equal(df, act, check_names=check_names) + + df.iloc[-1, :] = np.nan + df.to_excel(tmp_excel) + act = pd.read_excel( + tmp_excel, + index_col=list(range(r_idx_levels)), + header=list(range(c_idx_levels)), + ) + tm.assert_frame_equal(df, act, check_names=check_names) - def test_read_excel_parse_dates(self, ext): + def test_read_excel_parse_dates(self, tmp_excel): # see gh-11544, gh-12051 df = DataFrame( {"col": [1, 2, 3], "date_strings": date_range("2012-01-01", periods=3)} @@ -288,34 +290,23 @@ def test_read_excel_parse_dates(self, ext): df2 = df.copy() df2["date_strings"] = df2["date_strings"].dt.strftime("%m/%d/%Y") - with tm.ensure_clean(ext) as pth: - df2.to_excel(pth) + df2.to_excel(tmp_excel) - res = pd.read_excel(pth, index_col=0) - tm.assert_frame_equal(df2, res) + res = pd.read_excel(tmp_excel, index_col=0) + tm.assert_frame_equal(df2, res) - res = pd.read_excel(pth, parse_dates=["date_strings"], index_col=0) - tm.assert_frame_equal(df, res) + res = pd.read_excel(tmp_excel, parse_dates=["date_strings"], index_col=0) + expected = df[:] + expected["date_strings"] = expected["date_strings"].astype("M8[s]") + tm.assert_frame_equal(res, expected) - date_parser = lambda x: datetime.strptime(x, "%m/%d/%Y") - with tm.assert_produces_warning( - FutureWarning, - match="use 'date_format' instead", - raise_on_extra_warnings=False, - ): - res = pd.read_excel( - pth, - parse_dates=["date_strings"], - date_parser=date_parser, - index_col=0, - ) - tm.assert_frame_equal(df, res) - res = pd.read_excel( - pth, 
parse_dates=["date_strings"], date_format="%m/%d/%Y", index_col=0 - ) - tm.assert_frame_equal(df, res) + res = pd.read_excel( + tmp_excel, parse_dates=["date_strings"], date_format="%m/%d/%Y", index_col=0 + ) + expected["date_strings"] = expected["date_strings"].astype("M8[s]") + tm.assert_frame_equal(expected, res) - def test_multiindex_interval_datetimes(self, ext): + def test_multiindex_interval_datetimes(self, tmp_excel): # GH 30986 midx = MultiIndex.from_arrays( [ @@ -326,9 +317,8 @@ def test_multiindex_interval_datetimes(self, ext): ] ) df = DataFrame(range(4), index=midx) - with tm.ensure_clean(ext) as pth: - df.to_excel(pth) - result = pd.read_excel(pth, index_col=[0, 1]) + df.to_excel(tmp_excel) + result = pd.read_excel(tmp_excel, index_col=[0, 1]) expected = DataFrame( range(4), MultiIndex.from_arrays( @@ -342,9 +332,47 @@ def test_multiindex_interval_datetimes(self, ext): ], ] ), + columns=Index([0]), ) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("merge_cells", [True, False, "columns"]) + def test_excel_round_trip_with_periodindex(self, tmp_excel, merge_cells): + # GH#60099 + df = DataFrame( + {"A": [1, 2]}, + index=MultiIndex.from_arrays( + [ + period_range(start="2006-10-06", end="2006-10-07", freq="D"), + ["X", "Y"], + ], + names=["date", "category"], + ), + ) + df.to_excel(tmp_excel, merge_cells=merge_cells) + result = pd.read_excel(tmp_excel, index_col=[0, 1]) + expected = DataFrame( + {"A": [1, 2]}, + MultiIndex.from_arrays( + [ + [ + pd.to_datetime("2006-10-06 00:00:00"), + pd.to_datetime("2006-10-07 00:00:00"), + ], + ["X", "Y"], + ], + names=["date", "category"], + ), + ) + time_format = ( + "datetime64[s]" if tmp_excel.endswith(".ods") else "datetime64[us]" + ) + expected.index = expected.index.set_levels( + expected.index.levels[0].astype(time_format), level=0 + ) + + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "engine,ext", @@ -369,7 +397,7 @@ def test_multiindex_interval_datetimes(self, ext): ) @pytest.mark.usefixtures("set_engine") class TestExcelWriter: - def test_excel_sheet_size(self, path): + def test_excel_sheet_size(self, tmp_excel): # GH 26080 breaking_row_count = 2**20 + 1 breaking_col_count = 2**14 + 1 @@ -381,16 +409,19 @@ def test_excel_sheet_size(self, path): msg = "sheet is too large" with pytest.raises(ValueError, match=msg): - row_df.to_excel(path) + row_df.to_excel(tmp_excel) with pytest.raises(ValueError, match=msg): - col_df.to_excel(path) + col_df.to_excel(tmp_excel) - def test_excel_sheet_by_name_raise(self, path): - gt = DataFrame(np.random.default_rng(2).standard_normal((10, 2))) - gt.to_excel(path) + def test_excel_sheet_by_name_raise(self, tmp_excel): + gt = DataFrame( + np.random.default_rng(2).standard_normal((10, 2)), + index=Index(list(range(10))), + ) + gt.to_excel(tmp_excel) - with ExcelFile(path) as xl: + with ExcelFile(tmp_excel) as xl: df = pd.read_excel(xl, sheet_name=0, index_col=0) tm.assert_frame_equal(gt, df) @@ -399,80 +430,84 @@ def test_excel_sheet_by_name_raise(self, path): with pytest.raises(ValueError, match=msg): pd.read_excel(xl, "0") - def test_excel_writer_context_manager(self, frame, path): - with ExcelWriter(path) as writer: + def test_excel_writer_context_manager(self, frame, tmp_excel): + with ExcelWriter(tmp_excel) as writer: frame.to_excel(writer, sheet_name="Data1") frame2 = frame.copy() frame2.columns = frame.columns[::-1] frame2.to_excel(writer, sheet_name="Data2") - with ExcelFile(path) as reader: + with ExcelFile(tmp_excel) as reader: found_df = 
pd.read_excel(reader, sheet_name="Data1", index_col=0) found_df2 = pd.read_excel(reader, sheet_name="Data2", index_col=0) tm.assert_frame_equal(found_df, frame) tm.assert_frame_equal(found_df2, frame2) - def test_roundtrip(self, frame, path): + def test_roundtrip(self, frame, tmp_excel): frame = frame.copy() frame.iloc[:5, frame.columns.get_loc("A")] = np.nan - frame.to_excel(path, sheet_name="test1") - frame.to_excel(path, sheet_name="test1", columns=["A", "B"]) - frame.to_excel(path, sheet_name="test1", header=False) - frame.to_excel(path, sheet_name="test1", index=False) + frame.to_excel(tmp_excel, sheet_name="test1") + frame.to_excel(tmp_excel, sheet_name="test1", columns=["A", "B"]) + frame.to_excel(tmp_excel, sheet_name="test1", header=False) + frame.to_excel(tmp_excel, sheet_name="test1", index=False) # test roundtrip - frame.to_excel(path, sheet_name="test1") - recons = pd.read_excel(path, sheet_name="test1", index_col=0) + frame.to_excel(tmp_excel, sheet_name="test1") + recons = pd.read_excel(tmp_excel, sheet_name="test1", index_col=0) tm.assert_frame_equal(frame, recons) - frame.to_excel(path, sheet_name="test1", index=False) - recons = pd.read_excel(path, sheet_name="test1", index_col=None) + frame.to_excel(tmp_excel, sheet_name="test1", index=False) + recons = pd.read_excel(tmp_excel, sheet_name="test1", index_col=None) recons.index = frame.index tm.assert_frame_equal(frame, recons) - frame.to_excel(path, sheet_name="test1", na_rep="NA") - recons = pd.read_excel(path, sheet_name="test1", index_col=0, na_values=["NA"]) + frame.to_excel(tmp_excel, sheet_name="test1", na_rep="NA") + recons = pd.read_excel( + tmp_excel, sheet_name="test1", index_col=0, na_values=["NA"] + ) tm.assert_frame_equal(frame, recons) # GH 3611 - frame.to_excel(path, sheet_name="test1", na_rep="88") - recons = pd.read_excel(path, sheet_name="test1", index_col=0, na_values=["88"]) + frame.to_excel(tmp_excel, sheet_name="test1", na_rep="88") + recons = pd.read_excel( + tmp_excel, sheet_name="test1", index_col=0, na_values=["88"] + ) tm.assert_frame_equal(frame, recons) - frame.to_excel(path, sheet_name="test1", na_rep="88") + frame.to_excel(tmp_excel, sheet_name="test1", na_rep="88") recons = pd.read_excel( - path, sheet_name="test1", index_col=0, na_values=[88, 88.0] + tmp_excel, sheet_name="test1", index_col=0, na_values=[88, 88.0] ) tm.assert_frame_equal(frame, recons) # GH 6573 - frame.to_excel(path, sheet_name="Sheet1") - recons = pd.read_excel(path, index_col=0) + frame.to_excel(tmp_excel, sheet_name="Sheet1") + recons = pd.read_excel(tmp_excel, index_col=0) tm.assert_frame_equal(frame, recons) - frame.to_excel(path, sheet_name="0") - recons = pd.read_excel(path, index_col=0) + frame.to_excel(tmp_excel, sheet_name="0") + recons = pd.read_excel(tmp_excel, index_col=0) tm.assert_frame_equal(frame, recons) # GH 8825 Pandas Series should provide to_excel method s = frame["A"] - s.to_excel(path) - recons = pd.read_excel(path, index_col=0) + s.to_excel(tmp_excel) + recons = pd.read_excel(tmp_excel, index_col=0) tm.assert_frame_equal(s.to_frame(), recons) - def test_mixed(self, frame, path): + def test_mixed(self, frame, tmp_excel): mixed_frame = frame.copy() mixed_frame["foo"] = "bar" - mixed_frame.to_excel(path, sheet_name="test1") - with ExcelFile(path) as reader: + mixed_frame.to_excel(tmp_excel, sheet_name="test1") + with ExcelFile(tmp_excel) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0) tm.assert_frame_equal(mixed_frame, recons) - def test_ts_frame(self, path): - unit = 
get_exp_unit(path) + def test_ts_frame(self, tmp_excel): + unit = get_exp_unit(tmp_excel) df = DataFrame( np.random.default_rng(2).standard_normal((5, 4)), columns=Index(list("ABCD")), @@ -486,79 +521,86 @@ def test_ts_frame(self, path): expected = df[:] expected.index = expected.index.as_unit(unit) - df.to_excel(path, sheet_name="test1") - with ExcelFile(path) as reader: + df.to_excel(tmp_excel, sheet_name="test1") + with ExcelFile(tmp_excel) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0) tm.assert_frame_equal(expected, recons) - def test_basics_with_nan(self, frame, path): + def test_basics_with_nan(self, frame, tmp_excel): frame = frame.copy() frame.iloc[:5, frame.columns.get_loc("A")] = np.nan - frame.to_excel(path, sheet_name="test1") - frame.to_excel(path, sheet_name="test1", columns=["A", "B"]) - frame.to_excel(path, sheet_name="test1", header=False) - frame.to_excel(path, sheet_name="test1", index=False) + frame.to_excel(tmp_excel, sheet_name="test1") + frame.to_excel(tmp_excel, sheet_name="test1", columns=["A", "B"]) + frame.to_excel(tmp_excel, sheet_name="test1", header=False) + frame.to_excel(tmp_excel, sheet_name="test1", index=False) @pytest.mark.parametrize("np_type", [np.int8, np.int16, np.int32, np.int64]) - def test_int_types(self, np_type, path): + def test_int_types(self, np_type, tmp_excel): # Test np.int values read come back as int # (rather than float which is Excel's format). df = DataFrame( - np.random.default_rng(2).integers(-10, 10, size=(10, 2)), dtype=np_type + np.random.default_rng(2).integers(-10, 10, size=(10, 2)), + dtype=np_type, + index=Index(list(range(10))), ) - df.to_excel(path, sheet_name="test1") + df.to_excel(tmp_excel, sheet_name="test1") - with ExcelFile(path) as reader: + with ExcelFile(tmp_excel) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0) int_frame = df.astype(np.int64) tm.assert_frame_equal(int_frame, recons) - recons2 = pd.read_excel(path, sheet_name="test1", index_col=0) + recons2 = pd.read_excel(tmp_excel, sheet_name="test1", index_col=0) tm.assert_frame_equal(int_frame, recons2) @pytest.mark.parametrize("np_type", [np.float16, np.float32, np.float64]) - def test_float_types(self, np_type, path): + def test_float_types(self, np_type, tmp_excel): # Test np.float values read come back as float. - df = DataFrame(np.random.default_rng(2).random(10), dtype=np_type) - df.to_excel(path, sheet_name="test1") + df = DataFrame( + np.random.default_rng(2).random(10), + dtype=np_type, + index=Index(list(range(10))), + ) + df.to_excel(tmp_excel, sheet_name="test1") - with ExcelFile(path) as reader: + with ExcelFile(tmp_excel) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0).astype( np_type ) tm.assert_frame_equal(df, recons) - def test_bool_types(self, path): + def test_bool_types(self, tmp_excel): # Test np.bool_ values read come back as float. 
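# Excel stores numeric cells as IEEE doubles and has a native boolean cell
# type; that is why the int tests above cast the result back to int64 and
# the bool test below normalises with astype(np.bool_) after reading.
# A minimal sketch of the round trip (index made explicit so the read-back
# index dtype matches):
#
#     df = DataFrame([True, False], dtype=np.bool_, index=Index([0, 1]))
#     df.to_excel(tmp_excel, sheet_name="test1")
#     recons = pd.read_excel(tmp_excel, sheet_name="test1", index_col=0)
#     tm.assert_frame_equal(df, recons.astype(np.bool_))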
- df = DataFrame([1, 0, True, False], dtype=np.bool_) - df.to_excel(path, sheet_name="test1") + df = DataFrame([1, 0, True, False], dtype=np.bool_, index=Index(list(range(4)))) + df.to_excel(tmp_excel, sheet_name="test1") - with ExcelFile(path) as reader: + with ExcelFile(tmp_excel) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0).astype( np.bool_ ) tm.assert_frame_equal(df, recons) - def test_inf_roundtrip(self, path): - df = DataFrame([(1, np.inf), (2, 3), (5, -np.inf)]) - df.to_excel(path, sheet_name="test1") + def test_inf_roundtrip(self, tmp_excel): + df = DataFrame([(1, np.inf), (2, 3), (5, -np.inf)], index=Index(list(range(3)))) + df.to_excel(tmp_excel, sheet_name="test1") - with ExcelFile(path) as reader: + with ExcelFile(tmp_excel) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0) tm.assert_frame_equal(df, recons) - def test_sheets(self, frame, path): + def test_sheets(self, frame, tmp_excel): # freq doesn't round-trip - unit = get_exp_unit(path) + unit = get_exp_unit(tmp_excel) tsframe = DataFrame( np.random.default_rng(2).standard_normal((5, 4)), columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=5, freq="B"), ) + index = pd.DatetimeIndex(np.asarray(tsframe.index), freq=None) tsframe.index = index @@ -568,16 +610,16 @@ def test_sheets(self, frame, path): frame = frame.copy() frame.iloc[:5, frame.columns.get_loc("A")] = np.nan - frame.to_excel(path, sheet_name="test1") - frame.to_excel(path, sheet_name="test1", columns=["A", "B"]) - frame.to_excel(path, sheet_name="test1", header=False) - frame.to_excel(path, sheet_name="test1", index=False) + frame.to_excel(tmp_excel, sheet_name="test1") + frame.to_excel(tmp_excel, sheet_name="test1", columns=["A", "B"]) + frame.to_excel(tmp_excel, sheet_name="test1", header=False) + frame.to_excel(tmp_excel, sheet_name="test1", index=False) # Test writing to separate sheets - with ExcelWriter(path) as writer: + with ExcelWriter(tmp_excel) as writer: frame.to_excel(writer, sheet_name="test1") tsframe.to_excel(writer, sheet_name="test2") - with ExcelFile(path) as reader: + with ExcelFile(tmp_excel) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0) tm.assert_frame_equal(frame, recons) recons = pd.read_excel(reader, sheet_name="test2", index_col=0) @@ -586,39 +628,39 @@ def test_sheets(self, frame, path): assert "test1" == reader.sheet_names[0] assert "test2" == reader.sheet_names[1] - def test_colaliases(self, frame, path): + def test_colaliases(self, frame, tmp_excel): frame = frame.copy() frame.iloc[:5, frame.columns.get_loc("A")] = np.nan - frame.to_excel(path, sheet_name="test1") - frame.to_excel(path, sheet_name="test1", columns=["A", "B"]) - frame.to_excel(path, sheet_name="test1", header=False) - frame.to_excel(path, sheet_name="test1", index=False) + frame.to_excel(tmp_excel, sheet_name="test1") + frame.to_excel(tmp_excel, sheet_name="test1", columns=["A", "B"]) + frame.to_excel(tmp_excel, sheet_name="test1", header=False) + frame.to_excel(tmp_excel, sheet_name="test1", index=False) # column aliases col_aliases = Index(["AA", "X", "Y", "Z"]) - frame.to_excel(path, sheet_name="test1", header=col_aliases) - with ExcelFile(path) as reader: + frame.to_excel(tmp_excel, sheet_name="test1", header=col_aliases) + with ExcelFile(tmp_excel) as reader: rs = pd.read_excel(reader, sheet_name="test1", index_col=0) xp = frame.copy() xp.columns = col_aliases tm.assert_frame_equal(xp, rs) - def test_roundtrip_indexlabels(self, merge_cells, frame, path): + def 
test_roundtrip_indexlabels(self, merge_cells, frame, tmp_excel): frame = frame.copy() frame.iloc[:5, frame.columns.get_loc("A")] = np.nan - frame.to_excel(path, sheet_name="test1") - frame.to_excel(path, sheet_name="test1", columns=["A", "B"]) - frame.to_excel(path, sheet_name="test1", header=False) - frame.to_excel(path, sheet_name="test1", index=False) + frame.to_excel(tmp_excel, sheet_name="test1") + frame.to_excel(tmp_excel, sheet_name="test1", columns=["A", "B"]) + frame.to_excel(tmp_excel, sheet_name="test1", header=False) + frame.to_excel(tmp_excel, sheet_name="test1", index=False) # test index_label df = DataFrame(np.random.default_rng(2).standard_normal((10, 2))) >= 0 df.to_excel( - path, sheet_name="test1", index_label=["test"], merge_cells=merge_cells + tmp_excel, sheet_name="test1", index_label=["test"], merge_cells=merge_cells ) - with ExcelFile(path) as reader: + with ExcelFile(tmp_excel) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0).astype( np.int64 ) @@ -627,23 +669,29 @@ def test_roundtrip_indexlabels(self, merge_cells, frame, path): df = DataFrame(np.random.default_rng(2).standard_normal((10, 2))) >= 0 df.to_excel( - path, + tmp_excel, sheet_name="test1", index_label=["test", "dummy", "dummy2"], merge_cells=merge_cells, ) - with ExcelFile(path) as reader: + with ExcelFile(tmp_excel) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0).astype( np.int64 ) df.index.names = ["test"] assert df.index.names == recons.index.names - df = DataFrame(np.random.default_rng(2).standard_normal((10, 2))) >= 0 + df = ( + DataFrame( + np.random.default_rng(2).standard_normal((10, 2)), + index=Index(list(range(10))), + ) + >= 0 + ) df.to_excel( - path, sheet_name="test1", index_label="test", merge_cells=merge_cells + tmp_excel, sheet_name="test1", index_label="test", merge_cells=merge_cells ) - with ExcelFile(path) as reader: + with ExcelFile(tmp_excel) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0).astype( np.int64 ) @@ -651,7 +699,7 @@ def test_roundtrip_indexlabels(self, merge_cells, frame, path): tm.assert_frame_equal(df, recons.astype(bool)) frame.to_excel( - path, + tmp_excel, sheet_name="test1", columns=["A", "B", "C", "D"], index=False, @@ -661,25 +709,25 @@ def test_roundtrip_indexlabels(self, merge_cells, frame, path): df = frame.copy() df = df.set_index(["A", "B"]) - with ExcelFile(path) as reader: + with ExcelFile(tmp_excel) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=[0, 1]) tm.assert_frame_equal(df, recons) - def test_excel_roundtrip_indexname(self, merge_cells, path): + def test_excel_roundtrip_indexname(self, merge_cells, tmp_excel): df = DataFrame(np.random.default_rng(2).standard_normal((10, 4))) df.index.name = "foo" - df.to_excel(path, merge_cells=merge_cells) + df.to_excel(tmp_excel, merge_cells=merge_cells) - with ExcelFile(path) as xf: + with ExcelFile(tmp_excel) as xf: result = pd.read_excel(xf, sheet_name=xf.sheet_names[0], index_col=0) tm.assert_frame_equal(result, df) assert result.index.name == "foo" - def test_excel_roundtrip_datetime(self, merge_cells, path): + def test_excel_roundtrip_datetime(self, merge_cells, tmp_excel): # datetime.date, not sure what to test here exactly - unit = get_exp_unit(path) + unit = get_exp_unit(tmp_excel) # freq does not round-trip tsframe = DataFrame( @@ -693,21 +741,20 @@ def test_excel_roundtrip_datetime(self, merge_cells, path): tsf = tsframe.copy() tsf.index = [x.date() for x in tsframe.index] - tsf.to_excel(path, 
sheet_name="test1", merge_cells=merge_cells) + tsf.to_excel(tmp_excel, sheet_name="test1", merge_cells=merge_cells) - with ExcelFile(path) as reader: + with ExcelFile(tmp_excel) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0) expected = tsframe[:] expected.index = expected.index.as_unit(unit) tm.assert_frame_equal(expected, recons) - def test_excel_date_datetime_format(self, ext, path): + def test_excel_date_datetime_format(self, ext, tmp_excel, tmp_path): # see gh-4133 # # Excel output format strings - unit = get_exp_unit(path) - + unit = get_exp_unit(tmp_excel) df = DataFrame( [ [date(2014, 1, 31), date(1999, 9, 24)], @@ -726,22 +773,26 @@ def test_excel_date_datetime_format(self, ext, path): ) df_expected = df_expected.astype(f"M8[{unit}]") - with tm.ensure_clean(ext) as filename2: - with ExcelWriter(path) as writer1: - df.to_excel(writer1, sheet_name="test1") + filename2 = tmp_path / f"tmp2{ext}" + filename2.touch() + with ExcelWriter(tmp_excel) as writer1: + df.to_excel(writer1, sheet_name="test1") + + with ExcelWriter( + filename2, + date_format="DD.MM.YYYY", + datetime_format="DD.MM.YYYY HH-MM-SS", + ) as writer2: + df.to_excel(writer2, sheet_name="test1") - with ExcelWriter( - filename2, - date_format="DD.MM.YYYY", - datetime_format="DD.MM.YYYY HH-MM-SS", - ) as writer2: - df.to_excel(writer2, sheet_name="test1") + with ExcelFile(tmp_excel) as reader1: + rs1 = pd.read_excel(reader1, sheet_name="test1", index_col=0) - with ExcelFile(path) as reader1: - rs1 = pd.read_excel(reader1, sheet_name="test1", index_col=0) + with ExcelFile(filename2) as reader2: + rs2 = pd.read_excel(reader2, sheet_name="test1", index_col=0) - with ExcelFile(filename2) as reader2: - rs2 = pd.read_excel(reader2, sheet_name="test1", index_col=0) + # TODO: why do we get different units? + rs2 = rs2.astype(f"M8[{unit}]") tm.assert_frame_equal(rs1, rs2) @@ -749,7 +800,10 @@ def test_excel_date_datetime_format(self, ext, path): # we need to use df_expected to check the result. tm.assert_frame_equal(rs2, df_expected) - def test_to_excel_interval_no_labels(self, path, using_infer_string): + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + def test_to_excel_interval_no_labels(self, tmp_excel, using_infer_string): # see gh-19242 # # Test writing Interval without labels. @@ -760,15 +814,15 @@ def test_to_excel_interval_no_labels(self, path, using_infer_string): df["new"] = pd.cut(df[0], 10) expected["new"] = pd.cut(expected[0], 10).astype( - str if not using_infer_string else "string[pyarrow_numpy]" + str if not using_infer_string else "str" ) - df.to_excel(path, sheet_name="test1") - with ExcelFile(path) as reader: + df.to_excel(tmp_excel, sheet_name="test1") + with ExcelFile(tmp_excel) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0) tm.assert_frame_equal(expected, recons) - def test_to_excel_interval_labels(self, path): + def test_to_excel_interval_labels(self, tmp_excel): # see gh-19242 # # Test writing Interval with labels. 
@@ -782,12 +836,12 @@ def test_to_excel_interval_labels(self, path): df["new"] = intervals expected["new"] = pd.Series(list(intervals)) - df.to_excel(path, sheet_name="test1") - with ExcelFile(path) as reader: + df.to_excel(tmp_excel, sheet_name="test1") + with ExcelFile(tmp_excel) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0) tm.assert_frame_equal(expected, recons) - def test_to_excel_timedelta(self, path): + def test_to_excel_timedelta(self, tmp_excel): # see gh-19242, gh-9155 # # Test writing timedelta to xls. @@ -803,12 +857,12 @@ def test_to_excel_timedelta(self, path): lambda x: timedelta(seconds=x).total_seconds() / 86400 ) - df.to_excel(path, sheet_name="test1") - with ExcelFile(path) as reader: + df.to_excel(tmp_excel, sheet_name="test1") + with ExcelFile(tmp_excel) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0) tm.assert_frame_equal(expected, recons) - def test_to_excel_periodindex(self, path): + def test_to_excel_periodindex(self, tmp_excel): # xp has a PeriodIndex df = DataFrame( np.random.default_rng(2).standard_normal((5, 4)), @@ -817,28 +871,28 @@ def test_to_excel_periodindex(self, path): ) xp = df.resample("ME").mean().to_period("M") - xp.to_excel(path, sheet_name="sht1") + xp.to_excel(tmp_excel, sheet_name="sht1") - with ExcelFile(path) as reader: + with ExcelFile(tmp_excel) as reader: rs = pd.read_excel(reader, sheet_name="sht1", index_col=0) tm.assert_frame_equal(xp, rs.to_period("M")) - def test_to_excel_multiindex(self, merge_cells, frame, path): + def test_to_excel_multiindex(self, merge_cells, frame, tmp_excel): arrays = np.arange(len(frame.index) * 2, dtype=np.int64).reshape(2, -1) new_index = MultiIndex.from_arrays(arrays, names=["first", "second"]) frame.index = new_index - frame.to_excel(path, sheet_name="test1", header=False) - frame.to_excel(path, sheet_name="test1", columns=["A", "B"]) + frame.to_excel(tmp_excel, sheet_name="test1", header=False) + frame.to_excel(tmp_excel, sheet_name="test1", columns=["A", "B"]) # round trip - frame.to_excel(path, sheet_name="test1", merge_cells=merge_cells) - with ExcelFile(path) as reader: + frame.to_excel(tmp_excel, sheet_name="test1", merge_cells=merge_cells) + with ExcelFile(tmp_excel) as reader: df = pd.read_excel(reader, sheet_name="test1", index_col=[0, 1]) tm.assert_frame_equal(frame, df) # GH13511 - def test_to_excel_multiindex_nan_label(self, merge_cells, path): + def test_to_excel_multiindex_nan_label(self, merge_cells, tmp_excel): df = DataFrame( { "A": [None, 2, 3], @@ -848,38 +902,60 @@ def test_to_excel_multiindex_nan_label(self, merge_cells, path): ) df = df.set_index(["A", "B"]) - df.to_excel(path, merge_cells=merge_cells) - df1 = pd.read_excel(path, index_col=[0, 1]) + df.to_excel(tmp_excel, merge_cells=merge_cells) + df1 = pd.read_excel(tmp_excel, index_col=[0, 1]) tm.assert_frame_equal(df, df1) # Test for Issue 11328. 
If column indices are integers, make # sure they are handled correctly for either setting of # merge_cells - def test_to_excel_multiindex_cols(self, merge_cells, frame, path): + def test_to_excel_multiindex_cols(self, merge_cells, tmp_excel): + # GH#11328 + frame = DataFrame( + { + "A": [1, 2, 3], + "B": [4, 5, 6], + "C": [7, 8, 9], + } + ) arrays = np.arange(len(frame.index) * 2, dtype=np.int64).reshape(2, -1) new_index = MultiIndex.from_arrays(arrays, names=["first", "second"]) frame.index = new_index - new_cols_index = MultiIndex.from_tuples([(40, 1), (40, 2), (50, 1), (50, 2)]) + new_cols_index = MultiIndex.from_tuples([(40, 1), (40, 2), (50, 1)]) frame.columns = new_cols_index - header = [0, 1] - if not merge_cells: - header = 0 + frame.to_excel(tmp_excel, sheet_name="test1", merge_cells=merge_cells) - # round trip - frame.to_excel(path, sheet_name="test1", merge_cells=merge_cells) - with ExcelFile(path) as reader: - df = pd.read_excel( - reader, sheet_name="test1", header=header, index_col=[0, 1] + # Check round trip + with ExcelFile(tmp_excel) as reader: + result = pd.read_excel( + reader, sheet_name="test1", header=[0, 1], index_col=[0, 1] + ) + tm.assert_frame_equal(result, frame) + + # GH#60274 + # Check with header/index_col None to determine which cells were merged + with ExcelFile(tmp_excel) as reader: + result = pd.read_excel( + reader, sheet_name="test1", header=None, index_col=None ) + expected = DataFrame( + { + 0: [np.nan, np.nan, "first", 0, 1, 2], + 1: [np.nan, np.nan, "second", 3, 4, 5], + 2: [40.0, 1.0, np.nan, 1.0, 2.0, 3.0], + 3: [np.nan, 2.0, np.nan, 4.0, 5.0, 6.0], + 4: [50.0, 1.0, np.nan, 7.0, 8.0, 9.0], + } + ) if not merge_cells: - fm = frame.columns._format_multi(sparsify=False, include_names=False) - frame.columns = [".".join(map(str, q)) for q in zip(*fm)] - tm.assert_frame_equal(frame, df) + # MultiIndex column value is repeated + expected.loc[0, 3] = 40.0 + tm.assert_frame_equal(result, expected) - def test_to_excel_multiindex_dates(self, merge_cells, path): + def test_to_excel_multiindex_dates(self, merge_cells, tmp_excel): # try multiindex with dates - unit = get_exp_unit(path) + unit = get_exp_unit(tmp_excel) tsframe = DataFrame( np.random.default_rng(2).standard_normal((5, 4)), columns=Index(list("ABCD")), @@ -893,14 +969,14 @@ def test_to_excel_multiindex_dates(self, merge_cells, path): names=["time", "foo"], ) - tsframe.to_excel(path, sheet_name="test1", merge_cells=merge_cells) - with ExcelFile(path) as reader: + tsframe.to_excel(tmp_excel, sheet_name="test1", merge_cells=merge_cells) + with ExcelFile(tmp_excel) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=[0, 1]) tm.assert_frame_equal(tsframe, recons) assert recons.index.names == ("time", "foo") - def test_to_excel_multiindex_no_write_index(self, path): + def test_to_excel_multiindex_no_write_index(self, tmp_excel): # Test writing and re-reading a MI without the index. GH 5616. # Initial non-MI frame. @@ -912,37 +988,48 @@ def test_to_excel_multiindex_no_write_index(self, path): frame2.index = multi_index # Write out to Excel without the index. - frame2.to_excel(path, sheet_name="test1", index=False) + frame2.to_excel(tmp_excel, sheet_name="test1", index=False) # Read it back in. - with ExcelFile(path) as reader: + with ExcelFile(tmp_excel) as reader: frame3 = pd.read_excel(reader, sheet_name="test1") # Test that it is the same as the initial frame. 
tm.assert_frame_equal(frame1, frame3) - def test_to_excel_empty_multiindex(self, path): + def test_to_excel_empty_multiindex(self, tmp_excel): # GH 19543. expected = DataFrame([], columns=[0, 1, 2]) df = DataFrame([], index=MultiIndex.from_tuples([], names=[0, 1]), columns=[2]) - df.to_excel(path, sheet_name="test1") + df.to_excel(tmp_excel, sheet_name="test1") - with ExcelFile(path) as reader: + with ExcelFile(tmp_excel) as reader: result = pd.read_excel(reader, sheet_name="test1") tm.assert_frame_equal( result, expected, check_index_type=False, check_dtype=False ) - def test_to_excel_float_format(self, path): + def test_to_excel_empty_multiindex_both_axes(self, tmp_excel): + # GH 57696 + df = DataFrame( + [], + index=MultiIndex.from_tuples([], names=[0, 1]), + columns=MultiIndex.from_tuples([("A", "B")]), + ) + df.to_excel(tmp_excel) + result = pd.read_excel(tmp_excel, header=[0, 1], index_col=[0, 1]) + tm.assert_frame_equal(result, df) + + def test_to_excel_float_format(self, tmp_excel): df = DataFrame( [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], index=["A", "B"], columns=["X", "Y", "Z"], ) - df.to_excel(path, sheet_name="test1", float_format="%.2f") + df.to_excel(tmp_excel, sheet_name="test1", float_format="%.2f") - with ExcelFile(path) as reader: + with ExcelFile(tmp_excel) as reader: result = pd.read_excel(reader, sheet_name="test1", index_col=0) expected = DataFrame( @@ -952,7 +1039,37 @@ def test_to_excel_float_format(self, path): ) tm.assert_frame_equal(result, expected) - def test_to_excel_output_encoding(self, ext): + def test_to_excel_datatypes_preserved(self, tmp_excel): + # Test that when writing and reading Excel with dtype=object, + # datatypes are preserved, except Decimals which should be + # stored as floats + + # see gh-49598 + df = DataFrame( + [ + [1.23, "1.23", Decimal("1.23")], + [4.56, "4.56", Decimal("4.56")], + ], + index=["A", "B"], + columns=["X", "Y", "Z"], + ) + df.to_excel(tmp_excel) + + with ExcelFile(tmp_excel) as reader: + result = pd.read_excel(reader, index_col=0, dtype=object) + + expected = DataFrame( + [ + [1.23, "1.23", 1.23], + [4.56, "4.56", 4.56], + ], + index=["A", "B"], + columns=["X", "Y", "Z"], + dtype=object, + ) + tm.assert_frame_equal(result, expected) + + def test_to_excel_output_encoding(self, tmp_excel): # Avoid mixed inferred_type. df = DataFrame( [["\u0192", "\u0193", "\u0194"], ["\u0195", "\u0196", "\u0197"]], @@ -960,28 +1077,22 @@ def test_to_excel_output_encoding(self, ext): columns=["X\u0193", "Y", "Z"], ) - with tm.ensure_clean("__tmp_to_excel_float_format__." + ext) as filename: - df.to_excel(filename, sheet_name="TestSheet") - result = pd.read_excel(filename, sheet_name="TestSheet", index_col=0) - tm.assert_frame_equal(result, df) - - def test_to_excel_unicode_filename(self, ext): - with tm.ensure_clean("\u0192u." 
+ ext) as filename: - try: - with open(filename, "wb"): - pass - except UnicodeEncodeError: - pytest.skip("No unicode file names on this system") + df.to_excel(tmp_excel, sheet_name="TestSheet") + result = pd.read_excel(tmp_excel, sheet_name="TestSheet", index_col=0) + tm.assert_frame_equal(result, df) - df = DataFrame( - [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], - index=["A", "B"], - columns=["X", "Y", "Z"], - ) - df.to_excel(filename, sheet_name="test1", float_format="%.2f") + def test_to_excel_unicode_filename(self, ext, tmp_path): + filename = tmp_path / f"\u0192u.{ext}" + filename.touch() + df = DataFrame( + [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], + index=["A", "B"], + columns=["X", "Y", "Z"], + ) + df.to_excel(filename, sheet_name="test1", float_format="%.2f") - with ExcelFile(filename) as reader: - result = pd.read_excel(reader, sheet_name="test1", index_col=0) + with ExcelFile(filename) as reader: + result = pd.read_excel(reader, sheet_name="test1", index_col=0) expected = DataFrame( [[0.12, 0.23, 0.57], [12.32, 123123.20, 321321.20]], @@ -994,12 +1105,14 @@ def test_to_excel_unicode_filename(self, ext): @pytest.mark.parametrize("r_idx_nlevels", [1, 2, 3]) @pytest.mark.parametrize("c_idx_nlevels", [1, 2, 3]) def test_excel_010_hemstring( - self, merge_cells, c_idx_nlevels, r_idx_nlevels, use_headers, path + self, merge_cells, c_idx_nlevels, r_idx_nlevels, use_headers, tmp_excel ): def roundtrip(data, header=True, parser_hdr=0, index=True): - data.to_excel(path, header=header, merge_cells=merge_cells, index=index) + data.to_excel( + tmp_excel, header=header, merge_cells=merge_cells, index=index + ) - with ExcelFile(path) as xf: + with ExcelFile(tmp_excel) as xf: return pd.read_excel( xf, sheet_name=xf.sheet_names[0], header=parser_hdr ) @@ -1062,56 +1175,56 @@ def roundtrip(data, header=True, parser_hdr=0, index=True): for c in range(len(res.columns)): assert res.iloc[r, c] is not np.nan - def test_duplicated_columns(self, path): + def test_duplicated_columns(self, tmp_excel): # see gh-5235 df = DataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3]], columns=["A", "B", "B"]) - df.to_excel(path, sheet_name="test1") + df.to_excel(tmp_excel, sheet_name="test1") expected = DataFrame( [[1, 2, 3], [1, 2, 3], [1, 2, 3]], columns=["A", "B", "B.1"] ) # By default, we mangle. - result = pd.read_excel(path, sheet_name="test1", index_col=0) + result = pd.read_excel(tmp_excel, sheet_name="test1", index_col=0) tm.assert_frame_equal(result, expected) # see gh-11007, gh-10970 df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "A", "B"]) - df.to_excel(path, sheet_name="test1") + df.to_excel(tmp_excel, sheet_name="test1") - result = pd.read_excel(path, sheet_name="test1", index_col=0) + result = pd.read_excel(tmp_excel, sheet_name="test1", index_col=0) expected = DataFrame( [[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "A.1", "B.1"] ) tm.assert_frame_equal(result, expected) # see gh-10982 - df.to_excel(path, sheet_name="test1", index=False, header=False) - result = pd.read_excel(path, sheet_name="test1", header=None) + df.to_excel(tmp_excel, sheet_name="test1", index=False, header=False) + result = pd.read_excel(tmp_excel, sheet_name="test1", header=None) expected = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]]) tm.assert_frame_equal(result, expected) - def test_swapped_columns(self, path): + def test_swapped_columns(self, tmp_excel): # Test for issue #5427. 
write_frame = DataFrame({"A": [1, 1, 1], "B": [2, 2, 2]}) - write_frame.to_excel(path, sheet_name="test1", columns=["B", "A"]) + write_frame.to_excel(tmp_excel, sheet_name="test1", columns=["B", "A"]) - read_frame = pd.read_excel(path, sheet_name="test1", header=0) + read_frame = pd.read_excel(tmp_excel, sheet_name="test1", header=0) tm.assert_series_equal(write_frame["A"], read_frame["A"]) tm.assert_series_equal(write_frame["B"], read_frame["B"]) - def test_invalid_columns(self, path): + def test_invalid_columns(self, tmp_excel): # see gh-10982 write_frame = DataFrame({"A": [1, 1, 1], "B": [2, 2, 2]}) with pytest.raises(KeyError, match="Not all names specified"): - write_frame.to_excel(path, sheet_name="test1", columns=["B", "C"]) + write_frame.to_excel(tmp_excel, sheet_name="test1", columns=["B", "C"]) with pytest.raises( KeyError, match="'passes columns are not ALL present dataframe'" ): - write_frame.to_excel(path, sheet_name="test1", columns=["C", "D"]) + write_frame.to_excel(tmp_excel, sheet_name="test1", columns=["C", "D"]) @pytest.mark.parametrize( "to_excel_index,read_excel_index_col", @@ -1120,81 +1233,88 @@ def test_invalid_columns(self, path): (False, None), # Dont include index in write to file ], ) - def test_write_subset_columns(self, path, to_excel_index, read_excel_index_col): + def test_write_subset_columns( + self, tmp_excel, to_excel_index, read_excel_index_col + ): # GH 31677 write_frame = DataFrame({"A": [1, 1, 1], "B": [2, 2, 2], "C": [3, 3, 3]}) write_frame.to_excel( - path, sheet_name="col_subset_bug", columns=["A", "B"], index=to_excel_index + tmp_excel, + sheet_name="col_subset_bug", + columns=["A", "B"], + index=to_excel_index, ) expected = write_frame[["A", "B"]] read_frame = pd.read_excel( - path, sheet_name="col_subset_bug", index_col=read_excel_index_col + tmp_excel, sheet_name="col_subset_bug", index_col=read_excel_index_col ) tm.assert_frame_equal(expected, read_frame) - def test_comment_arg(self, path): + def test_comment_arg(self, tmp_excel): # see gh-18735 # # Test the comment argument functionality to pd.read_excel. # Create file to read in. df = DataFrame({"A": ["one", "#one", "one"], "B": ["two", "two", "#two"]}) - df.to_excel(path, sheet_name="test_c") + df.to_excel(tmp_excel, sheet_name="test_c") # Read file without comment arg. 
- result1 = pd.read_excel(path, sheet_name="test_c", index_col=0) + result1 = pd.read_excel(tmp_excel, sheet_name="test_c", index_col=0) result1.iloc[1, 0] = None result1.iloc[1, 1] = None result1.iloc[2, 1] = None - result2 = pd.read_excel(path, sheet_name="test_c", comment="#", index_col=0) + result2 = pd.read_excel( + tmp_excel, sheet_name="test_c", comment="#", index_col=0 + ) tm.assert_frame_equal(result1, result2) - def test_comment_default(self, path): + def test_comment_default(self, tmp_excel): # Re issue #18735 # Test the comment argument default to pd.read_excel # Create file to read in df = DataFrame({"A": ["one", "#one", "one"], "B": ["two", "two", "#two"]}) - df.to_excel(path, sheet_name="test_c") + df.to_excel(tmp_excel, sheet_name="test_c") # Read file with default and explicit comment=None - result1 = pd.read_excel(path, sheet_name="test_c") - result2 = pd.read_excel(path, sheet_name="test_c", comment=None) + result1 = pd.read_excel(tmp_excel, sheet_name="test_c") + result2 = pd.read_excel(tmp_excel, sheet_name="test_c", comment=None) tm.assert_frame_equal(result1, result2) - def test_comment_used(self, path): + def test_comment_used(self, tmp_excel): # see gh-18735 # # Test the comment argument is working as expected when used. # Create file to read in. df = DataFrame({"A": ["one", "#one", "one"], "B": ["two", "two", "#two"]}) - df.to_excel(path, sheet_name="test_c") + df.to_excel(tmp_excel, sheet_name="test_c") # Test read_frame_comment against manually produced expected output. expected = DataFrame({"A": ["one", None, "one"], "B": ["two", None, None]}) - result = pd.read_excel(path, sheet_name="test_c", comment="#", index_col=0) + result = pd.read_excel(tmp_excel, sheet_name="test_c", comment="#", index_col=0) tm.assert_frame_equal(result, expected) - def test_comment_empty_line(self, path): + def test_comment_empty_line(self, tmp_excel): # Re issue #18735 # Test that pd.read_excel ignores commented lines at the end of file df = DataFrame({"a": ["1", "#2"], "b": ["2", "3"]}) - df.to_excel(path, index=False) + df.to_excel(tmp_excel, index=False) # Test that all-comment lines at EoF are ignored expected = DataFrame({"a": [1], "b": [2]}) - result = pd.read_excel(path, comment="#") + result = pd.read_excel(tmp_excel, comment="#") tm.assert_frame_equal(result, expected) - def test_datetimes(self, path): + def test_datetimes(self, tmp_excel): # Test writing and reading datetimes. For issue #9139. 
(xref #9185) - unit = get_exp_unit(path) + unit = get_exp_unit(tmp_excel) datetimes = [ datetime(2013, 1, 13, 1, 2, 3), datetime(2013, 1, 13, 2, 45, 56), @@ -1210,8 +1330,8 @@ def test_datetimes(self, path): ] write_frame = DataFrame({"A": datetimes}) - write_frame.to_excel(path, sheet_name="Sheet1") - read_frame = pd.read_excel(path, sheet_name="Sheet1", header=0) + write_frame.to_excel(tmp_excel, sheet_name="Sheet1") + read_frame = pd.read_excel(tmp_excel, sheet_name="Sheet1", header=0) expected = write_frame.astype(f"M8[{unit}]") tm.assert_series_equal(expected["A"], read_frame["A"]) @@ -1229,7 +1349,7 @@ def test_bytes_io(self, engine): reread_df = pd.read_excel(bio, index_col=0) tm.assert_frame_equal(df, reread_df) - def test_engine_kwargs(self, engine, path): + def test_engine_kwargs(self, engine, tmp_excel): # GH#52368 df = DataFrame([{"A": 1, "B": 2}, {"A": 3, "B": 4}]) @@ -1240,28 +1360,27 @@ def test_engine_kwargs(self, engine, path): "xlsxwriter": r"__init__() got an unexpected keyword argument 'foo'", } - if PY310: - msgs[ - "openpyxl" - ] = "Workbook.__init__() got an unexpected keyword argument 'foo'" - msgs[ - "xlsxwriter" - ] = "Workbook.__init__() got an unexpected keyword argument 'foo'" + msgs["openpyxl"] = ( + "Workbook.__init__() got an unexpected keyword argument 'foo'" + ) + msgs["xlsxwriter"] = ( + "Workbook.__init__() got an unexpected keyword argument 'foo'" + ) # Handle change in error message for openpyxl (write and append mode) - if engine == "openpyxl" and not os.path.exists(path): - msgs[ - "openpyxl" - ] = r"load_workbook() got an unexpected keyword argument 'foo'" + if engine == "openpyxl" and not os.path.exists(tmp_excel): + msgs["openpyxl"] = ( + r"load_workbook() got an unexpected keyword argument 'foo'" + ) with pytest.raises(TypeError, match=re.escape(msgs[engine])): df.to_excel( - path, + tmp_excel, engine=engine, engine_kwargs={"foo": "bar"}, ) - def test_write_lists_dict(self, path): + def test_write_lists_dict(self, tmp_excel): # see gh-8188. 
df = DataFrame( { @@ -1270,8 +1389,8 @@ def test_write_lists_dict(self, path): "str": ["apple", "banana", "cherry"], } ) - df.to_excel(path, sheet_name="Sheet1") - read = pd.read_excel(path, sheet_name="Sheet1", header=0, index_col=0) + df.to_excel(tmp_excel, sheet_name="Sheet1") + read = pd.read_excel(tmp_excel, sheet_name="Sheet1", header=0, index_col=0) expected = df.copy() expected.mixed = expected.mixed.apply(str) @@ -1279,47 +1398,35 @@ def test_write_lists_dict(self, path): tm.assert_frame_equal(read, expected) - def test_render_as_column_name(self, path): + def test_render_as_column_name(self, tmp_excel): # see gh-34331 df = DataFrame({"render": [1, 2], "data": [3, 4]}) - df.to_excel(path, sheet_name="Sheet1") - read = pd.read_excel(path, "Sheet1", index_col=0) + df.to_excel(tmp_excel, sheet_name="Sheet1") + read = pd.read_excel(tmp_excel, "Sheet1", index_col=0) expected = df tm.assert_frame_equal(read, expected) - def test_true_and_false_value_options(self, path): + def test_true_and_false_value_options(self, tmp_excel): # see gh-13347 df = DataFrame([["foo", "bar"]], columns=["col1", "col2"], dtype=object) with option_context("future.no_silent_downcasting", True): expected = df.replace({"foo": True, "bar": False}).astype("bool") - df.to_excel(path) + df.to_excel(tmp_excel) read_frame = pd.read_excel( - path, true_values=["foo"], false_values=["bar"], index_col=0 + tmp_excel, true_values=["foo"], false_values=["bar"], index_col=0 ) tm.assert_frame_equal(read_frame, expected) - def test_freeze_panes(self, path): + def test_freeze_panes(self, tmp_excel): # see gh-15160 expected = DataFrame([[1, 2], [3, 4]], columns=["col1", "col2"]) - expected.to_excel(path, sheet_name="Sheet1", freeze_panes=(1, 1)) + expected.to_excel(tmp_excel, sheet_name="Sheet1", freeze_panes=(1, 1)) - result = pd.read_excel(path, index_col=0) + result = pd.read_excel(tmp_excel, index_col=0) tm.assert_frame_equal(result, expected) def test_path_path_lib(self, engine, ext): - df = DataFrame( - 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD")), - index=Index([f"i-{i}" for i in range(30)], dtype=object), - ) - writer = partial(df.to_excel, engine=engine) - - reader = partial(pd.read_excel, index_col=0) - result = tm.round_trip_pathlib(writer, reader, path=f"foo{ext}") - tm.assert_frame_equal(result, df) - - def test_path_local_path(self, engine, ext): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), columns=Index(list("ABCD")), @@ -1328,10 +1435,10 @@ def test_path_local_path(self, engine, ext): writer = partial(df.to_excel, engine=engine) reader = partial(pd.read_excel, index_col=0) - result = tm.round_trip_localpath(writer, reader, path=f"foo{ext}") + result = tm.round_trip_pathlib(writer, reader, path=f"foo{ext}") tm.assert_frame_equal(result, df) - def test_merged_cell_custom_objects(self, path): + def test_merged_cell_custom_objects(self, tmp_excel): # see GH-27006 mi = MultiIndex.from_tuples( [ @@ -1340,8 +1447,8 @@ def test_merged_cell_custom_objects(self, path): ] ) expected = DataFrame(np.ones((2, 2), dtype="int64"), columns=mi) - expected.to_excel(path) - result = pd.read_excel(path, header=[0, 1], index_col=0) + expected.to_excel(tmp_excel) + result = pd.read_excel(tmp_excel, header=[0, 1], index_col=0) # need to convert PeriodIndexes to standard Indexes for assert equal expected.columns = expected.columns.set_levels( [[str(i) for i in mi.levels[0]], [str(i) for i in mi.levels[1]]], @@ -1350,52 +1457,59 @@ def test_merged_cell_custom_objects(self, path): 
tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype", [None, object]) - def test_raise_when_saving_timezones(self, dtype, tz_aware_fixture, path): + def test_raise_when_saving_timezones(self, dtype, tz_aware_fixture, tmp_excel): # GH 27008, GH 7056 tz = tz_aware_fixture data = pd.Timestamp("2019", tz=tz) df = DataFrame([data], dtype=dtype) with pytest.raises(ValueError, match="Excel does not support"): - df.to_excel(path) + df.to_excel(tmp_excel) data = data.to_pydatetime() df = DataFrame([data], dtype=dtype) with pytest.raises(ValueError, match="Excel does not support"): - df.to_excel(path) + df.to_excel(tmp_excel) - def test_excel_duplicate_columns_with_names(self, path): + def test_excel_duplicate_columns_with_names(self, tmp_excel): # GH#39695 df = DataFrame({"A": [0, 1], "B": [10, 11]}) - df.to_excel(path, columns=["A", "B", "A"], index=False) + df.to_excel(tmp_excel, columns=["A", "B", "A"], index=False) - result = pd.read_excel(path) + result = pd.read_excel(tmp_excel) expected = DataFrame([[0, 10, 0], [1, 11, 1]], columns=["A", "B", "A.1"]) tm.assert_frame_equal(result, expected) - def test_if_sheet_exists_raises(self, ext): + def test_if_sheet_exists_raises(self, tmp_excel): # GH 40230 msg = "if_sheet_exists is only valid in append mode (mode='a')" - with tm.ensure_clean(ext) as f: - with pytest.raises(ValueError, match=re.escape(msg)): - ExcelWriter(f, if_sheet_exists="replace") + with pytest.raises(ValueError, match=re.escape(msg)): + ExcelWriter(tmp_excel, if_sheet_exists="replace") - def test_excel_writer_empty_frame(self, engine, ext): + def test_excel_writer_empty_frame(self, engine, tmp_excel): # GH#45793 - with tm.ensure_clean(ext) as path: - with ExcelWriter(path, engine=engine) as writer: - DataFrame().to_excel(writer) - result = pd.read_excel(path) - expected = DataFrame() - tm.assert_frame_equal(result, expected) - - def test_to_excel_empty_frame(self, engine, ext): + with ExcelWriter(tmp_excel, engine=engine) as writer: + DataFrame().to_excel(writer) + result = pd.read_excel(tmp_excel) + expected = DataFrame() + tm.assert_frame_equal(result, expected) + + def test_to_excel_empty_frame(self, engine, tmp_excel): # GH#45793 - with tm.ensure_clean(ext) as path: - DataFrame().to_excel(path, engine=engine) - result = pd.read_excel(path) - expected = DataFrame() - tm.assert_frame_equal(result, expected) + DataFrame().to_excel(tmp_excel, engine=engine) + result = pd.read_excel(tmp_excel) + expected = DataFrame() + tm.assert_frame_equal(result, expected) + + def test_to_excel_raising_warning_when_cell_character_exceed_limit(self): + # GH#56954 + df = DataFrame({"A": ["a" * 32768]}) + msg = r"Cell contents too long \(32768\), truncated to 32767 characters" + with tm.assert_produces_warning( + UserWarning, match=msg, raise_on_extra_warnings=False + ): + buf = BytesIO() + df.to_excel(buf) class TestExcelWriterEngineTests: @@ -1406,22 +1520,21 @@ class TestExcelWriterEngineTests: pytest.param(_OpenpyxlWriter, ".xlsx", marks=td.skip_if_no("openpyxl")), ], ) - def test_ExcelWriter_dispatch(self, klass, ext): - with tm.ensure_clean(ext) as path: - with ExcelWriter(path) as writer: - if ext == ".xlsx" and bool( - import_optional_dependency("xlsxwriter", errors="ignore") - ): - # xlsxwriter has preference over openpyxl if both installed - assert isinstance(writer, _XlsxWriter) - else: - assert isinstance(writer, klass) + def test_ExcelWriter_dispatch(self, klass, ext, tmp_excel): + with ExcelWriter(tmp_excel) as writer: + if ext == ".xlsx" and bool( + 
import_optional_dependency("xlsxwriter", errors="ignore") + ): + # xlsxwriter has preference over openpyxl if both installed + assert isinstance(writer, _XlsxWriter) + else: + assert isinstance(writer, klass) def test_ExcelWriter_dispatch_raises(self): with pytest.raises(ValueError, match="No engine"): ExcelWriter("nothing") - def test_register_writer(self): + def test_register_writer(self, tmp_path): class DummyClass(ExcelWriter): called_save = False called_write_cells = False @@ -1453,50 +1566,41 @@ def assert_called_and_reset(cls): register_writer(DummyClass) with option_context("io.excel.xlsx.writer", "dummy"): - path = "something.xlsx" - with tm.ensure_clean(path) as filepath: - with ExcelWriter(filepath) as writer: - assert isinstance(writer, DummyClass) - df = DataFrame( - ["a"], - columns=Index(["b"], name="foo"), - index=Index(["c"], name="bar"), - ) - df.to_excel(filepath) + filepath = tmp_path / "something.xlsx" + filepath.touch() + with ExcelWriter(filepath) as writer: + assert isinstance(writer, DummyClass) + df = DataFrame( + ["a"], + columns=Index(["b"], name="foo"), + index=Index(["c"], name="bar"), + ) + df.to_excel(filepath) DummyClass.assert_called_and_reset() - with tm.ensure_clean("something.xls") as filepath: - df.to_excel(filepath, engine="dummy") + filepath2 = tmp_path / "something2.xlsx" + filepath2.touch() + df.to_excel(filepath2, engine="dummy") DummyClass.assert_called_and_reset() @td.skip_if_no("xlrd") @td.skip_if_no("openpyxl") class TestFSPath: - def test_excelfile_fspath(self): - with tm.ensure_clean("foo.xlsx") as path: - df = DataFrame({"A": [1, 2]}) - df.to_excel(path) - with ExcelFile(path) as xl: - result = os.fspath(xl) - assert result == path - - def test_excelwriter_fspath(self): - with tm.ensure_clean("foo.xlsx") as path: - with ExcelWriter(path) as writer: - assert os.fspath(writer) == str(path) - - def test_to_excel_pos_args_deprecation(self): - # GH-54229 - df = DataFrame({"a": [1, 2, 3]}) - msg = ( - r"Starting with pandas version 3.0 all arguments of to_excel except " - r"for the argument 'excel_writer' will be keyword-only." - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - buf = BytesIO() - writer = ExcelWriter(buf) - df.to_excel(writer, "Sheet_name_1") + def test_excelfile_fspath(self, tmp_path): + path = tmp_path / "foo.xlsx" + path.touch() + df = DataFrame({"A": [1, 2]}) + df.to_excel(path) + with ExcelFile(path) as xl: + result = os.fspath(xl) + assert result == str(path) + + def test_excelwriter_fspath(self, tmp_path): + path = tmp_path / "foo.xlsx" + path.touch() + with ExcelWriter(path) as writer: + assert os.fspath(writer) == str(path) @pytest.mark.parametrize("klass", _writers.values()) diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index 6d5008ca9ee68..c49cbaf7fb26c 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -12,14 +12,14 @@ xlrd = pytest.importorskip("xlrd") -@pytest.fixture(params=[".xls"]) -def read_ext_xlrd(request): +@pytest.fixture +def read_ext_xlrd(): """ Valid extensions for reading Excel files with xlrd. 
Similar to read_ext, but excludes .ods, .xlsb, and for xlrd>2 .xlsx, .xlsm """ - return request.param + return ".xls" def test_read_xlrd_book(read_ext_xlrd, datapath): diff --git a/pandas/tests/io/excel/test_xlsxwriter.py b/pandas/tests/io/excel/test_xlsxwriter.py index 94f6bdfaf069c..b2e6c845e5019 100644 --- a/pandas/tests/io/excel/test_xlsxwriter.py +++ b/pandas/tests/io/excel/test_xlsxwriter.py @@ -1,9 +1,9 @@ import contextlib +import uuid import pytest from pandas import DataFrame -import pandas._testing as tm from pandas.io.excel import ExcelWriter @@ -15,67 +15,72 @@ def ext(): return ".xlsx" -def test_column_format(ext): +@pytest.fixture +def tmp_excel(ext, tmp_path): + tmp = tmp_path / f"{uuid.uuid4()}{ext}" + tmp.touch() + return str(tmp) + + +def test_column_format(tmp_excel): # Test that column formats are applied to cells. Test for issue #9167. # Applicable to xlsxwriter only. openpyxl = pytest.importorskip("openpyxl") - with tm.ensure_clean(ext) as path: - frame = DataFrame({"A": [123456, 123456], "B": [123456, 123456]}) + frame = DataFrame({"A": [123456, 123456], "B": [123456, 123456]}) - with ExcelWriter(path) as writer: - frame.to_excel(writer) + with ExcelWriter(tmp_excel) as writer: + frame.to_excel(writer) - # Add a number format to col B and ensure it is applied to cells. - num_format = "#,##0" - write_workbook = writer.book - write_worksheet = write_workbook.worksheets()[0] - col_format = write_workbook.add_format({"num_format": num_format}) - write_worksheet.set_column("B:B", None, col_format) + # Add a number format to col B and ensure it is applied to cells. + num_format = "#,##0" + write_workbook = writer.book + write_worksheet = write_workbook.worksheets()[0] + col_format = write_workbook.add_format({"num_format": num_format}) + write_worksheet.set_column("B:B", None, col_format) - with contextlib.closing(openpyxl.load_workbook(path)) as read_workbook: - try: - read_worksheet = read_workbook["Sheet1"] - except TypeError: - # compat - read_worksheet = read_workbook.get_sheet_by_name(name="Sheet1") - - # Get the number format from the cell. + with contextlib.closing(openpyxl.load_workbook(tmp_excel)) as read_workbook: try: - cell = read_worksheet["B2"] + read_worksheet = read_workbook["Sheet1"] except TypeError: # compat - cell = read_worksheet.cell("B2") + read_worksheet = read_workbook.get_sheet_by_name(name="Sheet1") - try: - read_num_format = cell.number_format - except AttributeError: - read_num_format = cell.style.number_format._format_code + # Get the number format from the cell. + try: + cell = read_worksheet["B2"] + except TypeError: + # compat + cell = read_worksheet.cell("B2") + + try: + read_num_format = cell.number_format + except AttributeError: + read_num_format = cell.style.number_format._format_code - assert read_num_format == num_format + assert read_num_format == num_format -def test_write_append_mode_raises(ext): +def test_write_append_mode_raises(tmp_excel): msg = "Append mode is not supported with xlsxwriter!" 
- with tm.ensure_clean(ext) as f: - with pytest.raises(ValueError, match=msg): - ExcelWriter(f, engine="xlsxwriter", mode="a") + with pytest.raises(ValueError, match=msg): + ExcelWriter(tmp_excel, engine="xlsxwriter", mode="a") @pytest.mark.parametrize("nan_inf_to_errors", [True, False]) -def test_engine_kwargs(ext, nan_inf_to_errors): +def test_engine_kwargs(tmp_excel, nan_inf_to_errors): # GH 42286 engine_kwargs = {"options": {"nan_inf_to_errors": nan_inf_to_errors}} - with tm.ensure_clean(ext) as f: - with ExcelWriter(f, engine="xlsxwriter", engine_kwargs=engine_kwargs) as writer: - assert writer.book.nan_inf_to_errors == nan_inf_to_errors + with ExcelWriter( + tmp_excel, engine="xlsxwriter", engine_kwargs=engine_kwargs + ) as writer: + assert writer.book.nan_inf_to_errors == nan_inf_to_errors -def test_book_and_sheets_consistent(ext): +def test_book_and_sheets_consistent(tmp_excel): # GH#45687 - Ensure sheets is updated if user modifies book - with tm.ensure_clean(ext) as f: - with ExcelWriter(f, engine="xlsxwriter") as writer: - assert writer.sheets == {} - sheet = writer.book.add_worksheet("test_name") - assert writer.sheets == {"test_name": sheet} + with ExcelWriter(tmp_excel, engine="xlsxwriter") as writer: + assert writer.sheets == {} + sheet = writer.book.add_worksheet("test_name") + assert writer.sheets == {"test_name": sheet} diff --git a/pandas/tests/io/formats/style/test_bar.py b/pandas/tests/io/formats/style/test_bar.py index b0e4712e8bb3d..d28c7c566d851 100644 --- a/pandas/tests/io/formats/style/test_bar.py +++ b/pandas/tests/io/formats/style/test_bar.py @@ -347,6 +347,7 @@ def test_styler_bar_with_NA_values(): def test_style_bar_with_pyarrow_NA_values(): + pytest.importorskip("pyarrow") data = """name,age,test1,test2,teacher Adam,15,95.0,80,Ashby Bob,16,81.0,82,Ashby diff --git a/pandas/tests/io/formats/style/test_format.py b/pandas/tests/io/formats/style/test_format.py index 1c84816ead140..ae68fcf9ef1fc 100644 --- a/pandas/tests/io/formats/style/test_format.py +++ b/pandas/tests/io/formats/style/test_format.py @@ -32,10 +32,14 @@ def styler(df): @pytest.fixture def df_multi(): - return DataFrame( - data=np.arange(16).reshape(4, 4), - columns=MultiIndex.from_product([["A", "B"], ["a", "b"]]), - index=MultiIndex.from_product([["X", "Y"], ["x", "y"]]), + return ( + DataFrame( + data=np.arange(16).reshape(4, 4), + columns=MultiIndex.from_product([["A", "B"], ["a", "b"]]), + index=MultiIndex.from_product([["X", "Y"], ["x", "y"]]), + ) + .rename_axis(["0_0", "0_1"], axis=0) + .rename_axis(["1_0", "1_1"], axis=1) ) @@ -560,3 +564,98 @@ def test_relabel_roundtrip(styler): ctx = styler._translate(True, True) assert {"value": "x", "display_value": "x"}.items() <= ctx["body"][0][0].items() assert {"value": "y", "display_value": "y"}.items() <= ctx["body"][1][0].items() + + +@pytest.mark.parametrize("axis", [0, 1]) +@pytest.mark.parametrize( + "level, expected", + [ + (0, ["X", "one"]), # level int + ("zero", ["X", "one"]), # level name + (1, ["zero", "X"]), # other level int + ("one", ["zero", "X"]), # other level name + ([0, 1], ["X", "X"]), # both levels + ([0, "zero"], ["X", "one"]), # level int and name simultaneous + ([0, "one"], ["X", "X"]), # both levels as int and name + (["one", "zero"], ["X", "X"]), # both level names, reversed + ], +) +def test_format_index_names_level(axis, level, expected): + midx = MultiIndex.from_arrays([["_", "_"], ["_", "_"]], names=["zero", "one"]) + df = DataFrame([[1, 2], [3, 4]]) + if axis == 0: + df.index = midx + else: + df.columns = midx + + 
styler = df.style.format_index_names(lambda v: "X", level=level, axis=axis) + ctx = styler._translate(True, True) + + if axis == 0: # compare index + result = [ctx["head"][1][s]["display_value"] for s in range(2)] + else: # compare columns + result = [ctx["head"][s][0]["display_value"] for s in range(2)] + assert expected == result + + +@pytest.mark.parametrize( + "attr, kwargs", + [ + ("_display_funcs_index_names", {"axis": 0}), + ("_display_funcs_column_names", {"axis": 1}), + ], +) +def test_format_index_names_clear(styler, attr, kwargs): + assert 0 not in getattr(styler, attr) # using default + styler.format_index_names("{:.2f}", **kwargs) + assert 0 in getattr(styler, attr) # formatter is specified + styler.format_index_names(**kwargs) + assert 0 not in getattr(styler, attr) # formatter cleared to default + + +@pytest.mark.parametrize("axis", [0, 1]) +def test_format_index_names_callable(styler_multi, axis): + ctx = styler_multi.format_index_names( + lambda v: v.replace("_", "A"), axis=axis + )._translate(True, True) + result = [ + ctx["head"][2][0]["display_value"], + ctx["head"][2][1]["display_value"], + ctx["head"][0][1]["display_value"], + ctx["head"][1][1]["display_value"], + ] + if axis == 0: + expected = ["0A0", "0A1", "1_0", "1_1"] + else: + expected = ["0_0", "0_1", "1A0", "1A1"] + assert result == expected + + +def test_format_index_names_dict(styler_multi): + ctx = ( + styler_multi.format_index_names({"0_0": "{:<<5}"}) + .format_index_names({"1_1": "{:>>4}"}, axis=1) + ._translate(True, True) + ) + assert ctx["head"][2][0]["display_value"] == "0_0<<" + assert ctx["head"][1][1]["display_value"] == ">1_1" + + +def test_format_index_names_with_hidden_levels(styler_multi): + ctx = styler_multi._translate(True, True) + full_head_height = len(ctx["head"]) + full_head_width = len(ctx["head"][0]) + assert full_head_height == 3 + assert full_head_width == 6 + + ctx = ( + styler_multi.hide(axis=0, level=1) + .hide(axis=1, level=1) + .format_index_names("{:>>4}", axis=1) + .format_index_names("{:!<5}") + ._translate(True, True) + ) + assert len(ctx["head"]) == full_head_height - 1 + assert len(ctx["head"][0]) == full_head_width - 1 + assert ctx["head"][0][0]["display_value"] == ">1_0" + assert ctx["head"][1][0]["display_value"] == "0_0!!" 
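For readers skimming the hunk above: the new test_format_index_names_* cases pin down Styler.format_index_names, which formats the index/column *level names* rather than the data values or the index labels. Below is a minimal sketch of the behaviour these tests assert (hedged: it assumes a pandas build that ships Styler.format_index_names, which this PR's tests target; the frame mirrors the df_multi fixture above):

import numpy as np
import pandas as pd

idx = pd.MultiIndex.from_product([["X", "Y"], ["x", "y"]], names=["0_0", "0_1"])
cols = pd.MultiIndex.from_product([["A", "B"], ["a", "b"]], names=["1_0", "1_1"])
df = pd.DataFrame(np.arange(16).reshape(4, 4), index=idx, columns=cols)

# "{:!<5}" left-aligns and pads level names with "!" to width 5, so the row
# level name "0_0" renders as "0_0!!"; "{:>>4}" right-aligns padding with ">",
# so the column level name "1_0" renders as ">1_0" -- exactly the display
# values asserted in test_format_index_names_with_hidden_levels.
styler = df.style.format_index_names("{:!<5}", axis=0)
styler = styler.format_index_names("{:>>4}", axis=1)
html = styler.to_html()
assert "0_0!!" in html and ">1_0" in html
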
diff --git a/pandas/tests/io/formats/style/test_highlight.py b/pandas/tests/io/formats/style/test_highlight.py
index 3d59719010ee0..5d19e9c14d534 100644
--- a/pandas/tests/io/formats/style/test_highlight.py
+++ b/pandas/tests/io/formats/style/test_highlight.py
@@ -198,16 +198,17 @@ def test_highlight_quantile(styler, kwargs):
     ],
 )
 @pytest.mark.parametrize(
-    "df",
+    "dtype",
     [
-        DataFrame([[0, 10], [20, 30]], dtype=int),
-        DataFrame([[0, 10], [20, 30]], dtype=float),
-        DataFrame([[0, 10], [20, 30]], dtype="datetime64[ns]"),
-        DataFrame([[0, 10], [20, 30]], dtype=str),
-        DataFrame([[0, 10], [20, 30]], dtype="timedelta64[ns]"),
+        int,
+        float,
+        "datetime64[ns]",
+        str,
+        "timedelta64[ns]",
     ],
 )
-def test_all_highlight_dtypes(f, kwargs, df):
+def test_all_highlight_dtypes(f, kwargs, dtype):
+    df = DataFrame([[0, 10], [20, 30]], dtype=dtype)
     if f == "highlight_quantile" and isinstance(df.iloc[0, 0], (str)):
         return None  # quantile incompatible with str
     if f == "highlight_between":
diff --git a/pandas/tests/io/formats/style/test_html.py b/pandas/tests/io/formats/style/test_html.py
index 1e345eb82ed3c..752b9b391f9cb 100644
--- a/pandas/tests/io/formats/style/test_html.py
+++ b/pandas/tests/io/formats/style/test_html.py
@@ -34,6 +34,16 @@ def styler_mi():
     return Styler(DataFrame(np.arange(16).reshape(4, 4), index=midx, columns=midx))
 
 
+@pytest.fixture
+def styler_multi():
+    df = DataFrame(
+        data=np.arange(16).reshape(4, 4),
+        columns=MultiIndex.from_product([["A", "B"], ["a", "b"]], names=["A&", "b&"]),
+        index=MultiIndex.from_product([["X", "Y"], ["x", "y"]], names=["X>", "y_"]),
+    )
+    return Styler(df)
+
+
 @pytest.fixture
 def tpl_style(env):
     return env.get_template("html_style.tpl")
@@ -93,11 +103,7 @@ def test_w3_html_format(styler):
         lambda x: "att1:v1;"
     ).set_table_attributes('class="my-cls1" style="attr3:v3;"').set_td_classes(
         DataFrame(["my-cls2"], index=["a"], columns=["A"])
-    ).format(
-        "{:.1f}"
-    ).set_caption(
-        "A comprehensive test"
-    )
+    ).format("{:.1f}").set_caption("A comprehensive test")
     expected = dedent(
         """\
[... the expected HTML block, the rest of the test_html.py diff, and the header of
the test_tooltip.py diff were stripped as HTML during extraction; the text resumes
mid-assertion inside the test_tooltip.py hunk touching test_tooltip_render ...]
' in result
     assert '' not in result
+    assert 'title="' not in result
 
 
 def test_tooltip_css_class(styler):
@@ -83,3 +85,95 @@ def test_tooltip_css_class(styler):
         props="color:green;color:red;",
     ).to_html()
     assert "#T_ .another-class {\n color: green;\n color: red;\n}" in result
+
+
+@pytest.mark.parametrize(
+    "data, columns, index",
+    [
+        # Test basic reindex and ignoring blank
+        ([["Min", "Max"], [np.nan, ""]], ["A", "C"], ["x", "y"]),
+        # Test non-referenced columns, reversed col names, short index
+        ([["Max", "Min", "Bad-Col"]], ["C", "A", "D"], ["x"]),
+    ],
+)
+def test_tooltip_render_as_title(data, columns, index, styler):
+    ttips = DataFrame(data=data, columns=columns, index=index)
+    # GH 56605
+    result = styler.set_tooltips(ttips, as_title_attribute=True).to_html()
+
+    # test css not added
+    assert "#T_ .pd-t {\n visibility: hidden;\n" not in result
+
+    # test 'Min' tooltip added as title attribute and css does not exist
+    assert "#T_ #T__row0_col0:hover .pd-t {\n visibility: visible;\n}" not in result
+    assert '#T_ #T__row0_col0 .pd-t::after {\n content: "Min";\n}' not in result
+    assert 'class="data row0 col0" title="Min">0</td>' in result
[... the remainder of the test_tooltip.py diff and the header of the
test_to_html.py diff were lost in extraction ...]
         """
     )
     assert result == expected
-
-
-def test_to_html_pos_args_deprecation():
-    # GH-54229
-    df = DataFrame({"a": [1, 2, 3]})
-    msg = (
-        r"Starting with pandas version 3.0 all arguments of to_html except for the "
-        r"argument 'buf' will be keyword-only."
-    )
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        df.to_html(None, None)
diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py
index 1fd96dff27d06..ebc6ff5be108f 100644
--- a/pandas/tests/io/formats/test_to_latex.py
+++ b/pandas/tests/io/formats/test_to_latex.py
@@ -187,22 +187,6 @@ def test_to_latex_midrule_location(self):
         )
         assert result == expected
 
-    def test_to_latex_pos_args_deprecation(self):
-        # GH-54229
-        df = DataFrame(
-            {
-                "name": ["Raphael", "Donatello"],
-                "age": [26, 45],
-                "height": [181.23, 177.65],
-            }
-        )
-        msg = (
-            r"Starting with pandas version 3.0 all arguments of to_latex except for "
-            r"the argument 'buf' will be keyword-only."
-        )
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            df.to_latex(None, None)
-
 
 class TestToLatexLongtable:
     def test_to_latex_empty_longtable(self):
@@ -283,14 +267,15 @@ def test_to_latex_longtable_without_index(self):
         assert result == expected
 
     @pytest.mark.parametrize(
-        "df, expected_number",
+        "df_data, expected_number",
         [
-            (DataFrame({"a": [1, 2]}), 1),
-            (DataFrame({"a": [1, 2], "b": [3, 4]}), 2),
-            (DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}), 3),
+            ({"a": [1, 2]}, 1),
+            ({"a": [1, 2], "b": [3, 4]}, 2),
+            ({"a": [1, 2], "b": [3, 4], "c": [5, 6]}, 3),
         ],
     )
-    def test_to_latex_longtable_continued_on_next_page(self, df, expected_number):
+    def test_to_latex_longtable_continued_on_next_page(self, df_data, expected_number):
+        df = DataFrame(df_data)
         result = df.to_latex(index=False, longtable=True)
         assert rf"\multicolumn{{{expected_number}}}" in result
@@ -771,7 +756,7 @@ def df_with_symbols(self):
         """Dataframe with special characters for testing chars escaping."""
         a = "a"
         b = "b"
-        yield DataFrame({"co$e^x$": {a: "a", b: "b"}, "co^l1": {a: "a", b: "b"}})
+        return DataFrame({"co$e^x$": {a: "a", b: "b"}, "co^l1": {a: "a", b: "b"}})
 
     def test_to_latex_escape_false(self, df_with_symbols):
         result = df_with_symbols.to_latex(escape=False)
@@ -839,6 +824,46 @@ def test_to_latex_escape_special_chars(self):
         )
         assert result == expected
 
+    def test_to_latex_escape_special_chars_in_index_names(self):
+        # https://github.com/pandas-dev/pandas/issues/61309
+        # https://github.com/pandas-dev/pandas/issues/57362
+        index = "&%$#_{}}~^\\"
+        df = DataFrame({index: [1, 2, 3]}).set_index(index)
+        result = df.to_latex(escape=True)
+        expected = _dedent(
+            r"""
+            \begin{tabular}{l}
+            \toprule
+            \&\%\$\#\_\{\}\}\textasciitilde \textasciicircum \textbackslash \\
+            \midrule
+            1 \\
+            2 \\
+            3 \\
+            \bottomrule
+            \end{tabular}
+            """
+        )
+        assert result == expected
+
+    def test_to_latex_escape_special_chars_in_column_name(self):
+        df = DataFrame({"A": [1, 2, 3], "B": ["a", "b", "c"]})
+        df.columns.name = "_^~"
+        result = df.to_latex(escape=True)
+        expected = _dedent(
+            r"""
+            \begin{tabular}{lrl}
+            \toprule
+            \_\textasciicircum \textasciitilde & A & B \\
+            \midrule
+            0 & 1 & a \\
+            1 & 2 & b \\
+            2 & 3 & c \\
+            \bottomrule
+            \end{tabular}
+            """
+        )
+        assert result == expected
+
     def test_to_latex_specified_header_special_chars_without_escape(self):
         # GH 7124
         df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]})
@@ -1010,7 +1035,7 @@ class TestToLatexMultiindex:
     @pytest.fixture
     def multiindex_frame(self):
         """Multiindex dataframe for testing
multirow LaTeX macros.""" - yield DataFrame.from_dict( + return DataFrame.from_dict( { ("c1", 0): Series({x: x for x in range(4)}), ("c1", 1): Series({x: x + 4 for x in range(4)}), @@ -1023,7 +1048,7 @@ def multiindex_frame(self): @pytest.fixture def multicolumn_frame(self): """Multicolumn dataframe for testing multicolumn LaTeX macros.""" - yield DataFrame( + return DataFrame( { ("c1", 0): {x: x for x in range(5)}, ("c1", 1): {x: x + 5 for x in range(5)}, @@ -1326,7 +1351,6 @@ def test_to_latex_multiindex_names(self, name0, name1, axes): ) col_names = [n if (bool(n) and 1 in axes) else "" for n in names] observed = df.to_latex(multirow=False) - # pylint: disable-next=consider-using-f-string expected = r"""\begin{tabular}{llrrrr} \toprule & %s & \multicolumn{2}{r}{1} & \multicolumn{2}{r}{2} \\ @@ -1338,9 +1362,7 @@ def test_to_latex_multiindex_names(self, name0, name1, axes): & 4 & -1 & -1 & -1 & -1 \\ \bottomrule \end{tabular} -""" % tuple( - list(col_names) + [idx_names_row] - ) +""" % tuple(list(col_names) + [idx_names_row]) assert observed == expected @pytest.mark.parametrize("one_row", [True, False]) @@ -1423,3 +1445,88 @@ def test_to_latex_multiindex_multirow(self): """ ) assert result == expected + + def test_to_latex_multiindex_format_single_index_hidden(self): + # GH 52218 + df = DataFrame( + { + "A": [1, 2], + "B": [4, 5], + } + ) + result = ( + df.style.hide(axis="index") + .map_index(lambda v: "textbf:--rwrap;", axis="columns") + .to_latex() + ) + expected = _dedent(r""" + \begin{tabular}{rr} + \textbf{A} & \textbf{B} \\ + 1 & 4 \\ + 2 & 5 \\ + \end{tabular} + """) + assert result == expected + + def test_to_latex_multiindex_format_triple_index_two_hidden(self): + # GH 52218 + arrays = [ + ["A", "A", "B", "B"], + ["one", "two", "one", "two"], + ["x", "x", "y", "y"], + ] + index = pd.MultiIndex.from_arrays( + arrays, names=["Level 0", "Level 1", "Level 2"] + ) + df = DataFrame( + [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]], + index=index, + columns=["C1", "C2", "C3"], + ) + result = ( + df.style.hide(axis="index", level=[0, 1]) + .map_index(lambda v: "textbf:--rwrap;", axis="columns") + .to_latex() + ) + expected = _dedent(r""" + \begin{tabular}{lrrr} + & \textbf{C1} & \textbf{C2} & \textbf{C3} \\ + Level 2 & & & \\ + x & 0 & 0 & 0 \\ + x & 0 & 0 & 0 \\ + y & 0 & 0 & 0 \\ + y & 0 & 0 & 0 \\ + \end{tabular} + """) + assert result == expected + + def test_to_latex_multiindex_format_triple_index_all_hidden(self): + # GH 52218 + arrays = [ + ["A", "A", "B", "B"], + ["one", "two", "one", "two"], + ["x", "x", "y", "y"], + ] + index = pd.MultiIndex.from_arrays( + arrays, names=["Level 0", "Level 1", "Level 2"] + ) + df = DataFrame( + [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]], + index=index, + columns=["C1", "C2", "C3"], + ) + result = ( + df.style.hide(axis="index", level=[0, 1, 2]) + .map_index(lambda v: "textbf:--rwrap;", axis="columns") + .to_latex() + ) + expected = _dedent(r""" + \begin{tabular}{rrr} + \textbf{C1} & \textbf{C2} & \textbf{C3} \\ + 0 & 0 & 0 \\ + 0 & 0 & 0 \\ + 0 & 0 & 0 \\ + 0 & 0 & 0 \\ + \end{tabular} + """) + assert result == expected diff --git a/pandas/tests/io/formats/test_to_markdown.py b/pandas/tests/io/formats/test_to_markdown.py index 85eca834ff0d4..f3d9b88cc91e2 100644 --- a/pandas/tests/io/formats/test_to_markdown.py +++ b/pandas/tests/io/formats/test_to_markdown.py @@ -1,7 +1,4 @@ -from io import ( - BytesIO, - StringIO, -) +from io import StringIO import pytest @@ -11,6 +8,17 @@ pytest.importorskip("tabulate") +def test_keyword_deprecation(): + # 
GH 57280 + msg = ( + "Starting with pandas version 4.0 all arguments of to_markdown " + "except for the argument 'buf' will be keyword-only." + ) + s = pd.Series() + with tm.assert_produces_warning(FutureWarning, match=msg): + s.to_markdown(None, "wt") + + def test_simple(): buf = StringIO() df = pd.DataFrame([1, 2, 3]) @@ -27,8 +35,7 @@ def test_empty_frame(): df.to_markdown(buf=buf) result = buf.getvalue() assert result == ( - "| id | first_name | last_name |\n" - "|------|--------------|-------------|" + "| id | first_name | last_name |\n|------|--------------|-------------|" ) @@ -57,8 +64,7 @@ def test_series(): s.to_markdown(buf=buf) result = buf.getvalue() assert result == ( - "| | foo |\n|---:|------:|\n| 0 | 1 " - "|\n| 1 | 2 |\n| 2 | 3 |" + "| | foo |\n|---:|------:|\n| 0 | 1 |\n| 1 | 2 |\n| 2 | 3 |" ) @@ -92,15 +98,3 @@ def test_showindex_disallowed_in_kwargs(): df = pd.DataFrame([1, 2, 3]) with pytest.raises(ValueError, match="Pass 'index' instead of 'showindex"): df.to_markdown(index=True, showindex=True) - - -def test_markdown_pos_args_deprecatation(): - # GH-54229 - df = pd.DataFrame({"a": [1, 2, 3]}) - msg = ( - r"Starting with pandas version 3.0 all arguments of to_markdown except for the " - r"argument 'buf' will be keyword-only." - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - buffer = BytesIO() - df.to_markdown(buffer, "grid") diff --git a/pandas/tests/io/formats/test_to_string.py b/pandas/tests/io/formats/test_to_string.py index 2e5a5005cb076..0866581535c2f 100644 --- a/pandas/tests/io/formats/test_to_string.py +++ b/pandas/tests/io/formats/test_to_string.py @@ -10,7 +10,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas import ( CategoricalIndex, @@ -35,6 +35,16 @@ def _three_digit_exp(): class TestDataFrameToStringFormatters: + def test_keyword_deprecation(self): + # GH 57280 + msg = ( + "Starting with pandas version 4.0 all arguments of to_string " + "except for the argument 'buf' will be keyword-only." 
+ ) + s = Series(["a", "b"]) + with tm.assert_produces_warning(FutureWarning, match=msg): + s.to_string(None, "NaN") + def test_to_string_masked_ea_with_formatter(self): # GH#39336 df = DataFrame( @@ -122,20 +132,17 @@ def test_to_string_with_formatters_unicode(self): ) assert result == expected - def test_to_string_index_formatter(self): - df = DataFrame([range(5), range(5, 10), range(10, 15)]) - - rs = df.to_string(formatters={"__index__": lambda x: "abc"[x]}) - - xp = dedent( - """\ - 0 1 2 3 4 - a 0 1 2 3 4 - b 5 6 7 8 9 - c 10 11 12 13 14\ - """ - ) - assert rs == xp + def test_to_string_index_formatter(self): + df = DataFrame([range(5), range(5, 10), range(10, 15)]) + rs = df.to_string(formatters={"__index__": lambda x: "abc"[x]}) + xp = dedent( + """\ + 0 1 2 3 4 + a 0 1 2 3 4 + b 5 6 7 8 9 + c 10 11 12 13 14""" + ) + assert rs == xp def test_no_extra_space(self): # GH#52690: Check that no extra space is given @@ -370,17 +377,11 @@ def test_to_string_small_float_values(self): # sadness per above if _three_digit_exp(): expected = ( - " a\n" - "0 1.500000e+000\n" - "1 1.000000e-017\n" - "2 -5.500000e-007" + " a\n0 1.500000e+000\n1 1.000000e-017\n2 -5.500000e-007" ) else: expected = ( - " a\n" - "0 1.500000e+00\n" - "1 1.000000e-17\n" - "2 -5.500000e-07" + " a\n0 1.500000e+00\n1 1.000000e-17\n2 -5.500000e-07" ) assert result == expected @@ -412,6 +413,24 @@ def test_to_string_complex_float_formatting(self): ) assert result == expected + def test_to_string_complex_float_formatting_with_exponents(self): + # GH #60393 + with option_context("display.precision", 6): + df = DataFrame( + { + "x": [ + (1.8816e-09 + 0j), + (1.8816e-09 + 3.39676e-09j), + ] + } + ) + result = df.to_string() + expected = ( + " x\n0 1.881600e-09+0.000000e+00j\n" + "1 1.881600e-09+3.396760e-09j" + ) + assert result == expected + def test_to_string_format_inf(self): # GH#24861 df = DataFrame( @@ -758,24 +777,12 @@ def test_to_string_string_dtype(self): result = df.dtypes.to_string() expected = dedent( """\ - x string[pyarrow] - y string[python] - z int64[pyarrow]""" + x string + y string + z int64[pyarrow]""" ) assert result == expected - def test_to_string_pos_args_deprecation(self): - # GH#54229 - df = DataFrame({"a": [1, 2, 3]}) - msg = ( - "Starting with pandas version 3.0 all arguments of to_string " - "except for the " - "argument 'buf' will be keyword-only." - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - buf = StringIO() - df.to_string(buf, None, None, True, True) - def test_to_string_utf8_columns(self): n = "\u05d0".encode() df = DataFrame([1, 2], columns=[n]) @@ -851,7 +858,7 @@ def test_to_string(self): frame.to_string() # TODO: split or simplify this test? 
- @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="fix when arrow is default") + @pytest.mark.xfail(using_string_dtype(), reason="fix when arrow is default") def test_to_string_index_with_nan(self): # GH#2850 df = DataFrame( @@ -957,6 +964,13 @@ def test_to_string_repr_unicode(self): finally: sys.stdin = _stdin + def test_nested_dataframe(self): + df1 = DataFrame({"level1": [["row1"], ["row2"]]}) + df2 = DataFrame({"level3": [{"level2": df1}]}) + result = df2.to_string() + expected = " level3\n0 {'level2': ['level1']}" + assert result == expected + class TestSeriesToString: def test_to_string_without_index(self): @@ -1190,13 +1204,7 @@ def test_to_string_float_na_spacing(self): ser[::2] = np.nan result = ser.to_string() - expected = ( - "0 NaN\n" - "1 1.5678\n" - "2 NaN\n" - "3 -3.0000\n" - "4 NaN" - ) + expected = "0 NaN\n1 1.5678\n2 NaN\n3 -3.0000\n4 NaN" assert result == expected def test_to_string_with_datetimeindex(self): diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index ff7d34c85c015..953a9246da1cd 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -41,6 +41,7 @@ def test_read_zipped_json(datapath): @td.skip_if_not_us_locale @pytest.mark.single_cpu +@pytest.mark.network def test_with_s3_url(compression, s3_public_bucket, s3so): # Bucket created in tests/io/conftest.py df = pd.read_json(StringIO('{"a": [1, 2, 3], "b": [4, 5, 6]}')) diff --git a/pandas/tests/io/json/test_deprecated_kwargs.py b/pandas/tests/io/json/test_deprecated_kwargs.py index cc88fc3ba1826..9da682c90a285 100644 --- a/pandas/tests/io/json/test_deprecated_kwargs.py +++ b/pandas/tests/io/json/test_deprecated_kwargs.py @@ -1,6 +1,7 @@ """ Tests for the deprecated keyword arguments for `read_json`. 
""" + from io import StringIO import pandas as pd diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index cc101bb9c8b6d..7936982e4a055 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -1,4 +1,5 @@ """Tests for Table Schema integration.""" + from collections import OrderedDict from io import StringIO import json @@ -31,7 +32,7 @@ def df_schema(): { "A": [1, 2, 3, 4], "B": ["a", "b", "c", "c"], - "C": pd.date_range("2016-01-01", freq="d", periods=4), + "C": pd.date_range("2016-01-01", freq="D", periods=4), "D": pd.timedelta_range("1h", periods=4, freq="min"), }, index=pd.Index(range(4), name="idx"), @@ -44,12 +45,12 @@ def df_table(): { "A": [1, 2, 3, 4], "B": ["a", "b", "c", "c"], - "C": pd.date_range("2016-01-01", freq="d", periods=4), + "C": pd.date_range("2016-01-01", freq="D", periods=4), "D": pd.timedelta_range("1h", periods=4, freq="min"), "E": pd.Series(pd.Categorical(["a", "b", "c", "c"])), "F": pd.Series(pd.Categorical(["a", "b", "c", "c"], ordered=True)), "G": [1.0, 2.0, 3, 4.0], - "H": pd.date_range("2016-01-01", freq="d", periods=4, tz="US/Central"), + "H": pd.date_range("2016-01-01", freq="D", periods=4, tz="US/Central"), }, index=pd.Index(range(4), name="idx"), ) @@ -69,7 +70,7 @@ def test_build_table_schema(self, df_schema, using_infer_string): "primaryKey": ["idx"], } if using_infer_string: - expected["fields"][2] = {"name": "B", "type": "any", "extDtype": "string"} + expected["fields"][2] = {"name": "B", "type": "any", "extDtype": "str"} assert result == expected result = build_table_schema(df_schema) assert "pandas_version" in result @@ -120,9 +121,9 @@ def test_multiindex(self, df_schema, using_infer_string): expected["fields"][0] = { "name": "level_0", "type": "any", - "extDtype": "string", + "extDtype": "str", } - expected["fields"][3] = {"name": "B", "type": "any", "extDtype": "string"} + expected["fields"][3] = {"name": "B", "type": "any", "extDtype": "str"} assert result == expected df.index.names = ["idx0", None] @@ -165,11 +166,9 @@ def test_as_json_table_type_bool_data(self, bool_type): def test_as_json_table_type_date_data(self, date_data): assert as_json_table_type(date_data.dtype) == "datetime" - @pytest.mark.parametrize( - "str_data", - [pd.Series(["a", "b"], dtype=object), pd.Index(["a", "b"], dtype=object)], - ) - def test_as_json_table_type_string_data(self, str_data): + @pytest.mark.parametrize("klass", [pd.Series, pd.Index]) + def test_as_json_table_type_string_data(self, klass): + str_data = klass(["a", "b"], dtype=object) assert as_json_table_type(str_data.dtype) == "string" @pytest.mark.parametrize( @@ -179,7 +178,6 @@ def test_as_json_table_type_string_data(self, str_data): pd.Categorical([1]), pd.Series(pd.Categorical([1])), pd.CategoricalIndex([1]), - pd.Categorical([1]), ], ) def test_as_json_table_type_categorical_data(self, cat_data): @@ -305,7 +303,7 @@ def test_to_json(self, df_table, using_infer_string): ] if using_infer_string: - fields[2] = {"name": "B", "type": "any", "extDtype": "string"} + fields[2] = {"name": "B", "type": "any", "extDtype": "str"} schema = {"fields": fields, "primaryKey": ["idx"]} data = [ @@ -453,12 +451,17 @@ def test_to_json_categorical_index(self): assert result == expected def test_date_format_raises(self, df_table): - msg = ( + error_msg = ( "Trying to write with `orient='table'` and `date_format='epoch'`. 
Table " "Schema requires dates to be formatted with `date_format='iso'`" ) - with pytest.raises(ValueError, match=msg): - df_table.to_json(orient="table", date_format="epoch") + warning_msg = ( + "'epoch' date format is deprecated and will be removed in a future " + "version, please use 'iso' date format instead." + ) + with pytest.raises(ValueError, match=error_msg): + with tm.assert_produces_warning(FutureWarning, match=warning_msg): + df_table.to_json(orient="table", date_format="epoch") # others work df_table.to_json(orient="table", date_format="iso") @@ -636,7 +639,7 @@ def test_warns_non_roundtrippable_names(self, idx): # GH 19130 df = DataFrame(index=idx) df.index.name = "index" - with tm.assert_produces_warning(): + with tm.assert_produces_warning(UserWarning, match="not round-trippable"): set_default_names(df) def test_timestamp_in_columns(self): @@ -684,7 +687,7 @@ class TestTableOrientReader: {"ints": [1, 2, 3, 4]}, {"objects": ["a", "b", "c", "d"]}, {"objects": ["1", "2", "3", "4"]}, - {"date_ranges": pd.date_range("2016-01-01", freq="d", periods=4)}, + {"date_ranges": pd.date_range("2016-01-01", freq="D", periods=4)}, {"categoricals": pd.Series(pd.Categorical(["a", "b", "c", "c"]))}, { "ordered_cats": pd.Series( @@ -696,25 +699,32 @@ class TestTableOrientReader: {"bools": [True, False, False, True]}, { "timezones": pd.date_range( - "2016-01-01", freq="d", periods=4, tz="US/Central" + "2016-01-01", freq="D", periods=4, tz="US/Central" ) # added in # GH 35973 }, ], ) - def test_read_json_table_orient(self, index_nm, vals, recwarn): + def test_read_json_table_orient(self, index_nm, vals): df = DataFrame(vals, index=pd.Index(range(4), name=index_nm)) - out = df.to_json(orient="table") + out = StringIO(df.to_json(orient="table")) result = pd.read_json(out, orient="table") tm.assert_frame_equal(df, result) - @pytest.mark.parametrize("index_nm", [None, "idx", "index"]) @pytest.mark.parametrize( - "vals", - [{"timedeltas": pd.timedelta_range("1h", periods=4, freq="min")}], + "index_nm", + [ + None, + "idx", + pytest.param( + "index", + marks=pytest.mark.filterwarnings("ignore:Index name:UserWarning"), + ), + ], ) - def test_read_json_table_orient_raises(self, index_nm, vals, recwarn): + def test_read_json_table_orient_raises(self, index_nm): + vals = {"timedeltas": pd.timedelta_range("1h", periods=4, freq="min")} df = DataFrame(vals, index=pd.Index(range(4), name=index_nm)) - out = df.to_json(orient="table") + out = StringIO(df.to_json(orient="table")) with pytest.raises(NotImplementedError, match="can not yet read "): pd.read_json(out, orient="table") @@ -728,7 +738,7 @@ def test_read_json_table_orient_raises(self, index_nm, vals, recwarn): {"ints": [1, 2, 3, 4]}, {"objects": ["a", "b", "c", "d"]}, {"objects": ["1", "2", "3", "4"]}, - {"date_ranges": pd.date_range("2016-01-01", freq="d", periods=4)}, + {"date_ranges": pd.date_range("2016-01-01", freq="D", periods=4)}, {"categoricals": pd.Series(pd.Categorical(["a", "b", "c", "c"]))}, { "ordered_cats": pd.Series( @@ -740,19 +750,19 @@ def test_read_json_table_orient_raises(self, index_nm, vals, recwarn): {"bools": [True, False, False, True]}, { "timezones": pd.date_range( - "2016-01-01", freq="d", periods=4, tz="US/Central" + "2016-01-01", freq="D", periods=4, tz="US/Central" ) # added in # GH 35973 }, ], ) - def test_read_json_table_period_orient(self, index_nm, vals, recwarn): + def test_read_json_table_period_orient(self, index_nm, vals): df = DataFrame( vals, index=pd.Index( (pd.Period(f"2022Q{q}") for q in range(1, 5)), 
name=index_nm ), ) - out = df.to_json(orient="table") + out = StringIO(df.to_json(orient="table")) result = pd.read_json(out, orient="table") tm.assert_frame_equal(df, result) @@ -762,15 +772,15 @@ def test_read_json_table_period_orient(self, index_nm, vals, recwarn): pd.Index(range(4)), pd.date_range( "2020-08-30", - freq="d", + freq="D", periods=4, )._with_freq(None), pd.date_range( - "2020-08-30", freq="d", periods=4, tz="US/Central" + "2020-08-30", freq="D", periods=4, tz="US/Central" )._with_freq(None), pd.MultiIndex.from_product( [ - pd.date_range("2020-08-30", freq="d", periods=2, tz="US/Central"), + pd.date_range("2020-08-30", freq="D", periods=2, tz="US/Central"), ["x", "y"], ], ), @@ -780,18 +790,18 @@ def test_read_json_table_period_orient(self, index_nm, vals, recwarn): "vals", [ {"floats": [1.1, 2.2, 3.3, 4.4]}, - {"dates": pd.date_range("2020-08-30", freq="d", periods=4)}, + {"dates": pd.date_range("2020-08-30", freq="D", periods=4)}, { "timezones": pd.date_range( - "2020-08-30", freq="d", periods=4, tz="Europe/London" + "2020-08-30", freq="D", periods=4, tz="Europe/London" ) }, ], ) - def test_read_json_table_timezones_orient(self, idx, vals, recwarn): + def test_read_json_table_timezones_orient(self, idx, vals): # GH 35973 df = DataFrame(vals, index=idx) - out = df.to_json(orient="table") + out = StringIO(df.to_json(orient="table")) result = pd.read_json(out, orient="table") tm.assert_frame_equal(df, result) @@ -800,12 +810,12 @@ def test_comprehensive(self): { "A": [1, 2, 3, 4], "B": ["a", "b", "c", "c"], - "C": pd.date_range("2016-01-01", freq="d", periods=4), + "C": pd.date_range("2016-01-01", freq="D", periods=4), # 'D': pd.timedelta_range('1h', periods=4, freq='min'), "E": pd.Series(pd.Categorical(["a", "b", "c", "c"])), "F": pd.Series(pd.Categorical(["a", "b", "c", "c"], ordered=True)), "G": [1.1, 2.2, 3.3, 4.4], - "H": pd.date_range("2016-01-01", freq="d", periods=4, tz="US/Central"), + "H": pd.date_range("2016-01-01", freq="D", periods=4, tz="US/Central"), "I": [True, False, False, True], }, index=pd.Index(range(4), name="idx"), @@ -862,12 +872,12 @@ def test_read_json_orient_table_old_schema_version(self): tm.assert_frame_equal(expected, result) @pytest.mark.parametrize("freq", ["M", "2M", "Q", "2Q", "Y", "2Y"]) - def test_read_json_table_orient_period_depr_freq(self, freq, recwarn): + def test_read_json_table_orient_period_depr_freq(self, freq): # GH#9586 df = DataFrame( {"ints": [1, 2]}, index=pd.PeriodIndex(["2020-01", "2021-06"], freq=freq), ) - out = df.to_json(orient="table") + out = StringIO(df.to_json(orient="table")) result = pd.read_json(out, orient="table") tm.assert_frame_equal(df, result) diff --git a/pandas/tests/io/json/test_json_table_schema_ext_dtype.py b/pandas/tests/io/json/test_json_table_schema_ext_dtype.py index b7bb057bc538e..12ae24b064c9d 100644 --- a/pandas/tests/io/json/test_json_table_schema_ext_dtype.py +++ b/pandas/tests/io/json/test_json_table_schema_ext_dtype.py @@ -61,54 +61,33 @@ def test_build_table_schema(self): class TestTableSchemaType: - @pytest.mark.parametrize( - "date_data", - [ - DateArray([dt.date(2021, 10, 10)]), - DateArray(dt.date(2021, 10, 10)), - Series(DateArray(dt.date(2021, 10, 10))), - ], - ) - def test_as_json_table_type_ext_date_array_dtype(self, date_data): + @pytest.mark.parametrize("box", [lambda x: x, Series]) + def test_as_json_table_type_ext_date_array_dtype(self, box): + date_data = box(DateArray([dt.date(2021, 10, 10)])) assert as_json_table_type(date_data.dtype) == "any" def 
test_as_json_table_type_ext_date_dtype(self): assert as_json_table_type(DateDtype()) == "any" - @pytest.mark.parametrize( - "decimal_data", - [ - DecimalArray([decimal.Decimal(10)]), - Series(DecimalArray([decimal.Decimal(10)])), - ], - ) - def test_as_json_table_type_ext_decimal_array_dtype(self, decimal_data): + @pytest.mark.parametrize("box", [lambda x: x, Series]) + def test_as_json_table_type_ext_decimal_array_dtype(self, box): + decimal_data = box(DecimalArray([decimal.Decimal(10)])) assert as_json_table_type(decimal_data.dtype) == "number" def test_as_json_table_type_ext_decimal_dtype(self): assert as_json_table_type(DecimalDtype()) == "number" - @pytest.mark.parametrize( - "string_data", - [ - array(["pandas"], dtype="string"), - Series(array(["pandas"], dtype="string")), - ], - ) - def test_as_json_table_type_ext_string_array_dtype(self, string_data): + @pytest.mark.parametrize("box", [lambda x: x, Series]) + def test_as_json_table_type_ext_string_array_dtype(self, box): + string_data = box(array(["pandas"], dtype="string")) assert as_json_table_type(string_data.dtype) == "any" def test_as_json_table_type_ext_string_dtype(self): assert as_json_table_type(StringDtype()) == "any" - @pytest.mark.parametrize( - "integer_data", - [ - array([10], dtype="Int64"), - Series(array([10], dtype="Int64")), - ], - ) - def test_as_json_table_type_ext_integer_array_dtype(self, integer_data): + @pytest.mark.parametrize("box", [lambda x: x, Series]) + def test_as_json_table_type_ext_integer_array_dtype(self, box): + integer_data = box(array([10], dtype="Int64")) assert as_json_table_type(integer_data.dtype) == "integer" def test_as_json_table_type_ext_integer_dtype(self): @@ -118,31 +97,24 @@ def test_as_json_table_type_ext_integer_dtype(self): class TestTableOrient: @pytest.fixture def da(self): + """Fixture for creating a DateArray.""" return DateArray([dt.date(2021, 10, 10)]) @pytest.fixture def dc(self): + """Fixture for creating a DecimalArray.""" return DecimalArray([decimal.Decimal(10)]) @pytest.fixture def sa(self): + """Fixture for creating a StringDtype array.""" return array(["pandas"], dtype="string") @pytest.fixture def ia(self): + """Fixture for creating an Int64Dtype array.""" return array([10], dtype="Int64") - @pytest.fixture - def df(self, da, dc, sa, ia): - return DataFrame( - { - "A": da, - "B": dc, - "C": sa, - "D": ia, - } - ) - def test_build_date_series(self, da): s = Series(da, name="a") s.index.name = "id" @@ -187,7 +159,7 @@ def test_build_decimal_series(self, dc): expected = OrderedDict( [ ("schema", schema), - ("data", [OrderedDict([("id", 0), ("a", 10.0)])]), + ("data", [OrderedDict([("id", 0), ("a", "10")])]), ] ) @@ -243,8 +215,15 @@ def test_build_int64_series(self, ia): assert result == expected - def test_to_json(self, df): - df = df.copy() + def test_to_json(self, da, dc, sa, ia): + df = DataFrame( + { + "A": da, + "B": dc, + "C": sa, + "D": ia, + } + ) df.index.name = "idx" result = df.to_json(orient="table", date_format="iso") result = json.loads(result, object_pairs_hook=OrderedDict) @@ -266,7 +245,7 @@ def test_to_json(self, df): [ ("idx", 0), ("A", "2021-10-10T00:00:00.000"), - ("B", 10.0), + ("B", "10"), ("C", "pandas"), ("D", 10), ] diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index 3c6517a872b76..fdbfbd004617e 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -78,17 +78,6 @@ def state_data(): ] -@pytest.fixture -def author_missing_data(): - return [ - 
{"info": None}, - { - "info": {"created_at": "11/08/1993", "last_updated": "26/05/2012"}, - "author_name": {"first": "Jane", "last_name": "Doe"}, - }, - ] - - @pytest.fixture def missing_metadata(): return [ @@ -120,23 +109,6 @@ def missing_metadata(): ] -@pytest.fixture -def max_level_test_input_data(): - """ - input data to test json_normalize with max_level param - """ - return [ - { - "CreatedBy": {"Name": "User001"}, - "Lookup": { - "TextField": "Some text", - "UserField": {"Id": "ID001", "Name": "Name001"}, - }, - "Image": {"a": "b"}, - } - ] - - class TestJSONNormalize: def test_simple_records(self): recs = [ @@ -424,8 +396,15 @@ def test_non_ascii_key(self): result = json_normalize(json.loads(testjson)) tm.assert_frame_equal(result, expected) - def test_missing_field(self, author_missing_data): + def test_missing_field(self): # GH20030: + author_missing_data = [ + {"info": None}, + { + "info": {"created_at": "11/08/1993", "last_updated": "26/05/2012"}, + "author_name": {"first": "Jane", "last_name": "Doe"}, + }, + ] result = json_normalize(author_missing_data) ex_data = [ { @@ -537,7 +516,7 @@ def test_nonetype_record_path(self, nulls_fixture): ], record_path=["info"], ) - expected = DataFrame({"i": 2}, index=[0]) + expected = DataFrame({"i": 2}, index=range(1)) tm.assert_equal(result, expected) @pytest.mark.parametrize("value", ["false", "true", "{}", "1", '"text"']) @@ -547,8 +526,8 @@ def test_non_list_record_path_errors(self, value): test_input = {"state": "Texas", "info": parsed_value} test_path = "info" msg = ( - f"{test_input} has non list value {parsed_value} for path {test_path}. " - "Must be list or null." + f"Path must contain list or null, " + f"but got {type(parsed_value).__name__} at 'info'" ) with pytest.raises(TypeError, match=msg): json_normalize([test_input], record_path=[test_path]) @@ -582,6 +561,14 @@ def test_top_column_with_leading_underscore(self): tm.assert_frame_equal(result, expected) + def test_series_index(self, state_data): + idx = Index([7, 8]) + series = Series(state_data, index=idx) + result = json_normalize(series) + tm.assert_index_equal(result.index, idx) + result = json_normalize(series, "counties") + tm.assert_index_equal(result.index, idx.repeat([3, 2])) + class TestNestedToRecord: def test_flat_stays_flat(self): @@ -843,8 +830,18 @@ def test_nonetype_multiple_levels(self): ), ], ) - def test_with_max_level(self, max_level, expected, max_level_test_input_data): + def test_with_max_level(self, max_level, expected): # GH23843: Enhanced JSON normalize + max_level_test_input_data = [ + { + "CreatedBy": {"Name": "User001"}, + "Lookup": { + "TextField": "Some text", + "UserField": {"Id": "ID001", "Name": "Name001"}, + }, + "Image": {"a": "b"}, + } + ] output = nested_to_record(max_level_test_input_data, max_level=max_level) assert output == expected @@ -902,6 +899,7 @@ def test_series_non_zero_index(self): "elements.a": [1.0, np.nan, np.nan], "elements.b": [np.nan, 2.0, np.nan], "elements.c": [np.nan, np.nan, 3.0], - } + }, + index=[1, 2, 3], ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 0eefb0b52c483..32eeb30de4b69 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1,6 +1,5 @@ import datetime from datetime import timedelta -from decimal import Decimal from io import ( BytesIO, StringIO, @@ -13,7 +12,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import 
using_string_dtype from pandas.compat import IS64 import pandas.util._test_decorators as td @@ -24,64 +23,40 @@ DataFrame, DatetimeIndex, Index, + RangeIndex, Series, Timestamp, date_range, read_json, ) import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - StringArray, -) -from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics from pandas.io.json import ujson_dumps -def test_literal_json_deprecation(): +def test_literal_json_raises(): # PR 53409 - expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) - jsonl = """{"a": 1, "b": 2} {"a": 3, "b": 4} {"a": 5, "b": 6} {"a": 7, "b": 8}""" - msg = ( - "Passing literal json to 'read_json' is deprecated and " - "will be removed in a future version. To read from a " - "literal string, wrap it in a 'StringIO' object." - ) + msg = r".* does not exist" - with tm.assert_produces_warning(FutureWarning, match=msg): - try: - read_json(jsonl, lines=False) - except ValueError: - pass + with pytest.raises(FileNotFoundError, match=msg): + read_json(jsonl, lines=False) - with tm.assert_produces_warning(FutureWarning, match=msg): - read_json(expected.to_json(), lines=False) + with pytest.raises(FileNotFoundError, match=msg): + read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True) - tm.assert_frame_equal(result, expected) + with pytest.raises(FileNotFoundError, match=msg): + read_json( + '{"a\\\\":"foo\\\\","b":"bar"}\n{"a\\\\":"foo\\"","b":"bar"}\n', + lines=False, + ) - with tm.assert_produces_warning(FutureWarning, match=msg): - try: - result = read_json( - '{"a\\\\":"foo\\\\","b":"bar"}\n{"a\\\\":"foo\\"","b":"bar"}\n', - lines=False, - ) - except ValueError: - pass - - with tm.assert_produces_warning(FutureWarning, match=msg): - try: - result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=False) - except ValueError: - pass - tm.assert_frame_equal(result, expected) + with pytest.raises(FileNotFoundError, match=msg): + read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=False) def assert_json_roundtrip_equal(result, expected, orient): @@ -93,17 +68,6 @@ def assert_json_roundtrip_equal(result, expected, orient): class TestPandasContainer: - @pytest.fixture - def categorical_frame(self): - data = { - c: np.random.default_rng(i).standard_normal(30) - for i, c in enumerate(list("ABCD")) - } - cat = ["bah"] * 5 + ["bar"] * 5 + ["baz"] * 5 + ["foo"] * 15 - data["E"] = list(reversed(cat)) - data["sort"] = np.arange(30, dtype="int64") - return DataFrame(data, index=pd.CategoricalIndex(cat, name="E")) - @pytest.fixture def datetime_series(self): # Same as usual datetime_series, but with index freq set to None, @@ -122,7 +86,7 @@ def datetime_frame(self): # since that doesn't round-trip, see GH#33711 df = DataFrame( np.random.default_rng(2).standard_normal((30, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=30, freq="B"), ) df.index = df.index._with_freq(None) @@ -166,20 +130,35 @@ def test_frame_non_unique_index_raises(self, orient): [[Timestamp("20130101"), 3.5], [Timestamp("20130102"), 4.5]], ], ) - def test_frame_non_unique_columns(self, orient, data): + def test_frame_non_unique_columns(self, orient, data, request): + if isinstance(data[0][0], Timestamp) and orient == "split": + mark = pytest.mark.xfail( + reason="GH#55827 non-nanosecond dt64 fails to round-trip" + ) + 
request.applymarker(mark) + df = DataFrame(data, index=[1, 2], columns=["x", "x"]) - result = read_json( - StringIO(df.to_json(orient=orient)), orient=orient, convert_dates=["x"] + expected_warning = None + msg = ( + "The default 'epoch' date format is deprecated and will be removed " + "in a future version, please use 'iso' date format instead." ) + if df.iloc[:, 0].dtype == "datetime64[s]": + expected_warning = FutureWarning + + with tm.assert_produces_warning(expected_warning, match=msg): + result = read_json( + StringIO(df.to_json(orient=orient)), orient=orient, convert_dates=["x"] + ) if orient == "values": expected = DataFrame(data) - if expected.iloc[:, 0].dtype == "datetime64[ns]": + if expected.iloc[:, 0].dtype == "datetime64[s]": # orient == "values" by default will write Timestamp objects out # in milliseconds; these are internally stored in nanosecond, # so divide to get where we need # TODO: a to_epoch method would also solve; see GH 14772 - expected.iloc[:, 0] = expected.iloc[:, 0].astype(np.int64) // 1000000 + expected.isetitem(0, expected.iloc[:, 0].astype(np.int64) // 1000000) elif orient == "split": expected = df expected.columns = ["x", "x.1"] @@ -249,7 +228,7 @@ def test_roundtrip_str_axes(self, orient, convert_axes, dtype): @pytest.mark.parametrize("convert_axes", [True, False]) def test_roundtrip_categorical( - self, request, orient, categorical_frame, convert_axes, using_infer_string + self, request, orient, convert_axes, using_infer_string ): # TODO: create a better frame to test with and improve coverage if orient in ("index", "columns"): @@ -259,12 +238,20 @@ def test_roundtrip_categorical( ) ) + data = { + c: np.random.default_rng(i).standard_normal(30) + for i, c in enumerate(list("ABCD")) + } + cat = ["bah"] * 5 + ["bar"] * 5 + ["baz"] * 5 + ["foo"] * 15 + data["E"] = list(reversed(cat)) + data["sort"] = np.arange(30, dtype="int64") + categorical_frame = DataFrame(data, index=pd.CategoricalIndex(cat, name="E")) data = StringIO(categorical_frame.to_json(orient=orient)) result = read_json(data, orient=orient, convert_axes=convert_axes) expected = categorical_frame.copy() expected.index = expected.index.astype( - str if not using_infer_string else "string[pyarrow_numpy]" + str if not using_infer_string else "str" ) # Categorical not preserved expected.index.name = None # index names aren't preserved in JSON assert_json_roundtrip_equal(result, expected, orient) @@ -407,7 +394,7 @@ def test_frame_read_json_dtype_missing_value(self, dtype): # GH28501 Parse missing values using read_json with dtype=False # to NaN instead of None result = read_json(StringIO("[null]"), dtype=dtype) - expected = DataFrame([np.nan]) + expected = DataFrame([np.nan], dtype=object if not dtype else None) tm.assert_frame_equal(result, expected) @@ -493,12 +480,12 @@ def test_frame_mixedtype_orient(self): # GH10289 left = read_json(inp, orient=orient, convert_axes=False) tm.assert_frame_equal(left, right) - right.index = pd.RangeIndex(len(df)) + right.index = RangeIndex(len(df)) inp = StringIO(df.to_json(orient="records")) left = read_json(inp, orient="records", convert_axes=False) tm.assert_frame_equal(left, right) - right.columns = pd.RangeIndex(df.shape[1]) + right.columns = RangeIndex(df.shape[1]) inp = StringIO(df.to_json(orient="values")) left = read_json(inp, orient="values", convert_axes=False) tm.assert_frame_equal(left, right) @@ -618,7 +605,7 @@ def test_blocks_compat_GH9037(self, using_infer_string): # JSON deserialisation always creates unicode strings df_mixed.columns = 
df_mixed.columns.astype( - np.str_ if not using_infer_string else "string[pyarrow_numpy]" + np.str_ if not using_infer_string else "str" ) data = StringIO(df_mixed.to_json(orient="split")) df_roundtrip = read_json(data, orient="split") @@ -703,7 +690,7 @@ def test_series_roundtrip_simple(self, orient, string_series, using_infer_string expected = string_series if using_infer_string and orient in ("split", "index", "columns"): # These schemas don't contain dtypes, so we infer string - expected.index = expected.index.astype("string[pyarrow_numpy]") + expected.index = expected.index.astype("str") if orient in ("values", "records"): expected = expected.reset_index(drop=True) if orient != "split": @@ -722,6 +709,9 @@ def test_series_roundtrip_object(self, orient, dtype, object_series): if orient != "split": expected.name = None + if using_string_dtype(): + expected = expected.astype("str") + tm.assert_series_equal(result, expected) def test_series_roundtrip_empty(self, orient): @@ -786,7 +776,12 @@ def test_series_with_dtype(self): ) def test_series_with_dtype_datetime(self, dtype, expected): s = Series(["2000-01-01"], dtype="datetime64[ns]") - data = StringIO(s.to_json()) + msg = ( + "The default 'epoch' date format is deprecated and will be removed " + "in a future version, please use 'iso' date format instead." + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + data = StringIO(s.to_json()) result = read_json(data, typ="series", dtype=dtype) tm.assert_series_equal(result, expected) @@ -797,7 +792,7 @@ def test_frame_from_json_precise_float(self): def test_typ(self): s = Series(range(6), index=["a", "b", "c", "d", "e", "f"], dtype="int64") - result = read_json(StringIO(s.to_json()), typ=None) + result = read_json(StringIO(s.to_json()), typ="series") tm.assert_series_equal(result, s) def test_reconstruction_index(self): @@ -832,12 +827,18 @@ def test_convert_dates(self, datetime_series, datetime_frame): df = datetime_frame df["date"] = Timestamp("20130101").as_unit("ns") - json = StringIO(df.to_json()) + msg = ( + "The default 'epoch' date format is deprecated and will be removed " + "in a future version, please use 'iso' date format instead." 
+ ) + with tm.assert_produces_warning(FutureWarning, match=msg): + json = StringIO(df.to_json()) result = read_json(json) tm.assert_frame_equal(result, df) df["foo"] = 1.0 - json = StringIO(df.to_json(date_unit="ns")) + with tm.assert_produces_warning(FutureWarning, match=msg): + json = StringIO(df.to_json(date_unit="ns")) result = read_json(json, convert_dates=False) expected = df.copy() @@ -847,7 +848,8 @@ def test_convert_dates(self, datetime_series, datetime_frame): # series ts = Series(Timestamp("20130101").as_unit("ns"), index=datetime_series.index) - json = StringIO(ts.to_json()) + with tm.assert_produces_warning(FutureWarning, match=msg): + json = StringIO(ts.to_json()) result = read_json(json, typ="series") tm.assert_series_equal(result, ts) @@ -860,15 +862,27 @@ def test_date_index_and_values(self, date_format, as_object, date_typ): data.append("a") ser = Series(data, index=data) - result = ser.to_json(date_format=date_format) + if not as_object: + ser = ser.astype("M8[ns]") + if isinstance(ser.index, DatetimeIndex): + ser.index = ser.index.as_unit("ns") + expected_warning = None if date_format == "epoch": expected = '{"1577836800000":1577836800000,"null":null}' + expected_warning = FutureWarning else: expected = ( '{"2020-01-01T00:00:00.000":"2020-01-01T00:00:00.000","null":null}' ) + msg = ( + "'epoch' date format is deprecated and will be removed in a future " + "version, please use 'iso' date format instead." + ) + with tm.assert_produces_warning(expected_warning, match=msg): + result = ser.to_json(date_format=date_format) + if as_object: expected = expected.replace("}", ',"a":"a"}') @@ -893,6 +907,7 @@ def test_convert_dates_infer(self, infer_word): expected = DataFrame( [[1, Timestamp("2002-11-08")], [2, pd.NaT]], columns=["id", infer_word] ) + expected[infer_word] = expected[infer_word].astype("M8[ns]") result = read_json(StringIO(ujson_dumps(data)))[["id", infer_word]] tm.assert_frame_equal(result, expected) @@ -957,7 +972,6 @@ def test_date_format_series_raises(self, datetime_series): with pytest.raises(ValueError, match=msg): ts.to_json(date_format="iso", date_unit="foo") - @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) def test_date_unit(self, unit, datetime_frame): df = datetime_frame df["date"] = Timestamp("20130101 20:43:42").as_unit("ns") @@ -966,7 +980,12 @@ def test_date_unit(self, unit, datetime_frame): df.iloc[2, dl] = Timestamp("21460101 20:43:42") df.iloc[4, dl] = pd.NaT - json = df.to_json(date_format="epoch", date_unit=unit) + msg = ( + "'epoch' date format is deprecated and will be removed in a future " + "version, please use 'iso' date format instead." 
+ ) + with tm.assert_produces_warning(FutureWarning, match=msg): + json = df.to_json(date_format="epoch", date_unit=unit) # force date unit result = read_json(StringIO(json), date_unit=unit) @@ -976,6 +995,34 @@ def test_date_unit(self, unit, datetime_frame): result = read_json(StringIO(json), date_unit=None) tm.assert_frame_equal(result, df) + @pytest.mark.parametrize( + "df, warn", + [ + (DataFrame({"A": ["a", "b", "c"], "B": np.arange(3)}), None), + (DataFrame({"A": [True, False, False]}), None), + ( + DataFrame( + {"A": ["a", "b", "c"], "B": pd.to_timedelta(np.arange(3), unit="D")} + ), + FutureWarning, + ), + ( + DataFrame( + {"A": pd.to_datetime(["2020-01-01", "2020-02-01", "2020-03-01"])} + ), + FutureWarning, + ), + ], + ) + def test_default_epoch_date_format_deprecated(self, df, warn): + # GH 57063 + msg = ( + "The default 'epoch' date format is deprecated and will be removed " + "in a future version, please use 'iso' date format instead." + ) + with tm.assert_produces_warning(warn, match=msg): + df.to_json() + @pytest.mark.parametrize("unit", ["s", "ms", "us"]) def test_iso_non_nano_datetimes(self, unit): # Test that numpy datetimes @@ -1045,7 +1092,12 @@ def test_doc_example(self): dfj2["bools"] = True dfj2.index = date_range("20130101", periods=5) - json = StringIO(dfj2.to_json()) + msg = ( + "The default 'epoch' date format is deprecated and will be removed " + "in a future version, please use 'iso' date format instead." + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + json = StringIO(dfj2.to_json()) result = read_json(json, dtype={"ints": np.int64, "bools": np.bool_}) tm.assert_frame_equal(result, result) @@ -1057,9 +1109,7 @@ def test_round_trip_exception(self, datapath): result = read_json(StringIO(s)) res = result.reindex(index=df.index, columns=df.columns) - msg = "The 'downcast' keyword in fillna is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - res = res.fillna(np.nan, downcast=False) + res = res.fillna(np.nan) tm.assert_frame_equal(res, df) @pytest.mark.network @@ -1084,19 +1134,26 @@ def test_timedelta(self): ser = Series([timedelta(23), timedelta(seconds=5)]) assert ser.dtype == "timedelta64[ns]" - result = read_json(StringIO(ser.to_json()), typ="series").apply(converter) + msg = ( + "The default 'epoch' date format is deprecated and will be removed " + "in a future version, please use 'iso' date format instead." 
+ ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = read_json(StringIO(ser.to_json()), typ="series").apply(converter) tm.assert_series_equal(result, ser) ser = Series([timedelta(23), timedelta(seconds=5)], index=Index([0, 1])) assert ser.dtype == "timedelta64[ns]" - result = read_json(StringIO(ser.to_json()), typ="series").apply(converter) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = read_json(StringIO(ser.to_json()), typ="series").apply(converter) tm.assert_series_equal(result, ser) frame = DataFrame([timedelta(23), timedelta(seconds=5)]) assert frame[0].dtype == "timedelta64[ns]" - tm.assert_frame_equal( - frame, read_json(StringIO(frame.to_json())).apply(converter) - ) + + with tm.assert_produces_warning(FutureWarning, match=msg): + json = frame.to_json() + tm.assert_frame_equal(frame, read_json(StringIO(json)).apply(converter)) def test_timedelta2(self): frame = DataFrame( @@ -1106,7 +1163,12 @@ def test_timedelta2(self): "c": date_range(start="20130101", periods=2), } ) - data = StringIO(frame.to_json(date_unit="ns")) + msg = ( + "The default 'epoch' date format is deprecated and will be removed " + "in a future version, please use 'iso' date format instead." + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + data = StringIO(frame.to_json(date_unit="ns")) result = read_json(data) result["a"] = pd.to_timedelta(result.a, unit="ns") result["c"] = pd.to_datetime(result.c) @@ -1134,17 +1196,24 @@ def test_timedelta_to_json(self, as_object, date_format, timedelta_typ): data.append("a") ser = Series(data, index=data) + expected_warning = None if date_format == "iso": expected = ( '{"P1DT0H0M0S":"P1DT0H0M0S","P2DT0H0M0S":"P2DT0H0M0S","null":null}' ) else: + expected_warning = FutureWarning expected = '{"86400000":86400000,"172800000":172800000,"null":null}' if as_object: expected = expected.replace("}", ',"a":"a"}') - result = ser.to_json(date_format=date_format) + msg = ( + "'epoch' date format is deprecated and will be removed in a future " + "version, please use 'iso' date format instead." + ) + with tm.assert_produces_warning(expected_warning, match=msg): + result = ser.to_json(date_format=date_format) assert result == expected @pytest.mark.parametrize("as_object", [True, False]) @@ -1152,10 +1221,17 @@ def test_timedelta_to_json(self, as_object, date_format, timedelta_typ): def test_timedelta_to_json_fractional_precision(self, as_object, timedelta_typ): data = [timedelta_typ(milliseconds=42)] ser = Series(data, index=data) + warn = FutureWarning if as_object: ser = ser.astype(object) + warn = None - result = ser.to_json() + msg = ( + "The default 'epoch' date format is deprecated and will be removed " + "in a future version, please use 'iso' date format instead." 
+ ) + with tm.assert_produces_warning(warn, match=msg): + result = ser.to_json() expected = '{"42":42}' assert result == expected @@ -1194,9 +1270,7 @@ def test_default_handler_numpy_unsupported_dtype(self): columns=["a", "b"], ) expected = ( - '[["(1+0j)","(nan+0j)"],' - '["(2.3+0j)","(nan+0j)"],' - '["(4-5j)","(1.2+0j)"]]' + '[["(1+0j)","(nan+0j)"],["(2.3+0j)","(nan+0j)"],["(4-5j)","(1.2+0j)"]]' ) assert df.to_json(default_handler=str, orient="values") == expected @@ -1237,12 +1311,18 @@ def test_datetime_tz(self): df_naive = df.copy() df_naive["A"] = tz_naive - expected = df_naive.to_json() - assert expected == df.to_json() + msg = ( + "The default 'epoch' date format is deprecated and will be removed " + "in a future version, please use 'iso' date format instead." + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df_naive.to_json() + assert expected == df.to_json() stz = Series(tz_range) s_naive = Series(tz_naive) - assert stz.to_json() == s_naive.to_json() + with tm.assert_produces_warning(FutureWarning, match=msg): + assert stz.to_json() == s_naive.to_json() def test_sparse(self): # GH4377 df.to_json segfaults with non-ndarray blocks @@ -1293,11 +1373,7 @@ def test_tz_is_naive(self): ) def test_tz_range_is_utc(self, tz_range): exp = '["2013-01-01T05:00:00.000Z","2013-01-02T05:00:00.000Z"]' - dfexp = ( - '{"DT":{' - '"0":"2013-01-01T05:00:00.000Z",' - '"1":"2013-01-02T05:00:00.000Z"}}' - ) + dfexp = '{"DT":{"0":"2013-01-01T05:00:00.000Z","1":"2013-01-02T05:00:00.000Z"}}' assert ujson_dumps(tz_range, iso_dates=True) == exp dti = DatetimeIndex(tz_range) @@ -1333,6 +1409,7 @@ def test_read_inline_jsonl(self): tm.assert_frame_equal(result, expected) @pytest.mark.single_cpu + @pytest.mark.network @td.skip_if_not_us_locale def test_read_s3_jsonl(self, s3_public_bucket_with_data, s3so): # GH17200 @@ -1491,7 +1568,7 @@ def test_from_json_to_json_table_dtypes(self): # TODO: We are casting to string which coerces None to NaN before casting back # to object, ending up with incorrect na values - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="incorrect na conversion") + @pytest.mark.xfail(using_string_dtype(), reason="incorrect na conversion") @pytest.mark.parametrize("orient", ["split", "records", "index", "columns"]) def test_to_json_from_json_columns_dtypes(self, orient): # GH21892 GH33205 @@ -1507,7 +1584,12 @@ def test_to_json_from_json_columns_dtypes(self, orient): ), } ) - dfjson = expected.to_json(orient=orient) + msg = ( + "The default 'epoch' date format is deprecated and will be removed " + "in a future version, please use 'iso' date format instead." 
+ ) + with tm.assert_produces_warning(FutureWarning, match=msg): + dfjson = expected.to_json(orient=orient) result = read_json( StringIO(dfjson), @@ -1523,6 +1605,13 @@ def test_to_json_from_json_columns_dtypes(self, orient): ) tm.assert_frame_equal(result, expected) + def test_to_json_with_index_as_a_column_name(self): + df = DataFrame(data={"index": [1, 2], "a": [2, 3]}) + with pytest.raises( + ValueError, match="Overlapping names between the index and columns" + ): + df.to_json(orient="table") + @pytest.mark.parametrize("dtype", [True, {"b": int, "c": int}]) def test_read_json_table_dtype_raises(self, dtype): # GH21345 @@ -1667,6 +1756,7 @@ def test_read_timezone_information(self): [ "s3://example-fsspec/", "gcs://another-fsspec/file.json", + "filecache::s3://yet-another-fsspec/file.json", "https://example-site.com/data", "some-protocol://data.txt", ], @@ -1683,7 +1773,7 @@ def test_read_json_with_url_value(self, url): ) def test_read_json_with_very_long_file_path(self, compression): # GH 46718 - long_json_path = f'{"a" * 1000}.json{compression}' + long_json_path = f"{'a' * 1000}.json{compression}" with pytest.raises( FileNotFoundError, match=f"File {long_json_path} does not exist" ): @@ -1697,7 +1787,17 @@ def test_read_json_with_very_long_file_path(self, compression): def test_timedelta_as_label(self, date_format, key): df = DataFrame([[1]], columns=[pd.Timedelta("1D")]) expected = f'{{"{key}":{{"0":1}}}}' - result = df.to_json(date_format=date_format) + + expected_warning = None + if date_format == "epoch": + expected_warning = FutureWarning + + msg = ( + "'epoch' date format is deprecated and will be removed in a future " + "version, please use 'iso' date format instead." + ) + with tm.assert_produces_warning(expected_warning, match=msg): + result = df.to_json(date_format=date_format) assert result == expected @@ -1750,7 +1850,7 @@ def test_to_json_indent(self, indent): assert result == expected @pytest.mark.skipif( - using_pyarrow_string_dtype(), + using_string_dtype(), reason="Adjust expected when infer_string is default, no bug here, " "just a complicated parametrization", ) @@ -1899,22 +1999,18 @@ def test_frame_int_overflow(self): result = read_json(StringIO(encoded_json)) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - "dataframe,expected", - [ - ( - DataFrame({"x": [1, 2, 3], "y": ["a", "b", "c"]}), - '{"(0, \'x\')":1,"(0, \'y\')":"a","(1, \'x\')":2,' - '"(1, \'y\')":"b","(2, \'x\')":3,"(2, \'y\')":"c"}', - ) - ], - ) - def test_json_multiindex(self, dataframe, expected): - series = dataframe.stack(future_stack=True) + def test_json_multiindex(self): + dataframe = DataFrame({"x": [1, 2, 3], "y": ["a", "b", "c"]}) + expected = ( + '{"(0, \'x\')":1,"(0, \'y\')":"a","(1, \'x\')":2,' + '"(1, \'y\')":"b","(2, \'x\')":3,"(2, \'y\')":"c"}' + ) + series = dataframe.stack() result = series.to_json(orient="index") assert result == expected @pytest.mark.single_cpu + @pytest.mark.network def test_to_s3(self, s3_public_bucket, s3so): # GH 28375 mock_bucket_name, target_file = s3_public_bucket.name, "test.json" @@ -1928,13 +2024,18 @@ def test_to_s3(self, s3_public_bucket, s3so): timeout -= 0.1 assert timeout > 0, "Timed out waiting for file to appear on moto" - def test_json_pandas_nulls(self, nulls_fixture, request): + def test_json_pandas_nulls(self, nulls_fixture): # GH 31615 - if isinstance(nulls_fixture, Decimal): - mark = pytest.mark.xfail(reason="not implemented") - request.applymarker(mark) + expected_warning = None + msg = ( + "The default 'epoch' date 
format is deprecated and will be removed " + "in a future version, please use 'iso' date format instead." + ) + if nulls_fixture is pd.NaT: + expected_warning = FutureWarning - result = DataFrame([[nulls_fixture]]).to_json() + with tm.assert_produces_warning(expected_warning, match=msg): + result = DataFrame([[nulls_fixture]]).to_json() assert result == '{"0":{"0":null}}' def test_readjson_bool_series(self): @@ -1949,7 +2050,7 @@ def test_to_json_multiindex_escape(self): True, index=date_range("2017-01-20", "2017-01-23"), columns=["foo", "bar"], - ).stack(future_stack=True) + ).stack() result = df.to_json() expected = ( "{\"(Timestamp('2017-01-20 00:00:00'), 'foo')\":true," @@ -2026,14 +2127,10 @@ def test_json_uint64(self): result = df.to_json(orient="split") assert result == expected - @pytest.mark.parametrize( - "orient", ["split", "records", "values", "index", "columns"] - ) def test_read_json_dtype_backend( self, string_storage, dtype_backend, orient, using_infer_string ): # GH#50750 - pa = pytest.importorskip("pyarrow") df = DataFrame( { "a": Series([1, np.nan, 3], dtype="Int64"), @@ -2047,30 +2144,18 @@ def test_read_json_dtype_backend( } ) - if using_infer_string: - string_array = ArrowStringArrayNumpySemantics(pa.array(["a", "b", "c"])) - string_array_na = ArrowStringArrayNumpySemantics(pa.array(["a", "b", None])) - elif string_storage == "python": - string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) - string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_)) - - elif dtype_backend == "pyarrow": - pa = pytest.importorskip("pyarrow") - from pandas.arrays import ArrowExtensionArray - - string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) - string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) - - else: - string_array = ArrowStringArray(pa.array(["a", "b", "c"])) - string_array_na = ArrowStringArray(pa.array(["a", "b", None])) - out = df.to_json(orient=orient) with pd.option_context("mode.string_storage", string_storage): result = read_json( StringIO(out), dtype_backend=dtype_backend, orient=orient ) + if dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + string_dtype = pd.ArrowDtype(pa.string()) + else: + string_dtype = pd.StringDtype(string_storage) + expected = DataFrame( { "a": Series([1, np.nan, 3], dtype="Int64"), @@ -2079,12 +2164,13 @@ def test_read_json_dtype_backend( "d": Series([1.5, 2.0, 2.5], dtype="Float64"), "e": Series([True, False, NA], dtype="boolean"), "f": Series([True, False, True], dtype="boolean"), - "g": string_array, - "h": string_array_na, + "g": Series(["a", "b", "c"], dtype=string_dtype), + "h": Series(["a", "b", None], dtype=string_dtype), } ) if dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") from pandas.arrays import ArrowExtensionArray expected = DataFrame( @@ -2097,7 +2183,33 @@ def test_read_json_dtype_backend( if orient == "values": expected.columns = list(range(8)) - tm.assert_frame_equal(result, expected) + # the storage of the str columns' Index is also affected by the + # string_storage setting -> ignore that for checking the result + tm.assert_frame_equal(result, expected, check_column_type=False) + + @td.skip_if_no("pyarrow") + @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") + def test_read_json_pyarrow_with_dtype(self): + dtype = {"a": "int32[pyarrow]", "b": "int64[pyarrow]"} + json = b'{"a": 1, "b": 2}\n' + + df = read_json( + BytesIO(json), + dtype=dtype, + lines=True, + engine="pyarrow", + dtype_backend="pyarrow", + 
)
+
+        result = df.dtypes
+        expected = Series(
+            data=[
+                pd.ArrowDtype.construct_from_string("int32[pyarrow]"),
+                pd.ArrowDtype.construct_from_string("int64[pyarrow]"),
+            ],
+            index=["a", "b"],
+        )
+        tm.assert_series_equal(result, expected)

     @pytest.mark.parametrize("orient", ["split", "records", "index"])
     def test_read_json_nullable_series(self, string_storage, dtype_backend, orient):
@@ -2146,29 +2258,56 @@ def test_pyarrow_engine_lines_false():


 def test_json_roundtrip_string_inference(orient):
-    pytest.importorskip("pyarrow")
     df = DataFrame(
         [["a", "b"], ["c", "d"]], index=["row 1", "row 2"], columns=["col 1", "col 2"]
     )
     out = df.to_json()
     with pd.option_context("future.infer_string", True):
         result = read_json(StringIO(out))
+    dtype = pd.StringDtype(na_value=np.nan)
     expected = DataFrame(
         [["a", "b"], ["c", "d"]],
-        dtype="string[pyarrow_numpy]",
-        index=Index(["row 1", "row 2"], dtype="string[pyarrow_numpy]"),
-        columns=Index(["col 1", "col 2"], dtype="string[pyarrow_numpy]"),
+        dtype=dtype,
+        index=Index(["row 1", "row 2"], dtype=dtype),
+        columns=Index(["col 1", "col 2"], dtype=dtype),
     )
     tm.assert_frame_equal(result, expected)


-def test_json_pos_args_deprecation():
-    # GH-54229
-    df = DataFrame({"a": [1, 2, 3]})
-    msg = (
-        r"Starting with pandas version 3.0 all arguments of to_json except for the "
-        r"argument 'path_or_buf' will be keyword-only."
+@td.skip_if_no("pyarrow")
+def test_to_json_ea_null():
+    # GH#57224
+    df = DataFrame(
+        {
+            "a": Series([1, NA], dtype="int64[pyarrow]"),
+            "b": Series([2, NA], dtype="Int64"),
+        }
+    )
+    result = df.to_json(orient="records", lines=True)
+    expected = """{"a":1,"b":2}
+{"a":null,"b":null}
+"""
+    assert result == expected
+
+
+def test_read_json_lines_rangeindex():
+    # GH 57429
+    data = """
+{"a": 1, "b": 2}
+{"a": 3, "b": 4}
+"""
+    result = read_json(StringIO(data), lines=True).index
+    expected = RangeIndex(2)
+    tm.assert_index_equal(result, expected, exact=True)
+
+
+def test_large_number():
+    # GH#20608
+    result = read_json(
+        StringIO('["9999999999999999"]'),
+        orient="values",
+        typ="series",
+        convert_dates=False,
     )
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        buf = BytesIO()
-        df.to_json(buf, "split")
+    expected = Series([9999999999999999])
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py
index d96ccb4b94cc2..d482eb5fa1a06 100644
--- a/pandas/tests/io/json/test_readlines.py
+++ b/pandas/tests/io/json/test_readlines.py
@@ -165,11 +165,11 @@ def test_readjson_chunks_series(request, engine):
     s = pd.Series({"A": 1, "B": 2})
     strio = StringIO(s.to_json(lines=True, orient="records"))
-    unchunked = read_json(strio, lines=True, typ="Series", engine=engine)
+    unchunked = read_json(strio, lines=True, typ="series", engine=engine)

     strio = StringIO(s.to_json(lines=True, orient="records"))
     with read_json(
-        strio, lines=True, typ="Series", chunksize=1, engine=engine
+        strio, lines=True, typ="series", chunksize=1, engine=engine
     ) as reader:
         chunked = pd.concat(reader)

@@ -236,9 +236,9 @@ def test_readjson_chunks_closes(chunksize):
     )
     with reader:
         reader.read()
-    assert (
-        reader.handles.handle.closed
-    ), f"didn't close stream with chunksize = {chunksize}"
+    assert reader.handles.handle.closed, (
+        f"didn't close stream with chunksize = {chunksize}"
+    )


 @pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"])
@@ -435,8 +435,7 @@ def test_to_json_append_mode(mode_):
     # Test ValueError when mode is not supported option
     df = DataFrame({"col1": [1, 2],
"col2": ["a", "b"]}) msg = ( - f"mode={mode_} is not a valid option." - "Only 'w' and 'a' are currently supported." + f"mode={mode_} is not a valid option.Only 'w' and 'a' are currently supported." ) with pytest.raises(ValueError, match=msg): df.to_json(mode=mode_, lines=False, orient="records") diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 56ea9ea625dff..d2bf9bdb139bd 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -10,7 +10,6 @@ import dateutil import numpy as np import pytest -import pytz import pandas._libs.json as ujson from pandas.compat import IS64 @@ -54,60 +53,24 @@ def orient(request): class TestUltraJSONTests: @pytest.mark.skipif(not IS64, reason="not compliant on 32-bit, xref #15865") - def test_encode_decimal(self): - sut = decimal.Decimal("1337.1337") - encoded = ujson.ujson_dumps(sut, double_precision=15) - decoded = ujson.ujson_loads(encoded) - assert decoded == 1337.1337 - - sut = decimal.Decimal("0.95") - encoded = ujson.ujson_dumps(sut, double_precision=1) - assert encoded == "1.0" - - decoded = ujson.ujson_loads(encoded) - assert decoded == 1.0 - - sut = decimal.Decimal("0.94") - encoded = ujson.ujson_dumps(sut, double_precision=1) - assert encoded == "0.9" - - decoded = ujson.ujson_loads(encoded) - assert decoded == 0.9 - - sut = decimal.Decimal("1.95") - encoded = ujson.ujson_dumps(sut, double_precision=1) - assert encoded == "2.0" - - decoded = ujson.ujson_loads(encoded) - assert decoded == 2.0 - - sut = decimal.Decimal("-1.95") - encoded = ujson.ujson_dumps(sut, double_precision=1) - assert encoded == "-2.0" - - decoded = ujson.ujson_loads(encoded) - assert decoded == -2.0 - - sut = decimal.Decimal("0.995") - encoded = ujson.ujson_dumps(sut, double_precision=2) - assert encoded == "1.0" - - decoded = ujson.ujson_loads(encoded) - assert decoded == 1.0 - - sut = decimal.Decimal("0.9995") - encoded = ujson.ujson_dumps(sut, double_precision=3) - assert encoded == "1.0" - - decoded = ujson.ujson_loads(encoded) - assert decoded == 1.0 - - sut = decimal.Decimal("0.99999999999999944") - encoded = ujson.ujson_dumps(sut, double_precision=15) - assert encoded == "1.0" - + @pytest.mark.parametrize( + "value, double_precision", + [ + ("1337.1337", 15), + ("0.95", 1), + ("0.94", 1), + ("1.95", 1), + ("-1.95", 1), + ("0.995", 2), + ("0.9995", 3), + ("0.99999999999999944", 15), + ], + ) + def test_encode_decimal(self, value, double_precision): + sut = decimal.Decimal(value) + encoded = ujson.ujson_dumps(sut, double_precision=double_precision) decoded = ujson.ujson_loads(encoded) - assert decoded == 1.0 + assert decoded == value @pytest.mark.parametrize("ensure_ascii", [True, False]) def test_encode_string_conversion(self, ensure_ascii): @@ -177,9 +140,7 @@ def test_encode_dict_with_unicode_keys(self, unicode_key): unicode_dict = {unicode_key: "value1"} assert unicode_dict == ujson.ujson_loads(ujson.ujson_dumps(unicode_dict)) - @pytest.mark.parametrize( - "double_input", [math.pi, -math.pi] # Should work with negatives too. 
- ) + @pytest.mark.parametrize("double_input", [math.pi, -math.pi]) def test_encode_double_conversion(self, double_input): output = ujson.ujson_dumps(double_input) assert round(double_input, 5) == round(json.loads(output), 5) @@ -372,6 +333,7 @@ def test_encode_time_conversion_basic(self, test): def test_encode_time_conversion_pytz(self): # see gh-11473: to_json segfaults with timezone-aware datetimes + pytz = pytest.importorskip("pytz") test = datetime.time(10, 12, 15, 343243, pytz.utc) output = ujson.ujson_dumps(test) expected = f'"{test.isoformat()}"' @@ -519,9 +481,7 @@ def test_decode_invalid_dict(self, invalid_dict): with pytest.raises(ValueError, match=msg): ujson.ujson_loads(invalid_dict) - @pytest.mark.parametrize( - "numeric_int_as_str", ["31337", "-31337"] # Should work with negatives. - ) + @pytest.mark.parametrize("numeric_int_as_str", ["31337", "-31337"]) def test_decode_numeric_int(self, numeric_int_as_str): assert int(numeric_int_as_str) == ujson.ujson_loads(numeric_int_as_str) @@ -995,7 +955,7 @@ def test_decode_array(self, arr): def test_decode_extreme_numbers(self, extreme_num): assert extreme_num == ujson.ujson_loads(str(extreme_num)) - @pytest.mark.parametrize("too_extreme_num", [f"{2**64}", f"{-2**63-1}"]) + @pytest.mark.parametrize("too_extreme_num", [f"{2**64}", f"{-(2**63) - 1}"]) def test_decode_too_extreme_numbers(self, too_extreme_num): with pytest.raises( ValueError, @@ -1010,7 +970,7 @@ def test_decode_with_trailing_non_whitespaces(self): with pytest.raises(ValueError, match="Trailing data"): ujson.ujson_loads("{}\n\t a") - @pytest.mark.parametrize("value", [f"{2**64}", f"{-2**63-1}"]) + @pytest.mark.parametrize("value", [f"{2**64}", f"{-(2**63) - 1}"]) def test_decode_array_with_big_int(self, value): with pytest.raises( ValueError, @@ -1040,11 +1000,7 @@ def test_decode_floating_point(self, sign, float_number): ) def test_encode_big_set(self): - s = set() - - for x in range(100000): - s.add(x) - + s = set(range(100000)) # Make sure no Exception is raised. ujson.ujson_dumps(s) diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py index 5e47bcc1c5b0e..65ad7273666e5 100644 --- a/pandas/tests/io/parser/common/test_chunksize.py +++ b/pandas/tests/io/parser/common/test_chunksize.py @@ -2,6 +2,7 @@ Tests that work on both the Python and C engines but do not have a specific classification into the other test modules. """ + from io import StringIO import numpy as np @@ -220,21 +221,15 @@ def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch): data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + integers) # Coercions should work without warnings. 
- warn = None - if parser.engine == "pyarrow": - warn = DeprecationWarning - depr_msg = "Passing a BlockManager to DataFrame" - with tm.assert_produces_warning(warn, match=depr_msg, check_stacklevel=False): - with monkeypatch.context() as m: - m.setattr(libparsers, "DEFAULT_BUFFER_HEURISTIC", heuristic) - result = parser.read_csv(StringIO(data)) + with monkeypatch.context() as m: + m.setattr(libparsers, "DEFAULT_BUFFER_HEURISTIC", heuristic) + result = parser.read_csv(StringIO(data)) assert type(result.a[0]) is np.float64 assert result.a.dtype == float -@pytest.mark.filterwarnings("ignore:make_block is deprecated:FutureWarning") -def test_warn_if_chunks_have_mismatched_type(all_parsers): +def test_warn_if_chunks_have_mismatched_type(all_parsers, using_infer_string): warning_type = None parser = all_parsers size = 10000 @@ -252,21 +247,22 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers): buf = StringIO(data) if parser.engine == "pyarrow": - df = parser.read_csv_check_warnings( - DeprecationWarning, - "Passing a BlockManager to DataFrame is deprecated", + df = parser.read_csv( buf, - check_stacklevel=False, ) else: df = parser.read_csv_check_warnings( warning_type, - r"Columns \(0\) have mixed types. " + r"Columns \(0: a\) have mixed types. " "Specify dtype option on import or set low_memory=False.", buf, ) - - assert df.a.dtype == object + if parser.engine == "c" and parser.low_memory: + assert df.a.dtype == object + elif using_infer_string: + assert df.a.dtype == "str" + else: + assert df.a.dtype == object @pytest.mark.parametrize("iterator", [True, False]) diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 7ffc49e941c14..3680273f5e98a 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -2,6 +2,7 @@ Tests that work on both the Python and C engines but do not have a specific classification into the other test modules. """ + from datetime import datetime from inspect import signature from io import StringIO @@ -12,6 +13,9 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + +from pandas.compat import HAS_PYARROW from pandas.errors import ( EmptyDataError, ParserError, @@ -21,14 +25,10 @@ from pandas import ( DataFrame, Index, - Timestamp, compat, ) import pandas._testing as tm -from pandas.io.parsers import TextFileReader -from pandas.io.parsers.c_parser_wrapper import CParserWrapper - pytestmark = pytest.mark.filterwarnings( "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) @@ -37,62 +37,13 @@ skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") -def test_override_set_noconvert_columns(): - # see gh-17351 - # - # Usecols needs to be sorted in _set_noconvert_columns based - # on the test_usecols_with_parse_dates test from test_usecols.py - class MyTextFileReader(TextFileReader): - def __init__(self) -> None: - self._currow = 0 - self.squeeze = False - - class MyCParserWrapper(CParserWrapper): - def _set_noconvert_columns(self): - if self.usecols_dtype == "integer": - # self.usecols is a set, which is documented as unordered - # but in practice, a CPython set of integers is sorted. - # In other implementations this assumption does not hold. 
- # The following code simulates a different order, which - # before GH 17351 would cause the wrong columns to be - # converted via the parse_dates parameter - self.usecols = list(self.usecols) - self.usecols.reverse() - return CParserWrapper._set_noconvert_columns(self) - - data = """a,b,c,d,e -0,1,2014-01-01,09:00,4 -0,1,2014-01-02,10:00,4""" - - parse_dates = [[1, 2]] - cols = { - "a": [0, 0], - "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], - } - expected = DataFrame(cols, columns=["c_d", "a"]) - - parser = MyTextFileReader() - parser.options = { - "usecols": [0, 2, 3], - "parse_dates": parse_dates, - "delimiter": ",", - } - parser.engine = "c" - parser._engine = MyCParserWrapper(StringIO(data), **parser.options) - - result = parser.read() - tm.assert_frame_equal(result, expected) - - def test_read_csv_local(all_parsers, csv1): prefix = "file:///" if compat.is_platform_windows() else "file://" parser = all_parsers fname = prefix + str(os.path.abspath(csv1)) result = parser.read_csv(fname, index_col=0, parse_dates=True) - # TODO: make unit check more specific - if parser.engine == "pyarrow": - result.index = result.index.as_unit("ns") + expected = DataFrame( [ [0.980269, 3.685731, -0.364216805298, -1.159738], @@ -114,6 +65,7 @@ def test_read_csv_local(all_parsers, csv1): datetime(2000, 1, 10), datetime(2000, 1, 11), ], + dtype="M8[s]", name="index", ), ) @@ -194,9 +146,6 @@ def test_read_csv_low_memory_no_rows_with_index(all_parsers): def test_read_csv_dataframe(all_parsers, csv1): parser = all_parsers result = parser.read_csv(csv1, index_col=0, parse_dates=True) - # TODO: make unit check more specific - if parser.engine == "pyarrow": - result.index = result.index.as_unit("ns") expected = DataFrame( [ [0.980269, 3.685731, -0.364216805298, -1.159738], @@ -218,6 +167,7 @@ def test_read_csv_dataframe(all_parsers, csv1): datetime(2000, 1, 10), datetime(2000, 1, 11), ], + dtype="M8[s]", name="index", ), ) @@ -472,65 +422,43 @@ def test_read_empty_with_usecols(all_parsers, data, kwargs, expected): @pytest.mark.parametrize( - "kwargs,expected", + "kwargs,expected_data", [ # gh-8661, gh-8679: this should ignore six lines, including # lines with trailing whitespace and blank lines. ( { "header": None, - "delim_whitespace": True, + "sep": r"\s+", "skiprows": [0, 1, 2, 3, 5, 6], "skip_blank_lines": True, }, - DataFrame([[1.0, 2.0, 4.0], [5.1, np.nan, 10.0]]), + [[1.0, 2.0, 4.0], [5.1, np.nan, 10.0]], ), # gh-8983: test skipping set of rows after a row with trailing spaces. 
( { - "delim_whitespace": True, + "sep": r"\s+", "skiprows": [1, 2, 3, 5, 6], "skip_blank_lines": True, }, - DataFrame({"A": [1.0, 5.1], "B": [2.0, np.nan], "C": [4.0, 10]}), + {"A": [1.0, 5.1], "B": [2.0, np.nan], "C": [4.0, 10]}, ), ], ) -def test_trailing_spaces(all_parsers, kwargs, expected): +def test_trailing_spaces(all_parsers, kwargs, expected_data): data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n" # noqa: E501 parser = all_parsers - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - if parser.engine == "pyarrow": - msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine" - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - parser.read_csv(StringIO(data.replace(",", " ")), **kwargs) + with pytest.raises(ValueError, match="the 'pyarrow' engine does not support"): + parser.read_csv(StringIO(data.replace(",", " ")), **kwargs) return - - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv(StringIO(data.replace(",", " ")), **kwargs) + expected = DataFrame(expected_data) + result = parser.read_csv(StringIO(data.replace(",", " ")), **kwargs) tm.assert_frame_equal(result, expected) -def test_raise_on_sep_with_delim_whitespace(all_parsers): - # see gh-6607 - data = "a b c\n1 2 3" - parser = all_parsers - - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - with pytest.raises(ValueError, match="you can only specify one"): - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True) - - def test_read_filepath_or_buffer(all_parsers): # see gh-43366 parser = all_parsers @@ -539,8 +467,7 @@ def test_read_filepath_or_buffer(all_parsers): parser.read_csv(filepath_or_buffer=b"input") -@pytest.mark.parametrize("delim_whitespace", [True, False]) -def test_single_char_leading_whitespace(all_parsers, delim_whitespace): +def test_single_char_leading_whitespace(all_parsers): # see gh-9710 parser = all_parsers data = """\ @@ -550,28 +477,16 @@ def test_single_char_leading_whitespace(all_parsers, delim_whitespace): a b\n""" - expected = DataFrame({"MyColumn": list("abab")}) - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - if parser.engine == "pyarrow": msg = "The 'skipinitialspace' option is not supported with the 'pyarrow' engine" with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - parser.read_csv( - StringIO(data), - skipinitialspace=True, - delim_whitespace=delim_whitespace, - ) + parser.read_csv( + StringIO(data), + skipinitialspace=True, + ) return - - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( - StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace - ) + expected = DataFrame({"MyColumn": list("abab")}) + result = parser.read_csv(StringIO(data), skipinitialspace=True, sep=r"\s+") tm.assert_frame_equal(result, expected) @@ -814,49 +729,6 @@ def test_read_csv_names_not_accepting_sets(all_parsers): parser.read_csv(StringIO(data), names=set("QAZ")) -def test_read_table_delim_whitespace_default_sep(all_parsers): - # GH: 35958 - f = StringIO("a b c\n1 -2 -3\n4 5 6") - parser = all_parsers 
- - depr_msg = "The 'delim_whitespace' keyword in pd.read_table is deprecated" - - if parser.engine == "pyarrow": - msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine" - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - parser.read_table(f, delim_whitespace=True) - return - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_table(f, delim_whitespace=True) - expected = DataFrame({"a": [1, 4], "b": [-2, 5], "c": [-3, 6]}) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("delimiter", [",", "\t"]) -def test_read_csv_delim_whitespace_non_default_sep(all_parsers, delimiter): - # GH: 35958 - f = StringIO("a b c\n1 -2 -3\n4 5 6") - parser = all_parsers - msg = ( - "Specified a delimiter with both sep and " - "delim_whitespace=True; you can only specify one." - ) - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - with pytest.raises(ValueError, match=msg): - parser.read_csv(f, delim_whitespace=True, sep=delimiter) - - with pytest.raises(ValueError, match=msg): - parser.read_csv(f, delim_whitespace=True, delimiter=delimiter) - - def test_read_csv_delimiter_and_sep_no_default(all_parsers): # GH#39823 f = StringIO("a,b\n1,2") @@ -882,26 +754,6 @@ def test_read_csv_line_break_as_separator(kwargs, all_parsers): parser.read_csv(StringIO(data), **kwargs) -@pytest.mark.parametrize("delimiter", [",", "\t"]) -def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter): - # GH: 35958 - f = StringIO("a b c\n1 -2 -3\n4 5 6") - parser = all_parsers - msg = ( - "Specified a delimiter with both sep and " - "delim_whitespace=True; you can only specify one." - ) - depr_msg = "The 'delim_whitespace' keyword in pd.read_table is deprecated" - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - with pytest.raises(ValueError, match=msg): - parser.read_table(f, delim_whitespace=True, sep=delimiter) - - with pytest.raises(ValueError, match=msg): - parser.read_table(f, delim_whitespace=True, delimiter=delimiter) - - @skip_pyarrow def test_dict_keys_as_names(all_parsers): # GH: 36928 @@ -915,6 +767,7 @@ def test_dict_keys_as_names(all_parsers): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") @xfail_pyarrow # UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 0 def test_encoding_surrogatepass(all_parsers): # GH39017 diff --git a/pandas/tests/io/parser/common/test_data_list.py b/pandas/tests/io/parser/common/test_data_list.py index 3b0ff9e08d349..bf9293ddd841d 100644 --- a/pandas/tests/io/parser/common/test_data_list.py +++ b/pandas/tests/io/parser/common/test_data_list.py @@ -2,6 +2,7 @@ Tests that work on both the Python and C engines but do not have a specific classification into the other test modules. """ + import csv from io import StringIO diff --git a/pandas/tests/io/parser/common/test_decimal.py b/pandas/tests/io/parser/common/test_decimal.py index 4ceca037f589a..eb6c97097e5fb 100644 --- a/pandas/tests/io/parser/common/test_decimal.py +++ b/pandas/tests/io/parser/common/test_decimal.py @@ -2,6 +2,7 @@ Tests that work on both the Python and C engines but do not have a specific classification into the other test modules. 
""" + from io import StringIO import pytest diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py index a7a8d031da215..cef57318195ec 100644 --- a/pandas/tests/io/parser/common/test_file_buffer_url.py +++ b/pandas/tests/io/parser/common/test_file_buffer_url.py @@ -2,6 +2,7 @@ Tests that work on both the Python and C engines but do not have a specific classification into the other test modules. """ + from io import ( BytesIO, StringIO, @@ -14,6 +15,7 @@ import numpy as np import pytest +from pandas.compat import WASM from pandas.errors import ( EmptyDataError, ParserError, @@ -72,27 +74,14 @@ def test_path_path_lib(all_parsers): parser = all_parsers df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) result = tm.round_trip_pathlib(df.to_csv, lambda p: parser.read_csv(p, index_col=0)) tm.assert_frame_equal(df, result) -@xfail_pyarrow # AssertionError: DataFrame.index are different -def test_path_local_path(all_parsers): - parser = all_parsers - df = DataFrame( - 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), - ) - result = tm.round_trip_localpath( - df.to_csv, lambda p: parser.read_csv(p, index_col=0) - ) - tm.assert_frame_equal(df, result) - - +@pytest.mark.skipif(WASM, reason="limited file system access on WASM") def test_nonexistent_path(all_parsers): # gh-2428: pls no segfault # gh-14086: raise more helpful FileNotFoundError @@ -106,6 +95,7 @@ def test_nonexistent_path(all_parsers): assert path == e.value.filename +@pytest.mark.skipif(WASM, reason="limited file system access on WASM") @td.skip_if_windows # os.chmod does not work in windows def test_no_permission(all_parsers): # GH 23784 @@ -247,12 +237,12 @@ def test_eof_states(all_parsers, data, kwargs, expected, msg, request): tm.assert_frame_equal(result, expected) -def test_temporary_file(all_parsers): +def test_temporary_file(all_parsers, temp_file): # see gh-13398 parser = all_parsers data = "0 0" - with tm.ensure_clean(mode="w+", return_filelike=True) as new_file: + with temp_file.open(mode="w+", encoding="utf-8") as new_file: new_file.write(data) new_file.flush() new_file.seek(0) @@ -425,7 +415,7 @@ def test_context_manager(all_parsers, datapath): try: with reader: next(reader) - assert False + raise AssertionError except AssertionError: assert reader.handles.handle.closed @@ -446,13 +436,13 @@ def test_context_manageri_user_provided(all_parsers, datapath): try: with reader: next(reader) - assert False + raise AssertionError except AssertionError: assert not reader.handles.handle.closed @skip_pyarrow # ParserError: Empty CSV file -def test_file_descriptor_leak(all_parsers, using_copy_on_write): +def test_file_descriptor_leak(all_parsers): # GH 31488 parser = all_parsers with tm.ensure_clean() as path: diff --git a/pandas/tests/io/parser/common/test_float.py b/pandas/tests/io/parser/common/test_float.py index 6069c23936297..4e0b61577f9e7 100644 --- a/pandas/tests/io/parser/common/test_float.py +++ b/pandas/tests/io/parser/common/test_float.py @@ -2,6 +2,7 @@ Tests that work on both the Python and C engines but do not have a specific classification into the other test modules. 
""" + from io import StringIO import numpy as np diff --git a/pandas/tests/io/parser/common/test_index.py b/pandas/tests/io/parser/common/test_index.py index 038c684c90c9e..cfa8785b24bde 100644 --- a/pandas/tests/io/parser/common/test_index.py +++ b/pandas/tests/io/parser/common/test_index.py @@ -2,6 +2,7 @@ Tests that work on both the Python and C engines but do not have a specific classification into the other test modules. """ + from datetime import datetime from io import StringIO import os @@ -86,7 +87,9 @@ def test_pass_names_with_index(all_parsers, data, kwargs, expected): @pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) -def test_multi_index_no_level_names(all_parsers, index_col): +def test_multi_index_no_level_names( + request, all_parsers, index_col, using_infer_string +): data = """index1,index2,A,B,C,D foo,one,2,3,4,5 foo,two,7,8,9,10 @@ -145,20 +148,21 @@ def test_multi_index_no_level_names_implicit(all_parsers): @xfail_pyarrow # TypeError: an integer is required @pytest.mark.parametrize( - "data,expected,header", + "data,columns,header", [ - ("a,b", DataFrame(columns=["a", "b"]), [0]), + ("a,b", ["a", "b"], [0]), ( "a,b\nc,d", - DataFrame(columns=MultiIndex.from_tuples([("a", "c"), ("b", "d")])), + MultiIndex.from_tuples([("a", "c"), ("b", "d")]), [0, 1], ), ], ) @pytest.mark.parametrize("round_trip", [True, False]) -def test_multi_index_blank_df(all_parsers, data, expected, header, round_trip): +def test_multi_index_blank_df(all_parsers, data, columns, header, round_trip): # see gh-14545 parser = all_parsers + expected = DataFrame(columns=columns) data = expected.to_csv(index=False) if round_trip else data result = parser.read_csv(StringIO(data), header=header) @@ -258,7 +262,8 @@ def test_read_csv_no_index_name(all_parsers, csv_dir_path): datetime(2000, 1, 5), datetime(2000, 1, 6), datetime(2000, 1, 7), - ] + ], + dtype="M8[s]", ), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/common/test_inf.py b/pandas/tests/io/parser/common/test_inf.py index 74596b178d35d..657aa3278a442 100644 --- a/pandas/tests/io/parser/common/test_inf.py +++ b/pandas/tests/io/parser/common/test_inf.py @@ -2,15 +2,12 @@ Tests that work on both the Python and C engines but do not have a specific classification into the other test modules. 
""" + from io import StringIO -import numpy as np import pytest -from pandas import ( - DataFrame, - option_context, -) +from pandas import DataFrame import pandas._testing as tm pytestmark = pytest.mark.filterwarnings( @@ -60,19 +57,3 @@ def test_infinity_parsing(all_parsers, na_filter): ) result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter) tm.assert_frame_equal(result, expected) - - -def test_read_csv_with_use_inf_as_na(all_parsers): - # https://github.com/pandas-dev/pandas/issues/35493 - parser = all_parsers - data = "1.0\nNaN\n3.0" - msg = "use_inf_as_na option is deprecated" - warn = FutureWarning - if parser.engine == "pyarrow": - warn = (FutureWarning, DeprecationWarning) - - with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): - with option_context("use_inf_as_na", True): - result = parser.read_csv(StringIO(data), header=None) - expected = DataFrame([1.0, np.nan, 3.0]) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/common/test_ints.py b/pandas/tests/io/parser/common/test_ints.py index a3167346c64ef..9322e8d54f5b8 100644 --- a/pandas/tests/io/parser/common/test_ints.py +++ b/pandas/tests/io/parser/common/test_ints.py @@ -2,6 +2,7 @@ Tests that work on both the Python and C engines but do not have a specific classification into the other test modules. """ + from io import StringIO import numpy as np @@ -40,31 +41,29 @@ def test_int_conversion(all_parsers): ( "A,B\nTrue,1\nFalse,2\nTrue,3", {}, - DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]), + [[True, 1], [False, 2], [True, 3]], ), ( "A,B\nYES,1\nno,2\nyes,3\nNo,3\nYes,3", {"true_values": ["yes", "Yes", "YES"], "false_values": ["no", "NO", "No"]}, - DataFrame( - [[True, 1], [False, 2], [True, 3], [False, 3], [True, 3]], - columns=["A", "B"], - ), + [[True, 1], [False, 2], [True, 3], [False, 3], [True, 3]], ), ( "A,B\nTRUE,1\nFALSE,2\nTRUE,3", {}, - DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]), + [[True, 1], [False, 2], [True, 3]], ), ( "A,B\nfoo,bar\nbar,foo", {"true_values": ["foo"], "false_values": ["bar"]}, - DataFrame([[True, False], [False, True]], columns=["A", "B"]), + [[True, False], [False, True]], ), ], ) def test_parse_bool(all_parsers, data, kwargs, expected): parser = all_parsers result = parser.read_csv(StringIO(data), **kwargs) + expected = DataFrame(expected, columns=["A", "B"]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/common/test_iterator.py b/pandas/tests/io/parser/common/test_iterator.py index a521c84aa007d..668aab05b9fa4 100644 --- a/pandas/tests/io/parser/common/test_iterator.py +++ b/pandas/tests/io/parser/common/test_iterator.py @@ -2,6 +2,7 @@ Tests that work on both the Python and C engines but do not have a specific classification into the other test modules. 
""" + from io import StringIO import pytest @@ -97,6 +98,31 @@ def test_iterator_stop_on_chunksize(all_parsers): tm.assert_frame_equal(concat(result), expected) +def test_nrows_iterator_without_chunksize(all_parsers): + # GH 59079 + parser = all_parsers + data = """A,B,C +foo,1,2,3 +bar,4,5,6 +baz,7,8,9 +""" + if parser.engine == "pyarrow": + msg = "The 'iterator' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), iterator=True, nrows=2) + return + + with parser.read_csv(StringIO(data), iterator=True, nrows=2) as reader: + result = reader.get_chunk() + + expected = DataFrame( + [[1, 2, 3], [4, 5, 6]], + index=["foo", "bar"], + columns=["A", "B", "C"], + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "kwargs", [{"iterator": True, "chunksize": 1}, {"iterator": True}, {"chunksize": 1}] ) diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py index 4a4ae2b259289..a73327beea8bb 100644 --- a/pandas/tests/io/parser/common/test_read_errors.py +++ b/pandas/tests/io/parser/common/test_read_errors.py @@ -2,6 +2,7 @@ Tests that work on the Python, C and PyArrow engines but do not have a specific classification into the other test modules. """ + import codecs import csv from io import StringIO @@ -57,9 +58,12 @@ def test_bad_stream_exception(all_parsers, csv_dir_path): msg = "'utf-8' codec can't decode byte" # Stream must be binary UTF8. - with open(path, "rb") as handle, codecs.StreamRecoder( - handle, utf8.encode, utf8.decode, codec.streamreader, codec.streamwriter - ) as stream: + with ( + open(path, "rb") as handle, + codecs.StreamRecoder( + handle, utf8.encode, utf8.decode, codec.streamreader, codec.streamwriter + ) as stream, + ): with pytest.raises(UnicodeDecodeError, match=msg): parser.read_csv(stream) @@ -127,17 +131,11 @@ def test_catch_too_many_names(all_parsers): msg = ( "Too many columns specified: expected 4 and found 3" if parser.engine == "c" - else "Number of passed names did not match " - "number of header fields in the file" + else "Number of passed names did not match number of header fields in the file" ) - depr_msg = "Passing a BlockManager to DataFrame is deprecated" - warn = None - if parser.engine == "pyarrow": - warn = DeprecationWarning - with tm.assert_produces_warning(warn, match=depr_msg, check_stacklevel=False): - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"]) + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"]) @skip_pyarrow # CSV parse error: Empty CSV file or block @@ -168,13 +166,7 @@ def test_suppress_error_output(all_parsers): data = "a\n1\n1,2,3\n4\n5,6,7" expected = DataFrame({"a": [1, 4]}) - warn = None - if parser.engine == "pyarrow": - warn = DeprecationWarning - msg = "Passing a BlockManager to DataFrame" - - with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): - result = parser.read_csv(StringIO(data), on_bad_lines="skip") + result = parser.read_csv(StringIO(data), on_bad_lines="skip") tm.assert_frame_equal(result, expected) @@ -203,7 +195,6 @@ def test_warn_bad_lines(all_parsers): expected_warning = ParserWarning if parser.engine == "pyarrow": match_msg = "Expected 1 columns, but found 3: 1,2,3" - expected_warning = (ParserWarning, DeprecationWarning) with tm.assert_produces_warning( expected_warning, match=match_msg, check_stacklevel=False @@ 
-258,19 +249,17 @@ def test_null_byte_char(request, all_parsers): @pytest.mark.filterwarnings("always::ResourceWarning") -def test_open_file(request, all_parsers): +def test_open_file(all_parsers): # GH 39024 parser = all_parsers msg = "Could not determine delimiter" err = csv.Error if parser.engine == "c": - msg = "the 'c' engine does not support sep=None with delim_whitespace=False" - err = ValueError + msg = "object of type 'NoneType' has no len" + err = TypeError elif parser.engine == "pyarrow": - msg = ( - "the 'pyarrow' engine does not support sep=None with delim_whitespace=False" - ) + msg = "'utf-8' codec can't decode byte 0xe4" err = ValueError with tm.ensure_clean() as path: @@ -322,7 +311,6 @@ def test_on_bad_lines_warn_correct_formatting(all_parsers): expected_warning = ParserWarning if parser.engine == "pyarrow": match_msg = "Expected 2 columns, but found 3: a,b,c" - expected_warning = (ParserWarning, DeprecationWarning) with tm.assert_produces_warning( expected_warning, match=match_msg, check_stacklevel=False diff --git a/pandas/tests/io/parser/common/test_verbose.py b/pandas/tests/io/parser/common/test_verbose.py deleted file mode 100644 index fede54643d2dd..0000000000000 --- a/pandas/tests/io/parser/common/test_verbose.py +++ /dev/null @@ -1,81 +0,0 @@ -""" -Tests that work on both the Python and C engines but do not have a -specific classification into the other test modules. -""" -from io import StringIO - -import pytest - -import pandas._testing as tm - -depr_msg = "The 'verbose' keyword in pd.read_csv is deprecated" - - -def test_verbose_read(all_parsers, capsys): - parser = all_parsers - data = """a,b,c,d -one,1,2,3 -one,1,2,3 -,1,2,3 -one,1,2,3 -,1,2,3 -,1,2,3 -one,1,2,3 -two,1,2,3""" - - if parser.engine == "pyarrow": - msg = "The 'verbose' option is not supported with the 'pyarrow' engine" - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - parser.read_csv(StringIO(data), verbose=True) - return - - # Engines are verbose in different ways. - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - parser.read_csv(StringIO(data), verbose=True) - captured = capsys.readouterr() - - if parser.engine == "c": - assert "Tokenization took:" in captured.out - assert "Parser memory cleanup took:" in captured.out - else: # Python engine - assert captured.out == "Filled 3 NA values in column a\n" - - -def test_verbose_read2(all_parsers, capsys): - parser = all_parsers - data = """a,b,c,d -one,1,2,3 -two,1,2,3 -three,1,2,3 -four,1,2,3 -five,1,2,3 -,1,2,3 -seven,1,2,3 -eight,1,2,3""" - - if parser.engine == "pyarrow": - msg = "The 'verbose' option is not supported with the 'pyarrow' engine" - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - parser.read_csv(StringIO(data), verbose=True, index_col=0) - return - - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - parser.read_csv(StringIO(data), verbose=True, index_col=0) - captured = capsys.readouterr() - - # Engines are verbose in different ways. 
- if parser.engine == "c": - assert "Tokenization took:" in captured.out - assert "Parser memory cleanup took:" in captured.out - else: # Python engine - assert captured.out == "Filled 1 NA values in column a\n" diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 6d5f870f07206..90f77a7024235 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -4,6 +4,7 @@ import pytest +from pandas.compat import HAS_PYARROW from pandas.compat._optional import VERSIONS from pandas import ( @@ -117,7 +118,15 @@ def csv1(datapath): _py_parsers_only = [_pythonParser] _c_parsers_only = [_cParserHighMemory, _cParserLowMemory] -_pyarrow_parsers_only = [pytest.param(_pyarrowParser, marks=pytest.mark.single_cpu)] +_pyarrow_parsers_only = [ + pytest.param( + _pyarrowParser, + marks=[ + pytest.mark.single_cpu, + pytest.mark.skipif(not HAS_PYARROW, reason="pyarrow is not installed"), + ], + ) +] _all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] @@ -181,7 +190,16 @@ def _get_all_parser_float_precision_combinations(): parser = parser.values[0] for precision in parser.float_precision_choices: # Re-wrap in pytest.param for pyarrow - mark = pytest.mark.single_cpu if parser.engine == "pyarrow" else () + mark = ( + [ + pytest.mark.single_cpu, + pytest.mark.skipif( + not HAS_PYARROW, reason="pyarrow is not installed" + ), + ] + if parser.engine == "pyarrow" + else () + ) param = pytest.param((parser(), precision), marks=mark) params.append(param) ids.append(f"{parser_id}-{precision}") diff --git a/pandas/tests/io/parser/dtypes/test_categorical.py b/pandas/tests/io/parser/dtypes/test_categorical.py index f4aff14a5ce32..15cbac54ff8d9 100644 --- a/pandas/tests/io/parser/dtypes/test_categorical.py +++ b/pandas/tests/io/parser/dtypes/test_categorical.py @@ -2,6 +2,7 @@ Tests dtype specification during parsing for all of the parsers defined in parsers.py """ + from io import StringIO import os diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index ce02e752fb90b..75b7cf0d42cb8 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -2,6 +2,7 @@ Tests dtype specification during parsing for all of the parsers defined in parsers.py """ + from collections import defaultdict from io import StringIO @@ -16,21 +17,19 @@ Timestamp, ) import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - IntegerArray, - StringArray, -) +from pandas.core.arrays import IntegerArray pytestmark = pytest.mark.filterwarnings( "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + @pytest.mark.parametrize("dtype", [str, object]) @pytest.mark.parametrize("check_orig", [True, False]) @pytest.mark.usefixtures("pyarrow_xfail") -def test_dtype_all_columns(all_parsers, dtype, check_orig): +def test_dtype_all_columns(all_parsers, dtype, check_orig, using_infer_string): # see gh-3795, gh-6607 parser = all_parsers @@ -48,8 +47,10 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig): if check_orig: expected = df.copy() result = result.astype(float) - else: + elif using_infer_string and dtype is str: expected = df.astype(str) + else: + expected = df.astype(str).astype(object) tm.assert_frame_equal(result, expected) @@ -67,7 +68,6 @@ def test_dtype_per_column(all_parsers): [[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], 
columns=["one", "two"] ) expected["one"] = expected["one"].astype(np.float64) - expected["two"] = expected["two"].astype(object) result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str}) tm.assert_frame_equal(result, expected) @@ -132,16 +132,13 @@ def test_dtype_with_converters(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize( - "dtype", list(np.typecodes["AllInteger"] + np.typecodes["Float"]) -) -def test_numeric_dtype(all_parsers, dtype): +def test_numeric_dtype(all_parsers, any_real_numpy_dtype): data = "0\n1" parser = all_parsers - expected = DataFrame([0, 1], dtype=dtype) + expected = DataFrame([0, 1], dtype=any_real_numpy_dtype) - result = parser.read_csv(StringIO(data), header=None, dtype=dtype) - tm.assert_frame_equal(expected, result) + result = parser.read_csv(StringIO(data), header=None, dtype=any_real_numpy_dtype) + tm.assert_frame_equal(expected, result, check_column_type=False) @pytest.mark.usefixtures("pyarrow_xfail") @@ -460,8 +457,6 @@ def test_dtype_backend_and_dtype(all_parsers): def test_dtype_backend_string(all_parsers, string_storage): # GH#36712 - pa = pytest.importorskip("pyarrow") - with pd.option_context("mode.string_storage", string_storage): parser = all_parsers @@ -471,21 +466,13 @@ def test_dtype_backend_string(all_parsers, string_storage): """ result = parser.read_csv(StringIO(data), dtype_backend="numpy_nullable") - if string_storage == "python": - expected = DataFrame( - { - "a": StringArray(np.array(["a", "b"], dtype=np.object_)), - "b": StringArray(np.array(["x", pd.NA], dtype=np.object_)), - } - ) - else: - expected = DataFrame( - { - "a": ArrowStringArray(pa.array(["a", "b"])), - "b": ArrowStringArray(pa.array(["x", None])), - } - ) - tm.assert_frame_equal(result, expected) + expected = DataFrame( + { + "a": pd.array(["a", "b"], dtype=pd.StringDtype(string_storage)), + "b": pd.array(["x", pd.NA], dtype=pd.StringDtype(string_storage)), + }, + ) + tm.assert_frame_equal(result, expected) def test_dtype_backend_ea_dtype_specified(all_parsers): @@ -556,8 +543,7 @@ def test_ea_int_avoid_overflow(all_parsers): def test_string_inference(all_parsers): # GH#54430 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" + dtype = pd.StringDtype(na_value=np.nan) data = """a,b x,1 @@ -575,10 +561,8 @@ def test_string_inference(all_parsers): @pytest.mark.parametrize("dtype", ["O", object, "object", np.object_, str, np.str_]) -def test_string_inference_object_dtype(all_parsers, dtype): +def test_string_inference_object_dtype(all_parsers, dtype, using_infer_string): # GH#56047 - pytest.importorskip("pyarrow") - data = """a,b x,a y,a @@ -587,12 +571,13 @@ def test_string_inference_object_dtype(all_parsers, dtype): with pd.option_context("future.infer_string", True): result = parser.read_csv(StringIO(data), dtype=dtype) + expected_dtype = pd.StringDtype(na_value=np.nan) if dtype is str else object expected = DataFrame( { - "a": pd.Series(["x", "y", "z"], dtype=object), - "b": pd.Series(["a", "a", "a"], dtype=object), + "a": pd.Series(["x", "y", "z"], dtype=expected_dtype), + "b": pd.Series(["a", "a", "a"], dtype=expected_dtype), }, - columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"), + columns=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) @@ -601,14 +586,15 @@ def test_string_inference_object_dtype(all_parsers, dtype): expected = DataFrame( { - "a": pd.Series(["x", "y", "z"], dtype=object), - "b": pd.Series(["a", "a", "a"], 
dtype="string[pyarrow_numpy]"), + "a": pd.Series(["x", "y", "z"], dtype=expected_dtype), + "b": pd.Series(["a", "a", "a"], dtype=pd.StringDtype(na_value=np.nan)), }, - columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"), + columns=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_accurate_parsing_of_large_integers(all_parsers): # GH#52505 data = """SYMBOL,MOMENT,ID,ID_DEAL @@ -619,7 +605,7 @@ def test_accurate_parsing_of_large_integers(all_parsers): AMZN,20230301181139587,2023552585717889759,2023552585717263360 MSFT,20230301181139587,2023552585717889863,2023552585717263361 NVDA,20230301181139587,2023552585717889827,2023552585717263361""" - orders = pd.read_csv(StringIO(data), dtype={"ID_DEAL": pd.Int64Dtype()}) + orders = all_parsers.read_csv(StringIO(data), dtype={"ID_DEAL": pd.Int64Dtype()}) assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263358, "ID_DEAL"]) == 1 assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263359, "ID_DEAL"]) == 1 assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263360, "ID_DEAL"]) == 2 @@ -641,3 +627,16 @@ def test_dtypes_with_usecols(all_parsers): values = ["1", "4"] expected = DataFrame({"a": pd.Series(values, dtype=object), "c": [3, 6]}) tm.assert_frame_equal(result, expected) + + +def test_index_col_with_dtype_no_rangeindex(all_parsers): + data = StringIO("345.5,519.5,0\n519.5,726.5,1") + result = all_parsers.read_csv( + data, + header=None, + names=["start", "stop", "bin_id"], + dtype={"start": np.float32, "stop": np.float32, "bin_id": np.uint32}, + index_col="bin_id", + ).index + expected = pd.Index([0, 1], dtype=np.uint32, name="bin_id") + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/io/parser/dtypes/test_empty.py b/pandas/tests/io/parser/dtypes/test_empty.py index f34385b190c5f..ebc61e7f0ca2b 100644 --- a/pandas/tests/io/parser/dtypes/test_empty.py +++ b/pandas/tests/io/parser/dtypes/test_empty.py @@ -2,6 +2,7 @@ Tests dtype specification during parsing for all of the parsers defined in parsers.py """ + from io import StringIO import numpy as np @@ -125,8 +126,7 @@ def test_empty_with_dup_column_pass_dtype_by_indexes_raises(all_parsers): expected.index = expected.index.astype(object) with pytest.raises(ValueError, match="Duplicate names"): - data = "" - parser.read_csv(StringIO(data), names=["one", "one"], dtype={0: "u1", 1: "f"}) + parser.read_csv(StringIO(""), names=["one", "one"], dtype={0: "u1", 1: "f"}) @pytest.mark.parametrize( diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 27d7bc0bb6c07..11a30a26f91ef 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -4,6 +4,7 @@ these tests out of this module as soon as the Python parser can accept further arguments when parsing. 
""" + from decimal import Decimal from io import ( BytesIO, @@ -17,6 +18,7 @@ import numpy as np import pytest +from pandas.compat import WASM from pandas.compat.numpy import np_version_gte1p24 from pandas.errors import ( ParserError, @@ -51,11 +53,7 @@ def test_delim_whitespace_custom_terminator(c_parser_only): data = "a b c~1 2 3~4 5 6~7 8 9" parser = c_parser_only - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - df = parser.read_csv(StringIO(data), lineterminator="~", delim_whitespace=True) + df = parser.read_csv(StringIO(data), lineterminator="~", sep=r"\s+") expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"]) tm.assert_frame_equal(df, expected) @@ -93,15 +91,16 @@ def test_dtype_and_names_error(c_parser_only): """ # fallback casting, but not castable warning = RuntimeWarning if np_version_gte1p24 else None - with pytest.raises(ValueError, match="cannot safely convert"): - with tm.assert_produces_warning(warning, check_stacklevel=False): - parser.read_csv( - StringIO(data), - sep=r"\s+", - header=None, - names=["a", "b"], - dtype={"a": np.int32}, - ) + if not WASM: # no fp exception support in wasm + with pytest.raises(ValueError, match="cannot safely convert"): + with tm.assert_produces_warning(warning, check_stacklevel=False): + parser.read_csv( + StringIO(data), + sep=r"\s+", + header=None, + names=["a", "b"], + dtype={"a": np.int32}, + ) @pytest.mark.parametrize( @@ -183,7 +182,7 @@ def error(val: float, actual_val: Decimal) -> Decimal: assert max(precise_errors) <= max(normal_errors) -def test_usecols_dtypes(c_parser_only): +def test_usecols_dtypes(c_parser_only, using_infer_string): parser = c_parser_only data = """\ 1,2,3 @@ -208,8 +207,12 @@ def test_usecols_dtypes(c_parser_only): dtype={"b": int, "c": float}, ) - assert (result.dtypes == [object, int, float]).all() - assert (result2.dtypes == [object, float]).all() + if using_infer_string: + assert (result.dtypes == ["string", int, float]).all() + assert (result2.dtypes == ["string", float]).all() + else: + assert (result.dtypes == [object, int, float]).all() + assert (result2.dtypes == [object, float]).all() def test_disable_bool_parsing(c_parser_only): @@ -509,8 +512,8 @@ def __next__(self): def test_buffer_rd_bytes_bad_unicode(c_parser_only): # see gh-22748 - t = BytesIO(b"\xB0") - t = TextIOWrapper(t, encoding="ascii", errors="surrogateescape") + t = BytesIO(b"\xb0") + t = TextIOWrapper(t, encoding="UTF-8", errors="surrogateescape") msg = "'utf-8' codec can't encode character" with pytest.raises(UnicodeError, match=msg): c_parser_only.read_csv(t, encoding="UTF-8") @@ -549,6 +552,7 @@ def test_chunk_whitespace_on_boundary(c_parser_only): tm.assert_frame_equal(result, expected) +@pytest.mark.skipif(WASM, reason="limited file system access on WASM") def test_file_handles_mmap(c_parser_only, csv1): # gh-14418 # diff --git a/pandas/tests/io/parser/test_comment.py b/pandas/tests/io/parser/test_comment.py index abaeeb86476da..ba27b170aecdc 100644 --- a/pandas/tests/io/parser/test_comment.py +++ b/pandas/tests/io/parser/test_comment.py @@ -2,6 +2,7 @@ Tests that comments are properly handled during parsing for all of the parsers defined in parsers.py """ + from io import StringIO import numpy as np @@ -30,10 +31,8 @@ def test_comment(all_parsers, na_values): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize( - "read_kwargs", [{}, {"lineterminator": "*"}, 
{"delim_whitespace": True}] -) -def test_line_comment(all_parsers, read_kwargs, request): +@pytest.mark.parametrize("read_kwargs", [{}, {"lineterminator": "*"}, {"sep": r"\s+"}]) +def test_line_comment(all_parsers, read_kwargs): parser = all_parsers data = """# empty A,B,C @@ -41,12 +40,8 @@ def test_line_comment(all_parsers, read_kwargs, request): #ignore this line 5.,NaN,10.0 """ - warn = None - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - - if read_kwargs.get("delim_whitespace"): + if read_kwargs.get("sep"): data = data.replace(",", " ") - warn = FutureWarning elif read_kwargs.get("lineterminator"): data = data.replace("\n", read_kwargs.get("lineterminator")) @@ -59,23 +54,15 @@ def test_line_comment(all_parsers, read_kwargs, request): else: msg = "The 'comment' option is not supported with the 'pyarrow' engine" with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning( - warn, match=depr_msg, check_stacklevel=False - ): - parser.read_csv(StringIO(data), **read_kwargs) + parser.read_csv(StringIO(data), **read_kwargs) return elif parser.engine == "python" and read_kwargs.get("lineterminator"): msg = r"Custom line terminators not supported in python parser \(yet\)" with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning( - warn, match=depr_msg, check_stacklevel=False - ): - parser.read_csv(StringIO(data), **read_kwargs) + parser.read_csv(StringIO(data), **read_kwargs) return - with tm.assert_produces_warning(warn, match=depr_msg, check_stacklevel=False): - result = parser.read_csv(StringIO(data), **read_kwargs) - + result = parser.read_csv(StringIO(data), **read_kwargs) expected = DataFrame( [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] ) diff --git a/pandas/tests/io/parser/test_concatenate_chunks.py b/pandas/tests/io/parser/test_concatenate_chunks.py index 1bae2317a2fc6..3e418faeff8e0 100644 --- a/pandas/tests/io/parser/test_concatenate_chunks.py +++ b/pandas/tests/io/parser/test_concatenate_chunks.py @@ -16,7 +16,7 @@ def test_concatenate_chunks_pyarrow(): {0: ArrowExtensionArray(pa.array([1.5, 2.5]))}, {0: ArrowExtensionArray(pa.array([1, 2]))}, ] - result = _concatenate_chunks(chunks) + result = _concatenate_chunks(chunks, ["column_0", "column_1"]) expected = ArrowExtensionArray(pa.array([1.5, 2.5, 1.0, 2.0])) tm.assert_extension_array_equal(result[0], expected) @@ -28,8 +28,10 @@ def test_concatenate_chunks_pyarrow_strings(): {0: ArrowExtensionArray(pa.array([1.5, 2.5]))}, {0: ArrowExtensionArray(pa.array(["a", "b"]))}, ] - with tm.assert_produces_warning(DtypeWarning, match="have mixed types"): - result = _concatenate_chunks(chunks) + with tm.assert_produces_warning( + DtypeWarning, match="Columns \\(0: column_0\\) have mixed types" + ): + result = _concatenate_chunks(chunks, ["column_0", "column_1"]) expected = np.concatenate( [np.array([1.5, 2.5], dtype=object), np.array(["a", "b"])] ) diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py index 7f3e45324dbd2..c6ba2213033ea 100644 --- a/pandas/tests/io/parser/test_converters.py +++ b/pandas/tests/io/parser/test_converters.py @@ -2,6 +2,7 @@ Tests column conversion functionality during parsing for all of the parsers defined in parsers.py """ + from io import StringIO from dateutil.parser import parse @@ -32,7 +33,8 @@ def test_converters_type_must_be_dict(all_parsers): @pytest.mark.parametrize("column", [3, "D"]) @pytest.mark.parametrize( - "converter", [parse, lambda x: int(x.split("/")[2])] # Produce 
integer. + "converter", + [parse, lambda x: int(x.split("/")[2])], # Produce integer. ) def test_converters(all_parsers, column, converter): parser = all_parsers @@ -84,9 +86,9 @@ def test_converters_euro_decimal_format(all_parsers): 1;1521,1541;187101,9543;ABC;poi;4,7387 2;121,12;14897,76;DEF;uyt;0,3773 3;878,158;108013,434;GHI;rez;2,7356""" - converters["Number1"] = converters["Number2"] = converters[ - "Number3" - ] = lambda x: float(x.replace(",", ".")) + converters["Number1"] = converters["Number2"] = converters["Number3"] = ( + lambda x: float(x.replace(",", ".")) + ) if parser.engine == "pyarrow": msg = "The 'converters' option is not supported with the 'pyarrow' engine" @@ -202,7 +204,7 @@ def test_converter_index_col_bug(all_parsers, conv_f): StringIO(data), sep=";", index_col="A", converters={"A": conv_f} ) - xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A", dtype="object")) + xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A")) tm.assert_frame_equal(rs, xp) diff --git a/pandas/tests/io/parser/test_dialect.py b/pandas/tests/io/parser/test_dialect.py index 7a72e66996d43..803114723bc74 100644 --- a/pandas/tests/io/parser/test_dialect.py +++ b/pandas/tests/io/parser/test_dialect.py @@ -26,7 +26,7 @@ def custom_dialect(): "escapechar": "~", "delimiter": ":", "skipinitialspace": False, - "quotechar": "~", + "quotechar": "`", "quoting": 3, } return dialect_name, dialect_kwargs diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py index cbd3917ba9c04..5df8c3d27bf84 100644 --- a/pandas/tests/io/parser/test_encoding.py +++ b/pandas/tests/io/parser/test_encoding.py @@ -2,6 +2,7 @@ Tests encoding functionality during parsing for all of the parsers defined in parsers.py """ + from io import ( BytesIO, TextIOWrapper, @@ -57,9 +58,7 @@ def test_utf16_bom_skiprows(all_parsers, sep, encoding): skip this too A,B,C 1,2,3 -4,5,6""".replace( - ",", sep - ) +4,5,6""".replace(",", sep) path = f"__{uuid.uuid4()}__.csv" kwargs = {"sep": sep, "skiprows": 2} utf8 = "utf-8" @@ -99,22 +98,22 @@ def test_unicode_encoding(all_parsers, csv_dir_path): "data,kwargs,expected", [ # Basic test - ("a\n1", {}, DataFrame({"a": [1]})), + ("a\n1", {}, [1]), # "Regular" quoting - ('"a"\n1', {"quotechar": '"'}, DataFrame({"a": [1]})), + ('"a"\n1', {"quotechar": '"'}, [1]), # Test in a data row instead of header - ("b\n1", {"names": ["a"]}, DataFrame({"a": ["b", "1"]})), + ("b\n1", {"names": ["a"]}, ["b", "1"]), # Test in empty data row with skipping - ("\n1", {"names": ["a"], "skip_blank_lines": True}, DataFrame({"a": [1]})), + ("\n1", {"names": ["a"], "skip_blank_lines": True}, [1]), # Test in empty data row without skipping ( "\n1", {"names": ["a"], "skip_blank_lines": False}, - DataFrame({"a": [np.nan, 1]}), + [np.nan, 1], ), ], ) -def test_utf8_bom(all_parsers, data, kwargs, expected, request): +def test_utf8_bom(all_parsers, data, kwargs, expected): # see gh-4793 parser = all_parsers bom = "\ufeff" @@ -133,6 +132,7 @@ def _encode_data_with_bom(_data): pytest.skip(reason="https://github.com/apache/arrow/issues/38676") result = parser.read_csv(_encode_data_with_bom(data), encoding=utf8, **kwargs) + expected = DataFrame({"a": expected}) tm.assert_frame_equal(result, expected) @@ -181,7 +181,9 @@ def test_binary_mode_file_buffers(all_parsers, file_path, encoding, datapath): @pytest.mark.parametrize("pass_encoding", [True, False]) -def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding): +def test_encoding_temp_file( + all_parsers, 
utf_value, encoding_fmt, pass_encoding, temp_file +): # see gh-24130 parser = all_parsers encoding = encoding_fmt.format(utf_value) @@ -192,7 +194,7 @@ def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding) expected = DataFrame({"foo": ["bar"]}) - with tm.ensure_clean(mode="w+", encoding=encoding, return_filelike=True) as f: + with temp_file.open(mode="w+", encoding=encoding) as f: f.write("foo\nbar") f.seek(0) diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index 0dbd4e3569ad6..d333aef723de2 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -32,8 +32,7 @@ def test_read_with_bad_header(all_parsers): msg = r"but only \d+ lines in file" with pytest.raises(ValueError, match=msg): - s = StringIO(",,") - parser.read_csv(s, header=[10]) + parser.read_csv(StringIO(",,"), header=[10]) def test_negative_header(all_parsers): @@ -163,7 +162,7 @@ def test_header_multi_index(all_parsers): {"index_col": ["foo", "bar"]}, ( "index_col must only contain " - "row numbers when specifying " + "integers of column positions when specifying " "a multi-index header" ), ), @@ -369,7 +368,7 @@ def test_header_multi_index_common_format_malformed2(all_parsers): parser = all_parsers expected = DataFrame( np.array([[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"), - index=Index([1, 7]), + index=range(1, 13, 6), columns=MultiIndex( levels=[["a", "b", "c"], ["r", "s", "t", "u", "v"]], codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]], @@ -539,7 +538,7 @@ def test_mangles_multi_index(all_parsers, data, expected): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # TypeError: an integer is requireds +@xfail_pyarrow # TypeError: an integer is required @pytest.mark.parametrize("index_col", [None, [0]]) @pytest.mark.parametrize( "columns", [None, (["", "Unnamed"]), (["Unnamed", ""]), (["Unnamed", "NotUnnamed"])] @@ -671,7 +670,7 @@ def test_header_none_and_on_bad_lines_skip(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # TypeError: an integer is requireds +@xfail_pyarrow # TypeError: an integer is required def test_header_missing_rows(all_parsers): # GH#47400 parser = all_parsers @@ -683,7 +682,7 @@ def test_header_missing_rows(all_parsers): parser.read_csv(StringIO(data), header=[0, 1, 2]) -# ValueError: The 'delim_whitespace' option is not supported with the 'pyarrow' engine +# ValueError: the 'pyarrow' engine does not support regex separators @xfail_pyarrow def test_header_multiple_whitespaces(all_parsers): # GH#54931 @@ -696,7 +695,7 @@ def test_header_multiple_whitespaces(all_parsers): tm.assert_frame_equal(result, expected) -# ValueError: The 'delim_whitespace' option is not supported with the 'pyarrow' engine +# ValueError: the 'pyarrow' engine does not support regex separators @xfail_pyarrow def test_header_delim_whitespace(all_parsers): # GH#54918 @@ -705,12 +704,7 @@ def test_header_delim_whitespace(all_parsers): 1,2 3,4 """ - - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv(StringIO(data), delim_whitespace=True) + result = parser.read_csv(StringIO(data), sep=r"\s+") expected = DataFrame({"a,b": ["1,2", "3,4"]}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index ba15d061b2deb..9977e2b8e1a1d 100644 --- 
a/pandas/tests/io/parser/test_index_col.py
+++ b/pandas/tests/io/parser/test_index_col.py
@@ -3,6 +3,7 @@
 is properly handled or inferred during parsing
 for all of the parsers defined in parsers.py
 """
+
 from io import StringIO
 
 import numpy as np
@@ -352,7 +353,7 @@ def test_specify_dtype_for_index_col(all_parsers, dtype, val, request):
             pytest.mark.xfail(reason="Cannot disable type-inference for pyarrow engine")
         )
     result = parser.read_csv(StringIO(data), index_col="a", dtype={"a": dtype})
-    expected = DataFrame({"b": [2]}, index=Index([val], name="a"))
+    expected = DataFrame({"b": [2]}, index=Index([val], name="a", dtype=dtype))
     tm.assert_frame_equal(result, expected)
 
 
diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py
index 1d245f81f027c..55c8bbc4bb9e1 100644
--- a/pandas/tests/io/parser/test_mangle_dupes.py
+++ b/pandas/tests/io/parser/test_mangle_dupes.py
@@ -3,11 +3,15 @@
 CSV engine. In general, the expected result is that they are either thoroughly
 de-duplicated (if mangling requested) or ignored otherwise.
 """
+
 from io import StringIO
 
 import pytest
 
-from pandas import DataFrame
+from pandas import (
+    DataFrame,
+    Index,
+)
 import pandas._testing as tm
 
 xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
@@ -129,10 +133,10 @@ def test_mangled_unnamed_placeholders(all_parsers):
 
     # This test recursively updates `df`.
     for i in range(3):
-        expected = DataFrame()
+        expected = DataFrame(columns=Index([], dtype="str"))
 
         for j in range(i + 1):
-            col_name = "Unnamed: 0" + f".{1*j}" * min(j, 1)
+            col_name = "Unnamed: 0" + f".{1 * j}" * min(j, 1)
             expected.insert(loc=0, column=col_name, value=[0, 1, 2])
 
         expected[orig_key] = orig_value
diff --git a/pandas/tests/io/parser/test_multi_thread.py b/pandas/tests/io/parser/test_multi_thread.py
index da9b9bddd30cd..348c19ac0f0c6 100644
--- a/pandas/tests/io/parser/test_multi_thread.py
+++ b/pandas/tests/io/parser/test_multi_thread.py
@@ -2,6 +2,7 @@
 Tests multithreading behaviour for reading and parsing files
 for each parser defined in parsers.py
 """
+
 from contextlib import ExitStack
 from io import BytesIO
 from multiprocessing.pool import ThreadPool
@@ -12,6 +13,7 @@
 import pandas as pd
 from pandas import DataFrame
 import pandas._testing as tm
+from pandas.util.version import Version
 
 xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
 
@@ -23,10 +25,16 @@
 ]
 
 
-@xfail_pyarrow  # ValueError: Found non-unique column index
-def test_multi_thread_string_io_read_csv(all_parsers):
+@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
+def test_multi_thread_string_io_read_csv(all_parsers, request):
     # see gh-11786
     parser = all_parsers
+    if parser.engine == "pyarrow":
+        pa = pytest.importorskip("pyarrow")
+        if Version(pa.__version__) < Version("16.0"):
+            request.applymarker(
+                pytest.mark.xfail(reason="ValueError: Found non-unique column index")
+            )
     max_row_range = 100
     num_files = 10
 
@@ -144,7 +152,8 @@ def test_multi_thread_path_multipart_read_csv(all_parsers):
         with tm.ensure_clean(file_name) as path:
             df.to_csv(path)
 
-            final_dataframe = _generate_multi_thread_dataframe(
-                parser, path, num_rows, num_tasks
-            )
-            tm.assert_frame_equal(df, final_dataframe)
+            result = _generate_multi_thread_dataframe(parser, path, num_rows, num_tasks)
+
+            expected = df[:]
+            expected["date"] = expected["date"].astype("M8[s]")
+            tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py
index ca106fa772e82..213fa2c01cef4
100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -2,6 +2,7 @@ Tests that NA values are properly handled during parsing for all of the parsers defined in parsers.py """ + from io import StringIO import numpy as np @@ -263,47 +264,41 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected): [ ( {}, - DataFrame( - { - "A": ["a", "b", np.nan, "d", "e", np.nan, "g"], - "B": [1, 2, 3, 4, 5, 6, 7], - "C": ["one", "two", "three", np.nan, "five", np.nan, "seven"], - } - ), + { + "A": ["a", "b", np.nan, "d", "e", np.nan, "g"], + "B": [1, 2, 3, 4, 5, 6, 7], + "C": ["one", "two", "three", np.nan, "five", np.nan, "seven"], + }, ), ( {"na_values": {"A": [], "C": []}, "keep_default_na": False}, - DataFrame( - { - "A": ["a", "b", "", "d", "e", "nan", "g"], - "B": [1, 2, 3, 4, 5, 6, 7], - "C": ["one", "two", "three", "nan", "five", "", "seven"], - } - ), + { + "A": ["a", "b", "", "d", "e", "nan", "g"], + "B": [1, 2, 3, 4, 5, 6, 7], + "C": ["one", "two", "three", "nan", "five", "", "seven"], + }, ), ( {"na_values": ["a"], "keep_default_na": False}, - DataFrame( - { - "A": [np.nan, "b", "", "d", "e", "nan", "g"], - "B": [1, 2, 3, 4, 5, 6, 7], - "C": ["one", "two", "three", "nan", "five", "", "seven"], - } - ), + { + "A": [np.nan, "b", "", "d", "e", "nan", "g"], + "B": [1, 2, 3, 4, 5, 6, 7], + "C": ["one", "two", "three", "nan", "five", "", "seven"], + }, ), ( {"na_values": {"A": [], "C": []}}, - DataFrame( - { - "A": ["a", "b", np.nan, "d", "e", np.nan, "g"], - "B": [1, 2, 3, 4, 5, 6, 7], - "C": ["one", "two", "three", np.nan, "five", np.nan, "seven"], - } - ), + { + "A": ["a", "b", np.nan, "d", "e", np.nan, "g"], + "B": [1, 2, 3, 4, 5, 6, 7], + "C": ["one", "two", "three", np.nan, "five", np.nan, "seven"], + }, ), ], ) -def test_na_values_keep_default(all_parsers, kwargs, expected, request): +def test_na_values_keep_default( + all_parsers, kwargs, expected, request, using_infer_string +): data = """\ A,B,C a,1,one @@ -321,10 +316,12 @@ def test_na_values_keep_default(all_parsers, kwargs, expected, request): with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), **kwargs) return - mark = pytest.mark.xfail() - request.applymarker(mark) + if not using_infer_string or "na_values" in kwargs: + mark = pytest.mark.xfail() + request.applymarker(mark) result = parser.read_csv(StringIO(data), **kwargs) + expected = DataFrame(expected) tm.assert_frame_equal(result, expected) @@ -432,7 +429,6 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v tm.assert_frame_equal(result, expected) -@xfail_pyarrow # mismatched dtypes in both cases, FutureWarning in the True case @pytest.mark.parametrize( "na_filter,row_data", [ @@ -440,14 +436,21 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v (False, [["1", "A"], ["nan", "B"], ["3", "C"]]), ], ) -def test_na_values_na_filter_override(all_parsers, na_filter, row_data): +def test_na_values_na_filter_override( + request, all_parsers, na_filter, row_data, using_infer_string +): + parser = all_parsers + if parser.engine == "pyarrow": + # mismatched dtypes in both cases, FutureWarning in the True case + if not (using_infer_string and na_filter): + mark = pytest.mark.xfail(reason="pyarrow doesn't support this.") + request.applymarker(mark) data = """\ A,B 1,A nan,B 3,C """ - parser = all_parsers result = parser.read_csv(StringIO(data), na_values=["B"], na_filter=na_filter) expected = DataFrame(row_data, columns=["A", "B"]) @@ -538,6 
+541,46 @@ def test_na_values_dict_aliasing(all_parsers): tm.assert_dict_equal(na_values, na_values_copy) +def test_na_values_dict_null_column_name(all_parsers): + # see gh-57547 + parser = all_parsers + data = ",x,y\n\nMA,1,2\nNA,2,1\nOA,,3" + names = [None, "x", "y"] + na_values = dict.fromkeys(names, STR_NA_VALUES) + dtype = {None: "object", "x": "float64", "y": "float64"} + + if parser.engine == "pyarrow": + msg = "The pyarrow engine doesn't support passing a dict for na_values" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), + index_col=0, + header=0, + dtype=dtype, + names=names, + na_values=na_values, + keep_default_na=False, + ) + return + + expected = DataFrame( + {"x": [1.0, 2.0, np.nan], "y": [2.0, 1.0, 3.0]}, + index=Index(["MA", "NA", "OA"], dtype=object), + ) + + result = parser.read_csv( + StringIO(data), + index_col=0, + header=0, + dtype=dtype, + names=names, + na_values=na_values, + keep_default_na=False, + ) + + tm.assert_frame_equal(result, expected) + + def test_na_values_dict_col_index(all_parsers): # see gh-14203 data = "a\nfoo\n1" @@ -561,10 +604,10 @@ def test_na_values_dict_col_index(all_parsers): ( str(2**63) + "\n" + str(2**63 + 1), {"na_values": [2**63]}, - DataFrame([str(2**63), str(2**63 + 1)]), + [str(2**63), str(2**63 + 1)], ), - (str(2**63) + ",1" + "\n,2", {}, DataFrame([[str(2**63), 1], ["", 2]])), - (str(2**63) + "\n1", {"na_values": [2**63]}, DataFrame([np.nan, 1])), + (str(2**63) + ",1" + "\n,2", {}, [[str(2**63), 1], ["", 2]]), + (str(2**63) + "\n1", {"na_values": [2**63]}, [np.nan, 1]), ], ) def test_na_values_uint64(all_parsers, data, kwargs, expected, request): @@ -581,6 +624,7 @@ def test_na_values_uint64(all_parsers, data, kwargs, expected, request): request.applymarker(mark) result = parser.read_csv(StringIO(data), header=None, **kwargs) + expected = DataFrame(expected) tm.assert_frame_equal(result, expected) @@ -769,3 +813,21 @@ def test_bool_and_nan_to_float(all_parsers): result = parser.read_csv(StringIO(data), dtype="float") expected = DataFrame.from_dict({"0": [np.nan, 1.0, 0.0]}) tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow +@pytest.mark.parametrize( + "na_values", + [[-99.0, -99], [-99, -99.0]], +) +def test_na_values_dict_without_dtype(all_parsers, na_values): + parser = all_parsers + data = """A +-99 +-99 +-99.0 +-99.0""" + + result = parser.read_csv(StringIO(data), na_values=na_values) + expected = DataFrame({"A": [np.nan, np.nan, np.nan, np.nan]}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index 9351387dfc337..836ac71d8e865 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -2,6 +2,7 @@ Tests parsers ability to read and parse non-local files and hence require a network connection to be read. 
""" + from io import BytesIO import logging import re @@ -38,7 +39,7 @@ def test_compressed_urls( # test reading compressed urls with various engines and # extension inference if compression_only == "tar": - pytest.skip("TODO: Add tar salaraies.csv to pandas/io/parsers/data") + pytest.skip("TODO: Add tar salaries.csv to pandas/io/parsers/data") extension = compression_to_extension[compression_only] with open(datapath("io", "parser", "data", "salaries.csv" + extension), "rb") as f: @@ -74,6 +75,7 @@ def tips_df(datapath): @pytest.mark.single_cpu +@pytest.mark.network @pytest.mark.usefixtures("s3_resource") @td.skip_if_not_us_locale() class TestS3: diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index d8f362039ba13..9a15d9bc84a2e 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -4,19 +4,14 @@ """ from datetime import ( - date, datetime, timedelta, timezone, ) from io import StringIO -from dateutil.parser import parse as du_parse import numpy as np import pytest -import pytz - -from pandas._libs.tslibs import parsing import pandas as pd from pandas import ( @@ -33,425 +28,14 @@ from pandas.io.parsers import read_csv -pytestmark = [ - pytest.mark.filterwarnings( - "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" - ), - pytest.mark.filterwarnings("ignore:make_block is deprecated:DeprecationWarning"), -] +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") -@xfail_pyarrow -def test_read_csv_with_custom_date_parser(all_parsers): - # GH36111 - def __custom_date_parser(time): - time = time.astype(np.float64) - time = time.astype(int) # convert float seconds to int type - return pd.to_timedelta(time, unit="s") - - testdata = StringIO( - """time e n h - 41047.00 -98573.7297 871458.0640 389.0089 - 41048.00 -98573.7299 871458.0640 389.0089 - 41049.00 -98573.7300 871458.0642 389.0088 - 41050.00 -98573.7299 871458.0643 389.0088 - 41051.00 -98573.7302 871458.0640 389.0086 - """ - ) - result = all_parsers.read_csv_check_warnings( - FutureWarning, - "Please use 'date_format' instead", - testdata, - delim_whitespace=True, - parse_dates=True, - date_parser=__custom_date_parser, - index_col="time", - ) - time = [41047, 41048, 41049, 41050, 41051] - time = pd.TimedeltaIndex([pd.to_timedelta(i, unit="s") for i in time], name="time") - expected = DataFrame( - { - "e": [-98573.7297, -98573.7299, -98573.7300, -98573.7299, -98573.7302], - "n": [871458.0640, 871458.0640, 871458.0642, 871458.0643, 871458.0640], - "h": [389.0089, 389.0089, 389.0088, 389.0088, 389.0086], - }, - index=time, - ) - - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow -def test_read_csv_with_custom_date_parser_parse_dates_false(all_parsers): - # GH44366 - def __custom_date_parser(time): - time = time.astype(np.float64) - time = time.astype(int) # convert float seconds to int type - return pd.to_timedelta(time, unit="s") - - testdata = StringIO( - """time e - 41047.00 -93.77 - 41048.00 -95.79 - 41049.00 -98.73 - 41050.00 -93.99 - 41051.00 -97.72 - """ - ) - result = all_parsers.read_csv_check_warnings( - FutureWarning, - "Please use 'date_format' instead", - testdata, - delim_whitespace=True, - parse_dates=False, - date_parser=__custom_date_parser, - index_col="time", - ) - time = Series([41047.00, 41048.00, 41049.00, 41050.00, 41051.00], 
name="time") - expected = DataFrame( - {"e": [-93.77, -95.79, -98.73, -93.99, -97.72]}, - index=time, - ) - - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow -def test_separator_date_conflict(all_parsers): - # Regression test for gh-4678 - # - # Make sure thousands separator and - # date parsing do not conflict. - parser = all_parsers - data = "06-02-2013;13:00;1-000.215" - expected = DataFrame( - [[datetime(2013, 6, 2, 13, 0, 0), 1000.215]], columns=["Date", 2] - ) - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - df = parser.read_csv( - StringIO(data), - sep=";", - thousands="-", - parse_dates={"Date": [0, 1]}, - header=None, - ) - tm.assert_frame_equal(df, expected) - - -@pytest.mark.parametrize("keep_date_col", [True, False]) -def test_multiple_date_col_custom(all_parsers, keep_date_col, request): - data = """\ -KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 -""" - parser = all_parsers - - if keep_date_col and parser.engine == "pyarrow": - # For this to pass, we need to disable auto-inference on the date columns - # in parse_dates. We have no way of doing this though - mark = pytest.mark.xfail( - reason="pyarrow doesn't support disabling auto-inference on column numbers." - ) - request.applymarker(mark) - - def date_parser(*date_cols): - """ - Test date parser. - - Parameters - ---------- - date_cols : args - The list of data columns to parse. 
- - Returns - ------- - parsed : Series - """ - return parsing.try_parse_dates( - parsing.concat_date_cols(date_cols), parser=du_parse - ) - - kwds = { - "header": None, - "date_parser": date_parser, - "parse_dates": {"actual": [1, 2], "nominal": [1, 3]}, - "keep_date_col": keep_date_col, - "names": ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8"], - } - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - **kwds, - raise_on_extra_warnings=False, - ) - - expected = DataFrame( - [ - [ - datetime(1999, 1, 27, 19, 0), - datetime(1999, 1, 27, 18, 56), - "KORD", - "19990127", - " 19:00:00", - " 18:56:00", - 0.81, - 2.81, - 7.2, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 20, 0), - datetime(1999, 1, 27, 19, 56), - "KORD", - "19990127", - " 20:00:00", - " 19:56:00", - 0.01, - 2.21, - 7.2, - 0.0, - 260.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 20, 56), - "KORD", - "19990127", - " 21:00:00", - " 20:56:00", - -0.59, - 2.21, - 5.7, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 21, 18), - "KORD", - "19990127", - " 21:00:00", - " 21:18:00", - -0.99, - 2.01, - 3.6, - 0.0, - 270.0, - ], - [ - datetime(1999, 1, 27, 22, 0), - datetime(1999, 1, 27, 21, 56), - "KORD", - "19990127", - " 22:00:00", - " 21:56:00", - -0.59, - 1.71, - 5.1, - 0.0, - 290.0, - ], - [ - datetime(1999, 1, 27, 23, 0), - datetime(1999, 1, 27, 22, 56), - "KORD", - "19990127", - " 23:00:00", - " 22:56:00", - -0.59, - 1.71, - 4.6, - 0.0, - 280.0, - ], - ], - columns=[ - "actual", - "nominal", - "X0", - "X1", - "X2", - "X3", - "X4", - "X5", - "X6", - "X7", - "X8", - ], - ) - - if not keep_date_col: - expected = expected.drop(["X1", "X2", "X3"], axis=1) - - # Python can sometimes be flaky about how - # the aggregated columns are entered, so - # this standardizes the order. - result = result[expected.columns] - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("container", [list, tuple, Index, Series]) -@pytest.mark.parametrize("dim", [1, 2]) -def test_concat_date_col_fail(container, dim): - msg = "not all elements from date_cols are numpy arrays" - value = "19990127" - - date_cols = tuple(container([value]) for _ in range(dim)) - - with pytest.raises(ValueError, match=msg): - parsing.concat_date_cols(date_cols) - - -@pytest.mark.parametrize("keep_date_col", [True, False]) -def test_multiple_date_col(all_parsers, keep_date_col, request): - data = """\ -KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 -""" - parser = all_parsers - - if keep_date_col and parser.engine == "pyarrow": - # For this to pass, we need to disable auto-inference on the date columns - # in parse_dates. We have no way of doing this though - mark = pytest.mark.xfail( - reason="pyarrow doesn't support disabling auto-inference on column numbers." 
- ) - request.applymarker(mark) - - depr_msg = "The 'keep_date_col' keyword in pd.read_csv is deprecated" - - kwds = { - "header": None, - "parse_dates": [[1, 2], [1, 3]], - "keep_date_col": keep_date_col, - "names": ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8"], - } - with tm.assert_produces_warning( - (DeprecationWarning, FutureWarning), match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv(StringIO(data), **kwds) - - expected = DataFrame( - [ - [ - datetime(1999, 1, 27, 19, 0), - datetime(1999, 1, 27, 18, 56), - "KORD", - "19990127", - " 19:00:00", - " 18:56:00", - 0.81, - 2.81, - 7.2, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 20, 0), - datetime(1999, 1, 27, 19, 56), - "KORD", - "19990127", - " 20:00:00", - " 19:56:00", - 0.01, - 2.21, - 7.2, - 0.0, - 260.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 20, 56), - "KORD", - "19990127", - " 21:00:00", - " 20:56:00", - -0.59, - 2.21, - 5.7, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 21, 18), - "KORD", - "19990127", - " 21:00:00", - " 21:18:00", - -0.99, - 2.01, - 3.6, - 0.0, - 270.0, - ], - [ - datetime(1999, 1, 27, 22, 0), - datetime(1999, 1, 27, 21, 56), - "KORD", - "19990127", - " 22:00:00", - " 21:56:00", - -0.59, - 1.71, - 5.1, - 0.0, - 290.0, - ], - [ - datetime(1999, 1, 27, 23, 0), - datetime(1999, 1, 27, 22, 56), - "KORD", - "19990127", - " 23:00:00", - " 22:56:00", - -0.59, - 1.71, - 4.6, - 0.0, - 280.0, - ], - ], - columns=[ - "X1_X2", - "X1_X3", - "X0", - "X1", - "X2", - "X3", - "X4", - "X5", - "X6", - "X7", - "X8", - ], - ) - - if not keep_date_col: - expected = expected.drop(["X1", "X2", "X3"], axis=1) - - tm.assert_frame_equal(result, expected) - - def test_date_col_as_index_col(all_parsers): data = """\ KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 @@ -477,6 +61,7 @@ def test_date_col_as_index_col(all_parsers): datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 22, 0), ], + dtype="M8[s]", name="X1", ) expected = DataFrame( @@ -498,299 +83,6 @@ def test_date_col_as_index_col(all_parsers): tm.assert_frame_equal(result, expected) -def test_multiple_date_cols_int_cast(all_parsers): - data = ( - "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" - "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" - "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" - "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" - "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" - "KORD,19990127, 23:00:00, 22:56:00, -0.5900" - ) - parse_dates = {"actual": [1, 2], "nominal": [1, 3]} - parser = all_parsers - - kwds = { - "header": None, - "parse_dates": parse_dates, - "date_parser": pd.to_datetime, - } - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - **kwds, - raise_on_extra_warnings=False, - ) - - expected = DataFrame( - [ - [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56), "KORD", 0.81], - [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56), "KORD", 0.01], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 20, 56), - "KORD", - -0.59, - ], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 21, 18), - "KORD", - -0.99, - ], - [ - datetime(1999, 1, 27, 22, 0), - datetime(1999, 1, 27, 21, 56), - "KORD", - -0.59, - ], - [ - datetime(1999, 1, 27, 23, 0), - datetime(1999, 1, 27, 22, 56), - "KORD", - -0.59, - ], - ], - columns=["actual", "nominal", 0, 4], - ) - - # Python can sometimes be flaky about how - # the aggregated columns are entered, so - # this standardizes 
the order. - result = result[expected.columns] - tm.assert_frame_equal(result, expected) - - -def test_multiple_date_col_timestamp_parse(all_parsers): - parser = all_parsers - data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25 -05/31/2012,15:30:00.029,1306.25,8,E,0,,1306.25""" - - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - parse_dates=[[0, 1]], - header=None, - date_parser=Timestamp, - raise_on_extra_warnings=False, - ) - expected = DataFrame( - [ - [ - Timestamp("05/31/2012, 15:30:00.029"), - 1306.25, - 1, - "E", - 0, - np.nan, - 1306.25, - ], - [ - Timestamp("05/31/2012, 15:30:00.029"), - 1306.25, - 8, - "E", - 0, - np.nan, - 1306.25, - ], - ], - columns=["0_1", 2, 3, 4, 5, 6, 7], - ) - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow -def test_multiple_date_cols_with_header(all_parsers): - parser = all_parsers - data = """\ -ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir -KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]}) - expected = DataFrame( - [ - [ - datetime(1999, 1, 27, 19, 0), - "KORD", - " 18:56:00", - 0.81, - 2.81, - 7.2, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 20, 0), - "KORD", - " 19:56:00", - 0.01, - 2.21, - 7.2, - 0.0, - 260.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - "KORD", - " 20:56:00", - -0.59, - 2.21, - 5.7, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - "KORD", - " 21:18:00", - -0.99, - 2.01, - 3.6, - 0.0, - 270.0, - ], - [ - datetime(1999, 1, 27, 22, 0), - "KORD", - " 21:56:00", - -0.59, - 1.71, - 5.1, - 0.0, - 290.0, - ], - [ - datetime(1999, 1, 27, 23, 0), - "KORD", - " 22:56:00", - -0.59, - 1.71, - 4.6, - 0.0, - 280.0, - ], - ], - columns=[ - "nominal", - "ID", - "ActualTime", - "TDew", - "TAir", - "Windspeed", - "Precip", - "WindDir", - ], - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "data,parse_dates,msg", - [ - ( - """\ -date_NominalTime,date,NominalTime -KORD1,19990127, 19:00:00 -KORD2,19990127, 20:00:00""", - [[1, 2]], - ("New date column already in dict date_NominalTime"), - ), - ( - """\ -ID,date,nominalTime -KORD,19990127, 19:00:00 -KORD,19990127, 20:00:00""", - {"ID": [1, 2]}, - "Date column ID already in dict", - ), - ], -) -def test_multiple_date_col_name_collision(all_parsers, data, parse_dates, msg): - parser = all_parsers - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning( - (FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False - ): - parser.read_csv(StringIO(data), parse_dates=parse_dates) - - -def test_date_parser_int_bug(all_parsers): - # see gh-3071 - parser = all_parsers - data = ( - "posix_timestamp,elapsed,sys,user,queries,query_time,rows," - 
"accountid,userid,contactid,level,silo,method\n" - "1343103150,0.062353,0,4,6,0.01690,3," - "12345,1,-1,3,invoice_InvoiceResource,search\n" - ) - - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - index_col=0, - parse_dates=[0], - # Note: we must pass tz and then drop the tz attribute - # (if we don't CI will flake out depending on the runner's local time) - date_parser=lambda x: datetime.fromtimestamp(int(x), tz=timezone.utc).replace( - tzinfo=None - ), - raise_on_extra_warnings=False, - ) - expected = DataFrame( - [ - [ - 0.062353, - 0, - 4, - 6, - 0.01690, - 3, - 12345, - 1, - -1, - 3, - "invoice_InvoiceResource", - "search", - ] - ], - columns=[ - "elapsed", - "sys", - "user", - "queries", - "query_time", - "rows", - "accountid", - "userid", - "contactid", - "level", - "silo", - "method", - ], - index=Index([Timestamp("2012-07-24 04:12:30")], name="posix_timestamp"), - ) - tm.assert_frame_equal(result, expected) - - @xfail_pyarrow def test_nat_parse(all_parsers): # see gh-3062 @@ -798,7 +90,7 @@ def test_nat_parse(all_parsers): df = DataFrame( { "A": np.arange(10, dtype="float64"), - "B": Timestamp("20010101").as_unit("ns"), + "B": Timestamp("20010101"), } ) df.iloc[3:6, :] = np.nan @@ -810,26 +102,6 @@ def test_nat_parse(all_parsers): tm.assert_frame_equal(result, df) -@skip_pyarrow -def test_csv_custom_parser(all_parsers): - data = """A,B,C -20090101,a,1,2 -20090102,b,3,4 -20090103,c,4,5 -""" - parser = all_parsers - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - date_parser=lambda x: datetime.strptime(x, "%Y%m%d"), - ) - expected = parser.read_csv(StringIO(data), parse_dates=True) - tm.assert_frame_equal(result, expected) - result = parser.read_csv(StringIO(data), date_format="%Y%m%d") - tm.assert_frame_equal(result, expected) - - @skip_pyarrow def test_parse_dates_implicit_first_col(all_parsers): data = """A,B,C @@ -854,7 +126,7 @@ def test_parse_dates_string(all_parsers): parser = all_parsers result = parser.read_csv(StringIO(data), index_col="date", parse_dates=["date"]) # freq doesn't round-trip - index = date_range("1/1/2009", periods=3, name="date")._with_freq(None) + index = date_range("1/1/2009", periods=3, name="date", unit="s")._with_freq(None) expected = DataFrame( {"A": ["a", "b", "c"], "B": [1, 3, 4], "C": [2, 4, 5]}, index=index @@ -862,37 +134,6 @@ def test_parse_dates_string(all_parsers): tm.assert_frame_equal(result, expected) -# Bug in https://github.com/dateutil/dateutil/issues/217 -# has been addressed, but we just don't pass in the `yearfirst` -@pytest.mark.xfail(reason="yearfirst is not surfaced in read_*") -@pytest.mark.parametrize("parse_dates", [[["date", "time"]], [[0, 1]]]) -def test_yy_format_with_year_first(all_parsers, parse_dates): - data = """date,time,B,C -090131,0010,1,2 -090228,1020,3,4 -090331,0830,5,6 -""" - parser = all_parsers - result = parser.read_csv_check_warnings( - UserWarning, - "Could not infer format", - StringIO(data), - index_col=0, - parse_dates=parse_dates, - ) - index = DatetimeIndex( - [ - datetime(2009, 1, 31, 0, 10, 0), - datetime(2009, 2, 28, 10, 20, 0), - datetime(2009, 3, 31, 8, 30, 0), - ], - dtype=object, - name="date_time", - ) - expected = DataFrame({"B": [1, 3, 5], "C": [2, 4, 6]}, index=index) - tm.assert_frame_equal(result, expected) - - @xfail_pyarrow @pytest.mark.parametrize("parse_dates", [[0, 2], ["a", "c"]]) def test_parse_dates_column_list(all_parsers, parse_dates): @@ -902,6 +143,8 @@ def 
test_parse_dates_column_list(all_parsers, parse_dates): expected = DataFrame( {"a": [datetime(2010, 1, 1)], "b": [1], "c": [datetime(2010, 2, 15)]} ) + expected["a"] = expected["a"].astype("M8[s]") + expected["c"] = expected["c"].astype("M8[s]") expected = expected.set_index(["a", "b"]) result = parser.read_csv( @@ -925,9 +168,10 @@ def test_multi_index_parse_dates(all_parsers, index_col): 20090103,three,c,4,5 """ parser = all_parsers + dti = date_range("2009-01-01", periods=3, freq="D", unit="s") index = MultiIndex.from_product( [ - (datetime(2009, 1, 1), datetime(2009, 1, 2), datetime(2009, 1, 3)), + dti, ("one", "two", "three"), ], names=["index1", "index2"], @@ -962,66 +206,17 @@ def test_multi_index_parse_dates(all_parsers, index_col): tm.assert_frame_equal(result, expected) -@xfail_pyarrow -@pytest.mark.parametrize("kwargs", [{"dayfirst": True}, {"day_first": True}]) -def test_parse_dates_custom_euro_format(all_parsers, kwargs): - parser = all_parsers - data = """foo,bar,baz -31/01/2010,1,2 -01/02/2010,1,NA -02/02/2010,1,2 -""" - if "dayfirst" in kwargs: - df = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - names=["time", "Q", "NTU"], - date_parser=lambda d: du_parse(d, **kwargs), - header=0, - index_col=0, - parse_dates=True, - na_values=["NA"], - ) - exp_index = Index( - [datetime(2010, 1, 31), datetime(2010, 2, 1), datetime(2010, 2, 2)], - name="time", - ) - expected = DataFrame( - {"Q": [1, 1, 1], "NTU": [2, np.nan, 2]}, - index=exp_index, - columns=["Q", "NTU"], - ) - tm.assert_frame_equal(df, expected) - else: - msg = "got an unexpected keyword argument 'day_first'" - with pytest.raises(TypeError, match=msg): - parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - names=["time", "Q", "NTU"], - date_parser=lambda d: du_parse(d, **kwargs), - skiprows=[0], - index_col=0, - parse_dates=True, - na_values=["NA"], - ) - - def test_parse_tz_aware(all_parsers): # See gh-1693 parser = all_parsers data = "Date,x\n2012-06-13T01:39:00Z,0.5" result = parser.read_csv(StringIO(data), index_col=0, parse_dates=True) - # TODO: make unit check more specific - if parser.engine == "pyarrow": - result.index = result.index.as_unit("ns") expected = DataFrame( {"x": [0.5]}, index=Index([Timestamp("2012-06-13 01:39:00+00:00")], name="Date") ) if parser.engine == "pyarrow": + pytz = pytest.importorskip("pytz") expected_tz = pytz.utc else: expected_tz = timezone.utc @@ -1029,282 +224,11 @@ def test_parse_tz_aware(all_parsers): assert result.index.tz is expected_tz -@xfail_pyarrow -@pytest.mark.parametrize( - "parse_dates,index_col", - [({"nominal": [1, 2]}, "nominal"), ({"nominal": [1, 2]}, 0), ([[1, 2]], 0)], -) -def test_multiple_date_cols_index(all_parsers, parse_dates, index_col): - parser = all_parsers - data = """ -ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir -KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 -""" - expected = DataFrame( - [ - [ - datetime(1999, 1, 27, 19, 0), - "KORD1", - " 18:56:00", - 0.81, - 2.81, - 7.2, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 
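[Editorial sketch, not part of the diff: the M8[s] / unit="s" expectations being added in these hunks reflect the new resolution inference, under which date-only input parses to second resolution rather than nanoseconds. The printed dtype is what these tests now expect, not a guarantee for other inputs.]

    from io import StringIO
    import pandas as pd

    df = pd.read_csv(StringIO("a,b\n2010-01-01,1"), parse_dates=["a"])
    print(df["a"].dtype)  # datetime64[s] under the new inference; was datetime64[ns]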
27, 20, 0), - "KORD2", - " 19:56:00", - 0.01, - 2.21, - 7.2, - 0.0, - 260.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - "KORD3", - " 20:56:00", - -0.59, - 2.21, - 5.7, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - "KORD4", - " 21:18:00", - -0.99, - 2.01, - 3.6, - 0.0, - 270.0, - ], - [ - datetime(1999, 1, 27, 22, 0), - "KORD5", - " 21:56:00", - -0.59, - 1.71, - 5.1, - 0.0, - 290.0, - ], - [ - datetime(1999, 1, 27, 23, 0), - "KORD6", - " 22:56:00", - -0.59, - 1.71, - 4.6, - 0.0, - 280.0, - ], - ], - columns=[ - "nominal", - "ID", - "ActualTime", - "TDew", - "TAir", - "Windspeed", - "Precip", - "WindDir", - ], - ) - expected = expected.set_index("nominal") - - if not isinstance(parse_dates, dict): - expected.index.name = "date_NominalTime" - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( - StringIO(data), parse_dates=parse_dates, index_col=index_col - ) - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow -def test_multiple_date_cols_chunked(all_parsers): - parser = all_parsers - data = """\ -ID,date,nominalTime,actualTime,A,B,C,D,E -KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 -""" - - expected = DataFrame( - [ - [ - datetime(1999, 1, 27, 19, 0), - "KORD", - " 18:56:00", - 0.81, - 2.81, - 7.2, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 20, 0), - "KORD", - " 19:56:00", - 0.01, - 2.21, - 7.2, - 0.0, - 260.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - "KORD", - " 20:56:00", - -0.59, - 2.21, - 5.7, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - "KORD", - " 21:18:00", - -0.99, - 2.01, - 3.6, - 0.0, - 270.0, - ], - [ - datetime(1999, 1, 27, 22, 0), - "KORD", - " 21:56:00", - -0.59, - 1.71, - 5.1, - 0.0, - 290.0, - ], - [ - datetime(1999, 1, 27, 23, 0), - "KORD", - " 22:56:00", - -0.59, - 1.71, - 4.6, - 0.0, - 280.0, - ], - ], - columns=["nominal", "ID", "actualTime", "A", "B", "C", "D", "E"], - ) - expected = expected.set_index("nominal") - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - with parser.read_csv( - StringIO(data), - parse_dates={"nominal": [1, 2]}, - index_col="nominal", - chunksize=2, - ) as reader: - chunks = list(reader) - - tm.assert_frame_equal(chunks[0], expected[:2]) - tm.assert_frame_equal(chunks[1], expected[2:4]) - tm.assert_frame_equal(chunks[2], expected[4:]) - - -def test_multiple_date_col_named_index_compat(all_parsers): - parser = all_parsers - data = """\ -ID,date,nominalTime,actualTime,A,B,C,D,E -KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD,19990127, 
23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 -""" - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - (FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False - ): - with_indices = parser.read_csv( - StringIO(data), parse_dates={"nominal": [1, 2]}, index_col="nominal" - ) - - with tm.assert_produces_warning( - (FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False - ): - with_names = parser.read_csv( - StringIO(data), - index_col="nominal", - parse_dates={"nominal": ["date", "nominalTime"]}, - ) - tm.assert_frame_equal(with_indices, with_names) - - -def test_multiple_date_col_multiple_index_compat(all_parsers): - parser = all_parsers - data = """\ -ID,date,nominalTime,actualTime,A,B,C,D,E -KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 -""" - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - (FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( - StringIO(data), index_col=["nominal", "ID"], parse_dates={"nominal": [1, 2]} - ) - with tm.assert_produces_warning( - (FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False - ): - expected = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]}) - - expected = expected.set_index(["nominal", "ID"]) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("kwargs", [{}, {"index_col": "C"}]) def test_read_with_parse_dates_scalar_non_bool(all_parsers, kwargs): # see gh-5636 parser = all_parsers - msg = ( - "Only booleans, lists, and dictionaries " - "are accepted for the 'parse_dates' parameter" - ) + msg = "Only booleans and lists are accepted for the 'parse_dates' parameter" data = """A,B,C 1,2,2003-11-1""" @@ -1315,20 +239,16 @@ def test_read_with_parse_dates_scalar_non_bool(all_parsers, kwargs): @pytest.mark.parametrize("parse_dates", [(1,), np.array([4, 5]), {1, 3}]) def test_read_with_parse_dates_invalid_type(all_parsers, parse_dates): parser = all_parsers - msg = ( - "Only booleans, lists, and dictionaries " - "are accepted for the 'parse_dates' parameter" - ) + msg = "Only booleans and lists are accepted for the 'parse_dates' parameter" data = """A,B,C 1,2,2003-11-1""" with pytest.raises(TypeError, match=msg): - parser.read_csv(StringIO(data), parse_dates=(1,)) + parser.read_csv(StringIO(data), parse_dates=parse_dates) -@pytest.mark.parametrize("cache_dates", [True, False]) @pytest.mark.parametrize("value", ["nan", ""]) -def test_bad_date_parse(all_parsers, cache_dates, value): +def test_bad_date_parse(all_parsers, cache, value): # if we have an invalid date make sure that we handle this with # and w/o the cache properly parser = all_parsers @@ -1339,13 +259,12 @@ def test_bad_date_parse(all_parsers, cache_dates, value): header=None, names=["foo", "bar"], parse_dates=["foo"], - cache_dates=cache_dates, + cache_dates=cache, ) -@pytest.mark.parametrize("cache_dates", [True, False]) @pytest.mark.parametrize("value", 
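[Editorial sketch, not part of the diff: the updated message above ("Only booleans and lists are accepted") reflects that dict and scalar forms of parse_dates are rejected outright. A tuple, as in the test parametrization, triggers the same TypeError.]

    from io import StringIO
    import pandas as pd

    try:
        pd.read_csv(StringIO("A,B,C\n1,2,2003-11-1"), parse_dates=(1,))
    except TypeError as err:
        print(err)  # Only booleans and lists are accepted for the 'parse_dates' parameter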
["0"]) -def test_bad_date_parse_with_warning(all_parsers, cache_dates, value): +def test_bad_date_parse_with_warning(all_parsers, cache, value): # if we have an invalid date make sure that we handle this with # and w/o the cache properly. parser = all_parsers @@ -1357,7 +276,7 @@ def test_bad_date_parse_with_warning(all_parsers, cache_dates, value): # TODO: parse dates directly in pyarrow, see # https://github.com/pandas-dev/pandas/issues/48017 warn = None - elif cache_dates: + elif cache: # Note: warning is not raised if 'cache_dates', because here there is only a # single unique date and hence no risk of inconsistent parsing. warn = None @@ -1370,12 +289,11 @@ def test_bad_date_parse_with_warning(all_parsers, cache_dates, value): header=None, names=["foo", "bar"], parse_dates=["foo"], - cache_dates=cache_dates, + cache_dates=cache, raise_on_extra_warnings=False, ) -@xfail_pyarrow def test_parse_dates_empty_string(all_parsers): # see gh-2263 parser = all_parsers @@ -1385,48 +303,10 @@ def test_parse_dates_empty_string(all_parsers): expected = DataFrame( [[datetime(2012, 1, 1), 1], [pd.NaT, 2]], columns=["Date", "test"] ) + expected["Date"] = expected["Date"].astype("M8[s]") tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize( - "reader", ["read_csv_check_warnings", "read_table_check_warnings"] -) -def test_parse_dates_infer_datetime_format_warning(all_parsers, reader): - # GH 49024, 51017 - parser = all_parsers - data = "Date,test\n2012-01-01,1\n,2" - - getattr(parser, reader)( - FutureWarning, - "The argument 'infer_datetime_format' is deprecated", - StringIO(data), - parse_dates=["Date"], - infer_datetime_format=True, - sep=",", - raise_on_extra_warnings=False, - ) - - -@pytest.mark.parametrize( - "reader", ["read_csv_check_warnings", "read_table_check_warnings"] -) -def test_parse_dates_date_parser_and_date_format(all_parsers, reader): - # GH 50601 - parser = all_parsers - data = "Date,test\n2012-01-01,1\n,2" - msg = "Cannot use both 'date_parser' and 'date_format'" - with pytest.raises(TypeError, match=msg): - getattr(parser, reader)( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - parse_dates=["Date"], - date_parser=pd.to_datetime, - date_format="ISO8601", - sep=",", - ) - - @xfail_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", @@ -1434,18 +314,22 @@ def test_parse_dates_date_parser_and_date_format(all_parsers, reader): ( "a\n04.15.2016", {"parse_dates": ["a"]}, - DataFrame([datetime(2016, 4, 15)], columns=["a"]), + DataFrame([datetime(2016, 4, 15)], columns=["a"], dtype="M8[s]"), ), ( "a\n04.15.2016", {"parse_dates": True, "index_col": 0}, - DataFrame(index=DatetimeIndex(["2016-04-15"], name="a"), columns=[]), + DataFrame( + index=DatetimeIndex(["2016-04-15"], dtype="M8[s]", name="a"), columns=[] + ), ), ( "a,b\n04.15.2016,09.16.2013", {"parse_dates": ["a", "b"]}, DataFrame( - [[datetime(2016, 4, 15), datetime(2013, 9, 16)]], columns=["a", "b"] + [[datetime(2016, 4, 15), datetime(2013, 9, 16)]], + dtype="M8[s]", + columns=["a", "b"], ), ), ( @@ -1453,7 +337,13 @@ def test_parse_dates_date_parser_and_date_format(all_parsers, reader): {"parse_dates": True, "index_col": [0, 1]}, DataFrame( index=MultiIndex.from_tuples( - [(datetime(2016, 4, 15), datetime(2013, 9, 16))], names=["a", "b"] + [ + ( + Timestamp(2016, 4, 15).as_unit("s"), + Timestamp(2013, 9, 16).as_unit("s"), + ) + ], + names=["a", "b"], ), columns=[], ), @@ -1468,279 +358,6 @@ def test_parse_dates_no_convert_thousands(all_parsers, data, kwargs, expected): 
tm.assert_frame_equal(result, expected) -@xfail_pyarrow -def test_parse_date_time_multi_level_column_name(all_parsers): - data = """\ -D,T,A,B -date, time,a,b -2001-01-05, 09:00:00, 0.0, 10. -2001-01-06, 00:00:00, 1.0, 11. -""" - parser = all_parsers - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - header=[0, 1], - parse_dates={"date_time": [0, 1]}, - date_parser=pd.to_datetime, - ) - - expected_data = [ - [datetime(2001, 1, 5, 9, 0, 0), 0.0, 10.0], - [datetime(2001, 1, 6, 0, 0, 0), 1.0, 11.0], - ] - expected = DataFrame(expected_data, columns=["date_time", ("A", "a"), ("B", "b")]) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "data,kwargs,expected", - [ - ( - """\ -date,time,a,b -2001-01-05, 10:00:00, 0.0, 10. -2001-01-05, 00:00:00, 1., 11. -""", - {"header": 0, "parse_dates": {"date_time": [0, 1]}}, - DataFrame( - [ - [datetime(2001, 1, 5, 10, 0, 0), 0.0, 10], - [datetime(2001, 1, 5, 0, 0, 0), 1.0, 11.0], - ], - columns=["date_time", "a", "b"], - ), - ), - ( - ( - "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" - "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" - "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" - "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" - "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" - "KORD,19990127, 23:00:00, 22:56:00, -0.5900" - ), - {"header": None, "parse_dates": {"actual": [1, 2], "nominal": [1, 3]}}, - DataFrame( - [ - [ - datetime(1999, 1, 27, 19, 0), - datetime(1999, 1, 27, 18, 56), - "KORD", - 0.81, - ], - [ - datetime(1999, 1, 27, 20, 0), - datetime(1999, 1, 27, 19, 56), - "KORD", - 0.01, - ], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 20, 56), - "KORD", - -0.59, - ], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 21, 18), - "KORD", - -0.99, - ], - [ - datetime(1999, 1, 27, 22, 0), - datetime(1999, 1, 27, 21, 56), - "KORD", - -0.59, - ], - [ - datetime(1999, 1, 27, 23, 0), - datetime(1999, 1, 27, 22, 56), - "KORD", - -0.59, - ], - ], - columns=["actual", "nominal", 0, 4], - ), - ), - ], -) -def test_parse_date_time(all_parsers, data, kwargs, expected): - parser = all_parsers - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - date_parser=pd.to_datetime, - **kwargs, - raise_on_extra_warnings=False, - ) - - # Python can sometimes be flaky about how - # the aggregated columns are entered, so - # this standardizes the order. - result = result[expected.columns] - tm.assert_frame_equal(result, expected) - - -def test_parse_date_fields(all_parsers): - parser = all_parsers - data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11." - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - header=0, - parse_dates={"ymd": [0, 1, 2]}, - date_parser=lambda x: x, - raise_on_extra_warnings=False, - ) - - expected = DataFrame( - [[datetime(2001, 1, 10), 10.0], [datetime(2001, 2, 1), 11.0]], - columns=["ymd", "a"], - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - ("key", "value", "warn"), - [ - ( - "date_parser", - lambda x: pd.to_datetime(x, format="%Y %m %d %H %M %S"), - FutureWarning, - ), - ("date_format", "%Y %m %d %H %M %S", None), - ], -) -def test_parse_date_all_fields(all_parsers, key, value, warn): - parser = all_parsers - data = """\ -year,month,day,hour,minute,second,a,b -2001,01,05,10,00,0,0.0,10. -2001,01,5,10,0,00,1.,11. 
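[Editorial sketch, not part of the diff: the field-combining tests deleted above (e.g. parse_dates={"ymd": [0, 1, 2]}) map onto the documented replacement of passing the component columns to pd.to_datetime after reading.]

    from io import StringIO
    import pandas as pd

    df = pd.read_csv(StringIO("year,month,day,a\n2001,01,10,10.\n2001,02,1,11."))
    # to_datetime assembles a datetime column from year/month/day columns:
    df["ymd"] = pd.to_datetime(df[["year", "month", "day"]])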
-""" - result = parser.read_csv_check_warnings( - warn, - "use 'date_format' instead", - StringIO(data), - header=0, - parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]}, - **{key: value}, - raise_on_extra_warnings=False, - ) - expected = DataFrame( - [ - [datetime(2001, 1, 5, 10, 0, 0), 0.0, 10.0], - [datetime(2001, 1, 5, 10, 0, 0), 1.0, 11.0], - ], - columns=["ymdHMS", "a", "b"], - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - ("key", "value", "warn"), - [ - ( - "date_parser", - lambda x: pd.to_datetime(x, format="%Y %m %d %H %M %S.%f"), - FutureWarning, - ), - ("date_format", "%Y %m %d %H %M %S.%f", None), - ], -) -def test_datetime_fractional_seconds(all_parsers, key, value, warn): - parser = all_parsers - data = """\ -year,month,day,hour,minute,second,a,b -2001,01,05,10,00,0.123456,0.0,10. -2001,01,5,10,0,0.500000,1.,11. -""" - result = parser.read_csv_check_warnings( - warn, - "use 'date_format' instead", - StringIO(data), - header=0, - parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]}, - **{key: value}, - raise_on_extra_warnings=False, - ) - expected = DataFrame( - [ - [datetime(2001, 1, 5, 10, 0, 0, microsecond=123456), 0.0, 10.0], - [datetime(2001, 1, 5, 10, 0, 0, microsecond=500000), 1.0, 11.0], - ], - columns=["ymdHMS", "a", "b"], - ) - tm.assert_frame_equal(result, expected) - - -def test_generic(all_parsers): - parser = all_parsers - data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11." - - def parse_function(yy, mm): - return [date(year=int(y), month=int(m), day=1) for y, m in zip(yy, mm)] - - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - header=0, - parse_dates={"ym": [0, 1]}, - date_parser=parse_function, - raise_on_extra_warnings=False, - ) - expected = DataFrame( - [[date(2001, 1, 1), 10, 10.0], [date(2001, 2, 1), 1, 11.0]], - columns=["ym", "day", "a"], - ) - expected["ym"] = expected["ym"].astype("datetime64[ns]") - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow -def test_date_parser_resolution_if_not_ns(all_parsers): - # see gh-10245 - parser = all_parsers - data = """\ -date,time,prn,rxstatus -2013-11-03,19:00:00,126,00E80000 -2013-11-03,19:00:00,23,00E80000 -2013-11-03,19:00:00,13,00E80000 -""" - - def date_parser(dt, time): - try: - arr = dt + "T" + time - except TypeError: - # dt & time are date/time objects - arr = [datetime.combine(d, t) for d, t in zip(dt, time)] - return np.array(arr, dtype="datetime64[s]") - - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - date_parser=date_parser, - parse_dates={"datetime": ["date", "time"]}, - index_col=["datetime", "prn"], - ) - - datetimes = np.array(["2013-11-03T19:00:00"] * 3, dtype="datetime64[s]") - expected = DataFrame( - data={"rxstatus": ["00E80000"] * 3}, - index=MultiIndex.from_arrays( - [datetimes, [126, 23, 13]], - names=["datetime", "prn"], - ), - ) - tm.assert_frame_equal(result, expected) - - def test_parse_date_column_with_empty_string(all_parsers): # see gh-6428 parser = all_parsers @@ -1757,11 +374,11 @@ def test_parse_date_column_with_empty_string(all_parsers): [ ( "a\n135217135789158401\n1352171357E+5", - DataFrame({"a": [135217135789158401, 135217135700000]}, dtype="float64"), + [135217135789158401, 135217135700000], ), ( "a\n99999999999\n123456789012345\n1234E+0", - DataFrame({"a": [99999999999, 123456789012345, 1234]}, dtype="float64"), + [99999999999, 123456789012345, 1234], ), ], ) @@ -1774,6 +391,7 @@ def test_parse_date_float(all_parsers, data, 
expected, parse_dates): parser = all_parsers result = parser.read_csv(StringIO(data), parse_dates=parse_dates) + expected = DataFrame({"a": expected}, dtype="float64") tm.assert_frame_equal(result, expected) @@ -1793,6 +411,7 @@ def test_parse_timezone(all_parsers): end="2018-01-04 09:05:00", freq="1min", tz=timezone(timedelta(minutes=540)), + unit="s", )._with_freq(None) expected_data = {"dt": dti, "val": [23350, 23400, 23400, 23400, 23400]} @@ -1807,7 +426,7 @@ def test_parse_timezone(all_parsers): ) def test_invalid_parse_delimited_date(all_parsers, date_string): parser = all_parsers - expected = DataFrame({0: [date_string]}, dtype="object") + expected = DataFrame({0: [date_string]}, dtype="str") result = parser.read_csv( StringIO(date_string), header=None, @@ -1831,7 +450,7 @@ def test_parse_delimited_date_swap_no_warning( all_parsers, date_string, dayfirst, expected, request ): parser = all_parsers - expected = DataFrame({0: [expected]}, dtype="datetime64[ns]") + expected = DataFrame({0: [expected]}, dtype="datetime64[s]") if parser.engine == "pyarrow": if not dayfirst: # "CSV parse error: Empty CSV file or block" @@ -1864,7 +483,7 @@ def test_parse_delimited_date_swap_with_warning( all_parsers, date_string, dayfirst, expected ): parser = all_parsers - expected = DataFrame({0: [expected]}, dtype="datetime64[ns]") + expected = DataFrame({0: [expected]}, dtype="datetime64[s]") warning_msg = ( "Parsing dates in .* format when dayfirst=.* was specified. " "Pass `dayfirst=.*` or specify a format to silence this warning." @@ -1885,8 +504,8 @@ def test_parse_multiple_delimited_dates_with_swap_warnings(): with pytest.raises( ValueError, match=( - r'^time data "31/05/2000" doesn\'t match format "%m/%d/%Y", ' - r"at position 1. You might want to try:" + r'^time data "31/05/2000" doesn\'t match format "%m/%d/%Y". 
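[Editorial sketch, not part of the diff: the dtype changes in the delimited-date hunks above mean a cleanly parsed dayfirst date now comes back with second resolution, while an unparseable string stays in the new "str" dtype instead of "object".]

    from io import StringIO
    import pandas as pd

    res = pd.read_csv(StringIO("13/02/2019"), header=None, dayfirst=True, parse_dates=[0])
    print(res[0].dtype)  # datetime64[s] under the new unit inference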
' + r"You might want to try:" ), ): pd.to_datetime(["01/01/2000", "31/05/2000", "31/05/2001", "01/02/2000"]) @@ -1899,11 +518,6 @@ def test_parse_multiple_delimited_dates_with_swap_warnings(): [ (None, ["val"], ["date", "time"], "date, time"), (None, ["val"], [0, "time"], "time"), - (None, ["val"], [["date", "time"]], "date, time"), - (None, ["val"], [[0, "time"]], "time"), - (None, ["val"], {"date": [0, "time"]}, "time"), - (None, ["val"], {"date": ["date", "time"]}, "date, time"), - (None, ["val"], [["date", "time"], "date"], "date, time"), (["date1", "time1", "temperature"], None, ["date", "time"], "date, time"), ( ["date1", "time1", "temperature"], @@ -1921,20 +535,10 @@ def test_missing_parse_dates_column_raises( content = StringIO("date,time,val\n2020-01-31,04:20:32,32\n") msg = f"Missing column provided to 'parse_dates': '{missing_cols}'" - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - warn = FutureWarning - if isinstance(parse_dates, list) and all( - isinstance(x, (int, str)) for x in parse_dates - ): - warn = None - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(warn, match=depr_msg, check_stacklevel=False): - parser.read_csv( - content, sep=",", names=names, usecols=usecols, parse_dates=parse_dates - ) + parser.read_csv( + content, sep=",", names=names, usecols=usecols, parse_dates=parse_dates + ) @xfail_pyarrow # mismatched shape @@ -1964,40 +568,7 @@ def test_date_parser_multiindex_columns(all_parsers): 1,2 2019-12-31,6""" result = parser.read_csv(StringIO(data), parse_dates=[("a", "1")], header=[0, 1]) - expected = DataFrame( - {("a", "1"): Timestamp("2019-12-31").as_unit("ns"), ("b", "2"): [6]} - ) - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow # TypeError: an integer is required -@pytest.mark.parametrize( - "parse_spec, col_name", - [ - ([[("a", "1"), ("b", "2")]], ("a_b", "1_2")), - ({("foo", "1"): [("a", "1"), ("b", "2")]}, ("foo", "1")), - ], -) -def test_date_parser_multiindex_columns_combine_cols(all_parsers, parse_spec, col_name): - parser = all_parsers - data = """a,b,c -1,2,3 -2019-12,-31,6""" - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( - StringIO(data), - parse_dates=parse_spec, - header=[0, 1], - ) - expected = DataFrame( - {col_name: Timestamp("2019-12-31").as_unit("ns"), ("c", "3"): [6]} - ) + expected = DataFrame({("a", "1"): Timestamp("2019-12-31"), ("b", "2"): [6]}) tm.assert_frame_equal(result, expected) @@ -2031,26 +602,7 @@ def test_date_parser_usecols_thousands(all_parsers): thousands="-", ) expected = DataFrame({"B": [3, 4], "C": [Timestamp("20-09-2001 01:00:00")] * 2}) - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow # mismatched shape -def test_parse_dates_and_keep_original_column(all_parsers): - # GH#13378 - parser = all_parsers - data = """A -20150908 -20150909 -""" - depr_msg = "The 'keep_date_col' keyword in pd.read_csv is deprecated" - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( - StringIO(data), parse_dates={"date": ["A"]}, keep_date_col=True - ) - expected_data = [Timestamp("2015-09-08"), Timestamp("2015-09-09")] - expected = DataFrame({"date": expected_data, "A": expected_data}) + expected["C"] = expected["C"].astype("M8[s]") tm.assert_frame_equal(result, expected) @@ -2060,7 +612,7 
@@ def test_dayfirst_warnings(): # CASE 1: valid input input = "date\n31/12/2014\n10/03/2011" expected = DatetimeIndex( - ["2014-12-31", "2011-03-10"], dtype="datetime64[ns]", freq=None, name="date" + ["2014-12-31", "2011-03-10"], dtype="datetime64[s]", freq=None, name="date" ) warning_msg = ( "Parsing dates in .* format when dayfirst=.* was specified. " @@ -2086,7 +638,7 @@ def test_dayfirst_warnings(): # first in DD/MM/YYYY, second in MM/DD/YYYY input = "date\n31/12/2014\n03/30/2011" - expected = Index(["31/12/2014", "03/30/2011"], dtype="object", name="date") + expected = Index(["31/12/2014", "03/30/2011"], dtype="str", name="date") # A. use dayfirst=True res5 = read_csv( @@ -2121,7 +673,7 @@ def test_dayfirst_warnings_no_leading_zero(date_string, dayfirst): # GH47880 initial_value = f"date\n{date_string}" expected = DatetimeIndex( - ["2014-01-31"], dtype="datetime64[ns]", freq=None, name="date" + ["2014-01-31"], dtype="datetime64[s]", freq=None, name="date" ) warning_msg = ( "Parsing dates in .* format when dayfirst=.* was specified. " @@ -2151,14 +703,7 @@ def test_infer_first_column_as_index(all_parsers): @xfail_pyarrow # pyarrow engine doesn't support passing a dict for na_values -@pytest.mark.parametrize( - ("key", "value", "warn"), - [ - ("date_parser", lambda x: pd.to_datetime(x, format="%Y-%m-%d"), FutureWarning), - ("date_format", "%Y-%m-%d", None), - ], -) -def test_replace_nans_before_parsing_dates(all_parsers, key, value, warn): +def test_replace_nans_before_parsing_dates(all_parsers): # GH#26203 parser = all_parsers data = """Test @@ -2168,13 +713,11 @@ def test_replace_nans_before_parsing_dates(all_parsers, key, value, warn): # 2017-09-09 """ - result = parser.read_csv_check_warnings( - warn, - "use 'date_format' instead", + result = parser.read_csv( StringIO(data), na_values={"Test": ["#", "0"]}, parse_dates=["Test"], - **{key: value}, + date_format="%Y-%m-%d", ) expected = DataFrame( { @@ -2185,7 +728,8 @@ def test_replace_nans_before_parsing_dates(all_parsers, key, value, warn): pd.NaT, Timestamp("2017-09-09"), ] - } + }, + dtype="M8[s]", ) tm.assert_frame_equal(result, expected) @@ -2200,6 +744,7 @@ def test_parse_dates_and_string_dtype(all_parsers): result = parser.read_csv(StringIO(data), dtype="string", parse_dates=["b"]) expected = DataFrame({"a": ["1"], "b": [Timestamp("2019-12-31")]}) expected["a"] = expected["a"].astype("string") + expected["b"] = expected["b"].astype("M8[s]") tm.assert_frame_equal(result, expected) @@ -2212,14 +757,14 @@ def test_parse_dot_separated_dates(all_parsers): if parser.engine == "pyarrow": expected_index = Index( ["27.03.2003 14:55:00.000", "03.08.2003 15:20:00.000"], - dtype="object", + dtype="str", name="a", ) warn = None else: expected_index = DatetimeIndex( ["2003-03-27 14:55:00", "2003-08-03 15:20:00"], - dtype="datetime64[ns]", + dtype="datetime64[ms]", name="a", ) warn = UserWarning @@ -2252,34 +797,8 @@ def test_parse_dates_dict_format(all_parsers): { "a": [Timestamp("2019-12-31"), Timestamp("2020-12-31")], "b": [Timestamp("2019-12-31"), Timestamp("2020-12-31")], - } - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "key, parse_dates", [("a_b", [[0, 1]]), ("foo", {"foo": [0, 1]})] -) -def test_parse_dates_dict_format_two_columns(all_parsers, key, parse_dates): - # GH#51240 - parser = all_parsers - data = """a,b -31-,12-2019 -31-,12-2020""" - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - (FutureWarning, 
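[Editorial sketch, not part of the diff: test_replace_nans_before_parsing_dates is simplified above to the date_format-only path; na_values substitution happens before date parsing, so masked tokens become NaT.]

    from io import StringIO
    import pandas as pd

    data = "Test\n2012-12-31\n#\n2014-11-01"
    df = pd.read_csv(
        StringIO(data),
        na_values={"Test": ["#"]},
        parse_dates=["Test"],
        date_format="%Y-%m-%d",
    )
    print(df["Test"].tolist())  # [Timestamp('2012-12-31'), NaT, Timestamp('2014-11-01')]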
DeprecationWarning), match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( - StringIO(data), date_format={key: "%d- %m-%Y"}, parse_dates=parse_dates - ) - expected = DataFrame( - { - key: [Timestamp("2019-12-31"), Timestamp("2020-12-31")], - } + }, + dtype="M8[s]", ) tm.assert_frame_equal(result, expected) @@ -2312,9 +831,6 @@ def test_parse_dates_arrow_engine(all_parsers): 2000-01-01 00:00:01,1""" result = parser.read_csv(StringIO(data), parse_dates=["a"]) - # TODO: make unit check more specific - if parser.engine == "pyarrow": - result["a"] = result["a"].dt.as_unit("ns") expected = DataFrame( { "a": [ @@ -2334,8 +850,8 @@ def test_from_csv_with_mixed_offsets(all_parsers): result = parser.read_csv(StringIO(data), parse_dates=["a"])["a"] expected = Series( [ - Timestamp("2020-01-01 00:00:00+01:00"), - Timestamp("2020-01-01 00:00:00+00:00"), + "2020-01-01T00:00:00+01:00", + "2020-01-01T00:00:00+00:00", ], name="a", index=[0, 1], diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index dbd474c6ae0b9..a5bb151e84f47 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -4,6 +4,7 @@ these tests out of this module as soon as the C parser can accept further arguments when parsing. """ + from __future__ import annotations import csv @@ -520,6 +521,8 @@ def test_no_thousand_convert_with_dot_for_non_numeric_cols(python_parser_only, d "c": [0, 4000, 131], } ) + if dtype["a"] == object: + expected["a"] = expected["a"].astype(object) tm.assert_frame_equal(result, expected) @@ -528,21 +531,17 @@ def test_no_thousand_convert_with_dot_for_non_numeric_cols(python_parser_only, d [ ( {"a": str, "b": np.float64, "c": np.int64}, - DataFrame( - { - "b": [16000.1, 0, 23000], - "c": [0, 4001, 131], - } - ), + { + "b": [16000.1, 0, 23000], + "c": [0, 4001, 131], + }, ), ( str, - DataFrame( - { - "b": ["16,000.1", "0", "23,000"], - "c": ["0", "4,001", "131"], - } - ), + { + "b": ["16,000.1", "0", "23,000"], + "c": ["0", "4,001", "131"], + }, ), ], ) @@ -560,5 +559,6 @@ def test_no_thousand_convert_for_non_numeric_cols(python_parser_only, dtype, exp dtype=dtype, thousands=",", ) + expected = DataFrame(expected) expected.insert(0, "a", ["0000,7995", "3,03,001,00514", "4923,600,041"]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index bed2b5e10a6f7..6243185294894 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -4,7 +4,6 @@ engine is set to 'python-fwf' internally. 
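[Editorial sketch, not part of the diff: the mixed-offsets expectation above changes from Timestamps to plain strings; a column mixing UTC offsets is no longer coerced, so the values are left as-is in an object column.]

    from io import StringIO
    import pandas as pd

    data = "a\n2020-01-01T00:00:00+01:00\n2020-01-01T00:00:00+00:00"
    ser = pd.read_csv(StringIO(data), parse_dates=["a"])["a"]
    print(ser.dtype)  # object: the offset strings are kept, not unified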
""" -from datetime import datetime from io import ( BytesIO, StringIO, @@ -22,10 +21,6 @@ DatetimeIndex, ) import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - StringArray, -) from pandas.io.common import urlopen from pandas.io.parsers import ( @@ -284,17 +279,6 @@ def test_fwf_regression(): 2009164205000 9.5810 9.0896 8.4009 7.4652 6.0322 5.8189 5.4379 2009164210000 9.6034 9.0897 8.3822 7.4905 6.0908 5.7904 5.4039 """ - - with tm.assert_produces_warning(FutureWarning, match="use 'date_format' instead"): - result = read_fwf( - StringIO(data), - index_col=0, - header=None, - names=names, - widths=widths, - parse_dates=True, - date_parser=lambda s: datetime.strptime(s, "%Y%j%H%M%S"), - ) expected = DataFrame( [ [9.5403, 9.4105, 8.6571, 7.8372, 6.0612, 5.8843, 5.5192], @@ -310,11 +294,11 @@ def test_fwf_regression(): "2009-06-13 20:40:00", "2009-06-13 20:50:00", "2009-06-13 21:00:00", - ] + ], + dtype="M8[us]", ), columns=["SST", "T010", "T020", "T030", "T060", "T080", "T100"], ) - tm.assert_frame_equal(result, expected) result = read_fwf( StringIO(data), index_col=0, @@ -324,6 +308,7 @@ def test_fwf_regression(): parse_dates=True, date_format="%Y%j%H%M%S", ) + expected.index = expected.index.astype("M8[s]") tm.assert_frame_equal(result, expected) @@ -488,9 +473,7 @@ def test_full_file_with_spaces(): 868 Jennifer Love Hewitt 0 17000.00 5/25/1985 761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006 317 Bill Murray 789.65 5000.00 2/5/2007 -""".strip( - "\r\n" - ) +""".strip("\r\n") colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70)) expected = read_fwf(StringIO(test), colspecs=colspecs) @@ -507,9 +490,7 @@ def test_full_file_with_spaces_and_missing(): 868 5/25/1985 761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006 317 Bill Murray 789.65 -""".strip( - "\r\n" - ) +""".strip("\r\n") colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70)) expected = read_fwf(StringIO(test), colspecs=colspecs) @@ -526,9 +507,7 @@ def test_messed_up_data(): 761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006 317 Bill Murray 789.65 -""".strip( - "\r\n" - ) +""".strip("\r\n") colspecs = ((2, 10), (15, 33), (37, 45), (49, 61), (64, 79)) expected = read_fwf(StringIO(test), colspecs=colspecs) @@ -544,9 +523,7 @@ def test_multiple_delimiters(): ++44~~~~12.01 baz~~Jennifer Love Hewitt ~~55 11+++foo++++Jada Pinkett-Smith ..66++++++.03~~~bar Bill Murray -""".strip( - "\r\n" - ) +""".strip("\r\n") delimiter = " +~.\\" colspecs = ((0, 4), (7, 13), (15, 19), (21, 41)) expected = read_fwf(StringIO(test), colspecs=colspecs, delimiter=delimiter) @@ -560,9 +537,7 @@ def test_variable_width_unicode(): שלום שלום ום שלל של ום -""".strip( - "\r\n" - ) +""".strip("\r\n") encoding = "utf8" kwargs = {"header": None, "encoding": encoding} @@ -603,10 +578,7 @@ def test_skiprows_inference(): """.strip() skiprows = 2 - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True) - + expected = read_csv(StringIO(data), skiprows=skiprows, sep=r"\s+") result = read_fwf(StringIO(data), skiprows=skiprows) tm.assert_frame_equal(result, expected) @@ -621,10 +593,7 @@ def test_skiprows_by_index_inference(): """.strip() skiprows = [0, 2] - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True) 
- + expected = read_csv(StringIO(data), skiprows=skiprows, sep=r"\s+") result = read_fwf(StringIO(data), skiprows=skiprows) tm.assert_frame_equal(result, expected) @@ -968,36 +937,28 @@ def test_widths_and_usecols(): def test_dtype_backend(string_storage, dtype_backend): # GH#50289 - if string_storage == "python": - arr = StringArray(np.array(["a", "b"], dtype=np.object_)) - arr_na = StringArray(np.array([pd.NA, "a"], dtype=np.object_)) - elif dtype_backend == "pyarrow": - pa = pytest.importorskip("pyarrow") - from pandas.arrays import ArrowExtensionArray - - arr = ArrowExtensionArray(pa.array(["a", "b"])) - arr_na = ArrowExtensionArray(pa.array([None, "a"])) - else: - pa = pytest.importorskip("pyarrow") - arr = ArrowStringArray(pa.array(["a", "b"])) - arr_na = ArrowStringArray(pa.array([None, "a"])) - data = """a b c d e f g h i 1 2.5 True a 3 4.5 False b True 6 7.5 a""" with pd.option_context("mode.string_storage", string_storage): result = read_fwf(StringIO(data), dtype_backend=dtype_backend) + if dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + string_dtype = pd.ArrowDtype(pa.string()) + else: + string_dtype = pd.StringDtype(string_storage) + expected = DataFrame( { "a": pd.Series([1, 3], dtype="Int64"), "b": pd.Series([2.5, 4.5], dtype="Float64"), "c": pd.Series([True, False], dtype="boolean"), - "d": arr, + "d": pd.Series(["a", "b"], dtype=string_dtype), "e": pd.Series([pd.NA, True], dtype="boolean"), "f": pd.Series([pd.NA, 6], dtype="Int64"), "g": pd.Series([pd.NA, 7.5], dtype="Float64"), - "h": arr_na, + "h": pd.Series([None, "a"], dtype=string_dtype), "i": pd.Series([pd.NA, pd.NA], dtype="Int64"), } ) @@ -1013,7 +974,9 @@ def test_dtype_backend(string_storage, dtype_backend): ) expected["i"] = ArrowExtensionArray(pa.array([None, None])) - tm.assert_frame_equal(result, expected) + # the storage of the str columns' Index is also affected by the + # string_storage setting -> ignore that for checking the result + tm.assert_frame_equal(result, expected, check_column_type=False) def test_invalid_dtype_backend(): diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py index 2d50916228f14..99642ee4befc6 100644 --- a/pandas/tests/io/parser/test_skiprows.py +++ b/pandas/tests/io/parser/test_skiprows.py @@ -42,7 +42,9 @@ def test_skip_rows_bug(all_parsers, skiprows): StringIO(text), skiprows=skiprows, header=None, index_col=0, parse_dates=True ) index = Index( - [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0 + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], + dtype="M8[s]", + name=0, ) expected = DataFrame( @@ -85,7 +87,9 @@ def test_skip_rows_blank(all_parsers): StringIO(text), skiprows=6, header=None, index_col=0, parse_dates=True ) index = Index( - [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0 + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], + dtype="M8[s]", + name=0, ) expected = DataFrame( @@ -187,9 +191,10 @@ def test_skip_row_with_newline_and_quote(all_parsers, data, exp_data): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # ValueError: The 'delim_whitespace' option is not supported +@xfail_pyarrow # ValueError: the 'pyarrow' engine does not support regex separators @pytest.mark.parametrize( - "lineterminator", ["\n", "\r\n", "\r"] # "LF" # "CRLF" # "CR" + "lineterminator", + ["\n", "\r\n", "\r"], # "LF" # "CRLF" # "CR" ) def test_skiprows_lineterminator(all_parsers, lineterminator, request): # see gh-9079 @@ -217,16 
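[Editorial sketch, not part of the diff: the removed delim_whitespace=True keyword maps directly onto sep=r"\s+", which is the spelling these skiprows tests now use.]

    from io import StringIO
    import pandas as pd

    data = "a  b   c\n1  2   3\n4  5   6"
    df = pd.read_csv(StringIO(data), sep=r"\s+")  # was: delim_whitespace=True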
+222,12 @@ def test_skiprows_lineterminator(all_parsers, lineterminator, request): data = data.replace("\n", lineterminator) - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( - StringIO(data), - skiprows=1, - delim_whitespace=True, - names=["date", "time", "var", "flag", "oflag"], - ) + result = parser.read_csv( + StringIO(data), + skiprows=1, + sep=r"\s+", + names=["date", "time", "var", "flag", "oflag"], + ) tm.assert_frame_equal(result, expected) @@ -245,8 +246,8 @@ def test_skiprows_infield_quote(all_parsers): @pytest.mark.parametrize( "kwargs,expected", [ - ({}, DataFrame({"1": [3, 5]})), - ({"header": 0, "names": ["foo"]}, DataFrame({"foo": [3, 5]})), + ({}, "1"), + ({"header": 0, "names": ["foo"]}, "foo"), ], ) def test_skip_rows_callable(all_parsers, kwargs, expected): @@ -254,6 +255,7 @@ def test_skip_rows_callable(all_parsers, kwargs, expected): data = "a\n1\n2\n3\n4\n5" result = parser.read_csv(StringIO(data), skiprows=lambda x: x % 2 == 0, **kwargs) + expected = DataFrame({expected: [3, 5]}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index fef5414e85e52..eeb783f1957b7 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -2,6 +2,7 @@ Tests the TextReader class in parsers.pyx, which is integral to the C engine in parsers.py """ + from io import ( BytesIO, StringIO, @@ -47,6 +48,13 @@ def test_StringIO(self, csv_path): reader = TextReader(src, header=None) reader.read() + def test_encoding_mismatch_warning(self, csv_path): + # GH-57954 + with open(csv_path, encoding="UTF-8") as f: + msg = "latin1 is different from the encoding" + with pytest.raises(ValueError, match=msg): + read_csv(f, encoding="latin1") + def test_string_factorize(self): # should this be optional? data = "a\nb\na\nb\na" @@ -136,7 +144,10 @@ def test_skip_bad_lines(self): reader.read() reader = TextReader( - StringIO(data), delimiter=":", header=None, on_bad_lines=2 # Skip + StringIO(data), + delimiter=":", + header=None, + on_bad_lines=2, # Skip ) result = reader.read() expected = { @@ -148,7 +159,10 @@ def test_skip_bad_lines(self): with tm.assert_produces_warning(ParserWarning, match="Skipping line"): reader = TextReader( - StringIO(data), delimiter=":", header=None, on_bad_lines=1 # Warn + StringIO(data), + delimiter=":", + header=None, + on_bad_lines=1, # Warn ) reader.read() diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index f8790bdb5fa42..07f84466e3ac2 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -6,6 +6,7 @@ Ultimately, the goal is to remove test cases from this test suite as new feature support is added to the parsers. 
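[Editorial sketch, not part of the diff: per the new GH-57954 test above, passing an already-decoded text handle together with a conflicting encoding argument raises. The scratch file name "demo.csv" is illustrative.]

    import pandas as pd

    with open("demo.csv", "w", encoding="utf-8") as f:  # hypothetical scratch file
        f.write("a,b\n1,2\n")
    with open("demo.csv", encoding="utf-8") as f:
        try:
            pd.read_csv(f, encoding="latin1")  # conflicts with the handle's encoding
        except ValueError as err:
            print(err)  # latin1 is different from the encoding ...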
""" + from io import StringIO import os from pathlib import Path @@ -43,12 +44,7 @@ def test_c_engine(self): data = "a b c\n1 2 3" msg = "does not support" - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - # specify C engine with unsupported options (raise) - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - read_csv(StringIO(data), engine="c", sep=None, delim_whitespace=False) with pytest.raises(ValueError, match=msg): read_csv(StringIO(data), engine="c", sep=r"\s") with pytest.raises(ValueError, match=msg): @@ -57,8 +53,6 @@ def test_c_engine(self): read_csv(StringIO(data), engine="c", skipfooter=1) # specify C-unsupported options without python-unsupported options - with tm.assert_produces_warning((parsers.ParserWarning, FutureWarning)): - read_csv(StringIO(data), sep=None, delim_whitespace=False) with tm.assert_produces_warning(parsers.ParserWarning): read_csv(StringIO(data), sep=r"\s") with tm.assert_produces_warning(parsers.ParserWarning): @@ -104,8 +98,8 @@ def test_python_engine(self, python_engine): for default in py_unsupported: msg = ( - f"The {repr(default)} option is not " - f"supported with the {repr(python_engine)} engine" + f"The {default!r} option is not " + f"supported with the {python_engine!r} engine" ) kwargs = {default: object()} @@ -143,10 +137,7 @@ def test_pyarrow_engine(self): 1,2,3,4,""" for default in pa_unsupported: - msg = ( - f"The {repr(default)} option is not " - f"supported with the 'pyarrow' engine" - ) + msg = f"The {default!r} option is not supported with the 'pyarrow' engine" kwargs = {default: object()} default_needs_bool = {"warn_bad_lines", "error_bad_lines"} if default == "dialect": @@ -156,18 +147,8 @@ def test_pyarrow_engine(self): elif default == "on_bad_lines": kwargs[default] = "warn" - warn = None - depr_msg = None - if "delim_whitespace" in kwargs: - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - warn = FutureWarning - if "verbose" in kwargs: - depr_msg = "The 'verbose' keyword in pd.read_csv is deprecated" - warn = FutureWarning - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(warn, match=depr_msg): - read_csv(StringIO(data), engine="pyarrow", **kwargs) + read_csv(StringIO(data), engine="pyarrow", **kwargs) def test_on_bad_lines_callable_python_or_pyarrow(self, all_parsers): # GH 5686 diff --git a/pandas/tests/io/parser/usecols/test_parse_dates.py b/pandas/tests/io/parser/usecols/test_parse_dates.py index bc66189ca064e..cc54f2487aa60 100644 --- a/pandas/tests/io/parser/usecols/test_parse_dates.py +++ b/pandas/tests/io/parser/usecols/test_parse_dates.py @@ -2,6 +2,7 @@ Tests the usecols functionality during parsing for all of the parsers defined in parsers.py """ + from io import StringIO import pytest @@ -25,42 +26,6 @@ ) -@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) -def test_usecols_with_parse_dates(all_parsers, usecols): - # see gh-9755 - data = """a,b,c,d,e -0,1,2014-01-01,09:00,4 -0,1,2014-01-02,10:00,4""" - parser = all_parsers - parse_dates = [[1, 2]] - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - - cols = { - "a": [0, 0], - "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], - } - expected = DataFrame(cols, columns=["c_d", "a"]) - if parser.engine == "pyarrow": - with pytest.raises(ValueError, match=_msg_pyarrow_requires_names): - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, 
check_stacklevel=False - ): - parser.read_csv( - StringIO(data), usecols=usecols, parse_dates=parse_dates - ) - return - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( - StringIO(data), usecols=usecols, parse_dates=parse_dates - ) - tm.assert_frame_equal(result, expected) - - @skip_pyarrow # pyarrow.lib.ArrowKeyError: Column 'fdate' in include_columns def test_usecols_with_parse_dates2(all_parsers): # see gh-13604 @@ -105,7 +70,7 @@ def test_usecols_with_parse_dates3(all_parsers): parse_dates = [0] cols = { - "a": Timestamp("2016-09-21").as_unit("ns"), + "a": Timestamp("2016-09-21"), "b": [1], "c": [1], "d": [2], @@ -120,75 +85,3 @@ def test_usecols_with_parse_dates3(all_parsers): result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates) tm.assert_frame_equal(result, expected) - - -def test_usecols_with_parse_dates4(all_parsers): - data = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8" - usecols = list("abcdefghij") - parse_dates = [[0, 1]] - parser = all_parsers - - cols = { - "a_b": "2016/09/21 1", - "c": [1], - "d": [2], - "e": [3], - "f": [4], - "g": [5], - "h": [6], - "i": [7], - "j": [8], - } - expected = DataFrame(cols, columns=["a_b"] + list("cdefghij")) - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - (FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( - StringIO(data), - usecols=usecols, - parse_dates=parse_dates, - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) -@pytest.mark.parametrize( - "names", - [ - list("abcde"), # Names span all columns in original data. - list("acd"), # Names span only the selected columns. 
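[Editorial sketch, not part of the diff: only the nested column-combining spellings of parse_dates are removed above; usecols combined with a plain parse_dates list, as in the surviving test_usecols_with_parse_dates3, keeps working.]

    from io import StringIO
    import pandas as pd

    data = "a,b,c,d\n2016/09/21,1,1,2"
    df = pd.read_csv(StringIO(data), usecols=list("abcd"), parse_dates=[0])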
- ], -) -def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names, request): - # see gh-9755 - s = """0,1,2014-01-01,09:00,4 -0,1,2014-01-02,10:00,4""" - parse_dates = [[1, 2]] - parser = all_parsers - - if parser.engine == "pyarrow" and not (len(names) == 3 and usecols[0] == 0): - mark = pytest.mark.xfail( - reason="Length mismatch in some cases, UserWarning in other" - ) - request.applymarker(mark) - - cols = { - "a": [0, 0], - "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], - } - expected = DataFrame(cols, columns=["c_d", "a"]) - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - (FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( - StringIO(s), names=names, parse_dates=parse_dates, usecols=usecols - ) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/usecols/test_strings.py b/pandas/tests/io/parser/usecols/test_strings.py index d4ade41d38465..1538bd4e805f7 100644 --- a/pandas/tests/io/parser/usecols/test_strings.py +++ b/pandas/tests/io/parser/usecols/test_strings.py @@ -2,6 +2,7 @@ Tests the usecols functionality during parsing for all of the parsers defined in parsers.py """ + from io import StringIO import pytest @@ -74,8 +75,7 @@ def test_usecols_with_mixed_encoding_strings(all_parsers, usecols): parser.read_csv(StringIO(data), usecols=usecols) -@pytest.mark.parametrize("usecols", [["あああ", "いい"], ["あああ", "いい"]]) -def test_usecols_with_multi_byte_characters(all_parsers, usecols): +def test_usecols_with_multi_byte_characters(all_parsers): data = """あああ,いい,ううう,ええええ 0.056674973,8,True,a 2.613230982,2,False,b @@ -92,5 +92,5 @@ def test_usecols_with_multi_byte_characters(all_parsers, usecols): } expected = DataFrame(exp_data) - result = parser.read_csv(StringIO(data), usecols=usecols) + result = parser.read_csv(StringIO(data), usecols=["あああ", "いい"]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py index 767fba666e417..82b42beb38ae0 100644 --- a/pandas/tests/io/parser/usecols/test_usecols_basic.py +++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py @@ -2,6 +2,7 @@ Tests the usecols functionality during parsing for all of the parsers defined in parsers.py """ + from io import StringIO import numpy as np @@ -157,7 +158,8 @@ def test_usecols_single_string(all_parsers): parser.read_csv(StringIO(data), usecols="foo") -@skip_pyarrow # CSV parse error in one case, AttributeError in another +# ArrowKeyError: Column 'a' in include_columns does not exist in CSV file +@skip_pyarrow @pytest.mark.parametrize( "data", ["a,b,c,d\n1,2,3,4\n5,6,7,8", "a,b,c,d\n1,2,3,4,\n5,6,7,8,"] ) @@ -253,29 +255,12 @@ def test_usecols_regex_sep(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow # Column 'a' in include_columns does not exist in CSV file def test_usecols_with_whitespace(all_parsers): parser = all_parsers data = "a b c\n4 apple bat 5.7\n8 orange cow 10" - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - - if parser.engine == "pyarrow": - msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine" - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - parser.read_csv( - StringIO(data), delim_whitespace=True, usecols=("a", "b") - 
) - return - - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( - StringIO(data), delim_whitespace=True, usecols=("a", "b") - ) + result = parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b")) expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]) tm.assert_frame_equal(result, expected) @@ -389,20 +374,18 @@ def test_incomplete_first_row(all_parsers, usecols): "19,29,39\n" * 2 + "10,20,30,40", [0, 1, 2], {"header": None}, - DataFrame([[19, 29, 39], [19, 29, 39], [10, 20, 30]]), + [[19, 29, 39], [19, 29, 39], [10, 20, 30]], ), # see gh-9549 ( ("A,B,C\n1,2,3\n3,4,5\n1,2,4,5,1,6\n1,2,3,,,1,\n1,2,3\n5,6,7"), ["A", "B", "C"], {}, - DataFrame( - { - "A": [1, 3, 1, 1, 1, 5], - "B": [2, 4, 2, 2, 2, 6], - "C": [3, 5, 4, 3, 3, 7], - } - ), + { + "A": [1, 3, 1, 1, 1, 5], + "B": [2, 4, 2, 2, 2, 6], + "C": [3, 5, 4, 3, 3, 7], + }, ), ], ) @@ -410,6 +393,7 @@ def test_uneven_length_cols(all_parsers, data, usecols, kwargs, expected): # see gh-8985 parser = all_parsers result = parser.read_csv(StringIO(data), usecols=usecols, **kwargs) + expected = DataFrame(expected) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py index 00a81a4f1f385..479f2468a86ab 100644 --- a/pandas/tests/io/pytables/test_append.py +++ b/pandas/tests/io/pytables/test_append.py @@ -6,7 +6,7 @@ import pytest from pandas._libs.tslibs import Timestamp -import pandas.util._test_decorators as td +from pandas.compat import PY312 import pandas as pd from pandas import ( @@ -23,7 +23,7 @@ ensure_clean_store, ) -pytestmark = pytest.mark.single_cpu +pytestmark = [pytest.mark.single_cpu] tables = pytest.importorskip("tables") @@ -35,7 +35,7 @@ def test_append(setup_path): # tables.NaturalNameWarning): df = DataFrame( np.random.default_rng(2).standard_normal((20, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=20, freq="B"), ) _maybe_remove(store, "df1") @@ -133,6 +133,8 @@ def test_append_series(setup_path): # select on the index and values expected = ns[(ns > 70) & (ns.index < 90)] + # Reading/writing RangeIndex info is not supported yet + expected.index = Index(expected.index._data) result = store.select("ns", "foo>70 and index<90") tm.assert_series_equal(result, expected, check_index_type=True) @@ -142,7 +144,7 @@ def test_append_series(setup_path): mi["C"] = "foo" mi.loc[3:5, "C"] = "bar" mi.set_index(["C", "B"], inplace=True) - s = mi.stack(future_stack=True) + s = mi.stack() s.index = s.index.droplevel(2) store.append("mi", s) tm.assert_series_equal(store["mi"], s, check_index_type=True) @@ -196,7 +198,7 @@ def test_append_some_nans(setup_path): tm.assert_frame_equal(store["df3"], df3, check_index_type=True) -def test_append_all_nans(setup_path): +def test_append_all_nans(setup_path, using_infer_string): with ensure_clean_store(setup_path) as store: df = DataFrame( { @@ -248,7 +250,13 @@ def test_append_all_nans(setup_path): _maybe_remove(store, "df") store.append("df", df[:10], dropna=True) store.append("df", df[10:], dropna=True) - tm.assert_frame_equal(store["df"], df, check_index_type=True) + result = store["df"] + expected = df + if using_infer_string: + # TODO: Test is incorrect when not using_infer_string. + # Should take the last 4 rows unconditionally.
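[Editorial sketch, not part of the diff: the append/dropna behavior the hunk above tests, in miniature. With dropna=True, rows that are all-NaN are not written, so the read-back frame contains only the populated rows. The file name "demo.h5" is illustrative; PyTables must be installed.]

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"A": [np.nan, 1.0], "B": [np.nan, 2.0]})
    with pd.HDFStore("demo.h5") as store:  # hypothetical path
        store.append("df", df, dropna=True)
        print(store["df"])  # only the fully populated second row is written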
+ expected = expected[-4:] + tm.assert_frame_equal(result, expected, check_index_type=True) _maybe_remove(store, "df2") store.append("df2", df[:10], dropna=False) @@ -282,12 +290,12 @@ def test_append_all_nans(setup_path): tm.assert_frame_equal(store["df2"], df, check_index_type=True) -def test_append_frame_column_oriented(setup_path): +def test_append_frame_column_oriented(setup_path, request): with ensure_clean_store(setup_path) as store: # column oriented df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df.index = df.index._with_freq(None) # freq doesn't round-trip @@ -302,6 +310,13 @@ def test_append_frame_column_oriented(setup_path): tm.assert_frame_equal(expected, result) # selection on the non-indexable + request.applymarker( + pytest.mark.xfail( + PY312, + reason="AST change in PY312", + raises=ValueError, + ) + ) result = store.select("df1", ("columns=A", "index=df.index[0:4]")) expected = df.reindex(columns=["A"], index=df.index[0:4]) tm.assert_frame_equal(expected, result) @@ -412,7 +427,7 @@ def check_col(key, name, size): { "A": [0.0, 1.0, 2.0, 3.0, 4.0], "B": [0.0, 1.0, 0.0, 1.0, 0.0], - "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"], dtype=object), + "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"]), "D": date_range("20130101", periods=5), } ).set_index("C") @@ -439,7 +454,7 @@ def check_col(key, name, size): _maybe_remove(store, "df") df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df["string"] = "foo" @@ -503,7 +518,7 @@ def test_append_with_data_columns(setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df.iloc[0, df.columns.get_loc("B")] = 1.0 @@ -679,8 +694,8 @@ def test_append_misc(setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) store.append("df", df, chunksize=1) result = store.select("df") @@ -696,8 +711,8 @@ def test_append_misc_chunksize(setup_path, chunksize): # more chunksize in append tests df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) df["string"] = "foo" df["float322"] = 1.0 @@ -733,19 +748,15 @@ def test_append_misc_empty_frame(setup_path): tm.assert_frame_equal(store.select("df2"), df) -# TODO(ArrayManager) currently we rely on falling back to BlockManager, but -# the conversion from AM->BM converts the invalid object dtype column into -# a datetime64 column no longer raising an error -@td.skip_array_manager_not_yet_implemented -def test_append_raise(setup_path): +def test_append_raise(setup_path, using_infer_string): with ensure_clean_store(setup_path) as store: # test append with invalid input to get good error messages # list in column df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - 
columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) df["invalid"] = [["a"]] * len(df) assert df.dtypes["invalid"] == np.object_ @@ -765,8 +776,8 @@ def test_append_raise(setup_path): # datetime with embedded nans as object df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) s = Series(datetime.datetime(2001, 1, 2), index=df.index) s = s.astype(object) @@ -785,7 +796,7 @@ def test_append_raise(setup_path): # series directly msg = re.escape( "cannot properly create the storer for: " - "[group->df,value->]" + "[group->df,value->]" ) with pytest.raises(TypeError, match=msg): store.append("df", Series(np.arange(10))) @@ -793,8 +804,8 @@ def test_append_raise(setup_path): # appending an incompatible table df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) store.append("df", df) @@ -813,12 +824,9 @@ def test_append_raise(setup_path): store.append("df", df) df["foo"] = "bar" msg = re.escape( - "invalid combination of [values_axes] on appending data " - "[name->values_block_1,cname->values_block_1," - "dtype->bytes24,kind->string,shape->(1, 30)] " - "vs current table " - "[name->values_block_1,cname->values_block_1," - "dtype->datetime64[s],kind->datetime64[s],shape->None]" + "Cannot serialize the column [foo] " + "because its data contents are not [string] " + "but [datetime64[s]] object dtype" ) with pytest.raises(ValueError, match=msg): store.append("df", df) @@ -874,7 +882,7 @@ def test_append_with_timedelta(setup_path): def test_append_to_multiple(setup_path): df1 = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df2 = df1.copy().rename(columns="{}_2".format) @@ -911,12 +919,12 @@ def test_append_to_multiple(setup_path): def test_append_to_multiple_dropna(setup_path): df1 = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df2 = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ).rename(columns="{}_2".format) df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan @@ -936,15 +944,16 @@ def test_append_to_multiple_dropna(setup_path): def test_append_to_multiple_dropna_false(setup_path): df1 = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df2 = df1.copy().rename(columns="{}_2".format) df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan df = concat([df1, df2], axis=1) - with ensure_clean_store(setup_path) as store, pd.option_context( - "io.hdf.dropna_table", True + with ( + ensure_clean_store(setup_path) as store, + pd.option_context("io.hdf.dropna_table", True), ): # dropna=False shouldn't synchronize row indexes 
store.append_to_multiple( @@ -970,6 +979,8 @@ def test_append_to_multiple_min_itemsize(setup_path): } ) expected = df.iloc[[0]] + # Reading/writing RangeIndex info is not supported yet + expected.index = Index(list(range(len(expected.index)))) with ensure_clean_store(setup_path) as store: store.append_to_multiple( @@ -984,3 +995,29 @@ def test_append_to_multiple_min_itemsize(setup_path): ) result = store.select_as_multiple(["index", "nums", "strs"]) tm.assert_frame_equal(result, expected, check_index_type=True) + + +def test_append_string_nan_rep(setup_path): + # GH 16300 + df = DataFrame({"A": "a", "B": "foo"}, index=np.arange(10)) + df_nan = df.copy() + df_nan.loc[0:4, :] = np.nan + msg = "NaN representation is too large for existing column size" + + with ensure_clean_store(setup_path) as store: + # string column too small + store.append("sa", df["A"]) + with pytest.raises(ValueError, match=msg): + store.append("sa", df_nan["A"]) + + # nan_rep too big + store.append("sb", df["B"], nan_rep="bars") + with pytest.raises(ValueError, match=msg): + store.append("sb", df_nan["B"]) + + # smaller modified nan_rep + store.append("sc", df["A"], nan_rep="n") + store.append("sc", df_nan["A"]) + result = store["sc"] + expected = concat([df["A"], df_nan["A"]]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/io/pytables/test_categorical.py b/pandas/tests/io/pytables/test_categorical.py index 58ebdfe7696b4..ed2616b24cd71 100644 --- a/pandas/tests/io/pytables/test_categorical.py +++ b/pandas/tests/io/pytables/test_categorical.py @@ -14,7 +14,7 @@ ensure_clean_store, ) -pytestmark = pytest.mark.single_cpu +pytestmark = [pytest.mark.single_cpu] def test_categorical(setup_path): @@ -190,25 +190,19 @@ def test_categorical_nan_only_columns(tmp_path, setup_path): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize( - "where, df, expected", - [ - ('col=="q"', DataFrame({"col": ["a", "b", "s"]}), DataFrame({"col": []})), - ('col=="a"', DataFrame({"col": ["a", "b", "s"]}), DataFrame({"col": ["a"]})), - ], -) -def test_convert_value( - tmp_path, setup_path, where: str, df: DataFrame, expected: DataFrame -): +@pytest.mark.parametrize("where, expected", [["q", []], ["a", ["a"]]]) +def test_convert_value(tmp_path, setup_path, where: str, expected): # GH39420 # Check that read_hdf with categorical columns can filter by where condition. 
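+ # df and expected are built inside the test so the parametrization carries only plain values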
+ df = DataFrame({"col": ["a", "b", "s"]}) df.col = df.col.astype("category") max_widths = {"col": 1} categorical_values = sorted(df.col.unique()) + expected = DataFrame({"col": expected}) expected.col = expected.col.astype("category") expected.col = expected.col.cat.set_categories(categorical_values) path = tmp_path / setup_path df.to_hdf(path, key="df", format="table", min_itemsize=max_widths) - result = read_hdf(path, where=where) + result = read_hdf(path, where=f'col=="{where}"') tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/pytables/test_compat.py b/pandas/tests/io/pytables/test_compat.py index b07fb3ddd3ac8..b78a503a2b8a3 100644 --- a/pandas/tests/io/pytables/test_compat.py +++ b/pandas/tests/io/pytables/test_compat.py @@ -36,7 +36,7 @@ def pytables_hdf5_file(tmp_path): t.row[key] = value t.row.append() - yield path, objname, pd.DataFrame(testsamples) + return path, objname, pd.DataFrame(testsamples) class TestReadPyTablesHDF5: diff --git a/pandas/tests/io/pytables/test_errors.py b/pandas/tests/io/pytables/test_errors.py index 2021101098892..b28101c09820f 100644 --- a/pandas/tests/io/pytables/test_errors.py +++ b/pandas/tests/io/pytables/test_errors.py @@ -22,7 +22,7 @@ _maybe_adjust_name, ) -pytestmark = pytest.mark.single_cpu +pytestmark = [pytest.mark.single_cpu] def test_pass_spec_to_storer(setup_path): @@ -88,9 +88,14 @@ def test_unimplemented_dtypes_table_columns(setup_path): with ensure_clean_store(setup_path) as store: # this fails because we have a date in the object block...... - msg = re.escape( - """Cannot serialize the column [datetime1] -because its data contents are not [string] but [date] object dtype""" + msg = "|".join( + [ + re.escape( + "Cannot serialize the column [datetime1]\nbecause its data " + "contents are not [string] but [date] object dtype" + ), + re.escape("[date] is not implemented as a table column"), + ] ) with pytest.raises(TypeError, match=msg): store.append("df_unimplemented", df) diff --git a/pandas/tests/io/pytables/test_file_handling.py b/pandas/tests/io/pytables/test_file_handling.py index d93de16816725..27b5d34146f85 100644 --- a/pandas/tests/io/pytables/test_file_handling.py +++ b/pandas/tests/io/pytables/test_file_handling.py @@ -8,6 +8,7 @@ is_ci_environment, is_platform_linux, is_platform_little_endian, + is_platform_mac, ) from pandas.errors import ( ClosedFileError, @@ -32,11 +33,11 @@ from pandas.io import pytables from pandas.io.pytables import Term -pytestmark = pytest.mark.single_cpu +pytestmark = [pytest.mark.single_cpu] @pytest.mark.parametrize("mode", ["r", "r+", "a", "w"]) -def test_mode(setup_path, tmp_path, mode): +def test_mode(setup_path, tmp_path, mode, using_infer_string): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), columns=Index(list("ABCD"), dtype=object), @@ -85,10 +86,12 @@ def test_mode(setup_path, tmp_path, mode): read_hdf(path, "df", mode=mode) else: result = read_hdf(path, "df", mode=mode) + if using_infer_string: + df.columns = df.columns.astype("str") tm.assert_frame_equal(result, df) -def test_default_mode(tmp_path, setup_path): +def test_default_mode(tmp_path, setup_path, using_infer_string): # read_hdf uses default mode df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), @@ -98,7 +101,10 @@ def test_default_mode(tmp_path, setup_path): path = tmp_path / setup_path df.to_hdf(path, key="df", mode="w") result = read_hdf(path, "df") - tm.assert_frame_equal(result, df) + expected = df.copy() + if using_infer_string: + expected.columns = 
expected.columns.astype("str") + tm.assert_frame_equal(result, expected) def test_reopen_handle(tmp_path, setup_path): @@ -157,7 +163,7 @@ def test_reopen_handle(tmp_path, setup_path): assert not store.is_open -def test_open_args(setup_path): +def test_open_args(setup_path, using_infer_string): with tm.ensure_clean(setup_path) as path: df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), @@ -172,8 +178,13 @@ def test_open_args(setup_path): store["df"] = df store.append("df2", df) - tm.assert_frame_equal(store["df"], df) - tm.assert_frame_equal(store["df2"], df) + expected = df.copy() + if using_infer_string: + expected.index = expected.index.astype("str") + expected.columns = expected.columns.astype("str") + + tm.assert_frame_equal(store["df"], expected) + tm.assert_frame_equal(store["df2"], expected) store.close() @@ -188,7 +199,7 @@ def test_flush(setup_path): store.flush(fsync=True) -def test_complibs_default_settings(tmp_path, setup_path): +def test_complibs_default_settings(tmp_path, setup_path, using_infer_string): # GH15943 df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), @@ -201,7 +212,11 @@ def test_complibs_default_settings(tmp_path, setup_path): tmpfile = tmp_path / setup_path df.to_hdf(tmpfile, key="df", complevel=9) result = read_hdf(tmpfile, "df") - tm.assert_frame_equal(result, df) + expected = df.copy() + if using_infer_string: + expected.index = expected.index.astype("str") + expected.columns = expected.columns.astype("str") + tm.assert_frame_equal(result, expected) with tables.open_file(tmpfile, mode="r") as h5file: for node in h5file.walk_nodes(where="/df", classname="Leaf"): @@ -212,7 +227,11 @@ def test_complibs_default_settings(tmp_path, setup_path): tmpfile = tmp_path / setup_path df.to_hdf(tmpfile, key="df", complib="zlib") result = read_hdf(tmpfile, "df") - tm.assert_frame_equal(result, df) + expected = df.copy() + if using_infer_string: + expected.index = expected.index.astype("str") + expected.columns = expected.columns.astype("str") + tm.assert_frame_equal(result, expected) with tables.open_file(tmpfile, mode="r") as h5file: for node in h5file.walk_nodes(where="/df", classname="Leaf"): @@ -223,7 +242,11 @@ def test_complibs_default_settings(tmp_path, setup_path): tmpfile = tmp_path / setup_path df.to_hdf(tmpfile, key="df") result = read_hdf(tmpfile, "df") - tm.assert_frame_equal(result, df) + expected = df.copy() + if using_infer_string: + expected.index = expected.index.astype("str") + expected.columns = expected.columns.astype("str") + tm.assert_frame_equal(result, expected) with tables.open_file(tmpfile, mode="r") as h5file: for node in h5file.walk_nodes(where="/df", classname="Leaf"): @@ -258,7 +281,7 @@ def test_complibs_default_settings_override(tmp_path, setup_path): @pytest.mark.filterwarnings("ignore:object name is not a valid") @pytest.mark.skipif( not PY311 and is_ci_environment() and is_platform_linux(), - reason="Segfaulting in a CI environment" + reason="Segfaulting in a CI environment", # with xfail, would sometimes raise UnicodeDecodeError # invalid state byte ) @@ -287,12 +310,17 @@ def test_complibs(tmp_path, lvl, lib, request): result = read_hdf(tmpfile, gname) tm.assert_frame_equal(result, df) + is_mac = is_platform_mac() + # Open file and check metadata for correct amount of compression with tables.open_file(tmpfile, mode="r") as h5table: for node in h5table.walk_nodes(where="/" + gname, classname="Leaf"): assert node.filters.complevel == lvl if lvl == 0: assert node.filters.complib is None + elif is_mac and lib == "blosc2": + res = 
node.filters.complib + assert res in [lib, "blosc2:blosclz"], res else: assert node.filters.complib == lib @@ -328,7 +356,7 @@ def test_encoding(setup_path): [b"A\xf8\xfc", np.nan, b"", b"b", b"c"], ], ) -@pytest.mark.parametrize("dtype", ["category", object]) +@pytest.mark.parametrize("dtype", ["category", None]) def test_latin_encoding(tmp_path, setup_path, dtype, val): enc = "latin-1" nan_rep = "" diff --git a/pandas/tests/io/pytables/test_keys.py b/pandas/tests/io/pytables/test_keys.py index 55bd3f0d5a03a..9c5fc8786c7c6 100644 --- a/pandas/tests/io/pytables/test_keys.py +++ b/pandas/tests/io/pytables/test_keys.py @@ -13,7 +13,7 @@ tables, ) -pytestmark = pytest.mark.single_cpu +pytestmark = [pytest.mark.single_cpu] def test_keys(setup_path): diff --git a/pandas/tests/io/pytables/test_put.py b/pandas/tests/io/pytables/test_put.py index bc5f046b7fa33..c9fe6070b34c3 100644 --- a/pandas/tests/io/pytables/test_put.py +++ b/pandas/tests/io/pytables/test_put.py @@ -22,7 +22,7 @@ ) from pandas.util import _test_decorators as td -pytestmark = pytest.mark.single_cpu +pytestmark = [pytest.mark.single_cpu] def test_format_type(tmp_path, setup_path): @@ -49,8 +49,8 @@ def test_api_default_format(tmp_path, setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) with pd.option_context("io.hdf.default_format", "fixed"): @@ -74,8 +74,8 @@ def test_api_default_format(tmp_path, setup_path): path = tmp_path / setup_path df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) with pd.option_context("io.hdf.default_format", "fixed"): @@ -101,7 +101,7 @@ def test_put(setup_path): ) df = DataFrame( np.random.default_rng(2).standard_normal((20, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=20, freq="B"), ) store["a"] = ts @@ -161,7 +161,7 @@ def test_put_compression(setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) @@ -178,7 +178,7 @@ def test_put_compression(setup_path): def test_put_compression_blosc(setup_path): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) @@ -192,10 +192,20 @@ def test_put_compression_blosc(setup_path): tm.assert_frame_equal(store["c"], df) -def test_put_mixed_type(setup_path): +def test_put_datetime_ser(setup_path, performance_warning, using_infer_string): + # https://github.com/pandas-dev/pandas/pull/60663 + ser = Series(3 * [Timestamp("20010102").as_unit("ns")]) + with ensure_clean_store(setup_path) as store: + store.put("ser", ser) + expected = ser.copy() + result = store.get("ser") + tm.assert_series_equal(result, expected) + + +def test_put_mixed_type(setup_path, performance_warning, using_infer_string): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + 
columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df["obj1"] = "foo" @@ -215,13 +225,42 @@ def test_put_mixed_type(setup_path): with ensure_clean_store(setup_path) as store: _maybe_remove(store, "df") - with tm.assert_produces_warning(pd.errors.PerformanceWarning): + warning = None if using_infer_string else performance_warning + with tm.assert_produces_warning(warning): store.put("df", df) expected = store.get("df") tm.assert_frame_equal(expected, df) +def test_put_str_frame(setup_path, performance_warning, string_dtype_arguments): + # https://github.com/pandas-dev/pandas/pull/60663 + dtype = pd.StringDtype(*string_dtype_arguments) + df = DataFrame({"a": pd.array(["x", pd.NA, "y"], dtype=dtype)}) + with ensure_clean_store(setup_path) as store: + _maybe_remove(store, "df") + + store.put("df", df) + expected_dtype = "str" if dtype.na_value is np.nan else "string" + expected = df.astype(expected_dtype) + result = store.get("df") + tm.assert_frame_equal(result, expected) + + +def test_put_str_series(setup_path, performance_warning, string_dtype_arguments): + # https://github.com/pandas-dev/pandas/pull/60663 + dtype = pd.StringDtype(*string_dtype_arguments) + ser = Series(["x", pd.NA, "y"], dtype=dtype) + with ensure_clean_store(setup_path) as store: + _maybe_remove(store, "df") + + store.put("ser", ser) + expected_dtype = "str" if dtype.na_value is np.nan else "string" + expected = ser.astype(expected_dtype) + result = store.get("ser") + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("format", ["table", "fixed"]) @pytest.mark.parametrize( "index", @@ -248,7 +287,7 @@ def test_store_index_types(setup_path, format, index): tm.assert_frame_equal(df, store["df"]) -def test_column_multiindex(setup_path): +def test_column_multiindex(setup_path, using_infer_string): # GH 4710 # recreate multi-indexes properly @@ -259,6 +298,12 @@ def test_column_multiindex(setup_path): expected = df.set_axis(df.index.to_numpy()) with ensure_clean_store(setup_path) as store: + if using_infer_string: + # TODO(infer_string) make this work for string dtype + msg = "Saving a MultiIndex with an extension dtype is not supported." 
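+ # under infer_string the string levels make this a MultiIndex with an extension dtype, so put() raises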
+ with pytest.raises(NotImplementedError, match=msg): + store.put("df", df) + return store.put("df", df) tm.assert_frame_equal( store["df"], expected, check_index_type=True, check_column_type=True diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py index e4a3ea1fc9db8..7a3a7339e7809 100644 --- a/pandas/tests/io/pytables/test_read.py +++ b/pandas/tests/io/pytables/test_read.py @@ -5,7 +5,6 @@ import numpy as np import pytest -from pandas._libs.tslibs import Timestamp from pandas.compat import is_platform_windows import pandas as pd @@ -22,11 +21,10 @@ _maybe_remove, ensure_clean_store, ) -from pandas.util import _test_decorators as td from pandas.io.pytables import TableIterator -pytestmark = pytest.mark.single_cpu +pytestmark = [pytest.mark.single_cpu] def test_read_missing_key_close_store(tmp_path, setup_path): @@ -75,7 +73,7 @@ def test_read_missing_key_opened_store(tmp_path, setup_path): def test_read_column(setup_path): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) @@ -172,51 +170,7 @@ def test_pytables_native2_read(datapath): assert isinstance(d1, DataFrame) -def test_legacy_table_fixed_format_read_py2(datapath): - # GH 24510 - # legacy table with fixed format written in Python 2 - with ensure_clean_store( - datapath("io", "data", "legacy_hdf", "legacy_table_fixed_py2.h5"), mode="r" - ) as store: - result = store.select("df") - expected = DataFrame( - [[1, 2, 3, "D"]], - columns=["A", "B", "C", "D"], - index=Index(["ABC"], name="INDEX_NAME"), - ) - tm.assert_frame_equal(expected, result) - - -def test_legacy_table_fixed_format_read_datetime_py2(datapath): - # GH 31750 - # legacy table with fixed format and datetime64 column written in Python 2 - expected = DataFrame( - [[Timestamp("2020-02-06T18:00")]], - columns=["A"], - index=Index(["date"]), - dtype="M8[ns]", - ) - with ensure_clean_store( - datapath("io", "data", "legacy_hdf", "legacy_table_fixed_datetime_py2.h5"), - mode="r", - ) as store: - result = store.select("df") - tm.assert_frame_equal(expected, result) - - -def test_legacy_table_read_py2(datapath): - # issue: 24925 - # legacy table written in Python 2 - with ensure_clean_store( - datapath("io", "data", "legacy_hdf", "legacy_table_py2.h5"), mode="r" - ) as store: - result = store.select("table") - - expected = DataFrame({"a": ["a", "b"], "b": [2, 3]}) - tm.assert_frame_equal(expected, result) - - -def test_read_hdf_open_store(tmp_path, setup_path): +def test_read_hdf_open_store(tmp_path, setup_path, using_infer_string): # GH10330 # No check for non-string path_or-buf, and no test of open store df = DataFrame( @@ -228,6 +182,12 @@ def test_read_hdf_open_store(tmp_path, setup_path): df = df.set_index(keys="E", append=True) path = tmp_path / setup_path + if using_infer_string: + # TODO(infer_string) make this work for string dtype + msg = "Saving a MultiIndex with an extension dtype is not supported." 
+ with pytest.raises(NotImplementedError, match=msg): + df.to_hdf(path, key="df", mode="w") + return df.to_hdf(path, key="df", mode="w") direct = read_hdf(path, "df") with HDFStore(path, mode="r") as store: @@ -337,25 +297,6 @@ def test_read_from_pathlib_path(tmp_path, setup_path): tm.assert_frame_equal(expected, actual) -@td.skip_if_no("py.path") -def test_read_from_py_localpath(tmp_path, setup_path): - # GH11773 - from py.path import local as LocalPath - - expected = DataFrame( - np.random.default_rng(2).random((4, 5)), - index=list("abcd"), - columns=list("ABCDE"), - ) - filename = tmp_path / setup_path - path_obj = LocalPath(filename) - - expected.to_hdf(path_obj, key="df", mode="a") - actual = read_hdf(path_obj, key="df") - - tm.assert_frame_equal(expected, actual) - - @pytest.mark.parametrize("format", ["fixed", "table"]) def test_read_hdf_series_mode_r(tmp_path, format, setup_path): # GH 16583 @@ -368,37 +309,8 @@ def test_read_hdf_series_mode_r(tmp_path, format, setup_path): tm.assert_series_equal(result, series) -@pytest.mark.filterwarnings(r"ignore:Period with BDay freq is deprecated:FutureWarning") -@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") -def test_read_py2_hdf_file_in_py3(datapath): - # GH 16781 - - # tests reading a PeriodIndex DataFrame written in Python2 in Python3 - - # the file was generated in Python 2.7 like so: - # - # df = DataFrame([1.,2,3], index=pd.PeriodIndex( - # ['2015-01-01', '2015-01-02', '2015-01-05'], freq='B')) - # df.to_hdf('periodindex_0.20.1_x86_64_darwin_2.7.13.h5', 'p') - - expected = DataFrame( - [1.0, 2, 3], - index=pd.PeriodIndex(["2015-01-01", "2015-01-02", "2015-01-05"], freq="B"), - ) - - with ensure_clean_store( - datapath( - "io", "data", "legacy_hdf", "periodindex_0.20.1_x86_64_darwin_2.7.13.h5" - ), - mode="r", - ) as store: - result = store["p"] - tm.assert_frame_equal(result, expected) - - def test_read_infer_string(tmp_path, setup_path): # GH#54431 - pytest.importorskip("pyarrow") df = DataFrame({"a": ["a", "b", None]}) path = tmp_path / setup_path df.to_hdf(path, key="data", format="table") @@ -406,7 +318,18 @@ def test_read_infer_string(tmp_path, setup_path): result = read_hdf(path, key="data", mode="r") expected = DataFrame( {"a": ["a", "b", None]}, - dtype="string[pyarrow_numpy]", - columns=Index(["a"], dtype="string[pyarrow_numpy]"), + dtype=pd.StringDtype(na_value=np.nan), + columns=Index(["a"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) + + +def test_hdfstore_read_datetime64_unit_s(tmp_path, setup_path): + # GH 59004 + df_s = DataFrame(["2001-01-01", "2002-02-02"], dtype="datetime64[s]") + path = tmp_path / setup_path + with HDFStore(path, mode="w") as store: + store.put("df_s", df_s) + with HDFStore(path, mode="r") as store: + df_fromstore = store.get("df_s") + tm.assert_frame_equal(df_s, df_fromstore) diff --git a/pandas/tests/io/pytables/test_retain_attributes.py b/pandas/tests/io/pytables/test_retain_attributes.py index 6284b826c3cf0..ab8dcacc4b8d4 100644 --- a/pandas/tests/io/pytables/test_retain_attributes.py +++ b/pandas/tests/io/pytables/test_retain_attributes.py @@ -90,3 +90,16 @@ def test_retain_index_attributes2(tmp_path, setup_path): df2.to_hdf(path, key="data", append=True) assert read_hdf(path, "data").index.name is None + + +def test_retain_datetime_attribute(tmp_path, setup_path): + path = tmp_path / setup_path + ser = Series( + ["2024-08-26 15:13:14", "2024-08-26 15:14:14"], + dtype="datetime64[us, UTC]", + ) + dataframe = 
DataFrame(ser) + dataframe.to_hdf(path, key="Annotations", mode="w") + + recovered_dataframe = read_hdf(path, key="Annotations") + tm.assert_frame_equal(dataframe, recovered_dataframe) diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py index 4ba9787a5a6b9..409b92d2ddde1 100644 --- a/pandas/tests/io/pytables/test_round_trip.py +++ b/pandas/tests/io/pytables/test_round_trip.py @@ -24,7 +24,7 @@ ) from pandas.util import _test_decorators as td -pytestmark = pytest.mark.single_cpu +pytestmark = [pytest.mark.single_cpu] def test_conv_read_write(): @@ -44,8 +44,8 @@ def roundtrip(key, obj, **kwargs): o = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) tm.assert_frame_equal(o, roundtrip("frame", o)) @@ -145,8 +145,8 @@ def test_api_invalid(tmp_path, setup_path): # Invalid. df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) msg = "Can only append to Tables" @@ -196,7 +196,7 @@ def test_put_integer(setup_path): _check_roundtrip(df, tm.assert_frame_equal, setup_path) -def test_table_values_dtypes_roundtrip(setup_path): +def test_table_values_dtypes_roundtrip(setup_path, using_infer_string): with ensure_clean_store(setup_path) as store: df1 = DataFrame({"a": [1, 2, 3]}, dtype="f8") store.append("df_f8", df1) @@ -208,12 +208,9 @@ def test_table_values_dtypes_roundtrip(setup_path): # incompatible dtype msg = re.escape( - "invalid combination of [values_axes] on appending data " - "[name->values_block_0,cname->values_block_0," - "dtype->float64,kind->float,shape->(1, 3)] vs " - "current table [name->values_block_0," - "cname->values_block_0,dtype->int64,kind->integer," - "shape->None]" + "Cannot serialize the column [a] " + "because its data contents are not [float] " + "but [integer] object dtype" ) with pytest.raises(ValueError, match=msg): store.append("df_i8", df1) @@ -236,12 +233,15 @@ def test_table_values_dtypes_roundtrip(setup_path): df1["float322"] = 1.0 df1["float322"] = df1["float322"].astype("float32") df1["bool"] = df1["float32"] > 0 - df1["time1"] = Timestamp("20130101") - df1["time2"] = Timestamp("20130102") + df1["time_s_1"] = Timestamp("20130101") + df1["time_s_2"] = Timestamp("20130101 00:00:00") + df1["time_ms"] = Timestamp("20130101 00:00:00.000") + df1["time_ns"] = Timestamp("20130102 00:00:00.000000000") store.append("df_mixed_dtypes1", df1) result = store.select("df_mixed_dtypes1").dtypes.value_counts() result.index = [str(i) for i in result.index] + str_dtype = "str" if using_infer_string else "object" expected = Series( { "float32": 2, @@ -251,8 +251,10 @@ def test_table_values_dtypes_roundtrip(setup_path): "int16": 1, "int8": 1, "int64": 1, - "object": 1, - "datetime64[ns]": 2, + str_dtype: 1, + "datetime64[s]": 2, + "datetime64[ms]": 1, + "datetime64[ns]": 1, }, name="count", ) @@ -271,10 +273,10 @@ def test_series(setup_path): ) _check_roundtrip(ts, tm.assert_series_equal, path=setup_path) - ts2 = Series(ts.index, Index(ts.index, dtype=object)) + ts2 = Series(ts.index, Index(ts.index)) _check_roundtrip(ts2, tm.assert_series_equal, path=setup_path) - ts3 = Series(ts.values, Index(np.asarray(ts.index, dtype=object), dtype=object)) + ts3 = Series(ts.values, 
Index(np.asarray(ts.index))) _check_roundtrip( ts3, tm.assert_series_equal, path=setup_path, check_index_type=False ) @@ -287,14 +289,14 @@ def test_float_index(setup_path): _check_roundtrip(s, tm.assert_series_equal, path=setup_path) -def test_tuple_index(setup_path): +def test_tuple_index(setup_path, performance_warning): # GH #492 col = np.arange(10) idx = [(0.0, 1.0), (2.0, 3.0), (4.0, 5.0)] data = np.random.default_rng(2).standard_normal(30).reshape((3, 10)) DF = DataFrame(data, index=idx, columns=col) - with tm.assert_produces_warning(pd.errors.PerformanceWarning): + with tm.assert_produces_warning(performance_warning): _check_roundtrip(DF, tm.assert_frame_equal, path=setup_path) @@ -364,8 +366,8 @@ def test_timeseries_preepoch(setup_path, request): def test_frame(compression, setup_path): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) # put in some random NAs @@ -381,7 +383,7 @@ def test_frame(compression, setup_path): tdf = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) _check_roundtrip( @@ -396,7 +398,10 @@ def test_frame(compression, setup_path): assert recons._mgr.is_consolidated() # empty - _check_roundtrip(df[:0], tm.assert_frame_equal, path=setup_path) + df2 = df[:0] + # Prevent df2 from having index with inferred_type as string + df2.index = Index([]) + _check_roundtrip(df2[:0], tm.assert_frame_equal, path=setup_path) def test_empty_series_frame(setup_path): @@ -428,9 +433,17 @@ def test_can_serialize_dates(setup_path): _check_roundtrip(frame, tm.assert_frame_equal, path=setup_path) -def test_store_hierarchical(setup_path, multiindex_dataframe_random_data): +def test_store_hierarchical( + setup_path, using_infer_string, multiindex_dataframe_random_data +): frame = multiindex_dataframe_random_data + if using_infer_string: + # TODO(infer_string) make this work for string dtype + msg = "Saving a MultiIndex with an extension dtype is not supported." 
+ with pytest.raises(NotImplementedError, match=msg): + _check_roundtrip(frame, tm.assert_frame_equal, path=setup_path) + return _check_roundtrip(frame, tm.assert_frame_equal, path=setup_path) _check_roundtrip(frame.T, tm.assert_frame_equal, path=setup_path) _check_roundtrip(frame["A"], tm.assert_series_equal, path=setup_path) @@ -449,8 +462,8 @@ def test_store_mixed(compression, setup_path): def _make_one(): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) df["obj1"] = "foo" df["obj2"] = "bar" diff --git a/pandas/tests/io/pytables/test_select.py b/pandas/tests/io/pytables/test_select.py index 0e303d1c890c5..5e76aae28c147 100644 --- a/pandas/tests/io/pytables/test_select.py +++ b/pandas/tests/io/pytables/test_select.py @@ -2,6 +2,7 @@ import pytest from pandas._libs.tslibs import Timestamp +from pandas.compat import PY312 import pandas as pd from pandas import ( @@ -24,7 +25,7 @@ from pandas.io.pytables import Term -pytestmark = pytest.mark.single_cpu +pytestmark = [pytest.mark.single_cpu] def test_select_columns_in_where(setup_path): @@ -132,7 +133,7 @@ def test_select(setup_path): # select with columns= df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) _maybe_remove(store, "df") @@ -168,7 +169,7 @@ def test_select(setup_path): tm.assert_frame_equal(expected, result) -def test_select_dtypes(setup_path): +def test_select_dtypes(setup_path, request): with ensure_clean_store(setup_path) as store: # with a Timestamp data column (GH #2637) df = DataFrame( @@ -272,13 +273,20 @@ def test_select_dtypes(setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) expected = df[df["A"] > 0] store.append("df", df, data_columns=True) + request.applymarker( + pytest.mark.xfail( + PY312, + reason="AST change in PY312", + raises=ValueError, + ) + ) np_zero = np.float64(0) # noqa: F841 result = store.select("df", where=["A>np_zero"]) tm.assert_frame_equal(expected, result) @@ -337,7 +345,7 @@ def test_select_iterator(tmp_path, setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) _maybe_remove(store, "df") @@ -362,7 +370,7 @@ def test_select_iterator(tmp_path, setup_path): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df.to_hdf(path, key="df_non_table") @@ -378,7 +386,7 @@ def test_select_iterator(tmp_path, setup_path): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df.to_hdf(path, key="df", format="table") @@ -395,7 +403,7 @@ def test_select_iterator(tmp_path, setup_path): with ensure_clean_store(setup_path) as store: df1 = DataFrame( 
np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) store.append("df1", df1, data_columns=True) @@ -423,7 +431,7 @@ def test_select_iterator_complete_8014(setup_path): with ensure_clean_store(setup_path) as store: expected = DataFrame( np.random.default_rng(2).standard_normal((100064, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=100064, freq="s"), ) _maybe_remove(store, "df") @@ -458,7 +466,7 @@ def test_select_iterator_complete_8014(setup_path): with ensure_clean_store(setup_path) as store: expected = DataFrame( np.random.default_rng(2).standard_normal((100064, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=100064, freq="s"), ) _maybe_remove(store, "df") @@ -500,7 +508,7 @@ def test_select_iterator_non_complete_8014(setup_path): with ensure_clean_store(setup_path) as store: expected = DataFrame( np.random.default_rng(2).standard_normal((100064, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=100064, freq="s"), ) _maybe_remove(store, "df") @@ -534,7 +542,7 @@ def test_select_iterator_non_complete_8014(setup_path): with ensure_clean_store(setup_path) as store: expected = DataFrame( np.random.default_rng(2).standard_normal((100064, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=100064, freq="s"), ) _maybe_remove(store, "df") @@ -558,7 +566,7 @@ def test_select_iterator_many_empty_frames(setup_path): with ensure_clean_store(setup_path) as store: expected = DataFrame( np.random.default_rng(2).standard_normal((100064, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=100064, freq="s"), ) _maybe_remove(store, "df") @@ -607,10 +615,10 @@ def test_select_iterator_many_empty_frames(setup_path): assert len(results) == 0 -def test_frame_select(setup_path): +def test_frame_select(setup_path, request): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) @@ -624,6 +632,13 @@ def test_frame_select(setup_path): crit2 = "columns=['A', 'D']" crit3 = "columns=A" + request.applymarker( + pytest.mark.xfail( + PY312, + reason="AST change in PY312", + raises=TypeError, + ) + ) result = store.select("frame", [crit1, crit2]) expected = df.loc[date:, ["A", "D"]] tm.assert_frame_equal(result, expected) @@ -635,7 +650,7 @@ def test_frame_select(setup_path): # invalid terms df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) store.append("df_time", df) @@ -654,7 +669,7 @@ def test_frame_select_complex(setup_path): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df["string"] = "foo" @@ -771,7 +786,7 @@ def test_invalid_filtering(setup_path): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), 
index=date_range("2000-01-01", periods=10, freq="B"), ) @@ -793,7 +808,7 @@ def test_string_select(setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) @@ -837,7 +852,7 @@ def test_string_select(setup_path): def test_select_as_multiple(setup_path): df1 = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df2 = df1.copy().rename(columns="{}_2".format) @@ -1038,7 +1053,6 @@ def test_select_large_integer(tmp_path): ), columns=["x", "y"], ) - result = None with HDFStore(path) as s: s.append("data", df, data_columns=True, index=False) result = s.select("data", where="y==-9223372036854775801").get("y").get(0) diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 82d3052e7f5d6..5cfefeb469e8a 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas.compat import PY312 + import pandas as pd from pandas import ( DataFrame, @@ -21,6 +23,9 @@ timedelta_range, ) import pandas._testing as tm +from pandas.api.types import ( + CategoricalDtype, +) from pandas.tests.io.pytables.common import ( _maybe_remove, ensure_clean_store, @@ -31,7 +36,7 @@ read_hdf, ) -pytestmark = pytest.mark.single_cpu +pytestmark = [pytest.mark.single_cpu] tables = pytest.importorskip("tables") @@ -103,7 +108,7 @@ def test_iter_empty(setup_path): assert list(store) == [] -def test_repr(setup_path): +def test_repr(setup_path, performance_warning, using_infer_string): with ensure_clean_store(setup_path) as store: repr(store) store.info() @@ -138,7 +143,9 @@ def test_repr(setup_path): df.loc[df.index[3:6], ["obj1"]] = np.nan df = df._consolidate() - with tm.assert_produces_warning(pd.errors.PerformanceWarning): + warning = None if using_infer_string else performance_warning + msg = "cannot\nmap directly to c-types .* dtype='object'" + with tm.assert_produces_warning(warning, match=msg): store["df"] = df # make a random group in hdf space @@ -304,12 +311,12 @@ def test_getattr(setup_path): # test attribute access result = store.a tm.assert_series_equal(result, s) - result = getattr(store, "a") + result = store.a tm.assert_series_equal(result, s) df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) store["df"] = df @@ -354,20 +361,6 @@ def test_store_dropna(tmp_path, setup_path): tm.assert_frame_equal(df_without_missing, reloaded) -def test_keyword_deprecation(tmp_path, setup_path): - # GH 54229 - path = tmp_path / setup_path - - msg = ( - "Starting with pandas version 3.0 all arguments of to_hdf except for the " - "argument 'path_or_buf' will be keyword-only." 
- ) - - df = DataFrame([{"A": 1, "B": 2, "C": 3}, {"A": 1, "B": 2, "C": 3}]) - - with tm.assert_produces_warning(FutureWarning, match=msg): - df.to_hdf(path, "key") - - def test_to_hdf_with_min_itemsize(tmp_path, setup_path): path = tmp_path / setup_path @@ -376,7 +369,7 @@ { "A": [0.0, 1.0, 2.0, 3.0, 4.0], "B": [0.0, 1.0, 0.0, 1.0, 0.0], - "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"], dtype=object), + "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"]), "D": date_range("20130101", periods=5), } ).set_index("C") @@ -393,15 +386,23 @@ @pytest.mark.parametrize("format", ["fixed", "table"]) -def test_to_hdf_errors(tmp_path, format, setup_path): +def test_to_hdf_errors(tmp_path, format, setup_path, using_infer_string): data = ["\ud800foo"] - ser = Series(data, index=Index(data)) + ser = Series(data, index=Index(data, dtype="object"), dtype="object") path = tmp_path / setup_path # GH 20835 ser.to_hdf(path, key="table", format=format, errors="surrogatepass") result = read_hdf(path, "table", errors="surrogatepass") - tm.assert_series_equal(result, ser) + + if using_infer_string: + # https://github.com/pandas-dev/pandas/pull/60993 + # Surrogates fall back to python storage. + dtype = pd.StringDtype(storage="python", na_value=np.nan) + else: + dtype = "object" + expected = Series(data, index=Index(data, dtype=dtype), dtype=dtype) + tm.assert_series_equal(result, expected) def test_create_table_index(setup_path): @@ -413,7 +414,7 @@ def col(t, column): # data columns df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df["string"] = "foo" @@ -448,7 +449,7 @@ def col(t, column): # data columns df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df["string"] = "foo" @@ -490,8 +491,8 @@ def test_table_mixed_dtypes(setup_path): # frame df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) df["obj1"] = "foo" df["obj2"] = "bar" @@ -546,8 +547,8 @@ def test_remove(setup_path): ) df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) store["a"] = ts store["b"] = df @@ -610,8 +611,8 @@ def test_same_name_scoping(setup_path): def test_store_index_name(setup_path): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) df.index.name = "foo" @@ -625,10 +626,14 @@ @pytest.mark.parametrize("table_format", ["table", "fixed"]) def test_store_index_name_numpy_str(tmp_path, table_format, setup_path, unit, tz): # GH #13492 - idx = DatetimeIndex( - [dt.date(2000, 1, 1), dt.date(2000, 1, 2)], - name="cols\u05d2", - ).tz_localize(tz) + idx = ( + DatetimeIndex( + [dt.date(2000, 1, 1), dt.date(2000, 1, 2)], + name="cols\u05d2", + )
.tz_localize(tz) + .as_unit(unit) + ) idx1 = ( DatetimeIndex( [dt.date(2010, 1, 1), dt.date(2010, 1, 2)], @@ -653,8 +658,8 @@ def test_store_index_name_numpy_str(tmp_path, table_format, setup_path, unit, tz def test_store_series_name(setup_path): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) series = df["A"] @@ -668,7 +673,7 @@ def test_overwrite_node(setup_path): with ensure_clean_store(setup_path) as store: store["a"] = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) ts = Series( @@ -682,7 +687,7 @@ def test_overwrite_node(setup_path): def test_coordinates(setup_path): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) @@ -717,7 +722,7 @@ def test_coordinates(setup_path): _maybe_remove(store, "df2") df1 = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df2 = df1.copy().rename(columns="{}_2".format) @@ -873,14 +878,14 @@ def test_start_stop_fixed(setup_path): # sparse; not implemented df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) df.iloc[3:5, 1:3] = np.nan df.iloc[8:10, -2] = np.nan -def test_select_filter_corner(setup_path): +def test_select_filter_corner(setup_path, request): df = DataFrame(np.random.default_rng(2).standard_normal((50, 100))) df.index = [f"{c:3d}" for c in df.index] df.columns = [f"{c:3d}" for c in df.columns] @@ -888,6 +893,13 @@ def test_select_filter_corner(setup_path): with ensure_clean_store(setup_path) as store: store.put("frame", df, format="table") + request.applymarker( + pytest.mark.xfail( + PY312, + reason="AST change in PY312", + raises=ValueError, + ) + ) crit = "columns=df.columns[:75]" result = store.select("frame", [crit]) tm.assert_frame_equal(result, df.loc[:, df.columns[:75]]) @@ -900,8 +912,8 @@ def test_select_filter_corner(setup_path): def test_path_pathlib(): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) result = tm.round_trip_pathlib( @@ -930,8 +942,8 @@ def test_contiguous_mixed_data_table(start, stop, setup_path): def test_path_pathlib_hdfstore(): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) def writer(path): @@ -949,8 +961,8 @@ def reader(path): def test_pickle_path_localpath(): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) result = tm.round_trip_pathlib( lambda p: 
df.to_hdf(p, key="df"), lambda p: read_hdf(p, "df") @@ -958,31 +970,12 @@ def test_pickle_path_localpath(): tm.assert_frame_equal(df, result) -def test_path_localpath_hdfstore(): - df = DataFrame( - 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), - ) - - def writer(path): - with HDFStore(path) as store: - df.to_hdf(store, key="df") - - def reader(path): - with HDFStore(path) as store: - return read_hdf(store, "df") - - result = tm.round_trip_localpath(writer, reader) - tm.assert_frame_equal(df, result) - - @pytest.mark.parametrize("propindexes", [True, False]) def test_copy(propindexes): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) with tm.ensure_clean() as path: @@ -1117,3 +1110,23 @@ def test_store_bool_index(tmp_path, setup_path): df.to_hdf(path, key="a") result = read_hdf(path, "a") tm.assert_frame_equal(expected, result) + + +@pytest.mark.parametrize("model", ["name", "longname", "verylongname"]) +def test_select_categorical_string_columns(tmp_path, model): + # Corresponding to BUG: 57608 + + path = tmp_path / "test.h5" + + models = CategoricalDtype(categories=["name", "longname", "verylongname"]) + df = DataFrame( + {"modelId": ["name", "longname", "longname"], "value": [1, 2, 3]} + ).astype({"modelId": models, "value": int}) + + with HDFStore(path, "w") as store: + store.append("df", df, data_columns=["modelId"]) + + with HDFStore(path, "r") as store: + result = store.select("df", "modelId == model") + expected = df[df["modelId"] == model] + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py index 8c61830ebe038..9192804e49bd1 100644 --- a/pandas/tests/io/pytables/test_timezones.py +++ b/pandas/tests/io/pytables/test_timezones.py @@ -104,7 +104,7 @@ def test_append_with_timezones(setup_path, gettz): msg = ( r"invalid info for \[values_block_1\] for \[tz\], " - r"existing_value \[(dateutil/.*)?US/Eastern\] " + r"existing_value \[(dateutil/.*)?(US/Eastern|America/New_York)\] " r"conflicts with new value \[(dateutil/.*)?EET\]" ) with pytest.raises(ValueError, match=msg): @@ -312,23 +312,6 @@ def test_store_timezone(setup_path): tm.assert_frame_equal(result, df) -def test_legacy_datetimetz_object(datapath): - # legacy from < 0.17.0 - # 8260 - expected = DataFrame( - { - "A": Timestamp("20130102", tz="US/Eastern").as_unit("ns"), - "B": Timestamp("20130603", tz="CET").as_unit("ns"), - }, - index=range(5), - ) - with ensure_clean_store( - datapath("io", "data", "legacy_hdf", "datetimetz_object.h5"), mode="r" - ) as store: - result = store["df"] - tm.assert_frame_equal(result, expected) - - def test_dst_transitions(setup_path): # make sure we are not failing on transitions with ensure_clean_store(setup_path) as store: @@ -362,17 +345,3 @@ def test_read_with_where_tz_aware_index(tmp_path, setup_path): store.append(key, expected, format="table", append=True) result = pd.read_hdf(path, key, where="DATE > 20151130") tm.assert_frame_equal(result, expected) - - -def test_py2_created_with_datetimez(datapath): - # The test HDF5 file was created in Python 2, but could not be read in - # Python 3. 
- # - # GH26443 - index = DatetimeIndex(["2019-01-01T18:00"], dtype="M8[ns, America/New_York]") - expected = DataFrame({"data": 123}, index=index) - with ensure_clean_store( - datapath("io", "data", "legacy_hdf", "gh26443.h5"), mode="r" - ) as store: - result = store["key"] - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index b71896c77ffb5..a17cd27f8284e 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -7,9 +7,11 @@ import numpy as np import pytest -from pandas.compat import IS64 +from pandas.compat._constants import ( + IS64, + WASM, +) from pandas.errors import EmptyDataError -import pandas.util._test_decorators as td import pandas as pd import pandas._testing as tm @@ -28,9 +30,9 @@ def data_test_ix(request, dirpath): fname = os.path.join(dirpath, f"test_sas7bdat_{i}.csv") df = pd.read_csv(fname) epoch = datetime(1960, 1, 1) - t1 = pd.to_timedelta(df["Column4"], unit="d") + t1 = pd.to_timedelta(df["Column4"], unit="D") df["Column4"] = (epoch + t1).astype("M8[s]") - t2 = pd.to_timedelta(df["Column12"], unit="d") + t2 = pd.to_timedelta(df["Column12"], unit="D") df["Column12"] = (epoch + t2).astype("M8[s]") for k in range(df.shape[1]): col = df.iloc[:, k] @@ -82,17 +84,6 @@ def test_path_pathlib(self, dirpath, data_test_ix): df = pd.read_sas(fname, encoding="utf-8") tm.assert_frame_equal(df, expected) - @td.skip_if_no("py.path") - @pytest.mark.slow - def test_path_localpath(self, dirpath, data_test_ix): - from py.path import local as LocalPath - - expected, test_ix = data_test_ix - for k in test_ix: - fname = LocalPath(os.path.join(dirpath, f"test{k}.sas7bdat")) - df = pd.read_sas(fname, encoding="utf-8") - tm.assert_frame_equal(df, expected) - @pytest.mark.slow @pytest.mark.parametrize("chunksize", (3, 5, 10, 11)) @pytest.mark.parametrize("k", range(1, 17)) @@ -180,6 +171,7 @@ def test_airline(datapath): tm.assert_frame_equal(df, df0) +@pytest.mark.skipif(WASM, reason="Pyodide/WASM has 32-bitness") def test_date_time(datapath): # Support of different SAS date/datetime formats (PR #15871) fname = datapath("io", "sas", "data", "datetime.sas7bdat") @@ -248,11 +240,13 @@ def test_zero_variables(datapath): pd.read_sas(fname) -def test_zero_rows(datapath): +@pytest.mark.parametrize("encoding", [None, "utf8"]) +def test_zero_rows(datapath, encoding): # GH 18198 fname = datapath("io", "sas", "data", "zero_rows.sas7bdat") - result = pd.read_sas(fname) - expected = pd.DataFrame([{"char_field": "a", "num_field": 1.0}]).iloc[:0] + result = pd.read_sas(fname, encoding=encoding) + str_value = b"a" if encoding is None else "a" + expected = pd.DataFrame([{"char_field": str_value, "num_field": 1.0}]).iloc[:0] tm.assert_frame_equal(result, expected) @@ -265,6 +259,7 @@ def test_corrupt_read(datapath): pd.read_sas(fname) +@pytest.mark.xfail(WASM, reason="failing with currently set tolerances on WASM") def test_max_sas_date(datapath): # GH 20927 # NB. 
max datetime in SAS dataset is 31DEC9999:23:59:59.999 @@ -304,6 +299,7 @@ def test_max_sas_date(datapath): tm.assert_frame_equal(df, expected) +@pytest.mark.xfail(WASM, reason="failing with currently set tolerances on WASM") def test_max_sas_date_iterator(datapath): # GH 20927 # when called as an iterator, only those chunks with a date > pd.Timestamp.max @@ -349,6 +345,7 @@ def test_max_sas_date_iterator(datapath): tm.assert_frame_equal(results[1], expected[1]) +@pytest.mark.skipif(WASM, reason="Pyodide/WASM has 32-bitness") def test_null_date(datapath): fname = datapath("io", "sas", "data", "dates_null.sas7bdat") df = pd.read_sas(fname, encoding="utf-8") @@ -408,7 +405,7 @@ def test_0x40_control_byte(datapath): fname = datapath("io", "sas", "data", "0x40controlbyte.sas7bdat") df = pd.read_sas(fname, encoding="ascii") fname = datapath("io", "sas", "data", "0x40controlbyte.csv") - df0 = pd.read_csv(fname, dtype="object") + df0 = pd.read_csv(fname, dtype="str") tm.assert_frame_equal(df, df0) diff --git a/pandas/tests/io/sas/test_xport.py b/pandas/tests/io/sas/test_xport.py index 766c9c37d55b9..30ca70d7e0009 100644 --- a/pandas/tests/io/sas/test_xport.py +++ b/pandas/tests/io/sas/test_xport.py @@ -19,31 +19,12 @@ def numeric_as_float(data): class TestXport: - @pytest.fixture - def file01(self, datapath): - return datapath("io", "sas", "data", "DEMO_G.xpt") - - @pytest.fixture - def file02(self, datapath): - return datapath("io", "sas", "data", "SSHSV1_A.xpt") - - @pytest.fixture - def file03(self, datapath): - return datapath("io", "sas", "data", "DRXFCD_G.xpt") - - @pytest.fixture - def file04(self, datapath): - return datapath("io", "sas", "data", "paxraw_d_short.xpt") - - @pytest.fixture - def file05(self, datapath): - return datapath("io", "sas", "data", "DEMO_PUF.cpt") - @pytest.mark.slow - def test1_basic(self, file01): + def test1_basic(self, datapath): # Tests with DEMO_G.xpt (all numeric file) # Compare to this + file01 = datapath("io", "sas", "data", "DEMO_G.xpt") data_csv = pd.read_csv(file01.replace(".xpt", ".csv")) numeric_as_float(data_csv) @@ -78,10 +59,11 @@ def test1_basic(self, file01): data = read_sas(file01) tm.assert_frame_equal(data, data_csv) - def test1_index(self, file01): + def test1_index(self, datapath): # Tests with DEMO_G.xpt using index (all numeric file) # Compare to this + file01 = datapath("io", "sas", "data", "DEMO_G.xpt") data_csv = pd.read_csv(file01.replace(".xpt", ".csv")) data_csv = data_csv.set_index("SEQN") numeric_as_float(data_csv) @@ -100,9 +82,10 @@ def test1_index(self, file01): data = reader.get_chunk() tm.assert_frame_equal(data, data_csv.iloc[0:10, :], check_index_type=False) - def test1_incremental(self, file01): + def test1_incremental(self, datapath): # Test with DEMO_G.xpt, reading full file incrementally + file01 = datapath("io", "sas", "data", "DEMO_G.xpt") data_csv = pd.read_csv(file01.replace(".xpt", ".csv")) data_csv = data_csv.set_index("SEQN") numeric_as_float(data_csv) @@ -113,9 +96,10 @@ def test1_incremental(self, file01): tm.assert_frame_equal(data, data_csv, check_index_type=False) - def test2(self, file02): + def test2(self, datapath): # Test with SSHSV1_A.xpt + file02 = datapath("io", "sas", "data", "SSHSV1_A.xpt") # Compare to this data_csv = pd.read_csv(file02.replace(".xpt", ".csv")) numeric_as_float(data_csv) @@ -123,10 +107,11 @@ def test2(self, file02): data = read_sas(file02) tm.assert_frame_equal(data, data_csv) - def test2_binary(self, file02): + def test2_binary(self, datapath): # Test with SSHSV1_A.xpt, read as a 
binary file # Compare to this + file02 = datapath("io", "sas", "data", "SSHSV1_A.xpt") data_csv = pd.read_csv(file02.replace(".xpt", ".csv")) numeric_as_float(data_csv) @@ -137,31 +122,32 @@ def test2_binary(self, file02): tm.assert_frame_equal(data, data_csv) - def test_multiple_types(self, file03): + def test_multiple_types(self, datapath): # Test with DRXFCD_G.xpt (contains text and numeric variables) # Compare to this + file03 = datapath("io", "sas", "data", "DRXFCD_G.xpt") data_csv = pd.read_csv(file03.replace(".xpt", ".csv")) data = read_sas(file03, encoding="utf-8") tm.assert_frame_equal(data, data_csv) - def test_truncated_float_support(self, file04): + def test_truncated_float_support(self, datapath): # Test with paxraw_d_short.xpt, a shortened version of: # http://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/PAXRAW_D.ZIP # This file has truncated floats (5 bytes in this case). # GH 11713 - + file04 = datapath("io", "sas", "data", "paxraw_d_short.xpt") data_csv = pd.read_csv(file04.replace(".xpt", ".csv")) data = read_sas(file04, format="xport") tm.assert_frame_equal(data.astype("int64"), data_csv) - def test_cport_header_found_raises(self, file05): + def test_cport_header_found_raises(self, datapath): # Test with DEMO_PUF.cpt, the beginning of puf2019_1_fall.xpt # from https://www.cms.gov/files/zip/puf2019.zip # (despite the extension, it's a cpt file) msg = "Header record indicates a CPORT file, which is not readable." with pytest.raises(ValueError, match=msg): - read_sas(file05, format="xport") + read_sas(datapath("io", "sas", "data", "DEMO_PUF.cpt"), format="xport") diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 3c0208fcc74ec..b5e97314caf03 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -17,10 +17,6 @@ read_clipboard, ) import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - StringArray, -) from pandas.io.clipboard import ( CheckedCall, @@ -222,7 +218,7 @@ def test_excel_sep_warning(self, df): # Separator is ignored when excel=False and should produce a warning def test_copy_delim_warning(self, df): - with tm.assert_produces_warning(): + with tm.assert_produces_warning(UserWarning, match="ignores the sep argument"): df.to_clipboard(excel=False, sep="\t") # Tests that the default behavior of to_clipboard is tab @@ -349,26 +345,18 @@ def test_raw_roundtrip(self, data): @pytest.mark.parametrize("engine", ["c", "python"]) def test_read_clipboard_dtype_backend( - self, clipboard, string_storage, dtype_backend, engine + self, clipboard, string_storage, dtype_backend, engine, using_infer_string ): # GH#50502 - if string_storage == "pyarrow" or dtype_backend == "pyarrow": - pa = pytest.importorskip("pyarrow") - - if string_storage == "python": - string_array = StringArray(np.array(["x", "y"], dtype=np.object_)) - string_array_na = StringArray(np.array(["x", NA], dtype=np.object_)) - - elif dtype_backend == "pyarrow" and engine != "c": + if dtype_backend == "pyarrow": pa = pytest.importorskip("pyarrow") - from pandas.arrays import ArrowExtensionArray - - string_array = ArrowExtensionArray(pa.array(["x", "y"])) - string_array_na = ArrowExtensionArray(pa.array(["x", None])) - + if engine == "c" and string_storage == "pyarrow": + # TODO avoid this exception? 
+ string_dtype = pd.ArrowDtype(pa.large_string()) + else: + string_dtype = pd.ArrowDtype(pa.string()) else: - string_array = ArrowStringArray(pa.array(["x", "y"])) - string_array_na = ArrowStringArray(pa.array(["x", None])) + string_dtype = pd.StringDtype(string_storage) text = """a,b,c,d,e,f,g,h,i x,1,4.0,x,2,4.0,,True,False @@ -380,10 +368,10 @@ def test_read_clipboard_dtype_backend( expected = DataFrame( { - "a": string_array, + "a": Series(["x", "y"], dtype=string_dtype), "b": Series([1, 2], dtype="Int64"), "c": Series([4.0, 5.0], dtype="Float64"), - "d": string_array_na, + "d": Series(["x", None], dtype=string_dtype), "e": Series([2, NA], dtype="Int64"), "f": Series([4.0, NA], dtype="Float64"), "g": Series([NA, NA], dtype="Int64"), @@ -402,6 +390,11 @@ def test_read_clipboard_dtype_backend( ) expected["g"] = ArrowExtensionArray(pa.array([None, None])) + if using_infer_string: + expected.columns = expected.columns.astype( + pd.StringDtype(string_storage, na_value=np.nan) + ) + tm.assert_frame_equal(result, expected) def test_invalid_dtype_backend(self): @@ -411,13 +404,3 @@ def test_invalid_dtype_backend(self): ) with pytest.raises(ValueError, match=msg): read_clipboard(dtype_backend="numpy") - - def test_to_clipboard_pos_args_deprecation(self): - # GH-54229 - df = DataFrame({"a": [1, 2, 3]}) - msg = ( - r"Starting with pandas version 3.0 all arguments of to_clipboard " - r"will be keyword-only." - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - df.to_clipboard(True, None) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 074033868635a..4a5e41397b59d 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -1,6 +1,7 @@ """ Tests for the pandas.io.common functionalities """ + import codecs import errno from functools import partial @@ -18,7 +19,11 @@ import numpy as np import pytest -from pandas.compat import is_platform_windows +from pandas.compat import ( + WASM, + is_platform_windows, +) +from pandas.compat.pyarrow import pa_version_under19p0 import pandas.util._test_decorators as td import pandas as pd @@ -41,16 +46,6 @@ def __fspath__(self): return self.path -# Functions that consume a string path and return a string or path-like object -path_types = [str, CustomFSPath, Path] - -try: - from py.path import local as LocalPath - - path_types.append(LocalPath) -except ImportError: - pass - HERE = os.path.abspath(os.path.dirname(__file__)) @@ -86,13 +81,6 @@ def test_stringify_path_pathlib(self): redundant_path = icom.stringify_path(Path("foo//bar")) assert redundant_path == os.path.join("foo", "bar") - @td.skip_if_no("py.path") - def test_stringify_path_localpath(self): - path = os.path.join("foo", "bar") - abs_path = os.path.abspath(path) - lpath = LocalPath(path) - assert icom.stringify_path(lpath) == abs_path - def test_stringify_path_fspath(self): p = CustomFSPath("foo/bar.csv") result = icom.stringify_path(p) @@ -105,7 +93,7 @@ def test_stringify_file_and_path_like(self): with fsspec.open(f"file://{path}", mode="wb") as fsspec_obj: assert fsspec_obj == icom.stringify_path(fsspec_obj) - @pytest.mark.parametrize("path_type", path_types) + @pytest.mark.parametrize("path_type", [str, CustomFSPath, Path]) def test_infer_compression_from_path(self, compression_format, path_type): extension, expected = compression_format path = path_type("foo/bar.csv" + extension) @@ -114,7 +102,6 @@ def test_infer_compression_from_path(self, compression_format, path_type): @pytest.mark.parametrize("path_type", [str, 
CustomFSPath, Path]) def test_get_handle_with_path(self, path_type): - # ignore LocalPath: it creates strange paths: /absolute/~/sometest with tempfile.TemporaryDirectory(dir=Path.home()) as tmp: filename = path_type("~/" + Path(tmp).name + "/sometest") with icom.get_handle(filename, "w") as handles: @@ -166,6 +153,8 @@ def test_get_handle_pyarrow_compat(self): s = StringIO(data) with icom.get_handle(s, "rb", is_text=False) as handles: df = pa_csv.read_csv(handles.handle).to_pandas() + if pa_version_under19p0: + expected = expected.astype("object") tm.assert_frame_equal(df, expected) assert not s.closed @@ -181,6 +170,7 @@ def test_iterator(self): tm.assert_frame_equal(first, expected.iloc[[0]]) tm.assert_frame_equal(pd.concat(it), expected.iloc[1:]) + @pytest.mark.skipif(WASM, reason="limited file system access on WASM") @pytest.mark.parametrize( "reader, module, error_class, fn_ext", [ @@ -246,6 +236,7 @@ def test_write_missing_parent_directory(self, method, module, error_class, fn_ex ): method(dummy_frame, path) + @pytest.mark.skipif(WASM, reason="limited file system access on WASM") @pytest.mark.parametrize( "reader, module, error_class, fn_ext", [ @@ -308,7 +299,7 @@ def test_read_expands_user_home_dir( ( pd.read_hdf, "tables", - ("io", "data", "legacy_hdf", "datetimetz_object.h5"), + ("io", "data", "legacy_hdf", "pytables_native2.h5"), ), (pd.read_stata, "os", ("io", "data", "stata", "stata10_115.dta")), (pd.read_sas, "os", ("io", "sas", "data", "test1.sas7bdat")), @@ -400,6 +391,7 @@ def mmap_file(datapath): class TestMMapWrapper: + @pytest.mark.skipif(WASM, reason="limited file system access on WASM") def test_constructor_bad_file(self, mmap_file): non_file = StringIO("I am not a file") non_file.fileno = lambda: -1 @@ -422,6 +414,7 @@ def test_constructor_bad_file(self, mmap_file): with pytest.raises(ValueError, match=msg): icom._maybe_memory_map(target, True) + @pytest.mark.skipif(WASM, reason="limited file system access on WASM") def test_next(self, mmap_file): with open(mmap_file, encoding="utf-8") as target: lines = target.readlines() @@ -443,8 +436,8 @@ def test_unknown_engine(self): with tm.ensure_clean() as path: df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) df.to_csv(path) with pytest.raises(ValueError, match="Unknown engine"): @@ -459,8 +452,8 @@ def test_binary_mode(self): with tm.ensure_clean() as path: df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) df.to_csv(path, mode="w+b") tm.assert_frame_equal(df, pd.read_csv(path, index_col=0)) @@ -477,15 +470,18 @@ def test_warning_missing_utf_bom(self, encoding, compression_): """ df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) with tm.ensure_clean() as path: - with tm.assert_produces_warning(UnicodeWarning): + with tm.assert_produces_warning(UnicodeWarning, match="byte order mark"): df.to_csv(path, compression=compression_, encoding=encoding) # reading should fail (otherwise we wouldn't need the warning) - msg = 
r"UTF-\d+ stream does not start with BOM" + msg = ( + r"UTF-\d+ stream does not start with BOM|" + r"'utf-\d+' codec can't decode byte" + ) with pytest.raises(UnicodeError, match=msg): pd.read_csv(path, compression=compression_, encoding=encoding) @@ -505,14 +501,26 @@ def test_is_fsspec_url(): assert icom.is_fsspec_url("RFC-3986+compliant.spec://something") +def test_is_fsspec_url_chained(): + # GH#48978 Support chained fsspec URLs + # See https://filesystem-spec.readthedocs.io/en/latest/features.html#url-chaining. + assert icom.is_fsspec_url("filecache::s3://pandas/test.csv") + assert icom.is_fsspec_url("zip://test.csv::filecache::gcs://bucket/file.zip") + assert icom.is_fsspec_url("filecache::zip://test.csv::gcs://bucket/file.zip") + assert icom.is_fsspec_url("filecache::dask::s3://pandas/test.csv") + assert not icom.is_fsspec_url("filecache:s3://pandas/test.csv") + assert not icom.is_fsspec_url("filecache:::s3://pandas/test.csv") + assert not icom.is_fsspec_url("filecache::://pandas/test.csv") + + @pytest.mark.parametrize("encoding", [None, "utf-8"]) @pytest.mark.parametrize("format", ["csv", "json"]) def test_codecs_encoding(encoding, format): # GH39247 expected = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) with tm.ensure_clean() as path: with codecs.open(path, mode="w", encoding=encoding) as handle: @@ -529,8 +537,8 @@ def test_codecs_get_writer_reader(): # GH39247 expected = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) with tm.ensure_clean() as path: with open(path, "wb") as handle: @@ -555,15 +563,15 @@ def test_explicit_encoding(io_class, mode, msg): # wrong mode is requested expected = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) with io_class() as buffer: with pytest.raises(TypeError, match=msg): expected.to_csv(buffer, mode=f"w{mode}") -@pytest.mark.parametrize("encoding_errors", [None, "strict", "replace"]) +@pytest.mark.parametrize("encoding_errors", ["strict", "replace"]) @pytest.mark.parametrize("format", ["csv", "json"]) def test_encoding_errors(encoding_errors, format): # GH39450 @@ -598,6 +606,17 @@ def test_encoding_errors(encoding_errors, format): tm.assert_frame_equal(df, expected) +@pytest.mark.parametrize("encoding_errors", [0, None]) +def test_encoding_errors_badtype(encoding_errors): + # GH 59075 + content = StringIO("A,B\n1,2\n3,4\n") + reader = partial(pd.read_csv, encoding_errors=encoding_errors) + expected_error = "encoding_errors must be a string, got " + expected_error += f"{type(encoding_errors).__name__}" + with pytest.raises(ValueError, match=expected_error): + reader(content) + + def test_bad_encdoing_errors(): # GH 39777 with tm.ensure_clean() as path: @@ -605,6 +624,7 @@ def test_bad_encdoing_errors(): icom.get_handle(path, "w", errors="bad") +@pytest.mark.skipif(WASM, reason="limited file system access on WASM") def test_errno_attribute(): # GH 13872 with pytest.raises(FileNotFoundError, match="\\[Errno 2\\]") as err: @@ -630,6 +650,19 @@ def 
close(self): handles.created_handles.append(TestError()) +@td.skip_if_no("fsspec") +@pytest.mark.parametrize("compression", [None, "infer"]) +def test_read_csv_chained_url_no_error(compression): + # GH 60100 + tar_file_path = "pandas/tests/io/data/tar/test-csv.tar" + chained_file_url = f"tar://test.csv::file://{tar_file_path}" + + result = pd.read_csv(chained_file_url, compression=compression, sep=";") + expected = pd.DataFrame({"1": {0: 3}, "2": {0: 4}}) + + tm.assert_frame_equal(expected, result) + + @pytest.mark.parametrize( "reader", [ @@ -648,3 +681,17 @@ def test_pickle_reader(reader): # GH 22265 with BytesIO() as buffer: pickle.dump(reader, buffer) + + +@td.skip_if_no("pyarrow") +def test_pyarrow_read_csv_datetime_dtype(): + # GH 59904 + data = '"date"\n"20/12/2025"\n""\n"31/12/2020"' + result = pd.read_csv( + StringIO(data), parse_dates=["date"], dayfirst=True, dtype_backend="pyarrow" + ) + + expect_data = pd.to_datetime(["20/12/2025", pd.NaT, "31/12/2020"], dayfirst=True) + expect = pd.DataFrame({"date": expect_data}) + + tm.assert_frame_equal(expect, result) diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 3a58dda9e8dc4..fd1e9b4fdf211 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -133,7 +133,7 @@ def test_compression_warning(compression_only): ) with tm.ensure_clean() as path: with icom.get_handle(path, "w", compression=compression_only) as handles: - with tm.assert_produces_warning(RuntimeWarning): + with tm.assert_produces_warning(RuntimeWarning, match="has no effect"): df.to_csv(handles.handle, compression=compression_only) @@ -145,8 +145,8 @@ def test_compression_binary(compression_only): """ df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) # with a file @@ -177,8 +177,8 @@ def test_gzip_reproducibility_file_name(): """ df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) compression_options = {"method": "gzip", "mtime": 1} @@ -200,8 +200,8 @@ def test_gzip_reproducibility_file_object(): """ df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) compression_options = {"method": "gzip", "mtime": 1} @@ -231,7 +231,7 @@ def test_with_missing_lzma(): @pytest.mark.single_cpu def test_with_missing_lzma_runtime(): - """Tests if RuntimeError is hit when calling lzma without + """Tests if ModuleNotFoundError is hit when calling lzma without having the module available. 
""" code = textwrap.dedent( @@ -241,7 +241,7 @@ def test_with_missing_lzma_runtime(): sys.modules['lzma'] = None import pandas as pd df = pd.DataFrame() - with pytest.raises(RuntimeError, match='lzma module'): + with pytest.raises(ModuleNotFoundError, match='import of lzma'): df.to_csv('foo.csv', compression='xz') """ ) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 22a7d3b83a459..e778193c147c1 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -1,13 +1,18 @@ -""" test feather-format compat """ +"""test feather-format compat""" + +from datetime import datetime +import zoneinfo + import numpy as np import pytest +from pandas.compat.pyarrow import ( + pa_version_under18p0, + pa_version_under19p0, +) + import pandas as pd import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - StringArray, -) from pandas.io.feather_format import read_feather, to_feather # isort:skip @@ -15,6 +20,7 @@ "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) + pa = pytest.importorskip("pyarrow") @@ -36,7 +42,9 @@ def check_external_error_on_write(self, df): with tm.ensure_clean() as path: to_feather(df, path) - def check_round_trip(self, df, expected=None, write_kwargs={}, **read_kwargs): + def check_round_trip(self, df, expected=None, write_kwargs=None, **read_kwargs): + if write_kwargs is None: + write_kwargs = {} if expected is None: expected = df.copy() @@ -59,6 +67,7 @@ def test_error(self): self.check_error_on_write(obj, ValueError, msg) def test_basic(self): + tz = zoneinfo.ZoneInfo("US/Eastern") df = pd.DataFrame( { "string": list("abc"), @@ -73,7 +82,7 @@ def test_basic(self): list(pd.date_range("20130101", periods=3)), freq=None ), "dttz": pd.DatetimeIndex( - list(pd.date_range("20130101", periods=3, tz="US/Eastern")), + list(pd.date_range("20130101", periods=3, tz=tz)), freq=None, ), "dt_with_null": [ @@ -90,7 +99,7 @@ def test_basic(self): df["timedeltas"] = pd.timedelta_range("1 day", periods=3) df["intervals"] = pd.interval_range(0, 3, 3) - assert df.dttz.dtype.tz.zone == "US/Eastern" + assert df.dttz.dtype.tz.key == "US/Eastern" expected = df.copy() expected.loc[1, "bool_with_null"] = None @@ -134,26 +143,17 @@ def test_rw_use_threads(self): def test_path_pathlib(self): df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ).reset_index() result = tm.round_trip_pathlib(df.to_feather, read_feather) tm.assert_frame_equal(df, result) - def test_path_localpath(self): - df = pd.DataFrame( - 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), - ).reset_index() - result = tm.round_trip_localpath(df.to_feather, read_feather) - tm.assert_frame_equal(df, result) - def test_passthrough_keywords(self): df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ).reset_index() self.check_round_trip(df, write_kwargs={"version": 1}) @@ -167,7 +167,9 @@ def test_http_path(self, feather_file, httpserver): res = read_feather(httpserver.url) tm.assert_frame_equal(expected, res) - def test_read_feather_dtype_backend(self, 
string_storage, dtype_backend): +    def test_read_feather_dtype_backend( +        self, string_storage, dtype_backend, using_infer_string +    ): # GH#50765 df = pd.DataFrame( { @@ -182,25 +184,20 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend): } ) -        if string_storage == "python": -            string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) -            string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_)) - -        elif dtype_backend == "pyarrow": -            from pandas.arrays import ArrowExtensionArray - -            string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) -            string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) - -        else: -            string_array = ArrowStringArray(pa.array(["a", "b", "c"])) -            string_array_na = ArrowStringArray(pa.array(["a", "b", None])) - with tm.ensure_clean() as path: to_feather(df, path) with pd.option_context("mode.string_storage", string_storage): result = read_feather(path, dtype_backend=dtype_backend) +        if dtype_backend == "pyarrow": +            pa = pytest.importorskip("pyarrow") +            if using_infer_string: +                string_dtype = pd.ArrowDtype(pa.large_string()) +            else: +                string_dtype = pd.ArrowDtype(pa.string()) +        else: +            string_dtype = pd.StringDtype(string_storage) + expected = pd.DataFrame( { "a": pd.Series([1, np.nan, 3], dtype="Int64"), @@ -209,8 +206,8 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend): "d": pd.Series([1.5, 2.0, 2.5], dtype="Float64"), "e": pd.Series([True, False, pd.NA], dtype="boolean"), "f": pd.Series([True, False, True], dtype="boolean"), -            "g": string_array, -            "h": string_array_na, +            "g": pd.Series(["a", "b", "c"], dtype=string_dtype), +            "h": pd.Series(["a", "b", None], dtype=string_dtype), } ) @@ -224,6 +221,10 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend): } ) +        if using_infer_string: +            expected.columns = expected.columns.astype( +                pd.StringDtype(string_storage, na_value=np.nan) +            ) tm.assert_frame_equal(result, expected) def test_int_columns_and_index(self): @@ -241,12 +242,52 @@ def test_invalid_dtype_backend(self): with pytest.raises(ValueError, match=msg): read_feather(path, dtype_backend="numpy") -    def test_string_inference(self, tmp_path): +    def test_string_inference(self, tmp_path, using_infer_string): # GH#54431 path = tmp_path / "test_string_inference.p" df = pd.DataFrame(data={"a": ["x", "y"]}) df.to_feather(path) with pd.option_context("future.infer_string", True): result = read_feather(path) -        expected = pd.DataFrame(data={"a": ["x", "y"]}, dtype="string[pyarrow_numpy]") +        dtype = pd.StringDtype(na_value=np.nan) +        expected = pd.DataFrame( +            data={"a": ["x", "y"]}, +            dtype=dtype, +            columns=pd.Index( +                ["a"], +                dtype=object +                if pa_version_under19p0 and not using_infer_string +                else dtype, +            ), +        ) +        tm.assert_frame_equal(result, expected) + +    @pytest.mark.skipif(pa_version_under18p0, reason="not supported before 18.0") +    def test_string_inference_string_view_type(self, tmp_path): +        # GH#54798 +        import pyarrow as pa +        from pyarrow import feather + +        path = tmp_path / "string_view.parquet" +        table = pa.table({"a": pa.array([None, "b", "c"], pa.string_view())}) +        feather.write_feather(table, path) + +        with pd.option_context("future.infer_string", True): +            result = read_feather(path) + +        expected = pd.DataFrame( +            data={"a": [None, "b", "c"]}, dtype=pd.StringDtype(na_value=np.nan) +        ) tm.assert_frame_equal(result, expected) + +    def 
test_out_of_bounds_datetime_to_feather(self): + # GH#47832 + df = pd.DataFrame( + { + "date": [ + datetime.fromisoformat("1654-01-01"), + datetime.fromisoformat("1920-01-01"), + ], + } + ) + self.check_round_trip(df) diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index a1dec8a2d05b4..2e3e74a9d31ff 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -3,6 +3,10 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + +from pandas.compat import HAS_PYARROW + from pandas import ( DataFrame, date_range, @@ -72,7 +76,9 @@ def test_read_csv(cleared_fs, df1): w.write(text) df2 = read_csv("memory://test/test.csv", parse_dates=["dt"]) - tm.assert_frame_equal(df1, df2) + expected = df1.copy() + expected["dt"] = expected["dt"].astype("M8[s]") + tm.assert_frame_equal(df2, expected) def test_reasonable_error(monkeypatch, cleared_fs): @@ -95,7 +101,9 @@ def test_to_csv(cleared_fs, df1): df2 = read_csv("memory://test/test.csv", parse_dates=["dt"], index_col=0) - tm.assert_frame_equal(df1, df2) + expected = df1.copy() + expected["dt"] = expected["dt"].astype("M8[s]") + tm.assert_frame_equal(df2, expected) def test_to_excel(cleared_fs, df1): @@ -106,7 +114,9 @@ def test_to_excel(cleared_fs, df1): df2 = read_excel(path, parse_dates=["dt"], index_col=0) - tm.assert_frame_equal(df1, df2) + expected = df1.copy() + expected["dt"] = expected["dt"].astype("M8[s]") + tm.assert_frame_equal(df2, expected) @pytest.mark.parametrize("binary_mode", [False, True]) @@ -128,7 +138,9 @@ def test_to_csv_fsspec_object(cleared_fs, binary_mode, df1): ) assert not fsspec_object.closed - tm.assert_frame_equal(df1, df2) + expected = df1.copy() + expected["dt"] = expected["dt"].astype("M8[s]") + tm.assert_frame_equal(df2, expected) def test_csv_options(fsspectest): @@ -166,6 +178,9 @@ def test_excel_options(fsspectest): assert fsspectest.test[0] == "read" +@pytest.mark.xfail( + using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string) fastparquet" +) def test_to_parquet_new_file(cleared_fs, df1): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") @@ -194,7 +209,6 @@ def test_arrowparquet_options(fsspectest): assert fsspectest.test[0] == "parquet_read" -@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) fastparquet def test_fastparquet_options(fsspectest): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") @@ -252,8 +266,8 @@ def test_s3_protocols(s3_public_bucket_with_data, tips_file, protocol, s3so): ) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet") @pytest.mark.single_cpu -@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) fastparquet def test_s3_parquet(s3_public_bucket, s3so, df1): pytest.importorskip("fastparquet") pytest.importorskip("s3fs") diff --git a/pandas/tests/io/test_gbq.py b/pandas/tests/io/test_gbq.py deleted file mode 100644 index b2b212ceb2c41..0000000000000 --- a/pandas/tests/io/test_gbq.py +++ /dev/null @@ -1,14 +0,0 @@ -import pandas as pd -import pandas._testing as tm - - -def test_read_gbq_deprecated(): - with tm.assert_produces_warning(FutureWarning): - with tm.external_error_raised(Exception): - pd.read_gbq("fake") - - -def test_to_gbq_deprecated(): - with tm.assert_produces_warning(FutureWarning): - with tm.external_error_raised(Exception): - pd.DataFrame(range(1)).to_gbq("fake") diff --git a/pandas/tests/io/test_gcs.py 
b/pandas/tests/io/test_gcs.py index 0ce6a8bf82cd8..f68ef5fa2e0e5 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas.compat.pyarrow import pa_version_under17p0 + from pandas import ( DataFrame, Index, @@ -52,7 +54,7 @@ def ls(self, path, **kwargs): # Patches pyarrow; other processes should not pick up change @pytest.mark.single_cpu @pytest.mark.parametrize("format", ["csv", "json", "parquet", "excel", "markdown"]) -def test_to_read_gcs(gcs_buffer, format, monkeypatch, capsys): +def test_to_read_gcs(gcs_buffer, format, monkeypatch, capsys, request): """ Test that many to/read functions support GCS. @@ -78,7 +80,12 @@ def test_to_read_gcs(gcs_buffer, format, monkeypatch, capsys): df1.to_excel(path) df2 = read_excel(path, parse_dates=["dt"], index_col=0) elif format == "json": - df1.to_json(path) + msg = ( + "The default 'epoch' date format is deprecated and will be removed " + "in a future version, please use 'iso' date format instead." + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + df1.to_json(path) df2 = read_json(path, convert_dates=["dt"]) elif format == "parquet": pytest.importorskip("pyarrow") @@ -91,6 +98,13 @@ def from_uri(path): to_local = pathlib.Path(path.replace("gs://", "")).absolute().as_uri() return pa_fs.LocalFileSystem(to_local) + request.applymarker( + pytest.mark.xfail( + not pa_version_under17p0, + raises=TypeError, + reason="pyarrow 17 broke the mocked filesystem", + ) + ) with monkeypatch.context() as m: m.setattr(pa_fs, "FileSystem", MockFileSystem) df1.to_parquet(path) @@ -102,7 +116,11 @@ def from_uri(path): df1.to_markdown(path) df2 = df1 - tm.assert_frame_equal(df1, df2) + expected = df1[:] + if format in ["csv", "excel"]: + expected["dt"] = expected["dt"].dt.as_unit("s") + + tm.assert_frame_equal(df2, expected) def assert_equal_zip_safe(result: bytes, expected: bytes, compression: str): @@ -115,15 +133,17 @@ def assert_equal_zip_safe(result: bytes, expected: bytes, compression: str): """ if compression == "zip": # Only compare the CRC checksum of the file contents - with zipfile.ZipFile(BytesIO(result)) as exp, zipfile.ZipFile( - BytesIO(expected) - ) as res: + with ( + zipfile.ZipFile(BytesIO(result)) as exp, + zipfile.ZipFile(BytesIO(expected)) as res, + ): for res_info, exp_info in zip(res.infolist(), exp.infolist()): assert res_info.CRC == exp_info.CRC elif compression == "tar": - with tarfile.open(fileobj=BytesIO(result)) as tar_exp, tarfile.open( - fileobj=BytesIO(expected) - ) as tar_res: + with ( + tarfile.open(fileobj=BytesIO(result)) as tar_exp, + tarfile.open(fileobj=BytesIO(expected)) as tar_res, + ): for tar_res_info, tar_exp_info in zip( tar_res.getmembers(), tar_exp.getmembers() ): @@ -148,8 +168,8 @@ def test_to_csv_compression_encoding_gcs( """ df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) # reference of compressed and encoded file diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 607357e709b6e..bef28c4f027da 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -29,10 +29,6 @@ to_datetime, ) import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - StringArray, -) from pandas.io.common import file_path_to_url @@ -112,13 +108,9 @@ def flavor_read_html(request): class 
TestReadHtml: def test_literal_html_deprecation(self, flavor_read_html): # GH 53785 - msg = ( - "Passing literal html to 'read_html' is deprecated and " - "will be removed in a future version. To read from a " - "literal string, wrap it in a 'StringIO' object." - ) + msg = r"\[Errno 2\] No such file or director" - with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(FileNotFoundError, match=msg): flavor_read_html( """ @@ -154,10 +146,10 @@ def test_to_html_compat(self, flavor_read_html): df = ( DataFrame( np.random.default_rng(2).random((4, 3)), - columns=pd.Index(list("abc"), dtype=object), + columns=pd.Index(list("abc")), ) - # pylint: disable-next=consider-using-f-string - .map("{:.3f}".format).astype(float) + .map("{:.3f}".format) + .astype(float) ) out = df.to_html() res = flavor_read_html( @@ -180,24 +172,16 @@ def test_dtype_backend(self, string_storage, dtype_backend, flavor_read_html): } ) - if string_storage == "python": - string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) - string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_)) - elif dtype_backend == "pyarrow": - pa = pytest.importorskip("pyarrow") - from pandas.arrays import ArrowExtensionArray - - string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) - string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) - else: - pa = pytest.importorskip("pyarrow") - string_array = ArrowStringArray(pa.array(["a", "b", "c"])) - string_array_na = ArrowStringArray(pa.array(["a", "b", None])) - out = df.to_html(index=False) with pd.option_context("mode.string_storage", string_storage): result = flavor_read_html(StringIO(out), dtype_backend=dtype_backend)[0] + if dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + string_dtype = pd.ArrowDtype(pa.string()) + else: + string_dtype = pd.StringDtype(string_storage) + expected = DataFrame( { "a": Series([1, np.nan, 3], dtype="Int64"), @@ -206,8 +190,8 @@ def test_dtype_backend(self, string_storage, dtype_backend, flavor_read_html): "d": Series([1.5, 2.0, 2.5], dtype="Float64"), "e": Series([True, False, NA], dtype="boolean"), "f": Series([True, False, True], dtype="boolean"), - "g": string_array, - "h": string_array_na, + "g": Series(["a", "b", "c"], dtype=string_dtype), + "h": Series(["a", "b", None], dtype=string_dtype), } ) @@ -223,7 +207,9 @@ def test_dtype_backend(self, string_storage, dtype_backend, flavor_read_html): } ) - tm.assert_frame_equal(result, expected) + # the storage of the str columns' Index is also affected by the + # string_storage setting -> ignore that for checking the result + tm.assert_frame_equal(result, expected, check_column_type=False) @pytest.mark.network @pytest.mark.single_cpu @@ -1018,6 +1004,33 @@ def test_rowspan_only_rows(self, flavor_read_html): tm.assert_frame_equal(result, expected) + def test_rowspan_in_header_overflowing_to_body(self, flavor_read_html): + # GH60210 + + result = flavor_read_html( + StringIO( + """ +
+                <table>
+                    <tr>
+                        <th rowspan="2">A</th>
+                        <th>B</th>
+                    </tr>
+                    <tr>
+                        <td>1</td>
+                    </tr>
+                    <tr>
+                        <td>C</td>
+                        <td>2</td>
+                    </tr>
+                </table>
    + """ + ) + )[0] + + expected = DataFrame(data=[["A", 1], ["C", 2]], columns=["A", "B"]) + + tm.assert_frame_equal(result, expected) + def test_header_inferred_from_rows_with_only_th(self, flavor_read_html): # GH17054 result = flavor_read_html( @@ -1048,30 +1061,20 @@ def test_header_inferred_from_rows_with_only_th(self, flavor_read_html): def test_parse_dates_list(self, flavor_read_html): df = DataFrame({"date": date_range("1/1/2001", periods=10)}) - expected = df.to_html() - res = flavor_read_html(StringIO(expected), parse_dates=[1], index_col=0) - tm.assert_frame_equal(df, res[0]) - res = flavor_read_html(StringIO(expected), parse_dates=["date"], index_col=0) - tm.assert_frame_equal(df, res[0]) - - def test_parse_dates_combine(self, flavor_read_html): - raw_dates = Series(date_range("1/1/2001", periods=10)) - df = DataFrame( - { - "date": raw_dates.map(lambda x: str(x.date())), - "time": raw_dates.map(lambda x: str(x.time())), - } - ) - res = flavor_read_html( - StringIO(df.to_html()), parse_dates={"datetime": [1, 2]}, index_col=1 - ) - newdf = DataFrame({"datetime": raw_dates}) - tm.assert_frame_equal(newdf, res[0]) + + expected = df[:] + expected["date"] = expected["date"].dt.as_unit("s") + + str_df = df.to_html() + res = flavor_read_html(StringIO(str_df), parse_dates=[1], index_col=0) + tm.assert_frame_equal(expected, res[0]) + res = flavor_read_html(StringIO(str_df), parse_dates=["date"], index_col=0) + tm.assert_frame_equal(expected, res[0]) def test_wikipedia_states_table(self, datapath, flavor_read_html): data = datapath("io", "data", "html", "wikipedia_states.html") - assert os.path.isfile(data), f"{repr(data)} is not a file" - assert os.path.getsize(data), f"{repr(data)} is an empty file" + assert os.path.isfile(data), f"{data!r} is not a file" + assert os.path.getsize(data), f"{data!r} is an empty file" result = flavor_read_html(data, match="Arizona", header=1)[0] assert result.shape == (60, 12) assert "Unnamed" in result.columns[-1] @@ -1333,8 +1336,8 @@ def test_to_html_borderless(self): @pytest.mark.parametrize( "displayed_only,exp0,exp1", [ - (True, DataFrame(["foo"]), None), - (False, DataFrame(["foo bar baz qux"]), DataFrame(["foo"])), + (True, ["foo"], None), + (False, ["foo bar baz qux"], DataFrame(["foo"])), ], ) def test_displayed_only(self, displayed_only, exp0, exp1, flavor_read_html): @@ -1359,6 +1362,7 @@ def test_displayed_only(self, displayed_only, exp0, exp1, flavor_read_html): """ + exp0 = DataFrame(exp0) dfs = flavor_read_html(StringIO(data), displayed_only=displayed_only) tm.assert_frame_equal(dfs[0], exp0) @@ -1403,7 +1407,7 @@ def test_encode(self, html_encoding_file, flavor_read_html): try: with open(html_encoding_file, "rb") as fobj: from_string = flavor_read_html( - fobj.read(), encoding=encoding, index_col=0 + BytesIO(fobj.read()), encoding=encoding, index_col=0 ).pop() with open(html_encoding_file, "rb") as fobj: @@ -1462,9 +1466,7 @@ def seek(self, offset): def seekable(self): return True - # GH 49036 pylint checks for presence of __next__ for iterators - def __next__(self): - ... + def __next__(self): ... 
def __iter__(self) -> Iterator: # `is_file_like` depends on the presence of diff --git a/pandas/tests/io/test_http_headers.py b/pandas/tests/io/test_http_headers.py index 2ca11ad1f74e6..3b9c8769ad9dc 100644 --- a/pandas/tests/io/test_http_headers.py +++ b/pandas/tests/io/test_http_headers.py @@ -1,12 +1,15 @@ """ Tests for the pandas custom headers in http(s) requests """ + from functools import partial import gzip from io import BytesIO import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td import pandas as pd @@ -100,11 +103,10 @@ def stata_responder(df): pytest.param( parquetfastparquet_responder, partial(pd.read_parquet, engine="fastparquet"), - # TODO(ArrayManager) fastparquet marks=[ td.skip_if_no("fastparquet"), td.skip_if_no("fsspec"), - td.skip_array_manager_not_yet_implemented, + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string"), ], ), (pickle_respnder, pd.read_pickle), diff --git a/pandas/tests/io/test_iceberg.py b/pandas/tests/io/test_iceberg.py new file mode 100644 index 0000000000000..765eccb602434 --- /dev/null +++ b/pandas/tests/io/test_iceberg.py @@ -0,0 +1,143 @@ +""" +Tests for the Apache Iceberg format. + +Tests in this file use a simple Iceberg catalog based on SQLite, with the same +data used for Parquet tests (``pandas/tests/io/data/parquet/simple.parquet``). +""" + +import collections +import importlib +import pathlib + +import pytest + +import pandas as pd +import pandas._testing as tm + +from pandas.io.iceberg import read_iceberg + +pytestmark = pytest.mark.single_cpu + +pyiceberg = pytest.importorskip("pyiceberg") +pyiceberg_catalog = pytest.importorskip("pyiceberg.catalog") +pq = pytest.importorskip("pyarrow.parquet") + +Catalog = collections.namedtuple("Catalog", ["name", "uri"]) + + +@pytest.fixture +def catalog(request, tmp_path): + # the catalog stores the full path of data files, so the catalog needs to be + # created dynamically, and not saved in pandas/tests/io/data as other formats + uri = f"sqlite:///{tmp_path}/catalog.sqlite" + warehouse = f"file://{tmp_path}" + catalog_name = request.param if hasattr(request, "param") else None + catalog = pyiceberg_catalog.load_catalog( + catalog_name or "default", + type="sql", + uri=uri, + warehouse=warehouse, + ) + catalog.create_namespace("ns") + + df = pq.read_table( + pathlib.Path(__file__).parent / "data" / "parquet" / "simple.parquet" + ) + table = catalog.create_table("ns.my_table", schema=df.schema) + table.append(df) + + if catalog_name is not None: + config_path = pathlib.Path.home() / ".pyiceberg.yaml" + with open(config_path, "w", encoding="utf-8") as f: + f.write(f"""\ +catalog: + {catalog_name}: + type: sql + uri: {uri} + warehouse: {warehouse}""") + + importlib.reload(pyiceberg_catalog) # needed to reload the config file + + yield Catalog(name=catalog_name or "default", uri=uri) + + if catalog_name is not None: + config_path.unlink() + + +class TestIceberg: + def test_read(self, catalog): + expected = pd.DataFrame( + { + "A": [1, 2, 3], + "B": ["foo", "foo", "foo"], + } + ) + result = read_iceberg( + "ns.my_table", + catalog_properties={"uri": catalog.uri}, + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("catalog", ["default", "pandas_tests"], indirect=True) + def test_read_by_catalog_name(self, catalog): + expected = pd.DataFrame( + { + "A": [1, 2, 3], + "B": ["foo", "foo", "foo"], + } + ) + result = read_iceberg( + "ns.my_table", + catalog_name=catalog.name, + ) + tm.assert_frame_equal(result, expected) + 
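Together these tests cover the reader's keyword surface; outside a test session the equivalent call looks like the sketch below (the catalog URI and table name are illustrative stand-ins for what the `catalog` fixture builds):

    import pandas as pd

    # Assumes an Iceberg table "ns.my_table" registered in a SQLite-backed catalog.
    df = pd.read_iceberg(
        "ns.my_table",
        catalog_properties={"uri": "sqlite:////tmp/warehouse/catalog.sqlite"},
        row_filter="A > 1",  # predicate pushed down to pyiceberg
        limit=2,  # stop scanning after two rows
    )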
+ def test_read_with_row_filter(self, catalog): + expected = pd.DataFrame( + { + "A": [2, 3], + "B": ["foo", "foo"], + } + ) + result = read_iceberg( + "ns.my_table", + catalog_properties={"uri": catalog.uri}, + row_filter="A > 1", + ) + tm.assert_frame_equal(result, expected) + + def test_read_with_case_sensitive(self, catalog): + expected = pd.DataFrame( + { + "A": [1, 2, 3], + } + ) + result = read_iceberg( + "ns.my_table", + catalog_properties={"uri": catalog.uri}, + selected_fields=["a"], + case_sensitive=False, + ) + tm.assert_frame_equal(result, expected) + + with pytest.raises(ValueError, match="^Could not find column"): + read_iceberg( + "ns.my_table", + catalog_properties={"uri": catalog.uri}, + selected_fields=["a"], + case_sensitive=True, + ) + + def test_read_with_limit(self, catalog): + expected = pd.DataFrame( + { + "A": [1, 2], + "B": ["foo", "foo"], + } + ) + result = read_iceberg( + "ns.my_table", + catalog_properties={"uri": catalog.uri}, + limit=2, + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index a4021311fc963..efb3dffecd856 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -1,4 +1,5 @@ -""" test orc compat """ +"""test orc compat""" + import datetime from decimal import Decimal from io import BytesIO @@ -27,22 +28,7 @@ def dirpath(datapath): return datapath("io", "data", "orc") -@pytest.fixture( - params=[ - np.array([1, 20], dtype="uint64"), - pd.Series(["a", "b", "a"], dtype="category"), - [pd.Interval(left=0, right=2), pd.Interval(left=0, right=5)], - [pd.Period("2022-01-03", freq="D"), pd.Period("2022-01-04", freq="D")], - ] -) -def orc_writer_dtypes_not_supported(request): - # Examples of dataframes with dtypes for which conversion to ORC - # hasn't been implemented yet, that is, Category, unsigned integers, - # interval, period and sparse. - return pd.DataFrame({"unimpl": request.param}) - - -def test_orc_reader_empty(dirpath): +def test_orc_reader_empty(dirpath, using_infer_string): columns = [ "boolean1", "byte1", @@ -63,11 +49,12 @@ def test_orc_reader_empty(dirpath): "float32", "float64", "object", - "object", + "str" if using_infer_string else "object", ] expected = pd.DataFrame(index=pd.RangeIndex(0)) for colname, dtype in zip(columns, dtypes): expected[colname] = pd.Series(dtype=dtype) + expected.columns = expected.columns.astype("str") inputfile = os.path.join(dirpath, "TestOrcFile.emptyFile.orc") got = read_orc(inputfile, columns=columns) @@ -294,17 +281,27 @@ def test_orc_roundtrip_bytesio(): tm.assert_equal(expected, got) +@pytest.mark.parametrize( + "orc_writer_dtypes_not_supported", + [ + np.array([1, 20], dtype="uint64"), + pd.Series(["a", "b", "a"], dtype="category"), + [pd.Interval(left=0, right=2), pd.Interval(left=0, right=5)], + [pd.Period("2022-01-03", freq="D"), pd.Period("2022-01-04", freq="D")], + ], +) def test_orc_writer_dtypes_not_supported(orc_writer_dtypes_not_supported): # GH44554 # PyArrow gained ORC write support with the current argument order pytest.importorskip("pyarrow") + df = pd.DataFrame({"unimpl": orc_writer_dtypes_not_supported}) msg = "The dtype of one or more columns is not supported yet." 
with pytest.raises(NotImplementedError, match=msg): - orc_writer_dtypes_not_supported.to_orc() + df.to_orc() -def test_orc_dtype_backend_pyarrow(): +def test_orc_dtype_backend_pyarrow(using_infer_string): pytest.importorskip("pyarrow") df = pd.DataFrame( { @@ -325,6 +322,8 @@ def test_orc_dtype_backend_pyarrow(): ], } ) + # FIXME: without casting to ns we do not round-trip correctly + df["datetime_with_nat"] = df["datetime_with_nat"].astype("M8[ns]") bytes_data = df.copy().to_orc() result = read_orc(BytesIO(bytes_data), dtype_backend="pyarrow") @@ -335,6 +334,13 @@ def test_orc_dtype_backend_pyarrow(): for col in df.columns } ) + if using_infer_string: + # ORC does not preserve distinction between string and large string + # -> the default large string comes back as string + string_dtype = pd.ArrowDtype(pa.string()) + expected["string"] = expected["string"].astype(string_dtype) + expected["string_with_nan"] = expected["string_with_nan"].astype(string_dtype) + expected["string_with_none"] = expected["string_with_none"].astype(string_dtype) tm.assert_frame_equal(result, expected) @@ -430,7 +436,7 @@ def test_string_inference(tmp_path): result = read_orc(path) expected = pd.DataFrame( data={"a": ["x", "y"]}, - dtype="string[pyarrow_numpy]", - columns=pd.Index(["a"], dtype="string[pyarrow_numpy]"), + dtype=pd.StringDtype(na_value=np.nan), + columns=pd.Index(["a"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index ad7cdad363e78..78f39b649cb9a 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1,4 +1,5 @@ -""" test parquet compat """ +"""test parquet compat""" + import datetime from decimal import Decimal from io import BytesIO @@ -8,14 +9,16 @@ import numpy as np import pytest -from pandas._config import using_copy_on_write -from pandas._config.config import _get_option +from pandas._config import using_string_dtype from pandas.compat import is_platform_windows from pandas.compat.pyarrow import ( pa_version_under11p0, pa_version_under13p0, pa_version_under15p0, + pa_version_under17p0, + pa_version_under19p0, + pa_version_under20p0, ) import pandas as pd @@ -45,8 +48,6 @@ _HAVE_FASTPARQUET = False -# TODO(ArrayManager) fastparquet relies on BlockManager internals - pytestmark = [ pytest.mark.filterwarnings("ignore:DataFrame._data is deprecated:FutureWarning"), pytest.mark.filterwarnings( @@ -60,11 +61,17 @@ params=[ pytest.param( "fastparquet", - marks=pytest.mark.skipif( - not _HAVE_FASTPARQUET - or _get_option("mode.data_manager", silent=True) == "array", - reason="fastparquet is not installed or ArrayManager is used", - ), + marks=[ + pytest.mark.skipif( + not _HAVE_FASTPARQUET, + reason="fastparquet is not installed", + ), + pytest.mark.xfail( + using_string_dtype(), + reason="TODO(infer_string) fastparquet", + strict=False, + ), + ], ), pytest.param( "pyarrow", @@ -86,17 +93,19 @@ def pa(): @pytest.fixture -def fp(): +def fp(request): if not _HAVE_FASTPARQUET: pytest.skip("fastparquet is not installed") - elif _get_option("mode.data_manager", silent=True) == "array": - pytest.skip("ArrayManager is not supported with fastparquet") + if using_string_dtype(): + request.applymarker( + pytest.mark.xfail(reason="TODO(infer_string) fastparquet", strict=False) + ) return "fastparquet" @pytest.fixture def df_compat(): - return pd.DataFrame({"A": [1, 2, 3], "B": "foo"}) + return pd.DataFrame({"A": [1, 2, 3], "B": "foo"}, columns=pd.Index(["A", "B"])) 
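For orientation, the `check_round_trip` helper used throughout this module reduces to a write/read/compare cycle. A minimal sketch of that pattern (assuming pyarrow is installed; exact column-label dtypes can vary with the pyarrow version, which is what the `pa_version_under19p0` branches below adjust for):

    from io import BytesIO

    import pandas as pd

    df = pd.DataFrame({"A": [1, 2, 3], "B": "foo"})

    buf = BytesIO()
    with pd.option_context("io.parquet.engine", "pyarrow"):
        df.to_parquet(buf)  # write with the pinned engine
        buf.seek(0)
        result = pd.read_parquet(buf)

    pd.testing.assert_frame_equal(result, df)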
@pytest.fixture @@ -244,8 +253,10 @@ def test_invalid_engine(df_compat): check_round_trip(df_compat, "foo", "bar") -def test_options_py(df_compat, pa): +def test_options_py(df_compat, pa, using_infer_string): # use the set option + if using_infer_string and not pa_version_under19p0: + df_compat.columns = df_compat.columns.astype("str") with pd.option_context("io.parquet.engine", "pyarrow"): check_round_trip(df_compat) @@ -355,23 +366,6 @@ def test_cross_engine_fp_pa(df_cross_compat, pa, fp): tm.assert_frame_equal(result, df[["a", "d"]]) -def test_parquet_pos_args_deprecation(engine): - # GH-54229 - df = pd.DataFrame({"a": [1, 2, 3]}) - msg = ( - r"Starting with pandas version 3.0 all arguments of to_parquet except for the " - r"argument 'path' will be keyword-only." - ) - with tm.ensure_clean() as path: - with tm.assert_produces_warning( - FutureWarning, - match=msg, - check_stacklevel=False, - raise_on_extra_warnings=False, - ): - df.to_parquet(path, engine) - - class Base: def check_error_on_write(self, df, engine, exc, err_msg): # check that we are raising the exception on writing @@ -385,16 +379,6 @@ def check_external_error_on_write(self, df, engine, exc): with tm.external_error_raised(exc): to_parquet(df, path, engine, compression=None) - @pytest.mark.network - @pytest.mark.single_cpu - def test_parquet_read_from_url(self, httpserver, datapath, df_compat, engine): - if engine != "auto": - pytest.importorskip(engine) - with open(datapath("io", "data", "parquet", "simple.parquet"), mode="rb") as f: - httpserver.serve_content(content=f.read()) - df = read_parquet(httpserver.url) - tm.assert_frame_equal(df, df_compat) - class TestBasic(Base): def test_error(self, engine): @@ -448,15 +432,10 @@ def test_read_filters(self, engine, tmp_path): repeat=1, ) - def test_write_index(self, engine, using_copy_on_write, request): - check_names = engine != "fastparquet" - if using_copy_on_write and engine == "fastparquet": - request.applymarker( - pytest.mark.xfail(reason="fastparquet write into index") - ) - + def test_write_index(self): + pytest.importorskip("pyarrow") df = pd.DataFrame({"A": [1, 2, 3]}) - check_round_trip(df, engine) + check_round_trip(df, "pyarrow") indexes = [ [2, 3, 4], @@ -469,12 +448,12 @@ def test_write_index(self, engine, using_copy_on_write, request): df.index = index if isinstance(index, pd.DatetimeIndex): df.index = df.index._with_freq(None) # freq doesn't round-trip - check_round_trip(df, engine, check_names=check_names) + check_round_trip(df, "pyarrow") # index with meta-data df.index = [0, 1, 2] df.index.name = "foo" - check_round_trip(df, engine) + check_round_trip(df, "pyarrow") def test_write_multiindex(self, pa): # Not supported in fastparquet as of 0.1.3 or older pyarrow version @@ -684,6 +663,7 @@ def test_read_empty_array(self, pa, dtype): "value": pd.array([], dtype=dtype), } ) + pytest.importorskip("pyarrow", "11.0.0") # GH 45694 expected = None if dtype == "float": @@ -696,10 +676,26 @@ def test_read_empty_array(self, pa, dtype): df, pa, read_kwargs={"dtype_backend": "numpy_nullable"}, expected=expected ) + @pytest.mark.network + @pytest.mark.single_cpu + def test_parquet_read_from_url(self, httpserver, datapath, df_compat, engine): + if engine != "auto": + pytest.importorskip(engine) + with open(datapath("io", "data", "parquet", "simple.parquet"), mode="rb") as f: + httpserver.serve_content(content=f.read()) + df = read_parquet(httpserver.url, engine=engine) + + expected = df_compat + if pa_version_under19p0: + expected.columns = 
expected.columns.astype(object) + tm.assert_frame_equal(df, expected) + class TestParquetPyArrow(Base): + @pytest.mark.xfail(reason="datetime_with_nat unit doesn't round-trip") def test_basic(self, pa, df_full): df = df_full + pytest.importorskip("pyarrow", "11.0.0") # additional supported types for pyarrow dti = pd.date_range("20130101", periods=3, tz="Europe/Brussels") @@ -733,6 +729,14 @@ def test_to_bytes_without_path_or_buf_provided(self, pa, df_full): expected = df_full.copy() expected.loc[1, "string_with_nan"] = None + if pa_version_under11p0: + expected["datetime_with_nat"] = expected["datetime_with_nat"].astype( + "M8[ns]" + ) + else: + expected["datetime_with_nat"] = expected["datetime_with_nat"].astype( + "M8[ms]" + ) tm.assert_frame_equal(res, expected) def test_duplicate_columns(self, pa): @@ -785,18 +789,21 @@ def test_unsupported_float16_cleanup(self, pa, path_type): def test_categorical(self, pa): # supported in >= 0.7.0 - df = pd.DataFrame() - df["a"] = pd.Categorical(list("abcdef")) - - # test for null, out-of-order values, and unobserved category - df["b"] = pd.Categorical( - ["bar", "foo", "foo", "bar", None, "bar"], - dtype=pd.CategoricalDtype(["foo", "bar", "baz"]), - ) - - # test for ordered flag - df["c"] = pd.Categorical( - ["a", "b", "c", "a", "c", "b"], categories=["b", "c", "d"], ordered=True + df = pd.DataFrame( + { + "a": pd.Categorical(list("abcdef")), + # test for null, out-of-order values, and unobserved category + "b": pd.Categorical( + ["bar", "foo", "foo", "bar", None, "bar"], + dtype=pd.CategoricalDtype(["foo", "bar", "baz"]), + ), + # test for ordered flag + "c": pd.Categorical( + ["a", "b", "c", "a", "c", "b"], + categories=["b", "c", "d"], + ordered=True, + ), + } ) check_round_trip(df, pa) @@ -827,13 +834,7 @@ def test_s3_roundtrip(self, df_compat, s3_public_bucket, pa, s3so): ) @pytest.mark.single_cpu - @pytest.mark.parametrize( - "partition_col", - [ - ["A"], - [], - ], - ) + @pytest.mark.parametrize("partition_col", [["A"], []]) def test_s3_roundtrip_for_dir( self, df_compat, s3_public_bucket, pa, partition_col, s3so ): @@ -865,11 +866,13 @@ def test_s3_roundtrip_for_dir( repeat=1, ) - def test_read_file_like_obj_support(self, df_compat): + def test_read_file_like_obj_support(self, df_compat, using_infer_string): pytest.importorskip("pyarrow") buffer = BytesIO() df_compat.to_parquet(buffer) df_from_buf = read_parquet(buffer) + if using_infer_string and not pa_version_under19p0: + df_compat.columns = df_compat.columns.astype("str") tm.assert_frame_equal(df_compat, df_from_buf) def test_expand_user(self, df_compat, monkeypatch): @@ -925,7 +928,7 @@ def test_write_with_schema(self, pa): out_df = df.astype(bool) check_round_trip(df, pa, write_kwargs={"schema": schema}, expected=out_df) - def test_additional_extension_arrays(self, pa): + def test_additional_extension_arrays(self, pa, using_infer_string): # test additional ExtensionArrays that are supported through the # __arrow_array__ protocol pytest.importorskip("pyarrow") @@ -936,17 +939,28 @@ def test_additional_extension_arrays(self, pa): "c": pd.Series(["a", None, "c"], dtype="string"), } ) - check_round_trip(df, pa) + if using_infer_string and pa_version_under19p0: + check_round_trip(df, pa, expected=df.astype({"c": "str"})) + else: + check_round_trip(df, pa) df = pd.DataFrame({"a": pd.Series([1, 2, 3, None], dtype="Int64")}) check_round_trip(df, pa) - def test_pyarrow_backed_string_array(self, pa, string_storage): + def test_pyarrow_backed_string_array(self, pa, string_storage, 
using_infer_string): # test ArrowStringArray supported through the __arrow_array__ protocol pytest.importorskip("pyarrow") df = pd.DataFrame({"a": pd.Series(["a", None, "c"], dtype="string[pyarrow]")}) with pd.option_context("string_storage", string_storage): - check_round_trip(df, pa, expected=df.astype(f"string[{string_storage}]")) + if using_infer_string: + if pa_version_under19p0: + expected = df.astype("str") + else: + expected = df.astype(f"string[{string_storage}]") + expected.columns = expected.columns.astype("str") + else: + expected = df.astype(f"string[{string_storage}]") + check_round_trip(df, pa, expected=expected) def test_additional_extension_types(self, pa): # test additional ExtensionArrays that are supported through the @@ -972,14 +986,9 @@ def test_timestamp_nanoseconds(self, pa): df = pd.DataFrame({"a": pd.date_range("2017-01-01", freq="1ns", periods=10)}) check_round_trip(df, pa, write_kwargs={"version": ver}) - def test_timezone_aware_index(self, request, pa, timezone_aware_date_list): - if timezone_aware_date_list.tzinfo != datetime.timezone.utc: - request.applymarker( - pytest.mark.xfail( - reason="temporary skip this test until it is properly resolved: " - "https://github.com/pandas-dev/pandas/issues/37286" - ) - ) + def test_timezone_aware_index(self, pa, timezone_aware_date_list): + pytest.importorskip("pyarrow", "11.0.0") + idx = 5 * [timezone_aware_date_list] df = pd.DataFrame(index=idx, data={"index_as_col": idx}) @@ -992,7 +1001,23 @@ def test_timezone_aware_index(self, request, pa, timezone_aware_date_list): # they both implement datetime.tzinfo # they both wrap datetime.timedelta() # this use-case sets the resolution to 1 minute - check_round_trip(df, pa, check_dtype=False) + + expected = df[:] + if pa_version_under11p0: + expected.index = expected.index.as_unit("ns") + if timezone_aware_date_list.tzinfo != datetime.timezone.utc: + # pyarrow returns pytz.FixedOffset while pandas constructs datetime.timezone + # https://github.com/pandas-dev/pandas/issues/37286 + try: + import pytz + except ImportError: + pass + else: + offset = df.index.tz.utcoffset(timezone_aware_date_list) + tz = pytz.FixedOffset(offset.total_seconds() / 60) + expected.index = expected.index.tz_convert(tz) + expected["index_as_col"] = expected["index_as_col"].dt.tz_convert(tz) + check_round_trip(df, pa, check_dtype=False, expected=expected) def test_filter_row_groups(self, pa): # https://github.com/pandas-dev/pandas/issues/26551 @@ -1000,25 +1025,10 @@ def test_filter_row_groups(self, pa): df = pd.DataFrame({"a": list(range(3))}) with tm.ensure_clean() as path: df.to_parquet(path, engine=pa) - result = read_parquet( - path, pa, filters=[("a", "==", 0)], use_legacy_dataset=False - ) + result = read_parquet(path, pa, filters=[("a", "==", 0)]) assert len(result) == 1 - def test_read_parquet_manager(self, pa, using_array_manager): - # ensure that read_parquet honors the pandas.options.mode.data_manager option - df = pd.DataFrame( - np.random.default_rng(2).standard_normal((10, 3)), columns=["A", "B", "C"] - ) - - with tm.ensure_clean() as path: - df.to_parquet(path, engine=pa) - result = read_parquet(path, pa) - if using_array_manager: - assert isinstance(result._mgr, pd.core.internals.ArrayManager) - else: - assert isinstance(result._mgr, pd.core.internals.BlockManager) - + @pytest.mark.filterwarnings("ignore:make_block is deprecated:DeprecationWarning") def test_read_dtype_backend_pyarrow_config(self, pa, df_full): import pyarrow @@ -1035,13 +1045,14 @@ def 
test_read_dtype_backend_pyarrow_config(self, pa, df_full): if pa_version_under13p0: # pyarrow infers datetimes as us instead of ns expected["datetime"] = expected["datetime"].astype("timestamp[us][pyarrow]") - expected["datetime_with_nat"] = expected["datetime_with_nat"].astype( - "timestamp[us][pyarrow]" - ) expected["datetime_tz"] = expected["datetime_tz"].astype( pd.ArrowDtype(pyarrow.timestamp(unit="us", tz="Europe/Brussels")) ) + expected["datetime_with_nat"] = expected["datetime_with_nat"].astype( + "timestamp[ms][pyarrow]" + ) + check_round_trip( df, engine=pa, @@ -1065,24 +1076,34 @@ def test_read_dtype_backend_pyarrow_config_index(self, pa): expected=expected, ) - def test_columns_dtypes_not_invalid(self, pa): + @pytest.mark.parametrize( + "columns", + [ + [0, 1], + pytest.param( + [b"foo", b"bar"], + marks=pytest.mark.xfail( + pa_version_under20p0, + raises=NotImplementedError, + reason="https://github.com/apache/arrow/pull/44171", + ), + ), + pytest.param( + [ + datetime.datetime(2011, 1, 1, 0, 0), + datetime.datetime(2011, 1, 1, 1, 1), + ], + marks=pytest.mark.xfail( + pa_version_under17p0, + reason="pa.pandas_compat passes 'datetime64' to .astype", + ), + ), + ], + ) + def test_columns_dtypes_not_invalid(self, pa, columns): df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))}) - # numeric - df.columns = [0, 1] - check_round_trip(df, pa) - - # bytes - df.columns = [b"foo", b"bar"] - with pytest.raises(NotImplementedError, match="|S3"): - # Bytes fails on read_parquet - check_round_trip(df, pa) - - # python object - df.columns = [ - datetime.datetime(2011, 1, 1, 0, 0), - datetime.datetime(2011, 1, 1, 1, 1), - ] + df.columns = columns check_round_trip(df, pa) def test_empty_columns(self, pa): @@ -1098,17 +1119,24 @@ def test_df_attrs_persistence(self, tmp_path, pa): new_df = read_parquet(path, engine=pa) assert new_df.attrs == df.attrs - def test_string_inference(self, tmp_path, pa): + def test_string_inference(self, tmp_path, pa, using_infer_string): # GH#54431 path = tmp_path / "test_string_inference.p" df = pd.DataFrame(data={"a": ["x", "y"]}, index=["a", "b"]) - df.to_parquet(path, engine="pyarrow") + df.to_parquet(path, engine=pa) with pd.option_context("future.infer_string", True): - result = read_parquet(path, engine="pyarrow") + result = read_parquet(path, engine=pa) + dtype = pd.StringDtype(na_value=np.nan) expected = pd.DataFrame( data={"a": ["x", "y"]}, - dtype="string[pyarrow_numpy]", - index=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"), + dtype=dtype, + index=pd.Index(["a", "b"], dtype=dtype), + columns=pd.Index( + ["a"], + dtype=object + if pa_version_under19p0 and not using_infer_string + else dtype, + ), ) tm.assert_frame_equal(result, expected) @@ -1121,7 +1149,10 @@ def test_roundtrip_decimal(self, tmp_path, pa): df = pd.DataFrame({"a": [Decimal("123.00")]}, dtype="string[pyarrow]") df.to_parquet(path, schema=pa.schema([("a", pa.decimal128(5))])) result = read_parquet(path) - expected = pd.DataFrame({"a": ["123"]}, dtype="string[python]") + if pa_version_under19p0: + expected = pd.DataFrame({"a": ["123"]}, dtype="string[python]") + else: + expected = pd.DataFrame({"a": [Decimal("123.00")]}, dtype="object") tm.assert_frame_equal(result, expected) def test_infer_string_large_string_type(self, tmp_path, pa): @@ -1138,8 +1169,8 @@ def test_infer_string_large_string_type(self, tmp_path, pa): result = read_parquet(path) expected = pd.DataFrame( data={"a": [None, "b", "c"]}, - dtype="string[pyarrow_numpy]", - columns=pd.Index(["a"], 
dtype="string[pyarrow_numpy]"), + dtype=pd.StringDtype(na_value=np.nan), + columns=pd.Index(["a"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) @@ -1154,13 +1185,54 @@ def test_infer_string_large_string_type(self, tmp_path, pa): # df.to_parquet(tmp_path / "test.parquet") # result = read_parquet(tmp_path / "test.parquet") # assert result["strings"].dtype == "string" + # FIXME: don't leave commented-out + + def test_non_nanosecond_timestamps(self, temp_file): + # GH#49236 + pa = pytest.importorskip("pyarrow", "11.0.0") + pq = pytest.importorskip("pyarrow.parquet") + + arr = pa.array([datetime.datetime(1600, 1, 1)], type=pa.timestamp("us")) + table = pa.table([arr], names=["timestamp"]) + pq.write_table(table, temp_file) + result = read_parquet(temp_file) + expected = pd.DataFrame( + data={"timestamp": [datetime.datetime(1600, 1, 1)]}, + dtype="datetime64[us]", + ) + tm.assert_frame_equal(result, expected) + + def test_maps_as_pydicts(self, pa): + pyarrow = pytest.importorskip("pyarrow", "13.0.0") + + schema = pyarrow.schema( + [("foo", pyarrow.map_(pyarrow.string(), pyarrow.int64()))] + ) + df = pd.DataFrame([{"foo": {"A": 1}}, {"foo": {"B": 2}}]) + check_round_trip( + df, + pa, + write_kwargs={"schema": schema}, + read_kwargs={"to_pandas_kwargs": {"maps_as_pydicts": "strict"}}, + ) class TestParquetFastParquet(Base): - def test_basic(self, fp, df_full): + def test_basic(self, fp, df_full, request): + pytz = pytest.importorskip("pytz") + import fastparquet + + if Version(fastparquet.__version__) < Version("2024.11.0"): + request.applymarker( + pytest.mark.xfail( + reason=("datetime_with_nat gets incorrect values"), + ) + ) + + tz = pytz.timezone("US/Eastern") df = df_full - dti = pd.date_range("20130101", periods=3, tz="US/Eastern") + dti = pd.date_range("20130101", periods=3, tz=tz) dti = dti._with_freq(None) # freq doesn't round-trip df["datetime_tz"] = dti df["timedelta"] = pd.timedelta_range("1 day", periods=3) @@ -1193,7 +1265,17 @@ def test_duplicate_columns(self, fp): msg = "Cannot create parquet dataset with duplicate column names" self.check_error_on_write(df, fp, ValueError, msg) - def test_bool_with_none(self, fp): + def test_bool_with_none(self, fp, request): + import fastparquet + + if Version(fastparquet.__version__) < Version("2024.11.0") and Version( + np.__version__ + ) >= Version("2.0.0"): + request.applymarker( + pytest.mark.xfail( + reason=("fastparquet uses np.float_ in numpy2"), + ) + ) df = pd.DataFrame({"a": [True, None, False]}) expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16") # Fastparquet bug in 0.7.1 makes it so that this dtype becomes @@ -1301,15 +1383,25 @@ def test_error_on_using_partition_cols_and_partition_on( partition_cols=partition_cols, ) - @pytest.mark.skipif(using_copy_on_write(), reason="fastparquet writes into Index") def test_empty_dataframe(self, fp): # GH #27339 df = pd.DataFrame() expected = df.copy() check_round_trip(df, fp, expected=expected) - @pytest.mark.skipif(using_copy_on_write(), reason="fastparquet writes into Index") - def test_timezone_aware_index(self, fp, timezone_aware_date_list): + def test_timezone_aware_index(self, fp, timezone_aware_date_list, request): + import fastparquet + + if Version(fastparquet.__version__) < Version("2024.11.0"): + request.applymarker( + pytest.mark.xfail( + reason=( + "fastparquet bug, see " + "https://github.com/dask/fastparquet/issues/929" + ), + ) + ) + idx = 5 * [timezone_aware_date_list] df = pd.DataFrame(index=idx, data={"index_as_col": idx}) 
@@ -1318,21 +1410,10 @@ def test_timezone_aware_index(self, fp, timezone_aware_date_list): expected.index.name = "index" check_round_trip(df, fp, expected=expected) - def test_use_nullable_dtypes_not_supported(self, fp): - df = pd.DataFrame({"a": [1, 2]}) - - with tm.ensure_clean() as path: - df.to_parquet(path) - with pytest.raises(ValueError, match="not supported for the fastparquet"): - with tm.assert_produces_warning(FutureWarning): - read_parquet(path, engine="fastparquet", use_nullable_dtypes=True) - with pytest.raises(ValueError, match="not supported for the fastparquet"): - read_parquet(path, engine="fastparquet", dtype_backend="pyarrow") - def test_close_file_handle_on_read_error(self): with tm.ensure_clean("test.parquet") as path: pathlib.Path(path).write_bytes(b"breakit") - with pytest.raises(Exception, match=""): # Not important which exception + with tm.external_error_raised(Exception): # Not important which exception read_parquet(path, engine="fastparquet") # The next line raises an error on Windows if the file is still open pathlib.Path(path).unlink(missing_ok=False) @@ -1417,10 +1498,3 @@ def test_invalid_dtype_backend(self, engine): df.to_parquet(path) with pytest.raises(ValueError, match=msg): read_parquet(path, dtype_backend="numpy") - - @pytest.mark.skipif(using_copy_on_write(), reason="fastparquet writes into Index") - def test_empty_columns(self, fp): - # GH 52034 - df = pd.DataFrame(index=pd.Index(["a", "b", "c"], name="custom name")) - expected = pd.DataFrame(index=pd.Index(["a", "b", "c"], name="custom name")) - check_round_trip(df, fp, expected=expected) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 4f3993a038197..bab2c1561eb99 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -10,9 +10,9 @@ 3. Move the created pickle to "data/legacy_pickle/" directory. 
""" + from __future__ import annotations -from array import array import bz2 import datetime import functools @@ -31,13 +31,8 @@ import numpy as np import pytest -from pandas.compat import ( - get_lzma_file, - is_platform_little_endian, -) +from pandas.compat import is_platform_little_endian from pandas.compat._optional import import_optional_dependency -from pandas.compat.compressors import flatten_buffer -import pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -81,36 +76,8 @@ def compare_element(result, expected, typ): # --------------------- -@pytest.mark.parametrize( - "data", - [ - b"123", - b"123456", - bytearray(b"123"), - memoryview(b"123"), - pickle.PickleBuffer(b"123"), - array("I", [1, 2, 3]), - memoryview(b"123456").cast("B", (3, 2)), - memoryview(b"123456").cast("B", (3, 2))[::2], - np.arange(12).reshape((3, 4), order="C"), - np.arange(12).reshape((3, 4), order="F"), - np.arange(12).reshape((3, 4), order="C")[:, ::2], - ], -) -def test_flatten_buffer(data): - result = flatten_buffer(data) - expected = memoryview(data).tobytes("A") - assert result == expected - if isinstance(data, (bytes, bytearray)): - assert result is data - elif isinstance(result, memoryview): - assert result.ndim == 1 - assert result.format == "B" - assert result.contiguous - assert result.shape == (result.nbytes,) - - def test_pickles(datapath): + pytest.importorskip("pytz") if not is_platform_little_endian(): pytest.skip("known failure on non-little endian") @@ -230,16 +197,6 @@ def test_pickle_path_pathlib(): tm.assert_frame_equal(df, result) -def test_pickle_path_localpath(): - df = DataFrame( - 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), - ) - result = tm.round_trip_localpath(df.to_pickle, pd.read_pickle) - tm.assert_frame_equal(df, result) - - # --------------------- # test pickle compression # --------------------- @@ -271,7 +228,9 @@ def compress_file(self, src_path, dest_path, compression): tarinfo = tar.gettarinfo(src_path, os.path.basename(src_path)) tar.addfile(tarinfo, fh) elif compression == "xz": - f = get_lzma_file()(dest_path, "w") + import lzma + + f = lzma.LZMAFile(dest_path, "w") elif compression == "zstd": f = import_optional_dependency("zstandard").open(dest_path, "wb") else: @@ -310,13 +269,13 @@ def test_write_explicit(self, compression, get_random_path): @pytest.mark.parametrize("compression", ["", "None", "bad", "7z"]) def test_write_explicit_bad(self, compression, get_random_path): - with pytest.raises(ValueError, match="Unrecognized compression type"): - with tm.ensure_clean(get_random_path) as path: - df = DataFrame( - 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), - ) + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) + with tm.ensure_clean(get_random_path) as path: + with pytest.raises(ValueError, match="Unrecognized compression type"): df.to_pickle(path, compression=compression) def test_write_infer(self, compression_ext, get_random_path): @@ -410,31 +369,6 @@ def test_read(self, protocol, get_random_path): tm.assert_frame_equal(df, df2) -@pytest.mark.parametrize( - ["pickle_file", "excols"], - [ - ("test_py27.pkl", Index(["a", "b", "c"])), - ( - "test_mi_py27.pkl", - pd.MultiIndex.from_arrays([["a", "b", "c"], ["A", "B", "C"]]), - ), - 
], -) -def test_unicode_decode_error(datapath, pickle_file, excols): - # pickle file written with py27, should be readable without raising - # UnicodeDecodeError, see GH#28645 and GH#31988 - path = datapath("io", "data", "pickle", pickle_file) - df = pd.read_pickle(path) - - # just test the columns are correct since the values are random - tm.assert_index_equal(df.columns, excols) - - -# --------------------- -# tests for buffer I/O -# --------------------- - - def test_pickle_buffer_roundtrip(): with tm.ensure_clean() as path: df = DataFrame( @@ -449,55 +383,6 @@ def test_pickle_buffer_roundtrip(): tm.assert_frame_equal(df, result) -# --------------------- -# tests for URL I/O -# --------------------- - - -@pytest.mark.parametrize( - "mockurl", ["http://url.com", "ftp://test.com", "http://gzip.com"] -) -def test_pickle_generalurl_read(monkeypatch, mockurl): - def python_pickler(obj, path): - with open(path, "wb") as fh: - pickle.dump(obj, fh, protocol=-1) - - class MockReadResponse: - def __init__(self, path) -> None: - self.file = open(path, "rb") - if "gzip" in path: - self.headers = {"Content-Encoding": "gzip"} - else: - self.headers = {"Content-Encoding": ""} - - def __enter__(self): - return self - - def __exit__(self, *args): - self.close() - - def read(self): - return self.file.read() - - def close(self): - return self.file.close() - - with tm.ensure_clean() as path: - - def mock_urlopen_read(*args, **kwargs): - return MockReadResponse(path) - - df = DataFrame( - 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), - ) - python_pickler(df, path) - monkeypatch.setattr("urllib.request.urlopen", mock_urlopen_read) - result = pd.read_pickle(mockurl) - tm.assert_frame_equal(df, result) - - def test_pickle_fsspec_roundtrip(): pytest.importorskip("fsspec") with tm.ensure_clean(): @@ -600,7 +485,6 @@ def test_pickle_strings(string_series): tm.assert_series_equal(unp_series, string_series) -@td.skip_array_manager_invalid_test def test_pickle_preserves_block_ndim(): # GH#37631 ser = Series(list("abc")).astype("category").iloc[[0]] @@ -638,15 +522,3 @@ def test_pickle_frame_v124_unpickle_130(datapath): expected = DataFrame(index=[], columns=[]) tm.assert_frame_equal(df, expected) - - -def test_pickle_pos_args_deprecation(): - # GH-54229 - df = DataFrame({"a": [1, 2, 3]}) - msg = ( - r"Starting with pandas version 3.0 all arguments of to_pickle except for the " - r"argument 'path' will be keyword-only." 
- ) - with tm.assert_produces_warning(FutureWarning, match=msg): - buffer = io.BytesIO() - df.to_pickle(buffer, "infer") diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py index e118c90d9bc02..973cb21ac3041 100644 --- a/pandas/tests/io/test_spss.py +++ b/pandas/tests/io/test_spss.py @@ -65,6 +65,22 @@ def test_spss_labelled_str(datapath): tm.assert_frame_equal(df, expected) +@pytest.mark.filterwarnings("ignore::pandas.errors.ChainedAssignmentError") +@pytest.mark.filterwarnings("ignore:ChainedAssignmentError:FutureWarning") +def test_spss_kwargs(datapath): + # test file from the Haven project (https://haven.tidyverse.org/) + # Licence at LICENSES/HAVEN_LICENSE, LICENSES/HAVEN_MIT + fname = datapath("io", "data", "spss", "labelled-str.sav") + + df = pd.read_spss(fname, convert_categoricals=True, row_limit=1) + expected = pd.DataFrame({"gender": ["Male"]}, dtype="category") + tm.assert_frame_equal(df, expected) + + df = pd.read_spss(fname, convert_categoricals=False, row_offset=1) + expected = pd.DataFrame({"gender": ["F"]}) + tm.assert_frame_equal(df, expected) + + @pytest.mark.filterwarnings("ignore::pandas.errors.ChainedAssignmentError") @pytest.mark.filterwarnings("ignore:ChainedAssignmentError:FutureWarning") def test_spss_umlauts(datapath): @@ -153,12 +169,9 @@ def test_spss_metadata(datapath): "variable_measure": {"VAR00002": "unknown"}, "file_label": None, "file_format": "sav/zsav", + "creation_time": datetime.datetime(2015, 2, 6, 14, 33, 36), + "modification_time": datetime.datetime(2015, 2, 6, 14, 33, 36), } - if Version(pyreadstat.__version__) >= Version("1.2.4"): - metadata.update( - { - "creation_time": datetime.datetime(2015, 2, 6, 14, 33, 36), - "modification_time": datetime.datetime(2015, 2, 6, 14, 33, 36), - } - ) - assert df.attrs == metadata + if Version(pyreadstat.__version__) >= Version("1.2.8"): + metadata["mr_sets"] = {} + tm.assert_dict_equal(df.attrs, metadata) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 6645aefd4f0a7..ff06d04fc23bd 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -18,11 +18,10 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs import lib -from pandas.compat import ( - pa_version_under13p0, - pa_version_under14p1, -) +from pandas.compat import pa_version_under14p1 from pandas.compat._optional import import_optional_dependency import pandas.util._test_decorators as td @@ -40,10 +39,6 @@ to_timedelta, ) import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - StringArray, -) from pandas.util.version import Version from pandas.io import sql @@ -61,9 +56,12 @@ import sqlalchemy -pytestmark = pytest.mark.filterwarnings( - "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" -) +pytestmark = [ + pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" + ), + pytest.mark.single_cpu, +] @pytest.fixture @@ -239,14 +237,17 @@ def types_table_metadata(dialect: str): "types", metadata, Column("TextCol", TEXT), - Column("DateCol", date_type), + # error: Cannot infer type argument 1 of "Column" + Column("DateCol", date_type), # type: ignore[misc] Column("IntDateCol", Integer), Column("IntDateOnlyCol", Integer), Column("FloatCol", Float), Column("IntCol", Integer), - Column("BoolCol", bool_type), + # error: Cannot infer type argument 1 of "Column" + Column("BoolCol", bool_type), # type: ignore[misc] Column("IntColWithNull", Integer), - 
Column("BoolColWithNull", bool_type), + # error: Cannot infer type argument 1 of "Column" + Column("BoolColWithNull", bool_type), # type: ignore[misc] ) return types @@ -368,7 +369,7 @@ def create_and_load_postgres_datetz(conn): Timestamp("2000-01-01 08:00:00", tz="UTC"), Timestamp("2000-06-01 07:00:00", tz="UTC"), ] - return Series(expected_data, name="DateColWithTz") + return Series(expected_data, name="DateColWithTz").astype("M8[us, UTC]") def check_iris_frame(frame: DataFrame): @@ -620,13 +621,13 @@ def mysql_pymysql_engine(): def mysql_pymysql_engine_iris(mysql_pymysql_engine, iris_path): create_and_load_iris(mysql_pymysql_engine, iris_path) create_and_load_iris_view(mysql_pymysql_engine) - yield mysql_pymysql_engine + return mysql_pymysql_engine @pytest.fixture def mysql_pymysql_engine_types(mysql_pymysql_engine, types_data): create_and_load_types(mysql_pymysql_engine, types_data, "mysql") - yield mysql_pymysql_engine + return mysql_pymysql_engine @pytest.fixture @@ -667,13 +668,13 @@ def postgresql_psycopg2_engine(): def postgresql_psycopg2_engine_iris(postgresql_psycopg2_engine, iris_path): create_and_load_iris(postgresql_psycopg2_engine, iris_path) create_and_load_iris_view(postgresql_psycopg2_engine) - yield postgresql_psycopg2_engine + return postgresql_psycopg2_engine @pytest.fixture def postgresql_psycopg2_engine_types(postgresql_psycopg2_engine, types_data): create_and_load_types(postgresql_psycopg2_engine, types_data, "postgres") - yield postgresql_psycopg2_engine + return postgresql_psycopg2_engine @pytest.fixture @@ -684,6 +685,7 @@ def postgresql_psycopg2_conn(postgresql_psycopg2_engine): @pytest.fixture def postgresql_adbc_conn(): + pytest.importorskip("pyarrow") pytest.importorskip("adbc_driver_postgresql") from adbc_driver_postgresql import dbapi @@ -713,7 +715,7 @@ def postgresql_adbc_iris(postgresql_adbc_conn, iris_path): except mgr.ProgrammingError: # note arrow-adbc issue 1022 conn.rollback() create_and_load_iris_view(conn) - yield conn + return conn @pytest.fixture @@ -730,7 +732,7 @@ def postgresql_adbc_types(postgresql_adbc_conn, types_data): create_and_load_types_postgresql(conn, new_data) - yield conn + return conn @pytest.fixture @@ -784,7 +786,7 @@ def sqlite_str_iris(sqlite_str, iris_path): def sqlite_engine_iris(sqlite_engine, iris_path): create_and_load_iris(sqlite_engine, iris_path) create_and_load_iris_view(sqlite_engine) - yield sqlite_engine + return sqlite_engine @pytest.fixture @@ -805,7 +807,7 @@ def sqlite_str_types(sqlite_str, types_data): @pytest.fixture def sqlite_engine_types(sqlite_engine, types_data): create_and_load_types(sqlite_engine, types_data, "sqlite") - yield sqlite_engine + return sqlite_engine @pytest.fixture @@ -816,6 +818,7 @@ def sqlite_conn_types(sqlite_engine_types): @pytest.fixture def sqlite_adbc_conn(): + pytest.importorskip("pyarrow") pytest.importorskip("adbc_driver_sqlite") from adbc_driver_sqlite import dbapi @@ -845,7 +848,7 @@ def sqlite_adbc_iris(sqlite_adbc_conn, iris_path): except mgr.ProgrammingError: conn.rollback() create_and_load_iris_view(conn) - yield conn + return conn @pytest.fixture @@ -867,7 +870,7 @@ def sqlite_adbc_types(sqlite_adbc_conn, types_data): create_and_load_types_sqlite3(conn, new_data) conn.commit() - yield conn + return conn @pytest.fixture @@ -881,14 +884,14 @@ def sqlite_buildin(): def sqlite_buildin_iris(sqlite_buildin, iris_path): create_and_load_iris_sqlite3(sqlite_buildin, iris_path) create_and_load_iris_view(sqlite_buildin) - yield sqlite_buildin + return sqlite_buildin 
@pytest.fixture def sqlite_buildin_types(sqlite_buildin, types_data): types_data = [tuple(entry.values()) for entry in types_data] create_and_load_types_sqlite3(sqlite_buildin, types_data) - yield sqlite_buildin + return sqlite_buildin mysql_connectable = [ @@ -956,12 +959,12 @@ def sqlite_buildin_types(sqlite_buildin, types_data): adbc_connectable_iris = [ pytest.param("postgresql_adbc_iris", marks=pytest.mark.db), - pytest.param("sqlite_adbc_iris", marks=pytest.mark.db), + "sqlite_adbc_iris", ] adbc_connectable_types = [ pytest.param("postgresql_adbc_types", marks=pytest.mark.db), - pytest.param("sqlite_adbc_types", marks=pytest.mark.db), + "sqlite_adbc_types", ] @@ -985,13 +988,13 @@ def test_dataframe_to_sql(conn, test_frame1, request): @pytest.mark.parametrize("conn", all_connectable) def test_dataframe_to_sql_empty(conn, test_frame1, request): - if conn == "postgresql_adbc_conn": + if conn == "postgresql_adbc_conn" and not using_string_dtype(): request.node.add_marker( pytest.mark.xfail( - reason="postgres ADBC driver cannot insert index with null type", - strict=True, + reason="postgres ADBC driver < 1.2 cannot insert index with null type", ) ) + # GH 51086 if conn is sqlite_engine conn = request.getfixturevalue(conn) empty_df = test_frame1.iloc[:0] @@ -1065,7 +1068,9 @@ def test_to_sql(conn, method, test_frame1, request): @pytest.mark.parametrize("conn", all_connectable) -@pytest.mark.parametrize("mode, num_row_coef", [("replace", 1), ("append", 2)]) +@pytest.mark.parametrize( + "mode, num_row_coef", [("replace", 1), ("append", 2), ("delete_rows", 1)] +) def test_to_sql_exist(conn, mode, num_row_coef, test_frame1, request): conn = request.getfixturevalue(conn) with pandasSQL_builder(conn, need_transaction=True) as pandasSQL: @@ -1373,6 +1378,30 @@ def insert_on_conflict(table, conn, keys, data_iter): pandasSQL.drop_table("test_insert_conflict") +@pytest.mark.parametrize("conn", all_connectable) +def test_to_sql_on_public_schema(conn, request): + if "sqlite" in conn or "mysql" in conn: + request.applymarker( + pytest.mark.xfail( + reason="test for public schema only specific to postgresql" + ) + ) + + conn = request.getfixturevalue(conn) + + test_data = DataFrame([[1, 2.1, "a"], [2, 3.1, "b"]], columns=list("abc")) + test_data.to_sql( + name="test_public_schema", + con=conn, + if_exists="append", + index=False, + schema="public", + ) + + df_out = sql.read_sql_table("test_public_schema", conn, schema="public") + tm.assert_frame_equal(test_data, df_out) + + @pytest.mark.parametrize("conn", mysql_connectable) def test_insertion_method_on_conflict_update(conn, request): # GH 14553: Example in to_sql docstring @@ -1487,26 +1516,6 @@ def test_read_view_sqlite(sqlite_buildin): tm.assert_frame_equal(result, expected) -def test_execute_typeerror(sqlite_engine_iris): - with pytest.raises(TypeError, match="pandas.io.sql.execute requires a connection"): - with tm.assert_produces_warning( - FutureWarning, - match="`pandas.io.sql.execute` is deprecated and " - "will be removed in the future version.", - ): - sql.execute("select * from iris", sqlite_engine_iris) - - -def test_execute_deprecated(sqlite_conn_iris): - # GH50185 - with tm.assert_produces_warning( - FutureWarning, - match="`pandas.io.sql.execute` is deprecated and " - "will be removed in the future version.", - ): - sql.execute("select * from iris", sqlite_conn_iris) - - def flavor(conn_name): if "postgresql" in conn_name: return "postgresql" @@ -1698,11 +1707,9 @@ def test_api_roundtrip(conn, request, test_frame1): # HACK! 
if "adbc" in conn_name: - result = result.rename(columns={"__index_level_0__": "level_0"}) - result.index = test_frame1.index - result.set_index("level_0", inplace=True) - result.index.astype(int) - result.index.name = None + result = result.drop(columns="__index_level_0__") + else: + result = result.drop(columns="level_0") tm.assert_frame_equal(result, test_frame1) @@ -1796,7 +1803,7 @@ def test_api_date_parsing(conn, request): @pytest.mark.parametrize("conn", all_connectable_types) -@pytest.mark.parametrize("error", ["ignore", "raise", "coerce"]) +@pytest.mark.parametrize("error", ["raise", "coerce"]) @pytest.mark.parametrize( "read_sql, text, mode", [ @@ -1820,7 +1827,7 @@ def test_api_custom_dateparsing_error( pytest.mark.xfail(reason="failing combination of arguments") ) - expected = types_data_frame.astype({"DateCol": "datetime64[ns]"}) + expected = types_data_frame.astype({"DateCol": "datetime64[s]"}) result = read_sql( text, @@ -1843,10 +1850,12 @@ def test_api_custom_dateparsing_error( } ) - if not pa_version_under13p0: - # TODO: is this astype safe? - expected["DateCol"] = expected["DateCol"].astype("datetime64[us]") - + if conn_name == "postgresql_adbc_types" and pa_version_under14p1: + expected["DateCol"] = expected["DateCol"].astype("datetime64[ns]") + elif "postgres" in conn_name or "mysql" in conn_name: + expected["DateCol"] = expected["DateCol"].astype("datetime64[us]") + else: + expected["DateCol"] = expected["DateCol"].astype("datetime64[s]") tm.assert_frame_equal(result, expected) @@ -2229,12 +2238,14 @@ def test_api_chunksize_read(conn, request): @pytest.mark.parametrize("conn", all_connectable) def test_api_categorical(conn, request): if conn == "postgresql_adbc_conn": - request.node.add_marker( - pytest.mark.xfail( - reason="categorical dtype not implemented for ADBC postgres driver", - strict=True, + adbc = import_optional_dependency("adbc_driver_postgresql", errors="ignore") + if adbc is not None and Version(adbc.__version__) < Version("0.9.0"): + request.node.add_marker( + pytest.mark.xfail( + reason="categorical dtype not implemented for ADBC postgres driver", + strict=True, + ) ) - ) # GH8624 # test that categorical gets written correctly as dense column conn = request.getfixturevalue(conn) @@ -2294,9 +2305,16 @@ def test_api_escaped_table_name(conn, request): def test_api_read_sql_duplicate_columns(conn, request): # GH#53117 if "adbc" in conn: - request.node.add_marker( - pytest.mark.xfail(reason="pyarrow->pandas throws ValueError", strict=True) - ) + pa = pytest.importorskip("pyarrow") + if not ( + Version(pa.__version__) >= Version("16.0") + and conn in ["sqlite_adbc_conn", "postgresql_adbc_conn"] + ): + request.node.add_marker( + pytest.mark.xfail( + reason="pyarrow->pandas throws ValueError", strict=True + ) + ) conn = request.getfixturevalue(conn) if sql.has_table("test_table", conn): with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: @@ -2480,10 +2498,8 @@ def test_sqlalchemy_integer_overload_mapping(conn, request, integer): sql.SQLTable("test_type", db, frame=df) -@pytest.mark.parametrize("conn", all_connectable) -def test_database_uri_string(conn, request, test_frame1): +def test_database_uri_string(request, test_frame1): pytest.importorskip("sqlalchemy") - conn = request.getfixturevalue(conn) # Test read_sql and .to_sql method with a database URI (GH10654) # db_uri = 'sqlite:///:memory:' # raises # sqlalchemy.exc.OperationalError: (sqlite3.OperationalError) near @@ -2502,10 +2518,8 @@ def test_database_uri_string(conn, request, 
test_frame1): @td.skip_if_installed("pg8000") -@pytest.mark.parametrize("conn", all_connectable) -def test_pg8000_sqlalchemy_passthrough_error(conn, request): +def test_pg8000_sqlalchemy_passthrough_error(request): pytest.importorskip("sqlalchemy") - conn = request.getfixturevalue(conn) # using driver that will not be installed on CI to trigger error # in sqlalchemy.create_engine -> test passing of this error to user db_uri = "postgresql+pg8000://user:pass@host/dbname" @@ -2596,7 +2610,7 @@ def close(self): self.conn.close() with contextlib.closing(MockSqliteConnection(":memory:")) as conn: - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="only supports SQLAlchemy"): sql.read_sql("SELECT 1", conn) @@ -2682,6 +2696,59 @@ def test_drop_table(conn, request): assert not insp.has_table("temp_frame") +@pytest.mark.parametrize("conn_name", all_connectable) +def test_delete_rows_success(conn_name, test_frame1, request): + table_name = "temp_delete_rows_frame" + conn = request.getfixturevalue(conn_name) + + with pandasSQL_builder(conn) as pandasSQL: + with pandasSQL.run_transaction(): + assert pandasSQL.to_sql(test_frame1, table_name) == test_frame1.shape[0] + + with pandasSQL.run_transaction(): + assert pandasSQL.delete_rows(table_name) is None + + assert count_rows(conn, table_name) == 0 + assert pandasSQL.has_table(table_name) + + +@pytest.mark.parametrize("conn_name", all_connectable) +def test_delete_rows_is_atomic(conn_name, request): + sqlalchemy = pytest.importorskip("sqlalchemy") + + table_name = "temp_delete_rows_atomic_frame" + table_stmt = f"CREATE TABLE {table_name} (a INTEGER, b INTEGER UNIQUE NOT NULL)" + + if conn_name != "sqlite_buildin" and "adbc" not in conn_name: + table_stmt = sqlalchemy.text(table_stmt) + + # setting dtype is mandatory for adbc related tests + original_df = DataFrame({"a": [1, 2], "b": [3, 4]}, dtype="int32") + replacing_df = DataFrame({"a": [5, 6, 7], "b": [8, 8, 8]}, dtype="int32") + + conn = request.getfixturevalue(conn_name) + with pandasSQL_builder(conn) as pandasSQL: + with pandasSQL.run_transaction() as cur: + cur.execute(table_stmt) + + with pandasSQL.run_transaction(): + pandasSQL.to_sql(original_df, table_name, if_exists="append", index=False) + + # inserting duplicated values in a UNIQUE constraint column + with pytest.raises(pd.errors.DatabaseError): + with pandasSQL.run_transaction(): + pandasSQL.to_sql( + replacing_df, table_name, if_exists="delete_rows", index=False + ) + + # failed "delete_rows" is rolled back preserving original data + with pandasSQL.run_transaction(): + result_df = pandasSQL.read_query( + f"SELECT * FROM {table_name}", dtype="int32" + ) + tm.assert_frame_equal(result_df, original_df) + + @pytest.mark.parametrize("conn", all_connectable) def test_roundtrip(conn, request, test_frame1): if conn == "sqlite_str": @@ -2689,10 +2756,10 @@ def test_roundtrip(conn, request, test_frame1): conn_name = conn conn = request.getfixturevalue(conn) - pandasSQL = pandasSQL_builder(conn) - with pandasSQL.run_transaction(): - assert pandasSQL.to_sql(test_frame1, "test_frame_roundtrip") == 4 - result = pandasSQL.read_query("SELECT * FROM test_frame_roundtrip") + with pandasSQL_builder(conn) as pandasSQL: + with pandasSQL.run_transaction(): + assert pandasSQL.to_sql(test_frame1, "test_frame_roundtrip") == 4 + result = pandasSQL.read_query("SELECT * FROM test_frame_roundtrip") if "adbc" in conn_name: result = result.rename(columns={"__index_level_0__": "level_0"}) @@ -2822,7 +2889,9 @@ def 
test_datetime_with_timezone_table(conn, request): conn = request.getfixturevalue(conn) expected = create_and_load_postgres_datetz(conn) result = sql.read_sql_table("datetz", conn) - tm.assert_frame_equal(result, expected.to_frame()) + + exp_frame = expected.to_frame() + tm.assert_frame_equal(result, exp_frame) @pytest.mark.parametrize("conn", sqlalchemy_connectable) @@ -2834,7 +2903,7 @@ def test_datetime_with_timezone_roundtrip(conn, request): # For dbs that support timestamps with timezones, should get back UTC # otherwise naive data should be returned expected = DataFrame( - {"A": date_range("2013-01-01 09:00:00", periods=3, tz="US/Pacific")} + {"A": date_range("2013-01-01 09:00:00", periods=3, tz="US/Pacific", unit="us")} ) assert expected.to_sql(name="test_datetime_tz", con=conn, index=False) == 3 @@ -2852,7 +2921,7 @@ def test_datetime_with_timezone_roundtrip(conn, request): if "sqlite" in conn_name: # read_sql_query does not return datetime type like read_sql_table assert isinstance(result.loc[0, "A"], str) - result["A"] = to_datetime(result["A"]) + result["A"] = to_datetime(result["A"]).dt.as_unit("us") tm.assert_frame_equal(result, expected) @@ -2863,7 +2932,9 @@ def test_out_of_bounds_datetime(conn, request): data = DataFrame({"date": datetime(9999, 1, 1)}, index=[0]) assert data.to_sql(name="test_datetime_obb", con=conn, index=False) == 1 result = sql.read_sql_table("test_datetime_obb", conn) - expected = DataFrame([pd.NaT], columns=["date"]) + expected = DataFrame( + np.array([datetime(9999, 1, 1)], dtype="M8[us]"), columns=["date"] + ) tm.assert_frame_equal(result, expected) @@ -2872,7 +2943,7 @@ def test_naive_datetimeindex_roundtrip(conn, request): # GH 23510 # Ensure that a naive DatetimeIndex isn't converted to UTC conn = request.getfixturevalue(conn) - dates = date_range("2018-01-01", periods=5, freq="6h")._with_freq(None) + dates = date_range("2018-01-01", periods=5, freq="6h", unit="us")._with_freq(None) expected = DataFrame({"nums": range(5)}, index=dates) assert expected.to_sql(name="foo_table", con=conn, index_label="info_date") == 5 result = sql.read_sql_table("foo_table", conn, index_col="info_date") @@ -2924,7 +2995,10 @@ def test_datetime(conn, request): # with read_table -> type information from schema used result = sql.read_sql_table("test_datetime", conn) result = result.drop("index", axis=1) - tm.assert_frame_equal(result, df) + + expected = df[:] + expected["A"] = expected["A"].astype("M8[us]") + tm.assert_frame_equal(result, expected) # with read_sql -> no type information -> sqlite has no native result = sql.read_sql_query("SELECT * FROM test_datetime", conn) @@ -2932,9 +3006,7 @@ def test_datetime(conn, request): if "sqlite" in conn_name: assert isinstance(result.loc[0, "A"], str) result["A"] = to_datetime(result["A"]) - tm.assert_frame_equal(result, df) - else: - tm.assert_frame_equal(result, df) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("conn", sqlalchemy_connectable) @@ -2949,16 +3021,17 @@ def test_datetime_NaT(conn, request): # with read_table -> type information from schema used result = sql.read_sql_table("test_datetime", conn) - tm.assert_frame_equal(result, df) + expected = df[:] + expected["A"] = expected["A"].astype("M8[us]") + tm.assert_frame_equal(result, expected) # with read_sql -> no type information -> sqlite has no native result = sql.read_sql_query("SELECT * FROM test_datetime", conn) if "sqlite" in conn_name: assert isinstance(result.loc[0, "A"], str) result["A"] = to_datetime(result["A"], errors="coerce") - 
tm.assert_frame_equal(result, df) - else: - tm.assert_frame_equal(result, df) + + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("conn", sqlalchemy_connectable) @@ -3380,15 +3453,8 @@ def test_to_sql_with_negative_npinf(conn, request, input): # GH 36465 # The input {"foo": [-np.inf], "infe0": ["bar"]} does not raise any error # for pymysql version >= 0.10 - # TODO(GH#36465): remove this version check after GH 36465 is fixed - pymysql = pytest.importorskip("pymysql") - - if Version(pymysql.__version__) < Version("1.0.3") and "infe0" in df.columns: - mark = pytest.mark.xfail(reason="GH 36465") - request.applymarker(mark) - - msg = "inf cannot be used with MySQL" - with pytest.raises(ValueError, match=msg): + msg = "Execution failed on sql" + with pytest.raises(pd.errors.DatabaseError, match=msg): df.to_sql(name="foobar", con=conn, index=False) else: assert df.to_sql(name="foobar", con=conn, index=False) == 1 @@ -3508,13 +3574,6 @@ def test_options_get_engine(): assert isinstance(get_engine("sqlalchemy"), SQLAlchemyEngine) -def test_get_engine_auto_error_message(): - # Expect different error messages from get_engine(engine="auto") - # if engines aren't installed vs. are installed but bad version - pass - # TODO(GH#36893) fill this in when we add more engines - - @pytest.mark.parametrize("conn", all_connectable) @pytest.mark.parametrize("func", ["read_sql", "read_sql_query"]) def test_read_sql_dtype_backend( @@ -3537,7 +3596,8 @@ def test_read_sql_dtype_backend( result = getattr(pd, func)( f"Select * from {table}", conn, dtype_backend=dtype_backend ) - expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) + expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) + tm.assert_frame_equal(result, expected) if "adbc" in conn_name: @@ -3587,7 +3647,7 @@ def test_read_sql_dtype_backend_table( with pd.option_context("mode.string_storage", string_storage): result = getattr(pd, func)(table, conn, dtype_backend=dtype_backend) - expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) + expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) tm.assert_frame_equal(result, expected) if "adbc" in conn_name: @@ -3640,24 +3700,13 @@ def dtype_backend_data() -> DataFrame: @pytest.fixture def dtype_backend_expected(): - def func(storage, dtype_backend, conn_name) -> DataFrame: - string_array: StringArray | ArrowStringArray - string_array_na: StringArray | ArrowStringArray - if storage == "python": - string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) - string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_)) - - elif dtype_backend == "pyarrow": + def func(string_storage, dtype_backend, conn_name) -> DataFrame: + string_dtype: pd.StringDtype | pd.ArrowDtype + if dtype_backend == "pyarrow": pa = pytest.importorskip("pyarrow") - from pandas.arrays import ArrowExtensionArray - - string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) # type: ignore[assignment] - string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) # type: ignore[assignment] - + string_dtype = pd.ArrowDtype(pa.string()) else: - pa = pytest.importorskip("pyarrow") - string_array = ArrowStringArray(pa.array(["a", "b", "c"])) - string_array_na = ArrowStringArray(pa.array(["a", "b", None])) + string_dtype = pd.StringDtype(string_storage) df = DataFrame( { @@ -3667,8 +3716,8 @@ def func(storage, dtype_backend, conn_name) -> DataFrame: "d": Series([1.5, 2.0, 2.5], dtype="Float64"), "e": Series([True, 
False, pd.NA], dtype="boolean"), "f": Series([True, False, True], dtype="boolean"), - "g": string_array, - "h": string_array_na, + "g": Series(["a", "b", "c"], dtype=string_dtype), + "h": Series(["a", "b", None], dtype=string_dtype), } ) if dtype_backend == "pyarrow": @@ -3744,20 +3793,6 @@ def test_read_sql_dtype(conn, request, func, dtype_backend): tm.assert_frame_equal(result, expected) -def test_keyword_deprecation(sqlite_engine): - conn = sqlite_engine - # GH 54397 - msg = ( - "Starting with pandas version 3.0 all arguments of to_sql except for the " - "arguments 'name' and 'con' will be keyword-only." - ) - df = DataFrame([{"A": 1, "B": 2, "C": 3}, {"A": 1, "B": 2, "C": 3}]) - df.to_sql("example", conn) - - with tm.assert_produces_warning(FutureWarning, match=msg): - df.to_sql("example", conn, None, if_exists="replace") - - def test_bigint_warning(sqlite_engine): conn = sqlite_engine # test no warning for BIGINT (to support int64) is raised (GH7433) @@ -3817,7 +3852,6 @@ class Test(BaseModel): def test_read_sql_string_inference(sqlite_engine): conn = sqlite_engine # GH#54430 - pytest.importorskip("pyarrow") table = "test" df = DataFrame({"a": ["x", "y"]}) df.to_sql(table, con=conn, index=False, if_exists="replace") @@ -3825,7 +3859,7 @@ def test_read_sql_string_inference(sqlite_engine): with pd.option_context("future.infer_string", True): result = read_sql_table(table, conn) - dtype = "string[pyarrow_numpy]" + dtype = pd.StringDtype(na_value=np.nan) expected = DataFrame( {"a": ["x", "y"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) ) @@ -3964,6 +3998,7 @@ def test_self_join_date_columns(postgresql_psycopg2_engine): expected = DataFrame( [[1, Timestamp("2021", tz="UTC")] * 2], columns=["id", "created_dt"] * 2 ) + expected["created_dt"] = expected["created_dt"].astype("M8[us, UTC]") tm.assert_frame_equal(result, expected) # Cleanup @@ -4128,7 +4163,7 @@ def tquery(query, con=None): def test_xsqlite_basic(sqlite_buildin): frame = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) assert sql.to_sql(frame, name="test_table", con=sqlite_buildin, index=False) == 10 @@ -4155,7 +4190,7 @@ def test_xsqlite_basic(sqlite_buildin): def test_xsqlite_write_row_by_row(sqlite_buildin): frame = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) frame.iloc[0, 0] = np.nan @@ -4178,7 +4213,7 @@ def test_xsqlite_write_row_by_row(sqlite_buildin): def test_xsqlite_execute(sqlite_buildin): frame = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) create_sql = sql.get_schema(frame, "test") @@ -4199,7 +4234,7 @@ def test_xsqlite_execute(sqlite_buildin): def test_xsqlite_schema(sqlite_buildin): frame = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) create_sql = sql.get_schema(frame, "test") @@ -4230,11 +4265,11 @@ def test_xsqlite_execute_fail(sqlite_buildin): cur.execute(create_sql) with sql.pandasSQL_builder(sqlite_buildin) as pandas_sql: - pandas_sql.execute('INSERT INTO test VALUES("foo", "bar", 1.234)') - pandas_sql.execute('INSERT INTO 
test VALUES("foo", "baz", 2.567)') + pandas_sql.execute("INSERT INTO test VALUES('foo', 'bar', 1.234)") + pandas_sql.execute("INSERT INTO test VALUES('foo', 'baz', 2.567)") with pytest.raises(sql.DatabaseError, match="Execution failed on sql"): - pandas_sql.execute('INSERT INTO test VALUES("foo", "bar", 7)') + pandas_sql.execute("INSERT INTO test VALUES('foo', 'bar', 7)") def test_xsqlite_execute_closed_connection(): @@ -4252,7 +4287,7 @@ def test_xsqlite_execute_closed_connection(): cur.execute(create_sql) with sql.pandasSQL_builder(conn) as pandas_sql: - pandas_sql.execute('INSERT INTO test VALUES("foo", "bar", 1.234)') + pandas_sql.execute("INSERT INTO test VALUES('foo', 'bar', 1.234)") msg = "Cannot operate on a closed database." with pytest.raises(sqlite3.ProgrammingError, match=msg): diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 3e4e1a107da9d..e73de78847c8f 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -3,7 +3,9 @@ from datetime import datetime import gzip import io +import itertools import os +import string import struct import tarfile import zipfile @@ -11,6 +13,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import CategoricalDtype import pandas._testing as tm @@ -61,16 +65,16 @@ def read_csv(self, file): return read_csv(file, parse_dates=True) @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) - def test_read_empty_dta(self, version): + def test_read_empty_dta(self, version, temp_file): empty_ds = DataFrame(columns=["unit"]) # GH 7369, make sure can read a 0-obs dta file - with tm.ensure_clean() as path: - empty_ds.to_stata(path, write_index=False, version=version) - empty_ds2 = read_stata(path) - tm.assert_frame_equal(empty_ds, empty_ds2) + path = temp_file + empty_ds.to_stata(path, write_index=False, version=version) + empty_ds2 = read_stata(path) + tm.assert_frame_equal(empty_ds, empty_ds2) @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) - def test_read_empty_dta_with_dtypes(self, version): + def test_read_empty_dta_with_dtypes(self, version, temp_file): # GH 46240 # Fixing above bug revealed that types are not correctly preserved when # writing empty DataFrames @@ -88,7 +92,12 @@ def test_read_empty_dta_with_dtypes(self, version): "f64": np.array([0], dtype=np.float64), } ) - expected = empty_df_typed.copy() + # GH 7369, make sure can read a 0-obs dta file + path = temp_file + empty_df_typed.to_stata(path, write_index=False, version=version) + empty_reread = read_stata(path) + + expected = empty_df_typed # No uint# support. 
Downcast since values in range for int# expected["u8"] = expected["u8"].astype(np.int8) expected["u16"] = expected["u16"].astype(np.int16) @@ -97,29 +106,27 @@ def test_read_empty_dta_with_dtypes(self, version): expected["u64"] = expected["u64"].astype(np.int32) expected["i64"] = expected["i64"].astype(np.int32) - # GH 7369, make sure can read a 0-obs dta file - with tm.ensure_clean() as path: - empty_df_typed.to_stata(path, write_index=False, version=version) - empty_reread = read_stata(path) - tm.assert_frame_equal(expected, empty_reread) - tm.assert_series_equal(expected.dtypes, empty_reread.dtypes) + tm.assert_frame_equal(expected, empty_reread) + tm.assert_series_equal(expected.dtypes, empty_reread.dtypes) @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) - def test_read_index_col_none(self, version): + def test_read_index_col_none(self, version, temp_file): df = DataFrame({"a": range(5), "b": ["b1", "b2", "b3", "b4", "b5"]}) # GH 7369, make sure can read a 0-obs dta file - with tm.ensure_clean() as path: - df.to_stata(path, write_index=False, version=version) - read_df = read_stata(path) + path = temp_file + df.to_stata(path, write_index=False, version=version) + read_df = read_stata(path) assert isinstance(read_df.index, pd.RangeIndex) - expected = df.copy() + expected = df expected["a"] = expected["a"].astype(np.int32) tm.assert_frame_equal(read_df, expected, check_index_type=True) - @pytest.mark.parametrize("file", ["stata1_114", "stata1_117"]) - def test_read_dta1(self, file, datapath): - file = datapath("io", "data", "stata", f"{file}.dta") + @pytest.mark.parametrize( + "version", [102, 103, 104, 105, 108, 110, 111, 113, 114, 115, 117, 118, 119] + ) + def test_read_dta1(self, version, datapath): + file = datapath("io", "data", "stata", f"stata1_{version}.dta") parsed = self.read_dta(file) # Pandas uses np.nan as missing value. 
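
Note: the change repeated throughout these Stata tests is replacing the tm.ensure_clean() context manager with a temp_file fixture, which drops one level of indentation from each test body. A sketch of such a fixture built on pytest's built-in tmp_path; the fixture actually used by the pandas suite may be defined differently:

import pytest

@pytest.fixture
def temp_file(tmp_path):
    # One scratch path per test; tmp_path is a per-test directory that
    # pytest creates and prunes automatically.
    return tmp_path / "scratch.dta"
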
@@ -133,6 +140,18 @@ def test_read_dta1(self, file, datapath): # the casting doesn't fail so need to match stata here expected["float_miss"] = expected["float_miss"].astype(np.float32) + # Column names too long for older Stata formats + if version <= 108: + expected = expected.rename( + columns={ + "float_miss": "f_miss", + "double_miss": "d_miss", + "byte_miss": "b_miss", + "int_miss": "i_miss", + "long_miss": "l_miss", + } + ) + tm.assert_frame_equal(parsed, expected) def test_read_dta2(self, datapath): @@ -171,17 +190,25 @@ def test_read_dta2(self, datapath): "yearly_date", ], ) - expected["yearly_date"] = expected["yearly_date"].astype("O") + # TODO(GH#55564): just pass M8[s] to the constructor + expected["datetime_c"] = expected["datetime_c"].astype("M8[ms]") + expected["date"] = expected["date"].astype("M8[s]") + expected["weekly_date"] = expected["weekly_date"].astype("M8[s]") + expected["monthly_date"] = expected["monthly_date"].astype("M8[s]") + expected["quarterly_date"] = expected["quarterly_date"].astype("M8[s]") + expected["half_yearly_date"] = expected["half_yearly_date"].astype("M8[s]") + expected["yearly_date"] = expected["yearly_date"].astype("M8[s]") path1 = datapath("io", "data", "stata", "stata2_114.dta") path2 = datapath("io", "data", "stata", "stata2_115.dta") path3 = datapath("io", "data", "stata", "stata2_117.dta") - with tm.assert_produces_warning(UserWarning): + msg = "Leaving in Stata Internal Format" + with tm.assert_produces_warning(UserWarning, match=msg): parsed_114 = self.read_dta(path1) - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match=msg): parsed_115 = self.read_dta(path2) - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match=msg): parsed_117 = self.read_dta(path3) # FIXME: don't leave commented-out # 113 is buggy due to limits of date format support in Stata @@ -193,9 +220,9 @@ def test_read_dta2(self, datapath): # buggy test because of the NaT comparison on certain platforms # Format 113 test fails since it does not support tc and tC formats # tm.assert_frame_equal(parsed_113, expected) - tm.assert_frame_equal(parsed_114, expected, check_datetimelike_compat=True) - tm.assert_frame_equal(parsed_115, expected, check_datetimelike_compat=True) - tm.assert_frame_equal(parsed_117, expected, check_datetimelike_compat=True) + tm.assert_frame_equal(parsed_114, expected) + tm.assert_frame_equal(parsed_115, expected) + tm.assert_frame_equal(parsed_117, expected) @pytest.mark.parametrize( "file", ["stata3_113", "stata3_114", "stata3_115", "stata3_117"] @@ -212,11 +239,9 @@ def test_read_dta3(self, file, datapath): tm.assert_frame_equal(parsed, expected) - @pytest.mark.parametrize( - "file", ["stata4_113", "stata4_114", "stata4_115", "stata4_117"] - ) - def test_read_dta4(self, file, datapath): - file = datapath("io", "data", "stata", f"{file}.dta") + @pytest.mark.parametrize("version", [110, 111, 113, 114, 115, 117]) + def test_read_dta4(self, version, datapath): + file = datapath("io", "data", "stata", f"stata4_{version}.dta") parsed = self.read_dta(file) expected = DataFrame.from_records( @@ -258,9 +283,66 @@ def test_read_dta4(self, file, datapath): # stata doesn't save .category metadata tm.assert_frame_equal(parsed, expected) + @pytest.mark.parametrize("version", [102, 103, 104, 105, 108]) + def test_readold_dta4(self, version, datapath): + # This test is the same as test_read_dta4 above except that the columns + # had to be renamed to match the restrictions in 
older file format
+        file = datapath("io", "data", "stata", f"stata4_{version}.dta")
+        parsed = self.read_dta(file)
+
+        expected = DataFrame.from_records(
+            [
+                ["one", "ten", "one", "one", "one"],
+                ["two", "nine", "two", "two", "two"],
+                ["three", "eight", "three", "three", "three"],
+                ["four", "seven", 4, "four", "four"],
+                ["five", "six", 5, np.nan, "five"],
+                ["six", "five", 6, np.nan, "six"],
+                ["seven", "four", 7, np.nan, "seven"],
+                ["eight", "three", 8, np.nan, "eight"],
+                ["nine", "two", 9, np.nan, "nine"],
+                ["ten", "one", "ten", np.nan, "ten"],
+            ],
+            columns=[
+                "fulllab",
+                "fulllab2",
+                "incmplab",
+                "misslab",
+                "floatlab",
+            ],
+        )
+
+        # these are all categoricals
+        for col in expected:
+            orig = expected[col].copy()
+
+            categories = np.asarray(expected["fulllab"][orig.notna()])
+            if col == "incmplab":
+                categories = orig
+
+            cat = orig.astype("category")._values
+            cat = cat.set_categories(categories, ordered=True)
+            cat.categories.rename(None, inplace=True)
+
+            expected[col] = cat
+
+        # stata doesn't save .category metadata
+        tm.assert_frame_equal(parsed, expected)
+
     # File containing strls
-    def test_read_dta12(self, datapath):
-        parsed_117 = self.read_dta(datapath("io", "data", "stata", "stata12_117.dta"))
+    @pytest.mark.parametrize(
+        "file",
+        [
+            "stata12_117",
+            "stata12_be_117",
+            "stata12_118",
+            "stata12_be_118",
+            "stata12_119",
+            "stata12_be_119",
+        ],
+    )
+    def test_read_dta_strl(self, file, datapath):
+        parsed = self.read_dta(datapath("io", "data", "stata", f"{file}.dta"))
         expected = DataFrame.from_records(
             [
                 [1, "abc", "abcdefghi"],
@@ -270,10 +352,20 @@ def test_read_dta12(self, datapath):
             columns=["x", "y", "z"],
         )
 
-        tm.assert_frame_equal(parsed_117, expected, check_dtype=False)
+        tm.assert_frame_equal(parsed, expected, check_dtype=False)
 
-    def test_read_dta18(self, datapath):
-        parsed_118 = self.read_dta(datapath("io", "data", "stata", "stata14_118.dta"))
+    # 117 is not included in this list as it uses ASCII strings
+    @pytest.mark.parametrize(
+        "file",
+        [
+            "stata14_118",
+            "stata14_be_118",
+            "stata14_119",
+            "stata14_be_119",
+        ],
+    )
+    def test_read_dta118_119(self, file, datapath):
+        parsed_118 = self.read_dta(datapath("io", "data", "stata", f"{file}.dta"))
         parsed_118["Bytes"] = parsed_118["Bytes"].astype("O")
         expected = DataFrame.from_records(
             [
@@ -297,7 +389,7 @@ def test_read_dta18(self, datapath):
         for col in parsed_118.columns:
             tm.assert_almost_equal(parsed_118[col], expected[col])
 
-        with StataReader(datapath("io", "data", "stata", "stata14_118.dta")) as rdr:
+        with StataReader(datapath("io", "data", "stata", f"{file}.dta")) as rdr:
             vl = rdr.variable_labels()
             vl_expected = {
                 "Unicode_Cities_Strl": "Here are some strls with Ünicode chars",
@@ -312,39 +404,39 @@ def test_read_dta18(self, datapath):
 
             assert rdr.data_label == "This is a Ünicode data label"
 
-    def test_read_write_dta5(self):
+    def test_read_write_dta5(self, temp_file):
         original = DataFrame(
             [(np.nan, np.nan, np.nan, np.nan, np.nan)],
             columns=["float_miss", "double_miss", "byte_miss", "int_miss", "long_miss"],
         )
         original.index.name = "index"
 
-        with tm.ensure_clean() as path:
-            original.to_stata(path, convert_dates=None)
-            written_and_read_again = self.read_dta(path)
+        path = temp_file
+        original.to_stata(path, convert_dates=None)
+        written_and_read_again = self.read_dta(path)
 
-        expected = original.copy()
+        expected = original
         expected.index = expected.index.astype(np.int32)
         tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)
 
-    def test_write_dta6(self, datapath):
+    def test_write_dta6(self, datapath, temp_file):
         original = self.read_csv(datapath("io", "data", "stata", "stata3.csv"))
         original.index.name = "index"
         original.index = original.index.astype(np.int32)
         original["year"] = original["year"].astype(np.int32)
         original["quarter"] = original["quarter"].astype(np.int32)
 
-        with tm.ensure_clean() as path:
-            original.to_stata(path, convert_dates=None)
-            written_and_read_again = self.read_dta(path)
-            tm.assert_frame_equal(
-                written_and_read_again.set_index("index"),
-                original,
-                check_index_type=False,
-            )
+        path = temp_file
+        original.to_stata(path, convert_dates=None)
+        written_and_read_again = self.read_dta(path)
+        tm.assert_frame_equal(
+            written_and_read_again.set_index("index"),
+            original,
+            check_index_type=False,
+        )
 
     @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
-    def test_read_write_dta10(self, version):
+    def test_read_write_dta10(self, version, temp_file, using_infer_string):
         original = DataFrame(
             data=[["string", "object", 1, 1.1, np.datetime64("2003-12-25")]],
             columns=["string", "object", "integer", "floating", "datetime"],
@@ -354,24 +446,29 @@ def test_read_write_dta10(self, version):
         original.index = original.index.astype(np.int32)
         original["integer"] = original["integer"].astype(np.int32)
 
-        with tm.ensure_clean() as path:
-            original.to_stata(path, convert_dates={"datetime": "tc"}, version=version)
-            written_and_read_again = self.read_dta(path)
-            # original.index is np.int32, read index is np.int64
-            tm.assert_frame_equal(
-                written_and_read_again.set_index("index"),
-                original,
-                check_index_type=False,
-            )
+        path = temp_file
+        original.to_stata(path, convert_dates={"datetime": "tc"}, version=version)
+        written_and_read_again = self.read_dta(path)
 
-    def test_stata_doc_examples(self):
-        with tm.ensure_clean() as path:
-            df = DataFrame(
-                np.random.default_rng(2).standard_normal((10, 2)), columns=list("AB")
-            )
-            df.to_stata(path)
+        expected = original.copy()
+        # "tc" convert_dates means we store in ms
+        expected["datetime"] = expected["datetime"].astype("M8[ms]")
+        if using_infer_string:
+            expected["object"] = expected["object"].astype("str")
+
+        tm.assert_frame_equal(
+            written_and_read_again.set_index("index"),
+            expected,
+        )
 
-    def test_write_preserves_original(self):
+    def test_stata_doc_examples(self, temp_file):
+        path = temp_file
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((10, 2)), columns=list("AB")
+        )
+        df.to_stata(path)
+
+    def test_write_preserves_original(self, temp_file):
         # 9795
 
         df = DataFrame(
@@ -379,12 +476,12 @@ def test_write_preserves_original(self):
         )
         df.loc[2, "a":"c"] = np.nan
         df_copy = df.copy()
-        with tm.ensure_clean() as path:
-            df.to_stata(path, write_index=False)
+        path = temp_file
+        df.to_stata(path, write_index=False)
         tm.assert_frame_equal(df, df_copy)
 
     @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
-    def test_encoding(self, version, datapath):
+    def test_encoding(self, version, datapath, temp_file):
         # GH 4626, proper encoding handling
         raw = read_stata(datapath("io", "data", "stata", "stata1_encoding.dta"))
         encoded = read_stata(datapath("io", "data", "stata", "stata1_encoding.dta"))
@@ -394,17 +491,17 @@ def test_encoding(self, version, datapath):
         assert result == expected
         assert isinstance(result, str)
 
-        with tm.ensure_clean() as path:
-            encoded.to_stata(path, write_index=False, version=version)
-            reread_encoded = read_stata(path)
-            tm.assert_frame_equal(encoded, reread_encoded)
+        path = temp_file
+        encoded.to_stata(path, write_index=False, version=version)
+        reread_encoded = read_stata(path)
+        tm.assert_frame_equal(encoded, reread_encoded)
 
-    def test_read_write_dta11(self):
+    def test_read_write_dta11(self, temp_file):
         original = DataFrame(
             [(1, 2, 3, 4)],
             columns=[
                 "good",
-                "b\u00E4d",
+                "b\u00e4d",
                 "8number",
                 "astringwithmorethan32characters______",
             ],
@@ -416,18 +513,19 @@ def test_read_write_dta11(self):
         formatted.index.name = "index"
         formatted = formatted.astype(np.int32)
 
-        with tm.ensure_clean() as path:
-            with tm.assert_produces_warning(InvalidColumnName):
-                original.to_stata(path, convert_dates=None)
+        path = temp_file
+        msg = "Not all pandas column names were valid Stata variable names"
+        with tm.assert_produces_warning(InvalidColumnName, match=msg):
+            original.to_stata(path, convert_dates=None)
 
-            written_and_read_again = self.read_dta(path)
+        written_and_read_again = self.read_dta(path)
 
-        expected = formatted.copy()
+        expected = formatted
         expected.index = expected.index.astype(np.int32)
         tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)
 
     @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
-    def test_read_write_dta12(self, version):
+    def test_read_write_dta12(self, version, temp_file):
         original = DataFrame(
             [(1, 2, 3, 4, 5, 6)],
             columns=[
@@ -453,18 +551,19 @@ def test_read_write_dta12(self, version):
         formatted.index.name = "index"
         formatted = formatted.astype(np.int32)
 
-        with tm.ensure_clean() as path:
-            with tm.assert_produces_warning(InvalidColumnName):
-                original.to_stata(path, convert_dates=None, version=version)
-                # should get a warning for that format.
+        path = temp_file
+        msg = "Not all pandas column names were valid Stata variable names"
+        with tm.assert_produces_warning(InvalidColumnName, match=msg):
+            original.to_stata(path, convert_dates=None, version=version)
+            # should get a warning for that format.
 
-            written_and_read_again = self.read_dta(path)
+        written_and_read_again = self.read_dta(path)
 
-        expected = formatted.copy()
+        expected = formatted
         expected.index = expected.index.astype(np.int32)
         tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)
 
-    def test_read_write_dta13(self):
+    def test_read_write_dta13(self, temp_file):
         s1 = Series(2**9, dtype=np.int16)
         s2 = Series(2**17, dtype=np.int32)
         s3 = Series(2**33, dtype=np.int64)
@@ -474,11 +573,11 @@ def test_read_write_dta13(self):
         formatted = original
         formatted["int64"] = formatted["int64"].astype(np.float64)
 
-        with tm.ensure_clean() as path:
-            original.to_stata(path)
-            written_and_read_again = self.read_dta(path)
+        path = temp_file
+        original.to_stata(path)
+        written_and_read_again = self.read_dta(path)
 
-        expected = formatted.copy()
+        expected = formatted
         expected.index = expected.index.astype(np.int32)
         tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)
 
@@ -486,19 +585,20 @@ def test_read_write_dta13(self):
     @pytest.mark.parametrize(
         "file", ["stata5_113", "stata5_114", "stata5_115", "stata5_117"]
     )
-    def test_read_write_reread_dta14(self, file, parsed_114, version, datapath):
+    def test_read_write_reread_dta14(
+        self, file, parsed_114, version, datapath, temp_file
+    ):
         file = datapath("io", "data", "stata", f"{file}.dta")
         parsed = self.read_dta(file)
         parsed.index.name = "index"
 
         tm.assert_frame_equal(parsed_114, parsed)
 
-        with tm.ensure_clean() as path:
-            parsed_114.to_stata(path, convert_dates={"date_td": "td"}, version=version)
-            written_and_read_again = self.read_dta(path)
+        path = temp_file
+        parsed_114.to_stata(path, convert_dates={"date_td": "td"}, version=version)
+        written_and_read_again = self.read_dta(path)
 
         expected = parsed_114.copy()
-        expected.index = expected.index.astype(np.int32)
         tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)
 
     @pytest.mark.parametrize(
@@ -511,9 +611,10 @@ def test_read_write_reread_dta15(self, file, datapath):
         expected["long_"] = expected["long_"].astype(np.int32)
         expected["float_"] = expected["float_"].astype(np.float32)
         expected["double_"] = expected["double_"].astype(np.float64)
-        expected["date_td"] = expected["date_td"].apply(
-            datetime.strptime, args=("%Y-%m-%d",)
-        )
+
+        # TODO(GH#55564): directly cast to M8[s]
+        arr = expected["date_td"].astype("Period[D]")._values.asfreq("s", how="S")
+        expected["date_td"] = arr.view("M8[s]")
 
         file = datapath("io", "data", "stata", f"{file}.dta")
         parsed = self.read_dta(file)
@@ -521,50 +622,50 @@ def test_read_write_reread_dta15(self, file, datapath):
         tm.assert_frame_equal(expected, parsed)
 
     @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
-    def test_timestamp_and_label(self, version):
+    def test_timestamp_and_label(self, version, temp_file):
         original = DataFrame([(1,)], columns=["variable"])
         time_stamp = datetime(2000, 2, 29, 14, 21)
         data_label = "This is a data file."
-        with tm.ensure_clean() as path:
-            original.to_stata(
-                path, time_stamp=time_stamp, data_label=data_label, version=version
-            )
+        path = temp_file
+        original.to_stata(
+            path, time_stamp=time_stamp, data_label=data_label, version=version
+        )
 
-            with StataReader(path) as reader:
-                assert reader.time_stamp == "29 Feb 2000 14:21"
-                assert reader.data_label == data_label
+        with StataReader(path) as reader:
+            assert reader.time_stamp == "29 Feb 2000 14:21"
+            assert reader.data_label == data_label
 
     @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
-    def test_invalid_timestamp(self, version):
+    def test_invalid_timestamp(self, version, temp_file):
         original = DataFrame([(1,)], columns=["variable"])
         time_stamp = "01 Jan 2000, 00:00:00"
-        with tm.ensure_clean() as path:
-            msg = "time_stamp should be datetime type"
-            with pytest.raises(ValueError, match=msg):
-                original.to_stata(path, time_stamp=time_stamp, version=version)
-            assert not os.path.isfile(path)
+        path = temp_file
+        msg = "time_stamp should be datetime type"
+        with pytest.raises(ValueError, match=msg):
+            original.to_stata(path, time_stamp=time_stamp, version=version)
+        assert not os.path.isfile(path)
 
-    def test_numeric_column_names(self):
+    def test_numeric_column_names(self, temp_file):
         original = DataFrame(np.reshape(np.arange(25.0), (5, 5)))
         original.index.name = "index"
-        with tm.ensure_clean() as path:
-            # should get a warning for that format.
-            with tm.assert_produces_warning(InvalidColumnName):
-                original.to_stata(path)
+        path = temp_file
+        # should get a warning for that format.
+        msg = "Not all pandas column names were valid Stata variable names"
+        with tm.assert_produces_warning(InvalidColumnName, match=msg):
+            original.to_stata(path)
 
-            written_and_read_again = self.read_dta(path)
+        written_and_read_again = self.read_dta(path)
 
         written_and_read_again = written_and_read_again.set_index("index")
         columns = list(written_and_read_again.columns)
         convert_col_name = lambda x: int(x[1])
         written_and_read_again.columns = map(convert_col_name, columns)
-        expected = original.copy()
-        expected.index = expected.index.astype(np.int32)
+        expected = original
         tm.assert_frame_equal(expected, written_and_read_again)
 
     @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
-    def test_nan_to_missing_value(self, version):
+    def test_nan_to_missing_value(self, version, temp_file):
         s1 = Series(np.arange(4.0), dtype=np.float32)
         s2 = Series(np.arange(4.0), dtype=np.float64)
         s1[::2] = np.nan
@@ -572,71 +673,70 @@ def test_nan_to_missing_value(self, version):
         original = DataFrame({"s1": s1, "s2": s2})
         original.index.name = "index"
 
-        with tm.ensure_clean() as path:
-            original.to_stata(path, version=version)
-            written_and_read_again = self.read_dta(path)
+        path = temp_file
+        original.to_stata(path, version=version)
+        written_and_read_again = self.read_dta(path)
 
         written_and_read_again = written_and_read_again.set_index("index")
-        expected = original.copy()
-        expected.index = expected.index.astype(np.int32)
+        expected = original
         tm.assert_frame_equal(written_and_read_again, expected)
 
-    def test_no_index(self):
+    def test_no_index(self, temp_file):
         columns = ["x", "y"]
         original = DataFrame(np.reshape(np.arange(10.0), (5, 2)), columns=columns)
         original.index.name = "index_not_written"
-        with tm.ensure_clean() as path:
-            original.to_stata(path, write_index=False)
-            written_and_read_again = self.read_dta(path)
-            with pytest.raises(KeyError, match=original.index.name):
-                written_and_read_again["index_not_written"]
+        path = temp_file
+        original.to_stata(path, write_index=False)
+        written_and_read_again = self.read_dta(path)
+        with pytest.raises(KeyError, match=original.index.name):
+            written_and_read_again["index_not_written"]
 
-    def test_string_no_dates(self):
+    def test_string_no_dates(self, temp_file):
         s1 = Series(["a", "A longer string"])
         s2 = Series([1.0, 2.0], dtype=np.float64)
         original = DataFrame({"s1": s1, "s2": s2})
         original.index.name = "index"
-        with tm.ensure_clean() as path:
-            original.to_stata(path)
-            written_and_read_again = self.read_dta(path)
+        path = temp_file
+        original.to_stata(path)
+        written_and_read_again = self.read_dta(path)
 
-        expected = original.copy()
-        expected.index = expected.index.astype(np.int32)
+        expected = original
         tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)
 
-    def test_large_value_conversion(self):
+    def test_large_value_conversion(self, temp_file):
         s0 = Series([1, 99], dtype=np.int8)
         s1 = Series([1, 127], dtype=np.int8)
         s2 = Series([1, 2**15 - 1], dtype=np.int16)
         s3 = Series([1, 2**63 - 1], dtype=np.int64)
         original = DataFrame({"s0": s0, "s1": s1, "s2": s2, "s3": s3})
         original.index.name = "index"
-        with tm.ensure_clean() as path:
-            with tm.assert_produces_warning(PossiblePrecisionLoss):
-                original.to_stata(path)
+        path = temp_file
+        with tm.assert_produces_warning(PossiblePrecisionLoss, match="from int64 to"):
+            original.to_stata(path)
 
-            written_and_read_again = self.read_dta(path)
+        written_and_read_again = self.read_dta(path)
 
-        modified = original.copy()
+        modified = original
         modified["s1"] = Series(modified["s1"], dtype=np.int16)
         modified["s2"] = Series(modified["s2"], dtype=np.int32)
         modified["s3"] = Series(modified["s3"], dtype=np.float64)
-        modified.index = original.index.astype(np.int32)
         tm.assert_frame_equal(written_and_read_again.set_index("index"), modified)
 
-    def test_dates_invalid_column(self):
+    def test_dates_invalid_column(self, temp_file):
         original = DataFrame([datetime(2006, 11, 19, 23, 13, 20)])
         original.index.name = "index"
-        with tm.ensure_clean() as path:
-            with tm.assert_produces_warning(InvalidColumnName):
-                original.to_stata(path, convert_dates={0: "tc"})
+        path = temp_file
+        msg = "Not all pandas column names were valid Stata variable names"
+        with tm.assert_produces_warning(InvalidColumnName, match=msg):
+            original.to_stata(path, convert_dates={0: "tc"})
 
-            written_and_read_again = self.read_dta(path)
+        written_and_read_again = self.read_dta(path)
 
-        modified = original.copy()
-        modified.columns = ["_0"]
-        modified.index = original.index.astype(np.int32)
-        tm.assert_frame_equal(written_and_read_again.set_index("index"), modified)
+        expected = original.copy()
+        expected.columns = ["_0"]
+        expected.index = original.index.astype(np.int32)
+        expected["_0"] = expected["_0"].astype("M8[ms]")
+        tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)
 
     def test_105(self, datapath):
         # Data obtained from:
@@ -661,7 +761,7 @@ def test_value_labels_old_format(self, datapath):
         with StataReader(dpath) as reader:
             assert reader.value_labels() == {}
 
-    def test_date_export_formats(self):
+    def test_date_export_formats(self, temp_file):
         columns = ["tc", "td", "tw", "tm", "tq", "th", "ty"]
         conversions = {c: c for c in columns}
         data = [datetime(2006, 11, 20, 23, 13, 20)] * len(columns)
@@ -681,32 +781,34 @@ def test_date_export_formats(self):
             [expected_values],
             index=pd.Index([0], dtype=np.int32, name="index"),
             columns=columns,
+            dtype="M8[s]",
         )
+        expected["tc"] = expected["tc"].astype("M8[ms]")
 
-        with tm.ensure_clean() as path:
-            original.to_stata(path, convert_dates=conversions)
-            written_and_read_again = self.read_dta(path)
+        path = temp_file
+        original.to_stata(path, convert_dates=conversions)
+        written_and_read_again = self.read_dta(path)
 
         tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)
 
-    def test_write_missing_strings(self):
+    def test_write_missing_strings(self, temp_file):
         original = DataFrame([["1"], [None]], columns=["foo"])
 
         expected = DataFrame(
             [["1"], [""]],
-            index=pd.Index([0, 1], dtype=np.int32, name="index"),
+            index=pd.RangeIndex(2, name="index"),
             columns=["foo"],
         )
 
-        with tm.ensure_clean() as path:
-            original.to_stata(path)
-            written_and_read_again = self.read_dta(path)
+        path = temp_file
+        original.to_stata(path)
+        written_and_read_again = self.read_dta(path)
 
         tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)
 
     @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
     @pytest.mark.parametrize("byteorder", [">", "<"])
-    def test_bool_uint(self, byteorder, version):
+    def test_bool_uint(self, byteorder, version, temp_file):
         s0 = Series([0, 1, True], dtype=np.bool_)
         s1 = Series([0, 1, 100], dtype=np.uint8)
         s2 = Series([0, 1, 255], dtype=np.uint8)
@@ -719,8 +821,14 @@ def test_bool_uint(self, byteorder, version):
             {"s0": s0, "s1": s1, "s2": s2, "s3": s3, "s4": s4, "s5": s5, "s6": s6}
         )
         original.index.name = "index"
-        expected = original.copy()
-        expected.index = original.index.astype(np.int32)
+
+        path = temp_file
+        original.to_stata(path, byteorder=byteorder, version=version)
+        written_and_read_again = self.read_dta(path)
+
+        written_and_read_again = written_and_read_again.set_index("index")
+
+        expected = original
         expected_types = (
             np.int8,
             np.int8,
@@ -733,11 +841,6 @@ def test_bool_uint(self, byteorder, version):
         for c, t in zip(expected.columns, expected_types):
             expected[c] = expected[c].astype(t)
 
-        with tm.ensure_clean() as path:
-            original.to_stata(path, byteorder=byteorder, version=version)
-            written_and_read_again = self.read_dta(path)
-
-            written_and_read_again = written_and_read_again.set_index("index")
         tm.assert_frame_equal(written_and_read_again, expected)
 
     def test_variable_labels(self, datapath):
@@ -753,7 +856,7 @@ def test_variable_labels(self, datapath):
                 assert k in keys
                 assert v in labels
 
-    def test_minimal_size_col(self):
+    def test_minimal_size_col(self, temp_file):
         str_lens = (1, 100, 244)
         s = {}
         for str_len in str_lens:
@@ -761,16 +864,16 @@ def test_minimal_size_col(self):
                 ["a" * str_len, "b" * str_len, "c" * str_len]
             )
         original = DataFrame(s)
-        with tm.ensure_clean() as path:
-            original.to_stata(path, write_index=False)
+        path = temp_file
+        original.to_stata(path, write_index=False)
 
-            with StataReader(path) as sr:
-                sr._ensure_open()  # The `_*list` variables are initialized here
-                for variable, fmt, typ in zip(sr._varlist, sr._fmtlist, sr._typlist):
-                    assert int(variable[1:]) == int(fmt[1:-1])
-                    assert int(variable[1:]) == typ
+        with StataReader(path) as sr:
+            sr._ensure_open()  # The `_*list` variables are initialized here
+            for variable, fmt, typ in zip(sr._varlist, sr._fmtlist, sr._typlist):
+                assert int(variable[1:]) == int(fmt[1:-1])
+                assert int(variable[1:]) == typ
 
-    def test_excessively_long_string(self):
+    def test_excessively_long_string(self, temp_file):
         str_lens = (1, 244, 500)
         s = {}
         for str_len in str_lens:
@@ -785,16 +888,16 @@ def test_excessively_long_string(self):
             r"the newer \(Stata 13 and later\) format\."
         )
         with pytest.raises(ValueError, match=msg):
-            with tm.ensure_clean() as path:
-                original.to_stata(path)
+            path = temp_file
+            original.to_stata(path)
 
-    def test_missing_value_generator(self):
+    def test_missing_value_generator(self, temp_file):
         types = ("b", "h", "l")
         df = DataFrame([[0.0]], columns=["float_"])
-        with tm.ensure_clean() as path:
-            df.to_stata(path)
-            with StataReader(path) as rdr:
-                valid_range = rdr.VALID_RANGE
+        path = temp_file
+        df.to_stata(path)
+        with StataReader(path) as rdr:
+            valid_range = rdr.VALID_RANGE
         expected_values = ["." + chr(97 + i) for i in range(26)]
         expected_values.insert(0, ".")
         for t in types:
@@ -819,8 +922,8 @@ def test_missing_value_generator(self):
             )
             assert val.string == ".z"
 
-    @pytest.mark.parametrize("file", ["stata8_113", "stata8_115", "stata8_117"])
-    def test_missing_value_conversion(self, file, datapath):
+    @pytest.mark.parametrize("version", [113, 115, 117])
+    def test_missing_value_conversion(self, version, datapath):
         columns = ["int8_", "int16_", "int32_", "float32_", "float64_"]
         smv = StataMissingValue(101)
         keys = sorted(smv.MISSING_VALUES.keys())
@@ -831,11 +934,45 @@ def test_missing_value_conversion(self, file, datapath):
         expected = DataFrame(data, columns=columns)
 
         parsed = read_stata(
-            datapath("io", "data", "stata", f"{file}.dta"), convert_missing=True
+            datapath("io", "data", "stata", f"stata8_{version}.dta"),
+            convert_missing=True,
+        )
+        tm.assert_frame_equal(parsed, expected)
+
+    @pytest.mark.parametrize("version", [104, 105, 108, 110, 111])
+    def test_missing_value_conversion_compat(self, version, datapath):
+        columns = ["int8_", "int16_", "int32_", "float32_", "float64_"]
+        smv = StataMissingValue(101)
+        keys = sorted(smv.MISSING_VALUES.keys())
+        data = []
+        row = [StataMissingValue(keys[j * 27]) for j in range(5)]
+        data.append(row)
+        expected = DataFrame(data, columns=columns)
+
+        parsed = read_stata(
+            datapath("io", "data", "stata", f"stata8_{version}.dta"),
+            convert_missing=True,
+        )
+        tm.assert_frame_equal(parsed, expected)
+
+    # The byte type was not supported prior to the 104 format
+    @pytest.mark.parametrize("version", [102, 103])
+    def test_missing_value_conversion_compat_nobyte(self, version, datapath):
+        columns = ["int8_", "int16_", "int32_", "float32_", "float64_"]
+        smv = StataMissingValue(101)
+        keys = sorted(smv.MISSING_VALUES.keys())
+        data = []
+        row = [StataMissingValue(keys[j * 27]) for j in [1, 1, 2, 3, 4]]
+        data.append(row)
+        expected = DataFrame(data, columns=columns)
+
+        parsed = read_stata(
+            datapath("io", "data", "stata", f"stata8_{version}.dta"),
+            convert_missing=True,
         )
         tm.assert_frame_equal(parsed, expected)
 
-    def test_big_dates(self, datapath):
+    def test_big_dates(self, datapath, temp_file):
         yr = [1960, 2000, 9999, 100, 2262, 1677]
         mo = [1, 1, 12, 1, 4, 9]
         dd = [1, 1, 31, 1, 22, 23]
@@ -876,22 +1013,29 @@ def test_big_dates(self, datapath):
         expected[5][5] = expected[5][6] = datetime(1678, 1, 1)
 
         expected = DataFrame(expected, columns=columns, dtype=object)
+        expected["date_tc"] = expected["date_tc"].astype("M8[ms]")
+        expected["date_td"] = expected["date_td"].astype("M8[s]")
+        expected["date_tm"] = expected["date_tm"].astype("M8[s]")
+        expected["date_tw"] = expected["date_tw"].astype("M8[s]")
+        expected["date_tq"] = expected["date_tq"].astype("M8[s]")
+        expected["date_th"] = expected["date_th"].astype("M8[s]")
+        expected["date_ty"] = expected["date_ty"].astype("M8[s]")
+
         parsed_115 = read_stata(datapath("io", "data", "stata", "stata9_115.dta"))
         parsed_117 = read_stata(datapath("io", "data", "stata", "stata9_117.dta"))
-        tm.assert_frame_equal(expected, parsed_115, check_datetimelike_compat=True)
-        tm.assert_frame_equal(expected, parsed_117, check_datetimelike_compat=True)
+        tm.assert_frame_equal(expected, parsed_115)
+        tm.assert_frame_equal(expected, parsed_117)
 
         date_conversion = {c: c[-2:] for c in columns}  # {c : c[-2:] for c in columns}
-        with tm.ensure_clean() as path:
-            expected.index.name = "index"
-            expected.to_stata(path, convert_dates=date_conversion)
-            written_and_read_again = self.read_dta(path)
+        path = temp_file
+        expected.index.name = "index"
+        expected.to_stata(path, convert_dates=date_conversion)
+        written_and_read_again = self.read_dta(path)
 
         tm.assert_frame_equal(
             written_and_read_again.set_index("index"),
             expected.set_index(expected.index.astype(np.int32)),
-            check_datetimelike_compat=True,
         )
 
     def test_dtype_conversion(self, datapath):
@@ -901,9 +1045,7 @@ def test_dtype_conversion(self, datapath):
         expected["long_"] = expected["long_"].astype(np.int32)
         expected["float_"] = expected["float_"].astype(np.float32)
         expected["double_"] = expected["double_"].astype(np.float64)
-        expected["date_td"] = expected["date_td"].apply(
-            datetime.strptime, args=("%Y-%m-%d",)
-        )
+        expected["date_td"] = expected["date_td"].astype("M8[s]")
 
         no_conversion = read_stata(
             datapath("io", "data", "stata", "stata6_117.dta"), convert_dates=True
@@ -917,12 +1059,10 @@ def test_dtype_conversion(self, datapath):
         )
 
         # read_csv types are the same
-        expected = self.read_csv(datapath("io", "data", "stata", "stata6.csv"))
-        expected["date_td"] = expected["date_td"].apply(
-            datetime.strptime, args=("%Y-%m-%d",)
-        )
+        expected2 = self.read_csv(datapath("io", "data", "stata", "stata6.csv"))
+        expected2["date_td"] = expected["date_td"]
 
-        tm.assert_frame_equal(expected, conversion)
+        tm.assert_frame_equal(expected2, conversion)
 
     def test_drop_column(self, datapath):
         expected = self.read_csv(datapath("io", "data", "stata", "stata6.csv"))
@@ -957,27 +1097,25 @@ def test_drop_column(self, datapath):
 
         msg = "columns contains duplicate entries"
         with pytest.raises(ValueError, match=msg):
-            columns = ["byte_", "byte_"]
             read_stata(
                 datapath("io", "data", "stata", "stata6_117.dta"),
                 convert_dates=True,
-                columns=columns,
+                columns=["byte_", "byte_"],
             )
 
         msg = "The following columns were not found in the Stata data set: not_found"
         with pytest.raises(ValueError, match=msg):
-            columns = ["byte_", "int_", "long_", "not_found"]
             read_stata(
                 datapath("io", "data", "stata", "stata6_117.dta"),
                 convert_dates=True,
-                columns=columns,
+                columns=["byte_", "int_", "long_", "not_found"],
            )
 
     @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
     @pytest.mark.filterwarnings(
         "ignore:\\nStata value:pandas.io.stata.ValueLabelTypeMismatch"
     )
-    def test_categorical_writing(self, version):
+    def test_categorical_writing(self, version, temp_file):
         original = DataFrame.from_records(
             [
                 ["one", "ten", "one", "one", "one", 1],
@@ -1000,18 +1138,19 @@ def test_categorical_writing(self, version):
                 "unlabeled",
             ],
         )
-        expected = original.copy()
+        path = temp_file
+        original.astype("category").to_stata(path, version=version)
+        written_and_read_again = self.read_dta(path)
 
-        # these are all categoricals
-        original = pd.concat(
-            [original[col].astype("category") for col in original], axis=1
-        )
-        expected.index = expected.index.set_names("index").astype(np.int32)
+        res = written_and_read_again.set_index("index")
+
+        expected = original
+        expected.index = expected.index.set_names("index")
 
         expected["incompletely_labeled"] = expected["incompletely_labeled"].apply(str)
         expected["unlabeled"] = expected["unlabeled"].apply(str)
         for col in expected:
-            orig = expected[col].copy()
+            orig = expected[col]
 
             cat = orig.astype("category")._values
             cat = cat.as_ordered()
@@ -1022,45 +1161,21 @@ def test_categorical_writing(self, version):
 
             expected[col] = cat
 
-        with tm.ensure_clean() as path:
-            original.to_stata(path, version=version)
-            written_and_read_again = self.read_dta(path)
-
-        res = written_and_read_again.set_index("index")
         tm.assert_frame_equal(res, expected)
 
-    def test_categorical_warnings_and_errors(self):
+    def test_categorical_warnings_and_errors(self, temp_file):
         # Warning for non-string labels
-        # Error for labels too long
-        original = DataFrame.from_records(
-            [["a" * 10000], ["b" * 10000], ["c" * 10000], ["d" * 10000]],
-            columns=["Too_long"],
-        )
-
-        original = pd.concat(
-            [original[col].astype("category") for col in original], axis=1
-        )
-        with tm.ensure_clean() as path:
-            msg = (
-                "Stata value labels for a single variable must have "
-                r"a combined length less than 32,000 characters\."
-            )
-            with pytest.raises(ValueError, match=msg):
-                original.to_stata(path)
-
         original = DataFrame.from_records(
             [["a"], ["b"], ["c"], ["d"], [1]], columns=["Too_long"]
-        )
-        original = pd.concat(
-            [original[col].astype("category") for col in original], axis=1
-        )
+        ).astype("category")
 
-        with tm.assert_produces_warning(ValueLabelTypeMismatch):
-            original.to_stata(path)
+        msg = "data file created has not lost information due to duplicate labels"
+        with tm.assert_produces_warning(ValueLabelTypeMismatch, match=msg):
+            original.to_stata(temp_file)
             # should get a warning for mixed content
 
     @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
-    def test_categorical_with_stata_missing_values(self, version):
+    def test_categorical_with_stata_missing_values(self, version, temp_file):
         values = [["a" + str(i)] for i in range(120)]
         values.append([np.nan])
         original = DataFrame.from_records(values, columns=["many_labels"])
@@ -1068,19 +1183,18 @@ def test_categorical_with_stata_missing_values(self, version):
             [original[col].astype("category") for col in original], axis=1
         )
         original.index.name = "index"
-        with tm.ensure_clean() as path:
-            original.to_stata(path, version=version)
-            written_and_read_again = self.read_dta(path)
+        path = temp_file
+        original.to_stata(path, version=version)
+        written_and_read_again = self.read_dta(path)
 
         res = written_and_read_again.set_index("index")
-        expected = original.copy()
+        expected = original
         for col in expected:
             cat = expected[col]._values
             new_cats = cat.remove_unused_categories().categories
             cat = cat.set_categories(new_cats, ordered=True)
             expected[col] = cat
-        expected.index = expected.index.astype(np.int32)
         tm.assert_frame_equal(res, expected)
 
     @pytest.mark.parametrize("file", ["stata10_115", "stata10_117"])
@@ -1193,7 +1307,9 @@ def test_read_chunks_117(
                 from_frame = parsed.iloc[pos : pos + chunksize, :].copy()
                 from_frame = self._convert_categorical(from_frame)
                 tm.assert_frame_equal(
-                    from_frame, chunk, check_dtype=False, check_datetimelike_compat=True
+                    from_frame,
+                    chunk,
+                    check_dtype=False,
                 )
                 pos += chunksize
 
@@ -1209,6 +1325,10 @@ def _convert_categorical(from_frame: DataFrame) -> DataFrame:
                 if cat.categories.dtype == object:
                     categories = pd.Index._with_infer(cat.categories._values)
                     cat = cat.set_categories(categories)
+                elif cat.categories.dtype == "string" and len(cat.categories) == 0:
+                    # if the read categories are empty, it comes back as object dtype
+                    categories = cat.categories.astype(object)
+                    cat = cat.set_categories(categories)
                 from_frame[col] = cat
         return from_frame
 
@@ -1285,7 +1405,9 @@ def test_read_chunks_115(
                 from_frame = parsed.iloc[pos : pos + chunksize, :].copy()
                 from_frame = self._convert_categorical(from_frame)
                 tm.assert_frame_equal(
-                    from_frame, chunk, check_dtype=False, check_datetimelike_compat=True
+                    from_frame,
+                    chunk,
+                    check_dtype=False,
                 )
                 pos += chunksize
 
@@ -1306,55 +1428,51 @@ def test_read_chunks_columns(self, datapath):
                 pos += chunksize
 
     @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
-    def test_write_variable_labels(self, version, mixed_frame):
+    def test_write_variable_labels(self, version, mixed_frame, temp_file):
         # GH 13631, add support for writing variable labels
         mixed_frame.index.name = "index"
         variable_labels = {"a": "City Rank", "b": "City Exponent", "c": "City"}
-        with tm.ensure_clean() as path:
-            mixed_frame.to_stata(path, variable_labels=variable_labels, version=version)
-            with StataReader(path) as sr:
-                read_labels = sr.variable_labels()
-            expected_labels = {
-                "index": "",
-                "a": "City Rank",
-                "b": "City Exponent",
-                "c": "City",
-            }
-            assert read_labels == expected_labels
+        path = temp_file
+        mixed_frame.to_stata(path, variable_labels=variable_labels, version=version)
+        with StataReader(path) as sr:
+            read_labels = sr.variable_labels()
+        expected_labels = {
+            "index": "",
+            "a": "City Rank",
+            "b": "City Exponent",
+            "c": "City",
+        }
+        assert read_labels == expected_labels
 
         variable_labels["index"] = "The Index"
-        with tm.ensure_clean() as path:
-            mixed_frame.to_stata(path, variable_labels=variable_labels, version=version)
-            with StataReader(path) as sr:
-                read_labels = sr.variable_labels()
-            assert read_labels == variable_labels
+        path = temp_file
+        mixed_frame.to_stata(path, variable_labels=variable_labels, version=version)
+        with StataReader(path) as sr:
+            read_labels = sr.variable_labels()
+        assert read_labels == variable_labels
 
     @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
-    def test_invalid_variable_labels(self, version, mixed_frame):
+    def test_invalid_variable_labels(self, version, mixed_frame, temp_file):
         mixed_frame.index.name = "index"
         variable_labels = {"a": "very long" * 10, "b": "City Exponent", "c": "City"}
-        with tm.ensure_clean() as path:
-            msg = "Variable labels must be 80 characters or fewer"
-            with pytest.raises(ValueError, match=msg):
-                mixed_frame.to_stata(
-                    path, variable_labels=variable_labels, version=version
-                )
+        path = temp_file
+        msg = "Variable labels must be 80 characters or fewer"
+        with pytest.raises(ValueError, match=msg):
+            mixed_frame.to_stata(path, variable_labels=variable_labels, version=version)
 
     @pytest.mark.parametrize("version", [114, 117])
-    def test_invalid_variable_label_encoding(self, version, mixed_frame):
+    def test_invalid_variable_label_encoding(self, version, mixed_frame, temp_file):
         mixed_frame.index.name = "index"
         variable_labels = {"a": "very long" * 10, "b": "City Exponent", "c": "City"}
         variable_labels["a"] = "invalid character Œ"
-        with tm.ensure_clean() as path:
-            with pytest.raises(
-                ValueError, match="Variable labels must contain only characters"
-            ):
-                mixed_frame.to_stata(
-                    path, variable_labels=variable_labels, version=version
-                )
+        path = temp_file
+        with pytest.raises(
+            ValueError, match="Variable labels must contain only characters"
+        ):
+            mixed_frame.to_stata(path, variable_labels=variable_labels, version=version)
 
-    def test_write_variable_label_errors(self, mixed_frame):
-        values = ["\u03A1", "\u0391", "\u039D", "\u0394", "\u0391", "\u03A3"]
+    def test_write_variable_label_errors(self, mixed_frame, temp_file):
+        values = ["\u03a1", "\u0391", "\u039d", "\u0394", "\u0391", "\u03a3"]
 
         variable_labels_utf8 = {
             "a": "City Rank",
@@ -1367,8 +1485,8 @@ def test_write_variable_label_errors(self, mixed_frame):
             "encoded in Latin-1"
         )
         with pytest.raises(ValueError, match=msg):
-            with tm.ensure_clean() as path:
-                mixed_frame.to_stata(path, variable_labels=variable_labels_utf8)
+            path = temp_file
+            mixed_frame.to_stata(path, variable_labels=variable_labels_utf8)
 
         variable_labels_long = {
             "a": "City Rank",
@@ -1380,10 +1498,10 @@ def test_write_variable_label_errors(self, mixed_frame):
 
         msg = "Variable labels must be 80 characters or fewer"
         with pytest.raises(ValueError, match=msg):
-            with tm.ensure_clean() as path:
-                mixed_frame.to_stata(path, variable_labels=variable_labels_long)
+            path = temp_file
+            mixed_frame.to_stata(path, variable_labels=variable_labels_long)
 
-    def test_default_date_conversion(self):
+    def test_default_date_conversion(self, temp_file):
         # GH 12259
         dates = [
             dt.datetime(1999, 12, 31, 12, 12, 12, 12000),
@@ -1398,29 +1516,33 @@ def test_default_date_conversion(self):
             }
         )
 
-        with tm.ensure_clean() as path:
-            original.to_stata(path, write_index=False)
-            reread = read_stata(path, convert_dates=True)
-            tm.assert_frame_equal(original, reread)
+        expected = original[:]
+        # "tc" for convert_dates below stores with "ms" resolution
+        expected["dates"] = expected["dates"].astype("M8[ms]")
 
-            original.to_stata(path, write_index=False, convert_dates={"dates": "tc"})
-            direct = read_stata(path, convert_dates=True)
-            tm.assert_frame_equal(reread, direct)
+        path = temp_file
+        original.to_stata(path, write_index=False)
+        reread = read_stata(path, convert_dates=True)
+        tm.assert_frame_equal(expected, reread)
 
-            dates_idx = original.columns.tolist().index("dates")
-            original.to_stata(path, write_index=False, convert_dates={dates_idx: "tc"})
-            direct = read_stata(path, convert_dates=True)
-            tm.assert_frame_equal(reread, direct)
+        original.to_stata(path, write_index=False, convert_dates={"dates": "tc"})
+        direct = read_stata(path, convert_dates=True)
+        tm.assert_frame_equal(reread, direct)
 
-    def test_unsupported_type(self):
+        dates_idx = original.columns.tolist().index("dates")
+        original.to_stata(path, write_index=False, convert_dates={dates_idx: "tc"})
+        direct = read_stata(path, convert_dates=True)
+        tm.assert_frame_equal(reread, direct)
+
+    def test_unsupported_type(self, temp_file):
         original = DataFrame({"a": [1 + 2j, 2 + 4j]})
 
         msg = "Data type complex128 not supported"
         with pytest.raises(NotImplementedError, match=msg):
-            with tm.ensure_clean() as path:
-                original.to_stata(path)
+            path = temp_file
+            original.to_stata(path)
 
-    def test_unsupported_datetype(self):
+    def test_unsupported_datetype(self, temp_file):
         dates = [
             dt.datetime(1999, 12, 31, 12, 12, 12, 12000),
             dt.datetime(2012, 12, 21, 12, 21, 12, 21000),
@@ -1436,8 +1558,8 @@ def test_unsupported_datetype(self):
 
         msg = "Format %tC not implemented"
         with pytest.raises(NotImplementedError, match=msg):
-            with tm.ensure_clean() as path:
-                original.to_stata(path, convert_dates={"dates": "tC"})
+            path = temp_file
+            original.to_stata(path, convert_dates={"dates": "tC"})
 
         dates = pd.date_range("1-1-1990", periods=3, tz="Asia/Hong_Kong")
         original = DataFrame(
@@ -1448,8 +1570,8 @@ def test_unsupported_datetype(self):
             }
         )
         with pytest.raises(NotImplementedError, match="Data type datetime64"):
-            with tm.ensure_clean() as path:
-                original.to_stata(path)
+            path = temp_file
+            original.to_stata(path)
 
     def test_repeated_column_labels(self, datapath):
         # GH 13923, 25772
@@ -1485,7 +1607,7 @@ def test_stata_111(self, datapath):
         original = original[["y", "x", "w", "z"]]
         tm.assert_frame_equal(original, df)
 
-    def test_out_of_range_double(self):
+    def test_out_of_range_double(self, temp_file):
         # GH 14618
         df = DataFrame(
             {
@@ -1498,10 +1620,10 @@ def test_out_of_range_double(self):
             r"supported by Stata \(.+\)"
         )
         with pytest.raises(ValueError, match=msg):
-            with tm.ensure_clean() as path:
-                df.to_stata(path)
+            path = temp_file
+            df.to_stata(path)
 
-    def test_out_of_range_float(self):
+    def test_out_of_range_float(self, temp_file):
         original = DataFrame(
             {
                 "ColumnOk": [
@@ -1520,17 +1642,16 @@ def test_out_of_range_float(self):
         for col in original:
             original[col] = original[col].astype(np.float32)
 
-        with tm.ensure_clean() as path:
-            original.to_stata(path)
-            reread = read_stata(path)
+        path = temp_file
+        original.to_stata(path)
+        reread = read_stata(path)
 
         original["ColumnTooBig"] = original["ColumnTooBig"].astype(np.float64)
-        expected = original.copy()
-        expected.index = expected.index.astype(np.int32)
+        expected = original
        tm.assert_frame_equal(reread.set_index("index"), expected)
 
     @pytest.mark.parametrize("infval", [np.inf, -np.inf])
-    def test_inf(self, infval):
+    def test_inf(self, infval, temp_file):
         # GH 45350
         df = DataFrame({"WithoutInf": [0.0, 1.0], "WithInf": [2.0, infval]})
         msg = (
@@ -1538,55 +1659,44 @@ def test_inf(self, infval):
             "which is outside the range supported by Stata."
         )
         with pytest.raises(ValueError, match=msg):
-            with tm.ensure_clean() as path:
-                df.to_stata(path)
+            path = temp_file
+            df.to_stata(path)
 
     def test_path_pathlib(self):
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=pd.Index(list("ABCD"), dtype=object),
-            index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=pd.Index(list("ABCD")),
+            index=pd.Index([f"i-{i}" for i in range(30)]),
         )
         df.index.name = "index"
         reader = lambda x: read_stata(x).set_index("index")
         result = tm.round_trip_pathlib(df.to_stata, reader)
         tm.assert_frame_equal(df, result)
 
-    def test_pickle_path_localpath(self):
-        df = DataFrame(
-            1.1 * np.arange(120).reshape((30, 4)),
-            columns=pd.Index(list("ABCD"), dtype=object),
-            index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
-        )
-        df.index.name = "index"
-        reader = lambda x: read_stata(x).set_index("index")
-        result = tm.round_trip_localpath(df.to_stata, reader)
-        tm.assert_frame_equal(df, result)
-
     @pytest.mark.parametrize("write_index", [True, False])
-    def test_value_labels_iterator(self, write_index):
+    def test_value_labels_iterator(self, write_index, temp_file):
         # GH 16923
         d = {"A": ["B", "E", "C", "A", "E"]}
         df = DataFrame(data=d)
         df["A"] = df["A"].astype("category")
-        with tm.ensure_clean() as path:
-            df.to_stata(path, write_index=write_index)
+        path = temp_file
+        df.to_stata(path, write_index=write_index)
 
-            with read_stata(path, iterator=True) as dta_iter:
-                value_labels = dta_iter.value_labels()
+        with read_stata(path, iterator=True) as dta_iter:
+            value_labels = dta_iter.value_labels()
         assert value_labels == {"A": {0: "A", 1: "B", 2: "C", 3: "E"}}
 
-    def test_set_index(self):
+    def test_set_index(self, temp_file):
         # GH 17328
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=pd.Index(list("ABCD"), dtype=object),
-            index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=pd.Index(list("ABCD")),
+            index=pd.Index([f"i-{i}" for i in range(30)]),
         )
         df.index.name = "index"
-        with tm.ensure_clean() as path:
-            df.to_stata(path)
-            reread = read_stata(path, index_col="index")
+        path = temp_file
+        df.to_stata(path)
+        reread = read_stata(path, index_col="index")
         tm.assert_frame_equal(df, reread)
 
     @pytest.mark.parametrize(
@@ -1609,7 +1719,8 @@ def test_date_parsing_ignores_format_details(self, column, datapath):
         formatted = df.loc[0, column + "_fmt"]
         assert unformatted == formatted
 
-    def test_writer_117(self):
+    @pytest.mark.parametrize("byteorder", ["little", "big"])
+    def test_writer_117(self, byteorder, temp_file, using_infer_string):
         original = DataFrame(
             data=[
                 [
@@ -1662,40 +1773,47 @@ def test_writer_117(self):
         original["int32"] = original["int32"].astype(np.int32)
         original["float32"] = Series(original["float32"], dtype=np.float32)
         original.index.name = "index"
-        original.index = original.index.astype(np.int32)
         copy = original.copy()
-        with tm.ensure_clean() as path:
-            original.to_stata(
-                path,
-                convert_dates={"datetime": "tc"},
-                convert_strl=["forced_strl"],
-                version=117,
-            )
-            written_and_read_again = self.read_dta(path)
-            # original.index is np.int32, read index is np.int64
-            tm.assert_frame_equal(
-                written_and_read_again.set_index("index"),
-                original,
-                check_index_type=False,
-            )
-            tm.assert_frame_equal(original, copy)
+        path = temp_file
+        original.to_stata(
+            path,
+            convert_dates={"datetime": "tc"},
+            byteorder=byteorder,
+            convert_strl=["forced_strl"],
+            version=117,
+        )
+        written_and_read_again = self.read_dta(path)
+
+        expected = original[:]
+        # "tc" for convert_dates means we store with "ms" resolution
+        expected["datetime"] = expected["datetime"].astype("M8[ms]")
+        if using_infer_string:
+            # object dtype (with only strings/None) comes back as string dtype
+            expected["object"] = expected["object"].astype("str")
+
+        tm.assert_frame_equal(
+            written_and_read_again.set_index("index"),
+            expected,
+        )
+        tm.assert_frame_equal(original, copy)
 
-    def test_convert_strl_name_swap(self):
+    def test_convert_strl_name_swap(self, temp_file):
         original = DataFrame(
             [["a" * 3000, "A", "apple"], ["b" * 1000, "B", "banana"]],
             columns=["long1" * 10, "long", 1],
         )
         original.index.name = "index"
 
-        with tm.assert_produces_warning(InvalidColumnName):
-            with tm.ensure_clean() as path:
-                original.to_stata(path, convert_strl=["long", 1], version=117)
-                reread = self.read_dta(path)
-                reread = reread.set_index("index")
-                reread.columns = original.columns
-                tm.assert_frame_equal(reread, original, check_index_type=False)
+        msg = "Not all pandas column names were valid Stata variable names"
+        with tm.assert_produces_warning(InvalidColumnName, match=msg):
+            path = temp_file
+            original.to_stata(path, convert_strl=["long", 1], version=117)
+            reread = self.read_dta(path)
+            reread = reread.set_index("index")
+            reread.columns = original.columns
+            tm.assert_frame_equal(reread, original, check_index_type=False)
 
-    def test_invalid_date_conversion(self):
+    def test_invalid_date_conversion(self, temp_file):
         # GH 12259
         dates = [
             dt.datetime(1999, 12, 31, 12, 12, 12, 12000),
@@ -1710,46 +1828,56 @@ def test_invalid_date_conversion(self):
             }
         )
 
-        with tm.ensure_clean() as path:
-            msg = "convert_dates key must be a column or an integer"
-            with pytest.raises(ValueError, match=msg):
-                original.to_stata(path, convert_dates={"wrong_name": "tc"})
+        path = temp_file
+        msg = "convert_dates key must be a column or an integer"
+        with pytest.raises(ValueError, match=msg):
+            original.to_stata(path, convert_dates={"wrong_name": "tc"})
 
     @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
-    def test_nonfile_writing(self, version):
+    def test_nonfile_writing(self, version, temp_file):
         # GH 21041
         bio = io.BytesIO()
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=pd.Index(list("ABCD"), dtype=object),
-            index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=pd.Index(list("ABCD")),
+            index=pd.Index([f"i-{i}" for i in range(30)]),
         )
         df.index.name = "index"
-        with tm.ensure_clean() as path:
-            df.to_stata(bio, version=version)
-            bio.seek(0)
-            with open(path, "wb") as dta:
-                dta.write(bio.read())
-            reread = read_stata(path, index_col="index")
+        path = temp_file
+        df.to_stata(bio, version=version)
+        bio.seek(0)
+        with open(path, "wb") as dta:
+            dta.write(bio.read())
+        reread = read_stata(path, index_col="index")
         tm.assert_frame_equal(df, reread)
 
-    def test_gzip_writing(self):
+    def test_gzip_writing(self, temp_file):
         # writing version 117 requires seek and cannot be used with gzip
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=pd.Index(list("ABCD"), dtype=object),
-            index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=pd.Index(list("ABCD")),
+            index=pd.Index([f"i-{i}" for i in range(30)]),
        )
         df.index.name = "index"
-        with tm.ensure_clean() as path:
-            with gzip.GzipFile(path, "wb") as gz:
-                df.to_stata(gz, version=114)
-            with gzip.GzipFile(path, "rb") as gz:
-                reread = read_stata(gz, index_col="index")
+        path = temp_file
+        with gzip.GzipFile(path, "wb") as gz:
+            df.to_stata(gz, version=114)
+        with gzip.GzipFile(path, "rb") as gz:
+            reread = read_stata(gz, index_col="index")
         tm.assert_frame_equal(df, reread)
 
-    def test_unicode_dta_118(self, datapath):
-        unicode_df = self.read_dta(datapath("io", "data", "stata", "stata16_118.dta"))
+    # 117 is not included in this list as it uses ASCII strings
+    @pytest.mark.parametrize(
+        "file",
+        [
+            "stata16_118",
+            "stata16_be_118",
+            "stata16_119",
+            "stata16_be_119",
+        ],
+    )
+    def test_unicode_dta_118_119(self, file, datapath):
+        unicode_df = self.read_dta(datapath("io", "data", "stata", f"{file}.dta"))
 
         columns = ["utf8", "latin1", "ascii", "utf8_strl", "ascii_strl"]
         values = [
@@ -1765,70 +1893,67 @@ def test_unicode_dta_118(self, datapath):
 
         tm.assert_frame_equal(unicode_df, expected)
 
-    def test_mixed_string_strl(self):
+    def test_mixed_string_strl(self, temp_file, using_infer_string):
         # GH 23633
         output = [{"mixed": "string" * 500, "number": 0}, {"mixed": None, "number": 1}]
         output = DataFrame(output)
         output.number = output.number.astype("int32")
 
-        with tm.ensure_clean() as path:
-            output.to_stata(path, write_index=False, version=117)
-            reread = read_stata(path)
-            expected = output.fillna("")
-            tm.assert_frame_equal(reread, expected)
+        path = temp_file
+        output.to_stata(path, write_index=False, version=117)
+        reread = read_stata(path)
+        expected = output.fillna("")
+        tm.assert_frame_equal(reread, expected)
 
-            # Check strl supports all None (null)
-            output["mixed"] = None
-            output.to_stata(
-                path, write_index=False, convert_strl=["mixed"], version=117
-            )
-            reread = read_stata(path)
-            expected = output.fillna("")
-            tm.assert_frame_equal(reread, expected)
+        # Check strl supports all None (null)
+        output["mixed"] = None
+        output.to_stata(path, write_index=False, convert_strl=["mixed"], version=117)
+        reread = read_stata(path)
+        expected = output.fillna("")
+        if using_infer_string:
+            expected["mixed"] = expected["mixed"].astype("str")
+        tm.assert_frame_equal(reread, expected)
 
     @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
-    def test_all_none_exception(self, version):
+    def test_all_none_exception(self, version, temp_file):
         output = [{"none": "none", "number": 0}, {"none": None, "number": 1}]
         output = DataFrame(output)
         output["none"] = None
-        with tm.ensure_clean() as path:
-            with pytest.raises(ValueError, match="Column `none` cannot be exported"):
-                output.to_stata(path, version=version)
+        with pytest.raises(ValueError, match="Column `none` cannot be exported"):
+            output.to_stata(temp_file, version=version)
 
     @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
-    def test_invalid_file_not_written(self, version):
+    def test_invalid_file_not_written(self, version, temp_file):
         content = "Here is one __�__ Another one __·__ Another one __½__"
         df = DataFrame([content], columns=["invalid"])
-        with tm.ensure_clean() as path:
-            msg1 = (
-                r"'latin-1' codec can't encode character '\\ufffd' "
-                r"in position 14: ordinal not in range\(256\)"
-            )
-            msg2 = (
-                "'ascii' codec can't decode byte 0xef in position 14: "
-                r"ordinal not in range\(128\)"
-            )
-            with pytest.raises(UnicodeEncodeError, match=f"{msg1}|{msg2}"):
-                df.to_stata(path)
+        msg1 = (
+            r"'latin-1' codec can't encode character '\\ufffd' "
+            r"in position 14: ordinal not in range\(256\)"
+        )
+        msg2 = (
+            "'ascii' codec can't decode byte 0xef in position 14: "
+            r"ordinal not in range\(128\)"
+        )
        with pytest.raises(UnicodeEncodeError, match=f"{msg1}|{msg2}"):
            df.to_stata(temp_file)
 
-    def test_strl_latin1(self):
+    def test_strl_latin1(self, temp_file):
         # GH 23573, correct GSO data to reflect correct size
         output = DataFrame(
             [["pandas"] * 2, ["þâÑÐŧ"] * 2], columns=["var_str", "var_strl"]
         )
 
-        with tm.ensure_clean() as path:
-            output.to_stata(path, version=117, convert_strl=["var_strl"])
-            with open(path, "rb") as reread:
-                content = reread.read()
-                expected = "þâÑÐŧ"
-                assert expected.encode("latin-1") in content
-                assert expected.encode("utf-8") in content
-                gsos = content.split(b"strls")[1][1:-2]
-                for gso in gsos.split(b"GSO")[1:]:
-                    val = gso.split(b"\x00")[-2]
-                    size = gso[gso.find(b"\x82") + 1]
-                    assert len(val) == size - 1
+        output.to_stata(temp_file, version=117, convert_strl=["var_strl"])
+        with open(temp_file, "rb") as reread:
+            content = reread.read()
+            expected = "þâÑÐŧ"
+            assert expected.encode("latin-1") in content
+            assert expected.encode("utf-8") in content
+            gsos = content.split(b"strls")[1][1:-2]
+            for gso in gsos.split(b"GSO")[1:]:
+                val = gso.split(b"\x00")[-2]
+                size = gso[gso.find(b"\x82") + 1]
+                assert len(val) == size - 1
 
     def test_encoding_latin1_118(self, datapath):
         # GH 25960
@@ -1863,7 +1988,8 @@ def test_stata_119(self, datapath):
             assert reader._nvar == 32999
 
     @pytest.mark.parametrize("version", [118, 119, None])
-    def test_utf8_writer(self, version):
+    @pytest.mark.parametrize("byteorder", ["little", "big"])
+    def test_utf8_writer(self, version, byteorder, temp_file):
         cat = pd.Categorical(["a", "β", "ĉ"], ordered=True)
         data = DataFrame(
             [
@@ -1884,45 +2010,125 @@ def test_utf8_writer(self, version):
         data_label = "ᴅaᵀa-label"
         value_labels = {"β": {1: "label", 2: "æøå", 3: "ŋot valid latin-1"}}
         data["β"] = data["β"].astype(np.int32)
-        with tm.ensure_clean() as path:
-            writer = StataWriterUTF8(
-                path,
-                data,
-                data_label=data_label,
-                convert_strl=["strls"],
-                variable_labels=variable_labels,
-                write_index=False,
-                version=version,
-                value_labels=value_labels,
-            )
-            writer.write_file()
-            reread_encoded = read_stata(path)
-            # Missing is intentionally converted to empty strl
-            data["strls"] = data["strls"].fillna("")
-            # Variable with value labels is reread as categorical
-            data["β"] = (
-                data["β"].replace(value_labels["β"]).astype("category").cat.as_ordered()
-            )
-            tm.assert_frame_equal(data, reread_encoded)
-            with StataReader(path) as reader:
-                assert reader.data_label == data_label
-                assert reader.variable_labels() == variable_labels
+        writer = StataWriterUTF8(
+            temp_file,
+            data,
+            data_label=data_label,
+            convert_strl=["strls"],
+            variable_labels=variable_labels,
+            write_index=False,
+            byteorder=byteorder,
+            version=version,
+            value_labels=value_labels,
+        )
+        writer.write_file()
+        reread_encoded = read_stata(temp_file)
+        # Missing is intentionally converted to empty strl
+        data["strls"] = data["strls"].fillna("")
+        # Variable with value labels is reread as categorical
+        data["β"] = (
+            data["β"].replace(value_labels["β"]).astype("category").cat.as_ordered()
+        )
+        tm.assert_frame_equal(data, reread_encoded)
+        with StataReader(temp_file) as reader:
+            assert reader.data_label == data_label
+            assert reader.variable_labels() == variable_labels
 
-            data.to_stata(path, version=version, write_index=False)
-            reread_to_stata = read_stata(path)
-            tm.assert_frame_equal(data, reread_to_stata)
+        data.to_stata(temp_file, version=version, write_index=False)
+        reread_to_stata = read_stata(temp_file)
+        tm.assert_frame_equal(data, reread_to_stata)
 
-    def test_writer_118_exceptions(self):
+    def test_writer_118_exceptions(self, temp_file):
         df = DataFrame(np.zeros((1, 33000), dtype=np.int8))
-        with tm.ensure_clean() as path:
-            with pytest.raises(ValueError, match="version must be either 118 or 119."):
-                StataWriterUTF8(path, df, version=117)
-        with tm.ensure_clean() as path:
-            with pytest.raises(ValueError, match="You must use version 119"):
-                StataWriterUTF8(path, df, version=118)
+        with pytest.raises(ValueError, match="version must be either 118 or 119."):
+            StataWriterUTF8(temp_file, df, version=117)
+        with pytest.raises(ValueError, match="You must use version 119"):
+            StataWriterUTF8(temp_file, df, version=118)
+
+    @pytest.mark.parametrize(
+        "dtype_backend",
+        ["numpy_nullable", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow"))],
+    )
+    def test_read_write_ea_dtypes(self, dtype_backend, temp_file, tmp_path):
+        df = DataFrame(
+            {
+                "a": [1, 2, None],
+                "b": ["a", "b", "c"],
+                "c": [True, False, None],
+                "d": [1.5, 2.5, 3.5],
+                "e": pd.date_range("2020-12-31", periods=3, freq="D"),
+            },
+            index=pd.Index([0, 1, 2], name="index"),
+        )
+        df = df.convert_dtypes(dtype_backend=dtype_backend)
+        stata_path = tmp_path / "test_stata.dta"
+        df.to_stata(stata_path, version=118)
+
+        df.to_stata(temp_file)
+        written_and_read_again = self.read_dta(temp_file)
+
+        expected = DataFrame(
+            {
+                "a": [1, 2, np.nan],
+                "b": ["a", "b", "c"],
+                "c": [1.0, 0, np.nan],
+                "d": [1.5, 2.5, 3.5],
+                # stata stores with ms unit, so unit does not round-trip exactly
+                "e": pd.date_range("2020-12-31", periods=3, freq="D", unit="ms"),
+            },
+            index=pd.RangeIndex(range(3), name="index"),
+        )
+
+        tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)
+
+    @pytest.mark.parametrize("version", [113, 114, 115, 117, 118, 119])
+    def test_read_data_int_validranges(self, version, datapath):
+        expected = DataFrame(
+            {
+                "byte": np.array([-127, 100], dtype=np.int8),
+                "int": np.array([-32767, 32740], dtype=np.int16),
+                "long": np.array([-2147483647, 2147483620], dtype=np.int32),
+            }
+        )
+
+        parsed = read_stata(
+            datapath("io", "data", "stata", f"stata_int_validranges_{version}.dta")
+        )
+        tm.assert_frame_equal(parsed, expected)
+
+    @pytest.mark.parametrize("version", [104, 105, 108, 110, 111])
+    def test_read_data_int_validranges_compat(self, version, datapath):
+        expected = DataFrame(
+            {
+                "byte": np.array([-128, 126], dtype=np.int8),
+                "int": np.array([-32768, 32766], dtype=np.int16),
+                "long": np.array([-2147483648, 2147483646], dtype=np.int32),
+            }
+        )
+
+        parsed = read_stata(
+            datapath("io", "data", "stata", f"stata_int_validranges_{version}.dta")
+        )
+        tm.assert_frame_equal(parsed, expected)
+
+    # The byte type was not supported prior to the 104 format
+    @pytest.mark.parametrize("version", [102, 103])
+    def test_read_data_int_validranges_compat_nobyte(self, version, datapath):
+        expected = DataFrame(
+            {
+                "byte": np.array([-128, 126], dtype=np.int16),
+                "int": np.array([-32768, 32766], dtype=np.int16),
+                "long": np.array([-2147483648, 2147483646], dtype=np.int32),
+            }
+        )
+
+        parsed = read_stata(
+            datapath("io", "data", "stata", f"stata_int_validranges_{version}.dta")
+        )
+        tm.assert_frame_equal(parsed, expected)
 
-@pytest.mark.parametrize("version", [105, 108, 111, 113, 114])
+
+@pytest.mark.parametrize("version", [105, 108, 110, 111, 113, 114])
 def test_backward_compat(version, datapath):
     data_base = datapath("io", "data", "stata")
     ref = os.path.join(data_base, "stata-compat-118.dta")
@@ -1932,6 +2138,52 @@ def test_backward_compat(version, datapath):
     tm.assert_frame_equal(old_dta, expected, check_dtype=False)
 
 
+@pytest.mark.parametrize("version", [103, 104])
+def test_backward_compat_nodateconversion(version, datapath):
+    # The Stata data format prior to 105 did not support a date format
+    # so read the raw values for comparison
+    data_base = datapath("io", "data", "stata")
+    ref = os.path.join(data_base, "stata-compat-118.dta")
+    old = os.path.join(data_base, f"stata-compat-{version}.dta")
+    expected = read_stata(ref, convert_dates=False)
+    old_dta = read_stata(old, convert_dates=False)
+    tm.assert_frame_equal(old_dta, expected, check_dtype=False)
+
+
+@pytest.mark.parametrize("version", [102])
+def test_backward_compat_nostring(version, datapath):
+    # The Stata data format prior to 105 did not support a date format
+    # so read the raw values for comparison
+    ref = datapath("io", "data", "stata", "stata-compat-118.dta")
+    old = datapath("io", "data", "stata", f"stata-compat-{version}.dta")
+    expected = read_stata(ref, convert_dates=False)
+    # The Stata data format prior to 103 did not support string data
+    expected = expected.drop(columns=["s10"])
+    old_dta = read_stata(old, convert_dates=False)
+    tm.assert_frame_equal(old_dta, expected, check_dtype=False)
+
+
+@pytest.mark.parametrize("version", [105, 108, 110, 111, 113, 114, 118])
+def test_bigendian(version, datapath):
+    ref = datapath("io", "data", "stata", f"stata-compat-{version}.dta")
+    big = datapath("io", "data", "stata", f"stata-compat-be-{version}.dta")
+    expected = read_stata(ref)
+    big_dta = read_stata(big)
+    tm.assert_frame_equal(big_dta, expected)
+
+
+# Note: 102 format does not support big-endian byte order
+@pytest.mark.parametrize("version", [103, 104])
+def test_bigendian_nodateconversion(version, datapath):
+    # The Stata data format prior to 105 did not support a date format
+    # so read the raw values for comparison
+    ref = datapath("io", "data", "stata", f"stata-compat-{version}.dta")
+    big = datapath("io", "data", "stata", f"stata-compat-be-{version}.dta")
+    expected = read_stata(ref, convert_dates=False)
+    big_dta = read_stata(big, convert_dates=False)
+    tm.assert_frame_equal(big_dta, expected)
+
+
 def test_direct_read(datapath, monkeypatch):
     file_path = datapath("io", "data", "stata", "stata-compat-118.dta")
 
@@ -1955,25 +2207,12 @@ def test_direct_read(datapath, monkeypatch):
         assert reader._path_or_buf is bio
 
 
-def test_statareader_warns_when_used_without_context(datapath):
-    file_path = datapath("io", "data", "stata", "stata-compat-118.dta")
-    with tm.assert_produces_warning(
-        ResourceWarning,
-        match="without using a context manager",
-    ):
-        sr = StataReader(file_path)
-        sr.read()
-    with tm.assert_produces_warning(
-        FutureWarning,
-        match="is not part of the public API",
-    ):
-        sr.close()
-
-
 @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
 @pytest.mark.parametrize("use_dict", [True, False])
 @pytest.mark.parametrize("infer", [True, False])
-def test_compression(compression, version, use_dict, infer, compression_to_extension):
+def test_compression(
+    compression, version, use_dict, infer, compression_to_extension, tmp_path
+):
     file_name = "dta_inferred_compression.dta"
     if compression:
         if use_dict:
@@ -1991,87 +2230,88 @@ def test_compression(compression, version, use_dict, infer, compression_to_exten
         np.random.default_rng(2).standard_normal((10, 2)), columns=list("AB")
     )
     df.index.name = "index"
-    with tm.ensure_clean(file_name) as path:
-        df.to_stata(path, version=version, compression=compression_arg)
-        if compression == "gzip":
-            with gzip.open(path, "rb") as comp:
-                fp = io.BytesIO(comp.read())
-        elif compression == "zip":
-            with zipfile.ZipFile(path, "r") as comp:
-                fp = io.BytesIO(comp.read(comp.filelist[0]))
-        elif compression == "tar":
-            with tarfile.open(path) as tar:
-                fp = io.BytesIO(tar.extractfile(tar.getnames()[0]).read())
-        elif compression == "bz2":
-            with bz2.open(path, "rb") as comp:
-                fp = io.BytesIO(comp.read())
-        elif compression == "zstd":
-            zstd = pytest.importorskip("zstandard")
-            with zstd.open(path, "rb") as comp:
-                fp = io.BytesIO(comp.read())
-        elif compression == "xz":
-            lzma = pytest.importorskip("lzma")
-            with lzma.open(path, "rb") as comp:
-                fp = io.BytesIO(comp.read())
-        elif compression is None:
-            fp = path
-        reread = read_stata(fp, index_col="index")
-
-        expected = df.copy()
-        expected.index = expected.index.astype(np.int32)
+    path = tmp_path / file_name
+    path.touch()
+    df.to_stata(path, version=version, compression=compression_arg)
+    if compression == "gzip":
+        with gzip.open(path, "rb") as comp:
+            fp = io.BytesIO(comp.read())
+    elif compression == "zip":
+        with zipfile.ZipFile(path, "r") as comp:
+            fp = io.BytesIO(comp.read(comp.filelist[0]))
+    elif compression == "tar":
+        with tarfile.open(path) as tar:
+            fp = io.BytesIO(tar.extractfile(tar.getnames()[0]).read())
+    elif compression == "bz2":
+        with bz2.open(path, "rb") as comp:
+            fp = io.BytesIO(comp.read())
+    elif compression == "zstd":
+        zstd = pytest.importorskip("zstandard")
+        with zstd.open(path, "rb") as comp:
+            fp = io.BytesIO(comp.read())
+    elif compression == "xz":
+        lzma = pytest.importorskip("lzma")
+        with lzma.open(path, "rb") as comp:
+            fp = io.BytesIO(comp.read())
+    elif compression is None:
+        fp = path
+    reread = read_stata(fp, index_col="index")
+
+    expected = df
     tm.assert_frame_equal(reread, expected)
 
 
 @pytest.mark.parametrize("method", ["zip", "infer"])
 @pytest.mark.parametrize("file_ext", [None, "dta", "zip"])
-def test_compression_dict(method, file_ext):
+def test_compression_dict(method, file_ext, tmp_path):
     file_name = f"test.{file_ext}"
     archive_name = "test.dta"
     df = DataFrame(
         np.random.default_rng(2).standard_normal((10, 2)), columns=list("AB")
     )
     df.index.name = "index"
-    with tm.ensure_clean(file_name) as path:
-        compression = {"method": method, "archive_name": archive_name}
-        df.to_stata(path, compression=compression)
-        if method == "zip" or file_ext == "zip":
-            with zipfile.ZipFile(path, "r") as zp:
-                assert len(zp.filelist) == 1
-                assert zp.filelist[0].filename == archive_name
-                fp = io.BytesIO(zp.read(zp.filelist[0]))
-        else:
-            fp = path
-        reread = read_stata(fp, index_col="index")
-
-        expected = df.copy()
-        expected.index = expected.index.astype(np.int32)
+    compression = {"method": method, "archive_name": archive_name}
+    path = tmp_path / file_name
+    path.touch()
+    df.to_stata(path, compression=compression)
+    if method == "zip" or file_ext == "zip":
+        with zipfile.ZipFile(path, "r") as zp:
+            assert len(zp.filelist) == 1
+            assert zp.filelist[0].filename == archive_name
+            fp = io.BytesIO(zp.read(zp.filelist[0]))
+    else:
+        fp = path
+    reread = read_stata(fp, index_col="index")
+
+    expected = df
    tm.assert_frame_equal(reread, expected)
 
 
 @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
-def test_chunked_categorical(version):
+def test_chunked_categorical(version, temp_file):
     df = DataFrame({"cats": Series(["a", "b", "a", "b", "c"], dtype="category")})
     df.index.name = "index"
 
     expected = df.copy()
-    expected.index = expected.index.astype(np.int32)
 
-    with tm.ensure_clean() as path:
-        df.to_stata(path, version=version)
-        with StataReader(path, chunksize=2, order_categoricals=False) as reader:
-            for i, block in enumerate(reader):
-                block = block.set_index("index")
-                assert "cats" in block
-                tm.assert_series_equal(
-                    block.cats, expected.cats.iloc[2 * i : 2 * (i + 1)]
-                )
+    df.to_stata(temp_file, version=version)
+    with StataReader(temp_file, chunksize=2, order_categoricals=False) as reader:
+        for i, block in enumerate(reader):
+            block = block.set_index("index")
+            assert "cats" in block
+            tm.assert_series_equal(
+                block.cats,
+                expected.cats.iloc[2 * i : 2 * (i + 1)],
+                check_index_type=len(block) > 1,
+            )
 
 
 def test_chunked_categorical_partial(datapath):
     dta_file = datapath("io", "data", "stata", "stata-dta-partially-labeled.dta")
     values = ["a", "b", "a", "b", 3.0]
+    msg = "series with value labels are not fully labeled"
     with StataReader(dta_file, chunksize=2) as reader:
-        with tm.assert_produces_warning(CategoricalConversionWarning):
+        with tm.assert_produces_warning(CategoricalConversionWarning, match=msg):
             for i, block in enumerate(reader):
                 assert list(block.cats) == values[2 * i : 2 * (i + 1)]
                 if i < 2:
@@ -2079,7 +2319,7 @@ def test_chunked_categorical_partial(datapath):
                 else:
                     idx = pd.Index([3.0], dtype="float64")
                 tm.assert_index_equal(block.cats.cat.categories, idx)
-    with tm.assert_produces_warning(CategoricalConversionWarning):
+    with tm.assert_produces_warning(CategoricalConversionWarning, match=msg):
         with StataReader(dta_file, chunksize=5) as reader:
             large_chunk = reader.__next__()
     direct = read_stata(dta_file)
@@ -2094,38 +2334,36 @@ def test_iterator_errors(datapath, chunksize):
             pass
 
 
-def test_iterator_value_labels():
+def test_iterator_value_labels(temp_file):
     # GH 31544
     values = ["c_label", "b_label"] + ["a_label"] * 500
     df = DataFrame({f"col{k}": pd.Categorical(values, ordered=True) for k in range(2)})
-    with tm.ensure_clean() as path:
-        df.to_stata(path, write_index=False)
-        expected = pd.Index(["a_label", "b_label", "c_label"], dtype="object")
-        with read_stata(path, chunksize=100) as reader:
-            for j, chunk in enumerate(reader):
-                for i in range(2):
-                    tm.assert_index_equal(chunk.dtypes.iloc[i].categories, expected)
-                tm.assert_frame_equal(chunk, df.iloc[j * 100 : (j + 1) * 100])
+    df.to_stata(temp_file, write_index=False)
+    expected = pd.Index(["a_label", "b_label", "c_label"])
+    with read_stata(temp_file, chunksize=100) as reader:
+        for j, chunk in enumerate(reader):
+            for i in range(2):
+                tm.assert_index_equal(chunk.dtypes.iloc[i].categories, expected)
+            tm.assert_frame_equal(chunk, df.iloc[j * 100 : (j + 1) * 100])
 
 
-def test_precision_loss():
+def test_precision_loss(temp_file):
     df = DataFrame(
         [[sum(2**i for i in range(60)), sum(2**i for i in range(52))]],
         columns=["big", "little"],
     )
-    with tm.ensure_clean() as path:
-        with tm.assert_produces_warning(
-            PossiblePrecisionLoss, match="Column converted from int64 to float64"
-        ):
-            df.to_stata(path, write_index=False)
-        reread = read_stata(path)
-        expected_dt = Series([np.float64, np.float64], index=["big", "little"])
-        tm.assert_series_equal(reread.dtypes, expected_dt)
-        assert reread.loc[0, "little"] == df.loc[0, "little"]
-        assert reread.loc[0, "big"] == float(df.loc[0, "big"])
+    with tm.assert_produces_warning(
+        PossiblePrecisionLoss, match="Column converted from int64 to float64"
+    ):
+        df.to_stata(temp_file, write_index=False)
+    reread = read_stata(temp_file)
+    expected_dt = Series([np.float64, np.float64], index=["big", "little"])
+    tm.assert_series_equal(reread.dtypes, expected_dt)
    assert reread.loc[0, "little"] == df.loc[0, "little"]
    assert reread.loc[0, "big"] == float(df.loc[0, "big"])
 
 
-def test_compression_roundtrip(compression):
+def test_compression_roundtrip(compression, temp_file):
     df = DataFrame(
         [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
         index=["A", "B"],
@@ -2133,22 +2371,21 @@ def test_compression_roundtrip(compression):
     )
     df.index.name = "index"
 
-    with tm.ensure_clean() as path:
-        df.to_stata(path, compression=compression)
-        reread = read_stata(path, compression=compression, index_col="index")
-        tm.assert_frame_equal(df, reread)
+    df.to_stata(temp_file, compression=compression)
+    reread = read_stata(temp_file, compression=compression, index_col="index")
+    tm.assert_frame_equal(df, reread)
 
-        # explicitly ensure file was compressed.
+ with tm.decompress_file(temp_file, compression) as fh: + contents = io.BytesIO(fh.read()) + reread = read_stata(contents, index_col="index") + tm.assert_frame_equal(df, reread) @pytest.mark.parametrize("to_infer", [True, False]) @pytest.mark.parametrize("read_infer", [True, False]) def test_stata_compression( - compression_only, read_infer, to_infer, compression_to_extension + compression_only, read_infer, to_infer, compression_to_extension, tmp_path ): compression = compression_only @@ -2165,13 +2402,14 @@ def test_stata_compression( to_compression = "infer" if to_infer else compression read_compression = "infer" if read_infer else compression - with tm.ensure_clean(filename) as path: - df.to_stata(path, compression=to_compression) - result = read_stata(path, compression=read_compression, index_col="index") - tm.assert_frame_equal(result, df) + path = tmp_path / filename + path.touch() + df.to_stata(path, compression=to_compression) + result = read_stata(path, compression=read_compression, index_col="index") + tm.assert_frame_equal(result, df) -def test_non_categorical_value_labels(): +def test_non_categorical_value_labels(temp_file): data = DataFrame( { "fully_labelled": [1, 2, 3, 3, 1], @@ -2181,35 +2419,35 @@ def test_non_categorical_value_labels(): } ) - with tm.ensure_clean() as path: - value_labels = { - "fully_labelled": {1: "one", 2: "two", 3: "three"}, - "partially_labelled": {1.0: "one", 2.0: "two"}, - } - expected = {**value_labels, "Z": {0: "j", 1: "k", 2: "l"}} + path = temp_file + value_labels = { + "fully_labelled": {1: "one", 2: "two", 3: "three"}, + "partially_labelled": {1.0: "one", 2.0: "two"}, + } + expected = {**value_labels, "Z": {0: "j", 1: "k", 2: "l"}} - writer = StataWriter(path, data, value_labels=value_labels) - writer.write_file() + writer = StataWriter(path, data, value_labels=value_labels) + writer.write_file() - with StataReader(path) as reader: - reader_value_labels = reader.value_labels() - assert reader_value_labels == expected + with StataReader(path) as reader: + reader_value_labels = reader.value_labels() + assert reader_value_labels == expected - msg = "Can't create value labels for notY, it wasn't found in the dataset." - with pytest.raises(KeyError, match=msg): - value_labels = {"notY": {7: "label1", 8: "label2"}} - StataWriter(path, data, value_labels=value_labels) + msg = "Can't create value labels for notY, it wasn't found in the dataset." + value_labels = {"notY": {7: "label1", 8: "label2"}} + with pytest.raises(KeyError, match=msg): + StataWriter(path, data, value_labels=value_labels) - msg = ( - "Can't create value labels for Z, value labels " - "can only be applied to numeric columns." - ) - with pytest.raises(ValueError, match=msg): - value_labels = {"Z": {1: "a", 2: "k", 3: "j", 4: "i"}} - StataWriter(path, data, value_labels=value_labels) + msg = ( + "Can't create value labels for Z, value labels " + "can only be applied to numeric columns." 
+ ) + value_labels = {"Z": {1: "a", 2: "k", 3: "j", 4: "i"}} + with pytest.raises(ValueError, match=msg): + StataWriter(path, data, value_labels=value_labels) -def test_non_categorical_value_label_name_conversion(): +def test_non_categorical_value_label_name_conversion(temp_file): # Check conversion of invalid variable names data = DataFrame( { @@ -2237,16 +2475,16 @@ def test_non_categorical_value_label_name_conversion(): "_1__2_": {3: "three"}, } - with tm.ensure_clean() as path: - with tm.assert_produces_warning(InvalidColumnName): - data.to_stata(path, value_labels=value_labels) + msg = "Not all pandas column names were valid Stata variable names" + with tm.assert_produces_warning(InvalidColumnName, match=msg): + data.to_stata(temp_file, value_labels=value_labels) - with StataReader(path) as reader: - reader_value_labels = reader.value_labels() - assert reader_value_labels == expected + with StataReader(temp_file) as reader: + reader_value_labels = reader.value_labels() + assert reader_value_labels == expected -def test_non_categorical_value_label_convert_categoricals_error(): +def test_non_categorical_value_label_convert_categoricals_error(temp_file): # Mapping more than one value to the same label is valid for Stata # labels, but can't be read with convert_categoricals=True value_labels = { @@ -2259,17 +2497,16 @@ def test_non_categorical_value_label_convert_categoricals_error(): } ) - with tm.ensure_clean() as path: - data.to_stata(path, value_labels=value_labels) + data.to_stata(temp_file, value_labels=value_labels) - with StataReader(path, convert_categoricals=False) as reader: - reader_value_labels = reader.value_labels() - assert reader_value_labels == value_labels + with StataReader(temp_file, convert_categoricals=False) as reader: + reader_value_labels = reader.value_labels() + assert reader_value_labels == value_labels - col = "repeated_labels" - repeats = "-" * 80 + "\n" + "\n".join(["More than ten"]) + col = "repeated_labels" + repeats = "-" * 80 + "\n" + "\n".join(["More than ten"]) - msg = f""" + msg = f""" Value labels for column {col} are not unique. These cannot be converted to pandas categoricals. 
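The value-label tests above all exercise the same round-trip; a minimal standalone sketch of that flow, using only calls that already appear in this patch (the file name "labels.dta" is illustrative, not from the patch):

import pandas as pd
from pandas.io.stata import StataReader

df = pd.DataFrame({"fully_labelled": [1, 2, 3, 3, 1]})
value_labels = {"fully_labelled": {1: "one", 2: "two", 3: "three"}}
df.to_stata("labels.dta", write_index=False, value_labels=value_labels)

with StataReader("labels.dta") as reader:
    # the raw label mappings survive the round-trip
    assert reader.value_labels() == value_labels

# with convert_categoricals=True (the default), a fully labelled column
# comes back as a pandas Categorical
labelled = pd.read_stata("labels.dta")
assert labelled["fully_labelled"].dtype == "category"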
@@ -2280,8 +2517,8 @@ def test_non_categorical_value_label_convert_categoricals_error(): The repeated labels are: {repeats} """ - with pytest.raises(ValueError, match=msg): - read_stata(path, convert_categoricals=True) + with pytest.raises(ValueError, match=msg): + read_stata(temp_file, convert_categoricals=True) @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) @@ -2299,7 +2536,7 @@ def test_non_categorical_value_label_convert_categoricals_error(): pd.UInt64Dtype, ], ) -def test_nullable_support(dtype, version): +def test_nullable_support(dtype, version, temp_file): df = DataFrame( { "a": Series([1.0, 2.0, 3.0]), @@ -2318,27 +2555,49 @@ def test_nullable_support(dtype, version): smv = StataMissingValue(value) expected_b = Series([1, smv, smv], dtype=object, name="b") expected_c = Series(["a", "b", ""], name="c") - with tm.ensure_clean() as path: - df.to_stata(path, write_index=False, version=version) - reread = read_stata(path, convert_missing=True) - tm.assert_series_equal(df.a, reread.a) - tm.assert_series_equal(reread.b, expected_b) - tm.assert_series_equal(reread.c, expected_c) + df.to_stata(temp_file, write_index=False, version=version) + reread = read_stata(temp_file, convert_missing=True) + tm.assert_series_equal(df.a, reread.a) + tm.assert_series_equal(reread.b, expected_b) + tm.assert_series_equal(reread.c, expected_c) -def test_empty_frame(): +def test_empty_frame(temp_file): # GH 46240 # create an empty DataFrame with int64 and float64 dtypes df = DataFrame(data={"a": range(3), "b": [1.0, 2.0, 3.0]}).head(0) - with tm.ensure_clean() as path: - df.to_stata(path, write_index=False, version=117) - # Read entire dataframe - df2 = read_stata(path) - assert "b" in df2 - # Dtypes don't match since no support for int32 - dtypes = Series({"a": np.dtype("int32"), "b": np.dtype("float64")}) - tm.assert_series_equal(df2.dtypes, dtypes) - # read one column of empty .dta file - df3 = read_stata(path, columns=["a"]) - assert "b" not in df3 - tm.assert_series_equal(df3.dtypes, dtypes.loc[["a"]]) + path = temp_file + df.to_stata(path, write_index=False, version=117) + # Read entire dataframe + df2 = read_stata(path) + assert "b" in df2 + # Dtypes don't match since no support for int32 + dtypes = Series({"a": np.dtype("int32"), "b": np.dtype("float64")}) + tm.assert_series_equal(df2.dtypes, dtypes) + # read one column of empty .dta file + df3 = read_stata(path, columns=["a"]) + assert "b" not in df3 + tm.assert_series_equal(df3.dtypes, dtypes.loc[["a"]]) + + +@pytest.mark.parametrize("version", [114, 117, 118, 119, None]) +def test_many_strl(temp_file, version): + n = 65534 + df = DataFrame(np.arange(n), columns=["col"]) + lbls = ["".join(v) for v in itertools.product(*([string.ascii_letters] * 3))] + value_labels = {"col": {i: lbls[i] for i in range(n)}} + df.to_stata(temp_file, value_labels=value_labels, version=version) + + +@pytest.mark.parametrize("version", [117, 118, 119, None]) +def test_strl_missings(temp_file, version): + # GH 23633 + # Check that strl supports None and pd.NA + df = DataFrame( + [ + {"str1": "string" * 500, "number": 0}, + {"str1": None, "number": 1}, + {"str1": pd.NA, "number": 1}, + ] + ) + df.to_stata(temp_file, version=version) diff --git a/pandas/tests/io/xml/conftest.py b/pandas/tests/io/xml/conftest.py index aafda0ff62bbd..8900b0dc46754 100644 --- a/pandas/tests/io/xml/conftest.py +++ b/pandas/tests/io/xml/conftest.py @@ -5,34 +5,101 @@ @pytest.fixture def xml_data_path(): + """ + Returns a Path object to the XML example directory. 
+ + Examples + -------- + >>> def test_read_xml(xml_data_path): + ... pd.read_xml(xml_data_path / "file.xml") + """ return Path(__file__).parent.parent / "data" / "xml" @pytest.fixture def xml_books(xml_data_path, datapath): + """ + Returns the path (as `str`) to the `books.xml` example file. + + Examples + -------- + >>> def test_read_xml(xml_books): + ... pd.read_xml(xml_books) + """ return datapath(xml_data_path / "books.xml") @pytest.fixture def xml_doc_ch_utf(xml_data_path, datapath): + """ + Returns the path (as `str`) to the `doc_ch_utf.xml` example file. + + Examples + -------- + >>> def test_read_xml(xml_doc_ch_utf): + ... pd.read_xml(xml_doc_ch_utf) + """ return datapath(xml_data_path / "doc_ch_utf.xml") @pytest.fixture def xml_baby_names(xml_data_path, datapath): + """ + Returns the path (as `str`) to the `baby_names.xml` example file. + + Examples + -------- + >>> def test_read_xml(xml_baby_names): + ... pd.read_xml(xml_baby_names) + """ return datapath(xml_data_path / "baby_names.xml") @pytest.fixture def kml_cta_rail_lines(xml_data_path, datapath): + """ + Returns the path (as `str`) to the `cta_rail_lines.kml` example file. + + Examples + -------- + >>> def test_read_xml(kml_cta_rail_lines, xsl_flatten_doc): + ... pd.read_xml( + ... kml_cta_rail_lines, + ... xpath=".//k:Placemark", + ... namespaces={"k": "http://www.opengis.net/kml/2.2"}, + ... stylesheet=xsl_flatten_doc, + ... ) + """ return datapath(xml_data_path / "cta_rail_lines.kml") @pytest.fixture def xsl_flatten_doc(xml_data_path, datapath): + """ + Returns the path (as `str`) to the `flatten_doc.xsl` example file. + + Examples + -------- + >>> def test_read_xsl(xsl_flatten_doc, mode): + ... with open( + ... xsl_flatten_doc, mode, encoding="utf-8" if mode == "r" else None + ... ) as f: + ... xsl_obj = f.read() + """ return datapath(xml_data_path / "flatten_doc.xsl") @pytest.fixture def xsl_row_field_output(xml_data_path, datapath): + """ + Returns the path (as `str`) to the `row_field_output.xsl` example file. + + Examples + -------- + >>> def test_read_xsl(xsl_row_field_output, mode): + ... with open( + ... xsl_row_field_output, mode, encoding="utf-8" if mode == "r" else None + ... ) as f: + ... xsl_obj = f.read() + """ return datapath(xml_data_path / "row_field_output.xsl") diff --git a/pandas/tests/io/xml/test_to_xml.py b/pandas/tests/io/xml/test_to_xml.py index 37251a58b0c11..50fef2c5eb4eb 100644 --- a/pandas/tests/io/xml/test_to_xml.py +++ b/pandas/tests/io/xml/test_to_xml.py @@ -33,7 +33,7 @@ # [X] - KeyError: "...is not included in namespaces" # [X] - KeyError: "no valid column" # [X] - ValueError: "To use stylesheet, you need lxml installed..." -# [] - OSError: (NEED PERMISSOIN ISSUE, DISK FULL, ETC.) +# [] - OSError: (NEED PERMISSION ISSUE, DISK FULL, ETC.) # [X] - FileNotFoundError: "No such file or directory" # [X] - PermissionError: "Forbidden" @@ -41,7 +41,7 @@ # [X] - TypeError: "...is not a valid type for attr_cols" # [X] - TypeError: "...is not a valid type for elem_cols" # [X] - LookupError: "unknown encoding" -# [] - OSError: (NEED PERMISSOIN ISSUE, DISK FULL, ETC.) +# [] - OSError: (NEED PERMISSION ISSUE, DISK FULL, ETC.)
# [X] - FileNotFoundError: "No such file or directory" # [X] - KeyError: "...is not included in namespaces" # [X] - KeyError: "no valid column" @@ -298,10 +298,8 @@ def test_index_false_rename_row_root(xml_books, parser): assert output == expected -@pytest.mark.parametrize( - "offset_index", [list(range(10, 13)), [str(i) for i in range(10, 13)]] -) -def test_index_false_with_offset_input_index(parser, offset_index, geom_df): +@pytest.mark.parametrize("typ", [int, str]) +def test_index_false_with_offset_input_index(parser, typ, geom_df): """ Tests that the output does not contain the `<index>` field when the index of the input DataFrame has an offset. @@ -328,7 +326,7 @@ def test_index_false_with_offset_input_index(parser, offset_index, geom_df): 3.0 """ - + offset_index = [typ(i) for i in range(10, 13)] offset_geom_df = geom_df.copy() offset_geom_df.index = Index(offset_index) output = offset_geom_df.to_xml(index=False, parser=parser) @@ -1036,26 +1034,23 @@ def test_stylesheet_buffered_reader(xsl_row_field_output, mode, geom_df): with open( xsl_row_field_output, mode, encoding="utf-8" if mode == "r" else None ) as f: - xsl_obj = f.read() - - output = geom_df.to_xml(stylesheet=xsl_obj) + output = geom_df.to_xml(stylesheet=f) assert output == xsl_expected def test_stylesheet_wrong_path(geom_df): - lxml_etree = pytest.importorskip("lxml.etree") + pytest.importorskip("lxml.etree") - xsl = os.path.join("data", "xml", "row_field_output.xslt") + xsl = os.path.join("does", "not", "exist", "row_field_output.xslt") with pytest.raises( - lxml_etree.XMLSyntaxError, - match=("Start tag expected, '<' not found"), + FileNotFoundError, match=r"\[Errno 2\] No such file or directory" ): geom_df.to_xml(stylesheet=xsl) -@pytest.mark.parametrize("val", ["", b""]) +@pytest.mark.parametrize("val", [StringIO(""), BytesIO(b"")]) def test_empty_string_stylesheet(val, geom_df): lxml_etree = pytest.importorskip("lxml.etree") @@ -1097,9 +1092,9 @@ def test_incorrect_xsl_syntax(geom_df): """ with pytest.raises( - lxml_etree.XMLSyntaxError, match=("Opening and ending tag mismatch") + lxml_etree.XMLSyntaxError, match="Opening and ending tag mismatch" ): - geom_df.to_xml(stylesheet=xsl) + geom_df.to_xml(stylesheet=StringIO(xsl)) def test_incorrect_xsl_eval(geom_df): @@ -1126,8 +1121,8 @@ def test_incorrect_xsl_eval(geom_df): """ - with pytest.raises(lxml_etree.XSLTParseError, match=("failed to compile")): - geom_df.to_xml(stylesheet=xsl) + with pytest.raises(lxml_etree.XSLTParseError, match="failed to compile"): + geom_df.to_xml(stylesheet=StringIO(xsl)) def test_incorrect_xsl_apply(geom_df): @@ -1145,9 +1140,9 @@ def test_incorrect_xsl_apply(geom_df): """ - with pytest.raises(lxml_etree.XSLTApplyError, match=("Cannot resolve URI")): + with pytest.raises(lxml_etree.XSLTApplyError, match="Cannot resolve URI"): with tm.ensure_clean("test.xml") as path: - geom_df.to_xml(path, stylesheet=xsl) + geom_df.to_xml(path, stylesheet=StringIO(xsl)) def test_stylesheet_with_etree(geom_df): @@ -1162,10 +1157,8 @@ def test_stylesheet_with_etree(geom_df): """ - with pytest.raises( - ValueError, match=("To use stylesheet, you need lxml installed") - ): - geom_df.to_xml(parser="etree", stylesheet=xsl) + with pytest.raises(ValueError, match="To use stylesheet, you need lxml installed"): + geom_df.to_xml(parser="etree", stylesheet=StringIO(xsl)) def test_style_to_csv(geom_df): @@ -1192,7 +1185,7 @@ def test_style_to_csv(geom_df): if out_csv is not None: out_csv = out_csv.strip() - out_xml = geom_df.to_xml(stylesheet=xsl) + out_xml = 
geom_df.to_xml(stylesheet=StringIO(xsl)) assert out_csv == out_xml @@ -1226,7 +1219,7 @@ def test_style_to_string(geom_df): """ out_str = geom_df.to_string() - out_xml = geom_df.to_xml(na_rep="NaN", stylesheet=xsl) + out_xml = geom_df.to_xml(na_rep="NaN", stylesheet=StringIO(xsl)) assert out_xml == out_str @@ -1271,7 +1264,7 @@ def test_style_to_json(geom_df): """ out_json = geom_df.to_json() - out_xml = geom_df.to_xml(stylesheet=xsl) + out_xml = geom_df.to_xml(stylesheet=StringIO(xsl)) assert out_json == out_xml diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 6f429c1ecbf8a..d897d251909fe 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -14,6 +14,7 @@ import numpy as np import pytest +from pandas.compat import WASM from pandas.compat._optional import import_optional_dependency from pandas.errors import ( EmptyDataError, @@ -28,11 +29,6 @@ Series, ) import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - StringArray, -) -from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics from pandas.io.common import get_handle from pandas.io.xml import read_xml @@ -243,20 +239,17 @@ "-87.65362593118043,41.94742799535678,0" ), }, - } + }, + index=range(5), ) -def test_literal_xml_deprecation(): +def test_literal_xml_raises(): # GH 53809 pytest.importorskip("lxml") - msg = ( - "Passing literal xml to 'read_xml' is deprecated and " - "will be removed in a future version. To read from a " - "literal string, wrap it in a 'StringIO' object." - ) + msg = "|".join([r".*No such file or directory", r".*Invalid argument"]) - with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises((FileNotFoundError, OSError), match=msg): read_xml(xml_default_nmsp) @@ -417,7 +410,7 @@ def test_string_charset(parser): df_str = read_xml(StringIO(txt), parser=parser) - df_expected = DataFrame({"c1": 1, "c2": 2}, index=[0]) + df_expected = DataFrame({"c1": 1, "c2": 2}, index=range(1)) tm.assert_frame_equal(df_str, df_expected) @@ -471,33 +464,30 @@ def test_empty_string_lxml(val): r"None \(line 0\)", ] ) + if isinstance(val, str): + data = StringIO(val) + else: + data = BytesIO(val) with pytest.raises(lxml_etree.XMLSyntaxError, match=msg): - if isinstance(val, str): - read_xml(StringIO(val), parser="lxml") - else: - read_xml(BytesIO(val), parser="lxml") + read_xml(data, parser="lxml") @pytest.mark.parametrize("val", ["", b""]) def test_empty_string_etree(val): + if isinstance(val, str): + data = StringIO(val) + else: + data = BytesIO(val) with pytest.raises(ParseError, match="no element found"): - if isinstance(val, str): - read_xml(StringIO(val), parser="etree") - else: - read_xml(BytesIO(val), parser="etree") + read_xml(data, parser="etree") +@pytest.mark.skipif(WASM, reason="limited file system access on WASM") def test_wrong_file_path(parser): - msg = ( - "Passing literal xml to 'read_xml' is deprecated and " - "will be removed in a future version. To read from a " - "literal string, wrap it in a 'StringIO' object." - ) - filename = os.path.join("data", "html", "books.xml") + filename = os.path.join("does", "not", "exist", "books.xml") with pytest.raises( - FutureWarning, - match=msg, + FileNotFoundError, match=r"\[Errno 2\] No such file or directory" ): read_xml(filename, parser=parser) @@ -1044,7 +1034,7 @@ def test_utf16_encoding(xml_baby_names, parser): UnicodeError, match=( "UTF-16 stream does not start with BOM|" - "'utf-16-le' codec can't decode byte" + "'utf-16(-le)?' 
codec can't decode byte" ), ): read_xml(xml_baby_names, encoding="UTF-16", parser=parser) @@ -1195,14 +1185,12 @@ def test_stylesheet_io(kml_cta_rail_lines, xsl_flatten_doc, mode): def test_stylesheet_buffered_reader(kml_cta_rail_lines, xsl_flatten_doc, mode): pytest.importorskip("lxml") with open(xsl_flatten_doc, mode, encoding="utf-8" if mode == "r" else None) as f: - xsl_obj = f.read() - - df_style = read_xml( - kml_cta_rail_lines, - xpath=".//k:Placemark", - namespaces={"k": "http://www.opengis.net/kml/2.2"}, - stylesheet=xsl_obj, - ) + df_style = read_xml( + kml_cta_rail_lines, + xpath=".//k:Placemark", + namespaces={"k": "http://www.opengis.net/kml/2.2"}, + stylesheet=f, + ) tm.assert_frame_equal(df_kml, df_style) @@ -1231,7 +1219,7 @@ def test_style_charset(): """ df_orig = read_xml(StringIO(xml)) - df_style = read_xml(StringIO(xml), stylesheet=xsl) + df_style = read_xml(StringIO(xml), stylesheet=StringIO(xsl)) tm.assert_frame_equal(df_orig, df_style) @@ -1269,9 +1257,9 @@ def test_incorrect_xsl_syntax(kml_cta_rail_lines): """ with pytest.raises( - lxml_etree.XMLSyntaxError, match=("Extra content at the end of the document") + lxml_etree.XMLSyntaxError, match="Extra content at the end of the document" ): - read_xml(kml_cta_rail_lines, stylesheet=xsl) + read_xml(kml_cta_rail_lines, stylesheet=StringIO(xsl)) def test_incorrect_xsl_eval(kml_cta_rail_lines): @@ -1297,8 +1285,8 @@ def test_incorrect_xsl_eval(kml_cta_rail_lines): """ - with pytest.raises(lxml_etree.XSLTParseError, match=("failed to compile")): - read_xml(kml_cta_rail_lines, stylesheet=xsl) + with pytest.raises(lxml_etree.XSLTParseError, match="failed to compile"): + read_xml(kml_cta_rail_lines, stylesheet=StringIO(xsl)) def test_incorrect_xsl_apply(kml_cta_rail_lines): @@ -1316,18 +1304,17 @@ def test_incorrect_xsl_apply(kml_cta_rail_lines): """ - with pytest.raises(lxml_etree.XSLTApplyError, match=("Cannot resolve URI")): - read_xml(kml_cta_rail_lines, stylesheet=xsl) + with pytest.raises(lxml_etree.XSLTApplyError, match="Cannot resolve URI"): + read_xml(kml_cta_rail_lines, stylesheet=StringIO(xsl)) def test_wrong_stylesheet(kml_cta_rail_lines, xml_data_path): - xml_etree = pytest.importorskip("lxml.etree") + pytest.importorskip("lxml.etree") - xsl = xml_data_path / "flatten.xsl" + xsl = xml_data_path / "flatten_doesnt_exist.xsl" with pytest.raises( - xml_etree.XMLSyntaxError, - match=("Start tag expected, '<' not found"), + FileNotFoundError, match=r"\[Errno 2\] No such file or directory" ): read_xml(kml_cta_rail_lines, stylesheet=xsl) @@ -1357,18 +1344,11 @@ def test_stylesheet_with_etree(kml_cta_rail_lines, xsl_flatten_doc): read_xml(kml_cta_rail_lines, parser="etree", stylesheet=xsl_flatten_doc) -@pytest.mark.parametrize("val", ["", b""]) -def test_empty_stylesheet(val): - pytest.importorskip("lxml") - msg = ( - "Passing literal xml to 'read_xml' is deprecated and " - "will be removed in a future version. To read from a " - "literal string, wrap it in a 'StringIO' object." 
- ) - kml = os.path.join("data", "xml", "cta_rail_lines.kml") - - with pytest.raises(FutureWarning, match=msg): - read_xml(kml, stylesheet=val) +@pytest.mark.parametrize("val", [StringIO(""), BytesIO(b"")]) +def test_empty_stylesheet(val, kml_cta_rail_lines): + lxml_etree = pytest.importorskip("lxml.etree") + with pytest.raises(lxml_etree.XMLSyntaxError): + read_xml(kml_cta_rail_lines, stylesheet=val) # ITERPARSE @@ -1523,8 +1503,7 @@ def test_bad_xml(parser): with pytest.raises( SyntaxError, match=( - "Extra content at the end of the document|" - "junk after document element" + "Extra content at the end of the document|junk after document element" ), ): read_xml( @@ -1906,7 +1885,7 @@ def test_online_stylesheet(): StringIO(xml), xpath=".//tr[td and position() <= 6]", names=["title", "artist"], - stylesheet=xsl, + stylesheet=StringIO(xsl), ) df_expected = DataFrame( @@ -2035,36 +2014,21 @@ def test_read_xml_nullable_dtypes( """ - if using_infer_string: - pa = pytest.importorskip("pyarrow") - string_array = ArrowStringArrayNumpySemantics(pa.array(["x", "y"])) - string_array_na = ArrowStringArrayNumpySemantics(pa.array(["x", None])) - - elif string_storage == "python": - string_array = StringArray(np.array(["x", "y"], dtype=np.object_)) - string_array_na = StringArray(np.array(["x", NA], dtype=np.object_)) + with pd.option_context("mode.string_storage", string_storage): + result = read_xml(StringIO(data), parser=parser, dtype_backend=dtype_backend) - elif dtype_backend == "pyarrow": + if dtype_backend == "pyarrow": pa = pytest.importorskip("pyarrow") - from pandas.arrays import ArrowExtensionArray - - string_array = ArrowExtensionArray(pa.array(["x", "y"])) - string_array_na = ArrowExtensionArray(pa.array(["x", None])) - + string_dtype = pd.ArrowDtype(pa.string()) else: - pa = pytest.importorskip("pyarrow") - string_array = ArrowStringArray(pa.array(["x", "y"])) - string_array_na = ArrowStringArray(pa.array(["x", None])) - - with pd.option_context("mode.string_storage", string_storage): - result = read_xml(StringIO(data), parser=parser, dtype_backend=dtype_backend) + string_dtype = pd.StringDtype(string_storage) expected = DataFrame( { - "a": string_array, + "a": Series(["x", "y"], dtype=string_dtype), "b": Series([1, 2], dtype="Int64"), "c": Series([4.0, 5.0], dtype="Float64"), - "d": string_array_na, + "d": Series(["x", None], dtype=string_dtype), "e": Series([2, NA], dtype="Int64"), "f": Series([4.0, NA], dtype="Float64"), "g": Series([NA, NA], dtype="Int64"), @@ -2085,7 +2049,9 @@ def test_read_xml_nullable_dtypes( ) expected["g"] = ArrowExtensionArray(pa.array([None, None])) - tm.assert_frame_equal(result, expected) + # the storage of the str columns' Index is also affected by the + # string_storage setting -> ignore that for checking the result + tm.assert_frame_equal(result, expected, check_column_type=False) def test_invalid_dtype_backend(): diff --git a/pandas/tests/io/xml/test_xml_dtypes.py b/pandas/tests/io/xml/test_xml_dtypes.py index a85576ff13f5c..96ef50f9d7149 100644 --- a/pandas/tests/io/xml/test_xml_dtypes.py +++ b/pandas/tests/io/xml/test_xml_dtypes.py @@ -378,58 +378,6 @@ def test_parse_dates_true(parser): tm.assert_frame_equal(df_iter, df_expected) -def test_parse_dates_dictionary(parser): - xml = """ - - - square - 360 - 4.0 - 2020 - 12 - 31 - - - circle - 360 - - 2021 - 12 - 31 - - - triangle - 180 - 3.0 - 2022 - 12 - 31 - -""" - - df_result = read_xml( - StringIO(xml), parse_dates={"date_end": ["year", "month", "day"]}, parser=parser - ) - df_iter = read_xml_iterparse( 
- xml, - parser=parser, - parse_dates={"date_end": ["year", "month", "day"]}, - iterparse={"row": ["shape", "degrees", "sides", "year", "month", "day"]}, - ) - - df_expected = DataFrame( - { - "date_end": to_datetime(["2020-12-31", "2021-12-31", "2022-12-31"]), - "shape": ["square", "circle", "triangle"], - "degrees": [360, 360, 180], - "sides": [4.0, float("nan"), 3.0], - } - ) - - tm.assert_frame_equal(df_result, df_expected) - tm.assert_frame_equal(df_iter, df_expected) - - def test_day_first_parse_dates(parser): xml = """\ @@ -479,7 +427,5 @@ def test_day_first_parse_dates(parser): def test_wrong_parse_dates_type(xml_books, parser, iterparse): - with pytest.raises( - TypeError, match=("Only booleans, lists, and dictionaries are accepted") - ): + with pytest.raises(TypeError, match="Only booleans and lists are accepted"): read_xml(xml_books, parse_dates={"date"}, parser=parser, iterparse=iterparse) diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index e54764f9ac4a6..6a95cfc7355d8 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -1,3 +1,4 @@ +from collections import namedtuple from collections.abc import Generator from contextlib import contextmanager import re @@ -148,18 +149,19 @@ def test_map_locations(self, table_type, dtype, writable): def test_map_locations_mask(self, table_type, dtype, writable): if table_type == ht.PyObjectHashTable: pytest.skip("Mask not supported for object") - N = 3 + N = 129 # must be > 128 to test GH#58924 table = table_type(uses_mask=True) keys = (np.arange(N) + N).astype(dtype) keys.flags.writeable = writable - table.map_locations(keys, np.array([False, False, True])) + mask = np.concatenate([np.repeat(False, N - 1), [True]], axis=0) + table.map_locations(keys, mask) for i in range(N - 1): assert table.get_item(keys[i]) == i with pytest.raises(KeyError, match=re.escape(str(keys[N - 1]))): table.get_item(keys[N - 1]) - assert table.get_na() == 2 + assert table.get_na() == N - 1 def test_lookup(self, table_type, dtype, writable): N = 3 @@ -405,9 +407,8 @@ def test_nan_complex_real(self): table = ht.PyObjectHashTable() table.set_item(nan1, 42) assert table.get_item(nan2) == 42 - with pytest.raises(KeyError, match=None) as error: + with pytest.raises(KeyError, match=re.escape(repr(other))): table.get_item(other) - assert str(error.value) == str(other) def test_nan_complex_imag(self): nan1 = complex(1, float("nan")) @@ -417,9 +418,8 @@ def test_nan_complex_imag(self): table = ht.PyObjectHashTable() table.set_item(nan1, 42) assert table.get_item(nan2) == 42 - with pytest.raises(KeyError, match=None) as error: + with pytest.raises(KeyError, match=re.escape(repr(other))): table.get_item(other) - assert str(error.value) == str(other) def test_nan_in_tuple(self): nan1 = (float("nan"),) @@ -436,9 +436,28 @@ def test_nan_in_nested_tuple(self): table = ht.PyObjectHashTable() table.set_item(nan1, 42) assert table.get_item(nan2) == 42 - with pytest.raises(KeyError, match=None) as error: + with pytest.raises(KeyError, match=re.escape(repr(other))): + table.get_item(other) + + def test_nan_in_namedtuple(self): + T = namedtuple("T", ["x"]) + nan1 = T(float("nan")) + nan2 = T(float("nan")) + assert nan1.x is not nan2.x + table = ht.PyObjectHashTable() + table.set_item(nan1, 42) + assert table.get_item(nan2) == 42 + + def test_nan_in_nested_namedtuple(self): + T = namedtuple("T", ["x", "y"]) + nan1 = T(1, (2, (float("nan"),))) + nan2 = T(1, (2, (float("nan"),))) + other = T(1, 2) + table = 
ht.PyObjectHashTable() + table.set_item(nan1, 42) + assert table.get_item(nan2) == 42 + with pytest.raises(KeyError, match=re.escape(repr(other))): table.get_item(other) - assert str(error.value) == str(other) def test_hash_equal_tuple_with_nans(): @@ -448,6 +467,22 @@ def test_hash_equal_tuple_with_nans(): assert ht.objects_are_equal(a, b) +def test_hash_equal_namedtuple_with_nans(): + T = namedtuple("T", ["x", "y"]) + a = T(float("nan"), (float("nan"), float("nan"))) + b = T(float("nan"), (float("nan"), float("nan"))) + assert ht.object_hash(a) == ht.object_hash(b) + assert ht.objects_are_equal(a, b) + + +def test_hash_equal_namedtuple_and_tuple(): + T = namedtuple("T", ["x", "y"]) + a = T(1, (2, 3)) + b = (1, (2, 3)) + assert ht.object_hash(a) == ht.object_hash(b) + assert ht.objects_are_equal(a, b) + + def test_get_labels_groupby_for_Int64(writable): table = ht.Int64HashTable() vals = np.array([1, 2, -1, 2, 1, -1], dtype=np.int64) @@ -730,12 +765,11 @@ def test_mode(self, dtype): def test_ismember_tuple_with_nans(): # GH-41836 - values = [("a", float("nan")), ("b", 1)] + values = np.empty(2, dtype=object) + values[:] = [("a", float("nan")), ("b", 1)] comps = [("a", float("nan"))] - msg = "isin with argument that is not not a Series" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = isin(values, comps) + result = isin(values, comps) expected = np.array([True, False], dtype=np.bool_) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/libs/test_join.py b/pandas/tests/libs/test_join.py index ba2e6e7130929..bf8b4fabc54cb 100644 --- a/pandas/tests/libs/test_join.py +++ b/pandas/tests/libs/test_join.py @@ -138,14 +138,12 @@ def test_cython_inner_join(self): tm.assert_numpy_array_equal(rs, exp_rs) -@pytest.mark.parametrize("readonly", [True, False]) -def test_left_join_indexer_unique(readonly): +def test_left_join_indexer_unique(writable): a = np.array([1, 2, 3, 4, 5], dtype=np.int64) b = np.array([2, 2, 3, 4, 4], dtype=np.int64) - if readonly: - # GH#37312, GH#37264 - a.setflags(write=False) - b.setflags(write=False) + # GH#37312, GH#37264 + a.setflags(write=writable) + b.setflags(write=writable) result = libjoin.left_join_indexer_unique(b, a) expected = np.array([1, 1, 2, 3, 3], dtype=np.intp) diff --git a/pandas/tests/libs/test_lib.py b/pandas/tests/libs/test_lib.py index 8583d8bcc052c..f619ba4dd204b 100644 --- a/pandas/tests/libs/test_lib.py +++ b/pandas/tests/libs/test_lib.py @@ -1,3 +1,5 @@ +import pickle + import numpy as np import pytest @@ -283,3 +285,25 @@ def test_no_default_pickle(): # GH#40397 obj = tm.round_trip_pickle(lib.no_default) assert obj is lib.no_default + + +def test_ensure_string_array_copy(): + # ensure the original array is not modified in case of copy=False with + # pickle-roundtripped object dtype array + # https://github.com/pandas-dev/pandas/issues/54654 + arr = np.array(["a", None], dtype=object) + arr = pickle.loads(pickle.dumps(arr)) + result = lib.ensure_string_array(arr, copy=False) + assert not np.shares_memory(arr, result) + assert arr[1] is None + assert result[1] is np.nan + + +def test_ensure_string_array_list_of_lists(): + # GH#61155: ensure list of lists doesn't get converted to string + arr = [list("test"), list("word")] + result = lib.ensure_string_array(arr) + + # Each inner list is stringified as a whole, single item + expected = np.array(["['t', 'e', 's', 't']", "['w', 'o', 'r', 'd']"], dtype=object) + tm.assert_numpy_array_equal(result, expected) diff --git 
a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py index 69120160699c2..d8c49d6d47f28 100644 --- a/pandas/tests/plotting/common.py +++ b/pandas/tests/plotting/common.py @@ -76,8 +76,6 @@ def _check_data(xp, rs): xp : matplotlib Axes object rs : matplotlib Axes object """ - import matplotlib.pyplot as plt - xp_lines = xp.get_lines() rs_lines = rs.get_lines() @@ -87,8 +85,6 @@ def _check_data(xp, rs): rsdata = rsl.get_xydata() tm.assert_almost_equal(xpdata, rsdata) - plt.close("all") - def _check_visible(collections, visible=True): """ @@ -433,7 +429,7 @@ def _check_box_return_type( raise AssertionError -def _check_grid_settings(obj, kinds, kws={}): +def _check_grid_settings(obj, kinds, kws=None): # Make sure plot defaults to rcParams['axes.grid'] setting, GH 9792 import matplotlib as mpl @@ -446,6 +442,8 @@ def is_grid_on(): return not (xoff and yoff) + if kws is None: + kws = {} spndx = 1 for kind in kinds: mpl.pyplot.subplot(1, 4 * len(kinds), spndx) @@ -493,6 +491,28 @@ def get_y_axis(ax): return ax._shared_axes["y"] +def assert_is_valid_plot_return_object(objs) -> None: + from matplotlib.artist import Artist + from matplotlib.axes import Axes + + if isinstance(objs, (Series, np.ndarray)): + if isinstance(objs, Series): + objs = objs._values + for el in objs.reshape(-1): + msg = ( + "one of 'objs' is not a matplotlib Axes instance, " + f"type encountered {type(el).__name__!r}" + ) + assert isinstance(el, (Axes, dict)), msg + else: + msg = ( + "objs is neither an ndarray of Artist instances nor a single " + "Artist instance, tuple, or dict, 'objs' is a " + f"{type(objs).__name__!r}" + ) + assert isinstance(objs, (Artist, tuple, dict)), msg + + def _check_plot_works(f, default_axes=False, **kwargs): """ Create plot and ensure that plot return object is valid. 
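One detail in the common.py hunks above deserves a note: _check_grid_settings swaps the mutable default kws={} for the None-sentinel idiom. A short self-contained sketch of the pitfall being avoided (the append_* helpers are hypothetical, for illustration only):

def append_bad(item, acc=[]):  # the default list is created once and shared across calls
    acc.append(item)
    return acc

def append_good(item, acc=None):  # fresh list on every call unless one is passed in
    if acc is None:
        acc = []
    acc.append(item)
    return acc

assert append_bad(1) == [1]
assert append_bad(2) == [1, 2]  # surprise: state leaked from the previous call
assert append_good(1) == [1]
assert append_good(2) == [2]  # no leakage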
@@ -528,15 +548,11 @@ def _check_plot_works(f, default_axes=False, **kwargs): gen_plots = _gen_two_subplots ret = None - try: - fig = kwargs.get("figure", plt.gcf()) - plt.clf() - - for ret in gen_plots(f, fig, **kwargs): - tm.assert_is_valid_plot_return_object(ret) + fig = kwargs.get("figure", plt.gcf()) + fig.clf() - finally: - plt.close(fig) + for ret in gen_plots(f, fig, **kwargs): + assert_is_valid_plot_return_object(ret) return ret diff --git a/pandas/tests/plotting/conftest.py b/pandas/tests/plotting/conftest.py index d688bbd47595c..eb5a1f1f6382e 100644 --- a/pandas/tests/plotting/conftest.py +++ b/pandas/tests/plotting/conftest.py @@ -1,5 +1,3 @@ -import gc - import numpy as np import pytest @@ -10,23 +8,8 @@ @pytest.fixture(autouse=True) -def mpl_cleanup(): - # matplotlib/testing/decorators.py#L24 - # 1) Resets units registry - # 2) Resets rc_context - # 3) Closes all figures - mpl = pytest.importorskip("matplotlib") - mpl_units = pytest.importorskip("matplotlib.units") - plt = pytest.importorskip("matplotlib.pyplot") - orig_units_registry = mpl_units.registry.copy() - with mpl.rc_context(): - mpl.use("template") - yield - mpl_units.registry.clear() - mpl_units.registry.update(orig_units_registry) - plt.close("all") - # https://matplotlib.org/stable/users/prev_whats_new/whats_new_3.6.0.html#garbage-collection-is-no-longer-run-on-figure-close # noqa: E501 - gc.collect(1) +def autouse_mpl_cleanup(mpl_cleanup): + pass @pytest.fixture diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py index 45dc612148f40..3f274a336ad44 100644 --- a/pandas/tests/plotting/frame/test_frame.py +++ b/pandas/tests/plotting/frame/test_frame.py @@ -1,4 +1,5 @@ -""" Test cases for DataFrame.plot """ +"""Test cases for DataFrame.plot""" + from datetime import ( date, datetime, @@ -44,6 +45,7 @@ _check_visible, get_y_axis, ) +from pandas.util.version import Version from pandas.io.formats.printing import pprint_thing @@ -204,7 +206,7 @@ def test_plot_multiindex_unicode(self): columns=columns, index=index, ) - _check_plot_works(df.plot, title="\u03A3") + _check_plot_works(df.plot, title="\u03a3") @pytest.mark.slow @pytest.mark.parametrize("layout", [None, (-1, 1)]) @@ -772,6 +774,16 @@ def test_bar_nan_stacked(self): expected = [0.0, 0.0, 0.0, 10.0, 0.0, 20.0, 15.0, 10.0, 40.0] assert result == expected + def test_bar_stacked_label_position_with_zero_height(self): + # GH 59429 + df = DataFrame({"A": [3, 0, 1], "B": [0, 2, 4], "C": [5, 0, 2]}) + ax = df.plot.bar(stacked=True) + ax.bar_label(ax.containers[-1]) + expected = [8.0, 2.0, 7.0] + result = [text.xy[1] for text in ax.texts] + tm.assert_almost_equal(result, expected) + plt.close("all") + @pytest.mark.parametrize("idx", [Index, pd.CategoricalIndex]) def test_bar_categorical(self, idx): # GH 13019 @@ -828,14 +840,26 @@ def test_plot_scatter_shape(self): axes = df.plot(x="x", y="y", kind="scatter", subplots=True) _check_axes_shape(axes, axes_num=1, layout=(1, 1)) - def test_raise_error_on_datetime_time_data(self): - # GH 8113, datetime.time type is not supported by matplotlib in scatter + def test_scatter_on_datetime_time_data(self): + # datetime.time type is now supported in scatter, since a converter + # is implemented in ScatterPlot df = DataFrame(np.random.default_rng(2).standard_normal(10), columns=["a"]) df["dtime"] = date_range(start="2014-01-01", freq="h", periods=10).time - msg = "must be a string or a (real )?number, not 'datetime.time'" + df.plot(kind="scatter", x="dtime", y="a") - with 
pytest.raises(TypeError, match=msg): - df.plot(kind="scatter", x="dtime", y="a") + def test_scatter_line_xticks(self): + # GH#61005 + df = DataFrame( + [(datetime(year=2025, month=1, day=1, hour=n), n) for n in range(3)], + columns=["datetime", "y"], + ) + fig, ax = plt.subplots(2, sharex=True) + df.plot.scatter(x="datetime", y="y", ax=ax[0]) + scatter_xticks = ax[0].get_xticks() + df.plot(x="datetime", y="y", ax=ax[1]) + line_xticks = ax[1].get_xticks() + assert scatter_xticks[0] == line_xticks[0] + assert scatter_xticks[-1] == line_xticks[-1] @pytest.mark.parametrize("x, y", [("dates", "vals"), (0, 1)]) def test_scatterplot_datetime_data(self, x, y): @@ -1058,28 +1082,43 @@ def test_boxplot_series_positions(self, hist_df): tm.assert_numpy_array_equal(ax.xaxis.get_ticklocs(), positions) assert len(ax.lines) == 7 * len(numeric_cols) + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") + @pytest.mark.xfail( + Version(mpl.__version__) >= Version("3.10"), + reason="Fails starting with matplotlib 3.10", + ) def test_boxplot_vertical(self, hist_df): df = hist_df numeric_cols = df._get_numeric_data().columns labels = [pprint_thing(c) for c in numeric_cols] # if horizontal, yticklabels are rotated - ax = df.plot.box(rot=50, fontsize=8, vert=False) + kwargs = ( + {"vert": False} + if Version(mpl.__version__) < Version("3.10") + else {"orientation": "horizontal"} + ) + ax = df.plot.box(rot=50, fontsize=8, **kwargs) _check_ticks_props(ax, xrot=0, yrot=50, ylabelsize=8) _check_text_labels(ax.get_yticklabels(), labels) assert len(ax.lines) == 7 * len(numeric_cols) - @pytest.mark.filterwarnings("ignore:Attempt:UserWarning") + @pytest.mark.filterwarnings("ignore::UserWarning") + @pytest.mark.xfail( + Version(mpl.__version__) >= Version("3.10"), + reason="Fails starting with matplotlib version 3.10", + ) def test_boxplot_vertical_subplots(self, hist_df): df = hist_df numeric_cols = df._get_numeric_data().columns labels = [pprint_thing(c) for c in numeric_cols] + kwargs = ( + {"vert": False} + if Version(mpl.__version__) < Version("3.10") + else {"orientation": "horizontal"} + ) axes = _check_plot_works( - df.plot.box, - default_axes=True, - subplots=True, - vert=False, - logx=True, + df.plot.box, default_axes=True, subplots=True, logx=True, **kwargs ) _check_axes_shape(axes, axes_num=3, layout=(1, 3)) _check_ax_scales(axes, xaxis="log") @@ -1087,12 +1126,22 @@ def test_boxplot_vertical_subplots(self, hist_df): _check_text_labels(ax.get_yticklabels(), [label]) assert len(ax.lines) == 7 + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") + @pytest.mark.xfail( + Version(mpl.__version__) >= Version("3.10"), + reason="Fails starting with matplotlib 3.10", + ) def test_boxplot_vertical_positions(self, hist_df): df = hist_df numeric_cols = df._get_numeric_data().columns labels = [pprint_thing(c) for c in numeric_cols] positions = np.array([3, 2, 8]) - ax = df.plot.box(positions=positions, vert=False) + kwargs = ( + {"vert": False} + if Version(mpl.__version__) < Version("3.10") + else {"orientation": "horizontal"} + ) + ax = df.plot.box(positions=positions, **kwargs) _check_text_labels(ax.get_yticklabels(), labels) tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), positions) assert len(ax.lines) == 7 * len(numeric_cols) @@ -1119,7 +1168,7 @@ def test_boxplot_return_type_invalid_type(self, return_type): def test_kde_df(self): pytest.importorskip("scipy") - df = DataFrame(np.random.default_rng(2).standard_normal((100, 4))) + df = 
DataFrame(np.random.default_rng(2).standard_normal((10, 4))) ax = _check_plot_works(df.plot, kind="kde") expected = [pprint_thing(c) for c in df.columns] _check_legend_labels(ax, labels=expected) @@ -1176,20 +1225,16 @@ def test_hist_df_series(self): _check_ticks_props(axes, xrot=40, yrot=0) def test_hist_df_series_cumulative_density(self): - from matplotlib.patches import Rectangle - series = Series(np.random.default_rng(2).random(10)) ax = series.plot.hist(cumulative=True, bins=4, density=True) # height of last bin (index 5) must be 1.0 - rects = [x for x in ax.get_children() if isinstance(x, Rectangle)] + rects = [x for x in ax.get_children() if isinstance(x, mpl.patches.Rectangle)] tm.assert_almost_equal(rects[-1].get_height(), 1.0) def test_hist_df_series_cumulative(self): - from matplotlib.patches import Rectangle - series = Series(np.random.default_rng(2).random(10)) ax = series.plot.hist(cumulative=True, bins=4) - rects = [x for x in ax.get_children() if isinstance(x, Rectangle)] + rects = [x for x in ax.get_children() if isinstance(x, mpl.patches.Rectangle)] tm.assert_almost_equal(rects[-2].get_height(), 10.0) @@ -1199,12 +1244,10 @@ def test_hist_df_orientation(self): axes = df.plot.hist(rot=50, fontsize=8, orientation="horizontal") _check_ticks_props(axes, xrot=0, yrot=50, ylabelsize=8) - @pytest.mark.parametrize( - "weights", [0.1 * np.ones(shape=(100,)), 0.1 * np.ones(shape=(100, 2))] - ) - def test_hist_weights(self, weights): + @pytest.mark.parametrize("weight_shape", [(100,), (100, 2)]) + def test_hist_weights(self, weight_shape): # GH 33173 - + weights = 0.1 * np.ones(shape=weight_shape) df = DataFrame( dict(zip(["A", "B"], np.random.default_rng(2).standard_normal((2, 100)))) ) @@ -1386,8 +1429,6 @@ def test_plot_int_columns(self): ], ) def test_style_by_column(self, markers): - import matplotlib.pyplot as plt - fig = plt.gcf() fig.clf() fig.add_subplot(111) @@ -1630,7 +1671,7 @@ def test_pie_df_subplots(self): for ax in axes: _check_text_labels(ax.texts, df.index) for ax, ylabel in zip(axes, df.columns): - assert ax.get_ylabel() == ylabel + assert ax.get_ylabel() == "" def test_pie_df_labels_colors(self): df = DataFrame( @@ -1970,9 +2011,6 @@ def test_sharex_and_ax(self): # https://github.com/pandas-dev/pandas/issues/9737 using gridspec, # the axis in fig.get_axis() are sorted differently than pandas # expected them, so make sure that only the right ones are removed - import matplotlib.pyplot as plt - - plt.close("all") gs, axes = _generate_4_axes_via_gridspec() df = DataFrame( @@ -2002,7 +2040,7 @@ def _check(axes): plt.close("all") gs, axes = _generate_4_axes_via_gridspec() - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="sharex and sharey"): axes = df.plot(subplots=True, ax=axes, sharex=True) _check(axes) @@ -2010,8 +2048,6 @@ def test_sharex_false_and_ax(self): # https://github.com/pandas-dev/pandas/issues/9737 using gridspec, # the axis in fig.get_axis() are sorted differently than pandas # expected them, so make sure that only the right ones are removed - import matplotlib.pyplot as plt - df = DataFrame( { "a": [1, 2, 3, 4, 5, 6], @@ -2036,8 +2072,6 @@ def test_sharey_and_ax(self): # https://github.com/pandas-dev/pandas/issues/9737 using gridspec, # the axis in fig.get_axis() are sorted differently than pandas # expected them, so make sure that only the right ones are removed - import matplotlib.pyplot as plt - gs, axes = _generate_4_axes_via_gridspec() df = DataFrame( @@ -2066,7 +2100,7 @@ def _check(axes): 
plt.close("all") gs, axes = _generate_4_axes_via_gridspec() - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="sharex and sharey"): axes = df.plot(subplots=True, ax=axes, sharey=True) gs.tight_layout(plt.gcf()) @@ -2074,8 +2108,6 @@ def _check(axes): def test_sharey_and_ax_tight(self): # https://github.com/pandas-dev/pandas/issues/9737 using gridspec, - import matplotlib.pyplot as plt - df = DataFrame( { "a": [1, 2, 3, 4, 5, 6], @@ -2135,9 +2167,6 @@ def test_memory_leak(self, kind): def test_df_gridspec_patterns_vert_horiz(self): # GH 10819 - from matplotlib import gridspec - import matplotlib.pyplot as plt - ts = Series( np.random.default_rng(2).standard_normal(10), index=date_range("1/1/2000", periods=10), @@ -2150,14 +2179,14 @@ def test_df_gridspec_patterns_vert_horiz(self): ) def _get_vertical_grid(): - gs = gridspec.GridSpec(3, 1) + gs = mpl.gridspec.GridSpec(3, 1) fig = plt.figure() ax1 = fig.add_subplot(gs[:2, :]) ax2 = fig.add_subplot(gs[2, :]) return ax1, ax2 def _get_horizontal_grid(): - gs = gridspec.GridSpec(1, 3) + gs = mpl.gridspec.GridSpec(1, 3) fig = plt.figure() ax1 = fig.add_subplot(gs[:, :2]) ax2 = fig.add_subplot(gs[:, 2]) @@ -2187,7 +2216,7 @@ def _get_horizontal_grid(): # vertical / subplots / sharex=True / sharey=True ax1, ax2 = _get_vertical_grid() - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="sharex and sharey"): axes = df.plot(subplots=True, ax=[ax1, ax2], sharex=True, sharey=True) assert len(axes[0].lines) == 1 assert len(axes[1].lines) == 1 @@ -2203,7 +2232,7 @@ def _get_horizontal_grid(): # horizontal / subplots / sharex=True / sharey=True ax1, ax2 = _get_horizontal_grid() - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="sharex and sharey"): axes = df.plot(subplots=True, ax=[ax1, ax2], sharex=True, sharey=True) assert len(axes[0].lines) == 1 assert len(axes[1].lines) == 1 @@ -2218,9 +2247,6 @@ def _get_horizontal_grid(): def test_df_gridspec_patterns_boxed(self): # GH 10819 - from matplotlib import gridspec - import matplotlib.pyplot as plt - ts = Series( np.random.default_rng(2).standard_normal(10), index=date_range("1/1/2000", periods=10), @@ -2228,7 +2254,7 @@ def test_df_gridspec_patterns_boxed(self): # boxed def _get_boxed_grid(): - gs = gridspec.GridSpec(3, 3) + gs = mpl.gridspec.GridSpec(3, 3) fig = plt.figure() ax1 = fig.add_subplot(gs[:2, :2]) ax2 = fig.add_subplot(gs[:2, 2]) @@ -2253,7 +2279,7 @@ def _get_boxed_grid(): # subplots / sharex=True / sharey=True axes = _get_boxed_grid() - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="sharex and sharey"): axes = df.plot(subplots=True, ax=axes, sharex=True, sharey=True) for ax in axes: assert len(ax.lines) == 1 @@ -2487,8 +2513,14 @@ def test_group_subplot_invalid_column_name(self): d = {"a": np.arange(10), "b": np.arange(10)} df = DataFrame(d) - with pytest.raises(ValueError, match=r"Column label\(s\) \['bad_name'\]"): - df.plot(subplots=[("a", "bad_name")]) + if Version(np.__version__) < Version("2.0.0"): + with pytest.raises(ValueError, match=r"Column label\(s\) \['bad_name'\]"): + df.plot(subplots=[("a", "bad_name")]) + else: + with pytest.raises( + ValueError, match=r"Column label\(s\) \[np\.str\_\('bad_name'\)\]" + ): + df.plot(subplots=[("a", "bad_name")]) def test_group_subplot_duplicated_column(self): d = {"a": np.arange(10), "b": np.arange(10), "c": np.arange(10)} @@ -2579,10 +2611,31 
@@ def test_plot_no_warning(self): _ = df.plot() _ = df.T.plot() + @pytest.mark.parametrize("freq", ["h", "7h", "60min", "120min", "3M"]) + def test_plot_period_index_makes_no_right_shift(self, freq): + # GH#57587 + idx = pd.period_range("01/01/2000", freq=freq, periods=4) + df = DataFrame( + np.array([0, 1, 0, 1]), + index=idx, + columns=["A"], + ) + expected = idx.values -def _generate_4_axes_via_gridspec(): - import matplotlib.pyplot as plt + ax = df.plot() + result = ax.get_lines()[0].get_xdata() + assert all(str(result[i]) == str(expected[i]) for i in range(4)) + + def test_plot_display_xlabel_and_xticks(self): + # GH#44050 + df = DataFrame(np.random.default_rng(2).random((10, 2)), columns=["a", "b"]) + ax = df.plot.hexbin(x="a", y="b") + + _check_visible([ax.xaxis.get_label()], visible=True) + _check_visible(ax.get_xticklabels(), visible=True) + +def _generate_4_axes_via_gridspec(): gs = mpl.gridspec.GridSpec(2, 2) ax_tl = plt.subplot(gs[0, 0]) ax_ll = plt.subplot(gs[1, 0]) diff --git a/pandas/tests/plotting/frame/test_frame_color.py b/pandas/tests/plotting/frame/test_frame_color.py index ff1edd323ef28..5e5c3539f3283 100644 --- a/pandas/tests/plotting/frame/test_frame_color.py +++ b/pandas/tests/plotting/frame/test_frame_color.py @@ -1,4 +1,5 @@ -""" Test cases for DataFrame.plot """ +"""Test cases for DataFrame.plot""" + import re import numpy as np @@ -12,7 +13,6 @@ _check_plot_works, _unpack_cycler, ) -from pandas.util.version import Version mpl = pytest.importorskip("matplotlib") plt = pytest.importorskip("matplotlib.pyplot") @@ -30,11 +30,10 @@ def _check_colors_box(bp, box_c, whiskers_c, medians_c, caps_c="k", fliers_c=Non class TestDataFrameColor: - @pytest.mark.parametrize( - "color", ["C0", "C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9"] - ) + @pytest.mark.parametrize("color", list(range(10))) def test_mpl2_color_cycle_str(self, color): # GH 15516 + color = f"C{color}" df = DataFrame( np.random.default_rng(2).standard_normal((10, 3)), columns=["a", "b", "c"] ) @@ -95,6 +94,18 @@ def test_color_and_marker(self, color, expected): assert all(i.get_linestyle() == "--" for i in ax.lines) assert all(i.get_marker() == "d" for i in ax.lines) + def test_color_and_style(self): + color = {"g": "black", "h": "brown"} + style = {"g": "-", "h": "--"} + expected_color = ["black", "brown"] + expected_style = ["-", "--"] + df = DataFrame({"g": [1, 2], "h": [2, 3]}, index=[1, 2]) + ax = df.plot.line(color=color, style=style) + color = [i.get_color() for i in ax.lines] + style = [i.get_linestyle() for i in ax.lines] + assert color == expected_color + assert style == expected_style + def test_bar_colors(self): default_colors = _unpack_cycler(plt.rcParams) @@ -205,8 +216,53 @@ def test_scatter_with_c_column_name_with_colors(self, cmap): ax = df.plot.scatter(x=0, y=1, cmap=cmap, c="species") else: ax = df.plot.scatter(x=0, y=1, c="species", cmap=cmap) + + assert len(np.unique(ax.collections[0].get_facecolor(), axis=0)) == 3 # r/g/b + assert ( + np.unique(ax.collections[0].get_facecolor(), axis=0) + == np.array( + [[0.0, 0.0, 1.0, 1.0], [0.0, 0.5, 0.0, 1.0], [1.0, 0.0, 0.0, 1.0]] + ) # r/g/b + ).all() assert ax.collections[0].colorbar is None + def test_scatter_with_c_column_name_without_colors(self): + # Given + colors = ["NY", "MD", "MA", "CA"] + color_count = 4 # 4 unique colors + + # When + df = DataFrame( + { + "dataX": range(100), + "dataY": range(100), + "color": (colors[i % len(colors)] for i in range(100)), + } + ) + + # Then + ax = df.plot.scatter("dataX", "dataY", c="color") + 
assert len(np.unique(ax.collections[0].get_facecolor(), axis=0)) == color_count + + # Given + colors = ["r", "g", "not-a-color"] + color_count = 3 + # Also, since not all are mpl-colors, points matching 'r' or 'g' + # are not necessarily red or green + + # When + df = DataFrame( + { + "dataX": range(100), + "dataY": range(100), + "color": (colors[i % len(colors)] for i in range(100)), + } + ) + + # Then + ax = df.plot.scatter("dataX", "dataY", c="color") + assert len(np.unique(ax.collections[0].get_facecolor(), axis=0)) == color_count + def test_scatter_colors(self): df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3]}) with pytest.raises(TypeError, match="Specify exactly one of `c` and `color`"): @@ -217,7 +273,14 @@ def test_scatter_colors_not_raising_warnings(self): # provided via 'c'. Parameters 'cmap' will be ignored df = DataFrame({"x": [1, 2, 3], "y": [1, 2, 3]}) with tm.assert_produces_warning(None): - df.plot.scatter(x="x", y="y", c="b") + ax = df.plot.scatter(x="x", y="y", c="b") + assert ( + len(np.unique(ax.collections[0].get_facecolor(), axis=0)) == 1 + ) # blue + assert ( + np.unique(ax.collections[0].get_facecolor(), axis=0) + == np.array([[0.0, 0.0, 1.0, 1.0]]) + ).all() # blue def test_scatter_colors_default(self): df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3]}) @@ -364,14 +427,16 @@ def test_line_colors_and_styles_subplots_list_styles(self): _check_colors(ax.get_lines(), linecolors=[c]) def test_area_colors(self): - from matplotlib.collections import PolyCollection - custom_colors = "rgcby" df = DataFrame(np.random.default_rng(2).random((5, 5))) ax = df.plot.area(color=custom_colors) _check_colors(ax.get_lines(), linecolors=custom_colors) - poly = [o for o in ax.get_children() if isinstance(o, PolyCollection)] + poly = [ + o + for o in ax.get_children() + if isinstance(o, mpl.collections.PolyCollection) + ] _check_colors(poly, facecolors=custom_colors) handles, _ = ax.get_legend_handles_labels() @@ -381,14 +446,15 @@ def test_area_colors(self): assert h.get_alpha() is None def test_area_colors_poly(self): - from matplotlib import cm - from matplotlib.collections import PolyCollection - df = DataFrame(np.random.default_rng(2).random((5, 5))) ax = df.plot.area(colormap="jet") - jet_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] + jet_colors = [mpl.cm.jet(n) for n in np.linspace(0, 1, len(df))] _check_colors(ax.get_lines(), linecolors=jet_colors) - poly = [o for o in ax.get_children() if isinstance(o, PolyCollection)] + poly = [ + o + for o in ax.get_children() + if isinstance(o, mpl.collections.PolyCollection) + ] _check_colors(poly, facecolors=jet_colors) handles, _ = ax.get_legend_handles_labels() @@ -397,15 +463,16 @@ def test_area_colors_poly(self): assert h.get_alpha() is None def test_area_colors_stacked_false(self): - from matplotlib import cm - from matplotlib.collections import PolyCollection - df = DataFrame(np.random.default_rng(2).random((5, 5))) - jet_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] + jet_colors = [mpl.cm.jet(n) for n in np.linspace(0, 1, len(df))] # When stacked=False, alpha is set to 0.5 - ax = df.plot.area(colormap=cm.jet, stacked=False) + ax = df.plot.area(colormap=mpl.cm.jet, stacked=False) _check_colors(ax.get_lines(), linecolors=jet_colors) - poly = [o for o in ax.get_children() if isinstance(o, PolyCollection)] + poly = [ + o + for o in ax.get_children() + if isinstance(o, mpl.collections.PolyCollection) + ] jet_with_alpha = [(c[0], c[1], c[2], 0.5) for c in jet_colors] _check_colors(poly, 
facecolors=jet_with_alpha) @@ -647,10 +714,7 @@ def test_colors_of_columns_with_same_name(self): df_concat = pd.concat([df, df1], axis=1) result = df_concat.plot() legend = result.get_legend() - if Version(mpl.__version__) < Version("3.7"): - handles = legend.legendHandles - else: - handles = legend.legend_handles + handles = legend.legend_handles for legend, line in zip(handles, result.lines): assert legend.get_color() == line.get_color() diff --git a/pandas/tests/plotting/frame/test_frame_groupby.py b/pandas/tests/plotting/frame/test_frame_groupby.py index f1924185a3df1..b7e147bbabde5 100644 --- a/pandas/tests/plotting/frame/test_frame_groupby.py +++ b/pandas/tests/plotting/frame/test_frame_groupby.py @@ -1,4 +1,4 @@ -""" Test cases for DataFrame.plot """ +"""Test cases for DataFrame.plot""" import pytest diff --git a/pandas/tests/plotting/frame/test_frame_legend.py b/pandas/tests/plotting/frame/test_frame_legend.py index 402a4b9531e5d..755293e0bf6d7 100644 --- a/pandas/tests/plotting/frame/test_frame_legend.py +++ b/pandas/tests/plotting/frame/test_frame_legend.py @@ -12,7 +12,6 @@ _check_legend_marker, _check_text_labels, ) -from pandas.util.version import Version mpl = pytest.importorskip("matplotlib") @@ -26,22 +25,16 @@ class TestFrameLegend: ) def test_mixed_yerr(self): # https://github.com/pandas-dev/pandas/issues/39522 - from matplotlib.collections import LineCollection - from matplotlib.lines import Line2D - df = DataFrame([{"x": 1, "a": 1, "b": 1}, {"x": 2, "a": 2, "b": 3}]) ax = df.plot("x", "a", c="orange", yerr=0.1, label="orange") df.plot("x", "b", c="blue", yerr=None, ax=ax, label="blue") legend = ax.get_legend() - if Version(mpl.__version__) < Version("3.7"): - result_handles = legend.legendHandles - else: - result_handles = legend.legend_handles + result_handles = legend.legend_handles - assert isinstance(result_handles[0], LineCollection) - assert isinstance(result_handles[1], Line2D) + assert isinstance(result_handles[0], mpl.collections.LineCollection) + assert isinstance(result_handles[1], mpl.lines.Line2D) def test_legend_false(self): # https://github.com/pandas-dev/pandas/issues/40044 @@ -51,10 +44,7 @@ def test_legend_false(self): ax = df.plot(legend=True, color={"a": "blue", "b": "green"}, secondary_y="b") df2.plot(legend=True, color={"d": "red"}, ax=ax) legend = ax.get_legend() - if Version(mpl.__version__) < Version("3.7"): - handles = legend.legendHandles - else: - handles = legend.legend_handles + handles = legend.legend_handles result = [handle.get_color() for handle in handles] expected = ["blue", "green", "red"] assert result == expected diff --git a/pandas/tests/plotting/frame/test_frame_subplots.py b/pandas/tests/plotting/frame/test_frame_subplots.py index 4d8d8fa4cdee3..b44725a01fe23 100644 --- a/pandas/tests/plotting/frame/test_frame_subplots.py +++ b/pandas/tests/plotting/frame/test_frame_subplots.py @@ -1,4 +1,4 @@ -""" Test cases for DataFrame.plot """ +"""Test cases for DataFrame.plot""" import string @@ -187,9 +187,9 @@ def test_subplots_timeseries_y_axis_not_supported(self): data = { "numeric": np.array([1, 2, 5]), "period": [ - pd.Period("2017-08-01 00:00:00", freq="H"), - pd.Period("2017-08-01 02:00", freq="H"), - pd.Period("2017-08-02 00:00:00", freq="H"), + pd.Period("2017-08-01 00:00:00", freq="h"), + pd.Period("2017-08-01 02:00", freq="h"), + pd.Period("2017-08-02 00:00:00", freq="h"), ], "categorical": pd.Categorical( ["c", "b", "a"], categories=["a", "b", "c"], ordered=False @@ -327,7 +327,7 @@ def test_subplots_multiple_axes_error(self): 
def test_subplots_multiple_axes_2_dim(self, layout, exp_layout): # GH 5353, 6970, GH 7069 # pass 2-dim axes and invalid layout - # invalid lauout should not affect to input and return value + # invalid layout should not affect the input and return value # (show warning is tested in # TestDataFrameGroupByPlots.test_grouped_box_multiple_axes _, axes = mpl.pyplot.subplots(2, 2) @@ -335,7 +335,7 @@ def test_subplots_multiple_axes_2_dim(self, layout, exp_layout): np.random.default_rng(2).random((10, 4)), index=list(string.ascii_letters[:10]), ) - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="layout keyword is ignored"): returned = df.plot( subplots=True, ax=axes, layout=layout, sharex=False, sharey=False ) @@ -501,7 +501,7 @@ def test_df_subplots_patterns_minorticks_1st_ax_hidden(self): columns=list("AB"), ) _, axes = plt.subplots(2, 1) - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="sharex and sharey"): axes = df.plot(subplots=True, ax=axes, sharex=True) for ax in axes: assert len(ax.lines) == 1 @@ -544,7 +544,7 @@ def test_subplots_sharex_false(self): tm.assert_numpy_array_equal(axs[0].get_xticks(), expected_ax1) tm.assert_numpy_array_equal(axs[1].get_xticks(), expected_ax2) - def test_subplots_constrained_layout(self): + def test_subplots_constrained_layout(self, temp_file): # GH 25261 idx = date_range(start="now", periods=10) df = DataFrame(np.random.default_rng(2).random((10, 3)), index=idx) @@ -554,7 +554,7 @@ def test_subplots_constrained_layout(self): _, axes = mpl.pyplot.subplots(2, **kwargs) with tm.assert_produces_warning(None): df.plot(ax=axes[0]) - with tm.ensure_clean(return_filelike=True) as path: + with temp_file.open(mode="wb") as path: mpl.pyplot.savefig(path) @pytest.mark.parametrize( diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py index 76f7fa1f22eec..2267b6197cd80 100644 --- a/pandas/tests/plotting/test_boxplot_method.py +++ b/pandas/tests/plotting/test_boxplot_method.py @@ -1,4 +1,6 @@ -""" Test cases for .boxplot method """ +"""Test cases for .boxplot method""" + +from __future__ import annotations import itertools import string @@ -22,6 +24,7 @@ _check_ticks_props, _check_visible, ) +from pandas.util.version import Version from pandas.io.formats.printing import pprint_thing @@ -35,12 +38,21 @@ def _check_ax_limits(col, ax): assert y_max >= col.max() +if Version(mpl.__version__) < Version("3.10"): + verts: list[dict[str, bool | str]] = [{"vert": False}, {"vert": True}] +else: + verts = [{"orientation": "horizontal"}, {"orientation": "vertical"}] + + +@pytest.fixture(params=verts) +def vert(request): + return request.param + + class TestDataFramePlots: def test_stacked_boxplot_set_axis(self): # GH2980 - import matplotlib.pyplot as plt - - n = 80 + n = 30 df = DataFrame( { "Clinical": np.random.default_rng(2).choice([0, 1, 2, 3], n), "Confirmed": np.random.default_rng(2).choice([0, 1, 2, 3], n), "Discarded": np.random.default_rng(2).choice([0, 1, 2, 3], n), }, index=np.arange(0, n), ) ax = df.plot(kind="bar", stacked=True) assert [int(x.get_text()) for x in ax.get_xticklabels()] == df.index.to_list() - ax.set_xticks(np.arange(0, 80, 10)) + ax.set_xticks(np.arange(0, n, 10)) plt.draw() # Update changes assert [int(x.get_text()) for x in ax.get_xticklabels()] == list( - np.arange(0, 80, 10) + np.arange(0, n, 10) ) @pytest.mark.slow @@ -129,7 +141,8 @@ def test_boxplot_legacy2_with_multi_col(self): df["Y"] = Series(["A"] * 10) # Multiple columns with an ax argument should use same figure fig, ax =
mpl.pyplot.subplots() - with tm.assert_produces_warning(UserWarning): + msg = "the figure containing the passed axes is being cleared" + with tm.assert_produces_warning(UserWarning, match=msg): axes = df.boxplot( column=["Col1", "Col2"], by="X", ax=ax, return_type="axes" ) @@ -226,12 +239,12 @@ def test_boxplot_numeric_data(self): # GH 22799 df = DataFrame( { - "a": date_range("2012-01-01", periods=100), - "b": np.random.default_rng(2).standard_normal(100), - "c": np.random.default_rng(2).standard_normal(100) + 2, - "d": date_range("2012-01-01", periods=100).astype(str), - "e": date_range("2012-01-01", periods=100, tz="UTC"), - "f": timedelta_range("1 days", periods=100), + "a": date_range("2012-01-01", periods=10), + "b": np.random.default_rng(2).standard_normal(10), + "c": np.random.default_rng(2).standard_normal(10) + 2, + "d": date_range("2012-01-01", periods=10).astype(str), + "e": date_range("2012-01-01", periods=10, tz="UTC"), + "f": timedelta_range("1 days", periods=10), } ) ax = df.plot(kind="box") @@ -281,8 +294,6 @@ def test_color_kwd(self, colors_kwd, expected): def test_colors_in_theme(self, scheme, expected): # GH: 40769 df = DataFrame(np.random.default_rng(2).random((10, 2))) - import matplotlib.pyplot as plt - plt.style.use(scheme) result = df.plot.box(return_type="dict") for k, v in expected.items(): @@ -315,7 +326,7 @@ def test_specified_props_kwd(self, props, expected): assert result[expected][0].get_color() == "C1" - @pytest.mark.parametrize("vert", [True, False]) + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") def test_plot_xlabel_ylabel(self, vert): df = DataFrame( { @@ -325,27 +336,26 @@ def test_plot_xlabel_ylabel(self, vert): } ) xlabel, ylabel = "x", "y" - ax = df.plot(kind="box", vert=vert, xlabel=xlabel, ylabel=ylabel) + ax = df.plot(kind="box", xlabel=xlabel, ylabel=ylabel, **vert) assert ax.get_xlabel() == xlabel assert ax.get_ylabel() == ylabel - @pytest.mark.parametrize("vert", [True, False]) + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") def test_plot_box(self, vert): # GH 54941 rng = np.random.default_rng(2) - df1 = DataFrame(rng.integers(0, 100, size=(100, 4)), columns=list("ABCD")) - df2 = DataFrame(rng.integers(0, 100, size=(100, 4)), columns=list("ABCD")) + df1 = DataFrame(rng.integers(0, 100, size=(10, 4)), columns=list("ABCD")) + df2 = DataFrame(rng.integers(0, 100, size=(10, 4)), columns=list("ABCD")) xlabel, ylabel = "x", "y" _, axs = plt.subplots(ncols=2, figsize=(10, 7), sharey=True) - df1.plot.box(ax=axs[0], vert=vert, xlabel=xlabel, ylabel=ylabel) - df2.plot.box(ax=axs[1], vert=vert, xlabel=xlabel, ylabel=ylabel) + df1.plot.box(ax=axs[0], xlabel=xlabel, ylabel=ylabel, **vert) + df2.plot.box(ax=axs[1], xlabel=xlabel, ylabel=ylabel, **vert) for ax in axs: assert ax.get_xlabel() == xlabel assert ax.get_ylabel() == ylabel - mpl.pyplot.close() - @pytest.mark.parametrize("vert", [True, False]) + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") def test_boxplot_xlabel_ylabel(self, vert): df = DataFrame( { @@ -355,11 +365,11 @@ def test_boxplot_xlabel_ylabel(self, vert): } ) xlabel, ylabel = "x", "y" - ax = df.boxplot(vert=vert, xlabel=xlabel, ylabel=ylabel) + ax = df.boxplot(xlabel=xlabel, ylabel=ylabel, **vert) assert ax.get_xlabel() == xlabel assert ax.get_ylabel() == ylabel - @pytest.mark.parametrize("vert", [True, False]) + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") def test_boxplot_group_xlabel_ylabel(self, vert): df = DataFrame( { @@ -369,14 +379,19 @@ def 
test_boxplot_group_xlabel_ylabel(self, vert): } ) xlabel, ylabel = "x", "y" - ax = df.boxplot(by="group", vert=vert, xlabel=xlabel, ylabel=ylabel) + ax = df.boxplot(by="group", xlabel=xlabel, ylabel=ylabel, **vert) for subplot in ax: assert subplot.get_xlabel() == xlabel assert subplot.get_ylabel() == ylabel - mpl.pyplot.close() - @pytest.mark.parametrize("vert", [True, False]) - def test_boxplot_group_no_xlabel_ylabel(self, vert): + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") + def test_boxplot_group_no_xlabel_ylabel(self, vert, request): + if Version(mpl.__version__) >= Version("3.10") and vert == { + "orientation": "horizontal" + }: + request.applymarker( + pytest.mark.xfail(reason=f"{vert} fails starting with matplotlib 3.10") + ) df = DataFrame( { "a": np.random.default_rng(2).standard_normal(10), @@ -384,11 +399,14 @@ def test_boxplot_group_no_xlabel_ylabel(self, vert): "group": np.random.default_rng(2).choice(["group1", "group2"], 10), } ) - ax = df.boxplot(by="group", vert=vert) + ax = df.boxplot(by="group", **vert) for subplot in ax: - target_label = subplot.get_xlabel() if vert else subplot.get_ylabel() + target_label = ( + subplot.get_xlabel() + if vert == {"vert": True} or vert == {"orientation": "vertical"} + else subplot.get_ylabel() + ) assert target_label == pprint_thing(["group"]) - mpl.pyplot.close() class TestDataFrameGroupByPlots: @@ -426,25 +444,6 @@ def test_boxplot_legacy2_return_type(self): axes = _check_plot_works(grouped.boxplot, subplots=False, return_type="axes") _check_axes_shape(axes, axes_num=1, layout=(1, 1)) - @pytest.mark.parametrize( - "subplots, warn, axes_num, layout", - [[True, UserWarning, 3, (2, 2)], [False, None, 1, (1, 1)]], - ) - def test_boxplot_legacy3(self, subplots, warn, axes_num, layout): - tuples = zip(string.ascii_letters[:10], range(10)) - df = DataFrame( - np.random.default_rng(2).random((10, 3)), - index=MultiIndex.from_tuples(tuples), - ) - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - grouped = df.unstack(level=1).groupby(level=0, axis=1) - with tm.assert_produces_warning(warn, check_stacklevel=False): - axes = _check_plot_works( - grouped.boxplot, subplots=subplots, return_type="axes" - ) - _check_axes_shape(axes, axes_num=axes_num, layout=layout) - def test_grouped_plot_fignums(self): n = 10 weight = Series(np.random.default_rng(2).normal(166, 20, size=n)) @@ -626,7 +625,7 @@ def test_grouped_box_multiple_axes(self, hist_df): # passes multiple axes to plot, hist or boxplot # location should be changed if other test is added # which has earlier alphabetical order - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="sharex and sharey"): _, axes = mpl.pyplot.subplots(2, 2) df.groupby("category").boxplot(column="height", return_type="axes", ax=axes) _check_axes_shape(mpl.pyplot.gcf().axes, axes_num=4, layout=(2, 2)) @@ -636,7 +635,7 @@ def test_grouped_box_multiple_axes_on_fig(self, hist_df): # GH 6970, GH 7069 df = hist_df fig, axes = mpl.pyplot.subplots(2, 3) - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="sharex and sharey"): returned = df.boxplot( column=["height", "weight", "category"], by="gender", @@ -649,7 +648,7 @@ def test_grouped_box_multiple_axes_on_fig(self, hist_df): assert returned[0].figure is fig # draw on second row - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="sharex and 
sharey"): returned = df.groupby("classroom").boxplot( column=["height", "weight", "category"], return_type="axes", ax=axes[1] ) @@ -663,10 +662,10 @@ def test_grouped_box_multiple_axes_ax_error(self, hist_df): # GH 6970, GH 7069 df = hist_df msg = "The number of passed axes must be 3, the same as the output plot" + _, axes = mpl.pyplot.subplots(2, 3) with pytest.raises(ValueError, match=msg): - fig, axes = mpl.pyplot.subplots(2, 3) # pass different number of axes from required - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="sharex and sharey"): axes = df.groupby("classroom").boxplot(ax=axes) def test_fontsize(self): @@ -759,3 +758,17 @@ def test_boxplot_multiindex_column(self): expected_xticklabel = ["(bar, one)", "(bar, two)"] result_xticklabel = [x.get_text() for x in axes.get_xticklabels()] assert expected_xticklabel == result_xticklabel + + @pytest.mark.parametrize("group", ["X", ["X", "Y"]]) + def test_boxplot_multi_groupby_groups(self, group): + # GH 14701 + rows = 20 + df = DataFrame( + np.random.default_rng(12).normal(size=(rows, 2)), columns=["Col1", "Col2"] + ) + df["X"] = Series(np.repeat(["A", "B"], int(rows / 2))) + df["Y"] = Series(np.tile(["C", "D"], int(rows / 2))) + grouped = df.groupby(group) + _check_plot_works(df.boxplot, by=group, default_axes=True) + _check_plot_works(df.plot.box, by=group, default_axes=True) + _check_plot_works(grouped.boxplot, default_axes=True) diff --git a/pandas/tests/plotting/test_converter.py b/pandas/tests/plotting/test_converter.py index f748d7c5fc758..cfdfa7f723599 100644 --- a/pandas/tests/plotting/test_converter.py +++ b/pandas/tests/plotting/test_converter.py @@ -34,15 +34,11 @@ Second, ) -try: - from pandas.plotting._matplotlib import converter -except ImportError: - # try / except, rather than skip, to avoid internal refactoring - # causing an improper skip - pass - -pytest.importorskip("matplotlib.pyplot") +plt = pytest.importorskip("matplotlib.pyplot") dates = pytest.importorskip("matplotlib.dates") +units = pytest.importorskip("matplotlib.units") + +from pandas.plotting._matplotlib import converter @pytest.mark.single_cpu @@ -79,30 +75,22 @@ def test_dont_register_by_default(self): assert subprocess.check_call(call) == 0 def test_registering_no_warning(self): - plt = pytest.importorskip("matplotlib.pyplot") s = Series(range(12), index=date_range("2017", periods=12)) _, ax = plt.subplots() # Set to the "warn" state, in case this isn't the first test run register_matplotlib_converters() ax.plot(s.index, s.values) - plt.close() def test_pandas_plots_register(self): - plt = pytest.importorskip("matplotlib.pyplot") s = Series(range(12), index=date_range("2017", periods=12)) # Set to the "warn" state, in case this isn't the first test run with tm.assert_produces_warning(None) as w: s.plot() - try: - assert len(w) == 0 - finally: - plt.close() + assert len(w) == 0 def test_matplotlib_formatters(self): - units = pytest.importorskip("matplotlib.units") - # Can't make any assertion about the start state. # We we check that toggling converters off removes it, and toggling it # on restores it. 
@@ -113,26 +101,19 @@ def test_matplotlib_formatters(self): assert Timestamp in units.registry def test_option_no_warning(self): - pytest.importorskip("matplotlib.pyplot") - ctx = cf.option_context("plotting.matplotlib.register_converters", False) - plt = pytest.importorskip("matplotlib.pyplot") s = Series(range(12), index=date_range("2017", periods=12)) _, ax = plt.subplots() # Test without registering first, no warning - with ctx: + with cf.option_context("plotting.matplotlib.register_converters", False): ax.plot(s.index, s.values) # Now test with registering register_matplotlib_converters() - with ctx: + with cf.option_context("plotting.matplotlib.register_converters", False): ax.plot(s.index, s.values) - plt.close() def test_registry_resets(self): - units = pytest.importorskip("matplotlib.units") - dates = pytest.importorskip("matplotlib.dates") - # make a copy, to reset to original = dict(units.registry) @@ -215,7 +196,7 @@ def test_conversion_float(self, dtc): rtol = 0.5 * 10**-9 rs = dtc.convert(Timestamp("2012-1-1 01:02:03", tz="UTC"), None, None) - xp = converter.mdates.date2num(Timestamp("2012-1-1 01:02:03", tz="UTC")) + xp = dates.date2num(Timestamp("2012-1-1 01:02:03", tz="UTC")) tm.assert_almost_equal(rs, xp, rtol=rtol) rs = dtc.convert( @@ -236,10 +217,10 @@ def test_conversion_float(self, dtc): def test_conversion_outofbounds_datetime(self, dtc, values): # 2579 rs = dtc.convert(values, None, None) - xp = converter.mdates.date2num(values) + xp = dates.date2num(values) tm.assert_numpy_array_equal(rs, xp) rs = dtc.convert(values[0], None, None) - xp = converter.mdates.date2num(values[0]) + xp = dates.date2num(values[0]) assert rs == xp @pytest.mark.parametrize( @@ -262,7 +243,7 @@ def test_dateindex_conversion(self, freq, dtc): rtol = 10**-9 dateindex = date_range("2020-01-01", periods=10, freq=freq) rs = dtc.convert(dateindex, None, None) - xp = converter.mdates.date2num(dateindex._mpl_repr()) + xp = dates.date2num(dateindex._mpl_repr()) tm.assert_almost_equal(rs, xp, rtol=rtol) @pytest.mark.parametrize("offset", [Second(), Milli(), Micro(50)]) diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index 112172656b6ec..1275f3d6f7d6d 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -1,4 +1,5 @@ -""" Test cases for time series specific (freq conversion, etc) """ +"""Test cases for time series specific (freq conversion, etc)""" + from datetime import ( date, datetime, @@ -14,7 +15,8 @@ BaseOffset, to_offset, ) -from pandas._libs.tslibs.dtypes import freq_to_period_freqstr + +from pandas.core.dtypes.dtypes import PeriodDtype from pandas import ( DataFrame, @@ -42,6 +44,9 @@ from pandas.tseries.offsets import WeekOfMonth mpl = pytest.importorskip("matplotlib") +plt = pytest.importorskip("matplotlib.pyplot") + +import pandas.plotting._matplotlib.converter as conv class TestTSPlot: @@ -70,7 +75,7 @@ def test_fontsize_set_correctly(self): def test_frame_inferred(self): # inferred freq - idx = date_range("1/1/1987", freq="MS", periods=100) + idx = date_range("1/1/1987", freq="MS", periods=10) idx = DatetimeIndex(idx.values, freq=None) df = DataFrame( @@ -79,7 +84,7 @@ def test_frame_inferred(self): _check_plot_works(df.plot) # axes freq - idx = idx[0:40].union(idx[45:99]) + idx = idx[0:4].union(idx[6:]) df2 = DataFrame( np.random.default_rng(2).standard_normal((len(idx), 3)), index=idx ) @@ -108,7 +113,6 @@ def test_nonnumeric_exclude(self): fig, ax = mpl.pyplot.subplots() 
df.plot(ax=ax) # it works assert len(ax.get_lines()) == 1 # B was plotted - mpl.pyplot.close(fig) def test_nonnumeric_exclude_error(self): idx = date_range("1/1/1987", freq="YE", periods=3) @@ -119,7 +123,7 @@ def test_nonnumeric_exclude_error(self): @pytest.mark.parametrize("freq", ["s", "min", "h", "D", "W", "M", "Q", "Y"]) def test_tsplot_period(self, freq): - idx = period_range("12/31/1999", freq=freq, periods=100) + idx = period_range("12/31/1999", freq=freq, periods=10) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) _, ax = mpl.pyplot.subplots() _check_plot_works(ser.plot, ax=ax) @@ -128,7 +132,7 @@ def test_tsplot_period(self, freq): "freq", ["s", "min", "h", "D", "W", "ME", "QE-DEC", "YE", "1B30Min"] ) def test_tsplot_datetime(self, freq): - idx = date_range("12/31/1999", freq=freq, periods=100) + idx = date_range("12/31/1999", freq=freq, periods=10) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) _, ax = mpl.pyplot.subplots() _check_plot_works(ser.plot, ax=ax) @@ -142,10 +146,9 @@ def test_tsplot(self): color = (0.0, 0.0, 0.0, 1) assert color == ax.get_lines()[0].get_color() - def test_both_style_and_color(self): - ts = Series( - np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) - ) + @pytest.mark.parametrize("index", [None, date_range("2020-01-01", periods=10)]) + def test_both_style_and_color(self, index): + ts = Series(np.arange(10, dtype=np.float64), index=index) msg = ( "Cannot pass 'style' string with a color symbol and 'color' " "keyword argument. Please use one or the other or pass 'style' " @@ -154,46 +157,37 @@ def test_both_style_and_color(self): with pytest.raises(ValueError, match=msg): ts.plot(style="b-", color="#000099") - s = ts.reset_index(drop=True) - with pytest.raises(ValueError, match=msg): - s.plot(style="b-", color="#000099") - @pytest.mark.parametrize("freq", ["ms", "us"]) def test_high_freq(self, freq): _, ax = mpl.pyplot.subplots() - rng = date_range("1/1/2012", periods=100, freq=freq) + rng = date_range("1/1/2012", periods=10, freq=freq) ser = Series(np.random.default_rng(2).standard_normal(len(rng)), rng) _check_plot_works(ser.plot, ax=ax) def test_get_datevalue(self): - from pandas.plotting._matplotlib.converter import get_datevalue - - assert get_datevalue(None, "D") is None - assert get_datevalue(1987, "Y") == 1987 - assert get_datevalue(Period(1987, "Y"), "M") == Period("1987-12", "M").ordinal - assert get_datevalue("1/1/1987", "D") == Period("1987-1-1", "D").ordinal - - def test_ts_plot_format_coord(self): - def check_format_of_first_point(ax, expected_string): - first_line = ax.get_lines()[0] - first_x = first_line.get_xdata()[0].ordinal - first_y = first_line.get_ydata()[0] - assert expected_string == ax.format_coord(first_x, first_y) + assert conv.get_datevalue(None, "D") is None + assert conv.get_datevalue(1987, "Y") == 1987 + assert ( + conv.get_datevalue(Period(1987, "Y"), "M") == Period("1987-12", "M").ordinal + ) + assert conv.get_datevalue("1/1/1987", "D") == Period("1987-1-1", "D").ordinal - annual = Series(1, index=date_range("2014-01-01", periods=3, freq="YE-DEC")) + @pytest.mark.parametrize( + "freq, expected_string", + [["YE-DEC", "t = 2014 y = 1.000000"], ["D", "t = 2014-01-01 y = 1.000000"]], + ) + def test_ts_plot_format_coord(self, freq, expected_string): + ser = Series(1, index=date_range("2014-01-01", periods=3, freq=freq)) _, ax = mpl.pyplot.subplots() - annual.plot(ax=ax) - check_format_of_first_point(ax, "t = 2014 y = 1.000000") - - # note this is added to 
the annual plot already in existence, and - # changes its freq field - daily = Series(1, index=date_range("2014-01-01", periods=3, freq="D")) - daily.plot(ax=ax) - check_format_of_first_point(ax, "t = 2014-01-01 y = 1.000000") + ser.plot(ax=ax) + first_line = ax.get_lines()[0] + first_x = first_line.get_xdata()[0].ordinal + first_y = first_line.get_ydata()[0] + assert expected_string == ax.format_coord(first_x, first_y) @pytest.mark.parametrize("freq", ["s", "min", "h", "D", "W", "M", "Q", "Y"]) def test_line_plot_period_series(self, freq): - idx = period_range("12/31/1999", freq=freq, periods=100) + idx = period_range("12/31/1999", freq=freq, periods=10) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) _check_plot_works(ser.plot, ser.index.freq) @@ -203,7 +197,7 @@ def test_line_plot_period_series(self, freq): def test_line_plot_period_mlt_series(self, frqncy): # test period index line plot for series with multiples (`mlt`) of the # frequency (`frqncy`) rule code. tests resolution of issue #14763 - idx = period_range("12/31/1999", freq=frqncy, periods=100) + idx = period_range("12/31/1999", freq=frqncy, periods=10) s = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) _check_plot_works(s.plot, s.index.freq.rule_code) @@ -211,13 +205,13 @@ def test_line_plot_period_mlt_series(self, frqncy): "freq", ["s", "min", "h", "D", "W", "ME", "QE-DEC", "YE", "1B30Min"] ) def test_line_plot_datetime_series(self, freq): - idx = date_range("12/31/1999", freq=freq, periods=100) + idx = date_range("12/31/1999", freq=freq, periods=10) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) _check_plot_works(ser.plot, ser.index.freq.rule_code) @pytest.mark.parametrize("freq", ["s", "min", "h", "D", "W", "ME", "QE", "YE"]) def test_line_plot_period_frame(self, freq): - idx = date_range("12/31/1999", freq=freq, periods=100) + idx = date_range("12/31/1999", freq=freq, periods=10) df = DataFrame( np.random.default_rng(2).standard_normal((len(idx), 3)), index=idx, @@ -232,14 +226,13 @@ def test_line_plot_period_mlt_frame(self, frqncy): # test period index line plot for DataFrames with multiples (`mlt`) # of the frequency (`frqncy`) rule code. 
tests resolution of issue # #14763 - idx = period_range("12/31/1999", freq=frqncy, periods=100) + idx = period_range("12/31/1999", freq=frqncy, periods=10) df = DataFrame( np.random.default_rng(2).standard_normal((len(idx), 3)), index=idx, columns=["A", "B", "C"], ) - freq = freq_to_period_freqstr(1, df.index.freq.rule_code) - freq = df.index.asfreq(freq).freq + freq = df.index.freq.rule_code _check_plot_works(df.plot, freq) @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") @@ -247,13 +240,13 @@ def test_line_plot_period_mlt_frame(self, frqncy): "freq", ["s", "min", "h", "D", "W", "ME", "QE-DEC", "YE", "1B30Min"] ) def test_line_plot_datetime_frame(self, freq): - idx = date_range("12/31/1999", freq=freq, periods=100) + idx = date_range("12/31/1999", freq=freq, periods=10) df = DataFrame( np.random.default_rng(2).standard_normal((len(idx), 3)), index=idx, columns=["A", "B", "C"], ) - freq = freq_to_period_freqstr(1, df.index.freq.rule_code) + freq = PeriodDtype(df.index.freq)._freqstr freq = df.index.to_period(freq).freq _check_plot_works(df.plot, freq) @@ -261,7 +254,7 @@ def test_line_plot_datetime_frame(self, freq): "freq", ["s", "min", "h", "D", "W", "ME", "QE-DEC", "YE", "1B30Min"] ) def test_line_plot_inferred_freq(self, freq): - idx = date_range("12/31/1999", freq=freq, periods=100) + idx = date_range("12/31/1999", freq=freq, periods=10) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) ser = Series(ser.values, Index(np.asarray(ser.index))) _check_plot_works(ser.plot, ser.index.inferred_freq) @@ -293,27 +286,6 @@ def test_plot_multiple_inferred_freq(self): ser = Series(np.random.default_rng(2).standard_normal(len(dr)), index=dr) _check_plot_works(ser.plot) - @pytest.mark.xfail(reason="Api changed in 3.6.0") - def test_uhf(self): - import pandas.plotting._matplotlib.converter as conv - - idx = date_range("2012-6-22 21:59:51.960928", freq="ms", periods=500) - df = DataFrame( - np.random.default_rng(2).standard_normal((len(idx), 2)), index=idx - ) - - _, ax = mpl.pyplot.subplots() - df.plot(ax=ax) - axis = ax.get_xaxis() - - tlocs = axis.get_ticklocs() - tlabels = axis.get_ticklabels() - for loc, label in zip(tlocs, tlabels): - xp = conv._from_ordinal(loc).strftime("%H:%M:%S.%f") - rs = str(label.get_text()) - if len(rs): - assert xp == rs - def test_irreg_hf(self): idx = date_range("2012-6-22 21:59:51", freq="s", periods=10) df = DataFrame( @@ -369,8 +341,8 @@ def test_business_freq(self): def test_business_freq_convert(self): bts = Series( - np.arange(300, dtype=np.float64), - index=date_range("2020-01-01", periods=300, freq="B"), + np.arange(50, dtype=np.float64), + index=date_range("2020-01-01", periods=50, freq="B"), ).asfreq("BME") ts = bts.to_period("M") _, ax = mpl.pyplot.subplots() @@ -463,12 +435,8 @@ def test_axis_limits(self, obj): result = ax.get_xlim() assert int(result[0]) == expected[0].ordinal assert int(result[1]) == expected[1].ordinal - fig = ax.get_figure() - mpl.pyplot.close(fig) def test_get_finder(self): - import pandas.plotting._matplotlib.converter as conv - assert conv.get_finder(to_offset("B")) == conv._daily_finder assert conv.get_finder(to_offset("D")) == conv._daily_finder assert conv.get_finder(to_offset("ME")) == conv._monthly_finder @@ -571,7 +539,7 @@ def test_finder_annual(self): @pytest.mark.slow def test_finder_minutely(self): - nminutes = 50 * 24 * 60 + nminutes = 1 * 24 * 60 rng = date_range("1/1/1999", freq="Min", periods=nminutes) ser = 
Series(np.random.default_rng(2).standard_normal(len(rng)), rng) _, ax = mpl.pyplot.subplots() @@ -596,9 +564,9 @@ def test_finder_hourly(self): def test_gaps(self): ts = Series( - np.arange(30, dtype=np.float64), index=date_range("2020-01-01", periods=30) + np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) ) - ts.iloc[5:25] = np.nan + ts.iloc[5:7] = np.nan _, ax = mpl.pyplot.subplots() ts.plot(ax=ax) lines = ax.get_lines() @@ -610,8 +578,7 @@ def test_gaps(self): assert isinstance(data, np.ma.core.MaskedArray) mask = data.mask - assert mask[5:25, 1].all() - mpl.pyplot.close(ax.get_figure()) + assert mask[5:7, 1].all() def test_gaps_irregular(self): # irregular @@ -632,7 +599,6 @@ def test_gaps_irregular(self): assert isinstance(data, np.ma.core.MaskedArray) mask = data.mask assert mask[2:5, 1].all() - mpl.pyplot.close(ax.get_figure()) def test_gaps_non_ts(self): # non-ts @@ -653,9 +619,9 @@ def test_gaps_non_ts(self): def test_gap_upsample(self): low = Series( - np.arange(30, dtype=np.float64), index=date_range("2020-01-01", periods=30) + np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) ) - low.iloc[5:25] = np.nan + low.iloc[5:7] = np.nan _, ax = mpl.pyplot.subplots() low.plot(ax=ax) @@ -672,7 +638,7 @@ def test_gap_upsample(self): assert isinstance(data, np.ma.core.MaskedArray) mask = data.mask - assert mask[5:25, 1].all() + assert mask[5:7, 1].all() def test_secondary_y(self): ser = Series(np.random.default_rng(2).standard_normal(10)) @@ -686,7 +652,6 @@ def test_secondary_y(self): tm.assert_series_equal(ser, xp) assert ax.get_yaxis().get_ticks_position() == "right" assert not axes[0].get_yaxis().get_visible() - mpl.pyplot.close(fig) def test_secondary_y_yaxis(self): Series(np.random.default_rng(2).standard_normal(10)) @@ -694,7 +659,6 @@ def test_secondary_y_yaxis(self): _, ax2 = mpl.pyplot.subplots() ser2.plot(ax=ax2) assert ax2.get_yaxis().get_ticks_position() == "left" - mpl.pyplot.close(ax2.get_figure()) def test_secondary_both(self): ser = Series(np.random.default_rng(2).standard_normal(10)) @@ -720,7 +684,6 @@ def test_secondary_y_ts(self): tm.assert_series_equal(ser, xp) assert ax.get_yaxis().get_ticks_position() == "right" assert not axes[0].get_yaxis().get_visible() - mpl.pyplot.close(fig) def test_secondary_y_ts_yaxis(self): idx = date_range("1/1/2000", periods=10) @@ -728,7 +691,6 @@ def test_secondary_y_ts_yaxis(self): _, ax2 = mpl.pyplot.subplots() ser2.plot(ax=ax2) assert ax2.get_yaxis().get_ticks_position() == "left" - mpl.pyplot.close(ax2.get_figure()) def test_secondary_y_ts_visible(self): idx = date_range("1/1/2000", periods=10) @@ -946,7 +908,7 @@ def test_mixed_freq_shared_ax_twin_x(self): @pytest.mark.xfail(reason="TODO (GH14330, GH14322)") def test_mixed_freq_shared_ax_twin_x_irregular_first(self): # GH13341, using sharex=True - idx1 = date_range("2015-01-01", periods=3, freq="M") + idx1 = date_range("2015-01-01", periods=3, freq="ME") idx2 = idx1[:1].union(idx1[2:]) s1 = Series(range(len(idx1)), idx1) s2 = Series(range(len(idx2)), idx2) @@ -1127,8 +1089,8 @@ def test_from_resampling_area_line_mixed_high_to_low(self, kind1, kind2): def test_mixed_freq_second_millisecond(self): # GH 7772, GH 7760 - idxh = date_range("2014-07-01 09:00", freq="s", periods=50) - idxl = date_range("2014-07-01 09:00", freq="100ms", periods=500) + idxh = date_range("2014-07-01 09:00", freq="s", periods=5) + idxl = date_range("2014-07-01 09:00", freq="100ms", periods=50) high = Series(np.random.default_rng(2).standard_normal(len(idxh)), 
idxh) low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl) # high to low @@ -1141,8 +1103,8 @@ def test_mixed_freq_second_millisecond(self): def test_mixed_freq_second_millisecond_low_to_high(self): # GH 7772, GH 7760 - idxh = date_range("2014-07-01 09:00", freq="s", periods=50) - idxl = date_range("2014-07-01 09:00", freq="100ms", periods=500) + idxh = date_range("2014-07-01 09:00", freq="s", periods=5) + idxl = date_range("2014-07-01 09:00", freq="100ms", periods=50) high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh) low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl) # low to high @@ -1317,7 +1279,6 @@ def test_secondary_legend(self): # TODO: color cycle problems assert len(colors) == 4 - mpl.pyplot.close(fig) def test_secondary_legend_right(self): df = DataFrame( @@ -1334,7 +1295,6 @@ def test_secondary_legend_right(self): assert leg.get_texts()[1].get_text() == "B" assert leg.get_texts()[2].get_text() == "C" assert leg.get_texts()[3].get_text() == "D" - mpl.pyplot.close(fig) def test_secondary_legend_bar(self): df = DataFrame( @@ -1347,7 +1307,6 @@ def test_secondary_legend_bar(self): leg = ax.get_legend() assert leg.get_texts()[0].get_text() == "A (right)" assert leg.get_texts()[1].get_text() == "B" - mpl.pyplot.close(fig) def test_secondary_legend_bar_right(self): df = DataFrame( @@ -1360,7 +1319,6 @@ def test_secondary_legend_bar_right(self): leg = ax.get_legend() assert leg.get_texts()[0].get_text() == "A" assert leg.get_texts()[1].get_text() == "B" - mpl.pyplot.close(fig) def test_secondary_legend_multi_col(self): df = DataFrame( @@ -1385,14 +1343,13 @@ def test_secondary_legend_multi_col(self): # TODO: color cycle problems assert len(colors) == 4 - mpl.pyplot.close(fig) def test_secondary_legend_nonts(self): # non-ts df = DataFrame( - 1.1 * np.arange(120).reshape((30, 4)), + 1.1 * np.arange(40).reshape((10, 4)), columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + index=Index([f"i-{i}" for i in range(10)], dtype=object), ) fig = mpl.pyplot.figure() ax = fig.add_subplot(211) @@ -1406,14 +1363,13 @@ def test_secondary_legend_nonts(self): # TODO: color cycle problems assert len(colors) == 4 - mpl.pyplot.close() def test_secondary_legend_nonts_multi_col(self): # non-ts df = DataFrame( - 1.1 * np.arange(120).reshape((30, 4)), + 1.1 * np.arange(40).reshape((10, 4)), columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + index=Index([f"i-{i}" for i in range(10)], dtype=object), ) fig = mpl.pyplot.figure() ax = fig.add_subplot(211) @@ -1451,23 +1407,26 @@ def test_mpl_nopandas(self): values1 = np.arange(10.0, 11.0, 0.5) values2 = np.arange(11.0, 12.0, 0.5) - kw = {"fmt": "-", "lw": 4} - _, ax = mpl.pyplot.subplots() - ax.plot_date([x.toordinal() for x in dates], values1, **kw) - ax.plot_date([x.toordinal() for x in dates], values2, **kw) - - line1, line2 = ax.get_lines() + ( + line1, + line2, + ) = ax.plot( + [x.toordinal() for x in dates], + values1, + "-", + [x.toordinal() for x in dates], + values2, + "-", + linewidth=4, + ) exp = np.array([x.toordinal() for x in dates], dtype=np.float64) tm.assert_numpy_array_equal(line1.get_xydata()[:, 0], exp) - exp = np.array([x.toordinal() for x in dates], dtype=np.float64) tm.assert_numpy_array_equal(line2.get_xydata()[:, 0], exp) def test_irregular_ts_shared_ax_xlim(self): # GH 2960 - from pandas.plotting._matplotlib.converter import DatetimeConverter - ts = Series( np.arange(20, 
dtype=np.float64), index=date_range("2020-01-01", periods=20) ) @@ -1480,8 +1439,8 @@ def test_irregular_ts_shared_ax_xlim(self): # check that axis limits are correct left, right = ax.get_xlim() - assert left <= DatetimeConverter.convert(ts_irregular.index.min(), "", ax) - assert right >= DatetimeConverter.convert(ts_irregular.index.max(), "", ax) + assert left <= conv.DatetimeConverter.convert(ts_irregular.index.min(), "", ax) + assert right >= conv.DatetimeConverter.convert(ts_irregular.index.max(), "", ax) def test_secondary_y_non_ts_xlim(self): # GH 3490 - non-timeseries with secondary y @@ -1517,7 +1476,7 @@ def test_secondary_y_regular_ts_xlim(self): def test_secondary_y_mixed_freq_ts_xlim(self): # GH 3490 - mixed frequency timeseries with secondary y - rng = date_range("2000-01-01", periods=10000, freq="min") + rng = date_range("2000-01-01", periods=10, freq="min") ts = Series(1, index=rng) _, ax = mpl.pyplot.subplots() @@ -1532,8 +1491,6 @@ def test_secondary_y_mixed_freq_ts_xlim(self): def test_secondary_y_irregular_ts_xlim(self): # GH 3490 - irregular-timeseries with secondary y - from pandas.plotting._matplotlib.converter import DatetimeConverter - ts = Series( np.arange(20, dtype=np.float64), index=date_range("2020-01-01", periods=20) ) @@ -1547,8 +1504,8 @@ def test_secondary_y_irregular_ts_xlim(self): ts_irregular[:5].plot(ax=ax) left, right = ax.get_xlim() - assert left <= DatetimeConverter.convert(ts_irregular.index.min(), "", ax) - assert right >= DatetimeConverter.convert(ts_irregular.index.max(), "", ax) + assert left <= conv.DatetimeConverter.convert(ts_irregular.index.min(), "", ax) + assert right >= conv.DatetimeConverter.convert(ts_irregular.index.max(), "", ax) def test_plot_outofbounds_datetime(self): # 2579 - checking this does not raise @@ -1586,7 +1543,7 @@ def test_format_timedelta_ticks_wide(self): "9 days 06:13:20", ] - rng = timedelta_range("0", periods=10, freq="1 d") + rng = timedelta_range("0", periods=10, freq="1 D") df = DataFrame(np.random.default_rng(2).standard_normal((len(rng), 3)), rng) _, ax = mpl.pyplot.subplots() ax = df.plot(fontsize=2, ax=ax) @@ -1605,7 +1562,7 @@ def test_timedelta_plot(self): def test_timedelta_long_period(self): # test long period - index = timedelta_range("1 day 2 hr 30 min 10 s", periods=10, freq="1 d") + index = timedelta_range("1 day 2 hr 30 min 10 s", periods=10, freq="1 D") s = Series(np.random.default_rng(2).standard_normal(len(index)), index) _, ax = mpl.pyplot.subplots() _check_plot_works(s.plot, ax=ax) @@ -1714,41 +1671,49 @@ def test_check_xticks_rot_sharex(self): axes = df.plot(x="x", y="y", subplots=True, sharex=False) _check_ticks_props(axes, xrot=0) + @pytest.mark.parametrize( + "idx", + [ + date_range("2020-01-01", periods=5), + date_range("2020-01-01", periods=5, tz="UTC"), + timedelta_range("1 day", periods=5, freq="D"), + period_range("2020-01-01", periods=5, freq="D"), + Index([date(2000, 1, i) for i in [1, 3, 6, 20, 22]], dtype=object), + range(5), + ], + ) + def test_pickle_fig(self, temp_file, frame_or_series, idx): + # GH18439, GH#24088, statsmodels#4772 + df = frame_or_series(range(5), index=idx) + fig, ax = plt.subplots(1, 1) + df.plot(ax=ax) + with temp_file.open(mode="wb") as path: + pickle.dump(fig, path) -def _check_plot_works(f, freq=None, series=None, *args, **kwargs): - import matplotlib.pyplot as plt +def _check_plot_works(f, freq=None, series=None, *args, **kwargs): fig = plt.gcf() - try: - plt.clf() - ax = fig.add_subplot(211) - orig_ax = kwargs.pop("ax", plt.gca()) - orig_axfreq = 
getattr(orig_ax, "freq", None) - - ret = f(*args, **kwargs) - assert ret is not None # do something more intelligent - - ax = kwargs.pop("ax", plt.gca()) - if series is not None: - dfreq = series.index.freq - if isinstance(dfreq, BaseOffset): - dfreq = dfreq.rule_code - if orig_axfreq is None: - assert ax.freq == dfreq - - if freq is not None: - ax_freq = to_offset(ax.freq, is_period=True) - if freq is not None and orig_axfreq is None: - assert ax_freq == freq - - ax = fig.add_subplot(212) - kwargs["ax"] = ax - ret = f(*args, **kwargs) - assert ret is not None # TODO: do something more intelligent - - # GH18439, GH#24088, statsmodels#4772 - with tm.ensure_clean(return_filelike=True) as path: - pickle.dump(fig, path) - finally: - plt.close(fig) + fig.clf() + ax = fig.add_subplot(211) + orig_ax = kwargs.pop("ax", plt.gca()) + orig_axfreq = getattr(orig_ax, "freq", None) + + ret = f(*args, **kwargs) + assert ret is not None # do something more intelligent + + ax = kwargs.pop("ax", plt.gca()) + if series is not None: + dfreq = series.index.freq + if isinstance(dfreq, BaseOffset): + dfreq = dfreq.rule_code + if orig_axfreq is None: + assert ax.freq == dfreq + + if freq is not None and orig_axfreq is None: + assert to_offset(ax.freq, is_period=True) == freq + + ax = fig.add_subplot(212) + kwargs["ax"] = ax + ret = f(*args, **kwargs) + assert ret is not None # TODO: do something more intelligent diff --git a/pandas/tests/plotting/test_groupby.py b/pandas/tests/plotting/test_groupby.py index 5ebf93510a615..0cb125d822fd1 100644 --- a/pandas/tests/plotting/test_groupby.py +++ b/pandas/tests/plotting/test_groupby.py @@ -1,5 +1,4 @@ -""" Test cases for GroupBy.plot """ - +"""Test cases for GroupBy.plot""" import numpy as np import pytest diff --git a/pandas/tests/plotting/test_hist_method.py b/pandas/tests/plotting/test_hist_method.py index 4d17f87fdc7bc..410b658065d8d 100644 --- a/pandas/tests/plotting/test_hist_method.py +++ b/pandas/tests/plotting/test_hist_method.py @@ -1,4 +1,5 @@ -""" Test cases for .hist method """ +"""Test cases for .hist method""" + import re import numpy as np @@ -26,6 +27,9 @@ ) mpl = pytest.importorskip("matplotlib") +plt = pytest.importorskip("matplotlib.pyplot") + +from pandas.plotting._matplotlib.hist import _grouped_hist @pytest.fixture @@ -106,7 +110,7 @@ def test_hist_layout_with_by(self, hist_df, by, layout, axes_num, res_layout): # _check_plot_works adds an `ax` kwarg to the method call # so we get a warning about an axis being cleared, even - # though we don't explicing pass one, see GH #13188 + # though we don't explicitly pass one, see GH #13188 with tm.assert_produces_warning(UserWarning, check_stacklevel=False): axes = _check_plot_works(df.height.hist, by=getattr(df, by), layout=layout) _check_axes_shape(axes, axes_num=axes_num, layout=res_layout) @@ -118,18 +122,13 @@ def test_hist_layout_with_by_shape(self, hist_df): _check_axes_shape(axes, axes_num=4, layout=(4, 2), figsize=(12, 7)) def test_hist_no_overlap(self): - from matplotlib.pyplot import ( - gcf, - subplot, - ) - x = Series(np.random.default_rng(2).standard_normal(2)) y = Series(np.random.default_rng(2).standard_normal(2)) - subplot(121) + plt.subplot(121) x.hist() - subplot(122) + plt.subplot(122) y.hist() - fig = gcf() + fig = plt.gcf() axes = fig.axes assert len(axes) == 2 @@ -139,10 +138,8 @@ def test_hist_by_no_extra_plots(self, hist_df): assert len(mpl.pyplot.get_fignums()) == 1 def test_plot_fails_when_ax_differs_from_figure(self, ts): - from pylab import figure - - fig1 = figure() - fig2 = 
figure() + fig1 = plt.figure(1) + fig2 = plt.figure(2) ax1 = fig1.add_subplot(111) msg = "passed axis not bound to passed figure" with pytest.raises(AssertionError, match=msg): @@ -168,8 +165,8 @@ def test_histtype_argument(self, histtype, expected): ) def test_hist_with_legend(self, by, expected_axes_num, expected_layout): # GH 6279 - Series histogram can have a legend - index = 15 * ["1"] + 15 * ["2"] - s = Series(np.random.default_rng(2).standard_normal(30), index=index, name="a") + index = 5 * ["1"] + 5 * ["2"] + s = Series(np.random.default_rng(2).standard_normal(10), index=index, name="a") s.index.name = "b" # Use default_axes=True when plotting method generate subplots itself @@ -180,8 +177,8 @@ def test_hist_with_legend(self, by, expected_axes_num, expected_layout): @pytest.mark.parametrize("by", [None, "b"]) def test_hist_with_legend_raises(self, by): # GH 6279 - Series histogram with legend and label raises - index = 15 * ["1"] + 15 * ["2"] - s = Series(np.random.default_rng(2).standard_normal(30), index=index, name="a") + index = 5 * ["1"] + 5 * ["2"] + s = Series(np.random.default_rng(2).standard_normal(10), index=index, name="a") s.index.name = "b" with pytest.raises(ValueError, match="Cannot use both legend and label"): @@ -330,12 +327,10 @@ def test_hist_df_legacy_layout_labelsize_rot(self, frame_or_series): @pytest.mark.slow def test_hist_df_legacy_rectangles(self): - from matplotlib.patches import Rectangle - ser = Series(range(10)) ax = ser.hist(cumulative=True, bins=4, density=True) # height of last bin (index 5) must be 1.0 - rects = [x for x in ax.get_children() if isinstance(x, Rectangle)] + rects = [x for x in ax.get_children() if isinstance(x, mpl.patches.Rectangle)] tm.assert_almost_equal(rects[-1].get_height(), 1.0) @pytest.mark.slow @@ -430,12 +425,12 @@ def test_hist_layout_error(self): # GH 9351 def test_tight_layout(self): - df = DataFrame(np.random.default_rng(2).standard_normal((100, 2))) + df = DataFrame(np.random.default_rng(2).standard_normal((10, 2))) df[2] = to_datetime( np.random.default_rng(2).integers( 812419200000000000, 819331200000000000, - size=100, + size=10, dtype=np.int64, ) ) @@ -503,7 +498,7 @@ def test_hist_column_order_unchanged(self, column, expected): def test_histtype_argument(self, histtype, expected): # GH23992 Verify functioning of histtype argument df = DataFrame( - np.random.default_rng(2).integers(1, 10, size=(100, 2)), columns=["a", "b"] + np.random.default_rng(2).integers(1, 10, size=(10, 2)), columns=["a", "b"] ) ax = df.hist(histtype=histtype) _check_patches_all_filled(ax, filled=expected) @@ -518,9 +513,9 @@ def test_hist_with_legend(self, by, column): if by is not None: expected_labels = [expected_labels] * 2 - index = Index(15 * ["1"] + 15 * ["2"], name="c") + index = Index(5 * ["1"] + 5 * ["2"], name="c") df = DataFrame( - np.random.default_rng(2).standard_normal((30, 2)), + np.random.default_rng(2).standard_normal((10, 2)), index=index, columns=["a", "b"], ) @@ -544,9 +539,9 @@ def test_hist_with_legend(self, by, column): @pytest.mark.parametrize("column", [None, "b"]) def test_hist_with_legend_raises(self, by, column): # GH 6279 - DataFrame histogram with legend and label raises - index = Index(15 * ["1"] + 15 * ["2"], name="c") + index = Index(5 * ["1"] + 5 * ["2"], name="c") df = DataFrame( - np.random.default_rng(2).standard_normal((30, 2)), + np.random.default_rng(2).standard_normal((10, 2)), index=index, columns=["a", "b"], ) @@ -585,7 +580,7 @@ def test_hist_df_with_nonnumerics_no_bins(self): def 
test_hist_secondary_legend(self): # GH 9610 df = DataFrame( - np.random.default_rng(2).standard_normal((30, 4)), columns=list("abcd") + np.random.default_rng(2).standard_normal((10, 4)), columns=list("abcd") ) # primary -> secondary @@ -601,7 +596,7 @@ def test_hist_secondary_legend(self): def test_hist_secondary_secondary(self): # GH 9610 df = DataFrame( - np.random.default_rng(2).standard_normal((30, 4)), columns=list("abcd") + np.random.default_rng(2).standard_normal((10, 4)), columns=list("abcd") ) # secondary -> secondary _, ax = mpl.pyplot.subplots() @@ -616,7 +611,7 @@ def test_hist_secondary_secondary(self): def test_hist_secondary_primary(self): # GH 9610 df = DataFrame( - np.random.default_rng(2).standard_normal((30, 4)), columns=list("abcd") + np.random.default_rng(2).standard_normal((10, 4)), columns=list("abcd") ) # secondary -> primary _, ax = mpl.pyplot.subplots() @@ -631,7 +626,6 @@ def test_hist_secondary_primary(self): def test_hist_with_nans_and_weights(self): # GH 48884 - mpl_patches = pytest.importorskip("matplotlib.patches") df = DataFrame( [[np.nan, 0.2, 0.3], [0.4, np.nan, np.nan], [0.7, 0.8, 0.9]], columns=list("abc"), @@ -642,12 +636,12 @@ def test_hist_with_nans_and_weights(self): _, ax0 = mpl.pyplot.subplots() df.plot.hist(ax=ax0, weights=weights) - rects = [x for x in ax0.get_children() if isinstance(x, mpl_patches.Rectangle)] + rects = [x for x in ax0.get_children() if isinstance(x, mpl.patches.Rectangle)] heights = [rect.get_height() for rect in rects] _, ax1 = mpl.pyplot.subplots() no_nan_df.plot.hist(ax=ax1, weights=no_nan_weights) no_nan_rects = [ - x for x in ax1.get_children() if isinstance(x, mpl_patches.Rectangle) + x for x in ax1.get_children() if isinstance(x, mpl.patches.Rectangle) ] no_nan_heights = [rect.get_height() for rect in no_nan_rects] assert all(h0 == h1 for h0, h1 in zip(heights, no_nan_heights)) @@ -655,15 +649,13 @@ def test_hist_with_nans_and_weights(self): idxerror_weights = np.array([[0.3, 0.25], [0.45, 0.45]]) msg = "weights must have the same shape as data, or be a single column" + _, ax2 = mpl.pyplot.subplots() with pytest.raises(ValueError, match=msg): - _, ax2 = mpl.pyplot.subplots() no_nan_df.plot.hist(ax=ax2, weights=idxerror_weights) class TestDataFrameGroupByPlots: def test_grouped_hist_legacy(self): - from pandas.plotting._matplotlib.hist import _grouped_hist - rs = np.random.default_rng(10) df = DataFrame(rs.standard_normal((10, 1)), columns=["A"]) df["B"] = to_datetime( @@ -715,10 +707,6 @@ def test_grouped_hist_legacy_single_key(self): _check_ticks_props(axes, xrot=30) def test_grouped_hist_legacy_grouped_hist_kwargs(self): - from matplotlib.patches import Rectangle - - from pandas.plotting._matplotlib.hist import _grouped_hist - rs = np.random.default_rng(2) df = DataFrame(rs.standard_normal((10, 1)), columns=["A"]) df["B"] = to_datetime( @@ -747,14 +735,14 @@ def test_grouped_hist_legacy_grouped_hist_kwargs(self): ) # height of last bin (index 5) must be 1.0 for ax in axes.ravel(): - rects = [x for x in ax.get_children() if isinstance(x, Rectangle)] + rects = [ + x for x in ax.get_children() if isinstance(x, mpl.patches.Rectangle) + ] height = rects[-1].get_height() tm.assert_almost_equal(height, 1.0) _check_ticks_props(axes, xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot) def test_grouped_hist_legacy_grouped_hist(self): - from pandas.plotting._matplotlib.hist import _grouped_hist - rs = np.random.default_rng(2) df = DataFrame(rs.standard_normal((10, 1)), columns=["A"]) df["B"] = to_datetime( @@ -772,8 +760,6 @@ 
def test_grouped_hist_legacy_grouped_hist(self): _check_ax_scales(axes, yaxis="log") def test_grouped_hist_legacy_external_err(self): - from pandas.plotting._matplotlib.hist import _grouped_hist - rs = np.random.default_rng(2) df = DataFrame(rs.standard_normal((10, 1)), columns=["A"]) df["B"] = to_datetime( diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index cfb657c2a800f..d3e1d7f60384b 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -1,4 +1,5 @@ -""" Test cases for misc plot functions """ +"""Test cases for misc plot functions""" + import os import numpy as np @@ -30,6 +31,10 @@ plt = pytest.importorskip("matplotlib.pyplot") cm = pytest.importorskip("matplotlib.cm") +import re + +from pandas.plotting._matplotlib.style import get_standard_colors + @pytest.fixture def iris(datapath) -> DataFrame: @@ -108,8 +113,6 @@ def test_savefig(kind, data, index): class TestSeriesPlots: def test_autocorrelation_plot(self): - from pandas.plotting import autocorrelation_plot - ser = Series( np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10), @@ -117,32 +120,28 @@ def test_autocorrelation_plot(self): ) # Ensure no UserWarning when making plot with tm.assert_produces_warning(None): - _check_plot_works(autocorrelation_plot, series=ser) - _check_plot_works(autocorrelation_plot, series=ser.values) + _check_plot_works(plotting.autocorrelation_plot, series=ser) + _check_plot_works(plotting.autocorrelation_plot, series=ser.values) - ax = autocorrelation_plot(ser, label="Test") + ax = plotting.autocorrelation_plot(ser, label="Test") _check_legend_labels(ax, labels=["Test"]) @pytest.mark.parametrize("kwargs", [{}, {"lag": 5}]) def test_lag_plot(self, kwargs): - from pandas.plotting import lag_plot - ser = Series( np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10), name="ts", ) - _check_plot_works(lag_plot, series=ser, **kwargs) + _check_plot_works(plotting.lag_plot, series=ser, **kwargs) def test_bootstrap_plot(self): - from pandas.plotting import bootstrap_plot - ser = Series( np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10), name="ts", ) - _check_plot_works(bootstrap_plot, series=ser, size=10) + _check_plot_works(plotting.bootstrap_plot, series=ser, size=10) class TestDataFramePlots: @@ -155,7 +154,7 @@ def test_scatter_matrix_axis(self, pass_axis): if pass_axis: _, ax = mpl.pyplot.subplots(3, 3) - df = DataFrame(np.random.default_rng(2).standard_normal((100, 3))) + df = DataFrame(np.random.default_rng(2).standard_normal((10, 3))) # we are plotting multiples on a sub-plot with tm.assert_produces_warning(UserWarning, check_stacklevel=False): @@ -167,7 +166,7 @@ def test_scatter_matrix_axis(self, pass_axis): ) axes0_labels = axes[0][0].yaxis.get_majorticklabels() # GH 5662 - expected = ["-2", "0", "2"] + expected = ["-2", "-1", "0"] _check_text_labels(axes0_labels, expected) _check_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) @@ -180,7 +179,7 @@ def test_scatter_matrix_axis_smaller(self, pass_axis): if pass_axis: _, ax = mpl.pyplot.subplots(3, 3) - df = DataFrame(np.random.default_rng(11).standard_normal((100, 3))) + df = DataFrame(np.random.default_rng(11).standard_normal((10, 3))) df[0] = (df[0] - 2) / 3 # we are plotting multiples on a sub-plot @@ -192,18 +191,15 @@ def test_scatter_matrix_axis_smaller(self, pass_axis): ax=ax, ) axes0_labels = axes[0][0].yaxis.get_majorticklabels() - expected = ["-1.0", "-0.5", "0.0"] + expected = 
["-1.25", "-1.0", "-0.75", "-0.5"] _check_text_labels(axes0_labels, expected) _check_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) @pytest.mark.slow def test_andrews_curves_no_warning(self, iris): - from pandas.plotting import andrews_curves - - df = iris # Ensure no UserWarning when making plot with tm.assert_produces_warning(None): - _check_plot_works(andrews_curves, frame=df, class_column="Name") + _check_plot_works(plotting.andrews_curves, frame=iris, class_column="Name") @pytest.mark.slow @pytest.mark.parametrize( @@ -228,12 +224,10 @@ def test_andrews_curves_no_warning(self, iris): ], ) def test_andrews_curves_linecolors(self, request, df, linecolors): - from pandas.plotting import andrews_curves - if isinstance(df, str): df = request.getfixturevalue(df) ax = _check_plot_works( - andrews_curves, frame=df, class_column="Name", color=linecolors + plotting.andrews_curves, frame=df, class_column="Name", color=linecolors ) _check_colors( ax.get_lines()[:10], linecolors=linecolors, mapping=df["Name"][:10] @@ -255,23 +249,19 @@ def test_andrews_curves_linecolors(self, request, df, linecolors): ], ) def test_andrews_curves_cmap(self, request, df): - from pandas.plotting import andrews_curves - if isinstance(df, str): df = request.getfixturevalue(df) cmaps = [cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())] ax = _check_plot_works( - andrews_curves, frame=df, class_column="Name", color=cmaps + plotting.andrews_curves, frame=df, class_column="Name", color=cmaps ) _check_colors(ax.get_lines()[:10], linecolors=cmaps, mapping=df["Name"][:10]) @pytest.mark.slow def test_andrews_curves_handle(self): - from pandas.plotting import andrews_curves - colors = ["b", "g", "r"] df = DataFrame({"A": [1, 2, 3], "B": [1, 2, 3], "C": [1, 2, 3], "Name": colors}) - ax = andrews_curves(df, "Name", color=colors) + ax = plotting.andrews_curves(df, "Name", color=colors) handles, _ = ax.get_legend_handles_labels() _check_colors(handles, linecolors=colors) @@ -281,61 +271,54 @@ def test_andrews_curves_handle(self): [("#556270", "#4ECDC4", "#C7F464"), ["dodgerblue", "aquamarine", "seagreen"]], ) def test_parallel_coordinates_colors(self, iris, color): - from pandas.plotting import parallel_coordinates - df = iris ax = _check_plot_works( - parallel_coordinates, frame=df, class_column="Name", color=color + plotting.parallel_coordinates, frame=df, class_column="Name", color=color ) _check_colors(ax.get_lines()[:10], linecolors=color, mapping=df["Name"][:10]) @pytest.mark.slow def test_parallel_coordinates_cmap(self, iris): - from matplotlib import cm - - from pandas.plotting import parallel_coordinates - df = iris ax = _check_plot_works( - parallel_coordinates, frame=df, class_column="Name", colormap=cm.jet + plotting.parallel_coordinates, + frame=df, + class_column="Name", + colormap=cm.jet, ) - cmaps = [cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())] + cmaps = [mpl.cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())] _check_colors(ax.get_lines()[:10], linecolors=cmaps, mapping=df["Name"][:10]) @pytest.mark.slow def test_parallel_coordinates_line_diff(self, iris): - from pandas.plotting import parallel_coordinates - df = iris - ax = _check_plot_works(parallel_coordinates, frame=df, class_column="Name") + ax = _check_plot_works( + plotting.parallel_coordinates, frame=df, class_column="Name" + ) nlines = len(ax.get_lines()) nxticks = len(ax.xaxis.get_ticklabels()) ax = _check_plot_works( - parallel_coordinates, frame=df, class_column="Name", axvlines=False + 
plotting.parallel_coordinates, frame=df, class_column="Name", axvlines=False ) assert len(ax.get_lines()) == (nlines - nxticks) @pytest.mark.slow def test_parallel_coordinates_handles(self, iris): - from pandas.plotting import parallel_coordinates - df = iris colors = ["b", "g", "r"] df = DataFrame({"A": [1, 2, 3], "B": [1, 2, 3], "C": [1, 2, 3], "Name": colors}) - ax = parallel_coordinates(df, "Name", color=colors) + ax = plotting.parallel_coordinates(df, "Name", color=colors) handles, _ = ax.get_legend_handles_labels() _check_colors(handles, linecolors=colors) # not sure if this is indicative of a problem @pytest.mark.filterwarnings("ignore:Attempting to set:UserWarning") def test_parallel_coordinates_with_sorted_labels(self): - """For #15908""" - from pandas.plotting import parallel_coordinates - + # GH 15908 df = DataFrame( { "feat": list(range(30)), @@ -344,7 +327,7 @@ def test_parallel_coordinates_with_sorted_labels(self): + [1 for _ in range(10)], } ) - ax = parallel_coordinates(df, "class", sort_labels=True) + ax = plotting.parallel_coordinates(df, "class", sort_labels=True) polylines, labels = ax.get_legend_handles_labels() color_label_tuples = zip( [polyline.get_color() for polyline in polylines], labels @@ -358,45 +341,38 @@ def test_parallel_coordinates_with_sorted_labels(self): assert prev[1] < nxt[1] and prev[0] < nxt[0] def test_radviz_no_warning(self, iris): - from pandas.plotting import radviz - - df = iris # Ensure no UserWarning when making plot with tm.assert_produces_warning(None): - _check_plot_works(radviz, frame=df, class_column="Name") + _check_plot_works(plotting.radviz, frame=iris, class_column="Name") @pytest.mark.parametrize( "color", [("#556270", "#4ECDC4", "#C7F464"), ["dodgerblue", "aquamarine", "seagreen"]], ) def test_radviz_color(self, iris, color): - from pandas.plotting import radviz - df = iris - ax = _check_plot_works(radviz, frame=df, class_column="Name", color=color) + ax = _check_plot_works( + plotting.radviz, frame=df, class_column="Name", color=color + ) # skip Circle drawn as ticks patches = [p for p in ax.patches[:20] if p.get_label() != ""] _check_colors(patches[:10], facecolors=color, mapping=df["Name"][:10]) def test_radviz_color_cmap(self, iris): - from matplotlib import cm - - from pandas.plotting import radviz - df = iris - ax = _check_plot_works(radviz, frame=df, class_column="Name", colormap=cm.jet) - cmaps = [cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())] + ax = _check_plot_works( + plotting.radviz, frame=df, class_column="Name", colormap=cm.jet + ) + cmaps = [mpl.cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())] patches = [p for p in ax.patches[:20] if p.get_label() != ""] _check_colors(patches, facecolors=cmaps, mapping=df["Name"][:10]) def test_radviz_colors_handles(self): - from pandas.plotting import radviz - colors = [[0.0, 0.0, 1.0, 1.0], [0.0, 0.5, 1.0, 1.0], [1.0, 0.0, 0.0, 1.0]] df = DataFrame( {"A": [1, 2, 3], "B": [2, 1, 3], "C": [3, 2, 1], "Name": ["b", "g", "r"]} ) - ax = radviz(df, "Name", color=colors) + ax = plotting.radviz(df, "Name", color=colors) handles, _ = ax.get_legend_handles_labels() _check_colors(handles, facecolors=colors) @@ -470,15 +446,11 @@ def test_get_standard_colors_random_seed(self): def test_get_standard_colors_consistency(self): # GH17525 # Make sure it produces the same colors every time it's called - from pandas.plotting._matplotlib.style import get_standard_colors - color1 = get_standard_colors(1, color_type="random") color2 = get_standard_colors(1, color_type="random") 
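        # Editor's note: a minimal sketch of the assumed mechanism, not part
        # of this diff: "random" palettes are seeded from num_colors, so two
        # calls with the same count should return identical lists:
        #
        #   get_standard_colors(5, color_type="random")  # deterministic per count
        #   get_standard_colors(5, color_type="random")  # returns the same list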
assert color1 == color2 def test_get_standard_colors_default_num_colors(self): - from pandas.plotting._matplotlib.style import get_standard_colors - # Make sure the default color_types returns the specified amount color1 = get_standard_colors(1, color_type="default") color2 = get_standard_colors(9, color_type="default") @@ -508,11 +480,7 @@ def test_get_standard_colors_no_appending(self): # Make sure not to add more colors so that matplotlib can cycle # correctly. - from matplotlib import cm - - from pandas.plotting._matplotlib.style import get_standard_colors - - color_before = cm.gnuplot(range(5)) + color_before = mpl.cm.gnuplot(range(5)) color_after = get_standard_colors(1, color=color_before) assert len(color_after) == len(color_before) @@ -520,7 +488,7 @@ def test_get_standard_colors_no_appending(self): np.random.default_rng(2).standard_normal((48, 4)), columns=list("ABCD") ) - color_list = cm.gnuplot(np.linspace(0, 1, 16)) + color_list = mpl.cm.gnuplot(np.linspace(0, 1, 16)) p = df.A.plot.bar(figsize=(16, 7), color=color_list) assert p.patches[1].get_facecolor() == p.patches[17].get_facecolor() @@ -545,9 +513,7 @@ def test_dictionary_color(self, kind): def test_bar_plot(self): # GH38947 # Test bar plot with string and int index - from matplotlib.text import Text - - expected = [Text(0, 0, "0"), Text(1, 0, "Total")] + expected = [mpl.text.Text(0, 0, "0"), mpl.text.Text(1, 0, "Total")] df = DataFrame( { @@ -564,11 +530,12 @@ def test_bar_plot(self): def test_barh_plot_labels_mixed_integer_string(self): # GH39126 # Test barh plot with string and integer at the same column - from matplotlib.text import Text - df = DataFrame([{"word": 1, "value": 0}, {"word": "knowledge", "value": 2}]) plot_barh = df.plot.barh(x="word", legend=None) - expected_yticklabels = [Text(0, 0, "1"), Text(0, 1, "knowledge")] + expected_yticklabels = [ + mpl.text.Text(0, 0, "1"), + mpl.text.Text(0, 1, "knowledge"), + ] assert all( actual.get_text() == expected.get_text() for actual, expected in zip( @@ -648,8 +615,8 @@ def test_externally_shared_axes(self): # Create data df = DataFrame( { - "a": np.random.default_rng(2).standard_normal(1000), - "b": np.random.default_rng(2).standard_normal(1000), + "a": np.random.default_rng(2).standard_normal(10), + "b": np.random.default_rng(2).standard_normal(10), } ) @@ -706,9 +673,7 @@ def test_plot_bar_axis_units_timestamp_conversion(self): def test_bar_plt_xaxis_intervalrange(self): # GH 38969 # Ensure IntervalIndex x-axis produces a bar plot as expected - from matplotlib.text import Text - - expected = [Text(0, 0, "([0, 1],)"), Text(1, 0, "([1, 2],)")] + expected = [mpl.text.Text(0, 0, "([0, 1],)"), mpl.text.Text(1, 0, "([1, 2],)")] s = Series( [1, 2], index=[interval_range(0, 2, closed="both")], @@ -718,3 +683,182 @@ def test_bar_plt_xaxis_intervalrange(self): (a.get_text() == b.get_text()) for a, b in zip(s.plot.bar().get_xticklabels(), expected) ) + + +@pytest.fixture +def df_bar_data(): + return np.random.default_rng(3).integers(0, 100, 5) + + +@pytest.fixture +def df_bar_df(df_bar_data) -> DataFrame: + df_bar_df = DataFrame( + { + "A": df_bar_data, + "B": df_bar_data[::-1], + "C": df_bar_data[0], + "D": df_bar_data[-1], + } + ) + return df_bar_df + + +def _df_bar_xyheight_from_ax_helper(df_bar_data, ax, subplot_division): + subplot_data_df_list = [] + + # get xy and height of squares representing data, separated by subplots + for i in range(len(subplot_division)): + subplot_data = np.array( + [ + (x.get_x(), x.get_y(), x.get_height()) + for x in 
ax[i].findobj(plt.Rectangle)
+                if x.get_height() in df_bar_data
+            ]
+        )
+        subplot_data_df_list.append(
+            DataFrame(data=subplot_data, columns=["x_coord", "y_coord", "height"])
+        )
+
+    return subplot_data_df_list
+
+
+def _df_bar_subplot_checker(df_bar_data, df_bar_df, subplot_data_df, subplot_columns):
+    subplot_sliced_by_source = [
+        subplot_data_df.iloc[
+            len(df_bar_data) * i : len(df_bar_data) * (i + 1)
+        ].reset_index()
+        for i in range(len(subplot_columns))
+    ]
+
+    if len(subplot_columns) == 1:
+        expected_total_height = df_bar_df.loc[:, subplot_columns[0]]
+    else:
+        expected_total_height = df_bar_df.loc[:, subplot_columns].sum(axis=1)
+
+    for i in range(len(subplot_columns)):
+        sliced_df = subplot_sliced_by_source[i]
+        if i == 0:
+            # Checks that the bar chart starts at y=0
+            assert (sliced_df["y_coord"] == 0).all()
+            height_iter = sliced_df["y_coord"].add(sliced_df["height"])
+        else:
+            height_iter = height_iter + sliced_df["height"]
+
+        if i + 1 == len(subplot_columns):
+            # Checks final height matches what is expected
+            tm.assert_series_equal(
+                height_iter, expected_total_height, check_names=False, check_dtype=False
+            )
+        else:
+            # Checks each preceding bar ends where the next one starts
+            next_start_coord = subplot_sliced_by_source[i + 1]["y_coord"]
+            tm.assert_series_equal(
+                height_iter, next_start_coord, check_names=False, check_dtype=False
+            )
+
+
+# GH 61018
+@pytest.mark.parametrize("columns_used", [["A", "B"], ["C", "D"], ["D", "A"]])
+def test_bar_1_subplot_1_double_stacked(df_bar_data, df_bar_df, columns_used):
+    df_bar_df_trimmed = df_bar_df[columns_used]
+    subplot_division = [columns_used]
+    ax = df_bar_df_trimmed.plot(subplots=subplot_division, kind="bar", stacked=True)
+    subplot_data_df_list = _df_bar_xyheight_from_ax_helper(
+        df_bar_data, ax, subplot_division
+    )
+    for i in range(len(subplot_data_df_list)):
+        _df_bar_subplot_checker(
+            df_bar_data, df_bar_df_trimmed, subplot_data_df_list[i], subplot_division[i]
+        )
+
+
+@pytest.mark.parametrize(
+    "columns_used", [["A", "B", "C"], ["A", "C", "B"], ["D", "A", "C"]]
+)
+def test_bar_2_subplot_1_double_stacked(df_bar_data, df_bar_df, columns_used):
+    df_bar_df_trimmed = df_bar_df[columns_used]
+    subplot_division = [(columns_used[0], columns_used[1]), (columns_used[2],)]
+    ax = df_bar_df_trimmed.plot(subplots=subplot_division, kind="bar", stacked=True)
+    subplot_data_df_list = _df_bar_xyheight_from_ax_helper(
+        df_bar_data, ax, subplot_division
+    )
+    for i in range(len(subplot_data_df_list)):
+        _df_bar_subplot_checker(
+            df_bar_data, df_bar_df_trimmed, subplot_data_df_list[i], subplot_division[i]
+        )
+
+
+@pytest.mark.parametrize(
+    "subplot_division",
+    [
+        [("A", "B"), ("C", "D")],
+        [("A", "D"), ("C", "B")],
+        [("B", "C"), ("D", "A")],
+        [("B", "D"), ("C", "A")],
+    ],
+)
+def test_bar_2_subplot_2_double_stacked(df_bar_data, df_bar_df, subplot_division):
+    ax = df_bar_df.plot(subplots=subplot_division, kind="bar", stacked=True)
+    subplot_data_df_list = _df_bar_xyheight_from_ax_helper(
+        df_bar_data, ax, subplot_division
+    )
+    for i in range(len(subplot_data_df_list)):
+        _df_bar_subplot_checker(
+            df_bar_data, df_bar_df, subplot_data_df_list[i], subplot_division[i]
+        )
+
+
+@pytest.mark.parametrize(
+    "subplot_division",
+    [[("A", "B", "C")], [("A", "D", "B")], [("C", "A", "D")], [("D", "C", "A")]],
+)
+def test_bar_2_subplots_1_triple_stacked(df_bar_data, df_bar_df, subplot_division):
+    ax = df_bar_df.plot(subplots=subplot_division, kind="bar", stacked=True)
+    subplot_data_df_list = _df_bar_xyheight_from_ax_helper(
+ df_bar_data, ax, subplot_division + ) + for i in range(len(subplot_data_df_list)): + _df_bar_subplot_checker( + df_bar_data, df_bar_df, subplot_data_df_list[i], subplot_division[i] + ) + + +def test_bar_subplots_stacking_bool(df_bar_data, df_bar_df): + subplot_division = [("A"), ("B"), ("C"), ("D")] + ax = df_bar_df.plot(subplots=True, kind="bar", stacked=True) + subplot_data_df_list = _df_bar_xyheight_from_ax_helper( + df_bar_data, ax, subplot_division + ) + for i in range(len(subplot_data_df_list)): + _df_bar_subplot_checker( + df_bar_data, df_bar_df, subplot_data_df_list[i], subplot_division[i] + ) + + +def test_plot_bar_label_count_default(): + df = DataFrame( + [(30, 10, 10, 10), (20, 20, 20, 20), (10, 30, 30, 10)], columns=list("ABCD") + ) + df.plot(subplots=True, kind="bar", title=["A", "B", "C", "D"]) + + +def test_plot_bar_label_count_expected_fail(): + df = DataFrame( + [(30, 10, 10, 10), (20, 20, 20, 20), (10, 30, 30, 10)], columns=list("ABCD") + ) + error_regex = re.escape( + "The number of titles (4) must equal the number of subplots (3)." + ) + with pytest.raises(ValueError, match=error_regex): + df.plot( + subplots=[("A", "B")], + kind="bar", + title=["A&B", "C", "D", "Extra Title"], + ) + + +def test_plot_bar_label_count_expected_success(): + df = DataFrame( + [(30, 10, 10, 10), (20, 20, 20, 20), (10, 30, 30, 10)], columns=list("ABCD") + ) + df.plot(subplots=[("A", "B", "D")], kind="bar", title=["A&B&D", "C"]) diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 2b2f2f3b84307..98e70f770896c 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -1,4 +1,5 @@ -""" Test cases for Series.plot """ +"""Test cases for Series.plot""" + from datetime import datetime from itertools import chain @@ -32,9 +33,14 @@ get_y_axis, ) +from pandas.tseries.offsets import CustomBusinessDay + mpl = pytest.importorskip("matplotlib") plt = pytest.importorskip("matplotlib.pyplot") +from pandas.plotting._matplotlib.converter import DatetimeConverter +from pandas.plotting._matplotlib.style import get_standard_colors + @pytest.fixture def ts(): @@ -48,7 +54,7 @@ def ts(): @pytest.fixture def series(): return Series( - range(20), dtype=np.float64, name="series", index=[f"i_{i}" for i in range(20)] + range(10), dtype=np.float64, name="series", index=[f"i_{i}" for i in range(10)] ) @@ -191,28 +197,24 @@ def test_area_sharey_dont_overwrite(self, ts): assert get_y_axis(ax1).joined(ax1, ax2) assert get_y_axis(ax2).joined(ax1, ax2) - plt.close(fig) def test_label(self): s = Series([1, 2]) _, ax = mpl.pyplot.subplots() ax = s.plot(label="LABEL", legend=True, ax=ax) _check_legend_labels(ax, labels=["LABEL"]) - mpl.pyplot.close("all") def test_label_none(self): s = Series([1, 2]) _, ax = mpl.pyplot.subplots() ax = s.plot(legend=True, ax=ax) _check_legend_labels(ax, labels=[""]) - mpl.pyplot.close("all") def test_label_ser_name(self): s = Series([1, 2], name="NAME") _, ax = mpl.pyplot.subplots() ax = s.plot(legend=True, ax=ax) _check_legend_labels(ax, labels=["NAME"]) - mpl.pyplot.close("all") def test_label_ser_name_override(self): s = Series([1, 2], name="NAME") @@ -220,7 +222,6 @@ def test_label_ser_name_override(self): _, ax = mpl.pyplot.subplots() ax = s.plot(legend=True, label="LABEL", ax=ax) _check_legend_labels(ax, labels=["LABEL"]) - mpl.pyplot.close("all") def test_label_ser_name_override_dont_draw(self): s = Series([1, 2], name="NAME") @@ -230,7 +231,6 @@ def test_label_ser_name_override_dont_draw(self): assert 
ax.get_legend() is None # Hasn't been drawn ax.legend() # draw it _check_legend_labels(ax, labels=["LABEL"]) - mpl.pyplot.close("all") def test_boolean(self): # GH 23719 @@ -343,9 +343,7 @@ def test_rotation_30(self): _check_ticks_props(axes, xrot=30) def test_irregular_datetime(self): - from pandas.plotting._matplotlib.converter import DatetimeConverter - - rng = date_range("1/1/2000", "3/1/2000") + rng = date_range("1/1/2000", "1/15/2000") rng = rng[[0, 1, 2, 3, 5, 9, 10, 11, 12]] ser = Series(np.random.default_rng(2).standard_normal(len(rng)), rng) _, ax = mpl.pyplot.subplots() @@ -377,7 +375,13 @@ def test_pie_series(self): ) ax = _check_plot_works(series.plot.pie) _check_text_labels(ax.texts, series.index) - assert ax.get_ylabel() == "YLABEL" + assert ax.get_ylabel() == "" + + def test_pie_arrow_type(self): + # GH 59192 + pytest.importorskip("pyarrow") + ser = Series([1, 2, 3, 4], dtype="int32[pyarrow]") + _check_plot_works(ser.plot.pie) def test_pie_series_no_label(self): series = Series( @@ -423,7 +427,7 @@ def test_pie_series_autopct_and_fontsize(self): ax = _check_plot_works( series.plot.pie, colors=color_args, autopct="%.2f", fontsize=7 ) - pcts = [f"{s*100:.2f}" for s in series.values / series.sum()] + pcts = [f"{s * 100:.2f}" for s in series.values / series.sum()] expected_texts = list(chain.from_iterable(zip(series.index, pcts))) _check_text_labels(ax.texts, expected_texts) for t in ax.texts: @@ -452,9 +456,9 @@ def test_pie_nan(self): def test_df_series_secondary_legend(self): # GH 9779 df = DataFrame( - np.random.default_rng(2).standard_normal((30, 3)), columns=list("abc") + np.random.default_rng(2).standard_normal((10, 3)), columns=list("abc") ) - s = Series(np.random.default_rng(2).standard_normal(30), name="x") + s = Series(np.random.default_rng(2).standard_normal(10), name="x") # primary -> secondary (without passing ax) _, ax = mpl.pyplot.subplots() @@ -466,28 +470,12 @@ def test_df_series_secondary_legend(self): assert ax.get_yaxis().get_visible() assert ax.right_ax.get_yaxis().get_visible() - def test_df_series_secondary_legend_with_axes(self): - # GH 9779 - df = DataFrame( - np.random.default_rng(2).standard_normal((30, 3)), columns=list("abc") - ) - s = Series(np.random.default_rng(2).standard_normal(30), name="x") - # primary -> secondary (with passing ax) - _, ax = mpl.pyplot.subplots() - ax = df.plot(ax=ax) - s.plot(ax=ax, legend=True, secondary_y=True) - # both legends are drawn on left ax - # left and right axis must be visible - _check_legend_labels(ax, labels=["a", "b", "c", "x (right)"]) - assert ax.get_yaxis().get_visible() - assert ax.right_ax.get_yaxis().get_visible() - def test_df_series_secondary_legend_both(self): # GH 9779 df = DataFrame( - np.random.default_rng(2).standard_normal((30, 3)), columns=list("abc") + np.random.default_rng(2).standard_normal((10, 3)), columns=list("abc") ) - s = Series(np.random.default_rng(2).standard_normal(30), name="x") + s = Series(np.random.default_rng(2).standard_normal(10), name="x") # secondary -> secondary (without passing ax) _, ax = mpl.pyplot.subplots() ax = df.plot(secondary_y=True, ax=ax) @@ -499,29 +487,12 @@ def test_df_series_secondary_legend_both(self): assert not ax.left_ax.get_yaxis().get_visible() assert ax.get_yaxis().get_visible() - def test_df_series_secondary_legend_both_with_axis(self): - # GH 9779 - df = DataFrame( - np.random.default_rng(2).standard_normal((30, 3)), columns=list("abc") - ) - s = Series(np.random.default_rng(2).standard_normal(30), name="x") - # secondary -> secondary (with 
passing ax) - _, ax = mpl.pyplot.subplots() - ax = df.plot(secondary_y=True, ax=ax) - s.plot(ax=ax, legend=True, secondary_y=True) - # both legends are drawn on left ax - # left axis must be invisible and right axis must be visible - expected = ["a (right)", "b (right)", "c (right)", "x (right)"] - _check_legend_labels(ax.left_ax, expected) - assert not ax.left_ax.get_yaxis().get_visible() - assert ax.get_yaxis().get_visible() - def test_df_series_secondary_legend_both_with_axis_2(self): # GH 9779 df = DataFrame( - np.random.default_rng(2).standard_normal((30, 3)), columns=list("abc") + np.random.default_rng(2).standard_normal((10, 3)), columns=list("abc") ) - s = Series(np.random.default_rng(2).standard_normal(30), name="x") + s = Series(np.random.default_rng(2).standard_normal(10), name="x") # secondary -> secondary (with passing ax) _, ax = mpl.pyplot.subplots() ax = df.plot(secondary_y=True, mark_right=False, ax=ax) @@ -536,17 +507,12 @@ def test_df_series_secondary_legend_both_with_axis_2(self): @pytest.mark.parametrize( "input_logy, expected_scale", [(True, "log"), ("sym", "symlog")] ) - def test_secondary_logy(self, input_logy, expected_scale): - # GH 25545 - s1 = Series(np.random.default_rng(2).standard_normal(100)) - s2 = Series(np.random.default_rng(2).standard_normal(100)) - - # GH 24980 - ax1 = s1.plot(logy=input_logy) - ax2 = s2.plot(secondary_y=True, logy=input_logy) - + @pytest.mark.parametrize("secondary_kwarg", [{}, {"secondary_y": True}]) + def test_secondary_logy(self, input_logy, expected_scale, secondary_kwarg): + # GH 25545, GH 24980 + s1 = Series(np.random.default_rng(2).standard_normal(10)) + ax1 = s1.plot(logy=input_logy, **secondary_kwarg) assert ax1.get_yscale() == expected_scale - assert ax2.get_yscale() == expected_scale def test_plot_fails_with_dupe_color_and_style(self): x = Series(np.random.default_rng(2).standard_normal(2)) @@ -572,6 +538,22 @@ def test_kde_kwargs(self, ts, bw_method, ind): pytest.importorskip("scipy") _check_plot_works(ts.plot.kde, bw_method=bw_method, ind=ind) + @pytest.mark.parametrize( + "bw_method, ind, weights", + [ + ["scott", 20, None], + [None, 20, None], + [None, np.int_(20), None], + [0.5, np.linspace(-100, 100, 20), None], + ["scott", 40, np.linspace(0.0, 2.0, 50)], + ], + ) + def test_kde_kwargs_weights(self, bw_method, ind, weights): + # GH59337 + pytest.importorskip("scipy") + s = Series(np.random.default_rng(2).uniform(size=50)) + _check_plot_works(s.plot.kde, bw_method=bw_method, ind=ind, weights=weights) + def test_density_kwargs(self, ts): pytest.importorskip("scipy") sample_points = np.linspace(-100, 100, 20) @@ -672,6 +654,9 @@ def test_errorbar_asymmetrical(self): expected = (err.T * np.array([-1, 1])) + s.to_numpy().reshape(-1, 1) tm.assert_numpy_array_equal(result, expected) + def test_errorbar_asymmetrical_error(self): + # GH9536 + s = Series(np.arange(10), name="x") msg = ( "Asymmetrical error bars should be provided " f"with the shape \\(2, {len(s)}\\)" @@ -758,8 +743,6 @@ def test_series_grid_settings(self): @pytest.mark.parametrize("c", ["r", "red", "green", "#FF0000"]) def test_standard_colors(self, c): - from pandas.plotting._matplotlib.style import get_standard_colors - result = get_standard_colors(1, color=c) assert result == [c] @@ -773,12 +756,8 @@ def test_standard_colors(self, c): assert result == [c] * 3 def test_standard_colors_all(self): - from matplotlib import colors - - from pandas.plotting._matplotlib.style import get_standard_colors - # multiple colors like mediumaquamarine - for c in 
colors.cnames: + for c in mpl.colors.cnames: result = get_standard_colors(num_colors=1, color=c) assert result == [c] @@ -792,7 +771,7 @@ def test_standard_colors_all(self): assert result == [c] * 3 # single letter colors like k - for c in colors.ColorConverter.colors: + for c in mpl.colors.ColorConverter.colors: result = get_standard_colors(num_colors=1, color=c) assert result == [c] @@ -820,8 +799,6 @@ def test_time_series_plot_color_kwargs(self): _check_colors(ax.get_lines(), linecolors=["green"]) def test_time_series_plot_color_with_empty_kwargs(self): - import matplotlib as mpl - def_colors = _unpack_cycler(mpl.rcParams) index = date_range("1/1/2000", periods=12) s = Series(np.arange(1, 13), index=index) @@ -850,8 +827,6 @@ def test_xtick_barPlot(self): def test_custom_business_day_freq(self): # GH7222 - from pandas.tseries.offsets import CustomBusinessDay - s = Series( range(100, 121), index=pd.bdate_range( @@ -983,3 +958,40 @@ def test_plot_no_warning(self, ts): # TODO(3.0): this can be removed once Period[B] deprecation is enforced with tm.assert_produces_warning(False): _ = ts.plot() + + def test_secondary_y_subplot_axis_labels(self): + # GH#14102 + s1 = Series([5, 7, 6, 8, 7], index=[1, 2, 3, 4, 5]) + s2 = Series([6, 4, 5, 3, 4], index=[1, 2, 3, 4, 5]) + + ax = plt.subplot(2, 1, 1) + s1.plot(ax=ax) + s2.plot(ax=ax, secondary_y=True) + ax2 = plt.subplot(2, 1, 2) + s1.plot(ax=ax2) + assert len(ax.xaxis.get_minor_ticks()) == 0 + assert len(ax.get_xticklabels()) > 0 + + def test_bar_line_plot(self): + """ + Test that bar and line plots with the same x values are superposed + and that the x limits are set such that the plots are visible. + """ + # GH61161 + index = period_range("2023", periods=3, freq="Y") + years = set(index.year.astype(str)) + s = Series([1, 2, 3], index=index) + ax = plt.subplot() + s.plot(kind="bar", ax=ax) + bar_xticks = [ + label for label in ax.get_xticklabels() if label.get_text() in years + ] + s.plot(kind="line", ax=ax, color="r") + line_xticks = [ + label for label in ax.get_xticklabels() if label.get_text() in years + ] + assert len(bar_xticks) == len(index) + assert bar_xticks == line_xticks + x_limits = ax.get_xlim() + assert x_limits[0] <= bar_xticks[0].get_position()[0] + assert x_limits[1] >= bar_xticks[-1].get_position()[0] diff --git a/pandas/tests/plotting/test_style.py b/pandas/tests/plotting/test_style.py index 665bda15724fd..f9c89e0a7893f 100644 --- a/pandas/tests/plotting/test_style.py +++ b/pandas/tests/plotting/test_style.py @@ -2,7 +2,8 @@ from pandas import Series -pytest.importorskip("matplotlib") +mpl = pytest.importorskip("matplotlib") +plt = pytest.importorskip("matplotlib.pyplot") from pandas.plotting._matplotlib.style import get_standard_colors @@ -18,11 +19,8 @@ class TestGetStandardColors: ], ) def test_default_colors_named_from_prop_cycle(self, num_colors, expected): - import matplotlib as mpl - from matplotlib.pyplot import cycler - mpl_params = { - "axes.prop_cycle": cycler(color=["red", "green", "blue"]), + "axes.prop_cycle": plt.cycler(color=["red", "green", "blue"]), } with mpl.rc_context(rc=mpl_params): result = get_standard_colors(num_colors=num_colors) @@ -39,11 +37,8 @@ def test_default_colors_named_from_prop_cycle(self, num_colors, expected): ], ) def test_default_colors_named_from_prop_cycle_string(self, num_colors, expected): - import matplotlib as mpl - from matplotlib.pyplot import cycler - mpl_params = { - "axes.prop_cycle": cycler(color="bgry"), + "axes.prop_cycle": plt.cycler(color="bgry"), } with 
mpl.rc_context(rc=mpl_params): result = get_standard_colors(num_colors=num_colors) @@ -74,11 +69,8 @@ def test_default_colors_named_from_prop_cycle_string(self, num_colors, expected) ], ) def test_default_colors_named_undefined_prop_cycle(self, num_colors, expected_name): - import matplotlib as mpl - import matplotlib.colors as mcolors - with mpl.rc_context(rc={}): - expected = [mcolors.to_hex(x) for x in expected_name] + expected = [mpl.colors.to_hex(x) for x in expected_name] result = get_standard_colors(num_colors=num_colors) assert result == expected diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 30ec0d0affaa3..a7bb80727206e 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -128,28 +128,14 @@ def test_nanargminmax(self, opname, index_or_series): obj = klass([NaT, datetime(2011, 11, 1)]) assert getattr(obj, arg_op)() == 1 - msg = ( - "The behavior of (DatetimeIndex|Series).argmax/argmin with " - "skipna=False and NAs" - ) - if klass is Series: - msg = "The behavior of Series.(idxmax|idxmin) with all-NA" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = getattr(obj, arg_op)(skipna=False) - if klass is Series: - assert np.isnan(result) - else: - assert result == -1 + with pytest.raises(ValueError, match="Encountered an NA value"): + getattr(obj, arg_op)(skipna=False) obj = klass([NaT, datetime(2011, 11, 1), NaT]) # check DatetimeIndex non-monotonic path assert getattr(obj, arg_op)() == 1 - with tm.assert_produces_warning(FutureWarning, match=msg): - result = getattr(obj, arg_op)(skipna=False) - if klass is Series: - assert np.isnan(result) - else: - assert result == -1 + with pytest.raises(ValueError, match="Encountered an NA value"): + getattr(obj, arg_op)(skipna=False) @pytest.mark.parametrize("opname", ["max", "min"]) @pytest.mark.parametrize("dtype", ["M8[ns]", "datetime64[ns, UTC]"]) @@ -175,40 +161,38 @@ def test_argminmax(self): obj = Index([np.nan, 1, np.nan, 2]) assert obj.argmin() == 1 assert obj.argmax() == 3 - msg = "The behavior of Index.argmax/argmin with skipna=False and NAs" - with tm.assert_produces_warning(FutureWarning, match=msg): - assert obj.argmin(skipna=False) == -1 - with tm.assert_produces_warning(FutureWarning, match=msg): - assert obj.argmax(skipna=False) == -1 + with pytest.raises(ValueError, match="Encountered an NA value"): + obj.argmin(skipna=False) + with pytest.raises(ValueError, match="Encountered an NA value"): + obj.argmax(skipna=False) obj = Index([np.nan]) - with tm.assert_produces_warning(FutureWarning, match=msg): - assert obj.argmin() == -1 - with tm.assert_produces_warning(FutureWarning, match=msg): - assert obj.argmax() == -1 - with tm.assert_produces_warning(FutureWarning, match=msg): - assert obj.argmin(skipna=False) == -1 - with tm.assert_produces_warning(FutureWarning, match=msg): - assert obj.argmax(skipna=False) == -1 - - msg = "The behavior of DatetimeIndex.argmax/argmin with skipna=False and NAs" + with pytest.raises(ValueError, match="Encountered all NA values"): + obj.argmin() + with pytest.raises(ValueError, match="Encountered all NA values"): + obj.argmax() + with pytest.raises(ValueError, match="Encountered an NA value"): + obj.argmin(skipna=False) + with pytest.raises(ValueError, match="Encountered an NA value"): + obj.argmax(skipna=False) + obj = Index([NaT, datetime(2011, 11, 1), datetime(2011, 11, 2), NaT]) assert obj.argmin() == 1 assert obj.argmax() == 2 - with 
tm.assert_produces_warning(FutureWarning, match=msg): - assert obj.argmin(skipna=False) == -1 - with tm.assert_produces_warning(FutureWarning, match=msg): - assert obj.argmax(skipna=False) == -1 + with pytest.raises(ValueError, match="Encountered an NA value"): + obj.argmin(skipna=False) + with pytest.raises(ValueError, match="Encountered an NA value"): + obj.argmax(skipna=False) obj = Index([NaT]) - with tm.assert_produces_warning(FutureWarning, match=msg): - assert obj.argmin() == -1 - with tm.assert_produces_warning(FutureWarning, match=msg): - assert obj.argmax() == -1 - with tm.assert_produces_warning(FutureWarning, match=msg): - assert obj.argmin(skipna=False) == -1 - with tm.assert_produces_warning(FutureWarning, match=msg): - assert obj.argmax(skipna=False) == -1 + with pytest.raises(ValueError, match="Encountered all NA values"): + obj.argmin() + with pytest.raises(ValueError, match="Encountered all NA values"): + obj.argmax() + with pytest.raises(ValueError, match="Encountered an NA value"): + obj.argmin(skipna=False) + with pytest.raises(ValueError, match="Encountered an NA value"): + obj.argmax(skipna=False) @pytest.mark.parametrize("op, expected_col", [["max", "a"], ["min", "b"]]) def test_same_tz_min_max_axis_1(self, op, expected_col): @@ -394,7 +378,7 @@ def test_invalid_td64_reductions(self, opname): [ f"reduction operation '{opname}' not allowed for this dtype", rf"cannot perform {opname} with type timedelta64\[ns\]", - f"does not support reduction '{opname}'", + f"does not support operation '{opname}'", ] ) @@ -594,11 +578,6 @@ def test_sum_inf(self): arr = np.random.default_rng(2).standard_normal((100, 100)).astype("f4") arr[:, 2] = np.inf - msg = "use_inf_as_na option is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - with pd.option_context("mode.use_inf_as_na", True): - tm.assert_almost_equal(s.sum(), s2.sum()) - res = nanops.nansum(arr, axis=1) assert np.isinf(res).all() @@ -686,7 +665,7 @@ def test_empty(self, method, unit, use_bottleneck, dtype): # GH#844 (changed in GH#9422) df = DataFrame(np.empty((10, 0)), dtype=dtype) - assert (getattr(df, method)(1) == unit).all() + assert (getattr(df, method)(axis=1) == unit).all() s = Series([1], dtype=dtype) result = getattr(s, method)(min_count=2) @@ -735,7 +714,7 @@ def test_ops_consistency_on_empty(self, method): [ "operation 'var' not allowed", r"cannot perform var with type timedelta64\[ns\]", - "does not support reduction 'var'", + "does not support operation 'var'", ] ) with pytest.raises(TypeError, match=msg): @@ -801,8 +780,7 @@ def test_var_masked_array(self, ddof, exp): assert result == result_numpy_dtype assert result == exp - @pytest.mark.parametrize("dtype", ("m8[ns]", "m8[ns]", "M8[ns]", "M8[ns, UTC]")) - @pytest.mark.parametrize("skipna", [True, False]) + @pytest.mark.parametrize("dtype", ("m8[ns]", "M8[ns]", "M8[ns, UTC]")) def test_empty_timeseries_reductions_return_nat(self, dtype, skipna): # covers GH#11245 assert Series([], dtype=dtype).min(skipna=skipna) is NaT @@ -847,26 +825,16 @@ def test_idxmin_dt64index(self, unit): # GH#43587 should have NaT instead of NaN dti = DatetimeIndex(["NaT", "2015-02-08", "NaT"]).as_unit(unit) ser = Series([1.0, 2.0, np.nan], index=dti) - msg = "The behavior of Series.idxmin with all-NA values" - with tm.assert_produces_warning(FutureWarning, match=msg): - res = ser.idxmin(skipna=False) - assert res is NaT - msg = "The behavior of Series.idxmax with all-NA values" - with tm.assert_produces_warning(FutureWarning, match=msg): - res = 
ser.idxmax(skipna=False)
-        assert res is NaT
+        with pytest.raises(ValueError, match="Encountered an NA value"):
+            ser.idxmin(skipna=False)
+        with pytest.raises(ValueError, match="Encountered an NA value"):
+            ser.idxmax(skipna=False)

         df = ser.to_frame()
-        msg = "The behavior of DataFrame.idxmin with all-NA values"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            res = df.idxmin(skipna=False)
-        assert res.dtype == f"M8[{unit}]"
-        assert res.isna().all()
-        msg = "The behavior of DataFrame.idxmax with all-NA values"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            res = df.idxmax(skipna=False)
-        assert res.dtype == f"M8[{unit}]"
-        assert res.isna().all()
+        with pytest.raises(ValueError, match="Encountered an NA value"):
+            df.idxmin(skipna=False)
+        with pytest.raises(ValueError, match="Encountered an NA value"):
+            df.idxmax(skipna=False)

     def test_idxmin(self):
         # test idxmin
@@ -878,9 +846,8 @@ def test_idxmin(self):
         # skipna or no
         assert string_series[string_series.idxmin()] == string_series.min()
-        msg = "The behavior of Series.idxmin"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            assert isna(string_series.idxmin(skipna=False))
+        with pytest.raises(ValueError, match="Encountered an NA value"):
+            string_series.idxmin(skipna=False)

         # no NaNs
         nona = string_series.dropna()
@@ -889,8 +856,9 @@
         # all NaNs
         allna = string_series * np.nan
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            assert isna(allna.idxmin())
+        msg = "Encountered all NA values"
+        with pytest.raises(ValueError, match=msg):
+            allna.idxmin()

         # datetime64[ns]
         s = Series(date_range("20130102", periods=6))
@@ -911,8 +879,7 @@ def test_idxmax(self):
         # skipna or no
         assert string_series[string_series.idxmax()] == string_series.max()
-        msg = "The behavior of Series.idxmax with all-NA values"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
+        with pytest.raises(ValueError, match="Encountered an NA value"):
             assert isna(string_series.idxmax(skipna=False))

         # no NaNs
@@ -922,9 +889,9 @@
         # all NaNs
         allna = string_series * np.nan
-        msg = "The behavior of Series.idxmax with all-NA values"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            assert isna(allna.idxmax())
+        msg = "Encountered all NA values"
+        with pytest.raises(ValueError, match=msg):
+            allna.idxmax()

         s = Series(date_range("20130102", periods=6))
         result = s.idxmax()
@@ -986,32 +953,27 @@ def test_all_any_bool_only(self):
         assert s.any(bool_only=True)
         assert not s.all(bool_only=True)

-    @pytest.mark.parametrize("bool_agg_func", ["any", "all"])
-    @pytest.mark.parametrize("skipna", [True, False])
-    def test_any_all_object_dtype(self, bool_agg_func, skipna):
+    def test_any_all_object_dtype(self, all_boolean_reductions, skipna):
         # GH#12863
         ser = Series(["a", "b", "c", "d", "e"], dtype=object)
-        result = getattr(ser, bool_agg_func)(skipna=skipna)
+        result = getattr(ser, all_boolean_reductions)(skipna=skipna)
         expected = True
         assert result == expected

-    @pytest.mark.parametrize("bool_agg_func", ["any", "all"])
     @pytest.mark.parametrize(
         "data", [[False, None], [None, False], [False, np.nan], [np.nan, False]]
     )
-    def test_any_all_object_dtype_missing(self, data, bool_agg_func):
+    def test_any_all_object_dtype_missing(self, data, all_boolean_reductions):
         # GH#27709
         ser = Series(data)
-        result = getattr(ser, bool_agg_func)(skipna=False)
+        result = getattr(ser, all_boolean_reductions)(skipna=False)

         # None is treated as False, but np.nan is treated as True
-        expected =
bool_agg_func == "any" and None not in data + expected = all_boolean_reductions == "any" and None not in data assert result == expected @pytest.mark.parametrize("dtype", ["boolean", "Int64", "UInt64", "Float64"]) - @pytest.mark.parametrize("bool_agg_func", ["any", "all"]) - @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize( # expected_data indexed as [[skipna=False/any, skipna=False/all], # [skipna=True/any, skipna=True/all]] @@ -1026,13 +988,13 @@ def test_any_all_object_dtype_missing(self, data, bool_agg_func): ], ) def test_any_all_nullable_kleene_logic( - self, bool_agg_func, skipna, data, dtype, expected_data + self, all_boolean_reductions, skipna, data, dtype, expected_data ): # GH-37506, GH-41967 ser = Series(data, dtype=dtype) - expected = expected_data[skipna][bool_agg_func == "all"] + expected = expected_data[skipna][all_boolean_reductions == "all"] - result = getattr(ser, bool_agg_func)(skipna=skipna) + result = getattr(ser, all_boolean_reductions)(skipna=skipna) assert (result is pd.NA and expected is pd.NA) or result == expected def test_any_axis1_bool_only(self): @@ -1049,32 +1011,41 @@ def test_any_all_datetimelike(self): ser = Series(dta) df = DataFrame(ser) - msg = "'(any|all)' with datetime64 dtypes is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - # GH#34479 - assert dta.all() - assert dta.any() + # GH#34479 + msg = "datetime64 type does not support operation '(any|all)'" + with pytest.raises(TypeError, match=msg): + dta.all() + with pytest.raises(TypeError, match=msg): + dta.any() - assert ser.all() - assert ser.any() + with pytest.raises(TypeError, match=msg): + ser.all() + with pytest.raises(TypeError, match=msg): + ser.any() - assert df.any().all() - assert df.all().all() + with pytest.raises(TypeError, match=msg): + df.any().all() + with pytest.raises(TypeError, match=msg): + df.all().all() dta = dta.tz_localize("UTC") ser = Series(dta) df = DataFrame(ser) + # GH#34479 + with pytest.raises(TypeError, match=msg): + dta.all() + with pytest.raises(TypeError, match=msg): + dta.any() - with tm.assert_produces_warning(FutureWarning, match=msg): - # GH#34479 - assert dta.all() - assert dta.any() - - assert ser.all() - assert ser.any() + with pytest.raises(TypeError, match=msg): + ser.all() + with pytest.raises(TypeError, match=msg): + ser.any() - assert df.any().all() - assert df.all().all() + with pytest.raises(TypeError, match=msg): + df.any().all() + with pytest.raises(TypeError, match=msg): + df.all().all() tda = dta - dta[0] ser = Series(tda) @@ -1089,25 +1060,62 @@ def test_any_all_datetimelike(self): assert df.any().all() assert not df.all().any() - def test_any_all_pyarrow_string(self): + def test_any_all_string_dtype(self, any_string_dtype): # GH#54591 - pytest.importorskip("pyarrow") - ser = Series(["", "a"], dtype="string[pyarrow_numpy]") + if ( + isinstance(any_string_dtype, pd.StringDtype) + and any_string_dtype.na_value is pd.NA + ): + # the nullable string dtype currently still raise an error + # https://github.com/pandas-dev/pandas/issues/51939 + ser = Series(["a", "b"], dtype=any_string_dtype) + with pytest.raises(TypeError): + ser.any() + with pytest.raises(TypeError): + ser.all() + return + + ser = Series(["", "a"], dtype=any_string_dtype) assert ser.any() assert not ser.all() + assert ser.any(skipna=False) + assert not ser.all(skipna=False) - ser = Series([None, "a"], dtype="string[pyarrow_numpy]") + ser = Series([np.nan, "a"], dtype=any_string_dtype) assert ser.any() assert ser.all() - assert not 
ser.all(skipna=False) + assert ser.any(skipna=False) + assert ser.all(skipna=False) # NaN is considered truthy - ser = Series([None, ""], dtype="string[pyarrow_numpy]") + ser = Series([np.nan, ""], dtype=any_string_dtype) assert not ser.any() assert not ser.all() + assert ser.any(skipna=False) # NaN is considered truthy + assert not ser.all(skipna=False) - ser = Series(["a", "b"], dtype="string[pyarrow_numpy]") + ser = Series(["a", "b"], dtype=any_string_dtype) assert ser.any() assert ser.all() + assert ser.any(skipna=False) + assert ser.all(skipna=False) + + ser = Series([], dtype=any_string_dtype) + assert not ser.any() + assert ser.all() + assert not ser.any(skipna=False) + assert ser.all(skipna=False) + + ser = Series([""], dtype=any_string_dtype) + assert not ser.any() + assert not ser.all() + assert not ser.any(skipna=False) + assert not ser.all(skipna=False) + + ser = Series([np.nan], dtype=any_string_dtype) + assert not ser.any() + assert ser.all() + assert ser.any(skipna=False) # NaN is considered truthy + assert ser.all(skipna=False) # NaN is considered truthy def test_timedelta64_analytics(self): # index min/max @@ -1150,18 +1158,7 @@ def test_timedelta64_analytics(self): expected = Timedelta("1 days") assert result == expected - @pytest.mark.parametrize( - "test_input,error_type", - [ - (Series([], dtype="float64"), ValueError), - # For strings, or any Series with dtype 'O' - (Series(["foo", "bar", "baz"]), TypeError), - (Series([(1,), (2,)]), TypeError), - # For mixed data types - (Series(["foo", "foo", "bar", "bar", None, np.nan, "baz"]), TypeError), - ], - ) - def test_assert_idxminmax_empty_raises(self, test_input, error_type): + def test_assert_idxminmax_empty_raises(self): """ Cases where ``Series.argmax`` and related should raise an exception """ @@ -1250,24 +1247,12 @@ def test_idxminmax_with_inf(self): s = Series([0, -np.inf, np.inf, np.nan]) assert s.idxmin() == 1 - msg = "The behavior of Series.idxmin with all-NA values" - with tm.assert_produces_warning(FutureWarning, match=msg): - assert np.isnan(s.idxmin(skipna=False)) + with pytest.raises(ValueError, match="Encountered an NA value"): + s.idxmin(skipna=False) assert s.idxmax() == 2 - msg = "The behavior of Series.idxmax with all-NA values" - with tm.assert_produces_warning(FutureWarning, match=msg): - assert np.isnan(s.idxmax(skipna=False)) - - msg = "use_inf_as_na option is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - # Using old-style behavior that treats floating point nan, -inf, and - # +inf as missing - with pd.option_context("mode.use_inf_as_na", True): - assert s.idxmin() == 0 - assert np.isnan(s.idxmin(skipna=False)) - assert s.idxmax() == 0 - np.isnan(s.idxmax(skipna=False)) + with pytest.raises(ValueError, match="Encountered an NA value"): + s.idxmax(skipna=False) def test_sum_uint64(self): # GH 53401 @@ -1276,6 +1261,12 @@ def test_sum_uint64(self): expected = np.uint64(10000000000000000000) tm.assert_almost_equal(result, expected) + def test_signedness_preserved_after_sum(self): + # GH 37491 + ser = Series([1, 2, 3, 4]) + + assert ser.astype("uint8").sum().dtype == "uint64" + class TestDatetime64SeriesReductions: # Note: the name TestDatetime64SeriesReductions indicates these tests @@ -1300,13 +1291,14 @@ def test_minmax_nat_series(self, nat_ser): @pytest.mark.parametrize( "nat_df", [ - DataFrame([NaT, NaT]), - DataFrame([NaT, Timedelta("nat")]), - DataFrame([Timedelta("nat"), Timedelta("nat")]), + [NaT, NaT], + [NaT, Timedelta("nat")], + [Timedelta("nat"), 
Timedelta("nat")], ], ) def test_minmax_nat_dataframe(self, nat_df): # GH#23282 + nat_df = DataFrame(nat_df) assert nat_df.min()[0] is NaT assert nat_df.max()[0] is NaT assert nat_df.min(skipna=False)[0] is NaT @@ -1380,7 +1372,6 @@ def test_min_max_ordered(self, values, categories, function): assert result == expected @pytest.mark.parametrize("function", ["min", "max"]) - @pytest.mark.parametrize("skipna", [True, False]) def test_min_max_ordered_with_nan_only(self, function, skipna): # https://github.com/pandas-dev/pandas/issues/33450 cat = Series(Categorical([np.nan], categories=[1, 2], ordered=True)) @@ -1388,7 +1379,6 @@ def test_min_max_ordered_with_nan_only(self, function, skipna): assert result is np.nan @pytest.mark.parametrize("function", ["min", "max"]) - @pytest.mark.parametrize("skipna", [True, False]) def test_min_max_skipna(self, function, skipna): cat = Series( Categorical(["a", "b", np.nan, "a"], categories=["b", "a"], ordered=True) @@ -1407,14 +1397,10 @@ class TestSeriesMode: # were moved from a series-specific test file, _not_ that these tests are # intended long-term to be series-specific - @pytest.mark.parametrize( - "dropna, expected", - [(True, Series([], dtype=np.float64)), (False, Series([], dtype=np.float64))], - ) - def test_mode_empty(self, dropna, expected): + def test_mode_empty(self, dropna): s = Series([], dtype=np.float64) result = s.mode(dropna) - tm.assert_series_equal(result, expected) + tm.assert_series_equal(result, s) @pytest.mark.parametrize( "dropna, data, expected", @@ -1425,13 +1411,10 @@ def test_mode_empty(self, dropna, expected): (False, [1, 1, 1, 2, 3, 3, 3], [1, 3]), ], ) - @pytest.mark.parametrize( - "dt", list(np.typecodes["AllInteger"] + np.typecodes["Float"]) - ) - def test_mode_numerical(self, dropna, data, expected, dt): - s = Series(data, dtype=dt) + def test_mode_numerical(self, dropna, data, expected, any_real_numpy_dtype): + s = Series(data, dtype=any_real_numpy_dtype) result = s.mode(dropna) - expected = Series(expected, dtype=dt) + expected = Series(expected, dtype=any_real_numpy_dtype) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("dropna, expected", [(True, [1.0]), (False, [1, np.nan])]) @@ -1442,10 +1425,13 @@ def test_mode_numerical_nan(self, dropna, expected): tm.assert_series_equal(result, expected) @pytest.mark.parametrize( - "dropna, expected1, expected2, expected3", - [(True, ["b"], ["bar"], ["nan"]), (False, ["b"], [np.nan], ["nan"])], + "dropna, expected1, expected2", + [ + (True, ["b"], ["bar"]), + (False, ["b"], [np.nan]), + ], ) - def test_mode_str_obj(self, dropna, expected1, expected2, expected3): + def test_mode_object(self, dropna, expected1, expected2): # Test string and object types. data = ["a"] * 2 + ["b"] * 3 @@ -1458,15 +1444,31 @@ def test_mode_str_obj(self, dropna, expected1, expected2, expected3): s = Series(data, dtype=object) result = s.mode(dropna) - expected2 = Series(expected2, dtype=None if expected2 == ["bar"] else object) + expected2 = Series(expected2, dtype=object) tm.assert_series_equal(result, expected2) + @pytest.mark.parametrize( + "dropna, expected1, expected2", + [ + (True, ["b"], ["bar"]), + (False, ["b"], [np.nan]), + ], + ) + def test_mode_string(self, dropna, expected1, expected2, any_string_dtype): + # Test string and object types. 
+ data = ["a"] * 2 + ["b"] * 3 + + s = Series(data, dtype=any_string_dtype) + result = s.mode(dropna) + expected1 = Series(expected1, dtype=any_string_dtype) + tm.assert_series_equal(result, expected1) + data = ["foo", "bar", "bar", np.nan, np.nan, np.nan] - s = Series(data, dtype=object).astype(str) + s = Series(data, dtype=any_string_dtype) result = s.mode(dropna) - expected3 = Series(expected3) - tm.assert_series_equal(result, expected3) + expected2 = Series(expected2, dtype=any_string_dtype) + tm.assert_series_equal(result, expected2) @pytest.mark.parametrize( "dropna, expected1, expected2", @@ -1475,12 +1477,12 @@ def test_mode_str_obj(self, dropna, expected1, expected2, expected3): def test_mode_mixeddtype(self, dropna, expected1, expected2): s = Series([1, "foo", "foo"]) result = s.mode(dropna) - expected = Series(expected1) + expected = Series(expected1, dtype=object) tm.assert_series_equal(result, expected) s = Series([1, "foo", "foo", np.nan, np.nan, np.nan]) result = s.mode(dropna) - expected = Series(expected2, dtype=None if expected2 == ["foo"] else object) + expected = Series(expected2, dtype=object) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -1605,24 +1607,17 @@ def test_mode_intoverflow(self, dropna, expected1, expected2): expected2 = Series(expected2, dtype=np.uint64) tm.assert_series_equal(result, expected2) - def test_mode_sortwarning(self): - # Check for the warning that is raised when the mode - # results cannot be sorted - - expected = Series(["foo", np.nan]) + def test_mode_sort_with_na(self): s = Series([1, "foo", "foo", np.nan, np.nan]) - - with tm.assert_produces_warning(UserWarning): - result = s.mode(dropna=False) - result = result.sort_values().reset_index(drop=True) - + expected = Series(["foo", np.nan], dtype=object) + result = s.mode(dropna=False) tm.assert_series_equal(result, expected) def test_mode_boolean_with_na(self): # GH#42107 ser = Series([True, False, True, pd.NA], dtype="boolean") result = ser.mode() - expected = Series({0: True}, dtype="boolean") + expected = Series([True], dtype="boolean") tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -1630,23 +1625,24 @@ def test_mode_boolean_with_na(self): [ ( [0, 1j, 1, 1, 1 + 1j, 1 + 2j], - Series([1], dtype=np.complex128), + [1], np.complex128, ), ( [0, 1j, 1, 1, 1 + 1j, 1 + 2j], - Series([1], dtype=np.complex64), + [1], np.complex64, ), ( [1 + 1j, 2j, 1 + 1j], - Series([1 + 1j], dtype=np.complex128), + [1 + 1j], np.complex128, ), ], ) def test_single_mode_value_complex(self, array, expected, dtype): result = Series(array, dtype=dtype).mode() + expected = Series(expected, dtype=dtype) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -1655,12 +1651,12 @@ def test_single_mode_value_complex(self, array, expected, dtype): ( # no modes [0, 1j, 1, 1 + 1j, 1 + 2j], - Series([0j, 1j, 1 + 0j, 1 + 1j, 1 + 2j], dtype=np.complex128), + [0j, 1j, 1 + 0j, 1 + 1j, 1 + 2j], np.complex128, ), ( [1 + 1j, 2j, 1 + 1j, 2j, 3], - Series([2j, 1 + 1j], dtype=np.complex64), + [2j, 1 + 1j], np.complex64, ), ], @@ -1670,4 +1666,5 @@ def test_multimode_complex(self, array, expected, dtype): # mode tries to sort multimodal series. 
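        # Editor's note: a worked check, under the assumption that the sort
        # is numpy's lexicographic complex ordering (real part, then
        # imaginary part) rather than magnitude alone:
        #
        #   np.sort(np.array([1 + 1j, 2j]))  # -> array([0.+2.j, 1.+1.j])
        #
        # which matches the expected [2j, 1 + 1j] in the parametrization above.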
# Complex numbers are sorted by their magnitude result = Series(array, dtype=dtype).mode() + expected = Series(expected, dtype=dtype) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/reductions/test_stat_reductions.py b/pandas/tests/reductions/test_stat_reductions.py index 8fbb78737474c..4af1ca1d4800a 100644 --- a/pandas/tests/reductions/test_stat_reductions.py +++ b/pandas/tests/reductions/test_stat_reductions.py @@ -1,6 +1,7 @@ """ Tests for statistical reductions of 2nd moment or higher: var, skew, kurt, ... """ + import inspect import numpy as np @@ -16,8 +17,7 @@ class TestDatetimeLikeStatReductions: - @pytest.mark.parametrize("box", [Series, pd.Index, pd.array]) - def test_dt64_mean(self, tz_naive_fixture, box): + def test_dt64_mean(self, tz_naive_fixture, index_or_series_or_array): tz = tz_naive_fixture dti = date_range("2001-01-01", periods=11, tz=tz) @@ -25,20 +25,19 @@ def test_dt64_mean(self, tz_naive_fixture, box): dti = dti.take([4, 1, 3, 10, 9, 7, 8, 5, 0, 2, 6]) dtarr = dti._data - obj = box(dtarr) + obj = index_or_series_or_array(dtarr) assert obj.mean() == pd.Timestamp("2001-01-06", tz=tz) assert obj.mean(skipna=False) == pd.Timestamp("2001-01-06", tz=tz) # dtarr[-2] will be the first date 2001-01-1 dtarr[-2] = pd.NaT - obj = box(dtarr) + obj = index_or_series_or_array(dtarr) assert obj.mean() == pd.Timestamp("2001-01-06 07:12:00", tz=tz) assert obj.mean(skipna=False) is pd.NaT - @pytest.mark.parametrize("box", [Series, pd.Index, pd.array]) @pytest.mark.parametrize("freq", ["s", "h", "D", "W", "B"]) - def test_period_mean(self, box, freq): + def test_period_mean(self, index_or_series_or_array, freq): # GH#24757 dti = date_range("2001-01-01", periods=11) # shuffle so that we are not just working with monotone-increasing @@ -48,7 +47,7 @@ def test_period_mean(self, box, freq): msg = r"PeriodDtype\[B\] is deprecated" with tm.assert_produces_warning(warn, match=msg): parr = dti._data.to_period(freq) - obj = box(parr) + obj = index_or_series_or_array(parr) with pytest.raises(TypeError, match="ambiguous"): obj.mean() with pytest.raises(TypeError, match="ambiguous"): @@ -62,13 +61,12 @@ def test_period_mean(self, box, freq): with pytest.raises(TypeError, match="ambiguous"): obj.mean(skipna=True) - @pytest.mark.parametrize("box", [Series, pd.Index, pd.array]) - def test_td64_mean(self, box): + def test_td64_mean(self, index_or_series_or_array): m8values = np.array([0, 3, -2, -7, 1, 2, -1, 3, 5, -2, 4], "m8[D]") tdi = pd.TimedeltaIndex(m8values).as_unit("ns") tdarr = tdi._data - obj = box(tdarr, copy=False) + obj = index_or_series_or_array(tdarr, copy=False) result = obj.mean() expected = np.array(tdarr).mean() @@ -101,7 +99,7 @@ def _check_stat_op( # mean, idxmax, idxmin, min, and max are valid for dates if name not in ["max", "min", "mean", "median", "std"]: ds = Series(date_range("1/1/2001", periods=10)) - msg = f"does not support reduction '{name}'" + msg = f"does not support operation '{name}'" with pytest.raises(TypeError, match=msg): f(ds) diff --git a/pandas/tests/resample/conftest.py b/pandas/tests/resample/conftest.py index 1033d908eb22d..6c45ece5d8fb9 100644 --- a/pandas/tests/resample/conftest.py +++ b/pandas/tests/resample/conftest.py @@ -1,13 +1,5 @@ -from datetime import datetime - -import numpy as np import pytest -from pandas import ( - DataFrame, - Series, -) - # The various methods we support downsample_methods = [ "min", @@ -39,105 +31,3 @@ def downsample_method(request): def resample_method(request): """Fixture for parametrization of Grouper 
resample methods.""" return request.param - - -@pytest.fixture -def _index_start(): - """Fixture for parametrization of index, series and frame.""" - return datetime(2005, 1, 1) - - -@pytest.fixture -def _index_end(): - """Fixture for parametrization of index, series and frame.""" - return datetime(2005, 1, 10) - - -@pytest.fixture -def _index_freq(): - """Fixture for parametrization of index, series and frame.""" - return "D" - - -@pytest.fixture -def _index_name(): - """Fixture for parametrization of index, series and frame.""" - return None - - -@pytest.fixture -def index(_index_factory, _index_start, _index_end, _index_freq, _index_name): - """ - Fixture for parametrization of date_range, period_range and - timedelta_range indexes - """ - return _index_factory(_index_start, _index_end, freq=_index_freq, name=_index_name) - - -@pytest.fixture -def _static_values(index): - """ - Fixture for parametrization of values used in parametrization of - Series and DataFrames with date_range, period_range and - timedelta_range indexes - """ - return np.arange(len(index)) - - -@pytest.fixture -def _series_name(): - """ - Fixture for parametrization of Series name for Series used with - date_range, period_range and timedelta_range indexes - """ - return None - - -@pytest.fixture -def series(index, _series_name, _static_values): - """ - Fixture for parametrization of Series with date_range, period_range and - timedelta_range indexes - """ - return Series(_static_values, index=index, name=_series_name) - - -@pytest.fixture -def empty_series_dti(series): - """ - Fixture for parametrization of empty Series with date_range, - period_range and timedelta_range indexes - """ - return series[:0] - - -@pytest.fixture -def frame(index, _series_name, _static_values): - """ - Fixture for parametrization of DataFrame with date_range, period_range - and timedelta_range indexes - """ - # _series_name is intentionally unused - return DataFrame({"value": _static_values}, index=index) - - -@pytest.fixture -def empty_frame_dti(series): - """ - Fixture for parametrization of empty DataFrame with date_range, - period_range and timedelta_range indexes - """ - index = series.index[:0] - return DataFrame(index=index) - - -@pytest.fixture -def series_and_frame(frame_or_series, series, frame): - """ - Fixture for parametrization of Series and DataFrame with date_range, - period_range and timedelta_range indexes - """ - if frame_or_series == Series: - return series - if frame_or_series == DataFrame: - return frame diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index 50644e33e45e1..d9bd89af61aaf 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -3,6 +3,9 @@ import numpy as np import pytest +from pandas.core.dtypes.common import is_extension_array_dtype + +import pandas as pd from pandas import ( DataFrame, DatetimeIndex, @@ -21,54 +24,63 @@ from pandas.core.indexes.timedeltas import timedelta_range from pandas.core.resample import _asfreq_compat -# a fixture value can be overridden by the test parameter value. Note that the -# value of the fixture can be overridden this way even if the test doesn't use -# it directly (doesn't mention it in the function prototype). 
-# see https://docs.pytest.org/en/latest/fixture.html#override-a-fixture-with-direct-test-parametrization # noqa: E501 -# in this module we override the fixture values defined in conftest.py -# tuples of '_index_factory,_series_name,_index_start,_index_end' -DATE_RANGE = (date_range, "dti", datetime(2005, 1, 1), datetime(2005, 1, 10)) -PERIOD_RANGE = (period_range, "pi", datetime(2005, 1, 1), datetime(2005, 1, 10)) -TIMEDELTA_RANGE = (timedelta_range, "tdi", "1 day", "10 day") - -all_ts = pytest.mark.parametrize( - "_index_factory,_series_name,_index_start,_index_end", - [DATE_RANGE, PERIOD_RANGE, TIMEDELTA_RANGE], -) - -@pytest.fixture -def create_index(_index_factory): - def _create_index(*args, **kwargs): - """return the _index_factory created using the args, kwargs""" - return _index_factory(*args, **kwargs) - - return _create_index +@pytest.fixture( + params=[ + "linear", + "time", + "index", + "values", + "nearest", + "zero", + "slinear", + "quadratic", + "cubic", + "barycentric", + "krogh", + "from_derivatives", + "piecewise_polynomial", + "pchip", + "akima", + ], +) +def all_1d_no_arg_interpolation_methods(request): + return request.param @pytest.mark.parametrize("freq", ["2D", "1h"]) @pytest.mark.parametrize( - "_index_factory,_series_name,_index_start,_index_end", [DATE_RANGE, TIMEDELTA_RANGE] + "index", + [ + timedelta_range("1 day", "10 day", freq="D"), + date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D"), + ], ) -def test_asfreq(series_and_frame, freq, create_index): - obj = series_and_frame +def test_asfreq(frame_or_series, index, freq): + obj = frame_or_series(range(len(index)), index=index) + idx_range = date_range if isinstance(index, DatetimeIndex) else timedelta_range result = obj.resample(freq).asfreq() - new_index = create_index(obj.index[0], obj.index[-1], freq=freq) + new_index = idx_range(obj.index[0], obj.index[-1], freq=freq) expected = obj.reindex(new_index) tm.assert_almost_equal(result, expected) @pytest.mark.parametrize( - "_index_factory,_series_name,_index_start,_index_end", [DATE_RANGE, TIMEDELTA_RANGE] + "index", + [ + timedelta_range("1 day", "10 day", freq="D"), + date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D"), + ], ) -def test_asfreq_fill_value(series, create_index): +def test_asfreq_fill_value(index): # test for fill value during resampling, issue 3715 - ser = series + ser = Series(range(len(index)), index=index, name="a") + idx_range = date_range if isinstance(index, DatetimeIndex) else timedelta_range result = ser.resample("1h").asfreq() - new_index = create_index(ser.index[0], ser.index[-1], freq="1h") + new_index = idx_range(ser.index[0], ser.index[-1], freq="1h") expected = ser.reindex(new_index) tm.assert_series_equal(result, expected) @@ -76,15 +88,22 @@ def test_asfreq_fill_value(series, create_index): frame = ser.astype("float").to_frame("value") frame.iloc[1] = None result = frame.resample("1h").asfreq(fill_value=4.0) - new_index = create_index(frame.index[0], frame.index[-1], freq="1h") + new_index = idx_range(frame.index[0], frame.index[-1], freq="1h") expected = frame.reindex(new_index, fill_value=4.0) tm.assert_frame_equal(result, expected) -@all_ts -def test_resample_interpolate(frame): +@pytest.mark.parametrize( + "index", + [ + timedelta_range("1 day", "10 day", freq="D"), + date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D"), + period_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D"), + ], +) +def test_resample_interpolate(index): # GH#12925 - df = frame + df = 
DataFrame(range(len(index)), index=index) warn = None if isinstance(df.index, PeriodIndex): warn = FutureWarning @@ -95,6 +114,56 @@ def test_resample_interpolate(frame): tm.assert_frame_equal(result, expected) +def test_resample_interpolate_regular_sampling_off_grid( + all_1d_no_arg_interpolation_methods, +): + pytest.importorskip("scipy") + # GH#21351 + index = date_range("2000-01-01 00:01:00", periods=5, freq="2h") + ser = Series(np.arange(5.0), index) + + method = all_1d_no_arg_interpolation_methods + result = ser.resample("1h").interpolate(method) + + if method == "linear": + values = np.repeat(np.arange(0.0, 4.0), 2) + np.tile([1 / 3, 2 / 3], 4) + elif method == "nearest": + values = np.repeat(np.arange(0.0, 5.0), 2)[1:-1] + elif method == "zero": + values = np.repeat(np.arange(0.0, 4.0), 2) + else: + values = 0.491667 + np.arange(0.0, 4.0, 0.5) + values = np.insert(values, 0, np.nan) + index = date_range("2000-01-01 00:00:00", periods=9, freq="1h") + expected = Series(values, index=index) + tm.assert_series_equal(result, expected) + + +def test_resample_interpolate_irregular_sampling(all_1d_no_arg_interpolation_methods): + pytest.importorskip("scipy") + # GH#21351 + ser = Series( + np.linspace(0.0, 1.0, 5), + index=DatetimeIndex( + [ + "2000-01-01 00:00:03", + "2000-01-01 00:00:22", + "2000-01-01 00:00:24", + "2000-01-01 00:00:31", + "2000-01-01 00:00:39", + ] + ), + ) + + # Resample to 5 second sampling and interpolate with the given method + ser_resampled = ser.resample("5s").interpolate(all_1d_no_arg_interpolation_methods) + + # Check that none of the resampled values are NaN, except the first one + # which lies 3 seconds before the first actual data point + assert np.isnan(ser_resampled.iloc[0]) + assert not ser_resampled.iloc[1:].isna().any() + + def test_raises_on_non_datetimelike_index(): # this is a non datetimelike index xp = DataFrame() @@ -106,12 +175,19 @@ def test_raises_on_non_datetimelike_index(): xp.resample("YE") -@all_ts +@pytest.mark.parametrize( + "index", + [ + PeriodIndex([], freq="D", name="a"), + DatetimeIndex([], name="a"), + TimedeltaIndex([], name="a"), + ], +) @pytest.mark.parametrize("freq", ["ME", "D", "h"]) -def test_resample_empty_series(freq, empty_series_dti, resample_method): +def test_resample_empty_series(freq, index, resample_method): # GH12771 & GH12868 - ser = empty_series_dti + ser = Series(index=index, dtype=float) if freq == "ME" and isinstance(ser.index, TimedeltaIndex): msg = ( "Resampling on a TimedeltaIndex requires fixed-duration `freq`, " @@ -134,7 +210,7 @@ def test_resample_empty_series(freq, empty_series_dti, resample_method): if resample_method == "ohlc": expected = DataFrame( - [], index=ser.index[:0].copy(), columns=["open", "high", "low", "close"] + [], index=ser.index[:0], columns=["open", "high", "low", "close"] ) expected.index = _asfreq_compat(ser.index, freq) tm.assert_frame_equal(result, expected, check_dtype=False) @@ -147,7 +223,31 @@ def test_resample_empty_series(freq, empty_series_dti, resample_method): assert result.index.freq == expected.index.freq -@all_ts +@pytest.mark.parametrize("min_count", [0, 1]) +def test_resample_empty_sum_string(string_dtype_no_object, min_count): + # https://github.com/pandas-dev/pandas/issues/60229 + dtype = string_dtype_no_object + ser = Series( + pd.NA, + index=DatetimeIndex( + [ + "2000-01-01 00:00:00", + "2000-01-01 00:00:10", + "2000-01-01 00:00:20", + "2000-01-01 00:00:30", + ] + ), + dtype=dtype, + ) + rs = ser.resample("20s") + result = rs.sum(min_count=min_count) + + value = "" 
if min_count == 0 else pd.NA + index = date_range(start="2000-01-01", freq="20s", periods=2, unit="s") + expected = Series(value, index=index, dtype=dtype) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( "freq", [ @@ -156,11 +256,10 @@ def test_resample_empty_series(freq, empty_series_dti, resample_method): "h", ], ) -def test_resample_nat_index_series(freq, series, resample_method): +def test_resample_nat_index_series(freq, resample_method): # GH39227 - ser = series.copy() - ser.index = PeriodIndex([NaT] * len(ser), freq=freq) + ser = Series(range(5), index=PeriodIndex([NaT] * 5, freq=freq)) msg = "Resampling with a PeriodIndex is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): @@ -169,7 +268,7 @@ def test_resample_nat_index_series(freq, series, resample_method): if resample_method == "ohlc": expected = DataFrame( - [], index=ser.index[:0].copy(), columns=["open", "high", "low", "close"] + [], index=ser.index[:0], columns=["open", "high", "low", "close"] ) tm.assert_frame_equal(result, expected, check_dtype=False) else: @@ -179,12 +278,19 @@ def test_resample_nat_index_series(freq, series, resample_method): assert result.index.freq == expected.index.freq -@all_ts +@pytest.mark.parametrize( + "index", + [ + PeriodIndex([], freq="D", name="a"), + DatetimeIndex([], name="a"), + TimedeltaIndex([], name="a"), + ], +) @pytest.mark.parametrize("freq", ["ME", "D", "h"]) @pytest.mark.parametrize("resample_method", ["count", "size"]) -def test_resample_count_empty_series(freq, empty_series_dti, resample_method): +def test_resample_count_empty_series(freq, index, resample_method): # GH28427 - ser = empty_series_dti + ser = Series(index=index) if freq == "ME" and isinstance(ser.index, TimedeltaIndex): msg = ( "Resampling on a TimedeltaIndex requires fixed-duration `freq`, " @@ -213,11 +319,13 @@ def test_resample_count_empty_series(freq, empty_series_dti, resample_method): tm.assert_series_equal(result, expected) -@all_ts +@pytest.mark.parametrize( + "index", [DatetimeIndex([]), TimedeltaIndex([]), PeriodIndex([], freq="D")] +) @pytest.mark.parametrize("freq", ["ME", "D", "h"]) -def test_resample_empty_dataframe(empty_frame_dti, freq, resample_method): +def test_resample_empty_dataframe(index, freq, resample_method): # GH13212 - df = empty_frame_dti + df = DataFrame(index=index) # count retains dimensions too if freq == "ME" and isinstance(df.index, TimedeltaIndex): msg = ( @@ -241,9 +349,7 @@ def test_resample_empty_dataframe(empty_frame_dti, freq, resample_method): if resample_method == "ohlc": # TODO: no tests with len(df.columns) > 0 mi = MultiIndex.from_product([df.columns, ["open", "high", "low", "close"]]) - expected = DataFrame( - [], index=df.index[:0].copy(), columns=mi, dtype=np.float64 - ) + expected = DataFrame([], index=df.index[:0], columns=mi, dtype=np.float64) expected.index = _asfreq_compat(df.index, freq) elif resample_method != "size": @@ -261,12 +367,13 @@ def test_resample_empty_dataframe(empty_frame_dti, freq, resample_method): # test size for GH13212 (currently stays as df) -@all_ts +@pytest.mark.parametrize( + "index", [DatetimeIndex([]), TimedeltaIndex([]), PeriodIndex([], freq="D")] +) @pytest.mark.parametrize("freq", ["ME", "D", "h"]) -def test_resample_count_empty_dataframe(freq, empty_frame_dti): +def test_resample_count_empty_dataframe(freq, index): # GH28427 - - empty_frame_dti["a"] = [] + empty_frame_dti = DataFrame(index=index, columns=Index(["a"], dtype=object)) if freq == "ME" and isinstance(empty_frame_dti.index, 
TimedeltaIndex): msg = ( @@ -295,12 +402,14 @@ def test_resample_count_empty_dataframe(freq, empty_frame_dti): tm.assert_frame_equal(result, expected) -@all_ts +@pytest.mark.parametrize( + "index", [DatetimeIndex([]), TimedeltaIndex([]), PeriodIndex([], freq="D")] +) @pytest.mark.parametrize("freq", ["ME", "D", "h"]) -def test_resample_size_empty_dataframe(freq, empty_frame_dti): +def test_resample_size_empty_dataframe(freq, index): # GH28427 - empty_frame_dti["a"] = [] + empty_frame_dti = DataFrame(index=index, columns=Index(["a"], dtype=object)) if freq == "ME" and isinstance(empty_frame_dti.index, TimedeltaIndex): msg = ( @@ -329,6 +438,24 @@ def test_resample_size_empty_dataframe(freq, empty_frame_dti): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("index", [DatetimeIndex([]), TimedeltaIndex([])]) +@pytest.mark.parametrize("freq", ["D", "h"]) +@pytest.mark.parametrize( + "method", ["ffill", "bfill", "nearest", "asfreq", "interpolate", "mean"] +) +def test_resample_apply_empty_dataframe(index, freq, method): + # GH#55572 + empty_frame_dti = DataFrame(index=index) + + rs = empty_frame_dti.resample(freq) + result = rs.apply(getattr(rs, method)) + + expected_index = _asfreq_compat(empty_frame_dti.index, freq) + expected = DataFrame([], index=expected_index) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "index", [ @@ -352,7 +479,7 @@ def test_resample_empty_dtypes(index, dtype, resample_method): empty_series_dti = Series([], index, dtype) with tm.assert_produces_warning(warn, match=msg): - rs = empty_series_dti.resample("d", group_keys=False) + rs = empty_series_dti.resample("D", group_keys=False) try: getattr(rs, resample_method)() except DataError: @@ -361,27 +488,34 @@ def test_resample_empty_dtypes(index, dtype, resample_method): pass -@all_ts +@pytest.mark.parametrize( + "index", + [ + PeriodIndex([], freq="D", name="a"), + DatetimeIndex([], name="a"), + TimedeltaIndex([], name="a"), + ], +) @pytest.mark.parametrize("freq", ["ME", "D", "h"]) -def test_apply_to_empty_series(empty_series_dti, freq): +def test_apply_to_empty_series(index, freq): # GH 14313 - ser = empty_series_dti + ser = Series(index=index) - if freq == "ME" and isinstance(empty_series_dti.index, TimedeltaIndex): + if freq == "ME" and isinstance(ser.index, TimedeltaIndex): msg = ( "Resampling on a TimedeltaIndex requires fixed-duration `freq`, " "e.g. 
'24h' or '3D', not " ) with pytest.raises(ValueError, match=msg): - empty_series_dti.resample(freq) + ser.resample(freq) return - elif freq == "ME" and isinstance(empty_series_dti.index, PeriodIndex): + elif freq == "ME" and isinstance(ser.index, PeriodIndex): # index is PeriodIndex, so convert to corresponding Period freq freq = "M" msg = "Resampling with a PeriodIndex" warn = None - if isinstance(empty_series_dti.index, PeriodIndex): + if isinstance(ser.index, PeriodIndex): warn = FutureWarning with tm.assert_produces_warning(warn, match=msg): @@ -394,9 +528,17 @@ def test_apply_to_empty_series(empty_series_dti, freq): tm.assert_series_equal(result, expected, check_dtype=False) -@all_ts -def test_resampler_is_iterable(series): +@pytest.mark.parametrize( + "index", + [ + timedelta_range("1 day", "10 day", freq="D"), + date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D"), + period_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D"), + ], +) +def test_resampler_is_iterable(index): # GH 15314 + series = Series(range(len(index)), index=index) freq = "h" tg = Grouper(freq=freq, convention="start") msg = "Resampling with a PeriodIndex" @@ -414,18 +556,52 @@ def test_resampler_is_iterable(series): tm.assert_series_equal(rv, gv) -@all_ts -def test_resample_quantile(series): +@pytest.mark.parametrize( + "index", + [ + timedelta_range("1 day", "10 day", freq="D"), + date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D"), + period_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D"), + ], +) +def test_resample_quantile(index): # GH 15023 - ser = series + ser = Series(range(len(index)), index=index) q = 0.75 freq = "h" msg = "Resampling with a PeriodIndex" warn = None - if isinstance(series.index, PeriodIndex): + if isinstance(ser.index, PeriodIndex): warn = FutureWarning with tm.assert_produces_warning(warn, match=msg): result = ser.resample(freq).quantile(q) expected = ser.resample(freq).agg(lambda x: x.quantile(q)).rename(ser.name) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("how", ["first", "last"]) +def test_first_last_skipna(any_real_nullable_dtype, skipna, how): + # GH#57019 + if is_extension_array_dtype(any_real_nullable_dtype): + na_value = Series(dtype=any_real_nullable_dtype).dtype.na_value + else: + na_value = np.nan + df = DataFrame( + { + "a": [2, 1, 1, 2], + "b": [na_value, 3.0, na_value, 4.0], + "c": [na_value, 3.0, na_value, 4.0], + }, + index=date_range("2020-01-01", periods=4, freq="D"), + dtype=any_real_nullable_dtype, + ) + rs = df.resample("ME") + method = getattr(rs, how) + result = method(skipna=skipna) + + ts = pd.to_datetime("2020-01-31").as_unit("ns") + gb = df.groupby(df.shape[0] * [ts]) + expected = getattr(gb, how)(skipna=skipna) + expected.index.freq = "ME" + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 80583f5d3c5f2..f871c0bf0218c 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1,9 +1,9 @@ from datetime import datetime from functools import partial +import zoneinfo import numpy as np import pytest -import pytz from pandas._libs import lib from pandas._typing import DatetimeNaTType @@ -36,26 +36,6 @@ from pandas.tseries.offsets import Minute -@pytest.fixture() -def _index_factory(): - return date_range - - -@pytest.fixture -def _index_freq(): - return "Min" - - -@pytest.fixture -def _static_values(index): - return 
np.random.default_rng(2).random(len(index)) - - -@pytest.fixture(params=["s", "ms", "us", "ns"]) -def unit(request): - return request.param - - @pytest.fixture def simple_date_range_series(): """ @@ -69,7 +49,8 @@ def _simple_date_range_series(start, end, freq="D"): return _simple_date_range_series -def test_custom_grouper(index, unit): +def test_custom_grouper(unit): + index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="Min") dti = index.as_unit(unit) s = Series(np.array([1] * len(dti)), index=dti, dtype="int64") @@ -105,7 +86,8 @@ def test_custom_grouper(index, unit): tm.assert_series_equal(result, expect) -def test_custom_grouper_df(index, unit): +def test_custom_grouper_df(unit): + index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D") b = Grouper(freq=Minute(5), closed="right", label="right") dti = index.as_unit(unit) df = DataFrame( @@ -117,10 +99,6 @@ def test_custom_grouper_df(index, unit): assert len(r.index) == 2593 -@pytest.mark.parametrize( - "_index_start,_index_end,_index_name", - [("1/1/2000 00:00:00", "1/1/2000 00:13:00", "index")], -) @pytest.mark.parametrize( "closed, expected", [ @@ -142,8 +120,10 @@ def test_custom_grouper_df(index, unit): ), ], ) -def test_resample_basic(series, closed, expected, unit): - s = series +def test_resample_basic(closed, expected, unit): + index = date_range("1/1/2000 00:00:00", "1/1/2000 00:13:00", freq="Min") + s = Series(range(len(index)), index=index) + s.index.name = "index" s.index = s.index.as_unit(unit) expected = expected(s) expected.index = expected.index.as_unit(unit) @@ -175,8 +155,10 @@ def test_resample_integerarray(unit): tm.assert_series_equal(result, expected) -def test_resample_basic_grouper(series, unit): - s = series +def test_resample_basic_grouper(unit): + index = date_range("1/1/2000 00:00:00", "1/1/2000 00:13:00", freq="Min") + s = Series(range(len(index)), index=index) + s.index.name = "index" s.index = s.index.as_unit(unit) result = s.resample("5Min").last() grouper = Grouper(freq=Minute(5), closed="left", label="left") @@ -187,32 +169,28 @@ def test_resample_basic_grouper(series, unit): @pytest.mark.filterwarnings( "ignore:The 'convention' keyword in Series.resample:FutureWarning" ) -@pytest.mark.parametrize( - "_index_start,_index_end,_index_name", - [("1/1/2000 00:00:00", "1/1/2000 00:13:00", "index")], -) @pytest.mark.parametrize( "keyword,value", [("label", "righttt"), ("closed", "righttt"), ("convention", "starttt")], ) -def test_resample_string_kwargs(series, keyword, value, unit): +def test_resample_string_kwargs(keyword, value, unit): # see gh-19303 # Check that wrong keyword argument strings raise an error + index = date_range("1/1/2000 00:00:00", "1/1/2000 00:13:00", freq="Min") + series = Series(range(len(index)), index=index) + series.index.name = "index" series.index = series.index.as_unit(unit) msg = f"Unsupported value {value} for `{keyword}`" with pytest.raises(ValueError, match=msg): series.resample("5min", **({keyword: value})) -@pytest.mark.parametrize( - "_index_start,_index_end,_index_name", - [("1/1/2000 00:00:00", "1/1/2000 00:13:00", "index")], -) -def test_resample_how(series, downsample_method, unit): +def test_resample_how(downsample_method, unit): if downsample_method == "ohlc": pytest.skip("covered by test_resample_how_ohlc") - - s = series + index = date_range("1/1/2000 00:00:00", "1/1/2000 00:13:00", freq="Min") + s = Series(range(len(index)), index=index) + s.index.name = "index" s.index = s.index.as_unit(unit) grouplist = np.ones_like(s) 
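+ # give the first observation its own group label; these labels are used to build the expected result via groupby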
grouplist[0] = 0 @@ -230,12 +208,10 @@ def test_resample_how(series, downsample_method, unit): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize( - "_index_start,_index_end,_index_name", - [("1/1/2000 00:00:00", "1/1/2000 00:13:00", "index")], -) -def test_resample_how_ohlc(series, unit): - s = series +def test_resample_how_ohlc(unit): + index = date_range("1/1/2000 00:00:00", "1/1/2000 00:13:00", freq="Min") + s = Series(range(len(index)), index=index) + s.index.name = "index" s.index = s.index.as_unit(unit) grouplist = np.ones_like(s) grouplist[0] = 0 @@ -263,7 +239,9 @@ def _ohlc(group): def test_resample_how_callables(unit): # GH#7929 data = np.arange(5, dtype=np.int64) - ind = date_range(start="2014-01-01", periods=len(data), freq="d").as_unit(unit) + msg = "'d' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + ind = date_range(start="2014-01-01", periods=len(data), freq="d").as_unit(unit) df = DataFrame({"A": data, "B": data}, index=ind) def fn(x, a=1): @@ -358,7 +336,9 @@ def test_resample_basic_from_daily(unit): s = Series(np.random.default_rng(2).random(len(dti)), dti) # to weekly - result = s.resample("w-sun").last() + msg = "'w-sun' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s.resample("w-sun").last() assert len(result) == 3 assert (result.index.dayofweek == [6, 6, 6]).all() @@ -465,19 +445,6 @@ def test_resample_frame_basic_M_A(freq, unit): tm.assert_series_equal(result["A"], df["A"].resample(freq).mean()) -@pytest.mark.parametrize("freq", ["W-WED", "ME"]) -def test_resample_frame_basic_kind(freq, unit): - df = DataFrame( - np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), - index=date_range("2000-01-01", periods=10, freq="B"), - ) - df.index = df.index.as_unit(unit) - msg = "The 'kind' keyword in DataFrame.resample is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - df.resample(freq, kind="period").mean() - - def test_resample_upsample(unit): # from daily dti = date_range( @@ -558,8 +525,10 @@ def test_nearest_upsample_with_limit(tz_aware_fixture, freq, rule, unit): tm.assert_series_equal(result, expected) -def test_resample_ohlc(series, unit): - s = series +def test_resample_ohlc(unit): + index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="Min") + s = Series(range(len(index)), index=index) + s.index.name = "index" s.index = s.index.as_unit(unit) grouper = Grouper(freq=Minute(5)) @@ -661,26 +630,6 @@ def test_resample_ohlc_dataframe(unit): # df.columns = ['PRICE', 'PRICE'] -def test_resample_dup_index(): - # GH 4812 - # dup columns with resample raising - df = DataFrame( - np.random.default_rng(2).standard_normal((4, 12)), - index=[2000, 2000, 2000, 2000], - columns=[Period(year=2000, month=i + 1, freq="M") for i in range(12)], - ) - df.iloc[3, :] = np.nan - warning_msg = "DataFrame.resample with axis=1 is deprecated." 
- with tm.assert_produces_warning(FutureWarning, match=warning_msg): - result = df.resample("QE", axis=1).mean() - - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = df.groupby(lambda x: int((x.month - 1) / 3), axis=1).mean() - expected.columns = [Period(year=2000, quarter=i + 1, freq="Q") for i in range(4)] - tm.assert_frame_equal(result, expected) - - def test_resample_reresample(unit): dti = date_range( start=datetime(2005, 1, 1), end=datetime(2005, 1, 10), freq="D" @@ -707,9 +656,7 @@ def test_resample_timestamp_to_period( ts = simple_date_range_series("1/1/1990", "1/1/2000") ts.index = ts.index.as_unit(unit) - msg = "The 'kind' keyword in Series.resample is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = ts.resample(freq, kind="period").mean() + result = ts.resample(freq).mean().to_period() expected = ts.resample(freq).mean() expected.index = period_range(**expected_kwargs) tm.assert_series_equal(result, expected) @@ -759,21 +706,6 @@ def test_asfreq_non_unique(unit): ts.asfreq("B") -def test_resample_axis1(unit): - rng = date_range("1/1/2000", "2/29/2000").as_unit(unit) - df = DataFrame( - np.random.default_rng(2).standard_normal((3, len(rng))), - columns=rng, - index=["a", "b", "c"], - ) - - warning_msg = "DataFrame.resample with axis=1 is deprecated." - with tm.assert_produces_warning(FutureWarning, match=warning_msg): - result = df.resample("ME", axis=1).mean() - expected = df.T.resample("ME").mean().T - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("freq", ["min", "5min", "15min", "30min", "4h", "12h"]) def test_resample_anchored_ticks(freq, unit): # If a fixed delta (5 minute, 4 hour) evenly divides a day, we should @@ -1026,6 +958,19 @@ def _create_series(values, timestamps, freq="D"): tm.assert_series_equal(result, expected) +def test_resample_dst_midnight_last_nonexistent(): + # GH 58380 + ts = Series( + 1, + date_range("2024-04-19", "2024-04-20", tz="Africa/Cairo", freq="15min"), + ) + + expected = Series([len(ts)], index=DatetimeIndex([ts.index[0]], freq="7D")) + + result = ts.resample("7D").sum() + tm.assert_series_equal(result, expected) + + def test_resample_daily_anchored(unit): rng = date_range("1/1/2000 0:00:00", periods=10000, freq="min").as_unit(unit) ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) @@ -1042,9 +987,7 @@ def test_resample_to_period_monthly_buglet(unit): rng = date_range("1/1/2000", "12/31/2000").as_unit(unit) ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) - msg = "The 'kind' keyword in Series.resample is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = ts.resample("ME", kind="period").mean() + result = ts.resample("ME").mean().to_period() exp_index = period_range("Jan-2000", "Dec-2000", freq="M") tm.assert_index_equal(result.index, exp_index) @@ -1079,12 +1022,8 @@ def test_resample_segfault(unit): all_wins_and_wagers, columns=("ID", "timestamp", "A", "B") ).set_index("timestamp") df.index = df.index.as_unit(unit) - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("ID").resample("5min").sum() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = df.groupby("ID").apply(lambda x: x.resample("5min").sum()) + result = 
df.groupby("ID").resample("5min").sum() + expected = df.groupby("ID").apply(lambda x: x.resample("5min").sum()) tm.assert_frame_equal(result, expected) @@ -1103,9 +1042,7 @@ def test_resample_dtype_preservation(unit): result = df.resample("1D").ffill() assert result.val.dtype == np.int32 - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("group").resample("1D").ffill() + result = df.groupby("group").resample("1D").ffill() assert result.val.dtype == np.int32 @@ -1166,18 +1103,15 @@ def test_resample_anchored_intraday(unit): df = DataFrame(rng.month, index=rng) result = df.resample("ME").mean() - msg = "The 'kind' keyword in DataFrame.resample is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = df.resample("ME", kind="period").mean().to_timestamp(how="end") + expected = df.resample("ME").mean().to_period() + expected = expected.to_timestamp(how="end") expected.index += Timedelta(1, "ns") - Timedelta(1, "D") expected.index = expected.index.as_unit(unit)._with_freq("infer") assert expected.index.freq == "ME" tm.assert_frame_equal(result, expected) result = df.resample("ME", closed="left").mean() - msg = "The 'kind' keyword in DataFrame.resample is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - exp = df.shift(1, freq="D").resample("ME", kind="period").mean() + exp = df.shift(1, freq="D").resample("ME").mean().to_period() exp = exp.to_timestamp(how="end") exp.index = exp.index + Timedelta(1, "ns") - Timedelta(1, "D") @@ -1191,9 +1125,8 @@ def test_resample_anchored_intraday2(unit): df = DataFrame(rng.month, index=rng) result = df.resample("QE").mean() - msg = "The 'kind' keyword in DataFrame.resample is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = df.resample("QE", kind="period").mean().to_timestamp(how="end") + expected = df.resample("QE").mean().to_period() + expected = expected.to_timestamp(how="end") expected.index += Timedelta(1, "ns") - Timedelta(1, "D") expected.index._data.freq = "QE" expected.index._freq = lib.no_default @@ -1201,11 +1134,8 @@ def test_resample_anchored_intraday2(unit): tm.assert_frame_equal(result, expected) result = df.resample("QE", closed="left").mean() - msg = "The 'kind' keyword in DataFrame.resample is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = ( - df.shift(1, freq="D").resample("QE", kind="period", closed="left").mean() - ) + expected = df.shift(1, freq="D").resample("QE").mean() + expected = expected.to_period() expected = expected.to_timestamp(how="end") expected.index += Timedelta(1, "ns") - Timedelta(1, "D") expected.index._data.freq = "QE" @@ -1262,9 +1192,7 @@ def test_corner_cases_date(simple_date_range_series, unit): # resample to periods ts = simple_date_range_series("2000-04-28", "2000-04-30 11:00", freq="h") ts.index = ts.index.as_unit(unit) - msg = "The 'kind' keyword in Series.resample is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = ts.resample("ME", kind="period").mean() + result = ts.resample("ME").mean().to_period() assert len(result) == 1 assert result.index[0] == Period("2000-04", freq="M") @@ -1273,7 +1201,9 @@ def test_anchored_lowercase_buglet(unit): dates = date_range("4/16/2012 20:00", periods=50000, freq="s").as_unit(unit) ts = Series(np.random.default_rng(2).standard_normal(len(dates)), index=dates) # it works! 
- ts.resample("d").mean() + msg = "'d' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + ts.resample("d").mean() def test_upsample_apply_functions(unit): @@ -1383,7 +1313,8 @@ def test_resample_consistency(unit): s10 = s.reindex(index=i10, method="bfill") s10_2 = s.reindex(index=i10, method="bfill", limit=2) - rl = s.reindex_like(s10, method="bfill", limit=2) + with tm.assert_produces_warning(FutureWarning): + rl = s.reindex_like(s10, method="bfill", limit=2) r10_2 = s.resample("10Min").bfill(limit=2) r10 = s.resample("10Min").bfill() @@ -1521,12 +1452,12 @@ def test_resample_nunique_with_date_gap(func, unit): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("n", [10000, 100000]) -@pytest.mark.parametrize("k", [10, 100, 1000]) -def test_resample_group_info(n, k, unit): +def test_resample_group_info(unit): # GH10914 # use a fixed seed to always have the same uniques + n = 100 + k = 10 prng = np.random.default_rng(2) dr = date_range(start="2015-08-27", periods=n // 10, freq="min").as_unit(unit) @@ -1613,9 +1544,9 @@ def test_groupby_with_dst_time_change(unit): ) df = DataFrame([1, 2], index=index) - result = df.groupby(Grouper(freq="1d")).last() + result = df.groupby(Grouper(freq="1D")).last() expected_index_values = date_range( - "2016-11-02", "2016-11-24", freq="d", tz="America/Chicago" + "2016-11-02", "2016-11-24", freq="D", tz="America/Chicago" ).as_unit(unit) index = DatetimeIndex(expected_index_values) @@ -1737,13 +1668,13 @@ def test_resample_dst_anchor2(unit): def test_downsample_across_dst(unit): # GH 8531 - tz = pytz.timezone("Europe/Berlin") + tz = zoneinfo.ZoneInfo("Europe/Berlin") dt = datetime(2014, 10, 26) - dates = date_range(tz.localize(dt), periods=4, freq="2h").as_unit(unit) + dates = date_range(dt.astimezone(tz), periods=4, freq="2h").as_unit(unit) result = Series(5, index=dates).resample("h").mean() expected = Series( [5.0, np.nan] * 3 + [5.0], - index=date_range(tz.localize(dt), periods=7, freq="h").as_unit(unit), + index=date_range(dt.astimezone(tz), periods=7, freq="h").as_unit(unit), ) tm.assert_series_equal(result, expected) @@ -1854,8 +1785,12 @@ def test_resample_datetime_values(unit): tm.assert_series_equal(res, exp) -def test_resample_apply_with_additional_args(series, unit): +def test_resample_apply_with_additional_args(unit): # GH 14615 + index = date_range("1/1/2000 00:00:00", "1/1/2000 00:13:00", freq="Min") + series = Series(range(len(index)), index=index) + series.index.name = "index" + def f(data, add_arg): return np.mean(data) * add_arg @@ -1880,12 +1815,8 @@ def f(data, add_arg): multiplier = 10 df = DataFrame({"A": 1, "B": 2}, index=date_range("2017", periods=10)) - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("A").resample("D").agg(f, multiplier).astype(float) - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = df.groupby("A").resample("D").mean().multiply(multiplier) + result = df.groupby("A").resample("D").agg(f, multiplier).astype(float) + expected = df.groupby("A").resample("D").mean().multiply(multiplier) tm.assert_frame_equal(result, expected) @@ -1953,9 +1884,7 @@ def test_resample_apply_product(duplicates, unit): if duplicates: df.columns = ["A", "A"] - msg = "using DatetimeIndexResampler.prod" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = 
df.resample("QE").apply(np.prod) + result = df.resample("QE").apply(np.prod) expected = DataFrame( np.array([[0, 24], [60, 210], [336, 720], [990, 1716]], dtype=np.int64), index=DatetimeIndex( @@ -2094,47 +2023,42 @@ def test_resample_empty_series_with_tz(): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize( - "freq, freq_depr", - [ - ("2ME", "2M"), - ("2QE", "2Q"), - ("2QE-SEP", "2Q-SEP"), - ("1YE", "1Y"), - ("2YE-MAR", "2Y-MAR"), - ("1YE", "1A"), - ("2YE-MAR", "2A-MAR"), - ], -) -def test_resample_M_Q_Y_A_deprecated(freq, freq_depr): - # GH#9586 - depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed " - f"in a future version, please use '{freq[1:]}' instead." - - s = Series(range(10), index=date_range("20130101", freq="d", periods=10)) - expected = s.resample(freq).mean() - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - result = s.resample(freq_depr).mean() - tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("freq", ["2M", "2m", "2Q", "2Q-SEP", "2q-sep", "1Y", "2Y-MAR"]) +def test_resample_M_Q_Y_raises(freq): + msg = f"Invalid frequency: {freq}" + + s = Series(range(10), index=date_range("20130101", freq="D", periods=10)) + with pytest.raises(ValueError, match=msg): + s.resample(freq).mean() + + +@pytest.mark.parametrize("freq", ["2BM", "1bm", "1BQ", "2BQ-MAR", "2bq=-mar"]) +def test_resample_BM_BQ_raises(freq): + msg = f"Invalid frequency: {freq}" + + s = Series(range(10), index=date_range("20130101", freq="D", periods=10)) + with pytest.raises(ValueError, match=msg): + s.resample(freq).mean() @pytest.mark.parametrize( - "freq, freq_depr", + "freq,freq_depr,data", [ - ("2BME", "2BM"), - ("2BQE", "2BQ"), - ("2BQE-MAR", "2BQ-MAR"), + ("1W-SUN", "1w-sun", ["2013-01-06"]), + ("1D", "1d", ["2013-01-01"]), + ("1B", "1b", ["2013-01-01"]), + ("1C", "1c", ["2013-01-01"]), ], ) -def test_resample_BM_BQ_deprecated(freq, freq_depr): - # GH#52064 - depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed " - f"in a future version, please use '{freq[1:]}' instead." - - s = Series(range(10), index=date_range("20130101", freq="d", periods=10)) - expected = s.resample(freq).mean() - with tm.assert_produces_warning(FutureWarning, match=depr_msg): +def test_resample_depr_lowercase_frequency(freq, freq_depr, data): + msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a future version." 
+ + s = Series(range(5), index=date_range("20130101", freq="h", periods=5)) + with tm.assert_produces_warning(FutureWarning, match=msg): result = s.resample(freq_depr).mean() + + exp_dti = DatetimeIndex(data=data, dtype="datetime64[ns]", freq=freq) + expected = Series(2.0, index=exp_dti) tm.assert_series_equal(result, expected) @@ -2229,3 +2153,22 @@ def test_arrow_timestamp_resample(tz): expected = Series(np.arange(5, dtype=np.float64), index=idx) result = expected.resample("1D").mean() tm.assert_series_equal(result, expected) + + +@td.skip_if_no("pyarrow") +def test_arrow_timestamp_resample_keep_index_name(): + # https://github.com/pandas-dev/pandas/issues/61222 + idx = Series(date_range("2020-01-01", periods=5), dtype="timestamp[ns][pyarrow]") + expected = Series(np.arange(5, dtype=np.float64), index=idx) + expected.index.name = "index_name" + result = expected.resample("1D").mean() + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("freq", ["1A", "2A-MAR"]) +def test_resample_A_raises(freq): + msg = f"Invalid frequency: {freq[1:]}" + + s = Series(range(10), index=date_range("20130101", freq="D", periods=10)) + with pytest.raises(ValueError, match=msg): + s.resample(freq).mean() diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index eb80f56dd7d4b..e17529dfab00c 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -1,10 +1,14 @@ -from datetime import datetime +from datetime import ( + datetime, + timezone, +) +import re import warnings +import zoneinfo import dateutil import numpy as np import pytest -import pytz from pandas._libs.tslibs.ccalendar import ( DAYS, @@ -35,16 +39,6 @@ ) -@pytest.fixture() -def _index_factory(): - return period_range - - -@pytest.fixture -def _series_name(): - return "pi" - - @pytest.fixture def simple_period_range_series(): """ @@ -68,37 +62,38 @@ def _simple_period_range_series(start, end, freq="D"): class TestPeriodIndex: @pytest.mark.parametrize("freq", ["2D", "1h", "2h"]) - @pytest.mark.parametrize("kind", ["period", None, "timestamp"]) - def test_asfreq(self, series_and_frame, freq, kind): + def test_asfreq(self, frame_or_series, freq): # GH 12884, 15944 - # make sure .asfreq() returns PeriodIndex (except kind='timestamp') - - obj = series_and_frame - if kind == "timestamp": - expected = obj.to_timestamp().resample(freq).asfreq() - else: - start = obj.index[0].to_timestamp(how="start") - end = (obj.index[-1] + obj.index.freq).to_timestamp(how="start") - new_index = date_range(start=start, end=end, freq=freq, inclusive="left") - expected = obj.to_timestamp().reindex(new_index).to_period(freq) - msg = "The 'kind' keyword in (Series|DataFrame).resample is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = obj.resample(freq, kind=kind).asfreq() + + obj = frame_or_series(range(5), index=period_range("2020-01-01", periods=5)) + + expected = obj.to_timestamp().resample(freq).asfreq() + result = obj.to_timestamp().resample(freq).asfreq() + tm.assert_almost_equal(result, expected) + + start = obj.index[0].to_timestamp(how="start") + end = (obj.index[-1] + obj.index.freq).to_timestamp(how="start") + new_index = date_range(start=start, end=end, freq=freq, inclusive="left") + expected = obj.to_timestamp().reindex(new_index).to_period(freq) + + result = obj.resample(freq).asfreq() tm.assert_almost_equal(result, expected) - def test_asfreq_fill_value(self, series): + result = 
obj.resample(freq).asfreq().to_timestamp().to_period() + tm.assert_almost_equal(result, expected) + + def test_asfreq_fill_value(self): # test for fill value during resampling, issue 3715 - s = series + index = period_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D") + s = Series(range(len(index)), index=index) new_index = date_range( s.index[0].to_timestamp(how="start"), (s.index[-1]).to_timestamp(how="start"), freq="1h", ) expected = s.to_timestamp().reindex(new_index, fill_value=4.0) - msg = "The 'kind' keyword in Series.resample is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = s.resample("1h", kind="timestamp").asfreq(fill_value=4.0) + result = s.to_timestamp().resample("1h").asfreq(fill_value=4.0) tm.assert_series_equal(result, expected) frame = s.to_frame("value") @@ -108,17 +103,15 @@ def test_asfreq_fill_value(self, series): freq="1h", ) expected = frame.to_timestamp().reindex(new_index, fill_value=3.0) - msg = "The 'kind' keyword in DataFrame.resample is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = frame.resample("1h", kind="timestamp").asfreq(fill_value=3.0) + result = frame.to_timestamp().resample("1h").asfreq(fill_value=3.0) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("freq", ["h", "12h", "2D", "W"]) - @pytest.mark.parametrize("kind", [None, "period", "timestamp"]) @pytest.mark.parametrize("kwargs", [{"on": "date"}, {"level": "d"}]) - def test_selection(self, index, freq, kind, kwargs): + def test_selection(self, freq, kwargs): # This is a bug, these should be implemented # GH 14008 + index = period_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D") rng = np.arange(len(index), dtype=np.int64) df = DataFrame( {"date": index, "a": rng}, @@ -129,10 +122,8 @@ def test_selection(self, index, freq, kind, kwargs): r"not currently supported, use \.set_index\(\.\.\.\) to " "explicitly set index" ) - depr_msg = "The 'kind' keyword in DataFrame.resample is deprecated" with pytest.raises(NotImplementedError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - df.resample(freq, kind=kind, **kwargs) + df.resample(freq, **kwargs) @pytest.mark.parametrize("month", MONTHS) @pytest.mark.parametrize("meth", ["ffill", "bfill"]) @@ -173,12 +164,12 @@ def test_basic_downsample(self, simple_period_range_series): ("Y-DEC", "<YearEnd: month=12>"), ("Q-MAR", "<QuarterEnd: startingMonth=3>"), ("M", "<MonthEnd>"), - ("w-thu", "<Week: weekday=3>"), + ("W-THU", "<Week: weekday=3>"), ], ) def test_not_subperiod(self, simple_period_range_series, rule, expected_error_msg): # These are incompatible period rules for resampling - ts = simple_period_range_series("1/1/1990", "6/30/1995", freq="w-wed") + ts = simple_period_range_series("1/1/1990", "6/30/1995", freq="W-WED") msg = ( "Frequency <Week: weekday=2> cannot be resampled to " f"{expected_error_msg}, as they are not sub or super periods" ) @@ -275,12 +266,9 @@ def test_resample_basic(self): name="idx", ) expected = Series([34.5, 79.5], index=index) - msg = "The 'kind' keyword in Series.resample is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = s.to_period().resample("min", kind="period").mean() + result = s.to_period().resample("min").mean() tm.assert_series_equal(result, expected) - with tm.assert_produces_warning(FutureWarning, match=msg): - result2 = s.resample("min", kind="period").mean() + result2 = s.resample("min").mean().to_period() tm.assert_series_equal(result2, expected) @pytest.mark.parametrize( @@ -319,7 +307,7 @@ def test_resample_incompat_freq(self): 
@pytest.mark.parametrize( "tz", [ - pytz.timezone("America/Los_Angeles"), + zoneinfo.ZoneInfo("America/Los_Angeles"), dateutil.tz.gettz("America/Los_Angeles"), ], ) @@ -327,17 +315,21 @@ def test_with_local_timezone(self, tz): # see gh-5430 local_timezone = tz - start = datetime(year=2013, month=11, day=1, hour=0, minute=0, tzinfo=pytz.utc) + start = datetime( + year=2013, month=11, day=1, hour=0, minute=0, tzinfo=timezone.utc + ) # 1 day later - end = datetime(year=2013, month=11, day=2, hour=0, minute=0, tzinfo=pytz.utc) + end = datetime( + year=2013, month=11, day=2, hour=0, minute=0, tzinfo=timezone.utc + ) index = date_range(start, end, freq="h", name="idx") series = Series(1, index=index) series = series.tz_convert(local_timezone) - msg = "The 'kind' keyword in Series.resample is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = series.resample("D", kind="period").mean() + msg = "Converting to PeriodArray/Index representation will drop timezone" + with tm.assert_produces_warning(UserWarning, match=msg): + result = series.resample("D").mean().to_period() # Create the expected series # Index is moved back a day with the timezone conversion from UTC to @@ -351,7 +343,7 @@ def test_with_local_timezone(self, tz): @pytest.mark.parametrize( "tz", [ - pytz.timezone("America/Los_Angeles"), + zoneinfo.ZoneInfo("America/Los_Angeles"), dateutil.tz.gettz("America/Los_Angeles"), ], ) @@ -368,8 +360,6 @@ def test_resample_with_tz(self, tz, unit): index=exp_dti, ) tm.assert_series_equal(result, expected) - # Especially assert that the timezone is LMT for pytz - assert result.index.tz == tz def test_resample_nonexistent_time_bin_edge(self): # GH 19375 @@ -439,10 +429,8 @@ def test_weekly_upsample(self, day, target, convention, simple_period_range_seri def test_resample_to_timestamps(self, simple_period_range_series): ts = simple_period_range_series("1/1/1990", "12/31/1995", freq="M") - msg = "The 'kind' keyword in Series.resample is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = ts.resample("Y-DEC", kind="timestamp").mean() - expected = ts.to_timestamp(how="start").resample("YE-DEC").mean() + result = ts.resample("Y-DEC").mean().to_timestamp() + expected = ts.resample("Y-DEC").mean().to_timestamp(how="start") tm.assert_series_equal(result, expected) @pytest.mark.parametrize("month", MONTHS) @@ -495,17 +483,17 @@ def test_cant_fill_missing_dups(self): with pytest.raises(InvalidIndexError, match=msg): s.resample("Y").ffill() - @pytest.mark.parametrize("freq", ["5min"]) - @pytest.mark.parametrize("kind", ["period", None, "timestamp"]) - def test_resample_5minute(self, freq, kind): + def test_resample_5minute(self): rng = period_range("1/1/2000", "1/5/2000", freq="min") ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) - expected = ts.to_timestamp().resample(freq).mean() - if kind != "timestamp": - expected = expected.to_period(freq) - msg = "The 'kind' keyword in Series.resample is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = ts.resample(freq, kind=kind).mean() + expected = ts.to_timestamp().resample("5min").mean() + result = ts.resample("5min").mean().to_timestamp() + tm.assert_series_equal(result, expected) + + expected = expected.to_period("5min") + result = ts.resample("5min").mean() + tm.assert_series_equal(result, expected) + result = ts.resample("5min").mean().to_timestamp().to_period() tm.assert_series_equal(result, expected) def 
test_upsample_daily_business_daily(self, simple_period_range_series): @@ -579,9 +567,9 @@ def test_resample_tz_localized2(self): tm.assert_series_equal(result, expected) # for good measure - msg = "The 'kind' keyword in Series.resample is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = s.resample("D", kind="period").mean() + msg = "Converting to PeriodArray/Index representation will drop timezone " + with tm.assert_produces_warning(UserWarning, match=msg): + result = s.resample("D").mean().to_period() ex_index = period_range("2001-09-20", periods=1, freq="D") expected = Series([1.5], index=ex_index) tm.assert_series_equal(result, expected) @@ -815,8 +803,7 @@ def test_evenly_divisible_with_no_extra_bins2(self): tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("freq, period_mult", [("h", 24), ("12h", 2)]) - @pytest.mark.parametrize("kind", [None, "period"]) - def test_upsampling_ohlc(self, freq, period_mult, kind): + def test_upsampling_ohlc(self, freq, period_mult): # GH 13083 pi = period_range(start="2000", freq="D", periods=10) s = Series(range(len(pi)), index=pi) @@ -826,9 +813,10 @@ def test_upsampling_ohlc(self, freq, period_mult, kind): # of the last original period, so extend accordingly: new_index = period_range(start="2000", freq=freq, periods=period_mult * len(pi)) expected = expected.reindex(new_index) - msg = "The 'kind' keyword in Series.resample is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = s.resample(freq, kind=kind).ohlc() + result = s.resample(freq).ohlc() + tm.assert_frame_equal(result, expected) + + result = s.resample(freq).ohlc().to_timestamp().to_period() tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( @@ -989,22 +977,38 @@ def test_sum_min_count(self): def test_resample_t_l_deprecated(self): # GH#52536 - msg_t = "'T' is deprecated and will be removed in a future version." - msg_l = "'L' is deprecated and will be removed in a future version." 
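+ # "T" (minute) and "L" (millisecond) were deprecated aliases; resampling with them now raises ValueError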
+ msg_t = "Invalid frequency: T" + msg_l = "Invalid frequency: L" - with tm.assert_produces_warning(FutureWarning, match=msg_l): - rng_l = period_range( + with pytest.raises(ValueError, match=msg_l): + period_range( "2020-01-01 00:00:00 00:00", "2020-01-01 00:00:00 00:01", freq="L" ) + rng_l = period_range( + "2020-01-01 00:00:00 00:00", "2020-01-01 00:00:00 00:01", freq="ms" + ) ser = Series(np.arange(len(rng_l)), index=rng_l) - rng = period_range( - "2020-01-01 00:00:00 00:00", "2020-01-01 00:00:00 00:01", freq="min" - ) - expected = Series([29999.5, 60000.0], index=rng) - with tm.assert_produces_warning(FutureWarning, match=msg_t): - result = ser.resample("T").mean() - tm.assert_series_equal(result, expected) + with pytest.raises(ValueError, match=msg_t): + ser.resample("T").mean() + + @pytest.mark.parametrize( + "freq, freq_depr, freq_depr_res", + [ + ("2Q", "2q", "2y"), + ("2M", "2m", "2q"), + ], + ) + def test_resample_lowercase_frequency_raises(self, freq, freq_depr, freq_depr_res): + msg = f"Invalid frequency: {freq_depr}" + with pytest.raises(ValueError, match=msg): + period_range("2020-01-01", "2020-08-01", freq=freq_depr) + + msg = f"Invalid frequency: {freq_depr_res}" + rng = period_range("2020-01-01", "2020-08-01", freq=freq) + ser = Series(np.arange(len(rng)), index=rng) + with pytest.raises(ValueError, match=msg): + ser.resample(freq_depr_res).mean() @pytest.mark.parametrize( "offset", @@ -1014,32 +1018,36 @@ def test_resample_t_l_deprecated(self): offsets.BusinessHour(2), ], ) - def test_asfreq_invalid_period_freq(self, offset, series_and_frame): - # GH#9586 - msg = f"Invalid offset: '{offset.base}' for converting time series " + def test_asfreq_invalid_period_offset(self, offset, frame_or_series): + # GH#55785 + msg = re.escape(f"{offset} is not supported as period frequency") - df = series_and_frame + obj = frame_or_series(range(5), index=period_range("2020-01-01", periods=5)) with pytest.raises(ValueError, match=msg): - df.asfreq(freq=offset) + obj.asfreq(freq=offset) @pytest.mark.parametrize( - "freq,freq_depr", + "freq", [ - ("2M", "2ME"), - ("2Q", "2QE"), - ("2Q-FEB", "2QE-FEB"), - ("2Y", "2YE"), - ("2Y-MAR", "2YE-MAR"), + ("2ME"), + ("2QE"), + ("2QE-FEB"), + ("2YE"), + ("2YE-MAR"), + ("2me"), + ("2qe"), + ("2ye-mar"), ], ) -def test_resample_frequency_ME_QE_YE_error_message(series_and_frame, freq, freq_depr): +def test_resample_frequency_ME_QE_YE_raises(frame_or_series, freq): # GH#9586 - msg = f"for Period, please use '{freq[1:]}' instead of '{freq_depr[1:]}'" + msg = f"{freq[1:]} is not supported as period frequency" - obj = series_and_frame + obj = frame_or_series(range(5), index=period_range("2020-01-01", periods=5)) + msg = f"Invalid frequency: {freq}" with pytest.raises(ValueError, match=msg): - obj.resample(freq_depr) + obj.resample(freq) def test_corner_cases_period(simple_period_range_series): @@ -1052,20 +1060,11 @@ def test_corner_cases_period(simple_period_range_series): assert len(result) == 0 -@pytest.mark.parametrize( - "freq_depr", - [ - "2BME", - "2CBME", - "2SME", - "2BQE-FEB", - "2BYE-MAR", - ], -) -def test_resample_frequency_invalid_freq(series_and_frame, freq_depr): +@pytest.mark.parametrize("freq", ["2BME", "2CBME", "2SME", "2BQE-FEB", "2BYE-MAR"]) +def test_resample_frequency_invalid_freq(frame_or_series, freq): # GH#9586 - msg = f"Invalid frequency: {freq_depr[1:]}" + msg = f"Invalid frequency: {freq}" - obj = series_and_frame + obj = frame_or_series(range(5), index=period_range("2020-01-01", periods=5)) with pytest.raises(ValueError, 
match=msg): - obj.resample(freq_depr) + obj.resample(freq) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 7e8779ab48b7e..da1774cf22587 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -5,7 +5,6 @@ import pytest from pandas._libs import lib -from pandas.errors import UnsupportedFunctionCall import pandas as pd from pandas import ( @@ -35,13 +34,13 @@ def test_frame(dti, _test_series): def test_str(_test_series): r = _test_series.resample("h") assert ( - "DatetimeIndexResampler [freq=<Hour>, axis=0, closed=left, " + "DatetimeIndexResampler [freq=<Hour>, closed=left, " "label=left, convention=start, origin=start_day]" in str(r) ) r = _test_series.resample("h", origin="2000-01-01") assert ( - "DatetimeIndexResampler [freq=<Hour>, axis=0, closed=left, " + "DatetimeIndexResampler [freq=<Hour>, closed=left, " "label=left, convention=start, origin=2000-01-01 00:00:00]" in str(r) ) @@ -77,9 +76,7 @@ def test_groupby_resample_api(): ) index = pd.MultiIndex.from_arrays([[1] * 8 + [2] * 8, i], names=["group", "date"]) expected = DataFrame({"val": [5] * 7 + [6] + [7] * 7 + [8]}, index=index) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("group").apply(lambda x: x.resample("1D").ffill())[["val"]] + result = df.groupby("group").apply(lambda x: x.resample("1D").ffill())[["val"]] tm.assert_frame_equal(result, expected) @@ -188,7 +185,7 @@ def test_api_compat_before_use(attr): getattr(rs, attr) -def tests_raises_on_nuisance(test_frame): +def tests_raises_on_nuisance(test_frame, using_infer_string): df = test_frame df["D"] = "foo" r = df.resample("h") @@ -198,6 +195,8 @@ def tests_raises_on_nuisance(test_frame): expected = r[["A", "B", "C"]].mean() msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): r.mean() result = r.mean(numeric_only=True) @@ -297,32 +296,6 @@ def test_transform_frame(on): tm.assert_frame_equal(result, expected) -def test_fillna(): - # need to upsample here - rng = date_range("1/1/2012", periods=10, freq="2s") - ts = Series(np.arange(len(rng), dtype="int64"), index=rng) - r = ts.resample("s") - - expected = r.ffill() - msg = "DatetimeIndexResampler.fillna is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = r.fillna(method="ffill") - tm.assert_series_equal(result, expected) - - expected = r.bfill() - with tm.assert_produces_warning(FutureWarning, match=msg): - result = r.fillna(method="bfill") - tm.assert_series_equal(result, expected) - - msg2 = ( - r"Invalid fill method\. Expecting pad \(ffill\), backfill " - r"\(bfill\) or nearest\. 
Got 0" - ) - with pytest.raises(ValueError, match=msg2): - with tm.assert_produces_warning(FutureWarning, match=msg): - r.fillna(0) - - @pytest.mark.parametrize( "func", [ @@ -355,7 +328,7 @@ def test_agg_consistency(): r = df.resample("3min") - msg = r"Column\(s\) \['r1', 'r2'\] do not exist" + msg = r"Label\(s\) \['r1', 'r2'\] do not exist" with pytest.raises(KeyError, match=msg): r.agg({"r1": "mean", "r2": "sum"}) @@ -370,7 +343,7 @@ def test_agg_consistency_int_str_column_mix(): r = df.resample("3min") - msg = r"Column\(s\) \[2, 'b'\] do not exist" + msg = r"Label\(s\) \[2, 'b'\] do not exist" with pytest.raises(KeyError, match=msg): r.agg({2: "mean", "b": "sum"}) @@ -467,34 +440,30 @@ def cases(request): def test_agg_mixed_column_aggregation(cases, a_mean, a_std, b_mean, b_std, request): expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) - expected.columns = pd.MultiIndex.from_product([["A", "B"], ["mean", "std"]]) - msg = "using SeriesGroupBy.[mean|std]" + expected.columns = pd.MultiIndex.from_product([["A", "B"], ["mean", ""]]) # "date" is an index and a column, so get included in the agg if "df_mult" in request.node.callspec.id: date_mean = cases["date"].mean() date_std = cases["date"].std() expected = pd.concat([date_mean, date_std, expected], axis=1) expected.columns = pd.MultiIndex.from_product( - [["date", "A", "B"], ["mean", "std"]] + [["date", "A", "B"], ["mean", ""]] ) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = cases.aggregate([np.mean, np.std]) + result = cases.aggregate([np.mean, lambda x: np.std(x, ddof=1)]) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "agg", [ - {"func": {"A": np.mean, "B": np.std}}, - {"A": ("A", np.mean), "B": ("B", np.std)}, - {"A": NamedAgg("A", np.mean), "B": NamedAgg("B", np.std)}, + {"func": {"A": np.mean, "B": lambda x: np.std(x, ddof=1)}}, + {"A": ("A", np.mean), "B": ("B", lambda x: np.std(x, ddof=1))}, + {"A": NamedAgg("A", np.mean), "B": NamedAgg("B", lambda x: np.std(x, ddof=1))}, ], ) def test_agg_both_mean_std_named_result(cases, a_mean, b_std, agg): - msg = "using SeriesGroupBy.[mean|std]" expected = pd.concat([a_mean, b_std], axis=1) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = cases.aggregate(**agg) + result = cases.aggregate(**agg) tm.assert_frame_equal(result, expected, check_like=True) @@ -550,11 +519,9 @@ def test_agg_dict_of_lists(cases, a_mean, a_std, b_mean, b_std): ) def test_agg_with_lambda(cases, agg): # passed lambda - msg = "using SeriesGroupBy.sum" rcustom = cases["B"].apply(lambda x: np.std(x, ddof=1)) expected = pd.concat([cases["A"].sum(), rcustom], axis=1) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = cases.agg(**agg) + result = cases.agg(**agg) tm.assert_frame_equal(result, expected, check_like=True) @@ -567,7 +534,7 @@ def test_agg_with_lambda(cases, agg): ], ) def test_agg_no_column(cases, agg): - msg = r"Column\(s\) \['result1', 'result2'\] do not exist" + msg = r"Label\(s\) \['result1', 'result2'\] do not exist" with pytest.raises(KeyError, match=msg): cases[["A", "B"]].agg(**agg) @@ -615,31 +582,11 @@ def test_agg_specificationerror_series(cases, agg): def test_agg_specificationerror_invalid_names(cases): # errors # invalid names in the agg specification - msg = r"Column\(s\) \['B'\] do not exist" + msg = r"Label\(s\) \['B'\] do not exist" with pytest.raises(KeyError, match=msg): cases[["A"]].agg({"A": ["sum", "std"], "B": ["mean", "std"]}) -@pytest.mark.parametrize( - "func", [["min"], ["mean", "max"], 
{"A": "sum"}, {"A": "prod", "B": "median"}] -) -def test_multi_agg_axis_1_raises(func): - # GH#46904 - - index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D") - index.name = "date" - df = DataFrame( - np.random.default_rng(2).random((10, 2)), columns=list("AB"), index=index - ).T - warning_msg = "DataFrame.resample with axis=1 is deprecated." - with tm.assert_produces_warning(FutureWarning, match=warning_msg): - res = df.resample("ME", axis=1) - with pytest.raises( - NotImplementedError, match="axis other than 0 is not supported" - ): - res.agg(func) - - def test_agg_nested_dicts(): index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D") index.name = "date" @@ -684,7 +631,7 @@ def test_try_aggregate_non_existing_column(): df = DataFrame(data).set_index("dt") # Error as we don't have 'z' column - msg = r"Column\(s\) \['z'\] do not exist" + msg = r"Label\(s\) \['z'\] do not exist" with pytest.raises(KeyError, match=msg): df.resample("30min").agg({"x": ["mean"], "y": ["median"], "z": ["sum"]}) @@ -761,7 +708,9 @@ def test_selection_api_validation(): tm.assert_frame_equal(exp, result) exp.index.name = "d" - with pytest.raises(TypeError, match="datetime64 type does not support sum"): + with pytest.raises( + TypeError, match="datetime64 type does not support operation 'sum'" + ): df.resample("2D", level="d").sum() result = df.resample("2D", level="d").sum(numeric_only=True) tm.assert_frame_equal(exp, result) @@ -783,7 +732,7 @@ def test_agg_with_datetime_index_list_agg_func(col_name): ), columns=[col_name], ) - result = df.resample("1d").aggregate(["mean"]) + result = df.resample("1D").aggregate(["mean"]) expected = DataFrame( [47.5, 143.5, 195.5], index=date_range(start="2017-01-01", freq="D", periods=3, tz="Europe/Berlin"), @@ -932,7 +881,9 @@ def test_end_and_end_day_origin( ("sem", lib.no_default, "could not convert string to float"), ], ) -def test_frame_downsample_method(method, numeric_only, expected_data): +def test_frame_downsample_method( + method, numeric_only, expected_data, using_infer_string +): # GH#46442 test if `numeric_only` behave as expected for DataFrameGroupBy index = date_range("2018-01-01", periods=2, freq="D") @@ -949,6 +900,11 @@ def test_frame_downsample_method(method, numeric_only, expected_data): if method in ("var", "mean", "median", "prod"): klass = TypeError msg = re.escape(f"agg function failed [how->{method},dtype->") + if using_infer_string: + msg = f"dtype 'str' does not support operation '{method}'" + elif method in ["sum", "std", "sem"] and using_infer_string: + klass = TypeError + msg = f"dtype 'str' does not support operation '{method}'" else: klass = ValueError msg = expected_data @@ -983,7 +939,9 @@ def test_frame_downsample_method(method, numeric_only, expected_data): ("last", lib.no_default, ["cat_2"]), ], ) -def test_series_downsample_method(method, numeric_only, expected_data): +def test_series_downsample_method( + method, numeric_only, expected_data, using_infer_string +): # GH#46442 test if `numeric_only` behave as expected for SeriesGroupBy index = date_range("2018-01-01", periods=2, freq="D") @@ -999,85 +957,17 @@ def test_series_downsample_method(method, numeric_only, expected_data): func(**kwargs) elif method == "prod": msg = re.escape("agg function failed [how->prod,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'prod'" with pytest.raises(TypeError, match=msg): func(**kwargs) + else: result = func(**kwargs) expected = Series(expected_data, index=expected_index) 
tm.assert_series_equal(result, expected) -@pytest.mark.parametrize( - "method, raises", - [ - ("sum", True), - ("prod", True), - ("min", True), - ("max", True), - ("first", False), - ("last", False), - ("median", False), - ("mean", True), - ("std", True), - ("var", True), - ("sem", False), - ("ohlc", False), - ("nunique", False), - ], -) -def test_args_kwargs_depr(method, raises): - index = date_range("20180101", periods=3, freq="h") - df = Series([2, 4, 6], index=index) - resampled = df.resample("30min") - args = () - - func = getattr(resampled, method) - - error_msg = "numpy operations are not valid with resample." - error_msg_type = "too many arguments passed in" - warn_msg = f"Passing additional args to DatetimeIndexResampler.{method}" - - if raises: - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - with pytest.raises(UnsupportedFunctionCall, match=error_msg): - func(*args, 1, 2, 3) - else: - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - with pytest.raises(TypeError, match=error_msg_type): - func(*args, 1, 2, 3) - - -def test_df_axis_param_depr(): - index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D") - index.name = "date" - df = DataFrame( - np.random.default_rng(2).random((10, 2)), columns=list("AB"), index=index - ).T - - # Deprecation error when axis=1 is explicitly passed - warning_msg = "DataFrame.resample with axis=1 is deprecated." - with tm.assert_produces_warning(FutureWarning, match=warning_msg): - df.resample("ME", axis=1) - - # Deprecation error when axis=0 is explicitly passed - df = df.T - warning_msg = ( - "The 'axis' keyword in DataFrame.resample is deprecated and " - "will be removed in a future version." - ) - with tm.assert_produces_warning(FutureWarning, match=warning_msg): - df.resample("ME", axis=0) - - -def test_series_axis_param_depr(_test_series): - warning_msg = ( - "The 'axis' keyword in Series.resample is " - "deprecated and will be removed in a future version." 
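
The deleted `test_args_kwargs_depr`, `test_df_axis_param_depr`, and `test_series_axis_param_depr` covered deprecation warnings that have now run their course: the `axis` keyword is gone from `resample`, and the warn-then-raise dance with extra positional args no longer applies. A sketch of what a caller should expect after the removal; the exact error text is an assumption, since the tests asserting it were deleted:

```python
import pandas as pd

ser = pd.Series([2, 4, 6], index=pd.date_range("20180101", periods=3, freq="h"))
try:
    ser.resample("30min", axis=0)  # keyword no longer exists
except TypeError as err:
    print(err)  # e.g. "resample() got an unexpected keyword argument 'axis'"
```
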
- ) - with tm.assert_produces_warning(FutureWarning, match=warning_msg): - _test_series.resample("h", axis=0) - - def test_resample_empty(): # GH#52484 df = DataFrame( diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 337c5ff53bd14..286625b8ce470 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -69,12 +69,8 @@ def test_deferred_with_groupby(): def f_0(x): return x.set_index("date").resample("D").asfreq() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = df.groupby("id").apply(f_0) - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.set_index("date").groupby("id").resample("D").asfreq() + expected = df.groupby("id").apply(f_0) + result = df.set_index("date").groupby("id").resample("D").asfreq() tm.assert_frame_equal(result, expected) df = DataFrame( @@ -88,12 +84,8 @@ def f_0(x): def f_1(x): return x.resample("1D").ffill() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = df.groupby("group").apply(f_1) - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("group").resample("1D").ffill() + expected = df.groupby("group").apply(f_1) + result = df.groupby("group").resample("1D").ffill() tm.assert_frame_equal(result, expected) @@ -108,9 +100,7 @@ def test_getitem(test_frame): result = g.B.resample("2s").mean() tm.assert_series_equal(result, expected) - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = g.resample("2s").mean().B + result = g.resample("2s").mean().B tm.assert_series_equal(result, expected) @@ -234,12 +224,8 @@ def test_methods(f, test_frame): g = test_frame.groupby("A") r = g.resample("2s") - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = getattr(r, f)() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = g.apply(lambda x: getattr(x.resample("2s"), f)()) + result = getattr(r, f)() + expected = g.apply(lambda x: getattr(x.resample("2s"), f)()) tm.assert_equal(result, expected) @@ -256,12 +242,8 @@ def test_methods_nunique(test_frame): def test_methods_std_var(f, test_frame): g = test_frame.groupby("A") r = g.resample("2s") - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = getattr(r, f)(ddof=1) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = g.apply(lambda x: getattr(x.resample("2s"), f)(ddof=1)) + result = getattr(r, f)(ddof=1) + expected = g.apply(lambda x: getattr(x.resample("2s"), f)(ddof=1)) tm.assert_frame_equal(result, expected) @@ -270,24 +252,18 @@ def test_apply(test_frame): r = g.resample("2s") # reduction - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = g.resample("2s").sum() + expected = g.resample("2s").sum() def f_0(x): return 
x.resample("2s").sum() - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = r.apply(f_0) + result = r.apply(f_0) tm.assert_frame_equal(result, expected) def f_1(x): return x.resample("2s").apply(lambda y: y.sum()) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = g.apply(f_1) + result = g.apply(f_1) # y.sum() results in int64 instead of int32 on 32-bit architectures expected = expected.astype("int64") tm.assert_frame_equal(result, expected) @@ -355,9 +331,7 @@ def test_resample_groupby_with_label(unit): # GH 13235 index = date_range("2000-01-01", freq="2D", periods=5, unit=unit) df = DataFrame(index=index, data={"col0": [0, 0, 1, 1, 2], "col1": [1, 1, 1, 1, 1]}) - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("col0").resample("1W", label="left").sum() + result = df.groupby("col0").resample("1W", label="left").sum() mi = [ np.array([0, 0, 1, 2], dtype=np.int64), @@ -367,9 +341,7 @@ def test_resample_groupby_with_label(unit): ), ] mindex = pd.MultiIndex.from_arrays(mi, names=["col0", None]) - expected = DataFrame( - data={"col0": [0, 0, 2, 2], "col1": [1, 1, 2, 1]}, index=mindex - ) + expected = DataFrame(data={"col1": [1, 1, 2, 1]}, index=mindex) tm.assert_frame_equal(result, expected) @@ -378,9 +350,7 @@ def test_consistency_with_window(test_frame): # consistent return values with window df = test_frame expected = Index([1, 2, 3], name="A") - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("A").resample("2s").mean() + result = df.groupby("A").resample("2s").mean() assert result.index.nlevels == 2 tm.assert_index_equal(result.index.levels[0], expected) @@ -397,10 +367,9 @@ def test_median_duplicate_columns(): columns=list("aaa"), index=date_range("2012-01-01", periods=20, freq="s"), ) - df2 = df.copy() - df2.columns = ["a", "b", "c"] - expected = df2.resample("5s").median() result = df.resample("5s").median() + df.columns = ["a", "b", "c"] + expected = df.resample("5s").median() expected.columns = result.columns tm.assert_frame_equal(result, expected) @@ -478,13 +447,12 @@ def test_resample_groupby_agg_listlike(): def test_empty(keys): # GH 26411 df = DataFrame([], columns=["a", "b"], index=TimedeltaIndex([])) - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() + result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() + expected_columns = ["b"] if keys == ["a"] else [] expected = ( DataFrame(columns=["a", "b"]) .set_index(keys, drop=False) - .set_index(TimedeltaIndex([]), append=True) + .set_index(TimedeltaIndex([]), append=True)[expected_columns] ) if len(keys) == 1: expected.index.name = keys[0] @@ -503,8 +471,7 @@ def test_resample_groupby_agg_object_dtype_all_nan(consolidate): if consolidate: df = df._consolidate() - with tm.assert_produces_warning(FutureWarning): - result = df.groupby(["key"]).resample("W", on="date").min() + result = df.groupby(["key"]).resample("W", on="date").min() idx = pd.MultiIndex.from_arrays( [ ["A"] * 3 + ["B"] * 3, @@ -516,7 +483,6 @@ def test_resample_groupby_agg_object_dtype_all_nan(consolidate): ) expected = 
DataFrame( { - "key": ["A"] * 3 + ["B"] * 3, "col1": [0, 5, 12] * 2, "col_object": ["val"] * 3 + [np.nan] * 3, }, @@ -525,6 +491,26 @@ def test_resample_groupby_agg_object_dtype_all_nan(consolidate): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("min_count", [0, 1]) +def test_groupby_resample_empty_sum_string( + string_dtype_no_object, test_frame, min_count +): + # https://github.com/pandas-dev/pandas/issues/60229 + dtype = string_dtype_no_object + test_frame = test_frame.assign(B=pd.array([pd.NA] * len(test_frame), dtype=dtype)) + gbrs = test_frame.groupby("A").resample("40s") + result = gbrs.sum(min_count=min_count) + + index = pd.MultiIndex( + levels=[[1, 2, 3], [pd.to_datetime("2000-01-01", unit="ns")]], + codes=[[0, 1, 2], [0, 0, 0]], + names=["A", None], + ) + value = "" if min_count == 0 else pd.NA + expected = DataFrame({"B": value}, index=index, dtype=dtype) + tm.assert_frame_equal(result, expected) + + def test_groupby_resample_with_list_of_keys(): # GH 47362 df = DataFrame( @@ -554,12 +540,11 @@ def test_resample_no_index(keys): df = DataFrame([], columns=["a", "b", "date"]) df["date"] = pd.to_datetime(df["date"]) df = df.set_index("date") - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() + result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() + expected_columns = ["b"] if keys == ["a"] else [] expected = DataFrame(columns=["a", "b", "date"]).set_index(keys, drop=False) expected["date"] = pd.to_datetime(expected["date"]) - expected = expected.set_index("date", append=True, drop=True) + expected = expected.set_index("date", append=True, drop=True)[expected_columns] if len(keys) == 1: expected.index.name = keys[0] @@ -603,9 +588,7 @@ def test_groupby_resample_size_all_index_same(): {"A": [1] * 3 + [2] * 3 + [1] * 3 + [2] * 3, "B": np.arange(12)}, index=date_range("31/12/2000 18:00", freq="h", periods=12), ) - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("A").resample("D").size() + result = df.groupby("A").resample("D").size() mi_exp = pd.MultiIndex.from_arrays( [ @@ -686,29 +669,3 @@ def test_groupby_resample_on_index_with_list_of_keys_missing_column(): rs = gb.resample("2D") with pytest.raises(KeyError, match="Columns not found"): rs[["val_not_in_dataframe"]] - - -@pytest.mark.parametrize("kind", ["datetime", "period"]) -def test_groupby_resample_kind(kind): - # GH 24103 - df = DataFrame( - { - "datetime": pd.to_datetime( - ["20181101 1100", "20181101 1200", "20181102 1300", "20181102 1400"] - ), - "group": ["A", "B", "A", "B"], - "value": [1, 2, 3, 4], - } - ) - df = df.set_index("datetime") - result = df.groupby("group")["value"].resample("D", kind=kind).last() - - dt_level = pd.DatetimeIndex(["2018-11-01", "2018-11-02"]) - if kind == "period": - dt_level = dt_level.to_period(freq="D") - expected_index = pd.MultiIndex.from_product( - [["A", "B"], dt_level], - names=["group", "datetime"], - ) - expected = Series([1, 3, 2, 4], index=expected_index, name="value") - tm.assert_series_equal(result, expected) diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 3d9098917a12d..e6cfa12f5f61a 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -57,12 +57,8 @@ def 
test_count(test_series): def test_numpy_reduction(test_series): result = test_series.resample("YE", closed="right").prod() - - msg = "using SeriesGroupBy.prod" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = test_series.groupby(lambda x: x.year).agg(np.prod) + expected = test_series.groupby(lambda x: x.year).agg(np.prod) expected.index = result.index - tm.assert_series_equal(result, expected) @@ -197,7 +193,7 @@ def test_aggregate_nth(): ) def test_resample_entirely_nat_window(method, method_args, unit): ser = Series([0] * 2 + [np.nan] * 2, index=date_range("2017", periods=4)) - result = methodcaller(method, **method_args)(ser.resample("2d")) + result = methodcaller(method, **method_args)(ser.resample("2D")) exp_dti = pd.DatetimeIndex(["2017-01-01", "2017-01-03"], dtype="M8[ns]", freq="2D") expected = Series([0.0, unit], index=exp_dti) @@ -296,7 +292,7 @@ def test_repr(): # GH18203 result = repr(Grouper(key="A", freq="h")) expected = ( - "TimeGrouper(key='A', freq=<Hour>, axis=0, sort=True, dropna=True, " + "TimeGrouper(key='A', freq=<Hour>, sort=True, dropna=True, " "closed='left', label='left', how='mean', " "convention='e', origin='start_day')" ) @@ -304,7 +300,7 @@ result = repr(Grouper(key="A", freq="h", origin="2000-01-01")) expected = ( - "TimeGrouper(key='A', freq=<Hour>, axis=0, sort=True, dropna=True, " + "TimeGrouper(key='A', freq=<Hour>, sort=True, dropna=True, " "closed='left', label='left', how='mean', " "convention='e', origin=Timestamp('2000-01-01 00:00:00'))" ) @@ -337,54 +333,104 @@ def test_upsample_sum(method, method_args, expected_values): tm.assert_series_equal(result, expected) -def test_groupby_resample_interpolate(): +@pytest.fixture +def groupy_test_df(): + return DataFrame( + {"price": [10, 11, 9], "volume": [50, 60, 50]}, + index=date_range("01/01/2018", periods=3, freq="W"), + ) + + +def test_groupby_resample_interpolate_raises(groupy_test_df): + # GH 35325 + + # Make a copy of the test data frame that has index.name=None + groupy_test_df_without_index_name = groupy_test_df.copy() + groupy_test_df_without_index_name.index.name = None + + dfs = [groupy_test_df, groupy_test_df_without_index_name] + + for df in dfs: + with pytest.raises( + NotImplementedError, + match="Direct interpolation of MultiIndex data frames is not supported", + ): + df.groupby("volume").resample("1D").interpolate(method="linear") + + +def test_groupby_resample_interpolate_with_apply_syntax(groupy_test_df): # GH 35325 - d = {"price": [10, 11, 9], "volume": [50, 60, 50]} - df = DataFrame(d) + # Make a copy of the test data frame that has index.name=None + groupy_test_df_without_index_name = groupy_test_df.copy() + groupy_test_df_without_index_name.index.name = None + + dfs = [groupy_test_df, groupy_test_df_without_index_name] + + for df in dfs: + result = df.groupby("volume").apply( + lambda x: x.resample("1D").interpolate(method="linear"), + ) - df["week_starting"] = date_range("01/01/2018", periods=3, freq="W") + volume = [50] * 15 + [60] + week_starting = list(date_range("2018-01-07", "2018-01-21")) + [ + Timestamp("2018-01-14") + ] + expected_ind = pd.MultiIndex.from_arrays( + [volume, week_starting], + names=["volume", df.index.name], + ) - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = ( - df.set_index("week_starting") - .groupby("volume") - .resample("1D") - .interpolate(method="linear") + expected = DataFrame( + data={ + "price": [ + 10.0, + 9.928571428571429, +
9.857142857142858, + 9.785714285714286, + 9.714285714285714, + 9.642857142857142, + 9.571428571428571, + 9.5, + 9.428571428571429, + 9.357142857142858, + 9.285714285714286, + 9.214285714285714, + 9.142857142857142, + 9.071428571428571, + 9.0, + 11.0, + ] + }, + index=expected_ind, ) + tm.assert_frame_equal(result, expected) - volume = [50] * 15 + [60] - week_starting = list(date_range("2018-01-07", "2018-01-21")) + [ - Timestamp("2018-01-14") - ] + +def test_groupby_resample_interpolate_with_apply_syntax_off_grid(groupy_test_df): + """Similar test as test_groupby_resample_interpolate_with_apply_syntax but + with resampling that results in missing anchor points when interpolating. + See GH#21351.""" + # GH#21351 + result = groupy_test_df.groupby("volume").apply( + lambda x: x.resample("265h").interpolate(method="linear") + ) + + volume = [50, 50, 60] + week_starting = pd.DatetimeIndex( + [ + Timestamp("2018-01-07"), + Timestamp("2018-01-18 01:00:00"), + Timestamp("2018-01-14"), + ] + ).as_unit("ns") expected_ind = pd.MultiIndex.from_arrays( [volume, week_starting], names=["volume", "week_starting"], ) expected = DataFrame( - data={ - "price": [ - 10.0, - 9.928571428571429, - 9.857142857142858, - 9.785714285714286, - 9.714285714285714, - 9.642857142857142, - 9.571428571428571, - 9.5, - 9.428571428571429, - 9.357142857142858, - 9.285714285714286, - 9.214285714285714, - 9.142857142857142, - 9.071428571428571, - 9.0, - 11.0, - ], - "volume": [50.0] * 15 + [60], - }, + data={"price": [10.0, 9.5, 11.0]}, index=expected_ind, ) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_names=False) diff --git a/pandas/tests/resample/test_timedelta.py b/pandas/tests/resample/test_timedelta.py index 7c70670d42908..309810b656ed3 100644 --- a/pandas/tests/resample/test_timedelta.py +++ b/pandas/tests/resample/test_timedelta.py @@ -176,7 +176,6 @@ def test_resample_with_timedelta_yields_no_empty_groups(duplicates): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) def test_resample_quantile_timedelta(unit): # GH: 29485 dtype = np.dtype(f"m8[{unit}]") diff --git a/pandas/tests/reshape/concat/conftest.py b/pandas/tests/reshape/concat/conftest.py deleted file mode 100644 index 62b8c59ba8855..0000000000000 --- a/pandas/tests/reshape/concat/conftest.py +++ /dev/null @@ -1,7 +0,0 @@ -import pytest - - -@pytest.fixture(params=[True, False]) -def sort(request): - """Boolean sort keyword for concat and DataFrame.append.""" - return request.param diff --git a/pandas/tests/reshape/concat/test_append.py b/pandas/tests/reshape/concat/test_append.py index 81ca227fb7afb..6e18ccfc70e06 100644 --- a/pandas/tests/reshape/concat/test_append.py +++ b/pandas/tests/reshape/concat/test_append.py @@ -162,9 +162,7 @@ def test_append_preserve_index_name(self): df2 = DataFrame(data=[[1, 4, 7], [2, 5, 8], [3, 6, 9]], columns=["A", "B", "C"]) df2 = df2.set_index(["A"]) - msg = "The behavior of array concatenation with empty entries is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df1._append(df2) + result = df1._append(df2) assert result.index.name == "A" indexes_can_append = [ @@ -328,16 +326,13 @@ def test_append_empty_frame_to_series_with_dateutil_tz(self): result = df._append([ser, ser], ignore_index=True) tm.assert_frame_equal(result, expected) - def test_append_empty_tz_frame_with_datetime64ns(self, using_array_manager): + def test_append_empty_tz_frame_with_datetime64ns(self): # 
https://github.com/pandas-dev/pandas/issues/35460 df = DataFrame(columns=["a"]).astype("datetime64[ns, UTC]") # pd.NaT gets inferred as tz-naive, so append result is tz-naive result = df._append({"a": pd.NaT}, ignore_index=True) - if using_array_manager: - expected = DataFrame({"a": [pd.NaT]}, dtype=object) - else: - expected = DataFrame({"a": [np.nan]}, dtype=object) + expected = DataFrame({"a": [pd.NaT]}, dtype=object) tm.assert_frame_equal(result, expected) # also test with typed value to append @@ -356,9 +351,7 @@ def test_append_empty_tz_frame_with_datetime64ns(self, using_array_manager): "dtype_str", ["datetime64[ns, UTC]", "datetime64[ns]", "Int64", "int64"] ) @pytest.mark.parametrize("val", [1, "NaT"]) - def test_append_empty_frame_with_timedelta64ns_nat( - self, dtype_str, val, using_array_manager - ): + def test_append_empty_frame_with_timedelta64ns_nat(self, dtype_str, val): # https://github.com/pandas-dev/pandas/issues/35460 df = DataFrame(columns=["a"]).astype(dtype_str) @@ -366,12 +359,6 @@ def test_append_empty_frame_with_timedelta64ns_nat( result = df._append(other, ignore_index=True) expected = other.astype(object) - if isinstance(val, str) and dtype_str != "int64" and not using_array_manager: - # TODO: expected used to be `other.astype(object)` which is a more - # reasonable result. This was changed when tightening - # assert_frame_equal's treatment of mismatched NAs to match the - # existing behavior. - expected = DataFrame({"a": [np.nan]}, dtype=object) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/reshape/concat/test_append_common.py b/pandas/tests/reshape/concat/test_append_common.py index 31c3ef3176222..d0ff950e7985f 100644 --- a/pandas/tests/reshape/concat/test_append_common.py +++ b/pandas/tests/reshape/concat/test_append_common.py @@ -1,3 +1,5 @@ +import zoneinfo + import numpy as np import pytest @@ -19,12 +21,12 @@ "float64": [1.1, np.nan, 3.3], "category": Categorical(["X", "Y", "Z"]), "object": ["a", "b", "c"], - "datetime64[ns]": [ + "datetime64[s]": [ pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02"), pd.Timestamp("2011-01-03"), ], - "datetime64[ns, US/Eastern]": [ + "datetime64[s, US/Eastern]": [ pd.Timestamp("2011-01-01", tz="US/Eastern"), pd.Timestamp("2011-01-02", tz="US/Eastern"), pd.Timestamp("2011-01-03", tz="US/Eastern"), @@ -353,14 +355,15 @@ def test_concatlike_datetimetz_to_object(self, tz_aware_fixture): tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) # different tz - dti3 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"], tz="US/Pacific") + tz_diff = zoneinfo.ZoneInfo("US/Hawaii") + dti3 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"], tz=tz_diff) exp = Index( [ pd.Timestamp("2011-01-01", tz=tz), pd.Timestamp("2011-01-02", tz=tz), - pd.Timestamp("2012-01-01", tz="US/Pacific"), - pd.Timestamp("2012-01-02", tz="US/Pacific"), + pd.Timestamp("2012-01-01", tz=tz_diff), + pd.Timestamp("2012-01-02", tz=tz_diff), ], dtype=object, ) @@ -691,15 +694,12 @@ def test_concat_categorical_empty(self): s1 = Series([], dtype="category") s2 = Series([1, 2], dtype="category") + exp = s2.astype(object) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1._append(s2, ignore_index=True), exp) - msg = "The behavior of array concatenation with empty entries is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) - tm.assert_series_equal(s1._append(s2, ignore_index=True), s2) - 
- with tm.assert_produces_warning(FutureWarning, match=msg): - tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2) - tm.assert_series_equal(s2._append(s1, ignore_index=True), s2) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2._append(s1, ignore_index=True), exp) s1 = Series([], dtype="category") s2 = Series([], dtype="category") @@ -719,15 +719,12 @@ def test_concat_categorical_empty(self): s1 = Series([], dtype="category") s2 = Series([np.nan, np.nan]) - # empty Series is ignored - exp = Series([np.nan, np.nan]) - with tm.assert_produces_warning(FutureWarning, match=msg): - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s1._append(s2, ignore_index=True), exp) + exp = Series([np.nan, np.nan], dtype=object) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1._append(s2, ignore_index=True), exp) - with tm.assert_produces_warning(FutureWarning, match=msg): - tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) - tm.assert_series_equal(s2._append(s1, ignore_index=True), exp) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2._append(s1, ignore_index=True), exp) def test_categorical_concat_append(self): cat = Categorical(["a", "b"], categories=["a", "b"]) diff --git a/pandas/tests/reshape/concat/test_categorical.py b/pandas/tests/reshape/concat/test_categorical.py index bbaaf0abecfbd..8e6a14e6bfb8f 100644 --- a/pandas/tests/reshape/concat/test_categorical.py +++ b/pandas/tests/reshape/concat/test_categorical.py @@ -59,9 +59,7 @@ def test_categorical_concat_dtypes(self, using_infer_string): num = Series([1, 2, 3]) df = pd.concat([Series(cat), obj, num], axis=1, keys=index) - result = df.dtypes == ( - object if not using_infer_string else "string[pyarrow_numpy]" - ) + result = df.dtypes == (object if not using_infer_string else "str") expected = Series([False, True, False], index=index) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 9e34d02091e69..2d0eb5d14a1d9 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -5,12 +5,12 @@ from collections.abc import Iterator from datetime import datetime from decimal import Decimal +import itertools import numpy as np import pytest from pandas.errors import InvalidIndexError -import pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -18,6 +18,7 @@ Index, MultiIndex, PeriodIndex, + RangeIndex, Series, concat, date_range, @@ -44,59 +45,42 @@ def test_append_concat(self): assert isinstance(result.index, PeriodIndex) assert result.index[0] == s1.index[0] - def test_concat_copy(self, using_array_manager, using_copy_on_write): + def test_concat_copy(self): df = DataFrame(np.random.default_rng(2).standard_normal((4, 3))) df2 = DataFrame(np.random.default_rng(2).integers(0, 10, size=4).reshape(4, 1)) df3 = DataFrame({5: "foo"}, index=range(4)) - # These are actual copies. - result = concat([df, df2, df3], axis=1, copy=True) + result = concat([df, df2, df3], axis=1) - if not using_copy_on_write: - for arr in result._mgr.arrays: - assert not any( - np.shares_memory(arr, y) - for x in [df, df2, df3] - for y in x._mgr.arrays - ) - else: - for arr in result._mgr.arrays: - assert arr.base is not None - - # These are the same. 
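
The rewrite of `test_concat_copy` above reflects Copy-on-Write semantics: `concat` loses its `copy` keyword, result blocks are allowed to alias the inputs, and safety comes from copying on mutation rather than from eager copies. A small sketch of the invariant the new block-level assertions lean on:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame(np.zeros((4, 2)))
out = pd.concat([df, df], axis=1, ignore_index=True)
# the result may share memory with df initially, but writing to it
# triggers a copy of the shared block first
out.iloc[0, 0] = 1.0
print(df.iloc[0, 0])  # 0.0 -- the input frame is untouched
```
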
- result = concat([df, df2, df3], axis=1, copy=False) - - for arr in result._mgr.arrays: + for block in result._mgr.blocks: + arr = block.values if arr.dtype.kind == "f": - assert arr.base is df._mgr.arrays[0].base + assert arr.base is df._mgr.blocks[0].values.base elif arr.dtype.kind in ["i", "u"]: - assert arr.base is df2._mgr.arrays[0].base + assert arr.base is df2._mgr.blocks[0].values.base elif arr.dtype == object: - if using_array_manager: - # we get the same array object, which has no base - assert arr is df3._mgr.arrays[0] - else: - assert arr.base is not None + assert arr.base is not None + elif arr.dtype == "string": + tm.shares_memory(arr, df3._mgr.blocks[0].values) # Float block was consolidated. df4 = DataFrame(np.random.default_rng(2).standard_normal((4, 1))) - result = concat([df, df2, df3, df4], axis=1, copy=False) - for arr in result._mgr.arrays: + result = concat([df, df2, df3, df4], axis=1) + for blocks in result._mgr.blocks: + arr = blocks.values if arr.dtype.kind == "f": - if using_array_manager or using_copy_on_write: - # this is a view on some array in either df or df4 - assert any( - np.shares_memory(arr, other) - for other in df._mgr.arrays + df4._mgr.arrays - ) - else: - # the block was consolidated, so we got a copy anyway - assert arr.base is None + # this is a view on some array in either df or df4 + assert any( + np.shares_memory(arr, block.values) + for block in itertools.chain(df._mgr.blocks, df4._mgr.blocks) + ) elif arr.dtype.kind in ["i", "u"]: - assert arr.base is df2._mgr.arrays[0].base + assert arr.base is df2._mgr.blocks[0].values.base elif arr.dtype == object: # this is a view on df3 - assert any(np.shares_memory(arr, other) for other in df3._mgr.arrays) + assert any( + np.shares_memory(arr, block.values) for block in df3._mgr.blocks + ) def test_concat_with_group_keys(self): # axis=0 @@ -342,6 +326,8 @@ def test_concat_mixed_objs_index(self): def test_concat_mixed_objs_index_names(self): # Test row-wise concat for mixed series/frames with distinct names # GH2385, GH15047 + # GH #60723 & GH #56257 (Updated the test case, + # as the above GH PR ones were incorrect) index = date_range("01-Jan-2013", periods=10, freq="h") arr = np.arange(10, dtype="int64") @@ -357,8 +343,11 @@ def test_concat_mixed_objs_index_names(self): result = concat([s1, df, s2]) tm.assert_frame_equal(result, expected) - # Rename all series to 0 when ignore_index=True - expected = DataFrame(np.tile(arr, 3).reshape(-1, 1), columns=[0]) + expected = DataFrame( + np.kron(np.where(np.identity(3) == 1, 1, np.nan), arr).T, + index=np.arange(30, dtype=np.int64), + columns=["foo", 0, "bar"], + ) result = concat([s1, df, s2], ignore_index=True) tm.assert_frame_equal(result, expected) @@ -413,11 +402,34 @@ def test_concat_keys_with_none(self): expected = concat([df0, df0[:2], df0[:1], df0], keys=["b", "c", "d", "e"]) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("klass", [range, RangeIndex]) + @pytest.mark.parametrize("include_none", [True, False]) + def test_concat_preserves_rangeindex(self, klass, include_none): + df = DataFrame([1, 2]) + df2 = DataFrame([3, 4]) + data = [df, None, df2, None] if include_none else [df, df2] + keys_length = 4 if include_none else 2 + result = concat(data, keys=klass(keys_length)) + expected = DataFrame( + [1, 2, 3, 4], + index=MultiIndex( + levels=( + RangeIndex(start=0, stop=keys_length, step=keys_length / 2), + RangeIndex(start=0, stop=2, step=1), + ), + codes=( + np.array([0, 0, 1, 1], dtype=np.int8), + np.array([0, 1, 0, 1], 
dtype=np.int8), + ), + ), + ) + tm.assert_frame_equal(result, expected) + def test_concat_bug_1719(self): ts1 = Series( np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) ) - ts2 = ts1.copy()[::2] + ts2 = ts1[::2] # to join with union # these two are of different length! @@ -550,14 +562,13 @@ def test_concat_no_unnecessary_upcast(float_numpy_dtype, frame_or_series): assert x.values.dtype == dt -@pytest.mark.parametrize("pdt", [Series, DataFrame]) -def test_concat_will_upcast(pdt, any_signed_int_numpy_dtype): +def test_concat_will_upcast(frame_or_series, any_signed_int_numpy_dtype): dt = any_signed_int_numpy_dtype - dims = pdt().ndim + dims = frame_or_series().ndim dfs = [ - pdt(np.array([1], dtype=dt, ndmin=dims)), - pdt(np.array([np.nan], ndmin=dims)), - pdt(np.array([5], dtype=dt, ndmin=dims)), + frame_or_series(np.array([1], dtype=dt, ndmin=dims)), + frame_or_series(np.array([np.nan], ndmin=dims)), + frame_or_series(np.array([5], dtype=dt, ndmin=dims)), ] x = concat(dfs) assert x.values.dtype == "float64" @@ -724,7 +735,7 @@ def test_concat_multiindex_with_empty_rangeindex(): # GH#41234 mi = MultiIndex.from_tuples([("B", 1), ("C", 1)]) df1 = DataFrame([[1, 2]], columns=mi) - df2 = DataFrame(index=[1], columns=pd.RangeIndex(0)) + df2 = DataFrame(index=[1], columns=RangeIndex(0)) result = concat([df1, df2]) expected = DataFrame([[1, 2], [np.nan, np.nan]], columns=mi) @@ -777,7 +788,6 @@ def test_concat_retain_attrs(data): assert df.attrs[1] == 1 -@td.skip_array_manager_invalid_test @pytest.mark.parametrize("df_dtype", ["float64", "int64", "datetime64[ns]"]) @pytest.mark.parametrize("empty_dtype", [None, "float64", "object"]) def test_concat_ignore_empty_object_float(empty_dtype, df_dtype): @@ -785,14 +795,13 @@ def test_concat_ignore_empty_object_float(empty_dtype, df_dtype): df = DataFrame({"foo": [1, 2], "bar": [1, 2]}, dtype=df_dtype) empty = DataFrame(columns=["foo", "bar"], dtype=empty_dtype) - msg = "The behavior of DataFrame concatenation with empty or all-NA entries" - warn = None + needs_update = False if df_dtype == "datetime64[ns]" or ( df_dtype == "float64" and empty_dtype != "float64" ): - warn = FutureWarning - with tm.assert_produces_warning(warn, match=msg): - result = concat([empty, df]) + needs_update = True + + result = concat([empty, df]) expected = df if df_dtype == "int64": # TODO what exact behaviour do we want for integer eventually? 
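
This hunk and the next rewrite the empty/all-NA concat tests around the rule from GH#40893: an empty or all-NA operand is no longer silently dropped from dtype resolution, so mixing it with a typed frame commonly resolves to object dtype. A condensed sketch of the case the `needs_update` branch covers, mirroring the test's own frame construction:

```python
import pandas as pd

df = pd.DataFrame({"foo": [1, 2], "bar": [1, 2]}, dtype="datetime64[ns]")
empty = pd.DataFrame(columns=["foo", "bar"], dtype=object)
# the empty object-dtype operand now participates in dtype resolution
print(pd.concat([empty, df]).dtypes)  # foo/bar are object, not datetime64[ns]
```
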
@@ -800,10 +809,13 @@ def test_concat_ignore_empty_object_float(empty_dtype, df_dtype): expected = df.astype("float64") else: expected = df.astype("object") + + if needs_update: + # GH#40893 changed the expected here to retain dependence on empty + expected = expected.astype(object) tm.assert_frame_equal(result, expected) -@td.skip_array_manager_invalid_test @pytest.mark.parametrize("df_dtype", ["float64", "int64", "datetime64[ns]"]) @pytest.mark.parametrize("empty_dtype", [None, "float64", "object"]) def test_concat_ignore_all_na_object_float(empty_dtype, df_dtype): @@ -817,21 +829,22 @@ def test_concat_ignore_all_na_object_float(empty_dtype, df_dtype): else: df_dtype = "float64" - msg = "The behavior of DataFrame concatenation with empty or all-NA entries" - warn = None + needs_update = False if empty_dtype != df_dtype and empty_dtype is not None: - warn = FutureWarning + needs_update = True elif df_dtype == "datetime64[ns]": - warn = FutureWarning + needs_update = True - with tm.assert_produces_warning(warn, match=msg): - result = concat([empty, df], ignore_index=True) + result = concat([empty, df], ignore_index=True) expected = DataFrame({"foo": [np.nan, 1, 2], "bar": [np.nan, 1, 2]}, dtype=df_dtype) + if needs_update: + # GH#40893 changed the expected here to retain dependence on empty + expected = expected.astype(object) + expected.iloc[0] = np.nan tm.assert_frame_equal(result, expected) -@td.skip_array_manager_invalid_test def test_concat_ignore_empty_from_reindex(): # https://github.com/pandas-dev/pandas/pull/43507#issuecomment-920375856 df1 = DataFrame({"a": [1], "b": [pd.Timestamp("2012-01-01")]}) @@ -839,10 +852,16 @@ def test_concat_ignore_empty_from_reindex(): aligned = df2.reindex(columns=df1.columns) - msg = "The behavior of DataFrame concatenation with empty or all-NA entries" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = concat([df1, aligned], ignore_index=True) - expected = df1 = DataFrame({"a": [1, 2], "b": [pd.Timestamp("2012-01-01"), pd.NaT]}) + result = concat([df1, aligned], ignore_index=True) + + expected = DataFrame( + { + "a": [1, 2], + "b": pd.array([pd.Timestamp("2012-01-01"), np.nan], dtype=object), + }, + dtype=object, + ) + expected["a"] = expected["a"].astype("int64") tm.assert_frame_equal(result, expected) @@ -852,14 +871,14 @@ def test_concat_mismatched_keys_length(): sers = [ser + n for n in range(4)] keys = ["A", "B", "C"] - msg = r"The behavior of pd.concat with len\(keys\) != len\(objs\) is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): + msg = r"The length of the keys" + with pytest.raises(ValueError, match=msg): concat(sers, keys=keys, axis=1) - with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(ValueError, match=msg): concat(sers, keys=keys, axis=0) - with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(ValueError, match=msg): concat((x for x in sers), keys=(y for y in keys), axis=1) - with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(ValueError, match=msg): concat((x for x in sers), keys=(y for y in keys), axis=0) @@ -905,8 +924,80 @@ def test_concat_none_with_timezone_timestamp(): # GH#52093 df1 = DataFrame([{"A": None}]) df2 = DataFrame([{"A": pd.Timestamp("1990-12-20 00:00:00+00:00")}]) - msg = "The behavior of DataFrame concatenation with empty or all-NA entries" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = concat([df1, df2], ignore_index=True) - expected = DataFrame({"A": [None, 
pd.Timestamp("1990-12-20 00:00:00+00:00")]}) + result = concat([df1, df2], ignore_index=True) + expected = DataFrame( + {"A": [None, pd.Timestamp("1990-12-20 00:00:00+00:00")]}, dtype=object + ) + tm.assert_frame_equal(result, expected) + + +def test_concat_with_series_and_frame_returns_rangeindex_columns(): + ser = Series([0]) + df = DataFrame([1, 2]) + result = concat([ser, df]) + expected = DataFrame([0, 1, 2], index=[0, 0, 1]) + tm.assert_frame_equal(result, expected, check_column_type=True) + + +def test_concat_with_moot_ignore_index_and_keys(): + df1 = DataFrame([[0]]) + df2 = DataFrame([[42]]) + + ignore_index = True + keys = ["df1", "df2"] + msg = f"Cannot set {ignore_index=} and specify keys. Either should be used." + with pytest.raises(ValueError, match=msg): + concat([df1, df2], keys=keys, ignore_index=ignore_index) + + +@pytest.mark.parametrize( + "inputs, ignore_index, axis, expected", + [ + # Concatenating DataFrame and named Series without ignore_index + ( + [DataFrame({"a": [0, 1], "b": [2, 3]}), Series([4, 5], name="c")], + False, + 0, + DataFrame( + { + "a": [0, 1, None, None], + "b": [2, 3, None, None], + "c": [None, None, 4, 5], + }, + index=[0, 1, 0, 1], + ), + ), + # Concatenating DataFrame and named Series with ignore_index + ( + [DataFrame({"a": [0, 1], "b": [2, 3]}), Series([4, 5], name="c")], + True, + 0, + DataFrame( + { + "a": [0, 1, None, None], + "b": [2, 3, None, None], + "c": [None, None, 4, 5], + }, + index=[0, 1, 2, 3], + ), + ), + # Concatenating DataFrame and unnamed Series along columns + ( + [DataFrame({"a": [0, 1], "b": [2, 3]}), Series([4, 5]), Series([4, 5])], + False, + 1, + DataFrame({"a": [0, 1], "b": [2, 3], 0: [4, 5], 1: [4, 5]}, index=[0, 1]), + ), + # Concatenating DataFrame and unnamed Series along columns with ignore_index + ( + [DataFrame({"a": [0, 1], "b": [2, 3]}), Series([4, 5]), Series([4, 5])], + True, + 1, + DataFrame({0: [0, 1], 1: [2, 3], 2: [4, 5], 3: [4, 5]}, index=[0, 1]), + ), + ], +) +def test_concat_of_series_and_frame(inputs, ignore_index, axis, expected): + # GH #60723 and #56257 + result = concat(inputs, ignore_index=ignore_index, axis=axis) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/concat/test_dataframe.py b/pandas/tests/reshape/concat/test_dataframe.py index f288921c25753..2ad46ac009928 100644 --- a/pandas/tests/reshape/concat/test_dataframe.py +++ b/pandas/tests/reshape/concat/test_dataframe.py @@ -192,20 +192,6 @@ def test_concat_duplicates_in_index_with_keys(self): tm.assert_frame_equal(result, expected) tm.assert_index_equal(result.index.levels[1], Index([1, 3], name="date")) - @pytest.mark.parametrize("ignore_index", [True, False]) - @pytest.mark.parametrize("order", ["C", "F"]) - @pytest.mark.parametrize("axis", [0, 1]) - def test_concat_copies(self, axis, order, ignore_index, using_copy_on_write): - # based on asv ConcatDataFrames - df = DataFrame(np.zeros((10, 5), dtype=np.float32, order=order)) - - res = concat([df] * 5, axis=axis, ignore_index=ignore_index, copy=True) - - if not using_copy_on_write: - for arr in res._iter_column_arrays(): - for arr2 in df._iter_column_arrays(): - assert not np.shares_memory(arr, arr2) - def test_outer_sort_columns(self): # GH#47127 df1 = DataFrame({"A": [0], "B": [1], 0: 1}) diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py index 71ddff7438254..0cf3192ea3a74 100644 --- a/pandas/tests/reshape/concat/test_datetimes.py +++ b/pandas/tests/reshape/concat/test_datetimes.py @@ -73,23 +73,23 @@ def 
test_concat_datetime_timezone(self): exp_idx = DatetimeIndex( [ - "2010-12-31 23:00:00+00:00", - "2011-01-01 00:00:00+00:00", - "2011-01-01 01:00:00+00:00", "2010-12-31 15:00:00+00:00", "2010-12-31 16:00:00+00:00", "2010-12-31 17:00:00+00:00", + "2010-12-31 23:00:00+00:00", + "2011-01-01 00:00:00+00:00", + "2011-01-01 01:00:00+00:00", ] ).as_unit("ns") expected = DataFrame( [ - [1, np.nan], - [2, np.nan], - [3, np.nan], [np.nan, 1], [np.nan, 2], [np.nan, 3], + [1, np.nan], + [2, np.nan], + [3, np.nan], ], index=exp_idx, columns=["a", "b"], @@ -213,10 +213,8 @@ def test_concat_NaT_dataframes(self, tz): @pytest.mark.parametrize("tz1", [None, "UTC"]) @pytest.mark.parametrize("tz2", [None, "UTC"]) - @pytest.mark.parametrize("item", [pd.NaT, Timestamp("20150101")]) - def test_concat_NaT_dataframes_all_NaT_axis_0( - self, tz1, tz2, item, using_array_manager - ): + @pytest.mark.parametrize("item", [pd.NaT, Timestamp("20150101").as_unit("ns")]) + def test_concat_NaT_dataframes_all_NaT_axis_0(self, tz1, tz2, item): # GH 12396 # tz-naive @@ -228,15 +226,6 @@ def test_concat_NaT_dataframes_all_NaT_axis_0( expected = expected.apply(lambda x: x.dt.tz_localize(tz2)) if tz1 != tz2: expected = expected.astype(object) - if item is pd.NaT and not using_array_manager: - # GH#18463 - # TODO: setting nan here is to keep the test passing as we - # make assert_frame_equal stricter, but is nan really the - # ideal behavior here? - if tz1 is not None: - expected.iloc[-1, 0] = np.nan - else: - expected.iloc[:-1, 0] = np.nan tm.assert_frame_equal(result, expected) @@ -369,7 +358,7 @@ def test_concat_tz_series_tzlocal(self): result = concat([Series(x), Series(y)], ignore_index=True) tm.assert_series_equal(result, Series(x + y)) - assert result.dtype == "datetime64[ns, tzlocal()]" + assert result.dtype == "datetime64[s, tzlocal()]" def test_concat_tz_series_with_datetimelike(self): # see gh-12620: tz and timedelta @@ -550,8 +539,8 @@ def test_concat_timedelta64_block(): df = DataFrame({"time": rng}) result = concat([df, df]) - tm.assert_frame_equal(result.iloc[:10], df) - tm.assert_frame_equal(result.iloc[10:], df) + tm.assert_frame_equal(result.iloc[:10], df, check_index_type=False) + tm.assert_frame_equal(result.iloc[10:], df, check_index_type=False) def test_concat_multiindex_datetime_nat(): @@ -567,7 +556,7 @@ def test_concat_multiindex_datetime_nat(): tm.assert_frame_equal(result, expected) -def test_concat_float_datetime64(using_array_manager): +def test_concat_float_datetime64(): # GH#32934 df_time = DataFrame({"A": pd.array(["2000"], dtype="datetime64[ns]")}) df_float = DataFrame({"A": pd.array([1.0], dtype="float64")}) @@ -592,15 +581,9 @@ def test_concat_float_datetime64(using_array_manager): result = concat([df_time.iloc[:0], df_float]) tm.assert_frame_equal(result, expected) - if not using_array_manager: - expected = DataFrame({"A": pd.array(["2000"], dtype="datetime64[ns]")}) - msg = "The behavior of DataFrame concatenation with empty or all-NA entries" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = concat([df_time, df_float.iloc[:0]]) - tm.assert_frame_equal(result, expected) - else: - expected = DataFrame({"A": pd.array(["2000"], dtype="datetime64[ns]")}).astype( - {"A": "object"} - ) - result = concat([df_time, df_float.iloc[:0]]) - tm.assert_frame_equal(result, expected) + expected = DataFrame({"A": pd.array(["2000"], dtype="datetime64[ns]")}).astype( + object + ) + + result = concat([df_time, df_float.iloc[:0]]) + tm.assert_frame_equal(result, expected) diff --git 
a/pandas/tests/reshape/concat/test_empty.py b/pandas/tests/reshape/concat/test_empty.py index 30ef0a934157b..4869cfbf3a556 100644 --- a/pandas/tests/reshape/concat/test_empty.py +++ b/pandas/tests/reshape/concat/test_empty.py @@ -27,7 +27,7 @@ def test_handle_empty_objects(self, sort, using_infer_string): expected = df.reindex(columns=["a", "b", "c", "d", "foo"]) expected["foo"] = expected["foo"].astype( - object if not using_infer_string else "string[pyarrow_numpy]" + object if not using_infer_string else "str" ) expected.loc[0:4, "foo"] = "bar" @@ -62,11 +62,9 @@ def test_concat_empty_series(self): s1 = Series([1, 2, 3], name="x") s2 = Series(name="y", dtype="float64") - msg = "The behavior of array concatenation with empty entries is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - res = concat([s1, s2], axis=0) + res = concat([s1, s2], axis=0) # name will be reset - exp = Series([1, 2, 3]) + exp = Series([1, 2, 3], dtype="float64") tm.assert_series_equal(res, exp) # empty Series with no name @@ -284,7 +282,7 @@ def test_concat_empty_dataframe_different_dtypes(self, using_infer_string): result = concat([df1[:0], df2[:0]]) assert result["a"].dtype == np.int64 - assert result["b"].dtype == np.object_ if not using_infer_string else "string" + assert result["b"].dtype == np.object_ if not using_infer_string else "str" def test_concat_to_empty_ea(self): """48510 `concat` to an empty EA should maintain type EA dtype.""" diff --git a/pandas/tests/reshape/concat/test_index.py b/pandas/tests/reshape/concat/test_index.py index 52bb9fa0f151b..17f1a9d4ecbf1 100644 --- a/pandas/tests/reshape/concat/test_index.py +++ b/pandas/tests/reshape/concat/test_index.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas.errors import PerformanceWarning - import pandas as pd from pandas import ( DataFrame, @@ -100,23 +98,20 @@ def test_concat_rename_index(self): tm.assert_frame_equal(result, exp) assert result.index.names == exp.index.names - def test_concat_copy_index_series(self, axis, using_copy_on_write): + def test_concat_copy_index_series(self, axis): # GH 29879 ser = Series([1, 2]) - comb = concat([ser, ser], axis=axis, copy=True) - if not using_copy_on_write or axis in [0, "index"]: + comb = concat([ser, ser], axis=axis) + if axis in [0, "index"]: assert comb.index is not ser.index else: assert comb.index is ser.index - def test_concat_copy_index_frame(self, axis, using_copy_on_write): + def test_concat_copy_index_frame(self, axis): # GH 29879 df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"]) - comb = concat([df, df], axis=axis, copy=True) - if not using_copy_on_write: - assert not comb.index.is_(df.index) - assert not comb.columns.is_(df.columns) - elif axis in [0, "index"]: + comb = concat([df, df], axis=axis) + if axis in [0, "index"]: assert not comb.index.is_(df.index) assert comb.columns.is_(df.columns) elif axis in [1, "columns"]: @@ -340,7 +335,7 @@ def test_concat_multiindex_(self): ) tm.assert_frame_equal(result_df, expected_df) - def test_concat_with_key_not_unique(self): + def test_concat_with_key_not_unique(self, performance_warning): # GitHub #46519 df1 = DataFrame({"name": [1]}) df2 = DataFrame({"name": [2]}) @@ -348,15 +343,17 @@ def test_concat_with_key_not_unique(self): df_a = concat([df1, df2, df3], keys=["x", "y", "x"]) # the warning is caused by indexing unsorted multi-index with tm.assert_produces_warning( - PerformanceWarning, match="indexing past lexsort depth" + performance_warning, match="indexing past lexsort depth" ): out_a = 
df_a.loc[("x", 0), :] - df_b = DataFrame( - {"name": [1, 2, 3]}, index=Index([("x", 0), ("y", 0), ("x", 0)]) + {"name": [1, 2, 3]}, + index=MultiIndex( + levels=[["x", "y"], range(1)], codes=[[0, 1, 0], [0, 0, 0]] + ), ) with tm.assert_produces_warning( - PerformanceWarning, match="indexing past lexsort depth" + performance_warning, match="indexing past lexsort depth" ): out_b = df_b.loc[("x", 0)] @@ -367,7 +364,7 @@ def test_concat_with_key_not_unique(self): df3 = DataFrame({"name": ["c", "d"]}) df_a = concat([df1, df2, df3], keys=["x", "y", "x"]) with tm.assert_produces_warning( - PerformanceWarning, match="indexing past lexsort depth" + performance_warning, match="indexing past lexsort depth" ): out_a = df_a.loc[("x", 0), :] @@ -380,7 +377,7 @@ def test_concat_with_key_not_unique(self): ).set_index(["a", "b"]) df_b.index.names = [None, None] with tm.assert_produces_warning( - PerformanceWarning, match="indexing past lexsort depth" + performance_warning, match="indexing past lexsort depth" ): out_b = df_b.loc[("x", 0), :] @@ -452,9 +449,7 @@ def test_concat_axis_1_sort_false_rangeindex(self, using_infer_string): s1 = Series(["a", "b", "c"]) s2 = Series(["a", "b"]) s3 = Series(["a", "b", "c", "d"]) - s4 = Series( - [], dtype=object if not using_infer_string else "string[pyarrow_numpy]" - ) + s4 = Series([], dtype=object if not using_infer_string else "str") result = concat( [s1, s2, s3, s4], sort=False, join="outer", ignore_index=False, axis=1 ) @@ -465,7 +460,7 @@ def test_concat_axis_1_sort_false_rangeindex(self, using_infer_string): ["c", np.nan] * 2, [np.nan] * 2 + ["d"] + [np.nan], ], - dtype=object if not using_infer_string else "string[pyarrow_numpy]", + dtype=object if not using_infer_string else "str", ) tm.assert_frame_equal( result, expected, check_index_type=True, check_column_type=True diff --git a/pandas/tests/reshape/concat/test_series.py b/pandas/tests/reshape/concat/test_series.py index c12b835cb61e1..3523340bb2858 100644 --- a/pandas/tests/reshape/concat/test_series.py +++ b/pandas/tests/reshape/concat/test_series.py @@ -30,11 +30,11 @@ def test_concat_series(self): result = concat(pieces, keys=[0, 1, 2]) expected = ts.copy() - - ts.index = DatetimeIndex(np.array(ts.index.values, dtype="M8[ns]")) - exp_codes = [np.repeat([0, 1, 2], [len(x) for x in pieces]), np.arange(len(ts))] - exp_index = MultiIndex(levels=[[0, 1, 2], ts.index], codes=exp_codes) + exp_index = MultiIndex( + levels=[[0, 1, 2], DatetimeIndex(ts.index.to_numpy(dtype="M8[ns]"))], + codes=exp_codes, + ) expected.index = exp_index tm.assert_series_equal(result, expected) @@ -43,10 +43,8 @@ def test_concat_empty_and_non_empty_series_regression(self): s1 = Series([1]) s2 = Series([], dtype=object) - expected = s1 - msg = "The behavior of array concatenation with empty entries is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = concat([s1, s2]) + expected = s1.astype(object) + result = concat([s1, s2]) tm.assert_series_equal(result, expected) def test_concat_series_axis1(self): @@ -128,11 +126,10 @@ def test_concat_series_axis1_same_names_ignore_index(self): tm.assert_index_equal(result.columns, expected, exact=True) - @pytest.mark.parametrize( - "s1name,s2name", [(np.int64(190), (43, 0)), (190, (43, 0))] - ) - def test_concat_series_name_npscalar_tuple(self, s1name, s2name): + @pytest.mark.parametrize("s1name", [np.int64(190), 190]) + def test_concat_series_name_npscalar_tuple(self, s1name): # GH21015 + s2name = (43, 0) s1 = Series({"a": 1, "b": 2}, name=s1name) s2 = 
Series({"c": 5, "d": 6}, name=s2name) result = concat([s1, s2]) diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 5a1f47e341222..65bfea0b9beea 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -16,6 +16,7 @@ bdate_range, concat, merge, + option_context, ) import pandas._testing as tm @@ -153,13 +154,12 @@ def test_join_on(self, target_source, infer_string): target.join(source, on="E") # overlap - source_copy = source.copy() msg = ( - "You are trying to merge on float64 and object|string columns for key " + "You are trying to merge on float64 and object|str columns for key " "'A'. If you wish to proceed you should use pd.concat" ) with pytest.raises(ValueError, match=msg): - target.join(source_copy, on="A") + target.join(source, on="A") def test_join_on_fails_with_different_right_index(self): df = DataFrame( @@ -563,24 +563,30 @@ def test_join_many_non_unique_index(self): tm.assert_frame_equal(inner, left) tm.assert_frame_equal(inner, right) - def test_join_sort(self): - left = DataFrame({"key": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 4]}) - right = DataFrame({"value2": ["a", "b", "c"]}, index=["bar", "baz", "foo"]) - - joined = left.join(right, on="key", sort=True) - expected = DataFrame( - { - "key": ["bar", "baz", "foo", "foo"], - "value": [2, 3, 1, 4], - "value2": ["a", "b", "c", "c"], - }, - index=[1, 2, 0, 3], - ) - tm.assert_frame_equal(joined, expected) - - # smoke test - joined = left.join(right, on="key", sort=False) - tm.assert_index_equal(joined.index, Index(range(4)), exact=True) + @pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] + ) + def test_join_sort(self, infer_string): + with option_context("future.infer_string", infer_string): + left = DataFrame( + {"key": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 4]} + ) + right = DataFrame({"value2": ["a", "b", "c"]}, index=["bar", "baz", "foo"]) + + joined = left.join(right, on="key", sort=True) + expected = DataFrame( + { + "key": ["bar", "baz", "foo", "foo"], + "value": [2, 3, 1, 4], + "value2": ["a", "b", "c", "c"], + }, + index=[1, 2, 0, 3], + ) + tm.assert_frame_equal(joined, expected) + + # smoke test + joined = left.join(right, on="key", sort=False) + tm.assert_index_equal(joined.index, Index(range(4)), exact=True) def test_join_mixed_non_unique_index(self): # GH 12814, unorderable types in py3 with a non-unique index @@ -614,7 +620,7 @@ def test_join_non_unique_period_index(self): ) tm.assert_frame_equal(result, expected) - def test_mixed_type_join_with_suffix(self): + def test_mixed_type_join_with_suffix(self, using_infer_string): # GH #916 df = DataFrame( np.random.default_rng(2).standard_normal((20, 6)), @@ -624,7 +630,9 @@ def test_mixed_type_join_with_suffix(self): df.insert(5, "dt", "foo") grouped = df.groupby("id") - msg = re.escape("agg function failed [how->mean,dtype->object]") + msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): grouped.mean() mn = grouped.mean(numeric_only=True) @@ -769,7 +777,7 @@ def test_join_on_tz_aware_datetimeindex(self): ) result = df1.join(df2.set_index("date"), on="date") expected = df1.copy() - expected["vals_2"] = Series([np.nan] * 2 + list("tuv"), dtype=object) + expected["vals_2"] = Series([np.nan] * 2 + list("tuv")) tm.assert_frame_equal(result, expected) def 
test_join_datetime_string(self): @@ -1035,6 +1043,25 @@ def test_join_empty(left_empty, how, exp): tm.assert_frame_equal(result, expected) +def test_join_empty_uncomparable_columns(): + # GH 57048 + df1 = DataFrame() + df2 = DataFrame(columns=["test"]) + df3 = DataFrame(columns=["foo", ("bar", "baz")]) + + result = df1 + df2 + expected = DataFrame(columns=["test"]) + tm.assert_frame_equal(result, expected) + + result = df2 + df3 + expected = DataFrame(columns=[("bar", "baz"), "foo", "test"]) + tm.assert_frame_equal(result, expected) + + result = df1 + df3 + expected = DataFrame(columns=[("bar", "baz"), "foo"]) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "how, values", [ @@ -1073,3 +1100,29 @@ def test_join_multiindex_categorical_output_index_dtype(how, values): result = df1.join(df2, how=how) tm.assert_frame_equal(result, expected) + + +def test_join_multiindex_with_none_as_label(): + # GH 58721 + df1 = DataFrame( + {"A": [1]}, + index=MultiIndex.from_tuples([(3, 3)], names=["X", None]), + ) + df2 = DataFrame( + {"B": [2]}, + index=MultiIndex.from_tuples([(3, 3)], names=[None, "X"]), + ) + + result12 = df1.join(df2) + expected12 = DataFrame( + {"A": [1], "B": [2]}, + index=MultiIndex.from_tuples([(3, 3)], names=["X", None]), + ) + tm.assert_frame_equal(result12, expected12) + + result21 = df2.join(df1) + expected21 = DataFrame( + {"B": [2], "A": [1]}, + index=MultiIndex.from_tuples([(3, 3)], names=[None, "X"]), + ) + tm.assert_frame_equal(result21, expected21) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index d7a343ae9f152..f0f67aebd85ec 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -8,7 +8,10 @@ import numpy as np import pytest -from pandas.core.dtypes.common import is_object_dtype +from pandas.core.dtypes.common import ( + is_object_dtype, + is_string_dtype, +) from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd @@ -44,54 +47,6 @@ def get_test_data(ngroups=8, n=50): return arr -def get_series(): - return [ - Series([1], dtype="int64"), - Series([1], dtype="Int64"), - Series([1.23]), - Series(["foo"]), - Series([True]), - Series([pd.Timestamp("2018-01-01")]), - Series([pd.Timestamp("2018-01-01", tz="US/Eastern")]), - ] - - -def get_series_na(): - return [ - Series([np.nan], dtype="Int64"), - Series([np.nan], dtype="float"), - Series([np.nan], dtype="object"), - Series([pd.NaT]), - ] - - -@pytest.fixture(params=get_series(), ids=lambda x: x.dtype.name) -def series_of_dtype(request): - """ - A parametrized fixture returning a variety of Series of different - dtypes - """ - return request.param - - -@pytest.fixture(params=get_series(), ids=lambda x: x.dtype.name) -def series_of_dtype2(request): - """ - A duplicate of the series_of_dtype fixture, so that it can be used - twice by a single function - """ - return request.param - - -@pytest.fixture(params=get_series_na(), ids=lambda x: x.dtype.name) -def series_of_dtype_all_na(request): - """ - A parametrized fixture returning a variety of Series with all NA - values - """ - return request.param - - @pytest.fixture def dfs_for_indicator(): df1 = DataFrame({"col1": [0, 1], "col_conflict": [1, 2], "col_left": ["a", "b"]}) @@ -140,13 +95,6 @@ def left(self): } ) - @pytest.fixture - def right(self): - return DataFrame( - {"v2": np.random.default_rng(2).standard_normal(4)}, - index=["d", "b", "c", "a"], - ) - def test_merge_inner_join_empty(self): # GH 15328 df_empty = DataFrame() @@ -230,7 
+178,11 @@ def test_merge_index_singlekey_inner(self): expected = left.join(right, on="key").loc[result.index] tm.assert_frame_equal(result, expected.loc[:, result.columns]) - def test_merge_misspecified(self, df, df2, left, right): + def test_merge_misspecified(self, df, df2, left): + right = DataFrame( + {"v2": np.random.default_rng(2).standard_normal(4)}, + index=["d", "b", "c", "a"], + ) msg = "Must pass right_on or right_index=True" with pytest.raises(pd.errors.MergeError, match=msg): merge(left, right, left_index=True) @@ -308,7 +260,7 @@ def test_merge_copy(self): left = DataFrame({"a": 0, "b": 1}, index=range(10)) right = DataFrame({"c": "foo", "d": "bar"}, index=range(10)) - merged = merge(left, right, left_index=True, right_index=True, copy=True) + merged = merge(left, right, left_index=True, right_index=True) merged["a"] = 6 assert (left["a"] == 0).all() @@ -316,14 +268,15 @@ def test_merge_copy(self): merged["d"] = "peekaboo" assert (right["d"] == "bar").all() - def test_merge_nocopy(self, using_array_manager): + def test_merge_nocopy(self, using_infer_string): left = DataFrame({"a": 0, "b": 1}, index=range(10)) right = DataFrame({"c": "foo", "d": "bar"}, index=range(10)) - merged = merge(left, right, left_index=True, right_index=True, copy=False) + merged = merge(left, right, left_index=True, right_index=True) assert np.shares_memory(merged["a"]._values, left["a"]._values) - assert np.shares_memory(merged["d"]._values, right["d"]._values) + if not using_infer_string: + assert np.shares_memory(merged["d"]._values, right["d"]._values) def test_intelligently_handle_join_key(self): # #733, be a bit more 1337 about not returning unconsolidated DataFrame @@ -455,13 +408,12 @@ def test_left_merge_empty_dataframe(self): result = merge(right, left, on="key", how="right") tm.assert_frame_equal(result, left) - @pytest.mark.parametrize("how", ["inner", "left", "right", "outer"]) - def test_merge_empty_dataframe(self, index, how): + def test_merge_empty_dataframe(self, index, join_type): # GH52777 left = DataFrame([], index=index[:0]) right = left.copy() - result = left.join(right, how=how) + result = left.join(right, how=join_type) tm.assert_frame_equal(result, left) @pytest.mark.parametrize( @@ -572,6 +524,30 @@ def check2(exp, kwarg): check1(exp_in, kwarg) check2(exp_out, kwarg) + @pytest.mark.parametrize( + "series_of_dtype", + [ + Series([1], dtype="int64"), + Series([1], dtype="Int64"), + Series([1.23]), + Series(["foo"]), + Series([True]), + Series([pd.Timestamp("2018-01-01")]), + Series([pd.Timestamp("2018-01-01", tz="US/Eastern")]), + ], + ) + @pytest.mark.parametrize( + "series_of_dtype2", + [ + Series([1], dtype="int64"), + Series([1], dtype="Int64"), + Series([1.23]), + Series(["foo"]), + Series([True]), + Series([pd.Timestamp("2018-01-01")]), + Series([pd.Timestamp("2018-01-01", tz="US/Eastern")]), + ], + ) def test_merge_empty_frame(self, series_of_dtype, series_of_dtype2): # GH 25183 df = DataFrame( @@ -590,6 +566,27 @@ def test_merge_empty_frame(self, series_of_dtype, series_of_dtype2): actual = df_empty.merge(df, on="key") tm.assert_frame_equal(actual, expected) + @pytest.mark.parametrize( + "series_of_dtype", + [ + Series([1], dtype="int64"), + Series([1], dtype="Int64"), + Series([1.23]), + Series(["foo"]), + Series([True]), + Series([pd.Timestamp("2018-01-01")]), + Series([pd.Timestamp("2018-01-01", tz="US/Eastern")]), + ], + ) + @pytest.mark.parametrize( + "series_of_dtype_all_na", + [ + Series([np.nan], dtype="Int64"), + Series([np.nan], dtype="float"), + 
Series([np.nan], dtype="object"), + Series([pd.NaT]), + ], + ) def test_merge_all_na_column(self, series_of_dtype, series_of_dtype_all_na): # GH 25183 df_left = DataFrame( @@ -667,11 +664,13 @@ def test_merge_nan_right(self): "i1_": {0: 0, 1: np.nan}, "i3": {0: 0.0, 1: np.nan}, None: {0: 0, 1: 0}, - } + }, + columns=Index(["i1", "i2", "i1_", "i3", None], dtype=object), ) .set_index(None) .reset_index()[["i1", "i2", "i1_", "i3"]] ) + result.columns = result.columns.astype("object") tm.assert_frame_equal(result, expected, check_dtype=False) def test_merge_nan_right2(self): @@ -702,7 +701,7 @@ def _constructor(self): assert isinstance(result, NotADataFrame) - def test_join_append_timedeltas(self, using_array_manager): + def test_join_append_timedeltas(self): # timedelta64 issues with join/merge # GH 5695 @@ -710,22 +709,15 @@ def test_join_append_timedeltas(self, using_array_manager): {"d": [datetime(2013, 11, 5, 5, 56)], "t": [timedelta(0, 22500)]} ) df = DataFrame(columns=list("dt")) - msg = "The behavior of DataFrame concatenation with empty or all-NA entries" - warn = FutureWarning - if using_array_manager: - warn = None - with tm.assert_produces_warning(warn, match=msg): - df = concat([df, d], ignore_index=True) - result = concat([df, d], ignore_index=True) + df = concat([df, d], ignore_index=True) + result = concat([df, d], ignore_index=True) expected = DataFrame( { "d": [datetime(2013, 11, 5, 5, 56), datetime(2013, 11, 5, 5, 56)], "t": [timedelta(0, 22500), timedelta(0, 22500)], - } + }, + dtype=object, ) - if using_array_manager: - # TODO(ArrayManager) decide on exact casting rules in concat - expected = expected.astype(object) tm.assert_frame_equal(result, expected) def test_join_append_timedeltas2(self): @@ -820,7 +812,7 @@ def test_overlapping_columns_error_message(self): # #2649, #10639 df2.columns = ["key1", "foo", "foo"] - msg = r"Data columns not unique: Index\(\['foo'\], dtype='object'\)" + msg = r"Data columns not unique: Index\(\['foo'\], dtype='object|str'\)" with pytest.raises(MergeError, match=msg): merge(df, df2) @@ -1375,8 +1367,8 @@ def test_merge_two_empty_df_no_division_error(self): ), ), ( - TimedeltaIndex(["1d", "2d", "3d"]), - TimedeltaIndex(["1d", "2d", "3d", pd.NaT, pd.NaT, pd.NaT]), + TimedeltaIndex(["1D", "2D", "3D"]), + TimedeltaIndex(["1D", "2D", "3D", pd.NaT, pd.NaT, pd.NaT]), ), ], ) @@ -1459,11 +1451,26 @@ def test_merge_readonly(self): ) # make each underlying block array / column array read-only - for arr in data1._mgr.arrays: - arr.flags.writeable = False + for block in data1._mgr.blocks: + block.values.flags.writeable = False data1.merge(data2) # no error + def test_merge_how_validation(self): + # https://github.com/pandas-dev/pandas/issues/59422 + data1 = DataFrame( + np.arange(20).reshape((4, 5)) + 1, columns=["a", "b", "c", "d", "e"] + ) + data2 = DataFrame( + np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"] + ) + msg = ( + "'full' is not a valid Merge type: left, right, inner, outer, " + "left_anti, right_anti, cross, asof" + ) + with pytest.raises(ValueError, match=re.escape(msg)): + data1.merge(data2, how="full") + def _check_merge(x, y): for how in ["inner", "left", "outer"]: @@ -1478,10 +1485,8 @@ def _check_merge(x, y): class TestMergeDtypes: - @pytest.mark.parametrize( - "right_vals", [["foo", "bar"], Series(["foo", "bar"]).astype("category")] - ) - def test_different(self, right_vals): + @pytest.mark.parametrize("dtype", [object, "category"]) + def test_different(self, dtype): left = DataFrame( { "A": ["foo", "bar"], @@ 
-1492,20 +1497,18 @@ def test_different(self, right_vals): "F": Series([1, 2], dtype="int32"), } ) + right_vals = Series(["foo", "bar"], dtype=dtype) right = DataFrame({"A": right_vals}) # GH 9780 # We allow merging on object and categorical cols and cast # categorical cols to object result = merge(left, right, on="A") - assert is_object_dtype(result.A.dtype) + assert is_object_dtype(result.A.dtype) or is_string_dtype(result.A.dtype) - @pytest.mark.parametrize( - "d1", [np.int64, np.int32, np.intc, np.int16, np.int8, np.uint8] - ) @pytest.mark.parametrize("d2", [np.int64, np.float64, np.float32, np.float16]) - def test_join_multi_dtypes(self, d1, d2): - dtype1 = np.dtype(d1) + def test_join_multi_dtypes(self, any_int_numpy_dtype, d2): + dtype1 = np.dtype(any_int_numpy_dtype) dtype2 = np.dtype(d2) left = DataFrame( @@ -1577,11 +1580,12 @@ def test_merge_on_ints_floats_warning(self): B = DataFrame({"Y": [1.1, 2.5, 3.0]}) expected = DataFrame({"X": [3], "Y": [3.0]}) - with tm.assert_produces_warning(UserWarning): + msg = "the float values are not equal to their int representation" + with tm.assert_produces_warning(UserWarning, match=msg): result = A.merge(B, left_on="X", right_on="Y") tm.assert_frame_equal(result, expected) - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match=msg): result = B.merge(A, left_on="Y", right_on="X") tm.assert_frame_equal(result, expected[["Y", "X"]]) @@ -1637,7 +1641,7 @@ def test_merge_incompat_dtypes_are_ok(self, df1_vals, df2_vals): result = merge(df1, df2, on=["A"]) assert is_object_dtype(result.A.dtype) result = merge(df2, df1, on=["A"]) - assert is_object_dtype(result.A.dtype) + assert is_object_dtype(result.A.dtype) or is_string_dtype(result.A.dtype) @pytest.mark.parametrize( "df1_vals, df2_vals", @@ -1647,7 +1651,6 @@ def test_merge_incompat_dtypes_are_ok(self, df1_vals, df2_vals): (Series([1, 2], dtype="int32"), ["a", "b", "c"]), ([0, 1, 2], ["0", "1", "2"]), ([0.0, 1.0, 2.0], ["0", "1", "2"]), - ([0, 1, 2], ["0", "1", "2"]), ( pd.date_range("1/1/2011", periods=2, freq="D"), ["2011-01-01", "2011-01-02"], @@ -1843,6 +1846,41 @@ def test_merge_empty(self, left_empty, how, exp): tm.assert_frame_equal(result, expected) + def test_merge_with_uintc_columns(self): + df1 = DataFrame({"a": ["foo", "bar"], "b": np.array([1, 2], dtype=np.uintc)}) + df2 = DataFrame({"a": ["foo", "baz"], "b": np.array([3, 4], dtype=np.uintc)}) + result = df1.merge(df2, how="outer") + expected = DataFrame( + { + "a": ["bar", "baz", "foo", "foo"], + "b": np.array([2, 4, 1, 3], dtype=np.uintc), + } + ) + tm.assert_frame_equal(result.reset_index(drop=True), expected) + + def test_merge_with_intc_columns(self): + df1 = DataFrame({"a": ["foo", "bar"], "b": np.array([1, 2], dtype=np.intc)}) + df2 = DataFrame({"a": ["foo", "baz"], "b": np.array([3, 4], dtype=np.intc)}) + result = df1.merge(df2, how="outer") + expected = DataFrame( + { + "a": ["bar", "baz", "foo", "foo"], + "b": np.array([2, 4, 1, 3], dtype=np.intc), + } + ) + tm.assert_frame_equal(result.reset_index(drop=True), expected) + + def test_merge_intc_non_monotonic(self): + df = DataFrame({"join_key": Series([0, 2, 1], dtype=np.intc)}) + df_details = DataFrame( + {"join_key": Series([0, 1, 2], dtype=np.intc), "value": ["a", "b", "c"]} + ) + merged = df.merge(df_details, on="join_key", how="left") + expected = DataFrame( + {"join_key": np.array([0, 2, 1], dtype=np.intc), "value": ["a", "c", "b"]} + ) + tm.assert_frame_equal(merged.reset_index(drop=True), expected) + @pytest.fixture def 
left(): @@ -1867,25 +1905,27 @@ def right(): class TestMergeCategorical: - def test_identical(self, left): + def test_identical(self, left, using_infer_string): # merging on the same, should preserve dtypes merged = merge(left, left, on="X") result = merged.dtypes.sort_index() + dtype = np.dtype("O") if not using_infer_string else "str" expected = Series( - [CategoricalDtype(categories=["foo", "bar"]), np.dtype("O"), np.dtype("O")], + [CategoricalDtype(categories=["foo", "bar"]), dtype, dtype], index=["X", "Y_x", "Y_y"], ) tm.assert_series_equal(result, expected) - def test_basic(self, left, right): + def test_basic(self, left, right, using_infer_string): # we have matching Categorical dtypes in X # so should preserve the merged column merged = merge(left, right, on="X") result = merged.dtypes.sort_index() + dtype = np.dtype("O") if not using_infer_string else "str" expected = Series( [ CategoricalDtype(categories=["foo", "bar"]), - np.dtype("O"), + dtype, np.dtype("int64"), ], index=["X", "Y", "Z"], @@ -1989,16 +2029,17 @@ def test_multiindex_merge_with_unordered_categoricalindex(self, ordered): ).set_index(["id", "p"]) tm.assert_frame_equal(result, expected) - def test_other_columns(self, left, right): + def test_other_columns(self, left, right, using_infer_string): # non-merge columns should preserve if possible right = right.assign(Z=right.Z.astype("category")) merged = merge(left, right, on="X") result = merged.dtypes.sort_index() + dtype = np.dtype("O") if not using_infer_string else "str" expected = Series( [ CategoricalDtype(categories=["foo", "bar"]), - np.dtype("O"), + dtype, CategoricalDtype(categories=[1, 2]), ], index=["X", "Y", "Z"], @@ -2017,7 +2058,9 @@ def test_other_columns(self, left, right): lambda x: x.astype(CategoricalDtype(ordered=True)), ], ) - def test_dtype_on_merged_different(self, change, join_type, left, right): + def test_dtype_on_merged_different( + self, change, join_type, left, right, using_infer_string + ): # our merging columns, X now has 2 different dtypes # so we must be object as a result @@ -2029,9 +2072,8 @@ def test_dtype_on_merged_different(self, change, join_type, left, right): merged = merge(left, right, on="X", how=join_type) result = merged.dtypes.sort_index() - expected = Series( - [np.dtype("O"), np.dtype("O"), np.dtype("int64")], index=["X", "Y", "Z"] - ) + dtype = np.dtype("O") if not using_infer_string else "str" + expected = Series([dtype, dtype, np.dtype("int64")], index=["X", "Y", "Z"]) tm.assert_series_equal(result, expected) def test_self_join_multiple_categories(self): @@ -2140,16 +2182,6 @@ def test_merge_on_int_array(self): tm.assert_frame_equal(result, expected) -@pytest.fixture -def left_df(): - return DataFrame({"a": [20, 10, 0]}, index=[2, 1, 0]) - - -@pytest.fixture -def right_df(): - return DataFrame({"b": [300, 100, 200]}, index=[3, 1, 2]) - - class TestMergeOnIndexes: @pytest.mark.parametrize( "how, sort, expected", @@ -2198,7 +2230,9 @@ class TestMergeOnIndexes: ), ], ) - def test_merge_on_indexes(self, left_df, right_df, how, sort, expected): + def test_merge_on_indexes(self, how, sort, expected): + left_df = DataFrame({"a": [20, 10, 0]}, index=[2, 1, 0]) + right_df = DataFrame({"b": [300, 100, 200]}, index=[3, 1, 2]) result = merge( left_df, right_df, left_index=True, right_index=True, how=how, sort=sort ) @@ -2207,23 +2241,28 @@ def test_merge_on_indexes(self, left_df, right_df, how, sort, expected): @pytest.mark.parametrize( "index", - [Index([1, 2], dtype=dtyp, name="index_col") for dtyp in tm.ALL_REAL_NUMPY_DTYPES] 
+ [ + Index([1, 2, 4], dtype=dtyp, name="index_col") + for dtyp in tm.ALL_REAL_NUMPY_DTYPES + ] + [ - CategoricalIndex(["A", "B"], categories=["A", "B"], name="index_col"), - RangeIndex(start=0, stop=2, name="index_col"), - DatetimeIndex(["2018-01-01", "2018-01-02"], name="index_col"), + CategoricalIndex(["A", "B", "C"], categories=["A", "B", "C"], name="index_col"), + RangeIndex(start=0, stop=3, name="index_col"), + DatetimeIndex(["2018-01-01", "2018-01-02", "2018-01-03"], name="index_col"), ], ids=lambda x: f"{type(x).__name__}[{x.dtype}]", ) def test_merge_index_types(index): # gh-20777 # assert key access is consistent across index types - left = DataFrame({"left_data": [1, 2]}, index=index) - right = DataFrame({"right_data": [1.0, 2.0]}, index=index) + left = DataFrame({"left_data": [1, 2, 3]}, index=index) + right = DataFrame({"right_data": [1.0, 2.0, 3.0]}, index=index) result = left.merge(right, on=["index_col"]) - expected = DataFrame({"left_data": [1, 2], "right_data": [1.0, 2.0]}, index=index) + expected = DataFrame( + {"left_data": [1, 2, 3], "right_data": [1.0, 2.0, 3.0]}, index=index + ) tm.assert_frame_equal(result, expected) @@ -2335,19 +2374,15 @@ def test_merge_suffix(col1, col2, kwargs, expected_cols): [ ( "right", - DataFrame( - {"A": [100, 200, 300], "B1": [60, 70, np.nan], "B2": [600, 700, 800]} - ), + {"A": [100, 200, 300], "B1": [60, 70, np.nan], "B2": [600, 700, 800]}, ), ( "outer", - DataFrame( - { - "A": [1, 100, 200, 300], - "B1": [80, 60, 70, np.nan], - "B2": [np.nan, 600, 700, 800], - } - ), + { + "A": [1, 100, 200, 300], + "B1": [80, 60, 70, np.nan], + "B2": [np.nan, 600, 700, 800], + }, ), ], ) @@ -2355,6 +2390,7 @@ def test_merge_duplicate_suffix(how, expected): left_df = DataFrame({"A": [100, 200, 1], "B": [60, 70, 80]}) right_df = DataFrame({"A": [100, 200, 300], "B": [600, 700, 800]}) result = merge(left_df, right_df, on="A", how=how, suffixes=("_x", "_x")) + expected = DataFrame(expected) expected.columns = ["A", "B_x", "B_x"] tm.assert_frame_equal(result, expected) @@ -2499,7 +2535,7 @@ def test_merge_multiindex_columns(): expected_index = MultiIndex.from_tuples(tuples, names=["outer", "inner"]) expected = DataFrame(columns=expected_index) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_dtype=False) def test_merge_datetime_upcast_dtype(): @@ -2819,9 +2855,8 @@ def test_merge_arrow_and_numpy_dtypes(dtype): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("how", ["inner", "left", "outer", "right"]) @pytest.mark.parametrize("tz", [None, "America/Chicago"]) -def test_merge_datetime_different_resolution(tz, how): +def test_merge_datetime_different_resolution(tz, join_type): # https://github.com/pandas-dev/pandas/issues/53200 vals = [ pd.Timestamp(2023, 5, 12, tz=tz), @@ -2835,14 +2870,14 @@ def test_merge_datetime_different_resolution(tz, how): expected = DataFrame({"t": vals, "a": [1.0, 2.0, np.nan], "b": [np.nan, 1.0, 2.0]}) expected["t"] = expected["t"].dt.as_unit("ns") - if how == "inner": + if join_type == "inner": expected = expected.iloc[[1]].reset_index(drop=True) - elif how == "left": + elif join_type == "left": expected = expected.iloc[[0, 1]] - elif how == "right": + elif join_type == "right": expected = expected.iloc[[1, 2]].reset_index(drop=True) - result = df1.merge(df2, on="t", how=how) + result = df1.merge(df2, on="t", how=join_type) tm.assert_frame_equal(result, expected) @@ -2859,16 +2894,21 @@ def test_merge_multiindex_single_level(): tm.assert_frame_equal(result, expected) 
-@pytest.mark.parametrize("how", ["left", "right", "inner", "outer"]) -@pytest.mark.parametrize("sort", [True, False]) @pytest.mark.parametrize("on_index", [True, False]) @pytest.mark.parametrize("left_unique", [True, False]) @pytest.mark.parametrize("left_monotonic", [True, False]) @pytest.mark.parametrize("right_unique", [True, False]) @pytest.mark.parametrize("right_monotonic", [True, False]) def test_merge_combinations( - how, sort, on_index, left_unique, left_monotonic, right_unique, right_monotonic + join_type, + sort, + on_index, + left_unique, + left_monotonic, + right_unique, + right_monotonic, ): + how = join_type # GH 54611 left = [2, 3] if left_unique: @@ -2980,11 +3020,43 @@ def test_merge_empty_frames_column_order(left_empty, right_empty): df2 = df2.iloc[:0] result = merge(df1, df2, on=["A"], how="outer") - expected = DataFrame(1, index=[0], columns=["A", "B", "C", "D"]) + expected = DataFrame(1, index=range(1), columns=["A", "B", "C", "D"]) if left_empty and right_empty: expected = expected.iloc[:0] elif left_empty: - expected.loc[:, "B"] = np.nan + expected["B"] = np.nan elif right_empty: - expected.loc[:, ["C", "D"]] = np.nan + expected[["C", "D"]] = np.nan + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("how", ["left", "right", "inner", "outer"]) +def test_merge_datetime_and_timedelta(how): + left = DataFrame({"key": Series([1, None], dtype="datetime64[ns]")}) + right = DataFrame({"key": Series([1], dtype="timedelta64[ns]")}) + + msg = ( + f"You are trying to merge on {left['key'].dtype} and {right['key'].dtype} " + "columns for key 'key'. If you wish to proceed you should use pd.concat" + ) + with pytest.raises(ValueError, match=re.escape(msg)): + left.merge(right, on="key", how=how) + + msg = ( + f"You are trying to merge on {right['key'].dtype} and {left['key'].dtype} " + "columns for key 'key'. If you wish to proceed you should use pd.concat" + ) + with pytest.raises(ValueError, match=re.escape(msg)): + right.merge(left, on="key", how=how) + + +def test_merge_on_all_nan_column(): + # GH#59421 + left = DataFrame({"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan], "z": [4, 5, 6]}) + right = DataFrame({"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan], "zz": [4, 5, 6]}) + result = left.merge(right, on=["x", "y"], how="outer") + # Should not trigger array bounds eerror with bounds checking or asan enabled. 
+ expected = DataFrame( + {"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan], "z": [4, 5, 6], "zz": [4, 5, 6]} + ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_merge_antijoin.py b/pandas/tests/reshape/merge/test_merge_antijoin.py new file mode 100644 index 0000000000000..006622c6e5e94 --- /dev/null +++ b/pandas/tests/reshape/merge/test_merge_antijoin.py @@ -0,0 +1,280 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import ( + DataFrame, + MultiIndex, +) +import pandas._testing as tm +from pandas.core.reshape.merge import merge + + +def test_merge_antijoin(): + # GH#42916 + left = DataFrame({"A": [1, 2, 3]}, index=["a", "b", "c"]) + right = DataFrame({"B": [1, 2, 4]}, index=["a", "b", "d"]) + + result = merge(left, right, how="left_anti", left_index=True, right_index=True) + expected = DataFrame({"A": [3], "B": [np.nan]}, index=["c"]) + tm.assert_frame_equal(result, expected) + + result = merge(left, right, how="right_anti", left_index=True, right_index=True) + expected = DataFrame({"A": [np.nan], "B": [4]}, index=["d"]) + tm.assert_frame_equal(result, expected) + + +def test_merge_antijoin_on_different_columns(): + left = DataFrame({"A": [1.0, 2.0, 3.0], "B": ["a", "b", "c"]}).astype({"B": object}) + right = DataFrame({"C": [1.0, 2.0, 4.0], "D": ["a", "d", "b"]}).astype( + {"D": object} + ) + + result = merge(left, right, how="left_anti", left_on="B", right_on="D") + expected = DataFrame( + { + "A": [3.0], + "B": ["c"], + "C": [np.nan], + "D": [np.nan], + }, + index=[2], + ).astype({"B": object, "D": object}) + tm.assert_frame_equal(result, expected) + + result = merge(left, right, how="right_anti", left_on="B", right_on="D") + expected = DataFrame( + { + "A": [np.nan], + "B": [np.nan], + "C": [2.0], + "D": ["d"], + }, + index=[1], + ).astype({"B": object, "D": object}) + tm.assert_frame_equal(result, expected) + + +def test_merge_antijoin_nonunique_keys(): + left = DataFrame({"A": [1.0, 2.0, 3.0], "B": ["a", "b", "b"]}).astype({"B": object}) + right = DataFrame({"C": [1.0, 2.0, 4.0], "D": ["b", "d", "d"]}).astype( + {"D": object} + ) + + result = merge(left, right, how="left_anti", left_on="B", right_on="D") + expected = DataFrame( + { + "A": [1.0], + "B": ["a"], + "C": [np.nan], + "D": [np.nan], + }, + index=[0], + ).astype({"B": object, "D": object}) + tm.assert_frame_equal(result, expected) + + result = merge(left, right, how="right_anti", left_on="B", right_on="D") + expected = DataFrame( + { + "A": [np.nan, np.nan], + "B": [np.nan, np.nan], + "C": [2.0, 4.0], + "D": ["d", "d"], + }, + index=[2, 3], + ).astype({"B": object, "D": object}) + tm.assert_frame_equal(result, expected) + + +def test_merge_antijoin_same_df(): + left = DataFrame({"A": [1, 2, 3]}, index=["a", "b", "c"], dtype=np.int64) + result = merge(left, left, how="left_anti", left_index=True, right_index=True) + expected = DataFrame([], columns=["A_x", "A_y"], dtype=np.int64) + tm.assert_frame_equal(result, expected, check_index_type=False) + + +def test_merge_antijoin_nans(): + left = DataFrame({"A": [1.0, 2.0, np.nan], "C": ["a", "b", "c"]}).astype( + {"C": object} + ) + right = DataFrame({"A": [3.0, 2.0, np.nan], "D": ["d", "e", "f"]}).astype( + {"D": object} + ) + result = merge(left, right, how="left_anti", on="A") + expected = DataFrame({"A": [1.0], "C": ["a"], "D": [np.nan]}).astype( + {"C": object, "D": object} + ) + tm.assert_frame_equal(result, expected) + + +def test_merge_antijoin_on_datetime64tz(): + # 
GH11405 + left = DataFrame( + { + "key": pd.date_range("20151010", periods=2, tz="US/Eastern"), + "value": [1.0, 2.0], + } + ) + right = DataFrame( + { + "key": pd.date_range("20151011", periods=3, tz="US/Eastern"), + "value": [1.0, 2.0, 3.0], + } + ) + + expected = DataFrame( + { + "key": pd.date_range("20151010", periods=1, tz="US/Eastern"), + "value_x": [1.0], + "value_y": [np.nan], + }, + index=[0], + ) + result = merge(left, right, on="key", how="left_anti") + tm.assert_frame_equal(result, expected) + + expected = DataFrame( + { + "key": pd.date_range("20151012", periods=2, tz="US/Eastern"), + "value_x": [np.nan, np.nan], + "value_y": [2.0, 3.0], + }, + index=[1, 2], + ) + result = merge(left, right, on="key", how="right_anti") + tm.assert_frame_equal(result, expected) + + +def test_merge_antijoin_multiindex(): + left = DataFrame( + { + "A": [1, 2, 3], + "B": [4, 5, 6], + }, + index=MultiIndex.from_tuples( + [("a", "x"), ("b", "y"), ("c", "z")], names=["first", "second"] + ), + ) + right = DataFrame( + { + "C": [7, 8, 9], + "D": [10, 11, 12], + }, + index=MultiIndex.from_tuples( + [("a", "x"), ("b", "y"), ("c", "w")], names=["first", "second"] + ), + ) + + result = merge(left, right, how="left_anti", left_index=True, right_index=True) + expected = DataFrame( + { + "A": [3], + "B": [6], + "C": [np.nan], + "D": [np.nan], + }, + index=MultiIndex.from_tuples([("c", "z")], names=["first", "second"]), + ) + tm.assert_frame_equal(result, expected) + + result = merge(left, right, how="right_anti", left_index=True, right_index=True) + expected = DataFrame( + { + "A": [np.nan], + "B": [np.nan], + "C": [9], + "D": [12], + }, + index=MultiIndex.from_tuples([("c", "w")], names=["first", "second"]), + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "dtype", + [ + "Int64", + pytest.param("int64[pyarrow]", marks=td.skip_if_no("pyarrow")), + pytest.param("timestamp[s][pyarrow]", marks=td.skip_if_no("pyarrow")), + pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), + ], +) +def test_merge_antijoin_extension_dtype(dtype): + left = DataFrame( + { + "join_col": [1, 3, 5], + "left_val": [1, 2, 3], + } + ) + right = DataFrame( + { + "join_col": [2, 3, 4], + "right_val": [1, 2, 3], + } + ) + left = left.astype({"join_col": dtype}) + right = right.astype({"join_col": dtype}) + result = merge(left, right, how="left_anti", on="join_col") + expected = DataFrame( + { + "join_col": [1, 5], + "left_val": [1, 3], + "right_val": [np.nan, np.nan], + }, + index=[0, 2], + ) + expected = expected.astype({"join_col": dtype}) + tm.assert_frame_equal(result, expected) + + +def test_merge_antijoin_empty_dataframe(): + left = DataFrame({"A": [], "B": []}) + right = DataFrame({"C": [], "D": []}) + + result = merge(left, right, how="left_anti", left_on="A", right_on="C") + expected = DataFrame({"A": [], "B": [], "C": [], "D": []}) + tm.assert_frame_equal(result, expected) + + result = merge(left, right, how="right_anti", left_on="A", right_on="C") + tm.assert_frame_equal(result, expected) + + +def test_merge_antijoin_no_common_elements(): + left = DataFrame({"A": [1, 2, 3]}) + right = DataFrame({"B": [4, 5, 6]}) + + result = merge(left, right, how="left_anti", left_on="A", right_on="B") + expected = DataFrame({"A": [1, 2, 3], "B": [np.nan, np.nan, np.nan]}) + tm.assert_frame_equal(result, expected) + + result = merge(left, right, how="right_anti", left_on="A", right_on="B") + expected = DataFrame({"A": [np.nan, np.nan, np.nan], "B": [4, 5, 6]}) + tm.assert_frame_equal(result, expected) 
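For context on the new left_anti/right_anti semantics these tests pin down: a left anti-join keeps only the left rows whose key has no match in the right, with the right-hand columns filled with NaN. A minimal sketch of the same expectation using only the long-standing merge-indicator idiom available in stable pandas (the frames mirror test_merge_antijoin above; `merged` and `anti` are illustrative names, not part of this diff):

import pandas as pd

left = pd.DataFrame({"A": [1, 2, 3]}, index=["a", "b", "c"])
right = pd.DataFrame({"B": [1, 2, 4]}, index=["a", "b", "d"])

# Emulate how="left_anti": do a left join with the merge indicator, then
# keep only the rows that found no match on the right.
merged = left.merge(
    right, left_index=True, right_index=True, how="left", indicator=True
)
anti = merged[merged["_merge"] == "left_only"].drop(columns="_merge")
# anti has index ["c"] with A == 3 and B == NaN, matching the expected
# frame in test_merge_antijoin above. The right_anti case is symmetric:
# filter on "right_only" after a right join.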
+ + +def test_merge_antijoin_with_null_values(): + left = DataFrame({"A": [1.0, 2.0, None, 4.0]}) + right = DataFrame({"B": [2.0, None, 5.0]}) + + result = merge(left, right, how="left_anti", left_on="A", right_on="B") + expected = DataFrame({"A": [1.0, 4.0], "B": [np.nan, np.nan]}, index=[0, 3]) + tm.assert_frame_equal(result, expected) + + result = merge(left, right, how="right_anti", left_on="A", right_on="B") + expected = DataFrame({"A": [np.nan], "B": [5.0]}, index=[2]) + tm.assert_frame_equal(result, expected) + + +def test_merge_antijoin_with_mixed_dtypes(): + left = DataFrame({"A": [1, "2", 3.0]}) + right = DataFrame({"B": ["2", 3.0, 4]}) + + result = merge(left, right, how="left_anti", left_on="A", right_on="B") + expected = DataFrame({"A": [1], "B": [np.nan]}, dtype=object) + tm.assert_frame_equal(result, expected) + + result = merge(left, right, how="right_anti", left_on="A", right_on="B") + expected = DataFrame({"A": [np.nan], "B": [4]}, dtype=object, index=[2]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index b656191cc739d..f7b0876c5a605 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -2,7 +2,6 @@ import numpy as np import pytest -import pytz import pandas.util._test_decorators as td @@ -18,14 +17,6 @@ from pandas.core.reshape.merge import MergeError -@pytest.fixture(params=["s", "ms", "us", "ns"]) -def unit(request): - """ - Resolution for datetimelike dtypes. - """ - return request.param - - class TestAsOfMerge: def prep_data(self, df, dedupe=False): if dedupe: @@ -612,546 +603,66 @@ def tolerance(self): df["ask"] = df["ask"].astype("float64") return self.prep_data(df) - @pytest.fixture - def allow_exact_matches(self, datapath): - df = pd.DataFrame( - [ - [ - "20160525 13:30:00.023", + def test_examples1(self): + """doc-string examples""" + left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) + right = pd.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]}) + + expected = pd.DataFrame( + {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [1, 3, 7]} + ) + + result = merge_asof(left, right, on="a") + tm.assert_frame_equal(result, expected) + + def test_examples2(self, unit): + """doc-string examples""" + if unit == "s": + pytest.skip( + "This test is invalid for unit='s' because that would " + "round the trades['time']]" + ) + trades = pd.DataFrame( + { + "time": to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.038", + "20160525 13:30:00.048", + "20160525 13:30:00.048", + "20160525 13:30:00.048", + ] + ).astype(f"M8[{unit}]"), + "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], + "price": [51.95, 51.95, 720.77, 720.92, 98.00], + "quantity": [75, 155, 100, 100, 100], + }, + columns=["time", "ticker", "price", "quantity"], + ) + + quotes = pd.DataFrame( + { + "time": to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.023", + "20160525 13:30:00.030", + "20160525 13:30:00.041", + "20160525 13:30:00.048", + "20160525 13:30:00.049", + "20160525 13:30:00.072", + "20160525 13:30:00.075", + ] + ).astype(f"M8[{unit}]"), + "ticker": [ + "GOOG", + "MSFT", "MSFT", - "51.95", - "75", - "NASDAQ", - np.nan, - np.nan, - ], - [ - "20160525 13:30:00.038", "MSFT", - "51.95", - "155", - "NASDAQ", - "51.95", - "51.95", - ], - [ - "20160525 13:30:00.048", - "GOOG", - "720.77", - "100", - "NASDAQ", - "720.5", - "720.93", - ], - [ - "20160525 13:30:00.048", - "GOOG", 
- "720.92", - "100", - "NASDAQ", - "720.5", - "720.93", - ], - [ - "20160525 13:30:00.048", - "GOOG", - "720.93", - "200", - "NASDAQ", - "720.5", - "720.93", - ], - [ - "20160525 13:30:00.048", - "GOOG", - "720.93", - "300", - "NASDAQ", - "720.5", - "720.93", - ], - [ - "20160525 13:30:00.048", - "GOOG", - "720.93", - "600", - "NASDAQ", - "720.5", - "720.93", - ], - [ - "20160525 13:30:00.048", "GOOG", - "720.93", - "44", - "NASDAQ", - "720.5", - "720.93", - ], - [ - "20160525 13:30:00.074", - "AAPL", - "98.67", - "478343", - "NASDAQ", - np.nan, - np.nan, - ], - [ - "20160525 13:30:00.075", - "AAPL", - "98.67", - "478343", - "NASDAQ", - np.nan, - np.nan, - ], - [ - "20160525 13:30:00.075", - "AAPL", - "98.66", - "6", - "NASDAQ", - np.nan, - np.nan, - ], - [ - "20160525 13:30:00.075", - "AAPL", - "98.65", - "30", - "NASDAQ", - np.nan, - np.nan, - ], - [ - "20160525 13:30:00.075", - "AAPL", - "98.65", - "75", - "NASDAQ", - np.nan, - np.nan, - ], - [ - "20160525 13:30:00.075", - "AAPL", - "98.65", - "20", - "NASDAQ", - np.nan, - np.nan, - ], - [ - "20160525 13:30:00.075", - "AAPL", - "98.65", - "35", - "NASDAQ", - np.nan, - np.nan, - ], - [ - "20160525 13:30:00.075", - "AAPL", - "98.65", - "10", - "NASDAQ", - np.nan, - np.nan, - ], - ["20160525 13:30:00.075", "AAPL", "98.55", "6", "ARCA", np.nan, np.nan], - ["20160525 13:30:00.075", "AAPL", "98.55", "6", "ARCA", np.nan, np.nan], - [ - "20160525 13:30:00.076", - "AAPL", - "98.56", - "1000", - "ARCA", - "98.55", - "98.56", - ], - [ - "20160525 13:30:00.076", - "AAPL", - "98.56", - "200", - "ARCA", - "98.55", - "98.56", - ], - [ - "20160525 13:30:00.076", - "AAPL", - "98.56", - "300", - "ARCA", - "98.55", - "98.56", - ], - [ - "20160525 13:30:00.076", - "AAPL", - "98.56", - "400", - "ARCA", - "98.55", - "98.56", - ], - [ - "20160525 13:30:00.076", "AAPL", - "98.56", - "600", - "ARCA", - "98.55", - "98.56", - ], - [ - "20160525 13:30:00.076", - "AAPL", - "98.56", - "200", - "ARCA", - "98.55", - "98.56", - ], - [ - "20160525 13:30:00.078", - "MSFT", - "51.95", - "783", - "NASDAQ", - "51.95", - "51.95", - ], - [ - "20160525 13:30:00.078", - "MSFT", - "51.95", - "100", - "NASDAQ", - "51.95", - "51.95", - ], - [ - "20160525 13:30:00.078", - "MSFT", - "51.95", - "100", - "NASDAQ", - "51.95", - "51.95", - ], - ], - columns="time,ticker,price,quantity,marketCenter,bid,ask".split(","), - ) - df["price"] = df["price"].astype("float64") - df["quantity"] = df["quantity"].astype("int64") - df["bid"] = df["bid"].astype("float64") - df["ask"] = df["ask"].astype("float64") - return self.prep_data(df) - - @pytest.fixture - def allow_exact_matches_and_tolerance(self): - df = pd.DataFrame( - [ - [ - "20160525 13:30:00.023", - "MSFT", - "51.95", - "75", - "NASDAQ", - np.nan, - np.nan, - ], - [ - "20160525 13:30:00.038", - "MSFT", - "51.95", - "155", - "NASDAQ", - "51.95", - "51.95", - ], - [ - "20160525 13:30:00.048", - "GOOG", - "720.77", - "100", - "NASDAQ", - "720.5", - "720.93", - ], - [ - "20160525 13:30:00.048", - "GOOG", - "720.92", - "100", - "NASDAQ", - "720.5", - "720.93", - ], - [ - "20160525 13:30:00.048", - "GOOG", - "720.93", - "200", - "NASDAQ", - "720.5", - "720.93", - ], - [ - "20160525 13:30:00.048", - "GOOG", - "720.93", - "300", - "NASDAQ", - "720.5", - "720.93", - ], - [ - "20160525 13:30:00.048", - "GOOG", - "720.93", - "600", - "NASDAQ", - "720.5", - "720.93", - ], - [ - "20160525 13:30:00.048", - "GOOG", - "720.93", - "44", - "NASDAQ", - "720.5", - "720.93", - ], - [ - "20160525 13:30:00.074", - "AAPL", - "98.67", - "478343", - "NASDAQ", - 
np.nan, - np.nan, - ], - [ - "20160525 13:30:00.075", - "AAPL", - "98.67", - "478343", - "NASDAQ", - np.nan, - np.nan, - ], - [ - "20160525 13:30:00.075", - "AAPL", - "98.66", - "6", - "NASDAQ", - np.nan, - np.nan, - ], - [ - "20160525 13:30:00.075", - "AAPL", - "98.65", - "30", - "NASDAQ", - np.nan, - np.nan, - ], - [ - "20160525 13:30:00.075", - "AAPL", - "98.65", - "75", - "NASDAQ", - np.nan, - np.nan, - ], - [ - "20160525 13:30:00.075", - "AAPL", - "98.65", - "20", - "NASDAQ", - np.nan, - np.nan, - ], - [ - "20160525 13:30:00.075", - "AAPL", - "98.65", - "35", - "NASDAQ", - np.nan, - np.nan, - ], - [ - "20160525 13:30:00.075", - "AAPL", - "98.65", - "10", - "NASDAQ", - np.nan, - np.nan, - ], - ["20160525 13:30:00.075", "AAPL", "98.55", "6", "ARCA", np.nan, np.nan], - ["20160525 13:30:00.075", "AAPL", "98.55", "6", "ARCA", np.nan, np.nan], - [ - "20160525 13:30:00.076", - "AAPL", - "98.56", - "1000", - "ARCA", - "98.55", - "98.56", - ], - [ - "20160525 13:30:00.076", - "AAPL", - "98.56", - "200", - "ARCA", - "98.55", - "98.56", - ], - [ - "20160525 13:30:00.076", - "AAPL", - "98.56", - "300", - "ARCA", - "98.55", - "98.56", - ], - [ - "20160525 13:30:00.076", - "AAPL", - "98.56", - "400", - "ARCA", - "98.55", - "98.56", - ], - [ - "20160525 13:30:00.076", - "AAPL", - "98.56", - "600", - "ARCA", - "98.55", - "98.56", - ], - [ - "20160525 13:30:00.076", - "AAPL", - "98.56", - "200", - "ARCA", - "98.55", - "98.56", - ], - [ - "20160525 13:30:00.078", - "MSFT", - "51.95", - "783", - "NASDAQ", - "51.95", - "51.95", - ], - [ - "20160525 13:30:00.078", - "MSFT", - "51.95", - "100", - "NASDAQ", - "51.95", - "51.95", - ], - [ - "20160525 13:30:00.078", - "MSFT", - "51.95", - "100", - "NASDAQ", - "51.95", - "51.95", - ], - ], - columns="time,ticker,price,quantity,marketCenter,bid,ask".split(","), - ) - df["price"] = df["price"].astype("float64") - df["quantity"] = df["quantity"].astype("int64") - df["bid"] = df["bid"].astype("float64") - df["ask"] = df["ask"].astype("float64") - return self.prep_data(df) - - def test_examples1(self): - """doc-string examples""" - left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) - right = pd.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]}) - - expected = pd.DataFrame( - {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [1, 3, 7]} - ) - - result = merge_asof(left, right, on="a") - tm.assert_frame_equal(result, expected) - - def test_examples2(self, unit): - """doc-string examples""" - if unit == "s": - pytest.skip( - "This test is invalid for unit='s' because that would " - "round the trades['time']]" - ) - trades = pd.DataFrame( - { - "time": to_datetime( - [ - "20160525 13:30:00.023", - "20160525 13:30:00.038", - "20160525 13:30:00.048", - "20160525 13:30:00.048", - "20160525 13:30:00.048", - ] - ).astype(f"M8[{unit}]"), - "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], - "price": [51.95, 51.95, 720.77, 720.92, 98.00], - "quantity": [75, 155, 100, 100, 100], - }, - columns=["time", "ticker", "price", "quantity"], - ) - - quotes = pd.DataFrame( - { - "time": to_datetime( - [ - "20160525 13:30:00.023", - "20160525 13:30:00.023", - "20160525 13:30:00.030", - "20160525 13:30:00.041", - "20160525 13:30:00.048", - "20160525 13:30:00.049", - "20160525 13:30:00.072", - "20160525 13:30:00.075", - ] - ).astype(f"M8[{unit}]"), - "ticker": [ - "GOOG", - "MSFT", - "MSFT", - "MSFT", - "GOOG", - "AAPL", - "GOOG", - "MSFT", + "GOOG", + "MSFT", ], "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], "ask": [720.93, 
51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03], @@ -2559,7 +2070,7 @@ def test_tolerance_tz(self, unit): start=to_datetime("2016-01-02"), freq="D", periods=5, - tz=pytz.timezone("UTC"), + tz=datetime.timezone.utc, unit=unit, ), "value1": np.arange(5), @@ -2571,7 +2082,7 @@ def test_tolerance_tz(self, unit): start=to_datetime("2016-01-01"), freq="D", periods=5, - tz=pytz.timezone("UTC"), + tz=datetime.timezone.utc, unit=unit, ), "value2": list("ABCDE"), @@ -2585,7 +2096,7 @@ def test_tolerance_tz(self, unit): start=to_datetime("2016-01-02"), freq="D", periods=5, - tz=pytz.timezone("UTC"), + tz=datetime.timezone.utc, unit=unit, ), "value1": np.arange(5), @@ -2628,11 +2139,247 @@ def test_index_tolerance(self, trades, quotes, tolerance): ) tm.assert_frame_equal(result, expected) - def test_allow_exact_matches(self, trades, quotes, allow_exact_matches): + def test_allow_exact_matches(self, trades, quotes): result = merge_asof( trades, quotes, on="time", by="ticker", allow_exact_matches=False ) - expected = allow_exact_matches + df = pd.DataFrame( + [ + [ + "20160525 13:30:00.023", + "MSFT", + "51.95", + "75", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.038", + "MSFT", + "51.95", + "155", + "NASDAQ", + "51.95", + "51.95", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.77", + "100", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.92", + "100", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "200", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "300", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "600", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "44", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.074", + "AAPL", + "98.67", + "478343", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.67", + "478343", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.66", + "6", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "30", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "75", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "20", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "35", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "10", + "NASDAQ", + np.nan, + np.nan, + ], + ["20160525 13:30:00.075", "AAPL", "98.55", "6", "ARCA", np.nan, np.nan], + ["20160525 13:30:00.075", "AAPL", "98.55", "6", "ARCA", np.nan, np.nan], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "1000", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "200", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "300", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "400", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "600", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "200", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.078", + "MSFT", + "51.95", + "783", + "NASDAQ", + "51.95", + "51.95", + ], + [ + "20160525 13:30:00.078", + "MSFT", + "51.95", + 
"100", + "NASDAQ", + "51.95", + "51.95", + ], + [ + "20160525 13:30:00.078", + "MSFT", + "51.95", + "100", + "NASDAQ", + "51.95", + "51.95", + ], + ], + columns="time,ticker,price,quantity,marketCenter,bid,ask".split(","), + ) + df["price"] = df["price"].astype("float64") + df["quantity"] = df["quantity"].astype("int64") + df["bid"] = df["bid"].astype("float64") + df["ask"] = df["ask"].astype("float64") + expected = self.prep_data(df) tm.assert_frame_equal(result, expected) def test_allow_exact_matches_forward(self): @@ -2665,9 +2412,7 @@ def test_allow_exact_matches_nearest(self): ) tm.assert_frame_equal(result, expected) - def test_allow_exact_matches_and_tolerance( - self, trades, quotes, allow_exact_matches_and_tolerance - ): + def test_allow_exact_matches_and_tolerance(self, trades, quotes): result = merge_asof( trades, quotes, @@ -2676,7 +2421,243 @@ def test_allow_exact_matches_and_tolerance( tolerance=Timedelta("100ms"), allow_exact_matches=False, ) - expected = allow_exact_matches_and_tolerance + df = pd.DataFrame( + [ + [ + "20160525 13:30:00.023", + "MSFT", + "51.95", + "75", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.038", + "MSFT", + "51.95", + "155", + "NASDAQ", + "51.95", + "51.95", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.77", + "100", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.92", + "100", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "200", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "300", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "600", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "44", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.074", + "AAPL", + "98.67", + "478343", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.67", + "478343", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.66", + "6", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "30", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "75", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "20", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "35", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "10", + "NASDAQ", + np.nan, + np.nan, + ], + ["20160525 13:30:00.075", "AAPL", "98.55", "6", "ARCA", np.nan, np.nan], + ["20160525 13:30:00.075", "AAPL", "98.55", "6", "ARCA", np.nan, np.nan], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "1000", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "200", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "300", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "400", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "600", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "200", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.078", + "MSFT", + "51.95", + "783", + "NASDAQ", + "51.95", + "51.95", + ], + [ + "20160525 13:30:00.078", + "MSFT", + "51.95", + "100", + "NASDAQ", + 
"51.95", + "51.95", + ], + [ + "20160525 13:30:00.078", + "MSFT", + "51.95", + "100", + "NASDAQ", + "51.95", + "51.95", + ], + ], + columns="time,ticker,price,quantity,marketCenter,bid,ask".split(","), + ) + df["price"] = df["price"].astype("float64") + df["quantity"] = df["quantity"].astype("int64") + df["bid"] = df["bid"].astype("float64") + df["ask"] = df["ask"].astype("float64") + expected = self.prep_data(df) tm.assert_frame_equal(result, expected) def test_allow_exact_matches_and_tolerance2(self): @@ -3117,7 +3098,7 @@ def test_merge_groupby_multiple_column_with_categorical_column(self): tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( - "func", [lambda x: x, lambda x: to_datetime(x)], ids=["numeric", "datetime"] + "func", [lambda x: x, to_datetime], ids=["numeric", "datetime"] ) @pytest.mark.parametrize("side", ["left", "right"]) def test_merge_on_nans(self, func, side): @@ -3134,7 +3115,7 @@ def test_merge_on_nans(self, func, side): else: merge_asof(df, df_null, on="a") - def test_by_nullable(self, any_numeric_ea_dtype): + def test_by_nullable(self, any_numeric_ea_dtype, using_infer_string): # Note: this test passes if instead of using pd.array we use # np.array([np.nan, 1]). Other than that, I (@jbrockmendel) # have NO IDEA what the expected behavior is. @@ -3176,6 +3157,8 @@ def test_by_nullable(self, any_numeric_ea_dtype): } ) expected["value_y"] = np.array([np.nan, np.nan, np.nan], dtype=object) + if using_infer_string: + expected["value_y"] = expected["value_y"].astype("str") tm.assert_frame_equal(result, expected) def test_merge_by_col_tz_aware(self): @@ -3201,7 +3184,7 @@ def test_merge_by_col_tz_aware(self): ) tm.assert_frame_equal(result, expected) - def test_by_mixed_tz_aware(self): + def test_by_mixed_tz_aware(self, using_infer_string): # GH 26649 left = pd.DataFrame( { @@ -3225,6 +3208,8 @@ def test_by_mixed_tz_aware(self): columns=["by_col1", "by_col2", "on_col", "value_x"], ) expected["value_y"] = np.array([np.nan], dtype=object) + if using_infer_string: + expected["value_y"] = expected["value_y"].astype("str") tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype", ["float64", "int16", "m8[ns]", "M8[us]"]) diff --git a/pandas/tests/reshape/merge/test_merge_cross.py b/pandas/tests/reshape/merge/test_merge_cross.py index 14f9036e43fce..6ab80cf0e0823 100644 --- a/pandas/tests/reshape/merge/test_merge_cross.py +++ b/pandas/tests/reshape/merge/test_merge_cross.py @@ -42,8 +42,7 @@ def test_merge_cross_error_reporting(kwargs): left = DataFrame({"a": [1, 3]}) right = DataFrame({"b": [3, 4]}) msg = ( - "Can not pass on, right_on, left_on or set right_index=True or " - "left_index=True" + "Can not pass on, right_on, left_on or set right_index=True or left_index=True" ) with pytest.raises(MergeError, match=msg): merge(left, right, how="cross", **kwargs) @@ -94,8 +93,7 @@ def test_join_cross_error_reporting(): left = DataFrame({"a": [1, 3]}) right = DataFrame({"a": [3, 4]}) msg = ( - "Can not pass on, right_on, left_on or set right_index=True or " - "left_index=True" + "Can not pass on, right_on, left_on or set right_index=True or left_index=True" ) with pytest.raises(MergeError, match=msg): left.join(right, how="cross", on="a") diff --git a/pandas/tests/reshape/merge/test_merge_ordered.py b/pandas/tests/reshape/merge/test_merge_ordered.py index abd61026b4e37..7deecf6bbf4a4 100644 --- a/pandas/tests/reshape/merge/test_merge_ordered.py +++ b/pandas/tests/reshape/merge/test_merge_ordered.py @@ -135,54 +135,50 @@ def test_doc_example(self): 
"left, right, on, left_by, right_by, expected", [ ( - DataFrame({"G": ["g", "g"], "H": ["h", "h"], "T": [1, 3]}), - DataFrame({"T": [2], "E": [1]}), + {"G": ["g", "g"], "H": ["h", "h"], "T": [1, 3]}, + {"T": [2], "E": [1]}, ["T"], ["G", "H"], None, - DataFrame( - { - "G": ["g"] * 3, - "H": ["h"] * 3, - "T": [1, 2, 3], - "E": [np.nan, 1.0, np.nan], - } - ), + { + "G": ["g"] * 3, + "H": ["h"] * 3, + "T": [1, 2, 3], + "E": [np.nan, 1.0, np.nan], + }, ), ( - DataFrame({"G": ["g", "g"], "H": ["h", "h"], "T": [1, 3]}), - DataFrame({"T": [2], "E": [1]}), + {"G": ["g", "g"], "H": ["h", "h"], "T": [1, 3]}, + {"T": [2], "E": [1]}, "T", ["G", "H"], None, - DataFrame( - { - "G": ["g"] * 3, - "H": ["h"] * 3, - "T": [1, 2, 3], - "E": [np.nan, 1.0, np.nan], - } - ), + { + "G": ["g"] * 3, + "H": ["h"] * 3, + "T": [1, 2, 3], + "E": [np.nan, 1.0, np.nan], + }, ), ( - DataFrame({"T": [2], "E": [1]}), - DataFrame({"G": ["g", "g"], "H": ["h", "h"], "T": [1, 3]}), + {"T": [2], "E": [1]}, + {"G": ["g", "g"], "H": ["h", "h"], "T": [1, 3]}, ["T"], None, ["G", "H"], - DataFrame( - { - "T": [1, 2, 3], - "E": [np.nan, 1.0, np.nan], - "G": ["g"] * 3, - "H": ["h"] * 3, - } - ), + { + "T": [1, 2, 3], + "E": [np.nan, 1.0, np.nan], + "G": ["g"] * 3, + "H": ["h"] * 3, + }, ), ], ) def test_list_type_by(self, left, right, on, left_by, right_by, expected): # GH 35269 + left = DataFrame(left) + right = DataFrame(right) result = merge_ordered( left=left, right=right, @@ -190,6 +186,7 @@ def test_list_type_by(self, left, right, on, left_by, right_by, expected): left_by=left_by, right_by=right_by, ) + expected = DataFrame(expected) tm.assert_frame_equal(result, expected) @@ -219,3 +216,26 @@ def test_ffill_validate_fill_method(self, left, right, invalid_method): ValueError, match=re.escape("fill_method must be 'ffill' or None") ): merge_ordered(left, right, on="key", fill_method=invalid_method) + + def test_ffill_left_merge(self): + # GH 57010 + df1 = DataFrame( + { + "key": ["a", "c", "e", "a", "c", "e"], + "lvalue": [1, 2, 3, 1, 2, 3], + "group": ["a", "a", "a", "b", "b", "b"], + } + ) + df2 = DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]}) + result = merge_ordered( + df1, df2, fill_method="ffill", left_by="group", how="left" + ) + expected = DataFrame( + { + "key": ["a", "c", "e", "a", "c", "e"], + "lvalue": [1, 2, 3, 1, 2, 3], + "group": ["a", "a", "a", "b", "b", "b"], + "rvalue": [np.nan, 2.0, 2.0, np.nan, 2.0, 2.0], + } + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py index 269d3a2b7078e..7ae2fffa04205 100644 --- a/pandas/tests/reshape/merge/test_multi.py +++ b/pandas/tests/reshape/merge/test_multi.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( DataFrame, @@ -9,6 +11,7 @@ RangeIndex, Series, Timestamp, + option_context, ) import pandas._testing as tm from pandas.core.reshape.concat import concat @@ -88,69 +91,69 @@ def test_merge_on_multikey(self, left, right, join_type): tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("sort", [False, True]) - def test_left_join_multi_index(self, sort): - icols = ["1st", "2nd", "3rd"] - - def bind_cols(df): - iord = lambda a: 0 if a != a else ord(a) - f = lambda ts: ts.map(iord) - ord("a") - return f(df["1st"]) + f(df["3rd"]) * 1e2 + df["2nd"].fillna(0) * 10 + @pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] + ) + def 
test_left_join_multi_index(self, sort, infer_string): + with option_context("future.infer_string", infer_string): + icols = ["1st", "2nd", "3rd"] - def run_asserts(left, right, sort): - res = left.join(right, on=icols, how="left", sort=sort) + def bind_cols(df): + iord = lambda a: 0 if a != a else ord(a) + f = lambda ts: ts.map(iord) - ord("a") + return f(df["1st"]) + f(df["3rd"]) * 1e2 + df["2nd"].fillna(0) * 10 - assert len(left) < len(res) + 1 - assert not res["4th"].isna().any() - assert not res["5th"].isna().any() + def run_asserts(left, right, sort): + res = left.join(right, on=icols, how="left", sort=sort) - tm.assert_series_equal(res["4th"], -res["5th"], check_names=False) - result = bind_cols(res.iloc[:, :-2]) - tm.assert_series_equal(res["4th"], result, check_names=False) - assert result.name is None + assert len(left) < len(res) + 1 + assert not res["4th"].isna().any() + assert not res["5th"].isna().any() - if sort: - tm.assert_frame_equal(res, res.sort_values(icols, kind="mergesort")) + tm.assert_series_equal(res["4th"], -res["5th"], check_names=False) + result = bind_cols(res.iloc[:, :-2]) + tm.assert_series_equal(res["4th"], result, check_names=False) + assert result.name is None - out = merge(left, right.reset_index(), on=icols, sort=sort, how="left") + if sort: + tm.assert_frame_equal(res, res.sort_values(icols, kind="mergesort")) - res.index = RangeIndex(len(res)) - tm.assert_frame_equal(out, res) + out = merge(left, right.reset_index(), on=icols, sort=sort, how="left") - lc = list(map(chr, np.arange(ord("a"), ord("z") + 1))) - left = DataFrame( - np.random.default_rng(2).choice(lc, (50, 2)), columns=["1st", "3rd"] - ) - # Explicit cast to float to avoid implicit cast when setting nan - left.insert( - 1, - "2nd", - np.random.default_rng(2).integers(0, 10, len(left)).astype("float"), - ) + res.index = RangeIndex(len(res)) + tm.assert_frame_equal(out, res) - i = np.random.default_rng(2).permutation(len(left)) - right = left.iloc[i].copy() + lc = list(map(chr, np.arange(ord("a"), ord("z") + 1))) + left = DataFrame( + np.random.default_rng(2).choice(lc, (50, 2)), columns=["1st", "3rd"] + ) + # Explicit cast to float to avoid implicit cast when setting nan + left.insert( + 1, + "2nd", + np.random.default_rng(2).integers(0, 10, len(left)).astype("float"), + ) + right = left.sample(frac=1, random_state=np.random.default_rng(2)) - left["4th"] = bind_cols(left) - right["5th"] = -bind_cols(right) - right.set_index(icols, inplace=True) + left["4th"] = bind_cols(left) + right["5th"] = -bind_cols(right) + right.set_index(icols, inplace=True) - run_asserts(left, right, sort) + run_asserts(left, right, sort) - # inject some nulls - left.loc[1::4, "1st"] = np.nan - left.loc[2::5, "2nd"] = np.nan - left.loc[3::6, "3rd"] = np.nan - left["4th"] = bind_cols(left) + # inject some nulls + left.loc[1::4, "1st"] = np.nan + left.loc[2::5, "2nd"] = np.nan + left.loc[3::6, "3rd"] = np.nan + left["4th"] = bind_cols(left) - i = np.random.default_rng(2).permutation(len(left)) - right = left.iloc[i, :-1] - right["5th"] = -bind_cols(right) - right.set_index(icols, inplace=True) + i = np.random.default_rng(2).permutation(len(left)) + right = left.iloc[i, :-1] + right["5th"] = -bind_cols(right) + right.set_index(icols, inplace=True) - run_asserts(left, right, sort) + run_asserts(left, right, sort) - @pytest.mark.parametrize("sort", [False, True]) def test_merge_right_vs_left(self, left, right, sort): # compare left vs right merge with multikey on_cols = ["key1", "key2"] @@ -632,7 +635,7 @@ def 
test_join_multi_levels_outer(self, portfolio, household, expected): axis=0, sort=True, ).reindex(columns=expected.columns) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_index_type=False) def test_join_multi_levels_invalid(self, portfolio, household): portfolio = portfolio.copy() diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py index 136e76986df9d..070c756e8c928 100644 --- a/pandas/tests/reshape/test_crosstab.py +++ b/pandas/tests/reshape/test_crosstab.py @@ -452,14 +452,12 @@ def test_crosstab_normalize_arrays(self): index=Index([1, 2, "All"], name="a", dtype="object"), columns=Index([3, 4, "All"], name="b", dtype="object"), ) - msg = "using DataFrameGroupBy.sum" - with tm.assert_produces_warning(FutureWarning, match=msg): - test_case = crosstab( - df.a, df.b, df.c, aggfunc=np.sum, normalize="all", margins=True - ) + test_case = crosstab( + df.a, df.b, df.c, aggfunc=np.sum, normalize="all", margins=True + ) tm.assert_frame_equal(test_case, norm_sum) - def test_crosstab_with_empties(self, using_array_manager): + def test_crosstab_with_empties(self): # Check handling of empties df = DataFrame( { @@ -484,9 +482,6 @@ def test_crosstab_with_empties(self, using_array_manager): index=Index([1, 2], name="a", dtype="int64"), columns=Index([3, 4], name="b"), ) - if using_array_manager: - # INFO(ArrayManager) column without NaNs can preserve int dtype - nans[3] = nans[3].astype("int64") calculated = crosstab(df.a, df.b, values=df.c, aggfunc="count", normalize=False) tm.assert_frame_equal(nans, calculated) @@ -658,16 +653,14 @@ def test_crosstab_normalize_multiple_columns(self): } ) - msg = "using DataFrameGroupBy.sum" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = crosstab( - [df.A, df.B], - df.C, - values=df.D, - aggfunc=np.sum, - normalize=True, - margins=True, - ) + result = crosstab( + [df.A, df.B], + df.C, + values=df.D, + aggfunc=np.sum, + normalize=True, + margins=True, + ) expected = DataFrame( np.array([0] * 29 + [1], dtype=float).reshape(10, 3), columns=Index(["bar", "foo", "All"], name="C"), diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 0811c69859c0d..63332fe4658e5 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -1,3 +1,5 @@ +from datetime import datetime + import numpy as np import pytest @@ -445,10 +447,16 @@ def test_datetime_bin(conv): Interval(Timestamp(bin_data[1]), Timestamp(bin_data[2])), ] ) - ).astype(CategoricalDtype(ordered=True)) + ) bins = [conv(v) for v in bin_data] result = Series(cut(data, bins=bins)) + + if type(bins[0]) is datetime: + # The bins have microsecond dtype -> so does result + expected = expected.astype("interval[datetime64[us]]") + + expected = expected.astype(CategoricalDtype(ordered=True)) tm.assert_series_equal(result, expected) @@ -461,10 +469,6 @@ def test_datetime_cut(unit, box): data = box(data) result, _ = cut(data, 3, retbins=True) - if box is list: - # We don't (yet) do inference on these, so get nanos - unit = "ns" - if unit == "s": # See https://github.com/pandas-dev/pandas/pull/56101#discussion_r1405325425 # for why we round to 8 seconds instead of 7 @@ -531,24 +535,26 @@ def test_datetime_tz_cut(bins, box): bins = box(bins) result = cut(ser, bins) - expected = Series( - IntervalIndex( - [ - Interval( - Timestamp("2012-12-31 23:57:07.200000", tz=tz), - Timestamp("2013-01-01 16:00:00", tz=tz), - ), - Interval( - Timestamp("2013-01-01 16:00:00", tz=tz), - 
Timestamp("2013-01-02 08:00:00", tz=tz), - ), - Interval( - Timestamp("2013-01-02 08:00:00", tz=tz), - Timestamp("2013-01-03 00:00:00", tz=tz), - ), - ] - ) - ).astype(CategoricalDtype(ordered=True)) + ii = IntervalIndex( + [ + Interval( + Timestamp("2012-12-31 23:57:07.200000", tz=tz), + Timestamp("2013-01-01 16:00:00", tz=tz), + ), + Interval( + Timestamp("2013-01-01 16:00:00", tz=tz), + Timestamp("2013-01-02 08:00:00", tz=tz), + ), + Interval( + Timestamp("2013-01-02 08:00:00", tz=tz), + Timestamp("2013-01-03 00:00:00", tz=tz), + ), + ] + ) + if isinstance(bins, int): + # the dtype is inferred from ser, which has nanosecond unit + ii = ii.astype("interval[datetime64[ns, US/Eastern]]") + expected = Series(ii).astype(CategoricalDtype(ordered=True)) tm.assert_series_equal(result, expected) @@ -727,6 +733,7 @@ def test_cut_with_duplicated_index_lowest_included(): tm.assert_series_equal(result, expected) +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning") def test_cut_with_nonexact_categorical_indices(): # GH 42424 @@ -789,3 +796,17 @@ def test_cut_with_nullable_int64(): result = cut(series, bins=bins) tm.assert_series_equal(result, expected) + + +def test_cut_datetime_array_no_attributeerror(): + # GH 55431 + ser = Series(to_datetime(["2023-10-06 12:00:00+0000", "2023-10-07 12:00:00+0000"])) + + result = cut(ser.array, bins=2) + + categories = result.categories + expected = Categorical.from_codes([0, 1], categories=categories, ordered=True) + + tm.assert_categorical_equal( + result, expected, check_dtype=True, check_category_order=True + ) diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index f9a03222c8057..c7b7992a78232 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -298,32 +298,32 @@ def test_no_prefix_string_cats_contains_get_dummies_NaN_column(): [ pytest.param( "c", - DataFrame({"": ["a", "b", "c"]}), + {"": ["a", "b", "c"]}, id="default_category is a str", ), pytest.param( 1, - DataFrame({"": ["a", "b", 1]}), + {"": ["a", "b", 1]}, id="default_category is a int", ), pytest.param( 1.25, - DataFrame({"": ["a", "b", 1.25]}), + {"": ["a", "b", 1.25]}, id="default_category is a float", ), pytest.param( 0, - DataFrame({"": ["a", "b", 0]}), + {"": ["a", "b", 0]}, id="default_category is a 0", ), pytest.param( False, - DataFrame({"": ["a", "b", False]}), + {"": ["a", "b", False]}, id="default_category is a bool", ), pytest.param( (1, 2), - DataFrame({"": ["a", "b", (1, 2)]}), + {"": ["a", "b", (1, 2)]}, id="default_category is a tuple", ), ], @@ -333,8 +333,9 @@ def test_no_prefix_string_cats_default_category( ): dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]}) result = from_dummies(dummies, default_category=default_category) + expected = DataFrame(expected) if using_infer_string: - expected[""] = expected[""].astype("string[pyarrow_numpy]") + expected[""] = expected[""].astype("str") tm.assert_frame_equal(result, expected) @@ -366,42 +367,45 @@ def test_with_prefix_contains_get_dummies_NaN_column(): [ pytest.param( "x", - DataFrame({"col1": ["a", "b", "x"], "col2": ["x", "a", "c"]}), + {"col1": ["a", "b", "x"], "col2": ["x", "a", "c"]}, id="default_category is a str", ), pytest.param( 0, - DataFrame({"col1": ["a", "b", 0], "col2": [0, "a", "c"]}), + {"col1": ["a", "b", 0], "col2": [0, "a", "c"]}, id="default_category is a 0", ), pytest.param( False, - DataFrame({"col1": ["a", "b", False], "col2": [False, "a", "c"]}), + {"col1": ["a", "b", 
False], "col2": [False, "a", "c"]}, id="default_category is a False", ), pytest.param( {"col2": 1, "col1": 2.5}, - DataFrame({"col1": ["a", "b", 2.5], "col2": [1, "a", "c"]}), + {"col1": ["a", "b", 2.5], "col2": [1, "a", "c"]}, id="default_category is a dict with int and float values", ), pytest.param( {"col2": None, "col1": False}, - DataFrame({"col1": ["a", "b", False], "col2": [None, "a", "c"]}), + {"col1": ["a", "b", False], "col2": [None, "a", "c"]}, id="default_category is a dict with bool and None values", ), pytest.param( {"col2": (1, 2), "col1": [1.25, False]}, - DataFrame({"col1": ["a", "b", [1.25, False]], "col2": [(1, 2), "a", "c"]}), + {"col1": ["a", "b", [1.25, False]], "col2": [(1, 2), "a", "c"]}, id="default_category is a dict with list and tuple values", ), ], ) def test_with_prefix_default_category( - dummies_with_unassigned, default_category, expected + dummies_with_unassigned, default_category, expected, using_infer_string ): result = from_dummies( dummies_with_unassigned, sep="_", default_category=default_category ) + expected = DataFrame(expected) + if using_infer_string: + expected = expected.astype("str") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py index 31260e4dcb7d2..9ce2c925a368b 100644 --- a/pandas/tests/reshape/test_get_dummies.py +++ b/pandas/tests/reshape/test_get_dummies.py @@ -120,7 +120,7 @@ def test_get_dummies_basic_types(self, sparse, dtype, using_infer_string): result = get_dummies(s_df, columns=["a"], sparse=sparse, dtype=dtype) - key = "string" if using_infer_string else "object" + key = "str" if using_infer_string else "object" expected_counts = {"int64": 1, key: 1} expected_counts[dtype_name] = 3 + expected_counts.get(dtype_name, 0) @@ -214,10 +214,10 @@ def test_dataframe_dummies_all_obj(self, df, sparse): tm.assert_frame_equal(result, expected) - def test_dataframe_dummies_string_dtype(self, df, using_infer_string): + def test_dataframe_dummies_string_dtype(self, df, any_string_dtype): # GH44965 df = df[["A", "B"]] - df = df.astype({"A": "object", "B": "string"}) + df = df.astype({"A": "str", "B": any_string_dtype}) result = get_dummies(df) expected = DataFrame( { @@ -228,8 +228,7 @@ def test_dataframe_dummies_string_dtype(self, df, using_infer_string): }, dtype=bool, ) - if not using_infer_string: - # infer_string returns numpy bools + if any_string_dtype == "string" and any_string_dtype.na_value is pd.NA: expected[["B_b", "B_c"]] = expected[["B_b", "B_c"]].astype("boolean") tm.assert_frame_equal(result, expected) @@ -453,19 +452,19 @@ def test_dataframe_dummies_with_categorical(self, df, sparse, dtype): [ ( {"data": DataFrame({"ä": ["a"]})}, - DataFrame({"ä_a": [True]}), + "ä_a", ), ( {"data": DataFrame({"x": ["ä"]})}, - DataFrame({"x_ä": [True]}), + "x_ä", ), ( {"data": DataFrame({"x": ["a"]}), "prefix": "ä"}, - DataFrame({"ä_a": [True]}), + "ä_a", ), ( {"data": DataFrame({"x": ["a"]}), "prefix_sep": "ä"}, - DataFrame({"xäa": [True]}), + "xäa", ), ], ) @@ -473,6 +472,7 @@ def test_dataframe_dummies_unicode(self, get_dummies_kwargs, expected): # GH22084 get_dummies incorrectly encodes unicode characters # in dataframe column names result = get_dummies(**get_dummies_kwargs) + expected = DataFrame({expected: [True]}) tm.assert_frame_equal(result, expected) def test_get_dummies_basic_drop_first(self, sparse): @@ -708,19 +708,17 @@ def test_get_dummies_ea_dtype_dataframe(self, any_numeric_ea_and_arrow_dtype): ) tm.assert_frame_equal(result, expected) - 
@td.skip_if_no("pyarrow") - def test_get_dummies_ea_dtype(self): + @pytest.mark.parametrize("dtype_type", ["string", "category"]) + def test_get_dummies_ea_dtype(self, dtype_type, string_dtype_no_object): # GH#56273 - for dtype, exp_dtype in [ - ("string[pyarrow]", "boolean"), - ("string[pyarrow_numpy]", "bool"), - (CategoricalDtype(Index(["a"], dtype="string[pyarrow]")), "boolean"), - (CategoricalDtype(Index(["a"], dtype="string[pyarrow_numpy]")), "bool"), - ]: - df = DataFrame({"name": Series(["a"], dtype=dtype), "x": 1}) - result = get_dummies(df) - expected = DataFrame({"x": 1, "name_a": Series([True], dtype=exp_dtype)}) - tm.assert_frame_equal(result, expected) + dtype = string_dtype_no_object + exp_dtype = "boolean" if dtype.na_value is pd.NA else "bool" + if dtype_type == "category": + dtype = CategoricalDtype(Index(["a"], dtype)) + df = DataFrame({"name": Series(["a"], dtype=dtype), "x": 1}) + result = get_dummies(df) + expected = DataFrame({"x": 1, "name_a": Series([True], dtype=exp_dtype)}) + tm.assert_frame_equal(result, expected) @td.skip_if_no("pyarrow") def test_get_dummies_arrow_dtype(self): diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index ff9f927597956..95aa5291cb45a 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -19,7 +19,7 @@ def df(): res = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) res["id1"] = (res["A"] > 0).astype(np.int64) @@ -133,25 +133,21 @@ def test_vars_work_with_multiindex(self, df1): ["A"], ["B"], 0, - DataFrame( - { - "A": {0: 1.067683, 1: -1.321405, 2: -0.807333}, - "CAP": {0: "B", 1: "B", 2: "B"}, - "value": {0: -1.110463, 1: 0.368915, 2: 0.08298}, - } - ), + { + "A": {0: 1.067683, 1: -1.321405, 2: -0.807333}, + "CAP": {0: "B", 1: "B", 2: "B"}, + "value": {0: -1.110463, 1: 0.368915, 2: 0.08298}, + }, ), ( ["a"], ["b"], 1, - DataFrame( - { - "a": {0: 1.067683, 1: -1.321405, 2: -0.807333}, - "low": {0: "b", 1: "b", 2: "b"}, - "value": {0: -1.110463, 1: 0.368915, 2: 0.08298}, - } - ), + { + "a": {0: 1.067683, 1: -1.321405, 2: -0.807333}, + "low": {0: "b", 1: "b", 2: "b"}, + "value": {0: -1.110463, 1: 0.368915, 2: 0.08298}, + }, ), ], ) @@ -159,6 +155,7 @@ def test_single_vars_work_with_multiindex( self, id_vars, value_vars, col_level, expected, df1 ): result = df1.melt(id_vars, value_vars, col_level=col_level) + expected = DataFrame(expected) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( @@ -287,13 +284,14 @@ def test_multiindex(self, df1): @pytest.mark.parametrize( "col", [ - pd.Series(date_range("2010", periods=5, tz="US/Pacific")), - pd.Series(["a", "b", "c", "a", "d"], dtype="category"), - pd.Series([0, 1, 0, 0, 0]), + date_range("2010", periods=5, tz="US/Pacific"), + pd.Categorical(["a", "b", "c", "a", "d"]), + [0, 1, 0, 0, 0], ], ) def test_pandas_dtypes(self, col): # GH 15785 + col = pd.Series(col) df = DataFrame( {"klass": range(5), "col": col, "attr1": [1, 0, 0, 0, 0], "attr2": col} ) @@ -349,13 +347,12 @@ def test_melt_missing_columns_raises(self): df.melt(["a", "b", "not_here", "or_there"], ["c", "d"]) # Multiindex melt fails if column is missing from multilevel melt - multi = df.copy() - multi.columns = [list("ABCD"), list("abcd")] + df.columns = [list("ABCD"), list("abcd")] with pytest.raises(KeyError, match=msg): - multi.melt([("E", "a")], [("B", "b")]) + df.melt([("E", "a")], [("B", "b")]) # 
Multiindex fails if column is missing from single level melt with pytest.raises(KeyError, match=msg): - multi.melt(["A"], ["F"], col_level=0) + df.melt(["A"], ["F"], col_level=0) def test_melt_mixed_int_str_id_vars(self): # GH 29718 @@ -364,6 +361,8 @@ def test_melt_mixed_int_str_id_vars(self): expected = DataFrame( {0: ["foo"] * 2, "a": ["bar"] * 2, "variable": list("bd"), "value": [1, 2]} ) + # the df's columns are mixed type and thus object -> preserves object dtype + expected["variable"] = expected["variable"].astype(object) tm.assert_frame_equal(result, expected) def test_melt_mixed_int_str_value_vars(self): @@ -536,6 +535,26 @@ def test_melt_non_scalar_var_name_raises(self): with pytest.raises(ValueError, match=r".* must be a scalar."): df.melt(id_vars=["a"], var_name=[1, 2]) + def test_melt_multiindex_columns_var_name(self): + # GH 58033 + df = DataFrame({("A", "a"): [1], ("A", "b"): [2]}) + + expected = DataFrame( + [("A", "a", 1), ("A", "b", 2)], columns=["first", "second", "value"] + ) + + tm.assert_frame_equal(df.melt(var_name=["first", "second"]), expected) + tm.assert_frame_equal(df.melt(var_name=["first"]), expected[["first", "value"]]) + + def test_melt_multiindex_columns_var_name_too_many(self): + # GH 58033 + df = DataFrame({("A", "a"): [1], ("A", "b"): [2]}) + + with pytest.raises( + ValueError, match="but the dataframe columns only have 2 levels" + ): + df.melt(var_name=["first", "second", "third"]) + class TestLreshape: def test_pairs(self): @@ -1197,11 +1216,10 @@ def test_raise_of_column_name_value(self): ): df.melt(id_vars="value", value_name="value") - @pytest.mark.parametrize("dtype", ["O", "string"]) - def test_missing_stubname(self, dtype): + def test_missing_stubname(self, any_string_dtype): # GH46044 df = DataFrame({"id": ["1", "2"], "a-1": [100, 200], "a-2": [300, 400]}) - df = df.astype({"id": dtype}) + df = df.astype({"id": any_string_dtype}) result = wide_to_long( df, stubnames=["a", "b"], @@ -1217,6 +1235,38 @@ def test_missing_stubname(self, dtype): {"a": [100, 200, 300, 400], "b": [np.nan] * 4}, index=index, ) - new_level = expected.index.levels[0].astype(dtype) + new_level = expected.index.levels[0].astype(any_string_dtype) + if any_string_dtype == "object": + new_level = expected.index.levels[0].astype("str") expected.index = expected.index.set_levels(new_level, level=0) tm.assert_frame_equal(result, expected) + + +def test_wide_to_long_string_columns(string_storage): + # GH 57066 + string_dtype = pd.StringDtype(string_storage, na_value=np.nan) + df = DataFrame( + { + "ID": {0: 1}, + "R_test1": {0: 1}, + "R_test2": {0: 1}, + "R_test3": {0: 2}, + "D": {0: 1}, + } + ) + df.columns = df.columns.astype(string_dtype) + result = wide_to_long( + df, stubnames="R", i="ID", j="UNPIVOTED", sep="_", suffix=".*" + ) + expected = DataFrame( + [[1, 1], [1, 1], [1, 2]], + columns=Index(["D", "R"]), + index=pd.MultiIndex.from_arrays( + [ + [1, 1, 1], + Index(["test1", "test2", "test3"], dtype=string_dtype), + ], + names=["ID", "UNPIVOTED"], + ), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 18a449b4d0c67..2a58815c1cece 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -9,12 +9,13 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype -from pandas.errors import PerformanceWarning +from pandas.compat.numpy import np_version_gte1p25 import pandas as pd from 
pandas import ( + ArrowDtype, Categorical, DataFrame, Grouper, @@ -30,17 +31,6 @@ from pandas.core.reshape.pivot import pivot_table -@pytest.fixture(params=[True, False]) -def dropna(request): - return request.param - - -@pytest.fixture(params=[([0] * 4, [1] * 4), (range(3), range(1, 4))]) -def interval_values(request, closed): - left, right = request.param - return Categorical(pd.IntervalIndex.from_arrays(left, right, closed)) - - class TestPivotTable: @pytest.fixture def data(self): @@ -203,9 +193,9 @@ def test_pivot_table_categorical(self): ["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True ) df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) - msg = "The default value of observed=False is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = pivot_table(df, values="values", index=["A", "B"], dropna=True) + result = pivot_table( + df, values="values", index=["A", "B"], dropna=True, observed=False + ) exp_index = MultiIndex.from_arrays([cat1, cat2], names=["A", "B"]) expected = DataFrame({"values": [1.0, 2.0, 3.0, 4.0]}, index=exp_index) @@ -224,9 +214,9 @@ def test_pivot_table_dropna_categoricals(self, dropna): ) df["A"] = df["A"].astype(CategoricalDtype(categories, ordered=False)) - msg = "The default value of observed=False is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.pivot_table(index="B", columns="A", values="C", dropna=dropna) + result = df.pivot_table( + index="B", columns="A", values="C", dropna=dropna, observed=False + ) expected_columns = Series(["a", "b", "c"], name="A") expected_columns = expected_columns.astype( CategoricalDtype(categories, ordered=False) @@ -256,9 +246,7 @@ def test_pivot_with_non_observable_dropna(self, dropna): } ) - msg = "The default value of observed=False is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.pivot_table(index="A", values="B", dropna=dropna) + result = df.pivot_table(index="A", values="B", dropna=dropna, observed=False) if dropna: values = [2.0, 3.0] codes = [0, 1] @@ -269,9 +257,7 @@ def test_pivot_with_non_observable_dropna(self, dropna): expected = DataFrame( {"B": values}, index=Index( - Categorical.from_codes( - codes, categories=["low", "high"], ordered=dropna - ), + Categorical.from_codes(codes, categories=["low", "high"], ordered=True), name="A", ), ) @@ -291,9 +277,7 @@ def test_pivot_with_non_observable_dropna_multi_cat(self, dropna): } ) - msg = "The default value of observed=False is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.pivot_table(index="A", values="B", dropna=dropna) + result = df.pivot_table(index="A", values="B", dropna=dropna, observed=False) expected = DataFrame( {"B": [2.0, 3.0, 0.0]}, index=Index( @@ -308,13 +292,16 @@ def test_pivot_with_non_observable_dropna_multi_cat(self, dropna): tm.assert_frame_equal(result, expected) - def test_pivot_with_interval_index(self, interval_values, dropna): + @pytest.mark.parametrize( + "left_right", [([0] * 4, [1] * 4), (range(3), range(1, 4))] + ) + def test_pivot_with_interval_index(self, left_right, dropna, closed): # GH 25814 + left, right = left_right + interval_values = Categorical(pd.IntervalIndex.from_arrays(left, right, closed)) df = DataFrame({"A": interval_values, "B": 1}) - msg = "The default value of observed=False is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.pivot_table(index="A", values="B", dropna=dropna) + result = 
df.pivot_table(index="A", values="B", dropna=dropna, observed=False) expected = DataFrame( {"B": 1.0}, index=Index(interval_values.unique(), name="A") ) @@ -335,11 +322,15 @@ def test_pivot_with_interval_index_margins(self): } ) - msg = "The default value of observed=False is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - pivot_tab = pivot_table( - df, index="C", columns="B", values="A", aggfunc="sum", margins=True - ) + pivot_tab = pivot_table( + df, + index="C", + columns="B", + values="A", + aggfunc="sum", + margins=True, + observed=False, + ) result = pivot_tab["All"] expected = Series( @@ -750,18 +741,11 @@ def test_pivot_periods_with_margins(self): result = df.pivot_table(index="a", columns="b", values="x", margins=True) tm.assert_frame_equal(expected, result) - @pytest.mark.parametrize( - "values", - [ - ["baz", "zoo"], - np.array(["baz", "zoo"]), - Series(["baz", "zoo"]), - Index(["baz", "zoo"]), - ], - ) + @pytest.mark.parametrize("box", [list, np.array, Series, Index]) @pytest.mark.parametrize("method", [True, False]) - def test_pivot_with_list_like_values(self, values, method): + def test_pivot_with_list_like_values(self, box, method): # issue #17160 + values = box(["baz", "zoo"]) df = DataFrame( { "foo": ["one", "one", "one", "two", "two", "two"], @@ -836,7 +820,7 @@ def test_pivot_columns_none_raise_error(self): df = DataFrame({"col1": ["a", "b", "c"], "col2": [1, 2, 3], "col3": [1, 2, 3]}) msg = r"pivot\(\) missing 1 required keyword-only argument: 'columns'" with pytest.raises(TypeError, match=msg): - df.pivot(index="col1", values="col3") # pylint: disable=missing-kwoa + df.pivot(index="col1", values="col3") @pytest.mark.xfail( reason="MultiIndexed unstack with tuple names fails with KeyError GH#19966" @@ -903,10 +887,14 @@ def _check_output( result, values_col, data, - index=["A", "B"], - columns=["C"], + index=None, + columns=None, margins_col="All", ): + if index is None: + index = ["A", "B"] + if columns is None: + columns = ["C"] col_margins = result.loc[result.index[:-1], margins_col] expected_col_margins = data.groupby(index)[values_col].mean() tm.assert_series_equal(col_margins, expected_col_margins, check_names=False) @@ -948,12 +936,14 @@ def test_margins(self, data): for value_col in table.columns.levels[0]: self._check_output(table[value_col], value_col, data) - def test_no_col(self, data): + def test_no_col(self, data, using_infer_string): # no col # to help with a buglet data.columns = [k * 2 for k in data.columns] msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): data.pivot_table(index=["AA", "BB"], margins=True, aggfunc="mean") table = data.drop(columns="CC").pivot_table( @@ -1003,7 +993,7 @@ def test_no_col(self, data): ], ) def test_margin_with_only_columns_defined( - self, columns, aggfunc, values, expected_columns + self, columns, aggfunc, values, expected_columns, using_infer_string ): # GH 31016 df = DataFrame( @@ -1027,6 +1017,8 @@ def test_margin_with_only_columns_defined( ) if aggfunc != "sum": msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): df.pivot_table(columns=columns, margins=True, aggfunc=aggfunc) if "B" not in columns: @@ -1090,7 +1082,7 @@ def test_pivot_table_multiindex_only(self, cols): expected = DataFrame( [[4.0, 5.0, 6.0]], 
columns=MultiIndex.from_tuples([(1, 1), (2, 2), (3, 3)], names=cols), - index=Index(["v"], dtype=object), + index=Index(["v"], dtype="str" if cols == ("a", "b") else "object"), ) tm.assert_frame_equal(result, expected) @@ -1277,7 +1269,7 @@ def test_pivot_table_with_margins_set_margin_name(self, margin_name, data): margins_name=margin_name, ) - def test_pivot_timegrouper(self, using_array_manager): + def test_pivot_timegrouper(self): df = DataFrame( { "Branch": "A A A A A A A B".split(), @@ -1331,9 +1323,6 @@ def test_pivot_timegrouper(self, using_array_manager): ) expected.index.name = "Date" expected.columns.name = "Buyer" - if using_array_manager: - # INFO(ArrayManager) column without NaNs can preserve int dtype - expected["Carl"] = expected["Carl"].astype("int64") result = pivot_table( df, @@ -1754,6 +1743,7 @@ def test_daily(self): mask = ts.index.year == y expected[y] = Series(ts.values[mask], index=doy[mask]) expected = DataFrame(expected, dtype=float).T + expected.index = expected.index.astype(np.int32) tm.assert_frame_equal(result, expected) def test_monthly(self): @@ -1769,6 +1759,7 @@ def test_monthly(self): mask = ts.index.year == y expected[y] = Series(ts.values[mask], index=month[mask]) expected = DataFrame(expected, dtype=float).T + expected.index = expected.index.astype(np.int32) tm.assert_frame_equal(result, expected) def test_pivot_table_with_iterator_values(self, data): @@ -1844,9 +1835,9 @@ def test_categorical_margins_category(self, observed): df.y = df.y.astype("category") df.z = df.z.astype("category") - msg = "The default value of observed=False is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - table = df.pivot_table("x", "y", "z", dropna=observed, margins=True) + table = df.pivot_table( + "x", "y", "z", dropna=observed, margins=True, observed=False + ) tm.assert_frame_equal(table, expected) def test_margins_casted_to_float(self): @@ -1908,11 +1899,14 @@ def test_categorical_aggfunc(self, observed): {"C1": ["A", "B", "C", "C"], "C2": ["a", "a", "b", "b"], "V": [1, 2, 3, 4]} ) df["C1"] = df["C1"].astype("category") - msg = "The default value of observed=False is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.pivot_table( - "V", index="C1", columns="C2", dropna=observed, aggfunc="count" - ) + result = df.pivot_table( + "V", + index="C1", + columns="C2", + dropna=observed, + aggfunc="count", + observed=False, + ) expected_index = pd.CategoricalIndex( ["A", "B", "C"], categories=["A", "B", "C"], ordered=False, name="C1" @@ -2069,15 +2063,69 @@ def test_pivot_string_as_func(self): ).rename_axis("A") tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("kwargs", [{"a": 2}, {"a": 2, "b": 3}, {"b": 3, "a": 2}]) + def test_pivot_table_kwargs(self, kwargs): + # GH#57884 + def f(x, a, b=3): + return x.sum() * a + b + + def g(x): + return f(x, **kwargs) + + df = DataFrame( + { + "A": ["good", "bad", "good", "bad", "good"], + "B": ["one", "two", "one", "three", "two"], + "X": [2, 5, 4, 20, 10], + } + ) + result = pivot_table( + df, index="A", columns="B", values="X", aggfunc=f, **kwargs + ) + expected = pivot_table(df, index="A", columns="B", values="X", aggfunc=g) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "kwargs", [{}, {"b": 10}, {"a": 3}, {"a": 3, "b": 10}, {"b": 10, "a": 3}] + ) + def test_pivot_table_kwargs_margin(self, data, kwargs): + # GH#57884 + def f(x, a=5, b=7): + return (x.sum() + b) * a + + def g(x): + return f(x, **kwargs) + + result = 
data.pivot_table( + values="D", + index=["A", "B"], + columns="C", + aggfunc=f, + margins=True, + fill_value=0, + **kwargs, + ) + + expected = data.pivot_table( + values="D", + index=["A", "B"], + columns="C", + aggfunc=g, + margins=True, + fill_value=0, + ) + + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "f, f_numpy", [ ("sum", np.sum), ("mean", np.mean), - ("std", np.std), + ("min", np.min), (["sum", "mean"], [np.sum, np.mean]), - (["sum", "std"], [np.sum, np.std]), - (["std", "mean"], [np.std, np.mean]), + (["sum", "min"], [np.sum, np.min]), + (["max", "mean"], [np.max, np.mean]), ], ) def test_pivot_string_func_vs_func(self, f, f_numpy, data): @@ -2085,14 +2133,20 @@ def test_pivot_string_func_vs_func(self, f, f_numpy, data): # for consistency purposes data = data.drop(columns="C") result = pivot_table(data, index="A", columns="B", aggfunc=f) - ops = "|".join(f) if isinstance(f, list) else f - msg = f"using DataFrameGroupBy.[{ops}]" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = pivot_table(data, index="A", columns="B", aggfunc=f_numpy) + expected = pivot_table(data, index="A", columns="B", aggfunc=f_numpy) + + if not np_version_gte1p25 and isinstance(f_numpy, list): + # Prior to 1.25, np.min/np.max would come through as amin and amax + mapper = {"amin": "min", "amax": "max", "sum": "sum", "mean": "mean"} + expected.columns = expected.columns.map( + lambda x: (mapper[x[0]], x[1], x[2]) + ) tm.assert_frame_equal(result, expected) @pytest.mark.slow - def test_pivot_number_of_levels_larger_than_int32(self, monkeypatch): + def test_pivot_number_of_levels_larger_than_int32( + self, performance_warning, monkeypatch + ): # GH 20601 # GH 26314: Change ValueError to PerformanceWarning class MockUnstacker(reshape_lib._Unstacker): @@ -2108,7 +2162,7 @@ def __init__(self, *args, **kwargs) -> None: ) msg = "The following operation may generate" - with tm.assert_produces_warning(PerformanceWarning, match=msg): + with tm.assert_produces_warning(performance_warning, match=msg): with pytest.raises(Exception, match="Don't compute final result."): df.pivot_table( index="ind1", columns="ind2", values="count", aggfunc="count" @@ -2323,10 +2377,13 @@ def test_pivot_table_with_margins_and_numeric_columns(self): tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("dropna", [True, False]) - def test_pivot_ea_dtype_dropna(self, dropna): + @pytest.mark.parametrize( + "dtype,expected_dtype", [("Int64", "Float64"), ("int64", "float64")] + ) + def test_pivot_ea_dtype_dropna(self, dropna, dtype, expected_dtype): # GH#47477 - df = DataFrame({"x": "a", "y": "b", "age": Series([20, 40], dtype="Int64")}) + # GH#47971 + df = DataFrame({"x": "a", "y": "b", "age": Series([20, 40], dtype=dtype)}) result = df.pivot_table( index="x", columns="y", values="age", aggfunc="mean", dropna=dropna ) @@ -2334,7 +2391,7 @@ def test_pivot_ea_dtype_dropna(self, dropna): [[30]], index=Index(["a"], name="x"), columns=Index(["b"], name="y"), - dtype="Float64", + dtype=expected_dtype, ) tm.assert_frame_equal(result, expected) @@ -2370,7 +2427,7 @@ def test_pivot_table_datetime_warning(self): ) tm.assert_frame_equal(result, expected) - def test_pivot_table_with_mixed_nested_tuples(self, using_array_manager): + def test_pivot_table_with_mixed_nested_tuples(self): # GH 50342 df = DataFrame( { @@ -2434,9 +2491,6 @@ def test_pivot_table_with_mixed_nested_tuples(self, using_array_manager): [["bar", "bar", "foo", "foo"], ["one", "two"] * 2], names=["A", "B"] ), ) - if 
using_array_manager: - # INFO(ArrayManager) column without NaNs can preserve int dtype - expected["small"] = expected["small"].astype("int64") tm.assert_frame_equal(result, expected) def test_pivot_table_aggfunc_nunique_with_different_values(self): @@ -2476,6 +2530,70 @@ def test_pivot_table_aggfunc_nunique_with_different_values(self): tm.assert_frame_equal(result, expected) + def test_pivot_table_index_and_column_keys_with_nan(self, dropna): + # GH#61113 + data = {"row": [None, *range(4)], "col": [*range(4), None], "val": range(5)} + df = DataFrame(data) + result = df.pivot_table(values="val", index="row", columns="col", dropna=dropna) + e_axis = [*range(4), None] + nan = np.nan + e_data = [ + [nan, 1.0, nan, nan, nan], + [nan, nan, 2.0, nan, nan], + [nan, nan, nan, 3.0, nan], + [nan, nan, nan, nan, 4.0], + [0.0, nan, nan, nan, nan], + ] + expected = DataFrame( + data=e_data, + index=Index(data=e_axis, name="row"), + columns=Index(data=e_axis, name="col"), + ) + if dropna: + expected = expected.loc[[0, 1, 2], [1, 2, 3]] + + tm.assert_frame_equal(left=result, right=expected) + + @pytest.mark.parametrize( + "index, columns, e_data, e_index, e_cols", + [ + ( + "Category", + "Value", + [ + [1.0, np.nan, 1.0, np.nan], + [np.nan, 1.0, np.nan, 1.0], + ], + Index(data=["A", "B"], name="Category"), + Index(data=[10, 20, 40, 50], name="Value"), + ), + ( + "Value", + "Category", + [ + [1.0, np.nan], + [np.nan, 1.0], + [1.0, np.nan], + [np.nan, 1.0], + ], + Index(data=[10, 20, 40, 50], name="Value"), + Index(data=["A", "B"], name="Category"), + ), + ], + ids=["values-and-columns", "values-and-index"], + ) + def test_pivot_table_values_as_two_params( + self, index, columns, e_data, e_index, e_cols + ): + # GH#57876 + data = {"Category": ["A", "B", "A", "B"], "Value": [10, 20, 40, 50]} + df = DataFrame(data) + result = df.pivot_table( + index=index, columns=columns, values="Value", aggfunc="count" + ) + expected = DataFrame(data=e_data, index=e_index, columns=e_cols) + tm.assert_frame_equal(result, expected) + class TestPivot: def test_pivot(self): @@ -2524,12 +2642,16 @@ def test_pivot_empty(self): expected = DataFrame(index=[], columns=[]) tm.assert_frame_equal(result, expected, check_names=False) - @pytest.mark.parametrize("dtype", [object, "string"]) - def test_pivot_integer_bug(self, dtype): - df = DataFrame(data=[("A", "1", "A1"), ("B", "2", "B2")], dtype=dtype) + def test_pivot_integer_bug(self, any_string_dtype): + df = DataFrame( + data=[("A", "1", "A1"), ("B", "2", "B2")], dtype=any_string_dtype + ) result = df.pivot(index=1, columns=0, values=2) - tm.assert_index_equal(result.columns, Index(["A", "B"], name=0, dtype=dtype)) + expected_columns = Index(["A", "B"], name=0, dtype=any_string_dtype) + if any_string_dtype == "object": + expected_columns = expected_columns.astype("str") + tm.assert_index_equal(result.columns, expected_columns) def test_pivot_index_none(self): # GH#3962 @@ -2609,9 +2731,13 @@ def test_pivot_columns_not_given(self): # GH#48293 df = DataFrame({"a": [1], "b": 1}) with pytest.raises(TypeError, match="missing 1 required keyword-only argument"): - df.pivot() # pylint: disable=missing-kwoa + df.pivot() - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="None is cast to NaN") + # this still fails because columns=None gets passed down to unstack as level=None + # while at that point None was converted to NaN + @pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" + ) def test_pivot_columns_is_none(self): # GH#48293 df = 
DataFrame({None: [1], "b": 2, "c": 3}) @@ -2627,8 +2753,7 @@ def test_pivot_columns_is_none(self): expected = DataFrame({1: 3}, index=Index([2], name="b")) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="None is cast to NaN") - def test_pivot_index_is_none(self): + def test_pivot_index_is_none(self, using_infer_string): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) @@ -2639,9 +2764,10 @@ def test_pivot_index_is_none(self): result = df.pivot(columns="b", index=None, values="c") expected = DataFrame(3, index=[1], columns=Index([2], name="b")) + if using_infer_string: + expected.index.name = np.nan tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="None is cast to NaN") def test_pivot_values_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) @@ -2712,3 +2838,85 @@ def test_pivot_table_with_margins_and_numeric_column_names(self): index=Index(["a", "b", "All"], name=0), ) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("m", [1, 10]) + def test_unstack_copy(self, m): + # GH#56633 + levels = np.arange(m) + index = MultiIndex.from_product([levels] * 2) + values = np.arange(m * m * 100).reshape(m * m, 100) + df = DataFrame(values, index, np.arange(100)) + df_orig = df.copy() + result = df.unstack(sort=False) + result.iloc[0, 0] = -1 + tm.assert_frame_equal(df, df_orig) + + def test_pivot_empty_with_datetime(self): + # GH#59126 + df = DataFrame( + { + "timestamp": Series([], dtype=pd.DatetimeTZDtype(tz="UTC")), + "category": Series([], dtype=str), + "value": Series([], dtype=str), + } + ) + df_pivoted = df.pivot_table( + index="category", columns="value", values="timestamp" + ) + assert df_pivoted.empty + + def test_pivot_margins_with_none_index(self): + # GH#58722 + df = DataFrame( + { + "x": [1, 1, 2], + "y": [3, 3, 4], + "z": [5, 5, 6], + "w": [7, 8, 9], + } + ) + result = df.pivot_table( + index=None, + columns=["y", "z"], + values="w", + margins=True, + aggfunc="count", + ) + expected = DataFrame( + [[2, 2, 1, 1]], + index=["w"], + columns=MultiIndex( + levels=[[3, 4], [5, 6, "All"]], + codes=[[0, 0, 1, 1], [0, 2, 1, 2]], + names=["y", "z"], + ), + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") + def test_pivot_with_pyarrow_categorical(self): + # GH#53051 + pa = pytest.importorskip("pyarrow") + + df = DataFrame( + {"string_column": ["A", "B", "C"], "number_column": [1, 2, 3]} + ).astype( + { + "string_column": ArrowDtype(pa.dictionary(pa.int32(), pa.string())), + "number_column": "float[pyarrow]", + } + ) + + df = df.pivot(columns=["string_column"], values=["number_column"]) + + multi_index = MultiIndex.from_arrays( + [["number_column", "number_column", "number_column"], ["A", "B", "C"]], + names=(None, "string_column"), + ) + df_expected = DataFrame( + [[1.0, np.nan, np.nan], [np.nan, 2.0, np.nan], [np.nan, np.nan, 3.0]], + columns=multi_index, + ) + tm.assert_frame_equal( + df, df_expected, check_dtype=False, check_column_type=False + ) diff --git a/pandas/tests/reshape/test_pivot_multilevel.py b/pandas/tests/reshape/test_pivot_multilevel.py index 08ef29440825f..af70210b37f3c 100644 --- a/pandas/tests/reshape/test_pivot_multilevel.py +++ b/pandas/tests/reshape/test_pivot_multilevel.py @@ -197,7 +197,7 @@ def test_pivot_list_like_columns( tm.assert_frame_equal(result, expected) -def test_pivot_multiindexed_rows_and_cols(using_array_manager): +def 
test_pivot_multiindexed_rows_and_cols(): # GH 36360 df = pd.DataFrame( @@ -225,9 +225,7 @@ def test_pivot_multiindexed_rows_and_cols(using_array_manager): ), index=Index([0, 1], dtype="int64", name="idx_L0"), ) - if not using_array_manager: - # BlockManager does not preserve the dtypes - expected = expected.astype("float64") + expected = expected.astype("float64") tm.assert_frame_equal(res, expected) @@ -252,3 +250,52 @@ def test_pivot_df_multiindex_index_none(): columns=Index(["label1", "label2"], name="label"), ) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "index, columns, e_data, e_index, e_cols", + [ + ( + "index", + ["col", "value"], + [ + [50.0, np.nan, 100.0, np.nan], + [np.nan, 100.0, np.nan, 200.0], + ], + Index(data=["A", "B"], name="index"), + MultiIndex.from_arrays( + arrays=[[1, 1, 2, 2], [50, 100, 100, 200]], names=["col", "value"] + ), + ), + ( + ["index", "value"], + "col", + [ + [50.0, np.nan], + [np.nan, 100.0], + [100.0, np.nan], + [np.nan, 200.0], + ], + MultiIndex.from_arrays( + arrays=[["A", "A", "B", "B"], [50, 100, 100, 200]], + names=["index", "value"], + ), + Index(data=[1, 2], name="col"), + ), + ], + ids=["values-and-columns", "values-and-index"], +) +def test_pivot_table_multiindex_values_as_two_params( + index, columns, e_data, e_index, e_cols +): + # GH#61292 + data = [ + ["A", 1, 50, -1], + ["B", 1, 100, -2], + ["A", 2, 100, -2], + ["B", 2, 200, -4], + ] + df = pd.DataFrame(data=data, columns=["index", "col", "value", "extra"]) + result = df.pivot_table(values="value", index=index, columns=columns) + expected = pd.DataFrame(data=e_data, index=e_index, columns=e_cols) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_qcut.py b/pandas/tests/reshape/test_qcut.py index b5b19eef1106f..b2e9f26e1c407 100644 --- a/pandas/tests/reshape/test_qcut.py +++ b/pandas/tests/reshape/test_qcut.py @@ -154,14 +154,15 @@ def test_qcut_wrong_length_labels(labels): @pytest.mark.parametrize( "labels, expected", [ - (["a", "b", "c"], Categorical(["a", "b", "c"], ordered=True)), - (list(range(3)), Categorical([0, 1, 2], ordered=True)), + (["a", "b", "c"], ["a", "b", "c"]), + (list(range(3)), [0, 1, 2]), ], ) def test_qcut_list_like_labels(labels, expected): # GH 13318 values = range(3) result = qcut(values, 3, labels=labels) + expected = Categorical(expected, ordered=True) tm.assert_categorical_equal(result, expected) @@ -209,13 +210,14 @@ def test_single_quantile(data, start, end, length, labels): @pytest.mark.parametrize( "ser", [ - Series(DatetimeIndex(["20180101", NaT, "20180103"])), - Series(TimedeltaIndex(["0 days", NaT, "2 days"])), + DatetimeIndex(["20180101", NaT, "20180103"]), + TimedeltaIndex(["0 days", NaT, "2 days"]), ], ids=lambda x: str(x.dtype), ) def test_qcut_nat(ser, unit): # see gh-19768 + ser = Series(ser) ser = ser.dt.as_unit(unit) td = Timedelta(1, unit=unit).as_unit(unit) @@ -269,8 +271,10 @@ def test_datetime_tz_qcut(bins): ], ], ) -def test_date_like_qcut_bins(arg, expected_bins): +def test_date_like_qcut_bins(arg, expected_bins, unit): # see gh-19891 + arg = arg.as_unit(unit) + expected_bins = expected_bins.as_unit(unit) ser = Series(arg) result, result_bins = qcut(ser, 2, retbins=True) tm.assert_index_equal(result_bins, expected_bins) @@ -303,3 +307,15 @@ def test_qcut_nullable_integer(q, any_numeric_ea_dtype): expected = qcut(arr.astype(float), q) tm.assert_categorical_equal(result, expected) + + +@pytest.mark.parametrize("scale", [1.0, 1 / 3, 17.0]) +@pytest.mark.parametrize("q", [3, 7, 9]) 
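+# A rough illustration, with hypothetical values, of the invariant exercised
+# below: with scale=1/3, q=3, precision=1 the input rounds to
+# [0.0, 0.3, 0.7, 1.0], and each value must satisfy ``value in bucket`` for
+# the Interval qcut assigns it, even though qcut rounds the bin edges to the
+# requested precision (the regression in GH-59355 broke this at the edges).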
+@pytest.mark.parametrize("precision", [1, 3, 16]) +def test_qcut_contains(scale, q, precision): + # GH-59355 + arr = (scale * np.arange(q + 1)).round(precision) + result = qcut(arr, q, precision=precision) + + for value, bucket in zip(arr, result): + assert value in bucket diff --git a/pandas/tests/reshape/test_union_categoricals.py b/pandas/tests/reshape/test_union_categoricals.py index 8d78d34e936f0..081feae6fc43f 100644 --- a/pandas/tests/reshape/test_union_categoricals.py +++ b/pandas/tests/reshape/test_union_categoricals.py @@ -126,7 +126,11 @@ def test_union_categoricals_nan(self): def test_union_categoricals_empty(self, val, request, using_infer_string): # GH 13759 if using_infer_string and val == ["1"]: - request.applymarker(pytest.mark.xfail("object and strings dont match")) + request.applymarker( + pytest.mark.xfail( + reason="TDOD(infer_string) object and strings dont match" + ) + ) res = union_categoricals([Categorical([]), Categorical(val)]) exp = Categorical(val) tm.assert_categorical_equal(res, exp) diff --git a/pandas/tests/scalar/period/test_arithmetic.py b/pandas/tests/scalar/period/test_arithmetic.py index 5dc0858de466c..97e486f9af060 100644 --- a/pandas/tests/scalar/period/test_arithmetic.py +++ b/pandas/tests/scalar/period/test_arithmetic.py @@ -476,10 +476,11 @@ def test_period_comparison_nat(self): assert not left >= right @pytest.mark.parametrize( - "zerodim_arr, expected", - ((np.array(0), False), (np.array(Period("2000-01", "M")), True)), + "scalar, expected", + ((0, False), (Period("2000-01", "M"), True)), ) - def test_period_comparison_numpy_zerodim_arr(self, zerodim_arr, expected): + def test_period_comparison_numpy_zerodim_arr(self, scalar, expected): + zerodim_arr = np.array(scalar) per = Period("2000-01", "M") assert (per == zerodim_arr) is expected diff --git a/pandas/tests/scalar/period/test_asfreq.py b/pandas/tests/scalar/period/test_asfreq.py index 4489c307172d7..0ae5389a3e9b5 100644 --- a/pandas/tests/scalar/period/test_asfreq.py +++ b/pandas/tests/scalar/period/test_asfreq.py @@ -59,6 +59,7 @@ def test_asfreq_corner(self): def test_conv_annual(self): # frequency conversion tests: from Annual Frequency + msg = INVALID_FREQ_ERR_MSG ival_A = Period(freq="Y", year=2007) @@ -110,18 +111,14 @@ def test_conv_annual(self): assert ival_A.asfreq("B", "E") == ival_A_to_B_end assert ival_A.asfreq("D", "s") == ival_A_to_D_start assert ival_A.asfreq("D", "E") == ival_A_to_D_end - msg = "'H' is deprecated and will be removed in a future version." - with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(ValueError, match=msg): assert ival_A.asfreq("H", "s") == ival_A_to_H_start assert ival_A.asfreq("H", "E") == ival_A_to_H_end assert ival_A.asfreq("min", "s") == ival_A_to_T_start assert ival_A.asfreq("min", "E") == ival_A_to_T_end - msg = "'T' is deprecated and will be removed in a future version." - with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(ValueError, match=msg): assert ival_A.asfreq("T", "s") == ival_A_to_T_start assert ival_A.asfreq("T", "E") == ival_A_to_T_end - msg = "'S' is deprecated and will be removed in a future version." 
- with tm.assert_produces_warning(FutureWarning, match=msg): assert ival_A.asfreq("S", "S") == ival_A_to_S_start assert ival_A.asfreq("S", "E") == ival_A_to_S_end @@ -824,6 +821,5 @@ def test_asfreq_MS(self): with pytest.raises(ValueError, match=msg): initial.asfreq(freq="MS", how="S") - msg = "MonthBegin is not supported as period frequency" - with pytest.raises(TypeError, match=msg): + with pytest.raises(ValueError, match=msg): Period("2013-01", "MS") diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index aa4a8b152b19f..baaedaa853565 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -3,6 +3,7 @@ datetime, timedelta, ) +import re import numpy as np import pytest @@ -40,25 +41,26 @@ class TestPeriodDisallowedFreqs: ) def test_offsets_not_supported(self, freq, freq_msg): # GH#55785 - msg = f"{freq_msg} is not supported as period frequency" - with pytest.raises(TypeError, match=msg): + msg = re.escape(f"{freq} is not supported as period frequency") + with pytest.raises(ValueError, match=msg): Period(year=2014, freq=freq) def test_custom_business_day_freq_raises(self): # GH#52534 - msg = "CustomBusinessDay is not supported as period frequency" - with pytest.raises(TypeError, match=msg): + msg = "C is not supported as period frequency" + with pytest.raises(ValueError, match=msg): Period("2023-04-10", freq="C") - with pytest.raises(TypeError, match=msg): + msg = f"{offsets.CustomBusinessDay().base} is not supported as period frequency" + with pytest.raises(ValueError, match=msg): Period("2023-04-10", freq=offsets.CustomBusinessDay()) def test_invalid_frequency_error_message(self): - msg = "WeekOfMonth is not supported as period frequency" - with pytest.raises(TypeError, match=msg): + msg = "WOM-1MON is not supported as period frequency" + with pytest.raises(ValueError, match=msg): Period("2012-01-02", freq="WOM-1MON") def test_invalid_frequency_period_error_message(self): - msg = "for Period, please use 'M' instead of 'ME'" + msg = "Invalid frequency: ME" with pytest.raises(ValueError, match=msg): Period("2012-01-02", freq="ME") @@ -106,14 +108,18 @@ def test_construction(self): assert i1 == i3 i1 = Period("1982", freq="min") - i2 = Period("1982", freq="MIN") + msg = "'MIN' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + i2 = Period("1982", freq="MIN") assert i1 == i2 i1 = Period(year=2005, month=3, day=1, freq="D") i2 = Period("3/1/2005", freq="D") assert i1 == i2 - i3 = Period(year=2005, month=3, day=1, freq="d") + msg = "'d' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + i3 = Period(year=2005, month=3, day=1, freq="d") assert i1 == i3 i1 = Period("2007-01-01 09:00:00.001") @@ -609,6 +615,25 @@ def test_period_large_ordinal(self, hour): p = Period(ordinal=2562048 + hour, freq="1h") assert p.hour == hour + @pytest.mark.filterwarnings( + "ignore:Period with BDay freq is deprecated:FutureWarning" + ) + @pytest.mark.parametrize( + "freq,freq_depr", + [("2W", "2w"), ("2W-FRI", "2w-fri"), ("2D", "2d"), ("2B", "2b")], + ) + def test_period_deprecated_lowercase_freq(self, freq, freq_depr): + # GH#58998 + msg = ( + f"'{freq_depr[1:]}' is deprecated and will be removed in a future version." 
+ ) + + with tm.assert_produces_warning(FutureWarning, match=msg): + result = Period("2016-03-01 09:00", freq=freq_depr) + + expected = Period("2016-03-01 09:00", freq=freq) + assert result == expected + class TestPeriodMethods: def test_round_trip(self): @@ -966,7 +991,6 @@ def test_properties_quarterly(self): qedec_date = Period(freq="Q-DEC", year=2007, quarter=1) qejan_date = Period(freq="Q-JAN", year=2007, quarter=1) qejun_date = Period(freq="Q-JUN", year=2007, quarter=1) - # for x in range(3): for qd in (qedec_date, qejan_date, qejun_date): assert (qd + x).qyear == 2007 @@ -991,7 +1015,6 @@ def test_properties_monthly(self): def test_properties_weekly(self): # Test properties on Periods with daily frequency. w_date = Period(freq="W", year=2007, month=1, day=7) - # assert w_date.year == 2007 assert w_date.quarter == 1 assert w_date.month == 1 @@ -1021,7 +1044,6 @@ def test_properties_daily(self): # Test properties on Periods with daily frequency. with tm.assert_produces_warning(FutureWarning, match=bday_msg): b_date = Period(freq="B", year=2007, month=1, day=1) - # assert b_date.year == 2007 assert b_date.quarter == 1 assert b_date.month == 1 @@ -1064,7 +1086,6 @@ def test_properties_hourly(self): def test_properties_minutely(self): # Test properties on Periods with minutely frequency. t_date = Period(freq="Min", year=2007, month=1, day=1, hour=0, minute=0) - # assert t_date.quarter == 1 assert t_date.month == 1 assert t_date.day == 1 @@ -1083,7 +1104,6 @@ def test_properties_secondly(self): s_date = Period( freq="Min", year=2007, month=1, day=1, hour=0, minute=0, second=0 ) - # assert s_date.year == 2007 assert s_date.quarter == 1 assert s_date.month == 1 diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index cb046e0133245..b20df43dd49a6 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -3,10 +3,10 @@ timedelta, ) import operator +import zoneinfo import numpy as np import pytest -import pytz from pandas._libs.tslibs import iNaT from pandas.compat.numpy import np_version_gte1p24p3 @@ -146,7 +146,6 @@ def test_round_nat(klass, method, freq): "utcnow", "utcoffset", "utctimetuple", - "timestamp", ], ) def test_nat_methods_raise(method): @@ -362,7 +361,7 @@ def test_nat_doc_strings(compare): (Timestamp("2014-01-01"), "timestamp"), (Timestamp("2014-01-01", tz="UTC"), "timestamp"), (Timestamp("2014-01-01", tz="US/Eastern"), "timestamp"), - (pytz.timezone("Asia/Tokyo").localize(datetime(2014, 1, 1)), "timestamp"), + (datetime(2014, 1, 1).astimezone(zoneinfo.ZoneInfo("Asia/Tokyo")), "timestamp"), ], ) def test_nat_arithmetic_scalar(op_name, value, val_type): @@ -440,8 +439,10 @@ def test_nat_rfloordiv_timedelta(val, expected): @pytest.mark.parametrize( "value", [ - DatetimeIndex(["2011-01-01", "2011-01-02"], name="x"), - DatetimeIndex(["2011-01-01", "2011-01-02"], tz="US/Eastern", name="x"), + DatetimeIndex(["2011-01-01", "2011-01-02"], dtype="M8[ns]", name="x"), + DatetimeIndex( + ["2011-01-01", "2011-01-02"], dtype="M8[ns, US/Eastern]", name="x" + ), DatetimeArray._from_sequence(["2011-01-01", "2011-01-02"], dtype="M8[ns]"), DatetimeArray._from_sequence( ["2011-01-01", "2011-01-02"], dtype=DatetimeTZDtype(tz="US/Pacific") diff --git a/pandas/tests/scalar/timedelta/methods/test_round.py b/pandas/tests/scalar/timedelta/methods/test_round.py index e54adb27d126b..96cb1c07d2b76 100644 --- a/pandas/tests/scalar/timedelta/methods/test_round.py +++ b/pandas/tests/scalar/timedelta/methods/test_round.py @@ -19,29 +19,31 @@ class 
TestTimedeltaRound: # This first case has s1, s2 being the same as t1,t2 below ( "ns", - Timedelta("1 days 02:34:56.789123456"), - Timedelta("-1 days 02:34:56.789123456"), + "1 days 02:34:56.789123456", + "-1 days 02:34:56.789123456", ), ( "us", - Timedelta("1 days 02:34:56.789123000"), - Timedelta("-1 days 02:34:56.789123000"), + "1 days 02:34:56.789123000", + "-1 days 02:34:56.789123000", ), ( "ms", - Timedelta("1 days 02:34:56.789000000"), - Timedelta("-1 days 02:34:56.789000000"), + "1 days 02:34:56.789000000", + "-1 days 02:34:56.789000000", ), - ("s", Timedelta("1 days 02:34:57"), Timedelta("-1 days 02:34:57")), - ("2s", Timedelta("1 days 02:34:56"), Timedelta("-1 days 02:34:56")), - ("5s", Timedelta("1 days 02:34:55"), Timedelta("-1 days 02:34:55")), - ("min", Timedelta("1 days 02:35:00"), Timedelta("-1 days 02:35:00")), - ("12min", Timedelta("1 days 02:36:00"), Timedelta("-1 days 02:36:00")), - ("h", Timedelta("1 days 03:00:00"), Timedelta("-1 days 03:00:00")), - ("d", Timedelta("1 days"), Timedelta("-1 days")), + ("s", "1 days 02:34:57", "-1 days 02:34:57"), + ("2s", "1 days 02:34:56", "-1 days 02:34:56"), + ("5s", "1 days 02:34:55", "-1 days 02:34:55"), + ("min", "1 days 02:35:00", "-1 days 02:35:00"), + ("12min", "1 days 02:36:00", "-1 days 02:36:00"), + ("h", "1 days 03:00:00", "-1 days 03:00:00"), + ("D", "1 days", "-1 days"), ], ) def test_round(self, freq, s1, s2): + s1 = Timedelta(s1) + s2 = Timedelta(s2) t1 = Timedelta("1 days 02:34:56.789123456") t2 = Timedelta("-1 days 02:34:56.789123456") @@ -170,7 +172,6 @@ def checker(ts, nanos, unit): nanos = 24 * 60 * 60 * 1_000_000_000 checker(td, nanos, "D") - @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) def test_round_non_nano(self, unit): td = Timedelta("1 days 02:34:57").as_unit(unit) diff --git a/pandas/tests/scalar/timedelta/test_arithmetic.py b/pandas/tests/scalar/timedelta/test_arithmetic.py index d2fa0f722ca6f..2183a5851ea9c 100644 --- a/pandas/tests/scalar/timedelta/test_arithmetic.py +++ b/pandas/tests/scalar/timedelta/test_arithmetic.py @@ -1,6 +1,7 @@ """ Tests for scalar Timedelta arithmetic ops """ + from datetime import ( datetime, timedelta, @@ -78,7 +79,7 @@ def test_td_add_sub_one_day_ten_seconds(self, one_day_ten_secs): @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_datetimelike_scalar(self, op): # GH#19738 - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = op(td, datetime(2016, 1, 1)) if op is operator.add: @@ -110,7 +111,7 @@ def test_td_add_timestamp_overflow(self): @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_td(self, op): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = op(td, Timedelta(days=10)) assert isinstance(result, Timedelta) @@ -118,35 +119,35 @@ def test_td_add_td(self, op): @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_pytimedelta(self, op): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = op(td, timedelta(days=9)) assert isinstance(result, Timedelta) assert result == Timedelta(days=19) @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_timedelta64(self, op): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = op(td, np.timedelta64(-4, "D")) assert isinstance(result, Timedelta) assert result == Timedelta(days=6) @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_offset(self, op): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = op(td, offsets.Hour(6)) assert 
isinstance(result, Timedelta) assert result == Timedelta(days=10, hours=6) def test_td_sub_td(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") expected = Timedelta(0, unit="ns") result = td - td assert isinstance(result, Timedelta) assert result == expected def test_td_sub_pytimedelta(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") expected = Timedelta(0, unit="ns") result = td - td.to_pytimedelta() @@ -158,7 +159,7 @@ def test_td_sub_pytimedelta(self): assert result == expected def test_td_sub_timedelta64(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") expected = Timedelta(0, unit="ns") result = td - td.to_timedelta64() @@ -171,12 +172,12 @@ def test_td_sub_timedelta64(self): def test_td_sub_nat(self): # In this context pd.NaT is treated as timedelta-like - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = td - NaT assert result is NaT def test_td_sub_td64_nat(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") td_nat = np.timedelta64("NaT") result = td - td_nat @@ -186,13 +187,13 @@ def test_td_sub_td64_nat(self): assert result is NaT def test_td_sub_offset(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = td - offsets.Hour(1) assert isinstance(result, Timedelta) assert result == Timedelta(239, unit="h") def test_td_add_sub_numeric_raises(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") msg = "unsupported operand type" for other in [2, 2.0, np.int64(2), np.float64(2)]: with pytest.raises(TypeError, match=msg): @@ -233,7 +234,7 @@ def test_td_add_sub_int_ndarray(self): other - td def test_td_rsub_nat(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = NaT - td assert result is NaT @@ -241,7 +242,7 @@ def test_td_rsub_nat(self): assert result is NaT def test_td_rsub_offset(self): - result = offsets.Hour(1) - Timedelta(10, unit="d") + result = offsets.Hour(1) - Timedelta(10, unit="D") assert isinstance(result, Timedelta) assert result == Timedelta(-239, unit="h") @@ -361,7 +362,7 @@ class TestTimedeltaMultiplicationDivision: @pytest.mark.parametrize("op", [operator.mul, ops.rmul]) def test_td_mul_nat(self, op, td_nat): # GH#19819 - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") typs = "|".join(["numpy.timedelta64", "NaTType", "Timedelta"]) msg = "|".join( [ @@ -376,7 +377,7 @@ def test_td_mul_nat(self, op, td_nat): @pytest.mark.parametrize("op", [operator.mul, ops.rmul]) def test_td_mul_nan(self, op, nan): # np.float64('NaN') has a 'dtype' attr, avoid treating as array - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = op(td, nan) assert result is NaT @@ -418,7 +419,7 @@ def test_td_mul_numeric_ndarray(self): def test_td_mul_numeric_ndarray_0d(self): td = Timedelta("1 day") - other = np.array(2) + other = np.array(2, dtype=np.int64) assert other.ndim == 0 expected = Timedelta("2 days") @@ -448,7 +449,7 @@ def test_td_mul_td64_ndarray_invalid(self): def test_td_div_timedeltalike_scalar(self): # GH#19738 - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = td / offsets.Hour(1) assert result == 240 @@ -479,7 +480,7 @@ def test_td_div_td64_non_nano(self): def test_td_div_numeric_scalar(self): # GH#19738 - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = td / 2 assert isinstance(result, Timedelta) @@ -499,7 +500,7 @@ def test_td_div_numeric_scalar(self): ) def test_td_div_nan(self, nan): # np.float64('NaN') has a 'dtype' attr, avoid treating as array - td = 
Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = td / nan assert result is NaT @@ -531,7 +532,7 @@ def test_td_div_ndarray_0d(self): def test_td_rdiv_timedeltalike_scalar(self): # GH#19738 - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = offsets.Hour(1) / td assert result == 1 / 240.0 @@ -539,7 +540,7 @@ def test_td_rdiv_timedeltalike_scalar(self): def test_td_rdiv_na_scalar(self): # GH#31869 None gets cast to NaT - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = NaT / td assert np.isnan(result) @@ -559,7 +560,7 @@ def test_td_rdiv_na_scalar(self): np.nan / td def test_td_rdiv_ndarray(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") arr = np.array([td], dtype=object) result = arr / td @@ -582,7 +583,7 @@ def test_td_rdiv_ndarray(self): arr / td def test_td_rdiv_ndarray_0d(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") arr = np.array(td.asm8) @@ -622,6 +623,7 @@ def test_td_floordiv_invalid_scalar(self): [ r"Invalid dtype datetime64\[D\] for __floordiv__", "'dtype' is an invalid keyword argument for this function", + "this function got an unexpected keyword argument 'dtype'", r"ufunc '?floor_divide'? cannot use operands with types", ] ) @@ -955,11 +957,12 @@ def test_rdivmod_invalid(self): @pytest.mark.parametrize( "arr", [ - np.array([Timestamp("20130101 9:01"), Timestamp("20121230 9:02")]), - np.array([Timestamp("2021-11-09 09:54:00"), Timedelta("1D")]), + [Timestamp("20130101 9:01"), Timestamp("20121230 9:02")], + [Timestamp("2021-11-09 09:54:00"), Timedelta("1D")], ], ) def test_td_op_timedelta_timedeltalike_array(self, op, arr): + arr = np.array(arr) msg = "unsupported operand type|cannot use operands with types" with pytest.raises(TypeError, match=msg): op(arr, Timedelta("1D")) @@ -1178,5 +1181,5 @@ def test_ops_error_str(): with pytest.raises(TypeError, match=msg): left > right - assert not left == right # pylint: disable=unneeded-not + assert not left == right assert left != right diff --git a/pandas/tests/scalar/timedelta/test_constructors.py b/pandas/tests/scalar/timedelta/test_constructors.py index 4663f8cb71961..45caeb1733590 100644 --- a/pandas/tests/scalar/timedelta/test_constructors.py +++ b/pandas/tests/scalar/timedelta/test_constructors.py @@ -35,15 +35,16 @@ def test_unit_m_y_raises(self, unit): @pytest.mark.parametrize( "unit,unit_depr", [ - ("h", "H"), - ("min", "T"), + ("W", "w"), + ("D", "d"), + ("min", "MIN"), ("s", "S"), - ("ms", "L"), - ("ns", "N"), - ("us", "U"), + ("h", "H"), + ("ms", "MS"), + ("us", "US"), ], ) - def test_units_H_T_S_L_N_U_deprecated(self, unit, unit_depr): + def test_unit_deprecated(self, unit, unit_depr): # GH#52536 msg = f"'{unit_depr}' is deprecated and will be removed in a future version." 
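The hunks above tighten which unit spellings `Timedelta` accepts: `D`, `h`, `min`, `s`, `ms`, `us`, `W` are canonical, the alternate spellings (`d`, `H`, `MIN`, `S`, `MS`, `US`, `w`) are deprecated, and the old offset-style aliases `T`/`L`/`U`/`N` are rejected outright. A minimal sketch of the resulting behavior, assuming a pandas build that includes these changes (exact messages are illustrative):

```python
import warnings

import pandas as pd

# Canonical spelling: no warning.
td = pd.Timedelta(10, unit="D")

# Deprecated aliases ("d", "H", "S", "MS", ...) still parse but warn.
with warnings.catch_warnings(record=True) as record:
    warnings.simplefilter("always")
    assert pd.Timedelta(10, unit="d") == td
assert any("'d' is deprecated" in str(w.message) for w in record)

# Removed aliases ("T", "L", "U", "N" and their lowercase forms) raise.
try:
    pd.Timedelta(1, unit="T")
except ValueError as err:
    assert "invalid unit abbreviation" in str(err)
```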
@@ -54,8 +55,8 @@ def test_units_H_T_S_L_N_U_deprecated(self, unit, unit_depr): @pytest.mark.parametrize( "unit, np_unit", - [(value, "W") for value in ["W", "w"]] - + [(value, "D") for value in ["D", "d", "days", "day", "Days", "Day"]] + [("W", "W")] + + [(value, "D") for value in ["D", "days", "day", "Days", "Day"]] + [ (value, "m") for value in [ @@ -88,7 +89,6 @@ def test_units_H_T_S_L_N_U_deprecated(self, unit, unit_depr): "millisecond", "milli", "millis", - "MS", "Milliseconds", "Millisecond", "Milli", @@ -103,13 +103,10 @@ def test_units_H_T_S_L_N_U_deprecated(self, unit, unit_depr): "microsecond", "micro", "micros", - "u", - "US", "Microseconds", "Microsecond", "Micro", "Micros", - "U", ] ] + [ @@ -120,13 +117,10 @@ def test_units_H_T_S_L_N_U_deprecated(self, unit, unit_depr): "nanosecond", "nano", "nanos", - "n", - "NS", "Nanoseconds", "Nanosecond", "Nano", "Nanos", - "N", ] ], ) @@ -139,36 +133,39 @@ def test_unit_parser(self, unit, np_unit, wrapper): dtype="m8[ns]", ) # TODO(2.0): the desired output dtype may have non-nano resolution - msg = f"'{unit}' is deprecated and will be removed in a future version." - - if (unit, np_unit) in (("u", "us"), ("U", "us"), ("n", "ns"), ("N", "ns")): - warn = FutureWarning - else: - warn = FutureWarning - msg = "The 'unit' keyword in TimedeltaIndex construction is deprecated" - with tm.assert_produces_warning(warn, match=msg): - result = to_timedelta(wrapper(range(5)), unit=unit) - tm.assert_index_equal(result, expected) - result = TimedeltaIndex(wrapper(range(5)), unit=unit) - tm.assert_index_equal(result, expected) - - str_repr = [f"{x}{unit}" for x in np.arange(5)] - result = to_timedelta(wrapper(str_repr)) - tm.assert_index_equal(result, expected) - result = to_timedelta(wrapper(str_repr)) - tm.assert_index_equal(result, expected) - - # scalar - expected = Timedelta(np.timedelta64(2, np_unit).astype("timedelta64[ns]")) - result = to_timedelta(2, unit=unit) - assert result == expected - result = Timedelta(2, unit=unit) - assert result == expected - - result = to_timedelta(f"2{unit}") - assert result == expected - result = Timedelta(f"2{unit}") - assert result == expected + + result = to_timedelta(wrapper(range(5)), unit=unit) + tm.assert_index_equal(result, expected) + + str_repr = [f"{x}{unit}" for x in np.arange(5)] + result = to_timedelta(wrapper(str_repr)) + tm.assert_index_equal(result, expected) + result = to_timedelta(wrapper(str_repr)) + tm.assert_index_equal(result, expected) + + # scalar + expected = Timedelta(np.timedelta64(2, np_unit).astype("timedelta64[ns]")) + result = to_timedelta(2, unit=unit) + assert result == expected + result = Timedelta(2, unit=unit) + assert result == expected + + result = to_timedelta(f"2{unit}") + assert result == expected + result = Timedelta(f"2{unit}") + assert result == expected + + @pytest.mark.parametrize("unit", ["T", "t", "L", "l", "U", "u", "N", "n"]) + def test_unit_T_L_N_U_raises(self, unit): + msg = f"invalid unit abbreviation: {unit}" + with pytest.raises(ValueError, match=msg): + Timedelta(1, unit=unit) + + with pytest.raises(ValueError, match=msg): + to_timedelta(10, unit) + + with pytest.raises(ValueError, match=msg): + to_timedelta([1, 2], unit) def test_construct_from_kwargs_overflow(): @@ -261,8 +258,8 @@ def test_from_tick_reso(): def test_construction(): expected = np.timedelta64(10, "D").astype("m8[ns]").view("i8") - assert Timedelta(10, unit="d")._value == expected - assert Timedelta(10.0, unit="d")._value == expected + assert Timedelta(10, unit="D")._value == expected + 
assert Timedelta(10.0, unit="D")._value == expected assert Timedelta("10 days")._value == expected assert Timedelta(days=10)._value == expected assert Timedelta(days=10.0)._value == expected @@ -356,8 +353,7 @@ def test_construction(): Timedelta("foo") msg = ( - "cannot construct a Timedelta from " - "the passed arguments, allowed keywords are " + "cannot construct a Timedelta from the passed arguments, allowed keywords are " ) with pytest.raises(ValueError, match=msg): Timedelta(day=10) @@ -631,17 +627,16 @@ def test_timedelta_pass_td_and_kwargs_raises(): @pytest.mark.parametrize( - "constructor, value, unit, expectation", + "constructor, value, unit", [ - (Timedelta, "10s", "ms", (ValueError, "unit must not be specified")), - (to_timedelta, "10s", "ms", (ValueError, "unit must not be specified")), - (to_timedelta, ["1", 2, 3], "s", (ValueError, "unit must not be specified")), + (Timedelta, "10s", "ms"), + (to_timedelta, "10s", "ms"), + (to_timedelta, ["1", 2, 3], "s"), ], ) -def test_string_with_unit(constructor, value, unit, expectation): - exp, match = expectation - with pytest.raises(exp, match=match): - _ = constructor(value, unit=unit) +def test_string_with_unit(constructor, value, unit): + with pytest.raises(ValueError, match="unit must not be specified"): + constructor(value, unit=unit) @pytest.mark.parametrize( diff --git a/pandas/tests/scalar/timedelta/test_formats.py b/pandas/tests/scalar/timedelta/test_formats.py index e1b0076d5b7b9..1aafeec2ceed5 100644 --- a/pandas/tests/scalar/timedelta/test_formats.py +++ b/pandas/tests/scalar/timedelta/test_formats.py @@ -6,7 +6,7 @@ @pytest.mark.parametrize( "td, expected_repr", [ - (Timedelta(10, unit="d"), "Timedelta('10 days 00:00:00')"), + (Timedelta(10, unit="D"), "Timedelta('10 days 00:00:00')"), (Timedelta(10, unit="s"), "Timedelta('0 days 00:00:10')"), (Timedelta(10, unit="ms"), "Timedelta('0 days 00:00:00.010000')"), (Timedelta(-10, unit="ms"), "Timedelta('-1 days +23:59:59.990000')"), diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index d4398f66e6f89..8be2ec846a6d9 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -1,4 +1,5 @@ -""" test the scalar Timedelta """ +"""test the scalar Timedelta""" + from datetime import timedelta import sys @@ -10,6 +11,7 @@ import pytest from pandas._libs import lib +from pandas._libs.missing import NA from pandas._libs.tslibs import ( NaT, iNaT, @@ -137,6 +139,19 @@ def test_truediv_numeric(self, td): assert res._value == td._value / 2 assert res._creso == td._creso + def test_truediv_na_type_not_supported(self, td): + msg_td_floordiv_na = ( + r"unsupported operand type\(s\) for /: 'Timedelta' and 'NAType'" + ) + with pytest.raises(TypeError, match=msg_td_floordiv_na): + td / NA + + msg_na_floordiv_td = ( + r"unsupported operand type\(s\) for /: 'NAType' and 'Timedelta'" + ) + with pytest.raises(TypeError, match=msg_na_floordiv_td): + NA / td + def test_floordiv_timedeltalike(self, td): assert td // td == 1 assert (2.5 * td) // td == 2 @@ -181,6 +196,19 @@ def test_floordiv_numeric(self, td): assert res._value == td._value // 2 assert res._creso == td._creso + def test_floordiv_na_type_not_supported(self, td): + msg_td_floordiv_na = ( + r"unsupported operand type\(s\) for //: 'Timedelta' and 'NAType'" + ) + with pytest.raises(TypeError, match=msg_td_floordiv_na): + td // NA + + msg_na_floordiv_td = ( + r"unsupported operand type\(s\) for //: 'NAType' and 'Timedelta'" 
+ ) + with pytest.raises(TypeError, match=msg_na_floordiv_td): + NA // td + def test_addsub_mismatched_reso(self, td): # need to cast since td is out of bounds for ns, # so we would raise OverflowError without casting @@ -252,7 +280,7 @@ def test_timedelta_class_min_max_resolution(): class TestTimedeltaUnaryOps: def test_invert(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") msg = "bad operand type for unary ~" with pytest.raises(TypeError, match=msg): @@ -267,17 +295,17 @@ def test_invert(self): ~(td.to_timedelta64()) def test_unary_ops(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") # __neg__, __pos__ - assert -td == Timedelta(-10, unit="d") - assert -td == Timedelta("-10d") - assert +td == Timedelta(10, unit="d") + assert -td == Timedelta(-10, unit="D") + assert -td == Timedelta("-10D") + assert +td == Timedelta(10, unit="D") # __abs__, __abs__(__neg__) assert abs(td) == td assert abs(-td) == td - assert abs(-td) == Timedelta("10d") + assert abs(-td) == Timedelta("10D") class TestTimedeltas: @@ -306,7 +334,7 @@ def test_total_seconds_scalar(self): assert np.isnan(rng.total_seconds()) def test_conversion(self): - for td in [Timedelta(10, unit="d"), Timedelta("1 days, 10:11:12.012345")]: + for td in [Timedelta(10, unit="D"), Timedelta("1 days, 10:11:12.012345")]: pydt = td.to_pytimedelta() assert td == Timedelta(pydt) assert td == pydt @@ -422,7 +450,7 @@ def test_numeric_conversions(self): assert Timedelta(10, unit="us") == np.timedelta64(10, "us") assert Timedelta(10, unit="ms") == np.timedelta64(10, "ms") assert Timedelta(10, unit="s") == np.timedelta64(10, "s") - assert Timedelta(10, unit="d") == np.timedelta64(10, "D") + assert Timedelta(10, unit="D") == np.timedelta64(10, "D") def test_timedelta_conversions(self): assert Timedelta(timedelta(seconds=1)) == np.timedelta64(1, "s").astype( @@ -446,7 +474,7 @@ def test_to_numpy_alias(self): td.to_numpy(copy=True) def test_identity(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") assert isinstance(td, Timedelta) assert isinstance(td, timedelta) @@ -461,7 +489,10 @@ def conv(v): assert Timedelta("1000") == np.timedelta64(1000, "ns") assert Timedelta("1000ns") == np.timedelta64(1000, "ns") - assert Timedelta("1000NS") == np.timedelta64(1000, "ns") + + msg = "'NS' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + assert Timedelta("1000NS") == np.timedelta64(1000, "ns") assert Timedelta("10us") == np.timedelta64(10000, "ns") assert Timedelta("100us") == np.timedelta64(100000, "ns") @@ -480,8 +511,10 @@ def conv(v): assert Timedelta("100s") == np.timedelta64(100000000000, "ns") assert Timedelta("1000s") == np.timedelta64(1000000000000, "ns") - assert Timedelta("1d") == conv(np.timedelta64(1, "D")) - assert Timedelta("-1d") == -conv(np.timedelta64(1, "D")) + msg = "'d' is deprecated and will be removed in a future version."
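The same deprecation applies when the unit is embedded in a parse string, as the surrounding hunks check for `"1000NS"` and `"1d"`. A hedged sketch, again assuming a build with these changes applied:

```python
import warnings

import pandas as pd

# "1D" is the canonical string form; "1d" still parses but warns.
with warnings.catch_warnings(record=True) as record:
    warnings.simplefilter("always")
    assert pd.Timedelta("1d") == pd.Timedelta("1D")
assert any("'d' is deprecated" in str(w.message) for w in record)
```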
+ with tm.assert_produces_warning(FutureWarning, match=msg): + assert Timedelta("1d") == conv(np.timedelta64(1, "D")) + assert Timedelta("-1D") == -conv(np.timedelta64(1, "D")) assert Timedelta("1D") == conv(np.timedelta64(1, "D")) assert Timedelta("10D") == conv(np.timedelta64(10, "D")) assert Timedelta("100D") == conv(np.timedelta64(100, "D")) @@ -635,6 +668,26 @@ def test_resolution_deprecated(self): result = Timedelta.resolution assert result == Timedelta(nanoseconds=1) + @pytest.mark.parametrize( + "unit,unit_depr", + [ + ("W", "w"), + ("D", "d"), + ("min", "MIN"), + ("s", "S"), + ("h", "H"), + ("ms", "MS"), + ("us", "US"), + ], + ) + def test_unit_deprecated(self, unit, unit_depr): + # GH#59051 + msg = f"'{unit_depr}' is deprecated and will be removed in a future version." + + with tm.assert_produces_warning(FutureWarning, match=msg): + result = Timedelta(1, unit_depr) + assert result == Timedelta(1, unit) + @pytest.mark.parametrize( "value, expected", @@ -664,3 +717,10 @@ def test_timedelta_attribute_precision(): result += td.nanoseconds expected = td._value assert result == expected + + +def test_to_pytimedelta_large_values(): + td = Timedelta(1152921504609987375, unit="ns") + result = td.to_pytimedelta() + expected = timedelta(days=13343, seconds=86304, microseconds=609987) + assert result == expected diff --git a/pandas/tests/scalar/timestamp/methods/test_normalize.py b/pandas/tests/scalar/timestamp/methods/test_normalize.py index e097c9673e17a..60f249c602bd6 100644 --- a/pandas/tests/scalar/timestamp/methods/test_normalize.py +++ b/pandas/tests/scalar/timestamp/methods/test_normalize.py @@ -6,7 +6,6 @@ class TestTimestampNormalize: @pytest.mark.parametrize("arg", ["2013-11-30", "2013-11-30 12:00:00"]) - @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) def test_normalize(self, tz_naive_fixture, arg, unit): tz = tz_naive_fixture ts = Timestamp(arg, tz=tz).as_unit(unit) diff --git a/pandas/tests/scalar/timestamp/methods/test_replace.py b/pandas/tests/scalar/timestamp/methods/test_replace.py index 8a208455edc82..f15ea0e485cae 100644 --- a/pandas/tests/scalar/timestamp/methods/test_replace.py +++ b/pandas/tests/scalar/timestamp/methods/test_replace.py @@ -1,9 +1,9 @@ from datetime import datetime +import zoneinfo from dateutil.tz import gettz import numpy as np import pytest -import pytz from pandas._libs.tslibs import ( OutOfBoundsDatetime, @@ -11,6 +11,7 @@ conversion, ) from pandas._libs.tslibs.dtypes import NpyDatetimeUnit +from pandas.compat import WASM import pandas.util._test_decorators as td import pandas._testing as tm @@ -99,6 +100,7 @@ def test_replace_integer_args(self, tz_aware_fixture): with pytest.raises(ValueError, match=msg): ts.replace(hour=0.1) + @pytest.mark.skipif(WASM, reason="tzset is not available on WASM") def test_replace_tzinfo_equiv_tz_localize_none(self): # GH#14621, GH#7825 # assert conversion to naive is the same as replacing tzinfo with None @@ -106,10 +108,11 @@ def test_replace_tzinfo_equiv_tz_localize_none(self): assert ts.tz_localize(None) == ts.replace(tzinfo=None) @td.skip_if_windows + @pytest.mark.skipif(WASM, reason="tzset is not available on WASM") def test_replace_tzinfo(self): # GH#15683 - dt = datetime(2016, 3, 27, 1) - tzinfo = pytz.timezone("CET").localize(dt, is_dst=False).tzinfo + dt = datetime(2016, 3, 27, 1, fold=1) + tzinfo = dt.astimezone(zoneinfo.ZoneInfo("Europe/Berlin")).tzinfo result_dt = dt.replace(tzinfo=tzinfo) result_pd = Timestamp(dt).replace(tzinfo=tzinfo) @@ -134,13 +137,16 @@ def test_replace_tzinfo(self): 
@pytest.mark.parametrize( "tz, normalize", [ - (pytz.timezone("US/Eastern"), lambda x: x.tzinfo.normalize(x)), + ("pytz/US/Eastern", lambda x: x.tzinfo.normalize(x)), (gettz("US/Eastern"), lambda x: x), ], ) def test_replace_across_dst(self, tz, normalize): # GH#18319 check that 1) timezone is correctly normalized and # 2) that hour is not incorrectly changed by this normalization + if isinstance(tz, str) and tz.startswith("pytz/"): + pytz = pytest.importorskip("pytz") + tz = pytz.timezone(tz.removeprefix("pytz/")) ts_naive = Timestamp("2017-12-03 16:03:30") ts_aware = conversion.localize_pydatetime(ts_naive, tz) @@ -157,7 +163,6 @@ def test_replace_across_dst(self, tz, normalize): ts2b = normalize(ts2) assert ts2 == ts2b - @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) def test_replace_dst_border(self, unit): # Gh 7825 t = Timestamp("2013-11-3", tz="America/Chicago").as_unit(unit) @@ -168,7 +173,6 @@ def test_replace_dst_border(self, unit): @pytest.mark.parametrize("fold", [0, 1]) @pytest.mark.parametrize("tz", ["dateutil/Europe/London", "Europe/London"]) - @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) def test_replace_dst_fold(self, fold, tz, unit): # GH 25017 d = datetime(2019, 10, 27, 2, 30) diff --git a/pandas/tests/scalar/timestamp/methods/test_round.py b/pandas/tests/scalar/timestamp/methods/test_round.py index d10ee18b47f19..6b27e5e6c5554 100644 --- a/pandas/tests/scalar/timestamp/methods/test_round.py +++ b/pandas/tests/scalar/timestamp/methods/test_round.py @@ -4,7 +4,6 @@ ) import numpy as np import pytest -import pytz from pandas._libs import lib from pandas._libs.tslibs import ( @@ -147,7 +146,6 @@ def test_round_minute_freq(self, test_input, freq, expected, rounder): result = func(freq) assert result == expected - @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) def test_ceil(self, unit): dt = Timestamp("20130101 09:10:11").as_unit(unit) result = dt.ceil("D") @@ -155,7 +153,6 @@ def test_ceil(self, unit): assert result == expected assert result._creso == dt._creso - @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) def test_floor(self, unit): dt = Timestamp("20130101 09:10:11").as_unit(unit) result = dt.floor("D") @@ -164,15 +161,10 @@ def test_floor(self, unit): assert result._creso == dt._creso @pytest.mark.parametrize("method", ["ceil", "round", "floor"]) - @pytest.mark.parametrize( - "unit", - ["ns", "us", "ms", "s"], - ) def test_round_dst_border_ambiguous(self, method, unit): # GH 18946 round near "fall back" DST ts = Timestamp("2017-10-29 00:00:00", tz="UTC").tz_convert("Europe/Madrid") ts = ts.as_unit(unit) - # result = getattr(ts, method)("h", ambiguous=True) assert result == ts assert result._creso == getattr(NpyDatetimeUnit, f"NPY_FR_{unit}").value @@ -188,7 +180,7 @@ def test_round_dst_border_ambiguous(self, method, unit): assert result is NaT msg = "Cannot infer dst time" - with pytest.raises(pytz.AmbiguousTimeError, match=msg): + with pytest.raises(ValueError, match=msg): getattr(ts, method)("h", ambiguous="raise") @pytest.mark.parametrize( @@ -199,10 +191,6 @@ def test_round_dst_border_ambiguous(self, method, unit): ["floor", "2018-03-11 03:01:00-0500", "2h"], ], ) - @pytest.mark.parametrize( - "unit", - ["ns", "us", "ms", "s"], - ) def test_round_dst_border_nonexistent(self, method, ts_str, freq, unit): # GH 23324 round near "spring forward" DST ts = Timestamp(ts_str, tz="America/Chicago").as_unit(unit) @@ -215,7 +203,7 @@ def test_round_dst_border_nonexistent(self, method, ts_str, freq, unit): assert result is NaT msg = 
"2018-03-11 02:00:00" - with pytest.raises(pytz.NonExistentTimeError, match=msg): + with pytest.raises(ValueError, match=msg): getattr(ts, method)(freq, nonexistent="raise") @pytest.mark.parametrize( diff --git a/pandas/tests/scalar/timestamp/methods/test_timestamp_method.py b/pandas/tests/scalar/timestamp/methods/test_timestamp_method.py index 67985bd4ba566..beacaaf04e6b2 100644 --- a/pandas/tests/scalar/timestamp/methods/test_timestamp_method.py +++ b/pandas/tests/scalar/timestamp/methods/test_timestamp_method.py @@ -1,9 +1,11 @@ # NB: This is for the Timestamp.timestamp *method* specifically, not # the Timestamp class in general. +from datetime import timezone -from pytz import utc +import pytest from pandas._libs.tslibs import Timestamp +from pandas.compat import WASM import pandas.util._test_decorators as td import pandas._testing as tm @@ -11,11 +13,12 @@ class TestTimestampMethod: @td.skip_if_windows + @pytest.mark.skipif(WASM, reason="tzset is not available on WASM") def test_timestamp(self, fixed_now_ts): # GH#17329 # tz-naive --> treat it as if it were UTC for purposes of timestamp() ts = fixed_now_ts - uts = ts.replace(tzinfo=utc) + uts = ts.replace(tzinfo=timezone.utc) assert ts.timestamp() == uts.timestamp() tsc = Timestamp("2014-10-11 11:00:01.12345678", tz="US/Central") diff --git a/pandas/tests/scalar/timestamp/methods/test_to_pydatetime.py b/pandas/tests/scalar/timestamp/methods/test_to_pydatetime.py index 57f57e56201c8..07e57b51a7f1e 100644 --- a/pandas/tests/scalar/timestamp/methods/test_to_pydatetime.py +++ b/pandas/tests/scalar/timestamp/methods/test_to_pydatetime.py @@ -3,7 +3,7 @@ timedelta, ) -import pytz +import pytest from pandas._libs.tslibs.timezones import dateutil_gettz as gettz import pandas.util._test_decorators as td @@ -24,7 +24,8 @@ def test_to_pydatetime_nonzero_nano(self): ts = Timestamp("2011-01-01 9:00:00.123456789") # Warn the user of data loss (nanoseconds). 
- with tm.assert_produces_warning(UserWarning): + msg = "Discarding nonzero nanoseconds in conversion" + with tm.assert_produces_warning(UserWarning, match=msg): expected = datetime(2011, 1, 1, 9, 0, 0, 123456) result = ts.to_pydatetime() assert result == expected @@ -42,6 +43,7 @@ def test_timestamp_to_pydatetime_dateutil(self): assert stamp.tzinfo == dtval.tzinfo def test_timestamp_to_pydatetime_explicit_pytz(self): + pytz = pytest.importorskip("pytz") stamp = Timestamp("20090415", tz=pytz.timezone("US/Eastern")) dtval = stamp.to_pydatetime() assert stamp == dtval diff --git a/pandas/tests/scalar/timestamp/methods/test_tz_localize.py b/pandas/tests/scalar/timestamp/methods/test_tz_localize.py index af3dee1880d2e..cb7ac5fa6f1da 100644 --- a/pandas/tests/scalar/timestamp/methods/test_tz_localize.py +++ b/pandas/tests/scalar/timestamp/methods/test_tz_localize.py @@ -1,13 +1,9 @@ from datetime import timedelta import re +import zoneinfo from dateutil.tz import gettz import pytest -import pytz -from pytz.exceptions import ( - AmbiguousTimeError, - NonExistentTimeError, -) from pandas._libs.tslibs.dtypes import NpyDatetimeUnit from pandas.errors import OutOfBoundsDatetime @@ -17,69 +13,57 @@ Timestamp, ) -try: - from zoneinfo import ZoneInfo -except ImportError: - # Cannot assign to a type - ZoneInfo = None # type: ignore[misc, assignment] - class TestTimestampTZLocalize: @pytest.mark.skip_ubsan def test_tz_localize_pushes_out_of_bounds(self): # GH#12677 # tz_localize that pushes away from the boundary is OK + pytz = pytest.importorskip("pytz") msg = ( f"Converting {Timestamp.min.strftime('%Y-%m-%d %H:%M:%S')} " f"underflows past {Timestamp.min}" ) - pac = Timestamp.min.tz_localize("US/Pacific") + pac = Timestamp.min.tz_localize(pytz.timezone("US/Pacific")) assert pac._value > Timestamp.min._value pac.tz_convert("Asia/Tokyo") # tz_convert doesn't change value with pytest.raises(OutOfBoundsDatetime, match=msg): - Timestamp.min.tz_localize("Asia/Tokyo") + Timestamp.min.tz_localize(pytz.timezone("Asia/Tokyo")) # tz_localize that pushes away from the boundary is OK msg = ( f"Converting {Timestamp.max.strftime('%Y-%m-%d %H:%M:%S')} " f"overflows past {Timestamp.max}" ) - tokyo = Timestamp.max.tz_localize("Asia/Tokyo") + tokyo = Timestamp.max.tz_localize(pytz.timezone("Asia/Tokyo")) assert tokyo._value < Timestamp.max._value tokyo.tz_convert("US/Pacific") # tz_convert doesn't change value with pytest.raises(OutOfBoundsDatetime, match=msg): - Timestamp.max.tz_localize("US/Pacific") + Timestamp.max.tz_localize(pytz.timezone("US/Pacific")) - @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) - def test_tz_localize_ambiguous_bool(self, unit): + @pytest.mark.parametrize( + "tz", + [zoneinfo.ZoneInfo("US/Central"), "dateutil/US/Central", "pytz/US/Central"], + ) + def test_tz_localize_ambiguous_bool(self, unit, tz): # make sure that we are correctly accepting bool values as ambiguous # GH#14402 + if isinstance(tz, str) and tz.startswith("pytz/"): + pytz = pytest.importorskip("pytz") + tz = pytz.timezone(tz.removeprefix("pytz/")) ts = Timestamp("2015-11-01 01:00:03").as_unit(unit) - expected0 = Timestamp("2015-11-01 01:00:03-0500", tz="US/Central") - expected1 = Timestamp("2015-11-01 01:00:03-0600", tz="US/Central") + expected0 = Timestamp("2015-11-01 01:00:03-0500", tz=tz) + expected1 = Timestamp("2015-11-01 01:00:03-0600", tz=tz) msg = "Cannot infer dst time from 2015-11-01 01:00:03" - with pytest.raises(pytz.AmbiguousTimeError, match=msg): - ts.tz_localize("US/Central") - - with 
pytest.raises(pytz.AmbiguousTimeError, match=msg): - ts.tz_localize("dateutil/US/Central") - - if ZoneInfo is not None: - try: - tz = ZoneInfo("US/Central") - except KeyError: - # no tzdata - pass - else: - with pytest.raises(pytz.AmbiguousTimeError, match=msg): - ts.tz_localize(tz) - - result = ts.tz_localize("US/Central", ambiguous=True) + with pytest.raises(ValueError, match=msg): + ts.tz_localize(tz) + + result = ts.tz_localize(tz, ambiguous=True) assert result == expected0 assert result._creso == getattr(NpyDatetimeUnit, f"NPY_FR_{unit}").value - result = ts.tz_localize("US/Central", ambiguous=False) + result = ts.tz_localize(tz, ambiguous=False) assert result == expected1 assert result._creso == getattr(NpyDatetimeUnit, f"NPY_FR_{unit}").value @@ -117,10 +101,10 @@ def test_tz_localize_ambiguous(self): def test_tz_localize_nonexistent(self, stamp, tz): # GH#13057 ts = Timestamp(stamp) - with pytest.raises(NonExistentTimeError, match=stamp): + with pytest.raises(ValueError, match=stamp): ts.tz_localize(tz) # GH 22644 - with pytest.raises(NonExistentTimeError, match=stamp): + with pytest.raises(ValueError, match=stamp): ts.tz_localize(tz, nonexistent="raise") assert ts.tz_localize(tz, nonexistent="NaT") is NaT @@ -166,7 +150,7 @@ def test_tz_localize_ambiguous_raise(self): # GH#13057 ts = Timestamp("2015-11-1 01:00") msg = "Cannot infer dst time from 2015-11-01 01:00:00," - with pytest.raises(AmbiguousTimeError, match=msg): + with pytest.raises(ValueError, match=msg): ts.tz_localize("US/Pacific", ambiguous="raise") def test_tz_localize_nonexistent_invalid_arg(self, warsaw): @@ -206,9 +190,10 @@ def test_tz_localize_roundtrip(self, stamp, tz_aware_fixture): def test_tz_localize_ambiguous_compat(self): # validate that pytz and dateutil are compat for dst # when the transition happens + pytz = pytest.importorskip("pytz") naive = Timestamp("2013-10-27 01:00:00") - pytz_zone = "Europe/London" + pytz_zone = pytz.timezone("Europe/London") dateutil_zone = "dateutil/Europe/London" result_pytz = naive.tz_localize(pytz_zone, ambiguous=False) result_dateutil = naive.tz_localize(dateutil_zone, ambiguous=False) @@ -237,13 +222,16 @@ def test_tz_localize_ambiguous_compat(self): @pytest.mark.parametrize( "tz", [ - pytz.timezone("US/Eastern"), + "pytz/US/Eastern", gettz("US/Eastern"), - "US/Eastern", + zoneinfo.ZoneInfo("US/Eastern"), "dateutil/US/Eastern", ], ) def test_timestamp_tz_localize(self, tz): + if isinstance(tz, str) and tz.startswith("pytz/"): + pytz = pytest.importorskip("pytz") + tz = pytz.timezone(tz.removeprefix("pytz/")) stamp = Timestamp("3/11/2012 04:00") result = stamp.tz_localize(tz) @@ -295,7 +283,6 @@ def test_timestamp_tz_localize(self, tz): ], ) @pytest.mark.parametrize("tz_type", ["", "dateutil/"]) - @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) def test_timestamp_tz_localize_nonexistent_shift( self, start_ts, tz, end_ts, shift, tz_type, unit ): @@ -327,7 +314,6 @@ def test_timestamp_tz_localize_nonexistent_shift_invalid(self, offset, warsaw): with pytest.raises(ValueError, match=msg): ts.tz_localize(tz, nonexistent=timedelta(seconds=offset)) - @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) def test_timestamp_tz_localize_nonexistent_NaT(self, warsaw, unit): # GH 8917 tz = warsaw @@ -335,13 +321,12 @@ def test_timestamp_tz_localize_nonexistent_NaT(self, warsaw, unit): result = ts.tz_localize(tz, nonexistent="NaT") assert result is NaT - @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) def test_timestamp_tz_localize_nonexistent_raise(self, warsaw, 
unit): # GH 8917 tz = warsaw ts = Timestamp("2015-03-29 02:20:00").as_unit(unit) msg = "2015-03-29 02:20:00" - with pytest.raises(pytz.NonExistentTimeError, match=msg): + with pytest.raises(ValueError, match=msg): ts.tz_localize(tz, nonexistent="raise") msg = ( "The nonexistent argument must be one of 'raise', 'NaT', " diff --git a/pandas/tests/scalar/timestamp/test_arithmetic.py b/pandas/tests/scalar/timestamp/test_arithmetic.py index 2d58513989a66..d65d425620c84 100644 --- a/pandas/tests/scalar/timestamp/test_arithmetic.py +++ b/pandas/tests/scalar/timestamp/test_arithmetic.py @@ -7,7 +7,6 @@ from dateutil.tz import gettz import numpy as np import pytest -import pytz from pandas._libs.tslibs import ( OutOfBoundsDatetime, @@ -294,7 +293,7 @@ def test_subtract_different_utc_objects(self, utc_fixture, utc_fixture2): @pytest.mark.parametrize( "tz", [ - pytz.timezone("US/Eastern"), + "pytz/US/Eastern", gettz("US/Eastern"), "US/Eastern", "dateutil/US/Eastern", @@ -302,7 +301,9 @@ def test_subtract_different_utc_objects(self, utc_fixture, utc_fixture2): ) def test_timestamp_add_timedelta_push_over_dst_boundary(self, tz): # GH#1389 - + if isinstance(tz, str) and tz.startswith("pytz/"): + pytz = pytest.importorskip("pytz") + tz = pytz.timezone(tz.removeprefix("pytz/")) # 4 hours before DST transition stamp = Timestamp("3/10/2012 22:00", tz=tz) @@ -313,6 +314,17 @@ def test_timestamp_add_timedelta_push_over_dst_boundary(self, tz): assert result == expected + def test_timestamp_dst_transition(self): + # GH 60084 + dt_str = "2023-11-05 01:00-08:00" + tz_str = "America/Los_Angeles" + + ts1 = Timestamp(dt_str, tz=tz_str) + ts2 = ts1 + Timedelta(hours=0) + + assert ts1 == ts2 + assert hash(ts1) == hash(ts2) + class SubDatetime(datetime): pass diff --git a/pandas/tests/scalar/timestamp/test_comparisons.py b/pandas/tests/scalar/timestamp/test_comparisons.py index e7e5541cf499f..b2590c43e1ece 100644 --- a/pandas/tests/scalar/timestamp/test_comparisons.py +++ b/pandas/tests/scalar/timestamp/test_comparisons.py @@ -309,5 +309,5 @@ def __eq__(self, other) -> bool: for left, right in [(inf, timestamp), (timestamp, inf)]: assert left > right or left < right assert left >= right or left <= right - assert not left == right # pylint: disable=unneeded-not + assert not left == right assert left != right diff --git a/pandas/tests/scalar/timestamp/test_constructors.py b/pandas/tests/scalar/timestamp/test_constructors.py index 3975f3c46aaa1..2c97c4a32e0aa 100644 --- a/pandas/tests/scalar/timestamp/test_constructors.py +++ b/pandas/tests/scalar/timestamp/test_constructors.py @@ -15,10 +15,8 @@ ) import numpy as np import pytest -import pytz from pandas._libs.tslibs.dtypes import NpyDatetimeUnit -from pandas.compat import PY310 from pandas.errors import OutOfBoundsDatetime from pandas import ( @@ -28,6 +26,7 @@ Timedelta, Timestamp, ) +import pandas._testing as tm class TestTimestampConstructorUnitKeyword: @@ -122,6 +121,7 @@ def test_timestamp_constructor_pytz_fold_raise(self): # Test for GH#25057 # pytz doesn't support fold. Check that we raise # if fold is passed with pytz + pytz = pytest.importorskip("pytz") msg = "pytz timezones do not support fold. Please use dateutil timezones." 
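Two behavioral threads run through these timestamp hunks: pandas now raises plain `ValueError` for ambiguous and nonexistent local times where it previously surfaced `pytz.AmbiguousTimeError` / `pytz.NonExistentTimeError`, and the new GH 60084 test pins down that arithmetic across a DST fold preserves equality and hashing. A sketch under the assumption that these changes are in effect (timestamps taken from the tests themselves):

```python
import pandas as pd

# Ambiguous "fall back" wall time: ValueError, not pytz.AmbiguousTimeError.
try:
    pd.Timestamp("2015-11-01 01:00:03").tz_localize("US/Central")
except ValueError as err:
    assert "Cannot infer dst time" in str(err)

# Nonexistent "spring forward" wall time: ValueError, not
# pytz.NonExistentTimeError.
try:
    pd.Timestamp("2015-03-29 02:20:00").tz_localize("Europe/Warsaw")
except ValueError as err:
    assert "2015-03-29 02:20:00" in str(err)

# GH 60084: adding a zero Timedelta across the DST fold keeps value and hash.
ts1 = pd.Timestamp("2023-11-05 01:00-08:00", tz="America/Los_Angeles")
ts2 = ts1 + pd.Timedelta(hours=0)
assert ts1 == ts2 and hash(ts1) == hash(ts2)
```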
tz = pytz.timezone("Europe/London") with pytest.raises(ValueError, match=msg): @@ -159,15 +159,13 @@ def test_timestamp_constructor_retain_fold(self, tz, fold): expected = fold assert result == expected - try: - _tzs = [ + @pytest.mark.parametrize( + "tz", + [ "dateutil/Europe/London", zoneinfo.ZoneInfo("Europe/London"), - ] - except zoneinfo.ZoneInfoNotFoundError: - _tzs = ["dateutil/Europe/London"] - - @pytest.mark.parametrize("tz", _tzs) + ], + ) @pytest.mark.parametrize( "ts_input,fold_out", [ @@ -190,18 +188,17 @@ def test_timestamp_constructor_infer_fold_from_value(self, tz, ts_input, fold_ou @pytest.mark.parametrize("tz", ["dateutil/Europe/London"]) @pytest.mark.parametrize( - "ts_input,fold,value_out", + "fold,value_out", [ - (datetime(2019, 10, 27, 1, 30, 0, 0), 0, 1572136200000000), - (datetime(2019, 10, 27, 1, 30, 0, 0), 1, 1572139800000000), + (0, 1572136200000000), + (1, 1572139800000000), ], ) - def test_timestamp_constructor_adjust_value_for_fold( - self, tz, ts_input, fold, value_out - ): + def test_timestamp_constructor_adjust_value_for_fold(self, tz, fold, value_out): # Test for GH#25057 # Check that we adjust value for fold correctly # based on timestamps since utc + ts_input = datetime(2019, 10, 27, 1, 30) ts = Timestamp(ts_input, tz=tz, fold=fold) result = ts._value expected = value_out @@ -211,11 +208,7 @@ def test_timestamp_constructor_adjust_value_for_fold( class TestTimestampConstructorPositionalAndKeywordSupport: def test_constructor_positional(self): # see GH#10758 - msg = ( - "'NoneType' object cannot be interpreted as an integer" - if PY310 - else "an integer is required" - ) + msg = "'NoneType' object cannot be interpreted as an integer" with pytest.raises(TypeError, match=msg): Timestamp(2000, 1) @@ -329,6 +322,18 @@ def test_constructor_positional_keyword_mixed_with_tzinfo(self, kwd, request): class TestTimestampClassMethodConstructors: # Timestamp constructors other than __new__ + def test_utcnow_deprecated(self): + # GH#56680 + msg = "Timestamp.utcnow is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + Timestamp.utcnow() + + def test_utcfromtimestamp_deprecated(self): + # GH#56680 + msg = "Timestamp.utcfromtimestamp is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + Timestamp.utcfromtimestamp(43) + def test_constructor_strptime(self): # GH#25016 # Test support for Timestamp.strptime @@ -553,11 +558,11 @@ def test_constructor(self): timezones = [ (None, 0), ("UTC", 0), - (pytz.utc, 0), + (timezone.utc, 0), ("Asia/Tokyo", 9), ("US/Eastern", -4), ("dateutil/US/Pacific", -7), - (pytz.FixedOffset(-180), -3), + (timezone(timedelta(hours=-3)), -3), (dateutil.tz.tzoffset(None, 18000), 5), ] @@ -609,13 +614,12 @@ def test_constructor_with_stringoffset(self): ] timezones = [ - (None, 0), ("UTC", 0), - (pytz.utc, 0), + (timezone.utc, 0), ("Asia/Tokyo", 9), ("US/Eastern", -4), ("dateutil/US/Pacific", -7), - (pytz.FixedOffset(-180), -3), + (timezone(timedelta(hours=-3)), -3), (dateutil.tz.tzoffset(None, 18000), 5), ] @@ -695,7 +699,7 @@ def test_constructor_invalid_tz(self): msg = "at most one of" with pytest.raises(ValueError, match=msg): - Timestamp("2017-10-22", tzinfo=pytz.utc, tz="UTC") + Timestamp("2017-10-22", tzinfo=timezone.utc, tz="UTC") msg = "Cannot pass a date attribute keyword argument when passing a date string" with pytest.raises(ValueError, match=msg): @@ -708,11 +712,11 @@ def test_constructor_tz_or_tzinfo(self): # GH#17943, GH#17690, GH#5168 stamps = [ Timestamp(year=2017, month=10, day=22, 
tz="UTC"), - Timestamp(year=2017, month=10, day=22, tzinfo=pytz.utc), - Timestamp(year=2017, month=10, day=22, tz=pytz.utc), - Timestamp(datetime(2017, 10, 22), tzinfo=pytz.utc), + Timestamp(year=2017, month=10, day=22, tzinfo=timezone.utc), + Timestamp(year=2017, month=10, day=22, tz=timezone.utc), + Timestamp(datetime(2017, 10, 22), tzinfo=timezone.utc), Timestamp(datetime(2017, 10, 22), tz="UTC"), - Timestamp(datetime(2017, 10, 22), tz=pytz.utc), + Timestamp(datetime(2017, 10, 22), tz=timezone.utc), ] assert all(ts == stamps[0] for ts in stamps) @@ -742,7 +746,7 @@ def test_constructor_tz_or_tzinfo(self): tz="UTC", ), Timestamp(2000, 1, 2, 3, 4, 5, 6, None, nanosecond=1), - Timestamp(2000, 1, 2, 3, 4, 5, 6, tz=pytz.UTC, nanosecond=1), + Timestamp(2000, 1, 2, 3, 4, 5, 6, tz=timezone.utc, nanosecond=1), ], ) def test_constructor_nanosecond(self, result): @@ -887,21 +891,21 @@ def test_construct_timestamp_near_dst(self, offset): def test_construct_with_different_string_format(self, arg): # GH 12064 result = Timestamp(arg) - expected = Timestamp(datetime(2013, 1, 1), tz=pytz.FixedOffset(540)) + expected = Timestamp(datetime(2013, 1, 1), tz=timezone(timedelta(hours=9))) assert result == expected @pytest.mark.parametrize("box", [datetime, Timestamp]) def test_raise_tz_and_tzinfo_in_datetime_input(self, box): # GH 23579 - kwargs = {"year": 2018, "month": 1, "day": 1, "tzinfo": pytz.utc} + kwargs = {"year": 2018, "month": 1, "day": 1, "tzinfo": timezone.utc} msg = "Cannot pass a datetime or Timestamp" with pytest.raises(ValueError, match=msg): Timestamp(box(**kwargs), tz="US/Pacific") msg = "Cannot pass a datetime or Timestamp" with pytest.raises(ValueError, match=msg): - Timestamp(box(**kwargs), tzinfo=pytz.timezone("US/Pacific")) + Timestamp(box(**kwargs), tzinfo=zoneinfo.ZoneInfo("US/Pacific")) - def test_dont_convert_dateutil_utc_to_pytz_utc(self): + def test_dont_convert_dateutil_utc_to_default_utc(self): result = Timestamp(datetime(2018, 1, 1), tz=tzutc()) expected = Timestamp(datetime(2018, 1, 1)).tz_localize(tzutc()) assert result == expected @@ -943,7 +947,7 @@ def test_timestamp_constructor_near_dst_boundary(self): assert result == expected msg = "Cannot infer dst time from 2015-10-25 02:00:00" - with pytest.raises(pytz.AmbiguousTimeError, match=msg): + with pytest.raises(ValueError, match=msg): Timestamp("2015-10-25 02:00", tz=tz) result = Timestamp("2017-03-26 01:00", tz="Europe/Paris") @@ -951,7 +955,7 @@ def test_timestamp_constructor_near_dst_boundary(self): assert result == expected msg = "2017-03-26 02:00" - with pytest.raises(pytz.NonExistentTimeError, match=msg): + with pytest.raises(ValueError, match=msg): Timestamp("2017-03-26 02:00", tz="Europe/Paris") # GH#11708 @@ -970,7 +974,7 @@ def test_timestamp_constructor_near_dst_boundary(self): assert result == expected msg = "2017-03-26 02:00" - with pytest.raises(pytz.NonExistentTimeError, match=msg): + with pytest.raises(ValueError, match=msg): Timestamp("2017-03-26 02:00", tz="Europe/Paris") result = Timestamp("2017-03-26 02:00:00+0100", tz="Europe/Paris") @@ -985,7 +989,7 @@ def test_timestamp_constructor_near_dst_boundary(self): @pytest.mark.parametrize( "tz", [ - pytz.timezone("US/Eastern"), + "pytz/US/Eastern", gettz("US/Eastern"), "US/Eastern", "dateutil/US/Eastern", @@ -994,13 +998,27 @@ def test_timestamp_constructor_near_dst_boundary(self): def test_timestamp_constructed_by_date_and_tz(self, tz): # GH#2993, Timestamp cannot be constructed by datetime.date # and tz correctly - + if isinstance(tz, str) and 
tz.startswith("pytz/"): + pytz = pytest.importorskip("pytz") + tz = pytz.timezone(tz.removeprefix("pytz/")) result = Timestamp(date(2012, 3, 11), tz=tz) expected = Timestamp("3/11/2012", tz=tz) assert result.hour == expected.hour assert result == expected + def test_explicit_tz_none(self): + # GH#48688 + msg = "Passed data is timezone-aware, incompatible with 'tz=None'" + with pytest.raises(ValueError, match=msg): + Timestamp(datetime(2022, 1, 1, tzinfo=timezone.utc), tz=None) + + with pytest.raises(ValueError, match=msg): + Timestamp("2022-01-01 00:00:00", tzinfo=timezone.utc, tz=None) + + with pytest.raises(ValueError, match=msg): + Timestamp("2022-01-01 00:00:00-0400", tz=None) + def test_constructor_ambiguous_dst(): # GH 24329 diff --git a/pandas/tests/scalar/timestamp/test_formats.py b/pandas/tests/scalar/timestamp/test_formats.py index d7160597ea6d6..7b20f0a17556d 100644 --- a/pandas/tests/scalar/timestamp/test_formats.py +++ b/pandas/tests/scalar/timestamp/test_formats.py @@ -1,9 +1,13 @@ -from datetime import datetime +from datetime import ( + datetime, + timezone, +) import pprint import dateutil.tz import pytest -import pytz # a test below uses pytz but only inside a `eval` call + +from pandas.compat import WASM from pandas import Timestamp @@ -88,13 +92,14 @@ def test_isoformat(ts, timespec, expected_iso): class TestTimestampRendering: - timezones = ["UTC", "Asia/Tokyo", "US/Eastern", "dateutil/US/Pacific"] - - @pytest.mark.parametrize("tz", timezones) + @pytest.mark.parametrize( + "tz", ["UTC", "Asia/Tokyo", "US/Eastern", "dateutil/America/Los_Angeles"] + ) @pytest.mark.parametrize("freq", ["D", "M", "S", "N"]) @pytest.mark.parametrize( "date", ["2014-03-07", "2014-01-01 09:00", "2014-01-01 00:00:00.000000001"] ) + @pytest.mark.skipif(WASM, reason="tzset is not available on WASM") def test_repr(self, date, freq, tz): # avoid to match with timezone name freq_repr = f"'{freq}'" @@ -118,7 +123,7 @@ def test_repr(self, date, freq, tz): def test_repr_utcoffset(self): # This can cause the tz field to be populated, but it's redundant to # include this information in the date-string. 
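Throughout these constructor tests, stdlib `datetime.timezone` objects stand in for `pytz.utc` and `pytz.FixedOffset`, with pytz itself demoted to an optional `importorskip` dependency. A minimal sketch of the equivalences the tests rely on, assuming a build with this migration:

```python
from datetime import datetime, timedelta, timezone

import pandas as pd

# timezone.utc replaces pytz.utc ...
assert pd.Timestamp(datetime(2017, 10, 22), tzinfo=timezone.utc) == pd.Timestamp(
    datetime(2017, 10, 22), tz="UTC"
)

# ... and timezone(timedelta(...)) replaces pytz.FixedOffset.
ts = pd.Timestamp(datetime(2013, 1, 1), tz=timezone(timedelta(hours=9)))
assert ts.utcoffset() == timedelta(hours=9)
```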
- date_with_utc_offset = Timestamp("2014-03-13 00:00:00-0400", tz=None) + date_with_utc_offset = Timestamp("2014-03-13 00:00:00-0400") assert "2014-03-13 00:00:00-0400" in repr(date_with_utc_offset) assert "tzoffset" not in repr(date_with_utc_offset) assert "UTC-04:00" in repr(date_with_utc_offset) @@ -178,14 +183,14 @@ def test_repr_matches_pydatetime_no_tz(self): ts_nanos_micros = Timestamp(1200) assert str(ts_nanos_micros) == "1970-01-01 00:00:00.000001200" - def test_repr_matches_pydatetime_tz_pytz(self): - dt_date = datetime(2013, 1, 2, tzinfo=pytz.utc) + def test_repr_matches_pydatetime_tz_stdlib(self): + dt_date = datetime(2013, 1, 2, tzinfo=timezone.utc) assert str(dt_date) == str(Timestamp(dt_date)) - dt_datetime = datetime(2013, 1, 2, 12, 1, 3, tzinfo=pytz.utc) + dt_datetime = datetime(2013, 1, 2, 12, 1, 3, tzinfo=timezone.utc) assert str(dt_datetime) == str(Timestamp(dt_datetime)) - dt_datetime_us = datetime(2013, 1, 2, 12, 1, 3, 45, tzinfo=pytz.utc) + dt_datetime_us = datetime(2013, 1, 2, 12, 1, 3, 45, tzinfo=timezone.utc) assert str(dt_datetime_us) == str(Timestamp(dt_datetime_us)) def test_repr_matches_pydatetime_tz_dateutil(self): diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index 05e1c93e1a676..38d0ddfbc13bd 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -1,4 +1,4 @@ -""" test the scalar Timestamp """ +"""test the scalar Timestamp""" import calendar from datetime import ( @@ -9,6 +9,7 @@ import locale import time import unicodedata +import zoneinfo from dateutil.tz import ( tzlocal, @@ -20,8 +21,6 @@ ) import numpy as np import pytest -import pytz -from pytz import utc from pandas._libs.tslibs.dtypes import NpyDatetimeUnit from pandas._libs.tslibs.timezones import ( @@ -118,17 +117,16 @@ def test_is_end(self, end, tz): assert getattr(ts, end) # GH 12806 - @pytest.mark.parametrize( - "data", - [Timestamp("2017-08-28 23:00:00"), Timestamp("2017-08-28 23:00:00", tz="EST")], - ) + @pytest.mark.parametrize("tz", [None, "EST"]) # error: Unsupported operand types for + ("List[None]" and "List[str]") @pytest.mark.parametrize( - "time_locale", [None] + tm.get_locales() # type: ignore[operator] + "time_locale", + [None] + tm.get_locales(), # type: ignore[operator] ) - def test_names(self, data, time_locale): + def test_names(self, tz, time_locale): # GH 17354 # Test .day_name(), .month_name + data = Timestamp("2017-08-28 23:00:00", tz=tz) if time_locale is None: expected_day = "Monday" expected_month = "August" @@ -260,7 +258,7 @@ def test_dow_parametric(self, ts, sign): class TestTimestamp: - @pytest.mark.parametrize("tz", [None, pytz.timezone("US/Pacific")]) + @pytest.mark.parametrize("tz", [None, zoneinfo.ZoneInfo("US/Pacific")]) def test_disallow_setting_tz(self, tz): # GH#3746 ts = Timestamp("2010") @@ -269,7 +267,9 @@ def test_disallow_setting_tz(self, tz): ts.tz = tz def test_default_to_stdlib_utc(self): - assert Timestamp.utcnow().tz is timezone.utc + msg = "Timestamp.utcnow is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert Timestamp.utcnow().tz is timezone.utc assert Timestamp.now("UTC").tz is timezone.utc assert Timestamp("2016-01-01", tz="UTC").tz is timezone.utc @@ -310,13 +310,17 @@ def compare(x, y): assert int((Timestamp(x)._value - Timestamp(y)._value) / 1e9) == 0 compare(Timestamp.now(), datetime.now()) - compare(Timestamp.now("UTC"), datetime.now(pytz.timezone("UTC"))) + 
compare(Timestamp.now("UTC"), datetime.now(timezone.utc)) compare(Timestamp.now("UTC"), datetime.now(tzutc())) - compare(Timestamp.utcnow(), datetime.now(timezone.utc)) + msg = "Timestamp.utcnow is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + compare(Timestamp.utcnow(), datetime.now(timezone.utc)) compare(Timestamp.today(), datetime.today()) current_time = calendar.timegm(datetime.now().utctimetuple()) - ts_utc = Timestamp.utcfromtimestamp(current_time) + msg = "Timestamp.utcfromtimestamp is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + ts_utc = Timestamp.utcfromtimestamp(current_time) assert ts_utc.timestamp() == current_time compare( Timestamp.fromtimestamp(current_time), datetime.fromtimestamp(current_time) @@ -324,12 +328,12 @@ def compare(x, y): compare( # Support tz kwarg in Timestamp.fromtimestamp Timestamp.fromtimestamp(current_time, "UTC"), - datetime.fromtimestamp(current_time, utc), + datetime.fromtimestamp(current_time, timezone.utc), ) compare( # Support tz kwarg in Timestamp.fromtimestamp Timestamp.fromtimestamp(current_time, tz="UTC"), - datetime.fromtimestamp(current_time, utc), + datetime.fromtimestamp(current_time, timezone.utc), ) date_component = datetime.now(timezone.utc) @@ -496,8 +500,7 @@ def test_to_period_tz_warning(self): # GH#21333 make sure a warning is issued when timezone # info is lost ts = Timestamp("2009-04-15 16:17:18", tz="US/Eastern") - with tm.assert_produces_warning(UserWarning): - # warning that timezone info will be lost + with tm.assert_produces_warning(UserWarning, match="drop timezone information"): ts.to_period("D") def test_to_numpy_alias(self): @@ -581,9 +584,9 @@ def test_month_name(self, dt64, ts): assert ts.month_name() == alt.month_name() def test_tz_convert(self, ts): - ts = Timestamp._from_value_and_reso(ts._value, ts._creso, utc) + ts = Timestamp._from_value_and_reso(ts._value, ts._creso, timezone.utc) - tz = pytz.timezone("US/Pacific") + tz = zoneinfo.ZoneInfo("US/Pacific") result = ts.tz_convert(tz) assert isinstance(result, Timestamp) diff --git a/pandas/tests/scalar/timestamp/test_timezones.py b/pandas/tests/scalar/timestamp/test_timezones.py index cb2a35be907cd..8f2ee3ef45075 100644 --- a/pandas/tests/scalar/timestamp/test_timezones.py +++ b/pandas/tests/scalar/timestamp/test_timezones.py @@ -1,6 +1,7 @@ """ Tests for Timestamp timezone-related methods """ + from datetime import datetime from pandas._libs.tslibs import timezones diff --git a/pandas/tests/series/accessors/test_cat_accessor.py b/pandas/tests/series/accessors/test_cat_accessor.py index 2d50b0f36904a..f017ccd963972 100644 --- a/pandas/tests/series/accessors/test_cat_accessor.py +++ b/pandas/tests/series/accessors/test_cat_accessor.py @@ -40,7 +40,7 @@ def test_getname_categorical_accessor(self, method): def test_cat_accessor(self): ser = Series(Categorical(["a", "b", np.nan, "a"])) tm.assert_index_equal(ser.cat.categories, Index(["a", "b"])) - assert not ser.cat.ordered, False + assert not ser.cat.ordered exp = Categorical(["a", "b", np.nan, "a"], categories=["b", "a"]) @@ -200,8 +200,8 @@ def test_dt_accessor_api_for_categorical(self, idx): if func == "to_period" and getattr(idx, "tz", None) is not None: # dropping TZ warn_cls.append(UserWarning) - if func == "to_pydatetime": - # deprecated to return Index[object] + elif func == "to_pytimedelta": + # GH 57463 warn_cls.append(FutureWarning) if warn_cls: warn_cls = tuple(warn_cls) diff --git a/pandas/tests/series/accessors/test_dt_accessor.py 
b/pandas/tests/series/accessors/test_dt_accessor.py index 34465a7c12c18..2c441a6ed91c1 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -9,10 +9,8 @@ import numpy as np import pytest -import pytz from pandas._libs.tslibs.timezones import maybe_get_tz -from pandas.errors import SettingWithCopyError from pandas.core.dtypes.common import ( is_integer_dtype, @@ -27,6 +25,7 @@ Period, PeriodIndex, Series, + StringDtype, TimedeltaIndex, date_range, period_range, @@ -115,10 +114,8 @@ def test_dt_namespace_accessor_datetime64(self, freq): for prop in ok_for_dt_methods: getattr(ser.dt, prop) - msg = "The behavior of DatetimeProperties.to_pydatetime is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = ser.dt.to_pydatetime() - assert isinstance(result, np.ndarray) + result = ser.dt.to_pydatetime() + assert isinstance(result, Series) assert result.dtype == object result = ser.dt.tz_localize("US/Eastern") @@ -154,10 +151,8 @@ def test_dt_namespace_accessor_datetime64tz(self): for prop in ok_for_dt_methods: getattr(ser.dt, prop) - msg = "The behavior of DatetimeProperties.to_pydatetime is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = ser.dt.to_pydatetime() - assert isinstance(result, np.ndarray) + result = ser.dt.to_pydatetime() + assert isinstance(result, Series) assert result.dtype == object result = ser.dt.tz_convert("CET") @@ -197,7 +192,9 @@ def test_dt_namespace_accessor_timedelta(self): assert isinstance(result, DataFrame) tm.assert_index_equal(result.index, ser.index) - result = ser.dt.to_pytimedelta() + msg = "The behavior of TimedeltaProperties.to_pytimedelta is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ser.dt.to_pytimedelta() assert isinstance(result, np.ndarray) assert result.dtype == object @@ -259,9 +256,8 @@ def test_dt_accessor_limited_display_api(self): tm.assert_almost_equal(results, sorted(set(ok_for_dt + ok_for_dt_methods))) # Period - idx = period_range("20130101", periods=5, freq="D", name="xxx").astype(object) - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - ser = Series(idx) + idx = period_range("20130101", periods=5, freq="D", name="xxx") + ser = Series(idx) results = get_dir(ser) tm.assert_almost_equal( results, sorted(set(ok_for_period + ok_for_period_methods)) @@ -281,26 +277,15 @@ def test_dt_accessor_ambiguous_freq_conversions(self): expected = Series(exp_values, name="xxx") tm.assert_series_equal(ser, expected) - def test_dt_accessor_not_writeable(self, using_copy_on_write, warn_copy_on_write): + def test_dt_accessor_not_writeable(self): # no setting allowed ser = Series(date_range("20130101", periods=5, freq="D"), name="xxx") with pytest.raises(ValueError, match="modifications"): ser.dt.hour = 5 # trying to set a copy - msg = "modifications to a property of a datetimelike.+not supported" - with pd.option_context("chained_assignment", "raise"): - if using_copy_on_write: - with tm.raises_chained_assignment_error(): - ser.dt.hour[0] = 5 - elif warn_copy_on_write: - with tm.assert_produces_warning( - FutureWarning, match="ChainedAssignmentError" - ): - ser.dt.hour[0] = 5 - else: - with pytest.raises(SettingWithCopyError, match=msg): - ser.dt.hour[0] = 5 + with tm.raises_chained_assignment_error(): + ser.dt.hour[0] = 5 @pytest.mark.parametrize( "method, dates", @@ -365,7 +350,7 @@ def test_dt_round_tz_ambiguous(self, method): tm.assert_series_equal(result, expected) 
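As the dt-accessor hunks above check, `Series.dt.to_pydatetime` now returns an object-dtype `Series` outright (it previously returned an ndarray behind a deprecation warning), while `Series.dt.to_pytimedelta` is the one newly deprecated (GH 57463). A hedged sketch of both, assuming a build with these changes:

```python
import warnings

import numpy as np
import pandas as pd

# to_pydatetime: object-dtype Series, no warning.
ser = pd.Series(pd.date_range("2013-01-01", periods=3, freq="D"))
out = ser.dt.to_pydatetime()
assert isinstance(out, pd.Series) and out.dtype == object

# to_pytimedelta: still an object ndarray, now behind a FutureWarning.
tds = pd.Series(pd.timedelta_range("1 day", periods=3))
with warnings.catch_warnings(record=True) as record:
    warnings.simplefilter("always")
    arr = tds.dt.to_pytimedelta()
assert isinstance(arr, np.ndarray) and arr.dtype == object
```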
# raise - with tm.external_error_raised(pytz.AmbiguousTimeError): + with tm.external_error_raised(ValueError): getattr(df1.date.dt, method)("h", ambiguous="raise") @pytest.mark.parametrize( @@ -387,7 +372,7 @@ def test_dt_round_tz_nonexistent(self, method, ts_str, freq): expected = Series([pd.NaT]).dt.tz_localize(result.dt.tz) tm.assert_series_equal(result, expected) - with pytest.raises(pytz.NonExistentTimeError, match="2018-03-11 02:00:00"): + with pytest.raises(ValueError, match="2018-03-11 02:00:00"): getattr(ser.dt, method)(freq, nonexistent="raise") @pytest.mark.parametrize("freq", ["ns", "us", "1000us"]) @@ -452,7 +437,8 @@ def test_dt_accessor_no_new_attributes(self): # error: Unsupported operand types for + ("List[None]" and "List[str]") @pytest.mark.parametrize( - "time_locale", [None] + tm.get_locales() # type: ignore[operator] + "time_locale", + [None] + tm.get_locales(), # type: ignore[operator] ) def test_dt_accessor_datetime_name_accessors(self, time_locale): # Test Monday -> Sunday and January -> December, in that sequence @@ -582,7 +568,6 @@ def test_strftime_dt64_days(self): expected = Index( ["2015/03/01", "2015/03/02", "2015/03/03", "2015/03/04", "2015/03/05"], - dtype=np.object_, ) # dtype may be S10 or U10 depending on python version tm.assert_index_equal(result, expected) @@ -595,7 +580,7 @@ def test_strftime_period_days(self, using_infer_string): dtype="=U10", ) if using_infer_string: - expected = expected.astype("string[pyarrow_numpy]") + expected = expected.astype(StringDtype(na_value=np.nan)) tm.assert_index_equal(result, expected) def test_strftime_dt64_microsecond_resolution(self): @@ -652,7 +637,7 @@ def test_strftime_all_nat(self, data): ser = Series(data) with tm.assert_produces_warning(None): result = ser.dt.strftime("%Y-%m-%d") - expected = Series([np.nan], dtype=object) + expected = Series([np.nan], dtype="str") tm.assert_series_equal(result, expected) def test_valid_dt_with_missing_values(self): @@ -696,15 +681,16 @@ def test_dt_accessor_api(self): assert isinstance(ser.dt, DatetimeProperties) @pytest.mark.parametrize( - "ser", + "data", [ - Series(np.arange(5)), - Series(list("abcde")), - Series(np.random.default_rng(2).standard_normal(5)), + np.arange(5), + list("abcde"), + np.random.default_rng(2).standard_normal(5), ], ) - def test_dt_accessor_invalid(self, ser): + def test_dt_accessor_invalid(self, data): # GH#9322 check that series with incorrect dtypes don't have attr + ser = Series(data) with pytest.raises(AttributeError, match="only use .dt accessor"): ser.dt assert not hasattr(ser, "dt") @@ -800,7 +786,8 @@ def test_end_time_timevalues(self, input_vals): # GH#17157 # Check that the time part of the Period is adjusted by end_time # when using the dt accessor on a Series - input_vals = PeriodArray._from_sequence(np.asarray(input_vals)) + dtype = pd.PeriodDtype(input_vals[0].freq) + input_vals = PeriodArray._from_sequence(np.asarray(input_vals), dtype=dtype) ser = Series(input_vals) result = ser.dt.end_time diff --git a/pandas/tests/series/accessors/test_list_accessor.py b/pandas/tests/series/accessors/test_list_accessor.py index 1c60567c1a530..bec8ca13a2f5f 100644 --- a/pandas/tests/series/accessors/test_list_accessor.py +++ b/pandas/tests/series/accessors/test_list_accessor.py @@ -25,9 +25,28 @@ def test_list_getitem(list_dtype): ser = Series( [[1, 2, 3], [4, None, 5], None], dtype=ArrowDtype(list_dtype), + name="a", ) actual = ser.list[1] - expected = Series([2, None, None], dtype="int64[pyarrow]") + expected = Series([2, None, None], 
dtype="int64[pyarrow]", name="a") + tm.assert_series_equal(actual, expected) + + +def test_list_getitem_index(): + # GH 58425 + ser = Series( + [[1, 2, 3], [4, None, 5], None], + dtype=ArrowDtype(pa.list_(pa.int64())), + index=[1, 3, 7], + name="a", + ) + actual = ser.list[1] + expected = Series( + [2, None, None], + dtype="int64[pyarrow]", + index=[1, 3, 7], + name="a", + ) tm.assert_series_equal(actual, expected) @@ -35,6 +54,8 @@ def test_list_getitem_slice(): ser = Series( [[1, 2, 3], [4, None, 5], None], dtype=ArrowDtype(pa.list_(pa.int64())), + index=[1, 3, 7], + name="a", ) if pa_version_under11p0: with pytest.raises( @@ -44,7 +65,10 @@ def test_list_getitem_slice(): else: actual = ser.list[1:None:None] expected = Series( - [[2, 3], [None, 5], None], dtype=ArrowDtype(pa.list_(pa.int64())) + [[2, 3], [None, 5], None], + dtype=ArrowDtype(pa.list_(pa.int64())), + index=[1, 3, 7], + name="a", ) tm.assert_series_equal(actual, expected) @@ -53,19 +77,26 @@ def test_list_len(): ser = Series( [[1, 2, 3], [4, None], None], dtype=ArrowDtype(pa.list_(pa.int64())), + name="a", ) actual = ser.list.len() - expected = Series([3, 2, None], dtype=ArrowDtype(pa.int32())) + expected = Series([3, 2, None], dtype=ArrowDtype(pa.int32()), name="a") tm.assert_series_equal(actual, expected) def test_list_flatten(): ser = Series( - [[1, 2, 3], [4, None], None], + [[1, 2, 3], None, [4, None], [], [7, 8]], dtype=ArrowDtype(pa.list_(pa.int64())), + name="a", ) actual = ser.list.flatten() - expected = Series([1, 2, 3, 4, None], dtype=ArrowDtype(pa.int64())) + expected = Series( + [1, 2, 3, 4, None, 7, 8], + dtype=ArrowDtype(pa.int64()), + index=[0, 0, 0, 2, 2, 4, 4], + name="a", + ) tm.assert_series_equal(actual, expected) diff --git a/pandas/tests/series/accessors/test_str_accessor.py b/pandas/tests/series/accessors/test_str_accessor.py index 09d965ef1f322..ff530459b78fb 100644 --- a/pandas/tests/series/accessors/test_str_accessor.py +++ b/pandas/tests/series/accessors/test_str_accessor.py @@ -15,7 +15,8 @@ def test_str_attribute(self): # str accessor only valid with string values ser = Series(range(5)) - with pytest.raises(AttributeError, match="only use .str accessor"): + msg = "Can only use .str accessor with string values, not integer" + with pytest.raises(AttributeError, match=msg): ser.str.repeat(2) def test_str_accessor_updates_on_inplace(self): diff --git a/pandas/tests/series/accessors/test_struct_accessor.py b/pandas/tests/series/accessors/test_struct_accessor.py index 1ec5b3b726d17..80aea75fda406 100644 --- a/pandas/tests/series/accessors/test_struct_accessor.py +++ b/pandas/tests/series/accessors/test_struct_accessor.py @@ -2,6 +2,11 @@ import pytest +from pandas.compat.pyarrow import ( + pa_version_under11p0, + pa_version_under13p0, +) + from pandas import ( ArrowDtype, DataFrame, @@ -11,6 +16,7 @@ import pandas._testing as tm pa = pytest.importorskip("pyarrow") +pc = pytest.importorskip("pyarrow.compute") def test_struct_accessor_dtypes(): @@ -53,6 +59,7 @@ def test_struct_accessor_dtypes(): tm.assert_series_equal(actual, expected) +@pytest.mark.skipif(pa_version_under13p0, reason="pyarrow>=13.0.0 required") def test_struct_accessor_field(): index = Index([-100, 42, 123]) ser = Series( @@ -94,10 +101,11 @@ def test_struct_accessor_field(): def test_struct_accessor_field_with_invalid_name_or_index(): ser = Series([], dtype=ArrowDtype(pa.struct([("field", pa.int64())]))) - with pytest.raises(ValueError, match="name_or_index must be an int or str"): + with pytest.raises(ValueError, 
match="name_or_index must be an int, str,"): ser.struct.field(1.1) +@pytest.mark.skipif(pa_version_under11p0, reason="pyarrow>=11.0.0 required") def test_struct_accessor_explode(): index = Index([-100, 42, 123]) ser = Series( @@ -148,3 +156,41 @@ def test_struct_accessor_api_for_invalid(invalid): ), ): invalid.struct + + +@pytest.mark.parametrize( + ["indices", "name"], + [ + (0, "int_col"), + ([1, 2], "str_col"), + (pc.field("int_col"), "int_col"), + ("int_col", "int_col"), + (b"string_col", b"string_col"), + ([b"string_col"], "string_col"), + ], +) +@pytest.mark.skipif(pa_version_under13p0, reason="pyarrow>=13.0.0 required") +def test_struct_accessor_field_expanded(indices, name): + arrow_type = pa.struct( + [ + ("int_col", pa.int64()), + ( + "struct_col", + pa.struct( + [ + ("int_col", pa.int64()), + ("float_col", pa.float64()), + ("str_col", pa.string()), + ] + ), + ), + (b"string_col", pa.string()), + ] + ) + + data = pa.array([], type=arrow_type) + ser = Series(data, dtype=ArrowDtype(arrow_type)) + expected = pc.struct_field(data, indices) + result = ser.struct.field(indices) + tm.assert_equal(result.array._pa_array.combine_chunks(), expected) + assert result.name == name diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index fc1c80eb4dec6..97cafc33611ed 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -1,6 +1,7 @@ """ Also test support for datetime64[ns] in Series / DataFrame """ + from datetime import ( datetime, timedelta, @@ -13,7 +14,6 @@ ) import numpy as np import pytest -import pytz from pandas._libs import index as libindex @@ -35,9 +35,6 @@ def test_fancy_getitem(): s = Series(np.arange(len(dti)), index=dti) - msg = "Series.__getitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - assert s[48] == 48 assert s["1/2/2009"] == 48 assert s["2009-1-2"] == 48 assert s[datetime(2009, 1, 2)] == 48 @@ -56,10 +53,6 @@ def test_fancy_setitem(): s = Series(np.arange(len(dti)), index=dti) - msg = "Series.__setitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - s[48] = -1 - assert s.iloc[48] == -1 s["1/2/2009"] = -2 assert s.iloc[48] == -2 s["1/2/2009":"2009-06-05"] = -3 @@ -69,6 +62,7 @@ def test_fancy_setitem(): @pytest.mark.parametrize("tz_source", ["pytz", "dateutil"]) def test_getitem_setitem_datetime_tz(tz_source): if tz_source == "pytz": + pytz = pytest.importorskip(tz_source) tzget = pytz.timezone else: # handle special case for utc in dateutil @@ -430,7 +424,7 @@ def test_indexing(): result = ts["2001"] tm.assert_series_equal(result, ts.iloc[:12]) - df = DataFrame({"A": ts.copy()}) + df = DataFrame({"A": ts}) # GH#36179 pre-2.0 df["2001"] operated as slicing on rows. 
in 2.0 it behaves # like any other key, so raises diff --git a/pandas/tests/series/indexing/test_delitem.py b/pandas/tests/series/indexing/test_delitem.py index 3d1082c3d040b..7440ef2692c47 100644 --- a/pandas/tests/series/indexing/test_delitem.py +++ b/pandas/tests/series/indexing/test_delitem.py @@ -31,16 +31,15 @@ def test_delitem(self): del s[0] tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="int64"))) - def test_delitem_object_index(self, using_infer_string): + def test_delitem_object_index(self): # Index(dtype=object) - dtype = "string[pyarrow_numpy]" if using_infer_string else object - s = Series(1, index=Index(["a"], dtype=dtype)) + s = Series(1, index=Index(["a"], dtype="str")) del s["a"] - tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype=dtype))) + tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="str"))) s["a"] = 1 - tm.assert_series_equal(s, Series(1, index=Index(["a"], dtype=dtype))) + tm.assert_series_equal(s, Series(1, index=Index(["a"], dtype="str"))) del s["a"] - tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype=dtype))) + tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="str"))) def test_delitem_missing_key(self): # empty diff --git a/pandas/tests/series/indexing/test_get.py b/pandas/tests/series/indexing/test_get.py index 1f3711ad91903..5ff92ca89efba 100644 --- a/pandas/tests/series/indexing/test_get.py +++ b/pandas/tests/series/indexing/test_get.py @@ -157,13 +157,8 @@ def test_get_with_default(): assert s.get("e", "z") == "z" assert s.get("e", "e") == "e" - msg = "Series.__getitem__ treating keys as positions is deprecated" - warn = None - if index is d0: - warn = FutureWarning - with tm.assert_produces_warning(warn, match=msg): - assert s.get(10, "z") == "z" - assert s.get(10, 10) == 10 + assert s.get(10, "z") == "z" + assert s.get(10, 10) == 10 @pytest.mark.parametrize( @@ -201,13 +196,10 @@ def test_get_with_ea(arr): result = ser.get("Z") assert result is None - msg = "Series.__getitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - assert ser.get(4) == ser.iloc[4] - with tm.assert_produces_warning(FutureWarning, match=msg): - assert ser.get(-1) == ser.iloc[-1] - with tm.assert_produces_warning(FutureWarning, match=msg): - assert ser.get(len(ser)) is None + # As of 3.0, ints are treated as labels + assert ser.get(4) is None + assert ser.get(-1) is None + assert ser.get(len(ser)) is None # GH#21257 ser = Series(arr) @@ -216,16 +208,14 @@ def test_get_with_ea(arr): def test_getitem_get(string_series, object_series): - msg = "Series.__getitem__ treating keys as positions is deprecated" - for obj in [string_series, object_series]: idx = obj.index[5] assert obj[idx] == obj.get(idx) assert obj[idx] == obj.iloc[5] - with tm.assert_produces_warning(FutureWarning, match=msg): - assert string_series.get(-1) == string_series.get(string_series.index[-1]) + # As of 3.0, ints are treated as labels + assert string_series.get(-1) is None assert string_series.iloc[5] == string_series.get(string_series.index[5]) diff --git a/pandas/tests/series/indexing/test_getitem.py b/pandas/tests/series/indexing/test_getitem.py index 596a225c288b8..8ba5e8452711d 100644 --- a/pandas/tests/series/indexing/test_getitem.py +++ b/pandas/tests/series/indexing/test_getitem.py @@ -1,6 +1,7 @@ """ Series.__getitem__ test classes are organized by the type of key passed. 
""" + from datetime import ( date, datetime, @@ -14,6 +15,7 @@ conversion, timezones, ) +from pandas.compat.numpy import np_version_gt2 from pandas.core.dtypes.common import is_scalar @@ -71,19 +73,14 @@ def test_getitem_unrecognized_scalar(self): def test_getitem_negative_out_of_bounds(self): ser = Series(["a"] * 10, index=["a"] * 10) - msg = "index -11 is out of bounds for axis 0 with size 10|index out of bounds" - warn_msg = "Series.__getitem__ treating keys as positions is deprecated" - with pytest.raises(IndexError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - ser[-11] + with pytest.raises(KeyError, match="^-11$"): + ser[-11] def test_getitem_out_of_bounds_indexerror(self, datetime_series): # don't segfault, GH#495 - msg = r"index \d+ is out of bounds for axis 0 with size \d+" - warn_msg = "Series.__getitem__ treating keys as positions is deprecated" - with pytest.raises(IndexError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - datetime_series[len(datetime_series)] + N = len(datetime_series) + with pytest.raises(KeyError, match=str(N)): + datetime_series[N] def test_getitem_out_of_bounds_empty_rangeindex_keyerror(self): # GH#917 @@ -117,11 +114,13 @@ def test_getitem_keyerror_with_integer_index(self, any_int_numpy_dtype): ser["c"] def test_getitem_int64(self, datetime_series): + if np_version_gt2: + msg = r"^np.int64\(5\)$" + else: + msg = "^5$" idx = np.int64(5) - msg = "Series.__getitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - res = datetime_series[idx] - assert res == datetime_series.iloc[5] + with pytest.raises(KeyError, match=msg): + datetime_series[idx] def test_getitem_full_range(self): # github.com/pandas-dev/pandas/commit/4f433773141d2eb384325714a2776bcc5b2e20f7 @@ -217,10 +216,8 @@ def test_getitem_str_with_timedeltaindex(self): def test_getitem_bool_index_positional(self): # GH#48653 ser = Series({True: 1, False: 0}) - msg = "Series.__getitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = ser[0] - assert result == 1 + with pytest.raises(KeyError, match="^0$"): + ser[0] class TestSeriesGetitemSlices: @@ -363,9 +360,7 @@ def test_getitem_no_matches(self, box): key = Series(["C"], dtype=object) key = box(key) - msg = ( - r"None of \[Index\(\['C'\], dtype='object|string'\)\] are in the \[index\]" - ) + msg = r"None of \[Index\(\['C'\], dtype='object|str'\)\] are in the \[index\]" with pytest.raises(KeyError, match=msg): ser[key] @@ -383,17 +378,16 @@ def test_getitem_intlist_intindex_periodvalues(self): @pytest.mark.parametrize("box", [list, np.array, Index]) def test_getitem_intlist_intervalindex_non_int(self, box): - # GH#33404 fall back to positional since ints are unambiguous + # GH#33404 fall back to positional since ints are unambiguous; + # changed in 3.0 to never fallback dti = date_range("2000-01-03", periods=3)._with_freq(None) ii = pd.IntervalIndex.from_breaks(dti) ser = Series(range(len(ii)), index=ii) - expected = ser.iloc[:1] key = box([0]) - msg = "Series.__getitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = ser[key] - tm.assert_series_equal(result, expected) + msg = r"None of \[Index\(\[0\], dtype='int(32|64)'\)\] are in the \[index\]" + with pytest.raises(KeyError, match=msg): + ser[key] @pytest.mark.parametrize("box", [list, np.array, Index]) @pytest.mark.parametrize("dtype", [np.int64, 
np.float64, np.uint64]) @@ -561,14 +555,15 @@ def test_getitem_generator(string_series): @pytest.mark.parametrize( - "series", + "data", [ - Series([0, 1]), - Series(date_range("2012-01-01", periods=2)), - Series(date_range("2012-01-01", periods=2, tz="CET")), + [0, 1], + date_range("2012-01-01", periods=2), + date_range("2012-01-01", periods=2, tz="CET"), ], ) -def test_getitem_ndim_deprecated(series): +def test_getitem_ndim_deprecated(data): + series = Series(data) with pytest.raises(ValueError, match="Multi-dimensional indexing"): series[:, None] @@ -633,11 +628,6 @@ def test_getitem_preserve_name(datetime_series): result = datetime_series[datetime_series > 0] assert result.name == datetime_series.name - msg = "Series.__getitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = datetime_series[[0, 2, 4]] - assert result.name == datetime_series.name - result = datetime_series[5:10] assert result.name == datetime_series.name @@ -665,21 +655,16 @@ def test_getitem_missing(datetime_series): def test_getitem_fancy(string_series, object_series): - msg = "Series.__getitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - slice1 = string_series[[1, 2, 3]] - slice2 = object_series[[1, 2, 3]] - assert string_series.index[2] == slice1.index[1] - assert object_series.index[2] == slice2.index[1] - assert string_series.iloc[2] == slice1.iloc[1] - assert object_series.iloc[2] == slice2.iloc[1] + msg = r"None of \[Index\(\[1, 2, 3\], dtype='int(32|64)'\)\] are in the \[index\]" + with pytest.raises(KeyError, match=msg): + string_series[[1, 2, 3]] + with pytest.raises(KeyError, match=msg): + object_series[[1, 2, 3]] def test_getitem_box_float64(datetime_series): - msg = "Series.__getitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - value = datetime_series[5] - assert isinstance(value, np.float64) + with pytest.raises(KeyError, match="^5$"): + datetime_series[5] def test_getitem_unordered_dup(): @@ -710,13 +695,11 @@ def test_slice_can_reorder_not_uniquely_indexed(): @pytest.mark.parametrize("index_vals", ["aabcd", "aadcb"]) def test_duplicated_index_getitem_positional_indexer(index_vals): - # GH 11747 + # GH 11747; changed in 3.0 integers are treated as always-labels s = Series(range(5), index=list(index_vals)) - msg = "Series.__getitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = s[3] - assert result == 3 + with pytest.raises(KeyError, match="^3$"): + s[3] class TestGetitemDeprecatedIndexers: diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index c52e47a812183..02efd850f9640 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -1,4 +1,5 @@ -""" test get/set & misc """ +"""test get/set & misc""" + from datetime import timedelta import re @@ -31,27 +32,16 @@ def test_basic_indexing(): np.random.default_rng(2).standard_normal(5), index=["a", "b", "a", "a", "b"] ) - warn_msg = "Series.__[sg]etitem__ treating keys as positions is deprecated" - msg = "index 5 is out of bounds for axis 0 with size 5" - with pytest.raises(IndexError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - s[5] - with pytest.raises(IndexError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - s[5] = 0 + with 
pytest.raises(KeyError, match="^5$"): + s[5] with pytest.raises(KeyError, match=r"^'c'$"): s["c"] s = s.sort_index() - with pytest.raises(IndexError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - s[5] - msg = r"index 5 is out of bounds for axis (0|1) with size 5|^5$" - with pytest.raises(IndexError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - s[5] = 0 + with pytest.raises(KeyError, match="^5$"): + s[5] def test_getitem_numeric_should_not_fallback_to_positional(any_numeric_dtype): @@ -101,35 +91,32 @@ def test_basic_getitem_dt64tz_values(): assert result == expected -def test_getitem_setitem_ellipsis(using_copy_on_write, warn_copy_on_write): +def test_getitem_setitem_ellipsis(): s = Series(np.random.default_rng(2).standard_normal(10)) result = s[...] tm.assert_series_equal(result, s) - with tm.assert_cow_warning(warn_copy_on_write): - s[...] = 5 - if not using_copy_on_write: - assert (result == 5).all() - @pytest.mark.parametrize( "result_1, duplicate_item, expected_1", [ [ - Series({1: 12, 2: [1, 2, 2, 3]}), - Series({1: 313}), + {1: 12, 2: [1, 2, 2, 3]}, + {1: 313}, Series({1: 12}, dtype=object), ], [ - Series({1: [1, 2, 3], 2: [1, 2, 2, 3]}), - Series({1: [1, 2, 3]}), + {1: [1, 2, 3], 2: [1, 2, 2, 3]}, + {1: [1, 2, 3]}, Series({1: [1, 2, 3]}), ], ], ) def test_getitem_with_duplicates_indices(result_1, duplicate_item, expected_1): # GH 17610 + result_1 = Series(result_1) + duplicate_item = Series(duplicate_item) result = result_1._append(duplicate_item) expected = expected_1._append(duplicate_item) tm.assert_series_equal(result[1], expected) @@ -155,9 +142,7 @@ def test_series_box_timestamp(): assert isinstance(ser.iloc[4], Timestamp) ser = Series(rng, index=rng) - msg = "Series.__getitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - assert isinstance(ser[0], Timestamp) + assert isinstance(ser[rng[0]], Timestamp) assert isinstance(ser.at[rng[1]], Timestamp) assert isinstance(ser.iat[2], Timestamp) assert isinstance(ser.loc[rng[3]], Timestamp) @@ -241,7 +226,7 @@ def test_basic_getitem_setitem_corner(datetime_series): datetime_series[[5, [None, None]]] = 2 -def test_slice(string_series, object_series, using_copy_on_write, warn_copy_on_write): +def test_slice(string_series, object_series): original = string_series.copy() numSlice = string_series[10:20] numSliceEnd = string_series[-10:] @@ -258,31 +243,31 @@ def test_slice(string_series, object_series, using_copy_on_write, warn_copy_on_w # Test return view. 
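# Under copy-on-write (the only behavior as of 3.0) the slice acts as a lazy
# copy: the write below leaves string_series equal to its original.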
sl = string_series[10:20] - with tm.assert_cow_warning(warn_copy_on_write): - sl[:] = 0 + sl[:] = 0 - if using_copy_on_write: - # Doesn't modify parent (CoW) - tm.assert_series_equal(string_series, original) - else: - assert (string_series[10:20] == 0).all() + # Doesn't modify parent (CoW) + tm.assert_series_equal(string_series, original) def test_timedelta_assignment(): # GH 8209 s = Series([], dtype=object) s.loc["B"] = timedelta(1) - tm.assert_series_equal(s, Series(Timedelta("1 days"), index=["B"])) + expected = Series(Timedelta("1 days"), dtype="timedelta64[ns]", index=["B"]) + tm.assert_series_equal(s, expected) s = s.reindex(s.index.insert(0, "A")) - tm.assert_series_equal(s, Series([np.nan, Timedelta("1 days")], index=["A", "B"])) + expected = Series( + [np.nan, Timedelta("1 days")], dtype="timedelta64[ns]", index=["A", "B"] + ) + tm.assert_series_equal(s, expected) s.loc["A"] = timedelta(1) - expected = Series(Timedelta("1 days"), index=["A", "B"]) + expected = Series(Timedelta("1 days"), dtype="timedelta64[ns]", index=["A", "B"]) tm.assert_series_equal(s, expected) -def test_underlying_data_conversion(using_copy_on_write): +def test_underlying_data_conversion(): # GH 4080 df = DataFrame({c: [1, 2, 3] for c in ["a", "b", "c"]}) return_value = df.set_index(["a", "b", "c"], inplace=True) @@ -292,18 +277,9 @@ def test_underlying_data_conversion(using_copy_on_write): df_original = df.copy() df - if using_copy_on_write: - with tm.raises_chained_assignment_error(): - df["val"].update(s) - expected = df_original - else: - with tm.assert_produces_warning(FutureWarning, match="inplace method"): - df["val"].update(s) - expected = DataFrame( - {"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3], "val": [0, 1, 0]} - ) - return_value = expected.set_index(["a", "b", "c"], inplace=True) - assert return_value is None + with tm.raises_chained_assignment_error(): + df["val"].update(s) + expected = df_original tm.assert_frame_equal(df, expected) @@ -460,28 +436,38 @@ def test_setitem_dict_and_set_disallowed_multiindex(self, key): class TestSetitemValidation: # This is adapted from pandas/tests/arrays/masked/test_indexing.py - # but checks for warnings instead of errors. 
- def _check_setitem_invalid(self, ser, invalid, indexer, warn): - msg = "Setting an item of incompatible dtype is deprecated" - msg = re.escape(msg) - + def _check_setitem_invalid(self, ser, invalid, indexer): orig_ser = ser.copy() - with tm.assert_produces_warning(warn, match=msg): + with pytest.raises(TypeError, match="Invalid value"): ser[indexer] = invalid ser = orig_ser.copy() - with tm.assert_produces_warning(warn, match=msg): + with pytest.raises(TypeError, match="Invalid value"): ser.iloc[indexer] = invalid ser = orig_ser.copy() - with tm.assert_produces_warning(warn, match=msg): + with pytest.raises(TypeError, match="Invalid value"): ser.loc[indexer] = invalid ser = orig_ser.copy() - with tm.assert_produces_warning(warn, match=msg): + with pytest.raises(TypeError, match="Invalid value"): ser[:] = invalid + def _check_setitem_valid(self, ser, value, indexer): + orig_ser = ser.copy() + + ser[indexer] = value + ser = orig_ser.copy() + + ser.iloc[indexer] = value + ser = orig_ser.copy() + + ser.loc[indexer] = value + ser = orig_ser.copy() + + ser[:] = value + _invalid_scalars = [ 1 + 2j, "True", @@ -491,7 +477,7 @@ def _check_setitem_invalid(self, ser, invalid, indexer, warn): np.datetime64("NaT"), np.timedelta64("NaT"), ] - _indexers = [0, [0], slice(0, 1), [True, False, False]] + _indexers = [0, [0], slice(0, 1), [True, False, False], slice(None, None, None)] @pytest.mark.parametrize( "invalid", _invalid_scalars + [1, 1.0, np.int64(1), np.float64(1)] @@ -499,20 +485,19 @@ def _check_setitem_invalid(self, ser, invalid, indexer, warn): @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_bool(self, invalid, indexer): ser = Series([True, False, False], dtype="bool") - self._check_setitem_invalid(ser, invalid, indexer, FutureWarning) + self._check_setitem_invalid(ser, invalid, indexer) @pytest.mark.parametrize("invalid", _invalid_scalars + [True, 1.5, np.float64(1.5)]) @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_int(self, invalid, any_int_numpy_dtype, indexer): ser = Series([1, 2, 3], dtype=any_int_numpy_dtype) - if isna(invalid) and invalid is not NaT: - warn = None + if isna(invalid) and invalid is not NaT and not np.isnat(invalid): + self._check_setitem_valid(ser, invalid, indexer) else: - warn = FutureWarning - self._check_setitem_invalid(ser, invalid, indexer, warn) + self._check_setitem_invalid(ser, invalid, indexer) @pytest.mark.parametrize("invalid", _invalid_scalars + [True]) @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_float(self, invalid, float_numpy_dtype, indexer): ser = Series([1, 2, None], dtype=float_numpy_dtype) - self._check_setitem_invalid(ser, invalid, indexer, FutureWarning) + self._check_setitem_invalid(ser, invalid, indexer) diff --git a/pandas/tests/series/indexing/test_set_value.py b/pandas/tests/series/indexing/test_set_value.py index cbe1a8bf296c8..3bf89da53d923 100644 --- a/pandas/tests/series/indexing/test_set_value.py +++ b/pandas/tests/series/indexing/test_set_value.py @@ -4,16 +4,20 @@ from pandas import ( DatetimeIndex, + Index, Series, ) import pandas._testing as tm -def test_series_set_value(): - # GH#1561 +def test_series_set_value(using_infer_string): + # GH#1561, GH#51363 as of 3.0 we do not do inference in Index.insert dates = [datetime(2001, 1, 1), datetime(2001, 1, 2)] - index = DatetimeIndex(dates) + if using_infer_string: + index = DatetimeIndex(dates) + else: + index = Index(dates, dtype=object) s = Series(dtype=object) 
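    # setitem-with-expansion: _set_value inserts the datetime label into the
    # empty Series; as of 3.0 Index.insert does no dtype inference (GH#51363),
    # hence the explicit index construction above.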
s._set_value(dates[0], 1.0) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 23137f0975fb1..964f0b90b3c41 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -1,8 +1,10 @@ +import contextlib from datetime import ( date, datetime, ) from decimal import Decimal +import os import numpy as np import pytest @@ -24,6 +26,7 @@ NaT, Period, Series, + StringDtype, Timedelta, Timestamp, array, @@ -181,14 +184,12 @@ def test_object_series_setitem_dt64array_exact_match(self): class TestSetitemScalarIndexer: def test_setitem_negative_out_of_bounds(self): + # As of 3.0, int keys are treated as labels, so this becomes + # setitem-with-expansion ser = Series(["a"] * 10, index=["a"] * 10) - - # string index falls back to positional - msg = "index -11|-1 is out of bounds for axis 0 with size 10" - warn_msg = "Series.__setitem__ treating keys as positions is deprecated" - with pytest.raises(IndexError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - ser[-11] = "foo" + ser[-11] = "foo" + exp = Series(["a"] * 10 + ["foo"], index=["a"] * 10 + [-11]) + tm.assert_series_equal(ser, exp) @pytest.mark.parametrize("indexer", [tm.loc, tm.at]) @pytest.mark.parametrize("ser_index", [0, 1]) @@ -275,25 +276,16 @@ def test_setitem_mask_align_and_promote(self): mask = ts > 0 left = ts.copy() right = ts[mask].copy().map(str) - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): left[mask] = right - expected = ts.map(lambda t: str(t) if t > 0 else t) - tm.assert_series_equal(left, expected) def test_setitem_mask_promote_strs(self): ser = Series([0, 1, 2, 0]) mask = ser > 0 ser2 = ser[mask].map(str) - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): ser[mask] = ser2 - expected = Series([0, "1", "2", 0]) - tm.assert_series_equal(ser, expected) - def test_setitem_mask_promote(self): ser = Series([0, "foo", "bar", 0]) mask = Series([False, True, True, False]) @@ -381,12 +373,8 @@ def test_setitem_with_bool_mask_and_values_matching_n_trues_in_length(self): def test_setitem_nan_with_bool(self): # GH 13034 result = Series([True, False, True]) - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): result[0] = np.nan - expected = Series([np.nan, False, True], dtype=object) - tm.assert_series_equal(result, expected) def test_setitem_mask_smallint_upcast(self): orig = Series([1, 2, 3], dtype="int8") @@ -395,22 +383,14 @@ def test_setitem_mask_smallint_upcast(self): mask = np.array([True, False, True]) ser = orig.copy() - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): ser[mask] = Series(alt) - expected = Series([999, 2, 1001]) - tm.assert_series_equal(ser, expected) - ser2 = orig.copy() - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): - ser2.mask(mask, alt, inplace=True) - tm.assert_series_equal(ser2, expected) + with pytest.raises(TypeError, match="Invalid value"): + ser.mask(mask, alt, inplace=True) - ser3 = orig.copy() - res = ser3.where(~mask, Series(alt)) + res = ser.where(~mask, Series(alt)) + expected = Series([999, 2, 1001]) tm.assert_series_equal(res, 
expected) def test_setitem_mask_smallint_no_upcast(self): @@ -437,7 +417,7 @@ def test_setitem_mask_smallint_no_upcast(self): class TestSetitemViewCopySemantics: - def test_setitem_invalidates_datetime_index_freq(self, using_copy_on_write): + def test_setitem_invalidates_datetime_index_freq(self): # GH#24096 altering a datetime64tz Series inplace invalidates the # `freq` attribute on the underlying DatetimeIndex @@ -445,10 +425,7 @@ def test_setitem_invalidates_datetime_index_freq(self, using_copy_on_write): ts = dti[1] ser = Series(dti) assert ser._values is not dti - if using_copy_on_write: - assert ser._values._ndarray.base is dti._data._ndarray.base - else: - assert ser._values._ndarray.base is not dti._data._ndarray.base + assert ser._values._ndarray.base is dti._data._ndarray.base assert dti.freq == "D" ser.iloc[1] = NaT assert ser._values.freq is None @@ -459,20 +436,16 @@ def test_setitem_invalidates_datetime_index_freq(self, using_copy_on_write): assert dti[1] == ts assert dti.freq == "D" - def test_dt64tz_setitem_does_not_mutate_dti(self, using_copy_on_write): + def test_dt64tz_setitem_does_not_mutate_dti(self): # GH#21907, GH#24096 dti = date_range("2016-01-01", periods=10, tz="US/Pacific") ts = dti[0] ser = Series(dti) assert ser._values is not dti - if using_copy_on_write: - assert ser._values._ndarray.base is dti._data._ndarray.base - assert ser._mgr.arrays[0]._ndarray.base is dti._data._ndarray.base - else: - assert ser._values._ndarray.base is not dti._data._ndarray.base - assert ser._mgr.arrays[0]._ndarray.base is not dti._data._ndarray.base + assert ser._values._ndarray.base is dti._data._ndarray.base + assert ser._mgr.blocks[0].values._ndarray.base is dti._data._ndarray.base - assert ser._mgr.arrays[0] is not dti + assert ser._mgr.blocks[0].values is not dti ser[::3] = NaT assert ser[0] is NaT @@ -501,12 +474,14 @@ def test_setitem_callable_other(self): class TestSetitemWithExpansion: - def test_setitem_empty_series(self): + def test_setitem_empty_series(self, using_infer_string): # GH#10193 key = Timestamp("2012-01-01") series = Series(dtype=object) series[key] = 47 - expected = Series(47, [key]) + expected = Series( + 47, index=[key] if using_infer_string else Index([key], dtype=object) + ) tm.assert_series_equal(series, expected) def test_setitem_empty_series_datetimeindex_preserves_freq(self): @@ -584,39 +559,42 @@ def test_setitem_keep_precision(self, any_numeric_ea_dtype): tm.assert_series_equal(ser, expected) @pytest.mark.parametrize( - "na, target_na, dtype, target_dtype, indexer, warn", + "na, target_na, dtype, target_dtype, indexer, raises", [ - (NA, NA, "Int64", "Int64", 1, None), - (NA, NA, "Int64", "Int64", 2, None), - (NA, np.nan, "int64", "float64", 1, None), - (NA, np.nan, "int64", "float64", 2, None), - (NaT, NaT, "int64", "object", 1, FutureWarning), - (NaT, NaT, "int64", "object", 2, None), - (np.nan, NA, "Int64", "Int64", 1, None), - (np.nan, NA, "Int64", "Int64", 2, None), - (np.nan, NA, "Float64", "Float64", 1, None), - (np.nan, NA, "Float64", "Float64", 2, None), - (np.nan, np.nan, "int64", "float64", 1, None), - (np.nan, np.nan, "int64", "float64", 2, None), + (NA, NA, "Int64", "Int64", 1, False), + (NA, NA, "Int64", "Int64", 2, False), + (NA, np.nan, "int64", "float64", 1, False), + (NA, np.nan, "int64", "float64", 2, False), + (NaT, NaT, "int64", "object", 1, True), + (NaT, NaT, "int64", "object", 2, False), + (np.nan, NA, "Int64", "Int64", 1, False), + (np.nan, NA, "Int64", "Int64", 2, False), + (np.nan, NA, "Float64", "Float64", 1, False), + 
(np.nan, NA, "Float64", "Float64", 2, False), + (np.nan, np.nan, "int64", "float64", 1, False), + (np.nan, np.nan, "int64", "float64", 2, False), ], ) def test_setitem_enlarge_with_na( - self, na, target_na, dtype, target_dtype, indexer, warn + self, na, target_na, dtype, target_dtype, indexer, raises ): # GH#32346 ser = Series([1, 2], dtype=dtype) - with tm.assert_produces_warning(warn, match="incompatible dtype"): + if raises: + with pytest.raises(TypeError, match="Invalid value"): + ser[indexer] = na + else: ser[indexer] = na - expected_values = [1, target_na] if indexer == 1 else [1, 2, target_na] - expected = Series(expected_values, dtype=target_dtype) - tm.assert_series_equal(ser, expected) + expected_values = [1, target_na] if indexer == 1 else [1, 2, target_na] + expected = Series(expected_values, dtype=target_dtype) + tm.assert_series_equal(ser, expected) def test_setitem_enlargement_object_none(self, nulls_fixture, using_infer_string): # GH#48665 ser = Series(["a", "b"]) ser[3] = nulls_fixture dtype = ( - "string[pyarrow_numpy]" + "str" if using_infer_string and not isinstance(nulls_fixture, Decimal) else object ) @@ -703,14 +681,8 @@ def test_setitem_non_bool_into_bool(self, val, indexer_sli, unique): if not unique: ser.index = [1, 1] - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): indexer_sli(ser)[1] = val - assert type(ser.iloc[1]) == type(val) - - expected = Series([True, val], dtype=object, index=ser.index) - if not unique and indexer_sli is not tm.iloc: - expected = Series([val, val], dtype=object, index=[1, 1]) - tm.assert_series_equal(ser, expected) def test_setitem_boolean_array_into_npbool(self): # GH#45462 @@ -721,10 +693,8 @@ def test_setitem_boolean_array_into_npbool(self): ser[:2] = arr[:2] # no NAs -> can set inplace assert ser._values is values - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser[1:] = arr[1:] # has an NA -> cast to boolean dtype - expected = Series(arr) - tm.assert_series_equal(ser, expected) class SetitemCastingEquivalents: @@ -768,64 +738,72 @@ def _check_inplace(self, is_inplace, orig, arr, obj): # otherwise original array should be unchanged tm.assert_equal(arr, orig._values) - def test_int_key(self, obj, key, expected, warn, val, indexer_sli, is_inplace): + def test_int_key(self, obj, key, expected, raises, val, indexer_sli, is_inplace): if not isinstance(key, int): pytest.skip("Not relevant for int key") + if raises: + ctx = pytest.raises(TypeError, match="Invalid value") + else: + ctx = contextlib.nullcontext() - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, key, expected, val, indexer_sli, is_inplace) if indexer_sli is tm.loc: - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, key, expected, val, tm.at, is_inplace) elif indexer_sli is tm.iloc: - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, key, expected, val, tm.iat, is_inplace) rng = range(key, key + 1) - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, rng, expected, val, indexer_sli, is_inplace) if indexer_sli is not tm.loc: # Note: no .loc because that handles slice edges differently slc = slice(key, key + 1) - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: 
self.check_indexer(obj, slc, expected, val, indexer_sli, is_inplace) ilkey = [key] - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, ilkey, expected, val, indexer_sli, is_inplace) indkey = np.array(ilkey) - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, indkey, expected, val, indexer_sli, is_inplace) genkey = (x for x in [key]) - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, genkey, expected, val, indexer_sli, is_inplace) - def test_slice_key(self, obj, key, expected, warn, val, indexer_sli, is_inplace): + def test_slice_key(self, obj, key, expected, raises, val, indexer_sli, is_inplace): if not isinstance(key, slice): pytest.skip("Not relevant for slice key") + if raises: + ctx = pytest.raises(TypeError, match="Invalid value") + else: + ctx = contextlib.nullcontext() if indexer_sli is not tm.loc: # Note: no .loc because that handles slice edges differently - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, key, expected, val, indexer_sli, is_inplace) ilkey = list(range(len(obj)))[key] - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, ilkey, expected, val, indexer_sli, is_inplace) indkey = np.array(ilkey) - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, indkey, expected, val, indexer_sli, is_inplace) genkey = (x for x in indkey) - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, genkey, expected, val, indexer_sli, is_inplace) - def test_mask_key(self, obj, key, expected, warn, val, indexer_sli): + def test_mask_key(self, obj, key, expected, raises, val, indexer_sli): # setitem with boolean mask mask = np.zeros(obj.shape, dtype=bool) mask[key] = True @@ -838,11 +816,13 @@ def test_mask_key(self, obj, key, expected, warn, val, indexer_sli): indexer_sli(obj)[mask] = val return - with tm.assert_produces_warning(warn, match="incompatible dtype"): + if raises: + with pytest.raises(TypeError, match="Invalid value"): + indexer_sli(obj)[mask] = val + else: indexer_sli(obj)[mask] = val - tm.assert_series_equal(obj, expected) - def test_series_where(self, obj, key, expected, warn, val, is_inplace): + def test_series_where(self, obj, key, expected, raises, val, is_inplace): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True @@ -869,32 +849,24 @@ def test_series_where(self, obj, key, expected, warn, val, is_inplace): self._check_inplace(is_inplace, orig, arr, obj) - def test_index_where(self, obj, key, expected, warn, val, using_infer_string): + def test_index_where(self, obj, key, expected, raises, val): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True - if using_infer_string and obj.dtype == object: - with pytest.raises(TypeError, match="Scalar must"): - Index(obj).where(~mask, val) - else: - res = Index(obj).where(~mask, val) - expected_idx = Index(expected, dtype=expected.dtype) - tm.assert_index_equal(res, expected_idx) + res = Index(obj).where(~mask, val) + expected_idx = Index(expected, dtype=expected.dtype) + tm.assert_index_equal(res, expected_idx) - def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string): + def test_index_putmask(self, obj, key, expected, raises, val): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True - if using_infer_string and obj.dtype == object: - 
with pytest.raises(TypeError, match="Scalar must"): - Index(obj).putmask(mask, val) - else: - res = Index(obj).putmask(mask, val) - tm.assert_index_equal(res, Index(expected, dtype=expected.dtype)) + res = Index(obj).putmask(mask, val) + tm.assert_index_equal(res, Index(expected, dtype=expected.dtype)) @pytest.mark.parametrize( - "obj,expected,key,warn", + "obj,expected,key,raises", [ pytest.param( # GH#45568 setting a valid NA value into IntervalDtype[int] should @@ -905,7 +877,7 @@ def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string): dtype="interval[float64]", ), 1, - FutureWarning, + True, id="interval_int_na_value", ), pytest.param( @@ -913,14 +885,14 @@ def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string): Series([2, 3, 4, 5, 6, 7, 8, 9, 10]), Series([np.nan, 3, np.nan, 5, np.nan, 7, np.nan, 9, np.nan]), slice(None, None, 2), - None, + False, id="int_series_slice_key_step", ), pytest.param( Series([True, True, False, False]), Series([np.nan, True, np.nan, False], dtype=object), slice(None, None, 2), - FutureWarning, + True, id="bool_series_slice_key_step", ), pytest.param( @@ -928,7 +900,7 @@ def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string): Series(np.arange(10)), Series([np.nan, np.nan, np.nan, np.nan, np.nan, 5, 6, 7, 8, 9]), slice(None, 5), - None, + False, id="int_series_slice_key", ), pytest.param( @@ -936,7 +908,7 @@ def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string): Series([1, 2, 3]), Series([np.nan, 2, 3]), 0, - None, + False, id="int_series_int_key", ), pytest.param( @@ -945,7 +917,7 @@ def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string): Series([np.nan], dtype=object), # TODO: maybe go to float64 since we are changing the _whole_ Series? 0, - FutureWarning, + True, id="bool_series_int_key_change_all", ), pytest.param( @@ -953,7 +925,7 @@ def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string): Series([False, True]), Series([np.nan, True], dtype=object), 0, - FutureWarning, + True, id="bool_series_int_key", ), ], @@ -1003,8 +975,8 @@ def key(self): return 0 @pytest.fixture - def warn(self): - return FutureWarning + def raises(self): + return True class TestSetitemDT64IntoInt(SetitemCastingEquivalents): @@ -1043,8 +1015,8 @@ def val(self, scalar, request): return box([scalar, scalar]) @pytest.fixture - def warn(self): - return FutureWarning + def raises(self): + return True class TestSetitemNAPeriodDtype(SetitemCastingEquivalents): @@ -1070,8 +1042,8 @@ def val(self, request): return request.param @pytest.fixture - def warn(self): - return None + def raises(self): + return False class TestSetitemNADatetimeLikeDtype(SetitemCastingEquivalents): @@ -1123,8 +1095,8 @@ def key(self): return 0 @pytest.fixture - def warn(self, is_inplace): - return None if is_inplace else FutureWarning + def raises(self, is_inplace): + return False if is_inplace else True class TestSetitemMismatchedTZCastsToObject(SetitemCastingEquivalents): @@ -1155,24 +1127,23 @@ def expected(self, obj, val): return expected @pytest.fixture - def warn(self): - return None + def raises(self): + return False @pytest.mark.parametrize( - "obj,expected,warn", + "obj,expected", [ # For numeric series, we should coerce to NaN. 
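        # (an int64 Series cannot hold NaN, so the expected result upcasts to float64)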
- (Series([1, 2, 3]), Series([np.nan, 2, 3]), None), - (Series([1.0, 2.0, 3.0]), Series([np.nan, 2.0, 3.0]), None), + (Series([1, 2, 3]), Series([np.nan, 2, 3])), + (Series([1.0, 2.0, 3.0]), Series([np.nan, 2.0, 3.0])), # For datetime series, we should coerce to NaT. ( Series([datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]), Series([NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)]), - None, ), # For objects, we should preserve the None value. - (Series(["foo", "bar", "baz"]), Series([None, "bar", "baz"]), None), + (Series(["foo", "bar", "baz"]), Series([None, "bar", "baz"])), ], ) class TestSeriesNoneCoercion(SetitemCastingEquivalents): @@ -1184,6 +1155,10 @@ def key(self): def val(self): return None + @pytest.fixture + def raises(self): + return False + class TestSetitemFloatIntervalWithIntIntervalValues(SetitemCastingEquivalents): # GH#44201 Cast to shared IntervalDtype rather than object @@ -1194,34 +1169,46 @@ def test_setitem_example(self): obj = Series(idx) val = Interval(0.5, 1.5) - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): obj[0] = val - assert obj.dtype == "Interval[float64, right]" @pytest.fixture def obj(self): + """ + Fixture to create a Series [(0, 1], (1, 2], (2, 3]] + """ idx = IntervalIndex.from_breaks(range(4)) return Series(idx) @pytest.fixture def val(self): + """ + Fixture to get an interval (0.5, 1.5] + """ return Interval(0.5, 1.5) @pytest.fixture def key(self): + """ + Fixture to get a key 0 + """ return 0 @pytest.fixture def expected(self, obj, val): + """ + Fixture to get a Series [(0.5, 1.5], (1.0, 2.0], (2.0, 3.0]] + """ data = [val] + list(obj[1:]) idx = IntervalIndex(data, dtype="Interval[float64]") return Series(idx) @pytest.fixture - def warn(self): - return FutureWarning + def raises(self): + """ + Fixture to enable raising pytest exceptions + """ + return True class TestSetitemRangeIntoIntegerSeries(SetitemCastingEquivalents): @@ -1249,18 +1236,18 @@ def expected(self, any_int_numpy_dtype): return exp @pytest.fixture - def warn(self): - return None + def raises(self): + return False @pytest.mark.parametrize( - "val, warn", + "val, raises", [ - (np.array([2.0, 3.0]), None), - (np.array([2.5, 3.5]), FutureWarning), + (np.array([2.0, 3.0]), False), + (np.array([2.5, 3.5]), True), ( np.array([2**65, 2**65 + 1], dtype=np.float64), - FutureWarning, + True, ), # all ints, but can't cast ], ) @@ -1300,8 +1287,8 @@ def expected(self): return Series([1, 512, 3], dtype=np.int16) @pytest.fixture - def warn(self): - return FutureWarning + def raises(self): + return True @pytest.mark.parametrize("val", [2**33 + 1.0, 2**33 + 1.1, 2**62]) @@ -1324,8 +1311,8 @@ def expected(self, val): return Series([val, 2, 3], dtype=dtype) @pytest.fixture - def warn(self): - return FutureWarning + def raises(self): + return True class CoercionTest(SetitemCastingEquivalents): @@ -1343,8 +1330,8 @@ def expected(self, obj, key, val, exp_dtype): @pytest.mark.parametrize( - "val,exp_dtype,warn", - [(np.int32(1), np.int8, None), (np.int16(2**9), np.int16, FutureWarning)], + "val,exp_dtype,raises", + [(np.int32(1), np.int8, None), (np.int16(2**9), np.int16, True)], ) class TestCoercionInt8(CoercionTest): # previously test_setitem_series_int8 in tests.indexing.test_coercion @@ -1362,17 +1349,30 @@ def obj(self): return Series(["a", "b", "c", "d"], dtype=object) @pytest.fixture - def warn(self): - return None + def raises(self): + return False + + 
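+# Coverage for the NaN-backed StringDtype: setting an in-dtype value ("e")
+# works in place, while an int would require object dtype and so now raises.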
+@pytest.mark.parametrize( + "val,exp_dtype,raises", + [ + (1, object, True), + ("e", StringDtype(na_value=np.nan), False), + ], +) +class TestCoercionString(CoercionTest): + @pytest.fixture + def obj(self): + return Series(["a", "b", "c", "d"], dtype=StringDtype(na_value=np.nan)) @pytest.mark.parametrize( - "val,exp_dtype,warn", + "val,exp_dtype,raises", [ - (1, np.complex128, None), - (1.1, np.complex128, None), - (1 + 1j, np.complex128, None), - (True, object, FutureWarning), + (1, np.complex128, False), + (1.1, np.complex128, False), + (1 + 1j, np.complex128, False), + (True, object, True), ], ) class TestCoercionComplex(CoercionTest): @@ -1383,14 +1383,14 @@ def obj(self): @pytest.mark.parametrize( - "val,exp_dtype,warn", + "val,exp_dtype,raises", [ - (1, object, FutureWarning), - ("3", object, FutureWarning), - (3, object, FutureWarning), - (1.1, object, FutureWarning), - (1 + 1j, object, FutureWarning), - (True, bool, None), + (1, object, True), + ("3", object, True), + (3, object, True), + (1.1, object, True), + (1 + 1j, object, True), + (True, bool, False), ], ) class TestCoercionBool(CoercionTest): @@ -1401,12 +1401,12 @@ def obj(self): @pytest.mark.parametrize( - "val,exp_dtype,warn", + "val,exp_dtype,raises", [ - (1, np.int64, None), - (1.1, np.float64, FutureWarning), - (1 + 1j, np.complex128, FutureWarning), - (True, object, FutureWarning), + (1, np.int64, False), + (1.1, np.float64, True), + (1 + 1j, np.complex128, True), + (True, object, True), ], ) class TestCoercionInt64(CoercionTest): @@ -1417,12 +1417,12 @@ def obj(self): @pytest.mark.parametrize( - "val,exp_dtype,warn", + "val,exp_dtype,raises", [ - (1, np.float64, None), - (1.1, np.float64, None), - (1 + 1j, np.complex128, FutureWarning), - (True, object, FutureWarning), + (1, np.float64, False), + (1.1, np.float64, False), + (1 + 1j, np.complex128, True), + (True, object, True), ], ) class TestCoercionFloat64(CoercionTest): @@ -1433,32 +1433,35 @@ def obj(self): @pytest.mark.parametrize( - "val,exp_dtype,warn", + "val,exp_dtype,raises", [ - (1, np.float32, None), + (1, np.float32, False), pytest.param( 1.1, np.float32, - None, + False, marks=pytest.mark.xfail( ( not np_version_gte1p24 - or (np_version_gte1p24 and np._get_promotion_state() != "weak") + or ( + np_version_gte1p24 + and os.environ.get("NPY_PROMOTION_STATE", "weak") != "weak" + ) ), reason="np.float32(1.1) ends up as 1.100000023841858, so " "np_can_hold_element raises and we cast to float64", ), ), - (1 + 1j, np.complex128, FutureWarning), - (True, object, FutureWarning), - (np.uint8(2), np.float32, None), - (np.uint32(2), np.float32, None), + (1 + 1j, np.complex128, True), + (True, object, True), + (np.uint8(2), np.float32, False), + (np.uint32(2), np.float32, False), # float32 cannot hold np.iinfo(np.uint32).max exactly # (closest it can hold is 4294967300.0 which off by 5.0), so # we cast to float64 - (np.uint32(np.iinfo(np.uint32).max), np.float64, FutureWarning), - (np.uint64(2), np.float32, None), - (np.int64(2), np.float32, None), + (np.uint32(np.iinfo(np.uint32).max), np.float64, True), + (np.uint64(2), np.float32, False), + (np.int64(2), np.float32, False), ], ) class TestCoercionFloat32(CoercionTest): @@ -1466,8 +1469,8 @@ class TestCoercionFloat32(CoercionTest): def obj(self): return Series([1.1, 2.2, 3.3, 4.4], dtype=np.float32) - def test_slice_key(self, obj, key, expected, warn, val, indexer_sli, is_inplace): - super().test_slice_key(obj, key, expected, warn, val, indexer_sli, is_inplace) + def test_slice_key(self, obj, key, expected, 
raises, val, indexer_sli, is_inplace): + super().test_slice_key(obj, key, expected, raises, val, indexer_sli, is_inplace) if isinstance(val, float): # the xfail would xpass bc test_slice_key short-circuits @@ -1475,11 +1478,44 @@ def test_slice_key(self, obj, key, expected, warn, val, indexer_sli, is_inplace) @pytest.mark.parametrize( - "val,exp_dtype,warn", + "exp_dtype", [ - (Timestamp("2012-01-01"), "datetime64[ns]", None), - (1, object, FutureWarning), - ("x", object, FutureWarning), + "M8[ms]", + "M8[ms, UTC]", + "m8[ms]", + ], +) +class TestCoercionDatetime64HigherReso(CoercionTest): + @pytest.fixture + def obj(self, exp_dtype): + idx = date_range("2011-01-01", freq="D", periods=4, unit="s") + if exp_dtype == "m8[ms]": + idx = idx - Timestamp("1970-01-01") + assert idx.dtype == "m8[s]" + elif exp_dtype == "M8[ms, UTC]": + idx = idx.tz_localize("UTC") + return Series(idx) + + @pytest.fixture + def val(self, exp_dtype): + ts = Timestamp("2011-01-02 03:04:05.678").as_unit("ms") + if exp_dtype == "m8[ms]": + return ts - Timestamp("1970-01-01") + elif exp_dtype == "M8[ms, UTC]": + return ts.tz_localize("UTC") + return ts + + @pytest.fixture + def raises(self): + return True + + +@pytest.mark.parametrize( + "val,exp_dtype,raises", + [ + (Timestamp("2012-01-01"), "datetime64[ns]", False), + (1, object, True), + ("x", object, True), ], ) class TestCoercionDatetime64(CoercionTest): @@ -1490,18 +1526,18 @@ def obj(self): return Series(date_range("2011-01-01", freq="D", periods=4)) @pytest.fixture - def warn(self): - return None + def raises(self): + return False @pytest.mark.parametrize( - "val,exp_dtype,warn", + "val,exp_dtype,raises", [ - (Timestamp("2012-01-01", tz="US/Eastern"), "datetime64[ns, US/Eastern]", None), + (Timestamp("2012-01-01", tz="US/Eastern"), "datetime64[ns, US/Eastern]", False), # pre-2.0, a mis-matched tz would end up casting to object - (Timestamp("2012-01-01", tz="US/Pacific"), "datetime64[ns, US/Eastern]", None), - (Timestamp("2012-01-01"), object, FutureWarning), - (1, object, FutureWarning), + (Timestamp("2012-01-01", tz="US/Pacific"), "datetime64[ns, US/Eastern]", False), + (Timestamp("2012-01-01"), object, True), + (1, object, True), ], ) class TestCoercionDatetime64TZ(CoercionTest): @@ -1512,16 +1548,16 @@ def obj(self): return Series(date_range("2011-01-01", freq="D", periods=4, tz=tz)) @pytest.fixture - def warn(self): - return None + def raises(self): + return False @pytest.mark.parametrize( - "val,exp_dtype,warn", + "val,exp_dtype,raises", [ - (Timedelta("12 day"), "timedelta64[ns]", None), - (1, object, FutureWarning), - ("x", object, FutureWarning), + (Timedelta("12 day"), "timedelta64[ns]", False), + (1, object, True), + ("x", object, True), ], ) class TestCoercionTimedelta64(CoercionTest): @@ -1531,8 +1567,8 @@ def obj(self): return Series(timedelta_range("1 day", periods=4)) @pytest.fixture - def warn(self): - return None + def raises(self): + return False @pytest.mark.parametrize( @@ -1551,63 +1587,45 @@ def obj(self, request): return Series(request.param) @pytest.fixture - def warn(self): - return FutureWarning + def raises(self): + return True def test_20643(): # closed by GH#45121 orig = Series([0, 1, 2], index=["a", "b", "c"]) - expected = Series([0, 2.7, 2], index=["a", "b", "c"]) - ser = orig.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser.at["b"] = 2.7 - tm.assert_series_equal(ser, expected) - ser = orig.copy() - with 
tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser.loc["b"] = 2.7 - tm.assert_series_equal(ser, expected) - ser = orig.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser["b"] = 2.7 - tm.assert_series_equal(ser, expected) ser = orig.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser.iat[1] = 2.7 - tm.assert_series_equal(ser, expected) - ser = orig.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser.iloc[1] = 2.7 - tm.assert_series_equal(ser, expected) orig_df = orig.to_frame("A") - expected_df = expected.to_frame("A") df = orig_df.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.at["b", "A"] = 2.7 - tm.assert_frame_equal(df, expected_df) - df = orig_df.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.loc["b", "A"] = 2.7 - tm.assert_frame_equal(df, expected_df) - df = orig_df.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.iloc[1, 0] = 2.7 - tm.assert_frame_equal(df, expected_df) - df = orig_df.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.iat[1, 0] = 2.7 - tm.assert_frame_equal(df, expected_df) def test_20643_comment(): @@ -1629,35 +1647,23 @@ def test_15413(): # fixed by GH#45121 ser = Series([1, 2, 3]) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser[ser == 2] += 0.5 - expected = Series([1, 2.5, 3]) - tm.assert_series_equal(ser, expected) - ser = Series([1, 2, 3]) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser[1] += 0.5 - tm.assert_series_equal(ser, expected) - ser = Series([1, 2, 3]) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser.loc[1] += 0.5 - tm.assert_series_equal(ser, expected) - ser = Series([1, 2, 3]) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser.iloc[1] += 0.5 - tm.assert_series_equal(ser, expected) - ser = Series([1, 2, 3]) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser.iat[1] += 0.5 - tm.assert_series_equal(ser, expected) - ser = Series([1, 2, 3]) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser.at[1] += 0.5 - tm.assert_series_equal(ser, expected) def test_32878_int_itemsize(): @@ -1665,10 +1671,8 @@ def test_32878_int_itemsize(): arr = np.arange(5).astype("i4") ser = Series(arr) val = np.int64(np.iinfo(np.int64).max) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser[0] = val - expected = Series([val, 1, 2, 3, 4], 
dtype=np.int64) - tm.assert_series_equal(ser, expected) def test_32878_complex_itemsize(): @@ -1678,20 +1682,15 @@ def test_32878_complex_itemsize(): val = val.astype("c16") # GH#32878 used to coerce val to inf+0.000000e+00j - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser[0] = val - assert ser[0] == val - expected = Series([val, 1, 2, 3, 4], dtype="c16") - tm.assert_series_equal(ser, expected) def test_37692(indexer_al): # GH#37692 ser = Series([1, 2, 3], index=["a", "b", "c"]) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): indexer_al(ser)["b"] = "test" - expected = Series([1, "test", 3], index=["a", "b", "c"], dtype=object) - tm.assert_series_equal(ser, expected) def test_setitem_bool_int_float_consistency(indexer_sli): @@ -1701,14 +1700,12 @@ def test_setitem_bool_int_float_consistency(indexer_sli): # as the setitem can be done losslessly for dtype in [np.float64, np.int64]: ser = Series(0, index=range(3), dtype=dtype) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): indexer_sli(ser)[0] = True - assert ser.dtype == object ser = Series(0, index=range(3), dtype=bool) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser[0] = dtype(1) - assert ser.dtype == object # 1.0 can be held losslessly, so no casting ser = Series(0, index=range(3), dtype=np.int64) @@ -1723,24 +1720,24 @@ def test_setitem_bool_int_float_consistency(indexer_sli): def test_setitem_positional_with_casting(): # GH#45070 case where in __setitem__ we get a KeyError, then when # we fallback we *also* get a ValueError if we try to set inplace. 
+ # As of 3.0 we always treat int keys as labels, so this becomes + # setitem-with-expansion ser = Series([1, 2, 3], index=["a", "b", "c"]) - warn_msg = "Series.__setitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - ser[0] = "X" - expected = Series(["X", 2, 3], index=["a", "b", "c"], dtype=object) + ser[0] = "X" + expected = Series([1, 2, 3, "X"], index=["a", "b", "c", 0], dtype=object) tm.assert_series_equal(ser, expected) def test_setitem_positional_float_into_int_coerces(): # Case where we hit a KeyError and then trying to set in-place incorrectly - # casts a float to an int + # casts a float to an int; + # As of 3.0 we always treat int keys as labels, so this becomes + # setitem-with-expansion ser = Series([1, 2, 3], index=["a", "b", "c"]) - warn_msg = "Series.__setitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - ser[0] = 1.5 - expected = Series([1.5, 2, 3], index=["a", "b", "c"]) + ser[0] = 1.5 + expected = Series([1, 2, 3, 1.5], index=["a", "b", "c", 0]) tm.assert_series_equal(ser, expected) @@ -1799,11 +1796,7 @@ def test_setitem_with_bool_indexer(): @pytest.mark.parametrize( "item", [2.0, np.nan, np.finfo(float).max, np.finfo(float).min] ) -# Test numpy arrays, lists and tuples as the input to be -# broadcast -@pytest.mark.parametrize( - "box", [lambda x: np.array([x]), lambda x: [x], lambda x: (x,)] -) +@pytest.mark.parametrize("box", [np.array, list, tuple]) def test_setitem_bool_indexer_dont_broadcast_length1_values(size, mask, item, box): # GH#44265 # see also tests.series.indexing.test_where.test_broadcast @@ -1821,11 +1814,11 @@ def test_setitem_bool_indexer_dont_broadcast_length1_values(size, mask, item, bo ) with pytest.raises(ValueError, match=msg): # GH#44265 - ser[selection] = box(item) + ser[selection] = box([item]) else: # In this corner case setting is equivalent to setting with the unboxed # item - ser[selection] = box(item) + ser[selection] = box([item]) expected = Series(np.arange(size, dtype=float)) expected[selection] = item diff --git a/pandas/tests/series/indexing/test_where.py b/pandas/tests/series/indexing/test_where.py index c978481ca9988..663ee8ad0ee38 100644 --- a/pandas/tests/series/indexing/test_where.py +++ b/pandas/tests/series/indexing/test_where.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype - from pandas.core.dtypes.common import is_integer import pandas as pd @@ -55,15 +53,13 @@ def test_where_unsafe_upcast(dtype, expected_dtype): s = Series(np.arange(10), dtype=dtype) values = [2.5, 3.5, 4.5, 5.5, 6.5] mask = s < 5 - expected = Series(values + list(range(5, 10)), dtype=expected_dtype) - warn = ( - None - if np.dtype(dtype).kind == np.dtype(expected_dtype).kind == "f" - else FutureWarning - ) - with tm.assert_produces_warning(warn, match="incompatible dtype"): + if np.dtype(dtype).kind == np.dtype(expected_dtype).kind == "f": s[mask] = values - tm.assert_series_equal(s, expected) + expected = Series(values + list(range(5, 10)), dtype=expected_dtype) + tm.assert_series_equal(s, expected) + else: + with pytest.raises(TypeError, match="Invalid value"): + s[mask] = values def test_where_unsafe(): @@ -74,9 +70,10 @@ def test_where_unsafe(): mask = s > 5 expected = Series(list(range(6)) + values, dtype="float64") - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): s[mask] = values - 
tm.assert_series_equal(s, expected) + s = s.astype("float64") + s[mask] = values # see gh-3235 s = Series(np.arange(10), dtype="int64") @@ -201,7 +198,7 @@ def test_where_invalid_input(cond): s = Series([1, 2, 3]) msg = "Boolean array expected for the condition" - with pytest.raises(ValueError, match=msg): + with pytest.raises(TypeError, match=msg): s.where(cond) msg = "Array conditional must be same shape as self" @@ -232,7 +229,6 @@ def test_where_ndframe_align(): tm.assert_series_equal(out, expected) -@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't set ints into string") def test_where_setitem_invalid(): # GH 2702 # make sure correct exceptions are raised on invalid list assignment @@ -242,7 +238,7 @@ def test_where_setitem_invalid(): "different length than the value" ) # slice - s = Series(list("abc")) + s = Series(list("abc"), dtype=object) with pytest.raises(ValueError, match=msg("slice")): s[0:3] = list(range(27)) @@ -252,18 +248,18 @@ def test_where_setitem_invalid(): tm.assert_series_equal(s.astype(np.int64), expected) # slice with step - s = Series(list("abcdef")) + s = Series(list("abcdef"), dtype=object) with pytest.raises(ValueError, match=msg("slice")): s[0:4:2] = list(range(27)) - s = Series(list("abcdef")) + s = Series(list("abcdef"), dtype=object) s[0:4:2] = list(range(2)) expected = Series([0, "b", 1, "d", "e", "f"]) tm.assert_series_equal(s, expected) # neg slices - s = Series(list("abcdef")) + s = Series(list("abcdef"), dtype=object) with pytest.raises(ValueError, match=msg("slice")): s[:-1] = list(range(27)) @@ -273,18 +269,18 @@ def test_where_setitem_invalid(): tm.assert_series_equal(s, expected) # list - s = Series(list("abc")) + s = Series(list("abc"), dtype=object) with pytest.raises(ValueError, match=msg("list-like")): s[[0, 1, 2]] = list(range(27)) - s = Series(list("abc")) + s = Series(list("abc"), dtype=object) with pytest.raises(ValueError, match=msg("list-like")): s[[0, 1, 2]] = list(range(2)) # scalar - s = Series(list("abc")) + s = Series(list("abc"), dtype=object) s[0] = list(range(10)) expected = Series([list(range(10)), "b", "c"]) tm.assert_series_equal(s, expected) @@ -297,11 +293,7 @@ def test_where_setitem_invalid(): @pytest.mark.parametrize( "item", [2.0, np.nan, np.finfo(float).max, np.finfo(float).min] ) -# Test numpy arrays, lists and tuples as the input to be -# broadcast -@pytest.mark.parametrize( - "box", [lambda x: np.array([x]), lambda x: [x], lambda x: (x,)] -) +@pytest.mark.parametrize("box", [np.array, list, tuple]) def test_broadcast(size, mask, item, box): # GH#8801, GH#4195 selection = np.resize(mask, size) @@ -320,11 +312,11 @@ def test_broadcast(size, mask, item, box): tm.assert_series_equal(s, expected) s = Series(data) - result = s.where(~selection, box(item)) + result = s.where(~selection, box([item])) tm.assert_series_equal(result, expected) s = Series(data) - result = s.mask(selection, box(item)) + result = s.mask(selection, box([item])) tm.assert_series_equal(result, expected) @@ -390,34 +382,6 @@ def test_where_numeric_with_string(): assert w.dtype == "object" -@pytest.mark.parametrize("dtype", ["timedelta64[ns]", "datetime64[ns]"]) -def test_where_datetimelike_coerce(dtype): - ser = Series([1, 2], dtype=dtype) - expected = Series([10, 10]) - mask = np.array([False, False]) - - msg = "Downcasting behavior in Series and DataFrame methods 'where'" - with tm.assert_produces_warning(FutureWarning, match=msg): - rs = ser.where(mask, [10, 10]) - tm.assert_series_equal(rs, expected) - - with 
tm.assert_produces_warning(FutureWarning, match=msg): - rs = ser.where(mask, 10) - tm.assert_series_equal(rs, expected) - - with tm.assert_produces_warning(FutureWarning, match=msg): - rs = ser.where(mask, 10.0) - tm.assert_series_equal(rs, expected) - - with tm.assert_produces_warning(FutureWarning, match=msg): - rs = ser.where(mask, [10.0, 10.0]) - tm.assert_series_equal(rs, expected) - - rs = ser.where(mask, [10.0, np.nan]) - expected = Series([10, np.nan], dtype="object") - tm.assert_series_equal(rs, expected) - - def test_where_datetimetz(): # GH 15701 timestamps = ["2016-12-31 12:00:04+00:00", "2016-12-31 12:00:04.010000+00:00"] diff --git a/pandas/tests/series/methods/test_align.py b/pandas/tests/series/methods/test_align.py index cb60cd2e5bcf3..f7e4dbe11f3d3 100644 --- a/pandas/tests/series/methods/test_align.py +++ b/pandas/tests/series/methods/test_align.py @@ -52,44 +52,7 @@ def test_align(datetime_series, first_slice, second_slice, join_type, fill): assert eb.name == "ts" -@pytest.mark.parametrize( - "first_slice,second_slice", - [ - [[2, None], [None, -5]], - [[None, 0], [None, -5]], - [[None, -5], [None, 0]], - [[None, 0], [None, 0]], - ], -) -@pytest.mark.parametrize("method", ["pad", "bfill"]) -@pytest.mark.parametrize("limit", [None, 1]) -def test_align_fill_method( - datetime_series, first_slice, second_slice, join_type, method, limit -): - a = datetime_series[slice(*first_slice)] - b = datetime_series[slice(*second_slice)] - - msg = ( - "The 'method', 'limit', and 'fill_axis' keywords in Series.align " - "are deprecated" - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - aa, ab = a.align(b, join=join_type, method=method, limit=limit) - - join_index = a.index.join(b.index, how=join_type) - ea = a.reindex(join_index) - eb = b.reindex(join_index) - - msg2 = "Series.fillna with 'method' is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg2): - ea = ea.fillna(method=method, limit=limit) - eb = eb.fillna(method=method, limit=limit) - - tm.assert_series_equal(aa, ea) - tm.assert_series_equal(ab, eb) - - -def test_align_nocopy(datetime_series, using_copy_on_write): +def test_align_nocopy(datetime_series): b = datetime_series[:5].copy() # do copy @@ -100,12 +63,9 @@ def test_align_nocopy(datetime_series, using_copy_on_write): # do not copy a = datetime_series.copy() - ra, _ = a.align(b, join="left", copy=False) + ra, _ = a.align(b, join="left") ra[:5] = 5 - if using_copy_on_write: - assert not (a[:5] == 5).any() - else: - assert (a[:5] == 5).all() + assert not (a[:5] == 5).any() # do copy a = datetime_series.copy() @@ -117,24 +77,17 @@ def test_align_nocopy(datetime_series, using_copy_on_write): # do not copy a = datetime_series.copy() b = datetime_series[:5].copy() - _, rb = a.align(b, join="right", copy=False) + _, rb = a.align(b, join="right") rb[:2] = 5 - if using_copy_on_write: - assert not (b[:2] == 5).any() - else: - assert (b[:2] == 5).all() - - -def test_align_same_index(datetime_series, using_copy_on_write): - a, b = datetime_series.align(datetime_series, copy=False) - if not using_copy_on_write: - assert a.index is datetime_series.index - assert b.index is datetime_series.index - else: - assert a.index.is_(datetime_series.index) - assert b.index.is_(datetime_series.index) - - a, b = datetime_series.align(datetime_series, copy=True) + assert not (b[:2] == 5).any() + + +def test_align_same_index(datetime_series): + a, b = datetime_series.align(datetime_series) + assert a.index.is_(datetime_series.index) + assert 
b.index.is_(datetime_series.index) + + a, b = datetime_series.align(datetime_series) assert a.index is not datetime_series.index assert b.index is not datetime_series.index assert a.index.is_(datetime_series.index) @@ -176,22 +129,6 @@ def test_align_multiindex(): tm.assert_series_equal(expr, res2l) -@pytest.mark.parametrize("method", ["backfill", "bfill", "pad", "ffill", None]) -def test_align_with_dataframe_method(method): - # GH31788 - ser = Series(range(3), index=range(3)) - df = pd.DataFrame(0.0, index=range(3), columns=range(3)) - - msg = ( - "The 'method', 'limit', and 'fill_axis' keywords in Series.align " - "are deprecated" - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - result_ser, result_df = ser.align(df, method=method) - tm.assert_series_equal(result_ser, ser) - tm.assert_frame_equal(result_df, df) - - def test_align_dt64tzindex_mismatched_tzs(): idx1 = date_range("2001", periods=5, freq="h", tz="US/Eastern") ser = Series(np.random.default_rng(2).standard_normal(len(idx1)), index=idx1) @@ -211,6 +148,19 @@ def test_align_periodindex(join_type): ts.align(ts[::2], join=join_type) +def test_align_stringindex(any_string_dtype): + left = Series(range(3), index=pd.Index(["a", "b", "d"], dtype=any_string_dtype)) + right = Series(range(3), index=pd.Index(["a", "b", "c"], dtype=any_string_dtype)) + result_left, result_right = left.align(right) + + expected_idx = pd.Index(["a", "b", "c", "d"], dtype=any_string_dtype) + expected_left = Series([0, 1, np.nan, 2], index=expected_idx) + expected_right = Series([0, 1, 2, np.nan], index=expected_idx) + + tm.assert_series_equal(result_left, expected_left) + tm.assert_series_equal(result_right, expected_right) + + def test_align_left_fewer_levels(): # GH#45224 left = Series([2], index=pd.MultiIndex.from_tuples([(1, 3)], names=["a", "c"])) diff --git a/pandas/tests/series/methods/test_argsort.py b/pandas/tests/series/methods/test_argsort.py index 432c0eceee011..019efe8683347 100644 --- a/pandas/tests/series/methods/test_argsort.py +++ b/pandas/tests/series/methods/test_argsort.py @@ -20,21 +20,15 @@ def test_argsort_axis(self): def test_argsort_numpy(self, datetime_series): ser = datetime_series - res = np.argsort(ser).values expected = np.argsort(np.array(ser)) tm.assert_numpy_array_equal(res, expected) - # with missing values - ts = ser.copy() - ts[::2] = np.nan - - msg = "The behavior of Series.argsort in the presence of NA values" - with tm.assert_produces_warning( - FutureWarning, match=msg, check_stacklevel=False - ): - result = np.argsort(ts)[1::2] - expected = np.argsort(np.array(ts.dropna())) + def test_argsort_numpy_missing(self): + data = [0.1, np.nan, 0.2, np.nan, 0.3] + ser = Series(data) + result = np.argsort(ser) + expected = np.argsort(np.array(data)) tm.assert_numpy_array_equal(result.values, expected) @@ -56,10 +50,8 @@ def test_argsort_dt64(self, unit): expected = Series(range(5), dtype=np.intp) tm.assert_series_equal(result, expected) - msg = "The behavior of Series.argsort in the presence of NA values" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = shifted.argsort() - expected = Series(list(range(4)) + [-1], dtype=np.intp) + result = shifted.argsort() + expected = Series(list(range(4)) + [4], dtype=np.intp) tm.assert_series_equal(result, expected) def test_argsort_stable(self): @@ -74,7 +66,7 @@ def test_argsort_stable(self): tm.assert_series_equal(qindexer.astype(np.intp), Series(qexpected)) msg = ( r"ndarray Expected type <class 'numpy\.ndarray'>, " - r"found <class 'pandas\.core\.series\.Series'> instead" + r"found <class 'pandas\.Series'> instead" ) with 
pytest.raises(AssertionError, match=msg): tm.assert_numpy_array_equal(qindexer, mindexer) diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index 46f55fff91e41..4a7e204ee4161 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -76,7 +76,7 @@ def test_astype_dict_like(self, dtype_class): dt1 = dtype_class({"abc": str}) result = ser.astype(dt1) - expected = Series(["0", "2", "4", "6", "8"], name="abc", dtype=object) + expected = Series(["0", "2", "4", "6", "8"], name="abc", dtype="str") tm.assert_series_equal(result, expected) dt2 = dtype_class({"abc": "float64"}) @@ -164,18 +164,23 @@ def test_astype_empty_constructor_equality(self, dtype): @pytest.mark.parametrize("dtype", [str, np.str_]) @pytest.mark.parametrize( - "series", + "data", [ - Series([string.digits * 10, rand_str(63), rand_str(64), rand_str(1000)]), - Series([string.digits * 10, rand_str(63), rand_str(64), np.nan, 1.0]), + [string.digits * 10, rand_str(63), rand_str(64), rand_str(1000)], + [string.digits * 10, rand_str(63), rand_str(64), np.nan, 1.0], ], ) - def test_astype_str_map(self, dtype, series, using_infer_string): + def test_astype_str_map(self, dtype, data, using_infer_string): # see GH#4405 + series = Series(data) + using_string_dtype = using_infer_string and dtype is str result = series.astype(dtype) - expected = series.map(str) - if using_infer_string: - expected = expected.astype(object) + if using_string_dtype: + expected = series.map(lambda val: str(val) if val is not np.nan else np.nan) + else: + expected = series.map(str) + if using_infer_string: + expected = expected.astype(object) tm.assert_series_equal(result, expected) def test_astype_float_to_period(self): @@ -212,7 +217,7 @@ def test_astype_dt64_to_str(self): # GH#10442 : testing astype(str) is correct for Series/DatetimeIndex dti = date_range("2012-01-01", periods=3) result = Series(dti).astype(str) - expected = Series(["2012-01-01", "2012-01-02", "2012-01-03"], dtype=object) + expected = Series(["2012-01-01", "2012-01-02", "2012-01-03"], dtype="str") tm.assert_series_equal(result, expected) def test_astype_dt64tz_to_str(self): @@ -225,7 +230,7 @@ def test_astype_dt64tz_to_str(self): "2012-01-02 00:00:00-05:00", "2012-01-03 00:00:00-05:00", ], - dtype=object, + dtype="str", ) tm.assert_series_equal(result, expected) @@ -285,22 +290,22 @@ def test_astype_str_cast_dt64(self): ts = Series([Timestamp("2010-01-04 00:00:00")]) res = ts.astype(str) - expected = Series(["2010-01-04"], dtype=object) + expected = Series(["2010-01-04"], dtype="str") tm.assert_series_equal(res, expected) ts = Series([Timestamp("2010-01-04 00:00:00", tz="US/Eastern")]) res = ts.astype(str) - expected = Series(["2010-01-04 00:00:00-05:00"], dtype=object) + expected = Series(["2010-01-04 00:00:00-05:00"], dtype="str") tm.assert_series_equal(res, expected) def test_astype_str_cast_td64(self): # see GH#9757 - td = Series([Timedelta(1, unit="d")]) + td = Series([Timedelta(1, unit="D")]) ser = td.astype(str) - expected = Series(["1 days"], dtype=object) + expected = Series(["1 days"], dtype="str") tm.assert_series_equal(ser, expected) def test_dt64_series_astype_object(self): @@ -342,12 +347,11 @@ def test_astype_ignores_errors_for_extension_dtypes(self, data, dtype, errors): with pytest.raises((ValueError, TypeError), match=msg): ser.astype(float, errors=errors) - @pytest.mark.parametrize("dtype", [np.float16, np.float32, np.float64]) - def test_astype_from_float_to_str(self, 
dtype): + def test_astype_from_float_to_str(self, any_float_dtype): # https://github.com/pandas-dev/pandas/issues/36451 - ser = Series([0.1], dtype=dtype) + ser = Series([0.1], dtype=any_float_dtype) result = ser.astype(str) - expected = Series(["0.1"], dtype=object) + expected = Series(["0.1"], dtype="str") tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -358,11 +362,13 @@ def test_astype_from_float_to_str(self, dtype): (NA, ""), ], ) - def test_astype_to_str_preserves_na(self, value, string_value): + def test_astype_to_str_preserves_na(self, value, string_value, using_infer_string): # https://github.com/pandas-dev/pandas/issues/36904 ser = Series(["a", "b", value], dtype=object) result = ser.astype(str) - expected = Series(["a", "b", string_value], dtype=object) + expected = Series( + ["a", "b", None if using_infer_string else string_value], dtype="str" + ) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("dtype", ["float32", "float64", "int64", "int32"]) @@ -374,21 +380,19 @@ def test_astype(self, dtype): assert as_typed.name == ser.name @pytest.mark.parametrize("value", [np.nan, np.inf]) - @pytest.mark.parametrize("dtype", [np.int32, np.int64]) - def test_astype_cast_nan_inf_int(self, dtype, value): + def test_astype_cast_nan_inf_int(self, any_int_numpy_dtype, value): # gh-14265: check NaN and inf raise error when converting to int msg = "Cannot convert non-finite values \\(NA or inf\\) to integer" ser = Series([value]) with pytest.raises(ValueError, match=msg): - ser.astype(dtype) + ser.astype(any_int_numpy_dtype) - @pytest.mark.parametrize("dtype", [int, np.int8, np.int64]) - def test_astype_cast_object_int_fail(self, dtype): + def test_astype_cast_object_int_fail(self, any_int_numpy_dtype): arr = Series(["car", "house", "tree", "1"]) msg = r"invalid literal for int\(\) with base 10: 'car'" with pytest.raises(ValueError, match=msg): - arr.astype(dtype) + arr.astype(any_int_numpy_dtype) def test_astype_float_to_uint_negatives_raise( self, float_numpy_dtype, any_unsigned_int_numpy_dtype @@ -462,13 +466,9 @@ def test_astype_nan_to_bool(self): expected = Series(True, dtype="bool") tm.assert_series_equal(result, expected) - @pytest.mark.parametrize( - "dtype", - tm.ALL_INT_EA_DTYPES + tm.FLOAT_EA_DTYPES, - ) - def test_astype_ea_to_datetimetzdtype(self, dtype): + def test_astype_ea_to_datetimetzdtype(self, any_numeric_ea_dtype): # GH37553 - ser = Series([4, 0, 9], dtype=dtype) + ser = Series([4, 0, 9], dtype=any_numeric_ea_dtype) result = ser.astype(DatetimeTZDtype(tz="US/Pacific")) expected = Series( @@ -538,12 +538,12 @@ def test_astype_categorical_to_other(self): expected = ser tm.assert_series_equal(ser.astype("category"), expected) tm.assert_series_equal(ser.astype(CategoricalDtype()), expected) - msg = r"Cannot cast object|string dtype to float64" + msg = r"Cannot cast object|str dtype to float64" with pytest.raises(ValueError, match=msg): ser.astype("float64") cat = Series(Categorical(["a", "b", "b", "a", "a", "c", "c", "c"])) - exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"], dtype=object) + exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"], dtype="str") tm.assert_series_equal(cat.astype("str"), exp) s2 = Series(Categorical(["1", "2", "3", "4"])) exp2 = Series([1, 2, 3, 4]).astype("int") @@ -673,3 +673,11 @@ def test_astype_timedelta64_with_np_nan(self): result = Series([Timedelta(1), np.nan], dtype="timedelta64[ns]") expected = Series([Timedelta(1), NaT], dtype="timedelta64[ns]") tm.assert_series_equal(result, expected) + + 
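# The cast below goes from a pyarrow-backed nullable integer to a pyarrow + # string dtype: values are stringified element-wise, and NA stays missing + # rather than becoming the literal string "NA". +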
@td.skip_if_no("pyarrow") + def test_astype_int_na_string(self): + # GH#57418 + ser = Series([12, NA], dtype="Int64[pyarrow]") + result = ser.astype("string[pyarrow]") + expected = Series(["12", NA], dtype="string[pyarrow]") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_between.py b/pandas/tests/series/methods/test_between.py index 3913419038876..f035767e2ce0e 100644 --- a/pandas/tests/series/methods/test_between.py +++ b/pandas/tests/series/methods/test_between.py @@ -66,10 +66,9 @@ def test_between_error_args(self, inclusive): left, right = series[[2, 7]] value_error_msg = ( - "Inclusive has to be either string of 'both'," - "'left', 'right', or 'neither'." + "Inclusive has to be either string of 'both','left', 'right', or 'neither'." ) + series = Series(date_range("1/1/2000", periods=10)) with pytest.raises(ValueError, match=value_error_msg): - series = Series(date_range("1/1/2000", periods=10)) series.between(left, right, inclusive=inclusive) diff --git a/pandas/tests/series/methods/test_case_when.py b/pandas/tests/series/methods/test_case_when.py new file mode 100644 index 0000000000000..7cb60a11644a3 --- /dev/null +++ b/pandas/tests/series/methods/test_case_when.py @@ -0,0 +1,148 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Series, + array as pd_array, + date_range, +) +import pandas._testing as tm + + +@pytest.fixture +def df(): + """ + base dataframe for testing + """ + return DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + + +def test_case_when_caselist_is_not_a_list(df): + """ + Raise ValueError if caselist is not a list. + """ + msg = "The caselist argument should be a list; " + msg += "instead got.+" + with pytest.raises(TypeError, match=msg): # GH39154 + df["a"].case_when(caselist=()) + + +def test_case_when_no_caselist(df): + """ + Raise ValueError if no caselist is provided. + """ + msg = "provide at least one boolean condition, " + msg += "with a corresponding replacement." + with pytest.raises(ValueError, match=msg): # GH39154 + df["a"].case_when([]) + + +def test_case_when_odd_caselist(df): + """ + Raise ValueError if no of caselist is odd. + """ + msg = "Argument 0 must have length 2; " + msg += "a condition and replacement; instead got length 3." + + with pytest.raises(ValueError, match=msg): + df["a"].case_when([(df["a"].eq(1), 1, df.a.gt(1))]) + + +def test_case_when_raise_error_from_mask(df): + """ + Raise Error from within Series.mask + """ + msg = "Failed to apply condition0 and replacement0." + with pytest.raises(ValueError, match=msg): + df["a"].case_when([(df["a"].eq(1), [1, 2])]) + + +def test_case_when_single_condition(df): + """ + Test output on a single condition. 
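+ Rows where the condition is False keep their original (NaN) values.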
+ """ + result = Series([np.nan, np.nan, np.nan]).case_when([(df.a.eq(1), 1)]) + expected = Series([1, np.nan, np.nan]) + tm.assert_series_equal(result, expected) + + +def test_case_when_multiple_conditions(df): + """ + Test output when booleans are derived from a computation + """ + result = Series([np.nan, np.nan, np.nan]).case_when( + [(df.a.eq(1), 1), (Series([False, True, False]), 2)] + ) + expected = Series([1, 2, np.nan]) + tm.assert_series_equal(result, expected) + + +def test_case_when_multiple_conditions_replacement_list(df): + """ + Test output when replacement is a list + """ + result = Series([np.nan, np.nan, np.nan]).case_when( + [([True, False, False], 1), (df["a"].gt(1) & df["b"].eq(5), [1, 2, 3])] + ) + expected = Series([1, 2, np.nan]) + tm.assert_series_equal(result, expected) + + +def test_case_when_multiple_conditions_replacement_extension_dtype(df): + """ + Test output when replacement has an extension dtype + """ + result = Series([np.nan, np.nan, np.nan]).case_when( + [ + ([True, False, False], 1), + (df["a"].gt(1) & df["b"].eq(5), pd_array([1, 2, 3], dtype="Int64")), + ], + ) + expected = Series([1, 2, np.nan], dtype="Float64") + tm.assert_series_equal(result, expected) + + +def test_case_when_multiple_conditions_replacement_series(df): + """ + Test output when replacement is a Series + """ + result = Series([np.nan, np.nan, np.nan]).case_when( + [ + (np.array([True, False, False]), 1), + (df["a"].gt(1) & df["b"].eq(5), Series([1, 2, 3])), + ], + ) + expected = Series([1, 2, np.nan]) + tm.assert_series_equal(result, expected) + + +def test_case_when_non_range_index(): + """ + Test output if index is not RangeIndex + """ + rng = np.random.default_rng(seed=123) + dates = date_range("1/1/2000", periods=8) + df = DataFrame( + rng.standard_normal(size=(8, 4)), index=dates, columns=["A", "B", "C", "D"] + ) + result = Series(5, index=df.index, name="A").case_when([(df.A.gt(0), df.B)]) + expected = df.A.mask(df.A.gt(0), df.B).where(df.A.gt(0), 5) + tm.assert_series_equal(result, expected) + + +def test_case_when_callable(): + """ + Test output on a callable + """ + # https://numpy.org/doc/stable/reference/generated/numpy.piecewise.html + x = np.linspace(-2.5, 2.5, 6) + ser = Series(x) + result = ser.case_when( + caselist=[ + (lambda df: df < 0, lambda df: -df), + (lambda df: df >= 0, lambda df: df), + ] + ) + expected = np.piecewise(x, [x < 0, x >= 0], [lambda x: -x, lambda x: x]) + tm.assert_series_equal(result, Series(expected)) diff --git a/pandas/tests/series/methods/test_clip.py b/pandas/tests/series/methods/test_clip.py index 5bbf35f6e39bb..8ed422fc118dc 100644 --- a/pandas/tests/series/methods/test_clip.py +++ b/pandas/tests/series/methods/test_clip.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas.errors import OutOfBoundsDatetime + import pandas as pd from pandas import ( Series, @@ -69,15 +71,11 @@ def test_clip_with_na_args(self): tm.assert_series_equal(s.clip(upper=np.nan, lower=np.nan), Series([1, 2, 3])) # GH#19992 - msg = "Downcasting behavior in Series and DataFrame methods 'where'" - # TODO: avoid this warning here? seems like we should never be upcasting - # in the first place? 
- with tm.assert_produces_warning(FutureWarning, match=msg): - res = s.clip(lower=[0, 4, np.nan]) - tm.assert_series_equal(res, Series([1, 4, 3])) - with tm.assert_produces_warning(FutureWarning, match=msg): - res = s.clip(upper=[1, np.nan, 1]) - tm.assert_series_equal(res, Series([1, 2, 1])) + + res = s.clip(lower=[0, 4, np.nan]) + tm.assert_series_equal(res, Series([1, 4, 3.0])) + res = s.clip(upper=[1, np.nan, 1]) + tm.assert_series_equal(res, Series([1, 2, 1.0])) # GH#40420 s = Series([1, 2, 3]) @@ -135,12 +133,30 @@ def test_clip_with_datetimes(self): ) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("dtype", [object, "M8[us]"]) - def test_clip_with_timestamps_and_oob_datetimes(self, dtype): + def test_clip_with_timestamps_and_oob_datetimes_object(self): # GH-42794 - ser = Series([datetime(1, 1, 1), datetime(9999, 9, 9)], dtype=dtype) + ser = Series([datetime(1, 1, 1), datetime(9999, 9, 9)], dtype=object) result = ser.clip(lower=Timestamp.min, upper=Timestamp.max) - expected = Series([Timestamp.min, Timestamp.max], dtype=dtype) + expected = Series([Timestamp.min, Timestamp.max], dtype=object) + + tm.assert_series_equal(result, expected) + + def test_clip_with_timestamps_and_oob_datetimes_non_nano(self): + # GH#56410 + dtype = "M8[us]" + ser = Series([datetime(1, 1, 1), datetime(9999, 9, 9)], dtype=dtype) + + msg = ( + r"Incompatible \(high-resolution\) value for dtype='datetime64\[us\]'. " + "Explicitly cast before operating" + ) + with pytest.raises(OutOfBoundsDatetime, match=msg): + ser.clip(lower=Timestamp.min, upper=Timestamp.max) + + lower = Timestamp.min.as_unit("us") + upper = Timestamp.max.as_unit("us") + result = ser.clip(lower=lower, upper=upper) + expected = Series([lower, upper], dtype=dtype) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_combine_first.py b/pandas/tests/series/methods/test_combine_first.py index e1ec8afda33a9..51d6704e1905b 100644 --- a/pandas/tests/series/methods/test_combine_first.py +++ b/pandas/tests/series/methods/test_combine_first.py @@ -31,7 +31,7 @@ def test_combine_first_name(self, datetime_series): result = datetime_series.combine_first(datetime_series[:5]) assert result.name == datetime_series.name - def test_combine_first(self): + def test_combine_first(self, using_infer_string): values = np.arange(20, dtype=np.float64) series = Series(values, index=np.arange(20, dtype=np.int64)) @@ -63,11 +63,10 @@ def test_combine_first(self): # corner case ser = Series([1.0, 2, 3], index=[0, 1, 2]) empty = Series([], index=[], dtype=object) - msg = "The behavior of array concatenation with empty entries is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = ser.combine_first(empty) - ser.index = ser.index.astype("O") - tm.assert_series_equal(ser, result) + result = ser.combine_first(empty) + if not using_infer_string: + ser.index = ser.index.astype("O") + tm.assert_series_equal(result, ser.astype(object)) def test_combine_first_dt64(self, unit): s0 = to_datetime(Series(["2010", np.nan])).dt.as_unit(unit) @@ -80,7 +79,7 @@ def test_combine_first_dt64(self, unit): s1 = Series([np.nan, "2011"]) rs = s0.combine_first(s1) - xp = Series([datetime(2010, 1, 1), "2011"], dtype="datetime64[ns]") + xp = Series([datetime(2010, 1, 1), "2011"], dtype=f"datetime64[{unit}]") tm.assert_series_equal(rs, xp) @@ -112,10 +111,8 @@ def test_combine_first_timezone_series_with_empty_series(self): ) s1 = Series(range(10), index=time_index) s2 = Series(index=time_index) - msg = "The behavior 
of array concatenation with empty entries is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = s1.combine_first(s2) - tm.assert_series_equal(result, s1) + result = s1.combine_first(s2) + tm.assert_series_equal(result, s1.astype(np.float64)) def test_combine_first_preserves_dtype(self): # GH51764 diff --git a/pandas/tests/series/methods/test_compare.py b/pandas/tests/series/methods/test_compare.py index fe2016a245ec7..2a57d5139b62c 100644 --- a/pandas/tests/series/methods/test_compare.py +++ b/pandas/tests/series/methods/test_compare.py @@ -5,15 +5,14 @@ import pandas._testing as tm -@pytest.mark.parametrize("align_axis", [0, 1, "index", "columns"]) -def test_compare_axis(align_axis): +def test_compare_axis(axis): # GH#30429 s1 = pd.Series(["a", "b", "c"]) s2 = pd.Series(["x", "b", "z"]) - result = s1.compare(s2, align_axis=align_axis) + result = s1.compare(s2, align_axis=axis) - if align_axis in (1, "columns"): + if axis in (1, "columns"): indices = pd.Index([0, 2]) columns = pd.Index(["self", "other"]) expected = pd.DataFrame( @@ -100,19 +99,19 @@ def test_compare_multi_index(): tm.assert_series_equal(result, expected) -def test_compare_unaligned_objects(): - # test Series with different indices +def test_compare_different_indices(): msg = "Can only compare identically-labeled Series objects" + ser1 = pd.Series([1, 2, 3], index=["a", "b", "c"]) + ser2 = pd.Series([1, 2, 3], index=["a", "b", "d"]) with pytest.raises(ValueError, match=msg): - ser1 = pd.Series([1, 2, 3], index=["a", "b", "c"]) - ser2 = pd.Series([1, 2, 3], index=["a", "b", "d"]) ser1.compare(ser2) - # test Series with different lengths + +def test_compare_different_lengths(): msg = "Can only compare identically-labeled Series objects" + ser1 = pd.Series([1, 2, 3]) + ser2 = pd.Series([1, 2, 3, 4]) with pytest.raises(ValueError, match=msg): - ser1 = pd.Series([1, 2, 3]) - ser2 = pd.Series([1, 2, 3, 4]) ser1.compare(ser2) diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index b0a920ba02cad..324e03894e92c 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -4,191 +4,185 @@ import pytest from pandas._libs import lib +import pandas.util._test_decorators as td import pandas as pd import pandas._testing as tm -# Each test case consists of a tuple with the data and dtype to create the -# test Series, the default dtype for the expected result (which is valid -# for most cases), and the specific cases where the result deviates from -# this default. Those overrides are defined as a dict with (keyword, val) as -# dictionary key. In case of multiple items, the last override takes precedence. 
- - -@pytest.fixture( - params=[ - ( - # data - [1, 2, 3], - # original dtype - np.dtype("int32"), - # default expected dtype - "Int32", - # exceptions on expected dtype - {("convert_integer", False): np.dtype("int32")}, - ), - ( - [1, 2, 3], - np.dtype("int64"), - "Int64", - {("convert_integer", False): np.dtype("int64")}, - ), - ( - ["x", "y", "z"], - np.dtype("O"), - pd.StringDtype(), - {("convert_string", False): np.dtype("O")}, - ), - ( - [True, False, np.nan], - np.dtype("O"), - pd.BooleanDtype(), - {("convert_boolean", False): np.dtype("O")}, - ), - ( - ["h", "i", np.nan], - np.dtype("O"), - pd.StringDtype(), - {("convert_string", False): np.dtype("O")}, - ), - ( # GH32117 - ["h", "i", 1], - np.dtype("O"), - np.dtype("O"), - {}, - ), - ( - [10, np.nan, 20], - np.dtype("float"), - "Int64", - { - ("convert_integer", False, "convert_floating", True): "Float64", - ("convert_integer", False, "convert_floating", False): np.dtype( - "float" - ), - }, - ), - ( - [np.nan, 100.5, 200], - np.dtype("float"), - "Float64", - {("convert_floating", False): np.dtype("float")}, - ), - ( - [3, 4, 5], - "Int8", - "Int8", - {}, - ), - ( - [[1, 2], [3, 4], [5]], - None, - np.dtype("O"), - {}, - ), - ( - [4, 5, 6], - np.dtype("uint32"), - "UInt32", - {("convert_integer", False): np.dtype("uint32")}, - ), - ( - [-10, 12, 13], - np.dtype("i1"), - "Int8", - {("convert_integer", False): np.dtype("i1")}, - ), - ( - [1.2, 1.3], - np.dtype("float32"), - "Float32", - {("convert_floating", False): np.dtype("float32")}, - ), - ( - [1, 2.0], - object, - "Int64", - { - ("convert_integer", False): "Float64", - ("convert_integer", False, "convert_floating", False): np.dtype( - "float" - ), - ("infer_objects", False): np.dtype("object"), - }, - ), - ( - [1, 2.5], - object, - "Float64", - { - ("convert_floating", False): np.dtype("float"), - ("infer_objects", False): np.dtype("object"), - }, - ), - (["a", "b"], pd.CategoricalDtype(), pd.CategoricalDtype(), {}), - ( - pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]).as_unit("s"), - pd.DatetimeTZDtype(tz="UTC"), - pd.DatetimeTZDtype(tz="UTC"), - {}, - ), - ( - pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]).as_unit("ms"), - pd.DatetimeTZDtype(tz="UTC"), - pd.DatetimeTZDtype(tz="UTC"), - {}, - ), - ( - pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]).as_unit("us"), - pd.DatetimeTZDtype(tz="UTC"), - pd.DatetimeTZDtype(tz="UTC"), - {}, - ), - ( - pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]).as_unit("ns"), - pd.DatetimeTZDtype(tz="UTC"), - pd.DatetimeTZDtype(tz="UTC"), - {}, - ), - ( - pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]).as_unit("ns"), - "datetime64[ns]", - np.dtype("datetime64[ns]"), - {}, - ), - ( - pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]).as_unit("ns"), - object, - np.dtype("datetime64[ns]"), - {("infer_objects", False): np.dtype("object")}, - ), - ( - pd.period_range("1/1/2011", freq="M", periods=3), - None, - pd.PeriodDtype("M"), - {}, - ), - ( - pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]), - None, - pd.IntervalDtype("int64", "right"), - {}, - ), - ] -) -def test_cases(request): - return request.param - class TestSeriesConvertDtypes: + @pytest.mark.parametrize( + "data, maindtype, expected_default, expected_other", + [ + ( + # data + [1, 2, 3], + # original dtype + np.dtype("int32"), + # default expected dtype + "Int32", + # exceptions on expected dtype + {("convert_integer", False): np.dtype("int32")}, + ), + ( + [1, 2, 3], + np.dtype("int64"), + "Int64", + {("convert_integer", 
False): np.dtype("int64")}, + ), + ( + ["x", "y", "z"], + np.dtype("O"), + pd.StringDtype(), + {("convert_string", False): np.dtype("O")}, + ), + ( + [True, False, np.nan], + np.dtype("O"), + pd.BooleanDtype(), + {("convert_boolean", False): np.dtype("O")}, + ), + ( + ["h", "i", np.nan], + np.dtype("O"), + pd.StringDtype(), + {("convert_string", False): np.dtype("O")}, + ), + ( # GH32117 + ["h", "i", 1], + np.dtype("O"), + np.dtype("O"), + {}, + ), + ( + [10, np.nan, 20], + np.dtype("float"), + "Int64", + { + ("convert_integer", False, "convert_floating", True): "Float64", + ("convert_integer", False, "convert_floating", False): np.dtype( + "float" + ), + }, + ), + ( + [np.nan, 100.5, 200], + np.dtype("float"), + "Float64", + {("convert_floating", False): np.dtype("float")}, + ), + ( + [3, 4, 5], + "Int8", + "Int8", + {}, + ), + ( + [[1, 2], [3, 4], [5]], + None, + np.dtype("O"), + {}, + ), + ( + [4, 5, 6], + np.dtype("uint32"), + "UInt32", + {("convert_integer", False): np.dtype("uint32")}, + ), + ( + [-10, 12, 13], + np.dtype("i1"), + "Int8", + {("convert_integer", False): np.dtype("i1")}, + ), + ( + [1.2, 1.3], + np.dtype("float32"), + "Float32", + {("convert_floating", False): np.dtype("float32")}, + ), + ( + [1, 2.0], + object, + "Int64", + { + ("convert_integer", False): "Float64", + ("convert_integer", False, "convert_floating", False): np.dtype( + "float" + ), + ("infer_objects", False): np.dtype("object"), + }, + ), + ( + [1, 2.5], + object, + "Float64", + { + ("convert_floating", False): np.dtype("float"), + ("infer_objects", False): np.dtype("object"), + }, + ), + (["a", "b"], pd.CategoricalDtype(), pd.CategoricalDtype(), {}), + ( + pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]).as_unit("s"), + pd.DatetimeTZDtype(tz="UTC"), + pd.DatetimeTZDtype(tz="UTC"), + {}, + ), + ( + pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]).as_unit("ms"), + pd.DatetimeTZDtype(tz="UTC"), + pd.DatetimeTZDtype(tz="UTC"), + {}, + ), + ( + pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]).as_unit("us"), + pd.DatetimeTZDtype(tz="UTC"), + pd.DatetimeTZDtype(tz="UTC"), + {}, + ), + ( + pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]).as_unit("ns"), + pd.DatetimeTZDtype(tz="UTC"), + pd.DatetimeTZDtype(tz="UTC"), + {}, + ), + ( + pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]).as_unit("ns"), + "datetime64[ns]", + np.dtype("datetime64[ns]"), + {}, + ), + ( + pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]).as_unit("ns"), + object, + np.dtype("datetime64[ns]"), + {("infer_objects", False): np.dtype("object")}, + ), + ( + pd.period_range("1/1/2011", freq="M", periods=3), + None, + pd.PeriodDtype("M"), + {}, + ), + ( + pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]), + None, + pd.IntervalDtype("int64", "right"), + {}, + ), + ], + ) @pytest.mark.parametrize("params", product(*[(True, False)] * 5)) def test_convert_dtypes( self, - test_cases, + data, + maindtype, + expected_default, + expected_other, params, using_infer_string, ): - data, maindtype, expected_default, expected_other = test_cases if ( hasattr(data, "dtype") and lib.is_np_dtype(data.dtype, "M") @@ -227,9 +221,9 @@ def test_convert_dtypes( and params[0] and not params[1] ): - # If we would convert with convert strings then infer_objects converts - # with the option - expected_dtype = "string[pyarrow_numpy]" + # If convert_string=False and infer_objects=True, we end up with the + # default string dtype instead of preserving object for string data + expected_dtype = 
pd.StringDtype(na_value=np.nan) expected = pd.Series(data, dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -238,7 +232,7 @@ def test_convert_dtypes( copy = series.copy(deep=True) if result.notna().sum() > 0 and result.dtype in ["interval[int64, right]"]: - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): result[result.notna()] = np.nan else: result[result.notna()] = np.nan @@ -304,3 +298,23 @@ def test_convert_dtypes_pyarrow_null(self): result = ser.convert_dtypes(dtype_backend="pyarrow") expected = pd.Series([None, None], dtype=pd.ArrowDtype(pa.null())) tm.assert_series_equal(result, expected) + + @td.skip_if_no("pyarrow") + @pytest.mark.parametrize("categories", [None, ["S1", "S2"]]) + def test_convert_empty_categorical_to_pyarrow(self, categories): + # GH#59934 + ser = pd.Series(pd.Categorical([None] * 5, categories=categories)) + converted = ser.convert_dtypes(dtype_backend="pyarrow") + expected = ser + tm.assert_series_equal(converted, expected) + + def test_convert_dtype_pyarrow_timezone_preserve(self): + # GH 60237 + pytest.importorskip("pyarrow") + ser = pd.Series( + pd.to_datetime(range(5), utc=True, unit="h"), + dtype="timestamp[ns, tz=UTC][pyarrow]", + ) + result = ser.convert_dtypes(dtype_backend="pyarrow") + expected = ser.copy() + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_copy.py b/pandas/tests/series/methods/test_copy.py index 23dbe85075916..ad5417d330d51 100644 --- a/pandas/tests/series/methods/test_copy.py +++ b/pandas/tests/series/methods/test_copy.py @@ -10,7 +10,7 @@ class TestCopy: @pytest.mark.parametrize("deep", ["default", None, False, True]) - def test_copy(self, deep, using_copy_on_write, warn_copy_on_write): + def test_copy(self, deep): ser = Series(np.arange(10), dtype="float64") # default deep is True @@ -19,29 +19,22 @@ def test_copy(self, deep, using_copy_on_write, warn_copy_on_write): else: ser2 = ser.copy(deep=deep) - if using_copy_on_write: - # INFO(CoW) a shallow copy doesn't yet copy the data - # but parent will not be modified (CoW) - if deep is None or deep is False: - assert np.may_share_memory(ser.values, ser2.values) - else: - assert not np.may_share_memory(ser.values, ser2.values) - - with tm.assert_cow_warning(warn_copy_on_write and deep is False): - ser2[::2] = np.nan - - if deep is not False or using_copy_on_write: - # Did not modify original Series - assert np.isnan(ser2[0]) - assert not np.isnan(ser[0]) + # INFO(CoW) a shallow copy doesn't yet copy the data + # but parent will not be modified (CoW) + if deep is None or deep is False: + assert np.may_share_memory(ser.values, ser2.values) else: - # we DID modify the original Series - assert np.isnan(ser2[0]) - assert np.isnan(ser[0]) + assert not np.may_share_memory(ser.values, ser2.values) + + ser2[::2] = np.nan + + # Did not modify original Series + assert np.isnan(ser2[0]) + assert not np.isnan(ser[0]) @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") @pytest.mark.parametrize("deep", ["default", None, False, True]) - def test_copy_tzaware(self, deep, using_copy_on_write): + def test_copy_tzaware(self, deep): # GH#11794 # copy of tz-aware expected = Series([Timestamp("2012/01/01", tz="UTC")]) @@ -54,25 +47,18 @@ def test_copy_tzaware(self, deep, using_copy_on_write): else: ser2 = ser.copy(deep=deep) - if using_copy_on_write: - # INFO(CoW) a shallow copy doesn't yet copy the data - # but parent will not be modified (CoW) - 
if deep is None or deep is False: - assert np.may_share_memory(ser.values, ser2.values) - else: - assert not np.may_share_memory(ser.values, ser2.values) + # INFO(CoW) a shallow copy doesn't yet copy the data + # but parent will not be modified (CoW) + if deep is None or deep is False: + assert np.may_share_memory(ser.values, ser2.values) + else: + assert not np.may_share_memory(ser.values, ser2.values) ser2[0] = Timestamp("1999/01/01", tz="UTC") - # default deep is True - if deep is not False or using_copy_on_write: - # Did not modify original Series - tm.assert_series_equal(ser2, expected2) - tm.assert_series_equal(ser, expected) - else: - # we DID modify the original Series - tm.assert_series_equal(ser2, expected2) - tm.assert_series_equal(ser, expected2) + # Did not modify original Series + tm.assert_series_equal(ser2, expected2) + tm.assert_series_equal(ser, expected) def test_copy_name(self, datetime_series): result = datetime_series.copy() diff --git a/pandas/tests/series/methods/test_count.py b/pandas/tests/series/methods/test_count.py index 9ba163f347198..2172a3205ef55 100644 --- a/pandas/tests/series/methods/test_count.py +++ b/pandas/tests/series/methods/test_count.py @@ -1,11 +1,9 @@ import numpy as np -import pandas as pd from pandas import ( Categorical, Series, ) -import pandas._testing as tm class TestSeriesCount: @@ -16,14 +14,6 @@ def test_count(self, datetime_series): assert datetime_series.count() == np.isfinite(datetime_series).sum() - def test_count_inf_as_na(self): - # GH#29478 - ser = Series([pd.Timestamp("1990/1/1")]) - msg = "use_inf_as_na option is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - with pd.option_context("use_inf_as_na", True): - assert ser.count() == 1 - def test_count_categorical(self): ser = Series( Categorical( diff --git a/pandas/tests/series/methods/test_cov_corr.py b/pandas/tests/series/methods/test_cov_corr.py index a369145b4e884..7a4d48fb76940 100644 --- a/pandas/tests/series/methods/test_cov_corr.py +++ b/pandas/tests/series/methods/test_cov_corr.py @@ -56,11 +56,10 @@ def test_cov_ddof(self, test_ddof, dtype): class TestSeriesCorr: - @pytest.mark.parametrize("dtype", ["float64", "Float64"]) - def test_corr(self, datetime_series, dtype): + def test_corr(self, datetime_series, any_float_dtype): stats = pytest.importorskip("scipy.stats") - datetime_series = datetime_series.astype(dtype) + datetime_series = datetime_series.astype(any_float_dtype) # full overlap tm.assert_almost_equal(datetime_series.corr(datetime_series), 1) @@ -87,22 +86,24 @@ def test_corr(self, datetime_series, dtype): index=date_range("2020-01-01", periods=10), name="ts", ) - B = A.copy() - result = A.corr(B) - expected, _ = stats.pearsonr(A, B) + result = A.corr(A) + expected, _ = stats.pearsonr(A, A) tm.assert_almost_equal(result, expected) def test_corr_rank(self): stats = pytest.importorskip("scipy.stats") # kendall and spearman - A = Series( + B = Series( np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10), name="ts", ) - B = A.copy() - A[-5:] = A[:5].copy() + A = Series( + np.concatenate([np.arange(5, dtype=np.float64)] * 2), + index=date_range("2020-01-01", periods=10), + name="ts", + ) result = A.corr(B, method="kendall") expected = stats.kendalltau(A, B)[0] tm.assert_almost_equal(result, expected) diff --git a/pandas/tests/series/methods/test_diff.py b/pandas/tests/series/methods/test_diff.py index 18de81a927c3a..89c34aeb64206 100644 --- a/pandas/tests/series/methods/test_diff.py +++ 
b/pandas/tests/series/methods/test_diff.py @@ -10,6 +10,11 @@ class TestSeriesDiff: + def test_diff_series_requires_integer(self): + series = Series(np.random.default_rng(2).standard_normal(2)) + with pytest.raises(ValueError, match="periods must be an integer"): + series.diff(1.5) + def test_diff_np(self): # TODO(__array_function__): could make np.diff return a Series # matching ser.diff() @@ -69,13 +74,11 @@ def test_diff_dt64tz(self): expected = Series(TimedeltaIndex(["NaT"] + ["1 days"] * 4), name="foo") tm.assert_series_equal(result, expected) - @pytest.mark.parametrize( - "input,output,diff", - [([False, True, True, False, False], [np.nan, True, False, True, False], 1)], - ) - def test_diff_bool(self, input, output, diff): + def test_diff_bool(self): # boolean series (test for fixing #17294) - ser = Series(input) + data = [False, True, True, False, False] + output = [np.nan, True, False, True, False] + ser = Series(data) result = ser.diff() expected = Series(output) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_drop.py b/pandas/tests/series/methods/test_drop.py index 5d9a469915cfb..d2a5a3324e886 100644 --- a/pandas/tests/series/methods/test_drop.py +++ b/pandas/tests/series/methods/test_drop.py @@ -31,17 +31,17 @@ def test_drop_unique_and_non_unique_index( @pytest.mark.parametrize( - "data, index, drop_labels, axis, error_type, error_desc", + "drop_labels, axis, error_type, error_desc", [ # single string/tuple-like - (range(3), list("abc"), "bc", 0, KeyError, "not found in axis"), + ("bc", 0, KeyError, "not found in axis"), # bad axis - (range(3), list("abc"), ("a",), 0, KeyError, "not found in axis"), - (range(3), list("abc"), "one", "columns", ValueError, "No axis named columns"), + (("a",), 0, KeyError, "not found in axis"), + ("one", "columns", ValueError, "No axis named columns"), ], ) -def test_drop_exception_raised(data, index, drop_labels, axis, error_type, error_desc): - ser = Series(data, index=index) +def test_drop_exception_raised(drop_labels, axis, error_type, error_desc): + ser = Series(range(3), index=list("abc")) with pytest.raises(error_type, match=error_desc): ser.drop(drop_labels, axis=axis) diff --git a/pandas/tests/series/methods/test_drop_duplicates.py b/pandas/tests/series/methods/test_drop_duplicates.py index 10b2e98586365..2dbd61530dc41 100644 --- a/pandas/tests/series/methods/test_drop_duplicates.py +++ b/pandas/tests/series/methods/test_drop_duplicates.py @@ -34,14 +34,14 @@ def test_drop_duplicates(any_numpy_dtype, keep, expected): @pytest.mark.parametrize( "keep, expected", [ - ("first", Series([False, False, True, True])), - ("last", Series([True, True, False, False])), - (False, Series([True, True, True, True])), + ("first", [False, False, True, True]), + ("last", [True, True, False, False]), + (False, [True, True, True, True]), ], ) def test_drop_duplicates_bool(keep, expected): tc = Series([True, False, True, False]) - + expected = Series(expected) tm.assert_series_equal(tc.duplicated(keep=keep), expected) tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) sc = tc.copy() @@ -75,10 +75,16 @@ class TestSeriesDropDuplicates: params=["int_", "uint", "float64", "str_", "timedelta64[h]", "datetime64[D]"] ) def dtype(self, request): + """ + Fixture that provides different data types for testing. + """ return request.param @pytest.fixture def cat_series_unused_category(self, dtype, ordered): + """ + Fixture that creates a Categorical Series with some unused categories. 
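+ The values deliberately use only a subset of the declared categories, + leaving the remaining categories unused.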
+ """ # Test case 1 cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype)) @@ -141,7 +147,9 @@ def test_drop_duplicates_categorical_non_bool_keepfalse( @pytest.fixture def cat_series(self, dtype, ordered): - # no unused categories, unlike cat_series_unused_category + """ + Fixture that creates a Categorical Series with no unused categories. + """ cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype)) input2 = np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(dtype)) diff --git a/pandas/tests/series/methods/test_duplicated.py b/pandas/tests/series/methods/test_duplicated.py index e177b5275d855..f5a387a7f302e 100644 --- a/pandas/tests/series/methods/test_duplicated.py +++ b/pandas/tests/series/methods/test_duplicated.py @@ -12,30 +12,32 @@ @pytest.mark.parametrize( "keep, expected", [ - ("first", Series([False, False, True, False, True], name="name")), - ("last", Series([True, True, False, False, False], name="name")), - (False, Series([True, True, True, False, True], name="name")), + ("first", [False, False, True, False, True]), + ("last", [True, True, False, False, False]), + (False, [True, True, True, False, True]), ], ) def test_duplicated_keep(keep, expected): ser = Series(["a", "b", "b", "c", "a"], name="name") result = ser.duplicated(keep=keep) + expected = Series(expected, name="name") tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "keep, expected", [ - ("first", Series([False, False, True, False, True])), - ("last", Series([True, True, False, False, False])), - (False, Series([True, True, True, False, True])), + ("first", [False, False, True, False, True]), + ("last", [True, True, False, False, False]), + (False, [True, True, True, False, True]), ], ) def test_duplicated_nan_none(keep, expected): ser = Series([np.nan, 3, 3, None, np.nan], dtype=object) result = ser.duplicated(keep=keep) + expected = Series(expected) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_equals.py b/pandas/tests/series/methods/test_equals.py index 875ffdd3fe851..0c52eacd7e516 100644 --- a/pandas/tests/series/methods/test_equals.py +++ b/pandas/tests/series/methods/test_equals.py @@ -1,11 +1,9 @@ -from contextlib import nullcontext import copy import numpy as np import pytest from pandas._libs.missing import is_matching_na -from pandas.compat.numpy import np_version_gte1p25 from pandas.core.dtypes.common import is_float @@ -14,7 +12,6 @@ MultiIndex, Series, ) -import pandas._testing as tm @pytest.mark.parametrize( @@ -48,14 +45,7 @@ def test_equals_list_array(val): assert s1.equals(s2) s1[1] = val - - cm = ( - tm.assert_produces_warning(FutureWarning, check_stacklevel=False) - if isinstance(val, str) and not np_version_gte1p25 - else nullcontext() - ) - with cm: - assert not s1.equals(s2) + assert not s1.equals(s2) def test_equals_false_negative(): @@ -82,15 +72,13 @@ def test_equals_matching_nas(): left = Series([np.datetime64("NaT")], dtype=object) right = Series([np.datetime64("NaT")], dtype=object) assert left.equals(right) - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - assert Index(left).equals(Index(right)) + assert Index(left).equals(Index(right)) assert left.array.equals(right.array) left = Series([np.timedelta64("NaT")], dtype=object) right = Series([np.timedelta64("NaT")], dtype=object) assert left.equals(right) - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - assert Index(left).equals(Index(right)) + assert Index(left).equals(Index(right)) assert 
left.array.equals(right.array) left = Series([np.float64("NaN")], dtype=object) diff --git a/pandas/tests/series/methods/test_explode.py b/pandas/tests/series/methods/test_explode.py index 5a0188585ef30..e4ad2493f9bb9 100644 --- a/pandas/tests/series/methods/test_explode.py +++ b/pandas/tests/series/methods/test_explode.py @@ -74,11 +74,12 @@ def test_invert_array(): @pytest.mark.parametrize( - "s", [pd.Series([1, 2, 3]), pd.Series(pd.date_range("2019", periods=3, tz="UTC"))] + "data", [[1, 2, 3], pd.date_range("2019", periods=3, tz="UTC")] ) -def test_non_object_dtype(s): - result = s.explode() - tm.assert_series_equal(result, s) +def test_non_object_dtype(data): + ser = pd.Series(data) + result = ser.explode() + tm.assert_series_equal(result, ser) def test_typical_usecase(): @@ -144,8 +145,9 @@ def test_explode_scalars_can_ignore_index(): @pytest.mark.parametrize("ignore_index", [True, False]) -def test_explode_pyarrow_list_type(ignore_index): - # GH 53602 +@pytest.mark.parametrize("list_type", ["list_", "large_list"]) +def test_explode_pyarrow_list_type(ignore_index, list_type): + # GH 53602, 61091 pa = pytest.importorskip("pyarrow") data = [ @@ -155,7 +157,7 @@ def test_explode_pyarrow_list_type(ignore_index): [2, 3], None, ] - ser = pd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64()))) + ser = pd.Series(data, dtype=pd.ArrowDtype(getattr(pa, list_type)(pa.int64()))) result = ser.explode(ignore_index=ignore_index) expected = pd.Series( data=[None, None, 1, None, 2, 3, None], diff --git a/pandas/tests/series/methods/test_fillna.py b/pandas/tests/series/methods/test_fillna.py index 293259661cd9a..f53d75df83124 100644 --- a/pandas/tests/series/methods/test_fillna.py +++ b/pandas/tests/series/methods/test_fillna.py @@ -6,7 +6,6 @@ import numpy as np import pytest -import pytz from pandas import ( Categorical, @@ -25,14 +24,11 @@ from pandas.core.arrays import period_array -@pytest.mark.filterwarnings( - "ignore:(Series|DataFrame).fillna with 'method' is deprecated:FutureWarning" -) class TestSeriesFillNA: def test_fillna_nat(self): series = Series([0, 1, 2, NaT._value], dtype="M8[ns]") - filled = series.fillna(method="pad") + filled = series.ffill() filled2 = series.fillna(value=series.values[2]) expected = series.copy() @@ -42,7 +38,7 @@ def test_fillna_nat(self): tm.assert_series_equal(filled2, expected) df = DataFrame({"A": series}) - filled = df.fillna(method="pad") + filled = df.ffill() filled2 = df.fillna(value=series.values[2]) expected = DataFrame({"A": expected}) tm.assert_frame_equal(filled, expected) @@ -50,7 +46,7 @@ def test_fillna_nat(self): series = Series([NaT._value, 0, 1, 2], dtype="M8[ns]") - filled = series.fillna(method="bfill") + filled = series.bfill() filled2 = series.fillna(value=series[1]) expected = series.copy() @@ -60,39 +56,30 @@ def test_fillna_nat(self): tm.assert_series_equal(filled2, expected) df = DataFrame({"A": series}) - filled = df.fillna(method="bfill") + filled = df.bfill() filled2 = df.fillna(value=series[1]) expected = DataFrame({"A": expected}) tm.assert_frame_equal(filled, expected) tm.assert_frame_equal(filled2, expected) - def test_fillna_value_or_method(self, datetime_series): - msg = "Cannot specify both 'value' and 'method'" - with pytest.raises(ValueError, match=msg): - datetime_series.fillna(value=0, method="ffill") - def test_fillna(self): ts = Series( [0.0, 1.0, 2.0, 3.0, 4.0], index=date_range("2020-01-01", periods=5) ) - tm.assert_series_equal(ts, ts.fillna(method="ffill")) + tm.assert_series_equal(ts, ts.ffill()) ts.iloc[2] = 
np.nan exp = Series([0.0, 1.0, 1.0, 3.0, 4.0], index=ts.index) - tm.assert_series_equal(ts.fillna(method="ffill"), exp) + tm.assert_series_equal(ts.ffill(), exp) exp = Series([0.0, 1.0, 3.0, 3.0, 4.0], index=ts.index) - tm.assert_series_equal(ts.fillna(method="backfill"), exp) + tm.assert_series_equal(ts.bfill(), exp) exp = Series([0.0, 1.0, 5.0, 3.0, 4.0], index=ts.index) tm.assert_series_equal(ts.fillna(value=5), exp) - msg = "Must specify a fill 'value' or 'method'" - with pytest.raises(ValueError, match=msg): - ts.fillna() - def test_fillna_nonscalar(self): # GH#5703 s1 = Series([np.nan]) @@ -171,74 +158,8 @@ def test_fillna_consistency(self): # assignment ser2 = ser.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser2[1] = "foo" - tm.assert_series_equal(ser2, expected) - - def test_fillna_downcast(self): - # GH#15277 - # infer int64 from float64 - ser = Series([1.0, np.nan]) - msg = "The 'downcast' keyword in fillna is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = ser.fillna(0, downcast="infer") - expected = Series([1, 0]) - tm.assert_series_equal(result, expected) - - # infer int64 from float64 when fillna value is a dict - ser = Series([1.0, np.nan]) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = ser.fillna({1: 0}, downcast="infer") - expected = Series([1, 0]) - tm.assert_series_equal(result, expected) - - def test_fillna_downcast_infer_objects_to_numeric(self): - # GH#44241 if we have object-dtype, 'downcast="infer"' should - # _actually_ infer - - arr = np.arange(5).astype(object) - arr[3] = np.nan - - ser = Series(arr) - - msg = "The 'downcast' keyword in fillna is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - res = ser.fillna(3, downcast="infer") - expected = Series(np.arange(5), dtype=np.int64) - tm.assert_series_equal(res, expected) - - msg = "The 'downcast' keyword in ffill is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - res = ser.ffill(downcast="infer") - expected = Series([0, 1, 2, 2, 4], dtype=np.int64) - tm.assert_series_equal(res, expected) - - msg = "The 'downcast' keyword in bfill is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - res = ser.bfill(downcast="infer") - expected = Series([0, 1, 2, 4, 4], dtype=np.int64) - tm.assert_series_equal(res, expected) - - # with a non-round float present, we will downcast to float64 - ser[2] = 2.5 - - expected = Series([0, 1, 2.5, 3, 4], dtype=np.float64) - msg = "The 'downcast' keyword in fillna is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - res = ser.fillna(3, downcast="infer") - tm.assert_series_equal(res, expected) - - msg = "The 'downcast' keyword in ffill is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - res = ser.ffill(downcast="infer") - expected = Series([0, 1, 2.5, 2.5, 4], dtype=np.float64) - tm.assert_series_equal(res, expected) - - msg = "The 'downcast' keyword in bfill is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - res = ser.bfill(downcast="infer") - expected = Series([0, 1, 2.5, 4, 4], dtype=np.float64) - tm.assert_series_equal(res, expected) def test_timedelta_fillna(self, frame_or_series, unit): # GH#3371 @@ -385,12 +306,7 @@ def test_datetime64_fillna(self): "scalar", [ False, - pytest.param( - True, - marks=pytest.mark.xfail( - reason="GH#56410 scalar case not yet addressed" - ), - ), + 
True, ], ) @pytest.mark.parametrize("tz", [None, "UTC"]) @@ -419,12 +335,7 @@ def test_datetime64_fillna_mismatched_reso_no_rounding(self, tz, scalar): "scalar", [ False, - pytest.param( - True, - marks=pytest.mark.xfail( - reason="GH#56410 scalar case not yet addressed" - ), - ), + True, ], ) def test_timedelta64_fillna_mismatched_reso_no_rounding(self, scalar): @@ -460,7 +371,7 @@ def test_datetime64_fillna_backfill(self): ], dtype="M8[ns]", ) - result = ser.fillna(method="backfill") + result = ser.bfill() tm.assert_series_equal(result, expected) @pytest.mark.parametrize("tz", ["US/Eastern", "Asia/Tokyo"]) @@ -498,7 +409,7 @@ def test_datetime64_tz_fillna(self, tz, unit): Timestamp("2011-01-02 10:00", tz=tz), Timestamp("2011-01-03 10:00"), Timestamp("2011-01-02 10:00", tz=tz), - ] + ], ) tm.assert_series_equal(expected, result) tm.assert_series_equal(isna(ser), null_loc) @@ -680,7 +591,7 @@ def test_fillna_dt64tz_with_method(self): Timestamp("2012-11-11 00:00:00+01:00"), ] ) - tm.assert_series_equal(ser.fillna(method="pad"), exp) + tm.assert_series_equal(ser.ffill(), exp) ser = Series([NaT, Timestamp("2012-11-11 00:00:00+01:00")]) exp = Series( @@ -689,7 +600,7 @@ def test_fillna_dt64tz_with_method(self): Timestamp("2012-11-11 00:00:00+01:00"), ] ) - tm.assert_series_equal(ser.fillna(method="bfill"), exp) + tm.assert_series_equal(ser.bfill(), exp) def test_fillna_pytimedelta(self): # GH#8209 @@ -786,13 +697,11 @@ def test_fillna_categorical(self, fill_value, expected_output): @pytest.mark.parametrize( "fill_value, expected_output", [ - (Series(["a", "b", "c", "d", "e"]), ["a", "b", "b", "d", "e"]), - (Series(["b", "d", "a", "d", "a"]), ["a", "d", "b", "d", "a"]), + (["a", "b", "c", "d", "e"], ["a", "b", "b", "d", "e"]), + (["b", "d", "a", "d", "a"], ["a", "d", "b", "d", "a"]), ( - Series( - Categorical( - ["b", "d", "a", "d", "a"], categories=["b", "c", "d", "e", "a"] - ) + Categorical( + ["b", "d", "a", "d", "a"], categories=["b", "c", "d", "e", "a"] ), ["a", "d", "b", "d", "a"], ), @@ -803,6 +712,7 @@ def test_fillna_categorical_with_new_categories(self, fill_value, expected_outpu data = ["a", np.nan, "b", np.nan, np.nan] ser = Series(Categorical(data, categories=["a", "b", "c", "d", "e"])) exp = Series(Categorical(expected_output, categories=["a", "b", "c", "d", "e"])) + fill_value = Series(fill_value) result = ser.fillna(fill_value) tm.assert_series_equal(result, exp) @@ -838,12 +748,11 @@ def test_fillna_categorical_raises(self): ser.fillna(DataFrame({1: ["a"], 3: ["b"]})) @pytest.mark.parametrize("dtype", [float, "float32", "float64"]) - @pytest.mark.parametrize("fill_type", tm.ALL_REAL_NUMPY_DTYPES) @pytest.mark.parametrize("scalar", [True, False]) - def test_fillna_float_casting(self, dtype, fill_type, scalar): + def test_fillna_float_casting(self, dtype, any_real_numpy_dtype, scalar): # GH-43424 ser = Series([np.nan, 1.2], dtype=dtype) - fill_values = Series([2, 2], dtype=fill_type) + fill_values = Series([2, 2], dtype=any_real_numpy_dtype) if scalar: fill_values = fill_values.dtype.type(2) @@ -874,12 +783,6 @@ def test_fillna_f32_upcast_with_dict(self): # --------------------------------------------------------------- # Invalid Usages - def test_fillna_invalid_method(self, datetime_series): - try: - datetime_series.fillna(method="ffil") - except ValueError as inst: - assert "ffil" in str(inst) - def test_fillna_listlike_invalid(self): ser = Series(np.random.default_rng(2).integers(-100, 100, 50)) msg = '"value" parameter must be a scalar or dict, but you passed a "list"' @@ 
-901,9 +804,8 @@ def test_fillna_method_and_limit_invalid(self): ] ) for limit in [-1, 0, 1.0, 2.0]: - for method in ["backfill", "bfill", "pad", "ffill", None]: - with pytest.raises(ValueError, match=msg): - ser.fillna(1, limit=limit, method=method) + with pytest.raises(ValueError, match=msg): + ser.fillna(1, limit=limit) def test_fillna_datetime64_with_timezone_tzinfo(self): # https://github.com/pandas-dev/pandas/issues/38851 @@ -944,46 +846,29 @@ def test_fillna_categorical_accept_same_type( tm.assert_categorical_equal(result, expected) -@pytest.mark.filterwarnings( - "ignore:Series.fillna with 'method' is deprecated:FutureWarning" -) class TestFillnaPad: def test_fillna_bug(self): ser = Series([np.nan, 1.0, np.nan, 3.0, np.nan], ["z", "a", "b", "c", "d"]) - filled = ser.fillna(method="ffill") + filled = ser.ffill() expected = Series([np.nan, 1.0, 1.0, 3.0, 3.0], ser.index) tm.assert_series_equal(filled, expected) - filled = ser.fillna(method="bfill") + filled = ser.bfill() expected = Series([1.0, 1.0, 3.0, 3.0, np.nan], ser.index) tm.assert_series_equal(filled, expected) - def test_ffill(self): - ts = Series( - [0.0, 1.0, 2.0, 3.0, 4.0], index=date_range("2020-01-01", periods=5) - ) - ts.iloc[2] = np.nan - tm.assert_series_equal(ts.ffill(), ts.fillna(method="ffill")) - def test_ffill_mixed_dtypes_without_missing_data(self): # GH#14956 - series = Series([datetime(2015, 1, 1, tzinfo=pytz.utc), 1]) + series = Series([datetime(2015, 1, 1, tzinfo=timezone.utc), 1]) result = series.ffill() tm.assert_series_equal(series, result) - def test_bfill(self): - ts = Series( - [0.0, 1.0, 2.0, 3.0, 4.0], index=date_range("2020-01-01", periods=5) - ) - ts.iloc[2] = np.nan - tm.assert_series_equal(ts.bfill(), ts.fillna(method="bfill")) - def test_pad_nan(self): x = Series( [np.nan, 1.0, np.nan, 3.0, np.nan], ["z", "a", "b", "c", "d"], dtype=float ) - return_value = x.fillna(method="pad", inplace=True) + return_value = x.ffill(inplace=True) assert return_value is None expected = Series( @@ -997,16 +882,16 @@ def test_series_fillna_limit(self): s = Series(np.random.default_rng(2).standard_normal(10), index=index) result = s[:2].reindex(index) - result = result.fillna(method="pad", limit=5) + result = result.ffill(limit=5) - expected = s[:2].reindex(index).fillna(method="pad") + expected = s[:2].reindex(index).ffill() expected[-3:] = np.nan tm.assert_series_equal(result, expected) result = s[-2:].reindex(index) - result = result.fillna(method="bfill", limit=5) + result = result.bfill(limit=5) - expected = s[-2:].reindex(index).fillna(method="backfill") + expected = s[-2:].reindex(index).bfill() expected[:3] = np.nan tm.assert_series_equal(result, expected) @@ -1016,36 +901,36 @@ def test_series_pad_backfill_limit(self): result = s[:2].reindex(index, method="pad", limit=5) - expected = s[:2].reindex(index).fillna(method="pad") + expected = s[:2].reindex(index).ffill() expected[-3:] = np.nan tm.assert_series_equal(result, expected) result = s[-2:].reindex(index, method="backfill", limit=5) - expected = s[-2:].reindex(index).fillna(method="backfill") + expected = s[-2:].reindex(index).bfill() expected[:3] = np.nan tm.assert_series_equal(result, expected) def test_fillna_int(self): ser = Series(np.random.default_rng(2).integers(-100, 100, 50)) - return_value = ser.fillna(method="ffill", inplace=True) + return_value = ser.ffill(inplace=True) assert return_value is None - tm.assert_series_equal(ser.fillna(method="ffill", inplace=False), ser)
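# Illustrative sketch (assumes pandas >= 2.1, where fillna(method=...) was
# deprecated in favor of the dedicated methods): the hunks above are a
# mechanical swap with identical results.
import numpy as np
import pandas as pd

ser = pd.Series([np.nan, 1.0, np.nan, 3.0, np.nan])
print(ser.ffill())  # was ser.fillna(method="ffill") -> [NaN, 1.0, 1.0, 3.0, 3.0]
print(ser.bfill())  # was ser.fillna(method="bfill") -> [1.0, 1.0, 3.0, 3.0, NaN]
def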
test_datetime64tz_fillna_round_issue(self): # GH#14872 data = Series( - [NaT, NaT, datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc)] + [NaT, NaT, datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=timezone.utc)] ) filled = data.bfill() expected = Series( [ - datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc), - datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc), - datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc), + datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=timezone.utc), + datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=timezone.utc), + datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=timezone.utc), ] ) @@ -1074,13 +959,6 @@ def test_fillna_parr(self): tm.assert_series_equal(filled, expected) - @pytest.mark.parametrize("func", ["pad", "backfill"]) - def test_pad_backfill_deprecated(self, func): - # GH#33396 - ser = Series([1, 2, 3]) - with tm.assert_produces_warning(FutureWarning): - getattr(ser, func)() - @pytest.mark.parametrize( "data, expected_data, method, kwargs", diff --git a/pandas/tests/series/methods/test_get_numeric_data.py b/pandas/tests/series/methods/test_get_numeric_data.py index 8325cc884ebcb..4a11d7905f506 100644 --- a/pandas/tests/series/methods/test_get_numeric_data.py +++ b/pandas/tests/series/methods/test_get_numeric_data.py @@ -1,5 +1,4 @@ from pandas import ( - Index, Series, date_range, ) @@ -7,25 +6,19 @@ class TestGetNumericData: - def test_get_numeric_data_preserve_dtype( - self, using_copy_on_write, warn_copy_on_write - ): + def test_get_numeric_data_preserve_dtype(self): # get the numeric data obj = Series([1, 2, 3]) result = obj._get_numeric_data() tm.assert_series_equal(result, obj) # returned object is a shallow copy - with tm.assert_cow_warning(warn_copy_on_write): - result.iloc[0] = 0 - if using_copy_on_write: - assert obj.iloc[0] == 1 - else: - assert obj.iloc[0] == 0 + result.iloc[0] = 0 + assert obj.iloc[0] == 1 obj = Series([1, "2", 3.0]) result = obj._get_numeric_data() - expected = Series([], dtype=object, index=Index([], dtype=object)) + expected = Series([], dtype=object) tm.assert_series_equal(result, expected) obj = Series([True, False, True]) @@ -34,5 +27,5 @@ def test_get_numeric_data_preserve_dtype( obj = Series(date_range("20130101", periods=3)) result = obj._get_numeric_data() - expected = Series([], dtype="M8[ns]", index=Index([], dtype=object)) + expected = Series([], dtype="M8[ns]") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_infer_objects.py b/pandas/tests/series/methods/test_infer_objects.py index 29abac6b3780e..0d411e08a0107 100644 --- a/pandas/tests/series/methods/test_infer_objects.py +++ b/pandas/tests/series/methods/test_infer_objects.py @@ -13,12 +13,12 @@ def test_copy(self, index_or_series): # case where we don't need to do inference because it is already non-object obj = index_or_series(np.array([1, 2, 3], dtype="int64")) - result = obj.infer_objects(copy=False) + result = obj.infer_objects() assert tm.shares_memory(result, obj) # case where we try to do inference but can't do better than object obj2 = index_or_series(np.array(["foo", 2], dtype=object)) - result2 = obj2.infer_objects(copy=False) + result2 = obj2.infer_objects() assert tm.shares_memory(result2, obj2) def test_infer_objects_series(self, index_or_series):
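# Illustrative sketch of the Copy-on-Write contract the rewritten tests above
# rely on (CoW is the default behavior in pandas 3.x): shallow results share
# memory until written to, so mutating them no longer leaks into the parent.
import pandas as pd

obj = pd.Series([1, 2, 3])
res = obj._get_numeric_data()  # shallow, memory-sharing copy (private helper)
res.iloc[0] = 0                # the write triggers a copy under CoW
assert obj.iloc[0] == 1        # parent is untouched
diff --git a/pandas/tests/series/methods/test_info.py b/pandas/tests/series/methods/test_info.py index 29dd704f6efa9..88f2cf384fc79 100644 --- a/pandas/tests/series/methods/test_info.py +++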
b/pandas/tests/series/methods/test_info.py @@ -5,10 +5,16 @@ import numpy as np import pytest -from pandas.compat import PYPY +from pandas._config import using_string_dtype + +from pandas.compat import ( + HAS_PYARROW, + PYPY, +) from pandas import ( CategoricalIndex, + Index, MultiIndex, Series, date_range, @@ -39,7 +45,9 @@ def test_info_categorical(): @pytest.mark.parametrize("verbose", [True, False]) -def test_info_series(lexsorted_two_level_string_multiindex, verbose): +def test_info_series( + lexsorted_two_level_string_multiindex, verbose, using_infer_string +): index = lexsorted_two_level_string_multiindex ser = Series(range(len(index)), index=index, name="sth") buf = StringIO() @@ -48,7 +56,7 @@ def test_info_series(lexsorted_two_level_string_multiindex, verbose): expected = textwrap.dedent( """\ - + MultiIndex: 10 entries, ('foo', 'one') to ('qux', 'three') """ ) @@ -61,10 +69,11 @@ def test_info_series(lexsorted_two_level_string_multiindex, verbose): 10 non-null int64 """ ) + qualifier = "" if using_infer_string and HAS_PYARROW else "+" expected += textwrap.dedent( f"""\ dtypes: int64(1) - memory usage: {ser.memory_usage()}.0+ bytes + memory usage: {ser.memory_usage()}.0{qualifier} bytes """ ) assert result == expected @@ -78,7 +87,7 @@ def test_info_memory(): memory_bytes = float(s.memory_usage()) expected = textwrap.dedent( f"""\ - + RangeIndex: 2 entries, 0 to 1 Series name: None Non-Null Count Dtype @@ -141,19 +150,21 @@ def test_info_memory_usage_deep_pypy(): @pytest.mark.parametrize( - "series, plus", + "index, plus", [ - (Series(1, index=[1, 2, 3]), False), - (Series(1, index=list("ABC")), True), - (Series(1, index=MultiIndex.from_product([range(3), range(3)])), False), + ([1, 2, 3], False), + (Index(list("ABC"), dtype="str"), not (using_string_dtype() and HAS_PYARROW)), + (Index(list("ABC"), dtype=object), True), + (MultiIndex.from_product([range(3), range(3)]), False), ( - Series(1, index=MultiIndex.from_product([range(3), ["foo", "bar"]])), - True, + MultiIndex.from_product([range(3), ["foo", "bar"]]), + not (using_string_dtype() and HAS_PYARROW), ), ], ) -def test_info_memory_usage_qualified(series, plus): +def test_info_memory_usage_qualified(index, plus): buf = StringIO() + series = Series(1, index=index) series.info(buf=buf) if plus: assert "+" in buf.getvalue() diff --git a/pandas/tests/series/methods/test_interpolate.py b/pandas/tests/series/methods/test_interpolate.py index d854f0b787759..f8ceb67b34af2 100644 --- a/pandas/tests/series/methods/test_interpolate.py +++ b/pandas/tests/series/methods/test_interpolate.py @@ -94,7 +94,12 @@ def test_interpolate(self, datetime_series): ts = Series(np.arange(len(datetime_series), dtype=float), datetime_series.index) ts_copy = ts.copy() - ts_copy[5:10] = np.nan + + # Set data between Tuesday and Thursday to NaN for 2 consecutive weeks. + # Linear interpolation should fill in the missing values correctly, + # as the index is equally-spaced within each week. 
+ ts_copy[1:4] = np.nan + ts_copy[6:9] = np.nan linear_interp = ts_copy.interpolate(method="linear") tm.assert_series_equal(linear_interp, ts) @@ -224,8 +229,7 @@ def test_interpolate_index_values(self): result = s.interpolate(method="index") - expected = s.copy() - bad = isna(expected.values) + bad = isna(s) good = ~bad expected = Series( np.interp(vals[bad], vals[good], s.values[good]), index=s.index[bad] @@ -289,26 +293,21 @@ def test_interp_scipy_basic(self): expected = Series([1.0, 3.0, 7.5, 12.0, 18.5, 25.0]) result = s.interpolate(method="slinear") tm.assert_series_equal(result, expected) - - msg = "The 'downcast' keyword in Series.interpolate is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = s.interpolate(method="slinear", downcast="infer") + result = s.interpolate(method="slinear") tm.assert_series_equal(result, expected) # nearest - expected = Series([1, 3, 3, 12, 12, 25]) + expected = Series([1, 3, 3, 12, 12, 25.0]) result = s.interpolate(method="nearest") tm.assert_series_equal(result, expected.astype("float")) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = s.interpolate(method="nearest", downcast="infer") + result = s.interpolate(method="nearest") tm.assert_series_equal(result, expected) # zero - expected = Series([1, 3, 3, 12, 12, 25]) + expected = Series([1, 3, 3, 12, 12, 25.0]) result = s.interpolate(method="zero") tm.assert_series_equal(result, expected.astype("float")) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = s.interpolate(method="zero", downcast="infer") + result = s.interpolate(method="zero") tm.assert_series_equal(result, expected) # quadratic # GH #15662. @@ -316,8 +315,7 @@ def test_interp_scipy_basic(self): result = s.interpolate(method="quadratic") tm.assert_series_equal(result, expected) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = s.interpolate(method="quadratic", downcast="infer") + result = s.interpolate(method="quadratic") tm.assert_series_equal(result, expected) # cubic expected = Series([1.0, 3.0, 6.8, 12.0, 18.2, 25.0]) @@ -351,7 +349,7 @@ def test_interpolate_invalid_float_limit(self, nontemporal_method): def test_interp_invalid_method(self, invalid_method): s = Series([1, 3, np.nan, 12, np.nan, 25]) - msg = f"method must be one of.* Got '{invalid_method}' instead" + msg = "Can not interpolate with method=nonexistent_method" if invalid_method is None: msg = "'method' should be a string, not None" with pytest.raises(ValueError, match=msg): @@ -362,16 +360,6 @@ def test_interp_invalid_method(self, invalid_method): with pytest.raises(ValueError, match=msg): s.interpolate(method=invalid_method, limit=-1) - def test_interp_invalid_method_and_value(self): - # GH#36624 - ser = Series([1, 3, np.nan, 12, np.nan, 25]) - - msg = "'fill_value' is not a valid keyword for Series.interpolate" - msg2 = "Series.interpolate with method=pad" - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=msg2): - ser.interpolate(fill_value=3, method="pad") - def test_interp_limit_forward(self): s = Series([1, 3, np.nan, np.nan, np.nan, 11]) @@ -462,107 +450,70 @@ def test_interp_limit_area(self): s.interpolate(method="linear", limit_area="abc") @pytest.mark.parametrize( - "method, limit_direction, expected", - [ - ("pad", "backward", "forward"), - ("ffill", "backward", "forward"), - ("backfill", "forward", "backward"), - ("bfill", "forward", "backward"), - ("pad", "both", "forward"), - ("ffill", "both", "forward"), - 
("backfill", "both", "backward"), - ("bfill", "both", "backward"), - ], - ) - def test_interp_limit_direction_raises(self, method, limit_direction, expected): - # https://github.com/pandas-dev/pandas/pull/34746 - s = Series([1, 2, 3]) - - msg = f"`limit_direction` must be '{expected}' for method `{method}`" - msg2 = "Series.interpolate with method=" - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=msg2): - s.interpolate(method=method, limit_direction=limit_direction) - - @pytest.mark.parametrize( - "data, expected_data, kwargs", + "data, kwargs", ( ( [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], - [np.nan, np.nan, 3.0, 3.0, 3.0, 3.0, 7.0, np.nan, np.nan], {"method": "pad", "limit_area": "inside"}, ), ( [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], - [np.nan, np.nan, 3.0, 3.0, np.nan, np.nan, 7.0, np.nan, np.nan], {"method": "pad", "limit_area": "inside", "limit": 1}, ), ( [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], - [np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, 7.0], {"method": "pad", "limit_area": "outside"}, ), ( [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], - [np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, np.nan], {"method": "pad", "limit_area": "outside", "limit": 1}, ), ( - [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], {"method": "pad", "limit_area": "outside", "limit": 1}, ), ( - range(5), range(5), {"method": "pad", "limit_area": "outside", "limit": 1}, ), ), ) - def test_interp_limit_area_with_pad(self, data, expected_data, kwargs): + def test_interp_limit_area_with_pad(self, data, kwargs): # GH26796 s = Series(data) - expected = Series(expected_data) - msg = "Series.interpolate with method=pad" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = s.interpolate(**kwargs) - tm.assert_series_equal(result, expected) + msg = "Can not interpolate with method=pad" + with pytest.raises(ValueError, match=msg): + s.interpolate(**kwargs) @pytest.mark.parametrize( - "data, expected_data, kwargs", + "data, kwargs", ( ( [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], - [np.nan, np.nan, 3.0, 7.0, 7.0, 7.0, 7.0, np.nan, np.nan], {"method": "bfill", "limit_area": "inside"}, ), ( [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], - [np.nan, np.nan, 3.0, np.nan, np.nan, 7.0, 7.0, np.nan, np.nan], {"method": "bfill", "limit_area": "inside", "limit": 1}, ), ( [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], - [3.0, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, np.nan, np.nan], {"method": "bfill", "limit_area": "outside"}, ), ( [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], - [np.nan, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, np.nan, np.nan], {"method": "bfill", "limit_area": "outside", "limit": 1}, ), ), ) - def test_interp_limit_area_with_backfill(self, data, expected_data, kwargs): + def test_interp_limit_area_with_backfill(self, data, kwargs): # GH26796 - s = Series(data) - expected = Series(expected_data) - msg = "Series.interpolate with method=bfill" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = s.interpolate(**kwargs) - tm.assert_series_equal(result, expected) + + msg = "Can not interpolate with method=bfill" + with pytest.raises(ValueError, match=msg): + s.interpolate(**kwargs) def test_interp_limit_direction(self): # These tests are for issue #9218 -- fill NaNs in both directions. 
@@ -657,20 +608,18 @@ def test_interp_datetime64(self, method, tz_naive_fixture): df = Series( [1, np.nan, 3], index=date_range("1/1/2000", periods=3, tz=tz_naive_fixture) ) - warn = None if method == "nearest" else FutureWarning - msg = "Series.interpolate with method=pad is deprecated" - with tm.assert_produces_warning(warn, match=msg): - result = df.interpolate(method=method) - if warn is not None: - # check the "use ffill instead" is equivalent - alt = df.ffill() - tm.assert_series_equal(result, alt) - expected = Series( - [1.0, 1.0, 3.0], - index=date_range("1/1/2000", periods=3, tz=tz_naive_fixture), - ) - tm.assert_series_equal(result, expected) + if method == "nearest": + result = df.interpolate(method=method) + expected = Series( + [1.0, 1.0, 3.0], + index=date_range("1/1/2000", periods=3, tz=tz_naive_fixture), + ) + tm.assert_series_equal(result, expected) + else: + msg = "Can not interpolate with method=pad" + with pytest.raises(ValueError, match=msg): + df.interpolate(method=method) def test_interp_pad_datetime64tz_values(self): # GH#27628 missing.interpolate_2d should handle datetimetz values @@ -678,16 +627,9 @@ def test_interp_pad_datetime64tz_values(self): ser = Series(dti) ser[1] = pd.NaT - msg = "Series.interpolate with method=pad is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = ser.interpolate(method="pad") - # check the "use ffill instead" is equivalent - alt = ser.ffill() - tm.assert_series_equal(result, alt) - - expected = Series(dti) - expected[1] = expected[0] - tm.assert_series_equal(result, expected) + msg = "Can not interpolate with method=pad" + with pytest.raises(ValueError, match=msg): + ser.interpolate(method="pad") def test_interp_limit_no_nans(self): # GH 7173 @@ -853,11 +795,9 @@ def test_interpolate_unsorted_index(self, ascending, expected_values): def test_interpolate_asfreq_raises(self): ser = Series(["a", None, "b"], dtype=object) - msg2 = "Series.interpolate with object dtype" - msg = "Invalid fill method" + msg = "Can not interpolate with method=asfreq" with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=msg2): - ser.interpolate(method="asfreq") + ser.interpolate(method="asfreq") def test_interpolate_fill_value(self): # GH#54920 diff --git a/pandas/tests/series/methods/test_isin.py b/pandas/tests/series/methods/test_isin.py index f94f67b8cc40a..4f8484252ba8f 100644 --- a/pandas/tests/series/methods/test_isin.py +++ b/pandas/tests/series/methods/test_isin.py @@ -92,7 +92,7 @@ def test_isin_with_i8(self): tm.assert_series_equal(result, expected) # timedelta64[ns] - s = Series(pd.to_timedelta(range(5), unit="d")) + s = Series(pd.to_timedelta(range(5), unit="D")) result = s.isin(s[0:2]) tm.assert_series_equal(result, expected) @@ -212,17 +212,34 @@ def test_isin_large_series_mixed_dtypes_and_nan(monkeypatch): @pytest.mark.parametrize( - "array,expected", + "dtype, data, values, expected", [ - ( - [0, 1j, 1j, 1, 1 + 1j, 1 + 2j, 1 + 1j], - Series([False, True, True, False, True, True, True], dtype=bool), - ) + ("boolean", [pd.NA, False, True], [False, pd.NA], [True, True, False]), + ("Int64", [pd.NA, 2, 1], [1, pd.NA], [True, False, True]), + ("boolean", [pd.NA, False, True], [pd.NA, True, "a", 20], [True, False, True]), + ("boolean", [pd.NA, False, True], [], [False, False, False]), + ("Float64", [20.0, 30.0, pd.NA], [pd.NA], [False, False, True]), ], ) -def test_isin_complex_numbers(array, expected): +def test_isin_large_series_and_pdNA(dtype, data, values, expected, 
monkeypatch): + # https://github.com/pandas-dev/pandas/issues/60678 + # combination of a large series (> _MINIMUM_COMP_ARR_LEN elements) and + # values containing pd.NA + min_isin_comp = 2 + ser = Series(data, dtype=dtype) + expected = Series(expected, dtype="boolean") + + with monkeypatch.context() as m: + m.setattr(algorithms, "_MINIMUM_COMP_ARR_LEN", min_isin_comp) + result = ser.isin(values) + tm.assert_series_equal(result, expected) + + +def test_isin_complex_numbers(): # GH 17927 + array = [0, 1j, 1j, 1, 1 + 1j, 1 + 2j, 1 + 1j] result = Series(array).isin([1j, 1 + 1j, 1 + 2j]) + expected = Series([False, True, True, False, True, True, True], dtype=bool) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_isna.py b/pandas/tests/series/methods/test_isna.py index 7e324aa86a052..92bf2945cc0d1 100644 --- a/pandas/tests/series/methods/test_isna.py +++ b/pandas/tests/series/methods/test_isna.py @@ -1,6 +1,7 @@ """ We also test Series.notna in this file. """ + import numpy as np from pandas import ( diff --git a/pandas/tests/series/methods/test_item.py b/pandas/tests/series/methods/test_item.py index 8e8c33619d564..e927fa69db358 100644 --- a/pandas/tests/series/methods/test_item.py +++ b/pandas/tests/series/methods/test_item.py @@ -2,6 +2,7 @@ Series.item method, mainly testing that we get python scalars as opposed to numpy scalars. """ + import pytest from pandas import ( diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index 251d4063008b9..384b7ce3dc985 100644 --- a/pandas/tests/series/methods/test_map.py +++ b/pandas/tests/series/methods/test_map.py @@ -101,16 +101,16 @@ def test_map_series_stringdtype(any_string_dtype, using_infer_string): expected = Series(data=["rabbit", "dog", "cat", item], dtype=any_string_dtype) if using_infer_string and any_string_dtype == "object": - expected = expected.astype("string[pyarrow_numpy]") + expected = expected.astype("str") tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "data, expected_dtype", - [(["1-1", "1-1", np.nan], "category"), (["1-1", "1-2", np.nan], object)], + [(["1-1", "1-1", np.nan], "category"), (["1-1", "1-2", np.nan], "str")], ) -def test_map_categorical_with_nan_values(data, expected_dtype, using_infer_string): +def test_map_categorical_with_nan_values(data, expected_dtype): # GH 20714 bug fixed in: GH 24275 def func(val): return val.split("-")[0] s = Series(data, dtype="category") result = s.map(func, na_action="ignore") - if using_infer_string and expected_dtype == object: - expected_dtype = "string[pyarrow_numpy]" expected = Series(["1", "1", np.nan], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -145,9 +143,7 @@ def test_map_simple_str_callables_same_as_astype( # test that we are evaluating row-by-row first # before vectorized evaluation result = string_series.map(func) - expected = string_series.astype( - str if not using_infer_string else "string[pyarrow_numpy]" - ) + expected = string_series.astype(str if not using_infer_string else "str") tm.assert_series_equal(result, expected) @@ -326,7 +322,6 @@ def test_map_dict_na_key(): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("na_action", [None, "ignore"]) def test_map_defaultdict_na_key(na_action): # GH 48813 s = Series([1, 2, np.nan]) @@ -336,7 +331,6 @@ def test_map_defaultdict_na_key(na_action): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("na_action", [None, "ignore"])
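# Illustrative sketch of the na_action contract exercised by the tests around
# here (na_action is now supplied by a shared fixture rather than a per-test
# parametrize):
import numpy as np
import pandas as pd

s = pd.Series([1, 2, np.nan])
print(s.map({1: "a", 2: "b"}))                        # missing keys and NaN map to NaN
print(s.map(lambda x: f"<{x}>", na_action="ignore"))  # NaN is skipped, not passed to the callable
def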
test_map_defaultdict_missing_key(na_action): # GH 48813 s = Series([1, 2, np.nan]) @@ -346,7 +340,6 @@ def test_map_defaultdict_missing_key(na_action): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("na_action", [None, "ignore"]) def test_map_defaultdict_unmutated(na_action): # GH 48813 s = Series([1, 2, np.nan]) @@ -483,7 +476,6 @@ def test_map_box_period(): tm.assert_series_equal(res, exp) -@pytest.mark.parametrize("na_action", [None, "ignore"]) def test_map_categorical(na_action, using_infer_string): values = pd.Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True) s = Series(values, name="XX", index=list("abcdefg")) @@ -497,7 +489,7 @@ def test_map_categorical(na_action, using_infer_string): result = s.map(lambda x: "A", na_action=na_action) exp = Series(["A"] * 7, name="XX", index=list("abcdefg")) tm.assert_series_equal(result, exp) - assert result.dtype == object if not using_infer_string else "string" + assert result.dtype == object if not using_infer_string else "str" @pytest.mark.parametrize( @@ -557,13 +549,11 @@ def f(x): (list(range(3)), {0: 42}, [42] + [np.nan] * 3), ], ) -def test_map_missing_mixed(vals, mapping, exp, using_infer_string): +def test_map_missing_mixed(vals, mapping, exp): # GH20495 s = Series(vals + [np.nan]) result = s.map(mapping) exp = Series(exp) - if using_infer_string and mapping == {np.nan: "not NaN"}: - exp.iloc[-1] = np.nan tm.assert_series_equal(result, exp) @@ -607,3 +597,34 @@ def test_map_type(): result = s.map(type) expected = Series([int, str, type], index=["a", "b", "c"]) tm.assert_series_equal(result, expected) + + +def test_map_kwargs(): + # GH 59814 + result = Series([2, 4, 5]).map(lambda x, y: x + y, y=2) + expected = Series([4, 6, 7]) + tm.assert_series_equal(result, expected) + + +def test_map_arg_as_kwarg(): + with tm.assert_produces_warning( + FutureWarning, match="`arg` has been renamed to `func`" + ): + Series([1, 2]).map(arg={}) + + +def test_map_func_and_arg(): + # `arg` is considered a normal kwarg that should be passed to the function + result = Series([1, 2]).map(lambda _, arg: arg, arg=3) + expected = Series([3, 3]) + tm.assert_series_equal(result, expected) + + +def test_map_no_func_or_arg(): + with pytest.raises(ValueError, match="The `func` parameter is required"): + Series([1, 2]).map() + + +def test_map_func_is_none(): + with pytest.raises(ValueError, match="The `func` parameter is required"): + Series([1, 2]).map(func=None) diff --git a/pandas/tests/series/methods/test_nlargest.py b/pandas/tests/series/methods/test_nlargest.py index e8de1cd89e397..67ba1d7ca51b7 100644 --- a/pandas/tests/series/methods/test_nlargest.py +++ b/pandas/tests/series/methods/test_nlargest.py @@ -2,7 +2,6 @@ Note: for naming purposes, most tests are titled as e.g. "test_nlargest_foo" but are implicitly also testing nsmallest_foo. """ -from itertools import product import numpy as np import pytest @@ -11,74 +10,12 @@ from pandas import Series import pandas._testing as tm -main_dtypes = [ - "datetime", - "datetimetz", - "timedelta", - "int8", - "int16", - "int32", - "int64", - "float32", - "float64", - "uint8", - "uint16", - "uint32", - "uint64", -] - - -@pytest.fixture -def s_main_dtypes(): - """ - A DataFrame with many dtypes - - * datetime - * datetimetz - * timedelta - * [u]int{8,16,32,64} - * float{32,64} - - The columns are the name of the dtype.
- """ - df = pd.DataFrame( - { - "datetime": pd.to_datetime(["2003", "2002", "2001", "2002", "2005"]), - "datetimetz": pd.to_datetime( - ["2003", "2002", "2001", "2002", "2005"] - ).tz_localize("US/Eastern"), - "timedelta": pd.to_timedelta(["3d", "2d", "1d", "2d", "5d"]), - } - ) - - for dtype in [ - "int8", - "int16", - "int32", - "int64", - "float32", - "float64", - "uint8", - "uint16", - "uint32", - "uint64", - ]: - df[dtype] = Series([3, 2, 1, 2, 5], dtype=dtype) - - return df - - -@pytest.fixture(params=main_dtypes) -def s_main_dtypes_split(request, s_main_dtypes): - """Each series in s_main_dtypes.""" - return s_main_dtypes[request.param] - def assert_check_nselect_boundary(vals, dtype, method): # helper function for 'test_boundary_{dtype}' tests ser = Series(vals, dtype=dtype) result = getattr(ser, method)(3) - expected_idxr = [0, 1, 2] if method == "nsmallest" else [3, 2, 1] + expected_idxr = range(3) if method == "nsmallest" else range(3, 0, -1) expected = ser.loc[expected_idxr] tm.assert_series_equal(result, expected) @@ -96,19 +33,36 @@ class TestSeriesNLargestNSmallest: Series(list("abcde"), dtype="category"), ], ) - def test_nlargest_error(self, r): + @pytest.mark.parametrize("method", ["nlargest", "nsmallest"]) + @pytest.mark.parametrize("arg", [2, 5, 0, -1]) + def test_nlargest_error(self, r, method, arg): dt = r.dtype msg = f"Cannot use method 'n(largest|smallest)' with dtype {dt}" - args = 2, len(r), 0, -1 - methods = r.nlargest, r.nsmallest - for method, arg in product(methods, args): - with pytest.raises(TypeError, match=msg): - method(arg) + with pytest.raises(TypeError, match=msg): + getattr(r, method)(arg) - def test_nsmallest_nlargest(self, s_main_dtypes_split): + @pytest.mark.parametrize( + "data", + [ + pd.to_datetime(["2003", "2002", "2001", "2002", "2005"]), + pd.to_datetime(["2003", "2002", "2001", "2002", "2005"], utc=True), + pd.to_timedelta(["3D", "2D", "1D", "2D", "5D"]), + np.array([3, 2, 1, 2, 5], dtype="int8"), + np.array([3, 2, 1, 2, 5], dtype="int16"), + np.array([3, 2, 1, 2, 5], dtype="int32"), + np.array([3, 2, 1, 2, 5], dtype="int64"), + np.array([3, 2, 1, 2, 5], dtype="uint8"), + np.array([3, 2, 1, 2, 5], dtype="uint16"), + np.array([3, 2, 1, 2, 5], dtype="uint32"), + np.array([3, 2, 1, 2, 5], dtype="uint64"), + np.array([3, 2, 1, 2, 5], dtype="float32"), + np.array([3, 2, 1, 2, 5], dtype="float64"), + ], + ) + def test_nsmallest_nlargest(self, data): # float, int, datetime64 (use i8), timedelts64 (same), # object that are numbers, object that are strings - ser = s_main_dtypes_split + ser = Series(data) tm.assert_series_equal(ser.nsmallest(2), ser.iloc[[2, 1]]) tm.assert_series_equal(ser.nsmallest(2, keep="last"), ser.iloc[[2, 3]]) @@ -224,7 +178,7 @@ def test_nlargest_nullable(self, any_numeric_ea_dtype): arr = np.random.default_rng(2).standard_normal(10) arr = arr.astype(dtype.lower(), copy=False) - ser = Series(arr.copy(), dtype=dtype) + ser = Series(arr, dtype=dtype, copy=True) ser[1] = pd.NA result = ser.nlargest(5) diff --git a/pandas/tests/series/methods/test_pct_change.py b/pandas/tests/series/methods/test_pct_change.py index 9727ef3d5c27c..6279cf64818b8 100644 --- a/pandas/tests/series/methods/test_pct_change.py +++ b/pandas/tests/series/methods/test_pct_change.py @@ -10,23 +10,13 @@ class TestSeriesPctChange: def test_pct_change(self, datetime_series): - msg = ( - "The 'fill_method' keyword being not None and the 'limit' keyword in " - "Series.pct_change are deprecated" - ) - - rs = datetime_series.pct_change(fill_method=None) + rs = 
datetime_series.pct_change() tm.assert_series_equal(rs, datetime_series / datetime_series.shift(1) - 1) rs = datetime_series.pct_change(2) filled = datetime_series.ffill() tm.assert_series_equal(rs, filled / filled.shift(2) - 1) - with tm.assert_produces_warning(FutureWarning, match=msg): - rs = datetime_series.pct_change(fill_method="bfill", limit=1) - filled = datetime_series.bfill(limit=1) - tm.assert_series_equal(rs, filled / filled.shift(1) - 1) - rs = datetime_series.pct_change(freq="5D") filled = datetime_series.ffill() tm.assert_series_equal( @@ -45,69 +35,27 @@ def test_pct_change_with_duplicate_axis(self): def test_pct_change_shift_over_nas(self): s = Series([1.0, 1.5, np.nan, 2.5, 3.0]) - - msg = "The default fill_method='pad' in Series.pct_change is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - chg = s.pct_change() - - expected = Series([np.nan, 0.5, 0.0, 2.5 / 1.5 - 1, 0.2]) + chg = s.pct_change() + expected = Series([np.nan, 0.5, np.nan, np.nan, 0.2]) tm.assert_series_equal(chg, expected) - @pytest.mark.parametrize( - "freq, periods, fill_method, limit", - [ - ("5B", 5, None, None), - ("3B", 3, None, None), - ("3B", 3, "bfill", None), - ("7B", 7, "pad", 1), - ("7B", 7, "bfill", 3), - ("14B", 14, None, None), - ], - ) - def test_pct_change_periods_freq( - self, freq, periods, fill_method, limit, datetime_series - ): - msg = ( - "The 'fill_method' keyword being not None and the 'limit' keyword in " - "Series.pct_change are deprecated" - ) - + @pytest.mark.parametrize("freq, periods", [("5B", 5), ("3B", 3), ("14B", 14)]) + def test_pct_change_periods_freq(self, freq, periods, datetime_series): # GH#7292 - with tm.assert_produces_warning(FutureWarning, match=msg): - rs_freq = datetime_series.pct_change( - freq=freq, fill_method=fill_method, limit=limit - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - rs_periods = datetime_series.pct_change( - periods, fill_method=fill_method, limit=limit - ) + rs_freq = datetime_series.pct_change(freq=freq) + rs_periods = datetime_series.pct_change(periods) tm.assert_series_equal(rs_freq, rs_periods) empty_ts = Series(index=datetime_series.index, dtype=object) - with tm.assert_produces_warning(FutureWarning, match=msg): - rs_freq = empty_ts.pct_change( - freq=freq, fill_method=fill_method, limit=limit - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - rs_periods = empty_ts.pct_change( - periods, fill_method=fill_method, limit=limit - ) + rs_freq = empty_ts.pct_change(freq=freq) + rs_periods = empty_ts.pct_change(periods) tm.assert_series_equal(rs_freq, rs_periods) -@pytest.mark.parametrize("fill_method", ["pad", "ffill", None]) -def test_pct_change_with_duplicated_indices(fill_method): +def test_pct_change_with_duplicated_indices(): # GH30463 s = Series([np.nan, 1, 2, 3, 9, 18], index=["a", "b"] * 3) - - warn = None if fill_method is None else FutureWarning - msg = ( - "The 'fill_method' keyword being not None and the 'limit' keyword in " - "Series.pct_change are deprecated" - ) - with tm.assert_produces_warning(warn, match=msg): - result = s.pct_change(fill_method=fill_method) - + result = s.pct_change() expected = Series([np.nan, np.nan, 1.0, 0.5, 2.0, 1.0], index=["a", "b"] * 3) tm.assert_series_equal(result, expected) @@ -118,3 +66,11 @@ def test_pct_change_no_warning_na_beginning(): result = ser.pct_change() expected = Series([np.nan, np.nan, np.nan, 1, 0.5]) tm.assert_series_equal(result, expected)
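# Illustrative sketch of the behavior change encoded above: pct_change() no
# longer forward-fills by default, so a gap now yields NaN instead of 0.0.
import numpy as np
import pandas as pd

s = pd.Series([1.0, 1.5, np.nan, 2.5, 3.0])
print(s.pct_change())          # [NaN, 0.5, NaN, NaN, 0.2]; was [NaN, 0.5, 0.0, ~0.667, 0.2]
print(s.ffill().pct_change())  # fill explicitly first to recover the old result
+ + +def test_pct_change_empty(): + # GH 57056 + ser = Series([], 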
dtype="float64") + expected = ser.copy() + result = ser.pct_change(periods=0) + tm.assert_series_equal(expected, result) diff --git a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py index 24cf97c05c0a8..7c6a7893ba3a0 100644 --- a/pandas/tests/series/methods/test_rank.py +++ b/pandas/tests/series/methods/test_rank.py @@ -33,7 +33,8 @@ def ser(): ["max", np.array([2, 6, 7, 4, np.nan, 4, 2, 8, np.nan, 6])], ["first", np.array([1, 5, 7, 3, np.nan, 4, 2, 8, np.nan, 6])], ["dense", np.array([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3])], - ] + ], + ids=lambda x: x[0], ) def results(request): return request.param @@ -48,12 +49,29 @@ def results(request): "Int64", pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow")), pytest.param("int64[pyarrow]", marks=td.skip_if_no("pyarrow")), + pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), + "string[python]", + "str", ] ) def dtype(request): return request.param +def expected_dtype(dtype, method, pct=False): + exp_dtype = "float64" + # elif dtype in ["Int64", "Float64", "string[pyarrow]", "string[python]"]: + if dtype in ["string[pyarrow]"]: + exp_dtype = "Float64" + elif dtype in ["float64[pyarrow]", "int64[pyarrow]"]: + if method == "average" or pct: + exp_dtype = "double[pyarrow]" + else: + exp_dtype = "uint64[pyarrow]" + + return exp_dtype + + class TestSeriesRank: def test_rank(self, datetime_series): sp_stats = pytest.importorskip("scipy.stats") @@ -106,7 +124,7 @@ def test_rank(self, datetime_series): tm.assert_series_equal(iranks, exp) iseries = Series(np.repeat(np.nan, 100)) - exp = iseries.copy() + exp = iseries iranks = iseries.rank(pct=True) tm.assert_series_equal(iranks, exp) @@ -234,6 +252,16 @@ def test_rank_categorical(self): tm.assert_series_equal(na_ser.rank(na_option="bottom", pct=True), exp_bot) tm.assert_series_equal(na_ser.rank(na_option="keep", pct=True), exp_keep) + def test_rank_nullable_integer(self): + # GH 56976 + exp = Series([np.nan, 2, np.nan, 3, 3, 2, 3, 1]) + exp = exp.astype("Int64") + result = exp.rank(na_option="keep") + + expected = Series([np.nan, 2.5, np.nan, 5.0, 5.0, 2.5, 5.0, 1.0]) + + tm.assert_series_equal(result, expected) + def test_rank_signature(self): s = Series([0, 1]) s.rank(method="average") @@ -241,15 +269,15 @@ def test_rank_signature(self): with pytest.raises(ValueError, match=msg): s.rank("average") - @pytest.mark.parametrize("dtype", [None, object]) - def test_rank_tie_methods(self, ser, results, dtype): + def test_rank_tie_methods(self, ser, results, dtype, using_infer_string): method, exp = results + if dtype == "int64" or (not using_infer_string and dtype == "str"): + pytest.skip("int64/str does not support NaN") + ser = ser if dtype is None else ser.astype(dtype) result = ser.rank(method=method) - tm.assert_series_equal(result, Series(exp)) + tm.assert_series_equal(result, Series(exp, dtype=expected_dtype(dtype, method))) - @pytest.mark.parametrize("ascending", [True, False]) - @pytest.mark.parametrize("method", ["average", "min", "max", "first", "dense"]) @pytest.mark.parametrize("na_option", ["top", "bottom", "keep"]) @pytest.mark.parametrize( "dtype, na_value, pos_inf, neg_inf", @@ -267,11 +295,11 @@ def test_rank_tie_methods(self, ser, results, dtype): ], ) def test_rank_tie_methods_on_infs_nans( - self, method, na_option, ascending, dtype, na_value, pos_inf, neg_inf + self, rank_method, na_option, ascending, dtype, na_value, pos_inf, neg_inf ): pytest.importorskip("scipy") if dtype == "float64[pyarrow]": - if method == "average": + 
if rank_method == "average": exp_dtype = "float64[pyarrow]" else: exp_dtype = "uint64[pyarrow]" @@ -288,7 +316,7 @@ def test_rank_tie_methods_on_infs_nans( "first": ([1, 2, 3], [4, 5, 6], [7, 8, 9]), "dense": ([1, 1, 1], [2, 2, 2], [3, 3, 3]), } - ranks = exp_ranks[method] + ranks = exp_ranks[rank_method] if na_option == "top": order = [ranks[1], ranks[0], ranks[2]] elif na_option == "bottom": @@ -297,7 +325,9 @@ def test_rank_tie_methods_on_infs_nans( order = [ranks[0], [np.nan] * chunk, ranks[1]] expected = order if ascending else order[::-1] expected = list(chain.from_iterable(expected)) - result = iseries.rank(method=method, na_option=na_option, ascending=ascending) + result = iseries.rank( + method=rank_method, na_option=na_option, ascending=ascending + ) tm.assert_series_equal(result, Series(expected, dtype=exp_dtype)) def test_rank_desc_mix_nans_infs(self): @@ -308,7 +338,6 @@ def test_rank_desc_mix_nans_infs(self): exp = Series([3, np.nan, 1, 4, 2], dtype="float64") tm.assert_series_equal(result, exp) - @pytest.mark.parametrize("method", ["average", "min", "max", "first", "dense"]) @pytest.mark.parametrize( "op, value", [ @@ -317,7 +346,7 @@ def test_rank_desc_mix_nans_infs(self): [operator.mul, 1e-6], ], ) - def test_rank_methods_series(self, method, op, value): + def test_rank_methods_series(self, rank_method, op, value): sp_stats = pytest.importorskip("scipy.stats") xs = np.random.default_rng(2).standard_normal(9) @@ -327,8 +356,10 @@ def test_rank_methods_series(self, method, op, value): index = [chr(ord("a") + i) for i in range(len(xs))] vals = op(xs, value) ts = Series(vals, index=index) - result = ts.rank(method=method) - sprank = sp_stats.rankdata(vals, method if method != "first" else "ordinal") + result = ts.rank(method=rank_method) + sprank = sp_stats.rankdata( + vals, rank_method if rank_method != "first" else "ordinal" + ) expected = Series(sprank, index=index).astype("float64") tm.assert_series_equal(result, expected) @@ -346,25 +377,35 @@ def test_rank_methods_series(self, method, op, value): ], ) def test_rank_dense_method(self, dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) result = s.rank(method="dense") - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "dense")) tm.assert_series_equal(result, expected) - def test_rank_descending(self, ser, results, dtype): + def test_rank_descending(self, ser, results, dtype, using_infer_string): method, _ = results - if "i" in dtype: + if dtype == "int64" or (not using_infer_string and dtype == "str"): s = ser.dropna() else: s = ser.astype(dtype) res = s.rank(ascending=False) - expected = (s.max() - s).rank() - tm.assert_series_equal(res, expected) + if dtype.startswith("str"): + expected = (s.astype("float64").max() - s.astype("float64")).rank() + else: + expected = (s.max() - s).rank() + tm.assert_series_equal(res, expected.astype(expected_dtype(dtype, "average"))) - expected = (s.max() - s).rank(method=method) + if dtype.startswith("str"): + expected = (s.astype("float64").max() - s.astype("float64")).rank( + method=method + ) + else: + expected = (s.max() - s).rank(method=method) res2 = s.rank(method=method, ascending=False) - tm.assert_series_equal(res2, expected) + tm.assert_series_equal(res2, expected.astype(expected_dtype(dtype, method))) def test_rank_int(self, ser, results): method, exp = results @@ -421,9 +462,11 @@ def test_rank_ea_small_values(self): ], ) def test_rank_dense_pct(dtype, ser, exp): + if 
ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) result = s.rank(method="dense", pct=True) - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "dense", pct=True)) tm.assert_series_equal(result, expected) @@ -442,9 +485,11 @@ def test_rank_dense_pct(dtype, ser, exp): ], ) def test_rank_min_pct(dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) result = s.rank(method="min", pct=True) - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "min", pct=True)) tm.assert_series_equal(result, expected) @@ -463,9 +508,11 @@ def test_rank_min_pct(dtype, ser, exp): ], ) def test_rank_max_pct(dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) result = s.rank(method="max", pct=True) - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "max", pct=True)) tm.assert_series_equal(result, expected) @@ -484,9 +531,11 @@ def test_rank_max_pct(dtype, ser, exp): ], ) def test_rank_average_pct(dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) result = s.rank(method="average", pct=True) - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "average", pct=True)) tm.assert_series_equal(result, expected) @@ -505,9 +554,11 @@ def test_rank_average_pct(dtype, ser, exp): ], ) def test_rank_first_pct(dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) result = s.rank(method="first", pct=True) - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "first", pct=True)) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_reindex.py b/pandas/tests/series/methods/test_reindex.py index 6f0c8d751a92a..442d73cadfe47 100644 --- a/pandas/tests/series/methods/test_reindex.py +++ b/pandas/tests/series/methods/test_reindex.py @@ -1,10 +1,6 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype - -import pandas.util._test_decorators as td - from pandas import ( NA, Categorical, @@ -24,13 +20,10 @@ import pandas._testing as tm -@pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="share memory doesn't work for arrow" -) def test_reindex(datetime_series, string_series): identity = string_series.reindex(string_series.index) - assert np.may_share_memory(string_series.index, identity.index) + assert tm.shares_memory(string_series.index, identity.index) assert identity.index.is_(string_series.index) assert identity.index.identical(string_series.index) @@ -137,15 +130,13 @@ def test_reindex_pad2(): # GH4604 s = Series([1, 2, 3, 4, 5], index=["a", "b", "c", "d", "e"]) new_index = ["a", "g", "c", "f"] - expected = Series([1, 1, 3, 3], index=new_index) + expected = Series([1, 1, 3, 3.0], index=new_index) # this changes dtype because the ffill happens after result = s.reindex(new_index).ffill() - tm.assert_series_equal(result, expected.astype("float64")) + tm.assert_series_equal(result, expected) - msg = "The 'downcast' keyword in ffill is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = s.reindex(new_index).ffill(downcast="infer") + result = s.reindex(new_index).ffill() tm.assert_series_equal(result, expected) expected = Series([1, 5, 
3, 5], index=new_index) @@ -157,20 +148,16 @@ def test_reindex_inference(): # inference of new dtype s = Series([True, False, False, True], index=list("abcd")) new_index = "agc" - msg = "Downcasting object dtype arrays on" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = s.reindex(list(new_index)).ffill() - expected = Series([True, True, False], index=list(new_index)) + result = s.reindex(list(new_index)).ffill() + expected = Series([True, True, False], index=list(new_index), dtype=object) tm.assert_series_equal(result, expected) def test_reindex_downcasting(): # GH4618 shifted series downcasting s = Series(False, index=range(5)) - msg = "Downcasting object dtype arrays on" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = s.shift(1).bfill() - expected = Series(False, index=range(5)) + result = s.shift(1).bfill() + expected = Series(False, index=range(5), dtype=object) tm.assert_series_equal(result, expected) @@ -242,13 +229,15 @@ def test_reindex_categorical(): tm.assert_series_equal(result, expected) # partial reindexing - expected = Series(Categorical(values=["b", "c"], categories=["a", "b", "c"])) - expected.index = [1, 2] + expected = Series( + Categorical(values=["b", "c"], categories=["a", "b", "c"]), index=range(1, 3) + ) result = s.reindex([1, 2]) tm.assert_series_equal(result, expected) - expected = Series(Categorical(values=["c", np.nan], categories=["a", "b", "c"])) - expected.index = [2, 3] + expected = Series( + Categorical(values=["c", np.nan], categories=["a", "b", "c"]), index=range(2, 4) + ) result = s.reindex([2, 3]) tm.assert_series_equal(result, expected) @@ -269,11 +258,11 @@ def test_reindex_fill_value(): # floats floats = Series([1.0, 2.0, 3.0]) result = floats.reindex([1, 2, 3]) - expected = Series([2.0, 3.0, np.nan], index=[1, 2, 3]) + expected = Series([2.0, 3.0, np.nan], index=range(1, 4)) tm.assert_series_equal(result, expected) result = floats.reindex([1, 2, 3], fill_value=0) - expected = Series([2.0, 3.0, 0], index=[1, 2, 3]) + expected = Series([2.0, 3.0, 0], index=range(1, 4)) tm.assert_series_equal(result, expected) # ----------------------------------------------------------- @@ -281,12 +270,12 @@ def test_reindex_fill_value(): ints = Series([1, 2, 3]) result = ints.reindex([1, 2, 3]) - expected = Series([2.0, 3.0, np.nan], index=[1, 2, 3]) + expected = Series([2.0, 3.0, np.nan], index=range(1, 4)) tm.assert_series_equal(result, expected) # don't upcast result = ints.reindex([1, 2, 3], fill_value=0) - expected = Series([2, 3, 0], index=[1, 2, 3]) + expected = Series([2, 3, 0], index=range(1, 4)) assert issubclass(result.dtype.type, np.integer) tm.assert_series_equal(result, expected) @@ -295,11 +284,11 @@ def test_reindex_fill_value(): objects = Series([1, 2, 3], dtype=object) result = objects.reindex([1, 2, 3]) - expected = Series([2, 3, np.nan], index=[1, 2, 3], dtype=object) + expected = Series([2, 3, np.nan], index=range(1, 4), dtype=object) tm.assert_series_equal(result, expected) result = objects.reindex([1, 2, 3], fill_value="foo") - expected = Series([2, 3, "foo"], index=[1, 2, 3], dtype=object) + expected = Series([2, 3, "foo"], index=range(1, 4), dtype=object) tm.assert_series_equal(result, expected) # ------------------------------------------------------------ @@ -307,18 +296,17 @@ def test_reindex_fill_value(): bools = Series([True, False, True]) result = bools.reindex([1, 2, 3]) - expected = Series([False, True, np.nan], index=[1, 2, 3], dtype=object) + expected = Series([False, True, np.nan], 
index=range(1, 4), dtype=object) tm.assert_series_equal(result, expected) result = bools.reindex([1, 2, 3], fill_value=False) - expected = Series([False, True, False], index=[1, 2, 3]) + expected = Series([False, True, False], index=range(1, 4)) tm.assert_series_equal(result, expected) -@td.skip_array_manager_not_yet_implemented @pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"]) @pytest.mark.parametrize("fill_value", ["string", 0, Timedelta(0)]) -def test_reindex_fill_value_datetimelike_upcast(dtype, fill_value, using_array_manager): +def test_reindex_fill_value_datetimelike_upcast(dtype, fill_value): # https://github.com/pandas-dev/pandas/issues/42921 if dtype == "timedelta64[ns]" and fill_value == Timedelta(0): # use the scalar that is not compatible with the dtype for this test fill_value = Timestamp(0) ser = Series([NaT], dtype=dtype) result = ser.reindex([0, 1], fill_value=fill_value) - expected = Series([NaT, fill_value], index=[0, 1], dtype=object) + expected = Series([NaT, fill_value], index=range(2), dtype=object) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_reindex_like.py b/pandas/tests/series/methods/test_reindex_like.py index 7f24c778feb1b..10b8ac5817636 100644 --- a/pandas/tests/series/methods/test_reindex_like.py +++ b/pandas/tests/series/methods/test_reindex_like.py @@ -20,7 +20,8 @@ def test_reindex_like(datetime_series): series1 = Series([5, None, None], [day1, day2, day3]) series2 = Series([None, None], [day1, day3]) - result = series1.reindex_like(series2, method="pad") + with tm.assert_produces_warning(FutureWarning): + result = series1.reindex_like(series2, method="pad") expected = Series([5, np.nan], index=[day1, day3]) tm.assert_series_equal(result, expected) @@ -32,10 +33,13 @@ def test_reindex_like_nearest(): other = ser.reindex(target, method="nearest") expected = Series(np.around(target).astype("int64"), target) - result = ser.reindex_like(other, method="nearest") + with tm.assert_produces_warning(FutureWarning): + result = ser.reindex_like(other, method="nearest") tm.assert_series_equal(expected, result) - result = ser.reindex_like(other, method="nearest", tolerance=1) + with tm.assert_produces_warning(FutureWarning): + result = ser.reindex_like(other, method="nearest", tolerance=1) tm.assert_series_equal(expected, result) - result = ser.reindex_like(other, method="nearest", tolerance=[1, 2, 3, 4]) + with tm.assert_produces_warning(FutureWarning): + result = ser.reindex_like(other, method="nearest", tolerance=[1, 2, 3, 4]) tm.assert_series_equal(expected, result) diff --git a/pandas/tests/series/methods/test_rename.py b/pandas/tests/series/methods/test_rename.py index 119654bd19b3f..f5a97d61990a4 100644 --- a/pandas/tests/series/methods/test_rename.py +++ b/pandas/tests/series/methods/test_rename.py @@ -64,7 +64,7 @@ def test_rename_set_name_inplace(self, using_infer_string): assert ser.name == name exp = np.array(["a", "b", "c"], dtype=np.object_) if using_infer_string: - exp = array(exp, dtype="string[pyarrow_numpy]") + exp = array(exp, dtype="str") tm.assert_extension_array_equal(ser.index.values, exp) else: tm.assert_numpy_array_equal(ser.index.values, exp) @@ -169,16 +169,11 @@ def test_rename_error_arg(self): with pytest.raises(KeyError, match=match): ser.rename({2: 9}, errors="raise")
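# Illustrative sketch of the Copy-on-Write semantics the rewritten test below
# relies on: rename() returns a lazy copy, so mutating the original afterwards
# no longer leaks into the renamed result.
import pandas as pd

ser = pd.Series(["foo", "bar"])
renamed = ser.rename({1: 9})  # index label 1 -> 9; data shared until a write occurs
ser[0] = "foobar"             # triggers a copy under CoW
assert renamed[0] == "foo"    # the renamed result keeps the original value
- def test_rename_copy_false(self, using_copy_on_write, warn_copy_on_write): + def test_rename_copy_false(self): # GH 46889 ser = 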
Series(["foo", "bar"]) ser_orig = ser.copy() - shallow_copy = ser.rename({1: 9}, copy=False) - with tm.assert_cow_warning(warn_copy_on_write): - ser[0] = "foobar" - if using_copy_on_write: - assert ser_orig[0] == shallow_copy[0] - assert ser_orig[1] == shallow_copy[9] - else: - assert ser[0] == shallow_copy[0] - assert ser[1] == shallow_copy[9] + shallow_copy = ser.rename({1: 9}) + ser[0] = "foobar" + assert ser_orig[0] == shallow_copy[0] + assert ser_orig[1] == shallow_copy[9] diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 4330153c186ca..abd5d075ea3d5 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype - import pandas as pd import pandas._testing as tm from pandas.core.arrays import IntervalArray @@ -78,9 +76,8 @@ def test_replace(self): ser[20:30] = "bar" # replace list with a single value - msg = "Downcasting behavior in `replace`" - with tm.assert_produces_warning(FutureWarning, match=msg): - rs = ser.replace([np.nan, "foo", "bar"], -1) + + rs = ser.replace([np.nan, "foo", "bar"], -1) assert (rs[:5] == -1).all() assert (rs[6:10] == -1).all() @@ -88,8 +85,7 @@ def test_replace(self): assert (pd.isna(ser[:5])).all() # replace with different values - with tm.assert_produces_warning(FutureWarning, match=msg): - rs = ser.replace({np.nan: -1, "foo": -2, "bar": -3}) + rs = ser.replace({np.nan: -1, "foo": -2, "bar": -3}) assert (rs[:5] == -1).all() assert (rs[6:10] == -2).all() @@ -97,13 +93,11 @@ def test_replace(self): assert (pd.isna(ser[:5])).all() # replace with different values with 2 lists - with tm.assert_produces_warning(FutureWarning, match=msg): - rs2 = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3]) + rs2 = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3]) tm.assert_series_equal(rs, rs2) # replace inplace - with tm.assert_produces_warning(FutureWarning, match=msg): - return_value = ser.replace([np.nan, "foo", "bar"], -1, inplace=True) + return_value = ser.replace([np.nan, "foo", "bar"], -1, inplace=True) assert return_value is None assert (ser[:5] == -1).all() @@ -141,20 +135,15 @@ def test_replace_gh5319(self): # API change from 0.12? 
# GH 5319 ser = pd.Series([0, np.nan, 2, 3, 4]) - expected = ser.ffill() msg = ( - "Series.replace without 'value' and with non-dict-like " - "'to_replace' is deprecated" + "Series.replace must specify either 'value', " + "a dict-like 'to_replace', or dict-like 'regex'" ) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = ser.replace([np.nan]) - tm.assert_series_equal(result, expected) + with pytest.raises(ValueError, match=msg): + ser.replace([np.nan]) - ser = pd.Series([0, np.nan, 2, 3, 4]) - expected = ser.ffill() - with tm.assert_produces_warning(FutureWarning, match=msg): - result = ser.replace(np.nan) - tm.assert_series_equal(result, expected) + with pytest.raises(ValueError, match=msg): + ser.replace(np.nan) def test_replace_datetime64(self): # GH 5797 @@ -186,32 +175,16 @@ def test_replace_timedelta_td64(self): def test_replace_with_single_list(self): ser = pd.Series([0, 1, 2, 3, 4]) - msg2 = ( - "Series.replace without 'value' and with non-dict-like " - "'to_replace' is deprecated" + msg = ( + "Series.replace must specify either 'value', " + "a dict-like 'to_replace', or dict-like 'regex'" ) - with tm.assert_produces_warning(FutureWarning, match=msg2): - result = ser.replace([1, 2, 3]) - tm.assert_series_equal(result, pd.Series([0, 0, 0, 0, 4])) - - s = ser.copy() - with tm.assert_produces_warning(FutureWarning, match=msg2): - return_value = s.replace([1, 2, 3], inplace=True) - assert return_value is None - tm.assert_series_equal(s, pd.Series([0, 0, 0, 0, 4])) + with pytest.raises(ValueError, match=msg): + ser.replace([1, 2, 3]) - # make sure things don't get corrupted when fillna call fails s = ser.copy() - msg = ( - r"Invalid fill method\. Expecting pad \(ffill\) or backfill " - r"\(bfill\)\. Got crash_cymbal" - ) - msg3 = "The 'method' keyword in Series.replace is deprecated" with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=msg3): - return_value = s.replace([1, 2, 3], inplace=True, method="crash_cymbal") - assert return_value is None - tm.assert_series_equal(s, ser) + s.replace([1, 2, 3], inplace=True) def test_replace_mixed_types(self): ser = pd.Series(np.arange(5), dtype="int64") @@ -301,9 +274,7 @@ def test_replace2(self): ser[20:30] = "bar" # replace list with a single value - msg = "Downcasting behavior in `replace`" - with tm.assert_produces_warning(FutureWarning, match=msg): - rs = ser.replace([np.nan, "foo", "bar"], -1) + rs = ser.replace([np.nan, "foo", "bar"], -1) assert (rs[:5] == -1).all() assert (rs[6:10] == -1).all() @@ -311,8 +282,7 @@ def test_replace2(self): assert (pd.isna(ser[:5])).all() # replace with different values - with tm.assert_produces_warning(FutureWarning, match=msg): - rs = ser.replace({np.nan: -1, "foo": -2, "bar": -3}) + rs = ser.replace({np.nan: -1, "foo": -2, "bar": -3}) assert (rs[:5] == -1).all() assert (rs[6:10] == -2).all() @@ -320,13 +290,11 @@ def test_replace2(self): assert (pd.isna(ser[:5])).all() # replace with different values with 2 lists - with tm.assert_produces_warning(FutureWarning, match=msg): - rs2 = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3]) + rs2 = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3]) tm.assert_series_equal(rs, rs2) # replace inplace - with tm.assert_produces_warning(FutureWarning, match=msg): - return_value = ser.replace([np.nan, "foo", "bar"], -1, inplace=True) + return_value = ser.replace([np.nan, "foo", "bar"], -1, inplace=True) assert return_value is None assert (ser[:5] == -1).all() assert (ser[6:10] == -1).all() @@ -385,44 
+353,35 @@ def test_replace_unicode_with_number(self): def test_replace_mixed_types_with_string(self): # Testing mixed s = pd.Series([1, 2, 3, "4", 4, 5]) - msg = "Downcasting behavior in `replace`" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = s.replace([2, "4"], np.nan) - expected = pd.Series([1, np.nan, 3, np.nan, 4, 5]) + result = s.replace([2, "4"], np.nan) + expected = pd.Series([1, np.nan, 3, np.nan, 4, 5], dtype=object) tm.assert_series_equal(expected, result) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string") @pytest.mark.parametrize( "categorical, numeric", [ - (pd.Categorical(["A"], categories=["A", "B"]), [1]), - (pd.Categorical(["A", "B"], categories=["A", "B"]), [1, 2]), + (["A"], [1]), + (["A", "B"], [1, 2]), ], ) def test_replace_categorical(self, categorical, numeric): # GH 24971, GH#23305 - ser = pd.Series(categorical) - msg = "Downcasting behavior in `replace`" - msg = "with CategoricalDtype is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = ser.replace({"A": 1, "B": 2}) + ser = pd.Series(pd.Categorical(categorical, categories=["A", "B"])) + result = ser.cat.rename_categories({"A": 1, "B": 2}) expected = pd.Series(numeric).astype("category") if 2 not in expected.cat.categories: # i.e. categories should be [1, 2] even if there are no "B"s present # GH#44940 expected = expected.cat.add_categories(2) - tm.assert_series_equal(expected, result) + tm.assert_series_equal(expected, result, check_categorical=False) - @pytest.mark.parametrize( - "data, data_exp", [(["a", "b", "c"], ["b", "b", "c"]), (["a"], ["b"])] - ) - def test_replace_categorical_inplace(self, data, data_exp): + def test_replace_categorical_inplace(self): # GH 53358 + data = ["a", "b", "c"] + data_exp = ["b", "b", "c"] result = pd.Series(data, dtype="category") - msg = "with CategoricalDtype is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result.replace(to_replace="a", value="b", inplace=True) - expected = pd.Series(data_exp, dtype="category") + result.replace(to_replace="a", value="b", inplace=True) + expected = pd.Series(pd.Categorical(data_exp, categories=data)) tm.assert_series_equal(result, expected) def test_replace_categorical_single(self): @@ -437,25 +396,10 @@ def test_replace_categorical_single(self): expected = expected.cat.remove_unused_categories() assert c[2] != "foo" - msg = "with CategoricalDtype is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = c.replace(c[2], "foo") + result = c.cat.rename_categories({c.values[2]: "foo"}) tm.assert_series_equal(expected, result) assert c[2] != "foo" # ensure non-inplace call does not alter original - msg = "with CategoricalDtype is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - return_value = c.replace(c[2], "foo", inplace=True) - assert return_value is None - tm.assert_series_equal(expected, c) - - first_value = c[0] - msg = "with CategoricalDtype is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - return_value = c.replace(c[1], c[0], inplace=True) - assert return_value is None - assert c[0] == c[1] == first_value # test replacing with existing value - def test_replace_with_no_overflowerror(self): # GH 25616 # casts to object without Exception from OverflowError @@ -508,13 +452,8 @@ def test_replace_invalid_to_replace(self): r"Expecting 'to_replace' to be either a scalar, array-like, " r"dict or None, got invalid type.*" ) - msg2 = ( - 
"Series.replace without 'value' and with non-dict-like " - "'to_replace' is deprecated" - ) with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=msg2): - series.replace(lambda x: x.strip()) + series.replace(lambda x: x.strip()) @pytest.mark.parametrize("frame", [False, True]) def test_replace_nonbool_regex(self, frame): @@ -555,6 +494,15 @@ def test_replace_only_one_dictlike_arg(self, fixed_now_ts): with pytest.raises(ValueError, match=msg): ser.replace(to_replace, value) + def test_replace_dict_like_with_dict_like(self): + # GH 59452 + s = pd.Series([1, 2, 3, 4, 5]) + to_replace = pd.Series([1]) + value = pd.Series([75]) + msg = "to_replace and value cannot be dict-like for Series.replace" + with pytest.raises(ValueError, match=msg): + s.replace(to_replace, value) + def test_replace_extension_other(self, frame_or_series): # https://github.com/pandas-dev/pandas/issues/34530 obj = frame_or_series(pd.array([1, 2, 3], dtype="Int64")) @@ -562,62 +510,6 @@ def test_replace_extension_other(self, frame_or_series): # should not have changed dtype tm.assert_equal(obj, result) - def _check_replace_with_method(self, ser: pd.Series): - df = ser.to_frame() - - msg1 = "The 'method' keyword in Series.replace is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg1): - res = ser.replace(ser[1], method="pad") - expected = pd.Series([ser[0], ser[0]] + list(ser[2:]), dtype=ser.dtype) - tm.assert_series_equal(res, expected) - - msg2 = "The 'method' keyword in DataFrame.replace is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg2): - res_df = df.replace(ser[1], method="pad") - tm.assert_frame_equal(res_df, expected.to_frame()) - - ser2 = ser.copy() - with tm.assert_produces_warning(FutureWarning, match=msg1): - res2 = ser2.replace(ser[1], method="pad", inplace=True) - assert res2 is None - tm.assert_series_equal(ser2, expected) - - with tm.assert_produces_warning(FutureWarning, match=msg2): - res_df2 = df.replace(ser[1], method="pad", inplace=True) - assert res_df2 is None - tm.assert_frame_equal(df, expected.to_frame()) - - def test_replace_ea_dtype_with_method(self, any_numeric_ea_dtype): - arr = pd.array([1, 2, pd.NA, 4], dtype=any_numeric_ea_dtype) - ser = pd.Series(arr) - - self._check_replace_with_method(ser) - - @pytest.mark.parametrize("as_categorical", [True, False]) - def test_replace_interval_with_method(self, as_categorical): - # in particular interval that can't hold NA - - idx = pd.IntervalIndex.from_breaks(range(4)) - ser = pd.Series(idx) - if as_categorical: - ser = ser.astype("category") - - self._check_replace_with_method(ser) - - @pytest.mark.parametrize("as_period", [True, False]) - @pytest.mark.parametrize("as_categorical", [True, False]) - def test_replace_datetimelike_with_method(self, as_period, as_categorical): - idx = pd.date_range("2016-01-01", periods=5, tz="US/Pacific") - if as_period: - idx = idx.tz_localize(None).to_period("D") - - ser = pd.Series(idx) - ser.iloc[-2] = pd.NaT - if as_categorical: - ser = ser.astype("category") - - self._check_replace_with_method(ser) - def test_replace_with_compiled_regex(self): # https://github.com/pandas-dev/pandas/issues/35680 s = pd.Series(["a", "b", "c"]) @@ -628,7 +520,8 @@ def test_replace_with_compiled_regex(self): def test_pandas_replace_na(self): # GH#43344 - ser = pd.Series(["AA", "BB", "CC", "DD", "EE", "", pd.NA], dtype="string") + # GH#56599 + ser = pd.Series(["AA", "BB", "CC", "DD", "EE", "", pd.NA, "AA"], dtype="string") regex_mapping = { "AA": 
"CC", "BB": "CC", @@ -636,7 +529,9 @@ def test_pandas_replace_na(self): "CC": "CC-REPL", } result = ser.replace(regex_mapping, regex=True) - exp = pd.Series(["CC", "CC", "CC-REPL", "DD", "CC", "", pd.NA], dtype="string") + exp = pd.Series( + ["CC", "CC", "CC-REPL", "DD", "CC", "", pd.NA, "CC"], dtype="string" + ) tm.assert_series_equal(result, exp) @pytest.mark.parametrize( @@ -731,15 +626,19 @@ def test_replace_nullable_numeric(self): with pytest.raises(TypeError, match="Invalid value"): ints.replace(1, 9.5) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 1 in string") @pytest.mark.parametrize("regex", [False, True]) def test_replace_regex_dtype_series(self, regex): # GH-48644 - series = pd.Series(["0"]) - expected = pd.Series([1]) - msg = "Downcasting behavior in `replace`" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = series.replace(to_replace="0", value=1, regex=regex) + series = pd.Series(["0"], dtype=object) + expected = pd.Series([1], dtype=object) + result = series.replace(to_replace="0", value=1, regex=regex) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("regex", [False, True]) + def test_replace_regex_dtype_series_string(self, regex): + series = pd.Series(["0"], dtype="str") + expected = pd.Series([1], dtype=object) + result = series.replace(to_replace="0", value=1, regex=regex) tm.assert_series_equal(result, expected) def test_replace_different_int_types(self, any_int_numpy_dtype): @@ -761,20 +660,18 @@ def test_replace_value_none_dtype_numeric(self, val): expected = pd.Series([1, None], dtype=object) tm.assert_series_equal(result, expected) - def test_replace_change_dtype_series(self, using_infer_string): + def test_replace_change_dtype_series(self): # GH#25797 - df = pd.DataFrame.from_dict({"Test": ["0.5", True, "0.6"]}) - warn = FutureWarning if using_infer_string else None - with tm.assert_produces_warning(warn, match="Downcasting"): - df["Test"] = df["Test"].replace([True], [np.nan]) - expected = pd.DataFrame.from_dict({"Test": ["0.5", np.nan, "0.6"]}) + df = pd.DataFrame({"Test": ["0.5", True, "0.6"]}, dtype=object) + df["Test"] = df["Test"].replace([True], [np.nan]) + expected = pd.DataFrame({"Test": ["0.5", np.nan, "0.6"]}, dtype=object) tm.assert_frame_equal(df, expected) - df = pd.DataFrame.from_dict({"Test": ["0.5", None, "0.6"]}) + df = pd.DataFrame({"Test": ["0.5", None, "0.6"]}, dtype=object) df["Test"] = df["Test"].replace([None], [np.nan]) tm.assert_frame_equal(df, expected) - df = pd.DataFrame.from_dict({"Test": ["0.5", None, "0.6"]}) + df = pd.DataFrame({"Test": ["0.5", None, "0.6"]}, dtype=object) df["Test"] = df["Test"].fillna(np.nan) tm.assert_frame_equal(df, expected) @@ -799,3 +696,22 @@ def test_replace_numeric_column_with_na(self, val): ser.replace(to_replace=1, value=pd.NA, inplace=True) tm.assert_series_equal(ser, expected) + + def test_replace_ea_float_with_bool(self): + # GH#55398 + ser = pd.Series([0.0], dtype="Float64") + expected = ser.copy() + result = ser.replace(False, 1.0) + tm.assert_series_equal(result, expected) + + ser = pd.Series([False], dtype="boolean") + expected = ser.copy() + result = ser.replace(0.0, True) + tm.assert_series_equal(result, expected) + + def test_replace_all_NA(self): + # GH#60688 + df = pd.Series([pd.NA, pd.NA]) + result = df.replace({r"^#": "$"}, regex=True) + expected = pd.Series([pd.NA, pd.NA]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_reset_index.py 
b/pandas/tests/series/methods/test_reset_index.py index 48e2608a1032a..d42aafc001680 100644 --- a/pandas/tests/series/methods/test_reset_index.py +++ b/pandas/tests/series/methods/test_reset_index.py @@ -39,7 +39,7 @@ def test_reset_index(self): columns=Index(list("ABCD"), dtype=object), index=Index([f"i-{i}" for i in range(30)], dtype=object), )[:5] - ser = df.stack(future_stack=True) + ser = df.stack() ser.index.names = ["hash", "category"] ser.name = "value" @@ -193,7 +193,7 @@ def test_reset_index_dtypes_on_empty_series_with_multiindex( # GH 19602 - Preserve dtype on empty Series with MultiIndex idx = MultiIndex.from_product([[0, 1], [0.5, 1.0], array]) result = Series(dtype=object, index=idx)[:0].reset_index().dtypes - exp = "string" if using_infer_string else object + exp = "str" if using_infer_string else object expected = Series( { "level_0": np.int64, diff --git a/pandas/tests/series/methods/test_round.py b/pandas/tests/series/methods/test_round.py index 7f60c94f10e4f..c330b7a7dfbbb 100644 --- a/pandas/tests/series/methods/test_round.py +++ b/pandas/tests/series/methods/test_round.py @@ -63,3 +63,12 @@ def test_round_nat(self, method, freq, unit): round_method = getattr(ser.dt, method) result = round_method(freq) tm.assert_series_equal(result, expected) + + def test_round_ea_boolean(self): + # GH#55936 + ser = Series([True, False], dtype="boolean") + expected = ser.copy() + result = ser.round(2) + tm.assert_series_equal(result, expected) + result.iloc[0] = False + tm.assert_series_equal(ser, expected) diff --git a/pandas/tests/series/methods/test_set_name.py b/pandas/tests/series/methods/test_set_name.py index cbc8ebde7a8ab..137207053c225 100644 --- a/pandas/tests/series/methods/test_set_name.py +++ b/pandas/tests/series/methods/test_set_name.py @@ -14,7 +14,7 @@ def test_set_name(self): def test_set_name_attribute(self): ser = Series([1, 2, 3]) ser2 = Series([1, 2, 3], name="bar") - for name in [7, 7.0, "name", datetime(2001, 1, 1), (1,), "\u05D0"]: + for name in [7, 7.0, "name", datetime(2001, 1, 1), (1,), "\u05d0"]: ser.name = name assert ser.name == name ser2.name = name diff --git a/pandas/tests/series/methods/test_size.py b/pandas/tests/series/methods/test_size.py index 20a454996fa44..043e4b66dbf16 100644 --- a/pandas/tests/series/methods/test_size.py +++ b/pandas/tests/series/methods/test_size.py @@ -10,8 +10,6 @@ ({"a": 1, "b": 2, "c": 3}, None, 3), ([1, 2, 3], ["x", "y", "z"], 3), ([1, 2, 3, 4, 5], ["x", "y", "z", "w", "n"], 5), - ([1, 2, 3], None, 3), - ([1, 2, 3], ["x", "y", "z"], 3), ([1, 2, 3, 4], ["x", "y", "z", "w"], 4), ], ) diff --git a/pandas/tests/series/methods/test_sort_values.py b/pandas/tests/series/methods/test_sort_values.py index 4808272879071..995d9d176fc2b 100644 --- a/pandas/tests/series/methods/test_sort_values.py +++ b/pandas/tests/series/methods/test_sort_values.py @@ -10,7 +10,7 @@ class TestSeriesSortValues: - def test_sort_values(self, datetime_series, using_copy_on_write): + def test_sort_values(self, datetime_series): # check indexes are reordered corresponding with the values ser = Series([3, 2, 4, 1], ["A", "B", "C", "D"]) expected = Series([1, 2, 3, 4], ["D", "B", "A", "C"]) @@ -79,21 +79,11 @@ def test_sort_values(self, datetime_series, using_copy_on_write): # Series.sort_values operating on a view df = DataFrame(np.random.default_rng(2).standard_normal((10, 4))) s = df.iloc[:, 0] - - msg = ( - "This Series is a view of some other array, to sort in-place " - "you must create a copy" - ) - if using_copy_on_write: - 
s.sort_values(inplace=True) - tm.assert_series_equal(s, df.iloc[:, 0].sort_values()) - else: - with pytest.raises(ValueError, match=msg): - s.sort_values(inplace=True) + s.sort_values(inplace=True) + tm.assert_series_equal(s, df.iloc[:, 0].sort_values()) def test_sort_values_categorical(self): - c = Categorical(["a", "b", "b", "a"], ordered=False) - cat = Series(c.copy()) + cat = Series(Categorical(["a", "b", "b", "a"], ordered=False)) # sort in the categories order expected = Series( @@ -204,7 +194,6 @@ def test_sort_values_validate_ascending_for_value_error(self): with pytest.raises(ValueError, match=msg): ser.sort_values(ascending="False") - @pytest.mark.parametrize("ascending", [False, 0, 1, True]) def test_sort_values_validate_ascending_functional(self, ascending): # GH41634 ser = Series([23, 7, 21]) diff --git a/pandas/tests/series/methods/test_to_csv.py b/pandas/tests/series/methods/test_to_csv.py index 1c17013d621c7..3e3eb36112680 100644 --- a/pandas/tests/series/methods/test_to_csv.py +++ b/pandas/tests/series/methods/test_to_csv.py @@ -24,58 +24,58 @@ def read_csv(self, path, **kwargs): return out - def test_from_csv(self, datetime_series, string_series): + def test_from_csv(self, datetime_series, string_series, temp_file): # freq doesn't round-trip datetime_series.index = datetime_series.index._with_freq(None) - with tm.ensure_clean() as path: - datetime_series.to_csv(path, header=False) - ts = self.read_csv(path, parse_dates=True) - tm.assert_series_equal(datetime_series, ts, check_names=False) + path = temp_file + datetime_series.to_csv(path, header=False) + ts = self.read_csv(path, parse_dates=True) + expected = datetime_series.copy() + expected.index = expected.index.as_unit("s") + tm.assert_series_equal(expected, ts, check_names=False) - assert ts.name is None - assert ts.index.name is None + assert ts.name is None + assert ts.index.name is None - # see gh-10483 - datetime_series.to_csv(path, header=True) - ts_h = self.read_csv(path, header=0) - assert ts_h.name == "ts" + # see gh-10483 + datetime_series.to_csv(path, header=True) + ts_h = self.read_csv(path, header=0) + assert ts_h.name == "ts" - string_series.to_csv(path, header=False) - series = self.read_csv(path) - tm.assert_series_equal(string_series, series, check_names=False) + string_series.to_csv(path, header=False) + series = self.read_csv(path) + tm.assert_series_equal(string_series, series, check_names=False) - assert series.name is None - assert series.index.name is None + assert series.name is None + assert series.index.name is None - string_series.to_csv(path, header=True) - series_h = self.read_csv(path, header=0) - assert series_h.name == "series" + string_series.to_csv(path, header=True) + series_h = self.read_csv(path, header=0) + assert series_h.name == "series" - with open(path, "w", encoding="utf-8") as outfile: - outfile.write("1998-01-01|1.0\n1999-01-01|2.0") + with open(path, "w", encoding="utf-8") as outfile: + outfile.write("1998-01-01|1.0\n1999-01-01|2.0") - series = self.read_csv(path, sep="|", parse_dates=True) - check_series = Series( - {datetime(1998, 1, 1): 1.0, datetime(1999, 1, 1): 2.0} - ) - tm.assert_series_equal(check_series, series) + series = self.read_csv(path, sep="|", parse_dates=True) + check_series = Series({datetime(1998, 1, 1): 1.0, datetime(1999, 1, 1): 2.0}) + check_series.index = check_series.index.as_unit("s") + tm.assert_series_equal(check_series, series) - series = self.read_csv(path, sep="|", parse_dates=False) - check_series = Series({"1998-01-01": 1.0, "1999-01-01": 
2.0}) - tm.assert_series_equal(check_series, series) + series = self.read_csv(path, sep="|", parse_dates=False) + check_series = Series({"1998-01-01": 1.0, "1999-01-01": 2.0}) + tm.assert_series_equal(check_series, series) - def test_to_csv(self, datetime_series): - with tm.ensure_clean() as path: - datetime_series.to_csv(path, header=False) + def test_to_csv(self, datetime_series, temp_file): + datetime_series.to_csv(temp_file, header=False) - with open(path, newline=None, encoding="utf-8") as f: - lines = f.readlines() - assert lines[1] != "\n" + with open(temp_file, newline=None, encoding="utf-8") as f: + lines = f.readlines() + assert lines[1] != "\n" - datetime_series.to_csv(path, index=False, header=False) - arr = np.loadtxt(path) - tm.assert_almost_equal(arr, datetime_series.values) + datetime_series.to_csv(temp_file, index=False, header=False) + arr = np.loadtxt(temp_file) + tm.assert_almost_equal(arr, datetime_series.values) def test_to_csv_unicode_index(self): buf = StringIO() @@ -87,14 +87,13 @@ def test_to_csv_unicode_index(self): s2 = self.read_csv(buf, index_col=0, encoding="UTF-8") tm.assert_series_equal(s, s2) - def test_to_csv_float_format(self): - with tm.ensure_clean() as filename: - ser = Series([0.123456, 0.234567, 0.567567]) - ser.to_csv(filename, float_format="%.2f", header=False) + def test_to_csv_float_format(self, temp_file): + ser = Series([0.123456, 0.234567, 0.567567]) + ser.to_csv(temp_file, float_format="%.2f", header=False) - rs = self.read_csv(filename) - xp = Series([0.12, 0.23, 0.57]) - tm.assert_series_equal(rs, xp) + rs = self.read_csv(temp_file) + xp = Series([0.12, 0.23, 0.57]) + tm.assert_series_equal(rs, xp) def test_to_csv_list_entries(self): s = Series(["jack and jill", "jesse and frank"]) @@ -128,55 +127,51 @@ def test_to_csv_path_is_none(self): ), ], ) - def test_to_csv_compression(self, s, encoding, compression): - with tm.ensure_clean() as filename: - s.to_csv(filename, compression=compression, encoding=encoding, header=True) - # test the round trip - to_csv -> read_csv - result = pd.read_csv( - filename, - compression=compression, - encoding=encoding, - index_col=0, - ).squeeze("columns") - tm.assert_series_equal(s, result) - - # test the round trip using file handle - to_csv -> read_csv - with get_handle( - filename, "w", compression=compression, encoding=encoding - ) as handles: - s.to_csv(handles.handle, encoding=encoding, header=True) - - result = pd.read_csv( - filename, - compression=compression, - encoding=encoding, - index_col=0, - ).squeeze("columns") - tm.assert_series_equal(s, result) - - # explicitly ensure file was compressed - with tm.decompress_file(filename, compression) as fh: - text = fh.read().decode(encoding or "utf8") - assert s.name in text - - with tm.decompress_file(filename, compression) as fh: - tm.assert_series_equal( - s, - pd.read_csv(fh, index_col=0, encoding=encoding).squeeze("columns"), - ) - - def test_to_csv_interval_index(self, using_infer_string): + def test_to_csv_compression(self, s, encoding, compression, temp_file): + filename = temp_file + s.to_csv(filename, compression=compression, encoding=encoding, header=True) + # test the round trip - to_csv -> read_csv + result = pd.read_csv( + filename, + compression=compression, + encoding=encoding, + index_col=0, + ).squeeze("columns") + tm.assert_series_equal(s, result) + + # test the round trip using file handle - to_csv -> read_csv + with get_handle( + filename, "w", compression=compression, encoding=encoding + ) as handles: + s.to_csv(handles.handle, 
encoding=encoding, header=True) + + result = pd.read_csv( + filename, + compression=compression, + encoding=encoding, + index_col=0, + ).squeeze("columns") + tm.assert_series_equal(s, result) + + # explicitly ensure file was compressed + with tm.decompress_file(filename, compression) as fh: + text = fh.read().decode(encoding or "utf8") + assert s.name in text + + with tm.decompress_file(filename, compression) as fh: + tm.assert_series_equal( + s, + pd.read_csv(fh, index_col=0, encoding=encoding).squeeze("columns"), + ) + + def test_to_csv_interval_index(self, using_infer_string, temp_file): # GH 28210 s = Series(["foo", "bar", "baz"], index=pd.interval_range(0, 3)) - with tm.ensure_clean("__tmp_to_csv_interval_index__.csv") as path: - s.to_csv(path, header=False) - result = self.read_csv(path, index_col=0) - - # can't roundtrip intervalindex via read_csv so check string repr (GH 23595) - expected = s.copy() - if using_infer_string: - expected.index = expected.index.astype("string[pyarrow_numpy]") - else: - expected.index = expected.index.astype(str) - tm.assert_series_equal(result, expected) + s.to_csv(temp_file, header=False) + result = self.read_csv(temp_file, index_col=0) + + # can't roundtrip intervalindex via read_csv so check string repr (GH 23595) + expected = s + expected.index = expected.index.astype("str") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_to_numpy.py b/pandas/tests/series/methods/test_to_numpy.py index 5fe3e19b0a20b..4bc7631090761 100644 --- a/pandas/tests/series/methods/test_to_numpy.py +++ b/pandas/tests/series/methods/test_to_numpy.py @@ -1,9 +1,12 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import ( NA, Series, + Timedelta, ) import pandas._testing as tm @@ -23,3 +26,24 @@ def test_to_numpy_cast_before_setting_na(): result = ser.to_numpy(dtype=np.float64, na_value=np.nan) expected = np.array([1.0]) tm.assert_numpy_array_equal(result, expected) + + +@td.skip_if_no("pyarrow") +def test_to_numpy_arrow_dtype_given(): + # GH#57121 + ser = Series([1, NA], dtype="int64[pyarrow]") + result = ser.to_numpy(dtype="float64") + expected = np.array([1.0, np.nan]) + tm.assert_numpy_array_equal(result, expected) + + +def test_astype_ea_int_to_td_ts(): + # GH#57093 + ser = Series([1, None], dtype="Int64") + result = ser.astype("m8[ns]") + expected = Series([1, Timedelta("nat")], dtype="m8[ns]") + tm.assert_series_equal(result, expected) + + result = ser.astype("M8[ns]") + expected = Series([1, Timedelta("nat")], dtype="M8[ns]") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_tz_localize.py b/pandas/tests/series/methods/test_tz_localize.py index 45620a721f442..53288e8a1f8e7 100644 --- a/pandas/tests/series/methods/test_tz_localize.py +++ b/pandas/tests/series/methods/test_tz_localize.py @@ -1,7 +1,6 @@ from datetime import timezone import pytest -import pytz from pandas._libs.tslibs import timezones @@ -28,7 +27,7 @@ def test_series_tz_localize_ambiguous_bool(self): expected0 = Series([expected0]) expected1 = Series([expected1]) - with tm.external_error_raised(pytz.AmbiguousTimeError): + with tm.external_error_raised(ValueError): ser.dt.tz_localize("US/Central") result = ser.dt.tz_localize("US/Central", ambiguous=True) @@ -79,11 +78,11 @@ def test_tz_localize_nonexistent(self, warsaw, method, exp, unit): df = ser.to_frame() if method == "raise": - with tm.external_error_raised(pytz.NonExistentTimeError): + with tm.external_error_raised(ValueError): 
dti.tz_localize(tz, nonexistent=method) - with tm.external_error_raised(pytz.NonExistentTimeError): + with tm.external_error_raised(ValueError): ser.tz_localize(tz, nonexistent=method) - with tm.external_error_raised(pytz.NonExistentTimeError): + with tm.external_error_raised(ValueError): df.tz_localize(tz, nonexistent=method) elif exp == "invalid": diff --git a/pandas/tests/series/methods/test_unstack.py b/pandas/tests/series/methods/test_unstack.py index 3c70e839c8e20..f61e20c43657d 100644 --- a/pandas/tests/series/methods/test_unstack.py +++ b/pandas/tests/series/methods/test_unstack.py @@ -137,10 +137,10 @@ def test_unstack_mixed_type_name_in_multiindex( def test_unstack_multi_index_categorical_values(): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) - mi = df.stack(future_stack=True).index.rename(["major", "minor"]) + mi = df.stack().index.rename(["major", "minor"]) ser = Series(["foo"] * len(mi), index=mi, name="category", dtype="category") result = ser.unstack() @@ -148,7 +148,7 @@ def test_unstack_multi_index_categorical_values(): dti = ser.index.levels[0] c = pd.Categorical(["foo"] * len(dti)) expected = DataFrame( - {"A": c.copy(), "B": c.copy(), "C": c.copy(), "D": c.copy()}, + {"A": c, "B": c, "C": c, "D": c}, columns=Index(list("ABCD"), name="minor"), index=dti.rename("major"), ) diff --git a/pandas/tests/series/methods/test_update.py b/pandas/tests/series/methods/test_update.py index 3f18ae6c13880..9b5fb098bf3ee 100644 --- a/pandas/tests/series/methods/test_update.py +++ b/pandas/tests/series/methods/test_update.py @@ -14,7 +14,7 @@ class TestUpdate: - def test_update(self, using_copy_on_write): + def test_update(self): s = Series([1.5, np.nan, 3.0, 4.0, np.nan]) s2 = Series([np.nan, 3.5, np.nan, 5.0]) s.update(s2) @@ -29,68 +29,64 @@ def test_update(self, using_copy_on_write): df["c"] = df["c"].astype(object) df_orig = df.copy() - if using_copy_on_write: - with tm.raises_chained_assignment_error(): - df["c"].update(Series(["foo"], index=[0])) - expected = df_orig - else: - with tm.assert_produces_warning(FutureWarning, match="inplace method"): - df["c"].update(Series(["foo"], index=[0])) - expected = DataFrame( - [[1, np.nan, "foo"], [3, 2.0, np.nan]], columns=["a", "b", "c"] - ) - expected["c"] = expected["c"].astype(object) + with tm.raises_chained_assignment_error(): + df["c"].update(Series(["foo"], index=[0])) + expected = df_orig tm.assert_frame_equal(df, expected) @pytest.mark.parametrize( - "other, dtype, expected, warn", + "other, dtype, expected, raises", [ # other is int - ([61, 63], "int32", Series([10, 61, 12], dtype="int32"), None), - ([61, 63], "int64", Series([10, 61, 12]), None), - ([61, 63], float, Series([10.0, 61.0, 12.0]), None), - ([61, 63], object, Series([10, 61, 12], dtype=object), None), + ([61, 63], "int32", Series([10, 61, 12], dtype="int32"), False), + ([61, 63], "int64", Series([10, 61, 12]), False), + ([61, 63], float, Series([10.0, 61.0, 12.0]), False), + ([61, 63], object, Series([10, 61, 12], dtype=object), False), # other is float, but can be cast to int - ([61.0, 63.0], "int32", Series([10, 61, 12], dtype="int32"), None), - ([61.0, 63.0], "int64", Series([10, 61, 12]), None), - ([61.0, 63.0], float, Series([10.0, 61.0, 12.0]), None), - ([61.0, 63.0], object, Series([10, 61.0, 12], dtype=object), None), + ([61.0, 63.0], "int32", Series([10, 61, 12], dtype="int32"), False), + ([61.0, 63.0], 
"int64", Series([10, 61, 12]), False), + ([61.0, 63.0], float, Series([10.0, 61.0, 12.0]), False), + ([61.0, 63.0], object, Series([10, 61.0, 12], dtype=object), False), # others is float, cannot be cast to int - ([61.1, 63.1], "int32", Series([10.0, 61.1, 12.0]), FutureWarning), - ([61.1, 63.1], "int64", Series([10.0, 61.1, 12.0]), FutureWarning), - ([61.1, 63.1], float, Series([10.0, 61.1, 12.0]), None), - ([61.1, 63.1], object, Series([10, 61.1, 12], dtype=object), None), + ([61.1, 63.1], "int32", Series([10.0, 61.1, 12.0]), True), + ([61.1, 63.1], "int64", Series([10.0, 61.1, 12.0]), True), + ([61.1, 63.1], float, Series([10.0, 61.1, 12.0]), False), + ([61.1, 63.1], object, Series([10, 61.1, 12], dtype=object), False), # other is object, cannot be cast - ([(61,), (63,)], "int32", Series([10, (61,), 12]), FutureWarning), - ([(61,), (63,)], "int64", Series([10, (61,), 12]), FutureWarning), - ([(61,), (63,)], float, Series([10.0, (61,), 12.0]), FutureWarning), - ([(61,), (63,)], object, Series([10, (61,), 12]), None), + ([(61,), (63,)], "int32", Series([10, (61,), 12]), True), + ([(61,), (63,)], "int64", Series([10, (61,), 12]), True), + ([(61,), (63,)], float, Series([10.0, (61,), 12.0]), True), + ([(61,), (63,)], object, Series([10, (61,), 12]), False), ], ) - def test_update_dtypes(self, other, dtype, expected, warn): + def test_update_dtypes(self, other, dtype, expected, raises): ser = Series([10, 11, 12], dtype=dtype) other = Series(other, index=[1, 3]) - with tm.assert_produces_warning(warn, match="item of incompatible dtype"): + if raises: + with pytest.raises(TypeError, match="Invalid value"): + ser.update(other) + else: ser.update(other) - - tm.assert_series_equal(ser, expected) + tm.assert_series_equal(ser, expected) @pytest.mark.parametrize( - "series, other, expected", + "values, other, expected", [ # update by key ( - Series({"a": 1, "b": 2, "c": 3, "d": 4}), + {"a": 1, "b": 2, "c": 3, "d": 4}, {"b": 5, "c": np.nan}, - Series({"a": 1, "b": 5, "c": 3, "d": 4}), + {"a": 1, "b": 5, "c": 3, "d": 4}, ), # update by position - (Series([1, 2, 3, 4]), [np.nan, 5, 1], Series([1, 5, 1, 4])), + ([1, 2, 3, 4], [np.nan, 5, 1], [1, 5, 1, 4]), ], ) - def test_update_from_non_series(self, series, other, expected): + def test_update_from_non_series(self, values, other, expected): # GH 33215 + series = Series(values) series.update(other) + expected = Series(expected) tm.assert_series_equal(series, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/series/methods/test_value_counts.py b/pandas/tests/series/methods/test_value_counts.py index 859010d9c79c6..7f882fa348b7e 100644 --- a/pandas/tests/series/methods/test_value_counts.py +++ b/pandas/tests/series/methods/test_value_counts.py @@ -225,30 +225,14 @@ def test_value_counts_bool_with_nan(self, ser, dropna, exp): out = ser.value_counts(dropna=dropna) tm.assert_series_equal(out, exp) - @pytest.mark.parametrize( - "input_array,expected", - [ - ( - [1 + 1j, 1 + 1j, 1, 3j, 3j, 3j], - Series( - [3, 2, 1], - index=Index([3j, 1 + 1j, 1], dtype=np.complex128), - name="count", - ), - ), - ( - np.array([1 + 1j, 1 + 1j, 1, 3j, 3j, 3j], dtype=np.complex64), - Series( - [3, 2, 1], - index=Index([3j, 1 + 1j, 1], dtype=np.complex64), - name="count", - ), - ), - ], - ) - def test_value_counts_complex_numbers(self, input_array, expected): + @pytest.mark.parametrize("dtype", [np.complex128, np.complex64]) + def test_value_counts_complex_numbers(self, dtype): # GH 17927 + input_array = np.array([1 + 1j, 1 + 1j, 1, 3j, 3j, 3j], dtype=dtype) result = 
Series(input_array).value_counts() + expected = Series( + [3, 2, 1], index=Index([3j, 1 + 1j, 1], dtype=dtype), name="count" + ) tm.assert_series_equal(result, expected) def test_value_counts_masked(self): diff --git a/pandas/tests/series/methods/test_view.py b/pandas/tests/series/methods/test_view.py deleted file mode 100644 index 7e0ac372cd443..0000000000000 --- a/pandas/tests/series/methods/test_view.py +++ /dev/null @@ -1,61 +0,0 @@ -import numpy as np -import pytest - -from pandas import ( - Index, - Series, - array, - date_range, -) -import pandas._testing as tm - -pytestmark = pytest.mark.filterwarnings( - "ignore:Series.view is deprecated and will be removed in a future version.:FutureWarning" # noqa: E501 -) - - -class TestView: - def test_view_i8_to_datetimelike(self): - dti = date_range("2000", periods=4, tz="US/Central") - ser = Series(dti.asi8) - - result = ser.view(dti.dtype) - tm.assert_datetime_array_equal(result._values, dti._data._with_freq(None)) - - pi = dti.tz_localize(None).to_period("D") - ser = Series(pi.asi8) - result = ser.view(pi.dtype) - tm.assert_period_array_equal(result._values, pi._data) - - def test_view_tz(self): - # GH#24024 - ser = Series(date_range("2000", periods=4, tz="US/Central")) - result = ser.view("i8") - expected = Series( - [ - 946706400000000000, - 946792800000000000, - 946879200000000000, - 946965600000000000, - ] - ) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize( - "first", ["m8[ns]", "M8[ns]", "M8[ns, US/Central]", "period[D]"] - ) - @pytest.mark.parametrize( - "second", ["m8[ns]", "M8[ns]", "M8[ns, US/Central]", "period[D]"] - ) - @pytest.mark.parametrize("box", [Series, Index, array]) - def test_view_between_datetimelike(self, first, second, box): - dti = date_range("2016-01-01", periods=3) - - orig = box(dti) - obj = orig.view(first) - assert obj.dtype == first - tm.assert_numpy_array_equal(np.asarray(obj.view("i8")), dti.asi8) - - res = obj.view(second) - assert res.dtype == second - tm.assert_numpy_array_equal(np.asarray(obj.view("i8")), dti.asi8) diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index 29d6e2036476e..4b369bb0bc869 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -107,7 +107,7 @@ def test_contains(self, datetime_series): def test_axis_alias(self): s = Series([1, 2, np.nan]) tm.assert_series_equal(s.dropna(axis="rows"), s.dropna(axis="index")) - assert s.dropna().sum("rows") == 3 + assert s.dropna().sum(axis="rows") == 3 assert s._get_axis_number("rows") == 0 assert s._get_axis_name("rows") == "index" @@ -138,13 +138,6 @@ def test_ndarray_compat_like_func(self): expected = Series(1, index=range(10), dtype="float64") tm.assert_series_equal(result, expected) - def test_ndarray_compat_ravel(self): - # ravel - s = Series(np.random.default_rng(2).standard_normal(10)) - with tm.assert_produces_warning(FutureWarning, match="ravel is deprecated"): - result = s.ravel(order="F") - tm.assert_almost_equal(result, s.values.ravel(order="F")) - def test_empty_method(self): s_empty = Series(dtype=object) assert s_empty.empty @@ -169,13 +162,8 @@ def test_attrs(self): def test_inspect_getmembers(self): # GH38782 - pytest.importorskip("jinja2") ser = Series(dtype=object) - msg = "Series._data is deprecated" - with tm.assert_produces_warning( - DeprecationWarning, match=msg, check_stacklevel=False - ): - inspect.getmembers(ser) + inspect.getmembers(ser) def test_unknown_attribute(self): # GH#9680 @@ -206,7 +194,6 @@ def 
test_series_datetimelike_attribute_access_invalid(self): with pytest.raises(AttributeError, match=msg): ser.weekday - @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") @pytest.mark.parametrize( "kernel, has_numeric_only", [ @@ -227,7 +214,6 @@ def test_series_datetimelike_attribute_access_invalid(self): ("count", False), ("median", True), ("std", True), - ("backfill", False), ("rank", True), ("pct_change", False), ("cummax", False), @@ -238,7 +224,6 @@ def test_series_datetimelike_attribute_access_invalid(self): ("cumprod", False), ("fillna", False), ("ffill", False), - ("pad", False), ("bfill", False), ("sample", False), ("tail", False), @@ -291,10 +276,3 @@ def test_numeric_only(self, kernel, has_numeric_only, dtype): else: # reducer assert result == expected - - -@pytest.mark.parametrize("converter", [int, float, complex]) -def test_float_int_deprecated(converter): - # GH 51101 - with tm.assert_produces_warning(FutureWarning): - assert converter(Series([1])) == converter(1) diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index b40e2e99dae2e..e7d284bd47e21 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -212,9 +212,9 @@ def test_series_integer_mod(self, index): s1 = Series(range(1, 10)) s2 = Series("foo", index=index) - msg = "not all arguments converted during string formatting|mod not" + msg = "not all arguments converted during string formatting|'mod' not supported" - with pytest.raises((TypeError, NotImplementedError), match=msg): + with pytest.raises(TypeError, match=msg): s2 % s1 def test_add_with_duplicate_index(self): @@ -241,7 +241,7 @@ def test_add_corner_cases(self, datetime_series): result = datetime_series + empty assert np.isnan(result).all() - result = empty + empty.copy() + result = empty + empty assert len(result) == 0 def test_add_float_plus_int(self, datetime_series): @@ -359,12 +359,13 @@ def test_add_list_to_masked_array_boolean(self, request): else None ) ser = Series([True, None, False], dtype="boolean") - with tm.assert_produces_warning(warning): + msg = "operator is not supported by numexpr for the bool dtype" + with tm.assert_produces_warning(warning, match=msg): result = ser + [True, None, True] expected = Series([True, None, True], dtype="boolean") tm.assert_series_equal(result, expected) - with tm.assert_produces_warning(warning): + with tm.assert_produces_warning(warning, match=msg): result = [True, None, True] + ser tm.assert_series_equal(result, expected) @@ -499,27 +500,14 @@ def test_ser_cmp_result_names(self, names, comparison_op): result = op(ser, cidx) assert result.name == names[2] - def test_comparisons(self, using_infer_string): + def test_comparisons(self): s = Series(["a", "b", "c"]) s2 = Series([False, True, False]) # it works! 
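The assertions that follow capture the simplified comparison semantics: with the future string dtype there is no longer a pyarrow "has no kernel" error path, and comparing string values against booleans is plain elementwise inequality. A minimal sketch, assuming a build where the `future.infer_string` behavior is the default:

    import pandas as pd

    s = pd.Series(["a", "b", "c"])
    s2 = pd.Series([False, True, False])
    # A str value never equals a bool, so both directions are all-False:
    print((s == s2).tolist())  # [False, False, False]
    print((s2 == s).tolist())  # [False, False, False]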
exp = Series([False, False, False]) - if using_infer_string: - import pyarrow as pa - - msg = "has no kernel" - # TODO(3.0) GH56008 - with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): - s == s2 - with tm.assert_produces_warning( - DeprecationWarning, match="comparison", check_stacklevel=False - ): - with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): - s2 == s - else: - tm.assert_series_equal(s == s2, exp) - tm.assert_series_equal(s2 == s, exp) + tm.assert_series_equal(s == s2, exp) + tm.assert_series_equal(s2 == s, exp) # ----------------------------------------------------------------- # Categorical Dtype Comparisons @@ -659,12 +647,10 @@ def test_comparison_operators_with_nas(self, comparison_op): result = comparison_op(ser, val) expected = comparison_op(ser.dropna(), val).reindex(ser.index) - msg = "Downcasting object dtype arrays" - with tm.assert_produces_warning(FutureWarning, match=msg): - if comparison_op is operator.ne: - expected = expected.fillna(True).astype(bool) - else: - expected = expected.fillna(False).astype(bool) + if comparison_op is operator.ne: + expected = expected.fillna(True).astype(bool) + else: + expected = expected.fillna(False).astype(bool) tm.assert_series_equal(result, expected) @@ -674,22 +660,12 @@ def test_ne(self): tm.assert_numpy_array_equal(ts.index != 5, expected) tm.assert_numpy_array_equal(~(ts.index == 5), expected) - @pytest.mark.parametrize( - "left, right", - [ - ( - Series([1, 2, 3], index=list("ABC"), name="x"), - Series([2, 2, 2], index=list("ABD"), name="x"), - ), - ( - Series([1, 2, 3], index=list("ABC"), name="x"), - Series([2, 2, 2, 2], index=list("ABCD"), name="x"), - ), - ], - ) - def test_comp_ops_df_compat(self, left, right, frame_or_series): + @pytest.mark.parametrize("right_data", [[2, 2, 2], [2, 2, 2, 2]]) + def test_comp_ops_df_compat(self, right_data, frame_or_series): # GH 1134 # GH 50083 to clarify that index and columns must be identically labeled + left = Series([1, 2, 3], index=list("ABC"), name="x") + right = Series(right_data, index=list("ABDC")[: len(right_data)], name="x") if frame_or_series is not Series: msg = ( rf"Can only compare identically-labeled \(both index and columns\) " @@ -819,9 +795,6 @@ def test_series_ops_name_retention(self, flex, box, names, all_binary_operators) r"Logical ops \(and, or, xor\) between Pandas objects and " "dtype-less sequences" ) - warn = None - if box in [list, tuple] and is_logical: - warn = FutureWarning right = box(right) if flex: @@ -830,9 +803,12 @@ def test_series_ops_name_retention(self, flex, box, names, all_binary_operators) return result = getattr(left, name)(right) else: - # GH#37374 logical ops behaving as set ops deprecated - with tm.assert_produces_warning(warn, match=msg): - result = op(left, right) + if is_logical and box in [list, tuple]: + with pytest.raises(TypeError, match=msg): + # GH#52264 logical ops with dtype-less sequences deprecated + op(left, right) + return + result = op(left, right) assert isinstance(result, Series) if box in [Index, Series]: diff --git a/pandas/tests/series/test_arrow_interface.py b/pandas/tests/series/test_arrow_interface.py new file mode 100644 index 0000000000000..e73cf9bee6aeb --- /dev/null +++ b/pandas/tests/series/test_arrow_interface.py @@ -0,0 +1,61 @@ +import ctypes + +import pytest + +import pandas as pd + +pa = pytest.importorskip("pyarrow", minversion="16.0") + + +def test_series_arrow_interface(): + s = pd.Series([1, 4, 2]) + + capsule = s.__arrow_c_stream__() + assert ( + 
ctypes.pythonapi.PyCapsule_IsValid( + ctypes.py_object(capsule), b"arrow_array_stream" + ) + == 1 + ) + + ca = pa.chunked_array(s) + expected = pa.chunked_array([[1, 4, 2]]) + assert ca.equals(expected) + ca = pa.chunked_array(s, type=pa.int32()) + expected = pa.chunked_array([[1, 4, 2]], type=pa.int32()) + assert ca.equals(expected) + + +def test_series_arrow_interface_arrow_dtypes(): + s = pd.Series([1, 4, 2], dtype="Int64[pyarrow]") + + capsule = s.__arrow_c_stream__() + assert ( + ctypes.pythonapi.PyCapsule_IsValid( + ctypes.py_object(capsule), b"arrow_array_stream" + ) + == 1 + ) + + ca = pa.chunked_array(s) + expected = pa.chunked_array([[1, 4, 2]]) + assert ca.equals(expected) + ca = pa.chunked_array(s, type=pa.int32()) + expected = pa.chunked_array([[1, 4, 2]], type=pa.int32()) + assert ca.equals(expected) + + +def test_series_arrow_interface_stringdtype(): + s = pd.Series(["foo", "bar"], dtype="string[pyarrow]") + + capsule = s.__arrow_c_stream__() + assert ( + ctypes.pythonapi.PyCapsule_IsValid( + ctypes.py_object(capsule), b"arrow_array_stream" + ) + == 1 + ) + + ca = pa.chunked_array(s) + expected = pa.chunked_array([["foo", "bar"]], type=pa.large_string()) + assert ca.equals(expected) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index da069afe5e709..f82451a2be84d 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -14,9 +14,9 @@ iNaT, lib, ) +from pandas.compat import HAS_PYARROW from pandas.compat.numpy import np_version_gt2 from pandas.errors import IntCastingNaNError -import pandas.util._test_decorators as td from pandas.core.dtypes.dtypes import CategoricalDtype @@ -83,13 +83,20 @@ def test_infer_with_date_and_datetime(self): def test_unparsable_strings_with_dt64_dtype(self): # pre-2.0 these would be silently ignored and come back with object dtype vals = ["aa"] - msg = "^Unknown datetime string format, unable to parse: aa, at position 0$" + msg = "^Unknown datetime string format, unable to parse: aa$" with pytest.raises(ValueError, match=msg): Series(vals, dtype="datetime64[ns]") with pytest.raises(ValueError, match=msg): Series(np.array(vals, dtype=object), dtype="datetime64[ns]") + def test_invalid_dtype_conversion_datetime_to_timedelta(self): + # GH#60728 + vals = Series([NaT, Timestamp(2025, 1, 1)], dtype="datetime64[ns]") + msg = r"^Cannot cast DatetimeArray to dtype timedelta64\[ns\]$" + with pytest.raises(TypeError, match=msg): + Series(vals, dtype="timedelta64[ns]") + @pytest.mark.parametrize( "constructor", [ @@ -166,7 +173,7 @@ def test_constructor(self, datetime_series, using_infer_string): # Mixed type Series mixed = Series(["hello", np.nan], index=[0, 1]) - assert mixed.dtype == np.object_ if not using_infer_string else "string" + assert mixed.dtype == np.object_ if not using_infer_string else "str" assert np.isnan(mixed[1]) assert not empty_series.index._is_all_dates @@ -229,7 +236,7 @@ def test_constructor_empty(self, input_class, using_infer_string): # GH 19853 : with empty string, index and dtype str empty = Series("", dtype=str, index=range(3)) if using_infer_string: - empty2 = Series("", index=range(3), dtype=object) + empty2 = Series("", index=range(3), dtype="str") else: empty2 = Series("", index=range(3)) tm.assert_series_equal(empty, empty2) @@ -336,11 +343,11 @@ def test_constructor_index_dtype(self, dtype): @pytest.mark.parametrize( "input_vals", [ - ([1, 2]), - (["1", "2"]), - (list(date_range("1/1/2011", periods=2, freq="h"))), - 
(list(date_range("1/1/2011", periods=2, freq="h", tz="US/Eastern"))), - ([Interval(left=0, right=5)]), + [1, 2], + ["1", "2"], + list(date_range("1/1/2011", periods=2, freq="h")), + list(date_range("1/1/2011", periods=2, freq="h", tz="US/Eastern")), + [Interval(left=0, right=5)], ], ) def test_constructor_list_str(self, input_vals, string_dtype): @@ -623,15 +630,12 @@ def test_constructor_maskedarray_hardened(self): expected = Series([np.nan, np.nan, np.nan]) tm.assert_series_equal(result, expected) - def test_series_ctor_plus_datetimeindex(self, using_copy_on_write): + def test_series_ctor_plus_datetimeindex(self): rng = date_range("20090415", "20090519", freq="B") - data = {k: 1 for k in rng} + data = dict.fromkeys(rng, 1) result = Series(data, index=rng) - if using_copy_on_write: - assert result.index.is_(rng) - else: - assert result.index is rng + assert result.index.is_(rng) def test_constructor_default_index(self): s = Series([0, 1, 2]) @@ -702,7 +706,6 @@ def test_constructor_copy(self): assert x[0] == 2.0 assert y[0] == 1.0 - @td.skip_array_manager_invalid_test # TODO(ArrayManager) rewrite test @pytest.mark.parametrize( "index", [ @@ -757,7 +760,7 @@ def test_constructor_pass_nan_nat(self): tm.assert_series_equal(Series(np.array([np.nan, np.nan])), exp) exp = Series([NaT, NaT]) - assert exp.dtype == "datetime64[ns]" + assert exp.dtype == "datetime64[s]" tm.assert_series_equal(Series([NaT, NaT]), exp) tm.assert_series_equal(Series(np.array([NaT, NaT])), exp) @@ -893,18 +896,13 @@ def test_constructor_invalid_coerce_ints_with_float_nan(self, any_int_numpy_dtyp with pytest.raises(IntCastingNaNError, match=msg): Series(np.array(vals), dtype=any_int_numpy_dtype) - def test_constructor_dtype_no_cast(self, using_copy_on_write, warn_copy_on_write): + def test_constructor_dtype_no_cast(self): # see gh-1572 s = Series([1, 2, 3]) s2 = Series(s, dtype=np.int64) - warn = FutureWarning if warn_copy_on_write else None - with tm.assert_produces_warning(warn): - s2[1] = 5 - if using_copy_on_write: - assert s[1] == 2 - else: - assert s[1] == 5 + s2[1] = 5 + assert s[1] == 2 def test_constructor_datelike_coercion(self): # GH 9477 @@ -944,7 +942,7 @@ def test_constructor_datetimes_with_nulls(self): np.array([None, None, datetime.now(), None]), ]: result = Series(arr) - assert result.dtype == "M8[ns]" + assert result.dtype == "M8[us]" def test_constructor_dtype_datetime64(self): s = Series(iNaT, dtype="M8[ns]", index=range(5)) @@ -972,15 +970,15 @@ def test_constructor_dtype_datetime64_10(self): dates = [np.datetime64(x) for x in pydates] ser = Series(dates) - assert ser.dtype == "M8[ns]" + assert ser.dtype == "M8[us]" ser.iloc[0] = np.nan - assert ser.dtype == "M8[ns]" + assert ser.dtype == "M8[us]" # GH3414 related expected = Series(pydates, dtype="datetime64[ms]") - result = Series(Series(dates).astype(np.int64) / 1000000, dtype="M8[ms]") + result = Series(Series(dates).astype(np.int64) / 1000, dtype="M8[ms]") tm.assert_series_equal(result, expected) result = Series(dates, dtype="datetime64[ms]") @@ -1094,16 +1092,16 @@ def test_constructor_dtype_datetime64_4(self): def test_constructor_dtype_datetime64_3(self): # if we passed a NaT it remains ser = Series([datetime(2010, 1, 1), datetime(2, 1, 1), NaT]) - assert ser.dtype == "object" + assert ser.dtype == "M8[us]" assert ser[2] is NaT assert "NaT" in str(ser) def test_constructor_dtype_datetime64_2(self): # if we passed a nan it remains ser = Series([datetime(2010, 1, 1), datetime(2, 1, 1), np.nan]) - assert ser.dtype == "object" - assert ser[2] is 
np.nan - assert "NaN" in str(ser) + assert ser.dtype == "M8[us]" + assert ser[2] is NaT + assert "NaT" in str(ser) def test_constructor_with_datetime_tz(self): # 8260 @@ -1165,7 +1163,7 @@ def test_constructor_with_datetime_tz4(self): Timestamp("2013-01-02 14:00:00-0800", tz="US/Pacific"), ] ) - assert ser.dtype == "datetime64[ns, US/Pacific]" + assert ser.dtype == "datetime64[s, US/Pacific]" assert lib.infer_dtype(ser, skipna=True) == "datetime64" def test_constructor_with_datetime_tz3(self): @@ -1225,7 +1223,7 @@ def test_construction_to_datetimelike_unit(self, arr_dtype, kind, unit): def test_constructor_with_naive_string_and_datetimetz_dtype(self, arg): # GH 17415: With naive string result = Series([arg], dtype="datetime64[ns, CET]") - expected = Series(Timestamp(arg)).dt.tz_localize("CET") + expected = Series([Timestamp(arg)], dtype="M8[ns]").dt.tz_localize("CET") tm.assert_series_equal(result, expected) def test_constructor_datetime64_bigendian(self): @@ -1328,9 +1326,8 @@ def test_constructor_periodindex(self): pi = period_range("20130101", periods=5, freq="D") s = Series(pi) assert s.dtype == "Period[D]" - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - expected = Series(pi.astype(object)) - tm.assert_series_equal(s, expected) + expected = Series(pi.astype(object)) + assert expected.dtype == object def test_constructor_dict(self): d = {"a": 0.0, "b": 1.0, "c": 2.0} @@ -1366,14 +1363,8 @@ def test_constructor_dict_order(self): expected = Series([1, 0, 2], index=list("bac")) tm.assert_series_equal(result, expected) - def test_constructor_dict_extension(self, ea_scalar_and_dtype, request): + def test_constructor_dict_extension(self, ea_scalar_and_dtype): ea_scalar, ea_dtype = ea_scalar_and_dtype - if isinstance(ea_scalar, Timestamp): - mark = pytest.mark.xfail( - reason="Construction from dict goes through " - "maybe_convert_objects which casts to nano" - ) - request.applymarker(mark) d = {"a": ea_scalar} result = Series(d, index=["a"]) expected = Series(ea_scalar, index=["a"], dtype=ea_dtype) @@ -1418,7 +1409,9 @@ def create_data(constructor): result_Timestamp = Series(data_Timestamp) tm.assert_series_equal(result_datetime64, expected) - tm.assert_series_equal(result_datetime, expected) + tm.assert_series_equal( + result_datetime, expected.set_axis(expected.index.as_unit("us")) + ) tm.assert_series_equal(result_Timestamp, expected) def test_constructor_dict_tuple_indexer(self): @@ -1448,10 +1441,17 @@ def test_constructor_tuple_of_tuples(self): s = Series(data) assert tuple(s) == data - def test_constructor_dict_of_tuples(self): - data = {(1, 2): 3, (None, 5): 6} + @pytest.mark.parametrize( + "data, expected_values, expected_index", + [ + ({(1, 2): 3, (None, 5): 6}, [3, 6], [(1, 2), (None, 5)]), + ({(1,): 3, (4, 5): 6}, [3, 6], [(1, None), (4, 5)]), + ], + ) + def test_constructor_dict_of_tuples(self, data, expected_values, expected_index): + # GH 60695 result = Series(data).sort_values() - expected = Series([3, 6], index=MultiIndex.from_tuples([(1, 2), (None, 5)])) + expected = Series(expected_values, index=MultiIndex.from_tuples(expected_index)) tm.assert_series_equal(result, expected) # https://github.com/pandas-dev/pandas/issues/22698 @@ -1468,7 +1468,7 @@ def test_fromDict(self, using_infer_string): data = {"a": 0, "b": "1", "c": "2", "d": "3"} series = Series(data) - assert series.dtype == np.object_ if not using_infer_string else "string" + assert series.dtype == np.object_ if not using_infer_string else "str" data = {"a": "0", "b": "1"} series = 
Series(data, dtype=float) @@ -1480,7 +1480,7 @@ def test_fromValue(self, datetime_series, using_infer_string): assert len(nans) == len(datetime_series) strings = Series("foo", index=datetime_series.index) - assert strings.dtype == np.object_ if not using_infer_string else "string" + assert strings.dtype == np.object_ if not using_infer_string else "str" assert len(strings) == len(datetime_series) d = datetime.now() @@ -1597,7 +1597,7 @@ def test_NaT_cast(self): tm.assert_series_equal(result, expected) def test_constructor_name_hashable(self): - for n in [777, 777.0, "name", datetime(2001, 11, 11), (1,), "\u05D0"]: + for n in [777, 777.0, "name", datetime(2001, 11, 11), (1,), "\u05d0"]: for data in [[1, 2, 3], np.ones(3), {"a": 0, "b": 1}]: s = Series(data, name=n) assert s.name == n @@ -1808,15 +1808,10 @@ def test_constructor_datetimelike_scalar_to_string_dtype( expected = Series(["M", "M", "M"], index=[1, 2, 3], dtype=nullable_string_dtype) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize( - "values", - [ - [np.datetime64("2012-01-01"), np.datetime64("2013-01-01")], - ["2012-01-01", "2013-01-01"], - ], - ) - def test_constructor_sparse_datetime64(self, values): + @pytest.mark.parametrize("box", [lambda x: x, np.datetime64]) + def test_constructor_sparse_datetime64(self, box): # https://github.com/pandas-dev/pandas/issues/35762 + values = [box("2012-01-01"), box("2013-01-01")] dtype = pd.SparseDtype("datetime64[ns]") result = Series(values, dtype=dtype) arr = pd.arrays.SparseArray(values, dtype=dtype) @@ -1872,23 +1867,30 @@ class A(OrderedDict): series = Series(A(data)) tm.assert_series_equal(series, expected) - def test_constructor_dict_multiindex(self): - d = {("a", "a"): 0.0, ("b", "a"): 1.0, ("b", "c"): 2.0} - _d = sorted(d.items()) - result = Series(d) - expected = Series( - [x[1] for x in _d], index=MultiIndex.from_tuples([x[0] for x in _d]) - ) - tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "data, expected_index_multi", + [ + ({("a", "a"): 0.0, ("b", "a"): 1.0, ("b", "c"): 2.0}, True), + ({("a",): 0.0, ("a", "b"): 1.0}, True), + ({"z": 111.0, ("a", "a"): 0.0, ("b", "a"): 1.0, ("b", "c"): 2.0}, False), + ], + ) + def test_constructor_dict_multiindex(self, data, expected_index_multi): + # GH#60695 + result = Series(data) - d["z"] = 111.0 - _d.insert(0, ("z", d["z"])) - result = Series(d) - expected = Series( - [x[1] for x in _d], index=Index([x[0] for x in _d], tupleize_cols=False) - ) - result = result.reindex(index=expected.index) - tm.assert_series_equal(result, expected) + if expected_index_multi: + expected = Series( + list(data.values()), + index=MultiIndex.from_tuples(list(data.keys())), + ) + tm.assert_series_equal(result, expected) + else: + expected = Series( + list(data.values()), + index=Index(list(data.keys())), + ) + tm.assert_series_equal(result, expected) def test_constructor_dict_multiindex_reindex_flat(self): # construction involves reindexing with a MultiIndex corner case @@ -1958,9 +1960,15 @@ def test_constructor_int64_dtype(self, any_int_dtype): def test_constructor_raise_on_lossy_conversion_of_strings(self): # GH#44923 - with pytest.raises( - ValueError, match="string values cannot be losslessly cast to int8" - ): + if not np_version_gt2: + raises = pytest.raises( + ValueError, match="string values cannot be losslessly cast to int8" + ) + else: + raises = pytest.raises( + OverflowError, match="The elements provided in the data" + ) + with raises: Series(["128"], dtype="int8") def 
test_constructor_dtype_timedelta_alternative_construct(self): @@ -2088,11 +2096,10 @@ def test_series_from_index_dtype_equal_does_not_copy(self): def test_series_string_inference(self): # GH#54430 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" - expected = Series(["a", "b"], dtype=dtype) with pd.option_context("future.infer_string", True): ser = Series(["a", "b"]) + dtype = pd.StringDtype("pyarrow" if HAS_PYARROW else "python", na_value=np.nan) + expected = Series(["a", "b"], dtype=dtype) tm.assert_series_equal(ser, expected) expected = Series(["a", 1], dtype="object") @@ -2103,37 +2110,43 @@ def test_series_string_inference(self): @pytest.mark.parametrize("na_value", [None, np.nan, pd.NA]) def test_series_string_with_na_inference(self, na_value): # GH#54430 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" - expected = Series(["a", na_value], dtype=dtype) with pd.option_context("future.infer_string", True): ser = Series(["a", na_value]) + dtype = pd.StringDtype("pyarrow" if HAS_PYARROW else "python", na_value=np.nan) + expected = Series(["a", None], dtype=dtype) tm.assert_series_equal(ser, expected) def test_series_string_inference_scalar(self): # GH#54430 - pytest.importorskip("pyarrow") - expected = Series("a", index=[1], dtype="string[pyarrow_numpy]") with pd.option_context("future.infer_string", True): ser = Series("a", index=[1]) + dtype = pd.StringDtype("pyarrow" if HAS_PYARROW else "python", na_value=np.nan) + expected = Series("a", index=[1], dtype=dtype) tm.assert_series_equal(ser, expected) def test_series_string_inference_array_string_dtype(self): # GH#54496 - pytest.importorskip("pyarrow") - expected = Series(["a", "b"], dtype="string[pyarrow_numpy]") with pd.option_context("future.infer_string", True): ser = Series(np.array(["a", "b"])) + dtype = pd.StringDtype("pyarrow" if HAS_PYARROW else "python", na_value=np.nan) + expected = Series(["a", "b"], dtype=dtype) tm.assert_series_equal(ser, expected) def test_series_string_inference_storage_definition(self): - # GH#54793 - pytest.importorskip("pyarrow") - expected = Series(["a", "b"], dtype="string[pyarrow_numpy]") + # https://github.com/pandas-dev/pandas/issues/54793 + # but after PDEP-14 (string dtype), it was decided to keep dtype="string" + # returning the NA string dtype, so expected is changed from + # "string[pyarrow_numpy]" to "string[python]" + expected = Series(["a", "b"], dtype="string[python]") with pd.option_context("future.infer_string", True): result = Series(["a", "b"], dtype="string") tm.assert_series_equal(result, expected) + expected = Series(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)) + with pd.option_context("future.infer_string", True): + result = Series(["a", "b"], dtype="str") + tm.assert_series_equal(result, expected) + def test_series_constructor_infer_string_scalar(self): # GH#55537 with pd.option_context("future.infer_string", True): @@ -2144,26 +2157,20 @@ def test_series_constructor_infer_string_scalar(self): def test_series_string_inference_na_first(self): # GH#55655 - pytest.importorskip("pyarrow") - expected = Series([pd.NA, "b"], dtype="string[pyarrow_numpy]") with pd.option_context("future.infer_string", True): result = Series([pd.NA, "b"]) + dtype = pd.StringDtype("pyarrow" if HAS_PYARROW else "python", na_value=np.nan) + expected = Series([None, "b"], dtype=dtype) tm.assert_series_equal(result, expected) - def test_inference_on_pandas_objects(self): + @pytest.mark.parametrize("klass", [Series, Index]) + def test_inference_on_pandas_objects(self, 
klass): # GH#56012 - ser = Series([Timestamp("2019-12-31")], dtype=object) - with tm.assert_produces_warning(None): - # This doesn't do inference - result = Series(ser) + obj = klass([Timestamp("2019-12-31")], dtype=object) + # This doesn't do inference + result = Series(obj) assert result.dtype == np.object_ - idx = Index([Timestamp("2019-12-31")], dtype=object) - - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - result = Series(idx) - assert result.dtype != np.object_ - class TestSeriesConstructorIndexCoercion: def test_series_constructor_datetimelike_index_coercion(self): @@ -2185,17 +2192,34 @@ def test_series_constructor_infer_multiindex(self, container, data): multi = Series(data, index=indexes) assert isinstance(multi.index, MultiIndex) + # TODO: make this not cast to object in pandas 3.0 + @pytest.mark.skipif( + not np_version_gt2, reason="StringDType only available in numpy 2 and above" + ) + @pytest.mark.parametrize( + "data", + [ + ["a", "b", "c"], + ["a", "b", np.nan], + ], + ) + def test_np_string_array_object_cast(self, data): + from numpy.dtypes import StringDType + + arr = np.array(data, dtype=StringDType()) + res = Series(arr) + assert res.dtype == np.object_ + assert (res == data).all() + class TestSeriesConstructorInternals: - def test_constructor_no_pandas_array(self, using_array_manager): + def test_constructor_no_pandas_array(self): ser = Series([1, 2, 3]) result = Series(ser.array) tm.assert_series_equal(ser, result) - if not using_array_manager: - assert isinstance(result._mgr.blocks[0], NumpyBlock) - assert result._mgr.blocks[0].is_numeric + assert isinstance(result._mgr.blocks[0], NumpyBlock) + assert result._mgr.blocks[0].is_numeric - @td.skip_array_manager_invalid_test def test_from_array(self): result = Series(pd.array(["1h", "2h"], dtype="timedelta64[ns]")) assert result._mgr.blocks[0].is_extension is False @@ -2203,7 +2227,6 @@ def test_from_array(self): result = Series(pd.array(["2015"], dtype="datetime64[ns]")) assert result._mgr.blocks[0].is_extension is False - @td.skip_array_manager_invalid_test def test_from_list_dtype(self): result = Series(["1h", "2h"], dtype="timedelta64[ns]") assert result._mgr.blocks[0].is_extension is False @@ -2263,3 +2286,9 @@ def test_series_with_complex_nan(input_list): result = Series(ser.array) assert ser.dtype == "complex128" tm.assert_series_equal(ser, result) + + +def test_dict_keys_rangeindex(): + result = Series({0: 1, 1: 2}) + expected = Series([1, 2], index=RangeIndex(2)) + tm.assert_series_equal(result, expected, check_index_type=True) diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py index e6f7b2a5e69e0..db83cf1112e74 100644 --- a/pandas/tests/series/test_cumulative.py +++ b/pandas/tests/series/test_cumulative.py @@ -6,6 +6,8 @@ tests.frame.test_cumulative """ +import re + import numpy as np import pytest @@ -91,11 +93,30 @@ def test_cummin_cummax_datetimelike(self, ts, method, skipna, exp_tdi): result = getattr(ser, method)(skipna=skipna) tm.assert_series_equal(expected, result) + def test_cumsum_datetimelike(self): + # GH#57956 + df = pd.DataFrame( + [ + [pd.Timedelta(0), pd.Timedelta(days=1)], + [pd.Timedelta(days=2), pd.NaT], + [pd.Timedelta(hours=-6), pd.Timedelta(hours=12)], + ] + ) + result = df.cumsum() + expected = pd.DataFrame( + [ + [pd.Timedelta(0), pd.Timedelta(days=1)], + [pd.Timedelta(days=2), pd.NaT], + [pd.Timedelta(days=1, hours=18), pd.Timedelta(days=1, hours=12)], + ] + ) + tm.assert_frame_equal(result, expected) + 
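[Editor's note, not part of the patch] The test_cumsum_datetimelike hunk above pins down GH#57956: DataFrame.cumsum works column-wise on timedelta data, and with the default skipna=True a NaT slot stays NaT while the running total resumes afterwards. A minimal standalone sketch of the asserted semantics, assuming a pandas build that contains this fix:

import pandas as pd

ser = pd.Series([pd.Timedelta(days=1), pd.NaT, pd.Timedelta(hours=12)])
result = ser.cumsum()  # skipna=True by default
# NaT is skipped rather than contagious: [1 days, NaT, 1 days 12:00:00]
assert result.iloc[2] == pd.Timedelta(days=1, hours=12)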
@pytest.mark.parametrize( "func, exp", [ - ("cummin", pd.Period("2012-1-1", freq="D")), - ("cummax", pd.Period("2012-1-2", freq="D")), + ("cummin", "2012-1-1"), + ("cummax", "2012-1-2"), ], ) def test_cummin_cummax_period(self, func, exp): @@ -108,6 +129,7 @@ def test_cummin_cummax_period(self, func, exp): tm.assert_series_equal(result, expected) result = getattr(ser, func)(skipna=True) + exp = pd.Period(exp, freq="D") expected = pd.Series([pd.Period("2012-1-1", freq="D"), pd.NaT, exp]) tm.assert_series_equal(result, expected) @@ -150,8 +172,113 @@ def test_cummethods_bool_in_object_dtype(self, method, expected): result = getattr(ser, method)() tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "method, order", + [ + ["cummax", "abc"], + ["cummin", "cba"], + ], + ) + def test_cummax_cummin_on_ordered_categorical(self, method, order): + # GH#52335 + cat = pd.CategoricalDtype(list(order), ordered=True) + ser = pd.Series( + list("ababcab"), + dtype=cat, + ) + result = getattr(ser, method)() + expected = pd.Series( + list("abbbccc"), + dtype=cat, + ) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "skip, exp", + [ + [True, ["a", np.nan, "b", "b", "c"]], + [False, ["a", np.nan, np.nan, np.nan, np.nan]], + ], + ) + @pytest.mark.parametrize( + "method, order", + [ + ["cummax", "abc"], + ["cummin", "cba"], + ], + ) + def test_cummax_cummin_ordered_categorical_nan(self, skip, exp, method, order): + # GH#52335 + cat = pd.CategoricalDtype(list(order), ordered=True) + ser = pd.Series( + ["a", np.nan, "b", "a", "c"], + dtype=cat, + ) + result = getattr(ser, method)(skipna=skip) + expected = pd.Series( + exp, + dtype=cat, + ) + tm.assert_series_equal( + result, + expected, + ) + def test_cumprod_timedelta(self): # GH#48111 ser = pd.Series([pd.Timedelta(days=1), pd.Timedelta(days=3)]) with pytest.raises(TypeError, match="cumprod not supported for Timedelta"): ser.cumprod() + + @pytest.mark.parametrize( + "data, op, skipna, expected_data", + [ + ([], "cumsum", True, []), + ([], "cumsum", False, []), + (["x", "z", "y"], "cumsum", True, ["x", "xz", "xzy"]), + (["x", "z", "y"], "cumsum", False, ["x", "xz", "xzy"]), + (["x", pd.NA, "y"], "cumsum", True, ["x", pd.NA, "xy"]), + (["x", pd.NA, "y"], "cumsum", False, ["x", pd.NA, pd.NA]), + ([pd.NA, "x", "y"], "cumsum", True, [pd.NA, "x", "xy"]), + ([pd.NA, "x", "y"], "cumsum", False, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cumsum", True, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cumsum", False, [pd.NA, pd.NA, pd.NA]), + ([], "cummin", True, []), + ([], "cummin", False, []), + (["y", "z", "x"], "cummin", True, ["y", "y", "x"]), + (["y", "z", "x"], "cummin", False, ["y", "y", "x"]), + (["y", pd.NA, "x"], "cummin", True, ["y", pd.NA, "x"]), + (["y", pd.NA, "x"], "cummin", False, ["y", pd.NA, pd.NA]), + ([pd.NA, "y", "x"], "cummin", True, [pd.NA, "y", "x"]), + ([pd.NA, "y", "x"], "cummin", False, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cummin", True, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cummin", False, [pd.NA, pd.NA, pd.NA]), + ([], "cummax", True, []), + ([], "cummax", False, []), + (["x", "z", "y"], "cummax", True, ["x", "z", "z"]), + (["x", "z", "y"], "cummax", False, ["x", "z", "z"]), + (["x", pd.NA, "y"], "cummax", True, ["x", pd.NA, "y"]), + (["x", pd.NA, "y"], "cummax", False, ["x", pd.NA, pd.NA]), + ([pd.NA, "x", "y"], "cummax", True, [pd.NA, "x", "y"]), + ([pd.NA, "x", "y"], "cummax", False, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cummax", True, [pd.NA, 
pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cummax", False, [pd.NA, pd.NA, pd.NA]), + ], + ) + def test_cum_methods_ea_strings( + self, string_dtype_no_object, data, op, skipna, expected_data + ): + # https://github.com/pandas-dev/pandas/pull/60633 - pyarrow + # https://github.com/pandas-dev/pandas/pull/60938 - Python + ser = pd.Series(data, dtype=string_dtype_no_object) + method = getattr(ser, op) + expected = pd.Series(expected_data, dtype=string_dtype_no_object) + result = method(skipna=skipna) + tm.assert_series_equal(result, expected) + + def test_cumprod_pyarrow_strings(self, pyarrow_string_dtype, skipna): + # https://github.com/pandas-dev/pandas/pull/60633 + ser = pd.Series(list("xyz"), dtype=pyarrow_string_dtype) + msg = re.escape(f"operation 'cumprod' not supported for dtype '{ser.dtype}'") + with pytest.raises(TypeError, match=msg): + ser.cumprod(skipna=skipna) diff --git a/pandas/tests/series/test_formats.py b/pandas/tests/series/test_formats.py index a1c5018ea7961..eb81840f6f8f9 100644 --- a/pandas/tests/series/test_formats.py +++ b/pandas/tests/series/test_formats.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype - import pandas as pd from pandas import ( Categorical, @@ -19,7 +17,6 @@ period_range, timedelta_range, ) -import pandas._testing as tm class TestSeriesRepr: @@ -115,13 +112,13 @@ def test_datetime(self, datetime_series): 1, 1.2, "foo", - "\u03B1\u03B2\u03B3", + "\u03b1\u03b2\u03b3", "loooooooooooooooooooooooooooooooooooooooooooooooooooong", ("foo", "bar", "baz"), (1, 2), ("foo", 1, 2.3), - ("\u03B1", "\u03B2", "\u03B3"), - ("\u03B1", "bar"), + ("\u03b1", "\u03b2", "\u03b3"), + ("\u03b1", "bar"), ], ) def test_various_names(self, name, string_series): @@ -144,11 +141,13 @@ def test_tidy_repr_name_0(self, arg): rep_str = repr(ser) assert "Name: 0" in rep_str - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="TODO: investigate why this is failing" - ) - def test_newline(self): - ser = Series(["a\n\r\tb"], name="a\n\r\td", index=["a\n\r\tf"]) + def test_newline(self, any_string_dtype): + ser = Series( + ["a\n\r\tb"], + name="a\n\r\td", + index=Index(["a\n\r\tf"], dtype=any_string_dtype), + dtype=any_string_dtype, + ) assert "\t" not in repr(ser) assert "\r" not in repr(ser) assert "a\n" not in repr(ser) @@ -254,14 +253,6 @@ def test_index_repr_in_frame_with_nan(self): assert repr(s) == exp - def test_format_pre_1900_dates(self): - rng = date_range("1/1/1850", "1/1/1950", freq="YE-DEC") - msg = "DatetimeIndex.format is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - rng.format() - ts = Series(1, index=rng) - repr(ts) - def test_series_repr_nat(self): series = Series([0, 1000, 2000, pd.NaT._value], dtype="M8[ns]") @@ -323,7 +314,7 @@ def test_categorical_repr(self, using_infer_string): "0 a\n1 b\n" " ..\n" "48 a\n49 b\n" - "Length: 50, dtype: category\nCategories (2, string): [a, b]" + "Length: 50, dtype: category\nCategories (2, str): [a, b]" ) else: exp = ( @@ -341,7 +332,7 @@ def test_categorical_repr(self, using_infer_string): exp = ( "0 a\n1 b\n" "dtype: category\n" - "Categories (26, string): [a < b < c < d ... w < x < y < z]" + "Categories (26, str): [a < b < c < d ... 
w < x < y < z]" ) else: exp = ( diff --git a/pandas/tests/series/test_iteration.py b/pandas/tests/series/test_iteration.py index edc82455234bb..1e0fa7fae107e 100644 --- a/pandas/tests/series/test_iteration.py +++ b/pandas/tests/series/test_iteration.py @@ -4,12 +4,10 @@ def test_keys(self, datetime_series): def test_iter_datetimes(self, datetime_series): for i, val in enumerate(datetime_series): - # pylint: disable-next=unnecessary-list-index-lookup assert val == datetime_series.iloc[i] def test_iter_strings(self, string_series): for i, val in enumerate(string_series): - # pylint: disable-next=unnecessary-list-index-lookup assert val == string_series.iloc[i] def test_iteritems_datetimes(self, datetime_series): diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py index d9c94e871bd4b..8f63819b09238 100644 --- a/pandas/tests/series/test_logical_ops.py +++ b/pandas/tests/series/test_logical_ops.py @@ -5,9 +5,11 @@ import pytest from pandas import ( + ArrowDtype, DataFrame, Index, Series, + StringDtype, bdate_range, ) import pandas._testing as tm @@ -15,7 +17,6 @@ class TestSeriesLogicalOps: - @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") @pytest.mark.parametrize("bool_op", [operator.and_, operator.or_, operator.xor]) def test_bool_operators_with_nas(self, bool_op): # boolean &, |, ^ should work with object arrays and propagate NAs @@ -87,7 +88,7 @@ def test_logical_operators_int_dtype_with_float(self): # GH#9016: support bitwise op for integer types s_0123 = Series(range(4), dtype="int64") - warn_msg = ( + err_msg = ( r"Logical ops \(and, or, xor\) between Pandas objects and " "dtype-less sequences" ) @@ -98,9 +99,8 @@ def test_logical_operators_int_dtype_with_float(self): with pytest.raises(TypeError, match=msg): s_0123 & 3.14 msg = "unsupported operand type.+for &:" - with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - s_0123 & [0.1, 4, 3.14, 2] + with pytest.raises(TypeError, match=err_msg): + s_0123 & [0.1, 4, 3.14, 2] with pytest.raises(TypeError, match=msg): s_0123 & np.array([0.1, 4, 3.14, 2]) with pytest.raises(TypeError, match=msg): @@ -109,7 +109,7 @@ def test_logical_operators_int_dtype_with_float(self): def test_logical_operators_int_dtype_with_str(self): s_1111 = Series([1] * 4, dtype="int8") - warn_msg = ( + err_msg = ( r"Logical ops \(and, or, xor\) between Pandas objects and " "dtype-less sequences" ) @@ -117,9 +117,8 @@ def test_logical_operators_int_dtype_with_str(self): msg = "Cannot perform 'and_' with a dtyped.+array and scalar of type" with pytest.raises(TypeError, match=msg): s_1111 & "a" - with pytest.raises(TypeError, match="unsupported operand.+for &"): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - s_1111 & ["a", "b", "c", "d"] + with pytest.raises(TypeError, match=err_msg): + s_1111 & ["a", "b", "c", "d"] def test_logical_operators_int_dtype_with_bool(self): # GH#9016: support bitwise op for integer types @@ -130,23 +129,21 @@ def test_logical_operators_int_dtype_with_bool(self): result = s_0123 & False tm.assert_series_equal(result, expected) - warn_msg = ( + msg = ( r"Logical ops \(and, or, xor\) between Pandas objects and " "dtype-less sequences" ) - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - result = s_0123 & [False] - tm.assert_series_equal(result, expected) + with pytest.raises(TypeError, match=msg): + s_0123 & [False] - with tm.assert_produces_warning(FutureWarning, 
match=warn_msg): - result = s_0123 & (False,) - tm.assert_series_equal(result, expected) + with pytest.raises(TypeError, match=msg): + s_0123 & (False,) result = s_0123 ^ False expected = Series([False, True, True, True]) tm.assert_series_equal(result, expected) - def test_logical_operators_int_dtype_with_object(self, using_infer_string): + def test_logical_operators_int_dtype_with_object(self): # GH#9016: support bitwise op for integer types s_0123 = Series(range(4), dtype="int64") @@ -155,14 +152,10 @@ def test_logical_operators_int_dtype_with_object(self, using_infer_string): tm.assert_series_equal(result, expected) s_abNd = Series(["a", "b", np.nan, "d"]) - if using_infer_string: - import pyarrow as pa - - with pytest.raises(pa.lib.ArrowNotImplementedError, match="has no kernel"): - s_0123 & s_abNd - else: - with pytest.raises(TypeError, match="unsupported.* 'int' and 'str'"): - s_0123 & s_abNd + with pytest.raises( + TypeError, match="unsupported.* 'int' and 'str'|'rand_' not supported" + ): + s_0123 & s_abNd def test_logical_operators_bool_dtype_with_int(self): index = list("bca") @@ -189,9 +182,8 @@ def test_logical_ops_bool_dtype_with_ndarray(self): ) expected = Series([True, False, False, False, False]) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = left & right - tm.assert_series_equal(result, expected) + with pytest.raises(TypeError, match=msg): + left & right result = left & np.array(right) tm.assert_series_equal(result, expected) result = left & Index(right) @@ -200,9 +192,8 @@ def test_logical_ops_bool_dtype_with_ndarray(self): tm.assert_series_equal(result, expected) expected = Series([True, True, True, True, True]) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = left | right - tm.assert_series_equal(result, expected) + with pytest.raises(TypeError, match=msg): + left | right result = left | np.array(right) tm.assert_series_equal(result, expected) result = left | Index(right) @@ -211,9 +202,8 @@ def test_logical_ops_bool_dtype_with_ndarray(self): tm.assert_series_equal(result, expected) expected = Series([False, True, True, True, True]) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = left ^ right - tm.assert_series_equal(result, expected) + with pytest.raises(TypeError, match=msg): + left ^ right result = left ^ np.array(right) tm.assert_series_equal(result, expected) result = left ^ Index(right) @@ -234,26 +224,22 @@ def test_logical_operators_int_dtype_with_bool_dtype_and_reindex(self): # s_0123 will be all false now because of reindexing like s_tft expected = Series([False] * 7, index=[0, 1, 2, 3, "a", "b", "c"]) - with tm.assert_produces_warning(FutureWarning): - result = s_tft & s_0123 + result = s_tft & s_0123 tm.assert_series_equal(result, expected) - # GH 52538: Deprecate casting to object type when reindex is needed; + # GH#52538: no longer cast to object type when reindex is needed; # matches DataFrame behavior - expected = Series([False] * 7, index=[0, 1, 2, 3, "a", "b", "c"]) - with tm.assert_produces_warning(FutureWarning): - result = s_0123 & s_tft - tm.assert_series_equal(result, expected) + msg = r"unsupported operand type\(s\) for &: 'float' and 'bool'" + with pytest.raises(TypeError, match=msg): + s_0123 & s_tft s_a0b1c0 = Series([1], list("b")) - with tm.assert_produces_warning(FutureWarning): - res = s_tft & s_a0b1c0 + res = s_tft & s_a0b1c0 expected = s_tff.reindex(list("abc")) tm.assert_series_equal(res, expected) - with tm.assert_produces_warning(FutureWarning): - res = s_tft | 
s_a0b1c0 + res = s_tft | s_a0b1c0 expected = s_tft.reindex(list("abc")) tm.assert_series_equal(res, expected) @@ -274,9 +260,8 @@ def test_scalar_na_logical_ops_corners(self): r"Logical ops \(and, or, xor\) between Pandas objects and " "dtype-less sequences" ) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = s & list(s) - tm.assert_series_equal(result, expected) + with pytest.raises(TypeError, match=msg): + s & list(s) def test_scalar_na_logical_ops_corners_aligns(self): s = Series([2, 3, 4, 5, 6, 7, 8, 9, datetime(2005, 1, 1)]) @@ -345,9 +330,9 @@ def test_reversed_logical_op_with_index_returns_series(self, op): @pytest.mark.parametrize( "op, expected", [ - (ops.rand_, Series([False, False])), - (ops.ror_, Series([True, True])), - (ops.rxor, Series([True, True])), + (ops.rand_, [False, False]), + (ops.ror_, [True, True]), + (ops.rxor, [True, True]), ], ) def test_reverse_ops_with_index(self, op, expected): @@ -358,6 +343,7 @@ def test_reverse_ops_with_index(self, op, expected): idx = Index([False, True]) result = op(ser, idx) + expected = Series(expected) tm.assert_series_equal(result, expected) def test_logical_ops_label_based(self, using_infer_string): @@ -396,47 +382,42 @@ def test_logical_ops_label_based(self, using_infer_string): # vs empty empty = Series([], dtype=object) - result = a & empty.copy() + result = a & empty expected = Series([False, False, False], list("abc")) tm.assert_series_equal(result, expected) - result = a | empty.copy() + result = a | empty expected = Series([True, True, False], list("abc")) tm.assert_series_equal(result, expected) # vs non-matching - with tm.assert_produces_warning(FutureWarning): - result = a & Series([1], ["z"]) + result = a & Series([1], ["z"]) expected = Series([False, False, False, False], list("abcz")) tm.assert_series_equal(result, expected) - with tm.assert_produces_warning(FutureWarning): - result = a | Series([1], ["z"]) + result = a | Series([1], ["z"]) expected = Series([True, True, False, False], list("abcz")) tm.assert_series_equal(result, expected) # identity # we would like s[s|e] == s to hold for any e, whether empty or not - with tm.assert_produces_warning(FutureWarning): - for e in [ - empty.copy(), - Series([1], ["z"]), - Series(np.nan, b.index), - Series(np.nan, a.index), - ]: - result = a[a | e] - tm.assert_series_equal(result, a[a]) + for e in [ + empty.copy(), + Series([1], ["z"]), + Series(np.nan, b.index), + Series(np.nan, a.index), + ]: + result = a[a | e] + tm.assert_series_equal(result, a[a]) for e in [Series(["z"])]: - warn = FutureWarning if using_infer_string else None if using_infer_string: - import pyarrow as pa - - with tm.assert_produces_warning(warn, match="Operation between non"): - with pytest.raises( - pa.lib.ArrowNotImplementedError, match="has no kernel" - ): - result = a[a | e] + # TODO(infer_string) should this behave differently? 
+ # -> https://github.com/pandas-dev/pandas/issues/60234 + with pytest.raises( + TypeError, match="not supported for dtype|unsupported operand type" + ): + result = a[a | e] else: result = a[a | e] tm.assert_series_equal(result, a[a]) @@ -519,7 +500,6 @@ def test_logical_ops_df_compat(self): tm.assert_frame_equal(s3.to_frame() | s4.to_frame(), exp_or1.to_frame()) tm.assert_frame_equal(s4.to_frame() | s3.to_frame(), exp_or.to_frame()) - @pytest.mark.xfail(reason="Will pass once #52839 deprecation is enforced") def test_int_dtype_different_index_not_bool(self): # GH 52500 ser1 = Series([1, 2, 3], index=[10, 11, 23], name="a") @@ -531,18 +511,38 @@ def test_int_dtype_different_index_not_bool(self): result = ser1 ^ ser2 tm.assert_series_equal(result, expected) + # TODO: this belongs in comparison tests def test_pyarrow_numpy_string_invalid(self): # GH#56008 - pytest.importorskip("pyarrow") + pa = pytest.importorskip("pyarrow") ser = Series([False, True]) - ser2 = Series(["a", "b"], dtype="string[pyarrow_numpy]") + ser2 = Series(["a", "b"], dtype=StringDtype(na_value=np.nan)) result = ser == ser2 - expected = Series(False, index=ser.index) - tm.assert_series_equal(result, expected) + expected_eq = Series(False, index=ser.index) + tm.assert_series_equal(result, expected_eq) result = ser != ser2 - expected = Series(True, index=ser.index) - tm.assert_series_equal(result, expected) + expected_ne = Series(True, index=ser.index) + tm.assert_series_equal(result, expected_ne) with pytest.raises(TypeError, match="Invalid comparison"): ser > ser2 + + # GH#59505 + ser3 = ser2.astype("string[pyarrow]") + result3_eq = ser3 == ser + tm.assert_series_equal(result3_eq, expected_eq.astype("bool[pyarrow]")) + result3_ne = ser3 != ser + tm.assert_series_equal(result3_ne, expected_ne.astype("bool[pyarrow]")) + + with pytest.raises(TypeError, match="Invalid comparison"): + ser > ser3 + + ser4 = ser2.astype(ArrowDtype(pa.string())) + result4_eq = ser4 == ser + tm.assert_series_equal(result4_eq, expected_eq.astype("bool[pyarrow]")) + result4_ne = ser4 != ser + tm.assert_series_equal(result4_ne, expected_ne.astype("bool[pyarrow]")) + + with pytest.raises(TypeError, match="Invalid comparison"): + ser > ser4 diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index cafc69c4d0f20..1c88329a83b0e 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -25,18 +25,6 @@ def test_categorical_nan_handling(self): s.values.codes, np.array([0, 1, -1, 0], dtype=np.int8) ) - def test_isna_for_inf(self): - s = Series(["a", np.inf, np.nan, pd.NA, 1.0]) - msg = "use_inf_as_na option is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - with pd.option_context("mode.use_inf_as_na", True): - r = s.isna() - dr = s.dropna() - e = Series([False, True, True, True, False]) - de = Series(["a", 1.0], index=[0, 4]) - tm.assert_series_equal(r, e) - tm.assert_series_equal(dr, de) - def test_timedelta64_nan(self): td = Series([timedelta(days=i) for i in range(10)]) @@ -49,13 +37,8 @@ def test_timedelta64_nan(self): assert not isna(td1[0]) # GH#16674 iNaT is treated as an integer when given by the user - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): td1[1] = iNaT - assert not isna(td1[1]) - assert td1.dtype == np.object_ - assert td1[1] == iNaT - td1[1] = td[1] - assert not isna(td1[1]) td1[2] = NaT assert isna(td1[2]) diff --git a/pandas/tests/series/test_reductions.py 
b/pandas/tests/series/test_reductions.py index 76353ab25fca6..5ffada8b95753 100644 --- a/pandas/tests/series/test_reductions.py +++ b/pandas/tests/series/test_reductions.py @@ -51,6 +51,29 @@ def test_mode_nullable_dtype(any_numeric_ea_dtype): tm.assert_series_equal(result, expected) +def test_mode_nullable_dtype_edge_case(any_numeric_ea_dtype): + # GH#58926 + ser = Series([1, 2, 3, 1], dtype=any_numeric_ea_dtype) + result = ser.mode(dropna=False) + expected = Series([1], dtype=any_numeric_ea_dtype) + tm.assert_series_equal(result, expected) + + ser2 = Series([1, 1, 2, 3, pd.NA], dtype=any_numeric_ea_dtype) + result = ser2.mode(dropna=False) + expected = Series([1], dtype=any_numeric_ea_dtype) + tm.assert_series_equal(result, expected) + + ser3 = Series([1, pd.NA, pd.NA], dtype=any_numeric_ea_dtype) + result = ser3.mode(dropna=False) + expected = Series([pd.NA], dtype=any_numeric_ea_dtype) + tm.assert_series_equal(result, expected) + + ser4 = Series([1, 1, pd.NA, pd.NA], dtype=any_numeric_ea_dtype) + result = ser4.mode(dropna=False) + expected = Series([1, pd.NA], dtype=any_numeric_ea_dtype) + tm.assert_series_equal(result, expected) + + def test_mode_infer_string(): # GH#56183 pytest.importorskip("pyarrow") @@ -70,7 +93,6 @@ def test_reductions_td64_with_nat(): assert ser.max() == exp -@pytest.mark.parametrize("skipna", [True, False]) def test_td64_sum_empty(skipna): # GH#37151 ser = Series([], dtype="timedelta64[ns]") @@ -163,56 +185,49 @@ def test_validate_stat_keepdims(): np.sum(ser, keepdims=True) -def test_mean_with_convertible_string_raises(using_array_manager, using_infer_string): +def test_mean_with_convertible_string_raises(): # GH#44008 ser = Series(["1", "2"]) - if using_infer_string: - msg = "does not support" - with pytest.raises(TypeError, match=msg): - ser.sum() - else: - assert ser.sum() == "12" - msg = "Could not convert string '12' to numeric|does not support" + assert ser.sum() == "12" + + msg = "Could not convert string '12' to numeric|does not support|Cannot perform" with pytest.raises(TypeError, match=msg): ser.mean() df = ser.to_frame() - if not using_array_manager: - msg = r"Could not convert \['12'\] to numeric|does not support" + msg = r"Could not convert \['12'\] to numeric|does not support|Cannot perform" with pytest.raises(TypeError, match=msg): df.mean() -def test_mean_dont_convert_j_to_complex(using_array_manager): +def test_mean_dont_convert_j_to_complex(): # GH#36703 df = pd.DataFrame([{"db": "J", "numeric": 123}]) - if using_array_manager: - msg = "Could not convert string 'J' to numeric" - else: - msg = r"Could not convert \['J'\] to numeric|does not support" + msg = r"Could not convert \['J'\] to numeric|does not support|Cannot perform" with pytest.raises(TypeError, match=msg): df.mean() with pytest.raises(TypeError, match=msg): df.agg("mean") - msg = "Could not convert string 'J' to numeric|does not support" + msg = "Could not convert string 'J' to numeric|does not support|Cannot perform" with pytest.raises(TypeError, match=msg): df["db"].mean() - msg = "Could not convert string 'J' to numeric|ufunc 'divide'" + msg = "Could not convert string 'J' to numeric|ufunc 'divide'|Cannot perform" with pytest.raises(TypeError, match=msg): np.mean(df["db"].astype("string").array) -def test_median_with_convertible_string_raises(using_array_manager): +def test_median_with_convertible_string_raises(): # GH#34671 this _could_ return a string "2", but definitely not float 2.0 - msg = r"Cannot 
convert \['1' '2' '3'\] to numeric|does not support|Cannot perform" ser = Series(["1", "2", "3"]) with pytest.raises(TypeError, match=msg): ser.median() - if not using_array_manager: - msg = r"Cannot convert \[\['1' '2' '3'\]\] to numeric|does not support" + msg = ( + r"Cannot convert \[\['1' '2' '3'\]\] to numeric|does not support|Cannot perform" + ) df = ser.to_frame() with pytest.raises(TypeError, match=msg): df.median() diff --git a/pandas/tests/series/test_ufunc.py b/pandas/tests/series/test_ufunc.py index 9d13ebf740eab..a5976bb2518c9 100644 --- a/pandas/tests/series/test_ufunc.py +++ b/pandas/tests/series/test_ufunc.py @@ -5,8 +5,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd import pandas._testing as tm from pandas.arrays import SparseArray @@ -18,7 +16,10 @@ def ufunc(request): return request.param -@pytest.fixture(params=[True, False], ids=["sparse", "dense"]) +@pytest.fixture( + params=[pytest.param(True, marks=pytest.mark.fails_arm_wheels), False], + ids=["sparse", "dense"], +) def sparse(request): return request.param @@ -291,7 +292,7 @@ def test_multiply(self, values_for_np_reduce, box_with_array, request): else: msg = "|".join( [ - "does not support reduction", + "does not support operation", "unsupported operand type", "ufunc 'multiply' cannot use operands", ] @@ -321,7 +322,7 @@ def test_add(self, values_for_np_reduce, box_with_array): else: msg = "|".join( [ - "does not support reduction", + "does not support operation", "unsupported operand type", "ufunc 'add' cannot use operands", ] @@ -427,6 +428,13 @@ def test_np_matmul(): tm.assert_frame_equal(expected, result) +@pytest.mark.parametrize("box", [pd.Index, pd.Series]) +def test_np_matmul_1D(box): + result = np.matmul(box([1, 2]), box([2, 3])) + assert result == 8 + assert isinstance(result, np.int64) + + def test_array_ufuncs_for_many_arguments(): # GH39853 def add3(x, y, z): @@ -449,8 +457,7 @@ def add3(x, y, z): ufunc(ser, ser, df) -# TODO(CoW) see https://github.com/pandas-dev/pandas/pull/51082 -@td.skip_copy_on_write_not_yet_implemented +@pytest.mark.xfail(reason="see https://github.com/pandas-dev/pandas/pull/51082") def test_np_fix(): # np.fix is not a ufunc but is composed of several ufunc calls under the hood # with `out` and `where` keywords diff --git a/pandas/tests/strings/__init__.py b/pandas/tests/strings/__init__.py index 01b49b5e5b633..6c4bec6a23789 100644 --- a/pandas/tests/strings/__init__.py +++ b/pandas/tests/strings/__init__.py @@ -2,12 +2,20 @@ import pandas as pd -object_pyarrow_numpy = ("object", "string[pyarrow_numpy]") + +def is_object_or_nan_string_dtype(dtype): + """ + Check if string-like dtype is following NaN semantics, i.e. is object + dtype or a NaN-variant of the StringDtype. + """ + return (isinstance(dtype, np.dtype) and dtype == "object") or ( + dtype.na_value is np.nan + ) def _convert_na_value(ser, expected): if ser.dtype != object: - if ser.dtype.storage == "pyarrow_numpy": + if ser.dtype.na_value is np.nan: expected = expected.fillna(np.nan) else: # GH#18463 diff --git a/pandas/tests/strings/conftest.py b/pandas/tests/strings/conftest.py index 036e4de20ba53..5bcbb16da3be9 100644 --- a/pandas/tests/strings/conftest.py +++ b/pandas/tests/strings/conftest.py @@ -68,6 +68,7 @@ "get_dummies", "isalnum", "isalpha", + "isascii", "isdecimal", "isdigit", "islower", @@ -122,7 +123,7 @@ def any_string_method(request): Examples -------- >>> def test_something(any_string_method): - ... s = Series(['a', 'b', np.nan, 'd']) + ... 
s = Series(["a", "b", np.nan, "d"]) ... ... method_name, args, kwargs = any_string_method ... method = getattr(s.str, method_name) diff --git a/pandas/tests/strings/test_api.py b/pandas/tests/strings/test_api.py index 31e005466af7b..4a1b97606db2b 100644 --- a/pandas/tests/strings/test_api.py +++ b/pandas/tests/strings/test_api.py @@ -1,3 +1,5 @@ +import weakref + import numpy as np import pytest @@ -68,6 +70,15 @@ def test_api(any_string_dtype): assert isinstance(Series([""], dtype=any_string_dtype).str, StringMethods) +def test_no_circular_reference(any_string_dtype): + # GH 47667 + ser = Series([""], dtype=any_string_dtype) + ref = weakref.ref(ser) + ser.str # Used to cache and cause circular reference + del ser + assert ref() is None + + def test_api_mi_raises(): # GH 23679 mi = MultiIndex.from_arrays([["a", "b", "c"]]) @@ -111,6 +122,7 @@ def test_api_per_method( any_allowed_skipna_inferred_dtype, any_string_method, request, + using_infer_string, ): # this test does not check correctness of the different methods, # just that the methods work on the specified (inferred) dtypes, @@ -149,6 +161,10 @@ def test_api_per_method( t = box(values, dtype=dtype) # explicit dtype to avoid casting method = getattr(t.str, method_name) + if using_infer_string and dtype == "category": + string_allowed = method_name not in ["decode"] + else: + string_allowed = True bytes_allowed = method_name in ["decode", "get", "len", "slice"] # as of v0.23.4, all methods except 'cat' are very lenient with the # allowed data types, just returning NaN for entries that error. @@ -157,7 +173,8 @@ def test_api_per_method( mixed_allowed = method_name not in ["cat"] allowed_types = ( - ["string", "unicode", "empty"] + ["empty"] + + ["string", "unicode"] * string_allowed + ["bytes"] * bytes_allowed + ["mixed", "mixed-integer"] * mixed_allowed ) @@ -170,7 +187,8 @@ def test_api_per_method( # GH 23011, GH 23163 msg = ( f"Cannot use .str.{method_name} with values of " - f"inferred dtype {repr(inferred_dtype)}." + f"inferred dtype {inferred_dtype!r}." 
+ "|a bytes-like object is required, not 'str'" ) with pytest.raises(TypeError, match=msg): method(*args, **kwargs) diff --git a/pandas/tests/strings/test_case_justify.py b/pandas/tests/strings/test_case_justify.py index 41aedae90ca76..819556f961fa3 100644 --- a/pandas/tests/strings/test_case_justify.py +++ b/pandas/tests/strings/test_case_justify.py @@ -291,11 +291,7 @@ def test_center_ljust_rjust_mixed_object(): def test_center_ljust_rjust_fillchar(any_string_dtype): - if any_string_dtype == "string[pyarrow_numpy]": - pytest.skip( - "Arrow logic is different, " - "see https://github.com/pandas-dev/pandas/pull/54533/files#r1299808126", - ) + # GH#54533, GH#54792 s = Series(["a", "bb", "cccc", "ddddd", "eeeeee"], dtype=any_string_dtype) result = s.str.center(5, fillchar="X") diff --git a/pandas/tests/strings/test_cat.py b/pandas/tests/strings/test_cat.py index c1e7ad6e02779..68ca807bde145 100644 --- a/pandas/tests/strings/test_cat.py +++ b/pandas/tests/strings/test_cat.py @@ -16,6 +16,11 @@ ) +@pytest.fixture +def index_or_series2(index_or_series): + return index_or_series + + @pytest.mark.parametrize("other", [None, Series, Index]) def test_str_cat_name(index_or_series, other): # GH 21053 @@ -270,14 +275,13 @@ def test_str_cat_mixed_inputs(index_or_series): s.str.cat(iter([t.values, list(s)])) -@pytest.mark.parametrize("join", ["left", "outer", "inner", "right"]) -def test_str_cat_align_indexed(index_or_series, join): +def test_str_cat_align_indexed(index_or_series, join_type): # https://github.com/pandas-dev/pandas/issues/18657 box = index_or_series s = Series(["a", "b", "c", "d"], index=["a", "b", "c", "d"]) t = Series(["D", "A", "E", "B"], index=["d", "a", "e", "b"]) - sa, ta = s.align(t, join=join) + sa, ta = s.align(t, join=join_type) # result after manual alignment of inputs expected = sa.str.cat(ta, na_rep="-") @@ -286,25 +290,24 @@ def test_str_cat_align_indexed(index_or_series, join): sa = Index(sa) expected = Index(expected) - result = s.str.cat(t, join=join, na_rep="-") + result = s.str.cat(t, join=join_type, na_rep="-") tm.assert_equal(result, expected) -@pytest.mark.parametrize("join", ["left", "outer", "inner", "right"]) -def test_str_cat_align_mixed_inputs(join): +def test_str_cat_align_mixed_inputs(join_type): s = Series(["a", "b", "c", "d"]) t = Series(["d", "a", "e", "b"], index=[3, 0, 4, 1]) d = concat([t, t], axis=1) expected_outer = Series(["aaa", "bbb", "c--", "ddd", "-ee"]) - expected = expected_outer.loc[s.index.join(t.index, how=join)] + expected = expected_outer.loc[s.index.join(t.index, how=join_type)] # list of Series - result = s.str.cat([t, t], join=join, na_rep="-") + result = s.str.cat([t, t], join=join_type, na_rep="-") tm.assert_series_equal(result, expected) # DataFrame - result = s.str.cat(d, join=join, na_rep="-") + result = s.str.cat(d, join=join_type, na_rep="-") tm.assert_series_equal(result, expected) # mixed list of indexed/unindexed @@ -313,19 +316,19 @@ def test_str_cat_align_mixed_inputs(join): # joint index of rhs [t, u]; u will be forced have index of s rhs_idx = ( t.index.intersection(s.index) - if join == "inner" + if join_type == "inner" else t.index.union(s.index) - if join == "outer" + if join_type == "outer" else t.index.append(s.index.difference(t.index)) ) - expected = expected_outer.loc[s.index.join(rhs_idx, how=join)] - result = s.str.cat([t, u], join=join, na_rep="-") + expected = expected_outer.loc[s.index.join(rhs_idx, how=join_type)] + result = s.str.cat([t, u], join=join_type, na_rep="-") tm.assert_series_equal(result, 
expected) with pytest.raises(TypeError, match="others must be Series,.*"): # nested lists are forbidden - s.str.cat([t, list(u)], join=join) + s.str.cat([t, list(u)], join=join_type) # errors for incorrect lengths rgx = r"If `others` contains arrays or lists \(or other list-likes.*" @@ -333,11 +336,11 @@ def test_str_cat_align_mixed_inputs(join): # unindexed object of wrong length with pytest.raises(ValueError, match=rgx): - s.str.cat(z, join=join) + s.str.cat(z, join=join_type) # unindexed object of wrong length in list with pytest.raises(ValueError, match=rgx): - s.str.cat([t, z], join=join) + s.str.cat([t, z], join=join_type) def test_str_cat_all_na(index_or_series, index_or_series2): diff --git a/pandas/tests/strings/test_extract.py b/pandas/tests/strings/test_extract.py index 77d008c650264..1dc7dbabe85b0 100644 --- a/pandas/tests/strings/test_extract.py +++ b/pandas/tests/strings/test_extract.py @@ -548,7 +548,6 @@ def test_extractall_single_group_with_quantifier(any_string_dtype): (["a3", "b3", "d4c2"], (None,)), (["a3", "b3", "d4c2"], ("i1", "i2")), (["a3", "b3", "d4c2"], (None, "i2")), - (["a3", "b3", "d4c2"], ("i1", "i2")), ], ) def test_extractall_no_matches(data, names, any_string_dtype): @@ -564,13 +563,13 @@ def test_extractall_no_matches(data, names, any_string_dtype): # one un-named group. result = s.str.extractall("(z)") - expected = DataFrame(columns=[0], index=expected_index, dtype=any_string_dtype) - tm.assert_frame_equal(result, expected) + expected = DataFrame(columns=range(1), index=expected_index, dtype=any_string_dtype) + tm.assert_frame_equal(result, expected, check_column_type=True) # two un-named groups. result = s.str.extractall("(z)(z)") - expected = DataFrame(columns=[0, 1], index=expected_index, dtype=any_string_dtype) - tm.assert_frame_equal(result, expected) + expected = DataFrame(columns=range(2), index=expected_index, dtype=any_string_dtype) + tm.assert_frame_equal(result, expected, check_column_type=True) # one named group. 
result = s.str.extractall("(?Pz)") diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 3f58c6d703f8f..30e6ebf0eed13 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -4,7 +4,6 @@ import numpy as np import pytest -from pandas.errors import PerformanceWarning import pandas.util._test_decorators as td import pandas as pd @@ -14,7 +13,7 @@ ) from pandas.tests.strings import ( _convert_na_value, - object_pyarrow_numpy, + is_object_or_nan_string_dtype, ) # -------------------------------------------------------------------------------------- @@ -22,10 +21,6 @@ # -------------------------------------------------------------------------------------- -def using_pyarrow(dtype): - return dtype in ("string[pyarrow]", "string[pyarrow_numpy]") - - def test_contains(any_string_dtype): values = np.array( ["foo", np.nan, "fooommm__foo", "mmm_", "foommm[_]+bar"], dtype=np.object_ @@ -34,18 +29,28 @@ def test_contains(any_string_dtype): pat = "mmm[_]+" result = values.str.contains(pat) - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" - expected = Series( - np.array([False, np.nan, True, True, False], dtype=np.object_), - dtype=expected_dtype, - ) + if any_string_dtype == "str": + # NaN propagates as False + expected = Series([False, False, True, True, False], dtype=bool) + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + expected = Series( + np.array([False, np.nan, True, True, False], dtype=np.object_), + dtype=expected_dtype, + ) + tm.assert_series_equal(result, expected) result = values.str.contains(pat, regex=False) - expected = Series( - np.array([False, np.nan, False, False, True], dtype=np.object_), - dtype=expected_dtype, - ) + if any_string_dtype == "str": + expected = Series([False, False, False, False, True], dtype=bool) + else: + expected = Series( + np.array([False, np.nan, False, False, True], dtype=np.object_), + dtype=expected_dtype, + ) tm.assert_series_equal(result, expected) values = Series( @@ -53,7 +58,9 @@ def test_contains(any_string_dtype): dtype=any_string_dtype, ) result = values.str.contains(pat) - expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series(np.array([False, False, True, True]), dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -80,14 +87,22 @@ def test_contains(any_string_dtype): pat = "mmm[_]+" result = values.str.contains(pat) - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" - expected = Series( - np.array([False, np.nan, True, True], dtype=np.object_), dtype=expected_dtype - ) + if any_string_dtype == "str": + expected = Series([False, False, True, True], dtype=bool) + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + expected = Series( + np.array([False, np.nan, True, True], dtype=np.object_), + dtype=expected_dtype, + ) tm.assert_series_equal(result, expected) result = values.str.contains(pat, na=False) - expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series(np.array([False, False, True, True]), dtype=expected_dtype) tm.assert_series_equal(result, 
expected) @@ -159,7 +174,16 @@ def test_contains_na_kwarg_for_nullable_string_dtype( # https://github.com/pandas-dev/pandas/pull/41025#issuecomment-824062416 values = Series(["a", "b", "c", "a", np.nan], dtype=nullable_string_dtype) - result = values.str.contains("a", na=na, regex=regex) + + msg = ( + "Allowing a non-bool 'na' in obj.str.contains is deprecated and " + "will raise in a future version" + ) + warn = None + if not pd.isna(na) and not isinstance(na, bool): + warn = FutureWarning + with tm.assert_produces_warning(warn, match=msg): + result = values.str.contains("a", na=na, regex=regex) expected = Series([True, False, False, True, expected], dtype="boolean") tm.assert_series_equal(result, expected) @@ -172,37 +196,45 @@ def test_contains_moar(any_string_dtype): ) result = s.str.contains("a") - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + if any_string_dtype == "str": + # NaN propagates as False + expected_dtype = bool + na_value = False + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + na_value = np.nan expected = Series( - [False, False, False, True, True, False, np.nan, False, False, True], + [False, False, False, True, True, False, na_value, False, False, True], dtype=expected_dtype, ) tm.assert_series_equal(result, expected) result = s.str.contains("a", case=False) expected = Series( - [True, False, False, True, True, False, np.nan, True, False, True], + [True, False, False, True, True, False, na_value, True, False, True], dtype=expected_dtype, ) tm.assert_series_equal(result, expected) result = s.str.contains("Aa") expected = Series( - [False, False, False, True, False, False, np.nan, False, False, False], + [False, False, False, True, False, False, na_value, False, False, False], dtype=expected_dtype, ) tm.assert_series_equal(result, expected) result = s.str.contains("ba") expected = Series( - [False, False, False, True, False, False, np.nan, False, False, False], + [False, False, False, True, False, False, na_value, False, False, False], dtype=expected_dtype, ) tm.assert_series_equal(result, expected) result = s.str.contains("ba", case=False) expected = Series( - [False, False, False, True, True, False, np.nan, True, False, False], + [False, False, False, True, True, False, na_value, True, False, False], dtype=expected_dtype, ) tm.assert_series_equal(result, expected) @@ -213,7 +245,9 @@ def test_contains_nan(any_string_dtype): s = Series([np.nan, np.nan, np.nan], dtype=any_string_dtype) result = s.str.contains("foo", na=False) - expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([False, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -221,18 +255,29 @@ def test_contains_nan(any_string_dtype): expected = Series([True, True, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) - result = s.str.contains("foo", na="foo") + msg = ( + "Allowing a non-bool 'na' in obj.str.contains is deprecated and " + "will raise in a future version" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s.str.contains("foo", na="foo") if any_string_dtype == "object": expected = Series(["foo", "foo", "foo"], dtype=np.object_) - elif any_string_dtype == "string[pyarrow_numpy]": + elif any_string_dtype.na_value is np.nan: expected = Series([True, True, True], 
dtype=np.bool_) else: expected = Series([True, True, True], dtype="boolean") tm.assert_series_equal(result, expected) result = s.str.contains("foo") - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" - expected = Series([np.nan, np.nan, np.nan], dtype=expected_dtype) + if any_string_dtype == "str": + # NaN propagates as False + expected = Series([False, False, False], dtype=bool) + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + expected = Series([np.nan, np.nan, np.nan], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -241,11 +286,26 @@ def test_contains_nan(any_string_dtype): # -------------------------------------------------------------------------------------- +def test_startswith_endswith_validate_na(any_string_dtype): + # GH#59615 + ser = Series( + ["om", np.nan, "foo_nom", "nom", "bar_foo", np.nan, "foo"], + dtype=any_string_dtype, + ) + + msg = "Allowing a non-bool 'na' in obj.str.startswith is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + ser.str.startswith("kapow", na="baz") + msg = "Allowing a non-bool 'na' in obj.str.endswith is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + ser.str.endswith("bar", na="baz") + + @pytest.mark.parametrize("pat", ["foo", ("foo", "baz")]) @pytest.mark.parametrize("dtype", ["object", "category"]) @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA]) @pytest.mark.parametrize("na", [True, False]) -def test_startswith(pat, dtype, null_value, na): +def test_startswith(pat, dtype, null_value, na, using_infer_string): # add category dtype parametrizations for GH-36241 values = Series( ["om", null_value, "foo_nom", "nom", "bar_foo", null_value, "foo"], @@ -259,6 +319,8 @@ def test_startswith(pat, dtype, null_value, na): exp = exp.fillna(null_value) elif dtype == "object" and null_value is None: exp[exp.isna()] = None + elif using_infer_string and dtype == "category": + exp = exp.fillna(False).astype(bool) tm.assert_series_equal(result, exp) result = values.str.startswith(pat, na=na) @@ -276,20 +338,31 @@ def test_startswith(pat, dtype, null_value, na): @pytest.mark.parametrize("na", [None, True, False]) -def test_startswith_nullable_string_dtype(nullable_string_dtype, na): +def test_startswith_string_dtype(any_string_dtype, na): values = Series( ["om", None, "foo_nom", "nom", "bar_foo", None, "foo", "regex", "rege."], - dtype=nullable_string_dtype, + dtype=any_string_dtype, ) result = values.str.startswith("foo", na=na) + + expected_dtype = ( + (object if na is None else bool) + if is_object_or_nan_string_dtype(any_string_dtype) + else "boolean" + ) + if any_string_dtype == "str": + # NaN propagates as False + expected_dtype = bool + if na is None: + na = False exp = Series( - [False, na, True, False, False, na, True, False, False], dtype="boolean" + [False, na, True, False, False, na, True, False, False], dtype=expected_dtype ) tm.assert_series_equal(result, exp) result = values.str.startswith("rege.", na=na) exp = Series( - [False, na, False, False, False, na, False, False, True], dtype="boolean" + [False, na, False, False, False, na, False, False, True], dtype=expected_dtype ) tm.assert_series_equal(result, exp) @@ -303,7 +376,7 @@ def test_startswith_nullable_string_dtype(nullable_string_dtype, na): @pytest.mark.parametrize("dtype", ["object", "category"]) @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA]) @pytest.mark.parametrize("na", [True, False]) -def 
test_endswith(pat, dtype, null_value, na): +def test_endswith(pat, dtype, null_value, na, using_infer_string): # add category dtype parametrizations for GH-36241 values = Series( ["om", null_value, "foo_nom", "nom", "bar_foo", null_value, "foo"], @@ -317,6 +390,8 @@ def test_endswith(pat, dtype, null_value, na): exp = exp.fillna(null_value) elif dtype == "object" and null_value is None: exp[exp.isna()] = None + elif using_infer_string and dtype == "category": + exp = exp.fillna(False).astype(bool) tm.assert_series_equal(result, exp) result = values.str.endswith(pat, na=na) @@ -334,20 +409,30 @@ def test_endswith(pat, dtype, null_value, na): @pytest.mark.parametrize("na", [None, True, False]) -def test_endswith_nullable_string_dtype(nullable_string_dtype, na): +def test_endswith_string_dtype(any_string_dtype, na): values = Series( ["om", None, "foo_nom", "nom", "bar_foo", None, "foo", "regex", "rege."], - dtype=nullable_string_dtype, + dtype=any_string_dtype, ) result = values.str.endswith("foo", na=na) + expected_dtype = ( + (object if na is None else bool) + if is_object_or_nan_string_dtype(any_string_dtype) + else "boolean" + ) + if any_string_dtype == "str": + # NaN propagates as False + expected_dtype = bool + if na is None: + na = False exp = Series( - [False, na, False, False, True, na, True, False, False], dtype="boolean" + [False, na, False, False, True, na, True, False, False], dtype=expected_dtype ) tm.assert_series_equal(result, exp) result = values.str.endswith("rege.", na=na) exp = Series( - [False, na, False, False, False, na, False, False, True], dtype="boolean" + [False, na, False, False, False, na, False, False, True], dtype=expected_dtype ) tm.assert_series_equal(result, exp) @@ -355,6 +440,21 @@ def test_endswith_nullable_string_dtype(nullable_string_dtype, na): # -------------------------------------------------------------------------------------- # str.replace # -------------------------------------------------------------------------------------- +def test_replace_dict_invalid(any_string_dtype): + # GH 51914 + series = Series(data=["A", "B_junk", "C_gunk"], name="my_messy_col") + msg = "repl cannot be used when pat is a dictionary" + + with pytest.raises(ValueError, match=msg): + series.str.replace(pat={"A": "a", "B": "b"}, repl="A") + + +def test_replace_dict(any_string_dtype): + # GH 51914 + series = Series(data=["A", "B", "C"], name="my_messy_col") + new_series = series.str.replace(pat={"A": "a", "B": "b"}) + expected = Series(data=["a", "b", "C"], name="my_messy_col") + tm.assert_series_equal(new_series, expected) def test_replace(any_string_dtype): @@ -391,8 +491,7 @@ def test_replace_mixed_object(): def test_replace_unicode(any_string_dtype): ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype) expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype) - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True) + result = ser.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True) tm.assert_series_equal(result, expected) @@ -412,8 +511,7 @@ def test_replace_callable(any_string_dtype): # test with callable repl = lambda m: m.group(0).swapcase() - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True) + result = ser.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True) expected = Series(["foObaD__baRbaD", 
np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -431,10 +529,7 @@ def test_replace_callable_raises(any_string_dtype, repl): r"(?(3)required )positional arguments?" ) with pytest.raises(TypeError, match=msg): - with tm.maybe_produces_warning( - PerformanceWarning, using_pyarrow(any_string_dtype) - ): - values.str.replace("a", repl, regex=True) + values.str.replace("a", repl, regex=True) def test_replace_callable_named_groups(any_string_dtype): # GH 44855 ser = Series(["Foo Bar Baz", np.nan], dtype=any_string_dtype) pat = r"(?P<first>\w+) (?P<middle>\w+) (?P<last>\w+)" repl = lambda m: m.group("middle").swapcase() - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace(pat, repl, regex=True) + result = ser.str.replace(pat, repl, regex=True) expected = Series(["bAR", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -454,13 +548,11 @@ def test_replace_compiled_regex(any_string_dtype): # test with compiled regex pat = re.compile(r"BAD_*") - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace(pat, "", regex=True) + result = ser.str.replace(pat, "", regex=True) expected = Series(["foobar", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace(pat, "", n=1, regex=True) + result = ser.str.replace(pat, "", n=1, regex=True) expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -481,8 +573,7 @@ def test_replace_compiled_regex_unicode(any_string_dtype): ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype) expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype) pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE) - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace(pat, ", ", regex=True) + result = ser.str.replace(pat, ", ", regex=True) tm.assert_series_equal(result, expected) @@ -509,19 +600,16 @@ def test_replace_compiled_regex_callable(any_string_dtype): ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) repl = lambda m: m.group(0).swapcase() pat = re.compile("[a-z][A-Z]{2}") - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace(pat, repl, n=2, regex=True) + result = ser.str.replace(pat, repl, n=2, regex=True) expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) -@pytest.mark.parametrize( - "regex,expected", [(True, ["bao", "bao", np.nan]), (False, ["bao", "foo", np.nan])] -) -def test_replace_literal(regex, expected, any_string_dtype): +@pytest.mark.parametrize("regex,expected_val", [(True, "bao"), (False, "foo")]) +def test_replace_literal(regex, expected_val, any_string_dtype): # GH16808 literal replace (regex=False vs regex=True) ser = Series(["f.o", "foo", np.nan], dtype=any_string_dtype) - expected = Series(expected, dtype=any_string_dtype) + expected = Series(["bao", expected_val, np.nan], dtype=any_string_dtype) result = ser.str.replace("f.", "ba", regex=regex) tm.assert_series_equal(result, expected) @@ -558,8 +646,7 @@ def test_replace_moar(any_string_dtype): ) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning(PerformanceWarning, 
using_pyarrow(any_string_dtype)): - result = ser.str.replace("A", "YYY", case=False) + result = ser.str.replace("A", "YYY", case=False) expected = Series( [ "YYY", @@ -577,8 +664,7 @@ def test_replace_moar(any_string_dtype): ) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace("^.a|dog", "XX-XX ", case=False, regex=True) + result = ser.str.replace("^.a|dog", "XX-XX ", case=False, regex=True) expected = Series( [ "A", @@ -601,13 +687,11 @@ def test_replace_not_case_sensitive_not_regex(any_string_dtype): # https://github.com/pandas-dev/pandas/issues/41602 ser = Series(["A.", "a.", "Ab", "ab", np.nan], dtype=any_string_dtype) - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace("a", "c", case=False, regex=False) + result = ser.str.replace("a", "c", case=False, regex=False) expected = Series(["c.", "c.", "cb", "cb", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace("a.", "c.", case=False, regex=False) + result = ser.str.replace("a.", "c.", case=False, regex=False) expected = Series(["c.", "c.", "Ab", "ab", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -640,34 +724,41 @@ def test_replace_regex_single_character(regex, any_string_dtype): def test_match(any_string_dtype): - # New match behavior introduced in 0.13 - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + if any_string_dtype == "str": + # NaN propagates as False + expected_dtype = bool + na_value = False + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + na_value = np.nan values = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype) result = values.str.match(".*(BAD[_]+).*(BAD)") - expected = Series([True, np.nan, False], dtype=expected_dtype) + expected = Series([True, na_value, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) values = Series( ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) result = values.str.match(".*BAD[_]+.*BAD") - expected = Series([True, True, np.nan, False], dtype=expected_dtype) + expected = Series([True, True, na_value, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) result = values.str.match("BAD[_]+.*BAD") - expected = Series([False, True, np.nan, False], dtype=expected_dtype) + expected = Series([False, True, na_value, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) values = Series( ["fooBAD__barBAD", "^BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) result = values.str.match("^BAD[_]+.*BAD") - expected = Series([False, False, np.nan, False], dtype=expected_dtype) + expected = Series([False, False, na_value, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) result = values.str.match("\\^BAD[_]+.*BAD") - expected = Series([False, True, np.nan, False], dtype=expected_dtype) + expected = Series([False, True, na_value, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -696,20 +787,33 @@ def test_match_na_kwarg(any_string_dtype): s = Series(["a", "b", np.nan], dtype=any_string_dtype) result = s.str.match("a", na=False) - expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype 
= ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([True, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) result = s.str.match("a") - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" - expected = Series([True, False, np.nan], dtype=expected_dtype) + if any_string_dtype == "str": + # NaN propagates as False + expected_dtype = bool + na_value = False + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + na_value = np.nan + + expected = Series([True, False, na_value], dtype=expected_dtype) tm.assert_series_equal(result, expected) def test_match_case_kwarg(any_string_dtype): values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) result = values.str.match("ab", case=False) - expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([True, True, True, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -725,8 +829,29 @@ def test_fullmatch(any_string_dtype): ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) result = ser.str.fullmatch(".*BAD[_]+.*BAD") - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" - expected = Series([True, False, np.nan, False], dtype=expected_dtype) + if any_string_dtype == "str": + # NaN propagates as False + expected = Series([True, False, False, False], dtype=bool) + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + expected = Series([True, False, np.nan, False], dtype=expected_dtype) + tm.assert_series_equal(result, expected) + + +def test_fullmatch_dollar_literal(any_string_dtype): + # GH 56652 + ser = Series(["foo", "foo$foo", np.nan, "foo$"], dtype=any_string_dtype) + result = ser.str.fullmatch("foo\\$") + if any_string_dtype == "str": + # NaN propagates as False + expected = Series([False, False, False, True], dtype=bool) + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + expected = Series([False, False, np.nan, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -735,14 +860,18 @@ def test_fullmatch_na_kwarg(any_string_dtype): ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) result = ser.str.fullmatch(".*BAD[_]+.*BAD", na=False) - expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([True, False, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) def test_fullmatch_case_kwarg(any_string_dtype): ser = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) - expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([True, False, False, False], dtype=expected_dtype) @@ -754,8 +883,7 @@ def test_fullmatch_case_kwarg(any_string_dtype): result = ser.str.fullmatch("ab", case=False) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.fullmatch("ab", flags=re.IGNORECASE) 
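[Editor's note] The match/fullmatch hunks above and below all encode the same dtype rule: under the NaN-backed "str" dtype, missing values propagate as False and the result is plain bool, while the nullable "string" dtype keeps NA and returns "boolean". A minimal sketch of the literal-dollar case (GH 56652), assuming a pandas build where the "str" dtype is available:

```python
import numpy as np
import pandas as pd

ser = pd.Series(["foo", "foo$foo", np.nan, "foo$"], dtype="str")
# fullmatch anchors the pattern to the entire string; the escaped "$"
# is a literal character, and the missing value comes back as False.
print(ser.str.fullmatch("foo\\$").tolist())  # [False, False, False, True]
```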
+ result = ser.str.fullmatch("ab", flags=re.IGNORECASE) tm.assert_series_equal(result, expected) @@ -814,7 +942,9 @@ def test_find(any_string_dtype): ser = Series( ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF", "XXXX"], dtype=any_string_dtype ) - expected_dtype = np.int64 if any_string_dtype in object_pyarrow_numpy else "Int64" + expected_dtype = ( + np.int64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64" + ) result = ser.str.find("EF") expected = Series([4, 3, 1, 0, -1], dtype=expected_dtype) @@ -866,7 +996,9 @@ def test_find_nan(any_string_dtype): ser = Series( ["ABCDEFG", np.nan, "DEFGHIJEF", np.nan, "XXXX"], dtype=any_string_dtype ) - expected_dtype = np.float64 if any_string_dtype in object_pyarrow_numpy else "Int64" + expected_dtype = ( + np.float64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64" + ) result = ser.str.find("EF") expected = Series([4, np.nan, 1, np.nan, -1], dtype=expected_dtype) @@ -936,17 +1068,13 @@ def test_flags_kwarg(any_string_dtype): pat = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})" - use_pyarrow = using_pyarrow(any_string_dtype) - result = data.str.extract(pat, flags=re.IGNORECASE, expand=True) assert result.iloc[0].tolist() == ["dave", "google", "com"] - with tm.maybe_produces_warning(PerformanceWarning, use_pyarrow): - result = data.str.match(pat, flags=re.IGNORECASE) + result = data.str.match(pat, flags=re.IGNORECASE) assert result.iloc[0] - with tm.maybe_produces_warning(PerformanceWarning, use_pyarrow): - result = data.str.fullmatch(pat, flags=re.IGNORECASE) + result = data.str.fullmatch(pat, flags=re.IGNORECASE) assert result.iloc[0] result = data.str.findall(pat, flags=re.IGNORECASE) @@ -956,8 +1084,6 @@ def test_flags_kwarg(any_string_dtype): assert result.iloc[0] == 1 msg = "has match groups" - with tm.assert_produces_warning( - UserWarning, match=msg, raise_on_extra_warnings=not use_pyarrow - ): + with tm.assert_produces_warning(UserWarning, match=msg): result = data.str.contains(pat, flags=re.IGNORECASE) assert result.iloc[0] diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py index 31386e4e342ae..16e10c6fcdccd 100644 --- a/pandas/tests/strings/test_get_dummies.py +++ b/pandas/tests/strings/test_get_dummies.py @@ -1,4 +1,7 @@ import numpy as np +import pytest + +import pandas.util._test_decorators as td from pandas import ( DataFrame, @@ -32,22 +35,68 @@ def test_get_dummies_index(): tm.assert_index_equal(result, expected) -def test_get_dummies_with_name_dummy(any_string_dtype): - # GH 12180 - # Dummies named 'name' should work as expected - s = Series(["a", "b,name", "b"], dtype=any_string_dtype) - result = s.str.get_dummies(",") - expected = DataFrame([[1, 0, 0], [0, 1, 1], [0, 1, 0]], columns=["a", "b", "name"]) +# GH#47872 +@pytest.mark.parametrize( + "dtype", + [ + np.uint8, + np.int16, + np.uint16, + np.int32, + np.uint32, + np.int64, + np.uint64, + bool, + "Int8", + "Int16", + "Int32", + "Int64", + "boolean", + ], +) +def test_get_dummies_with_dtype(any_string_dtype, dtype): + s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) + result = s.str.get_dummies("|", dtype=dtype) + expected = DataFrame( + [[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc"), dtype=dtype + ) tm.assert_frame_equal(result, expected) -def test_get_dummies_with_name_dummy_index(): - # GH 12180 - # Dummies named 'name' should work as expected - idx = Index(["a|b", "name|c", "b|name"]) - result = idx.str.get_dummies("|") - - expected = MultiIndex.from_tuples( - [(1, 1, 0, 0), (0, 0, 
1, 1), (0, 1, 0, 1)], names=("a", "b", "c", "name") +# GH#47872 +@td.skip_if_no("pyarrow") +@pytest.mark.parametrize( + "dtype", + [ + "int8[pyarrow]", + "uint8[pyarrow]", + "int16[pyarrow]", + "uint16[pyarrow]", + "int32[pyarrow]", + "uint32[pyarrow]", + "int64[pyarrow]", + "uint64[pyarrow]", + "bool[pyarrow]", + ], +) +def test_get_dummies_with_pyarrow_dtype(any_string_dtype, dtype): + s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) + result = s.str.get_dummies("|", dtype=dtype) + expected = DataFrame( + [[1, 1, 0], [1, 0, 1], [0, 0, 0]], + columns=list("abc"), + dtype=dtype, ) - tm.assert_index_equal(result, expected) + tm.assert_frame_equal(result, expected) + + +# GH#47872 +def test_get_dummies_with_str_dtype(any_string_dtype): + s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) + + msg = "Only numeric or boolean dtypes are supported for 'dtype'" + with pytest.raises(ValueError, match=msg): + s.str.get_dummies("|", dtype=str) + + with pytest.raises(ValueError, match=msg): + s.str.get_dummies("|", dtype="datetime64[ns]") diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py index 9ff1fc0e13ae9..4fab6e7778002 100644 --- a/pandas/tests/strings/test_split_partition.py +++ b/pandas/tests/strings/test_split_partition.py @@ -14,7 +14,7 @@ ) from pandas.tests.strings import ( _convert_na_value, - object_pyarrow_numpy, + is_object_or_nan_string_dtype, ) @@ -190,23 +190,24 @@ def test_split_maxsplit(data, pat, any_string_dtype, n): @pytest.mark.parametrize( - "data, pat, expected", + "data, pat, expected_val", [ ( ["split once", "split once too!"], None, - Series({0: ["split", "once"], 1: ["split", "once too!"]}), + "once too!", ), ( ["split_once", "split_once_too!"], "_", - Series({0: ["split", "once"], 1: ["split", "once_too!"]}), + "once_too!", ), ], ) -def test_split_no_pat_with_nonzero_n(data, pat, expected, any_string_dtype): +def test_split_no_pat_with_nonzero_n(data, pat, expected_val, any_string_dtype): s = Series(data, dtype=any_string_dtype) result = s.str.split(pat=pat, n=1) + expected = Series({0: ["split", "once"], 1: ["split", expected_val]}) tm.assert_series_equal(expected, result, check_index_type=False) @@ -384,7 +385,7 @@ def test_split_nan_expand(any_string_dtype): # check that these are actually np.nan/pd.NA and not None # TODO see GH 18463 # tm.assert_frame_equal does not differentiate - if any_string_dtype in object_pyarrow_numpy: + if is_object_or_nan_string_dtype(any_string_dtype): assert all(np.isnan(x) for x in result.iloc[1]) else: assert all(x is pd.NA for x in result.iloc[1]) @@ -533,37 +534,27 @@ def test_partition_series_stdlib(any_string_dtype, method): @pytest.mark.parametrize( - "method, expand, exp, exp_levels", + "method, exp", [ [ "partition", - False, - np.array( - [("a", "_", "b_c"), ("c", "_", "d_e"), ("f", "_", "g_h"), np.nan, None], - dtype=object, - ), - 1, + [("a", "_", "b_c"), ("c", "_", "d_e"), ("f", "_", "g_h"), np.nan, None], ], [ "rpartition", - False, - np.array( - [("a_b", "_", "c"), ("c_d", "_", "e"), ("f_g", "_", "h"), np.nan, None], - dtype=object, - ), - 1, + [("a_b", "_", "c"), ("c_d", "_", "e"), ("f_g", "_", "h"), np.nan, None], ], ], ) -def test_partition_index(method, expand, exp, exp_levels): +def test_partition_index(method, exp): # https://github.com/pandas-dev/pandas/issues/23558 values = Index(["a_b_c", "c_d_e", "f_g_h", np.nan, None]) - result = getattr(values.str, method)("_", expand=expand) - exp = Index(exp) + result = getattr(values.str, method)("_", 
expand=False) + exp = Index(np.array(exp, dtype=object), dtype=object) tm.assert_index_equal(result, exp) - assert result.nlevels == exp_levels + assert result.nlevels == 1 @pytest.mark.parametrize( diff --git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py index 0b3f368afea5e..c5414022e664b 100644 --- a/pandas/tests/strings/test_string_array.py +++ b/pandas/tests/strings/test_string_array.py @@ -12,7 +12,6 @@ ) -@pytest.mark.filterwarnings("ignore:Falling back") def test_string_array(nullable_string_dtype, any_string_method): method_name, args, kwargs = any_string_method @@ -39,7 +38,7 @@ def test_string_array(nullable_string_dtype, any_string_method): expected.values, skipna=True ): assert result.dtype == "boolean" - result = result.astype(object) + expected = expected.astype("boolean") elif expected.dtype == "bool": assert result.dtype == "boolean" @@ -84,6 +83,7 @@ def test_string_array_numeric_integer_array(nullable_string_dtype, method, expec [ ("isdigit", [False, None, True]), ("isalpha", [True, None, False]), + ("isascii", [True, None, True]), ("isalnum", [True, None, True]), ("isnumeric", [False, None, True]), ], diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index f662dfd7e2b14..025f837982595 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -14,7 +14,7 @@ ) import pandas._testing as tm from pandas.core.strings.accessor import StringMethods -from pandas.tests.strings import object_pyarrow_numpy +from pandas.tests.strings import is_object_or_nan_string_dtype @pytest.mark.parametrize("pattern", [0, True, Series(["foo", "bar"])]) @@ -41,7 +41,9 @@ def test_iter_raises(): def test_count(any_string_dtype): ser = Series(["foo", "foofoo", np.nan, "foooofooofommmfoo"], dtype=any_string_dtype) result = ser.str.count("f[o]+") - expected_dtype = np.float64 if any_string_dtype in object_pyarrow_numpy else "Int64" + expected_dtype = ( + np.float64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64" + ) expected = Series([1, 2, np.nan, 4], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -93,7 +95,8 @@ def test_repeat_with_null(any_string_dtype, arg, repeat): def test_empty_str_methods(any_string_dtype): empty_str = empty = Series(dtype=any_string_dtype) - if any_string_dtype in object_pyarrow_numpy: + empty_inferred_str = Series(dtype="str") + if is_object_or_nan_string_dtype(any_string_dtype): empty_int = Series(dtype="int64") empty_bool = Series(dtype=bool) else: @@ -119,16 +122,16 @@ def test_empty_str_methods(any_string_dtype): tm.assert_series_equal(empty_str, empty.str.repeat(3)) tm.assert_series_equal(empty_bool, empty.str.match("^a")) tm.assert_frame_equal( - DataFrame(columns=[0], dtype=any_string_dtype), + DataFrame(columns=range(1), dtype=any_string_dtype), empty.str.extract("()", expand=True), ) tm.assert_frame_equal( - DataFrame(columns=[0, 1], dtype=any_string_dtype), + DataFrame(columns=range(2), dtype=any_string_dtype), empty.str.extract("()()", expand=True), ) tm.assert_series_equal(empty_str, empty.str.extract("()", expand=False)) tm.assert_frame_equal( - DataFrame(columns=[0, 1], dtype=any_string_dtype), + DataFrame(columns=range(2), dtype=any_string_dtype), empty.str.extract("()()", expand=False), ) tm.assert_frame_equal(empty_df.set_axis([], axis=1), empty.str.get_dummies()) @@ -152,11 +155,12 @@ def test_empty_str_methods(any_string_dtype): tm.assert_series_equal(empty_str, empty.str.rstrip()) 
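[Editor's note] These hunks register the new Series.str.isascii among the boolean string methods. A quick sketch of the expected semantics, assuming a build that ships the method (it mirrors the standard library's str.isascii):

```python
import pandas as pd

ser = pd.Series(["A", "3A", "Ç", ""], dtype="string")
# True only when every character is in the ASCII range; note that the
# empty string is True, unlike most of the other ismethods.
print(ser.str.isascii().tolist())  # [True, True, False, True]
```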
tm.assert_series_equal(empty_str, empty.str.wrap(42)) tm.assert_series_equal(empty_str, empty.str.get(0)) - tm.assert_series_equal(empty_object, empty_bytes.str.decode("ascii")) + tm.assert_series_equal(empty_inferred_str, empty_bytes.str.decode("ascii")) tm.assert_series_equal(empty_bytes, empty.str.encode("ascii")) # ismethods should always return boolean (GH 29624) tm.assert_series_equal(empty_bool, empty.str.isalnum()) tm.assert_series_equal(empty_bool, empty.str.isalpha()) + tm.assert_series_equal(empty_bool, empty.str.isascii()) tm.assert_series_equal(empty_bool, empty.str.isdigit()) tm.assert_series_equal(empty_bool, empty.str.isspace()) tm.assert_series_equal(empty_bool, empty.str.islower()) @@ -175,6 +179,7 @@ def test_empty_str_methods(any_string_dtype): @pytest.mark.parametrize( "method, expected", [ + ("isascii", [True, True, True, True, True, True, True, True, True, True]), ("isalnum", [True, True, True, True, True, False, True, True, False, False]), ("isalpha", [True, True, True, False, False, False, True, False, False, False]), ( @@ -207,14 +212,29 @@ def test_ismethods(method, expected, any_string_dtype): ser = Series( ["A", "b", "Xy", "4", "3A", "", "TT", "55", "-", " "], dtype=any_string_dtype ) - expected_dtype = "bool" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "bool" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series(expected, dtype=expected_dtype) result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) # compare with standard library - expected = [getattr(item, method)() for item in ser] - assert list(result) == expected + expected_stdlib = [getattr(item, method)() for item in ser] + assert list(result) == expected_stdlib + + # with missing value + ser.iloc[[1, 2, 3, 4]] = np.nan + result = getattr(ser.str, method)() + if ser.dtype == "object": + expected = expected.astype(object) + expected.iloc[[1, 2, 3, 4]] = np.nan + elif ser.dtype == "str": + # NaN propagates as False + expected.iloc[[1, 2, 3, 4]] = False + else: + # nullable dtypes propagate NaN + expected.iloc[[1, 2, 3, 4]] = np.nan + tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -230,9 +250,12 @@ def test_isnumeric_unicode(method, expected, any_string_dtype): # 0x1378: ፸ ETHIOPIC NUMBER SEVENTY # 0xFF13: ３ Em 3 # noqa: RUF003 ser = Series( - ["A", "3", "¼", "★", "፸", "３", "four"], dtype=any_string_dtype # noqa: RUF001 + ["A", "3", "¼", "★", "፸", "３", "four"], # noqa: RUF001 + dtype=any_string_dtype, + ) + expected_dtype = ( + "bool" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" ) - expected_dtype = "bool" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series(expected, dtype=expected_dtype) result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) @@ -252,8 +275,14 @@ def test_isnumeric_unicode_missing(method, expected, any_string_dtype): values = ["A", np.nan, "¼", "★", np.nan, "３", "four"] # noqa: RUF001 ser = Series(values, dtype=any_string_dtype) - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" - expected = Series(expected, dtype=expected_dtype) + if any_string_dtype == "str": + # NaN propagates as False + expected = Series(expected, dtype=object).fillna(False).astype(bool) + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + expected = Series(expected, dtype=expected_dtype) result = getattr(ser.str, method)()
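[Editor's note] The missing-value block added to test_ismethods distinguishes three regimes. A compact illustration of the object-dtype case (the other two regimes are noted in the comments):

```python
import numpy as np
import pandas as pd

ser = pd.Series(["A", np.nan, "3"], dtype=object)
# object dtype: the result stays object and NaN passes through;
# the "str" dtype would yield [False, False, True] as plain bool, and
# the nullable "string" dtype would yield [False, <NA>, True] as "boolean".
print(ser.str.isnumeric().tolist())  # [False, nan, True]
```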
tm.assert_series_equal(result, expected) @@ -283,7 +312,9 @@ def test_len(any_string_dtype): dtype=any_string_dtype, ) result = ser.str.len() - expected_dtype = "float64" if any_string_dtype in object_pyarrow_numpy else "Int64" + expected_dtype = ( + "float64" if is_object_or_nan_string_dtype(any_string_dtype) else "Int64" + ) expected = Series([3, 4, 6, np.nan, 8, 4, 1], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -312,7 +343,9 @@ def test_index(method, sub, start, end, index_or_series, any_string_dtype, expec obj = index_or_series( ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"], dtype=any_string_dtype ) - expected_dtype = np.int64 if any_string_dtype in object_pyarrow_numpy else "Int64" + expected_dtype = ( + np.int64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64" + ) expected = index_or_series(expected, dtype=expected_dtype) result = getattr(obj.str, method)(sub, start, end) @@ -353,7 +386,9 @@ def test_index_wrong_type_raises(index_or_series, any_string_dtype, method): ) def test_index_missing(any_string_dtype, method, exp): ser = Series(["abcb", "ab", "bcbe", np.nan], dtype=any_string_dtype) - expected_dtype = np.float64 if any_string_dtype in object_pyarrow_numpy else "Int64" + expected_dtype = ( + np.float64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64" + ) result = getattr(ser.str, method)("b") expected = Series(exp + [np.nan], dtype=expected_dtype) @@ -379,6 +414,7 @@ def test_pipe_failures(any_string_dtype): (2, 5, None, ["foo", "bar", np.nan, "baz"]), (0, 3, -1, ["", "", np.nan, ""]), (None, None, -1, ["owtoofaa", "owtrabaa", np.nan, "xuqzabaa"]), + (None, 2, -1, ["owtoo", "owtra", np.nan, "xuqza"]), (3, 10, 2, ["oto", "ato", np.nan, "aqx"]), (3, 0, -1, ["ofa", "aba", np.nan, "aba"]), ], @@ -531,7 +567,7 @@ def test_string_slice_out_of_bounds(any_string_dtype): def test_encode_decode(any_string_dtype): ser = Series(["a", "b", "a\xe4"], dtype=any_string_dtype).str.encode("utf-8") result = ser.str.decode("utf-8") - expected = ser.map(lambda x: x.decode("utf-8")).astype(object) + expected = Series(["a", "b", "a\xe4"], dtype="str") tm.assert_series_equal(result, expected) @@ -561,10 +597,34 @@ def test_decode_errors_kwarg(): ser.str.decode("cp1252") result = ser.str.decode("cp1252", "ignore") - expected = ser.map(lambda x: x.decode("cp1252", "ignore")).astype(object) + expected = ser.map(lambda x: x.decode("cp1252", "ignore")).astype("str") + tm.assert_series_equal(result, expected) + + +def test_decode_string_dtype(string_dtype): + # https://github.com/pandas-dev/pandas/pull/60940 + ser = Series([b"a", b"b"]) + result = ser.str.decode("utf-8", dtype=string_dtype) + expected = Series(["a", "b"], dtype=string_dtype) tm.assert_series_equal(result, expected) +def test_decode_object_dtype(object_dtype): + # https://github.com/pandas-dev/pandas/pull/60940 + ser = Series([b"a", rb"\ud800"]) + result = ser.str.decode("utf-8", dtype=object_dtype) + expected = Series(["a", r"\ud800"], dtype=object_dtype) + tm.assert_series_equal(result, expected) + + +def test_decode_bad_dtype(): + # https://github.com/pandas-dev/pandas/pull/60940 + ser = Series([b"a", b"b"]) + msg = "dtype must be string or object, got dtype='int64'" + with pytest.raises(ValueError, match=msg): + ser.str.decode("utf-8", dtype="int64") + + @pytest.mark.parametrize( "form, expected", [ @@ -716,5 +776,5 @@ def test_get_with_dict_label(): def test_series_str_decode(): # GH 22613 result = Series([b"x", b"y"]).str.decode(encoding="UTF-8", errors="strict") - expected = 
Series(["x", "y"], dtype="object") + expected = Series(["x", "y"], dtype="str") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/test_aggregation.py b/pandas/tests/test_aggregation.py index 7695c953712ed..3a01805cc2365 100644 --- a/pandas/tests/test_aggregation.py +++ b/pandas/tests/test_aggregation.py @@ -10,7 +10,7 @@ def test_maybe_mangle_lambdas_passthrough(): assert maybe_mangle_lambdas("mean") == "mean" assert maybe_mangle_lambdas(lambda x: x).__name__ == "" - # don't mangel single lambda. + # don't mangle single lambda. assert maybe_mangle_lambdas([lambda x: x])[0].__name__ == "" diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 718d1b3ee2e83..7fb421e27bb40 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -16,7 +16,10 @@ is_integer_dtype, is_object_dtype, ) -from pandas.core.dtypes.dtypes import CategoricalDtype +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + DatetimeTZDtype, +) import pandas as pd from pandas import ( @@ -51,19 +54,15 @@ class TestFactorize: def test_factorize_complex(self): # GH#17927 - array = [1, 2, 2 + 1j] - msg = "factorize with argument that is not not a Series" - with tm.assert_produces_warning(FutureWarning, match=msg): - labels, uniques = algos.factorize(array) + array = np.array([1, 2, 2 + 1j], dtype=complex) + labels, uniques = algos.factorize(array) expected_labels = np.array([0, 1, 2], dtype=np.intp) tm.assert_numpy_array_equal(labels, expected_labels) - # Should return a complex dtype in the future - expected_uniques = np.array([(1 + 0j), (2 + 0j), (2 + 1j)], dtype=object) + expected_uniques = np.array([(1 + 0j), (2 + 0j), (2 + 1j)], dtype=complex) tm.assert_numpy_array_equal(uniques, expected_uniques) - @pytest.mark.parametrize("sort", [True, False]) def test_factorize(self, index_or_series_obj, sort): obj = index_or_series_obj result_codes, result_uniques = obj.factorize(sort=sort) @@ -263,9 +262,8 @@ def test_factorizer_object_with_nan(self): ) def test_factorize_tuple_list(self, data, expected_codes, expected_uniques): # GH9454 - msg = "factorize with argument that is not not a Series" - with tm.assert_produces_warning(FutureWarning, match=msg): - codes, uniques = pd.factorize(data) + data = com.asarray_tuplesafe(data, dtype=object) + codes, uniques = pd.factorize(data) tm.assert_numpy_array_equal(codes, np.array(expected_codes, dtype=np.intp)) @@ -354,7 +352,6 @@ def test_datetime64_factorize(self, writable): tm.assert_numpy_array_equal(codes, expected_codes) tm.assert_numpy_array_equal(uniques, expected_uniques) - @pytest.mark.parametrize("sort", [True, False]) def test_factorize_rangeindex(self, sort): # increasing -> sort doesn't matter ri = pd.RangeIndex.from_range(range(10)) @@ -368,7 +365,6 @@ def test_factorize_rangeindex(self, sort): tm.assert_numpy_array_equal(result[0], expected[0]) tm.assert_index_equal(result[1], expected[1], exact=True) - @pytest.mark.parametrize("sort", [True, False]) def test_factorize_rangeindex_decreasing(self, sort): # decreasing -> sort matters ri = pd.RangeIndex.from_range(range(10)) @@ -431,7 +427,6 @@ def test_parametrized_factorize_na_value(self, data, na_value): tm.assert_numpy_array_equal(codes, expected_codes) tm.assert_numpy_array_equal(uniques, expected_uniques) - @pytest.mark.parametrize("sort", [True, False]) @pytest.mark.parametrize( "data, uniques", [ @@ -489,12 +484,12 @@ def test_object_factorize_use_na_sentinel_false( "data, expected_codes, expected_uniques", [ ( - [1, None, 1, 2], + np.array([1, None, 1, 2], 
dtype=object), np.array([0, 1, 0, 2], dtype=np.dtype("intp")), np.array([1, np.nan, 2], dtype="O"), ), ( - [1, np.nan, 1, 2], + np.array([1, np.nan, 1, 2], dtype=np.float64), np.array([0, 1, 0, 2], dtype=np.dtype("intp")), np.array([1, np.nan, 2], dtype=np.float64), ), @@ -503,9 +498,7 @@ def test_object_factorize_use_na_sentinel_false( def test_int_factorize_use_na_sentinel_false( self, data, expected_codes, expected_uniques ): - msg = "factorize with argument that is not not a Series" - with tm.assert_produces_warning(FutureWarning, match=msg): - codes, uniques = algos.factorize(data, use_na_sentinel=False) + codes, uniques = algos.factorize(data, use_na_sentinel=False) tm.assert_numpy_array_equal(uniques, expected_uniques, strict_nan=True) tm.assert_numpy_array_equal(codes, expected_codes, strict_nan=True) @@ -574,19 +567,20 @@ def test_object_refcount_bug(self): for i in range(1000): len(algos.unique(lst)) - def test_on_index_object(self): - mindex = MultiIndex.from_arrays( - [np.arange(5).repeat(5), np.tile(np.arange(5), 5)] - ) - expected = mindex.values - expected.sort() - - mindex = mindex.repeat(2) + def test_index_returned(self, index): + # GH#57043 + index = index.repeat(2) + result = algos.unique(index) - result = pd.unique(mindex) - result.sort() - - tm.assert_almost_equal(result, expected) + # dict.fromkeys preserves the order + unique_values = list(dict.fromkeys(index.values)) + if isinstance(index, MultiIndex): + expected = MultiIndex.from_tuples(unique_values, names=index.names) + else: + expected = Index(unique_values, dtype=index.dtype) + if isinstance(index.dtype, DatetimeTZDtype): + expected = expected.normalize() + tm.assert_index_equal(result, expected, exact=True) def test_dtype_preservation(self, any_numpy_dtype): # GH 15442 @@ -627,7 +621,7 @@ def test_dtype_preservation(self, any_numpy_dtype): def test_datetime64_dtype_array_returned(self): # GH 9431 - expected = np.array( + dt_arr = np.array( [ "2015-01-03T00:00:00.000000000", "2015-01-01T00:00:00.000000000", @@ -643,18 +637,18 @@ def test_datetime64_dtype_array_returned(self): ] ) result = algos.unique(dt_index) - tm.assert_numpy_array_equal(result, expected) - assert result.dtype == expected.dtype + expected = to_datetime(dt_arr) + tm.assert_index_equal(result, expected, exact=True) s = Series(dt_index) result = algos.unique(s) - tm.assert_numpy_array_equal(result, expected) - assert result.dtype == expected.dtype + tm.assert_numpy_array_equal(result, dt_arr) + assert result.dtype == dt_arr.dtype arr = s.values result = algos.unique(arr) - tm.assert_numpy_array_equal(result, expected) - assert result.dtype == expected.dtype + tm.assert_numpy_array_equal(result, dt_arr) + assert result.dtype == dt_arr.dtype def test_datetime_non_ns(self): a = np.array(["2000", "2000", "2001"], dtype="datetime64[s]") @@ -670,22 +664,23 @@ def test_timedelta_non_ns(self): def test_timedelta64_dtype_array_returned(self): # GH 9431 - expected = np.array([31200, 45678, 10000], dtype="m8[ns]") + td_arr = np.array([31200, 45678, 10000], dtype="m8[ns]") td_index = to_timedelta([31200, 45678, 31200, 10000, 45678]) result = algos.unique(td_index) - tm.assert_numpy_array_equal(result, expected) + expected = to_timedelta(td_arr) + tm.assert_index_equal(result, expected) assert result.dtype == expected.dtype s = Series(td_index) result = algos.unique(s) - tm.assert_numpy_array_equal(result, expected) - assert result.dtype == expected.dtype + tm.assert_numpy_array_equal(result, td_arr) + assert result.dtype == td_arr.dtype arr = s.values 
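[Editor's note] test_index_returned (GH#57043) pins down that unique round-trips an Index as an Index rather than an ndarray. A sketch through the public entry point:

```python
import pandas as pd

idx = pd.Index([3, 1, 3, 2]).repeat(2)
# The result is an Index preserving order of first appearance
print(pd.unique(idx))  # Index([3, 1, 2], dtype='int64')
```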
result = algos.unique(arr) - tm.assert_numpy_array_equal(result, expected) - assert result.dtype == expected.dtype + tm.assert_numpy_array_equal(result, td_arr) + assert result.dtype == td_arr.dtype def test_uint64_overflow(self): s = Series([1, 2, 2**63, 2**63], dtype=np.uint64) @@ -776,9 +771,8 @@ def test_order_of_appearance(self): result = pd.unique(Series([2] + [1] * 5)) tm.assert_numpy_array_equal(result, np.array([2, 1], dtype="int64")) - msg = "unique with argument that is not not a Series, Index," - with tm.assert_produces_warning(FutureWarning, match=msg): - result = pd.unique(list("aabc")) + data = np.array(["a", "a", "b", "c"], dtype=object) + result = pd.unique(data) expected = np.array(["a", "b", "c"], dtype=object) tm.assert_numpy_array_equal(result, expected) @@ -814,9 +808,8 @@ def test_order_of_appearance_dt64tz(self, unit): ) def test_tuple_with_strings(self, arg, expected): # see GH 17108 - msg = "unique with argument that is not not a Series" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = pd.unique(arg) + arg = com.asarray_tuplesafe(arg, dtype=object) + result = pd.unique(arg) tm.assert_numpy_array_equal(result, expected) def test_obj_none_preservation(self): @@ -880,6 +873,14 @@ def test_unique_masked(self, any_numeric_ea_dtype): expected = pd.array([1, pd.NA, 2], dtype=any_numeric_ea_dtype) tm.assert_extension_array_equal(result, expected) + def test_unique_NumpyExtensionArray(self): + arr_complex = pd.array( + [1 + 1j, 2, 3] + ) # NumpyEADtype('complex128') => NumpyExtensionArray + result = pd.unique(arr_complex) + expected = pd.array([1 + 1j, 2 + 0j, 3 + 0j]) + tm.assert_extension_array_equal(result, expected) + def test_nunique_ints(index_or_series_or_array): # GH#36327 @@ -903,12 +904,6 @@ def test_invalid(self): algos.isin([1], 1) def test_basic(self): - msg = "isin with argument that is not not a Series" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = algos.isin([1, 2], [1]) - expected = np.array([True, False]) - tm.assert_numpy_array_equal(result, expected) - result = algos.isin(np.array([1, 2]), [1]) expected = np.array([True, False]) tm.assert_numpy_array_equal(result, expected) @@ -925,21 +920,20 @@ def test_basic(self): expected = np.array([True, False]) tm.assert_numpy_array_equal(result, expected) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = algos.isin(["a", "b"], ["a"]) + arg = np.array(["a", "b"], dtype=object) + result = algos.isin(arg, ["a"]) expected = np.array([True, False]) tm.assert_numpy_array_equal(result, expected) - result = algos.isin(Series(["a", "b"]), Series(["a"])) + result = algos.isin(Series(arg), Series(["a"])) expected = np.array([True, False]) tm.assert_numpy_array_equal(result, expected) - result = algos.isin(Series(["a", "b"]), {"a"}) + result = algos.isin(Series(arg), {"a"}) expected = np.array([True, False]) tm.assert_numpy_array_equal(result, expected) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = algos.isin(["a", "b"], [1]) + result = algos.isin(arg, [1]) expected = np.array([False, False]) tm.assert_numpy_array_equal(result, expected) @@ -1004,21 +998,18 @@ def test_isin_datetimelike_all_nat(self, dtype): tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize("dtype", ["m8[ns]", "M8[ns]", "M8[ns, UTC]"]) - def test_isin_datetimelike_strings_deprecated(self, dtype): + def test_isin_datetimelike_strings_returns_false(self, dtype): # GH#53111 dta = date_range("2013-01-01", periods=3)._values arr = 
Series(dta.view("i8")).array.view(dtype) vals = [str(x) for x in arr] - msg = "The behavior of 'isin' with dtype=.* is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - res = algos.isin(arr, vals) - assert res.all() + res = algos.isin(arr, vals) + assert not res.any() vals2 = np.array(vals, dtype=str) - with tm.assert_produces_warning(FutureWarning, match=msg): - res2 = algos.isin(arr, vals2) - assert res2.all() + res2 = algos.isin(arr, vals2) + assert not res2.any() def test_isin_dt64tz_with_nat(self): # the all-NaT values used to get inferred to tznaive, which was evaluated @@ -1057,12 +1048,10 @@ def test_same_nan_is_in(self): # at least, isin() should follow python's "np.nan in [nan] == True" # casting to -> np.float64 -> another float-object somewhere on # the way could lead jeopardize this behavior - comps = [np.nan] # could be casted to float64 + comps = np.array([np.nan], dtype=object) # could be casted to float64 values = [np.nan] expected = np.array([True]) - msg = "isin with argument that is not not a Series" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = algos.isin(comps, values) + result = algos.isin(comps, values) tm.assert_numpy_array_equal(expected, result) def test_same_nan_is_in_large(self): @@ -1097,12 +1086,12 @@ def __hash__(self): a, b = LikeNan(), LikeNan() - msg = "isin with argument that is not not a Series" - with tm.assert_produces_warning(FutureWarning, match=msg): - # same object -> True - tm.assert_numpy_array_equal(algos.isin([a], [a]), np.array([True])) - # different objects -> False - tm.assert_numpy_array_equal(algos.isin([a], [b]), np.array([False])) + arg = np.array([a], dtype=object) + + # same object -> True + tm.assert_numpy_array_equal(algos.isin(arg, [a]), np.array([True])) + # different objects -> False + tm.assert_numpy_array_equal(algos.isin(arg, [b]), np.array([False])) def test_different_nans(self): # GH 22160 @@ -1131,12 +1120,11 @@ def test_different_nans(self): def test_no_cast(self): # GH 22160 # ensure 42 is not casted to a string - comps = ["ss", 42] + comps = np.array(["ss", 42], dtype=object) values = ["42"] expected = np.array([False, False]) - msg = "isin with argument that is not not a Series, Index" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = algos.isin(comps, values) + + result = algos.isin(comps, values) tm.assert_numpy_array_equal(expected, result) @pytest.mark.parametrize("empty", [[], Series(dtype=object), np.array([])]) @@ -1216,9 +1204,7 @@ def test_value_counts(self): factor = cut(arr, 4) # assert isinstance(factor, n) - msg = "pandas.value_counts is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = algos.value_counts(factor) + result = algos.value_counts_internal(factor) breaks = [-1.606, -1.018, -0.431, 0.155, 0.741] index = IntervalIndex.from_breaks(breaks).astype(CategoricalDtype(ordered=True)) expected = Series([1, 0, 2, 1], index=index, name="count") @@ -1226,16 +1212,13 @@ def test_value_counts(self): def test_value_counts_bins(self): s = [1, 2, 3, 4] - msg = "pandas.value_counts is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = algos.value_counts(s, bins=1) + result = algos.value_counts_internal(s, bins=1) expected = Series( [4], index=IntervalIndex.from_tuples([(0.996, 4.0)]), name="count" ) tm.assert_series_equal(result, expected) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = algos.value_counts(s, bins=2, sort=False) + result = 
algos.value_counts_internal(s, bins=2, sort=False) expected = Series( [2, 2], index=IntervalIndex.from_tuples([(0.996, 2.5), (2.5, 4.0)]), @@ -1244,45 +1227,35 @@ def test_value_counts_bins(self): tm.assert_series_equal(result, expected) def test_value_counts_dtypes(self): - msg2 = "pandas.value_counts is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg2): - result = algos.value_counts(np.array([1, 1.0])) + result = algos.value_counts_internal(np.array([1, 1.0])) assert len(result) == 1 - with tm.assert_produces_warning(FutureWarning, match=msg2): - result = algos.value_counts(np.array([1, 1.0]), bins=1) + result = algos.value_counts_internal(np.array([1, 1.0]), bins=1) assert len(result) == 1 - with tm.assert_produces_warning(FutureWarning, match=msg2): - result = algos.value_counts(Series([1, 1.0, "1"])) # object + result = algos.value_counts_internal(Series([1, 1.0, "1"])) # object assert len(result) == 2 msg = "bins argument only works with numeric data" with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=msg2): - algos.value_counts(np.array(["1", 1], dtype=object), bins=1) + algos.value_counts_internal(np.array(["1", 1], dtype=object), bins=1) def test_value_counts_nat(self): td = Series([np.timedelta64(10000), NaT], dtype="timedelta64[ns]") dt = to_datetime(["NaT", "2014-01-01"]) - msg = "pandas.value_counts is deprecated" - for ser in [td, dt]: - with tm.assert_produces_warning(FutureWarning, match=msg): - vc = algos.value_counts(ser) - vc_with_na = algos.value_counts(ser, dropna=False) + vc = algos.value_counts_internal(ser) + vc_with_na = algos.value_counts_internal(ser, dropna=False) assert len(vc) == 1 assert len(vc_with_na) == 2 exp_dt = Series({Timestamp("2014-01-01 00:00:00"): 1}, name="count") - with tm.assert_produces_warning(FutureWarning, match=msg): - result_dt = algos.value_counts(dt) + result_dt = algos.value_counts_internal(dt) tm.assert_series_equal(result_dt, exp_dt) - exp_td = Series({np.timedelta64(10000): 1}, name="count") - with tm.assert_produces_warning(FutureWarning, match=msg): - result_td = algos.value_counts(td) + exp_td = Series([1], index=[np.timedelta64(10000)], name="count") + result_td = algos.value_counts_internal(td) tm.assert_series_equal(result_td, exp_td) @pytest.mark.parametrize("dtype", [object, "M8[us]"]) @@ -1299,6 +1272,7 @@ def test_value_counts_datetime_outofbounds(self, dtype): ], dtype=dtype, ) + res = ser.value_counts() exp_index = Index( @@ -1439,16 +1413,13 @@ def test_value_counts_normalized(self, dtype): def test_value_counts_uint64(self): arr = np.array([2**63], dtype=np.uint64) expected = Series([1], index=[2**63], name="count") - msg = "pandas.value_counts is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = algos.value_counts(arr) + result = algos.value_counts_internal(arr) tm.assert_series_equal(result, expected) arr = np.array([-1, 2**63], dtype=object) expected = Series([1, 1], index=[-1, 2**63], name="count") - with tm.assert_produces_warning(FutureWarning, match=msg): - result = algos.value_counts(arr) + result = algos.value_counts_internal(arr) tm.assert_series_equal(result, expected) @@ -1526,9 +1497,7 @@ def test_duplicated_with_nas(self): ] ), np.array(["a", "b", "a", "e", "c", "b", "d", "a", "e", "f"], dtype=object), - np.array( - [1, 2**63, 1, 3**5, 10, 2**63, 39, 1, 3**5, 7], dtype=np.uint64 - ), + np.array([1, 2**63, 1, 3**5, 10, 2**63, 39, 1, 3**5, 7], dtype=np.uint64), ], ) def test_numeric_object_likes(self, 
case): @@ -1677,35 +1646,53 @@ def test_unique_tuples(self, arr, uniques): expected = np.empty(len(uniques), dtype=object) expected[:] = uniques - msg = "unique with argument that is not not a Series" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = pd.unique(arr) - tm.assert_numpy_array_equal(result, expected) + msg = ( + r"unique requires a Series, Index, ExtensionArray, np.ndarray " + r"or NumpyExtensionArray got list" + ) + with pytest.raises(TypeError, match=msg): + # GH#52986 + pd.unique(arr) + + res = pd.unique(com.asarray_tuplesafe(arr, dtype=object)) + tm.assert_numpy_array_equal(res, expected) @pytest.mark.parametrize( "array,expected", [ ( [1 + 1j, 0, 1, 1j, 1 + 2j, 1 + 2j], - # Should return a complex dtype in the future - np.array([(1 + 1j), 0j, (1 + 0j), 1j, (1 + 2j)], dtype=object), + np.array([(1 + 1j), 0j, (1 + 0j), 1j, (1 + 2j)], dtype=complex), ) ], ) def test_unique_complex_numbers(self, array, expected): # GH 17927 - msg = "unique with argument that is not not a Series" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = pd.unique(array) - tm.assert_numpy_array_equal(result, expected) + msg = ( + r"unique requires a Series, Index, ExtensionArray, np.ndarray " + r"or NumpyExtensionArray got list" + ) + + with pytest.raises(TypeError, match=msg): + # GH#52986 + pd.unique(array) + + res = pd.unique(np.array(array)) + tm.assert_numpy_array_equal(res, expected) class TestHashTable: @pytest.mark.parametrize( "htable, data", [ - (ht.PyObjectHashTable, [f"foo_{i}" for i in range(1000)]), - (ht.StringHashTable, [f"foo_{i}" for i in range(1000)]), + ( + ht.PyObjectHashTable, + np.array([f"foo_{i}" for i in range(1000)], dtype=object), + ), + ( + ht.StringHashTable, + np.array([f"foo_{i}" for i in range(1000)], dtype=object), + ), (ht.Float64HashTable, np.arange(1000, dtype=np.float64)), (ht.Int64HashTable, np.arange(1000, dtype=np.int64)), (ht.UInt64HashTable, np.arange(1000, dtype=np.uint64)), @@ -1713,7 +1700,7 @@ class TestHashTable: ) def test_hashtable_unique(self, htable, data, writable): # output of maker has guaranteed unique elements - s = Series(data) + s = Series(data, dtype=data.dtype) if htable == ht.Float64HashTable: # add NaN for float column s.loc[500] = np.nan @@ -1743,8 +1730,14 @@ def test_hashtable_unique(self, htable, data, writable): @pytest.mark.parametrize( "htable, data", [ - (ht.PyObjectHashTable, [f"foo_{i}" for i in range(1000)]), - (ht.StringHashTable, [f"foo_{i}" for i in range(1000)]), + ( + ht.PyObjectHashTable, + np.array([f"foo_{i}" for i in range(1000)], dtype=object), + ), + ( + ht.StringHashTable, + np.array([f"foo_{i}" for i in range(1000)], dtype=object), + ), (ht.Float64HashTable, np.arange(1000, dtype=np.float64)), (ht.Int64HashTable, np.arange(1000, dtype=np.int64)), (ht.UInt64HashTable, np.arange(1000, dtype=np.uint64)), @@ -1752,7 +1745,7 @@ def test_hashtable_unique(self, htable, data, writable): ) def test_hashtable_factorize(self, htable, writable, data): # output of maker has guaranteed unique elements - s = Series(data) + s = Series(data, dtype=data.dtype) if htable == ht.Float64HashTable: # add NaN for float column s.loc[500] = np.nan @@ -1794,18 +1787,16 @@ def test_scipy_compat(self, arr): arr = np.array(arr) mask = ~np.isfinite(arr) - arr = arr.copy() result = libalgos.rank_1d(arr) arr[mask] = np.inf exp = sp_stats.rankdata(arr) exp[mask] = np.nan tm.assert_almost_equal(result, exp) - @pytest.mark.parametrize("dtype", np.typecodes["AllInteger"]) - def test_basic(self, writable, 
dtype): + def test_basic(self, writable, any_int_numpy_dtype): exp = np.array([1, 2], dtype=np.float64) - data = np.array([1, 100], dtype=dtype) + data = np.array([1, 100], dtype=any_int_numpy_dtype) data.setflags(write=writable) ser = Series(data) result = algos.rank(ser) @@ -1840,10 +1831,10 @@ def test_pct_max_many_rows(self): class TestMode: def test_no_mode(self): exp = Series([], dtype=np.float64, index=Index([], dtype=int)) - tm.assert_numpy_array_equal(algos.mode(np.array([])), exp.values) + result, _ = algos.mode(np.array([])) + tm.assert_numpy_array_equal(result, exp.values) - @pytest.mark.parametrize("dt", np.typecodes["AllInteger"] + np.typecodes["Float"]) - def test_mode_single(self, dt): + def test_mode_single(self, any_real_numpy_dtype): # GH 15714 exp_single = [1] data_single = [1] @@ -1851,39 +1842,44 @@ def test_mode_single(self, dt): exp_multi = [1] data_multi = [1, 1] - ser = Series(data_single, dtype=dt) - exp = Series(exp_single, dtype=dt) - tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values) + ser = Series(data_single, dtype=any_real_numpy_dtype) + exp = Series(exp_single, dtype=any_real_numpy_dtype) + result, _ = algos.mode(ser.values) + tm.assert_numpy_array_equal(result, exp.values) tm.assert_series_equal(ser.mode(), exp) - ser = Series(data_multi, dtype=dt) - exp = Series(exp_multi, dtype=dt) - tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values) + ser = Series(data_multi, dtype=any_real_numpy_dtype) + exp = Series(exp_multi, dtype=any_real_numpy_dtype) + result, _ = algos.mode(ser.values) + tm.assert_numpy_array_equal(result, exp.values) tm.assert_series_equal(ser.mode(), exp) def test_mode_obj_int(self): exp = Series([1], dtype=int) - tm.assert_numpy_array_equal(algos.mode(exp.values), exp.values) + result, _ = algos.mode(exp.values) + tm.assert_numpy_array_equal(result, exp.values) exp = Series(["a", "b", "c"], dtype=object) - tm.assert_numpy_array_equal(algos.mode(exp.values), exp.values) + result, _ = algos.mode(exp.values) + tm.assert_numpy_array_equal(result, exp.values) - @pytest.mark.parametrize("dt", np.typecodes["AllInteger"] + np.typecodes["Float"]) - def test_number_mode(self, dt): + def test_number_mode(self, any_real_numpy_dtype): exp_single = [1] data_single = [1] * 5 + [2] * 3 exp_multi = [1, 3] data_multi = [1] * 5 + [2] * 3 + [3] * 5 - ser = Series(data_single, dtype=dt) - exp = Series(exp_single, dtype=dt) - tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values) + ser = Series(data_single, dtype=any_real_numpy_dtype) + exp = Series(exp_single, dtype=any_real_numpy_dtype) + result, _ = algos.mode(ser.values) + tm.assert_numpy_array_equal(result, exp.values) tm.assert_series_equal(ser.mode(), exp) - ser = Series(data_multi, dtype=dt) - exp = Series(exp_multi, dtype=dt) - tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values) + ser = Series(data_multi, dtype=any_real_numpy_dtype) + exp = Series(exp_multi, dtype=any_real_numpy_dtype) + result, _ = algos.mode(ser.values) + tm.assert_numpy_array_equal(result, exp.values) tm.assert_series_equal(ser.mode(), exp) def test_strobj_mode(self): @@ -1892,17 +1888,22 @@ def test_strobj_mode(self): ser = Series(data, dtype="c") exp = Series(exp, dtype="c") - tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values) + result, _ = algos.mode(ser.values) + tm.assert_numpy_array_equal(result, exp.values) tm.assert_series_equal(ser.mode(), exp) @pytest.mark.parametrize("dt", [str, object]) - def test_strobj_multi_char(self, dt): + def test_strobj_multi_char(self, dt, 
using_infer_string): exp = ["bar"] data = ["foo"] * 2 + ["bar"] * 3 ser = Series(data, dtype=dt) exp = Series(exp, dtype=dt) - tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values) + result, _ = algos.mode(ser.values) + if using_infer_string and dt is str: + tm.assert_extension_array_equal(result, exp.values) + else: + tm.assert_numpy_array_equal(result, exp.values) tm.assert_series_equal(ser.mode(), exp) def test_datelike_mode(self): @@ -1936,18 +1937,21 @@ def test_timedelta_mode(self): def test_mixed_dtype(self): exp = Series(["foo"], dtype=object) ser = Series([1, "foo", "foo"]) - tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values) + result, _ = algos.mode(ser.values) + tm.assert_numpy_array_equal(result, exp.values) tm.assert_series_equal(ser.mode(), exp) def test_uint64_overflow(self): exp = Series([2**63], dtype=np.uint64) ser = Series([1, 2**63, 2**63], dtype=np.uint64) - tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values) + result, _ = algos.mode(ser.values) + tm.assert_numpy_array_equal(result, exp.values) tm.assert_series_equal(ser.mode(), exp) exp = Series([1, 2**63], dtype=np.uint64) ser = Series([1, 2**63], dtype=np.uint64) - tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values) + result, _ = algos.mode(ser.values) + tm.assert_numpy_array_equal(result, exp.values) tm.assert_series_equal(ser.mode(), exp) def test_categorical(self): @@ -1969,15 +1973,18 @@ def test_categorical(self): def test_index(self): idx = Index([1, 2, 3]) exp = Series([1, 2, 3], dtype=np.int64) - tm.assert_numpy_array_equal(algos.mode(idx), exp.values) + result, _ = algos.mode(idx) + tm.assert_numpy_array_equal(result, exp.values) idx = Index([1, "a", "a"]) exp = Series(["a"], dtype=object) - tm.assert_numpy_array_equal(algos.mode(idx), exp.values) + result, _ = algos.mode(idx) + tm.assert_numpy_array_equal(result, exp.values) idx = Index([1, 1, 2, 3, 3]) exp = Series([1, 3], dtype=np.int64) - tm.assert_numpy_array_equal(algos.mode(idx), exp.values) + result, _ = algos.mode(idx) + tm.assert_numpy_array_equal(result, exp.values) idx = Index( ["1 day", "1 day", "-1 day", "-1 day 2 min", "2 min", "2 min"], diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index e8a1c961c8cb6..9f68ef5ffdc47 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -3,11 +3,12 @@ import string import subprocess import sys -import textwrap import numpy as np import pytest +from pandas.compat import WASM + import pandas as pd from pandas import Series import pandas._testing as tm @@ -16,84 +17,91 @@ from pandas.util.version import Version -def test_get_callable_name(): - getname = com.get_callable_name - - def fn(x): +class TestGetCallableName: + def fn(self, x): return x + partial1 = partial(fn) + partial2 = partial(partial1) lambda_ = lambda x: x - part1 = partial(fn) - part2 = partial(part1) - class somecall: + class SomeCall: def __call__(self): - # This shouldn't actually get called below; somecall.__init__ + # This shouldn't actually get called below; SomeCall.__init__ # should. 
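[Editor's note] Throughout TestMode the calls change from `algos.mode(values)` to `result, _ = algos.mode(values)`: the internal helper now returns a pair whose second element is a mask-like slot (my reading of the tests; the exact shape of that second element is internal detail). The public API is untouched:

```python
import pandas as pd

ser = pd.Series([1, 1, 2, 3, 3])
# Series.mode still returns a Series of the modal values, in sorted order
print(ser.mode().tolist())  # [1, 3]
```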
raise NotImplementedError - assert getname(fn) == "fn" - assert getname(lambda_) - assert getname(part1) == "fn" - assert getname(part2) == "fn" - assert getname(somecall()) == "somecall" - assert getname(1) is None + @pytest.mark.parametrize( + "func, expected", + [ + (fn, "fn"), + (partial1, "fn"), + (partial2, "fn"), + (lambda_, "<lambda>"), + (SomeCall(), "SomeCall"), + (1, None), + ], + ) + def test_get_callable_name(self, func, expected): + assert com.get_callable_name(func) == expected -def test_any_none(): - assert com.any_none(1, 2, 3, None) - assert not com.any_none(1, 2, 3, 4) +class TestRandomState: + def test_seed(self): + seed = 5 + assert com.random_state(seed).uniform() == np.random.RandomState(seed).uniform() + def test_object(self): + seed = 10 + state_obj = np.random.RandomState(seed) + assert ( + com.random_state(state_obj).uniform() + == np.random.RandomState(seed).uniform() + ) -def test_all_not_none(): - assert com.all_not_none(1, 2, 3, 4) - assert not com.all_not_none(1, 2, 3, None) - assert not com.all_not_none(None, None, None, None) + def test_default(self): + assert com.random_state() is np.random + def test_array_like(self): + state = np.random.default_rng(None).integers(0, 2**31, size=624, dtype="uint32") + assert ( + com.random_state(state).uniform() == np.random.RandomState(state).uniform() + ) -def test_random_state(): - # Check with seed - state = com.random_state(5) - assert state.uniform() == np.random.RandomState(5).uniform() + def test_bit_generators(self): + seed = 3 + assert ( + com.random_state(np.random.MT19937(seed)).uniform() + == np.random.RandomState(np.random.MT19937(seed)).uniform() + ) - # Check with random state object - state2 = np.random.RandomState(10) - assert com.random_state(state2).uniform() == np.random.RandomState(10).uniform() + seed = 11 + assert ( + com.random_state(np.random.PCG64(seed)).uniform() + == np.random.RandomState(np.random.PCG64(seed)).uniform() + ) - # check with no arg random state - assert com.random_state() is np.random + @pytest.mark.parametrize("state", ["test", 5.5]) + def test_error(self, state): + msg = ( + "random_state must be an integer, array-like, a BitGenerator, Generator, " + "a numpy RandomState, or None" + ) + with pytest.raises(ValueError, match=msg): + com.random_state(state) - # check array-like - # GH32503 - state_arr_like = np.random.default_rng(None).integers( - 0, 2**31, size=624, dtype="uint32" - ) - assert ( - com.random_state(state_arr_like).uniform() - == np.random.RandomState(state_arr_like).uniform() - ) - # Check BitGenerators - # GH32503 - assert ( - com.random_state(np.random.MT19937(3)).uniform() - == np.random.RandomState(np.random.MT19937(3)).uniform() - ) - assert ( - com.random_state(np.random.PCG64(11)).uniform() - == np.random.RandomState(np.random.PCG64(11)).uniform() - ) +@pytest.mark.parametrize("args, expected", [((1, 2, None), True), ((1, 2, 3), False)]) +def test_any_none(args, expected): + assert com.any_none(*args) is expected - # Error for floats or strings - msg = ( - "random_state must be an integer, array-like, a BitGenerator, Generator, " - "a numpy RandomState, or None" - ) - with pytest.raises(ValueError, match=msg): - com.random_state("test") - with pytest.raises(ValueError, match=msg): - com.random_state(5.5) +@pytest.mark.parametrize( + "args, expected", + [((1, 2, 3), True), ((1, 2, None), False), ((None, None, None), False)], +) +def test_all_not_none(args, expected): + assert com.all_not_none(*args) is expected @pytest.mark.parametrize( @@ -135,21 +143,32 @@ def
test_maybe_match_name(left, right, expected): assert res is expected or res == expected -def test_standardize_mapping(): - # No uninitialized defaultdicts - msg = r"to_dict\(\) only accepts initialized defaultdicts" - with pytest.raises(TypeError, match=msg): - com.standardize_mapping(collections.defaultdict) - - # No non-mapping subtypes, instance - msg = "unsupported type: <class 'list'>" +@pytest.mark.parametrize( + "into, msg", + [ + ( + # uninitialized defaultdict + collections.defaultdict, + r"to_dict\(\) only accepts initialized defaultdicts", + ), + ( + # non-mapping subtypes, instance + [], + "unsupported type: <class 'list'>", + ), + ( + # non-mapping subtypes, class + list, + "unsupported type: <class 'list'>", + ), + ], +) +def test_standardize_mapping_type_error(into, msg): with pytest.raises(TypeError, match=msg): - com.standardize_mapping([]) + com.standardize_mapping(into) - # No non-mapping subtypes, class - with pytest.raises(TypeError, match=msg): - com.standardize_mapping(list) +def test_standardize_mapping(): fill = {"bad": "data"} assert com.standardize_mapping(fill) == dict @@ -171,10 +190,10 @@ def test_version_tag(): version = Version(pd.__version__) try: version > Version("0.0.1") - except TypeError: + except TypeError as err: raise ValueError( "No git tags exist, please sync tags between upstream and your repo" - ) + ) from err @pytest.mark.parametrize( @@ -234,6 +253,7 @@ def test_temp_setattr(with_exception): assert ser.name == "first" +@pytest.mark.skipif(WASM, reason="Can't start subprocesses in WASM") @pytest.mark.single_cpu def test_str_size(): # GH#21758 @@ -247,21 +267,3 @@ def test_str_size(): ] result = subprocess.check_output(call).decode()[-4:-1].strip("\n") assert int(result) == int(expected) - - -@pytest.mark.single_cpu -def test_bz2_missing_import(): - # Check whether bz2 missing import is handled correctly (issue #53857) - code = """ - import sys - sys.modules['bz2'] = None - import pytest - import pandas as pd - from pandas.compat import get_bz2_file - msg = 'bz2 module not available.'
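[Editor's note] The parametrized standardize_mapping tests back DataFrame.to_dict(into=...): an initialized defaultdict is accepted, while the bare class or a non-mapping instance raises TypeError. A sketch of the public-facing behavior:

```python
import collections
import pandas as pd

df = pd.DataFrame({"a": [1, 2]})
# An *initialized* defaultdict works; passing collections.defaultdict
# itself (or a list) raises TypeError, as the tests assert.
out = df.to_dict(into=collections.defaultdict(list))
print(type(out))  # <class 'collections.defaultdict'>
```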
- with pytest.raises(RuntimeError, match=msg): - get_bz2_file() - """ - code = textwrap.dedent(code) - call = [sys.executable, "-c", code] - subprocess.check_output(call) diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index 51ce73ef54300..d7398ffe259cb 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -1,7 +1,10 @@ """ Testing that we work in the downstream packages """ + import array +from functools import partial +import importlib import subprocess import sys @@ -9,7 +12,6 @@ import pytest from pandas.errors import IntCastingNaNError -import pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -19,10 +21,7 @@ TimedeltaIndex, ) import pandas._testing as tm -from pandas.core.arrays import ( - DatetimeArray, - TimedeltaArray, -) +from pandas.util.version import Version @pytest.fixture @@ -46,6 +45,8 @@ def test_dask(df): pd.set_option("compute.use_numexpr", olduse) +# TODO(CoW) see https://github.com/pandas-dev/pandas/pull/51082 +@pytest.mark.skip(reason="not implemented with CoW") def test_dask_ufunc(): # dask sets "compute.use_numexpr" to False, so catch the current value # and ensure to reset it afterwards to avoid impacting other tests @@ -58,8 +59,8 @@ def test_dask_ufunc(): s = Series([1.5, 2.3, 3.7, 4.0]) ds = dd.from_pandas(s, npartitions=2) - result = da.fix(ds).compute() - expected = np.fix(s) + result = da.log(ds).compute() + expected = np.log(s) tm.assert_series_equal(result, expected) finally: pd.set_option("compute.use_numexpr", olduse) @@ -102,7 +103,7 @@ def test_xarray_cftimeindex_nearest(): cftime = pytest.importorskip("cftime") xarray = pytest.importorskip("xarray") - times = xarray.cftime_range("0001", periods=2) + times = xarray.date_range("0001", periods=2, use_cftime=True) key = cftime.DatetimeGregorian(2000, 1, 1) result = times.get_indexer([key], method="nearest") expected = 1 @@ -153,7 +154,7 @@ def test_scikit_learn(): clf.predict(digits.data[-1:]) -def test_seaborn(): +def test_seaborn(mpl_cleanup): seaborn = pytest.importorskip("seaborn") tips = DataFrame( {"day": pd.date_range("2023", freq="D", periods=5), "total_bill": range(5)} @@ -186,44 +187,24 @@ def test_yaml_dump(df): tm.assert_frame_equal(df, loaded2) -@pytest.mark.single_cpu -def test_missing_required_dependency(): - # GH 23868 - # To ensure proper isolation, we pass these flags - # -S : disable site-packages - # -s : disable user site-packages - # -E : disable PYTHON* env vars, especially PYTHONPATH - # https://github.com/MacPython/pandas-wheels/pull/50 - - pyexe = sys.executable.replace("\\", "/") - - # We skip this test if pandas is installed as a site package. We first - # import the package normally and check the path to the module before - # executing the test which imports pandas with site packages disabled. - call = [pyexe, "-c", "import pandas;print(pandas.__file__)"] - output = subprocess.check_output(call).decode() - if "site-packages" in output: - pytest.skip("pandas installed as site package") - - # This test will fail if pandas is installed as a site package. The flags - # prevent pandas being imported and the test will report Failed: DID NOT - # RAISE - call = [pyexe, "-sSE", "-c", "import pandas"] - - msg = ( - rf"Command '\['{pyexe}', '-sSE', '-c', 'import pandas'\]' " - "returned non-zero exit status 1." 
-    )
+@pytest.mark.parametrize("dependency", ["numpy", "dateutil", "tzdata"])
+def test_missing_required_dependency(monkeypatch, dependency):
+    # GH#61030, GH61273
+    original_import = __import__
+    mock_error = ImportError(f"Mock error for {dependency}")
-    with pytest.raises(subprocess.CalledProcessError, match=msg) as exc:
-        subprocess.check_output(call, stderr=subprocess.STDOUT)
+    def mock_import(name, *args, **kwargs):
+        if name == dependency:
+            raise mock_error
+        return original_import(name, *args, **kwargs)
-    output = exc.value.stdout.decode()
-    for name in ["numpy", "pytz", "dateutil"]:
-        assert name in output
+    monkeypatch.setattr("builtins.__import__", mock_import)
+    with pytest.raises(ImportError, match=dependency):
+        importlib.reload(importlib.import_module("pandas"))


-def test_frame_setitem_dask_array_into_new_col():
+
+def test_frame_setitem_dask_array_into_new_col(request):
     # GH#47128

     # dask sets "compute.use_numexpr" to False, so catch the current value
@@ -231,7 +212,14 @@
     olduse = pd.get_option("compute.use_numexpr")

     try:
+        dask = pytest.importorskip("dask")
         da = pytest.importorskip("dask.array")
+        if Version(dask.__version__) <= Version("2025.1.0") and Version(
+            np.__version__
+        ) >= Version("2.1"):
+            request.applymarker(
+                pytest.mark.xfail(reason="loc.__setitem__ incorrectly mutated column c")
+            )
         dda = da.array([1, 2])
         df = DataFrame({"a": ["a", "b"]})
@@ -261,54 +249,25 @@ def __radd__(self, other):
         assert right + left is left


-@pytest.fixture(
-    params=[
-        "memoryview",
-        "array",
-        pytest.param("dask", marks=td.skip_if_no("dask.array")),
-        pytest.param("xarray", marks=td.skip_if_no("xarray")),
-    ]
-)
-def array_likes(request):
-    """
-    Fixture giving a numpy array and a parametrized 'data' object, which can
-    be a memoryview, array, dask or xarray object created from the numpy array.
-    """
-    # GH#24539 recognize e.g xarray, dask, ...
-    arr = np.array([1, 2, 3], dtype=np.int64)
-
-    name = request.param
-    if name == "memoryview":
-        data = memoryview(arr)
-    elif name == "array":
-        data = array.array("i", arr)
-    elif name == "dask":
-        import dask.array
-
-        data = dask.array.array(arr)
-    elif name == "xarray":
-        import xarray as xr
-
-        data = xr.DataArray(arr)
-
-    return arr, data
-
-
 @pytest.mark.parametrize("dtype", ["M8[ns]", "m8[ns]"])
-def test_from_obscure_array(dtype, array_likes):
+@pytest.mark.parametrize(
+    "box", [memoryview, partial(array.array, "i"), "dask", "xarray"]
+)
+def test_from_obscure_array(dtype, box):
     # GH#24539 recognize e.g xarray, dask, ...
     # Note: we dont do this for PeriodArray bc _from_sequence won't accept
     # an array of integers
     # TODO: could check with arraylike of Period objects
-    arr, data = array_likes
-
-    cls = {"M8[ns]": DatetimeArray, "m8[ns]": TimedeltaArray}[dtype]
-
-    depr_msg = f"{cls.__name__}.__init__ is deprecated"
-    with tm.assert_produces_warning(FutureWarning, match=depr_msg):
-        expected = cls(arr)
-    result = cls._from_sequence(data, dtype=dtype)
-    tm.assert_extension_array_equal(result, expected)
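The rewritten test_from_obscure_array, continuing below, folds the old array_likes fixture into a box parameter resolved inside the test body, using pytest.importorskip so cases needing optional packages are skipped rather than failed. A minimal sketch of that dispatch pattern, independent of pandas internals (illustrative, not part of this diff):

import numpy as np
import pytest

@pytest.mark.parametrize("box", ["memoryview", "dask", "xarray"])
def test_box_dispatch_sketch(box):
    # Resolve the wrapper lazily so optional dependencies are only imported
    # (and required) by the parametrized cases that actually use them.
    arr = np.array([1, 2, 3], dtype=np.int64)
    if box == "dask":
        da = pytest.importorskip("dask.array")
        data = da.array(arr)
    elif box == "xarray":
        xr = pytest.importorskip("xarray")
        data = xr.DataArray(arr)
    else:
        data = memoryview(arr)
    assert np.asarray(data).sum() == 6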
+ arr = np.array([1, 2, 3], dtype=np.int64) + if box == "dask": + da = pytest.importorskip("dask.array") + data = da.array(arr) + elif box == "xarray": + xr = pytest.importorskip("xarray") + data = xr.DataArray(arr) + else: + data = box(arr) if not isinstance(data, memoryview): # FIXME(GH#44431) these raise on memoryview and attempted fix @@ -325,25 +284,6 @@ def test_from_obscure_array(dtype, array_likes): tm.assert_index_equal(result, expected) -def test_dataframe_consortium() -> None: - """ - Test some basic methods of the dataframe consortium standard. - - Full testing is done at https://github.com/data-apis/dataframe-api-compat, - this is just to check that the entry point works as expected. - """ - pytest.importorskip("dataframe_api_compat") - df_pd = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - df = df_pd.__dataframe_consortium_standard__() - result_1 = df.get_column_names() - expected_1 = ["a", "b"] - assert result_1 == expected_1 - - ser = Series([1, 2, 3], name="a") - col = ser.__column_consortium_standard__() - assert col.name == "a" - - def test_xarray_coerce_unit(): # GH44053 xr = pytest.importorskip("xarray") diff --git a/pandas/tests/test_errors.py b/pandas/tests/test_errors.py index aeddc08e4b888..c5c4b234eb129 100644 --- a/pandas/tests/test_errors.py +++ b/pandas/tests/test_errors.py @@ -37,8 +37,6 @@ "PossibleDataLossError", "PossiblePrecisionLoss", "PyperclipException", - "SettingWithCopyError", - "SettingWithCopyWarning", "SpecificationError", "UnsortedIndexError", "UnsupportedFunctionCall", @@ -67,13 +65,7 @@ def test_catch_oob(): pd.Timestamp("15000101").as_unit("ns") -@pytest.mark.parametrize( - "is_local", - [ - True, - False, - ], -) +@pytest.mark.parametrize("is_local", [True, False]) def test_catch_undefined_variable_error(is_local): variable_name = "x" if is_local: diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py index dfec99f0786eb..8f275345a7819 100644 --- a/pandas/tests/test_expressions.py +++ b/pandas/tests/test_expressions.py @@ -6,11 +6,7 @@ from pandas import option_context import pandas._testing as tm -from pandas.core.api import ( - DataFrame, - Index, - Series, -) +from pandas.core.api import DataFrame from pandas.core.computation import expressions as expr @@ -36,7 +32,7 @@ def _frame2(): def _mixed(_frame): return DataFrame( { - "A": _frame["A"].copy(), + "A": _frame["A"], "B": _frame["B"].astype("float32"), "C": _frame["C"].astype("int64"), "D": _frame["D"].astype("int32"), @@ -48,7 +44,7 @@ def _mixed(_frame): def _mixed2(_frame2): return DataFrame( { - "A": _frame2["A"].copy(), + "A": _frame2["A"], "B": _frame2["B"].astype("float32"), "C": _frame2["C"].astype("int64"), "D": _frame2["D"].astype("int32"), @@ -82,22 +78,22 @@ def _integer2(): @pytest.fixture def _array(_frame): - return _frame["A"].values.copy() + return _frame["A"].to_numpy() @pytest.fixture def _array2(_frame2): - return _frame2["A"].values.copy() + return _frame2["A"].to_numpy() @pytest.fixture def _array_mixed(_mixed): - return _mixed["D"].values.copy() + return _mixed["D"].to_numpy() @pytest.fixture def _array_mixed2(_mixed2): - return _mixed2["D"].values.copy() + return _mixed2["D"].to_numpy() @pytest.mark.skipif(not expr.USE_NUMEXPR, reason="not using numexpr") @@ -174,7 +170,7 @@ def test_run_binary(self, request, fixture, flex, comparison_op, monkeypatch): df = request.getfixturevalue(fixture) arith = comparison_op.__name__ with option_context("compute.use_numexpr", False): - other = df.copy() + 1 + other = df + 1 with monkeypatch.context() 
as m: m.setattr(expr, "_MIN_ELEMENTS", 0) @@ -324,7 +320,7 @@ def test_bool_ops_raise_on_arithmetic(self, op_str, opname): @pytest.mark.parametrize( "op_str,opname", [("+", "add"), ("*", "mul"), ("-", "sub")] ) - def test_bool_ops_warn_on_arithmetic(self, op_str, opname): + def test_bool_ops_warn_on_arithmetic(self, op_str, opname, monkeypatch): n = 10 df = DataFrame( { @@ -343,36 +339,39 @@ def test_bool_ops_warn_on_arithmetic(self, op_str, opname): # raises TypeError return - with tm.use_numexpr(True, min_elements=5): - with tm.assert_produces_warning(): - r = f(df, df) - e = fe(df, df) - tm.assert_frame_equal(r, e) - - with tm.assert_produces_warning(): - r = f(df.a, df.b) - e = fe(df.a, df.b) - tm.assert_series_equal(r, e) - - with tm.assert_produces_warning(): - r = f(df.a, True) - e = fe(df.a, True) - tm.assert_series_equal(r, e) - - with tm.assert_produces_warning(): - r = f(False, df.a) - e = fe(False, df.a) - tm.assert_series_equal(r, e) - - with tm.assert_produces_warning(): - r = f(False, df) - e = fe(False, df) - tm.assert_frame_equal(r, e) - - with tm.assert_produces_warning(): - r = f(df, True) - e = fe(df, True) - tm.assert_frame_equal(r, e) + msg = "operator is not supported by numexpr" + with monkeypatch.context() as m: + m.setattr(expr, "_MIN_ELEMENTS", 5) + with option_context("compute.use_numexpr", True): + with tm.assert_produces_warning(UserWarning, match=msg): + r = f(df, df) + e = fe(df, df) + tm.assert_frame_equal(r, e) + + with tm.assert_produces_warning(UserWarning, match=msg): + r = f(df.a, df.b) + e = fe(df.a, df.b) + tm.assert_series_equal(r, e) + + with tm.assert_produces_warning(UserWarning, match=msg): + r = f(df.a, True) + e = fe(df.a, True) + tm.assert_series_equal(r, e) + + with tm.assert_produces_warning(UserWarning, match=msg): + r = f(False, df.a) + e = fe(False, df.a) + tm.assert_series_equal(r, e) + + with tm.assert_produces_warning(UserWarning, match=msg): + r = f(False, df) + e = fe(False, df) + tm.assert_frame_equal(r, e) + + with tm.assert_produces_warning(UserWarning, match=msg): + r = f(df, True) + e = fe(df, True) + tm.assert_frame_equal(r, e) @pytest.mark.parametrize( "test_input,expected", @@ -433,16 +432,15 @@ def test_frame_series_axis(self, axis, arith, _frame, monkeypatch): "__rfloordiv__", ], ) - @pytest.mark.parametrize("box", [DataFrame, Series, Index]) @pytest.mark.parametrize("scalar", [-5, 5]) def test_python_semantics_with_numexpr_installed( - self, op, box, scalar, monkeypatch + self, op, box_with_array, scalar, monkeypatch ): # https://github.com/pandas-dev/pandas/issues/36047 with monkeypatch.context() as m: m.setattr(expr, "_MIN_ELEMENTS", 0) data = np.arange(-50, 50) - obj = box(data) + obj = box_with_array(data) method = getattr(obj, op) result = method(scalar) @@ -454,7 +452,7 @@ def test_python_semantics_with_numexpr_installed( # compare result element-wise with Python for i, elem in enumerate(data): - if box == DataFrame: + if box_with_array == DataFrame: scalar_result = result.iloc[i, 0] else: scalar_result = result[i] diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 6644ec82fab17..ff7ab22c197d8 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -5,6 +5,7 @@ import pandas as pd from pandas import ( + ArrowDtype, DataFrame, MultiIndex, Series, @@ -28,16 +29,6 @@ def test_reindex_level(self, multiindex_year_month_day_dataframe_random_data): expected = ymd["A"].groupby(level="month").transform("sum") tm.assert_series_equal(result, expected, 
check_names=False) - # axis=1 - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - gb = ymd.T.groupby("month", axis=1) - - month_sums = gb.sum() - result = month_sums.reindex(columns=ymd.index, level=1) - expected = ymd.groupby(level="month").transform("sum").T - tm.assert_frame_equal(result, expected) - def test_reindex(self, multiindex_dataframe_random_data): frame = multiindex_dataframe_random_data @@ -46,26 +37,20 @@ def test_reindex(self, multiindex_dataframe_random_data): tm.assert_frame_equal(reindexed, expected) def test_reindex_preserve_levels( - self, multiindex_year_month_day_dataframe_random_data, using_copy_on_write + self, multiindex_year_month_day_dataframe_random_data ): ymd = multiindex_year_month_day_dataframe_random_data new_index = ymd.index[::10] chunk = ymd.reindex(new_index) - if using_copy_on_write: - assert chunk.index.is_(new_index) - else: - assert chunk.index is new_index + assert chunk.index.is_(new_index) chunk = ymd.loc[new_index] assert chunk.index.equals(new_index) ymdT = ymd.T chunk = ymdT.reindex(columns=new_index) - if using_copy_on_write: - assert chunk.columns.is_(new_index) - else: - assert chunk.columns is new_index + assert chunk.columns.is_(new_index) chunk = ymdT.loc[:, new_index] assert chunk.columns.equals(new_index) @@ -97,27 +82,6 @@ def test_groupby_corner(self): # should work df.groupby(level="three") - def test_groupby_level_no_obs(self): - # #1697 - midx = MultiIndex.from_tuples( - [ - ("f1", "s1"), - ("f1", "s2"), - ("f2", "s1"), - ("f2", "s2"), - ("f3", "s1"), - ("f3", "s2"), - ] - ) - df = DataFrame([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], columns=midx) - df1 = df.loc(axis=1)[df.columns.map(lambda u: u[0] in ["f2", "f3"])] - - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - grouped = df1.groupby(axis=1, level=0) - result = grouped.sum() - assert (result.columns == ["f2", "f3"]).all() - def test_setitem_with_expansion_multiindex_columns( self, multiindex_year_month_day_dataframe_random_data ): @@ -158,8 +122,7 @@ def test_groupby_multilevel(self, multiindex_year_month_day_dataframe_random_dat expected = ymd.groupby([k1, k2]).mean() - # TODO groupby with level_values drops names - tm.assert_frame_equal(result, expected, check_names=False) + tm.assert_frame_equal(result, expected) assert result.index.names == ymd.index.names[:2] result2 = ymd.groupby(level=ymd.index.names[:2]).mean() @@ -172,7 +135,7 @@ def test_multilevel_consolidate(self): df = DataFrame( np.random.default_rng(2).standard_normal((4, 4)), index=index, columns=index ) - df["Totals", ""] = df.sum(1) + df["Totals", ""] = df.sum(axis=1) df = df._consolidate() def test_level_with_tuples(self): @@ -326,6 +289,64 @@ def test_multiindex_with_na(self): tm.assert_frame_equal(df, expected) + @pytest.mark.parametrize("na", [None, np.nan]) + def test_multiindex_insert_level_with_na(self, na): + # GH 59003 + df = DataFrame([0], columns=[["A"], ["B"]]) + df[na, "B"] = 1 + tm.assert_frame_equal(df[na], DataFrame([1], columns=["B"])) + + def test_multiindex_dt_with_nan(self): + # GH#60388 + df = DataFrame( + [ + [1, np.nan, 5, np.nan], + [2, np.nan, 6, np.nan], + [np.nan, 3, np.nan, 7], + [np.nan, 4, np.nan, 8], + ], + index=Series(["a", "b", "c", "d"], dtype=object, name="sub"), + columns=MultiIndex.from_product( + [ + ["value1", "value2"], + [datetime.datetime(2024, 11, 1), datetime.datetime(2024, 11, 2)], + ], + names=[None, "Date"], + ), + ) + df = 
df.reset_index() + result = df[df.columns[0]] + expected = Series(["a", "b", "c", "d"], name=("sub", np.nan)) + tm.assert_series_equal(result, expected) + + @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") + def test_multiindex_with_pyarrow_categorical(self): + # GH#53051 + pa = pytest.importorskip("pyarrow") + + df = DataFrame( + {"string_column": ["A", "B", "C"], "number_column": [1, 2, 3]} + ).astype( + { + "string_column": ArrowDtype(pa.dictionary(pa.int32(), pa.string())), + "number_column": "float[pyarrow]", + } + ) + + df = df.set_index(["string_column", "number_column"]) + + df_expected = DataFrame( + index=MultiIndex.from_arrays( + [["A", "B", "C"], [1, 2, 3]], names=["string_column", "number_column"] + ) + ) + tm.assert_frame_equal( + df, + df_expected, + check_index_type=False, + check_column_type=False, + ) + class TestSorted: """everything you wanted to test about sorting""" diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index a50054f33f382..e7ed8e855a762 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -296,6 +296,7 @@ def check_fun_data( self, testfunc, targfunc, + testar, testarval, targarval, skipna, @@ -319,6 +320,13 @@ def check_fun_data( else: targ = bool(targ) + if testfunc.__name__ in ["nanargmax", "nanargmin"] and ( + testar.startswith("arr_nan") + or (testar.endswith("nan") and (not skipna or axis == 1)) + ): + with pytest.raises(ValueError, match="Encountered .* NA value"): + testfunc(testarval, axis=axis, skipna=skipna, **kwargs) + return res = testfunc(testarval, axis=axis, skipna=skipna, **kwargs) if ( @@ -350,6 +358,7 @@ def check_fun_data( self.check_fun_data( testfunc, targfunc, + testar, testarval2, targarval2, skipna=skipna, @@ -370,6 +379,7 @@ def check_fun( self.check_fun_data( testfunc, targfunc, + testar, testarval, targarval, skipna=skipna, @@ -527,11 +537,8 @@ def _argminmax_wrap(self, value, axis=None, func=None): nullnan = isna(nans) if res.ndim: res[nullnan] = -1 - elif ( - hasattr(nullnan, "all") - and nullnan.all() - or not hasattr(nullnan, "all") - and nullnan + elif (hasattr(nullnan, "all") and nullnan.all()) or ( + not hasattr(nullnan, "all") and nullnan ): res = -1 return res @@ -748,7 +755,6 @@ def test_nancov(self): ("arr_bool", False), ("arr_str", False), ("arr_utf", False), - ("arr_complex", False), ("arr_complex_nan", False), ("arr_nan_nanj", False), ("arr_nan_infj", True), @@ -1102,10 +1108,6 @@ def prng(self): class TestDatetime64NaNOps: - @pytest.fixture(params=["s", "ms", "us", "ns"]) - def unit(self, request): - return request.param - # Enabling mean changes the behavior of DataFrame.mean # See https://github.com/pandas-dev/pandas/issues/24752 def test_nanmean(self, unit): diff --git a/pandas/tests/test_optional_dependency.py b/pandas/tests/test_optional_dependency.py index c1d1948d6c31a..9127981d1845d 100644 --- a/pandas/tests/test_optional_dependency.py +++ b/pandas/tests/test_optional_dependency.py @@ -42,7 +42,7 @@ def test_bad_version(monkeypatch): result = import_optional_dependency("fakemodule", min_version="0.8") assert result is module - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match=match): result = import_optional_dependency("fakemodule", errors="warn") assert result is None @@ -50,6 +50,20 @@ def test_bad_version(monkeypatch): result = import_optional_dependency("fakemodule") assert result is module + with pytest.raises(ImportError, match="Pandas requires version '1.1.0'"): + 
import_optional_dependency("fakemodule", min_version="1.1.0") + + with tm.assert_produces_warning(UserWarning, match="Pandas requires version"): + result = import_optional_dependency( + "fakemodule", errors="warn", min_version="1.1.0" + ) + assert result is None + + result = import_optional_dependency( + "fakemodule", errors="ignore", min_version="1.1.0" + ) + assert result is None + def test_submodule(monkeypatch): # Create a fake module with a submodule @@ -67,7 +81,7 @@ def test_submodule(monkeypatch): with pytest.raises(ImportError, match=match): import_optional_dependency("fakemodule.submodule") - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match=match): result = import_optional_dependency("fakemodule.submodule", errors="warn") assert result is None diff --git a/pandas/tests/test_register_accessor.py b/pandas/tests/test_register_accessor.py index 4e569dc40005d..9deff56139394 100644 --- a/pandas/tests/test_register_accessor.py +++ b/pandas/tests/test_register_accessor.py @@ -1,5 +1,6 @@ from collections.abc import Generator import contextlib +import weakref import pytest @@ -101,3 +102,22 @@ def __init__(self, data) -> None: with pytest.raises(AttributeError, match="whoops"): pd.Series([], dtype=object).bad + + +@pytest.mark.parametrize( + "klass, registrar", + [ + (pd.Series, pd.api.extensions.register_series_accessor), + (pd.DataFrame, pd.api.extensions.register_dataframe_accessor), + (pd.Index, pd.api.extensions.register_index_accessor), + ], +) +def test_no_circular_reference(klass, registrar): + # GH 41357 + with ensure_removed(klass, "access"): + registrar("access")(MyAccessor) + obj = klass([0]) + ref = weakref.ref(obj) + assert obj.access.obj is obj + del obj + assert ref() is None diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index 285f240028152..869d41efa6c28 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -33,12 +33,10 @@ def left_right(): np.random.default_rng(2).integers(low, high, (n, 7)), columns=list("ABCDEFG") ) left["left"] = left.sum(axis=1) - - # one-2-one match - i = np.random.default_rng(2).permutation(len(left)) - right = left.iloc[i].copy() + right = left.sample( + frac=1, random_state=np.random.default_rng(2), ignore_index=True + ) right.columns = right.columns[:-1].tolist() + ["right"] - right.index = np.arange(len(right)) right["right"] *= -1 return left, right @@ -106,7 +104,9 @@ def test_int64_overflow_groupby_large_df_shuffled(self, agg): gr = df.groupby(list("abcde")) # verify this is testing what it is supposed to test! 
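The assertion rewritten just below checks the test's premise: grouping by several columns can only hit the int64-overflow path when the product of the groupings' cardinalities exceeds what int64 can hold. A back-of-the-envelope sketch of that check (an approximation of is_int64_overflow_possible, not its actual implementation):

import numpy as np

def overflow_possible_sketch(shape):
    # Combining several groupings into one flat group code multiplies their
    # cardinalities; overflow is possible once the product leaves int64 range.
    prod = 1
    for n in shape:
        prod *= int(n)
    return prod >= np.iinfo(np.int64).max

print(overflow_possible_sketch((8192,) * 5))  # True: 2**65 exceeds int64
print(overflow_possible_sketch((1000,) * 5))  # False: 10**15 fits comfortably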
- assert is_int64_overflow_possible(gr._grouper.shape) + assert is_int64_overflow_possible( + tuple(ping.ngroups for ping in gr._grouper.groupings) + ) mi = MultiIndex.from_arrays( [ar.ravel() for ar in np.array_split(np.unique(arr, axis=0), 5, axis=1)], @@ -218,14 +218,12 @@ def test_int64_overflow_check_sum_col(self, left_right): assert result.name is None @pytest.mark.slow - @pytest.mark.parametrize("how", ["left", "right", "outer", "inner"]) - def test_int64_overflow_how_merge(self, left_right, how): + def test_int64_overflow_how_merge(self, left_right, join_type): left, right = left_right out = merge(left, right, how="outer") out.sort_values(out.columns.tolist(), inplace=True) - out.index = np.arange(len(out)) - tm.assert_frame_equal(out, merge(left, right, how=how, sort=True)) + tm.assert_frame_equal(out, merge(left, right, how=join_type, sort=True)) @pytest.mark.slow def test_int64_overflow_sort_false_order(self, left_right): @@ -239,10 +237,9 @@ def test_int64_overflow_sort_false_order(self, left_right): tm.assert_frame_equal(right, out[right.columns.tolist()]) @pytest.mark.slow - @pytest.mark.parametrize("how", ["left", "right", "outer", "inner"]) - @pytest.mark.parametrize("sort", [True, False]) - def test_int64_overflow_one_to_many_none_match(self, how, sort): + def test_int64_overflow_one_to_many_none_match(self, join_type, sort): # one-2-many/none match + how = join_type low, high, n = -1 << 10, 1 << 10, 1 << 11 left = DataFrame( np.random.default_rng(2).integers(low, high, (n, 7)).astype("int64"), @@ -269,13 +266,12 @@ def test_int64_overflow_one_to_many_none_match(self, how, sort): right["right"] = np.random.default_rng(2).standard_normal(len(right)) # shuffle left & right frames - i = np.random.default_rng(5).permutation(len(left)) - left = left.iloc[i].copy() - left.index = np.arange(len(left)) - - i = np.random.default_rng(6).permutation(len(right)) - right = right.iloc[i].copy() - right.index = np.arange(len(right)) + left = left.sample( + frac=1, ignore_index=True, random_state=np.random.default_rng(5) + ) + right = right.sample( + frac=1, ignore_index=True, random_state=np.random.default_rng(6) + ) # manually compute outer merge ldict, rdict = defaultdict(list), defaultdict(list) @@ -309,13 +305,8 @@ def test_int64_overflow_one_to_many_none_match(self, how, sort): for rv in rval ) - def align(df): - df = df.sort_values(df.columns.tolist()) - df.index = np.arange(len(df)) - return df - out = DataFrame(vals, columns=list("ABCDEFG") + ["left", "right"]) - out = align(out) + out = out.sort_values(out.columns.to_list(), ignore_index=True) jmask = { "left": out["left"].notna(), @@ -325,19 +316,21 @@ def align(df): } mask = jmask[how] - frame = align(out[mask].copy()) + frame = out[mask].sort_values(out.columns.to_list(), ignore_index=True) assert mask.all() ^ mask.any() or how == "outer" res = merge(left, right, how=how, sort=sort) if sort: kcols = list("ABCDEFG") tm.assert_frame_equal( - res[kcols].copy(), res[kcols].sort_values(kcols, kind="mergesort") + res[kcols], res[kcols].sort_values(kcols, kind="mergesort") ) # as in GH9092 dtypes break with outer/right join # 2021-12-18: dtype does not break anymore - tm.assert_frame_equal(frame, align(res)) + tm.assert_frame_equal( + frame, res.sort_values(res.columns.to_list(), ignore_index=True) + ) @pytest.mark.parametrize( @@ -415,6 +408,13 @@ def test_codes_out_of_bound(self): tm.assert_numpy_array_equal(result, expected) tm.assert_numpy_array_equal(result_codes, expected_codes) + @pytest.mark.parametrize("codes", [[-1, -1], 
[2, -1], [2, 2]]) + def test_codes_empty_array_out_of_bound(self, codes): + empty_values = np.array([]) + expected_codes = -np.ones_like(codes, dtype=np.intp) + _, result_codes = safe_sort(empty_values, codes) + tm.assert_numpy_array_equal(result_codes, expected_codes) + def test_mixed_integer(self): values = np.array(["b", 1, 0, "a", 0, "b"], dtype=object) result = safe_sort(values) diff --git a/pandas/tests/test_take.py b/pandas/tests/test_take.py index 4f34ab34c35f0..451ef42fff3d1 100644 --- a/pandas/tests/test_take.py +++ b/pandas/tests/test_take.py @@ -5,6 +5,7 @@ from pandas._libs import iNaT +from pandas import array import pandas._testing as tm import pandas.core.algorithms as algos @@ -299,9 +300,18 @@ def test_take_na_empty(self): tm.assert_numpy_array_equal(result, expected) def test_take_coerces_list(self): + # GH#52981 coercing is deprecated, disabled in 3.0 arr = [1, 2, 3] - msg = "take accepting non-standard inputs is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = algos.take(arr, [0, 0]) - expected = np.array([1, 1]) - tm.assert_numpy_array_equal(result, expected) + msg = ( + "pd.api.extensions.take requires a numpy.ndarray, ExtensionArray, " + "Index, Series, or NumpyExtensionArray got list" + ) + with pytest.raises(TypeError, match=msg): + algos.take(arr, [0, 0]) + + def test_take_NumpyExtensionArray(self): + # GH#59177 + arr = array([1 + 1j, 2, 3]) # NumpyEADtype('complex128') (NumpyExtensionArray) + assert algos.take(arr, [2]) == 2 + arr = array([1, 2, 3]) # Int64Dtype() (ExtensionArray) + assert algos.take(arr, [2]) == 2 diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 6791ac0340640..b02fab70fb825 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1,4 +1,4 @@ -""" test to_datetime """ +"""test to_datetime""" import calendar from collections import deque @@ -10,18 +10,18 @@ ) from decimal import Decimal import locale +import zoneinfo from dateutil.parser import parse -from dateutil.tz.tz import tzoffset import numpy as np import pytest -import pytz from pandas._libs import tslib from pandas._libs.tslibs import ( iNaT, parsing, ) +from pandas.compat import WASM from pandas.errors import ( OutOfBoundsDatetime, OutOfBoundsTimedelta, @@ -57,26 +57,12 @@ r"alongside this." ) -pytestmark = pytest.mark.filterwarnings( - "ignore:errors='ignore' is deprecated:FutureWarning" -) - - -@pytest.fixture(params=[True, False]) -def cache(request): - """ - cache keyword to pass to to_datetime. 
- """ - return request.param - class TestTimeConversionFormats: - @pytest.mark.parametrize("readonly", [True, False]) - def test_to_datetime_readonly(self, readonly): + def test_to_datetime_readonly(self, writable): # GH#34857 arr = np.array([], dtype=object) - if readonly: - arr.setflags(write=False) + arr.setflags(write=writable) result = to_datetime(arr) expected = to_datetime([]) tm.assert_index_equal(result, expected) @@ -131,7 +117,9 @@ def test_to_datetime_format_YYYYMMDD_with_nat(self, cache): ser = Series([19801222, 19801222] + [19810105] * 5, dtype="float") # with NaT expected = Series( - [Timestamp("19801222"), Timestamp("19801222")] + [Timestamp("19810105")] * 5 + [Timestamp("19801222"), Timestamp("19801222")] + + [Timestamp("19810105")] * 5, + dtype="M8[s]", ) expected[2] = np.nan ser[2] = np.nan @@ -145,8 +133,7 @@ def test_to_datetime_format_YYYYMMDD_with_nat(self, cache): with pytest.raises( ValueError, match=( - 'unconverted data remains when parsing with format "%Y%m%d": ".0", ' - "at position 0" + 'unconverted data remains when parsing with format "%Y%m%d": ".0". ' ), ): # https://github.com/pandas-dev/pandas/issues/50051 @@ -157,41 +144,32 @@ def test_to_datetime_format_YYYYMM_with_nat(self, cache): # Explicit cast to float to explicit cast when setting np.nan ser = Series([198012, 198012] + [198101] * 5, dtype="float") expected = Series( - [Timestamp("19801201"), Timestamp("19801201")] + [Timestamp("19810101")] * 5 + [Timestamp("19801201"), Timestamp("19801201")] + + [Timestamp("19810101")] * 5, + dtype="M8[s]", ) expected[2] = np.nan ser[2] = np.nan result = to_datetime(ser, format="%Y%m", cache=cache) tm.assert_series_equal(result, expected) - def test_to_datetime_format_YYYYMMDD_ignore(self, cache): + def test_to_datetime_format_YYYYMMDD_oob_for_ns(self, cache): # coercion # GH 7930, GH 14487 ser = Series([20121231, 20141231, 99991231]) - result = to_datetime(ser, format="%Y%m%d", errors="ignore", cache=cache) + result = to_datetime(ser, format="%Y%m%d", errors="raise", cache=cache) expected = Series( - [20121231, 20141231, 99991231], - dtype=object, + np.array(["2012-12-31", "2014-12-31", "9999-12-31"], dtype="M8[s]"), + dtype="M8[s]", ) tm.assert_series_equal(result, expected) - def test_to_datetime_format_YYYYMMDD_ignore_with_outofbounds(self, cache): - # https://github.com/pandas-dev/pandas/issues/26493 - result = to_datetime( - ["15010101", "20150101", np.nan], - format="%Y%m%d", - errors="ignore", - cache=cache, - ) - expected = Index(["15010101", "20150101", np.nan], dtype=object) - tm.assert_index_equal(result, expected) - def test_to_datetime_format_YYYYMMDD_coercion(self, cache): # coercion # GH 7930 - ser = Series([20121231, 20141231, 99991231]) + ser = Series([20121231, 20141231, 999999999999999999999999999991231]) result = to_datetime(ser, format="%Y%m%d", errors="coerce", cache=cache) - expected = Series(["20121231", "20141231", "NaT"], dtype="M8[ns]") + expected = Series(["20121231", "20141231", "NaT"], dtype="M8[s]") tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -222,51 +200,54 @@ def test_to_datetime_format_YYYYMMDD_with_none(self, input_s): [ # NaN before strings with invalid date values [ - Series(["19801222", np.nan, "20010012", "10019999"]), - Series([Timestamp("19801222"), np.nan, np.nan, np.nan]), + ["19801222", np.nan, "20010012", "10019999"], + [Timestamp("19801222"), np.nan, np.nan, np.nan], ], # NaN after strings with invalid date values [ - Series(["19801222", "20010012", "10019999", np.nan]), - 
Series([Timestamp("19801222"), np.nan, np.nan, np.nan]), + ["19801222", "20010012", "10019999", np.nan], + [Timestamp("19801222"), np.nan, np.nan, np.nan], ], # NaN before integers with invalid date values [ - Series([20190813, np.nan, 20010012, 20019999]), - Series([Timestamp("20190813"), np.nan, np.nan, np.nan]), + [20190813, np.nan, 20010012, 20019999], + [Timestamp("20190813"), np.nan, np.nan, np.nan], ], # NaN after integers with invalid date values [ - Series([20190813, 20010012, np.nan, 20019999]), - Series([Timestamp("20190813"), np.nan, np.nan, np.nan]), + [20190813, 20010012, np.nan, 20019999], + [Timestamp("20190813"), np.nan, np.nan, np.nan], ], ], ) def test_to_datetime_format_YYYYMMDD_overflow(self, input_s, expected): # GH 25512 # format='%Y%m%d', errors='coerce' + input_s = Series(input_s) result = to_datetime(input_s, format="%Y%m%d", errors="coerce") + expected = Series(expected) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "data, format, expected", [ - ([pd.NA], "%Y%m%d%H%M%S", DatetimeIndex(["NaT"])), - ([pd.NA], None, DatetimeIndex(["NaT"])), + ([pd.NA], "%Y%m%d%H%M%S", ["NaT"]), + ([pd.NA], None, ["NaT"]), ( [pd.NA, "20210202202020"], "%Y%m%d%H%M%S", - DatetimeIndex(["NaT", "2021-02-02 20:20:20"]), + ["NaT", "2021-02-02 20:20:20"], ), - (["201010", pd.NA], "%y%m%d", DatetimeIndex(["2020-10-10", "NaT"])), - (["201010", pd.NA], "%d%m%y", DatetimeIndex(["2010-10-20", "NaT"])), - ([None, np.nan, pd.NA], None, DatetimeIndex(["NaT", "NaT", "NaT"])), - ([None, np.nan, pd.NA], "%Y%m%d", DatetimeIndex(["NaT", "NaT", "NaT"])), + (["201010", pd.NA], "%y%m%d", ["2020-10-10", "NaT"]), + (["201010", pd.NA], "%d%m%y", ["2010-10-20", "NaT"]), + ([None, np.nan, pd.NA], None, ["NaT", "NaT", "NaT"]), + ([None, np.nan, pd.NA], "%Y%m%d", ["NaT", "NaT", "NaT"]), ], ) def test_to_datetime_with_NA(self, data, format, expected): # GH#42957 result = to_datetime(data, format=format) + expected = DatetimeIndex(expected) tm.assert_index_equal(result, expected) def test_to_datetime_with_NA_with_warning(self): @@ -289,28 +270,6 @@ def test_to_datetime_format_integer(self, cache): result = to_datetime(ser, format="%Y%m", cache=cache) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize( - "int_date, expected", - [ - # valid date, length == 8 - [20121030, datetime(2012, 10, 30)], - # short valid date, length == 6 - [199934, datetime(1999, 3, 4)], - # long integer date partially parsed to datetime(2012,1,1), length > 8 - [2012010101, 2012010101], - # invalid date partially parsed to datetime(2012,9,9), length == 8 - [20129930, 20129930], - # short integer date partially parsed to datetime(2012,9,9), length < 8 - [2012993, 2012993], - # short invalid date, length == 4 - [2121, 2121], - ], - ) - def test_int_to_datetime_format_YYYYMMDD_typeerror(self, int_date, expected): - # GH 26583 - result = to_datetime(int_date, format="%Y%m%d", errors="ignore") - assert result == expected - def test_to_datetime_format_microsecond(self, cache): month_abbr = calendar.month_abbr[4] val = f"01-{month_abbr}-2011 00:00:01.978" @@ -432,12 +391,12 @@ def test_parse_nanoseconds_with_formula(self, cache, arg): @pytest.mark.parametrize( "value,fmt,expected", [ - ["2009324", "%Y%W%w", Timestamp("2009-08-13")], - ["2013020", "%Y%U%w", Timestamp("2013-01-13")], + ["2009324", "%Y%W%w", "2009-08-13"], + ["2013020", "%Y%U%w", "2013-01-13"], ], ) def test_to_datetime_format_weeks(self, value, fmt, expected, cache): - assert to_datetime(value, format=fmt, cache=cache) == expected + assert 
to_datetime(value, format=fmt, cache=cache) == Timestamp(expected) @pytest.mark.parametrize( "fmt,dates,expected_dates", @@ -472,9 +431,11 @@ def test_to_datetime_format_weeks(self, value, fmt, expected, cache): ["2010-01-01 12:00:00 Z", "2010-01-01 12:00:00 Z"], [ Timestamp( - "2010-01-01 12:00:00", tzinfo=pytz.FixedOffset(0) - ), # pytz coerces to UTC - Timestamp("2010-01-01 12:00:00", tzinfo=pytz.FixedOffset(0)), + "2010-01-01 12:00:00", tzinfo=timezone(timedelta(minutes=0)) + ), + Timestamp( + "2010-01-01 12:00:00", tzinfo=timezone(timedelta(minutes=0)) + ), ], ], ], @@ -515,15 +476,13 @@ def test_to_datetime_parse_tzname_or_tzoffset(self, fmt, dates, expected_dates): ], ], ) - def test_to_datetime_parse_tzname_or_tzoffset_utc_false_deprecated( + def test_to_datetime_parse_tzname_or_tzoffset_utc_false_removed( self, fmt, dates, expected_dates ): - # GH 13486, 50887 - msg = "parsing datetimes with mixed time zones will raise an error" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = to_datetime(dates, format=fmt) - expected = Index(expected_dates) - tm.assert_equal(result, expected) + # GH#13486, GH#50887, GH#57275 + msg = "Mixed timezones detected. Pass utc=True in to_datetime" + with pytest.raises(ValueError, match=msg): + to_datetime(dates, format=fmt) def test_to_datetime_parse_tzname_or_tzoffset_different_tz_to_utc(self): # GH 32792 @@ -554,10 +513,9 @@ def test_to_datetime_parse_timezone_malformed(self, offset): msg = "|".join( [ - r'^time data ".*" doesn\'t match format ".*", at position 0. ' + r'^time data ".*" doesn\'t match format ".*". ' f"{PARSING_ERR_MSG}$", + r'^unconverted data remains when parsing with format ".*": ".*". ' f"{PARSING_ERR_MSG}$", - r'^unconverted data remains when parsing with format ".*": ".*", ' - f"at position 0. {PARSING_ERR_MSG}$", ] ) with pytest.raises(ValueError, match=msg): @@ -579,7 +537,7 @@ def test_to_datetime_overflow(self): # TODO: Timestamp raises ValueError("could not convert string to Timestamp") # can we make these more consistent? 
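Throughout this file, fixed offsets built from the standard library's datetime.timezone replace pytz.FixedOffset (pytz itself becomes an optional import further down). A small illustrative sketch of the stdlib spelling, assuming only pandas and the standard library:

from datetime import timedelta, timezone

import pandas as pd

# timezone(timedelta(...)) is the stdlib way to express a constant UTC offset.
utc_plus_4 = timezone(timedelta(hours=4))
ts = pd.Timestamp("2018-03-01 12:00:00", tzinfo=utc_plus_4)
print(ts.utcoffset())  # a constant +04:00 offset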
arg = "08335394550" - msg = 'Parsing "08335394550" to datetime overflows, at position 0' + msg = 'Parsing "08335394550" to datetime overflows' with pytest.raises(OutOfBoundsDatetime, match=msg): to_datetime(arg) @@ -589,12 +547,8 @@ def test_to_datetime_overflow(self): res = to_datetime(arg, errors="coerce") assert res is NaT res = to_datetime([arg], errors="coerce") - tm.assert_index_equal(res, Index([NaT])) - - res = to_datetime(arg, errors="ignore") - assert isinstance(res, str) and res == arg - res = to_datetime([arg], errors="ignore") - tm.assert_index_equal(res, Index([arg], dtype=object)) + exp = Index([NaT], dtype="M8[s]") + tm.assert_index_equal(res, exp) def test_to_datetime_mixed_datetime_and_string(self): # GH#47018 adapted old doctest with new behavior @@ -625,7 +579,7 @@ def test_to_datetime_mixed_date_and_string(self, format): # https://github.com/pandas-dev/pandas/issues/50108 d1 = date(2020, 1, 2) res = to_datetime(["2020-01-01", d1], format=format) - expected = DatetimeIndex(["2020-01-01", "2020-01-02"], dtype="M8[ns]") + expected = DatetimeIndex(["2020-01-01", "2020-01-02"], dtype="M8[s]") tm.assert_index_equal(res, expected) @pytest.mark.parametrize( @@ -641,7 +595,7 @@ def test_to_datetime_mixed_date_and_string(self, format): ["2000-01-01 01:00:00-08:00", "2000-01-01 02:00:00-08:00"], DatetimeIndex( ["2000-01-01 09:00:00+00:00", "2000-01-01 10:00:00+00:00"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[us, UTC]", ), id="all tz-aware, with utc", ), @@ -650,7 +604,7 @@ def test_to_datetime_mixed_date_and_string(self, format): ["2000-01-01 01:00:00+00:00", "2000-01-01 02:00:00+00:00"], DatetimeIndex( ["2000-01-01 01:00:00+00:00", "2000-01-01 02:00:00+00:00"], - ), + ).as_unit("us"), id="all tz-aware, without utc", ), pytest.param( @@ -658,7 +612,7 @@ def test_to_datetime_mixed_date_and_string(self, format): ["2000-01-01 01:00:00-08:00", "2000-01-01 02:00:00+00:00"], DatetimeIndex( ["2000-01-01 09:00:00+00:00", "2000-01-01 02:00:00+00:00"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[us, UTC]", ), id="all tz-aware, mixed offsets, with utc", ), @@ -667,7 +621,7 @@ def test_to_datetime_mixed_date_and_string(self, format): ["2000-01-01 01:00:00", "2000-01-01 02:00:00+00:00"], DatetimeIndex( ["2000-01-01 01:00:00+00:00", "2000-01-01 02:00:00+00:00"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[us, UTC]", ), id="tz-aware string, naive pydatetime, with utc", ), @@ -687,6 +641,8 @@ def test_to_datetime_mixed_datetime_and_string_with_format( ts1 = constructor(args[0]) ts2 = args[1] result = to_datetime([ts1, ts2], format=fmt, utc=utc) + if constructor is Timestamp: + expected = expected.as_unit("s") tm.assert_index_equal(result, expected) @pytest.mark.parametrize( @@ -698,66 +654,58 @@ def test_to_datetime_mixed_datetime_and_string_with_format( "constructor", [Timestamp, lambda x: Timestamp(x).to_pydatetime()], ) - def test_to_datetime_mixed_datetime_and_string_with_format_mixed_offsets_utc_false( + def test_to_datetime_mixed_dt_and_str_with_format_mixed_offsets_utc_false_removed( self, fmt, constructor ): # https://github.com/pandas-dev/pandas/issues/49298 # https://github.com/pandas-dev/pandas/issues/50254 + # GH#57275 # note: ISO8601 formats go down a fastpath, so we need to check both # a ISO8601 format and a non-ISO8601 one args = ["2000-01-01 01:00:00", "2000-01-01 02:00:00+00:00"] ts1 = constructor(args[0]) ts2 = args[1] - msg = "parsing datetimes with mixed time zones will raise an error" + msg = "Mixed timezones detected. 
Pass utc=True in to_datetime" - expected = Index( - [ - Timestamp("2000-01-01 01:00:00"), - Timestamp("2000-01-01 02:00:00+0000", tz="UTC"), - ], - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = to_datetime([ts1, ts2], format=fmt, utc=False) - tm.assert_index_equal(result, expected) + with pytest.raises(ValueError, match=msg): + to_datetime([ts1, ts2], format=fmt, utc=False) @pytest.mark.parametrize( "fmt, expected", [ pytest.param( "%Y-%m-%d %H:%M:%S%z", - Index( - [ - Timestamp("2000-01-01 09:00:00+0100", tz="UTC+01:00"), - Timestamp("2000-01-02 02:00:00+0200", tz="UTC+02:00"), - NaT, - ] - ), + [ + Timestamp("2000-01-01 09:00:00+0100", tz="UTC+01:00"), + Timestamp("2000-01-02 02:00:00+0200", tz="UTC+02:00"), + NaT, + ], id="ISO8601, non-UTC", ), pytest.param( "%Y-%d-%m %H:%M:%S%z", - Index( - [ - Timestamp("2000-01-01 09:00:00+0100", tz="UTC+01:00"), - Timestamp("2000-02-01 02:00:00+0200", tz="UTC+02:00"), - NaT, - ] - ), + [ + Timestamp("2000-01-01 09:00:00+0100", tz="UTC+01:00"), + Timestamp("2000-02-01 02:00:00+0200", tz="UTC+02:00"), + NaT, + ], id="non-ISO8601, non-UTC", ), ], ) - def test_to_datetime_mixed_offsets_with_none_tz(self, fmt, expected): + def test_to_datetime_mixed_offsets_with_none_tz_utc_false_removed( + self, fmt, expected + ): # https://github.com/pandas-dev/pandas/issues/50071 - msg = "parsing datetimes with mixed time zones will raise an error" + # GH#57275 + msg = "Mixed timezones detected. Pass utc=True in to_datetime" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = to_datetime( + with pytest.raises(ValueError, match=msg): + to_datetime( ["2000-01-01 09:00:00+01:00", "2000-01-02 02:00:00+02:00", None], format=fmt, utc=False, ) - tm.assert_index_equal(result, expected) @pytest.mark.parametrize( "fmt, expected", @@ -766,7 +714,7 @@ def test_to_datetime_mixed_offsets_with_none_tz(self, fmt, expected): "%Y-%m-%d %H:%M:%S%z", DatetimeIndex( ["2000-01-01 08:00:00+00:00", "2000-01-02 00:00:00+00:00", "NaT"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[s, UTC]", ), id="ISO8601, UTC", ), @@ -774,7 +722,7 @@ def test_to_datetime_mixed_offsets_with_none_tz(self, fmt, expected): "%Y-%d-%m %H:%M:%S%z", DatetimeIndex( ["2000-01-01 08:00:00+00:00", "2000-02-01 00:00:00+00:00", "NaT"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[s, UTC]", ), id="non-ISO8601, UTC", ), @@ -846,12 +794,36 @@ def test_to_datetime_np_str(self): ["2015-1-1", "%G-%V-%u", datetime(2014, 12, 29, 0, 0)], ["2015-1-4", "%G-%V-%u", datetime(2015, 1, 1, 0, 0)], ["2015-1-7", "%G-%V-%u", datetime(2015, 1, 4, 0, 0)], + ["2024-52-1", "%G-%V-%u", datetime(2024, 12, 23, 0, 0)], + ["2024-52-7", "%G-%V-%u", datetime(2024, 12, 29, 0, 0)], + ["2025-1-1", "%G-%V-%u", datetime(2024, 12, 30, 0, 0)], + ["2020-53-1", "%G-%V-%u", datetime(2020, 12, 28, 0, 0)], ], ) def test_to_datetime_iso_week_year_format(self, s, _format, dt): # See GH#16607 assert to_datetime(s, format=_format) == dt + @pytest.mark.parametrize( + "msg, s, _format", + [ + [ + "Week 53 does not exist in ISO year 2024", + "2024 53 1", + "%G %V %u", + ], + [ + "Week 53 does not exist in ISO year 2023", + "2023 53 1", + "%G %V %u", + ], + ], + ) + def test_invalid_iso_week_53(self, msg, s, _format): + # See GH#60885 + with pytest.raises(ValueError, match=msg): + to_datetime(s, format=_format) + @pytest.mark.parametrize( "msg, s, _format", [ @@ -971,7 +943,7 @@ def test_to_datetime_iso_week_year_format(self, s, _format, dt): ], ], ) - @pytest.mark.parametrize("errors", ["raise", "coerce", 
"ignore"]) + @pytest.mark.parametrize("errors", ["raise", "coerce"]) def test_error_iso_week_year(self, msg, s, _format, errors): # See GH#16607, GH#50308 # This test checks for errors thrown when giving the wrong format @@ -998,14 +970,13 @@ def test_to_datetime_dtarr(self, tz): # Doesn't work on Windows since tzpath not set correctly @td.skip_if_windows - @pytest.mark.parametrize("arg_class", [Series, Index]) @pytest.mark.parametrize("utc", [True, False]) @pytest.mark.parametrize("tz", [None, "US/Central"]) - def test_to_datetime_arrow(self, tz, utc, arg_class): + def test_to_datetime_arrow(self, tz, utc, index_or_series): pa = pytest.importorskip("pyarrow") dti = date_range("1965-04-03", periods=19, freq="2W", tz=tz) - dti = arg_class(dti) + dti = index_or_series(dti) dti_arrow = dti.astype(pd.ArrowDtype(pa.timestamp(unit="ns", tz=tz))) @@ -1013,11 +984,11 @@ def test_to_datetime_arrow(self, tz, utc, arg_class): expected = to_datetime(dti, utc=utc).astype( pd.ArrowDtype(pa.timestamp(unit="ns", tz=tz if not utc else "UTC")) ) - if not utc and arg_class is not Series: + if not utc and index_or_series is not Series: # Doesn't hold for utc=True, since that will astype # to_datetime also returns a new object for series assert result is dti_arrow - if arg_class is Series: + if index_or_series is Series: tm.assert_series_equal(result, expected) else: tm.assert_index_equal(result, expected) @@ -1030,17 +1001,13 @@ def test_to_datetime_YYYYMMDD(self): actual = to_datetime("20080115") assert actual == datetime(2008, 1, 15) - def test_to_datetime_unparsable_ignore(self): - # unparsable - ser = "Month 1, 1999" - assert to_datetime(ser, errors="ignore") == ser - @td.skip_if_windows # `tm.set_timezone` does not work in windows + @pytest.mark.skipif(WASM, reason="tzset is not available on WASM") def test_to_datetime_now(self): # See GH#18666 with tm.set_timezone("US/Eastern"): # GH#18705 - now = Timestamp("now").as_unit("ns") + now = Timestamp("now") pdnow = to_datetime("now") pdnow2 = to_datetime(["now"])[0] @@ -1052,7 +1019,8 @@ def test_to_datetime_now(self): assert pdnow.tzinfo is None assert pdnow2.tzinfo is None - @td.skip_if_windows # `tm.set_timezone` does not work in windows + @td.skip_if_windows # `tm.set_timezone` does not work on Windows + @pytest.mark.skipif(WASM, reason="tzset is not available on WASM") @pytest.mark.parametrize("tz", ["Pacific/Auckland", "US/Samoa"]) def test_to_datetime_today(self, tz): # See GH#18666 @@ -1062,12 +1030,12 @@ def test_to_datetime_today(self, tz): # this both of these timezones _and_ UTC will all be in the same day, # so this test will not detect the regression introduced in #18666. 
with tm.set_timezone(tz): - nptoday = np.datetime64("today").astype("datetime64[ns]").astype(np.int64) + nptoday = np.datetime64("today").astype("datetime64[us]").astype(np.int64) pdtoday = to_datetime("today") pdtoday2 = to_datetime(["today"])[0] - tstoday = Timestamp("today").as_unit("ns") - tstoday2 = Timestamp.today().as_unit("ns") + tstoday = Timestamp("today") + tstoday2 = Timestamp.today() # These should all be equal with infinite perf; this gives # a generous margin of 10 seconds @@ -1083,6 +1051,8 @@ def test_to_datetime_today(self, tz): def test_to_datetime_today_now_unicode_bytes(self, arg): to_datetime([arg]) + @pytest.mark.filterwarnings("ignore:Timestamp.utcnow is deprecated:FutureWarning") + @pytest.mark.skipif(WASM, reason="tzset is not available on WASM") @pytest.mark.parametrize( "format, expected_ds", [ @@ -1102,7 +1072,7 @@ def test_to_datetime_now_with_format(self, format, expected_ds, string, attribut # https://github.com/pandas-dev/pandas/issues/50359 result = to_datetime(["2020-01-03 00:00:00Z", string], format=format, utc=True) expected = DatetimeIndex( - [expected_ds, getattr(Timestamp, attribute)()], dtype="datetime64[ns, UTC]" + [expected_ds, getattr(Timestamp, attribute)()], dtype="datetime64[s, UTC]" ) assert (expected - result).max().total_seconds() < 1 @@ -1128,7 +1098,7 @@ def test_to_datetime_dt64s_and_str(self, arg, format): @pytest.mark.parametrize( "dt", [np.datetime64("1000-01-01"), np.datetime64("5000-01-02")] ) - @pytest.mark.parametrize("errors", ["raise", "ignore", "coerce"]) + @pytest.mark.parametrize("errors", ["raise", "coerce"]) def test_to_datetime_dt64s_out_of_ns_bounds(self, cache, dt, errors): # GH#50369 We cast to the nearest supported reso, i.e. "s" ts = to_datetime(dt, errors=errors, cache=cache) @@ -1163,11 +1133,7 @@ def test_to_datetime_array_of_dt64s(self, cache, unit): # Assuming all datetimes are in bounds, to_datetime() returns # an array that is equal to Timestamp() parsing result = to_datetime(dts, cache=cache) - if cache: - # FIXME: behavior should not depend on cache - expected = DatetimeIndex([Timestamp(x).asm8 for x in dts], dtype="M8[s]") - else: - expected = DatetimeIndex([Timestamp(x).asm8 for x in dts], dtype="M8[ns]") + expected = DatetimeIndex([Timestamp(x).asm8 for x in dts], dtype="M8[s]") tm.assert_index_equal(result, expected) @@ -1178,41 +1144,9 @@ def test_to_datetime_array_of_dt64s(self, cache, unit): to_datetime(dts_with_oob, errors="raise") result = to_datetime(dts_with_oob, errors="coerce", cache=cache) - if not cache: - # FIXME: shouldn't depend on cache! - expected = DatetimeIndex( - [Timestamp(dts_with_oob[0]).asm8, Timestamp(dts_with_oob[1]).asm8] * 30 - + [NaT], - ) - else: - expected = DatetimeIndex(np.array(dts_with_oob, dtype="M8[s]")) + expected = DatetimeIndex(np.array(dts_with_oob, dtype="M8[s]")) tm.assert_index_equal(result, expected) - # With errors='ignore', out of bounds datetime64s - # are converted to their .item(), which depending on the version of - # numpy is either a python datetime.datetime or datetime.date - result = to_datetime(dts_with_oob, errors="ignore", cache=cache) - if not cache: - # FIXME: shouldn't depend on cache! 
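The errors="ignore" branch deleted just below is gone entirely in 3.0; as the surrounding expectations show, out-of-nanosecond-bounds datetime64s are instead parsed at the nearest supported resolution, and overflow occurs only when nanoseconds are forced. A short sketch of those semantics, assuming pandas >= 2.0 resolution rules:

import numpy as np
import pandas as pd
from pandas.errors import OutOfBoundsDatetime

ts = pd.Timestamp(np.datetime64("9999-01-01"))
print(ts.unit)  # "s": cast to the nearest supported resolution

try:
    ts.as_unit("ns")  # year 9999 cannot be represented in datetime64[ns]
except OutOfBoundsDatetime as err:
    print(err)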
- expected = Index(dts_with_oob) - tm.assert_index_equal(result, expected) - - def test_out_of_bounds_errors_ignore(self): - # https://github.com/pandas-dev/pandas/issues/50587 - result = to_datetime(np.datetime64("9999-01-01"), errors="ignore") - expected = np.datetime64("9999-01-01") - assert result == expected - - def test_out_of_bounds_errors_ignore2(self): - # GH#12424 - msg = "errors='ignore' is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - res = to_datetime( - Series(["2362-01-01", np.nan], dtype=object), errors="ignore" - ) - exp = Series(["2362-01-01", np.nan], dtype=object) - tm.assert_series_equal(res, exp) - def test_to_datetime_tz(self, cache): # xref 8260 # uniform returns a DatetimeIndex @@ -1223,7 +1157,7 @@ def test_to_datetime_tz(self, cache): result = to_datetime(arr, cache=cache) expected = DatetimeIndex( ["2013-01-01 13:00:00", "2013-01-02 14:00:00"], tz="US/Pacific" - ) + ).as_unit("s") tm.assert_index_equal(result, expected) def test_to_datetime_tz_mixed(self, cache): @@ -1240,37 +1174,26 @@ def test_to_datetime_tz_mixed(self, cache): with pytest.raises(ValueError, match=msg): to_datetime(arr, cache=cache) - depr_msg = "errors='ignore' is deprecated" - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - result = to_datetime(arr, cache=cache, errors="ignore") - expected = Index( - [ - Timestamp("2013-01-01 13:00:00-08:00"), - Timestamp("2013-01-02 14:00:00-05:00"), - ], - dtype="object", - ) - tm.assert_index_equal(result, expected) result = to_datetime(arr, cache=cache, errors="coerce") expected = DatetimeIndex( - ["2013-01-01 13:00:00-08:00", "NaT"], dtype="datetime64[ns, US/Pacific]" + ["2013-01-01 13:00:00-08:00", "NaT"], dtype="datetime64[s, US/Pacific]" ) tm.assert_index_equal(result, expected) - def test_to_datetime_different_offsets(self, cache): + def test_to_datetime_different_offsets_removed(self, cache): # inspired by asv timeseries.ToDatetimeNONISO8601 benchmark # see GH-26097 for more + # GH#57275 ts_string_1 = "March 1, 2018 12:00:00+0400" ts_string_2 = "March 1, 2018 12:00:00+0500" arr = [ts_string_1] * 5 + [ts_string_2] * 5 - expected = Index([parse(x) for x in arr]) - msg = "parsing datetimes with mixed time zones will raise an error" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = to_datetime(arr, cache=cache) - tm.assert_index_equal(result, expected) + msg = "Mixed timezones detected. 
Pass utc=True in to_datetime" + with pytest.raises(ValueError, match=msg): + to_datetime(arr, cache=cache) def test_to_datetime_tz_pytz(self, cache): # see gh-8260 + pytz = pytest.importorskip("pytz") us_eastern = pytz.timezone("US/Eastern") arr = np.array( [ @@ -1286,7 +1209,7 @@ def test_to_datetime_tz_pytz(self, cache): result = to_datetime(arr, utc=True, cache=cache) expected = DatetimeIndex( ["2000-01-01 08:00:00+00:00", "2000-06-01 07:00:00+00:00"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[us, UTC]", freq=None, ) tm.assert_index_equal(result, expected) @@ -1373,7 +1296,7 @@ def test_to_datetime_tz_psycopg2(self, request, cache): result = to_datetime(arr, errors="coerce", utc=True, cache=cache) expected = DatetimeIndex( ["2000-01-01 08:00:00+00:00", "2000-06-01 07:00:00+00:00"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[us, UTC]", freq=None, ) tm.assert_index_equal(result, expected) @@ -1382,15 +1305,15 @@ def test_to_datetime_tz_psycopg2(self, request, cache): i = DatetimeIndex( ["2000-01-01 08:00:00"], tz=psycopg2_tz.FixedOffsetTimezone(offset=-300, name=None), - ) - assert is_datetime64_ns_dtype(i) + ).as_unit("us") + assert not is_datetime64_ns_dtype(i) # tz coercion result = to_datetime(i, errors="coerce", cache=cache) tm.assert_index_equal(result, i) result = to_datetime(i, errors="coerce", utc=True, cache=cache) - expected = DatetimeIndex(["2000-01-01 13:00:00"], dtype="datetime64[ns, UTC]") + expected = DatetimeIndex(["2000-01-01 13:00:00"], dtype="datetime64[us, UTC]") tm.assert_index_equal(result, expected) @pytest.mark.parametrize("arg", [True, False]) @@ -1400,7 +1323,6 @@ def test_datetime_bool(self, cache, arg): with pytest.raises(TypeError, match=msg): to_datetime(arg) assert to_datetime(arg, errors="coerce", cache=cache) is NaT - assert to_datetime(arg, errors="ignore", cache=cache) is arg def test_datetime_bool_arrays_mixed(self, cache): msg = f"{type(cache)} is not convertible to datetime" @@ -1409,8 +1331,8 @@ def test_datetime_bool_arrays_mixed(self, cache): with pytest.raises( ValueError, match=( - r'^time data "True" doesn\'t match format "%Y%m%d", ' - f"at position 1. {PARSING_ERR_MSG}$" + r'^time data "True" doesn\'t match format "%Y%m%d". ' + f"{PARSING_ERR_MSG}$" ), ): to_datetime(["20130101", True], cache=cache) @@ -1428,7 +1350,7 @@ def test_datetime_invalid_datatype(self, arg): with pytest.raises(TypeError, match=msg): to_datetime(arg) - @pytest.mark.parametrize("errors", ["coerce", "raise", "ignore"]) + @pytest.mark.parametrize("errors", ["coerce", "raise"]) def test_invalid_format_raises(self, errors): # https://github.com/pandas-dev/pandas/issues/50255 with pytest.raises( @@ -1440,20 +1362,17 @@ def test_invalid_format_raises(self, errors): @pytest.mark.parametrize("format", [None, "%H:%M:%S"]) def test_datetime_invalid_scalar(self, value, format): # GH24763 - res = to_datetime(value, errors="ignore", format=format) - assert res == value - res = to_datetime(value, errors="coerce", format=format) assert res is NaT msg = "|".join( [ - r'^time data "a" doesn\'t match format "%H:%M:%S", at position 0. ' + r'^time data "a" doesn\'t match format "%H:%M:%S". ' f"{PARSING_ERR_MSG}$", - r'^Given date string "a" not likely a datetime, at position 0$', - r'^unconverted data remains when parsing with format "%H:%M:%S": "9", ' - f"at position 0. {PARSING_ERR_MSG}$", - r"^second must be in 0..59: 00:01:99, at position 0$", + r'^Given date string "a" not likely a datetime$', + r'^unconverted data remains when parsing with format "%H:%M:%S": "9". 
' + f"{PARSING_ERR_MSG}$", + r"^second must be in 0..59: 00:01:99$", ] ) with pytest.raises(ValueError, match=msg): @@ -1463,20 +1382,21 @@ def test_datetime_invalid_scalar(self, value, format): @pytest.mark.parametrize("format", [None, "%H:%M:%S"]) def test_datetime_outofbounds_scalar(self, value, format): # GH24763 - res = to_datetime(value, errors="ignore", format=format) - assert res == value - res = to_datetime(value, errors="coerce", format=format) - assert res is NaT + if format is None: + assert isinstance(res, Timestamp) + assert res == Timestamp(value) + else: + assert res is NaT if format is not None: - msg = r'^time data ".*" doesn\'t match format ".*", at position 0.' + msg = r'^time data ".*" doesn\'t match format ".*"' with pytest.raises(ValueError, match=msg): to_datetime(value, errors="raise", format=format) else: - msg = "^Out of bounds .*, at position 0$" - with pytest.raises(OutOfBoundsDatetime, match=msg): - to_datetime(value, errors="raise", format=format) + res = to_datetime(value, errors="raise", format=format) + assert isinstance(res, Timestamp) + assert res == Timestamp(value) @pytest.mark.parametrize( ("values"), [(["a"]), (["00:01:99"]), (["a", "b", "99:00:00"])] @@ -1490,11 +1410,6 @@ def test_datetime_invalid_index(self, values, format): warn = UserWarning else: warn = None - with tm.assert_produces_warning( - warn, match="Could not infer format", raise_on_extra_warnings=False - ): - res = to_datetime(values, errors="ignore", format=format) - tm.assert_index_equal(res, Index(values, dtype=object)) with tm.assert_produces_warning( warn, match="Could not infer format", raise_on_extra_warnings=False @@ -1504,12 +1419,12 @@ def test_datetime_invalid_index(self, values, format): msg = "|".join( [ - r'^Given date string "a" not likely a datetime, at position 0$', - r'^time data "a" doesn\'t match format "%H:%M:%S", at position 0. ' + r'^Given date string "a" not likely a datetime$', + r'^time data "a" doesn\'t match format "%H:%M:%S". ' f"{PARSING_ERR_MSG}$", - r'^unconverted data remains when parsing with format "%H:%M:%S": "9", ' - f"at position 0. {PARSING_ERR_MSG}$", - r"^second must be in 0..59: 00:01:99, at position 0$", + r'^unconverted data remains when parsing with format "%H:%M:%S": "9". 
' + f"{PARSING_ERR_MSG}$", + r"^second must be in 0..59: 00:01:99$", ] ) with pytest.raises(ValueError, match=msg): @@ -1554,15 +1469,17 @@ def test_to_datetime_cache_scalar(self): assert result == expected @pytest.mark.parametrize( - "datetimelikes,expected_values", + "datetimelikes,expected_values,exp_unit", ( ( (None, np.nan) + (NaT,) * start_caching_at, (NaT,) * (start_caching_at + 2), + "s", ), ( (None, Timestamp("2012-07-26")) + (NaT,) * start_caching_at, (NaT, Timestamp("2012-07-26")) + (NaT,) * start_caching_at, + "s", ), ( (None,) @@ -1570,11 +1487,12 @@ def test_to_datetime_cache_scalar(self): + ("2012 July 26", Timestamp("2012-07-26")), (NaT,) * (start_caching_at + 1) + (Timestamp("2012-07-26"), Timestamp("2012-07-26")), + "s", ), ), ) def test_convert_object_to_datetime_with_cache( - self, datetimelikes, expected_values + self, datetimelikes, expected_values, exp_unit ): # GH#39882 ser = Series( @@ -1584,11 +1502,10 @@ def test_convert_object_to_datetime_with_cache( result_series = to_datetime(ser, errors="coerce") expected_series = Series( expected_values, - dtype="datetime64[ns]", + dtype=f"datetime64[{exp_unit}]", ) tm.assert_series_equal(result_series, expected_series) - @pytest.mark.parametrize("cache", [True, False]) @pytest.mark.parametrize( "input", [ @@ -1606,7 +1523,7 @@ def test_convert_object_to_datetime_with_cache( ) def test_to_datetime_converts_null_like_to_nat(self, cache, input): # GH35888 - expected = Series([NaT] * len(input), dtype="M8[ns]") + expected = Series([NaT] * len(input), dtype="M8[s]") result = to_datetime(input, cache=cache) tm.assert_series_equal(result, expected) @@ -1629,23 +1546,15 @@ def test_week_without_day_and_calendar_year(self, date, format): to_datetime(date, format=format) def test_to_datetime_coerce(self): - # GH 26122 + # GH#26122, GH#57275 ts_strings = [ "March 1, 2018 12:00:00+0400", "March 1, 2018 12:00:00+0500", "20100240", ] - msg = "parsing datetimes with mixed time zones will raise an error" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = to_datetime(ts_strings, errors="coerce") - expected = Index( - [ - datetime(2018, 3, 1, 12, 0, tzinfo=tzoffset(None, 14400)), - datetime(2018, 3, 1, 12, 0, tzinfo=tzoffset(None, 18000)), - NaT, - ] - ) - tm.assert_index_equal(result, expected) + msg = "Mixed timezones detected. 
Pass utc=True in to_datetime" + with pytest.raises(ValueError, match=msg): + to_datetime(ts_strings, errors="coerce") @pytest.mark.parametrize( "string_arg, format", @@ -1665,33 +1574,37 @@ def test_to_datetime_coerce_oob(self, string_arg, format, outofbounds): # https://github.com/pandas-dev/pandas/issues/50255 ts_strings = [string_arg, outofbounds] result = to_datetime(ts_strings, errors="coerce", format=format) - expected = DatetimeIndex([datetime(2018, 3, 1), NaT]) + if isinstance(outofbounds, str) and ( + format.startswith("%B") ^ outofbounds.startswith("J") + ): + # the strings don't match the given format, so they raise and we coerce + expected = DatetimeIndex([datetime(2018, 3, 1), NaT], dtype="M8[s]") + elif isinstance(outofbounds, datetime): + expected = DatetimeIndex( + [datetime(2018, 3, 1), outofbounds], dtype="M8[us]" + ) + else: + expected = DatetimeIndex([datetime(2018, 3, 1), outofbounds], dtype="M8[s]") tm.assert_index_equal(result, expected) - @pytest.mark.parametrize( - "errors, expected", - [ - ("coerce", Index([NaT, NaT])), - ("ignore", Index(["200622-12-31", "111111-24-11"], dtype=object)), - ], - ) - def test_to_datetime_malformed_no_raise(self, errors, expected): + def test_to_datetime_malformed_no_raise(self): # GH 28299 # GH 48633 ts_strings = ["200622-12-31", "111111-24-11"] with tm.assert_produces_warning( UserWarning, match="Could not infer format", raise_on_extra_warnings=False ): - result = to_datetime(ts_strings, errors=errors) - tm.assert_index_equal(result, expected) + result = to_datetime(ts_strings, errors="coerce") + # TODO: should Index get "s" by default here? + exp = Index([NaT, NaT], dtype="M8[s]") + tm.assert_index_equal(result, exp) def test_to_datetime_malformed_raise(self): # GH 48633 ts_strings = ["200622-12-31", "111111-24-11"] msg = ( 'Parsed string "200622-12-31" gives an invalid tzoffset, which must ' - r"be between -timedelta\(hours=24\) and timedelta\(hours=24\), " - "at position 0" + r"be between -timedelta\(hours=24\) and timedelta\(hours=24\)" ) with pytest.raises( ValueError, @@ -1719,34 +1632,23 @@ def test_iso_8601_strings_with_same_offset(self): result = DatetimeIndex([ts_str] * 2) tm.assert_index_equal(result, expected) - def test_iso_8601_strings_with_different_offsets(self): - # GH 17697, 11736, 50887 + def test_iso_8601_strings_with_different_offsets_removed(self): + # GH#17697, GH#11736, GH#50887, GH#57275 ts_strings = ["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30", NaT] - msg = "parsing datetimes with mixed time zones will raise an error" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = to_datetime(ts_strings) - expected = np.array( - [ - datetime(2015, 11, 18, 15, 30, tzinfo=tzoffset(None, 19800)), - datetime(2015, 11, 18, 16, 30, tzinfo=tzoffset(None, 23400)), - NaT, - ], - dtype=object, - ) - # GH 21864 - expected = Index(expected) - tm.assert_index_equal(result, expected) + msg = "Mixed timezones detected. 
Pass utc=True in to_datetime" + with pytest.raises(ValueError, match=msg): + to_datetime(ts_strings) def test_iso_8601_strings_with_different_offsets_utc(self): ts_strings = ["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30", NaT] result = to_datetime(ts_strings, utc=True) expected = DatetimeIndex( [Timestamp(2015, 11, 18, 10), Timestamp(2015, 11, 18, 10), NaT], tz="UTC" - ) + ).as_unit("s") tm.assert_index_equal(result, expected) - def test_mixed_offsets_with_native_datetime_raises(self): - # GH 25978 + def test_mixed_offsets_with_native_datetime_utc_false_raises(self): + # GH#25978, GH#57275 vals = [ "nan", @@ -1760,35 +1662,15 @@ def test_mixed_offsets_with_native_datetime_raises(self): ser = Series(vals) assert all(ser[i] is vals[i] for i in range(len(vals))) # GH#40111 - now = Timestamp("now") - today = Timestamp("today") - msg = "parsing datetimes with mixed time zones will raise an error" - with tm.assert_produces_warning(FutureWarning, match=msg): - mixed = to_datetime(ser) - expected = Series( - [ - "NaT", - Timestamp("1990-01-01"), - Timestamp("2015-03-14T16:15:14.123-08:00").to_pydatetime(), - Timestamp("2019-03-04T21:56:32.620-07:00").to_pydatetime(), - None, - ], - dtype=object, - ) - tm.assert_series_equal(mixed[:-2], expected) - # we'll check mixed[-1] and mixed[-2] match now and today to within - # call-timing tolerances - assert (now - mixed.iloc[-1]).total_seconds() <= 0.1 - assert (today - mixed.iloc[-2]).total_seconds() <= 0.1 - - with pytest.raises(ValueError, match="Tz-aware datetime.datetime"): - to_datetime(mixed) + msg = "Mixed timezones detected. Pass utc=True in to_datetime" + with pytest.raises(ValueError, match=msg): + to_datetime(ser) def test_non_iso_strings_with_tz_offset(self): result = to_datetime(["March 1, 2018 12:00:00+0400"] * 2) expected = DatetimeIndex( [datetime(2018, 3, 1, 12, tzinfo=timezone(timedelta(minutes=240)))] * 2 - ) + ).as_unit("s") tm.assert_index_equal(result, expected) @pytest.mark.parametrize( @@ -1809,9 +1691,11 @@ def test_timestamp_utc_true(self, ts, expected): @pytest.mark.parametrize("dt_str", ["00010101", "13000101", "30000101", "99990101"]) def test_to_datetime_with_format_out_of_bounds(self, dt_str): # GH 9107 - msg = "Out of bounds nanosecond timestamp" - with pytest.raises(OutOfBoundsDatetime, match=msg): - to_datetime(dt_str, format="%Y%m%d") + res = to_datetime(dt_str, format="%Y%m%d") + dtobj = datetime.strptime(dt_str, "%Y%m%d") + expected = Timestamp(dtobj).as_unit("s") + assert res == expected + assert res.unit == expected.unit def test_to_datetime_utc(self): arr = np.array([parse("2012-06-13T01:39:00Z")], dtype=object) @@ -1839,14 +1723,16 @@ def test_to_datetime_fixed_offset(self): ["2020-10-26 00:00:00+06:00", Timestamp("2018-01-01", tz="US/Pacific")], [ "2020-10-26 00:00:00+06:00", - datetime(2020, 1, 1, 18, tzinfo=pytz.timezone("Australia/Melbourne")), + datetime(2020, 1, 1, 18).astimezone( + zoneinfo.ZoneInfo("Australia/Melbourne") + ), ], ], ) - def test_to_datetime_mixed_offsets_with_utc_false_deprecated(self, date): - # GH 50887 - msg = "parsing datetimes with mixed time zones will raise an error" - with tm.assert_produces_warning(FutureWarning, match=msg): + def test_to_datetime_mixed_offsets_with_utc_false_removed(self, date): + # GH#50887, GH#57275 + msg = "Mixed timezones detected. 
Pass utc=True in to_datetime" + with pytest.raises(ValueError, match=msg): to_datetime(date, utc=False) @@ -1877,27 +1763,24 @@ def test_to_datetime_month_or_year_unit_non_round_float(self, cache, unit): # GH#50301 # Match Timestamp behavior in disallowing non-round floats with # Y or M unit - warn_msg = "strings will be parsed as datetime strings" msg = f"Conversion of non-round float with unit={unit} is ambiguous" with pytest.raises(ValueError, match=msg): to_datetime([1.5], unit=unit, errors="raise") with pytest.raises(ValueError, match=msg): to_datetime(np.array([1.5]), unit=unit, errors="raise") - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - to_datetime(["1.5"], unit=unit, errors="raise") - # with errors="ignore" we also end up raising within the Timestamp - # constructor; this may not be ideal + msg = r"Given date string \"1.5\" not likely a datetime" with pytest.raises(ValueError, match=msg): - to_datetime([1.5], unit=unit, errors="ignore") + to_datetime(["1.5"], unit=unit, errors="raise") res = to_datetime([1.5], unit=unit, errors="coerce") expected = Index([NaT], dtype="M8[ns]") tm.assert_index_equal(res, expected) - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - res = to_datetime(["1.5"], unit=unit, errors="coerce") + # In 3.0, the string "1.5" is parsed as as it would be without unit, + # which fails. With errors="coerce" this becomes NaT. + res = to_datetime(["1.5"], unit=unit, errors="coerce") + expected = to_datetime([NaT]).as_unit("ns") tm.assert_index_equal(res, expected) # round floats are OK @@ -1914,21 +1797,6 @@ def test_unit(self, cache): def test_unit_array_mixed_nans(self, cache): values = [11111111111111111, 1, 1.0, iNaT, NaT, np.nan, "NaT", ""] - result = to_datetime(values, unit="D", errors="ignore", cache=cache) - expected = Index( - [ - 11111111111111111, - Timestamp("1970-01-02"), - Timestamp("1970-01-02"), - NaT, - NaT, - NaT, - NaT, - NaT, - ], - dtype=object, - ) - tm.assert_index_equal(result, expected) result = to_datetime(values, unit="D", errors="coerce", cache=cache) expected = DatetimeIndex( @@ -1944,10 +1812,6 @@ def test_unit_array_mixed_nans(self, cache): def test_unit_array_mixed_nans_large_int(self, cache): values = [1420043460000000000000000, iNaT, NaT, np.nan, "NaT"] - result = to_datetime(values, errors="ignore", unit="s", cache=cache) - expected = Index([1420043460000000000000000, NaT, NaT, NaT, NaT], dtype=object) - tm.assert_index_equal(result, expected) - result = to_datetime(values, errors="coerce", unit="s", cache=cache) expected = DatetimeIndex(["NaT", "NaT", "NaT", "NaT", "NaT"], dtype="M8[ns]") tm.assert_index_equal(result, expected) @@ -1959,11 +1823,11 @@ def test_unit_array_mixed_nans_large_int(self, cache): def test_to_datetime_invalid_str_not_out_of_bounds_valuerror(self, cache): # if we have a string, then we raise a ValueError # and NOT an OutOfBoundsDatetime - msg = "non convertible value foo with the unit 's'" + msg = "Unknown datetime string format, unable to parse: foo" with pytest.raises(ValueError, match=msg): to_datetime("foo", errors="raise", unit="s", cache=cache) - @pytest.mark.parametrize("error", ["raise", "coerce", "ignore"]) + @pytest.mark.parametrize("error", ["raise", "coerce"]) def test_unit_consistency(self, cache, error): # consistency of conversions expected = Timestamp("1970-05-09 14:25:11") @@ -1971,7 +1835,7 @@ def test_unit_consistency(self, cache, error): assert result == expected assert isinstance(result, Timestamp) - 
@pytest.mark.parametrize("errors", ["ignore", "raise", "coerce"]) + @pytest.mark.parametrize("errors", ["raise", "coerce"]) @pytest.mark.parametrize("dtype", ["float64", "int64"]) def test_unit_with_numeric(self, cache, errors, dtype): # GH 13180 @@ -2041,18 +1905,6 @@ def test_unit_rounding(self, cache): alt = Timestamp(value, unit="s") assert alt == result - def test_unit_ignore_keeps_name(self, cache): - # GH 21697 - expected = Index([15e9] * 2, name="name") - result = to_datetime(expected, errors="ignore", unit="s", cache=cache) - tm.assert_index_equal(result, expected) - - def test_to_datetime_errors_ignore_utc_true(self): - # GH#23758 - result = to_datetime([1], unit="s", utc=True, errors="ignore") - expected = DatetimeIndex(["1970-01-01 00:00:01"], dtype="M8[ns, UTC]") - tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("dtype", [int, float]) def test_to_datetime_unit(self, dtype): epoch = 1370745748 @@ -2106,7 +1958,10 @@ def test_to_datetime_unit_na_values(self): @pytest.mark.parametrize("bad_val", ["foo", 111111111]) def test_to_datetime_unit_invalid(self, bad_val): - msg = f"{bad_val} with the unit 'D'" + if bad_val == "foo": + msg = f"Unknown datetime string format, unable to parse: {bad_val}" + else: + msg = "cannot convert input 111111111 with the unit 'D'" with pytest.raises(ValueError, match=msg): to_datetime([1, 2, bad_val], unit="D") @@ -2130,7 +1985,7 @@ def test_float_to_datetime_raise_near_bounds(self): [0, tsmax_in_days - 0.005, -tsmax_in_days + 0.005], dtype=float ) expected = (should_succeed * oneday_in_ns).astype(np.int64) - for error_mode in ["raise", "coerce", "ignore"]: + for error_mode in ["raise", "coerce"]: result1 = to_datetime(should_succeed, unit="D", errors=error_mode) # Cast to `np.float64` so that `rtol` and inexact checking kick in # (`check_exact` doesn't take place for integer dtypes) @@ -2176,6 +2031,7 @@ def test_dataframe(self, df, cache): # dict-like result = to_datetime(df[["year", "month", "day"]].to_dict(), cache=cache) + expected.index = Index([0, 1]) tm.assert_series_equal(result, expected) def test_dataframe_dict_with_constructable(self, df, cache): @@ -2184,7 +2040,8 @@ def test_dataframe_dict_with_constructable(self, df, cache): df2["month"] = 2 result = to_datetime(df2, cache=cache) expected2 = Series( - [Timestamp("20150204 00:00:00"), Timestamp("20160205 00:0:00")] + [Timestamp("20150204 00:00:00"), Timestamp("20160205 00:0:00")], + index=Index([0, 1]), ) tm.assert_series_equal(result, expected2) @@ -2251,13 +2108,25 @@ def test_dataframe_str_dtype(self, df, cache): ) tm.assert_series_equal(result, expected) + def test_dataframe_float32_dtype(self, df, cache): + # GH#60506 + # coerce to float64 + result = to_datetime(df.astype(np.float32), cache=cache) + expected = Series( + [ + Timestamp("20150204 06:58:10.001002003"), + Timestamp("20160305 07:59:11.001002003"), + ] + ) + tm.assert_series_equal(result, expected) + def test_dataframe_coerce(self, cache): # passing coerce df2 = DataFrame({"year": [2015, 2016], "month": [2, 20], "day": [4, 5]}) msg = ( r'^cannot assemble the datetimes: time data ".+" doesn\'t ' - r'match format "%Y%m%d", at position 1\.' + r'match format "%Y%m%d"\.' 
) with pytest.raises(ValueError, match=msg): to_datetime(df2, cache=cache) @@ -2266,12 +2135,12 @@ def test_dataframe_coerce(self, cache): expected = Series([Timestamp("20150204 00:00:00"), NaT]) tm.assert_series_equal(result, expected) - def test_dataframe_extra_keys_raisesm(self, df, cache): + def test_dataframe_extra_keys_raises(self, df, cache): # extra columns msg = r"extra keys have been passed to the datetime assemblage: \[foo\]" + df2 = df.copy() + df2["foo"] = 1 with pytest.raises(ValueError, match=msg): - df2 = df.copy() - df2["foo"] = 1 to_datetime(df2, cache=cache) @pytest.mark.parametrize( @@ -2335,7 +2204,7 @@ def test_dataframe_float(self, cache): df = DataFrame({"year": [2000, 2001], "month": [1.5, 1], "day": [1, 1]}) msg = ( r"^cannot assemble the datetimes: unconverted data remains when parsing " - r'with format ".*": "1", at position 0.' + r'with format ".*": "1".' ) with pytest.raises(ValueError, match=msg): to_datetime(df, cache=cache) @@ -2345,7 +2214,7 @@ def test_dataframe_utc_true(self): df = DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]}) result = to_datetime(df, utc=True) expected = Series( - np.array(["2015-02-04", "2016-03-05"], dtype="datetime64[ns]") + np.array(["2015-02-04", "2016-03-05"], dtype="datetime64[s]") ).dt.tz_localize("UTC") tm.assert_series_equal(result, expected) @@ -2357,7 +2226,7 @@ def test_to_datetime_barely_out_of_bounds(self): # in an in-bounds datetime arr = np.array(["2262-04-11 23:47:16.854775808"], dtype=object) - msg = "^Out of bounds nanosecond timestamp: .*, at position 0" + msg = "^Out of bounds nanosecond timestamp: .*" with pytest.raises(OutOfBoundsDatetime, match=msg): to_datetime(arr) @@ -2392,10 +2261,7 @@ def test_to_datetime_iso8601_fails(self, input, format, exact): # `format` is longer than the string, so this fails regardless of `exact` with pytest.raises( ValueError, - match=( - rf"time data \"{input}\" doesn't match format " - rf"\"{format}\", at position 0" - ), + match=(rf"time data \"{input}\" doesn't match format " rf"\"{format}\""), ): to_datetime(input, format=format, exact=exact) @@ -2414,10 +2280,9 @@ def test_to_datetime_iso8601_exact_fails(self, input, format): # `format` is shorter than the date string, so only fails with `exact=True` msg = "|".join( [ - '^unconverted data remains when parsing with format ".*": ".*"' - f", at position 0. {PARSING_ERR_MSG}$", - f'^time data ".*" doesn\'t match format ".*", at position 0. ' + '^unconverted data remains when parsing with format ".*": ".*". ' f"{PARSING_ERR_MSG}$", + f'^time data ".*" doesn\'t match format ".*". 
{PARSING_ERR_MSG}$', ] ) with pytest.raises( @@ -2458,10 +2323,7 @@ def test_to_datetime_iso8601_separator(self, input, format): # https://github.com/pandas-dev/pandas/issues/12649 with pytest.raises( ValueError, - match=( - rf"time data \"{input}\" doesn\'t match format " - rf"\"{format}\", at position 0" - ), + match=(rf"time data \"{input}\" doesn\'t match format " rf"\"{format}\""), ): to_datetime(input, format=format) @@ -2519,7 +2381,7 @@ def test_to_datetime_iso8601_non_padded(self, input, format): ) def test_to_datetime_iso8601_with_timezone_valid(self, input, format): # https://github.com/pandas-dev/pandas/issues/12649 - expected = Timestamp(2020, 1, 1, tzinfo=pytz.UTC) + expected = Timestamp(2020, 1, 1, tzinfo=timezone.utc) result = to_datetime(input, format=format) assert result == expected @@ -2551,16 +2413,15 @@ def test_to_datetime_with_space_in_series(self, cache): # GH 6428 ser = Series(["10/18/2006", "10/18/2008", " "]) msg = ( - r'^time data " " doesn\'t match format "%m/%d/%Y", ' - rf"at position 2. {PARSING_ERR_MSG}$" + r'^time data " " doesn\'t match format "%m/%d/%Y". ' rf"{PARSING_ERR_MSG}$" ) with pytest.raises(ValueError, match=msg): to_datetime(ser, errors="raise", cache=cache) result_coerce = to_datetime(ser, errors="coerce", cache=cache) - expected_coerce = Series([datetime(2006, 10, 18), datetime(2008, 10, 18), NaT]) + expected_coerce = Series( + [datetime(2006, 10, 18), datetime(2008, 10, 18), NaT] + ).dt.as_unit("s") tm.assert_series_equal(result_coerce, expected_coerce) - result_ignore = to_datetime(ser, errors="ignore", cache=cache) - tm.assert_series_equal(result_ignore, ser) @td.skip_if_not_us_locale def test_to_datetime_with_apply(self, cache): @@ -2579,7 +2440,7 @@ def test_to_datetime_timezone_name(self): assert result == expected @td.skip_if_not_us_locale - @pytest.mark.parametrize("errors", ["raise", "coerce", "ignore"]) + @pytest.mark.parametrize("errors", ["raise", "coerce"]) def test_to_datetime_with_apply_with_empty_str(self, cache, errors): # this is only locale tested with US/None locales # GH 5195, GH50251 @@ -2627,19 +2488,10 @@ def test_to_datetime_strings_vs_constructor(self, result): def test_to_datetime_unprocessable_input(self, cache): # GH 4928 # GH 21864 - result = to_datetime([1, "1"], errors="ignore", cache=cache) - - expected = Index(np.array([1, "1"], dtype="O")) - tm.assert_equal(result, expected) - msg = '^Given date string "1" not likely a datetime, at position 1$' + msg = '^Given date string "1" not likely a datetime$' with pytest.raises(ValueError, match=msg): to_datetime([1, "1"], errors="raise", cache=cache) - def test_to_datetime_unhashable_input(self, cache): - series = Series([["a"]] * 100) - result = to_datetime(series, errors="ignore", cache=cache) - tm.assert_series_equal(series, result) - def test_to_datetime_other_datetime64_units(self): # 5/25/2012 scalar = np.int64(1337904000000000).view("M8[us]") @@ -2668,6 +2520,15 @@ def test_to_datetime_overflow(self): with pytest.raises(OutOfBoundsTimedelta, match=msg): date_range(start="1/1/1700", freq="B", periods=100000) + def test_to_datetime_float_with_nans_floating_point_error(self): + # GH#58419 + ser = Series([np.nan] * 1000 + [1712219033.0], dtype=np.float64) + result = to_datetime(ser, unit="s", errors="coerce") + expected = Series( + [NaT] * 1000 + [Timestamp("2024-04-04 08:23:53")], dtype="datetime64[ns]" + ) + tm.assert_series_equal(result, expected) + def test_string_invalid_operation(self, cache): invalid = np.array(["87156549591102612381000001219H5"], 
dtype=object) # GH #51084 @@ -2680,7 +2541,7 @@ def test_string_na_nat_conversion(self, cache): strings = np.array(["1/1/2000", "1/2/2000", np.nan, "1/4/2000"], dtype=object) - expected = np.empty(4, dtype="M8[ns]") + expected = np.empty(4, dtype="M8[s]") for i, val in enumerate(strings): if isna(val): expected[i] = iNaT @@ -2702,11 +2563,6 @@ def test_string_na_nat_conversion_malformed(self, cache): with pytest.raises(ValueError, match=msg): to_datetime(malformed, errors="raise", cache=cache) - result = to_datetime(malformed, errors="ignore", cache=cache) - # GH 21864 - expected = Index(malformed, dtype=object) - tm.assert_index_equal(result, expected) - with pytest.raises(ValueError, match=msg): to_datetime(malformed, errors="raise", cache=cache) @@ -2730,7 +2586,7 @@ def test_string_na_nat_conversion_with_name(self, cache): result = to_datetime(series, cache=cache) dresult = to_datetime(dseries, cache=cache) - expected = Series(np.empty(5, dtype="M8[ns]"), index=idx) + expected = Series(np.empty(5, dtype="M8[s]"), index=idx) for i in range(5): x = series.iloc[i] if isna(x): @@ -2770,7 +2626,7 @@ def test_dayfirst(self, cache): arr = ["10/02/2014", "11/02/2014", "12/02/2014"] expected = DatetimeIndex( [datetime(2014, 2, 10), datetime(2014, 2, 11), datetime(2014, 2, 12)] - ) + ).as_unit("s") idx1 = DatetimeIndex(arr, dayfirst=True) idx2 = DatetimeIndex(np.array(arr), dayfirst=True) idx3 = to_datetime(arr, dayfirst=True, cache=cache) @@ -2794,7 +2650,7 @@ def test_dayfirst_warnings_valid_input(self): # CASE 1: valid input arr = ["31/12/2014", "10/03/2011"] expected = DatetimeIndex( - ["2014-12-31", "2011-03-10"], dtype="datetime64[ns]", freq=None + ["2014-12-31", "2011-03-10"], dtype="datetime64[s]", freq=None ) # A. dayfirst arg correct, no warning @@ -2818,7 +2674,7 @@ def test_dayfirst_warnings_invalid_input(self): ValueError, match=( r'^time data "03/30/2011" doesn\'t match format ' - rf'"%d/%m/%Y", at position 1. {PARSING_ERR_MSG}$' + rf'"%d/%m/%Y". {PARSING_ERR_MSG}$' ), ): to_datetime(arr, dayfirst=True) @@ -2889,7 +2745,7 @@ def test_to_datetime_inconsistent_format(self, cache): ser = Series(np.array(data)) msg = ( r'^time data "01-02-2011 00:00:00" doesn\'t match format ' - rf'"%m/%d/%Y %H:%M:%S", at position 1. {PARSING_ERR_MSG}$' + rf'"%m/%d/%Y %H:%M:%S". 
{PARSING_ERR_MSG}$' ) with pytest.raises(ValueError, match=msg): to_datetime(ser, cache=cache) @@ -2899,7 +2755,7 @@ def test_to_datetime_consistent_format(self, cache): ser = Series(np.array(data)) result = to_datetime(ser, cache=cache) expected = Series( - ["2011-01-01", "2011-02-01", "2011-03-01"], dtype="datetime64[ns]" + ["2011-01-01", "2011-02-01", "2011-03-01"], dtype="datetime64[s]" ) tm.assert_series_equal(result, expected) @@ -2911,9 +2767,7 @@ def test_to_datetime_series_with_nans(self, cache): ) ) result = to_datetime(ser, cache=cache) - expected = Series( - ["2011-01-01", NaT, "2011-01-03", NaT], dtype="datetime64[ns]" - ) + expected = Series(["2011-01-01", NaT, "2011-01-03", NaT], dtype="datetime64[s]") tm.assert_series_equal(result, expected) def test_to_datetime_series_start_with_nans(self, cache): @@ -2932,7 +2786,7 @@ def test_to_datetime_series_start_with_nans(self, cache): result = to_datetime(ser, cache=cache) expected = Series( - [NaT, NaT, "2011-01-01", "2011-01-02", "2011-01-03"], dtype="datetime64[ns]" + [NaT, NaT, "2011-01-01", "2011-01-02", "2011-01-03"], dtype="datetime64[s]" ) tm.assert_series_equal(result, expected) @@ -2946,6 +2800,7 @@ def test_infer_datetime_format_tz_name(self, tz_name, offset): result = to_datetime(ser) tz = timezone(timedelta(minutes=offset)) expected = Series([Timestamp("2019-02-02 08:07:13").tz_localize(tz)]) + expected = expected.dt.as_unit("s") tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -2961,7 +2816,7 @@ def test_infer_datetime_format_zero_tz(self, ts, zero_tz): # GH 41047 ser = Series([ts + zero_tz]) result = to_datetime(ser) - tz = pytz.utc if zero_tz == "Z" else None + tz = timezone.utc if zero_tz == "Z" else None expected = Series([Timestamp(ts, tz=tz)]) tm.assert_series_equal(result, expected) @@ -2979,14 +2834,6 @@ def test_to_datetime_iso8601_noleading_0s(self, cache, format): result = to_datetime(ser, format=format, cache=cache) tm.assert_series_equal(result, expected) - def test_parse_dates_infer_datetime_format_warning(self): - # GH 49024 - with tm.assert_produces_warning( - UserWarning, - match="The argument 'infer_datetime_format' is deprecated", - ): - to_datetime(["10-10-2000"], infer_datetime_format=True) - class TestDaysInMonth: # tests for issue #10154 @@ -3004,7 +2851,7 @@ def test_day_not_in_month_coerce(self, cache, arg, format): assert isna(to_datetime(arg, errors="coerce", format=format, cache=cache)) def test_day_not_in_month_raise(self, cache): - msg = "day is out of range for month: 2015-02-29, at position 0" + msg = "day is out of range for month: 2015-02-29" with pytest.raises(ValueError, match=msg): to_datetime("2015-02-29", errors="raise", cache=cache) @@ -3014,34 +2861,34 @@ def test_day_not_in_month_raise(self, cache): ( "2015-02-29", "%Y-%m-%d", - f"^day is out of range for month, at position 0. {PARSING_ERR_MSG}$", + f"^day is out of range for month. {PARSING_ERR_MSG}$", ), ( "2015-29-02", "%Y-%d-%m", - f"^day is out of range for month, at position 0. {PARSING_ERR_MSG}$", + f"^day is out of range for month. {PARSING_ERR_MSG}$", ), ( "2015-02-32", "%Y-%m-%d", - '^unconverted data remains when parsing with format "%Y-%m-%d": "2", ' - f"at position 0. {PARSING_ERR_MSG}$", + '^unconverted data remains when parsing with format "%Y-%m-%d": "2". ' + f"{PARSING_ERR_MSG}$", ), ( "2015-32-02", "%Y-%d-%m", - '^time data "2015-32-02" doesn\'t match format "%Y-%d-%m", ' - f"at position 0. {PARSING_ERR_MSG}$", + '^time data "2015-32-02" doesn\'t match format "%Y-%d-%m". 
' + f"{PARSING_ERR_MSG}$", ), ( "2015-04-31", "%Y-%m-%d", - f"^day is out of range for month, at position 0. {PARSING_ERR_MSG}$", + f"^day is out of range for month. {PARSING_ERR_MSG}$", ), ( "2015-31-04", "%Y-%d-%m", - f"^day is out of range for month, at position 0. {PARSING_ERR_MSG}$", + f"^day is out of range for month. {PARSING_ERR_MSG}$", ), ], ) @@ -3050,19 +2897,6 @@ def test_day_not_in_month_raise_value(self, cache, arg, format, msg): with pytest.raises(ValueError, match=msg): to_datetime(arg, errors="raise", format=format, cache=cache) - @pytest.mark.parametrize( - "expected, format", - [ - ["2015-02-29", None], - ["2015-02-29", "%Y-%m-%d"], - ["2015-02-29", "%Y-%m-%d"], - ["2015-04-31", "%Y-%m-%d"], - ], - ) - def test_day_not_in_month_ignore(self, cache, expected, format): - result = to_datetime(expected, errors="ignore", format=format, cache=cache) - assert result == expected - class TestDatetimeParsingWrappers: @pytest.mark.parametrize( @@ -3123,9 +2957,16 @@ def test_parsers(self, date_str, expected, cache): # https://github.com/dateutil/dateutil/issues/217 yearfirst = True - result1, _ = parsing.parse_datetime_string_with_reso( + result1, reso_attrname = parsing.parse_datetime_string_with_reso( date_str, yearfirst=yearfirst ) + + reso = { + "nanosecond": "ns", + "microsecond": "us", + "millisecond": "ms", + "second": "s", + }.get(reso_attrname, "s") result2 = to_datetime(date_str, yearfirst=yearfirst) result3 = to_datetime([date_str], yearfirst=yearfirst) # result5 is used below @@ -3140,7 +2981,7 @@ def test_parsers(self, date_str, expected, cache): for res in [result1, result2]: assert res == expected for res in [result3, result4, result6, result8, result9]: - exp = DatetimeIndex([Timestamp(expected)]) + exp = DatetimeIndex([Timestamp(expected)]).as_unit(reso) tm.assert_index_equal(res, exp) # these really need to have yearfirst, but we don't support @@ -3154,7 +2995,7 @@ def test_na_values_with_cache( self, cache, unique_nulls_fixture, unique_nulls_fixture2 ): # GH22305 - expected = Index([NaT, NaT], dtype="datetime64[ns]") + expected = Index([NaT, NaT], dtype="datetime64[s]") result = to_datetime([unique_nulls_fixture, unique_nulls_fixture2], cache=cache) tm.assert_index_equal(result, expected) @@ -3180,6 +3021,8 @@ def test_parsers_nat(self): ("20/12/21", True, False, datetime(2021, 12, 20)), ("20/12/21", False, True, datetime(2020, 12, 21)), ("20/12/21", True, True, datetime(2020, 12, 21)), + # GH 58859 + ("20201012", True, False, datetime(2020, 12, 10)), ], ) def test_parsers_dayfirst_yearfirst( @@ -3311,37 +3154,6 @@ def units(request): return request.param -@pytest.fixture -def epoch_1960(): - """Timestamp at 1960-01-01.""" - return Timestamp("1960-01-01") - - -@pytest.fixture -def units_from_epochs(): - return list(range(5)) - - -@pytest.fixture(params=["timestamp", "pydatetime", "datetime64", "str_1960"]) -def epochs(epoch_1960, request): - """Timestamp at 1960-01-01 in various forms. 
- - * Timestamp - * datetime.datetime - * numpy.datetime64 - * str - """ - assert request.param in {"timestamp", "pydatetime", "datetime64", "str_1960"} - if request.param == "timestamp": - return epoch_1960 - elif request.param == "pydatetime": - return epoch_1960.to_pydatetime() - elif request.param == "datetime64": - return epoch_1960.to_datetime64() - else: - return str(epoch_1960) - - @pytest.fixture def julian_dates(): return date_range("2014-1-1", periods=10).to_julian_date().values @@ -3399,7 +3211,18 @@ def test_invalid_origin(self, unit): with pytest.raises(ValueError, match=msg): to_datetime("2005-01-01", origin="1960-01-01", unit=unit) - def test_epoch(self, units, epochs, epoch_1960, units_from_epochs): + @pytest.mark.parametrize( + "epochs", + [ + Timestamp(1960, 1, 1), + datetime(1960, 1, 1), + "1960-01-01", + np.datetime64("1960-01-01"), + ], + ) + def test_epoch(self, units, epochs): + epoch_1960 = Timestamp(1960, 1, 1) + units_from_epochs = np.arange(5, dtype=np.int64) expected = Series( [pd.Timedelta(x, unit=units) + epoch_1960 for x in units_from_epochs] ) @@ -3416,7 +3239,7 @@ def test_epoch(self, units, epochs, epoch_1960, units_from_epochs): (datetime(1, 1, 1), OutOfBoundsDatetime), ], ) - def test_invalid_origins(self, origin, exc, units, units_from_epochs): + def test_invalid_origins(self, origin, exc, units): msg = "|".join( [ f"origin {origin} is Out of Bounds", @@ -3425,18 +3248,16 @@ def test_invalid_origins(self, origin, exc, units, units_from_epochs): ] ) with pytest.raises(exc, match=msg): - to_datetime(units_from_epochs, unit=units, origin=origin) + to_datetime(list(range(5)), unit=units, origin=origin) def test_invalid_origins_tzinfo(self): # GH16842 with pytest.raises(ValueError, match="must be tz-naive"): - to_datetime(1, unit="D", origin=datetime(2000, 1, 1, tzinfo=pytz.utc)) + to_datetime(1, unit="D", origin=datetime(2000, 1, 1, tzinfo=timezone.utc)) def test_incorrect_value_exception(self): # GH47495 - msg = ( - "Unknown datetime string format, unable to parse: yesterday, at position 1" - ) + msg = "Unknown datetime string format, unable to parse: yesterday" with pytest.raises(ValueError, match=msg): to_datetime(["today", "yesterday"]) @@ -3450,9 +3271,16 @@ def test_incorrect_value_exception(self): ) def test_to_datetime_out_of_bounds_with_format_arg(self, format, warning): # see gh-23830 - msg = r"^Out of bounds nanosecond timestamp: 2417-10-10 00:00:00, at position 0" - with pytest.raises(OutOfBoundsDatetime, match=msg): - to_datetime("2417-10-10 00:00:00", format=format) + if format is None: + res = to_datetime("2417-10-10 00:00:00.00", format=format) + assert isinstance(res, Timestamp) + assert res.year == 2417 + assert res.month == 10 + assert res.day == 10 + else: + msg = "unconverted data remains when parsing with format.*" + with pytest.raises(ValueError, match=msg): + to_datetime("2417-10-10 00:00:00.00", format=format) @pytest.mark.parametrize( "arg, origin, expected_str", @@ -3569,7 +3397,7 @@ def test_na_to_datetime(nulls_fixture, klass): assert result[0] is NaT -@pytest.mark.parametrize("errors", ["raise", "coerce", "ignore"]) +@pytest.mark.parametrize("errors", ["raise", "coerce"]) @pytest.mark.parametrize( "args, format", [ @@ -3584,7 +3412,7 @@ def test_empty_string_datetime(errors, args, format): # coerce empty string to pd.NaT result = to_datetime(td, format=format, errors=errors) - expected = Series(["2016-03-24", "2016-03-25", NaT], dtype="datetime64[ns]") + expected = Series(["2016-03-24", "2016-03-25", NaT], 
dtype="datetime64[s]") tm.assert_series_equal(expected, result) @@ -3600,7 +3428,6 @@ def test_empty_string_datetime_coerce__unit(): tm.assert_index_equal(expected, result) -@pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_monotonic_increasing_index(cache): # GH28238 cstart = start_caching_at @@ -3625,23 +3452,12 @@ def test_to_datetime_cache_coerce_50_lines_outofbounds(series_length): ) result1 = to_datetime(ser, errors="coerce", utc=True) - expected1 = Series( - [NaT] + ([Timestamp("1991-10-20 00:00:00+00:00")] * series_length) - ) - + expected1 = Series([Timestamp(x) for x in ser]) + assert expected1.dtype == "M8[us, UTC]" tm.assert_series_equal(result1, expected1) - result2 = to_datetime(ser, errors="ignore", utc=True) - - expected2 = Series( - [datetime.fromisoformat("1446-04-12 00:00:00+00:00")] - + ([datetime.fromisoformat("1991-10-20 00:00:00+00:00")] * series_length) - ) - - tm.assert_series_equal(result2, expected2) - - with pytest.raises(OutOfBoundsDatetime, match="Out of bounds nanosecond timestamp"): - to_datetime(ser, errors="raise", utc=True) + result3 = to_datetime(ser, errors="raise", utc=True) + tm.assert_series_equal(result3, expected1) def test_to_datetime_format_f_parse_nanos(): @@ -3686,39 +3502,86 @@ def test_to_datetime_mixed_or_iso_exact(exact, format): def test_to_datetime_mixed_not_necessarily_iso8601_raise(): # https://github.com/pandas-dev/pandas/issues/50411 - with pytest.raises( - ValueError, match="Time data 01-01-2000 is not ISO8601 format, at position 1" - ): + with pytest.raises(ValueError, match="Time data 01-01-2000 is not ISO8601 format"): to_datetime(["2020-01-01", "01-01-2000"], format="ISO8601") -@pytest.mark.parametrize( - ("errors", "expected"), - [ - ("coerce", DatetimeIndex(["2020-01-01 00:00:00", NaT])), - ("ignore", Index(["2020-01-01", "01-01-2000"], dtype=object)), - ], -) -def test_to_datetime_mixed_not_necessarily_iso8601_coerce(errors, expected): +def test_to_datetime_mixed_not_necessarily_iso8601_coerce(): # https://github.com/pandas-dev/pandas/issues/50411 - result = to_datetime(["2020-01-01", "01-01-2000"], format="ISO8601", errors=errors) + result = to_datetime( + ["2020-01-01", "01-01-2000"], format="ISO8601", errors="coerce" + ) + tm.assert_index_equal(result, DatetimeIndex(["2020-01-01 00:00:00", NaT])) + + +def test_to_datetime_iso8601_utc_single_naive(): + # GH#61389 + result = to_datetime("2023-10-15T14:30:00", utc=True, format="ISO8601") + expected = Timestamp("2023-10-15 14:30:00+00:00") + assert result == expected + + +def test_to_datetime_iso8601_utc_mixed_negative_offset(): + # GH#61389 + data = ["2023-10-15T10:30:00-12:00", "2023-10-15T14:30:00"] + result = to_datetime(data, utc=True, format="ISO8601") + + expected = DatetimeIndex( + [Timestamp("2023-10-15 22:30:00+00:00"), Timestamp("2023-10-15 14:30:00+00:00")] + ) tm.assert_index_equal(result, expected) -def test_ignoring_unknown_tz_deprecated(): +def test_to_datetime_iso8601_utc_mixed_positive_offset(): + # GH#61389 + data = ["2023-10-15T10:30:00+08:00", "2023-10-15T14:30:00"] + result = to_datetime(data, utc=True, format="ISO8601") + + expected = DatetimeIndex( + [Timestamp("2023-10-15 02:30:00+00:00"), Timestamp("2023-10-15 14:30:00+00:00")] + ) + tm.assert_index_equal(result, expected) + + +def test_to_datetime_iso8601_utc_mixed_both_offsets(): + # GH#61389 + data = [ + "2023-10-15T10:30:00+08:00", + "2023-10-15T12:30:00-05:00", + "2023-10-15T14:30:00", + ] + result = to_datetime(data, utc=True, format="ISO8601") + + expected = DatetimeIndex( + 
[ + Timestamp("2023-10-15 02:30:00+00:00"), + Timestamp("2023-10-15 17:30:00+00:00"), + Timestamp("2023-10-15 14:30:00+00:00"), + ] + ) + tm.assert_index_equal(result, expected) + + +def test_unknown_tz_raises(): # GH#18702, GH#51476 dtstr = "2014 Jan 9 05:15 FAKE" - msg = 'un-recognized timezone "FAKE". Dropping unrecognized timezones is deprecated' - with tm.assert_produces_warning(FutureWarning, match=msg): - res = Timestamp(dtstr) - assert res == Timestamp(dtstr[:-5]) + msg = '.*un-recognized timezone "FAKE".' + with pytest.raises(ValueError, match=msg): + Timestamp(dtstr) + + with pytest.raises(ValueError, match=msg): + to_datetime(dtstr) + with pytest.raises(ValueError, match=msg): + to_datetime([dtstr]) - with tm.assert_produces_warning(FutureWarning): - res = to_datetime(dtstr) - assert res == to_datetime(dtstr[:-5]) - with tm.assert_produces_warning(FutureWarning): - res = to_datetime([dtstr]) - tm.assert_index_equal(res, to_datetime([dtstr[:-5]])) + +def test_unformatted_input_raises(): + valid, invalid = "2024-01-01", "N" + ser = Series([valid] * start_caching_at + [invalid]) + msg = 'time data "N" doesn\'t match format "%Y-%m-%d"' + + with pytest.raises(ValueError, match=msg): + to_datetime(ser, format="%Y-%m-%d", exact=True, cache=True) def test_from_numeric_arrow_dtype(any_numeric_ea_dtype): @@ -3734,7 +3597,7 @@ def test_to_datetime_with_empty_str_utc_false_format_mixed(): # GH 50887 vals = ["2020-01-01 00:00+00:00", ""] result = to_datetime(vals, format="mixed") - expected = Index([Timestamp("2020-01-01 00:00+00:00"), "NaT"], dtype="M8[ns, UTC]") + expected = Index([Timestamp("2020-01-01 00:00+00:00"), "NaT"], dtype="M8[s, UTC]") tm.assert_index_equal(result, expected) # Check that a couple of other similar paths work the same way @@ -3745,10 +3608,10 @@ def test_to_datetime_with_empty_str_utc_false_format_mixed(): def test_to_datetime_with_empty_str_utc_false_offsets_and_format_mixed(): - # GH 50887 - msg = "parsing datetimes with mixed time zones will raise an error" + # GH#50887, GH#57275 + msg = "Mixed timezones detected. Pass utc=True in to_datetime" - with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(ValueError, match=msg): to_datetime( ["2020-01-01 00:00+00:00", "2020-01-01 00:00+02:00", ""], format="mixed" ) @@ -3762,7 +3625,7 @@ def test_to_datetime_mixed_tzs_mixed_types(): arr = [ts, dtstr] msg = ( - "Mixed timezones detected. pass utc=True in to_datetime or tz='UTC' " + "Mixed timezones detected. Pass utc=True in to_datetime or tz='UTC' " "in DatetimeIndex to convert to a common timezone" ) with pytest.raises(ValueError, match=msg): @@ -3807,7 +3670,7 @@ def test_to_datetime_mixed_types_matching_tzs(): ) @pytest.mark.parametrize("naive_first", [True, False]) def test_to_datetime_mixed_awareness_mixed_types(aware_val, naive_val, naive_first): - # GH#55793, GH#55693 + # GH#55793, GH#55693, GH#57275 # Empty string parses to NaT vals = [aware_val, naive_val, ""] @@ -3820,21 +3683,27 @@ def test_to_datetime_mixed_awareness_mixed_types(aware_val, naive_val, naive_fir # issued in _array_to_datetime_object both_strs = isinstance(aware_val, str) and isinstance(naive_val, str) has_numeric = isinstance(naive_val, (int, float)) + both_datetime = isinstance(naive_val, datetime) and isinstance(aware_val, datetime) - depr_msg = "In a future version of pandas, parsing datetimes with mixed time zones" + mixed_msg = ( + "Mixed timezones detected. 
Pass utc=True in to_datetime or tz='UTC' " + "in DatetimeIndex to convert to a common timezone" + ) first_non_null = next(x for x in vec if x != "") # if first_non_null is a not a string, _guess_datetime_format_for_array # doesn't guess a format so we don't go through array_strptime if not isinstance(first_non_null, str): # that case goes through array_strptime which has different behavior - msg = "Cannot mix tz-aware with tz-naive values" + msg = mixed_msg if naive_first and isinstance(aware_val, Timestamp): if isinstance(naive_val, Timestamp): msg = "Tz-aware datetime.datetime cannot be converted to datetime64" with pytest.raises(ValueError, match=msg): to_datetime(vec) else: + if not naive_first and both_datetime: + msg = "Cannot mix tz-aware with tz-naive values" with pytest.raises(ValueError, match=msg): to_datetime(vec) @@ -3863,21 +3732,21 @@ def test_to_datetime_mixed_awareness_mixed_types(aware_val, naive_val, naive_fir to_datetime(vec, utc=True) else: - with tm.assert_produces_warning(FutureWarning, match=depr_msg): + msg = mixed_msg + with pytest.raises(ValueError, match=msg): to_datetime(vec) # No warning/error with utc=True to_datetime(vec, utc=True) if both_strs: - with tm.assert_produces_warning(FutureWarning, match=depr_msg): + msg = mixed_msg + with pytest.raises(ValueError, match=msg): to_datetime(vec, format="mixed") - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - msg = "DatetimeIndex has mixed timezones" - with pytest.raises(TypeError, match=msg): - DatetimeIndex(vec) + with pytest.raises(ValueError, match=msg): + DatetimeIndex(vec) else: - msg = "Cannot mix tz-aware with tz-naive values" + msg = mixed_msg if naive_first and isinstance(aware_val, Timestamp): if isinstance(naive_val, Timestamp): msg = "Tz-aware datetime.datetime cannot be converted to datetime64" @@ -3886,7 +3755,18 @@ def test_to_datetime_mixed_awareness_mixed_types(aware_val, naive_val, naive_fir with pytest.raises(ValueError, match=msg): DatetimeIndex(vec) else: + if not naive_first and both_datetime: + msg = "Cannot mix tz-aware with tz-naive values" with pytest.raises(ValueError, match=msg): to_datetime(vec, format="mixed") with pytest.raises(ValueError, match=msg): DatetimeIndex(vec) + + +def test_to_datetime_wrapped_datetime64_ps(): + # GH#60341 + result = to_datetime([np.datetime64(1901901901901, "ps")]) + expected = DatetimeIndex( + ["1970-01-01 00:00:01.901901901"], dtype="datetime64[ns]", freq=None + ) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index c452382ec572b..893f526fb3eb0 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -18,7 +18,7 @@ import pandas._testing as tm -@pytest.fixture(params=[None, "ignore", "raise", "coerce"]) +@pytest.fixture(params=[None, "raise", "coerce"]) def errors(request): return request.param @@ -116,15 +116,11 @@ def test_error(data, msg): to_numeric(ser, errors="raise") -@pytest.mark.parametrize( - "errors,exp_data", [("ignore", [1, -3.14, "apple"]), ("coerce", [1, -3.14, np.nan])] -) -@pytest.mark.filterwarnings("ignore:errors='ignore' is deprecated:FutureWarning") -def test_ignore_error(errors, exp_data): +def test_ignore_error(): ser = Series([1, -3.14, "apple"]) - result = to_numeric(ser, errors=errors) + result = to_numeric(ser, errors="coerce") - expected = Series(exp_data) + expected = Series([1, -3.14, np.nan]) tm.assert_series_equal(result, expected) @@ -132,12 +128,10 @@ def 
test_ignore_error(errors, exp_data): "errors,exp", [ ("raise", 'Unable to parse string "apple" at position 2'), - ("ignore", [True, False, "apple"]), # Coerces to float. ("coerce", [1.0, 0.0, np.nan]), ], ) -@pytest.mark.filterwarnings("ignore:errors='ignore' is deprecated:FutureWarning") def test_bool_handling(errors, exp): ser = Series([True, False, "apple"]) @@ -198,27 +192,25 @@ def test_numeric_df_columns(columns): # see gh-14827 df = DataFrame( { - "a": [1.2, decimal.Decimal(3.14), decimal.Decimal("infinity"), "0.1"], + "a": [1.2, decimal.Decimal("3.14"), decimal.Decimal("infinity"), "0.1"], "b": [1.0, 2.0, 3.0, 4.0], } ) expected = DataFrame({"a": [1.2, 3.14, np.inf, 0.1], "b": [1.0, 2.0, 3.0, 4.0]}) + df[columns] = df[columns].apply(to_numeric) - df_copy = df.copy() - df_copy[columns] = df_copy[columns].apply(to_numeric) - - tm.assert_frame_equal(df_copy, expected) + tm.assert_frame_equal(df, expected) @pytest.mark.parametrize( "data,exp_data", [ ( - [[decimal.Decimal(3.14), 1.0], decimal.Decimal(1.6), 0.1], + [[decimal.Decimal("3.14"), 1.0], decimal.Decimal("1.6"), 0.1], [[3.14, 1.0], 1.6, 0.1], ), - ([np.array([decimal.Decimal(3.14), 1.0]), 0.1], [[3.14, 1.0], 0.1]), + ([np.array([decimal.Decimal("3.14"), 1.0]), 0.1], [[3.14, 1.0], 0.1]), ], ) def test_numeric_embedded_arr_likes(data, exp_data): @@ -238,7 +230,6 @@ def test_all_nan(): tm.assert_series_equal(result, expected) -@pytest.mark.filterwarnings("ignore:errors='ignore' is deprecated:FutureWarning") def test_type_check(errors): # see gh-11776 df = DataFrame({"a": [1, -3.14, 7], "b": ["4", "5", "6"]}) @@ -253,7 +244,6 @@ def test_scalar(val, signed, transform): assert to_numeric(transform(val)) == float(val) -@pytest.mark.filterwarnings("ignore:errors='ignore' is deprecated:FutureWarning") def test_really_large_scalar(large_val, signed, transform, errors): # see gh-24910 kwargs = {"errors": errors} if errors is not None else {} @@ -271,7 +261,6 @@ def test_really_large_scalar(large_val, signed, transform, errors): tm.assert_almost_equal(to_numeric(val, **kwargs), expected) -@pytest.mark.filterwarnings("ignore:errors='ignore' is deprecated:FutureWarning") def test_really_large_in_arr(large_val, signed, transform, multiple_elts, errors): # see gh-24910 kwargs = {"errors": errors} if errors is not None else {} @@ -311,7 +300,6 @@ def test_really_large_in_arr(large_val, signed, transform, multiple_elts, errors tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype)) -@pytest.mark.filterwarnings("ignore:errors='ignore' is deprecated:FutureWarning") def test_really_large_in_arr_consistent(large_val, signed, multiple_elts, errors): # see gh-24910 # @@ -331,13 +319,8 @@ def test_really_large_in_arr_consistent(large_val, signed, multiple_elts, errors to_numeric(arr, **kwargs) else: result = to_numeric(arr, **kwargs) - - if errors == "coerce": - expected = [float(i) for i in arr] - exp_dtype = float - else: - expected = arr - exp_dtype = object + expected = [float(i) for i in arr] + exp_dtype = float tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype)) @@ -346,11 +329,9 @@ def test_really_large_in_arr_consistent(large_val, signed, multiple_elts, errors "errors,checker", [ ("raise", 'Unable to parse string "fail" at position 0'), - ("ignore", lambda x: x == "fail"), ("coerce", lambda x: np.isnan(x)), ], ) -@pytest.mark.filterwarnings("ignore:errors='ignore' is deprecated:FutureWarning") def test_scalar_fail(errors, checker): scalar = "fail" @@ -403,6 +384,21 @@ def test_timedelta(transform_assert_equal): 
assert_equal(result, expected) +@pytest.mark.parametrize( + "scalar", + [ + pd.Timedelta(1, "D"), + pd.Timestamp("2017-01-01T12"), + pd.Timestamp("2017-01-01T12", tz="US/Pacific"), + ], +) +def test_timedelta_timestamp_scalar(scalar): + # GH#59944 + result = to_numeric(scalar) + expected = to_numeric(Series(scalar))[0] + assert result == expected + + def test_period(request, transform_assert_equal): transform, assert_equal = transform_assert_equal @@ -422,11 +418,9 @@ def test_period(request, transform_assert_equal): "errors,expected", [ ("raise", "Invalid object type at position 0"), - ("ignore", Series([[10.0, 2], 1.0, "apple"])), ("coerce", Series([np.nan, 1.0, np.nan])), ], ) -@pytest.mark.filterwarnings("ignore:errors='ignore' is deprecated:FutureWarning") def test_non_hashable(errors, expected): # see gh-13324 ser = Series([[10.0, 2], 1.0, "apple"]) @@ -504,19 +498,6 @@ def test_signed_downcast(data, signed_downcast): tm.assert_numpy_array_equal(res, expected) -def test_ignore_downcast_invalid_data(): - # If we can't successfully cast the given - # data to a numeric dtype, do not bother - # with the downcast parameter. - data = ["foo", 2, 3] - expected = np.array(data, dtype=object) - - msg = "errors='ignore' is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - res = to_numeric(data, errors="ignore", downcast="unsigned") - tm.assert_numpy_array_equal(res, expected) - - def test_ignore_downcast_neg_to_unsigned(): # Cannot cast to an unsigned integer # because we have a negative number. @@ -528,7 +509,6 @@ def test_ignore_downcast_neg_to_unsigned(): # Warning in 32 bit platforms -@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning") @pytest.mark.parametrize("downcast", ["integer", "signed", "unsigned"]) @pytest.mark.parametrize( "data,expected", @@ -598,21 +578,12 @@ def test_downcast_float64_to_float32(): assert series.dtype == result.dtype -@pytest.mark.parametrize( - "ser,expected", - [ - ( - Series([0, 9223372036854775808]), - Series([0, 9223372036854775808], dtype=np.uint64), - ) - ], -) -def test_downcast_uint64(ser, expected): +def test_downcast_uint64(): # see gh-14422: # BUG: to_numeric doesn't work uint64 numbers - + ser = Series([0, 9223372036854775808]) result = to_numeric(ser, downcast="unsigned") - + expected = Series([0, 9223372036854775808], dtype=np.uint64) tm.assert_series_equal(result, expected) @@ -639,26 +610,14 @@ def test_coerce_uint64_conflict(data, exp_data): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize( - "errors,exp", - [ - ("ignore", Series(["12345678901234567890", "1234567890", "ITEM"])), - ("raise", "Unable to parse string"), - ], -) -@pytest.mark.filterwarnings("ignore:errors='ignore' is deprecated:FutureWarning") -def test_non_coerce_uint64_conflict(errors, exp): +def test_non_coerce_uint64_conflict(): # see gh-17007 and gh-17125 # # For completeness. 
ser = Series(["12345678901234567890", "1234567890", "ITEM"]) - if isinstance(exp, str): - with pytest.raises(ValueError, match=exp): - to_numeric(ser, errors=errors) - else: - result = to_numeric(ser, errors=errors) - tm.assert_series_equal(result, ser) + with pytest.raises(ValueError, match="Unable to parse string"): + to_numeric(ser, errors="raise") @pytest.mark.parametrize("dc1", ["integer", "float", "unsigned"]) @@ -768,17 +727,6 @@ def test_to_numeric_from_nullable_string_coerce(nullable_string_dtype): tm.assert_series_equal(result, expected) -def test_to_numeric_from_nullable_string_ignore(nullable_string_dtype): - # GH#52146 - values = ["a", "1"] - ser = Series(values, dtype=nullable_string_dtype) - expected = ser.copy() - msg = "errors='ignore' is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = to_numeric(ser, errors="ignore") - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize( "data, input_dtype, downcast, expected_dtype", ( @@ -945,11 +893,6 @@ def test_to_numeric_dtype_backend_error(dtype_backend): with pytest.raises(ValueError, match="Unable to parse string"): to_numeric(ser, dtype_backend=dtype_backend) - msg = "errors='ignore' is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = to_numeric(ser, dtype_backend=dtype_backend, errors="ignore") - tm.assert_series_equal(result, expected) - result = to_numeric(ser, dtype_backend=dtype_backend, errors="coerce") if dtype_backend == "pyarrow": dtype = "double[pyarrow]" diff --git a/pandas/tests/tools/test_to_time.py b/pandas/tests/tools/test_to_time.py index b673bd9c2ec71..5e61f5e1a3029 100644 --- a/pandas/tests/tools/test_to_time.py +++ b/pandas/tests/tools/test_to_time.py @@ -54,10 +54,8 @@ def test_arraylike(self): assert to_time(arg, infer_time_format=True) == expected_arr assert to_time(arg, format="%I:%M%p", errors="coerce") == [None, None] - msg = "errors='ignore' is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - res = to_time(arg, format="%I:%M%p", errors="ignore") - tm.assert_numpy_array_equal(res, np.array(arg, dtype=np.object_)) + with pytest.raises(ValueError, match="errors must be"): + to_time(arg, format="%I:%M%p", errors="ignore") msg = "Cannot convert.+to a time with given format" with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/tools/test_to_timedelta.py b/pandas/tests/tools/test_to_timedelta.py index b67694f1c58c7..9ec2689069da9 100644 --- a/pandas/tests/tools/test_to_timedelta.py +++ b/pandas/tests/tools/test_to_timedelta.py @@ -6,7 +6,10 @@ import numpy as np import pytest -from pandas.compat import IS64 +from pandas.compat import ( + IS64, + WASM, +) from pandas.errors import OutOfBoundsTimedelta import pandas as pd @@ -26,18 +29,16 @@ def test_to_timedelta_dt64_raises(self): # supported GH#29794 msg = r"dtype datetime64\[ns\] cannot be converted to timedelta64\[ns\]" - ser = Series([pd.NaT]) + ser = Series([pd.NaT], dtype="M8[ns]") with pytest.raises(TypeError, match=msg): to_timedelta(ser) with pytest.raises(TypeError, match=msg): ser.to_frame().apply(to_timedelta) - @pytest.mark.parametrize("readonly", [True, False]) - def test_to_timedelta_readonly(self, readonly): + def test_to_timedelta_readonly(self, writable): # GH#34857 arr = np.array([], dtype=object) - if readonly: - arr.setflags(write=False) + arr.setflags(write=writable) result = to_timedelta(arr) expected = to_timedelta([]) tm.assert_index_equal(result, expected) @@ -55,7 +56,10 @@ def 
test_to_timedelta_same_np_timedelta64(self): def test_to_timedelta_series(self): # Series expected = Series([timedelta(days=1), timedelta(days=1, seconds=1)]) - result = to_timedelta(Series(["1d", "1days 00:00:01"])) + + msg = "'d' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + result = to_timedelta(Series(["1d", "1days 00:00:01"])) tm.assert_series_equal(result, expected) def test_to_timedelta_units(self): @@ -100,13 +104,11 @@ def test_to_timedelta_oob_non_nano(self): with pytest.raises(OutOfBoundsTimedelta, match=msg): TimedeltaArray._from_sequence(arr, dtype="m8[s]") - @pytest.mark.parametrize( - "arg", [np.arange(10).reshape(2, 5), pd.DataFrame(np.arange(10).reshape(2, 5))] - ) - @pytest.mark.parametrize("errors", ["ignore", "raise", "coerce"]) - @pytest.mark.filterwarnings("ignore:errors='ignore' is deprecated:FutureWarning") - def test_to_timedelta_dataframe(self, arg, errors): + @pytest.mark.parametrize("box", [lambda x: x, pd.DataFrame]) + @pytest.mark.parametrize("errors", ["raise", "coerce"]) + def test_to_timedelta_dataframe(self, box, errors): # GH 11776 + arg = box(np.arange(10).reshape(2, 5)) with pytest.raises(TypeError, match="1-d array"): to_timedelta(arg, errors=errors) @@ -148,32 +150,6 @@ def test_to_timedelta_bad_value_coerce(self): to_timedelta(["1 day", "bar", "1 min"], errors="coerce"), ) - def test_to_timedelta_invalid_errors_ignore(self): - # gh-13613: these should not error because errors='ignore' - msg = "errors='ignore' is deprecated" - invalid_data = "apple" - - with tm.assert_produces_warning(FutureWarning, match=msg): - result = to_timedelta(invalid_data, errors="ignore") - assert invalid_data == result - - invalid_data = ["apple", "1 days"] - expected = np.array(invalid_data, dtype=object) - - with tm.assert_produces_warning(FutureWarning, match=msg): - result = to_timedelta(invalid_data, errors="ignore") - tm.assert_numpy_array_equal(expected, result) - - invalid_data = pd.Index(["apple", "1 days"]) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = to_timedelta(invalid_data, errors="ignore") - tm.assert_index_equal(invalid_data, result) - - invalid_data = Series(["apple", "1 days"]) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = to_timedelta(invalid_data, errors="ignore") - tm.assert_series_equal(invalid_data, result) - @pytest.mark.parametrize( "val, errors", [ @@ -244,6 +220,7 @@ def test_to_timedelta_on_missing_values_list(self, val): actual = to_timedelta([val]) assert actual[0]._value == np.timedelta64("NaT").astype("int64") + @pytest.mark.skipif(WASM, reason="No fp exception support in WASM") @pytest.mark.xfail(not IS64, reason="Floating point error") def test_to_timedelta_float(self): # https://github.com/pandas-dev/pandas/issues/25077 @@ -258,13 +235,6 @@ def test_to_timedelta_coerce_strings_unit(self): expected = to_timedelta([1, 2, pd.NaT], unit="ns") tm.assert_index_equal(result, expected) - def test_to_timedelta_ignore_strings_unit(self): - arr = np.array([1, 2, "error"], dtype=object) - msg = "errors='ignore' is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = to_timedelta(arr, unit="ns", errors="ignore") - tm.assert_numpy_array_equal(result, arr) - @pytest.mark.parametrize( "expected_val, result_val", [[timedelta(days=2), 2], [None, None]] ) diff --git a/pandas/tests/tseries/frequencies/test_inference.py b/pandas/tests/tseries/frequencies/test_inference.py index 99a504f4188c1..dad5c73b89626 
100644 --- a/pandas/tests/tseries/frequencies/test_inference.py +++ b/pandas/tests/tseries/frequencies/test_inference.py @@ -23,7 +23,6 @@ date_range, period_range, ) -import pandas._testing as tm from pandas.core.arrays import ( DatetimeArray, TimedeltaArray, @@ -202,17 +201,6 @@ def test_infer_freq_custom(base_delta_code_pair, constructor): assert frequencies.infer_freq(index) is None -@pytest.mark.parametrize( - "freq,expected", [("Q", "QE-DEC"), ("Q-NOV", "QE-NOV"), ("Q-OCT", "QE-OCT")] -) -def test_infer_freq_index(freq, expected): - rng = period_range("1959Q2", "2009Q3", freq=freq) - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - rng = Index(rng.to_timestamp("D", how="e").astype(object)) - - assert rng.inferred_freq == expected - - @pytest.mark.parametrize( "expected,dates", list( @@ -231,7 +219,6 @@ def test_infer_freq_index(freq, expected): }.items() ), ) -@pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) def test_infer_freq_tz(tz_naive_fixture, expected, dates, unit): # see gh-7310, GH#55609 tz = tz_naive_fixture @@ -500,7 +487,6 @@ def test_series_datetime_index(freq): "YE@OCT", "YE@NOV", "YE@DEC", - "YE@JAN", "WOM@1MON", "WOM@2MON", "WOM@3MON", diff --git a/pandas/tests/tseries/holiday/test_calendar.py b/pandas/tests/tseries/holiday/test_calendar.py index 99829857e6836..90e2e117852a2 100644 --- a/pandas/tests/tseries/holiday/test_calendar.py +++ b/pandas/tests/tseries/holiday/test_calendar.py @@ -57,10 +57,10 @@ def __init__(self, name=None, rules=None) -> None: jan2 = TestCalendar(rules=[Holiday("jan2", year=2015, month=1, day=2)]) # Getting holidays for Jan 1 should not alter results for Jan 2. - expected = DatetimeIndex(["01-Jan-2015"]).as_unit("ns") + expected = DatetimeIndex(["01-Jan-2015"]).as_unit("us") tm.assert_index_equal(jan1.holidays(), expected) - expected2 = DatetimeIndex(["02-Jan-2015"]).as_unit("ns") + expected2 = DatetimeIndex(["02-Jan-2015"]).as_unit("us") tm.assert_index_equal(jan2.holidays(), expected2) diff --git a/pandas/tests/tseries/holiday/test_holiday.py b/pandas/tests/tseries/holiday/test_holiday.py index b2eefd04ef93b..ffe6ff0b51bcf 100644 --- a/pandas/tests/tseries/holiday/test_holiday.py +++ b/pandas/tests/tseries/holiday/test_holiday.py @@ -1,7 +1,9 @@ -from datetime import datetime +from datetime import ( + datetime, + timezone, +) import pytest -from pytz import utc from pandas import ( DatetimeIndex, @@ -128,9 +130,9 @@ def test_holiday_dates(holiday, start_date, end_date, expected): # Verify that timezone info is preserved. assert list( holiday.dates( - utc.localize(Timestamp(start_date)), utc.localize(Timestamp(end_date)) + Timestamp(start_date, tz=timezone.utc), Timestamp(end_date, tz=timezone.utc) ) - ) == [utc.localize(dt) for dt in expected] + ) == [dt.replace(tzinfo=timezone.utc) for dt in expected] @pytest.mark.parametrize( @@ -194,8 +196,10 @@ def test_holidays_within_dates(holiday, start, expected): # Verify that timezone info is preserved. 
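# The pytz-to-stdlib swap in these holiday hunks leans on an equivalence that
# can be sketched with only the public Timestamp API (pytz shown purely for
# comparison; it is otherwise being dropped here):
#
#     from datetime import timezone
#     import pandas as pd
#     import pytz
#
#     ts = pd.Timestamp("2015-01-01")
#     assert pytz.utc.localize(ts) == ts.tz_localize(timezone.utc)
#     assert ts.tz_localize(timezone.utc) == pd.Timestamp("2015-01-01", tz=timezone.utc)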
assert list( - holiday.dates(utc.localize(Timestamp(start)), utc.localize(Timestamp(start))) - ) == [utc.localize(dt) for dt in expected] + holiday.dates( + Timestamp(start, tz=timezone.utc), Timestamp(start, tz=timezone.utc) + ) + ) == [dt.replace(tzinfo=timezone.utc) for dt in expected] @@ -271,6 +275,25 @@ def test_both_offset_observance_raises(): ) +def test_list_of_list_of_offsets_raises(): + # see gh-29049 + # Test that the offsets of offsets are forbidden + holiday1 = Holiday( + "Holiday1", + month=USThanksgivingDay.month, + day=USThanksgivingDay.day, + offset=[USThanksgivingDay.offset, DateOffset(1)], + ) + msg = "Only BaseOffsets and flat lists of them are supported for offset." + with pytest.raises(ValueError, match=msg): + Holiday( + "Holiday2", + month=holiday1.month, + day=holiday1.day, + offset=[holiday1.offset, DateOffset(3)], + ) + + def test_half_open_interval_with_observance(): # Prompted by GH 49075 # Check for holidays that have a half-open date interval where diff --git a/pandas/tests/tseries/offsets/common.py b/pandas/tests/tseries/offsets/common.py index efb010addad22..e0838cceb4c7b 100644 --- a/pandas/tests/tseries/offsets/common.py +++ b/pandas/tests/tseries/offsets/common.py @@ -1,6 +1,7 @@ """ Assertion helpers and base class for offsets tests """ + from __future__ import annotations diff --git a/pandas/tests/tseries/offsets/conftest.py b/pandas/tests/tseries/offsets/conftest.py deleted file mode 100644 index 2fc846353dcb5..0000000000000 --- a/pandas/tests/tseries/offsets/conftest.py +++ /dev/null @@ -1,13 +0,0 @@ -import datetime - -import pytest - -from pandas._libs.tslibs import Timestamp - - -@pytest.fixture -def dt(): - """ - Fixture for common Timestamp. - """ - return Timestamp(datetime.datetime(2008, 1, 2)) diff --git a/pandas/tests/tseries/offsets/test_business_day.py b/pandas/tests/tseries/offsets/test_business_day.py index 7db1921369023..b1ab5fc64804b 100644 --- a/pandas/tests/tseries/offsets/test_business_day.py +++ b/pandas/tests/tseries/offsets/test_business_day.py @@ -1,6 +1,7 @@ """ Tests for offsets.BDay """ + from __future__ import annotations from datetime import ( diff --git a/pandas/tests/tseries/offsets/test_business_halfyear.py b/pandas/tests/tseries/offsets/test_business_halfyear.py new file mode 100644 index 0000000000000..9ea336b3d13f8 --- /dev/null +++ b/pandas/tests/tseries/offsets/test_business_halfyear.py @@ -0,0 +1,329 @@ +""" +Tests for the following offsets: +- BHalfYearBegin +- BHalfYearEnd +""" + +from __future__ import annotations + +from datetime import datetime + +import pytest + +from pandas.tests.tseries.offsets.common import ( + assert_is_on_offset, + assert_offset_equal, +) + +from pandas.tseries.offsets import ( + BHalfYearBegin, + BHalfYearEnd, +) + + +@pytest.mark.parametrize("klass", (BHalfYearBegin, BHalfYearEnd)) +def test_halfyearly_dont_normalize(klass): + date = datetime(2012, 3, 31, 5, 30) + result = date + klass() + assert result.time() == date.time() + + +@pytest.mark.parametrize("offset", [BHalfYearBegin(), BHalfYearEnd()]) +@pytest.mark.parametrize( + "date", + [ + datetime(2016, m, d) + for m in [7, 8, 9, 10, 11, 12] + for d in [1, 2, 3, 28, 29, 30, 31] + if not (m in {9, 11} and d == 31) + ], +) +def test_on_offset(offset, date): + res = offset.is_on_offset(date) + slow_version = date == (date + offset) - offset + assert res == slow_version + + +class TestBHalfYearBegin: + def test_repr(self): + expected = "<BHalfYearBegin: startingMonth=1>" + assert repr(BHalfYearBegin()) == expected + expected = "<BHalfYearBegin: startingMonth=3>" + assert
repr(BHalfYearBegin(startingMonth=3)) == expected + expected = "<BHalfYearBegin: startingMonth=1>" + assert repr(BHalfYearBegin(startingMonth=1)) == expected + + def test_offset_corner_case(self): + # corner + offset = BHalfYearBegin(n=-1, startingMonth=1) + assert datetime(2010, 2, 1) + offset == datetime(2010, 1, 1) + + offset_cases = [] + offset_cases.append( + ( + BHalfYearBegin(startingMonth=1), + { + datetime(2007, 12, 1): datetime(2008, 1, 1), + datetime(2008, 1, 1): datetime(2008, 7, 1), + datetime(2008, 2, 15): datetime(2008, 7, 1), + datetime(2008, 2, 29): datetime(2008, 7, 1), + datetime(2008, 3, 15): datetime(2008, 7, 1), + datetime(2008, 3, 31): datetime(2008, 7, 1), + datetime(2008, 4, 15): datetime(2008, 7, 1), + datetime(2008, 4, 1): datetime(2008, 7, 1), + datetime(2008, 7, 1): datetime(2009, 1, 1), + datetime(2008, 7, 15): datetime(2009, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + BHalfYearBegin(startingMonth=2), + { + datetime(2008, 1, 1): datetime(2008, 2, 1), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2008, 1, 15): datetime(2008, 2, 1), + datetime(2008, 2, 29): datetime(2008, 8, 1), + datetime(2008, 3, 15): datetime(2008, 8, 1), + datetime(2008, 3, 31): datetime(2008, 8, 1), + datetime(2008, 4, 15): datetime(2008, 8, 1), + datetime(2008, 4, 30): datetime(2008, 8, 1), + }, + ) + ) + + offset_cases.append( + ( + BHalfYearBegin(startingMonth=1, n=0), + { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 12, 1): datetime(2009, 1, 1), + datetime(2008, 2, 15): datetime(2008, 7, 1), + datetime(2008, 2, 29): datetime(2008, 7, 1), + datetime(2008, 3, 15): datetime(2008, 7, 1), + datetime(2008, 3, 31): datetime(2008, 7, 1), + datetime(2008, 4, 15): datetime(2008, 7, 1), + datetime(2008, 4, 30): datetime(2008, 7, 1), + datetime(2008, 7, 1): datetime(2008, 7, 1), + datetime(2008, 7, 15): datetime(2009, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + BHalfYearBegin(startingMonth=1, n=-1), + { + datetime(2008, 1, 1): datetime(2007, 7, 2), + datetime(2008, 1, 31): datetime(2008, 1, 1), + datetime(2008, 2, 15): datetime(2008, 1, 1), + datetime(2008, 2, 29): datetime(2008, 1, 1), + datetime(2008, 3, 15): datetime(2008, 1, 1), + datetime(2008, 3, 31): datetime(2008, 1, 1), + datetime(2008, 4, 15): datetime(2008, 1, 1), + datetime(2008, 4, 30): datetime(2008, 1, 1), + datetime(2008, 7, 1): datetime(2008, 1, 1), + datetime(2008, 7, 15): datetime(2008, 7, 1), + }, + ) + ) + + offset_cases.append( + ( + BHalfYearBegin(startingMonth=1, n=2), + { + datetime(2008, 1, 1): datetime(2009, 1, 1), + datetime(2008, 2, 15): datetime(2009, 1, 1), + datetime(2008, 2, 29): datetime(2009, 1, 1), + datetime(2008, 3, 15): datetime(2009, 1, 1), + datetime(2008, 3, 31): datetime(2009, 1, 1), + datetime(2008, 4, 15): datetime(2009, 1, 1), + datetime(2008, 4, 1): datetime(2009, 1, 1), + datetime(2008, 7, 15): datetime(2009, 7, 1), + datetime(2008, 7, 1): datetime(2009, 7, 1), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in cases.items(): + assert_offset_equal(offset, base, expected) + + on_offset_cases = [ + (BHalfYearBegin(1, startingMonth=1), datetime(2008, 1, 1), True), + (BHalfYearBegin(1, startingMonth=1), datetime(2007, 12, 1), False), + (BHalfYearBegin(1, startingMonth=1), datetime(2008, 2, 1), False), + (BHalfYearBegin(1, startingMonth=1), datetime(2007, 3, 1), False), + (BHalfYearBegin(1, startingMonth=1), datetime(2008, 4, 1), False), + (BHalfYearBegin(1, startingMonth=1), datetime(2008, 5, 1), False),
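# Note: BHalfYearBegin anchors on the first *business* day of the half-year,
# so a True case need not fall on the 1st; startingMonth=6 treats
# datetime(2007, 12, 3) as on-offset below because 2007-12-01 is a Saturday.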
+ (BHalfYearBegin(1, startingMonth=1), datetime(2007, 6, 1), False), + (BHalfYearBegin(1, startingMonth=3), datetime(2008, 1, 1), False), + (BHalfYearBegin(1, startingMonth=3), datetime(2007, 12, 1), False), + (BHalfYearBegin(1, startingMonth=3), datetime(2008, 2, 1), False), + (BHalfYearBegin(1, startingMonth=3), datetime(2007, 3, 1), True), + (BHalfYearBegin(1, startingMonth=3), datetime(2008, 4, 1), False), + (BHalfYearBegin(1, startingMonth=3), datetime(2008, 5, 1), False), + (BHalfYearBegin(1, startingMonth=3), datetime(2008, 5, 2), False), + (BHalfYearBegin(1, startingMonth=3), datetime(2007, 6, 1), False), + (BHalfYearBegin(1, startingMonth=3), datetime(2007, 6, 2), False), + (BHalfYearBegin(1, startingMonth=6), datetime(2008, 1, 1), False), + (BHalfYearBegin(1, startingMonth=6), datetime(2007, 12, 3), True), + (BHalfYearBegin(1, startingMonth=6), datetime(2008, 2, 1), False), + (BHalfYearBegin(1, startingMonth=6), datetime(2007, 3, 1), False), + (BHalfYearBegin(1, startingMonth=6), datetime(2007, 3, 2), False), + (BHalfYearBegin(1, startingMonth=6), datetime(2008, 4, 1), False), + (BHalfYearBegin(1, startingMonth=6), datetime(2008, 5, 1), False), + (BHalfYearBegin(1, startingMonth=6), datetime(2008, 5, 2), False), + (BHalfYearBegin(1, startingMonth=6), datetime(2007, 6, 1), True), + ] + + @pytest.mark.parametrize("case", on_offset_cases) + def test_is_on_offset(self, case): + offset, dt, expected = case + assert_is_on_offset(offset, dt, expected) + + +class TestBHalfYearEnd: + def test_repr(self): + expected = "<BHalfYearEnd: startingMonth=6>" + assert repr(BHalfYearEnd()) == expected + expected = "<BHalfYearEnd: startingMonth=3>" + assert repr(BHalfYearEnd(startingMonth=3)) == expected + expected = "<BHalfYearEnd: startingMonth=1>" + assert repr(BHalfYearEnd(startingMonth=1)) == expected + + def test_offset_corner_case(self): + # corner + offset = BHalfYearEnd(n=-1, startingMonth=1) + assert datetime(2010, 1, 30) + offset == datetime(2010, 1, 29) + + offset_cases = [] + offset_cases.append( + ( + BHalfYearEnd(startingMonth=1), + { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 7, 31), + datetime(2008, 2, 15): datetime(2008, 7, 31), + datetime(2008, 2, 29): datetime(2008, 7, 31), + datetime(2008, 3, 15): datetime(2008, 7, 31), + datetime(2008, 3, 31): datetime(2008, 7, 31), + datetime(2008, 4, 15): datetime(2008, 7, 31), + datetime(2008, 7, 31): datetime(2009, 1, 30), + }, + ) + ) + + offset_cases.append( + ( + BHalfYearEnd(startingMonth=2), + { + datetime(2008, 1, 1): datetime(2008, 2, 29), + datetime(2008, 1, 31): datetime(2008, 2, 29), + datetime(2008, 2, 15): datetime(2008, 2, 29), + datetime(2008, 2, 29): datetime(2008, 8, 29), + datetime(2008, 3, 15): datetime(2008, 8, 29), + datetime(2008, 3, 31): datetime(2008, 8, 29), + datetime(2008, 4, 15): datetime(2008, 8, 29), + datetime(2008, 8, 28): datetime(2008, 8, 29), + datetime(2008, 8, 29): datetime(2009, 2, 27), + }, + ) + ) + + offset_cases.append( + ( + BHalfYearEnd(startingMonth=1, n=0), + { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 1, 31), + datetime(2008, 2, 15): datetime(2008, 7, 31), + datetime(2008, 2, 29): datetime(2008, 7, 31), + datetime(2008, 3, 15): datetime(2008, 7, 31), + datetime(2008, 3, 31): datetime(2008, 7, 31), + datetime(2008, 4, 15): datetime(2008, 7, 31), + datetime(2008, 7, 31): datetime(2008, 7, 31), + }, + ) + ) + + offset_cases.append( + ( + BHalfYearEnd(startingMonth=1, n=-1), + { + datetime(2008, 1, 1): datetime(2007, 7, 31), + datetime(2008, 1, 31): datetime(2007, 7, 31), + datetime(2008, 2, 15):
datetime(2008, 1, 31), + datetime(2008, 2, 29): datetime(2008, 1, 31), + datetime(2008, 3, 15): datetime(2008, 1, 31), + datetime(2008, 3, 31): datetime(2008, 1, 31), + datetime(2008, 7, 15): datetime(2008, 1, 31), + datetime(2008, 7, 30): datetime(2008, 1, 31), + datetime(2008, 7, 31): datetime(2008, 1, 31), + datetime(2008, 8, 1): datetime(2008, 7, 31), + }, + ) + ) + + offset_cases.append( + ( + BHalfYearEnd(startingMonth=6, n=2), + { + datetime(2008, 1, 31): datetime(2008, 12, 31), + datetime(2008, 2, 15): datetime(2008, 12, 31), + datetime(2008, 2, 29): datetime(2008, 12, 31), + datetime(2008, 3, 15): datetime(2008, 12, 31), + datetime(2008, 3, 31): datetime(2008, 12, 31), + datetime(2008, 4, 15): datetime(2008, 12, 31), + datetime(2008, 4, 30): datetime(2008, 12, 31), + datetime(2008, 6, 30): datetime(2009, 6, 30), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in cases.items(): + assert_offset_equal(offset, base, expected) + + on_offset_cases = [ + (BHalfYearEnd(1, startingMonth=1), datetime(2008, 1, 31), True), + (BHalfYearEnd(1, startingMonth=1), datetime(2007, 12, 31), False), + (BHalfYearEnd(1, startingMonth=1), datetime(2008, 2, 29), False), + (BHalfYearEnd(1, startingMonth=1), datetime(2007, 3, 30), False), + (BHalfYearEnd(1, startingMonth=1), datetime(2007, 3, 31), False), + (BHalfYearEnd(1, startingMonth=1), datetime(2008, 4, 30), False), + (BHalfYearEnd(1, startingMonth=1), datetime(2008, 5, 30), False), + (BHalfYearEnd(1, startingMonth=1), datetime(2008, 5, 31), False), + (BHalfYearEnd(1, startingMonth=1), datetime(2007, 6, 29), False), + (BHalfYearEnd(1, startingMonth=1), datetime(2007, 6, 30), False), + (BHalfYearEnd(1, startingMonth=3), datetime(2008, 1, 31), False), + (BHalfYearEnd(1, startingMonth=3), datetime(2007, 12, 31), False), + (BHalfYearEnd(1, startingMonth=3), datetime(2008, 2, 29), False), + (BHalfYearEnd(1, startingMonth=3), datetime(2007, 3, 30), True), + (BHalfYearEnd(1, startingMonth=3), datetime(2007, 3, 31), False), + (BHalfYearEnd(1, startingMonth=3), datetime(2008, 4, 30), False), + (BHalfYearEnd(1, startingMonth=3), datetime(2008, 5, 30), False), + (BHalfYearEnd(1, startingMonth=3), datetime(2008, 5, 31), False), + (BHalfYearEnd(1, startingMonth=3), datetime(2007, 6, 29), False), + (BHalfYearEnd(1, startingMonth=3), datetime(2007, 6, 30), False), + (BHalfYearEnd(1, startingMonth=6), datetime(2008, 1, 31), False), + (BHalfYearEnd(1, startingMonth=6), datetime(2007, 12, 31), True), + (BHalfYearEnd(1, startingMonth=6), datetime(2008, 2, 29), False), + (BHalfYearEnd(1, startingMonth=6), datetime(2007, 3, 30), False), + (BHalfYearEnd(1, startingMonth=6), datetime(2007, 3, 31), False), + (BHalfYearEnd(1, startingMonth=6), datetime(2008, 4, 30), False), + (BHalfYearEnd(1, startingMonth=6), datetime(2008, 5, 30), False), + (BHalfYearEnd(1, startingMonth=6), datetime(2008, 5, 31), False), + (BHalfYearEnd(1, startingMonth=6), datetime(2007, 6, 29), True), + (BHalfYearEnd(1, startingMonth=6), datetime(2007, 6, 30), False), + ] + + @pytest.mark.parametrize("case", on_offset_cases) + def test_is_on_offset(self, case): + offset, dt, expected = case + assert_is_on_offset(offset, dt, expected) diff --git a/pandas/tests/tseries/offsets/test_business_hour.py b/pandas/tests/tseries/offsets/test_business_hour.py index 2779100f5355c..1b488dc9a47d4 100644 --- a/pandas/tests/tseries/offsets/test_business_hour.py +++ b/pandas/tests/tseries/offsets/test_business_hour.py @@ -1,6 +1,7 
@@ """ Tests for offsets.BusinessHour """ + from __future__ import annotations from datetime import ( @@ -915,28 +916,34 @@ def test_apply_nanoseconds(self): ( BusinessHour(), { - Timestamp("2014-07-04 15:00") - + Nano(5): Timestamp("2014-07-04 16:00") + Timestamp("2014-07-04 15:00") + Nano(5): Timestamp( + "2014-07-04 16:00" + ) + Nano(5), - Timestamp("2014-07-04 16:00") - + Nano(5): Timestamp("2014-07-07 09:00") + Timestamp("2014-07-04 16:00") + Nano(5): Timestamp( + "2014-07-07 09:00" + ) + Nano(5), - Timestamp("2014-07-04 16:00") - - Nano(5): Timestamp("2014-07-04 17:00") + Timestamp("2014-07-04 16:00") - Nano(5): Timestamp( + "2014-07-04 17:00" + ) - Nano(5), }, ), ( BusinessHour(-1), { - Timestamp("2014-07-04 15:00") - + Nano(5): Timestamp("2014-07-04 14:00") + Timestamp("2014-07-04 15:00") + Nano(5): Timestamp( + "2014-07-04 14:00" + ) + Nano(5), - Timestamp("2014-07-04 10:00") - + Nano(5): Timestamp("2014-07-04 09:00") + Timestamp("2014-07-04 10:00") + Nano(5): Timestamp( + "2014-07-04 09:00" + ) + Nano(5), - Timestamp("2014-07-04 10:00") - - Nano(5): Timestamp("2014-07-03 17:00") + Timestamp("2014-07-04 10:00") - Nano(5): Timestamp( + "2014-07-03 17:00" + ) - Nano(5), }, ), @@ -947,7 +954,6 @@ def test_apply_nanoseconds(self): assert_offset_equal(offset, base, expected) @pytest.mark.parametrize("td_unit", ["s", "ms", "us", "ns"]) - @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) def test_bday_ignores_timedeltas(self, unit, td_unit): # GH#55608 idx = date_range("2010/02/01", "2010/02/10", freq="12h", unit=unit) diff --git a/pandas/tests/tseries/offsets/test_business_month.py b/pandas/tests/tseries/offsets/test_business_month.py index a14451e60aa89..3ae2a115d46f7 100644 --- a/pandas/tests/tseries/offsets/test_business_month.py +++ b/pandas/tests/tseries/offsets/test_business_month.py @@ -3,6 +3,7 @@ - BMonthBegin - BMonthEnd """ + from __future__ import annotations from datetime import datetime diff --git a/pandas/tests/tseries/offsets/test_business_quarter.py b/pandas/tests/tseries/offsets/test_business_quarter.py index 44a7f16ab039d..ab3e55c4989fb 100644 --- a/pandas/tests/tseries/offsets/test_business_quarter.py +++ b/pandas/tests/tseries/offsets/test_business_quarter.py @@ -3,6 +3,7 @@ - BQuarterBegin - BQuarterEnd """ + from __future__ import annotations from datetime import datetime @@ -53,11 +54,6 @@ def test_repr(self): expected = "" assert repr(BQuarterBegin(startingMonth=1)) == expected - def test_is_anchored(self): - assert BQuarterBegin(startingMonth=1).is_anchored() - assert BQuarterBegin().is_anchored() - assert not BQuarterBegin(2, startingMonth=1).is_anchored() - def test_offset_corner_case(self): # corner offset = BQuarterBegin(n=-1, startingMonth=1) @@ -176,11 +172,6 @@ def test_repr(self): expected = "" assert repr(BQuarterEnd(startingMonth=1)) == expected - def test_is_anchored(self): - assert BQuarterEnd(startingMonth=1).is_anchored() - assert BQuarterEnd().is_anchored() - assert not BQuarterEnd(2, startingMonth=1).is_anchored() - def test_offset_corner_case(self): # corner offset = BQuarterEnd(n=-1, startingMonth=1) diff --git a/pandas/tests/tseries/offsets/test_business_year.py b/pandas/tests/tseries/offsets/test_business_year.py index 3b7a1025cc19c..cf12b166b30e4 100644 --- a/pandas/tests/tseries/offsets/test_business_year.py +++ b/pandas/tests/tseries/offsets/test_business_year.py @@ -3,6 +3,7 @@ - BYearBegin - BYearEnd """ + from __future__ import annotations from datetime import datetime diff --git a/pandas/tests/tseries/offsets/test_common.py 
b/pandas/tests/tseries/offsets/test_common.py index 5b80b8b1c4ab4..34181f28bb1a0 100644 --- a/pandas/tests/tseries/offsets/test_common.py +++ b/pandas/tests/tseries/offsets/test_common.py @@ -9,6 +9,7 @@ ) from pandas.compat import ( IS64, + WASM, is_platform_windows, ) @@ -106,15 +107,7 @@ def _offset(request): return request.param -@pytest.fixture -def dt(_offset): - if _offset in (CBMonthBegin, CBMonthEnd, BDay): - return Timestamp(2008, 1, 1) - elif _offset is (CustomBusinessHour, BusinessHour): - return Timestamp(2014, 7, 1, 10, 00) - return Timestamp(2008, 1, 2) - - +@pytest.mark.skipif(WASM, reason="OverflowError received on WASM") def test_apply_out_of_range(request, tz_naive_fixture, _offset): tz = tz_naive_fixture @@ -139,7 +132,11 @@ def test_apply_out_of_range(request, tz_naive_fixture, _offset): if tz is not None: assert t.tzinfo is not None - if isinstance(tz, tzlocal) and not IS64 and _offset is not DateOffset: + if ( + isinstance(tz, tzlocal) + and ((not IS64) or WASM) + and _offset is not DateOffset + ): # If we hit OutOfBoundsDatetime on non-64 bit machines # we'll drop out of the try clause before the next test request.applymarker( @@ -250,7 +247,8 @@ def test_sub(date, offset_box, offset2): [BusinessHour, BusinessHour()], ], ) -def test_Mult1(offset_box, offset1, dt): +def test_Mult1(offset_box, offset1): + dt = Timestamp(2008, 1, 2) assert dt + 10 * offset1 == dt + offset_box(10) assert dt + 5 * offset1 == dt + offset_box(5) diff --git a/pandas/tests/tseries/offsets/test_custom_business_day.py b/pandas/tests/tseries/offsets/test_custom_business_day.py index 519fb712d0415..d2f309dd3f33c 100644 --- a/pandas/tests/tseries/offsets/test_custom_business_day.py +++ b/pandas/tests/tseries/offsets/test_custom_business_day.py @@ -1,6 +1,7 @@ """ Tests for offsets.CustomBusinessDay / CDay """ + from datetime import ( datetime, timedelta, diff --git a/pandas/tests/tseries/offsets/test_custom_business_hour.py b/pandas/tests/tseries/offsets/test_custom_business_hour.py index 55a184f95c2d8..360ed70fa5b9e 100644 --- a/pandas/tests/tseries/offsets/test_custom_business_hour.py +++ b/pandas/tests/tseries/offsets/test_custom_business_hour.py @@ -1,6 +1,7 @@ """ Tests for offsets.CustomBusinessHour """ + from __future__ import annotations from datetime import ( @@ -269,28 +270,22 @@ def test_apply(self, apply_case): ( CustomBusinessHour(holidays=holidays), { - Timestamp("2014-07-01 15:00") - + Nano(5): Timestamp("2014-07-01 16:00") + Timestamp("2014-07-01 15:00") + Nano(5): Timestamp("2014-07-01 16:00") + Nano(5), - Timestamp("2014-07-01 16:00") - + Nano(5): Timestamp("2014-07-03 09:00") + Timestamp("2014-07-01 16:00") + Nano(5): Timestamp("2014-07-03 09:00") + Nano(5), - Timestamp("2014-07-01 16:00") - - Nano(5): Timestamp("2014-07-01 17:00") + Timestamp("2014-07-01 16:00") - Nano(5): Timestamp("2014-07-01 17:00") - Nano(5), }, ), ( CustomBusinessHour(-1, holidays=holidays), { - Timestamp("2014-07-01 15:00") - + Nano(5): Timestamp("2014-07-01 14:00") + Timestamp("2014-07-01 15:00") + Nano(5): Timestamp("2014-07-01 14:00") + Nano(5), - Timestamp("2014-07-01 10:00") - + Nano(5): Timestamp("2014-07-01 09:00") + Timestamp("2014-07-01 10:00") + Nano(5): Timestamp("2014-07-01 09:00") + Nano(5), - Timestamp("2014-07-01 10:00") - - Nano(5): Timestamp("2014-06-26 17:00") + Timestamp("2014-07-01 10:00") - Nano(5): Timestamp("2014-06-26 17:00") - Nano(5), }, ), diff --git a/pandas/tests/tseries/offsets/test_custom_business_month.py b/pandas/tests/tseries/offsets/test_custom_business_month.py index 
d226302e042d3..fd6565e3908f3 100644 --- a/pandas/tests/tseries/offsets/test_custom_business_month.py +++ b/pandas/tests/tseries/offsets/test_custom_business_month.py @@ -4,6 +4,7 @@ - CustomBusinessMonthBegin - CustomBusinessMonthEnd """ + from __future__ import annotations from datetime import ( @@ -62,30 +63,18 @@ def test_copy(self, _offset): class TestCustomBusinessMonthBegin: - @pytest.fixture - def _offset(self): - return CBMonthBegin - - @pytest.fixture - def offset(self): - return CBMonthBegin() - - @pytest.fixture - def offset2(self): - return CBMonthBegin(2) - - def test_different_normalize_equals(self, _offset): + def test_different_normalize_equals(self): # GH#21404 changed __eq__ to return False when `normalize` does not match - offset = _offset() - offset2 = _offset(normalize=True) + offset = CBMonthBegin() + offset2 = CBMonthBegin(normalize=True) assert offset != offset2 - def test_repr(self, offset, offset2): - assert repr(offset) == "<CustomBusinessMonthBegin>" - assert repr(offset2) == "<2 * CustomBusinessMonthBegins>" + def test_repr(self): + assert repr(CBMonthBegin()) == "<CustomBusinessMonthBegin>" + assert repr(CBMonthBegin(2)) == "<2 * CustomBusinessMonthBegins>" - def test_add_datetime(self, dt, offset2): - assert offset2 + dt == datetime(2008, 3, 3) + def test_add_datetime(self, dt): + assert CBMonthBegin(2) + dt == datetime(2008, 3, 3) def testRollback1(self): assert CDay(10).rollback(datetime(2007, 12, 31)) == datetime(2007, 12, 31) @@ -252,30 +241,18 @@ def test_apply_with_extra_offset(self, case): class TestCustomBusinessMonthEnd: - @pytest.fixture - def _offset(self): - return CBMonthEnd - - @pytest.fixture - def offset(self): - return CBMonthEnd() - - @pytest.fixture - def offset2(self): - return CBMonthEnd(2) - - def test_different_normalize_equals(self, _offset): + def test_different_normalize_equals(self): # GH#21404 changed __eq__ to return False when `normalize` does not match - offset = _offset() - offset2 = _offset(normalize=True) + offset = CBMonthEnd() + offset2 = CBMonthEnd(normalize=True) assert offset != offset2 - def test_repr(self, offset, offset2): - assert repr(offset) == "<CustomBusinessMonthEnd>" - assert repr(offset2) == "<2 * CustomBusinessMonthEnds>" + def test_repr(self): + assert repr(CBMonthEnd()) == "<CustomBusinessMonthEnd>" + assert repr(CBMonthEnd(2)) == "<2 * CustomBusinessMonthEnds>" - def test_add_datetime(self, dt, offset2): - assert offset2 + dt == datetime(2008, 2, 29) + def test_add_datetime(self, dt): + assert CBMonthEnd(2) + dt == datetime(2008, 2, 29) def testRollback1(self): assert CDay(10).rollback(datetime(2007, 12, 31)) == datetime(2007, 12, 31) diff --git a/pandas/tests/tseries/offsets/test_dst.py b/pandas/tests/tseries/offsets/test_dst.py index b22dc0b330817..e75958843040d 100644 --- a/pandas/tests/tseries/offsets/test_dst.py +++ b/pandas/tests/tseries/offsets/test_dst.py @@ -1,10 +1,10 @@ """ Tests for DateOffset additions over Daylight Savings Time """ + from datetime import timedelta import pytest -import pytz from pandas._libs.tslibs import Timestamp from pandas._libs.tslibs.offsets import ( @@ -29,14 +29,11 @@ YearBegin, YearEnd, ) -from pandas.errors import PerformanceWarning from pandas import DatetimeIndex import pandas._testing as tm -from pandas.util.version import Version -# error: Module has no attribute "__version__" -pytz_version = Version(pytz.__version__) # type: ignore[attr-defined] +pytz = pytest.importorskip("pytz") def get_utc_offset_hours(ts): @@ -52,7 +49,10 @@ class TestDST: # test both basic names and dateutil timezones timezone_utc_offsets = { - "US/Eastern": {"utc_offset_daylight": -4,
"utc_offset_standard": -5}, + pytz.timezone("US/Eastern"): { + "utc_offset_daylight": -4, + "utc_offset_standard": -5, + }, "dateutil/US/Pacific": {"utc_offset_daylight": -7, "utc_offset_standard": -8}, } valid_date_offsets_singular = [ @@ -73,7 +73,7 @@ class TestDST: "microseconds", ] - def _test_all_offsets(self, n, **kwds): + def _test_all_offsets(self, n, performance_warning, **kwds): valid_offsets = ( self.valid_date_offsets_plural if n > 1 @@ -81,15 +81,25 @@ def _test_all_offsets(self, n, **kwds): ) for name in valid_offsets: - self._test_offset(offset_name=name, offset_n=n, **kwds) + self._test_offset( + offset_name=name, + offset_n=n, + performance_warning=performance_warning, + **kwds, + ) - def _test_offset(self, offset_name, offset_n, tstart, expected_utc_offset): + def _test_offset( + self, offset_name, offset_n, tstart, expected_utc_offset, performance_warning + ): offset = DateOffset(**{offset_name: offset_n}) if ( offset_name in ["hour", "minute", "second", "microsecond"] and offset_n == 1 - and tstart == Timestamp("2013-11-03 01:59:59.999999-0500", tz="US/Eastern") + and tstart + == Timestamp( + "2013-11-03 01:59:59.999999-0500", tz=pytz.timezone("US/Eastern") + ) ): # This addition results in an ambiguous wall time err_msg = { @@ -98,14 +108,14 @@ def _test_offset(self, offset_name, offset_n, tstart, expected_utc_offset): "second": "2013-11-03 01:59:01.999999", "microsecond": "2013-11-03 01:59:59.000001", }[offset_name] - with pytest.raises(pytz.AmbiguousTimeError, match=err_msg): + with pytest.raises(ValueError, match=err_msg): tstart + offset # While we're here, let's check that we get the same behavior in a # vectorized path dti = DatetimeIndex([tstart]) warn_msg = "Non-vectorized DateOffset" - with pytest.raises(pytz.AmbiguousTimeError, match=err_msg): - with tm.assert_produces_warning(PerformanceWarning, match=warn_msg): + with pytest.raises(ValueError, match=err_msg): + with tm.assert_produces_warning(performance_warning, match=warn_msg): dti + offset return @@ -140,7 +150,9 @@ def _test_offset(self, offset_name, offset_n, tstart, expected_utc_offset): assert datepart_offset == offset.kwds[offset_name] else: # the offset should be the same as if it was done in UTC - assert t == (tstart.tz_convert("UTC") + offset).tz_convert("US/Pacific") + assert t == (tstart.tz_convert("UTC") + offset).tz_convert( + pytz.timezone("US/Pacific") + ) def _make_timestamp(self, string, hrs_offset, tz): if hrs_offset >= 0: @@ -149,18 +161,19 @@ def _make_timestamp(self, string, hrs_offset, tz): offset_string = f"-{(hrs_offset * -1):02}00" return Timestamp(string + offset_string).tz_convert(tz) - def test_springforward_plural(self): + def test_springforward_plural(self, performance_warning): # test moving from standard to daylight savings for tz, utc_offsets in self.timezone_utc_offsets.items(): hrs_pre = utc_offsets["utc_offset_standard"] hrs_post = utc_offsets["utc_offset_daylight"] self._test_all_offsets( n=3, + performance_warning=performance_warning, tstart=self._make_timestamp(self.ts_pre_springfwd, hrs_pre, tz), expected_utc_offset=hrs_post, ) - def test_fallback_singular(self): + def test_fallback_singular(self, performance_warning): # in the case of singular offsets, we don't necessarily know which utc # offset the new Timestamp will wind up in (the tz for 1 month may be # different from 1 second) so we don't specify an expected_utc_offset @@ -168,15 +181,17 @@ def test_fallback_singular(self): hrs_pre = utc_offsets["utc_offset_standard"] self._test_all_offsets( n=1, + 
performance_warning=performance_warning, tstart=self._make_timestamp(self.ts_pre_fallback, hrs_pre, tz), expected_utc_offset=None, ) - def test_springforward_singular(self): + def test_springforward_singular(self, performance_warning): for tz, utc_offsets in self.timezone_utc_offsets.items(): hrs_pre = utc_offsets["utc_offset_standard"] self._test_all_offsets( n=1, + performance_warning=performance_warning, tstart=self._make_timestamp(self.ts_pre_springfwd, hrs_pre, tz), expected_utc_offset=None, ) @@ -214,16 +229,6 @@ def test_all_offset_classes(self, tup): @pytest.mark.parametrize( "original_dt, target_dt, offset, tz", [ - pytest.param( - Timestamp("1900-01-01"), - Timestamp("1905-07-01"), - MonthBegin(66), - "Africa/Lagos", - marks=pytest.mark.xfail( - pytz_version < Version("2020.5") or pytz_version == Version("2022.2"), - reason="GH#41906: pytz utc transition dates changed", - ), - ), ( Timestamp("2021-10-01 01:15"), Timestamp("2021-10-31 01:15"), @@ -251,10 +256,10 @@ def test_all_offset_classes(self, tup): ], ) def test_nontick_offset_with_ambiguous_time_error(original_dt, target_dt, offset, tz): - # .apply for non-Tick offsets throws AmbiguousTimeError when the target dt + # .apply for non-Tick offsets throws ValueError when the target dt # is dst-ambiguous localized_dt = original_dt.tz_localize(tz) msg = f"Cannot infer dst time from {target_dt}, try using the 'ambiguous' argument" - with pytest.raises(pytz.AmbiguousTimeError, match=msg): + with pytest.raises(ValueError, match=msg): localized_dt + offset diff --git a/pandas/tests/tseries/offsets/test_easter.py b/pandas/tests/tseries/offsets/test_easter.py index d11a72cc1b9d5..ada72d94434a3 100644 --- a/pandas/tests/tseries/offsets/test_easter.py +++ b/pandas/tests/tseries/offsets/test_easter.py @@ -2,6 +2,7 @@ Tests for the following offsets: - Easter """ + from __future__ import annotations from datetime import datetime diff --git a/pandas/tests/tseries/offsets/test_fiscal.py b/pandas/tests/tseries/offsets/test_fiscal.py index 7f8c34bc6832e..f442b363f737d 100644 --- a/pandas/tests/tseries/offsets/test_fiscal.py +++ b/pandas/tests/tseries/offsets/test_fiscal.py @@ -1,6 +1,7 @@ """ Tests for Fiscal Year and Fiscal Quarter offset classes """ + from datetime import datetime from dateutil.relativedelta import relativedelta @@ -294,17 +295,6 @@ def test_apply(self): class TestFY5253LastOfMonthQuarter: - def test_is_anchored(self): - assert makeFY5253LastOfMonthQuarter( - startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 - ).is_anchored() - assert makeFY5253LastOfMonthQuarter( - weekday=WeekDay.SAT, startingMonth=3, qtr_with_extra_week=4 - ).is_anchored() - assert not makeFY5253LastOfMonthQuarter( - 2, startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 - ).is_anchored() - def test_equality(self): assert makeFY5253LastOfMonthQuarter( startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 diff --git a/pandas/tests/tseries/offsets/test_halfyear.py b/pandas/tests/tseries/offsets/test_halfyear.py new file mode 100644 index 0000000000000..5bb3821fd07f8 --- /dev/null +++ b/pandas/tests/tseries/offsets/test_halfyear.py @@ -0,0 +1,329 @@ +""" +Tests for the following offsets: +- HalfYearBegin +- HalfYearEnd +""" + +from __future__ import annotations + +from datetime import datetime + +import pytest + +from pandas.tests.tseries.offsets.common import ( + assert_is_on_offset, + assert_offset_equal, +) + +from pandas.tseries.offsets import ( + HalfYearBegin, + HalfYearEnd, +) + + +@pytest.mark.parametrize("klass", 
(HalfYearBegin, HalfYearEnd)) +def test_halfyearly_dont_normalize(klass): + date = datetime(2012, 3, 31, 5, 30) + result = date + klass() + assert result.time() == date.time() + + +@pytest.mark.parametrize("offset", [HalfYearBegin(), HalfYearEnd()]) +@pytest.mark.parametrize( + "date", + [ + datetime(2016, m, d) + for m in [7, 8, 9, 10, 11, 12] + for d in [1, 2, 3, 28, 29, 30, 31] + if not (m in {9, 11} and d == 31) + ], +) +def test_on_offset(offset, date): + res = offset.is_on_offset(date) + slow_version = date == (date + offset) - offset + assert res == slow_version + + +class TestHalfYearBegin: + def test_repr(self): + expected = "<HalfYearBegin: startingMonth=1>" + assert repr(HalfYearBegin()) == expected + expected = "<HalfYearBegin: startingMonth=3>" + assert repr(HalfYearBegin(startingMonth=3)) == expected + expected = "<HalfYearBegin: startingMonth=1>" + assert repr(HalfYearBegin(startingMonth=1)) == expected + + def test_offset_corner_case(self): + # corner + offset = HalfYearBegin(n=-1, startingMonth=1) + assert datetime(2010, 2, 1) + offset == datetime(2010, 1, 1) + + offset_cases = [] + offset_cases.append( + ( + HalfYearBegin(startingMonth=1), + { + datetime(2007, 12, 1): datetime(2008, 1, 1), + datetime(2008, 1, 1): datetime(2008, 7, 1), + datetime(2008, 2, 15): datetime(2008, 7, 1), + datetime(2008, 2, 29): datetime(2008, 7, 1), + datetime(2008, 3, 15): datetime(2008, 7, 1), + datetime(2008, 3, 31): datetime(2008, 7, 1), + datetime(2008, 4, 15): datetime(2008, 7, 1), + datetime(2008, 4, 1): datetime(2008, 7, 1), + datetime(2008, 7, 1): datetime(2009, 1, 1), + datetime(2008, 7, 15): datetime(2009, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + HalfYearBegin(startingMonth=2), + { + datetime(2008, 1, 1): datetime(2008, 2, 1), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2008, 1, 15): datetime(2008, 2, 1), + datetime(2008, 2, 29): datetime(2008, 8, 1), + datetime(2008, 3, 15): datetime(2008, 8, 1), + datetime(2008, 3, 31): datetime(2008, 8, 1), + datetime(2008, 4, 15): datetime(2008, 8, 1), + datetime(2008, 4, 30): datetime(2008, 8, 1), + }, + ) + ) + + offset_cases.append( + ( + HalfYearBegin(startingMonth=1, n=0), + { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 12, 1): datetime(2009, 1, 1), + datetime(2008, 2, 15): datetime(2008, 7, 1), + datetime(2008, 2, 29): datetime(2008, 7, 1), + datetime(2008, 3, 15): datetime(2008, 7, 1), + datetime(2008, 3, 31): datetime(2008, 7, 1), + datetime(2008, 4, 15): datetime(2008, 7, 1), + datetime(2008, 4, 30): datetime(2008, 7, 1), + datetime(2008, 7, 1): datetime(2008, 7, 1), + datetime(2008, 7, 15): datetime(2009, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + HalfYearBegin(startingMonth=1, n=-1), + { + datetime(2008, 1, 1): datetime(2007, 7, 1), + datetime(2008, 1, 31): datetime(2008, 1, 1), + datetime(2008, 2, 15): datetime(2008, 1, 1), + datetime(2008, 2, 29): datetime(2008, 1, 1), + datetime(2008, 3, 15): datetime(2008, 1, 1), + datetime(2008, 3, 31): datetime(2008, 1, 1), + datetime(2008, 4, 15): datetime(2008, 1, 1), + datetime(2008, 4, 30): datetime(2008, 1, 1), + datetime(2008, 7, 1): datetime(2008, 1, 1), + datetime(2008, 7, 15): datetime(2008, 7, 1), + }, + ) + ) + + offset_cases.append( + ( + HalfYearBegin(startingMonth=1, n=2), + { + datetime(2008, 1, 1): datetime(2009, 1, 1), + datetime(2008, 2, 15): datetime(2009, 1, 1), + datetime(2008, 2, 29): datetime(2009, 1, 1), + datetime(2008, 3, 15): datetime(2009, 1, 1), + datetime(2008, 3, 31): datetime(2009, 1, 1), + datetime(2008, 4, 15): datetime(2009, 1, 1), + datetime(2008, 4, 1): datetime(2009, 1, 1), + datetime(2008, 7, 15):
datetime(2009, 7, 1), + datetime(2008, 7, 1): datetime(2009, 7, 1), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in cases.items(): + assert_offset_equal(offset, base, expected) + + on_offset_cases = [ + (HalfYearBegin(1, startingMonth=1), datetime(2008, 1, 1), True), + (HalfYearBegin(1, startingMonth=1), datetime(2007, 12, 1), False), + (HalfYearBegin(1, startingMonth=1), datetime(2008, 2, 1), False), + (HalfYearBegin(1, startingMonth=1), datetime(2007, 3, 1), False), + (HalfYearBegin(1, startingMonth=1), datetime(2008, 4, 1), False), + (HalfYearBegin(1, startingMonth=1), datetime(2008, 5, 1), False), + (HalfYearBegin(1, startingMonth=1), datetime(2007, 6, 1), False), + (HalfYearBegin(1, startingMonth=3), datetime(2008, 1, 1), False), + (HalfYearBegin(1, startingMonth=3), datetime(2007, 12, 1), False), + (HalfYearBegin(1, startingMonth=3), datetime(2008, 2, 1), False), + (HalfYearBegin(1, startingMonth=3), datetime(2007, 3, 1), True), + (HalfYearBegin(1, startingMonth=3), datetime(2008, 4, 1), False), + (HalfYearBegin(1, startingMonth=3), datetime(2008, 5, 1), False), + (HalfYearBegin(1, startingMonth=3), datetime(2008, 5, 2), False), + (HalfYearBegin(1, startingMonth=3), datetime(2007, 6, 1), False), + (HalfYearBegin(1, startingMonth=3), datetime(2007, 6, 2), False), + (HalfYearBegin(1, startingMonth=6), datetime(2008, 1, 1), False), + (HalfYearBegin(1, startingMonth=6), datetime(2007, 12, 1), True), + (HalfYearBegin(1, startingMonth=6), datetime(2008, 2, 1), False), + (HalfYearBegin(1, startingMonth=6), datetime(2007, 3, 1), False), + (HalfYearBegin(1, startingMonth=6), datetime(2007, 3, 2), False), + (HalfYearBegin(1, startingMonth=6), datetime(2008, 4, 1), False), + (HalfYearBegin(1, startingMonth=6), datetime(2008, 5, 1), False), + (HalfYearBegin(1, startingMonth=6), datetime(2008, 5, 2), False), + (HalfYearBegin(1, startingMonth=6), datetime(2007, 6, 1), True), + ] + + @pytest.mark.parametrize("case", on_offset_cases) + def test_is_on_offset(self, case): + offset, dt, expected = case + assert_is_on_offset(offset, dt, expected) + + +class TestHalfYearEnd: + def test_repr(self): + expected = "<HalfYearEnd: startingMonth=6>" + assert repr(HalfYearEnd()) == expected + expected = "<HalfYearEnd: startingMonth=3>" + assert repr(HalfYearEnd(startingMonth=3)) == expected + expected = "<HalfYearEnd: startingMonth=1>" + assert repr(HalfYearEnd(startingMonth=1)) == expected + + def test_offset_corner_case(self): + # corner + offset = HalfYearEnd(n=-1, startingMonth=1) + assert datetime(2010, 2, 1) + offset == datetime(2010, 1, 31) + + offset_cases = [] + offset_cases.append( + ( + HalfYearEnd(startingMonth=1), + { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 7, 31), + datetime(2008, 2, 15): datetime(2008, 7, 31), + datetime(2008, 2, 29): datetime(2008, 7, 31), + datetime(2008, 3, 15): datetime(2008, 7, 31), + datetime(2008, 3, 31): datetime(2008, 7, 31), + datetime(2008, 4, 15): datetime(2008, 7, 31), + datetime(2008, 7, 31): datetime(2009, 1, 31), + }, + ) + ) + + offset_cases.append( + ( + HalfYearEnd(startingMonth=2), + { + datetime(2008, 1, 1): datetime(2008, 2, 29), + datetime(2008, 1, 31): datetime(2008, 2, 29), + datetime(2008, 2, 15): datetime(2008, 2, 29), + datetime(2008, 2, 29): datetime(2008, 8, 31), + datetime(2008, 3, 15): datetime(2008, 8, 31), + datetime(2008, 3, 31): datetime(2008, 8, 31), + datetime(2008, 4, 15): datetime(2008, 8, 31), + datetime(2008, 8, 30): datetime(2008, 8, 31), + datetime(2008, 8, 31): datetime(2009, 2, 28), +
}, + ) + ) + + offset_cases.append( + ( + HalfYearEnd(startingMonth=1, n=0), + { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 1, 31), + datetime(2008, 2, 15): datetime(2008, 7, 31), + datetime(2008, 2, 29): datetime(2008, 7, 31), + datetime(2008, 3, 15): datetime(2008, 7, 31), + datetime(2008, 3, 31): datetime(2008, 7, 31), + datetime(2008, 4, 15): datetime(2008, 7, 31), + datetime(2008, 7, 31): datetime(2008, 7, 31), + }, + ) + ) + + offset_cases.append( + ( + HalfYearEnd(startingMonth=1, n=-1), + { + datetime(2008, 1, 1): datetime(2007, 7, 31), + datetime(2008, 1, 31): datetime(2007, 7, 31), + datetime(2008, 2, 15): datetime(2008, 1, 31), + datetime(2008, 2, 29): datetime(2008, 1, 31), + datetime(2008, 3, 15): datetime(2008, 1, 31), + datetime(2008, 3, 31): datetime(2008, 1, 31), + datetime(2008, 7, 15): datetime(2008, 1, 31), + datetime(2008, 7, 30): datetime(2008, 1, 31), + datetime(2008, 7, 31): datetime(2008, 1, 31), + datetime(2008, 8, 1): datetime(2008, 7, 31), + }, + ) + ) + + offset_cases.append( + ( + HalfYearEnd(startingMonth=6, n=2), + { + datetime(2008, 1, 31): datetime(2008, 12, 31), + datetime(2008, 2, 15): datetime(2008, 12, 31), + datetime(2008, 2, 29): datetime(2008, 12, 31), + datetime(2008, 3, 15): datetime(2008, 12, 31), + datetime(2008, 3, 31): datetime(2008, 12, 31), + datetime(2008, 4, 15): datetime(2008, 12, 31), + datetime(2008, 4, 30): datetime(2008, 12, 31), + datetime(2008, 6, 30): datetime(2009, 6, 30), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in cases.items(): + assert_offset_equal(offset, base, expected) + + on_offset_cases = [ + (HalfYearEnd(1, startingMonth=1), datetime(2008, 1, 31), True), + (HalfYearEnd(1, startingMonth=1), datetime(2007, 12, 31), False), + (HalfYearEnd(1, startingMonth=1), datetime(2008, 2, 29), False), + (HalfYearEnd(1, startingMonth=1), datetime(2007, 3, 30), False), + (HalfYearEnd(1, startingMonth=1), datetime(2007, 3, 31), False), + (HalfYearEnd(1, startingMonth=1), datetime(2008, 4, 30), False), + (HalfYearEnd(1, startingMonth=1), datetime(2008, 5, 30), False), + (HalfYearEnd(1, startingMonth=1), datetime(2008, 5, 31), False), + (HalfYearEnd(1, startingMonth=1), datetime(2007, 6, 29), False), + (HalfYearEnd(1, startingMonth=1), datetime(2007, 6, 30), False), + (HalfYearEnd(1, startingMonth=3), datetime(2008, 1, 31), False), + (HalfYearEnd(1, startingMonth=3), datetime(2007, 12, 31), False), + (HalfYearEnd(1, startingMonth=3), datetime(2008, 2, 29), False), + (HalfYearEnd(1, startingMonth=3), datetime(2007, 3, 30), False), + (HalfYearEnd(1, startingMonth=3), datetime(2007, 3, 31), True), + (HalfYearEnd(1, startingMonth=3), datetime(2008, 4, 30), False), + (HalfYearEnd(1, startingMonth=3), datetime(2008, 5, 30), False), + (HalfYearEnd(1, startingMonth=3), datetime(2008, 5, 31), False), + (HalfYearEnd(1, startingMonth=3), datetime(2007, 6, 29), False), + (HalfYearEnd(1, startingMonth=3), datetime(2007, 6, 30), False), + (HalfYearEnd(1, startingMonth=6), datetime(2008, 1, 31), False), + (HalfYearEnd(1, startingMonth=6), datetime(2007, 12, 31), True), + (HalfYearEnd(1, startingMonth=6), datetime(2008, 2, 29), False), + (HalfYearEnd(1, startingMonth=6), datetime(2007, 3, 30), False), + (HalfYearEnd(1, startingMonth=6), datetime(2007, 3, 31), False), + (HalfYearEnd(1, startingMonth=6), datetime(2008, 4, 30), False), + (HalfYearEnd(1, startingMonth=6), datetime(2008, 5, 30), False), + (HalfYearEnd(1, 
startingMonth=6), datetime(2008, 5, 31), False), + (HalfYearEnd(1, startingMonth=6), datetime(2007, 6, 29), False), + (HalfYearEnd(1, startingMonth=6), datetime(2007, 6, 30), True), + ] + + @pytest.mark.parametrize("case", on_offset_cases) + def test_is_on_offset(self, case): + offset, dt, expected = case + assert_is_on_offset(offset, dt, expected) diff --git a/pandas/tests/tseries/offsets/test_index.py b/pandas/tests/tseries/offsets/test_index.py index 7a62944556d11..4fb9815ba92bb 100644 --- a/pandas/tests/tseries/offsets/test_index.py +++ b/pandas/tests/tseries/offsets/test_index.py @@ -1,6 +1,7 @@ """ Tests for offset behavior with indices. """ + import pytest from pandas import ( diff --git a/pandas/tests/tseries/offsets/test_month.py b/pandas/tests/tseries/offsets/test_month.py index 2b643999c3ad3..4dd494d0872a1 100644 --- a/pandas/tests/tseries/offsets/test_month.py +++ b/pandas/tests/tseries/offsets/test_month.py @@ -5,6 +5,7 @@ - MonthBegin - MonthEnd """ + from __future__ import annotations from datetime import datetime diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index ddf56e68b1611..f5c2c06162fcb 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -1,6 +1,7 @@ """ Tests of pandas.tseries.offsets """ + from __future__ import annotations from datetime import ( @@ -25,7 +26,6 @@ to_offset, ) from pandas._libs.tslibs.period import INVALID_FREQ_ERR_MSG -from pandas.errors import PerformanceWarning from pandas import ( DataFrame, @@ -160,6 +160,10 @@ def expecteds(): "BQuarterBegin": Timestamp("2011-03-01 09:00:00"), "QuarterEnd": Timestamp("2011-03-31 09:00:00"), "BQuarterEnd": Timestamp("2011-03-31 09:00:00"), + "HalfYearBegin": Timestamp("2011-07-01 09:00:00"), + "HalfYearEnd": Timestamp("2011-06-30 09:00:00"), + "BHalfYearBegin": Timestamp("2011-01-03 09:00:00"), + "BHalfYearEnd": Timestamp("2011-06-30 09:00:00"), "BusinessHour": Timestamp("2011-01-03 10:00:00"), "CustomBusinessHour": Timestamp("2011-01-03 10:00:00"), "WeekOfMonth": Timestamp("2011-01-08 09:00:00"), @@ -325,6 +329,7 @@ def test_rollforward(self, offset_types, expecteds): "MonthBegin", "SemiMonthBegin", "YearBegin", + "HalfYearBegin", "Week", "Hour", "Minute", @@ -351,6 +356,7 @@ def test_rollforward(self, offset_types, expecteds): "MonthBegin": Timestamp("2011-02-01 00:00:00"), "SemiMonthBegin": Timestamp("2011-01-15 00:00:00"), "YearBegin": Timestamp("2012-01-01 00:00:00"), + "HalfYearBegin": Timestamp("2011-07-01 00:00:00"), "Week": Timestamp("2011-01-08 00:00:00"), "Hour": Timestamp("2011-01-01 00:00:00"), "Minute": Timestamp("2011-01-01 00:00:00"), @@ -388,6 +394,10 @@ def test_rollback(self, offset_types): "BQuarterBegin": Timestamp("2010-12-01 09:00:00"), "QuarterEnd": Timestamp("2010-12-31 09:00:00"), "BQuarterEnd": Timestamp("2010-12-31 09:00:00"), + "HalfYearBegin": Timestamp("2010-07-01 09:00:00"), + "HalfYearEnd": Timestamp("2010-12-31 09:00:00"), + "BHalfYearBegin": Timestamp("2010-07-01 09:00:00"), + "BHalfYearEnd": Timestamp("2010-12-31 09:00:00"), "BusinessHour": Timestamp("2010-12-31 17:00:00"), "CustomBusinessHour": Timestamp("2010-12-31 17:00:00"), "WeekOfMonth": Timestamp("2010-12-11 09:00:00"), @@ -403,6 +413,7 @@ def test_rollback(self, offset_types): "MonthBegin", "SemiMonthBegin", "YearBegin", + "HalfYearBegin", "Week", "Hour", "Minute", @@ -425,6 +436,7 @@ def test_rollback(self, offset_types): "MonthBegin": Timestamp("2010-12-01 00:00:00"), "SemiMonthBegin": 
Timestamp("2010-12-15 00:00:00"), "YearBegin": Timestamp("2010-01-01 00:00:00"), + "HalfYearBegin": Timestamp("2010-07-01 00:00:00"), "Week": Timestamp("2010-12-25 00:00:00"), "Hour": Timestamp("2011-01-01 00:00:00"), "Minute": Timestamp("2011-01-01 00:00:00"), @@ -500,14 +512,15 @@ def test_add(self, offset_types, tz_naive_fixture, expecteds): assert isinstance(result, Timestamp) assert result == expected_localize - def test_add_empty_datetimeindex(self, offset_types, tz_naive_fixture): + def test_add_empty_datetimeindex( + self, performance_warning, offset_types, tz_naive_fixture + ): # GH#12724, GH#30336 offset_s = _create_offset(offset_types) dti = DatetimeIndex([], tz=tz_naive_fixture).as_unit("ns") - warn = None - if isinstance( + if not isinstance( offset_s, ( Easter, @@ -523,23 +536,31 @@ def test_add_empty_datetimeindex(self, offset_types, tz_naive_fixture): ), ): # We don't have an optimized apply_index - warn = PerformanceWarning + performance_warning = False # stacklevel checking is slow, and we have ~800 of variants of this # test, so let's only check the stacklevel in a subset of them check_stacklevel = tz_naive_fixture is None - with tm.assert_produces_warning(warn, check_stacklevel=check_stacklevel): + with tm.assert_produces_warning( + performance_warning, check_stacklevel=check_stacklevel + ): result = dti + offset_s tm.assert_index_equal(result, dti) - with tm.assert_produces_warning(warn, check_stacklevel=check_stacklevel): + with tm.assert_produces_warning( + performance_warning, check_stacklevel=check_stacklevel + ): result = offset_s + dti tm.assert_index_equal(result, dti) dta = dti._data - with tm.assert_produces_warning(warn, check_stacklevel=check_stacklevel): + with tm.assert_produces_warning( + performance_warning, check_stacklevel=check_stacklevel + ): result = dta + offset_s tm.assert_equal(result, dta) - with tm.assert_produces_warning(warn, check_stacklevel=check_stacklevel): + with tm.assert_produces_warning( + performance_warning, check_stacklevel=check_stacklevel + ): result = offset_s + dta tm.assert_equal(result, dta) @@ -624,10 +645,6 @@ def test_constructor(self, kwd, request): def test_default_constructor(self, dt): assert (dt + DateOffset(2)) == datetime(2008, 1, 4) - def test_is_anchored(self): - assert not DateOffset(2).is_anchored() - assert DateOffset(1).is_anchored() - def test_copy(self): assert DateOffset(months=2).copy() == DateOffset(months=2) assert DateOffset(milliseconds=1).copy() == DateOffset(milliseconds=1) @@ -783,9 +800,7 @@ def test_get_offset(): pairs = [ ("B", BDay()), - ("b", BDay()), - ("bme", BMonthEnd()), - ("Bme", BMonthEnd()), + ("BME", BMonthEnd()), ("W-MON", Week(weekday=0)), ("W-TUE", Week(weekday=1)), ("W-WED", Week(weekday=2)), @@ -796,8 +811,7 @@ def test_get_offset(): for name, expected in pairs: offset = _get_offset(name) assert offset == expected, ( - f"Expected {repr(name)} to yield {repr(expected)} " - f"(actual: {repr(offset)})" + f"Expected {name!r} to yield {expected!r} (actual: {offset!r})" ) @@ -847,7 +861,20 @@ def test_rule_code(self): "NOV", "DEC", ] - base_lst = ["YE", "YS", "BYE", "BYS", "QE", "QS", "BQE", "BQS"] + base_lst = [ + "YE", + "YS", + "BYE", + "BYS", + "QE", + "QS", + "BQE", + "BQS", + "HYE", + "HYS", + "BHYE", + "BHYS", + ] for base in base_lst: for v in suffix_lst: alias = "-".join([base, v]) @@ -866,7 +893,20 @@ def test_freq_offsets(): class TestReprNames: def test_str_for_named_is_name(self): # look at all the amazing combinations! 
- month_prefixes = ["YE", "YS", "BYE", "BYS", "QE", "BQE", "BQS", "QS"] + month_prefixes = [ + "YE", + "YS", + "BYE", + "BYS", + "QE", + "BQE", + "BQS", + "QS", + "HYE", + "HYS", + "BHYE", + "BHYS", + ] names = [ prefix + "-" + month for prefix in month_prefixes @@ -1116,7 +1156,7 @@ def test_offset_multiplication( tm.assert_series_equal(resultarray, expectedarray) -def test_dateoffset_operations_on_dataframes(): +def test_dateoffset_operations_on_dataframes(performance_warning): # GH 47953 df = DataFrame({"T": [Timestamp("2019-04-30")], "D": [DateOffset(months=1)]}) frameresult1 = df["T"] + 26 * df["D"] @@ -1127,7 +1167,7 @@ def test_dateoffset_operations_on_dataframes(): } ) expecteddate = Timestamp("2021-06-30") - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning(performance_warning): frameresult2 = df2["T"] + 26 * df2["D"] assert frameresult1[0] == expecteddate @@ -1180,3 +1220,10 @@ def test_is_yqm_start_end(): for ts, value in tests: assert ts == value + + +@pytest.mark.parametrize("left", [DateOffset(1), Nano(1)]) +@pytest.mark.parametrize("right", [DateOffset(1), Nano(1)]) +def test_multiply_dateoffset_typeerror(left, right): + with pytest.raises(TypeError, match="Cannot multiply"): + left * right diff --git a/pandas/tests/tseries/offsets/test_offsets_properties.py b/pandas/tests/tseries/offsets/test_offsets_properties.py index 1b4fa9292c403..809d8f87b2c02 100644 --- a/pandas/tests/tseries/offsets/test_offsets_properties.py +++ b/pandas/tests/tseries/offsets/test_offsets_properties.py @@ -7,12 +7,16 @@ You may wish to consult the previous version for inspiration on further tests, or when trying to pin down the bugs exposed by the tests below. """ + +import zoneinfo + from hypothesis import ( assume, given, ) import pytest -import pytz + +from pandas.compat import WASM import pandas as pd from pandas._testing._hypothesis import ( @@ -28,16 +32,25 @@ @given(DATETIME_JAN_1_1900_OPTIONAL_TZ, YQM_OFFSET) def test_on_offset_implementations(dt, offset): assume(not offset.normalize) + # This case is flaky in CI 2024-11-04 + assume( + not ( + WASM + and isinstance(dt.tzinfo, zoneinfo.ZoneInfo) + and dt.tzinfo.key == "Indian/Cocos" + and isinstance(offset, pd.offsets.MonthBegin) + ) + ) # check that the class-specific implementations of is_on_offset match # the general case definition: # (dt + offset) - offset == dt try: compare = (dt + offset) - offset - except (pytz.NonExistentTimeError, pytz.AmbiguousTimeError): + except ValueError: # When dt + offset does not exist or is DST-ambiguous, assume(False) to # indicate to hypothesis that this is not a valid test case # DST-ambiguous example (GH41906): - # dt = datetime.datetime(1900, 1, 1, tzinfo=pytz.timezone('Africa/Kinshasa')) + # dt = datetime.datetime(1900, 1, 1, tzinfo=ZoneInfo('Africa/Kinshasa')) # offset = MonthBegin(66) assume(False) diff --git a/pandas/tests/tseries/offsets/test_quarter.py b/pandas/tests/tseries/offsets/test_quarter.py index d183645da507d..b92ff9d39a3ca 100644 --- a/pandas/tests/tseries/offsets/test_quarter.py +++ b/pandas/tests/tseries/offsets/test_quarter.py @@ -3,6 +3,7 @@ - QuarterBegin - QuarterEnd """ + from __future__ import annotations from datetime import datetime @@ -52,11 +53,6 @@ def test_repr(self): expected = "<QuarterBegin: startingMonth=1>" assert repr(QuarterBegin(startingMonth=1)) == expected - def test_is_anchored(self): - assert QuarterBegin(startingMonth=1).is_anchored() - assert QuarterBegin().is_anchored() - assert not QuarterBegin(2, startingMonth=1).is_anchored() - def
test_offset_corner_case(self): # corner offset = QuarterBegin(n=-1, startingMonth=1) @@ -160,11 +156,6 @@ def test_repr(self): expected = "<QuarterEnd: startingMonth=1>" assert repr(QuarterEnd(startingMonth=1)) == expected - def test_is_anchored(self): - assert QuarterEnd(startingMonth=1).is_anchored() - assert QuarterEnd().is_anchored() - assert not QuarterEnd(2, startingMonth=1).is_anchored() - def test_offset_corner_case(self): # corner offset = QuarterEnd(n=-1, startingMonth=1) diff --git a/pandas/tests/tseries/offsets/test_ticks.py b/pandas/tests/tseries/offsets/test_ticks.py index b68b91826bc6f..46b6846ad1ec2 100644 --- a/pandas/tests/tseries/offsets/test_ticks.py +++ b/pandas/tests/tseries/offsets/test_ticks.py @@ -1,6 +1,7 @@ """ Tests for offsets.Tick and subclasses """ + from datetime import ( datetime, timedelta, @@ -15,7 +16,6 @@ import pytest from pandas._libs.tslibs.offsets import delta_to_tick -from pandas.errors import OutOfBoundsTimedelta from pandas import ( Timedelta, @@ -238,16 +238,6 @@ def test_tick_addition(kls, expected): assert result == expected -def test_tick_delta_overflow(): - # GH#55503 raise OutOfBoundsTimedelta, not OverflowError - tick = offsets.Day(10**9) - msg = "Cannot cast 1000000000 days 00:00:00 to unit='ns' without overflow" - depr_msg = "Day.delta is deprecated" - with pytest.raises(OutOfBoundsTimedelta, match=msg): - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - tick.delta - - @pytest.mark.parametrize("cls", tick_classes) def test_tick_division(cls): off = cls(10) @@ -299,8 +289,7 @@ def test_tick_rdiv(cls): td64 = delta.to_timedelta64() instance__type = ".".join([cls.__module__, cls.__name__]) msg = ( - "unsupported operand type\\(s\\) for \\/: 'int'|'float' and " - f"'{instance__type}'" + f"unsupported operand type\\(s\\) for \\/: 'int'|'float' and '{instance__type}'" ) with pytest.raises(TypeError, match=msg): @@ -337,11 +326,6 @@ def test_tick_equalities(cls): assert cls() == cls(1) -@pytest.mark.parametrize("cls", tick_classes) -def test_tick_offset(cls): - assert not cls().is_anchored() - - @pytest.mark.parametrize("cls", tick_classes) def test_compare_ticks(cls): three = cls(3) diff --git a/pandas/tests/tseries/offsets/test_week.py b/pandas/tests/tseries/offsets/test_week.py index f42ff091af277..d34a7953121c1 100644 --- a/pandas/tests/tseries/offsets/test_week.py +++ b/pandas/tests/tseries/offsets/test_week.py @@ -4,6 +4,7 @@ - WeekOfMonth - LastWeekOfMonth """ + from __future__ import annotations from datetime import ( @@ -41,12 +42,6 @@ def test_corner(self): with pytest.raises(ValueError, match="Day must be"): Week(weekday=-1) - def test_is_anchored(self): - assert Week(weekday=0).is_anchored() - assert not Week().is_anchored() - assert not Week(2, weekday=2).is_anchored() - assert not Week(2).is_anchored() - offset_cases = [] # not business week offset_cases.append( diff --git a/pandas/tests/tseries/offsets/test_year.py b/pandas/tests/tseries/offsets/test_year.py index 28cbdcf6abecc..9d2a0b20e1e7c 100644 --- a/pandas/tests/tseries/offsets/test_year.py +++ b/pandas/tests/tseries/offsets/test_year.py @@ -3,6 +3,7 @@ - YearBegin - YearEnd """ + from __future__ import annotations from datetime import datetime diff --git a/pandas/tests/tslibs/test_array_to_datetime.py b/pandas/tests/tslibs/test_array_to_datetime.py index 632d3b4cc3c84..fc0000553049e 100644 --- a/pandas/tests/tslibs/test_array_to_datetime.py +++ b/pandas/tests/tslibs/test_array_to_datetime.py @@ -15,7 +15,6 @@ tslib, ) from pandas._libs.tslibs.dtypes import NpyDatetimeUnit -from
pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime from pandas import Timestamp import pandas._testing as tm @@ -156,7 +155,7 @@ def test_parsing_valid_dates(data, expected): arr = np.array(data, dtype=object) result, _ = tslib.array_to_datetime(arr) - expected = np.array(expected, dtype="M8[ns]") + expected = np.array(expected, dtype="M8[s]") tm.assert_numpy_array_equal(result, expected) @@ -174,6 +173,8 @@ def test_parsing_timezone_offsets(dt_string, expected_tz): # to the same datetime after the timezone offset is added. arr = np.array(["01-01-2013 00:00:00"], dtype=object) expected, _ = tslib.array_to_datetime(arr) + if "000000000" in dt_string: + expected = expected.astype("M8[ns]") arr = np.array([dt_string], dtype=object) result, result_tz = tslib.array_to_datetime(arr) @@ -200,88 +201,57 @@ def test_parsing_different_timezone_offsets(): data = ["2015-11-18 15:30:00+05:30", "2015-11-18 15:30:00+06:30"] data = np.array(data, dtype=object) - msg = "parsing datetimes with mixed time zones will raise an error" - with tm.assert_produces_warning(FutureWarning, match=msg): - result, result_tz = tslib.array_to_datetime(data) - expected = np.array( - [ - datetime(2015, 11, 18, 15, 30, tzinfo=tzoffset(None, 19800)), - datetime(2015, 11, 18, 15, 30, tzinfo=tzoffset(None, 23400)), - ], - dtype=object, - ) - - tm.assert_numpy_array_equal(result, expected) - assert result_tz is None - - -@pytest.mark.parametrize( - "data", [["-352.737091", "183.575577"], ["1", "2", "3", "4", "5"]] -) -def test_number_looking_strings_not_into_datetime(data): - # see gh-4601 - # - # These strings don't look like datetimes, so - # they shouldn't be attempted to be converted. - arr = np.array(data, dtype=object) - result, _ = tslib.array_to_datetime(arr, errors="ignore") - - tm.assert_numpy_array_equal(result, arr) + msg = "Mixed timezones detected. Pass utc=True in to_datetime" + with pytest.raises(ValueError, match=msg): + tslib.array_to_datetime(data) @pytest.mark.parametrize( - "invalid_date", + "invalid_date,exp_unit", [ - date(1000, 1, 1), - datetime(1000, 1, 1), - "1000-01-01", - "Jan 1, 1000", - np.datetime64("1000-01-01"), + (date(1000, 1, 1), "s"), + (datetime(1000, 1, 1), "us"), + ("1000-01-01", "s"), + ("Jan 1, 1000", "s"), + (np.datetime64("1000-01-01"), "s"), ], ) @pytest.mark.parametrize("errors", ["coerce", "raise"]) -def test_coerce_outside_ns_bounds(invalid_date, errors): +def test_coerce_outside_ns_bounds(invalid_date, exp_unit, errors): arr = np.array([invalid_date], dtype="object") - kwargs = {"values": arr, "errors": errors} - - if errors == "raise": - msg = "^Out of bounds nanosecond timestamp: .*, at position 0$" - with pytest.raises(OutOfBoundsDatetime, match=msg): - tslib.array_to_datetime(**kwargs) - else: # coerce. 
- result, _ = tslib.array_to_datetime(**kwargs) - expected = np.array([iNaT], dtype="M8[ns]") + result, _ = tslib.array_to_datetime(arr, errors=errors) + out_reso = np.datetime_data(result.dtype)[0] + assert out_reso == exp_unit + ts = Timestamp(invalid_date) + assert ts.unit == exp_unit - tm.assert_numpy_array_equal(result, expected) + expected = np.array([ts._value], dtype=f"M8[{exp_unit}]") + tm.assert_numpy_array_equal(result, expected) def test_coerce_outside_ns_bounds_one_valid(): arr = np.array(["1/1/1000", "1/1/2000"], dtype=object) result, _ = tslib.array_to_datetime(arr, errors="coerce") - expected = [iNaT, "2000-01-01T00:00:00.000000000"] - expected = np.array(expected, dtype="M8[ns]") + expected = ["1000-01-01T00:00:00.000000000", "2000-01-01T00:00:00.000000000"] + expected = np.array(expected, dtype="M8[s]") tm.assert_numpy_array_equal(result, expected) -@pytest.mark.parametrize("errors", ["ignore", "coerce"]) -def test_coerce_of_invalid_datetimes(errors): +def test_coerce_of_invalid_datetimes(): arr = np.array(["01-01-2013", "not_a_date", "1"], dtype=object) - kwargs = {"values": arr, "errors": errors} + # With coercing, the invalid dates becomes iNaT + result, _ = tslib.array_to_datetime(arr, errors="coerce") + expected = ["2013-01-01T00:00:00.000000000", iNaT, iNaT] + tm.assert_numpy_array_equal(result, np.array(expected, dtype="M8[s]")) - if errors == "ignore": - # Without coercing, the presence of any invalid - # dates prevents any values from being converted. - result, _ = tslib.array_to_datetime(**kwargs) - tm.assert_numpy_array_equal(result, arr) - else: # coerce. - # With coercing, the invalid dates becomes iNaT - result, _ = tslib.array_to_datetime(arr, errors="coerce") - expected = ["2013-01-01T00:00:00.000000000", iNaT, iNaT] + # With coercing, the invalid dates becomes iNaT + result, _ = tslib.array_to_datetime(arr, errors="coerce") + expected = ["2013-01-01T00:00:00.000000000", iNaT, iNaT] - tm.assert_numpy_array_equal(result, np.array(expected, dtype="M8[ns]")) + tm.assert_numpy_array_equal(result, np.array(expected, dtype="M8[s]")) def test_to_datetime_barely_out_of_bounds(): @@ -290,31 +260,41 @@ def test_to_datetime_barely_out_of_bounds(): # Close enough to bounds that dropping nanos # would result in an in-bounds datetime. arr = np.array(["2262-04-11 23:47:16.854775808"], dtype=object) - msg = "^Out of bounds nanosecond timestamp: 2262-04-11 23:47:16, at position 0$" + msg = "^Out of bounds nanosecond timestamp: 2262-04-11 23:47:16$" with pytest.raises(tslib.OutOfBoundsDatetime, match=msg): tslib.array_to_datetime(arr) -class SubDatetime(datetime): - pass - - @pytest.mark.parametrize( - "data,expected", + "timestamp", [ - ([SubDatetime(2000, 1, 1)], ["2000-01-01T00:00:00.000000000"]), - ([datetime(2000, 1, 1)], ["2000-01-01T00:00:00.000000000"]), - ([Timestamp(2000, 1, 1)], ["2000-01-01T00:00:00.000000000"]), + # Close enough to bounds that scaling micros to nanos overflows + # but adding nanos would result in an in-bounds datetime. 
+ "1677-09-21T00:12:43.145224193", + "1677-09-21T00:12:43.145224999", + # this always worked + "1677-09-21T00:12:43.145225000", ], ) -def test_datetime_subclass(data, expected): +def test_to_datetime_barely_inside_bounds(timestamp): + # see gh-57150 + result, _ = tslib.array_to_datetime(np.array([timestamp], dtype=object)) + tm.assert_numpy_array_equal(result, np.array([timestamp], dtype="M8[ns]")) + + +class SubDatetime(datetime): + pass + + +@pytest.mark.parametrize("klass", [SubDatetime, datetime, Timestamp]) +def test_datetime_subclass(klass): # GH 25851 # ensure that subclassed datetime works with # array_to_datetime - arr = np.array(data, dtype=object) + arr = np.array([klass(2000, 1, 1)], dtype=object) result, _ = tslib.array_to_datetime(arr) - expected = np.array(expected, dtype="M8[ns]") + expected = np.array(["2000-01-01T00:00:00.000000"], dtype="M8[us]") tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/tslibs/test_conversion.py b/pandas/tests/tslibs/test_conversion.py index 9d7a5e906c3c3..f62910b5e1f1c 100644 --- a/pandas/tests/tslibs/test_conversion.py +++ b/pandas/tests/tslibs/test_conversion.py @@ -1,8 +1,10 @@ -from datetime import datetime +from datetime import ( + datetime, + timezone, +) import numpy as np import pytest -from pytz import UTC from pandas._libs.tslibs import ( OutOfBoundsTimedelta, @@ -55,7 +57,7 @@ def _compare_local_to_utc(tz_didx, naive_didx): def test_tz_localize_to_utc_copies(): # GH#46460 arr = np.arange(5, dtype="i8") - result = tz_convert_from_utc(arr, tz=UTC) + result = tz_convert_from_utc(arr, tz=timezone.utc) tm.assert_numpy_array_equal(result, arr) assert not np.shares_memory(arr, result) @@ -86,11 +88,12 @@ def test_tz_convert_single_matches_tz_convert(tz_aware_fixture, freq): @pytest.mark.parametrize( "arr", [ - pytest.param(np.array([], dtype=np.int64), id="empty"), - pytest.param(np.array([iNaT], dtype=np.int64), id="all_nat"), + pytest.param([], id="empty"), + pytest.param([iNaT], id="all_nat"), ], ) def test_tz_convert_corner(arr): + arr = np.array([iNaT], dtype=np.int64) result = tz_convert_from_utc(arr, timezones.maybe_get_tz("Asia/Tokyo")) tm.assert_numpy_array_equal(result, arr) @@ -99,7 +102,7 @@ def test_tz_convert_readonly(): # GH#35530 arr = np.array([0], dtype=np.int64) arr.setflags(write=False) - result = tz_convert_from_utc(arr, UTC) + result = tz_convert_from_utc(arr, timezone.utc) tm.assert_numpy_array_equal(result, arr) @@ -140,14 +143,18 @@ class SubDatetime(datetime): "dt, expected", [ pytest.param( - Timestamp("2000-01-01"), Timestamp("2000-01-01", tz=UTC), id="timestamp" + Timestamp("2000-01-01"), + Timestamp("2000-01-01", tz=timezone.utc), + id="timestamp", ), pytest.param( - datetime(2000, 1, 1), datetime(2000, 1, 1, tzinfo=UTC), id="datetime" + datetime(2000, 1, 1), + datetime(2000, 1, 1, tzinfo=timezone.utc), + id="datetime", ), pytest.param( SubDatetime(2000, 1, 1), - SubDatetime(2000, 1, 1, tzinfo=UTC), + SubDatetime(2000, 1, 1, tzinfo=timezone.utc), id="subclassed_datetime", ), ], @@ -156,5 +163,5 @@ def test_localize_pydatetime_dt_types(dt, expected): # GH 25851 # ensure that subclassed datetime works with # localize_pydatetime - result = conversion.localize_pydatetime(dt, UTC) + result = conversion.localize_pydatetime(dt, timezone.utc) assert result == expected diff --git a/pandas/tests/tslibs/test_liboffsets.py b/pandas/tests/tslibs/test_liboffsets.py index c189a431146a7..f311284b9dc63 100644 --- a/pandas/tests/tslibs/test_liboffsets.py +++ b/pandas/tests/tslibs/test_liboffsets.py @@ -1,6 
+1,7 @@ """ Tests for helper functions in the cython tslibs.offsets """ + from datetime import datetime import pytest @@ -127,10 +128,8 @@ def test_get_day_of_month_error(): roll_qtrday(dt, n=3, month=11, day_opt=day_opt, modby=12) -@pytest.mark.parametrize( - "month", - [3, 5], # (other.month % 3) < (month % 3) # (other.month % 3) > (month % 3) -) +@pytest.mark.parametrize("month", [3, 5]) +# (other.month % 3) < (month % 3) # (other.month % 3) > (month % 3) @pytest.mark.parametrize("n", [4, -3]) def test_roll_qtr_day_not_mod_unequal(day_opt, month, n): expected = {3: {-3: -2, 4: 4}, 5: {-3: -3, 4: 3}} diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index d8f23156bd4d4..bc5cd5fcccbf8 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -1,11 +1,11 @@ """ Tests for Timestamp parsing, aimed at pandas/_libs/tslibs/parsing.pyx """ + from datetime import datetime import re from dateutil.parser import parse as du_parse -from dateutil.tz import tzlocal from hypothesis import given import numpy as np import pytest @@ -17,35 +17,45 @@ from pandas._libs.tslibs.parsing import parse_datetime_string_with_reso from pandas.compat import ( ISMUSL, + WASM, is_platform_windows, ) import pandas.util._test_decorators as td +# Usually we wouldn't want this import in this test file (which is targeted at +# tslibs.parsing), but it is convenient to test the Timestamp constructor at +# the same time as the other parsing functions. +from pandas import Timestamp import pandas._testing as tm from pandas._testing._hypothesis import DATETIME_NO_TZ +@pytest.mark.skipif(WASM, reason="tzset is not available on WASM") @pytest.mark.skipif( is_platform_windows() or ISMUSL, reason="TZ setting incorrect on Windows and MUSL Linux", ) def test_parsing_tzlocal_deprecated(): # GH#50791 - msg = ( - "Parsing 'EST' as tzlocal.*" - "Pass the 'tz' keyword or call tz_localize after construction instead" + msg = "|".join( + [ + r"Parsing 'EST' as tzlocal \(dependent on system timezone\) " + r"is no longer supported\. 
" + "Pass the 'tz' keyword or call tz_localize after construction instead", + ".*included an un-recognized timezone", + ] ) dtstr = "Jan 15 2004 03:00 EST" with tm.set_timezone("US/Eastern"): - with tm.assert_produces_warning(FutureWarning, match=msg): - res, _ = parse_datetime_string_with_reso(dtstr) + with pytest.raises(ValueError, match=msg): + parse_datetime_string_with_reso(dtstr) - assert isinstance(res.tzinfo, tzlocal) + with pytest.raises(ValueError, match=msg): + parsing.py_parse_datetime_string(dtstr) - with tm.assert_produces_warning(FutureWarning, match=msg): - res = parsing.py_parse_datetime_string(dtstr) - assert isinstance(res.tzinfo, tzlocal) + with pytest.raises(ValueError, match=msg): + Timestamp(dtstr) def test_parse_datetime_string_with_reso(): @@ -124,10 +134,7 @@ def test_does_not_convert_mixed_integer(date_string, expected): ( "2013Q1", {"freq": "INVLD-L-DEC-SAT"}, - ( - "Unable to retrieve month information " - "from given freq: INVLD-L-DEC-SAT" - ), + ("Unable to retrieve month information from given freq: INVLD-L-DEC-SAT"), ), ], ) @@ -218,6 +225,7 @@ def test_parsers_month_freq(date_str, expected): ("Tue 24 Aug 2021 01:30:48 AM", "%a %d %b %Y %I:%M:%S %p"), ("Tuesday 24 Aug 2021 01:30:48 AM", "%A %d %b %Y %I:%M:%S %p"), ("27.03.2003 14:55:00.000", "%d.%m.%Y %H:%M:%S.%f"), # GH50317 + ("2023-11-09T20:23:46Z", "%Y-%m-%dT%H:%M:%S%z"), # GH57452 ], ) def test_guess_datetime_format_with_parseable_formats(string, fmt): diff --git a/pandas/tests/tslibs/test_resolution.py b/pandas/tests/tslibs/test_resolution.py index 690962f1daa5e..0e7705ad7ed94 100644 --- a/pandas/tests/tslibs/test_resolution.py +++ b/pandas/tests/tslibs/test_resolution.py @@ -1,6 +1,7 @@ +import datetime + import numpy as np import pytest -import pytz from pandas._libs.tslibs import ( Resolution, @@ -8,8 +9,6 @@ ) from pandas._libs.tslibs.dtypes import NpyDatetimeUnit -import pandas._testing as tm - def test_get_resolution_nano(): # don't return the fallback RESO_DAY @@ -23,7 +22,7 @@ def test_get_resolution_non_nano_data(): res = get_resolution(arr, None, NpyDatetimeUnit.NPY_FR_us.value) assert res == Resolution.RESO_US - res = get_resolution(arr, pytz.UTC, NpyDatetimeUnit.NPY_FR_us.value) + res = get_resolution(arr, datetime.timezone.utc, NpyDatetimeUnit.NPY_FR_us.value) assert res == Resolution.RESO_US @@ -48,10 +47,10 @@ def test_get_attrname_from_abbrev(freqstr, expected): assert reso.attrname == expected -@pytest.mark.parametrize("freq", ["A", "H", "T", "S", "L", "U", "N"]) -def test_units_A_H_T_S_L_U_N_deprecated_from_attrname_to_abbrevs(freq): - # GH#52536 - msg = f"'{freq}' is deprecated and will be removed in a future version." 
+@pytest.mark.parametrize("freq", ["H", "S"]) +def test_unit_H_S_raises(freq): + # GH#59143 + msg = f"Invalid frequency: {freq}" - with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(ValueError, match=msg): Resolution.get_reso_from_freqstr(freq) diff --git a/pandas/tests/tslibs/test_timezones.py b/pandas/tests/tslibs/test_timezones.py index 28e4889983fb9..60bbcf08ce8e7 100644 --- a/pandas/tests/tslibs/test_timezones.py +++ b/pandas/tests/tslibs/test_timezones.py @@ -6,7 +6,6 @@ import dateutil.tz import pytest -import pytz from pandas._libs.tslibs import ( conversion, @@ -22,10 +21,11 @@ def test_is_utc(utc_fixture): assert timezones.is_utc(tz) -@pytest.mark.parametrize("tz_name", list(pytz.common_timezones)) -def test_cache_keys_are_distinct_for_pytz_vs_dateutil(tz_name): - tz_p = timezones.maybe_get_tz(tz_name) - tz_d = timezones.maybe_get_tz("dateutil/" + tz_name) +def test_cache_keys_are_distinct_for_pytz_vs_dateutil(): + pytz = pytest.importorskip("pytz") + for tz_name in pytz.common_timezones: + tz_p = timezones.maybe_get_tz(tz_name) + tz_d = timezones.maybe_get_tz("dateutil/" + tz_name) if tz_d is None: pytest.skip(tz_name + ": dateutil does not know about this one") @@ -76,12 +76,15 @@ def test_tz_compare_utc(utc_fixture, utc_fixture2): @pytest.fixture( params=[ - (pytz.timezone("US/Eastern"), lambda tz, x: tz.localize(x)), + ("pytz/US/Eastern", lambda tz, x: tz.localize(x)), (dateutil.tz.gettz("US/Eastern"), lambda tz, x: x.replace(tzinfo=tz)), ] ) def infer_setup(request): eastern, localize = request.param + if isinstance(eastern, str) and eastern.startswith("pytz/"): + pytz = pytest.importorskip("pytz") + eastern = pytz.timezone(eastern.removeprefix("pytz/")) start_naive = datetime(2001, 1, 1) end_naive = datetime(2009, 1, 1) @@ -111,10 +114,10 @@ def test_infer_tz_compat(infer_setup): def test_infer_tz_utc_localize(infer_setup): _, _, start, end, start_naive, end_naive = infer_setup - utc = pytz.utc + utc = timezone.utc - start = utc.localize(start_naive) - end = utc.localize(end_naive) + start = start_naive.astimezone(utc) + end = end_naive.astimezone(utc) assert timezones.infer_tzinfo(start, end) is utc @@ -124,8 +127,8 @@ def test_infer_tz_mismatch(infer_setup, ordered): eastern, _, _, _, start_naive, end_naive = infer_setup msg = "Inputs must both have the same timezone" - utc = pytz.utc - start = utc.localize(start_naive) + utc = timezone.utc + start = start_naive.astimezone(utc) end = conversion.localize_pydatetime(end_naive, eastern) args = (start, end) if ordered else (end, start) @@ -139,9 +142,9 @@ def test_maybe_get_tz_invalid_types(): timezones.maybe_get_tz(44.0) with pytest.raises(TypeError, match=""): - timezones.maybe_get_tz(pytz) + timezones.maybe_get_tz(pytest) - msg = "" + msg = "" with pytest.raises(TypeError, match=msg): timezones.maybe_get_tz(Timestamp("2021-01-01", tz="UTC")) diff --git a/pandas/tests/tslibs/test_to_offset.py b/pandas/tests/tslibs/test_to_offset.py index ef68408305232..add9213ae59fb 100644 --- a/pandas/tests/tslibs/test_to_offset.py +++ b/pandas/tests/tslibs/test_to_offset.py @@ -8,6 +8,8 @@ to_offset, ) +import pandas._testing as tm + @pytest.mark.parametrize( "freq_input,expected", @@ -24,12 +26,12 @@ ("15ms500us", offsets.Micro(15500)), ("10s75ms", offsets.Milli(10075)), ("1s0.25ms", offsets.Micro(1000250)), - ("1s0.25ms", offsets.Micro(1000250)), ("2800ns", offsets.Nano(2800)), ("2SME", offsets.SemiMonthEnd(2)), ("2SME-16", offsets.SemiMonthEnd(2, day_of_month=16)), ("2SMS-14", offsets.SemiMonthBegin(2, 
day_of_month=14)), ("2SMS-15", offsets.SemiMonthBegin(2)), + ("LWOM-MON", offsets.LastWeekOfMonth()), ], ) def test_to_offset(freq_input, expected): @@ -45,6 +47,7 @@ def test_to_offset_negative(freqstr, expected): assert result.n == expected +@pytest.mark.filterwarnings("ignore:.*'m' is deprecated.*:FutureWarning") @pytest.mark.parametrize( "freqstr", [ @@ -59,11 +62,11 @@ def test_to_offset_negative(freqstr, expected): "2SMS-15D", "100foo", # Invalid leading +/- signs. - "+-1d", + "+-1D", "-+1h", "+1", "-7", - "+d", + "+D", "-m", # Invalid shortcut anchors. "SME-0", @@ -126,9 +129,14 @@ def test_to_offset_leading_zero(freqstr, expected): assert result.n == expected -@pytest.mark.parametrize("freqstr,expected", [("+1d", 1), ("+2h30min", 150)]) -def test_to_offset_leading_plus(freqstr, expected): - result = to_offset(freqstr) +@pytest.mark.parametrize( + "freqstr,expected,wrn", [("+1d", 1, FutureWarning), ("+2h30min", 150, None)] +) +def test_to_offset_leading_plus(freqstr, expected, wrn): + msg = "'d' is deprecated and will be removed in a future version." + + with tm.assert_produces_warning(wrn, match=msg): + result = to_offset(freqstr) assert result.n == expected @@ -172,3 +180,62 @@ def test_to_offset_pd_timedelta(kwargs, expected): def test_anchored_shortcuts(shortcut, expected): result = to_offset(shortcut) assert result == expected + + +@pytest.mark.parametrize( + "freq_depr", + [ + "2ye-mar", + "2ys", + "2qe", + "2qs-feb", + "2bqs", + "2sms", + "1sme", + "2bms", + "2cbme", + "2me", + ], +) +def test_to_offset_lowercase_frequency_raises(freq_depr): + msg = f"Invalid frequency: {freq_depr}" + + with pytest.raises(ValueError, match=msg): + to_offset(freq_depr) + + +@pytest.mark.parametrize("freq_depr", ["2MIN", "2Us", "2NS"]) +def test_to_offset_uppercase_frequency_deprecated(freq_depr): + # GH#54939 + depr_msg = ( + f"'{freq_depr[1:]}' is deprecated and will be removed in a " + f"future version, please use '{freq_depr.lower()[1:]}' instead." + ) + + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + to_offset(freq_depr) + + +@pytest.mark.parametrize( + "freq_depr,expected", + [ + ("2w", offsets.Week(2, weekday=6)), + ("2b", offsets.BusinessDay(2)), + ("2d", offsets.Day(2)), + ], +) +def test_to_offset_lowercase_frequency_deprecated(freq_depr, expected): + # GH#54939, GH#58998 + msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a future version." 
+ + with tm.assert_produces_warning(FutureWarning, match=msg): + result = to_offset(freq_depr) + assert result == expected + + +@pytest.mark.parametrize("freq", ["2H", "2BH", "2S"]) +def test_to_offset_uppercase_frequency_raises(freq): + msg = f"Invalid frequency: {freq}" + + with pytest.raises(ValueError, match=msg): + to_offset(freq) diff --git a/pandas/tests/tslibs/test_tzconversion.py b/pandas/tests/tslibs/test_tzconversion.py index c1a56ffb71b02..f32829b4e0b21 100644 --- a/pandas/tests/tslibs/test_tzconversion.py +++ b/pandas/tests/tslibs/test_tzconversion.py @@ -1,6 +1,7 @@ +import zoneinfo + import numpy as np import pytest -import pytz from pandas._libs.tslibs.tzconversion import tz_localize_to_utc @@ -11,13 +12,15 @@ def test_tz_localize_to_utc_ambiguous_infer(self): val = 1_320_541_200_000_000_000 vals = np.array([val, val - 1, val], dtype=np.int64) - with pytest.raises(pytz.AmbiguousTimeError, match="2011-11-06 01:00:00"): - tz_localize_to_utc(vals, pytz.timezone("US/Eastern"), ambiguous="infer") + with pytest.raises(ValueError, match="2011-11-06 01:00:00"): + tz_localize_to_utc(vals, zoneinfo.ZoneInfo("US/Eastern"), ambiguous="infer") - with pytest.raises(pytz.AmbiguousTimeError, match="are no repeated times"): - tz_localize_to_utc(vals[:1], pytz.timezone("US/Eastern"), ambiguous="infer") + with pytest.raises(ValueError, match="are no repeated times"): + tz_localize_to_utc( + vals[:1], zoneinfo.ZoneInfo("US/Eastern"), ambiguous="infer" + ) vals[1] += 1 msg = "There are 2 dst switches when there should only be 1" - with pytest.raises(pytz.AmbiguousTimeError, match=msg): - tz_localize_to_utc(vals, pytz.timezone("US/Eastern"), ambiguous="infer") + with pytest.raises(ValueError, match=msg): + tz_localize_to_utc(vals, zoneinfo.ZoneInfo("US/Eastern"), ambiguous="infer") diff --git a/pandas/tests/util/conftest.py b/pandas/tests/util/conftest.py index b68bcc93431d0..2e931ff42fe15 100644 --- a/pandas/tests/util/conftest.py +++ b/pandas/tests/util/conftest.py @@ -3,24 +3,44 @@ @pytest.fixture(params=[True, False]) def check_dtype(request): + """ + Fixture returning `True` or `False`, determining whether to check + if the `dtype` is identical or not, when comparing two data structures, + e.g. `Series`, `SparseArray` or `DataFrame`. + """ return request.param @pytest.fixture(params=[True, False]) def check_exact(request): + """ + Fixture returning `True` or `False`, determining whether to + compare floating point numbers exactly or not. + """ return request.param @pytest.fixture(params=[True, False]) def check_index_type(request): + """ + Fixture returning `True` or `False`, determining whether to check + if the `Index` types are identical or not. + """ return request.param @pytest.fixture(params=[0.5e-3, 0.5e-5]) def rtol(request): + """ + Fixture returning 0.5e-3 or 0.5e-5. Those values are used as relative tolerance. + """ return request.param @pytest.fixture(params=[True, False]) def check_categorical(request): + """ + Fixture returning `True` or `False`, determining whether to + compare internal `Categorical` exactly or not. 
+ """ return request.param diff --git a/pandas/tests/util/test_assert_almost_equal.py b/pandas/tests/util/test_assert_almost_equal.py index 4e692084f7352..091670ed69f11 100644 --- a/pandas/tests/util/test_assert_almost_equal.py +++ b/pandas/tests/util/test_assert_almost_equal.py @@ -257,18 +257,14 @@ def test_assert_almost_equal_strings(): _assert_almost_equal_both("abc", "abc") -@pytest.mark.parametrize( - "a,b", [("abc", "abcd"), ("abc", "abd"), ("abc", 1), ("abc", [1])] -) -def test_assert_not_almost_equal_strings(a, b): - _assert_not_almost_equal_both(a, b) +@pytest.mark.parametrize("b", ["abcd", "abd", 1, [1]]) +def test_assert_not_almost_equal_strings(b): + _assert_not_almost_equal_both("abc", b) -@pytest.mark.parametrize( - "a,b", [([1, 2, 3], [1, 2, 3]), (np.array([1, 2, 3]), np.array([1, 2, 3]))] -) -def test_assert_almost_equal_iterables(a, b): - _assert_almost_equal_both(a, b) +@pytest.mark.parametrize("box", [list, np.array]) +def test_assert_almost_equal_iterables(box): + _assert_almost_equal_both(box([1, 2, 3]), box([1, 2, 3])) @pytest.mark.parametrize( @@ -315,7 +311,7 @@ def test_assert_almost_equal_inf(a, b): @pytest.mark.parametrize("left", objs) @pytest.mark.parametrize("right", objs) -def test_mismatched_na_assert_almost_equal_deprecation(left, right): +def test_mismatched_na_assert_almost_equal(left, right): left_arr = np.array([left], dtype=object) right_arr = np.array([right], dtype=object) @@ -335,7 +331,7 @@ def test_mismatched_na_assert_almost_equal_deprecation(left, right): ) else: - with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(AssertionError, match=msg): _assert_almost_equal_both(left, right, check_dtype=False) # TODO: to get the same deprecation in assert_numpy_array_equal we need @@ -343,11 +339,11 @@ def test_mismatched_na_assert_almost_equal_deprecation(left, right): # TODO: to get the same deprecation in assert_index_equal we need to # change/deprecate array_equivalent_object to be stricter, as # assert_index_equal uses Index.equal which uses array_equivalent. 
- with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(AssertionError, match="Series are different"): tm.assert_series_equal( Series(left_arr, dtype=object), Series(right_arr, dtype=object) ) - with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(AssertionError, match="DataFrame.iloc.* are different"): tm.assert_frame_equal( DataFrame(left_arr, dtype=object), DataFrame(right_arr, dtype=object) ) @@ -538,6 +534,10 @@ def test_assert_almost_equal_iterable_values_mismatch(): np.array([np.array([1, 2, 3]), np.array([4, 5])], dtype=object), np.array([[1, 2, 3], [4, 5]], dtype=object), ), + ( + np.array([np.array([], dtype=object), None], dtype=object), + np.array([[], None], dtype=object), + ), ( np.array( [ diff --git a/pandas/tests/util/test_assert_categorical_equal.py b/pandas/tests/util/test_assert_categorical_equal.py index d07bbcbc460a1..c171564574708 100644 --- a/pandas/tests/util/test_assert_categorical_equal.py +++ b/pandas/tests/util/test_assert_categorical_equal.py @@ -4,11 +4,9 @@ import pandas._testing as tm -@pytest.mark.parametrize( - "c", - [Categorical([1, 2, 3, 4]), Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4, 5])], -) +@pytest.mark.parametrize("c", [None, [1, 2, 3, 4, 5]]) def test_categorical_equal(c): + c = Categorical([1, 2, 3, 4], categories=c) tm.assert_categorical_equal(c, c) diff --git a/pandas/tests/util/test_assert_extension_array_equal.py b/pandas/tests/util/test_assert_extension_array_equal.py index 674e9307d8bb9..5d82ae9af0e95 100644 --- a/pandas/tests/util/test_assert_extension_array_equal.py +++ b/pandas/tests/util/test_assert_extension_array_equal.py @@ -108,11 +108,10 @@ def test_assert_extension_array_equal_non_extension_array(side): tm.assert_extension_array_equal(*args) -@pytest.mark.parametrize("right_dtype", ["Int32", "int64"]) -def test_assert_extension_array_equal_ignore_dtype_mismatch(right_dtype): +def test_assert_extension_array_equal_ignore_dtype_mismatch(any_int_dtype): # https://github.com/pandas-dev/pandas/issues/35715 left = array([1, 2, 3], dtype="Int64") - right = array([1, 2, 3], dtype=right_dtype) + right = array([1, 2, 3], dtype=any_int_dtype) tm.assert_extension_array_equal(left, right, check_dtype=False) diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py index a074898f6046d..ea954756d63c8 100644 --- a/pandas/tests/util/test_assert_frame_equal.py +++ b/pandas/tests/util/test_assert_frame_equal.py @@ -10,11 +10,6 @@ def by_blocks_fixture(request): return request.param -@pytest.fixture(params=["DataFrame", "Series"]) -def obj_fixture(request): - return request.param - - def _assert_frame_equal_both(a, b, **kwargs): """ Check that two DataFrame equal. @@ -35,30 +30,36 @@ def _assert_frame_equal_both(a, b, **kwargs): @pytest.mark.parametrize("check_like", [True, False]) -def test_frame_equal_row_order_mismatch(check_like, obj_fixture): +def test_frame_equal_row_order_mismatch(check_like, frame_or_series): df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["a", "b", "c"]) df2 = DataFrame({"A": [3, 2, 1], "B": [6, 5, 4]}, index=["c", "b", "a"]) if not check_like: # Do not ignore row-column orderings. 
- msg = f"{obj_fixture}.index are different" + msg = f"{frame_or_series.__name__}.index are different" with pytest.raises(AssertionError, match=msg): - tm.assert_frame_equal(df1, df2, check_like=check_like, obj=obj_fixture) + tm.assert_frame_equal( + df1, df2, check_like=check_like, obj=frame_or_series.__name__ + ) else: - _assert_frame_equal_both(df1, df2, check_like=check_like, obj=obj_fixture) + _assert_frame_equal_both( + df1, df2, check_like=check_like, obj=frame_or_series.__name__ + ) @pytest.mark.parametrize( "df1,df2", [ - (DataFrame({"A": [1, 2, 3]}), DataFrame({"A": [1, 2, 3, 4]})), - (DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}), DataFrame({"A": [1, 2, 3]})), + ({"A": [1, 2, 3]}, {"A": [1, 2, 3, 4]}), + ({"A": [1, 2, 3], "B": [4, 5, 6]}, {"A": [1, 2, 3]}), ], ) -def test_frame_equal_shape_mismatch(df1, df2, obj_fixture): - msg = f"{obj_fixture} are different" +def test_frame_equal_shape_mismatch(df1, df2, frame_or_series): + df1 = DataFrame(df1) + df2 = DataFrame(df2) + msg = f"{frame_or_series.__name__} are different" with pytest.raises(AssertionError, match=msg): - tm.assert_frame_equal(df1, df2, obj=obj_fixture) + tm.assert_frame_equal(df1, df2, obj=frame_or_series.__name__) @pytest.mark.parametrize( @@ -78,7 +79,7 @@ def test_frame_equal_shape_mismatch(df1, df2, obj_fixture): DataFrame.from_records( {"a": [1.0, 2.0], "b": [2.1, 1.5], "c": ["l1", "l2"]}, index=["a", "b"] ), - "MultiIndex level \\[0\\] are different", + "DataFrame\\.index level \\[0\\] are different", ), ], ) @@ -109,14 +110,14 @@ def test_empty_dtypes(check_dtype): @pytest.mark.parametrize("check_like", [True, False]) -def test_frame_equal_index_mismatch(check_like, obj_fixture, using_infer_string): +def test_frame_equal_index_mismatch(check_like, frame_or_series, using_infer_string): if using_infer_string: - dtype = "string" + dtype = "str" else: dtype = "object" - msg = f"""{obj_fixture}\\.index are different + msg = f"""{frame_or_series.__name__}\\.index are different -{obj_fixture}\\.index values are different \\(33\\.33333 %\\) +{frame_or_series.__name__}\\.index values are different \\(33\\.33333 %\\) \\[left\\]: Index\\(\\['a', 'b', 'c'\\], dtype='{dtype}'\\) \\[right\\]: Index\\(\\['a', 'b', 'd'\\], dtype='{dtype}'\\) At positional index 2, first diff: c != d""" @@ -125,18 +126,20 @@ def test_frame_equal_index_mismatch(check_like, obj_fixture, using_infer_string) df2 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["a", "b", "d"]) with pytest.raises(AssertionError, match=msg): - tm.assert_frame_equal(df1, df2, check_like=check_like, obj=obj_fixture) + tm.assert_frame_equal( + df1, df2, check_like=check_like, obj=frame_or_series.__name__ + ) @pytest.mark.parametrize("check_like", [True, False]) -def test_frame_equal_columns_mismatch(check_like, obj_fixture, using_infer_string): +def test_frame_equal_columns_mismatch(check_like, frame_or_series, using_infer_string): if using_infer_string: - dtype = "string" + dtype = "str" else: dtype = "object" - msg = f"""{obj_fixture}\\.columns are different + msg = f"""{frame_or_series.__name__}\\.columns are different -{obj_fixture}\\.columns values are different \\(50\\.0 %\\) +{frame_or_series.__name__}\\.columns values are different \\(50\\.0 %\\) \\[left\\]: Index\\(\\['A', 'B'\\], dtype='{dtype}'\\) \\[right\\]: Index\\(\\['A', 'b'\\], dtype='{dtype}'\\)""" @@ -144,11 +147,13 @@ def test_frame_equal_columns_mismatch(check_like, obj_fixture, using_infer_strin df2 = DataFrame({"A": [1, 2, 3], "b": [4, 5, 6]}, index=["a", "b", "c"]) with 
pytest.raises(AssertionError, match=msg): - tm.assert_frame_equal(df1, df2, check_like=check_like, obj=obj_fixture) + tm.assert_frame_equal( + df1, df2, check_like=check_like, obj=frame_or_series.__name__ + ) -def test_frame_equal_block_mismatch(by_blocks_fixture, obj_fixture): - obj = obj_fixture +def test_frame_equal_block_mismatch(by_blocks_fixture, frame_or_series): + obj = frame_or_series.__name__ msg = f"""{obj}\\.iloc\\[:, 1\\] \\(column name="B"\\) are different {obj}\\.iloc\\[:, 1\\] \\(column name="B"\\) values are different \\(33\\.33333 %\\) @@ -160,15 +165,15 @@ def test_frame_equal_block_mismatch(by_blocks_fixture, obj_fixture): df2 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 7]}) with pytest.raises(AssertionError, match=msg): - tm.assert_frame_equal(df1, df2, by_blocks=by_blocks_fixture, obj=obj_fixture) + tm.assert_frame_equal(df1, df2, by_blocks=by_blocks_fixture, obj=obj) @pytest.mark.parametrize( "df1,df2,msg", [ ( - DataFrame({"A": ["á", "à", "ä"], "E": ["é", "è", "ë"]}), - DataFrame({"A": ["á", "à", "ä"], "E": ["é", "è", "e̊"]}), + {"A": ["á", "à", "ä"], "E": ["é", "è", "ë"]}, + {"A": ["á", "à", "ä"], "E": ["é", "è", "e̊"]}, """{obj}\\.iloc\\[:, 1\\] \\(column name="E"\\) are different {obj}\\.iloc\\[:, 1\\] \\(column name="E"\\) values are different \\(33\\.33333 %\\) @@ -177,8 +182,8 @@ def test_frame_equal_block_mismatch(by_blocks_fixture, obj_fixture): \\[right\\]: \\[é, è, e̊\\]""", ), ( - DataFrame({"A": ["á", "à", "ä"], "E": ["é", "è", "ë"]}), - DataFrame({"A": ["a", "a", "a"], "E": ["e", "e", "e"]}), + {"A": ["á", "à", "ä"], "E": ["é", "è", "ë"]}, + {"A": ["a", "a", "a"], "E": ["e", "e", "e"]}, """{obj}\\.iloc\\[:, 0\\] \\(column name="A"\\) are different {obj}\\.iloc\\[:, 0\\] \\(column name="A"\\) values are different \\(100\\.0 %\\) @@ -188,14 +193,18 @@ def test_frame_equal_block_mismatch(by_blocks_fixture, obj_fixture): ), ], ) -def test_frame_equal_unicode(df1, df2, msg, by_blocks_fixture, obj_fixture): +def test_frame_equal_unicode(df1, df2, msg, by_blocks_fixture, frame_or_series): # see gh-20503 # # Test ensures that `tm.assert_frame_equals` raises the right exception # when comparing DataFrames containing differing unicode objects. 
- msg = msg.format(obj=obj_fixture) + df1 = DataFrame(df1) + df2 = DataFrame(df2) + msg = msg.format(obj=frame_or_series.__name__) with pytest.raises(AssertionError, match=msg): - tm.assert_frame_equal(df1, df2, by_blocks=by_blocks_fixture, obj=obj_fixture) + tm.assert_frame_equal( + df1, df2, by_blocks=by_blocks_fixture, obj=frame_or_series.__name__ + ) def test_assert_frame_equal_extension_dtype_mismatch(): @@ -211,10 +220,7 @@ def test_assert_frame_equal_extension_dtype_mismatch(): "\\[right\\]: int[32|64]" ) - # TODO: this shouldn't raise (or should raise a better error message) - # https://github.com/pandas-dev/pandas/issues/56131 - with pytest.raises(AssertionError, match="classes are different"): - tm.assert_frame_equal(left, right, check_dtype=False) + tm.assert_frame_equal(left, right, check_dtype=False) with pytest.raises(AssertionError, match=msg): tm.assert_frame_equal(left, right, check_dtype=True) @@ -246,7 +252,6 @@ def test_assert_frame_equal_ignore_extension_dtype_mismatch(): tm.assert_frame_equal(left, right, check_dtype=False) -@pytest.mark.xfail(reason="https://github.com/pandas-dev/pandas/issues/56131") def test_assert_frame_equal_ignore_extension_dtype_mismatch_cross_class(): # https://github.com/pandas-dev/pandas/issues/35715 left = DataFrame({"a": [1, 2, 3]}, dtype="Int64") @@ -255,12 +260,7 @@ def test_assert_frame_equal_ignore_extension_dtype_mismatch_cross_class(): @pytest.mark.parametrize( - "dtype", - [ - ("timedelta64[ns]"), - ("datetime64[ns, UTC]"), - ("Period[D]"), - ], + "dtype", ["timedelta64[ns]", "datetime64[ns, UTC]", "Period[D]"] ) def test_assert_frame_equal_datetime_like_dtype_mismatch(dtype): df1 = DataFrame({"a": []}, dtype=dtype) @@ -300,9 +300,7 @@ def test_frame_equal_mixed_dtypes(frame_or_series, any_numeric_ea_dtype, indexer dtypes = (any_numeric_ea_dtype, "int64") obj1 = frame_or_series([1, 2], dtype=dtypes[indexer[0]]) obj2 = frame_or_series([1, 2], dtype=dtypes[indexer[1]]) - msg = r'(Series|DataFrame.iloc\[:, 0\] \(column name="0"\) classes) are different' - with pytest.raises(AssertionError, match=msg): - tm.assert_equal(obj1, obj2, check_exact=True, check_dtype=False) + tm.assert_equal(obj1, obj2, check_exact=True, check_dtype=False) def test_assert_frame_equal_check_like_different_indexes(): diff --git a/pandas/tests/util/test_assert_index_equal.py b/pandas/tests/util/test_assert_index_equal.py index dc6efdcec380e..ab52d6c8e9f39 100644 --- a/pandas/tests/util/test_assert_index_equal.py +++ b/pandas/tests/util/test_assert_index_equal.py @@ -207,7 +207,7 @@ def test_index_equal_names(name1, name2): def test_index_equal_category_mismatch(check_categorical, using_infer_string): if using_infer_string: - dtype = "string" + dtype = "str" else: dtype = "object" msg = f"""Index are different diff --git a/pandas/tests/util/test_assert_interval_array_equal.py b/pandas/tests/util/test_assert_interval_array_equal.py index 8cc4ade3d7e95..aad27672c0f6f 100644 --- a/pandas/tests/util/test_assert_interval_array_equal.py +++ b/pandas/tests/util/test_assert_interval_array_equal.py @@ -1,7 +1,11 @@ import pytest -from pandas import interval_range +from pandas import ( + Interval, + interval_range, +) import pandas._testing as tm +from pandas.arrays import IntervalArray @pytest.mark.parametrize( @@ -79,3 +83,18 @@ def test_interval_array_equal_start_mismatch(): with pytest.raises(AssertionError, match=msg): tm.assert_interval_array_equal(arr1, arr2) + + +def test_interval_array_equal_end_mismatch_only(): + arr1 = IntervalArray([Interval(0, 1), Interval(0, 
5)]) + arr2 = IntervalArray([Interval(0, 1), Interval(0, 6)]) + + msg = """\ +IntervalArray.right are different + +IntervalArray.right values are different \\(50.0 %\\) +\\[left\\]: \\[1, 5\\] +\\[right\\]: \\[1, 6\\]""" + + with pytest.raises(AssertionError, match=msg): + tm.assert_interval_array_equal(arr1, arr2) diff --git a/pandas/tests/util/test_assert_produces_warning.py b/pandas/tests/util/test_assert_produces_warning.py index 5c27a3ee79d4a..5b917dbbe7ba7 100644 --- a/pandas/tests/util/test_assert_produces_warning.py +++ b/pandas/tests/util/test_assert_produces_warning.py @@ -1,6 +1,7 @@ -"""" +""" " Test module for testing ``pandas._testing.assert_produces_warning``. """ + import warnings import pytest @@ -13,26 +14,6 @@ import pandas._testing as tm -@pytest.fixture( - params=[ - RuntimeWarning, - ResourceWarning, - UserWarning, - FutureWarning, - DeprecationWarning, - PerformanceWarning, - DtypeWarning, - ], -) -def category(request): - """ - Return unique warning. - - Useful for testing behavior of tm.assert_produces_warning with various categories. - """ - return request.param - - @pytest.fixture( params=[ (RuntimeWarning, UserWarning), @@ -61,7 +42,6 @@ def f(): warnings.warn("f2", RuntimeWarning) -@pytest.mark.filterwarnings("ignore:f1:FutureWarning") def test_assert_produces_warning_honors_filter(): # Raise by default. msg = r"Caused unexpected warning\(s\)" @@ -73,6 +53,18 @@ def test_assert_produces_warning_honors_filter(): f() +@pytest.mark.parametrize( + "category", + [ + RuntimeWarning, + ResourceWarning, + UserWarning, + FutureWarning, + DeprecationWarning, + PerformanceWarning, + DtypeWarning, + ], +) @pytest.mark.parametrize( "message, match", [ @@ -187,6 +179,44 @@ def test_match_multiple_warnings(): warnings.warn("Match this too", UserWarning) +def test_must_match_multiple_warnings(): + # https://github.com/pandas-dev/pandas/issues/56555 + category = (FutureWarning, UserWarning) + msg = "Did not see expected warning of class 'UserWarning'" + with pytest.raises(AssertionError, match=msg): + with tm.assert_produces_warning(category, match=r"^Match this"): + warnings.warn("Match this", FutureWarning) + + +def test_must_match_multiple_warnings_messages(): + # https://github.com/pandas-dev/pandas/issues/56555 + category = (FutureWarning, UserWarning) + msg = r"The emitted warning messages are \[UserWarning\('Not this'\)\]" + with pytest.raises(AssertionError, match=msg): + with tm.assert_produces_warning(category, match=r"^Match this"): + warnings.warn("Match this", FutureWarning) + warnings.warn("Not this", UserWarning) + + +def test_allow_partial_match_for_multiple_warnings(): + # https://github.com/pandas-dev/pandas/issues/56555 + category = (FutureWarning, UserWarning) + with tm.assert_produces_warning( + category, match=r"^Match this", must_find_all_warnings=False + ): + warnings.warn("Match this", FutureWarning) + + +def test_allow_partial_match_for_multiple_warnings_messages(): + # https://github.com/pandas-dev/pandas/issues/56555 + category = (FutureWarning, UserWarning) + with tm.assert_produces_warning( + category, match=r"^Match this", must_find_all_warnings=False + ): + warnings.warn("Match this", FutureWarning) + warnings.warn("Not this", UserWarning) + + def test_right_category_wrong_match_raises(pair_different_warnings): target_category, other_category = pair_different_warnings with pytest.raises(AssertionError, match="Did not see warning.*matching"): diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py 
index f722f619bc456..c3cd90f2edfb3 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -106,12 +106,11 @@ def test_series_not_equal_metadata_mismatch(kwargs): @pytest.mark.parametrize("data1,data2", [(0.12345, 0.12346), (0.1235, 0.1236)]) -@pytest.mark.parametrize("dtype", ["float32", "float64", "Float32"]) @pytest.mark.parametrize("decimals", [0, 1, 2, 3, 5, 10]) -def test_less_precise(data1, data2, dtype, decimals): +def test_less_precise(data1, data2, any_float_dtype, decimals): rtol = 10**-decimals - s1 = Series([data1], dtype=dtype) - s2 = Series([data2], dtype=dtype) + s1 = Series([data1], dtype=any_float_dtype) + s2 = Series([data2], dtype=any_float_dtype) if decimals in (5, 10) or (decimals >= 3 and abs(data1 - data2) >= 0.0005): msg = "Series values are different" @@ -138,7 +137,7 @@ def test_less_precise(data1, data2, dtype, decimals): DataFrame.from_records( {"a": [1.0, 2.0], "b": [2.1, 1.5], "c": ["l1", "l2"]}, index=["a", "b"] ).c, - "MultiIndex level \\[0\\] are different", + "Series\\.index level \\[0\\] are different", ), ], ) @@ -221,9 +220,9 @@ def test_series_equal_categorical_values_mismatch(rtol, using_infer_string): Series values are different \\(66\\.66667 %\\) \\[index\\]: \\[0, 1, 2\\] \\[left\\]: \\['a', 'b', 'c'\\] -Categories \\(3, string\\): \\[a, b, c\\] +Categories \\(3, str\\): \\[a, b, c\\] \\[right\\]: \\['a', 'c', 'b'\\] -Categories \\(3, string\\): \\[a, b, c\\]""" +Categories \\(3, str\\): \\[a, b, c\\]""" else: msg = """Series are different @@ -258,7 +257,7 @@ def test_series_equal_datetime_values_mismatch(rtol): def test_series_equal_categorical_mismatch(check_categorical, using_infer_string): if using_infer_string: - dtype = "string" + dtype = "str" else: dtype = "object" msg = f"""Attributes of Series are different @@ -290,10 +289,7 @@ def test_assert_series_equal_extension_dtype_mismatch(): \\[left\\]: Int64 \\[right\\]: int[32|64]""" - # TODO: this shouldn't raise (or should raise a better error message) - # https://github.com/pandas-dev/pandas/issues/56131 - with pytest.raises(AssertionError, match="Series classes are different"): - tm.assert_series_equal(left, right, check_dtype=False) + tm.assert_series_equal(left, right, check_dtype=False) with pytest.raises(AssertionError, match=msg): tm.assert_series_equal(left, right, check_dtype=True) @@ -372,7 +368,6 @@ def test_assert_series_equal_ignore_extension_dtype_mismatch(): tm.assert_series_equal(left, right, check_dtype=False) -@pytest.mark.xfail(reason="https://github.com/pandas-dev/pandas/issues/56131") def test_assert_series_equal_ignore_extension_dtype_mismatch_cross_class(): # https://github.com/pandas-dev/pandas/issues/35715 left = Series([1, 2, 3], dtype="Int64") @@ -456,3 +451,68 @@ def test_large_unequal_ints(dtype): right = Series([1577840521123543], dtype=dtype) with pytest.raises(AssertionError, match="Series are different"): tm.assert_series_equal(left, right) + + +@pytest.mark.parametrize("dtype", [None, object]) +@pytest.mark.parametrize("check_exact", [True, False]) +@pytest.mark.parametrize("val", [3, 3.5]) +def test_ea_and_numpy_no_dtype_check(val, check_exact, dtype): + # GH#56651 + left = Series([1, 2, val], dtype=dtype) + right = Series(pd.array([1, 2, val])) + tm.assert_series_equal(left, right, check_dtype=False, check_exact=check_exact) + + +def test_assert_series_equal_int_tol(): + # GH#56646 + left = Series([81, 18, 121, 38, 74, 72, 81, 81, 146, 81, 81, 170, 74, 74]) + right = Series([72, 9, 72, 72, 72, 72, 
72, 72, 72, 72, 72, 72, 72, 72]) + tm.assert_series_equal(left, right, rtol=1.5) + + tm.assert_frame_equal(left.to_frame(), right.to_frame(), rtol=1.5) + tm.assert_extension_array_equal( + left.astype("Int64").values, right.astype("Int64").values, rtol=1.5 + ) + + +@pytest.mark.parametrize( + "left_idx, right_idx", + [ + ( + pd.Index([0, 0.2, 0.4, 0.6, 0.8, 1]), + pd.Index(np.linspace(0, 1, 6)), + ), + ( + pd.MultiIndex.from_arrays([[0, 0, 0, 0, 1, 1], [0, 0.2, 0.4, 0.6, 0.8, 1]]), + pd.MultiIndex.from_arrays([[0, 0, 0, 0, 1, 1], np.linspace(0, 1, 6)]), + ), + ( + pd.MultiIndex.from_arrays( + [["a", "a", "a", "b", "b", "b"], [1, 2, 3, 4, 5, 10000000000001]] + ), + pd.MultiIndex.from_arrays( + [["a", "a", "a", "b", "b", "b"], [1, 2, 3, 4, 5, 10000000000002]] + ), + ), + pytest.param( + pd.Index([1, 2, 3, 4, 5, 10000000000001]), + pd.Index([1, 2, 3, 4, 5, 10000000000002]), + marks=pytest.mark.xfail(reason="check_exact_index defaults to True"), + ), + pytest.param( + pd.MultiIndex.from_arrays( + [[0, 0, 0, 0, 1, 1], [1, 2, 3, 4, 5, 10000000000001]] + ), + pd.MultiIndex.from_arrays( + [[0, 0, 0, 0, 1, 1], [1, 2, 3, 4, 5, 10000000000002]] + ), + marks=pytest.mark.xfail(reason="check_exact_index defaults to True"), + ), + ], +) +def test_assert_series_equal_check_exact_index_default(left_idx, right_idx): + # GH#57067 + ser1 = Series(np.zeros(6, dtype=int), left_idx) + ser2 = Series(np.zeros(6, dtype=int), right_idx) + tm.assert_series_equal(ser1, ser2) + tm.assert_frame_equal(ser1.to_frame(), ser2.to_frame()) diff --git a/pandas/tests/util/test_deprecate_nonkeyword_arguments.py b/pandas/tests/util/test_deprecate_nonkeyword_arguments.py index e74ff89b11581..f81d32b574682 100644 --- a/pandas/tests/util/test_deprecate_nonkeyword_arguments.py +++ b/pandas/tests/util/test_deprecate_nonkeyword_arguments.py @@ -124,8 +124,7 @@ def test_i_signature(): class Foo: @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "bar"]) - def baz(self, bar=None, foobar=None): # pylint: disable=disallowed-name - ... + def baz(self, bar=None, foobar=None): ... def test_foo_signature(): diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py index 1e7fdd920e365..e654534ccd453 100644 --- a/pandas/tests/util/test_hashing.py +++ b/pandas/tests/util/test_hashing.py @@ -222,12 +222,12 @@ def test_hash_pandas_series_diff_index(series): assert not (a == b).all() -@pytest.mark.parametrize( - "obj", [Series([], dtype="float64"), Series([], dtype="object"), Index([])] -) -def test_hash_pandas_empty_object(obj, index): +@pytest.mark.parametrize("klass", [Index, Series]) +@pytest.mark.parametrize("dtype", ["float64", "object"]) +def test_hash_pandas_empty_object(klass, dtype, index): # These are by-definition the same with # or without the index as the data is empty. + obj = klass([], dtype=dtype) a = hash_pandas_object(obj, index=index) b = hash_pandas_object(obj, index=index) tm.assert_series_equal(a, b) @@ -236,9 +236,9 @@ def test_hash_pandas_empty_object(obj, index): @pytest.mark.parametrize( "s1", [ - Series(["a", "b", "c", "d"]), - Series([1000, 2000, 3000, 4000]), - Series(pd.date_range(0, periods=4)), + ["a", "b", "c", "d"], + [1000, 2000, 3000, 4000], + pd.date_range(0, periods=4), ], ) @pytest.mark.parametrize("categorize", [True, False]) @@ -247,6 +247,7 @@ def test_categorical_consistency(s1, categorize): # # Check that categoricals hash consistent with their values, # not codes. This should work for categoricals of any dtype. 
+ s1 = Series(s1) s2 = s1.astype("category").cat.set_categories(s1) s3 = s2.cat.set_categories(list(reversed(s1))) @@ -259,14 +260,14 @@ def test_categorical_consistency(s1, categorize): tm.assert_series_equal(h1, h3) -def test_categorical_with_nan_consistency(): - c = pd.Categorical.from_codes( - [-1, 0, 1, 2, 3, 4], categories=pd.date_range("2012-01-01", periods=5, name="B") - ) - expected = hash_array(c, categorize=False) +def test_categorical_with_nan_consistency(unit): + dti = pd.date_range("2012-01-01", periods=5, name="B", unit=unit) + cat = pd.Categorical.from_codes([-1, 0, 1, 2, 3, 4], categories=dti) + expected = hash_array(cat, categorize=False) - c = pd.Categorical.from_codes([-1, 0], categories=[pd.Timestamp("2012-01-01")]) - result = hash_array(c, categorize=False) + ts = pd.Timestamp("2012-01-01").as_unit(unit) + cat2 = pd.Categorical.from_codes([-1, 0], categories=[ts]) + result = hash_array(cat2, categorize=False) assert result[0] in expected assert result[1] in expected diff --git a/pandas/tests/util/test_shares_memory.py b/pandas/tests/util/test_shares_memory.py index 00a897d574a07..8f1ac93b40247 100644 --- a/pandas/tests/util/test_shares_memory.py +++ b/pandas/tests/util/test_shares_memory.py @@ -1,3 +1,5 @@ +import numpy as np + import pandas.util._test_decorators as td import pandas as pd @@ -20,10 +22,10 @@ def test_shares_memory_string(): # GH#55823 import pyarrow as pa - obj = pd.array(["a", "b"], dtype="string[pyarrow]") + obj = pd.array(["a", "b"], dtype=pd.StringDtype("pyarrow", na_value=pd.NA)) assert tm.shares_memory(obj, obj) - obj = pd.array(["a", "b"], dtype="string[pyarrow_numpy]") + obj = pd.array(["a", "b"], dtype=pd.StringDtype("pyarrow", na_value=np.nan)) assert tm.shares_memory(obj, obj) obj = pd.array(["a", "b"], dtype=pd.ArrowDtype(pa.string())) diff --git a/pandas/tests/window/conftest.py b/pandas/tests/window/conftest.py index 73ab470ab97a7..fe873b3b74254 100644 --- a/pandas/tests/window/conftest.py +++ b/pandas/tests/window/conftest.py @@ -50,28 +50,6 @@ def min_periods(request): return request.param -@pytest.fixture(params=[True, False]) -def parallel(request): - """parallel keyword argument for numba.jit""" - return request.param - - -# Can parameterize nogil & nopython over True | False, but limiting per -# https://github.com/pandas-dev/pandas/pull/41971#issuecomment-860607472 - - -@pytest.fixture(params=[False]) -def nogil(request): - """nogil keyword argument for numba.jit""" - return request.param - - -@pytest.fixture(params=[True]) -def nopython(request): - """nopython keyword argument for numba.jit""" - return request.param - - @pytest.fixture(params=[True, False]) def adjust(request): """adjust keyword argument for ewm""" diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index fe2da210c6fe9..877b50e37670c 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -71,7 +71,7 @@ def test_sum_object_str_raises(step): df = DataFrame({"A": range(5), "B": range(5, 10), "C": "foo"}) r = df.rolling(window=3, step=step) with pytest.raises( - DataError, match="Cannot aggregate non-numeric type: object|string" + DataError, match="Cannot aggregate non-numeric type: object|str" ): # GH#42738, enforced in 2.0 r.sum() @@ -87,14 +87,12 @@ def test_agg(step): b_mean = r["B"].mean() b_std = r["B"].std() - with tm.assert_produces_warning(FutureWarning, match="using Rolling.[mean|std]"): - result = r.aggregate([np.mean, np.std]) + result = r.aggregate([np.mean, lambda x: np.std(x, ddof=1)]) expected = 
concat([a_mean, a_std, b_mean, b_std], axis=1)
-    expected.columns = MultiIndex.from_product([["A", "B"], ["mean", "std"]])
+    expected.columns = MultiIndex.from_product([["A", "B"], ["mean", "<lambda>"]])
     tm.assert_frame_equal(result, expected)
 
-    with tm.assert_produces_warning(FutureWarning, match="using Rolling.[mean|std]"):
-        result = r.aggregate({"A": np.mean, "B": np.std})
+    result = r.aggregate({"A": np.mean, "B": lambda x: np.std(x, ddof=1)})
     expected = concat([a_mean, b_std], axis=1)
     tm.assert_frame_equal(result, expected, check_like=True)
@@ -127,19 +125,6 @@ def test_agg(step):
     tm.assert_frame_equal(result, expected, check_like=True)
 
 
-@pytest.mark.parametrize(
-    "func", [["min"], ["mean", "max"], {"b": "sum"}, {"b": "prod", "c": "median"}]
-)
-def test_multi_axis_1_raises(func):
-    # GH#46904
-    df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5], "c": [6, 7, 8]})
-    msg = "Support for axis=1 in DataFrame.rolling is deprecated"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        r = df.rolling(window=3, axis=1)
-    with pytest.raises(NotImplementedError, match="axis other than 0 is not supported"):
-        r.agg(func)
-
-
 def test_agg_apply(raw):
     # passed lambda
     df = DataFrame({"A": range(5), "B": range(0, 10, 2)})
@@ -147,8 +132,7 @@ def test_agg_apply(raw):
     r = df.rolling(window=3)
     a_sum = r["A"].sum()
 
-    with tm.assert_produces_warning(FutureWarning, match="using Rolling.[sum|std]"):
-        result = r.agg({"A": np.sum, "B": lambda x: np.std(x, ddof=1)})
+    result = r.agg({"A": np.sum, "B": lambda x: np.std(x, ddof=1)})
     rcustom = r["B"].apply(lambda x: np.std(x, ddof=1), raw=raw)
     expected = concat([a_sum, rcustom], axis=1)
     tm.assert_frame_equal(result, expected, check_like=True)
@@ -158,18 +142,15 @@ def test_agg_consistency(step):
     df = DataFrame({"A": range(5), "B": range(0, 10, 2)})
     r = df.rolling(window=3, step=step)
 
-    with tm.assert_produces_warning(FutureWarning, match="using Rolling.[sum|mean]"):
-        result = r.agg([np.sum, np.mean]).columns
+    result = r.agg([np.sum, np.mean]).columns
     expected = MultiIndex.from_product([list("AB"), ["sum", "mean"]])
     tm.assert_index_equal(result, expected)
 
-    with tm.assert_produces_warning(FutureWarning, match="using Rolling.[sum|mean]"):
-        result = r["A"].agg([np.sum, np.mean]).columns
+    result = r["A"].agg([np.sum, np.mean]).columns
     expected = Index(["sum", "mean"])
     tm.assert_index_equal(result, expected)
 
-    with tm.assert_produces_warning(FutureWarning, match="using Rolling.[sum|mean]"):
-        result = r.agg({"A": [np.sum, np.mean]}).columns
+    result = r.agg({"A": [np.sum, np.mean]}).columns
     expected = MultiIndex.from_tuples([("A", "sum"), ("A", "mean")])
     tm.assert_index_equal(result, expected)
 
@@ -196,6 +177,38 @@ def test_agg_nested_dicts():
         r.agg({"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}})
 
 
+@pytest.mark.parametrize(
+    "func,window_size",
+    [
+        (
+            "rolling",
+            2,
+        ),
+        (
+            "expanding",
+            None,
+        ),
+    ],
+)
+def test_pipe(func, window_size):
+    # Issue #57076
+    df = DataFrame(
+        {
+            "B": np.random.default_rng(2).standard_normal(10),
+            "C": np.random.default_rng(2).standard_normal(10),
+        }
+    )
+    r = getattr(df, func)(window_size)
+
+    expected = r.max() - r.mean()
+    result = r.pipe(lambda x: x.max() - x.mean())
+    tm.assert_frame_equal(result, expected)
+
+    expected = r.max() - 2 * r.min()
+    result = r.pipe(lambda x, k: x.max() - k * x.min(), k=2)
+    tm.assert_frame_equal(result, expected)
+
+
 def test_count_nonnumeric_types(step):
     # GH12541
     cols = [
@@ -352,32 +365,6 @@ def test_dont_modify_attributes_after_methods(
     assert result == expected
 
 
-def
test_centered_axis_validation(step): - # ok - msg = "The 'axis' keyword in Series.rolling is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - Series(np.ones(10)).rolling(window=3, center=True, axis=0, step=step).mean() - - # bad axis - msg = "No axis named 1 for object type Series" - with pytest.raises(ValueError, match=msg): - Series(np.ones(10)).rolling(window=3, center=True, axis=1, step=step).mean() - - # ok ok - df = DataFrame(np.ones((10, 10))) - msg = "The 'axis' keyword in DataFrame.rolling is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - df.rolling(window=3, center=True, axis=0, step=step).mean() - msg = "Support for axis=1 in DataFrame.rolling is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - df.rolling(window=3, center=True, axis=1, step=step).mean() - - # bad axis - msg = "No axis named 2 for object type DataFrame" - with pytest.raises(ValueError, match=msg): - (df.rolling(window=3, center=True, axis=2, step=step).mean()) - - def test_rolling_min_min_periods(step): a = Series([1, 2, 3, 4, 5]) result = a.rolling(window=100, min_periods=1, step=step).min() diff --git a/pandas/tests/window/test_apply.py b/pandas/tests/window/test_apply.py index 136f81632cb0a..2398713585cfb 100644 --- a/pandas/tests/window/test_apply.py +++ b/pandas/tests/window/test_apply.py @@ -316,13 +316,3 @@ def test_center_reindex_frame(raw): ) frame_rs = frame.rolling(window=25, min_periods=minp, center=True).apply(f, raw=raw) tm.assert_frame_equal(frame_xp, frame_rs) - - -def test_axis1(raw): - # GH 45912 - df = DataFrame([1, 2]) - msg = "Support for axis=1 in DataFrame.rolling is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.rolling(window=1, axis=1).apply(np.sum, raw=raw) - expected = DataFrame([1.0, 2.0]) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/window/test_base_indexer.py b/pandas/tests/window/test_base_indexer.py index 104acc1d527cb..4f91e56a7d82b 100644 --- a/pandas/tests/window/test_base_indexer.py +++ b/pandas/tests/window/test_base_indexer.py @@ -157,13 +157,13 @@ def test_rolling_forward_window( indexer = FixedForwardWindowIndexer(window_size=3) match = "Forward-looking windows can't have center=True" + rolling = frame_or_series(values).rolling(window=indexer, center=True) with pytest.raises(ValueError, match=match): - rolling = frame_or_series(values).rolling(window=indexer, center=True) getattr(rolling, func)() match = "Forward-looking windows don't support setting the closed argument" + rolling = frame_or_series(values).rolling(window=indexer, closed="right") with pytest.raises(ValueError, match=match): - rolling = frame_or_series(values).rolling(window=indexer, closed="right") getattr(rolling, func)() rolling = frame_or_series(values).rolling(window=indexer, min_periods=2, step=step) diff --git a/pandas/tests/window/test_cython_aggregations.py b/pandas/tests/window/test_cython_aggregations.py index c60cb6ea74ec0..39811ea3ec5b9 100644 --- a/pandas/tests/window/test_cython_aggregations.py +++ b/pandas/tests/window/test_cython_aggregations.py @@ -30,6 +30,9 @@ def _get_rolling_aggregations(): ("roll_median_c", window_aggregations.roll_median_c), ("roll_max", window_aggregations.roll_max), ("roll_min", window_aggregations.roll_min), + ("roll_first", window_aggregations.roll_first), + ("roll_last", window_aggregations.roll_last), + ("roll_nunique", window_aggregations.roll_nunique), ] + [ ( diff --git a/pandas/tests/window/test_ewm.py 
b/pandas/tests/window/test_ewm.py index 058e5ce36e53e..4ea6c805a2ee4 100644 --- a/pandas/tests/window/test_ewm.py +++ b/pandas/tests/window/test_ewm.py @@ -102,7 +102,8 @@ def test_ewma_with_times_equal_spacing(halflife_with_times, times, min_periods): tm.assert_frame_equal(result, expected) -def test_ewma_with_times_variable_spacing(tz_aware_fixture, unit): +def test_ewma_with_times_variable_spacing(tz_aware_fixture, unit, adjust): + # GH 54328 tz = tz_aware_fixture halflife = "23 days" times = ( @@ -112,8 +113,11 @@ def test_ewma_with_times_variable_spacing(tz_aware_fixture, unit): ) data = np.arange(3) df = DataFrame(data) - result = df.ewm(halflife=halflife, times=times).mean() - expected = DataFrame([0.0, 0.5674161888241773, 1.545239952073459]) + result = df.ewm(halflife=halflife, times=times, adjust=adjust).mean() + if adjust: + expected = DataFrame([0.0, 0.5674161888241773, 1.545239952073459]) + else: + expected = DataFrame([0.0, 0.23762518642226227, 1.534926369128742]) tm.assert_frame_equal(result, expected) @@ -148,64 +152,57 @@ def test_ewm_getitem_attributes_retained(arg, adjust, ignore_na): assert result == expected -def test_ewma_times_adjust_false_raises(): - # GH 40098 +def test_ewma_times_adjust_false_with_disallowed_com(): + # GH 54328 with pytest.raises( - NotImplementedError, match="times is not supported with adjust=False." + NotImplementedError, + match=( + "None of com, span, or alpha can be specified " + "if times is provided and adjust=False" + ), ): Series(range(1)).ewm( - 0.1, adjust=False, times=date_range("2000", freq="D", periods=1) + 0.1, + adjust=False, + times=date_range("2000", freq="D", periods=1), + halflife="1D", ) -@pytest.mark.parametrize( - "func, expected", - [ - [ - "mean", - DataFrame( - { - 0: range(5), - 1: range(4, 9), - 2: [7.428571, 9, 10.571429, 12.142857, 13.714286], - }, - dtype=float, - ), - ], - [ - "std", - DataFrame( - { - 0: [np.nan] * 5, - 1: [4.242641] * 5, - 2: [4.6291, 5.196152, 5.781745, 6.380775, 6.989788], - } - ), - ], - [ - "var", - DataFrame( - { - 0: [np.nan] * 5, - 1: [18.0] * 5, - 2: [21.428571, 27, 33.428571, 40.714286, 48.857143], - } - ), - ], - ], -) -def test_float_dtype_ewma(func, expected, float_numpy_dtype): - # GH#42452 +def test_ewma_times_adjust_false_with_disallowed_alpha(): + # GH 54328 + with pytest.raises( + NotImplementedError, + match=( + "None of com, span, or alpha can be specified " + "if times is provided and adjust=False" + ), + ): + Series(range(1)).ewm( + 0.1, + adjust=False, + times=date_range("2000", freq="D", periods=1), + alpha=0.5, + halflife="1D", + ) - df = DataFrame( - {0: range(5), 1: range(6, 11), 2: range(10, 20, 2)}, dtype=float_numpy_dtype - ) - msg = "Support for axis=1 in DataFrame.ewm is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - e = df.ewm(alpha=0.5, axis=1) - result = getattr(e, func)() - tm.assert_frame_equal(result, expected) +def test_ewma_times_adjust_false_with_disallowed_span(): + # GH 54328 + with pytest.raises( + NotImplementedError, + match=( + "None of com, span, or alpha can be specified " + "if times is provided and adjust=False" + ), + ): + Series(range(1)).ewm( + 0.1, + adjust=False, + times=date_range("2000", freq="D", periods=1), + span=10, + halflife="1D", + ) def test_times_string_col_raises(): @@ -223,6 +220,18 @@ def test_ewm_sum_adjust_false_notimplemented(): data.sum() +@pytest.mark.parametrize("method", ["sum", "std", "var", "cov", "corr"]) +def test_times_only_mean_implemented(frame_or_series, method): + # GH 51695 + halflife 
= "1 day" + times = date_range("2000", freq="D", periods=10) + ewm = frame_or_series(range(10)).ewm(halflife=halflife, times=times) + with pytest.raises( + NotImplementedError, match=f"{method} is not implemented with times" + ): + getattr(ewm, method)() + + @pytest.mark.parametrize( "expected_data, ignore", [[[10.0, 5.0, 2.5, 11.25], False], [[10.0, 5.0, 5.0, 12.5], True]], @@ -271,67 +280,67 @@ def test_ewma_nan_handling(): "s, adjust, ignore_na, w", [ ( - Series([np.nan, 1.0, 101.0]), + [np.nan, 1.0, 101.0], True, False, [np.nan, (1.0 - (1.0 / (1.0 + 2.0))), 1.0], ), ( - Series([np.nan, 1.0, 101.0]), + [np.nan, 1.0, 101.0], True, True, [np.nan, (1.0 - (1.0 / (1.0 + 2.0))), 1.0], ), ( - Series([np.nan, 1.0, 101.0]), + [np.nan, 1.0, 101.0], False, False, [np.nan, (1.0 - (1.0 / (1.0 + 2.0))), (1.0 / (1.0 + 2.0))], ), ( - Series([np.nan, 1.0, 101.0]), + [np.nan, 1.0, 101.0], False, True, [np.nan, (1.0 - (1.0 / (1.0 + 2.0))), (1.0 / (1.0 + 2.0))], ), ( - Series([1.0, np.nan, 101.0]), + [1.0, np.nan, 101.0], True, False, [(1.0 - (1.0 / (1.0 + 2.0))) ** 2, np.nan, 1.0], ), ( - Series([1.0, np.nan, 101.0]), + [1.0, np.nan, 101.0], True, True, [(1.0 - (1.0 / (1.0 + 2.0))), np.nan, 1.0], ), ( - Series([1.0, np.nan, 101.0]), + [1.0, np.nan, 101.0], False, False, [(1.0 - (1.0 / (1.0 + 2.0))) ** 2, np.nan, (1.0 / (1.0 + 2.0))], ), ( - Series([1.0, np.nan, 101.0]), + [1.0, np.nan, 101.0], False, True, [(1.0 - (1.0 / (1.0 + 2.0))), np.nan, (1.0 / (1.0 + 2.0))], ), ( - Series([np.nan, 1.0, np.nan, np.nan, 101.0, np.nan]), + [np.nan, 1.0, np.nan, np.nan, 101.0, np.nan], True, False, [np.nan, (1.0 - (1.0 / (1.0 + 2.0))) ** 3, np.nan, np.nan, 1.0, np.nan], ), ( - Series([np.nan, 1.0, np.nan, np.nan, 101.0, np.nan]), + [np.nan, 1.0, np.nan, np.nan, 101.0, np.nan], True, True, [np.nan, (1.0 - (1.0 / (1.0 + 2.0))), np.nan, np.nan, 1.0, np.nan], ), ( - Series([np.nan, 1.0, np.nan, np.nan, 101.0, np.nan]), + [np.nan, 1.0, np.nan, np.nan, 101.0, np.nan], False, False, [ @@ -344,7 +353,7 @@ def test_ewma_nan_handling(): ], ), ( - Series([np.nan, 1.0, np.nan, np.nan, 101.0, np.nan]), + [np.nan, 1.0, np.nan, np.nan, 101.0, np.nan], False, True, [ @@ -357,7 +366,7 @@ def test_ewma_nan_handling(): ], ), ( - Series([1.0, np.nan, 101.0, 50.0]), + [1.0, np.nan, 101.0, 50.0], True, False, [ @@ -368,7 +377,7 @@ def test_ewma_nan_handling(): ], ), ( - Series([1.0, np.nan, 101.0, 50.0]), + [1.0, np.nan, 101.0, 50.0], True, True, [ @@ -379,7 +388,7 @@ def test_ewma_nan_handling(): ], ), ( - Series([1.0, np.nan, 101.0, 50.0]), + [1.0, np.nan, 101.0, 50.0], False, False, [ @@ -391,7 +400,7 @@ def test_ewma_nan_handling(): ], ), ( - Series([1.0, np.nan, 101.0, 50.0]), + [1.0, np.nan, 101.0, 50.0], False, True, [ @@ -405,6 +414,7 @@ def test_ewma_nan_handling(): ) def test_ewma_nan_handling_cases(s, adjust, ignore_na, w): # GH 7603 + s = Series(s) expected = (s.multiply(w).cumsum() / Series(w).cumsum()).ffill() result = s.ewm(com=2.0, adjust=adjust, ignore_na=ignore_na).mean() diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py index aebb9e86c763f..2c96ce01c6328 100644 --- a/pandas/tests/window/test_expanding.py +++ b/pandas/tests/window/test_expanding.py @@ -79,23 +79,14 @@ def test_missing_minp_zero(): tm.assert_series_equal(result, expected) -def test_expanding_axis(axis_frame): +def test_expanding(): # see gh-23372. 
df = DataFrame(np.ones((10, 20))) - axis = df._get_axis_number(axis_frame) - if axis == 0: - msg = "The 'axis' keyword in DataFrame.expanding is deprecated" - expected = DataFrame( - {i: [np.nan] * 2 + [float(j) for j in range(3, 11)] for i in range(20)} - ) - else: - # axis == 1 - msg = "Support for axis=1 in DataFrame.expanding is deprecated" - expected = DataFrame([[np.nan] * 2 + [float(i) for i in range(3, 21)]] * 10) - - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.expanding(3, axis=axis_frame).sum() + expected = DataFrame( + {i: [np.nan] * 2 + [float(j) for j in range(3, 11)] for i in range(20)} + ) + result = df.expanding(3).sum() tm.assert_frame_equal(result, expected) @@ -127,7 +118,7 @@ def test_expanding_count_with_min_periods_exceeding_series_length(frame_or_serie "df,expected,min_periods", [ ( - DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}), + {"A": [1, 2, 3], "B": [4, 5, 6]}, [ ({"A": [1], "B": [4]}, [0]), ({"A": [1, 2], "B": [4, 5]}, [0, 1]), @@ -136,7 +127,7 @@ def test_expanding_count_with_min_periods_exceeding_series_length(frame_or_serie 3, ), ( - DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}), + {"A": [1, 2, 3], "B": [4, 5, 6]}, [ ({"A": [1], "B": [4]}, [0]), ({"A": [1, 2], "B": [4, 5]}, [0, 1]), @@ -145,7 +136,7 @@ def test_expanding_count_with_min_periods_exceeding_series_length(frame_or_serie 2, ), ( - DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}), + {"A": [1, 2, 3], "B": [4, 5, 6]}, [ ({"A": [1], "B": [4]}, [0]), ({"A": [1, 2], "B": [4, 5]}, [0, 1]), @@ -153,10 +144,10 @@ def test_expanding_count_with_min_periods_exceeding_series_length(frame_or_serie ], 1, ), - (DataFrame({"A": [1], "B": [4]}), [], 2), - (DataFrame(), [({}, [])], 1), + ({"A": [1], "B": [4]}, [], 2), + (None, [({}, [])], 1), ( - DataFrame({"A": [1, np.nan, 3], "B": [np.nan, 5, 6]}), + {"A": [1, np.nan, 3], "B": [np.nan, 5, 6]}, [ ({"A": [1.0], "B": [np.nan]}, [0]), ({"A": [1, np.nan], "B": [np.nan, 5]}, [0, 1]), @@ -165,7 +156,7 @@ def test_expanding_count_with_min_periods_exceeding_series_length(frame_or_serie 3, ), ( - DataFrame({"A": [1, np.nan, 3], "B": [np.nan, 5, 6]}), + {"A": [1, np.nan, 3], "B": [np.nan, 5, 6]}, [ ({"A": [1.0], "B": [np.nan]}, [0]), ({"A": [1, np.nan], "B": [np.nan, 5]}, [0, 1]), @@ -174,7 +165,7 @@ def test_expanding_count_with_min_periods_exceeding_series_length(frame_or_serie 2, ), ( - DataFrame({"A": [1, np.nan, 3], "B": [np.nan, 5, 6]}), + {"A": [1, np.nan, 3], "B": [np.nan, 5, 6]}, [ ({"A": [1.0], "B": [np.nan]}, [0]), ({"A": [1, np.nan], "B": [np.nan, 5]}, [0, 1]), @@ -186,9 +177,10 @@ def test_expanding_count_with_min_periods_exceeding_series_length(frame_or_serie ) def test_iter_expanding_dataframe(df, expected, min_periods): # GH 11704 - expected = [DataFrame(values, index=index) for (values, index) in expected] + df = DataFrame(df) + expecteds = [DataFrame(values, index=index) for (values, index) in expected] - for expected, actual in zip(expected, df.expanding(min_periods)): + for expected, actual in zip(expecteds, df.expanding(min_periods)): tm.assert_frame_equal(actual, expected) @@ -205,9 +197,9 @@ def test_iter_expanding_dataframe(df, expected, min_periods): ) def test_iter_expanding_series(ser, expected, min_periods): # GH 11704 - expected = [Series(values, index=index) for (values, index) in expected] + expecteds = [Series(values, index=index) for (values, index) in expected] - for expected, actual in zip(expected, ser.expanding(min_periods)): + for expected, actual in zip(expecteds, ser.expanding(min_periods)): 
tm.assert_series_equal(actual, expected) @@ -241,7 +233,6 @@ def test_expanding_skew_kurt_numerical_stability(method): @pytest.mark.parametrize("window", [1, 3, 10, 20]) @pytest.mark.parametrize("method", ["min", "max", "average"]) @pytest.mark.parametrize("pct", [True, False]) -@pytest.mark.parametrize("ascending", [True, False]) @pytest.mark.parametrize("test_data", ["default", "duplicates", "nans"]) def test_rank(window, method, pct, ascending, test_data): length = 20 @@ -264,6 +255,43 @@ def test_rank(window, method, pct, ascending, test_data): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("window", [1, 3, 10, 20]) +@pytest.mark.parametrize("test_data", ["default", "duplicates", "nans", "precision"]) +def test_nunique(window, test_data): + length = 20 + if test_data == "default": + ser = Series(data=np.random.default_rng(2).random(length)) + elif test_data == "duplicates": + ser = Series(data=np.random.default_rng(2).choice(3, length)) + elif test_data == "nans": + ser = Series( + data=np.random.default_rng(2).choice( + [1.0, 0.25, 0.75, np.nan, np.inf, -np.inf], length + ) + ) + elif test_data == "precision": + ser = Series( + data=[ + 0.3, + 0.1 * 3, # Not necessarily exactly 0.3 + 0.6, + 0.2 * 3, # Not necessarily exactly 0.6 + 0.9, + 0.3 * 3, # Not necessarily exactly 0.9 + 0.5, + 0.1 * 5, # Not necessarily exactly 0.5 + 0.8, + 0.2 * 4, # Not necessarily exactly 0.8 + ], + dtype=np.float64, + ) + + expected = ser.expanding(window).apply(lambda x: x.nunique()) + result = ser.expanding(window).nunique() + + tm.assert_series_equal(result, expected) + + def test_expanding_corr(series): A = series.dropna() B = (A + np.random.default_rng(2).standard_normal(len(A)))[:-5] @@ -319,7 +347,7 @@ def test_expanding_corr_pairwise(frame): @pytest.mark.parametrize( "func,static_comp", [ - ("sum", np.sum), + ("sum", lambda x: np.sum(x, axis=0)), ("mean", lambda x: np.mean(x, axis=0)), ("max", lambda x: np.max(x, axis=0)), ("min", lambda x: np.min(x, axis=0)), @@ -329,18 +357,11 @@ def test_expanding_corr_pairwise(frame): def test_expanding_func(func, static_comp, frame_or_series): data = frame_or_series(np.array(list(range(10)) + [np.nan] * 10)) - msg = "The 'axis' keyword in (Series|DataFrame).expanding is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - obj = data.expanding(min_periods=1, axis=0) + obj = data.expanding(min_periods=1) result = getattr(obj, func)() assert isinstance(result, frame_or_series) - msg = "The behavior of DataFrame.sum with axis=None is deprecated" - warn = None - if frame_or_series is DataFrame and static_comp is np.sum: - warn = FutureWarning - with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): - expected = static_comp(data[:11]) + expected = static_comp(data[:11]) if frame_or_series is Series: tm.assert_almost_equal(result[10], expected) else: @@ -355,33 +376,26 @@ def test_expanding_func(func, static_comp, frame_or_series): def test_expanding_min_periods(func, static_comp): ser = Series(np.random.default_rng(2).standard_normal(50)) - msg = "The 'axis' keyword in Series.expanding is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = getattr(ser.expanding(min_periods=30, axis=0), func)() + result = getattr(ser.expanding(min_periods=30), func)() assert result[:29].isna().all() tm.assert_almost_equal(result.iloc[-1], static_comp(ser[:50])) # min_periods is working correctly - with tm.assert_produces_warning(FutureWarning, match=msg): - result = 
getattr(ser.expanding(min_periods=15, axis=0), func)() + result = getattr(ser.expanding(min_periods=15), func)() assert isna(result.iloc[13]) assert notna(result.iloc[14]) ser2 = Series(np.random.default_rng(2).standard_normal(20)) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = getattr(ser2.expanding(min_periods=5, axis=0), func)() + result = getattr(ser2.expanding(min_periods=5), func)() assert isna(result[3]) assert notna(result[4]) # min_periods=0 - with tm.assert_produces_warning(FutureWarning, match=msg): - result0 = getattr(ser.expanding(min_periods=0, axis=0), func)() - with tm.assert_produces_warning(FutureWarning, match=msg): - result1 = getattr(ser.expanding(min_periods=1, axis=0), func)() + result0 = getattr(ser.expanding(min_periods=0), func)() + result1 = getattr(ser.expanding(min_periods=1), func)() tm.assert_almost_equal(result0, result1) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = getattr(ser.expanding(min_periods=1, axis=0), func)() + result = getattr(ser.expanding(min_periods=1), func)() tm.assert_almost_equal(result.iloc[-1], static_comp(ser[:50])) @@ -474,6 +488,8 @@ def test_moment_functions_zero_length_pairwise(f): lambda x: x.expanding(min_periods=5).corr(x, pairwise=False), lambda x: x.expanding(min_periods=5).max(), lambda x: x.expanding(min_periods=5).min(), + lambda x: x.expanding(min_periods=5).first(), + lambda x: x.expanding(min_periods=5).last(), lambda x: x.expanding(min_periods=5).sum(), lambda x: x.expanding(min_periods=5).mean(), lambda x: x.expanding(min_periods=5).std(), @@ -525,8 +541,8 @@ def test_expanding_apply_min_periods_0(engine_and_raw): def test_expanding_cov_diff_index(): # GH 7512 - s1 = Series([1, 2, 3], index=[0, 1, 2]) - s2 = Series([1, 3], index=[0, 2]) + s1 = Series([1, 2, 3], index=range(3)) + s2 = Series([1, 3], index=range(0, 4, 2)) result = s1.expanding().cov(s2) expected = Series([None, None, 2.0]) tm.assert_series_equal(result, expected) @@ -538,14 +554,14 @@ def test_expanding_cov_diff_index(): s1 = Series([7, 8, 10], index=[0, 1, 3]) s2 = Series([7, 9, 10], index=[0, 2, 3]) result = s1.expanding().cov(s2) - expected = Series([None, None, None, 4.5]) + expected = Series([None, None, None, 4.5], index=list(range(4))) tm.assert_series_equal(result, expected) def test_expanding_corr_diff_index(): # GH 7512 - s1 = Series([1, 2, 3], index=[0, 1, 2]) - s2 = Series([1, 3], index=[0, 2]) + s1 = Series([1, 2, 3], index=range(3)) + s2 = Series([1, 3], index=range(0, 4, 2)) result = s1.expanding().corr(s2) expected = Series([None, None, 1.0]) tm.assert_series_equal(result, expected) @@ -557,7 +573,7 @@ def test_expanding_corr_diff_index(): s1 = Series([7, 8, 10], index=[0, 1, 3]) s2 = Series([7, 9, 10], index=[0, 2, 3]) result = s1.expanding().corr(s2) - expected = Series([None, None, None, 1.0]) + expected = Series([None, None, None, 1.0], index=list(range(4))) tm.assert_series_equal(result, expected) @@ -573,7 +589,7 @@ def test_expanding_cov_pairwise_diff_length(): df2a = DataFrame( [[5, 6], [2, 1]], index=[0, 2], columns=Index(["X", "Y"], name="foo") ) - # TODO: xref gh-15826 + # xref gh-15826 # .loc is not preserving the names result1 = df1.expanding().cov(df2, pairwise=True).loc[2] result2 = df1.expanding().cov(df2a, pairwise=True).loc[2] @@ -619,6 +635,104 @@ def test_expanding_corr_pairwise_diff_length(): tm.assert_frame_equal(result4, expected) +@pytest.mark.parametrize( + "values,method,expected", + [ + ( + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + "first", + 
[float("nan"), float("nan"), 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + ), + ( + [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan], + "first", + [ + float("nan"), + float("nan"), + float("nan"), + float("nan"), + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + ], + ), + ( + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + "last", + [float("nan"), float("nan"), 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + ), + ( + [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan], + "last", + [ + float("nan"), + float("nan"), + float("nan"), + float("nan"), + 5.0, + 5.0, + 7.0, + 7.0, + 9.0, + 9.0, + ], + ), + ], +) +def test_expanding_first_last(values, method, expected): + # GH#33155 + x = Series(values) + result = getattr(x.expanding(3), method)() + expected = Series(expected) + tm.assert_almost_equal(result, expected) + + x = DataFrame({"A": values}) + result = getattr(x.expanding(3), method)() + expected = DataFrame({"A": expected}) + tm.assert_almost_equal(result, expected) + + +@pytest.mark.parametrize( + "values,method,expected", + [ + ( + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + "first", + [1.0] * 10, + ), + ( + [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan], + "first", + [1.0] * 10, + ), + ( + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + "last", + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + ), + ( + [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan], + "last", + [1.0, 1.0, 3.0, 3.0, 5.0, 5.0, 7.0, 7.0, 9.0, 9.0], + ), + ], +) +def test_expanding_first_last_no_minp(values, method, expected): + # GH#33155 + x = Series(values) + result = getattr(x.expanding(min_periods=0), method)() + expected = Series(expected) + tm.assert_almost_equal(result, expected) + + x = DataFrame({"A": values}) + result = getattr(x.expanding(min_periods=0), method)() + expected = DataFrame({"A": expected}) + tm.assert_almost_equal(result, expected) + + def test_expanding_apply_args_kwargs(engine_and_raw): def mean_w_arg(x, const): return np.mean(x) + const @@ -714,10 +828,3 @@ def test_numeric_only_corr_cov_series(kernel, use_arg, numeric_only, dtype): op2 = getattr(expanding2, kernel) expected = op2(*arg2, numeric_only=numeric_only) tm.assert_series_equal(result, expected) - - -def test_keyword_quantile_deprecated(): - # GH #52550 - ser = Series([1, 2, 3, 4]) - with tm.assert_produces_warning(FutureWarning): - ser.expanding().quantile(quantile=0.5) diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index 400bf10817ab8..1dcdad2bfd73d 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -6,6 +6,7 @@ DatetimeIndex, Index, MultiIndex, + NamedAgg, Series, Timestamp, date_range, @@ -90,9 +91,12 @@ def test_getitem_multiple(self, roll_frame): "mean", "min", "max", + "first", + "last", "count", "kurt", "skew", + "nunique", ], ) def test_rolling(self, f, roll_frame): @@ -100,11 +104,7 @@ def test_rolling(self, f, roll_frame): r = g.rolling(window=4) result = getattr(r, f)() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = g.apply(lambda x: getattr(x.rolling(4), f)()) - # groupby.apply doesn't drop the grouped-by column - expected = expected.drop("A", axis=1) + expected = g.apply(lambda x: getattr(x.rolling(4), f)()) # GH 39732 expected_index = MultiIndex.from_arrays([roll_frame["A"], range(40)]) expected.index = expected_index @@ -116,11 +116,7 @@ def 
test_rolling_ddof(self, f, roll_frame): r = g.rolling(window=4) result = getattr(r, f)(ddof=1) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1)) - # groupby.apply doesn't drop the grouped-by column - expected = expected.drop("A", axis=1) + expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1)) # GH 39732 expected_index = MultiIndex.from_arrays([roll_frame["A"], range(40)]) expected.index = expected_index @@ -134,13 +130,9 @@ def test_rolling_quantile(self, interpolation, roll_frame): r = g.rolling(window=4) result = r.quantile(0.4, interpolation=interpolation) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = g.apply( - lambda x: x.rolling(4).quantile(0.4, interpolation=interpolation) - ) - # groupby.apply doesn't drop the grouped-by column - expected = expected.drop("A", axis=1) + expected = g.apply( + lambda x: x.rolling(4).quantile(0.4, interpolation=interpolation) + ) # GH 39732 expected_index = MultiIndex.from_arrays([roll_frame["A"], range(40)]) expected.index = expected_index @@ -181,9 +173,7 @@ def test_rolling_corr_cov_other_diff_size_as_groups(self, f, roll_frame): def func(x): return getattr(x.rolling(4), f)(roll_frame) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = g.apply(func) + expected = g.apply(func) # GH 39591: The grouped column should be all np.nan # (groupby.apply inserts 0s for cov) expected["A"] = np.nan @@ -199,9 +189,7 @@ def test_rolling_corr_cov_pairwise(self, f, roll_frame): def func(x): return getattr(x.B.rolling(4), f)(pairwise=True) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = g.apply(func) + expected = g.apply(func) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -246,11 +234,7 @@ def test_rolling_apply(self, raw, roll_frame): # reduction result = r.apply(lambda x: x.sum(), raw=raw) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw)) - # groupby.apply doesn't drop the grouped-by column - expected = expected.drop("A", axis=1) + expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw)) # GH 39732 expected_index = MultiIndex.from_arrays([roll_frame["A"], range(40)]) expected.index = expected_index @@ -447,7 +431,7 @@ def get_window_bounds( ): min_periods = self.window_size if min_periods is None else 0 end = np.arange(num_values, dtype=np.int64) + 1 - start = end.copy() - self.window_size + start = end - self.window_size start[start < 0] = min_periods return start, end @@ -489,6 +473,36 @@ def test_groupby_rolling_subset_with_closed(self): ) tm.assert_series_equal(result, expected) + def test_groupby_rolling_agg_namedagg(self): + # GH#28333 + df = DataFrame( + { + "kind": ["cat", "dog", "cat", "dog", "cat", "dog"], + "height": [9.1, 6.0, 9.5, 34.0, 12.0, 8.0], + "weight": [7.9, 7.5, 9.9, 198.0, 10.0, 42.0], + } + ) + result = ( + df.groupby("kind") + .rolling(2) + .agg( + total_weight=NamedAgg(column="weight", aggfunc=sum), + min_height=NamedAgg(column="height", aggfunc=min), + ) + ) + expected = DataFrame( + { + "total_weight": [np.nan, 17.8, 19.9, 
np.nan, 205.5, 240.0], + "min_height": [np.nan, 9.1, 9.5, np.nan, 6.0, 8.0], + }, + index=MultiIndex( + [["cat", "dog"], [0, 1, 2, 3, 4, 5]], + [[0, 0, 0, 1, 1, 1], [0, 2, 4, 1, 3, 5]], + names=["kind", None], + ), + ) + tm.assert_frame_equal(result, expected) + def test_groupby_subset_rolling_subset_with_closed(self): # GH 35549 df = DataFrame( @@ -582,7 +596,7 @@ def test_groupby_rolling_string_index(self): groups = df.groupby("group") df["count_to_date"] = groups.cumcount() - rolling_groups = groups.rolling("10d", on="eventTime") + rolling_groups = groups.rolling("10D", on="eventTime") result = rolling_groups.apply(lambda df: df.shape[0]) expected = DataFrame( [ @@ -623,11 +637,14 @@ def test_groupby_rolling_count_closed_on(self, unit): "date": date_range(end="20190101", periods=6, unit=unit), } ) - result = ( - df.groupby("group") - .rolling("3d", on="date", closed="left")["column1"] - .count() - ) + msg = "'d' is deprecated and will be removed in a future version." + + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ( + df.groupby("group") + .rolling("3d", on="date", closed="left")["column1"] + .count() + ) dti = DatetimeIndex( [ "2018-12-27", @@ -792,13 +809,9 @@ def test_groupby_rolling_resulting_multiindex3(self): def test_groupby_rolling_object_doesnt_affect_groupby_apply(self, roll_frame): # GH 39732 g = roll_frame.groupby("A", group_keys=False) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = g.apply(lambda x: x.rolling(4).sum()).index + expected = g.apply(lambda x: x.rolling(4).sum()).index _ = g.rolling(window=4) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = g.apply(lambda x: x.rolling(4).sum()).index + result = g.apply(lambda x: x.rolling(4).sum()).index tm.assert_index_equal(result, expected) @pytest.mark.parametrize( @@ -974,13 +987,11 @@ def test_groupby_monotonic(self): df["date"] = to_datetime(df["date"]) df = df.sort_values("date") - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = ( - df.set_index("date") - .groupby("name") - .apply(lambda x: x.rolling("180D")["amount"].sum()) - ) + expected = ( + df.set_index("date") + .groupby("name") + .apply(lambda x: x.rolling("180D")["amount"].sum()) + ) result = df.groupby("name").rolling("180D", on="date")["amount"].sum() tm.assert_series_equal(result, expected) @@ -999,13 +1010,9 @@ def test_datelike_on_monotonic_within_each_group(self): } ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = ( - df.set_index("B") - .groupby("A") - .apply(lambda x: x.rolling("4s")["C"].mean()) - ) + expected = ( + df.set_index("B").groupby("A").apply(lambda x: x.rolling("4s")["C"].mean()) + ) result = df.groupby("A").rolling("4s", on="B").C.mean() tm.assert_series_equal(result, expected) @@ -1028,18 +1035,26 @@ def frame(self): return DataFrame({"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)}) @pytest.mark.parametrize( - "f", ["sum", "mean", "min", "max", "count", "kurt", "skew"] + "f", + [ + "sum", + "mean", + "min", + "max", + "first", + "last", + "count", + "kurt", + "skew", + "nunique", + ], ) def test_expanding(self, f, frame): g = frame.groupby("A", group_keys=False) r = g.expanding() result = getattr(r, f)() - msg = "DataFrameGroupBy.apply operated on 
the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = g.apply(lambda x: getattr(x.expanding(), f)()) - # groupby.apply doesn't drop the grouped-by column - expected = expected.drop("A", axis=1) + expected = g.apply(lambda x: getattr(x.expanding(), f)()) # GH 39732 expected_index = MultiIndex.from_arrays([frame["A"], range(40)]) expected.index = expected_index @@ -1051,11 +1066,7 @@ def test_expanding_ddof(self, f, frame): r = g.expanding() result = getattr(r, f)(ddof=0) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0)) - # groupby.apply doesn't drop the grouped-by column - expected = expected.drop("A", axis=1) + expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0)) # GH 39732 expected_index = MultiIndex.from_arrays([frame["A"], range(40)]) expected.index = expected_index @@ -1069,13 +1080,9 @@ def test_expanding_quantile(self, interpolation, frame): r = g.expanding() result = r.quantile(0.4, interpolation=interpolation) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = g.apply( - lambda x: x.expanding().quantile(0.4, interpolation=interpolation) - ) - # groupby.apply doesn't drop the grouped-by column - expected = expected.drop("A", axis=1) + expected = g.apply( + lambda x: x.expanding().quantile(0.4, interpolation=interpolation) + ) # GH 39732 expected_index = MultiIndex.from_arrays([frame["A"], range(40)]) expected.index = expected_index @@ -1091,9 +1098,7 @@ def test_expanding_corr_cov(self, f, frame): def func_0(x): return getattr(x.expanding(), f)(frame) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = g.apply(func_0) + expected = g.apply(func_0) # GH 39591: groupby.apply returns 1 instead of nan for windows # with all nan values null_idx = list(range(20, 61)) + list(range(72, 113)) @@ -1108,9 +1113,7 @@ def func_0(x): def func_1(x): return getattr(x.B.expanding(), f)(pairwise=True) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = g.apply(func_1) + expected = g.apply(func_1) tm.assert_series_equal(result, expected) def test_expanding_apply(self, raw, frame): @@ -1119,18 +1122,42 @@ def test_expanding_apply(self, raw, frame): # reduction result = r.apply(lambda x: x.sum(), raw=raw) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = g.apply( - lambda x: x.expanding().apply(lambda y: y.sum(), raw=raw) - ) - # groupby.apply doesn't drop the grouped-by column - expected = expected.drop("A", axis=1) + expected = g.apply(lambda x: x.expanding().apply(lambda y: y.sum(), raw=raw)) # GH 39732 expected_index = MultiIndex.from_arrays([frame["A"], range(40)]) expected.index = expected_index tm.assert_frame_equal(result, expected) + def test_groupby_expanding_agg_namedagg(self): + # GH#28333 + df = DataFrame( + { + "kind": ["cat", "dog", "cat", "dog", "cat", "dog"], + "height": [9.1, 6.0, 9.5, 34.0, 12.0, 8.0], + "weight": [7.9, 7.5, 9.9, 198.0, 10.0, 42.0], + } + ) + result = ( + df.groupby("kind") + .expanding(1) + .agg( + total_weight=NamedAgg(column="weight", aggfunc=sum), + min_height=NamedAgg(column="height", aggfunc=min), + ) + ) + expected = 
DataFrame( + { + "total_weight": [7.9, 17.8, 27.8, 7.5, 205.5, 247.5], + "min_height": [9.1, 9.1, 9.1, 6.0, 6.0, 6.0], + }, + index=MultiIndex( + [["cat", "dog"], [0, 1, 2, 3, 4, 5]], + [[0, 0, 0, 1, 1, 1], [0, 2, 4, 1, 3, 5]], + names=["kind", None], + ), + ) + tm.assert_frame_equal(result, expected) + class TestEWM: @pytest.mark.parametrize( @@ -1159,6 +1186,41 @@ def test_methods(self, method, expected_data): ) tm.assert_frame_equal(result, expected) + def test_groupby_ewm_agg_namedagg(self): + # GH#28333 + df = DataFrame({"A": ["a"] * 4, "B": range(4)}) + result = ( + df.groupby("A") + .ewm(com=1.0) + .agg( + B_mean=NamedAgg(column="B", aggfunc="mean"), + B_std=NamedAgg(column="B", aggfunc="std"), + B_var=NamedAgg(column="B", aggfunc="var"), + ) + ) + expected = DataFrame( + { + "B_mean": [ + 0.0, + 0.6666666666666666, + 1.4285714285714286, + 2.2666666666666666, + ], + "B_std": [np.nan, 0.707107, 0.963624, 1.177164], + "B_var": [np.nan, 0.5, 0.9285714285714286, 1.3857142857142857], + }, + index=MultiIndex.from_tuples( + [ + ("a", 0), + ("a", 1), + ("a", 2), + ("a", 3), + ], + names=["A", None], + ), + ) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "method, expected_data", [["corr", [np.nan, 1.0, 1.0, 1]], ["cov", [np.nan, 0.5, 0.928571, 1.385714]]], diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index b1cc7ec186f19..ff6a616bc5264 100644 --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -1,6 +1,7 @@ import numpy as np import pytest +from pandas.compat import is_platform_arm from pandas.errors import NumbaUtilError import pandas.util._test_decorators as td @@ -11,8 +12,18 @@ to_datetime, ) import pandas._testing as tm +from pandas.api.indexers import BaseIndexer +from pandas.util.version import Version -pytestmark = pytest.mark.single_cpu +pytestmark = [pytest.mark.single_cpu] + +numba = pytest.importorskip("numba") +pytestmark.append( + pytest.mark.skipif( + Version(numba.__version__) == Version("0.61") and is_platform_arm(), + reason=f"Segfaults on ARM platforms with numba {numba.__version__}", + ) +) @pytest.fixture(params=["single", "table"]) @@ -38,6 +49,11 @@ def arithmetic_numba_supported_operators(request): return request.param +@pytest.fixture +def roll_frame(): + return DataFrame({"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)}) + + @td.skip_if_no("numba") @pytest.mark.filterwarnings("ignore") # Filter warnings when parallel=True and the function can't be parallelized by Numba @@ -67,6 +83,77 @@ def f(x, *args): ) tm.assert_series_equal(result, expected) + def test_apply_numba_with_kwargs(self, roll_frame): + # GH 58995 + # rolling apply + def func(sr, a=0): + return sr.sum() + a + + data = DataFrame(range(10)) + + result = data.rolling(5).apply(func, engine="numba", raw=True, kwargs={"a": 1}) + expected = data.rolling(5).sum() + 1 + tm.assert_frame_equal(result, expected) + + result = data.rolling(5).apply(func, engine="numba", raw=True, args=(1,)) + tm.assert_frame_equal(result, expected) + + # expanding apply + + result = data.expanding().apply(func, engine="numba", raw=True, kwargs={"a": 1}) + expected = data.expanding().sum() + 1 + tm.assert_frame_equal(result, expected) + + result = data.expanding().apply(func, engine="numba", raw=True, args=(1,)) + tm.assert_frame_equal(result, expected) + + # groupby rolling + result = ( + roll_frame.groupby("A") + .rolling(5) + .apply(func, engine="numba", raw=True, kwargs={"a": 1}) + ) + expected = 
roll_frame.groupby("A").rolling(5).sum() + 1 + tm.assert_frame_equal(result, expected) + + result = ( + roll_frame.groupby("A") + .rolling(5) + .apply(func, engine="numba", raw=True, args=(1,)) + ) + tm.assert_frame_equal(result, expected) + # groupby expanding + + result = ( + roll_frame.groupby("A") + .expanding() + .apply(func, engine="numba", raw=True, kwargs={"a": 1}) + ) + expected = roll_frame.groupby("A").expanding().sum() + 1 + tm.assert_frame_equal(result, expected) + + result = ( + roll_frame.groupby("A") + .expanding() + .apply(func, engine="numba", raw=True, args=(1,)) + ) + tm.assert_frame_equal(result, expected) + + def test_numba_min_periods(self): + # GH 58868 + def last_row(x): + assert len(x) == 3 + return x[-1] + + df = DataFrame([[1, 2], [3, 4], [5, 6], [7, 8]]) + + result = df.rolling(3, method="table", min_periods=3).apply( + last_row, raw=True, engine="numba" + ) + + expected = DataFrame([[np.nan, np.nan], [np.nan, np.nan], [5, 6], [7, 8]]) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "data", [ @@ -304,10 +391,23 @@ def f(x): @td.skip_if_no("numba") def test_invalid_kwargs_nopython(): - with pytest.raises(NumbaUtilError, match="numba does not support kwargs with"): + with pytest.raises(TypeError, match="got an unexpected keyword argument 'a'"): Series(range(1)).rolling(1).apply( lambda x: x, kwargs={"a": 1}, engine="numba", raw=True ) + with pytest.raises( + NumbaUtilError, match="numba does not support keyword-only arguments" + ): + Series(range(1)).rolling(1).apply( + lambda x, *, a: x, kwargs={"a": 1}, engine="numba", raw=True + ) + + tm.assert_series_equal( + Series(range(1), dtype=float) + 1, + Series(range(1)) + .rolling(1) + .apply(lambda x, a: (x + a).sum(), kwargs={"a": 1}, engine="numba", raw=True), + ) @td.skip_if_no("numba") @@ -328,7 +428,6 @@ def f(x): def test_table_method_rolling_methods( self, - axis, nogil, parallel, nopython, @@ -340,16 +439,14 @@ def test_table_method_rolling_methods( engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} df = DataFrame(np.eye(3)) - roll_table = df.rolling(2, method="table", axis=axis, min_periods=0, step=step) + roll_table = df.rolling(2, method="table", min_periods=0, step=step) if method in ("var", "std"): with pytest.raises(NotImplementedError, match=f"{method} not supported"): getattr(roll_table, method)( engine_kwargs=engine_kwargs, engine="numba", **kwargs ) else: - roll_single = df.rolling( - 2, method="single", axis=axis, min_periods=0, step=step - ) + roll_single = df.rolling(2, method="single", min_periods=0, step=step) result = getattr(roll_table, method)( engine_kwargs=engine_kwargs, engine="numba", **kwargs ) @@ -358,21 +455,53 @@ def test_table_method_rolling_methods( ) tm.assert_frame_equal(result, expected) - def test_table_method_rolling_apply(self, axis, nogil, parallel, nopython, step): + def test_table_method_rolling_apply(self, nogil, parallel, nopython, step): engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} def f(x): return np.sum(x, axis=0) + 1 df = DataFrame(np.eye(3)) - result = df.rolling( - 2, method="table", axis=axis, min_periods=0, step=step - ).apply(f, raw=True, engine_kwargs=engine_kwargs, engine="numba") - expected = df.rolling( - 2, method="single", axis=axis, min_periods=0, step=step - ).apply(f, raw=True, engine_kwargs=engine_kwargs, engine="numba") + result = df.rolling(2, method="table", min_periods=0, step=step).apply( + f, raw=True, engine_kwargs=engine_kwargs, engine="numba" + ) + expected = 
df.rolling(2, method="single", min_periods=0, step=step).apply( + f, raw=True, engine_kwargs=engine_kwargs, engine="numba" + ) tm.assert_frame_equal(result, expected) + def test_table_method_rolling_apply_col_order(self): + # GH#59666 + def f(x): + return np.nanmean(x[:, 0] - x[:, 1]) + + df = DataFrame( + { + "a": [1, 2, 3, 4, 5, 6], + "b": [6, 7, 8, 5, 6, 7], + } + ) + result = df.rolling(3, method="table", min_periods=0)[["a", "b"]].apply( + f, raw=True, engine="numba" + ) + expected = DataFrame( + { + "a": [-5, -5, -5, -3.66667, -2.33333, -1], + "b": [-5, -5, -5, -3.66667, -2.33333, -1], + } + ) + tm.assert_almost_equal(result, expected) + result = df.rolling(3, method="table", min_periods=0)[["b", "a"]].apply( + f, raw=True, engine="numba" + ) + expected = DataFrame( + { + "b": [5, 5, 5, 3.66667, 2.33333, 1], + "a": [5, 5, 5, 3.66667, 2.33333, 1], + } + ) + tm.assert_almost_equal(result, expected) + def test_table_method_rolling_weighted_mean(self, step): def weighted_mean(x): arr = np.ones((1, x.shape[1])) @@ -393,37 +522,37 @@ def weighted_mean(x): )[::step] tm.assert_frame_equal(result, expected) - def test_table_method_expanding_apply(self, axis, nogil, parallel, nopython): + def test_table_method_expanding_apply(self, nogil, parallel, nopython): engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} def f(x): return np.sum(x, axis=0) + 1 df = DataFrame(np.eye(3)) - result = df.expanding(method="table", axis=axis).apply( + result = df.expanding(method="table").apply( f, raw=True, engine_kwargs=engine_kwargs, engine="numba" ) - expected = df.expanding(method="single", axis=axis).apply( + expected = df.expanding(method="single").apply( f, raw=True, engine_kwargs=engine_kwargs, engine="numba" ) tm.assert_frame_equal(result, expected) def test_table_method_expanding_methods( - self, axis, nogil, parallel, nopython, arithmetic_numba_supported_operators + self, nogil, parallel, nopython, arithmetic_numba_supported_operators ): method, kwargs = arithmetic_numba_supported_operators engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} df = DataFrame(np.eye(3)) - expand_table = df.expanding(method="table", axis=axis) + expand_table = df.expanding(method="table") if method in ("var", "std"): with pytest.raises(NotImplementedError, match=f"{method} not supported"): getattr(expand_table, method)( engine_kwargs=engine_kwargs, engine="numba", **kwargs ) else: - expand_single = df.expanding(method="single", axis=axis) + expand_single = df.expanding(method="single") result = getattr(expand_table, method)( engine_kwargs=engine_kwargs, engine="numba", **kwargs ) @@ -434,15 +563,86 @@ def test_table_method_expanding_methods( @pytest.mark.parametrize("data", [np.eye(3), np.ones((2, 3)), np.ones((3, 2))]) @pytest.mark.parametrize("method", ["mean", "sum"]) - def test_table_method_ewm(self, data, method, axis, nogil, parallel, nopython): + def test_table_method_ewm(self, data, method, nogil, parallel, nopython): engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} df = DataFrame(data) - result = getattr(df.ewm(com=1, method="table", axis=axis), method)( + result = getattr(df.ewm(com=1, method="table"), method)( engine_kwargs=engine_kwargs, engine="numba" ) - expected = getattr(df.ewm(com=1, method="single", axis=axis), method)( + expected = getattr(df.ewm(com=1, method="single"), method)( engine_kwargs=engine_kwargs, engine="numba" ) tm.assert_frame_equal(result, expected) + + +@td.skip_if_no("numba") +def test_npfunc_no_warnings(): + 
df = DataFrame({"col1": [1, 2, 3, 4, 5]}) + with tm.assert_produces_warning(False): + df.col1.rolling(2).apply(np.prod, raw=True, engine="numba") + + +class PrescribedWindowIndexer(BaseIndexer): + def __init__(self, start, end): + self._start = start + self._end = end + super().__init__() + + def get_window_bounds( + self, num_values=None, min_periods=None, center=None, closed=None, step=None + ): + if num_values is None: + num_values = len(self._start) + start = np.clip(self._start, 0, num_values) + end = np.clip(self._end, 0, num_values) + return start, end + + +@td.skip_if_no("numba") +class TestMinMaxNumba: + @pytest.mark.parametrize( + "is_max, has_nan, exp_list", + [ + (True, False, [3.0, 5.0, 2.0, 5.0, 1.0, 5.0, 6.0, 7.0, 8.0, 9.0]), + (True, True, [3.0, 4.0, 2.0, 4.0, 1.0, 4.0, 6.0, 7.0, 7.0, 9.0]), + (False, False, [3.0, 2.0, 2.0, 1.0, 1.0, 0.0, 0.0, 0.0, 7.0, 0.0]), + (False, True, [3.0, 2.0, 2.0, 1.0, 1.0, 1.0, 6.0, 6.0, 7.0, 1.0]), + ], + ) + def test_minmax(self, is_max, has_nan, exp_list): + nan_idx = [0, 5, 8] + df = DataFrame( + { + "data": [5.0, 4.0, 3.0, 2.0, 1.0, 0.0, 6.0, 7.0, 8.0, 9.0], + "start": [2, 0, 3, 0, 4, 0, 5, 5, 7, 3], + "end": [3, 4, 4, 5, 5, 6, 7, 8, 9, 10], + } + ) + if has_nan: + df.loc[nan_idx, "data"] = np.nan + expected = Series(exp_list, name="data") + r = df.data.rolling( + PrescribedWindowIndexer(df.start.to_numpy(), df.end.to_numpy()) + ) + if is_max: + result = r.max(engine="numba") + else: + result = r.min(engine="numba") + + tm.assert_series_equal(result, expected) + + def test_wrong_order(self): + start = np.array(range(5), dtype=np.int64) + end = start + 1 + end[3] = end[2] + start[3] = start[2] - 1 + + df = DataFrame({"data": start * 1.0, "start": start, "end": end}) + + r = df.data.rolling(PrescribedWindowIndexer(start, end)) + with pytest.raises( + ValueError, match="Start/End ordering requirement is violated at index 3" + ): + r.max(engine="numba") diff --git a/pandas/tests/window/test_online.py b/pandas/tests/window/test_online.py index 14d3a39107bc4..43d55a7992b3c 100644 --- a/pandas/tests/window/test_online.py +++ b/pandas/tests/window/test_online.py @@ -1,15 +1,24 @@ import numpy as np import pytest +from pandas.compat import is_platform_arm + from pandas import ( DataFrame, Series, ) import pandas._testing as tm +from pandas.util.version import Version -pytestmark = pytest.mark.single_cpu +pytestmark = [pytest.mark.single_cpu] -pytest.importorskip("numba") +numba = pytest.importorskip("numba") +pytestmark.append( + pytest.mark.skipif( + Version(numba.__version__) == Version("0.61") and is_platform_arm(), + reason=f"Segfaults on ARM platforms with numba {numba.__version__}", + ) +) @pytest.mark.filterwarnings("ignore") diff --git a/pandas/tests/window/test_pairwise.py b/pandas/tests/window/test_pairwise.py index 3ceb58756bac6..d23c6501ed1d1 100644 --- a/pandas/tests/window/test_pairwise.py +++ b/pandas/tests/window/test_pairwise.py @@ -96,14 +96,14 @@ def test_flex_binary_frame(method, frame): tm.assert_frame_equal(res, exp) tm.assert_frame_equal(res2, exp) - frame2 = frame.copy() frame2 = DataFrame( - np.random.default_rng(2).standard_normal(frame2.shape), - index=frame2.index, - columns=frame2.columns, + np.random.default_rng(2).standard_normal(frame.shape), + index=frame.index, + columns=frame.columns, ) res3 = getattr(frame.rolling(window=10), method)(frame2) + res3.columns = Index(list(res3.columns)) exp = DataFrame( {k: getattr(frame[k].rolling(window=10), method)(frame2[k]) for k in frame} ) @@ -144,26 +144,26 @@ def 
test_corr_sanity(): def test_rolling_cov_diff_length(): # GH 7512 - s1 = Series([1, 2, 3], index=[0, 1, 2]) - s2 = Series([1, 3], index=[0, 2]) + s1 = Series([1, 2, 3], index=range(3)) + s2 = Series([1, 3], index=range(0, 4, 2)) result = s1.rolling(window=3, min_periods=2).cov(s2) expected = Series([None, None, 2.0]) tm.assert_series_equal(result, expected) - s2a = Series([1, None, 3], index=[0, 1, 2]) + s2a = Series([1, None, 3], index=range(3)) result = s1.rolling(window=3, min_periods=2).cov(s2a) tm.assert_series_equal(result, expected) def test_rolling_corr_diff_length(): # GH 7512 - s1 = Series([1, 2, 3], index=[0, 1, 2]) - s2 = Series([1, 3], index=[0, 2]) + s1 = Series([1, 2, 3], index=range(3)) + s2 = Series([1, 3], index=range(0, 4, 2)) result = s1.rolling(window=3, min_periods=2).corr(s2) expected = Series([None, None, 1.0]) tm.assert_series_equal(result, expected) - s2a = Series([1, None, 3], index=[0, 1, 2]) + s2a = Series([1, None, 3], index=range(3)) result = s1.rolling(window=3, min_periods=2).corr(s2a) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index f353a7fa2f0fe..6e8f075d35490 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -10,6 +10,7 @@ IS64, is_platform_arm, is_platform_power, + is_platform_riscv64, ) from pandas import ( @@ -21,8 +22,6 @@ Timestamp, date_range, period_range, - to_datetime, - to_timedelta, ) import pandas._testing as tm from pandas.api.indexers import BaseIndexer @@ -579,7 +578,7 @@ def test_missing_minp_zero_variable(): [np.nan] * 4, index=DatetimeIndex(["2017-01-01", "2017-01-04", "2017-01-06", "2017-01-07"]), ) - result = x.rolling(Timedelta("2d"), min_periods=0).sum() + result = x.rolling(Timedelta("2D"), min_periods=0).sum() expected = Series(0.0, index=x.index) tm.assert_series_equal(result, expected) @@ -594,39 +593,20 @@ def test_multi_index_names(): assert result.index.names == [None, "1", "2"] -def test_rolling_axis_sum(axis_frame): +def test_rolling_axis_sum(): # see gh-23372. 
df = DataFrame(np.ones((10, 20))) - axis = df._get_axis_number(axis_frame) - - if axis == 0: - msg = "The 'axis' keyword in DataFrame.rolling" - expected = DataFrame({i: [np.nan] * 2 + [3.0] * 8 for i in range(20)}) - else: - # axis == 1 - msg = "Support for axis=1 in DataFrame.rolling is deprecated" - expected = DataFrame([[np.nan] * 2 + [3.0] * 18] * 10) - - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.rolling(3, axis=axis_frame).sum() + expected = DataFrame({i: [np.nan] * 2 + [3.0] * 8 for i in range(20)}) + result = df.rolling(3).sum() tm.assert_frame_equal(result, expected) -def test_rolling_axis_count(axis_frame): +def test_rolling_axis_count(): # see gh-26055 df = DataFrame({"x": range(3), "y": range(3)}) - axis = df._get_axis_number(axis_frame) - - if axis in [0, "index"]: - msg = "The 'axis' keyword in DataFrame.rolling" - expected = DataFrame({"x": [1.0, 2.0, 2.0], "y": [1.0, 2.0, 2.0]}) - else: - msg = "Support for axis=1 in DataFrame.rolling is deprecated" - expected = DataFrame({"x": [1.0, 1.0, 1.0], "y": [2.0, 2.0, 2.0]}) - - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.rolling(2, axis=axis_frame, min_periods=0).count() + expected = DataFrame({"x": [1.0, 2.0, 2.0], "y": [1.0, 2.0, 2.0]}) + result = df.rolling(2, min_periods=0).count() tm.assert_frame_equal(result, expected) @@ -639,21 +619,14 @@ def test_readonly_array(): tm.assert_series_equal(result, expected) -def test_rolling_datetime(axis_frame, tz_naive_fixture): +def test_rolling_datetime(tz_naive_fixture): # GH-28192 tz = tz_naive_fixture df = DataFrame( {i: [1] * 2 for i in date_range("2019-8-01", "2019-08-03", freq="D", tz=tz)} ) - if axis_frame in [0, "index"]: - msg = "The 'axis' keyword in DataFrame.rolling" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.T.rolling("2D", axis=axis_frame).sum().T - else: - msg = "Support for axis=1 in DataFrame.rolling" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.rolling("2D", axis=axis_frame).sum() + result = df.T.rolling("2D").sum().T expected = DataFrame( { **{ @@ -669,7 +642,6 @@ def test_rolling_datetime(axis_frame, tz_naive_fixture): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("center", [True, False]) def test_rolling_window_as_string(center): # see gh-22590 date_today = datetime.now() @@ -722,7 +694,7 @@ def test_rolling_count_default_min_periods_with_null_values(frame_or_series): "df,expected,window,min_periods", [ ( - DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}), + {"A": [1, 2, 3], "B": [4, 5, 6]}, [ ({"A": [1], "B": [4]}, [0]), ({"A": [1, 2], "B": [4, 5]}, [0, 1]), @@ -732,7 +704,7 @@ def test_rolling_count_default_min_periods_with_null_values(frame_or_series): None, ), ( - DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}), + {"A": [1, 2, 3], "B": [4, 5, 6]}, [ ({"A": [1], "B": [4]}, [0]), ({"A": [1, 2], "B": [4, 5]}, [0, 1]), @@ -742,7 +714,7 @@ def test_rolling_count_default_min_periods_with_null_values(frame_or_series): 1, ), ( - DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}), + {"A": [1, 2, 3], "B": [4, 5, 6]}, [ ({"A": [1], "B": [4]}, [0]), ({"A": [1, 2], "B": [4, 5]}, [0, 1]), @@ -752,7 +724,7 @@ def test_rolling_count_default_min_periods_with_null_values(frame_or_series): 2, ), ( - DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}), + {"A": [1, 2, 3], "B": [4, 5, 6]}, [ ({"A": [1], "B": [4]}, [0]), ({"A": [2], "B": [5]}, [1]), @@ -762,7 +734,7 @@ def test_rolling_count_default_min_periods_with_null_values(frame_or_series): 1, ), ( - 
DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}), + {"A": [1, 2, 3], "B": [4, 5, 6]}, [ ({"A": [1], "B": [4]}, [0]), ({"A": [2], "B": [5]}, [1]), @@ -771,11 +743,11 @@ def test_rolling_count_default_min_periods_with_null_values(frame_or_series): 1, 0, ), - (DataFrame({"A": [1], "B": [4]}), [], 2, None), - (DataFrame({"A": [1], "B": [4]}), [], 2, 1), - (DataFrame(), [({}, [])], 2, None), + ({"A": [1], "B": [4]}, [], 2, None), + ({"A": [1], "B": [4]}, [], 2, 1), + (None, [({}, [])], 2, None), ( - DataFrame({"A": [1, np.nan, 3], "B": [np.nan, 5, 6]}), + {"A": [1, np.nan, 3], "B": [np.nan, 5, 6]}, [ ({"A": [1.0], "B": [np.nan]}, [0]), ({"A": [1, np.nan], "B": [np.nan, 5]}, [0, 1]), @@ -788,9 +760,10 @@ def test_rolling_count_default_min_periods_with_null_values(frame_or_series): ) def test_iter_rolling_dataframe(df, expected, window, min_periods): # GH 11704 - expected = [DataFrame(values, index=index) for (values, index) in expected] + df = DataFrame(df) + expecteds = [DataFrame(values, index=index) for (values, index) in expected] - for expected, actual in zip(expected, df.rolling(window, min_periods=min_periods)): + for expected, actual in zip(expecteds, df.rolling(window, min_periods=min_periods)): tm.assert_frame_equal(actual, expected) @@ -833,10 +806,10 @@ def test_iter_rolling_on_dataframe(expected, window): } ) - expected = [ + expecteds = [ DataFrame(values, index=df.loc[index, "C"]) for (values, index) in expected ] - for expected, actual in zip(expected, df.rolling(window, on="C")): + for expected, actual in zip(expecteds, df.rolling(window, on="C")): tm.assert_frame_equal(actual, expected) @@ -884,9 +857,11 @@ def test_iter_rolling_on_dataframe_unordered(): ) def test_iter_rolling_series(ser, expected, window, min_periods): # GH 11704 - expected = [Series(values, index=index) for (values, index) in expected] + expecteds = [Series(values, index=index) for (values, index) in expected] - for expected, actual in zip(expected, ser.rolling(window, min_periods=min_periods)): + for expected, actual in zip( + expecteds, ser.rolling(window, min_periods=min_periods) + ): tm.assert_series_equal(actual, expected) @@ -932,11 +907,11 @@ def test_iter_rolling_datetime(expected, expected_index, window): # GH 11704 ser = Series(range(5), index=date_range(start="2020-01-01", periods=5, freq="D")) - expected = [ + expecteds = [ Series(values, index=idx) for (values, idx) in zip(expected, expected_index) ] - for expected, actual in zip(expected, ser.rolling(window)): + for expected, actual in zip(expecteds, ser.rolling(window)): tm.assert_series_equal(actual, expected) @@ -1065,75 +1040,6 @@ def test_rolling_numerical_too_large_numbers(): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize( - ("func", "value"), - [("sum", 2.0), ("max", 1.0), ("min", 1.0), ("mean", 1.0), ("median", 1.0)], -) -def test_rolling_mixed_dtypes_axis_1(func, value): - # GH: 20649 - df = DataFrame(1, index=[1, 2], columns=["a", "b", "c"]) - df["c"] = 1.0 - msg = "Support for axis=1 in DataFrame.rolling is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - roll = df.rolling(window=2, min_periods=1, axis=1) - result = getattr(roll, func)() - expected = DataFrame( - {"a": [1.0, 1.0], "b": [value, value], "c": [value, value]}, - index=[1, 2], - ) - tm.assert_frame_equal(result, expected) - - -def test_rolling_axis_one_with_nan(): - # GH: 35596 - df = DataFrame( - [ - [0, 1, 2, 4, np.nan, np.nan, np.nan], - [0, 1, 2, np.nan, np.nan, np.nan, np.nan], - [0, 2, 2, np.nan, 2, np.nan, 1], - ] - ) - msg 
= "Support for axis=1 in DataFrame.rolling is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.rolling(window=7, min_periods=1, axis="columns").sum() - expected = DataFrame( - [ - [0.0, 1.0, 3.0, 7.0, 7.0, 7.0, 7.0], - [0.0, 1.0, 3.0, 3.0, 3.0, 3.0, 3.0], - [0.0, 2.0, 4.0, 4.0, 6.0, 6.0, 7.0], - ] - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "value", - ["test", to_datetime("2019-12-31"), to_timedelta("1 days 06:05:01.00003")], -) -def test_rolling_axis_1_non_numeric_dtypes(value): - # GH: 20649 - df = DataFrame({"a": [1, 2]}) - df["b"] = value - msg = "Support for axis=1 in DataFrame.rolling is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.rolling(window=2, min_periods=1, axis=1).sum() - expected = DataFrame({"a": [1.0, 2.0]}) - tm.assert_frame_equal(result, expected) - - -def test_rolling_on_df_transposed(): - # GH: 32724 - df = DataFrame({"A": [1, None], "B": [4, 5], "C": [7, 8]}) - expected = DataFrame({"A": [1.0, np.nan], "B": [5.0, 5.0], "C": [11.0, 13.0]}) - msg = "Support for axis=1 in DataFrame.rolling is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.rolling(min_periods=1, window=2, axis=1).sum() - tm.assert_frame_equal(result, expected) - - result = df.T.rolling(min_periods=1, window=2).sum().T - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( ("index", "window"), [ @@ -1176,7 +1082,7 @@ def test_rolling_sem(frame_or_series): @pytest.mark.xfail( - is_platform_arm() or is_platform_power(), + is_platform_arm() or is_platform_power() or is_platform_riscv64(), reason="GH 38921", ) @pytest.mark.parametrize( @@ -1247,7 +1153,7 @@ def test_timeoffset_as_window_parameter_for_corr(unit): index=dti, ) - res = df.rolling(window="3d").corr() + res = df.rolling(window="3D").corr() tm.assert_frame_equal(exp, res) @@ -1420,6 +1326,82 @@ def test_rolling_corr_timedelta_index(index, window): tm.assert_almost_equal(result, expected) +@pytest.mark.parametrize( + "values,method,expected", + [ + ( + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + "first", + [float("nan"), float("nan"), 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], + ), + ( + [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan], + "first", + [float("nan")] * 10, + ), + ( + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + "last", + [float("nan"), float("nan"), 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + ), + ( + [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan], + "last", + [float("nan")] * 10, + ), + ], +) +def test_rolling_first_last(values, method, expected): + # GH#33155 + x = Series(values) + result = getattr(x.rolling(3), method)() + expected = Series(expected) + tm.assert_almost_equal(result, expected) + + x = DataFrame({"A": values}) + result = getattr(x.rolling(3), method)() + expected = DataFrame({"A": expected}) + tm.assert_almost_equal(result, expected) + + +@pytest.mark.parametrize( + "values,method,expected", + [ + ( + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + "first", + [1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], + ), + ( + [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan], + "first", + [1.0, 1.0, 1.0, 3.0, 3.0, 5.0, 5.0, 7.0, 7.0, 9.0], + ), + ( + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + "last", + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + ), + ( + [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan], + "last", + [1.0, 1.0, 3.0, 3.0, 
5.0, 5.0, 7.0, 7.0, 9.0, 9.0], + ), + ], +) +def test_rolling_first_last_no_minp(values, method, expected): + # GH#33155 + x = Series(values) + result = getattr(x.rolling(3, min_periods=0), method)() + expected = Series(expected) + tm.assert_almost_equal(result, expected) + + x = DataFrame({"A": values}) + result = getattr(x.rolling(3, min_periods=0), method)() + expected = DataFrame({"A": expected}) + tm.assert_almost_equal(result, expected) + + def test_groupby_rolling_nan_included(): # GH 35542 data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]} @@ -1474,17 +1456,20 @@ def test_invalid_method(): Series(range(1)).rolling(1, method="foo") -@pytest.mark.parametrize("window", [1, "1d"]) -def test_rolling_descending_date_order_with_offset(window, frame_or_series): +def test_rolling_descending_date_order_with_offset(frame_or_series): # GH#40002 - idx = date_range(start="2020-01-01", end="2020-01-03", freq="1d") - obj = frame_or_series(range(1, 4), index=idx) - result = obj.rolling("1d", closed="left").sum() + msg = "'d' is deprecated and will be removed in a future version." + + with tm.assert_produces_warning(FutureWarning, match=msg): + idx = date_range(start="2020-01-01", end="2020-01-03", freq="1d") + obj = frame_or_series(range(1, 4), index=idx) + result = obj.rolling("1d", closed="left").sum() + expected = frame_or_series([np.nan, 1, 2], index=idx) tm.assert_equal(result, expected) - result = obj.iloc[::-1].rolling("1d", closed="left").sum() - idx = date_range(start="2020-01-03", end="2020-01-01", freq="-1d") + result = obj.iloc[::-1].rolling("1D", closed="left").sum() + idx = date_range(start="2020-01-03", end="2020-01-01", freq="-1D") expected = frame_or_series([np.nan, 3, 2], index=idx) tm.assert_equal(result, expected) @@ -1576,60 +1561,9 @@ def test_rolling_zero_window(): tm.assert_series_equal(result, expected) -def test_rolling_float_dtype(float_numpy_dtype): - # GH#42452 - df = DataFrame({"A": range(5), "B": range(10, 15)}, dtype=float_numpy_dtype) - expected = DataFrame( - {"A": [np.nan] * 5, "B": range(10, 20, 2)}, - dtype=float_numpy_dtype, - ) - msg = "Support for axis=1 in DataFrame.rolling is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.rolling(2, axis=1).sum() - tm.assert_frame_equal(result, expected, check_dtype=False) - - -def test_rolling_numeric_dtypes(): - # GH#41779 - df = DataFrame(np.arange(40).reshape(4, 10), columns=list("abcdefghij")).astype( - { - "a": "float16", - "b": "float32", - "c": "float64", - "d": "int8", - "e": "int16", - "f": "int32", - "g": "uint8", - "h": "uint16", - "i": "uint32", - "j": "uint64", - } - ) - msg = "Support for axis=1 in DataFrame.rolling is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.rolling(window=2, min_periods=1, axis=1).min() - expected = DataFrame( - { - "a": range(0, 40, 10), - "b": range(0, 40, 10), - "c": range(1, 40, 10), - "d": range(2, 40, 10), - "e": range(3, 40, 10), - "f": range(4, 40, 10), - "g": range(5, 40, 10), - "h": range(6, 40, 10), - "i": range(7, 40, 10), - "j": range(8, 40, 10), - }, - dtype="float64", - ) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("window", [1, 3, 10, 20]) @pytest.mark.parametrize("method", ["min", "max", "average"]) @pytest.mark.parametrize("pct", [True, False]) -@pytest.mark.parametrize("ascending", [True, False]) @pytest.mark.parametrize("test_data", ["default", "duplicates", "nans"]) def test_rank(window, method, pct, ascending, test_data): length = 20 
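The test_rolling_first_last hunks above pin down the semantics of the new Rolling.first/Rolling.last aggregations (GH#33155). A minimal sketch of the behaviour they encode, runnable once this branch is installed; the data is lifted straight from the parametrize cases:

    import pandas as pd

    s = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0])

    # Default min_periods equals the window, so the first two results are NaN.
    s.rolling(3).first()   # [nan, nan, 1.0, 2.0, 3.0]
    s.rolling(3).last()    # [nan, nan, 3.0, 4.0, 5.0]

    # With min_periods=0 the partial leading windows are aggregated too.
    s.rolling(3, min_periods=0).first()   # [1.0, 1.0, 1.0, 2.0, 3.0]
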
@@ -1652,6 +1586,43 @@ def test_rank(window, method, pct, ascending, test_data): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("window", [1, 3, 10, 20]) +@pytest.mark.parametrize("test_data", ["default", "duplicates", "nans", "precision"]) +def test_nunique(window, test_data): + length = 20 + if test_data == "default": + ser = Series(data=np.random.default_rng(2).random(length)) + elif test_data == "duplicates": + ser = Series(data=np.random.default_rng(2).choice(3, length)) + elif test_data == "nans": + ser = Series( + data=np.random.default_rng(2).choice( + [1.0, 0.25, 0.75, np.nan, np.inf, -np.inf], length + ) + ) + elif test_data == "precision": + ser = Series( + data=[ + 0.3, + 0.1 * 3, # Not necessarily exactly 0.3 + 0.6, + 0.2 * 3, # Not necessarily exactly 0.6 + 0.9, + 0.3 * 3, # Not necessarily exactly 0.9 + 0.5, + 0.1 * 5, # Not necessarily exactly 0.5 + 0.8, + 0.2 * 4, # Not necessarily exactly 0.8 + ], + dtype=np.float64, + ) + + expected = ser.rolling(window).apply(lambda x: x.nunique()) + result = ser.rolling(window).nunique() + + tm.assert_series_equal(result, expected) + + def test_rolling_quantile_np_percentile(): # #9413: Tests that rolling window's quantile default behavior # is analogous to Numpy's percentile @@ -1694,12 +1665,11 @@ def test_rolling_quantile_interpolation_options(quantile, interpolation, data): if np.isnan(q1): assert np.isnan(q2) + elif not IS64: + # Less precision on 32-bit + assert np.allclose([q1], [q2], rtol=1e-07, atol=0) else: - if not IS64: - # Less precision on 32-bit - assert np.allclose([q1], [q2], rtol=1e-07, atol=0) - else: - assert q1 == q2 + assert q1 == q2 def test_invalid_quantile_value(): @@ -1947,7 +1917,6 @@ def test_numeric_only_corr_cov_series(kernel, use_arg, numeric_only, dtype): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) @pytest.mark.parametrize("tz", [None, "UTC", "Europe/Prague"]) def test_rolling_timedelta_window_non_nanoseconds(unit, tz): # Test Sum, GH#55106 @@ -1977,3 +1946,66 @@ def test_rolling_timedelta_window_non_nanoseconds(unit, tz): df.index = df.index.as_unit("ns") tm.assert_frame_equal(ref_df, df) + + +class PrescribedWindowIndexer(BaseIndexer): + def __init__(self, start, end): + self._start = start + self._end = end + super().__init__() + + def get_window_bounds( + self, num_values=None, min_periods=None, center=None, closed=None, step=None + ): + if num_values is None: + num_values = len(self._start) + start = np.clip(self._start, 0, num_values) + end = np.clip(self._end, 0, num_values) + return start, end + + +class TestMinMax: + @pytest.mark.parametrize( + "is_max, has_nan, exp_list", + [ + (True, False, [3.0, 5.0, 2.0, 5.0, 1.0, 5.0, 6.0, 7.0, 8.0, 9.0]), + (True, True, [3.0, 4.0, 2.0, 4.0, 1.0, 4.0, 6.0, 7.0, 7.0, 9.0]), + (False, False, [3.0, 2.0, 2.0, 1.0, 1.0, 0.0, 0.0, 0.0, 7.0, 0.0]), + (False, True, [3.0, 2.0, 2.0, 1.0, 1.0, 1.0, 6.0, 6.0, 7.0, 1.0]), + ], + ) + def test_minmax(self, is_max, has_nan, exp_list): + nan_idx = [0, 5, 8] + df = DataFrame( + { + "data": [5.0, 4.0, 3.0, 2.0, 1.0, 0.0, 6.0, 7.0, 8.0, 9.0], + "start": [2, 0, 3, 0, 4, 0, 5, 5, 7, 3], + "end": [3, 4, 4, 5, 5, 6, 7, 8, 9, 10], + } + ) + if has_nan: + df.loc[nan_idx, "data"] = np.nan + expected = Series(exp_list, name="data") + r = df.data.rolling( + PrescribedWindowIndexer(df.start.to_numpy(), df.end.to_numpy()) + ) + if is_max: + result = r.max() + else: + result = r.min() + + tm.assert_series_equal(result, expected) + + def test_wrong_order(self): + 
start = np.array(range(5), dtype=np.int64) + end = start + 1 + end[3] = end[2] + start[3] = start[2] - 1 + + df = DataFrame({"data": start * 1.0, "start": start, "end": end}) + + r = df.data.rolling(PrescribedWindowIndexer(start, end)) + with pytest.raises( + ValueError, match="Start/End ordering requirement is violated at index 3" + ): + r.max() diff --git a/pandas/tests/window/test_rolling_functions.py b/pandas/tests/window/test_rolling_functions.py index 5906ff52db098..6820ab7332975 100644 --- a/pandas/tests/window/test_rolling_functions.py +++ b/pandas/tests/window/test_rolling_functions.py @@ -340,6 +340,8 @@ def test_center_reindex_frame(frame, roll_func, kwargs, minp, fill_value): lambda x: x.rolling(window=10, min_periods=5).var(), lambda x: x.rolling(window=10, min_periods=5).skew(), lambda x: x.rolling(window=10, min_periods=5).kurt(), + lambda x: x.rolling(window=10, min_periods=5).first(), + lambda x: x.rolling(window=10, min_periods=5).last(), lambda x: x.rolling(window=10, min_periods=5).quantile(q=0.5), lambda x: x.rolling(window=10, min_periods=5).median(), lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=False), @@ -471,20 +473,19 @@ def test_rolling_median_memory_error(): ).median() -@pytest.mark.parametrize( - "data_type", - [np.dtype(f"f{width}") for width in [4, 8]] - + [np.dtype(f"{sign}{width}") for width in [1, 2, 4, 8] for sign in "ui"], -) -def test_rolling_min_max_numeric_types(data_type): +def test_rolling_min_max_numeric_types(any_real_numpy_dtype): # GH12373 # Just testing that these don't throw exceptions and that # the return type is float64. Other tests will cover quantitative # correctness - result = DataFrame(np.arange(20, dtype=data_type)).rolling(window=5).max() + result = ( + DataFrame(np.arange(20, dtype=any_real_numpy_dtype)).rolling(window=5).max() + ) assert result.dtypes[0] == np.dtype("f8") - result = DataFrame(np.arange(20, dtype=data_type)).rolling(window=5).min() + result = ( + DataFrame(np.arange(20, dtype=any_real_numpy_dtype)).rolling(window=5).min() + ) assert result.dtypes[0] == np.dtype("f8") @@ -502,6 +503,8 @@ def test_rolling_min_max_numeric_types(data_type): lambda x: x.rolling(window=10, min_periods=5).var(), lambda x: x.rolling(window=10, min_periods=5).skew(), lambda x: x.rolling(window=10, min_periods=5).kurt(), + lambda x: x.rolling(window=10, min_periods=5).first(), + lambda x: x.rolling(window=10, min_periods=5).last(), lambda x: x.rolling(window=10, min_periods=5).quantile(0.5), lambda x: x.rolling(window=10, min_periods=5).median(), lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=False), diff --git a/pandas/tests/window/test_rolling_quantile.py b/pandas/tests/window/test_rolling_quantile.py index d5a7010923563..66713f1cfaa8d 100644 --- a/pandas/tests/window/test_rolling_quantile.py +++ b/pandas/tests/window/test_rolling_quantile.py @@ -173,10 +173,3 @@ def test_center_reindex_frame(frame, q): ) frame_rs = frame.rolling(window=25, center=True).quantile(q) tm.assert_frame_equal(frame_xp, frame_rs) - - -def test_keyword_quantile_deprecated(): - # GH #52550 - s = Series([1, 2, 3, 4]) - with tm.assert_produces_warning(FutureWarning): - s.rolling(2).quantile(quantile=0.4) diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index c99fc8a8eb60f..043f369566a5d 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ b/pandas/tests/window/test_timeseries_window.py @@ -1,9 +1,12 @@ import numpy as np import pytest +import pandas.util._test_decorators 
as td + from pandas import ( DataFrame, DatetimeIndex, + Index, MultiIndex, NaT, Series, @@ -98,7 +101,7 @@ def test_on(self, regular): # column is valid df = df.copy() df["C"] = date_range("20130101", periods=len(df)) - df.rolling(window="2d", on="C").sum() + df.rolling(window="2D", on="C").sum() # invalid columns msg = "window must be an integer" @@ -106,7 +109,7 @@ def test_on(self, regular): df.rolling(window="2d", on="B") # ok even though on non-selected - df.rolling(window="2d", on="C").B.sum() + df.rolling(window="2D", on="C").B.sum() def test_monotonic_on(self): # on/index must be monotonic @@ -538,6 +541,42 @@ def test_ragged_max(self, ragged): expected["B"] = [0.0, 1, 2, 3, 4] tm.assert_frame_equal(result, expected) + def test_ragged_first(self, ragged): + df = ragged + + result = df.rolling(window="1s", min_periods=1).first() + expected = df.copy() + expected["B"] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="2s", min_periods=1).first() + expected = df.copy() + expected["B"] = [0.0, 1, 1, 3, 3] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="5s", min_periods=1).first() + expected = df.copy() + expected["B"] = [0.0, 0, 0, 1, 1] + tm.assert_frame_equal(result, expected) + + def test_ragged_last(self, ragged): + df = ragged + + result = df.rolling(window="1s", min_periods=1).last() + expected = df.copy() + expected["B"] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="2s", min_periods=1).last() + expected = df.copy() + expected["B"] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="5s", min_periods=1).last() + expected = df.copy() + expected["B"] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "freq, op, result_data", [ @@ -583,6 +622,8 @@ def test_freqs_ops(self, freq, op, result_data): "skew", "min", "max", + "first", + "last", ], ) def test_all(self, f, regular): @@ -679,21 +720,28 @@ def test_rolling_on_multi_index_level(self): [date_range("20190101", periods=3), range(2)], names=["date", "seq"] ), ) - result = df.rolling("10d", on=df.index.get_level_values("date")).sum() + result = df.rolling("10D", on=df.index.get_level_values("date")).sum() expected = DataFrame( {"column": [0.0, 1.0, 3.0, 6.0, 10.0, 15.0]}, index=df.index ) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("msg, axis", [["column", 1], ["index", 0]]) -def test_nat_axis_error(msg, axis): +def test_nat_axis_error(): idx = [Timestamp("2020"), NaT] - kwargs = {"columns" if axis == 1 else "index": idx} - df = DataFrame(np.eye(2), **kwargs) - warn_msg = "The 'axis' keyword in DataFrame.rolling is deprecated" - if axis == 1: - warn_msg = "Support for axis=1 in DataFrame.rolling is deprecated" - with pytest.raises(ValueError, match=f"{msg} values must not have NaT"): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - df.rolling("D", axis=axis).mean() + df = DataFrame(np.eye(2), index=idx) + with pytest.raises(ValueError, match="index values must not have NaT"): + df.rolling("D").mean() + + +@td.skip_if_no("pyarrow") +def test_arrow_datetime_axis(): + # GH 55849 + expected = Series( + np.arange(5, dtype=np.float64), + index=Index( + date_range("2020-01-01", periods=5), dtype="timestamp[ns][pyarrow]" + ), + ) + result = expected.rolling("1D").sum() + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/window/test_win_type.py b/pandas/tests/window/test_win_type.py index 
5052019ddb726..574dfc34b6d26 100644 --- a/pandas/tests/window/test_win_type.py +++ b/pandas/tests/window/test_win_type.py @@ -445,12 +445,12 @@ def test_cmov_window_regular_linear_range(win_types, step): # GH 8238 pytest.importorskip("scipy") vals = np.array(range(10), dtype=float) - xp = vals.copy() + rs = Series(vals).rolling(5, win_type=win_types, center=True, step=step).mean() + xp = vals xp[:2] = np.nan xp[-2:] = np.nan xp = Series(xp)[::step] - rs = Series(vals).rolling(5, win_type=win_types, center=True, step=step).mean() tm.assert_series_equal(xp, rs) @@ -648,16 +648,15 @@ def test_cmov_window_special_linear_range(win_types_special, step): } vals = np.array(range(10), dtype=float) - xp = vals.copy() - xp[:2] = np.nan - xp[-2:] = np.nan - xp = Series(xp)[::step] - rs = ( Series(vals) .rolling(5, win_type=win_types_special, center=True, step=step) .mean(**kwds[win_types_special]) ) + xp = vals + xp[:2] = np.nan + xp[-2:] = np.nan + xp = Series(xp)[::step] tm.assert_series_equal(xp, rs) @@ -669,20 +668,3 @@ def test_weighted_var_big_window_no_segfault(win_types, center): expected = Series(np.nan) tm.assert_series_equal(result, expected) - - -def test_rolling_center_axis_1(): - pytest.importorskip("scipy") - df = DataFrame( - {"a": [1, 1, 0, 0, 0, 1], "b": [1, 0, 0, 1, 0, 0], "c": [1, 0, 0, 1, 0, 1]} - ) - - msg = "Support for axis=1 in DataFrame.rolling is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.rolling(window=3, axis=1, win_type="boxcar", center=True).sum() - - expected = DataFrame( - {"a": [np.nan] * 6, "b": [3.0, 1.0, 0.0, 2.0, 0.0, 2.0], "c": [np.nan] * 6} - ) - - tm.assert_frame_equal(result, expected, check_dtype=True) diff --git a/pandas/tseries/__init__.py b/pandas/tseries/__init__.py index e361726dc6f80..c00843ecac418 100644 --- a/pandas/tseries/__init__.py +++ b/pandas/tseries/__init__.py @@ -1,4 +1,4 @@ -# ruff: noqa: TCH004 +# ruff: noqa: TC004 from typing import TYPE_CHECKING if TYPE_CHECKING: diff --git a/pandas/tseries/api.py b/pandas/tseries/api.py index ec2d7d2304839..5ea899f1610a7 100644 --- a/pandas/tseries/api.py +++ b/pandas/tseries/api.py @@ -7,4 +7,4 @@ from pandas.tseries import offsets from pandas.tseries.frequencies import infer_freq -__all__ = ["infer_freq", "offsets", "guess_datetime_format"] +__all__ = ["guess_datetime_format", "infer_freq", "offsets"] diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 4a1a668426b36..88ea1bfa3c6ed 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -19,10 +19,7 @@ MONTHS, int_to_weekday, ) -from pandas._libs.tslibs.dtypes import ( - OFFSET_TO_PERIOD_FREQSTR, - freq_to_period_freqstr, -) +from pandas._libs.tslibs.dtypes import OFFSET_TO_PERIOD_FREQSTR from pandas._libs.tslibs.fields import ( build_field_sarray, month_position_check, @@ -92,6 +89,11 @@ def infer_freq( """ Infer the most likely frequency given the input index. + This method attempts to deduce the most probable frequency (e.g., 'D' for daily, + 'H' for hourly) from a sequence of datetime-like objects. It is particularly useful + when the frequency of a time series is not explicitly set or known but can be + inferred from its values. + Parameters ---------- index : DatetimeIndex, TimedeltaIndex, Series or array-like @@ -109,9 +111,16 @@ def infer_freq( ValueError If there are fewer than three values. + See Also + -------- + date_range : Return a fixed frequency DatetimeIndex. 
+ timedelta_range : Return a fixed frequency TimedeltaIndex with day as the default. + period_range : Return a fixed frequency PeriodIndex. + DatetimeIndex.freq : Return the frequency object if it is set, otherwise None. + Examples -------- - >>> idx = pd.date_range(start='2020/12/01', end='2020/12/30', periods=30) + >>> idx = pd.date_range(start="2020/12/01", end="2020/12/30", periods=30) >>> pd.infer_freq(idx) 'D' """ @@ -136,8 +145,7 @@ def infer_freq( pass elif isinstance(index.dtype, PeriodDtype): raise TypeError( - "PeriodIndex given. Check the `freq` attribute " - "instead of using infer_freq." + "PeriodIndex given. Check the `freq` attribute instead of using infer_freq." ) elif lib.is_np_dtype(index.dtype, "m"): # Allow TimedeltaIndex and TimedeltaArray @@ -559,7 +567,7 @@ def _maybe_coerce_freq(code) -> str: """ assert code is not None if isinstance(code, DateOffset): - code = freq_to_period_freqstr(1, code.name) + code = PeriodDtype(to_offset(code.name))._freqstr if code in {"h", "min", "s", "ms", "us", "ns"}: return code else: diff --git a/pandas/tseries/holiday.py b/pandas/tseries/holiday.py index 3c429a960b451..2d195fbbc4e84 100644 --- a/pandas/tseries/holiday.py +++ b/pandas/tseries/holiday.py @@ -4,6 +4,7 @@ datetime, timedelta, ) +from typing import TYPE_CHECKING import warnings from dateutil.relativedelta import ( @@ -17,6 +18,7 @@ ) import numpy as np +from pandas._libs.tslibs.offsets import BaseOffset from pandas.errors import PerformanceWarning from pandas import ( @@ -33,6 +35,9 @@ Easter, ) +if TYPE_CHECKING: + from collections.abc import Callable + def next_monday(dt: datetime) -> datetime: """ @@ -108,7 +113,7 @@ def nearest_workday(dt: datetime) -> datetime: def next_workday(dt: datetime) -> datetime: """ - returns next weekday used for observances + returns next workday used for observances """ dt += timedelta(days=1) while dt.weekday() > 4: @@ -119,7 +124,7 @@ def next_workday(dt: datetime) -> datetime: def previous_workday(dt: datetime) -> datetime: """ - returns previous weekday used for observances + returns previous workday used for observances """ dt -= timedelta(days=1) while dt.weekday() > 4: @@ -130,7 +135,7 @@ def previous_workday(dt: datetime) -> datetime: def before_nearest_workday(dt: datetime) -> datetime: """ - returns previous workday after nearest workday + returns previous workday before nearest workday """ return previous_workday(nearest_workday(dt)) @@ -159,24 +164,34 @@ def __init__( year=None, month=None, day=None, - offset=None, - observance=None, + offset: BaseOffset | list[BaseOffset] | None = None, + observance: Callable | None = None, start_date=None, end_date=None, - days_of_week=None, + days_of_week: tuple | None = None, ) -> None: """ Parameters ---------- name : str Name of the holiday , defaults to class name - offset : array of pandas.tseries.offsets or - class from pandas.tseries.offsets - computes offset from date - observance: function - computes when holiday is given a pandas Timestamp - days_of_week: - provide a tuple of days e.g (0,1,2,3,) for Monday Through Thursday + year : int, default None + Year of the holiday + month : int, default None + Month of the holiday + day : int, default None + Day of the holiday + offset : list of pandas.tseries.offsets or + class from pandas.tseries.offsets, default None + Computes offset from date + observance : function, default None + Computes when holiday is given a pandas Timestamp + start_date : datetime-like, default None + First date the holiday is observed + end_date : datetime-like, 
default None + Last date the holiday is observed + days_of_week : tuple of int or dateutil.relativedelta weekday strs, default None + Provide a tuple of days e.g (0,1,2,3,) for Monday Through Thursday Monday=0,..,Sunday=6 Examples @@ -200,8 +215,10 @@ class from pandas.tseries.offsets Holiday: July 3rd (month=7, day=3, ) >>> NewYears = pd.tseries.holiday.Holiday( - ... "New Years Day", month=1, day=1, - ... observance=pd.tseries.holiday.nearest_workday + ... "New Years Day", + ... month=1, + ... day=1, + ... observance=pd.tseries.holiday.nearest_workday, ... ) >>> NewYears # doctest: +SKIP Holiday: New Years Day ( @@ -209,14 +226,24 @@ class from pandas.tseries.offsets ) >>> July3rd = pd.tseries.holiday.Holiday( - ... "July 3rd", month=7, day=3, - ... days_of_week=(0, 1, 2, 3) + ... "July 3rd", month=7, day=3, days_of_week=(0, 1, 2, 3) ... ) >>> July3rd Holiday: July 3rd (month=7, day=3, ) """ - if offset is not None and observance is not None: - raise NotImplementedError("Cannot use both offset and observance.") + if offset is not None: + if observance is not None: + raise NotImplementedError("Cannot use both offset and observance.") + if not ( + isinstance(offset, BaseOffset) + or ( + isinstance(offset, list) + and all(isinstance(off, BaseOffset) for off in offset) + ) + ): + raise ValueError( + "Only BaseOffsets and flat lists of them are supported for offset." + ) self.name = name self.year = year @@ -384,7 +411,7 @@ def register(cls) -> None: holiday_calendars[name] = cls -def get_calendar(name: str): +def get_calendar(name: str) -> AbstractHolidayCalendar: """ Return an instance of a calendar based on its name. @@ -433,7 +460,7 @@ def __init__(self, name: str = "", rules=None) -> None: if rules is not None: self.rules = rules - def rule_from_name(self, name: str): + def rule_from_name(self, name: str) -> Holiday | None: for rule in self.rules: if rule.name == name: return rule @@ -484,9 +511,7 @@ def holidays(self, start=None, end=None, return_name: bool = False): else: # error: Incompatible types in assignment (expression has type # "Series", variable has type "DataFrame") - holidays = Series( - index=DatetimeIndex([]), dtype=object - ) # type: ignore[assignment] + holidays = Series(index=DatetimeIndex([]), dtype=object) # type: ignore[assignment] self._cache = (start, end, holidays.sort_index()) @@ -611,12 +636,17 @@ def HolidayCalendarFactory(name: str, base, other, base_class=AbstractHolidayCal __all__ = [ + "FR", + "MO", + "SA", + "SU", + "TH", + "TU", + "WE", + "HolidayCalendarFactory", "after_nearest_workday", "before_nearest_workday", - "FR", "get_calendar", - "HolidayCalendarFactory", - "MO", "nearest_workday", "next_monday", "next_monday_or_tuesday", @@ -624,11 +654,6 @@ def HolidayCalendarFactory(name: str, base, other, base_class=AbstractHolidayCal "previous_friday", "previous_workday", "register", - "SA", - "SU", "sunday_to_monday", - "TH", - "TU", - "WE", "weekend_to_monday", ] diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 169c9cc18a7fd..1f0c4281ffc77 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -4,6 +4,8 @@ FY5253, BaseOffset, BDay, + BHalfYearBegin, + BHalfYearEnd, BMonthBegin, BMonthEnd, BQuarterBegin, @@ -25,6 +27,8 @@ Day, Easter, FY5253Quarter, + HalfYearBegin, + HalfYearEnd, Hour, LastWeekOfMonth, Micro, @@ -46,46 +50,50 @@ ) __all__ = [ - "Day", + "FY5253", + "BDay", + "BHalfYearBegin", + "BHalfYearEnd", + "BMonthBegin", + "BMonthEnd", + "BQuarterBegin", + "BQuarterEnd", + "BYearBegin", + "BYearEnd", 
"BaseOffset", "BusinessDay", + "BusinessHour", "BusinessMonthBegin", "BusinessMonthEnd", - "BDay", + "CBMonthBegin", + "CBMonthEnd", + "CDay", "CustomBusinessDay", + "CustomBusinessHour", "CustomBusinessMonthBegin", "CustomBusinessMonthEnd", - "CDay", - "CBMonthEnd", - "CBMonthBegin", + "DateOffset", + "Day", + "Easter", + "FY5253Quarter", + "HalfYearBegin", + "HalfYearEnd", + "Hour", + "LastWeekOfMonth", + "Micro", + "Milli", + "Minute", "MonthBegin", - "BMonthBegin", "MonthEnd", - "BMonthEnd", - "SemiMonthEnd", - "SemiMonthBegin", - "BusinessHour", - "CustomBusinessHour", - "YearBegin", - "BYearBegin", - "YearEnd", - "BYearEnd", + "Nano", "QuarterBegin", - "BQuarterBegin", "QuarterEnd", - "BQuarterEnd", - "LastWeekOfMonth", - "FY5253Quarter", - "FY5253", + "Second", + "SemiMonthBegin", + "SemiMonthEnd", + "Tick", "Week", "WeekOfMonth", - "Easter", - "Tick", - "Hour", - "Minute", - "Second", - "Milli", - "Micro", - "Nano", - "DateOffset", + "YearBegin", + "YearEnd", ] diff --git a/pandas/util/__init__.py b/pandas/util/__init__.py index 82b3aa56c653c..dfab207635fec 100644 --- a/pandas/util/__init__.py +++ b/pandas/util/__init__.py @@ -25,5 +25,5 @@ def __getattr__(key: str): raise AttributeError(f"module 'pandas.util' has no attribute '{key}'") -def capitalize_first_letter(s): - return s[:1].upper() + s[1:] +def __dir__() -> list[str]: + return list(globals().keys()) + ["hash_array", "hash_pandas_object"] diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py index 4e8189e72c427..a1a0d51a7c72b 100644 --- a/pandas/util/_decorators.py +++ b/pandas/util/_decorators.py @@ -6,7 +6,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, cast, ) import warnings @@ -19,7 +18,10 @@ from pandas.util._exceptions import find_stack_level if TYPE_CHECKING: - from collections.abc import Mapping + from collections.abc import ( + Callable, + Mapping, + ) def deprecate( @@ -81,7 +83,7 @@ def wrapper(*args, **kwargs) -> Callable[..., Any]: if alternative.__doc__.count("\n") < 3: raise AssertionError(doc_error_msg) empty1, summary, empty2, doc_string = alternative.__doc__.split("\n", 3) - if empty1 or empty2 and not summary: + if empty1 or (empty2 and not summary): raise AssertionError(doc_error_msg) wrapper.__doc__ = dedent( f""" @@ -122,44 +124,41 @@ def deprecate_kwarg( -------- The following deprecates 'cols', using 'columns' instead - >>> @deprecate_kwarg(old_arg_name='cols', new_arg_name='columns') - ... def f(columns=''): + >>> @deprecate_kwarg(old_arg_name="cols", new_arg_name="columns") + ... def f(columns=""): ... print(columns) - ... - >>> f(columns='should work ok') + >>> f(columns="should work ok") should work ok - >>> f(cols='should raise warning') # doctest: +SKIP + >>> f(cols="should raise warning") # doctest: +SKIP FutureWarning: cols is deprecated, use columns instead warnings.warn(msg, FutureWarning) should raise warning - >>> f(cols='should error', columns="can\'t pass do both") # doctest: +SKIP + >>> f(cols="should error", columns="can't pass do both") # doctest: +SKIP TypeError: Can only specify 'cols' or 'columns', not both - >>> @deprecate_kwarg('old', 'new', {'yes': True, 'no': False}) + >>> @deprecate_kwarg("old", "new", {"yes": True, "no": False}) ... def f(new=False): - ... print('yes!' if new else 'no!') - ... - >>> f(old='yes') # doctest: +SKIP + ... print("yes!" if new else "no!") + >>> f(old="yes") # doctest: +SKIP FutureWarning: old='yes' is deprecated, use new=True instead warnings.warn(msg, FutureWarning) yes! 
To raise a warning that a keyword will be removed entirely in the future - >>> @deprecate_kwarg(old_arg_name='cols', new_arg_name=None) - ... def f(cols='', another_param=''): + >>> @deprecate_kwarg(old_arg_name="cols", new_arg_name=None) + ... def f(cols="", another_param=""): ... print(cols) - ... - >>> f(cols='should raise warning') # doctest: +SKIP + >>> f(cols="should raise warning") # doctest: +SKIP FutureWarning: the 'cols' keyword is deprecated and will be removed in a future version please takes steps to stop use of 'cols' should raise warning - >>> f(another_param='should not raise warning') # doctest: +SKIP + >>> f(another_param="should not raise warning") # doctest: +SKIP should not raise warning - >>> f(cols='should raise warning', another_param='') # doctest: +SKIP + >>> f(cols="should raise warning", another_param="") # doctest: +SKIP FutureWarning: the 'cols' keyword is deprecated and will be removed in a future version please takes steps to stop use of 'cols' should raise warning @@ -177,9 +176,9 @@ def wrapper(*args, **kwargs) -> Callable[..., Any]: if old_arg_value is not None: if new_arg_name is None: msg = ( - f"the {repr(old_arg_name)} keyword is deprecated and " + f"the {old_arg_name!r} keyword is deprecated and " "will be removed in a future version. Please take " - f"steps to stop the use of {repr(old_arg_name)}" + f"steps to stop the use of {old_arg_name!r}" ) warnings.warn(msg, FutureWarning, stacklevel=stacklevel) kwargs[old_arg_name] = old_arg_value @@ -191,22 +190,22 @@ def wrapper(*args, **kwargs) -> Callable[..., Any]: else: new_arg_value = mapping.get(old_arg_value, old_arg_value) msg = ( - f"the {old_arg_name}={repr(old_arg_value)} keyword is " + f"the {old_arg_name}={old_arg_value!r} keyword is " "deprecated, use " - f"{new_arg_name}={repr(new_arg_value)} instead." + f"{new_arg_name}={new_arg_value!r} instead." ) else: new_arg_value = old_arg_value msg = ( - f"the {repr(old_arg_name)} keyword is deprecated, " - f"use {repr(new_arg_name)} instead." + f"the {old_arg_name!r} keyword is deprecated, " + f"use {new_arg_name!r} instead." ) warnings.warn(msg, FutureWarning, stacklevel=stacklevel) if kwargs.get(new_arg_name) is not None: msg = ( - f"Can only specify {repr(old_arg_name)} " - f"or {repr(new_arg_name)}, not both." + f"Can only specify {old_arg_name!r} " + f"or {new_arg_name!r}, not both." ) raise TypeError(msg) kwargs[new_arg_name] = new_arg_value @@ -340,7 +339,7 @@ def wrapper(*args, **kwargs): return decorate -def doc(*docstrings: None | str | Callable, **params) -> Callable[[F], F]: +def doc(*docstrings: None | str | Callable, **params: object) -> Callable[[F], F]: """ A decorator to take docstring templates, concatenate them and perform string substitution on them. @@ -371,7 +370,7 @@ def decorator(decorated: F) -> F: continue if hasattr(docstring, "_docstring_components"): docstring_components.extend( - docstring._docstring_components # pyright: ignore[reportGeneralTypeIssues] + docstring._docstring_components # pyright: ignore[reportAttributeAccessIssue] ) elif isinstance(docstring, str) or docstring.__doc__: docstring_components.append(docstring) @@ -498,11 +497,32 @@ def indent(text: str | None, indents: int = 1) -> str: __all__ = [ "Appender", + "Substitution", "cache_readonly", "deprecate", "deprecate_kwarg", "deprecate_nonkeyword_arguments", "doc", "future_version_msg", - "Substitution", ] + + +def set_module(module) -> Callable[[F], F]: + """Private decorator for overriding __module__ on a function or class. 
+ + Example usage:: + + @set_module("pandas") + def example(): + pass + + + assert example.__module__ == "pandas" + """ + + def decorator(func: F) -> F: + if module is not None: + func.__module__ = module + return func + + return decorator diff --git a/pandas/util/_doctools.py b/pandas/util/_doctools.py index 12619abf4baaf..828b7a4591bf2 100644 --- a/pandas/util/_doctools.py +++ b/pandas/util/_doctools.py @@ -9,6 +9,8 @@ if TYPE_CHECKING: from collections.abc import Iterable + from matplotlib.figure import Figure + class TablePlotter: """ @@ -46,7 +48,9 @@ def _get_cells(self, left, right, vertical) -> tuple[int, int]: hcells = sum([self._shape(df)[1] for df in left] + [self._shape(right)[1]]) return hcells, vcells - def plot(self, left, right, labels: Iterable[str] = (), vertical: bool = True): + def plot( + self, left, right, labels: Iterable[str] = (), vertical: bool = True + ) -> Figure: """ Plot left / right DataFrames in specified layout. diff --git a/pandas/util/_exceptions.py b/pandas/util/_exceptions.py index 573f76a63459b..b3c8e54d3ca7f 100644 --- a/pandas/util/_exceptions.py +++ b/pandas/util/_exceptions.py @@ -4,15 +4,19 @@ import inspect import os import re -from typing import TYPE_CHECKING +from typing import ( + TYPE_CHECKING, + Any, +) import warnings if TYPE_CHECKING: from collections.abc import Generator + from types import FrameType @contextlib.contextmanager -def rewrite_exception(old_name: str, new_name: str) -> Generator[None, None, None]: +def rewrite_exception(old_name: str, new_name: str) -> Generator[None]: """ Rewrite the message of an exception. """ @@ -23,7 +27,7 @@ def rewrite_exception(old_name: str, new_name: str) -> Generator[None, None, Non raise msg = str(err.args[0]) msg = msg.replace(old_name, new_name) - args: tuple[str, ...] = (msg,) + args: tuple[Any, ...] = (msg,) if len(err.args) > 1: args = args + err.args[1:] err.args = args @@ -42,15 +46,20 @@ def find_stack_level() -> int: test_dir = os.path.join(pkg_dir, "tests") # https://stackoverflow.com/questions/17407119/python-inspect-stack-is-slow - frame = inspect.currentframe() - n = 0 - while frame: - fname = inspect.getfile(frame) - if fname.startswith(pkg_dir) and not fname.startswith(test_dir): - frame = frame.f_back - n += 1 - else: - break + frame: FrameType | None = inspect.currentframe() + try: + n = 0 + while frame: + filename = inspect.getfile(frame) + if filename.startswith(pkg_dir) and not filename.startswith(test_dir): + frame = frame.f_back + n += 1 + else: + break + finally: + # See note in + # https://docs.python.org/3/library/inspect.html#inspect.Traceback + del frame return n @@ -60,7 +69,7 @@ def rewrite_warning( target_category: type[Warning], new_message: str, new_category: type[Warning] | None = None, -) -> Generator[None, None, None]: +) -> Generator[None]: """ Rewrite the message of a warning. 
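The try/finally added around the frame walk in find_stack_level follows the note referenced in the new comment (the inspect docs): frame objects sit in reference cycles with their locals, so dropping the local reference promptly keeps the whole call stack from being held alive until the cycle collector runs. A standalone sketch of the same pattern (callers_filename is an illustrative name, not a pandas helper):

    import inspect
    from types import FrameType

    def callers_filename() -> str:
        frame: FrameType | None = inspect.currentframe()
        try:
            caller = frame.f_back if frame is not None else None
            return inspect.getfile(caller) if caller is not None else "<unknown>"
        finally:
            # Break the frame -> f_locals -> frame reference cycle explicitly.
            del frame
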
diff --git a/pandas/util/_print_versions.py b/pandas/util/_print_versions.py index e39c2f7badb1d..bd20660bdbba6 100644 --- a/pandas/util/_print_versions.py +++ b/pandas/util/_print_versions.py @@ -33,7 +33,7 @@ def _get_commit_hash() -> str | None: except ImportError: from pandas._version import get_versions - versions = get_versions() + versions = get_versions() # type: ignore[no-untyped-call] return versions["full-revisionid"] @@ -45,7 +45,7 @@ def _get_sys_info() -> dict[str, JSONSerializable]: language_code, encoding = locale.getlocale() return { "commit": _get_commit_hash(), - "python": ".".join([str(i) for i in sys.version_info]), + "python": platform.python_version(), "python-bits": struct.calcsize("P") * 8, "OS": uname_result.system, "OS-release": uname_result.release, @@ -67,36 +67,27 @@ def _get_dependency_info() -> dict[str, JSONSerializable]: "pandas", # required "numpy", - "pytz", "dateutil", # install / build, - "setuptools", "pip", "Cython", - # test - "pytest", - "hypothesis", # docs "sphinx", - # Other, need a min version - "blosc", - "feather", - "xlsxwriter", - "lxml.etree", - "html5lib", - "pymysql", - "psycopg2", - "jinja2", # Other, not imported. "IPython", - "pandas_datareader", ] + # Optional dependencies deps.extend(list(VERSIONS)) result: dict[str, JSONSerializable] = {} for modname in deps: - mod = import_optional_dependency(modname, errors="ignore") - result[modname] = get_version(mod) if mod else None + try: + mod = import_optional_dependency(modname, errors="ignore") + except Exception: + # Dependency conflicts may cause a non ImportError + result[modname] = "N/A" + else: + result[modname] = get_version(mod) if mod else None return result @@ -115,6 +106,11 @@ def show_versions(as_json: str | bool = False) -> None: Info will be written to that file in JSON format. * If True, outputs info in JSON format to the console. + See Also + -------- + get_option : Retrieve the value of the specified option. + set_option : Set the value of the specified option or options. + Examples -------- >>> pd.show_versions() # doctest: +SKIP diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 2c1912bce856d..1c17587db72d4 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -23,25 +23,21 @@ def test_foo(): For more information, refer to the ``pytest`` documentation on ``skipif``. 
""" + from __future__ import annotations import locale -from typing import ( - TYPE_CHECKING, - Callable, -) +from typing import TYPE_CHECKING import pytest -from pandas._config import get_option - if TYPE_CHECKING: + from collections.abc import Callable from pandas._typing import F -from pandas._config.config import _get_option - from pandas.compat import ( IS64, + WASM, is_platform_windows, ) from pandas.compat._optional import import_optional_dependency @@ -118,6 +114,10 @@ def skip_if_no(package: str, min_version: str | None = None) -> pytest.MarkDecor locale.getlocale()[0] != "en_US", reason=f"Set local {locale.getlocale()[0]} is not en_US", ) +skip_if_wasm = pytest.mark.skipif( + WASM, + reason="does not support wasm", +) def parametrize_fixture_doc(*args) -> Callable[[F], F]: @@ -145,29 +145,3 @@ def documented_fixture(fixture): return fixture return documented_fixture - - -def mark_array_manager_not_yet_implemented(request) -> None: - mark = pytest.mark.xfail(reason="Not yet implemented for ArrayManager") - request.applymarker(mark) - - -skip_array_manager_not_yet_implemented = pytest.mark.xfail( - _get_option("mode.data_manager", silent=True) == "array", - reason="Not yet implemented for ArrayManager", -) - -skip_array_manager_invalid_test = pytest.mark.skipif( - _get_option("mode.data_manager", silent=True) == "array", - reason="Test that relies on BlockManager internals or specific behaviour", -) - -skip_copy_on_write_not_yet_implemented = pytest.mark.xfail( - get_option("mode.copy_on_write") is True, - reason="Not yet implemented/adapted for Copy-on-Write mode", -) - -skip_copy_on_write_invalid_test = pytest.mark.skipif( - get_option("mode.copy_on_write") is True, - reason="Test not valid for Copy-on-Write mode", -) diff --git a/pandas/util/_tester.py b/pandas/util/_tester.py index 7cfddef7ddff8..c0e9756372f47 100644 --- a/pandas/util/_tester.py +++ b/pandas/util/_tester.py @@ -1,6 +1,7 @@ """ Entrypoint for testing from the top-level namespace. """ + from __future__ import annotations import os @@ -26,6 +27,10 @@ def test(extra_args: list[str] | None = None, run_doctests: bool = False) -> Non both doctests/regular tests, just append "--doctest-modules"/"--doctest-cython" to extra_args. + See Also + -------- + pytest.main : The main entry point for pytest testing framework. + Examples -------- >>> pd.test() # doctest: +SKIP diff --git a/pandas/util/_validators.py b/pandas/util/_validators.py index a47f622216ef7..9aab19fe340ec 100644 --- a/pandas/util/_validators.py +++ b/pandas/util/_validators.py @@ -2,6 +2,7 @@ Module that contains many useful utilities for validating data or function arguments """ + from __future__ import annotations from collections.abc import ( @@ -26,7 +27,7 @@ BoolishNoneT = TypeVar("BoolishNoneT", bool, int, None) -def _check_arg_length(fname, args, max_fname_arg_count, compat_args): +def _check_arg_length(fname, args, max_fname_arg_count, compat_args) -> None: """ Checks whether 'args' has length of at most 'compat_args'. Raises a TypeError if that is not the case, similar to in Python when a @@ -46,7 +47,7 @@ def _check_arg_length(fname, args, max_fname_arg_count, compat_args): ) -def _check_for_default_values(fname, arg_val_dict, compat_args): +def _check_for_default_values(fname, arg_val_dict, compat_args) -> None: """ Check that the keys in `arg_val_dict` are mapped to their default values as specified in `compat_args`. 
@@ -125,7 +126,7 @@ def validate_args(fname, args, max_fname_arg_count, compat_args) -> None: _check_for_default_values(fname, kwargs, compat_args) -def _check_for_invalid_keys(fname, kwargs, compat_args): +def _check_for_invalid_keys(fname, kwargs, compat_args) -> None: """ Checks whether 'kwargs' contains any keys that are not in 'compat_args' and raises a TypeError if there is one. @@ -265,7 +266,7 @@ def validate_bool_kwarg( f'For argument "{arg_name}" expected type bool, received ' f"type {type(value).__name__}." ) - return value # pyright: ignore[reportGeneralTypeIssues] + return value def validate_fillna_kwargs(value, method, validate_scalar_dict_value: bool = True): @@ -335,20 +336,17 @@ def validate_percentile(q: float | Iterable[float]) -> np.ndarray: if q_arr.ndim == 0: if not 0 <= q_arr <= 1: raise ValueError(msg) - else: - if not all(0 <= qs <= 1 for qs in q_arr): - raise ValueError(msg) + elif not all(0 <= qs <= 1 for qs in q_arr): + raise ValueError(msg) return q_arr @overload -def validate_ascending(ascending: BoolishT) -> BoolishT: - ... +def validate_ascending(ascending: BoolishT) -> BoolishT: ... @overload -def validate_ascending(ascending: Sequence[BoolishT]) -> list[BoolishT]: - ... +def validate_ascending(ascending: Sequence[BoolishT]) -> list[BoolishT]: ... def validate_ascending( @@ -444,7 +442,7 @@ def validate_insert_loc(loc: int, length: int) -> int: loc += length if not 0 <= loc <= length: raise IndexError(f"loc must be an integer between -{length} and {length}") - return loc # pyright: ignore[reportGeneralTypeIssues] + return loc # pyright: ignore[reportReturnType] def check_dtype_backend(dtype_backend) -> None: diff --git a/pandas/util/version/__init__.py b/pandas/util/version/__init__.py index 3a5efbbb09c1e..bd741140f6542 100644 --- a/pandas/util/version/__init__.py +++ b/pandas/util/version/__init__.py @@ -1,25 +1,22 @@ -# Vendored from https://github.com/pypa/packaging/blob/main/packaging/_structures.py -# and https://github.com/pypa/packaging/blob/main/packaging/_structures.py -# changeset ae891fd74d6dd4c6063bb04f2faeadaac6fc6313 -# 04/30/2021 +# Vendored from https://github.com/pypa/packaging/blob/main/src/packaging/_structures.py +# and https://github.com/pypa/packaging/blob/main/src/packaging/version.py +# changeset 24e5350b2ff3c5c7a36676c2af5f2cb39fd1baf8 # This file is dual licensed under the terms of the Apache License, Version # 2.0, and the BSD License. 
Licence at LICENSES/PACKAGING_LICENSE from __future__ import annotations -import collections -from collections.abc import Iterator +from collections.abc import Callable import itertools import re from typing import ( - Callable, + Any, + NamedTuple, SupportsInt, - Tuple, Union, ) -import warnings -__all__ = ["parse", "Version", "LegacyVersion", "InvalidVersion", "VERSION_PATTERN"] +__all__ = ["VERSION_PATTERN", "InvalidVersion", "Version", "parse"] class InfinityType: @@ -38,9 +35,6 @@ def __le__(self, other: object) -> bool: def __eq__(self, other: object) -> bool: return isinstance(other, type(self)) - def __ne__(self, other: object) -> bool: - return not isinstance(other, type(self)) - def __gt__(self, other: object) -> bool: return True @@ -70,9 +64,6 @@ def __le__(self, other: object) -> bool: def __eq__(self, other: object) -> bool: return isinstance(other, type(self)) - def __ne__(self, other: object) -> bool: - return not isinstance(other, type(self)) - def __gt__(self, other: object) -> bool: return False @@ -86,59 +77,61 @@ def __neg__(self: object) -> InfinityType: NegativeInfinity = NegativeInfinityType() -InfiniteTypes = Union[InfinityType, NegativeInfinityType] -PrePostDevType = Union[InfiniteTypes, tuple[str, int]] -SubLocalType = Union[InfiniteTypes, int, str] -LocalType = Union[ +LocalType = tuple[Union[int, str], ...] + +CmpPrePostDevType = Union[InfinityType, NegativeInfinityType, tuple[str, int]] +CmpLocalType = Union[ NegativeInfinityType, - tuple[ - Union[ - SubLocalType, - tuple[SubLocalType, str], - tuple[NegativeInfinityType, SubLocalType], - ], - ..., - ], + tuple[Union[tuple[int, str], tuple[NegativeInfinityType, Union[int, str]]], ...], ] CmpKey = tuple[ - int, tuple[int, ...], PrePostDevType, PrePostDevType, PrePostDevType, LocalType -] -LegacyCmpKey = tuple[int, tuple[str, ...]] -VersionComparisonMethod = Callable[ - [Union[CmpKey, LegacyCmpKey], Union[CmpKey, LegacyCmpKey]], bool + int, + tuple[int, ...], + CmpPrePostDevType, + CmpPrePostDevType, + CmpPrePostDevType, + CmpLocalType, ] +VersionComparisonMethod = Callable[[CmpKey, CmpKey], bool] -_Version = collections.namedtuple( - "_Version", ["epoch", "release", "dev", "pre", "post", "local"] -) +class _Version(NamedTuple): + epoch: int + release: tuple[int, ...] + dev: tuple[str, int] | None + pre: tuple[str, int] | None + post: tuple[str, int] | None + local: LocalType | None -def parse(version: str) -> LegacyVersion | Version: - """ - Parse the given version string and return either a :class:`Version` object - or a :class:`LegacyVersion` object depending on if the given version is - a valid PEP 440 version or a legacy version. - """ - try: - return Version(version) - except InvalidVersion: - return LegacyVersion(version) +def parse(version: str) -> Version: + return Version(version) + +# The docstring is from an older version of the packaging library to avoid +# errors in the docstring validation. class InvalidVersion(ValueError): """ An invalid version was found, users should refer to PEP 440. + The ``InvalidVersion`` exception is raised when a version string is + improperly formatted. Pandas uses this exception to ensure that all + version strings are PEP 440 compliant. + + See Also + -------- + util.version.Version : Class for handling and parsing version strings. + Examples -------- - >>> pd.util.version.Version('1.') + >>> pd.util.version.Version("1.") Traceback (most recent call last): InvalidVersion: Invalid version: '1.' """ class _BaseVersion: - _key: CmpKey | LegacyCmpKey + _key: tuple[Any, ...] 
def __hash__(self) -> int: return hash(self._key) @@ -183,132 +176,16 @@ def __ne__(self, other: object) -> bool: return self._key != other._key -class LegacyVersion(_BaseVersion): - def __init__(self, version: str) -> None: - self._version = str(version) - self._key = _legacy_cmpkey(self._version) - - warnings.warn( - "Creating a LegacyVersion has been deprecated and will be " - "removed in the next major release.", - DeprecationWarning, - ) - - def __str__(self) -> str: - return self._version - - def __repr__(self) -> str: - return f"" - - @property - def public(self) -> str: - return self._version - - @property - def base_version(self) -> str: - return self._version - - @property - def epoch(self) -> int: - return -1 - - @property - def release(self) -> None: - return None - - @property - def pre(self) -> None: - return None - - @property - def post(self) -> None: - return None - - @property - def dev(self) -> None: - return None - - @property - def local(self) -> None: - return None - - @property - def is_prerelease(self) -> bool: - return False - - @property - def is_postrelease(self) -> bool: - return False - - @property - def is_devrelease(self) -> bool: - return False - - -_legacy_version_component_re = re.compile(r"(\d+ | [a-z]+ | \.| -)", re.VERBOSE) - -_legacy_version_replacement_map = { - "pre": "c", - "preview": "c", - "-": "final-", - "rc": "c", - "dev": "@", -} - - -def _parse_version_parts(s: str) -> Iterator[str]: - for part in _legacy_version_component_re.split(s): - mapped_part = _legacy_version_replacement_map.get(part, part) - - if not mapped_part or mapped_part == ".": - continue - - if mapped_part[:1] in "0123456789": - # pad for numeric comparison - yield mapped_part.zfill(8) - else: - yield "*" + mapped_part - - # ensure that alpha/beta/candidate are before final - yield "*final" - - -def _legacy_cmpkey(version: str) -> LegacyCmpKey: - # We hardcode an epoch of -1 here. A PEP 440 version can only have a epoch - # greater than or equal to 0. This will effectively put the LegacyVersion, - # which uses the defacto standard originally implemented by setuptools, - # as before all PEP 440 versions. - epoch = -1 - - # This scheme is taken from pkg_resources.parse_version setuptools prior to - # it's adoption of the packaging library. - parts: list[str] = [] - for part in _parse_version_parts(version.lower()): - if part.startswith("*"): - # remove "-" before a prerelease tag - if part < "*final": - while parts and parts[-1] == "*final-": - parts.pop() - - # remove trailing zeros from each series of numeric parts - while parts and parts[-1] == "00000000": - parts.pop() - - parts.append(part) - - return epoch, tuple(parts) - - # Deliberately not anchored to the start and end of the string, to make it # easier for 3rd party code to reuse -VERSION_PATTERN = r""" +_VERSION_PATTERN = r""" v? (?: (?:(?P[0-9]+)!)? # epoch (?P[0-9]+(?:\.[0-9]+)*) # release segment (?P
<pre>                                          # pre-release
                 [-_\.]?
-            (?P<pre_l>(a|b|c|rc|alpha|beta|pre|preview))
+            (?P<pre_l>alpha|a|beta|b|preview|pre|c|rc)
                 [-_\.]?
             (?P<pre_n>[0-9]+)?
             )?
    @@ -332,9 +209,12 @@ def _legacy_cmpkey(version: str) -> LegacyCmpKey:
     (?:\+(?P<local>[a-z0-9]+(?:[-_\.][a-z0-9]+)*))?       # local version
     """
     
    +VERSION_PATTERN = _VERSION_PATTERN
    +
     
     class Version(_BaseVersion):
         _regex = re.compile(r"^\s*" + VERSION_PATTERN + r"\s*$", re.VERBOSE | re.IGNORECASE)
    +    _key: CmpKey
     
         def __init__(self, version: str) -> None:
             # Validate the version and parse it into pieces
    @@ -375,11 +255,11 @@ def __str__(self) -> str:
                 parts.append(f"{self.epoch}!")
     
             # Release segment
    -        parts.append(".".join([str(x) for x in self.release]))
    +        parts.append(".".join(str(x) for x in self.release))
     
             # Pre-release
             if self.pre is not None:
    -            parts.append("".join([str(x) for x in self.pre]))
    +            parts.append("".join(str(x) for x in self.pre))
     
             # Post-release
             if self.post is not None:
    @@ -397,18 +277,15 @@ def __str__(self) -> str:
     
         @property
         def epoch(self) -> int:
    -        _epoch: int = self._version.epoch
    -        return _epoch
    +        return self._version.epoch
     
         @property
         def release(self) -> tuple[int, ...]:
    -        _release: tuple[int, ...] = self._version.release
    -        return _release
    +        return self._version.release
     
         @property
         def pre(self) -> tuple[str, int] | None:
    -        _pre: tuple[str, int] | None = self._version.pre
    -        return _pre
    +        return self._version.pre
     
         @property
         def post(self) -> int | None:
    @@ -421,7 +298,7 @@ def dev(self) -> int | None:
         @property
         def local(self) -> str | None:
             if self._version.local:
    -            return ".".join([str(x) for x in self._version.local])
    +            return ".".join(str(x) for x in self._version.local)
             else:
                 return None
     
    @@ -438,7 +315,7 @@ def base_version(self) -> str:
                 parts.append(f"{self.epoch}!")
     
             # Release segment
    -        parts.append(".".join([str(x) for x in self.release]))
    +        parts.append(".".join(str(x) for x in self.release))
     
             return "".join(parts)
     
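With LegacyVersion removed, parse is now a thin alias for the Version constructor and every comparison goes through the PEP 440 key. A small usage sketch of the vendored class:

    from pandas.util.version import Version, parse

    v = parse("1.2.3.post1+local.7")   # always a Version now, never LegacyVersion
    assert v.release == (1, 2, 3)
    assert v.post == 1
    assert v.local == "local.7"
    assert v.base_version == "1.2.3"

    # Release segments compare numerically, not lexicographically:
    assert Version("1.10.0") > Version("1.9.0")
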
    @@ -468,7 +345,7 @@ def micro(self) -> int:
     
     
     def _parse_letter_version(
    -    letter: str, number: str | bytes | SupportsInt
    +    letter: str | None, number: str | bytes | SupportsInt | None
     ) -> tuple[str, int] | None:
         if letter:
             # We consider there to be an implicit 0 in a pre-release if there is
    @@ -505,10 +382,7 @@ def _parse_letter_version(
     _local_version_separators = re.compile(r"[\._-]")
     
     
    -def _parse_local_version(local: str) -> LocalType | None:
    -    """
    -    Takes a string like abc.1.twelve and turns it into ("abc", 1, "twelve").
    -    """
    +def _parse_local_version(local: str | None) -> LocalType | None:
         if local is not None:
             return tuple(
                 part.lower() if not part.isdigit() else int(part)
    @@ -523,7 +397,7 @@ def _cmpkey(
         pre: tuple[str, int] | None,
         post: tuple[str, int] | None,
         dev: tuple[str, int] | None,
    -    local: tuple[SubLocalType] | None,
    +    local: LocalType | None,
     ) -> CmpKey:
         # When we compare a release version, we want to compare it with all of the
         # trailing zeros removed. So we'll use a reverse the list, drop all the now
    @@ -539,7 +413,7 @@ def _cmpkey(
         # if there is not a pre or a post segment. If we have one of those then
         # the normal sorting rules will handle this case correctly.
         if pre is None and post is None and dev is not None:
    -        _pre: PrePostDevType = NegativeInfinity
    +        _pre: CmpPrePostDevType = NegativeInfinity
         # Versions without a pre-release (except as noted above) should sort after
         # those with one.
         elif pre is None:
    @@ -549,21 +423,21 @@ def _cmpkey(
     
         # Versions without a post segment should sort before those with one.
         if post is None:
    -        _post: PrePostDevType = NegativeInfinity
    +        _post: CmpPrePostDevType = NegativeInfinity
     
         else:
             _post = post
     
         # Versions without a development segment should sort after those with one.
         if dev is None:
    -        _dev: PrePostDevType = Infinity
    +        _dev: CmpPrePostDevType = Infinity
     
         else:
             _dev = dev
     
         if local is None:
             # Versions without a local segment should sort before those with one.
    -        _local: LocalType = NegativeInfinity
    +        _local: CmpLocalType = NegativeInfinity
         else:
             # Versions with a local segment need that segment parsed to implement
             # the sorting rules in PEP440.
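The Infinity/NegativeInfinity sentinels substituted in _cmpkey make the PEP 440 ordering fall out of plain tuple comparison: dev releases sort below pre-releases, pre-releases below the final release, post releases above it, and a local segment just after its public version. A sketch of the resulting order:

    from pandas.util.version import Version

    ordered = ["1.0.dev1", "1.0a1", "1.0rc1", "1.0", "1.0+local", "1.0.post1"]
    assert all(Version(a) < Version(b) for a, b in zip(ordered, ordered[1:]))
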
    diff --git a/pyproject.toml b/pyproject.toml
    index 5e65edf81f9c7..370bd0e0605cb 100644
    --- a/pyproject.toml
    +++ b/pyproject.toml
    @@ -2,16 +2,13 @@
     # Minimum requirements for the build system to execute.
     # See https://github.com/scipy/scipy/pull/12940 for the AIX issue.
     requires = [
    -    "meson-python==0.13.1",
    -    "meson==1.2.1",
    +    "meson-python>=0.13.1",
    +    "meson>=1.2.1,<2",
         "wheel",
    -    "Cython==3.0.5",  # Note: sync with setup.py, environment.yml and asv.conf.json
    -    # Any NumPy version should be fine for compiling.  Users are unlikely
    -    # to get a NumPy<1.25 so the result will be compatible with all relevant
    -    # NumPy versions (if not it is presumably compatible with their version).
    -    # Pin <2.0 for releases until tested against an RC.  But explicitly allow
    -    # testing the `.dev0` nightlies (which require the extra index).
    -    "numpy>1.22.4,<=2.0.0.dev0",
    +    "Cython==3.1.0rc1",  # Note: sync with setup.py, environment.yml and asv.conf.json
    +    # Force numpy higher than 2.0rc1, so that built wheels are compatible
    +    # with both numpy 1 and 2
    +    "numpy>=2.0.0rc1",
         "versioneer[toml]"
     ]
     
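The numpy>=2.0.0rc1 build pin relies on PEP 440 ordering: a release candidate sorts below the final release, so the same floor admits the rc used for building the wheels as well as every later 2.x release. Illustrated with the version class vendored in this same diff:

    from pandas.util.version import Version

    assert Version("2.0.0rc1") < Version("2.0.0") < Version("2.1.0")
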
    @@ -28,13 +25,11 @@ authors = [
       { name = 'The Pandas Development Team', email='pandas-dev@python.org' },
     ]
     license = {file = 'LICENSE'}
    -requires-python = '>=3.9'
    +requires-python = '>=3.10'
     dependencies = [
    -  "numpy>=1.22.4; python_version<'3.11'",
    -  "numpy>=1.23.2; python_version=='3.11'",
    +  "numpy>=1.23.5; python_version<'3.12'",
       "numpy>=1.26.0; python_version>='3.12'",
       "python-dateutil>=2.8.2",
    -  "pytz>=2020.1",
       "tzdata>=2022.7"
     ]
     classifiers = [
    @@ -47,10 +42,10 @@ classifiers = [
         'Programming Language :: Python',
         'Programming Language :: Python :: 3',
         'Programming Language :: Python :: 3 :: Only',
    -    'Programming Language :: Python :: 3.9',
         'Programming Language :: Python :: 3.10',
         'Programming Language :: Python :: 3.11',
         'Programming Language :: Python :: 3.12',
    +    'Programming Language :: Python :: 3.13',
         'Topic :: Scientific/Engineering'
     ]
     
    @@ -63,68 +58,66 @@ repository = 'https://github.com/pandas-dev/pandas'
     matplotlib = "pandas:plotting._matplotlib"
     
     [project.optional-dependencies]
    -test = ['hypothesis>=6.46.1', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0']
    -performance = ['bottleneck>=1.3.6', 'numba>=0.56.4', 'numexpr>=2.8.4']
    -computation = ['scipy>=1.10.0', 'xarray>=2022.12.0']
    -fss = ['fsspec>=2022.11.0']
    -aws = ['s3fs>=2022.11.0']
    -gcp = ['gcsfs>=2022.11.0', 'pandas-gbq>=0.19.0']
    -excel = ['odfpy>=1.4.1', 'openpyxl>=3.1.0', 'python-calamine>=0.1.7', 'pyxlsb>=1.0.10', 'xlrd>=2.0.1', 'xlsxwriter>=3.0.5']
    +test = ['hypothesis>=6.84.0', 'pytest>=7.3.2', 'pytest-xdist>=3.4.0']
    +pyarrow = ['pyarrow>=10.0.1']
    +performance = ['bottleneck>=1.3.6', 'numba>=0.59.0', 'numexpr>=2.9.0']
    +computation = ['scipy>=1.12.0', 'xarray>=2024.1.1']
    +fss = ['fsspec>=2023.12.2']
    +aws = ['s3fs>=2023.12.2']
    +gcp = ['gcsfs>=2023.12.2']
    +excel = ['odfpy>=1.4.1', 'openpyxl>=3.1.2', 'python-calamine>=0.1.7', 'pyxlsb>=1.0.10', 'xlrd>=2.0.1', 'xlsxwriter>=3.2.0']
     parquet = ['pyarrow>=10.0.1']
     feather = ['pyarrow>=10.0.1']
    -hdf5 = [# blosc only available on conda (https://github.com/Blosc/python-blosc/issues/297)
    -        #'blosc>=1.20.1',
    -        'tables>=3.8.0']
    -spss = ['pyreadstat>=1.2.0']
    -postgresql = ['SQLAlchemy>=2.0.0', 'psycopg2>=2.9.6', 'adbc-driver-postgresql>=0.8.0']
    -mysql = ['SQLAlchemy>=2.0.0', 'pymysql>=1.0.2']
    -sql-other = ['SQLAlchemy>=2.0.0', 'adbc-driver-postgresql>=0.8.0', 'adbc-driver-sqlite>=0.8.0']
    -html = ['beautifulsoup4>=4.11.2', 'html5lib>=1.1', 'lxml>=4.9.2']
    +iceberg = ['pyiceberg>=0.7.1']
    +hdf5 = ['tables>=3.8.0']
    +spss = ['pyreadstat>=1.2.6']
    +postgresql = ['SQLAlchemy>=2.0.0', 'psycopg2>=2.9.6', 'adbc-driver-postgresql>=0.10.0']
    +mysql = ['SQLAlchemy>=2.0.0', 'pymysql>=1.1.0']
    +sql-other = ['SQLAlchemy>=2.0.0', 'adbc-driver-postgresql>=0.10.0', 'adbc-driver-sqlite>=0.8.0']
    +html = ['beautifulsoup4>=4.12.3', 'html5lib>=1.1', 'lxml>=4.9.2']
     xml = ['lxml>=4.9.2']
    -plot = ['matplotlib>=3.6.3']
    -output-formatting = ['jinja2>=3.1.2', 'tabulate>=0.9.0']
    +plot = ['matplotlib>=3.8.3']
    +output-formatting = ['jinja2>=3.1.3', 'tabulate>=0.9.0']
     clipboard = ['PyQt5>=5.15.9', 'qtpy>=2.3.0']
    -compression = ['zstandard>=0.19.0']
    -consortium-standard = ['dataframe-api-compat>=0.1.7']
    -all = ['adbc-driver-postgresql>=0.8.0',
    +compression = ['zstandard>=0.22.0']
    +timezone = ['pytz>=2023.4']
    +all = ['adbc-driver-postgresql>=0.10.0',
            'adbc-driver-sqlite>=0.8.0',
    -       'beautifulsoup4>=4.11.2',
    -       # blosc only available on conda (https://github.com/Blosc/python-blosc/issues/297)
    -       #'blosc>=1.21.3',
    +       'beautifulsoup4>=4.12.3',
            'bottleneck>=1.3.6',
    -       'dataframe-api-compat>=0.1.7',
    -       'fastparquet>=2022.12.0',
    -       'fsspec>=2022.11.0',
    -       'gcsfs>=2022.11.0',
    +       'fastparquet>=2024.2.0',
    +       'fsspec>=2023.12.2',
    +       'gcsfs>=2023.12.2',
            'html5lib>=1.1',
    -       'hypothesis>=6.46.1',
    -       'jinja2>=3.1.2',
    +       'hypothesis>=6.84.0',
    +       'jinja2>=3.1.3',
            'lxml>=4.9.2',
    -       'matplotlib>=3.6.3',
    -       'numba>=0.56.4',
    -       'numexpr>=2.8.4',
    +       'matplotlib>=3.8.3',
    +       'numba>=0.59.0',
    +       'numexpr>=2.9.0',
            'odfpy>=1.4.1',
    -       'openpyxl>=3.1.0',
    -       'pandas-gbq>=0.19.0',
    +       'openpyxl>=3.1.2',
            'psycopg2>=2.9.6',
            'pyarrow>=10.0.1',
    -       'pymysql>=1.0.2',
    +       'pyiceberg>=0.7.1',
    +       'pymysql>=1.1.0',
            'PyQt5>=5.15.9',
    -       'pyreadstat>=1.2.0',
    +       'pyreadstat>=1.2.6',
            'pytest>=7.3.2',
    -       'pytest-xdist>=2.2.0',
    +       'pytest-xdist>=3.4.0',
            'python-calamine>=0.1.7',
    +       'pytz>=2023.4',
            'pyxlsb>=1.0.10',
            'qtpy>=2.3.0',
    -       'scipy>=1.10.0',
    -       's3fs>=2022.11.0',
    +       'scipy>=1.12.0',
    +       's3fs>=2023.12.2',
            'SQLAlchemy>=2.0.0',
            'tables>=3.8.0',
            'tabulate>=0.9.0',
    -       'xarray>=2022.12.0',
    +       'xarray>=2024.1.1',
            'xlrd>=2.0.1',
    -       'xlsxwriter>=3.0.5',
    -       'zstandard>=0.19.0']
    +       'xlsxwriter>=3.2.0',
    +       'zstandard>=0.22.0']
     
     # TODO: Remove after setuptools support is dropped.
     [tool.setuptools]
    @@ -152,34 +145,39 @@ parentdir_prefix = "pandas-"
     setup = ['--vsenv'] # For Windows
     
     [tool.cibuildwheel]
    -skip = "cp36-* cp37-* cp38-* pp* *_i686 *_ppc64le *_s390x"
    -build-verbosity = "3"
    +skip = "cp36-* cp37-* cp38-* cp39-* pp* *_i686 *_ppc64le *_s390x"
    +build-verbosity = 3
     environment = {LDFLAGS="-Wl,--strip-all"}
    -test-requires = "hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0"
    +test-requires = "hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0"
     test-command = """
       PANDAS_CI='1' python -c 'import pandas as pd; \
       pd.test(extra_args=["-m not clipboard and not single_cpu and not slow and not network and not db", "-n 2", "--no-strict-data-files"]); \
       pd.test(extra_args=["-m not clipboard and single_cpu and not slow and not network and not db", "--no-strict-data-files"]);' \
       """
    -
    -[tool.cibuildwheel.macos]
    -archs = "x86_64 arm64"
    -test-skip = "*_arm64"
    +enable = ["cpython-freethreading"]
    +before-build = "PACKAGE_DIR={package} bash {package}/scripts/cibw_before_build.sh"
     
     [tool.cibuildwheel.windows]
    +environment = {}
     before-build = "pip install delvewheel"
    +test-command = """
    +  set PANDAS_CI='1' && \
    +  python -c "import pandas as pd; \
    +  pd.test(extra_args=['--no-strict-data-files', '-m not clipboard and not single_cpu and not slow and not network and not db']);" \
    +  """
     repair-wheel-command = "delvewheel repair -w {dest_dir} {wheel}"
     
     [[tool.cibuildwheel.overrides]]
    -select = "*-musllinux*"
    -before-test = "apk update && apk add musl-locales"
    +select = "*-manylinux_aarch64*"
    +test-command = """
    +  PANDAS_CI='1' python -c 'import pandas as pd; \
    +  pd.test(extra_args=["-m not clipboard and not single_cpu and not slow and not network and not db and not fails_arm_wheels", "-n 2", "--no-strict-data-files"]); \
    +  pd.test(extra_args=["-m not clipboard and single_cpu and not slow and not network and not db", "--no-strict-data-files"]);' \
    +  """
     
     [[tool.cibuildwheel.overrides]]
    -select = "*-win*"
    -# We test separately for Windows, since we use
    -# the windowsservercore docker image to check if any dlls are
    -# missing from the wheel
    -test-command = ""
    +select = "*-musllinux*"
    +before-test = "apk update && apk add musl-locales"
     
     [[tool.cibuildwheel.overrides]]
     # Don't strip wheels on macOS.
    @@ -188,31 +186,22 @@ test-command = ""
     select = "*-macosx*"
     environment = {CFLAGS="-g0"}
     
    -[tool.black]
    -target-version = ['py39', 'py310']
    -required-version = '23.11.0'
    -exclude = '''
    -(
    -    asv_bench/env
    -  | \.egg
    -  | \.git
    -  | \.hg
    -  | \.mypy_cache
    -  | \.nox
    -  | \.tox
    -  | \.venv
    -  | _build
    -  | buck-out
    -  | build
    -  | dist
    -  | setup.py
    -)
    -'''
    +[[tool.cibuildwheel.overrides]]
    +select = "*pyodide*"
    +test-requires = "pytest>=7.3.2 hypothesis>=6.84.0"
    +# Pyodide repairs wheels on its own, using auditwheel-emscripten
    +repair-wheel-command = ""
    +test-command = """
    +  PANDAS_CI='1' python -c 'import pandas as pd; \
    +  pd.test(extra_args=["-m not clipboard and not single_cpu and not slow and not network and not db", "--no-strict-data-files"]);' \
    +  """
     
     [tool.ruff]
     line-length = 88
     target-version = "py310"
     fix = true
    +
    +[tool.ruff.lint]
     unfixable = []
     typing-modules = ["pandas._typing"]
     
    @@ -233,6 +222,8 @@ select = [
       "INT",
       # pylint
       "PL",
    +  # flake8-pytest-style
    +  "PT",
       # misc lints
       "PIE",
       # flake8-pyi
    @@ -241,8 +232,8 @@ select = [
       "TID",
       # implicit string concatenation
       "ISC",
    -  # type-checking imports
    -  "TCH",
    +  # flake8-type-checking
    +  "TC",
       # comprehensions
       "C4",
       # pygrep-hooks
    @@ -259,42 +250,32 @@ select = [
       "FLY",
       # flake8-logging-format
       "G",
    +  # flake8-future-annotations
    +  "FA",
    +  # unconventional-import-alias
    +  "ICN001",
    +  # flake8-slots
    +  "SLOT",
    +  # flake8-raise
    +  "RSE"
     ]
     
     ignore = [
       ### Intentionally disabled
    -  # space before : (needed for how black formats slicing)
    -  "E203",
       # module level import not at top of file
       "E402",
       # do not assign a lambda expression, use a def
       "E731",
    -  # line break before binary operator
    -  # "W503",  # not yet implemented
    -  # line break after binary operator
    -  # "W504",  # not yet implemented
    -  # controversial
    -  "B006",
       # controversial
       "B007",
       # controversial
       "B008",
    -  # setattr is used to side-step mypy
    -  "B009",
       # getattr is used to side-step mypy
       "B010",
    -  # tests use assert False
    -  "B011",
       # tests use comparisons but not their returned value
       "B015",
    -  # false positives
    -  "B019",
    -  # Loop control variable overrides iterable it iterates
    -  "B020",
       # Function definition does not bind loop variable
       "B023",
    -  # Functions defined inside a loop must not use variables redefined in the loop
    -  # "B301",  # not yet implemented
       # Only works with python >=3.10
       "B905",
       # Too many arguments to function call
    @@ -309,41 +290,106 @@ ignore = [
       "PLW2901",
       # Global statements are discouraged
       "PLW0603",
    -  # Docstrings should not be included in stubs
    -  "PYI021",
       # Use `typing.NamedTuple` instead of `collections.namedtuple`
       "PYI024",
    -  # No builtin `eval()` allowed
    -  "PGH001",
    -  # compare-to-empty-string
    -  "PLC1901",
    +  # Use of possibly insecure function; consider using ast.literal_eval
    +  "S307",
       # while int | float can be shortened to float, the former is more explicit
       "PYI041",
       # incorrect-dict-iterator, flags valid Series.items usage
       "PERF102",
       # try-except-in-loop, becomes useless in Python 3.11
       "PERF203",
    -
    +  # pytest-parametrize-names-wrong-type
    +  "PT006",
    +  # pytest-parametrize-values-wrong-type
    +  "PT007",
    +  # pytest-patch-with-lambda
    +  "PT008",
    +  # pytest-raises-with-multiple-statements
    +  "PT012",
    +  # pytest-assert-in-except
    +  "PT017",
    +  # pytest-composite-assertion
    +  "PT018",
    +  # pytest-fixture-param-without-value
    +  "PT019",
    +  # The following rules may cause conflicts when used with the formatter:
    +  "ISC001",
    +  # if-stmt-min-max
    +  "PLR1730",
     
       ### TODO: Enable gradually
       # Useless statement
       "B018",
    -  # Within an except clause, raise exceptions with ...
    -  "B904",
       # Magic number
       "PLR2004",
       # comparison-with-itself
       "PLR0124",
    -  # Consider `elif` instead of `else` then `if` to remove indentation level
    -  "PLR5501",
       # collection-literal-concatenation
       "RUF005",
       # pairwise-over-zipped (>=PY310 only)
       "RUF007",
    -  # explicit-f-string-type-conversion
    -  "RUF010",
       # mutable-class-default
    -  "RUF012"
    +  "RUF012",
    +  # type-comparison
    +  "E721",
    +  # repeated-equality-comparison
    +  "PLR1714",
    +  # self-or-cls-assignment
    +  "PLW0642",
    +  # literal-membership
    +  "PLR6201", # 847 errors
    +  # Method could be a function, class method, or static method
    +  "PLR6301", # 11411 errors
    +  # Private name import
    +  "PLC2701", # 27 errors
    +  # Too many positional arguments (6/5)
    +  "PLR0917", # 470 errors
    +  # compare-to-empty-string
    +  "PLC1901",
    +  # `tempfile.NamedTemporaryFile` in text mode without explicit `encoding` argument
    +  "PLW1514", # 1 error
    +  # Object does not implement `__hash__` method
    +  "PLW1641", # 16 errors
    +  # Bad or misspelled dunder method name
    +  "PLW3201", # 69 errors, seems to be all false positive
    +  # Unnecessary lookup of dictionary value by key
    +  "PLR1733", # 5 errors, it seems like we wannt to ignore these
    +  # Unnecessary lookup of list item by index
    +  "PLR1736", # 4 errors, we're currently having inline pylint ignore
    +  # Unpacking a dictionary in iteration without calling `.items()`
    +  "PLE1141", # autofixable
    +  # import-outside-toplevel
    +  "PLC0415",
    +  # unnecessary-dunder-call
    +  "PLC2801",
    +  # too-many-public-methods
    +  "PLR0904",
    +  # too-many-return-statements
    +  "PLR0911",
    +  # too-many-branches
    +  "PLR0912",
    +  # too-many-arguments
    +  "PLR0913",
    +  # too-many-locals
    +  "PLR0914",
    +  # too-many-statements
    +  "PLR0915",
    +  # too-many-boolean-expressions
    +  "PLR0916",
    +  # too-many-nested-blocks
    +  "PLR1702",
    +  # redefined-argument-from-local
    +  "PLR1704",
    +  # unnecessary-lambda
    +  "PLW0108",
    +  # runtime-cast-value
    +  "TC006",
     ]
     
     exclude = [
    @@ -358,7 +404,24 @@ exclude = [
       "env",
     ]
     
    -[tool.ruff.per-file-ignores]
    +[tool.ruff.lint.flake8-tidy-imports.banned-api]
    +"urllib.request.urlopen".msg = "Use pandas.io.common.urlopen instead of urllib.request.urlopen"
    +# numpy.random is banned but np.random is not. Is this intentional?
    +# "numpy.random".msg = "Do not use numpy.random"
    +"pytest.warns".msg = "Use tm.assert_produces_warning instead of pytest.warns"
    +"pytest.xfail".msg = "Use pytest.mark.xfail instead of pytest.xfail"
    +"conftest".msg = "No direct imports from conftest"
    +"numpy.testing".msg = "Do not use numpy.testing"
    +# "numpy.array_equal".msg = "Do not use numpy.array_equal" # Used in pandas/core
    +"unittest.mock".msg = "use pytest builtin monkeypatch fixture instead"
    +"os.remove".msg = "Do not use os.remove"
     +
    +[tool.ruff.lint.flake8-import-conventions.aliases]
    +"pandas.core.construction.array" = "pd_array"
    +
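
To illustrate what the two tables above enforce: the banned-api entries make flake8-tidy-imports (TID251) flag any use of the listed names, while ICN001 pins the import alias for the array constructor. A sketch of the compliant forms:

```python
# Compliant with the banned-api and import-conventions tables above.
from pandas.core.construction import array as pd_array  # ICN001 alias
from pandas.io.common import urlopen  # noqa: F401  (preferred over urllib.request.urlopen)

arr = pd_array([1, 2, 3])  # builds an ExtensionArray with inferred dtype
print(type(arr).__name__)  # IntegerArray
```
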
    +[tool.ruff.lint.per-file-ignores]
     # relative imports allowed for asv_bench
     "asv_bench/*" = ["TID", "NPY002"]
     # to be enabled gradually
    @@ -366,111 +429,14 @@ exclude = [
     "pandas/tests/*" = ["B028", "FLY"]
     "scripts/*" = ["B028"]
     # Keep this one enabled
    -"pandas/_typing.py" = ["TCH"]
    -
    -[tool.pylint.messages_control]
    -max-line-length = 88
    -disable = [
    - # intentionally turned off
    -  "bad-mcs-classmethod-argument",
    -  "broad-except",
    -  "c-extension-no-member",
    -  "comparison-with-itself",
    -  "consider-using-enumerate",
    -  "import-error",
    -  "import-outside-toplevel",
    -  "invalid-name",
    -  "invalid-unary-operand-type",
    -  "line-too-long",
    -  "no-else-continue",
    -  "no-else-raise",
    -  "no-else-return",
    -  "no-member",
    -  "no-name-in-module",
    -  "not-an-iterable",
    -  "overridden-final-method",
    -  "pointless-statement",
    -  "redundant-keyword-arg",
    -  "singleton-comparison",
    -  "too-many-ancestors",
    -  "too-many-arguments",
    -  "too-many-boolean-expressions",
    -  "too-many-branches",
    -  "too-many-function-args",
    -  "too-many-instance-attributes",
    -  "too-many-locals",
    -  "too-many-nested-blocks",
    -  "too-many-public-methods",
    -  "too-many-return-statements",
    -  "too-many-statements",
    -  "unexpected-keyword-arg",
    -  "ungrouped-imports",
    -  "unsubscriptable-object",
    -  "unsupported-assignment-operation",
    -  "unsupported-membership-test",
    -  "unused-import",
    -  "use-dict-literal",
    -  "use-implicit-booleaness-not-comparison",
    -  "use-implicit-booleaness-not-len",
    -  "wrong-import-order",
    -  "wrong-import-position",
    -  "redefined-loop-name",
    -
    - # misc
    -  "abstract-class-instantiated",
    -  "no-value-for-parameter",
    -  "undefined-variable",
    -  "unpacking-non-sequence",
    -  "used-before-assignment",
    -
    - # pylint type "C": convention, for programming standard violation
    -  "missing-class-docstring",
    -  "missing-function-docstring",
    -  "missing-module-docstring",
    -  "superfluous-parens",
    -  "too-many-lines",
    -  "unidiomatic-typecheck",
    -  "unnecessary-dunder-call",
    -  "unnecessary-lambda-assignment",
    -
    -  # pylint type "R": refactor, for bad code smell
    -  "consider-using-with",
    -  "cyclic-import",
    -  "duplicate-code",
    -  "inconsistent-return-statements",
    -  "redefined-argument-from-local",
    -  "too-few-public-methods",
    -
    -  # pylint type "W": warning, for python specific problems
    -  "abstract-method",
    -  "arguments-differ",
    -  "arguments-out-of-order",
    -  "arguments-renamed",
    -  "attribute-defined-outside-init",
    -  "broad-exception-raised",
    -  "comparison-with-callable",
    -  "dangerous-default-value",
    -  "deprecated-module",
    -  "eval-used",
    -  "expression-not-assigned",
    -  "fixme",
    -  "global-statement",
    -  "invalid-overridden-method",
    -  "keyword-arg-before-vararg",
    -  "possibly-unused-variable",
    -  "protected-access",
    -  "raise-missing-from",
    -  "redefined-builtin",
    -  "redefined-outer-name",
    -  "self-cls-assignment",
    -  "signature-differs",
    -  "super-init-not-called",
    -  "try-except-raise",
    -  "unnecessary-lambda",
    -  "unused-argument",
    -  "unused-variable",
    -  "using-constant-test"
    -]
    +"pandas/_typing.py" = ["TC"]
    +
    +[tool.ruff.lint.flake8-pytest-style]
    +fixture-parentheses = false
    +mark-parentheses = false
    +
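
With both parentheses settings disabled, ruff's flake8-pytest-style rules (PT001/PT023) prefer bare decorators over empty calls:

```python
import pytest

@pytest.fixture  # PT001: not @pytest.fixture()
def sample_value():
    return 42

@pytest.mark.slow  # PT023: same convention for marks
def test_sample_value(sample_value):
    assert sample_value == 42
```
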
    +[tool.ruff.format]
    +docstring-code-format = true
     
     [tool.pytest.ini_options]
     # sync minversion with pyproject.toml & install.rst
    @@ -500,20 +466,10 @@ filterwarnings = [
       "ignore::ResourceWarning:asyncio",
       # From plotting doctests
       "ignore:More than 20 figures have been opened:RuntimeWarning",
    -  # Will be fixed in numba 0.56: https://github.com/numba/numba/issues/7758
    -  "ignore:`np.MachAr` is deprecated:DeprecationWarning:numba",
       "ignore:.*urllib3:DeprecationWarning:botocore",
       "ignore:Setuptools is replacing distutils.:UserWarning:_distutils_hack",
       # https://github.com/PyTables/PyTables/issues/822
       "ignore:a closed node found in the registry:UserWarning:tables",
    -  "ignore:`np.object` is a deprecated:DeprecationWarning:tables",
    -  "ignore:tostring:DeprecationWarning:tables",
    -  "ignore:distutils Version classes are deprecated:DeprecationWarning:pandas_datareader",
    -  "ignore:distutils Version classes are deprecated:DeprecationWarning:numexpr",
    -  "ignore:distutils Version classes are deprecated:DeprecationWarning:fastparquet",
    -  "ignore:distutils Version classes are deprecated:DeprecationWarning:fsspec",
    -  # Can be removed once https://github.com/numpy/numpy/pull/24794 is merged
    -  "ignore:.*In the future `np.long` will be defined as.*:FutureWarning",
     ]
     junit_family = "xunit2"
     markers = [
    @@ -524,6 +480,10 @@ markers = [
       "clipboard: mark a pd.read_clipboard test",
       "arm_slow: mark a test as slow for arm64 architecture",
       "skip_ubsan: Tests known to fail UBSAN check",
     +  # TODO: investigate why these tests only fail in the wheel builder
     +  # and not in regular ARM CI
    +  "fails_arm_wheels: Tests that fail in the ARM wheel build only",
     ]
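
The new `fails_arm_wheels` marker pairs with the `not fails_arm_wheels` expression in the `*-manylinux_aarch64*` override earlier in this file; in a test module it is applied as:

```python
# Marking a test so the aarch64 wheel build deselects it.
import pytest

@pytest.mark.fails_arm_wheels
def test_known_bad_on_arm_wheels():
    assert True  # placeholder body
```
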
     
     [tool.mypy]
    @@ -581,14 +541,8 @@ module = [
       "pandas._config.config", # TODO
       "pandas._libs.*",
       "pandas._testing.*", # TODO
    -  "pandas.arrays", # TODO
       "pandas.compat.numpy.function", # TODO
    -  "pandas.compat._optional", # TODO
    -  "pandas.compat.compressors", # TODO
    -  "pandas.compat.pickle_compat", # TODO
       "pandas.core._numba.executor", # TODO
    -  "pandas.core.array_algos.datetimelike_accumulations", # TODO
    -  "pandas.core.array_algos.masked_accumulations", # TODO
       "pandas.core.array_algos.masked_reductions", # TODO
       "pandas.core.array_algos.putmask", # TODO
       "pandas.core.array_algos.quantile", # TODO
    @@ -602,9 +556,7 @@ module = [
       "pandas.core.dtypes.concat", # TODO
       "pandas.core.dtypes.dtypes", # TODO
       "pandas.core.dtypes.generic", # TODO
    -  "pandas.core.dtypes.inference", # TODO
       "pandas.core.dtypes.missing", # TODO
    -  "pandas.core.groupby.categorical", # TODO
       "pandas.core.groupby.generic", # TODO
       "pandas.core.groupby.grouper", # TODO
       "pandas.core.groupby.groupby", # TODO
    @@ -615,11 +567,8 @@ module = [
       "pandas.core.interchange.dataframe_protocol", # TODO
       "pandas.core.interchange.from_dataframe", # TODO
       "pandas.core.internals.*", # TODO
    -  "pandas.core.methods.*", # TODO
       "pandas.core.ops.array_ops", # TODO
       "pandas.core.ops.common", # TODO
    -  "pandas.core.ops.invalid", # TODO
    -  "pandas.core.ops.mask_ops", # TODO
       "pandas.core.ops.missing", # TODO
       "pandas.core.reshape.*", # TODO
       "pandas.core.strings.*", # TODO
    @@ -636,7 +585,6 @@ module = [
       "pandas.core.arraylike", # TODO
       "pandas.core.base", # TODO
       "pandas.core.common", # TODO
    -  "pandas.core.config_init", # TODO
       "pandas.core.construction", # TODO
       "pandas.core.flags", # TODO
       "pandas.core.frame", # TODO
    @@ -653,17 +601,12 @@ module = [
       "pandas.io.clipboard", # TODO
       "pandas.io.excel._base", # TODO
       "pandas.io.excel._odfreader", # TODO
    -  "pandas.io.excel._odswriter", # TODO
       "pandas.io.excel._openpyxl", # TODO
       "pandas.io.excel._pyxlsb", # TODO
       "pandas.io.excel._xlrd", # TODO
       "pandas.io.excel._xlsxwriter", # TODO
    -  "pandas.io.formats.console", # TODO
    -  "pandas.io.formats.css", # TODO
       "pandas.io.formats.excel", # TODO
       "pandas.io.formats.format", # TODO
    -  "pandas.io.formats.info", # TODO
    -  "pandas.io.formats.printing", # TODO
       "pandas.io.formats.style", # TODO
       "pandas.io.formats.style_render", # TODO
       "pandas.io.formats.xml", # TODO
    @@ -672,14 +615,10 @@ module = [
       "pandas.io.sas.sas_xport", # TODO
       "pandas.io.sas.sas7bdat", # TODO
       "pandas.io.clipboards", # TODO
    -  "pandas.io.common", # TODO
    -  "pandas.io.gbq", # TODO
       "pandas.io.html", # TODO
    -  "pandas.io.gbq", # TODO
       "pandas.io.parquet", # TODO
       "pandas.io.pytables", # TODO
       "pandas.io.sql", # TODO
    -  "pandas.io.stata", # TODO
       "pandas.io.xml", # TODO
       "pandas.plotting.*", # TODO
       "pandas.tests.*",
    @@ -687,7 +626,6 @@ module = [
       "pandas.tseries.holiday", # TODO
       "pandas.util._decorators", # TODO
       "pandas.util._doctools", # TODO
    -  "pandas.util._print_versions", # TODO
       "pandas.util._test_decorators", # TODO
       "pandas.util._validators", # TODO
       "pandas.util", # TODO
    @@ -756,14 +694,22 @@ reportUntypedNamedTuple = true
     reportUnusedImport = true
     disableBytesTypePromotions = true
     # disable subset of "basic"
    +reportArgumentType = false
    +reportAssignmentType = false
    +reportAttributeAccessIssue = false
    +reportCallIssue = false
     reportGeneralTypeIssues = false
    +reportIndexIssue = false
     reportMissingModuleSource = false
    +reportOperatorIssue = false
     reportOptionalCall = false
     reportOptionalIterable = false
     reportOptionalMemberAccess = false
     reportOptionalOperand = false
     reportOptionalSubscript = false
     reportPrivateImportUsage = false
    +reportRedeclaration = false
    +reportReturnType = false
     reportUnboundVariable = false
     
     [tool.coverage.run]
    @@ -796,5 +742,5 @@ exclude_lines = [
     directory = "coverage_html_report"
     
     [tool.codespell]
    -ignore-words-list = "blocs, coo, hist, nd, sav, ser, recuse, nin, timere, expec, expecs"
    +ignore-words-list = "blocs, coo, hist, nd, sav, ser, recuse, nin, timere, expec, expecs, indext, SME, NotIn, tructures, tru, indx, abd, ABD"
     ignore-regex = 'https://([\w/\.])+'
    diff --git a/pyright_reportGeneralTypeIssues.json b/pyright_reportGeneralTypeIssues.json
    index a38343d6198ae..9103d4d533cf3 100644
    --- a/pyright_reportGeneralTypeIssues.json
    +++ b/pyright_reportGeneralTypeIssues.json
    @@ -1,6 +1,14 @@
     {
         "typeCheckingMode": "off",
    +    "reportArgumentType": true,
    +    "reportAssignmentType": true,
    +    "reportAttributeAccessIssue": true,
    +    "reportCallIssue": true,
         "reportGeneralTypeIssues": true,
    +    "reportIndexIssue": true,
    +    "reportOperatorIssue": true,
    +    "reportRedeclaration": true,
    +    "reportReturnType": true,
         "useLibraryCodeForTypes": false,
         "analyzeUnannotatedFunctions": false,
         "disableBytesTypePromotions": true,
    @@ -41,7 +49,6 @@
             "pandas/core/arrays/string_arrow.py",
             "pandas/core/arrays/timedeltas.py",
             "pandas/core/computation/align.py",
    -        "pandas/core/computation/ops.py",
             "pandas/core/construction.py",
             "pandas/core/dtypes/cast.py",
             "pandas/core/dtypes/common.py",
    @@ -64,7 +71,6 @@
             "pandas/core/indexes/period.py",
             "pandas/core/indexing.py",
             "pandas/core/internals/api.py",
    -        "pandas/core/internals/array_manager.py",
             "pandas/core/internals/blocks.py",
             "pandas/core/internals/construction.py",
             "pandas/core/internals/managers.py",
    @@ -99,11 +105,11 @@
             "pandas/io/parsers/base_parser.py",
             "pandas/io/parsers/c_parser_wrapper.py",
             "pandas/io/pytables.py",
    -        "pandas/io/sas/sas_xport.py",
             "pandas/io/sql.py",
             "pandas/io/stata.py",
             "pandas/plotting/_matplotlib/boxplot.py",
             "pandas/plotting/_matplotlib/core.py",
    +        "pandas/plotting/_matplotlib/misc.py",
             "pandas/plotting/_matplotlib/timeseries.py",
             "pandas/plotting/_matplotlib/tools.py",
             "pandas/tseries/frequencies.py",
    diff --git a/requirements-dev.txt b/requirements-dev.txt
    index cbfb6336b2e16..c2bac550bc664 100644
    --- a/requirements-dev.txt
    +++ b/requirements-dev.txt
    @@ -3,65 +3,64 @@
     
     pip
     versioneer[toml]
    -cython==3.0.5
    +cython~=3.0.5
     meson[ninja]==1.2.1
     meson-python==0.13.1
     pytest>=7.3.2
     pytest-cov
    -pytest-xdist>=2.2.0
    -pytest-qt>=4.2.0
    +pytest-xdist>=3.4.0
    +pytest-qt>=4.4.0
    +pytest-localserver
     PyQt5>=5.15.9
     coverage
     python-dateutil
    -numpy<2
    -pytz
    -beautifulsoup4>=4.11.2
    -blosc
    +numpy<3
    +beautifulsoup4>=4.12.3
     bottleneck>=1.3.6
    -fastparquet>=2022.12.0
    -fsspec>=2022.11.0
    +fastparquet>=2024.2.0
    +fsspec>=2023.12.2
     html5lib>=1.1
    -hypothesis>=6.46.1
    -gcsfs>=2022.11.0
    +hypothesis>=6.84.0
    +gcsfs>=2023.12.2
     ipython
    -jinja2>=3.1.2
    +pickleshare
    +jinja2>=3.1.3
     lxml>=4.9.2
    -matplotlib>=3.6.3
    -numba>=0.56.4
    -numexpr>=2.8.4
    -openpyxl>=3.1.0
    +matplotlib>=3.8.3
    +numba>=0.59.0
    +numexpr>=2.9.0
    +openpyxl>=3.1.2
     odfpy>=1.4.1
    -py
     psycopg2-binary>=2.9.6
     pyarrow>=10.0.1
    -pymysql>=1.0.2
    -pyreadstat>=1.2.0
    +pyiceberg>=0.7.1
    +pymysql>=1.1.0
    +pyreadstat>=1.2.6
     tables>=3.8.0
     python-calamine>=0.1.7
    +pytz>=2023.4
     pyxlsb>=1.0.10
    -s3fs>=2022.11.0
    -scipy>=1.10.0
    +s3fs>=2023.12.2
    +scipy>=1.12.0
     SQLAlchemy>=2.0.0
     tabulate>=0.9.0
    -xarray>=2022.12.0
    +xarray>=2024.1.1
     xlrd>=2.0.1
    -xlsxwriter>=3.0.5
    -zstandard>=0.19.0
    +xlsxwriter>=3.2.0
    +zstandard>=0.22.0
     dask
     seaborn
     moto
     flask
     asv>=0.6.1
    -flake8==6.1.0
    -mypy==1.7.1
    +flake8==7.1.0
    +mypy==1.13.0
     tokenize-rt
    -pre-commit>=3.6.0
    +pre-commit>=4.2.0
     gitpython
    -gitdb
    -google-auth
     natsort
     numpydoc
    -pydata-sphinx-theme==0.14
    +pydata-sphinx-theme==0.16
     pytest-cython
     sphinx
     sphinx-design
    @@ -83,8 +82,9 @@ feedparser
     pyyaml
     requests
     pygments
    -adbc-driver-postgresql>=0.8.0
    +jupyterlite-core
    +jupyterlite-pyodide-kernel
    +adbc-driver-postgresql>=0.10.0
     adbc-driver-sqlite>=0.8.0
    -dataframe-api-compat>=0.1.7
     typing_extensions; python_version<"3.11"
     tzdata>=2022.7
    diff --git a/scripts/check_for_inconsistent_pandas_namespace.py b/scripts/check_for_inconsistent_pandas_namespace.py
    index 52eca6f6d93ac..ec0a4a408c800 100644
    --- a/scripts/check_for_inconsistent_pandas_namespace.py
    +++ b/scripts/check_for_inconsistent_pandas_namespace.py
    @@ -27,10 +27,7 @@
         Sequence,
     )
     import sys
    -from typing import (
    -    NamedTuple,
    -    Optional,
    -)
    +from typing import NamedTuple
     
     ERROR_MESSAGE = (
         "{path}:{lineno}:{col_offset}: "
    @@ -89,7 +86,7 @@ def replace_inconsistent_pandas_namespace(visitor: Visitor, content: str) -> str
     
     def check_for_inconsistent_pandas_namespace(
         content: str, path: str, *, replace: bool
    -) -> Optional[str]:
    +) -> str | None:
         tree = ast.parse(content)
     
         visitor = Visitor()
    @@ -121,7 +118,7 @@ def check_for_inconsistent_pandas_namespace(
         return replace_inconsistent_pandas_namespace(visitor, content)
     
     
    -def main(argv: Optional[Sequence[str]] = None) -> None:
    +def main(argv: Sequence[str] | None = None) -> None:
         parser = argparse.ArgumentParser()
         parser.add_argument("paths", nargs="*")
         parser.add_argument("--replace", action="store_true")
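
Dropping `typing.Optional` here is safe because the support floor is now Python 3.10, where PEP 604 unions are valid even in annotations evaluated at runtime; the pattern in miniature:

```python
# PEP 604 union syntax, valid at runtime on Python 3.10+.
def head(text: str) -> str | None:
    lines = text.splitlines()
    return lines[0] if lines else None

assert head("a\nb") == "a"
assert head("") is None
```
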
    diff --git a/scripts/cibw_before_build.sh b/scripts/cibw_before_build.sh
    new file mode 100644
    index 0000000000000..274848972bd7e
    --- /dev/null
    +++ b/scripts/cibw_before_build.sh
    @@ -0,0 +1,5 @@
    +#!/bin/bash
     +# Append third-party licenses to the main LICENSE file, as numpy does
     +for file in "$PACKAGE_DIR"/LICENSES/*; do
     +  cat "$file" >> "$PACKAGE_DIR/LICENSE"
     +done
    diff --git a/scripts/download_wheels.sh b/scripts/download_wheels.sh
    index 84279ac7a04d1..3dcae0cadcdcf 100755
    --- a/scripts/download_wheels.sh
    +++ b/scripts/download_wheels.sh
    @@ -1,4 +1,4 @@
    -#!/bin/sh
    +#!/bin/bash
     #
     # Download all wheels for a pandas version.
     #
    @@ -11,11 +11,12 @@
     # one by one to the dist/ directory where they would be generated.
     
     VERSION=$1
    -mkdir -p $(dirname -- $0)/../dist
    -DIST_DIR="$(realpath $(dirname -- $0)/../dist)"
    +BASE_DIR=$(dirname -- $0)
    +mkdir -p $BASE_DIR/../dist
    +DIST_DIR="$(realpath $BASE_DIR/../dist)"
     
    -if [ -z $VERSION ]; then
    -    printf "Usage:\n\t$0 \n\nWhere  is for example 1.5.3"
    +if [ -z "$VERSION" ]; then
    +    printf "Usage:\n\t%s \n\nWhere  is for example 1.5.3"  "$0"
         exit 1
     fi
     
    @@ -23,7 +24,7 @@ curl "https://anaconda.org/multibuild-wheels-staging/pandas/files?version=${VERS
         grep "href=\"/multibuild-wheels-staging/pandas/${VERSION}" | \
          sed -r 's/.*<a href="([^"]+)">.*/\1/g' | \
         awk '{print "https://anaconda.org" $0 }' | \
    -    xargs wget -P $DIST_DIR
    +    xargs wget -P "$DIST_DIR"
     
    -printf "\nWheels downloaded to $DIST_DIR\nYou can upload them to PyPI using:\n\n"
    -printf "\ttwine upload ${DIST_DIR}/pandas-${VERSION}*.{whl,tar.gz} --skip-existing"
    +printf '\nWheels downloaded to %s\nYou can upload them to PyPI using:\n\n' "$DIST_DIR"
    +printf "\ttwine upload %s/pandas-%s*.{whl,tar.gz} --skip-existing" "$DIST_DIR" "$VERSION"
    diff --git a/scripts/generate_pip_deps_from_conda.py b/scripts/generate_pip_deps_from_conda.py
    index 5fcf09cd073fe..a57876902ad36 100755
    --- a/scripts/generate_pip_deps_from_conda.py
    +++ b/scripts/generate_pip_deps_from_conda.py
    @@ -26,6 +26,8 @@
     EXCLUDE = {"python", "c-compiler", "cxx-compiler"}
     REMAP_VERSION = {"tzdata": "2022.7"}
     CONDA_TO_PIP = {
    +    "versioneer": "versioneer[toml]",
    +    "meson": "meson[ninja]",
         "pytables": "tables",
         "psycopg2": "psycopg2-binary",
         "dask-core": "dask",
    @@ -45,7 +47,7 @@ def conda_package_to_pip(package: str):
         - A package requiring a specific version, in conda is defined with a single
           equal (e.g. ``pandas=1.0``) and in pip with two (e.g. ``pandas==1.0``)
         """
    -    package = re.sub("(?<=[^<>])=", "==", package).strip()
    +    package = re.sub("(?<=[^<>~])=", "==", package).strip()
     
         for compare in ("<=", ">=", "=="):
             if compare in package:
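
The `~` added to the lookbehind keeps compatible-release specifiers such as `cython~=3.0.5` (now in `requirements-dev.txt`) from being mangled into `~==`, while conda's single `=` pins are still rewritten for pip. A simplified standalone version of the substitution, for illustration:

```python
import re

def convert(package: str) -> str:
    # Simplified core of conda_package_to_pip: a single '=' becomes '==',
    # but '<=', '>=' and '~=' pass through thanks to the lookbehind.
    return re.sub("(?<=[^<>~])=", "==", package).strip()

assert convert("pandas=1.0") == "pandas==1.0"
assert convert("cython~=3.0.5") == "cython~=3.0.5"  # old regex gave '~=='
assert convert("numpy>=1.23.5") == "numpy>=1.23.5"
```
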
    diff --git a/scripts/no_bool_in_generic.py b/scripts/no_bool_in_generic.py
    deleted file mode 100644
    index 15bc2247469c7..0000000000000
    --- a/scripts/no_bool_in_generic.py
    +++ /dev/null
    @@ -1,98 +0,0 @@
    -"""
    -Check that pandas/core/generic.py doesn't use bool as a type annotation.
    -
    -There is already the method `bool`, so the alias `bool_t` should be used instead.
    -
    -This is meant to be run as a pre-commit hook - to run it manually, you can do:
    -
    -    pre-commit run no-bool-in-core-generic --all-files
    -
    -The function `visit` is adapted from a function by the same name in pyupgrade:
    -https://github.com/asottile/pyupgrade/blob/5495a248f2165941c5d3b82ac3226ba7ad1fa59d/pyupgrade/_data.py#L70-L113
    -Licence at LICENSES/PYUPGRADE_LICENSE
    -"""
    -from __future__ import annotations
    -
    -import argparse
    -import ast
    -import collections
    -from typing import TYPE_CHECKING
    -
    -if TYPE_CHECKING:
    -    from collections.abc import Sequence
    -
    -
    -def visit(tree: ast.Module) -> dict[int, list[int]]:
    -    """Step through tree, recording when nodes are in annotations."""
    -    in_annotation = False
    -    nodes: list[tuple[bool, ast.AST]] = [(in_annotation, tree)]
    -    to_replace = collections.defaultdict(list)
    -
    -    while nodes:
    -        in_annotation, node = nodes.pop()
    -
    -        if isinstance(node, ast.Name) and in_annotation and node.id == "bool":
    -            to_replace[node.lineno].append(node.col_offset)
    -
    -        for name in reversed(node._fields):
    -            value = getattr(node, name)
    -            if name in {"annotation", "returns"}:
    -                next_in_annotation = True
    -            else:
    -                next_in_annotation = in_annotation
    -            if isinstance(value, ast.AST):
    -                nodes.append((next_in_annotation, value))
    -            elif isinstance(value, list):
    -                nodes.extend(
    -                    (next_in_annotation, value)
    -                    for value in reversed(value)
    -                    if isinstance(value, ast.AST)
    -                )
    -
    -    return to_replace
    -
    -
    -def replace_bool_with_bool_t(to_replace, content: str) -> str:
    -    new_lines = []
    -
    -    for n, line in enumerate(content.splitlines(), start=1):
    -        replaced_line = line
    -        if n in to_replace:
    -            for col_offset in reversed(to_replace[n]):
    -                replaced_line = (
    -                    replaced_line[:col_offset]
    -                    + "bool_t"
    -                    + replaced_line[col_offset + 4 :]
    -                )
    -        new_lines.append(replaced_line)
    -    return "\n".join(new_lines) + "\n"
    -
    -
    -def check_for_bool_in_generic(content: str) -> tuple[bool, str]:
    -    tree = ast.parse(content)
    -    to_replace = visit(tree)
    -
    -    if not to_replace:
    -        mutated = False
    -        return mutated, content
    -
    -    mutated = True
    -    return mutated, replace_bool_with_bool_t(to_replace, content)
    -
    -
    -def main(argv: Sequence[str] | None = None) -> None:
    -    parser = argparse.ArgumentParser()
    -    parser.add_argument("paths", nargs="*")
    -    args = parser.parse_args(argv)
    -
    -    for path in args.paths:
    -        with open(path, encoding="utf-8") as fd:
    -            content = fd.read()
    -        mutated, new_content = check_for_bool_in_generic(content)
    -        if mutated:
    -            with open(path, "w", encoding="utf-8") as fd:
    -                fd.write(new_content)
    -
    -
    -if __name__ == "__main__":
    -    main()
    diff --git a/scripts/run_stubtest.py b/scripts/run_stubtest.py
    index 6307afa1bc822..e87a7d53f4ff3 100644
    --- a/scripts/run_stubtest.py
    +++ b/scripts/run_stubtest.py
    @@ -44,6 +44,7 @@
         "pandas._libs.hashtable.HashTable.set_na",
         "pandas._libs.hashtable.HashTable.sizeof",
         "pandas._libs.hashtable.HashTable.unique",
    +    "pandas._libs.hashtable.HashTable.hash_inner_join",
         # stubtest might be too sensitive
         "pandas._libs.lib.NoDefault",
         "pandas._libs.lib._NoDefault.no_default",
    @@ -84,9 +85,11 @@
         ]
     
         # create allowlist
    -    with tempfile.NamedTemporaryFile(mode="w+t") as allow:
    -        allow.write("\n".join(_ALLOWLIST))
    -        allow.flush()
     +    with tempfile.TemporaryDirectory() as td:
     +        allow = os.path.join(td, "test")
     +        # Rebinding `allow` to the file object is deliberate: `allow.name`
     +        # still yields the path for the stubtest args assembled below.
     +        with open(allow, "w+t") as allow:
     +            allow.write("\n".join(_ALLOWLIST))
     +            allow.flush()
     
             args = pyi_modules + [
                 "--ignore-missing-stub",
    diff --git a/scripts/tests/data/deps_expected_random.yaml b/scripts/tests/data/deps_expected_random.yaml
    index 7bb95d05afb45..d4ecd9f64a68d 100644
    --- a/scripts/tests/data/deps_expected_random.yaml
    +++ b/scripts/tests/data/deps_expected_random.yaml
    @@ -12,7 +12,7 @@ dependencies:
       # test dependencies
       - pytest>=7.3.2
       - pytest-cov
    -  - pytest-xdist>=2.2.0
    +  - pytest-xdist>=3.4.0
       - psutil
       - boto3
     
    @@ -23,7 +23,6 @@ dependencies:
     
       # optional dependencies
       - beautifulsoup4>=5.9.3
    -  - blosc
       - bottleneck>=1.3.2
       - fastparquet>=0.6.3
       - fsspec>=2021.07.0
    @@ -39,7 +38,7 @@ dependencies:
       - odfpy>=1.4.1
       - psycopg2>=2.8.6
       - pyarrow<11, >=7.0.0
    -  - pymysql>=1.0.2
    +  - pymysql>=1.1.0
       - pyreadstat>=1.1.2
       - pytables>=3.6.1
       - python-calamine>=0.1.7
    diff --git a/scripts/tests/data/deps_minimum.toml b/scripts/tests/data/deps_minimum.toml
    index 3be6be17d1ee2..21c269f573b3d 100644
    --- a/scripts/tests/data/deps_minimum.toml
    +++ b/scripts/tests/data/deps_minimum.toml
    @@ -39,8 +39,6 @@ classifiers = [
         'Programming Language :: Python',
         'Programming Language :: Python :: 3',
         'Programming Language :: Python :: 3 :: Only',
    -    'Programming Language :: Python :: 3.8',
    -    'Programming Language :: Python :: 3.9',
         'Programming Language :: Python :: 3.10',
         'Programming Language :: Python :: 3.11',
         'Topic :: Scientific/Engineering'
    @@ -55,7 +53,7 @@ repository = 'https://github.com/pandas-dev/pandas'
     matplotlib = "pandas:plotting._matplotlib"
     
     [project.optional-dependencies]
    -test = ['hypothesis>=6.34.2', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0']
    +test = ['hypothesis>=6.34.2', 'pytest>=7.3.2', 'pytest-xdist>=3.4.0']
     performance = ['bottleneck>=1.3.2', 'numba>=0.53.1', 'numexpr>=2.7.1']
     timezone = ['tzdata>=2022.1']
     computation = ['scipy>=1.7.1', 'xarray>=0.21.0']
    @@ -65,22 +63,18 @@ gcp = ['gcsfs>=2021.07.0']
     excel = ['odfpy>=1.4.1', 'openpyxl>=3.0.7', 'python-calamine>=0.1.7', 'pyxlsb>=1.0.8', 'xlrd>=2.0.1', 'xlsxwriter>=1.4.3']
     parquet = ['pyarrow>=7.0.0']
     feather = ['pyarrow>=7.0.0']
    -hdf5 = [# blosc only available on conda (https://github.com/Blosc/python-blosc/issues/297)
    -        #'blosc>=1.20.1',
    -        'tables>=3.6.1']
    +hdf5 = ['tables>=3.6.1']
     spss = ['pyreadstat>=1.1.2']
     postgresql = ['SQLAlchemy>=1.4.16', 'psycopg2>=2.8.6']
    -mysql = ['SQLAlchemy>=1.4.16', 'pymysql>=1.0.2']
    +mysql = ['SQLAlchemy>=1.4.16', 'pymysql>=1.1.0']
     sql-other = ['SQLAlchemy>=1.4.16']
     html = ['beautifulsoup4>=4.9.3', 'html5lib>=1.1', 'lxml>=4.6.3']
     xml = ['lxml>=4.6.3']
     plot = ['matplotlib>=3.6.1']
     output_formatting = ['jinja2>=3.0.0', 'tabulate>=0.8.9']
    -clipboard = ['PyQt5>=5.15.1', 'qtpy>=2.2.0']
    +clipboard = ['PyQt5>=5.15.1', 'qtpy>=2.3.0']
     compression = ['zstandard>=0.15.2']
     all = ['beautifulsoup4>=5.9.3',
    -       # blosc only available on conda (https://github.com/Blosc/python-blosc/issues/297)
    -       #'blosc>=1.21.0',
            'bottleneck>=1.3.2',
            'fastparquet>=0.6.3',
            'fsspec>=2021.07.0',
    @@ -96,14 +90,14 @@ all = ['beautifulsoup4>=5.9.3',
            'openpyxl>=3.0.7',
            'psycopg2>=2.8.6',
            'pyarrow>=7.0.0',
    -       'pymysql>=1.0.2',
    +       'pymysql>=1.1.0',
            'PyQt5>=5.15.1',
            'pyreadstat>=1.1.2',
            'pytest>=7.3.2',
    -       'pytest-xdist>=2.2.0',
    +       'pytest-xdist>=3.4.0',
            'python-calamine>=0.1.7',
            'pyxlsb>=1.0.8',
    -       'qtpy>=2.2.0',
    +       'qtpy>=2.3.0',
            'scipy>=1.7.1',
            's3fs>=2021.08.0',
            'SQLAlchemy>=1.4.16',
    @@ -140,7 +134,7 @@ parentdir_prefix = "pandas-"
     [tool.cibuildwheel]
     skip = "cp36-* cp37-* pp37-* *-manylinux_i686 *_ppc64le *_s390x *-musllinux*"
     build-verbosity = "3"
    -test-requires = "hypothesis>=6.34.2 pytest>=7.3.2 pytest-xdist>=2.2.0"
    +test-requires = "hypothesis>=6.34.2 pytest>=7.3.2 pytest-xdist>=3.4.0"
     test-command = "python {project}/ci/test_wheels.py"
     
     [tool.cibuildwheel.macos]
    @@ -161,26 +155,6 @@ test-command = ""
     select = "*-win32"
     environment = { IS_32_BIT="true" }
     
    -[tool.black]
    -target-version = ['py38', 'py39']
    -exclude = '''
    -(
    -    asv_bench/env
    -  | \.egg
    -  | \.git
    -  | \.hg
    -  | \.mypy_cache
    -  | \.nox
    -  | \.tox
    -  | \.venv
    -  | _build
    -  | buck-out
    -  | build
    -  | dist
    -  | setup.py
    -)
    -'''
    -
     [tool.ruff]
     line-length = 88
     update-check = false
    @@ -251,104 +225,6 @@ exclude = [
       "env",
     ]
     
    -[tool.pylint.messages_control]
    -max-line-length = 88
    -disable = [
    - # intentionally turned off
    -  "broad-except",
    -  "c-extension-no-member",
    -  "comparison-with-itself",
    -  "import-error",
    -  "import-outside-toplevel",
    -  "invalid-name",
    -  "invalid-unary-operand-type",
    -  "line-too-long",
    -  "no-else-continue",
    -  "no-else-raise",
    -  "no-else-return",
    -  "no-member",
    -  "no-name-in-module",
    -  "not-an-iterable",
    -  "overridden-final-method",
    -  "pointless-statement",
    -  "redundant-keyword-arg",
    -  "singleton-comparison",
    -  "too-many-ancestors",
    -  "too-many-arguments",
    -  "too-many-boolean-expressions",
    -  "too-many-branches",
    -  "too-many-function-args",
    -  "too-many-instance-attributes",
    -  "too-many-locals",
    -  "too-many-nested-blocks",
    -  "too-many-public-methods",
    -  "too-many-return-statements",
    -  "too-many-statements",
    -  "unexpected-keyword-arg",
    -  "ungrouped-imports",
    -  "unsubscriptable-object",
    -  "unsupported-assignment-operation",
    -  "unsupported-membership-test",
    -  "unused-import",
    -  "use-implicit-booleaness-not-comparison",
    -  "use-implicit-booleaness-not-len",
    -  "wrong-import-order",
    -  "wrong-import-position",
    -
    - # misc
    -  "abstract-class-instantiated",
    -  "no-value-for-parameter",
    -  "undefined-variable",
    -  "unpacking-non-sequence",
    -
    - # pylint type "C": convention, for programming standard violation
    -  "missing-class-docstring",
    -  "missing-function-docstring",
    -  "missing-module-docstring",
    -  "too-many-lines",
    -  "unidiomatic-typecheck",
    -  "unnecessary-dunder-call",
    -  "unnecessary-lambda-assignment",
    -
    -  # pylint type "R": refactor, for bad code smell
    -  "consider-using-with",
    -  "cyclic-import",
    -  "duplicate-code",
    -  "inconsistent-return-statements",
    -  "redefined-argument-from-local",
    -  "too-few-public-methods",
    -
    -  # pylint type "W": warning, for python specific problems
    -  "abstract-method",
    -  "arguments-differ",
    -  "arguments-out-of-order",
    -  "arguments-renamed",
    -  "attribute-defined-outside-init",
    -  "comparison-with-callable",
    -  "dangerous-default-value",
    -  "deprecated-module",
    -  "eval-used",
    -  "expression-not-assigned",
    -  "fixme",
    -  "global-statement",
    -  "invalid-overridden-method",
    -  "keyword-arg-before-vararg",
    -  "possibly-unused-variable",
    -  "protected-access",
    -  "raise-missing-from",
    -  "redefined-builtin",
    -  "redefined-outer-name",
    -  "self-cls-assignment",
    -  "signature-differs",
    -  "super-init-not-called",
    -  "try-except-raise",
    -  "unnecessary-lambda",
    -  "unspecified-encoding",
    -  "unused-argument",
    -  "unused-variable",
    -  "using-constant-test"
    -]
    -
     [tool.pytest.ini_options]
     # sync minversion with pyproject.toml & install.rst
     minversion = "7.0"
    @@ -514,14 +390,11 @@ exclude_lines = [
       "pragma: no cover",
       # Don't complain about missing debug-only code:s
       "def __repr__",
    -  "if self.debug",
       # Don't complain if tests don't hit defensive assertion code:
       "raise AssertionError",
       "raise NotImplementedError",
       "AbstractMethodError",
       # Don't complain if non-runnable code isn't run:
    -  "if 0:",
    -  "if __name__ == .__main__.:",
       "if TYPE_CHECKING:",
     ]
     
    diff --git a/scripts/tests/data/deps_unmodified_random.yaml b/scripts/tests/data/deps_unmodified_random.yaml
    index 49299aa078ce4..4b0f4ffb51b92 100644
    --- a/scripts/tests/data/deps_unmodified_random.yaml
    +++ b/scripts/tests/data/deps_unmodified_random.yaml
    @@ -12,7 +12,7 @@ dependencies:
       # test dependencies
       - pytest>=7.3.2
       - pytest-cov
    -  - pytest-xdist>=2.2.0
    +  - pytest-xdist>=3.4.0
       - psutil
       - boto3
     
    @@ -23,7 +23,6 @@ dependencies:
     
       # optional dependencies
       - beautifulsoup4
    -  - blosc
       - bottleneck>=1.3.2
       - fastparquet>=0.6.3
       - fsspec>=2021.07.0
    @@ -39,7 +38,7 @@ dependencies:
       - odfpy>=1.4.1
       - psycopg2
       - pyarrow<11, >=7.0.0
    -  - pymysql>=1.0.2
    +  - pymysql>=1.1.0
       - pyreadstat>=1.1.2
       - pytables>=3.6.1
       - python-calamine>=0.1.7
    diff --git a/scripts/tests/test_no_bool_in_generic.py b/scripts/tests/test_no_bool_in_generic.py
    deleted file mode 100644
    index 20ac214ce8a4a..0000000000000
    --- a/scripts/tests/test_no_bool_in_generic.py
    +++ /dev/null
    @@ -1,20 +0,0 @@
    -from scripts.no_bool_in_generic import check_for_bool_in_generic
    -
    -BAD_FILE = "def foo(a: bool) -> bool:\n    return bool(0)\n"
    -GOOD_FILE = "def foo(a: bool_t) -> bool_t:\n    return bool(0)\n"
    -
    -
    -def test_bad_file_with_replace() -> None:
    -    content = BAD_FILE
    -    mutated, result = check_for_bool_in_generic(content)
    -    expected = GOOD_FILE
    -    assert result == expected
    -    assert mutated
    -
    -
    -def test_good_file_with_replace() -> None:
    -    content = GOOD_FILE
    -    mutated, result = check_for_bool_in_generic(content)
    -    expected = content
    -    assert result == expected
    -    assert not mutated
    diff --git a/scripts/tests/test_use_io_common_urlopen.py b/scripts/tests/test_use_io_common_urlopen.py
    deleted file mode 100644
    index c2c4a7fe9cb58..0000000000000
    --- a/scripts/tests/test_use_io_common_urlopen.py
    +++ /dev/null
    @@ -1,23 +0,0 @@
    -import pytest
    -
    -from scripts.use_io_common_urlopen import use_io_common_urlopen
    -
    -PATH = "t.py"
    -
    -
    -def test_inconsistent_usage(capsys) -> None:
    -    content = "from urllib.request import urlopen"
    -    result_msg = (
    -        "t.py:1:0: Don't use urllib.request.urlopen, "
    -        "use pandas.io.common.urlopen instead\n"
    -    )
    -    with pytest.raises(SystemExit, match=None):
    -        use_io_common_urlopen(content, PATH)
    -    expected_msg, _ = capsys.readouterr()
    -    assert result_msg == expected_msg
    -
    -
    -def test_consistent_usage() -> None:
    -    # should not raise
    -    content = "from pandas.io.common import urlopen"
    -    use_io_common_urlopen(content, PATH)
    diff --git a/scripts/tests/test_use_pd_array_in_core.py b/scripts/tests/test_use_pd_array_in_core.py
    deleted file mode 100644
    index f58c92722caad..0000000000000
    --- a/scripts/tests/test_use_pd_array_in_core.py
    +++ /dev/null
    @@ -1,26 +0,0 @@
    -import pytest
    -
    -from scripts.use_pd_array_in_core import use_pd_array
    -
    -BAD_FILE_0 = "import pandas as pd\npd.array"
    -BAD_FILE_1 = "\nfrom pandas import array"
    -GOOD_FILE_0 = "from pandas import array as pd_array"
    -GOOD_FILE_1 = "from pandas.core.construction import array as pd_array"
    -PATH = "t.py"
    -
    -
    -@pytest.mark.parametrize("content", [BAD_FILE_0, BAD_FILE_1])
    -def test_inconsistent_usage(content, capsys) -> None:
    -    result_msg = (
    -        "t.py:2:0: Don't use pd.array in core, import array as pd_array instead\n"
    -    )
    -    with pytest.raises(SystemExit, match=None):
    -        use_pd_array(content, PATH)
    -    expected_msg, _ = capsys.readouterr()
    -    assert result_msg == expected_msg
    -
    -
    -@pytest.mark.parametrize("content", [GOOD_FILE_0, GOOD_FILE_1])
    -def test_consistent_usage(content) -> None:
    -    # should not raise
    -    use_pd_array(content, PATH)
    diff --git a/scripts/tests/test_validate_docstrings.py b/scripts/tests/test_validate_docstrings.py
    index 02c6808658a33..3bffd1f1987aa 100644
    --- a/scripts/tests/test_validate_docstrings.py
    +++ b/scripts/tests/test_validate_docstrings.py
    @@ -36,7 +36,7 @@ def redundant_import(self, paramx=None, paramy=None) -> None:
             >>> import pandas as pd
             >>> df = pd.DataFrame(np.ones((3, 3)),
             ...                   columns=('a', 'b', 'c'))
    -        >>> df.all(1)
    +        >>> df.all(axis=1)
             0    True
             1    True
             2    True
    @@ -199,13 +199,43 @@ def test_bad_docstrings(self, capsys, klass, func, msgs) -> None:
             for msg in msgs:
                 assert msg in " ".join([err[1] for err in result["errors"]])
     
    -    def test_leftover_files_raises(self) -> None:
    -        with pytest.raises(Exception, match="The following files"):
    -            validate_docstrings.pandas_validate(
    -                self._import_path(klass="BadDocstrings", func="leftover_files")
    -            )
    +    def test_validate_all_ignore_deprecated(self, monkeypatch) -> None:
    +        monkeypatch.setattr(
    +            validate_docstrings,
    +            "pandas_validate",
    +            lambda func_name: {
    +                "docstring": "docstring1",
    +                "errors": [
    +                    ("ER01", "err desc"),
    +                    ("ER02", "err desc"),
    +                    ("ER03", "err desc"),
    +                ],
    +                "warnings": [],
    +                "examples_errors": "",
    +                "deprecated": True,
    +            },
    +        )
    +        result = validate_docstrings.validate_all(prefix=None, ignore_deprecated=True)
    +        assert len(result) == 0
     
    -    def test_validate_all_ignore_functions(self, monkeypatch) -> None:
    +    def test_validate_all_ignore_errors(self, monkeypatch):
    +        monkeypatch.setattr(
    +            validate_docstrings,
    +            "pandas_validate",
    +            lambda func_name: {
    +                "docstring": "docstring1",
    +                "errors": [
    +                    ("ER01", "err desc"),
    +                    ("ER02", "err desc"),
    +                    ("ER03", "err desc")
    +                ],
    +                "warnings": [],
    +                "examples_errors": "",
    +                "deprecated": True,
    +                "file": "file1",
    +                "file_line": "file_line1"
    +            },
    +        )
             monkeypatch.setattr(
                 validate_docstrings,
                 "get_all_api_items",
    @@ -224,31 +254,30 @@ def test_validate_all_ignore_functions(self, monkeypatch) -> None:
                     ),
                 ],
             )
    -        result = validate_docstrings.validate_all(
    +
    +        exit_status = validate_docstrings.print_validate_all_results(
    +            output_format="default",
                 prefix=None,
    -            ignore_functions=["pandas.DataFrame.align"],
    +            ignore_deprecated=False,
    +            ignore_errors={None: {"ER03"}},
             )
    -        assert len(result) == 1
    -        assert "pandas.Index.all" in result
     +        # two functions * two non-ignored errors
    +        assert exit_status == 2 * 2
     
    -    def test_validate_all_ignore_deprecated(self, monkeypatch) -> None:
    -        monkeypatch.setattr(
    -            validate_docstrings,
    -            "pandas_validate",
    -            lambda func_name: {
    -                "docstring": "docstring1",
    -                "errors": [
    -                    ("ER01", "err desc"),
    -                    ("ER02", "err desc"),
    -                    ("ER03", "err desc"),
    -                ],
    -                "warnings": [],
    -                "examples_errors": "",
    -                "deprecated": True,
    -            },
    +        exit_status = validate_docstrings.print_validate_all_results(
    +            output_format="default",
    +            prefix=None,
    +            ignore_deprecated=False,
    +            ignore_errors={
    +                None: {"ER03"},
    +                "pandas.DataFrame.align": {"ER01"},
     +                # ignoring an error that is already ignored globally should have no effect
    +                "pandas.Index.all": {"ER03"}
    +            }
             )
    -        result = validate_docstrings.validate_all(prefix=None, ignore_deprecated=True)
    -        assert len(result) == 0
     +        # two functions * two non-globally-ignored errors - one per-function ignored error
    +        assert exit_status == 2 * 2 - 1
     
     
     class TestApiItems:
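
For context, the reworked `ignore_errors` mapping keys global ignores under `None` and per-function ignores under the qualified object name; a hypothetical helper mirroring the lookup the tests above exercise:

```python
# How the ignore_errors mapping passed to print_validate_all_results is read.
ignore_errors = {
    None: {"ER03"},                      # ignored for every object
    "pandas.DataFrame.align": {"ER01"},  # ignored for this object only
}

def is_ignored(func_name: str, code: str) -> bool:
    # Hypothetical helper; the real validator performs an equivalent lookup.
    ignored = ignore_errors.get(None, set()) | ignore_errors.get(func_name, set())
    return code in ignored

assert is_ignored("pandas.DataFrame.align", "ER01")
assert is_ignored("pandas.Index.all", "ER03")
assert not is_ignored("pandas.Index.all", "ER01")
```
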
    @@ -368,12 +397,11 @@ def test_exit_status_for_main(self, monkeypatch) -> None:
             exit_status = validate_docstrings.main(
                 func_name="docstring1",
                 prefix=None,
    -            errors=[],
                 output_format="default",
                 ignore_deprecated=False,
    -            ignore_functions=None,
    +            ignore_errors={},
             )
    -        assert exit_status == 0
    +        assert exit_status == 3
     
         def test_exit_status_errors_for_validate_all(self, monkeypatch) -> None:
             monkeypatch.setattr(
    @@ -399,10 +427,9 @@ def test_exit_status_errors_for_validate_all(self, monkeypatch) -> None:
             exit_status = validate_docstrings.main(
                 func_name=None,
                 prefix=None,
    -            errors=[],
                 output_format="default",
                 ignore_deprecated=False,
    -            ignore_functions=None,
    +            ignore_errors={},
             )
             assert exit_status == 5
     
    @@ -417,16 +444,14 @@ def test_no_exit_status_noerrors_for_validate_all(self, monkeypatch) -> None:
             )
             exit_status = validate_docstrings.main(
                 func_name=None,
    -            prefix=None,
    -            errors=[],
                 output_format="default",
    +            prefix=None,
                 ignore_deprecated=False,
    -            ignore_functions=None,
    +            ignore_errors={},
             )
             assert exit_status == 0
     
         def test_exit_status_for_validate_all_json(self, monkeypatch) -> None:
    -        print("EXECUTED")
             monkeypatch.setattr(
                 validate_docstrings,
                 "validate_all",
    @@ -443,11 +468,10 @@ def test_exit_status_for_validate_all_json(self, monkeypatch) -> None:
             )
             exit_status = validate_docstrings.main(
                 func_name=None,
    -            prefix=None,
    -            errors=[],
                 output_format="json",
    +            prefix=None,
                 ignore_deprecated=False,
    -            ignore_functions=None,
    +            ignore_errors={},
             )
             assert exit_status == 0
     
    @@ -477,22 +501,29 @@ def test_errors_param_filters_errors(self, monkeypatch) -> None:
                     },
                 },
             )
    +        monkeypatch.setattr(
    +            validate_docstrings,
    +            "ERROR_MSGS",
    +            {
    +                "ER01": "err desc",
    +                "ER02": "err desc",
    +                "ER03": "err desc",
    +            },
    +        )
             exit_status = validate_docstrings.main(
                 func_name=None,
    -            prefix=None,
    -            errors=["ER01"],
                 output_format="default",
    +            prefix=None,
                 ignore_deprecated=False,
    -            ignore_functions=None,
    +            ignore_errors={None: {"ER02", "ER03"}},
             )
             assert exit_status == 3
     
             exit_status = validate_docstrings.main(
                 func_name=None,
    -            prefix=None,
    -            errors=["ER03"],
                 output_format="default",
    +            prefix=None,
                 ignore_deprecated=False,
    -            ignore_functions=None,
    +            ignore_errors={None: {"ER01", "ER02"}},
             )
             assert exit_status == 1
    diff --git a/scripts/tests/test_validate_unwanted_patterns.py b/scripts/tests/test_validate_unwanted_patterns.py
    index bef9d369a0a3c..4c433d03aff4d 100644
    --- a/scripts/tests/test_validate_unwanted_patterns.py
    +++ b/scripts/tests/test_validate_unwanted_patterns.py
    @@ -5,154 +5,6 @@
     from scripts import validate_unwanted_patterns
     
     
    -class TestBarePytestRaises:
    -    @pytest.mark.parametrize(
    -        "data",
    -        [
    -            (
    -                """
    -    with pytest.raises(ValueError, match="foo"):
    -        pass
    -    """
    -            ),
    -            (
    -                """
    -    # with pytest.raises(ValueError, match="foo"):
    -    #    pass
    -    """
    -            ),
    -            (
    -                """
    -    # with pytest.raises(ValueError):
    -    #    pass
    -    """
    -            ),
    -            (
    -                """
    -    with pytest.raises(
    -        ValueError,
    -        match="foo"
    -    ):
    -        pass
    -    """
    -            ),
    -        ],
    -    )
    -    def test_pytest_raises(self, data) -> None:
    -        fd = io.StringIO(data.strip())
    -        result = list(validate_unwanted_patterns.bare_pytest_raises(fd))
    -        assert result == []
    -
    -    @pytest.mark.parametrize(
    -        "data, expected",
    -        [
    -            (
    -                (
    -                    """
    -    with pytest.raises(ValueError):
    -        pass
    -    """
    -                ),
    -                [
    -                    (
    -                        1,
    -                        (
    -                            "Bare pytests raise have been found. "
    -                            "Please pass in the argument 'match' "
    -                            "as well the exception."
    -                        ),
    -                    ),
    -                ],
    -            ),
    -            (
    -                (
    -                    """
    -    with pytest.raises(ValueError, match="foo"):
    -        with pytest.raises(ValueError):
    -            pass
    -        pass
    -    """
    -                ),
    -                [
    -                    (
    -                        2,
    -                        (
    -                            "Bare pytests raise have been found. "
    -                            "Please pass in the argument 'match' "
    -                            "as well the exception."
    -                        ),
    -                    ),
    -                ],
    -            ),
    -            (
    -                (
    -                    """
    -    with pytest.raises(ValueError):
    -        with pytest.raises(ValueError, match="foo"):
    -            pass
    -        pass
    -    """
    -                ),
    -                [
    -                    (
    -                        1,
    -                        (
    -                            "Bare pytests raise have been found. "
    -                            "Please pass in the argument 'match' "
    -                            "as well the exception."
    -                        ),
    -                    ),
    -                ],
    -            ),
    -            (
    -                (
    -                    """
    -    with pytest.raises(
    -        ValueError
    -    ):
    -        pass
    -    """
    -                ),
    -                [
    -                    (
    -                        1,
    -                        (
    -                            "Bare pytests raise have been found. "
    -                            "Please pass in the argument 'match' "
    -                            "as well the exception."
    -                        ),
    -                    ),
    -                ],
    -            ),
    -            (
    -                (
    -                    """
    -    with pytest.raises(
    -        ValueError,
    -        # match = "foo"
    -    ):
    -        pass
    -    """
    -                ),
    -                [
    -                    (
    -                        1,
    -                        (
    -                            "Bare pytests raise have been found. "
    -                            "Please pass in the argument 'match' "
    -                            "as well the exception."
    -                        ),
    -                    ),
    -                ],
    -            ),
    -        ],
    -    )
    -    def test_pytest_raises_raises(self, data, expected) -> None:
    -        fd = io.StringIO(data.strip())
    -        result = list(validate_unwanted_patterns.bare_pytest_raises(fd))
    -        assert result == expected
    -
    -
     class TestStringsWithWrongPlacedWhitespace:
         @pytest.mark.parametrize(
             "data",
    diff --git a/scripts/use_io_common_urlopen.py b/scripts/use_io_common_urlopen.py
    deleted file mode 100644
    index ade97f53cd827..0000000000000
    --- a/scripts/use_io_common_urlopen.py
    +++ /dev/null
    @@ -1,67 +0,0 @@
    -"""
    -Check that pandas/core imports pandas.array as pd_array.
    -
    -This makes it easier to grep for usage of pandas array.
    -
    -This is meant to be run as a pre-commit hook - to run it manually, you can do:
    -
    -    pre-commit run use-io-common-urlopen --all-files
    -
    -"""
    -
    -from __future__ import annotations
    -
    -import argparse
    -import ast
    -import sys
    -from typing import TYPE_CHECKING
    -
    -if TYPE_CHECKING:
    -    from collections.abc import Sequence
    -
    -
    -ERROR_MESSAGE = (
    -    "{path}:{lineno}:{col_offset}: "
    -    "Don't use urllib.request.urlopen, use pandas.io.common.urlopen instead\n"
    -)
    -
    -
    -class Visitor(ast.NodeVisitor):
    -    def __init__(self, path: str) -> None:
    -        self.path = path
    -
    -    def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
    -        # Check that pandas.io.common.urlopen is used instead of
    -        # urllib.request.urlopen
    -        if (
    -            node.module is not None
    -            and node.module.startswith("urllib.request")
    -            and any(i.name == "urlopen" for i in node.names)
    -        ):
    -            msg = ERROR_MESSAGE.format(
    -                path=self.path, lineno=node.lineno, col_offset=node.col_offset
    -            )
    -            sys.stdout.write(msg)
    -            sys.exit(1)
    -        super().generic_visit(node)
    -
    -
    -def use_io_common_urlopen(content: str, path: str) -> None:
    -    tree = ast.parse(content)
    -    visitor = Visitor(path)
    -    visitor.visit(tree)
    -
    -
    -def main(argv: Sequence[str] | None = None) -> None:
    -    parser = argparse.ArgumentParser()
    -    parser.add_argument("paths", nargs="*")
    -    args = parser.parse_args(argv)
    -
    -    for path in args.paths:
    -        with open(path, encoding="utf-8") as fd:
    -            content = fd.read()
    -        use_io_common_urlopen(content, path)
    -
    -
    -if __name__ == "__main__":
    -    main()
    diff --git a/scripts/use_pd_array_in_core.py b/scripts/use_pd_array_in_core.py
    deleted file mode 100644
    index c9e14dece44e4..0000000000000
    --- a/scripts/use_pd_array_in_core.py
    +++ /dev/null
    @@ -1,80 +0,0 @@
    -"""
    -Check that pandas/core imports pandas.array as pd_array.
    -
    -This makes it easier to grep for usage of pandas array.
    -
    -This is meant to be run as a pre-commit hook - to run it manually, you can do:
    -
    -    pre-commit run use-pd_array-in-core --all-files
    -
    -"""
    -
    -from __future__ import annotations
    -
    -import argparse
    -import ast
    -import sys
    -from typing import TYPE_CHECKING
    -
    -if TYPE_CHECKING:
    -    from collections.abc import Sequence
    -
    -
    -ERROR_MESSAGE = (
    -    "{path}:{lineno}:{col_offset}: "
    -    "Don't use pd.array in core, import array as pd_array instead\n"
    -)
    -
    -
    -class Visitor(ast.NodeVisitor):
    -    def __init__(self, path: str) -> None:
    -        self.path = path
    -
    -    def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
    -        # If array has been imported from somewhere in pandas,
    -        # check it's aliased as pd_array.
    -        if (
    -            node.module is not None
    -            and node.module.startswith("pandas")
    -            and any(i.name == "array" and i.asname != "pd_array" for i in node.names)
    -        ):
    -            msg = ERROR_MESSAGE.format(
    -                path=self.path, lineno=node.lineno, col_offset=node.col_offset
    -            )
    -            sys.stdout.write(msg)
    -            sys.exit(1)
    -        super().generic_visit(node)
    -
    -    def visit_Attribute(self, node: ast.Attribute) -> None:
    -        if (
    -            isinstance(node.value, ast.Name)
    -            and node.value.id == "pd"
    -            and node.attr == "array"
    -        ):
    -            msg = ERROR_MESSAGE.format(
    -                path=self.path, lineno=node.lineno, col_offset=node.col_offset
    -            )
    -            sys.stdout.write(msg)
    -            sys.exit(1)
    -        super().generic_visit(node)
    -
    -
    -def use_pd_array(content: str, path: str) -> None:
    -    tree = ast.parse(content)
    -    visitor = Visitor(path)
    -    visitor.visit(tree)
    -
    -
    -def main(argv: Sequence[str] | None = None) -> None:
    -    parser = argparse.ArgumentParser()
    -    parser.add_argument("paths", nargs="*")
    -    args = parser.parse_args(argv)
    -
    -    for path in args.paths:
    -        with open(path, encoding="utf-8") as fd:
    -            content = fd.read()
    -        use_pd_array(content, path)
    -
    -
    -if __name__ == "__main__":
    -    main()
    diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py
    index 76d64d27b221c..944575dcc8659 100755
    --- a/scripts/validate_docstrings.py
    +++ b/scripts/validate_docstrings.py
    @@ -16,9 +16,9 @@
     from __future__ import annotations
     
     import argparse
    +import collections
     import doctest
     import importlib
    -import io
     import json
     import os
     import pathlib
    @@ -28,15 +28,13 @@
     
     import matplotlib
     import matplotlib.pyplot as plt
    -import numpy
     from numpydoc.docscrape import get_doc_object
     from numpydoc.validate import (
    +    ERROR_MSGS as NUMPYDOC_ERROR_MSGS,
         Validator,
         validate,
     )
     
    -import pandas
    -
     # With template backend, matplotlib plots nothing
     matplotlib.use("template")
     
    @@ -47,6 +45,7 @@
         "Styler.template_html_style",
         "Styler.template_html_table",
         "Styler.template_latex",
    +    "Styler.template_typst",
         "Styler.template_string",
         "Styler.loader",
         "errors.InvalidComparison",
    @@ -60,15 +59,18 @@
     ERROR_MSGS = {
         "GL04": "Private classes ({mentioned_private_classes}) should not be "
         "mentioned in public docstrings",
    -    "GL05": "Use 'array-like' rather than 'array_like' in docstrings.",
    +    "PD01": "Use 'array-like' rather than 'array_like' in docstrings.",
         "SA05": "{reference_name} in `See Also` section does not need `pandas` "
         "prefix, use {right_reference} instead.",
    -    "EX02": "Examples do not pass tests:\n{doctest_log}",
         "EX03": "flake8 error: line {line_number}, col {col_number}: {error_code} "
         "{error_message}",
         "EX04": "Do not import {imported_library}, as it is imported "
         "automatically for the examples (numpy as np, pandas as pd)",
     }
    +ALL_ERRORS = set(NUMPYDOC_ERROR_MSGS).union(set(ERROR_MSGS))
    +duplicated_errors = set(NUMPYDOC_ERROR_MSGS).intersection(set(ERROR_MSGS))
    +assert not duplicated_errors, (f"Errors {duplicated_errors} exist in both pandas "
    +                               "and numpydoc, should they be removed from pandas?")
     
     
     def pandas_error(code, **kwargs):
    @@ -76,7 +78,7 @@ def pandas_error(code, **kwargs):
         Copy of the numpydoc error function, since ERROR_MSGS can't be updated
         with our custom errors yet.
         """
    -    return (code, ERROR_MSGS[code].format(**kwargs))
    +    return code, ERROR_MSGS[code].format(**kwargs)
     
     
     def get_api_items(api_doc_fd):
    @@ -95,7 +97,7 @@ def get_api_items(api_doc_fd):
         Yields
         ------
         name : str
    -        The name of the object (e.g. 'pandas.Series.str.upper).
    +        The name of the object (e.g. 'pandas.Series.str.upper').
         func : function
             The object itself. In most cases this will be a function or method,
             but it can also be classes, properties, cython objects...
    @@ -167,32 +169,6 @@ def name(self):
         def mentioned_private_classes(self):
             return [klass for klass in PRIVATE_CLASSES if klass in self.raw_doc]
     
    -    @property
    -    def examples_errors(self):
    -        flags = doctest.NORMALIZE_WHITESPACE | doctest.IGNORE_EXCEPTION_DETAIL
    -        finder = doctest.DocTestFinder()
    -        runner = doctest.DocTestRunner(optionflags=flags)
    -        context = {"np": numpy, "pd": pandas}
    -        error_msgs = ""
    -        current_dir = set(os.listdir())
    -        for test in finder.find(self.raw_doc, self.name, globs=context):
    -            f = io.StringIO()
    -            runner.run(test, out=f.write)
    -            error_msgs += f.getvalue()
    -        leftovers = set(os.listdir()).difference(current_dir)
    -        if leftovers:
    -            for leftover in leftovers:
    -                path = pathlib.Path(leftover).resolve()
    -                if path.is_dir():
    -                    path.rmdir()
    -                elif path.is_file():
    -                    path.unlink(missing_ok=True)
    -            raise Exception(
    -                f"The following files were leftover from the doctest: "
    -                f"{leftovers}. Please use # doctest: +SKIP"
    -            )
    -        return error_msgs
    -
         @property
         def examples_source_code(self):
             lines = doctest.DocTestParser().get_examples(self.raw_doc)
    @@ -219,12 +195,12 @@ def validate_pep8(self):
                 file.write(content)
                 file.flush()
                 cmd = [
    -                "python",
    +                sys.executable,
                     "-m",
                     "flake8",
                     "--format=%(row)d\t%(col)d\t%(code)s\t%(text)s",
                     "--max-line-length=88",
    -                "--ignore=E203,E3,W503,W504,E402,E731",
    +                "--ignore=E203,E3,W503,W504,E402,E731,E128,E124,E704",
                     file.name,
                 ]
                 response = subprocess.run(cmd, capture_output=True, check=False, text=True)
    @@ -270,7 +246,6 @@ def pandas_validate(func_name: str):
         doc_obj = get_doc_object(func_obj, doc=func_obj.__doc__)
         doc = PandasDocstring(func_name, doc_obj)
         result = validate(doc_obj)
    -
         mentioned_errs = doc.mentioned_private_classes
         if mentioned_errs:
             result["errors"].append(
    @@ -282,7 +257,7 @@ def pandas_validate(func_name: str):
                 pandas_error(
                     "SA05",
                     reference_name=rel_name,
    -                right_reference=rel_name[len("pandas.") :],
    +                right_reference=rel_name[len("pandas."):],
                 )
                 for rel_name in doc.see_also
                 if rel_name.startswith("pandas.")
    @@ -290,12 +265,6 @@ def pandas_validate(func_name: str):
     
         result["examples_errs"] = ""
         if doc.examples:
    -        result["examples_errs"] = doc.examples_errors
    -        if result["examples_errs"]:
    -            result["errors"].append(
    -                pandas_error("EX02", doctest_log=result["examples_errs"])
    -            )
    -
             for error_code, error_message, line_number, col_number in doc.validate_pep8():
                 result["errors"].append(
                     pandas_error(
    @@ -314,13 +283,13 @@ def pandas_validate(func_name: str):
             )
     
         if doc.non_hyphenated_array_like():
    -        result["errors"].append(pandas_error("GL05"))
    +        result["errors"].append(pandas_error("PD01"))
     
         plt.close("all")
         return result
     
     
    -def validate_all(prefix, ignore_deprecated=False, ignore_functions=None):
    +def validate_all(prefix, ignore_deprecated=False):
         """
         Execute the validation of all docstrings, and return a dict with the
         results.
    @@ -332,8 +301,6 @@ def validate_all(prefix, ignore_deprecated=False, ignore_functions=None):
             validated. If None, all docstrings will be validated.
         ignore_deprecated: bool, default False
             If True, deprecated objects are ignored when validating docstrings.
    -    ignore_functions: list of str or None, default None
    -        If not None, contains a list of function to ignore
     
         Returns
         -------
    @@ -344,11 +311,7 @@ def validate_all(prefix, ignore_deprecated=False, ignore_functions=None):
         result = {}
         seen = {}
     
    -    ignore_functions = set(ignore_functions or [])
    -
         for func_name, _, section, subsection in get_all_api_items():
    -        if func_name in ignore_functions:
    -            continue
             if prefix and not func_name.startswith(prefix):
                 continue
             doc_info = pandas_validate(func_name)
    @@ -381,16 +344,17 @@ def get_all_api_items():
     
     
     def print_validate_all_results(
    -    prefix: str,
    -    errors: list[str] | None,
         output_format: str,
    +    prefix: str | None,
         ignore_deprecated: bool,
    -    ignore_functions: list[str] | None,
+    ignore_errors: dict[str, set[str]] | None,
     ):
         if output_format not in ("default", "json", "actions"):
             raise ValueError(f'Unknown output_format "{output_format}"')
    +    if ignore_errors is None:
    +        ignore_errors = {}
     
    -    result = validate_all(prefix, ignore_deprecated, ignore_functions)
    +    result = validate_all(prefix, ignore_deprecated)
     
         if output_format == "json":
             sys.stdout.write(json.dumps(result))
    @@ -398,20 +362,30 @@ def print_validate_all_results(
     
         prefix = "##[error]" if output_format == "actions" else ""
         exit_status = 0
    -    for name, res in result.items():
    -        for err_code, err_desc in res["errors"]:
    -            if errors and err_code not in errors:
    -                continue
    +    for func_name, res in result.items():
    +        error_messages = dict(res["errors"])
    +        actual_failures = set(error_messages)
    +        expected_failures = (ignore_errors.get(func_name, set())
    +                             | ignore_errors.get(None, set()))
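+        # e.g. (illustrative values): actual={"ER01", "ER02"}, expected={"ER02"}
+        # -> only ER01 is reported below; an expected code that did not actually
+        # fail is reported separately as "EXPECTED TO FAIL, BUT NOT FAILING".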
    +        for err_code in actual_failures - expected_failures:
                 sys.stdout.write(
                     f'{prefix}{res["file"]}:{res["file_line"]}:'
    -                f"{err_code}:{name}:{err_desc}\n"
+                f"{err_code}:{func_name}:{error_messages[err_code]}\n"
    +            )
    +            exit_status += 1
    +        for err_code in ignore_errors.get(func_name, set()) - actual_failures:
    +            sys.stdout.write(
    +                f'{prefix}{res["file"]}:{res["file_line"]}:'
    +                f"{err_code}:{func_name}:"
    +                "EXPECTED TO FAIL, BUT NOT FAILING\n"
                 )
                 exit_status += 1
     
         return exit_status
     
     
    -def print_validate_one_results(func_name: str) -> None:
    +def print_validate_one_results(func_name: str,
    +                               ignore_errors: dict[str, set[str]]) -> int:
         def header(title, width=80, char="#") -> str:
             full_line = char * width
             side_len = (width - len(title) - 2) // 2
    @@ -422,6 +396,9 @@ def header(title, width=80, char="#") -> str:
     
         result = pandas_validate(func_name)
     
    +    result["errors"] = [(code, message) for code, message in result["errors"]
    +                        if code not in ignore_errors.get(None, set())]
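+    # For example (illustrative): with ignore_errors={None: {"SA01"}}, an SA01
+    # failure for this docstring is dropped before the report below is printed.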
    +
         sys.stderr.write(header(f"Docstring ({func_name})"))
         sys.stderr.write(f"{result['docstring']}\n")
     
    @@ -429,10 +406,7 @@ def header(title, width=80, char="#") -> str:
         if result["errors"]:
             sys.stderr.write(f'{len(result["errors"])} Errors found for `{func_name}`:\n')
             for err_code, err_desc in result["errors"]:
    -            if err_code == "EX02":  # Failing examples are printed at the end
    -                sys.stderr.write("\tExamples do not pass tests\n")
    -                continue
    -            sys.stderr.write(f"\t{err_desc}\n")
    +            sys.stderr.write(f"\t{err_code}\t{err_desc}\n")
         else:
             sys.stderr.write(f'Docstring for "{func_name}" correct. :)\n')
     
    @@ -440,22 +414,64 @@ def header(title, width=80, char="#") -> str:
             sys.stderr.write(header("Doctests"))
             sys.stderr.write(result["examples_errs"])
     
    -
    -def main(func_name, prefix, errors, output_format, ignore_deprecated, ignore_functions):
    +    return len(result["errors"]) + len(result["examples_errs"])
    +
    +
    +def _format_ignore_errors(raw_ignore_errors):
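+    # Illustrative mapping (example values, not taken from the test suite):
+    #   ["ER01,ER02", "pandas.Series.mean ER03"]
+    #   -> {None: {"ER01", "ER02"}, "pandas.Series.mean": {"ER03"}}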
    +    ignore_errors = collections.defaultdict(set)
    +    if raw_ignore_errors:
    +        for error_codes in raw_ignore_errors:
    +            obj_name = None
    +            if " " in error_codes:
+                obj_name, error_codes = error_codes.split(" ", maxsplit=1)
    +
    +            # function errors "pandas.Series PR01,SA01"
    +            if obj_name:
    +                if obj_name in ignore_errors:
    +                    raise ValueError(
    +                        f"Object `{obj_name}` is present in more than one "
    +                        "--ignore_errors argument. Please use it once and specify "
    +                        "the errors separated by commas.")
    +                ignore_errors[obj_name] = set(error_codes.split(","))
    +
    +                unknown_errors = ignore_errors[obj_name] - ALL_ERRORS
    +                if unknown_errors:
    +                    raise ValueError(
    +                        f"Object `{obj_name}` is ignoring errors {unknown_errors} "
    +                        f"which are not known. Known errors are: {ALL_ERRORS}")
    +
    +            # global errors "PR02,ES01"
    +            else:
    +                ignore_errors[None].update(set(error_codes.split(",")))
    +
    +        unknown_errors = ignore_errors["*"] - ALL_ERRORS
    +        if unknown_errors:
    +            raise ValueError(
    +                f"Unknown errors {unknown_errors} specified using --ignore_errors "
    +                "Known errors are: {ALL_ERRORS}")
    +
    +    return ignore_errors
    +
    +
    +def main(
    +    func_name,
    +    output_format,
    +    prefix,
    +    ignore_deprecated,
    +    ignore_errors
    +):
         """
         Main entry point. Call the validation for one or for all docstrings.
         """
         if func_name is None:
             return print_validate_all_results(
    -            prefix,
    -            errors,
                 output_format,
    +            prefix,
                 ignore_deprecated,
    -            ignore_functions,
    +            ignore_errors
             )
         else:
    -        print_validate_one_results(func_name)
    -        return 0
    +        return print_validate_one_results(func_name, ignore_errors)
     
     
     if __name__ == "__main__":
    @@ -485,14 +501,6 @@ def main(func_name, prefix, errors, output_format, ignore_deprecated, ignore_fun
             "of methods starting by this pattern. It is "
             "ignored if parameter function is provided",
         )
    -    argparser.add_argument(
    -        "--errors",
    -        default=None,
    -        help="comma separated "
    -        "list of error codes to validate. By default it "
    -        "validates all errors (ignored when validating "
    -        "a single docstring)",
    -    )
         argparser.add_argument(
             "--ignore_deprecated",
             default=False,
    @@ -502,21 +510,24 @@ def main(func_name, prefix, errors, output_format, ignore_deprecated, ignore_fun
             "all docstrings",
         )
         argparser.add_argument(
    -        "--ignore_functions",
    -        nargs="*",
    -        help="function or method to not validate "
    -        "(e.g. pandas.DataFrame.head). "
    -        "Inverse of the `function` argument.",
    +        "--ignore_errors",
    +        "-i",
    +        default=None,
    +        action="append",
    +        help="comma-separated list of error codes "
    +        "(e.g. 'PR02,SA01'), with optional object path "
    +        "to ignore errors for a single object "
    +        "(e.g. pandas.DataFrame.head PR02,SA01). "
    +        "Partial validation for more than one function"
    +        "can be achieved by repeating this parameter.",
         )
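+    # Example invocations (illustrative):
+    #   python scripts/validate_docstrings.py --ignore_errors "PR02,SA01"
+    #   python scripts/validate_docstrings.py -i "pandas.DataFrame.head PR02" -i SA01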
    +    args = argparser.parse_args(sys.argv[1:])
     
    -    args = argparser.parse_args()
         sys.exit(
    -        main(
    -            args.function,
    -            args.prefix,
    -            args.errors.split(",") if args.errors else None,
    -            args.format,
    -            args.ignore_deprecated,
    -            args.ignore_functions,
    -        )
    +        main(args.function,
    +             args.format,
    +             args.prefix,
    +             args.ignore_deprecated,
    +             _format_ignore_errors(args.ignore_errors),
    +             )
         )
    diff --git a/scripts/validate_min_versions_in_sync.py b/scripts/validate_min_versions_in_sync.py
    index 7dd3e96e6ec18..7908aaef3d890 100755
    --- a/scripts/validate_min_versions_in_sync.py
    +++ b/scripts/validate_min_versions_in_sync.py
    @@ -36,7 +36,7 @@
     SETUP_PATH = pathlib.Path("pyproject.toml").resolve()
     YAML_PATH = pathlib.Path("ci/deps")
     ENV_PATH = pathlib.Path("environment.yml")
    -EXCLUDE_DEPS = {"tzdata", "blosc", "pandas-gbq", "pyqt", "pyqt5"}
    +EXCLUDE_DEPS = {"tzdata", "pyqt", "pyqt5"}
     EXCLUSION_LIST = frozenset(["python=3.8[build=*_pypy]"])
     # pandas package is not available
     # in pre-commit environment
    @@ -143,7 +143,7 @@ def clean_version_list(
         yaml_versions: list[str], toml_version: version.Version
     ) -> list[str]:
         for i in range(len(yaml_versions)):
    -        yaml_version = yaml_versions[i]
    +        yaml_version = yaml_versions[i].strip()
             operator = get_operator_from(yaml_version)
             assert operator is not None
             if "<=" in operator or ">=" in operator:
    diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py
    index 44318cd797163..73a90f4fca0f6 100755
    --- a/scripts/validate_rst_title_capitalization.py
    +++ b/scripts/validate_rst_title_capitalization.py
    @@ -92,6 +92,11 @@
         "BYearBegin",
         "BYearEnd",
         "YearOffset",
    +    "HalfYearBegin",
    +    "HalfYearEnd",
    +    "BHalfYearBegin",
    +    "BHalfYearEnd",
    +    "HalfYearOffset",
         "QuarterBegin",
         "QuarterEnd",
         "BQuarterBegin",
    diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py
    index 89b67ddd9f5b6..d804e15f6d48f 100755
    --- a/scripts/validate_unwanted_patterns.py
    +++ b/scripts/validate_unwanted_patterns.py
    @@ -12,14 +12,14 @@
     
     import argparse
     import ast
    -from collections.abc import Iterable
    +from collections.abc import (
    +    Callable,
    +    Iterable,
    +)
     import sys
     import token
     import tokenize
    -from typing import (
    -    IO,
    -    Callable,
    -)
    +from typing import IO
     
     PRIVATE_IMPORTS_TO_IGNORE: set[str] = {
         "_extension_array_shared_docs",
    @@ -27,16 +27,12 @@
         "_interval_shared_docs",
         "_merge_doc",
         "_shared_docs",
    -    "_apply_docs",
         "_new_Index",
         "_new_PeriodIndex",
    -    "_agg_template_series",
    -    "_agg_template_frame",
         "_pipe_template",
         "_apply_groupings_depr",
         "__main__",
         "_transform_template",
    -    "_use_inf_as_na",
         "_get_plot_backend",
         "_matplotlib",
         "_arrow_utils",
    @@ -50,14 +46,13 @@
         "_global_config",
         "_chained_assignment_msg",
         "_chained_assignment_method_msg",
    -    "_chained_assignment_warning_msg",
    -    "_chained_assignment_warning_method_msg",
    -    "_check_cacher",
         "_version_meson",
         # The numba extensions need this to mock the iloc object
         "_iLocIndexer",
    -    # TODO(3.0): GH#55043 - remove upon removal of ArrayManager
    +    # TODO(4.0): GH#55043 - remove upon removal of CoW option
         "_get_option",
    +    "_fill_limit_area_1d",
    +    "_make_block",
     }
     
     
    @@ -94,65 +89,6 @@ def _get_literal_string_prefix_len(token_string: str) -> int:
             return 0
     
     
    -def bare_pytest_raises(file_obj: IO[str]) -> Iterable[tuple[int, str]]:
    -    """
    -    Test Case for bare pytest raises.
    -
    -    For example, this is wrong:
    -
    -    >>> with pytest.raise(ValueError):
    -    ...     # Some code that raises ValueError
    -
    -    And this is what we want instead:
    -
    -    >>> with pytest.raise(ValueError, match="foo"):
    -    ...     # Some code that raises ValueError
    -
    -    Parameters
    -    ----------
    -    file_obj : IO
    -        File-like object containing the Python code to validate.
    -
    -    Yields
    -    ------
    -    line_number : int
    -        Line number of unconcatenated string.
    -    msg : str
    -        Explanation of the error.
    -
    -    Notes
    -    -----
    -    GH #23922
    -    """
    -    contents = file_obj.read()
    -    tree = ast.parse(contents)
    -
    -    for node in ast.walk(tree):
    -        if not isinstance(node, ast.Call):
    -            continue
    -
    -        try:
    -            if not (node.func.value.id == "pytest" and node.func.attr == "raises"):
    -                continue
    -        except AttributeError:
    -            continue
    -
    -        if not node.keywords:
    -            yield (
    -                node.lineno,
    -                "Bare pytests raise have been found. "
    -                "Please pass in the argument 'match' as well the exception.",
    -            )
    -        # Means that there are arguments that are being passed in,
    -        # now we validate that `match` is one of the passed in arguments
    -        elif not any(keyword.arg == "match" for keyword in node.keywords):
    -            yield (
    -                node.lineno,
    -                "Bare pytests raise have been found. "
    -                "Please pass in the argument 'match' as well the exception.",
    -            )
    -
    -
     PRIVATE_FUNCTIONS_ALLOWED = {"sys._getframe"}  # no known alternative
     
     
    @@ -232,7 +168,7 @@ def private_import_across_module(file_obj: IO[str]) -> Iterable[tuple[int, str]]
                     continue
     
                 if module_name.startswith("_"):
    -                yield (node.lineno, f"Import of internal function {repr(module_name)}")
    +                yield (node.lineno, f"Import of internal function {module_name!r}")
     
     
     def strings_with_wrong_placed_whitespace(
    @@ -383,10 +319,10 @@ def nodefault_used_not_only_for_typing(file_obj: IO[str]) -> Iterable[tuple[int,
         while nodes:
             in_annotation, node = nodes.pop()
             if not in_annotation and (
    -            isinstance(node, ast.Name)  # Case `NoDefault`
    -            and node.id == "NoDefault"
    -            or isinstance(node, ast.Attribute)  # Cases e.g. `lib.NoDefault`
    -            and node.attr == "NoDefault"
    +            (isinstance(node, ast.Name)  # Case `NoDefault`
    +            and node.id == "NoDefault")
    +            or (isinstance(node, ast.Attribute)  # Cases e.g. `lib.NoDefault`
    +            and node.attr == "NoDefault")
             ):
                 yield (node.lineno, "NoDefault is used not only for typing")
     
    @@ -456,7 +392,6 @@ def main(
     
     if __name__ == "__main__":
         available_validation_types: list[str] = [
    -        "bare_pytest_raises",
             "private_function_across_module",
             "private_import_across_module",
             "strings_with_wrong_placed_whitespace",
    diff --git a/setup.py b/setup.py
    index c7c76d0f7636a..db1852b43cfa9 100755
    --- a/setup.py
    +++ b/setup.py
    @@ -37,7 +37,7 @@ def is_platform_mac():
     
     
     # note: sync with pyproject.toml, environment.yml and asv.conf.json
    -min_cython_ver = "3.0.5"
    +min_cython_ver = "3.0"
     
     try:
         from Cython import (
    @@ -364,7 +364,7 @@ def run(self) -> None:
     # enable coverage by building cython files by setting the environment variable
     # "PANDAS_CYTHON_COVERAGE" (with a Truthy value) or by running build_ext
     # with `--with-cython-coverage`enabled
    -linetrace = os.environ.get("PANDAS_CYTHON_COVERAGE", False)
    +linetrace = os.environ.get("PANDAS_CYTHON_COVERAGE", False)  # noqa: PLW1508
     if "--with-cython-coverage" in sys.argv:
         linetrace = True
         sys.argv.remove("--with-cython-coverage")
    diff --git a/web/interactive_terminal/README.md b/web/interactive_terminal/README.md
    new file mode 100644
    index 0000000000000..6457cbccf2016
    --- /dev/null
    +++ b/web/interactive_terminal/README.md
    @@ -0,0 +1,38 @@
    +# The interactive `pandas` REPL
    +
    +An interactive REPL to easily try `pandas` in the browser, powered by JupyterLite.
    +
    +![image](https://user-images.githubusercontent.com/591645/175000291-e8c69f6f-5f2c-48d7-817c-cff05ab2cde9.png)
    +
    +## Build
    +
    +The interactive REPL is built with the `jupyter lite` CLI.
    +
+First, make sure `jupyterlite-core` and a kernel are installed:
    +
    +```bash
    +python -m pip install jupyterlite-core
    +python -m pip install jupyterlite-pyodide-kernel
    +```
    +
    +Then in `web/interactive_terminal`, run the following command:
    +
    +```bash
    +jupyter lite build
    +```
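+
+To preview the generated site locally (a convenience for testing; `jupyter lite serve`
+is provided by `jupyterlite-core` and picks up the same configuration files):
+
+```bash
+jupyter lite serve
+```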
    +
    +## Configuration
    +
    +This folder contains configuration files for the interactive terminal powered by JupyterLite:
    +
    +- `jupyter_lite_config.json`: build time configuration, used when building the assets with the `jupyter lite build` command
+- `jupyter-lite.json`: run time configuration, applied when launching the application in the browser
    +
+This interactive `pandas` JupyterLite deployment enables a couple of optimizations: it only includes the `repl` app in the generated static assets, and it disables source maps. This makes the assets smaller and faster to load, at the cost of
+debugging capabilities.
    +
    +To learn more about it, check out the JupyterLite documentation:
    +
    +- Optimizations: https://jupyterlite.readthedocs.io/en/latest/howto/configure/advanced/optimizations.html
    +- JupyterLite schema: https://jupyterlite.readthedocs.io/en/latest/reference/schema-v0.html
    +- CLI reference: https://jupyterlite.readthedocs.io/en/latest/reference/cli.html
    diff --git a/web/interactive_terminal/jupyter-lite.json b/web/interactive_terminal/jupyter-lite.json
    new file mode 100644
    index 0000000000000..2199acf1d368a
    --- /dev/null
    +++ b/web/interactive_terminal/jupyter-lite.json
    @@ -0,0 +1,10 @@
    +{
    +  "jupyter-lite-schema-version": 0,
    +  "jupyter-config-data": {
    +    "appName": "Pandas REPL",
    +    "appUrl": "./repl",
    +    "enableMemoryStorage": true,
    +    "settingsStorageDrivers": ["memoryStorageDriver"],
    +    "contentsStorageDrivers": ["memoryStorageDriver"]
    +  }
    +}
    diff --git a/web/interactive_terminal/jupyter_lite_config.json b/web/interactive_terminal/jupyter_lite_config.json
    new file mode 100644
    index 0000000000000..42e64f26f2356
    --- /dev/null
    +++ b/web/interactive_terminal/jupyter_lite_config.json
    @@ -0,0 +1,8 @@
    +{
    +  "LiteBuildConfig": {
    +    "apps": ["repl"],
    +    "no_unused_shared_packages": true,
    +    "output_dir": "../build/lite",
    +    "no_sourcemaps": true
    +  }
    +}
    diff --git a/web/pandas/_templates/layout.html b/web/pandas/_templates/layout.html
    index c8025aeef3791..c26b093b0c4ba 100644
    --- a/web/pandas/_templates/layout.html
    +++ b/web/pandas/_templates/layout.html
    @@ -14,7 +14,7 @@
                 
             {% endfor %}
    -        
    +        
         
         
             
@@ -64,27 +64,27 @@
diff --git a/web/pandas/about/citing.md b/web/pandas/about/citing.md
index 4ce1fdb207865..a3c470d05e55f 100644
--- a/web/pandas/about/citing.md
+++ b/web/pandas/about/citing.md
@@ -20,7 +20,7 @@ following paper:
         url    = {https://doi.org/10.5281/zenodo.3509134}
     }
 
-- [Data structures for statistical computing in python](https://conference.scipy.org/proceedings/scipy2010/pdfs/mckinney.pdf),
+- [Data structures for statistical computing in python](https://pub.curvenote.com/01908378-3686-7168-a380-d82bbf21c799/public/mckinney-57fc0d4e8a08cd7f26a4b8bf468a71f4.pdf),
   McKinney, Proceedings of the 9th Python in Science Conference, Volume 445, 2010.
 
     @InProceedings{ mckinney-proc-scipy-2010,
diff --git a/web/pandas/about/roadmap.md b/web/pandas/about/roadmap.md
index 4c12e3e786a32..aba95ec2c03fc 100644
--- a/web/pandas/about/roadmap.md
+++ b/web/pandas/about/roadmap.md
@@ -90,33 +90,6 @@ data types within pandas. This will let us take advantage of its I/O
 capabilities and provide for better interoperability with other
 languages and libraries using Arrow.
 
-### Block manager rewrite
-
-We'd like to replace pandas current internal data structures (a
-collection of 1 or 2-D arrays) with a simpler collection of 1-D arrays.
-
-Pandas internal data model is quite complex. A DataFrame is made up of
-one or more 2-dimensional "blocks", with one or more blocks per dtype.
-This collection of 2-D arrays is managed by the BlockManager.
-
-The primary benefit of the BlockManager is improved performance on
-certain operations (construction from a 2D array, binary operations,
-reductions across the columns), especially for wide DataFrames. However,
-the BlockManager substantially increases the complexity and maintenance
-burden of pandas.
-
-By replacing the BlockManager we hope to achieve
-
-- Substantially simpler code
-- Easier extensibility with new logical types
-- Better user control over memory use and layout
-- Improved micro-performance
-- Option to provide a C / Cython API to pandas' internals
-
-See [these design
-documents](https://dev.pandas.io/pandas2/internal-architecture.html#removal-of-blockmanager-new-dataframe-internals)
-for more.
-
 ### Decoupling of indexing and internals
 
 The code for getting and setting values in pandas' data structures
diff --git a/web/pandas/about/team.md b/web/pandas/about/team.md
index 49b8a26ab56e8..7a19fd7af6595 100644
--- a/web/pandas/about/team.md
+++ b/web/pandas/about/team.md
@@ -41,9 +41,7 @@ If you want to support pandas development, you can find information in the [dona
 
 ## Governance
 
-Wes McKinney is the Benevolent Dictator for Life (BDFL).
-
-The project governance is available in the [project governance page]({{ base_url }}governance.html).
+The project governance is available in the [project governance page]({{ base_url }}about/governance.html).
 
 ## Workgroups
diff --git a/web/pandas/community/benchmarks.md b/web/pandas/community/benchmarks.md
new file mode 100644
index 0000000000000..ddfcbd57d3d78
--- /dev/null
+++ b/web/pandas/community/benchmarks.md
@@ -0,0 +1,40 @@
+# Benchmarks
+
+Benchmarks are tests to measure the performance of pandas. There are two different
+kinds of benchmarks relevant to pandas:
+
+* Internal pandas benchmarks to measure speed and memory usage over time
+* Community benchmarks comparing the speed or memory usage of different tools at
+  doing the same job
+
+## pandas benchmarks
+
+pandas benchmarks are implemented in the [asv_bench](https://github.com/pandas-dev/pandas/tree/main/asv_bench)
+directory of our repository. The benchmarks are implemented for the
+[airspeed velocity](https://asv.readthedocs.io/en/latest/) (asv for short) framework.
+
+The benchmarks can be run locally by any pandas developer. This can be done
+with the `asv run` command, and it can be useful to detect if local changes have
+an impact on performance, by running the benchmarks before and after the changes.
+More information on running the performance test suite is found
+[here](https://pandas.pydata.org/docs/dev/development/contributing_codebase.html#running-the-performance-test-suite).
+
+Note that benchmarks are not deterministic: running on different hardware, or on
+the same hardware under different levels of load, has a big impact on the results.
+Even running the benchmarks on identical hardware under almost identical conditions
+can produce significant differences between runs of the same exact code.
+
+## Automated benchmark runner
+
+The [asv-runner](https://github.com/pandas-dev/asv-runner/) repository automatically runs the pandas asv benchmark suite
+for every (or almost every) commit to the `main` branch. It is run on GitHub Actions.
+See the linked repository for more details. The results are available at:
+
+https://pandas-dev.github.io/asv-runner/
+
+## Community benchmarks
+
+The main benchmarks comparing dataframe tools that include pandas are:
+
+- [DuckDB (former H2O.ai) benchmarks](https://duckdblabs.github.io/db-benchmark/)
+- [TPCH benchmarks](https://pola.rs/posts/benchmarks/)
diff --git a/web/pandas/community/blog/2019-user-survey.md b/web/pandas/community/blog/2019-user-survey.md
index 312ee49bdf387..821fdd01acf65 100644
--- a/web/pandas/community/blog/2019-user-survey.md
+++ b/web/pandas/community/blog/2019-user-survey.md
@@ -77,7 +77,7 @@ For environment isolation, [conda](https://conda.io/en/latest/) was the most pop
 
 ![png]({{ base_url }}/static/img/blog/2019-user-survey/2019_13_0.png)
 
-Most repondents are Python 3 only.
+Most respondents are Python 3 only.
 
diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md
index 58c5da67bcd74..1ebd4f3d3f1dc 100644
--- a/web/pandas/community/ecosystem.md
+++ b/web/pandas/community/ecosystem.md
@@ -8,7 +8,7 @@ developers to build powerful and more focused data tools.
 The creation of libraries that complement pandas' functionality also allows pandas
 development to remain focused around its original requirements.
 
-This is an community-maintained list of projects that build on pandas in order
+This is a community-maintained list of projects that build on pandas in order
 to provide tools in the PyData space. The pandas core development team does not
 necessarily endorse any particular project on this list or have any knowledge of the
 maintenance status of any particular library. For a more complete list of projects
 that depend on pandas, see the [libraries.io usage page for
@@ -21,10 +21,6 @@ please let us know.
 
 ## Statistics and machine learning
 
-### [pandas-tfrecords](https://pypi.org/project/pandas-tfrecords/)
-
-Easy saving pandas dataframe to tensorflow tfrecords format and reading tfrecords to pandas.
-
 ### [Statsmodels](https://www.statsmodels.org/)
 
 Statsmodels is the prominent Python "statistics and econometrics
@@ -34,10 +30,11 @@ modeling functionality that is out of pandas' scope. Statsmodels
 leverages pandas objects as the underlying data container for
 computation.
 
-### [sklearn-pandas](https://github.com/scikit-learn-contrib/sklearn-pandas)
+### [skrub](https://skrub-data.org)
 
-Use pandas DataFrames in your [scikit-learn](https://scikit-learn.org/)
-ML pipeline.
+Skrub facilitates machine learning on dataframes. It bridges pandas
+to scikit-learn and related libraries. In particular, it facilitates building
+features from dataframes.
 
 ### [Featuretools](https://github.com/alteryx/featuretools/)
 
@@ -91,6 +88,20 @@ pd.set_option("plotting.backend", "pandas_bokeh")
 
 It is very similar to the matplotlib plotting backend, but provides
 interactive web-based charts and maps.
 
+### [pygwalker](https://github.com/Kanaries/pygwalker)
+
+PyGWalker is an interactive data visualization and
+exploratory data analysis tool built upon Graphic Walker
+with support for visualization, cleaning, and annotation workflows.
+
+pygwalker can save interactively created charts
+to Graphic-Walker and Vega-Lite JSON.
+
+```
+import pygwalker as pyg
+pyg.walk(df)
+```
+
 ### [seaborn](https://seaborn.pydata.org)
 
 Seaborn is a Python visualization library based on
@@ -103,12 +114,17 @@ pandas with the option to perform statistical estimation
 while plotting, aggregating across observations and visualizing the
 fit of statistical models to emphasize patterns in a dataset.
 
+```
+import seaborn as sns
+sns.set_theme()
+```
+
 ### [plotnine](https://github.com/has2k1/plotnine/)
 
 Hadley Wickham's [ggplot2](https://ggplot2.tidyverse.org/) is a
 foundational exploratory visualization package for the R language.
 Based on ["The Grammar of
-Graphics"](https://www.cs.uic.edu/~wilkinson/TheGrammarOfGraphics/GOG.html)
+Graphics"](https://doi.org/10.1007/0-387-28695-0)
 it provides a powerful, declarative and extremely general way to
 generate bespoke plots of any kind of data. Various implementations to
 other languages are available.
@@ -130,11 +146,8 @@ Seaborn](https://plot.ly/python/matplotlib-to-plotly-tutorial/) can
 convert figures into interactive web-based plots. Plots can be drawn in
 [IPython Notebooks](https://plot.ly/ipython-notebooks/), edited with R
 or MATLAB, modified in a GUI, or embedded in apps and dashboards. Plotly
-is free for unlimited sharing, and has
-[cloud](https://plot.ly/product/plans/),
-[offline](https://plot.ly/python/offline/), or
-[on-premise](https://plot.ly/product/enterprise/) accounts for private
-use.
+is free for unlimited sharing, and has cloud, offline, or on-premise
+accounts for private use.
 
 ### [Lux](https://github.com/lux-org/lux)
 
@@ -148,14 +161,7 @@ df = pd.read_csv("data.csv")
 df  # discover interesting insights!
 ```
 
-By printing out a dataframe, Lux automatically [recommends a set of visualizations](https://raw.githubusercontent.com/lux-org/lux-resources/master/readme_img/demohighlight.gif) that highlights interesting trends and patterns in the dataframe. Users can leverage any existing pandas commands without modifying their code, while being able to visualize their pandas data structures (e.g., DataFrame, Series, Index) at the same time. Lux also offers a [powerful, intuitive language](https://lux-api.readthedocs.io/en/latest/source/guide/vis.html>) that allow users to create Altair, matplotlib, or Vega-Lite visualizations without having to think at the level of code.
-
-### [QtPandas](https://github.com/draperjames/qtpandas)
-
-Spun off from the main pandas library, the
-[qtpandas](https://github.com/draperjames/qtpandas) library enables
-DataFrame visualization and manipulation in PyQt4 and PySide
-applications.
+By printing out a dataframe, Lux automatically [recommends a set of visualizations](https://raw.githubusercontent.com/lux-org/lux-resources/master/readme_img/demohighlight.gif) that highlights interesting trends and patterns in the dataframe. Users can leverage any existing pandas commands without modifying their code, while being able to visualize their pandas data structures (e.g., DataFrame, Series, Index) at the same time. Lux also offers a [powerful, intuitive language](https://lux-api.readthedocs.io/en/latest/source/guide/vis.html) that allows users to create Altair, matplotlib, or Vega-Lite visualizations without having to think at the level of code.
 
 ### [D-Tale](https://github.com/man-group/dtale)
 
@@ -202,7 +208,7 @@ standard output formats (HTML, HTML presentation slides, LaTeX, PDF,
 ReStructuredText, Markdown, Python) through 'Download As' in the web
 interface and `jupyter convert` in a shell.
 
-Pandas DataFrames implement `_repr_html_`and `_repr_latex` methods which
+Pandas DataFrames implement `_repr_html_` and `_repr_latex` methods which
 are utilized by Jupyter Notebook for displaying (abbreviated) HTML or
 LaTeX tables. LaTeX output is properly escaped. (Note: HTML tables may
 or may not be compatible with non-HTML Jupyter output formats.)
@@ -210,12 +216,6 @@ or may not be compatible with non-HTML Jupyter output formats.)
 See [Options and Settings](https://pandas.pydata.org/docs/user_guide/options.html)
 for pandas `display.` settings.
 
-### [modin-project/modin-spreadsheet](https://github.com/modin-project/modin-spreadsheet)
-
-modin-spreadsheet is an interactive grid for sorting and filtering DataFrames in IPython Notebook.
-It is a fork of qgrid and is actively maintained by the modin project.
-modin-spreadsheet provides similar functionality to qgrid and allows for easy data exploration and manipulation in a tabular format.
-
 ### [Spyder](https://www.spyder-ide.org/)
 
 Spyder is a cross-platform PyQt-based IDE combining the editing,
@@ -242,6 +242,17 @@ Console](https://docs.spyder-ide.org/current/panes/ipythonconsole.html), and Spy
 render Numpydoc documentation on pandas objects in rich text with Sphinx
 both automatically and on-demand.
 
+### [marimo](https://marimo.io)
+
+marimo is a reactive notebook for Python and SQL that enhances productivity when working with dataframes. It provides several features to make data manipulation and visualization more interactive and fun:
+
+1. Rich, interactive displays: marimo can display pandas dataframes in interactive tables or charts with filtering and sorting capabilities.
+2. Data selection: Users can select data in tables or pandas-backed plots, and the selections are automatically sent to Python as pandas dataframes.
+3. No-code transformations: Users can interactively transform pandas dataframes using a GUI, without writing code. The generated code can be copied and pasted into the notebook.
+4. Custom filters: marimo allows the creation of pandas-backed filters using UI elements like sliders and dropdowns.
+5. Dataset explorer: marimo automatically discovers and displays all dataframes in the notebook, allowing users to explore and visualize data interactively.
+6. SQL integration: marimo allows users to write SQL queries against any pandas dataframes existing in memory.
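+
+A minimal sketch of the no-code transformation workflow, assuming marimo's `mo.ui.dataframe` element (see the marimo docs for the exact API):
+
+```python
+import marimo as mo
+import pandas as pd
+
+df = pd.DataFrame({"a": [1, 2, 3]})
+mo.ui.dataframe(df)  # interactive, no-code transformations on df
+```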
+ ## API ### [pandas-datareader](https://github.com/pydata/pandas-datareader) @@ -271,18 +282,6 @@ The following data feeds are available: - Stooq Index Data - MOEX Data -### [quandl/Python](https://github.com/quandl/Python) - -Quandl API for Python wraps the Quandl REST API to return Pandas -DataFrames with timeseries indexes. - -### [pydatastream](https://github.com/vfilimonov/pydatastream) - -PyDatastream is a Python interface to the [Thomson Dataworks Enterprise -(DWE/Datastream)](http://dataworks.thomson.com/Dataworks/Enterprise/1.0/) -SOAP API to return indexed Pandas DataFrames with financial data. This -package requires valid credentials for this API (non free). - ### [pandaSDMX](https://pandasdmx.readthedocs.io) pandaSDMX is a library to retrieve and acquire statistical data and @@ -305,13 +304,6 @@ point-in-time data from ALFRED. fredapi makes use of pandas and returns data in a Series or DataFrame. This module requires a FRED API key that you can obtain for free on the FRED website. -### [dataframe_sql](https://github.com/zbrookle/dataframe_sql) - -``dataframe_sql`` is a Python package that translates SQL syntax directly into -operations on pandas DataFrames. This is useful when migrating from a database to -using pandas or for users more comfortable with SQL looking for a way to interface -with pandas. - ## Domain specific ### [Geopandas](https://github.com/geopandas/geopandas) @@ -353,7 +345,7 @@ It supports the following data types: - pandas data types - data types defined in the [NTV format](https://loco-philippe.github.io/ES/JSON%20semantic%20format%20(JSON-NTV).htm) -- data types defined in [Table Schema specification](http://dataprotocols.org/json-table-schema/#field-types-and-formats) +- data types defined in [Table Schema specification](https://datapackage.org/standard/table-schema/) The interface is always reversible (conversion round trip) with two formats (JSON-NTV and JSON-TableSchema). @@ -382,13 +374,154 @@ Deltalake python package lets you access tables stored in JVM. It provides the ``delta_table.to_pyarrow_table().to_pandas()`` method to convert any Delta table into Pandas dataframe. +### [pandas-gbq](https://github.com/googleapis/python-bigquery-pandas) + +pandas-gbq provides high performance reads and writes to and from +[Google BigQuery](https://cloud.google.com/bigquery/). Previously (before version 2.2.0), +these methods were exposed as `pandas.read_gbq` and `DataFrame.to_gbq`. +Use `pandas_gbq.read_gbq` and `pandas_gbq.to_gbq`, instead. + + +### [ArcticDB](https://github.com/man-group/ArcticDB) + +ArcticDB is a serverless DataFrame database engine designed for the Python Data Science ecosystem. ArcticDB enables you to store, retrieve, and process pandas DataFrames at scale. It is a storage engine designed for object storage and also supports local-disk storage using LMDB. ArcticDB requires zero additional infrastructure beyond a running Python environment and access to object storage and can be installed in seconds. Please find full documentation [here](https://docs.arcticdb.io/latest/). + +#### ArcticDB Terminology + +ArcticDB is structured to provide a scalable and efficient way to manage and retrieve DataFrames, organized into several key components: + +- `Object Store` Collections of libraries. Used to separate logical environments from each other. Analogous to a database server. +- `Library` Contains multiple symbols which are grouped in a certain way (different users, markets, etc). Analogous to a database. +- `Symbol` Atomic unit of data storage. 
+  Identified by a string name. Data stored under a symbol strongly resembles a pandas DataFrame. Analogous to tables.
+- `Version` Every modifying action (write, append, update) performed on a symbol creates a new version of that object.
+
+#### Installation
+
+To install, simply run:
+
+```console
+pip install arcticdb
+```
+
+To get started, we can import ArcticDB and instantiate it:
+
+```python
+import arcticdb as adb
+import numpy as np
+import pandas as pd
+# this will set up the storage using the local file system
+arctic = adb.Arctic("lmdb://arcticdb_test")
+```
+
+> **Note:** ArcticDB supports any S3 API compatible storage, including AWS. ArcticDB also supports Azure Blob storage.
+> ArcticDB also supports LMDB for local/file based storage - to use LMDB, pass an LMDB path as the URI: `adb.Arctic('lmdb://path/to/desired/database')`.
+
+#### Library Setup
+
+ArcticDB is geared towards storing many (potentially millions) of tables. Individual tables (DataFrames) are called symbols and are stored in collections called libraries. A single library can store many symbols. Libraries must first be initialized prior to use:
+
+```python
+lib = arctic.get_library('sample', create_if_missing=True)
+```
+
+#### Writing Data to ArcticDB
+
+Now that we have a library set up, we can get to reading and writing data. ArcticDB has a set of simple functions for DataFrame storage. Let's write a DataFrame to storage.
+
+```python
+df = pd.DataFrame(
+    {
+        "a": list("abc"),
+        "b": list(range(1, 4)),
+        "c": np.arange(3, 6).astype("u1"),
+        "d": np.arange(4.0, 7.0, dtype="float64"),
+        "e": [True, False, True],
+        "f": pd.date_range("20130101", periods=3)
+    }
+)
+
+df
+df.dtypes
+```
+
+Write to ArcticDB.
+
+```python
+write_record = lib.write("test", df)
+```
+
+> **Note:** When writing pandas DataFrames, ArcticDB supports the following index types:
+>
+> - `pandas.Index` containing int64 (or the corresponding dedicated types Int64Index, UInt64Index)
+> - `RangeIndex`
+> - `DatetimeIndex`
+> - `MultiIndex` composed of above supported types
+>
+> The "row" concept in `head`/`tail` refers to the row number ('iloc'), not the value in the `pandas.Index` ('loc').
+
+#### Reading Data from ArcticDB
+
+Read the data back from storage:
+
+```python
+read_record = lib.read("test")
+read_record.data
+df.dtypes
+```
+
+ArcticDB also supports appending, updating, and querying data from storage to a pandas DataFrame. Please find more information [here](https://docs.arcticdb.io/latest/api/processing/#arcticdb.QueryBuilder).
+
+### [Hugging Face](https://huggingface.co/datasets)
+
+The Hugging Face Dataset Hub provides a large collection of ready-to-use datasets for machine learning shared by the community. The platform offers a user-friendly interface to explore, discover and visualize datasets, and provides tools to easily load and work with these datasets in Python thanks to the [huggingface_hub](https://github.com/huggingface/huggingface_hub) library.
+
+You can access datasets on Hugging Face using `hf://` paths in pandas, in the form `hf://datasets/username/dataset_name/...`.
+
+For example, here is how to load the [stanfordnlp/imdb dataset](https://huggingface.co/datasets/stanfordnlp/imdb):
+
+```python
+import pandas as pd
+
+# Load the IMDB dataset
+df = pd.read_parquet("hf://datasets/stanfordnlp/imdb/plain_text/train-00000-of-00001.parquet")
+```
+
+Tip: on a dataset page, click on "Use this dataset" to get the code to load it in pandas.
+ +To save a dataset on Hugging Face you need to [create a public or private dataset](https://huggingface.co/new-dataset) and [login](https://huggingface.co/docs/huggingface_hub/quick-start#login-command), and then you can use `df.to_csv/to_json/to_parquet`: + +```python +# Save the dataset to my Hugging Face account +df.to_parquet("hf://datasets/username/dataset_name/train.parquet") +``` + +You can find more information about the Hugging Face Dataset Hub in the [documentation](https://huggingface.co/docs/hub/en/datasets). + ## Out-of-core -### [Blaze](https://blaze.pydata.org/) +### [Bodo](https://github.com/bodo-ai/Bodo) + + +Bodo is a high-performance compute engine for Python data processing. +Using an auto-parallelizing just-in-time (JIT) compiler, Bodo simplifies scaling Pandas +workloads from laptops to clusters without major code changes. +Under the hood, Bodo relies on MPI-based high-performance computing (HPC) technology—making it +both easier to use and often much faster than alternatives. +Bodo also provides a SQL engine that can query distributed pandas dataframes efficiently. + +```python +import pandas as pd +import bodo + +@bodo.jit +def process_data(): + df = pd.read_parquet("my_data.pq") + df2 = pd.DataFrame({"A": df.apply(lambda r: 0 if r.A == 0 else (r.B // r.A), axis=1)}) + df2.to_parquet("out.pq") + +process_data() +``` -Blaze provides a standard API for doing computations with various -in-memory and on-disk backends: NumPy, Pandas, SQLAlchemy, MongoDB, -PyTables, PySpark. ### [Cylon](https://cylondata.org/) @@ -457,18 +590,10 @@ import modin.pandas as pd df = pd.read_csv("big.csv") # use all your cores! ``` -### [Odo](http://odo.pydata.org) - -Odo provides a uniform API for moving data between different formats. It -uses pandas own `read_csv` for CSV IO and leverages many existing -packages such as PyTables, h5py, and pymongo to move data between non -pandas formats. Its graph based approach is also extensible by end users -for custom formats that may be too specific for the core of odo. - ### [Pandarallel](https://github.com/nalepae/pandarallel) Pandarallel provides a simple way to parallelize your pandas operations on all your CPUs by changing only one line of code. -If also displays progress bars. +It also displays progress bars. ```python from pandarallel import pandarallel @@ -479,23 +604,6 @@ pandarallel.initialize(progress_bar=True) df.parallel_apply(func) ``` -### [Ray](https://docs.ray.io/en/latest/data/modin/index.html) - -Pandas on Ray is an early stage DataFrame library that wraps Pandas and -transparently distributes the data and computation. The user does not -need to know how many cores their system has, nor do they need to -specify how to distribute the data. In fact, users can continue using -their previous Pandas notebooks while experiencing a considerable -speedup from Pandas on Ray, even on a single machine. Only a -modification of the import statement is needed, as we demonstrate below. -Once you've changed your import statement, you're ready to use Pandas on -Ray just like you would Pandas. - -``` -# import pandas as pd -import ray.dataframe as pd -``` - ### [Vaex](https://vaex.io/docs/) Increasingly, packages are being built on top of pandas to address @@ -540,11 +648,6 @@ to make data processing pipelines more readable and robust. Dataframes contain information that pandera explicitly validates at runtime. This is useful in production-critical data pipelines or reproducible research settings. 
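Since the pandera entry above has no snippet, here is a minimal editorial sketch of the runtime validation it describes, using pandera's documented `DataFrameSchema` API (the column names and checks are invented for illustration):

```python
import pandas as pd
import pandera as pa

# State the assumptions the pipeline relies on, and validate them at runtime
schema = pa.DataFrameSchema(
    {
        "price": pa.Column(float, pa.Check.ge(0)),  # prices must be non-negative
        "ticker": pa.Column(str),
    }
)

df = pd.DataFrame({"price": [10.5, 11.2], "ticker": ["AAA", "BBB"]})
validated = schema.validate(df)  # raises a SchemaError if an assumption is violated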
-### [Engarde](https://engarde.readthedocs.io/en/latest/)
-
-Engarde is a lightweight library used to explicitly state your
-assumptions about your datasets and check that they're *actually* true.
-
 ## Extension data types

 Pandas provides an interface for defining
@@ -552,18 +655,19 @@ Pandas provides an interface for defining
 The following libraries implement that interface to provide types not
 found in NumPy or pandas, which work well with pandas' data containers.

-### [awkward-pandas](https://awkward-pandas.readthedocs.io/)
+### [awkward-pandas](https://github.com/scikit-hep/awkward)

 Awkward-pandas provides an extension type for storing [Awkward
 Arrays](https://awkward-array.org/) inside pandas' Series and
 DataFrame. It also provides an accessor for using awkward functions
 on Series that are of awkward type.

-### [cyberpandas](https://cyberpandas.readthedocs.io/en/latest)
+### [db-dtypes](https://github.com/googleapis/python-db-dtypes-pandas)

-Cyberpandas provides an extension type for storing arrays of IP
-Addresses. These arrays can be stored inside pandas' Series and
-DataFrame.
+db-dtypes provides extension types for working with types like
+DATE, TIME, and JSON from database systems. This package is used
+by pandas-gbq to provide natural dtypes for BigQuery data types without
+a natural numpy type.

 ### [Pandas-Genomics](https://pandas-genomics.readthedocs.io/en/latest/)
@@ -587,7 +691,7 @@ units aware.

 ### [Text Extensions](https://ibm.biz/text-extensions-for-pandas)

-Text Extensions for Pandas provides extension types to cover common data structures for representing natural language data, plus library integrations that convert the outputs of popular natural language processing libraries into Pandas DataFrames.
+Text Extensions for Pandas provides extension types to cover common data structures for representing natural language data, plus library integrations that convert the outputs of popular natural language processing libraries into pandas DataFrames.

 ## Accessors

@@ -599,15 +703,11 @@ authors to coordinate on the namespace.

 | Library                                                              | Accessor   | Classes               |
 | -------------------------------------------------------------------- | ---------- | --------------------- |
 | [awkward-pandas](https://awkward-pandas.readthedocs.io/en/latest/)   | `ak`       | `Series`              |
-| [cyberpandas](https://cyberpandas.readthedocs.io/en/latest)          | `ip`       | `Series`              |
 | [pdvega](https://altair-viz.github.io/pdvega/)                       | `vgplot`   | `Series`, `DataFrame` |
 | [pandas-genomics](https://pandas-genomics.readthedocs.io/en/latest/) | `genomics` | `Series`, `DataFrame` |
-| [pandas_path](https://github.com/drivendataorg/pandas-path/)         | `path`     | `Index`, `Series`     |
 | [pint-pandas](https://github.com/hgrecco/pint-pandas)                | `pint`     | `Series`, `DataFrame` |
 | [physipandas](https://github.com/mocquin/physipandas)                | `physipy`  | `Series`, `DataFrame` |
 | [composeml](https://github.com/alteryx/compose)                      | `slice`    | `DataFrame`           |
-| [datatest](https://datatest.readthedocs.io/en/stable/)               | `validate` | `Series`, `DataFrame` |
-| [composeml](https://github.com/alteryx/compose)                      | `slice`    | `DataFrame`           |
 | [gurobipy-pandas](https://github.com/Gurobi/gurobipy-pandas)         | `gppd`     | `Series`, `DataFrame` |
 | [staircase](https://www.staircase.dev/)                              | `sc`       | `Series`, `DataFrame` |
 | [woodwork](https://github.com/alteryx/woodwork)                      | `slice`    | `Series`, `DataFrame` |
@@ -629,7 +729,7 @@ See installation and usage instructions on the [GitHub page](https://github.com/

 Hamilton is a declarative dataflow framework that came out of Stitch Fix.
 It was designed to help one manage a Pandas code base, specifically with respect to feature engineering for machine learning models.

-It prescibes an opinionated paradigm, that ensures all code is:
+It prescribes an opinionated paradigm that ensures all code is:

 * unit testable
 * integration testing friendly

diff --git a/web/pandas/config.yml b/web/pandas/config.yml
index 27e5ea25c1bad..cb5447591dab6 100644
--- a/web/pandas/config.yml
+++ b/web/pandas/config.yml
@@ -54,6 +54,8 @@ navbar:
       target: community/coc.html
     - name: "Ecosystem"
       target: community/ecosystem.html
+    - name: "Benchmarks"
+      target: community/benchmarks.html
     - name: "Contribute"
       target: contribute.html
 blog:
@@ -70,11 +72,9 @@ blog:
   - https://phofl.github.io/feeds/pandas.atom.xml
 maintainers:
   active:
-  - wesm
   - jorisvandenbossche
   - TomAugspurger
   - jreback
-  - gfyoung
   - WillAyd
   - mroeschke
   - jbrockmendel
@@ -84,14 +84,11 @@ maintainers:
   - alimcmaster1
   - bashtage
   - Dr-Irv
-  - MarcoGorelli
   - rhshadrach
   - phofl
   - attack68
   - fangchenli
-  - twoertwein
   - lithomas1
-  - mzeitlin11
   - lukemanley
   - noatamir
   inactive:
@@ -106,6 +103,11 @@ maintainers:
   - jschendel
   - charlesdong1991
   - dsaxton
+  - wesm
+  - gfyoung
+  - mzeitlin11
+  - twoertwein
+  - MarcoGorelli
 workgroups:
   coc:
     name: Code of Conduct
@@ -119,74 +121,41 @@ workgroups:
   finance:
     name: Finance
     contact: finance@pandas.pydata.org
-    responsibilities: "Approve the project expenses."
+    responsibilities: "Manage the funding. Coordinate grant requests. Approve the project expenses."
     members:
-    - Wes McKinney
+    - Matthew Roeschke
     - Jeff Reback
     - Joris Van den Bossche
-    - Tom Augspurger
-    - Matthew Roeschke
+    - Patrick Hoefler
   infrastructure:
     name: Infrastructure
     contact: infrastructure@pandas.pydata.org
     responsibilities: "Keep the pandas infrastructure up and working. In particular the servers for the website, benchmarks, CI and others needed."
     members:
-    - Marc Garcia
-    - Matthew Roeschke
+    - William Ayd
     - Thomas Li
   communications:
     name: Communications
     contact: communications@pandas.pydata.org
     responsibilities: "Share relevant information with the broader community, mainly via our social networks, as well as being the main point of contact between NumFOCUS and the core team."
     members:
-    - Marco Gorelli
-    - Marc Garcia
+    - datapythonista
 sponsors:
   active:
     - name: "NumFOCUS"
       url: https://numfocus.org/
       logo: static/img/partners/numfocus.svg
       kind: numfocus
-    - name: "Two Sigma"
-      url: https://www.twosigma.com/
-      logo: static/img/partners/two_sigma.svg
-      kind: partner
-      description: "Jeff Reback"
-    - name: "Voltron Data"
-      url: https://voltrondata.com/
-      logo: static/img/partners/voltron_data.svg
-      kind: partner
-      description: "Joris Van den Bossche"
-    - name: "Coiled"
-      url: https://www.coiled.io
-      logo: static/img/partners/coiled.svg
-      kind: partner
-      description: "Patrick Hoefler"
-    - name: "Quansight"
-      url: https://quansight.com/
-      logo: static/img/partners/quansight_labs.svg
-      kind: partner
-      description: "Marco Gorelli"
     - name: "Nvidia"
       url: https://www.nvidia.com
       logo: static/img/partners/nvidia.svg
       kind: partner
       description: "Matthew Roeschke"
-    - name: "Intel"
-      url: https://www.intel.com/
-      logo: /static/img/partners/intel.svg
-      kind: partner
-      description: "Brock Mendel"
     - name: "Tidelift"
       url: https://tidelift.com
       logo: static/img/partners/tidelift.svg
      kind: regular
      description: "pandas is part of the Tidelift subscription. You can support pandas by becoming a Tidelift subscriber."
- - name: "Chan Zuckerberg Initiative" - url: https://chanzuckerberg.com/ - logo: static/img/partners/czi.svg - kind: regular - description: "pandas is funded by the Essential Open Source Software for Science program of the Chan Zuckerberg Initiative. The funding is used for general maintenance, improve extension types, and a efficient string type." - name: "Bodo" url: https://www.bodo.ai/ logo: static/img/partners/bodo.svg @@ -218,5 +187,20 @@ sponsors: - name: "d-fine GmbH" url: https://www.d-fine.com/en/ kind: partner + - name: "Two Sigma" + url: https://www.twosigma.com/ + kind: partner + - name: "Voltron Data" + url: https://voltrondata.com/ + kind: partner + - name: "Intel" + url: https://www.intel.com/ + kind: partner + - name: "Chan Zuckerberg Initiative" + url: https://chanzuckerberg.com/ + kind: regular + - name: "Coiled" + url: https://www.coiled.io + kind: partner roadmap: pdeps_path: pdeps diff --git a/web/pandas/contribute.md b/web/pandas/contribute.md index 258ba149f1849..3307ddcfb1a0a 100644 --- a/web/pandas/contribute.md +++ b/web/pandas/contribute.md @@ -48,7 +48,7 @@ and about current sponsors in the [sponsors page]({{ base_url }}about/sponsors.h infrastructure, travel expenses for our volunteer contributors to attend the in-person sprints, or to give small grants to develop features.

-    [donate button markup stripped in extraction; link text: "Make your donation in the donate page"]
+    [donate button markup stripped in extraction; link text: "Make your donation in the donate page"; only the markup/target changed, and that change is not recoverable]

    diff --git a/web/pandas/donate.md b/web/pandas/donate.md deleted file mode 100644 index 69db7e4648e77..0000000000000 --- a/web/pandas/donate.md +++ /dev/null @@ -1,14 +0,0 @@ -# Donate to pandas - -
-
-
-_pandas_ is a Sponsored Project of [NumFOCUS](https://numfocus.org/), a 501(c)(3) nonprofit charity in the United States.
-NumFOCUS provides _pandas_ with fiscal, legal, and administrative support to help ensure the
-health and sustainability of the project. Visit numfocus.org for more information.
-
-Donations to _pandas_ are managed by NumFOCUS. For donors in the United States, your gift is tax-deductible
-to the extent provided by law. As with any donation, you should consult with your tax adviser about your particular tax situation.
diff --git a/web/pandas/getting_started.md b/web/pandas/getting_started.md
index cb14e52edad2c..c556eda57ac31 100644
--- a/web/pandas/getting_started.md
+++ b/web/pandas/getting_started.md
@@ -2,33 +2,8 @@

 ## Installation instructions

-The next steps provides the easiest and recommended way to set up your
-environment to use pandas. Other installation options can be found in
-the [advanced installation page]({{ base_url}}docs/getting_started/install.html).
-
-1. Download [Anaconda](https://www.anaconda.com/distribution/) for your operating system and
-   the latest Python version, run the installer, and follow the steps. Please note:
-
-    - It is not needed (and discouraged) to install Anaconda as root or administrator.
-    - When asked if you wish to initialize Anaconda3, answer yes.
-    - Restart the terminal after completing the installation.
-
-    Detailed instructions on how to install Anaconda can be found in the
-    [Anaconda documentation](https://docs.anaconda.com/anaconda/install/).
-
-2. In the Anaconda prompt (or terminal in Linux or macOS), start JupyterLab:
-
-   [screenshot markup stripped in extraction]
-
-3. In JupyterLab, create a new (Python 3) notebook:
-
-   [screenshot markup stripped in extraction]
-
-4. In the first cell of the notebook, you can import pandas and check the version with:
-
-   [screenshot markup stripped in extraction]
-
-5. Now you are ready to use pandas, and you can write your code in the next cells.
+To install pandas, please reference the [installation page]({{ base_url}}docs/getting_started/install.html)
+from the pandas documentation.

 ## Tutorials

@@ -42,16 +17,28 @@ The book we recommend to learn pandas is [Python for Data Analysis](https://amzn
 by [Wes McKinney](https://wesmckinney.com/), creator of pandas.

-[book image/link markup stripped in extraction; text: "Python for Data Analysis"]
+[book image/link markup stripped in extraction; text: "Python for Data Analysis"]

 ## Videos

-[video embed markup stripped in extraction]
+[video embed markup stripped in extraction]

 ## Cheat sheet

 [pandas cheat sheet](https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf)
+
+## Try pandas in your browser (experimental)
+
+You can try pandas in your browser with the following interactive shell
+without needing to install anything on your system.
+

+    [button markup stripped in extraction; link text: "Try it in your browser"]

    diff --git a/web/pandas/index.html b/web/pandas/index.html index 3d2e5363a2a2d..c520a16b8160f 100644 --- a/web/pandas/index.html +++ b/web/pandas/index.html @@ -18,9 +18,9 @@

[index.html hunks: the HTML markup was stripped in extraction and is not recoverable. Recoverable content: the header block around the "pandas" logo; the sponsor strip ("With the support of:", looping with {% for row in sponsors.active | batch(6, "") %} ... {{ company.name }} over active sponsors); the "Follow us" block; the sidebar section renamed from "Get the book" to "Recommended books", which now lists "Python for Data Analysis", "Pandas Cookbook, Third Edition", and "Effective pandas 2"; and the "Previous versions" list ({% for release in releases[5:] %}), whose date format changes from {{ release.published.strftime("%Y-%m-%d") }} to {{ release.published.strftime("%b %d, %Y") }}, each entry linking to changelog | docs | code.]
diff --git a/web/pandas/pdeps/0001-purpose-and-guidelines.md b/web/pandas/pdeps/0001-purpose-and-guidelines.md
index 24c91fbab0808..7f5f0326eba6c 100644
--- a/web/pandas/pdeps/0001-purpose-and-guidelines.md
+++ b/web/pandas/pdeps/0001-purpose-and-guidelines.md
@@ -6,7 +6,7 @@ [#51417](https://github.com/pandas-dev/pandas/pull/51417)
 - Author: [Marc Garcia](https://github.com/datapythonista), [Noa Tamir](https://github.com/noatamir)
-- Revision: 2
+- Revision: 3

 ## PDEP definition, purpose and scope

@@ -56,31 +56,102 @@ advisor on the PDEP when it is submitted to the PDEP repository.

 ### Workflow

+#### Rationale
+
+Our workflow was created to support and enable a consensus-seeking process, and to provide clarity,
+for current and future authors, as well as voting members. It is not a strict policy, and we
+discourage any interpretation which seeks to take advantage of it in a way that could "force" or
+"sneak" decisions in one way or another. We expect and encourage transparency, active discussion,
+feedback, and compromise from all our community members.
+
+#### PDEP States
+
 The possible states of a PDEP are:

+- Draft
 - Under discussion
 - Accepted
 - Implemented
 - Rejected
+- Withdrawn

 The workflow that PDEPs can follow is described next.

 #### Submitting a PDEP

-Proposing a PDEP is done by creating a PR adding a new file to `web/pdeps/`.
-The file is a markdown file, you can use `web/pdeps/0001.md` as a reference
+Proposing a PDEP is done by creating a PR adding a new file to `web/pandas/pdeps/`.
+The file is a markdown file; you can use `web/pandas/pdeps/0001-purpose-and-guidelines.md` as a reference
 for the expected format.

-The initial status of a PDEP will be `Status: Under discussion`. This will be changed to
-`Status: Accepted` when the PDEP is ready and has the approval of the core team.
+The initial status of a PDEP will be `Status: Draft`. This will be changed to
+`Status: Under discussion` by the author(s), when they are ready to proceed with the decision
+making process.

-#### Accepted PDEP
+#### PDEP Discussion Timeline
+
+A PDEP discussion will remain open for up to 60 days. This period aims to enable participation
+from volunteers, who might not always be available to respond quickly, as well as provide ample
+time to make changes based on suggestions and considerations offered by the participants.
+Similarly, the following voting period will remain open for 15 days.
+
+To enable and encourage discussions on PDEPs, we follow a notification schedule. At each of the
+following steps, the pandas team and the pandas-dev mailing list are notified via GitHub and
+E-mail:
+
+- Once a PDEP is ready for discussion.
+- After 30 days, with a note that there is at most 30 days remaining for discussion,
+  and that a vote will be called for if no discussion occurs in the next 15 days.
+- After 45 days, with a note that there is at most 15 days remaining for discussion,
+  and that a vote will be called for in 15 days.
+- Once the voting period starts, after 60 days or in case of an earlier vote, with 15 days
+  remaining for voting.
+- After 10 voting days, with 5 days remaining for voting.
+
+After 30 discussion days, in case 15 days passed without any new unaddressed comments,
+the authors may close the discussion period preemptively, by sending an early reminder
+of 15 days remaining until the voting period starts.
+
+#### Casting Votes
+
+As the voting period starts, a VOTE issue is created which links to the PDEP discussion pull request.
+Each voting member, including author(s) with voting rights, may cast a vote by adding one of the following comments:
+
+- +1: approve.
+- 0: abstain.
+  - Reason: A one sentence reason is required.
+- -1: disapprove.
+  - Reason: A one sentence reason is required.
+
+A disapprove vote requires prior participation in the PDEP discussion issue.

-A PDEP can only be accepted by the core development team, if the proposal is considered
-worth implementing. Decisions will be made based on the process detailed in the
-[pandas governance document](https://github.com/pandas-dev/pandas-governance/blob/master/governance.md).
-In general, more than one approval will be needed before the PR is merged. And
-there should not be any `Request changes` review at the time of merging.
+Comments made on the public VOTE issue by non-voting members will be deleted.
+
+Once the voting period ends, any voter may tally the votes in a comment, using the format: w-x-y-z,
+where w stands for the total of approving, x of abstaining, and y of disapproving votes cast, and z
+for the number of voting members who did not respond to the VOTE issue. The tally of the votes will state
+if a quorum has been reached or not.
+
+#### Quorum and Majority
+
+For a PDEP vote to result in accepting the proposal, a quorum is required. All votes (including
+abstentions) are counted towards the quorum. The quorum is computed as the lower of these two
+values:
+
+- 11 voting members.
+- 50% of voting members.
+
+Given a quorum, a majority of 70% of the non-abstaining votes is required as well, i.e. 70% of
+the approving and disapproving votes must be in favor.
+
+Thus, abstaining votes count towards a quorum, but not towards a majority. A voting member might
+choose to abstain when they have participated in the discussion, have some objections to the
+proposal, but do not wish to stop the proposal from moving forward, nor indicate their full
+support.
+
+If a quorum was not reached by the end of the voting period, the PDEP is not accepted. Its status
+will change to rejected.
+
+#### Accepted PDEP

 Once a PDEP is accepted, any contributions can be made toward the implementation of the PDEP,
 with an open-ended completion timeline. Development of pandas is difficult to understand and
@@ -109,6 +180,17 @@ discussion.

 A PDEP can be rejected for different reasons, for example good ideas
 that are not backward-compatible, and the breaking changes are not
 considered worth implementing.

+The PDEP author(s) can also decide to withdraw the PDEP before a final decision
+is made (`Status: Withdrawn`), when the PDEP authors themselves have decided
+that the PDEP is actually a bad idea, or have accepted that it is not broadly
+supported or that a competing proposal is a better alternative.
+
+The author(s) may choose to resubmit a rejected or withdrawn PDEP. We expect
+authors to use their judgement in that case, as to whether they believe more
+discussion or an amended proposal has the potential to lead to a different
+result. A new PDEP is then created, which includes a link to the previously
+rejected PDEP.
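To make the tally format and the quorum/majority arithmetic from the "Casting Votes" and "Quorum and Majority" sections above concrete, here is a worked example with invented numbers:

```python
# Hypothetical vote on a team of 20 voting members, tallied as "9-2-2-7"
voting_members = 20
approve, abstain, disapprove, no_response = 9, 2, 2, 7

quorum = min(11, 0.5 * voting_members)       # lower of 11 and 50% -> 10.0
votes_cast = approve + abstain + disapprove  # 13 >= 10, so the quorum is reached
majority = approve / (approve + disapprove)  # 9 / 11 ~= 0.82 >= 0.70 -> accepted
```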
+ #### Invalid PDEP For submitted PDEPs that do not contain proper documentation, are out of scope, or @@ -184,6 +266,7 @@ hope can help clarify our meaning here: - 3 August 2022: Initial version ([GH-47938][47938]) - 15 February 2023: Version 2 ([GH-51417][51417]) clarifies the scope of PDEPs and adds examples +- 09 June 2023: Version 3 ([GH-53576][53576]) defines a structured decision making process for PDEPs [7217]: https://github.com/pandas-dev/pandas/pull/7217 [8074]: https://github.com/pandas-dev/pandas/issues/8074 @@ -202,3 +285,4 @@ hope can help clarify our meaning here: [51417]: https://github.com/pandas-dev/pandas/pull/51417 [28900]: https://github.com/pandas-dev/pandas/issues/28900 [35407]: https://github.com/pandas-dev/pandas/issues/35407 +[53576]: https://github.com/pandas-dev/pandas/pull/53576 diff --git a/web/pandas/pdeps/0006-ban-upcasting.md b/web/pandas/pdeps/0006-ban-upcasting.md index a86455b70c71a..ae5872186bf23 100644 --- a/web/pandas/pdeps/0006-ban-upcasting.md +++ b/web/pandas/pdeps/0006-ban-upcasting.md @@ -1,7 +1,7 @@ # PDEP-6: Ban upcasting in setitem-like operations - Created: 23 December 2022 -- Status: Accepted +- Status: Implemented - Discussion: [#39584](https://github.com/pandas-dev/pandas/pull/50402) - Author: [Marco Gorelli](https://github.com/MarcoGorelli) ([original issue](https://github.com/pandas-dev/pandas/issues/39584) by [Joris Van den Bossche](https://github.com/jorisvandenbossche)) - Revision: 1 @@ -244,3 +244,4 @@ Deprecate sometime in the 2.x releases (after 2.0.0 has already been released), ### PDEP History - 23 December 2022: Initial draft +- 4 July 2024: Change status to "implemented" diff --git a/web/pandas/pdeps/0007-copy-on-write.md b/web/pandas/pdeps/0007-copy-on-write.md index e45fbaf555bc1..f5adb6a571120 100644 --- a/web/pandas/pdeps/0007-copy-on-write.md +++ b/web/pandas/pdeps/0007-copy-on-write.md @@ -1,7 +1,7 @@ # PDEP-7: Consistent copy/view semantics in pandas with Copy-on-Write - Created: July 2021 -- Status: Accepted +- Status: Implemented - Discussion: [#36195](https://github.com/pandas-dev/pandas/issues/36195) - Author: [Joris Van den Bossche](https://github.com/jorisvandenbossche) - Revision: 1 diff --git a/web/pandas/pdeps/0010-required-pyarrow-dependency.md b/web/pandas/pdeps/0010-required-pyarrow-dependency.md index 4d6e928ce68bd..0c3bf3c776988 100644 --- a/web/pandas/pdeps/0010-required-pyarrow-dependency.md +++ b/web/pandas/pdeps/0010-required-pyarrow-dependency.md @@ -40,7 +40,7 @@ PyArrow is an optional dependency of pandas that provides a wide range of supple data types within the `ExtensionArray` interface - Since pandas 2.0.0, all I/O readers have the option to return PyArrow-backed data types, and many methods now utilize PyArrow compute functions to -accelerate PyArrow-backed data in pandas, notibly string and datetime types. +accelerate PyArrow-backed data in pandas, notably string and datetime types. 
As of pandas 2.0, one can feasibly utilize PyArrow as an alternative data representation to NumPy with advantages such as: @@ -117,7 +117,7 @@ In[4]: %timeit ser_string.str.startswith("a") ### Immediate User Benefit 2: Nested Datatypes -Currently, if you try storing `dict`s in a pandas `Series`, you will again get the horrendeous `object` dtype: +Currently, if you try storing `dict`s in a pandas `Series`, you will again get the horrendous `object` dtype: ```python In [6]: pd.Series([{'a': 1, 'b': 2}, {'a': 2, 'b': 99}]) Out[6]: @@ -185,7 +185,6 @@ Additionally, if a user is installing pandas in an environment where wheels are the user will need to also build Arrow C++ and related dependencies when installing from source. These environments include - Alpine linux (commonly used as a base for Docker containers) -- WASM (pyodide and pyscript) - Python development versions Lastly, pandas development and releases will need to be mindful of PyArrow's development and release cadance. For example when diff --git a/web/pandas/pdeps/0012-compact-and-reversible-JSON-interface.md b/web/pandas/pdeps/0012-compact-and-reversible-JSON-interface.md index 3ae34f4bb6ebb..71f669825f979 100644 --- a/web/pandas/pdeps/0012-compact-and-reversible-JSON-interface.md +++ b/web/pandas/pdeps/0012-compact-and-reversible-JSON-interface.md @@ -291,7 +291,7 @@ Note: The JSON format for the TableSchema interface is the existing. -The JSON format for the Global interface is defined in [JSON-TAB](https://github.com/loco-philippe/NTV/blob/main/documentation/JSON-TAB-standard.pdf) specification. +The JSON format for the Global interface is defined in [JSON-TAB](https://github.com/loco-philippe/NTV/blob/v1.1.0/documentation/JSON-TAB-standard.pdf) specification. It includes the naming rules originally defined in the [JSON-ND project](https://github.com/glenkleidon/JSON-ND) and support for categorical data. The specification have to be updated to include sparse data. @@ -448,7 +448,7 @@ To conclude, ## Core team decision -Vote was open from september-11 to setpember-26: +Vote was open from september-11 to september-26: - Final tally is 0 approvals, 5 abstentions, 7 disapprove. The quorum has been met. The PDEP fails. diff --git a/web/pandas/pdeps/0014-string-dtype.md b/web/pandas/pdeps/0014-string-dtype.md new file mode 100644 index 0000000000000..5b74f71216454 --- /dev/null +++ b/web/pandas/pdeps/0014-string-dtype.md @@ -0,0 +1,375 @@ +# PDEP-14: Dedicated string data type for pandas 3.0 + +- Created: May 3, 2024 +- Status: Accepted +- Discussion: https://github.com/pandas-dev/pandas/pull/58551 +- Author: [Joris Van den Bossche](https://github.com/jorisvandenbossche) +- Revision: 1 + +## Abstract + +This PDEP proposes to introduce a dedicated string dtype that will be used by +default in pandas 3.0: + +* In pandas 3.0, enable a string dtype (`"str"`) by default, using PyArrow if available + or otherwise a string dtype using numpy object-dtype under the hood as fallback. +* The default string dtype will use missing value semantics (using NaN) consistent + with the other default data types. + +This will give users a long-awaited proper string dtype for 3.0, while 1) not +(yet) making PyArrow a _hard_ dependency, but only a dependency used by default, +and 2) leaving room for future improvements (different missing value semantics, +using NumPy 2.0 strings, etc). + +## Background + +Currently, pandas by default stores text data in an `object`-dtype NumPy array. +The current implementation has two primary drawbacks. 
First, `object` dtype is
+not specific to strings: any Python object can be stored in an `object`-dtype
+array, not just strings, and seeing `object` as the dtype for a column with
+strings is confusing for users. Second, this is not efficient (all string
+methods on a Series are eventually calling Python methods on the individual
+string objects).
+
+To solve the first issue, a dedicated extension dtype for string data has
+already been
+[added in pandas 1.0](https://pandas.pydata.org/docs/whatsnew/v1.0.0.html#dedicated-string-data-type).
+This has always been opt-in for now, requiring users to explicitly request the
+dtype (with `dtype="string"` or `dtype=pd.StringDtype()`). The array backing
+this string dtype was initially almost the same as the default implementation,
+i.e. an `object`-dtype NumPy array of Python strings.
+
+To solve the second issue (performance), pandas contributed to the development
+of string kernels in the PyArrow package, and a variant of the string dtype
+backed by PyArrow was
+[added in pandas 1.3](https://pandas.pydata.org/docs/whatsnew/v1.3.0.html#pyarrow-backed-string-data-type).
+This could be specified with the `storage` keyword in the opt-in string dtype
+(`pd.StringDtype(storage="pyarrow")`).
+
+Since its introduction, the `StringDtype` has always been opt-in, and has used
+the experimental `pd.NA` sentinel for missing values (which was also [introduced
+in pandas 1.0](https://pandas.pydata.org/docs/whatsnew/v1.0.0.html#experimental-na-scalar-to-denote-missing-values)).
+However, up to this date, pandas has not yet taken the step to use `pd.NA`
+for any default dtype, and thus the `StringDtype` deviates in missing value
+behaviour compared to the default data types.
+
+In 2023, [PDEP-10](https://pandas.pydata.org/pdeps/0010-required-pyarrow-dependency.html)
+proposed to start using a PyArrow-backed string dtype by default in pandas 3.0
+(i.e. infer this type for string data instead of object dtype). To ensure we
+could use the variant of `StringDtype` backed by PyArrow instead of Python
+objects (for better performance), it proposed to make `pyarrow` a new required
+runtime dependency of pandas.
+
+In the meantime, NumPy has also been working on a native variable-width string
+data type, which was made available [starting with NumPy
+2.0](https://numpy.org/devdocs/release/2.0.0-notes.html#stringdtype-has-been-added-to-numpy).
+This can provide a potential alternative to PyArrow for implementing a string
+data type in pandas that is not backed by Python objects.
+
+After acceptance of PDEP-10, two aspects of the proposal have been under
+reconsideration:
+
+- Based on feedback from users and maintainers from other packages (mostly
+  around installation complexity and size), it has been considered to relax the
+  new `pyarrow` requirement to not be a _hard_ runtime dependency. In addition,
+  NumPy 2.0 could in the future potentially reduce the need to make PyArrow a
+  required dependency specifically for a dedicated pandas string dtype.
+- PDEP-10 did not consider the usage of the experimental `pd.NA` as a
+  consequence of adopting one of the existing implementations of the
+  `StringDtype`.
+
+For the second aspect, another variant of the `StringDtype` was
+[introduced in pandas 2.1](https://pandas.pydata.org/docs/whatsnew/v2.1.0.html#whatsnew-210-enhancements-infer-strings)
+that is still backed by PyArrow but follows the default missing values semantics
+pandas uses for all other default data types (and using `NaN` as the missing
+value sentinel) ([GH-54792](https://github.com/pandas-dev/pandas/issues/54792)).
+At the time, the `storage` option for this new variant was called
+`"pyarrow_numpy"` to disambiguate from the existing `"pyarrow"` option using
+`pd.NA` (but this PDEP proposes a better naming scheme, see the "Naming"
+subsection below).
+
+This last dtype variant is what users currently (pandas 2.2) get for string data
+when enabling the ``future.infer_string`` option (to enable the behaviour which
+is intended to become the default in pandas 3.0).
+
+## Proposal
+
+To be able to move forward with a string data type in pandas 3.0, this PDEP proposes:
+
+1. For pandas 3.0, a `"str"` string dtype is enabled by default, i.e. this
+   string dtype will be used as the default dtype for text data when creating
+   pandas objects (e.g. inference in constructors, I/O functions).
+2. This default string dtype will follow the same behaviour for missing values
+   as other default data types, and use `NaN` as the missing value sentinel.
+3. The string dtype will use PyArrow if installed, and otherwise falls back to
+   an in-house functionally-equivalent (but slower) version. This fallback can
+   reuse (with minor code additions) the existing numpy object-dtype backed
+   StringArray for its implementation.
+4. Installation guidelines are updated to clearly encourage users to install
+   pyarrow for the default user experience.
+
+Those string dtypes enabled by default will then no longer be considered
+experimental.
+
+### Default inference of a string dtype
+
+By default, pandas will infer this new string dtype instead of object dtype for
+string data (when creating pandas objects, such as in constructors or IO
+functions).
+
+In pandas 2.2, the existing `future.infer_string` option can be used to opt in to the future
+default behaviour:
+
+```python
+>>> pd.options.future.infer_string = True
+>>> pd.Series(["a", "b", None])
+0      a
+1      b
+2    NaN
+dtype: string
+```
+
+Right now (pandas 2.2), the existing option only enables the PyArrow-based
+future dtype. For the remaining 2.x releases, this option will be expanded to
+also work when PyArrow is not installed to enable the object-dtype fallback in
+that case.
+
+### Missing value semantics
+
+As mentioned in the background section, the original `StringDtype` has always
+used the experimental `pd.NA` sentinel for missing values. In addition to using
+`pd.NA` as the scalar for a missing value, this essentially means that:
+
+- String columns follow ["NA-semantics"](https://pandas.pydata.org/docs/user_guide/missing_data.html#na-semantics)
+  for missing values, where `NA` propagates in boolean operations such as
+  comparisons or predicates.
+- Operations on the string column that give a numeric or boolean result use the
+  nullable Integer/Float/Boolean data types (e.g. `ser.str.len()` returns the
+  nullable `"Int64"` / `pd.Int64Dtype()` dtype instead of the numpy `int64`
+  dtype (or `float64` in case of missing values)).
+
+However, up to this date, all other default data types still use `NaN` semantics
+for missing values.
Therefore, this proposal says that a new default string +dtype should also still use the same default missing value semantics and return +default data types when doing operations on the string column, to be consistent +with the other default dtypes at this point. + +In practice, this means that the default string dtype will use `NaN` as +the missing value sentinel, and: + +- String columns will follow NaN-semantics for missing values, where `NaN` gives + False in boolean operations such as comparisons or predicates. +- Operations on the string column that give a numeric or boolean result will use + the default data types (i.e. numpy `int64`/`float64`/`bool`). + +Because the original `StringDtype` implementations already use `pd.NA` and +return masked integer and boolean arrays in operations, a new variant of the +existing dtypes that uses `NaN` and default data types was needed. The original +variant of `StringDtype` using `pd.NA` will continue to be available for those +who were already using it. + +### Object-dtype "fallback" implementation + +To avoid a hard dependency on PyArrow for pandas 3.0, this PDEP proposes to keep +a "fallback" option in case PyArrow is not installed. The original `StringDtype` +backed by a numpy object-dtype array of Python strings can be mostly reused for +this (adding a new variant of the dtype) and a new `StringArray` subclass only +needs minor changes to follow the above-mentioned missing value semantics +([GH-58451](https://github.com/pandas-dev/pandas/pull/58451)). + +For pandas 3.0, this is the most realistic option given this implementation has +already been available for a long time. Beyond 3.0, further improvements such as +using NumPy 2.0 ([GH-58503](https://github.com/pandas-dev/pandas/issues/58503)) +or nanoarrow ([GH-58552](https://github.com/pandas-dev/pandas/issues/58552)) can +still be explored, but at that point that is an implementation detail that +should not have a direct impact on users (except for performance). + +For the original variant of `StringDtype` using `pd.NA`, currently the default +storage is `"python"` (the object-dtype based implementation). Also for this +variant, it is proposed to follow the same logic for determining the default +storage, i.e. default to `"pyarrow"` if available, and otherwise +fall back to `"python"`. + +### Naming + +Given the long history of this topic, the naming of the dtypes is a difficult +topic. + +In the first place, it should be acknowledged that most users should not need to +use storage-specific options. Users are expected to specify a generic name (such +as `"str"` or `"string"`), and that will give them their default string dtype +(which depends on whether PyArrow is installed or not). + +For the generic string alias to specify the dtype, `"string"` is already used +for the `StringDtype` using `pd.NA`. This PDEP proposes to use `"str"` for the +new default `StringDtype` using `NaN`. This ensures backwards compatibility for +code using `dtype="string"`, and was also chosen because `dtype="str"` or +`dtype=str` currently already works to ensure your data is converted to +strings (only using object dtype for the result). + +But for testing purposes and advanced use cases that want control over the exact +variant of the `StringDtype`, we need some way to specify this and distinguish +them from the other string dtypes. 
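As a concrete editorial sketch of the spellings this section discusses (this mirrors the proposal and the table below; none of it is released behaviour at the time of the PDEP):

```python
import numpy as np
import pandas as pd

data = ["a", "b", None]

# Proposed default string dtype, NaN semantics; "str" is its only string alias
s_default = pd.Series(data, dtype="str")
s_explicit = pd.Series(data, dtype=pd.StringDtype(na_value=np.nan))

# The pre-existing opt-in dtype keeps the "string" alias and pd.NA semantics
s_opt_in = pd.Series(data, dtype="string")
```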
+
+Currently (pandas 2.2), `StringDtype(storage="pyarrow_numpy")` is used for the new variant using `NaN`,
+where the `"pyarrow_numpy"` storage was used to disambiguate from the existing
+`"pyarrow"` option using `pd.NA`. However, `"pyarrow_numpy"` is a rather confusing
+option and doesn't generalize well. Therefore, this PDEP proposes a new naming
+scheme as outlined below, and `"pyarrow_numpy"` will be deprecated as an alias
+in pandas 2.3 and removed in pandas 3.0.
+
+The `storage` keyword of `StringDtype` is kept to disambiguate the underlying
+storage of the string data (using pyarrow or python objects), but an additional
+`na_value` is introduced to disambiguate the variants using NA semantics
+and NaN semantics.
+
+Overview of the different ways to specify a dtype and the resulting concrete
+dtype of the data:
+
+| User specification                        | Concrete dtype                                              | String alias                          | Note |
+|-------------------------------------------|-------------------------------------------------------------|---------------------------------------|------|
+| Unspecified (inference)                   | `StringDtype(storage="pyarrow"\|"python", na_value=np.nan)` | "str"                                 | (1)  |
+| `"str"` or `StringDtype(na_value=np.nan)` | `StringDtype(storage="pyarrow"\|"python", na_value=np.nan)` | "str"                                 | (1)  |
+| `StringDtype("pyarrow", na_value=np.nan)` | `StringDtype(storage="pyarrow", na_value=np.nan)`           | "str"                                 |      |
+| `StringDtype("python", na_value=np.nan)`  | `StringDtype(storage="python", na_value=np.nan)`            | "str"                                 |      |
+| `StringDtype("pyarrow")`                  | `StringDtype(storage="pyarrow", na_value=pd.NA)`            | "string[pyarrow]"                     |      |
+| `StringDtype("python")`                   | `StringDtype(storage="python", na_value=pd.NA)`             | "string[python]"                      |      |
+| `"string"` or `StringDtype()`             | `StringDtype(storage="pyarrow"\|"python", na_value=pd.NA)`  | "string[pyarrow]" or "string[python]" | (1)  |
+| `StringDtype("pyarrow_numpy")`            | `StringDtype(storage="pyarrow", na_value=np.nan)`           | "string[pyarrow_numpy]"               | (2)  |
+
+Notes:
+
+- (1) You get "pyarrow" or "python" depending on pyarrow being installed.
+- (2) "pyarrow_numpy" is kept temporarily because this is already in a released
+  version, but it will be deprecated in 2.x and removed for 3.0.
+
+For the new default string dtype, only the `"str"` alias can be used to
+specify the dtype as a string, i.e. pandas would not provide a way to make the
+underlying storage (pyarrow or python) explicit through the string alias. This
+string alias is only a convenience shortcut and for most users `"str"` is
+sufficient (they don't need to specify the storage), and the explicit
+`pd.StringDtype(storage=..., na_value=np.nan)` is still available for more
+fine-grained control.
+
+Also for the existing variant using `pd.NA`, specifying the storage through the
+string alias could be deprecated, but that is left for a separate decision.
+
+## Alternatives
+
+### Why not delay introducing a default string dtype?
+
+To avoid introducing a new string dtype while other discussions and changes are
+in flux (eventually making pyarrow a required dependency? adopting `pd.NA` as
+the default missing value sentinel? using the new NumPy 2.0 capabilities?
+overhauling all our dtypes to use a logical data type system?), introducing a
+default string dtype could also be delayed until there is more clarity in those
+other discussions. Specifically, it would avoid temporarily switching to use
+`NaN` for the string dtype, while in a future version we might switch back
+to `pd.NA` by default.
+
+However:
+
+1. Delaying has a cost: it further postpones introducing a dedicated string
+   dtype that has significant benefits for users, both in usability and (for the
+   part of the user base that has PyArrow installed) in performance.
+2. In case pandas eventually transitions to use `pd.NA` as the default missing value
+   sentinel, a migration path for _all_ pandas data types will be needed, and thus
+   the challenges around this will not be unique to the string dtype and
+   therefore not a reason to delay this.
+
+Making this change now for 3.0 will benefit the majority of users, and the PDEP
+author believes this is worth the cost of the added complexity around "yet
+another dtype" (also for other data types we already have multiple variants).
+
+### Why not use the existing StringDtype with `pd.NA`?
+
+Wouldn't adding even more variants of the string dtype make things only more
+confusing? Indeed, this proposal unfortunately introduces more variants of the
+string dtype. However, the reason for this is to ensure the actual default user
+experience is _less_ confusing, and the new string dtype fits better with the
+other default data types.
+
+If the new default string data type would use `pd.NA`, then after some
+operations, a user can easily end up with a DataFrame that mixes columns using
+`NaN` semantics and columns using `NA` semantics (and thus a DataFrame that
+could have columns with two different int64, two different float64, two different
+bool, etc dtypes). This would lead to a very confusing default experience.
+
+With the proposed new variant of the StringDtype, this will ensure that for the
+_default_ experience, a user will see only 1 kind of integer dtype, only
+1 kind of bool dtype, etc. For now, a user should only get columns using `pd.NA`
+when explicitly opting into this.
+
+### Naming alternatives
+
+An initial version of this PDEP proposed to use the `"string"` alias and the
+default `pd.StringDtype()` class constructor for the new default dtype.
+However, that caused a lot of discussion around backwards compatibility for
+existing users of `dtype=pd.StringDtype()` and `dtype="string"`, that uses
+`pd.NA` to represent missing values.
+
+During the discussion, several alternatives have been brought up, both
+alternative keyword names and the use of a different constructor. In the end,
+this PDEP proposes to use a different string alias (`"str"`) but to keep
+using the existing `pd.StringDtype` (with the existing `storage` keyword but
+with an additional `na_value` keyword) for now to keep the changes as
+minimal as possible, leaving a larger overhaul of the dtype system (potentially
+including different constructor functions or namespace) for a future discussion.
+See [GH-58613](https://github.com/pandas-dev/pandas/issues/58613) for the full
+discussion.
+
+One consequence is that when using the class constructor for the default dtype,
+it has to be used with non-default arguments, i.e. a user needs to specify
+`pd.StringDtype(na_value=np.nan)` to get the default dtype using `NaN`.
+Therefore, the pandas documentation will focus on the usage of `dtype="str"`.
+
+## Backward compatibility
+
+The most visible backwards incompatible change will be that columns with string
+data will no longer have an `object` dtype. Therefore, code that assumes
+`object` dtype (such as `ser.dtype == object`) will need to be updated. This
+change is done as a hard break in a major release, as warning in advance for the
+changed inference is deemed too noisy.
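A sketch of the kind of update the paragraph above calls for, assuming the pandas 3.0 behaviour proposed here (`is_string_dtype` is pandas' public type-checking helper):

```python
import pandas as pd
from pandas.api.types import is_string_dtype

ser = pd.Series(["a", "b"])

# Brittle: True on pandas < 3.0 (object dtype), False once strings infer as "str"
is_object_backed = ser.dtype == object

# Dtype-agnostic check that holds both before and after the change
assert is_string_dtype(ser)
```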
+
+To allow testing code in advance, the
+`pd.options.future.infer_string = True` option is available for users.
+
+Otherwise, the actual string-specific functionality (such as the `.str` accessor
+methods) should generally all keep working as is.
+
+By preserving the current missing value semantics, this proposal is also mostly
+backwards compatible on this aspect. When storing strings in object dtype, pandas
+however did allow using `None` as the missing value indicator as well (and in
+certain cases such as the `shift` method, pandas even introduced this itself).
+For all the cases where currently `None` was used as the missing value sentinel,
+this will change to consistently use `NaN`.
+
+### For existing users of `StringDtype`
+
+Existing code that already opted in to use the `StringDtype` using `pd.NA`
+should generally keep working as is. The latest version of this PDEP preserves
+the behaviour of `dtype="string"` or `dtype=pd.StringDtype()` to mean the
+`pd.NA` variant of the dtype.
+
+It does propose to change the default storage to `"pyarrow"` (if available) for
+the opt-in `pd.NA` variant as well, but this should have limited, if any,
+user-visible impact.
+
+## Timeline
+
+The future PyArrow-backed string dtype was already made available behind a feature
+flag in pandas 2.1 (enabled by `pd.options.future.infer_string = True`).
+
+The variant using numpy object-dtype can also be backported to the 2.2.x branch
+to allow easier testing. It is proposed to release this as 2.3.0 (created from
+the 2.2.x branch, given that the main branch already includes many other changes
+targeted for 3.0), together with the changes to the naming scheme.
+
+The 2.3.0 release would then have all future string functionality available
+(both the pyarrow and object-dtype based variants of the default string dtype).
+
+For pandas 3.0, this `future.infer_string` flag becomes enabled by default.
+
+## PDEP-14 History
+
+- 3 May 2024: Initial version
diff --git a/web/pandas/pdeps/0017-backwards-compatibility-and-deprecation-policy.md b/web/pandas/pdeps/0017-backwards-compatibility-and-deprecation-policy.md
new file mode 100644
index 0000000000000..b8eba90f399c9
--- /dev/null
+++ b/web/pandas/pdeps/0017-backwards-compatibility-and-deprecation-policy.md
@@ -0,0 +1,74 @@
+# PDEP-17: Backwards compatibility and deprecation policy
+
+- Created: 27 June 2024
+- Status: Accepted
+- Discussion: [#59125](https://github.com/pandas-dev/pandas/issues/59125)
+- Author: [Abdulaziz Aloqeely](https://github.com/Aloqeely)
+- Revision: 1
+
+## Abstract
+
+This PDEP defines pandas' backwards compatibility and deprecation policy.
+
+The main additions to [pandas' current version policy](https://pandas.pydata.org/pandas-docs/version/2.2/development/policies.html) are:
+- Deprecated functionality should remain unchanged in at least 2 minor releases before being changed or removed.
+- Deprecations should initially use DeprecationWarning, and then be switched to FutureWarning in the last minor release before the major release they are planned to be removed in.
+
+## Motivation
+
+Having a clear backwards compatibility and deprecation policy is crucial to having a healthy ecosystem. We want to ensure users can rely on pandas being stable while still allowing the library to evolve.
+
+This policy will ensure that users have enough time to deal with deprecations while also minimizing disruptions for downstream packages' users.
+
+## Scope
+
+This PDEP covers pandas' approach to backwards compatibility and the deprecation and removal process.
+
+## Background
+
+pandas uses a loose variant of semantic versioning.
+A pandas release number is written in the format of ``MAJOR.MINOR.PATCH``.
+
+## General policy
+
+This policy applies to the [public API][1]. Anything that is not part of the [public API][1] or is marked as "Experimental" may be changed or removed at any time.
+
+- Breaking backwards compatibility should benefit more than it harms users.
+- Breaking changes should go through a deprecation cycle before being implemented if possible.
+- Breaking changes should only occur in major releases.
+- No deprecations should be introduced in patch releases.
+- Deprecated functionality should remain unchanged in at least 2 minor releases before being changed or removed.
+
+Some bug fixes may require breaking backwards compatibility. In these cases, a deprecation cycle is not necessary. However, bug fixes which have a large impact on users might be treated as a breaking change. Whether or not a change is a bug fix or an API breaking change is a judgement call.
+
+## Deprecation process
+
+Deprecation provides a way to warn developers and give them time to adapt their code to the new functionality before the old behavior is eventually removed.
+
+A deprecation's warning message should:
+- Provide information on what is changing.
+- Mention how to achieve similar behavior if an alternative is available.
+- For large-scale deprecations, it is recommended to include a reason for the deprecation, alongside a discussion link to get user feedback.
+
+Additionally, when one introduces a deprecation, they should:
+- Use the appropriate warning class. More info on this can be found below.
+- Add the GitHub issue/PR number as a comment above the warning line.
+- Add an entry in the release notes.
+- Mention that the functionality is deprecated in the documentation using the ``.. deprecated::`` directive.
+
+### Which warning class to use
+
+Deprecations should initially use ``DeprecationWarning``, and then be switched to ``FutureWarning`` for broader visibility in the last minor release before the major release they are planned to be removed in.
+This implementation detail can be ignored by using the appropriate ``PandasDeprecationWarning`` variable, which will be aliased to the proper warning class based on the pandas version.
+
+### Enforcement of deprecations
+
+When one enforces a deprecation, they should:
+- Add an entry in the release notes.
+- For API changes, replace the ``.. deprecated::`` directive in the documentation with a ``.. versionchanged::`` directive.
+
+### PDEP-17 History
+
+- 27 June 2024: Initial version.
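To illustrate the warning-class guidance above, here is a hypothetical deprecation following this PDEP: the function, the issue number, and the aliasing shown are invented for the example; `PandasDeprecationWarning` is the PDEP's proposed alias mechanism.

```python
import warnings

# Per this PDEP: alias DeprecationWarning early in the cycle, and FutureWarning
# in the last minor release before the removal in the next major release.
PandasDeprecationWarning = DeprecationWarning  # illustration only


def new_helper(x):
    return x


def old_helper(x):
    # GH 12345 (hypothetical issue number)
    warnings.warn(
        "old_helper is deprecated and will be removed in a future major release. "
        "Use new_helper instead.",
        PandasDeprecationWarning,
        stacklevel=2,
    )
    return new_helper(x)
```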
+
+[1]: https://pandas.pydata.org/docs/reference/index.html
diff --git a/web/pandas/static/img/books/effective_pandas_2.gif b/web/pandas/static/img/books/effective_pandas_2.gif
new file mode 100644
index 0000000000000..ae3efbcbd18a8
Binary files /dev/null and b/web/pandas/static/img/books/effective_pandas_2.gif differ
diff --git a/web/pandas/static/img/books/pandas_cookbook_3.gif b/web/pandas/static/img/books/pandas_cookbook_3.gif
new file mode 100644
index 0000000000000..aa9d351d489e0
Binary files /dev/null and b/web/pandas/static/img/books/pandas_cookbook_3.gif differ
diff --git a/web/pandas/static/img/pydata_book.gif b/web/pandas/static/img/books/pydata_book.gif
similarity index 100%
rename from web/pandas/static/img/pydata_book.gif
rename to web/pandas/static/img/books/pydata_book.gif
diff --git a/web/pandas/static/img/install/anaconda_prompt.png b/web/pandas/static/img/install/anaconda_prompt.png
deleted file mode 100644
index 7b547e4ebb02a..0000000000000
Binary files a/web/pandas/static/img/install/anaconda_prompt.png and /dev/null differ
diff --git a/web/pandas/static/img/install/jupyterlab_home.png b/web/pandas/static/img/install/jupyterlab_home.png
deleted file mode 100644
index c62d33a5e0fc6..0000000000000
Binary files a/web/pandas/static/img/install/jupyterlab_home.png and /dev/null differ
diff --git a/web/pandas/static/img/install/pandas_import_and_version.png b/web/pandas/static/img/install/pandas_import_and_version.png
deleted file mode 100644
index 64c1303ac495c..0000000000000
Binary files a/web/pandas/static/img/install/pandas_import_and_version.png and /dev/null differ
diff --git a/web/pandas/static/img/partners/coiled.svg b/web/pandas/static/img/partners/coiled.svg
deleted file mode 100644
index 2d76ce150084b..0000000000000
--- a/web/pandas/static/img/partners/coiled.svg
+++ /dev/null
@@ -1,234 +0,0 @@
[234 deleted SVG lines; markup stripped in extraction]
diff --git a/web/pandas/static/img/partners/czi.svg b/web/pandas/static/img/partners/czi.svg
deleted file mode 100644
index b0ad9eb80580b..0000000000000
--- a/web/pandas/static/img/partners/czi.svg
+++ /dev/null
@@ -1,38 +0,0 @@
[38 deleted SVG lines; markup stripped in extraction; recoverable text: "Group", "Created with Sketch."]
diff --git a/web/pandas/static/img/partners/intel.svg b/web/pandas/static/img/partners/intel.svg deleted file mode 100644 index 61d4ac6ce192b..0000000000000 --- a/web/pandas/static/img/partners/intel.svg +++ /dev/null @@ -1,64 +0,0 @@ - [SVG markup stripped] diff --git a/web/pandas/static/img/partners/quansight_labs.svg b/web/pandas/static/img/partners/quansight_labs.svg deleted file mode 100644 index d49ab1b7d39ec..0000000000000 --- a/web/pandas/static/img/partners/quansight_labs.svg +++ /dev/null @@ -1 +0,0 @@ - [SVG markup stripped] diff --git a/web/pandas/static/img/partners/two_sigma.svg b/web/pandas/static/img/partners/two_sigma.svg deleted file mode 100644 index d38df12766ed6..0000000000000 --- a/web/pandas/static/img/partners/two_sigma.svg +++ /dev/null @@ -1 +0,0 @@ -Logo diff --git a/web/pandas/static/img/partners/voltron_data.svg b/web/pandas/static/img/partners/voltron_data.svg deleted file mode 100644 index 0fb7dfd850166..0000000000000 --- a/web/pandas/static/img/partners/voltron_data.svg +++ /dev/null @@ -1,52 +0,0 @@ - [SVG markup stripped] diff --git a/web/pandas/try.md b/web/pandas/try.md new file mode 100644 index 0000000000000..ee2f98b05aa64 --- /dev/null +++ b/web/pandas/try.md @@ -0,0 +1,12 @@ +# Try pandas in your browser (experimental) + +Try our experimental [JupyterLite](https://jupyterlite.readthedocs.io/en/stable/) live shell with `pandas`, powered by [Pyodide](https://pyodide.org/en/stable/). + +**Please note it can take a while (>30 seconds) before the shell is initialized and ready to run commands.** + +**Running it requires a reasonable amount of bandwidth and resources (>70 MiB on the first load), so it may not work properly on all devices or networks.** + + diff --git a/web/pandas/versions.json b/web/pandas/versions.json index e355005c7c937..2d2599ae8585b 100644 --- a/web/pandas/versions.json +++ b/web/pandas/versions.json @@ -5,11 +5,16 @@ "url": "https://pandas.pydata.org/docs/dev/" }, { - "name": "2.1 (stable)", - "version": "2.1", + "name": "2.2 (stable)", + "version": "2.2", "url": "https://pandas.pydata.org/docs/", "preferred": true }, + { + "name": "2.1", + "version": "2.1", + "url": "https://pandas.pydata.org/pandas-docs/version/2.1/" + }, { "name": "2.0", "version": "2.0", diff --git a/web/pandas_web.py b/web/pandas_web.py index ab2e72da850c4..b3872b829c73a 100755 --- a/web/pandas_web.py +++ b/web/pandas_web.py @@ -23,6 +23,7 @@ The rest of the items in the file will be added directly to the context. """ + import argparse import collections import datetime @@ -118,11 +119,11 @@ def blog_add_posts(context): summary = re.sub(tag_expr, "", html) try: body_position = summary.index(title) + len(title) - except ValueError: + except ValueError as err: raise ValueError( f'Blog post "{fname}" should have a markdown header ' f'corresponding to its "Title" element "{title}"' - ) + ) from err summary = " ".join(summary[body_position:].split(" ")[:30]) posts.append( { @@ -279,6 +280,7 @@ def roadmap_pdeps(context): PDEP's in different status from the directory tree and GitHub. """ KNOWN_STATUS = { + "Draft", "Under discussion", "Accepted", "Implemented", @@ -318,7 +320,7 @@ def roadmap_pdeps(context): github_repo_url = context["main"]["github_repo_url"] resp = requests.get( "https://api.github.com/search/issues?"
- f"q=is:pr is:open label:PDEP repo:{github_repo_url}", + f"q=is:pr is:open label:PDEP draft:false repo:{github_repo_url}", headers=GITHUB_API_HEADERS, timeout=5, ) diff --git a/web/tests/test_pandas_web.py b/web/tests/test_pandas_web.py index a5f76875dfe23..6b88263f98be6 100644 --- a/web/tests/test_pandas_web.py +++ b/web/tests/test_pandas_web.py @@ -1,4 +1,4 @@ -from unittest.mock import ( +from unittest.mock import ( # noqa: TID251 mock_open, patch, ) @@ -30,7 +30,7 @@ def context() -> dict: } -@pytest.fixture(scope="function") +@pytest.fixture def mock_response(monkeypatch, request) -> None: def mocked_resp(*args, **kwargs): status_code, response = request.param