diff --git a/.circleci/config.yml b/.circleci/config.yml index 139ea9d220453..98d03910879c5 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -54,7 +54,7 @@ jobs: command: | /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . ~/virtualenvs/pandas-dev/bin/activate - python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 + python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.14.0 meson[ninja]==1.3.2 python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0 python -m pip install --no-cache-dir --no-build-isolation -e . -Csetup-args="--werror" python -m pip list --no-cache-dir diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 899b49cc4eff5..705f3240308b2 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -233,7 +233,7 @@ jobs: run: | /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . ~/virtualenvs/pandas-dev/bin/activate - python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.2.1 meson-python==0.13.1 + python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.3.2 meson-python==0.14.0 python -m pip install numpy -Csetup-args="-Dallow-noblas=true" python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0 python -m pip install --no-cache-dir --no-build-isolation -e . -Csetup-args="--werror" @@ -272,7 +272,7 @@ jobs: run: | /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . 
~/virtualenvs/pandas-dev/bin/activate - python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 + python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.14.0 meson[ninja]==1.3.2 python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0 python -m pip install --no-cache-dir --no-build-isolation -e . -Csetup-args="--werror" python -m pip list --no-cache-dir @@ -344,7 +344,7 @@ jobs: - name: Build Environment run: | python --version - python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1 + python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.3.2 meson-python==0.14.0 python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy python -m pip install versioneer[toml] python -m pip install python-dateutil tzdata cython hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov @@ -389,7 +389,7 @@ jobs: # Tests segfault with numpy 2.2.0: https://github.com/numpy/numpy/pull/27955 run: | python --version - python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1 + python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.3.2 meson-python==0.14.0 python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple cython numpy python -m pip install versioneer[toml] python -m pip install python-dateutil pytz tzdata hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov diff --git a/.gitignore b/.gitignore index a188e216d9f70..d33c95043b69d 100644 --- a/.gitignore +++ b/.gitignore @@ -137,3 +137,9 @@ doc/source/savefig/ # Interactive terminal generated files # ######################################## .jupyterlite.doit.db + +# meson subproject files # +########################## +subprojects/* +!subprojects/packagefiles 
+!subprojects/*.wrap diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 983c45fc493d1..1b490d4edc060 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -98,7 +98,7 @@ repos: rev: v19.1.6 hooks: - id: clang-format - files: ^pandas/_libs/src|^pandas/_libs/include + files: ^pandas/_libs/ args: [-i] types_or: [c, c++] - repo: local diff --git a/ci/deps/actions-310-minimum_versions.yaml b/ci/deps/actions-310-minimum_versions.yaml index c7c72828db481..b044bf9fd58cb 100644 --- a/ci/deps/actions-310-minimum_versions.yaml +++ b/ci/deps/actions-310-minimum_versions.yaml @@ -9,8 +9,8 @@ dependencies: # build dependencies - versioneer - cython>=0.29.33 - - meson=1.2.1 - - meson-python=0.13.1 + - meson=1.3.2 + - meson-python=0.14.0 # test dependencies - pytest>=7.3.2 diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 74cab4e0970dc..ed79abbe03111 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -7,8 +7,8 @@ dependencies: # build dependencies - versioneer - cython>=0.29.33 - - meson=1.2.1 - - meson-python=0.13.1 + - meson=1.3.2 + - meson-python=0.14.0 # test dependencies - pytest>=7.3.2 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index 092ca18d61259..f3f9696ee2f78 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -8,8 +8,8 @@ dependencies: # build dependencies - versioneer - cython>=0.29.33 - - meson=1.2.1 - - meson-python=0.13.1 + - meson=1.3.2 + - meson-python=0.14.0 # test dependencies - pytest>=7.3.2 diff --git a/ci/deps/actions-311-numpydev.yaml b/ci/deps/actions-311-numpydev.yaml index 325a6d45d74fd..7f0ed67258d07 100644 --- a/ci/deps/actions-311-numpydev.yaml +++ b/ci/deps/actions-311-numpydev.yaml @@ -6,8 +6,8 @@ dependencies: # build dependencies - versioneer - - meson=1.2.1 - - meson-python=0.13.1 + - meson=1.3.2 + - meson-python=0.14.0 - 
cython>=0.29.33 # test dependencies diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index 22e4907e5a6e5..e693f74175df4 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -6,9 +6,9 @@ dependencies: # build dependencies - versioneer - - meson=1.2.1 + - meson=1.3.2 - cython>=0.29.33 - - meson-python=0.13.1 + - meson-python=0.14.0 # test dependencies - pytest>=7.3.2 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index b6f515dceaea9..f44365b9f8423 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -7,8 +7,8 @@ dependencies: # build dependencies - versioneer - cython>=0.29.33 - - meson=1.2.1 - - meson-python=0.13.1 + - meson=1.3.2 + - meson-python=0.14.0 # test dependencies - pytest>=7.3.2 diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml index bc66f8a5382c9..f20346fad0d71 100644 --- a/ci/deps/actions-312.yaml +++ b/ci/deps/actions-312.yaml @@ -7,8 +7,8 @@ dependencies: # build dependencies - versioneer - cython>=0.29.33 - - meson=1.2.1 - - meson-python=0.13.1 + - meson=1.3.2 + - meson-python=0.14.0 # test dependencies - pytest>=7.3.2 diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml index 90933b24b88db..b10a6fca13890 100644 --- a/ci/deps/actions-pypy-39.yaml +++ b/ci/deps/actions-pypy-39.yaml @@ -10,8 +10,8 @@ dependencies: # build dependencies - versioneer - cython>=0.29.33 - - meson=1.2.1 - - meson-python=0.13.1 + - meson=1.3.2 + - meson-python=0.14.0 # test dependencies - pytest>=7.3.2 diff --git a/ci/deps/circle-311-arm64.yaml b/ci/deps/circle-311-arm64.yaml index 3f09e27d0fe4b..0ba7a33599a9e 100644 --- a/ci/deps/circle-311-arm64.yaml +++ b/ci/deps/circle-311-arm64.yaml @@ -7,8 +7,8 @@ dependencies: # build dependencies - versioneer - cython>=0.29.33 - - meson=1.2.1 - - meson-python=0.13.1 + - meson=1.3.2 + - meson-python=0.14.0 # test dependencies - pytest>=7.3.2 diff --git a/environment.yml 
b/environment.yml index 69647a436e3ad..5da739d01ee31 100644 --- a/environment.yml +++ b/environment.yml @@ -9,8 +9,8 @@ dependencies: # build dependencies - versioneer - cython~=3.0.5 - - meson=1.2.1 - - meson-python=0.13.1 + - meson>=1.3.0 + - meson-python>=0.14.0 # test dependencies - pytest>=7.3.2 diff --git a/meson.build b/meson.build index efe543b7a267c..c4447b53c3497 100644 --- a/meson.build +++ b/meson.build @@ -4,11 +4,15 @@ project( 'c', 'cpp', 'cython', version: run_command(['generate_version.py', '--print'], check: true).stdout().strip(), license: 'BSD-3', - meson_version: '>=1.2.1', + meson_version: '>=1.3.0', default_options: [ 'buildtype=release', 'c_std=c11', + 'cpp_std=c++17', 'warning_level=2', + 'default_library=static', + # TODO: how can we only set this for nanobind? + 'cpp_args=-Wno-sign-compare' ] ) diff --git a/pandas/_libs/arrow_string_accumulations.cc b/pandas/_libs/arrow_string_accumulations.cc new file mode 100644 index 0000000000000..d0d2a940099fe --- /dev/null +++ b/pandas/_libs/arrow_string_accumulations.cc @@ -0,0 +1,223 @@ +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +using namespace nanoarrow::literals; +namespace nb = nanobind; + +static auto ReleaseArrowArray(void *ptr) noexcept -> void { + auto array = static_cast(ptr); + if (array->release != nullptr) { + ArrowArrayRelease(array); + } + + delete array; +} + +static auto ReleaseArrowSchema(void *ptr) noexcept -> void { + auto schema = static_cast(ptr); + if (schema->release != nullptr) { + ArrowSchemaRelease(schema); + } + + delete schema; +} + +template +static auto CumSum(struct ArrowArrayStream *array_stream, + struct ArrowArray *out, bool skipna) { + bool seen_na = false; + std::stringstream ss{}; + + nanoarrow::UniqueSchema schema{}; + NANOARROW_THROW_NOT_OK( + ArrowArrayStreamGetSchema(array_stream, schema.get(), nullptr)); + + nanoarrow::ViewArrayStream array_stream_view(array_stream); + for (const auto &array : 
array_stream_view) { + for (const auto &sv : nanoarrow::ViewArrayAsBytes(&array)) { + if ((!sv || seen_na) && !skipna) { + seen_na = true; + NANOARROW_THROW_NOT_OK(ArrowArrayAppendNull(out, 1)); + } else { + if (sv) { + ss << std::string_view{(*sv).data, + static_cast((*sv).size_bytes)}; + } + const auto str = ss.str(); + const ArrowStringView asv{str.c_str(), + static_cast(str.size())}; + NANOARROW_THROW_NOT_OK(ArrowArrayAppendString(out, asv)); + } + } + } +} + +// TODO: doesn't seem like all compilers in CI support this? +// template +// concept MinOrMaxOp = +// std::same_as> || std::same_as>; +// template +// requires MinOrMaxOp +template typename CompareOp> +static auto CumMinOrMax(struct ArrowArrayStream *array_stream, + struct ArrowArray *out, bool skipna) { + bool seen_na = false; + std::optional current_str{}; + + nanoarrow::UniqueSchema schema{}; + NANOARROW_THROW_NOT_OK( + ArrowArrayStreamGetSchema(array_stream, schema.get(), nullptr)); + + nanoarrow::ViewArrayStream array_stream_view(array_stream); + for (const auto &array : array_stream_view) { + for (const auto &sv : nanoarrow::ViewArrayAsBytes(&array)) { + if ((!sv || seen_na) && !skipna) { + seen_na = true; + NANOARROW_THROW_NOT_OK(ArrowArrayAppendNull(out, 1)); + } else { + if (sv || current_str) { + if (sv) { + const nb::str pyval{(*sv).data, + static_cast((*sv).size_bytes)}; + if (current_str) { + const nb::str pycurrent{current_str->data(), current_str->size()}; + if (CompareOp{}(pyval, pycurrent)) { + current_str = std::string{ + (*sv).data, static_cast((*sv).size_bytes)}; + } + } else { + current_str = std::string{(*sv).data, + static_cast((*sv).size_bytes)}; + } + } + + struct ArrowStringView out_sv{ + current_str->data(), static_cast(current_str->size())}; + NANOARROW_THROW_NOT_OK(ArrowArrayAppendString(out, out_sv)); + } else { + NANOARROW_THROW_NOT_OK(ArrowArrayAppendEmpty(out, 1)); + } + } + } + } +} + +class ArrowStringAccumulation { +public: + ArrowStringAccumulation(nb::object array_object, std::string accumulation, + bool skipna) + : skipna_(skipna) { + 
if ((accumulation == "cumsum") || (accumulation == "cummin") || + (accumulation == "cummax")) { + accumulation_ = std::move(accumulation); + } else { + const auto error_message = + std::string("Unsupported accumulation: ") + accumulation; + throw nb::value_error(error_message.c_str()); + } + + const auto obj = nb::getattr(array_object, "__arrow_c_stream__")(); + const auto pycapsule_obj = nb::cast(obj); + + const auto stream = static_cast( + PyCapsule_GetPointer(pycapsule_obj.ptr(), "arrow_array_stream")); + if (stream == nullptr) { + throw std::invalid_argument("Invalid Arrow Stream capsule provided!"); + } + + if (stream->get_schema(stream, schema_.get()) != 0) { + std::string error_msg{ArrowArrayStreamGetLastError(stream)}; + throw std::runtime_error("Could not read from arrow schema:" + error_msg); + } + struct ArrowSchemaView schema_view{}; + NANOARROW_THROW_NOT_OK( + ArrowSchemaViewInit(&schema_view, schema_.get(), nullptr)); + + switch (schema_view.type) { + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_LARGE_STRING: + break; + default: + const auto error_message = + std::string("Expected a string-like array type, got: ") + + ArrowTypeString(schema_view.type); + throw std::invalid_argument(error_message); + } + + ArrowArrayStreamMove(stream, stream_.get()); + } + + std::pair Accumulate(nb::object) { + struct ArrowSchemaView schema_view{}; + NANOARROW_THROW_NOT_OK( + ArrowSchemaViewInit(&schema_view, schema_.get(), nullptr)); + auto uschema = nanoarrow::UniqueSchema{}; + ArrowSchemaInit(uschema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetType(uschema.get(), schema_view.type)); + + // TODO: even though we are reading a stream we are returning an array + // We should return a like sized stream of data in the future + auto uarray_out = nanoarrow::UniqueArray{}; + NANOARROW_THROW_NOT_OK( + ArrowArrayInitFromSchema(uarray_out.get(), uschema.get(), nullptr)); + + NANOARROW_THROW_NOT_OK(ArrowArrayStartAppending(uarray_out.get())); + + if (accumulation_ == "cumsum") 
{ + if (schema_view.type == NANOARROW_TYPE_STRING) { + CumSum<32>(stream_.get(), uarray_out.get(), skipna_); + } else { + CumSum<64>(stream_.get(), uarray_out.get(), skipna_); + } + + } else if (accumulation_ == "cummin") { + if (schema_view.type == NANOARROW_TYPE_STRING) { + CumMinOrMax<32, std::less>(stream_.get(), uarray_out.get(), skipna_); + } else { + CumMinOrMax<64, std::less>(stream_.get(), uarray_out.get(), skipna_); + } + } else if (accumulation_ == "cummax") { + if (schema_view.type == NANOARROW_TYPE_STRING) { + CumMinOrMax<32, std::greater>(stream_.get(), uarray_out.get(), skipna_); + } else { + CumMinOrMax<64, std::greater>(stream_.get(), uarray_out.get(), skipna_); + } + } else { + throw std::runtime_error("Unexpected branch"); + } + + NANOARROW_THROW_NOT_OK( + ArrowArrayFinishBuildingDefault(uarray_out.get(), nullptr)); + + auto out_schema = new struct ArrowSchema; + ArrowSchemaMove(uschema.get(), out_schema); + nb::capsule schema_capsule{out_schema, "arrow_schema", &ReleaseArrowSchema}; + + auto out_array = new struct ArrowArray; + ArrowArrayMove(uarray_out.get(), out_array); + nb::capsule array_capsule{out_array, "arrow_array", &ReleaseArrowArray}; + + return std::pair{schema_capsule, array_capsule}; + } + +private: + nanoarrow::UniqueArrayStream stream_; + nanoarrow::UniqueSchema schema_; + std::string accumulation_; + bool skipna_; +}; + +NB_MODULE(arrow_string_accumulations, m) { + nb::class_(m, "ArrowStringAccumulation") + .def(nb::init()) + .def("__arrow_c_array__", &ArrowStringAccumulation::Accumulate, + nb::arg("requested_schema") = nb::none()); +} diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build index c27386743c6e9..b6ecbe2fba97f 100644 --- a/pandas/_libs/meson.build +++ b/pandas/_libs/meson.build @@ -122,6 +122,16 @@ foreach ext_name, ext_dict : libs_sources ) endforeach +nanobind_dep = dependency('nanobind') +nanoarrow_dep = dependency('nanoarrow') +py.extension_module( + 'arrow_string_accumulations', + sources: 
['arrow_string_accumulations.cc'], + dependencies: [nanobind_dep, nanoarrow_dep], + subdir: 'pandas/_libs', + install: true, +) + # Basically just __init__.py and the .pyi files sources_to_install = [ '__init__.py', diff --git a/pandas/conftest.py b/pandas/conftest.py index 106518678df6a..0b3654bbcc16e 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1317,6 +1317,22 @@ def nullable_string_dtype(request): return request.param +@pytest.fixture( + params=[ + pytest.param("str[pyarrow]", marks=td.skip_if_no("pyarrow")), + pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), + ] +) +def pyarrow_string_dtype(request): + """ + Parametrized fixture for string dtypes backed by Pyarrow. + + * 'str[pyarrow]' + * 'string[pyarrow]' + """ + return request.param + + @pytest.fixture( params=[ "python", diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 4d9c8eb3a41b6..1c930ee642f13 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -16,6 +16,7 @@ import numpy as np from pandas._libs import lib +import pandas._libs.arrow_string_accumulations as sa from pandas._libs.tslibs import ( Timedelta, Timestamp, @@ -41,6 +42,7 @@ is_list_like, is_numeric_dtype, is_scalar, + is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import DatetimeTZDtype @@ -1619,6 +1621,9 @@ def _accumulate( ------ NotImplementedError : subclass does not define accumulations """ + if is_string_dtype(self): + return self._str_accumulate(name=name, skipna=skipna, **kwargs) + pyarrow_name = { "cummax": "cumulative_max", "cummin": "cumulative_min", @@ -1654,6 +1659,22 @@ def _accumulate( return type(self)(result) + def _str_accumulate( + self, name: str, *, skipna: bool = True, **kwargs + ) -> ArrowExtensionArray | ExtensionArray: + """ + Accumulate implementation for strings, see `_accumulate` docstring for details. + + pyarrow.compute does not implement these methods for strings. 
+ """ + if name == "cumprod": + msg = f"operation '{name}' not supported for dtype '{self.dtype}'" + raise TypeError(msg) + + pa_result = pa.array(sa.ArrowStringAccumulation(self._pa_array, name, skipna)) + result = type(self)(pa_result) + return result + def _reduce_pyarrow(self, name: str, *, skipna: bool = True, **kwargs) -> pa.Scalar: """ Return a pyarrow scalar result of performing the reduction operation. diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py index c52168ae48ca8..ce71cfec535e4 100644 --- a/pandas/tests/apply/test_str.py +++ b/pandas/tests/apply/test_str.py @@ -4,7 +4,10 @@ import numpy as np import pytest -from pandas.compat import WASM +from pandas.compat import ( + HAS_PYARROW, + WASM, +) from pandas.core.dtypes.common import is_number @@ -163,10 +166,10 @@ def test_agg_cython_table_transform_series(request, series, func, expected): # GH21224 # test transforming functions in # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) - if series.dtype == "string" and func == "cumsum": + if series.dtype == "string" and func == "cumsum" and not HAS_PYARROW: request.applymarker( pytest.mark.xfail( - raises=(TypeError, NotImplementedError), + raises=NotImplementedError, reason="TODO(infer_string) cumsum not yet implemented for string", ) ) diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py index 9a41a3a582c4a..9a2f186c2a00b 100644 --- a/pandas/tests/extension/base/accumulate.py +++ b/pandas/tests/extension/base/accumulate.py @@ -18,8 +18,9 @@ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: def check_accumulate(self, ser: pd.Series, op_name: str, skipna: bool): try: alt = ser.astype("float64") - except TypeError: - # e.g. Period can't be cast to float64 + except (TypeError, ValueError): + # e.g. 
Period can't be cast to float64 (TypeError) + # String can't be cast to float64 (ValueError) alt = ser.astype(object) result = getattr(ser, op_name)(skipna=skipna) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index c5f5a65b77eea..4fccf02e08bd6 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -393,13 +393,12 @@ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: # attribute "pyarrow_dtype" pa_type = ser.dtype.pyarrow_dtype # type: ignore[union-attr] - if ( - pa.types.is_string(pa_type) - or pa.types.is_binary(pa_type) - or pa.types.is_decimal(pa_type) - ): + if pa.types.is_binary(pa_type) or pa.types.is_decimal(pa_type): if op_name in ["cumsum", "cumprod", "cummax", "cummin"]: return False + elif pa.types.is_string(pa_type): + if op_name == "cumprod": + return False elif pa.types.is_boolean(pa_type): if op_name in ["cumprod", "cummax", "cummin"]: return False @@ -414,6 +413,12 @@ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: def test_accumulate_series(self, data, all_numeric_accumulations, skipna, request): pa_type = data.dtype.pyarrow_dtype op_name = all_numeric_accumulations + + if pa.types.is_string(pa_type) and op_name in ["cumsum", "cummin", "cummax"]: + # https://github.com/pandas-dev/pandas/pull/60633 + # Doesn't fit test structure, tested in series/test_cumulative.py instead. 
+ return + ser = pd.Series(data) if not self._supports_accumulation(ser, op_name): diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index e19351b2ad058..6ce48e434d329 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -24,6 +24,8 @@ from pandas.compat import HAS_PYARROW +from pandas.core.dtypes.base import StorageExtensionDtype + import pandas as pd import pandas._testing as tm from pandas.api.types import is_string_dtype @@ -192,6 +194,14 @@ def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: and op_name in ("any", "all") ) + def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: + assert isinstance(ser.dtype, StorageExtensionDtype) + return ser.dtype.storage == "pyarrow" and op_name in [ + "cummin", + "cummax", + "cumsum", + ] + def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): dtype = cast(StringDtype, tm.get_dtype(obj)) if op_name in ["__add__", "__radd__"]: diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py index a9d5486139b46..610903d77512d 100644 --- a/pandas/tests/series/test_cumulative.py +++ b/pandas/tests/series/test_cumulative.py @@ -6,6 +6,8 @@ tests.frame.test_cumulative """ +import re + import numpy as np import pytest @@ -227,3 +229,55 @@ def test_cumprod_timedelta(self): ser = pd.Series([pd.Timedelta(days=1), pd.Timedelta(days=3)]) with pytest.raises(TypeError, match="cumprod not supported for Timedelta"): ser.cumprod() + + @pytest.mark.parametrize( + "data, op, skipna, expected_data", + [ + ([], "cumsum", True, []), + ([], "cumsum", False, []), + (["x", "z", "y"], "cumsum", True, ["x", "xz", "xzy"]), + (["x", "z", "y"], "cumsum", False, ["x", "xz", "xzy"]), + (["x", pd.NA, "y"], "cumsum", True, ["x", "x", "xy"]), + (["x", pd.NA, "y"], "cumsum", False, ["x", pd.NA, pd.NA]), + ([pd.NA, "x", "y"], "cumsum", True, ["", "x", "xy"]), + ([pd.NA, "x", "y"], 
"cumsum", False, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cumsum", True, ["", "", ""]), + ([pd.NA, pd.NA, pd.NA], "cumsum", False, [pd.NA, pd.NA, pd.NA]), + ([], "cummin", True, []), + ([], "cummin", False, []), + (["y", "z", "x"], "cummin", True, ["y", "y", "x"]), + (["y", "z", "x"], "cummin", False, ["y", "y", "x"]), + (["y", pd.NA, "x"], "cummin", True, ["y", "y", "x"]), + (["y", pd.NA, "x"], "cummin", False, ["y", pd.NA, pd.NA]), + ([pd.NA, "y", "x"], "cummin", True, ["", "y", "x"]), + ([pd.NA, "y", "x"], "cummin", False, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cummin", True, ["", "", ""]), + ([pd.NA, pd.NA, pd.NA], "cummin", False, [pd.NA, pd.NA, pd.NA]), + ([], "cummax", True, []), + ([], "cummax", False, []), + (["x", "z", "y"], "cummax", True, ["x", "z", "z"]), + (["x", "z", "y"], "cummax", False, ["x", "z", "z"]), + (["x", pd.NA, "y"], "cummax", True, ["x", "x", "y"]), + (["x", pd.NA, "y"], "cummax", False, ["x", pd.NA, pd.NA]), + ([pd.NA, "x", "y"], "cummax", True, ["", "x", "y"]), + ([pd.NA, "x", "y"], "cummax", False, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cummax", True, ["", "", ""]), + ([pd.NA, pd.NA, pd.NA], "cummax", False, [pd.NA, pd.NA, pd.NA]), + ], + ) + def test_cum_methods_pyarrow_strings( + self, pyarrow_string_dtype, data, op, skipna, expected_data + ): + # https://github.com/pandas-dev/pandas/pull/60633 + ser = pd.Series(data, dtype=pyarrow_string_dtype) + method = getattr(ser, op) + expected = pd.Series(expected_data, dtype=pyarrow_string_dtype) + result = method(skipna=skipna) + tm.assert_series_equal(result, expected) + + def test_cumprod_pyarrow_strings(self, pyarrow_string_dtype, skipna): + # https://github.com/pandas-dev/pandas/pull/60633 + ser = pd.Series(list("xyz"), dtype=pyarrow_string_dtype) + msg = re.escape(f"operation 'cumprod' not supported for dtype '{ser.dtype}'") + with pytest.raises(TypeError, match=msg): + ser.cumprod(skipna=skipna) diff --git a/pyproject.toml b/pyproject.toml index 
7ab9cd2c17669..00b509ba775b2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,8 +2,8 @@ # Minimum requirements for the build system to execute. # See https://github.com/scipy/scipy/pull/12940 for the AIX issue. requires = [ - "meson-python>=0.13.1", - "meson>=1.2.1,<2", + "meson-python>=0.14.0", + "meson>=1.3.0,<2", "wheel", "Cython~=3.0.5", # Note: sync with setup.py, environment.yml and asv.conf.json # Force numpy higher than 2.0rc1, so that built wheels are compatible @@ -145,6 +145,7 @@ parentdir_prefix = "pandas-" [tool.meson-python.args] setup = ['--vsenv'] # For Windows +install = ['--skip-subprojects'] [tool.cibuildwheel] skip = "cp36-* cp37-* cp38-* cp39-* pp* *_i686 *_ppc64le *_s390x" diff --git a/requirements-dev.txt b/requirements-dev.txt index fb4d9cdb589ca..4e8d1eb2e9e8b 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -4,8 +4,8 @@ pip versioneer[toml] cython~=3.0.5 -meson[ninja]==1.2.1 -meson-python==0.13.1 +meson[ninja]>=1.3.0 +meson-python>=0.14.0 pytest>=7.3.2 pytest-cov pytest-xdist>=3.4.0 diff --git a/subprojects/nanoarrow.wrap b/subprojects/nanoarrow.wrap new file mode 100644 index 0000000000000..bd98febad3911 --- /dev/null +++ b/subprojects/nanoarrow.wrap @@ -0,0 +1,9 @@ +[wrap-file] +directory = arrow-nanoarrow-7a808701819cb4c5f6b6ddf7c51c09389cd097ff +source_url = https://github.com/apache/arrow-nanoarrow/archive/7a808701819cb4c5f6b6ddf7c51c09389cd097ff.tar.gz +source_fallback_url = https://github.com/mesonbuild/wrapdb/releases/download/nanoarrow_0.6.0-1/apache-arrow-nanoarrow-0.6.0.tar.gz +source_filename = arrow-nanoarrow-7a808701819cb4c5f6b6ddf7c51c09389cd097ff.tar.gz +source_hash = 1f4924dc341bc3bf357ee23320651f18c05a4e031e089b2bc09eeadee2664855 + +[provide] +nanoarrow = nanoarrow_dep diff --git a/subprojects/nanobind.wrap b/subprojects/nanobind.wrap new file mode 100644 index 0000000000000..78e2e7c5d011b --- /dev/null +++ b/subprojects/nanobind.wrap @@ -0,0 +1,13 @@ +[wrap-file] +directory = nanobind-2.4.0 
+source_url = https://github.com/wjakob/nanobind/archive/refs/tags/v2.4.0.tar.gz +source_filename = nanobind-2.4.0.tar.gz +source_hash = bb35deaed7efac5029ed1e33880a415638352f757d49207a8e6013fefb6c49a7 +patch_filename = nanobind_2.4.0-2_patch.zip +patch_url = https://wrapdb.mesonbuild.com/v2/nanobind_2.4.0-2/get_patch +patch_hash = cf493bda0b11ea4e8d9dd42229c3bbdd52af88cc4aedac75a1eccb102b86dd4a +source_fallback_url = https://github.com/mesonbuild/wrapdb/releases/download/nanobind_2.4.0-2/nanobind-2.4.0.tar.gz +wrapdb_version = 2.4.0-2 + +[provide] +nanobind = nanobind_dep diff --git a/subprojects/robin-map.wrap b/subprojects/robin-map.wrap new file mode 100644 index 0000000000000..3da2993bb709e --- /dev/null +++ b/subprojects/robin-map.wrap @@ -0,0 +1,13 @@ +[wrap-file] +directory = robin-map-1.3.0 +source_url = https://github.com/Tessil/robin-map/archive/refs/tags/v1.3.0.tar.gz +source_filename = robin-map-1.3.0.tar.gz +source_hash = a8424ad3b0affd4c57ed26f0f3d8a29604f0e1f2ef2089f497f614b1c94c7236 +patch_filename = robin-map_1.3.0-1_patch.zip +patch_url = https://wrapdb.mesonbuild.com/v2/robin-map_1.3.0-1/get_patch +patch_hash = 6d090f988541ffb053512607e0942cbd0dbc2a4fa0563e44ff6a37e810b8c739 +source_fallback_url = https://github.com/mesonbuild/wrapdb/releases/download/robin-map_1.3.0-1/robin-map-1.3.0.tar.gz +wrapdb_version = 1.3.0-1 + +[provide] +robin-map = robin_map_dep