Skip to content
36 changes: 22 additions & 14 deletions narwhals/_pandas_like/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from narwhals._pandas_like.utils import (
NUMPY_VERSION,
align_and_extract_native,
binary_string_sum_fallback,
broadcast_series_to_index,
get_dtype_backend,
import_array_module,
Expand All @@ -26,6 +27,7 @@
from narwhals._typing_compat import assert_never
from narwhals._utils import NO_DEFAULT, Implementation, is_list_of
from narwhals.dependencies import is_numpy_array_1d, is_pandas_like_series
from narwhals.dtypes import String
from narwhals.exceptions import InvalidOperationError

if TYPE_CHECKING:
Expand Down Expand Up @@ -399,23 +401,29 @@ def first(self) -> PythonLiteral:
def last(self) -> PythonLiteral:
    """Return the final element of the native series, or ``None`` if it is empty."""
    native = self.native
    if len(native) == 0:
        # Nothing to index into: positional access with -1 is skipped.
        return None
    return native.iloc[-1]

def _with_binary(self, op: Callable[..., pd.Series], other: Any) -> Self:
    """Apply the binary operator ``op`` to this series and ``other``.

    Both operands are first passed through ``align_and_extract_native``. The
    native result is wrapped again with ``_with_native`` and aliased to
    ``self.name``.

    If ``op`` raises ``TypeError``, ``binary_string_sum_fallback`` is used
    instead, but only when all of the following hold:

    - ``op`` is named ``"add"``,
    - this series has the ``String`` dtype,
    - ``other`` is a Python ``str``, or a series of the same class whose
      dtype is also ``String``.

    Arguments:
        op: Callable invoked as ``op(native_self, native_other)``.
        other: The right-hand operand.

    Returns:
        A new series wrapping the result, named after ``self``.

    Raises:
        TypeError: Re-raised from ``op`` when the fallback does not apply.
    """
    ser, other_native = align_and_extract_native(self, other)
    preserve_broadcast = self._broadcast and getattr(other, "_broadcast", True)
    try:
        res = op(ser, other_native)
    except TypeError:
        # `getattr` keeps a callable without `__name__` (e.g. a
        # `functools.partial`) from replacing the original TypeError with an
        # AttributeError raised inside this handler.
        can_fall_back = (
            getattr(op, "__name__", None) == "add"
            and self.dtype == String
            and (
                isinstance(other, str)
                or (isinstance(other, self.__class__) and other.dtype == String)
            )
        )
        if not can_fall_back:
            raise
        pdx = self.__native_namespace__()
        res = binary_string_sum_fallback(ser, other_native, pdx)
    return self._with_native(res, preserve_broadcast=preserve_broadcast).alias(
        self.name
    )

def _with_binary_right(self, op: Callable[..., pd.Series], other: Any) -> Self:
    """Apply ``op`` with the operands reversed, i.e. ``op(other, self)``.

    Delegates to ``_with_binary`` with a wrapper that swaps the two arguments,
    then aliases the result to ``self.name``.

    Arguments:
        op: Callable invoked as ``op(native_other, native_self)``.
        other: The left-hand operand of the reflected operation.

    Returns:
        A new series wrapping the result, named after ``self``.
    """
    # The wrapper's `__name__` is "<lambda>", not `op.__name__`, so any check
    # on the operator name inside `_with_binary` sees the wrapper's name.
    return self._with_binary(lambda x, y: op(y, x), other).alias(self.name)

def __eq__(self, other: object) -> Self: # type: ignore[override]
Expand Down
33 changes: 33 additions & 0 deletions narwhals/_pandas_like/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -708,3 +708,36 @@ def broadcast_series_to_index(
return series_class(pa_array, index=index, name=native.name)

return series_class(value, index=index, dtype=native.dtype, name=native.name)


def binary_string_sum_fallback(left: pd.Series, right: Any, pdx: Any) -> pd.Series:
    """Add ``right`` to the string series ``left``, reconciling their dtypes first.

    Workaround for some upstream issues:

    - https://github.com/pandas-dev/pandas/issues/64393
    - https://github.com/pandas-dev/pandas/issues/65220

    Arguments:
        left: Native left-hand series.
        right: Native right-hand operand (a scalar or a ``pdx.Series``).
        pdx: Native namespace; only its ``Series`` attribute is used, for the
            ``isinstance`` check on ``right``.

    Returns:
        The result of ``left + right`` after the casts described inline.
    """
    lhs_dtype = left.dtype
    lhs_dtype_name = str(lhs_dtype)

    if not isinstance(right, pdx.Series):
        if lhs_dtype_name == "large_string[pyarrow]" and isinstance(right, str):
            import pyarrow as pa  # ignore-banned-import

            # Wrap the Python string in a pyarrow scalar of the matching type.
            return left + pa.scalar(right, type=pa.large_string())
        return left + right  # pragma: no cover

    if lhs_dtype_name == "object":  # pragma: no cover
        # Only for pandas pre 3.0. Anything is better than `object`, so take RHS.
        return left.astype(right.dtype) + right

    both_expose_arrow = hasattr(left.values, "__arrow_array__") and hasattr(
        right.values, "__arrow_array__"
    )
    if both_expose_arrow:
        import pyarrow as pa  # ignore-banned-import

        lhs_arrow_type = left.values.__arrow_array__().type  # noqa: PD011 # type: ignore[attr-defined]
        rhs_arrow_type = right.values.__arrow_array__().type  # noqa: PD011 # type: ignore[attr-defined]
        if pa.types.is_string(lhs_arrow_type) and pa.types.is_large_string(
            rhs_arrow_type
        ):
            # https://github.com/pandas-dev/pandas/blob/b00d4f6710ff6c1c80319196657c31c2cf6c70ff/pandas/core/arrays/arrow/array.py#L1064-L1068
            widened = pd.ArrowDtype(pa.large_string())
            return left.astype(widened) + right.astype(widened)

    # Give precedence to the left-hand-side dtype.
    return left + right.astype(lhs_dtype)
117 changes: 117 additions & 0 deletions tests/expr_and_series/pandas_str_dtypes_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
from __future__ import annotations

from typing import Any

import pytest

import narwhals as nw
from tests.utils import assert_equal_data

pytest.importorskip("pandas", minversion="3.0.0")
pytest.importorskip("pyarrow")

import numpy as np
import pandas as pd
import pyarrow as pa

STRING_DTYPE_NAN = pd.StringDtype("pyarrow", na_value=np.nan) # type: ignore[call-arg]
STRING_DTYPE_NA = pd.StringDtype("pyarrow", na_value=pd.NA) # type: ignore[call-arg]
# Python-backed (non-pyarrow) storage variants of the two dtypes above.
STRING_DTYPE_PYTHON_NAN = pd.StringDtype("python", na_value=np.nan) # type: ignore[call-arg]
STRING_DTYPE_PYTHON_NA = pd.StringDtype("python", na_value=pd.NA) # type: ignore[call-arg]


@pytest.mark.parametrize(
    ("left_dtype", "right_dtype", "result_dtype"),
    [
        (STRING_DTYPE_NAN, STRING_DTYPE_NAN, STRING_DTYPE_NAN),
        (STRING_DTYPE_NAN, STRING_DTYPE_NA, STRING_DTYPE_NAN),
        (STRING_DTYPE_NAN, pd.ArrowDtype(pa.string()), STRING_DTYPE_NAN),
        (STRING_DTYPE_NAN, pd.ArrowDtype(pa.large_string()), STRING_DTYPE_NAN),
        (STRING_DTYPE_NAN, STRING_DTYPE_PYTHON_NAN, STRING_DTYPE_NAN),
        (STRING_DTYPE_NAN, STRING_DTYPE_PYTHON_NA, STRING_DTYPE_NAN),
        (STRING_DTYPE_NA, STRING_DTYPE_NAN, STRING_DTYPE_NA),
        (STRING_DTYPE_NA, STRING_DTYPE_NA, STRING_DTYPE_NA),
        (STRING_DTYPE_NA, pd.ArrowDtype(pa.string()), STRING_DTYPE_NA),
        (STRING_DTYPE_NA, pd.ArrowDtype(pa.large_string()), STRING_DTYPE_NA),
        (STRING_DTYPE_NA, STRING_DTYPE_PYTHON_NAN, STRING_DTYPE_NA),
        (STRING_DTYPE_NA, STRING_DTYPE_PYTHON_NA, STRING_DTYPE_NA),
        (pd.ArrowDtype(pa.string()), STRING_DTYPE_NAN, pd.ArrowDtype(pa.large_string())),
        (pd.ArrowDtype(pa.string()), STRING_DTYPE_NA, pd.ArrowDtype(pa.large_string())),
        (
            pd.ArrowDtype(pa.string()),
            pd.ArrowDtype(pa.string()),
            pd.ArrowDtype(pa.string()),
        ),
        (
            pd.ArrowDtype(pa.string()),
            pd.ArrowDtype(pa.large_string()),
            pd.ArrowDtype(pa.large_string()),
        ),
        (
            pd.ArrowDtype(pa.large_string()),
            STRING_DTYPE_NAN,
            pd.ArrowDtype(pa.large_string()),
        ),
        (
            pd.ArrowDtype(pa.large_string()),
            STRING_DTYPE_NA,
            pd.ArrowDtype(pa.large_string()),
        ),
        (
            pd.ArrowDtype(pa.large_string()),
            pd.ArrowDtype(pa.string()),
            pd.ArrowDtype(pa.large_string()),
        ),
        (
            pd.ArrowDtype(pa.large_string()),
            pd.ArrowDtype(pa.large_string()),
            pd.ArrowDtype(pa.large_string()),
        ),
        # Combinations involving the python-backed string dtypes.
        (STRING_DTYPE_PYTHON_NAN, STRING_DTYPE_PYTHON_NAN, STRING_DTYPE_PYTHON_NAN),
        (STRING_DTYPE_PYTHON_NAN, STRING_DTYPE_PYTHON_NA, STRING_DTYPE_PYTHON_NA),
        (STRING_DTYPE_PYTHON_NAN, STRING_DTYPE_NAN, STRING_DTYPE_NAN),
        (STRING_DTYPE_PYTHON_NAN, STRING_DTYPE_NA, STRING_DTYPE_NA),
        (STRING_DTYPE_PYTHON_NAN, pd.ArrowDtype(pa.string()), pd.ArrowDtype(pa.string())),
        (
            STRING_DTYPE_PYTHON_NAN,
            pd.ArrowDtype(pa.large_string()),
            pd.ArrowDtype(pa.large_string()),
        ),
        (STRING_DTYPE_PYTHON_NA, STRING_DTYPE_PYTHON_NAN, STRING_DTYPE_PYTHON_NA),
        (STRING_DTYPE_PYTHON_NA, STRING_DTYPE_PYTHON_NA, STRING_DTYPE_PYTHON_NA),
        (STRING_DTYPE_PYTHON_NA, STRING_DTYPE_NAN, STRING_DTYPE_PYTHON_NA),
        (STRING_DTYPE_PYTHON_NA, STRING_DTYPE_NA, STRING_DTYPE_NA),
        (STRING_DTYPE_PYTHON_NA, pd.ArrowDtype(pa.string()), pd.ArrowDtype(pa.string())),
        (
            STRING_DTYPE_PYTHON_NA,
            pd.ArrowDtype(pa.large_string()),
            pd.ArrowDtype(pa.large_string()),
        ),
        (pd.ArrowDtype(pa.string()), STRING_DTYPE_PYTHON_NAN, pd.ArrowDtype(pa.string())),
        (pd.ArrowDtype(pa.string()), STRING_DTYPE_PYTHON_NA, pd.ArrowDtype(pa.string())),
        (
            pd.ArrowDtype(pa.large_string()),
            STRING_DTYPE_PYTHON_NAN,
            pd.ArrowDtype(pa.large_string()),
        ),
        (
            pd.ArrowDtype(pa.large_string()),
            STRING_DTYPE_PYTHON_NA,
            pd.ArrowDtype(pa.large_string()),
        ),
    ],
)
def test_pandas_str_types(left_dtype: Any, right_dtype: Any, result_dtype: Any) -> None:
    """Check `concat_str` across pairs of pandas string dtypes.

    A two-row frame is built with a ``fruit`` column of ``left_dtype`` and a
    constant ``"!"`` column cast to ``right_dtype``. The two columns are joined
    with ``nw.concat_str``, and both the resulting values and the native dtype
    of the new column are asserted.
    """
    df = pd.DataFrame({"fruit": ["apple", "banana"]}, dtype=left_dtype)
    # Assign the scalar first, then cast, so the column ends up as `right_dtype`.
    df["new_str_col"] = "!"
    df["new_str_col"] = df["new_str_col"].astype(right_dtype)  # pyrefly: ignore[missing-attribute] https://github.com/facebook/pyrefly/issues/3299
    res = nw.from_native(df).with_columns(
        concat_col=nw.concat_str([nw.col("fruit"), nw.col("new_str_col")])
    )
    expected = {
        "fruit": ["apple", "banana"],
        "new_str_col": ["!", "!"],
        "concat_col": ["apple!", "banana!"],
    }
    assert_equal_data(res, expected)
    assert res.to_native()["concat_col"].dtype == result_dtype
Loading