Skip to content

Commit 08b2896

Browse files
williambdean, pre-commit-ci[bot], FBruzzesi, dangotbanned
authored
feat: Add support for Series|Expr.str.zfill (#2598)
* get pandas zfill to work
* add to the api-reference
* add tests for the zfill
* implement for other backends
* add the series tests
* add additional test cases
* add additional test cases
* implement for duckdb
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* support polars like zfill
* make test cases act like polars
* implement for ibis
* implement for spark_like
* add attempt at arrow
* add skip reason
  Co-authored-by: Francesco Bruzzesi <[email protected]>
* consolidate on inner function name
* add to docstring
* feat: Get `pyarrow` working
  Man they really make the simple things complex 😅]
* fix(typing): `ibis` casts
* fix(typing): `pyspark` ignore
* add dask implementation
* add exceptions and skips
* adapt pandas / stdlib zfill behavior for all
* correct the example in the docstring
* skip based on pandas version
* change the condition
* add expr_str example as well
* fixing the import
* implement for older polars versions
* add skipif for second test :tear:
* fix the doctests
* refactor logic to look like _spark_like
* pyarrow variable naming, typing and some misc
* pyarrow use repeat
* remove warnings
* chore(typing): Add missing `zfill`
  Would've been caught by #2500
* Update narwhals/_ibis/expr_str.py
  Co-authored-by: Dan Redding <[email protected]>
* Update narwhals/_ibis/expr_str.py
  Co-authored-by: Dan Redding <[email protected]>
* Update narwhals/_arrow/series_str.py
  Co-authored-by: Dan Redding <[email protected]>
* perf: Reuse `PolarsExpr` impl in `PolarsSeries`
  Resolves #2598 (comment)
* use xfail instead
* rename helper function

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Francesco Bruzzesi <[email protected]>
Co-authored-by: dangotbanned <[email protected]>
Co-authored-by: FBruzzesi <[email protected]>
1 parent 99222f6 commit 08b2896

File tree

15 files changed

+256
-4
lines changed

15 files changed

+256
-4
lines changed

docs/api-reference/expr_str.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,5 +18,6 @@
1818
- to_datetime
1919
- to_lowercase
2020
- to_uppercase
21+
- zfill
2122
show_source: false
2223
show_bases: false

docs/api-reference/series_str.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,5 +18,6 @@
1818
- to_datetime
1919
- to_lowercase
2020
- to_uppercase
21+
- zfill
2122
show_source: false
2223
show_bases: false

narwhals/_arrow/series_str.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,14 @@
33
import string
44
from typing import TYPE_CHECKING
55

6+
import pyarrow as pa
67
import pyarrow.compute as pc
78

89
from narwhals._arrow.utils import ArrowSeriesNamespace, lit, parse_datetime_format
910

1011
if TYPE_CHECKING:
1112
from narwhals._arrow.series import ArrowSeries
13+
from narwhals._arrow.typing import Incomplete
1214

1315

1416
class ArrowSeriesStringNamespace(ArrowSeriesNamespace):
@@ -60,3 +62,36 @@ def to_uppercase(self) -> ArrowSeries:
6062

6163
def to_lowercase(self) -> ArrowSeries:
6264
return self.with_native(pc.utf8_lower(self.native))
65+
66+
def zfill(self, width: int) -> ArrowSeries:
    """Left-pad every string with "0" up to `width` characters.

    A leading "-" or "+" is kept in front of the inserted zeros. Strings whose
    length is not less than `width` are returned unchanged.

    Arguments:
        width: Target length of each string after padding.

    Returns:
        A new series wrapping the padded values.
    """
    join_elementwise: Incomplete = pc.binary_join_element_wise
    values = self.native
    n_rows = len(values)
    minus_sign, plus_sign = lit("-"), lit("+")
    head = self.slice(0, 1).native
    tail = self.slice(1, None).native

    too_short = pc.less(pc.utf8_length(values), lit(width))

    # The tail is padded to `width - 1` so that re-attaching the one-character
    # sign brings the result to exactly `width`.
    padded_tail = pc.utf8_lpad(tail, width - 1, padding="0")

    def with_sign(sign: Incomplete) -> Incomplete:
        # Broadcast the scalar sign to one entry per row, then glue it to the tail.
        return join_elementwise(pa.repeat(sign, n_rows), padded_tail, "")

    # One boolean field per branch, in the same order as the values handed to
    # `case_when` below; the trailing value is the fallback.
    branch_mask = pc.make_struct(
        pc.and_(pc.equal(head, minus_sign), too_short),
        pc.and_(pc.equal(head, plus_sign), too_short),
        too_short,
    )
    padded = pc.case_when(
        branch_mask,
        with_sign(minus_sign),  # "-" prefix, shorter than width
        with_sign(plus_sign),  # "+" prefix, shorter than width
        pc.utf8_lpad(values, width=width, padding="0"),  # unsigned, shorter than width
        values,  # already long enough
    )
    return self.with_native(padded)

narwhals/_compliant/any_namespace.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ def split(self, by: str) -> CompliantT_co: ...
7979
def to_datetime(self, format: str | None) -> CompliantT_co: ...
8080
def to_lowercase(self) -> CompliantT_co: ...
8181
def to_uppercase(self) -> CompliantT_co: ...
82+
# Zero-pad strings on the left up to `width`; body-less stub (`...`) here.
def zfill(self, width: int) -> CompliantT_co: ...
8283

8384

8485
class StructNamespace(_StoresCompliant[CompliantT_co], Protocol[CompliantT_co]):

narwhals/_compliant/expr.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1127,6 +1127,9 @@ def to_lowercase(self) -> EagerExprT:
11271127
def to_uppercase(self) -> EagerExprT:
11281128
return self.compliant._reuse_series_namespace("str", "to_uppercase")
11291129

1130+
def zfill(self, width: int) -> EagerExprT:
    """Zero-pad strings by delegating to the series-level `str.zfill`.

    Arguments:
        width: Target length, forwarded as the `width` keyword.

    Returns:
        Whatever `_reuse_series_namespace` builds for `("str", "zfill")`.
    """
    compliant = self.compliant
    return compliant._reuse_series_namespace("str", "zfill", width=width)
1132+
11301133

11311134
class EagerExprStructNamespace(
11321135
EagerExprNamespace[EagerExprT], StructNamespace[EagerExprT], Generic[EagerExprT]

narwhals/_dask/expr_str.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,3 +96,8 @@ def to_lowercase(self) -> DaskExpr:
9696
return self._compliant_expr._with_callable(
9797
lambda expr: expr.str.lower(), "to_lowercase"
9898
)
99+
100+
def zfill(self, width: int) -> DaskExpr:
    """Zero-pad strings via the native `.str.zfill` accessor.

    Arguments:
        width: Target length, passed to the callable as the `width` keyword.

    Returns:
        The expression produced by `_with_callable` under the name "zfill".
    """

    def _zfill(expr, width):  # parameter names must match the keywords passed below
        return expr.str.zfill(width)

    return self._compliant_expr._with_callable(_zfill, "zfill", width=width)

narwhals/_duckdb/expr_str.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
from duckdb import FunctionExpression
66

7-
from narwhals._duckdb.utils import lit
7+
from narwhals._duckdb.utils import lit, when
88
from narwhals._utils import not_implemented
99

1010
if TYPE_CHECKING:
@@ -100,4 +100,31 @@ def to_datetime(self, format: str | None) -> DuckDBExpr:
100100
lambda expr: FunctionExpression("strptime", expr, lit(format))
101101
)
102102

103+
def zfill(self, width: int) -> DuckDBExpr:
    """Left-pad strings with "0" up to `width`, keeping a leading sign in front.

    Arguments:
        width: Target length of each string after padding.

    Returns:
        A new expression; strings not shorter than `width` pass through unchanged.
    """
    # There is no built-in zfill to call, so the result is assembled from
    # `length`, `starts_with`, `substr`, `lpad` and `concat`.

    def _zfill(expr: Expression) -> Expression:
        pad_char, minus_sign, plus_sign = lit("0"), lit("-"), lit("+")
        too_short = FunctionExpression("length", expr) < lit(width)
        has_minus = FunctionExpression("starts_with", expr, minus_sign)
        has_plus = FunctionExpression("starts_with", expr, plus_sign)

        # Everything from the second character on, padded to `width - 1` so that
        # putting the sign back in front yields exactly `width` characters.
        tail = FunctionExpression("substr", expr, lit(2))
        padded_tail = FunctionExpression("lpad", tail, lit(width - 1), pad_char)

        with_minus = FunctionExpression("concat", minus_sign, padded_tail)
        with_plus = FunctionExpression("concat", plus_sign, padded_tail)
        unsigned = FunctionExpression("lpad", expr, lit(width), pad_char)

        chain = when(has_minus & too_short, with_minus)
        chain = chain.when(has_plus & too_short, with_plus)
        chain = chain.when(too_short, unsigned)
        return chain.otherwise(expr)

    return self._compliant_expr._with_callable(_zfill)
129+
103130
replace = not_implemented()

narwhals/_ibis/expr_str.py

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
11
from __future__ import annotations
22

3-
from typing import TYPE_CHECKING, Any, Callable
3+
from typing import TYPE_CHECKING, Any, Callable, cast
44

5+
import ibis
6+
import ibis.expr.types as ir
57
from ibis.expr.datatypes import Timestamp
68

9+
from narwhals._ibis.utils import lit
710
from narwhals._utils import _is_naive_format, not_implemented
811

912
if TYPE_CHECKING:
10-
import ibis.expr.types as ir
11-
1213
from narwhals._ibis.expr import IbisExpr
1314

1415

@@ -100,4 +101,23 @@ def to_datetime(self, format: str | None) -> IbisExpr:
100101
fn = self._to_datetime_naive if _is_naive_format(format) else self._to_datetime
101102
return self._compliant_expr._with_callable(fn(format))
102103

104+
def zfill(self, width: int) -> IbisExpr:
    """Left-pad strings with "0" up to `width`, keeping a leading sign in front.

    Arguments:
        width: Target length of each string after padding.

    Returns:
        A new expression; strings not shorter than `width` pass through unchanged.
    """

    def _zfill(expr: ir.StringColumn) -> ir.Value:
        pad_char, minus_sign, plus_sign = "0", "-", "+"
        n_chars = expr.length()
        too_short = n_chars < lit(width)
        has_minus = expr.startswith(minus_sign)
        has_plus = expr.startswith(plus_sign)

        # Characters after the first one, zero-padded to `width - 1`; a final
        # `lpad` to `width` with the sign character then adds the sign back.
        start = cast("ir.IntegerScalar", lit(1))
        tail_length = cast("ir.IntegerValue", n_chars - start)
        padded_tail = expr.substr(start, tail_length).lpad(width - 1, pad_char)

        branches = (
            (has_minus & too_short, padded_tail.lpad(width, minus_sign)),
            (has_plus & too_short, padded_tail.lpad(width, plus_sign)),
            (too_short, expr.lpad(width, pad_char)),
        )
        return ibis.cases(*branches, else_=expr)

    return self._compliant_expr._with_callable(_zfill)
122+
103123
replace = not_implemented()

narwhals/_pandas_like/series_str.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,3 +77,6 @@ def to_uppercase(self) -> PandasLikeSeries:
7777

7878
def to_lowercase(self) -> PandasLikeSeries:
7979
return self.with_native(self.native.str.lower())
80+
81+
def zfill(self, width: int) -> PandasLikeSeries:
    """Zero-pad strings using the native `.str.zfill` accessor.

    Arguments:
        width: Target length handed to the native `zfill`.

    Returns:
        A new series wrapping the padded native values.
    """
    padded = self.native.str.zfill(width)
    return self.with_native(padded)

narwhals/_polars/expr.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,27 @@ class PolarsExprStringNamespace:
339339
def __init__(self, expr: PolarsExpr) -> None:
340340
self._compliant_expr = expr
341341

342+
def zfill(self, width: int) -> PolarsExpr:
    """Zero-pad strings on the left up to `width`.

    Arguments:
        width: Target length of each string after padding.

    Returns:
        A new expression wrapping the padded native expression.
    """
    compliant = self._compliant_expr
    column = compliant.native
    padded = column.str.zfill(width)

    if compliant._backend_version > (1, 30, 0):
        return compliant._with_native(padded)

    # Backend versions up to and including 1.30.0: override the native result for
    # strings that start with "+" and are shorter than `width`.
    # NOTE(review): presumably native `zfill` there does not keep a leading "+"
    # in front of the zeros - confirm against the polars changelog.
    plus_sign = "+"
    n_chars = column.str.len_chars()
    needs_override = column.str.starts_with(plus_sign) & (n_chars < width)
    # Drop the first character, pad the rest to `width - 1`, then put "+" back.
    rebuilt = (
        column.str.slice(1, n_chars).str.zfill(width - 1).str.pad_start(width, plus_sign)
    )
    padded = pl.when(needs_override).then(rebuilt).otherwise(padded)
    return compliant._with_native(padded)
362+
342363
def __getattr__(self, attr: str) -> Callable[[Any], PolarsExpr]:
343364
def func(*args: Any, **kwargs: Any) -> PolarsExpr:
344365
pos, kwds = extract_args_kwargs(args, kwargs)

narwhals/_polars/series.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -686,6 +686,12 @@ class PolarsSeriesStringNamespace:
686686
def __init__(self, series: PolarsSeries) -> None:
687687
self._compliant_series = series
688688

689+
def zfill(self, width: int) -> PolarsSeries:
    """Zero-pad strings by routing through the expression-level `str.zfill`.

    The series is turned into a one-column frame, the expression is selected
    on that column, and the column is pulled back out under the same name.

    Arguments:
        width: Target length forwarded to the expression's `str.zfill`.

    Returns:
        The resulting column of the selected frame.
    """
    series = self._compliant_series
    column_name = series.name
    namespace = series.__narwhals_namespace__()
    frame = series.to_frame()
    padded_expr = namespace.col(column_name).str.zfill(width)
    return frame.select(padded_expr).get_column(column_name)
694+
689695
def __getattr__(self, attr: str) -> Any:
690696
def func(*args: Any, **kwargs: Any) -> Any:
691697
pos, kwds = extract_args_kwargs(args, kwargs)

narwhals/_spark_like/expr_str.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,3 +113,30 @@ def to_datetime(self, format: str | None) -> SparkLikeExpr:
113113
return self._compliant_expr._with_callable(
114114
lambda expr: function(F.replace(expr, F.lit("T"), F.lit(" ")))
115115
)
116+
117+
def zfill(self, width: int) -> SparkLikeExpr:
    """Left-pad strings with "0" up to `width`, keeping a leading sign in front.

    Arguments:
        width: Target length of each string after padding.

    Returns:
        A new expression; strings not shorter than `width` pass through unchanged.
    """

    def _zfill(expr: Column) -> Column:
        F = self._compliant_expr._F  # noqa: N806

        minus_sign, plus_sign = F.lit("-"), F.lit("+")
        n_chars = F.length(expr)
        too_short = n_chars < width
        has_minus = F.startswith(expr, minus_sign)
        has_plus = F.startswith(expr, plus_sign)

        # Everything from position 2 on, i.e. the string without its first character.
        tail_length = n_chars - F.lit(1)
        # NOTE: `len` is annotated as `int`, but a column expression is passed
        # here, hence the ignore.
        tail = F.substring(expr, 2, tail_length)  # pyright: ignore[reportArgumentType]
        # Padded to `width - 1` so that the sign brings it back to `width`.
        padded_tail = F.lpad(tail, width - 1, "0")

        with_minus = F.concat(minus_sign, padded_tail)
        with_plus = F.concat(plus_sign, padded_tail)
        unsigned = F.lpad(expr, width, "0")

        chain = F.when(has_minus & too_short, with_minus)
        chain = chain.when(has_plus & too_short, with_plus)
        chain = chain.when(too_short, unsigned)
        return chain.otherwise(expr)

    return self._compliant_expr._with_callable(_zfill)

narwhals/expr_str.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -447,3 +447,34 @@ def to_lowercase(self) -> ExprT:
447447
return self._expr._with_elementwise_op(
448448
lambda plx: self._expr._to_compliant_expr(plx).str.to_lowercase()
449449
)
450+
451+
def zfill(self, width: int) -> ExprT:
    """Pad strings with zeros on the left until they reach a given width.

    A leading `+` or `-` stays in front of the inserted zeros.

    Arguments:
        width: The desired length of the string after padding. Strings that are
            already at least `width` characters long are left as they are.
            If `width` is less than 0, no padding is applied.

    Returns:
        A new expression.

    Examples:
        >>> import pandas as pd
        >>> import narwhals as nw
        >>> df_native = pd.DataFrame({"digits": ["+1", "-1", "1", None]})
        >>> df = nw.from_native(df_native)
        >>> df.with_columns(zfill_col=nw.col("digits").str.zfill(3))
        ┌──────────────────┐
        |Narwhals DataFrame|
        |------------------|
        |  digits zfill_col|
        |0     +1       +01|
        |1     -1       -01|
        |2      1       001|
        |3   None      None|
        └──────────────────┘
    """

    def _zfill(plx):
        # Resolve the compliant expression for this backend, then pad it.
        return self._expr._to_compliant_expr(plx).str.zfill(width)

    return self._expr._with_elementwise_op(_zfill)

narwhals/series_str.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -398,3 +398,28 @@ def to_datetime(self, format: str | None = None) -> SeriesT:
398398
return self._narwhals_series._with_compliant(
399399
self._narwhals_series._compliant_series.str.to_datetime(format=format)
400400
)
401+
402+
def zfill(self, width: int) -> SeriesT:
    r"""Pad strings with zeros on the left.

    A leading `+` or `-` stays in front of the inserted zeros.

    Arguments:
        width: The target width of the string. Strings shorter than this width
            are padded with zeros on the left.

    Returns:
        A new Series with strings padded with zeros on the left.

    Examples:
        >>> import pandas as pd
        >>> import narwhals as nw
        >>> s_native = pd.Series(["+1", "-23", "456", "123456"])
        >>> s = nw.from_native(s_native, series_only=True)
        >>> s.str.zfill(5).to_native()
        0     +0001
        1     -0023
        2     00456
        3    123456
        dtype: object
    """
    narwhals_series = self._narwhals_series
    padded = narwhals_series._compliant_series.str.zfill(width)
    return narwhals_series._with_compliant(padded)
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
from __future__ import annotations
2+
3+
import pytest
4+
5+
import narwhals as nw
6+
from tests.utils import PANDAS_VERSION, Constructor, ConstructorEager, assert_equal_data
7+
8+
# Input strings: "-"/"+" prefixed, unsigned, already at or over three
# characters, and a null.
data = {"a": ["-1", "+1", "1", "12", "123", "99999", "+9999", None]}
# Expected output of `str.zfill(3)` applied to `data["a"]`.
expected = {"a": ["-01", "+01", "001", "012", "123", "99999", "+9999", None]}
10+
11+
12+
def uses_pyarrow_backend(constructor: Constructor | ConstructorEager) -> bool:
    """Tell whether `constructor` is one of the pyarrow-backed constructors.

    The decision is made purely on the constructor's `__name__`.
    """
    pyarrow_backed_names = ("pandas_pyarrow_constructor", "modin_pyarrow_constructor")
    return constructor.__name__ in pyarrow_backed_names
17+
18+
19+
@pytest.mark.skipif(PANDAS_VERSION < (1, 5), reason="different zfill behavior")
def test_str_zfill(request: pytest.FixtureRequest, constructor: Constructor) -> None:
    """Expression-level `str.zfill(3)` on `data` must produce `expected`."""
    if uses_pyarrow_backend(constructor):
        # Known upstream limitation: mark as expected failure instead of skipping.
        xfail_reason = (
            "pandas with pyarrow backend doesn't support str.zfill, see "
            "https://github.com/pandas-dev/pandas/issues/61485"
        )
        request.applymarker(pytest.mark.xfail(reason=xfail_reason))

    frame = nw.from_native(constructor(data))
    padded = frame.select(nw.col("a").str.zfill(3))
    assert_equal_data(padded, expected)
31+
32+
33+
@pytest.mark.skipif(PANDAS_VERSION < (1, 5), reason="different zfill behavior")
def test_str_zfill_series(
    request: pytest.FixtureRequest, constructor_eager: ConstructorEager
) -> None:
    """Series-level `str.zfill(3)` on `data["a"]` must produce `expected`."""
    if uses_pyarrow_backend(constructor_eager):
        # Known upstream limitation: mark as expected failure instead of skipping.
        xfail_reason = (
            "pandas with pyarrow backend doesn't support str.zfill, see "
            "https://github.com/pandas-dev/pandas/issues/61485"
        )
        request.applymarker(pytest.mark.xfail(reason=xfail_reason))

    frame = nw.from_native(constructor_eager(data), eager_only=True)
    padded = frame["a"].str.zfill(3)
    assert_equal_data({"a": padded}, expected)

0 commit comments

Comments
 (0)