@@ -16,6 +16,7 @@
 import numpy as np
 
 from pandas._libs import lib
+from pandas._libs.missing import NA
 from pandas._libs.tslibs import (
     Timedelta,
     Timestamp,
@@ -360,7 +361,7 @@ def _from_sequence_of_strings(
             # duration to string casting behavior
             mask = isna(scalars)
             if not isinstance(strings, (pa.Array, pa.ChunkedArray)):
-                strings = pa.array(strings, type=pa.string(), from_pandas=True)
+                strings = pa.array(strings, type=pa.string())
             strings = pc.if_else(mask, None, strings)
             try:
                 scalars = strings.cast(pa.int64())
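
For context, a minimal sketch (illustrative values, not part of the patch) of what dropping from_pandas changes in pa.array: with from_pandas=True, NaN is converted to null, while without it NaN is kept as an ordinary float value. The explicit pc.if_else(mask, None, strings) above nulls out missing entries either way, so the keyword is no longer needed in this branch.

import pyarrow as pa

vals = [1.5, float("nan"), 3.0]
print(pa.array(vals))                    # double array: 1.5, nan, 3 (NaN kept as a value)
print(pa.array(vals, from_pandas=True))  # double array: 1.5, null, 3 (NaN treated as NA)
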
@@ -381,7 +382,7 @@ def _from_sequence_of_strings(
             if isinstance(strings, (pa.Array, pa.ChunkedArray)):
                 scalars = strings
             else:
-                scalars = pa.array(strings, type=pa.string(), from_pandas=True)
+                scalars = pa.array(strings, type=pa.string())
             scalars = pc.if_else(pc.equal(scalars, "1.0"), "1", scalars)
             scalars = pc.if_else(pc.equal(scalars, "0.0"), "0", scalars)
             scalars = scalars.cast(pa.bool_())
@@ -393,6 +394,13 @@ def _from_sequence_of_strings(
             from pandas.core.tools.numeric import to_numeric
 
             scalars = to_numeric(strings, errors="raise")
+            if not pa.types.is_decimal(pa_type):
+                # TODO: figure out why doing this cast breaks with decimal dtype
+                #  in test_from_sequence_of_strings_pa_array
+                mask = strings.is_null()
+                scalars = pa.array(scalars, mask=np.array(mask), type=pa_type)
+                # TODO: could we just do strings.cast(pa_type)?
+
         else:
             raise NotImplementedError(
                 f"Converting strings to {pa_type} is not implemented."
@@ -435,7 +443,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
         """
         if isinstance(value, pa.Scalar):
             pa_scalar = value
-        elif isna(value):
+        elif isna(value) and not lib.is_float(value):
             pa_scalar = pa.scalar(None, type=pa_type)
         else:
             # Workaround https://github.com/apache/arrow/issues/37291
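
The added lib.is_float check means a float NaN no longer short-circuits to a null scalar; only non-float NA-likes (None, pd.NA, pd.NaT) do. A minimal sketch (not from the patch) of the resulting distinction at the pyarrow level:

import pyarrow as pa

print(pa.scalar(None, type=pa.float64()))  # null double scalar
print(pa.scalar(float("nan")))             # NaN double scalar, i.e. a real value, not null
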
@@ -452,7 +460,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
                     value = value.as_unit(pa_type.unit)
                 value = value._value
 
-            pa_scalar = pa.scalar(value, type=pa_type, from_pandas=True)
+            pa_scalar = pa.scalar(value, type=pa_type)
 
         if pa_type is not None and pa_scalar.type != pa_type:
             pa_scalar = pa_scalar.cast(pa_type)
@@ -484,6 +492,13 @@ def _box_pa_array(
             if copy:
                 value = value.copy()
             pa_array = value.__arrow_array__()
+
+        elif hasattr(value, "__arrow_array__"):
+            # e.g. StringArray
+            if copy:
+                value = value.copy()
+            pa_array = value.__arrow_array__()
+
         else:
             if (
                 isinstance(value, np.ndarray)
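
The new elif branch relies on the __arrow_array__ protocol: any object exposing it can hand pyarrow a ready-made Array (pandas' StringArray is one example, per the comment). A minimal sketch with a hypothetical class, not part of the patch:

import pyarrow as pa

class MyColumn:
    def __init__(self, data):
        self._data = list(data)

    def __arrow_array__(self, type=None):
        # pa.array(obj) calls this hook when the object provides it
        return pa.array(self._data, type=type)

print(pa.array(MyColumn(["a", "b", None])))  # string array with one null
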
@@ -510,19 +525,32 @@ def _box_pa_array(
                 value = to_timedelta(value, unit=pa_type.unit).as_unit(pa_type.unit)
                 value = value.to_numpy()
 
+            mask = None
+            if getattr(value, "dtype", None) is None or value.dtype.kind not in "mfM":
+                # similar to isna(value) but exclude NaN
+                # TODO: cythonize!
+                mask = np.array([x is NA or x is None for x in value], dtype=bool)
+
+            from_pandas = False
+            if pa.types.is_integer(pa_type):
+                # If user specifically asks to cast a numpy float array with NaNs
+                # to pyarrow integer, we'll treat those NaNs as NA
+                from_pandas = True
             try:
-                pa_array = pa.array(value, type=pa_type, from_pandas=True)
+                pa_array = pa.array(
+                    value, type=pa_type, mask=mask, from_pandas=from_pandas
+                )
             except (pa.ArrowInvalid, pa.ArrowTypeError):
                 # GH50430: let pyarrow infer type, then cast
-                pa_array = pa.array(value, from_pandas=True)
+                pa_array = pa.array(value, mask=mask, from_pandas=from_pandas)
 
             if pa_type is None and pa.types.is_duration(pa_array.type):
                 # Workaround https://github.com/apache/arrow/issues/37291
                 from pandas.core.tools.timedeltas import to_timedelta
 
                 value = to_timedelta(value)
                 value = value.to_numpy()
-                pa_array = pa.array(value, type=pa_type, from_pandas=True)
+                pa_array = pa.array(value, type=pa_type)
 
             if pa.types.is_duration(pa_array.type) and pa_array.null_count > 0:
                 # GH52843: upstream bug for duration types when originally
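
The block above is the heart of the change: pd.NA and None are always converted to null through an explicit mask, while float NaN is only treated as missing when the caller explicitly requests an integer pyarrow type. A rough sketch of that intent (assumed inputs, not part of the patch):

import numpy as np
import pandas as pd
import pyarrow as pa

floats = np.array([1.0, np.nan, 3.0])
# NaN stays a value when targeting a float type...
print(pa.array(floats, type=pa.float64()))                  # 1, nan, 3
# ...but is treated as NA when an integer type is requested (the from_pandas=True path).
print(pa.array(floats, type=pa.int64(), from_pandas=True))  # 1, null, 3

# pd.NA / None in object data are marked missing via the explicit mask.
obj = np.array([1, pd.NA, None], dtype=object)
mask = np.array([x is pd.NA or x is None for x in obj], dtype=bool)
print(pa.array(obj, mask=mask, type=pa.int64()))            # 1, null, null
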
@@ -1169,7 +1197,7 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
         if not len(values):
             return np.zeros(len(self), dtype=bool)
 
-        result = pc.is_in(self._pa_array, value_set=pa.array(values, from_pandas=True))
+        result = pc.is_in(self._pa_array, value_set=pa.array(values))
         # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
         # to False
         return np.array(result, dtype=np.bool_)
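
pc.is_in tests each element of the array for membership in value_set, which is the operation being fed here. A small usage sketch with assumed values, not from the patch:

import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array([1, 2, 3, 4])
print(pc.is_in(arr, value_set=pa.array([2, 4])))  # false, true, false, true
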
@@ -2002,7 +2030,7 @@ def __setitem__(self, key, value) -> None:
                 raise ValueError("Length of indexer and values mismatch")
             chunks = [
                 *self._pa_array[:key].chunks,
-                pa.array([value], type=self._pa_array.type, from_pandas=True),
+                pa.array([value], type=self._pa_array.type),
                 *self._pa_array[key + 1 :].chunks,
             ]
             data = pa.chunked_array(chunks).combine_chunks()
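
The scalar __setitem__ path above splices a one-element array between the chunks on either side of the target position and then merges everything back into a single chunk. A minimal standalone sketch with assumed data, not from the patch:

import pyarrow as pa

data = pa.chunked_array([pa.array([10, 20, 30, 40])])
key, value = 2, 99
chunks = [
    *data[:key].chunks,
    pa.array([value], type=data.type),
    *data[key + 1 :].chunks,
]
print(pa.chunked_array(chunks).combine_chunks())  # values: 10, 20, 99, 40
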
@@ -2056,7 +2084,7 @@ def _rank_calc(
                 pa_type = pa.float64()
             else:
                 pa_type = pa.uint64()
-            result = pa.array(ranked, type=pa_type, from_pandas=True)
+            result = pa.array(ranked, type=pa_type)
             return result
 
         data = self._pa_array.combine_chunks()
@@ -2308,7 +2336,7 @@ def _to_numpy_and_type(value) -> tuple[np.ndarray, pa.DataType | None]:
         right, right_type = _to_numpy_and_type(right)
         pa_type = left_type or right_type
         result = np.where(cond, left, right)
-        return pa.array(result, type=pa_type, from_pandas=True)
+        return pa.array(result, type=pa_type)
 
     @classmethod
     def _replace_with_mask(
@@ -2351,7 +2379,7 @@ def _replace_with_mask(
             replacements = replacements.as_py()
         result = np.array(values, dtype=object)
         result[mask] = replacements
-        return pa.array(result, type=values.type, from_pandas=True)
+        return pa.array(result, type=values.type)
 
     # ------------------------------------------------------------------
     # GroupBy Methods
@@ -2430,7 +2458,7 @@ def _groupby_op(
             return type(self)(pa_result)
         else:
             # DatetimeArray, TimedeltaArray
-            pa_result = pa.array(result, from_pandas=True)
+            pa_result = pa.array(result)
             return type(self)(pa_result)
 
     def _apply_elementwise(self, func: Callable) -> list[list[Any]]: