Skip to content

Commit 7503537

Browse files
Gabe SmallGabe Small
authored and committed
BUG: Fix pandas-dev#61221: Exception with unstack(sort=False) and NA in index.
1 parent 1f1e13f commit 7503537

File tree

2 files changed

+97
-3
lines changed

2 files changed

+97
-3
lines changed

pandas/core/reshape/reshape.py

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,11 @@ def __init__(
134134
self.removed_level_full = index.levels[self.level]
135135
if not self.sort:
136136
unique_codes = unique(self.index.codes[self.level])
137+
# Bug Fix GH 61221
138+
# The -1 (NA) entry in the unsorted unique codes causes doubling and an eventual ValueError,
139+
# so save the NA position for use in the repeater and drop -1 from the codes.
140+
self.na = np.where(unique_codes == -1)[0][0] if -1 in unique_codes else None
141+
unique_codes = unique_codes[unique_codes != -1]
137142
self.removed_level = self.removed_level.take(unique_codes)
138143
self.removed_level_full = self.removed_level_full.take(unique_codes)
139144

@@ -381,11 +386,22 @@ def _repeater(self) -> np.ndarray:
381386
# In this case, we remap the new codes to the original level:
382387
repeater = self.removed_level_full.get_indexer(self.removed_level)
383388
if self.lift:
384-
repeater = np.insert(repeater, 0, -1)
389+
if not self.sort and self.na:
390+
repeater = np.insert(repeater, self.na, -1)
391+
else:
392+
repeater = np.insert(repeater, 0, -1)
385393
else:
386394
# Otherwise, we just use each level item exactly once:
387395
stride = len(self.removed_level) + self.lift
388-
repeater = np.arange(stride) - self.lift
396+
if self.sort or not self.na:
397+
repeater = np.arange(stride) - self.lift
398+
else :
399+
# Move the -1 to the position given by self.na
400+
repeater = np.arange(stride)
401+
if(self.na):
402+
repeater[self.na] = -1
403+
if(self.na + 1) < len(repeater):
404+
repeater[self.na + 1:] -= 1
389405

390406
return repeater
391407

@@ -1049,7 +1065,7 @@ def stack_reshape(
10491065
else:
10501066
data.columns = default_index(len(data.columns))
10511067
buf.append(data)
1052-
1068+
10531069
if len(buf) > 0 and not frame.empty:
10541070
result = concat(buf, ignore_index=True)
10551071
else:

pandas/tests/frame/test_stack_unstack.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1605,6 +1605,84 @@ def test_stack_sort_false(future_stack):
16051605
tm.assert_frame_equal(result, expected)
16061606

16071607

1608+
def assert_na_safe_equal(left, right):
    """
    Assert that two DataFrames are equal, ignoring the flavour of NA label.

    Both frames have any ``pd.NA`` label on column level 1 renamed to
    ``np.nan`` before they are compared, and the comparison itself is made
    with ``check_dtype=False``.
    """
    na_to_nan = {pd.NA: np.nan}
    # Normalise both sides the same way so only real differences are reported.
    normalised = [frame.rename(columns=na_to_nan, level=1) for frame in (left, right)]
    tm.assert_frame_equal(*normalised, check_dtype=False)
1613+
1614+
def test_unstack_sort_false_na():
    """
    Check ``unstack(sort=False)`` when the unstacked level contains NA.

    For each ordering of the ``Int64`` level (NA last, first, and in the
    middle) the unstacked columns are expected to keep that ordering, with
    the NA column staying where it appeared in the index.
    """
    # GH 61221
    levels1 = ["b", "a"]
    na_label = pd.Int64Dtype().na_value
    level2_orderings = [
        [1, 2, 3, pd.NA],
        [pd.NA, 1, 2, 3],
        [1, pd.NA, 2, 3],
        [3, pd.NA, 1, 2],
    ]

    for ordering in level2_orderings:
        levels2 = pd.Index(ordering, dtype=pd.Int64Dtype())
        index = pd.MultiIndex.from_product(
            [levels1, levels2], names=["level1", "level2"]
        )
        df = pd.DataFrame({"value": range(len(index))}, index=index)

        result = df.unstack(level="level2", sort=False)

        # One output column per level2 label, in the order the labels appear.
        column_keys = [
            ("value", na_label if label is pd.NA else label) for label in ordering
        ]
        # The frame holds 0..n-1 in index order, so the column at position
        # ``pos`` holds ``pos`` for row "b" and ``pos + len(ordering)`` for "a".
        expected = pd.DataFrame(
            {key: [pos, pos + len(ordering)] for pos, key in enumerate(column_keys)},
            index=pd.Index(levels1, name="level1"),
            columns=pd.MultiIndex.from_tuples(column_keys, names=[None, "level2"]),
        )
        assert_na_safe_equal(result, expected)
1685+
16081686
@pytest.mark.filterwarnings("ignore:The previous implementation of stack is deprecated")
16091687
def test_stack_sort_false_multi_level(future_stack):
16101688
# GH 15105

0 commit comments

Comments
 (0)