Skip to content

BUG: Error when repr-ing nested DataFrames #58911

Closed
@huzecong

Description

@huzecong

Pandas version checks

  • I have checked that this issue has not already been reported.

  • I have confirmed this bug exists on the latest version of pandas.

  • I have confirmed this bug exists on the main branch of pandas.

Reproducible Example

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
df_outer = pd.DataFrame({"a": [{"x": df}]})
print(df_outer)

Issue Description

The above code crashed with the stack trace:

Stack trace

---------------------------------------------------------------------------
StopIteration                             Traceback (most recent call last)
Cell In[1], line 5
      3 df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
      4 df_outer = pd.DataFrame({"a": [{"x": df}]})
----> 5 print(df_outer)

File ~/Library/Python/3.10/lib/python/site-packages/pandas/core/frame.py:1214, in DataFrame.__repr__(self)
   1211     return buf.getvalue()
   1213 repr_params = fmt.get_dataframe_repr_params()
-> 1214 return self.to_string(**repr_params)

File ~/Library/Python/3.10/lib/python/site-packages/pandas/util/_decorators.py:333, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
    327 if len(args) > num_allow_args:
    328     warnings.warn(
    329         msg.format(arguments=_format_argument_list(allow_args)),
    330         FutureWarning,
    331         stacklevel=find_stack_level(),
    332     )
--> 333 return func(*args, **kwargs)

File ~/Library/Python/3.10/lib/python/site-packages/pandas/core/frame.py:1394, in DataFrame.to_string(self, buf, columns, col_space, header, index, na_rep, formatters, float_format, sparsify, index_names, justify, max_rows, max_cols, show_dimensions, decimal, line_width, min_rows, max_colwidth, encoding)
   1375 with option_context("display.max_colwidth", max_colwidth):
   1376     formatter = fmt.DataFrameFormatter(
   1377         self,
   1378         columns=columns,
   (...)
   1392         decimal=decimal,
   1393     )
-> 1394     return fmt.DataFrameRenderer(formatter).to_string(
   1395         buf=buf,
   1396         encoding=encoding,
   1397         line_width=line_width,
   1398     )

File ~/Library/Python/3.10/lib/python/site-packages/pandas/io/formats/format.py:962, in DataFrameRenderer.to_string(self, buf, encoding, line_width)
    959 from pandas.io.formats.string import StringFormatter
    961 string_formatter = StringFormatter(self.fmt, line_width=line_width)
--> 962 string = string_formatter.to_string()
    963 return save_to_buffer(string, buf=buf, encoding=encoding)

File ~/Library/Python/3.10/lib/python/site-packages/pandas/io/formats/string.py:29, in StringFormatter.to_string(self)
     28 def to_string(self) -> str:
---> 29     text = self._get_string_representation()
     30     if self.fmt.should_show_dimensions:
     31         text = f"{text}{self.fmt.dimensions_info}"

File ~/Library/Python/3.10/lib/python/site-packages/pandas/io/formats/string.py:44, in StringFormatter._get_string_representation(self)
     41 if self.fmt.frame.empty:
     42     return self._empty_info_line
---> 44 strcols = self._get_strcols()
     46 if self.line_width is None:
     47     # no need to wrap around just print the whole frame
     48     return self.adj.adjoin(1, *strcols)

File ~/Library/Python/3.10/lib/python/site-packages/pandas/io/formats/string.py:35, in StringFormatter._get_strcols(self)
     34 def _get_strcols(self) -> list[list[str]]:
---> 35     strcols = self.fmt.get_strcols()
     36     if self.fmt.is_truncated:
     37         strcols = self._insert_dot_separators(strcols)

File ~/Library/Python/3.10/lib/python/site-packages/pandas/io/formats/format.py:476, in DataFrameFormatter.get_strcols(self)
    472 def get_strcols(self) -> list[list[str]]:
    473     """
    474     Render a DataFrame to a list of columns (as lists of strings).
    475     """
--> 476     strcols = self._get_strcols_without_index()
    478     if self.index:
    479         str_index = self._get_formatted_index(self.tr_frame)

File ~/Library/Python/3.10/lib/python/site-packages/pandas/io/formats/format.py:740, in DataFrameFormatter._get_strcols_without_index(self)
    736 cheader = str_columns[i]
    737 header_colwidth = max(
    738     int(self.col_space.get(c, 0)), *(self.adj.len(x) for x in cheader)
    739 )
--> 740 fmt_values = self.format_col(i)
    741 fmt_values = _make_fixed_width(
    742     fmt_values, self.justify, minimum=header_colwidth, adj=self.adj
    743 )
    745 max_len = max(*(self.adj.len(x) for x in fmt_values), header_colwidth)

File ~/Library/Python/3.10/lib/python/site-packages/pandas/io/formats/format.py:754, in DataFrameFormatter.format_col(self, i)
    752 frame = self.tr_frame
    753 formatter = self._get_formatter(i)
--> 754 return format_array(
    755     frame.iloc[:, i]._values,
    756     formatter,
    757     float_format=self.float_format,
    758     na_rep=self.na_rep,
    759     space=self.col_space.get(frame.columns[i]),
    760     decimal=self.decimal,
    761     leading_space=self.index,
    762 )

File ~/Library/Python/3.10/lib/python/site-packages/pandas/io/formats/format.py:1161, in format_array(values, formatter, float_format, na_rep, digits, space, justify, decimal, leading_space, quoting, fallback_formatter)
   1145     digits = get_option("display.precision")
   1147 fmt_obj = fmt_klass(
   1148     values,
   1149     digits=digits,
   (...)
   1158     fallback_formatter=fallback_formatter,
   1159 )
-> 1161 return fmt_obj.get_result()

File ~/Library/Python/3.10/lib/python/site-packages/pandas/io/formats/format.py:1194, in _GenericArrayFormatter.get_result(self)
   1193 def get_result(self) -> list[str]:
-> 1194     fmt_values = self._format_strings()
   1195     return _make_fixed_width(fmt_values, self.justify)

File ~/Library/Python/3.10/lib/python/site-packages/pandas/io/formats/format.py:1259, in _GenericArrayFormatter._format_strings(self)
   1257 for i, v in enumerate(vals):
   1258     if (not is_float_type[i] or self.formatter is not None) and leading_space:
-> 1259         fmt_values.append(f" {_format(v)}")
   1260     elif is_float_type[i]:
   1261         fmt_values.append(float_format(v))

File ~/Library/Python/3.10/lib/python/site-packages/pandas/io/formats/format.py:1239, in _GenericArrayFormatter._format_strings.<locals>._format(x)
   1236     return repr(x)
   1237 else:
   1238     # object dtype
-> 1239     return str(formatter(x))

File ~/Library/Python/3.10/lib/python/site-packages/pandas/io/formats/printing.py:219, in pprint_thing(thing, _nest_lvl, escape_chars, default_escapes, quote_strings, max_seq_items)
    215     return str(thing)
    216 elif isinstance(thing, dict) and _nest_lvl < get_option(
    217     "display.pprint_nest_depth"
    218 ):
--> 219     result = _pprint_dict(
    220         thing, _nest_lvl, quote_strings=True, max_seq_items=max_seq_items
    221     )
    222 elif is_sequence(thing) and _nest_lvl < get_option("display.pprint_nest_depth"):
    223     result = _pprint_seq(
    224         thing,
    225         _nest_lvl,
   (...)
    228         max_seq_items=max_seq_items,
    229     )

File ~/Library/Python/3.10/lib/python/site-packages/pandas/io/formats/printing.py:155, in _pprint_dict(seq, _nest_lvl, max_seq_items, **kwds)
    149     nitems = max_seq_items or get_option("max_seq_items") or len(seq)
    151 for k, v in list(seq.items())[:nitems]:
    152     pairs.append(
    153         pfmt.format(
    154             key=pprint_thing(k, _nest_lvl + 1, max_seq_items=max_seq_items, **kwds),
--> 155             val=pprint_thing(v, _nest_lvl + 1, max_seq_items=max_seq_items, **kwds),
    156         )
    157     )
    159 if nitems < len(seq):
    160     return fmt.format(things=", ".join(pairs) + ", ...")

File ~/Library/Python/3.10/lib/python/site-packages/pandas/io/formats/printing.py:223, in pprint_thing(thing, _nest_lvl, escape_chars, default_escapes, quote_strings, max_seq_items)
    219     result = _pprint_dict(
    220         thing, _nest_lvl, quote_strings=True, max_seq_items=max_seq_items
    221     )
    222 elif is_sequence(thing) and _nest_lvl < get_option("display.pprint_nest_depth"):
--> 223     result = _pprint_seq(
    224         thing,
    225         _nest_lvl,
    226         escape_chars=escape_chars,
    227         quote_strings=quote_strings,
    228         max_seq_items=max_seq_items,
    229     )
    230 elif isinstance(thing, str) and quote_strings:
    231     result = f"'{as_escaped_string(thing)}'"

File ~/Library/Python/3.10/lib/python/site-packages/pandas/io/formats/printing.py:120, in _pprint_seq(seq, _nest_lvl, max_seq_items, **kwds)
    118 s = iter(seq)
    119 # handle sets, no slicing
--> 120 r = [
    121     pprint_thing(next(s), _nest_lvl + 1, max_seq_items=max_seq_items, **kwds)
    122     for i in range(min(nitems, len(seq)))
    123 ]
    124 body = ", ".join(r)
    126 if nitems < len(seq):

File ~/Library/Python/3.10/lib/python/site-packages/pandas/io/formats/printing.py:121, in <listcomp>(.0)
    118 s = iter(seq)
    119 # handle sets, no slicing
    120 r = [
--> 121     pprint_thing(next(s), _nest_lvl + 1, max_seq_items=max_seq_items, **kwds)
    122     for i in range(min(nitems, len(seq)))
    123 ]
    124 body = ", ".join(r)
    126 if nitems < len(seq):

StopIteration:

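My interpretation is that this happens because pandas treats the nested DataFrame as a normal sequence and tries to iterate over it. For a DataFrame, however, len(df) != len(list(df)): the former is the number of rows and the latter is the number of columns. In the example, df has 3 rows but only 2 columns, so _pprint_seq calls next(s) three times on an iterator that yields only two items, and the third call raises StopIteration.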

This issue is essentially the same as #49195, but that issue was incorrectly triaged as being an issue with an external library.

Expected Behavior

Any reasonable repr output. Should not crash.

Installed Versions


INSTALLED VERSIONS
------------------
commit                : d9cdd2ee5a58015ef6f4d15c7226110c9aab8140
python                : 3.10.8.final.0
python-bits           : 64
OS                    : Darwin
OS-release            : 23.5.0
Version               : Darwin Kernel Version 23.5.0: Wed May  1 20:12:58 PDT 2024; root:xnu-10063.121.3~5/RELEASE_ARM64_T6000
machine               : arm64
processor             : arm
byteorder             : little
LC_ALL                : en_US.UTF-8
LANG                  : en_US.UTF-8
LOCALE                : en_US.UTF-8

pandas                : 2.2.2
numpy                 : 1.26.4
pytz                  : 2024.1
dateutil              : 2.9.0.post0
setuptools            : 63.2.0
pip                   : 23.2.1
Cython                : None
pytest                : 7.4.2
hypothesis            : None
sphinx                : None
blosc                 : None
feather               : None
xlsxwriter            : None
lxml.etree            : None
html5lib              : None
pymysql               : None
psycopg2              : None
jinja2                : None
IPython               : 8.14.0
pandas_datareader     : None
adbc-driver-postgresql: None
adbc-driver-sqlite    : None
bs4                   : None
bottleneck            : None
dataframe-api-compat  : None
fastparquet           : None
fsspec                : None
gcsfs                 : None
matplotlib            : None
numba                 : None
numexpr               : None
odfpy                 : None
openpyxl              : None
pandas_gbq            : None
pyarrow               : None
pyreadstat            : None
python-calamine       : None
pyxlsb                : None
s3fs                  : None
scipy                 : None
sqlalchemy            : None
tables                : None
tabulate              : 0.9.0
xarray                : None
xlrd                  : None
zstandard             : None
tzdata                : 2024.1
qtpy                  : None
pyqt5                 : None

Metadata

Metadata

Assignees

No one assigned

    Labels

    Bug; Needs Triage (Issue that has not been reviewed by a pandas team member)

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions