diff --git a/pandas/io/common.py b/pandas/io/common.py
index 1a9e6b472463d..bf8f7a4320bad 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -10,6 +10,7 @@
 from collections import defaultdict
 from collections.abc import (
     Hashable,
+    Iterable,
     Mapping,
     Sequence,
 )
@@ -26,7 +27,10 @@
 )
 import mmap
 import os
-from pathlib import Path
+from pathlib import (
+    Path,
+    PurePosixPath,
+)
 import re
 import tarfile
 from typing import (
@@ -42,6 +46,7 @@
     overload,
 )
 from urllib.parse import (
+    unquote,
     urljoin,
     urlparse as parse_url,
     uses_netloc,
@@ -55,6 +60,7 @@
     BaseBuffer,
     ReadCsvBuffer,
 )
+from pandas.compat import is_platform_windows
 from pandas.compat._optional import import_optional_dependency
 from pandas.util._decorators import doc
 from pandas.util._exceptions import find_stack_level
@@ -1282,3 +1288,157 @@ def dedup_names(
         counts[col] = cur_count + 1
 
     return names
+
+
+def _infer_protocol(path: str) -> str:
+    # Treat Windows drive letters like C:\ as local file paths
+    if is_platform_windows() and re.match(r"^[a-zA-Z]:[\\/]", path):
+        return "file"
+
+    if is_fsspec_url(path) or path.startswith("http"):
+        parsed = parse_url(path)
+        return parsed.scheme
+    return "file"
+
+
+def _match_file(
+    path: Path | PurePosixPath, extensions: set[str] | None, glob: str | None
+) -> bool:
+    """
+    Check if the file matches the given extensions and glob pattern.
+
+    Parameters
+    ----------
+    path : Path or PurePosixPath
+        The file path to check.
+    extensions : set[str] or None
+        A set of file extensions to match against. If None, any extension matches.
+    glob : str or None
+        A glob pattern to match against. If None, any name matches.
+
+    Returns
+    -------
+    bool
+        True if the file matches the extensions and glob pattern, False otherwise.
+    """
+    return (extensions is None or path.suffix.lower() in extensions) and (
+        glob is None or path.match(glob)
+    )
+
+
+def _resolve_local_path(path_str: str) -> Path:
+    parsed = parse_url(path_str)
+    if is_platform_windows():
+        if parsed.netloc:
+            return Path(f"//{parsed.netloc}{unquote(parsed.path)}")
+
+        path = unquote(parsed.path)
+        if path.startswith("\\") and not path.startswith("\\\\"):
+            drive = os.path.splitdrive(os.getcwd())[0]
+            return Path(drive + path)
+
+        return Path(path)
+    return Path(unquote(parsed.path))
+
+
+def iterdir(
+    path: FilePath | BaseBuffer,
+    extensions: str | Iterable[str] | None = None,
+    glob: str | None = None,
+) -> list[str | Path] | BaseBuffer:
+    """Return file paths in a directory (no nesting allowed).
+
+    Supports:
+    - Local paths (str, os.PathLike)
+    - file:// URLs
+    - Remote paths (e.g., s3://) via fsspec (if installed)
+
+    Parameters
+    ----------
+    path : FilePath or BaseBuffer
+        Path to the directory (local or remote).
+    extensions : str or iterable of str, optional
+        Only return files with the given extension(s). Case-insensitive.
+        If None, all files are returned.
+    glob : str, optional
+        Only return files matching the given glob pattern.
+        If None, all files are returned.
+
+    Returns
+    -------
+    list of str or Path, or BaseBuffer
+        If `path` is a file-like object, returns it directly.
+        Otherwise, returns a list of file paths in the directory.
+
+    Raises
+    ------
+    TypeError
+        If the given path is neither path-like nor file-like.
+    NotADirectoryError
+        If the given path is not a directory.
+    ImportError
+        If fsspec is required but not installed.
+ """ + if hasattr(path, "read") or hasattr(path, "write"): + return path + + if not isinstance(path, (str, os.PathLike)): + raise TypeError( + f"Expected file path name or file-like object, got {type(path)} type" + ) + + if extensions is not None: + if isinstance(extensions, str): + extensions = {extensions.lower()} + else: + extensions = {ext.lower() for ext in extensions} + + path_str = os.fspath(path) + scheme = _infer_protocol(path_str) + + if scheme == "file": + resolved_path = _resolve_local_path(path_str) + if resolved_path.is_file(): + if _match_file( + resolved_path, + extensions, + glob, + ): + return [resolved_path] + + result = [] + for entry in resolved_path.iterdir(): + if entry.is_file(): + if _match_file( + entry, + extensions, + glob, + ): + result.append(entry) + return result + + # Remote paths + fsspec = import_optional_dependency("fsspec", extra=scheme) + fs, inner_path = fsspec.core.url_to_fs(path_str) + if fs.isfile(inner_path): + path_obj = PurePosixPath(inner_path) + if _match_file( + inner_path, + extensions, + glob, + ): + return [path] + + result = [] + for file in fs.ls(inner_path, detail=True): + if file["type"] == "file": + path_obj = PurePosixPath(file["name"]) + if _match_file( + path_obj, + extensions, + glob, + ): + result.append(f"{scheme}://{path_obj}") + return result diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 67193f930b4dc..efcb28d67fb76 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -55,6 +55,7 @@ from pandas.io.common import ( IOHandles, get_handle, + iterdir, stringify_path, validate_header_arg, ) @@ -73,6 +74,7 @@ if TYPE_CHECKING: from collections.abc import ( Callable, + Generator, Hashable, Iterable, Mapping, @@ -668,9 +670,23 @@ def _validate_names(names: Sequence[Hashable] | None) -> None: raise ValueError("Names should be an ordered collection.") +def _multi_file_generator( + list_of_files: list[str], kwds +) -> Generator[DataFrame] | Generator[TextFileReader]: + """Generator for multiple files.""" + for file in list_of_files: + parser = TextFileReader(file, **kwds) + + if kwds.get("chunksize", None) or kwds.get("iterator", False): + yield parser + else: + with parser: + yield parser.read(kwds.get("nrows", None)) + + def _read( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], kwds -) -> DataFrame | TextFileReader: +) -> DataFrame | TextFileReader | Generator[DataFrame] | Generator[TextFileReader]: """Generic reader of line files.""" # if we pass a date_format and parse_dates=False, we should not parse the # dates GH#44366 @@ -709,14 +725,26 @@ def _read( # Check for duplicates in names. _validate_names(kwds.get("names", None)) - # Create the parser. 
-    parser = TextFileReader(filepath_or_buffer, **kwds)
+    extensions = kwds.get("extensions", None)
+    glob = kwds.get("glob", None)
+    files = iterdir(filepath_or_buffer, extensions, glob)
+
+    if isinstance(files, list) and not files:
+        raise FileNotFoundError(
+            f"No files found in {filepath_or_buffer} "
+            f"with extension(s) {extensions} and glob pattern {glob}"
+        )
 
-    if chunksize or iterator:
-        return parser
+    if not isinstance(files, list) or len(files) == 1:
+        file = files[0] if isinstance(files, list) else files
+        parser = TextFileReader(file, **kwds)
 
-    with parser:
-        return parser.read(nrows)
+        if chunksize or iterator:
+            return parser
+
+        with parser:
+            return parser.read(nrows)
+    return _multi_file_generator(files, kwds)
 
 
 @overload
@@ -832,7 +860,7 @@ def read_csv(
     float_precision: Literal["high", "legacy", "round_trip"] | None = None,
     storage_options: StorageOptions | None = None,
     dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
-) -> DataFrame | TextFileReader:
+) -> DataFrame | TextFileReader | Generator[DataFrame] | Generator[TextFileReader]:
     # locals() should never be modified
     kwds = locals().copy()
     del kwds["filepath_or_buffer"]
@@ -932,10 +960,9 @@ def read_table(
     skipfooter: int = 0,
     nrows: int | None = None,
     # NA and Missing Data Handling
-    na_values: Hashable
-    | Iterable[Hashable]
-    | Mapping[Hashable, Iterable[Hashable]]
-    | None = None,
+    na_values: (
+        Hashable | Iterable[Hashable] | Mapping[Hashable, Iterable[Hashable]] | None
+    ) = None,
     keep_default_na: bool = True,
     na_filter: bool = True,
     skip_blank_lines: bool = True,
@@ -968,7 +995,7 @@
     float_precision: Literal["high", "legacy", "round_trip"] | None = None,
     storage_options: StorageOptions | None = None,
     dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
-) -> DataFrame | TextFileReader:
+) -> DataFrame | TextFileReader | Generator[DataFrame] | Generator[TextFileReader]:
     # locals() should never be modified
     kwds = locals().copy()
     del kwds["filepath_or_buffer"]
@@ -1038,7 +1065,7 @@ def read_fwf(
     iterator: bool = False,
     chunksize: int | None = None,
     **kwds: Unpack[_read_shared[HashableT]],
-) -> DataFrame | TextFileReader:
+) -> DataFrame | TextFileReader | Generator[DataFrame] | Generator[TextFileReader]:
     r"""
     Read a table of fixed-width formatted lines into DataFrame.
 
diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py
index a5ddda9d66e7a..8332d349e3cf3 100644
--- a/pandas/tests/io/conftest.py
+++ b/pandas/tests/io/conftest.py
@@ -223,3 +223,46 @@ def compression_format(request):
 @pytest.fixture(params=_compression_formats_params)
 def compression_ext(request):
     return request.param[0]
+
+
+@pytest.fixture
+def local_csv_directory(tmp_path):
+    """
+    Fixture to create a directory with dummy CSV files for testing.
+ """ + for i in range(3): + file_path = tmp_path / f"{i}.csv" + file_path.touch() + return tmp_path + + +@pytest.fixture +def remote_csv_directory(): + _ = pytest.importorskip("fsspec", reason="fsspec is required for remote tests") + + import fsspec + from fsspec.implementations.memory import MemoryFileSystem + + fsspec.register_implementation("s3", MemoryFileSystem) + fs = fsspec.filesystem("s3") + fs.store.clear() + + dir_name = "remote-bucket" + fs.pipe(f"{dir_name}/a.csv", b"a,b,c\n1,2,3\n") + fs.pipe(f"{dir_name}/b.csv", b"a,b,c\n4,5,6\n") + fs.pipe(f"{dir_name}/nested/ignored.csv", b"x,y,z\n") + + assert fs.exists(dir_name), "Remote directory was not created" + assert fs.isdir(dir_name), "Remote path is not a directory" + + return f"s3://{dir_name}" + + +@pytest.fixture +def empty_local_file(tmp_path): + """ + Fixture to create an empty local file. + """ + file_path = tmp_path / "empty_file.csv" + file_path.touch() + return file_path diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py index cef57318195ec..dd5935653d7c5 100644 --- a/pandas/tests/io/parser/common/test_file_buffer_url.py +++ b/pandas/tests/io/parser/common/test_file_buffer_url.py @@ -314,21 +314,21 @@ class InvalidBuffer: pass parser = all_parsers - msg = "Invalid file path or buffer object type" + msg = "Expected file path name or file-like object" - with pytest.raises(ValueError, match=msg): + with pytest.raises(TypeError, match=msg): parser.read_csv(InvalidBuffer()) def test_invalid_file_buffer_mock(all_parsers): # see gh-15337 parser = all_parsers - msg = "Invalid file path or buffer object type" + msg = "Expected file path name or file-like object" class Foo: pass - with pytest.raises(ValueError, match=msg): + with pytest.raises(TypeError, match=msg): parser.read_csv(Foo()) diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index 191d0de50b12f..941e5bbb429cc 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -34,6 +34,15 @@ def parser_and_data(all_parsers, csv1): return parser, data, expected +@pytest.fixture +def empty_zip_file(tmp_path): + # Create an empty zip file for testing + zip_path = tmp_path / "empty.zip" + with zipfile.ZipFile(zip_path, "w"): + pass + return zip_path + + @pytest.mark.parametrize("compression", ["zip", "infer", "zip2"]) def test_zip(parser_and_data, compression): parser, data, expected = parser_and_data @@ -158,14 +167,14 @@ def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding @pytest.mark.parametrize("invalid_compression", ["sfark", "bz3", "zipper"]) -def test_invalid_compression(all_parsers, invalid_compression): +def test_invalid_compression(all_parsers, empty_zip_file, invalid_compression): parser = all_parsers compress_kwargs = {"compression": invalid_compression} msg = f"Unrecognized compression type: {invalid_compression}" with pytest.raises(ValueError, match=msg): - parser.read_csv("test_file.zip", **compress_kwargs) + parser.read_csv(empty_zip_file, **compress_kwargs) def test_compression_tar_archive(all_parsers, csv_dir_path): diff --git a/pandas/tests/io/parser/test_directory.py b/pandas/tests/io/parser/test_directory.py new file mode 100644 index 0000000000000..84edc58570036 --- /dev/null +++ b/pandas/tests/io/parser/test_directory.py @@ -0,0 +1,37 @@ +from csv import ( + DictWriter, + reader as csv_reader, +) + +import pytest + + +@pytest.fixture +def directory_data(): + 
return ["a", "b", "c"], [ + {"first": {"a": 1, "b": 2, "c": 3}}, + {"second": {"a": 4, "b": 5, "c": 6}}, + {"third": {"a": 7, "b": 8, "c": 9}}, + ] + + +@pytest.fixture +def directory_data_to_file(tmp_path, directory_data): + field_names, data_list = directory_data + for data in data_list: + file_name = next(iter(data.keys())) + path = tmp_path / f"{file_name}.csv" + with path.open("w", newline="", encoding="utf-8") as file: + writer = DictWriter(file, fieldnames=field_names) + writer.writeheader() + writer.writerow(data[file_name]) + return tmp_path + + +def test_directory_data(directory_data_to_file): + assert len(list(directory_data_to_file.iterdir())) == 3 + for file in directory_data_to_file.iterdir(): + with file.open(encoding="utf-8") as f: + reader = csv_reader(f) + header = next(reader) + assert header == ["a", "b", "c"] diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index 07f84466e3ac2..f04a82a925f08 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -185,15 +185,10 @@ def test_close_file_handle_on_invalid_usecols(all_parsers): os.unlink(fname) -def test_invalid_file_inputs(request, all_parsers): +def test_invalid_file_inputs(all_parsers): # GH#45957 parser = all_parsers - if parser.engine == "python": - request.applymarker( - pytest.mark.xfail(reason=f"{parser.engine} engine supports lists.") - ) - - with pytest.raises(ValueError, match="Invalid"): + with pytest.raises(TypeError, match="Expected file path name or file-like"): parser.read_csv([]) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 4a5e41397b59d..53bb99088c0fa 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -695,3 +695,31 @@ def test_pyarrow_read_csv_datetime_dtype(): expect = pd.DataFrame({"date": expect_data}) tm.assert_frame_equal(expect, result) + + +def test_iterdir_local(local_csv_directory): + for file in icom.iterdir(local_csv_directory): + assert file.is_file() + assert file.suffix == ".csv" + + +def test_remote_csv_directory(remote_csv_directory): + import fsspec + from fsspec.implementations.memory import MemoryFileSystem + + fs = fsspec.filesystem("s3") + assert isinstance(fs, MemoryFileSystem) + + assert fs.exists("remote-bucket") + assert fs.isdir("remote-bucket") + + files = fs.ls("remote-bucket", detail=True) + + file_names = sorted(f["name"] for f in files if f["type"] == "file") + assert file_names == ["/remote-bucket/a.csv", "/remote-bucket/b.csv"] + + dir_names = [f["name"] for f in files if f["type"] == "directory"] + assert "/remote-bucket/nested" in dir_names + + nested_files = fs.ls("remote-bucket/nested", detail=True) + assert nested_files[0]["name"] == "/remote-bucket/nested/ignored.csv" diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 2e3e74a9d31ff..1c6e497af5de6 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -342,13 +342,13 @@ def test_markdown_options(fsspectest): assert fsspectest.cat("testmem://mockfile") -def test_non_fsspec_options(): +def test_non_fsspec_options(empty_local_file): pytest.importorskip("pyarrow") with pytest.raises(ValueError, match="storage_options"): - read_csv("localfile", storage_options={"a": True}) + read_csv(empty_local_file, storage_options={"a": True}) with pytest.raises(ValueError, match="storage_options"): # separate test for parquet, which has a different code path - read_parquet("localfile", 
storage_options={"a": True}) + read_parquet(empty_local_file, storage_options={"a": True}) by = io.BytesIO() with pytest.raises(ValueError, match="storage_options"): diff --git a/web/pandas/static/img/books/pandas_cookbook_3.jpeg b/web/pandas/static/img/books/pandas_cookbook_3.jpeg new file mode 100644 index 0000000000000..cf1c27037de68 Binary files /dev/null and b/web/pandas/static/img/books/pandas_cookbook_3.jpeg differ