Fixes aggregation of image datasets #2717
```diff
@@ -19,6 +19,7 @@
 import shutil
 from pathlib import Path

+import datasets
 import pandas as pd
 import tqdm
@@ -32,6 +33,7 @@
     DEFAULT_VIDEO_FILE_SIZE_IN_MB,
     DEFAULT_VIDEO_PATH,
     get_file_size_in_mb,
+    get_hf_features_from_features,
     get_parquet_file_size_in_mb,
     to_parquet_with_hf_images,
     update_chunk_file_indices,
```
```diff
@@ -402,12 +404,21 @@ def aggregate_data(src_meta, dst_meta, data_idx, data_files_size_in_mb, chunk_si
     }

     unique_chunk_file_ids = sorted(unique_chunk_file_ids)
+    contains_images = len(dst_meta.image_keys) > 0
+
+    # retrieve features schema for proper image typing in parquet
+    hf_features = get_hf_features_from_features(dst_meta.features) if contains_images else None
+
     for src_chunk_idx, src_file_idx in unique_chunk_file_ids:
         src_path = src_meta.root / DEFAULT_DATA_PATH.format(
             chunk_index=src_chunk_idx, file_index=src_file_idx
         )
-        df = pd.read_parquet(src_path)
+        if contains_images:
+            # Use HuggingFace datasets to read source data to preserve image format
+            src_ds = datasets.Dataset.from_parquet(str(src_path))
+            df = src_ds.to_pandas()
+        else:
+            df = pd.read_parquet(src_path)
         df = update_data_df(df, src_meta, dst_meta)

         data_idx = append_or_create_parquet_file(
@@ -417,8 +428,9 @@ def aggregate_data(src_meta, dst_meta, data_idx, data_files_size_in_mb, chunk_si
             data_files_size_in_mb,
             chunk_size,
             DEFAULT_DATA_PATH,
-            contains_images=len(dst_meta.image_keys) > 0,
+            contains_images=contains_images,
             aggr_root=dst_meta.root,
+            hf_features=hf_features,
         )

     return data_idx
```
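For context on why the image branch goes through `datasets` rather than `pd.read_parquet`: a HuggingFace `Image` column is stored in parquet as a struct of `bytes`/`path`, and reading it back through `datasets` keeps that typing, which plain pandas does not know about. The sketch below is illustrative only and not part of the PR; the column name, file name, and blank PIL image are invented for the demo, while `datasets.Image`, `Dataset.from_parquet(..., features=...)`, and `Dataset.to_pandas()` are standard `datasets` APIs.

```python
# Illustrative round trip (not PR code): an Image-typed column surviving a
# parquet write/read cycle when the read goes through `datasets`.
import datasets
from PIL import Image as PILImage

features = datasets.Features(
    {
        "observation.image": datasets.Image(),  # encoded as {"bytes": ..., "path": ...}
        "frame_index": datasets.Value("int64"),
    }
)

ds = datasets.Dataset.from_dict(
    {"observation.image": [PILImage.new("RGB", (4, 4))], "frame_index": [0]},
    features=features,
)
ds.to_parquet("file-000.parquet")

# Passing the schema explicitly (as `hf_features` is built from `dst_meta.features`
# in the diff above) keeps the image column typed as an Image feature.
round_trip = datasets.Dataset.from_parquet("file-000.parquet", features=features)
df = round_trip.to_pandas()  # image cells are {"bytes": ..., "path": ...} dicts
```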
```diff
@@ -488,6 +500,7 @@ def append_or_create_parquet_file(
     default_path: str,
     contains_images: bool = False,
     aggr_root: Path = None,
+    hf_features: datasets.Features | None = None,
 ):
     """Appends data to an existing parquet file or creates a new one based on size constraints.

@@ -503,6 +516,7 @@ def append_or_create_parquet_file(
         default_path: Format string for generating file paths.
         contains_images: Whether the data contains images requiring special handling.
         aggr_root: Root path for the aggregated dataset.
+        hf_features: Optional HuggingFace Features schema for proper image typing.

     Returns:
         dict: Updated index dictionary with current chunk and file indices.
@@ -512,7 +526,7 @@ def append_or_create_parquet_file(
     if not dst_path.exists():
         dst_path.parent.mkdir(parents=True, exist_ok=True)
         if contains_images:
-            to_parquet_with_hf_images(df, dst_path)
+            to_parquet_with_hf_images(df, dst_path, features=hf_features)
         else:
             df.to_parquet(dst_path)
         return idx
```
```diff
@@ -527,12 +541,17 @@ def append_or_create_parquet_file(
         final_df = df
         target_path = new_path
     else:
-        existing_df = pd.read_parquet(dst_path)
+        if contains_images:
+            # Use HuggingFace datasets to read existing data to preserve image format
+            existing_ds = datasets.Dataset.from_parquet(str(dst_path))
```
Copilot AI (Dec 24, 2025) suggested the following change on the `Dataset.from_parquet` call:

```diff
-            existing_ds = datasets.Dataset.from_parquet(str(dst_path))
+            existing_ds = datasets.Dataset.from_parquet(str(dst_path), features=hf_features)
```
The function `to_parquet_with_hf_images` is being called with a `features` parameter, but the current function signature in `utils.py` only accepts `(df: pandas.DataFrame, path: Path)` and does not have a `features` parameter. This will cause a `TypeError` at runtime. The function signature needs to be updated to accept and use the `features` parameter to properly preserve the HuggingFace Image schema.
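A minimal sketch of what this review comment is asking for, assuming the helper builds a `datasets.Dataset` from the DataFrame before writing; this is hypothetical and not the actual `utils.py` implementation:

```python
# Hypothetical signature update for utils.py (illustrative only; the real helper
# may be structured differently).
from pathlib import Path

import datasets
import pandas


def to_parquet_with_hf_images(
    df: pandas.DataFrame,
    path: Path,
    features: datasets.Features | None = None,
) -> None:
    # An explicit Features schema keeps image columns typed as datasets.Image()
    # instead of falling back to inferred struct/bytes columns.
    ds = datasets.Dataset.from_pandas(df, features=features, preserve_index=False)
    ds.to_parquet(str(path))
```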
When reading image datasets using `datasets.Dataset.from_parquet`, the `features` parameter should be passed to ensure image columns are loaded with the correct schema. Without it, the image data might not be correctly preserved during the read-update-write cycle. Consider using `datasets.Dataset.from_parquet(str(src_path), features=hf_features)` to maintain schema consistency.
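As a usage illustration of that suggestion (not the PR's code), the read-update-write cycle for an existing shard could look roughly like this, reusing the `hf_features` and `dst_path` names from the diff; `new_df` and `append_rows` are stand-ins invented for the example:

```python
# Illustrative append cycle that keeps the Image schema on both the read and the
# write side (names other than hf_features/dst_path are stand-ins).
import datasets
import pandas as pd


def append_rows(dst_path, new_df: pd.DataFrame, hf_features: datasets.Features) -> None:
    # Read the existing shard with an explicit schema so image columns come back
    # as Image-typed data rather than untyped structs.
    existing_ds = datasets.Dataset.from_parquet(str(dst_path), features=hf_features)
    combined = pd.concat([existing_ds.to_pandas(), new_df], ignore_index=True)
    # Re-encode with the same schema when writing back out.
    out = datasets.Dataset.from_pandas(combined, features=hf_features, preserve_index=False)
    out.to_parquet(str(dst_path))
```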