Skip to content

Commit 9e66bb8

Browse files
authored
Introduce chunk_rows to settings (#1270)
1 parent ced5ea6 commit 9e66bb8

9 files changed

Lines changed: 214 additions & 59 deletions

File tree

src/datachain/lib/dc/datachain.py

Lines changed: 23 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -324,29 +324,32 @@ def settings(
324324
sys: Optional[bool] = None,
325325
namespace: Optional[str] = None,
326326
project: Optional[str] = None,
327+
batch_rows: Optional[int] = None,
327328
) -> "Self":
328329
"""Change settings for chain.
329330
330331
This function changes specified settings without changing not specified ones.
331332
It returns chain, so, it can be chained later with next operation.
332333
333334
Parameters:
334-
cache : data caching (default=False)
335+
cache : data caching. (default=False)
335336
parallel : number of threads for processors. True is a special value to
336-
enable all available CPUs (default=1)
337+
enable all available CPUs. (default=1)
337338
workers : number of distributed workers. Only for Studio mode. (default=1)
338-
min_task_size : minimum number of tasks (default=1)
339-
prefetch: number of workers to use for downloading files in advance.
339+
min_task_size : minimum number of tasks. (default=1)
340+
prefetch : number of workers to use for downloading files in advance.
340341
This is enabled by default and uses 2 workers.
341342
To disable prefetching, set it to 0.
342-
namespace: namespace name.
343-
project: project name.
343+
namespace : namespace name.
344+
project : project name.
345+
batch_rows : row limit per insert to balance speed and memory usage.
346+
(default=2000)
344347
345348
Example:
346349
```py
347350
chain = (
348351
chain
349-
.settings(cache=True, parallel=8)
352+
.settings(cache=True, parallel=8, batch_rows=300)
350353
.map(laion=process_webdataset(spec=WDSLaion), params="file")
351354
)
352355
```
@@ -356,7 +359,14 @@ def settings(
356359
settings = copy.copy(self._settings)
357360
settings.add(
358361
Settings(
359-
cache, parallel, workers, min_task_size, prefetch, namespace, project
362+
cache,
363+
parallel,
364+
workers,
365+
min_task_size,
366+
prefetch,
367+
namespace,
368+
project,
369+
batch_rows,
360370
)
361371
)
362372
return self._evolve(settings=settings, _sys=sys)
@@ -711,7 +721,7 @@ def map(
711721

712722
return self._evolve(
713723
query=self._query.add_signals(
714-
udf_obj.to_udf_wrapper(),
724+
udf_obj.to_udf_wrapper(self._settings.batch_rows),
715725
**self._settings.to_dict(),
716726
),
717727
signal_schema=self.signals_schema | udf_obj.output,
@@ -749,7 +759,7 @@ def gen(
749759
udf_obj.prefetch = prefetch
750760
return self._evolve(
751761
query=self._query.generate(
752-
udf_obj.to_udf_wrapper(),
762+
udf_obj.to_udf_wrapper(self._settings.batch_rows),
753763
**self._settings.to_dict(),
754764
),
755765
signal_schema=udf_obj.output,
@@ -885,7 +895,7 @@ def my_agg(files: list[File]) -> Iterator[tuple[File, int]]:
885895
udf_obj = self._udf_to_obj(Aggregator, func, params, output, signal_map)
886896
return self._evolve(
887897
query=self._query.generate(
888-
udf_obj.to_udf_wrapper(),
898+
udf_obj.to_udf_wrapper(self._settings.batch_rows),
889899
partition_by=processed_partition_by,
890900
**self._settings.to_dict(),
891901
),
@@ -919,9 +929,10 @@ def batch_map(
919929
```
920930
"""
921931
udf_obj = self._udf_to_obj(BatchMapper, func, params, output, signal_map)
932+
922933
return self._evolve(
923934
query=self._query.add_signals(
924-
udf_obj.to_udf_wrapper(batch),
935+
udf_obj.to_udf_wrapper(self._settings.batch_rows, batch=batch),
925936
**self._settings.to_dict(),
926937
),
927938
signal_schema=self.signals_schema | udf_obj.output,

src/datachain/lib/dc/records.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515

1616
P = ParamSpec("P")
1717

18+
READ_RECORDS_BATCH_SIZE = 10000
19+
1820

1921
def read_records(
2022
to_insert: Optional[Union[dict, Iterable[dict]]],
@@ -41,7 +43,7 @@ def read_records(
4143
Notes:
4244
This call blocks until all records are inserted.
4345
"""
44-
from datachain.query.dataset import INSERT_BATCH_SIZE, adjust_outputs, get_col_types
46+
from datachain.query.dataset import adjust_outputs, get_col_types
4547
from datachain.sql.types import SQLType
4648
from datachain.utils import batched
4749

@@ -94,7 +96,7 @@ def read_records(
9496
{c.name: c.type for c in columns if isinstance(c.type, SQLType)},
9597
)
9698
records = (adjust_outputs(warehouse, record, col_types) for record in to_insert)
97-
for chunk in batched(records, INSERT_BATCH_SIZE):
99+
for chunk in batched(records, READ_RECORDS_BATCH_SIZE):
98100
warehouse.insert_rows(table, chunk)
99101
warehouse.insert_rows_done(table)
100102
return read_dataset(name=dsr.full_name, session=session, settings=settings)

src/datachain/lib/settings.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from datachain.lib.utils import DataChainParamsError
2+
from datachain.utils import DEFAULT_CHUNK_ROWS
23

34

45
class SettingsError(DataChainParamsError):
@@ -16,6 +17,7 @@ def __init__(
1617
prefetch=None,
1718
namespace=None,
1819
project=None,
20+
batch_rows=None,
1921
):
2022
self._cache = cache
2123
self.parallel = parallel
@@ -24,6 +26,7 @@ def __init__(
2426
self.prefetch = prefetch
2527
self.namespace = namespace
2628
self.project = project
29+
self._chunk_rows = batch_rows
2730

2831
if not isinstance(cache, bool) and cache is not None:
2932
raise SettingsError(
@@ -53,6 +56,18 @@ def __init__(
5356
f", {min_task_size.__class__.__name__} was given"
5457
)
5558

59+
if batch_rows is not None and not isinstance(batch_rows, int):
60+
raise SettingsError(
61+
"'batch_rows' argument must be int or None"
62+
f", {batch_rows.__class__.__name__} was given"
63+
)
64+
65+
if batch_rows is not None and batch_rows <= 0:
66+
raise SettingsError(
67+
"'batch_rows' argument must be positive integer"
68+
f", {batch_rows} was given"
69+
)
70+
5671
@property
5772
def cache(self):
5873
return self._cache if self._cache is not None else False
@@ -61,6 +76,10 @@ def cache(self):
6176
def workers(self):
6277
return self._workers if self._workers is not None else False
6378

79+
@property
80+
def batch_rows(self):
81+
return self._chunk_rows if self._chunk_rows is not None else DEFAULT_CHUNK_ROWS
82+
6483
def to_dict(self):
6584
res = {}
6685
if self._cache is not None:
@@ -75,6 +94,8 @@ def to_dict(self):
7594
res["namespace"] = self.namespace
7695
if self.project is not None:
7796
res["project"] = self.project
97+
if self._chunk_rows is not None:
98+
res["batch_rows"] = self._chunk_rows
7899
return res
79100

80101
def add(self, settings: "Settings"):
@@ -86,3 +107,5 @@ def add(self, settings: "Settings"):
86107
self.project = settings.project or self.project
87108
if settings.prefetch is not None:
88109
self.prefetch = settings.prefetch
110+
if settings._chunk_rows is not None:
111+
self._chunk_rows = settings._chunk_rows

src/datachain/lib/udf.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,19 +62,21 @@ def get_batching(self, use_partitioning: bool = False) -> BatchingStrategy:
6262
return self.udf.get_batching(use_partitioning)
6363

6464
@property
65-
def batch(self):
66-
return self.udf.batch
65+
def batch_rows(self):
66+
return self.udf.batch_rows
6767

6868

6969
@attrs.define(slots=False)
7070
class UDFAdapter:
7171
inner: "UDFBase"
7272
output: UDFOutputSpec
73+
batch_rows: Optional[int] = None
7374
batch: int = 1
7475

7576
def get_batching(self, use_partitioning: bool = False) -> BatchingStrategy:
7677
if use_partitioning:
7778
return Partition()
79+
7880
if self.batch == 1:
7981
return NoBatching()
8082
if self.batch > 1:
@@ -233,10 +235,15 @@ def verbose_name(self):
233235
def signal_names(self) -> Iterable[str]:
234236
return self.output.to_udf_spec().keys()
235237

236-
def to_udf_wrapper(self, batch: int = 1) -> UDFAdapter:
238+
def to_udf_wrapper(
239+
self,
240+
batch_rows: Optional[int] = None,
241+
batch: int = 1,
242+
) -> UDFAdapter:
237243
return UDFAdapter(
238244
self,
239245
self.output.to_udf_spec(),
246+
batch_rows,
240247
batch,
241248
)
242249

src/datachain/query/dataset.py

Lines changed: 18 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -333,32 +333,24 @@ def process_udf_outputs(
333333
udf_table: "Table",
334334
udf_results: Iterator[Iterable["UDFResult"]],
335335
udf: "UDFAdapter",
336-
batch_size: int = INSERT_BATCH_SIZE,
337336
cb: Callback = DEFAULT_CALLBACK,
338337
) -> None:
339-
import psutil
340-
341-
rows: list[UDFResult] = []
342338
# Optimization: Compute row types once, rather than for every row.
343339
udf_col_types = get_col_types(warehouse, udf.output)
340+
batch_rows = udf.batch_rows or INSERT_BATCH_SIZE
344341

345-
for udf_output in udf_results:
346-
if not udf_output:
347-
continue
348-
with safe_closing(udf_output):
349-
for row in udf_output:
350-
cb.relative_update()
351-
rows.append(adjust_outputs(warehouse, row, udf_col_types))
352-
if len(rows) >= batch_size or (
353-
len(rows) % 10 == 0 and psutil.virtual_memory().percent > 80
354-
):
355-
for row_chunk in batched(rows, batch_size):
356-
warehouse.insert_rows(udf_table, row_chunk)
357-
rows.clear()
342+
def _insert_rows():
343+
for udf_output in udf_results:
344+
if not udf_output:
345+
continue
346+
347+
with safe_closing(udf_output):
348+
for row in udf_output:
349+
cb.relative_update()
350+
yield adjust_outputs(warehouse, row, udf_col_types)
358351

359-
if rows:
360-
for row_chunk in batched(rows, batch_size):
361-
warehouse.insert_rows(udf_table, row_chunk)
352+
for row_chunk in batched(_insert_rows(), batch_rows):
353+
warehouse.insert_rows(udf_table, row_chunk)
362354

363355
warehouse.insert_rows_done(udf_table)
364356

@@ -401,6 +393,7 @@ class UDFStep(Step, ABC):
401393
min_task_size: Optional[int] = None
402394
is_generator = False
403395
cache: bool = False
396+
batch_rows: Optional[int] = None
404397

405398
@abstractmethod
406399
def create_udf_table(self, query: Select) -> "Table":
@@ -602,6 +595,7 @@ def clone(self, partition_by: Optional[PartitionByType] = None) -> "Self":
602595
parallel=self.parallel,
603596
workers=self.workers,
604597
min_task_size=self.min_task_size,
598+
batch_rows=self.batch_rows,
605599
)
606600
return self.__class__(self.udf, self.catalog)
607601

@@ -1633,6 +1627,7 @@ def add_signals(
16331627
min_task_size: Optional[int] = None,
16341628
partition_by: Optional[PartitionByType] = None,
16351629
cache: bool = False,
1630+
batch_rows: Optional[int] = None,
16361631
) -> "Self":
16371632
"""
16381633
Adds one or more signals based on the results from the provided UDF.
@@ -1658,6 +1653,7 @@ def add_signals(
16581653
workers=workers,
16591654
min_task_size=min_task_size,
16601655
cache=cache,
1656+
batch_rows=batch_rows,
16611657
)
16621658
)
16631659
return query
@@ -1679,6 +1675,7 @@ def generate(
16791675
namespace: Optional[str] = None,
16801676
project: Optional[str] = None,
16811677
cache: bool = False,
1678+
batch_rows: Optional[int] = None,
16821679
) -> "Self":
16831680
query = self.clone()
16841681
steps = query.steps
@@ -1691,6 +1688,7 @@ def generate(
16911688
workers=workers,
16921689
min_task_size=min_task_size,
16931690
cache=cache,
1691+
batch_rows=batch_rows,
16941692
)
16951693
)
16961694
return query

0 commit comments

Comments
 (0)