Skip to content

Commit f4febed

Browse files
committed
Address comments
1 parent bcba5f3 commit f4febed

File tree

8 files changed

+179
-65
lines changed

8 files changed

+179
-65
lines changed

src/datachain/lib/memory_utils.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
"""Memory estimation utilities for DataChain."""
2+
3+
import sys
4+
from typing import Any, Union
5+
6+
# Default batch sizing: up to 2000 rows or 1000 MB per processed chunk.
DEFAULT_CHUNK_ROWS = 2000
DEFAULT_CHUNK_MB = 1000

# Memory monitoring threshold, as a percentage of total system memory;
# above this, batching yields early (see is_memory_usage_high).
MEMORY_USAGE_THRESHOLD = 80

# System memory check frequency: consult system memory every N rows.
MEMORY_CHECK_FREQUENCY = 100

# Conservative per-object overhead (bytes) added when estimating complex
# objects, since sys.getsizeof does not follow contained references.
OBJECT_OVERHEAD_BYTES = 100
18+
19+
20+
def estimate_memory_recursive(item: Any, _seen: Union[set, None] = None) -> int:
    """Estimate the memory footprint of ``item`` in bytes.

    ``None`` is treated as free. Primitive scalars are measured with
    ``sys.getsizeof``. Lists and tuples are measured recursively — the
    container's own size plus the estimate of every element — matching
    the function's name (the previous implementation only did a shallow
    ``sys.getsizeof`` per element, undercounting nested containers).
    Any other object gets its shallow size plus ``OBJECT_OVERHEAD_BYTES``
    as a conservative buffer.

    Args:
        item: Value whose memory usage should be estimated.
        _seen: Internal set of container ids already visited; guards
            against infinite recursion on self-referential containers.

    Returns:
        Estimated size in bytes (0 for ``None`` or an already-seen
        container).
    """
    if item is None:
        return 0

    if isinstance(item, (str, bytes, int, float, bool)):
        return sys.getsizeof(item)

    if isinstance(item, (list, tuple)):
        if _seen is None:
            _seen = set()
        if id(item) in _seen:
            # Cycle (e.g. lst.append(lst)): already counted, stop here.
            return 0
        _seen.add(id(item))
        return sys.getsizeof(item) + sum(
            estimate_memory_recursive(subitem, _seen) for subitem in item
        )

    # For complex objects, use a conservative shallow estimate plus a
    # fixed overhead buffer (sys.getsizeof does not follow references).
    return sys.getsizeof(item) + OBJECT_OVERHEAD_BYTES
33+
34+
35+
def estimate_row_memory(row: Union[list, tuple]) -> int:
    """Return the estimated memory footprint of ``row`` in bytes.

    An empty (or falsy) row costs nothing; otherwise the per-item
    estimates from :func:`estimate_memory_recursive` are summed.
    """
    if not row:
        return 0

    return sum(estimate_memory_recursive(value) for value in row)
44+
45+
46+
def get_system_memory_percent() -> float:
    """Return system-wide memory usage as a percentage.

    Falls back to ``0.0`` (after emitting a ``UserWarning``) when
    ``psutil`` is not installed, so memory-based checks degrade to
    no-ops instead of crashing.
    """
    try:
        import psutil
    except ImportError:
        import warnings

        warnings.warn(
            "psutil not available. Memory-based checks will be skipped. "
            "Install psutil to enable memory monitoring.",
            UserWarning,
            stacklevel=2,
        )
        return 0.0

    return psutil.virtual_memory().percent
61+
62+
63+
def is_memory_usage_high() -> bool:
    """Report whether system memory usage exceeds ``MEMORY_USAGE_THRESHOLD``."""
    usage_percent = get_system_memory_percent()
    return usage_percent > MEMORY_USAGE_THRESHOLD

src/datachain/lib/settings.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from datachain.lib.memory_utils import DEFAULT_CHUNK_MB, DEFAULT_CHUNK_ROWS
12
from datachain.lib.utils import DataChainParamsError
23

34

@@ -63,12 +64,23 @@ def __init__(
6364
f", {chunk_rows.__class__.__name__} was given"
6465
)
6566

67+
if chunk_rows is not None and chunk_rows <= 0:
68+
raise SettingsError(
69+
"'chunk_rows' argument must be positive integer"
70+
f", {chunk_rows} was given"
71+
)
72+
6673
if chunk_mb is not None and not isinstance(chunk_mb, (int, float)):
6774
raise SettingsError(
6875
"'chunk_mb' argument must be int/float or None"
6976
f", {chunk_mb.__class__.__name__} was given"
7077
)
7178

79+
if chunk_mb is not None and chunk_mb <= 0:
80+
raise SettingsError(
81+
f"'chunk_mb' argument must be positive number, {chunk_mb} was given"
82+
)
83+
7284
@property
7385
def cache(self):
7486
return self._cache if self._cache is not None else False
@@ -79,11 +91,11 @@ def workers(self):
7991

8092
@property
8193
def chunk_rows(self):
82-
return self._chunk_rows if self._chunk_rows is not None else 2000
94+
return self._chunk_rows if self._chunk_rows is not None else DEFAULT_CHUNK_ROWS
8395

8496
@property
8597
def chunk_mb(self):
86-
return self._chunk_mb if self._chunk_mb is not None else 1000
98+
return self._chunk_mb if self._chunk_mb is not None else DEFAULT_CHUNK_MB
8799

88100
def to_dict(self):
89101
res = {}
@@ -115,6 +127,6 @@ def add(self, settings: "Settings"):
115127
if settings.prefetch is not None:
116128
self.prefetch = settings.prefetch
117129
if settings._chunk_rows is not None:
118-
self._chunk_rows = settings.chunk_rows
130+
self._chunk_rows = settings._chunk_rows
119131
if settings._chunk_mb is not None:
120132
self._chunk_mb = settings._chunk_mb

src/datachain/lib/udf.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from datachain.dataset import RowDict
1515
from datachain.lib.convert.flatten import flatten
1616
from datachain.lib.file import DataModel, File
17+
from datachain.lib.memory_utils import DEFAULT_CHUNK_MB, DEFAULT_CHUNK_ROWS
1718
from datachain.lib.utils import AbstractUDF, DataChainError, DataChainParamsError
1819
from datachain.query.batch import (
1920
BatchingStrategy,
@@ -89,14 +90,21 @@ def get_batching(
8990

9091
# If we have explicit chunk_rows/chunk_mb set on this adapter, use them
9192
if self.chunk_rows is not None or self.chunk_mb is not None:
92-
return DynamicBatch(self.chunk_rows, self.chunk_mb, is_input_batched)
93+
return DynamicBatch(
94+
self.chunk_rows if self.chunk_rows is not None else DEFAULT_CHUNK_ROWS,
95+
self.chunk_mb if self.chunk_mb is not None else DEFAULT_CHUNK_MB,
96+
is_input_batched,
97+
)
9398

9499
# If settings are provided and have batch configuration, use appropriate
95100
# batching
96101
if settings:
97102
max_rows: Optional[int] = getattr(settings, "_chunk_rows", None)
98103
max_mem: Optional[Union[int, float]] = getattr(settings, "_chunk_mb", None)
99104
if max_rows is not None or max_mem is not None:
105+
# Use settings values, falling back to defaults if None
106+
max_rows = max_rows if max_rows is not None else DEFAULT_CHUNK_ROWS
107+
max_mem = max_mem if max_mem is not None else DEFAULT_CHUNK_MB
100108
return DynamicBatch(max_rows, max_mem, is_input_batched)
101109

102110
return NoBatching()

src/datachain/query/batch.py

Lines changed: 3 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,18 @@
11
import contextlib
22
import math
3-
import sys
43
from abc import ABC, abstractmethod
54
from collections.abc import Generator, Sequence
65
from typing import Callable, Optional, Union
76

87
import sqlalchemy as sa
98

109
from datachain.data_storage.schema import PARTITION_COLUMN_ID
10+
from datachain.lib.memory_utils import estimate_row_memory, is_memory_usage_high
1111
from datachain.query.utils import get_query_column
1212

1313
RowsOutputBatch = Sequence[Sequence]
1414
RowsOutput = Union[Sequence, RowsOutputBatch]
1515

16-
OBJECT_OVERHEAD_BYTES = 100
17-
1816

1917
class BatchingStrategy(ABC):
2018
"""BatchingStrategy provides means of batching UDF executions."""
@@ -116,35 +114,12 @@ def __init__(
116114
# If we yield individual rows, set is_batching to False
117115
self.is_batching = is_input_batched
118116

119-
def _estimate_row_memory(self, row) -> int:
120-
"""Estimate memory usage of a row in bytes."""
121-
if not row:
122-
return 0
123-
124-
total_size = 0
125-
for item in row:
126-
if isinstance(item, (str, bytes, int, float, bool)):
127-
total_size += sys.getsizeof(item)
128-
elif isinstance(item, (list, tuple)):
129-
total_size += sys.getsizeof(item)
130-
for subitem in item:
131-
total_size += sys.getsizeof(subitem)
132-
else:
133-
# For complex objects, use a conservative estimate
134-
total_size += (
135-
sys.getsizeof(item) + OBJECT_OVERHEAD_BYTES
136-
) # Add buffer for object overhead
137-
138-
return total_size
139-
140117
def __call__(
141118
self,
142119
execute: Callable,
143120
query: sa.Select,
144121
id_col: Optional[sa.ColumnElement] = None,
145122
) -> Generator[RowsOutput, None, None]:
146-
import psutil
147-
148123
from datachain.data_storage.warehouse import SELECT_BATCH_SIZE
149124

150125
ids_only = False
@@ -162,7 +137,7 @@ def __call__(
162137

163138
with contextlib.closing(execute(query, page_size=page_size)) as chunk_rows:
164139
for row in chunk_rows:
165-
row_memory = self._estimate_row_memory(row)
140+
row_memory = estimate_row_memory(row)
166141
row_count += 1
167142

168143
# Check if adding this row would exceed limits
@@ -171,7 +146,7 @@ def __call__(
171146
should_yield = (
172147
len(results) >= self.max_rows
173148
or current_memory + row_memory > self.max_memory_bytes
174-
or (row_count % 100 == 0 and psutil.virtual_memory().percent > 80)
149+
or (row_count % 100 == 0 and is_memory_usage_high())
175150
)
176151

177152
if should_yield and results: # Yield current batch if we have one

src/datachain/utils.py

Lines changed: 12 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,13 @@
3131
except ImportError:
3232
psutil = None
3333

34-
# Constants for memory estimation
35-
OBJECT_OVERHEAD_BYTES = 100
36-
34+
# Import shared memory utilities
35+
from datachain.lib.memory_utils import (
36+
DEFAULT_CHUNK_MB,
37+
DEFAULT_CHUNK_ROWS,
38+
estimate_memory_recursive,
39+
is_memory_usage_high,
40+
)
3741

3842
logger = logging.getLogger("datachain")
3943

@@ -246,16 +250,14 @@ def _dynamic_batched_core(
246250
current_memory = 0
247251

248252
for row_count, item in enumerate(iterable):
249-
item_memory = _estimate_item_memory(item)
253+
item_memory = estimate_memory_recursive(item)
250254

251255
# Check if adding this item would exceed limits
252256
# Also check system memory usage every 100 items
253257
should_yield = (
254258
len(batch) >= chunk_rows
255259
or current_memory + item_memory > max_memory_bytes
256-
or (
257-
row_count % 100 == 0 and psutil and psutil.virtual_memory().percent > 80
258-
)
260+
or (row_count % 100 == 0 and is_memory_usage_high())
259261
)
260262

261263
if should_yield and batch: # Yield current batch if we have one
@@ -284,7 +286,9 @@ def batched(
284286

285287

286288
def batched_it(
287-
iterable: Iterable[_T_co], chunk_rows: int = 2000, chunk_mb: float = 1000
289+
iterable: Iterable[_T_co],
290+
chunk_rows: int = DEFAULT_CHUNK_ROWS,
291+
chunk_mb: float = DEFAULT_CHUNK_MB,
288292
) -> Iterator[Iterator[_T_co]]:
289293
"""
290294
Batch data into iterators with dynamic sizing
@@ -295,25 +299,6 @@ def batched_it(
295299
)
296300

297301

298-
def _estimate_item_memory(item) -> int:
299-
"""Estimate memory usage of an item in bytes."""
300-
if item is None:
301-
return 0
302-
303-
total_size = 0
304-
if isinstance(item, (str, bytes, int, float, bool)):
305-
total_size += sys.getsizeof(item)
306-
elif isinstance(item, (list, tuple)):
307-
total_size += sys.getsizeof(item)
308-
for subitem in item:
309-
total_size += sys.getsizeof(subitem)
310-
else:
311-
# For complex objects, use a conservative estimate
312-
total_size += sys.getsizeof(item) + OBJECT_OVERHEAD_BYTES
313-
314-
return total_size
315-
316-
317302
def flatten(items):
318303
for item in items:
319304
if isinstance(item, (list, tuple)):

tests/func/test_datachain.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2470,9 +2470,10 @@ def add_one_with_batch_size(x):
24702470
chain = dc.read_values(x=list(range(100)), session=test_session)
24712471
chain_with_settings = chain.settings(chunk_rows=50, chunk_mb=1000)
24722472

2473-
result = chain_with_settings.batch_map(
2474-
add_one_with_batch_size, output={"result": Result}, batch=15
2475-
)
2473+
with pytest.warns(DeprecationWarning):
2474+
result = chain_with_settings.batch_map(
2475+
add_one_with_batch_size, output={"result": Result}, batch=15
2476+
)
24762477

24772478
results = [r[0] for r in result.to_iter("result")]
24782479

@@ -2485,8 +2486,8 @@ def add_one_with_batch_size(x):
24852486

24862487
assert len(results) == 100
24872488

2488-
expected_values = list(range(1, 101))
2489-
actual_values = [r.result for r in results]
2489+
expected_values = set(range(1, 101))
2490+
actual_values = {r.result for r in results}
24902491
assert actual_values == expected_values
24912492

24922493

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
"""Tests for memory utility functions."""
2+
3+
from datachain.lib.memory_utils import (
4+
OBJECT_OVERHEAD_BYTES,
5+
estimate_memory_recursive,
6+
estimate_row_memory,
7+
get_system_memory_percent,
8+
)
9+
10+
11+
def test_estimate_memory_recursive():
    """Memory estimates: None costs nothing, real values cost something."""
    assert estimate_memory_recursive(None) == 0
    for sample in (42, "test", [1, 2, 3]):
        assert estimate_memory_recursive(sample) > 0
17+
18+
19+
def test_estimate_row_memory():
    """Row estimation: empty rows are free, populated rows are not."""
    empty_estimate = estimate_row_memory([])
    assert empty_estimate == 0

    populated_estimate = estimate_row_memory([1, "test", 3.14])
    assert populated_estimate > 0
23+
24+
25+
def test_system_memory_functions():
    """System memory usage is reported as a percentage in [0, 100]."""
    usage = get_system_memory_percent()
    assert isinstance(usage, (int, float))
    assert 0.0 <= usage <= 100.0
30+
31+
32+
def test_object_overhead_constant():
    """The shared overhead constant is a positive integer."""
    overhead = OBJECT_OVERHEAD_BYTES
    assert isinstance(overhead, int)
    assert overhead > 0

0 commit comments

Comments
 (0)