
Commit 7624389

Mixup cleanup, add prob support and train script integration. Add working loader-based, patch-compatible RandomErasing for NaFlex mode.
1 parent 8fcbceb commit 7624389

7 files changed: +591, -74 lines
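
Taken together, the changes move mixing ahead of patchification (NaFlex images vary in size, so Mixup/CutMix must operate on whole images) and defer random erasing until patches are batched on device. A rough sketch of the resulting training data flow; step names refer to the code in the diffs below:

# Conceptual NaFlex training pipeline after this commit (sketch, not code from the diff):
# 1. VariableSeqMapWrapper.__iter__     -> per-sample transforms, variable (C, H, W) images
# 2. NaFlexMixup (optional, prob-gated) -> mixed images + per-sample soft targets
# 3. Patchify                           -> patch dict per image (patches, coords, valid mask)
# 4. NaFlexCollator                     -> padded batch tensors at the scheduled seq_len
# 5. NaFlexPrefetchLoader               -> normalize patches, then PatchRandomErasing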

timm/data/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -10,6 +10,7 @@
 from .mixup import Mixup, FastCollateMixup
 from .naflex_dataset import VariableSeqMapWrapper
 from .naflex_loader import create_naflex_loader
+from .naflex_mixup import NaFlexMixup, pairwise_mixup_target, mix_batch_variable_size
 from .naflex_transforms import (
     ResizeToSequence,
     CenterCropToSequence,

timm/data/naflex_dataset.py

Lines changed: 17 additions & 6 deletions
@@ -83,8 +83,11 @@ def __call__(self, batch):
         batch_size = len(batch)

         # Extract targets
-        # FIXME need to handle dense (float) targets or always done downstream of this?
-        targets = torch.tensor([item[1] for item in batch], dtype=torch.int64)
+        targets = [item[1] for item in batch]
+        if isinstance(targets[0], torch.Tensor):
+            targets = torch.stack(targets)
+        else:
+            targets = torch.tensor(targets, dtype=torch.int64)

         # Get patch dictionaries
         patch_dicts = [item[0] for item in batch]
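
The collator now accepts either integer class labels or dense per-sample target tensors (e.g. soft labels produced by mixup upstream). A standalone sketch of the two branches, mirroring the collator logic above:

import torch

def stack_targets(targets):
    # Dense target tensors are stacked; plain ints become one int64 tensor.
    if isinstance(targets[0], torch.Tensor):
        return torch.stack(targets)
    return torch.tensor(targets, dtype=torch.int64)

stack_targets([3, 7]).shape                            # torch.Size([2])
stack_targets([torch.rand(10), torch.rand(10)]).shape  # torch.Size([2, 10])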
@@ -139,6 +142,7 @@ def __init__(
             seq_lens: List[int] = (128, 256, 576, 784, 1024),
             max_tokens_per_batch: int = 4096 * 4,  # Example: 16k tokens
             transform_factory: Optional[Callable] = None,
+            mixup_fn: Optional[Callable] = None,
             seed: int = 42,
             shuffle: bool = True,
             distributed: bool = False,
@@ -172,6 +176,7 @@ def __init__(
             else:
                 self.transforms[seq_len] = None  # No transform
             self.collate_fns[seq_len] = NaFlexCollator(seq_len)
+        self.mixup_fn = mixup_fn
         self.patchifier = Patchify(self.patch_size)

         # --- Canonical Schedule Calculation (Done Once) ---
@@ -393,6 +398,8 @@ def __iter__(self) -> Iterator[Tuple[Dict[str, torch.Tensor], torch.Tensor]]:
             transform = self.transforms.get(seq_len)

             batch_samples = []
+            batch_imgs = []
+            batch_targets = []
             for idx in indices:
                 try:
                     # Get original image and label from map-style dataset
@@ -405,9 +412,8 @@ def __iter__(self) -> Iterator[Tuple[Dict[str, torch.Tensor], torch.Tensor]]:
                         warnings.warn(f"Transform returned None for index {idx}. Skipping sample.")
                         continue

-                    # Apply patching
-                    patch_data = self.patchifier(processed_img)
-                    batch_samples.append((patch_data, label))
+                    batch_imgs.append(processed_img)
+                    batch_targets.append(label)

                 except IndexError:
                     warnings.warn(f"IndexError encountered for index {idx} (possibly due to padding/repeated indices). Skipping sample.")
@@ -417,8 +423,13 @@ def __iter__(self) -> Iterator[Tuple[Dict[str, torch.Tensor], torch.Tensor]]:
                     warnings.warn(f"Error processing sample index {idx}. Error: {e}. Skipping sample.")
                     continue  # Skip problematic sample

-            # Collate the processed samples into a batch
+            if self.mixup_fn is not None:
+                batch_imgs, batch_targets = self.mixup_fn(batch_imgs, batch_targets)
+
+            batch_imgs = [self.patchifier(img) for img in batch_imgs]
+            batch_samples = list(zip(batch_imgs, batch_targets))
             if batch_samples:  # Only yield if we successfully processed samples
+                # Collate the processed samples into a batch
                 yield self.collate_fns[seq_len](batch_samples)

            # If batch_samples is empty after processing 'indices', an empty batch is skipped.
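
With these changes, patchification happens after optional mixing, so Mixup/CutMix sees whole variable-size images rather than patch sequences. The per-batch flow inside __iter__ now reduces to roughly the following (pseudocode sketch; load_and_transform stands in for the try/except loop above):

# imgs, targets = load_and_transform(indices)       # variable-size (C, H, W) tensors
# if self.mixup_fn is not None:
#     imgs, targets = self.mixup_fn(imgs, targets)  # mix pixels, build soft targets
# patch_dicts = [self.patchifier(img) for img in imgs]
# yield self.collate_fns[seq_len](list(zip(patch_dicts, targets)))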

timm/data/naflex_loader.py

Lines changed: 49 additions & 10 deletions
@@ -3,11 +3,13 @@
 from functools import partial
 from typing import Callable, List, Optional, Tuple, Union

+
 import torch

 from .constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
-from .loader import _worker_init
+from .loader import _worker_init, adapt_to_chs
 from .naflex_dataset import VariableSeqMapWrapper, NaFlexCollator
+from .naflex_random_erasing import PatchRandomErasing
 from .transforms_factory import create_transform


@@ -16,19 +18,41 @@ class NaFlexPrefetchLoader:

     def __init__(
             self,
-            loader,
-            mean=(0.485, 0.456, 0.406),
-            std=(0.229, 0.224, 0.225),
-            img_dtype=torch.float32,
-            device=torch.device('cuda')
+            loader: torch.utils.data.DataLoader,
+            mean: Tuple[float, ...] = IMAGENET_DEFAULT_MEAN,
+            std: Tuple[float, ...] = IMAGENET_DEFAULT_STD,
+            channels: int = 3,
+            device: torch.device = torch.device('cuda'),
+            img_dtype: Optional[torch.dtype] = None,
+            re_prob: float = 0.,
+            re_mode: str = 'const',
+            re_count: int = 1,
+            re_num_splits: int = 0,
     ):
         self.loader = loader
         self.device = device
         self.img_dtype = img_dtype or torch.float32

         # Create mean/std tensors for normalization (will be applied to patches)
-        self.mean = torch.tensor([x * 255 for x in mean], device=device, dtype=self.img_dtype).view(1, 1, 3)
-        self.std = torch.tensor([x * 255 for x in std], device=device, dtype=self.img_dtype).view(1, 1, 3)
+        mean = adapt_to_chs(mean, channels)
+        std = adapt_to_chs(std, channels)
+        normalization_shape = (1, 1, channels)
+        self.channels = channels
+        self.mean = torch.tensor(
+            [x * 255 for x in mean], device=device, dtype=self.img_dtype).view(normalization_shape)
+        self.std = torch.tensor(
+            [x * 255 for x in std], device=device, dtype=self.img_dtype).view(normalization_shape)
+
+        if re_prob > 0.:
+            self.random_erasing = PatchRandomErasing(
+                erase_prob=re_prob,
+                mode=re_mode,
+                max_count=re_count,
+                num_splits=re_num_splits,
+                device=device,
+            )
+        else:
+            self.random_erasing = None

         # Check for CUDA/NPU availability
         self.is_cuda = device.type == 'cuda' and torch.cuda.is_available()
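
With channels now configurable, adapt_to_chs expands or trims the mean/std tuples to the channel count, and the (1, 1, channels) view lets them broadcast against patches flattened to (B, N, P*P, C). A quick shape check (standalone sketch, values illustrative):

import torch

channels = 3
mean = torch.tensor([x * 255 for x in (0.485, 0.456, 0.406)]).view(1, 1, channels)
patches = torch.rand(2, 196, 256, channels) * 255  # (B, N, P*P, C)
normed = patches.sub(mean)  # (1, 1, C) broadcasts over batch, patch, and pixel dims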
@@ -62,9 +86,18 @@ def __iter__(self):

             # Normalize patch values (assuming patches are in format [B, N, P*P*C])
             batch_size, num_patches, patch_pixels = next_input_dict['patches'].shape
-            patches = next_input_dict['patches'].view(batch_size, -1, 3)  # to [B*N, P*P, C] for normalization
+
+            # To [B*N, P*P, C] for normalization and erasing
+            patches = next_input_dict['patches'].view(batch_size, num_patches, -1, self.channels)
             patches = patches.sub(self.mean).div(self.std)

+            if self.random_erasing is not None:
+                patches = self.random_erasing(
+                    patches,
+                    patch_coord=next_input_dict['patch_coord'],
+                    patch_valid=next_input_dict.get('patch_valid', None),
+                )
+
             # Reshape back
             next_input_dict['patches'] = patches.reshape(batch_size, num_patches, patch_pixels)

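PatchRandomErasing itself lands in naflex_random_erasing.py, one of the commit's files not shown on this page. Judging from the call site above, it consumes patches in (B, N, P*P, C) layout together with patch coordinates and the validity mask, so erasing can be confined to real (non-padding) image content. A hedged sketch of the assumed interface, inferred only from the constructor and call in this diff:

# Assumed interface (inferred from the call site, not confirmed by this page):
# eraser = PatchRandomErasing(erase_prob=0.25, mode='const', max_count=1, device=device)
# patches = eraser(patches, patch_coord=batch['patch_coord'], patch_valid=batch['patch_valid'])
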
@@ -103,6 +136,7 @@ def create_naflex_loader(
         max_seq_len: int = 576,  # Fixed sequence length for validation
         batch_size: int = 32,  # Used for max_seq_len and max(train_seq_lens)
         is_training: bool = False,
+        mixup_fn: Optional[Callable] = None,

         no_aug: bool = False,
         re_prob: float = 0.,
@@ -141,7 +175,8 @@ def create_naflex_loader(
         persistent_workers: bool = True,
         worker_seeding: str = 'all',
 ):
-    """Create a data loader with dynamic sequence length sampling for training."""
+    """Create a data loader with dynamic sequence length sampling for training.
+    """

     if is_training:
         # For training, use the dynamic sequence length mechanism
@@ -186,6 +221,7 @@ def create_naflex_loader(
             patch_size=patch_size,
             seq_lens=train_seq_lens,
             max_tokens_per_batch=max_tokens_per_batch,
+            mixup_fn=mixup_fn,
             seed=seed,
             distributed=distributed,
             rank=rank,
@@ -219,6 +255,9 @@ def create_naflex_loader(
             std=std,
             img_dtype=img_dtype,
             device=device,
+            re_prob=re_prob,
+            re_mode=re_mode,
+            re_count=re_count,
         )

     else:
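
On the train-script side (the "train script integration" in the commit message), the wiring would look roughly like this; the argument names below are illustrative and not taken from this diff:

# Hypothetical train-script wiring (arg names illustrative):
mixup_fn = None
if args.mixup > 0. or args.cutmix > 0.:
    mixup_fn = NaFlexMixup(
        num_classes=args.num_classes,
        mixup_alpha=args.mixup,
        cutmix_alpha=args.cutmix,
        switch_prob=args.mixup_switch_prob,
        prob=args.mixup_prob,
        label_smoothing=args.smoothing,
    )
loader_train = create_naflex_loader(
    dataset_train,
    is_training=True,
    mixup_fn=mixup_fn,
    re_prob=args.reprob,
    re_mode=args.remode,
    re_count=args.recount,
)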

timm/data/naflex_mixup.py

Lines changed: 50 additions & 32 deletions
@@ -26,27 +26,24 @@ def mix_batch_variable_size(
         cutmix_alpha: float = 1.0,
         switch_prob: float = 0.5,
         local_shuffle: int = 4,
-) -> Tuple[List[torch.Tensor], List[float], Dict[int, int], bool]:
+) -> Tuple[List[torch.Tensor], List[float], Dict[int, int]]:
     """Apply Mixup or CutMix on a batch of variable‑sized images.

     The function first sorts images by aspect ratio and pairs neighbouring
     samples (optionally shuffling within small windows so pairs vary between
     epochs). Only the mutual central‑overlap region of each pair is mixed.

     Args:
-        imgs: List of transformed images shaped (C, H, W). Heights and
-            widths may differ between samples.
-        mixup_alpha: Beta‑distribution *α* for Mixup. Set to 0 to disable Mixup.
-        cutmix_alpha: Beta‑distribution *α* for CutMix. Set to 0 to disable CutMix.
+        imgs: List of transformed images shaped (C, H, W). Heights and widths may differ between samples.
+        mixup_alpha: Beta‑distribution alpha for Mixup. Set to 0 to disable Mixup.
+        cutmix_alpha: Beta‑distribution alpha for CutMix. Set to 0 to disable CutMix.
         switch_prob: Probability of using CutMix when both Mixup and CutMix are enabled.
-        local_shuffle: Size of local windows that are randomly shuffled after aspect sorting.
-            A value of 0 turns shuffling off.
+        local_shuffle: Size of local windows that are randomly shuffled after aspect sorting. Off if <= 1.

     Returns:
         mixed_imgs: List of mixed images.
         lam_list: Per‑sample lambda values representing the degree of mixing.
         pair_to: Mapping i -> j describing which sample was mixed with which (absent for unmatched odd sample).
-        use_cutmix: True if CutMix was used for this call, False if Mixup was used.
     """
     if len(imgs) < 2:
         raise ValueError("Need at least two images to perform Mixup/CutMix.")
@@ -71,7 +68,7 @@ def mix_batch_variable_size(
     order = sorted(range(len(imgs)), key=lambda i: imgs[i].shape[2] / imgs[i].shape[1])
     if local_shuffle > 1:
         for start in range(0, len(order), local_shuffle):
-            random.shuffle(order[start: start + local_shuffle])
+            random.shuffle(order[start:start + local_shuffle])

     pair_to: Dict[int, int] = {}
     for a, b in zip(order[::2], order[1::2]):
@@ -119,22 +116,41 @@ def mix_batch_variable_size(
             #print(i, 'Doing cutmix', yl_i, xl_i, yl_j, xl_j, ch, cw, lam_raw, corrected_lam)
         else:
             # Mixup: blend the entire overlap region
-            patch_i = xi[:, top_i: top_i + oh, left_i: left_i + ow]
-            patch_j = xj[:, top_j: top_j + oh, left_j: left_j + ow]
+            patch_i = xi[:, top_i:top_i + oh, left_i:left_i + ow]
+            patch_j = xj[:, top_j:top_j + oh, left_j:left_j + ow]

             blended = patch_i.mul(lam_raw).add_(patch_j, alpha=1.0 - lam_raw)
-            xi[:, top_i: top_i + oh, left_i: left_i + ow] = blended
+            xi[:, top_i:top_i + oh, left_i:left_i + ow] = blended
             mixed_imgs[i] = xi

         corrected_lam = (dest_area - overlap_area) / dest_area + lam_raw * overlap_area / dest_area
         lam_list[i] = corrected_lam
         #print(i, 'Doing mixup', top_i, left_i, top_j, left_j, (oh, ow), (hi, wi), (hj, wj), lam_raw, corrected_lam)

-    return mixed_imgs, lam_list, pair_to, use_cutmix
+    return mixed_imgs, lam_list, pair_to
+
+
+def smoothed_sparse_target(
+        targets: torch.Tensor,
+        *,
+        num_classes: int,
+        smoothing: float = 0.0,
+) -> torch.Tensor:
+    off_val = smoothing / num_classes
+    on_val = 1.0 - smoothing + off_val
+
+    y_onehot = torch.full(
+        (targets.size(0), num_classes),
+        off_val,
+        dtype=torch.float32,
+        device=targets.device
+    )
+    y_onehot.scatter_(1, targets.unsqueeze(1), on_val)
+    return y_onehot


 def pairwise_mixup_target(
-        labels: torch.Tensor,
+        targets: torch.Tensor,
         pair_to: Dict[int, int],
         lam_list: List[float],
         *,
@@ -144,21 +160,16 @@ def pairwise_mixup_target(
     """Create soft targets that match the pixel‑level mixing performed.

     Args:
-        labels: (B,) tensor of integer class indices.
+        targets: (B,) tensor of integer class indices.
         pair_to: Mapping of sample index to its mixed partner as returned by mix_batch_variable_size().
-        lam_list: Per‑sample fractions of self pixels, also from the mixer.
+        lam_list: Per‑sample fractions of own pixels, also from the mixer.
         num_classes: Total number of classes in the dataset.
         smoothing: Label‑smoothing value in the range [0, 1).

     Returns:
         Tensor of shape (B, num_classes) whose rows sum to 1.
     """
-    off_val = smoothing / num_classes
-    on_val = 1.0 - smoothing + off_val
-
-    y_onehot = torch.full((labels.size(0), num_classes), off_val, dtype=torch.float32, device=labels.device)
-    y_onehot.scatter_(1, labels.unsqueeze(1), on_val)
-
+    y_onehot = smoothed_sparse_target(targets, num_classes=num_classes, smoothing=smoothing)
     targets = y_onehot.clone()
     for i, j in pair_to.items():
         lam = lam_list[i]
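
Two details above merit a concrete check: the raw Beta-sampled lambda is corrected for partial overlap (pixels outside the mixed region always keep their own image), and the smoothing/one-hot construction is now shared via smoothed_sparse_target. A small numeric sketch:

import torch
from timm.data.naflex_mixup import smoothed_sparse_target

# Lambda correction: a 224x224 destination with a 160x160 mixed overlap, lam_raw = 0.6.
dest_area, overlap_area, lam_raw = 224 * 224, 160 * 160, 0.6
corrected_lam = (dest_area - overlap_area) / dest_area + lam_raw * overlap_area / dest_area
# ~0.49 of pixels lie outside the overlap (pure self) + 0.6 of the remaining ~0.51 -> ~0.796

# Smoothed one-hot: off = 0.1 / 4 = 0.025, on = 1 - 0.1 + 0.025 = 0.925; rows sum to 1.
y = smoothed_sparse_target(torch.tensor([1, 3]), num_classes=4, smoothing=0.1)
# tensor([[0.0250, 0.9250, 0.0250, 0.0250],
#         [0.0250, 0.0250, 0.0250, 0.9250]])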
@@ -177,8 +188,9 @@ def __init__(
             mixup_alpha: float = 0.8,
             cutmix_alpha: float = 1.0,
             switch_prob: float = 0.5,
+            prob: float = 1.0,
             local_shuffle: int = 4,
-            smoothing: float = 0.0,
+            label_smoothing: float = 0.0,
     ) -> None:
         """Configure the augmentation.

@@ -187,35 +199,41 @@ def __init__(
             mixup_alpha: Beta α for Mixup. 0 disables Mixup.
             cutmix_alpha: Beta α for CutMix. 0 disables CutMix.
             switch_prob: Probability of selecting CutMix when both modes are enabled.
+            prob: Probability of applying any mixing per batch.
             local_shuffle: Window size used to shuffle images after aspect sorting so pairings vary between epochs.
             smoothing: Label‑smoothing value. 0 disables smoothing.
         """
         self.num_classes = num_classes
         self.mixup_alpha = mixup_alpha
         self.cutmix_alpha = cutmix_alpha
         self.switch_prob = switch_prob
+        self.prob = prob
         self.local_shuffle = local_shuffle
-        self.smoothing = smoothing
+        self.smoothing = label_smoothing

     def __call__(
             self,
             imgs: List[torch.Tensor],
-            labels: torch.Tensor,
-    ) -> Tuple[List[torch.Tensor], torch.Tensor]:
+            targets: torch.Tensor,
+    ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
         """Apply the augmentation and generate matching targets.

         Args:
-            imgs: List of already‑transformed images shaped (C, H, W).
-            labels: Hard labels with shape (B,).
+            imgs: List of already transformed images shaped (C, H, W).
+            targets: Hard labels with shape (B,).

         Returns:
             mixed_imgs: List of mixed images in the same order and shapes as the input.
             targets: Soft‑label tensor shaped (B, num_classes) suitable for cross‑entropy with soft targets.
         """
-        if isinstance(labels, (list, tuple)):
-            labels = torch.tensor(labels)
+        if not isinstance(targets, torch.Tensor):
+            targets = torch.tensor(targets)
+
+        if random.random() > self.prob:
+            targets = smoothed_sparse_target(targets, num_classes=self.num_classes, smoothing=self.smoothing)
+            return imgs, targets.unbind(0)

-        mixed_imgs, lam_list, pair_to, _ = mix_batch_variable_size(
+        mixed_imgs, lam_list, pair_to = mix_batch_variable_size(
             imgs,
             mixup_alpha=self.mixup_alpha,
             cutmix_alpha=self.cutmix_alpha,
@@ -224,7 +242,7 @@ def __call__(
         )

         targets = pairwise_mixup_target(
-            labels,
+            targets,
             pair_to,
             lam_list,
             num_classes=self.num_classes,
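
End to end, the callable now takes variable-size images plus hard labels and returns mixed images with matching soft targets, skipping the mix entirely (while still applying label smoothing) with probability 1 - prob. A usage sketch (shapes illustrative):

import torch
from timm.data import NaFlexMixup

mixup = NaFlexMixup(num_classes=10, prob=0.5, label_smoothing=0.1)
imgs = [torch.rand(3, 224, 224), torch.rand(3, 192, 256)]  # per-image sizes may differ
mixed_imgs, soft_targets = mixup(imgs, [0, 7])
# mixed_imgs keeps each input's shape; soft_targets holds one (num_classes,) row per sample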
