Refactor prototype datasets #5778

Merged
38 commits from prototype-datasets-inheritance were merged into main on Apr 7, 2022.

Commits
1916bd7
Merge branch 'main' into prototype-datasets-inheritance
pmeier Feb 24, 2022
0aae427
refactor prototype datasets to inherit from IterDataPipe (#5448)
pmeier Feb 24, 2022
d5c3a43
Merge branch 'main' into prototype-datasets-inheritance
pmeier Mar 8, 2022
e7e921e
Merge branch 'main'
pmeier Mar 29, 2022
9acb8f9
Merge branch 'main'
pmeier Mar 31, 2022
c5f6c11
fix imagenet
pmeier Mar 31, 2022
49655b2
Merge branch 'main'
pmeier Apr 4, 2022
9f12ef4
fix prototype datasets data loading tests (#5711)
pmeier Apr 5, 2022
b514955
Merge branch 'main' into prototype-datasets-inheritance
pmeier Apr 5, 2022
aca4164
migrate VOC prototype dataset (#5743)
pmeier Apr 5, 2022
dead87d
migrate CIFAR prototype datasets (#5751)
pmeier Apr 6, 2022
6a0592f
migrate country211 prototype dataset (#5753)
pmeier Apr 6, 2022
2ed549d
migrate CLEVR prototype dataset (#5752)
pmeier Apr 6, 2022
42bc682
migrate coco prototype (#5473)
pmeier Apr 6, 2022
27104fe
Migrate PCAM prototype dataset (#5745)
NicolasHug Apr 6, 2022
291be31
Migrate DTD prototype dataset (#5757)
NicolasHug Apr 6, 2022
217616b
Migrate GTSRB prototype dataset (#5746)
NicolasHug Apr 6, 2022
2612c4c
migrate CelebA prototype dataset (#5750)
pmeier Apr 6, 2022
6de6ec4
Migrate Food101 prototype dataset (#5758)
NicolasHug Apr 6, 2022
ebe9006
Migrate Fer2013 prototype dataset (#5759)
NicolasHug Apr 6, 2022
8194b17
Migrate EuroSAT prototype dataset (#5760)
NicolasHug Apr 6, 2022
4c9cbab
Migrate Semeion prototype dataset (#5761)
NicolasHug Apr 6, 2022
5cd5722
migrate caltech prototype datasets (#5749)
pmeier Apr 6, 2022
70cd406
Migrate Oxford Pets prototype dataset (#5764)
NicolasHug Apr 6, 2022
ccfcaa5
migrate mnist prototype datasets (#5480)
pmeier Apr 6, 2022
9ea341a
Migrate Stanford Cars prototype dataset (#5767)
NicolasHug Apr 6, 2022
3b10147
fix category file generation (#5770)
pmeier Apr 6, 2022
1691e72
migrate cub200 prototype dataset (#5765)
pmeier Apr 6, 2022
2a212b8
Migrate USPS prototype dataset (#5771)
NicolasHug Apr 6, 2022
0b66ed6
migrate SBD prototype dataset (#5772)
pmeier Apr 6, 2022
b3c8384
Migrate SVHN prototype dataset (#5769)
NicolasHug Apr 6, 2022
fb56882
Merge branch 'main' into prototype-datasets-inheritance
pmeier Apr 6, 2022
1199144
add test to enforce __len__ is working on prototype datasets (#5742)
pmeier Apr 6, 2022
8e7987a
reactivate special dataset tests
pmeier Apr 6, 2022
5062a32
add missing annotation
pmeier Apr 6, 2022
3be12c7
Cleanup prototype dataset implementation (#5774)
NicolasHug Apr 7, 2022
4c73a5e
Merge branch 'main' into prototype-datasets-inheritance
pmeier Apr 7, 2022
cd36d06
update prototype dataset README (#5777)
pmeier Apr 7, 2022
329 changes: 184 additions & 145 deletions test/builtin_dataset_mocks.py

Large diffs are not rendered by default.

70 changes: 46 additions & 24 deletions test/test_prototype_builtin_datasets.py
@@ -7,9 +7,10 @@
import torch
from builtin_dataset_mocks import parametrize_dataset_mocks, DATASET_MOCKS
from torch.testing._comparison import assert_equal, TensorLikePair, ObjectPair
from torch.utils.data import DataLoader
from torch.utils.data.graph import traverse
from torch.utils.data.graph_settings import get_all_graph_pipes
from torchdata.datapipes.iter import IterDataPipe, Shuffler, ShardingFilter
from torchdata.datapipes.iter import Shuffler, ShardingFilter
from torchvision._utils import sequence_to_str
from torchvision.prototype import transforms, datasets
from torchvision.prototype.datasets.utils._internal import INFINITE_BUFFER_SIZE
@@ -42,14 +43,24 @@ def test_coverage():

@pytest.mark.filterwarnings("error")
class TestCommon:
@pytest.mark.parametrize("name", datasets.list_datasets())
def test_info(self, name):
try:
info = datasets.info(name)
except ValueError:
raise AssertionError("No info available.") from None

if not (isinstance(info, dict) and all(isinstance(key, str) for key in info.keys())):
raise AssertionError("Info should be a dictionary with string keys.")

@parametrize_dataset_mocks(DATASET_MOCKS)
def test_smoke(self, test_home, dataset_mock, config):
dataset_mock.prepare(test_home, config)

dataset = datasets.load(dataset_mock.name, **config)

if not isinstance(dataset, IterDataPipe):
raise AssertionError(f"Loading the dataset should return an IterDataPipe, but got {type(dataset)} instead.")
if not isinstance(dataset, datasets.utils.Dataset):
raise AssertionError(f"Loading the dataset should return an Dataset, but got {type(dataset)} instead.")

@parametrize_dataset_mocks(DATASET_MOCKS)
def test_sample(self, test_home, dataset_mock, config):
@@ -76,24 +87,7 @@ def test_num_samples(self, test_home, dataset_mock, config):

dataset = datasets.load(dataset_mock.name, **config)

num_samples = 0
for _ in dataset:
num_samples += 1

assert num_samples == mock_info["num_samples"]

@parametrize_dataset_mocks(DATASET_MOCKS)
def test_decoding(self, test_home, dataset_mock, config):
dataset_mock.prepare(test_home, config)

dataset = datasets.load(dataset_mock.name, **config)

undecoded_features = {key for key, value in next(iter(dataset)).items() if isinstance(value, io.IOBase)}
if undecoded_features:
raise AssertionError(
f"The values of key(s) "
f"{sequence_to_str(sorted(undecoded_features), separate_last='and ')} were not decoded."
)
assert len(list(dataset)) == mock_info["num_samples"]

@parametrize_dataset_mocks(DATASET_MOCKS)
def test_no_vanilla_tensors(self, test_home, dataset_mock, config):
@@ -116,14 +110,36 @@ def test_transformable(self, test_home, dataset_mock, config):

next(iter(dataset.map(transforms.Identity())))

@pytest.mark.parametrize("only_datapipe", [False, True])
@parametrize_dataset_mocks(DATASET_MOCKS)
def test_serializable(self, test_home, dataset_mock, config):
def test_traversable(self, test_home, dataset_mock, config, only_datapipe):
dataset_mock.prepare(test_home, config)
dataset = datasets.load(dataset_mock.name, **config)

traverse(dataset, only_datapipe=only_datapipe)

@parametrize_dataset_mocks(DATASET_MOCKS)
def test_serializable(self, test_home, dataset_mock, config):
dataset_mock.prepare(test_home, config)
dataset = datasets.load(dataset_mock.name, **config)

pickle.dumps(dataset)

@pytest.mark.parametrize("num_workers", [0, 1])
@parametrize_dataset_mocks(DATASET_MOCKS)
def test_data_loader(self, test_home, dataset_mock, config, num_workers):
dataset_mock.prepare(test_home, config)
dataset = datasets.load(dataset_mock.name, **config)

dl = DataLoader(
dataset,
batch_size=2,
num_workers=num_workers,
collate_fn=lambda batch: batch,
)

next(iter(dl))

# TODO: we need to enforce not only that both a Shuffler and a ShardingFilter are part of the datapipe, but also
# that the Shuffler comes before the ShardingFilter. Early commits in https://github.com/pytorch/vision/pull/5680
# contain a custom test for that, but we opted to wait for a potential solution / test from torchdata for now.
@@ -132,7 +148,6 @@ def test_serializable(self, test_home, dataset_mock, config):
def test_has_annotations(self, test_home, dataset_mock, config, annotation_dp_type):

dataset_mock.prepare(test_home, config)

dataset = datasets.load(dataset_mock.name, **config)

if not any(isinstance(dp, annotation_dp_type) for dp in extract_datapipes(dataset)):
@@ -160,6 +175,13 @@ def test_infinite_buffer_size(self, test_home, dataset_mock, config):
# resolved
assert dp.buffer_size == INFINITE_BUFFER_SIZE

@parametrize_dataset_mocks(DATASET_MOCKS)
def test_has_length(self, test_home, dataset_mock, config):
dataset_mock.prepare(test_home, config)
dataset = datasets.load(dataset_mock.name, **config)

assert len(dataset) > 0


@parametrize_dataset_mocks(DATASET_MOCKS["qmnist"])
class TestQMNIST:
@@ -186,7 +208,7 @@ class TestGTSRB:
def test_label_matches_path(self, test_home, dataset_mock, config):
# We read the labels from the csv files instead. But for the trainset, the labels are also part of the path.
# This test makes sure that they're both the same
if config.split != "train":
if config["split"] != "train":
return

dataset_mock.prepare(test_home, config)
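The TODO in test_has_annotations above notes that the relative order of the Shuffler and the ShardingFilter is not yet enforced. A minimal sketch of such an ordering check, assuming traverse() returns the nested {datapipe: predecessors} mapping it does at the time of this PR; the helper names below are hypothetical and not part of this PR:

from torch.utils.data.graph import traverse
from torchdata.datapipes.iter import Shuffler, ShardingFilter

def _find_predecessors(graph, dp_type):
    # Recursively search the nested {datapipe: predecessors} mapping and
    # return the predecessor subgraph of the first datapipe of the given type.
    for dp, predecessors in graph.items():
        if isinstance(dp, dp_type):
            return predecessors
        match = _find_predecessors(predecessors, dp_type)
        if match is not None:
            return match
    return None

def assert_shuffling_before_sharding(dataset):
    graph = traverse(dataset)
    sharding_predecessors = _find_predecessors(graph, ShardingFilter)
    assert sharding_predecessors is not None, "no ShardingFilter in the graph"
    # The Shuffler has to appear upstream of the ShardingFilter, i.e. among
    # the ShardingFilter's predecessors.
    assert _find_predecessors(sharding_predecessors, Shuffler) is not None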
231 changes: 0 additions & 231 deletions test/test_prototype_datasets_api.py

This file was deleted.

20 changes: 19 additions & 1 deletion test/test_prototype_datasets_utils.py
@@ -5,7 +5,7 @@
import torch
from datasets_utils import make_fake_flo_file
from torchvision.datasets._optical_flow import _read_flo as read_flo_ref
from torchvision.prototype.datasets.utils import HttpResource, GDriveResource
from torchvision.prototype.datasets.utils import HttpResource, GDriveResource, Dataset
from torchvision.prototype.datasets.utils._internal import read_flo, fromfile


@@ -101,3 +101,21 @@ def preprocess_sentinel(path):
assert redirected_resource.file_name == file_name
assert redirected_resource.sha256 == sha256_sentinel
assert redirected_resource._preprocess is preprocess_sentinel


def test_missing_dependency_error():
class DummyDataset(Dataset):
def __init__(self):
super().__init__(root="root", dependencies=("fake_dependency",))

def _resources(self):
pass

def _datapipe(self, resource_dps):
pass

def __len__(self):
pass

with pytest.raises(ModuleNotFoundError, match="depends on the third-party package 'fake_dependency'"):
DummyDataset()
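
For context, a minimal usage sketch of the API surface these tests exercise; the dataset name and config below are illustrative, any registered dataset works the same way:

from torch.utils.data import DataLoader
from torchvision.prototype import datasets

# Registered datasets can be discovered and inspected without loading anything.
print(datasets.list_datasets())
info = datasets.info("gtsrb")  # a dict with string keys

# load() now returns a datasets.utils.Dataset, which is an IterDataPipe with a
# working __len__, so it plugs directly into a vanilla DataLoader.
dataset = datasets.load("gtsrb", split="train")
assert len(dataset) > 0

dl = DataLoader(dataset, batch_size=2, collate_fn=lambda batch: batch)
batch = next(iter(dl))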