
Commit 9ffac6c

joein and generall committed
WIP: add datasets, client factory, cli interface, update client entities
Co-authored-by: Andrey Vasnetsov <[email protected]>
1 parent e90d15f commit 9ffac6c

25 files changed: +389 -189 lines changed

benchmark/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -3,3 +3,4 @@
 # Base directory point to the main directory of the project, so all the data
 # loaded from files can refer to it as a root directory
 BASE_DIRECTORY = Path(__file__).parent.parent
+DATASETS_DIR = BASE_DIRECTORY / "datasets"

benchmark/dataset.py

Lines changed: 61 additions & 30 deletions
@@ -1,40 +1,71 @@
-import json
-from dataclasses import dataclass, field
-from pathlib import Path
-from typing import Any, Dict, List, Optional, Text
+import os
+from dataclasses import dataclass
+from typing import Optional
+import tarfile
 
-import jsons
+import urllib.request
 
-from benchmark import BASE_DIRECTORY
-
-
-@dataclass
-class PhaseConfig:
-    files: List[Text] = field(default_factory=list)
-    engine: Dict[Any, Any] = field(default_factory=dict)
+from benchmark import DATASETS_DIR
+from dataset_reader.ann_h5_reader import AnnH5Reader
+from dataset_reader.base_reader import BaseReader
+from dataset_reader.json_reader import JSONReader
 
 
 @dataclass
 class DatasetConfig:
     vector_size: int
-    distance: Text
-    load: PhaseConfig
-    search: PhaseConfig
-    url: Optional[Text]
+    distance: str
+    name: str
+    type: str
+    path: str
+    link: Optional[str] = None
+
+
+READER_TYPE = {"h5": AnnH5Reader, "jsonl": JSONReader}
 
 
 class Dataset:
-    @classmethod
-    def from_name(cls, name: Text) -> "Dataset":
-        config_path = BASE_DIRECTORY / "dataset" / name / "config.json"
-        with open(config_path, "r") as fp:
-            config = jsons.load(json.load(fp), DatasetConfig)
-        return Dataset(name, config)
-
-    def __init__(self, name: Text, config: DatasetConfig):
-        self.name = name
-        self.config = config
-
-    @property
-    def root_dir(self) -> Path:
-        return BASE_DIRECTORY / "dataset" / self.name
+    def __init__(self, config: dict):
+        self.config = DatasetConfig(**config)
+
+    def download(self):
+        target_path = DATASETS_DIR / self.config.path
+
+        if target_path.exists():
+            print(f"{target_path} already exists")
+            return
+
+        if self.config.link:
+            print(f"Downloading {self.config.link}...")
+            tmp_path, _ = urllib.request.urlretrieve(self.config.link)
+
+            if tmp_path.endswith(".tgz") or tmp_path.endswith(".tar.gz"):
+                print(f"Extracting: {tmp_path} -> {target_path}")
+                (DATASETS_DIR / self.config.path).mkdir(exist_ok=True)
+                file = tarfile.open(tmp_path)
+                file.extractall(target_path)
+                file.close()
+                os.remove(tmp_path)
+            else:
+                print(f"Moving: {tmp_path} -> {target_path}")
+                (DATASETS_DIR / self.config.path).parent.mkdir(exist_ok=True)
+                os.rename(tmp_path, target_path)
+
+    def get_reader(self) -> BaseReader:
+        reader_class = READER_TYPE[self.config.type]
+        return reader_class(self.config.path)
+
+
+if __name__ == "__main__":
+    dataset = Dataset(
+        {
+            "name": "glove-25-angular",
+            "vector_size": 25,
+            "distance": "Cosine",
+            "type": "h5",
+            "path": "glove-25-angular/glove-25-angular.hdf5",
+            "link": "http://ann-benchmarks.com/glove-25-angular.hdf5",
+        }
+    )
+
+    dataset.download()
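A minimal usage sketch of the new Dataset class, using the bundled random-100 entry from datasets/datasets.json. The benchmark runner that would drive this loop is not part of this commit, and the sketch assumes the random-100 files are reachable from the working directory:

    from benchmark.dataset import Dataset

    # One entry from datasets/datasets.json; it has no `link`, so download()
    # has nothing to fetch for this locally bundled dataset.
    dataset = Dataset(
        {
            "name": "random-100",
            "vector_size": 100,
            "distance": "Cosine",
            "type": "jsonl",
            "path": "random-100/",
        }
    )
    dataset.download()

    # get_reader() dispatches on `type` via READER_TYPE ("jsonl" -> JSONReader)
    reader = dataset.get_reader()
    for record in reader.read_data():
        ...  # hand each record to an uploader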

benchmark/settings.py

Lines changed: 1 addition & 1 deletion
@@ -3,4 +3,4 @@
 
 CODE_DIR = os.path.dirname(__file__)
 ROOT_DIR = os.path.dirname(CODE_DIR)
-DATASET_DIR = os.path.join(ROOT_DIR, "dataset")
+DATASETS_DIR = os.path.join(ROOT_DIR, "datasets")

dataset/glove-100-angular/.dockerignore

Lines changed: 0 additions & 2 deletions
This file was deleted.

dataset/glove-100-angular/.gitignore

Lines changed: 0 additions & 2 deletions
This file was deleted.

dataset/glove-100-angular/Dockerfile

Lines changed: 0 additions & 8 deletions
This file was deleted.

dataset/glove-100-angular/config.json

Lines changed: 0 additions & 16 deletions
This file was deleted.

dataset/glove-100-angular/download.py

Lines changed: 0 additions & 22 deletions
This file was deleted.

dataset/random-100/Dockerfile

Lines changed: 0 additions & 5 deletions
This file was deleted.

dataset/random-100/config.json

Lines changed: 0 additions & 16 deletions
This file was deleted.

dataset_reader/h5_reader.py renamed to dataset_reader/ann_h5_reader.py

Lines changed: 3 additions & 3 deletions
@@ -5,7 +5,7 @@
 from dataset_reader.base_reader import BaseReader, Record, Query
 
 
-class H5Reader(BaseReader):
+class AnnH5Reader(BaseReader):
 
     def __init__(self, path):
         self.path = path
@@ -42,8 +42,8 @@ def read_data(self) -> Iterator[Record]:
     # `distances` - float - distances for nearest neighbors for test vectors
 
     test_path = os.path.join(DATASET_DIR, 'glove-100-angular', 'glove-100-angular.hdf5')
-    record = next(H5Reader(test_path).read_data())
+    record = next(AnnH5Reader(test_path).read_data())
     print(record, end='\n\n')
 
-    query = next(H5Reader(test_path).read_queries())
+    query = next(AnnH5Reader(test_path).read_queries())
     print(query)
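For reference, the ann-benchmarks HDF5 files this reader consumes bundle four datasets per file. A hypothetical inspection with h5py (the file name is an example, and h5py is assumed to be the underlying dependency):

    import h5py

    # `train`     - vectors to upload into the engine
    # `test`      - query vectors
    # `neighbors` - ids of the true nearest neighbors for each query
    # `distances` - distances to those neighbors
    with h5py.File("glove-25-angular.hdf5", "r") as data:
        for key in ("train", "test", "neighbors", "distances"):
            print(key, data[key].shape)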

datasets/.gitignore

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+*/*
+!random-100/

datasets/datasets.json

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
+[
+  {
+    "name": "glove-25-angular",
+    "vector_size": 25,
+    "distance": "Cosine",
+    "type": "h5",
+    "path": "glove-25-angular/glove-25-angular.hdf5",
+    "link": "http://ann-benchmarks.com/glove-25-angular.hdf5"
+  },
+  {
+    "name": "glove-100-angular",
+    "vector_size": 100,
+    "distance": "Cosine",
+    "type": "h5",
+    "path": "glove-100-angular/glove-100-angular.hdf5",
+    "link": "http://ann-benchmarks.com/glove-100-angular.hdf5"
+  },
+  {
+    "name": "random-100",
+    "vector_size": 100,
+    "distance": "Cosine",
+    "type": "jsonl",
+    "path": "random-100/"
+  }
+]
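This single registry replaces the per-dataset config.json files deleted above. How the new CLI consumes it is not shown in this commit; a plausible sketch, assuming the file is read whole and keyed by name:

    import json

    from benchmark import DATASETS_DIR
    from benchmark.dataset import Dataset

    with open(DATASETS_DIR / "datasets.json") as fp:
        datasets = {entry["name"]: Dataset(entry) for entry in json.load(fp)}

    datasets["glove-25-angular"].download()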
File renamed without changes.

engine/base_client/client.py

Lines changed: 28 additions & 22 deletions
@@ -1,6 +1,7 @@
-from typing import Iterable
+import json
+from datetime import datetime
 
-from dataset_reader.base_reader import Query
+from benchmark.dataset import Dataset
 from engine.base_client.configure import BaseConfigurator
 from engine.base_client.search import BaseSearcher
 from engine.base_client.upload import BaseUploader
@@ -9,34 +10,39 @@
 class BaseClient:
     def __init__(
         self,
-        url,
+        name: str,  # name of the experiment
         configurator: BaseConfigurator,
         uploader: BaseUploader,
         searcher: BaseSearcher,
     ):
-        self.url = url
+        self.name = name
         self.configurator = configurator
         self.uploader = uploader
         self.searcher = searcher
 
-    def search_all(
-        self, connection_params, search_params, queries: Iterable[Query], parallel,
-    ):
-        precisions, latencies = self.searcher.search_all(
-            self.url, connection_params, search_params, queries, parallel,
-        )
-        print(f"search::latency = {sum(latencies) / parallel}")
-        print(f"search::precisions = {sum(precisions) / len(precisions)}")
-        return latencies
+    def save_experiment_results(self, dataset_name: str, results: dict):
+        now = datetime.now()
+        timestamp = now.strftime("%Y-%m-%d-%H-%M-%S")
+        experiments_file = f"{self.name}-{dataset_name}-{timestamp}.json"
+        with open(experiments_file, "w") as out:
+            out.write(
+                json.dumps(results, indent=2)
+            )
 
-    def upload(self, filename, batch_size, parallel, connection_params):
-        latencies = self.uploader.upload(
-            self.url, filename, batch_size, parallel, connection_params
+    def run_experiment(self, dataset: Dataset):
+        self.configurator.configure(
+            distance=dataset.config.distance,
+            vector_size=dataset.config.vector_size,
         )
-        print(f"upload::latency = {sum(latencies) / parallel}")
-        return latencies
 
-    def configure(self, distance, vector_size, collection_params):
-        latency = self.configurator.configure(distance, vector_size, collection_params)
-        print(f"configure::latency = {latency}")
-        return latency
+        reader = dataset.get_reader()
+        upload_stats = self.uploader.upload(reader.read_data())
+        search_stats = self.searcher.search_all(reader.read_queries())
+
+        self.save_experiment_results(
+            dataset.config.name,
+            {
+                "upload": upload_stats,
+                "search": search_stats
+            }
+        )
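After this change BaseClient no longer talks to a URL itself; it only orchestrates the configurator, uploader, and searcher and persists their stats. A smoke-test sketch with hypothetical no-op stubs (the concrete engine classes, and the exact shape of the stats dictionaries returned by upload() and search_all(), are not defined in this diff; the random-100 files are assumed to be readable):

    from benchmark.dataset import Dataset
    from engine.base_client.client import BaseClient

    class NoopConfigurator:
        def configure(self, distance, vector_size):
            return 0.0

    class NoopUploader:
        def upload(self, records):
            return {"records": sum(1 for _ in records)}

    class NoopSearcher:
        def search_all(self, queries):
            return {"queries": sum(1 for _ in queries)}

    client = BaseClient(
        name="smoke-test",
        configurator=NoopConfigurator(),
        uploader=NoopUploader(),
        searcher=NoopSearcher(),
    )

    dataset = Dataset(
        {
            "name": "random-100",
            "vector_size": 100,
            "distance": "Cosine",
            "type": "jsonl",
            "path": "random-100/",
        }
    )
    client.run_experiment(dataset)  # writes smoke-test-random-100-<timestamp>.json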

engine/base_client/configure.py

Lines changed: 4 additions & 3 deletions
@@ -7,7 +7,8 @@ class BaseConfigurator:
     DEFAULT_CONFIG_PATH = Path("default.json")
     DISTANCE_MAPPING = {}
 
-    def __init__(self, collection_params: dict, connection_params: dict):
+    def __init__(self, host, collection_params: dict, connection_params: dict):
+        self.host = host
         self.collection_params = collection_params
         self.connection_params = connection_params
 
@@ -17,10 +18,10 @@ def clean(self):
     def recreate(self, distance, vector_size, collection_params):
         raise NotImplementedError()
 
-    def configure(self, distance, vector_size, collection_params):
+    def configure(self, distance, vector_size):
         self.clean()
         start = time.perf_counter()
-        self.recreate(distance, vector_size, collection_params)
+        self.recreate(distance, vector_size, self.collection_params)
         return time.perf_counter() - start
 
     @classmethod
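With this change, collection_params is captured once in __init__ and injected by the base class, so a subclass only overrides clean() and recreate(). A minimal hypothetical subclass illustrating the new contract (not an engine that exists in this repo):

    from engine.base_client.configure import BaseConfigurator

    class InMemoryConfigurator(BaseConfigurator):
        def clean(self):
            self.collections = {}

        def recreate(self, distance, vector_size, collection_params):
            # Record the would-be collection instead of calling a real engine.
            self.collections["benchmark"] = {
                "distance": distance,
                "vector_size": vector_size,
                **collection_params,
            }

    configurator = InMemoryConfigurator(
        host="localhost", collection_params={}, connection_params={}
    )
    elapsed = configurator.configure(distance="Cosine", vector_size=25)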
