
Commit 9ffac6c

joein and generall committed
WIP: add datasets, client factory, cli interface, update client entities
Co-authored-by: Andrey Vasnetsov <[email protected]>
1 parent e90d15f commit 9ffac6c

25 files changed: +389 -189 lines changed

benchmark/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -3,3 +3,4 @@
 # Base directory point to the main directory of the project, so all the data
 # loaded from files can refer to it as a root directory
 BASE_DIRECTORY = Path(__file__).parent.parent
+DATASETS_DIR = BASE_DIRECTORY / "datasets"

benchmark/dataset.py

Lines changed: 61 additions & 30 deletions
@@ -1,40 +1,71 @@
-import json
-from dataclasses import dataclass, field
-from pathlib import Path
-from typing import Any, Dict, List, Optional, Text
+import os
+from dataclasses import dataclass
+from typing import Optional
+import tarfile
 
-import jsons
+import urllib.request
 
-from benchmark import BASE_DIRECTORY
-
-
-@dataclass
-class PhaseConfig:
-    files: List[Text] = field(default_factory=list)
-    engine: Dict[Any, Any] = field(default_factory=dict)
+from benchmark import DATASETS_DIR
+from dataset_reader.ann_h5_reader import AnnH5Reader
+from dataset_reader.base_reader import BaseReader
+from dataset_reader.json_reader import JSONReader
 
 
 @dataclass
 class DatasetConfig:
     vector_size: int
-    distance: Text
-    load: PhaseConfig
-    search: PhaseConfig
-    url: Optional[Text]
+    distance: str
+    name: str
+    type: str
+    path: str
+    link: Optional[str] = None
+
+
+READER_TYPE = {"h5": AnnH5Reader, "jsonl": JSONReader}
 
 
 class Dataset:
-    @classmethod
-    def from_name(cls, name: Text) -> "Dataset":
-        config_path = BASE_DIRECTORY / "dataset" / name / "config.json"
-        with open(config_path, "r") as fp:
-            config = jsons.load(json.load(fp), DatasetConfig)
-        return Dataset(name, config)
-
-    def __init__(self, name: Text, config: DatasetConfig):
-        self.name = name
-        self.config = config
-
-    @property
-    def root_dir(self) -> Path:
-        return BASE_DIRECTORY / "dataset" / self.name
+    def __init__(self, config: dict):
+        self.config = DatasetConfig(**config)
+
+    def download(self):
+        target_path = DATASETS_DIR / self.config.path
+
+        if target_path.exists():
+            print(f"{target_path} already exists")
+            return
+
+        if self.config.link:
+            print(f"Downloading {self.config.link}...")
+            tmp_path, _ = urllib.request.urlretrieve(self.config.link)
+
+            if tmp_path.endswith(".tgz") or tmp_path.endswith(".tar.gz"):
+                print(f"Extracting: {tmp_path} -> {target_path}")
+                (DATASETS_DIR / self.config.path).mkdir(exist_ok=True)
+                file = tarfile.open(tmp_path)
+                file.extractall(target_path)
+                file.close()
+                os.remove(tmp_path)
+            else:
+                print(f"Moving: {tmp_path} -> {target_path}")
+                (DATASETS_DIR / self.config.path).parent.mkdir(exist_ok=True)
+                os.rename(tmp_path, target_path)
+
+    def get_reader(self) -> BaseReader:
+        reader_class = READER_TYPE[self.config.type]
+        return reader_class(self.config.path)
+
+
+if __name__ == "__main__":
+    dataset = Dataset(
+        {
+            "name": "glove-25-angular",
+            "vector_size": 25,
+            "distance": "Cosine",
+            "type": "h5",
+            "path": "glove-25-angular/glove-25-angular.hdf5",
+            "link": "http://ann-benchmarks.com/glove-25-angular.hdf5",
+        }
+    )
+
+    dataset.download()
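A minimal usage sketch of the new Dataset class, using the bundled random-100 entry from datasets/datasets.json. The benchmark runner that would drive this loop is not part of this commit, and the sketch assumes the random-100 files are reachable from the working directory:

    from benchmark.dataset import Dataset

    # One entry from datasets/datasets.json; it has no `link`, so download()
    # has nothing to fetch for this locally bundled dataset.
    dataset = Dataset(
        {
            "name": "random-100",
            "vector_size": 100,
            "distance": "Cosine",
            "type": "jsonl",
            "path": "random-100/",
        }
    )
    dataset.download()

    # get_reader() dispatches on `type` via READER_TYPE ("jsonl" -> JSONReader)
    reader = dataset.get_reader()
    for record in reader.read_data():
        ...  # hand each record to an uploader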

benchmark/settings.py

Lines changed: 1 addition & 1 deletion
@@ -3,4 +3,4 @@
 
 CODE_DIR = os.path.dirname(__file__)
 ROOT_DIR = os.path.dirname(CODE_DIR)
-DATASET_DIR = os.path.join(ROOT_DIR, "dataset")
+DATASETS_DIR = os.path.join(ROOT_DIR, "datasets")

dataset/glove-100-angular/.dockerignore

Lines changed: 0 additions & 2 deletions
This file was deleted.

dataset/glove-100-angular/.gitignore

Lines changed: 0 additions & 2 deletions
This file was deleted.

dataset/glove-100-angular/Dockerfile

Lines changed: 0 additions & 8 deletions
This file was deleted.

dataset/glove-100-angular/config.json

Lines changed: 0 additions & 16 deletions
This file was deleted.

dataset/glove-100-angular/download.py

Lines changed: 0 additions & 22 deletions
This file was deleted.

dataset/random-100/Dockerfile

Lines changed: 0 additions & 5 deletions
This file was deleted.

dataset/random-100/config.json

Lines changed: 0 additions & 16 deletions
This file was deleted.

dataset_reader/h5_reader.py renamed to dataset_reader/ann_h5_reader.py

Lines changed: 3 additions & 3 deletions
@@ -5,7 +5,7 @@
 from dataset_reader.base_reader import BaseReader, Record, Query
 
 
-class H5Reader(BaseReader):
+class AnnH5Reader(BaseReader):
 
     def __init__(self, path):
         self.path = path
@@ -42,8 +42,8 @@ def read_data(self) -> Iterator[Record]:
     # `distances` - float - distances for nearest neighbors for test vectors
 
     test_path = os.path.join(DATASET_DIR, 'glove-100-angular', 'glove-100-angular.hdf5')
-    record = next(H5Reader(test_path).read_data())
+    record = next(AnnH5Reader(test_path).read_data())
     print(record, end='\n\n')
 
-    query = next(H5Reader(test_path).read_queries())
+    query = next(AnnH5Reader(test_path).read_queries())
     print(query)
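For reference, the ann-benchmarks HDF5 files this reader consumes bundle four datasets per file. A hypothetical inspection with h5py (the file name is an example, and h5py is assumed to be the underlying dependency):

    import h5py

    # `train`     - vectors to upload into the engine
    # `test`      - query vectors
    # `neighbors` - ids of the true nearest neighbors for each query
    # `distances` - distances to those neighbors
    with h5py.File("glove-25-angular.hdf5", "r") as data:
        for key in ("train", "test", "neighbors", "distances"):
            print(key, data[key].shape)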

datasets/.gitignore

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+*/*
+!random-100/

datasets/datasets.json

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
+[
+  {
+    "name": "glove-25-angular",
+    "vector_size": 25,
+    "distance": "Cosine",
+    "type": "h5",
+    "path": "glove-25-angular/glove-25-angular.hdf5",
+    "link": "http://ann-benchmarks.com/glove-25-angular.hdf5"
+  },
+  {
+    "name": "glove-100-angular",
+    "vector_size": 100,
+    "distance": "Cosine",
+    "type": "h5",
+    "path": "glove-100-angular/glove-100-angular.hdf5",
+    "link": "http://ann-benchmarks.com/glove-100-angular.hdf5"
+  },
+  {
+    "name": "random-100",
+    "vector_size": 100,
+    "distance": "Cosine",
+    "type": "jsonl",
+    "path": "random-100/"
+  }
+]
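This single registry replaces the per-dataset config.json files deleted above. How the new CLI consumes it is not shown in this commit; a plausible sketch, assuming the file is read whole and keyed by name:

    import json

    from benchmark import DATASETS_DIR
    from benchmark.dataset import Dataset

    with open(DATASETS_DIR / "datasets.json") as fp:
        datasets = {entry["name"]: Dataset(entry) for entry in json.load(fp)}

    datasets["glove-25-angular"].download()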
File renamed without changes.

engine/base_client/client.py

Lines changed: 28 additions & 22 deletions
@@ -1,6 +1,7 @@
-from typing import Iterable
+import json
+from datetime import datetime
 
-from dataset_reader.base_reader import Query
+from benchmark.dataset import Dataset
 from engine.base_client.configure import BaseConfigurator
 from engine.base_client.search import BaseSearcher
 from engine.base_client.upload import BaseUploader
@@ -9,34 +10,39 @@
 class BaseClient:
     def __init__(
         self,
-        url,
+        name: str,  # name of the experiment
         configurator: BaseConfigurator,
         uploader: BaseUploader,
         searcher: BaseSearcher,
     ):
-        self.url = url
+        self.name = name
         self.configurator = configurator
         self.uploader = uploader
         self.searcher = searcher
 
-    def search_all(
-        self, connection_params, search_params, queries: Iterable[Query], parallel,
-    ):
-        precisions, latencies = self.searcher.search_all(
-            self.url, connection_params, search_params, queries, parallel,
-        )
-        print(f"search::latency = {sum(latencies) / parallel}")
-        print(f"search::precisions = {sum(precisions) / len(precisions)}")
-        return latencies
+    def save_experiment_results(self, dataset_name: str, results: dict):
+        now = datetime.now()
+        timestamp = now.strftime("%Y-%m-%d-%H-%M-%S")
+        experiments_file = f"{self.name}-{dataset_name}-{timestamp}.json"
+        with open(experiments_file, "w") as out:
+            out.write(
+                json.dumps(results, indent=2)
+            )
 
-    def upload(self, filename, batch_size, parallel, connection_params):
-        latencies = self.uploader.upload(
-            self.url, filename, batch_size, parallel, connection_params
+    def run_experiment(self, dataset: Dataset):
+        self.configurator.configure(
+            distance=dataset.config.distance,
+            vector_size=dataset.config.vector_size,
         )
-        print(f"upload::latency = {sum(latencies) / parallel}")
-        return latencies
 
-    def configure(self, distance, vector_size, collection_params):
-        latency = self.configurator.configure(distance, vector_size, collection_params)
-        print(f"configure::latency = {latency}")
-        return latency
+        reader = dataset.get_reader()
+        upload_stats = self.uploader.upload(reader.read_data())
+        search_stats = self.searcher.search_all(reader.read_queries())
+
+        self.save_experiment_results(
+            dataset.config.name,
+            {
+                "upload": upload_stats,
+                "search": search_stats
+            }
+        )
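After this change BaseClient no longer talks to a URL itself; it only orchestrates the configurator, uploader, and searcher and persists their stats. A smoke-test sketch with hypothetical no-op stubs (the concrete engine classes, and the exact shape of the stats dictionaries returned by upload() and search_all(), are not defined in this diff; the random-100 files are assumed to be readable):

    from benchmark.dataset import Dataset
    from engine.base_client.client import BaseClient

    class NoopConfigurator:
        def configure(self, distance, vector_size):
            return 0.0

    class NoopUploader:
        def upload(self, records):
            return {"records": sum(1 for _ in records)}

    class NoopSearcher:
        def search_all(self, queries):
            return {"queries": sum(1 for _ in queries)}

    client = BaseClient(
        name="smoke-test",
        configurator=NoopConfigurator(),
        uploader=NoopUploader(),
        searcher=NoopSearcher(),
    )

    dataset = Dataset(
        {
            "name": "random-100",
            "vector_size": 100,
            "distance": "Cosine",
            "type": "jsonl",
            "path": "random-100/",
        }
    )
    client.run_experiment(dataset)  # writes smoke-test-random-100-<timestamp>.json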

engine/base_client/configure.py

Lines changed: 4 additions & 3 deletions
@@ -7,7 +7,8 @@ class BaseConfigurator:
     DEFAULT_CONFIG_PATH = Path("default.json")
     DISTANCE_MAPPING = {}
 
-    def __init__(self, collection_params: dict, connection_params: dict):
+    def __init__(self, host, collection_params: dict, connection_params: dict):
+        self.host = host
         self.collection_params = collection_params
         self.connection_params = connection_params
 
@@ -17,10 +18,10 @@ def clean(self):
     def recreate(self, distance, vector_size, collection_params):
         raise NotImplementedError()
 
-    def configure(self, distance, vector_size, collection_params):
+    def configure(self, distance, vector_size):
         self.clean()
         start = time.perf_counter()
-        self.recreate(distance, vector_size, collection_params)
+        self.recreate(distance, vector_size, self.collection_params)
         return time.perf_counter() - start
 
     @classmethod
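With this change, collection_params is captured once in __init__ and injected by the base class, so a subclass only overrides clean() and recreate(). A minimal hypothetical subclass illustrating the new contract (not an engine that exists in this repo):

    from engine.base_client.configure import BaseConfigurator

    class InMemoryConfigurator(BaseConfigurator):
        def clean(self):
            self.collections = {}

        def recreate(self, distance, vector_size, collection_params):
            # Record the would-be collection instead of calling a real engine.
            self.collections["benchmark"] = {
                "distance": distance,
                "vector_size": vector_size,
                **collection_params,
            }

    configurator = InMemoryConfigurator(
        host="localhost", collection_params={}, connection_params={}
    )
    elapsed = configurator.configure(distance="Cosine", vector_size=25)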
