
Commit 5b2fb6a

first running benchmark
1 parent 9ffac6c commit 5b2fb6a

File tree

14 files changed, +532 -219 lines changed

benchmark/dataset.py

Lines changed: 1 addition & 1 deletion
@@ -53,7 +53,7 @@ def download(self):

     def get_reader(self) -> BaseReader:
         reader_class = READER_TYPE[self.config.type]
-        return reader_class(self.config.path)
+        return reader_class(DATASETS_DIR / self.config.path)


 if __name__ == "__main__":

benchmark/settings.py

Lines changed: 2 additions & 3 deletions
@@ -1,6 +1,5 @@
 import os.path
-
+from pathlib import Path

 CODE_DIR = os.path.dirname(__file__)
-ROOT_DIR = os.path.dirname(CODE_DIR)
-DATASETS_DIR = os.path.join(ROOT_DIR, "datasets")
+ROOT_DIR = Path(os.path.dirname(CODE_DIR))
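
Because ROOT_DIR is now a pathlib.Path, subdirectories can be built with the / operator, which the other diffs in this commit rely on (DATASETS_DIR / self.config.path, ROOT_DIR / "results"). DATASETS_DIR is no longer defined in settings.py; the json_reader diff below imports it with `from benchmark import DATASETS_DIR`, so it is presumably re-defined in the benchmark package. A minimal sketch of that assumption:

    # Hypothetical: how DATASETS_DIR could be defined once ROOT_DIR is a Path.
    # The actual definition is not shown in this commit.
    from pathlib import Path
    import os.path

    CODE_DIR = os.path.dirname(__file__)
    ROOT_DIR = Path(os.path.dirname(CODE_DIR))
    DATASETS_DIR = ROOT_DIR / "datasets"  # assumed location, e.g. in benchmark/__init__.py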

dataset_reader/json_reader.py

Lines changed: 46 additions & 14 deletions
@@ -1,34 +1,66 @@
-from typing import Iterator
+from pathlib import Path
+from typing import Iterator, List, Optional

 import json

 from dataset_reader.base_reader import BaseReader, Record, Query


+VECTORS_FILE = 'vectors.jsonl'
+PAYLOADS_FILE = 'payloads.jsonl'
+QUERIES_FILE = 'queries.jsonl'
+NEIGHBOURS_FILE = 'neighbours.jsonl'
+
+
 class JSONReader(BaseReader):
-    def __init__(self, path):
+    def __init__(self, path: Path):
         self.path = path

-    def read_queries(self) -> Iterator[Query]:
-        with open(self.path, "r") as json_fp:
+    def read_payloads(self) -> Iterator[dict]:
+        if not (self.path / PAYLOADS_FILE).exists():
+            while True:
+                yield {}
+        with open(self.path / PAYLOADS_FILE, "r") as json_fp:
             for json_line in json_fp:
                 line = json.loads(json_line)
-                yield Query(
-                    vector=line, meta_conditions=None, expected_result=None,
-                )
+                yield line

-    def read_data(self) -> Iterator[Record]:
-        with open(self.path, "r") as json_fp:
-            for idx, json_line in enumerate(json_fp):
+    def read_vectors(self) -> Iterator[List[float]]:
+        with open(self.path / VECTORS_FILE, "r") as json_fp:
+            for json_line in json_fp:
+                line = json.loads(json_line)
+                yield line
+
+    def read_neighbours(self) -> Iterator[Optional[List[int]]]:
+        if not (self.path / NEIGHBOURS_FILE).exists():
+            while True:
+                yield None
+
+        with open(self.path / NEIGHBOURS_FILE, "r") as json_fp:
+            for json_line in json_fp:
+                line = json.loads(json_line)
+                yield line
+
+    def read_query_vectors(self) -> Iterator[List[float]]:
+        with open(self.path / QUERIES_FILE, "r") as json_fp:
+            for json_line in json_fp:
                 line = json.loads(json_line)
-                yield Record(id=idx, vector=line, metadata=None)
+                yield line
+
+    def read_queries(self) -> Iterator[Query]:
+        for idx, (vector, neighbours) in enumerate(zip(self.read_query_vectors(), self.read_neighbours())):
+            # ToDo: add meta_conditions
+            yield Query(vector=vector, meta_conditions=None, expected_result=neighbours)
+
+    def read_data(self) -> Iterator[Record]:
+        for idx, (vector, payload) in enumerate(zip(self.read_vectors(), self.read_payloads())):
+            yield Record(id=idx, vector=vector, metadata=payload)


 if __name__ == "__main__":
-    import os
-    from benchmark.settings import DATASET_DIR
+    from benchmark import DATASETS_DIR

-    test_path = os.path.join(DATASET_DIR, "random-100", "vectors.jsonl")
+    test_path = DATASETS_DIR / "random-100"
     record = next(JSONReader(test_path).read_data())
     print(record, end="\n\n")
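
The JSONL reader is now pointed at a dataset directory instead of a single file. Below is a hypothetical helper (not part of the repo) that writes a toy dataset in the layout JSONReader expects: one JSON value per line in each *.jsonl file, with payloads.jsonl and neighbours.jsonl being optional.

    import json
    from pathlib import Path

    def write_toy_dataset(path: Path) -> None:
        # Hypothetical helper, for illustration only.
        path.mkdir(parents=True, exist_ok=True)
        with open(path / "vectors.jsonl", "w") as fp:   # read by read_vectors()
            for vector in ([0.1, 0.2], [0.3, 0.4]):
                fp.write(json.dumps(vector) + "\n")
        with open(path / "queries.jsonl", "w") as fp:   # read by read_query_vectors()
            fp.write(json.dumps([0.15, 0.25]) + "\n")
        # payloads.jsonl and neighbours.jsonl are optional: when they are missing,
        # read_payloads() yields {} and read_neighbours() yields None indefinitely.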

datasets/datasets.json

Lines changed: 3 additions & 3 deletions
@@ -2,23 +2,23 @@
   {
     "name": "glove-25-angular",
     "vector_size": 25,
-    "distance": "Cosine",
+    "distance": "cosine",
     "type": "h5",
     "path": "glove-25-angular/glove-25-angular.hdf5",
     "link": "http://ann-benchmarks.com/glove-25-angular.hdf5"
   },
   {
     "name": "glove-100-angular",
     "vector_size": 100,
-    "distance": "Cosine",
+    "distance": "cosine",
     "type": "h5",
     "path": "glove-100-angular/glove-100-angular.hdf5",
     "link": "http://ann-benchmarks.com/glove-100-angular.hdf5"
   },
   {
     "name": "random-100",
     "vector_size": 100,
-    "distance": "Cosine",
+    "distance": "cosine",
     "type": "jsonl",
     "path": "random-100/"
   }

engine/base_client/client.py

Lines changed: 37 additions & 18 deletions
@@ -1,30 +1,47 @@
 import json
 from datetime import datetime
+from typing import List

 from benchmark.dataset import Dataset
+from benchmark.settings import ROOT_DIR
 from engine.base_client.configure import BaseConfigurator
 from engine.base_client.search import BaseSearcher
 from engine.base_client.upload import BaseUploader

+RESULTS_DIR = ROOT_DIR / "results"
+RESULTS_DIR.mkdir(exist_ok=True)
+

 class BaseClient:
     def __init__(
-            self,
-            name: str,  # name of the experiment
-            configurator: BaseConfigurator,
-            uploader: BaseUploader,
-            searcher: BaseSearcher,
+            self,
+            name: str,  # name of the experiment
+            configurator: BaseConfigurator,
+            uploader: BaseUploader,
+            searchers: List[BaseSearcher],
     ):
         self.name = name
         self.configurator = configurator
         self.uploader = uploader
-        self.searcher = searcher
+        self.searchers = searchers

-    def save_experiment_results(self, dataset_name: str, results: dict):
+    def save_search_results(self, dataset_name: str, results: dict, search_id: int, search_params: dict):
         now = datetime.now()
         timestamp = now.strftime("%Y-%m-%d-%H-%M-%S")
-        experiments_file = f"{self.name}-{dataset_name}-{timestamp}.json"
-        with open(experiments_file, "w") as out:
+        experiments_file = f"{self.name}-{dataset_name}-search-{search_id}-{timestamp}.json"
+        with open(RESULTS_DIR / experiments_file, "w") as out:
+            out.write(
+                json.dumps({
+                    "params": search_params,
+                    "results": results
+                }, indent=2)
+            )
+
+    def save_upload_results(self, dataset_name: str, results: dict):
+        now = datetime.now()
+        timestamp = now.strftime("%Y-%m-%d-%H-%M-%S")
+        experiments_file = f"{self.name}-{dataset_name}-upload-{timestamp}.json"
+        with open(RESULTS_DIR / experiments_file, "w") as out:
             out.write(
                 json.dumps(results, indent=2)
             )

@@ -37,12 +54,14 @@ def run_experiment(self, dataset: Dataset):

         reader = dataset.get_reader()
         upload_stats = self.uploader.upload(reader.read_data())
-        search_stats = self.searcher.search_all(reader.read_queries())
-
-        self.save_experiment_results(
-            dataset.config.name,
-            {
-                "upload": upload_stats,
-                "search": search_stats
-            }
-        )
+        self.save_upload_results(dataset.config.name, upload_stats)
+
+        for search_id, searcher in enumerate(self.searchers):
+            search_params = {**searcher.search_params}
+            search_stats = searcher.search_all(reader.read_queries())
+            self.save_search_results(
+                dataset.config.name,
+                search_stats,
+                search_id,
+                search_params
+            )
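
Each run now produces one upload-results file plus one search-results file per searcher, all written to results/ under the repository root. A hypothetical example of a search-results file name and its contents (the experiment name and the placeholder values are made up; the actual stats come from BaseUploader.upload and BaseSearcher.search_all, which this commit does not show):

    results/my-experiment-glove-25-angular-search-0-2022-05-01-12-00-00.json

    {
      "params": { "...": "search params of searcher 0" },
      "results": { "...": "whatever search_all returned" }
    }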

engine/base_client/configure.py

Lines changed: 0 additions & 8 deletions
@@ -1,10 +1,7 @@
-import json
 import time
-from pathlib import Path


 class BaseConfigurator:
-    DEFAULT_CONFIG_PATH = Path("default.json")
     DISTANCE_MAPPING = {}

     def __init__(self, host, collection_params: dict, connection_params: dict):

@@ -23,8 +20,3 @@ def configure(self, distance, vector_size):
         start = time.perf_counter()
         self.recreate(distance, vector_size, self.collection_params)
         return time.perf_counter() - start
-
-    @classmethod
-    def read_default_config(cls):
-        with open(cls.DEFAULT_CONFIG_PATH, "r") as fp:
-            return json.load(fp)

engine/base_client/search.py

Lines changed: 1 addition & 1 deletion
@@ -58,7 +58,7 @@ def search_all(

         if parallel == 1:
             self.init_client(self.host, self.connection_params, self.search_params)
-            precisions, latencies = list(zip([search_one(query) for query in queries]))
+            precisions, latencies = list(zip(*[search_one(query) for query in queries]))
         else:
             ctx = get_context(self.MP_CONTEXT)
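
The added * is the actual fix here: search_one returns a (precision, latency) pair per query, and only zip(*pairs) transposes the list of pairs into one tuple of precisions and one tuple of latencies. A quick illustration with made-up numbers:

    pairs = [(1.0, 0.002), (0.9, 0.003)]  # (precision, latency) per query

    list(zip(pairs))   # wrong:  [((1.0, 0.002),), ((0.9, 0.003),)]
    list(zip(*pairs))  # fixed:  [(1.0, 0.9), (0.002, 0.003)]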

engine/clients/client_factory.py

Lines changed: 15 additions & 9 deletions
@@ -1,4 +1,5 @@
 from abc import ABC
+from typing import List, Type

 from engine.base_client.client import BaseClient
 from engine.base_client.configure import BaseConfigurator

@@ -44,19 +45,24 @@ def _create_uploader(self, experiment) -> BaseUploader:
         )
         return engine_uploader

-    def _create_searcher(self, experiment) -> BaseSearcher:
-        engine_searcher_class = ENGINE_SEARCHERS[experiment["engine"]]
-        engine_searcher = engine_searcher_class(
-            self.host,
-            experiment.get("connection_params", {}),
-            experiment.get("search_params", {}),
-        )
-        return engine_searcher
+    def _create_searchers(self, experiment) -> List[BaseSearcher]:
+        engine_searcher_class: Type[BaseSearcher] = ENGINE_SEARCHERS[experiment["engine"]]
+
+        engine_searchers = [
+            engine_searcher_class(
+                self.host,
+                connection_params=experiment.get("connection_params", {}),
+                search_params=search_params,
+            )
+            for search_params in experiment.get("search_params", [{}])
+        ]
+
+        return engine_searchers

     def build_client(self, experiment):
         return BaseClient(
             name=experiment["name"],
             configurator=self._create_configurator(experiment),
             uploader=self._create_uploader(experiment),
-            searcher=self._create_searcher(experiment),
+            searchers=self._create_searchers(experiment),
         )
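
With _create_searchers, the search_params entry of an experiment config is treated as a list of parameter dicts, and one searcher (hence one results file) is built per entry; when the key is missing, a single searcher with empty params is used. A hypothetical experiment config illustrating the shape (the "qdrant" engine matches the clients in this repo, but the hnsw_ef values are invented, engine-specific examples):

    {
      "name": "my-experiment",
      "engine": "qdrant",
      "connection_params": {},
      "search_params": [
        {"hnsw_ef": 64},
        {"hnsw_ef": 128}
      ]
    }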

engine/clients/qdrant/search.py

Lines changed: 2 additions & 1 deletion
@@ -22,11 +22,12 @@ def conditions_to_filter(cls, _meta_conditions) -> Optional[rest.Filter]:
         return None

     @classmethod
-    def search_one(cls, vector, meta_conditions) -> List[Tuple[int, float]]:
+    def search_one(cls, vector, meta_conditions, top) -> List[Tuple[int, float]]:
         res = cls.client.search(
             collection_name=QDRANT_COLLECTION_NAME,
             query_vector=vector,
             query_filter=cls.conditions_to_filter(meta_conditions),
+            limit=top,
             **cls.search_params
         )
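
search_one now caps the number of returned hits at top, presumably so that the hits can be scored against the expected neighbours loaded by the JSON reader. A hypothetical sketch of such a precision computation (the real logic lives in BaseSearcher.search_all, which this commit does not show):

    from typing import List, Tuple

    def precision_at_top(hits: List[Tuple[int, float]], expected: List[int], top: int) -> float:
        # Hypothetical helper: fraction of the first `top` expected neighbour ids
        # that also appear among the returned (id, score) hits.
        returned_ids = {idx for idx, _score in hits}
        return len(returned_ids & set(expected[:top])) / top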

engine/qdrant-0.8.4/client.Dockerfile

Lines changed: 0 additions & 8 deletions
This file was deleted.

engine/qdrant-0.8.4/client/cmd.py

Lines changed: 0 additions & 89 deletions
This file was deleted.
