
Commit 1440b9d

Merge pull request #192 from beir-cellar/development
Merge development into main
2 parents 49d4338 + 5ba2c4f commit 1440b9d

27 files changed: +694 −265 lines

README.md

Lines changed: 19 additions & 9 deletions
@@ -83,7 +83,7 @@ Tested with python versions 3.9+
 
 - Preprocess your own IR dataset or use one of the 17 already-preprocessed benchmark datasets
 - Wide settings included, covers diverse benchmarks useful for both academia and industry
-- Includes well-known retrieval architectures (lexical, dense, sparse and reranking-based)
+- Evaluates well-known retrieval architectures (lexical, dense, sparse and reranking-based)
 - Add and evaluate your own model in an easy framework using different state-of-the-art evaluation metrics
 
 ## :beers: Quick Example
@@ -132,6 +132,7 @@ results = retriever.retrieve(corpus, queries)
 
 #### Evaluate your model with NDCG@k, MAP@K, Recall@K and Precision@K where k = [1,3,5,10,100,1000]
 ndcg, _map, recall, precision = retriever.evaluate(qrels, results, retriever.k_values)
+mrr = retriever.evaluate_custom(qrels, results, retriever.k_values, metric="mrr")
 
 ### If you want to save your results and runfile (useful for reranking)
 results_dir = os.path.join(pathlib.Path(__file__).parent.absolute(), "results")
@@ -227,18 +228,27 @@ If you find this repository helpful, feel free to cite our publication [BEIR: A
 
 If you use any baseline score from the BEIR leaderboard, feel free to cite our publication [Resources for Brewing BEIR: Reproducible Reference Models and an Official Leaderboard](https://arxiv.org/abs/2306.07471)
 ```
-@misc{kamalloo2023resources,
-      title={Resources for Brewing BEIR: Reproducible Reference Models and an Official Leaderboard},
-      author={Ehsan Kamalloo and Nandan Thakur and Carlos Lassance and Xueguang Ma and Jheng-Hong Yang and Jimmy Lin},
-      year={2023},
-      eprint={2306.07471},
-      archivePrefix={arXiv},
-      primaryClass={cs.IR}
+@inproceedings{kamalloo:2024,
+  author = {Kamalloo, Ehsan and Thakur, Nandan and Lassance, Carlos and Ma, Xueguang and Yang, Jheng-Hong and Lin, Jimmy},
+  title = {Resources for Brewing BEIR: Reproducible Reference Models and Statistical Analyses},
+  year = {2024},
+  isbn = {9798400704314},
+  publisher = {Association for Computing Machinery},
+  address = {New York, NY, USA},
+  url = {https://doi.org/10.1145/3626772.3657862},
+  doi = {10.1145/3626772.3657862},
+  abstract = {BEIR is a benchmark dataset originally designed for zero-shot evaluation of retrieval models across 18 different domain/task combinations. In recent years, we have witnessed the growing popularity of models based on representation learning, which naturally begs the question: How effective are these models when presented with queries and documents that differ from the training data? While BEIR was designed to answer this question, our work addresses two shortcomings that prevent the benchmark from achieving its full potential: First, the sophistication of modern neural methods and the complexity of current software infrastructure create barriers to entry for newcomers. To this end, we provide reproducible reference implementations that cover learned dense and sparse models. Second, comparisons on BEIR are performed by reducing scores from heterogeneous datasets into a single average that is difficult to interpret. To remedy this, we present meta-analyses focusing on effect sizes across datasets that are able to accurately quantify model differences. By addressing both shortcomings, our work facilitates future explorations in a range of interesting research questions.},
+  booktitle = {Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval},
+  pages = {1431–1440},
+  numpages = {10},
+  keywords = {domain generalization, evaluation, reproducibility},
+  location = {Washington DC, USA},
+  series = {SIGIR '24}
 }
 ```
 
 The main contributors of this repository are:
-- [Nandan Thakur](https://github.com/Nthakur20), Personal Website: [nandan-thakur.com](https://nandan-thakur.com)
+- [Nandan Thakur](https://github.com/Nthakur20), Personal Website: [thakur-nandan.github.io](https://thakur-nandan.github.io)
 
 Contact person: Nandan Thakur, [[email protected]](mailto:[email protected])
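For orientation, here is how the two evaluation calls documented above fit together. A minimal sketch assuming `corpus`, `queries`, `qrels`, and a configured search model (here called `dense_model`) already exist, as in the README's earlier Quick Example steps:

```python
from beir.retrieval.evaluation import EvaluateRetrieval

# k_values mirror the README: metrics are reported at k = 1, 3, 5, 10, 100, 1000
retriever = EvaluateRetrieval(dense_model, k_values=[1, 3, 5, 10, 100, 1000])
results = retriever.retrieve(corpus, queries)

# Core metrics, plus MRR via the evaluate_custom hook shown in the diff above
ndcg, _map, recall, precision = retriever.evaluate(qrels, results, retriever.k_values)
mrr = retriever.evaluate_custom(qrels, results, retriever.k_values, metric="mrr")
```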

beir/retrieval/evaluation.py

Lines changed: 1 addition & 20 deletions
@@ -15,7 +15,7 @@ def __init__(
         self,
         retriever: BaseSearch = None,
         k_values: list[int] = [1, 3, 5, 10, 100, 1000],
-        score_function: str = "cos_sim",
+        score_function: str | None = "cos_sim",
     ):
         self.k_values = k_values
         self.top_k = max(k_values)
@@ -29,25 +29,6 @@ def retrieve(
             raise ValueError("Model/Technique has not been provided!")
         return self.retriever.search(corpus, queries, self.top_k, self.score_function, **kwargs)
 
-    def rerank(
-        self,
-        corpus: dict[str, dict[str, str]],
-        queries: dict[str, str],
-        results: dict[str, dict[str, float]],
-        top_k: int,
-    ) -> dict[str, dict[str, float]]:
-        new_corpus = {}
-
-        for query_id in results:
-            if len(results[query_id]) > top_k:
-                for doc_id, _ in sorted(results[query_id].items(), key=lambda item: item[1], reverse=True)[:top_k]:
-                    new_corpus[doc_id] = corpus[doc_id]
-            else:
-                for doc_id in results[query_id]:
-                    new_corpus[doc_id] = corpus[doc_id]
-
-        return self.retriever.search(new_corpus, queries, top_k, self.score_function)
-
     @staticmethod
     def evaluate(
         qrels: dict[str, dict[str, int]],
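Two things change here: `score_function` is widened to `str | None`, which presumably accommodates searchers that take no similarity function, and the `rerank` helper is removed. The old helper built a reduced corpus from each query's top-`top_k` hits and re-ran search over it; code that depended on it can reproduce that behavior externally. A hypothetical standalone sketch of the removed logic (note that slicing the sorted items to `top_k` already covers the short-list case the old `else` branch handled):

```python
def rerank_subset(evaluator, corpus, queries, results, top_k):
    # Collect the top_k highest-scoring doc_ids per query into one sub-corpus
    new_corpus = {}
    for query_id, doc_scores in results.items():
        ranked = sorted(doc_scores.items(), key=lambda item: item[1], reverse=True)
        for doc_id, _ in ranked[:top_k]:
            new_corpus[doc_id] = corpus[doc_id]
    # Re-search only over the reduced corpus, as the removed method did
    return evaluator.retriever.search(new_corpus, queries, top_k, evaluator.score_function)
```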

beir/retrieval/models/__init__.py

Lines changed: 4 additions & 2 deletions
@@ -2,20 +2,22 @@
 
 from .bpr import BinarySentenceBERT
 from .huggingface import HuggingFace
+from .llm2vec import LLM2Vec
+from .nvembed import NVEmbed
 from .sentence_bert import SentenceBERT
 from .sparta import SPARTA
 from .splade import SPLADE
 from .tldr import TLDR
 from .unicoil import UniCOIL
-from .use_qa import UseQA
 
 __all__ = [
     "BinarySentenceBERT",
     "HuggingFace",
+    "LLM2Vec",
+    "NVEmbed",
     "SentenceBERT",
     "SPARTA",
     "SPLADE",
     "TLDR",
     "UniCOIL",
-    "UseQA",
 ]
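After this change, the two new wrappers are importable from the package while `UseQA` no longer is; for example:

```python
# LLM2Vec and NVEmbed are now exported; `from beir.retrieval.models import UseQA` would fail
from beir.retrieval.models import LLM2Vec, NVEmbed, SentenceBERT
```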

beir/retrieval/models/bpr.py

Lines changed: 6 additions & 2 deletions
@@ -4,6 +4,8 @@
 from sentence_transformers import SentenceTransformer
 from torch import Tensor
 
+from .util import extract_corpus_sentences
+
 
 class BinarySentenceBERT:
     def __init__(
@@ -30,8 +32,10 @@ def _convert_embedding_to_binary_code(self, embeddings: list[Tensor]) -> list[Te
     def encode_queries(self, queries: list[str], batch_size: int = 16, **kwargs) -> list[Tensor] | np.ndarray | Tensor:
         return self.q_model.encode(queries, batch_size=batch_size, **kwargs)
 
-    def encode_corpus(self, corpus: list[dict[str, str]], batch_size: int = 8, **kwargs) -> np.ndarray:
-        sentences = [(doc["title"] + self.sep + doc["text"]).strip() for doc in corpus]
+    def encode_corpus(
+        self, corpus: list[dict[str, str]] | dict[str, list] | list[str], batch_size: int = 8, **kwargs
+    ) -> np.ndarray:
+        sentences = extract_corpus_sentences(corpus=corpus, sep=self.sep)
         embs = self.doc_model.encode(sentences, batch_size=batch_size, convert_to_tensor=True, **kwargs)
         embs = self._convert_embedding_to_binary_code(embs).cpu().numpy()
         embs = np.where(embs == -1, 0, embs).astype(np.bool_)
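`extract_corpus_sentences` is imported from `beir/retrieval/models/util.py`, whose implementation is not part of this excerpt. Judging from the duplicated logic it replaces in `huggingface.py` below, it presumably flattens the three accepted corpus shapes into a list of strings; a sketch under that assumption:

```python
def extract_corpus_sentences(corpus, sep: str = " ") -> list[str]:
    """Hypothetical reconstruction; the real util.py may differ in details."""
    if isinstance(corpus, dict):  # parallel lists: {"title": [...], "text": [...]}
        return [
            (corpus["title"][i] + sep + corpus["text"][i]).strip()
            if "title" in corpus
            else corpus["text"][i].strip()
            for i in range(len(corpus["text"]))
        ]
    if isinstance(corpus[0], str):  # already a flat list of strings
        return list(corpus)
    return [  # list of {"title": ..., "text": ...} records
        (doc["title"] + sep + doc["text"]).strip() if "title" in doc else doc["text"].strip()
        for doc in corpus
    ]
```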

beir/retrieval/models/huggingface.py

Lines changed: 26 additions & 27 deletions
@@ -14,18 +14,28 @@
 from transformers import AutoModel, AutoTokenizer
 
 from .pooling import cls_pooling, eos_pooling, mean_pooling
+from .util import extract_corpus_sentences
 
 logger = logging.getLogger(__name__)
 
 POOL_FUNC = {"cls": cls_pooling, "mean": mean_pooling, "eos": eos_pooling}
 
 
-def get_peft_model(peft_model_name: str) -> PeftModel:
+def get_peft_model(peft_model_name: str, **kwargs) -> tuple[PeftModel, str]:
     config = PeftConfig.from_pretrained(peft_model_name)
-    base_model = AutoModel.from_pretrained(config.base_model_name_or_path)
+    logger.info(f"Loading Auto Model from {config.base_model_name_or_path} for PEFT model")
+    base_model = AutoModel.from_pretrained(
+        config.base_model_name_or_path,
+        device_map="auto",
+        attn_implementation=kwargs.get("attn_implementation", "eager"),
+        torch_dtype=kwargs.get("torch_dtype", "auto"),
+        trust_remote_code=True,
+        cache_dir=kwargs.get("cache_dir", None),
+    )
+    logger.info(f"Loading PEFT model from {peft_model_name}")
     model = PeftModel.from_pretrained(base_model, peft_model_name)
     model = model.merge_and_unload()
-    return model
+    return model, config.base_model_name_or_path
 
 
 class HuggingFace:
@@ -42,18 +52,23 @@ def __init__(
         **kwargs,
     ):
         self.sep = sep
-        self.tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
-        if self.tokenizer.pad_token_id is None:
-            self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
-        self.tokenizer.padding_side = "right"
-
         if peft_model_path:
-            self.model = get_peft_model(peft_model_path)
+            self.model, base_model_path = get_peft_model(peft_model_path, **kwargs)
+            self.tokenizer = AutoTokenizer.from_pretrained(base_model_path, use_fast=True)
         else:
             self.model = AutoModel.from_pretrained(
-                model_path, device_map="auto", torch_dtype=kwargs.get("torch_dtype", "auto"), trust_remote_code=True
+                model_path,
+                device_map="auto",
+                torch_dtype=kwargs.get("torch_dtype", "auto"),
+                trust_remote_code=True,
+                attn_implementation=kwargs.get("attn_implementation", "default"),
+                cache_dir=kwargs.get("cache_dir", None),
             )
+            self.tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
         self.model.eval()
+        if self.tokenizer.pad_token_id is None:
+            self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
+        self.tokenizer.padding_side = "right"
         self.max_length = max_length if max_length else self.tokenizer.model_max_length
         self.normalize = normalize  # Normalize the embeddings
         self.append_eos_token = append_eos_token  # Add eos token to the input
@@ -114,23 +129,7 @@ def encode_corpus(
         self, corpus: list[dict[str, str]] | dict[str, list] | list[str], batch_size: int = 8, **kwargs
     ) -> list[Tensor] | np.ndarray | Tensor:
         corpus_embeddings = []
-
-        if isinstance(corpus, dict):
-            sentences = [
-                (corpus["title"][i] + self.sep + corpus["text"][i]).strip()
-                if "title" in corpus
-                else corpus["text"][i].strip()
-                for i in range(len(corpus["text"]))
-            ]
-
-        elif isinstance(corpus, list):
-            if isinstance(corpus[0], str):  # if corpus is a list of strings
-                sentences = corpus
-            else:
-                sentences = [
-                    (doc["title"] + self.sep + doc["text"]).strip() if "title" in doc else doc["text"].strip()
-                    for doc in corpus
-                ]
+        sentences = extract_corpus_sentences(corpus=corpus, sep=self.sep)
 
         with torch.no_grad():
             for start_idx in trange(0, len(sentences), batch_size):
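Taken together, the constructor changes mean the tokenizer now follows whichever model is actually loaded (the adapter's base model under `peft_model_path`, otherwise `model_path`), and that `attn_implementation`, `torch_dtype`, and `cache_dir` are forwarded to `AutoModel.from_pretrained`. A minimal usage sketch, assuming `model_path` may be omitted when a PEFT adapter is given; checkpoint names and paths are illustrative:

```python
from beir.retrieval.models import HuggingFace

# Plain checkpoint: extra kwargs are forwarded to AutoModel.from_pretrained
model = HuggingFace(
    model_path="intfloat/e5-large-v2",  # illustrative checkpoint
    torch_dtype="float16",
    attn_implementation="sdpa",
    cache_dir="/tmp/hf_cache",  # illustrative cache location
)

# PEFT adapter: get_peft_model now also returns the base model path,
# so the tokenizer is loaded from the adapter's base model
peft_model = HuggingFace(
    peft_model_path="my-org/my-lora-adapter",  # hypothetical adapter id
    torch_dtype="bfloat16",
)
```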

beir/retrieval/models/llm2vec.py

Lines changed: 114 additions & 0 deletions
@@ -0,0 +1,114 @@
+from __future__ import annotations
+
+import importlib.util
+import logging
+
+if importlib.util.find_spec("llm2vec") is not None:
+    from llm2vec import LLM2Vec as LLM2VecOriginal
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import Tensor
+from tqdm.autonotebook import trange
+
+from .util import extract_corpus_sentences
+
+logger = logging.getLogger(__name__)
+
+POOLING_MODES = {
+    "mean": "mean",
+    "weighted_mean": "weighted_mean",
+    "eos": "eos_token",
+    "bos_token": "bos_token",
+    "last_token": "last_token",
+}
+
+
+class LLM2Vec:
+    def __init__(
+        self,
+        model_path: str | tuple = None,
+        max_length: int = None,
+        sep: str = " ",
+        pooling: str = "mean",
+        normalize: bool = True,
+        prompts: dict[str, str] = None,
+        peft_model_path: str = None,
+        **kwargs,
+    ):
+        self.sep = sep
+        self.normalize = normalize
+        if pooling not in POOLING_MODES:
+            raise ValueError(f"Pooling mode {pooling} not supported. Choose from {list(POOLING_MODES.keys())}")
+
+        self.model = LLM2VecOriginal.from_pretrained(
+            base_model_name_or_path=model_path,
+            peft_model_name_or_path=peft_model_path,
+            pooling_mode=POOLING_MODES[pooling],
+            max_length=max_length,
+            **kwargs,
+        )
+
+        prompts = prompts or {}
+        self.query_prefix = prompts.get("query", "")
+        self.doc_prefix = prompts.get("passage", "")
+
+    def _append_eos_token(self, texts, pad_to_multiple_of: int = 16):
+        """Tokenizes the input texts and pads the tokenized input to the max_length with the eos token"""
+        collated_texts = self.tokenizer(
+            texts,
+            padding=False,
+            truncation=True,
+            max_length=self.max_length - 1 if self.append_eos_token else self.max_length,
+            return_attention_mask=False,
+            return_token_type_ids=False,
+            add_special_tokens=True,
+        )
+        collated_texts["input_ids"] = [x + [self.tokenizer.eos_token_id] for x in collated_texts["input_ids"]]
+        collated_texts = self.tokenizer.pad(
+            collated_texts,
+            padding=True,
+            pad_to_multiple_of=pad_to_multiple_of,
+            return_attention_mask=True,
+            return_tensors="pt",
+        )
+        return collated_texts
+
+    def encode_queries(self, queries: list[str], batch_size: int = 16, **kwargs) -> list[Tensor] | np.ndarray | Tensor:
+        query_embeddings = []
+
+        with torch.no_grad():
+            for start_idx in trange(0, len(queries), batch_size):
+                sub_queries = [[self.query_prefix, query] for query in queries[start_idx : start_idx + batch_size]]
+                query_embeddings += self.model.encode(sub_queries, batch_size=batch_size, show_progress_bar=False)
+
+        query_embeddings = torch.stack(query_embeddings)
+
+        if self.normalize:
+            query_embeddings = F.normalize(query_embeddings, p=2, dim=1)
+
+        return query_embeddings
+
+    def encode_corpus(
+        self, corpus: list[dict[str, str]] | dict[str, list] | list[str], batch_size: int = 8, **kwargs
+    ) -> list[Tensor] | np.ndarray | Tensor:
+        corpus_embeddings = []
+        sentences = extract_corpus_sentences(corpus=corpus, sep=self.sep)
+
+        with torch.no_grad():
+            for start_idx in trange(0, len(sentences), batch_size):
+                if self.doc_prefix:
+                    sub_sentences = [
+                        [self.doc_prefix, sentence] for sentence in sentences[start_idx : start_idx + batch_size]
+                    ]
+                else:
+                    sub_sentences = sentences[start_idx : start_idx + batch_size]
+                corpus_embeddings += self.model.encode(sub_sentences, batch_size=batch_size, show_progress_bar=False)
+
+        corpus_embeddings = torch.stack(corpus_embeddings)
+
+        if self.normalize:
+            corpus_embeddings = F.normalize(corpus_embeddings, p=2, dim=1)
+
+        return corpus_embeddings
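A usage sketch for the new wrapper; the checkpoint names follow the llm2vec project's published models but should be treated as illustrative, and the `prompts` dict supplies the `query`/`passage` prefixes the constructor reads:

```python
from beir.retrieval.models import LLM2Vec

model = LLM2Vec(
    model_path="McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp",  # illustrative
    peft_model_path="McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp-supervised",  # illustrative
    pooling="mean",
    normalize=True,
    prompts={"query": "Given a web search query, retrieve relevant passages:"},
)

query_embs = model.encode_queries(["what is dense retrieval?"], batch_size=4)
doc_embs = model.encode_corpus([{"title": "BEIR", "text": "A heterogeneous IR benchmark."}])
```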
