@@ -227,18 +228,27 @@ If you find this repository helpful, feel free to cite our publication [BEIR: A
 
 If you use any baseline score from the BEIR leaderboard, feel free to cite our publication [Resources for Brewing BEIR: Reproducible Reference Models and an Official Leaderboard](https://arxiv.org/abs/2306.07471)
 
 ```
-@misc{kamalloo2023resources,
-      title={Resources for Brewing BEIR: Reproducible Reference Models and an Official Leaderboard},
-      author={Ehsan Kamalloo and Nandan Thakur and Carlos Lassance and Xueguang Ma and Jheng-Hong Yang and Jimmy Lin},
-      year={2023},
-      eprint={2306.07471},
-      archivePrefix={arXiv},
-      primaryClass={cs.IR}
+@inproceedings{kamalloo:2024,
+  author = {Kamalloo, Ehsan and Thakur, Nandan and Lassance, Carlos and Ma, Xueguang and Yang, Jheng-Hong and Lin, Jimmy},
+  title = {Resources for Brewing BEIR: Reproducible Reference Models and Statistical Analyses},
+  year = {2024},
+  isbn = {9798400704314},
+  publisher = {Association for Computing Machinery},
+  address = {New York, NY, USA},
+  url = {https://doi.org/10.1145/3626772.3657862},
+  doi = {10.1145/3626772.3657862},
+  abstract = {BEIR is a benchmark dataset originally designed for zero-shot evaluation of retrieval models across 18 different domain/task combinations. In recent years, we have witnessed the growing popularity of models based on representation learning, which naturally begs the question: How effective are these models when presented with queries and documents that differ from the training data? While BEIR was designed to answer this question, our work addresses two shortcomings that prevent the benchmark from achieving its full potential: First, the sophistication of modern neural methods and the complexity of current software infrastructure create barriers to entry for newcomers. To this end, we provide reproducible reference implementations that cover learned dense and sparse models. Second, comparisons on BEIR are performed by reducing scores from heterogeneous datasets into a single average that is difficult to interpret. To remedy this, we present meta-analyses focusing on effect sizes across datasets that are able to accurately quantify model differences. By addressing both shortcomings, our work facilitates future explorations in a range of interesting research questions.},
+  booktitle = {Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval},