@@ -227,18 +228,27 @@ If you find this repository helpful, feel free to cite our publication [BEIR: A
 
 If you use any baseline score from the BEIR leaderboard, feel free to cite our publication [Resources for Brewing BEIR: Reproducible Reference Models and an Official Leaderboard](https://arxiv.org/abs/2306.07471)
 
 ```
-@misc{kamalloo2023resources,
-      title={Resources for Brewing BEIR: Reproducible Reference Models and an Official Leaderboard},
-      author={Ehsan Kamalloo and Nandan Thakur and Carlos Lassance and Xueguang Ma and Jheng-Hong Yang and Jimmy Lin},
-      year={2023},
-      eprint={2306.07471},
-      archivePrefix={arXiv},
-      primaryClass={cs.IR}
+@inproceedings{kamalloo:2024,
+  author = {Kamalloo, Ehsan and Thakur, Nandan and Lassance, Carlos and Ma, Xueguang and Yang, Jheng-Hong and Lin, Jimmy},
+  title = {Resources for Brewing BEIR: Reproducible Reference Models and Statistical Analyses},
+  year = {2024},
+  isbn = {9798400704314},
+  publisher = {Association for Computing Machinery},
+  address = {New York, NY, USA},
+  url = {https://doi.org/10.1145/3626772.3657862},
+  doi = {10.1145/3626772.3657862},
+  abstract = {BEIR is a benchmark dataset originally designed for zero-shot evaluation of retrieval models across 18 different domain/task combinations. In recent years, we have witnessed the growing popularity of models based on representation learning, which naturally begs the question: How effective are these models when presented with queries and documents that differ from the training data? While BEIR was designed to answer this question, our work addresses two shortcomings that prevent the benchmark from achieving its full potential: First, the sophistication of modern neural methods and the complexity of current software infrastructure create barriers to entry for newcomers. To this end, we provide reproducible reference implementations that cover learned dense and sparse models. Second, comparisons on BEIR are performed by reducing scores from heterogeneous datasets into a single average that is difficult to interpret. To remedy this, we present meta-analyses focusing on effect sizes across datasets that are able to accurately quantify model differences. By addressing both shortcomings, our work facilitates future explorations in a range of interesting research questions.},
+  booktitle = {Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval},