From c54ea115e4096f8e81fee35e978bba9a375e57ca Mon Sep 17 00:00:00 2001 From: Orion Weller Date: Tue, 29 Oct 2024 13:22:34 -0400 Subject: [PATCH 01/16] update --- mteb/abstasks/AbsTaskReranking.py | 211 ++++++--- mteb/evaluation/MTEB.py | 22 +- .../evaluators/RetrievalEvaluator.py | 412 +---------------- mteb/evaluation/evaluators/model_classes.py | 413 ++++++++++++++++++ mteb/overview.py | 30 +- .../Reranking/eng/AskUbuntuDupQuestions.py | 2 +- mteb/tasks/Reranking/rus/RuBQReranking.py | 2 +- 7 files changed, 608 insertions(+), 484 deletions(-) create mode 100644 mteb/evaluation/evaluators/model_classes.py diff --git a/mteb/abstasks/AbsTaskReranking.py b/mteb/abstasks/AbsTaskReranking.py index bcbc4571d5..a9905a407b 100644 --- a/mteb/abstasks/AbsTaskReranking.py +++ b/mteb/abstasks/AbsTaskReranking.py @@ -1,14 +1,34 @@ from __future__ import annotations -from typing import Any - -from datasets import Dataset - -from mteb.encoder_interface import Encoder -from mteb.load_results.task_results import ScoresDict - -from ..evaluation.evaluators import RerankingEvaluator -from .AbsTask import AbsTask, DescriptiveStatistics +import logging +from collections import defaultdict + +import datasets +import tqdm + +from .AbsTask import DescriptiveStatistics +from .AbsTaskRetrieval import AbsTaskRetrieval + +logger = logging.getLogger(__name__) + +OLD_FORMAT_RERANKING_TASKS = [ + "AskUbuntuDupQuestions", + "MindSmallReranking", + "SciDocsRR", + "StackOverflowDupQuestions", + "WebLINXCandidatesReranking", + "AlloprofReranking", + "SyntecReranking", + "VoyageMMarcoReranking", + "ESCIReranking", + "MIRACLReranking", + "WikipediaRerankingMultilingual", + "RuBQReranking", + "T2Reranking", + "MMarcoReranking", + "CMedQAv1-reranking", + "CMedQAv2-reranking", +] class RerankingDescriptiveStatistics(DescriptiveStatistics): @@ -31,9 +51,32 @@ class RerankingDescriptiveStatistics(DescriptiveStatistics): avg_negative_len: float -class AbsTaskReranking(AbsTask): - """Abstract class for re-ranking experiments. +class AbsTaskReranking(AbsTaskRetrieval): + """Abstract class for re-ranking experiments. This is mostly the same as the RetrievalEvaluator, but as previously it wasn't we need to keep it to transform old dataset versions into the same format. + + New Format: + ----------- + Same as AbsTaskRetrieval, but with a top_ranked file that contains the passages to rerank. The dataset should contain the following columns: + + self.corpus: dict[str, dict[str, str]] + Semantically, it should contain dict[split_name, dict[sample_id, dict[str, str]]] + E.g. {"test": {"document_one": {"_id": "d1", "title": "title", "text": "text"}}} + + self.queries: dict[str, dict[str, Union[str, list[str]]]] + Semantically, it should contain dict[split_name, dict[sample_id, str]] or dict[split_name, dict[sample_id, list[str]]] for conversations + E.g. 
{"test": {"q1": "query"}} + or {"test": {"q1": ["turn1", "turn2", "turn3"]}} + + self.relevant_docs: dict[str, dict[str, dict[str, int]]] + Semantically, it should contain dict[split_name, dict[sample_id, dict[doc_id, score]]] + E.g.: {"test": {"q1": {"document_one": 1}}} + self.top_ranked: dict[str, dict[str, list[str]]] or dict[str, dict[str, dict[str, float]]] + Semantically, it should contain dict[split_name, dict[sample_id, list[doc_id]]] or dict[split_name, dict[sample_id, dict[doc_id, score]]] + E.g.: {"test": {"q1": ["document_one", "document_two"]}} or {"test": {"q1": {"document_one": 1, "document_two": 0.5}}} + + Old Format: + ----------- self.load_data() must generate a huggingface dataset with a split matching self.metadata_dict["eval_splits"], and assign it to self.dataset. It must contain the following columns: query: str positive: list[str] @@ -43,56 +86,108 @@ class AbsTaskReranking(AbsTask): def __init__(self, **kwargs): super().__init__(**kwargs) - def _evaluate_subset( - self, - model: Encoder, - data_split: Dataset, - *, - encode_kwargs: dict[str, Any] = {}, - **kwargs: Any, - ) -> ScoresDict: - evaluator = RerankingEvaluator( - data_split, - task_name=self.metadata.name, - encode_kwargs=encode_kwargs, - **kwargs, - ) - scores = evaluator(model) - - self._add_main_score(scores) - return scores + def load_data(self, **kwargs): + if self.data_loaded: + return - def _add_main_score(self, scores: ScoresDict) -> None: - scores["main_score"] = scores[self.metadata.main_score] + if self.metadata.name in OLD_FORMAT_RERANKING_TASKS: + self.original_dataset = datasets.load_dataset( + **self.metadata_dict["dataset"] + ) # type: ignore + self.transform_old_format_to_standard() + else: + # use AbsTaskRetrieval default to load the data + # TODO: need to make sure top_ranked comes back + return super().load_data(**kwargs) + + def transform_old_format_to_standard(self): + """Transform the old format to the new format (see class doc string for details). 
Dataset has three features: query, positive, negative.""" + logging.info( + f"Transforming old format to standard format for {self.metadata.name}" + ) + self.corpus = defaultdict(dict) + self.queries = defaultdict(dict) + self.relevant_docs = defaultdict(lambda: defaultdict(dict)) + self.top_ranked = defaultdict(lambda: defaultdict(list)) + + for split in self.original_dataset: + # keep the lookups to prevent duplicate queries and documents for memory purposes + corpus_lookup = {} + query_lookup = {} + for query_i in tqdm.tqdm(range(len(self.original_dataset[split]))): + query: str = self.original_dataset[split]["query"][query_i] + positive_docs: list[str] = self.original_dataset[split]["positive"][ + query_i + ] + negative_docs: list[str] = self.original_dataset[split]["negative"][ + query_i + ] + + if query in query_lookup: + query_id = query_lookup[query] + else: + query_id = f"{split}_query{query_i}" + query_lookup[query] = query_id + self.queries[split][query_id] = query + + for i, pos_doc in enumerate(sorted(positive_docs)): + if pos_doc in corpus_lookup: + doc_id = corpus_lookup[pos_doc] + else: + doc_id = f"{query_id}_positive_{i}" + self.corpus[split][doc_id] = {"text": pos_doc, "_id": doc_id} + corpus_lookup[pos_doc] = doc_id + + self.top_ranked[split][query_id].append(doc_id) + self.relevant_docs[split][query_id][doc_id] = 1 + + for i, neg_doc in enumerate(sorted(negative_docs)): + if neg_doc in corpus_lookup: + doc_id = corpus_lookup[neg_doc] + else: + doc_id = f"{query_id}_negative_{i}" + self.corpus[split][doc_id] = {"text": neg_doc, "_id": doc_id} + corpus_lookup[neg_doc] = doc_id + + self.top_ranked[split][query_id].append(doc_id) + self.relevant_docs[split][query_id][doc_id] = 0 + + self.data_loaded = True def _calculate_metrics_from_split( self, split: str, hf_subset: str | None = None, compute_overall: bool = False ) -> RerankingDescriptiveStatistics: - if hf_subset: - query = self.dataset[hf_subset][split]["query"] - positive = self.dataset[hf_subset][split]["positive"] - negative = self.dataset[hf_subset][split]["negative"] - elif compute_overall: - query = [] - positive = [] - negative = [] - for hf_subset in self.metadata.eval_langs: - query.extend(self.dataset[hf_subset][split]["query"]) - positive.extend(self.dataset[hf_subset][split]["positive"]) - negative.extend(self.dataset[hf_subset][split]["negative"]) + if self.metadata.name in OLD_FORMAT_RERANKING_TASKS: + # TODO: do we want the old calculated metrics for these, or should we switch to the new? 
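            # Illustrative sketch (hypothetical values, not part of any dataset) of what
            # transform_old_format_to_standard() above produces for a single old-format row
            # in the "test" split:
            #   old row:       {"query": "q", "positive": ["p1"], "negative": ["n1"]}
            #   queries:       {"test": {"test_query0": "q"}}
            #   corpus:        {"test": {"test_query0_positive_0": {"text": "p1", "_id": "test_query0_positive_0"},
            #                            "test_query0_negative_0": {"text": "n1", "_id": "test_query0_negative_0"}}}
            #   top_ranked:    {"test": {"test_query0": ["test_query0_positive_0", "test_query0_negative_0"]}}
            #   relevant_docs: {"test": {"test_query0": {"test_query0_positive_0": 1, "test_query0_negative_0": 0}}}
            # The branch below, by contrast, still reads the original query/positive/negative
            # columns to compute the descriptive statistics for old-format tasks.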
+ if hf_subset: + query = self.original_dataset[hf_subset][split]["query"] + positive = self.original_dataset[hf_subset][split]["positive"] + negative = self.original_dataset[hf_subset][split]["negative"] + elif compute_overall: + query = [] + positive = [] + negative = [] + for hf_subset in self.metadata.eval_langs: + query.extend(self.original_dataset[hf_subset][split]["query"]) + positive.extend(self.original_dataset[hf_subset][split]["positive"]) + negative.extend(self.original_dataset[hf_subset][split]["negative"]) + else: + query = self.original_dataset[split]["query"] + positive = self.original_dataset[split]["positive"] + negative = self.original_dataset[split]["negative"] + + total_len_query = sum([len(q) for q in query]) + total_len_positive = sum([len(p) for p in positive]) + total_len_negative = sum([len(n) for n in negative]) + return RerankingDescriptiveStatistics( + num_samples=len(query), + num_positive=len(positive), + num_negative=len(negative), + avg_query_len=total_len_query / len(query), + avg_positive_len=total_len_positive / len(positive), + avg_negative_len=total_len_negative / len(negative), + ) else: - query = self.dataset[split]["query"] - positive = self.dataset[split]["positive"] - negative = self.dataset[split]["negative"] - - total_len_query = sum([len(q) for q in query]) - total_len_positive = sum([len(p) for p in positive]) - total_len_negative = sum([len(n) for n in negative]) - return RerankingDescriptiveStatistics( - num_samples=len(query), - num_positive=len(positive), - num_negative=len(negative), - avg_query_len=total_len_query / len(query), - avg_positive_len=total_len_positive / len(positive), - avg_negative_len=total_len_negative / len(negative), - ) + return super()._calculate_metrics_from_split( + split, hf_subset, compute_overall + ) diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py index baecb7d6bd..5e94f515b4 100644 --- a/mteb/evaluation/MTEB.py +++ b/mteb/evaluation/MTEB.py @@ -20,7 +20,7 @@ from mteb.models import model_meta_from_sentence_transformers from ..abstasks import * -from ..abstasks import AbsTask +from ..abstasks import AbsTask, AbsTaskReranking from ..load_results.task_results import TaskResult from ..models.sentence_transformer_wrapper import SentenceTransformerWrapper from ..models.wrapper import Wrapper @@ -222,13 +222,19 @@ def print_selected_tasks(self): def select_tasks(self, **kwargs): """Select the tasks to be evaluated.""" # Get all existing tasks - tasks_categories_cls = list(AbsTask.__subclasses__()) - self.tasks_cls = [ - cls(hf_subsets=self._task_langs, **kwargs) - for cat_cls in tasks_categories_cls - for cls in cat_cls.__subclasses__() - if cat_cls.__name__.startswith("AbsTask") - ] + # reranking subclasses retrieval to share methods, but is an abstract task + tasks_categories_cls = list(AbsTask.__subclasses__()) + [AbsTaskReranking] + all_task_classes = [] + for cat_cls in tasks_categories_cls: + for cls in cat_cls.__subclasses__(): + if ( + cat_cls.__name__.startswith("AbsTask") + and cls.__name__ != "AbsTaskReranking" + ): + task = cls(hf_subsets=self._task_langs, **kwargs) + all_task_classes.append(task) + + self.tasks_cls = all_task_classes # If `task_list` is specified, select list of tasks if self._tasks is not None: diff --git a/mteb/evaluation/evaluators/RetrievalEvaluator.py b/mteb/evaluation/evaluators/RetrievalEvaluator.py index 54e2e0acd8..a7531bbd56 100644 --- a/mteb/evaluation/evaluators/RetrievalEvaluator.py +++ b/mteb/evaluation/evaluators/RetrievalEvaluator.py @@ -1,29 +1,20 @@ from 
__future__ import annotations -import heapq -import json import logging -import os -from collections import defaultdict -from pathlib import Path from typing import Any import numpy as np import pytrec_eval -import torch -import tqdm -from sentence_transformers import CrossEncoder, SentenceTransformer -from mteb.encoder_interface import Encoder, PromptType -from mteb.model_meta import ModelMeta +from mteb.evaluation.evaluators.model_classes import ( + DenseRetrievalExactSearch, + DRESModel, + is_cross_encoder_compatible, +) from .Evaluator import Evaluator from .utils import ( confidence_scores, - convert_conv_history_to_query, - cos_sim, - dot_score, - download, hole, mrr, nAUC, @@ -34,399 +25,6 @@ logger = logging.getLogger(__name__) -def corpus_to_str( - corpus: list[dict[str, str]] | dict[str, list[str]] | list[str], -) -> list[str]: - if isinstance(corpus, dict): - sentences = [ - (corpus["title"][i] + " " + corpus["text"][i]).strip() - if "title" in corpus - else corpus["text"][i].strip() - for i in range(len(corpus["text"])) - ] - elif isinstance(corpus, list) and isinstance(corpus[0], dict): - sentences = [ - (doc["title"] + " " + doc["text"]).strip() - if "title" in doc - else doc["text"].strip() - for doc in corpus - ] - else: - sentences = corpus - return sentences - - -# Adapted from https://github.com/beir-cellar/beir/blob/f062f038c4bfd19a8ca942a9910b1e0d218759d4/beir/retrieval/search/dense/exact_search.py#L12 -class DenseRetrievalExactSearch: - def __init__( - self, - model: Encoder, - encode_kwargs: dict[str, Any] = {}, - corpus_chunk_size: int = 50000, - previous_results: str | Path | None = None, - **kwargs: Any, - ): - # Model is class that provides encode_corpus() and encode_queries() - self.model = model - self.encode_kwargs = encode_kwargs - - if "batch_size" not in encode_kwargs: - encode_kwargs["batch_size"] = 128 - if "show_progress_bar" not in encode_kwargs: - encode_kwargs["show_progress_bar"] = True - if "convert_to_tensor" not in encode_kwargs: - encode_kwargs["convert_to_tensor"] = True - - self.score_functions = {"cos_sim": cos_sim, "dot": dot_score} - self.score_function_desc = { - "cos_sim": "Cosine Similarity", - "dot": "Dot Product", - } - self.corpus_chunk_size = corpus_chunk_size - if isinstance(previous_results, Path): - self.previous_results = str(previous_results) - else: - self.previous_results = previous_results - self.batch_size = encode_kwargs.get("batch_size") - self.show_progress_bar = encode_kwargs.get("show_progress_bar") - self.save_corpus_embeddings = kwargs.get("save_corpus_embeddings", False) - self.corpus_embeddings = defaultdict(list) - self.results = {} - - if self.previous_results is not None: - self.previous_results = self.load_results_file() - - if isinstance(self.model, CrossEncoder): - # load the predict instance from the CrossEncoder - # custom functions can be used by extending the DenseRetrievalExactSearch class - self.predict = self.model.predict - - def search( - self, - corpus: dict[str, dict[str, str]], - queries: dict[str, str | list[str]], - top_k: int, - score_function: str, - task_name: str, - instructions: dict[str, str] | None = None, - request_qid: str | None = None, - return_sorted: bool = False, - **kwargs, - ) -> dict[str, dict[str, float]]: - # Create embeddings for all queries using model.encode - # Runs semantic search against the corpus embeddings - # Returns a ranked list with the corpus ids - if score_function not in self.score_functions: - raise ValueError( - f"score function: {score_function} must be either 
(cos_sim) for cosine similarity or (dot) for dot product" - ) - - logger.info("Encoding Queries.") - query_ids = list(queries.keys()) - self.results = {qid: {} for qid in query_ids} - queries = [queries[qid] for qid in queries] # type: ignore - if instructions: - queries = [f"{query} {instructions[query]}".strip() for query in queries] - if isinstance(queries[0], list): # type: ignore - query_embeddings = self.encode_conversations( - model=self.model, - conversations=queries, # type: ignore - task_name=task_name, - **self.encode_kwargs, - ) - else: - query_embeddings = self.model.encode( - queries, # type: ignore - task_name=task_name, - prompt_type=PromptType.query, - **self.encode_kwargs, - ) - - logger.info("Sorting Corpus by document length (Longest first)...") - corpus_ids = sorted( - corpus, - reverse=True, - ) - corpus = [corpus[cid] for cid in corpus_ids] # type: ignore - - logger.info("Encoding Corpus in batches... Warning: This might take a while!") - logger.info( - f"Scoring Function: {self.score_function_desc[score_function]} ({score_function})" - ) - - itr = range(0, len(corpus), self.corpus_chunk_size) - - result_heaps = { - qid: [] for qid in query_ids - } # Keep only the top-k docs for each query - for batch_num, corpus_start_idx in enumerate(itr): - logger.info(f"Encoding Batch {batch_num + 1}/{len(itr)}...") - corpus_end_idx = min(corpus_start_idx + self.corpus_chunk_size, len(corpus)) - - # Encode chunk of corpus - if ( - self.save_corpus_embeddings - and request_qid - and len(self.corpus_embeddings[request_qid]) - ): - sub_corpus_embeddings = torch.tensor( - self.corpus_embeddings[request_qid][batch_num] - ) - else: - # Encode chunk of corpus - sub_corpus_embeddings = self.model.encode( - corpus[corpus_start_idx:corpus_end_idx], # type: ignore - task_name=task_name, - prompt_type=PromptType.passage, - request_qid=request_qid, - **self.encode_kwargs, - ) - if self.save_corpus_embeddings and request_qid: - self.corpus_embeddings[request_qid].append(sub_corpus_embeddings) - - # Compute similarites using either cosine-similarity or dot product - cos_scores = self.score_functions[score_function]( - query_embeddings, sub_corpus_embeddings - ) - cos_scores[torch.isnan(cos_scores)] = -1 - - # Get top-k values - cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk( - cos_scores, - min( - top_k + 1, - len(cos_scores[1]) if len(cos_scores) > 1 else len(cos_scores[-1]), - ), - dim=1, - largest=True, - sorted=return_sorted, - ) - cos_scores_top_k_values = cos_scores_top_k_values.cpu().tolist() - cos_scores_top_k_idx = cos_scores_top_k_idx.cpu().tolist() - - for query_itr in range(len(query_embeddings)): - query_id = query_ids[query_itr] - for sub_corpus_id, score in zip( - cos_scores_top_k_idx[query_itr], cos_scores_top_k_values[query_itr] - ): - corpus_id = corpus_ids[corpus_start_idx + sub_corpus_id] - if len(result_heaps[query_id]) < top_k: - # Push item on the heap - heapq.heappush(result_heaps[query_id], (score, corpus_id)) - else: - # If item is larger than the smallest in the heap, push it on the heap then pop the smallest element - heapq.heappushpop(result_heaps[query_id], (score, corpus_id)) - - for qid in result_heaps: - for score, corpus_id in result_heaps[qid]: - self.results[qid][corpus_id] = score - - return self.results - - def load_results_file(self): - # load the first stage results from file in format {qid: {doc_id: score}} - if "https://" in self.previous_results: - # download the file - if not os.path.exists(self.previous_results): - url_descriptor = 
self.previous_results.split("https://")[-1].replace( - "/", "--" - ) - dest_file = os.path.join( - "results", f"cached_predictions--{url_descriptor}" - ) - os.makedirs(os.path.dirname(os.path.abspath(dest_file)), exist_ok=True) - download(self.previous_results, dest_file) - logger.info( - f"Downloaded the previous results at {self.previous_results} to {dest_file}" - ) - self.previous_results = dest_file - - with open(self.previous_results) as f: - previous_results = json.load(f) - assert isinstance(previous_results, dict) - assert isinstance(previous_results[list(previous_results.keys())[0]], dict) - return previous_results - - def search_cross_encoder( - self, - corpus: dict[str, dict[str, str]], - queries: dict[str, str | list[str]], - top_k: int, - instructions: dict[str, str] | None = None, - **kwargs, - ) -> dict[str, dict[str, float]]: - """This function provides support for reranker (or cross-encoder) models that encoder query and document at the same time (typically with attention). - Some notable examples include MonoBERT, MonoT5, RankLlama, etc. - Note: you must provide the path to the results to rerank to the __init__ function as `previous_results` or else rerank all documents in the corpus - """ - pairs = [] # create the pairs for reranking - for qid in queries.keys(): - if self.previous_results is None: - # try to use all of them - logging.logging( - f"previous_results is None. Using all the documents to rerank: {len(corpus)}" - ) - q_results = {doc_id: 0.0 for doc_id in corpus.keys()} - else: - q_results = self.previous_results[qid] - # take the top-k only - q_results_sorted = dict( - sorted(q_results.items(), key=lambda item: item[1], reverse=True) - ) - top_n = [k for k, v in list(q_results_sorted.items())[:top_k]] - query = queries[qid] - query = ( - self.convert_conv_history_to_query(self.model, [query])[0] - if isinstance(query, list) - else query - ) - for doc_id in top_n: - pairs.append( - ( - query, - corpus[doc_id], - instructions[query] if instructions is not None else None, - qid, - doc_id, - ) - ) - - logger.info(f"Reranking the top {top_k} in batches... 
This might take a while!") - itr = range(0, len(pairs), self.batch_size) - - results = {qid: {} for qid in queries.keys()} - for batch_num, corpus_start_idx in enumerate( - tqdm.tqdm(itr, leave=False, disable=not self.show_progress_bar) - ): - corpus_end_idx = min(corpus_start_idx + self.batch_size, len(pairs)) - cur_batch = pairs[corpus_start_idx:corpus_end_idx] - - ( - queries_in_pair, - corpus_in_pair, - instructions_in_pair, - query_ids, - corpus_ids, - ) = zip(*cur_batch) - - assert ( - len(queries_in_pair) == len(corpus_in_pair) == len(instructions_in_pair) - ) - - if isinstance(self.model.model, CrossEncoder): - # can't take instructions, so add them here - queries_in_pair = [ - f"{q} {i}".strip() - for i, q in zip(instructions_in_pair, queries_in_pair) - ] - scores = self.model.predict(list(zip(queries_in_pair, corpus_in_pair))) # type: ignore - else: - # may use the instructions in a unique way, so give them also - scores = self.model.predict( # type: ignore - list(zip(queries_in_pair, corpus_in_pair, instructions_in_pair)) - ) - - for i, score in enumerate(scores): - results[query_ids[i]][corpus_ids[i]] = float(score) - - return results - - def predict(self, queries, passages, **kwargs): - raise NotImplementedError( - "You must implement a predict method for your reranker model" - ) - - def encode_conversations( - self, - model: Encoder, - conversations: list[list[str]], - task_name: str, - **kwargs, - ): - if callable(getattr(self.model, "encode_conversations", None)): - return model.encode_conversations( # type: ignore - conversations, task_name=task_name, **kwargs - ) - logger.warning( - "Model doesn't have encode_conversations fallback to default implementation" - ) - queries = self.convert_conv_history_to_query(model, conversations) # type: ignore - return model.encode( - queries, task_name=task_name, prompt_type=PromptType.query, **kwargs - ) # type: ignore - - @staticmethod - def convert_conv_history_to_query( - model: Encoder, conversations: list[list[str]] - ) -> str: - if callable(getattr(model, "convert_conv_history_to_query", None)): - return model.convert_conv_history_to_query(conversations) # type: ignore - return convert_conv_history_to_query(conversations) # type: ignore - - -class DRESModel: - """Dense Retrieval Exact Search (DRES). - This class converts a model with just an .encode method into DRES format. 
- """ - - mteb_model_meta: ModelMeta | None - - def __init__(self, model, **kwargs): - self.model = model - self.use_sbert_model = isinstance(model, SentenceTransformer) - self.save_corpus_embeddings = kwargs.get("save_corpus_embeddings", False) - self.corpus_embeddings = {} - - def encode_corpus( - self, - corpus: list[dict[str, str]], - task_name: str, - batch_size: int, - prompt_type: PromptType = PromptType.passage, - request_qid: str | None = None, - **kwargs, - ): - if ( - request_qid - and self.save_corpus_embeddings - and len(self.corpus_embeddings) > 0 - ): - return self.corpus_embeddings[request_qid] - - sentences = corpus_to_str(corpus) - corpus_embeddings = self.model.encode( - sentences, - task_name=task_name, - prompt_type=prompt_type, - batch_size=batch_size, - **kwargs, - ) - - if self.save_corpus_embeddings and request_qid: - self.corpus_embeddings[request_qid] = corpus_embeddings - return corpus_embeddings - - def encode( - self, - sentences: list[str], - task_name: str, - prompt_type: PromptType | None = None, - **kwargs, - ): - if prompt_type and prompt_type == PromptType.passage: - return self.encode_corpus( - sentences, task_name, prompt_type=prompt_type, **kwargs - ) - return self.model.encode( - sentences, task_name=task_name, prompt_type=prompt_type, **kwargs - ) - - -def is_cross_encoder_compatible(model) -> bool: - op = getattr(model.model, "predict", None) - return callable(op) - - # Adapted from https://github.com/beir-cellar/beir/blob/f062f038c4bfd19a8ca942a9910b1e0d218759d4/beir/retrieval/evaluation.py#L9 class RetrievalEvaluator(Evaluator): def __init__( diff --git a/mteb/evaluation/evaluators/model_classes.py b/mteb/evaluation/evaluators/model_classes.py new file mode 100644 index 0000000000..10375da8b3 --- /dev/null +++ b/mteb/evaluation/evaluators/model_classes.py @@ -0,0 +1,413 @@ +from __future__ import annotations + +import heapq +import json +import logging +import os +from collections import defaultdict +from pathlib import Path +from typing import Any + +import torch +import tqdm +from sentence_transformers import CrossEncoder, SentenceTransformer + +from mteb.encoder_interface import Encoder, PromptType +from mteb.model_meta import ModelMeta + +from .utils import convert_conv_history_to_query, cos_sim, dot_score, download + +logger = logging.getLogger(__name__) + + +def corpus_to_str( + corpus: list[dict[str, str]] | dict[str, list[str]] | list[str], +) -> list[str]: + if isinstance(corpus, dict): + sentences = [ + (corpus["title"][i] + " " + corpus["text"][i]).strip() + if "title" in corpus + else corpus["text"][i].strip() + for i in range(len(corpus["text"])) + ] + elif isinstance(corpus, list) and isinstance(corpus[0], dict): + sentences = [ + (doc["title"] + " " + doc["text"]).strip() + if "title" in doc + else doc["text"].strip() + for doc in corpus + ] + else: + sentences = corpus + return sentences + + +# Adapted from https://github.com/beir-cellar/beir/blob/f062f038c4bfd19a8ca942a9910b1e0d218759d4/beir/retrieval/search/dense/exact_search.py#L12 +class DenseRetrievalExactSearch: + def __init__( + self, + model: Encoder, + encode_kwargs: dict[str, Any] = {}, + corpus_chunk_size: int = 50000, + previous_results: str | Path | None = None, + **kwargs: Any, + ): + # Model is class that provides encode_corpus() and encode_queries() + self.model = model + self.encode_kwargs = encode_kwargs + + if "batch_size" not in encode_kwargs: + encode_kwargs["batch_size"] = 128 + if "show_progress_bar" not in encode_kwargs: + encode_kwargs["show_progress_bar"] 
= True + if "convert_to_tensor" not in encode_kwargs: + encode_kwargs["convert_to_tensor"] = True + + self.score_functions = {"cos_sim": cos_sim, "dot": dot_score} + self.score_function_desc = { + "cos_sim": "Cosine Similarity", + "dot": "Dot Product", + } + self.corpus_chunk_size = corpus_chunk_size + if isinstance(previous_results, Path): + self.previous_results = str(previous_results) + else: + self.previous_results = previous_results + self.batch_size = encode_kwargs.get("batch_size") + self.show_progress_bar = encode_kwargs.get("show_progress_bar") + self.save_corpus_embeddings = kwargs.get("save_corpus_embeddings", False) + self.corpus_embeddings = defaultdict(list) + self.results = {} + + if self.previous_results is not None: + self.previous_results = self.load_results_file() + + if isinstance(self.model, CrossEncoder): + # load the predict instance from the CrossEncoder + # custom functions can be used by extending the DenseRetrievalExactSearch class + self.predict = self.model.predict + + def search( + self, + corpus: dict[str, dict[str, str]], + queries: dict[str, str | list[str]], + top_k: int, + score_function: str, + task_name: str, + instructions: dict[str, str] | None = None, + request_qid: str | None = None, + return_sorted: bool = False, + **kwargs, + ) -> dict[str, dict[str, float]]: + # Create embeddings for all queries using model.encode + # Runs semantic search against the corpus embeddings + # Returns a ranked list with the corpus ids + if score_function not in self.score_functions: + raise ValueError( + f"score function: {score_function} must be either (cos_sim) for cosine similarity or (dot) for dot product" + ) + + logger.info("Encoding Queries.") + query_ids = list(queries.keys()) + self.results = {qid: {} for qid in query_ids} + queries = [queries[qid] for qid in queries] # type: ignore + if instructions: + queries = [f"{query} {instructions[query]}".strip() for query in queries] + if isinstance(queries[0], list): # type: ignore + query_embeddings = self.encode_conversations( + model=self.model, + conversations=queries, # type: ignore + task_name=task_name, + **self.encode_kwargs, + ) + else: + query_embeddings = self.model.encode( + queries, # type: ignore + task_name=task_name, + prompt_type=PromptType.query, + **self.encode_kwargs, + ) + + logger.info("Sorting Corpus by document length (Longest first)...") + corpus_ids = sorted( + corpus, + reverse=True, + ) + corpus = [corpus[cid] for cid in corpus_ids] # type: ignore + + logger.info("Encoding Corpus in batches... 
Warning: This might take a while!") + logger.info( + f"Scoring Function: {self.score_function_desc[score_function]} ({score_function})" + ) + + itr = range(0, len(corpus), self.corpus_chunk_size) + + result_heaps = { + qid: [] for qid in query_ids + } # Keep only the top-k docs for each query + for batch_num, corpus_start_idx in enumerate(itr): + logger.info(f"Encoding Batch {batch_num + 1}/{len(itr)}...") + corpus_end_idx = min(corpus_start_idx + self.corpus_chunk_size, len(corpus)) + + # Encode chunk of corpus + if ( + self.save_corpus_embeddings + and request_qid + and len(self.corpus_embeddings[request_qid]) + ): + sub_corpus_embeddings = torch.tensor( + self.corpus_embeddings[request_qid][batch_num] + ) + else: + # Encode chunk of corpus + sub_corpus_embeddings = self.model.encode( + corpus[corpus_start_idx:corpus_end_idx], # type: ignore + task_name=task_name, + prompt_type=PromptType.passage, + request_qid=request_qid, + **self.encode_kwargs, + ) + if self.save_corpus_embeddings and request_qid: + self.corpus_embeddings[request_qid].append(sub_corpus_embeddings) + + # Compute similarites using either cosine-similarity or dot product + cos_scores = self.score_functions[score_function]( + query_embeddings, sub_corpus_embeddings + ) + cos_scores[torch.isnan(cos_scores)] = -1 + + # Get top-k values + cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk( + cos_scores, + min( + top_k + 1, + len(cos_scores[1]) if len(cos_scores) > 1 else len(cos_scores[-1]), + ), + dim=1, + largest=True, + sorted=return_sorted, + ) + cos_scores_top_k_values = cos_scores_top_k_values.cpu().tolist() + cos_scores_top_k_idx = cos_scores_top_k_idx.cpu().tolist() + + for query_itr in range(len(query_embeddings)): + query_id = query_ids[query_itr] + for sub_corpus_id, score in zip( + cos_scores_top_k_idx[query_itr], cos_scores_top_k_values[query_itr] + ): + corpus_id = corpus_ids[corpus_start_idx + sub_corpus_id] + if len(result_heaps[query_id]) < top_k: + # Push item on the heap + heapq.heappush(result_heaps[query_id], (score, corpus_id)) + else: + # If item is larger than the smallest in the heap, push it on the heap then pop the smallest element + heapq.heappushpop(result_heaps[query_id], (score, corpus_id)) + + for qid in result_heaps: + for score, corpus_id in result_heaps[qid]: + self.results[qid][corpus_id] = score + + return self.results + + def load_results_file(self): + # load the first stage results from file in format {qid: {doc_id: score}} + if "https://" in self.previous_results: + # download the file + if not os.path.exists(self.previous_results): + url_descriptor = self.previous_results.split("https://")[-1].replace( + "/", "--" + ) + dest_file = os.path.join( + "results", f"cached_predictions--{url_descriptor}" + ) + os.makedirs(os.path.dirname(os.path.abspath(dest_file)), exist_ok=True) + download(self.previous_results, dest_file) + logger.info( + f"Downloaded the previous results at {self.previous_results} to {dest_file}" + ) + self.previous_results = dest_file + + with open(self.previous_results) as f: + previous_results = json.load(f) + assert isinstance(previous_results, dict) + assert isinstance(previous_results[list(previous_results.keys())[0]], dict) + return previous_results + + def search_cross_encoder( + self, + corpus: dict[str, dict[str, str]], + queries: dict[str, str | list[str]], + top_k: int, + instructions: dict[str, str] | None = None, + **kwargs, + ) -> dict[str, dict[str, float]]: + """This function provides support for reranker (or cross-encoder) models that encoder 
query and document at the same time (typically with attention). + Some notable examples include MonoBERT, MonoT5, RankLlama, etc. + Note: you must provide the path to the results to rerank to the __init__ function as `previous_results` or else rerank all documents in the corpus + """ + pairs = [] # create the pairs for reranking + for qid in queries.keys(): + if self.previous_results is None: + # try to use all of them + logging.logging( + f"previous_results is None. Using all the documents to rerank: {len(corpus)}" + ) + q_results = {doc_id: 0.0 for doc_id in corpus.keys()} + else: + q_results = self.previous_results[qid] + # take the top-k only + q_results_sorted = dict( + sorted(q_results.items(), key=lambda item: item[1], reverse=True) + ) + top_n = [k for k, v in list(q_results_sorted.items())[:top_k]] + query = queries[qid] + query = ( + self.convert_conv_history_to_query(self.model, [query])[0] + if isinstance(query, list) + else query + ) + for doc_id in top_n: + pairs.append( + ( + query, + corpus[doc_id], + instructions[query] if instructions is not None else None, + qid, + doc_id, + ) + ) + + logger.info(f"Reranking the top {top_k} in batches... This might take a while!") + itr = range(0, len(pairs), self.batch_size) + + results = {qid: {} for qid in queries.keys()} + for batch_num, corpus_start_idx in enumerate( + tqdm.tqdm(itr, leave=False, disable=not self.show_progress_bar) + ): + corpus_end_idx = min(corpus_start_idx + self.batch_size, len(pairs)) + cur_batch = pairs[corpus_start_idx:corpus_end_idx] + + ( + queries_in_pair, + corpus_in_pair, + instructions_in_pair, + query_ids, + corpus_ids, + ) = zip(*cur_batch) + + assert ( + len(queries_in_pair) == len(corpus_in_pair) == len(instructions_in_pair) + ) + + if isinstance(self.model.model, CrossEncoder): + # can't take instructions, so add them here + queries_in_pair = [ + f"{q} {i}".strip() + for i, q in zip(instructions_in_pair, queries_in_pair) + ] + scores = self.model.predict(list(zip(queries_in_pair, corpus_in_pair))) # type: ignore + else: + # may use the instructions in a unique way, so give them also + scores = self.model.predict( # type: ignore + list(zip(queries_in_pair, corpus_in_pair, instructions_in_pair)) + ) + + for i, score in enumerate(scores): + results[query_ids[i]][corpus_ids[i]] = float(score) + + return results + + def predict(self, queries, passages, **kwargs): + raise NotImplementedError( + "You must implement a predict method for your reranker model" + ) + + def encode_conversations( + self, + model: Encoder, + conversations: list[list[str]], + task_name: str, + **kwargs, + ): + if callable(getattr(self.model, "encode_conversations", None)): + return model.encode_conversations( # type: ignore + conversations, task_name=task_name, **kwargs + ) + logger.warning( + "Model doesn't have encode_conversations fallback to default implementation" + ) + queries = self.convert_conv_history_to_query(model, conversations) # type: ignore + return model.encode( + queries, task_name=task_name, prompt_type=PromptType.query, **kwargs + ) # type: ignore + + @staticmethod + def convert_conv_history_to_query( + model: Encoder, conversations: list[list[str]] + ) -> str: + if callable(getattr(model, "convert_conv_history_to_query", None)): + return model.convert_conv_history_to_query(conversations) # type: ignore + return convert_conv_history_to_query(conversations) # type: ignore + + +class DRESModel: + """Dense Retrieval Exact Search (DRES). + This class converts a model with just an .encode method into DRES format. 
+ """ + + mteb_model_meta: ModelMeta | None + + def __init__(self, model, **kwargs): + self.model = model + self.use_sbert_model = isinstance(model, SentenceTransformer) + self.save_corpus_embeddings = kwargs.get("save_corpus_embeddings", False) + self.corpus_embeddings = {} + + def encode_corpus( + self, + corpus: list[dict[str, str]], + task_name: str, + batch_size: int, + prompt_type: PromptType = PromptType.passage, + request_qid: str | None = None, + **kwargs, + ): + if ( + request_qid + and self.save_corpus_embeddings + and len(self.corpus_embeddings) > 0 + ): + return self.corpus_embeddings[request_qid] + + sentences = corpus_to_str(corpus) + corpus_embeddings = self.model.encode( + sentences, + task_name=task_name, + prompt_type=prompt_type, + batch_size=batch_size, + **kwargs, + ) + + if self.save_corpus_embeddings and request_qid: + self.corpus_embeddings[request_qid] = corpus_embeddings + return corpus_embeddings + + def encode( + self, + sentences: list[str], + task_name: str, + prompt_type: PromptType | None = None, + **kwargs, + ): + if prompt_type and prompt_type == PromptType.passage: + return self.encode_corpus( + sentences, task_name, prompt_type=prompt_type, **kwargs + ) + return self.model.encode( + sentences, task_name=task_name, prompt_type=prompt_type, **kwargs + ) + + +def is_cross_encoder_compatible(model) -> bool: + op = getattr(model.model, "predict", None) + return callable(op) diff --git a/mteb/overview.py b/mteb/overview.py index 7b1bfbb426..91723ec4df 100644 --- a/mteb/overview.py +++ b/mteb/overview.py @@ -8,7 +8,7 @@ import pandas as pd -from mteb.abstasks import AbsTask +from mteb.abstasks import AbsTask, AbsTaskReranking from mteb.abstasks.TaskMetadata import TASK_CATEGORY, TASK_DOMAIN, TASK_TYPE from mteb.languages import ( ISO_TO_LANGUAGE, @@ -25,19 +25,31 @@ def create_task_list() -> list[type[AbsTask]]: - tasks_categories_cls = list(AbsTask.__subclasses__()) - tasks = [ - cls - for cat_cls in tasks_categories_cls - for cls in cat_cls.__subclasses__() - if cat_cls.__name__.startswith("AbsTask") - ] + # reranking subclasses retrieval to share methods, but is an abstract task + tasks_categories_cls = list(AbsTask.__subclasses__()) + [AbsTaskReranking] + tasks = [] + for cat_cls in tasks_categories_cls: + for cls in cat_cls.__subclasses__(): + if ( + cat_cls.__name__.startswith("AbsTask") + and cls.__name__ != "AbsTaskReranking" + ): + tasks.append(cls) return tasks def create_name_to_task_mapping() -> dict[str, type[AbsTask]]: tasks = create_task_list() - return {cls.metadata.name: cls for cls in tasks} + metadata_names = {} + for cls in tasks: + if cls.metadata.name in metadata_names: + raise ValueError( + f"Duplicate task name found: {cls.metadata.name}. Please make sure that all task names are unique." 
+ ) + if "AbsTask" in cls.__name__: + continue + metadata_names[cls.metadata.name] = cls + return metadata_names TASKS_REGISTRY = create_name_to_task_mapping() diff --git a/mteb/tasks/Reranking/eng/AskUbuntuDupQuestions.py b/mteb/tasks/Reranking/eng/AskUbuntuDupQuestions.py index 07d6118111..54bc8316b9 100644 --- a/mteb/tasks/Reranking/eng/AskUbuntuDupQuestions.py +++ b/mteb/tasks/Reranking/eng/AskUbuntuDupQuestions.py @@ -19,7 +19,7 @@ class AskUbuntuDupQuestions(AbsTaskReranking): modalities=["text"], eval_splits=["test"], eval_langs=["eng-Latn"], - main_score="map", + main_score="map_at_1000", date=None, domains=None, task_subtypes=None, diff --git a/mteb/tasks/Reranking/rus/RuBQReranking.py b/mteb/tasks/Reranking/rus/RuBQReranking.py index 5303f413eb..d9b71386aa 100644 --- a/mteb/tasks/Reranking/rus/RuBQReranking.py +++ b/mteb/tasks/Reranking/rus/RuBQReranking.py @@ -19,7 +19,7 @@ class RuBQReranking(AbsTaskReranking): modalities=["text"], eval_splits=["test"], eval_langs=["rus-Cyrl"], - main_score="map", + main_score="map_at_1000", date=("2001-01-01", "2021-01-01"), domains=["Encyclopaedic", "Written"], task_subtypes=[], From 0fb211db1203bb481257525d10f33855964cc13f Mon Sep 17 00:00:00 2001 From: Orion Weller Date: Tue, 29 Oct 2024 14:56:52 -0400 Subject: [PATCH 02/16] merged retrieval; working --- mteb/abstasks/AbsTaskInstructionRetrieval.py | 10 +- mteb/abstasks/AbsTaskReranking.py | 101 ++-- mteb/abstasks/AbsTaskRetrieval.py | 23 +- .../InstructionRetrievalEvaluator.py | 52 -- .../evaluators/RerankingEvaluator.py | 563 ------------------ .../evaluators/RetrievalEvaluator.py | 16 +- mteb/evaluation/evaluators/__init__.py | 1 - .../Reranking/multilingual/ESCIReranking.py | 2 +- .../Reranking/multilingual/MIRACLReranking.py | 5 +- mteb/tasks/Reranking/zho/CMTEBReranking.py | 4 +- .../test_InstructionRetrievalEvaluator.py | 6 +- .../test_RerankingEvaluator.py | 58 -- 12 files changed, 90 insertions(+), 751 deletions(-) delete mode 100644 mteb/evaluation/evaluators/InstructionRetrievalEvaluator.py delete mode 100644 mteb/evaluation/evaluators/RerankingEvaluator.py delete mode 100644 tests/test_evaluators/test_RerankingEvaluator.py diff --git a/mteb/abstasks/AbsTaskInstructionRetrieval.py b/mteb/abstasks/AbsTaskInstructionRetrieval.py index a0107abc75..56170a4946 100644 --- a/mteb/abstasks/AbsTaskInstructionRetrieval.py +++ b/mteb/abstasks/AbsTaskInstructionRetrieval.py @@ -13,9 +13,7 @@ from mteb.encoder_interface import Encoder from ..evaluation.evaluators import utils -from ..evaluation.evaluators.InstructionRetrievalEvaluator import ( - InstructionRetrievalEvaluator, -) +from ..evaluation.evaluators.RetrievalEvaluator import RetrievalEvaluator from .AbsTask import AbsTask, DescriptiveStatistics from .AbsTaskRetrieval import HFDataLoader @@ -347,7 +345,7 @@ def load_data(self, **kwargs): def _evaluate_subset_lang( self, - retriever: InstructionRetrievalEvaluator, + retriever: RetrievalEvaluator, corpus: dict, queries: dict, og_relevant_docs: dict, @@ -467,7 +465,7 @@ def evaluate( encode_kwargs: dict[str, Any] = {}, **kwargs, ) -> dict[str, dict[str, Any]]: - retriever = InstructionRetrievalEvaluator( + retriever = RetrievalEvaluator( retriever=model, task_name=self.metadata.name, encode_kwargs=encode_kwargs, @@ -523,7 +521,7 @@ def _add_main_score(self, scores: dict[str, dict[str, float]]) -> None: def _evaluate_subset( self, - retriever: InstructionRetrievalEvaluator, + retriever: RetrievalEvaluator, corpus: dict[str, dict[str, str]], queries: dict[str, str], relevant_docs: 
dict[str, dict[str, int]], diff --git a/mteb/abstasks/AbsTaskReranking.py b/mteb/abstasks/AbsTaskReranking.py index a9905a407b..1cda0608da 100644 --- a/mteb/abstasks/AbsTaskReranking.py +++ b/mteb/abstasks/AbsTaskReranking.py @@ -6,7 +6,7 @@ import datasets import tqdm -from .AbsTask import DescriptiveStatistics +from ..load_results.task_results import ScoresDict from .AbsTaskRetrieval import AbsTaskRetrieval logger = logging.getLogger(__name__) @@ -31,26 +31,6 @@ ] -class RerankingDescriptiveStatistics(DescriptiveStatistics): - """Descriptive statistics for Reranking - - Attributes: - num_samples: number of samples in the dataset. - num_positive: Number of positive examples - num_negative: Number of negative examples - avg_query_len: Average length of queries - avg_positive_len: Average length of positive examples - avg_negative_len: Average length of negative examples - """ - - num_samples: int - num_positive: int - num_negative: int - avg_query_len: float - avg_positive_len: float - avg_negative_len: float - - class AbsTaskReranking(AbsTaskRetrieval): """Abstract class for re-ranking experiments. This is mostly the same as the RetrievalEvaluator, but as previously it wasn't we need to keep it to transform old dataset versions into the same format. @@ -152,42 +132,51 @@ def transform_old_format_to_standard(self): self.top_ranked[split][query_id].append(doc_id) self.relevant_docs[split][query_id][doc_id] = 0 + self.instructions = None # previous tasks don't have instructions self.data_loaded = True - def _calculate_metrics_from_split( - self, split: str, hf_subset: str | None = None, compute_overall: bool = False - ) -> RerankingDescriptiveStatistics: - if self.metadata.name in OLD_FORMAT_RERANKING_TASKS: - # TODO: do we want the old calculated metrics for these, or should we switch to the new? - if hf_subset: - query = self.original_dataset[hf_subset][split]["query"] - positive = self.original_dataset[hf_subset][split]["positive"] - negative = self.original_dataset[hf_subset][split]["negative"] - elif compute_overall: - query = [] - positive = [] - negative = [] - for hf_subset in self.metadata.eval_langs: - query.extend(self.original_dataset[hf_subset][split]["query"]) - positive.extend(self.original_dataset[hf_subset][split]["positive"]) - negative.extend(self.original_dataset[hf_subset][split]["negative"]) + def _evaluate_subset( + self, retriever, corpus, queries, relevant_docs, hf_subset: str, **kwargs + ) -> ScoresDict: + all_results = defaultdict(dict) + max_docs = 0 + top_ranked = kwargs["top_ranked"] # must be present for reranking + for query_id in tqdm.tqdm( + list(queries.keys()), leave=False, desc="Reranking over query-ids.." 
+ ): + cur_queries = {query_id: queries[query_id]} + if "instructions" in kwargs: + instructions = kwargs["instructions"] + cur_instructions = {queries[query_id]: instructions[queries[query_id]]} else: - query = self.original_dataset[split]["query"] - positive = self.original_dataset[split]["positive"] - negative = self.original_dataset[split]["negative"] - - total_len_query = sum([len(q) for q in query]) - total_len_positive = sum([len(p) for p in positive]) - total_len_negative = sum([len(n) for n in negative]) - return RerankingDescriptiveStatistics( - num_samples=len(query), - num_positive=len(positive), - num_negative=len(negative), - avg_query_len=total_len_query / len(query), - avg_positive_len=total_len_positive / len(positive), - avg_negative_len=total_len_negative / len(negative), - ) - else: - return super()._calculate_metrics_from_split( - split, hf_subset, compute_overall + cur_instructions = None + + doc_ids_to_rerank = top_ranked[query_id] + cur_corpus = {doc_id: corpus[doc_id] for doc_id in doc_ids_to_rerank} + if ( + len(cur_corpus) > max_docs + ): # use this to make sure we get the correct MAP/MRR at max length + max_docs = len(cur_corpus) + + # to handle instruction-based reranking we pass both query_id and instructions (unused if not instruction-based) + results = retriever( + cur_corpus, + cur_queries, + instructions=cur_instructions, + query_id=query_id, ) + # results should have only one key, the query_id + all_results[query_id] = results[query_id] + + # do the evaluation like normal now, but pass our results + if max_docs > max(retriever.k_values): + retriever.k_values += [max_docs] + return super()._evaluate_subset( + retriever, + corpus, + queries, + relevant_docs, + hf_subset, + results=all_results, + **kwargs, + ) diff --git a/mteb/abstasks/AbsTaskRetrieval.py b/mteb/abstasks/AbsTaskRetrieval.py index 6fa901c791..b978bf5b07 100644 --- a/mteb/abstasks/AbsTaskRetrieval.py +++ b/mteb/abstasks/AbsTaskRetrieval.py @@ -238,6 +238,7 @@ def load_data(self, **kwargs): if self.data_loaded: return self.corpus, self.queries, self.relevant_docs = {}, {}, {} + self.instructions, self.top_ranked = None, None dataset_path = self.metadata_dict["dataset"]["path"] hf_repo_qrels = ( dataset_path + "-qrels" if "clarin-knext" in dataset_path else None @@ -289,12 +290,21 @@ def evaluate( self.queries[split], self.relevant_docs[split], ) + if self.top_ranked is not None: + kwargs["top_ranked"] = self.top_ranked[split] + if self.instructions is not None: + kwargs["instructions"] = self.instructions[split] else: corpus, queries, relevant_docs = ( self.corpus[hf_subset][split], self.queries[hf_subset][split], self.relevant_docs[hf_subset][split], ) + if self.top_ranked is not None: + kwargs["top_ranked"] = self.top_ranked[hf_subset][split] + if self.instructions is not None: + kwargs["instructions"] = self.instructions[hf_subset][split] + scores[hf_subset] = self._evaluate_subset( retriever, corpus, queries, relevant_docs, hf_subset, **kwargs ) @@ -303,10 +313,15 @@ def evaluate( def _evaluate_subset( self, retriever, corpus, queries, relevant_docs, hf_subset: str, **kwargs ) -> ScoresDict: - start_time = time() - results = retriever(corpus, queries) - end_time = time() - logger.info(f"Time taken to retrieve: {end_time - start_time:.2f} seconds") + if "results" in kwargs: + # reranking has already been done + results = kwargs["results"] + else: + # perform the retrieval here + start_time = time() + results = retriever(corpus, queries) + end_time = time() + logger.info(f"Time taken to 
retrieve: {end_time - start_time:.2f} seconds") save_predictions = kwargs.get("save_predictions", False) export_errors = kwargs.get("export_errors", False) diff --git a/mteb/evaluation/evaluators/InstructionRetrievalEvaluator.py b/mteb/evaluation/evaluators/InstructionRetrievalEvaluator.py deleted file mode 100644 index f17dad9872..0000000000 --- a/mteb/evaluation/evaluators/InstructionRetrievalEvaluator.py +++ /dev/null @@ -1,52 +0,0 @@ -from __future__ import annotations - -import logging - -from .RetrievalEvaluator import ( - RetrievalEvaluator, -) - -logger = logging.getLogger(__name__) - - -class InstructionRetrievalEvaluator(RetrievalEvaluator): - # only added to extend the RetrievalEvaluator to pass along the instructions - def __call__( - self, - corpus: dict[str, dict[str, str]], - queries: dict[str, str], - instructions: dict[str, str], - qid: str | None = None, - **kwargs, - ) -> dict[str, dict[str, float]]: - if not self.retriever: - raise ValueError("Model/Technique has not been provided!") - - if self.is_cross_encoder: - return self.retriever.search_cross_encoder( - corpus, queries, self.top_k, instructions=instructions, **kwargs - ) - elif ( - hasattr(self.retriever.model, "mteb_model_meta") - and self.retriever.model.mteb_model_meta.name == "bm25s" - ): - return self.retriever.model.search( - corpus, - queries, - self.top_k, - self.score_function, - task_name=self.task_name, # type: ignore - instructions=instructions, - **kwargs, - ) - else: - return self.retriever.search( - corpus, - queries, - self.top_k, - self.score_function, - instructions=instructions, - request_qid=qid, - task_name=self.task_name, - **kwargs, - ) diff --git a/mteb/evaluation/evaluators/RerankingEvaluator.py b/mteb/evaluation/evaluators/RerankingEvaluator.py deleted file mode 100644 index 62d741ee0c..0000000000 --- a/mteb/evaluation/evaluators/RerankingEvaluator.py +++ /dev/null @@ -1,563 +0,0 @@ -from __future__ import annotations - -import logging -from typing import Any - -import numpy as np -import torch -import tqdm -from sklearn.metrics import average_precision_score - -from mteb.evaluation.evaluators.RetrievalEvaluator import RetrievalEvaluator - -from ...encoder_interface import Encoder, PromptType -from .Evaluator import Evaluator -from .utils import confidence_scores, cos_sim, nAUC - -logger = logging.getLogger(__name__) - - -class RerankingEvaluator(Evaluator): - """This class evaluates a SentenceTransformer model for the task of re-ranking. - Given a query and a list of documents, it computes the score [query, doc_i] for all possible - documents and sorts them in decreasing order. Then, MRR@10 and MAP is compute to measure the quality of the ranking. - :param samples: Must be a list and each element is of the form: - - {'query': '', 'positive': [], 'negative': []}. Query is the search query, positive is a list of positive - (relevant) documents, negative is a list of negative (irrelevant) documents. - - {'query': [], 'positive': [], 'negative': []}. Where query is a list of strings, which embeddings we average - to get the query embedding. 
- """ - - def __init__( - self, - samples, - task_name: str | None = None, - mrr_at_k: int = 10, - name: str = "", - similarity_fct=cos_sim, - encode_kwargs: dict[str, Any] = {}, - use_batched_encoding: bool = True, - limit: int | None = None, - k_values: list[int] = [1, 3, 5, 10, 20, 100, 1000], - evaluator_type: str = "standard", - **kwargs, - ): - super().__init__(**kwargs) - if limit: - samples = samples.train_test_split(limit)["test"] - self.samples = samples - self.name = name - self.mrr_at_k = mrr_at_k - self.similarity_fct = similarity_fct - self.use_batched_encoding = use_batched_encoding - self.task_name = task_name - self.k_values = k_values - self.evaluator_type = evaluator_type - self.encode_kwargs = encode_kwargs - - if "batch_size" not in self.encode_kwargs: - self.encode_kwargs["batch_size"] = 512 - - if isinstance(self.samples, dict): - self.samples = list(self.samples.values()) - - ### Remove sample with empty positive / negative set - self.samples = [ - sample - for sample in self.samples - if len(sample["positive"]) > 0 and len(sample["negative"]) > 0 - ] - - def __call__(self, model: Encoder): - scores = self.compute_metrics(model) - return scores - - def compute_metrics(self, model: Encoder): - return ( - self.compute_metrics_batched(model) - if self.use_batched_encoding - else self.compute_metrics_individual(model) - ) - - def compute_metrics_batched(self, model: Encoder): - """Computes the metrices in a batched way, by batching all queries and - all documents together - """ - logger.info("Encoding queries...") - if isinstance(self.samples[0]["query"], str): - all_query_embs = np.asarray( - model.encode( - [sample["query"] for sample in self.samples], - task_name=self.task_name, - prompt_type=PromptType.query, - **self.encode_kwargs, - ) - ) - elif isinstance(self.samples[0]["query"], list): - # In case the query is a list of strings, we get the most similar embedding to any of the queries - all_query_flattened = [ - q for sample in self.samples for q in sample["query"] - ] - all_query_embs = self._encode_unique_texts( - all_query_flattened, - model, - task_name=self.task_name, - prompt_type=PromptType.query, - **self.encode_kwargs, - ) - else: - raise ValueError( - f"Query must be a string or a list of strings but is {type(self.samples[0]['query'])}" - ) - - if self.evaluator_type == "standard": - results = self._encode_candidates( - model=model, - batched=True, - all_query_embs=all_query_embs, - ) - elif self.evaluator_type == "miracl": - results = self._encode_candidates_miracl( - model=model, - batched=True, - all_query_embs=all_query_embs, - ) - return results - - def compute_metrics_individual(self, model: Encoder): - """Embeds every (query, positive, negative) tuple individually. - Is slower than the batched version, but saves memory as only the - embeddings for one tuple are needed. 
Useful when you have - a really large test set - """ - if self.evaluator_type == "standard": - results = self._encode_candidates( - model=model, - batched=False, - ) - elif self.evaluator_type == "miracl": - results = self._encode_candidates_miracl( - model=model, - batched=False, - ) - return results - - def _encode_candidates(self, model: Encoder, batched: bool, all_query_embs=None): - all_mrr_scores = [] - all_ap_scores = [] - all_conf_scores = [] - logger.info("Encoding candidates...") - if batched: - self._encode_candidates_batched( - model=model, - all_query_embs=all_query_embs, - all_mrr_scores=all_mrr_scores, - all_ap_scores=all_ap_scores, - all_conf_scores=all_conf_scores, - ) - else: - self._encode_candidates_individual( - model=model, - all_mrr_scores=all_mrr_scores, - all_ap_scores=all_ap_scores, - all_conf_scores=all_conf_scores, - ) - scores = self._collect_results(all_mrr_scores, all_ap_scores, all_conf_scores) - return scores - - def _encode_candidates_batched( - self, - all_query_embs, - model: Encoder, - all_mrr_scores, - all_ap_scores, - all_conf_scores, - ): - all_docs = [] - for sample in self.samples: - all_docs.extend(sample["positive"]) - all_docs.extend(sample["negative"]) - - all_docs_embs = self._encode_unique_texts( - all_docs, - model, - task_name=self.task_name, - prompt_type=PromptType.passage, - **self.encode_kwargs, - ) - - # Compute scores and confidence scores - logger.info("Evaluating...") - query_idx, docs_idx = 0, 0 - for instance in self.samples: - num_subqueries = ( - len(instance["query"]) if isinstance(instance["query"], list) else 1 - ) - query_emb = all_query_embs[query_idx : query_idx + num_subqueries] - query_idx += num_subqueries - - num_pos = len(instance["positive"]) - num_neg = len(instance["negative"]) - docs_emb = all_docs_embs[docs_idx : docs_idx + num_pos + num_neg] - docs_idx += num_pos + num_neg - - if num_pos == 0 or num_neg == 0: - continue - is_relevant = [True] * num_pos + [False] * num_neg - self._apply_sim_scores( - query_emb, - docs_emb, - is_relevant, - all_mrr_scores, - all_ap_scores, - all_conf_scores, - ) - - def _encode_candidates_individual( - self, - model: Encoder, - all_mrr_scores, - all_ap_scores, - all_conf_scores, - ): - for instance in tqdm.tqdm(self.samples, desc="Samples"): - query = instance["query"] - positive = list(instance["positive"]) - negative = list(instance["negative"]) - - if len(positive) == 0 or len(negative) == 0: - continue - - docs = positive + negative - is_relevant = [True] * len(positive) + [False] * len(negative) - - if isinstance(query, str): - # .encoding interface requires list[str] as input - query = [query] - query_emb = np.asarray( - model.encode( - query, - task_name=self.task_name, - prompt_type=PromptType.query, - **self.encode_kwargs, - ) - ) - docs_emb = np.asarray( - model.encode( - docs, - task_name=self.task_name, - prompt_type=PromptType.passage, - **self.encode_kwargs, - ) - ) - self._apply_sim_scores( - query_emb, - docs_emb, - is_relevant, - all_mrr_scores, - all_ap_scores, - all_conf_scores, - ) - - def _collect_results(self, all_mrr_scores, all_ap_scores, all_conf_scores): - mean_ap = np.mean(all_ap_scores) - mean_mrr = np.mean(all_mrr_scores) - - # Compute nAUCs - naucs_map = self.nAUC_scores(all_conf_scores, all_ap_scores, "map") - naucs_mrr = self.nAUC_scores(all_conf_scores, all_mrr_scores, "mrr") - - return {**{"map": mean_ap, "mrr": mean_mrr}, **naucs_map, **naucs_mrr} - - def _encode_candidates_miracl( - self, - model: Encoder, - batched, - all_query_embs=None, - ): 
- if batched: - return self._encode_candidates_miracl_batched( - model=model, all_query_embs=all_query_embs - ) - else: - return self._encode_candidates_miracl_individual( - model=model, - ) - - def _encode_candidates_miracl_batched(self, all_query_embs, model: Encoder): - all_docs = [] - for sample in self.samples: - all_docs.extend(sample["candidates"]) - - all_docs_embs = np.asarray( - model.encode( - all_docs, - task_name=self.task_name, - prompt_type=PromptType.passage, - **self.encode_kwargs, - ) - ) - - # Compute scores - logger.info("Evaluating...") - query_idx, docs_idx = 0, 0 - results, qrels = {}, {} - for instance in self.samples: - num_subqueries = ( - len(instance["query"]) if isinstance(instance["query"], list) else 1 - ) - query_emb = all_query_embs[query_idx : query_idx + num_subqueries] - query_idx += num_subqueries - - positive = instance["positive"] - docs = instance["candidates"] - num_doc = len(docs) - docs_emb = all_docs_embs[docs_idx : docs_idx + num_doc] - docs_idx += num_doc - - fake_qid = str(query_idx) - results[fake_qid] = self.rerank(query_emb, docs_emb) - qrels[fake_qid] = { - str(i): 1 if doc in positive else 0 for i, doc in enumerate(docs) - } - - scores_miracl = self._collect_miracl_results(results, qrels) - return scores_miracl - - def _encode_candidates_miracl_individual(self, model: Encoder): - results, qrels = {}, {} - for i, instance in enumerate(tqdm.tqdm(self.samples, desc="Samples")): - query = instance["query"] - positive = set(instance["positive"]) - docs = list(instance["candidates"]) - - if isinstance(query, str): - # .encoding interface requires list[str] as input - query_emb = np.asarray( - model.encode( - [query], - task_name=self.task_name, - prompt_type=PromptType.query, - **self.encode_kwargs, - ) - ) - docs_emb = np.asarray( - model.encode( - docs, - task_name=self.task_name, - prompt_type=PromptType.passage, - **self.encode_kwargs, - ) - ) - - fake_qid = str(i) - results[fake_qid] = self.rerank(query_emb, docs_emb) - qrels[fake_qid] = { - str(i): 1 if doc in positive else 0 for i, doc in enumerate(docs) - } - - scores_miracl = self._collect_miracl_results(results, qrels) - return scores_miracl - - def _collect_miracl_results(self, results, qrels): - ndcg, _map, recall, precision, naucs = RetrievalEvaluator.evaluate( - qrels=qrels, - results=results, - k_values=self.k_values, - ignore_identical_ids=False, - ) - scores = {**ndcg, **_map, **recall, **precision, **naucs} - scores_miracl = {f"{k}(MIRACL)": v for k, v in scores.items()} - return scores_miracl - - def rerank( - self, query_emb: torch.Tensor, docs_emb: torch.Tensor - ) -> dict[str, float]: - """Rerank documents (docs_emb) given the query (query_emb) - - Args: - query_emb: Query embedding of shape `(num_queries, hidden_size)`) - if `num_queries` > 0: we take the closest document to any of the queries - docs_emb: Candidates documents embeddings of shape `(num_pos+num_neg, hidden_size)`) - - Returns: - similarity_scores: - """ - if not query_emb.shape[0]: - raise ValueError("Empty query embedding") - - if not docs_emb.shape[0]: - return {"empty-docid": 0} - - pred_scores = self.similarity_fct(query_emb, docs_emb) - if len(pred_scores.shape) > 1: - pred_scores = torch.amax(pred_scores, dim=0) - - return { - str(i): score.detach().numpy().item() for i, score in enumerate(pred_scores) - } - - def _apply_sim_scores( - self, - query_emb, - docs_emb, - is_relevant, - all_mrr_scores, - all_ap_scores, - all_conf_scores, - ): - sim_scores = self._compute_sim_scores_instance(query_emb, 
docs_emb) - scores = self._compute_metrics_instance(sim_scores, is_relevant) - conf_scores = self.conf_scores(sim_scores.tolist()) - - all_mrr_scores.append(scores["mrr"]) - all_ap_scores.append(scores["ap"]) - all_conf_scores.append(conf_scores) - - @staticmethod - def _encode_unique_texts( - all_texts: list[str], - model: Encoder, - task_name: str | None, - prompt_type: PromptType | None, - **encode_kwargs: Any, - ): - index_map, all_unique_texts, all_texts_indexes = {}, [], [] - for text in all_texts: - text_hash = hash(text) - if text_hash not in index_map: - index_map[text_hash] = len(all_unique_texts) - all_unique_texts.append(text) - all_texts_indexes.append(index_map[text_hash]) - logger.warning( - f"A total on {len(all_texts) - len(all_unique_texts)}/{len(all_texts)} duplicate texts were found during encoding. Only encoding unique text and duplicating embeddings across." - ) - all_unique_texts_embs = np.asarray( - model.encode( - all_unique_texts, - task_name=task_name, - prompt_type=prompt_type, - **encode_kwargs, - ) - ) - return all_unique_texts_embs[all_texts_indexes] - - def _compute_sim_scores_instance( - self, query_emb: torch.Tensor, docs_emb: torch.Tensor - ) -> torch.Tensor: - """Computes similarity scores for a single instance = (query, positives, negatives) - - Args: - query_emb: Query embedding, with shape `(num_queries, hidden_size)` - if `num_queries` > 0: we take the closest document to any of the queries - docs_emb: Candidates documents embeddings, with shape `(num_pos+num_neg, hidden_size)` - - Returns: - sim_scores: Query-documents similarity scores, with shape `(num_pos+num_neg,)` - """ - sim_scores = self.similarity_fct(query_emb, docs_emb) - if len(sim_scores.shape) > 1: - sim_scores = torch.amax(sim_scores, dim=0) - - return sim_scores - - def _compute_metrics_instance( - self, sim_scores: torch.Tensor, is_relevant: list[bool] - ) -> dict[str, float]: - """Computes metrics for a single instance = (query, positives, negatives) - - Args: - sim_scores: Query-documents similarity scores, with shape `(num_pos+num_neg,)` - is_relevant: True if the document is relevant, with length `num_pos+num_neg` - - Returns: - scores: - - `mrr`: Mean Reciprocal Rank @ `self.mrr_at_k` - - `ap`: Average Precision - """ - pred_scores_argsort = torch.argsort(-sim_scores) # Sort in decreasing order - mrr = self.mrr_at_k_score(is_relevant, pred_scores_argsort, self.mrr_at_k) - ap = self.ap_score(is_relevant, sim_scores.cpu().tolist()) - return {"mrr": mrr, "ap": ap} - - @staticmethod - def conf_scores(sim_scores: torch.Tensor) -> dict[str, float]: - """Computes confidence scores for a single instance = (query, positives, negatives) - - Args: - sim_scores: Query-documents similarity scores, with shape `(num_pos+num_neg,)` - - Returns: - conf_scores: - - `max`: Maximum similarity score - - `std`: Standard deviation of similarity scores - - `diff1`: Difference between highest and second highest similarity scores - """ - return confidence_scores(sim_scores) - - @staticmethod - def nAUC_scores( - all_conf_scores: list[dict[str, float]], - metrics: list[float], - metric_name: str, - ) -> dict[str, float]: - """Computes normalized Area Under the Curve on a set of evaluated instances as presented in the paper https://arxiv.org/abs/2402.12997 - - Args: - all_conf_scores: Confidence scores for all instances, with length `len(samples)` - metrics: Metric scores for all instances, with length `len(samples)` - metric_name: Name of the metric (mrr or ap) - - Returns: - naucs: nAUCs for each 
confidence function - """ - conf_fcts = list(all_conf_scores[0].keys()) - all_conf_scores = { - fct: np.array([x[fct] for x in all_conf_scores]) for fct in conf_fcts - } - metrics = np.array(metrics) - naucs = { - f"nAUC_{metric_name}_{fct}": nAUC(all_conf_scores[fct], metrics) - for fct in conf_fcts - } - return naucs - - @staticmethod - def mrr_at_k_score( - is_relevant: list[bool], pred_ranking: list[int], k: int - ) -> float: - """Computes MRR@k score - - Args: - is_relevant: True if the document is relevant - pred_ranking: Indices of the documents sorted in decreasing order - of the similarity score - k: Top-k documents to consider - - Returns: - The MRR@k score - """ - mrr_score = 0 - for rank, index in enumerate(pred_ranking[:k]): - if is_relevant[index]: - mrr_score = 1 / (rank + 1) - break - - return mrr_score - - @staticmethod - def ap_score(is_relevant, pred_scores): - """Computes AP score - - Args: - is_relevant (`list[bool]` of length `num_pos+num_neg`): True if the document is relevant - pred_scores (`list[float]` of length `num_pos+num_neg`): Predicted similarity scores - - Returns: - ap_score (`float`): AP score - """ - # preds = np.array(is_relevant)[pred_scores_argsort] - # precision_at_k = np.mean(preds[:k]) - # ap = np.mean([np.mean(preds[: k + 1]) for k in range(len(preds)) if preds[k]]) - ap = average_precision_score(is_relevant, pred_scores) - return ap diff --git a/mteb/evaluation/evaluators/RetrievalEvaluator.py b/mteb/evaluation/evaluators/RetrievalEvaluator.py index a7531bbd56..6212ee7e1c 100644 --- a/mteb/evaluation/evaluators/RetrievalEvaluator.py +++ b/mteb/evaluation/evaluators/RetrievalEvaluator.py @@ -60,13 +60,18 @@ def __init__( def __call__( self, corpus: dict[str, dict[str, str]], - queries: dict[str, str | list[str]], + queries: dict[str, str], + instructions: dict[str, str], + qid: str | None = None, + **kwargs, ) -> dict[str, dict[str, float]]: if not self.retriever: raise ValueError("Model/Technique has not been provided!") if self.is_cross_encoder: - return self.retriever.search_cross_encoder(corpus, queries, self.top_k) + return self.retriever.search_cross_encoder( + corpus, queries, self.top_k, instructions=instructions, **kwargs + ) elif ( hasattr(self.retriever.model, "mteb_model_meta") and self.retriever.model.mteb_model_meta.name == "bm25s" @@ -77,6 +82,8 @@ def __call__( self.top_k, self.score_function, task_name=self.task_name, # type: ignore + instructions=instructions, + **kwargs, ) else: return self.retriever.search( @@ -84,7 +91,10 @@ def __call__( queries, self.top_k, self.score_function, - task_name=self.task_name, # type: ignore + instructions=instructions, + request_qid=qid, + task_name=self.task_name, + **kwargs, ) @staticmethod diff --git a/mteb/evaluation/evaluators/__init__.py b/mteb/evaluation/evaluators/__init__.py index a1dc8faaa5..fc293a3448 100644 --- a/mteb/evaluation/evaluators/__init__.py +++ b/mteb/evaluation/evaluators/__init__.py @@ -4,7 +4,6 @@ from .ClassificationEvaluator import * from .ClusteringEvaluator import * from .PairClassificationEvaluator import * -from .RerankingEvaluator import * from .RetrievalEvaluator import * from .STSEvaluator import * from .SummarizationEvaluator import * diff --git a/mteb/tasks/Reranking/multilingual/ESCIReranking.py b/mteb/tasks/Reranking/multilingual/ESCIReranking.py index c3597c2fdf..9eb21c1961 100644 --- a/mteb/tasks/Reranking/multilingual/ESCIReranking.py +++ b/mteb/tasks/Reranking/multilingual/ESCIReranking.py @@ -38,7 +38,7 @@ class ESCIReranking(MultilingualTask, 
AbsTaskReranking): modalities=["text"], eval_splits=[_EVAL_SPLIT], eval_langs=_LANGUAGES, - main_score="map", + main_score="map_at_1000", date=("2022-06-14", "2022-06-14"), domains=["Written"], task_subtypes=[], diff --git a/mteb/tasks/Reranking/multilingual/MIRACLReranking.py b/mteb/tasks/Reranking/multilingual/MIRACLReranking.py index 4d226842bf..d8915dc405 100644 --- a/mteb/tasks/Reranking/multilingual/MIRACLReranking.py +++ b/mteb/tasks/Reranking/multilingual/MIRACLReranking.py @@ -8,7 +8,7 @@ from mteb.abstasks.MultilingualTask import MultilingualTask from mteb.abstasks.TaskMetadata import TaskMetadata from mteb.encoder_interface import Encoder -from mteb.evaluation.evaluators import RerankingEvaluator +from mteb.evaluation.evaluators.RetrievalEvaluator import RetrievalEvaluator from mteb.load_results.task_results import ScoresDict from ....abstasks.AbsTaskReranking import AbsTaskReranking @@ -88,7 +88,8 @@ def _evaluate_subset( encode_kwargs: dict[str, Any] = {}, **kwargs: Any, ) -> ScoresDict: - evaluator = RerankingEvaluator( + # TODO: this file will need to be fixed + evaluator = RetrievalEvaluator( samples=data_split, evaluator_type="miracl", task_name=self.metadata.name, diff --git a/mteb/tasks/Reranking/zho/CMTEBReranking.py b/mteb/tasks/Reranking/zho/CMTEBReranking.py index 7aa26c4ce0..130f079d25 100644 --- a/mteb/tasks/Reranking/zho/CMTEBReranking.py +++ b/mteb/tasks/Reranking/zho/CMTEBReranking.py @@ -18,7 +18,7 @@ class T2Reranking(AbsTaskReranking): modalities=["text"], eval_splits=["dev"], eval_langs=["cmn-Hans"], - main_score="map", + main_score="map_at_1000", date=None, form=None, domains=None, @@ -53,7 +53,7 @@ class MMarcoReranking(AbsTaskReranking): modalities=["text"], eval_splits=["dev"], eval_langs=["cmn-Hans"], - main_score="map", + main_score="map_at_1000", date=None, form=None, domains=None, diff --git a/tests/test_evaluators/test_InstructionRetrievalEvaluator.py b/tests/test_evaluators/test_InstructionRetrievalEvaluator.py index 9fe1cb13c0..5c0b4eb854 100644 --- a/tests/test_evaluators/test_InstructionRetrievalEvaluator.py +++ b/tests/test_evaluators/test_InstructionRetrievalEvaluator.py @@ -1,18 +1,18 @@ from __future__ import annotations from mteb import SentenceTransformerWrapper -from mteb.evaluation.evaluators import InstructionRetrievalEvaluator, utils +from mteb.evaluation.evaluators import RetrievalEvaluator, utils from tests.test_benchmark.mock_models import MockNumpyEncoder -class TestInstructionRetrievalEvaluator: +class TestInstructionRetrievalEvaluation: def setup_method(self): """Setup any state tied to the execution of the given method in a class. setup_method is invoked for every test method of a class. """ # checks that it loads - self.evaluator = InstructionRetrievalEvaluator.InstructionRetrievalEvaluator( + self.evaluator = RetrievalEvaluator.RetrievalEvaluator( SentenceTransformerWrapper(MockNumpyEncoder()), task_name="test" ) diff --git a/tests/test_evaluators/test_RerankingEvaluator.py b/tests/test_evaluators/test_RerankingEvaluator.py deleted file mode 100644 index 19b21e5721..0000000000 --- a/tests/test_evaluators/test_RerankingEvaluator.py +++ /dev/null @@ -1,58 +0,0 @@ -from __future__ import annotations - -import pytest - -from mteb.evaluation.evaluators import RerankingEvaluator - -TOL = 0.0001 - - -class TestRerankingEvaluator: - def setup_method(self): - """Setup any state tied to the execution of the given method in a class. - - setup_method is invoked for every test method of a class. 
- """ - self.evaluator = RerankingEvaluator([]) - - def test_mrr_at_k(self): - is_relevant = [1, 1, 1, 0, 0, 0, 0, 0, 0] - pred_ranking = [5, 2, 6, 1, 3, 4, 7, 8, 9] - - assert self.evaluator.mrr_at_k_score( - is_relevant, pred_ranking, 10 - ) == pytest.approx(0.5, TOL) - assert self.evaluator.mrr_at_k_score( - is_relevant, pred_ranking, 3 - ) == pytest.approx(0.5, TOL) - assert self.evaluator.mrr_at_k_score( - is_relevant, pred_ranking, 1 - ) == pytest.approx(0, TOL) - - def test_map(self): - is_relevant = [1, 1, 1, 0, 0] - pred_scores = [0.75, 0.93, 0.85, 0.76, 0.75] - - assert self.evaluator.ap_score(is_relevant, pred_scores) == pytest.approx( - 0.86666, TOL - ) - - def test_nAUC(self): - is_relevant = [[1, 1, 0, 0, 0], [1, 0, 0], [1, 1, 1, 0], [1, 0], [1, 1, 0, 0]] - pred_scores = [ - [0.8, 0.3, 0.4, 0.6, 0.5], - [0.5, 0.8, 0.4], - [0.9, 0.3, 0.3, 0.1], - [0.1, 0.2], - [0.5, 0.4, 0.5, 0.2], - ] - - ap_scores = [ - self.evaluator.ap_score(y, x) for x, y in zip(pred_scores, is_relevant) - ] - conf_scores = [self.evaluator.conf_scores(x) for x in pred_scores] - nauc_scores_map = self.evaluator.nAUC_scores(conf_scores, ap_scores, "map") - - assert nauc_scores_map["nAUC_map_max"] == pytest.approx(0.8694, TOL) - assert nauc_scores_map["nAUC_map_std"] == pytest.approx(0.94065, TOL) - assert nauc_scores_map["nAUC_map_diff1"] == pytest.approx(0.85460, TOL) From cacd075098d569f47273b9294e32efd39388b5a3 Mon Sep 17 00:00:00 2001 From: Orion Weller Date: Tue, 29 Oct 2024 16:11:26 -0400 Subject: [PATCH 03/16] update tasks; working multilingual --- mteb/abstasks/AbsTaskReranking.py | 144 +++++++++++------- mteb/abstasks/AbsTaskRetrieval.py | 5 +- .../tasks/Reranking/eng/MindSmallReranking.py | 2 +- mteb/tasks/Reranking/eng/SciDocsReranking.py | 2 +- .../eng/StackOverflowDupQuestions.py | 2 +- .../eng/WebLINXCandidatesReranking.py | 2 +- mteb/tasks/Reranking/fra/AlloprofReranking.py | 2 +- mteb/tasks/Reranking/fra/SyntecReranking.py | 2 +- mteb/tasks/Reranking/jpn/MMarcoReranking.py | 2 +- .../Reranking/multilingual/MIRACLReranking.py | 2 +- .../WikipediaRerankingMultilingual.py | 2 +- mteb/tasks/Reranking/zho/CMTEBReranking.py | 2 +- 12 files changed, 103 insertions(+), 66 deletions(-) diff --git a/mteb/abstasks/AbsTaskReranking.py b/mteb/abstasks/AbsTaskReranking.py index 1cda0608da..5a5488d903 100644 --- a/mteb/abstasks/AbsTaskReranking.py +++ b/mteb/abstasks/AbsTaskReranking.py @@ -5,6 +5,7 @@ import datasets import tqdm +from datasets import Dataset from ..load_results.task_results import ScoresDict from .AbsTaskRetrieval import AbsTaskRetrieval @@ -64,80 +65,113 @@ class AbsTaskReranking(AbsTaskRetrieval): """ def __init__(self, **kwargs): - super().__init__(**kwargs) + super(AbsTaskRetrieval, self).__init__(**kwargs) def load_data(self, **kwargs): if self.data_loaded: return if self.metadata.name in OLD_FORMAT_RERANKING_TASKS: - self.original_dataset = datasets.load_dataset( - **self.metadata_dict["dataset"] - ) # type: ignore - self.transform_old_format_to_standard() + self.dataset = datasets.load_dataset(**self.metadata_dict["dataset"]) # type: ignore + self.dataset_transform() else: # use AbsTaskRetrieval default to load the data # TODO: need to make sure top_ranked comes back return super().load_data(**kwargs) - def transform_old_format_to_standard(self): - """Transform the old format to the new format (see class doc string for details). 
Dataset has three features: query, positive, negative.""" + def process_example(self, example: dict, split: str, query_idx: int) -> dict: + """Process a single example from the dataset.""" + query = example["query"] + positive_docs = example["positive"] + negative_docs = example["negative"] + + query_id = f"{split}_query{query_idx}" + + # Initialize the structures for this example + example_data = { + "query_id": query_id, + "query": query, + "doc_ids": [], + "doc_texts": [], + "relevance_scores": [], + } + + for i, pos_doc in enumerate(sorted(positive_docs)): + doc_id = f"{query_id}_positive_{i}" + example_data["doc_ids"].append(doc_id) + example_data["doc_texts"].append(pos_doc) + example_data["relevance_scores"].append(1) + + for i, neg_doc in enumerate(sorted(negative_docs)): + doc_id = f"{query_id}_negative_{i}" + example_data["doc_ids"].append(doc_id) + example_data["doc_texts"].append(neg_doc) + example_data["relevance_scores"].append(0) + + return example_data + + def dataset_transform(self): + """Transform the old format to the new format using HF datasets mapping.""" + if self.metadata.name not in OLD_FORMAT_RERANKING_TASKS: + return + logging.info( f"Transforming old format to standard format for {self.metadata.name}" ) - self.corpus = defaultdict(dict) - self.queries = defaultdict(dict) - self.relevant_docs = defaultdict(lambda: defaultdict(dict)) - self.top_ranked = defaultdict(lambda: defaultdict(list)) - - for split in self.original_dataset: - # keep the lookups to prevent duplicate queries and documents for memory purposes - corpus_lookup = {} - query_lookup = {} - for query_i in tqdm.tqdm(range(len(self.original_dataset[split]))): - query: str = self.original_dataset[split]["query"][query_i] - positive_docs: list[str] = self.original_dataset[split]["positive"][ - query_i - ] - negative_docs: list[str] = self.original_dataset[split]["negative"][ - query_i - ] - - if query in query_lookup: - query_id = query_lookup[query] - else: - query_id = f"{split}_query{query_i}" - query_lookup[query] = query_id - self.queries[split][query_id] = query - - for i, pos_doc in enumerate(sorted(positive_docs)): - if pos_doc in corpus_lookup: - doc_id = corpus_lookup[pos_doc] - else: - doc_id = f"{query_id}_positive_{i}" - self.corpus[split][doc_id] = {"text": pos_doc, "_id": doc_id} - corpus_lookup[pos_doc] = doc_id - - self.top_ranked[split][query_id].append(doc_id) - self.relevant_docs[split][query_id][doc_id] = 1 - - for i, neg_doc in enumerate(sorted(negative_docs)): - if neg_doc in corpus_lookup: - doc_id = corpus_lookup[neg_doc] - else: - doc_id = f"{query_id}_negative_{i}" - self.corpus[split][doc_id] = {"text": neg_doc, "_id": doc_id} - corpus_lookup[neg_doc] = doc_id - - self.top_ranked[split][query_id].append(doc_id) - self.relevant_docs[split][query_id][doc_id] = 0 - - self.instructions = None # previous tasks don't have instructions + + self.corpus = defaultdict(lambda: defaultdict(dict)) + self.queries = defaultdict(lambda: defaultdict(dict)) + self.relevant_docs = defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) + self.top_ranked = defaultdict(lambda: defaultdict(lambda: defaultdict(list))) + + hf_subsets = list(self.hf_subsets) if self.is_multilingual else ["default"] + + for hf_subset in hf_subsets: + cur_dataset = self.dataset[hf_subset] + + for split in cur_dataset: + # Create an enumerated dataset to pass indices + enumerated_dataset = Dataset.from_dict( + { + "index": range(len(cur_dataset[split])), + "query": cur_dataset[split]["query"], + "positive": 
cur_dataset[split]["positive"], + "negative": cur_dataset[split]["negative"], + } + ) + + # Map the transformation function over the dataset + processed_dataset = enumerated_dataset.map( + lambda example, idx: self.process_example(example, split, idx), + with_indices=True, + remove_columns=enumerated_dataset.column_names, + ) + + # Populate the data structures + for item in processed_dataset: + query_id = item["query_id"] + self.queries[hf_subset][split][query_id] = item["query"] + + # Add documents and relevance information + for doc_id, doc_text, relevance in zip( + item["doc_ids"], item["doc_texts"], item["relevance_scores"] + ): + self.corpus[hf_subset][split][doc_id] = { + "text": doc_text, + "_id": doc_id, + } + self.top_ranked[hf_subset][split][query_id].append(doc_id) + self.relevant_docs[hf_subset][split][query_id][doc_id] = ( + relevance + ) + + self.instructions = None self.data_loaded = True def _evaluate_subset( self, retriever, corpus, queries, relevant_docs, hf_subset: str, **kwargs ) -> ScoresDict: + """Evaluate each query_id as a "mini" retrieval corpus, and rerank the top-ranked documents for each query_id.""" all_results = defaultdict(dict) max_docs = 0 top_ranked = kwargs["top_ranked"] # must be present for reranking diff --git a/mteb/abstasks/AbsTaskRetrieval.py b/mteb/abstasks/AbsTaskRetrieval.py index b978bf5b07..68e651e954 100644 --- a/mteb/abstasks/AbsTaskRetrieval.py +++ b/mteb/abstasks/AbsTaskRetrieval.py @@ -232,7 +232,10 @@ class AbsTaskRetrieval(AbsTask): ignore_identical_ids: bool = False def __init__(self, **kwargs): - super().__init__(**kwargs) + try: + super(AbsTask, self).__init__(**kwargs) + except Exception: + super().__init__(**kwargs) def load_data(self, **kwargs): if self.data_loaded: diff --git a/mteb/tasks/Reranking/eng/MindSmallReranking.py b/mteb/tasks/Reranking/eng/MindSmallReranking.py index a5ef13a603..3c25f5f25e 100644 --- a/mteb/tasks/Reranking/eng/MindSmallReranking.py +++ b/mteb/tasks/Reranking/eng/MindSmallReranking.py @@ -19,7 +19,7 @@ class MindSmallReranking(AbsTaskReranking): modalities=["text"], eval_splits=["test"], eval_langs=["eng-Latn"], - main_score="map", + main_score="map_at_1000", date=("2019-10-12", "2019-11-22"), domains=["News", "Written"], task_subtypes=[], diff --git a/mteb/tasks/Reranking/eng/SciDocsReranking.py b/mteb/tasks/Reranking/eng/SciDocsReranking.py index f85ffaed8d..be1153de53 100644 --- a/mteb/tasks/Reranking/eng/SciDocsReranking.py +++ b/mteb/tasks/Reranking/eng/SciDocsReranking.py @@ -19,7 +19,7 @@ class SciDocsReranking(AbsTaskReranking): modalities=["text"], eval_splits=["test"], eval_langs=["eng-Latn"], - main_score="map", + main_score="map_at_1000", date=("2000-01-01", "2020-12-31"), # best guess domains=["Academic", "Non-fiction", "Written"], task_subtypes=["Scientific Reranking"], diff --git a/mteb/tasks/Reranking/eng/StackOverflowDupQuestions.py b/mteb/tasks/Reranking/eng/StackOverflowDupQuestions.py index f18f24e047..9b53b75f8c 100644 --- a/mteb/tasks/Reranking/eng/StackOverflowDupQuestions.py +++ b/mteb/tasks/Reranking/eng/StackOverflowDupQuestions.py @@ -19,7 +19,7 @@ class StackOverflowDupQuestions(AbsTaskReranking): modalities=["text"], eval_splits=["test"], eval_langs=["eng-Latn"], - main_score="map", + main_score="map_at_1000", date=None, domains=None, task_subtypes=None, diff --git a/mteb/tasks/Reranking/eng/WebLINXCandidatesReranking.py b/mteb/tasks/Reranking/eng/WebLINXCandidatesReranking.py index 4790a9460f..adbacffd34 100644 --- a/mteb/tasks/Reranking/eng/WebLINXCandidatesReranking.py +++ 
b/mteb/tasks/Reranking/eng/WebLINXCandidatesReranking.py @@ -29,7 +29,7 @@ class WebLINXCandidatesReranking(AbsTaskReranking): "test_web", ], eval_langs=["eng-Latn"], - main_score="mrr", + main_score="mrr_at_1000", date=("2023-03-01", "2023-10-30"), domains=["Academic", "Web", "Written"], task_subtypes=["Code retrieval", "Conversational retrieval"], diff --git a/mteb/tasks/Reranking/fra/AlloprofReranking.py b/mteb/tasks/Reranking/fra/AlloprofReranking.py index 3e5509c936..3ebce4f87b 100644 --- a/mteb/tasks/Reranking/fra/AlloprofReranking.py +++ b/mteb/tasks/Reranking/fra/AlloprofReranking.py @@ -21,7 +21,7 @@ class AlloprofReranking(AbsTaskReranking): modalities=["text"], eval_splits=["test"], eval_langs=["fra-Latn"], - main_score="map", + main_score="map_at_1000", date=("2020-01-01", "2023-04-14"), # supposition domains=["Web", "Academic", "Written"], task_subtypes=None, diff --git a/mteb/tasks/Reranking/fra/SyntecReranking.py b/mteb/tasks/Reranking/fra/SyntecReranking.py index 3b12625e69..9fddfeccdd 100644 --- a/mteb/tasks/Reranking/fra/SyntecReranking.py +++ b/mteb/tasks/Reranking/fra/SyntecReranking.py @@ -21,7 +21,7 @@ class SyntecReranking(AbsTaskReranking): modalities=["text"], eval_splits=["test"], eval_langs=["fra-Latn"], - main_score="map", + main_score="map_at_1000", date=("2022-12-01", "2022-12-02"), domains=["Legal", "Written"], task_subtypes=None, diff --git a/mteb/tasks/Reranking/jpn/MMarcoReranking.py b/mteb/tasks/Reranking/jpn/MMarcoReranking.py index bcfa5bba05..16da6b3063 100644 --- a/mteb/tasks/Reranking/jpn/MMarcoReranking.py +++ b/mteb/tasks/Reranking/jpn/MMarcoReranking.py @@ -18,7 +18,7 @@ class VoyageMMarcoReranking(AbsTaskReranking): modalities=["text"], eval_splits=["test"], eval_langs=["jpn-Jpan"], - main_score="map", + main_score="map_at_1000", date=("2016-12-01", "2023-12-23"), domains=["Academic", "Non-fiction", "Written"], task_subtypes=["Scientific Reranking"], diff --git a/mteb/tasks/Reranking/multilingual/MIRACLReranking.py b/mteb/tasks/Reranking/multilingual/MIRACLReranking.py index d8915dc405..f59530dc5c 100644 --- a/mteb/tasks/Reranking/multilingual/MIRACLReranking.py +++ b/mteb/tasks/Reranking/multilingual/MIRACLReranking.py @@ -65,7 +65,7 @@ class MIRACLReranking(MultilingualTask, AbsTaskReranking): modalities=["text"], eval_splits=[_EVAL_SPLIT], eval_langs=_LANGUAGES, - main_score="NDCG@10(MIRACL)", + main_score="ndcg_cut_10", date=("2022-06-01", "2023-01-30"), domains=["Encyclopaedic", "Written"], task_subtypes=[], diff --git a/mteb/tasks/Reranking/multilingual/WikipediaRerankingMultilingual.py b/mteb/tasks/Reranking/multilingual/WikipediaRerankingMultilingual.py index 4d718f6f20..59332cdb2f 100644 --- a/mteb/tasks/Reranking/multilingual/WikipediaRerankingMultilingual.py +++ b/mteb/tasks/Reranking/multilingual/WikipediaRerankingMultilingual.py @@ -39,7 +39,7 @@ class WikipediaRerankingMultilingual(MultilingualTask, AbsTaskReranking): modalities=["text"], eval_splits=["test"], eval_langs=_EVAL_LANGS, - main_score="map", + main_score="map_at_1000", date=("2023-11-01", "2024-05-15"), domains=["Encyclopaedic", "Written"], task_subtypes=[], diff --git a/mteb/tasks/Reranking/zho/CMTEBReranking.py b/mteb/tasks/Reranking/zho/CMTEBReranking.py index 130f079d25..42ef5a98c8 100644 --- a/mteb/tasks/Reranking/zho/CMTEBReranking.py +++ b/mteb/tasks/Reranking/zho/CMTEBReranking.py @@ -127,7 +127,7 @@ class CMedQAv2(AbsTaskReranking): modalities=["text"], eval_splits=["test"], eval_langs=["cmn-Hans"], - main_score="map", + main_score="map_at_1000", date=None, 
form=None, domains=None, From 4b8d32ff6a34c49027d175680c348af0bd0f9eaa Mon Sep 17 00:00:00 2001 From: Orion Weller Date: Mon, 4 Nov 2024 20:05:43 -0500 Subject: [PATCH 04/16] everything working except instructions --- README.md | 1 + docs/benchmarks.md | 2 +- docs/tasks.md | 12 +- mteb/abstasks/AbsTaskInstructionRetrieval.py | 730 ------------------ mteb/abstasks/AbsTaskReranking.py | 46 +- mteb/abstasks/AbsTaskRetrieval.py | 273 +++---- mteb/abstasks/MultilingualTask.py | 2 +- mteb/abstasks/TaskMetadata.py | 1 + mteb/abstasks/__init__.py | 1 - mteb/abstasks/dataloaders.py | 318 ++++++++ .../evaluators/RetrievalEvaluator.py | 9 +- mteb/evaluation/evaluators/utils.py | 92 +++ mteb/models/instructions.py | 1 + mteb/tasks/InstructionReranking/__init__.py | 6 + .../eng/Core17InstructionRetrieval.py | 4 +- .../eng/News21InstructionRetrieval.py | 6 +- .../eng/Robust04InstructionRetrieval.py | 6 +- .../eng}/__init__.py | 0 .../multilingual/__init__.py | 0 .../multilingual/mFollowIR.py | 10 +- mteb/tasks/InstructionRetrieval/__init__.py | 5 +- .../InstructionRetrieval/eng/InstructIR.py | 49 ++ mteb/tasks/Reranking/__init__.py | 1 + mteb/tasks/Reranking/eng/NevIR.py | 48 ++ .../eng/WebLINXCandidatesReranking.py | 23 +- mteb/tasks/Reranking/fra/AlloprofReranking.py | 3 + mteb/tasks/Reranking/fra/SyntecReranking.py | 3 + mteb/tasks/Reranking/jpn/MMarcoReranking.py | 15 +- .../Reranking/multilingual/ESCIReranking.py | 2 +- .../Reranking/multilingual/MIRACLReranking.py | 31 +- .../WikipediaRerankingMultilingual.py | 2 +- mteb/tasks/Reranking/zho/CMTEBReranking.py | 2 +- mteb/tasks/__init__.py | 1 + scripts/running_model/check_results.py | 1 + scripts/running_model/create_slurm_jobs.py | 1 + tests/test_benchmark/mock_tasks.py | 199 +++-- .../test_InstructionRetrievalEvaluator.py | 4 +- tests/test_reproducible_workflow.py | 1 + tests/test_tasks/test_all_abstasks.py | 4 +- 39 files changed, 790 insertions(+), 1125 deletions(-) delete mode 100644 mteb/abstasks/AbsTaskInstructionRetrieval.py create mode 100644 mteb/abstasks/dataloaders.py create mode 100644 mteb/tasks/InstructionReranking/__init__.py rename mteb/tasks/{InstructionRetrieval => InstructionReranking}/eng/Core17InstructionRetrieval.py (92%) rename mteb/tasks/{InstructionRetrieval => InstructionReranking}/eng/News21InstructionRetrieval.py (88%) rename mteb/tasks/{InstructionRetrieval => InstructionReranking}/eng/Robust04InstructionRetrieval.py (88%) rename mteb/tasks/{InstructionRetrieval/multilingual => InstructionReranking/eng}/__init__.py (100%) create mode 100644 mteb/tasks/InstructionReranking/multilingual/__init__.py rename mteb/tasks/{InstructionRetrieval => InstructionReranking}/multilingual/mFollowIR.py (97%) create mode 100644 mteb/tasks/InstructionRetrieval/eng/InstructIR.py create mode 100644 mteb/tasks/Reranking/eng/NevIR.py diff --git a/README.md b/README.md index 0e4f8d6e56..6dc7343919 100644 --- a/README.md +++ b/README.md @@ -81,6 +81,7 @@ In prompts the key can be: 8. `STS` 9. `Summarization` 10. `InstructionRetrieval` + 11. `InstructionReranking` 3. Pair of task type and prompt type like `Retrival-query` - these prompts will be used in all classification tasks 4. Task name - these prompts will be used in the specific task 5. 
Pair of task name and prompt type like `NFCorpus-query` - these prompts will be used in the specific task diff --git a/docs/benchmarks.md b/docs/benchmarks.md index a5abe50215..6450c2ddca 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -9,7 +9,7 @@ The following table gives you an overview of the benchmarks in MTEB. |------|---------|------------|---------|-----------| | [CoIR](https://github.com/CoIR-team/coir) | 10 | {'Retrieval': 10} | [Written, Programming] | python,c++,sql,go,eng,php,javascript,ruby,java | | [MINERSBitextMining](https://arxiv.org/pdf/2406.07424) | 7 | {'BitextMining': 7} | [Written, Social, Reviews] | sun,kaz,tzl,ido,abs,arq,yue,tam,nij,glg,slk,hsb,ber,xho,cbk,pol,uzb,ina,kab,swh,amh,fao,kzj,lfn,uig,sqi,deu,ang,ind,bug,pms,ibo,cym,eus,spa,ceb,tgl,ron,isl,ita,csb,cha,fin,est,pes,jpn,tel,tha,oci,cmn,min,fry,bbc,epo,lit,rus,bos,hrv,war,ara,bjn,mkd,srp,ast,nno,urd,pam,aze,eng,ace,bew,kor,dan,awa,mui,hye,ban,cor,ben,gle,swe,mad,bul,lat,cat,nob,fra,pcm,ell,mar,vie,tat,ukr,gsw,kat,arz,dsb,lvs,nld,tur,bel,max,nds,afr,khm,dtp,yor,ces,gla,zsm,mak,ile,nov,orv,bre,swg,rej,mhr,mon,mal,jav,heb,slv,bhp,kur,wuu,tuk,por,hun,hin,hau,yid | -| [MTEB(Retrieval w/Instructions)](https://arxiv.org/abs/2403.15246) | 3 | {'InstructionRetrieval': 3} | [Written, News] | eng | +| [MTEB(Retrieval w/Instructions)](https://arxiv.org/abs/2403.15246) | 3 | {'InstructionReranking': 3} | [Written, News] | eng | | [MTEB(Scandinavian)](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/) | 28 | {'BitextMining': 2, 'Classification': 13, 'Retrieval': 7, 'Clustering': 6} | [Encyclopaedic, Spoken, Non-fiction, Government, News, Fiction, Social, Blog, Reviews, Written, Web, Legal] | nob,fao,swe,isl,dan,nno | | MTEB(code) | 12 | {'Retrieval': 12} | [Written, Programming] | python,c++,sql,c,go,eng,shell,typescript,php,scala,rust,swift,javascript,ruby,java | | [MTEB(deu)](https://arxiv.org/html/2401.02709v1) | 19 | {'Classification': 6, 'Clustering': 4, 'PairClassification': 2, 'Reranking': 1, 'Retrieval': 4, 'STS': 2} | [Encyclopaedic, Spoken, News, Reviews, Written, Web] | eng,deu,pol,fra | diff --git a/docs/tasks.md b/docs/tasks.md index d90ac1816b..18def6df57 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -145,7 +145,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [ContractNLISharingWithEmployeesLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 170} | {'test': 548.63} | | [ContractNLISharingWithThirdPartiesLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 180} | {'test': 517.29} | | [ContractNLISurvivalOfObligationsLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 157} | {'test': 417.64} | -| [Core17InstructionRetrieval](https://arxiv.org/abs/2403.15246) (Orion Weller, 2024) | ['eng'] | InstructionRetrieval | s2p | [News, Written] | {'eng': 39838} | {'test': {'num_docs': 19899, 'num_queries': 20, 'average_document_length': 2233.0329664807277, 'average_query_length': 109.75, 'average_instruction_length': 295.55, 'average_changed_instruction_length': 355.2, 'average_relevant_docs_per_query': 32.7, 'average_top_ranked_per_query': 1000.0}} | +| [Core17InstructionRetrieval](https://arxiv.org/abs/2403.15246) (Orion Weller, 2024) | ['eng'] | InstructionReranking | s2p | [News, Written] | {'eng': 39838} | {'test': {'num_docs': 19899, 'num_queries': 20, 'average_document_length': 2233.0329664807277, 'average_query_length': 109.75, 'average_instruction_length': 295.55, 'average_changed_instruction_length': 355.2, 'average_relevant_docs_per_query': 32.7, 'average_top_ranked_per_query': 1000.0}} | | [CorporateLobbyingLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 490} | {'test': 6039.85} | | [CosQA](https://arxiv.org/abs/2105.13239) (Junjie Huang, 2021) | ['eng', 'python'] | Retrieval | p2p | [Programming, Written] | | {'test': {'average_document_length': 276.132741215298, 'average_query_length': 36.814, 'num_documents': 20604, 'num_queries': 500, 'average_relevant_docs_per_query': 1.0}} | | [CovidRetrieval](https://arxiv.org/abs/2203.03367) | ['cmn'] | Retrieval | s2p | | None | {'dev': {'average_document_length': 332.4152658473415, 'average_query_length': 25.9304531085353, 'num_documents': 100001, 'num_queries': 949, 'average_relevant_docs_per_query': 1.0105374077976819}} | @@ -364,7 +364,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [NeuCLIR2022RetrievalHardNegatives](https://neuclir.github.io/) (Lawrie et al., 2023) | ['fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | None | {'test': {'average_document_length': 2066.9453653646488, 'average_query_length': 63.529411764705884, 'num_documents': 27931, 'num_queries': 136, 'average_relevant_docs_per_query': 40.39705882352941, 'hf_subset_descriptive_stats': {'fas': {'average_document_length': 2816.847782031074, 'average_query_length': 83.26666666666667, 'num_documents': 8882, 'num_queries': 45, 'average_relevant_docs_per_query': 32.71111111111111}, 'rus': {'average_document_length': 2446.5574277854193, 'average_query_length': 85.56818181818181, 'num_documents': 8724, 'num_queries': 44, 'average_relevant_docs_per_query': 42.93181818181818}, 'zho': {'average_document_length': 1101.0984987893462, 'average_query_length': 24.0, 'num_documents': 10325, 'num_queries': 47, 'average_relevant_docs_per_query': 45.38297872340426}}}} | | [NeuCLIR2023Retrieval](https://neuclir.github.io/) (Dawn Lawrie, 2024) | ['fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | {'fas': 2232092, 'zho': 3179285, 'rus': 4627619} | {'test': {'fas': {'average_document_length': 2032.093148525817, 'average_query_length': 65.48684210526316, 'num_documents': 2232016, 'num_queries': 76, 'average_relevant_docs_per_query': 66.28947368421052}, 'rus': {'average_document_length': 1757.9129983233004, 'average_query_length': 74.4342105263158, 'num_documents': 4627543, 'num_queries': 76, 'average_relevant_docs_per_query': 62.223684210526315}, 'zho': {'average_document_length': 743.1426659901881, 'average_query_length': 22.210526315789473, 'num_documents': 3179209, 'num_queries': 76, 'average_relevant_docs_per_query': 53.68421052631579}}} | | [NeuCLIR2023RetrievalHardNegatives](https://neuclir.github.io/) (Dawn Lawrie, 2024) | ['fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | None | {'test': {'average_document_length': 2236.175955333482, 'average_query_length': 54.10267857142857, 'num_documents': 49433, 'num_queries': 224, 'average_relevant_docs_per_query': 61.816964285714285, 'hf_subset_descriptive_stats': {'fas': {'average_document_length': 2895.869857421016, 'average_query_length': 65.89189189189189, 'num_documents': 15921, 'num_queries': 74, 'average_relevant_docs_per_query': 68.08108108108108}, 'rus': {'average_document_length': 2724.294762109928, 'average_query_length': 74.41333333333333, 'num_documents': 16247, 'num_queries': 75, 'average_relevant_docs_per_query': 63.053333333333335}, 'zho': {'average_document_length': 1168.4984071821605, 'average_query_length': 22.16, 'num_documents': 17265, 'num_queries': 75, 'average_relevant_docs_per_query': 54.4}}}} | -| [News21InstructionRetrieval](https://arxiv.org/abs/2403.15246) (Orion Weller, 2024) | ['eng'] | InstructionRetrieval | s2p | [News, Written] | {'eng': 61906} | {'eng': 2983.724665391969} | +| [News21InstructionRetrieval](https://arxiv.org/abs/2403.15246) (Orion Weller, 2024) | ['eng'] | InstructionReranking | s2p | [News, Written] | {'eng': 61906} | {'eng': 2983.724665391969} | | [NewsClassification](https://arxiv.org/abs/1509.01626) (Zhang et al., 2015) | ['eng'] | Classification | s2s | [News, Written] | {'test': 7600} | {'test': 235.29} | | [NoRecClassification](https://aclanthology.org/L18-1661/) | ['nob'] | Classification | s2s | [Written, Reviews] | {'test': 2050} | {'test': 82} | | [NollySentiBitextMining](https://github.com/IyanuSh/NollySenti) (Shode et al., 2023) | ['eng', 'hau', 'ibo', 'pcm', 'yor'] | BitextMining | s2s | 
[Social, Reviews, Written] | {'train': 1640} | {'train': 135.91} | @@ -426,7 +426,7 @@ The following tables give you an overview of the tasks in MTEB. | [RestaurantReviewSentimentClassification](https://link.springer.com/chapter/10.1007/978-3-319-18117-2_2) (ElSahar et al., 2015) | ['ara'] | Classification | s2s | [Reviews, Written] | {'train': 2048} | {'train': 231.4} | | [RiaNewsRetrieval](https://arxiv.org/abs/1901.07786) (Gavrilov et al., 2019) | ['rus'] | Retrieval | s2p | [News, Written] | {'test': 10000} | {'test': {'average_document_length': 1165.6429557148213, 'average_query_length': 62.4029, 'num_documents': 704344, 'num_queries': 10000, 'average_relevant_docs_per_query': 1.0}} | | [RiaNewsRetrievalHardNegatives](https://arxiv.org/abs/1901.07786) (Gavrilov et al., 2019) | ['rus'] | Retrieval | s2p | [News, Written] | {'test': 1000} | {'test': {'average_document_length': 1225.7253146619116, 'average_query_length': 62.338, 'num_documents': 191237, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}} | -| [Robust04InstructionRetrieval](https://arxiv.org/abs/2403.15246) (Orion Weller, 2024) | ['eng'] | InstructionRetrieval | s2p | [News, Written] | {'eng': 95088} | {'eng': 2471.0398058252426} | +| [Robust04InstructionRetrieval](https://arxiv.org/abs/2403.15246) (Orion Weller, 2024) | ['eng'] | InstructionReranking | s2p | [News, Written] | {'eng': 95088} | {'eng': 2471.0398058252426} | | [RomaTalesBitextMining](https://idoc.pub/documents/idocpub-zpnxm9g35ylv) | ['hun', 'rom'] | BitextMining | s2s | [Fiction, Written] | {'test': 215} | {'test': 316.8046511627907} | | [RomaniBibleClustering](https://romani.global.bible/info) | ['rom'] | Clustering | p2p | [Religious, Written] | {'test': 2048} | {'test': 132.2} | | [RomanianReviewsSentiment](https://arxiv.org/abs/2101.04197) (Anca Maria Tache, 2021) | ['ron'] | Classification | s2s | [Reviews, Written] | {'test': 2048} | {'test': 588.6} | @@ -593,8 +593,8 @@ The following tables give you an overview of the tasks in MTEB. 
| [YelpReviewFullClassification](https://arxiv.org/abs/1509.01626) (Zhang et al., 2015) | ['eng'] | Classification | s2s | [Reviews, Written] | {'test': 50000} | {} | | [YueOpenriceReviewClassification](https://github.com/Christainx/Dataset_Cantonese_Openrice) (Xiang et al., 2019) | ['yue'] | Classification | s2s | [Reviews, Spoken] | {'test': 6161} | {'test': 173.0} | | [indonli](https://link.springer.com/chapter/10.1007/978-3-030-41505-1_39) | ['ind'] | PairClassification | s2s | [Encyclopaedic, Web, News, Written] | {'test_expert': 2040} | {'test_expert': 145.88} | -| [mFollowIRCrossLingualInstructionRetrieval](https://neuclir.github.io/) (Weller et al., 2024) | ['eng', 'fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | {'eng-fas': 80, 'eng-rus': 80, 'eng-zho': 86} | {'test': {'num_docs': 121635, 'num_queries': 123, 'average_document_length': 2331.0777818884367, 'average_query_length': 81.8780487804878, 'average_instruction_length': 389.9512195121951, 'average_changed_instruction_length': 450.5528455284553, 'average_relevant_docs_per_query': 10.30952380952381, 'average_top_ranked_per_query': 1024.3902439024391, 'hf_subset_descriptive_stats': {'eng-fas': {'num_docs': 41189, 'num_queries': 40, 'average_document_length': 3145.4990895627475, 'average_query_length': 80.075, 'average_instruction_length': 396.875, 'average_changed_instruction_length': 463.175, 'average_relevant_docs_per_query': 10.465116279069768, 'average_top_ranked_per_query': 1075}, 'eng-rus': {'num_docs': 39326, 'num_queries': 40, 'average_document_length': 2784.0813456746173, 'average_query_length': 81.875, 'average_instruction_length': 371.125, 'average_changed_instruction_length': 431.8, 'average_relevant_docs_per_query': 9.775, 'average_top_ranked_per_query': 1000}, 'eng-zho': {'num_docs': 41120, 'num_queries': 43, 'average_document_length': 1082.0501215953307, 'average_query_length': 83.55813953488372, 'average_instruction_length': 401.0232558139535, 'average_changed_instruction_length': 456.25581395348837, 'average_relevant_docs_per_query': 10.651162790697674, 'average_top_ranked_per_query': 1000}}}} | -| [mFollowIRInstructionRetrieval](https://neuclir.github.io/) (Weller et al., 2024) | ['fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | {'fas': 80, 'rus': 80, 'zho': 86} | {'test': {'num_docs': 121635, 'num_queries': 123, 'average_document_length': 2331.0777818884367, 'average_query_length': 57.113821138211385, 'average_instruction_length': 281.0650406504065, 'average_changed_instruction_length': 326.9430894308943, 'average_relevant_docs_per_query': 10.30952380952381, 'average_top_ranked_per_query': 1024.3902439024391, 'hf_subset_descriptive_stats': {'fas': {'num_docs': 41189, 'num_queries': 40, 'average_document_length': 3145.4990895627475, 'average_query_length': 72.65, 'average_instruction_length': 358.925, 'average_changed_instruction_length': 415.325, 'average_relevant_docs_per_query': 10.465116279069768, 'average_top_ranked_per_query': 1075}, 'rus': {'num_docs': 39326, 'num_queries': 40, 'average_document_length': 2784.0813456746173, 'average_query_length': 77.5, 'average_instruction_length': 387, 'average_changed_instruction_length': 458, 'average_relevant_docs_per_query': 9.775, 'average_top_ranked_per_query': 1000}, 'zho': {'num_docs': 41120, 'num_queries': 43, 'average_document_length': 1082.0501215953307, 'average_query_length': 23.697674418604652, 'average_instruction_length': 110.09302325581395, 'average_changed_instruction_length': 122.81395348837209, 'average_relevant_docs_per_query': 
10.651162790697674, 'average_top_ranked_per_query': 1000}}}} | +| [mFollowIRCrossLingualInstructionReranking](https://neuclir.github.io/) (Weller et al., 2024) | ['eng', 'fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | {'eng-fas': 80, 'eng-rus': 80, 'eng-zho': 86} | {'test': {'num_docs': 121635, 'num_queries': 123, 'average_document_length': 2331.0777818884367, 'average_query_length': 81.8780487804878, 'average_instruction_length': 389.9512195121951, 'average_changed_instruction_length': 450.5528455284553, 'average_relevant_docs_per_query': 10.30952380952381, 'average_top_ranked_per_query': 1024.3902439024391, 'hf_subset_descriptive_stats': {'eng-fas': {'num_docs': 41189, 'num_queries': 40, 'average_document_length': 3145.4990895627475, 'average_query_length': 80.075, 'average_instruction_length': 396.875, 'average_changed_instruction_length': 463.175, 'average_relevant_docs_per_query': 10.465116279069768, 'average_top_ranked_per_query': 1075}, 'eng-rus': {'num_docs': 39326, 'num_queries': 40, 'average_document_length': 2784.0813456746173, 'average_query_length': 81.875, 'average_instruction_length': 371.125, 'average_changed_instruction_length': 431.8, 'average_relevant_docs_per_query': 9.775, 'average_top_ranked_per_query': 1000}, 'eng-zho': {'num_docs': 41120, 'num_queries': 43, 'average_document_length': 1082.0501215953307, 'average_query_length': 83.55813953488372, 'average_instruction_length': 401.0232558139535, 'average_changed_instruction_length': 456.25581395348837, 'average_relevant_docs_per_query': 10.651162790697674, 'average_top_ranked_per_query': 1000}}}} | +| [mFollowIRInstructionReranking](https://neuclir.github.io/) (Weller et al., 2024) | ['fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | {'fas': 80, 'rus': 80, 'zho': 86} | {'test': {'num_docs': 121635, 'num_queries': 123, 'average_document_length': 2331.0777818884367, 'average_query_length': 57.113821138211385, 'average_instruction_length': 281.0650406504065, 'average_changed_instruction_length': 326.9430894308943, 'average_relevant_docs_per_query': 10.30952380952381, 'average_top_ranked_per_query': 1024.3902439024391, 'hf_subset_descriptive_stats': {'fas': {'num_docs': 41189, 'num_queries': 40, 'average_document_length': 3145.4990895627475, 'average_query_length': 72.65, 'average_instruction_length': 358.925, 'average_changed_instruction_length': 415.325, 'average_relevant_docs_per_query': 10.465116279069768, 'average_top_ranked_per_query': 1075}, 'rus': {'num_docs': 39326, 'num_queries': 40, 'average_document_length': 2784.0813456746173, 'average_query_length': 77.5, 'average_instruction_length': 387, 'average_changed_instruction_length': 458, 'average_relevant_docs_per_query': 9.775, 'average_top_ranked_per_query': 1000}, 'zho': {'num_docs': 41120, 'num_queries': 43, 'average_document_length': 1082.0501215953307, 'average_query_length': 23.697674418604652, 'average_instruction_length': 110.09302325581395, 'average_changed_instruction_length': 122.81395348837209, 'average_relevant_docs_per_query': 10.651162790697674, 'average_top_ranked_per_query': 1000}}}} | @@ -606,7 +606,7 @@ The following tables give you an overview of the tasks in MTEB.
-| Language | BitextMining | Classification | Clustering | InstructionRetrieval | MultilabelClassification | PairClassification | Reranking | Retrieval | STS | Speed | Summarization | +| Language | BitextMining | Classification | Clustering | InstructionReranking | MultilabelClassification | PairClassification | Reranking | Retrieval | STS | Speed | Summarization | |---|------|------|------|------|------|------|------|------|------|------|---| | aai | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | | aak | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | diff --git a/mteb/abstasks/AbsTaskInstructionRetrieval.py b/mteb/abstasks/AbsTaskInstructionRetrieval.py deleted file mode 100644 index 56170a4946..0000000000 --- a/mteb/abstasks/AbsTaskInstructionRetrieval.py +++ /dev/null @@ -1,730 +0,0 @@ -from __future__ import annotations - -import json -import logging -import os -from collections import defaultdict -from time import time -from typing import Any - -import tqdm -from datasets import Dataset, Features, Value, load_dataset - -from mteb.encoder_interface import Encoder - -from ..evaluation.evaluators import utils -from ..evaluation.evaluators.RetrievalEvaluator import RetrievalEvaluator -from .AbsTask import AbsTask, DescriptiveStatistics -from .AbsTaskRetrieval import HFDataLoader - -logger = logging.getLogger(__name__) - - -# Adapted from https://github.com/beir-cellar/beir/blob/f062f038c4bfd19a8ca942a9910b1e0d218759d4/beir/datasets/data_loader_hf.py#L10 -class HFDataLoaderInstructions(HFDataLoader): - def __init__( - self, - hf_repo: str | None = None, - hf_repo_qrels: str | None = None, - data_folder: str | None = None, - prefix: str | None = None, - corpus_file: str = "corpus.jsonl", - query_file: str = "queries.jsonl", - qrels_folder: str = "qrels", - qrels_file: str = "", - streaming: bool = False, - keep_in_memory: bool = False, - ): - self.corpus = {} - self.queries = {} - self.qrels = {} - self.og_instructions = {} - self.changed_instructions = {} - self.top_ranked = {} - self.hf_repo = hf_repo - if hf_repo: - # By default fetch qrels from same repo not a second repo with "-qrels" like in original - self.hf_repo_qrels = hf_repo_qrels if hf_repo_qrels else hf_repo - else: - # data folder would contain these files: - # (1) fiqa/corpus.jsonl (format: jsonlines) - # (2) fiqa/queries.jsonl (format: jsonlines) - # (3) fiqa/qrels/test.tsv (format: tsv ("\t")) - if prefix: - query_file = prefix + "-" + query_file - qrels_folder = prefix + "-" + qrels_folder - - self.corpus_file = ( - os.path.join(data_folder, corpus_file) if data_folder else corpus_file - ) - self.query_file = ( - os.path.join(data_folder, query_file) if data_folder else query_file - ) - self.qrels_folder = ( - os.path.join(data_folder, qrels_folder) if data_folder else None - ) - self.qrels_file = qrels_file - self.streaming = streaming - self.keep_in_memory = keep_in_memory - - def load( - self, split="test" - ) -> tuple[ - Dataset, - Dataset, - dict[str, dict[str, int]], - dict[str, dict[str, int]], - Dataset, - ]: - if not self.hf_repo: - self.og_qrels_file = os.path.join(self.qrels_folder + "_og", split + ".tsv") - self.changed_qrels_file = os.path.join( - self.qrels_folder + "_changed", split + ".tsv" - ) - self.check(fIn=self.corpus_file, ext="jsonl") - self.check(fIn=self.query_file, ext="jsonl") - self.check(fIn=self.og_qrels_file, ext="tsv") - self.check(fIn=self.changed_qrels_file, ext="tsv") - - if not len(self.corpus): - logger.info("Loading Corpus...") - self._load_corpus() - logger.info("Loaded %d %s Documents.", 
len(self.corpus), split.upper()) - logger.info("Doc Example: %s", self.corpus[0]) - - if not len(self.queries): - logger.info("Loading Queries...") - self._load_queries() - - self._load_qrels(split, changed=False) - self._load_qrels(split, changed=True) - # filter queries with no qrels - og_qrels_dict = defaultdict(dict) - changed_qrels_dict = defaultdict(dict) - - def qrels_dict_init(row): - og_qrels_dict[row["query-id"]][row["corpus-id"]] = int(row["score"]) - - def qrels_changed_dict_init(row): - changed_qrels_dict[row["query-id"]][row["corpus-id"]] = int(row["score"]) - - self.changed_qrels.map(qrels_dict_init) - self.og_qrels.map(qrels_changed_dict_init) - self.og_qrels = og_qrels_dict - self.changed_qrels = changed_qrels_dict - self.queries = self.queries.filter(lambda x: x["id"] in self.og_qrels) - logger.info("Loaded %d %s Queries.", len(self.queries), split.upper()) - logger.info("Query Example: %s", self.queries[0]) - - # load top_ranked - self.load_top_ranked() - - return ( - self.corpus, - self.queries, - self.og_qrels, - self.changed_qrels, - self.top_ranked, - ) - - def load_top_ranked(self) -> None: - if self.hf_repo: - top_ranked_ds = load_dataset( - self.hf_repo, - "top_ranked", - keep_in_memory=self.keep_in_memory, - streaming=self.streaming, - ) - else: - top_ranked_ds = load_dataset( - "json", - data_files=self.top_ranked_file, - streaming=self.streaming, - keep_in_memory=self.keep_in_memory, - ) - top_ranked_ds = next(iter(top_ranked_ds.values())) # get first split - top_ranked_ds = top_ranked_ds.cast_column("qid", Value("string")) - top_ranked_ds = top_ranked_ds.cast_column("pid", Value("string")) - top_ranked_ds = top_ranked_ds.remove_columns( - [col for col in top_ranked_ds.column_names if col not in ["qid", "pid"]] - ) - self.top_ranked = top_ranked_ds - - def _load_queries(self): - if self.hf_repo: - queries_ds = load_dataset( - self.hf_repo, - "queries", - keep_in_memory=self.keep_in_memory, - streaming=self.streaming, - ) - else: - queries_ds = load_dataset( - "json", - data_files=self.query_file, - streaming=self.streaming, - keep_in_memory=self.keep_in_memory, - ) - queries_ds = next(iter(queries_ds.values())) # get first split - queries_ds = queries_ds.cast_column("_id", Value("string")) - queries_ds = queries_ds.rename_column("_id", "id") - queries_ds = queries_ds.remove_columns( - [ - col - for col in queries_ds.column_names - if col - not in [ - "id", - "text", - "instruction_og", - "instruction_changed", - "keywords", - "short_query", - ] - ] - ) - self.queries = queries_ds - - def _load_qrels(self, split, changed=False): - if self.hf_repo: - qrels_ds = load_dataset( - self.hf_repo_qrels, - "qrels_og" if not changed else "qrels_changed", - keep_in_memory=self.keep_in_memory, - streaming=self.streaming, - )[split] - else: - qrels_file = self.og_qrels_file if not changed else self.changed_qrels_file - qrels_ds = load_dataset( - "csv", - data_files=qrels_file, - delimiter="\t", - keep_in_memory=self.keep_in_memory, - ) - features = Features( - { - "query-id": Value("string"), - "corpus-id": Value("string"), - "score": Value("float"), - } - ) - qrels_ds = qrels_ds.cast(features) - - if changed: - self.changed_qrels = qrels_ds - else: - self.og_qrels = qrels_ds - - -class InstructionRetrievalDescriptiveStatistics(DescriptiveStatistics): - """Descriptive statistics for Instruction Retrieval tasks - - Attributes: - num_queries: Number of queries - num_docs: Number of documents - average_document_length: Average length of documents - average_query_length: Average 
length of queries - average_instruction_length: Average length of instructions - average_changed_instruction_length: Average length of changed instructions - average_relevant_docs_per_query: Average number of relevant docs per query - average_top_ranked_per_query: Average number of top ranked docs per query - """ - - num_queries: int - num_docs: int - average_document_length: float - average_query_length: float - average_instruction_length: float - average_changed_instruction_length: float - average_relevant_docs_per_query: float - average_top_ranked_per_query: float - - -class AbsTaskInstructionRetrieval(AbsTask): - """Abstract class for retrieval tasks that use instructions. An example from Core17 would be - query: What is the ongoing status of The Three Gorges Project? - instruction: A relevant document will provide the projected or actual date of completion of the project, its estimated or actual total cost, or the estimated or ongoing electrical output of the finished project. Discussions of the social, political, or ecological impact of the project are not relevant. - - Child-classes must implement the following properties: - self.corpus = dict[corpus_id, dict[str, str]] #id => dict with document datas like title and text - self.queries = dict[query_id, str] #id => query - self.relevant_docs = dict[query_id, dict[corpus_id, int]] - self.og_instructions = dict[str, str] query => original instruction - self.changed_instructions = dict[str, str] query => changed instruction - self.top_ranked = dict[query_id, list[corpus_id]] #id => list of top ranked document ids - - See https://arxiv.org/abs/2403.15246 for more details - """ - - def __init__( - self, - **kwargs, - ): - super().__init__(**kwargs) - self.do_length_ablation = kwargs.get("do_length_ablation", False) - if self.do_length_ablation: - logger.info("Running length ablation also...") - - def load_data(self, **kwargs): - if self.data_loaded: - return - self.corpus, self.queries, self.og_relevant_docs, self.changed_relevant_docs = ( - {}, - {}, - {}, - {}, - ) - self.og_instructions, self.changed_instructions = {}, {} - self.top_ranked = {} - if self.do_length_ablation: - self.keywords, self.short_instructions = {}, {} - - dataset_path = self.metadata_dict["dataset"]["path"] - hf_repo_qrels = ( - dataset_path + "-qrels" if "clarin-knext" in dataset_path else None - ) - for split in kwargs.get("eval_splits", self.metadata_dict["eval_splits"]): - ( - corpus, - queries, - og_relevant_docs, - changed_relevant_docs, - top_ranked_init, - ) = HFDataLoaderInstructions( - hf_repo=dataset_path, - hf_repo_qrels=hf_repo_qrels, - streaming=False, - keep_in_memory=False, - ).load(split=split) - - # Conversion from DataSet - top_ranked = defaultdict(list) - [ - top_ranked[cur_inst["qid"]].append(cur_inst["pid"]) - for cur_inst in top_ranked_init - ] - og_instructions = { - query["text"]: query["instruction_og"] for query in queries - } - changed_instructions = { - query["text"]: query["instruction_changed"] for query in queries - } - if self.do_length_ablation: - keywords = {query["text"]: query["keywords"] for query in queries} - short_instructions = { - query["text"]: query["short_query"] for query in queries - } - queries = {query["id"]: query["text"] for query in queries} - corpus = { - doc["id"]: {"title": doc["title"], "text": doc["text"]} - for doc in corpus - } - assert ( - len(top_ranked) == len(queries) - ), f"Top ranked not loaded properly! Expected {len(self.queries)} but got {len(self.top_ranked)}." 
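# [Editor's sketch, not part of the patch] The conversion above folds the flat
# (qid, pid) rows of the top_ranked split into query_id -> [doc_id, ...], which
# also matches the per-split self.top_ranked shape the reworked AbsTaskReranking
# expects. A small standalone version, assuming `rows` is any iterable of
# {"qid": str, "pid": str} dicts; the function name is illustrative only.

from collections import defaultdict

def build_top_ranked(rows) -> dict[str, list[str]]:
    top_ranked: dict[str, list[str]] = defaultdict(list)
    for row in rows:
        top_ranked[row["qid"]].append(row["pid"])
    return dict(top_ranked)

# build_top_ranked([{"qid": "q1", "pid": "d1"}, {"qid": "q1", "pid": "d2"}])
# -> {"q1": ["d1", "d2"]}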
- - ( - self.corpus[split], - self.queries[split], - self.og_relevant_docs[split], - self.changed_relevant_docs[split], - ) = corpus, queries, og_relevant_docs, changed_relevant_docs - self.changed_instructions[split], self.og_instructions[split] = ( - changed_instructions, - og_instructions, - ) - self.top_ranked[split] = top_ranked - - if self.do_length_ablation: - self.keywords[split], self.short_instructions[split] = ( - keywords, - short_instructions, - ) - - self.data_loaded = True - - def _evaluate_subset_lang( - self, - retriever: RetrievalEvaluator, - corpus: dict, - queries: dict, - og_relevant_docs: dict, - changed_relevant_docs: dict, - og_instructions: dict, - changed_instructions: dict, - top_ranked: dict, - lang: str, - split: str, - keywords: dict | None = None, - short_instructions: dict | None = None, - **kwargs, - ) -> dict[str, dict[str, float] | float]: - corpus, queries = corpus[split], queries[split] - og_relevant_docs, changed_relevant_docs = ( - og_relevant_docs[split], - changed_relevant_docs[split], - ) - og_instructions, changed_instructions = ( - og_instructions[split], - changed_instructions[split], - ) - - top_ranked = top_ranked[split] - kwargs["prediction_name"] = "og" # for naming predictions, as needed - scores_og, results_og = self._evaluate_subset( - retriever, - corpus, - queries, - og_relevant_docs, - og_instructions, - top_ranked, - lang, - **kwargs, - ) - kwargs["prediction_name"] = "changed" # for naming predictions, as needed - scores_changed, results_changed = self._evaluate_subset( - retriever, - corpus, - queries, - changed_relevant_docs, - changed_instructions, - top_ranked, - lang, - **kwargs, - ) - - newly_irrelevant_qrels = self.create_qrel_diff( - og_relevant_docs, - changed_relevant_docs, - ) - overall_changed_scores = utils.evaluate_change( - results_og, results_changed, newly_irrelevant_qrels - ) - - overall_changed_scores["individual"] = { - "original": scores_og, - "changed": scores_changed, - } - - if self.do_length_ablation: - keywords, short_instructions = ( - keywords[split], - short_instructions[split], - ) - kwargs["prediction_name"] = "base" # for naming predictions, as needed - scores_base, results_base = self._evaluate_subset( - retriever, - corpus, - queries, - og_relevant_docs, - defaultdict(str), - top_ranked, - lang, - **kwargs, - ) - kwargs["prediction_name"] = "keywords" # for naming predictions, as needed - scores_w_keywords_scores, scores_w_keywords_results = self._evaluate_subset( - retriever, - corpus, - queries, - og_relevant_docs, - keywords, - top_ranked, - lang, - **kwargs, - ) - kwargs["prediction_name"] = ( - "short_instr" # for naming predictions, as needed - ) - ( - scores_w_short_instr_scores, - scores_w_short_instr_result, - ) = self._evaluate_subset( - retriever, - corpus, - queries, - og_relevant_docs, - short_instructions, - top_ranked, - lang, - **kwargs, - ) - overall_changed_scores["length_ablation"] = { - "keywords": scores_w_keywords_scores, - "short_instructions": scores_w_short_instr_scores, - "base": scores_base, - } - - return overall_changed_scores - - def evaluate( - self, - model: Encoder, - split: str = "test", - *, - encode_kwargs: dict[str, Any] = {}, - **kwargs, - ) -> dict[str, dict[str, Any]]: - retriever = RetrievalEvaluator( - retriever=model, - task_name=self.metadata.name, - encode_kwargs=encode_kwargs, - **kwargs, - ) - scores = {} - if self.is_multilingual: - for lang in self.hf_subsets: - logger.info(f"Language: {lang}") - scores[lang] = self._evaluate_subset_lang( - retriever, 
- corpus=self.corpus[lang], - queries=self.queries[lang], - og_relevant_docs=self.og_relevant_docs[lang], - changed_relevant_docs=self.changed_relevant_docs[lang], - og_instructions=self.og_instructions[lang], - changed_instructions=self.changed_instructions[lang], - top_ranked=self.top_ranked[lang], - lang=lang, - split=split, - keywords=self.keywords[lang] if self.do_length_ablation else None, - short_instructions=self.short_instructions[lang] - if self.do_length_ablation - else None, - **kwargs, - ) - self._add_main_score(scores[lang]) - else: - lang = "default" - scores[lang] = self._evaluate_subset_lang( - retriever, - corpus=self.corpus, - queries=self.queries, - og_relevant_docs=self.og_relevant_docs, - changed_relevant_docs=self.changed_relevant_docs, - og_instructions=self.og_instructions, - changed_instructions=self.changed_instructions, - top_ranked=self.top_ranked, - lang=lang, - split=split, - keywords=self.keywords if self.do_length_ablation else None, - short_instructions=self.short_instructions - if self.do_length_ablation - else None, - **kwargs, - ) - self._add_main_score(scores[lang]) - - return scores - - def _add_main_score(self, scores: dict[str, dict[str, float]]) -> None: - scores["main_score"] = scores[self.metadata.main_score] - - def _evaluate_subset( - self, - retriever: RetrievalEvaluator, - corpus: dict[str, dict[str, str]], - queries: dict[str, str], - relevant_docs: dict[str, dict[str, int]], - instructions: dict[str, str], - top_ranked: dict[str, list[str]], - lang=None, - **kwargs, - ) -> tuple[dict[str, float], dict[str, dict[str, float]]]: - start_time = time() - - # do the results by query and relevant docs only - all_results = [] - for query_id in tqdm.tqdm(list(queries.keys()), leave=False, desc="Retrieving"): - cur_queries = {query_id: queries[query_id]} - cur_instructions = {queries[query_id]: instructions[queries[query_id]]} - cur_docs = { - key: value - for (key, value) in corpus.items() - if key in top_ranked[query_id] - } - all_results.append( - retriever( - cur_docs, cur_queries, instructions=cur_instructions, qid=query_id - ) - ) - - # combine all the results (which are {'qid' -> {'doc_id' -> score} mappings) - # we know all are unique qids, so we can smash together - results = {k: v for d in all_results for k, v in d.items()} - - end_time = time() - logger.info(f"Time taken to retrieve: {end_time - start_time:.2f} seconds") - - if kwargs.get("save_predictions", False): - output_folder = kwargs.get("output_folder", "results") - if not os.path.isdir(output_folder): - os.makedirs(output_folder) - top_k = kwargs.get("top_k", None) - if top_k is not None: - for qid in list(results.keys()): - doc_ids = set( - sorted( - results[qid], key=lambda x: results[qid][x], reverse=True - )[:top_k] - ) - results[qid] = { - k: v for k, v in results[qid].items() if k in doc_ids - } - if lang is None: - qrels_save_path = ( - f"{output_folder}/{self.metadata_dict['name']}_predictions.json" - ) - else: - qrels_save_path = f"{output_folder}/{self.metadata_dict['name']}_{lang}_predictions.json" - - if kwargs.get("prediction_name", None): - qrels_save_path = qrels_save_path.replace( - ".json", f"_{kwargs['prediction_name']}.json" - ) - - with open(qrels_save_path, "w") as f: - json.dump(results, f) - - ndcg, _map, recall, precision, naucs = retriever.evaluate( - relevant_docs, - results, - retriever.k_values, - ignore_identical_ids=kwargs.get("ignore_identical_ids", True), - ) - mrr, naucs = retriever.evaluate_custom( - relevant_docs, results, retriever.k_values, 
"mrr" - ) - scores = { - **{f"ndcg_at_{k.split('@')[1]}": v for (k, v) in ndcg.items()}, - **{f"map_at_{k.split('@')[1]}": v for (k, v) in _map.items()}, - **{f"recall_at_{k.split('@')[1]}": v for (k, v) in recall.items()}, - **{f"precision_at_{k.split('@')[1]}": v for (k, v) in precision.items()}, - **{f"naucs_at_{k.split('@')[1]}": v for (k, v) in naucs.items()}, - **{f"mrr_at_{k.split('@')[1]}": v for (k, v) in mrr.items()}, - } - return scores, results - - def create_qrel_diff(self, og_qrels, changed_qrels): - newly_irrelevant_qrels = {} - for qid in og_qrels: - newly_irrelevant_qrels[qid] = [] - for doc_id in og_qrels[qid]: - if changed_qrels[qid][doc_id] != og_qrels[qid][doc_id]: - newly_irrelevant_qrels[qid].append(doc_id) - - return newly_irrelevant_qrels - - def _calculate_metrics_from_split( - self, split: str, hf_subset: str | None = None, compute_overall: bool = False - ) -> InstructionRetrievalDescriptiveStatistics: - if hf_subset: - corpus = self.corpus[hf_subset][split] - queries = self.queries[hf_subset][split] - relevant_docs = self.og_relevant_docs[hf_subset][split] - og_instructions = self.og_instructions[hf_subset][split] - changed_instructions = self.changed_instructions[hf_subset][split] - top_ranked = self.top_ranked[hf_subset][split] - elif compute_overall: - corpus = {} - queries = {} - relevant_docs = {} - og_instructions = {} - changed_instructions = {} - top_ranked = {} - for hf_subset in self.metadata.eval_langs: - corpus.update(process_docs(self.corpus, hf_subset, split)) - queries.update(process_docs(self.queries, hf_subset, split)) - relevant_docs.update( - process_relevant_docs(self.og_relevant_docs, hf_subset, split) - ) - og_instructions.update( - process_docs( - self.og_instructions, - hf_subset, - split, - ) - ) - changed_instructions.update( - process_docs(self.changed_instructions, hf_subset, split) - ) - top_ranked.update(process_top_ranked(self.top_ranked, hf_subset, split)) - else: - corpus = self.corpus[split] - queries = self.queries[split] - relevant_docs = self.og_relevant_docs[split] - og_instructions = self.og_instructions[split] - changed_instructions = self.changed_instructions[split] - top_ranked = self.top_ranked[split] - - total_corpus_len = sum( - [len(doc.get("title", "")) + len(doc["text"]) for doc in corpus.values()] - ) - total_queries_len = sum([len(query) for query in queries.values()]) - total_instructions_len = sum( - [len(instruction) for instruction in og_instructions.values()] - ) - total_changed_instructions_len = sum( - [len(instruction) for instruction in changed_instructions.values()] - ) - num_qrels_non_zero = sum( - sum(1 for doc_id in docs if docs[doc_id] != 0) - for docs in relevant_docs.values() - ) - qrels_per_doc = num_qrels_non_zero / len(relevant_docs) if len(queries) else 0 - top_ranked_per_query = ( - sum(len(docs) for docs in top_ranked.values()) / len(queries) - if len(queries) - else 0 - ) - return InstructionRetrievalDescriptiveStatistics( - num_docs=len(corpus), - num_queries=len(queries), - average_document_length=( - total_corpus_len / len(corpus) if len(corpus) else 0 - ), - average_query_length=( - total_queries_len / len(queries) if len(queries) else 0 - ), - average_instruction_length=( - total_instructions_len / len(queries) if len(queries) else 0 - ), - average_changed_instruction_length=( - total_changed_instructions_len / len(queries) if len(queries) else 0 - ), - average_relevant_docs_per_query=qrels_per_doc, - average_top_ranked_per_query=top_ranked_per_query, - ) - - -def process_docs( - 
collection: dict[str, dict[str, dict[str, str]]], hf_subset: str, split: str -) -> dict[str, str]: - """Collections can contain overlapping ids in different splits. Prepend split to avoid this""" - return { - f"{split}_{hf_subset}_{k}": v for k, v in collection[hf_subset][split].items() - } - - -def process_relevant_docs( - collection: dict[str, dict[str, dict[str, dict[str, int]]]], - hf_subset: str, - split: str, -) -> dict[str, dict[str, int]]: - """Collections can contain overlapping ids in different splits. Prepend split to avoid this""" - return_collection = {} - for query_id, relevant in collection[hf_subset][split].items(): - return_collection[f"{split}_{hf_subset}_{query_id}"] = { - f"{split}_{hf_subset}_{doc_id}": value for doc_id, value in relevant.items() - } - return return_collection - - -def process_top_ranked( - collection: dict[str, dict[str, dict[str, list[str]]]], hf_subset: str, split: str -) -> dict[str, list[str]]: - return_collection = {} - for query_id, docs_id in collection[hf_subset][split].items(): - return_collection[f"{split}_{hf_subset}_{query_id}"] = [ - f"{split}_{hf_subset}_{doc_id}" for doc_id in docs_id - ] - return return_collection diff --git a/mteb/abstasks/AbsTaskReranking.py b/mteb/abstasks/AbsTaskReranking.py index 5a5488d903..dc1d0d5ea8 100644 --- a/mteb/abstasks/AbsTaskReranking.py +++ b/mteb/abstasks/AbsTaskReranking.py @@ -33,7 +33,7 @@ class AbsTaskReranking(AbsTaskRetrieval): - """Abstract class for re-ranking experiments. This is mostly the same as the RetrievalEvaluator, but as previously it wasn't we need to keep it to transform old dataset versions into the same format. + """Abstract class for re-ranking experiments. This is mostly the same as the RetrievalEvaluator, but treats each query as a "mini" retrieval problem. New Format: ----------- @@ -55,13 +55,6 @@ class AbsTaskReranking(AbsTaskRetrieval): self.top_ranked: dict[str, dict[str, list[str]]] or dict[str, dict[str, dict[str, float]]] Semantically, it should contain dict[split_name, dict[sample_id, list[doc_id]]] or dict[split_name, dict[sample_id, dict[doc_id, score]]] E.g.: {"test": {"q1": ["document_one", "document_two"]}} or {"test": {"q1": {"document_one": 1, "document_two": 0.5}}} - - Old Format: - ----------- - self.load_data() must generate a huggingface dataset with a split matching self.metadata_dict["eval_splits"], and assign it to self.dataset. 
It must contain the following columns: - query: str - positive: list[str] - negative: list[str] """ def __init__(self, **kwargs): @@ -72,8 +65,7 @@ def load_data(self, **kwargs): return if self.metadata.name in OLD_FORMAT_RERANKING_TASKS: - self.dataset = datasets.load_dataset(**self.metadata_dict["dataset"]) # type: ignore - self.dataset_transform() + self.transform_old_dataset_format() else: # use AbsTaskRetrieval default to load the data # TODO: need to make sure top_ranked comes back @@ -96,13 +88,13 @@ def process_example(self, example: dict, split: str, query_idx: int) -> dict: "relevance_scores": [], } - for i, pos_doc in enumerate(sorted(positive_docs)): + for i, pos_doc in enumerate(positive_docs): doc_id = f"{query_id}_positive_{i}" example_data["doc_ids"].append(doc_id) example_data["doc_texts"].append(pos_doc) example_data["relevance_scores"].append(1) - for i, neg_doc in enumerate(sorted(negative_docs)): + for i, neg_doc in enumerate(negative_docs): doc_id = f"{query_id}_negative_{i}" example_data["doc_ids"].append(doc_id) example_data["doc_texts"].append(neg_doc) @@ -110,8 +102,13 @@ def process_example(self, example: dict, split: str, query_idx: int) -> dict: return example_data - def dataset_transform(self): - """Transform the old format to the new format using HF datasets mapping.""" + def transform_old_dataset_format(self, given_dataset=None): + """Transform the old format to the new format using HF datasets mapping. This is a one-time transformation for datasets which are in the old format. + + Args: + given_dataset (Dataset, optional): The dataset to transform. Defaults to None. This is helpful for some older datasets which are loaded with custom code, but need to be transformed still. + + """ if self.metadata.name not in OLD_FORMAT_RERANKING_TASKS: return @@ -127,7 +124,17 @@ def dataset_transform(self): hf_subsets = list(self.hf_subsets) if self.is_multilingual else ["default"] for hf_subset in hf_subsets: - cur_dataset = self.dataset[hf_subset] + if given_dataset: + cur_dataset = given_dataset + elif "name" in self.metadata_dict["dataset"]: + cur_dataset = datasets.load_dataset(**self.metadata_dict["dataset"]) # type: ignore + assert ( + hf_subset == "default" + ), f"Only default subset is supported for {self.metadata.name} since `name` is given in the metadata." + else: + cur_dataset = datasets.load_dataset( + **self.metadata_dict["dataset"], name=hf_subset + ) # type: ignore for split in cur_dataset: # Create an enumerated dataset to pass indices @@ -140,6 +147,14 @@ def dataset_transform(self): } ) + # first, filter out the ones that have no positive or no negatives + enumerated_dataset = enumerated_dataset.filter( + lambda x: len(x["positive"]) > 0 and len(x["negative"]) > 0 + ) + logger.info( + f"Filtered out {len(cur_dataset[split]) - len(enumerated_dataset)} examples with no positive or no negative examples. {len(enumerated_dataset)} examples remaining." 
+ ) + # Map the transformation function over the dataset processed_dataset = enumerated_dataset.map( lambda example, idx: self.process_example(example, split, idx), @@ -205,6 +220,7 @@ def _evaluate_subset( # do the evaluation like normal now, but pass our results if max_docs > max(retriever.k_values): retriever.k_values += [max_docs] + return super()._evaluate_subset( retriever, corpus, diff --git a/mteb/abstasks/AbsTaskRetrieval.py b/mteb/abstasks/AbsTaskRetrieval.py index 68e651e954..ba59616e39 100644 --- a/mteb/abstasks/AbsTaskRetrieval.py +++ b/mteb/abstasks/AbsTaskRetrieval.py @@ -3,195 +3,20 @@ import json import logging import os -from collections import defaultdict from pathlib import Path from time import time from typing import Any -from datasets import Features, Value, load_dataset - from mteb.abstasks.TaskMetadata import HFSubset from ..evaluation.evaluators import RetrievalEvaluator from ..load_results.task_results import ScoresDict from .AbsTask import AbsTask, DescriptiveStatistics +from .dataloaders import HFDataLoader logger = logging.getLogger(__name__) -# Adapted from https://github.com/beir-cellar/beir/blob/f062f038c4bfd19a8ca942a9910b1e0d218759d4/beir/datasets/data_loader_hf.py#L10 -class HFDataLoader: - def __init__( - self, - hf_repo: str | None = None, - hf_repo_qrels: str | None = None, - data_folder: str | None = None, - prefix: str | None = None, - corpus_file: str = "corpus.jsonl", - query_file: str = "queries.jsonl", - qrels_folder: str = "qrels", - qrels_file: str = "", - streaming: bool = False, - keep_in_memory: bool = False, - ): - self.corpus = {} - self.queries = {} - self.qrels = {} - self.hf_repo = hf_repo - if hf_repo: - # By default fetch qrels from same repo not a second repo with "-qrels" like in original - self.hf_repo_qrels = hf_repo_qrels if hf_repo_qrels else hf_repo - else: - # data folder would contain these files: - # (1) fiqa/corpus.jsonl (format: jsonlines) - # (2) fiqa/queries.jsonl (format: jsonlines) - # (3) fiqa/qrels/test.tsv (format: tsv ("\t")) - if prefix: - query_file = prefix + "-" + query_file - qrels_folder = prefix + "-" + qrels_folder - - self.corpus_file = ( - os.path.join(data_folder, corpus_file) if data_folder else corpus_file - ) - self.query_file = ( - os.path.join(data_folder, query_file) if data_folder else query_file - ) - self.qrels_folder = ( - os.path.join(data_folder, qrels_folder) if data_folder else None - ) - self.qrels_file = qrels_file - self.streaming = streaming - self.keep_in_memory = keep_in_memory - - @staticmethod - def check(fIn: str, ext: str): - if not os.path.exists(fIn): - raise ValueError(f"File {fIn} not present! 
Please provide accurate file.") - - if not fIn.endswith(ext): - raise ValueError(f"File {fIn} must be present with extension {ext}") - - def load( - self, split="test" - ) -> tuple[dict[str, dict[str, str]], dict[str, str], dict[str, dict[str, int]]]: - if not self.hf_repo: - self.qrels_file = os.path.join(self.qrels_folder, split + ".tsv") - self.check(fIn=self.corpus_file, ext="jsonl") - self.check(fIn=self.query_file, ext="jsonl") - self.check(fIn=self.qrels_file, ext="tsv") - - if not len(self.corpus): - logger.info("Loading Corpus...") - self._load_corpus() - logger.info("Loaded %d %s Documents.", len(self.corpus), split.upper()) - logger.info("Doc Example: %s", self.corpus[0]) - - if not len(self.queries): - logger.info("Loading Queries...") - self._load_queries() - - self._load_qrels(split) - # filter queries with no qrels - qrels_dict = defaultdict(dict) - - def qrels_dict_init(row): - qrels_dict[row["query-id"]][row["corpus-id"]] = int(row["score"]) - - self.qrels.map(qrels_dict_init) - self.qrels = qrels_dict - self.queries = self.queries.filter(lambda x: x["id"] in self.qrels) - logger.info("Loaded %d %s Queries.", len(self.queries), split.upper()) - logger.info("Query Example: %s", self.queries[0]) - - return self.corpus, self.queries, self.qrels - - def load_corpus(self) -> dict[str, dict[str, str]]: - if not self.hf_repo: - self.check(fIn=self.corpus_file, ext="jsonl") - - if not len(self.corpus): - logger.info("Loading Corpus...") - self._load_corpus() - logger.info("Loaded %d %s Documents.", len(self.corpus)) - logger.info("Doc Example: %s", self.corpus[0]) - - return self.corpus - - def _load_corpus(self): - if self.hf_repo: - corpus_ds = load_dataset( - self.hf_repo, - "corpus", - keep_in_memory=self.keep_in_memory, - streaming=self.streaming, - ) - else: - corpus_ds = load_dataset( - "json", - data_files=self.corpus_file, - streaming=self.streaming, - keep_in_memory=self.keep_in_memory, - ) - corpus_ds = next(iter(corpus_ds.values())) # get first split - corpus_ds = corpus_ds.cast_column("_id", Value("string")) - corpus_ds = corpus_ds.rename_column("_id", "id") - corpus_ds = corpus_ds.remove_columns( - [ - col - for col in corpus_ds.column_names - if col not in ["id", "text", "title"] - ] - ) - self.corpus = corpus_ds - - def _load_queries(self): - if self.hf_repo: - queries_ds = load_dataset( - self.hf_repo, - "queries", - keep_in_memory=self.keep_in_memory, - streaming=self.streaming, - ) - else: - queries_ds = load_dataset( - "json", - data_files=self.query_file, - streaming=self.streaming, - keep_in_memory=self.keep_in_memory, - ) - queries_ds = next(iter(queries_ds.values())) # get first split - queries_ds = queries_ds.cast_column("_id", Value("string")) - queries_ds = queries_ds.rename_column("_id", "id") - queries_ds = queries_ds.remove_columns( - [col for col in queries_ds.column_names if col not in ["id", "text"]] - ) - self.queries = queries_ds - - def _load_qrels(self, split): - if self.hf_repo: - qrels_ds = load_dataset( - self.hf_repo_qrels, - keep_in_memory=self.keep_in_memory, - streaming=self.streaming, - )[split] - else: - qrels_ds = load_dataset( - "csv", - data_files=self.qrels_file, - delimiter="\t", - keep_in_memory=self.keep_in_memory, - ) - features = Features( - { - "query-id": Value("string"), - "corpus-id": Value("string"), - "score": Value("float"), - } - ) - qrels_ds = qrels_ds.cast(features) - self.qrels = qrels_ds - - class RetrievalDescriptiveStatistics(DescriptiveStatistics): """Descriptive statistics for Retrieval @@ -201,6 +26,9 @@ class 
RetrievalDescriptiveStatistics(DescriptiveStatistics): average_query_length: Average length of queries num_documents: Number of documents average_relevant_docs_per_query: Average number of relevant documents per query + average_instruction_length: Average length of instructions + average_num_instructions_per_query: Average number of instructions per query + average_top_ranked_per_query: Average number of top ranked documents per query """ num_queries: int @@ -208,6 +36,11 @@ class RetrievalDescriptiveStatistics(DescriptiveStatistics): average_query_length: float num_documents: int average_relevant_docs_per_query: float + # these are for datasets with instructions + average_instruction_length: float + average_num_instructions_per_query: float + # this is for datasets that do reranking + average_top_ranked_per_query: float class AbsTaskRetrieval(AbsTask): @@ -227,15 +60,22 @@ class AbsTaskRetrieval(AbsTask): self.relevant_docs: dict[str, dict[str, dict[str, int]]] Semantically, it should contain dict[split_name, dict[sample_id, dict[doc_id, score]]] E.g.: {"test": {"q1": {"document_one": 1}}} + + Child classes may optionally implement the following properties (top_ranked for reranking and instructions if needed): + + self.top_ranked: dict[str, dict[str, list[str]]] or dict[str, dict[str, dict[str, float]]] + Semantically, it should contain dict[split_name, dict[sample_id, list[doc_id]]] or dict[split_name, dict[sample_id, dict[doc_id, score]]] + E.g.: {"test": {"q1": ["document_one", "document_two"]}} or {"test": {"q1": {"document_one": 1, "document_two": 0.5}}} + + self.instructions: dict[str, dict[str, str]] or dict[str, dict[str, list[str]]] + Semantically, it should contain dict[split_name, dict[sample_id, str]] or dict[split_name, dict[sample_id, list[str]]] for multiple instructions per query + E.g. 
{"test": {"q1": "instruction"}} or {"test": {"q1": ["instruction1", "instruction2"]}} """ ignore_identical_ids: bool = False def __init__(self, **kwargs): - try: - super(AbsTask, self).__init__(**kwargs) - except Exception: - super().__init__(**kwargs) + super().__init__(**kwargs) def load_data(self, **kwargs): if self.data_loaded: @@ -247,7 +87,7 @@ def load_data(self, **kwargs): dataset_path + "-qrels" if "clarin-knext" in dataset_path else None ) for split in kwargs.get("eval_splits", self.metadata_dict["eval_splits"]): - corpus, queries, qrels = HFDataLoader( + corpus, queries, qrels, instructions, top_ranked = HFDataLoader( hf_repo=dataset_path, hf_repo_qrels=hf_repo_qrels, streaming=False, @@ -264,6 +104,16 @@ def load_data(self, **kwargs): qrels, ) + # optional args + if instructions: + self.instructions = { + split: {inst["query"]: inst["instruction"] for inst in instructions} + } + if top_ranked: + self.top_ranked = { + split: {tr["query-id"]: tr["corpus-ids"] for tr in top_ranked} + } + self.data_loaded = True def evaluate( @@ -287,7 +137,7 @@ def evaluate( for hf_subset in hf_subsets: logger.info(f"Subset: {hf_subset}") - if hf_subset == "default": + if hf_subset == "default" and "default" not in self.corpus: corpus, queries, relevant_docs = ( self.corpus[split], self.queries[split], @@ -322,7 +172,9 @@ def _evaluate_subset( else: # perform the retrieval here start_time = time() - results = retriever(corpus, queries) + # instructions can be a set of instructions for each query + # TODO: add instructions here - in progress + results = retriever(corpus, queries, **kwargs) end_time = time() logger.info(f"Time taken to retrieve: {end_time - start_time:.2f} seconds") @@ -352,11 +204,12 @@ def _evaluate_subset( with open(qrels_save_path, "w") as f: json.dump(results, f) - ndcg, _map, recall, precision, naucs = retriever.evaluate( + ndcg, _map, recall, precision, naucs, task_scores = retriever.evaluate( relevant_docs, results, retriever.k_values, ignore_identical_ids=self.ignore_identical_ids, + task_name=self.metadata.name, ) mrr, naucs_mrr = retriever.evaluate_custom( relevant_docs, results, retriever.k_values, "mrr" @@ -375,6 +228,7 @@ def _evaluate_subset( k.replace("@", "_at_").replace("_P", "_precision").lower(): v for k, v in naucs_mrr.items() }, + **task_scores, } self._add_main_score(scores) @@ -421,20 +275,36 @@ def _calculate_metrics_from_split( queries = self.queries[hf_subset][split] corpus = self.corpus[hf_subset][split] relevant_docs = self.relevant_docs[hf_subset][split] + if self.instructions is not None: + instructions = self.instructions[hf_subset][split] + if self.top_ranked is not None: + top_ranked = self.top_ranked[hf_subset][split] elif compute_overall: queries = {} corpus = {} relevant_docs = {} + instructions = {} + top_ranked = {} for hf_subset in self.metadata.eval_langs: queries.update(process_docs(self.queries, hf_subset, split)) corpus.update(process_docs(self.corpus, hf_subset, split)) relevant_docs.update( process_relevant_docs(self.relevant_docs, hf_subset, split) ) + if self.instructions is not None: + instructions.update( + process_docs(self.instructions, hf_subset, split) + ) + if self.top_ranked is not None: + top_ranked.update(process_docs(self.top_ranked, hf_subset, split)) else: queries = self.queries[split] corpus = self.corpus[split] relevant_docs = self.relevant_docs[split] + if self.instructions is not None: + instructions = self.instructions[split] + if self.top_ranked is not None: + top_ranked = self.top_ranked[split] query_len, doc_len = 
calculate_length(queries, corpus) num_documents = len(corpus) @@ -446,12 +316,47 @@ def _calculate_metrics_from_split( for docs in relevant_docs.values() ) qrels_per_doc = num_qrels_non_zero / len(relevant_docs) if num_queries else 0 + + if self.instructions is not None: + total_instructions_len = sum( + [ + len(instruction) + if isinstance(instruction, str) + else sum([len(i) for i in instruction]) + for instruction in instructions.values() + ] + ) + # flatten the values in instructions to get the number of them + num_instructions = sum( + 1 if isinstance(instruction, str) else len(instruction) + for instruction in instructions.values() + ) + else: + total_instructions_len = 0 + num_instructions = 0 + + if self.top_ranked is not None: + top_ranked_per_query = ( + sum(len(docs) for docs in top_ranked.values()) / num_queries + if num_queries + else 0 + ) + else: + top_ranked_per_query = 0 + return RetrievalDescriptiveStatistics( average_document_length=doc_len, average_query_length=query_len, num_documents=num_documents, num_queries=num_queries, average_relevant_docs_per_query=qrels_per_doc, + average_instruction_length=total_instructions_len / num_instructions + if num_instructions + else 0, + average_num_instructions_per_query=num_instructions / num_queries + if num_queries + else 0, + average_top_ranked_per_query=top_ranked_per_query, ) diff --git a/mteb/abstasks/MultilingualTask.py b/mteb/abstasks/MultilingualTask.py index 3fd007df6d..84860a48ed 100644 --- a/mteb/abstasks/MultilingualTask.py +++ b/mteb/abstasks/MultilingualTask.py @@ -4,7 +4,7 @@ from .MultiSubsetLoader import MultiSubsetLoader -class MultilingualTask(MultiSubsetLoader, AbsTask): +class MultilingualTask(AbsTask, MultiSubsetLoader): def __init__(self, hf_subsets: list[str] | None = None, **kwargs): super().__init__(**kwargs) if isinstance(hf_subsets, list): diff --git a/mteb/abstasks/TaskMetadata.py b/mteb/abstasks/TaskMetadata.py index 434a61f3a5..9d53962829 100644 --- a/mteb/abstasks/TaskMetadata.py +++ b/mteb/abstasks/TaskMetadata.py @@ -85,6 +85,7 @@ "STS", "Summarization", "InstructionRetrieval", + "InstructionReranking", "Speed", ] diff --git a/mteb/abstasks/__init__.py b/mteb/abstasks/__init__.py index ef3e8853d7..086866b997 100644 --- a/mteb/abstasks/__init__.py +++ b/mteb/abstasks/__init__.py @@ -5,7 +5,6 @@ from .AbsTaskBitextMining import * from .AbsTaskClassification import * from .AbsTaskClustering import * -from .AbsTaskInstructionRetrieval import * from .AbsTaskMultilabelClassification import * from .AbsTaskPairClassification import * from .AbsTaskReranking import * diff --git a/mteb/abstasks/dataloaders.py b/mteb/abstasks/dataloaders.py new file mode 100644 index 0000000000..40afb883cc --- /dev/null +++ b/mteb/abstasks/dataloaders.py @@ -0,0 +1,318 @@ +from __future__ import annotations + +import logging +import os +from collections import defaultdict + +from datasets import Features, Sequence, Value, get_dataset_config_names, load_dataset + +logger = logging.getLogger(__name__) + + +# Adapted from https://github.com/beir-cellar/beir/blob/f062f038c4bfd19a8ca942a9910b1e0d218759d4/beir/datasets/data_loader_hf.py#L10 +class HFDataLoader: + """This dataloader handles the dataloading for retrieval-oriented tasks, including standard retrieval, reranking, and instruction-based variants of the above. + + If the `hf_repo` is provided, the dataloader will fetch the data from the HuggingFace hub. Otherwise, it will look for the data in the specified `data_folder`. 
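
A minimal usage sketch of this loader, assuming a placeholder repository id: the five values mirror what `load()` returns below, and `instructions` / `top_ranked` come back as `None` when those optional configs or files are absent.

    from mteb.abstasks.dataloaders import HFDataLoader

    # hypothetical repo id; corpus/queries/qrels are required, instructions/top_ranked are optional
    loader = HFDataLoader(hf_repo="mteb/some-retrieval-dataset", streaming=False, keep_in_memory=False)
    corpus, queries, qrels, instructions, top_ranked = loader.load(split="test")
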
+ + Required files include the corpus, queries, and qrels files. Optionally, the dataloader can also load instructions and top-ranked (for reranking) files. + """ + + def __init__( + self, + hf_repo: str | None = None, + hf_repo_qrels: str | None = None, + data_folder: str | None = None, + prefix: str | None = None, + corpus_file: str = "corpus.jsonl", + query_file: str = "queries.jsonl", + qrels_folder: str = "qrels", + qrels_file: str = "", + streaming: bool = False, + keep_in_memory: bool = False, + ): + self.corpus = {} + self.queries = {} + self.qrels = {} + self.instructions = {} + self.top_ranked = {} + self.hf_repo = hf_repo + if hf_repo: + # By default fetch qrels from same repo not a second repo with "-qrels" like in original + self.hf_repo_qrels = hf_repo_qrels if hf_repo_qrels else hf_repo + else: + # data folder would contain these files: + # (1) fiqa/corpus.jsonl (format: jsonlines) + # (2) fiqa/queries.jsonl (format: jsonlines) + # (3) fiqa/qrels/test.tsv (format: tsv ("\t")) + if prefix: + query_file = prefix + "-" + query_file + qrels_folder = prefix + "-" + qrels_folder + + self.corpus_file = ( + os.path.join(data_folder, corpus_file) if data_folder else corpus_file + ) + self.query_file = ( + os.path.join(data_folder, query_file) if data_folder else query_file + ) + self.qrels_folder = ( + os.path.join(data_folder, qrels_folder) if data_folder else None + ) + self.qrels_file = qrels_file + self.top_ranked_file = ( + os.path.join(data_folder, "top_ranked.jsonl") + if data_folder + else "top_ranked.jsonl" + ) + self.top_ranked_file = ( + None + if not os.path.exists(self.top_ranked_file) + else self.top_ranked_file + ) + self.instructions_file = ( + os.path.join(data_folder, "instructions.jsonl") + if data_folder + else "instructions.jsonl" + ) + self.instructions_file = ( + None + if not os.path.exists(self.instructions_file) + else self.instructions_file + ) + self.streaming = streaming + self.keep_in_memory = keep_in_memory + + @staticmethod + def check(fIn: str, ext: str): + if not os.path.exists(fIn): + raise ValueError(f"File {fIn} not present! 
Please provide accurate file.") + + if not fIn.endswith(ext): + raise ValueError(f"File {fIn} must be present with extension {ext}") + + def load( + self, split: str = "test" + ) -> tuple[ + dict[str, dict[str, str]], # corpus + dict[str, str | list[str]], # queries + dict[str, dict[str, int]], # qrels/relevant_docs + dict[str, str | list[str]], # instructions (optional) + dict[str, list[str]] | dict[str, dict[str, float]], # top_ranked (optional) + ]: + if not self.hf_repo: + self.qrels_file = os.path.join(self.qrels_folder, split + ".tsv") + self.check(fIn=self.corpus_file, ext="jsonl") + self.check(fIn=self.query_file, ext="jsonl") + self.check(fIn=self.qrels_file, ext="tsv") + if self.top_ranked_file: + self.check(fIn=self.top_ranked_file, ext="jsonl") + if self.instructions_file: + self.check(fIn=self.instructions_file, ext="jsonl") + configs = [] + else: + configs = get_dataset_config_names(self.hf_repo) + + if not len(self.corpus): + logger.info("Loading Corpus...") + self._load_corpus() + logger.info("Loaded %d %s Documents.", len(self.corpus), split.upper()) + logger.info("Doc Example: %s", self.corpus[0]) + + if not len(self.queries): + logger.info("Loading Queries...") + self._load_queries() + + if "top_ranked" in configs or (not self.hf_repo and self.top_ranked_file): + logger.info("Loading Top Ranked") + self._load_top_ranked() + logger.info( + f"Top ranked loaded: {len(self.top_ranked) if self.top_ranked else 0}" + ) + else: + self.top_ranked = None + + if "instruction" in configs or (not self.hf_repo and self.instructions_file): + logger.info("Loading Instructions") + self._load_instructions() + logger.info( + f"Instructions loaded: {len(self.instructions) if self.instructions else 0}" + ) + else: + self.instructions = None + + self._load_qrels(split) + # filter queries with no qrels + qrels_dict = defaultdict(dict) + + def qrels_dict_init(row): + qrels_dict[row["query-id"]][row["corpus-id"]] = int(row["score"]) + + self.qrels.map(qrels_dict_init) + self.qrels = qrels_dict + self.queries = self.queries.filter(lambda x: x["id"] in self.qrels) + logger.info("Loaded %d %s Queries.", len(self.queries), split.upper()) + logger.info("Query Example: %s", self.queries[0]) + + return self.corpus, self.queries, self.qrels, self.instructions, self.top_ranked + + def load_corpus(self) -> dict[str, dict[str, str]]: + if not self.hf_repo: + self.check(fIn=self.corpus_file, ext="jsonl") + + if not len(self.corpus): + logger.info("Loading Corpus...") + self._load_corpus() + logger.info("Loaded %d %s Documents.", len(self.corpus)) + logger.info("Doc Example: %s", self.corpus[0]) + + return self.corpus + + def _load_corpus(self): + if self.hf_repo: + corpus_ds = load_dataset( + self.hf_repo, + "corpus", + keep_in_memory=self.keep_in_memory, + streaming=self.streaming, + ) + else: + corpus_ds = load_dataset( + "json", + data_files=self.corpus_file, + streaming=self.streaming, + keep_in_memory=self.keep_in_memory, + ) + corpus_ds = next(iter(corpus_ds.values())) # get first split + corpus_ds = corpus_ds.cast_column("_id", Value("string")) + corpus_ds = corpus_ds.rename_column("_id", "id") + corpus_ds = corpus_ds.remove_columns( + [ + col + for col in corpus_ds.column_names + if col not in ["id", "text", "title"] + ] + ) + self.corpus = corpus_ds + + def _load_queries(self): + if self.hf_repo: + queries_ds = load_dataset( + self.hf_repo, + "queries", + keep_in_memory=self.keep_in_memory, + streaming=self.streaming, + ) + else: + queries_ds = load_dataset( + "json", + data_files=self.query_file, 
+ streaming=self.streaming, + keep_in_memory=self.keep_in_memory, + ) + queries_ds = next(iter(queries_ds.values())) # get first split + queries_ds = queries_ds.cast_column("_id", Value("string")) + queries_ds = queries_ds.rename_column("_id", "id") + queries_ds = queries_ds.remove_columns( + [col for col in queries_ds.column_names if col not in ["id", "text"]] + ) + self.queries = queries_ds + + def _load_qrels(self, split): + if self.hf_repo: + qrels_ds = load_dataset( + self.hf_repo_qrels, + keep_in_memory=self.keep_in_memory, + streaming=self.streaming, + )[split] + else: + qrels_ds = load_dataset( + "csv", + data_files=self.qrels_file, + delimiter="\t", + keep_in_memory=self.keep_in_memory, + ) + features = Features( + { + "query-id": Value("string"), + "corpus-id": Value("string"), + "score": Value("float"), + } + ) + qrels_ds = qrels_ds.cast(features) + self.qrels = qrels_ds + + def _load_top_ranked(self): + if self.hf_repo: + top_ranked_ds = load_dataset( + self.hf_repo, + "top_ranked", + keep_in_memory=self.keep_in_memory, + streaming=self.streaming, + ) + else: + top_ranked_ds = load_dataset( + "json", + data_files=self.top_ranked_file, + streaming=self.streaming, + keep_in_memory=self.keep_in_memory, + ) + top_ranked_ds = next(iter(top_ranked_ds.values())) # get first split + if ( + "query-id" in top_ranked_ds.column_names + and "corpus-ids" in top_ranked_ds.column_names + ): + # is a {query-id: str, corpus-ids: list[str]} format + top_ranked_ds = top_ranked_ds.cast_column("query-id", Value("string")) + top_ranked_ds = top_ranked_ds.cast_column( + "corpus-ids", Sequence(Value("string")) + ) + else: + # is a {"query-id": {"corpus-id": score}} format, let's change it + top_ranked_ds = top_ranked_ds.map( + lambda x: {"query-id": x["query-id"], "corpus-ids": list(x.keys())}, + remove_columns=[ + col for col in top_ranked_ds.column_names if col != "query-id" + ], + ) + + top_ranked_ds = top_ranked_ds.remove_columns( + [ + col + for col in top_ranked_ds.column_names + if col not in ["query-id", "corpus-ids"] + ] + ) + self.top_ranked = top_ranked_ds + + def _load_instructions(self): + if self.hf_repo: + instructions_ds = load_dataset( + self.hf_repo, + "instruction", + keep_in_memory=self.keep_in_memory, + streaming=self.streaming, + ) + else: + instructions_ds = load_dataset( + "json", + data_files=self.instructions_file, + streaming=self.streaming, + keep_in_memory=self.keep_in_memory, + ) + instructions_ds = next(iter(instructions_ds.values())) + instructions_ds = instructions_ds.cast_column("query", Value("string")) + + # if instructions is a string, change it to a list[str] + if "instruction" in instructions_ds.column_names: + instructions_ds = instructions_ds.cast_column( + "instruction", Sequence(Value("string")) + ) + + instructions_ds = instructions_ds.remove_columns( + [ + col + for col in instructions_ds.column_names + if col not in ["query", "instruction"] + ] + ) + self.instructions = instructions_ds diff --git a/mteb/evaluation/evaluators/RetrievalEvaluator.py b/mteb/evaluation/evaluators/RetrievalEvaluator.py index 6212ee7e1c..aab07e2464 100644 --- a/mteb/evaluation/evaluators/RetrievalEvaluator.py +++ b/mteb/evaluation/evaluators/RetrievalEvaluator.py @@ -14,6 +14,7 @@ from .Evaluator import Evaluator from .utils import ( + add_task_specific_scores, confidence_scores, hole, mrr, @@ -61,7 +62,7 @@ def __call__( self, corpus: dict[str, dict[str, str]], queries: dict[str, str], - instructions: dict[str, str], + instructions: dict[str, str] | None = None, qid: str | None = 
None, **kwargs, ) -> dict[str, dict[str, float]]: @@ -103,6 +104,7 @@ def evaluate( results: dict[str, dict[str, float]], k_values: list[int], ignore_identical_ids: bool = False, + task_name: str = None, ) -> tuple[ dict[str, float], dict[str, float], @@ -164,8 +166,11 @@ def evaluate( naucs = RetrievalEvaluator.evaluate_abstention( results, {**all_ndcgs, **all_aps, **all_recalls, **all_precisions} ) + task_scores = add_task_specific_scores( + scores, qrels, results, task_name, k_values + ) - return ndcg, _map, recall, precision, naucs + return ndcg, _map, recall, precision, naucs, task_scores @staticmethod def evaluate_custom( diff --git a/mteb/evaluation/evaluators/utils.py b/mteb/evaluation/evaluators/utils.py index 95d84bd2f2..6574c2a2bc 100644 --- a/mteb/evaluation/evaluators/utils.py +++ b/mteb/evaluation/evaluators/utils.py @@ -1,6 +1,7 @@ from __future__ import annotations import logging +from collections import defaultdict import numpy as np import pandas as pd @@ -426,3 +427,94 @@ def abstention_curve( abst_nauc = (abst_auc - flat_auc) / (or_auc - flat_auc) return abst_nauc + + +def add_task_specific_scores( + scores: dict[str, float], + qrels: dict[str, dict[str, int]], + results: dict[str, dict[str, float]], + task_name: str, + k_values: list[int], # not needed now, but perhaps later? +) -> dict[str, float]: + """Add task-specific scores to the scores dictionary, that are not needed for all results but require additional computation.""" + task_scores = {} + if task_name in ["NevIR"]: + paired_score = paired_accuracy(qrels, results, scores) + task_scores["paired_accuracy"] = paired_score + + if task_name in ["InstructIR"]: + robustness_at_10_score = robustness_at_10(qrels, results, scores) + task_scores["robustness_at_10"] = robustness_at_10_score + + if task_name in [ + "mFollowIRInstructionReranking", + "mFollowIRCrossLingualInstructionReranking", + "Robust04InstructionRetrieval", + "Core17InstructionRetrieval", + "News21InstructionRetrieval", + ]: + p_mrr = evaluate_change(results, scores, qrels) + task_scores["p-MRR"] = p_mrr["p-MRR"] + + return task_scores + + +def paired_accuracy( + qrels: dict[str, dict[str, float]], + results: dict[str, dict[str, float]], + scores: dict[str, float], +) -> float: + """Computes the paired accuracy. This means both queries for an instance have to be correct for it to count. + This is because models will prefer one passage all the time, giving it 50% automatically unless we correct for this. + For more details, see https://arxiv.org/abs/2305.07614 + + Args: + qrels: Ground truth relevance judgments for the queries + results: Predicted relevance scores for the queries + scores: The scores for the queries, to extract top_1 recall for each query + """ + # group the queries by the query id + query_keys = set() + for key in qrels.keys(): + query_keys.add(key.split("_")[0]) + + paired_scores = [] + for key in query_keys: + # get recall_at_1 for both q1 and q2 + q1_recall_at_1 = scores[f"{key}_q1"]["recall_1"] + q2_recall_at_1 = scores[f"{key}_q2"]["recall_1"] + + # the score is 1 if both are 1, 0 otherwise + paired_scores.append(1 if q1_recall_at_1 == 1 and q2_recall_at_1 == 1 else 0) + + return sum(paired_scores) / len(paired_scores) + + +def robustness_at_10( + qrels: dict[str, dict[str, float]], + results: dict[str, dict[str, float]], + scores: dict[str, float], +) -> float: + """Computes the robustness at 10. This computes the lowest ndcg@10 over all instructions. 
Taken from https://arxiv.org/abs/2402.14334
+
+    Args:
+        qrels: Ground truth relevance judgments for the queries
+        results: Predicted relevance scores for the queries
+        scores: The scores for the queries, to extract ndcg@10 for each query
+    """
+    query_keys = defaultdict(list)
+    for key in qrels.keys():
+        query_keys[key.split("_")[0]].append(key)
+
+    robustness_scores = []
+    for _, keys in query_keys.items():
+        # get the ndcg@10 for each query
+        current_scores = []
+        for key in keys:
+            current_scores.append(scores[key]["ndcg_cut_10"])
+
+        # get the lowest ndcg@10
+        robustness_scores.append(min(current_scores))
+
+    return sum(robustness_scores) / len(robustness_scores)
diff --git a/mteb/models/instructions.py b/mteb/models/instructions.py
index 4a31f8da02..88369e1253 100644
--- a/mteb/models/instructions.py
+++ b/mteb/models/instructions.py
@@ -16,6 +16,7 @@
     "Reranking": "Retrieve text based on user query.",
     "Retrieval": "Retrieve text based on user query.",
     "InstructionRetrieval": "Retrieve text based on user query.",
+    "InstructionReranking": "Retrieve text based on user query.",
     "PairClassification": "Retrieve text that are semantically similar to the given text",
 }
 
diff --git a/mteb/tasks/InstructionReranking/__init__.py b/mteb/tasks/InstructionReranking/__init__.py
new file mode 100644
index 0000000000..f5e812247d
--- /dev/null
+++ b/mteb/tasks/InstructionReranking/__init__.py
@@ -0,0 +1,6 @@
+from __future__ import annotations
+
+from .eng.Core17InstructionRetrieval import *
+from .eng.News21InstructionRetrieval import *
+from .eng.Robust04InstructionRetrieval import *
+from .multilingual.mFollowIR import *
diff --git a/mteb/tasks/InstructionRetrieval/eng/Core17InstructionRetrieval.py b/mteb/tasks/InstructionReranking/eng/Core17InstructionRetrieval.py
similarity index 92%
rename from mteb/tasks/InstructionRetrieval/eng/Core17InstructionRetrieval.py
rename to mteb/tasks/InstructionReranking/eng/Core17InstructionRetrieval.py
index 9b52f282b2..0000969419 100644
--- a/mteb/tasks/InstructionRetrieval/eng/Core17InstructionRetrieval.py
+++ b/mteb/tasks/InstructionReranking/eng/Core17InstructionRetrieval.py
@@ -2,10 +2,10 @@
 
 from mteb.abstasks.TaskMetadata import TaskMetadata
 
-from ....abstasks.AbsTaskInstructionRetrieval import AbsTaskInstructionRetrieval
+from ....abstasks.AbsTaskReranking import AbsTaskReranking
 
 
-class Core17InstructionRetrieval(AbsTaskInstructionRetrieval):
+class Core17InstructionRetrieval(AbsTaskReranking):
     metadata = TaskMetadata(
         name="Core17InstructionRetrieval",
         description="Measuring retrieval instruction following ability on Core17 narratives for the FollowIR benchmark.",
diff --git a/mteb/tasks/InstructionRetrieval/eng/News21InstructionRetrieval.py b/mteb/tasks/InstructionReranking/eng/News21InstructionRetrieval.py
similarity index 88%
rename from mteb/tasks/InstructionRetrieval/eng/News21InstructionRetrieval.py
rename to mteb/tasks/InstructionReranking/eng/News21InstructionRetrieval.py
index d693091279..7c3b5b5860 100644
--- a/mteb/tasks/InstructionRetrieval/eng/News21InstructionRetrieval.py
+++ b/mteb/tasks/InstructionReranking/eng/News21InstructionRetrieval.py
@@ -2,10 +2,10 @@
 
 from mteb.abstasks.TaskMetadata import TaskMetadata
 
-from ....abstasks.AbsTaskInstructionRetrieval import AbsTaskInstructionRetrieval
+from ....abstasks.AbsTaskReranking import AbsTaskReranking
 
 
-class News21InstructionRetrieval(AbsTaskInstructionRetrieval):
+class News21InstructionRetrieval(AbsTaskReranking):
     metadata = TaskMetadata(
         name="News21InstructionRetrieval",
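
A rough sketch of how the two task-specific helpers above behave, using toy inputs shaped the way they expect (per-query score dicts keyed by measure name; the ids and values are made up):

    from mteb.evaluation.evaluators.utils import paired_accuracy, robustness_at_10

    # NevIR-style paired accuracy: both queries of an instance ("<id>_q1" and "<id>_q2")
    # must reach recall@1 == 1 for the pair to count as correct.
    qrels = {"42_q1": {"d1": 1}, "42_q2": {"d2": 1}}
    per_query = {"42_q1": {"recall_1": 1.0}, "42_q2": {"recall_1": 0.0}}
    paired_accuracy(qrels, results={}, scores=per_query)  # -> 0.0, only one of the pair succeeded

    # InstructIR-style robustness@10: the lowest ndcg@10 across the instruction
    # variants of each query, averaged over queries.
    qrels = {"7_0": {"d1": 1}, "7_1": {"d1": 1}}
    per_query = {"7_0": {"ndcg_cut_10": 0.9}, "7_1": {"ndcg_cut_10": 0.4}}
    robustness_at_10(qrels, results={}, scores=per_query)  # -> 0.4
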
description="Measuring retrieval instruction following ability on News21 narratives for the FollowIR benchmark.", @@ -14,7 +14,7 @@ class News21InstructionRetrieval(AbsTaskInstructionRetrieval): "path": "jhu-clsp/news21-instructions", "revision": "e0144086b45fe31ac125e9ac1a83b6a409bb6ca6", }, - type="InstructionRetrieval", + type="InstructionReranking", category="s2p", modalities=["text"], eval_splits=["test"], diff --git a/mteb/tasks/InstructionRetrieval/eng/Robust04InstructionRetrieval.py b/mteb/tasks/InstructionReranking/eng/Robust04InstructionRetrieval.py similarity index 88% rename from mteb/tasks/InstructionRetrieval/eng/Robust04InstructionRetrieval.py rename to mteb/tasks/InstructionReranking/eng/Robust04InstructionRetrieval.py index c68dfabc18..3a54496555 100644 --- a/mteb/tasks/InstructionRetrieval/eng/Robust04InstructionRetrieval.py +++ b/mteb/tasks/InstructionReranking/eng/Robust04InstructionRetrieval.py @@ -2,10 +2,10 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -from ....abstasks.AbsTaskInstructionRetrieval import AbsTaskInstructionRetrieval +from ....abstasks.AbsTaskReranking import AbsTaskReranking -class Robust04InstructionRetrieval(AbsTaskInstructionRetrieval): +class Robust04InstructionRetrieval(AbsTaskReranking): metadata = TaskMetadata( name="Robust04InstructionRetrieval", description="Measuring retrieval instruction following ability on Robust04 narratives for the FollowIR benchmark.", @@ -14,7 +14,7 @@ class Robust04InstructionRetrieval(AbsTaskInstructionRetrieval): "path": "jhu-clsp/robust04-instructions", "revision": "a5a1c4fe2bc528ac12e83f8cdf82178da85d2f1d", }, - type="InstructionRetrieval", + type="InstructionReranking", category="s2p", modalities=["text"], eval_splits=["test"], diff --git a/mteb/tasks/InstructionRetrieval/multilingual/__init__.py b/mteb/tasks/InstructionReranking/eng/__init__.py similarity index 100% rename from mteb/tasks/InstructionRetrieval/multilingual/__init__.py rename to mteb/tasks/InstructionReranking/eng/__init__.py diff --git a/mteb/tasks/InstructionReranking/multilingual/__init__.py b/mteb/tasks/InstructionReranking/multilingual/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/mteb/tasks/InstructionRetrieval/multilingual/mFollowIR.py b/mteb/tasks/InstructionReranking/multilingual/mFollowIR.py similarity index 97% rename from mteb/tasks/InstructionRetrieval/multilingual/mFollowIR.py rename to mteb/tasks/InstructionReranking/multilingual/mFollowIR.py index 04c1a56e19..d7ade8abf0 100644 --- a/mteb/tasks/InstructionRetrieval/multilingual/mFollowIR.py +++ b/mteb/tasks/InstructionReranking/multilingual/mFollowIR.py @@ -7,7 +7,7 @@ from mteb.abstasks.MultilingualTask import MultilingualTask from mteb.abstasks.TaskMetadata import TaskMetadata -from ....abstasks.AbsTaskInstructionRetrieval import AbsTaskInstructionRetrieval +from ....abstasks.AbsTaskReranking import AbsTaskReranking _LANGUAGES = { "fas": ["fas-Arab"], @@ -172,9 +172,9 @@ def load_data( ) -class mFollowIRCrossLingual(MultilingualTask, AbsTaskInstructionRetrieval): +class mFollowIRCrossLingual(MultilingualTask, AbsTaskReranking): metadata = TaskMetadata( - name="mFollowIRCrossLingualInstructionRetrieval", + name="mFollowIRCrossLingualInstructionReranking", description="This tasks measures retrieval instruction following ability on NeuCLIR narratives for the mFollowIR benchmark on the Farsi, Russian, and Chinese languages with English queries/instructions.", reference="https://neuclir.github.io/", dataset={ @@ -270,9 +270,9 @@ def 
load_data(self, **kwargs): self.data_loaded = True -class mFollowIR(MultilingualTask, AbsTaskInstructionRetrieval): +class mFollowIR(MultilingualTask, AbsTaskReranking): metadata = TaskMetadata( - name="mFollowIRInstructionRetrieval", + name="mFollowIRInstructionReranking", description="This tasks measures retrieval instruction following ability on NeuCLIR narratives for the mFollowIR benchmark on the Farsi, Russian, and Chinese languages.", reference="https://neuclir.github.io/", dataset={ diff --git a/mteb/tasks/InstructionRetrieval/__init__.py b/mteb/tasks/InstructionRetrieval/__init__.py index f5e812247d..c8454858dd 100644 --- a/mteb/tasks/InstructionRetrieval/__init__.py +++ b/mteb/tasks/InstructionRetrieval/__init__.py @@ -1,6 +1,3 @@ from __future__ import annotations -from .eng.Core17InstructionRetrieval import * -from .eng.News21InstructionRetrieval import * -from .eng.Robust04InstructionRetrieval import * -from .multilingual.mFollowIR import * +from .eng.InstructIR import * diff --git a/mteb/tasks/InstructionRetrieval/eng/InstructIR.py b/mteb/tasks/InstructionRetrieval/eng/InstructIR.py new file mode 100644 index 0000000000..e4e2110817 --- /dev/null +++ b/mteb/tasks/InstructionRetrieval/eng/InstructIR.py @@ -0,0 +1,49 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval + + +class InstructIR(AbsTaskRetrieval): + metadata = TaskMetadata( + name="InstructIR", + description="A benchmark specifically designed to evaluate the instruction following ability in information retrieval models. Our approach focuses on user-aligned instructions tailored to each query instance, reflecting the diverse characteristics inherent in real-world search scenarios.", + reference="https://github.com/kaistAI/InstructIR/tree/main", + dataset={ + "path": "mteb/InstructIR-mteb", + "revision": "6b68698b2cd34aed28a64c8917605019f065a6c5", + }, + type="Reranking", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="robustness_at_10", + date=("2024-02-05", "2024-02-06"), + domains=["Web"], + task_subtypes=["Article retrieval"], + license="mit", + annotations_creators="human-annotated", + dialect=[], + sample_creation="created", + bibtex_citation="""@article{oh2024instructir, + title={{INSTRUCTIR: A Benchmark for Instruction Following of Information Retrieval Models}}, + author={{Hanseok Oh and Hyunji Lee and Seonghyeon Ye and Haebin Shin and Hansol Jang and Changwook Jun and Minjoon Seo}}, + year={{2024}}, + eprint={{2402.14334}}, + archivePrefix={{arXiv}}, + primaryClass={{cs.CL}} +}""", + descriptive_stats={ + "n_samples": {"test": 2255}, + "test": { + "num_samples": 375, + "num_positive": 375, + "num_negative": 375, + "avg_query_len": 50.205333333333336, + "avg_positive_len": 6.013333333333334, + "avg_negative_len": 13.986666666666666, + }, + }, + ) diff --git a/mteb/tasks/Reranking/__init__.py b/mteb/tasks/Reranking/__init__.py index a4b302a17f..18dbd53f43 100644 --- a/mteb/tasks/Reranking/__init__.py +++ b/mteb/tasks/Reranking/__init__.py @@ -2,6 +2,7 @@ from .eng.AskUbuntuDupQuestions import * from .eng.MindSmallReranking import * +from .eng.NevIR import * from .eng.SciDocsReranking import * from .eng.StackOverflowDupQuestions import * from .eng.WebLINXCandidatesReranking import * diff --git a/mteb/tasks/Reranking/eng/NevIR.py b/mteb/tasks/Reranking/eng/NevIR.py new file mode 100644 index 0000000000..c5810b62b3 --- /dev/null +++ 
b/mteb/tasks/Reranking/eng/NevIR.py @@ -0,0 +1,48 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskReranking import AbsTaskReranking + + +class NevIR(AbsTaskReranking): + metadata = TaskMetadata( + name="NevIR", + description="Paired evaluation of real world negation in retrieval, with questions and passages. Since models generally prefer one passage over the other always, there are two questions that the model must get right to understand the negation (hence the `paired_accuracy` metric).", + reference="https://github.com/orionw/NevIR", + dataset={ + "path": "orionweller/NevIR-mteb", + "revision": "eab99575c01c6a8e39f8d2adc6e3c3adcfe84413", + }, + type="Reranking", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="paired_accuracy", + date=("2023-05-12", "2023-09-28"), + domains=["Web"], + task_subtypes=["Article retrieval"], + license="mit", + annotations_creators="human-annotated", + dialect=[], + sample_creation="created", + bibtex_citation="""@inproceedings{Weller2023NevIRNI, + title={{NevIR: Negation in Neural Information Retrieval}}, + author={{Orion Weller and Dawn J Lawrie and Benjamin Van Durme}}, + booktitle={{Conference of the European Chapter of the Association for Computational Linguistics}}, + year={{2023}}, + url={{https://api.semanticscholar.org/CorpusID:258676146}} +}""", + descriptive_stats={ + "n_samples": {"test": 2255}, + "test": { + "num_samples": 375, + "num_positive": 375, + "num_negative": 375, + "avg_query_len": 50.205333333333336, + "avg_positive_len": 6.013333333333334, + "avg_negative_len": 13.986666666666666, + }, + }, + ) diff --git a/mteb/tasks/Reranking/eng/WebLINXCandidatesReranking.py b/mteb/tasks/Reranking/eng/WebLINXCandidatesReranking.py index adbacffd34..2d9b528479 100644 --- a/mteb/tasks/Reranking/eng/WebLINXCandidatesReranking.py +++ b/mteb/tasks/Reranking/eng/WebLINXCandidatesReranking.py @@ -1,7 +1,5 @@ from __future__ import annotations -import datasets - from mteb.abstasks.TaskMetadata import TaskMetadata from ....abstasks.AbsTaskReranking import AbsTaskReranking @@ -29,7 +27,7 @@ class WebLINXCandidatesReranking(AbsTaskReranking): "test_web", ], eval_langs=["eng-Latn"], - main_score="mrr_at_1000", + main_score="mrr_at_10", date=("2023-03-01", "2023-10-30"), domains=["Academic", "Web", "Written"], task_subtypes=["Code retrieval", "Conversational retrieval"], @@ -66,22 +64,3 @@ class WebLINXCandidatesReranking(AbsTaskReranking): }, }, ) - - def load_data(self, **kwargs): - if self.data_loaded: - return - - self._datasets = {} - - for split in self.metadata.eval_splits: - self._datasets[split] = datasets.load_dataset( - split=split, **self.metadata_dict["dataset"] - ) - - self.dataset = datasets.DatasetDict( - {split: self._datasets[split] for split in self.metadata.eval_splits} - ) - - self.dataset_transform() - - self.data_loaded = True diff --git a/mteb/tasks/Reranking/fra/AlloprofReranking.py b/mteb/tasks/Reranking/fra/AlloprofReranking.py index 3ebce4f87b..54cada1a94 100644 --- a/mteb/tasks/Reranking/fra/AlloprofReranking.py +++ b/mteb/tasks/Reranking/fra/AlloprofReranking.py @@ -70,4 +70,7 @@ def load_data(self, **kwargs): self.dataset_transform() + # now convert to the new format + self.transform_old_dataset_format(self.dataset) + self.data_loaded = True diff --git a/mteb/tasks/Reranking/fra/SyntecReranking.py b/mteb/tasks/Reranking/fra/SyntecReranking.py index 9fddfeccdd..ac6466e458 100644 --- 
a/mteb/tasks/Reranking/fra/SyntecReranking.py +++ b/mteb/tasks/Reranking/fra/SyntecReranking.py @@ -65,4 +65,7 @@ def load_data(self, **kwargs): self.dataset_transform() + # now convert to the new format + self.transform_old_dataset_format(self.dataset) + self.data_loaded = True diff --git a/mteb/tasks/Reranking/jpn/MMarcoReranking.py b/mteb/tasks/Reranking/jpn/MMarcoReranking.py index 16da6b3063..86c22a1d26 100644 --- a/mteb/tasks/Reranking/jpn/MMarcoReranking.py +++ b/mteb/tasks/Reranking/jpn/MMarcoReranking.py @@ -1,6 +1,7 @@ from __future__ import annotations from mteb.abstasks.AbsTaskReranking import AbsTaskReranking +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval from mteb.abstasks.TaskMetadata import TaskMetadata @@ -38,7 +39,14 @@ class VoyageMMarcoReranking(AbsTaskReranking): }, ) - def dataset_transform(self): + def load_data(self, **kwargs): + if self.data_loaded: + return + + # since AbsTaskReranking has no `load_data` method, we call the parent class method + super(AbsTaskRetrieval, self).load_data(**kwargs) + + # now fix the column names self.dataset = self.dataset.rename_column( "positives", "positive" ).rename_column("negatives", "negative") @@ -46,3 +54,8 @@ def dataset_transform(self): self.dataset["test"] = self.dataset.pop("train").train_test_split( test_size=2048, seed=self.seed )["test"] + + # now convert to the new format + self.transform_old_dataset_format(self.dataset) + + self.data_loaded = True diff --git a/mteb/tasks/Reranking/multilingual/ESCIReranking.py b/mteb/tasks/Reranking/multilingual/ESCIReranking.py index 9eb21c1961..3c0db2dcac 100644 --- a/mteb/tasks/Reranking/multilingual/ESCIReranking.py +++ b/mteb/tasks/Reranking/multilingual/ESCIReranking.py @@ -24,7 +24,7 @@ }""" -class ESCIReranking(MultilingualTask, AbsTaskReranking): +class ESCIReranking(AbsTaskReranking, MultilingualTask): metadata = TaskMetadata( name="ESCIReranking", description="", diff --git a/mteb/tasks/Reranking/multilingual/MIRACLReranking.py b/mteb/tasks/Reranking/multilingual/MIRACLReranking.py index f59530dc5c..00048688f5 100644 --- a/mteb/tasks/Reranking/multilingual/MIRACLReranking.py +++ b/mteb/tasks/Reranking/multilingual/MIRACLReranking.py @@ -1,15 +1,9 @@ from __future__ import annotations import logging -from typing import Any - -from datasets import Dataset from mteb.abstasks.MultilingualTask import MultilingualTask from mteb.abstasks.TaskMetadata import TaskMetadata -from mteb.encoder_interface import Encoder -from mteb.evaluation.evaluators.RetrievalEvaluator import RetrievalEvaluator -from mteb.load_results.task_results import ScoresDict from ....abstasks.AbsTaskReranking import AbsTaskReranking @@ -50,7 +44,7 @@ }""" -class MIRACLReranking(MultilingualTask, AbsTaskReranking): +class MIRACLReranking(AbsTaskReranking, MultilingualTask): metadata = TaskMetadata( name="MIRACLReranking", description="MIRACL (Multilingual Information Retrieval Across a Continuum of Languages) is a multilingual retrieval dataset that focuses on search across 18 different languages.", @@ -65,7 +59,7 @@ class MIRACLReranking(MultilingualTask, AbsTaskReranking): modalities=["text"], eval_splits=[_EVAL_SPLIT], eval_langs=_LANGUAGES, - main_score="ndcg_cut_10", + main_score="ndcg_at_10", date=("2022-06-01", "2023-01-30"), domains=["Encyclopaedic", "Written"], task_subtypes=[], @@ -79,24 +73,3 @@ class MIRACLReranking(MultilingualTask, AbsTaskReranking): "avg_character_length": {"dev": 506.30}, }, ) - - def _evaluate_subset( - self, - model: Encoder, - data_split: Dataset, - *, - 
encode_kwargs: dict[str, Any] = {}, - **kwargs: Any, - ) -> ScoresDict: - # TODO: this file will need to be fixed - evaluator = RetrievalEvaluator( - samples=data_split, - evaluator_type="miracl", - task_name=self.metadata.name, - encode_kwargs=encode_kwargs, - **kwargs, - ) - scores = evaluator(model) - - self._add_main_score(scores) - return scores diff --git a/mteb/tasks/Reranking/multilingual/WikipediaRerankingMultilingual.py b/mteb/tasks/Reranking/multilingual/WikipediaRerankingMultilingual.py index 59332cdb2f..632a20d4e3 100644 --- a/mteb/tasks/Reranking/multilingual/WikipediaRerankingMultilingual.py +++ b/mteb/tasks/Reranking/multilingual/WikipediaRerankingMultilingual.py @@ -25,7 +25,7 @@ } -class WikipediaRerankingMultilingual(MultilingualTask, AbsTaskReranking): +class WikipediaRerankingMultilingual(AbsTaskReranking, MultilingualTask): metadata = TaskMetadata( name="WikipediaRerankingMultilingual", description="The dataset is derived from Cohere's wikipedia-2023-11 dataset and contains synthetically generated queries.", diff --git a/mteb/tasks/Reranking/zho/CMTEBReranking.py b/mteb/tasks/Reranking/zho/CMTEBReranking.py index 42ef5a98c8..f8b6a4fc49 100644 --- a/mteb/tasks/Reranking/zho/CMTEBReranking.py +++ b/mteb/tasks/Reranking/zho/CMTEBReranking.py @@ -88,7 +88,7 @@ class CMedQAv1(AbsTaskReranking): modalities=["text"], eval_splits=["test"], eval_langs=["cmn-Hans"], - main_score="map", + main_score="map_at_1000", date=("2017-01-01", "2017-07-26"), domains=["Medical", "Written"], task_subtypes=[], diff --git a/mteb/tasks/__init__.py b/mteb/tasks/__init__.py index dfe568bb89..72c357606f 100644 --- a/mteb/tasks/__init__.py +++ b/mteb/tasks/__init__.py @@ -3,6 +3,7 @@ from .BitextMining import * from .Classification import * from .Clustering import * +from .InstructionReranking import * from .InstructionRetrieval import * from .MultiLabelClassification import * from .PairClassification import * diff --git a/scripts/running_model/check_results.py b/scripts/running_model/check_results.py index c410fb5be7..09f95fcd22 100644 --- a/scripts/running_model/check_results.py +++ b/scripts/running_model/check_results.py @@ -119,6 +119,7 @@ def normalize_results(results): "Summarization", "Clustering", "InstructionRetrieval", + "InstructionReranking", "Retrieval", ] ) diff --git a/scripts/running_model/create_slurm_jobs.py b/scripts/running_model/create_slurm_jobs.py index 606630d9e5..ce3a03df92 100644 --- a/scripts/running_model/create_slurm_jobs.py +++ b/scripts/running_model/create_slurm_jobs.py @@ -100,6 +100,7 @@ def run_slurm_jobs(files: list[Path]) -> None: "Reranking", "Retrieval", "InstructionRetrieval", + "InstructionReranking", "STS", "Summarization", ], diff --git a/tests/test_benchmark/mock_tasks.py b/tests/test_benchmark/mock_tasks.py index 60c1a205a0..9143c6c090 100644 --- a/tests/test_benchmark/mock_tasks.py +++ b/tests/test_benchmark/mock_tasks.py @@ -9,7 +9,6 @@ from mteb.abstasks.AbsTaskClassification import AbsTaskClassification from mteb.abstasks.AbsTaskClustering import AbsTaskClustering from mteb.abstasks.AbsTaskClusteringFast import AbsTaskClusteringFast -from mteb.abstasks.AbsTaskInstructionRetrieval import AbsTaskInstructionRetrieval from mteb.abstasks.AbsTaskMultilabelClassification import ( AbsTaskMultilabelClassification, ) @@ -864,42 +863,47 @@ class MockRerankingTask(AbsTaskReranking): metadata = TaskMetadata( type="Reranking", name="MockRerankingTask", - main_score="map", + main_score="map_at_1000", descriptive_stats={ "test": { - "num_samples": 2, - 
"num_positive": 2, - "num_negative": 2, - "avg_query_len": 26.0, - "avg_positive_len": 30.0, - "avg_negative_len": 30.0, + "average_document_length": 27.0, + "average_query_length": 26.0, + "num_documents": 2, + "num_queries": 2, + "average_relevant_docs_per_query": 1.0, } }, **general_args, # type: ignore ) def load_data(self, **kwargs): - query = ["This is a test sentence", "This is another test sentence"] - positive = [ - "This is a positive sentence", - "This is another positive sentence", - ] - negative = [ - "This is a negative sentence", - "This is another negative sentence", - ] - - self.dataset = DatasetDict( - { - "test": Dataset.from_dict( - { - "query": query, - "positive": positive, - "negative": negative, - } - ), + self.queries = { + "test": { + "q1": "This is a test sentence", + "q2": "This is another test sentence", } - ) + } + self.corpus = { + "test": { + "d1": "This is a positive sentence", + "d2": "This is a negative sentence", + } + } + + self.relevant_docs = { + "test": { + "q1": {"d1": 1, "d2": 0}, + "q2": {"d1": 0, "d2": 1}, + }, + } + + self.top_ranked = { + "test": { + "q1": ["d1", "d2"], + "q2": ["d2", "d1"], + }, + } + self.instructions = None self.data_loaded = True @@ -907,31 +911,28 @@ class MockMultilingualRerankingTask(AbsTaskReranking, MultilingualTask): metadata = TaskMetadata( type="Reranking", name="MockMultilingualRerankingTask", - main_score="map", + main_score="map_at_10", descriptive_stats={ "test": { - "num_samples": 4, - "num_positive": 4, - "num_negative": 4, - "avg_query_len": 26.0, - "avg_positive_len": 30.0, - "avg_negative_len": 30.0, + "average_document_length": 30.0, + "average_query_length": 26.0, + "num_documents": 4, + "num_queries": 4, + "average_relevant_docs_per_query": 1.0, "hf_subset_descriptive_stats": { "eng": { - "num_samples": 2, - "num_positive": 2, - "num_negative": 2, - "avg_query_len": 26.0, - "avg_positive_len": 30.0, - "avg_negative_len": 30.0, + "average_document_length": 30.0, + "average_query_length": 26.0, + "num_documents": 2, + "num_queries": 2, + "average_relevant_docs_per_query": 1.0, }, "fra": { - "num_samples": 2, - "num_positive": 2, - "num_negative": 2, - "avg_query_len": 26.0, - "avg_positive_len": 30.0, - "avg_negative_len": 30.0, + "average_document_length": 30.0, + "average_query_length": 26.0, + "num_documents": 2, + "num_queries": 2, + "average_relevant_docs_per_query": 1.0, }, }, } @@ -941,30 +942,42 @@ class MockMultilingualRerankingTask(AbsTaskReranking, MultilingualTask): metadata.eval_langs = multilingual_eval_langs def load_data(self, **kwargs): - query = ["This is a test sentence", "This is another test sentence"] - positive = [ - "This is a positive sentence", - "This is another positive sentence", - ] - negative = [ - "This is a negative sentence", - "This is another negative sentence", - ] - data = { - "test": Dataset.from_dict( - { - "query": query, - "positive": positive, - "negative": negative, - } - ), + queries = { + "test": { + "q1": "This is a test sentence", + "q2": "This is another test sentence", + } } - self.dataset = DatasetDict( - { - "eng": data, - "fra": data, + self.queries = {"eng": queries, "fra": queries} + corpus = { + "test": { + "d1": "This is a positive sentence", + "d2": "This is another positive sentence", } - ) + } + self.corpus = {"eng": corpus, "fra": corpus} + + relevant_docs = { + "test": { + "q1": {"d1": 1, "d2": 0}, + "q2": {"d1": 0, "d2": 1}, + }, + } + self.relevant_docs = { + "eng": relevant_docs, + "fra": relevant_docs, + } + top_ranked = { + "test": { + 
"q1": ["d1", "d2"], + "q2": ["d2", "d1"], + }, + } + self.top_ranked = { + "eng": top_ranked, + "fra": top_ranked, + } + self.instructions = None self.data_loaded = True @@ -1005,6 +1018,8 @@ def load_data(self, **kwargs): "q2": {"d1": 0, "d2": 1}, }, } + self.top_ranked = None + self.instructions = None self.data_loaded = True @@ -1068,6 +1083,8 @@ def load_data(self, **kwargs): "eng": relevant_docs, "fra": relevant_docs, } + self.top_ranked = None + self.instructions = None self.data_loaded = True @@ -1175,10 +1192,10 @@ def load_data(self, **kwargs): self.data_loaded = True -class MockInstructionRetrival(AbsTaskInstructionRetrieval): +class MockInstructionRetrival(AbsTaskReranking): do_length_ablation = True metadata = TaskMetadata( - type="InstructionRetrieval", + type="InstructionReranking", name="MockInstructionRetrival", main_score="p-MRR", descriptive_stats={ @@ -1241,28 +1258,13 @@ def load_data(self, **kwargs): "q2": ["d2", "d1"], } } - - self.keywords = { - "test": { - "This is a test sentence": "test1", - "This is another test sentence": "test2", - } - } - self.short_instructions = { - "test": { - "This is a test sentence": "short1", - "This is another test sentence": "short2", - } - } self.data_loaded = True -class MockMultilingualInstructionRetrival( - AbsTaskInstructionRetrieval, MultilingualTask -): +class MockMultilingualInstructionRetrival(AbsTaskReranking, MultilingualTask): do_length_ablation = True metadata = TaskMetadata( - type="InstructionRetrieval", + type="InstructionReranking", name="MockMultilingualInstructionRetrival", main_score="p-MRR", descriptive_stats={ @@ -1377,25 +1379,4 @@ def load_data(self, **kwargs): "eng": top_ranked, "fra": top_ranked, } - - keywords = { - "test": { - "This is a test sentence": "test1", - "This is another test sentence": "test2", - } - } - self.keywords = { - "eng": keywords, - "fra": keywords, - } - short_instructions = { - "test": { - "This is a test sentence": "short1", - "This is another test sentence": "short2", - } - } - self.short_instructions = { - "eng": short_instructions, - "fra": short_instructions, - } self.data_loaded = True diff --git a/tests/test_evaluators/test_InstructionRetrievalEvaluator.py b/tests/test_evaluators/test_InstructionRetrievalEvaluator.py index 5c0b4eb854..1711a641bf 100644 --- a/tests/test_evaluators/test_InstructionRetrievalEvaluator.py +++ b/tests/test_evaluators/test_InstructionRetrievalEvaluator.py @@ -5,14 +5,14 @@ from tests.test_benchmark.mock_models import MockNumpyEncoder -class TestInstructionRetrievalEvaluation: +class TestInstructionMetricsEvaluation: def setup_method(self): """Setup any state tied to the execution of the given method in a class. setup_method is invoked for every test method of a class. 
""" # checks that it loads - self.evaluator = RetrievalEvaluator.RetrievalEvaluator( + self.evaluator = RetrievalEvaluator( SentenceTransformerWrapper(MockNumpyEncoder()), task_name="test" ) diff --git a/tests/test_reproducible_workflow.py b/tests/test_reproducible_workflow.py index afe9f98c8d..6d34d104ba 100644 --- a/tests/test_reproducible_workflow.py +++ b/tests/test_reproducible_workflow.py @@ -46,6 +46,7 @@ def test_reproducibility_workflow(task_name: str, model_name: str, model_revisio "STS", "Summarization", "InstructionRetrieval", + "InstructionReranking", "Speed", ], ) diff --git a/tests/test_tasks/test_all_abstasks.py b/tests/test_tasks/test_all_abstasks.py index 6c00a2d5e0..208e7221c3 100644 --- a/tests/test_tasks/test_all_abstasks.py +++ b/tests/test_tasks/test_all_abstasks.py @@ -10,7 +10,7 @@ import mteb from mteb import MTEB from mteb.abstasks import AbsTask -from mteb.abstasks.AbsTaskInstructionRetrieval import AbsTaskInstructionRetrieval +from mteb.abstasks.AbsTaskReranking import AbsTaskReranking from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval from mteb.abstasks.AbsTaskSpeedTask import AbsTaskSpeedTask from mteb.abstasks.MultiSubsetLoader import MultiSubsetLoader @@ -34,7 +34,7 @@ def test_load_data( # TODO: We skip because this load_data is completely different. if ( isinstance(task, AbsTaskRetrieval) - or isinstance(task, AbsTaskInstructionRetrieval) + or isinstance(task, AbsTaskReranking) or isinstance(task, MultiSubsetLoader) or isinstance(task, AbsTaskSpeedTask) ): From 45df270c46f020f33222dc187896411bb289a624 Mon Sep 17 00:00:00 2001 From: Orion Weller Date: Tue, 5 Nov 2024 16:39:41 -0500 Subject: [PATCH 05/16] working instructions; just need cleanup --- docs/tasks.md | 2 +- mteb/abstasks/AbsTaskReranking.py | 11 +- mteb/abstasks/AbsTaskRetrieval.py | 51 +--- mteb/abstasks/dataloaders.py | 12 +- .../evaluators/RetrievalEvaluator.py | 52 ++-- mteb/evaluation/evaluators/model_classes.py | 19 +- mteb/evaluation/evaluators/utils.py | 189 ++++++++++++-- mteb/model_meta.py | 2 +- mteb/models/sentence_transformers_models.py | 17 ++ .../eng/Core17InstructionRetrieval.py | 6 +- .../eng/News21InstructionRetrieval.py | 4 +- .../eng/Robust04InstructionRetrieval.py | 4 +- .../multilingual/mFollowIR.py | 110 +++----- .../InstructionRetrieval/eng/InstructIR.py | 4 +- tests/test_benchmark/mock_tasks.py | 245 ++++++++++++++---- tests/test_benchmark/task_grid.py | 12 +- tests/test_benchmark/test_benchmark.py | 8 +- .../test_InstructionRetrievalEvaluator.py | 30 ++- .../test_RetrievalEvaluator.py | 7 +- 19 files changed, 516 insertions(+), 269 deletions(-) diff --git a/docs/tasks.md b/docs/tasks.md index 18def6df57..04b07a3aed 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -593,7 +593,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [YelpReviewFullClassification](https://arxiv.org/abs/1509.01626) (Zhang et al., 2015) | ['eng'] | Classification | s2s | [Reviews, Written] | {'test': 50000} | {} | | [YueOpenriceReviewClassification](https://github.com/Christainx/Dataset_Cantonese_Openrice) (Xiang et al., 2019) | ['yue'] | Classification | s2s | [Reviews, Spoken] | {'test': 6161} | {'test': 173.0} | | [indonli](https://link.springer.com/chapter/10.1007/978-3-030-41505-1_39) | ['ind'] | PairClassification | s2s | [Encyclopaedic, Web, News, Written] | {'test_expert': 2040} | {'test_expert': 145.88} | -| [mFollowIRCrossLingualInstructionReranking](https://neuclir.github.io/) (Weller et al., 2024) | ['eng', 'fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | {'eng-fas': 80, 'eng-rus': 80, 'eng-zho': 86} | {'test': {'num_docs': 121635, 'num_queries': 123, 'average_document_length': 2331.0777818884367, 'average_query_length': 81.8780487804878, 'average_instruction_length': 389.9512195121951, 'average_changed_instruction_length': 450.5528455284553, 'average_relevant_docs_per_query': 10.30952380952381, 'average_top_ranked_per_query': 1024.3902439024391, 'hf_subset_descriptive_stats': {'eng-fas': {'num_docs': 41189, 'num_queries': 40, 'average_document_length': 3145.4990895627475, 'average_query_length': 80.075, 'average_instruction_length': 396.875, 'average_changed_instruction_length': 463.175, 'average_relevant_docs_per_query': 10.465116279069768, 'average_top_ranked_per_query': 1075}, 'eng-rus': {'num_docs': 39326, 'num_queries': 40, 'average_document_length': 2784.0813456746173, 'average_query_length': 81.875, 'average_instruction_length': 371.125, 'average_changed_instruction_length': 431.8, 'average_relevant_docs_per_query': 9.775, 'average_top_ranked_per_query': 1000}, 'eng-zho': {'num_docs': 41120, 'num_queries': 43, 'average_document_length': 1082.0501215953307, 'average_query_length': 83.55813953488372, 'average_instruction_length': 401.0232558139535, 'average_changed_instruction_length': 456.25581395348837, 'average_relevant_docs_per_query': 10.651162790697674, 'average_top_ranked_per_query': 1000}}}} | +| [mFollowIR](https://neuclir.github.io/) (Weller et al., 2024) | ['eng', 'fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | {'eng-fas': 80, 'eng-rus': 80, 'eng-zho': 86} | {'test': {'num_docs': 121635, 'num_queries': 123, 'average_document_length': 2331.0777818884367, 'average_query_length': 81.8780487804878, 'average_instruction_length': 389.9512195121951, 'average_changed_instruction_length': 450.5528455284553, 'average_relevant_docs_per_query': 10.30952380952381, 'average_top_ranked_per_query': 1024.3902439024391, 'hf_subset_descriptive_stats': {'eng-fas': {'num_docs': 41189, 'num_queries': 40, 'average_document_length': 3145.4990895627475, 'average_query_length': 80.075, 'average_instruction_length': 396.875, 'average_changed_instruction_length': 463.175, 'average_relevant_docs_per_query': 10.465116279069768, 'average_top_ranked_per_query': 1075}, 'eng-rus': {'num_docs': 39326, 'num_queries': 40, 'average_document_length': 2784.0813456746173, 'average_query_length': 81.875, 'average_instruction_length': 371.125, 'average_changed_instruction_length': 431.8, 'average_relevant_docs_per_query': 9.775, 'average_top_ranked_per_query': 1000}, 'eng-zho': {'num_docs': 41120, 'num_queries': 43, 'average_document_length': 1082.0501215953307, 'average_query_length': 83.55813953488372, 'average_instruction_length': 401.0232558139535, 'average_changed_instruction_length': 456.25581395348837, 
'average_relevant_docs_per_query': 10.651162790697674, 'average_top_ranked_per_query': 1000}}}} | | [mFollowIRInstructionReranking](https://neuclir.github.io/) (Weller et al., 2024) | ['fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | {'fas': 80, 'rus': 80, 'zho': 86} | {'test': {'num_docs': 121635, 'num_queries': 123, 'average_document_length': 2331.0777818884367, 'average_query_length': 57.113821138211385, 'average_instruction_length': 281.0650406504065, 'average_changed_instruction_length': 326.9430894308943, 'average_relevant_docs_per_query': 10.30952380952381, 'average_top_ranked_per_query': 1024.3902439024391, 'hf_subset_descriptive_stats': {'fas': {'num_docs': 41189, 'num_queries': 40, 'average_document_length': 3145.4990895627475, 'average_query_length': 72.65, 'average_instruction_length': 358.925, 'average_changed_instruction_length': 415.325, 'average_relevant_docs_per_query': 10.465116279069768, 'average_top_ranked_per_query': 1075}, 'rus': {'num_docs': 39326, 'num_queries': 40, 'average_document_length': 2784.0813456746173, 'average_query_length': 77.5, 'average_instruction_length': 387, 'average_changed_instruction_length': 458, 'average_relevant_docs_per_query': 9.775, 'average_top_ranked_per_query': 1000}, 'zho': {'num_docs': 41120, 'num_queries': 43, 'average_document_length': 1082.0501215953307, 'average_query_length': 23.697674418604652, 'average_instruction_length': 110.09302325581395, 'average_changed_instruction_length': 122.81395348837209, 'average_relevant_docs_per_query': 10.651162790697674, 'average_top_ranked_per_query': 1000}}}} | diff --git a/mteb/abstasks/AbsTaskReranking.py b/mteb/abstasks/AbsTaskReranking.py index dc1d0d5ea8..2a303dbb65 100644 --- a/mteb/abstasks/AbsTaskReranking.py +++ b/mteb/abstasks/AbsTaskReranking.py @@ -68,7 +68,6 @@ def load_data(self, **kwargs): self.transform_old_dataset_format() else: # use AbsTaskRetrieval default to load the data - # TODO: need to make sure top_ranked comes back return super().load_data(**kwargs) def process_example(self, example: dict, split: str, query_idx: int) -> dict: @@ -89,13 +88,16 @@ def process_example(self, example: dict, split: str, query_idx: int) -> dict: } for i, pos_doc in enumerate(positive_docs): - doc_id = f"{query_id}_positive_{i}" + # have "a" in front so that positives are first, then negatives + # this shouldn't matter except for ties, and the previous reranking results + # had the positives first + doc_id = f"apositive_{i}_{query_id}" example_data["doc_ids"].append(doc_id) example_data["doc_texts"].append(pos_doc) example_data["relevance_scores"].append(1) for i, neg_doc in enumerate(negative_docs): - doc_id = f"{query_id}_negative_{i}" + doc_id = f"negative_{i}_{query_id}" example_data["doc_ids"].append(doc_id) example_data["doc_texts"].append(neg_doc) example_data["relevance_scores"].append(0) @@ -196,7 +198,7 @@ def _evaluate_subset( cur_queries = {query_id: queries[query_id]} if "instructions" in kwargs: instructions = kwargs["instructions"] - cur_instructions = {queries[query_id]: instructions[queries[query_id]]} + cur_instructions = {query_id: instructions[query_id]} else: cur_instructions = None @@ -219,6 +221,7 @@ def _evaluate_subset( # do the evaluation like normal now, but pass our results if max_docs > max(retriever.k_values): + # only added if we need a large k-value for reranking past 1000 retriever.k_values += [max_docs] return super()._evaluate_subset( diff --git a/mteb/abstasks/AbsTaskRetrieval.py b/mteb/abstasks/AbsTaskRetrieval.py index ba59616e39..3956280473 
100644 --- a/mteb/abstasks/AbsTaskRetrieval.py +++ b/mteb/abstasks/AbsTaskRetrieval.py @@ -10,6 +10,7 @@ from mteb.abstasks.TaskMetadata import HFSubset from ..evaluation.evaluators import RetrievalEvaluator +from ..evaluation.evaluators.utils import make_score_dict from ..load_results.task_results import ScoresDict from .AbsTask import AbsTask, DescriptiveStatistics from .dataloaders import HFDataLoader @@ -27,7 +28,7 @@ class RetrievalDescriptiveStatistics(DescriptiveStatistics): num_documents: Number of documents average_relevant_docs_per_query: Average number of relevant documents per query average_instruction_length: Average length of instructions - average_num_instructions_per_query: Average number of instructions per query + num_instructions: Number of instructions average_top_ranked_per_query: Average number of top ranked documents per query """ @@ -38,7 +39,7 @@ class RetrievalDescriptiveStatistics(DescriptiveStatistics): average_relevant_docs_per_query: float # these are for datasets with instructions average_instruction_length: float - average_num_instructions_per_query: float + num_instructions: int # this is for datasets that do reranking average_top_ranked_per_query: float @@ -68,8 +69,8 @@ class AbsTaskRetrieval(AbsTask): E.g.: {"test": {"q1": ["document_one", "document_two"]}} or {"test": {"q1": {"document_one": 1, "document_two": 0.5}}} self.instructions: dict[str, dict[str, str]] or dict[str, dict[str, list[str]]] - Semantically, it should contain dict[split_name, dict[sample_id, str]] or dict[split_name, dict[sample_id, list[str]]] for multiple instructions per query - E.g. {"test": {"q1": "instruction"}} or {"test": {"q1": ["instruction1", "instruction2"]}} + Semantically, it should contain dict[split_name, dict[sample_id, str]]. If there are multiple instructions per query, please duplicate the queries and give them unique ids for consolidation. + E.g. 
{"test": {"query-id1": "instruction text"}} """ ignore_identical_ids: bool = False @@ -107,7 +108,9 @@ def load_data(self, **kwargs): # optional args if instructions: self.instructions = { - split: {inst["query"]: inst["instruction"] for inst in instructions} + split: { + inst["query-id"]: inst["instruction"] for inst in instructions + } } if top_ranked: self.top_ranked = { @@ -172,8 +175,6 @@ def _evaluate_subset( else: # perform the retrieval here start_time = time() - # instructions can be a set of instructions for each query - # TODO: add instructions here - in progress results = retriever(corpus, queries, **kwargs) end_time = time() logger.info(f"Time taken to retrieve: {end_time - start_time:.2f} seconds") @@ -214,22 +215,9 @@ def _evaluate_subset( mrr, naucs_mrr = retriever.evaluate_custom( relevant_docs, results, retriever.k_values, "mrr" ) - scores = { - **{f"ndcg_at_{k.split('@')[1]}": v for (k, v) in ndcg.items()}, - **{f"map_at_{k.split('@')[1]}": v for (k, v) in _map.items()}, - **{f"recall_at_{k.split('@')[1]}": v for (k, v) in recall.items()}, - **{f"precision_at_{k.split('@')[1]}": v for (k, v) in precision.items()}, - **{f"mrr_at_{k.split('@')[1]}": v for (k, v) in mrr.items()}, - **{ - k.replace("@", "_at_").replace("_P", "_precision").lower(): v - for k, v in naucs.items() - }, - **{ - k.replace("@", "_at_").replace("_P", "_precision").lower(): v - for k, v in naucs_mrr.items() - }, - **task_scores, - } + scores = make_score_dict( + ndcg, _map, recall, precision, mrr, naucs, naucs_mrr, task_scores + ) self._add_main_score(scores) if export_errors: @@ -319,18 +307,9 @@ def _calculate_metrics_from_split( if self.instructions is not None: total_instructions_len = sum( - [ - len(instruction) - if isinstance(instruction, str) - else sum([len(i) for i in instruction]) - for instruction in instructions.values() - ] - ) - # flatten the values in instructions to get the number of them - num_instructions = sum( - 1 if isinstance(instruction, str) else len(instruction) - for instruction in instructions.values() + [len(instruction) for instruction in instructions.values()] ) + num_instructions = len(instructions) else: total_instructions_len = 0 num_instructions = 0 @@ -353,9 +332,7 @@ def _calculate_metrics_from_split( average_instruction_length=total_instructions_len / num_instructions if num_instructions else 0, - average_num_instructions_per_query=num_instructions / num_queries - if num_queries - else 0, + num_instructions=num_instructions, average_top_ranked_per_query=top_ranked_per_query, ) diff --git a/mteb/abstasks/dataloaders.py b/mteb/abstasks/dataloaders.py index 40afb883cc..909983217d 100644 --- a/mteb/abstasks/dataloaders.py +++ b/mteb/abstasks/dataloaders.py @@ -300,19 +300,13 @@ def _load_instructions(self): keep_in_memory=self.keep_in_memory, ) instructions_ds = next(iter(instructions_ds.values())) - instructions_ds = instructions_ds.cast_column("query", Value("string")) - - # if instructions is a string, change it to a list[str] - if "instruction" in instructions_ds.column_names: - instructions_ds = instructions_ds.cast_column( - "instruction", Sequence(Value("string")) - ) - + instructions_ds = instructions_ds.cast_column("query-id", Value("string")) + instructions_ds = instructions_ds.cast_column("instruction", Value("string")) instructions_ds = instructions_ds.remove_columns( [ col for col in instructions_ds.column_names - if col not in ["query", "instruction"] + if col not in ["query-id", "instruction"] ] ) self.instructions = instructions_ds diff --git 
a/mteb/evaluation/evaluators/RetrievalEvaluator.py b/mteb/evaluation/evaluators/RetrievalEvaluator.py index aab07e2464..0a1d96cec9 100644 --- a/mteb/evaluation/evaluators/RetrievalEvaluator.py +++ b/mteb/evaluation/evaluators/RetrievalEvaluator.py @@ -19,6 +19,7 @@ hole, mrr, nAUC, + parse_metrics_from_scores, recall_cap, top_k_accuracy, ) @@ -55,7 +56,14 @@ def __init__( self.top_k = ( max(k_values) if "top_k" not in kwargs else kwargs["top_k"] ) # can lower it if reranking - self.score_function = score_function + self.score_function = ( + retriever.mteb_model_meta.similarity_fn_name + if ( + hasattr(retriever, "mteb_model_meta") + and retriever.mteb_model_meta.similarity_fn_name + ) + else score_function + ) self.task_name = task_name def __call__( @@ -69,6 +77,11 @@ def __call__( if not self.retriever: raise ValueError("Model/Technique has not been provided!") + # allow kwargs top-k to override the class top-k + if "top_k" in kwargs: + self.top_k = kwargs["top_k"] + del kwargs["top_k"] + if self.is_cross_encoder: return self.retriever.search_cross_encoder( corpus, queries, self.top_k, instructions=instructions, **kwargs @@ -126,14 +139,6 @@ def evaluate( "For evaluation, we DO NOT ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=True`` to ignore this." ) - all_ndcgs, all_aps, all_recalls, all_precisions = {}, {}, {}, {} - - for k in k_values: - all_ndcgs[f"NDCG@{k}"] = [] - all_aps[f"MAP@{k}"] = [] - all_recalls[f"Recall@{k}"] = [] - all_precisions[f"P@{k}"] = [] - map_string = "map_cut." + ",".join([str(k) for k in k_values]) ndcg_string = "ndcg_cut." + ",".join([str(k) for k in k_values]) recall_string = "recall." + ",".join([str(k) for k in k_values]) @@ -143,25 +148,16 @@ def evaluate( ) scores = evaluator.evaluate(results) - for query_id in scores.keys(): - for k in k_values: - all_ndcgs[f"NDCG@{k}"].append(scores[query_id]["ndcg_cut_" + str(k)]) - all_aps[f"MAP@{k}"].append(scores[query_id]["map_cut_" + str(k)]) - all_recalls[f"Recall@{k}"].append(scores[query_id]["recall_" + str(k)]) - all_precisions[f"P@{k}"].append(scores[query_id]["P_" + str(k)]) - - ndcg, _map, recall, precision = ( - all_ndcgs.copy(), - all_aps.copy(), - all_recalls.copy(), - all_precisions.copy(), - ) - - for k in k_values: - ndcg[f"NDCG@{k}"] = round(sum(ndcg[f"NDCG@{k}"]) / len(scores), 5) - _map[f"MAP@{k}"] = round(sum(_map[f"MAP@{k}"]) / len(scores), 5) - recall[f"Recall@{k}"] = round(sum(recall[f"Recall@{k}"]) / len(scores), 5) - precision[f"P@{k}"] = round(sum(precision[f"P@{k}"]) / len(scores), 5) + ( + ndcg, + _map, + recall, + precision, + all_ndcgs, + all_aps, + all_recalls, + all_precisions, + ) = parse_metrics_from_scores(scores, k_values) naucs = RetrievalEvaluator.evaluate_abstention( results, {**all_ndcgs, **all_aps, **all_recalls, **all_precisions} diff --git a/mteb/evaluation/evaluators/model_classes.py b/mteb/evaluation/evaluators/model_classes.py index 10375da8b3..4febcda19a 100644 --- a/mteb/evaluation/evaluators/model_classes.py +++ b/mteb/evaluation/evaluators/model_classes.py @@ -63,9 +63,10 @@ def __init__( if "convert_to_tensor" not in encode_kwargs: encode_kwargs["convert_to_tensor"] = True - self.score_functions = {"cos_sim": cos_sim, "dot": dot_score} + self.score_functions = {"cos_sim": cos_sim, "dot": dot_score, "cosine": cos_sim} self.score_function_desc = { "cos_sim": "Cosine Similarity", + "cosine": "Cosine Similarity", "dot": "Dot Product", } self.corpus_chunk_size = corpus_chunk_size @@ -90,7 +91,7 @@ def __init__( def search( 
self, corpus: dict[str, dict[str, str]], - queries: dict[str, str | list[str]], + queries: dict[str, str], top_k: int, score_function: str, task_name: str, @@ -101,7 +102,6 @@ def search( ) -> dict[str, dict[str, float]]: # Create embeddings for all queries using model.encode # Runs semantic search against the corpus embeddings - # Returns a ranked list with the corpus ids if score_function not in self.score_functions: raise ValueError( f"score function: {score_function} must be either (cos_sim) for cosine similarity or (dot) for dot product" @@ -110,9 +110,16 @@ def search( logger.info("Encoding Queries.") query_ids = list(queries.keys()) self.results = {qid: {} for qid in query_ids} - queries = [queries[qid] for qid in queries] # type: ignore + query_ids, queries = zip(*queries.items()) + if instructions: - queries = [f"{query} {instructions[query]}".strip() for query in queries] + new_queries = [] + for q_idx, qid in enumerate(query_ids): + new_queries.append( + f"{queries[q_idx].strip()} {instructions[qid]}".strip() + ) + queries = new_queries + if isinstance(queries[0], list): # type: ignore query_embeddings = self.encode_conversations( model=self.model, @@ -271,7 +278,7 @@ def search_cross_encoder( ( query, corpus[doc_id], - instructions[query] if instructions is not None else None, + instructions[qid] if instructions is not None else None, qid, doc_id, ) diff --git a/mteb/evaluation/evaluators/utils.py b/mteb/evaluation/evaluators/utils.py index 6574c2a2bc..8ccb6de8d2 100644 --- a/mteb/evaluation/evaluators/utils.py +++ b/mteb/evaluation/evaluators/utils.py @@ -5,9 +5,11 @@ import numpy as np import pandas as pd +import pytrec_eval import requests import torch import tqdm +from datasets import load_dataset from packaging.version import Version from sklearn.metrics import auc @@ -235,15 +237,11 @@ def get_rank_from_dict( return len(sorted_by_score) + 1, 0 -def evaluate_change( - original_run: dict[str, dict[str, float]], - new_run: dict[str, dict[str, float]], - changed_qrels: dict[str, list[str]], -) -> dict[str, float]: +def calculate_pmrr(original_run, new_run, changed_qrels): changes = [] for qid in changed_qrels.keys(): - original_qid_run = original_run[qid] - new_qid_run = new_run[qid] + original_qid_run = original_run[qid + "-og"] + new_qid_run = new_run[qid + "-changed"] for idx, changed_doc in enumerate(changed_qrels[qid]): original_rank, original_score = get_rank_from_dict( original_qid_run, changed_doc @@ -267,10 +265,81 @@ def evaluate_change( changes_df = pd.DataFrame(changes) changes_df["p-MRR"] = changes_df.apply(lambda x: rank_score(x), axis=1) qid_wise = changes_df.groupby("qid").agg({"p-MRR": "mean"}) - return { - "p-MRR": qid_wise["p-MRR"].mean(), + return qid_wise["p-MRR"].mean() + + +def evaluate_p_mrr_change( + results: dict[str, dict[str, float]], + qrels: dict[str, dict[str, float]], + task_name: str, + k_values: list[int], +) -> dict[str, float]: + """Computes the scores needed for FollowIR datasets, including p-MRR (measuring change in instruction) and + details about the original instruction run and changed instruction run. 
+ """ + followir_scores = defaultdict(dict) + # load the qrel_diff from the dataset + TASK_TO_HF_DATASET = { + "Core17InstructionRetrieval": ("jhu-clsp/core17-instructions-mteb", False), + "Robust04InstructionRetrieval": ("jhu-clsp/robust04-instructions-mteb", False), + "News21InstructionRetrieval": ("jhu-clsp/news21-instructions-mteb", False), + "mFollowIR": ("jhu-clsp/mfollowir-mteb", True), + "mFollowIRCrossLingual": ( + "jhu-clsp/mfollowir-cross-lingual-mteb", + True, + ), + } + hf_path, is_multilingual = TASK_TO_HF_DATASET[task_name] + langs = "eng" if not is_multilingual else ["zho", "rus", "fas"] + + qrels_sep = { + "og": {k: v for k, v in qrels.items() if k.endswith("-og")}, + "changed": {k: v for k, v in qrels.items() if not k.endswith("-og")}, } + original_run = {} + new_run = {} + # make original run from the results file with all "-og" items only and vice versa + for qid, docs in results.items(): + if qid.endswith("-og"): + original_run[qid] = docs + else: + new_run[qid] = docs + + for lang in langs: + config_name = "qrel_diff" if not is_multilingual else f"qrel_diff-{lang}" + changed_qrels = { + item["query-id"]: item["corpus-ids"] + for item in load_dataset(hf_path, config_name)["qrel_diff"] + } + p_mrr = calculate_pmrr(original_run, new_run, changed_qrels) + followir_scores[lang]["p-MRR"] = p_mrr + + # unfortunately, have to re-compute scores here to get only og and changed scores + followir_scores[lang]["og"] = {} + followir_scores[lang]["changed"] = {} + for name, group in [("og", original_run), ("changed", new_run)]: + map_string = "map_cut." + ",".join([str(k) for k in k_values]) + ndcg_string = "ndcg_cut." + ",".join([str(k) for k in k_values]) + recall_string = "recall." + ",".join([str(k) for k in k_values]) + precision_string = "P." + ",".join([str(k) for k in k_values]) + evaluator = pytrec_eval.RelevanceEvaluator( + qrels_sep[name], + {map_string, ndcg_string, recall_string, precision_string}, + ) + # qrels_changed_as_og = {k.replace("-changed", "-og"): v for k, v in qrels_sep["changed"].items()} + group_scores = evaluator.evaluate(group) + ndcg, _map, recall, precision = parse_metrics_from_scores( + group_scores, k_values + )[:4] # don't need the other fields + + # add these to the followir_scores with name prefix + scores_dict = make_score_dict(ndcg, _map, recall, precision, {}, {}, {}, {}) + for key, value in scores_dict.items(): + followir_scores[lang][name][key] = value + + return followir_scores + def rank_score(x: dict[str, float]) -> float: if x["og_rank"] >= x["new_rank"]: @@ -434,7 +503,7 @@ def add_task_specific_scores( qrels: dict[str, dict[str, int]], results: dict[str, dict[str, float]], task_name: str, - k_values: list[int], # not needed now, but perhaps later? 
+ k_values: list[int], ) -> dict[str, float]: """Add task-specific scores to the scores dictionary, that are not needed for all results but require additional computation.""" task_scores = {} @@ -447,14 +516,20 @@ def add_task_specific_scores( task_scores["robustness_at_10"] = robustness_at_10_score if task_name in [ - "mFollowIRInstructionReranking", - "mFollowIRCrossLingualInstructionReranking", + "mFollowIR", + "mFollowIRCrossLingual", "Robust04InstructionRetrieval", "Core17InstructionRetrieval", "News21InstructionRetrieval", ]: - p_mrr = evaluate_change(results, scores, qrels) - task_scores["p-MRR"] = p_mrr["p-MRR"] + p_mrr_and_consolidated_scores = evaluate_p_mrr_change( + results, qrels, task_name, k_values + ) + task_scores.update(p_mrr_and_consolidated_scores) + + if task_name in ["MindSmallReranking"]: + take_max_over_subqueries = max_over_subqueries(qrels, results, scores) + task_scores["max_over_subqueries"] = take_max_over_subqueries return task_scores @@ -507,7 +582,6 @@ def robustness_at_10( query_keys[key.split("_")[0]].append(key) robustness_scores = [] - breakpoint() for _, keys in query_keys.items(): # get the ndcg@10 for each query current_scores = [] @@ -518,3 +592,88 @@ def robustness_at_10( robustness_scores.append(min(current_scores)) return sum(robustness_scores) / len(robustness_scores) + + +def make_score_dict(ndcg, _map, recall, precision, mrr, naucs, naucs_mrr, task_scores): + scores = { + **{f"ndcg_at_{k.split('@')[1]}": v for (k, v) in ndcg.items()}, + **{f"map_at_{k.split('@')[1]}": v for (k, v) in _map.items()}, + **{f"recall_at_{k.split('@')[1]}": v for (k, v) in recall.items()}, + **{f"precision_at_{k.split('@')[1]}": v for (k, v) in precision.items()}, + **{f"mrr_at_{k.split('@')[1]}": v for (k, v) in mrr.items()}, + **{ + k.replace("@", "_at_").replace("_P", "_precision").lower(): v + for k, v in naucs.items() + }, + **{ + k.replace("@", "_at_").replace("_P", "_precision").lower(): v + for k, v in naucs_mrr.items() + }, + **task_scores, + } + return scores + + +def parse_metrics_from_scores(scores, k_values): + all_ndcgs, all_aps, all_recalls, all_precisions = {}, {}, {}, {} + for k in k_values: + all_ndcgs[f"NDCG@{k}"] = [] + all_aps[f"MAP@{k}"] = [] + all_recalls[f"Recall@{k}"] = [] + all_precisions[f"P@{k}"] = [] + + for query_id in scores.keys(): + for k in k_values: + all_ndcgs[f"NDCG@{k}"].append(scores[query_id]["ndcg_cut_" + str(k)]) + all_aps[f"MAP@{k}"].append(scores[query_id]["map_cut_" + str(k)]) + all_recalls[f"Recall@{k}"].append(scores[query_id]["recall_" + str(k)]) + all_precisions[f"P@{k}"].append(scores[query_id]["P_" + str(k)]) + + ndcg, _map, recall, precision = ( + all_ndcgs.copy(), + all_aps.copy(), + all_recalls.copy(), + all_precisions.copy(), + ) + + for k in k_values: + ndcg[f"NDCG@{k}"] = round(sum(ndcg[f"NDCG@{k}"]) / len(scores), 5) + _map[f"MAP@{k}"] = round(sum(_map[f"MAP@{k}"]) / len(scores), 5) + recall[f"Recall@{k}"] = round(sum(recall[f"Recall@{k}"]) / len(scores), 5) + precision[f"P@{k}"] = round(sum(precision[f"P@{k}"]) / len(scores), 5) + + return ( + ndcg, + _map, + recall, + precision, + all_ndcgs, + all_aps, + all_recalls, + all_precisions, + ) + + +def max_over_subqueries(qrels, results, scores): + """Computes the max over subqueries. 
This metric is the maximum of the scores for all subqueries + + Args: + qrels: Ground truth relevance judgments for the queries + results: Predicted relevance scores for the queries + scores: The scores for the queries, to extract average precision for each query + """ + query_keys = defaultdict(list) + for key in qrels.keys(): + query_keys[key.split("_")[0]].append(key) + + max_over_subqueries = [] + for _, keys in query_keys.items(): + # get the average precision for each query + current_scores = [] + for key in keys: + current_scores.append(scores[key]["P_1"]) + + # get the max average precision + max_over_subqueries.append(max(current_scores)) + + return sum(max_over_subqueries) / len(max_over_subqueries) diff --git a/mteb/model_meta.py b/mteb/model_meta.py index 7acb806b81..bd735adbfd 100644 --- a/mteb/model_meta.py +++ b/mteb/model_meta.py @@ -26,7 +26,7 @@ "API", "Tevatron", ] -DISTANCE_METRICS = Literal["cosine"] +DISTANCE_METRICS = Literal["cosine", "dot"] def sentence_transformers_loader( diff --git a/mteb/models/sentence_transformers_models.py b/mteb/models/sentence_transformers_models.py index 9a33e0f64f..8ebcca8cd2 100644 --- a/mteb/models/sentence_transformers_models.py +++ b/mteb/models/sentence_transformers_models.py @@ -127,3 +127,20 @@ framework=["Sentence Transformers", "PyTorch"], use_instuctions=False, ) + +contriever = ModelMeta( + name="facebook/contriever-msmarco", + languages=["eng-Latn"], + open_weights=True, + revision="abe8c1493371369031bcb1e02acb754cf4e162fa", + release_date="2022-06-25", # release date of model on HF + n_parameters=150_000_000, + memory_usage=None, + embed_dim=768, + license=None, + max_tokens=512, + reference="https://huggingface.co/facebook/contriever-msmarco", + similarity_fn_name="dot", + framework=["Sentence Transformers", "PyTorch"], + use_instuctions=False, +) diff --git a/mteb/tasks/InstructionReranking/eng/Core17InstructionRetrieval.py b/mteb/tasks/InstructionReranking/eng/Core17InstructionRetrieval.py index 0000969419..58ee16867c 100644 --- a/mteb/tasks/InstructionReranking/eng/Core17InstructionRetrieval.py +++ b/mteb/tasks/InstructionReranking/eng/Core17InstructionRetrieval.py @@ -11,10 +11,10 @@ class Core17InstructionRetrieval(AbsTaskReranking): description="Measuring retrieval instruction following ability on Core17 narratives for the FollowIR benchmark.", reference="https://arxiv.org/abs/2403.15246", dataset={ - "path": "jhu-clsp/core17-instructions", - "revision": "e39ff896cf3efbbdeeb950e6bd7c79f266995b07", + "path": "jhu-clsp/core17-instructions-mteb", + "revision": "7030c7efc3585d9020f243b12862997889243b78", }, - type="InstructionRetrieval", + type="InstructionReranking", category="s2p", modalities=["text"], eval_splits=["test"], diff --git a/mteb/tasks/InstructionReranking/eng/News21InstructionRetrieval.py b/mteb/tasks/InstructionReranking/eng/News21InstructionRetrieval.py index 7c3b5b5860..f9d85db4ef 100644 --- a/mteb/tasks/InstructionReranking/eng/News21InstructionRetrieval.py +++ b/mteb/tasks/InstructionReranking/eng/News21InstructionRetrieval.py @@ -11,8 +11,8 @@ class News21InstructionRetrieval(AbsTaskReranking): description="Measuring retrieval instruction following ability on News21 narratives for the FollowIR benchmark.", reference="https://arxiv.org/abs/2403.15246", dataset={ - "path": "jhu-clsp/news21-instructions", - "revision": "e0144086b45fe31ac125e9ac1a83b6a409bb6ca6", + "path": "jhu-clsp/news21-instructions-mteb", + "revision": "39db677749b3b783bb277d0e2d4712f5f133f52b", }, type="InstructionReranking", 
category="s2p", diff --git a/mteb/tasks/InstructionReranking/eng/Robust04InstructionRetrieval.py b/mteb/tasks/InstructionReranking/eng/Robust04InstructionRetrieval.py index 3a54496555..59fe49d418 100644 --- a/mteb/tasks/InstructionReranking/eng/Robust04InstructionRetrieval.py +++ b/mteb/tasks/InstructionReranking/eng/Robust04InstructionRetrieval.py @@ -11,8 +11,8 @@ class Robust04InstructionRetrieval(AbsTaskReranking): description="Measuring retrieval instruction following ability on Robust04 narratives for the FollowIR benchmark.", reference="https://arxiv.org/abs/2403.15246", dataset={ - "path": "jhu-clsp/robust04-instructions", - "revision": "a5a1c4fe2bc528ac12e83f8cdf82178da85d2f1d", + "path": "jhu-clsp/robust04-instructions-mteb", + "revision": "0a3efedfcac0a7f859c46cff3a0fac0f8747b28f", }, type="InstructionReranking", category="s2p", diff --git a/mteb/tasks/InstructionReranking/multilingual/mFollowIR.py b/mteb/tasks/InstructionReranking/multilingual/mFollowIR.py index d7ade8abf0..be265391e7 100644 --- a/mteb/tasks/InstructionReranking/multilingual/mFollowIR.py +++ b/mteb/tasks/InstructionReranking/multilingual/mFollowIR.py @@ -1,7 +1,5 @@ from __future__ import annotations -from collections import defaultdict - import datasets from mteb.abstasks.MultilingualTask import MultilingualTask @@ -51,8 +49,8 @@ def load_data( ): corpus = {lang: {EVAL_SPLIT: {}} for lang in langs} queries = {lang: {EVAL_SPLIT: {}} for lang in langs} - og_relevant_docs = {lang: {EVAL_SPLIT: {}} for lang in langs} - changed_relevant_docs = {lang: {EVAL_SPLIT: {}} for lang in langs} + relevant_docs = {lang: {EVAL_SPLIT: {}} for lang in langs} + instructions = {lang: {EVAL_SPLIT: {}} for lang in langs} top_ranked = {lang: {EVAL_SPLIT: {}} for lang in langs} for lang in langs: @@ -60,6 +58,7 @@ def load_data( loading_lang = lang.split("-")[1] # don't care about the eng part else: loading_lang = lang + print(f"Loading data for {lang} from {loading_lang}") # Load corpus data corpus_data = datasets.load_dataset( @@ -67,7 +66,6 @@ def load_data( f"corpus-{loading_lang}", cache_dir=cache_dir, revision=revision, - trust_remote_code=True, ) corpus[lang][EVAL_SPLIT] = { row["_id"]: {"title": row["title"], "text": row["text"]} @@ -80,54 +78,39 @@ def load_data( f"queries-{loading_lang}", cache_dir=cache_dir, revision=revision, - trust_remote_code=True, ) queries[lang][EVAL_SPLIT] = { - row["_id"]: { - "text": row["text"], - "instruction_og": row["instruction_og"], - "instruction_changed": row["instruction_changed"], - "keywords": row["keywords"] if "keywords" in row else None, - "short_query": row["short_query"] if "short_query" in row else None, - } - for row in queries_data["queries"] + row["_id"]: row["text"] for row in queries_data["queries"] } - # Load qrels_og data - qrels_og_data = datasets.load_dataset( + # Load instructions data + instructions_data = datasets.load_dataset( path, - f"qrels_og-{loading_lang}", + f"instruction-{loading_lang}", cache_dir=cache_dir, revision=revision, - trust_remote_code=True, ) - for row in qrels_og_data[EVAL_SPLIT]: - if row["query-id"] not in og_relevant_docs[lang][EVAL_SPLIT]: - og_relevant_docs[lang][EVAL_SPLIT][row["query-id"]] = { - row["corpus-id"]: int(row["score"]) - } - else: - og_relevant_docs[lang][EVAL_SPLIT][row["query-id"]][ - row["corpus-id"] - ] = int(row["score"]) + instructions[lang][EVAL_SPLIT] = { + row["query-id"]: row["instruction"] + for row in instructions_data["instruction"] + } - # Load qrels_changed data - qrels_changed_data = datasets.load_dataset( + # 
Load qrels_og data + qrels_og_data = datasets.load_dataset( path, - f"qrels_changed-{loading_lang}", + f"default-{loading_lang}", cache_dir=cache_dir, revision=revision, - trust_remote_code=True, ) - for row in qrels_changed_data[EVAL_SPLIT]: - if row["query-id"] not in changed_relevant_docs[lang][EVAL_SPLIT]: - changed_relevant_docs[lang][EVAL_SPLIT][row["query-id"]] = { + for row in qrels_og_data[EVAL_SPLIT]: + if row["query-id"] not in relevant_docs[lang][EVAL_SPLIT]: + relevant_docs[lang][EVAL_SPLIT][row["query-id"]] = { row["corpus-id"]: int(row["score"]) } else: - changed_relevant_docs[lang][EVAL_SPLIT][row["query-id"]][ - row["corpus-id"] - ] = int(row["score"]) + relevant_docs[lang][EVAL_SPLIT][row["query-id"]][row["corpus-id"]] = ( + int(row["score"]) + ) # Load top_ranked data top_ranked_data = datasets.load_dataset( @@ -135,7 +118,6 @@ def load_data( f"top_ranked-{loading_lang}", cache_dir=cache_dir, revision=revision, - trust_remote_code=True, ) for row in top_ranked_data["top_ranked"]: if row["qid"] not in top_ranked[lang][EVAL_SPLIT]: @@ -143,43 +125,17 @@ def load_data( else: top_ranked[lang][EVAL_SPLIT][row["qid"]].append(row["pid"]) - # make og_instructions and changed_instructions from queries and then turn queries into just queries - og_instructions = {lang: {EVAL_SPLIT: defaultdict(dict)} for lang in queries} - changed_instructions = {lang: {EVAL_SPLIT: defaultdict(dict)} for lang in queries} - queries_only = {lang: {EVAL_SPLIT: {}} for lang in queries} - for lang in queries: - for split in queries[lang]: - for qid in queries[lang][split]: - text = queries[lang][split][qid]["text"] - og_instructions[lang][split][text] = queries[lang][split][qid][ - "instruction_og" - ] - changed_instructions[lang][split][text] = queries[lang][split][qid][ - "instruction_changed" - ] - queries_only[lang][split][qid] = text - - queries = queries_only - - return ( - corpus, - queries, - og_instructions, - changed_instructions, - og_relevant_docs, - changed_relevant_docs, - top_ranked, - ) + return (corpus, queries, instructions, relevant_docs, top_ranked) class mFollowIRCrossLingual(MultilingualTask, AbsTaskReranking): metadata = TaskMetadata( - name="mFollowIRCrossLingualInstructionReranking", + name="mFollowIRCrossLingual", description="This tasks measures retrieval instruction following ability on NeuCLIR narratives for the mFollowIR benchmark on the Farsi, Russian, and Chinese languages with English queries/instructions.", reference="https://neuclir.github.io/", dataset={ - "path": "jhu-clsp/mFollowIR-cross-lingual-parquet", - "revision": "7a82814a53229d3c8f18b2e18762a1a959dc5ff6", + "path": "jhu-clsp/mFollowIR-cross-lingual-parquet-mteb", + "revision": "9c1e094d813857dcdefcaf9764c520b8ba237ffd", }, type="Retrieval", category="s2p", @@ -254,10 +210,8 @@ def load_data(self, **kwargs): ( self.corpus, self.queries, - self.og_instructions, - self.changed_instructions, - self.og_relevant_docs, - self.changed_relevant_docs, + self.instructions, + self.relevant_docs, self.top_ranked, ) = load_data( path=self.metadata_dict["dataset"]["path"], @@ -272,12 +226,12 @@ def load_data(self, **kwargs): class mFollowIR(MultilingualTask, AbsTaskReranking): metadata = TaskMetadata( - name="mFollowIRInstructionReranking", + name="mFollowIR", description="This tasks measures retrieval instruction following ability on NeuCLIR narratives for the mFollowIR benchmark on the Farsi, Russian, and Chinese languages.", reference="https://neuclir.github.io/", dataset={ - "path": "jhu-clsp/mFollowIR-parquet", - 
"revision": "2c5cdcb438eff9de6412803768ac7304d4771cdc", + "path": "jhu-clsp/mFollowIR-parquet-mteb", + "revision": "d95f42313ebac4f1ae188afcd26097210ada8779", }, type="Retrieval", category="s2p", @@ -352,10 +306,8 @@ def load_data(self, **kwargs): ( self.corpus, self.queries, - self.og_instructions, - self.changed_instructions, - self.og_relevant_docs, - self.changed_relevant_docs, + self.instructions, + self.relevant_docs, self.top_ranked, ) = load_data( path=self.metadata_dict["dataset"]["path"], diff --git a/mteb/tasks/InstructionRetrieval/eng/InstructIR.py b/mteb/tasks/InstructionRetrieval/eng/InstructIR.py index e4e2110817..910a3a5bae 100644 --- a/mteb/tasks/InstructionRetrieval/eng/InstructIR.py +++ b/mteb/tasks/InstructionRetrieval/eng/InstructIR.py @@ -8,11 +8,11 @@ class InstructIR(AbsTaskRetrieval): metadata = TaskMetadata( name="InstructIR", - description="A benchmark specifically designed to evaluate the instruction following ability in information retrieval models. Our approach focuses on user-aligned instructions tailored to each query instance, reflecting the diverse characteristics inherent in real-world search scenarios.", + description='A benchmark specifically designed to evaluate the instruction following ability in information retrieval models. Our approach focuses on user-aligned instructions tailored to each query instance, reflecting the diverse characteristics inherent in real-world search scenarios. NOTE: scores on this may differ unless you include instruction first, then "[SEP]" and then the query.', reference="https://github.com/kaistAI/InstructIR/tree/main", dataset={ "path": "mteb/InstructIR-mteb", - "revision": "6b68698b2cd34aed28a64c8917605019f065a6c5", + "revision": "42c3afabe480643b755a7099dbf0f9ebeedaf6ca", }, type="Reranking", category="s2p", diff --git a/tests/test_benchmark/mock_tasks.py b/tests/test_benchmark/mock_tasks.py index 9143c6c090..ee512ed5d9 100644 --- a/tests/test_benchmark/mock_tasks.py +++ b/tests/test_benchmark/mock_tasks.py @@ -868,8 +868,11 @@ class MockRerankingTask(AbsTaskReranking): "test": { "average_document_length": 27.0, "average_query_length": 26.0, + "average_instruction_length": 0, "num_documents": 2, "num_queries": 2, + "average_top_ranked_per_query": 2.0, + "num_instructions": 0, "average_relevant_docs_per_query": 1.0, } }, @@ -918,11 +921,17 @@ class MockMultilingualRerankingTask(AbsTaskReranking, MultilingualTask): "average_query_length": 26.0, "num_documents": 4, "num_queries": 4, + "num_instructions": 0, "average_relevant_docs_per_query": 1.0, + "average_instruction_length": 0, + "average_top_ranked_per_query": 2.0, "hf_subset_descriptive_stats": { "eng": { "average_document_length": 30.0, "average_query_length": 26.0, + "average_instruction_length": 0, + "average_top_ranked_per_query": 2.0, + "num_instructions": 0, "num_documents": 2, "num_queries": 2, "average_relevant_docs_per_query": 1.0, @@ -933,6 +942,9 @@ class MockMultilingualRerankingTask(AbsTaskReranking, MultilingualTask): "num_documents": 2, "num_queries": 2, "average_relevant_docs_per_query": 1.0, + "average_instruction_length": 0, + "average_top_ranked_per_query": 2.0, + "num_instructions": 0, }, }, } @@ -993,6 +1005,9 @@ class MockRetrievalTask(AbsTaskRetrieval): "num_documents": 2, "num_queries": 2, "average_relevant_docs_per_query": 1.0, + "average_top_ranked_per_query": 0, + "average_instruction_length": 0, + "num_instructions": 0, } }, **general_args, # type: ignore @@ -1035,6 +1050,9 @@ class MockMultilingualRetrievalTask(AbsTaskRetrieval, 
MultilingualTask): "num_documents": 4, "num_queries": 4, "average_relevant_docs_per_query": 1.0, + "num_instructions": 0, + "average_top_ranked_per_query": 0, + "average_instruction_length": 0, "hf_subset_descriptive_stats": { "eng": { "average_document_length": 30.0, @@ -1042,6 +1060,9 @@ class MockMultilingualRetrievalTask(AbsTaskRetrieval, MultilingualTask): "num_documents": 2, "num_queries": 2, "average_relevant_docs_per_query": 1.0, + "average_top_ranked_per_query": 0, + "average_instruction_length": 0, + "num_instructions": 0, }, "fra": { "average_document_length": 30.0, @@ -1049,6 +1070,9 @@ class MockMultilingualRetrievalTask(AbsTaskRetrieval, MultilingualTask): "num_documents": 2, "num_queries": 2, "average_relevant_docs_per_query": 1.0, + "average_top_ranked_per_query": 0, + "average_instruction_length": 0, + "num_instructions": 0, }, }, } @@ -1192,22 +1216,22 @@ def load_data(self, **kwargs): self.data_loaded = True -class MockInstructionRetrival(AbsTaskReranking): +class MockInstructionRetrieval(AbsTaskRetrieval): do_length_ablation = True metadata = TaskMetadata( - type="InstructionReranking", - name="MockInstructionRetrival", - main_score="p-MRR", + type="InstructionRetrieval", + name="MockInstructionRetrieval", + main_score="ndcg_at_10", descriptive_stats={ "test": { - "num_docs": 2, + "num_documents": 2, "num_queries": 2, "average_document_length": 30.0, "average_query_length": 26.0, "average_instruction_length": 29.0, - "average_changed_instruction_length": 37.0, "average_relevant_docs_per_query": 1.0, - "average_top_ranked_per_query": 2.0, + "average_top_ranked_per_query": 0, + "num_instructions": 2, } }, **general_args, # type: ignore @@ -1222,36 +1246,74 @@ def load_data(self, **kwargs): } self.corpus = { "test": { - "d1": {"text": "This is a positive sentence"}, - "d2": {"text": "This is another positive sentence"}, + "d1": "This is a positive sentence", + "d2": "This is another positive sentence", } } - self.og_relevant_docs = { + self.relevant_docs = { "test": { "q1": {"d1": 1, "d2": 0}, "q2": {"d1": 0, "d2": 1}, }, } - self.og_instructions = { + self.instructions = { "test": { - "This is a test sentence": "This is a test instruction", - "This is another test sentence": "This is another test instruction", + "q1": "This is a test instruction", + "q2": "This is another test instruction", } } - self.changed_instructions = { + self.top_ranked = None + self.data_loaded = True + + +class MockInstructionReranking(AbsTaskReranking): + do_length_ablation = True + metadata = TaskMetadata( + type="InstructionReranking", + name="MockInstructionReranking", + main_score="ndcg_at_10", + descriptive_stats={ + "test": { + "num_documents": 2, + "num_queries": 2, + "num_instructions": 2, + "average_document_length": 30.0, + "average_query_length": 26.0, + "average_instruction_length": 29.0, + "average_relevant_docs_per_query": 1.0, + "average_top_ranked_per_query": 2.0, + } + }, + **general_args, # type: ignore + ) + + def load_data(self, **kwargs): + self.queries = { "test": { - "This is a test sentence": "This is a changed test instruction", - "This is another test sentence": "This is changed another test instruction", + "q1": "This is a test sentence", + "q2": "This is another test sentence", } } - self.changed_relevant_docs = { + self.corpus = { "test": { - "q1": {"d1": 0, "d2": 1}, - "q2": {"d1": 1, "d2": 0}, + "d1": "This is a positive sentence", + "d2": "This is another positive sentence", } } + self.relevant_docs = { + "test": { + "q1": {"d1": 1, "d2": 0}, + "q2": {"d1": 0, 
"d2": 1}, + }, + } + self.instructions = { + "test": { + "q1": "This is a test instruction", + "q2": "This is another test instruction", + } + } self.top_ranked = { "test": { "q1": ["d1", "d2"], @@ -1261,42 +1323,42 @@ def load_data(self, **kwargs): self.data_loaded = True -class MockMultilingualInstructionRetrival(AbsTaskReranking, MultilingualTask): +class MockMultilingualInstructionRetrieval(AbsTaskRetrieval, MultilingualTask): do_length_ablation = True metadata = TaskMetadata( - type="InstructionReranking", - name="MockMultilingualInstructionRetrival", - main_score="p-MRR", + type="InstructionRetrieval", + name="MockMultilingualInstructionRetrieval", + main_score="ndcg_at_10", descriptive_stats={ "test": { - "num_docs": 4, + "num_documents": 4, "num_queries": 4, + "num_instructions": 4, "average_document_length": 30.0, "average_query_length": 26.0, "average_instruction_length": 29.0, - "average_changed_instruction_length": 37.0, "average_relevant_docs_per_query": 1.0, - "average_top_ranked_per_query": 2.0, + "average_top_ranked_per_query": 0, "hf_subset_descriptive_stats": { "eng": { - "num_docs": 2, + "num_documents": 2, "num_queries": 2, + "num_instructions": 2, "average_document_length": 30.0, "average_query_length": 26.0, "average_instruction_length": 29.0, - "average_changed_instruction_length": 37.0, "average_relevant_docs_per_query": 1.0, - "average_top_ranked_per_query": 2.0, + "average_top_ranked_per_query": 0, }, "fra": { - "num_docs": 2, + "num_documents": 2, "num_queries": 2, + "num_instructions": 2, "average_document_length": 30.0, "average_query_length": 26.0, "average_instruction_length": 29.0, - "average_changed_instruction_length": 37.0, "average_relevant_docs_per_query": 1.0, - "average_top_ranked_per_query": 2.0, + "average_top_ranked_per_query": 0, }, }, } @@ -1318,8 +1380,8 @@ def load_data(self, **kwargs): } corpus = { "test": { - "d1": {"text": "This is a positive sentence"}, - "d2": {"text": "This is another positive sentence"}, + "d1": "This is a positive sentence", + "d2": "This is another positive sentence", } } self.corpus = { @@ -1327,48 +1389,119 @@ def load_data(self, **kwargs): "fra": corpus, } - og_relevant_docs = { + relevant_docs = { "test": { "q1": {"d1": 1, "d2": 0}, "q2": {"d1": 0, "d2": 1}, }, } - self.og_relevant_docs = { - "eng": og_relevant_docs, - "fra": og_relevant_docs, + self.relevant_docs = { + "eng": relevant_docs, + "fra": relevant_docs, } - og_instructions = { + instructions = { "test": { - "This is a test sentence": "This is a test instruction", - "This is another test sentence": "This is another test instruction", + "q1": "This is a test instruction", + "q2": "This is another test instruction", } } - self.og_instructions = { - "eng": og_instructions, - "fra": og_instructions, + self.instructions = { + "eng": instructions, + "fra": instructions, } - changed_instructions = { + self.top_ranked = None + + +class MockMultilingualInstructionReranking(AbsTaskReranking, MultilingualTask): + do_length_ablation = True + metadata = TaskMetadata( + type="InstructionReranking", + name="MockMultilingualInstructionReranking", + main_score="ndcg_at_10", + descriptive_stats={ "test": { - "This is a test sentence": "This is a changed test instruction", - "This is another test sentence": "This is changed another test instruction", + "num_documents": 4, + "num_queries": 4, + "num_instructions": 4, + "average_document_length": 30.0, + "average_query_length": 26.0, + "average_instruction_length": 29.0, + "average_relevant_docs_per_query": 1.0, + 
"average_top_ranked_per_query": 2.0, + "hf_subset_descriptive_stats": { + "eng": { + "num_documents": 2, + "num_queries": 2, + "num_instructions": 2, + "average_document_length": 30.0, + "average_query_length": 26.0, + "average_instruction_length": 29.0, + "average_relevant_docs_per_query": 1.0, + "average_top_ranked_per_query": 2.0, + }, + "fra": { + "num_documents": 2, + "num_queries": 2, + "num_instructions": 2, + "average_document_length": 30.0, + "average_query_length": 26.0, + "average_instruction_length": 29.0, + "average_relevant_docs_per_query": 1.0, + "average_top_ranked_per_query": 2.0, + }, + }, + } + }, + **general_args, # type: ignore + ) + metadata.eval_langs = multilingual_eval_langs + + def load_data(self, **kwargs): + queries = { + "test": { + "q1": "This is a test sentence", + "q2": "This is another test sentence", } } - self.changed_instructions = { - "eng": changed_instructions, - "fra": changed_instructions, + self.queries = { + "eng": queries, + "fra": queries, } - changed_relevant_docs = { + corpus = { "test": { - "q1": {"d1": 0, "d2": 1}, - "q2": {"d1": 1, "d2": 0}, + "d1": "This is a positive sentence", + "d2": "This is another positive sentence", } } - self.changed_relevant_docs = { - "eng": changed_relevant_docs, - "fra": changed_relevant_docs, + + self.corpus = { + "eng": corpus, + "fra": corpus, } + relevant_docs = { + "test": { + "q1": {"d1": 1, "d2": 0}, + "q2": {"d1": 0, "d2": 1}, + }, + } + + self.relevant_docs = { + "eng": relevant_docs, + "fra": relevant_docs, + } + + instructions = { + "test": { + "q1": "This is a test instruction", + "q2": "This is another test instruction", + } + } + self.instructions = { + "eng": instructions, + "fra": instructions, + } top_ranked = { "test": { "q1": ["d1", "d2"], diff --git a/tests/test_benchmark/task_grid.py b/tests/test_benchmark/task_grid.py index c28ad3ea59..4c73d825e5 100644 --- a/tests/test_benchmark/task_grid.py +++ b/tests/test_benchmark/task_grid.py @@ -16,13 +16,15 @@ MockClassificationTask, MockClusteringFastTask, MockClusteringTask, - MockInstructionRetrival, + MockInstructionReranking, + MockInstructionRetrieval, MockMultilabelClassification, MockMultilingualBitextMiningTask, MockMultilingualClassificationTask, MockMultilingualClusteringFastTask, MockMultilingualClusteringTask, - MockMultilingualInstructionRetrival, + MockMultilingualInstructionReranking, + MockMultilingualInstructionRetrieval, MockMultilingualMultilabelClassification, MockMultilingualPairClassificationTask, MockMultilingualParallelBitextMiningTask, @@ -90,8 +92,10 @@ MockMultilingualMultilabelClassification(), MockSummarizationTask(), MockMultilingualSummarizationTask(), - MockInstructionRetrival(), - MockMultilingualInstructionRetrival(), + MockInstructionRetrieval(), + MockMultilingualInstructionRetrieval(), + MockMultilingualInstructionReranking(), + MockInstructionReranking(), ] MOCK_TASK_TEST_GRID_AS_STRING = [ diff --git a/tests/test_benchmark/test_benchmark.py b/tests/test_benchmark/test_benchmark.py index ff3e1d5c86..660dd50c80 100644 --- a/tests/test_benchmark/test_benchmark.py +++ b/tests/test_benchmark/test_benchmark.py @@ -23,8 +23,8 @@ MockTorchEncoder, ) from .mock_tasks import ( - MockInstructionRetrival, - MockMultilingualInstructionRetrival, + MockInstructionRetrieval, + MockMultilingualInstructionRetrieval, MockMultilingualRerankingTask, MockMultilingualRetrievalTask, MockRerankingTask, @@ -268,8 +268,8 @@ def encode(self, sentences, prompt_name: str | None = None, **kwargs): [ MockRerankingTask(), 
MockMultilingualRerankingTask(), - MockInstructionRetrival(), - MockMultilingualInstructionRetrival(), + MockInstructionRetrieval(), + MockMultilingualInstructionRetrieval(), MockRetrievalTask(), MockMultilingualRetrievalTask(), ], diff --git a/tests/test_evaluators/test_InstructionRetrievalEvaluator.py b/tests/test_evaluators/test_InstructionRetrievalEvaluator.py index 1711a641bf..42bc23b48f 100644 --- a/tests/test_evaluators/test_InstructionRetrievalEvaluator.py +++ b/tests/test_evaluators/test_InstructionRetrievalEvaluator.py @@ -23,41 +23,43 @@ def test_p_mrr(self): # these are the query: {"doc_id": score} original_run = { - "a": {"0": 1, "1": 2, "2": 3, "3": 4}, + "a-og": {"0": 1, "1": 2, "2": 3, "3": 4}, } new_run = { - "a": {"0": 1, "1": 2, "2": 3, "3": 4}, + "a-changed": {"0": 1, "1": 2, "2": 3, "3": 4}, } - results = utils.evaluate_change( + score = utils.calculate_pmrr( original_run, new_run, changed_qrels, ) - - assert results["p-MRR"] == 0.0 + assert score == 0.0 # test with a change new_run = { - "a": {"0": 4, "1": 1, "2": 2, "3": 3}, + "a-changed": {"0": 4, "1": 1, "2": 2, "3": 3}, } - results = utils.evaluate_change( + score = utils.calculate_pmrr( original_run, new_run, changed_qrels, ) + assert score == -0.75 - assert results["p-MRR"] == -0.75 - - # test with a positive change - - results = utils.evaluate_change( + # test with a positive change, flipping them + new_run = { + "a-og": {"0": 4, "1": 1, "2": 2, "3": 3}, + } + original_run = { + "a-changed": {"0": 1, "1": 2, "2": 3, "3": 4}, + } + score = utils.calculate_pmrr( new_run, original_run, changed_qrels, ) - - assert results["p-MRR"] == 0.75 + assert score == 0.75 diff --git a/tests/test_evaluators/test_RetrievalEvaluator.py b/tests/test_evaluators/test_RetrievalEvaluator.py index 01a4747969..1d4714aca4 100644 --- a/tests/test_evaluators/test_RetrievalEvaluator.py +++ b/tests/test_evaluators/test_RetrievalEvaluator.py @@ -38,6 +38,7 @@ def setup_method(self): "map": {"MAP@1": 0.75, "MAP@2": 1.0, "MAP@3": 1.0}, "recall": {"Recall@1": 0.75, "Recall@2": 1.0, "Recall@3": 1.0}, "precision": {"P@1": 1.0, "P@2": 0.75, "P@3": 0.5}, + "task_specific": {}, }, ), # Test no self retrieval @@ -57,6 +58,7 @@ def setup_method(self): "map": {"MAP@1": 0.25, "MAP@2": 0.25, "MAP@3": 0.25}, "recall": {"Recall@1": 0.25, "Recall@2": 0.25, "Recall@3": 0.25}, "precision": {"P@1": 0.5, "P@2": 0.25, "P@3": 0.16667}, + "task_specific": {}, }, ), ], @@ -71,12 +73,13 @@ def test_metrics_at_k( ignore_identical_ids=ignore_identical_ids, ) - ndcg, _map, recall, precision, nauc = output + ndcg, _map, recall, precision, nauc, task_specific = output assert ndcg == expected_metrics["ndcg"] assert _map == expected_metrics["map"] assert recall == expected_metrics["recall"] assert precision == expected_metrics["precision"] + assert task_specific == expected_metrics["task_specific"] @pytest.mark.parametrize( "ignore_identical_ids, expected_naucs", @@ -115,7 +118,7 @@ def test_nAUC(self, ignore_identical_ids, expected_naucs): "4": {"0": 0.5, "1": 0.4, "2": 0.5}, } - _, _, _, _, naucs = self.evaluator.evaluate( + _, _, _, _, naucs, _ = self.evaluator.evaluate( relevant_docs, results, [1, 2, 3], From 4f25da46e0d4d5018d7059e8a0b32f385488eb7a Mon Sep 17 00:00:00 2001 From: Orion Weller Date: Tue, 5 Nov 2024 18:24:02 -0500 Subject: [PATCH 06/16] add metadata for all but MindSmall --- mteb/abstasks/AbsTask.py | 4 +- mteb/abstasks/AbsTaskRetrieval.py | 7 +- .../evaluators/RetrievalEvaluator.py | 59 +---- mteb/evaluation/evaluators/utils.py | 154 ++++++++---- 
.../multilingual/mFollowIR.py | 9 +- .../Reranking/eng/AskUbuntuDupQuestions.py | 14 +- .../tasks/Reranking/eng/MindSmallReranking.py | 42 +++- mteb/tasks/Reranking/eng/NevIR.py | 14 +- mteb/tasks/Reranking/eng/SciDocsReranking.py | 11 +- .../eng/StackOverflowDupQuestions.py | 11 +- .../eng/WebLINXCandidatesReranking.py | 66 ++++- mteb/tasks/Reranking/fra/AlloprofReranking.py | 11 +- mteb/tasks/Reranking/fra/SyntecReranking.py | 14 +- mteb/tasks/Reranking/jpn/MMarcoReranking.py | 11 +- .../Reranking/multilingual/ESCIReranking.py | 56 +++-- .../Reranking/multilingual/MIRACLReranking.py | 193 +++++++++++++- .../WikipediaRerankingMultilingual.py | 238 ++++++++++-------- mteb/tasks/Reranking/rus/RuBQReranking.py | 11 +- mteb/tasks/Reranking/zho/CMTEBReranking.py | 55 +++- 19 files changed, 715 insertions(+), 265 deletions(-) diff --git a/mteb/abstasks/AbsTask.py b/mteb/abstasks/AbsTask.py index 5fa983bbe5..f84c1012ca 100644 --- a/mteb/abstasks/AbsTask.py +++ b/mteb/abstasks/AbsTask.py @@ -222,7 +222,9 @@ def calculate_metadata_metrics( split_details ) else: - split_details = self._calculate_metrics_from_split(split) + split_details = self._calculate_metrics_from_split( + split, hf_subset="default" + ) all_details[split] = split_details return all_details diff --git a/mteb/abstasks/AbsTaskRetrieval.py b/mteb/abstasks/AbsTaskRetrieval.py index 3956280473..e6e8f3c17e 100644 --- a/mteb/abstasks/AbsTaskRetrieval.py +++ b/mteb/abstasks/AbsTaskRetrieval.py @@ -259,7 +259,7 @@ def _add_main_score(self, scores: ScoresDict) -> None: def _calculate_metrics_from_split( self, split: str, hf_subset: str | None = None, compute_overall: bool = False ) -> RetrievalDescriptiveStatistics: - if hf_subset: + if hf_subset and hf_subset in self.queries: queries = self.queries[hf_subset][split] corpus = self.corpus[hf_subset][split] relevant_docs = self.relevant_docs[hf_subset][split] @@ -346,7 +346,10 @@ def calculate_length( queries_lens.append(len(query)) for doc in corpus.values(): - doc_lens.append(len(doc)) + if isinstance(doc, dict): + doc_lens.append(len(doc["text"])) + else: + doc_lens.append(len(doc)) doc_len = sum(doc_lens) / len(doc_lens) if doc_lens else 0 query_len = sum(queries_lens) / len(queries_lens) if queries_lens else 0 diff --git a/mteb/evaluation/evaluators/RetrievalEvaluator.py b/mteb/evaluation/evaluators/RetrievalEvaluator.py index 0a1d96cec9..8dcac9ab00 100644 --- a/mteb/evaluation/evaluators/RetrievalEvaluator.py +++ b/mteb/evaluation/evaluators/RetrievalEvaluator.py @@ -3,9 +3,6 @@ import logging from typing import Any -import numpy as np -import pytrec_eval - from mteb.evaluation.evaluators.model_classes import ( DenseRetrievalExactSearch, DRESModel, @@ -15,11 +12,10 @@ from .Evaluator import Evaluator from .utils import ( add_task_specific_scores, - confidence_scores, + calculate_retrieval_scores, + evaluate_abstention, hole, mrr, - nAUC, - parse_metrics_from_scores, recall_cap, top_k_accuracy, ) @@ -139,31 +135,11 @@ def evaluate( "For evaluation, we DO NOT ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=True`` to ignore this." ) - map_string = "map_cut." + ",".join([str(k) for k in k_values]) - ndcg_string = "ndcg_cut." + ",".join([str(k) for k in k_values]) - recall_string = "recall." + ",".join([str(k) for k in k_values]) - precision_string = "P." 
+ ",".join([str(k) for k in k_values]) - evaluator = pytrec_eval.RelevanceEvaluator( - qrels, {map_string, ndcg_string, recall_string, precision_string} - ) - scores = evaluator.evaluate(results) - - ( - ndcg, - _map, - recall, - precision, - all_ndcgs, - all_aps, - all_recalls, - all_precisions, - ) = parse_metrics_from_scores(scores, k_values) - - naucs = RetrievalEvaluator.evaluate_abstention( - results, {**all_ndcgs, **all_aps, **all_recalls, **all_precisions} + all_scores, ndcg, _map, recall, precision, naucs = calculate_retrieval_scores( + results, qrels, k_values ) task_scores = add_task_specific_scores( - scores, qrels, results, task_name, k_values + all_scores, qrels, results, task_name, k_values ) return ndcg, _map, recall, precision, naucs, task_scores @@ -194,30 +170,7 @@ def evaluate_custom( ]: metric_scores = top_k_accuracy(qrels, results, k_values, output_type) - naucs = RetrievalEvaluator.evaluate_abstention(results, metric_scores) + naucs = evaluate_abstention(results, metric_scores) metric_scores_avg = {k: sum(v) / len(v) for k, v in metric_scores.items()} return metric_scores_avg, naucs - - @staticmethod - def evaluate_abstention( - results: dict[str, dict[str, float]], - metric_scores: dict[str, list[float]], - ) -> dict[str, float]: - """Computes normalized Area Under the Curve on a set of evaluated instances as presented in the paper https://arxiv.org/abs/2402.12997""" - all_sim_scores = [list(results[qid].values()) for qid in list(results.keys())] - all_conf_scores = [ - confidence_scores(sim_scores) for sim_scores in all_sim_scores - ] - conf_fcts = list(all_conf_scores[0].keys()) - all_conf_scores = { - fct: np.array([x[fct] for x in all_conf_scores]) for fct in conf_fcts - } - metric_scores = {k: np.array(v) for k, v in metric_scores.items()} - naucs = {} - - for metric_name, scores in metric_scores.items(): - for fct, conf_scores in all_conf_scores.items(): - naucs[f"nAUC_{metric_name}_{fct}"] = nAUC(conf_scores, scores) - - return naucs diff --git a/mteb/evaluation/evaluators/utils.py b/mteb/evaluation/evaluators/utils.py index 8ccb6de8d2..e7e010bad2 100644 --- a/mteb/evaluation/evaluators/utils.py +++ b/mteb/evaluation/evaluators/utils.py @@ -240,6 +240,9 @@ def get_rank_from_dict( def calculate_pmrr(original_run, new_run, changed_qrels): changes = [] for qid in changed_qrels.keys(): + if qid + "-og" not in original_run or qid + "-changed" not in new_run: + logging.warning(f"Query {qid} not found in the runs for calculating p-MRR") + continue original_qid_run = original_run[qid + "-og"] new_qid_run = new_run[qid + "-changed"] for idx, changed_doc in enumerate(changed_qrels[qid]): @@ -283,14 +286,36 @@ def evaluate_p_mrr_change( "Core17InstructionRetrieval": ("jhu-clsp/core17-instructions-mteb", False), "Robust04InstructionRetrieval": ("jhu-clsp/robust04-instructions-mteb", False), "News21InstructionRetrieval": ("jhu-clsp/news21-instructions-mteb", False), - "mFollowIR": ("jhu-clsp/mfollowir-mteb", True), + "mFollowIR": ("jhu-clsp/mfollowir-parquet-mteb", True), "mFollowIRCrossLingual": ( - "jhu-clsp/mfollowir-cross-lingual-mteb", + "jhu-clsp/mfollowir-cross-lingual-parquet-mteb", True, ), } hf_path, is_multilingual = TASK_TO_HF_DATASET[task_name] - langs = "eng" if not is_multilingual else ["zho", "rus", "fas"] + if is_multilingual: + # figure out which of the languages this is: ["zho", "rus", "fas"] + # gather the changed_qrels for each, and store the keys as a check + for lang in ["zho", "rus", "fas"]: + config_name = f"qrel_diff-{lang}" + changed_qrels = { + 
item["query-id"]: item["corpus-ids"] + for item in load_dataset(hf_path, config_name)["qrel_diff"] + } + potential_keys = {item + "-og" for item in changed_qrels.keys()} | { + item + "-changed" for item in changed_qrels.keys() + } + if ( + potential_keys == set(qrels.keys()) + or len(potential_keys - set(qrels.keys())) <= 2 + ): # there are about two skipped + break # this is the right qrels + + else: + changed_qrels = { + item["query-id"]: item["corpus-ids"] + for item in load_dataset(hf_path, "qrel_diff")["qrel_diff"] + } qrels_sep = { "og": {k: v for k, v in qrels.items() if k.endswith("-og")}, @@ -306,37 +331,20 @@ def evaluate_p_mrr_change( else: new_run[qid] = docs - for lang in langs: - config_name = "qrel_diff" if not is_multilingual else f"qrel_diff-{lang}" - changed_qrels = { - item["query-id"]: item["corpus-ids"] - for item in load_dataset(hf_path, config_name)["qrel_diff"] - } - p_mrr = calculate_pmrr(original_run, new_run, changed_qrels) - followir_scores[lang]["p-MRR"] = p_mrr - - # unfortunately, have to re-compute scores here to get only og and changed scores - followir_scores[lang]["og"] = {} - followir_scores[lang]["changed"] = {} - for name, group in [("og", original_run), ("changed", new_run)]: - map_string = "map_cut." + ",".join([str(k) for k in k_values]) - ndcg_string = "ndcg_cut." + ",".join([str(k) for k in k_values]) - recall_string = "recall." + ",".join([str(k) for k in k_values]) - precision_string = "P." + ",".join([str(k) for k in k_values]) - evaluator = pytrec_eval.RelevanceEvaluator( - qrels_sep[name], - {map_string, ndcg_string, recall_string, precision_string}, - ) - # qrels_changed_as_og = {k.replace("-changed", "-og"): v for k, v in qrels_sep["changed"].items()} - group_scores = evaluator.evaluate(group) - ndcg, _map, recall, precision = parse_metrics_from_scores( - group_scores, k_values - )[:4] # don't need the other fields + p_mrr = calculate_pmrr(original_run, new_run, changed_qrels) + followir_scores["p-MRR"] = p_mrr - # add these to the followir_scores with name prefix - scores_dict = make_score_dict(ndcg, _map, recall, precision, {}, {}, {}, {}) - for key, value in scores_dict.items(): - followir_scores[lang][name][key] = value + # unfortunately, have to re-compute scores here to get only og and changed scores + followir_scores["og"] = {} + followir_scores["changed"] = {} + for name, group in [("og", original_run), ("changed", new_run)]: + _, ndcg, _map, recall, precision, naucs = calculate_retrieval_scores( + group, qrels_sep[name], k_values + ) + # add these to the followir_scores with name prefix + scores_dict = make_score_dict(ndcg, _map, recall, precision, {}, naucs, {}, {}) + for key, value in scores_dict.items(): + followir_scores[name][key] = value return followir_scores @@ -654,26 +662,82 @@ def parse_metrics_from_scores(scores, k_values): ) -def max_over_subqueries(qrels, results, scores): - """Computes the max over subqueries. This metric is the maximum of the scores for all subqueries +def max_over_subqueries(qrels, results, k_values): + """Computes the max over subqueries scores when merging. 
Args: qrels: Ground truth relevance judgments for the queries results: Predicted relevance scores for the queries - scores: The scores for the queries, to extract average precision for each query + k_values: The k values for which to compute the scores """ query_keys = defaultdict(list) for key in qrels.keys(): query_keys[key.split("_")[0]].append(key) - max_over_subqueries = [] - for _, keys in query_keys.items(): - # get the average precision for each query - current_scores = [] - for key in keys: - current_scores.append(scores[key]["P_1"]) + new_results = {} + for query_id_base, query_ids in query_keys.items(): + doc_scores = defaultdict(float) + for query_id_full in query_ids: + for doc_id, score in results[query_id_full].items(): + if doc_id not in doc_scores: + doc_scores[doc_id] = score + else: + doc_scores[doc_id] = max(score, doc_scores[doc_id]) + + new_results[query_id_base] = doc_scores + + # now we have the new results, we can compute the scores + _, ndcg, _map, recall, precision, naucs = calculate_retrieval_scores( + new_results, qrels, k_values + ) + score_dict = make_score_dict(ndcg, _map, recall, precision, {}, naucs, {}, {}) + return {"max_over_subqueries_" + k: v for k, v in score_dict.items()} + + +def calculate_retrieval_scores(results, qrels, k_values): + map_string = "map_cut." + ",".join([str(k) for k in k_values]) + ndcg_string = "ndcg_cut." + ",".join([str(k) for k in k_values]) + recall_string = "recall." + ",".join([str(k) for k in k_values]) + precision_string = "P." + ",".join([str(k) for k in k_values]) + evaluator = pytrec_eval.RelevanceEvaluator( + qrels, {map_string, ndcg_string, recall_string, precision_string} + ) + scores = evaluator.evaluate(results) + + ( + ndcg, + _map, + recall, + precision, + all_ndcgs, + all_aps, + all_recalls, + all_precisions, + ) = parse_metrics_from_scores(scores, k_values) + + naucs = evaluate_abstention( + results, {**all_ndcgs, **all_aps, **all_recalls, **all_precisions} + ) + + return scores, ndcg, _map, recall, precision, naucs + + +def evaluate_abstention( + results: dict[str, dict[str, float]], + metric_scores: dict[str, list[float]], +) -> dict[str, float]: + """Computes normalized Area Under the Curve on a set of evaluated instances as presented in the paper https://arxiv.org/abs/2402.12997""" + all_sim_scores = [list(results[qid].values()) for qid in list(results.keys())] + all_conf_scores = [confidence_scores(sim_scores) for sim_scores in all_sim_scores] + conf_fcts = list(all_conf_scores[0].keys()) + all_conf_scores = { + fct: np.array([x[fct] for x in all_conf_scores]) for fct in conf_fcts + } + metric_scores = {k: np.array(v) for k, v in metric_scores.items()} + naucs = {} - # get the max average precision - max_over_subqueries.append(max(current_scores)) + for metric_name, scores in metric_scores.items(): + for fct, conf_scores in all_conf_scores.items(): + naucs[f"nAUC_{metric_name}_{fct}"] = nAUC(conf_scores, scores) - return sum(max_over_subqueries) / len(max_over_subqueries) + return naucs diff --git a/mteb/tasks/InstructionReranking/multilingual/mFollowIR.py b/mteb/tasks/InstructionReranking/multilingual/mFollowIR.py index be265391e7..a12be42492 100644 --- a/mteb/tasks/InstructionReranking/multilingual/mFollowIR.py +++ b/mteb/tasks/InstructionReranking/multilingual/mFollowIR.py @@ -120,10 +120,7 @@ def load_data( revision=revision, ) for row in top_ranked_data["top_ranked"]: - if row["qid"] not in top_ranked[lang][EVAL_SPLIT]: - top_ranked[lang][EVAL_SPLIT][row["qid"]] = [row["pid"]] - else: - 
top_ranked[lang][EVAL_SPLIT][row["qid"]].append(row["pid"]) + top_ranked[lang][EVAL_SPLIT][row["query-id"]] = row["corpus-ids"] return (corpus, queries, instructions, relevant_docs, top_ranked) @@ -135,7 +132,7 @@ class mFollowIRCrossLingual(MultilingualTask, AbsTaskReranking): reference="https://neuclir.github.io/", dataset={ "path": "jhu-clsp/mFollowIR-cross-lingual-parquet-mteb", - "revision": "9c1e094d813857dcdefcaf9764c520b8ba237ffd", + "revision": "6b01566619233a0c35d135123510b6b02c258ff5", }, type="Retrieval", category="s2p", @@ -231,7 +228,7 @@ class mFollowIR(MultilingualTask, AbsTaskReranking): reference="https://neuclir.github.io/", dataset={ "path": "jhu-clsp/mFollowIR-parquet-mteb", - "revision": "d95f42313ebac4f1ae188afcd26097210ada8779", + "revision": "09eecbe45c54b4a6dfb8e68e345cae77337768e2", }, type="Retrieval", category="s2p", diff --git a/mteb/tasks/Reranking/eng/AskUbuntuDupQuestions.py b/mteb/tasks/Reranking/eng/AskUbuntuDupQuestions.py index 54bc8316b9..3f6a4a4b33 100644 --- a/mteb/tasks/Reranking/eng/AskUbuntuDupQuestions.py +++ b/mteb/tasks/Reranking/eng/AskUbuntuDupQuestions.py @@ -38,12 +38,14 @@ class AskUbuntuDupQuestions(AbsTaskReranking): descriptive_stats={ "n_samples": {"test": 2255}, "test": { - "num_samples": 375, - "num_positive": 375, - "num_negative": 375, - "avg_query_len": 50.205333333333336, - "avg_positive_len": 6.013333333333334, - "avg_negative_len": 13.986666666666666, + "average_document_length": 52.49722991689751, + "average_query_length": 50.13019390581717, + "num_documents": 7220, + "num_queries": 361, + "average_relevant_docs_per_query": 5.470914127423823, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 20.0, }, }, ) diff --git a/mteb/tasks/Reranking/eng/MindSmallReranking.py b/mteb/tasks/Reranking/eng/MindSmallReranking.py index 3c25f5f25e..556b2614ea 100644 --- a/mteb/tasks/Reranking/eng/MindSmallReranking.py +++ b/mteb/tasks/Reranking/eng/MindSmallReranking.py @@ -1,8 +1,17 @@ from __future__ import annotations +import logging + +import tqdm +from datasets import Dataset, DatasetDict + from mteb.abstasks.TaskMetadata import TaskMetadata from ....abstasks.AbsTaskReranking import AbsTaskReranking +from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) class MindSmallReranking(AbsTaskReranking): @@ -19,7 +28,7 @@ class MindSmallReranking(AbsTaskReranking): modalities=["text"], eval_splits=["test"], eval_langs=["eng-Latn"], - main_score="map_at_1000", + main_score="max_over_subqueries_map_at_1000", date=("2019-10-12", "2019-11-22"), domains=["News", "Written"], task_subtypes=[], @@ -51,3 +60,34 @@ class MindSmallReranking(AbsTaskReranking): "avg_character_length": {"test": 70.9}, }, ) + + def load_data(self, **kwargs): + if self.data_loaded: + return + + # since AbsTaskReranking has no `load_data` method, we call the parent class method + super(AbsTaskRetrieval, self).load_data(**kwargs) + + # we can expand the queries so that each has its own row and a new query-id + new_dataset = {} + for split in self.dataset: + logging.info(f"Expanding queries for split {split}") + # Create lists to store the expanded data + expanded_data = [] + + for instance in tqdm.tqdm(self.dataset[split]): + for i, subquery in enumerate(instance["query"]): + new_instance = instance.copy() + new_instance["query"] = subquery + new_instance["id"] = f"{instance['id']}_{i}" + expanded_data.append(new_instance) + + # Convert the list of 
instances into a Dataset object + new_dataset[split] = Dataset.from_list(expanded_data) + + # Create the DatasetDict from the dictionary of Datasets + self.dataset = DatasetDict(new_dataset) + + # now convert to the new format + self.transform_old_dataset_format(self.dataset) + self.data_loaded = True diff --git a/mteb/tasks/Reranking/eng/NevIR.py b/mteb/tasks/Reranking/eng/NevIR.py index c5810b62b3..f2769f7c04 100644 --- a/mteb/tasks/Reranking/eng/NevIR.py +++ b/mteb/tasks/Reranking/eng/NevIR.py @@ -37,12 +37,14 @@ class NevIR(AbsTaskReranking): descriptive_stats={ "n_samples": {"test": 2255}, "test": { - "num_samples": 375, - "num_positive": 375, - "num_negative": 375, - "avg_query_len": 50.205333333333336, - "avg_positive_len": 6.013333333333334, - "avg_negative_len": 13.986666666666666, + "average_document_length": 712.460289514867, + "average_query_length": 67.9287780187997, + "num_documents": 5112, + "num_queries": 2766, + "average_relevant_docs_per_query": 1.0, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 2.0, }, }, ) diff --git a/mteb/tasks/Reranking/eng/SciDocsReranking.py b/mteb/tasks/Reranking/eng/SciDocsReranking.py index be1153de53..f81641aeb0 100644 --- a/mteb/tasks/Reranking/eng/SciDocsReranking.py +++ b/mteb/tasks/Reranking/eng/SciDocsReranking.py @@ -52,6 +52,15 @@ class SciDocsReranking(AbsTaskReranking): """, descriptive_stats={ "n_samples": {"test": 19599}, - "avg_character_length": {"test": 69.0}, + "test": { + "average_document_length": 71.52865935919056, + "average_query_length": 69.87456008044244, + "num_documents": 118600, + "num_queries": 3978, + "average_relevant_docs_per_query": 4.92684766214178, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 29.813976872800403, + }, }, ) diff --git a/mteb/tasks/Reranking/eng/StackOverflowDupQuestions.py b/mteb/tasks/Reranking/eng/StackOverflowDupQuestions.py index 9b53b75f8c..6b4d6ce57c 100644 --- a/mteb/tasks/Reranking/eng/StackOverflowDupQuestions.py +++ b/mteb/tasks/Reranking/eng/StackOverflowDupQuestions.py @@ -36,6 +36,15 @@ class StackOverflowDupQuestions(AbsTaskReranking): }""", descriptive_stats={ "n_samples": {"test": 3467}, - "avg_character_length": {"test": 49.8}, + "test": { + "average_document_length": 44.482094556834696, + "average_query_length": 53.160762032085564, + "num_documents": 89470, + "num_queries": 2992, + "average_relevant_docs_per_query": 1.1587566844919786, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 29.90307486631016, + }, }, ) diff --git a/mteb/tasks/Reranking/eng/WebLINXCandidatesReranking.py b/mteb/tasks/Reranking/eng/WebLINXCandidatesReranking.py index 2d9b528479..b99af72049 100644 --- a/mteb/tasks/Reranking/eng/WebLINXCandidatesReranking.py +++ b/mteb/tasks/Reranking/eng/WebLINXCandidatesReranking.py @@ -54,13 +54,65 @@ class WebLINXCandidatesReranking(AbsTaskReranking): "test_vis": 5298, "test_geo": 4916, }, - "avg_character_length": { - "validation": 1647.52, - "test_iid": 1722.63, - "test_cat": 2149.66, - "test_web": 1831.46, - "test_vis": 1737.26, - "test_geo": 1742.66, + "validation": { + "average_document_length": 318.17634941296905, + "average_query_length": 1647.5180630284397, + "num_documents": 316508, + "num_queries": 1301, + "average_relevant_docs_per_query": 1.01076095311299, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 243.2805534204458, + }, + "test_iid": { + "average_document_length": 
318.135696550501, + "average_query_length": 1722.6321279554938, + "num_documents": 405972, + "num_queries": 1438, + "average_relevant_docs_per_query": 1.0528511821974966, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 282.317107093185, + }, + "test_cat": { + "average_document_length": 313.91351392594606, + "average_query_length": 2149.6587078651687, + "num_documents": 1258191, + "num_queries": 3560, + "average_relevant_docs_per_query": 1.0016853932584269, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 353.4244382022472, + }, + "test_geo": { + "average_document_length": 315.00053963351843, + "average_query_length": 1742.6588689991863, + "num_documents": 1150781, + "num_queries": 4916, + "average_relevant_docs_per_query": 1.0024410089503661, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 234.08889340927584, + }, + "test_vis": { + "average_document_length": 327.165126601106, + "average_query_length": 1737.2595318988297, + "num_documents": 1606858, + "num_queries": 5298, + "average_relevant_docs_per_query": 1.0152887882219706, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 303.2952057380143, + }, + "test_web": { + "average_document_length": 326.280188209908, + "average_query_length": 1831.4624681933842, + "num_documents": 834175, + "num_queries": 3144, + "average_relevant_docs_per_query": 1.0588422391857506, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 265.3228371501272, }, }, ) diff --git a/mteb/tasks/Reranking/fra/AlloprofReranking.py b/mteb/tasks/Reranking/fra/AlloprofReranking.py index 54cada1a94..8d7400c127 100644 --- a/mteb/tasks/Reranking/fra/AlloprofReranking.py +++ b/mteb/tasks/Reranking/fra/AlloprofReranking.py @@ -41,7 +41,16 @@ class AlloprofReranking(AbsTaskReranking): }""", descriptive_stats={ "n_samples": {"test": 2316, "train": 9264}, - "avg_character_length": None, + "test": { + "average_document_length": 4071.0077079755583, + "average_query_length": 170.71286701208982, + "num_documents": 25039, + "num_queries": 2316, + "average_relevant_docs_per_query": 1.2845423143350605, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 10.811312607944732, + }, }, ) diff --git a/mteb/tasks/Reranking/fra/SyntecReranking.py b/mteb/tasks/Reranking/fra/SyntecReranking.py index ac6466e458..a86d61b679 100644 --- a/mteb/tasks/Reranking/fra/SyntecReranking.py +++ b/mteb/tasks/Reranking/fra/SyntecReranking.py @@ -37,7 +37,19 @@ class SyntecReranking(AbsTaskReranking): archivePrefix={arXiv}, primaryClass={cs.CL} }""", - descriptive_stats={"n_samples": None, "avg_character_length": None}, + descriptive_stats={ + "n_samples": {"test": 100}, + "test": { + "average_document_length": 1716.897738446411, + "average_query_length": 72.82, + "num_documents": 1017, + "num_queries": 100, + "average_relevant_docs_per_query": 1.0, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 10.17, + }, + }, ) def load_data(self, **kwargs): diff --git a/mteb/tasks/Reranking/jpn/MMarcoReranking.py b/mteb/tasks/Reranking/jpn/MMarcoReranking.py index 86c22a1d26..825832898c 100644 --- a/mteb/tasks/Reranking/jpn/MMarcoReranking.py +++ b/mteb/tasks/Reranking/jpn/MMarcoReranking.py @@ -35,7 +35,16 @@ class VoyageMMarcoReranking(AbsTaskReranking): archivePrefix={arXiv},}""", descriptive_stats={ "n_samples": 
{"test": 2048}, - "avg_character_length": {"test": 162}, + "test": { + "average_document_length": 164.72532084309134, + "average_query_length": 15.9208984375, + "num_documents": 53375, + "num_queries": 2048, + "average_relevant_docs_per_query": 1.06201171875, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 26.06201171875, + }, }, ) diff --git a/mteb/tasks/Reranking/multilingual/ESCIReranking.py b/mteb/tasks/Reranking/multilingual/ESCIReranking.py index 3c0db2dcac..05d858c7c0 100644 --- a/mteb/tasks/Reranking/multilingual/ESCIReranking.py +++ b/mteb/tasks/Reranking/multilingual/ESCIReranking.py @@ -49,36 +49,44 @@ class ESCIReranking(AbsTaskReranking, MultilingualTask): bibtex_citation=_CITATION, descriptive_stats={ "test": { - "num_samples": 29285, - "num_positive": 29285, - "num_negative": 29285, - "avg_query_len": 19.691890046098685, - "avg_positive_len": 9.268089465596722, - "avg_negative_len": 1.5105002561038074, + "average_document_length": 793.9222570025365, + "average_query_length": 20.194805194805195, + "num_documents": 148232, + "num_queries": 10395, + "average_relevant_docs_per_query": 10.277825877825878, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 14.25993265993266, "hf_subset_descriptive_stats": { "us": { - "num_samples": 21296, - "num_positive": 21296, - "num_negative": 21296, - "avg_query_len": 21.440833959429, - "avg_positive_len": 8.892515026296017, - "avg_negative_len": 1.1956705484598047, + "average_document_length": 858.2693745556295, + "average_query_length": 22.554526441589484, + "num_documents": 87202, + "num_queries": 6694, + "average_relevant_docs_per_query": 9.446519270988945, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 13.026889752016732, }, "es": { - "num_samples": 3703, - "num_positive": 3703, - "num_negative": 3703, - "avg_query_len": 20.681609505806104, - "avg_positive_len": 10.561706724277613, - "avg_negative_len": 2.749932487172563, + "average_document_length": 1006.1636500281832, + "average_query_length": 21.262560777957862, + "num_documents": 31934, + "num_queries": 1851, + "average_relevant_docs_per_query": 12.038357644516477, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 17.252296056185845, }, "jp": { - "num_samples": 4286, - "num_positive": 4286, - "num_negative": 4286, - "avg_query_len": 10.146756882874476, - "avg_positive_len": 10.016565562295847, - "avg_negative_len": 2.003966402239851, + "average_document_length": 368.12785262579047, + "average_query_length": 10.588108108108107, + "num_documents": 29096, + "num_queries": 1850, + "average_relevant_docs_per_query": 11.524324324324324, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 15.727567567567567, }, }, } diff --git a/mteb/tasks/Reranking/multilingual/MIRACLReranking.py b/mteb/tasks/Reranking/multilingual/MIRACLReranking.py index 00048688f5..d20cdf97a1 100644 --- a/mteb/tasks/Reranking/multilingual/MIRACLReranking.py +++ b/mteb/tasks/Reranking/multilingual/MIRACLReranking.py @@ -70,6 +70,197 @@ class MIRACLReranking(AbsTaskReranking, MultilingualTask): bibtex_citation=_CITATION, descriptive_stats={ "n_samples": {"dev": 44608}, - "avg_character_length": {"dev": 506.30}, + "dev": { + "average_document_length": 611.2265860323572, + "average_query_length": 36.47500798466943, + "num_documents": 128812, + "num_queries": 12524, + "average_relevant_docs_per_query": 
2.3573139572021717, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 10.285212392206963, + "hf_subset_descriptive_stats": { + "ar": { + "average_document_length": 693.8530670959345, + "average_query_length": 29.480662983425415, + "num_documents": 29197, + "num_queries": 2896, + "average_relevant_docs_per_query": 1.953729281767956, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 10.081837016574585, + }, + "bn": { + "average_document_length": 711.114122681883, + "average_query_length": 46.98053527980535, + "num_documents": 4206, + "num_queries": 411, + "average_relevant_docs_per_query": 2.099756690997567, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 10.233576642335766, + }, + "de": { + "average_document_length": 634.067007019783, + "average_query_length": 46.06578947368421, + "num_documents": 3134, + "num_queries": 304, + "average_relevant_docs_per_query": 2.6348684210526314, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 10.30921052631579, + }, + "en": { + "average_document_length": 750.0277271068953, + "average_query_length": 40.31003811944092, + "num_documents": 8223, + "num_queries": 787, + "average_relevant_docs_per_query": 2.7941550190597204, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 10.44853875476493, + }, + "es": { + "average_document_length": 626.9067948509044, + "average_query_length": 47.573743922204216, + "num_documents": 6137, + "num_queries": 617, + "average_relevant_docs_per_query": 4.345218800648298, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 9.946515397082658, + }, + "fa": { + "average_document_length": 492.3620453507838, + "average_query_length": 41.1503164556962, + "num_documents": 6571, + "num_queries": 632, + "average_relevant_docs_per_query": 2.079113924050633, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 10.397151898734178, + }, + "fi": { + "average_document_length": 630.257385028533, + "average_query_length": 38.76246830092984, + "num_documents": 11916, + "num_queries": 1183, + "average_relevant_docs_per_query": 1.9907016060862215, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 10.072696534234996, + }, + "fr": { + "average_document_length": 558.0711577719452, + "average_query_length": 43.883381924198254, + "num_documents": 3429, + "num_queries": 343, + "average_relevant_docs_per_query": 2.131195335276968, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 9.997084548104956, + }, + "hi": { + "average_document_length": 577.9819690898684, + "average_query_length": 53.34, + "num_documents": 3494, + "num_queries": 350, + "average_relevant_docs_per_query": 2.1485714285714286, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 9.982857142857142, + }, + "id": { + "average_document_length": 677.5285759393748, + "average_query_length": 38.03407880724175, + "num_documents": 9501, + "num_queries": 939, + "average_relevant_docs_per_query": 3.110756123535676, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 10.118210862619808, + }, + "ja": { + "average_document_length": 292.59835768626976, + "average_query_length": 17.7465495608532, + "num_documents": 8281, + "num_queries": 
797, + "average_relevant_docs_per_query": 2.1543287327478042, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 10.39021329987453, + }, + "ko": { + "average_document_length": 282.47890088321884, + "average_query_length": 21.624413145539908, + "num_documents": 3057, + "num_queries": 213, + "average_relevant_docs_per_query": 2.568075117370892, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 14.352112676056338, + }, + "ru": { + "average_document_length": 767.156817659232, + "average_query_length": 44.15878107457899, + "num_documents": 13047, + "num_queries": 1247, + "average_relevant_docs_per_query": 2.8123496391339216, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 10.46271050521251, + }, + "sw": { + "average_document_length": 541.4608131997643, + "average_query_length": 38.88565488565489, + "num_documents": 5091, + "num_queries": 481, + "average_relevant_docs_per_query": 1.8898128898128899, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 10.584199584199585, + }, + "te": { + "average_document_length": 787.1287703016242, + "average_query_length": 38.464285714285715, + "num_documents": 862, + "num_queries": 84, + "average_relevant_docs_per_query": 1.3095238095238095, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 10.261904761904763, + }, + "th": { + "average_document_length": 586.1071334214002, + "average_query_length": 42.83150684931507, + "num_documents": 7570, + "num_queries": 730, + "average_relevant_docs_per_query": 1.8356164383561644, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 10.36986301369863, + }, + "yo": { + "average_document_length": 407.2617845117845, + "average_query_length": 37.6890756302521, + "num_documents": 1188, + "num_queries": 119, + "average_relevant_docs_per_query": 1.2100840336134453, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 9.983193277310924, + }, + "zh": { + "average_document_length": 182.91760491299897, + "average_query_length": 10.859335038363172, + "num_documents": 3908, + "num_queries": 391, + "average_relevant_docs_per_query": 2.4910485933503836, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 9.994884910485933, + }, + }, + }, }, ) diff --git a/mteb/tasks/Reranking/multilingual/WikipediaRerankingMultilingual.py b/mteb/tasks/Reranking/multilingual/WikipediaRerankingMultilingual.py index 632a20d4e3..af63334660 100644 --- a/mteb/tasks/Reranking/multilingual/WikipediaRerankingMultilingual.py +++ b/mteb/tasks/Reranking/multilingual/WikipediaRerankingMultilingual.py @@ -72,140 +72,174 @@ class WikipediaRerankingMultilingual(AbsTaskReranking, MultilingualTask): "sv": 1500, }, "test": { - "num_samples": 24000, - "num_positive": 24000, - "num_negative": 24000, - "avg_query_len": 59.091208333333334, - "avg_positive_len": 1.0, - "avg_negative_len": 8.0, + "average_document_length": 381.70714351851854, + "average_query_length": 59.091208333333334, + "num_documents": 216000, + "num_queries": 24000, + "average_relevant_docs_per_query": 1.0, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 9.0, "hf_subset_descriptive_stats": { "bg": { - "num_samples": 1500, - "num_positive": 1500, - "num_negative": 1500, - "avg_query_len": 60.82666666666667, - 
"avg_positive_len": 1.0, - "avg_negative_len": 8.0, + "average_document_length": 374.376, + "average_query_length": 60.82666666666667, + "num_documents": 13500, + "num_queries": 1500, + "average_relevant_docs_per_query": 1.0, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 9.0, }, "bn": { - "num_samples": 1500, - "num_positive": 1500, - "num_negative": 1500, - "avg_query_len": 47.266666666666666, - "avg_positive_len": 1.0, - "avg_negative_len": 8.0, + "average_document_length": 394.05044444444445, + "average_query_length": 47.266666666666666, + "num_documents": 13500, + "num_queries": 1500, + "average_relevant_docs_per_query": 1.0, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 9.0, }, "cs": { - "num_samples": 1500, - "num_positive": 1500, - "num_negative": 1500, - "avg_query_len": 56.272, - "avg_positive_len": 1.0, - "avg_negative_len": 8.0, + "average_document_length": 369.9831111111111, + "average_query_length": 56.272, + "num_documents": 13500, + "num_queries": 1500, + "average_relevant_docs_per_query": 1.0, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 9.0, }, "da": { - "num_samples": 1500, - "num_positive": 1500, - "num_negative": 1500, - "avg_query_len": 56.75066666666667, - "avg_positive_len": 1.0, - "avg_negative_len": 8.0, + "average_document_length": 345.2597037037037, + "average_query_length": 56.75066666666667, + "num_documents": 13500, + "num_queries": 1500, + "average_relevant_docs_per_query": 1.0, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 9.0, }, "de": { - "num_samples": 1500, - "num_positive": 1500, - "num_negative": 1500, - "avg_query_len": 70.004, - "avg_positive_len": 1.0, - "avg_negative_len": 8.0, + "average_document_length": 398.4137777777778, + "average_query_length": 70.004, + "num_documents": 13500, + "num_queries": 1500, + "average_relevant_docs_per_query": 1.0, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 9.0, }, "en": { - "num_samples": 1500, - "num_positive": 1500, - "num_negative": 1500, - "avg_query_len": 68.372, - "avg_positive_len": 1.0, - "avg_negative_len": 8.0, + "average_document_length": 452.9871111111111, + "average_query_length": 68.372, + "num_documents": 13500, + "num_queries": 1500, + "average_relevant_docs_per_query": 1.0, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 9.0, }, "fa": { - "num_samples": 1500, - "num_positive": 1500, - "num_negative": 1500, - "avg_query_len": 48.66733333333333, - "avg_positive_len": 1.0, - "avg_negative_len": 8.0, + "average_document_length": 345.1568888888889, + "average_query_length": 48.66733333333333, + "num_documents": 13500, + "num_queries": 1500, + "average_relevant_docs_per_query": 1.0, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 9.0, }, "fi": { - "num_samples": 1500, - "num_positive": 1500, - "num_negative": 1500, - "avg_query_len": 55.343333333333334, - "avg_positive_len": 1.0, - "avg_negative_len": 8.0, + "average_document_length": 379.71237037037037, + "average_query_length": 55.343333333333334, + "num_documents": 13500, + "num_queries": 1500, + "average_relevant_docs_per_query": 1.0, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 9.0, }, "hi": { - "num_samples": 1500, - "num_positive": 1500, - "num_negative": 1500, - 
"avg_query_len": 50.77733333333333, - "avg_positive_len": 1.0, - "avg_negative_len": 8.0, + "average_document_length": 410.72540740740743, + "average_query_length": 50.77733333333333, + "num_documents": 13500, + "num_queries": 1500, + "average_relevant_docs_per_query": 1.0, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 9.0, }, "it": { - "num_samples": 1500, - "num_positive": 1500, - "num_negative": 1500, - "avg_query_len": 70.05466666666666, - "avg_positive_len": 1.0, - "avg_negative_len": 8.0, + "average_document_length": 393.73437037037036, + "average_query_length": 70.05466666666666, + "num_documents": 13500, + "num_queries": 1500, + "average_relevant_docs_per_query": 1.0, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 9.0, }, "nl": { - "num_samples": 1500, - "num_positive": 1500, - "num_negative": 1500, - "avg_query_len": 65.34466666666667, - "avg_positive_len": 1.0, - "avg_negative_len": 8.0, + "average_document_length": 375.6695555555556, + "average_query_length": 65.34466666666667, + "num_documents": 13500, + "num_queries": 1500, + "average_relevant_docs_per_query": 1.0, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 9.0, }, "pt": { - "num_samples": 1500, - "num_positive": 1500, - "num_negative": 1500, - "avg_query_len": 65.11933333333333, - "avg_positive_len": 1.0, - "avg_negative_len": 8.0, + "average_document_length": 398.27237037037037, + "average_query_length": 65.11933333333333, + "num_documents": 13500, + "num_queries": 1500, + "average_relevant_docs_per_query": 1.0, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 9.0, }, "ro": { - "num_samples": 1500, - "num_positive": 1500, - "num_negative": 1500, - "avg_query_len": 61.973333333333336, - "avg_positive_len": 1.0, - "avg_negative_len": 8.0, + "average_document_length": 348.3817037037037, + "average_query_length": 61.973333333333336, + "num_documents": 13500, + "num_queries": 1500, + "average_relevant_docs_per_query": 1.0, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 9.0, }, "sr": { - "num_samples": 1500, - "num_positive": 1500, - "num_negative": 1500, - "avg_query_len": 55.669333333333334, - "avg_positive_len": 1.0, - "avg_negative_len": 8.0, + "average_document_length": 384.3131851851852, + "average_query_length": 55.669333333333334, + "num_documents": 13500, + "num_queries": 1500, + "average_relevant_docs_per_query": 1.0, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 9.0, }, "no": { - "num_samples": 1500, - "num_positive": 1500, - "num_negative": 1500, - "avg_query_len": 55.288, - "avg_positive_len": 1.0, - "avg_negative_len": 8.0, + "average_document_length": 366.93733333333336, + "average_query_length": 55.288, + "num_documents": 13500, + "num_queries": 1500, + "average_relevant_docs_per_query": 1.0, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 9.0, }, "sv": { - "num_samples": 1500, - "num_positive": 1500, - "num_negative": 1500, - "avg_query_len": 57.73, - "avg_positive_len": 1.0, - "avg_negative_len": 8.0, + "average_document_length": 369.340962962963, + "average_query_length": 57.73, + "num_documents": 13500, + "num_queries": 1500, + "average_relevant_docs_per_query": 1.0, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 9.0, }, }, }, diff --git 
a/mteb/tasks/Reranking/rus/RuBQReranking.py b/mteb/tasks/Reranking/rus/RuBQReranking.py index d9b71386aa..97209c5d99 100644 --- a/mteb/tasks/Reranking/rus/RuBQReranking.py +++ b/mteb/tasks/Reranking/rus/RuBQReranking.py @@ -36,6 +36,15 @@ class RuBQReranking(AbsTaskReranking): }""", descriptive_stats={ "n_samples": {"test": 1551}, - "avg_character_length": {"test": 499.9}, + "test": { + "average_document_length": 457.17801158971344, + "average_query_length": 42.818826563507415, + "num_documents": 37447, + "num_queries": 1551, + "average_relevant_docs_per_query": 1.6776273372018053, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 24.143778207607994, + }, }, ) diff --git a/mteb/tasks/Reranking/zho/CMTEBReranking.py b/mteb/tasks/Reranking/zho/CMTEBReranking.py index f8b6a4fc49..066e403ff8 100644 --- a/mteb/tasks/Reranking/zho/CMTEBReranking.py +++ b/mteb/tasks/Reranking/zho/CMTEBReranking.py @@ -35,7 +35,19 @@ class T2Reranking(AbsTaskReranking): archivePrefix={arXiv}, primaryClass={cs.IR} }""", - descriptive_stats={"n_samples": None, "avg_character_length": None}, + descriptive_stats={ + "n_samples": {"dev": 5908}, + "dev": { + "average_document_length": 840.8301307712837, + "average_query_length": 10.948375084631008, + "num_documents": 97422, + "num_queries": 5908, + "average_relevant_docs_per_query": 7.522681110358835, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 16.489844278943806, + }, + }, ) @@ -70,7 +82,19 @@ class MMarcoReranking(AbsTaskReranking): archivePrefix={arXiv}, primaryClass={cs.CL} }""", - descriptive_stats={"n_samples": None, "avg_character_length": None}, + descriptive_stats={ + "n_samples": {"dev": 100}, + "dev": { + "average_document_length": 123.76968988063103, + "average_query_length": 11.44, + "num_documents": 100026, + "num_queries": 100, + "average_relevant_docs_per_query": 1.07, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 1000.26, + }, + }, ) @@ -107,8 +131,17 @@ class CMedQAv1(AbsTaskReranking): publisher={Multidisciplinary Digital Publishing Institute} }""", descriptive_stats={ - "n_samples": {"test": 2000}, - "avg_character_length": {"test": 165}, + "n_samples": {"test": 1000}, + "test": { + "average_document_length": 106.63992, + "average_query_length": 55.717, + "num_documents": 100000, + "num_queries": 1000, + "average_relevant_docs_per_query": 1.931, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 100.0, + }, }, ) @@ -148,5 +181,17 @@ class CMedQAv2(AbsTaskReranking): doi={10.1109/ACCESS.2018.2883637}, ISSN={2169-3536}, month={},}""", - descriptive_stats={"n_samples": None, "avg_character_length": None}, + descriptive_stats={ + "n_samples": {"test": 1000}, + "test": { + "average_document_length": 100.61386, + "average_query_length": 48.848, + "num_documents": 100000, + "num_queries": 1000, + "average_relevant_docs_per_query": 1.91, + "average_instruction_length": 0, + "num_instructions": 0, + "average_top_ranked_per_query": 100.0, + }, + }, ) From 01c63e0d467898e265d3325f39595099d2fe92cf Mon Sep 17 00:00:00 2001 From: Orion Weller Date: Sat, 9 Nov 2024 12:53:02 -0500 Subject: [PATCH 07/16] faster evaluation; mindsmall can compute in reasonable time --- mteb/abstasks/AbsTask.py | 4 +- mteb/abstasks/AbsTaskReranking.py | 86 +----- mteb/abstasks/AbsTaskRetrieval.py | 8 + mteb/evaluation/evaluators/model_classes.py | 276 +++++++++++++----- mteb/evaluation/evaluators/utils.py 
| 66 +++-- .../IndicSentimentClassification.py | 2 +- .../tasks/Reranking/eng/MindSmallReranking.py | 176 +++++++++-- mteb/tasks/Reranking/eng/NevIR.py | 2 +- .../Reranking/multilingual/MIRACLReranking.py | 127 ++++++++ 9 files changed, 559 insertions(+), 188 deletions(-) diff --git a/mteb/abstasks/AbsTask.py b/mteb/abstasks/AbsTask.py index f84c1012ca..5fa983bbe5 100644 --- a/mteb/abstasks/AbsTask.py +++ b/mteb/abstasks/AbsTask.py @@ -222,9 +222,7 @@ def calculate_metadata_metrics( split_details ) else: - split_details = self._calculate_metrics_from_split( - split, hf_subset="default" - ) + split_details = self._calculate_metrics_from_split(split) all_details[split] = split_details return all_details diff --git a/mteb/abstasks/AbsTaskReranking.py b/mteb/abstasks/AbsTaskReranking.py index 2a303dbb65..b4a5cffd25 100644 --- a/mteb/abstasks/AbsTaskReranking.py +++ b/mteb/abstasks/AbsTaskReranking.py @@ -4,10 +4,8 @@ from collections import defaultdict import datasets -import tqdm from datasets import Dataset -from ..load_results.task_results import ScoresDict from .AbsTaskRetrieval import AbsTaskRetrieval logger = logging.getLogger(__name__) @@ -33,29 +31,7 @@ class AbsTaskReranking(AbsTaskRetrieval): - """Abstract class for re-ranking experiments. This is mostly the same as the RetrievalEvaluator, but treats each query as a "mini" retrieval problem. - - New Format: - ----------- - Same as AbsTaskRetrieval, but with a top_ranked file that contains the passages to rerank. The dataset should contain the following columns: - - self.corpus: dict[str, dict[str, str]] - Semantically, it should contain dict[split_name, dict[sample_id, dict[str, str]]] - E.g. {"test": {"document_one": {"_id": "d1", "title": "title", "text": "text"}}} - - self.queries: dict[str, dict[str, Union[str, list[str]]]] - Semantically, it should contain dict[split_name, dict[sample_id, str]] or dict[split_name, dict[sample_id, list[str]]] for conversations - E.g. {"test": {"q1": "query"}} - or {"test": {"q1": ["turn1", "turn2", "turn3"]}} - - self.relevant_docs: dict[str, dict[str, dict[str, int]]] - Semantically, it should contain dict[split_name, dict[sample_id, dict[doc_id, score]]] - E.g.: {"test": {"q1": {"document_one": 1}}} - - self.top_ranked: dict[str, dict[str, list[str]]] or dict[str, dict[str, dict[str, float]]] - Semantically, it should contain dict[split_name, dict[sample_id, list[doc_id]]] or dict[split_name, dict[sample_id, dict[doc_id, score]]] - E.g.: {"test": {"q1": ["document_one", "document_two"]}} or {"test": {"q1": {"document_one": 1, "document_two": 0.5}}} - """ + """Abstract class for re-ranking experiments. This is mostly the same as the RetrievalEvaluator, but here to adapt the old format to the new format. 
TODO: update these tasks to the new format and delete this class.""" def __init__(self, **kwargs): super(AbsTaskRetrieval, self).__init__(**kwargs) @@ -88,16 +64,19 @@ def process_example(self, example: dict, split: str, query_idx: int) -> dict: } for i, pos_doc in enumerate(positive_docs): + # format i as a five digit number + formatted_i = str(i).zfill(5) # have "a" in front so that positives are first, then negatives # this shouldn't matter except for ties, and the previous reranking results # had the positives first - doc_id = f"apositive_{i}_{query_id}" + doc_id = f"apositive_{query_id}_{formatted_i}" example_data["doc_ids"].append(doc_id) example_data["doc_texts"].append(pos_doc) example_data["relevance_scores"].append(1) for i, neg_doc in enumerate(negative_docs): - doc_id = f"negative_{i}_{query_id}" + formatted_i = str(i).zfill(5) + doc_id = f"negative_{query_id}_{formatted_i}" example_data["doc_ids"].append(doc_id) example_data["doc_texts"].append(neg_doc) example_data["relevance_scores"].append(0) @@ -151,8 +130,10 @@ def transform_old_dataset_format(self, given_dataset=None): # first, filter out the ones that have no positive or no negatives enumerated_dataset = enumerated_dataset.filter( - lambda x: len(x["positive"]) > 0 and len(x["negative"]) > 0 + lambda example: len(example["positive"]) > 0 + and len(example["negative"]) > 0 ) + logger.info( f"Filtered out {len(cur_dataset[split]) - len(enumerated_dataset)} examples with no positive or no negative examples. {len(enumerated_dataset)} examples remaining." ) @@ -184,52 +165,3 @@ def transform_old_dataset_format(self, given_dataset=None): self.instructions = None self.data_loaded = True - - def _evaluate_subset( - self, retriever, corpus, queries, relevant_docs, hf_subset: str, **kwargs - ) -> ScoresDict: - """Evaluate each query_id as a "mini" retrieval corpus, and rerank the top-ranked documents for each query_id.""" - all_results = defaultdict(dict) - max_docs = 0 - top_ranked = kwargs["top_ranked"] # must be present for reranking - for query_id in tqdm.tqdm( - list(queries.keys()), leave=False, desc="Reranking over query-ids.." 
- ): - cur_queries = {query_id: queries[query_id]} - if "instructions" in kwargs: - instructions = kwargs["instructions"] - cur_instructions = {query_id: instructions[query_id]} - else: - cur_instructions = None - - doc_ids_to_rerank = top_ranked[query_id] - cur_corpus = {doc_id: corpus[doc_id] for doc_id in doc_ids_to_rerank} - if ( - len(cur_corpus) > max_docs - ): # use this to make sure we get the correct MAP/MRR at max length - max_docs = len(cur_corpus) - - # to handle instruction-based reranking we pass both query_id and instructions (unused if not instruction-based) - results = retriever( - cur_corpus, - cur_queries, - instructions=cur_instructions, - query_id=query_id, - ) - # results should have only one key, the query_id - all_results[query_id] = results[query_id] - - # do the evaluation like normal now, but pass our results - if max_docs > max(retriever.k_values): - # only added if we need a large k-value for reranking past 1000 - retriever.k_values += [max_docs] - - return super()._evaluate_subset( - retriever, - corpus, - queries, - relevant_docs, - hf_subset, - results=all_results, - **kwargs, - ) diff --git a/mteb/abstasks/AbsTaskRetrieval.py b/mteb/abstasks/AbsTaskRetrieval.py index e6e8f3c17e..5ce9d42d85 100644 --- a/mteb/abstasks/AbsTaskRetrieval.py +++ b/mteb/abstasks/AbsTaskRetrieval.py @@ -76,6 +76,8 @@ class AbsTaskRetrieval(AbsTask): ignore_identical_ids: bool = False def __init__(self, **kwargs): + self.top_ranked = None + self.instructions = None super().__init__(**kwargs) def load_data(self, **kwargs): @@ -205,6 +207,12 @@ def _evaluate_subset( with open(qrels_save_path, "w") as f: json.dump(results, f) + # save qrels also + with open( + output_folder / f"{self.metadata.name}_{hf_subset}_qrels.json", "w" + ) as f: + json.dump(relevant_docs, f) + ndcg, _map, recall, precision, naucs, task_scores = retriever.evaluate( relevant_docs, results, diff --git a/mteb/evaluation/evaluators/model_classes.py b/mteb/evaluation/evaluators/model_classes.py index 4febcda19a..d22e45d540 100644 --- a/mteb/evaluation/evaluators/model_classes.py +++ b/mteb/evaluation/evaluators/model_classes.py @@ -4,7 +4,6 @@ import json import logging import os -from collections import defaultdict from pathlib import Path from typing import Any @@ -76,8 +75,6 @@ def __init__( self.previous_results = previous_results self.batch_size = encode_kwargs.get("batch_size") self.show_progress_bar = encode_kwargs.get("show_progress_bar") - self.save_corpus_embeddings = kwargs.get("save_corpus_embeddings", False) - self.corpus_embeddings = defaultdict(list) self.results = {} if self.previous_results is not None: @@ -98,10 +95,23 @@ def search( instructions: dict[str, str] | None = None, request_qid: str | None = None, return_sorted: bool = False, + top_ranked: dict[str, list[str]] | None = None, **kwargs, ) -> dict[str, dict[str, float]]: - # Create embeddings for all queries using model.encode - # Runs semantic search against the corpus embeddings + """Perform semantic search (retrieval or reranking). 
+ + Args: + corpus: Dictionary mapping corpus IDs to document dictionaries + queries: Dictionary mapping query IDs to query strings + top_k: Number of top results to return + score_function: Scoring function to use ('cos_sim' or 'dot') + task_name: Name of the task + instructions: Optional instructions to append to queries + request_qid: Optional request query ID + return_sorted: Whether to return results sorted + top_ranked: Optional dict mapping query IDs to lists of pre-ranked corpus IDs + **kwargs: Additional keyword arguments passed to the underlying model + """ if score_function not in self.score_functions: raise ValueError( f"score function: {score_function} must be either (cos_sim) for cosine similarity or (dot) for dot product" @@ -120,70 +130,220 @@ def search( ) queries = new_queries - if isinstance(queries[0], list): # type: ignore - query_embeddings = self.encode_conversations( + # Create mapping of unique queries to their indices + unique_queries = [] + query_to_idx = {} + query_idx_mapping = [] + + for query in queries: + query_key = tuple(query) if isinstance(query, list) else query + if query_key not in query_to_idx: + query_to_idx[query_key] = len(unique_queries) + unique_queries.append(query) + query_idx_mapping.append(query_to_idx[query_key]) + + # Encode only unique queries + if isinstance(queries[0], list): + unique_query_embeddings = self.encode_conversations( model=self.model, - conversations=queries, # type: ignore + conversations=unique_queries, task_name=task_name, **self.encode_kwargs, ) else: - query_embeddings = self.model.encode( - queries, # type: ignore + unique_query_embeddings = self.model.encode( + unique_queries, task_name=task_name, prompt_type=PromptType.query, **self.encode_kwargs, ) - logger.info("Sorting Corpus by document length (Longest first)...") - corpus_ids = sorted( - corpus, - reverse=True, - ) - corpus = [corpus[cid] for cid in corpus_ids] # type: ignore + # Map back to original order but reuse embeddings + query_embeddings = unique_query_embeddings[query_idx_mapping] - logger.info("Encoding Corpus in batches... 
Warning: This might take a while!") logger.info( f"Scoring Function: {self.score_function_desc[score_function]} ({score_function})" ) + if top_ranked is not None: + logger.info("Performing reranking on pre-ranked documents...") + result_heaps = self._rerank_documents( + query_ids=query_ids, + query_embeddings=query_embeddings, + corpus=corpus, + top_ranked=top_ranked, + top_k=top_k, + score_function=score_function, + task_name=task_name, + request_qid=request_qid, + return_sorted=return_sorted, + ) + else: + logger.info("Performing full corpus search...") + result_heaps = self._full_corpus_search( + query_ids=query_ids, + query_embeddings=query_embeddings, + corpus=corpus, + top_k=top_k, + score_function=score_function, + task_name=task_name, + request_qid=request_qid, + return_sorted=return_sorted, + ) + + for qid in result_heaps: + for score, corpus_id in result_heaps[qid]: + self.results[qid][corpus_id] = score + + return self.results + + def _rerank_documents( + self, + query_ids: list[str], + query_embeddings: torch.Tensor, + corpus: dict[str, dict[str, str]], + top_ranked: dict[str, list[str]], + top_k: int, + score_function: str, + task_name: str, + request_qid: str | None = None, + return_sorted: bool = False, + ) -> dict[str, list[tuple[float, str]]]: + """Rerank documents for each query using top_ranked.""" + # Determine device + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + logger.info(f"Using device: {device}") + + # Move query embeddings to appropriate device + query_embeddings = torch.as_tensor(query_embeddings).to(device) + + result_heaps = {qid: [] for qid in query_ids} + + # Get unique document IDs across all queries + unique_doc_ids = list( + { + doc_id + for qid in query_ids + if qid in top_ranked + for doc_id in top_ranked[qid] + } + ) + + # Create mapping from unique doc IDs to their index in the embedding matrix + doc_id_to_idx = {doc_id: idx for idx, doc_id in enumerate(unique_doc_ids)} + + # Encode unique documents only once + unique_docs = [corpus[doc_id] for doc_id in unique_doc_ids] + all_doc_embeddings = self.model.encode( + unique_docs, + task_name=task_name, + prompt_type=PromptType.passage, + request_qid=request_qid, + **self.encode_kwargs, + ) + + # Let's make sure we don't get the warnings for the tokenizer here via torch.compile + if hasattr(torch, "compile"): + os.environ["TOKENIZERS_PARALLELISM"] = "false" # we don't need it anymore + + # Process each query + for query_idx, query_id in enumerate(tqdm.tqdm(query_ids)): + if query_id not in top_ranked: + logger.warning(f"No pre-ranked documents found for query {query_id}") + continue + + ranked_ids = top_ranked[query_id] + doc_indices = torch.tensor([doc_id_to_idx[doc_id] for doc_id in ranked_ids]) + query_doc_embeddings = torch.as_tensor(all_doc_embeddings[doc_indices]).to( + device + ) + + # Ensure query embedding is on the correct device and has correct shape + query_embedding = query_embeddings[query_idx].unsqueeze(0) + + with torch.inference_mode(): + scores = self.score_functions[score_function]( + query_embedding, + query_doc_embeddings, + ) + + # Handle NaN values + scores = torch.nan_to_num(scores, nan=-1.0) + + # Compute top-k scores + scores_top_k_values, scores_top_k_idx = torch.topk( + scores, + min(top_k, len(ranked_ids)), + dim=1, + largest=True, + sorted=return_sorted, + ) + + # Move results back to CPU for heap operations + scores_top_k_values = scores_top_k_values.cpu() + scores_top_k_idx = scores_top_k_idx.cpu() + + # Build result heap + for doc_idx, score in 
zip( + scores_top_k_idx[0].tolist(), + scores_top_k_values[0].tolist(), + ): + corpus_id = ranked_ids[doc_idx] + heapq.heappush(result_heaps[query_id], (score, corpus_id)) + + # Clear CUDA cache after processing + if device.type == "cuda": + torch.cuda.empty_cache() + + return result_heaps + + def _full_corpus_search( + self, + query_ids: list[str], + query_embeddings: torch.Tensor, + corpus: dict[str, dict[str, str]], + top_k: int, + score_function: str, + task_name: str, + request_qid: str | None = None, + return_sorted: bool = False, + ) -> dict[str, list[tuple[float, str]]]: + """Perform full corpus search using batched processing.""" + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + logger.info(f"Using device: {device}") + + logger.info("Sorting Corpus by document length (Longest first)...") + corpus_ids = sorted(corpus, reverse=True) + corpus = [corpus[cid] for cid in corpus_ids] + + logger.info("Encoding Corpus in batches... Warning: This might take a while!") itr = range(0, len(corpus), self.corpus_chunk_size) - result_heaps = { - qid: [] for qid in query_ids - } # Keep only the top-k docs for each query + result_heaps = {qid: [] for qid in query_ids} for batch_num, corpus_start_idx in enumerate(itr): logger.info(f"Encoding Batch {batch_num + 1}/{len(itr)}...") corpus_end_idx = min(corpus_start_idx + self.corpus_chunk_size, len(corpus)) - # Encode chunk of corpus - if ( - self.save_corpus_embeddings - and request_qid - and len(self.corpus_embeddings[request_qid]) - ): - sub_corpus_embeddings = torch.tensor( - self.corpus_embeddings[request_qid][batch_num] - ) - else: - # Encode chunk of corpus - sub_corpus_embeddings = self.model.encode( - corpus[corpus_start_idx:corpus_end_idx], # type: ignore - task_name=task_name, - prompt_type=PromptType.passage, - request_qid=request_qid, - **self.encode_kwargs, - ) - if self.save_corpus_embeddings and request_qid: - self.corpus_embeddings[request_qid].append(sub_corpus_embeddings) + sub_corpus_embeddings = self.model.encode( + corpus[corpus_start_idx:corpus_end_idx], # type: ignore + task_name=task_name, + prompt_type=PromptType.passage, + request_qid=request_qid, + **self.encode_kwargs, + ) # Compute similarites using either cosine-similarity or dot product - cos_scores = self.score_functions[score_function]( - query_embeddings, sub_corpus_embeddings - ) - cos_scores[torch.isnan(cos_scores)] = -1 + logging.info("Computing Similarities...") + query_embeddings = torch.as_tensor(query_embeddings).to(device) + sub_corpus_embeddings = torch.as_tensor(sub_corpus_embeddings).to(device) + with torch.inference_mode(): + cos_scores = self.score_functions[score_function]( + query_embeddings, sub_corpus_embeddings + ) + + cos_scores = torch.nan_to_num(cos_scores, nan=-1.0) - # Get top-k values + # get top-k values cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk( cos_scores, min( @@ -194,27 +354,22 @@ def search( largest=True, sorted=return_sorted, ) - cos_scores_top_k_values = cos_scores_top_k_values.cpu().tolist() - cos_scores_top_k_idx = cos_scores_top_k_idx.cpu().tolist() for query_itr in range(len(query_embeddings)): query_id = query_ids[query_itr] for sub_corpus_id, score in zip( - cos_scores_top_k_idx[query_itr], cos_scores_top_k_values[query_itr] + cos_scores_top_k_idx[query_itr].cpu().tolist(), + cos_scores_top_k_values[query_itr].cpu().tolist(), ): corpus_id = corpus_ids[corpus_start_idx + sub_corpus_id] if len(result_heaps[query_id]) < top_k: - # Push item on the heap + # push item on the heap 
heapq.heappush(result_heaps[query_id], (score, corpus_id)) else: # If item is larger than the smallest in the heap, push it on the heap then pop the smallest element heapq.heappushpop(result_heaps[query_id], (score, corpus_id)) - for qid in result_heaps: - for score, corpus_id in result_heaps[qid]: - self.results[qid][corpus_id] = score - - return self.results + return result_heaps def load_results_file(self): # load the first stage results from file in format {qid: {doc_id: score}} @@ -367,8 +522,6 @@ class DRESModel: def __init__(self, model, **kwargs): self.model = model self.use_sbert_model = isinstance(model, SentenceTransformer) - self.save_corpus_embeddings = kwargs.get("save_corpus_embeddings", False) - self.corpus_embeddings = {} def encode_corpus( self, @@ -376,16 +529,8 @@ def encode_corpus( task_name: str, batch_size: int, prompt_type: PromptType = PromptType.passage, - request_qid: str | None = None, **kwargs, ): - if ( - request_qid - and self.save_corpus_embeddings - and len(self.corpus_embeddings) > 0 - ): - return self.corpus_embeddings[request_qid] - sentences = corpus_to_str(corpus) corpus_embeddings = self.model.encode( sentences, @@ -394,9 +539,6 @@ def encode_corpus( batch_size=batch_size, **kwargs, ) - - if self.save_corpus_embeddings and request_qid: - self.corpus_embeddings[request_qid] = corpus_embeddings return corpus_embeddings def encode( diff --git a/mteb/evaluation/evaluators/utils.py b/mteb/evaluation/evaluators/utils.py index e7e010bad2..fae3021604 100644 --- a/mteb/evaluation/evaluators/utils.py +++ b/mteb/evaluation/evaluators/utils.py @@ -13,47 +13,72 @@ from packaging.version import Version from sklearn.metrics import auc +try: + # speeds up computation if available + torch.set_float32_matmul_precision("high") +except Exception: + pass -def cos_sim(a, b): - """Computes the cosine similarity cos_sim(a[i], b[j]) for all i and j. + +def cos_sim(a: torch.Tensor, b: torch.Tensor): + """Calculate pairwise cosine similarities between two sets of vectors. + + Computes the cosine similarity cos_sim(a[i], b[j]) for all i and j. Return: Matrix with res[i][j] = cos_sim(a[i], b[j]) - """ # noqa: D402 + """ + # Move tensor conversion outside the compiled function + # since compile works better with pure tensor operations if not isinstance(a, torch.Tensor): a = torch.tensor(a) - if not isinstance(b, torch.Tensor): b = torch.tensor(b) - if len(a.shape) == 1: - a = a.unsqueeze(0) - - if len(b.shape) == 1: - b = b.unsqueeze(0) - - a_norm = torch.nn.functional.normalize(a, p=2, dim=1) - b_norm = torch.nn.functional.normalize(b, p=2, dim=1) - return torch.mm(a_norm, b_norm.transpose(0, 1)) + # The actual function to compile + def _cos_sim_core(a_tensor, b_tensor): + if len(a_tensor.shape) == 1: + a_tensor = a_tensor.unsqueeze(0) + if len(b_tensor.shape) == 1: + b_tensor = b_tensor.unsqueeze(0) + + a_norm = torch.nn.functional.normalize(a_tensor, p=2, dim=1) + b_norm = torch.nn.functional.normalize(b_tensor, p=2, dim=1) + return torch.mm(a_norm, b_norm.transpose(0, 1)) + + # Compile the core function once + if hasattr(torch, "compile"): # Check if torch.compile is available + _cos_sim_core_compiled = torch.compile(_cos_sim_core) + return _cos_sim_core_compiled(a, b) + else: + return _cos_sim_core(a, b) def dot_score(a: torch.Tensor, b: torch.Tensor): """Computes the dot-product dot_prod(a[i], b[j]) for all i and j. 
:return: Matrix with res[i][j] = dot_prod(a[i], b[j]) """ + # Move tensor conversion outside the compiled function if not isinstance(a, torch.Tensor): a = torch.tensor(a) - if not isinstance(b, torch.Tensor): b = torch.tensor(b) - if len(a.shape) == 1: - a = a.unsqueeze(0) + # The actual function to compile + def _dot_score_core(a_tensor, b_tensor): + if len(a_tensor.shape) == 1: + a_tensor = a_tensor.unsqueeze(0) + if len(b_tensor.shape) == 1: + b_tensor = b_tensor.unsqueeze(0) - if len(b.shape) == 1: - b = b.unsqueeze(0) + return torch.mm(a_tensor, b_tensor.transpose(0, 1)) - return torch.mm(a, b.transpose(0, 1)) + # Compile the core function once + if hasattr(torch, "compile"): # Check if torch.compile is available + _dot_score_core_compiled = torch.compile(_dot_score_core) + return _dot_score_core_compiled(a, b) + else: + return _dot_score_core(a, b) # From https://github.com/beir-cellar/beir/blob/f062f038c4bfd19a8ca942a9910b1e0d218759d4/beir/retrieval/custom_metrics.py#L4 @@ -437,7 +462,6 @@ def confidence_scores(sim_scores: list[float]) -> dict[str, float]: cs_diff1 = 0.0 conf_scores = {"max": cs_max, "std": cs_std, "diff1": cs_diff1} - return conf_scores @@ -672,7 +696,7 @@ def max_over_subqueries(qrels, results, k_values): """ query_keys = defaultdict(list) for key in qrels.keys(): - query_keys[key.split("_")[0]].append(key) + query_keys["_".join(key.split("_")[:-1])].append(key) new_results = {} for query_id_base, query_ids in query_keys.items(): diff --git a/mteb/tasks/Classification/multilingual/IndicSentimentClassification.py b/mteb/tasks/Classification/multilingual/IndicSentimentClassification.py index bd9058918b..2c576c7c03 100644 --- a/mteb/tasks/Classification/multilingual/IndicSentimentClassification.py +++ b/mteb/tasks/Classification/multilingual/IndicSentimentClassification.py @@ -21,7 +21,7 @@ } -class IndicSentimentClassification(MultilingualTask, AbsTaskClassification): +class IndicSentimentClassification(AbsTaskClassification, MultilingualTask): fast_loading = True metadata = TaskMetadata( name="IndicSentimentClassification", diff --git a/mteb/tasks/Reranking/eng/MindSmallReranking.py b/mteb/tasks/Reranking/eng/MindSmallReranking.py index 556b2614ea..00d503e705 100644 --- a/mteb/tasks/Reranking/eng/MindSmallReranking.py +++ b/mteb/tasks/Reranking/eng/MindSmallReranking.py @@ -1,9 +1,9 @@ from __future__ import annotations import logging +from collections import defaultdict import tqdm -from datasets import Dataset, DatasetDict from mteb.abstasks.TaskMetadata import TaskMetadata @@ -61,33 +61,173 @@ class MindSmallReranking(AbsTaskReranking): }, ) + def process_example( + self, example: dict, split: str, query_idx: int, subquery_idx: int + ) -> dict: # Added subquery_idx parameter + """Process a single example from the dataset.""" + query = example["query"] + positive_docs = example["positive"] + negative_docs = example["negative"] + + # Modified query_id to include subquery index + query_id = f"{split}_query{query_idx}_{subquery_idx}" + + # Rest of the method remains the same + example_data = { + "query_id": query_id, + "query": query, + "doc_ids": [], + "doc_texts": [], + "relevance_scores": [], + } + + def get_doc_hash(text: str) -> str: + import hashlib + + return hashlib.md5(text.encode()).hexdigest() + + # Process positive documents + for i, pos_doc in enumerate(positive_docs): + doc_hash = get_doc_hash(pos_doc) + if pos_doc in self.doc_text_to_id[split]: + doc_id = self.doc_text_to_id[split][pos_doc] + else: + formatted_i = str(i).zfill(5) + doc_id = 
f"apositive_{doc_hash}_{formatted_i}" + self.doc_text_to_id[split][pos_doc] = doc_id + + example_data["doc_ids"].append(doc_id) + example_data["doc_texts"].append(pos_doc) + example_data["relevance_scores"].append(1) + + # Process negative documents + for i, neg_doc in enumerate(negative_docs): + doc_hash = get_doc_hash(neg_doc) + if neg_doc in self.doc_text_to_id[split]: + doc_id = self.doc_text_to_id[split][neg_doc] + else: + formatted_i = str(i).zfill(5) + doc_id = f"negative_{doc_hash}_{formatted_i}" + self.doc_text_to_id[split][neg_doc] = doc_id + + example_data["doc_ids"].append(doc_id) + example_data["doc_texts"].append(neg_doc) + example_data["relevance_scores"].append(0) + + return example_data + def load_data(self, **kwargs): + """Load and transform the dataset with efficient deduplication.""" if self.data_loaded: return - # since AbsTaskReranking has no `load_data` method, we call the parent class method + # Call parent class method super(AbsTaskRetrieval, self).load_data(**kwargs) - # we can expand the queries so that each has its own row and a new query-id - new_dataset = {} + logging.info( + f"Transforming old format to standard format for {self.metadata.name}" + ) + + self.corpus = defaultdict(lambda: defaultdict(dict)) + self.queries = defaultdict(lambda: defaultdict(dict)) + self.relevant_docs = defaultdict(lambda: defaultdict(dict)) + self.top_ranked = defaultdict(lambda: defaultdict(list)) + self.doc_text_to_id = defaultdict(dict) + + # Process each split for split in self.dataset: - logging.info(f"Expanding queries for split {split}") - # Create lists to store the expanded data - expanded_data = [] + if split == "train": + continue + logging.info(f"Processing split {split}") + + # Pre-allocate lists for batch processing + all_queries = [] + all_positives = [] + all_negatives = [] + all_ids = [] + all_instance_indices = [] # Renamed for clarity + all_subquery_indices = [] + # First pass: expand queries while maintaining relationships + current_instance_idx = 0 for instance in tqdm.tqdm(self.dataset[split]): - for i, subquery in enumerate(instance["query"]): - new_instance = instance.copy() - new_instance["query"] = subquery - new_instance["id"] = f"{instance['id']}_{i}" - expanded_data.append(new_instance) + queries = instance["query"] + positives = instance.get("positive", []) + negatives = instance.get("negative", []) + + # For each query in this instance + for subquery_idx, query in enumerate(queries): + all_queries.append(query) + all_positives.append(positives) # Same positives for each subquery + all_negatives.append(negatives) # Same negatives for each subquery + all_ids.append( + f"{instance.get('id', current_instance_idx)}_{subquery_idx}" + ) + all_instance_indices.append(current_instance_idx) + all_subquery_indices.append(subquery_idx) + + current_instance_idx += 1 + + # Filter valid examples + valid_examples = [] + valid_instance_indices = [] + valid_subquery_indices = [] + + # Filter while maintaining relationships + for idx, (pos, neg) in enumerate(zip(all_positives, all_negatives)): + if len(pos) > 0 and len(neg) > 0: + valid_examples.append(idx) + valid_instance_indices.append(all_instance_indices[idx]) + valid_subquery_indices.append(all_subquery_indices[idx]) + + total_instances = len(set(all_instance_indices)) + valid_unique_instances = len(set(valid_instance_indices)) + logging.info( + f"Found {total_instances} total instances, {valid_unique_instances} valid instances" + ) + logging.info( + f"Filtered {len(all_queries) - len(valid_examples)} invalid 
examples. {len(valid_examples)} remaining." + ) + + # Process valid examples in batches + batch_size = 1000 + for batch_start in tqdm.tqdm(range(0, len(valid_examples), batch_size)): + batch_end = min(batch_start + batch_size, len(valid_examples)) + batch_indices = valid_examples[batch_start:batch_end] + + # Process batch + for i, example_idx in enumerate(batch_indices): + instance_idx = valid_instance_indices[batch_start + i] + subquery_idx = valid_subquery_indices[batch_start + i] + + example_data = self.process_example( + { + "query": all_queries[example_idx], + "positive": all_positives[example_idx], + "negative": all_negatives[example_idx], + }, + split, + instance_idx, + subquery_idx, + ) + + # Populate data structures + query_id = example_data["query_id"] + self.queries[split][query_id] = example_data["query"] - # Convert the list of instances into a Dataset object - new_dataset[split] = Dataset.from_list(expanded_data) + for doc_id, doc_text, relevance in zip( + example_data["doc_ids"], + example_data["doc_texts"], + example_data["relevance_scores"], + ): + if doc_id not in self.corpus[split]: + self.corpus[split][doc_id] = { + "text": doc_text, + "_id": doc_id, + } - # Create the DatasetDict from the dictionary of Datasets - self.dataset = DatasetDict(new_dataset) + self.top_ranked[split][query_id].append(doc_id) + self.relevant_docs[split][query_id][doc_id] = relevance - # now convert to the new format - self.transform_old_dataset_format(self.dataset) + self.instructions = None self.data_loaded = True diff --git a/mteb/tasks/Reranking/eng/NevIR.py b/mteb/tasks/Reranking/eng/NevIR.py index f2769f7c04..fc610793a0 100644 --- a/mteb/tasks/Reranking/eng/NevIR.py +++ b/mteb/tasks/Reranking/eng/NevIR.py @@ -35,7 +35,7 @@ class NevIR(AbsTaskReranking): url={{https://api.semanticscholar.org/CorpusID:258676146}} }""", descriptive_stats={ - "n_samples": {"test": 2255}, + "n_samples": {"test": 2766}, "test": { "average_document_length": 712.460289514867, "average_query_length": 67.9287780187997, diff --git a/mteb/tasks/Reranking/multilingual/MIRACLReranking.py b/mteb/tasks/Reranking/multilingual/MIRACLReranking.py index d20cdf97a1..241a2e440f 100644 --- a/mteb/tasks/Reranking/multilingual/MIRACLReranking.py +++ b/mteb/tasks/Reranking/multilingual/MIRACLReranking.py @@ -1,6 +1,9 @@ from __future__ import annotations import logging +from collections import defaultdict + +import datasets from mteb.abstasks.MultilingualTask import MultilingualTask from mteb.abstasks.TaskMetadata import TaskMetadata @@ -264,3 +267,127 @@ class MIRACLReranking(AbsTaskReranking, MultilingualTask): }, }, ) + + def process_example(self, example: dict, split: str, query_idx: int) -> dict: + """Process a single example from the dataset. 
Slightly altered from the original class"""
+        query = example["query"]
+        assert isinstance(query, str)
+        positive_docs = set(example["positive"])
+        candidate_docs = example["candidates"]
+
+        # note: a zero-padded query index (below) was considered but is not used
+        # query_id = f"{split}_query{query_idx:04d}"
+        query_id = f"{split}_query{query_idx}"
+
+        # Initialize the structures for this example
+        example_data = {
+            "query_id": query_id,
+            "query": query,
+            "doc_ids": [],
+            "doc_texts": [],
+            "relevance_scores": [],
+        }
+
+        for i, candidate_doc in enumerate(candidate_docs):
+            # format i as a five digit number
+            formatted_i = str(i).zfill(5)
+            doc_id = f"candidate_{query_id}_{formatted_i}"
+            example_data["doc_ids"].append(doc_id)
+            example_data["doc_texts"].append(candidate_doc)
+            if candidate_doc in positive_docs:
+                example_data["relevance_scores"].append(1)
+            else:
+                # this is not technically correct, but was done in the original so keeping it
+                example_data["relevance_scores"].append(0)
+
+        return example_data
+
+    def load_data(self, **kwargs):
+        """Load the data, then convert it to the new format. This is almost the same as the base-class conversion, except the dataset provides negatives, positives, and candidates."""
+        logging.info(
+            f"Transforming old format to standard format for {self.metadata.name}"
+        )
+
+        self.corpus = defaultdict(lambda: defaultdict(dict))
+        self.queries = defaultdict(lambda: defaultdict(dict))
+        self.relevant_docs = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
+        self.top_ranked = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
+
+        hf_subsets = list(self.hf_subsets) if self.is_multilingual else ["default"]
+        for hf_subset in hf_subsets:
+            if "name" in self.metadata_dict["dataset"]:
+                cur_dataset = datasets.load_dataset(**self.metadata_dict["dataset"])  # type: ignore
+                assert (
+                    hf_subset == "default"
+                ), f"Only default subset is supported for {self.metadata.name} since `name` is given in the metadata."
+            else:
+                cur_dataset = datasets.load_dataset(
+                    **self.metadata_dict["dataset"], name=hf_subset
+                )  # type: ignore
+
+            for split in cur_dataset:
+                # Create an enumerated dataset to pass indices
+                enumerated_dataset = datasets.Dataset.from_dict(
+                    {
+                        "index": range(len(cur_dataset[split])),
+                        "query": cur_dataset[split]["query"],
+                        "positive": cur_dataset[split]["positive"],
+                        "negative": cur_dataset[split]["negative"],
+                        "candidates": cur_dataset[split]["candidates"],
+                    }
+                )
+
+                # first, only keep those that have positives and negatives
+                enumerated_dataset = enumerated_dataset.filter(
+                    lambda example: len(example["positive"]) > 0
+                    and len(example["negative"]) > 0
+                )
+
+                logger.info(
+                    f"Filtered out {len(cur_dataset[split]) - len(enumerated_dataset)} examples. {len(enumerated_dataset)} examples remaining."
+ ) + + # Map the transformation function over the dataset + processed_dataset = enumerated_dataset.map( + lambda example, idx: self.process_example(example, split, idx), + with_indices=True, + remove_columns=enumerated_dataset.column_names, + ) + + # Populate the data structures + for idx, item in enumerate(processed_dataset): + query_id = item["query_id"] + self.queries[hf_subset][split][query_id] = item["query"] + + # Add documents and relevance information + for doc_id, doc_text, relevance in zip( + item["doc_ids"], item["doc_texts"], item["relevance_scores"] + ): + self.corpus[hf_subset][split][doc_id] = { + "text": doc_text, + "_id": doc_id, + } + self.top_ranked[hf_subset][split][query_id].append(doc_id) + self.relevant_docs[hf_subset][split][query_id][doc_id] = ( + relevance + ) + + if len(self.top_ranked[hf_subset][split][query_id]) == 0: + # give it a negative, even though qrels should be empty since that was how it was done in the original + neg_doc = cur_dataset[split]["negative"][idx][0] + assert isinstance( + neg_doc, str + ), f"Negative document is not a string: {neg_doc}" + neg_doc_id = f"negative_{query_id}" + self.top_ranked[hf_subset][split][query_id].append(neg_doc_id) + self.corpus[hf_subset][split][neg_doc_id] = { + "text": neg_doc, + "_id": neg_doc_id, + } + assert self.relevant_docs[hf_subset][split][query_id] == {} + logger.warning( + f"Query {query_id} has no relevant documents. Adding a negative example." + ) + + self.instructions = None + self.data_loaded = True From d557aac892d8fc6b4318c1c064ca5f6514792948 Mon Sep 17 00:00:00 2001 From: Orion Weller Date: Sat, 9 Nov 2024 14:42:52 -0500 Subject: [PATCH 08/16] fix bad merge of docs --- docs/benchmarks.md | 2 +- docs/tasks.md | 382 ++++++++++++++++++++++----------------------- 2 files changed, 192 insertions(+), 192 deletions(-) diff --git a/docs/benchmarks.md b/docs/benchmarks.md index 6450c2ddca..a5abe50215 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -9,7 +9,7 @@ The following table gives you an overview of the benchmarks in MTEB. 
|------|---------|------------|---------|-----------| | [CoIR](https://github.com/CoIR-team/coir) | 10 | {'Retrieval': 10} | [Written, Programming] | python,c++,sql,go,eng,php,javascript,ruby,java | | [MINERSBitextMining](https://arxiv.org/pdf/2406.07424) | 7 | {'BitextMining': 7} | [Written, Social, Reviews] | sun,kaz,tzl,ido,abs,arq,yue,tam,nij,glg,slk,hsb,ber,xho,cbk,pol,uzb,ina,kab,swh,amh,fao,kzj,lfn,uig,sqi,deu,ang,ind,bug,pms,ibo,cym,eus,spa,ceb,tgl,ron,isl,ita,csb,cha,fin,est,pes,jpn,tel,tha,oci,cmn,min,fry,bbc,epo,lit,rus,bos,hrv,war,ara,bjn,mkd,srp,ast,nno,urd,pam,aze,eng,ace,bew,kor,dan,awa,mui,hye,ban,cor,ben,gle,swe,mad,bul,lat,cat,nob,fra,pcm,ell,mar,vie,tat,ukr,gsw,kat,arz,dsb,lvs,nld,tur,bel,max,nds,afr,khm,dtp,yor,ces,gla,zsm,mak,ile,nov,orv,bre,swg,rej,mhr,mon,mal,jav,heb,slv,bhp,kur,wuu,tuk,por,hun,hin,hau,yid | -| [MTEB(Retrieval w/Instructions)](https://arxiv.org/abs/2403.15246) | 3 | {'InstructionReranking': 3} | [Written, News] | eng | +| [MTEB(Retrieval w/Instructions)](https://arxiv.org/abs/2403.15246) | 3 | {'InstructionRetrieval': 3} | [Written, News] | eng | | [MTEB(Scandinavian)](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/) | 28 | {'BitextMining': 2, 'Classification': 13, 'Retrieval': 7, 'Clustering': 6} | [Encyclopaedic, Spoken, Non-fiction, Government, News, Fiction, Social, Blog, Reviews, Written, Web, Legal] | nob,fao,swe,isl,dan,nno | | MTEB(code) | 12 | {'Retrieval': 12} | [Written, Programming] | python,c++,sql,c,go,eng,shell,typescript,php,scala,rust,swift,javascript,ruby,java | | [MTEB(deu)](https://arxiv.org/html/2401.02709v1) | 19 | {'Classification': 6, 'Clustering': 4, 'PairClassification': 2, 'Reranking': 1, 'Retrieval': 4, 'STS': 2} | [Encyclopaedic, Spoken, News, Reviews, Written, Web] | eng,deu,pol,fra | diff --git a/docs/tasks.md b/docs/tasks.md index a2545b0e67..30badc6b9e 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -124,93 +124,93 @@ The following tables give you an overview of the tasks in MTEB. 
| [ClimateFEVERHardNegatives](https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html) (Thomas Diggelmann, 2021) | ['eng'] | Retrieval | s2p | | None | None | | [CmedqaRetrieval](https://aclanthology.org/2022.emnlp-main.357.pdf) | ['cmn'] | Retrieval | s2p | | None | None | | [Cmnli](https://huggingface.co/datasets/clue/viewer/cmnli) | ['cmn'] | PairClassification | s2s | | None | None | -| [CodeEditSearchRetrieval](https://huggingface.co/datasets/cassanof/CodeEditSearch/viewer) (Niklas Muennighoff, 2023) | ['c', 'c++', 'go', 'java', 'javascript', 'php', 'python', 'ruby', 'rust', 'scala', 'shell', 'swift', 'typescript'] | Retrieval | p2p | [Programming, Written] | {'train': 13000} | {'train': {'python': {'average_document_length': 597.592, 'average_query_length': 69.519, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}, 'javascript': {'average_document_length': 582.554, 'average_query_length': 56.88, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}, 'typescript': {'average_document_length': 580.877, 'average_query_length': 60.092, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}, 'go': {'average_document_length': 548.498, 'average_query_length': 70.797, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}, 'ruby': {'average_document_length': 518.895, 'average_query_length': 66.9, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}, 'java': {'average_document_length': 620.332, 'average_query_length': 62.984, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}, 'php': {'average_document_length': 545.452, 'average_query_length': 61.927, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}, 'c': {'average_document_length': 475.868, 'average_query_length': 97.588, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}, 'c++': {'average_document_length': 544.446, 'average_query_length': 114.48, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}, 'rust': {'average_document_length': 609.548, 'average_query_length': 67.503, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}, 'swift': {'average_document_length': 574.62, 'average_query_length': 57.279, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}, 'scala': {'average_document_length': 495.485, 'average_query_length': 64.833, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}, 'shell': {'average_document_length': 486.519, 'average_query_length': 72.059, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}}} | -| [CodeFeedbackMT](https://arxiv.org/abs/2402.14658) (Tianyu Zheng, 2024) | ['eng'] | Retrieval | p2p | [Programming, Written] | {'test': 1000} | {'test': {'average_document_length': 1467.879728243677, 'average_query_length': 4425.522256533855, 'num_documents': 66383, 'num_queries': 13277, 'average_relevant_docs_per_query': 1.0}} | -| [CodeFeedbackST](https://arxiv.org/abs/2407.02883) (Xiangyang Li, 2024) | ['eng'] | Retrieval | p2p | [Programming, Written] | {'test': 1000} | {'test': {'average_document_length': 1521.3317148588733, 'average_query_length': 724.2441704465598, 'num_documents': 156526, 'num_queries': 31306, 'average_relevant_docs_per_query': 1.0}} | -| 
[CodeSearchNetCCRetrieval](https://arxiv.org/abs/2407.02883) (Xiangyang Li, 2024) | ['go', 'java', 'javascript', 'php', 'python', 'ruby'] | Retrieval | p2p | [Programming, Written] | {'test': 1000} | {'test': {'python': {'average_document_length': 388.31577184555965, 'average_query_length': 551.7934039415471, 'num_documents': 280652, 'num_queries': 14918, 'average_relevant_docs_per_query': 1.0}, 'javascript': {'average_document_length': 276.0730050152605, 'average_query_length': 443.70707991491946, 'num_documents': 65201, 'num_queries': 3291, 'average_relevant_docs_per_query': 1.0}, 'go': {'average_document_length': 185.0307932251621, 'average_query_length': 233.76803742920464, 'num_documents': 182735, 'num_queries': 8122, 'average_relevant_docs_per_query': 1.0}, 'ruby': {'average_document_length': 214.86204146730464, 'average_query_length': 266.8731165741475, 'num_documents': 27588, 'num_queries': 1261, 'average_relevant_docs_per_query': 1.0}, 'java': {'average_document_length': 281.96280259139183, 'average_query_length': 342.5341853035144, 'num_documents': 181061, 'num_queries': 10955, 'average_relevant_docs_per_query': 1.0}, 'php': {'average_document_length': 268.9752569556027, 'average_query_length': 336.62194947909234, 'num_documents': 268237, 'num_queries': 14014, 'average_relevant_docs_per_query': 1.0}}} | -| [CodeSearchNetRetrieval](https://huggingface.co/datasets/code_search_net/) (Husain et al., 2019) | ['go', 'java', 'javascript', 'php', 'python', 'ruby'] | Retrieval | p2p | [Programming, Written] | {'test': 1000} | {'test': {'python': {'average_document_length': 862.842, 'average_query_length': 466.546, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}, 'javascript': {'average_document_length': 1415.632, 'average_query_length': 186.018, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}, 'go': {'average_document_length': 563.729, 'average_query_length': 125.213, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}, 'ruby': {'average_document_length': 577.634, 'average_query_length': 313.818, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}, 'java': {'average_document_length': 420.287, 'average_query_length': 690.36, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}, 'php': {'average_document_length': 712.129, 'average_query_length': 162.119, 'num_documents': 1000, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}}} | -| [CodeTransOceanContest](https://arxiv.org/abs/2310.04951) (Weixiang Yan, 2023) | ['c++', 'python'] | Retrieval | p2p | [Programming, Written] | | {'test': {'average_document_length': 1528.9156746031747, 'average_query_length': 1012.1131221719457, 'num_documents': 1008, 'num_queries': 221, 'average_relevant_docs_per_query': 1.0}} | -| [CodeTransOceanDL](https://arxiv.org/abs/2310.04951) (Weixiang Yan, 2023) | ['python'] | Retrieval | p2p | [Programming, Written] | | {'test': {'average_document_length': 1479.0735294117646, 'average_query_length': 1867.6222222222223, 'num_documents': 816, 'num_queries': 180, 'average_relevant_docs_per_query': 1.0}} | -| [ContractNLIConfidentialityOfAgreementLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 82} | {'test': 473.17} | -| [ContractNLIExplicitIdentificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel 
Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 109} | {'test': 506.12} | -| [ContractNLIInclusionOfVerballyConveyedInformationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 139} | {'test': 525.75} | -| [ContractNLILimitedUseLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 208} | {'test': 407.51} | -| [ContractNLINoLicensingLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 162} | {'test': 419.42} | -| [ContractNLINoticeOnCompelledDisclosureLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 142} | {'test': 503.45} | -| [ContractNLIPermissibleAcquirementOfSimilarInformationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 178} | {'test': 427.4} | -| [ContractNLIPermissibleCopyLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 87} | {'test': 386.84} | -| [ContractNLIPermissibleDevelopmentOfSimilarInformationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 136} | {'test': 396.4} | -| [ContractNLIPermissiblePostAgreementPossessionLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 111} | {'test': 529.09} | -| [ContractNLIReturnOfConfidentialInformationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 66} | {'test': 478.29} | -| [ContractNLISharingWithEmployeesLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 170} | {'test': 548.63} | -| [ContractNLISharingWithThirdPartiesLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 180} | {'test': 517.29} | -| [ContractNLISurvivalOfObligationsLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 157} | {'test': 417.64} | -| [Core17InstructionRetrieval](https://arxiv.org/abs/2403.15246) (Orion Weller, 2024) | ['eng'] | InstructionRetrieval | s2p | [News, Written] | {'eng': 39838} | {'test': {'num_docs': 19899, 'num_queries': 20, 'average_document_length': 2233.0329664807277, 'average_query_length': 109.75, 'average_instruction_length': 295.55, 'average_changed_instruction_length': 355.2, 'average_relevant_docs_per_query': 32.7, 'average_top_ranked_per_query': 1000.0}} | -| [CorporateLobbyingLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 490} | {'test': 6039.85} | -| [CosQA](https://arxiv.org/abs/2105.13239) (Junjie Huang, 2021) | ['eng', 'python'] | Retrieval | p2p | 
[Programming, Written] | | {'test': {'average_document_length': 276.132741215298, 'average_query_length': 36.814, 'num_documents': 20604, 'num_queries': 500, 'average_relevant_docs_per_query': 1.0}} | -| [CovidRetrieval](https://arxiv.org/abs/2203.03367) | ['cmn'] | Retrieval | s2p | | None | {'dev': {'average_document_length': 332.4152658473415, 'average_query_length': 25.9304531085353, 'num_documents': 100001, 'num_queries': 949, 'average_relevant_docs_per_query': 1.0105374077976819}} | -| [CrossLingualSemanticDiscriminationWMT19](https://huggingface.co/datasets/Andrianos/clsd_wmt19_21) | ['deu', 'fra'] | Retrieval | s2s | [News, Written] | {'test': 2946} | {'test': {'deu-fra': {'average_document_length': 147.49857433808555, 'average_query_length': 152.95587236931433, 'num_documents': 7365, 'num_queries': 1473, 'average_relevant_docs_per_query': 1.0}, 'fra-deu': {'average_document_length': 154.21968771215208, 'average_query_length': 145.877800407332, 'num_documents': 7365, 'num_queries': 1473, 'average_relevant_docs_per_query': 1.0}}} | -| [CrossLingualSemanticDiscriminationWMT21](https://huggingface.co/datasets/Andrianos/clsd_wmt19_21) | ['deu', 'fra'] | Retrieval | s2s | [News, Written] | {'test': 1786} | {'test': {'deu-fra': {'average_document_length': 177.26270996640537, 'average_query_length': 171.73012318029114, 'num_documents': 4465, 'num_queries': 893, 'average_relevant_docs_per_query': 1.0}, 'fra-deu': {'average_document_length': 174.45061590145576, 'average_query_length': 176.99216125419932, 'num_documents': 4465, 'num_queries': 893, 'average_relevant_docs_per_query': 1.0}}} | -| [CyrillicTurkicLangClassification](https://huggingface.co/datasets/tatiana-merz/cyrillic_turkic_langs) (Goldhahn et al., 2012) | ['bak', 'chv', 'kaz', 'kir', 'krc', 'rus', 'sah', 'tat', 'tyv'] | Classification | s2s | [Web, Written] | {'test': 2048} | {'test': 92.22} | -| [CzechProductReviewSentimentClassification](https://aclanthology.org/W13-1609/) | ['ces'] | Classification | s2s | [Reviews, Written] | {'test': 2048} | {'test': 153.26} | -| [CzechSoMeSentimentClassification](https://aclanthology.org/W13-1609/) | ['ces'] | Classification | s2s | [Reviews, Written] | {'test': 1000} | {'test': 59.89} | -| [CzechSubjectivityClassification](https://arxiv.org/abs/2009.08712) | ['ces'] | Classification | s2s | [Reviews, Written] | {'validation': 500, 'test': 2000} | {'validation': 108.2, 'test': 108.3} | -| [DBPedia](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | ['eng'] | Retrieval | s2p | [Written, Encyclopaedic] | None | {'test': {'average_document_length': 1122.7690155333814, 'average_query_length': 48.7264325323475, 'num_documents': 48605, 'num_queries': 541, 'average_relevant_docs_per_query': 1.3752310536044363}} | -| [DBPedia-PL](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | ['pol'] | Retrieval | s2p | [Written, Encyclopaedic] | None | {'test': {'average_document_length': 311.7007956561823, 'average_query_length': 35.45, 'num_documents': 4635922, 'num_queries': 400, 'average_relevant_docs_per_query': 38.215}} | -| [DBPedia-PLHardNegatives](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | ['pol'] | Retrieval | s2p | [Written, Encyclopaedic] | {'test': 400} | {'test': {'average_document_length': 363.468546000768, 'average_query_length': 35.45, 'num_documents': 88542, 'num_queries': 400, 'average_relevant_docs_per_query': 38.215}} | -| [DBPediaHardNegatives](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | ['eng'] 
| Retrieval | s2p | [Written, Encyclopaedic] | {'test': 400} | {'test': {'average_document_length': 338.58561119129564, 'average_query_length': 34.085, 'num_documents': 90070, 'num_queries': 400, 'average_relevant_docs_per_query': 38.215}} | -| [DBpediaClassification](https://arxiv.org/abs/1509.01626) (Zhang et al., 2015) | ['eng'] | Classification | s2s | [Encyclopaedic, Written] | {'test': 70000} | {'test': 281.4} | -| [DKHateClassification](https://aclanthology.org/2020.lrec-1.430/) | ['dan'] | Classification | s2s | [Social, Written] | {'test': 329} | {'test': 104.0} | -| [DalajClassification](https://spraakbanken.gu.se/en/resources/superlim) | ['swe'] | Classification | s2s | [Non-fiction, Written] | {'test': 444} | {'test': 243.8} | -| [DanFeverRetrieval](https://aclanthology.org/2021.nodalida-main.47/) | ['dan'] | Retrieval | p2p | [Encyclopaedic, Non-fiction, Spoken] | {'train': 8897} | {'train': {'average_document_length': 312.1117274167987, 'average_query_length': 50.26957476855484, 'num_documents': 2524, 'num_queries': 6373, 'average_relevant_docs_per_query': 0.48721167425074535}} | -| [DanishPoliticalCommentsClassification](https://huggingface.co/datasets/danish_political_comments) (Mads Guldborg Kjeldgaard Kongsbak, 2019) | ['dan'] | Classification | s2s | [Social, Written] | {'train': 9010} | {'train': 69.9} | -| [DefinitionClassificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 1337} | {'test': 253.72} | -| [DiaBlaBitextMining](https://inria.hal.science/hal-03021633) (González et al., 2019) | ['eng', 'fra'] | BitextMining | s2s | [Social, Written] | {} | {} | -| [Diversity1LegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 300} | {'test': 103.21} | -| [Diversity2LegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 300} | {'test': 0} | -| [Diversity3LegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 300} | {'test': 135.46} | -| [Diversity4LegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 300} | {'test': 144.52} | -| [Diversity5LegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 300} | {'test': 174.77} | -| [Diversity6LegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 300} | {'test': 301.01} | -| [DuRetrieval](https://aclanthology.org/2022.emnlp-main.357.pdf) (Yifu Qiu, 2022) | ['cmn'] | Retrieval | s2p | | None | {'dev': {'average_document_length': 331.3219967800322, 'average_query_length': 9.289, 'num_documents': 100001, 'num_queries': 2000, 'average_relevant_docs_per_query': 4.9195}} | -| [DutchBookReviewSentimentClassification](https://github.com/benjaminvdb/DBRD) (Benjamin et al., 2019) | ['nld'] | Classification | s2s | [Reviews, Written] | {'test': 2224} | {'test': 1443.0} | -| [ESCIReranking](https://github.com/amazon-science/esci-data/) (Chandan K. 
Reddy, 2022) | ['eng', 'jpn', 'spa'] | Reranking | s2p | [Written] | | | -| [EcomRetrieval](https://arxiv.org/abs/2203.03367) | ['cmn'] | Retrieval | s2p | | None | {'dev': {'average_document_length': 32.98041664189015, 'average_query_length': 6.798, 'num_documents': 100902, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}} | -| [EightTagsClustering.v2](https://aclanthology.org/2020.lrec-1.207.pdf) | ['pol'] | Clustering | s2s | [Social, Written] | {'test': 2048} | {'test': 78.73} | -| [EmotionClassification](https://www.aclweb.org/anthology/D18-1404) | ['eng'] | Classification | s2s | [Social, Written] | {'validation': 2000, 'test': 2000} | {'validation': 95.3, 'test': 95.6} | -| [EstQA](https://www.semanticscholar.org/paper/Extractive-Question-Answering-for-Estonian-Language-182912IAPM-Alum%C3%A4e/ea4f60ab36cadca059c880678bc4c51e293a85d6?utm_source=direct_link) | ['est'] | Retrieval | s2p | [Encyclopaedic, Written] | {'test': 603} | {'test': {'average_document_length': 785.595041322314, 'average_query_length': 55.32006633499171, 'num_documents': 121, 'num_queries': 603, 'average_relevant_docs_per_query': 1.0}} | -| [EstonianValenceClassification](https://figshare.com/articles/dataset/Estonian_Valence_Corpus_Eesti_valentsikorpus/24517054) | ['est'] | Classification | s2s | [News, Written] | {'train': 3270, 'test': 818} | {'train': 226.70642201834863, 'test': 231.5085574572127} | -| [FEVER](https://fever.ai/) | ['eng'] | Retrieval | s2p | | None | {'train': {'average_document_length': 538.2340070317589, 'average_query_length': 47.56034058828886, 'num_documents': 5416568, 'num_queries': 109810, 'average_relevant_docs_per_query': 1.2757034878426372}, 'dev': {'average_document_length': 538.2340070317589, 'average_query_length': 47.326282628262824, 'num_documents': 5416568, 'num_queries': 6666, 'average_relevant_docs_per_query': 1.211971197119712}, 'test': {'average_document_length': 538.2340070317589, 'average_query_length': 49.60546054605461, 'num_documents': 5416568, 'num_queries': 6666, 'average_relevant_docs_per_query': 1.1906690669066906}} | -| [FEVERHardNegatives](https://fever.ai/) | ['eng'] | Retrieval | s2p | | {'test': 1000} | {'test': {'average_document_length': 695.4370242764114, 'average_query_length': 49.62, 'num_documents': 163698, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.171}} | -| [FQuADRetrieval](https://huggingface.co/datasets/manu/fquad2_test) | ['fra'] | Retrieval | s2p | [Encyclopaedic, Written] | {'test': 400, 'validation': 100} | {'test': {'average_document_length': 896.3308550185874, 'average_query_length': 58.52, 'num_documents': 269, 'num_queries': 400, 'average_relevant_docs_per_query': 1.0}, 'validation': {'average_document_length': 895.1340206185567, 'average_query_length': 54.13, 'num_documents': 97, 'num_queries': 100, 'average_relevant_docs_per_query': 1.0}} | -| [FaithDial](https://mcgill-nlp.github.io/FaithDial) (Dziri et al., 2022) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | {'test': 2042} | {'test': {'average_document_length': 140.61062447018932, 'average_query_length': 4.926542605288932, 'num_documents': 3539, 'num_queries': 2042, 'average_relevant_docs_per_query': 1.0}} | -| [FalseFriendsGermanEnglish](https://drive.google.com/file/d/1jgq0nBnV-UiYNxbKNrrr2gxDEHm-DMKH/view?usp=share_link) | ['deu'] | PairClassification | s2s | [Written] | {'test': 1524} | {'test': 40.3} | -| [FaroeseSTS](https://aclanthology.org/2023.nodalida-1.74.pdf) | ['fao'] | STS | s2s | [News, Web, Written] | {'train': 729} | {'train': 43.6} 
| -| [FarsTail](https://link.springer.com/article/10.1007/s00500-023-08959-3) (Amirkhani et al., 2023) | ['fas'] | PairClassification | s2s | [Academic, Written] | {'test': 1029} | {'test': 125.84} | -| [FeedbackQARetrieval](https://arxiv.org/abs/2204.03025) | ['eng'] | Retrieval | s2p | [Web, Government, Medical, Written] | {'test': 1992} | {'test': {'average_document_length': 1174.7986463620982, 'average_query_length': 72.33182730923694, 'num_documents': 2364, 'num_queries': 1992, 'average_relevant_docs_per_query': 1.0}} | -| [FiQA-PL](https://sites.google.com/view/fiqa/) (Nandan Thakur, 2021) | ['pol'] | Retrieval | s2p | | None | {'test': {'average_document_length': 795.2371699226205, 'average_query_length': 70.00771604938272, 'num_documents': 57638, 'num_queries': 648, 'average_relevant_docs_per_query': 2.632716049382716}} | -| [FiQA2018](https://sites.google.com/view/fiqa/) (Nandan Thakur, 2021) | ['eng'] | Retrieval | s2p | | None | {'train': {'average_document_length': 767.2108157812554, 'average_query_length': 61.49763636363636, 'num_documents': 57638, 'num_queries': 5500, 'average_relevant_docs_per_query': 2.5756363636363635}, 'dev': {'average_document_length': 767.2108157812554, 'average_query_length': 62.756, 'num_documents': 57638, 'num_queries': 500, 'average_relevant_docs_per_query': 2.476}, 'test': {'average_document_length': 767.2108157812554, 'average_query_length': 62.7037037037037, 'num_documents': 57638, 'num_queries': 648, 'average_relevant_docs_per_query': 2.632716049382716}} | -| [FilipinoHateSpeechClassification](https://pcj.csp.org.ph/index.php/pcj/issue/download/29/PCJ%20V14%20N1%20pp1-14%202019) (Neil Vicente Cabasag et al., 2019) | ['fil'] | Classification | s2s | [Social, Written] | {'validation': 2048, 'test': 2048} | {'validation': 88.1, 'test': 87.4} | -| [FilipinoShopeeReviewsClassification](https://uijrt.com/articles/v4/i8/UIJRTV4I80009.pdf) | ['fil'] | Classification | s2s | [Social, Written] | {'validation': 2250, 'test': 2250} | {'validation': 143.8, 'test': 145.1} | -| [FinParaSTS](https://huggingface.co/datasets/TurkuNLP/turku_paraphrase_corpus) | ['fin'] | STS | s2s | [News, Subtitles, Written] | {'test': 1000, 'validation': 1000} | {'test': 59.0, 'validation': 58.8} | -| [FinToxicityClassification](https://aclanthology.org/2023.nodalida-1.68) | ['fin'] | Classification | s2s | [News, Written] | {'train': 2048, 'test': 2048} | {'train': 432.63, 'test': 401.03} | -| [FinancialPhrasebankClassification](https://arxiv.org/abs/1307.5336) (P. 
Malo, 2014) | ['eng'] | Classification | s2s | [News, Written] | {'train': 4840} | {'train': 121.96} | -| [FloresBitextMining](https://huggingface.co/datasets/facebook/flores) (Goyal et al., 2022) | ['ace', 'acm', 'acq', 'aeb', 'afr', 'ajp', 'aka', 'als', 'amh', 'apc', 'arb', 'ars', 'ary', 'arz', 'asm', 'ast', 'awa', 'ayr', 'azb', 'azj', 'bak', 'bam', 'ban', 'bel', 'bem', 'ben', 'bho', 'bjn', 'bod', 'bos', 'bug', 'bul', 'cat', 'ceb', 'ces', 'cjk', 'ckb', 'crh', 'cym', 'dan', 'deu', 'dik', 'dyu', 'dzo', 'ell', 'eng', 'epo', 'est', 'eus', 'ewe', 'fao', 'fij', 'fin', 'fon', 'fra', 'fur', 'fuv', 'gaz', 'gla', 'gle', 'glg', 'grn', 'guj', 'hat', 'hau', 'heb', 'hin', 'hne', 'hrv', 'hun', 'hye', 'ibo', 'ilo', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kab', 'kac', 'kam', 'kan', 'kas', 'kat', 'kaz', 'kbp', 'kea', 'khk', 'khm', 'kik', 'kin', 'kir', 'kmb', 'kmr', 'knc', 'kon', 'kor', 'lao', 'lij', 'lim', 'lin', 'lit', 'lmo', 'ltg', 'ltz', 'lua', 'lug', 'luo', 'lus', 'lvs', 'mag', 'mai', 'mal', 'mar', 'min', 'mkd', 'mlt', 'mni', 'mos', 'mri', 'mya', 'nld', 'nno', 'nob', 'npi', 'nso', 'nus', 'nya', 'oci', 'ory', 'pag', 'pan', 'pap', 'pbt', 'pes', 'plt', 'pol', 'por', 'prs', 'quy', 'ron', 'run', 'rus', 'sag', 'san', 'sat', 'scn', 'shn', 'sin', 'slk', 'slv', 'smo', 'sna', 'snd', 'som', 'sot', 'spa', 'srd', 'srp', 'ssw', 'sun', 'swe', 'swh', 'szl', 'tam', 'taq', 'tat', 'tel', 'tgk', 'tgl', 'tha', 'tir', 'tpi', 'tsn', 'tso', 'tuk', 'tum', 'tur', 'twi', 'tzm', 'uig', 'ukr', 'umb', 'urd', 'uzn', 'vec', 'vie', 'war', 'wol', 'xho', 'ydd', 'yor', 'yue', 'zho', 'zsm', 'zul'] | BitextMining | s2s | [Non-fiction, Encyclopaedic, Written] | {'dev': 997, 'devtest': 1012} | {} | -| [FrenchBookReviews](https://huggingface.co/datasets/Abirate/french_book_reviews) | ['fra'] | Classification | s2s | [Reviews, Written] | {'train': 2048} | {'train': 311.5} | -| [FrenkEnClassification](https://arxiv.org/abs/1906.02045) (Nikola Ljubešić, 2019) | ['eng'] | Classification | s2s | [Social, Written] | {'test': 2300} | {'test': 188.75} | -| [FrenkHrClassification](https://arxiv.org/abs/1906.02045) (Nikola Ljubešić, 2019) | ['hrv'] | Classification | s2s | [Social, Written] | {'test': 2120} | {'test': 89.86} | -| [FrenkSlClassification](https://arxiv.org/pdf/1906.02045) (Nikola Ljubešić, 2019) | ['slv'] | Classification | s2s | [Social, Written] | {'test': 2177} | {'test': 136.61} | -| [FunctionOfDecisionSectionLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 367} | {'test': 551.07} | -| [GPUSpeedTask](https://github.com/KennethEnevoldsen/scandinavian-embedding-benchmark/blob/c8376f967d1294419be1d3eb41217d04cd3a65d3/src/seb/registered_tasks/speed.py#L83-L96) | ['eng'] | Speed | s2s | [Fiction, Written] | {'test': 1} | {'test': 3591} | -| [GeoreviewClassification](https://github.com/yandex/geo-reviews-dataset-2023) | ['rus'] | Classification | p2p | [Reviews, Written] | {'test': 2048} | {'test': 409.0} | -| [GeoreviewClusteringP2P](https://github.com/yandex/geo-reviews-dataset-2023) | ['rus'] | Clustering | p2p | [Reviews, Written] | {'test': 2000} | {'test': 384.5} | -| [GeorgianFAQRetrieval](https://huggingface.co/datasets/jupyterjazz/georgian-faq) | ['kat'] | Retrieval | s2p | [Web, Written] | {'test': 2566} | {'test': {'average_document_length': 511.24668745128605, 'average_query_length': 61.69551656920078, 'num_documents': 2566, 'num_queries': 2565, 'average_relevant_docs_per_query': 1.0003898635477584}} | -| 
[GerDaLIR](https://github.com/lavis-nlp/GerDaLIR) | ['deu'] | Retrieval | s2p | | None | {'test': {'average_document_length': 15483.237726805888, 'average_query_length': 1027.3495690356156, 'num_documents': 131445, 'num_queries': 12298, 'average_relevant_docs_per_query': 1.1704342169458448}} | -| [GerDaLIRSmall](https://github.com/lavis-nlp/GerDaLIR) | ['deu'] | Retrieval | p2p | [Legal, Written] | None | {'test': {'average_document_length': 19706.823653325308, 'average_query_length': 1031.0680889324833, 'num_documents': 9969, 'num_queries': 12234, 'average_relevant_docs_per_query': 1.1705084191597188}} | -| [GermanDPR](https://huggingface.co/datasets/deepset/germandpr) (Timo Möller, 2021) | ['deu'] | Retrieval | s2p | | None | {'test': {'average_document_length': 1288.3410987482614, 'average_query_length': 64.38439024390244, 'num_documents': 2876, 'num_queries': 1025, 'average_relevant_docs_per_query': 1.0}} | -| [GermanGovServiceRetrieval](https://huggingface.co/datasets/it-at-m/LHM-Dienstleistungen-QA) | ['deu'] | Retrieval | s2p | [Government, Written] | {'test': 357} | {'test': {'average_document_length': 1246.4571428571428, 'average_query_length': 68.17977528089888, 'num_documents': 105, 'num_queries': 356, 'average_relevant_docs_per_query': 1.0}} | -| [GermanPoliticiansTwitterSentimentClassification](https://aclanthology.org/2022.konvens-1.9) | ['deu'] | Classification | s2s | [Social, Government, Written] | {'test': 357} | {'test': 302.48} | -| [GermanQuAD-Retrieval](https://www.kaggle.com/datasets/GermanQuAD) (Timo Möller, 2021) | ['deu'] | Retrieval | s2p | | None | {'test': {'average_document_length': 1941.090717299578, 'average_query_length': 56.74773139745916, 'num_documents': 474, 'num_queries': 2204, 'average_relevant_docs_per_query': 1.0}} | +| [CodeEditSearchRetrieval](https://huggingface.co/datasets/cassanof/CodeEditSearch/viewer) (Niklas Muennighoff, 2023) | ['c', 'c++', 'go', 'java', 'javascript', 'php', 'python', 'ruby', 'rust', 'scala', 'shell', 'swift', 'typescript'] | Retrieval | p2p | [Programming, Written] | {'train': 26000} | {'train': {'number_of_characters': 71.99, 'num_samples': 26000, 'num_queries': 13000, 'num_documents': 13000, 'average_document_length': 0.0, 'average_query_length': 0.01, 'average_relevant_docs_per_query': 1.0, 'hf_subset_descriptive_stats': {'python': {'number_of_characters': 70.52, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'average_document_length': 0.0, 'average_query_length': 0.07, 'average_relevant_docs_per_query': 1.0}, 'javascript': {'number_of_characters': 57.88, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'average_document_length': 0.0, 'average_query_length': 0.06, 'average_relevant_docs_per_query': 1.0}, 'typescript': {'number_of_characters': 61.09, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'average_document_length': 0.0, 'average_query_length': 0.06, 'average_relevant_docs_per_query': 1.0}, 'go': {'number_of_characters': 71.8, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'average_document_length': 0.0, 'average_query_length': 0.07, 'average_relevant_docs_per_query': 1.0}, 'ruby': {'number_of_characters': 67.9, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'average_document_length': 0.0, 'average_query_length': 0.07, 'average_relevant_docs_per_query': 1.0}, 'java': {'number_of_characters': 63.98, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'average_document_length': 0.0, 'average_query_length': 0.06, 
'average_relevant_docs_per_query': 1.0}, 'php': {'number_of_characters': 62.93, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'average_document_length': 0.0, 'average_query_length': 0.06, 'average_relevant_docs_per_query': 1.0}, 'c': {'number_of_characters': 98.59, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'average_document_length': 0.0, 'average_query_length': 0.1, 'average_relevant_docs_per_query': 1.0}, 'c++': {'number_of_characters': 115.48, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'average_document_length': 0.0, 'average_query_length': 0.11, 'average_relevant_docs_per_query': 1.0}, 'rust': {'number_of_characters': 68.5, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'average_document_length': 0.0, 'average_query_length': 0.07, 'average_relevant_docs_per_query': 1.0}, 'swift': {'number_of_characters': 58.28, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'average_document_length': 0.0, 'average_query_length': 0.06, 'average_relevant_docs_per_query': 1.0}, 'scala': {'number_of_characters': 65.83, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'average_document_length': 0.0, 'average_query_length': 0.06, 'average_relevant_docs_per_query': 1.0}, 'shell': {'number_of_characters': 73.06, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'average_document_length': 0.0, 'average_query_length': 0.07, 'average_relevant_docs_per_query': 1.0}}}} | +| [CodeFeedbackMT](https://arxiv.org/abs/2402.14658) (Tianyu Zheng, 2024) | ['eng'] | Retrieval | p2p | [Programming, Written] | {'test': 79660} | {'test': {'number_of_characters': 5894.4, 'num_samples': 79660, 'num_queries': 13277, 'num_documents': 66383, 'average_document_length': 0.02, 'average_query_length': 0.33, 'average_relevant_docs_per_query': 1.0}} | +| [CodeFeedbackST](https://arxiv.org/abs/2407.02883) (Xiangyang Li, 2024) | ['eng'] | Retrieval | p2p | [Programming, Written] | {'test': 187832} | {'test': {'number_of_characters': 2246.58, 'num_samples': 187832, 'num_queries': 31306, 'num_documents': 156526, 'average_document_length': 0.01, 'average_query_length': 0.02, 'average_relevant_docs_per_query': 1.0}} | +| [CodeSearchNetCCRetrieval](https://arxiv.org/abs/2407.02883) (Xiangyang Li, 2024) | ['go', 'java', 'javascript', 'php', 'python', 'ruby'] | Retrieval | p2p | [Programming, Written] | {'test': 1058035} | {'test': {'number_of_characters': 390.06, 'num_samples': 1058035, 'num_queries': 52561, 'num_documents': 1005474, 'average_document_length': 0.0, 'average_query_length': 0.01, 'average_relevant_docs_per_query': 1.0, 'hf_subset_descriptive_stats': {'python': {'number_of_characters': 553.79, 'num_samples': 295570, 'num_queries': 14918, 'num_documents': 280652, 'average_document_length': 0.0, 'average_query_length': 0.04, 'average_relevant_docs_per_query': 1.0}, 'javascript': {'number_of_characters': 445.71, 'num_samples': 68492, 'num_queries': 3291, 'num_documents': 65201, 'average_document_length': 0.0, 'average_query_length': 0.13, 'average_relevant_docs_per_query': 1.0}, 'go': {'number_of_characters': 235.77, 'num_samples': 190857, 'num_queries': 8122, 'num_documents': 182735, 'average_document_length': 0.0, 'average_query_length': 0.03, 'average_relevant_docs_per_query': 1.0}, 'ruby': {'number_of_characters': 268.87, 'num_samples': 28849, 'num_queries': 1261, 'num_documents': 27588, 'average_document_length': 0.0, 'average_query_length': 0.21, 'average_relevant_docs_per_query': 1.0}, 'java': 
{'number_of_characters': 344.53, 'num_samples': 192016, 'num_queries': 10955, 'num_documents': 181061, 'average_document_length': 0.0, 'average_query_length': 0.03, 'average_relevant_docs_per_query': 1.0}, 'php': {'number_of_characters': 338.62, 'num_samples': 282251, 'num_queries': 14014, 'num_documents': 268237, 'average_document_length': 0.0, 'average_query_length': 0.02, 'average_relevant_docs_per_query': 1.0}}}} | +| [CodeSearchNetRetrieval](https://huggingface.co/datasets/code_search_net/) (Husain et al., 2019) | ['go', 'java', 'javascript', 'php', 'python', 'ruby'] | Retrieval | p2p | [Programming, Written] | {'test': 12000} | {'test': {'number_of_characters': 325.01, 'num_samples': 12000, 'num_queries': 6000, 'num_documents': 6000, 'average_document_length': 0.0, 'average_query_length': 0.05, 'average_relevant_docs_per_query': 1.0, 'hf_subset_descriptive_stats': {'python': {'number_of_characters': 467.55, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'average_document_length': 0.0, 'average_query_length': 0.47, 'average_relevant_docs_per_query': 1.0}, 'javascript': {'number_of_characters': 187.02, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'average_document_length': 0.0, 'average_query_length': 0.19, 'average_relevant_docs_per_query': 1.0}, 'go': {'number_of_characters': 126.21, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'average_document_length': 0.0, 'average_query_length': 0.13, 'average_relevant_docs_per_query': 1.0}, 'ruby': {'number_of_characters': 314.82, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'average_document_length': 0.0, 'average_query_length': 0.31, 'average_relevant_docs_per_query': 1.0}, 'java': {'number_of_characters': 691.36, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'average_document_length': 0.0, 'average_query_length': 0.69, 'average_relevant_docs_per_query': 1.0}, 'php': {'number_of_characters': 163.12, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'average_document_length': 0.0, 'average_query_length': 0.16, 'average_relevant_docs_per_query': 1.0}}}} | +| [CodeTransOceanContest](https://arxiv.org/abs/2310.04951) (Weixiang Yan, 2023) | ['c++', 'python'] | Retrieval | p2p | [Programming, Written] | {'test': 1229} | {'test': {'number_of_characters': 2520.65, 'num_samples': 1229, 'num_queries': 221, 'num_documents': 1008, 'average_document_length': 1.5, 'average_query_length': 4.58, 'average_relevant_docs_per_query': 1.0}} | +| [CodeTransOceanDL](https://arxiv.org/abs/2310.04951) (Weixiang Yan, 2023) | ['python'] | Retrieval | p2p | [Programming, Written] | {'test': 996} | {'test': {'number_of_characters': 3347.7, 'num_samples': 996, 'num_queries': 180, 'num_documents': 816, 'average_document_length': 1.81, 'average_query_length': 10.38, 'average_relevant_docs_per_query': 1.0}} | +| [ContractNLIConfidentialityOfAgreementLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [ContractNLIExplicitIdentificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [ContractNLIInclusionOfVerballyConveyedInformationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| 
[ContractNLILimitedUseLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [ContractNLINoLicensingLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [ContractNLINoticeOnCompelledDisclosureLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [ContractNLIPermissibleAcquirementOfSimilarInformationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [ContractNLIPermissibleCopyLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [ContractNLIPermissibleDevelopmentOfSimilarInformationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [ContractNLIPermissiblePostAgreementPossessionLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [ContractNLIReturnOfConfidentialInformationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [ContractNLISharingWithEmployeesLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [ContractNLISharingWithThirdPartiesLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [ContractNLISurvivalOfObligationsLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [Core17InstructionRetrieval](https://arxiv.org/abs/2403.15246) (Orion Weller, 2024) | ['eng'] | InstructionRetrieval | s2p | [News, Written] | {'test': 19919} | {'test': {'num_samples': 19919, 'num_docs': 19899, 'num_queries': 20, 'number_of_characters': 44450333, 'average_document_length': 2233.03, 'average_query_length': 109.75, 'average_instruction_length': 295.55, 'average_changed_instruction_length': 355.2, 'average_relevant_docs_per_query': 32.7, 'average_top_ranked_per_query': 1000.0}} | +| [CorporateLobbyingLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [CosQA](https://arxiv.org/abs/2105.13239) (Junjie Huang, 2021) | ['eng', 'python'] | Retrieval | p2p | [Programming, Written] | {'test': 21104} | {'test': {'number_of_characters': 313.95, 'num_samples': 21104, 'num_queries': 500, 'num_documents': 20604, 'average_document_length': 0.01, 'average_query_length': 0.07, 'average_relevant_docs_per_query': 1.0}} | +| [CovidRetrieval](https://arxiv.org/abs/2203.03367) | ['cmn'] | Retrieval | s2p | | None | None | +| [CrossLingualSemanticDiscriminationWMT19](https://huggingface.co/datasets/Andrianos/clsd_wmt19_21) | ['deu', 'fra'] | Retrieval | s2s | [News, Written] | None | None | +| 
[CrossLingualSemanticDiscriminationWMT21](https://huggingface.co/datasets/Andrianos/clsd_wmt19_21) | ['deu', 'fra'] | Retrieval | s2s | [News, Written] | None | None | +| [CyrillicTurkicLangClassification](https://huggingface.co/datasets/tatiana-merz/cyrillic_turkic_langs) (Goldhahn et al., 2012) | ['bak', 'chv', 'kaz', 'kir', 'krc', 'rus', 'sah', 'tat', 'tyv'] | Classification | s2s | [Web, Written] | None | None | +| [CzechProductReviewSentimentClassification](https://aclanthology.org/W13-1609/) | ['ces'] | Classification | s2s | [Reviews, Written] | None | None | +| [CzechSoMeSentimentClassification](https://aclanthology.org/W13-1609/) | ['ces'] | Classification | s2s | [Reviews, Written] | None | None | +| [CzechSubjectivityClassification](https://arxiv.org/abs/2009.08712) | ['ces'] | Classification | s2s | [Reviews, Written] | None | None | +| [DBPedia](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | ['eng'] | Retrieval | s2p | [Written, Encyclopaedic] | None | None | +| [DBPedia-PL](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | ['pol'] | Retrieval | s2p | [Written, Encyclopaedic] | None | None | +| [DBPedia-PLHardNegatives](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | ['pol'] | Retrieval | s2p | [Written, Encyclopaedic] | None | None | +| [DBPediaHardNegatives](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | ['eng'] | Retrieval | s2p | [Written, Encyclopaedic] | None | None | +| [DBpediaClassification](https://arxiv.org/abs/1509.01626) (Zhang et al., 2015) | ['eng'] | Classification | s2s | [Encyclopaedic, Written] | None | None | +| [DKHateClassification](https://aclanthology.org/2020.lrec-1.430/) | ['dan'] | Classification | s2s | [Social, Written] | None | None | +| [DalajClassification](https://spraakbanken.gu.se/en/resources/superlim) | ['swe'] | Classification | s2s | [Non-fiction, Written] | None | None | +| [DanFeverRetrieval](https://aclanthology.org/2021.nodalida-main.47/) | ['dan'] | Retrieval | p2p | [Encyclopaedic, Non-fiction, Spoken] | None | None | +| [DanishPoliticalCommentsClassification](https://huggingface.co/datasets/danish_political_comments) (Mads Guldborg Kjeldgaard Kongsbak, 2019) | ['dan'] | Classification | s2s | [Social, Written] | None | None | +| [DefinitionClassificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [DiaBlaBitextMining](https://inria.hal.science/hal-03021633) (González et al., 2019) | ['eng', 'fra'] | BitextMining | s2s | [Social, Written] | None | None | +| [Diversity1LegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [Diversity2LegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [Diversity3LegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [Diversity4LegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [Diversity5LegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| 
[Diversity6LegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [DuRetrieval](https://aclanthology.org/2022.emnlp-main.357.pdf) (Yifu Qiu, 2022) | ['cmn'] | Retrieval | s2p | | None | None | +| [DutchBookReviewSentimentClassification](https://github.com/benjaminvdb/DBRD) (Benjamin et al., 2019) | ['nld'] | Classification | s2s | [Reviews, Written] | None | None | +| [ESCIReranking](https://github.com/amazon-science/esci-data/) (Chandan K. Reddy, 2022) | ['eng', 'jpn', 'spa'] | Reranking | s2p | [Written] | {'test': 29285} | {'test': {'num_samples': 29285, 'number_of_characters': 254538331, 'num_positive': 271416, 'num_negative': 44235, 'avg_query_len': 19.69, 'avg_positive_len': 803.92, 'avg_negative_len': 808.5, 'hf_subset_descriptive_stats': {'us': {'num_samples': 21296, 'number_of_characters': 186915609, 'num_positive': 189375, 'num_negative': 25463, 'avg_query_len': 21.44, 'avg_positive_len': 868.37, 'avg_negative_len': 864.45}, 'es': {'num_samples': 3703, 'number_of_characters': 48861389, 'num_positive': 39110, 'num_negative': 10183, 'avg_query_len': 20.68, 'avg_positive_len': 980.96, 'avg_negative_len': 1023.22}, 'jp': {'num_samples': 4286, 'number_of_characters': 18761333, 'num_positive': 42931, 'num_negative': 8589, 'avg_query_len': 10.15, 'avg_positive_len': 358.36, 'avg_negative_len': 388.08}}}} | +| [EcomRetrieval](https://arxiv.org/abs/2203.03367) | ['cmn'] | Retrieval | s2p | | None | None | +| [EightTagsClustering.v2](https://aclanthology.org/2020.lrec-1.207.pdf) | ['pol'] | Clustering | s2s | [Social, Written] | None | None | +| [EmotionClassification](https://www.aclweb.org/anthology/D18-1404) | ['eng'] | Classification | s2s | [Social, Written] | None | None | +| [EstQA](https://www.semanticscholar.org/paper/Extractive-Question-Answering-for-Estonian-Language-182912IAPM-Alum%C3%A4e/ea4f60ab36cadca059c880678bc4c51e293a85d6?utm_source=direct_link) | ['est'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | +| [EstonianValenceClassification](https://figshare.com/articles/dataset/Estonian_Valence_Corpus_Eesti_valentsikorpus/24517054) | ['est'] | Classification | s2s | [News, Written] | None | None | +| [FEVER](https://fever.ai/) | ['eng'] | Retrieval | s2p | | None | None | +| [FEVERHardNegatives](https://fever.ai/) | ['eng'] | Retrieval | s2p | | None | None | +| [FQuADRetrieval](https://huggingface.co/datasets/manu/fquad2_test) | ['fra'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | +| [FaithDial](https://mcgill-nlp.github.io/FaithDial) (Dziri et al., 2022) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | +| [FalseFriendsGermanEnglish](https://drive.google.com/file/d/1jgq0nBnV-UiYNxbKNrrr2gxDEHm-DMKH/view?usp=share_link) | ['deu'] | PairClassification | s2s | [Written] | None | None | +| [FaroeseSTS](https://aclanthology.org/2023.nodalida-1.74.pdf) | ['fao'] | STS | s2s | [News, Web, Written] | None | None | +| [FarsTail](https://link.springer.com/article/10.1007/s00500-023-08959-3) (Amirkhani et al., 2023) | ['fas'] | PairClassification | s2s | [Academic, Written] | None | None | +| [FeedbackQARetrieval](https://arxiv.org/abs/2204.03025) | ['eng'] | Retrieval | s2p | [Web, Government, Medical, Written] | None | None | +| [FiQA-PL](https://sites.google.com/view/fiqa/) (Nandan Thakur, 2021) | ['pol'] | Retrieval | s2p | | None | None | +| [FiQA2018](https://sites.google.com/view/fiqa/) (Nandan Thakur, 2021) | ['eng'] | 
Retrieval | s2p | | None | None | +| [FilipinoHateSpeechClassification](https://pcj.csp.org.ph/index.php/pcj/issue/download/29/PCJ%20V14%20N1%20pp1-14%202019) (Neil Vicente Cabasag et al., 2019) | ['fil'] | Classification | s2s | [Social, Written] | None | None | +| [FilipinoShopeeReviewsClassification](https://uijrt.com/articles/v4/i8/UIJRTV4I80009.pdf) | ['fil'] | Classification | s2s | [Social, Written] | None | None | +| [FinParaSTS](https://huggingface.co/datasets/TurkuNLP/turku_paraphrase_corpus) | ['fin'] | STS | s2s | [News, Subtitles, Written] | None | None | +| [FinToxicityClassification](https://aclanthology.org/2023.nodalida-1.68) | ['fin'] | Classification | s2s | [News, Written] | None | None | +| [FinancialPhrasebankClassification](https://arxiv.org/abs/1307.5336) (P. Malo, 2014) | ['eng'] | Classification | s2s | [News, Written] | None | None | +| [FloresBitextMining](https://huggingface.co/datasets/facebook/flores) (Goyal et al., 2022) | ['ace', 'acm', 'acq', 'aeb', 'afr', 'ajp', 'aka', 'als', 'amh', 'apc', 'arb', 'ars', 'ary', 'arz', 'asm', 'ast', 'awa', 'ayr', 'azb', 'azj', 'bak', 'bam', 'ban', 'bel', 'bem', 'ben', 'bho', 'bjn', 'bod', 'bos', 'bug', 'bul', 'cat', 'ceb', 'ces', 'cjk', 'ckb', 'crh', 'cym', 'dan', 'deu', 'dik', 'dyu', 'dzo', 'ell', 'eng', 'epo', 'est', 'eus', 'ewe', 'fao', 'fij', 'fin', 'fon', 'fra', 'fur', 'fuv', 'gaz', 'gla', 'gle', 'glg', 'grn', 'guj', 'hat', 'hau', 'heb', 'hin', 'hne', 'hrv', 'hun', 'hye', 'ibo', 'ilo', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kab', 'kac', 'kam', 'kan', 'kas', 'kat', 'kaz', 'kbp', 'kea', 'khk', 'khm', 'kik', 'kin', 'kir', 'kmb', 'kmr', 'knc', 'kon', 'kor', 'lao', 'lij', 'lim', 'lin', 'lit', 'lmo', 'ltg', 'ltz', 'lua', 'lug', 'luo', 'lus', 'lvs', 'mag', 'mai', 'mal', 'mar', 'min', 'mkd', 'mlt', 'mni', 'mos', 'mri', 'mya', 'nld', 'nno', 'nob', 'npi', 'nso', 'nus', 'nya', 'oci', 'ory', 'pag', 'pan', 'pap', 'pbt', 'pes', 'plt', 'pol', 'por', 'prs', 'quy', 'ron', 'run', 'rus', 'sag', 'san', 'sat', 'scn', 'shn', 'sin', 'slk', 'slv', 'smo', 'sna', 'snd', 'som', 'sot', 'spa', 'srd', 'srp', 'ssw', 'sun', 'swe', 'swh', 'szl', 'tam', 'taq', 'tat', 'tel', 'tgk', 'tgl', 'tha', 'tir', 'tpi', 'tsn', 'tso', 'tuk', 'tum', 'tur', 'twi', 'tzm', 'uig', 'ukr', 'umb', 'urd', 'uzn', 'vec', 'vie', 'war', 'wol', 'xho', 'ydd', 'yor', 'yue', 'zho', 'zsm', 'zul'] | BitextMining | s2s | [Non-fiction, Encyclopaedic, Written] | None | None | +| [FrenchBookReviews](https://huggingface.co/datasets/Abirate/french_book_reviews) | ['fra'] | Classification | s2s | [Reviews, Written] | None | None | +| [FrenkEnClassification](https://arxiv.org/abs/1906.02045) (Nikola Ljubešić, 2019) | ['eng'] | Classification | s2s | [Social, Written] | None | None | +| [FrenkHrClassification](https://arxiv.org/abs/1906.02045) (Nikola Ljubešić, 2019) | ['hrv'] | Classification | s2s | [Social, Written] | None | None | +| [FrenkSlClassification](https://arxiv.org/pdf/1906.02045) (Nikola Ljubešić, 2019) | ['slv'] | Classification | s2s | [Social, Written] | None | None | +| [FunctionOfDecisionSectionLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [GPUSpeedTask](https://github.com/KennethEnevoldsen/scandinavian-embedding-benchmark/blob/c8376f967d1294419be1d3eb41217d04cd3a65d3/src/seb/registered_tasks/speed.py#L83-L96) | ['eng'] | Speed | s2s | [Fiction, Written] | None | None | +| [GeoreviewClassification](https://github.com/yandex/geo-reviews-dataset-2023) | ['rus'] 
| Classification | p2p | [Reviews, Written] | None | None | +| [GeoreviewClusteringP2P](https://github.com/yandex/geo-reviews-dataset-2023) | ['rus'] | Clustering | p2p | [Reviews, Written] | None | None | +| [GeorgianFAQRetrieval](https://huggingface.co/datasets/jupyterjazz/georgian-faq) | ['kat'] | Retrieval | s2p | [Web, Written] | None | None | +| [GerDaLIR](https://github.com/lavis-nlp/GerDaLIR) | ['deu'] | Retrieval | s2p | | None | None | +| [GerDaLIRSmall](https://github.com/lavis-nlp/GerDaLIR) | ['deu'] | Retrieval | p2p | [Legal, Written] | None | None | +| [GermanDPR](https://huggingface.co/datasets/deepset/germandpr) (Timo Möller, 2021) | ['deu'] | Retrieval | s2p | | None | None | +| [GermanGovServiceRetrieval](https://huggingface.co/datasets/it-at-m/LHM-Dienstleistungen-QA) | ['deu'] | Retrieval | s2p | [Government, Written] | None | None | +| [GermanPoliticiansTwitterSentimentClassification](https://aclanthology.org/2022.konvens-1.9) | ['deu'] | Classification | s2s | [Social, Government, Written] | None | None | +| [GermanQuAD-Retrieval](https://www.kaggle.com/datasets/GermanQuAD) (Timo Möller, 2021) | ['deu'] | Retrieval | s2p | | None | None | | [GermanSTSBenchmark](https://github.com/t-systems-on-site-services-gmbh/german-STSbenchmark) (Philip May, 2021) | ['deu'] | STS | s2s | | None | None | | [GreekCivicsQA](https://huggingface.co/datasets/antoinelb7/alloprof) | ['ell'] | Retrieval | s2p | [Academic, Written] | None | None | | [GreekLegalCodeClassification](https://arxiv.org/abs/2109.15298) | ['ell'] | Classification | s2s | [Legal, Written] | None | None | @@ -344,48 +344,48 @@ The following tables give you an overview of the tasks in MTEB. | [MultiHateClassification](https://aclanthology.org/2022.woah-1.15/) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'hin', 'ita', 'nld', 'pol', 'por', 'spa'] | Classification | s2s | [Constructed, Written] | None | None | | [MultiLongDocRetrieval](https://arxiv.org/abs/2402.03216) (Jianlv Chen, 2024) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'hin', 'ita', 'jpn', 'kor', 'por', 'rus', 'spa', 'tha'] | Retrieval | s2p | [Encyclopaedic, Written, Web, Non-fiction, Fiction] | None | None | | [MultilingualSentiment](https://github.com/tyqiangz/multilingual-sentiment-datasets) | ['cmn'] | Classification | s2s | | None | None | -| [MultilingualSentimentClassification](https://huggingface.co/datasets/mteb/multilingual-sentiment-classification) | ['ara', 'bam', 'bul', 'cmn', 'cym', 'deu', 'dza', 'ell', 'eng', 'eus', 'fas', 'fin', 'heb', 'hrv', 'ind', 'jpn', 'kor', 'mlt', 'nor', 'pol', 'rus', 'slk', 'spa', 'tha', 'tur', 'uig', 'urd', 'vie', 'zho'] | Classification | s2s | [Reviews, Written] | {'test': 7000} | {'test': 56} | -| [MyanmarNews](https://huggingface.co/datasets/myanmar_news) (A. H. 
Khine, 2017) | ['mya'] | Classification | p2p | [News, Written] | {'train': 2048} | {'train': 174.2} | -| [NFCorpus](https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/) (Boteva et al., 2016) | ['eng'] | Retrieval | s2p | | None | {'test': {'average_document_length': 1589.783925130746, 'average_query_length': 21.764705882352942, 'num_documents': 3633, 'num_queries': 323, 'average_relevant_docs_per_query': 38.18575851393189}} | -| [NFCorpus-PL](https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | | None | {'test': {'average_document_length': 1652.1926782273604, 'average_query_length': 24.390092879256965, 'num_documents': 3633, 'num_queries': 323, 'average_relevant_docs_per_query': 38.18575851393189}} | -| [NLPJournalAbsIntroRetrieval](https://github.com/sbintuitions/JMTEB) | ['jpn'] | Retrieval | s2s | [Academic, Written] | {'test': 404} | {'test': {'average_document_length': 2052.8611111111113, 'average_query_length': 439.2772277227723, 'num_documents': 504, 'num_queries': 404, 'average_relevant_docs_per_query': 1.0}} | -| [NLPJournalTitleAbsRetrieval](https://github.com/sbintuitions/JMTEB) | ['jpn'] | Retrieval | s2s | [Academic, Written] | {'test': 404} | {'test': {'average_document_length': 441.6746031746032, 'average_query_length': 27.60891089108911, 'num_documents': 504, 'num_queries': 404, 'average_relevant_docs_per_query': 1.0}} | -| [NLPJournalTitleIntroRetrieval](https://github.com/sbintuitions/JMTEB) | ['jpn'] | Retrieval | s2s | [Academic, Written] | {'test': 404} | {'test': {'average_document_length': 2052.8611111111113, 'average_query_length': 27.60891089108911, 'num_documents': 504, 'num_queries': 404, 'average_relevant_docs_per_query': 1.0}} | -| [NQ](https://ai.google.com/research/NaturalQuestions/) (Tom Kwiatkowski, 2019) | ['eng'] | Retrieval | s2p | | None | {'test': {'average_document_length': 492.2287851281462, 'average_query_length': 48.17902665121669, 'num_documents': 2681468, 'num_queries': 3452, 'average_relevant_docs_per_query': 1.2169756662804172}} | -| [NQ-PL](https://ai.google.com/research/NaturalQuestions/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | | None | {'test': {'average_document_length': 502.14302128535564, 'average_query_length': 48.31662804171495, 'num_documents': 2681468, 'num_queries': 3452, 'average_relevant_docs_per_query': 1.2169756662804172}} | -| [NQ-PLHardNegatives](https://ai.google.com/research/NaturalQuestions/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | | {'test': 1000} | {'test': {'average_document_length': 610.7449138094336, 'average_query_length': 48.381, 'num_documents': 184765, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.213}} | -| [NQHardNegatives](https://ai.google.com/research/NaturalQuestions/) (Tom Kwiatkowski, 2019) | ['eng'] | Retrieval | s2p | | {'test': 1000} | {'test': {'average_document_length': 602.7903551179953, 'average_query_length': 47.878, 'num_documents': 198779, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.213}} | -| [NTREXBitextMining](https://huggingface.co/datasets/davidstap/NTREX) | ['afr', 'amh', 'arb', 'aze', 'bak', 'bel', 'bem', 'ben', 'bod', 'bos', 'bul', 'cat', 'ces', 'ckb', 'cym', 'dan', 'deu', 'div', 'dzo', 'ell', 'eng', 'eus', 'ewe', 'fao', 'fas', 'fij', 'fil', 'fin', 'fra', 'fuc', 'gle', 'glg', 'guj', 'hau', 'heb', 'hin', 'hmn', 'hrv', 'hun', 'hye', 'ibo', 'ind', 'isl', 'ita', 'jpn', 'kan', 'kat', 'kaz', 'khm', 'kin', 'kir', 'kmr', 'kor', 'lao', 'lav', 'lit', 'ltz', 'mal', 'mar', 'mey', 'mkd', 
'mlg', 'mlt', 'mon', 'mri', 'msa', 'mya', 'nde', 'nep', 'nld', 'nno', 'nob', 'nso', 'nya', 'orm', 'pan', 'pol', 'por', 'prs', 'pus', 'ron', 'rus', 'shi', 'sin', 'slk', 'slv', 'smo', 'sna', 'snd', 'som', 'spa', 'sqi', 'srp', 'ssw', 'swa', 'swe', 'tah', 'tam', 'tat', 'tel', 'tgk', 'tha', 'tir', 'ton', 'tsn', 'tuk', 'tur', 'uig', 'ukr', 'urd', 'uzb', 'ven', 'vie', 'wol', 'xho', 'yor', 'yue', 'zho', 'zul'] | BitextMining | s2s | [News, Written] | {'test': 3826252} | {'test': 120} | -| [NYSJudicialEthicsLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 292} | {'test': 159.45} | -| [NaijaSenti](https://github.com/hausanlp/NaijaSenti) | ['hau', 'ibo', 'pcm', 'yor'] | Classification | s2s | [Social, Written] | {'test': 4800} | {'test': 72.81} | -| [NarrativeQARetrieval](https://metatext.io/datasets/narrativeqa) (Tomáš Kočiský, 2017) | ['eng'] | Retrieval | s2p | | None | {'test': {'average_document_length': 326753.5323943662, 'average_query_length': 47.730889457232166, 'num_documents': 355, 'num_queries': 10557, 'average_relevant_docs_per_query': 1.0}} | -| [NepaliNewsClassification](https://github.com/goru001/nlp-for-nepali) | ['nep'] | Classification | s2s | [News, Written] | {'train': 5975, 'test': 1495} | {'train': 196.61, 'test': 196.017} | -| [NeuCLIR2022Retrieval](https://neuclir.github.io/) (Lawrie et al., 2023) | ['fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | {'fas': 2232130, 'zho': 3179323, 'rus': 4627657} | {'test': {'fas': {'average_document_length': 2032.093148525817, 'average_query_length': 85.4298245614035, 'num_documents': 2232016, 'num_queries': 114, 'average_relevant_docs_per_query': 12.912280701754385}, 'rus': {'average_document_length': 1757.9129983233004, 'average_query_length': 85.58771929824562, 'num_documents': 4627543, 'num_queries': 114, 'average_relevant_docs_per_query': 16.57017543859649}, 'zho': {'average_document_length': 743.1426659901881, 'average_query_length': 24.17543859649123, 'num_documents': 3179209, 'num_queries': 114, 'average_relevant_docs_per_query': 18.710526315789473}}} | -| [NeuCLIR2022RetrievalHardNegatives](https://neuclir.github.io/) (Lawrie et al., 2023) | ['fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | None | {'test': {'average_document_length': 2066.9453653646488, 'average_query_length': 63.529411764705884, 'num_documents': 27931, 'num_queries': 136, 'average_relevant_docs_per_query': 40.39705882352941, 'hf_subset_descriptive_stats': {'fas': {'average_document_length': 2816.847782031074, 'average_query_length': 83.26666666666667, 'num_documents': 8882, 'num_queries': 45, 'average_relevant_docs_per_query': 32.71111111111111}, 'rus': {'average_document_length': 2446.5574277854193, 'average_query_length': 85.56818181818181, 'num_documents': 8724, 'num_queries': 44, 'average_relevant_docs_per_query': 42.93181818181818}, 'zho': {'average_document_length': 1101.0984987893462, 'average_query_length': 24.0, 'num_documents': 10325, 'num_queries': 47, 'average_relevant_docs_per_query': 45.38297872340426}}}} | -| [NeuCLIR2023Retrieval](https://neuclir.github.io/) (Dawn Lawrie, 2024) | ['fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | {'fas': 2232092, 'zho': 3179285, 'rus': 4627619} | {'test': {'fas': {'average_document_length': 2032.093148525817, 'average_query_length': 65.48684210526316, 'num_documents': 2232016, 'num_queries': 76, 'average_relevant_docs_per_query': 66.28947368421052}, 'rus': 
{'average_document_length': 1757.9129983233004, 'average_query_length': 74.4342105263158, 'num_documents': 4627543, 'num_queries': 76, 'average_relevant_docs_per_query': 62.223684210526315}, 'zho': {'average_document_length': 743.1426659901881, 'average_query_length': 22.210526315789473, 'num_documents': 3179209, 'num_queries': 76, 'average_relevant_docs_per_query': 53.68421052631579}}} | -| [NeuCLIR2023RetrievalHardNegatives](https://neuclir.github.io/) (Dawn Lawrie, 2024) | ['fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | None | {'test': {'average_document_length': 2236.175955333482, 'average_query_length': 54.10267857142857, 'num_documents': 49433, 'num_queries': 224, 'average_relevant_docs_per_query': 61.816964285714285, 'hf_subset_descriptive_stats': {'fas': {'average_document_length': 2895.869857421016, 'average_query_length': 65.89189189189189, 'num_documents': 15921, 'num_queries': 74, 'average_relevant_docs_per_query': 68.08108108108108}, 'rus': {'average_document_length': 2724.294762109928, 'average_query_length': 74.41333333333333, 'num_documents': 16247, 'num_queries': 75, 'average_relevant_docs_per_query': 63.053333333333335}, 'zho': {'average_document_length': 1168.4984071821605, 'average_query_length': 22.16, 'num_documents': 17265, 'num_queries': 75, 'average_relevant_docs_per_query': 54.4}}}} | -| [News21InstructionRetrieval](https://arxiv.org/abs/2403.15246) (Orion Weller, 2024) | ['eng'] | InstructionRetrieval | s2p | [News, Written] | {'eng': 61906} | {'eng': 2983.724665391969} | -| [NewsClassification](https://arxiv.org/abs/1509.01626) (Zhang et al., 2015) | ['eng'] | Classification | s2s | [News, Written] | {'test': 7600} | {'test': 235.29} | -| [NoRecClassification](https://aclanthology.org/L18-1661/) | ['nob'] | Classification | s2s | [Written, Reviews] | {'test': 2050} | {'test': 82} | -| [NollySentiBitextMining](https://github.com/IyanuSh/NollySenti) (Shode et al., 2023) | ['eng', 'hau', 'ibo', 'pcm', 'yor'] | BitextMining | s2s | [Social, Reviews, Written] | {'train': 1640} | {'train': 135.91} | -| [NorQuadRetrieval](https://aclanthology.org/2023.nodalida-1.17/) | ['nob'] | Retrieval | p2p | [Encyclopaedic, Non-fiction, Written] | {'test': 2602} | {'test': {'average_document_length': 214.5114503816794, 'average_query_length': 47.896484375, 'num_documents': 1048, 'num_queries': 1024, 'average_relevant_docs_per_query': 2.0}} | -| [NordicLangClassification](https://aclanthology.org/2021.vardial-1.8/) | ['dan', 'fao', 'isl', 'nno', 'nob', 'swe'] | Classification | s2s | [Encyclopaedic] | {'test': 3000} | {'test': 78.2} | -| [NorwegianCourtsBitextMining](https://opus.nlpl.eu/index.php) (Tiedemann et al., 2020) | ['nno', 'nob'] | BitextMining | s2s | [Legal, Written] | {'test': 2050} | {'test': 1884.0} | -| [NorwegianParliamentClassification](https://huggingface.co/datasets/NbAiLab/norwegian_parliament) | ['nob'] | Classification | s2s | [Government, Spoken] | {'test': 1200, 'validation': 1200} | {'test': 1884.0, 'validation': 1911.0} | -| [NusaParagraphEmotionClassification](https://github.com/IndoNLP/nusa-writes) | ['bbc', 'bew', 'bug', 'jav', 'mad', 'mak', 'min', 'mui', 'rej', 'sun'] | Classification | s2s | [Non-fiction, Fiction, Written] | {'train': 15516, 'validation': 2948, 'test': 6250} | {'train': 740.24, 'validation': 740.66, 'test': 740.71} | -| [NusaParagraphTopicClassification](https://github.com/IndoNLP/nusa-writes) | ['bbc', 'bew', 'bug', 'jav', 'mad', 'mak', 'min', 'mui', 'rej', 'sun'] | Classification | s2s | [Non-fiction, Fiction, Written] 
| {'train': 15516, 'validation': 2948, 'test': 6250} | {'train': 740.24, 'validation': 740.66, 'test': 740.71} | -| [NusaTranslationBitextMining](https://huggingface.co/datasets/indonlp/nusatranslation_mt) (Cahyawijaya et al., 2023) | ['abs', 'bbc', 'bew', 'bhp', 'ind', 'jav', 'mad', 'mak', 'min', 'mui', 'rej', 'sun'] | BitextMining | s2s | [Social, Written] | {'train': 50200} | {'train': {'average_sentence1_length': 145.4552390438247, 'average_sentence2_length': 148.56607569721115, 'num_samples': 50200, 'hf_subset_descriptive_stats': {'ind-abs': {'average_sentence1_length': 148.366, 'average_sentence2_length': 147.314, 'num_samples': 1000}, 'ind-btk': {'average_sentence1_length': 145.36666666666667, 'average_sentence2_length': 146.74045454545455, 'num_samples': 6600}, 'ind-bew': {'average_sentence1_length': 145.4280303030303, 'average_sentence2_length': 148.40530303030303, 'num_samples': 6600}, 'ind-bhp': {'average_sentence1_length': 133.528, 'average_sentence2_length': 128.138, 'num_samples': 1000}, 'ind-jav': {'average_sentence1_length': 145.42772727272728, 'average_sentence2_length': 145.8089393939394, 'num_samples': 6600}, 'ind-mad': {'average_sentence1_length': 145.35545454545453, 'average_sentence2_length': 153.6228787878788, 'num_samples': 6600}, 'ind-mak': {'average_sentence1_length': 145.42772727272728, 'average_sentence2_length': 150.6128787878788, 'num_samples': 6600}, 'ind-min': {'average_sentence1_length': 145.42772727272728, 'average_sentence2_length': 148.0621212121212, 'num_samples': 6600}, 'ind-mui': {'average_sentence1_length': 150.454, 'average_sentence2_length': 150.994, 'num_samples': 1000}, 'ind-rej': {'average_sentence1_length': 151.622, 'average_sentence2_length': 139.583, 'num_samples': 1000}, 'ind-sun': {'average_sentence1_length': 145.42772727272728, 'average_sentence2_length': 150.9880303030303, 'num_samples': 6600}}}} | -| [NusaX-senti](https://arxiv.org/abs/2205.15960) (Winata et al., 2022) | ['ace', 'ban', 'bbc', 'bjn', 'bug', 'eng', 'ind', 'jav', 'mad', 'min', 'nij', 'sun'] | Classification | s2s | [Reviews, Web, Social, Constructed, Written] | {'test': 4800} | {'test': 52.4} | -| [NusaXBitextMining](https://huggingface.co/datasets/indonlp/NusaX-senti/) (Winata et al., 2023) | ['ace', 'ban', 'bbc', 'bjn', 'bug', 'eng', 'ind', 'jav', 'mad', 'min', 'nij', 'sun'] | BitextMining | s2s | [Reviews, Written] | {'train': 5500} | {'train': 157.15} | -| [OPP115DataRetentionLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 88} | {'test': 195.2} | -| [OPP115DataSecurityLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 1334} | {'test': 246.69} | -| [OPP115DoNotTrackLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 110} | {'test': 223.16} | -| [OPP115FirstPartyCollectionUseLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 2086} | {'test': 204.25} | -| [OPP115InternationalAndSpecificAudiencesLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 980} | {'test': 327.71} | -| 
[OPP115PolicyChangeLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 431} | {'test': 200.99} | -| [OPP115ThirdPartySharingCollectionLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 1590} | {'test': 223.64} | -| [OPP115UserAccessEditAndDeletionLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 462} | {'test': 218.59} | -| [OPP115UserChoiceControlLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 1546} | {'test': 210.62} | +| [MultilingualSentimentClassification](https://huggingface.co/datasets/mteb/multilingual-sentiment-classification) | ['ara', 'bam', 'bul', 'cmn', 'cym', 'deu', 'dza', 'ell', 'eng', 'eus', 'fas', 'fin', 'heb', 'hrv', 'ind', 'jpn', 'kor', 'mlt', 'nor', 'pol', 'rus', 'slk', 'spa', 'tha', 'tur', 'uig', 'urd', 'vie', 'zho'] | Classification | s2s | [Reviews, Written] | None | None | +| [MyanmarNews](https://huggingface.co/datasets/myanmar_news) (A. H. Khine, 2017) | ['mya'] | Classification | p2p | [News, Written] | None | None | +| [NFCorpus](https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/) (Boteva et al., 2016) | ['eng'] | Retrieval | s2p | | None | None | +| [NFCorpus-PL](https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | | None | None | +| [NLPJournalAbsIntroRetrieval](https://github.com/sbintuitions/JMTEB) | ['jpn'] | Retrieval | s2s | [Academic, Written] | None | None | +| [NLPJournalTitleAbsRetrieval](https://github.com/sbintuitions/JMTEB) | ['jpn'] | Retrieval | s2s | [Academic, Written] | None | None | +| [NLPJournalTitleIntroRetrieval](https://github.com/sbintuitions/JMTEB) | ['jpn'] | Retrieval | s2s | [Academic, Written] | None | None | +| [NQ](https://ai.google.com/research/NaturalQuestions/) (Tom Kwiatkowski, 2019) | ['eng'] | Retrieval | s2p | | None | None | +| [NQ-PL](https://ai.google.com/research/NaturalQuestions/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | | None | None | +| [NQ-PLHardNegatives](https://ai.google.com/research/NaturalQuestions/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | | None | None | +| [NQHardNegatives](https://ai.google.com/research/NaturalQuestions/) (Tom Kwiatkowski, 2019) | ['eng'] | Retrieval | s2p | | None | None | +| [NTREXBitextMining](https://huggingface.co/datasets/davidstap/NTREX) | ['afr', 'amh', 'arb', 'aze', 'bak', 'bel', 'bem', 'ben', 'bod', 'bos', 'bul', 'cat', 'ces', 'ckb', 'cym', 'dan', 'deu', 'div', 'dzo', 'ell', 'eng', 'eus', 'ewe', 'fao', 'fas', 'fij', 'fil', 'fin', 'fra', 'fuc', 'gle', 'glg', 'guj', 'hau', 'heb', 'hin', 'hmn', 'hrv', 'hun', 'hye', 'ibo', 'ind', 'isl', 'ita', 'jpn', 'kan', 'kat', 'kaz', 'khm', 'kin', 'kir', 'kmr', 'kor', 'lao', 'lav', 'lit', 'ltz', 'mal', 'mar', 'mey', 'mkd', 'mlg', 'mlt', 'mon', 'mri', 'msa', 'mya', 'nde', 'nep', 'nld', 'nno', 'nob', 'nso', 'nya', 'orm', 'pan', 'pol', 'por', 'prs', 'pus', 'ron', 'rus', 'shi', 'sin', 'slk', 'slv', 'smo', 'sna', 'snd', 'som', 'spa', 'sqi', 'srp', 'ssw', 'swa', 'swe', 'tah', 'tam', 'tat', 'tel', 'tgk', 'tha', 'tir', 'ton', 'tsn', 'tuk', 'tur', 'uig', 'ukr', 'urd', 'uzb', 'ven', 'vie', 'wol', 'xho', 'yor', 'yue', 'zho', 'zul'] | BitextMining | 
s2s | [News, Written] | None | None | +| [NYSJudicialEthicsLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [NaijaSenti](https://github.com/hausanlp/NaijaSenti) | ['hau', 'ibo', 'pcm', 'yor'] | Classification | s2s | [Social, Written] | None | None | +| [NarrativeQARetrieval](https://metatext.io/datasets/narrativeqa) (Tomáš Kočiský, 2017) | ['eng'] | Retrieval | s2p | | None | None | +| [NepaliNewsClassification](https://github.com/goru001/nlp-for-nepali) | ['nep'] | Classification | s2s | [News, Written] | None | None | +| [NeuCLIR2022Retrieval](https://neuclir.github.io/) (Lawrie et al., 2023) | ['fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | None | None | +| [NeuCLIR2022RetrievalHardNegatives](https://neuclir.github.io/) (Lawrie et al., 2023) | ['fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | None | None | +| [NeuCLIR2023Retrieval](https://neuclir.github.io/) (Dawn Lawrie, 2024) | ['fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | None | None | +| [NeuCLIR2023RetrievalHardNegatives](https://neuclir.github.io/) (Dawn Lawrie, 2024) | ['fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | None | None | +| [News21InstructionRetrieval](https://arxiv.org/abs/2403.15246) (Orion Weller, 2024) | ['eng'] | InstructionRetrieval | s2p | [News, Written] | None | None | +| [NewsClassification](https://arxiv.org/abs/1509.01626) (Zhang et al., 2015) | ['eng'] | Classification | s2s | [News, Written] | None | None | +| [NoRecClassification](https://aclanthology.org/L18-1661/) | ['nob'] | Classification | s2s | [Written, Reviews] | None | None | +| [NollySentiBitextMining](https://github.com/IyanuSh/NollySenti) (Shode et al., 2023) | ['eng', 'hau', 'ibo', 'pcm', 'yor'] | BitextMining | s2s | [Social, Reviews, Written] | None | None | +| [NorQuadRetrieval](https://aclanthology.org/2023.nodalida-1.17/) | ['nob'] | Retrieval | p2p | [Encyclopaedic, Non-fiction, Written] | None | None | +| [NordicLangClassification](https://aclanthology.org/2021.vardial-1.8/) | ['dan', 'fao', 'isl', 'nno', 'nob', 'swe'] | Classification | s2s | [Encyclopaedic] | None | None | +| [NorwegianCourtsBitextMining](https://opus.nlpl.eu/index.php) (Tiedemann et al., 2020) | ['nno', 'nob'] | BitextMining | s2s | [Legal, Written] | None | None | +| [NorwegianParliamentClassification](https://huggingface.co/datasets/NbAiLab/norwegian_parliament) | ['nob'] | Classification | s2s | [Government, Spoken] | None | None | +| [NusaParagraphEmotionClassification](https://github.com/IndoNLP/nusa-writes) | ['bbc', 'bew', 'bug', 'jav', 'mad', 'mak', 'min', 'mui', 'rej', 'sun'] | Classification | s2s | [Non-fiction, Fiction, Written] | None | None | +| [NusaParagraphTopicClassification](https://github.com/IndoNLP/nusa-writes) | ['bbc', 'bew', 'bug', 'jav', 'mad', 'mak', 'min', 'mui', 'rej', 'sun'] | Classification | s2s | [Non-fiction, Fiction, Written] | None | None | +| [NusaTranslationBitextMining](https://huggingface.co/datasets/indonlp/nusatranslation_mt) (Cahyawijaya et al., 2023) | ['abs', 'bbc', 'bew', 'bhp', 'ind', 'jav', 'mad', 'mak', 'min', 'mui', 'rej', 'sun'] | BitextMining | s2s | [Social, Written] | {'train': 50200} | {'train': {'average_sentence1_length': 145.46, 'average_sentence2_length': 148.57, 'num_samples': 50200, 'number_of_characters': 14759870, 'hf_subset_descriptive_stats': {'ind-abs': {'average_sentence1_length': 148.37, 'average_sentence2_length': 147.31, 'num_samples': 
1000, 'number_of_characters': 295680}, 'ind-btk': {'average_sentence1_length': 145.37, 'average_sentence2_length': 146.74, 'num_samples': 6600, 'number_of_characters': 1927907}, 'ind-bew': {'average_sentence1_length': 145.43, 'average_sentence2_length': 148.41, 'num_samples': 6600, 'number_of_characters': 1939300}, 'ind-bhp': {'average_sentence1_length': 133.53, 'average_sentence2_length': 128.14, 'num_samples': 1000, 'number_of_characters': 261666}, 'ind-jav': {'average_sentence1_length': 145.43, 'average_sentence2_length': 145.81, 'num_samples': 6600, 'number_of_characters': 1922162}, 'ind-mad': {'average_sentence1_length': 145.36, 'average_sentence2_length': 153.62, 'num_samples': 6600, 'number_of_characters': 1973257}, 'ind-mak': {'average_sentence1_length': 145.43, 'average_sentence2_length': 150.61, 'num_samples': 6600, 'number_of_characters': 1953868}, 'ind-min': {'average_sentence1_length': 145.43, 'average_sentence2_length': 148.06, 'num_samples': 6600, 'number_of_characters': 1937033}, 'ind-mui': {'average_sentence1_length': 150.45, 'average_sentence2_length': 150.99, 'num_samples': 1000, 'number_of_characters': 301448}, 'ind-rej': {'average_sentence1_length': 151.62, 'average_sentence2_length': 139.58, 'num_samples': 1000, 'number_of_characters': 291205}, 'ind-sun': {'average_sentence1_length': 145.43, 'average_sentence2_length': 150.99, 'num_samples': 6600, 'number_of_characters': 1956344}}}} | +| [NusaX-senti](https://arxiv.org/abs/2205.15960) (Winata et al., 2022) | ['ace', 'ban', 'bbc', 'bjn', 'bug', 'eng', 'ind', 'jav', 'mad', 'min', 'nij', 'sun'] | Classification | s2s | [Reviews, Web, Social, Constructed, Written] | None | None | +| [NusaXBitextMining](https://huggingface.co/datasets/indonlp/NusaX-senti/) (Winata et al., 2023) | ['ace', 'ban', 'bbc', 'bjn', 'bug', 'eng', 'ind', 'jav', 'mad', 'min', 'nij', 'sun'] | BitextMining | s2s | [Reviews, Written] | None | None | +| [OPP115DataRetentionLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [OPP115DataSecurityLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [OPP115DoNotTrackLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [OPP115FirstPartyCollectionUseLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [OPP115InternationalAndSpecificAudiencesLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [OPP115PolicyChangeLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [OPP115ThirdPartySharingCollectionLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [OPP115UserAccessEditAndDeletionLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [OPP115UserChoiceControlLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) 
(Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [Ocnli](https://arxiv.org/abs/2010.05444) (Hai Hu, 2020) | ['cmn'] | PairClassification | s2s | | None | None | | [OdiaNewsClassification](https://github.com/goru001/nlp-for-odia) (Anoop Kunchukuttan, 2020) | ['ory'] | Classification | s2s | [News, Written] | None | None | | [OnlineShopping](https://aclanthology.org/2023.nodalida-1.20/) (Xiao et al., 2023) | ['cmn'] | Classification | s2s | | None | None | @@ -412,50 +412,50 @@ The following tables give you an overview of the tasks in MTEB. | [PublicHealthQA](https://huggingface.co/datasets/xhluca/publichealth-qa) | ['ara', 'eng', 'fra', 'kor', 'rus', 'spa', 'vie', 'zho'] | Retrieval | s2p | [Medical, Government, Web, Written] | None | None | | [PunjabiNewsClassification](https://github.com/goru001/nlp-for-punjabi/) (Anoop Kunchukuttan, 2020) | ['pan'] | Classification | s2s | [News, Written] | None | None | | [QBQTC](https://github.com/CLUEbenchmark/QBQTC/tree/main/dataset) | ['cmn'] | STS | s2s | | None | None | -| [Quail](https://text-machine.cs.uml.edu/lab2/projects/quail/) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | {'test': 2720} | {'test': {'average_document_length': 27.50788422240522, 'average_query_length': 1957.3632352941177, 'num_documents': 32787, 'num_queries': 2720, 'average_relevant_docs_per_query': 1.0}} | -| [Quora-PL](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2s | | None | {'validation': {'average_document_length': 65.82473022253414, 'average_query_length': 54.6006, 'num_documents': 522931, 'num_queries': 5000, 'average_relevant_docs_per_query': 1.5252}, 'test': {'average_document_length': 65.82473022253414, 'average_query_length': 54.5354, 'num_documents': 522931, 'num_queries': 10000, 'average_relevant_docs_per_query': 1.5675}} | -| [Quora-PLHardNegatives](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2s | | {'test': 1000} | {'test': {'average_document_length': 67.77529631287385, 'average_query_length': 53.846, 'num_documents': 172031, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.641}} | -| [QuoraRetrieval](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (DataCanary et al., 2017) | ['eng'] | Retrieval | s2s | | None | {'dev': {'average_document_length': 62.158154708747425, 'average_query_length': 51.5342, 'num_documents': 522931, 'num_queries': 5000, 'average_relevant_docs_per_query': 1.5252}, 'test': {'average_document_length': 62.158154708747425, 'average_query_length': 51.5396, 'num_documents': 522931, 'num_queries': 10000, 'average_relevant_docs_per_query': 1.5675}} | -| [QuoraRetrievalHardNegatives](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (DataCanary et al., 2017) | ['eng'] | Retrieval | s2s | | {'test': 1000} | {'test': {'average_document_length': 58.96963812985781, 'average_query_length': 51.228, 'num_documents': 177163, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.641}} | -| [RARbCode](https://arxiv.org/abs/2404.06347) (Xiao et al., 2024) | ['eng'] | Retrieval | s2p | [Programming, Written] | {'test': 1484} | {'test': {'average_document_length': 793.6813076734267, 'average_query_length': 375.7506738544474, 'num_documents': 301482, 'num_queries': 1484, 'average_relevant_docs_per_query': 1.0}} | -| [RARbMath](https://arxiv.org/abs/2404.06347) (Xiao et al., 2024) | 
['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | {'test': 6319} | {'test': {'average_document_length': 504.0197829347469, 'average_query_length': 210.30732710871973, 'num_documents': 389376, 'num_queries': 6319, 'average_relevant_docs_per_query': 1.0}} | -| [RTE3](https://aclanthology.org/W07-1401/) | ['deu', 'eng', 'fra', 'ita'] | PairClassification | s2s | [News, Web, Encyclopaedic, Written] | {'test': 1923} | {'test': 124.79} | -| [RUParaPhraserSTS](https://aclanthology.org/2020.ngt-1.6) (Pivovarova et al., 2017) | ['rus'] | STS | s2s | [News, Written] | {'test': 1924} | {'test': 61.25} | -| [RedditClustering.v2](https://arxiv.org/abs/2104.07081) (Gregor Geigle, 2021) | ['eng'] | Clustering | s2s | [Web, Social, Written] | {'test': 32768} | {'test': 64.7} | -| [RedditClusteringP2P.v2](https://arxiv.org/abs/2104.07081) (Gregor Geigle, 2021) | ['eng'] | Clustering | p2p | [Web, Social, Written] | {'test': 18375} | {'test': 727.7} | -| [RestaurantReviewSentimentClassification](https://link.springer.com/chapter/10.1007/978-3-319-18117-2_2) (ElSahar et al., 2015) | ['ara'] | Classification | s2s | [Reviews, Written] | {'train': 2048} | {'train': 231.4} | -| [RiaNewsRetrieval](https://arxiv.org/abs/1901.07786) (Gavrilov et al., 2019) | ['rus'] | Retrieval | s2p | [News, Written] | {'test': 10000} | {'test': {'average_document_length': 1165.6429557148213, 'average_query_length': 62.4029, 'num_documents': 704344, 'num_queries': 10000, 'average_relevant_docs_per_query': 1.0}} | -| [RiaNewsRetrievalHardNegatives](https://arxiv.org/abs/1901.07786) (Gavrilov et al., 2019) | ['rus'] | Retrieval | s2p | [News, Written] | {'test': 1000} | {'test': {'average_document_length': 1225.7253146619116, 'average_query_length': 62.338, 'num_documents': 191237, 'num_queries': 1000, 'average_relevant_docs_per_query': 1.0}} | -| [Robust04InstructionRetrieval](https://arxiv.org/abs/2403.15246) (Orion Weller, 2024) | ['eng'] | InstructionRetrieval | s2p | [News, Written] | {'eng': 95088} | {'eng': 2471.0398058252426} | -| [RomaTalesBitextMining](https://idoc.pub/documents/idocpub-zpnxm9g35ylv) | ['hun', 'rom'] | BitextMining | s2s | [Fiction, Written] | {'test': 215} | {'test': 316.8046511627907} | -| [RomaniBibleClustering](https://romani.global.bible/info) | ['rom'] | Clustering | p2p | [Religious, Written] | {'test': 2048} | {'test': 132.2} | -| [RomanianReviewsSentiment](https://arxiv.org/abs/2101.04197) (Anca Maria Tache, 2021) | ['ron'] | Classification | s2s | [Reviews, Written] | {'test': 2048} | {'test': 588.6} | -| [RomanianSentimentClassification](https://arxiv.org/abs/2009.08712) (Dumitrescu et al., 2020) | ['ron'] | Classification | s2s | [Reviews, Written] | {'test': 2048} | {'test': 67.6} | -| [RonSTS](https://openreview.net/forum?id=JH61CD7afTv) (Dumitrescu et al., 2021) | ['ron'] | STS | s2s | [News, Social, Web, Written] | {'test': 1379} | {'test': 60.5} | -| [RuBQReranking](https://openreview.net/pdf?id=P5UQFFoQ4PJ) (Ivan Rybin, 2021) | ['rus'] | Reranking | s2p | [Encyclopaedic, Written] | {'test': 1551} | {'test': 499.9} | -| [RuBQRetrieval](https://openreview.net/pdf?id=P5UQFFoQ4PJ) (Ivan Rybin, 2021) | ['rus'] | Retrieval | s2p | [Encyclopaedic, Written] | {'test': 2845} | {'test': {'average_document_length': 448.94659134903037, 'average_query_length': 45.29609929078014, 'num_documents': 56826, 'num_queries': 1692, 'average_relevant_docs_per_query': 1.6814420803782506}} | -| [RuReviewsClassification](https://github.com/sismetanin/rureviews) (Sergey Smetanin, 2019) | ['rus'] | 
Classification | p2p | [Reviews, Written] | {'test': 2048} | {'test': 133.2} | -| [RuSTSBenchmarkSTS](https://github.com/PhilipMay/stsb-multi-mt/) (Philip May, 2021) | ['rus'] | STS | s2s | [News, Social, Web, Written] | {'test': 1264} | {'test': 54.2} | -| [RuSciBenchGRNTIClassification](https://github.com/mlsa-iai-msu-lab/ru_sci_bench/) | ['rus'] | Classification | p2p | [Academic, Written] | {'test': 2048} | {'test': 890.1} | -| [RuSciBenchGRNTIClusteringP2P](https://github.com/mlsa-iai-msu-lab/ru_sci_bench/) | ['rus'] | Clustering | p2p | [Academic, Written] | {'test': 2048} | {'test': {'num_samples': 2048, 'average_text_length': 889.81396484375, 'average_labels_per_text': 1.0, 'unique_labels': 28, 'labels': {'3': {'count': 73}, '4': {'count': 73}, '20': {'count': 73}, '9': {'count': 73}, '21': {'count': 73}, '15': {'count': 73}, '16': {'count': 74}, '2': {'count': 73}, '8': {'count': 73}, '23': {'count': 73}, '6': {'count': 73}, '24': {'count': 73}, '10': {'count': 73}, '1': {'count': 73}, '17': {'count': 74}, '14': {'count': 74}, '18': {'count': 73}, '27': {'count': 73}, '19': {'count': 73}, '22': {'count': 73}, '12': {'count': 73}, '25': {'count': 73}, '5': {'count': 74}, '0': {'count': 73}, '26': {'count': 73}, '11': {'count': 73}, '13': {'count': 73}, '7': {'count': 73}}}} | -| [RuSciBenchOECDClassification](https://github.com/mlsa-iai-msu-lab/ru_sci_bench/) | ['rus'] | Classification | p2p | [Academic, Written] | {'test': 2048} | {'test': 838.9} | -| [RuSciBenchOECDClusteringP2P](https://github.com/mlsa-iai-msu-lab/ru_sci_bench/) | ['rus'] | Clustering | p2p | [Academic, Written] | {'test': 2048} | {'test': 838.9} | -| [SCDBPAccountabilityLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 379} | {'test': 3520} | -| [SCDBPAuditsLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 379} | {'test': 3507} | -| [SCDBPCertificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 378} | {'test': 3507} | -| [SCDBPTrainingLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 379} | {'test': 3506} | -| [SCDBPVerificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 379} | {'test': 3498} | -| [SCDDAccountabilityLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 378} | {'test': 3522} | -| [SCDDAuditsLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 379} | {'test': 3506} | -| [SCDDCertificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 378} | {'test': 3518} | -| [SCDDTrainingLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 379} | {'test': 3499} | -| [SCDDVerificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) 
(Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | {'test': 379} | {'test': 3503} | -| [SCIDOCS](https://allenai.org/data/scidocs) (Arman Cohan, 2020) | ['eng'] | Retrieval | s2p | [Academic, Written, Non-fiction] | None | {'test': {'average_document_length': 1203.3659819932182, 'average_query_length': 71.632, 'num_documents': 25657, 'num_queries': 1000, 'average_relevant_docs_per_query': 4.928}} | -| [SCIDOCS-PL](https://allenai.org/data/scidocs) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | | None | {'test': {'average_document_length': 1270.0791986592353, 'average_query_length': 80.671, 'num_documents': 25657, 'num_queries': 1000, 'average_relevant_docs_per_query': 4.928}} | -| [SIB200Classification](https://arxiv.org/abs/2309.07445) (Adelani et al., 2023) | ['ace', 'acm', 'acq', 'aeb', 'afr', 'ajp', 'aka', 'als', 'amh', 'apc', 'arb', 'ars', 'ary', 'arz', 'asm', 'ast', 'awa', 'ayr', 'azb', 'azj', 'bak', 'bam', 'ban', 'bel', 'bem', 'ben', 'bho', 'bjn', 'bod', 'bos', 'bug', 'bul', 'cat', 'ceb', 'ces', 'cjk', 'ckb', 'crh', 'cym', 'dan', 'deu', 'dik', 'dyu', 'dzo', 'ell', 'eng', 'epo', 'est', 'eus', 'ewe', 'fao', 'fij', 'fin', 'fon', 'fra', 'fur', 'fuv', 'gaz', 'gla', 'gle', 'glg', 'grn', 'guj', 'hat', 'hau', 'heb', 'hin', 'hne', 'hrv', 'hun', 'hye', 'ibo', 'ilo', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kab', 'kac', 'kam', 'kan', 'kas', 'kat', 'kaz', 'kbp', 'kea', 'khk', 'khm', 'kik', 'kin', 'kir', 'kmb', 'kmr', 'knc', 'kon', 'kor', 'lao', 'lij', 'lim', 'lin', 'lit', 'lmo', 'ltg', 'ltz', 'lua', 'lug', 'luo', 'lus', 'lvs', 'mag', 'mai', 'mal', 'mar', 'min', 'mkd', 'mlt', 'mni', 'mos', 'mri', 'mya', 'nld', 'nno', 'nob', 'npi', 'nqo', 'nso', 'nus', 'nya', 'oci', 'ory', 'pag', 'pan', 'pap', 'pbt', 'pes', 'plt', 'pol', 'por', 'prs', 'quy', 'ron', 'run', 'rus', 'sag', 'san', 'sat', 'scn', 'shn', 'sin', 'slk', 'slv', 'smo', 'sna', 'snd', 'som', 'sot', 'spa', 'srd', 'srp', 'ssw', 'sun', 'swe', 'swh', 'szl', 'tam', 'taq', 'tat', 'tel', 'tgk', 'tgl', 'tha', 'tir', 'tpi', 'tsn', 'tso', 'tuk', 'tum', 'tur', 'twi', 'tzm', 'uig', 'ukr', 'umb', 'urd', 'uzn', 'vec', 'vie', 'war', 'wol', 'xho', 'ydd', 'yor', 'yue', 'zho', 'zsm', 'zul'] | Classification | s2s | [News, Written] | {'train': 701, 'validation': 99, 'test': 204} | {'train': 111.24, 'validation': 97.11, 'test': 135.53} | -| [SIB200ClusteringS2S](https://arxiv.org/abs/2309.07445) (Adelani et al., 2023) | ['ace', 'acm', 'acq', 'aeb', 'afr', 'ajp', 'aka', 'als', 'amh', 'apc', 'arb', 'ars', 'ary', 'arz', 'asm', 'ast', 'awa', 'ayr', 'azb', 'azj', 'bak', 'bam', 'ban', 'bel', 'bem', 'ben', 'bho', 'bjn', 'bod', 'bos', 'bug', 'bul', 'cat', 'ceb', 'ces', 'cjk', 'ckb', 'crh', 'cym', 'dan', 'deu', 'dik', 'dyu', 'dzo', 'ell', 'eng', 'epo', 'est', 'eus', 'ewe', 'fao', 'fij', 'fin', 'fon', 'fra', 'fur', 'fuv', 'gaz', 'gla', 'gle', 'glg', 'grn', 'guj', 'hat', 'hau', 'heb', 'hin', 'hne', 'hrv', 'hun', 'hye', 'ibo', 'ilo', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kab', 'kac', 'kam', 'kan', 'kas', 'kat', 'kaz', 'kbp', 'kea', 'khk', 'khm', 'kik', 'kin', 'kir', 'kmb', 'kmr', 'knc', 'kon', 'kor', 'lao', 'lij', 'lim', 'lin', 'lit', 'lmo', 'ltg', 'ltz', 'lua', 'lug', 'luo', 'lus', 'lvs', 'mag', 'mai', 'mal', 'mar', 'min', 'mkd', 'mlt', 'mni', 'mos', 'mri', 'mya', 'nld', 'nno', 'nob', 'npi', 'nqo', 'nso', 'nus', 'nya', 'oci', 'ory', 'pag', 'pan', 'pap', 'pbt', 'pes', 'plt', 'pol', 'por', 'prs', 'quy', 'ron', 'run', 'rus', 'sag', 'san', 'sat', 'scn', 'shn', 'sin', 'slk', 'slv', 'smo', 'sna', 'snd', 'som', 'sot', 'spa', 'srd', 'srp', 'ssw', 'sun', 'swe', 
'swh', 'szl', 'tam', 'taq', 'tat', 'tel', 'tgk', 'tgl', 'tha', 'tir', 'tpi', 'tsn', 'tso', 'tuk', 'tum', 'tur', 'twi', 'tzm', 'uig', 'ukr', 'umb', 'urd', 'uzn', 'vec', 'vie', 'war', 'wol', 'xho', 'ydd', 'yor', 'yue', 'zho', 'zsm', 'zul'] | Clustering | s2s | [News, Written] | {'test': 1004} | {'test': 114.78} | -| [SICK-BR-PC](https://linux.ime.usp.br/~thalen/SICK_PT.pdf) | ['por'] | PairClassification | s2s | [Web, Written] | {'test': 1000} | {'test': 54.89} | -| [SICK-BR-STS](https://linux.ime.usp.br/~thalen/SICK_PT.pdf) | ['por'] | STS | s2s | [Web, Written] | {'test': 1000} | {'test': 54.89} | +| [Quail](https://text-machine.cs.uml.edu/lab2/projects/quail/) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | +| [Quora-PL](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2s | | None | None | +| [Quora-PLHardNegatives](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2s | | None | None | +| [QuoraRetrieval](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (DataCanary et al., 2017) | ['eng'] | Retrieval | s2s | | None | None | +| [QuoraRetrievalHardNegatives](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (DataCanary et al., 2017) | ['eng'] | Retrieval | s2s | | None | None | +| [RARbCode](https://arxiv.org/abs/2404.06347) (Xiao et al., 2024) | ['eng'] | Retrieval | s2p | [Programming, Written] | None | None | +| [RARbMath](https://arxiv.org/abs/2404.06347) (Xiao et al., 2024) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | +| [RTE3](https://aclanthology.org/W07-1401/) | ['deu', 'eng', 'fra', 'ita'] | PairClassification | s2s | [News, Web, Encyclopaedic, Written] | None | None | +| [RUParaPhraserSTS](https://aclanthology.org/2020.ngt-1.6) (Pivovarova et al., 2017) | ['rus'] | STS | s2s | [News, Written] | None | None | +| [RedditClustering.v2](https://arxiv.org/abs/2104.07081) (Gregor Geigle, 2021) | ['eng'] | Clustering | s2s | [Web, Social, Written] | None | None | +| [RedditClusteringP2P.v2](https://arxiv.org/abs/2104.07081) (Gregor Geigle, 2021) | ['eng'] | Clustering | p2p | [Web, Social, Written] | None | None | +| [RestaurantReviewSentimentClassification](https://link.springer.com/chapter/10.1007/978-3-319-18117-2_2) (ElSahar et al., 2015) | ['ara'] | Classification | s2s | [Reviews, Written] | None | None | +| [RiaNewsRetrieval](https://arxiv.org/abs/1901.07786) (Gavrilov et al., 2019) | ['rus'] | Retrieval | s2p | [News, Written] | None | None | +| [RiaNewsRetrievalHardNegatives](https://arxiv.org/abs/1901.07786) (Gavrilov et al., 2019) | ['rus'] | Retrieval | s2p | [News, Written] | None | None | +| [Robust04InstructionRetrieval](https://arxiv.org/abs/2403.15246) (Orion Weller, 2024) | ['eng'] | InstructionRetrieval | s2p | [News, Written] | None | None | +| [RomaTalesBitextMining](https://idoc.pub/documents/idocpub-zpnxm9g35ylv) | ['hun', 'rom'] | BitextMining | s2s | [Fiction, Written] | None | None | +| [RomaniBibleClustering](https://romani.global.bible/info) | ['rom'] | Clustering | p2p | [Religious, Written] | None | None | +| [RomanianReviewsSentiment](https://arxiv.org/abs/2101.04197) (Anca Maria Tache, 2021) | ['ron'] | Classification | s2s | [Reviews, Written] | None | None | +| [RomanianSentimentClassification](https://arxiv.org/abs/2009.08712) (Dumitrescu et al., 2020) | ['ron'] | Classification | s2s | [Reviews, 
Written] | None | None | +| [RonSTS](https://openreview.net/forum?id=JH61CD7afTv) (Dumitrescu et al., 2021) | ['ron'] | STS | s2s | [News, Social, Web, Written] | None | None | +| [RuBQReranking](https://openreview.net/pdf?id=P5UQFFoQ4PJ) (Ivan Rybin, 2021) | ['rus'] | Reranking | s2p | [Encyclopaedic, Written] | None | None | +| [RuBQRetrieval](https://openreview.net/pdf?id=P5UQFFoQ4PJ) (Ivan Rybin, 2021) | ['rus'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | +| [RuReviewsClassification](https://github.com/sismetanin/rureviews) (Sergey Smetanin, 2019) | ['rus'] | Classification | p2p | [Reviews, Written] | None | None | +| [RuSTSBenchmarkSTS](https://github.com/PhilipMay/stsb-multi-mt/) (Philip May, 2021) | ['rus'] | STS | s2s | [News, Social, Web, Written] | None | None | +| [RuSciBenchGRNTIClassification](https://github.com/mlsa-iai-msu-lab/ru_sci_bench/) | ['rus'] | Classification | p2p | [Academic, Written] | None | None | +| [RuSciBenchGRNTIClusteringP2P](https://github.com/mlsa-iai-msu-lab/ru_sci_bench/) | ['rus'] | Clustering | p2p | [Academic, Written] | {'test': 2048} | {'test': {'num_samples': 2048, 'number_of_characters': 1822339, 'average_text_length': 889.81, 'average_labels_per_text': 1.0, 'unique_labels': 28, 'labels': {'3': {'count': 73}, '4': {'count': 73}, '20': {'count': 73}, '9': {'count': 73}, '21': {'count': 73}, '15': {'count': 73}, '16': {'count': 74}, '2': {'count': 73}, '8': {'count': 73}, '23': {'count': 73}, '6': {'count': 73}, '24': {'count': 73}, '10': {'count': 73}, '1': {'count': 73}, '17': {'count': 74}, '14': {'count': 74}, '18': {'count': 73}, '27': {'count': 73}, '19': {'count': 73}, '22': {'count': 73}, '12': {'count': 73}, '25': {'count': 73}, '5': {'count': 74}, '0': {'count': 73}, '26': {'count': 73}, '11': {'count': 73}, '13': {'count': 73}, '7': {'count': 73}}}} | +| [RuSciBenchOECDClassification](https://github.com/mlsa-iai-msu-lab/ru_sci_bench/) | ['rus'] | Classification | p2p | [Academic, Written] | None | None | +| [RuSciBenchOECDClusteringP2P](https://github.com/mlsa-iai-msu-lab/ru_sci_bench/) | ['rus'] | Clustering | p2p | [Academic, Written] | None | None | +| [SCDBPAccountabilityLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [SCDBPAuditsLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [SCDBPCertificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [SCDBPTrainingLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [SCDBPVerificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [SCDDAccountabilityLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [SCDDAuditsLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [SCDDCertificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] 
| Classification | s2s | [Legal, Written] | None | None | +| [SCDDTrainingLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [SCDDVerificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | +| [SCIDOCS](https://allenai.org/data/scidocs) (Arman Cohan, 2020) | ['eng'] | Retrieval | s2p | [Academic, Written, Non-fiction] | None | None | +| [SCIDOCS-PL](https://allenai.org/data/scidocs) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | | None | None | +| [SIB200Classification](https://arxiv.org/abs/2309.07445) (Adelani et al., 2023) | ['ace', 'acm', 'acq', 'aeb', 'afr', 'ajp', 'aka', 'als', 'amh', 'apc', 'arb', 'ars', 'ary', 'arz', 'asm', 'ast', 'awa', 'ayr', 'azb', 'azj', 'bak', 'bam', 'ban', 'bel', 'bem', 'ben', 'bho', 'bjn', 'bod', 'bos', 'bug', 'bul', 'cat', 'ceb', 'ces', 'cjk', 'ckb', 'crh', 'cym', 'dan', 'deu', 'dik', 'dyu', 'dzo', 'ell', 'eng', 'epo', 'est', 'eus', 'ewe', 'fao', 'fij', 'fin', 'fon', 'fra', 'fur', 'fuv', 'gaz', 'gla', 'gle', 'glg', 'grn', 'guj', 'hat', 'hau', 'heb', 'hin', 'hne', 'hrv', 'hun', 'hye', 'ibo', 'ilo', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kab', 'kac', 'kam', 'kan', 'kas', 'kat', 'kaz', 'kbp', 'kea', 'khk', 'khm', 'kik', 'kin', 'kir', 'kmb', 'kmr', 'knc', 'kon', 'kor', 'lao', 'lij', 'lim', 'lin', 'lit', 'lmo', 'ltg', 'ltz', 'lua', 'lug', 'luo', 'lus', 'lvs', 'mag', 'mai', 'mal', 'mar', 'min', 'mkd', 'mlt', 'mni', 'mos', 'mri', 'mya', 'nld', 'nno', 'nob', 'npi', 'nqo', 'nso', 'nus', 'nya', 'oci', 'ory', 'pag', 'pan', 'pap', 'pbt', 'pes', 'plt', 'pol', 'por', 'prs', 'quy', 'ron', 'run', 'rus', 'sag', 'san', 'sat', 'scn', 'shn', 'sin', 'slk', 'slv', 'smo', 'sna', 'snd', 'som', 'sot', 'spa', 'srd', 'srp', 'ssw', 'sun', 'swe', 'swh', 'szl', 'tam', 'taq', 'tat', 'tel', 'tgk', 'tgl', 'tha', 'tir', 'tpi', 'tsn', 'tso', 'tuk', 'tum', 'tur', 'twi', 'tzm', 'uig', 'ukr', 'umb', 'urd', 'uzn', 'vec', 'vie', 'war', 'wol', 'xho', 'ydd', 'yor', 'yue', 'zho', 'zsm', 'zul'] | Classification | s2s | [News, Written] | None | None | +| [SIB200ClusteringS2S](https://arxiv.org/abs/2309.07445) (Adelani et al., 2023) | ['ace', 'acm', 'acq', 'aeb', 'afr', 'ajp', 'aka', 'als', 'amh', 'apc', 'arb', 'ars', 'ary', 'arz', 'asm', 'ast', 'awa', 'ayr', 'azb', 'azj', 'bak', 'bam', 'ban', 'bel', 'bem', 'ben', 'bho', 'bjn', 'bod', 'bos', 'bug', 'bul', 'cat', 'ceb', 'ces', 'cjk', 'ckb', 'crh', 'cym', 'dan', 'deu', 'dik', 'dyu', 'dzo', 'ell', 'eng', 'epo', 'est', 'eus', 'ewe', 'fao', 'fij', 'fin', 'fon', 'fra', 'fur', 'fuv', 'gaz', 'gla', 'gle', 'glg', 'grn', 'guj', 'hat', 'hau', 'heb', 'hin', 'hne', 'hrv', 'hun', 'hye', 'ibo', 'ilo', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kab', 'kac', 'kam', 'kan', 'kas', 'kat', 'kaz', 'kbp', 'kea', 'khk', 'khm', 'kik', 'kin', 'kir', 'kmb', 'kmr', 'knc', 'kon', 'kor', 'lao', 'lij', 'lim', 'lin', 'lit', 'lmo', 'ltg', 'ltz', 'lua', 'lug', 'luo', 'lus', 'lvs', 'mag', 'mai', 'mal', 'mar', 'min', 'mkd', 'mlt', 'mni', 'mos', 'mri', 'mya', 'nld', 'nno', 'nob', 'npi', 'nqo', 'nso', 'nus', 'nya', 'oci', 'ory', 'pag', 'pan', 'pap', 'pbt', 'pes', 'plt', 'pol', 'por', 'prs', 'quy', 'ron', 'run', 'rus', 'sag', 'san', 'sat', 'scn', 'shn', 'sin', 'slk', 'slv', 'smo', 'sna', 'snd', 'som', 'sot', 'spa', 'srd', 'srp', 'ssw', 'sun', 'swe', 'swh', 'szl', 'tam', 'taq', 'tat', 'tel', 'tgk', 'tgl', 'tha', 'tir', 'tpi', 'tsn', 'tso', 'tuk', 'tum', 'tur', 'twi', 
'tzm', 'uig', 'ukr', 'umb', 'urd', 'uzn', 'vec', 'vie', 'war', 'wol', 'xho', 'ydd', 'yor', 'yue', 'zho', 'zsm', 'zul'] | Clustering | s2s | [News, Written] | None | None | +| [SICK-BR-PC](https://linux.ime.usp.br/~thalen/SICK_PT.pdf) | ['por'] | PairClassification | s2s | [Web, Written] | None | None | +| [SICK-BR-STS](https://linux.ime.usp.br/~thalen/SICK_PT.pdf) | ['por'] | STS | s2s | [Web, Written] | None | None | | [SICK-E-PL](https://aclanthology.org/2020.lrec-1.207) | ['pol'] | PairClassification | s2s | | None | None | | [SICK-R](https://aclanthology.org/2020.lrec-1.207) | ['eng'] | STS | s2s | | None | None | | [SICK-R-PL](https://aclanthology.org/2020.lrec-1.207) | ['pol'] | STS | s2s | [Web, Written] | None | None | @@ -578,23 +578,23 @@ The following tables give you an overview of the tasks in MTEB. | [Waimai](https://aclanthology.org/2023.nodalida-1.20/) (Xiao et al., 2023) | ['cmn'] | Classification | s2s | | None | None | | [WebLINXCandidatesReranking](https://mcgill-nlp.github.io/weblinx) (Xing Han Lù, 2024) | ['eng'] | Reranking | p2p | [Academic, Web, Written] | None | None | | [WikiCitiesClustering](https://huggingface.co/datasets/wikipedia) | ['eng'] | Clustering | p2p | [Encyclopaedic, Written] | None | None | -| [WikiClusteringP2P.v2](https://github.com/Rysias/wiki-clustering) | ['bos', 'cat', 'ces', 'dan', 'eus', 'glv', 'ilo', 'kur', 'lav', 'min', 'mlt', 'sco', 'sqi', 'wln'] | Clustering | p2p | [Encyclopaedic, Written] | {'test': 2048} | {'test': {'num_samples': 28672, 'average_text_length': 629.7426409040179, 'average_labels_per_text': 1.0, 'unique_labels': 39, 'labels': {'16': {'count': 541}, '3': {'count': 1607}, '12': {'count': 846}, '0': {'count': 2410}, '15': {'count': 878}, '11': {'count': 864}, '6': {'count': 787}, '9': {'count': 654}, '14': {'count': 966}, '8': {'count': 1389}, '2': {'count': 2428}, '10': {'count': 839}, '1': {'count': 1370}, '4': {'count': 2942}, '7': {'count': 2514}, '5': {'count': 1490}, '13': {'count': 918}, '19': {'count': 315}, '17': {'count': 711}, '20': {'count': 345}, '18': {'count': 800}, '24': {'count': 467}, '25': {'count': 928}, '21': {'count': 62}, '26': {'count': 270}, '22': {'count': 186}, '23': {'count': 36}, '27': {'count': 465}, '28': {'count': 62}, '36': {'count': 139}, '32': {'count': 57}, '38': {'count': 43}, '30': {'count': 52}, '34': {'count': 80}, '33': {'count': 75}, '35': {'count': 62}, '31': {'count': 63}, '37': {'count': 8}, '29': {'count': 3}}, 'hf_subset_descriptive_stats': {'bs': {'num_samples': 2048, 'average_text_length': 1046.25732421875, 'average_labels_per_text': 1.0, 'unique_labels': 17, 'labels': {'16': {'count': 268}, '3': {'count': 89}, '12': {'count': 597}, '0': {'count': 202}, '15': {'count': 113}, '11': {'count': 11}, '6': {'count': 142}, '9': {'count': 181}, '14': {'count': 179}, '8': {'count': 33}, '2': {'count': 172}, '10': {'count': 12}, '1': {'count': 7}, '4': {'count': 25}, '7': {'count': 6}, '5': {'count': 9}, '13': {'count': 2}}}, 'ca': {'num_samples': 2048, 'average_text_length': 600.73291015625, 'average_labels_per_text': 1.0, 'unique_labels': 8, 'labels': {'6': {'count': 257}, '1': {'count': 737}, '2': {'count': 284}, '4': {'count': 394}, '0': {'count': 162}, '7': {'count': 151}, '5': {'count': 55}, '3': {'count': 8}}}, 'cs': {'num_samples': 2048, 'average_text_length': 659.2294921875, 'average_labels_per_text': 1.0, 'unique_labels': 21, 'labels': {'19': {'count': 35}, '5': {'count': 624}, '17': {'count': 126}, '10': {'count': 155}, '1': {'count': 231}, '7': {'count': 215}, '11': 
{'count': 128}, '0': {'count': 57}, '13': {'count': 75}, '2': {'count': 83}, '3': {'count': 38}, '9': {'count': 8}, '6': {'count': 14}, '12': {'count': 9}, '16': {'count': 16}, '20': {'count': 73}, '18': {'count': 38}, '4': {'count': 60}, '15': {'count': 14}, '14': {'count': 38}, '8': {'count': 11}}}, 'da': {'num_samples': 2048, 'average_text_length': 767.58935546875, 'average_labels_per_text': 1.0, 'unique_labels': 20, 'labels': {'14': {'count': 212}, '4': {'count': 74}, '15': {'count': 16}, '8': {'count': 165}, '13': {'count': 115}, '0': {'count': 79}, '1': {'count': 34}, '9': {'count': 114}, '7': {'count': 364}, '10': {'count': 32}, '17': {'count': 66}, '18': {'count': 32}, '12': {'count': 129}, '11': {'count': 159}, '2': {'count': 66}, '3': {'count': 185}, '19': {'count': 103}, '16': {'count': 33}, '5': {'count': 56}, '6': {'count': 14}}}, 'eu': {'num_samples': 2048, 'average_text_length': 405.16015625, 'average_labels_per_text': 1.0, 'unique_labels': 5, 'labels': {'4': {'count': 383}, '0': {'count': 995}, '3': {'count': 282}, '2': {'count': 344}, '1': {'count': 44}}}, 'gv': {'num_samples': 2048, 'average_text_length': 368.01123046875, 'average_labels_per_text': 1.0, 'unique_labels': 28, 'labels': {'6': {'count': 32}, '1': {'count': 83}, '24': {'count': 13}, '17': {'count': 152}, '2': {'count': 534}, '25': {'count': 76}, '5': {'count': 198}, '15': {'count': 100}, '21': {'count': 22}, '26': {'count': 188}, '13': {'count': 230}, '20': {'count': 11}, '3': {'count': 107}, '19': {'count': 88}, '16': {'count': 55}, '22': {'count': 29}, '14': {'count': 12}, '8': {'count': 61}, '0': {'count': 5}, '10': {'count': 4}, '4': {'count': 9}, '23': {'count': 6}, '7': {'count': 3}, '9': {'count': 20}, '18': {'count': 4}, '12': {'count': 3}, '27': {'count': 1}, '11': {'count': 2}}}, 'ilo': {'num_samples': 2048, 'average_text_length': 617.90771484375, 'average_labels_per_text': 1.0, 'unique_labels': 29, 'labels': {'3': {'count': 562}, '0': {'count': 373}, '18': {'count': 521}, '8': {'count': 129}, '13': {'count': 123}, '11': {'count': 54}, '25': {'count': 8}, '27': {'count': 5}, '17': {'count': 13}, '15': {'count': 4}, '4': {'count': 28}, '7': {'count': 83}, '10': {'count': 15}, '1': {'count': 11}, '24': {'count': 15}, '14': {'count': 8}, '16': {'count': 4}, '19': {'count': 9}, '23': {'count': 10}, '26': {'count': 4}, '28': {'count': 8}, '12': {'count': 29}, '21': {'count': 12}, '6': {'count': 5}, '20': {'count': 6}, '5': {'count': 4}, '22': {'count': 2}, '9': {'count': 2}, '2': {'count': 1}}}, 'ku': {'num_samples': 2048, 'average_text_length': 421.17333984375, 'average_labels_per_text': 1.0, 'unique_labels': 39, 'labels': {'14': {'count': 14}, '36': {'count': 139}, '20': {'count': 108}, '22': {'count': 27}, '15': {'count': 102}, '32': {'count': 55}, '8': {'count': 431}, '17': {'count': 210}, '38': {'count': 43}, '30': {'count': 51}, '4': {'count': 60}, '2': {'count': 111}, '6': {'count': 95}, '34': {'count': 70}, '27': {'count': 15}, '5': {'count': 174}, '26': {'count': 37}, '0': {'count': 11}, '25': {'count': 50}, '16': {'count': 2}, '12': {'count': 16}, '24': {'count': 2}, '11': {'count': 17}, '21': {'count': 9}, '13': {'count': 20}, '1': {'count': 7}, '33': {'count': 33}, '35': {'count': 28}, '10': {'count': 11}, '31': {'count': 51}, '18': {'count': 4}, '3': {'count': 4}, '28': {'count': 8}, '37': {'count': 8}, '23': {'count': 2}, '19': {'count': 7}, '7': {'count': 6}, '9': {'count': 8}, '29': {'count': 2}}}, 'lv': {'num_samples': 2048, 'average_text_length': 770.67138671875, 
'average_labels_per_text': 1.0, 'unique_labels': 16, 'labels': {'15': {'count': 288}, '2': {'count': 110}, '6': {'count': 74}, '12': {'count': 50}, '0': {'count': 171}, '14': {'count': 188}, '10': {'count': 351}, '5': {'count': 142}, '4': {'count': 300}, '13': {'count': 60}, '11': {'count': 48}, '1': {'count': 165}, '8': {'count': 53}, '7': {'count': 5}, '3': {'count': 9}, '9': {'count': 34}}}, 'min': {'num_samples': 2048, 'average_text_length': 631.74072265625, 'average_labels_per_text': 1.0, 'unique_labels': 15, 'labels': {'7': {'count': 1595}, '9': {'count': 9}, '4': {'count': 48}, '3': {'count': 83}, '2': {'count': 160}, '0': {'count': 19}, '5': {'count': 74}, '6': {'count': 12}, '10': {'count': 12}, '13': {'count': 10}, '8': {'count': 5}, '11': {'count': 13}, '12': {'count': 2}, '1': {'count': 5}, '14': {'count': 1}}}, 'mt': {'num_samples': 2048, 'average_text_length': 821.22265625, 'average_labels_per_text': 1.0, 'unique_labels': 27, 'labels': {'12': {'count': 8}, '10': {'count': 147}, '14': {'count': 180}, '17': {'count': 117}, '25': {'count': 654}, '19': {'count': 35}, '0': {'count': 77}, '3': {'count': 12}, '16': {'count': 44}, '15': {'count': 108}, '24': {'count': 267}, '6': {'count': 43}, '26': {'count': 32}, '4': {'count': 79}, '22': {'count': 67}, '9': {'count': 16}, '8': {'count': 16}, '2': {'count': 55}, '5': {'count': 6}, '11': {'count': 30}, '18': {'count': 12}, '21': {'count': 12}, '20': {'count': 15}, '23': {'count': 7}, '13': {'count': 6}, '7': {'count': 1}, '1': {'count': 2}}}, 'sco': {'num_samples': 2048, 'average_text_length': 1065.21044921875, 'average_labels_per_text': 1.0, 'unique_labels': 23, 'labels': {'18': {'count': 178}, '6': {'count': 92}, '9': {'count': 28}, '15': {'count': 106}, '8': {'count': 432}, '2': {'count': 95}, '11': {'count': 104}, '1': {'count': 42}, '13': {'count': 248}, '16': {'count': 118}, '20': {'count': 130}, '3': {'count': 171}, '22': {'count': 57}, '7': {'count': 83}, '10': {'count': 74}, '5': {'count': 6}, '4': {'count': 17}, '17': {'count': 24}, '14': {'count': 14}, '0': {'count': 7}, '19': {'count': 18}, '21': {'count': 3}, '12': {'count': 1}}}, 'sq': {'num_samples': 2048, 'average_text_length': 425.486328125, 'average_labels_per_text': 1.0, 'unique_labels': 36, 'labels': {'27': {'count': 444}, '9': {'count': 234}, '14': {'count': 120}, '0': {'count': 128}, '15': {'count': 27}, '11': {'count': 298}, '24': {'count': 170}, '28': {'count': 46}, '19': {'count': 20}, '25': {'count': 140}, '3': {'count': 47}, '2': {'count': 87}, '35': {'count': 34}, '8': {'count': 53}, '31': {'count': 12}, '17': {'count': 3}, '23': {'count': 11}, '20': {'count': 2}, '33': {'count': 42}, '10': {'count': 26}, '34': {'count': 10}, '7': {'count': 2}, '13': {'count': 29}, '4': {'count': 4}, '6': {'count': 7}, '26': {'count': 9}, '5': {'count': 16}, '30': {'count': 1}, '21': {'count': 4}, '22': {'count': 4}, '18': {'count': 11}, '32': {'count': 2}, '12': {'count': 2}, '16': {'count': 1}, '1': {'count': 1}, '29': {'count': 1}}}, 'wa': {'num_samples': 2048, 'average_text_length': 216.00390625, 'average_labels_per_text': 1.0, 'unique_labels': 6, 'labels': {'5': {'count': 126}, '4': {'count': 1461}, '0': {'count': 124}, '2': {'count': 326}, '3': {'count': 10}, '1': {'count': 1}}}}}} | -| [WikipediaRerankingMultilingual](https://huggingface.co/datasets/ellamind/wikipedia-2023-11-reranking-multilingual) | ['ben', 'bul', 'ces', 'dan', 'deu', 'eng', 'fas', 'fin', 'hin', 'ita', 'nld', 'nor', 'por', 'ron', 'srp', 'swe'] | Reranking | s2p | [Encyclopaedic, Written] | {'en': 
1500, 'de': 1500, 'it': 1500, 'pt': 1500, 'nl': 1500, 'cs': 1500, 'ro': 1500, 'bg': 1500, 'sr': 1500, 'fi': 1500, 'da': 1500, 'fa': 1500, 'hi': 1500, 'bn': 1500, 'no': 1500, 'sv': 1500} | {'test': {'num_samples': 24000, 'num_positive': 24000, 'num_negative': 24000, 'avg_query_len': 59.091208333333334, 'avg_positive_len': 1.0, 'avg_negative_len': 8.0, 'hf_subset_descriptive_stats': {'bg': {'num_samples': 1500, 'num_positive': 1500, 'num_negative': 1500, 'avg_query_len': 60.82666666666667, 'avg_positive_len': 1.0, 'avg_negative_len': 8.0}, 'bn': {'num_samples': 1500, 'num_positive': 1500, 'num_negative': 1500, 'avg_query_len': 47.266666666666666, 'avg_positive_len': 1.0, 'avg_negative_len': 8.0}, 'cs': {'num_samples': 1500, 'num_positive': 1500, 'num_negative': 1500, 'avg_query_len': 56.272, 'avg_positive_len': 1.0, 'avg_negative_len': 8.0}, 'da': {'num_samples': 1500, 'num_positive': 1500, 'num_negative': 1500, 'avg_query_len': 56.75066666666667, 'avg_positive_len': 1.0, 'avg_negative_len': 8.0}, 'de': {'num_samples': 1500, 'num_positive': 1500, 'num_negative': 1500, 'avg_query_len': 70.004, 'avg_positive_len': 1.0, 'avg_negative_len': 8.0}, 'en': {'num_samples': 1500, 'num_positive': 1500, 'num_negative': 1500, 'avg_query_len': 68.372, 'avg_positive_len': 1.0, 'avg_negative_len': 8.0}, 'fa': {'num_samples': 1500, 'num_positive': 1500, 'num_negative': 1500, 'avg_query_len': 48.66733333333333, 'avg_positive_len': 1.0, 'avg_negative_len': 8.0}, 'fi': {'num_samples': 1500, 'num_positive': 1500, 'num_negative': 1500, 'avg_query_len': 55.343333333333334, 'avg_positive_len': 1.0, 'avg_negative_len': 8.0}, 'hi': {'num_samples': 1500, 'num_positive': 1500, 'num_negative': 1500, 'avg_query_len': 50.77733333333333, 'avg_positive_len': 1.0, 'avg_negative_len': 8.0}, 'it': {'num_samples': 1500, 'num_positive': 1500, 'num_negative': 1500, 'avg_query_len': 70.05466666666666, 'avg_positive_len': 1.0, 'avg_negative_len': 8.0}, 'nl': {'num_samples': 1500, 'num_positive': 1500, 'num_negative': 1500, 'avg_query_len': 65.34466666666667, 'avg_positive_len': 1.0, 'avg_negative_len': 8.0}, 'pt': {'num_samples': 1500, 'num_positive': 1500, 'num_negative': 1500, 'avg_query_len': 65.11933333333333, 'avg_positive_len': 1.0, 'avg_negative_len': 8.0}, 'ro': {'num_samples': 1500, 'num_positive': 1500, 'num_negative': 1500, 'avg_query_len': 61.973333333333336, 'avg_positive_len': 1.0, 'avg_negative_len': 8.0}, 'sr': {'num_samples': 1500, 'num_positive': 1500, 'num_negative': 1500, 'avg_query_len': 55.669333333333334, 'avg_positive_len': 1.0, 'avg_negative_len': 8.0}, 'no': {'num_samples': 1500, 'num_positive': 1500, 'num_negative': 1500, 'avg_query_len': 55.288, 'avg_positive_len': 1.0, 'avg_negative_len': 8.0}, 'sv': {'num_samples': 1500, 'num_positive': 1500, 'num_negative': 1500, 'avg_query_len': 57.73, 'avg_positive_len': 1.0, 'avg_negative_len': 8.0}}}} | -| [WikipediaRetrievalMultilingual](https://huggingface.co/datasets/ellamind/wikipedia-2023-11-retrieval-multilingual-queries) | ['ben', 'bul', 'ces', 'dan', 'deu', 'eng', 'fas', 'fin', 'hin', 'ita', 'nld', 'nor', 'por', 'ron', 'srp', 'swe'] | Retrieval | s2p | [Encyclopaedic, Written] | {'en': 1500, 'de': 1500, 'it': 1500, 'pt': 1500, 'nl': 1500, 'cs': 1500, 'ro': 1500, 'bg': 1500, 'sr': 1500, 'fi': 1500, 'da': 1500, 'fa': 1500, 'hi': 1500, 'bn': 1500, 'no': 1500, 'sv': 1500} | {'test': {'bg': {'average_document_length': 374.376, 'average_query_length': 1.0, 'num_documents': 13500, 'num_queries': 1500, 'average_relevant_docs_per_query': 1.0}, 'bn': 
{'average_document_length': 394.05044444444445, 'average_query_length': 1.0, 'num_documents': 13500, 'num_queries': 1500, 'average_relevant_docs_per_query': 1.0}, 'cs': {'average_document_length': 369.9831111111111, 'average_query_length': 1.0, 'num_documents': 13500, 'num_queries': 1500, 'average_relevant_docs_per_query': 1.0}, 'da': {'average_document_length': 345.2597037037037, 'average_query_length': 1.0, 'num_documents': 13500, 'num_queries': 1500, 'average_relevant_docs_per_query': 1.0}, 'de': {'average_document_length': 398.4137777777778, 'average_query_length': 1.0, 'num_documents': 13500, 'num_queries': 1500, 'average_relevant_docs_per_query': 1.0}, 'en': {'average_document_length': 452.9871111111111, 'average_query_length': 1.0, 'num_documents': 13500, 'num_queries': 1500, 'average_relevant_docs_per_query': 1.0}, 'fa': {'average_document_length': 345.1568888888889, 'average_query_length': 1.0, 'num_documents': 13500, 'num_queries': 1500, 'average_relevant_docs_per_query': 1.0}, 'fi': {'average_document_length': 379.71237037037037, 'average_query_length': 1.0, 'num_documents': 13500, 'num_queries': 1500, 'average_relevant_docs_per_query': 1.0}, 'hi': {'average_document_length': 410.72540740740743, 'average_query_length': 1.0, 'num_documents': 13500, 'num_queries': 1500, 'average_relevant_docs_per_query': 1.0}, 'it': {'average_document_length': 393.73437037037036, 'average_query_length': 1.0, 'num_documents': 13500, 'num_queries': 1500, 'average_relevant_docs_per_query': 1.0}, 'nl': {'average_document_length': 375.6695555555556, 'average_query_length': 1.0, 'num_documents': 13500, 'num_queries': 1500, 'average_relevant_docs_per_query': 1.0}, 'pt': {'average_document_length': 398.27237037037037, 'average_query_length': 1.0, 'num_documents': 13500, 'num_queries': 1500, 'average_relevant_docs_per_query': 1.0}, 'ro': {'average_document_length': 348.3817037037037, 'average_query_length': 1.0, 'num_documents': 13500, 'num_queries': 1500, 'average_relevant_docs_per_query': 1.0}, 'sr': {'average_document_length': 384.3131851851852, 'average_query_length': 1.0, 'num_documents': 13500, 'num_queries': 1500, 'average_relevant_docs_per_query': 1.0}, 'no': {'average_document_length': 366.93733333333336, 'average_query_length': 1.0, 'num_documents': 13500, 'num_queries': 1500, 'average_relevant_docs_per_query': 1.0}, 'sv': {'average_document_length': 369.340962962963, 'average_query_length': 1.0, 'num_documents': 13500, 'num_queries': 1500, 'average_relevant_docs_per_query': 1.0}}} | -| [WinoGrande](https://winogrande.allenai.org/) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | {'test': 0} | {'test': {'average_document_length': 7.68243375858685, 'average_query_length': 111.78216258879242, 'num_documents': 5095, 'num_queries': 1267, 'average_relevant_docs_per_query': 1.0}} | -| [WisesightSentimentClassification](https://github.com/PyThaiNLP/wisesight-sentiment) | ['tha'] | Classification | s2s | [Social, News, Written] | {'train': 2048} | {'train': 103.42} | -| XMarket (Bonab et al., 2021) | ['deu', 'eng', 'spa'] | Retrieval | s2p | | None | {'test': {'de': {'average_document_length': 187.4061197288943, 'average_query_length': 15.717612088184294, 'num_documents': 70526, 'num_queries': 4037, 'average_relevant_docs_per_query': 54.3522417636859}, 'en': {'average_document_length': 452.792089662076, 'average_query_length': 15.881635344543357, 'num_documents': 218777, 'num_queries': 9099, 'average_relevant_docs_per_query': 85.43719090009891}, 'es': 
{'average_document_length': 279.67909262759923, 'average_query_length': 19.97062937062937, 'num_documents': 39675, 'num_queries': 3575, 'average_relevant_docs_per_query': 36.01006993006993}}} | -| [XNLI](https://aclanthology.org/D18-1269/) (Conneau et al., 2018) | ['ara', 'bul', 'deu', 'ell', 'eng', 'fra', 'hin', 'rus', 'spa', 'swa', 'tha', 'tur', 'vie', 'zho'] | PairClassification | s2s | [Non-fiction, Fiction, Government, Written] | {'validation': 2163, 'test': 2460} | {'test': {'num_samples': 19110, 'avg_sentence1_len': 103.23793825222397, 'avg_sentence2_len': 48.88895866038723, 'unique_labels': 2, 'labels': {'0': {'count': 9562}, '1': {'count': 9548}}, 'hf_subset_descriptive_stats': {'ar': {'num_samples': 1365, 'avg_sentence1_len': 89.57362637362637, 'avg_sentence2_len': 41.99487179487179, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'bg': {'num_samples': 1365, 'avg_sentence1_len': 110.01611721611722, 'avg_sentence2_len': 51.62930402930403, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'de': {'num_samples': 1365, 'avg_sentence1_len': 119.92600732600732, 'avg_sentence2_len': 56.794871794871796, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'el': {'num_samples': 1365, 'avg_sentence1_len': 119.05421245421246, 'avg_sentence2_len': 56.93260073260073, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'en': {'num_samples': 1365, 'avg_sentence1_len': 105.67032967032966, 'avg_sentence2_len': 49.8043956043956, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'es': {'num_samples': 1365, 'avg_sentence1_len': 115.43296703296703, 'avg_sentence2_len': 54.68205128205128, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'fr': {'num_samples': 1365, 'avg_sentence1_len': 121.0967032967033, 'avg_sentence2_len': 58.58021978021978, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'hi': {'num_samples': 1365, 'avg_sentence1_len': 104.63443223443224, 'avg_sentence2_len': 50.17289377289377, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'ru': {'num_samples': 1365, 'avg_sentence1_len': 110.76923076923077, 'avg_sentence2_len': 52.452014652014654, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'sw': {'num_samples': 1365, 'avg_sentence1_len': 104.43956043956044, 'avg_sentence2_len': 49.48205128205128, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'th': {'num_samples': 1365, 'avg_sentence1_len': 96.6923076923077, 'avg_sentence2_len': 44.544322344322346, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'tr': {'num_samples': 1365, 'avg_sentence1_len': 103.67765567765568, 'avg_sentence2_len': 49.18534798534799, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'vi': {'num_samples': 1365, 'avg_sentence1_len': 111.31208791208792, 'avg_sentence2_len': 52.46007326007326, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'zh': {'num_samples': 1365, 'avg_sentence1_len': 33.03589743589744, 'avg_sentence2_len': 15.73040293040293, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}}}, 'validation': {'num_samples': 19110, 'avg_sentence1_len': 103.20790162218734, 'avg_sentence2_len': 49.01909994767138, 'unique_labels': 2, 'labels': {'0': {'count': 9562}, '1': {'count': 9548}}, 'hf_subset_descriptive_stats': {'ar': {'num_samples': 1365, 'avg_sentence1_len': 
88.31868131868131, 'avg_sentence2_len': 41.61172161172161, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'bg': {'num_samples': 1365, 'avg_sentence1_len': 109.196336996337, 'avg_sentence2_len': 51.967032967032964, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'de': {'num_samples': 1365, 'avg_sentence1_len': 119.81172161172161, 'avg_sentence2_len': 57.36923076923077, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'el': {'num_samples': 1365, 'avg_sentence1_len': 119.87545787545787, 'avg_sentence2_len': 56.88278388278388, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'en': {'num_samples': 1365, 'avg_sentence1_len': 105.71648351648352, 'avg_sentence2_len': 49.87619047619047, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'es': {'num_samples': 1365, 'avg_sentence1_len': 115.17289377289377, 'avg_sentence2_len': 55.120879120879124, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'fr': {'num_samples': 1365, 'avg_sentence1_len': 121.75897435897436, 'avg_sentence2_len': 59.08864468864469, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'hi': {'num_samples': 1365, 'avg_sentence1_len': 105.06446886446886, 'avg_sentence2_len': 50.44395604395604, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'ru': {'num_samples': 1365, 'avg_sentence1_len': 109.74725274725274, 'avg_sentence2_len': 52.26886446886447, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'sw': {'num_samples': 1365, 'avg_sentence1_len': 104.32234432234432, 'avg_sentence2_len': 49.87692307692308, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'th': {'num_samples': 1365, 'avg_sentence1_len': 97.28498168498169, 'avg_sentence2_len': 43.843223443223444, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'tr': {'num_samples': 1365, 'avg_sentence1_len': 102.96630036630036, 'avg_sentence2_len': 49.63809523809524, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'vi': {'num_samples': 1365, 'avg_sentence1_len': 112.26373626373626, 'avg_sentence2_len': 52.432967032967035, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'zh': {'num_samples': 1365, 'avg_sentence1_len': 33.41098901098901, 'avg_sentence2_len': 15.846886446886447, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}}}} | -| [XNLIV2](https://arxiv.org/pdf/2301.06527) (Upadhyay et al., 2023) | ['asm', 'ben', 'bho', 'ell', 'guj', 'kan', 'mar', 'ory', 'pan', 'rus', 'san', 'tam', 'tur'] | PairClassification | s2s | [Non-fiction, Fiction, Government, Written] | {'test': 5010} | {'test': 80.06} | -| [XPQARetrieval](https://arxiv.org/abs/2305.09249) (Shen et al., 2023) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'hin', 'ita', 'jpn', 'kor', 'pol', 'por', 'spa', 'tam'] | Retrieval | s2p | [Reviews, Written] | {'test': 19801} | {'test': {'ara-ara': {'average_document_length': 61.88361204013378, 'average_query_length': 29.688, 'num_documents': 1495, 'num_queries': 750, 'average_relevant_docs_per_query': 2.004}, 'eng-ara': {'average_document_length': 125.26940639269407, 'average_query_length': 29.688, 'num_documents': 1533, 'num_queries': 750, 'average_relevant_docs_per_query': 2.058666666666667}, 'ara-eng': {'average_document_length': 61.88361204013378, 'average_query_length': 39.5188679245283, 'num_documents': 1495, 'num_queries': 742, 
'average_relevant_docs_per_query': 2.024258760107817}, 'deu-deu': {'average_document_length': 69.54807692307692, 'average_query_length': 55.51827676240209, 'num_documents': 1248, 'num_queries': 766, 'average_relevant_docs_per_query': 1.6318537859007833}, 'eng-deu': {'average_document_length': 115.77118078719145, 'average_query_length': 55.51827676240209, 'num_documents': 1499, 'num_queries': 766, 'average_relevant_docs_per_query': 1.9634464751958225}, 'deu-eng': {'average_document_length': 69.54807692307692, 'average_query_length': 51.88903394255875, 'num_documents': 1248, 'num_queries': 766, 'average_relevant_docs_per_query': 1.6318537859007833}, 'spa-spa': {'average_document_length': 68.27511591962906, 'average_query_length': 46.711223203026485, 'num_documents': 1941, 'num_queries': 793, 'average_relevant_docs_per_query': 2.4489281210592684}, 'eng-spa': {'average_document_length': 123.43698347107438, 'average_query_length': 46.711223203026485, 'num_documents': 1936, 'num_queries': 793, 'average_relevant_docs_per_query': 2.472887767969735}, 'spa-eng': {'average_document_length': 68.27511591962906, 'average_query_length': 47.21059268600252, 'num_documents': 1941, 'num_queries': 793, 'average_relevant_docs_per_query': 2.4489281210592684}, 'fra-fra': {'average_document_length': 76.99354005167959, 'average_query_length': 56.0520694259012, 'num_documents': 1548, 'num_queries': 749, 'average_relevant_docs_per_query': 2.069425901201602}, 'eng-fra': {'average_document_length': 137.31242532855435, 'average_query_length': 56.0520694259012, 'num_documents': 1674, 'num_queries': 749, 'average_relevant_docs_per_query': 2.248331108144192}, 'fra-eng': {'average_document_length': 76.99354005167959, 'average_query_length': 49.58744993324433, 'num_documents': 1548, 'num_queries': 749, 'average_relevant_docs_per_query': 2.069425901201602}, 'hin-hin': {'average_document_length': 47.20783373301359, 'average_query_length': 33.47783783783784, 'num_documents': 1251, 'num_queries': 925, 'average_relevant_docs_per_query': 1.3902702702702703}, 'eng-hin': {'average_document_length': 106.67662682602922, 'average_query_length': 33.47783783783784, 'num_documents': 1506, 'num_queries': 925, 'average_relevant_docs_per_query': 1.8054054054054054}, 'hin-eng': {'average_document_length': 47.20783373301359, 'average_query_length': 34.98574561403509, 'num_documents': 1251, 'num_queries': 912, 'average_relevant_docs_per_query': 1.4100877192982457}, 'ita-ita': {'average_document_length': 59.778301886792455, 'average_query_length': 49.14932126696833, 'num_documents': 1272, 'num_queries': 663, 'average_relevant_docs_per_query': 1.9245852187028658}, 'eng-ita': {'average_document_length': 123.07302075326672, 'average_query_length': 49.14932126696833, 'num_documents': 1301, 'num_queries': 663, 'average_relevant_docs_per_query': 1.9849170437405732}, 'ita-eng': {'average_document_length': 59.778301886792455, 'average_query_length': 49.040723981900456, 'num_documents': 1272, 'num_queries': 663, 'average_relevant_docs_per_query': 1.9245852187028658}, 'jpn-jpn': {'average_document_length': 41.030605871330415, 'average_query_length': 23.296969696969697, 'num_documents': 1601, 'num_queries': 825, 'average_relevant_docs_per_query': 1.9406060606060607}, 'eng-jpn': {'average_document_length': 126.2647564469914, 'average_query_length': 23.296969696969697, 'num_documents': 1745, 'num_queries': 825, 'average_relevant_docs_per_query': 2.1187878787878787}, 'jpn-eng': {'average_document_length': 41.030605871330415, 'average_query_length': 
51.416058394160586, 'num_documents': 1601, 'num_queries': 822, 'average_relevant_docs_per_query': 1.9476885644768855}, 'kor-kor': {'average_document_length': 31.22722159730034, 'average_query_length': 21.81804281345566, 'num_documents': 889, 'num_queries': 654, 'average_relevant_docs_per_query': 1.5642201834862386}, 'eng-kor': {'average_document_length': 112.41231822070145, 'average_query_length': 21.81804281345566, 'num_documents': 1169, 'num_queries': 654, 'average_relevant_docs_per_query': 1.952599388379205}, 'kor-eng': {'average_document_length': 31.22722159730034, 'average_query_length': 43.9527687296417, 'num_documents': 889, 'num_queries': 614, 'average_relevant_docs_per_query': 1.6661237785016287}, 'pol-pol': {'average_document_length': 50.66814439518683, 'average_query_length': 53.72101910828025, 'num_documents': 1579, 'num_queries': 785, 'average_relevant_docs_per_query': 2.080254777070064}, 'eng-pol': {'average_document_length': 112.96919566457501, 'average_query_length': 53.72101910828025, 'num_documents': 1753, 'num_queries': 785, 'average_relevant_docs_per_query': 2.385987261146497}, 'pol-eng': {'average_document_length': 50.66814439518683, 'average_query_length': 54.1994851994852, 'num_documents': 1579, 'num_queries': 777, 'average_relevant_docs_per_query': 2.101673101673102}, 'por-por': {'average_document_length': 75.9845869297164, 'average_query_length': 42.58875, 'num_documents': 1622, 'num_queries': 800, 'average_relevant_docs_per_query': 2.14}, 'eng-por': {'average_document_length': 111.42525930445393, 'average_query_length': 42.58875, 'num_documents': 1639, 'num_queries': 800, 'average_relevant_docs_per_query': 2.21875}, 'por-eng': {'average_document_length': 75.9845869297164, 'average_query_length': 46.57967377666248, 'num_documents': 1622, 'num_queries': 797, 'average_relevant_docs_per_query': 2.148055207026349}, 'tam-tam': {'average_document_length': 64.89019607843137, 'average_query_length': 33.267263427109974, 'num_documents': 1275, 'num_queries': 782, 'average_relevant_docs_per_query': 1.6994884910485935}, 'eng-tam': {'average_document_length': 96.96361185983828, 'average_query_length': 33.267263427109974, 'num_documents': 1484, 'num_queries': 782, 'average_relevant_docs_per_query': 2.0255754475703327}, 'tam-eng': {'average_document_length': 64.89019607843137, 'average_query_length': 34.777633289986994, 'num_documents': 1275, 'num_queries': 769, 'average_relevant_docs_per_query': 1.728218465539662}, 'cmn-cmn': {'average_document_length': 20.958944281524925, 'average_query_length': 12.21116504854369, 'num_documents': 1705, 'num_queries': 824, 'average_relevant_docs_per_query': 2.0716019417475726}, 'eng-cmn': {'average_document_length': 108.31593874078276, 'average_query_length': 12.21116504854369, 'num_documents': 1763, 'num_queries': 824, 'average_relevant_docs_per_query': 2.2633495145631066}, 'cmn-eng': {'average_document_length': 20.958944281524925, 'average_query_length': 41.24390243902439, 'num_documents': 1705, 'num_queries': 820, 'average_relevant_docs_per_query': 2.0817073170731706}}} | -| [XQuADRetrieval](https://huggingface.co/datasets/xquad) (Mikel Artetxe, 2019) | ['arb', 'deu', 'ell', 'eng', 'hin', 'ron', 'rus', 'spa', 'tha', 'tur', 'vie', 'zho'] | Retrieval | s2p | [Web, Written] | {'test': 1190} | {'validation': {'ar': {'average_document_length': 683.4666666666667, 'average_query_length': 53.327993254637434, 'num_documents': 240, 'num_queries': 1186, 'average_relevant_docs_per_query': 1.0}, 'de': {'average_document_length': 894.0666666666667, 
'average_query_length': 69.04318374259103, 'num_documents': 240, 'num_queries': 1181, 'average_relevant_docs_per_query': 1.0}, 'el': {'average_document_length': 894.3791666666667, 'average_query_length': 68.61317567567568, 'num_documents': 240, 'num_queries': 1184, 'average_relevant_docs_per_query': 1.0}, 'en': {'average_document_length': 784.8333333333334, 'average_query_length': 61.25063291139241, 'num_documents': 240, 'num_queries': 1185, 'average_relevant_docs_per_query': 1.0}, 'es': {'average_document_length': 883.8041666666667, 'average_query_length': 68.23817567567568, 'num_documents': 240, 'num_queries': 1184, 'average_relevant_docs_per_query': 1.0}, 'hi': {'average_document_length': 764.9416666666667, 'average_query_length': 59.684699915469146, 'num_documents': 240, 'num_queries': 1183, 'average_relevant_docs_per_query': 1.0}, 'ro': {'average_document_length': 878.4458333333333, 'average_query_length': 67.17229729729729, 'num_documents': 240, 'num_queries': 1184, 'average_relevant_docs_per_query': 1.0}, 'ru': {'average_document_length': 850.1875, 'average_query_length': 64.94261603375527, 'num_documents': 240, 'num_queries': 1185, 'average_relevant_docs_per_query': 1.0}, 'th': {'average_document_length': 736.7583333333333, 'average_query_length': 55.103389830508476, 'num_documents': 240, 'num_queries': 1180, 'average_relevant_docs_per_query': 1.0}, 'tr': {'average_document_length': 788.3, 'average_query_length': 60.876689189189186, 'num_documents': 240, 'num_queries': 1184, 'average_relevant_docs_per_query': 1.0}, 'vi': {'average_document_length': 803.9083333333333, 'average_query_length': 61.62859560067682, 'num_documents': 240, 'num_queries': 1182, 'average_relevant_docs_per_query': 1.0}, 'zh': {'average_document_length': 252.4, 'average_query_length': 18.460626587637595, 'num_documents': 240, 'num_queries': 1181, 'average_relevant_docs_per_query': 1.0}}} | -| [XStance](https://github.com/ZurichNLP/xstance) | ['deu', 'fra', 'ita'] | PairClassification | s2s | [Social, Written] | {'test': 2048} | {'test': 152.41} | -| [YahooAnswersTopicsClassification](https://huggingface.co/datasets/yahoo_answers_topics) (Zhang et al., 2015) | ['eng'] | Classification | s2s | [Web, Written] | {'test': 60000} | {'test': 346.35} | -| [YelpReviewFullClassification](https://arxiv.org/abs/1509.01626) (Zhang et al., 2015) | ['eng'] | Classification | s2s | [Reviews, Written] | {'test': 50000} | {} | -| [YueOpenriceReviewClassification](https://github.com/Christainx/Dataset_Cantonese_Openrice) (Xiang et al., 2019) | ['yue'] | Classification | s2s | [Reviews, Spoken] | {'test': 6161} | {'test': 173.0} | -| [indonli](https://link.springer.com/chapter/10.1007/978-3-030-41505-1_39) | ['ind'] | PairClassification | s2s | [Encyclopaedic, Web, News, Written] | {'test_expert': 2040} | {'test_expert': 145.88} | -| [mFollowIRCrossLingualInstructionRetrieval](https://neuclir.github.io/) (Weller et al., 2024) | ['eng', 'fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | {'eng-fas': 80, 'eng-rus': 80, 'eng-zho': 86} | {'test': {'num_docs': 121635, 'num_queries': 123, 'average_document_length': 2331.0777818884367, 'average_query_length': 81.8780487804878, 'average_instruction_length': 389.9512195121951, 'average_changed_instruction_length': 450.5528455284553, 'average_relevant_docs_per_query': 10.30952380952381, 'average_top_ranked_per_query': 1024.3902439024391, 'hf_subset_descriptive_stats': {'eng-fas': {'num_docs': 41189, 'num_queries': 40, 'average_document_length': 3145.4990895627475, 
'average_query_length': 80.075, 'average_instruction_length': 396.875, 'average_changed_instruction_length': 463.175, 'average_relevant_docs_per_query': 10.465116279069768, 'average_top_ranked_per_query': 1075}, 'eng-rus': {'num_docs': 39326, 'num_queries': 40, 'average_document_length': 2784.0813456746173, 'average_query_length': 81.875, 'average_instruction_length': 371.125, 'average_changed_instruction_length': 431.8, 'average_relevant_docs_per_query': 9.775, 'average_top_ranked_per_query': 1000}, 'eng-zho': {'num_docs': 41120, 'num_queries': 43, 'average_document_length': 1082.0501215953307, 'average_query_length': 83.55813953488372, 'average_instruction_length': 401.0232558139535, 'average_changed_instruction_length': 456.25581395348837, 'average_relevant_docs_per_query': 10.651162790697674, 'average_top_ranked_per_query': 1000}}}} | -| [mFollowIRInstructionRetrieval](https://neuclir.github.io/) (Weller et al., 2024) | ['fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | {'fas': 80, 'rus': 80, 'zho': 86} | {'test': {'num_docs': 121635, 'num_queries': 123, 'average_document_length': 2331.0777818884367, 'average_query_length': 57.113821138211385, 'average_instruction_length': 281.0650406504065, 'average_changed_instruction_length': 326.9430894308943, 'average_relevant_docs_per_query': 10.30952380952381, 'average_top_ranked_per_query': 1024.3902439024391, 'hf_subset_descriptive_stats': {'fas': {'num_docs': 41189, 'num_queries': 40, 'average_document_length': 3145.4990895627475, 'average_query_length': 72.65, 'average_instruction_length': 358.925, 'average_changed_instruction_length': 415.325, 'average_relevant_docs_per_query': 10.465116279069768, 'average_top_ranked_per_query': 1075}, 'rus': {'num_docs': 39326, 'num_queries': 40, 'average_document_length': 2784.0813456746173, 'average_query_length': 77.5, 'average_instruction_length': 387, 'average_changed_instruction_length': 458, 'average_relevant_docs_per_query': 9.775, 'average_top_ranked_per_query': 1000}, 'zho': {'num_docs': 41120, 'num_queries': 43, 'average_document_length': 1082.0501215953307, 'average_query_length': 23.697674418604652, 'average_instruction_length': 110.09302325581395, 'average_changed_instruction_length': 122.81395348837209, 'average_relevant_docs_per_query': 10.651162790697674, 'average_top_ranked_per_query': 1000}}}} | +| [WikiClusteringP2P.v2](https://github.com/Rysias/wiki-clustering) | ['bos', 'cat', 'ces', 'dan', 'eus', 'glv', 'ilo', 'kur', 'lav', 'min', 'mlt', 'sco', 'sqi', 'wln'] | Clustering | p2p | [Encyclopaedic, Written] | None | None | +| [WikipediaRerankingMultilingual](https://huggingface.co/datasets/ellamind/wikipedia-2023-11-reranking-multilingual) | ['ben', 'bul', 'ces', 'dan', 'deu', 'eng', 'fas', 'fin', 'hin', 'ita', 'nld', 'nor', 'por', 'ron', 'srp', 'swe'] | Reranking | s2p | [Encyclopaedic, Written] | {'test': 24000} | {'test': {'num_samples': 24000, 'number_of_characters': 83866932, 'num_positive': 24000, 'num_negative': 192000, 'avg_query_len': 59.09, 'avg_positive_len': 385.45, 'avg_negative_len': 381.24, 'hf_subset_descriptive_stats': {'bg': {'num_samples': 1500, 'number_of_characters': 5145316, 'num_positive': 1500, 'num_negative': 12000, 'avg_query_len': 60.83, 'avg_positive_len': 375.89, 'avg_negative_len': 374.19}, 'bn': {'num_samples': 1500, 'number_of_characters': 5390581, 'num_positive': 1500, 'num_negative': 12000, 'avg_query_len': 47.27, 'avg_positive_len': 394.59, 'avg_negative_len': 393.98}, 'cs': {'num_samples': 1500, 'number_of_characters': 5079180, 'num_positive': 
1500, 'num_negative': 12000, 'avg_query_len': 56.27, 'avg_positive_len': 383.84, 'avg_negative_len': 368.25}, 'da': {'num_samples': 1500, 'number_of_characters': 4746132, 'num_positive': 1500, 'num_negative': 12000, 'avg_query_len': 56.75, 'avg_positive_len': 351.68, 'avg_negative_len': 344.46}, 'de': {'num_samples': 1500, 'number_of_characters': 5483592, 'num_positive': 1500, 'num_negative': 12000, 'avg_query_len': 70.0, 'avg_positive_len': 391.54, 'avg_negative_len': 399.27}, 'en': {'num_samples': 1500, 'number_of_characters': 6217884, 'num_positive': 1500, 'num_negative': 12000, 'avg_query_len': 68.37, 'avg_positive_len': 451.73, 'avg_negative_len': 453.14}, 'fa': {'num_samples': 1500, 'number_of_characters': 4732619, 'num_positive': 1500, 'num_negative': 12000, 'avg_query_len': 48.67, 'avg_positive_len': 347.7, 'avg_negative_len': 344.84}, 'fi': {'num_samples': 1500, 'number_of_characters': 5209132, 'num_positive': 1500, 'num_negative': 12000, 'avg_query_len': 55.34, 'avg_positive_len': 394.71, 'avg_negative_len': 377.84}, 'hi': {'num_samples': 1500, 'number_of_characters': 5620959, 'num_positive': 1500, 'num_negative': 12000, 'avg_query_len': 50.78, 'avg_positive_len': 420.38, 'avg_negative_len': 409.52}, 'it': {'num_samples': 1500, 'number_of_characters': 5420496, 'num_positive': 1500, 'num_negative': 12000, 'avg_query_len': 70.05, 'avg_positive_len': 396.97, 'avg_negative_len': 393.33}, 'nl': {'num_samples': 1500, 'number_of_characters': 5169556, 'num_positive': 1500, 'num_negative': 12000, 'avg_query_len': 65.34, 'avg_positive_len': 380.79, 'avg_negative_len': 375.03}, 'pt': {'num_samples': 1500, 'number_of_characters': 5474356, 'num_positive': 1500, 'num_negative': 12000, 'avg_query_len': 65.12, 'avg_positive_len': 404.02, 'avg_negative_len': 397.55}, 'ro': {'num_samples': 1500, 'number_of_characters': 4796113, 'num_positive': 1500, 'num_negative': 12000, 'avg_query_len': 61.97, 'avg_positive_len': 346.71, 'avg_negative_len': 348.59}, 'sr': {'num_samples': 1500, 'number_of_characters': 5271732, 'num_positive': 1500, 'num_negative': 12000, 'avg_query_len': 55.67, 'avg_positive_len': 386.35, 'avg_negative_len': 384.06}, 'no': {'num_samples': 1500, 'number_of_characters': 5036586, 'num_positive': 1500, 'num_negative': 12000, 'avg_query_len': 55.29, 'avg_positive_len': 367.72, 'avg_negative_len': 366.84}, 'sv': {'num_samples': 1500, 'number_of_characters': 5072698, 'num_positive': 1500, 'num_negative': 12000, 'avg_query_len': 57.73, 'avg_positive_len': 372.59, 'avg_negative_len': 368.94}}}} | +| [WikipediaRetrievalMultilingual](https://huggingface.co/datasets/ellamind/wikipedia-2023-11-retrieval-multilingual-queries) | ['ben', 'bul', 'ces', 'dan', 'deu', 'eng', 'fas', 'fin', 'hin', 'ita', 'nld', 'nor', 'por', 'ron', 'srp', 'swe'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | +| [WinoGrande](https://winogrande.allenai.org/) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | +| [WisesightSentimentClassification](https://github.com/PyThaiNLP/wisesight-sentiment) | ['tha'] | Classification | s2s | [Social, News, Written] | None | None | +| XMarket (Bonab et al., 2021) | ['deu', 'eng', 'spa'] | Retrieval | s2p | | None | None | +| [XNLI](https://aclanthology.org/D18-1269/) (Conneau et al., 2018) | ['ara', 'bul', 'deu', 'ell', 'eng', 'fra', 'hin', 'rus', 'spa', 'swa', 'tha', 'tur', 'vie', 'zho'] | PairClassification | s2s | [Non-fiction, Fiction, Government, Written] | {'test': 19110, 'validation': 19110} | {'test': {'num_samples': 
19110, 'number_of_characters': 2907145, 'avg_sentence1_len': 103.24, 'avg_sentence2_len': 48.89, 'unique_labels': 2, 'labels': {'0': {'count': 9562}, '1': {'count': 9548}}, 'hf_subset_descriptive_stats': {'ar': {'num_samples': 1365, 'number_of_characters': 179591, 'avg_sentence1_len': 89.57, 'avg_sentence2_len': 41.99, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'bg': {'num_samples': 1365, 'number_of_characters': 220646, 'avg_sentence1_len': 110.02, 'avg_sentence2_len': 51.63, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'de': {'num_samples': 1365, 'number_of_characters': 241224, 'avg_sentence1_len': 119.93, 'avg_sentence2_len': 56.79, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'el': {'num_samples': 1365, 'number_of_characters': 240222, 'avg_sentence1_len': 119.05, 'avg_sentence2_len': 56.93, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'en': {'num_samples': 1365, 'number_of_characters': 212223, 'avg_sentence1_len': 105.67, 'avg_sentence2_len': 49.8, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'es': {'num_samples': 1365, 'number_of_characters': 232207, 'avg_sentence1_len': 115.43, 'avg_sentence2_len': 54.68, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'fr': {'num_samples': 1365, 'number_of_characters': 245259, 'avg_sentence1_len': 121.1, 'avg_sentence2_len': 58.58, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'hi': {'num_samples': 1365, 'number_of_characters': 211312, 'avg_sentence1_len': 104.63, 'avg_sentence2_len': 50.17, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'ru': {'num_samples': 1365, 'number_of_characters': 222797, 'avg_sentence1_len': 110.77, 'avg_sentence2_len': 52.45, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'sw': {'num_samples': 1365, 'number_of_characters': 210103, 'avg_sentence1_len': 104.44, 'avg_sentence2_len': 49.48, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'th': {'num_samples': 1365, 'number_of_characters': 192788, 'avg_sentence1_len': 96.69, 'avg_sentence2_len': 44.54, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'tr': {'num_samples': 1365, 'number_of_characters': 208658, 'avg_sentence1_len': 103.68, 'avg_sentence2_len': 49.19, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'vi': {'num_samples': 1365, 'number_of_characters': 223549, 'avg_sentence1_len': 111.31, 'avg_sentence2_len': 52.46, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'zh': {'num_samples': 1365, 'number_of_characters': 66566, 'avg_sentence1_len': 33.04, 'avg_sentence2_len': 15.73, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}}}, 'validation': {'num_samples': 19110, 'number_of_characters': 2909058, 'avg_sentence1_len': 103.21, 'avg_sentence2_len': 49.02, 'unique_labels': 2, 'labels': {'0': {'count': 9562}, '1': {'count': 9548}}, 'hf_subset_descriptive_stats': {'ar': {'num_samples': 1365, 'number_of_characters': 177355, 'avg_sentence1_len': 88.32, 'avg_sentence2_len': 41.61, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'bg': {'num_samples': 1365, 'number_of_characters': 219988, 'avg_sentence1_len': 109.2, 'avg_sentence2_len': 51.97, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'de': {'num_samples': 1365, 
'number_of_characters': 241852, 'avg_sentence1_len': 119.81, 'avg_sentence2_len': 57.37, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'el': {'num_samples': 1365, 'number_of_characters': 241275, 'avg_sentence1_len': 119.88, 'avg_sentence2_len': 56.88, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'en': {'num_samples': 1365, 'number_of_characters': 212384, 'avg_sentence1_len': 105.72, 'avg_sentence2_len': 49.88, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'es': {'num_samples': 1365, 'number_of_characters': 232451, 'avg_sentence1_len': 115.17, 'avg_sentence2_len': 55.12, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'fr': {'num_samples': 1365, 'number_of_characters': 246857, 'avg_sentence1_len': 121.76, 'avg_sentence2_len': 59.09, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'hi': {'num_samples': 1365, 'number_of_characters': 212269, 'avg_sentence1_len': 105.06, 'avg_sentence2_len': 50.44, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'ru': {'num_samples': 1365, 'number_of_characters': 221152, 'avg_sentence1_len': 109.75, 'avg_sentence2_len': 52.27, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'sw': {'num_samples': 1365, 'number_of_characters': 210482, 'avg_sentence1_len': 104.32, 'avg_sentence2_len': 49.88, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'th': {'num_samples': 1365, 'number_of_characters': 192640, 'avg_sentence1_len': 97.28, 'avg_sentence2_len': 43.84, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'tr': {'num_samples': 1365, 'number_of_characters': 208305, 'avg_sentence1_len': 102.97, 'avg_sentence2_len': 49.64, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'vi': {'num_samples': 1365, 'number_of_characters': 224811, 'avg_sentence1_len': 112.26, 'avg_sentence2_len': 52.43, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'zh': {'num_samples': 1365, 'number_of_characters': 67237, 'avg_sentence1_len': 33.41, 'avg_sentence2_len': 15.85, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}}}} | +| [XNLIV2](https://arxiv.org/pdf/2301.06527) (Upadhyay et al., 2023) | ['asm', 'ben', 'bho', 'ell', 'guj', 'kan', 'mar', 'ory', 'pan', 'rus', 'san', 'tam', 'tur'] | PairClassification | s2s | [Non-fiction, Fiction, Government, Written] | None | None | +| [XPQARetrieval](https://arxiv.org/abs/2305.09249) (Shen et al., 2023) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'hin', 'ita', 'jpn', 'kor', 'pol', 'por', 'spa', 'tam'] | Retrieval | s2p | [Reviews, Written] | None | None | +| [XQuADRetrieval](https://huggingface.co/datasets/xquad) (Mikel Artetxe, 2019) | ['arb', 'deu', 'ell', 'eng', 'hin', 'ron', 'rus', 'spa', 'tha', 'tur', 'vie', 'zho'] | Retrieval | s2p | [Web, Written] | None | None | +| [XStance](https://github.com/ZurichNLP/xstance) | ['deu', 'fra', 'ita'] | PairClassification | s2s | [Social, Written] | None | None | +| [YahooAnswersTopicsClassification](https://huggingface.co/datasets/yahoo_answers_topics) (Zhang et al., 2015) | ['eng'] | Classification | s2s | [Web, Written] | None | None | +| [YelpReviewFullClassification](https://arxiv.org/abs/1509.01626) (Zhang et al., 2015) | ['eng'] | Classification | s2s | [Reviews, Written] | None | None | +| [YueOpenriceReviewClassification](https://github.com/Christainx/Dataset_Cantonese_Openrice) (Xiang et 
al., 2019) | ['yue'] | Classification | s2s | [Reviews, Spoken] | None | None | +| [indonli](https://link.springer.com/chapter/10.1007/978-3-030-41505-1_39) | ['ind'] | PairClassification | s2s | [Encyclopaedic, Web, News, Written] | None | None | +| [mFollowIRCrossLingualInstructionRetrieval](https://neuclir.github.io/) (Weller et al., 2024) | ['eng', 'fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | {'test': 121758} | {'test': {'num_samples': 121758, 'num_docs': 121635, 'num_queries': 123, 'number_of_characters': 283654099, 'average_document_length': 2331.08, 'average_query_length': 81.88, 'average_instruction_length': 389.95, 'average_changed_instruction_length': 450.55, 'average_relevant_docs_per_query': 10.43, 'average_top_ranked_per_query': 1000.0, 'hf_subset_descriptive_stats': {'eng-fas': {'num_samples': 41229, 'num_docs': 41189, 'num_queries': 40, 'number_of_characters': 129597567, 'average_document_length': 3145.5, 'average_query_length': 80.08, 'average_instruction_length': 396.88, 'average_changed_instruction_length': 463.18, 'average_relevant_docs_per_query': 10.85, 'average_top_ranked_per_query': 1000.0}, 'eng-rus': {'num_samples': 39366, 'num_docs': 39326, 'num_queries': 40, 'number_of_characters': 109522175, 'average_document_length': 2784.08, 'average_query_length': 81.88, 'average_instruction_length': 371.12, 'average_changed_instruction_length': 431.8, 'average_relevant_docs_per_query': 9.78, 'average_top_ranked_per_query': 1000.0}, 'eng-zho': {'num_samples': 41163, 'num_docs': 41120, 'num_queries': 43, 'number_of_characters': 44534357, 'average_document_length': 1082.05, 'average_query_length': 83.56, 'average_instruction_length': 401.02, 'average_changed_instruction_length': 456.26, 'average_relevant_docs_per_query': 10.65, 'average_top_ranked_per_query': 1000.0}}}} | +| [mFollowIRInstructionRetrieval](https://neuclir.github.io/) (Weller et al., 2024) | ['fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | {'test': 121758} | {'test': {'num_samples': 121758, 'num_docs': 121635, 'num_queries': 123, 'number_of_characters': 283622456, 'average_document_length': 2331.08, 'average_query_length': 57.11, 'average_instruction_length': 281.07, 'average_changed_instruction_length': 326.94, 'average_relevant_docs_per_query': 10.43, 'average_top_ranked_per_query': 1000.0, 'hf_subset_descriptive_stats': {'fas': {'num_samples': 41229, 'num_docs': 41189, 'num_queries': 40, 'number_of_characters': 129593838, 'average_document_length': 3145.5, 'average_query_length': 72.65, 'average_instruction_length': 358.93, 'average_changed_instruction_length': 415.32, 'average_relevant_docs_per_query': 10.85, 'average_top_ranked_per_query': 1000.0}, 'rus': {'num_samples': 39366, 'num_docs': 39326, 'num_queries': 40, 'number_of_characters': 109523683, 'average_document_length': 2784.08, 'average_query_length': 77.5, 'average_instruction_length': 387.0, 'average_changed_instruction_length': 458.0, 'average_relevant_docs_per_query': 9.78, 'average_top_ranked_per_query': 1000.0}, 'zho': {'num_samples': 41163, 'num_docs': 41120, 'num_queries': 43, 'number_of_characters': 44504935, 'average_document_length': 1082.05, 'average_query_length': 23.7, 'average_instruction_length': 110.09, 'average_changed_instruction_length': 122.81, 'average_relevant_docs_per_query': 10.65, 'average_top_ranked_per_query': 1000.0}}}} |
@@ -1662,4 +1662,4 @@ The following tables give you an overview of the tasks in MTEB. | Total | 1394 | 795 | 304 | 3 | 28 | 67 | 50 | 456 | 85 | 2 | 2 | - + \ No newline at end of file From b19335b606a626545401d04a892e502c34698741 Mon Sep 17 00:00:00 2001 From: Orion Weller Date: Sat, 9 Nov 2024 14:46:09 -0500 Subject: [PATCH 09/16] lint --- tests/test_benchmark/mock_tasks.py | 122 ++++++++++++++--------------- 1 file changed, 61 insertions(+), 61 deletions(-) diff --git a/tests/test_benchmark/mock_tasks.py b/tests/test_benchmark/mock_tasks.py index 92c2459fc9..f599a21990 100644 --- a/tests/test_benchmark/mock_tasks.py +++ b/tests/test_benchmark/mock_tasks.py @@ -883,7 +883,7 @@ class MockRerankingTask(AbsTaskReranking): metadata = TaskMetadata( type="Reranking", name="MockRerankingTask", - main_score="map_at_1000", + main_score="map_at_1000", **general_args, # type: ignore ) @@ -956,7 +956,7 @@ class MockMultilingualRerankingTask(AbsTaskReranking, MultilingualTask): "num_samples": 4, "number_of_characters": 56.0, }, - } + }, } } @@ -1249,18 +1249,18 @@ def load_data(self, **kwargs): class MockInstructionRetrieval(AbsTaskRetrieval): expected_stats = { - "test": { - "num_documents": 2, - "num_queries": 2, - "average_document_length": 15.0, - "average_query_length": 13.0, - "average_instruction_length": 29.0, - "average_relevant_docs_per_query": 1.0, - "average_top_ranked_per_query": 0, - "num_instructions": 2, - "num_samples": 4, - "number_of_characters": 56.0 - } + "test": { + "num_documents": 2, + "num_queries": 2, + "average_document_length": 15.0, + "average_query_length": 13.0, + "average_instruction_length": 29.0, + "average_relevant_docs_per_query": 1.0, + "average_top_ranked_per_query": 0, + "num_instructions": 2, + "num_samples": 4, + "number_of_characters": 56.0, + } } metadata = TaskMetadata( @@ -1303,16 +1303,16 @@ def load_data(self, **kwargs): class MockInstructionReranking(AbsTaskReranking): expected_stats = { "test": { - "num_documents": 2, - "num_queries": 2, - "num_instructions": 2, - "average_document_length": 15.0, - "average_query_length": 13.0, - "average_instruction_length": 29.0, - "average_relevant_docs_per_query": 1.0, - "average_top_ranked_per_query": 2.0, - "num_samples": 4, - "number_of_characters": 56.0 + "num_documents": 2, + "num_queries": 2, + "num_instructions": 2, + "average_document_length": 15.0, + "average_query_length": 13.0, + "average_instruction_length": 29.0, + "average_relevant_docs_per_query": 1.0, + "average_top_ranked_per_query": 2.0, + "num_samples": 4, + "number_of_characters": 56.0, } } @@ -1360,7 +1360,7 @@ def load_data(self, **kwargs): class MockMultilingualInstructionRetrieval(AbsTaskRetrieval, MultilingualTask): expected_stats = { - "test": { + "test": { "num_documents": 4, "num_queries": 4, "num_instructions": 4, @@ -1457,43 +1457,43 @@ def load_data(self, **kwargs): class MockMultilingualInstructionReranking(AbsTaskReranking, MultilingualTask): expected_stats = { "test": { - "num_documents": 4, - "num_queries": 4, - "num_instructions": 4, - "average_document_length": 7.5, - "average_query_length": 6.5, - "average_instruction_length": 29.0, - "average_relevant_docs_per_query": 1.0, - "average_top_ranked_per_query": 2.0, - "num_samples": 8, - "number_of_characters": 56.0, - "hf_subset_descriptive_stats": { - "eng": { - "num_documents": 2, - "num_queries": 2, - "num_instructions": 2, - "average_document_length": 15.0, - "average_query_length": 13.0, - "average_instruction_length": 29.0, - "average_relevant_docs_per_query": 1.0, - 
"average_top_ranked_per_query": 2.0, - "num_samples": 4, - "number_of_characters": 56.0, - }, - "fra": { - "num_documents": 2, - "num_queries": 2, - "num_instructions": 2, - "average_document_length": 15.0, - "average_query_length": 13.0, - "average_instruction_length": 29.0, - "average_relevant_docs_per_query": 1.0, - "average_top_ranked_per_query": 2.0, - "num_samples": 4, - "number_of_characters": 56.0, - }, + "num_documents": 4, + "num_queries": 4, + "num_instructions": 4, + "average_document_length": 7.5, + "average_query_length": 6.5, + "average_instruction_length": 29.0, + "average_relevant_docs_per_query": 1.0, + "average_top_ranked_per_query": 2.0, + "num_samples": 8, + "number_of_characters": 56.0, + "hf_subset_descriptive_stats": { + "eng": { + "num_documents": 2, + "num_queries": 2, + "num_instructions": 2, + "average_document_length": 15.0, + "average_query_length": 13.0, + "average_instruction_length": 29.0, + "average_relevant_docs_per_query": 1.0, + "average_top_ranked_per_query": 2.0, + "num_samples": 4, + "number_of_characters": 56.0, }, - } + "fra": { + "num_documents": 2, + "num_queries": 2, + "num_instructions": 2, + "average_document_length": 15.0, + "average_query_length": 13.0, + "average_instruction_length": 29.0, + "average_relevant_docs_per_query": 1.0, + "average_top_ranked_per_query": 2.0, + "num_samples": 4, + "number_of_characters": 56.0, + }, + }, + } } metadata = TaskMetadata( From 636fdfc527d265efc27c13acc24a8d62f5dcc7a8 Mon Sep 17 00:00:00 2001 From: Orion Weller Date: Sat, 9 Nov 2024 15:18:40 -0500 Subject: [PATCH 10/16] fix test --- mteb/abstasks/AbsTaskRetrieval.py | 5 ++++- mteb/abstasks/MultilingualTask.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/mteb/abstasks/AbsTaskRetrieval.py b/mteb/abstasks/AbsTaskRetrieval.py index 5a056aee32..0536e3de13 100644 --- a/mteb/abstasks/AbsTaskRetrieval.py +++ b/mteb/abstasks/AbsTaskRetrieval.py @@ -84,7 +84,10 @@ class AbsTaskRetrieval(AbsTask): def __init__(self, **kwargs): self.top_ranked = None self.instructions = None - super().__init__(**kwargs) + if isinstance(self, AbsTaskRetrieval): + super(AbsTaskRetrieval, self).__init__(**kwargs) + else: + super().__init__(**kwargs) def load_data(self, **kwargs): if self.data_loaded: diff --git a/mteb/abstasks/MultilingualTask.py b/mteb/abstasks/MultilingualTask.py index 84860a48ed..3fd007df6d 100644 --- a/mteb/abstasks/MultilingualTask.py +++ b/mteb/abstasks/MultilingualTask.py @@ -4,7 +4,7 @@ from .MultiSubsetLoader import MultiSubsetLoader -class MultilingualTask(AbsTask, MultiSubsetLoader): +class MultilingualTask(MultiSubsetLoader, AbsTask): def __init__(self, hf_subsets: list[str] | None = None, **kwargs): super().__init__(**kwargs) if isinstance(hf_subsets, list): From 47e80ba1d0656a558ef60cbcb26adf7e17ba8841 Mon Sep 17 00:00:00 2001 From: Orion Weller Date: Sat, 9 Nov 2024 15:19:44 -0500 Subject: [PATCH 11/16] qa --- mteb/abstasks/AbsTaskRetrieval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/abstasks/AbsTaskRetrieval.py b/mteb/abstasks/AbsTaskRetrieval.py index 0536e3de13..ee323fd9ca 100644 --- a/mteb/abstasks/AbsTaskRetrieval.py +++ b/mteb/abstasks/AbsTaskRetrieval.py @@ -85,7 +85,7 @@ def __init__(self, **kwargs): self.top_ranked = None self.instructions = None if isinstance(self, AbsTaskRetrieval): - super(AbsTaskRetrieval, self).__init__(**kwargs) + super(AbsTaskRetrieval, self).__init__(**kwargs) # noqa else: super().__init__(**kwargs) From ad0a3dbf26085505239ded5c408e7067b879047c Mon Sep 17 00:00:00 
2001 From: Orion Weller Date: Sat, 9 Nov 2024 18:31:39 -0500 Subject: [PATCH 12/16] updated mindsmall --- mteb/abstasks/AbsTaskRetrieval.py | 14 +++++++------- mteb/evaluation/evaluators/utils.py | 10 +++++++--- mteb/tasks/Reranking/eng/MindSmallReranking.py | 6 +----- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/mteb/abstasks/AbsTaskRetrieval.py b/mteb/abstasks/AbsTaskRetrieval.py index ee323fd9ca..a57532ce77 100644 --- a/mteb/abstasks/AbsTaskRetrieval.py +++ b/mteb/abstasks/AbsTaskRetrieval.py @@ -84,10 +84,8 @@ class AbsTaskRetrieval(AbsTask): def __init__(self, **kwargs): self.top_ranked = None self.instructions = None - if isinstance(self, AbsTaskRetrieval): - super(AbsTaskRetrieval, self).__init__(**kwargs) # noqa - else: - super().__init__(**kwargs) + # there could be multiple options, so do this even if multilingual + super(AbsTaskRetrieval, self).__init__(**kwargs) # noqa def load_data(self, **kwargs): if self.data_loaded: @@ -195,7 +193,8 @@ def _evaluate_subset( save_predictions = kwargs.get("save_predictions", False) export_errors = kwargs.get("export_errors", False) - if save_predictions or export_errors: + save_qrels = kwargs.get("save_qrels", False) + if save_predictions or export_errors or save_qrels: output_folder = Path(kwargs.get("output_folder", "results")) if not os.path.isdir(output_folder): os.makedirs(output_folder) @@ -219,7 +218,7 @@ def _evaluate_subset( with open(qrels_save_path, "w") as f: json.dump(results, f) - # save qrels also + if save_qrels: with open( output_folder / f"{self.metadata.name}_{hf_subset}_qrels.json", "w" ) as f: @@ -230,8 +229,9 @@ def _evaluate_subset( results, retriever.k_values, ignore_identical_ids=self.ignore_identical_ids, - task_name=self.metadata.name, + task_name=self.metadata.name ) + mrr, naucs_mrr = retriever.evaluate_custom( relevant_docs, results, retriever.k_values, "mrr" ) diff --git a/mteb/evaluation/evaluators/utils.py b/mteb/evaluation/evaluators/utils.py index fae3021604..89aef71bf9 100644 --- a/mteb/evaluation/evaluators/utils.py +++ b/mteb/evaluation/evaluators/utils.py @@ -560,8 +560,8 @@ def add_task_specific_scores( task_scores.update(p_mrr_and_consolidated_scores) if task_name in ["MindSmallReranking"]: - take_max_over_subqueries = max_over_subqueries(qrels, results, scores) - task_scores["max_over_subqueries"] = take_max_over_subqueries + take_max_over_subqueries = max_over_subqueries(qrels, results, k_values) + task_scores.update(take_max_over_subqueries) return task_scores @@ -699,6 +699,7 @@ def max_over_subqueries(qrels, results, k_values): query_keys["_".join(key.split("_")[:-1])].append(key) new_results = {} + new_qrels = {} for query_id_base, query_ids in query_keys.items(): doc_scores = defaultdict(float) for query_id_full in query_ids: @@ -709,10 +710,12 @@ def max_over_subqueries(qrels, results, k_values): doc_scores[doc_id] = max(score, doc_scores[doc_id]) new_results[query_id_base] = doc_scores + new_qrels[query_id_base] = qrels[query_id_full] # all the same + # now we have the new results, we can compute the scores _, ndcg, _map, recall, precision, naucs = calculate_retrieval_scores( - new_results, qrels, k_values + new_results, new_qrels, k_values ) score_dict = make_score_dict(ndcg, _map, recall, precision, {}, naucs, {}, {}) return {"max_over_subqueries_" + k: v for k, v in score_dict.items()} @@ -723,6 +726,7 @@ def calculate_retrieval_scores(results, qrels, k_values): ndcg_string = "ndcg_cut." + ",".join([str(k) for k in k_values]) recall_string = "recall." 
+ ",".join([str(k) for k in k_values]) precision_string = "P." + ",".join([str(k) for k in k_values]) + evaluator = pytrec_eval.RelevanceEvaluator( qrels, {map_string, ndcg_string, recall_string, precision_string} ) diff --git a/mteb/tasks/Reranking/eng/MindSmallReranking.py b/mteb/tasks/Reranking/eng/MindSmallReranking.py index 5a7ce8176c..f48ad7c6d9 100644 --- a/mteb/tasks/Reranking/eng/MindSmallReranking.py +++ b/mteb/tasks/Reranking/eng/MindSmallReranking.py @@ -141,8 +141,7 @@ def load_data(self, **kwargs): all_queries = [] all_positives = [] all_negatives = [] - all_ids = [] - all_instance_indices = [] # Renamed for clarity + all_instance_indices = [] all_subquery_indices = [] # First pass: expand queries while maintaining relationships @@ -157,9 +156,6 @@ def load_data(self, **kwargs): all_queries.append(query) all_positives.append(positives) # Same positives for each subquery all_negatives.append(negatives) # Same negatives for each subquery - all_ids.append( - f"{instance.get('id', current_instance_idx)}_{subquery_idx}" - ) all_instance_indices.append(current_instance_idx) all_subquery_indices.append(subquery_idx) From 88df96db83bbe638543a4eabfa69ba542c2fe7d3 Mon Sep 17 00:00:00 2001 From: Orion Weller Date: Sat, 9 Nov 2024 18:32:02 -0500 Subject: [PATCH 13/16] lint --- mteb/abstasks/AbsTaskRetrieval.py | 2 +- mteb/evaluation/evaluators/utils.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/mteb/abstasks/AbsTaskRetrieval.py b/mteb/abstasks/AbsTaskRetrieval.py index a57532ce77..c980adad45 100644 --- a/mteb/abstasks/AbsTaskRetrieval.py +++ b/mteb/abstasks/AbsTaskRetrieval.py @@ -229,7 +229,7 @@ def _evaluate_subset( results, retriever.k_values, ignore_identical_ids=self.ignore_identical_ids, - task_name=self.metadata.name + task_name=self.metadata.name, ) mrr, naucs_mrr = retriever.evaluate_custom( diff --git a/mteb/evaluation/evaluators/utils.py b/mteb/evaluation/evaluators/utils.py index 89aef71bf9..c2e8f8ba45 100644 --- a/mteb/evaluation/evaluators/utils.py +++ b/mteb/evaluation/evaluators/utils.py @@ -710,8 +710,7 @@ def max_over_subqueries(qrels, results, k_values): doc_scores[doc_id] = max(score, doc_scores[doc_id]) new_results[query_id_base] = doc_scores - new_qrels[query_id_base] = qrels[query_id_full] # all the same - + new_qrels[query_id_base] = qrels[query_id_full] # all the same # now we have the new results, we can compute the scores _, ndcg, _map, recall, precision, naucs = calculate_retrieval_scores( From 94ba50bc52a75babfce3bb6946b2fd93bb457ef5 Mon Sep 17 00:00:00 2001 From: Orion Weller Date: Sat, 9 Nov 2024 18:32:48 -0500 Subject: [PATCH 14/16] fix debug --- mteb/tasks/Reranking/eng/MindSmallReranking.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/mteb/tasks/Reranking/eng/MindSmallReranking.py b/mteb/tasks/Reranking/eng/MindSmallReranking.py index f48ad7c6d9..5c57e3f77d 100644 --- a/mteb/tasks/Reranking/eng/MindSmallReranking.py +++ b/mteb/tasks/Reranking/eng/MindSmallReranking.py @@ -133,8 +133,6 @@ def load_data(self, **kwargs): # Process each split for split in self.dataset: - if split == "train": - continue logging.info(f"Processing split {split}") # Pre-allocate lists for batch processing From 37a668d0db1199bf8af55d8d9eecdd21b2a26716 Mon Sep 17 00:00:00 2001 From: Orion Weller <31665361+orionw@users.noreply.github.com> Date: Mon, 11 Nov 2024 09:03:55 -0500 Subject: [PATCH 15/16] Update mteb/abstasks/dataloaders.py Co-authored-by: Roman Solomatin --- mteb/abstasks/dataloaders.py | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/mteb/abstasks/dataloaders.py b/mteb/abstasks/dataloaders.py index dc641e5970..79c29ac37a 100644 --- a/mteb/abstasks/dataloaders.py +++ b/mteb/abstasks/dataloaders.py @@ -98,8 +98,8 @@ def load( dict[str, dict[str, str]], # corpus dict[str, str | list[str]], # queries dict[str, dict[str, int]], # qrels/relevant_docs - dict[str, str | list[str]], # instructions (optional) - dict[str, list[str]] | dict[str, dict[str, float]], # top_ranked (optional) + dict[str, str | list[str]] | None, # instructions (optional) + dict[str, list[str]] | dict[str, dict[str, float]] | None, # top_ranked (optional) ]: if not self.hf_repo: self.qrels_file = os.path.join(self.qrels_folder, split + ".tsv") From 7688f84a920e0958ac6ef740a82edf37c33c39af Mon Sep 17 00:00:00 2001 From: Orion Weller Date: Wed, 13 Nov 2024 11:17:46 -0500 Subject: [PATCH 16/16] lint --- mteb/abstasks/dataloaders.py | 4 +++- mteb/evaluation/evaluators/utils.py | 9 ++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/mteb/abstasks/dataloaders.py b/mteb/abstasks/dataloaders.py index 79c29ac37a..0b6505c5ee 100644 --- a/mteb/abstasks/dataloaders.py +++ b/mteb/abstasks/dataloaders.py @@ -99,7 +99,9 @@ def load( dict[str, str | list[str]], # queries dict[str, dict[str, int]], # qrels/relevant_docs dict[str, str | list[str]] | None, # instructions (optional) - dict[str, list[str]] | dict[str, dict[str, float]] | None, # top_ranked (optional) + dict[str, list[str]] + | dict[str, dict[str, float]] + | None, # top_ranked (optional) ]: if not self.hf_repo: self.qrels_file = os.path.join(self.qrels_folder, split + ".tsv") diff --git a/mteb/evaluation/evaluators/utils.py b/mteb/evaluation/evaluators/utils.py index e934bf6089..e01e0ec463 100644 --- a/mteb/evaluation/evaluators/utils.py +++ b/mteb/evaluation/evaluators/utils.py @@ -13,7 +13,6 @@ from packaging.version import Version from sklearn.metrics import auc - logger = logging.getLogger(__name__) @@ -62,7 +61,9 @@ def _cos_sim_core(a_tensor, b_tensor): return torch.mm(a_norm, b_norm.transpose(0, 1)) # Compile the core function once - if hasattr(torch, "compile") and use_torch_compile(): # Check if torch.compile is available + if ( + hasattr(torch, "compile") and use_torch_compile() + ): # Check if torch.compile is available _cos_sim_core_compiled = torch.compile(_cos_sim_core) return _cos_sim_core_compiled(a, b) else: @@ -89,7 +90,9 @@ def _dot_score_core(a_tensor, b_tensor): return torch.mm(a_tensor, b_tensor.transpose(0, 1)) # Compile the core function once - if hasattr(torch, "compile") and use_torch_compile(): # Check if torch.compile is available + if ( + hasattr(torch, "compile") and use_torch_compile() + ): # Check if torch.compile is available _dot_score_core_compiled = torch.compile(_dot_score_core) return _dot_score_core_compiled(a, b) else: