# metric/llm_as_judge.py
from typing import Any, Dict, List

import evaluate

from .api import produce
from .inference import InferenceEngine
from .metrics import BulkInstanceMetric


class LLMAsJudge(BulkInstanceMetric):
"""LLM as judge based metric class for evaluating correctness.
Attributes:
main_score (str): The main score used for evaluation.
reduction_map (dict): A dictionary specifying the reduction method for the metric.
betch_size (int): The size of the bulk.
recipe (str): The unitxt recipe that will be used to create the judge dataset.
inference (InferenceEngine): the module that creates the inference.
Methods:
prepare(self): Initialization method for the metric.
compute(self, references, predictions, additional_inputs): Method to compute the metric.
Usage:
metric = LlamaIndexCorrectnessMetric()
scores = metric.compute(references, prediction, additional_inputs)
"""
main_score: str = "llm_as_judge"
reduction_map: Dict[str, List[str]] = None
batch_size: int = 32
recipe: str
    inference_model: InferenceEngine

    def prepare(self):
super().prepare()
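        # Default the reduction to averaging the main score across instances
        # when no reduction map is supplied.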
if self.reduction_map is None:
            self.reduction_map = {"mean": [self.main_score]}

    def compute(
self,
references: List[List[Any]],
predictions: List[Any],
task_data: List[Dict],
) -> List[Dict[str, Any]]:
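        # Pair every task_data instance with the corresponding model prediction,
        # plus a placeholder rating label that the judge recipe expects to be present.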
instances = [
{
**task_data_instance,
**{"model_output": prediction, "rating_label": "[[5]]"},
}
for task_data_instance, prediction in zip(task_data, predictions)
]
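        # Render the judge prompts by running the instances through the unitxt recipe.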
dataset = produce(instances, self.recipe)
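        # Let the judge model generate a verdict for each rendered prompt.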
verdicts = self.inference_model.infer(dataset)
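        # Score the raw verdicts with the unitxt meta-metric, which parses the
        # judge output into a numeric rating per instance.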
meta_metric = evaluate.load("unitxt/metric")
meta_scores = meta_metric.compute(predictions=verdicts, references=dataset)
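        # Report the parsed rating as this metric's main score for every instance.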
return [{self.main_score: instance["prediction"]} for instance in meta_scores]
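

# Example usage (a minimal sketch): the recipe string is a placeholder, and
# HFPipelineBasedInferenceEngine is assumed to be available from unitxt's
# inference module -- adjust both to your setup.
#
#     inference_model = HFPipelineBasedInferenceEngine(
#         model_name="google/flan-t5-large", max_new_tokens=32
#     )
#     metric = LLMAsJudge(
#         recipe="<unitxt recipe string for the judge task>",
#         inference_model=inference_model,
#     )
#     scores = metric.compute(references, predictions, task_data)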