from typing import Any, Dict, List

import evaluate

from .api import produce
from .inference import InferenceEngine
from .metrics import BulkInstanceMetric


class LLMAsJudge(BulkInstanceMetric):
"""LLM as judge based metric class for evaluating correctness.
Attributes:
main_score (str): The main score used for evaluation.
reduction_map (dict): A dictionary specifying the reduction method for the metric.
betch_size (int): The size of the bulk.
recipe (str): The unitxt recipe that will be used to create the judge dataset.
inference (InferenceEngine): the module that creates the inference.
Methods:
prepare(self): Initialization method for the metric.
compute(self, references, predictions, additional_inputs): Method to compute the metric.
Usage:
metric = LlamaIndexCorrectnessMetric()
scores = metric.compute(references, prediction, additional_inputs)
"""

    main_score: str = "llm_as_judge"
    reduction_map: Dict[str, List[str]] = None
    batch_size: int = 32
    recipe: str
    inference_model: InferenceEngine

    def prepare(self):
        super().prepare()
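        # If no reduction was configured, default to averaging the main score.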
        if self.reduction_map is None:
            self.reduction_map = {"mean": [self.main_score]}

    def compute(
        self,
        references: List[List[Any]],
        predictions: List[Any],
        task_data: List[Dict],
    ) -> List[Dict[str, Any]]:
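        # Pair every task_data record with the model prediction it should judge.
        # "rating_label" is a fixed placeholder that satisfies the judge task's
        # reference field; the actual score comes from the judge's verdict.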
        instances = [
            {
                **task_data_instance,
                **{"model_output": prediction, "rating_label": "[[5]]"},
            }
            for task_data_instance, prediction in zip(task_data, predictions)
        ]
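        # Build the judge prompts by running the instances through the unitxt recipe.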
        dataset = produce(instances, self.recipe)
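        # Run the judge model over the generated prompts to collect raw verdicts.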
        verdicts = self.inference_model.infer(dataset)
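        # Post-process the raw verdicts with unitxt's meta-metric, which applies
        # the recipe's postprocessors to turn each verdict into a final prediction.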
        meta_metric = evaluate.load("unitxt/metric")
        meta_scores = meta_metric.compute(predictions=verdicts, references=dataset)
        return [{self.main_score: instance["prediction"]} for instance in meta_scores]
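

# Example usage (a minimal sketch; the recipe string and the inference engine
# below are illustrative placeholders, not values defined in this module):
#
#   inference_model = ...  # any concrete InferenceEngine implementation
#   metric = LLMAsJudge(
#       recipe="<unitxt recipe that renders the judge prompt>",
#       inference_model=inference_model,
#   )
#   scores = metric.compute(references, predictions, task_data)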