from typing import Any, Dict, List

import evaluate

from .api import produce
from .inference import InferenceEngine
from .metrics import BulkInstanceMetric


class LLMAsJudge(BulkInstanceMetric):
    """LLM-as-judge based metric class for evaluating correctness.

    Attributes:
        main_score (str): The main score used for evaluation.
        reduction_map (dict): A dictionary specifying the reduction method for the metric.
        batch_size (int): The size of the bulk.
        recipe (str): The unitxt recipe that will be used to create the judge dataset.
        inference_model (InferenceEngine): The module that runs inference.

    Methods:
        prepare(self): Initialization method for the metric.
        compute(self, references, predictions, task_data): Method to compute the metric.

    Usage:
        metric = LLMAsJudge(recipe=..., inference_model=...)
        scores = metric.compute(references, predictions, task_data)
    """

    main_score: str = "llm_as_judge"
    reduction_map: Dict[str, List[str]] = None
    batch_size: int = 32
    recipe: str
    inference_model: InferenceEngine

    def prepare(self):
        super().prepare()
        if self.reduction_map is None:
            self.reduction_map = {"mean": [self.main_score]}

    def compute(
        self,
        references: List[List[Any]],
        predictions: List[Any],
        task_data: List[Dict],
    ) -> List[Dict[str, Any]]:
        # Build judge instances by pairing each task_data record with the model's
        # prediction; "rating_label" is a placeholder reference for the judge recipe.
        instances = [
            {
                **task_data_instance,
                **{"model_output": prediction, "rating_label": "[[5]]"},
            }
            for task_data_instance, prediction in zip(task_data, predictions)
        ]
        # Render the judge prompts with the configured recipe, run inference to get
        # verdicts, and score those verdicts with the unitxt meta-metric.
        dataset = produce(instances, self.recipe)
        verdicts = self.inference_model.infer(dataset)
        meta_metric = evaluate.load("unitxt/metric")
        meta_scores = meta_metric.compute(predictions=verdicts, references=dataset)
        return [{self.main_score: instance["prediction"]} for instance in meta_scores]
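

# Illustrative usage sketch (not part of the library code above). The recipe string
# and the inference engine constructor are hypothetical placeholders: the concrete
# card/template names and engine class depend on the unitxt catalog and installation,
# so treat this only as an outline of how the pieces fit together.
#
#     inference_model = SomeInferenceEngine(...)  # any InferenceEngine subclass
#     metric = LLMAsJudge(
#         recipe="card=<judge_card>,template=<judge_template>",  # placeholder recipe
#         inference_model=inference_model,
#     )
#     # references/predictions/task_data come from the evaluated task's dataset.
#     scores = metric.compute(references, predictions, task_data)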