Elron committed on
Commit
59be457
1 Parent(s): 88f4dd2

Upload folder using huggingface_hub

Files changed (19)
  1. _version.py +0 -1
  2. app.py +0 -3
  3. common.py +0 -104
  4. dataset.py +3 -7
  5. hf_utils.py +2 -2
  6. inference.py +124 -2
  7. load.py +0 -15
  8. metric.py +1 -6
  9. metrics.py +4 -1
  10. operators.py +60 -0
  11. processors.py +8 -0
  12. renderers.py +0 -132
  13. serializers.py +0 -130
  14. standard.py +6 -3
  15. task.py +68 -18
  16. templates.py +1 -1
  17. tests.py +0 -17
  18. type_utils.py +31 -0
  19. version.py +1 -1
_version.py DELETED
@@ -1 +0,0 @@
- def get_current_version(): return '1.0.31'

app.py DELETED
@@ -1,3 +0,0 @@
- from unitxt.ui import launch
-
- launch()

common.py DELETED
@@ -1,104 +0,0 @@
- from typing import Union
-
- from .card import TaskCard
- from .collections import ItemPicker, RandomPicker
- from .dataclass import OptionalField
- from .operator import SourceOperator
- from .recipe import Recipe, SequentialRecipe
- from .schema import ToUnitxtGroup
- from .splitters import RandomSampler, Sampler, SeparateSplit, SliceSplit, SpreadSplit
- from .stream import MultiStream
- from .templates import RenderTemplatedICL
-
-
- class CommonRecipe(Recipe, SourceOperator):
-     card: TaskCard
-     demos_pool_name: str = "demos_pool"
-     demos_taken_from: str = "train"
-     demos_pool_size: int = None
-     demos_field: str = "demos"
-     num_demos: int = None
-     sampler: Sampler = None
-     instruction_item: Union[str, int] = None
-     template_item: Union[str, int] = None
-     system_prompt: str = None
-
-     def verify(self):
-         super().verify()
-
-     def prepare(self):
-         steps = [
-             self.card.loader,
-         ]
-
-         if self.card.preprocess_steps is not None:
-             steps.extend(self.card.preprocess_steps)
-
-         steps.append(self.card.task)
-
-         if self.demos_pool_size is not None:
-             steps.append(
-                 SeparateSplit(
-                     from_split=self.demos_taken_from,
-                     to_split_names=[self.demos_pool_name, self.demos_taken_from],
-                     to_split_sizes=[int(self.demos_pool_size)],
-                 )
-             )
-
-         if self.num_demos is not None:
-             sampler = self.card.sampler
-
-             if self.sampler is not None:
-                 sampler = self.sampler
-
-             sampler.set_size(self.num_demos)
-
-             steps.append(
-                 SpreadSplit(
-                     source_stream=self.demos_pool_name,
-                     target_field=self.demos_field,
-                     sampler=sampler,
-                 )
-             )
-
-         if self.card.instructions is not None:
-             if not self.instruction_item is None:
-                 picker = ItemPicker(int(self.instruction_item))
-             else:
-                 picker = RandomPicker()
-             instruction = picker(self.card.instructions)
-         else:
-             instruction = None
-
-         if self.card.templates is not None:
-             if self.template_item is None:
-                 picker = RandomPicker()
-             else:
-                 picker = ItemPicker(self.template_item)
-             template = picker(self.card.templates)
-         else:
-             template = None
-
-         render = RenderTemplatedICL(
-             instruction=instruction,
-             template=template,
-             demos_field=self.demos_field,
-             system_prompt=self.system_prompt,
-         )
-
-         steps.append(render)
-
-         postprocessors = render.get_postprocessors()
-
-         steps.append(
-             ToUnitxtGroup(
-                 group="unitxt",
-                 metrics=self.card.task.metrics,
-                 postprocessors=postprocessors,
-             )
-         )
-
-         self.recipe = SequentialRecipe(steps)
-
-     def process(self) -> MultiStream:
-         return self.recipe()

dataset.py CHANGED
@@ -10,7 +10,6 @@ from .catalog import __file__ as _
  from .collections import __file__ as _
  from .collections_operators import __file__ as _
  from .dataclass import __file__ as _
- from .dataset_utils import __file__ as _
  from .dataset_utils import get_dataset_artifact
  from .deprecation_utils import __file__ as _
  from .dialog_operators import __file__ as _
@@ -20,13 +19,11 @@ from .file_utils import __file__ as _
  from .formats import __file__ as _
  from .fusion import __file__ as _
  from .generator_utils import __file__ as _
- from .hf_utils import __file__ as _
  from .hf_utils import verify_versions_compatibility
  from .inference import __file__ as _
  from .instructions import __file__ as _
  from .llm_as_judge import __file__ as _
  from .loaders import __file__ as _
- from .logging_utils import __file__ as _
  from .logging_utils import get_logger
  from .metric import __file__ as _
  from .metric_utils import __file__ as _
@@ -40,7 +37,6 @@ from .random_utils import __file__ as _
  from .recipe import __file__ as _
  from .register import __file__ as _
  from .schema import __file__ as _
- from .settings_utils import __file__ as _
  from .settings_utils import get_constants
  from .span_lableing_operators import __file__ as _
  from .split_utils import __file__ as _
@@ -54,7 +50,6 @@ from .task import __file__ as _
  from .templates import __file__ as _
  from .text_utils import __file__ as _
  from .type_utils import __file__ as _
- from .utils import __file__ as _
  from .utils import is_package_installed
  from .validate import __file__ as _
  from .version import __file__ as _
@@ -75,8 +70,9 @@ class Dataset(datasets.GeneratorBasedBuilder):
          if is_package_installed("unitxt"):
              verify_versions_compatibility("dataset", self.VERSION)
 
-             from unitxt.dataset_utils import \
-                 get_dataset_artifact as get_dataset_artifact_installed
+             from unitxt.dataset_utils import (
+                 get_dataset_artifact as get_dataset_artifact_installed,
+             )
 
              logger.info("Loading with installed unitxt library...")
              dataset = get_dataset_artifact_installed(self.config.name)

hf_utils.py CHANGED
@@ -24,9 +24,9 @@ class UnitxtVersionsConflictError(ValueError):
      def __init__(self, error_in: str, hf_unitxt_version, installed_unitxt_version):
          assert hf_unitxt_version != installed_unitxt_version
          if compare_versions(hf_unitxt_version, installed_unitxt_version) == 1:
-             msg = f"Located installed unitxt version {installed_unitxt_version} that is older then unitxt {error_in} version {hf_unitxt_version}. Please update unitxt package or uninstall it to avoid conflicts."
+             msg = f"Located installed unitxt version {installed_unitxt_version} that is older than unitxt {error_in} version {hf_unitxt_version}. Please update unitxt package or uninstall it to avoid conflicts."
          if compare_versions(hf_unitxt_version, installed_unitxt_version) == -1:
-             msg = f"Located installed unitxt version {installed_unitxt_version} that is newer then unitxt {error_in} version {hf_unitxt_version}. Please force-reload the {error_in} or downgrade unitxt to {error_in} version or uninstall unitxt to avoid conflicts."
+             msg = f"Located installed unitxt version {installed_unitxt_version} that is newer than unitxt {error_in} version {hf_unitxt_version}. Please force-reload the {error_in} or downgrade unitxt to {error_in} version or uninstall unitxt to avoid conflicts."
          super().__init__(msg)

inference.py CHANGED
@@ -1,6 +1,11 @@
  import abc
+ import os
+ from dataclasses import dataclass
+ from typing import List, Optional, Union
 
  from .artifact import Artifact
+ from .operator import PackageRequirementsMixin
+ from .settings_utils import get_settings
 
 
  class InferenceEngine(abc.ABC, Artifact):
@@ -11,12 +16,21 @@ class InferenceEngine(abc.ABC, Artifact):
          """Perform inference on the input dataset."""
          pass
 
+     @staticmethod
+     def _assert_allow_passing_data_to_remote_api(remote_api_label: str):
+         assert get_settings().allow_passing_data_to_remote_api, (
+             f"LlmAsJudge metric cannot run send data to remote APIs ({remote_api_label}) when"
+             f" unitxt.settings.allow_passing_data_to_remote_api=False."
+             f" Set UNITXT_ALLOW_PASSING_DATA_TO_REMOTE_API environment variable, if you want to allow this. "
+         )
 
 
- class HFPipelineBasedInferenceEngine(Artifact):
-     """Abstract base class for inference."""
-
+ class HFPipelineBasedInferenceEngine(InferenceEngine, PackageRequirementsMixin):
      model_name: str
      max_new_tokens: int
+     _requirement = {
+         "transformers": "Install huggingface package using 'pip install --upgrade transformers"
+     }
 
      def prepare(self):
          from transformers import pipeline
@@ -31,3 +45,111 @@ class HFPipelineBasedInferenceEngine(Artifact):
                  max_new_tokens=self.max_new_tokens,
              )
          ]
+
+
+ @dataclass()
+ class IbmGenAiInferenceEngineParams:
+     decoding_method: str = None
+     max_new_tokens: Optional[int] = None
+     min_new_tokens: Optional[int] = None
+     random_seed: Optional[int] = None
+     repetition_penalty: Optional[float] = None
+     stop_sequences: Optional[List[str]] = None
+     temperature: Optional[float] = None
+     top_k: Optional[int] = None
+     top_p: Optional[float] = None
+     typical_p: Optional[float] = None
+
+
+ class IbmGenAiInferenceEngine(InferenceEngine, PackageRequirementsMixin):
+     label: str = "ibm_genai"
+     model_name: str
+     parameters: IbmGenAiInferenceEngineParams = IbmGenAiInferenceEngineParams()
+     _requirement = {
+         "genai": "Install ibm-genai package using 'pip install --upgrade ibm-generative-ai"
+     }
+
+     def prepare(self):
+         from genai import Client, Credentials
+
+         api_key_env_var_name = "GENAI_KEY"
+         api_key = os.environ.get(api_key_env_var_name)
+         assert api_key is not None, (
+             f"Error while trying to run IbmGenAiInferenceEngine."
+             f" Please set the environment param '{api_key_env_var_name}'."
+         )
+         api_endpoint = os.environ.get("GENAI_KEY")
+         credentials = Credentials(api_key=api_key, api_endpoint=api_endpoint)
+         self.client = Client(credentials=credentials)
+
+         self._assert_allow_passing_data_to_remote_api(self.label)
+
+     def infer(self, dataset):
+         from genai.schema import TextGenerationParameters
+
+         genai_params = TextGenerationParameters(**self.parameters.__dict__)
+         return list(
+             self.client.text.generation.create(
+                 model_id=self.model_name,
+                 inputs=[instance["source"] for instance in dataset],
+                 parameters=genai_params,
+             )
+         )
+
+
+ @dataclass
+ class OpenAiInferenceEngineParams:
+     frequency_penalty: Optional[float] = None
+     presence_penalty: Optional[float] = None
+     max_tokens: Optional[int] = None
+     seed: Optional[int] = None
+     stop: Union[Optional[str], List[str]] = None
+     temperature: Optional[float] = None
+     top_p: Optional[float] = None
+
+
+ class OpenAiInferenceEngine(InferenceEngine, PackageRequirementsMixin):
+     label: str = "openai"
+     model_name: str
+     parameters: OpenAiInferenceEngineParams = OpenAiInferenceEngineParams()
+     _requirement = {
+         "openai": "Install openai package using 'pip install --upgrade openai"
+     }
+
+     def prepare(self):
+         from openai import OpenAI
+
+         api_key_env_var_name = "OPENAI_API_KEY"
+         api_key = os.environ.get(api_key_env_var_name)
+         assert api_key is not None, (
+             f"Error while trying to run OpenAiInferenceEngine."
+             f" Please set the environment param '{api_key_env_var_name}'."
+         )
+
+         self.client = OpenAI(api_key=api_key)
+         self._assert_allow_passing_data_to_remote_api(self.label)
+
+     def infer(self, dataset):
+         return [
+             self.client.chat.completions.create(
+                 messages=[
+                     # {
+                     #     "role": "system",
+                     #     "content": self.system_prompt,
+                     # },
+                     {
+                         "role": "user",
+                         "content": instance["source"],
+                     }
+                 ],
+                 model=self.model_name,
+                 frequency_penalty=self.parameters.frequency_penalty,
+                 presence_penalty=self.parameters.presence_penalty,
+                 max_tokens=self.parameters.max_tokens,
+                 seed=self.parameters.seed,
+                 stop=self.parameters.stop,
+                 temperature=self.parameters.temperature,
+                 top_p=self.parameters.top_p,
+             )
+             for instance in dataset
+         ]
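
For context, a minimal sketch of how the new engines are meant to be driven, assuming OPENAI_API_KEY is set, unitxt settings permit remote APIs, and a dataset of instances with a "source" field (the model name and instance are illustrative, not part of the commit):

```python
# Sketch only: OpenAiInferenceEngineParams and OpenAiInferenceEngine are the
# classes added in this diff; prepare() is assumed to run via the Artifact
# lifecycle before infer() is called.
engine = OpenAiInferenceEngine(
    model_name="gpt-3.5-turbo",  # illustrative model id
    parameters=OpenAiInferenceEngineParams(max_tokens=256, temperature=0.0),
)
dataset = [{"source": "Translate to French: Hello"}]  # illustrative instance
completions = engine.infer(dataset)
```
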
load.py DELETED
@@ -1,15 +0,0 @@
- from typing import Union
-
- from datasets import DatasetDict
-
- from .artifact import fetch_artifact
- from .operator import StreamSource
-
-
- def load_dataset(source: Union[StreamSource, str]) -> DatasetDict:
-     assert isinstance(
-         source, (StreamSource, str)
-     ), "source must be a StreamSource or a string"
-     if isinstance(source, str):
-         source, _ = fetch_artifact(source)
-     return source().to_dataset()

metric.py CHANGED
@@ -19,16 +19,13 @@ from .file_utils import __file__ as _
  from .formats import __file__ as _
  from .fusion import __file__ as _
  from .generator_utils import __file__ as _
- from .hf_utils import __file__ as _
  from .hf_utils import verify_versions_compatibility
  from .inference import __file__ as _
  from .instructions import __file__ as _
  from .llm_as_judge import __file__ as _
  from .loaders import __file__ as _
  from .logging_utils import __file__ as _
- from .metric_utils import UNITXT_METRIC_SCHEMA
- from .metric_utils import __file__ as _
- from .metric_utils import _compute
+ from .metric_utils import UNITXT_METRIC_SCHEMA, _compute
  from .metrics import __file__ as _
  from .normalizers import __file__ as _
  from .operator import __file__ as _
@@ -39,7 +36,6 @@ from .random_utils import __file__ as _
  from .recipe import __file__ as _
  from .register import __file__ as _
  from .schema import __file__ as _
- from .settings_utils import __file__ as _
  from .settings_utils import get_constants
  from .span_lableing_operators import __file__ as _
  from .split_utils import __file__ as _
@@ -53,7 +49,6 @@ from .task import __file__ as _
  from .templates import __file__ as _
  from .text_utils import __file__ as _
  from .type_utils import __file__ as _
- from .utils import __file__ as _
  from .utils import is_package_installed
  from .validate import __file__ as _
  from .version import __file__ as _
metrics.py CHANGED
@@ -2255,6 +2255,8 @@ class Perplexity(BulkInstanceMetric):
              self.model_class().from_pretrained(self.model_name).to(self.device)
          )
          self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+         if self.tokenizer.pad_token_id is None:
+             self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
          self.single_token_mode = single_token_mode
 
      def compute_lm(
@@ -3348,7 +3350,8 @@ class BinaryMaxF1(F1Binary):
 
          best_thr = -1
          best_f1 = -1
-         for thr in set(float_predictions):
+         thrs = {round(fp, 3) for fp in float_predictions}
+         for thr in thrs:
              new_predictions = [
                  "1" if float_prediction >= thr else "0"
                  for float_prediction in float_predictions
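
The BinaryMaxF1 change collapses near-duplicate candidate thresholds by rounding to three decimals before scanning for the best F1. A standalone sketch of the effect:

```python
# Four unique floats, but two are indistinguishable at 3-decimal precision,
# so rounding leaves only three thresholds to scan.
float_predictions = [0.50001, 0.50002, 0.9, 0.1]
thrs = {round(fp, 3) for fp in float_predictions}
assert thrs == {0.5, 0.9, 0.1}
```
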
operators.py CHANGED
@@ -1704,6 +1704,66 @@ class Shuffle(PagedStreamOperator):
          yield from page
 
 
+ class FeatureGroupedShuffle(Shuffle):
+     """Class for shuffling an input dataset by instance 'blocks', not on the individual instance level.
+
+     Example is if the dataset consists of questions with paraphrases of it, and each question falls into a topic.
+     All paraphrases have the same ID value as the original.
+     In this case, we may want to shuffle on grouping_features = ['question ID'],
+     to keep the paraphrases and original question together.
+     We may also want to group by both 'question ID' and 'topic', if the question IDs are repeated between topics.
+     In this case, grouping_features = ['question ID', 'topic']
+
+     Args:
+         grouping_features (list of strings): list of feature names to use to define the groups.
+             a group is defined by each unique observed combination of data values for features in grouping_features
+         shuffle_within_group (bool): whether to further shuffle the instances within each group block, keeping the block order
+
+     Args (of superclass):
+         page_size (int): The size of each page in the stream. Defaults to 1000.
+             Note: shuffle_by_grouping_features determines the unique groups (unique combinations of values of grouping_features)
+             separately by page (determined by page_size). If a block of instances in the same group are split
+             into separate pages (either by a page break falling in the group, or the dataset was not sorted by
+             grouping_features), these instances will be shuffled separately and thus the grouping may be
+             broken up by pages. If the user wants to ensure the shuffle does the grouping and shuffling
+             across all pages, set the page_size to be larger than the dataset size.
+             See outputs_2features_bigpage and outputs_2features_smallpage in test_grouped_shuffle.
+     """
+
+     grouping_features: List[str] = None
+     shuffle_within_group: bool = False
+
+     def process(self, page: List[Dict], stream_name: Optional[str] = None) -> Generator:
+         if self.grouping_features is None:
+             super().process(page, stream_name)
+         else:
+             yield from self.shuffle_by_grouping_features(page)
+
+     def shuffle_by_grouping_features(self, page):
+         import itertools
+         from collections import defaultdict
+
+         groups_to_instances = defaultdict(list)
+         for item in page:
+             groups_to_instances[
+                 tuple(item[ff] for ff in self.grouping_features)
+             ].append(item)
+         # now extract the groups (i.e., lists of dicts with order preserved)
+         page_blocks = list(groups_to_instances.values())
+         # and now shuffle the blocks
+         self.random_generator.shuffle(page_blocks)
+         if self.shuffle_within_group:
+             blocks = []
+             # reshuffle the instances within each block, but keep the blocks in order
+             for block in page_blocks:
+                 self.random_generator.shuffle(block)
+                 blocks.append(block)
+             page_blocks = blocks
+
+         # now flatten the list so it consists of individual dicts, but in (randomized) block order
+         return list(itertools.chain(*page_blocks))
+
+
  class EncodeLabels(StreamInstanceOperator):
      """Encode each value encountered in any field in 'fields' into the integers 0,1,...
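
To see what the grouped shuffle buys, here is a self-contained sketch that mirrors the logic of shuffle_by_grouping_features (the instances and field names are illustrative):

```python
import itertools
import random
from collections import defaultdict

page = [
    {"question ID": 1, "text": "original q1"},
    {"question ID": 1, "text": "paraphrase of q1"},
    {"question ID": 2, "text": "original q2"},
]

# Group instances by their 'question ID', shuffle the blocks, then flatten.
groups = defaultdict(list)
for item in page:
    groups[(item["question ID"],)].append(item)
blocks = list(groups.values())
random.Random(42).shuffle(blocks)
shuffled = list(itertools.chain(*blocks))
# However the blocks land, the paraphrase stays adjacent to its original.
```
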
 
processors.py CHANGED
@@ -46,6 +46,14 @@ class RegexParser(FieldOperator):
          return re.findall(self.regex, text)
 
 
+ class ExtractWithRegex(RegexParser):
+     def process_value(self, text: Any) -> Any:
+         matches = super().process_value(text)
+         if matches:
+             return matches[0]
+         return ""
+
+
  class LoadJson(FieldOperator):
      def process_value(self, text: Any) -> Any:
          try:
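
ExtractWithRegex narrows RegexParser from "all matches" to "first match, or empty string". A standalone illustration of the behavior it adds:

```python
import re

def extract_with_regex(regex: str, text: str) -> str:
    # Same logic as the new operator: findall, then keep only the first hit.
    matches = re.findall(regex, text)
    return matches[0] if matches else ""

assert extract_with_regex(r"\d+", "score: 42 out of 100") == "42"
assert extract_with_regex(r"\d+", "no digits here") == ""
```
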
renderers.py DELETED
@@ -1,132 +0,0 @@
- from abc import ABC
- from typing import Any, Dict, List, Optional
-
- from .dataclass import InternalField
- from .formats import Format, ICLFormat
- from .instructions import Instruction
- from .operator import Operator, SequentialOperator, StreamInstanceOperator
- from .random_utils import get_random
- from .templates import Template
-
-
- class Renderer(ABC):
-     pass
-     # @abstractmethod
-     # def get_postprocessors(self) -> List[str]:
-     #     pass
-
-
- class RenderTemplate(Renderer, StreamInstanceOperator):
-     template: Template
-     random_reference: bool = False
-     skip_rendered_instance: bool = True
-
-     def process(
-         self, instance: Dict[str, Any], stream_name: Optional[str] = None
-     ) -> Dict[str, Any]:
-         if self.skip_rendered_instance:
-             if (
-                 "inputs" not in instance
-                 and "outputs" not in instance
-                 and "source" in instance
-                 and "target" in instance
-                 and "references" in instance
-             ):
-                 return instance
-
-         inputs = instance["inputs"]
-         outputs = instance["outputs"]
-
-         source = self.template.process_inputs(inputs)
-         targets = self.template.process_outputs(outputs)
-
-         if self.template.is_multi_reference:
-             assert isinstance(targets, list), f"{targets} must be a list"
-             references = targets
-             if self.random_reference:
-                 target = get_random().choice(references)
-             else:
-                 if len(references) == 0:
-                     raise ValueError("No references found")
-                 target = references[0]
-         else:
-             references = [targets]
-             target = targets
-
-         instance.update(
-             {
-                 "source": source,
-                 "target": target,
-                 "references": references,
-             }
-         )
-
-         return instance
-
-
- class RenderDemonstrations(RenderTemplate):
-     demos_field: str
-
-     def process(
-         self, instance: Dict[str, Any], stream_name: Optional[str] = None
-     ) -> Dict[str, Any]:
-         demos = instance.get(self.demos_field, [])
-
-         processed_demos = []
-         for demo_instance in demos:
-             demo_instance = super().process(demo_instance)
-             processed_demos.append(demo_instance)
-
-         instance[self.demos_field] = processed_demos
-
-         return instance
-
-
- class RenderInstruction(Renderer, StreamInstanceOperator):
-     instruction: Instruction
-
-     def process(
-         self, instance: Dict[str, Any], stream_name: Optional[str] = None
-     ) -> Dict[str, Any]:
-         if self.instruction is not None:
-             instance["instruction"] = self.instruction()
-         else:
-             instance["instruction"] = ""
-         return instance
-
-
- class RenderFormat(Renderer, StreamInstanceOperator):
-     format: Format
-     demos_field: str = None
-
-     def process(
-         self, instance: Dict[str, Any], stream_name: Optional[str] = None
-     ) -> Dict[str, Any]:
-         demos_instances = instance.pop(self.demos_field, None)
-         if demos_instances is not None:
-             instance["source"] = self.format.format(
-                 instance, demos_instances=demos_instances
-             )
-         else:
-             instance["source"] = self.format.format(instance)
-         return instance
-
-
- class StandardRenderer(Renderer, SequentialOperator):
-     template: Template
-     instruction: Instruction = None
-     demos_field: str = None
-     format: ICLFormat = None
-
-     steps: List[Operator] = InternalField(default_factory=list)
-
-     def prepare(self):
-         self.steps = [
-             RenderTemplate(template=self.template),
-             RenderDemonstrations(template=self.template, demos_field=self.demos_field),
-             RenderInstruction(instruction=self.instruction),
-             RenderFormat(format=self.format, demos_field=self.demos_field),
-         ]
-
-     def get_postprocessors(self):
-         return self.template.get_postprocessors()

serializers.py DELETED
@@ -1,130 +0,0 @@
- from abc import ABC, abstractmethod
- from copy import deepcopy
- from typing import (
-     Any,
-     Dict,
-     List,
- )
-
- from .operators import FieldOperator
-
- """
- TableSerializer converts a given table into a flat sequence with special symbols.
- Input table format must be:
- {"header": ["col1", "col2"], "rows": [["row11", "row12"], ["row21", "row22"], ["row31", "row32"]]}
- Output format varies depending on the chosen serializer. Abstract class at the top defines structure of a typical table serializer that any concrete implementation should follow.
- """
-
-
- class TableSerializer(ABC, FieldOperator):
-     # main method to serialize a table
-     @abstractmethod
-     def serialize_table(self, table_content: Dict) -> str:
-         pass
-
-     # method to process table header
-     @abstractmethod
-     def process_header(self, header: List):
-         pass
-
-     # method to process a table row
-     @abstractmethod
-     def process_row(self, row: List, row_index: int):
-         pass
-
-
- # Concrete classes implementing table serializers follow..
- """
- Indexed Row Major Table Serializer.
- Commonly used row major serialization format.
- Format: col : col1 | col2 | col 3 row 1 : val1 | val2 | val3 | val4 row 2 : val1 | ...
- """
-
-
- class IndexedRowMajorTableSerializer(TableSerializer):
-     def process_value(self, table: Any) -> Any:
-         table_input = deepcopy(table)
-         return self.serialize_table(table_content=table_input)
-
-     # main method that processes a table
-     # table_content must be in the prescribed input format
-     def serialize_table(self, table_content: Dict) -> str:
-         # Extract headers and rows from the dictionary
-         header = table_content.get("header", [])
-         rows = table_content.get("rows", [])
-
-         assert header and rows, "Incorrect input table format"
-
-         # Process table header first
-         serialized_tbl_str = self.process_header(header) + " "
-
-         # Process rows sequentially starting from row 1
-         for i, row in enumerate(rows, start=1):
-             serialized_tbl_str += self.process_row(row, row_index=i) + " "
-
-         # return serialized table as a string
-         return serialized_tbl_str.strip()
-
-     # serialize header into a string containing the list of column names separated by '|' symbol
-     def process_header(self, header: List):
-         return "col : " + " | ".join(header)
-
-     # serialize a table row into a string containing the list of cell values separated by '|'
-     def process_row(self, row: List, row_index: int):
-         serialized_row_str = ""
-         row_cell_values = [
-             str(value) if isinstance(value, (int, float)) else value for value in row
-         ]
-
-         serialized_row_str += " | ".join(row_cell_values)
-
-         return f"row {row_index} : {serialized_row_str}"
-
-
- """
- Markdown Table Serializer.
- Markdown table format is used in GitHub code primarily.
- Format:
- |col1|col2|col3|
- |---|---|---|
- |A|4|1|
- |I|2|1|
- ...
- """
-
-
- class MarkdownTableSerializer(TableSerializer):
-     def process_value(self, table: Any) -> Any:
-         table_input = deepcopy(table)
-         return self.serialize_table(table_content=table_input)
-
-     # main method that serializes a table.
-     # table_content must be in the prescribed input format.
-     def serialize_table(self, table_content: Dict) -> str:
-         # Extract headers and rows from the dictionary
-         header = table_content.get("header", [])
-         rows = table_content.get("rows", [])
-
-         assert header and rows, "Incorrect input table format"
-
-         # Process table header first
-         serialized_tbl_str = self.process_header(header)
-
-         # Process rows sequentially starting from row 1
-         for i, row in enumerate(rows, start=1):
-             serialized_tbl_str += self.process_row(row, row_index=i)
-
-         # return serialized table as a string
-         return serialized_tbl_str.strip()
-
-     # serialize header into a string containing the list of column names
-     def process_header(self, header: List):
-         header_str = "|{}|\n".format("|".join(header))
-         header_str += "|{}|\n".format("|".join(["---"] * len(header)))
-         return header_str
-
-     # serialize a table row into a string containing the list of cell values
-     def process_row(self, row: List, row_index: int):
-         row_str = ""
-         row_str += "|{}|\n".format("|".join(str(cell) for cell in row))
-         return row_str

standard.py CHANGED
@@ -187,6 +187,11 @@ class BaseRecipe(Recipe, SourceSequentialOperator):
          return list(multi_stream["__inference__"])
 
      def prepare(self):
+         # To avoid the Python's mutable default list trap, we set the default value to None
+         # and then set it to an empty list if it is None.
+         if self.card.preprocess_steps is None:
+             self.card.preprocess_steps = []
+
          self.set_pipelines()
 
          loader = self.card.loader
@@ -220,7 +225,7 @@ class BaseRecipe(Recipe, SourceSequentialOperator):
          self.augmentor.set_task_input_fields(self.card.task.augmentable_inputs)
          self.processing.steps.append(self.augmentor)
 
-         if self.demos_pool_size is not None:
+         if self.num_demos > 0:
              self.processing.steps.append(
                  CreateDemosPool(
                      from_split=self.demos_taken_from,
@@ -229,8 +234,6 @@ class BaseRecipe(Recipe, SourceSequentialOperator):
                      remove_targets_from_source_split=self.demos_removed_from_data,
                  )
              )
-
-         if self.num_demos > 0:
              if self.sampler is None:
                  if self.card.sampler is None:
                      raise ValueError(
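
The comment added in prepare() refers to the classic Python pitfall where a mutable default argument is shared across calls; a minimal illustration of why the None-then-assign pattern is used:

```python
def bad(steps=[]):  # one shared list, created once at definition time
    steps.append("x")
    return steps

bad()
assert bad() == ["x", "x"]  # state leaks between calls

def good(steps=None):  # the pattern the commit adopts
    steps = [] if steps is None else steps
    steps.append("x")
    return steps

assert good() == ["x"]  # fresh list on every call
```
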
task.py CHANGED
@@ -1,6 +1,9 @@
- from typing import Any, Dict, List, Optional
+ from typing import Any, Dict, List, Optional, Union
 
+ from .artifact import fetch_artifact
+ from .logging_utils import get_logger
  from .operator import StreamInstanceOperator
+ from .type_utils import isoftype, parse_type_string, verify_required_schema
 
 
  class Tasker:
@@ -10,41 +13,88 @@ class Tasker:
  class FormTask(Tasker, StreamInstanceOperator):
      """FormTask packs the different instance fields into dictionaries by their roles in the task.
 
+     Attributes:
+         inputs (Union[Dict[str, str], List[str]]):
+             Dictionary with string names of instance input fields and types of respective values.
+             In case a list is passed, each type will be assumed to be Any.
+         outputs (Union[Dict[str, str], List[str]]):
+             Dictionary with string names of instance output fields and types of respective values.
+             In case a list is passed, each type will be assumed to be Any.
+         metrics (List[str]): List of names of metrics to be used in the task.
+         prediction_type (Optional[str]):
+             Need to be consistent with all used metrics. Defaults to None, which means that it will
+             be set to Any.
+
      The output instance contains three fields:
          "inputs" whose value is a sub-dictionary of the input instance, consisting of all the fields listed in Arg 'inputs'.
          "outputs" -- for the fields listed in Arg "outputs".
          "metrics" -- to contain the value of Arg 'metrics'
-
      """
 
-     inputs: List[str]
-     outputs: List[str]
+     inputs: Union[Dict[str, str], List[str]]
+     outputs: Union[Dict[str, str], List[str]]
      metrics: List[str]
+     prediction_type: Optional[str] = None
      augmentable_inputs: List[str] = []
 
      def verify(self):
+         for io_type in ["inputs", "outputs"]:
+             data = self.inputs if io_type == "inputs" else self.outputs
+             if not isoftype(data, Dict[str, str]):
+                 get_logger().warning(
+                     f"'{io_type}' field of Task should be a dictionary of field names and their types. "
+                     f"For example, {{'text': 'str', 'classes': 'List[str]'}}. Instead only '{data}' was "
+                     f"passed. All types will be assumed to be 'Any'. In future version of unitxt this "
+                     f"will raise an exception."
+                 )
+                 data = {key: "Any" for key in data}
+                 if io_type == "inputs":
+                     self.inputs = data
+                 else:
+                     self.outputs = data
+
+         if not self.prediction_type:
+             get_logger().warning(
+                 "'prediction_type' was not set in Task. It is used to check the output of "
+                 "template post processors is compatible with the expected input of the metrics. "
+                 "Setting `prediction_type` to 'Any' (no checking is done). In future version "
+                 "of unitxt this will raise an exception."
+             )
+             self.prediction_type = "Any"
+
+         self.check_metrics_type()
+
          for augmentable_input in self.augmentable_inputs:
             assert (
                 augmentable_input in self.inputs
             ), f"augmentable_input {augmentable_input} is not part of {self.inputs}"
 
+     def check_metrics_type(self) -> None:
+         prediction_type = parse_type_string(self.prediction_type)
+         for metric_name in self.metrics:
+             metric = fetch_artifact(metric_name)[0]
+             metric_prediction_type = metric.get_prediction_type()
+
+             if (
+                 prediction_type == metric_prediction_type
+                 or prediction_type == Any
+                 or metric_prediction_type == Any
+             ):
+                 continue
+
+             raise ValueError(
+                 f"The task's prediction type ({prediction_type}) and '{metric_name}' "
+                 f"metric's prediction type ({metric_prediction_type}) are different."
+             )
+
      def process(
          self, instance: Dict[str, Any], stream_name: Optional[str] = None
      ) -> Dict[str, Any]:
-         try:
-             inputs = {key: instance[key] for key in self.inputs}
-         except KeyError as e:
-             raise KeyError(
-                 f"Unexpected FormTask input column names ({[key for key in self.inputs if key not in instance]})."
-                 f"The available input names: {list(instance.keys())}"
-             ) from e
-         try:
-             outputs = {key: instance[key] for key in self.outputs}
-         except KeyError as e:
-             raise KeyError(
-                 f"Unexpected FormTask output column names: {[key for key in self.outputs if key not in instance]}"
-                 f" \n available names:{list(instance.keys())}\n given output names:{self.outputs}"
-             ) from e
+         verify_required_schema(self.inputs, instance)
+         verify_required_schema(self.outputs, instance)
+
+         inputs = {key: instance[key] for key in self.inputs.keys()}
+         outputs = {key: instance[key] for key in self.outputs.keys()}
 
          return {
              "inputs": inputs,
templates.py CHANGED
@@ -137,7 +137,7 @@ class InputOutputTemplate(Template):
          return target, references
 
 
- class InputOutputReferenceTemplate(InputOutputTemplate):
+ class InputOutputTemplateWithCustomTarget(InputOutputTemplate):
      reference: str
 
      def outputs_to_target_and_references(self, outputs: Dict[str, object]) -> str:
tests.py DELETED
@@ -1,17 +0,0 @@
- test_cases = [
-     {
-         "predictions": [0, 0],
-         "references": [1, 1],
-         "result": {"metric_score": 0}
-     },
-     {
-         "predictions": [1, 1],
-         "references": [1, 1],
-         "result": {"metric_score": 1}
-     },
-     {
-         "predictions": [1, 0],
-         "references": [1, 1],
-         "result": {"metric_score": 0.5}
-     }
- ]

type_utils.py CHANGED
@@ -841,3 +841,34 @@ def to_float_or_default(v, failure_default=0):
          if failure_default is None:
              raise e
          return failure_default
+
+
+ def verify_required_schema(
+     required_schema_dict: typing.Dict[str, str],
+     input_dict: typing.Dict[str, typing.Any],
+ ) -> None:
+     """Verifies if passed input_dict has all required fields, and they are of proper types according to required_schema_dict.
+
+     Parameters:
+         required_schema_dict (Dict[str, str]):
+             Schema where a key is name of a field and a value is a string
+             representing a type of its value.
+         input_dict (Dict[str, Any]):
+             Dict with input fields and their respective values.
+     """
+     for field_name, data_type_string in required_schema_dict.items():
+         try:
+             value = input_dict[field_name]
+         except KeyError as e:
+             raise KeyError(
+                 f"Unexpected field name: '{field_name}'. "
+                 f"The available names: {list(input_dict.keys())}."
+             ) from e
+
+         data_type = parse_type_string(data_type_string)
+
+         if not isoftype(value, data_type):
+             raise ValueError(
+                 f"Passed value '{value}' of field '{field_name}' is not "
+                 f"of required type: ({data_type_string})."
+             )
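
A sketch of the failure modes the new helper covers, assuming it is imported from unitxt's type_utils module:

```python
from unitxt.type_utils import verify_required_schema

schema = {"text": "str", "classes": "List[str]"}
verify_required_schema(schema, {"text": "hi", "classes": ["a", "b"]})  # passes silently

try:
    verify_required_schema(schema, {"text": "hi"})  # missing 'classes'
except KeyError as e:
    print(e)

try:
    verify_required_schema(schema, {"text": 1, "classes": []})  # 'text' is not a str
except ValueError as e:
    print(e)
```
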
version.py CHANGED
@@ -1 +1 @@
- version = "1.7.7"
+ version = "1.7.8"