File size: 3,342 Bytes
3e28aad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3a0a584
3e28aad
 
 
3a0a584
 
f414f88
 
3a0a584
 
3e28aad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e4e068f
3e28aad
 
 
 
 
 
 
 
 
3a0a584
 
 
 
 
 
255a1f0
3e28aad
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
###############
# `ls -1 src/unitxt | grep '\.py$' | grep -Ev 'dataset\.py|__init__\.py' | sort`:
# artifact.py
# blocks.py
# card.py
# catalog.py
# collections.py
# common.py
# file_utils.py
# fusion.py
# generator_utils.py
# instructions.py
# loaders.py
# load.py
# metric.py
# metrics.py
# normalizers.py
# operator.py
# operators.py
# processors.py
# recipe.py
# register.py
# splitters.py
# split_utils.py
# stream.py
# task.py
# templates.py
# text_utils.py
# utilize.py
# validate.py
#####
# imports for hf system:
#####
from .artifact import __file__ as _
from .blocks import __file__ as _
from .card import __file__ as _
from .catalog import __file__ as _
from .collections import __file__ as _
from .common import __file__ as _
from .file_utils import __file__ as _

# from .fusion import __file__
from .generator_utils import __file__ as _
from .instructions import __file__ as _
from .loaders import __file__ as _
from .load import __file__ as _
from .metric import __file__ as _
from .metrics import __file__ as _
from .normalizers import __file__ as _
from .operator import __file__ as _
from .operators import __file__ as _
from .processors import __file__ as _
from .recipe import __file__ as _
from .register import __file__ as _
from .schema import __file__ as _
from .splitters import __file__ as _
from .split_utils import __file__ as _
from .stream import __file__ as _
from .task import __file__ as _
from .templates import __file__ as _
from .text_utils import __file__ as _

# from .utilize import __file__ as _
# from .validate import __file__ as _
#############

from .register import register_blocks
from .artifact import Artifact, fetch_artifact, UnitxtArtifactNotFoundError

import datasets

def fetch(artifact_name):
    """Look up an artifact by name, returning ``None`` when it is not found.

    Args:
        artifact_name: Name handed unchanged to ``fetch_artifact``.

    Returns:
        The first element of the pair returned by ``fetch_artifact``, or
        ``None`` if ``fetch_artifact`` raises ``UnitxtArtifactNotFoundError``.
    """
    try:
        # fetch_artifact returns a pair; only its first element is needed here.
        found, _ = fetch_artifact(artifact_name)
    except UnitxtArtifactNotFoundError:
        return None
    else:
        return found

def parse(query: str):
    """
    Parses a query of the form 'key1=value1,key2=value2,...' into a dictionary.

    Values made only of digits are converted to ``int``; values made of digits
    with a single ``.`` are converted to ``float``; every other value is kept
    as a string. Signed numbers (e.g. ``-1``) are therefore kept as strings.

    Args:
        query: Comma-separated ``key=value`` pairs.

    Returns:
        A dict mapping each key to its (possibly numeric) value. When a key
        appears more than once, the last occurrence wins.

    Raises:
        ValueError: If a comma-separated segment contains no ``=``.
    """
    result = {}
    for kv in query.split(","):
        parts = kv.split("=")
        if len(parts) < 2:
            raise ValueError(
                f"Cannot parse {kv!r} in query {query!r}: expected 'key=value'."
            )
        key, value = parts[0], parts[1]
        if value.isdigit():
            result[key] = int(value)
        elif value.replace(".", "", 1).isdigit():
            result[key] = float(value)
        else:
            # Only fall back to the raw string when no numeric form matched;
            # assigning unconditionally would discard the conversion above.
            result[key] = value

    return result


class Dataset(datasets.GeneratorBasedBuilder):
    """Dataset builder whose splits are produced by a recipe named by the config.

    The config name is first looked up with ``fetch``; if nothing is found it
    is parsed as a ``key=value,...`` query and turned into an artifact with
    ``Artifact.from_dict``.
    """

    VERSION = datasets.Version("1.1.1")
    builder_configs = {}

    @property
    def generators(self):
        """Return the per-split generators, building them on first access.

        ``register_blocks()`` is invoked on every access. The result of
        calling the recipe is cached on ``self._generators`` and reused while
        it is not ``None``.
        """
        register_blocks()

        # Reuse the cached value when it has already been built.
        if getattr(self, "_generators", None) is not None:
            return self._generators

        config_name = self.config.name
        recipe = fetch(config_name)
        if recipe is None:
            # Not a known artifact name: treat the config name as a query.
            recipe_args = parse(config_name)
            recipe_args.setdefault("type", "common_recipe")
            recipe = Artifact.from_dict(recipe_args)

        self._generators = recipe()
        return self._generators

    def _info(self):
        """Return an empty ``datasets.DatasetInfo``."""
        return datasets.DatasetInfo()

    def _split_generators(self, _):
        """Return one ``SplitGenerator`` per key of ``self.generators``.

        The single positional argument is ignored. Each generator receives
        its split name through ``gen_kwargs["split_name"]``.
        """
        split_generators = []
        for split in self.generators.keys():
            split_generators.append(
                datasets.SplitGenerator(name=split, gen_kwargs={"split_name": split})
            )
        return split_generators

    def _generate_examples(self, split_name):
        """Yield ``(index, row)`` pairs for the rows of the requested split."""
        yield from enumerate(self.generators[split_name])