Elron committed on
Commit
1e05e68
1 Parent(s): 14e01c6

Upload standard.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. standard.py +46 -39
standard.py CHANGED
@@ -1,18 +1,23 @@
1
- import logging
2
  from typing import List
3
 
4
  from .card import TaskCard
5
- from .dataclass import InternalField, OptionalField
6
- from .formats import ICLFormat
7
- from .instructions import Instruction
 
8
  from .operator import SourceSequentialOperator, StreamingOperator
9
- from .operators import Augmentor, NullAugmentor, StreamRefiner
 
 
 
 
10
  from .recipe import Recipe
11
- from .renderers import StandardRenderer
12
  from .schema import ToUnitxtGroup
13
  from .splitters import Sampler, SeparateSplit, SpreadSplit
14
  from .templates import Template
15
 
 
 
16
 
17
  # Used to give meaningful name to recipe steps
18
  class CreateDemosPool(SeparateSplit):
@@ -26,8 +31,8 @@ class AddDemosField(SpreadSplit):
26
  class BaseRecipe(Recipe, SourceSequentialOperator):
27
  card: TaskCard
28
  template: Template = None
29
- instruction: Instruction = None
30
- format: ICLFormat = ICLFormat()
31
 
32
  loader_limit: int = None
33
 
@@ -51,6 +56,11 @@ class BaseRecipe(Recipe, SourceSequentialOperator):
51
 
52
  steps: List[StreamingOperator] = InternalField(default_factory=list)
53
 
 
 
 
 
 
54
  def verify(self):
55
  super().verify()
56
  if self.num_demos > 0:
@@ -60,31 +70,31 @@ class BaseRecipe(Recipe, SourceSequentialOperator):
60
  )
61
  if self.demos_pool_size < self.num_demos:
62
  raise ValueError(
63
- f"demos_pool_size must be bigger than num_demos ({self.num_demos}), Got demos_pool_size={self.demos_pool_size}"
64
  )
65
  if self.loader_limit and self.demos_pool_size > self.loader_limit:
66
  raise ValueError(
67
- f"demos_pool_size must be bigger than loader_limit ({self.loader_limit}), Got demos_pool_size={self.demos_pool_size}"
68
  )
69
 
70
  if self.loader_limit:
71
  if self.max_test_instances and self.max_test_instances > self.loader_limit:
72
  raise ValueError(
73
- f"max_test_instances must be bigger than loader_limit ({self.loader_limit}), Got max_test_instances={self.max_test_instances}"
74
  )
75
  if (
76
  self.max_validation_instances
77
  and self.max_validation_instances > self.loader_limit
78
  ):
79
  raise ValueError(
80
- f"max_validation_instances must be bigger than loader_limit ({self.loader_limit}), Got max_validation_instances={self.max_validation_instances}"
81
  )
82
  if (
83
  self.max_train_instances
84
  and self.max_train_instances > self.loader_limit
85
  ):
86
  raise ValueError(
87
- f"max_train_instances must be bigger than loader_limit ({self.loader_limit}), Got max_train_instances={self.max_train_instances}"
88
  )
89
 
90
  def prepare(self):
@@ -94,7 +104,7 @@ class BaseRecipe(Recipe, SourceSequentialOperator):
94
 
95
  if self.loader_limit:
96
  self.card.loader.loader_limit = self.loader_limit
97
- logging.info(f"Loader line limit was set to {self.loader_limit}")
98
  self.steps.append(StreamRefiner(max_instances=self.loader_limit))
99
 
100
  if self.card.preprocess_steps is not None:
@@ -116,20 +126,15 @@ class BaseRecipe(Recipe, SourceSequentialOperator):
116
  )
117
 
118
  if self.num_demos > 0:
119
- sampler = self.card.sampler
120
-
121
- if self.sampler is not None:
122
- sampler = self.sampler
 
 
 
123
 
124
- sampler.set_size(self.num_demos)
125
-
126
- self.steps.append(
127
- AddDemosField(
128
- source_stream=self.demos_pool_name,
129
- target_field=self.demos_field,
130
- sampler=sampler,
131
- )
132
- )
133
 
134
  self.train_refiner.max_instances = self.max_train_instances
135
  self.train_refiner.apply_to_streams = ["train"]
@@ -143,19 +148,21 @@ class BaseRecipe(Recipe, SourceSequentialOperator):
143
  self.test_refiner.apply_to_streams = ["test"]
144
  self.steps.append(self.test_refiner)
145
 
146
- render = StandardRenderer(
147
- instruction=self.instruction,
148
- template=self.template,
149
- format=self.format,
150
- demos_field=self.demos_field,
151
- )
152
-
153
- self.steps.append(render)
154
-
 
 
155
  if self.augmentor.augment_model_input:
156
  self.steps.append(self.augmentor)
157
 
158
- postprocessors = render.get_postprocessors()
159
 
160
  self.steps.append(
161
  ToUnitxtGroup(
@@ -198,7 +205,7 @@ class StandardRecipeWithIndexes(BaseRecipe):
198
 
199
 
200
  class StandardRecipe(StandardRecipeWithIndexes):
201
- """This class represents a standard recipe for data processing and preperation.
202
 
203
  This class can be used to prepare a recipe.
204
  with all necessary steps, refiners and renderers included. It allows to set various
@@ -209,7 +216,7 @@ class StandardRecipe(StandardRecipeWithIndexes):
209
  template (Template, optional): Template object to be used for the recipe.
210
  instruction (Instruction, optional): Instruction object to be used for the recipe.
211
  loader_limit (int, optional): Specifies the maximum number of instances per stream to be returned from the loader (used to reduce loading time in large datasets)
212
- format (ICLFormat, optional): ICLFormat object to be used for the recipe.
213
  train_refiner (StreamRefiner, optional): Train refiner to be used in the recipe.
214
  max_train_instances (int, optional): Maximum training instances for the refiner.
215
  validation_refiner (StreamRefiner, optional): Validation refiner to be used in the recipe.
 
 
1
  from typing import List
2
 
3
  from .card import TaskCard
4
+ from .dataclass import Field, InternalField, OptionalField
5
+ from .formats import Format, SystemFormat
6
+ from .instructions import EmptyInstruction, Instruction
7
+ from .logging_utils import get_logger
8
  from .operator import SourceSequentialOperator, StreamingOperator
9
+ from .operators import (
10
+ Augmentor,
11
+ NullAugmentor,
12
+ StreamRefiner,
13
+ )
14
  from .recipe import Recipe
 
15
  from .schema import ToUnitxtGroup
16
  from .splitters import Sampler, SeparateSplit, SpreadSplit
17
  from .templates import Template
18
 
19
+ logger = get_logger()
20
+
21
 
22
  # Used to give meaningful name to recipe steps
23
  class CreateDemosPool(SeparateSplit):
 
31
  class BaseRecipe(Recipe, SourceSequentialOperator):
32
  card: TaskCard
33
  template: Template = None
34
+ instruction: Instruction = Field(default_factory=EmptyInstruction)
35
+ format: Format = Field(default_factory=SystemFormat)
36
 
37
  loader_limit: int = None
38
 
 
56
 
57
  steps: List[StreamingOperator] = InternalField(default_factory=list)
58
 
59
+ def before_process_multi_stream(self):
60
+ super().before_process_multi_stream()
61
+ if self.sampler: # e.g. when num_demos is 0, the sampler may not be initialized
62
+ self.sampler.init_new_random_generator()
63
+
64
  def verify(self):
65
  super().verify()
66
  if self.num_demos > 0:
 
70
  )
71
  if self.demos_pool_size < self.num_demos:
72
  raise ValueError(
73
+ f"num_demos (got: {self.num_demos}) should not exceed demos_pool_size (got: {self.demos_pool_size})"
74
  )
75
  if self.loader_limit and self.demos_pool_size > self.loader_limit:
76
  raise ValueError(
77
+ f"demos_pool_size should not exceed loader_limit ({self.loader_limit}), Got demos_pool_size={self.demos_pool_size}"
78
  )
79
 
80
  if self.loader_limit:
81
  if self.max_test_instances and self.max_test_instances > self.loader_limit:
82
  raise ValueError(
83
+ f"max_test_instances should not exceed loader_limit ({self.loader_limit}), Got max_test_instances={self.max_test_instances}"
84
  )
85
  if (
86
  self.max_validation_instances
87
  and self.max_validation_instances > self.loader_limit
88
  ):
89
  raise ValueError(
90
+ f"max_validation_instances should not exceed loader_limit ({self.loader_limit}), Got max_validation_instances={self.max_validation_instances}"
91
  )
92
  if (
93
  self.max_train_instances
94
  and self.max_train_instances > self.loader_limit
95
  ):
96
  raise ValueError(
97
+ f"max_train_instances should not exceed loader_limit ({self.loader_limit}), Got max_train_instances={self.max_train_instances}"
98
  )
99
 
100
  def prepare(self):
 
104
 
105
  if self.loader_limit:
106
  self.card.loader.loader_limit = self.loader_limit
107
+ logger.info(f"Loader line limit was set to {self.loader_limit}")
108
  self.steps.append(StreamRefiner(max_instances=self.loader_limit))
109
 
110
  if self.card.preprocess_steps is not None:
 
126
  )
127
 
128
  if self.num_demos > 0:
129
+ if self.sampler is None:
130
+ if self.card.sampler is None:
131
+ raise ValueError(
132
+ "Unexpected None value for card.sampler. "
133
+ "To use num_demos > 0, please set a sampler on the TaskCard."
134
+ )
135
+ self.sampler = self.card.sampler
136
 
137
+ self.sampler.set_size(self.num_demos)
 
 
 
 
 
 
 
 
138
 
139
  self.train_refiner.max_instances = self.max_train_instances
140
  self.train_refiner.apply_to_streams = ["train"]
 
148
  self.test_refiner.apply_to_streams = ["test"]
149
  self.steps.append(self.test_refiner)
150
 
151
+ self.steps.append(self.template)
152
+ if self.num_demos > 0:
153
+ self.steps.append(
154
+ AddDemosField(
155
+ source_stream=self.demos_pool_name,
156
+ target_field=self.demos_field,
157
+ sampler=self.sampler,
158
+ )
159
+ )
160
+ self.steps.append(self.instruction)
161
+ self.steps.append(self.format)
162
  if self.augmentor.augment_model_input:
163
  self.steps.append(self.augmentor)
164
 
165
+ postprocessors = self.template.get_postprocessors()
166
 
167
  self.steps.append(
168
  ToUnitxtGroup(
 
205
 
206
 
207
  class StandardRecipe(StandardRecipeWithIndexes):
208
+ """This class represents a standard recipe for data processing and preparation.
209
 
210
  This class can be used to prepare a recipe.
211
  with all necessary steps, refiners and renderers included. It allows to set various
 
216
  template (Template, optional): Template object to be used for the recipe.
217
  instruction (Instruction, optional): Instruction object to be used for the recipe.
218
  loader_limit (int, optional): Specifies the maximum number of instances per stream to be returned from the loader (used to reduce loading time in large datasets)
219
+ format (SystemFormat, optional): SystemFormat object to be used for the recipe.
220
  train_refiner (StreamRefiner, optional): Train refiner to be used in the recipe.
221
  max_train_instances (int, optional): Maximum training instances for the refiner.
222
  validation_refiner (StreamRefiner, optional): Validation refiner to be used in the recipe.