davidberenstein1957 committed
Commit 6fc91c7
Parent: c668bc2

feat: add org dropdown


fix: missing logs upload try
fix: non-org-based dataset push to hub

.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
app.py CHANGED
@@ -5,7 +5,7 @@ from src.distilabel_dataset_generator.sft import demo
 demo = gr.TabbedInterface(
     [demo],
     ["Supervised Fine-Tuning"],
-    title="Distilabel Dataset Generator",
+    title="⚗️ Distilabel Dataset Generator",
     head="⚗️ Distilabel Dataset Generator",
 )
 
src/distilabel_dataset_generator/__init__.py CHANGED
@@ -0,0 +1,114 @@
+from pathlib import Path
+from typing import Optional, Union
+
+import distilabel
+import distilabel.distiset
+from distilabel.utils.card.dataset_card import (
+    DistilabelDatasetCard,
+    size_categories_parser,
+)
+from huggingface_hub import DatasetCardData, HfApi
+
+
+class CustomDistisetWithAdditionalTag(distilabel.distiset.Distiset):
+    def _generate_card(
+        self,
+        repo_id: str,
+        token: str,
+        include_script: bool = False,
+        filename_py: Optional[str] = None,
+    ) -> None:
+        """Generates a dataset card and pushes it to the Hugging Face Hub, and
+        if the `pipeline.yaml` path is available in the `Distiset`, uploads that
+        to the same repository.
+
+        Args:
+            repo_id: The ID of the repository to push to, from the `push_to_hub` method.
+            token: The token to authenticate with the Hugging Face Hub, from the `push_to_hub` method.
+            include_script: Whether to upload the script to the hugging face repository.
+            filename_py: The name of the script. If `include_script` is True, the script will
+                be uploaded to the repository using this name, otherwise it won't be used.
+        """
+        card = self._get_card(
+            repo_id=repo_id,
+            token=token,
+            include_script=include_script,
+            filename_py=filename_py,
+        )
+
+        card.push_to_hub(
+            repo_id,
+            repo_type="dataset",
+            token=token,
+        )
+        if self.pipeline_path:
+            # If the pipeline.yaml is available, upload it to the Hugging Face Hub as well.
+            HfApi().upload_file(
+                path_or_fileobj=self.pipeline_path,
+                path_in_repo=distilabel.distiset.PIPELINE_CONFIG_FILENAME,
+                repo_id=repo_id,
+                repo_type="dataset",
+                token=token,
+            )
+
+    def _get_card(
+        self,
+        repo_id: str,
+        token: Optional[str] = None,
+        include_script: bool = False,
+        filename_py: Optional[str] = None,
+    ) -> DistilabelDatasetCard:
+        """Generates the dataset card for the `Distiset`.
+
+        Note:
+            If `repo_id` and `token` are provided, it will extract the metadata from the README.md file
+            on the hub.
+
+        Args:
+            repo_id: Name of the repository to push to, or the path for the distiset if saved to disk.
+            token: The token to authenticate with the Hugging Face Hub.
+                We assume that if it's provided, the dataset will be in the Hugging Face Hub,
+                so the README metadata will be extracted from there.
+            include_script: Whether to upload the script to the hugging face repository.
+            filename_py: The name of the script. If `include_script` is True, the script will
+                be uploaded to the repository using this name, otherwise it won't be used.
+
+        Returns:
+            The dataset card for the `Distiset`.
+        """
+        sample_records = {}
+        for name, dataset in self.items():
+            sample_records[name] = (
+                dataset[0] if not isinstance(dataset, dict) else dataset["train"][0]
+            )
+
+        readme_metadata = {}
+        if repo_id and token:
+            readme_metadata = self._extract_readme_metadata(repo_id, token)
+
+        metadata = {
+            **readme_metadata,
+            "size_categories": size_categories_parser(
+                max(len(dataset) for dataset in self.values())
+            ),
+            "tags": [
+                "synthetic",
+                "distilabel",
+                "rlaif",
+                "distilabel-dataset-generator",
+            ],
+        }
+
+        card = DistilabelDatasetCard.from_template(
+            card_data=DatasetCardData(**metadata),
+            repo_id=repo_id,
+            sample_records=sample_records,
+            include_script=include_script,
+            filename_py=filename_py,
+            references=self.citations,
+        )
+
+        return card
+
+
+distilabel.distiset.Distiset = CustomDistisetWithAdditionalTag
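
Note on the patch above: the final line rebinds `distilabel.distiset.Distiset` at module level, so the extra tag only applies to code that resolves the class through that attribute after this package has been imported (code that did `from distilabel.distiset import Distiset` beforehand keeps the original class). A minimal sketch of the effect, assuming `distilabel` is installed and the package is importable as `src.distilabel_dataset_generator`:

import distilabel.distiset

import src.distilabel_dataset_generator  # noqa: F401 - importing applies the patch

# After the import, the module-level attribute points at the subclass, so any
# Distiset built through it generates cards carrying the extra
# "distilabel-dataset-generator" tag.
assert distilabel.distiset.Distiset.__name__ == "CustomDistisetWithAdditionalTag"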
src/distilabel_dataset_generator/sft.py CHANGED
@@ -3,11 +3,18 @@ import os
 
 import gradio as gr
 import pandas as pd
+from distilabel.distiset import Distiset
 from distilabel.llms import InferenceEndpointsLLM
 from distilabel.pipeline import Pipeline
 from distilabel.steps.tasks import MagpieGenerator, TextGeneration
 
-from src.distilabel_dataset_generator.utils import OAuthToken, get_login_button
+from src.distilabel_dataset_generator.utils import (
+    OAuthToken,
+    get_duplicate_button,
+    get_login_button,
+    get_org_dropdown,
+    list_orgs,
+)
 
 INFORMATION_SEEKING_PROMPT = (
     "You are an AI assistant designed to provide accurate and concise information on a wide"
@@ -118,11 +125,23 @@ User dataset description:
 """
 
 MODEL = "meta-llama/Meta-Llama-3.1-70B-Instruct"
+DEFAULT_SYSTEM_PROMPT_DESCRIPTION = (
+    "A chemistry dataset for an assistant that explains chemical reactions and formulas"
+)
+DEFAULT_SYSTEM_PROMPT = "You are an AI assistant specializing in chemistry and chemical reactions. Your purpose is to help users understand and work with chemical formulas, equations, and reactions. Provide clear explanations of reaction mechanisms, assist in balancing chemical equations, and offer guidance on the interpretation of chemical structures. Explain the roles of reactants, products, catalysts, and solvents, and define key chemistry terms when necessary."
+DEFAULT_DATASET = pd.DataFrame(
+    {
+        "instruction": [
+            "What is the term for the study of the structure and evolution of the Earth's interior. "
+        ],
+        "response": [
+            """The study of the structure and evolution of the Earth's interior is called geophysics, particularly the subfield of geology known as geodynamics, and more specifically the subfield of geology known as geotectonics. However, a more specific term for this study is "geology of the Earth's interior" or "Earth internal structure." However, the most commonly used term for this study is geophysics. """
+        ],
+    }
+)
 
 
-def _run_pipeline(
-    result_queue, _num_turns, _num_rows, _system_prompt, _token: str = None
-):
+def _run_pipeline(result_queue, num_turns, num_rows, system_prompt, token: str = None):
     with Pipeline(name="sft") as pipeline:
         magpie_step = MagpieGenerator(
             llm=InferenceEndpointsLLM(
@@ -131,19 +150,22 @@ def _run_pipeline(
             magpie_pre_query_template="llama3",
             generation_kwargs={
                 "temperature": 0.8,  # it's the best value for Llama 3.1 70B Instruct
+                "do_sample": True,
             },
-            api_key=_token,
+            api_key=token,
         ),
-        n_turns=_num_turns,
-        num_rows=_num_rows,
-        system_prompt=_system_prompt,
+        n_turns=num_turns,
+        num_rows=num_rows,
+        system_prompt=system_prompt,
     )
-    distiset = pipeline.run()
+    distiset: Distiset = pipeline.run()
     result_queue.put(distiset)
 
 
-def _generate_system_prompt(_dataset_description, _token: OAuthToken = None):
-    os.environ["HF_TOKEN"] = _token.token
+def generate_system_prompt(dataset_description, token: OAuthToken = None):
+    if token is None:
+        raise gr.Error("Please sign in with Hugging Face to generate a dataset.")
+    os.environ["HF_TOKEN"] = token.token
     generate_description = TextGeneration(
         llm=InferenceEndpointsLLM(
             model_id=MODEL,
@@ -153,7 +175,7 @@ def _generate_system_prompt(_dataset_description, _token: OAuthToken = None):
                 "max_new_tokens": 2048,
                 "do_sample": True,
             },
-            api_key=_token.token,
+            api_key=token.token,
         ),
         use_system_prompt=True,
     )
@@ -163,44 +185,58 @@ def _generate_system_prompt(_dataset_description, _token: OAuthToken = None):
            [
                {
                    "system_prompt": PROMPT_CREATION_PROMPT,
-                    "instruction": _dataset_description,
+                    "instruction": dataset_description,
                }
            ]
        )
    )[0]["generation"]
 
 
-def _generate_dataset(
-    _system_prompt,
-    _num_turns=1,
-    _num_rows=5,
-    _dataset_name=None,
-    _token: OAuthToken = None,
+def generate_dataset(
+    system_prompt,
+    num_turns=1,
+    num_rows=5,
+    private=True,
+    orgs_selector=None,
+    dataset_name=None,
+    token: OAuthToken = None,
 ):
-    os.environ["HF_TOKEN"] = _token.token
+    if token is None:
+        raise gr.Error("Please sign in with Hugging Face to generate a dataset.")
+    if dataset_name is not None:
+        if not dataset_name:
+            raise gr.Error("Please provide a dataset name to push the dataset to.")
+    if orgs_selector is not None:
+        if not orgs_selector:
+            raise gr.Error(
+                f"Please select an organization to push the dataset to from: {list_orgs(token)}"
+            )
+
+    os.environ["HF_TOKEN"] = token.token
     gr.Info("Started pipeline execution.")
     result_queue = multiprocessing.Queue()
     p = multiprocessing.Process(
         target=_run_pipeline,
-        args=(result_queue, _num_turns, _num_rows, _system_prompt, _token.token),
+        args=(result_queue, num_turns, num_rows, system_prompt, token.token),
    )
     p.start()
     p.join()
     distiset = result_queue.get()
 
-    if _dataset_name is not None:
+    if dataset_name is not None:
         gr.Info("Pushing dataset to Hugging Face Hub...")
+        repo_id = f"{orgs_selector}/{dataset_name}"
         distiset.push_to_hub(
-            repo_id=_dataset_name,
-            private=False,
+            repo_id=repo_id,
+            private=private,
             include_script=True,
-            token=_token.token,
+            token=token.token,
        )
-        gr.Info("Dataset pushed to Hugging Face Hub: https://huggingface.co")
+        gr.Info(f"Dataset pushed to Hugging Face Hub: https://huggingface.co/{repo_id}")
     else:
         # If not pushing to hub, generate the dataset directly
         distiset = distiset["default"]["train"]
-        if _num_turns == 1:
+        if num_turns == 1:
             outputs = distiset.to_pandas()[["instruction", "response"]]
         else:
             outputs = {"conversation_id": [], "role": [], "content": []}
@@ -212,63 +248,80 @@ def _generate_dataset(
                 outputs["content"].append(message["content"])
     return pd.DataFrame(outputs)
 
-    return pd.DataFrame(distiset.to_pandas())
-
 
 with gr.Blocks(
     title="⚗️ Distilabel Dataset Generator",
     head="⚗️ Distilabel Dataset Generator",
 ) as demo:
-    get_login_button()
+    with gr.Row(variant="panel"):
+        with gr.Column():
+            btn_login = get_login_button()
+        with gr.Column():
+            btn_duplicate = get_duplicate_button()
 
     dataset_description = gr.Textbox(
         label="Provide a description of the dataset",
-        value="A chemistry dataset for an assistant that explains chemical reactions and formulas",
+        value=DEFAULT_SYSTEM_PROMPT_DESCRIPTION,
    )
 
-    btn_generate_system_prompt = gr.Button(
-        value="🧪 Generate Sytem Prompt",
-    )
+    btn_generate_system_prompt = gr.Button(value="🧪 Generate System Prompt")
 
-    system_prompt = gr.Textbox(label="Provide or correct the system prompt")
+    system_prompt = gr.Textbox(
+        label="Provide or correct the system prompt", value=DEFAULT_SYSTEM_PROMPT
+    )
 
     btn_generate_system_prompt.click(
-        fn=_generate_system_prompt,
+        fn=generate_system_prompt,
         inputs=[dataset_description],
         outputs=[system_prompt],
    )
 
     btn_generate_sample_dataset = gr.Button(
-        value="🧪 Generate Sample Dataset of 5 rows and a single turn"
+        value="🧪 Generate Sample Dataset of 5 rows and a single turn",
    )
 
-    table = gr.Dataframe(label="Generated Dataset", wrap=True)
+    table = gr.Dataframe(label="Generated Dataset", wrap=True, value=DEFAULT_DATASET)
 
     btn_generate_sample_dataset.click(
-        fn=_generate_dataset,
+        fn=generate_dataset,
         inputs=[system_prompt],
         outputs=[table],
    )
 
     with gr.Row(variant="panel"):
-        with gr.Column():
-            num_turns = gr.Number(
-                value=1, label="Number of turns in the conversation", minimum=1
-            )
-        with gr.Column():
-            num_rows = gr.Number(
-                value=100, label="Number of rows in the dataset", minimum=1
-            )
+        num_turns = gr.Number(
+            value=1,
+            label="Number of turns in the conversation",
+            minimum=1,
+            info="Whether the dataset is for a single turn with 'instruction-response' columns or a multi-turn conversation with a 'conversation' column.",
+        )
+        num_rows = gr.Number(
+            value=100,
+            label="Number of rows in the dataset",
+            minimum=1,
+            info="The number of rows in the dataset. Note that you can generate several thousand rows at once, but this will take time.",
+        )
+        private = gr.Checkbox(label="Private dataset", value=True, interactive=True)
 
-    dataset_name_push_to_hub = gr.Textbox(label="Dataset Name to push to Hub")
+    with gr.Row(variant="panel"):
+        orgs_selector = gr.Dropdown(label="Organization")
+        dataset_name_push_to_hub = gr.Textbox(label="Dataset Name to push to Hub")
 
     btn_generate_full_dataset = gr.Button(
         value="⚗️ Generate Full Dataset", variant="primary"
    )
 
     btn_generate_full_dataset.click(
-        fn=_generate_dataset,
-        inputs=[system_prompt, num_turns, num_rows, dataset_name_push_to_hub],
+        fn=generate_dataset,
+        inputs=[
+            system_prompt,
+            num_turns,
+            num_rows,
+            private,
+            orgs_selector,
+            dataset_name_push_to_hub,
+        ],
    )
 
+demo.load(get_org_dropdown, outputs=[orgs_selector])
 demo
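
The org-based push is the heart of this commit: `generate_dataset` now validates its inputs and composes the target `repo_id` as `{orgs_selector}/{dataset_name}`, as shown in this stand-alone sketch (no Hub calls; "my-org" and "sft-demo" are hypothetical stand-ins for the dropdown and textbox values):

def build_repo_id(orgs_selector, dataset_name):
    # Mirrors the validation in generate_dataset: an empty (but non-None)
    # value means the user left the corresponding field blank.
    if dataset_name is not None and not dataset_name:
        raise ValueError("Please provide a dataset name to push the dataset to.")
    if orgs_selector is not None and not orgs_selector:
        raise ValueError("Please select an organization to push the dataset to.")
    return f"{orgs_selector}/{dataset_name}"

print(build_repo_id("my-org", "sft-demo"))  # -> my-org/sft-demo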
src/distilabel_dataset_generator/utils.py CHANGED
@@ -6,6 +6,7 @@ from gradio.oauth import (
     OPENID_PROVIDER_URL,
     get_space,
 )
+from huggingface_hub import whoami
 
 if (
     all(
@@ -36,6 +37,32 @@ def get_login_button():
         or get_space() is None
     ):
         return gr.LoginButton(
-            value="Sign in with Hugging Face - a login will reset the data!",
+            value="Sign in with Hugging Face to generate a dataset!",
             size="lg",
        )
+
+
+def get_duplicate_button():
+    if get_space() is not None:
+        return gr.DuplicateButton(size="lg")
+
+
+def list_orgs(token: OAuthToken = None):
+    if token is not None:
+        data = whoami(token)
+        organisations = [
+            entry["entity"]["name"]
+            for entry in data["auth"]["accessToken"]["fineGrained"]["scoped"]
+            if "repo.write" in entry["permissions"]
+        ]
+        print(organisations)
+        return organisations
+    else:
+        return []
+
+
+def get_org_dropdown(token: OAuthToken = None):
+    orgs = list_orgs(token)
+    return gr.Dropdown(
+        label="Organization", choices=orgs, value=orgs[0] if orgs else None
+    )