Elron committed on
Commit 72ea1b4
1 Parent(s): 50db311

Upload loaders.py with huggingface_hub

Files changed (1)
  1. loaders.py +38 -7
loaders.py CHANGED
@@ -203,8 +203,9 @@ class LoadCSV(Loader):
     files: Dict[str, str]
     chunksize: int = 1000
     _cache = InternalField(default_factory=dict)
-    loader_limit: int = None
+    loader_limit: Optional[int] = None
     streaming: bool = True
+    sep: str = ","
 
     def stream_csv(self, file):
         if self.get_limit() is not None:
@@ -214,7 +215,7 @@ class LoadCSV(Loader):
             chunksize = self.chunksize
 
         row_count = 0
-        for chunk in pd.read_csv(file, chunksize=chunksize):
+        for chunk in pd.read_csv(file, chunksize=chunksize, sep=self.sep):
             for _, row in chunk.iterrows():
                 if self.get_limit() is not None and row_count >= self.get_limit():
                     return
@@ -225,9 +226,9 @@ class LoadCSV(Loader):
         if file not in self._cache:
             if self.get_limit() is not None:
                 self.log_limited_loading()
-                self._cache[file] = pd.read_csv(file, nrows=self.get_limit()).to_dict(
-                    "records"
-                )
+                self._cache[file] = pd.read_csv(
+                    file, nrows=self.get_limit(), sep=self.sep
+                ).to_dict("records")
             else:
                 self._cache[file] = pd.read_csv(file).to_dict("records")
 
@@ -250,11 +251,41 @@ class LoadCSV(Loader):
         )
 
 
+class LoadFromSklearn(Loader):
+    dataset_name: str
+    splits: List[str] = ["train", "test"]
+
+    _requirements_list: List[str] = ["sklearn", "pandas"]
+
+    def verify(self):
+        super().verify()
+
+        if self.streaming:
+            raise NotImplementedError("LoadFromSklearn cannot load with streaming.")
+
+    def prepare(self):
+        super().prepare()
+        from sklearn import datasets as sklearn_datatasets
+
+        self.downloader = getattr(sklearn_datatasets, f"fetch_{self.dataset_name}")
+
+    def process(self):
+        with TemporaryDirectory() as temp_directory:
+            for split in self.splits:
+                split_data = self.downloader(subset=split)
+                targets = [split_data["target_names"][t] for t in split_data["target"]]
+                df = pd.DataFrame([split_data["data"], targets]).T
+                df.columns = ["data", "target"]
+                df.to_csv(os.path.join(temp_directory, f"{split}.csv"), index=None)
+            dataset = hf_load_dataset(temp_directory, streaming=False)
+
+        return MultiStream.from_iterables(dataset)
+
+
 class MissingKaggleCredentialsError(ValueError):
     pass
 
 
-# TODO write how to obtain kaggle credentials
 class LoadFromKaggle(Loader):
     url: str
     _requirements_list: List[str] = ["opendatasets"]
@@ -375,7 +406,7 @@ class LoadFromIBMCloud(Loader):
         local_dir = os.path.join(
             self.cache_dir,
             self.bucket_name,
-            self.data_dir,
+            self.data_dir or "",  # data_dir can be None
             f"loader_limit_{self.get_limit()}",
         )
         if not os.path.exists(local_dir):
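
For orientation, a minimal usage sketch of the two changed pieces: the new sep field on LoadCSV and the added LoadFromSklearn loader. The field names come from the diff above; the import path, the keyword-argument construction, the file paths, and the 20newsgroups dataset name are assumptions for illustration, not part of this commit.

# Hypothetical usage sketch -- not part of this commit. Assumes the loaders
# are importable from the uploaded module and accept their fields as keyword
# arguments.
from loaders import LoadCSV, LoadFromSklearn

# LoadCSV now forwards `sep` to every pd.read_csv call, so tab-separated files
# can be loaded; loader_limit is typed Optional[int], keeping None a valid default.
csv_loader = LoadCSV(
    files={"train": "data/train.tsv", "test": "data/test.tsv"},  # hypothetical paths
    sep="\t",
    loader_limit=100,
)

# LoadFromSklearn resolves dataset_name to sklearn.datasets.fetch_<name>
# (e.g. fetch_20newsgroups) and writes each split to CSV before loading it.
sklearn_loader = LoadFromSklearn(
    dataset_name="20newsgroups",
    splits=["train", "test"],
)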