inoid committed on
Commit
c7a8ef4
1 Parent(s): 888f360

Fix errors in data loading

Browse files
Files changed (1) hide show
  1. spanish_medica_llm.py +10 -12
spanish_medica_llm.py CHANGED
@@ -682,14 +682,13 @@ def run_training_process():
682
  login(token = os.environ.get('HG_FACE_TOKEN'))
683
  os.environ['WANDB_DISABLED'] = 'true'
684
  tokenizer = loadSpanishTokenizer()
685
- medicalSpanishDataset = applyChatInstructFormat( loadSpanishDatasetFinnetuning())
686
- medicalSpanishDataset = medicalSpanishDataset.train_test_split(0.2, seed=203984)
687
 
688
- # train_dataset, eval_dataset, test_dataset = splitDatasetInTestValid(
689
- # getTokenizedDataset( medicalSpanishDataset, tokenizer)
690
- # )
691
-
692
-
 
693
  train_dataset, eval_dataset, test_dataset = splitDatasetInTestValid( medicalSpanishDataset )
694
 
695
  base_model = loadBaseModel(MISTRAL_BASE_MODEL_ID)
@@ -702,11 +701,10 @@ def run_finnetuning_process():
702
  login(token = os.environ.get('HG_FACE_TOKEN'))
703
  os.environ['WANDB_DISABLED'] = 'true'
704
  tokenizer = loadSpanishTokenizer()
705
- medicalSpanishDataset = loadSpanishDataset()
706
- train_dataset, eval_dataset, test_dataset = splitDatasetInTestValid(
707
- getTokenizedDataset( medicalSpanishDataset, tokenizer)
708
- )
709
-
710
  base_model = loadBaseModel(HUB_MODEL_ID)
711
 
712
  configAndRunFineTuning(base_model,train_dataset, eval_dataset, tokenizer)
 
682
  login(token = os.environ.get('HG_FACE_TOKEN'))
683
  os.environ['WANDB_DISABLED'] = 'true'
684
  tokenizer = loadSpanishTokenizer()
 
 
685
 
686
+ medicalSpanishDataset = loadSpanishDataset()
687
+ train_dataset, eval_dataset, test_dataset = splitDatasetInTestValid(
688
+ getTokenizedDataset( medicalSpanishDataset, tokenizer)
689
+ )
690
+
691
+
692
  train_dataset, eval_dataset, test_dataset = splitDatasetInTestValid( medicalSpanishDataset )
693
 
694
  base_model = loadBaseModel(MISTRAL_BASE_MODEL_ID)
 
701
  login(token = os.environ.get('HG_FACE_TOKEN'))
702
  os.environ['WANDB_DISABLED'] = 'true'
703
  tokenizer = loadSpanishTokenizer()
704
+ medicalSpanishDataset = applyChatInstructFormat( loadSpanishDatasetFinnetuning())
705
+ medicalSpanishDataset = medicalSpanishDataset.train_test_split(0.2, seed=203984)
706
+ train_dataset, eval_dataset, test_dataset = splitDatasetInTestValid( medicalSpanishDataset )
707
+
 
708
  base_model = loadBaseModel(HUB_MODEL_ID)
709
 
710
  configAndRunFineTuning(base_model,train_dataset, eval_dataset, tokenizer)