cosyvoice/cli/cosyvoice.py CHANGED
@@ -14,7 +14,7 @@
14
  import os
15
  import torch
16
  from hyperpyyaml import load_hyperpyyaml
17
- from modelscope import snapshot_download
18
  from cosyvoice.cli.frontend import CosyVoiceFrontEnd
19
  from cosyvoice.cli.model import CosyVoiceModel
20
 
@@ -24,7 +24,7 @@ class CosyVoice:
24
  instruct = True if '-Instruct' in model_dir else False
25
  self.model_dir = model_dir
26
  if not os.path.exists(model_dir):
27
- model_dir = snapshot_download(model_dir)
28
  with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f:
29
  configs = load_hyperpyyaml(f)
30
  self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
 
14
  import os
15
  import torch
16
  from hyperpyyaml import load_hyperpyyaml
17
+ from huggingface_hub import snapshot_download
18
  from cosyvoice.cli.frontend import CosyVoiceFrontEnd
19
  from cosyvoice.cli.model import CosyVoiceModel
20
 
 
24
  instruct = True if '-Instruct' in model_dir else False
25
  self.model_dir = model_dir
26
  if not os.path.exists(model_dir):
27
+ model_dir = snapshot_download(model_dir, local_dir=model_dir)
28
  with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f:
29
  configs = load_hyperpyyaml(f)
30
  self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
cosyvoice/cli/model.py CHANGED
@@ -19,18 +19,17 @@ class CosyVoiceModel:
19
  llm: torch.nn.Module,
20
  flow: torch.nn.Module,
21
  hift: torch.nn.Module):
22
- #self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
23
- self.device = 'cpu'
24
  self.llm = llm
25
  self.flow = flow
26
  self.hift = hift
27
 
28
  def load(self, llm_model, flow_model, hift_model):
29
- self.llm.load_state_dict(torch.load(llm_model, map_location=self.device))
30
  self.llm.to(self.device).eval()
31
- self.flow.load_state_dict(torch.load(flow_model, map_location=self.device))
32
  self.flow.to(self.device).eval()
33
- self.hift.load_state_dict(torch.load(hift_model, map_location=self.device))
34
  self.hift.to(self.device).eval()
35
 
36
  def inference(self, text, text_len, flow_embedding, llm_embedding=torch.zeros(0, 192),
@@ -38,7 +37,6 @@ class CosyVoiceModel:
38
  llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), llm_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
39
  flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), flow_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
40
  prompt_speech_feat=torch.zeros(1, 0, 80), prompt_speech_feat_len=torch.zeros(1, dtype=torch.int32)):
41
- self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
42
  tts_speech_token = self.llm.inference(text=text.to(self.device),
43
  text_len=text_len.to(self.device),
44
  prompt_text=prompt_text.to(self.device),
 
19
  llm: torch.nn.Module,
20
  flow: torch.nn.Module,
21
  hift: torch.nn.Module):
22
+ self.device = torch.device('cuda')
 
23
  self.llm = llm
24
  self.flow = flow
25
  self.hift = hift
26
 
27
  def load(self, llm_model, flow_model, hift_model):
28
+ self.llm.load_state_dict(torch.load(llm_model, map_location='cpu'))
29
  self.llm.to(self.device).eval()
30
+ self.flow.load_state_dict(torch.load(flow_model, map_location='cpu'))
31
  self.flow.to(self.device).eval()
32
+ self.hift.load_state_dict(torch.load(hift_model, map_location='cpu'))
33
  self.hift.to(self.device).eval()
34
 
35
  def inference(self, text, text_len, flow_embedding, llm_embedding=torch.zeros(0, 192),
 
37
  llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), llm_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
38
  flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), flow_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
39
  prompt_speech_feat=torch.zeros(1, 0, 80), prompt_speech_feat_len=torch.zeros(1, dtype=torch.int32)):
 
40
  tts_speech_token = self.llm.inference(text=text.to(self.device),
41
  text_len=text_len.to(self.device),
42
  prompt_text=prompt_text.to(self.device),
css/utils.py CHANGED
@@ -13,9 +13,9 @@ from cosyvoice.utils.file_utils import load_wav
13
 
14
  from cosyvoice.cli.cosyvoice import CosyVoice
15
 
16
- cosyvoice= CosyVoice('speech_tts/CosyVoice-300M')
17
- cosyvoice_sft= CosyVoice('speech_tts/CosyVoice-300M-SFT')
18
- cosyvoice_instruct= CosyVoice('speech_tts/CosyVoice-300M-Instruct')
19
 
20
  example_tts_text = ["我们走的每一步,都是我们策略的一部分;你看到的所有一切,包括我此刻与你交谈,所做的一切,所说的每一句话,都有深远的含义。",
21
  "那位喜剧演员真有才,[laughter]一开口就让全场观众爆笑。",
 
13
 
14
  from cosyvoice.cli.cosyvoice import CosyVoice
15
 
16
+ cosyvoice= CosyVoice('FunAudioLLM/CosyVoice-300M')
17
+ cosyvoice_sft= CosyVoice('FunAudioLLM/CosyVoice-300M-SFT')
18
+ cosyvoice_instruct= CosyVoice('FunAudioLLM/CosyVoice-300M-Instruct')
19
 
20
  example_tts_text = ["我们走的每一步,都是我们策略的一部分;你看到的所有一切,包括我此刻与你交谈,所做的一切,所说的每一句话,都有深远的含义。",
21
  "那位喜剧演员真有才,[laughter]一开口就让全场观众爆笑。",