#!D:\GitDownload\SupThirdParty\audioldm2\venv\Scripts\python.exe
import os
import torch
import logging
from audioldm2 import text_to_audio, build_model, save_wave, get_time, read_list
import argparse

os.environ["TOKENIZERS_PARALLELISM"] = "true"
matplotlib_logger = logging.getLogger('matplotlib')
matplotlib_logger.setLevel(logging.WARNING)

parser = argparse.ArgumentParser()

parser.add_argument(
    "-t",
    "--text",
    type=str,
    required=False,
    default="",
    help="Text prompt to the model for audio generation",
)

parser.add_argument(
    "--transcription",
    type=str,
    required=False,
    default="",
    help="Transcription for Text-to-Speech",
)

parser.add_argument(
    "-tl",
    "--text_list",
    type=str,
    required=False,
    default="",
    help="A file that contains text prompts to the model for audio generation",
)

parser.add_argument(
    "-s",
    "--save_path",
    type=str,
    required=False,
    help="The path to save model output",
    default="./output",
)

parser.add_argument(
    "--model_name",
    type=str,
    required=False,
    help="The checkpoint to use",
    default="audioldm_48k",
    choices=[
        "audioldm_48k",
        "audioldm_16k_crossattn_t5",
        "audioldm2-full",
        "audioldm2-music-665k",
        "audioldm2-full-large-1150k",
        "audioldm2-speech-ljspeech",
        "audioldm2-speech-gigaspeech",
    ],
)

parser.add_argument(
    "-d",
    "--device",
    type=str,
    required=False,
    help="The device for computation. If not specified, the script will automatically choose the device based on your environment.",
    default="auto",
)

parser.add_argument(
    "-b",
    "--batchsize",
    type=int,
    required=False,
    default=1,
    help="How many samples to generate at the same time",
)

parser.add_argument(
    "--ddim_steps",
    type=int,
    required=False,
    default=200,
    help="The number of sampling steps for DDIM",
)

parser.add_argument(
    "-gs",
    "--guidance_scale",
    type=float,
    required=False,
    default=3.5,
    help="Guidance scale (large => better quality and relevance to text; small => better diversity)",
)

parser.add_argument(
    "-dur",
    "--duration",
    type=float,
    required=False,
    default=10.0,
    help="The duration of the samples, in seconds",
)

parser.add_argument(
    "-n",
    "--n_candidate_gen_per_text",
    type=int,
    required=False,
    default=3,
    help="Automatic quality control. This number controls the number of candidates (e.g., generate three audio clips and pick the best to show you). A larger value usually leads to better quality at the cost of heavier computation",
)

parser.add_argument(
    "--seed",
    type=int,
    required=False,
    default=0,
    help="Changing this value (any integer) will lead to a different generation result.",
)

args = parser.parse_args()

torch.set_float32_matmul_precision("high")

save_path = os.path.join(args.save_path, get_time())

text = args.text
random_seed = args.seed
duration = args.duration
sample_rate = 16000

# AudioLDM2 checkpoints are limited to 10-second generation; the 48 kHz
# checkpoint outputs at a higher sample rate.
if "audioldm2" in args.model_name:
    print(
        "Warning: For AudioLDM2 we currently only support 10s of generation. "
        "Please use audioldm_48k or audioldm_16k_crossattn_t5 if you want a different duration."
    )
    duration = 10

if "48k" in args.model_name:
    sample_rate = 48000

guidance_scale = args.guidance_scale
n_candidate_gen_per_text = args.n_candidate_gen_per_text
transcription = args.transcription

if transcription:
    if "speech" not in args.model_name:
        print(
            "Warning: You chose to perform Text-to-Speech by providing a transcription, "
            "but you did not choose a suitable model name "
            "(audioldm2-speech-gigaspeech or audioldm2-speech-ljspeech)."
        )
        print("Warning: We will use audioldm2-speech-gigaspeech by default.")
        args.model_name = "audioldm2-speech-gigaspeech"
    if not text:
        print(
            "Warning: You should provide text as an input to describe the speaker. "
            "Using the default (A female reporter is speaking full of emotion)."
        )
        text = "A female reporter is speaking full of emotion"

os.makedirs(save_path, exist_ok=True)
audioldm2 = build_model(model_name=args.model_name, device=args.device)

if args.text_list:
    print("Generate audio based on the text prompts in %s" % args.text_list)
    prompt_todo = read_list(args.text_list)
else:
    prompt_todo = [text]

for text in prompt_todo:
    # A prompt line may carry an explicit file name after a "|" separator;
    # otherwise the (truncated) prompt itself is used as the file name.
    if "|" in text:
        text, name = text.split("|")
    else:
        name = text[:128]

    if transcription:
        name += "-TTS-%s" % transcription

    waveform = text_to_audio(
        audioldm2,
        text,
        transcription=transcription,  # To avoid the model ignoring the last word
        seed=random_seed,
        duration=duration,
        guidance_scale=guidance_scale,
        ddim_steps=args.ddim_steps,
        n_candidate_gen_per_text=n_candidate_gen_per_text,
        batchsize=args.batchsize,
    )

    save_wave(waveform, save_path, name=name, samplerate=sample_rate)
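
# Example invocations, for reference. This is a sketch: the script name
# "audioldm2.py", the prompt texts, and the file paths below are assumptions
# for illustration; only the flags come from the argparse definitions above.
#
#   # Text-to-audio with the default audioldm_48k checkpoint:
#   python audioldm2.py -t "A dog barking in the distance" --seed 42
#
#   # Text-to-speech: providing --transcription switches to a speech checkpoint.
#   python audioldm2.py -t "A female reporter is speaking" \
#       --transcription "Wish you a good day" --model_name audioldm2-speech-gigaspeech
#
#   # Batch generation from a prompt-list file (one "prompt|filename" per line):
#   python audioldm2.py -tl prompts.txt -s ./output -gs 3.5 --ddim_steps 200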