import gradio as gr
import torch
from diffusers import I2VGenXLPipeline
from transformers import MusicgenForConditionalGeneration, AutoProcessor
from PIL import Image
from moviepy.editor import ImageSequenceClip
import numpy as np
import scipy.io.wavfile
import ffmpeg


def generate_video(image, prompt, negative_prompt, video_length):
    generator = torch.manual_seed(8888)
    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

    pipeline = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float32)
    pipeline.to(device)

    frames = []
    total_frames = int(video_length) * 30  # assuming 30 frames per second

    for _ in range(total_frames):
        # The pipeline returns a batch of videos; take the first (and only)
        # frame of the first video in the batch.
        frame = pipeline(
            prompt=prompt,
            image=image,
            num_inference_steps=5,
            negative_prompt=negative_prompt,
            guidance_scale=9.0,
            generator=generator,
            num_frames=1,
        ).frames[0][0]
        frames.append(np.array(frame))

    output_file = "output_video.mp4"
    clip = ImageSequenceClip(frames, fps=30)
    clip.write_videofile(output_file, codec="libx264", audio=False)
    return output_file


def generate_music(prompt, unconditional=False):
    model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model.to(device)

    if unconditional:
        unconditional_inputs = model.get_unconditional_inputs(num_samples=1)
        audio_values = model.generate(**unconditional_inputs, do_sample=True, max_new_tokens=256)
    else:
        processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
        inputs = processor(
            text=prompt,
            padding=True,
            return_tensors="pt",
        )
        audio_values = model.generate(
            **inputs.to(device),
            do_sample=True,
            guidance_scale=3,
            max_new_tokens=256,
        )

    sampling_rate = model.config.audio_encoder.sampling_rate
    audio_file = "musicgen_out.wav"

    # Generated audio has shape (batch, channels, samples); keep the mono
    # channel of the first sample, then convert float [-1, 1] to 16-bit PCM.
    audio_data = audio_values[0, 0].cpu().numpy()
    audio_data = np.clip(audio_data, -1.0, 1.0)
    audio_data = (audio_data * 32767).astype(np.int16)
    scipy.io.wavfile.write(audio_file, sampling_rate, audio_data)
    return audio_file


def combine_audio_video(audio_file, video_file):
    output_file = "combined_output.mp4"
    audio = ffmpeg.input(audio_file)
    video = ffmpeg.input(video_file)
    # Copy the video stream as-is and encode the audio track as AAC.
    output = ffmpeg.output(video, audio, output_file, vcodec="copy", acodec="aac")
    ffmpeg.run(output, overwrite_output=True)
    return output_file


def interface(image_path, prompt, negative_prompt, video_length, music_prompt, unconditional):
    image = Image.open(image_path).convert("RGB")
    video_file = generate_video(image, prompt, negative_prompt, video_length)
    audio_file = generate_music(music_prompt, unconditional)
    combined_file = combine_audio_video(audio_file, video_file)
    return combined_file


with gr.Blocks() as demo:
    gr.Markdown("# AI-Powered Video and Music Generation")

    with gr.Row():
        image_input = gr.Image(type="filepath", label="Upload Image")
        prompt_input = gr.Textbox(label="Enter the Video Prompt")
        negative_prompt_input = gr.Textbox(label="Enter the Negative Prompt")
        video_length_input = gr.Number(label="Video Length (seconds)", value=10, precision=0)
        music_prompt_input = gr.Textbox(label="Enter the Music Prompt")
        unconditional_checkbox = gr.Checkbox(label="Generate Unconditional Music")

    generate_button = gr.Button("Generate Video and Music")
    output_video = gr.Video(label="Output Video with Sound")

    generate_button.click(
        interface,
        inputs=[
            image_input,
            prompt_input,
            negative_prompt_input,
            video_length_input,
            music_prompt_input,
            unconditional_checkbox,
        ],
        outputs=output_video,
        show_progress=True,
    )

demo.launch()