Spaces:

aach456
/

Ad123

Running

App Files Files Community

Ad123 / app.py

aach456

Update app.py

f6cb684 verified 2 months ago

raw

history blame

4.6 kB

	import gradio as gr
	import torch
	import numpy as np
	from diffusers import I2VGenXLPipeline
	from transformers import MusicgenForConditionalGeneration, AutoProcessor
	from PIL import Image
	from moviepy.editor import ImageSequenceClip
	import io
	import scipy.io.wavfile
	import ffmpeg

	def generate_video(image, prompt, negative_prompt, video_length):
	    """Generate a silent MP4 from a conditioning image and text prompts.

	    The I2VGen-XL pipeline is invoked once per output frame (30 frames per
	    second of requested video) and the collected frames are encoded to
	    ``output_video.mp4`` with libx264.

	    Args:
	        image: Conditioning image passed to the pipeline (a PIL image).
	        prompt: Text prompt describing the desired video.
	        negative_prompt: Text describing what the video should avoid.
	        video_length: Requested duration in seconds.

	    Returns:
	        The path of the written video file, ``"output_video.mp4"``.

	    Raises:
	        ValueError: If ``video_length`` is empty or does not yield at least
	            one frame.
	    """
	    fps = 30
	    # Validate before loading the model so bad input fails fast and cheaply.
	    # ``int`` also guards against a float duration, which ``range`` rejects.
	    total_frames = int((video_length or 0) * fps)
	    if total_frames <= 0:
	        raise ValueError("video_length must produce at least one frame")

	    generator = torch.manual_seed(8888)

	    # Set the device to CPU or a non-NVIDIA GPU
	    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
	    print(f"Using device: {device}")

	    # Load the pipeline and move it to the selected device
	    pipeline = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float32)
	    pipeline.to(device)

	    frames = []
	    for i in range(total_frames):
	        result = pipeline(
	            prompt=prompt,
	            image=image,
	            num_inference_steps=5,
	            negative_prompt=negative_prompt,
	            guidance_scale=9.0,
	            generator=generator,
	            num_frames=1,
	        )
	        # ``frames[0]`` is the frame sequence of the first generated video,
	        # not a single image, so each frame is unpacked individually.
	        frames.extend(np.array(frame) for frame in result.frames[0])

	        # Progress is printed rather than yielded: a ``yield`` here would turn
	        # this function into a generator, and a caller expecting the file path
	        # would receive a generator object instead.
	        print(f"Generated frame {i + 1}/{total_frames}")

	    # Create a video clip from the frames
	    output_file = "output_video.mp4"
	    clip = ImageSequenceClip(frames, fps=fps)
	    clip.write_videofile(output_file, codec='libx264', audio=False)

	    return output_file

	def generate_music(prompt, unconditional=False):
	    """Generate a short audio clip with MusicGen and save it as a WAV file.

	    Args:
	        prompt: Text description of the music; only used when
	            ``unconditional`` is falsy.
	        unconditional: When truthy, sample from the model's unconditional
	            inputs instead of conditioning on ``prompt``.

	    Returns:
	        The path of the written audio file, ``"musicgen_out.wav"``.
	    """
	    checkpoint = "facebook/musicgen-small"
	    device = "cuda:0" if torch.cuda.is_available() else "cpu"

	    model = MusicgenForConditionalGeneration.from_pretrained(checkpoint)
	    model.to(device)

	    if not unconditional:
	        # Text-conditioned sampling: tokenize the prompt, then generate.
	        processor = AutoProcessor.from_pretrained(checkpoint)
	        inputs = processor(text=prompt, padding=True, return_tensors="pt")
	        audio_values = model.generate(
	            **inputs.to(device),
	            do_sample=True,
	            guidance_scale=3,
	            max_new_tokens=256,
	        )
	    else:
	        unconditional_inputs = model.get_unconditional_inputs(num_samples=1)
	        audio_values = model.generate(
	            **unconditional_inputs,
	            do_sample=True,
	            max_new_tokens=256,
	        )

	    # First sample of the batch; if it still has more than one dimension,
	    # keep only its first row so a 1-D signal is written.
	    waveform = audio_values[0].cpu().numpy()
	    if waveform.ndim > 1:
	        waveform = waveform[0]

	    # Clamp to [-1, 1] and scale to 16-bit PCM.
	    pcm = (np.clip(waveform, -1.0, 1.0) * 32767).astype(np.int16)

	    audio_file = "musicgen_out.wav"
	    sampling_rate = model.config.audio_encoder.sampling_rate
	    scipy.io.wavfile.write(audio_file, sampling_rate, pcm)

	    return audio_file

	def combine_audio_video(audio_file, video_file):
	    """Mux an audio file and a video file into ``combined_output.mp4``.

	    The video stream is copied without re-encoding and the audio is encoded
	    as AAC.

	    Args:
	        audio_file: Path of the audio file to add.
	        video_file: Path of the video file to add the audio to.

	    Returns:
	        The path of the written file, ``"combined_output.mp4"``.
	    """
	    output_file = "combined_output.mp4"
	    audio = ffmpeg.input(audio_file)
	    video = ffmpeg.input(video_file)
	    output = ffmpeg.output(video, audio, output_file, vcodec='copy', acodec='aac')
	    # The output name is fixed, so every run after the first finds the file
	    # already present; without the overwrite flag ffmpeg refuses to replace it.
	    ffmpeg.run(output, overwrite_output=True)
	    return output_file

	def interface(image_path, prompt, negative_prompt, video_length, music_prompt, unconditional):
	    """Run the full generation flow: video, then music, then mux them together.

	    Args:
	        image_path: Filesystem path of the uploaded conditioning image.
	        prompt: Text prompt for the video.
	        negative_prompt: Negative text prompt for the video.
	        video_length: Requested video duration in seconds.
	        music_prompt: Text prompt for the music.
	        unconditional: When truthy, generate music without a text prompt.

	    Returns:
	        The path of the combined audio/video file.

	    Raises:
	        gr.Error: If no image path was provided.
	    """
	    # The path is empty when nothing was uploaded; report that clearly
	    # instead of letting the image loader fail with an opaque error.
	    if not image_path:
	        raise gr.Error("Please upload an image before generating.")

	    # Load the pixels eagerly inside a context manager so the file handle is
	    # released; RGB matches what the pipeline's documented example passes in.
	    with Image.open(image_path) as source:
	        image = source.convert("RGB")

	    video_file = generate_video(image, prompt, negative_prompt, video_length)
	    audio_file = generate_music(music_prompt, unconditional)
	    return combine_audio_video(audio_file, video_file)

	with gr.Blocks() as demo:
	    gr.Markdown("# AI-Powered Video and Music Generation")

	    # Input controls, laid out side by side in one row.
	    with gr.Row():
	        image_input = gr.Image(label="Upload Image", type="filepath")
	        prompt_input = gr.Textbox(label="Enter the Video Prompt")
	        negative_prompt_input = gr.Textbox(label="Enter the Negative Prompt")
	        video_length_input = gr.Number(
	            value=10,
	            precision=0,
	            label="Video Length (seconds)",
	        )
	        music_prompt_input = gr.Textbox(label="Enter the Music Prompt")
	        unconditional_checkbox = gr.Checkbox(label="Generate Unconditional Music")

	    generate_button = gr.Button("Generate Video and Music")
	    output_video = gr.Video(label="Output Video with Sound")

	    # The order of this list must match the parameter order of `interface`.
	    generate_button.click(
	        fn=interface,
	        inputs=[
	            image_input,
	            prompt_input,
	            negative_prompt_input,
	            video_length_input,
	            music_prompt_input,
	            unconditional_checkbox,
	        ],
	        outputs=output_video,
	        show_progress=True,
	    )

	demo.launch()