Spaces:

fffiloni
/

ReNO

Running on L40S

App Files Files Community

ReNO / models /RewardStableDiffusion.py

fffiloni

Upload 24 files

ca25718 verified 5 days ago

raw

history blame

No virus

10.2 kB

	import inspect
	from typing import Callable, List, Optional, Union

	import torch
	from diffusers import StableDiffusionPipeline


	def freeze_params(params):
	    """Turn off gradient tracking for every parameter yielded by ``params``.

	    Args:
	        params: Iterable of objects exposing a writable ``requires_grad``
	            attribute (e.g. the result of ``module.parameters()``).
	    """
	    for p in params:
	        p.requires_grad = False


	class RewardStableDiffusion(StableDiffusionPipeline):
	    """Stable Diffusion pipeline with frozen weights that maps latents to decoded images.

	    The constructor switches the VAE, text encoder and UNet to eval mode, disables
	    gradients on their parameters and turns on gradient checkpointing for the text
	    encoder and the UNet. ``apply`` then runs the sampling loop and returns the
	    decoded image tensor rather than a pipeline output object.
	    """

	    def __init__(
	        self,
	        vae,
	        text_encoder,
	        tokenizer,
	        unet,
	        scheduler,
	        safety_checker,
	        feature_extractor,
	        image_encoder=None,
	        requires_safety_checker: bool = True,
	        memsave=False,
	    ):
	        """Build the pipeline, then freeze and checkpoint its sub-models.

	        Args:
	            vae, text_encoder, tokenizer, unet, scheduler, safety_checker,
	            feature_extractor, image_encoder: Components forwarded unchanged to
	                ``StableDiffusionPipeline.__init__``.
	            requires_safety_checker: Forwarded to the parent constructor.
	            memsave: When true, the VAE, UNet and text encoder are passed through
	                ``memsave_torch.nn.convert_to_memory_saving`` (the package is
	                imported lazily, so it is only required when this flag is set).
	        """
	        super().__init__(
	            vae,
	            text_encoder,
	            tokenizer,
	            unet,
	            scheduler,
	            safety_checker,
	            feature_extractor,
	            image_encoder,
	            # Forward the flag so the parent honours the caller's choice
	            # instead of silently falling back to its own default.
	            requires_safety_checker=requires_safety_checker,
	        )
	        # optionally enable memsave_torch
	        if memsave:
	            import memsave_torch.nn

	            self.vae = memsave_torch.nn.convert_to_memory_saving(self.vae)
	            self.unet = memsave_torch.nn.convert_to_memory_saving(self.unet)
	            self.text_encoder = memsave_torch.nn.convert_to_memory_saving(
	                self.text_encoder
	            )
	        # enable checkpointing
	        self.text_encoder.gradient_checkpointing_enable()
	        self.unet.enable_gradient_checkpointing()
	        self.vae.eval()
	        self.text_encoder.eval()
	        self.unet.eval()

	        # freeze diffusion parameters
	        freeze_params(self.vae.parameters())
	        freeze_params(self.unet.parameters())
	        freeze_params(self.text_encoder.parameters())

	    def decode_latents_tensors(self, latents):
	        """Decode latents with the VAE and map the result into ``[0, 1]``.

	        Args:
	            latents: Latent tensor to decode. It is divided by the VAE scaling
	                factor before being passed to ``self.vae.decode``.

	        Returns:
	            The decoded sample after ``x / 2 + 0.5`` and clamping to ``[0, 1]``.
	        """
	        # Prefer the scaling factor the VAE was configured with; fall back to the
	        # historical Stable Diffusion constant when the config does not carry one.
	        scaling_factor = getattr(
	            getattr(self.vae, "config", None), "scaling_factor", None
	        )
	        if scaling_factor is None:
	            scaling_factor = 0.18215
	        latents = 1 / scaling_factor * latents
	        image = self.vae.decode(latents).sample
	        image = (image / 2 + 0.5).clamp(0, 1)
	        return image

	    def apply(
	        self,
	        latents: torch.Tensor,
	        prompt: Union[str, List[str]] = None,
	        text_embeddings=None,
	        image=None,
	        height: Optional[int] = None,
	        width: Optional[int] = None,
	        timesteps: Optional[List[int]] = None,
	        num_inference_steps: int = 1,
	        guidance_scale: float = 1.0,
	        negative_prompt: Optional[Union[str, List[str]]] = None,
	        num_images_per_prompt: Optional[int] = 1,
	        eta: float = 0.0,
	        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
	        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
	        callback_steps: Optional[int] = 1,
	    ) -> torch.Tensor:
	        """Run the sampling loop from ``latents`` and return the decoded image tensor.

	        This method does not itself enter ``torch.no_grad()``.

	        Args:
	            latents: Starting latents, forwarded to ``prepare_latents``.
	            prompt: Prompt string or list of prompt strings.
	            text_embeddings: Pre-computed prompt embeddings. Used as
	                ``prompt_embeds`` only when ``prompt`` is ``None``; ignored
	                otherwise. Presumably must have the layout ``encode_prompt``
	                expects for ``prompt_embeds`` - confirm against diffusers.
	            image: Accepted for interface compatibility; not used here.
	            height, width: Output size; each defaults to
	                ``unet.config.sample_size * vae_scale_factor``.
	            timesteps: Optional custom timestep schedule passed to
	                ``retrieve_timesteps``.
	            num_inference_steps: Number of scheduler steps.
	            guidance_scale: Weight used when combining the unconditional and
	                text-conditioned noise predictions.
	            negative_prompt: Negative prompt(s) forwarded to ``encode_prompt``.
	            num_images_per_prompt: Multiplier applied to the batch size.
	            eta, generator: Forwarded to ``prepare_extra_step_kwargs``
	                (``generator`` also to ``prepare_latents``).
	            callback: Accepted for interface compatibility; not used here.
	            callback_steps: Only forwarded to ``check_inputs``.

	        Returns:
	            The output of ``decode_latents_tensors`` for the final latents.
	        """
	        # 0. Default height and width to unet
	        height = height or self.unet.config.sample_size * self.vae_scale_factor
	        width = width or self.unet.config.sample_size * self.vae_scale_factor

	        # Pre-computed embeddings are honoured only when no text prompt is given,
	        # so calls that pass a prompt behave exactly as they did before.
	        prompt_embeds = text_embeddings if prompt is None else None
	        negative_prompt_embeds = None
	        ip_adapter_image = None
	        ip_adapter_image_embeds = None
	        callback_on_step_end_tensor_inputs = None
	        guidance_rescale = 0.0
	        clip_skip = None
	        cross_attention_kwargs = None

	        # 1. Check inputs. Raise error if not correct
	        self.check_inputs(
	            prompt,
	            height,
	            width,
	            callback_steps,
	            negative_prompt,
	            prompt_embeds,
	            negative_prompt_embeds,
	            ip_adapter_image,
	            ip_adapter_image_embeds,
	            callback_on_step_end_tensor_inputs,
	        )

	        # These private attributes back the properties read below
	        # (guidance_scale, clip_skip, cross_attention_kwargs, ...).
	        self._guidance_scale = guidance_scale
	        self._guidance_rescale = guidance_rescale
	        self._clip_skip = clip_skip
	        self._cross_attention_kwargs = cross_attention_kwargs
	        self._interrupt = False

	        # 2. Define call parameters
	        if isinstance(prompt, str):
	            batch_size = 1
	        elif isinstance(prompt, list):
	            batch_size = len(prompt)
	        else:
	            batch_size = prompt_embeds.shape[0]

	        device = self._execution_device

	        # 3. Encode input prompt
	        lora_scale = (
	            self.cross_attention_kwargs.get("scale", None)
	            if self.cross_attention_kwargs is not None
	            else None
	        )

	        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
	            prompt,
	            device,
	            num_images_per_prompt,
	            self.do_classifier_free_guidance,
	            negative_prompt,
	            prompt_embeds=prompt_embeds,
	            negative_prompt_embeds=negative_prompt_embeds,
	            lora_scale=lora_scale,
	            clip_skip=self.clip_skip,
	        )

	        # For classifier free guidance, we need to do two forward passes.
	        # Here we concatenate the unconditional and text embeddings into a single batch
	        # to avoid doing two forward passes
	        if self.do_classifier_free_guidance:
	            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])

	        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
	            image_embeds = self.prepare_ip_adapter_image_embeds(
	                ip_adapter_image,
	                ip_adapter_image_embeds,
	                device,
	                batch_size * num_images_per_prompt,
	                self.do_classifier_free_guidance,
	            )

	        # 4. Prepare timesteps
	        timesteps, num_inference_steps = retrieve_timesteps(
	            self.scheduler, num_inference_steps, device, timesteps
	        )

	        # 5. Prepare latent variables
	        num_channels_latents = self.unet.config.in_channels
	        latents = self.prepare_latents(
	            batch_size * num_images_per_prompt,
	            num_channels_latents,
	            height,
	            width,
	            prompt_embeds.dtype,
	            device,
	            generator,
	            latents,
	        )

	        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
	        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

	        # 6.1 Add image embeds for IP-Adapter
	        added_cond_kwargs = (
	            {"image_embeds": image_embeds}
	            if (ip_adapter_image is not None or ip_adapter_image_embeds is not None)
	            else None
	        )

	        # 6.2 Optionally get Guidance Scale Embedding
	        timestep_cond = None
	        if self.unet.config.time_cond_proj_dim is not None:
	            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(
	                batch_size * num_images_per_prompt
	            )
	            timestep_cond = self.get_guidance_scale_embedding(
	                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
	            ).to(device=device, dtype=latents.dtype)

	        # 7. Denoising loop
	        self._num_timesteps = len(timesteps)
	        for i, t in enumerate(timesteps):
	            # expand the latents if we are doing classifier free guidance
	            latent_model_input = (
	                torch.cat([latents] * 2)
	                if self.do_classifier_free_guidance
	                else latents
	            )
	            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

	            # predict the noise residual
	            noise_pred = self.unet(
	                latent_model_input,
	                t,
	                encoder_hidden_states=prompt_embeds,
	                timestep_cond=timestep_cond,
	                added_cond_kwargs=added_cond_kwargs,
	                return_dict=False,
	            )[0]

	            # perform guidance
	            if self.do_classifier_free_guidance:
	                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
	                noise_pred = noise_pred_uncond + guidance_scale * (
	                    noise_pred_text - noise_pred_uncond
	                )

	            # compute the previous noisy sample x_t -> x_t-1
	            latents = self.scheduler.step(
	                noise_pred, t, latents, **extra_step_kwargs, return_dict=False
	            )[0]

	        image = self.decode_latents_tensors(latents)
	        return image


	def retrieve_timesteps(
	    scheduler,
	    num_inference_steps: Optional[int] = None,
	    device: Optional[Union[str, torch.device]] = None,
	    timesteps: Optional[List[int]] = None,
	    **kwargs,
	):
	    """
	    Configure the scheduler via `set_timesteps` and read the resulting schedule back from it.

	    Args:
	        scheduler (`SchedulerMixin`):
	            Scheduler whose `set_timesteps` method is called.
	        num_inference_steps (`int`, optional):
	            Number of steps handed to `set_timesteps` when no custom `timesteps` are given.
	        device (`str` or `torch.device`, optional):
	            Passed to `set_timesteps` as `device`.
	        timesteps (`List[int]`, optional):
	            Custom schedule. When given, it is passed to `set_timesteps` by keyword and
	            `num_inference_steps` is ignored.
	        **kwargs:
	            Extra keyword arguments forwarded to `scheduler.set_timesteps`.

	    Returns:
	        `Tuple[torch.Tensor, int]`: `scheduler.timesteps` after the call, and the number of inference steps
	        (the length of the schedule when custom `timesteps` were supplied, otherwise `num_inference_steps`
	        as passed in).

	    Raises:
	        ValueError: If custom `timesteps` are given but `scheduler.set_timesteps` has no `timesteps` parameter.
	    """
	    # Default path: let the scheduler pick its own spacing.
	    if timesteps is None:
	        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
	        return scheduler.timesteps, num_inference_steps

	    # Custom schedules need a scheduler whose `set_timesteps` takes `timesteps`.
	    set_timesteps_params = inspect.signature(scheduler.set_timesteps).parameters
	    if "timesteps" not in set_timesteps_params:
	        raise ValueError(
	            f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
	            f" timestep schedules. Please check whether you are using the correct scheduler."
	        )

	    scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
	    schedule = scheduler.timesteps
	    return schedule, len(schedule)