import inspect
from typing import Callable, List, Optional, Union

import torch
from diffusers import StableDiffusionPipeline


def freeze_params(params):
    """Disable gradient tracking for an iterable of parameters."""
    for param in params:
        param.requires_grad = False


class RewardStableDiffusion(StableDiffusionPipeline):
    """StableDiffusionPipeline whose sampling loop stays on the autograd graph,
    so a reward computed on the decoded image can be backpropagated to the
    input latents."""

    def __init__(
        self,
        vae,
        text_encoder,
        tokenizer,
        unet,
        scheduler,
        safety_checker,
        feature_extractor,
        image_encoder=None,
        requires_safety_checker: bool = True,
        memsave: bool = False,
    ):
        super().__init__(
            vae,
            text_encoder,
            tokenizer,
            unet,
            scheduler,
            safety_checker,
            feature_extractor,
            image_encoder,
            requires_safety_checker=requires_safety_checker,
        )

        if memsave:
            # Optional dependency: replaces supported layers with
            # memory-saving drop-in equivalents.
            import memsave_torch.nn

            self.vae = memsave_torch.nn.convert_to_memory_saving(self.vae)
            self.unet = memsave_torch.nn.convert_to_memory_saving(self.unet)
            self.text_encoder = memsave_torch.nn.convert_to_memory_saving(
                self.text_encoder
            )

        # Trade compute for activation memory during backprop. Note the
        # asymmetric APIs: the transformers text encoder uses
        # `gradient_checkpointing_enable`, the diffusers UNet uses
        # `enable_gradient_checkpointing`.
        self.text_encoder.gradient_checkpointing_enable()
        self.unet.enable_gradient_checkpointing()

        self.vae.eval()
        self.text_encoder.eval()
        self.unet.eval()

        # Freeze all weights: gradients are needed with respect to the
        # latents only, never the model parameters.
        freeze_params(self.vae.parameters())
        freeze_params(self.unet.parameters())
        freeze_params(self.text_encoder.parameters())

    def decode_latents_tensors(self, latents):
        """Decode latents to image tensors in [0, 1] without detaching them
        from the autograd graph."""
        latents = 1 / 0.18215 * latents  # 0.18215 is the SD v1 VAE scaling factor
        image = self.vae.decode(latents).sample
        image = (image / 2 + 0.5).clamp(0, 1)
        return image

    def apply(
        self,
        latents: torch.Tensor,
        prompt: Optional[Union[str, List[str]]] = None,
        text_embeddings=None,  # accepted for interface compatibility; unused
        image=None,  # accepted for interface compatibility; unused
        height: Optional[int] = None,
        width: Optional[int] = None,
        timesteps: Optional[List[int]] = None,
        num_inference_steps: int = 1,
        guidance_scale: float = 1.0,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: Optional[int] = 1,
    ) -> torch.Tensor:
        """Differentiable counterpart of `StableDiffusionPipeline.__call__`:
        denoises the given `latents` and returns decoded image tensors."""
        # 0. Default height and width to the UNet's native sample size.
        height = height or self.unet.config.sample_size * self.vae_scale_factor
        width = width or self.unet.config.sample_size * self.vae_scale_factor

        # Features of the stock pipeline that this method does not expose.
        prompt_embeds = None
        negative_prompt_embeds = None
        ip_adapter_image = None
        ip_adapter_image_embeds = None
        callback_on_step_end_tensor_inputs = None
        guidance_rescale = 0.0
        clip_skip = None
        cross_attention_kwargs = None

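        # 1. Check inputs. Raise an error if arguments are inconsistent.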
        self.check_inputs(
            prompt,
            height,
            width,
            callback_steps,
            negative_prompt,
            prompt_embeds,
            negative_prompt_embeds,
            ip_adapter_image,
            ip_adapter_image_embeds,
            callback_on_step_end_tensor_inputs,
        )

        self._guidance_scale = guidance_scale
        self._guidance_rescale = guidance_rescale
        self._clip_skip = clip_skip
        self._cross_attention_kwargs = cross_attention_kwargs
        self._interrupt = False

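        # 2. Define call parameters.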
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device

        lora_scale = (
            self.cross_attention_kwargs.get("scale", None)
            if self.cross_attention_kwargs is not None
            else None
        )

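        # 3. Encode the input prompt.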
        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
            prompt,
            device,
            num_images_per_prompt,
            self.do_classifier_free_guidance,
            negative_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            lora_scale=lora_scale,
            clip_skip=self.clip_skip,
        )

        # For classifier-free guidance, concatenate the unconditional and text
        # embeddings into a single batch to avoid two UNet forward passes.
        if self.do_classifier_free_guidance:
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])

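        # Prepare IP-Adapter image embeddings, if an adapter input was given.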
        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
            image_embeds = self.prepare_ip_adapter_image_embeds(
                ip_adapter_image,
                ip_adapter_image_embeds,
                device,
                batch_size * num_images_per_prompt,
                self.do_classifier_free_guidance,
            )

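        # 4. Prepare timesteps.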
        timesteps, num_inference_steps = retrieve_timesteps(
            self.scheduler, num_inference_steps, device, timesteps
        )

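        # 5. Prepare latent variables (scales the supplied latents by the
        # scheduler's initial noise sigma).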
        num_channels_latents = self.unet.config.in_channels
        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
            num_channels_latents,
            height,
            width,
            prompt_embeds.dtype,
            device,
            generator,
            latents,
        )

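        # 6. Prepare extra kwargs for the scheduler step (`eta` is only used
        # by DDIM-style schedulers).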
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

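        # 6.1 Add image embeds for IP-Adapter.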
        added_cond_kwargs = (
            {"image_embeds": image_embeds}
            if (ip_adapter_image is not None or ip_adapter_image_embeds is not None)
            else None
        )

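        # 6.2 Optionally get a guidance scale embedding, used by UNets that
        # are conditioned on the guidance scale (e.g. guidance-distilled models).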
        timestep_cond = None
        if self.unet.config.time_cond_proj_dim is not None:
            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(
                batch_size * num_images_per_prompt
            )
            timestep_cond = self.get_guidance_scale_embedding(
                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
            ).to(device=device, dtype=latents.dtype)

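        # 7. Denoising loop. Unlike `__call__`, this runs without
        # `torch.no_grad`, so gradients can flow through every step back to
        # the input latents.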
        self._num_timesteps = len(timesteps)
        for i, t in enumerate(timesteps):
            # Duplicate the latents so one UNet pass covers both the
            # unconditional and the text-conditioned branch.
            latent_model_input = (
                torch.cat([latents] * 2)
                if self.do_classifier_free_guidance
                else latents
            )
            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

            # Predict the noise residual.
            noise_pred = self.unet(
                latent_model_input,
                t,
                encoder_hidden_states=prompt_embeds,
                timestep_cond=timestep_cond,
                added_cond_kwargs=added_cond_kwargs,
                return_dict=False,
            )[0]

            # Classifier-free guidance: move from the unconditional prediction
            # towards the text-conditioned one.
            if self.do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (
                    noise_pred_text - noise_pred_uncond
                )

            # Compute the previous noisy sample x_t -> x_{t-1}.
            latents = self.scheduler.step(
                noise_pred, t, latents, **extra_step_kwargs, return_dict=False
            )[0]
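
        # 8. Decode to image tensors; skips the safety checker and PIL
        # conversion so the output stays differentiable.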
        image = self.decode_latents_tensors(latents)
        return image


def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    **kwargs,
):
    """
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the
    scheduler after the call. Handles custom timesteps. Any kwargs are forwarded
    to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a
            pre-trained model. If used, `timesteps` must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved. If `None`, the
            timesteps are not moved.
        timesteps (`List[int]`, *optional*):
            Custom timesteps used to support arbitrary spacing between timesteps.
            If `None`, the scheduler's default timestep spacing strategy is used.
            If `timesteps` is passed, `num_inference_steps` must be `None`.

    Returns:
        `Tuple[torch.Tensor, int]`: A tuple where the first element is the
        timestep schedule from the scheduler and the second element is the
        number of inference steps.
    """
    if timesteps is not None:
        # Custom schedules only work if the scheduler's `set_timesteps`
        # accepts a `timesteps` argument.
        accepts_timesteps = "timesteps" in set(
            inspect.signature(scheduler.set_timesteps).parameters.keys()
        )
        if not accepts_timesteps:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" timestep schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    else:
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        timesteps = scheduler.timesteps
    return timesteps, num_inference_steps
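

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module. It assumes the
    # public "runwayml/stable-diffusion-v1-5" checkpoint, a CUDA device, and
    # uses a placeholder mean-pixel reward where a real reward model (e.g. an
    # aesthetic or CLIP scorer) would go.
    pipe = RewardStableDiffusion.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        safety_checker=None,
        requires_safety_checker=False,
    ).to("cuda")

    # The latents are the optimization variable, so they require gradients.
    latents = torch.randn(1, 4, 64, 64, device="cuda", requires_grad=True)
    images = pipe.apply(
        latents,
        prompt="a photo of a corgi",
        num_inference_steps=1,
    )

    reward = images.mean()  # placeholder reward on the decoded image
    reward.backward()  # gradients w.r.t. the latents land in `latents.grad`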