from argparse import ArgumentParser

from diffusers import DDIMScheduler, StableDiffusionXLImg2ImgPipeline
import gradio as gr
import torch
import yaml

from ctrl_x.pipelines.pipeline_sdxl import CtrlXStableDiffusionXLPipeline
from ctrl_x.utils import *
from ctrl_x.utils.sdxl import *

import spaces

parser = ArgumentParser()
parser.add_argument("-m", "--model", type=str, default=None)  # Optionally, load model checkpoint from single file
args = parser.parse_args()

torch.backends.cudnn.enabled = False  # Sometimes necessary to suppress CUDNN_STATUS_NOT_SUPPORTED

torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model_id_or_path = "stabilityai/stable-diffusion-xl-base-1.0"
refiner_id_or_path = "stabilityai/stable-diffusion-xl-refiner-1.0"
device = "cuda" if torch.cuda.is_available() else "cpu"
# variant = "fp16" if device == "cuda" else "fp32"

scheduler = DDIMScheduler.from_config(model_id_or_path, subfolder="scheduler")  # TODO: Support other schedulers

if args.model is None:
    pipe = CtrlXStableDiffusionXLPipeline.from_pretrained(
        model_id_or_path, scheduler=scheduler, torch_dtype=torch_dtype, use_safetensors=True
    )
else:
    print(f"Using weights {args.model} for SDXL base model.")
    pipe = CtrlXStableDiffusionXLPipeline.from_single_file(args.model, scheduler=scheduler, torch_dtype=torch_dtype)

refiner = StableDiffusionXLImg2ImgPipeline.from_pretrained(
    refiner_id_or_path,
    scheduler=scheduler,
    text_encoder_2=pipe.text_encoder_2,
    vae=pipe.vae,
    torch_dtype=torch_dtype,
    use_safetensors=True,
)

if torch.cuda.is_available():
    pipe = pipe.to("cuda")
    refiner = refiner.to("cuda")


def get_control_config(structure_schedule, appearance_schedule):
    s = structure_schedule
    a = appearance_schedule

    control_config = \
f"""control_schedule:
    #       structure_conv   structure_attn     appearance_attn     conv/attn
    encoder:                                                      # (num layers)
        0: [[         ], [              ], [              ]]      # 2/0
        1: [[         ], [              ], [{a}, {a}      ]]      # 2/2
        2: [[         ], [              ], [{a}, {a}      ]]      # 2/2
    middle: [[        ], [              ], [              ]]      # 2/1
    decoder:
        0: [[{s}      ], [{s}, {s}, {s} ], [0.0, {a}, {a} ]]      # 3/3
        1: [[         ], [              ], [{a}, {a}      ]]      # 3/3
        2: [[         ], [              ], [              ]]      # 3/0

control_target:
    - [output_tensor]  # structure_conv   choices: {{hidden_states, output_tensor}}
    - [query, key]     # structure_attn   choices: {{query, key, value}}
    - [before]         # appearance_attn  choices: {{before, value, after}}

self_recurrence_schedule:
    - [0.1, 0.5, 2]  # format: [start, end, num_recurrence]"""

    return control_config


css = """
.config textarea {font-family: monospace; font-size: 80%; white-space: pre}
.mono {font-family: monospace}
"""

title = """

Ctrl-X: Controlling Structure and Appearance for Text-To-Image Generation Without Guidance

SDXL v1.0

[Page]            [Paper]            [Code]

""" description = """

Ctrl-X is a simple training-free and guidance-free framework for text-to-image (T2I) generation with structure and appearance control. Given a structure image and an appearance image, Ctrl-X applies feedforward structure control to align the output with the (arbitrary) structure image, and semantic-aware appearance transfer to carry over the visual appearance of the appearance image.

Here are some notes and tips for this demo:

Have fun! :D

""" @spaces.GPU def inference( structure_image, appearance_image, prompt, structure_prompt, appearance_prompt, positive_prompt="high quality", negative_prompt="ugly, blurry, dark, low res, unrealistic", guidance_scale=5.0, structure_guidance_scale=5.0, appearance_guidance_scale=5.0, num_inference_steps=28, eta=1.0, seed=42, width=1024, height=1024, structure_schedule=0.6, appearance_schedule=0.6, use_advanced_config=False, control_config="", progress=gr.Progress(track_tqdm=True) ): torch.manual_seed(seed) pipe.scheduler.set_timesteps(num_inference_steps, device=device) timesteps = pipe.scheduler.timesteps print(f"\nUsing the following control config (use_advanced_config={use_advanced_config}):") if not use_advanced_config: control_config = get_control_config(structure_schedule, appearance_schedule) print(control_config, end="\n\n") config = yaml.safe_load(control_config) register_control( model = pipe, timesteps = timesteps, control_schedule = config["control_schedule"], control_target = config["control_target"], ) pipe.safety_checker = None pipe.requires_safety_checker = False self_recurrence_schedule = get_self_recurrence_schedule(config["self_recurrence_schedule"], num_inference_steps) pipe.set_progress_bar_config(desc="Ctrl-X inference") refiner.set_progress_bar_config(desc="Refiner") result, structure, appearance = pipe( prompt = prompt, structure_prompt = structure_prompt, appearance_prompt = appearance_prompt, structure_image = structure_image, appearance_image = appearance_image, num_inference_steps = num_inference_steps, negative_prompt = negative_prompt, positive_prompt = positive_prompt, height = height, width = width, guidance_scale = guidance_scale, structure_guidance_scale = structure_guidance_scale, appearance_guidance_scale = appearance_guidance_scale, eta = eta, output_type = "pil", return_dict = False, control_schedule = config["control_schedule"], self_recurrence_schedule = self_recurrence_schedule, ) result_refiner = refiner( image = pipe.refiner_args["latents"], prompt = pipe.refiner_args["prompt"], negative_prompt = pipe.refiner_args["negative_prompt"], height = height, width = width, num_inference_steps = num_inference_steps, guidance_scale = guidance_scale, guidance_rescale = 0.7, num_images_per_prompt = 1, eta = eta, output_type = "pil", ).images del pipe.refiner_args return [result[0], result_refiner[0], structure[0], appearance[0]] with gr.Blocks(theme=gr.themes.Default(), css=css, title="Ctrl-X (SDXL v1.0)") as app: gr.HTML(title) with gr.Accordion("Instructions", open=False): gr.HTML(description) with gr.Row(): with gr.Column(scale=45): with gr.Group(): kwargs = {} # {"width": 400, "height": 400} with gr.Row(): structure_image = gr.Image(label="Upload structure image (optional)", type="pil", **kwargs) appearance_image = gr.Image(label="Upload appearance image (optional)", type="pil", **kwargs) with gr.Row(): structure_prompt = gr.Textbox(label="Structure prompt (optional)", placeholder="Describes the structure image") appearance_prompt = gr.Textbox(label="Appearance prompt (optional)", placeholder="Describes the style image") with gr.Row(): prompt = gr.Textbox(label="Output prompt", placeholder="Prompt which describes the output image") with gr.Row(): positive_prompt = gr.Textbox(label="Positive prompt", value="high quality", placeholder="") negative_prompt = gr.Textbox(label="Negative prompt", value="ugly, blurry, dark, low res, unrealistic", placeholder="") with gr.Accordion("Advanced Options", open=False): with gr.Row(): guidance_scale = 
gr.Slider(label="Target guidance scale", value=5.0, minimum=1, maximum=10) structure_guidance_scale = gr.Slider(label="Structure guidance scale", value=5.0, minimum=1, maximum=10) appearance_guidance_scale = gr.Slider(label="Appearance guidance scale", value=5.0, minimum=1, maximum=10) with gr.Row(): num_inference_steps = gr.Slider(label="# inference steps", value=28, minimum=1, maximum=200, step=1) eta = gr.Slider(label="Eta (noise)", value=1.0, minimum=0, maximum=1.0, step=0.01) seed = gr.Slider(0, 2147483647, label="Seed", value=90095, step=1) with gr.Row(): width = gr.Slider(label="Width", value=1024, minimum=256, maximum=2048, step=pipe.vae_scale_factor) height = gr.Slider(label="Height", value=1024, minimum=256, maximum=2048, step=pipe.vae_scale_factor) with gr.Row(): structure_schedule = gr.Slider(label="Structure schedule", value=0.6, minimum=0.0, maximum=1.0, step=0.01, scale=2) appearance_schedule = gr.Slider(label="Appearance schedule", value=0.6, minimum=0.0, maximum=1.0, step=0.01, scale=2) use_advanced_config = gr.Checkbox(label="Use advanced config", value=False, scale=1) with gr.Row(): control_config = gr.Textbox( label="Advanced control config", lines=20, value=get_control_config(0.6, 0.6), elem_classes=["config"], visible=False, ) use_advanced_config.change( fn=lambda value: gr.update(visible=value), inputs=use_advanced_config, outputs=control_config, ) with gr.Row(): generate = gr.Button(value="Run") with gr.Column(scale=55): with gr.Group(): with gr.Row(): result_refiner = gr.Image(label="Output image w/ refiner", format="jpg", **kwargs) with gr.Row(): result = gr.Image(label="Output image", format="jpg", **kwargs) structure_recon = gr.Image(label="Structure image", format="jpg", **kwargs) appearance_recon = gr.Image(label="Style image", format="jpg", **kwargs) inputs = [ structure_image, appearance_image, prompt, structure_prompt, appearance_prompt, positive_prompt, negative_prompt, guidance_scale, structure_guidance_scale, appearance_guidance_scale, num_inference_steps, eta, seed, width, height, structure_schedule, appearance_schedule, use_advanced_config, control_config, ] outputs = [result, result_refiner, structure_recon, appearance_recon] generate.click(inference, inputs=inputs, outputs=outputs) examples = gr.Examples( [ [ "assets/images/horse__point_cloud.jpg", "assets/images/horse.jpg", "a photo of a horse standing on grass", "a 3D point cloud of a horse", "", ], [ "assets/images/cat__mesh.jpg", "assets/images/tiger.jpg", "a photo of a tiger standing on snow", "a 3D mesh of a cat", "", ], [ "assets/images/dog__sketch.jpg", "assets/images/squirrel.jpg", "a photo of a squirrel", "a sketch of a dog", "", ], [ "assets/images/living_room__seg.jpg", "assets/images/van_gogh.jpg", "a Van Gogh painting of a living room", "a segmentation map of a living room", "", ], [ "assets/images/bedroom__sketch.jpg", "assets/images/living_room_modern.jpg", "a sketch of a bedroom", "a photo of a modern bedroom during sunset", "", ], [ "assets/images/running__pose.jpg", "assets/images/man_park.jpg", "a photo of a man running in a park", "a pose image of a person running", "", ], [ "assets/images/fruit_bowl.jpg", "assets/images/grapes.jpg", "a photo of a bowl of grapes in the trees", "a photo of a bowl of fruits", "", ], [ "assets/images/bear_avocado__spatext.jpg", None, "a realistic photo of a bear and an avocado in a forest", "a segmentation map of a bear and an avocado", "", ], [ "assets/images/cat__point_cloud.jpg", None, "an embroidery of a white cat sitting on a rock under the 
night sky", "a 3D point cloud of a cat", "", ], [ "assets/images/library__mesh.jpg", None, "a Polaroid photo of an old library, sunlight streaming in", "a 3D mesh of a library", "", ], [ "assets/images/knight__humanoid.jpg", None, "a photo of a medieval soldier standing on a barren field, raining", "a 3D model of a person holding a sword and shield", "", ], [ "assets/images/person__mesh.jpg", None, "a photo of a Karate man performing in a cyberpunk city at night", "a 3D mesh of a person", "", ], ], [ structure_image, appearance_image, prompt, structure_prompt, appearance_prompt, ], examples_per_page=50, cache_examples="lazy", fn=inference, outputs=[result, result_refiner, structure_recon, appearance_recon] ) app.launch(debug=False, share=False)