JingyeChen22 committed on
Commit
9de996f
1 Parent(s): 4595437

Update app.py

Files changed (1)
  1. app.py +133 -60
app.py CHANGED
@@ -26,10 +26,6 @@ os.system('wget https://huggingface.co/datasets/JingyeChen22/TextDiffuser/resolve/main/Arial.ttf')
 if not os.path.exists('Arial.ttf'):
     os.system('wget https://huggingface.co/datasets/JingyeChen22/TextDiffuser/resolve/main/Arial.ttf')
 
-
-os.system('echo finish')
-os.system('ls -a')
-
 import cv2
 import random
 import logging
@@ -67,7 +63,7 @@ from diffusers.utils.import_utils import is_xformers_available
 import transformers
 from transformers import CLIPTextModel, CLIPTokenizer
 
-from util import segmentation_mask_visualization, make_caption_pil, combine_image, transform_mask, transform_mask_pil, filter_segmentation_mask, inpainting_merge_image
+from util import segmentation_mask_visualization, make_caption_pil, combine_image, transform_mask_pil, filter_segmentation_mask, inpainting_merge_image
 from model.layout_generator import get_layout_from_prompt
 from model.text_segmenter.unet import UNet
 
@@ -364,20 +360,40 @@ if accelerator.is_main_process:
     print(args.output_dir)
 
 # Load scheduler, tokenizer and models.
-tokenizer = CLIPTokenizer.from_pretrained(
-    args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision
+tokenizer15 = CLIPTokenizer.from_pretrained(
+    'runwayml/stable-diffusion-v1-5', subfolder="tokenizer", revision=args.revision
 )
-text_encoder = CLIPTextModel.from_pretrained(
-    args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
+tokenizer21 = CLIPTokenizer.from_pretrained(
+    'stabilityai/stable-diffusion-2-1', subfolder="tokenizer", revision=args.revision
+)
+
+text_encoder15 = CLIPTextModel.from_pretrained(
+    'runwayml/stable-diffusion-v1-5', subfolder="text_encoder", revision=args.revision
 )
-vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision).cuda()
-unet = UNet2DConditionModel.from_pretrained(
-    args.resume_from_checkpoint, subfolder="unet", revision=None
+text_encoder21 = CLIPTextModel.from_pretrained(
+    'stabilityai/stable-diffusion-2-1', subfolder="text_encoder", revision=args.revision
+)
+
+vae15 = AutoencoderKL.from_pretrained('runwayml/stable-diffusion-v1-5', subfolder="vae", revision=args.revision).cuda()
+unet15 = UNet2DConditionModel.from_pretrained(
+    'textdiffuser-ckpt/diffusion_backbone_1.5', subfolder="unet", revision=None
+).cuda()
+
+vae21 = AutoencoderKL.from_pretrained('stabilityai/stable-diffusion-2-1', subfolder="vae", revision=args.revision).cuda()
+unet21 = UNet2DConditionModel.from_pretrained(
+    'textdiffuser-ckpt/diffusion_backbone_2.1', subfolder="unet", revision=None
 ).cuda()
 
+scheduler15 = DDPMScheduler.from_pretrained('runwayml/stable-diffusion-v1-5', subfolder="scheduler")
+scheduler21 = DDPMScheduler.from_pretrained('stabilityai/stable-diffusion-2-1', subfolder="scheduler")
+
+
+
 # Freeze vae and text_encoder
-vae.requires_grad_(False)
-text_encoder.requires_grad_(False)
+vae15.requires_grad_(False)
+vae21.requires_grad_(False)
+text_encoder15.requires_grad_(False)
+text_encoder21.requires_grad_(False)
 
 if args.enable_xformers_memory_efficient_attention:
     if is_xformers_available():
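To orient readers of this hunk: the commit now keeps two complete sets of components in memory (the `*15` and `*21` globals above) and later picks one set per request based on a version string. A minimal sketch of that dispatch, using a dictionary registry instead of the if/elif chain that appears further down in app.py; the registry name and helper are illustrative, and only the version strings, the 512/768 sizes, and the batch clamp mirror the diff:

```python
# Hypothetical registry mirroring the commit's *15/*21 globals; the dict form is
# just an alternative to the if/elif dispatch used in app.py.
MODEL_REGISTRY = {
    "Stable Diffusion v1.5": {"suffix": "15", "size": 512, "max_batch": 4},
    "Stable Diffusion v2.1": {"suffix": "21", "size": 768, "max_batch": 2},
}

def pick_components(version: str, slider_batch: int):
    """Return (config, clamped batch size) for the requested backbone version."""
    if version not in MODEL_REGISTRY:
        raise ValueError(f"Version Not Found: {version}")
    cfg = MODEL_REGISTRY[version]
    # SD v2.1 runs at 768x768, so the batch is clamped to avoid OOM (as in the diff).
    return cfg, min(slider_batch, cfg["max_batch"])
```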
@@ -421,7 +437,6 @@ if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
 
 
 # setup schedulers
-scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
 # sample_num = args.vis_num
 
 def to_tensor(image):
@@ -461,7 +476,25 @@ def has_chinese_char(string):
 
 image_404 = Image.open('404.jpg')
 
-def text_to_image(prompt,slider_step,slider_guidance,slider_batch):
+def text_to_image(prompt,slider_step,slider_guidance,slider_batch, version):
+    print(f'【version】{version}')
+    if version == 'Stable Diffusion v2.1':
+        vae = vae21
+        unet = unet21
+        text_encoder = text_encoder21
+        tokenizer = tokenizer21
+        scheduler = scheduler21
+        slider_batch = min(slider_batch, 2)
+        size = 768
+    elif version == 'Stable Diffusion v1.5':
+        vae = vae15
+        unet = unet15
+        text_encoder = text_encoder15
+        tokenizer = tokenizer15
+        scheduler = scheduler15
+        size = 512
+    else:
+        assert False, 'Version Not Found'
 
     if has_chinese_char(prompt):
         print('trigger')
@@ -484,7 +517,7 @@ def text_to_image(prompt,slider_step,slider_guidance,slider_batch):
     set_seed(seed)
     scheduler.set_timesteps(slider_step)
 
-    noise = torch.randn((sample_num, 4, 64, 64)).to("cuda") # (b, 4, 64, 64)
+    noise = torch.randn((sample_num, 4, size//8, size//8)).to("cuda") # (b, 4, 64, 64)
     input = noise # (b, 4, 64, 64)
 
     captions = [args.prompt] * sample_num
@@ -504,25 +537,18 @@ def text_to_image(prompt,slider_step,slider_guidance,slider_batch):
     encoder_hidden_states_nocond = text_encoder(inputs_nocond)[0].cuda() # (b, 77, 768)
     print(f'{colored("[√]", "green")} encoder_hidden_states_nocond: {encoder_hidden_states_nocond.shape}.')
 
-    # load character-level segmenter
-    segmenter = UNet(3, 96, True).cuda()
-    segmenter = torch.nn.DataParallel(segmenter)
-    segmenter.load_state_dict(torch.load(args.character_segmenter_path))
-    segmenter.eval()
-    print(f'{colored("[√]", "green")} Text segmenter is successfully loaded.')
-
     #### text-to-image ####
     render_image, segmentation_mask_from_pillow = get_layout_from_prompt(args)
 
     segmentation_mask = torch.Tensor(np.array(segmentation_mask_from_pillow)).cuda() # (512, 512)
 
     segmentation_mask = filter_segmentation_mask(segmentation_mask)
-    segmentation_mask = torch.nn.functional.interpolate(segmentation_mask.unsqueeze(0).unsqueeze(0).float(), size=(256, 256), mode='nearest')
+    segmentation_mask = torch.nn.functional.interpolate(segmentation_mask.unsqueeze(0).unsqueeze(0).float(), size=(size//2, size//2), mode='nearest')
     segmentation_mask = segmentation_mask.squeeze(1).repeat(sample_num, 1, 1).long().to('cuda') # (1, 1, 256, 256)
     print(f'{colored("[√]", "green")} character-level segmentation_mask: {segmentation_mask.shape}.')
 
-    feature_mask = torch.ones(sample_num, 1, 64, 64).to('cuda') # (b, 1, 64, 64)
-    masked_image = torch.zeros(sample_num, 3, 512, 512).to('cuda') # (b, 3, 512, 512)
+    feature_mask = torch.ones(sample_num, 1, size//8, size//8).to('cuda') # (b, 1, 64, 64)
+    masked_image = torch.zeros(sample_num, 3, size, size).to('cuda') # (b, 3, 512, 512)
     masked_feature = vae.encode(masked_image).latent_dist.sample() # (b, 4, 64, 64)
     masked_feature = masked_feature * vae.config.scaling_factor
     print(f'{colored("[√]", "green")} feature_mask: {feature_mask.shape}.')
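The repeated size//8 and size//2 arithmetic in this and the following hunks is the whole point of the new `size` variable: the Stable Diffusion VAE downsamples images by a factor of 8, so latents and `feature_mask` live at size//8, while the character-level segmentation mask is kept at half the image resolution. A small, self-contained sketch of the resulting shapes (assumes only torch; the helper name is illustrative):

```python
import torch
import torch.nn.functional as F

def make_buffers(sample_num: int, size: int):
    # Latent-space tensors: the SD VAE downsamples 8x, so 512 -> 64 and 768 -> 96.
    noise = torch.randn(sample_num, 4, size // 8, size // 8)
    feature_mask = torch.ones(sample_num, 1, size // 8, size // 8)
    # The character-level segmentation mask is kept at half the image resolution.
    seg = torch.zeros(size, size)
    seg = F.interpolate(seg[None, None].float(), size=(size // 2, size // 2), mode="nearest")
    return noise.shape, feature_mask.shape, seg.shape

print(make_buffers(4, 512))  # latents at 64x64, mask at 256x256
print(make_buffers(2, 768))  # latents at 96x96, mask at 384x384
```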
@@ -543,10 +569,11 @@ def text_to_image(prompt,slider_step,slider_guidance,slider_batch):
     input = 1 / vae.config.scaling_factor * input
     sample_images = vae.decode(input.float(), return_dict=False)[0] # (b, 3, 512, 512)
 
-    image_pil = render_image.resize((512,512))
+    image_pil = render_image.resize((size,size))
     segmentation_mask = segmentation_mask[0].squeeze().cpu().numpy()
-    character_mask_pil = Image.fromarray(((segmentation_mask!=0)*255).astype('uint8')).resize((512,512))
+    character_mask_pil = Image.fromarray(((segmentation_mask!=0)*255).astype('uint8')).resize((size,size))
     character_mask_highlight_pil = segmentation_mask_visualization(args.font_path,segmentation_mask)
+    character_mask_highlight_pil = character_mask_highlight_pil.resize((size, size))
     caption_pil = make_caption_pil(args.font_path, captions)
 
     # save pred_img
@@ -557,12 +584,12 @@ def text_to_image(prompt,slider_step,slider_guidance,slider_batch):
         image = Image.fromarray((image * 255).round().astype("uint8")).convert('RGB')
         pred_image_list.append(image)
 
-    blank_pil = combine_image(args, None, pred_image_list, image_pil, character_mask_pil, character_mask_highlight_pil, caption_pil)
+    blank_pil = combine_image(args, size, None, pred_image_list, image_pil, character_mask_pil, character_mask_highlight_pil, caption_pil)
 
-    intermediate_result = Image.new('RGB', (512*3, 512))
+    intermediate_result = Image.new('RGB', (size*3, size))
     intermediate_result.paste(image_pil, (0, 0))
-    intermediate_result.paste(character_mask_pil, (512, 0))
-    intermediate_result.paste(character_mask_highlight_pil, (512*2, 0))
+    intermediate_result.paste(character_mask_pil, (size, 0))
+    intermediate_result.paste(character_mask_highlight_pil, (size*2, 0))
 
     return blank_pil, intermediate_result
 
@@ -577,7 +604,25 @@ print(f'{colored("[√]", "green")} Text segmenter is successfully loaded.')
 
 
 
-def text_to_image_with_template(prompt,template_image,slider_step,slider_guidance,slider_batch, binary):
+def text_to_image_with_template(prompt,template_image,slider_step,slider_guidance,slider_batch, binary, version):
+
+    if version == 'Stable Diffusion v2.1':
+        vae = vae21
+        unet = unet21
+        text_encoder = text_encoder21
+        tokenizer = tokenizer21
+        scheduler = scheduler21
+        slider_batch = min(slider_batch, 2)
+        size = 768
+    elif version == 'Stable Diffusion v1.5':
+        vae = vae15
+        unet = unet15
+        text_encoder = text_encoder15
+        tokenizer = tokenizer15
+        scheduler = scheduler15
+        size = 512
+    else:
+        assert False, 'Version Not Found'
 
     if has_chinese_char(prompt):
         print('trigger')
@@ -586,7 +631,7 @@ def text_to_image_with_template(prompt,template_image,slider_step,slider_guidance,slider_batch, binary):
     if slider_step>=50:
         slider_step = 50
 
-    orig_template_image = template_image.resize((512,512)).convert('RGB')
+    orig_template_image = template_image.resize((size,size)).convert('RGB')
     args.prompt = prompt
     sample_num = slider_batch
     # If passed along, set the training seed now.
@@ -595,7 +640,7 @@ def text_to_image_with_template(prompt,template_image,slider_step,slider_guidance,slider_batch, binary):
     set_seed(seed)
     scheduler.set_timesteps(slider_step)
 
-    noise = torch.randn((sample_num, 4, 64, 64)).to("cuda") # (b, 4, 64, 64)
+    noise = torch.randn((sample_num, 4, size//8, size//8)).to("cuda") # (b, 4, 64, 64)
     input = noise # (b, 4, 64, 64)
 
     captions = [args.prompt] * sample_num
@@ -634,12 +679,12 @@ def text_to_image_with_template(prompt,template_image,slider_step,slider_guidance,slider_batch, binary):
     segmentation_mask = segmentation_mask.max(1)[1].squeeze(0) # (256, 256)
     segmentation_mask = filter_segmentation_mask(segmentation_mask) # (256, 256)
 
-    segmentation_mask = torch.nn.functional.interpolate(segmentation_mask.unsqueeze(0).unsqueeze(0).float(), size=(256, 256), mode='nearest') # (b, 1, 256, 256)
+    segmentation_mask = torch.nn.functional.interpolate(segmentation_mask.unsqueeze(0).unsqueeze(0).float(), size=(size//2, size//2), mode='nearest') # (b, 1, 256, 256)
     segmentation_mask = segmentation_mask.squeeze(1).repeat(sample_num, 1, 1).long().to('cuda') # (b, 1, 256, 256)
     print(f'{colored("[√]", "green")} Character-level segmentation_mask: {segmentation_mask.shape}.')
 
-    feature_mask = torch.ones(sample_num, 1, 64, 64).to('cuda') # (b, 1, 64, 64)
-    masked_image = torch.zeros(sample_num, 3, 512, 512).to('cuda') # (b, 3, 512, 512)
+    feature_mask = torch.ones(sample_num, 1, size//8, size//8).to('cuda') # (b, 1, 64, 64)
+    masked_image = torch.zeros(sample_num, 3, size, size).to('cuda') # (b, 3, 512, 512)
     masked_feature = vae.encode(masked_image).latent_dist.sample() # (b, 4, 64, 64)
     masked_feature = masked_feature * vae.config.scaling_factor # (b, 4, 64, 64)
 
@@ -660,8 +705,9 @@ def text_to_image_with_template(prompt,template_image,slider_step,slider_guidance,slider_batch, binary):
 
     image_pil = None
     segmentation_mask = segmentation_mask[0].squeeze().cpu().numpy()
-    character_mask_pil = Image.fromarray(((segmentation_mask!=0)*255).astype('uint8')).resize((512,512))
+    character_mask_pil = Image.fromarray(((segmentation_mask!=0)*255).astype('uint8')).resize((size,size))
     character_mask_highlight_pil = segmentation_mask_visualization(args.font_path,segmentation_mask)
+    character_mask_highlight_pil = character_mask_highlight_pil.resize((size, size))
     caption_pil = make_caption_pil(args.font_path, captions)
 
     # save pred_img
@@ -672,17 +718,35 @@ def text_to_image_with_template(prompt,template_image,slider_step,slider_guidance,slider_batch, binary):
         image = Image.fromarray((image * 255).round().astype("uint8")).convert('RGB')
         pred_image_list.append(image)
 
-    blank_pil = combine_image(args, None, pred_image_list, image_pil, character_mask_pil, character_mask_highlight_pil, caption_pil)
+    blank_pil = combine_image(args, size, None, pred_image_list, image_pil, character_mask_pil, character_mask_highlight_pil, caption_pil)
 
-    intermediate_result = Image.new('RGB', (512*3, 512))
+    intermediate_result = Image.new('RGB', (size*3, size))
     intermediate_result.paste(orig_template_image, (0, 0))
-    intermediate_result.paste(character_mask_pil, (512, 0))
-    intermediate_result.paste(character_mask_highlight_pil, (512*2, 0))
+    intermediate_result.paste(character_mask_pil, (size, 0))
+    intermediate_result.paste(character_mask_highlight_pil, (size*2, 0))
 
     return blank_pil, intermediate_result
 
 
-def text_inpainting(prompt,orig_image,mask_image,slider_step,slider_guidance,slider_batch):
+def text_inpainting(prompt,orig_image,mask_image,slider_step,slider_guidance,slider_batch, version):
+
+    if version == 'Stable Diffusion v2.1':
+        vae = vae21
+        unet = unet21
+        text_encoder = text_encoder21
+        tokenizer = tokenizer21
+        scheduler = scheduler21
+        slider_batch = min(slider_batch, 2)
+        size = 768
+    elif version == 'Stable Diffusion v1.5':
+        vae = vae15
+        unet = unet15
+        text_encoder = text_encoder15
+        tokenizer = tokenizer15
+        scheduler = scheduler15
+        size = 512
+    else:
+        assert False, 'Version Not Found'
 
     if has_chinese_char(prompt):
         print('trigger')
@@ -699,7 +763,7 @@ def text_inpainting(prompt,orig_image,mask_image,slider_step,slider_guidance,slider_batch):
     set_seed(seed)
     scheduler.set_timesteps(slider_step)
 
-    noise = torch.randn((sample_num, 4, 64, 64)).to("cuda") # (b, 4, 64, 64)
+    noise = torch.randn((sample_num, 4, size//8, size//8)).to("cuda") # (b, 4, 64, 64)
     input = noise # (b, 4, 64, 64)
 
     captions = [args.prompt] * sample_num
@@ -719,7 +783,7 @@ def text_inpainting(prompt,orig_image,mask_image,slider_step,slider_guidance,slider_batch):
     encoder_hidden_states_nocond = text_encoder(inputs_nocond)[0].cuda() # (b, 77, 768)
     print(f'{colored("[√]", "green")} encoder_hidden_states_nocond: {encoder_hidden_states_nocond.shape}.')
 
-    mask_image = cv2.resize(mask_image, (512,512))
+    mask_image = cv2.resize(mask_image, (size,size))
     # mask_image = mask_image.resize((512,512)).convert('RGB')
     text_mask = np.array(mask_image)
     threshold = 128
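The threshold = 128 context line above belongs to a simple binarization of the user-provided mask before it is fed to the segmenter. A standalone NumPy sketch of that step (the function name is illustrative, not from the repo):

```python
import numpy as np

def binarize_mask(mask_image: np.ndarray, threshold: int = 128) -> np.ndarray:
    """Collapse an RGB or grayscale mask to {0, 1} using a fixed threshold."""
    if mask_image.ndim == 3:          # RGB -> grayscale by channel mean
        mask_image = mask_image.mean(axis=-1)
    return (mask_image > threshold).astype(np.uint8)

demo = np.array([[0, 100, 200], [255, 30, 180]], dtype=np.uint8)
print(binarize_mask(demo))  # [[0 0 1] [1 0 1]]
```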
@@ -732,21 +796,21 @@ def text_inpainting(prompt,orig_image,mask_image,slider_step,slider_guidance,slider_batch):
 
     segmentation_mask = segmentation_mask.max(1)[1].squeeze(0)
     segmentation_mask = filter_segmentation_mask(segmentation_mask)
-    segmentation_mask = torch.nn.functional.interpolate(segmentation_mask.unsqueeze(0).unsqueeze(0).float(), size=(256, 256), mode='nearest')
+    segmentation_mask = torch.nn.functional.interpolate(segmentation_mask.unsqueeze(0).unsqueeze(0).float(), size=(size//2, size//2), mode='nearest')
 
-    image_mask = transform_mask_pil(mask_image)
+    image_mask = transform_mask_pil(mask_image, size)
     image_mask = torch.from_numpy(image_mask).cuda().unsqueeze(0).unsqueeze(0)
 
-    orig_image = orig_image.convert('RGB').resize((512,512))
+    orig_image = orig_image.convert('RGB').resize((size,size))
     image = orig_image
     image_tensor = to_tensor(image).unsqueeze(0).cuda().sub_(0.5).div_(0.5)
     masked_image = image_tensor * (1-image_mask)
     masked_feature = vae.encode(masked_image).latent_dist.sample().repeat(sample_num, 1, 1, 1)
     masked_feature = masked_feature * vae.config.scaling_factor
 
-    image_mask = torch.nn.functional.interpolate(image_mask, size=(256, 256), mode='nearest').repeat(sample_num, 1, 1, 1)
+    image_mask = torch.nn.functional.interpolate(image_mask, size=(size//2, size//2), mode='nearest').repeat(sample_num, 1, 1, 1)
     segmentation_mask = segmentation_mask * image_mask
-    feature_mask = torch.nn.functional.interpolate(image_mask, size=(64, 64), mode='nearest')
+    feature_mask = torch.nn.functional.interpolate(image_mask, size=(size//8, size//8), mode='nearest')
 
     # diffusion process
     intermediate_images = []
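The # diffusion process marker is where the sampling loop starts; the loop itself is unchanged by this commit, so it does not appear in the diff. For orientation, a generic classifier-free-guidance loop with a diffusers-style scheduler looks roughly like the sketch below; the TextDiffuser UNet additionally receives segmentation_mask, feature_mask, and masked_feature, which are folded into a placeholder callable here because the exact call is not part of this hunk:

```python
import torch

def denoise(latents, scheduler, unet_forward, cond, uncond, guidance_scale=7.5):
    """Generic CFG sampling loop; unet_forward(latents, t, states) is a stand-in
    for the TextDiffuser UNet call, whose extra mask arguments are omitted here."""
    for t in scheduler.timesteps:
        with torch.no_grad():
            noise_cond = unet_forward(latents, t, cond)
            noise_uncond = unet_forward(latents, t, uncond)
        # Classifier-free guidance: push the prediction toward the text condition.
        noise = noise_uncond + guidance_scale * (noise_cond - noise_uncond)
        latents = scheduler.step(noise, t, latents).prev_sample
    return latents
```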
@@ -767,6 +831,7 @@ def text_inpainting(prompt,orig_image,mask_image,slider_step,slider_guidance,slider_batch):
     segmentation_mask = segmentation_mask[0].squeeze().cpu().numpy()
     character_mask_pil = Image.fromarray(((segmentation_mask!=0)*255).astype('uint8')).resize((512,512))
     character_mask_highlight_pil = segmentation_mask_visualization(args.font_path,segmentation_mask)
+    character_mask_highlight_pil = character_mask_highlight_pil.resize((size, size))
     caption_pil = make_caption_pil(args.font_path, captions)
 
     # save pred_img
@@ -786,7 +851,7 @@ def text_inpainting(prompt,orig_image,mask_image,slider_step,slider_guidance,slider_batch):
     character_mask_highlight_pil.save('character_mask_highlight_pil.png')
 
 
-    blank_pil = combine_image(args, None, pred_image_list, image_pil, character_mask_pil, character_mask_highlight_pil, caption_pil)
+    blank_pil = combine_image(args, size, None, pred_image_list, image_pil, character_mask_pil, character_mask_highlight_pil, caption_pil)
 
 
     background = orig_image.resize((512, 512))
@@ -825,6 +890,11 @@ with gr.Blocks() as demo:
         We propose <b>TextDiffuser</b>, a flexible and controllable framework to generate images with visually appealing text that is coherent with backgrounds.
         Main features include: (a) <b><font color="#A52A2A">Text-to-Image</font></b>: The user provides a prompt and encloses the keywords with single quotes (e.g., a text image of ‘hello’). The model first determines the layout of the keywords and then draws the image based on the layout and prompt. (b) <b><font color="#A52A2A">Text-to-Image with Templates</font></b>: The user provides a prompt and a template image containing text, which can be a printed, handwritten, or scene text image. These template images can be used to determine the layout of the characters. (c) <b><font color="#A52A2A">Text Inpainting</font></b>: The user provides an image and specifies the region to be modified along with the desired text content. The model is able to modify the original text or add text to areas without text.
         </h2>
+        <h2 style="text-align: left; font-weight: 450; font-size: 1rem; margin-top: 0.5rem; margin-bottom: 0.5rem">
+        🔥 <b>News</b>: We further trained TextDiffuser based on <b>Stable Diffusion v2.1</b> pre-trained model, enlarging the resolution from 512x512 to <b>768x768</b> to enhance the legibility of small text. Additionally, we fine-tuned the model with images with <b>high aesthetical score</b>, enabling generating images with richer details.
+        </h2>
+
+
         <img src="file/images/huggingface_blank.jpg" alt="textdiffuser">
         </div>
         """)
@@ -833,9 +903,10 @@ with gr.Blocks() as demo:
         with gr.Row():
             with gr.Column(scale=1):
                 prompt = gr.Textbox(label="Input your prompt here. Please enclose keywords with 'single quotes', you may refer to the examples below. The current version only supports input in English characters.", placeholder="Placeholder 'Team' hat")
+                radio = gr.Radio(["Stable Diffusion v2.1", "Stable Diffusion v1.5"], label="Pre-trained Model", value="Stable Diffusion v2.1")
                 slider_step = gr.Slider(minimum=1, maximum=50, value=20, step=1, label="Sampling step", info="The sampling step for TextDiffuser.")
                 slider_guidance = gr.Slider(minimum=1, maximum=9, value=7.5, step=0.5, label="Scale of classifier-free guidance", info="The scale of classifier-free guidance and is set to 7.5 in default.")
-                slider_batch = gr.Slider(minimum=1, maximum=4, value=4, step=1, label="Batch size", info="The number of images to be sampled.")
+                slider_batch = gr.Slider(minimum=1, maximum=4, value=4, step=1, label="Batch size", info="The number of images to be sampled. Maximum number is set to 【2】 for SD v2.1 to avoid OOM.")
                 # slider_seed = gr.Slider(minimum=1, maximum=10000, label="Seed", randomize=True)
                 button = gr.Button("Generate")
 
@@ -851,7 +922,7 @@ with gr.Blocks() as demo:
                 [
                     ["Distinguished poster of 'SPIDERMAN'. Trending on ArtStation and Pixiv. A vibrant digital oil painting. A highly detailed fantasy character illustration by Wayne Reynolds and Charles Monet and Gustave Dore and Carl Critchlow and Bram Sels"],
                     ["A detailed portrait of a fox guardian with a shield with 'Kung Fu' written on it, by victo ngai and justin gerard, digital art, realistic painting, very detailed, fantasy, high definition, cinematic light, dnd, trending on artstation"],
-                    ["portrait of a 'dragon', concept art, sumi - e style, intricate linework, green smoke, artstation, trending, highly detailed, smooth, focus, art by yoji shinkawa,"],
+                    ["portrait of a 'dragon', concept art, sumi - e style, intricate linework, green smoke, artstation, trending, highly detailed, smooth, focus, art by yoji shinkawa,"],
                     ["elderly woman dressed in extremely colorful clothes with many strange patterns posing for a high fashion photoshoot of 'FASHION', haute couture, golden hour, artstation, by J. C. Leyendecker and Peter Paul Rubens"],
                     ["epic digital art of a luxury yacht named 'Time Machine' driving through very dark hard edged city towers from tron movie, faint tall mountains in background, wlop, pixiv"],
                     ["A poster of 'Adventurer'. A beautiful so tall boy with big eyes and small nose is in the jungle, he wears normal clothes and shows his full length, which we see from the front, unreal engine, cozy indoor lighting, artstation, detailed"],
@@ -876,16 +947,17 @@ with gr.Blocks() as demo:
                 examples_per_page=100
             )
 
-        button.click(text_to_image, inputs=[prompt,slider_step,slider_guidance,slider_batch], outputs=[output,intermediate_results])
+        button.click(text_to_image, inputs=[prompt,slider_step,slider_guidance,slider_batch,radio], outputs=[output,intermediate_results])
 
     with gr.Tab("Text-to-Image-with-Template"):
         with gr.Row():
             with gr.Column(scale=1):
                 prompt = gr.Textbox(label='Input your prompt here.')
                 template_image = gr.Image(label='Template image', type="pil")
+                radio = gr.Radio(["Stable Diffusion v2.1", "Stable Diffusion v1.5"], label="Pre-trained Model", value="Stable Diffusion v2.1")
                 slider_step = gr.Slider(minimum=1, maximum=50, value=20, step=1, label="Sampling step", info="The sampling step for TextDiffuser.")
                 slider_guidance = gr.Slider(minimum=1, maximum=9, value=7.5, step=0.5, label="Scale of classifier-free guidance", info="The scale of classifier-free guidance and is set to 7.5 in default.")
-                slider_batch = gr.Slider(minimum=1, maximum=4, value=4, step=1, label="Batch size", info="The number of images to be sampled.")
+                slider_batch = gr.Slider(minimum=1, maximum=4, value=4, step=1, label="Batch size", info="The number of images to be sampled. Maximum number is set to 【2】 for SD v2.1 to avoid OOM.")
                 # binary = gr.Radio(["park", "zoo", "road"], label="Location", info="Where did they go?")
                 binary = gr.Checkbox(label="Binarization", bool=True, info="Whether to binarize the template image? You may need it when using handwritten images as templates.")
                 button = gr.Button("Generate")
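Every UI change in this and the remaining hunks follows the same pattern: a gr.Radio is added to the tab and appended to the inputs list of button.click, so the selected backbone string reaches the handler as its last positional argument. A minimal, self-contained Gradio sketch of that wiring (labels and the handler are illustrative):

```python
import gradio as gr

def handler(prompt, batch, version):
    # `version` arrives as the Radio's selected string, exactly like in app.py.
    return f"{version}: would sample {batch} image(s) for {prompt!r}"

with gr.Blocks() as demo:
    prompt = gr.Textbox(label="Prompt")
    radio = gr.Radio(["Stable Diffusion v2.1", "Stable Diffusion v1.5"],
                     label="Pre-trained Model", value="Stable Diffusion v2.1")
    batch = gr.Slider(minimum=1, maximum=4, value=4, step=1, label="Batch size")
    button = gr.Button("Generate")
    output = gr.Textbox(label="Output")
    button.click(handler, inputs=[prompt, batch, radio], outputs=[output])

# demo.launch()  # uncomment to run locally
```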
@@ -923,7 +995,7 @@ with gr.Blocks() as demo:
                 examples_per_page=100
             )
 
-        button.click(text_to_image_with_template, inputs=[prompt,template_image,slider_step,slider_guidance,slider_batch,binary], outputs=[output,intermediate_results])
+        button.click(text_to_image_with_template, inputs=[prompt,template_image,slider_step,slider_guidance,slider_batch,binary,radio], outputs=[output,intermediate_results])
 
     with gr.Tab("Text-Inpainting"):
         with gr.Row():
@@ -932,9 +1004,10 @@ with gr.Blocks() as demo:
                 with gr.Row():
                     orig_image = gr.Image(label='Original image', type="pil")
                     mask_image = gr.Image(label='Mask image', type="numpy")
+                radio = gr.Radio(["Stable Diffusion v2.1", "Stable Diffusion v1.5"], label="Pre-trained Model", value="Stable Diffusion v2.1")
                 slider_step = gr.Slider(minimum=1, maximum=50, value=20, step=1, label="Sampling step", info="The sampling step for TextDiffuser.")
                 slider_guidance = gr.Slider(minimum=1, maximum=9, value=7.5, step=0.5, label="Scale of classifier-free guidance", info="The scale of classifier-free guidance and is set to 7.5 in default.")
-                slider_batch = gr.Slider(minimum=1, maximum=4, value=4, step=1, label="Batch size", info="The number of images to be sampled.")
+                slider_batch = gr.Slider(minimum=1, maximum=4, value=4, step=1, label="Batch size", info="The number of images to be sampled. Maximum number is set to 【2】 for SD v2.1 to avoid OOM.")
                 button = gr.Button("Generate")
             with gr.Column(scale=1):
                 output = gr.Image(label='Generated image')
@@ -969,7 +1042,7 @@ with gr.Blocks() as demo:
             )
 
 
-        button.click(text_inpainting, inputs=[prompt,orig_image,mask_image,slider_step,slider_guidance,slider_batch], outputs=[output, intermediate_results])
+        button.click(text_inpainting, inputs=[prompt,orig_image,mask_image,slider_step,slider_guidance,slider_batch,radio], outputs=[output, intermediate_results])
 
 
 
 