import os
import subprocess

import gradio as gr
import spaces
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM

# Install flash-attn at runtime; FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE uses the prebuilt
# wheel instead of compiling CUDA kernels. The existing environment (PATH etc.) is kept.
subprocess.run("pip install flash-attn --no-build-isolation", env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"}, shell=True)
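
# Load the ScreenQA fine-tune and its processor once at startup; the model is kept on
# the GPU in eval mode and looked up by repo id at inference time.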
models = {
    "maxiw/Florence-2-ScreenQA-base": AutoModelForCausalLM.from_pretrained(
        "maxiw/Florence-2-ScreenQA-base", trust_remote_code=True
    ).to("cuda").eval(),
}

processors = {
    "maxiw/Florence-2-ScreenQA-base": AutoProcessor.from_pretrained(
        "maxiw/Florence-2-ScreenQA-base", trust_remote_code=True
    ),
}
DESCRIPTION = "# [Florence-2-ScreenQA Demo](https://huggingface.co/maxiw/Florence-2-ScreenQA-base)"
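
# @spaces.GPU requests a ZeroGPU GPU for the duration of each call to run_example.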
@spaces.GPU
def run_example(task_prompt, image, text_input=None, model_id="maxiw/Florence-2-ScreenQA-base"):
    model = models[model_id]
    processor = processors[model_id]
    # Florence-2 prompts are a task token optionally followed by free text (here: the question).
    if text_input is None:
        prompt = task_prompt
    else:
        prompt = task_prompt + text_input
    inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda")
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        early_stopping=False,
        do_sample=False,
        num_beams=3,
    )
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = processor.post_process_generation(
        generated_text,
        task=task_prompt,
        image_size=(image.width, image.height),
    )
    # post_process_generation returns a dict keyed by the task token; unwrap the ScreenQA answer.
    if "<SQA>" in parsed_answer:
        parsed_answer = parsed_answer["<SQA>"]
    return parsed_answer
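
# Gradio callback: convert the uploaded image, map the UI task name to the Florence-2
# task token, and run inference.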
def process_image(image, task_prompt, text_input=None, model_id="maxiw/Florence-2-ScreenQA-base"):
    image = Image.fromarray(image)  # convert the NumPy array from gr.Image to a PIL Image
    if task_prompt == "ScreenQA":
        task_prompt = "<SQA>"
        results = run_example(task_prompt, image, text_input, model_id=model_id)
        return results
    else:
        print("Unknown task prompt")
        return ""  # the UI has a single text output, so return an empty string for unknown tasks
css = """
#output {
height: 500px;
overflow: auto;
border: 1px solid #ccc;
}
"""
single_task_list = [
    "ScreenQA",
]
with gr.Blocks(css=css) as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Tab(label="Florence-2 Input"):
        with gr.Row():
            with gr.Column():
                input_img = gr.Image(label="Input Picture")
                model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="maxiw/Florence-2-ScreenQA-base")
                task_prompt = gr.Dropdown(choices=single_task_list, label="Task Prompt", value="ScreenQA")
                text_input = gr.Textbox(label="Question")
                submit_btn = gr.Button(value="Submit")
            with gr.Column():
                output_text = gr.Textbox(label="Output Text")

        gr.Examples(
            examples=[
                ["image1.jpg", "ScreenQA", "What is the version of the settings?"],
                ["image1.jpg", "ScreenQA", "What is the state of use lower resolution images?"],
                ["image2.jpg", "ScreenQA", "How much is the discount for the product?"],
            ],
            inputs=[input_img, task_prompt, text_input],
            outputs=[output_text],
            fn=process_image,
            cache_examples=True,
            label="Try examples",
        )

    submit_btn.click(process_image, [input_img, task_prompt, text_input, model_selector], [output_text])
demo.launch(debug=True)