import os
import subprocess

import gradio as gr
import spaces
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM

# Install flash-attn at runtime; FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE uses the prebuilt
# wheel instead of compiling CUDA kernels. The existing environment (PATH etc.) is kept.
subprocess.run("pip install flash-attn --no-build-isolation", env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"}, shell=True)
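
# Load the ScreenQA fine-tune and its processor once at startup; the model is kept on
# the GPU in eval mode and looked up by repo id at inference time.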
models = {
    "maxiw/Florence-2-ScreenQA-base": AutoModelForCausalLM.from_pretrained(
        "maxiw/Florence-2-ScreenQA-base", trust_remote_code=True
    ).to("cuda").eval(),
}

processors = {
    "maxiw/Florence-2-ScreenQA-base": AutoProcessor.from_pretrained(
        "maxiw/Florence-2-ScreenQA-base", trust_remote_code=True
    ),
}
DESCRIPTION = "# [Florence-2-ScreenQA Demo](https://huggingface.co/maxiw/Florence-2-ScreenQA-base)"
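
# @spaces.GPU requests a ZeroGPU GPU for the duration of each call to run_example.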
@spaces.GPU
def run_example(task_prompt, image, text_input=None, model_id="maxiw/Florence-2-ScreenQA-base"):
    model = models[model_id]
    processor = processors[model_id]
    # Florence-2 prompts are a task token optionally followed by free text (here: the question).
    if text_input is None:
        prompt = task_prompt
    else:
        prompt = task_prompt + text_input
    inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda")
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        early_stopping=False,
        do_sample=False,
        num_beams=3,
    )
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = processor.post_process_generation(
        generated_text,
        task=task_prompt,
        image_size=(image.width, image.height),
    )
    # post_process_generation returns a dict keyed by the task token; unwrap the ScreenQA answer.
    if "<SQA>" in parsed_answer:
        parsed_answer = parsed_answer["<SQA>"]
    return parsed_answer
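
# Gradio callback: convert the uploaded image, map the UI task name to the Florence-2
# task token, and run inference.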
def process_image(image, task_prompt, text_input=None, model_id="maxiw/Florence-2-ScreenQA-base"):
    image = Image.fromarray(image)  # convert the NumPy array from gr.Image to a PIL Image
    if task_prompt == "ScreenQA":
        task_prompt = "<SQA>"
        results = run_example(task_prompt, image, text_input, model_id=model_id)
        return results
    else:
        print("Unknown task prompt")
        return ""  # the UI has a single text output, so return an empty string for unknown tasks
css = """
#output {
height: 500px;
overflow: auto;
border: 1px solid #ccc;
}
"""
single_task_list = [
    "ScreenQA",
]
with gr.Blocks(css=css) as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Tab(label="Florence-2 Input"):
        with gr.Row():
            with gr.Column():
                input_img = gr.Image(label="Input Picture")
                model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="maxiw/Florence-2-ScreenQA-base")
                task_prompt = gr.Dropdown(choices=single_task_list, label="Task Prompt", value="ScreenQA")
                text_input = gr.Textbox(label="Question")
                submit_btn = gr.Button(value="Submit")
            with gr.Column():
                output_text = gr.Textbox(label="Output Text")

        gr.Examples(
            examples=[
                ["image1.jpg", "ScreenQA", "What is the version of the settings?"],
                ["image1.jpg", "ScreenQA", "What is the state of use lower resolution images?"],
                ["image2.jpg", "ScreenQA", "How much is the discount for the product?"],
            ],
            inputs=[input_img, task_prompt, text_input],
            outputs=[output_text],
            fn=process_image,
            cache_examples=True,
            label="Try examples",
        )

    submit_btn.click(process_image, [input_img, task_prompt, text_input, model_selector], [output_text])
demo.launch(debug=True)