"""Gradio app that describes an uploaded image using the Molmo 7B model."""

from io import BytesIO
from typing import Optional

import gradio as gr
import requests
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig

# Hugging Face repository holding the 4-bit quantized Molmo checkpoint.
repo_name = "cyan2k/molmo-7B-O-bnb-4bit"

# Prompt sent to the model alongside every uploaded image.
DESCRIBE_PROMPT = "Describe this image in great detail without missing any piece of information"

# Upper bound on the number of tokens generated per description.
MAX_NEW_TOKENS = 1024

# Keyword arguments shared by the processor and the model loaders.
arguments = {
    # "auto" delegates device placement to the loader; it does NOT force CPU.
    "device_map": "auto",
    # "auto" defers the dtype choice to the checkpoint; it does NOT force float32.
    "torch_dtype": "auto",
    # NOTE(security): this executes Python code shipped in the model repository.
    # Only keep it enabled for repositories you trust.
    "trust_remote_code": True,
}

# Load the processor and model once at import time so requests can reuse them.
processor = AutoProcessor.from_pretrained(repo_name, **arguments)
model = AutoModelForCausalLM.from_pretrained(repo_name, **arguments)


def describe_image(image: Optional[Image.Image]) -> str:
    """Generate a detailed text description of an image.

    Args:
        image: The uploaded picture as a PIL image, or ``None`` when the user
            submitted the form without choosing a file.

    Returns:
        The decoded text generated by the model for ``DESCRIBE_PROMPT``.

    Raises:
        gr.Error: If no image was supplied.
    """
    if image is None:
        # Surface a readable message in the UI instead of an opaque failure
        # from inside the processor.
        raise gr.Error("Please upload an image before requesting a description.")

    # Normalize RGBA / palette / grayscale uploads to RGB before processing;
    # presumably the processor expects three-channel input (see model card).
    if image.mode != "RGB":
        image = image.convert("RGB")

    # Tokenize the prompt and preprocess the image.
    inputs = processor.process(images=[image], text=DESCRIBE_PROMPT)

    # Move every tensor to the model's device and add a leading batch axis.
    inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}

    # Generate until the end-of-text marker or the token budget is reached.
    output = model.generate_from_batch(
        inputs,
        GenerationConfig(max_new_tokens=MAX_NEW_TOKENS, stop_strings="<|endoftext|>"),
        tokenizer=processor.tokenizer,
    )

    # Skip the prompt tokens so only the newly generated ones are decoded.
    prompt_length = inputs["input_ids"].size(1)
    generated_tokens = output[0, prompt_length:]
    return processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)


def gradio_app():
    """Build the Gradio interface around ``describe_image`` and launch it."""
    image_input = gr.Image(type="pil", label="Upload Image")
    output_text = gr.Textbox(label="Image Description", interactive=False)

    interface = gr.Interface(
        fn=describe_image,
        inputs=image_input,
        outputs=output_text,
        title="Image Description App",
        description="Upload an image and get a detailed description using the Molmo 7B model",
    )

    # Launch the interface
    interface.launch()


# Launch the Gradio app only when run as a script, not when imported.
if __name__ == "__main__":
    gradio_app()