"""Gradio demo app for OctoBERT.

Loads the ``flickr_base_model`` entry point from the ``Jiayi-Pan/RefCloze_Pub``
torch.hub repository, then serves a web UI that takes an image and a caption
and returns a visualisation together with the predicted words.
"""

import torch
import PIL
import gradio as gr

title = "OctoBERT"
description = """Interactive Demo for OctoBERT. This base model is trained only on Flickr-30k."""

# Each example row is [image file, caption], matching the two UI inputs.
# NOTE(review): several captions contain gaps (e.g. "in a ."), which looks
# like a placeholder token was stripped from the text -- confirm against the
# intended example captions before relying on them.
examples = [
    ["swing.jpg", "The woman stands outdoors, next to a child in a ."],
    [
        "tennis.jpg",
        "A woman in blue shorts and white shirt holds a tennis racket on a blue court.",
    ],
    [
        "birthday.jpg",
        "The smiling is celebrating her party with friends, surrounded by balloons and a with candles.",
    ],
    [
        "skate.jpg",
        "A person in a rainbow colored snowsuit is snowboarding down a slope.",
    ],
    [
        "street.jpg",
        "A man with plays with a little girl while walking down the street, while an Asian woman walks ahead of them.",
    ],
    ["dog.jpg", "A black dog stands on a , green fields behind him."],
]

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The hub entry point returns the model together with its helper callables.
# force_reload=True makes torch.hub fetch a fresh copy of the repository
# instead of reusing a cached one.
# Alternative when running from a local checkout of the repository:
#   torch.hub.load('.', 'flickr_base_model', source='local')
model, img_transform, tokenizer, post_processor, plot_results = torch.hub.load(
    "Jiayi-Pan/RefCloze_Pub",
    "flickr_base_model",
    force_reload=True,
)
model = model.to(device)


def plot_inference(img, caption):
    """Run the model on one image/caption pair and render the result.

    Args:
        img: Input image; the UI below supplies it as a PIL image.
        caption: Caption text entered by the user.

    Returns:
        A ``(visualisation, predicted_words)`` tuple: the value returned by
        ``plot_results`` (requested via ``save_path="numpy_array"``,
        presumably an image array -- confirm) and the ``'cap'`` entry of the
        post-processed outputs.
    """
    # Preprocess the image, move it to the target device, then add a
    # leading batch dimension.
    image_on_device = img_transform(img).to(device)
    image_batch = image_on_device.unsqueeze(0)
    caption_tokens = tokenizer(caption, return_tensors="pt").to(device)

    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        raw_outputs = model(image_batch, caption_tokens, one_pass=True)

    results = post_processor(raw_outputs, img, tokenizer)
    rendered = plot_results(img, results, save_path="numpy_array")
    return rendered, results['cap']


# UI components: image + caption in, rendered image + predicted words out.
input_components = [
    gr.inputs.Image(type="pil", label="Input"),
    gr.inputs.Textbox(label="input text"),
]
output_components = [
    gr.outputs.Image(type="numpy", label="Output"),
    gr.outputs.Textbox(label="Predicted Words"),
]

demo = gr.Interface(
    fn=plot_inference,
    inputs=input_components,
    outputs=output_components,
    title=title,
    description=description,
    examples=examples,
    cache_examples=True,
)
demo.launch()