import gradio as gr
import torch
import torchvision
from pytorchvideo.transforms import Normalize, UniformTemporalSubsample
from torchvision.transforms import Compose, Lambda, Resize
from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification


def load_video(video_path):
    """Read a video file into a (num_channels, num_frames, height, width) tensor."""
    # read_video returns frames as (num_frames, height, width, num_channels) uint8.
    frames, _, _ = torchvision.io.read_video(video_path, pts_unit="sec")
    return frames.permute(3, 0, 1, 2)


def preprocess_video(video, image_processor, model_config):
    """Subsample, scale, normalize, and resize a (C, T, H, W) video tensor."""
    mean = image_processor.image_mean
    std = image_processor.image_std
    if "shortest_edge" in image_processor.size:
        height = width = image_processor.size["shortest_edge"]
    else:
        height = image_processor.size["height"]
        width = image_processor.size["width"]
    resize_to = (height, width)

    num_frames_to_sample = model_config.num_frames
    transform = Compose(
        [
            UniformTemporalSubsample(num_frames_to_sample),
            Lambda(lambda x: x / 255.0),
            Normalize(mean, std),
            Resize(resize_to),
        ]
    )
    return transform(video)


def run_inference(model, image_processor, video_path):
    """Utility to run inference given a model and a path to a test video."""
    video = load_video(video_path)
    preprocessed_video = preprocess_video(video, image_processor, model.config)
    # The model expects (batch, num_frames, num_channels, height, width),
    # so permute (C, T, H, W) -> (T, C, H, W) and add a batch dimension.
    inputs = {"pixel_values": preprocessed_video.permute(1, 0, 2, 3).unsqueeze(0)}

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    model = model.to(device)

    # Forward pass.
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predicted_class = logits.argmax(dim=-1).item()
    return model.config.id2label[predicted_class]


model_name = "latif98/videomae-base-finetuned-isl-numbers_aug"
image_processor = VideoMAEImageProcessor.from_pretrained(model_name)
model = VideoMAEForVideoClassification.from_pretrained(model_name)


def classify_video(video_path):
    # Gradio components can only map to function arguments, so the model and
    # image processor are captured from module scope rather than passed as inputs.
    return run_inference(model, image_processor, video_path)


demo = gr.Interface(
    fn=classify_video,
    inputs=gr.Video(),
    outputs="text",
    title="VideoMAE fine-tuned on numbers, alphabets and nouns videos.",
    description="Gradio demo app of fine-tuned VideoMAE for video classification. To use it, simply upload your video.",
    article="VideoMAE",
)

if __name__ == "__main__":
    demo.launch()
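
# Optional local smoke test (a sketch, not part of the original demo): call
# run_inference directly to check the model outside the Gradio UI. The path
# "sample.mp4" is a hypothetical placeholder for any short local clip.
#
#     print(run_inference(model, image_processor, "sample.mp4"))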