|
|
|
import gradio as gr |
|
import pytorchvideo |
|
import torch |
|
import torchvision |
|
import numpy as np |
|
import accelerate |
|
import evaluate |
|
from transformers import TrainingArguments, Trainer |
|
from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification |
|
from torchvision.transforms import Compose |
|
from pytorchvideo.data.labeled_video_dataset import LabeledVideoDataset |
|
from pytorchvideo.transforms import ( |
|
ApplyTransformToKey, |
|
Normalize, |
|
RandomShortSideScale, |
|
RemoveKey, |
|
ShortSideScale, |
|
UniformTemporalSubsample, |
|
) |
|
|
|
from torchvision.transforms import ( |
|
Compose, |
|
Lambda, |
|
Resize, |
|
) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import gradio as gr |
|
import pytorchvideo |
|
import torch |
|
from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification |
|
|
|
def preprocess_video(video, image_processor, model_config):
    """Apply the inference-time transform chain to a raw video clip.

    The chain subsamples the clip to ``model_config.num_frames`` frames,
    divides the values by 255.0, normalizes with the processor's mean/std
    and finally resizes every frame to the processor's target size.

    Args:
        video: Clip to transform.
            NOTE(review): presumably a (C, T, H, W) tensor holding 0-255
            pixel values, as the pytorchvideo transforms used here
            operate on that layout -- confirm against the caller.
        image_processor: Object exposing ``image_mean``, ``image_std`` and
            a ``size`` mapping with either ``"shortest_edge"`` or both
            ``"height"`` and ``"width"``.
        model_config: Object exposing ``num_frames``.

    Returns:
        Whatever the composed transform produces for ``video``.
    """
    size_spec = image_processor.size
    if "shortest_edge" not in size_spec:
        target_hw = (size_spec["height"], size_spec["width"])
    else:
        # A single edge length means square output frames.
        edge = size_spec["shortest_edge"]
        target_hw = (edge, edge)

    pipeline = Compose(
        [
            UniformTemporalSubsample(model_config.num_frames),
            Lambda(lambda frames: frames / 255.0),
            Normalize(image_processor.image_mean, image_processor.image_std),
            Resize(target_hw),
        ]
    )
    return pipeline(video)
|
|
|
def run_inference(model, image_processor, video):
    """Classify a single video clip and return its predicted label.

    Args:
        model: Video classification model. It is called with a
            ``pixel_values`` keyword and must return an object with a
            ``logits`` attribute; its ``config`` supplies ``num_frames``
            (via ``preprocess_video``) and ``id2label``.
        image_processor: Processor whose mean/std/size settings drive
            ``preprocess_video``.
        video: Raw clip forwarded to ``preprocess_video``.
            NOTE(review): assumed to be a (C, T, H, W) tensor, the layout
            the pytorchvideo transforms work on -- confirm against caller.

    Returns:
        The ``model.config.id2label`` entry of the highest-scoring class.
    """
    clip = preprocess_video(video, image_processor, model.config)

    # The transforms leave the clip channel-first (C, T, H, W), whereas
    # VideoMAE expects (batch, num_frames, num_channels, height, width).
    # Swap the first two axes before adding the batch dimension; without
    # this the model sees the channel axis as the frame axis.
    pixel_values = clip.permute(1, 0, 2, 3).unsqueeze(0)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    pixel_values = pixel_values.to(device)
    model = model.to(device)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(pixel_values=pixel_values).logits

    predicted_class = logits.argmax(dim=-1).item()
    return model.config.id2label[predicted_class]
|
|
|
# Fine-tuned checkpoint served by this demo; processor and model are loaded
# once at import time and reused for every request.
model_name = "latif98/videomae-base-finetuned-isl-numbers_aug"
image_processor = VideoMAEImageProcessor.from_pretrained(model_name)
model = VideoMAEForVideoClassification.from_pretrained(model_name)


def classify_video(video_path):
    """Gradio callback: decode the uploaded file and return its label.

    Gradio can only feed UI component values into the callback, so the
    model and image processor are taken from module scope here instead of
    being listed as interface inputs.

    Args:
        video_path: Value supplied by the ``gr.Video`` input -- expected to
            be the path of the uploaded file, or empty when nothing was
            uploaded.

    Returns:
        The label string produced by ``run_inference``.

    Raises:
        gr.Error: If no video was provided or no frames could be decoded.
    """
    # Local import: the file's top-level imports do not bring
    # EncodedVideo into scope.
    from pytorchvideo.data.encoded_video import EncodedVideo

    if not video_path:
        raise gr.Error("Please upload a video first.")

    encoded = EncodedVideo.from_path(video_path, decode_audio=False)
    # Decode the whole clip; temporal subsampling down to the model's frame
    # count happens later inside preprocess_video.
    clip = encoded.get_clip(start_sec=0, end_sec=encoded.duration)
    frames = clip["video"]
    if frames is None:
        raise gr.Error("Could not decode any frames from the uploaded video.")

    return run_inference(model, image_processor, frames)


demo = gr.Interface(
    fn=classify_video,
    # Only the video is a user input; passing the model or processor objects
    # here is rejected by Gradio because they are not components.
    inputs=gr.Video(),
    outputs='text',
    title="VideoMAE fine-tuned on numbers, alphabets and nouns videos.",
    description="Gradio demo app of fine-tuned VideoMAE for video classification, To use it simply upload your video.",
    article="VideoMAE",
)


if __name__ == "__main__":
    demo.launch()
|
|
|
|
|
|