import gradio as gr
import torch
from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification
from torchvision.io import read_video
from pytorchvideo.transforms import Normalize, UniformTemporalSubsample
from torchvision.transforms import Compose, Lambda, Resize
def preprocess_video(video, image_processor, model_config):
    """Resample, scale, normalize, and resize a raw video tensor.

    Expects `video` with shape (num_channels, num_frames, height, width),
    the layout the pytorchvideo transforms operate on.
    """
    mean = image_processor.image_mean
    std = image_processor.image_std

    # The processor config stores either a single shortest-edge size or an
    # explicit height/width pair; resolve both to a (height, width) target.
    if "shortest_edge" in image_processor.size:
        height = width = image_processor.size["shortest_edge"]
    else:
        height = image_processor.size["height"]
        width = image_processor.size["width"]
    resize_to = (height, width)

    # The model was trained on a fixed clip length (model_config.num_frames).
    num_frames_to_sample = model_config.num_frames
    transform = Compose(
        [
            UniformTemporalSubsample(num_frames_to_sample),  # evenly spaced frames
            Lambda(lambda x: x / 255.0),                     # uint8 -> float in [0, 1]
            Normalize(mean, std),                            # per-channel normalization
            Resize(resize_to),                               # spatial resize on (H, W)
        ]
    )
    return transform(video)
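# Shape sketch (hypothetical sizes): a (3, 64, 360, 640) uint8 clip comes out
# as a float tensor of shape (3, num_frames_to_sample, height, width),
# e.g. (3, 16, 224, 224) for the default VideoMAE configuration.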
def run_inference(model, image_processor, video):
    """Run a single forward pass and return the predicted class label."""
    preprocessed_video = preprocess_video(video, image_processor, model.config)

    # VideoMAE expects pixel_values of shape
    # (batch, num_frames, num_channels, height, width), while the transforms
    # return (num_channels, num_frames, height, width): permute, then batch.
    pixel_values = preprocessed_video.permute(1, 0, 2, 3).unsqueeze(0)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    with torch.no_grad():
        outputs = model(pixel_values=pixel_values.to(device))
    predicted_class = outputs.logits.argmax(dim=-1).item()
    return model.config.id2label[predicted_class]
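# Example (a sketch, assuming `model` and `image_processor` are loaded as
# below; not executed at import time):
#   frames, _, _ = read_video("sample.mp4", pts_unit="sec")   # (T, H, W, C)
#   label = run_inference(model, image_processor, frames.permute(3, 0, 1, 2))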
model_name = "latif98/videomae-base-finetuned-isl-numbers_aug"
image_processor = VideoMAEImageProcessor.from_pretrained(model_name)
model = VideoMAEForVideoClassification.from_pretrained(model_name)
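# Gradio passes the uploaded video as a file path, not a tensor, so a small
# wrapper decodes it first. This is a minimal sketch that assumes
# torchvision.io.read_video can decode the uploaded container/codec.
def classify_video(video_path):
    frames, _, _ = read_video(video_path, pts_unit="sec")  # (T, H, W, C) uint8
    video = frames.permute(3, 0, 1, 2)  # -> (C, T, H, W) for the transforms
    return run_inference(model, image_processor, video)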
demo = gr.Interface(
    fn=classify_video,
    inputs=gr.Video(),
    outputs="text",
    title="VideoMAE fine-tuned on numbers, alphabets and nouns videos.",
    description="Gradio demo app of a fine-tuned VideoMAE for video classification. To use it, simply upload your video.",
    article="VideoMAE",
)
if __name__ == "__main__":
demo.launch()