import gradio as gr
import torch
from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.transforms import Normalize, UniformTemporalSubsample
from torchvision.transforms import Compose, Lambda, Resize
from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification

def preprocess_video(video_path, image_processor, model_config):
    """Load a video from disk and apply the same transforms the model was trained with."""
    mean = image_processor.image_mean
    std = image_processor.image_std

    if "shortest_edge" in image_processor.size:
        height = width = image_processor.size["shortest_edge"]
    else:
        height = image_processor.size["height"]
        width = image_processor.size["width"]

    resize_to = (height, width)
    num_frames_to_sample = model_config.num_frames

    transform = Compose(
        [
            UniformTemporalSubsample(num_frames_to_sample),
            Lambda(lambda x: x / 255.0),
            Normalize(mean, std),
            Resize(resize_to),
        ]
    )

    # Decode the clip and pull out its frame tensor; the EncodedVideo object
    # itself cannot be passed to the transform pipeline.
    video = EncodedVideo.from_path(video_path)
    clip = video.get_clip(start_sec=0, end_sec=video.duration)
    video_tensor = transform(clip["video"])

    return video_tensor


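# run_inference expects the (num_channels, num_frames, height, width) tensor
# produced by preprocess_video; for VideoMAE-base that is typically
# (3, 16, 224, 224), i.e. 16 frames at 224x224.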
def run_inference(model, video):
    """Utility to run inference given a model and a test video.

    The video is assumed to be preprocessed already.
    """
    # (num_channels, num_frames, height, width) -> (num_frames, num_channels, height, width)
    permuted_sample_test_video = video.permute(1, 0, 2, 3)

    # Labels are omitted here; add a "labels" entry to the inputs only if you
    # have ground truth available.
    inputs = {"pixel_values": permuted_sample_test_video.unsqueeze(0)}
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    model = model.to(device)

    # forward pass
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

    return logits

   
# Load the fine-tuned checkpoint and its matching image processor once at startup.
model_name = "latif98/videomae-base-finetuned-isl-numbers_aug"
image_processor = VideoMAEImageProcessor.from_pretrained(model_name)
model = VideoMAEForVideoClassification.from_pretrained(model_name)




def video_identity(video):
    return video
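
# Sketch: how preprocess_video and run_inference could be wired into the demo in
# place of the identity function. Assumes the fine-tuned checkpoint populates
# model.config.id2label with human-readable label names; pass classify_video to
# gr.Interface below to use it.
def classify_video(video_path):
    video_tensor = preprocess_video(video_path, image_processor, model.config)
    logits = run_inference(model, video_tensor)
    predicted_idx = int(logits.argmax(-1).item())
    return model.config.id2label[predicted_idx]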


demo = gr.Interface(
    video_identity,
    gr.Video(),
    "playable_video",
)

if __name__ == "__main__":
    demo.launch()