latif98 committed on
Commit
3acce3f
1 Parent(s): 13ed778

Update app.py

Files changed (1)
  1. app.py +123 -157
app.py CHANGED
@@ -1,15 +1,8 @@
-
+import cv2
 import gradio as gr
-import pytorchvideo
-import torch
-import torchvision
+import imutils
 import numpy as np
-import accelerate
-import evaluate
-from transformers import TrainingArguments, Trainer
-from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification
-from torchvision.transforms import Compose
-from pytorchvideo.data.labeled_video_dataset import LabeledVideoDataset
+import torch
 from pytorchvideo.transforms import (
     ApplyTransformToKey,
     Normalize,
@@ -18,157 +11,130 @@ from pytorchvideo.transforms import (
     ShortSideScale,
     UniformTemporalSubsample,
 )
-
 from torchvision.transforms import (
     Compose,
     Lambda,
+    RandomCrop,
+    RandomHorizontalFlip,
     Resize,
 )
-
-# def preprocess_video(video, image_processor, model_config):
-#     mean = image_processor.image_mean
-#     std = image_processor.image_std
-
-#     if "shortest_edge" in image_processor.size:
-#         height = width = image_processor.size["shortest_edge"]
-#     else:
-#         height = image_processor.size["height"]
-#         width = image_processor.size["width"]
-
-#     resize_to = (height, width)
-#     num_frames_to_sample = model_config.num_frames
-
-#     transform = Compose(
-#         [
-#             UniformTemporalSubsample(num_frames_to_sample),
-#             Lambda(lambda x: x / 255.0),
-#             Normalize(mean, std),
-#             Resize(resize_to),
-#         ]
-#     )
-
-
-#     video_tensor = transform(video)
-
-#     return video_tensor
-
-
-# def run_inference(model,image_processor, video):
-#     """Utility to run inference given a model and test video.
-
-#     The video is assumed to be preprocessed already.
-#     """
-#     # (num_frames, num_channels, height, width)
-#     # perumuted_sample_test_video = video.permute(1, 0, 2, 3)
-#     preprocessed_video = preprocess_video(video, image_processor, model.config)
-#     inputs = {
-#         "pixel_values": preprocessed_video.unsqueeze(0),
-#         "labels": torch.tensor([int(sample_test_video["label"])]), # this can be skipped if you don't have labels available.
-#     }
-#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-#     inputs = {k: v.to(device) for k, v in inputs.items()}
-#     model = model.to(device)
-
-#     # forward pass
-#     with torch.no_grad():
-#         outputs = model(**inputs)
-#         logits = outputs.logits
-
-#     predicted_class = logits.argmax(dim=-1).item()
-#     class_labels = model.config.id2label
-#     predicted_label = class_labels[predicted_class]
-
-#     return predicted_label
-
-
-# def video_identity(video):
-#     return video
-
-
-# model_name = "latif98/videomae-base-finetuned-isl-numbers_aug"
-# image_processor = VideoMAEImageProcessor.from_pretrained(model_name)
-# model = VideoMAEForVideoClassification.from_pretrained(model_name)
-# predicted_label = run_inference(model,image_processor,video_identity(gr.Video()))
-
-
-
-
-
-
-# demo = gr.Interface(video_identity,
-#                     gr.Video(),
-#                     "playable_video",
-#                     output = predicted_label,
-#                     title="VideoMAE fine-tuned on numbers, alphabets and nouns videos.",
-#                     description="Gradio demo app of fine-tuned VideoMAE for video classification, To use it simply upload your video.",
-#                     article = "VideoMAE"
-#                     )
-
-
-import gradio as gr
-import pytorchvideo
-import torch
-from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification
-
-def preprocess_video(video, image_processor, model_config):
-    mean = image_processor.image_mean
-    std = image_processor.image_std
-
-    if "shortest_edge" in image_processor.size:
-        height = width = image_processor.size["shortest_edge"]
-    else:
-        height = image_processor.size["height"]
-        width = image_processor.size["width"]
-
-    resize_to = (height, width)
-    num_frames_to_sample = model_config.num_frames
-
-    transform = Compose(
-        [
-            UniformTemporalSubsample(num_frames_to_sample),
-            Lambda(lambda x: x / 255.0),
-            Normalize(mean, std),
-            Resize(resize_to),
-        ]
-    )
-
-    video_tensor = transform(video)
-    return video_tensor
-
-def run_inference(model, image_processor, video):
-    """Utility to run inference given a model and test video."""
-    preprocessed_video = preprocess_video(video, image_processor, model.config)
-    inputs = {"pixel_values": preprocessed_video.unsqueeze(0)}
-
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    inputs = {k: v.to(device) for k, v in inputs.items()}
-    model = model.to(device)
-
+from transformers import VideoMAEFeatureExtractor, VideoMAEForVideoClassification
+
+MODEL_CKPT = "latif98/videomae-base-finetuned-isl-numbers_aug"
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+MODEL = VideoMAEForVideoClassification.from_pretrained(MODEL_CKPT).to(DEVICE)
+PROCESSOR = VideoMAEFeatureExtractor.from_pretrained(MODEL_CKPT)
+
+RESIZE_TO = PROCESSOR.size["shortest_edge"]
+NUM_FRAMES_TO_SAMPLE = MODEL.config.num_frames
+IMAGE_STATS = {"image_mean": [0.485, 0.456, 0.406], "image_std": [0.229, 0.224, 0.225]}
+VAL_TRANSFORMS = Compose(
+    [
+        UniformTemporalSubsample(NUM_FRAMES_TO_SAMPLE),
+        Lambda(lambda x: x / 255.0),
+        Normalize(IMAGE_STATS["image_mean"], IMAGE_STATS["image_std"]),
+        Resize((RESIZE_TO, RESIZE_TO)),
+    ]
+)
+LABELS = list(MODEL.config.label2id.keys())
+
+
+def parse_video(video_file):
+    """A utility to parse the input videos.
+
+    Reference: https://pyimagesearch.com/2018/11/12/yolo-object-detection-with-opencv/
+    """
+    vs = cv2.VideoCapture(video_file)
+
+    # try to determine the total number of frames in the video file
+    try:
+        prop = (
+            cv2.cv.CV_CAP_PROP_FRAME_COUNT
+            if imutils.is_cv2()
+            else cv2.CAP_PROP_FRAME_COUNT
+        )
+        total = int(vs.get(prop))
+        print("[INFO] {} total frames in video".format(total))
+
+    # an error occurred while trying to determine the total
+    # number of frames in the video file
+    except:
+        print("[INFO] could not determine # of frames in video")
+        print("[INFO] no approx. completion time can be provided")
+        total = -1
+
+    frames = []
+
+    # loop over frames from the video file stream
+    while True:
+        # read the next frame from the file
+        (grabbed, frame) = vs.read()
+        if frame is not None:
+            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            frames.append(frame)
+        # if the frame was not grabbed, then we have reached the end
+        # of the stream
+        if not grabbed:
+            break
+
+    return frames
+
+
+def preprocess_video(frames: list):
+    """Utility to apply preprocessing transformations to a video tensor."""
+    # Each frame in the `frames` list has the shape: (height, width, num_channels).
+    # Collated together the `frames` has the the shape: (num_frames, height, width, num_channels).
+    # So, after converting the `frames` list to a torch tensor, we permute the shape
+    # such that it becomes (num_channels, num_frames, height, width) to make
+    # the shape compatible with the preprocessing transformations. After applying the
+    # preprocessing chain, we permute the shape to (num_frames, num_channels, height, width)
+    # to make it compatible with the model. Finally, we add a batch dimension so that our video
+    # classification model can operate on it.
+    video_tensor = torch.tensor(np.array(frames).astype(frames[0].dtype))
+    video_tensor = video_tensor.permute(
+        3, 0, 1, 2
+    )  # (num_channels, num_frames, height, width)
+    video_tensor_pp = VAL_TRANSFORMS(video_tensor)
+    video_tensor_pp = video_tensor_pp.permute(
+        1, 0, 2, 3
+    )  # (num_frames, num_channels, height, width)
+    video_tensor_pp = video_tensor_pp.unsqueeze(0)
+    return video_tensor_pp.to(DEVICE)
+
+
+def infer(video_file):
+    frames = parse_video(video_file)
+    video_tensor = preprocess_video(frames)
+    inputs = {"pixel_values": video_tensor}
+
+    # forward pass
     with torch.no_grad():
-        outputs = model(**inputs)
+        outputs = MODEL(**inputs)
         logits = outputs.logits
-
-    predicted_class = logits.argmax(dim=-1).item()
-    class_labels = model.config.id2label
-    predicted_label = class_labels[predicted_class]
-
-    return predicted_label
-
-model_name = "latif98/videomae-base-finetuned-isl-numbers_aug"
-image_processor = VideoMAEImageProcessor.from_pretrained(model_name)
-model = VideoMAEForVideoClassification.from_pretrained(model_name)
-
-demo = gr.Interface(
-    run_inference,
-    [model,gr.Video(), image_processor],
-    outputs = 'text',
-    title="VideoMAE fine-tuned on numbers, alphabets and nouns videos.",
-    description="Gradio demo app of fine-tuned VideoMAE for video classification, To use it simply upload your video.",
-    article="VideoMAE"
-)
-
-
-if __name__ == "__main__":
-    demo.launch()
-
-
+    softmax_scores = torch.nn.functional.softmax(logits, dim=-1).squeeze(0)
+    confidences = {LABELS[i]: float(softmax_scores[i]) for i in range(len(LABELS))}
+    return confidences
+
+
+gr.Interface(
+    fn=infer,
+    inputs=gr.Video(type="file"),
+    outputs=gr.Label(num_top_classes=3),
+    examples=[
+        ["examples/babycrawling.mp4"],
+        ["examples/baseball.mp4"],
+        ["examples/balancebeam.mp4"],
+    ],
+    title="VideoMAE fine-tuned on a subset of UCF-101",
+    description=(
+        "Gradio demo for VideoMAE for video classification. To use it, simply upload your video or click one of the"
+        " examples to load them. Read more at the links below."
+    ),
+    article=(
+        "<div style='text-align: center;'><a href='https://huggingface.co/docs/transformers/model_doc/videomae' target='_blank'>VideoMAE</a>"
+        " <center><a href='https://huggingface.co/sayakpaul/videomae-base-finetuned-kinetics-finetuned-ucf101-subset' target='_blank'>Fine-tuned Model</a></center></div>"
+    ),
+    allow_flagging=False,
+    allow_screenshot=False,
+).launch()
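
For a quick sanity check of the inference path this commit introduces, the same checkpoint can be exercised outside the Gradio UI with a short standalone script. The following is a minimal sketch and not part of the commit: the clip path sample.mp4 is hypothetical, it uses VideoMAEImageProcessor instead of the deprecated VideoMAEFeatureExtractor alias used in app.py, and it lets the processor handle resizing and normalization rather than the manual pytorchvideo transform chain.

# Standalone sanity check for the fine-tuned checkpoint (sketch only).
# Assumptions: a readable local clip named "sample.mp4" exists and contains
# at least a handful of frames.
import cv2
import numpy as np
import torch
from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification

ckpt = "latif98/videomae-base-finetuned-isl-numbers_aug"
processor = VideoMAEImageProcessor.from_pretrained(ckpt)
model = VideoMAEForVideoClassification.from_pretrained(ckpt).eval()

# Read all frames with OpenCV and convert BGR -> RGB, mirroring parse_video().
cap = cv2.VideoCapture("sample.mp4")
frames = []
while True:
    grabbed, frame = cap.read()
    if not grabbed:
        break
    frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
cap.release()

# Uniformly subsample to the number of frames the model expects, analogous to
# UniformTemporalSubsample in the committed VAL_TRANSFORMS chain.
num_frames = model.config.num_frames
indices = np.linspace(0, len(frames) - 1, num=num_frames).astype(int)
clip = [frames[i] for i in indices]

# The image processor resizes and normalizes the frames and returns
# pixel_values of shape (1, num_frames, num_channels, height, width).
inputs = processor(clip, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
print(model.config.id2label[logits.argmax(-1).item()])

If the printed label matches what the Gradio demo reports for the same clip, the manual preprocessing in app.py and the processor-based path agree.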