# Pose inferencing
import mmpose
from mmpose.apis import MMPoseInferencer

# Ultralytics
from ultralytics import YOLO
import torch

# Gradio
import gradio as gr
import moviepy.editor as moviepy

# System and files
import os
import glob
import uuid

# Image manipulation
import numpy as np
import cv2

print("[INFO]: Imported modules!")

human = MMPoseInferencer("human")
hand = MMPoseInferencer("hand")
human3d = MMPoseInferencer(pose3d="human3d")
track_model = YOLO("yolov8n.pt")  # Load an official Ultralytics detection model

# Dictionary of the available inferencer models
inferencers = {"Estimate human 2d poses": human,
               "Estimate human 2d hand poses": hand,
               "Estimate human 3d poses": human3d,
               "Detect and track": track_model}

print("[INFO]: Downloaded models!")


def check_extension(video):
    # Extract the file name and extension; re-encode to MP4 when needed
    file_name, file_extension = os.path.splitext(video)

    if file_extension != ".mp4":  # "is not" compared identity, not string equality
        clip = moviepy.VideoFileClip(video)
        video = file_name + ".mp4"
        clip.write_videofile(video)

    return video


def tracking(video, model, boxes=True):
    # Perform tracking with the model
    # https://docs.ultralytics.com/modes/predict/
    print("[INFO] Starting tracking!")
    annotated_frame = model(video, boxes=boxes)

    return annotated_frame


def show_tracking(video_content):
    # https://docs.ultralytics.com/datasets/detect/coco/
    video = cv2.VideoCapture(video_content)

    # Track objects through the video with the YOLO model
    video_track = tracking(video_content, track_model.track)

    # Prepare to save the annotated video
    out_file = "track.mp4"
    print("[INFO]: TRACK", out_file)

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")  # Codec for MP4 video
    fps = video.get(cv2.CAP_PROP_FPS)
    height, width, _ = video_track[0].orig_img.shape  # One Results object per frame
    size = (width, height)

    out_track = cv2.VideoWriter(out_file, fourcc, fps, size)

    # Go through the per-frame results and write out the annotated frames
    for frame_track in video_track:
        result_track = frame_track.plot()  # Plot a BGR numpy array of the predictions
        out_track.write(result_track)

    print("[INFO] Done with frames")

    out_track.release()
    video.release()
    cv2.destroyAllWindows()  # Close any OpenCV windows

    return out_file


def pose3d(video):
    video = check_extension(video)

    # Define a new unique folder for the visualisation output
    add_dir = str(uuid.uuid4())
    vis_out_dir = os.path.join("/".join(video.split("/")[:-1]), add_dir)
    os.makedirs(vis_out_dir)

    result_generator = human3d(video,
                               vis_out_dir=vis_out_dir,
                               thickness=2,
                               return_vis=True,
                               rebase_keypoint_height=True,
                               device="cuda")

    # Consume the generator to run inference over the whole video
    result = [result for result in result_generator]

    out_file = glob.glob(os.path.join(vis_out_dir, "*.mp4"))

    return "".join(out_file)


def pose2d(video, kpt_threshold):
    video = check_extension(video)

    # Define a new unique folder for the visualisation output
    add_dir = str(uuid.uuid4())
    vis_out_dir = os.path.join("/".join(video.split("/")[:-1]), add_dir)
    os.makedirs(vis_out_dir)

    result_generator = human(video,
                             vis_out_dir=vis_out_dir,
                             return_vis=True,
                             thickness=2,
                             rebase_keypoint_height=True,
                             kpt_thr=kpt_threshold,
                             device="cuda")

    # Consume the generator to run inference over the whole video
    result = [result for result in result_generator]

    out_file = glob.glob(os.path.join(vis_out_dir, "*.mp4"))

    return "".join(out_file)
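
# NOTE: pose3d/pose2d above and pose2dhand below repeat the same
# unique-output-folder setup. A helper along these lines could factor that
# out (an illustrative sketch with a hypothetical name, not wired in):
def make_vis_dir(video):
    """Create a unique visualisation folder next to the input video."""
    vis_out_dir = os.path.join(os.path.dirname(video), str(uuid.uuid4()))
    os.makedirs(vis_out_dir)
    return vis_out_dir
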
def pose2dhand(video, kpt_threshold):
    video = check_extension(video)

    # Define a new unique folder for the visualisation output
    add_dir = str(uuid.uuid4())
    vis_out_dir = os.path.join("/".join(video.split("/")[:-1]), add_dir)
    os.makedirs(vis_out_dir)

    result_generator = hand(video,
                            vis_out_dir=vis_out_dir,
                            return_vis=True,
                            thickness=2,
                            rebase_keypoint_height=True,
                            kpt_thr=kpt_threshold,
                            device="cuda")

    # Consume the generator to run inference over the whole video
    result = [result for result in result_generator]

    out_file = glob.glob(os.path.join(vis_out_dir, "*.mp4"))

    return "".join(out_file)


def run_UI():
    with gr.Blocks() as demo:
        with gr.Column():
            with gr.Tab("Upload video"):
                with gr.Row():
                    with gr.Column():
                        video_input = gr.Video(source="upload", height=612)  # gr.Video passes a filepath to the callbacks
                        # Slider for the keypoint confidence threshold (kpt_thr)
                        file_kpthr = gr.Slider(minimum=0.1, maximum=1.0, step=0.01, value=0.3,
                                               label="Keypoint threshold")
                        submit_pose_file = gr.Button("Make 2d pose estimation")
                        submit_pose3d_file = gr.Button("Make 3d pose estimation")
                        submit_hand_file = gr.Button("Make 2d hand estimation")
                        submit_detect_file = gr.Button("Detect and track objects")

                    with gr.Column():
                        video_output1 = gr.PlayableVideo(height=512, label="Estimate human 2d poses", show_label=True)
                        video_output2 = gr.PlayableVideo(height=512, label="Estimate human 3d poses", show_label=True)
                        video_output3 = gr.PlayableVideo(height=512, label="Estimate human hand poses", show_label=True)
                        video_output4 = gr.Video(height=512, label="Detection and tracking", show_label=True, format="mp4")

            with gr.Tab("Record video with webcam"):
                with gr.Column():
                    with gr.Row():
                        with gr.Column():
                            webcam_input = gr.Video(source="webcam", height=612)
                            # Slider for the keypoint confidence threshold (kpt_thr)
                            web_kpthr = gr.Slider(minimum=0.1, maximum=1.0, step=0.01, value=0.3,
                                                  label="Keypoint threshold")
                            submit_pose_web = gr.Button("Make 2d pose estimation")
                            submit_pose3d_web = gr.Button("Make 3d pose estimation")
                            submit_hand_web = gr.Button("Make 2d hand estimation")
                            submit_detect_web = gr.Button("Detect and track objects")

                    with gr.Row():
                        webcam_output1 = gr.PlayableVideo(height=512, label="Estimate human 2d poses", show_label=True)
                        webcam_output2 = gr.PlayableVideo(height=512, label="Estimate human 3d poses", show_label=True)
                        webcam_output3 = gr.PlayableVideo(height=512, label="Estimate human hand poses", show_label=True)
                        webcam_output4 = gr.Video(height=512, label="Detection and tracking", show_label=True, format="mp4")

        # From file
        submit_pose_file.click(fn=pose2d,
                               inputs=[video_input, file_kpthr],
                               outputs=video_output1)
        submit_pose3d_file.click(fn=pose3d,
                                 inputs=video_input,
                                 outputs=video_output2)
        submit_hand_file.click(fn=pose2dhand,
                               inputs=[video_input, file_kpthr],
                               outputs=video_output3)
        submit_detect_file.click(fn=show_tracking,
                                 inputs=video_input,
                                 outputs=video_output4)

        # From webcam
        submit_pose_web.click(fn=pose2d,
                              inputs=[webcam_input, web_kpthr],
                              outputs=webcam_output1)
        submit_pose3d_web.click(fn=pose3d,
                                inputs=webcam_input,
                                outputs=webcam_output2)
        submit_hand_web.click(fn=pose2dhand,
                              inputs=[webcam_input, web_kpthr],
                              outputs=webcam_output3)
        submit_detect_web.click(fn=show_tracking,
                                inputs=webcam_input,
                                outputs=webcam_output4)

    demo.launch(server_name="0.0.0.0", server_port=7860)


if __name__ == "__main__":
    run_UI()
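
# A quick headless sanity check without launching the UI, assuming a short
# test clip is available (the "sample.mp4" path below is hypothetical):
#
#   print(pose2d("sample.mp4", kpt_threshold=0.3))
#   print(show_tracking("sample.mp4"))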