In my Python application I am using Detectron2 to run prediction on an image and detect the key-points of all the humans in the image.
I want to run the prediction on frames that are streamed to my app live (using aiortc), but I discovered that the predictions time is much worse because it now runs on a new thread (the main thread is occupied with the server).
Running predictions on a thread takes anywhere between 1.5 to 4 seconds, which is a lot.
When running the predictions on the main-thread (without the video streaming part), I get predictions times of less than a second.
My question is why it happens and how can I fix it¿ Why the GPU performance is degraded so drastically when using it from a new thread¿
Notes:
The code is tested in Google Colab with Tesla P100 GPU and the video streaming is emulated by reading frames from a video file.
I calculate the time it takes to run prediction on a frame using the code in this question.
I tried switching to multiprocessing instead, but couldn't make it work with cuda (I tried both import multiprocessing
as well as import torch.multiprocessing
with set_stratup_method('spawn')
) it just gets stuck when calling start
on the process.
Example code:
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
import threading
from typing import List
import numpy as np
import timeit
import cv2
# Prepare the configuration file
cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file("COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x.yaml"))
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.7 # set threshold for this model
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x.yaml")
cfg.MODEL.DEVICE = "cuda"
predictor = DefaultPredictor(cfg)
def get_frames(video: cv2.VideoCapture):
frames = list()
while True:
has_frame, frame = video.read()
if not has_frame:
break
frames.append(frame)
return frames
class CodeTimer:
# Source: https://stackoverflow.com/a/52749808/9977758
def __init__(self, name=None):
self.name = " '" + name + "'" if name else ''
def __enter__(self):
self.start = timeit.default_timer()
def __exit__(self, exc_type, exc_value, traceback):
self.took = (timeit.default_timer() - self.start) * 1000.0
print('Code block' + self.name + ' took: ' + str(self.took) + ' ms')
video = cv2.VideoCapture('DemoVideo.mp4')
num_frames = round(video.get(cv2.CAP_PROP_FRAME_COUNT))
frames_buffer = list()
predictions = list()
def send_frames():
# This function emulates the stream, so here we "get" a frame and add it to our buffer
for frame in get_frames(video):
frames_buffer.append(frame)
# Simulate delays between frames
time.sleep(random.uniform(0.3, 2.1))
def predict_frames():
predicted_frames = 0 # The number of frames predicted so far
while predicted_frames < num_frames: # Stop after we predicted all frames
buffer_length = len(frames_buffer)
if buffer_length <= predicted_frames:
continue # Wait until we get a new frame
# Read all the frames from the point we stopped
for frame in frames_buffer[predicted_frames:]:
# Measure the prediction time
with CodeTimer('In stream prediction'):
predictions.append(predictor(frame))
predicted_frames += 1
t1 = threading.Thread(target=send_frames)
t1.start()
t2 = threading.Thread(target=predict_frames)
t2.start()
t1.join()
t2.join()