To start, this is a continuation of this question: Multithreading degrades GPU performance. That question was never resolved because nobody could reproduce the results, so I have created a new question with code that does reproduce the slower results described there.
To recap: when cv2.VideoCapture is used with multi-threading, Detectron2's inference time is much slower than when multi-threading is disabled.
For additional context, I am running on Windows with an RTX 3070, so inference times may differ slightly for anyone trying to reproduce this.
Here is the code:
import time
import cv2
from queue import Queue
from threading import Thread
from detectron2.config import get_cfg
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor
class FileVideoStream:
    def __init__(self, path, queueSize=15):
        self.stream = cv2.VideoCapture(path)
        self.stopped = False
        self.Q = Queue(maxsize=queueSize)

    def start(self):
        # read frames on a daemon thread so the main thread can run inference
        t = Thread(target=self.update, args=())
        t.daemon = True
        t.start()
        return self

    def update(self):
        while True:
            if self.stopped:
                self.stream.release()
                return
            if not self.Q.full():
                (grabbed, frame) = self.stream.read()
                if not grabbed:
                    self.stop()
                    return
                self.Q.put(frame)

    def stop(self):
        self.stopped = True
cfg = get_cfg()
cfg.merge_from_file(
    model_zoo.get_config_file("COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml")
)
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.7  # set threshold for this model
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(
    "COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml"
)
cfg.MODEL.DEVICE = "cuda"
predictor = DefaultPredictor(cfg)
def threading_example():
    print("Threading Example:")
    fvs = FileVideoStream(r"DemoVideo.mp4")
    fvs.start()
    # allow time for the reader thread to fill the queue
    time.sleep(1)
    for i in range(5):
        img = fvs.Q.get()
        start = time.time()
        p = predictor(img)
        end = time.time()
        print(f"Frame {i} Prediction: {(end - start):.2f}s")
    fvs.stopped = True
def non_threading_example():
    print("Non-Threading Example:")
    video = cv2.VideoCapture(r"DemoVideo.mp4")
    for i in range(5):
        _, img = video.read()
        start = time.time()
        p = predictor(img)
        end = time.time()
        print(f"Frame {i} Prediction: {(end - start):.2f}s")
    video.release()
non_threading_example()
threading_example()
This produces the following output:
Non-Threading Example:
Frame 0 Prediction: 1.41s
Frame 1 Prediction: 0.14s
Frame 2 Prediction: 0.14s
Frame 3 Prediction: 0.14s
Frame 4 Prediction: 0.14s
Threading Example:
Frame 0 Prediction: 10.55s
Frame 1 Prediction: 10.41s
Frame 2 Prediction: 10.77s
Frame 3 Prediction: 10.64s
Frame 4 Prediction: 10.27s
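Note that these numbers are plain wall-clock measurements around the predictor call, so they include CPU-side overhead as well as the GPU work. A minimal sketch of how the GPU portion could be isolated (assuming torch is importable, which it is since Detectron2 is built on PyTorch; the function name is just for illustration):

import time
import torch

def time_inference(predictor, img, n=5):
    # torch.cuda.synchronize() blocks until all queued CUDA kernels finish,
    # so the measured interval covers exactly the work done for this call
    # and is not distorted by asynchronous kernel launches.
    for i in range(n):
        torch.cuda.synchronize()
        start = time.time()
        _ = predictor(img)
        torch.cuda.synchronize()
        end = time.time()
        print(f"Frame {i} Prediction: {(end - start):.2f}s")

The relative gap between the threaded and non-threaded runs is large enough that this is unlikely to change the picture, but it rules out measurement artifacts.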
EDIT: I've added code to address a comment asking whether inference itself is slower when run inside a thread. That does not appear to be the case.
def infer_5(img):
    for i in range(5):
        start = time.time()
        p = predictor(img)
        end = time.time()
        print(f"Frame {i}: {(end - start):.2f}s")

def system_load():
    # run inference from inside a thread on a single image
    img = cv2.imread(r"Image.jpg")
    t = Thread(target=infer_5, args=(img,))
    t.start()
    t.join()

system_load()
This prints:
Frame 0: 7.51s
Frame 1: 0.39s
Frame 2: 0.15s
Frame 3: 0.15s
Frame 4: 0.15s
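For completeness, here is a sketch of the reverse test: inference in the main thread on a fixed image while the FileVideoStream reader thread from above runs in the background. This would show whether the background video-reading loop by itself is enough to slow inference down (the function name is just for illustration; it reuses the predictor, FileVideoStream, and image path already defined above):

def main_thread_inference_with_reader():
    # Start the background reader so its read loop runs during inference.
    fvs = FileVideoStream(r"DemoVideo.mp4").start()
    time.sleep(1)  # let the queue fill so the reader keeps spinning
    img = cv2.imread(r"Image.jpg")
    for i in range(5):
        start = time.time()
        p = predictor(img)
        end = time.time()
        print(f"Frame {i}: {(end - start):.2f}s")
    fvs.stop()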