I have posted code on this site before and I learnt that I can't post the whole thing. So, I will only post the code that matters.
So, what I am trying to do is to take an object detector(for images) and applying it to each frame of a given video.
The only thing is that I don't know how to finish it up. That is, once I detect the first frame what do I do with this frame? Do I store it somewhere? What do I do with the other frames? And once I handle these frames how do I recombine these frames into a video ie the output video?
Here is the code:
import numpy as np
import cv2
from numpy import expand_dims
from keras.models import load_model
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from matplotlib import pyplot
from matplotlib.patches import Rectangle
model = load_model('model.h5')
# define the expected input shape for the model
input_w, input_h = 416, 416
# define the anchors
anchors = [[116,90, 156,198, 373,326], [30,61, 62,45, 59,119], [10,13, 16,30, 33,23]]
# define the labels
labels = ["person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck",
"boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench",
"bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
"backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard",
"sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
"tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana",
"apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake",
"chair", "sofa", "pottedplant", "bed", "diningtable", "toilet", "tvmonitor", "laptop", "mouse",
"remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator",
"book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"]
vs = cv2.VideoCapture('video.mp4')
class_threshold = 0.6
boxes = list()
while True:
(grabbed, frame) = vs.read()
if not grabbed:
break
if W is None or H is None:
(H, W) = frame.shape[:2]
image, image_w, image_h = load_image_pixels(frame, (input_w, input_h))
yhat = model.predict(image)
for i in range(len(yhat)):
# decode the output of the network
boxes += decode_netout(yhat[i][0], anchors[i], class_threshhold, input_h, input_w)
# correct the sizes of the bounding boxes for the shape of the image
correct_yolo_boxes(boxes, image_h, image_w, input_h, input_w)
# suppress non-maximal boxes
do_nms(boxes, 0.5)
# get the details of the detected objects
v_boxes, v_labels, v_scores = get_boxes(boxes, labels, class_threshold)
# draw what we found
draw_boxes(frame, v_boxes, v_labels, v_scores)