0

I have this simple Python code that predicts the emotion shown on a face (I took it from here, in case you need to run it) and displays the prediction on the rectangle drawn around the face in the camera feed. The problem is that the predictions are very noisy: for instance, Fearful -- Sad -- -- Sad and so on. I want to smooth the predictions and filter out isolated ones. How can I change the code so that a label such as Sad is displayed only after n predictions in a row have said Sad?

You'll only need to change the last few lines as the initial parts are all for predictions.

import numpy as np
import argparse
import matplotlib.pyplot as plt
import cv2
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Read the run mode ("train" or "display") from the command line.
ap = argparse.ArgumentParser()
ap.add_argument("--mode", help="train/display")
cli_args = ap.parse_args()
# `mode` selects which branch of the script runs further down.
mode = cli_args.mode

# plots accuracy and loss curves
def plot_model_history(model_history):
    """
    Plot accuracy and loss curves for a finished training run.

    Draws two side-by-side panels (accuracy on the left, loss on the right),
    each showing the training and the validation series, saves the figure to
    'plot.png' and then shows it on screen.

    Parameters
    ----------
    model_history : object exposing a ``history`` mapping
        The mapping must contain the keys 'accuracy', 'val_accuracy', 'loss'
        and 'val_loss', each holding one value per epoch.
    """
    history = model_history.history
    fig, axs = plt.subplots(1, 2, figsize=(15, 5))

    # (axis, history key, panel title, y-axis label) for each panel.
    panels = (
        (axs[0], 'accuracy', 'Model Accuracy', 'Accuracy'),
        (axs[1], 'loss', 'Model Loss', 'Loss'),
    )
    for ax, key, title, ylabel in panels:
        train_values = history[key]
        val_values = history['val_' + key]
        n_epochs = len(train_values)

        # Epochs are numbered from 1 on the x-axis.
        ax.plot(range(1, n_epochs + 1), train_values)
        ax.plot(range(1, len(val_values) + 1), val_values)
        ax.set_title(title)
        ax.set_ylabel(ylabel)
        ax.set_xlabel('Epoch')

        # The tick spacing is the *step* of np.arange. It used to be passed as
        # a second argument to set_xticks, where it is not a spacing at all.
        # An integer step of at least 1 keeps ticks on whole epochs even for
        # runs shorter than ten epochs.
        tick_step = max(1, n_epochs // 10)
        ax.set_xticks(np.arange(1, n_epochs + 1, tick_step))
        ax.legend(['train', 'val'], loc='best')

    fig.savefig('plot.png')
    plt.show()

# Dataset locations
train_dir = 'data/train'
val_dir = 'data/test'

# Sample counts and training hyper-parameters; the counts are used further
# down to derive the number of steps per epoch.
num_train = 28709
num_val = 7178
batch_size = 64
num_epoch = 50

# Both generators only rescale pixel values by 1/255.
train_datagen = ImageDataGenerator(rescale=1./255)
val_datagen = ImageDataGenerator(rescale=1./255)

# Options shared by the training and validation directory iterators:
# 48x48 grayscale images with categorical labels.
_flow_options = dict(
    target_size=(48, 48),
    batch_size=batch_size,
    color_mode="grayscale",
    class_mode='categorical',
)

train_generator = train_datagen.flow_from_directory(train_dir, **_flow_options)
validation_generator = val_datagen.flow_from_directory(val_dir, **_flow_options)

# Create the model: a small CNN over 48x48x1 inputs ending in a 7-way softmax.
# Layers are listed in the order they are stacked.
model = Sequential([
    # Two convolutions, then pooling and dropout
    Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(48, 48, 1)),
    Conv2D(64, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),

    # Two convolution + pooling stages, then dropout
    Conv2D(128, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(128, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),

    # Classifier head
    Flatten(),
    Dense(1024, activation='relu'),
    Dropout(0.5),
    Dense(7, activation='softmax'),
])

# If you want to train the same model or try other models, go for this
if mode == "train":
    # NOTE(review): newer TensorFlow/Keras releases deprecate `lr`, `decay`
    # and `fit_generator` -- confirm against the installed version before
    # upgrading.
    model.compile(loss='categorical_crossentropy',optimizer=Adam(lr=0.0001, decay=1e-6),metrics=['accuracy'])
    model_info = model.fit_generator(
            train_generator,
            steps_per_epoch=num_train // batch_size,
            epochs=num_epoch,
            validation_data=validation_generator,
            validation_steps=num_val // batch_size)
    plot_model_history(model_info)
    model.save_weights('model.h5')

# emotions will be displayed on your face from the webcam feed
elif mode == "display":
    model.load_weights('model.h5')

    # prevents openCL usage and unnecessary logging messages
    cv2.ocl.setUseOpenCL(False)

    # dictionary which assigns each label an emotion (alphabetical order)
    emotion_dict = {0: "Angry", 1: "Disgusted", 2: "Fearful", 3: "Happy", 4: "Neutral", 5: "Sad", 6: "Surprised"}

    # Load the Haar cascade once. Re-creating it inside the frame loop would
    # re-read and re-parse the XML file for every single frame.
    facecasc = cv2.CascadeClassifier('haarcascade_frontalface_default.xml')

    # start the webcam feed
    cap = cv2.VideoCapture(1)
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                # No frame could be read: stop the display loop.
                break

            # Detect faces on the grayscale version of the frame.
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            faces = facecasc.detectMultiScale(gray, scaleFactor=1.3, minNeighbors=5)

            for (x, y, w, h) in faces:
                # Bounding box, extended above and below the detected face.
                cv2.rectangle(frame, (x, y-50), (x+w, y+h+10), (255, 0, 0), 2)

                # Crop the face, resize to the 48x48 network input and add the
                # batch and channel axes -> shape (1, 48, 48, 1).
                # NOTE(review): the training generators rescale pixels by
                # 1/255 while this crop is fed as raw values -- confirm that
                # the mismatch is intended.
                roi_gray = gray[y:y + h, x:x + w]
                cropped_img = np.expand_dims(np.expand_dims(cv2.resize(roi_gray, (48, 48)), -1), 0)
                prediction = model.predict(cropped_img)
                maxindex = int(np.argmax(prediction))
                text = emotion_dict[maxindex]

                # Angry and Disgusted are displayed as "Sad".
                if text in ("Sad", "Angry", "Disgusted"):
                    text = "Sad"
                # Only "Happy" and "Sad" are drawn; other labels show no text.
                if text in ("Happy", "Sad"):
                    cv2.putText(frame, text, (x+20, y-60), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

            cv2.imshow('Video', cv2.resize(frame,(1600,960),interpolation = cv2.INTER_CUBIC))
            # Press 'q' to quit.
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Release the camera and close the window even if a frame raises.
        cap.release()
        cv2.destroyAllWindows()
Tina J
  • 4,983
  • 13
  • 59
  • 125
  • it looks like your output vector (final step) is of shape (7,1) -- if your goal is to turn that into a unified output I'd simply add a fully connected layer 7 -> 1 and update your training set to train on one result (the mode of the current 7) --- unless I'm missing something. IMHO if you want the model to predict a singular outcome bake that into the model don't mask the results -- it'll lead to better performance – Schalton Oct 05 '21 at 03:43
  • Retraining is too expensive for me at this point. Let's start with playing with the output now. – Tina J Oct 05 '21 at 08:25

2 Answers

1

I'd make a list of predictions and take the mode, something like this:

from collections import Counter

prediction_history = []
LOOKBACK = 5 # how far you want to look back

# in loop:
prediction_history.append(maxindex)
# Keep only the last LOOKBACK labels so the vote is taken over the recent
# window only and the list does not grow for as long as the camera runs.
del prediction_history[:-LOOKBACK]
# Majority vote over the window. Iterating newest-first makes ties resolve to
# the most recent label, because most_common keeps first-seen order for
# equal counts.
most_common_index = Counter(reversed(prediction_history)).most_common(1)[0][0]
text = emotion_dict[most_common_index]

specifically in your code:

import os
...
prediction_history = []
LOOKBACK = 5 # how far you want to look back
...

    cap = cv2.VideoCapture(1)
    while True:
        # Find haar cascade to draw bounding box around face
        ret, frame = cap.read()
        if not ret:
            break
        facecasc = cv2.CascadeClassifier('haarcascade_frontalface_default.xml')
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        faces = facecasc.detectMultiScale(gray,scaleFactor=1.3, minNeighbors=5)

        for (x, y, w, h) in faces:
            cv2.rectangle(frame, (x, y-50), (x+w, y+h+10), (255, 0, 0), 2)
            roi_gray = gray[y:y + h, x:x + w]
            cropped_img = np.expand_dims(np.expand_dims(cv2.resize(roi_gray, (48, 48)), -1), 0)
            prediction = model.predict(cropped_img)
            
            # updates
            prediction_history.append(int(np.argmax(prediction)))
            most_common_index = max(set(prediction_history[-LOOKBACK:][::-1]), key = prediction_history.count)
            text = emotion_dict[most_common_index]
            
            if ("Sad" in text) or ("Angry" in text) or ("Disgusted" in text):
                text = "Sad"
            if ("Happy" in text) or ("Sad" in text):
                cv2.putText(frame, text, (x+20, y-60), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

        cv2.imshow('Video', cv2.resize(frame,(1600,960),interpolation = cv2.INTER_CUBIC))
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()

Schalton
  • 2,867
  • 2
  • 32
  • 44
  • It shows an error: `prediction_history.push(maxindex) ... AttributeError: 'list' object has no attribute 'push'`. Can you please write the actual modified code? – Tina J Oct 05 '21 at 14:49
  • `NameError: name 'List' is not defined` – Tina J Oct 05 '21 at 14:51
  • `ValueError: max() arg is an empty sequence` – Tina J Oct 05 '21 at 14:53
  • Sorry; I'd been writing a lot of JS lately, updated – Schalton Oct 05 '21 at 14:55
  • Yes, the prediction_history list wont be populated until you add the first maxindex to it – Schalton Oct 05 '21 at 14:55
  • Thanks. But it doesn't run and crashes right in the beginning: `ValueError: max() arg is an empty sequence` – Tina J Oct 05 '21 at 15:00
  • Yes, sorry -- I had a typo changed `[:-LOOKBACK]` to `[-LOOKBACK:]` – Schalton Oct 05 '21 at 15:02
  • FYI the code `max(set(lst), key=lst.count)` returns the first occurrence of the highest count item eg `[1,2,3,1,2,3] -> 1` and `[2,3,1,2,3,1] -> 2` for this reason it might make sense to reverse the slice `[:-LOOKBACK][::-1]` so that it returns the newest added, highest count item vs `[:LOOKBACK]` which will return the oldest added, highest count item – Schalton Oct 05 '21 at 15:07
  • Also worth noting that if you have `[1,2,1,2,1,2] + 1 + 2` .... you'll still see the oscillation – Schalton Oct 05 '21 at 15:07
  • One thing I had in mind was instead of `mode`, look at 3 in a row. Would that alternative be very different? – Tina J Oct 05 '21 at 15:12
  • @Tina; you can replace this line `most_common_index = max(set(prediction_history[-LOOKBACK:][::-1]), key = prediction_history.count)` with an arbitrary function: `most_common_index = mySelectionCriteria(prediction_history)` and play with different ways to select a result – Schalton Oct 05 '21 at 16:04
  • can you answer my new question? https://stackoverflow.com/questions/69456905/python-cv2-display-a-logo-on-video-capture-based-on-a-condition – Tina J Oct 07 '21 at 01:14
  • I think I already answered that here – Schalton Oct 07 '21 at 16:49
1

Another option that came to me was to skip the initial argmax and take the total (or average) of the preceding frames.

prediction_history.append(prediction) instead of prediction_history.append(int(np.argmax(prediction)))

Append the raw output to the prediction history instead of the argmax. That'll give you a list like this:

# Example value so that the window below spans all seven rows.
LOOKBACK = 7

# One row of raw model outputs per frame, oldest first (illustrative numbers).
prediction_history = [
  [.1, .1, .1, .5, .4, .3, .1], # argmax = 3
  [.1, .2, .1, .5, .4, .3, .1], # argmax = 3
  [.1, .4, .1, .5, .4, .3, .1], # argmax = 3
  [.2, .6, .1, .5, .4, .3, .1], # argmax = 1
  [.1, .3, .1, .1, .4, .3, .1], # argmax = 4
  [.1, .6, .1, .1, .4, .3, .1], # argmax = 1
  [.1, .3, .1, .1, .4, .3, .1], # argmax = 4
]

# Add the rows of the window column by column, then pick the largest total.
# sum: [.8, 2.5, .7, 2.3, 2.8, 2.1, .7] --> 4
most_common_index = int(np.argmax(np.sum(prediction_history[-LOOKBACK:], 0)))
# mode (other answer) of argmax: [3,3,3,1,4,1,4] --> 3


this is also a nice framework for doing simple moving averages or other things -- I think you may end up with a more continuous state ie not [1,2,1,2,1,2,1...]

Given that you want the previous result to impact the outcome, I'd say that if you have an opportunity to re-architect this thing, consider an RNN or something of the sort and let the ML algo handle selecting the rolling category.

Schalton
  • 2,867
  • 2
  • 32
  • 44