I would like to have several processes, each one loading different images one at a time and performing inference (for example with VGG16).
I am using Keras with the TensorFlow backend and a single GPU (GTX 1070). Following is the code:
import tensorflow as tf
import multiprocessing
from multiprocessing import Pool, Process, Queue
import os
from os.path import isfile, join
from PIL import Image
import time
from keras.applications.vgg16 import VGG16
import numpy as np
from keras.backend.tensorflow_backend import set_session
test_path = 'test path to images ...'
output = Queue()
def worker(file_names, output):
    # Limit this process to ~25% of the GPU memory so several workers can share the single GPU.
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.25
    config.gpu_options.visible_device_list = "0"
    set_session(tf.Session(config=config))

    inference_model = VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3), pooling='avg')
    model_image_size = (224, 224)

    times = []
    for file_name in file_names:
        # Load the image, center-crop the width to a square and resize to the model input size.
        image = Image.open(os.path.join(test_path, file_name))
        im_width = image.size[0]
        im_height = image.size[1]
        m = (im_width - im_height) // 2
        image = image.crop((m, 0, im_width - m, im_height))
        image = image.resize(model_image_size, Image.BICUBIC)
        image = np.array(image, dtype='float32')
        image /= 255.
        image = np.expand_dims(image, 0)  # Add batch dimension.

        # Time only the forward pass.
        start = time.time()
        res = inference_model.predict(image)
        end = time.time()
        elapsed_time = end - start
        print("elapsed time", elapsed_time)
        times.append(elapsed_time)

    # Average over all but the first two (warm-up) inferences.
    average_time = np.mean(times[2:])
    print("average time", average_time)
if __name__ == '__main__':
    file_names = [f for f in os.listdir(test_path) if isfile(join(test_path, f))]
    file_names.sort()

    num_workers = 3
    # Round-robin split: worker x gets files x, x + num_workers, x + 2 * num_workers, ...
    processes = [Process(target=worker, args=(file_names[x::num_workers], output)) for x in range(num_workers)]
    for p in processes:
        p.start()
    for p in processes:
        p.join()
I have noticed that the per-image inference time is slower with multiple processes than with a single process. For example, with a single process the inference time per image is about 0.012 sec, and I would expect roughly the same when running 3 processes; however, the average inference time per image is then almost 0.02 sec. What could be the reason for that? (Maybe CUDA context switching?) Is there a way to solve this?
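For reference, the single-process comparison corresponds to running the same worker over the full file list in one process, roughly like this (a minimal sketch that reuses worker, test_path and output from the script above, keeping the 0.25 memory fraction so the two runs are comparable):

# Single-process baseline: one worker handles all files with the same GPU settings.
if __name__ == '__main__':
    file_names = [f for f in os.listdir(test_path) if isfile(join(test_path, f))]
    file_names.sort()
    worker(file_names, output)  # reproduces the ~0.012 sec per-image time mentioned above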