I am currently processing about 15k images, but that number could grow at some point. I wrote a function that makes several changes to each image: it converts it to black and white, crops it, resizes it, and then flattens it. Later I will save the formatted images to a CSV file for use with TensorFlow. I am using the multiprocessing module to make use of more CPU cores, but it takes longer with multiprocessing than it does with a plain for loop that edits one image at a time. I also wrote a simple version of the same program that squares a series of numbers, and for that multiprocessing is actually faster.
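For context, this is roughly how I intend to save the formatted images later (the file name and variable name here are just placeholders):

import numpy as np

# formatted_rows would be the list of flattened images returned by the
# formatting step; each CSV row is one 100 x 100 = 10000 element image
formatted_rows = []
np.savetxt('formatted_images.csv', np.array(formatted_rows), delimiter=',', fmt='%d')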
Would I be better off splitting the data into batches? I wrote a generator to give me different batches, but I couldn't get multiprocessing to work with it.
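For reference, the generator I wrote looks roughly like this (simplified, with an arbitrary batch size); the idea was to call pool.map on one batch at a time, but I could not get that wired up:

def batch_images(image_list, batch_size=500):
    '''yields successive slices of the image list as batches'''
    for start in range(0, len(image_list), batch_size):
        yield image_list[start:start + batch_size]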
This compares the time taken to format the images with multiprocessing versus sequential function calls:
# comparing time for image formatting using
# sequential and multiprocessing
# vonderasche
# 2/3/2019
import multiprocessing as mp
import time
import numpy as np
import cv2
import os
import sys

def my_format_images(image):
    '''converts to BW, crops, resizes and then flattens the image'''
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    height, width = image.shape
    if (height < width):
        x_start = int((width - height) / 2)
        x_end = height + x_start
        image = image[0:height, x_start:x_end]
    elif (width < height):
        y_start = int((height - width) / 2)
        y_end = width + y_start
        image = image[y_start:y_end, 0:width]
    image = cv2.resize(image, (100, 100))
    image = image.flatten()
    return image

def load_images(path):
    '''loads images from a provided path'''
    print('loading images')
    image_list = []
    for root, dirs, files in os.walk(path):
        for file in files:
            if file.endswith(".jpg"):
                img = cv2.imread(os.path.join(root, file))
                image_list.append(img)
    return image_list

def main():
    path = 'images'
    images = load_images(path)
    print('total images loaded: ' + str(len(images)))

    # multiprocessing function call
    start_mp_timer = time.time()
    pool = mp.Pool(4)
    result = pool.map(my_format_images, images)
    end_mp_timer = time.time() - start_mp_timer

    # sequential function call
    sum_of_single_thread = []
    start_timer = time.time()
    for i in images:
        num = my_format_images(i)
        sum_of_single_thread.append(num)
    end_timer = time.time() - start_timer

    print('multiprocessing time: ' + ' {: 05.5f}'.format(end_mp_timer) + ' sequential time: ' + ' {: 05.5f}'.format(end_timer))


if __name__ == "__main__":
    main()
Here is the simple version that squares a series of numbers, to confirm that multiprocessing works:
# multiprocessing - test using numbers
# vonderasche
# 2/3/2019
import multiprocessing as mp
import time
import os

def square(x):
    '''prints the current process id and returns x raised to the power x'''
    print(os.getpid())
    return x**x

def main():
    data = [4784, 2454, 34545, 54545,
            34545, 24545, 1454, 454542, 52221, 11242, 88478, 447511]

    # multiprocessing function call
    pool = mp.Pool(4)
    start_mp_timer = time.time()
    result = pool.map(square, data)
    end_mp_timer = time.time() - start_mp_timer

    # sequential function call
    sum_of_single_thread = []
    start_timer = time.time()
    for i in data:
        num = square(i)
        sum_of_single_thread.append(num)
    end_timer = time.time() - start_timer

    print('multiprocessing time: ' + '{:05.5f}'.format(end_mp_timer))
    print('sequential time: ' + '{:05.5f}'.format(end_timer))


if __name__ == "__main__":
    main()