I am trying to benchmark MNIST training on the host OS against training inside a Docker container. For the performance measurements I am using Python's psutil.
Training on the host OS runs fine, but the same run inside Docker with 8 GB of shared memory hangs right at the start of training:
0%| | 0/938 [00:00<?, ?it/s]
Here is the function that uses multiprocessing and psutil to collect the measurements:
import multiprocessing as mp
import time

import psutil

def check_usage(system_info, p):
    # Sample the worker process once and append each metric to its list
    mem = p.memory_full_info()  # single call instead of one call per field
    system_info['cpu_percent'].append(p.cpu_percent())
    system_info['ram_percent'].append(p.memory_percent())
    system_info['cpu_threads'].append(p.num_threads())
    system_info['memory_rss'].append(mem.rss / 1024 / 1024)
    system_info['memory_vms'].append(mem.vms / 1024 / 1024)
    system_info['memory_shared'].append(mem.shared / 1024 / 1024)
    system_info['memory_data'].append(mem.data / 1024 / 1024)
    system_info['memory_swap'].append(mem.swap / 1024 / 1024)
    system_info['memory_uss'].append(mem.uss / 1024 / 1024)
    system_info['memory_pss'].append(mem.pss / 1024 / 1024)
    system_info['swap_memory_used'].append(mem.swap / 1024 / 1024)
def monitor(target):
    worker_process = mp.Process(target=target)
    worker_process.start()
    p = psutil.Process(worker_process.pid)
    system_info = {
        'cpu_percent': [],
        'ram_percent': [],
        'cpu_threads': [],
        'memory_rss': [],
        'memory_vms': [],
        'memory_shared': [],
        'memory_data': [],
        'memory_swap': [],
        'memory_uss': [],
        'memory_pss': [],
        'swap_memory_used': []
    }
    # poll usage of `worker_process` every `sleep_interval` seconds
    while worker_process.is_alive():
        time.sleep(sleep_interval)
        check_usage(system_info, p)
    worker_process.join()
    return system_info
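As a quick sanity check, check_usage can also be pointed at the current process from a REPL; this is just a throwaway snippet (the defaultdict is only there to avoid writing out every key):

import os
from collections import defaultdict

info = defaultdict(list)
check_usage(info, psutil.Process(os.getpid()))
print(dict(info))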
This is how I call it:
sleep_interval = 0.5
training_val = monitor(target=run_model)
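Afterwards the sampled lists in training_val get reduced to simple summary statistics; a minimal sketch of what I mean (summarize_usage is just an illustrative helper, not part of psutil):

import statistics

def summarize_usage(system_info):
    # Collapse each metric's sample list into mean and peak values
    summary = {}
    for name, samples in system_info.items():
        if samples:  # a metric may be empty if the worker exits very quickly
            summary[name] = {'mean': statistics.mean(samples), 'max': max(samples)}
    return summary

# e.g. summarize_usage(training_val)['memory_rss']['max'] gives the peak RSS in MiB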
And run_model itself:
from tqdm import tqdm

# model, criterion, optimizer, device, epochs, train_loader and test_loader
# are defined globally elsewhere
def run_model():
    # Train
    start_time = time.time()
    per_epoch_train_duration = []
    per_epoch_with_val_duration = []
    for _ in range(epochs):
        epoch_time = time.time()
        model.train()
        for data, targets in tqdm(train_loader):
            data = data.to(device=device)
            targets = targets.to(device=device)
            # forward
            output = model(data)
            loss = criterion(output, targets)
            # backward
            optimizer.zero_grad()
            loss.backward()
            # gradient descent or adam step
            optimizer.step()
        per_epoch_train_duration.append(time.time() - epoch_time)
        accuracy = check_accuracy(test_loader, model, device)
        per_epoch_with_val_duration.append(time.time() - epoch_time)
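For reference, the 938 iterations per epoch come from the 60,000 MNIST training images at a batch size of 64 (60000 / 64 ≈ 938). The loaders are standard torchvision MNIST DataLoaders roughly like the sketch below; the exact transform and num_workers value are assumptions:

from torch.utils.data import DataLoader
from torchvision import datasets, transforms

batch_size = 64  # 60000 / 64 -> 938 batches per epoch
transform = transforms.ToTensor()

train_dataset = datasets.MNIST(root='data', train=True, download=True, transform=transform)
test_dataset = datasets.MNIST(root='data', train=False, download=True, transform=transform)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2)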
I started the Docker container with this command:
docker run -it --shm-size=8gb <image> /bin/bash
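To double-check that the 8 GB shared-memory limit is actually in effect inside the container, it can be queried from Python with the standard library (shutil.disk_usage on the /dev/shm mount that --shm-size controls):

import shutil

shm = shutil.disk_usage('/dev/shm')  # the tmpfs mount sized by --shm-size
print(f"/dev/shm: {shm.total / 1024**3:.1f} GiB total, {shm.free / 1024**3:.1f} GiB free")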
I am not sure why the training hangs inside the container. What am I missing?