PyTorch DataLoader hangs when num_workers > 0. The code hangs with only about 500 MB of GPU memory in use.
System info: NVIDIA-SMI 418.56 Driver Version: 418.56 CUDA Version: 10.1
The same issue appears with both PyTorch 1.5 and 1.6; the code runs inside Anaconda environments.
Note that this error only appears when I run my script in the terminal as python main.py. When I debug the same code in PyCharm or VS Code, or when I run the same code (in the terminal) on other machines, everything works fine.
Any idea what might cause this?
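For context, the loaders are built along these lines (a simplified sketch with a placeholder dataset, not my actual ADVENT configuration; in my real code the failing call is around train_UDA.py line 114):

import torch
from torch.utils.data import DataLoader, Dataset

class DummySegDataset(Dataset):
    # Placeholder for my real dataset, which loads images and segmentation labels.
    def __len__(self):
        return 1000

    def __getitem__(self, idx):
        # Stand-in for image loading and augmentation.
        return torch.randn(3, 512, 512), torch.zeros(512, 512, dtype=torch.long)

# Hangs with num_workers > 0; with num_workers=0 the same loop runs fine.
source_loader = DataLoader(DummySegDataset(), batch_size=4, shuffle=True,
                           num_workers=4, pin_memory=True)

trainloader_iter = iter(source_loader)
images_source, labels = next(trainloader_iter)  # the hang happens inside this call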
Here is the traceback when I Ctrl+C the process in the terminal:
File "train.py", line 226, in main
train_domain_adaptation(model, source_loader, target_loader, val_loader,
File "/home/zhangyu/codes/person_seg/IntraDA/ADVENT/advent/domain_adaptation/train_UDA.py", line 326, in train_domain_adaptation
train_advent(model, trainloader, targetloader, val_loader, cfg, group=group, fk_loader=fk_loader)
File "/home/zhangyu/codes/person_seg/IntraDA/ADVENT/advent/domain_adaptation/train_UDA.py", line 114, in train_advent
_, (images_source, labels, src_names, voc_ids, _) = trainloader_iter.__next__()
File "/home/zhangyu/anaconda3/envs/pt16/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 363, in __next__
data = self._next_data()
File "/home/zhangyu/anaconda3/envs/pt16/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 974, in _next_data
idx, data = self._get_data()
File "/home/zhangyu/anaconda3/envs/pt16/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 931, in _get_data
success, data = self._try_get_data()
File "/home/zhangyu/anaconda3/envs/pt16/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 779, in _try_get_data
data = self._data_queue.get(timeout=timeout)
File "/home/zhangyu/anaconda3/envs/pt16/lib/python3.8/queue.py", line 179, in get
self.not_empty.wait(remaining)
File "/home/zhangyu/anaconda3/envs/pt16/lib/python3.8/threading.py", line 306, in wait
gotit = waiter.acquire(True, timeout)
KeyboardInterrupt
Exception in thread Thread-2:
Traceback (most recent call last):
File "/home/zhangyu/anaconda3/envs/pt16/lib/python3.8/threading.py", line 932, in _bootstrap_inner
self.run()
File "/home/zhangyu/anaconda3/envs/pt16/lib/python3.8/threading.py", line 870, in run
self._target(*self._args, **self._kwargs)
File "/home/zhangyu/anaconda3/envs/pt16/lib/python3.8/site-packages/torch/utils/data/_utils/pin_memory.py", line 25, in _pin_memory_loop
r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/home/zhangyu/anaconda3/envs/pt16/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/home/zhangyu/anaconda3/envs/pt16/lib/python3.8/site-packages/torch/multiprocessing/reductions.py", line 282, in rebuild_storage_fd
fd = df.detach()
File "/home/zhangyu/anaconda3/envs/pt16/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/home/zhangyu/anaconda3/envs/pt16/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/home/zhangyu/anaconda3/envs/pt16/lib/python3.8/multiprocessing/connection.py", line 508, in Client
answer_challenge(c, authkey)
File "/home/zhangyu/anaconda3/envs/pt16/lib/python3.8/multiprocessing/connection.py", line 752, in answer_challenge
message = connection.recv_bytes(256) # reject large message
File "/home/zhangyu/anaconda3/envs/pt16/lib/python3.8/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/home/zhangyu/anaconda3/envs/pt16/lib/python3.8/multiprocessing/connection.py", line 414, in _recv_bytes
buf = self._recv(4)
File "/home/zhangyu/anaconda3/envs/pt16/lib/python3.8/multiprocessing/connection.py", line 379, in _recv
chunk = read(handle, remaining)
ConnectionResetError: [Errno 104] Connection reset by peer
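The rebuild_storage_fd and resource_sharer frames in the pin_memory thread suggest the hand-off of worker tensors to the main process (file-descriptor sharing on Linux) is where things stall. One thing I plan to try based on that, untested on the affected machine so far, is switching the sharing strategy:

import torch.multiprocessing as mp

print(mp.get_sharing_strategy())        # 'file_descriptor' is the default on Linux
mp.set_sharing_strategy('file_system')  # share tensors via files in shared memory
                                        # instead of passing file descriptors

With 'file_system', PyTorch shares tensor storage through files in shared memory rather than passing file descriptors between processes, which sidesteps the resource_sharer path shown in the traceback.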