I attempted the Kaggle MNIST challenge and used a custom PyTorch `Dataset` to load the CSV files. Whenever I train the network using a `DataLoader` with `num_workers` set to more than 0, I get a `BrokenPipeError`. I followed many tutorials online and even put my code under an `if __name__ == "__main__":` guard, but nothing fixes this error. With `num_workers=0` I get no error; instead I get a `UserWarning` about named tensors.
Below is the code:
import torch.nn as nn
import torch
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
import numpy as np
import torchvision.transforms as transforms
class mnistdataset(torch.utils.data.Dataset):
    """Map-style dataset over a CSV of flattened 28x28 grayscale digits.

    The CSV must contain a ``label`` column; every column from position 1
    onward is treated as pixel data (784 values per row, reshaped to 28x28).
    """

    def __init__(self, file, transform=None):
        """Read the CSV once and cache the pixels as a uint8 image array.

        Args:
            file: Anything ``pandas.read_csv`` accepts (path or buffer).
            transform: Optional callable applied to each 28x28 ``uint8``
                array before it is returned from ``__getitem__``.

        Raises:
            KeyError: If the CSV has no ``label`` column.
            ValueError: If the pixel columns do not hold 784 values per row.
        """
        # The DataFrame is kept only so existing users of ``.file`` keep
        # working; item access below no longer touches it.
        self.file = pd.read_csv(file)
        self.labels = self.file["label"].values
        # Convert every row to uint8 and reshape up front, so __getitem__ is
        # a plain array index instead of a per-sample DataFrame row slice.
        pixels = self.file.iloc[:, 1:].to_numpy(dtype="uint8")
        self.images = np.ascontiguousarray(pixels).reshape(
            len(self.file), 28, 28
        )
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples) in the CSV."""
        return self.file.shape[0]

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the sample at position ``idx``.

        Args:
            idx: Integer position, or a tensor holding one.

        Returns:
            A tuple of the (optionally transformed) 28x28 image and its
            label. Without a transform the image is a view into the cached
            array, so callers should not modify it in place.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()
        im = self.images[idx]
        if self.transform is not None:
            im = self.transform(im)
        return im, self.labels[idx]
class Net(nn.Module):
    """Convolutional classifier for 1x28x28 inputs producing 10 logits.

    Layout: two 5x5 conv layers, max-pool, three 3x3 conv layers, max-pool,
    one 3x3 conv layer, then three fully connected layers. Each conv layer
    is followed by ReLU and then batch normalisation.
    """

    def __init__(self):
        """Create all layers; the sizes assume 28x28 single-channel input."""
        super().__init__()
        self.conv1 = nn.Conv2d(1, 64, 5, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(64)
        self.conv2 = nn.Conv2d(64, 64, 5, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.m1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.d1 = nn.Dropout2d(0.25)
        self.conv3 = nn.Conv2d(64, 64, 3, stride=1, padding=1)
        self.bn3 = nn.BatchNorm2d(64)
        self.conv4 = nn.Conv2d(64, 64, 3, stride=1, padding=1)
        self.bn4 = nn.BatchNorm2d(64)
        self.conv5 = nn.Conv2d(64, 64, 3, stride=1, padding=1)
        self.bn5 = nn.BatchNorm2d(64)
        self.m2 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.d2 = nn.Dropout2d(0.25)
        self.conv6 = nn.Conv2d(64, 128, 3, stride=1, padding=1)
        self.bn6 = nn.BatchNorm2d(128)
        self.d3 = nn.Dropout2d(0.25)
        # 28 -> 26 -> 24 (5x5 convs, padding 1) -> 12 (pool) -> 6 (pool);
        # 128 channels * 6 * 6 = 4608 flattened features.
        self.lin1 = nn.Linear(4608, 400)
        self.d4 = nn.Dropout(0.4)
        self.lin2 = nn.Linear(400, 28)
        self.d5 = nn.Dropout(0.2)
        self.lin3 = nn.Linear(28, 10)

    def forward(self, x):
        """Compute class logits.

        Args:
            x: Float tensor of shape ``(batch, 1, 28, 28)``.

        Returns:
            Tensor of shape ``(batch, 10)`` with unnormalised class scores.
        """
        x = self.bn1(F.relu(self.conv1(x)))
        x = self.bn2(F.relu(self.conv2(x)))
        x = self.m1(x)
        x = self.d1(x)

        x = self.bn3(F.relu(self.conv3(x)))
        x = self.bn4(F.relu(self.conv4(x)))
        x = self.bn5(F.relu(self.conv5(x)))
        x = self.m2(x)
        x = self.d2(x)

        x = self.bn6(F.relu(self.conv6(x)))
        x = self.d3(x)

        x = x.view(x.size(0), -1)
        x = F.relu(self.lin1(x))
        # Use the element-wise dropout layers built for the dense head.
        # The previous code reused the channel-wise d1/d2 (p=0.25) here and
        # never called d4/d5.
        x = self.d4(x)
        x = F.relu(self.lin2(x))
        x = self.d5(x)
        return self.lin3(x)
def get_dataloaders(csv_path="train.csv", batch_size=20, num_workers=2,
                    pin_memory=None):
    """Build the shuffled training ``DataLoader`` for the MNIST CSV.

    Args:
        csv_path: CSV file handed to ``mnistdataset``.
        batch_size: Number of samples per batch.
        num_workers: Number of loader worker processes. Pass ``0`` to load
            in the main process; the report accompanying this code says
            training works with 0 workers where worker start-up failed.
        pin_memory: Whether batches are placed in pinned host memory.
            ``None`` (default) enables it exactly when CUDA is available,
            so ``.to('cuda', non_blocking=True)`` can copy asynchronously.

    Returns:
        A ``torch.utils.data.DataLoader`` yielding ``(images, labels)``.
    """
    train_transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.ToTensor(),
        # One-element tuples: "(0.1307)" without the comma is just a float.
        transforms.Normalize((0.1307,), (0.3081,)),
    ])
    if pin_memory is None:
        pin_memory = torch.cuda.is_available()
    train = mnistdataset(csv_path, transform=train_transform)
    return torch.utils.data.DataLoader(
        train,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        pin_memory=pin_memory,
    )
def train_network(train_loader, epochs=2, device=None):
    """Train a fresh ``Net`` with SGD and cross-entropy loss.

    Args:
        train_loader: Iterable yielding ``(images, targets)`` batches; it
            must also support ``len()`` and expose ``.dataset`` (used for
            the progress line).
        epochs: Number of passes over ``train_loader``.
        device: Device to train on. ``None`` (default) selects CUDA when
            available and otherwise falls back to the CPU.

    Returns:
        The trained ``Net`` instance, so the caller can evaluate or save it.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    net = Net().to(device)
    opt = optim.SGD(net.parameters(), lr=0.01, momentum=0.5)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        net.train()
        for batch_id, (im, target) in enumerate(train_loader):
            im = im.to(device, non_blocking=True)
            # CrossEntropyLoss expects int64 class indices.
            target = target.to(device, non_blocking=True).long()

            opt.zero_grad()
            pred = net(im)
            batch_loss = criterion(pred, target)
            batch_loss.backward()
            opt.step()

            # Report progress every 100 batches.
            if (batch_id + 1) % 100 == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, (batch_id + 1) * len(im), len(train_loader.dataset),
                    100. * (batch_id + 1) / len(train_loader),
                    batch_loss.item()))
    return net
# Script entry point: build the loader, then train. The guard keeps this
# from re-running when the module is imported rather than executed directly.
if __name__ == '__main__':
    train_loader = get_dataloaders()
    train_network(train_loader)
The error I am getting is
---------------------------------------------------------------------------
BrokenPipeError Traceback (most recent call last)
<ipython-input-8-5af6b8b22e93> in <module>
2
3 train_loader = get_dataloaders()
----> 4 train_network(train_loader)
<ipython-input-4-24f1b1c4c822> in train_network(train_loader)
8
9 net.train()
---> 10 for batch_id, (im, target) in enumerate(train_loader):
11
12 im = im.to('cuda', non_blocking=True)
~\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\utils\data\dataloader.py in __iter__(self)
357 return self._iterator
358 else:
--> 359 return self._get_iterator()
360
361 @property
~\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\utils\data\dataloader.py in _get_iterator(self)
303 else:
304 self.check_worker_number_rationality()
--> 305 return _MultiProcessingDataLoaderIter(self)
306
307 @property
~\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\utils\data\dataloader.py in __init__(self, loader)
916 # before it starts, and __del__ tries to join but will get:
917 # AssertionError: can only join a started process.
--> 918 w.start()
919 self._index_queues.append(index_queue)
920 self._workers.append(w)
~\AppData\Local\Programs\Python\Python36\lib\multiprocessing\process.py in start(self)
103 'daemonic processes are not allowed to have children'
104 _cleanup()
--> 105 self._popen = self._Popen(self)
106 self._sentinel = self._popen.sentinel
107 # Avoid a refcycle if the target function holds an indirect
~\AppData\Local\Programs\Python\Python36\lib\multiprocessing\context.py in _Popen(process_obj)
221 @staticmethod
222 def _Popen(process_obj):
--> 223 return _default_context.get_context().Process._Popen(process_obj)
224
225 class DefaultContext(BaseContext):
~\AppData\Local\Programs\Python\Python36\lib\multiprocessing\context.py in _Popen(process_obj)
320 def _Popen(process_obj):
321 from .popen_spawn_win32 import Popen
--> 322 return Popen(process_obj)
323
324 class SpawnContext(BaseContext):
~\AppData\Local\Programs\Python\Python36\lib\multiprocessing\popen_spawn_win32.py in __init__(self, process_obj)
63 try:
64 reduction.dump(prep_data, to_child)
---> 65 reduction.dump(process_obj, to_child)
66 finally:
67 set_spawning_popen(None)
~\AppData\Local\Programs\Python\Python36\lib\multiprocessing\reduction.py in dump(obj, file, protocol)
58 def dump(obj, file, protocol=None):
59 '''Replacement for pickle.dump() using ForkingPickler.'''
---> 60 ForkingPickler(file, protocol).dump(obj)
61
62 #
BrokenPipeError: [Errno 32] Broken pipe
The warning I get with `num_workers` set to 0 is:
ipykernel_launcher:33: UserWarning: Named tensors and all their associated APIs are an experimental feature and subject to change. Please do not use them for anything important until they are released as stable. (Triggered internally at ..\c10/core/TensorImpl.h:1156.)
The model still trains with num_workers set to 0.
My Environment Details:
Windows 10 Home Edition, PyTorch for CUDA 11.2 (installed with pip, not conda), Python 3.6.7 for Windows, GTX 1050 Ti GPU, Intel i5 9th Gen
Edit: The code works when I run it as a Python file, but it fails when I run it in a Jupyter notebook.