0

I attempted the Kaggle MNIST challenge and used PyTorch's custom Dataset class to load the CSV files. Whenever I train the network using a DataLoader with num_workers set to more than 0, I get a BrokenPipeError. I followed many tutorials online and even put my code under an if __name__ == "__main__": guard, but nothing fixes this error. With num_workers=0 there is no error; instead I get a UserWarning about Named Tensors.
Below is the code

import torch.nn as nn 
import torch 
import torch.nn.functional as F
import torch.optim as optim 
import pandas as pd
import numpy as np 
import torchvision.transforms as transforms

class mnistdataset(torch.utils.data.Dataset):
    """Map-style dataset over an MNIST CSV file.

    The CSV must contain a ``label`` column plus 784 pixel columns, one
    flattened 28x28 image per row. The label column may appear at any
    position; every other column is treated as pixel data, in file order.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``.
        transform: Optional callable applied to each ``(28, 28)`` uint8
            image before it is returned.

    Raises:
        KeyError: If the CSV has no ``label`` column.
        ValueError: If the pixel columns cannot be reshaped to 28x28 images.
    """

    def __init__(self, file, transform=None):
        # ``self.file`` is kept so existing code reading the raw frame still works.
        self.file = pd.read_csv(file)
        self.labels = self.file["label"].values
        self.transform = transform
        # Select the pixels by dropping ``label`` by name (not by assuming it
        # is column 0) and decode every image once up front, so __getitem__
        # is a cheap array lookup instead of a pandas row access per sample.
        self.images = (
            self.file.drop(columns="label")
            .to_numpy(dtype="uint8")
            .reshape(-1, 28, 28)
        )

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the sample at position ``idx``."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Copy so callers/transforms can never mutate the cached pixel data.
        im = self.images[idx].copy()
        if self.transform:
            im = self.transform(im)
        return im, self.labels[idx]

class Net(nn.Module):
    """Convolutional classifier mapping ``(N, 1, 28, 28)`` images to 10 logits.

    Layout: two 5x5 conv blocks -> max-pool -> three 3x3 conv blocks ->
    max-pool -> one 3x3 conv block -> three fully connected layers. Every
    convolution is followed by ReLU and then batch normalisation.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 64, 5, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(64)
        self.conv2 = nn.Conv2d(64, 64, 5, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.m1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.d1 = nn.Dropout2d(0.25)
        self.conv3 = nn.Conv2d(64, 64, 3, stride=1, padding=1)
        self.bn3 = nn.BatchNorm2d(64)
        self.conv4 = nn.Conv2d(64, 64, 3, stride=1, padding=1)
        self.bn4 = nn.BatchNorm2d(64)
        self.conv5 = nn.Conv2d(64, 64, 3, stride=1, padding=1)
        self.bn5 = nn.BatchNorm2d(64)
        self.m2 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.d2 = nn.Dropout2d(0.25)
        self.conv6 = nn.Conv2d(64, 128, 3, stride=1, padding=1)
        self.bn6 = nn.BatchNorm2d(128)
        self.d3 = nn.Dropout2d(0.25)
        # 28 -> 26 -> 24 (5x5 convs, padding 1) -> 12 (pool) -> 6 (pool);
        # 128 channels * 6 * 6 = 4608 flattened features.
        self.lin1 = nn.Linear(4608, 400)
        self.d4 = nn.Dropout(0.4)
        self.lin2 = nn.Linear(400, 28)
        self.d5 = nn.Dropout(0.2)
        self.lin3 = nn.Linear(28, 10)

    def forward(self, x):
        """Return unnormalised class scores of shape ``(N, 10)``."""
        x = self.bn1(F.relu(self.conv1(x)))
        x = self.bn2(F.relu(self.conv2(x)))
        x = self.d1(self.m1(x))

        x = self.bn3(F.relu(self.conv3(x)))
        x = self.bn4(F.relu(self.conv4(x)))
        x = self.bn5(F.relu(self.conv5(x)))
        x = self.d2(self.m2(x))

        x = self.bn6(F.relu(self.conv6(x)))
        x = self.d3(x)

        x = x.view(x.size(0), -1)

        # The fully connected head uses its own element-wise dropouts
        # (d4/d5); the channel-wise Dropout2d modules d1/d2 belong to the
        # convolutional stages only.
        x = self.d4(F.relu(self.lin1(x)))
        x = self.d5(F.relu(self.lin2(x)))
        return self.lin3(x)

def get_dataloaders(csv_path="train.csv", batch_size=20, num_workers=2,
                    pin_memory=False):
    """Build the shuffled training ``DataLoader`` for the MNIST CSV.

    Args:
        csv_path: CSV file handed to ``mnistdataset``.
        batch_size: Number of samples per batch.
        num_workers: Number of loader worker processes; ``0`` loads in the
            main process. NOTE(review): worker start-up has been reported to
            fail with BrokenPipeError when this code runs inside a Jupyter
            notebook on Windows, while ``num_workers=0`` or running it as a
            plain .py script works.
        pin_memory: Forwarded to ``DataLoader``; enable it when training on
            a GPU so ``non_blocking=True`` host-to-device copies can
            actually overlap with compute.

    Returns:
        A ``torch.utils.data.DataLoader`` yielding ``(image, label)`` batches.
    """
    train_transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.ToTensor(),
        # Per-channel statistics must be sequences: ``(0.1307)`` is just a
        # float, the trailing comma makes it a 1-tuple.
        transforms.Normalize((0.1307,), (0.3081,)),
    ])
    train = mnistdataset(csv_path, transform=train_transform)
    return torch.utils.data.DataLoader(
        train,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        pin_memory=pin_memory,
    )


def train_network(train_loader, epochs=2, lr=0.01, momentum=0.5, device=None):
    """Train a fresh ``Net`` with SGD and cross-entropy loss.

    Args:
        train_loader: Iterable of ``(image, label)`` batches; must expose
            ``__len__`` and a ``dataset`` attribute for progress reporting.
        epochs: Number of passes over ``train_loader``.
        lr: SGD learning rate.
        momentum: SGD momentum.
        device: Device to train on. Defaults to CUDA when it is available
            and falls back to the CPU otherwise.

    Returns:
        The trained ``Net`` instance, left in training mode.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    net = Net().to(device)
    opt = optim.SGD(net.parameters(), lr=lr, momentum=momentum)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        net.train()
        # Running sample count, so the progress line stays correct even
        # when the final batch is smaller than the others.
        seen = 0
        for batch_id, (im, target) in enumerate(train_loader):
            im = im.to(device, non_blocking=True)
            target = target.to(device, non_blocking=True).long()

            opt.zero_grad()
            pred = net(im)
            batch_loss = criterion(pred, target)
            batch_loss.backward()
            opt.step()

            seen += len(im)
            if (batch_id + 1) % 100 == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, seen, len(train_loader.dataset),
                    100. * (batch_id + 1) / len(train_loader),
                    batch_loss.item()))

    # Hand the model back; otherwise the training result is lost on return.
    return net

# Script entry point: build the training loader and run training only when
# this file is executed directly. Worker processes started with the "spawn"
# method re-import the main module, so this work must not run at import time.
if __name__ == '__main__':
    
    train_loader = get_dataloaders()
    train_network(train_loader)

The error I am getting is

---------------------------------------------------------------------------
BrokenPipeError                           Traceback (most recent call last)
<ipython-input-8-5af6b8b22e93> in <module>
      2 
      3     train_loader = get_dataloaders()
----> 4     train_network(train_loader)

<ipython-input-4-24f1b1c4c822> in train_network(train_loader)
      8 
      9         net.train()
---> 10         for batch_id, (im, target) in enumerate(train_loader):
     11 
     12             im = im.to('cuda', non_blocking=True)

~\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\utils\data\dataloader.py in __iter__(self)
    357             return self._iterator
    358         else:
--> 359             return self._get_iterator()
    360 
    361     @property

~\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\utils\data\dataloader.py in _get_iterator(self)
    303         else:
    304             self.check_worker_number_rationality()
--> 305             return _MultiProcessingDataLoaderIter(self)
    306 
    307     @property

~\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\utils\data\dataloader.py in __init__(self, loader)
    916             #     before it starts, and __del__ tries to join but will get:
    917             #     AssertionError: can only join a started process.
--> 918             w.start()
    919             self._index_queues.append(index_queue)
    920             self._workers.append(w)

~\AppData\Local\Programs\Python\Python36\lib\multiprocessing\process.py in start(self)
    103                'daemonic processes are not allowed to have children'
    104         _cleanup()
--> 105         self._popen = self._Popen(self)
    106         self._sentinel = self._popen.sentinel
    107         # Avoid a refcycle if the target function holds an indirect

~\AppData\Local\Programs\Python\Python36\lib\multiprocessing\context.py in _Popen(process_obj)
    221     @staticmethod
    222     def _Popen(process_obj):
--> 223         return _default_context.get_context().Process._Popen(process_obj)
    224 
    225 class DefaultContext(BaseContext):

~\AppData\Local\Programs\Python\Python36\lib\multiprocessing\context.py in _Popen(process_obj)
    320         def _Popen(process_obj):
    321             from .popen_spawn_win32 import Popen
--> 322             return Popen(process_obj)
    323 
    324     class SpawnContext(BaseContext):

~\AppData\Local\Programs\Python\Python36\lib\multiprocessing\popen_spawn_win32.py in __init__(self, process_obj)
     63             try:
     64                 reduction.dump(prep_data, to_child)
---> 65                 reduction.dump(process_obj, to_child)
     66             finally:
     67                 set_spawning_popen(None)

~\AppData\Local\Programs\Python\Python36\lib\multiprocessing\reduction.py in dump(obj, file, protocol)
     58 def dump(obj, file, protocol=None):
     59     '''Replacement for pickle.dump() using ForkingPickler.'''
---> 60     ForkingPickler(file, protocol).dump(obj)
     61 
     62 #

BrokenPipeError: [Errno 32] Broken pipe

The warning I am getting with num_workers set to 0 is

ipykernel_launcher:33: UserWarning: Named tensors and all their associated APIs are an experimental feature and subject to change. Please do not use them for anything important until they are released as stable. (Triggered internally at  ..\c10/core/TensorImpl.h:1156.)

The model still trains with num_workers set to 0.

My Environment Details:
Windows 10 Home Edition, PyTorch for CUDA 11.2 (installed with pip, not conda), Python 3.6.7 for Windows, GTX 1050 Ti GPU, Intel i5 9th Gen

Edit: The code works when I run it as a Python file, but it fails when I run it in a Jupyter notebook.

  • You might need to see this https://stackoverflow.com/a/68805662/16310106 –  Aug 16 '21 at 17:48

0 Answers0