Fine tuning Bert for NER attempt on Mac OS

Question

I'm using a MacBook Air/OS Monterey 12.5 (There are updates available; Ventura 13.1 Python version 3.10.8 and also tried using 3.11

Pylance has pointed that all the imports I was trying to execute were not being resolved so I changed the VS Code interpreter to Python 3.10.

Anyways, here's the code:

            import pandas as pd
            import torch
            import numpy as np
            from tqdm import tqdm
            from transformers import BertTokenizerFast
            from transformers import BertForTokenClassification
            from torch.utils.data import Dataset, DataLoader

            df = pd.read_csv('ner.csv')
            labels = [i.split() for i in df['labels'].values.tolist()]

            unique_labels = set()
            for lb in labels:
                [unique_labels.add(i) for i in lb if i not in unique_labels]
            # print(unique_labels)

            labels_to_ids = {k: v for v, k in enumerate(sorted(unique_labels))}
            ids_to_labels = {v: k for v, k in enumerate(sorted(unique_labels))}
            # print(labels_to_ids)


            text = df['text'].values.tolist()
            example = text[36]
            #print(example)

            tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
            text_tokenized = tokenizer(example, padding='max_length', max_length=512, truncation=True, return_tensors='pt')
            '''
            print(text_tokenized)
            print(tokenizer.decode(text_tokenized.input_ids[0]))
            '''

            def align_label_example(tokenized_input, labels):
                word_ids = tokenized_input.word_ids()
                previous_word_idx = None
                label_ids = []
                for word_idx in word_ids:
                    if word_idx is None:
                        label_ids.append(-100)
                    elif word_idx != previous_word_idx:
                        try:
                            label_ids.append(labels_to_ids[labels[word_idx]])
                        except:
                            label_ids.append(-100)
                    else:
                        label_ids.append(labels_to_ids[labels[word_idx]] if label_all_tokens else -100)
                        previous_word_idx = word_idx
                return label_ids;
                
            label = labels[36]

            label_all_tokens = False
            new_label = align_label_example(text_tokenized, label)
            '''
            print(new_label)
            print(tokenizer.convert_ids_to_tokens(text_tokenized['input_ids'][0]))
            '''

            def align_label(texts, labels):
                tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True)
                word_ids = tokenized_inputs.word_ids()
                
                previous_word_idx = None
                label_ids = []
                for word_idx in word_ids:
                    if word_idx is None:
                        label_ids.append(-100)
                    elif word_idx != previous_word_idx:
                        try:
                            label_ids.append(labels_to_ids[labels[word_idx]])
                        except:
                            label_ids.append(-100)
                    else:
                        try:
                            label_ids.append(labels_to_ids[labels[word_idx]] if label_all_tokens else -100)
                        except:
                            label_ids.append(-100)
                    previous_word_idx = word_idx
                    
                return label_ids

            class DataSequence(torch.utils.data.Dataset):
                def __init__(self, df):
                    lb = [i.split() for i in df['labels'].values.tolist()]
                    txt = df['text'].values.tolist()
                    self.texts = [tokenizer(str(i),
                                            padding='max_length', max_length=512, truncation=True, return_tensors='pt') for i in txt]
                    self.labels = [align_label(i,j) for i,j in zip(txt, lb)]

            def __len__(self):
                return len(self.labels)
            def get_batch_labels(self, idx):
                return torch.LongTensor(self.labels[idx])

            def __getitem__(self, idx):
                batch_data = self.get_batch_data(idx)
                batch_labels = self.get_batch_labels(idx)
                return batch_data, batch_labels

            df = df[0:1000]
            df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42),
                [int(.8 * len(df)), int(.9 * len(df))])


            class BertModel(torch.nn.Module):
                def __init__(self):
                    super(BertModel, self).__init__()
                    self.bert = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(unique_labels))
                def forward(self, input_id, mask, label):
                    output = self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)
                    return output
                
            def train_loop(model, df_train, df_val):

                train_dataset = DataSequence(df_train)
                val_dataset = DataSequence(df_val)
                
                train_dataloader = DataLoader(train_dataset, num_workers=4, batch_size=BATCH_SIZE, shuffle=True)
                val_dataloader = DataLoader(val_dataset, num_workers=4, batch_size=BATCH_SIZE)
                
                use_cuda = torch.cuda.is_available()
                device = torch.device('cuda' if use_cuda else 'cpu')
                
                optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
                
                if use_cuda:
                    model = model.cuda()
                    
                    best_acc = 0
                    best_loss = 1000
                    
                    for epoch_num in range(EPOCHS):
                        total_acc_train = 0
                        total_loss_train = 0
                        model.train()
                    for train_data, train_label in tqdm(train_dataloader):
                        train_label = train_label.to(device)
                        mask = train_data['attention_mask'].squeeze(1).to(device)
                        input_id = train_data['input_ids'].squeeze(1).to(device)
                        
                        optimizer.zero_grad()
                        loss, logits = model(input_id, mask, train_label)
                        
                        for i in range(logits.shape[0]):
                            logits_clean = logits[i][train_label[i] != -100]
                            label_clean = train_label[i][train_label[i] != -100]
                            predictions = logits_clean.argmax(dim=1)
                            acc = (predictions == label_clean).float().mean()
                            total_acc_train += acc
                            total_loss_train += loss.item()

                        loss.backward()
                        optimizer.step()

                    model.eval()
                    total_acc_val = 0
                    total_loss_val = 0
                    
                    for val_data, val_label in val_dataloader:

                        val_label = val_label.to(device)
                        mask = val_data['attention_mask'].squeeze(1).to(device)
                        input_id = val_data['input_ids'].squeeze(1).to(device)

                        loss, logits = model(input_id, mask, val_label)

                        for i in range(logits.shape[0]):

                        logits_clean = logits[i][val_label[i] != -100]
                        label_clean = val_label[i][val_label[i] != -100]

                        predictions = logits_clean.argmax(dim=1)
                        acc = (predictions == label_clean).float().mean()
                        total_acc_val += acc
                        total_loss_val += loss.item()

                    val_accuracy = total_acc_val / len(df_val)
                    val_loss = total_loss_val / len(df_val)

                    print(
                        f'Epochs: {epoch_num + 1} | Loss: {total_loss_train / len(df_train): .3f} | Accuracy: {total_acc_train / len(df_train): .3f} | Val_Loss: {total_loss_val / len(df_val): .3f} | Accuracy: {total_acc_val / len(df_val): .3f}')

            LEARNING_RATE = 5e-3
            EPOCHS = 5
            BATCH_SIZE = 2

            model = BertModel()
            train_loop(model, df_train, df_val)

And the debugger says:

            Exception has occurred: RuntimeError       (note: full exception trace is shown but execution is paused at: <module>)

                    An attempt has been made to start a new process before the
                    current process has finished its bootstrapping phase.

                    This probably means that you are not using fork to start your
                    child processes and you have forgotten to use the proper idiom
                    in the main module:

                        if __name__ == '__main__':
                            freeze_support()
                            ...

                    The "freeze_support()" line can be omitted if the program
                    is not going to be frozen to produce an executable.
            File "/Users/filipedonatti/Projects/pyCodes/second_try.py", line 141, in train_loop
                for train_data, train_label in tqdm(train_dataloader):
            File "/Users/filipedonatti/Projects/pyCodes/second_try.py", line 197, in <module>
                train_loop(model, df_train, df_val)
            File "<string>", line 1, in <module> (Current frame)

By the way, Despite using Mac, I have downloaded Anaconda-Navigator, however I've been trying and executing this code on VS Code. I've downloaded numpy, torch, datasets and other libraries through Brew with the pip3 command.

I'm at a loss, I can run the code on a google collab notebook or Jupiter notebook, and I know training models and such in my humble Mac would not be advised, but I am just exercising this so I can train and use the model in a much more powerful machine. Please help me with this issue, I've been trying to find a solution for days. Peace and happy holidays.

I've tried solving the issue by writing:

 if __name__ == '__main__':
   freeze_support()

I've tried using this:

import parallelTestModule

extractor = parallelTestModule.ParallelExtractor()
extractor.runInParallel(numProcesses=2, numThreads=4)

I finally may have found an answer for this issue. Here's the link: https://stackoverflow.com/a/72926996/19258431 One of the methods mentioned should work. — Filipe Donatti, Dec 16 '22 at 13:42

score 0 · Answer 1 · answered Dec 21 '22 at 17:10

So... It turns out the correct way to solve this is to implement a function to train the loop as such:

def run():
model = BertModel()
torch.multiprocessing.freeze_support()
print('loop')
train_loop(model, df_train, df_val)

if __name__ == '__main__':
run()

Redefining that train_loop line in the end. Issue solved. For more see this link: https://github.com/pytorch/pytorch/issues/5858

Fine tuning Bert for NER attempt on Mac OS

1 Answers1