Labeling model with hugginface Dataset

Question

I have the following code

from scipy.spatial.distance import dice, directed_hausdorff
from sklearn.metrics        import f1_score
from segments import SegmentsClient
from segments import SegmentsDataset
from datasets import load_dataset
from segments.huggingface import release2dataset
from huggingface_hub import cached_download, hf_hub_url
from transformers import TrainingArguments
import numpy as np
from transformers import Trainer
from torchvision.transforms import ColorJitter
from transformers import SegformerFeatureExtractor

#!huggingface-cli login
api_key = "..."
#etc

client = SegmentsClient(api_key)
dataset_identifier = "kasumi222/busigt"
vers = "v0.1"
release = client.get_release(dataset_identifier, vers)


ds = release2dataset(release)
ds = ds.shuffle(seed=1)
ds = ds.train_test_split(test_size=0.2)
train_ds = ds["train"]
test_ds = ds["test"]

Here I'm transforming from the SegmentAi API format to a Huggingface Dataset. However, I would like to extract the label for the following code to work:

# repo_id = f"datasets/{hf_dataset_identifier}"
filename = "dataset_infos.json"
#id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename)), "r"))
id2label = {"benigno":0, "maligno":1}
label2id = {0:"benigno",1:"maligno"}
num_labels = len(id2label)


feature_extractor = SegformerFeatureExtractor()
jitter = ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25, hue=0.1) 

def train_transforms(example_batch):
    images = [jitter(x) for x in example_batch['pixel_values']]
    labels = [x for x in example_batch['label']]
    inputs = feature_extractor(images, labels)
    return inputs


def val_transforms(example_batch):
    images = [x for x in example_batch['pixel_values']]
    labels = [x for x in example_batch['label']]
    inputs = feature_extractor(images, labels)
    return inputs


# Set transforms
train_ds.set_transform(train_transforms)
test_ds.set_transform(val_trans)

from transformers import SegformerForSemanticSegmentation

pretrained_model_name = "nvidia/mit-b0" 
model = SegformerForSemanticSegmentation.from_pretrained(
    pretrained_model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)


epochs = 1
lr = 0.00006
batch_size = 1

hub_model_id = "segformer-b0-finetuned-busigt"

training_args = TrainingArguments(
    "segformer-b0-finetuned-busigt-outputs",
    learning_rate=lr,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_total_limit=3,
    evaluation_strategy="steps",
    save_strategy="steps",
    save_steps=20,
    eval_steps=20,
    logging_steps=1,
    eval_accumulation_steps=5,
    load_best_model_at_end=True,
    push_to_hub=True,
    hub_model_id=hub_model_id,
    hub_strategy="end",
)


metric = load_metric("mean_iou")

def compute_metrics(eval_pred):
  with torch.no_grad():
    logits, labels = eval_pred
    logits_tensor = torch.from_numpy(logits)
    # scale the logits to the size of the label
    logits_tensor = nn.functional.interpolate(
        logits_tensor,
        size=labels.shape[-2:],
        mode="bilinear",
        align_corners=False,
    ).argmax(dim=1)

    pred_labels = logits_tensor.detach().cpu().numpy()
    metrics = metric.compute(predictions=pred_labels, references=labels, 
                                   num_labels=num_labels, 
                                   ignore_index=0,
                                   reduce_labels=feature_extractor.reduce_labels)
    for key, value in metrics.items():
      if type(value) is np.ndarray:
        metrics[key] = value.tolist()
    return metrics



trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
    )

trainer.train()

Inside each sample of the dataset there is a field label.annotations, which contains a list with only one element, that is a dictionary with the label field, named category_id.

I suppose that the problem with the code is that it is not finding that as a label, because on the last line a warning appears

The following columns in the training set don't have a corresponding argument in `SegformerForSemanticSegmentation.forward` and have been ignored: label.annotations, image, status, uuid, name, label.segmentation_bitmap. If label.annotations, image, status, uuid, name, label.segmentation_bitmap are not expected by `SegformerForSemanticSegmentation.forward`,  you can safely ignore this message.

So what can I do to correctly train the model?

Edit:

I changed the transform functions to:

def train_transforms(example_batch):
    images = [jitter(x) for x in example_batch['image']]
    labels = [x for x in example_batch['label.annotations'][0]["category_id"]]
    inputs = feature_extractor(images, labels)
    return inputs


def val_transforms(example_batch):
    images = [jitter(x) for x in example_batch['image']]
    labels = [x for x in example_batch['label.annotations'][0]["category_id"]]
    inputs = feature_extractor(images, labels)
    return inputs

Same error:

The following columns in the training set don't have a corresponding argument in `SegformerForSemanticSegmentation.forward` and have been ignored: image, name, uuid, label.annotations, status, label.segmentation_bitmap. If image, name, uuid, label.annotations, status, label.segmentation_bitmap are not expected by `SegformerForSemanticSegmentation.forward`,  you can safely ignore this message.
/usr/local/lib/python3.7/dist-packages/transformers/optimization.py:310: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
  FutureWarning,
***** Running training *****
  Num examples = 517
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 517

---------------------------------------------------------------------------

ZeroDivisionError                         Traceback (most recent call last)

<ipython-input-31-d8203853aa11> in <module>
      1 import numpy as np
----> 2 trainer.train()

10 frames

/usr/local/lib/python3.7/dist-packages/datasets/formatting/formatting.py in _query_table(table, key)
     79     """
     80     if isinstance(key, int):
---> 81         return table.fast_slice(key % table.num_rows, 1)
     82     if isinstance(key, slice):
     83         key = range(*key.indices(table.num_rows))

ZeroDivisionError: integer division or modulo by zero

score 0 · Answer 1 · answered Sep 06 '22 at 09:08

The current error you are receiving is due to table.num_rows is 0, huggingface has expectations of extra attributes from the dataset itself.

My suggestion would be to download a small similar dataset of Huggingface and see what it has. But in general it would be something like this example from huggingface, or features in their documentation.

Labeling model with hugginface Dataset

1 Answers1