I have the following code
from scipy.spatial.distance import dice, directed_hausdorff
from sklearn.metrics import f1_score
from segments import SegmentsClient
from segments import SegmentsDataset
from datasets import load_dataset
from segments.huggingface import release2dataset
from huggingface_hub import cached_download, hf_hub_url
from transformers import TrainingArguments
import numpy as np
from transformers import Trainer
from torchvision.transforms import ColorJitter
from transformers import SegformerFeatureExtractor
#!huggingface-cli login
api_key = "..."
#etc
client = SegmentsClient(api_key)
dataset_identifier = "kasumi222/busigt"
vers = "v0.1"
release = client.get_release(dataset_identifier, vers)
ds = release2dataset(release)
ds = ds.shuffle(seed=1)
ds = ds.train_test_split(test_size=0.2)
train_ds = ds["train"]
test_ds = ds["test"]
Here I'm transforming from the SegmentAi API format to a Huggingface Dataset
.
However, I would like to extract the label for the following code to work:
# repo_id = f"datasets/{hf_dataset_identifier}"
filename = "dataset_infos.json"
#id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename)), "r"))
id2label = {"benigno":0, "maligno":1}
label2id = {0:"benigno",1:"maligno"}
num_labels = len(id2label)
feature_extractor = SegformerFeatureExtractor()
jitter = ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25, hue=0.1)
def train_transforms(example_batch):
images = [jitter(x) for x in example_batch['pixel_values']]
labels = [x for x in example_batch['label']]
inputs = feature_extractor(images, labels)
return inputs
def val_transforms(example_batch):
images = [x for x in example_batch['pixel_values']]
labels = [x for x in example_batch['label']]
inputs = feature_extractor(images, labels)
return inputs
# Set transforms
train_ds.set_transform(train_transforms)
test_ds.set_transform(val_trans)
from transformers import SegformerForSemanticSegmentation
pretrained_model_name = "nvidia/mit-b0"
model = SegformerForSemanticSegmentation.from_pretrained(
pretrained_model_name,
num_labels=num_labels,
id2label=id2label,
label2id=label2id
)
epochs = 1
lr = 0.00006
batch_size = 1
hub_model_id = "segformer-b0-finetuned-busigt"
training_args = TrainingArguments(
"segformer-b0-finetuned-busigt-outputs",
learning_rate=lr,
num_train_epochs=epochs,
per_device_train_batch_size=batch_size,
per_device_eval_batch_size=batch_size,
save_total_limit=3,
evaluation_strategy="steps",
save_strategy="steps",
save_steps=20,
eval_steps=20,
logging_steps=1,
eval_accumulation_steps=5,
load_best_model_at_end=True,
push_to_hub=True,
hub_model_id=hub_model_id,
hub_strategy="end",
)
metric = load_metric("mean_iou")
def compute_metrics(eval_pred):
with torch.no_grad():
logits, labels = eval_pred
logits_tensor = torch.from_numpy(logits)
# scale the logits to the size of the label
logits_tensor = nn.functional.interpolate(
logits_tensor,
size=labels.shape[-2:],
mode="bilinear",
align_corners=False,
).argmax(dim=1)
pred_labels = logits_tensor.detach().cpu().numpy()
metrics = metric.compute(predictions=pred_labels, references=labels,
num_labels=num_labels,
ignore_index=0,
reduce_labels=feature_extractor.reduce_labels)
for key, value in metrics.items():
if type(value) is np.ndarray:
metrics[key] = value.tolist()
return metrics
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_ds,
eval_dataset=test_ds,
compute_metrics=compute_metrics,
)
trainer.train()
Inside each sample of the dataset there is a field label.annotations
, which contains a list with only one element, that is a dictionary with the label field, named category_id
.
I suppose that the problem with the code is that it is not finding that as a label, because on the last line a warning appears
The following columns in the training set don't have a corresponding argument in `SegformerForSemanticSegmentation.forward` and have been ignored: label.annotations, image, status, uuid, name, label.segmentation_bitmap. If label.annotations, image, status, uuid, name, label.segmentation_bitmap are not expected by `SegformerForSemanticSegmentation.forward`, you can safely ignore this message.
So what can I do to correctly train the model?
Edit:
I changed the transform functions to:
def train_transforms(example_batch):
images = [jitter(x) for x in example_batch['image']]
labels = [x for x in example_batch['label.annotations'][0]["category_id"]]
inputs = feature_extractor(images, labels)
return inputs
def val_transforms(example_batch):
images = [jitter(x) for x in example_batch['image']]
labels = [x for x in example_batch['label.annotations'][0]["category_id"]]
inputs = feature_extractor(images, labels)
return inputs
Same error:
The following columns in the training set don't have a corresponding argument in `SegformerForSemanticSegmentation.forward` and have been ignored: image, name, uuid, label.annotations, status, label.segmentation_bitmap. If image, name, uuid, label.annotations, status, label.segmentation_bitmap are not expected by `SegformerForSemanticSegmentation.forward`, you can safely ignore this message.
/usr/local/lib/python3.7/dist-packages/transformers/optimization.py:310: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
FutureWarning,
***** Running training *****
Num examples = 517
Num Epochs = 1
Instantaneous batch size per device = 1
Total train batch size (w. parallel, distributed & accumulation) = 1
Gradient Accumulation steps = 1
Total optimization steps = 517
---------------------------------------------------------------------------
ZeroDivisionError Traceback (most recent call last)
<ipython-input-31-d8203853aa11> in <module>
1 import numpy as np
----> 2 trainer.train()
10 frames
/usr/local/lib/python3.7/dist-packages/datasets/formatting/formatting.py in _query_table(table, key)
79 """
80 if isinstance(key, int):
---> 81 return table.fast_slice(key % table.num_rows, 1)
82 if isinstance(key, slice):
83 key = range(*key.indices(table.num_rows))
ZeroDivisionError: integer division or modulo by zero