I am trying to add a loss term to regularise between two neural networks and make them as similar as possible while still performing different tasks. The closes I could find is the answers in this post: Pytorch: how to add L1 regularizer to activations?
But trying the solutions I could not get it to work. The model trains both models to a good accuracy, but ignores the regularization ( even if set to an insanely high value ), and the difference between the two only ever seems to go up. Is there something else I need to do with the additional regularization loss term to make it so that it is not ignored?
My current best attempt is shown here:
def train_combined(nets, dataset_train, dataset_test, num_epochs, alpha=0):
criterion = nn.L1Loss()
optimizers = [optim.SGD(net.parameters(), lr=0.01, momentum=0.9 ) for net in nets]
trainloader = DataLoader(dataset_train, batch_size=32, shuffle=True )
train_losses = []
test_losses = []
for epoch in range(num_epochs): # loop over the dataset multiple times
for i, data in enumerate(trainloader, 0):
# get the inputs; data is a list of [inputs, labels]
inputs, *labels = data
inputs = inputs
# get the average of the paramaters between the two networks
with t.no_grad():
params = t.stack([t.cat(tuple(t.flatten(p.data) for p in net.parameters())) for net in nets])
avg = t.sum(params, dim=0)*0.5
# keep track of loss for both models
all_losses = np.zeros( 2 )
all_reg_losses = np.zeros( 2 )
all_final_losses = np.zeros( 2 )
# forward + backward + optimize
for i, (net, optimizer, label) in enumerate(zip(nets, optimizers, labels)):
optimizer.zero_grad()
# calculate normal loss
outputs = net(inputs)
loss = criterion(outputs, label)
# calculate regularization loss loss
params = t.cat(tuple(t.flatten(p.data) for p in net.parameters()))
regularization_loss = t.sum(t.abs( params - avg ))
regularization = regularization_loss * alpha
# calculate total loss
final_loss = loss + regularization
final_loss.backward()
optimizer.step()
# keep track of losses
all_losses[i] = float( loss.item() )
all_reg_losses[i] = float( 0 if (regularization == 0) else regularization.item() )
all_final_losses[i] = float( final_loss.item() )
# keep track of performance
train_losses.append( loss )
with t.no_grad():
for i in range(2):
test_losses.append( light_eval( nets[i], data_test, index=i ) )
# log performance each epoch
for i in range(2):
print("%3d" % (epoch+1), i, ':',
f' train loss = { ("%.4f "*3) % (all_losses[i], all_reg_losses[i], all_final_losses[i]) }',
f', test_losses = { "%.4f" % test_losses[-(2-i)] }')
print('Finished Training')
models = [ Net().to(device) for i in range(2) ]
train_combined( models, dataset_train, dataset_test, 50, alpha=1e-2 )
What am I doing wrong?