I'm trying the following Kaggle.
TL;DR: I want to classify a sequence (time-series) of measurements to 1 of K classes using LSTM.
I'm trying to overfit the model on 2 sequences:
My input is (B, N, M):
- B : batch-size = 1
- N : sequence-size = 128
- M : num-of-feature = 14 (number of measurements in each timestamp) My model is a very simple LSTM:
class LSTMClassifier(nn.Module):
def __init__(self, in_dim, hidden_dim, out_dim, num_layers):
super(LSTMClassifier, self).__init__()
self.in_dim = in_dim
self.hidden_dim = hidden_dim
self.out_dim = out_dim
self.num_layers = num_layers
self.lstm = nn.LSTM(in_dim, hidden_dim, num_layers=num_layers, batch_first=True)
self.fc = nn.Linear(hidden_dim, out_dim)
def forward(self, x):
lstm_out, (ht, ct) = self.lstm(x)
y = self.fc(ht[-1].reshape(-1, self.hidden_dim))
return y
And the train process is:
def train_lstm_model(model, data_loader, num_epochs, loss_cls, optimizer_cls, learning_rate):
start = time.time()
loss = loss_cls()
optimizer = optimizer_cls(model.parameters(), lr=learning_rate)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
for epoch in tqdm(range(num_epochs)):
hidden = (torch.zeros((1, data_loader.batch_size, model.hidden_dim), requires_grad=True).to(device),
torch.zeros(1, data_loader.batch_size, model.hidden_dim, requires_grad=True).to(device))
for i, (batch_x, batch_y) in enumerate(data_loader):
batch_x = batch_x.to(device).float()
batch_y = batch_y.to(device).long()
optimizer.zero_grad()
y_predicted, hidden = model(batch_x, hidden)
l = loss(y_predicted, batch_y)
l.backward()
optimizer.step()
# print(f'epoch {epoch+1}, batch {i+1}: loss = {l.item()} |',
# f'train accuracy: {eval_lstm_model(model, data_loader.dataset, hidden)}')
end = time.time()
print(f'Training took {end-start} seconds.')
And my setup code is:
loss_cls = nn.CrossEntropyLoss
optimizer_cls = torch.optim.SGD
hidden_dim = 100
model_lstm = LSTMClassifier(X_of.shape[-1], hidden_dim, len(np.unique(y_train)))
learning_rate = 0.01
num_epochs = 1000
train_lstm_model(model_lstm, overfit_loader, num_epochs, loss_cls, optimizer_cls, learning_rate)
The overfit_loader
is a DataLoader
which contains only 2 samples.
But the training process outputs the following error:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-87-5f725d0ecc50> in <module>
27 learning_rate = 0.001
28 num_epochs = 100
---> 29 train_lstm_model(model_lstm, overfit_loader, num_epochs, loss_cls, optimizer_cls, learning_rate)
<ipython-input-86-ba60b3627f13> in train_lstm_model(model, data_loader, num_epochs, loss_cls, optimizer_cls, learning_rate, test_loader)
20 l = loss(y_predicted, batch_y)
21
---> 22 l.backward(retain_graph=True)
23 optimizer.step()
24
/usr/local/lib64/python3.6/site-packages/torch/tensor.py in backward(self, gradient, retain_graph, create_graph)
219 retain_graph=retain_graph,
220 create_graph=create_graph)
--> 221 torch.autograd.backward(self, gradient, retain_graph, create_graph)
222
223 def register_hook(self, hook):
/usr/local/lib64/python3.6/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
130 Variable._execution_engine.run_backward(
131 tensors, grad_tensors_, retain_graph, create_graph,
--> 132 allow_unreachable=True) # allow_unreachable flag
133
134
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace
operation: [torch.cuda.FloatTensor [400, 14]] is at version 2; expected version 1 instead. Hint: the
backtrace further above shows the operation that failed to compute its gradient. The variable in question
was changed in there or anywhere later. Good luck!
EDIT: I've removed the loss printing and stop re-using the hidden
, according to @SzymonMaszke comment, and the exception gone, but there's still a problem that the loss isn't converges below 0.7
I'd like to get some help please, Thanks!