
I'm following this PyTorch tutorial on the nn module and am trying to do things a different way. The hope is to avoid containers like nn.Sequential so that I can fit other custom functions (say, ones involving e^(b*x), cos(w*x), etc.) to data directly in the forward method, where the syntax defining the function being fit is much clearer. However, I'm getting many nan loss values. I really don't see what is drastically different between my example and the one shown in the tutorial. Is my approach okay?
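
For concreteness, here is a minimal sketch of the kind of custom forward pass I'd like to be able to write; the particular form a * e^(b*x) is just an illustration, not the model I'm actually fitting below:

import torch
from torch import nn

class ExpModel(nn.Module):
    # toy example: fit y ≈ a * exp(b * x) with two scalar parameters
    def __init__(self):
        super().__init__()
        self.a = nn.Parameter(torch.tensor(1.0))
        self.b = nn.Parameter(torch.tensor(0.1))

    def forward(self, x):
        # the function being fit is spelled out directly here
        return self.a * torch.exp(self.b * x)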

The tutorial defines the model in a manner that I think would be onerous if used for more complicated functions. In my version further down, a learning rate of 1e-6 gives nan, 1e-9 is comically slow, and the initial loss values seem far too high. For reference, the tutorial's model is:

p = torch.tensor([1, 2, 3])
xx = x.unsqueeze(-1).pow(p)

model = torch.nn.Sequential(
    torch.nn.Linear(3, 1),
    torch.nn.Flatten(0, 1)
)
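
For reference, my understanding of what that Sequential model computes, written out by hand (just to convince myself it is the same kind of cubic I am trying to build below):

import torch

x = torch.linspace(-3.14, 3.14, 5)
p = torch.tensor([1, 2, 3])
xx = x.unsqueeze(-1).pow(p)        # shape (5, 3): columns are x, x**2, x**3
tut_model = torch.nn.Sequential(
    torch.nn.Linear(3, 1),
    torch.nn.Flatten(0, 1)
)
w, b = tut_model[0].weight[0], tut_model[0].bias
manual = xx @ w + b                # w[0]*x + w[1]*x**2 + w[2]*x**3 + b
print(torch.allclose(tut_model(xx), manual))   # True

My own attempt follows.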

import numpy as np
import torch
from torch import nn
import torch.optim as optim

# torch.autograd.set_detect_anomaly(True)
# https://stackoverflow.com/questions/49433936/how-do-i-initialize-weights-in-pytorch
# https://discuss.pytorch.org/t/gradient-value-is-nan/91663/11

x = torch.linspace(-np.pi, np.pi, 2000)
y = torch.sin(x)

class Model(nn.Module):  # inherit from the base-class of NN in PyTorch
    def __init__(self):
        super(Model, self).__init__()  # run nn.Module init
        self.a = nn.Linear(1, 1, bias=True)
        self.b = nn.Linear(1, 1, bias=False)
        self.c = nn.Linear(1, 1, bias=False)

        # self.a.weight.data.fill_(1)
        # self.b.weight.data.fill_(1)
        # self.c.weight.data.fill_(1)

    def forward(self, x):  # forward pass: this part is the actual model
        return self.a(x) + self.b(x)**2 + self.c(x)**3
        # return self.a(x) + self.b(x).pow(2) + self.c(x).pow(3)
        # return torch.add(torch.add(self.a(x), self.b(x)), self.c(x))


model = Model()

use_tutorial_grad_descent = False

loss_fn = torch.nn.MSELoss(reduction='sum')
opti = optim.SGD(model.parameters(), lr=1e-6)  #
x = x.unsqueeze(-1)

for t in range(2000):
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    model.zero_grad()
    loss.backward()
    if use_tutorial_grad_descent:
        with torch.no_grad():
            for param in model.parameters():
                param -= 1e-6 * param.grad
    else:
        opti.step()

    print(loss)


# Reconstruct the fitted curve from the learned weights, for plotting against the data


def fit(x):
    o = model.a.bias.detach().numpy().flatten()
    a = model.a.weight.detach().numpy().flatten()
    b = model.b.weight.detach().numpy().flatten()
    c = model.c.weight.detach().numpy().flatten()
    return o + a*x + (b*x)**2 + (c*x)**3  # mirror forward: a(x) + b(x)**2 + c(x)**3


import matplotlib.pyplot as plt
x = x.detach().numpy().flatten()
y = y.detach().numpy()
fig, ax = plt.subplots()
ax.plot(x, y, x, fit(x))
fig.show()

EDIT: I found that changing the reduction to 'mean' and dividing the loss by a constant helped with the nan issue and allowed me to increase the learning rate.
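
I think the reason this helps is just scale: with reduction='sum' the loss (and therefore every gradient) is N times larger than with reduction='mean', where N is the number of elements, so the effective SGD step is N times bigger for the same lr, and dividing the loss by a constant shrinks the gradients by that constant. A quick sanity check of that scaling (standalone snippet, not part of the script below):

import torch

pred = torch.randn(2000, requires_grad=True)
target = torch.randn(2000)

loss_sum = torch.nn.MSELoss(reduction='sum')(pred, target)
loss_mean = torch.nn.MSELoss(reduction='mean')(pred, target)
print(loss_sum / loss_mean)   # ~2000: 'sum' is N times 'mean'

# dividing the loss by a constant scales every gradient by the same constant,
# which has the same effect as shrinking the learning rate
(loss_mean / 1e3).backward()

The updated script: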

import numpy as np
import torch
from torch import nn
import torch.optim as optim

# torch.autograd.set_detect_anomaly(True)
# https://stackoverflow.com/questions/49433936/how-do-i-initialize-weights-in-pytorch
# https://discuss.pytorch.org/t/gradient-value-is-nan/91663/11

x = torch.linspace(-np.pi, np.pi, 2000)
y = torch.sin(x)

class Model(nn.Module):  # inherit from the base-class of NN in PyTorch
    def __init__(self):
        super(Model, self).__init__()  # run nn.Module init
        self.a = nn.Linear(1, 1, bias=True)
        self.b = nn.Linear(1, 1, bias=False)
        self.c = nn.Linear(1, 1, bias=False)

        # self.a.weight.data.fill_(1)
        # self.b.weight.data.fill_(1)
        # self.c.weight.data.fill_(1)

    def forward(self, x):  # forward pass: this part is the actual model
        return self.a(x) + self.b(x)**2 + self.c(x)**3
        # return self.a(x) + self.b(x).pow(2) + self.c(x).pow(3)
        # return torch.add(torch.add(self.a(x), self.b(x)), self.c(x))


model = Model()

use_tutorial_grad_descent = False
tut_lr = 1e-8

if not use_tutorial_grad_descent:
    opti = optim.SGD(model.parameters(), lr=0.1, momentum=0.8)  #

loss_fn = torch.nn.MSELoss(reduction='mean')
x = x.unsqueeze(-1)

for t in range(200):
    y_pred = model(x)
    loss = loss_fn(y_pred, y)/1e3
    model.zero_grad()
    loss.backward()
    if use_tutorial_grad_descent:
        with torch.no_grad():
            for param in model.parameters():
                param -= tut_lr * param.grad
    else:
        opti.step()

    print(loss)


# Reconstruct the fitted curve from the learned weights, for plotting against the data


def fit(x):
    o = model.a.bias.detach().numpy().flatten()
    a = model.a.weight.detach().numpy().flatten()
    b = model.b.weight.detach().numpy().flatten()
    c = model.c.weight.detach().numpy().flatten()
    return o + a*x + (b*x)**2 + (c*x)**3  # mirror forward: a(x) + b(x)**2 + c(x)**3


import matplotlib.pyplot as plt
x = x.detach().numpy().flatten()
y = y.detach().numpy()
fig, ax = plt.subplots()
ax.plot(x, y, x, fit(x))
fig.show()