Denormalization with multivariate data

Question

I would like to denormalize my predictions. When I normalize a training set that has 6 columns with sklearn's `MinMaxScaler`, I cannot call `inverse_transform()` on the predictions, because my prediction has only one column. Does anyone know how to inverse-transform it, or how to normalize the data in a better way so that it is easier to denormalize the predictions?

# API key handed to TimeSeries(key=...) inside get_df (redacted placeholder).
api_key = '.......'

# Default split fractions for data_split. data_split accepts test_pct but
# never reads it: the test part is whatever follows train + validation.
train_pct = 0.7 
val_pct = 0.1
test_pct = 0.2

# Default number of consecutive rows in each input window built by create_df.
sequence_length = 20
# Default for create_df's target_prediction parameter; create_df accepts it
# but does not read it in its body.
target_prediction = 1
# Ticker symbol of the stock being modelled.
stock = 'msft'


def get_df(stock):
  """Fetch the full daily adjusted series for ``stock`` and prepare it.

  The open/high/low/close prices are multiplied by the cumulative product of
  the inverted split coefficient (infinite values replaced by 1). Only the
  four adjusted prices plus volume and dividend amount are kept, the row
  order is reversed, and the index is replaced by ``mdates.date2num`` of the
  original index.

  Args:
    stock: ticker symbol passed to ``get_daily_adjusted``.

  Returns:
    DataFrame with columns 'Open', 'High', 'Low', 'Close', '6. volume' and
    '7. dividend amount', indexed by date numbers.
  """
  client = TimeSeries(key=api_key, output_format='pandas')
  raw, _meta_data = client.get_daily_adjusted(symbol=stock, outputsize='full')

  # Adjusting for stock splits: the factor is the same for all four price
  # columns, so it is computed once.
  split_factor = (1 / raw['8. split coefficient']).replace(np.inf, 1).cumprod()
  price_columns = (
    ('Open', '1. open'),
    ('High', '2. high'),
    ('Low', '3. low'),
    ('Close', '4. close'),
  )
  for adjusted_name, raw_name in price_columns:
    raw[adjusted_name] = split_factor * raw[raw_name]

  selected = raw[['Open', 'High', 'Low', 'Close', '6. volume', '7. dividend amount']]

  # Reverse the row order of the frame.
  reordered = selected.iloc[::-1]

  # Transform the dates in the index into numbers.
  return reordered.set_index(mdates.date2num(reordered.index))


def data_split(df, train_pct=train_pct, test_pct=test_pct, val_pct=val_pct, scaler=None):
  """Split ``df`` into train/validation/test parts and scale each of them.

  The parts are cut with ``df[lower:upper]`` using bounds derived from the
  first and last index values, so the fractions apply to the span of the
  index values rather than to the number of rows.
  NOTE(review): with the float date-number index produced by get_df these
  bounds presumably act as inclusive labels -- confirm against pandas.

  The scaler is fitted on the training part only and then applied to the
  validation and test parts.

  Args:
    df: frame with at least the columns 'Open', 'High', 'Low', 'Close',
      '6. volume' and '7. dividend amount'.
    train_pct: fraction of the index span used for training.
    test_pct: kept for backward compatibility; not used, the test part is
      everything after the train and validation parts.
    val_pct: fraction of the index span used for validation.
    scaler: optional object offering ``fit_transform`` and ``transform``.
      It is fitted in place, so a caller that passes its own instance keeps
      the fitted scaler and can later use it to undo the scaling. When
      omitted, a new ``preprocessing.StandardScaler()`` is used (and is not
      reachable by the caller afterwards).

  Returns:
    Tuple ``(train, val, test)`` of DataFrames with columns 'Open', 'High',
    'Low', 'Close', 'Volum', 'Dividend', built from the scaler output (the
    original index is not carried over).
  """
  feature_columns = ['Open', 'High', 'Low', 'Close', '6. volume', '7. dividend amount']
  scaled_columns = ['Open', 'High', 'Low', 'Close', 'Volum', 'Dividend']

  # Splitting data into training, validation and testing.
  # df.index[0] / df.index[-1] give the first and last index values without
  # requiring the frame to have at least five rows.
  start_index = df.index[0]
  n = df.index[-1] - start_index

  train_end = int(train_pct * n) + start_index - 1
  val_start = int(n * train_pct) + start_index
  val_end = int(n * (train_pct + val_pct)) + start_index - 1
  test_start = int(n * (train_pct + val_pct)) + start_index

  train_df = df[start_index:train_end]
  val_df = df[val_start:val_end]
  test_df = df[test_start:(start_index + n)]
  # NOTE (original author): one value is seen twice when the stock is msft;
  # may be a decimal error (rounding 735470.0).

  if scaler is None:
    scaler = preprocessing.StandardScaler()
  # Fit on the training part only; validation and test reuse those statistics.
  train_np = scaler.fit_transform(train_df[feature_columns])
  test_np = scaler.transform(test_df[feature_columns])
  val_np = scaler.transform(val_df[feature_columns])

  # Turn the numpy arrays into DataFrames again.
  train = pd.DataFrame(train_np, columns=scaled_columns)
  test = pd.DataFrame(test_np, columns=scaled_columns)
  val = pd.DataFrame(val_np, columns=scaled_columns)

  return train, val, test


def create_df(df, sequence_length=sequence_length, target_prediction=target_prediction):
  """Build sliding input windows and their targets from ``df``.

  For every row position ``end`` from ``sequence_length`` up to the last row,
  the input is the ``sequence_length`` rows immediately before ``end`` (all
  columns) and the target is the value in the first column at ``end``.

  Args:
    df: frame to cut into windows.
    sequence_length: number of rows per input window.
    target_prediction: accepted but not used by this function.

  Returns:
    Tuple ``(x, y)`` of numpy arrays built from the windows and the targets.
  """
  window_ends = range(sequence_length, len(df))
  windows = [df.iloc[end - sequence_length:end, :] for end in window_ends]
  targets = [df.iloc[end, 0] for end in window_ends]
  return np.array(windows), np.array(targets)

################################# CODE ITSELF START HERE #################################

df = get_df(stock)
train, val, test = data_split(df)

x_train, y_train = create_df(train)
x_val, y_val = create_df(val)
x_test, y_test = create_df(test)


### MODEL ###

model = Sequential()

model.add(LSTM(units = 50, activation = 'relu', return_sequences=True, input_shape = (x_train.shape[1], 6)))
model.add(Dropout(0.2))

model.add(LSTM(units = 60, activation = 'relu', return_sequences=True))
model.add(Dropout(0.3))

model.add(LSTM(units = 80, activation = 'relu', return_sequences=True))
model.add(Dropout(0.4))

model.add(LSTM(units = 120, activation = 'relu'))
# Dropout is applied before the output layer. Placing it after Dense(1)
# would randomly zero (and rescale) the single regression output during
# training.
model.add(Dropout(0.5))

model.add(Dense(units = 1))


model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(x_train, y_train, epochs=10, batch_size=64)

### TESTING THE MODEL ###
from sklearn import preprocessing
import matplotlib.pyplot as plt


def _denormalize_target(values, scaler, n_features, column=0):
  """Undo the scaling of one column with a scaler fitted on ``n_features`` columns.

  ``inverse_transform`` expects as many columns as the scaler was fitted on,
  so the values are placed in column ``column`` of a zero-filled
  (len(values), n_features) array, inverse-transformed, and only that column
  is returned. The other columns are placeholders and are discarded.
  """
  flat = np.asarray(values).reshape(-1)
  padded = np.zeros((len(flat), n_features))
  padded[:, column] = flat
  return scaler.inverse_transform(padded)[:, column]


# NOTE(review): data_normalizer is not defined in this snippet. It must be
# the scaler that was fitted on the six training columns; data_split fits a
# local scaler named data_normaliser and does not return it -- confirm where
# this object comes from.
n_features = x_train.shape[2]
y_pred = model.predict(x_train)
# create_df takes the target from column 0, so column 0 is denormalized.
y_pred = _denormalize_target(y_pred, data_normalizer, n_features)
y_real = _denormalize_target(y_train, data_normalizer, n_features)

### VISUALIZATION ###
plt.figure(figsize=(20,7))
plt.plot(y_real, color = 'red', label = 'Real stock price')
plt.plot(y_pred, color = 'blue', label = 'Predicted stock price')
plt.title('Stock price prediction')
plt.ylabel('Price')
plt.legend()
plt.show()

ValueError                                Traceback (most recent call last)
<ipython-input-161-16162cc08160> in <module>()
      4 
      5 y_pred = model.predict(x_train)
----> 6 y_pred = data_normalizer.inverse_transform(y_pred)
      7 
      8 ### VISUALIZATION ###

/usr/local/lib/python3.6/dist-packages/sklearn/preprocessing/_data.py in inverse_transform(self, X)
    434                         force_all_finite="allow-nan")
    435 
--> 436         X -= self.min_
    437         X /= self.scale_
    438         return X

ValueError: non-broadcastable output operand with shape (3715,1) doesn't match the broadcast shape (3715,6)

There is a [difference](https://stackoverflow.com/a/58850139/10452700) between `MinMaxScaler(feature_range = (0, 1))` and `StandardScaler()`. You used `StandardScaler()` not the other one! — Mario, Jan 20 '21 at 23:30
Oh, I forgot that I had changed part of the code to test something. But I still can't use inverse transform on the predicted result, whether I use `MinMaxScaler()` or `StandardScaler()`. — Nursk, Jan 21 '21 at 10:42
This problem is answered [here](https://stackoverflow.com/a/49885379/10452700) and [here](https://stackoverflow.com/a/38059031/10452700). Another thing: you might have forgotten to use `fit_transform()` for `test_np` and `val_np` as well, like [here](https://stackoverflow.com/q/50955155/10452700). — Mario, Jan 21 '21 at 11:17

Denormalization with multivariate data

0 Answers