How is it possible to calculate PCA without the covariance matrix?
The code below does it both ways:
- With covariance: I subtract the mean of every column and compute the cov() matrix before calculating the eigenvalues and eigenvectors.
- Without covariance: I compute the dot product of X_train (the raw data) with itself, without centering the columns by their means.
To compare the results, I also compute the SVD in both cases.
So why is it possible to run PCA on the raw data, without subtracting the mean and without cov()?
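For reference, the identity I think I am relying on (a sketch of my assumption, for column-centered X with n samples):

\[
X = U S V^{\top} \;\Rightarrow\; X^{\top}X = V S^{2} V^{\top},
\qquad
\operatorname{cov}(X) = \frac{X^{\top}X}{n-1} = V \, \frac{S^{2}}{n-1} \, V^{\top}
\]

so the covariance eigenvalues should be the squared singular values divided by n-1. As far as I can tell, without centering, X^T X is the scatter (Gram) matrix instead; its eigenvalues are still the squared singular values of the raw data, just not the covariance eigenvalues.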
import numpy as np
from scipy.linalg import svd
# data
X_train = np.asarray([[13.71,1.86,2.36,16.6],[12.22,1.29,1.94,19],
[13.27,4.28,2.26,20],[13.16,3.57,2.15,21],
[13.86,1.51,2.67,25]])
# with covariance
X = X_train.copy()                       # work on a copy so X_train stays unchanged
n_samples = X.shape[0]
X -= np.mean(X, axis=0)                  # center every column
U, S, VT = svd(X)                        # SVD of the centered data
cov_m = np.cov(X.T)                      # covariance matrix (columns are variables)
eigval, eigvec = np.linalg.eigh(cov_m)   # eigendecomposition of the covariance
print('with covariance')
print('S\t %s' %S)
print('S**2\t %s' %str(S**2/(n_samples-1)))
print('eigval\t %s' %np.asarray(sorted(eigval, reverse=True)))
with covariance
S [6.1900012 2.67966882 1.2864974 0.08662946]
S**2 [9.57902870e+00 1.79515624e+00 4.13768889e-01 1.87616595e-03]
eigval [9.57902870e+00 1.79515624e+00 4.13768889e-01 1.87616595e-03]
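As a self-contained sanity check of that relation (a minimal sketch on random data, not my original run; the seed and the 5x4 shape are arbitrary):

import numpy as np
from scipy.linalg import svd

rng = np.random.default_rng(0)
A = rng.normal(size=(5, 4))
A -= A.mean(axis=0)                      # center the columns
n = A.shape[0]
_, S_check, _ = svd(A)
cov_eig = np.linalg.eigvalsh(np.cov(A.T))
# squared singular values / (n-1) should match the covariance eigenvalues
print(np.allclose(np.sort(S_check**2 / (n - 1)), np.sort(cov_eig)))  # expect True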
# without covariance
U1, S1, VT1 = svd(X_train)               # SVD of the raw, uncentered data
XTX = X_train.T @ X_train                # scatter (Gram) matrix of the raw data
eigval1, eigvec1 = np.linalg.eigh(XTX)   # its eigendecomposition
print('\n without covariance')
print('S1\t %s' %S1)
print('S1**2\t %s' %str(S1**2))
print('eigval1\t %s' %np.asarray(sorted(eigval1, reverse=True)))
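And the analogous check for the uncentered case (again only a sketch on random data; loc=5.0 just forces a clearly nonzero column mean):

import numpy as np
from scipy.linalg import svd

rng = np.random.default_rng(1)
B = rng.normal(loc=5.0, size=(5, 4))     # deliberately NOT centered
_, S_raw, _ = svd(B)
gram_eig = np.linalg.eigvalsh(B.T @ B)   # eigenvalues of the scatter matrix
# the squared singular values of the raw data match the scatter-matrix eigenvalues ...
print(np.allclose(np.sort(S_raw**2), np.sort(gram_eig)))  # expect True
# ... but they are not the covariance eigenvalues unless the data is centered
cov_eig = np.linalg.eigvalsh(np.cov(B.T))
print(np.allclose(np.sort(S_raw**2 / (B.shape[0] - 1)), np.sort(cov_eig)))  # expect False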