I want to implement a covariance matrix from scratch. I got an error ValueError: invalid literal for int() with base 10
,
import pandas as pd
import statistics
df = pd.read_csv('C:/Users/User/Downloads/Admission_Predict.csv')
df = df.sample(frac=1)
mean = df.mean()
cov = {}
for j in range(len(mean)):
total = 0
for k in range(len(mean)):
# Use Pearson correlation for interval variables
terms = ((df.iloc[i, j] - mean.iloc[j]) * (df.iloc[i, k] - mean.iloc[k]) for i in range(len(mean)))
covariance = sum(terms) / (len(mean) - 1)
df_num = list(map(int, df))
st_dev = statistics.pstdev(df_num)
pearson = covariance[j, k] / (st_dev(j) * st_dev(k))
# If feature k is already in dict, append Pearson. Else, apply feature k as a list object.
if k in cov:
cov[k].append(pearson)
else:
cov[k] = [pearson]
# Coerce the cov list into a dataframe
cov_df = pd.DataFrame(cov)
# rename the columns of the cov dataframe
col_dict = {}
for idx, col in enumerate(cov_df.columns):
col_dict[idx] = col
cov_df = cov_df.rename(index=col_dict)
print(cov_df)
> Traceback (most recent call last): File
> "C:\Users\User\PycharmProjects\algorithms\mod2.py", line 17, in
> <module>
> df_num = list(map(int, df)) ValueError: invalid literal for int() with base 10: 'GRE Score'
I've tried to cast the string using astype
but it raised a type error.
df_num = df.astype(int)
TypeError: can't convert type 'str' to numerator/denominator
Output: I want to emulate the output from pandas.corr()
corr = df.corr()
print(corr)
GRE Score TOEFL Score ... Research Chance of Admit GRE Score 1.000000 0.835977 ... 0.580391 0.802610 TOEFL Score 0.835977 1.000000 ... 0.489858 0.791594 University Rating 0.668976 0.695590 ... 0.447783 0.711250 SOP 0.612831 0.657981 ... 0.444029 0.675732 LOR 0.557555 0.567721 ... 0.396859 0.669889 CGPA 0.833060 0.828417 ... 0.521654 0.873289 Research 0.580391 0.489858 ... 1.000000 0.553202 Chance of Admit 0.802610 0.791594 ... 0.553202 1.000000 [8 rows x 8 columns] Process finished with exit code 0