I am currently using SDV and GaussianCopula (https://sdv.dev/SDV/user_guides/single_table/gaussian_copula.html) to train my models. I have a given data set which is loaded for training.
However, I get the following error message when creating the datasets:
Saving Model to path D:/.../GaussianCopula/model_MLB_1.pkl
Generating 22479 rows of synthetic data
Traceback (most recent call last):
File ".\generate_gaussian_model.py", line 47, in <module>
samples = gaussianCopula.sample(len(data.index))
File "C:\Users\...\AppData\Local\Programs\Python\Python37\lib\site-packages\sdv\tabular\base.py", line 442, in sample
return self._sample_batch(num_rows, max_retries, max_rows_multiplier)
File "C:\Users\...\AppData\Local\Programs\Python\Python37\lib\site-packages\sdv\tabular\base.py", line 300, in _sample_batch
num_rows, conditions, transformed_conditions, float_rtol)
File "C:\Users\...\AppData\Local\Programs\Python\Python37\lib\site-packages\sdv\tabular\base.py", line 228, in _sample_rows
sampled = self._sample(num_rows)
File "C:\Users\...\AppData\Local\Programs\Python\Python37\lib\site-packages\sdv\tabular\copulas.py", line 319, in _sample
return self._model.sample(num_rows, conditions=conditions)
File "C:\Users\...\AppData\Local\Programs\Python\Python37\lib\site-packages\copulas\__init__.py", line 36, in wrapper
return function(self, *args, **kwargs)
File "C:\Users\...\AppData\Local\Programs\Python\Python37\lib\site-packages\copulas\multivariate\gaussian.py", line 249, in sample
samples = self._get_normal_samples(num_rows, conditions)
File "C:\Users\...\AppData\Local\Programs\Python\Python37\lib\site-packages\copulas\multivariate\gaussian.py", line 223, in _get_normal_samples
samples = np.random.multivariate_normal(means, covariance, size=num_rows)
File "mtrand.pyx", line 4120, in numpy.random.mtrand.RandomState.multivariate_normal
File "<__array_function__ internals>", line 6, in svd
File "C:\Users\...\AppData\Local\Programs\Python\Python37\lib\site-packages\numpy\linalg\linalg.py", line 1660, in svd
u, s, vh = gufunc(a, signature=signature, extobj=extobj)
File "C:\Users\...\AppData\Local\Programs\Python\Python37\lib\site-packages\numpy\linalg\linalg.py", line 97, in _raise_linalgerror_svd_nonconvergence
raise LinAlgError("SVD did not converge")
numpy.linalg.LinAlgError: SVD did not converge
I also checked out this following thread and tried to apply their solution (which you can see below) but it didn't work.
And this is my class (generate_gaussian_model.py
) and what I've tried so far:
from sdv.tabular import GaussianCopula
import pickle
import pandas as pd
from pandas.core.indexes.base import Index
header_import_path = "C:/Users/.../headers/all_headers.txt"
all_mlb_names = ['MLB_1', 'MLB_7', 'MLB_19', 'MLB_31', 'MLB_41', 'MLB_45', 'MLB_49', 'MLB_53', 'MLB_58']
with open(header_import_path, 'rb') as fp:
all_headers = pickle.load(fp)
for mlb_file_name in all_mlb_names:
#Create separate model for each MLB Table
model_export_path = "D:/.../GaussianCopula/model_{0}.pkl".format(mlb_file_name)
synth_data_export_path = "C:/Users/.../models/generated/{0}_samples.csv".format(mlb_file_name)
data_import_path = "C:/Users/.../models/original/{0}.csv".format(mlb_file_name)
headers = all_headers[mlb_file_name]
print("Read data for table {0}".format(mlb_file_name))
data = pd.read_csv(data_import_path, sep='|', names=headers)
# This is necessary to remove invalid columns from my original dataset
for colname in data.columns:
if colname.startswith("Calculation"):
data = data.drop(axis=1, labels=[colname])
# Thought this would fix my issue but it didn't
# https://stackoverflow.com/questions/21827594/raise-linalgerrorsvd-did-not-converge-linalgerror-svd-did-not-converge-in-m
data.dropna(inplace=True)
#print("Takes a third of the dataset")
data = data.sample(frac=0.3)
print(data)
gaussianCopula = GaussianCopula()
print("Start training of GaussianCopula Model")
gaussianCopula.fit(data)
print("Saving Model to path {0}".format(model_export_path))
gaussianCopula.save(model_export_path)
print("Generating {0} rows of synthetic data".format(len(data.index)))
# Here it begins to crash
samples = gaussianCopula.sample(len(data.index))
samples.to_csv(synth_data_export_path, header=True, sep='|', index=False)
The following command would work, but these are not enough datasets for me: data = data.sample(n=1000)
Hope you guys can help me out and explain this error message to me.