I trying make synthetic data with SDV HMASynthesizer. But I got failing, because I need to add custom logic for relationships: mentor_id - user_id and mentee_id - user_id.
That I need.
If "user_id" in table "users" had role "mentor" it should be in column "mentor_id" (sessions). Same thing for "mentee".
This is full code of SDV model:
database_data = {
'domain': domain,
'region': region,
'sessions': sessions,
'users': users
}
database_metadata = MultiTableMetadata()
for i in database_data:
database_metadata.detect_table_from_dataframe(
table_name=i,
data=database_data[i]
)
# sessions________________________
database_metadata.update_column(
table_name='sessions',
column_name='session_id',
sdtype='id',
regex_format='[0-9]{5}'
)
database_metadata.update_column(
table_name='sessions',
column_name='mentor_id',
sdtype='id',
regex_format='[a-zA-Z]{4}'
)
database_metadata.update_column(
table_name='sessions',
column_name='mentee_id',
sdtype='id',
regex_format='[a-zA-Z]{4}'
)
database_metadata.update_column(
table_name='sessions',
column_name='mentor_domain_id',
sdtype='id',
regex_format='[a-zA-Z]{2}'
)
database_metadata.set_primary_key(
table_name='sessions',
column_name='session_id'
)
# users________________________
database_metadata.update_column(
table_name='users',
column_name='user_id',
sdtype='id',
regex_format='[0-9]{4}'
)
database_metadata.update_column(
table_name='users',
column_name='region_id',
sdtype='id',
regex_format='[a-zA-Z]{2}'
)
database_metadata.set_primary_key(
table_name='users',
column_name='user_id'
)
# domain ________________________
database_metadata.update_column(
table_name='domain',
column_name='id',
sdtype='id',
regex_format='[0-9]{2}'
)
database_metadata.set_primary_key(
table_name='domain',
column_name='id'
)
# region _______________________
database_metadata.update_column(
table_name='region',
column_name='id',
sdtype='id',
regex_format='[0-9]{2}'
)
database_metadata.set_primary_key(
table_name='region',
column_name='id'
)
# add relationship
database_metadata.add_relationship(
parent_table_name='domain',
child_table_name='sessions',
parent_primary_key='id',
child_foreign_key='mentor_domain_id'
)
database_metadata.add_relationship(
parent_table_name='region',
child_table_name='users',
parent_primary_key='id',
child_foreign_key='region_id'
)
database_metadata.add_relationship(
parent_table_name='users',
child_table_name='sessions',
parent_primary_key='user_id',
child_foreign_key='mentor_id'
)
database_metadata.add_relationship(
parent_table_name='users',
child_table_name='sessions',
parent_primary_key='user_id',
child_foreign_key='mentee_id'
)
database_metadata.visualize(
show_table_details=True,
show_relationship_labels=True,
output_filepath='my_metadata.png'
)
# Synthesizer
synthesizer = HMASynthesizer(database_metadata, locales=['ru_RU'])
# transformers
synthesizer.auto_assign_transformers(database_data)
from rdt.transformers.categorical import LabelEncoder
synthesizer.update_transformers(
table_name='domain',
column_name_to_transformer={
'name': LabelEncoder(add_noise=False)
}
)
synthesizer.update_transformers(
table_name='region',
column_name_to_transformer={
'name': LabelEncoder(add_noise=False)
}
)
synthesizer.update_transformers(
table_name='users',
column_name_to_transformer={
'role': LabelEncoder(add_noise=False)
}
)
synthesizer.update_transformers(
table_name='sessions',
column_name_to_transformer={
'session_status': LabelEncoder(add_noise=False)
}
)
# preprocess data
processed_data = synthesizer.preprocess(database_data)
# model fit
synthesizer.fit_processed_data(processed_data)
synthesizer.reset_sampling()
database_syntetic_data = synthesizer.sample(scale=1.01)
Check table "users" and "sessions" (user with id 2566 is mentee, but in sessions this id in column mentor_id).
How to avoid this error?
user_id | reg_date | role | region_id | |
---|---|---|---|---|
2556 | 2556 | 2022-08-13 | mentee | 17 |
session_id | session_date_time | mentor_id | mentee_id | session_status | mentor_domain_id | |
---|---|---|---|---|---|---|
170 | 170 | 2021-08-04 | 2566 | 1720 | finished | 0 |
1296 | 1296 | 2021-12-17 | 2566 | 431 | canceled | 1 |
4497 | 4497 | 2022-05-16 | 2566 | 1327 | canceled | 4 |
5429 | 5429 | 2022-03-11 | 2566 | 1543 | canceled | 5 |
How to improve the quality of categorical data? I'm using LabelEncoder(add_noise=False) to transform categorial data.