0

I am trying to generate synthetic data with SDV's HMASynthesizer, but the result is not what I need: I have to add custom logic for two relationships, mentor_id - user_id and mentee_id - user_id.

This is what I need:

If a "user_id" in the "users" table has the role "mentor", it should appear in the "mentor_id" column of the "sessions" table. The same applies to "mentee" and the "mentee_id" column.

This is the full code of the SDV model:

# Table name -> table data. The keys are reused as the table names that get
# registered in the metadata below.
database_data = dict(
    domain=domain,
    region=region,
    sessions=sessions,
    users=users,
)

database_metadata = MultiTableMetadata()

# Register every table and let the metadata detect its columns from the data.
for table_name, table_data in database_data.items():
    database_metadata.detect_table_from_dataframe(
        table_name=table_name,
        data=table_data,
    )
# sessions________________________

# ID columns of `sessions` and the format of their values.
# The three foreign keys use the format of the primary key they are linked
# to (`users.user_id` is '[0-9]{4}', `domain.id` is '[0-9]{2}'); a
# letters-only format could never match any parent key value.
sessions_id_formats = {
    'session_id': '[0-9]{5}',
    'mentor_id': '[0-9]{4}',
    'mentee_id': '[0-9]{4}',
    'mentor_domain_id': '[0-9]{2}',
}

for column_name, regex_format in sessions_id_formats.items():
    database_metadata.update_column(
        table_name='sessions',
        column_name=column_name,
        sdtype='id',
        regex_format=regex_format,
    )

database_metadata.set_primary_key(
    table_name='sessions',
    column_name='session_id'
)

# users________________________

database_metadata.update_column(
    table_name='users',
    column_name='user_id',
    sdtype='id',
    regex_format='[0-9]{4}'
)

# `region_id` is linked to `region.id`, so it is declared with that key's
# two-digit format ('[0-9]{2}') rather than a letters-only pattern that no
# parent key value could satisfy.
database_metadata.update_column(
    table_name='users',
    column_name='region_id',
    sdtype='id',
    regex_format='[0-9]{2}'
)

database_metadata.set_primary_key(
    table_name='users',
    column_name='user_id'
)

# domain / region ________________________

# `domain` and `region` are configured identically: the `id` column is
# declared as a two-digit identifier and set as the table's primary key.
for keyed_table in ('domain', 'region'):
    database_metadata.update_column(
        table_name=keyed_table,
        column_name='id',
        sdtype='id',
        regex_format='[0-9]{2}',
    )
    database_metadata.set_primary_key(
        table_name=keyed_table,
        column_name='id',
    )

# add relationship

# Each entry: (parent table, parent primary key, child table, child foreign key).
# NOTE(review): `sessions` references `users` twice (mentor_id and mentee_id).
# The sampled output reported with this script shows a mentee's id in
# `mentor_id`, so these links by themselves do not tie a key to `users.role`.
relationships = [
    ('domain', 'id', 'sessions', 'mentor_domain_id'),
    ('region', 'id', 'users', 'region_id'),
    ('users', 'user_id', 'sessions', 'mentor_id'),
    ('users', 'user_id', 'sessions', 'mentee_id'),
]

for parent_table, parent_key, child_table, child_key in relationships:
    database_metadata.add_relationship(
        parent_table_name=parent_table,
        child_table_name=child_table,
        parent_primary_key=parent_key,
        child_foreign_key=child_key,
    )

# Write a diagram of the tables and their relationships to my_metadata.png.
database_metadata.visualize(
    output_filepath='my_metadata.png',
    show_relationship_labels=True,
    show_table_details=True,
)

Plot with relationships: (image not included)

# Synthesizer
from rdt.transformers.categorical import LabelEncoder

synthesizer = HMASynthesizer(database_metadata, locales=['ru_RU'])

# transformers
synthesizer.auto_assign_transformers(database_data)

# Table -> categorical column whose transformer is overridden, after the
# automatic assignment, with a LabelEncoder that adds no noise.
label_encoded_columns = {
    'domain': 'name',
    'region': 'name',
    'users': 'role',
    'sessions': 'session_status',
}

for table, column in label_encoded_columns.items():
    # A separate encoder instance is created for every column.
    synthesizer.update_transformers(
        table_name=table,
        column_name_to_transformer={column: LabelEncoder(add_noise=False)},
    )

# preprocess data, then fit the model on the preprocessed tables
processed_data = synthesizer.preprocess(database_data)
synthesizer.fit_processed_data(processed_data)

# sample the synthetic database
synthesizer.reset_sampling()
database_syntetic_data = synthesizer.sample(scale=1.01)

Check the "users" and "sessions" tables: the user with id 2566 is a mentee, but in "sessions" this id appears in the "mentor_id" column.

How can I avoid this error?

user_id reg_date role region_id
2556 2556 2022-08-13 mentee 17
session_id session_date_time mentor_id mentee_id session_status mentor_domain_id
170 170 2021-08-04 2566 1720 finished 0
1296 1296 2021-12-17 2566 431 canceled 1
4497 4497 2022-05-16 2566 1327 canceled 4
5429 5429 2022-03-11 2566 1543 canceled 5

How can I improve the quality of the categorical data? I'm using LabelEncoder(add_noise=False) to transform the categorical data.

John Doe
  • 95
  • 6

1 Answer

0

I never got HMA working... Had to move forward with a product like YData Fabric to achieve some results.