My problem is : I have multiple identical databases and I want to merge them into one. But I may have duplicate entries as Primary Keys. What i'm trying to do is handle the duplicates before putting them in Mysql.
my actual code is:
df = pd.DataFrame()
duplicates = pd.DataFrame()
size=[]
lignes=[]
chunksize = 100000
for db in dbtuple: #For each databases in the tuple given as entry
engine = create_engine(URL(
drivername="mysql",
username="xxx",
password="xxx",
host="localhost",
database=db
))
conn = engine.connect()
#Get the data from the table given as entry
sql = "SELECT * FROM "+ tableName
#Execution of the query above
generator_df = pd.read_sql(sql=sql,con=conn,chunksize= chunksize)
#Init of sizechunk value
sizechunk = 0
#Because the query can be very big number of rows there's a separation
# every 100k rows so that dataframe size <= 100k
for dataframe in generator_df:
df = pd.concat([df,dataframe],ignore_index = True, axis=0,sort=False)
#We add the size of the chunk to know how many rows we have per database
sizechunk+= dataframe.shape[0]
size.append(sizechunk)
if tableName == 'table1':
duplicates = df.duplicated(subset='id')
for i in range(0,len(df)):
if duplicates[i]:
df.id[i] = numligne + '_' + df.id[i]
#same for all tables
But this is not pythonic way at all and is it very long to execute. Do you have any suggestion on how to improve the code to make it faster ?
Here is my db schema to understand better:
table1 = Table('table1', metadata,
Column('id', VARCHAR(40), primary_key=True,nullable=False),
mysql_engine='InnoDB'
)
table2= Table('table2', metadata,
Column('id', VARCHAR(40), primary_key=True,nullable=False),
Column('id_of', VARCHAR(20),ForeignKey("table1.id"), nullable=False, index= True)
)
table3= Table('table3', metadata,
Column('index',BIGINT(10), primary_key=True,nullable=False,autoincrement=True),
Column('id', VARCHAR(40),nullable=False),
Column('id_produit', VARCHAR(40),ForeignKey("table2.id"), nullable=False, index= True),
Column('id_produit_enfant', VARCHAR(40),ForeignKey("table2.id"), nullable=False, index= True)
)
table4= Table('table4', metadata,
Column('index',BIGINT(10), primary_key=True,nullable=False,autoincrement=True),
Column('id', VARCHAR(40),nullable=False),
Column('id_produit', VARCHAR(40),ForeignKey("table2.id"), nullable=False, index= True)
)
table5= Table('table5', metadata,
Column('index',BIGINT(10), primary_key=True,nullable=False,autoincrement=True),
Column('id', VARCHAR(40),nullable=False),
Column('id_produit', VARCHAR(40),ForeignKey("table2.id"), nullable=False, index= True)
)
table6= Table('table6', metadata,
Column('index',BIGINT(10), primary_key=True,nullable=False,autoincrement=True),
Column('id', VARCHAR(40),nullable=False),
Column('id_produit', VARCHAR(40),ForeignKey("table2.id"), nullable=False, index= True)
)```