0

I have a sql db with columns [Record_ID](Primary Key) ,[Source] ,[Entity] (Can be null) ,[Last_Name] (Can be null) ,[Given_Name] (Can be null) ,[Aliases] (Can be null) ,[Date_Of_Birth] (Can be null). These values come from 3 different XML files. I parse these files in python and add in DB. These 3 XMLs are coming from website. If these files are updated I need to update my db with new values without duplicates. How can I do that ?

def readXML(sql_server, database, sql_user_name, sql_password, sql_driver):
   print("Start")
   url = 'somewebsite.com'
   resp = requests.get(url)
   soup = BeautifulSoup(resp.content, "xml")
   recordData = soup.findAll('record')
   now = datetime.datetime.now()
   sql_insert_in_table = """
    INSERT INTO myTable (Source, Given_Name, Last_Name, Date_Of_Birth, Aliases,Entity,Date_Added,Added_by) values (?,?,?,?,?,?,?,?)
    """
   params = []

   for child in recordData:
      firstName = child.find('GivenName').text if child.find('GivenName')!=None else "N/A"
      lastName = child.find('LastName').text if child.find('LastName')!=None else "N/A"
      DoB = child.find('DateOfBirth').text if child.find('DateOfBirth')!=None else "N/A"
      entity= child.find('Entity').text if child.find('Entity')!=None else "N/A"
      aliases = child.find('Aliases').text if  child.find('Aliases')!=None else "N/A"
      params.append((source, firstName, lastName,
                  DoB, aliases, entity, now.date(), "Admin"))
   exec_sql_query (sql_insert_in_table, params, sql_server, database, sql_user_name, sql_password, sql_driver)


def exec_sql_query(query, params, sql_server, database, sql_user_name, sql_password, sql_driver):
    try:
        with pyodbc.connect('DRIVER='+sql_driver+';SERVER=tcp:'+sql_server+';PORT=1433;DATABASE='+database+';UID='+sql_user_name+';PWD=' + sql_password) as conn:
            with conn.cursor() as cursor:
                conn.autocommit = True
            cursor.executemany(query, params)
    except pyodbc.Error as e:
        logging.error(f"SQL query failed: {e}")
        raise
heman123
  • 3,009
  • 4
  • 24
  • 37
  • 3
    One simple way to do this: Create a combined unique key with the columns you need. Then, if an INSERT fails, catch the error and ignore it. – Matthias Jul 20 '23 at 16:16
  • 1
    Additional info: [Comparison with `None` should be done with `is`](https://stackoverflow.com/questions/14247373/python-none-comparison-should-i-use-is-or). So something like `if child.find('GivenName')!=None` would be `if child.find('GivenName') is not None`. This information can also be found in the [Style Guide for Python Code](https://peps.python.org/pep-0008/#programming-recommendations). – Matthias Jul 20 '23 at 16:19
  • @Matthias Thank you for correction. Will go through the link and I will look into Combined Unique Key. – heman123 Jul 20 '23 at 16:47

1 Answers1

1

The problem you're facing is how to add new records to your database without creating duplicates. In SQL Server, you can solve this using the MERGE statement. It checks whether a record already exists. If it does, the MERGE statement can be instructed to do nothing or to update the existing record. If the record doesn't exist, it inserts a new one.

In your case, the unique identifiers for a record could be the combination of Given_Name, Last_Name, and Date_Of_Birth. But note that this might not always guarantee uniqueness.

Here's an example of your code using MERGE:

def readXML(sql_server, database, sql_user_name, sql_password, sql_driver):
    url = 'somewebsite.com'
    resp = requests.get(url)
    soup = BeautifulSoup(resp.content, "xml")
    recordData = soup.findAll('record')

    for child in recordData:
        firstName = child.find('GivenName').text if child.find('GivenName') else "N/A"
        lastName = child.find('LastName').text if child.find('LastName') else "N/A"
        DoB = child.find('DateOfBirth').text if child.find('DateOfBirth') else "N/A"
        entity= child.find('Entity').text if child.find('Entity') else "N/A"
        aliases = child.find('Aliases').text if child.find('Aliases') else "N/A"
        now = datetime.datetime.now()

        params = (source, firstName, lastName, DoB, aliases, entity, now.date(), "Admin")

        sql_merge = f"""
        MERGE INTO myTable AS Target
        USING (SELECT ? as Given_Name, ? as Last_Name, ? as Date_Of_Birth) AS Source
        ON Target.Given_Name = Source.Given_Name AND Target.Last_Name = Source.Last_Name AND Target.Date_Of_Birth = Source.Date_Of_Birth
        WHEN MATCHED THEN
            UPDATE SET Target.Aliases = ?, Target.Entity = ?
        WHEN NOT MATCHED THEN
            INSERT (Source, Given_Name, Last_Name, Date_Of_Birth, Aliases, Entity, Date_Added, Added_by)
            VALUES (?,?,?,?,?,?,?,?)
        """
        
        exec_sql_query(sql_merge, params, sql_server, database, sql_user_name, sql_password, sql_driver)

def exec_sql_query(query, params, sql_server, database, sql_user_name, sql_password, sql_driver):
    try:
        with pyodbc.connect('DRIVER='+sql_driver+';SERVER=tcp:'+sql_server+';PORT=1433;DATABASE='+database+';UID='+sql_user_name+';PWD=' + sql_password) as conn:
            with conn.cursor() as cursor:
                conn.autocommit = True
                cursor.execute(query, params)
    except pyodbc.Error as e:
        logging.error(f"SQL query failed: {e}")
        raise
suchislife
  • 4,251
  • 10
  • 47
  • 78