I have a dataframe with multiple columns. Two of those columns are table_name and column_name, and I need to check whether each column_name exists in the corresponding table_name in a SQL Server database.
I have a function, check_column_existence(), for this, which seems to work fine if I use the traditional method of reading the CSV file and calling the function:
import os

import pandas as pd
import pymssql


def check_column_existence(dbHost, dbPort, dbName, dbUser, dbPassword, table_name, column_name):
    # Parameterized query: the driver handles quoting/escaping instead of
    # interpolating the values with str.format().
    sqlQuery = """SELECT COUNT(*) FROM INFORMATION_SCHEMA.COLUMNS
                  WHERE table_name = %s AND column_name = %s;"""
    conn = pymssql.connect(host=dbHost, port=dbPort, user=dbUser,
                           password=dbPassword, database=dbName)
    try:
        cursor = conn.cursor()
        cursor.execute(sqlQuery, (table_name, column_name))
        # COUNT(*) always returns exactly one row; a non-zero count
        # means the column exists.
        result = cursor.fetchone()[0] > 0
    finally:
        conn.close()
    print(result)
    return result
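For example (with made-up connection details and a hypothetical Customers.Email column), I call it like this:

exists = check_column_existence('localhost', 1433, 'MyDb', 'my_user',
                                'my_password', 'Customers', 'Email')
# prints and returns True if Customers.Email exists, otherwise False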
def populate_db_attribute_existence(dbHost, dbPort, dbName, dbUser, dbPassword, input_csv_file_path):
    input_csv_file_path = os.path.abspath(input_csv_file_path)
    input_csv_folder_path = os.path.dirname(input_csv_file_path)
    input_csv_file_name = os.path.basename(input_csv_file_path)
    temp1_output_file_path = os.path.join(input_csv_folder_path, 'temp1_' + input_csv_file_name)
    temp2_output_file_path = os.path.join(input_csv_folder_path, 'temp2_' + input_csv_file_name)
    df = pd.read_csv(input_csv_file_path)
    df['DB_Entity'] = df['DB_Entity'].str.replace(' ', '')
    df['DB_Attributes'] = df['DB_Attributes'].str.replace(' ', '')
    df2 = df[['API_Attributes', 'DB_Entity', 'DB_Attributes']]
    df2.to_csv(temp1_output_file_path, index=False, encoding='utf-8')
    with open(temp1_output_file_path, 'r') as ftemp1, open(temp2_output_file_path, 'w') as ftemp2:
        next(ftemp1)  # skip the header row
        for row in ftemp1:
            list_columns = row.strip().split(',')
            api_attr_name = list_columns[0]
            db_entity_name = list_columns[1]
            c_name = list_columns[2]
            # INFORMATION_SCHEMA.COLUMNS keeps the schema in TABLE_SCHEMA, not
            # as a prefix on TABLE_NAME, so pass the bare table name rather
            # than 'dbo.{}'.format(db_entity_name).
            result = check_column_existence(dbHost, dbPort, dbName, dbUser,
                                            dbPassword, db_entity_name, c_name)
            ftemp2.write(','.join([api_attr_name, db_entity_name, c_name, str(result)]) + '\n')
    del df2
    df2 = pd.read_csv(temp2_output_file_path,
                      names=['API_Attributes', 'DB_Entity', 'DB_Attributes', 'DB_Attr_Exist'])
    merge_key = ['API_Attributes', 'DB_Attributes']
    merged_df = pd.merge(df, df2, on=merge_key, how='outer')
    merged_df.to_csv(input_csv_file_path, index=False, encoding='utf-8')
    os.remove(temp1_output_file_path)
    os.remove(temp2_output_file_path)
    print("Completed DB Attributes existence check: {}".format(input_csv_file_path))
I've updated this question with my current approach. Please recommend a faster way to achieve the same result, if there is one. In particular, how should I use df.apply() or a lambda function in this context, if that is possible?
I'd appreciate any help or recommendations.
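For reference, here is a rough, untested sketch of what I imagine the df.apply() version might look like. It skips the temp CSV files entirely, and instead of opening one connection per row it pulls all of INFORMATION_SCHEMA.COLUMNS once into a set via a hypothetical helper, fetch_existing_columns(). It assumes the DB_Entity and DB_Attributes columns have no missing values:

def fetch_existing_columns(dbHost, dbPort, dbName, dbUser, dbPassword):
    # One round trip: fetch every (table, column) pair in the database.
    conn = pymssql.connect(host=dbHost, port=dbPort, user=dbUser,
                           password=dbPassword, database=dbName)
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT TABLE_NAME, COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS")
        # Lower-case both parts so the lookup is case-insensitive.
        return {(t.lower(), c.lower()) for t, c in cursor}
    finally:
        conn.close()

def populate_db_attribute_existence(dbHost, dbPort, dbName, dbUser, dbPassword, input_csv_file_path):
    df = pd.read_csv(input_csv_file_path)
    df['DB_Entity'] = df['DB_Entity'].str.replace(' ', '')
    df['DB_Attributes'] = df['DB_Attributes'].str.replace(' ', '')
    existing_columns = fetch_existing_columns(dbHost, dbPort, dbName, dbUser, dbPassword)
    # axis=1 passes each row to the lambda as a Series; each row then costs
    # only a set lookup instead of a database query.
    df['DB_Attr_Exist'] = df.apply(
        lambda row: (row['DB_Entity'].lower(), row['DB_Attributes'].lower()) in existing_columns,
        axis=1)
    df.to_csv(input_csv_file_path, index=False, encoding='utf-8')
    print("Completed DB Attributes existence check: {}".format(input_csv_file_path))

Is something along these lines the right direction, or is there a more idiomatic pandas way?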