I'm applying the code of Shahin Shirazi to retrieve the target url of a redirect link using Python. I'm applying this code to multiple redirect links. However, when I run this code, in some cases Python writes the target url of the previous redirect link again. Obviously, Chrome has not updated the history file on time. I already added some sleep time.
from bs4 import BeautifulSoup as soup
import webbrowser
import sqlite3
import pandas as pd
import shutil
import time
import os
# ---------------------FILEREADING------------------------------------------
colnames = ['Column1']
Filename_Link = "reflinks.csv"
data = pd.read_csv(Filename_Link, names=colnames)
links = data.Column1.tolist()
# ---------------------Create a new .csv file that we are going to fill with the target urls------------------------------------------
output_data = "output_data.csv"
fd = open(output_data, "a",encoding='utf-8-sig')
topline = "url, target_url,"
fd.write(topline)
fd.write("\n")
# ---------------------Define the history folder of the browser ------------------------------------------
#source file is where the history of your webbroser is saved, I was using chrome, but it should be the same process if you are using different browser
source_file = 'C:\\Users\\xxx\\AppData\\Local\\Google\\Chrome\\User Data\\Default\\History'
# could not directly connect to history file as it was locked and had to make a copy of it in different location
destination_file = 'C:\\Users\\xxx\\Downloads\\History'
# ---------------------Run the code to get target urls for all redirect links ------------------------------------------
for link in links:
webbrowser.open(link)
time.sleep(30) # there is some delay to update the history file, so 30 sec wait give it enough time to make sure your last url get logged
os.system("taskkill /im chrome.exe /f")
shutil.copy(source_file,destination_file) # copying the file.
time.sleep(10) # I added some delay to copy the files
con = sqlite3.connect('C:\\Users\\xxx\\Downloads\\History') # connecting to browser history
cursor = con.execute("SELECT * FROM urls")
names = [description[0] for description in cursor.description]
urls = cursor.fetchall()
con.close()
df_history = pd.DataFrame(urls,columns=names)
last_url = df_history.loc[len(df_history)-1,'url']
print(last_url)
fd.write(link.replace(",","").replace("\n"," "))
fd.write(",")
fd.write(last_url.replace(",","").replace("\n"," "))
fd.write(",")
fd.write("\n")
fd.close()
Is there any solution to this?