I'm having a hard time understanding why my dictionary keeps only the last values after a for loop that scrapes multiple pages of a website, all of which have the same structure.
import requests
from bs4 import BeautifulSoup
import pandas as pd

pages = ['https://example.com/page1.html',
         'https://example.com/page2.html']
final_dict = {}

for i in pages:
    url = i
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')

    # first table: collect label texts and data texts, then zip them into a dict
    first_table = soup.select_one("table:nth-of-type(1)")
    labels = first_table.findAll('td', class_='label')
    label_filter = []
    for label in labels:
        label_filter.extend(label.findAll('span', class_='txt'))
    label_filter_txt = []
    for i in label_filter:
        label_filter_txt.append(i.text)
    label_data = []
    datapoints = first_table.findAll('td', class_='data')
    for i in datapoints:
        label_data.extend(i.findAll('span', class_='txt'))
    label_data_txt = []
    for i in label_data:
        label_data_txt.append(i.text)
    first_table_dict = dict(zip(label_filter_txt, label_data_txt))

    # second table: same approach
    second_table = soup.select_one("table:nth-of-type(2)")
    labels = second_table.findAll('td', class_='label')
    label_filter = []
    for label in labels:
        label_filter.extend(label.findAll('span', class_='txt'))
    label_filter_txt = []
    for i in label_filter:
        label_filter_txt.append(i.text)
    datapoints = second_table.findAll('td', class_='data')
    label_data = []
    for i in datapoints:
        label_data.extend(i.findAll('span', class_='txt'))
    label_data_txt = []
    for i in label_data:
        label_data_txt.append(i.text)
    second_table_dict = dict(zip(label_filter_txt, label_data_txt))

    # merge both tables' results into the overall dict
    final_dict.update(first_table_dict)
    final_dict.update(second_table_dict)

df = pd.DataFrame([final_dict])
With this code, my dataframe contains only the data from the last URL. I seem to be overwriting the dict on each pass of the loop, but I can't see why.
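To illustrate what I mean, here is a stripped-down sketch (with made-up labels, and assuming both pages really do produce the same label texts as keys) of the behaviour I think I'm seeing: dict.update() keeps the keys but replaces the values each time.

    # hypothetical labels/values, just to reproduce the symptom
    final_dict = {}
    for page_values in (["a1", "b1"], ["a2", "b2"]):   # pretend: page1, then page2
        page_dict = dict(zip(["Label A", "Label B"], page_values))
        final_dict.update(page_dict)                   # same keys -> earlier values replaced

    print(final_dict)  # {'Label A': 'a2', 'Label B': 'b2'} -- only the last page survives

Is that what is happening with my real data, and if so, what is the usual way to keep one row per page?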