import requests
from bs4 import BeautifulSoup
from lxml import etree
import csv
with open('1_colonia.csv', 'r', encoding='utf-8') as csvfile:
reader = csv.reader(csvfile, delimiter=';')
next(reader) # skip the header row
for row in reader:
url = row[0]
page = requests.get(url)
# parse the html with BeautifulSoup
soup = BeautifulSoup(page.content, 'html.parser')
# parse the HTML and print the result to the console
dom = etree.HTML(str(soup))
property = (dom.xpath('//*[@id="header"]/div/div[2]/h1'))
duration = (dom.xpath('//*[@id="header"]/div/p'))
price = (dom.xpath('//*[@id="price"]/div/div/span/span[3]'))
# save the data to a CSV file, adding the url as a column to the CSV file
with open('2_colonia.csv', 'a', newline='', encoding='utf-8') as csvfile:
writer = csv.writer(csvfile, delimiter=';')
writer.writerow([url, property[0].text, duration[0].text,price[0].text])
'1_colonia.csv' contains a list of 815 links of properties on sale. The script works until this message appears:
Traceback (most recent call last):
File "/home/flimflam/Python/colonia/2_colonia.py", line 23, in <module>
writer.writerow([url, property[0].text, duration[0].text, price[0].text])
IndexError: list index out of range
I am not sure where the problem lies. Can anyone help me out, please? Thanks,