2

I want to scrape the prices of every hotel from a tourist site. I'm extracting the names and arrangements, but the problem is that the prices only show up after clicking on the arrangements, and I don't know how to deal with that.

The output I want to get:

{' Julius ': [('Petit Déjeuner', '216'),('Demi pension','264')]}

I'm putting my code at your disposal in case any of you can help me; thank you in advance.

#!/usr/bin/env python
# coding: utf-8
"""Scrape hotel names, arrangements and prices from tn.tunisiebooking.com.

The script drives Chrome through Selenium: it fills in the search form,
submits it, reads one record per hotel card on the results page and loads
the records into a pandas DataFrame.
"""
import json
from time import sleep

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select


# Path to the chromedriver binary. Written as a raw string because "\c" is
# not a valid escape sequence and a plain literal triggers an
# invalid-escape warning on current Python versions.
PATH = r"C:\chromedriver.exe"
driver = webdriver.Chrome(PATH)

# Values entered into the search form.
params = {
    'destination': 'El Jem',
    'date_from': '08/08/2021',
    'date_to': '09/08/2021',
    'bedroom': '1',
}

# One dict per hotel card found on the results page.
hotels_list = []

try:
    # Open the site; `wait` polls for up to 20 seconds.
    driver.get('https://tn.tunisiebooking.com/')
    wait = WebDriverWait(driver, 20)

    # Select the destination. The first lookup goes through `wait` so the
    # script does not fail if the form is not in the DOM yet.
    destination_select = Select(
        wait.until(lambda d: d.find_element(By.ID, 'ville_des'))
    )
    destination_select.select_by_value(params['destination'])

    # Select the number of bedrooms.
    bedroom_select = Select(driver.find_element(By.ID, 'select_ch'))
    bedroom_select.select_by_value(params['bedroom'])

    # The two date inputs are filled in through JavaScript instead of being
    # typed into.
    script = (
        f"document.getElementById('depart').value ='{params['date_from']}';"
        f"document.getElementById('checkin').value ='{params['date_to']}';"
    )
    driver.execute_script(script)

    # Submit the search, then pause for a fixed 10 seconds before reading
    # the results page.
    driver.find_element(By.ID, 'boutonr').click()
    sleep(10)

    # NOTE: an earlier draft also clicked the element with id 'plus_res'
    # (a "details" button) and slept another 10 seconds here; that step is
    # disabled.

    # ------------------------------------------------------------------
    # Build one record per hotel card.
    hotels_objects = driver.find_elements(
        By.XPATH, '//div[contains(@class, "enveloppe_produit")]'
    )
    for hotel_obj in hotels_objects:
        # Headline price: text of the first <div> inside "monaieprix".
        price_object = hotel_obj.find_element(
            By.XPATH, './/div[@class="monaieprix"]'
        )
        price_value = price_object.find_element(
            By.XPATH, './/div[1]'
        ).text.replace('\n', '')

        # Hotel title container; the name itself is in its <a><h3>.
        title_data = hotel_obj.find_element(
            By.XPATH, './/span[contains(@class, "tittre_hotel")]'
        )

        # Arrangement labels (the <u> elements inside the "angle" divs).
        arrangements_obj = hotel_obj.find_elements(
            By.XPATH, './/div[contains(@class, "angle")]//u'
        )
        arrangements = [ao.text for ao in arrangements_obj]

        # Text of every element whose id contains "prixtotal".
        prixM_obj = hotel_obj.find_elements(
            By.XPATH, './/div[contains(@id, "prixtotal")]'
        )
        prixM = [po.text for po in prixM_obj]

        hotels_list.append({
            'name': title_data.find_element(By.XPATH, './/a//h3').text,
            'arrangements': arrangements,
            'prixM': prixM,
            'price': price_value,
        })
finally:
    # Always close the browser and the chromedriver process, even when a
    # lookup above raises.
    driver.quit()

# ----------------------------------------------------------------------
# Tabulate the records. 'prixM' is included so the collected values are not
# silently dropped, and the preview is printed so it is visible when the
# file runs as a plain script.
df = pd.DataFrame(
    hotels_list, columns=['name', 'arrangements', 'prixM', 'price']
)
print(df.head())

demouser123
  • 4,108
  • 9
  • 50
  • 82
HiFAR
  • 48
  • 1
  • 13
  • 1
    Just a general note: I would definitely add something like `time.sleep(2)` somewhere to make sure the requests don't amount to a DoS attack on the server. Generally you need to be careful with scraping because it could go against the Terms of Use of the website that you are accessing. If you could elaborate a bit on whether the site explicitly allows this, I am sure it would invite more people to provide answers to your question. – logical x 2 Aug 06 '21 at 11:17

1 Answers1

1

It seems that the DOM keeps changing. So, based on the answers to this question and on StaleElementReferenceException, the code below might be useful for you.

"""Collect, for every hotel card, the price shown under each arrangement.

For each hotel the active arrangement and its price are read first; then
every inactive arrangement tab is clicked in turn and the price is re-read.
The result maps hotel name -> list of (arrangement, price) tuples.
"""
import sys
import time

from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.by import By

# Relative XPath of the price element inside one hotel card.
PRICE_XPATH = ".//div[contains(@id,'prixtotal_')]"

# "path" is a placeholder: replace it with the location of chromedriver.
driver = webdriver.Chrome(executable_path="path")
try:
    driver.maximize_window()
    driver.implicitly_wait(10)
    driver.get("https://tn.tunisiebooking.com/")
    # Code to choose the search options goes here.

    hoteldata = {}
    hotels = driver.find_elements(
        By.XPATH, "//div[starts-with(@id,'produit_affair')]"
    )
    for hotel in hotels:
        name = hotel.find_element(By.TAG_NAME, "h3").text

        # Arrangement that is already selected, with the price shown for it.
        active_label = hotel.find_element(By.CLASS_NAME, "angle_active").text
        active_price = hotel.find_element(
            By.XPATH, PRICE_XPATH
        ).get_attribute("innerText")
        details = [(active_label, active_price)]

        # Click each remaining arrangement and read the price again.
        inactive = hotel.find_elements(
            By.XPATH, ".//div[@class='angle_desactive']"
        )
        for item in inactive:
            try:
                label = item.get_attribute("innerText")
                item.click()
                # Fixed pause after the click before the price is re-read.
                time.sleep(2)
                price = hotel.find_element(
                    By.XPATH, PRICE_XPATH
                ).get_attribute("innerText")
            except StaleElementReferenceException:
                # Still best-effort, but report the skip instead of hiding
                # the fact that an arrangement is missing from the result.
                print(
                    f"Skipped a stale arrangement element for {name!r}",
                    file=sys.stderr,
                )
                continue
            details.append((label, price))
        hoteldata[name] = details
    print(hoteldata)
finally:
    # Close the browser even if one of the lookups above raised.
    driver.quit()
pmadhu
  • 3,373
  • 2
  • 11
  • 23