-1
import bs4
import pandas as pd
import re
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time
import requests

# Open the URL in a fresh Chrome session and return the rendered page source.
def getPageBySel(url):
    """Load *url* in a new Chrome instance and return its HTML.

    A new browser is started for every call and is always shut down again,
    even when loading the page raises.

    Args:
        url: Address of the page to load.

    Returns:
        The page source reported by the driver after ``driver.get(url)``.
    """
    options = webdriver.ChromeOptions()
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--remote-debugging-port=9222")
    # NOTE(review): Chrome's switch list documents the size as "width,height";
    # confirm the "x" separator is honoured outside headless mode.
    options.add_argument("--window-size=1920x1080")
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        return driver.page_source
    finally:
        # Without this, an exception in get() would leave the browser running.
        driver.quit()


# The f-strings below are evaluated once, right here, so every URL ends in
# "offset=0"; the placeholder is not re-evaluated when `offset` changes later.
offset = 0

# Booking.com search-result URLs, one per destination.
# A list (not a set) keeps the scrape order, and therefore the row order of the
# resulting CSV, identical from run to run.
hotelsArr = [
    f'https://www.booking.com/searchresults.he.html?aid=397594&label=gog235jc-1DCAEoggI46AdIDlgDaGqIAQGYAQ64ARfIAQzYAQPoAQH4AQKIAgGoAgO4ArvKrqIGwAIB0gIkMWJjMjhhNzItNDZhNC00NDZmLTk1YzgtNjhiOWM0NmM0NDA42AIE4AIB&dest_id=-2601889&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&checkin=2024-01-04&checkout=2024-01-07&req_children=0&offset={offset}',
    f'https://www.booking.com/searchresults.he.html?ss=Manchester%2C+Greater+Manchester%2C+United+Kingdom&ssne=%D7%9E%D7%A0%D7%A6%27%D7%A1%D7%98%D7%A8&ssne_untouched=%D7%9E%D7%A0%D7%A6%27%D7%A1%D7%98%D7%A8&efdco=1&label=gog235jc-1DCAEoggI46AdIDlgDaGqIAQGYAQ64ARfIAQzYAQPoAQH4AQKIAgGoAgO4Ar3Xs6IGwAIB0gIkMWU3MTc2OTUtZDZkNi00NzFhLTk2NWYtMDczNjk5MDNhN2U52AIE4AIB&aid=397594&lang=he&sb=1&src_elem=sb&src=index&dest_id=-2602512&dest_type=city&ac_position=0&ac_click_type=b&ac_langcode=en&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=39e046de839803a2&ac_meta=GhAzOWUwNDZkZTgzOTgwM2EyIAAoATICZW46Ck1hbmNoZXN0ZXJAAEoAUAA%3D&group_adults=2&checkin=2024-01-04&checkout=2024-01-07&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset={offset}',
    f'https://www.booking.com/searchresults.he.html?aid=7961375&lang=he&sid=ff4607c90e3e0d79763672e65389c94b&sb=1&sb_lp=1&src=index&src_elem=sb&error_url=https%3A%2F%2Fwww.booking.com%2Findex.he.html%3Faid%3D7961375%26sid%3Dff4607c90e3e0d79763672e65389c94b%26sb_price_type%3Dtotal%26%26&ss=Liverpool%2C+Merseyside%2C+United+Kingdom&is_ski_area=&checkin_year=&checkin_month=&checkout_year=&checkout_month=&efdco=1&group_adults=2&group_children=0&no_rooms=1&b_h4u_keep_filters=&from_sf=1&ss_raw=Liverpool&ac_position=0&ac_langcode=en&ac_click_type=b&ac_meta=GhBhNTQ5NGFhODM2MTgwMjFkIAAoATICZW46CUxpdmVycG9vbEAASgBQAA%3D%3D&dest_id=-2601422&dest_type=city&iata=LPL&place_id_lat=53.4109&place_id_lon=-2.97811&search_pageview_id=a5494aa83618021d&search_selected=true&search_pageview_id=a5494aa83618021d&checkin=2024-01-04&checkout=2024-01-07&ac_suggestion_list_length=5&ac_suggestion_theme_list_length=0&offset={offset}',
    f'https://www.booking.com/searchresults.he.html?ss=Birmingham%2C+West+Midlands%2C+United+Kingdom&ssne=%D7%91%D7%A8%D7%9E%D7%99%D7%A0%D7%92%D7%94%D7%90%D7%9D&ssne_untouched=%D7%91%D7%A8%D7%9E%D7%99%D7%A0%D7%92%D7%94%D7%90%D7%9D&efdco=1&label=gen173nr-1BCAEoggI46AdIM1gEaGqIAQGYAQ64ARfIAQzYAQHoAQGIAgGoAgO4AsL2s6IGwAIB0gIkM2VlZjk2YjEtMDJhYi00YmExLTg1NmEtOTIxYTNhNzdhMWQ22AIF4AIB&sid=ff4607c90e3e0d79763672e65389c94b&aid=304142&lang=he&sb=1&src_elem=sb&src=index&dest_id=-2589989&dest_type=city&ac_position=0&ac_click_type=b&ac_langcode=en&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=acea4ea102160498&ac_meta=GhBhY2VhNGVhMTAyMTYwNDk4IAAoATICZW46BWJpcm1pQABKAFAA&group_adults=2&checkin=2024-01-04&checkout=2024-01-07&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset={offset}',
    f'https://www.booking.com/searchresults.he.html?ss=%D7%90%D7%93%D7%99%D7%A0%D7%91%D7%95%D7%A8%D7%95%2C+%D7%A1%D7%A7%D7%95%D7%98%D7%9C%D7%A0%D7%93%2C+%D7%91%D7%A8%D7%99%D7%98%D7%A0%D7%99%D7%94&ssne=%D7%92%D7%9C%D7%90%D7%96%D7%92%D7%95&ssne_untouched=%D7%92%D7%9C%D7%90%D7%96%D7%92%D7%95&efdco=1&label=gen173nr-1BCAEoggI46AdIM1gEaGqIAQGYAQ64ARfIAQzYAQHoAQGIAgGoAgO4AoP4s6IGwAIB0gIkZmQ0YjhjNTMtZTc2ZS00NDZkLThmMmEtNmUyZDk3YTAwZWJl2AIF4AIB&sid=ff4607c90e3e0d79763672e65389c94b&aid=304142&lang=he&sb=1&src_elem=sb&src=index&dest_id=-2595386&dest_type=city&ac_position=0&ac_click_type=b&ac_langcode=he&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=ec504f01a010003c&ac_meta=GhBlYzUwNGYwMWEwMTAwMDNjIAAoATICaGU6A2VkaUAASgBQAA%3D%3D&group_adults=2&checkin=2024-01-04&checkout=2024-01-07&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset={offset}',
    f'https://www.booking.com/searchresults.he.html?ss=%D7%99%D7%95%D7%A8%D7%A7&ssne=%D7%99%D7%95%D7%A8%D7%A7&ssne_untouched=%D7%99%D7%95%D7%A8%D7%A7&label=gen173nr-1BCAEoggI46AdIM1gEaGqIAQGYAQ64ARfIAQzYAQHoAQGIAgGoAgO4Auj6s6IGwAIB0gIkM2IwYTU3ODgtNDJiYS00ZDk0LWI0MDAtNGU3M2U3ZDlkNzM42AIF4AIB&sid=ff4607c90e3e0d79763672e65389c94b&aid=304142&lang=he&sb=1&src_elem=sb&src=index&dest_id=-2612321&dest_type=city&group_adults=2&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset={offset}',
]
# Accumulators shared by all destinations; one entry is appended per matched element.
x=0
hotels=[]
links=[]
prices=[]
for x in hotelsArr:
    offset=0
    page = x
# Paging loop: offset runs 0, 25, ..., 975 (40 iterations).
# NOTE: `page` is assigned once above and never rebuilt from `offset`, so every
# iteration requests the same URL.
    while offset < 980:


        # Reload until the parsed page contains at least one result heading.
        # There is no attempt limit: a page that never contains such a heading
        # keeps this loop running.
        while True:
            temp=getPageBySel(page)
            soup = BeautifulSoup(temp, 'html.parser')
            if len(soup("h3",{"class":"a4225678b2"}))>0:
                break
            else:
                time.sleep(1.5)

        # Prices are collected with their own selector, independently of the
        # headings below, so the two loops may append different numbers of items.
        for element in soup.select(".fcab3ed991.fbd1d3018c.e729ed5ab6"):
            # Text of the matched price element
            price = element.get_text(strip=True) if element else 'N/A'


            # Append the price to the prices list

            prices.append(price)
        # One heading per result: take its first anchor's href and its title text
        for element in soup("h3", {"class": "a4225678b2"}):
            # Find the hotel link
            link = element('a')[0]['href']

            # Find the hotel name
            name = element.select_one('.fcab3ed991.a23c043802').get_text() if element else 'N/A'



            # Append the data to the respective lists
            hotels.append(name)
            links.append(link)



        offset= offset+25 # Advance the loop counter by one page (25 results)
    
# Compute the length of the shortest of the three lists.
# NOTE: `length` is not used anywhere below, so the lists are handed to the
# DataFrame at their full (possibly different) lengths.
length = len(hotels)
if len(links) < length:
    length = len(links)
if len(prices) < length:
    length = len(prices)
# Build the DataFrame: one column per collected list
df = pd.DataFrame({
    'Hotel':hotels, 'Link':links, 'Prices':prices
})
print(df)

# Write the DataFrame (including its index column) to CSV
df.to_csv('hotels_list30.csv', index=True)


I get an error in this code: all the arrays must be the same length. How can I fix this problem? I've tried everything, including the condition at the end and putting 'NA' in the blank cells.

I've tried to solve this but it didn't work. I need to get a DataFrame with hotel, link and price columns through scraping. Maybe I've made a mistake in the scraping itself, which is why I've also included the Booking.com links.

Gilad Levy
  • 17
  • 4
  • Does this answer your question? [Python Pandas ValueError Arrays Must be All Same Length](https://stackoverflow.com/questions/40442014/python-pandas-valueerror-arrays-must-be-all-same-length) – baduker May 22 '23 at 12:15
  • No, i need to check why this problem happening in this code, maybe i've got a mistake in the scrapping proccess. it shouldn't happen i'm just extract the link price and name of specific hotel. – Gilad Levy May 22 '23 at 12:25
  • Well, you're using pandas and this is pandas specific error, so my guess is that, yes, this could answer your question. – baduker May 22 '23 at 12:30
  • But i don't want to lose data and just change the frame of the list, i need to know what is the problem there because it suppose to be in the same length – Gilad Levy May 22 '23 at 12:34
  • 1
    Then, you might find this useful - [How to Debug Small Programs](https://ericlippert.com/2014/03/05/how-to-debug-small-programs/). – baduker May 22 '23 at 12:35
  • Not so much.... i need a help with this exact code. – Gilad Levy May 22 '23 at 12:40

1 Answers1

1

There are a few things that need to be fixed in your code

First, you are not changing the offset value in the URL, so it keeps scraping the same URL again and again until the offset condition ends the loop. I have added the line below to update it:

 # Swap the previous offset value in the URL for the current one
 page = page.replace(f"offset={offset - 25}", f"offset={offset}")

Also, instead of creating a new driver instance for every offset, you should reuse a single one, which saves time. Change the getPageBySel method as shown below:

def get_driver():
    """Return the shared Chrome driver, starting it on first use.

    The instance is kept in the module-level ``driver`` variable so that
    later calls reuse the same browser.
    """
    global driver
    if driver:
        return driver

    chrome_opts = webdriver.ChromeOptions()
    for flag in (
        "--disable-dev-shm-usage",
        "--remote-debugging-port=9222",
        "--window-size=1920x1080",
    ):
        chrome_opts.add_argument(flag)
    # chrome_opts.headless=True
    driver = webdriver.Chrome(options=chrome_opts)
    return driver


def getPageBySel(url):
    """Load *url* in the shared browser and return the page source."""
    browser = get_driver()
    print(url)  # trace which URL is being fetched
    browser.get(url)
    return browser.page_source

Also, you have hard-coded the maximum offset to 980, which does not work when there are fewer records than that: the loop below becomes infinite, because the number of matched elements is always zero once the offset exceeds the actual number of results.

# Reload the page until it contains at least one result heading.
# The only exit is the `break`, so a page that never contains such a heading
# keeps this loop running.
while True:
            temp = getPageBySel(page)
            soup = BeautifulSoup(temp, 'html.parser')
            if len(soup("h3", {"class": "a4225678b2"})) > 0:
                break
            else:
                # Pause before reloading
                time.sleep(1.5)

So you should derive this maximum from the number of result pages instead, as shown below, where we pass in the search URL and get back the total number of pages:

def getOffset(x):
    """Return the highest page number shown by the pager buttons of a page.

    Args:
        x: URL of the search-result page to inspect.

    Returns:
        The largest integer found among the matched buttons' labels, or 1
        when no button carries a numeric label, so that callers still
        process the first page.
    """
    driver = get_driver()
    driver.get(x)
    links = driver.find_elements(By.XPATH, '//button[@class="fc63351294 f9c5690c58"]')
    # Keep only labels that are plain numbers; int() would raise on anything else
    labels = (link.text.strip() for link in links)
    values = [int(label) for label in labels if label.isdecimal()]
    # max() on an empty list raises ValueError, hence the explicit default
    return max(values, default=1)

Then we set the maximum offset as:

# Highest page number times the per-page offset step of 25
maxValue = getOffset(x) *25

Full Code

import bs4
import pandas as pd
import re
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time
import requests

# Shared browser instance; created lazily by get_driver()
driver = None


def get_driver():
    """Return the shared Chrome driver, starting it on first use.

    The instance is kept in the module-level ``driver`` variable so that
    later calls reuse the same browser.
    """
    global driver
    if driver:
        return driver

    chrome_opts = webdriver.ChromeOptions()
    for flag in (
        "--disable-dev-shm-usage",
        "--remote-debugging-port=9222",
        "--window-size=1920x1080",
    ):
        chrome_opts.add_argument(flag)
    # chrome_opts.headless=True
    driver = webdriver.Chrome(options=chrome_opts)
    return driver


def getPageBySel(url):
    """Load *url* in the shared browser and return the page source."""
    browser = get_driver()
    print(url)  # trace which URL is being fetched
    browser.get(url)
    return browser.page_source


def getOffset(x):
    """Return the highest page number shown by the pager buttons of a page.

    Args:
        x: URL of the search-result page to inspect.

    Returns:
        The largest integer found among the matched buttons' labels, or 1
        when no button carries a numeric label, so that callers still
        process the first page.
    """
    driver = get_driver()
    driver.get(x)
    links = driver.find_elements(By.XPATH, '//button[@class="fc63351294 f9c5690c58"]')
    # Keep only labels that are plain numbers; int() would raise on anything else
    labels = (link.text.strip() for link in links)
    values = [int(label) for label in labels if label.isdecimal()]
    # max() on an empty list raises ValueError, hence the explicit default
    return max(values, default=1)


# The f-strings below are evaluated once, right here, so every URL ends in
# "offset=0"; the scraping loop later rewrites that suffix page by page.
offset = 0

# Booking.com search-result URLs, one per destination.
# A list (not a set) keeps the scrape order, and therefore the row order of the
# resulting CSV, identical from run to run.
hotelsArr = [
    f'https://www.booking.com/searchresults.he.html?aid=397594&label=gog235jc-1DCAEoggI46AdIDlgDaGqIAQGYAQ64ARfIAQzYAQPoAQH4AQKIAgGoAgO4ArvKrqIGwAIB0gIkMWJjMjhhNzItNDZhNC00NDZmLTk1YzgtNjhiOWM0NmM0NDA42AIE4AIB&dest_id=-2601889&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&checkin=2024-01-04&checkout=2024-01-07&req_children=0&offset={offset}',
    f'https://www.booking.com/searchresults.he.html?ss=Manchester%2C+Greater+Manchester%2C+United+Kingdom&ssne=%D7%9E%D7%A0%D7%A6%27%D7%A1%D7%98%D7%A8&ssne_untouched=%D7%9E%D7%A0%D7%A6%27%D7%A1%D7%98%D7%A8&efdco=1&label=gog235jc-1DCAEoggI46AdIDlgDaGqIAQGYAQ64ARfIAQzYAQPoAQH4AQKIAgGoAgO4Ar3Xs6IGwAIB0gIkMWU3MTc2OTUtZDZkNi00NzFhLTk2NWYtMDczNjk5MDNhN2U52AIE4AIB&aid=397594&lang=he&sb=1&src_elem=sb&src=index&dest_id=-2602512&dest_type=city&ac_position=0&ac_click_type=b&ac_langcode=en&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=39e046de839803a2&ac_meta=GhAzOWUwNDZkZTgzOTgwM2EyIAAoATICZW46Ck1hbmNoZXN0ZXJAAEoAUAA%3D&group_adults=2&checkin=2024-01-04&checkout=2024-01-07&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset={offset}',
    f'https://www.booking.com/searchresults.he.html?aid=7961375&lang=he&sid=ff4607c90e3e0d79763672e65389c94b&sb=1&sb_lp=1&src=index&src_elem=sb&error_url=https%3A%2F%2Fwww.booking.com%2Findex.he.html%3Faid%3D7961375%26sid%3Dff4607c90e3e0d79763672e65389c94b%26sb_price_type%3Dtotal%26%26&ss=Liverpool%2C+Merseyside%2C+United+Kingdom&is_ski_area=&checkin_year=&checkin_month=&checkout_year=&checkout_month=&efdco=1&group_adults=2&group_children=0&no_rooms=1&b_h4u_keep_filters=&from_sf=1&ss_raw=Liverpool&ac_position=0&ac_langcode=en&ac_click_type=b&ac_meta=GhBhNTQ5NGFhODM2MTgwMjFkIAAoATICZW46CUxpdmVycG9vbEAASgBQAA%3D%3D&dest_id=-2601422&dest_type=city&iata=LPL&place_id_lat=53.4109&place_id_lon=-2.97811&search_pageview_id=a5494aa83618021d&search_selected=true&search_pageview_id=a5494aa83618021d&checkin=2024-01-04&checkout=2024-01-07&ac_suggestion_list_length=5&ac_suggestion_theme_list_length=0&offset={offset}',
    f'https://www.booking.com/searchresults.he.html?ss=Birmingham%2C+West+Midlands%2C+United+Kingdom&ssne=%D7%91%D7%A8%D7%9E%D7%99%D7%A0%D7%92%D7%94%D7%90%D7%9D&ssne_untouched=%D7%91%D7%A8%D7%9E%D7%99%D7%A0%D7%92%D7%94%D7%90%D7%9D&efdco=1&label=gen173nr-1BCAEoggI46AdIM1gEaGqIAQGYAQ64ARfIAQzYAQHoAQGIAgGoAgO4AsL2s6IGwAIB0gIkM2VlZjk2YjEtMDJhYi00YmExLTg1NmEtOTIxYTNhNzdhMWQ22AIF4AIB&sid=ff4607c90e3e0d79763672e65389c94b&aid=304142&lang=he&sb=1&src_elem=sb&src=index&dest_id=-2589989&dest_type=city&ac_position=0&ac_click_type=b&ac_langcode=en&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=acea4ea102160498&ac_meta=GhBhY2VhNGVhMTAyMTYwNDk4IAAoATICZW46BWJpcm1pQABKAFAA&group_adults=2&checkin=2024-01-04&checkout=2024-01-07&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset={offset}',
    f'https://www.booking.com/searchresults.he.html?ss=%D7%90%D7%93%D7%99%D7%A0%D7%91%D7%95%D7%A8%D7%95%2C+%D7%A1%D7%A7%D7%95%D7%98%D7%9C%D7%A0%D7%93%2C+%D7%91%D7%A8%D7%99%D7%98%D7%A0%D7%99%D7%94&ssne=%D7%92%D7%9C%D7%90%D7%96%D7%92%D7%95&ssne_untouched=%D7%92%D7%9C%D7%90%D7%96%D7%92%D7%95&efdco=1&label=gen173nr-1BCAEoggI46AdIM1gEaGqIAQGYAQ64ARfIAQzYAQHoAQGIAgGoAgO4AoP4s6IGwAIB0gIkZmQ0YjhjNTMtZTc2ZS00NDZkLThmMmEtNmUyZDk3YTAwZWJl2AIF4AIB&sid=ff4607c90e3e0d79763672e65389c94b&aid=304142&lang=he&sb=1&src_elem=sb&src=index&dest_id=-2595386&dest_type=city&ac_position=0&ac_click_type=b&ac_langcode=he&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=ec504f01a010003c&ac_meta=GhBlYzUwNGYwMWEwMTAwMDNjIAAoATICaGU6A2VkaUAASgBQAA%3D%3D&group_adults=2&checkin=2024-01-04&checkout=2024-01-07&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset={offset}',
    f'https://www.booking.com/searchresults.he.html?ss=%D7%99%D7%95%D7%A8%D7%A7&ssne=%D7%99%D7%95%D7%A8%D7%A7&ssne_untouched=%D7%99%D7%95%D7%A8%D7%A7&label=gen173nr-1BCAEoggI46AdIM1gEaGqIAQGYAQ64ARfIAQzYAQHoAQGIAgGoAgO4Auj6s6IGwAIB0gIkM2IwYTU3ODgtNDJiYS00ZDk0LWI0MDAtNGU3M2U3ZDlkNzM42AIF4AIB&sid=ff4607c90e3e0d79763672e65389c94b&aid=304142&lang=he&sb=1&src_elem=sb&src=index&dest_id=-2612321&dest_type=city&group_adults=2&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset={offset}',
]
# Upper bound on reloads of a single page before giving up on it, so a page
# that never shows any result cannot stall the whole run.
MAX_LOAD_ATTEMPTS = 10

# Accumulators shared by all destinations.
# NOTE: prices are matched to hotels purely by position; they are collected
# with a separate selector, so a result without a price shifts later rows.
hotels = []
links = []
prices = []

try:
    for x in hotelsArr:
        # getOffset() gives the highest page number; the offset advances by
        # 25 per page, hence the factor.
        maxValue = getOffset(x) * 25
        offset = 0
        page = x
        while offset < maxValue:
            # Reload until at least one result heading is present, but only
            # up to MAX_LOAD_ATTEMPTS times.
            soup = None
            for _ in range(MAX_LOAD_ATTEMPTS):
                candidate = BeautifulSoup(getPageBySel(page), 'html.parser')
                if candidate("h3", {"class": "a4225678b2"}):
                    soup = candidate
                    break
                time.sleep(1.5)
            if soup is None:
                print(f"No results after {MAX_LOAD_ATTEMPTS} attempts, skipping the rest of: {page}")
                break

            # Price text of every matched price element on this page
            prices.extend(
                element.get_text(strip=True)
                for element in soup.select(".fcab3ed991.fbd1d3018c.e729ed5ab6")
            )

            # One heading per result: first anchor's href plus the title text.
            # Missing pieces become 'N/A' instead of raising mid-run.
            for element in soup("h3", {"class": "a4225678b2"}):
                anchors = element('a')
                link = anchors[0].get('href', 'N/A') if anchors else 'N/A'

                name_tag = element.select_one('.fcab3ed991.a23c043802')
                name = name_tag.get_text() if name_tag is not None else 'N/A'

                hotels.append(name)
                links.append(link)

            offset += 25  # Move to the next page
            page = page.replace(f"offset={offset - 25}", f"offset={offset}")  # Update the URL with the new offset
finally:
    # Close the shared browser even if scraping failed part-way through
    if driver is not None:
        driver.quit()
        driver = None

# Ensure all columns have the same length: pandas raises
# "All arrays must be of the same length" otherwise.
length = min(len(hotels), len(links), len(prices))
if not (len(hotels) == len(links) == len(prices)):
    # Truncating keeps the script running, but rows past the first gap may pair
    # a hotel with another hotel's price, so make the mismatch visible.
    print(
        f"Warning: list lengths differ (hotels={len(hotels)}, links={len(links)}, "
        f"prices={len(prices)}); truncating to {length} rows"
    )

# Take the data into dataframe, trimmed to the common length
df = pd.DataFrame({
    'Hotel': hotels[:length], 'Link': links[:length], 'Prices': prices[:length]
})
print(df)

# Df to csv (the index column is written as well)
df.to_csv('hotels_list30.csv', index=True)

With these changes I was able to scrape a CSV with around 3699 records. (enter image description here)

Abhay Chaudhary
  • 1,763
  • 1
  • 8
  • 13