import bs4
import pandas as pd
import re
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time
import requests
#Function that open the url with selenium, and return the page source
def getPageBySel(url):
options = webdriver.ChromeOptions()
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--remote-debugging-port=9222")
options.add_argument("--window-size=1920x1080")
driver = webdriver.Chrome(options=options)
driver.get(url)
page = driver.page_source
driver.quit()
return page
offset=0
hotelsArr = { f'https://www.booking.com/searchresults.he.html?aid=397594&label=gog235jc-1DCAEoggI46AdIDlgDaGqIAQGYAQ64ARfIAQzYAQPoAQH4AQKIAgGoAgO4ArvKrqIGwAIB0gIkMWJjMjhhNzItNDZhNC00NDZmLTk1YzgtNjhiOWM0NmM0NDA42AIE4AIB&dest_id=-2601889&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&checkin=2024-01-04&checkout=2024-01-07&req_children=0&offset={offset}',
f'https://www.booking.com/searchresults.he.html?ss=Manchester%2C+Greater+Manchester%2C+United+Kingdom&ssne=%D7%9E%D7%A0%D7%A6%27%D7%A1%D7%98%D7%A8&ssne_untouched=%D7%9E%D7%A0%D7%A6%27%D7%A1%D7%98%D7%A8&efdco=1&label=gog235jc-1DCAEoggI46AdIDlgDaGqIAQGYAQ64ARfIAQzYAQPoAQH4AQKIAgGoAgO4Ar3Xs6IGwAIB0gIkMWU3MTc2OTUtZDZkNi00NzFhLTk2NWYtMDczNjk5MDNhN2U52AIE4AIB&aid=397594&lang=he&sb=1&src_elem=sb&src=index&dest_id=-2602512&dest_type=city&ac_position=0&ac_click_type=b&ac_langcode=en&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=39e046de839803a2&ac_meta=GhAzOWUwNDZkZTgzOTgwM2EyIAAoATICZW46Ck1hbmNoZXN0ZXJAAEoAUAA%3D&group_adults=2&checkin=2024-01-04&checkout=2024-01-07&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset={offset}',
f'https://www.booking.com/searchresults.he.html?aid=7961375&lang=he&sid=ff4607c90e3e0d79763672e65389c94b&sb=1&sb_lp=1&src=index&src_elem=sb&error_url=https%3A%2F%2Fwww.booking.com%2Findex.he.html%3Faid%3D7961375%26sid%3Dff4607c90e3e0d79763672e65389c94b%26sb_price_type%3Dtotal%26%26&ss=Liverpool%2C+Merseyside%2C+United+Kingdom&is_ski_area=&checkin_year=&checkin_month=&checkout_year=&checkout_month=&efdco=1&group_adults=2&group_children=0&no_rooms=1&b_h4u_keep_filters=&from_sf=1&ss_raw=Liverpool&ac_position=0&ac_langcode=en&ac_click_type=b&ac_meta=GhBhNTQ5NGFhODM2MTgwMjFkIAAoATICZW46CUxpdmVycG9vbEAASgBQAA%3D%3D&dest_id=-2601422&dest_type=city&iata=LPL&place_id_lat=53.4109&place_id_lon=-2.97811&search_pageview_id=a5494aa83618021d&search_selected=true&search_pageview_id=a5494aa83618021d&checkin=2024-01-04&checkout=2024-01-07&ac_suggestion_list_length=5&ac_suggestion_theme_list_length=0&offset={offset}',
f'https://www.booking.com/searchresults.he.html?ss=Birmingham%2C+West+Midlands%2C+United+Kingdom&ssne=%D7%91%D7%A8%D7%9E%D7%99%D7%A0%D7%92%D7%94%D7%90%D7%9D&ssne_untouched=%D7%91%D7%A8%D7%9E%D7%99%D7%A0%D7%92%D7%94%D7%90%D7%9D&efdco=1&label=gen173nr-1BCAEoggI46AdIM1gEaGqIAQGYAQ64ARfIAQzYAQHoAQGIAgGoAgO4AsL2s6IGwAIB0gIkM2VlZjk2YjEtMDJhYi00YmExLTg1NmEtOTIxYTNhNzdhMWQ22AIF4AIB&sid=ff4607c90e3e0d79763672e65389c94b&aid=304142&lang=he&sb=1&src_elem=sb&src=index&dest_id=-2589989&dest_type=city&ac_position=0&ac_click_type=b&ac_langcode=en&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=acea4ea102160498&ac_meta=GhBhY2VhNGVhMTAyMTYwNDk4IAAoATICZW46BWJpcm1pQABKAFAA&group_adults=2&checkin=2024-01-04&checkout=2024-01-07&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset={offset}',
f'https://www.booking.com/searchresults.he.html?ss=%D7%90%D7%93%D7%99%D7%A0%D7%91%D7%95%D7%A8%D7%95%2C+%D7%A1%D7%A7%D7%95%D7%98%D7%9C%D7%A0%D7%93%2C+%D7%91%D7%A8%D7%99%D7%98%D7%A0%D7%99%D7%94&ssne=%D7%92%D7%9C%D7%90%D7%96%D7%92%D7%95&ssne_untouched=%D7%92%D7%9C%D7%90%D7%96%D7%92%D7%95&efdco=1&label=gen173nr-1BCAEoggI46AdIM1gEaGqIAQGYAQ64ARfIAQzYAQHoAQGIAgGoAgO4AoP4s6IGwAIB0gIkZmQ0YjhjNTMtZTc2ZS00NDZkLThmMmEtNmUyZDk3YTAwZWJl2AIF4AIB&sid=ff4607c90e3e0d79763672e65389c94b&aid=304142&lang=he&sb=1&src_elem=sb&src=index&dest_id=-2595386&dest_type=city&ac_position=0&ac_click_type=b&ac_langcode=he&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=ec504f01a010003c&ac_meta=GhBlYzUwNGYwMWEwMTAwMDNjIAAoATICaGU6A2VkaUAASgBQAA%3D%3D&group_adults=2&checkin=2024-01-04&checkout=2024-01-07&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset={offset}',
f'https://www.booking.com/searchresults.he.html?ss=%D7%99%D7%95%D7%A8%D7%A7&ssne=%D7%99%D7%95%D7%A8%D7%A7&ssne_untouched=%D7%99%D7%95%D7%A8%D7%A7&label=gen173nr-1BCAEoggI46AdIM1gEaGqIAQGYAQ64ARfIAQzYAQHoAQGIAgGoAgO4Auj6s6IGwAIB0gIkM2IwYTU3ODgtNDJiYS00ZDk0LWI0MDAtNGU3M2U3ZDlkNzM42AIF4AIB&sid=ff4607c90e3e0d79763672e65389c94b&aid=304142&lang=he&sb=1&src_elem=sb&src=index&dest_id=-2612321&dest_type=city&group_adults=2&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset={offset}'
}
x=0
hotels=[]
links=[]
prices=[]
for x in hotelsArr:
offset=0
page = x
# Loop that run on the first 40 pages (offset+25 each time)
while offset < 980:
while True: #Loop that make sure that the page loaded successfully
temp=getPageBySel(page)
soup = BeautifulSoup(temp, 'html.parser')
if len(soup("h3",{"class":"a4225678b2"}))>0:
break
else:
time.sleep(1.5)
for element in soup.select(".fcab3ed991.fbd1d3018c.e729ed5ab6"):
# Find the price
price = element.get_text(strip=True) if element else 'N/A'
# Append the price to the prices list
prices.append(price)
# Extract the hotel name, link, and price
for element in soup("h3", {"class": "a4225678b2"}):
# Find the hotel link
link = element('a')[0]['href']
# Find the hotel name
name = element.select_one('.fcab3ed991.a23c043802').get_text() if element else 'N/A'
# Append the data to the respective lists
hotels.append(name)
links.append(link)
offset= offset+25 #Move to the next page
# Ensure all lists have the same length
length = len(hotels)
if len(links) < length:
length = len(links)
if len(prices) < length:
length = len(prices)
#Take the data into dataframe
df = pd.DataFrame({
'Hotel':hotels, 'Link':links, 'Prices':prices
})
print(df)
#Df to csv
df.to_csv('hotels_list30.csv', index=True)
I've get an error in this code, all the arrys must be in the same length, how can i fix this problem? tried everythins, the condition in the end and try to put 'NA' in the black cells.
I've tried to solve this but it didn't work, i need to get a df with the hotels, link and price cols, through scrapping. maybe i've got a mistake there. this is why i also put the booking links.