I am trying to scrape the Marathi Matrimony website using Python and Selenium. The listing has a total of 365 pages, but I am only able to scrape the first 130 pages successfully; after that, the website stops responding and shows an "Out of Memory" error.
Refreshing the page does not resolve the issue: on refresh, the website resets the listing back to page 1, so I have to click the "Next" button 130 times again before I can resume extracting information.
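So far the only workaround I can think of is to periodically throw the whole browser away before it runs out of memory and start a fresh session. This is only an idea, not something my current script does; a minimal sketch of just the restart step, reusing the same ChromeOptions setup as in my code below, would be:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Idea only (not in my current script): restart Chrome every N pages to release memory.
driver.quit()  # close the old Chrome instance and free its memory
chrome_options = Options()
chrome_options.add_argument('--incognito')
driver = webdriver.Chrome(options=chrome_options)
driver.get('https://matches.marathimatrimony.com/')
# After this I would still need to log in again and click "Next" repeatedly
# to get back to the last scraped page, which is exactly the slow part I want to avoid.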
Here is an overview of my code:
import re
from time import sleep

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# Create ChromeOptions and set incognito mode
chrome_options = Options()
chrome_options.add_argument('--incognito')
driver = webdriver.Chrome(options=chrome_options)
driver.get('https://matches.marathimatrimony.com/')
username = WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='MIDP']")))
password = WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='PASSWORD2']")))
# Enter the username and password
username.clear()
username.send_keys("EMAIL")
password.clear()
password.send_keys("PASSWORD")
# Wait for the login button to be visible
button = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, "input[type='submit'][value='LOGIN']")))
# Click the login button
button.click()
sleep(2)
# Find the element using the CSS selector
ad_close_button = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, "span > img[alt='Close button']")))
# Click the element
ad_close_button.click()
sleep(2)
# Find the element using the CSS selector
all_matches = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, "a[routerlink='/listing']")))
# Click the element
all_matches.click()
# Create a DataFrame to store the data
columns = ["Profile Name", "Profile Href", "Profile ID", "Page Number"]
data2 = []
current_page_number = 1
last_page_scraped = 0
count = 1
while current_page_number <= 375:
    if current_page_number <= last_page_scraped:
        # We are behind the last page already scraped (e.g. after a reset):
        # fast-forward by clicking "Next" until we catch up.
        print("Current page is behind the last page scraped; fast-forwarding")
        while current_page_number <= last_page_scraped:
            next_page_element = driver.find_element(By.XPATH, "//li[@class='pagination-next']//a")
            driver.execute_script("arguments[0].click();", next_page_element)
            current_page_element = driver.find_element(By.XPATH, "//li[contains(@class, 'current')]/span[2]")
            current_page_text = current_page_element.text
            current_page_number = int(re.search(r'\d+', current_page_text).group())
            print(f"Moved to page: {current_page_number}")
            sleep(2)
    else:
        print(f"Scraping page number: {current_page_number}")
        # Find the profile cards on the current page
        profile_elements = driver.find_elements(By.CSS_SELECTOR, "div.listingMatchCard")
        # Loop through each profile card
        for profile in profile_elements:
            try:
                profile_name_element = profile.find_element(By.CSS_SELECTOR, "a.clr-black1.text-decoration-none.col-md-12.pl-0")
                profile_name = profile_name_element.text
                profile_href = profile_name_element.get_attribute("href")
                profile_id_element = profile.find_element(By.CSS_SELECTOR, "a.cursor-pointer.outline-none.text-decoration-none.clr-grey2")
                profile_id = profile_id_element.get_attribute("href").split("/")[-1]
                data2.append([profile_name, profile_href, profile_id, current_page_number])
                count += 1
            except NoSuchElementException:
                print("Error occurred while scraping a profile. Skipping...")
        last_page_scraped = current_page_number
        print(f"Successfully scraped page number: {last_page_scraped}")
        if current_page_number % 5 == 0:
            # Save the scraped data every 5 pages
            df2 = pd.DataFrame(data2, columns=columns)
            filename = f"scraped_profiles/scraped_profiles_page_{current_page_number}.csv"
            df2.to_csv(filename, index=False)
        try:
            next_page_element = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "li.pagination-next a")))
            driver.execute_script("arguments[0].click();", next_page_element)
        except TimeoutException:
            # No clickable "Next" button: assume this was the last page
            break
        sleep(2)
        current_page_element = driver.find_element(By.XPATH, "//li[contains(@class, 'current')]/span[2]")
        current_page_text = current_page_element.text
        current_page_number = int(re.search(r'\d+', current_page_text).group())
        print(f"The current page is: {current_page_number}")

# Save the final scraped data
df2 = pd.DataFrame(data2, columns=columns)
filename = "final_scraped_profiles.csv"
df2.to_csv(filename, index=False)
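Since every fifth page the full data2 collected so far is dumped to a new CSV, the most recent batch file is effectively a checkpoint of everything scraped up to that point. A small sketch of how that could be recovered after a crash (assuming the scraped_profiles/ naming scheme used above):

import glob
import re
import pandas as pd

# Each batch file contains everything scraped up to that point, so after a
# crash the most recent one is the full picture.
batch_files = glob.glob("scraped_profiles/scraped_profiles_page_*.csv")
latest = max(batch_files, key=lambda f: int(re.search(r'page_(\d+)', f).group(1)))
df_recovered = pd.read_csv(latest)
print(f"Recovered {len(df_recovered)} rows, up to page {df_recovered['Page Number'].max()}")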
I have also tried the following approach to navigate to a target page so that I can resume scraping and keep saving the files in batches, but the same thing happens: the website becomes unresponsive again after about 130 clicks of the "Next" button. I also searched for a link on the website that would take me directly to page 130, but I could not find one, so I had to click "Next" 130 times just to get back to where I left off.
target_page_number = 135
while True:
    try:
        current_page_element = driver.find_element(By.XPATH, "//li[contains(@class, 'current')]/span[2]")
        current_page_text = current_page_element.text
        current_page_number = int(re.search(r'\d+', current_page_text).group())
        if current_page_number >= target_page_number:
            break
    except NoSuchElementException:
        break
    print(f"current page is: {current_page_number}", end='\r')
    next_page_element = driver.find_element(By.XPATH, "//li[@class='pagination-next']//a")
    driver.execute_script("arguments[0].click();", next_page_element)
    sleep(2)
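Instead of hard-coding target_page_number = 135, the page to fast-forward to could in principle be derived from the batch files that were already saved. A sketch of that idea, again assuming the scraped_profiles/ naming scheme from the first script:

import glob
import re

# Derive the resume point from the saved batch files rather than hard-coding it.
page_numbers = [int(re.search(r'page_(\d+)', f).group(1))
                for f in glob.glob("scraped_profiles/scraped_profiles_page_*.csv")]
target_page_number = max(page_numbers) + 1 if page_numbers else 1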
I would appreciate any suggestions or insights on how to resolve the "Out of Memory" error and successfully scrape all the pages of the website.