2

In this Scrapy spider I want to click on the "Go to Store" button, open the URL in a new tab, capture that URL, close the tab, and move back to the original tab. But the script is throwing an error.

import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from selenium import webdriver
from urlparse import urljoin
import time
from selenium.webdriver.common.keys import Keys

class CompItem(scrapy.Item):
    """Container for the data scraped from a product page."""
    # Name of the laptop model shown on the price-comparison page.
    model_name = scrapy.Field()
    # Link to the model's detail page.
    model_link = scrapy.Field()
    # Final store URL captured after following the "Go to Store" button.
    url = scrapy.Field()

class criticspider(CrawlSpider):
    """Open each product page in Selenium, follow the "Go to Store"
    button into a new tab, capture the redirected store URL, close the
    tab and return control to the original window.
    """
    name = "extract"
    allowed_domains = ["mysmartprice.com"]
    start_urls = ["http://www.mysmartprice.com/computer/lenovo-g50-70-laptop-msf201821"]

    def __init__(self, *args, **kwargs):
        super(criticspider, self).__init__(*args, **kwargs)
        self.download_delay = 0.25
        self.browser = webdriver.Firefox()
        # Implicit wait: element lookups poll for up to 20s before failing.
        self.browser.implicitly_wait(20)

    def parse_start_url(self, response):
        """Yield one item carrying the store URL behind the "Go to Store" button."""
        self.browser.get(response.url)
        item = CompItem()

        # Remember the handle of the original window so we can return to it.
        main_window = self.browser.current_window_handle

        button = self.browser.find_element_by_xpath("/html/body/div[3]/div/div[3]/div/div[2]/div[4]/div[4]/div[5]/div[1]")

        # CTRL+RETURN asks the browser to open the link in a new tab.
        button.send_keys(Keys.CONTROL + Keys.RETURN)

        # BUG FIX: sending CTRL+TAB to <body> only moves the *visual* focus;
        # WebDriver's window context never changes, and the original code then
        # switched back to main_window BEFORE reading current_url, so it always
        # captured the original page.  Instead, wait for the new handle to
        # appear and switch WebDriver to it explicitly.
        for _ in range(40):
            if len(self.browser.window_handles) > 1:
                break
            time.sleep(0.5)

        new_window = [h for h in self.browser.window_handles if h != main_window][-1]
        self.browser.switch_to_window(new_window)

        # Give the intermediate "on your way to the store" redirect time to land.
        time.sleep(10)
        item['url'] = self.browser.current_url

        # Close the store tab via WebDriver (not CTRL+w keystrokes, which are
        # unreliable) and hand control back to the original window.
        self.browser.close()
        self.browser.switch_to_window(main_window)

        yield item

The code is not throwing any error, and I have tried it in multiple browsers, but I couldn't find what's wrong.

alecxe
  • 462,703
  • 120
  • 1,088
  • 1,195
John Dene
  • 550
  • 1
  • 7
  • 21

1 Answer

3

Assuming you want to visit every featured store and get back to the main window, one option would be to perform SHIFT+clicks to open "Go to Store" links in a new window, close the newly opened window and get back to the context of the main window:

import scrapy
from scrapy.contrib.spiders import CrawlSpider
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys


class CompItem(scrapy.Item):
    """Scraped record for one featured store of a product page."""

    # Laptop model name as displayed on the listing.
    model_name = scrapy.Field()
    # URL of the model's own page.
    model_link = scrapy.Field()
    # Store URL reached via the "Go to Store" link.
    url = scrapy.Field()


class criticspider(CrawlSpider):
    """Visit every featured store on the product page via SHIFT+click
    (which opens each "Go to Store" link in a new window), capture the
    store URL from the new window, then return to the main window.
    """
    name = "extract"
    allowed_domains = ["mysmartprice.com"]
    start_urls = ["http://www.mysmartprice.com/computer/lenovo-g50-70-laptop-msf201821"]

    def __init__(self, *args, **kwargs):
        super(criticspider, self).__init__(*args, **kwargs)
        self.download_delay = 0.25
        # Dedicated Firefox instance driven alongside Scrapy's own downloader.
        self.browser = webdriver.Firefox()
        self.browser.maximize_window()

        # Implicit wait: element lookups poll for up to 20 seconds.
        self.browser.implicitly_wait(20)

    def parse_start_url(self, response):
        """Yield one item per featured store, each carrying the store's URL."""
        self.browser.get(response.url)

        # waiting for "Go to store" to become visible
        wait = WebDriverWait(self.browser, 10)
        wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div.store_pricetable")))

        # Handle of the main (first) window, used to return after each store.
        main_window = self.browser.window_handles[0]

        # iterate over featured stores and visit them
        for store in self.browser.find_elements_by_css_selector("div.store_pricetable"):

            item = CompItem()

            # shift+click on the "Go to Store" link — SHIFT forces the link
            # to open in a NEW WINDOW rather than navigating the current one
            link = store.find_element_by_css_selector("div.store_gostore > div.storebutton")
            ActionChains(self.browser).key_down(Keys.SHIFT).move_to_element(link).click(link).key_up(Keys.SHIFT).perform()

            # there is a popup preventing us to navigate to the store URL - close it 
            try:
                popup_close = self.browser.find_element_by_css_selector(".popup-closebutton")
                popup_close.click()

                # repeat the click
                ActionChains(self.browser).key_down(Keys.SHIFT).move_to_element(link).click(link).key_up(Keys.SHIFT).perform()
            except NoSuchElementException:
                # no popup this time — the first click already opened the window
                pass

            # switch to the newly opened window, read the current url and close the window
            self.browser.switch_to.window(self.browser.window_handles[-1])

            # wait until "On your way to the store" would not be in title
            # (i.e. the interstitial redirect page has finished forwarding us)
            wait.until(lambda browser: "On your way to the Store" not in browser.title)

            item['url'] = self.browser.current_url
            self.browser.close()

            # switch back to the main window
            self.browser.switch_to.window(main_window)

            yield item

This works for me and outputs 2 items:

{'url': u'http://www.ebay.in/itm/LENOVO-G50-70-LAPTOP-59422417-/231660194652?aff_source=mysmartprice'}
{"url": "https://paytm.com/shop/p/lenovo-g50-70-core-i7-4500-4th-gen-8-gb-1-tb-15-6-inch-2-gb-graphics-win8-1-no-bag-black-CMPLXLAPLENOVO-G50-7DUMM20256A81CC05?utm_source=Affiliates&utm_medium=msp&utm_campaign=msp"}
alecxe
  • 462,703
  • 120
  • 1,088
  • 1,195
  • How to get all the url of all the store pages? – John Dene Sep 07 '15 at 14:56
  • I only want the url of the store that is all the urls not the one which I have parsed in start_url ,how to ignore it? – John Dene Sep 07 '15 at 14:58
  • @JohnDene I've added a note that you might want to increase the page load timeout to allow it to load before reading the `current_url`. – alecxe Sep 07 '15 at 14:58
  • Can I just add time.sleep() before url capture? – John Dene Sep 07 '15 at 15:03
  • @JohnDene you can, but that's not quite reliable and is not recommended. The best option would be to use an explicit wait. Hold on - I'll provide it. – alecxe Sep 07 '15 at 15:04
  • And plzz tell me how to get the other url of stores?because I can't find how to change I want featured seller as well non featured seller also – John Dene Sep 07 '15 at 15:13
  • Thanx a lot @alecxe will this work with phantomjs aswell/?? – John Dene Sep 07 '15 at 17:09
  • @JohnDene haven't tested, but I'm not sure about that - phantomjs is usually a special case you sometimes have to handle differently. You can actually run firefox on a [virtual display](http://stackoverflow.com/questions/6183276/how-do-i-run-selenium-in-xvfb) if real display is the reason you ask about phantomjs. – alecxe Sep 07 '15 at 17:10
  • Thanks man ! as always you come to rescue me..Wish to meet you sometime – John Dene Sep 07 '15 at 18:56
  • can you help with this plz - http://stackoverflow.com/questions/32584084/how-to-hold-the-cache-in-selenium-in-a-loop-after-a-page-gets-refreshed – John Dene Sep 15 '15 at 13:14