Brief Explanation:
I have a Scrapy project that takes stock data from Yahoo! Finance. In order for my project to work, I need to ensure that a stock has been around for a desired amount of time. I do this by scraping CAT (Caterpillar Inc. (CAT) -NYSE) first, get the amount of closing prices that there is for that time period, and then ensure that all stocks scraped after that have the same amount of closing prices as CAT, thus ensuring that a stock has been publicly traded for the desired time length.
The Problem:
This all works fine and dandy, however my problem is that before scrapy has finished parsing CAT, it begins scraping other stocks and parsing them. This results in an error, as before I can get the desired amount of closing prices from CAT, scrapy is trying to decide if any other stock has the same amount of closing prices as CAT, which does not exist yet.
The actual question
How can I force scrapy to finish parsing one url before beginning others
I have also tried:
def start_requests(self):
global start_time
yield Request('http://finance.yahoo.com/q?s=CAT', self.parse)
# Waits 4 seconds to allow CAT to finish crawling
if time.time() - start_time > 0.2:
for i in self.other_urls:
yield Request(i, self.parse)
but the stocks in other_urls
never commence, because scrapy never goes back to def start_requests
to check if the time is above 0.2
The Entire Code:
from scrapy.selector import Selector
from scrapy import Request
from scrapy.exceptions import CloseSpider
from sharpeparser.gen_settings import *
from decimal import Decimal
from scrapy.spider import Spider
from sharpeparser.items import SharpeparserItem
import numpy
import time
if data_intervals == "m":
required_amount_of_returns = 24
elif data_intervals == "w":
required_amount_of_returns = 100
else:
required_amount_of_returns =
counter = 1
start_time = time.time()
class DnotSpider(Spider):
# ---- >>> ENSURE YOU INDENT 1 ---- >>>
# =======================================
name = "dnot"
allowed_domains = ["finance.yahoo.com", "http://eoddata.com/", "ca.finance.yahoo.com"]
start_urls = ['http://finance.yahoo.com/q?s=CAT']
other_urls = ['http://eoddata.com/stocklist/TSX.htm', 'http://eoddata.com/stocklist/TSX/B.htm', 'http://eoddata.com/stocklist/TSX/C.htm', 'http://eoddata.com/stocklist/TSX/D.htm', 'http://eoddata.com/stocklist/TSX/E.htm', 'http://eoddata.com/stocklist/TSX/F.htm', 'http://eoddata.com/stocklist/TSX/G.htm', 'http://eoddata.com/stocklist/TSX/H.htm', 'http://eoddata.com/stocklist/TSX/I.htm', 'http://eoddata.com/stocklist/TSX/J.htm', 'http://eoddata.com/stocklist/TSX/K.htm', 'http://eoddata.com/stocklist/TSX/L.htm', 'http://eoddata.com/stocklist/TSX/M.htm', 'http://eoddata.com/stocklist/TSX/N.htm', 'http://eoddata.com/stocklist/TSX/O.htm', 'http://eoddata.com/stocklist/TSX/P.htm', 'http://eoddata.com/stocklist/TSX/Q.htm', 'http://eoddata.com/stocklist/TSX/R.htm', 'http://eoddata.com/stocklist/TSX/S.htm', 'http://eoddata.com/stocklist/TSX/T.htm', 'http://eoddata.com/stocklist/TSX/U.htm', 'http://eoddata.com/stocklist/TSX/V.htm', 'http://eoddata.com/stocklist/TSX/W.htm', 'http://eoddata.com/stocklist/TSX/X.htm', 'http://eoddata.com/stocklist/TSX/Y.htm', 'http://eoddata.com/stocklist/TSX/Z.htm'
'http://eoddata.com/stocklist/NASDAQ/B.htm', 'http://eoddata.com/stocklist/NASDAQ/C.htm', 'http://eoddata.com/stocklist/NASDAQ/D.htm', 'http://eoddata.com/stocklist/NASDAQ/E.htm', 'http://eoddata.com/stocklist/NASDAQ/F.htm', 'http://eoddata.com/stocklist/NASDAQ/G.htm', 'http://eoddata.com/stocklist/NASDAQ/H.htm', 'http://eoddata.com/stocklist/NASDAQ/I.htm', 'http://eoddata.com/stocklist/NASDAQ/J.htm', 'http://eoddata.com/stocklist/NASDAQ/K.htm', 'http://eoddata.com/stocklist/NASDAQ/L.htm', 'http://eoddata.com/stocklist/NASDAQ/M.htm', 'http://eoddata.com/stocklist/NASDAQ/N.htm', 'http://eoddata.com/stocklist/NASDAQ/O.htm', 'http://eoddata.com/stocklist/NASDAQ/P.htm', 'http://eoddata.com/stocklist/NASDAQ/Q.htm', 'http://eoddata.com/stocklist/NASDAQ/R.htm', 'http://eoddata.com/stocklist/NASDAQ/S.htm', 'http://eoddata.com/stocklist/NASDAQ/T.htm', 'http://eoddata.com/stocklist/NASDAQ/U.htm', 'http://eoddata.com/stocklist/NASDAQ/V.htm', 'http://eoddata.com/stocklist/NASDAQ/W.htm', 'http://eoddata.com/stocklist/NASDAQ/X.htm', 'http://eoddata.com/stocklist/NASDAQ/Y.htm', 'http://eoddata.com/stocklist/NASDAQ/Z.htm',
'http://eoddata.com/stocklist/NYSE/B.htm', 'http://eoddata.com/stocklist/NYSE/C.htm', 'http://eoddata.com/stocklist/NYSE/D.htm', 'http://eoddata.com/stocklist/NYSE/E.htm', 'http://eoddata.com/stocklist/NYSE/F.htm', 'http://eoddata.com/stocklist/NYSE/G.htm', 'http://eoddata.com/stocklist/NYSE/H.htm', 'http://eoddata.com/stocklist/NYSE/I.htm', 'http://eoddata.com/stocklist/NYSE/J.htm', 'http://eoddata.com/stocklist/NYSE/K.htm', 'http://eoddata.com/stocklist/NYSE/L.htm', 'http://eoddata.com/stocklist/NYSE/M.htm', 'http://eoddata.com/stocklist/NYSE/N.htm', 'http://eoddata.com/stocklist/NYSE/O.htm', 'http://eoddata.com/stocklist/NYSE/P.htm', 'http://eoddata.com/stocklist/NYSE/Q.htm', 'http://eoddata.com/stocklist/NYSE/R.htm', 'http://eoddata.com/stocklist/NYSE/S.htm', 'http://eoddata.com/stocklist/NYSE/T.htm', 'http://eoddata.com/stocklist/NYSE/U.htm', 'http://eoddata.com/stocklist/NYSE/V.htm', 'http://eoddata.com/stocklist/NYSE/W.htm', 'http://eoddata.com/stocklist/NYSE/X.htm', 'http://eoddata.com/stocklist/NYSE/Y.htm', 'http://eoddata.com/stocklist/NYSE/Z.htm',
'http://eoddata.com/stocklist/HKEX/0.htm', 'http://eoddata.com/stocklist/HKEX/1.htm', 'http://eoddata.com/stocklist/HKEX/2.htm', 'http://eoddata.com/stocklist/HKEX/3.htm', 'http://eoddata.com/stocklist/HKEX/6.htm', 'http://eoddata.com/stocklist/HKEX/8.htm',
'http://eoddata.com/stocklist/LSE/0.htm', 'http://eoddata.com/stocklist/LSE/1.htm', 'http://eoddata.com/stocklist/LSE/2.htm', 'http://eoddata.com/stocklist/LSE/3.htm', 'http://eoddata.com/stocklist/LSE/4.htm', 'http://eoddata.com/stocklist/LSE/5.htm', 'http://eoddata.com/stocklist/LSE/6.htm', 'http://eoddata.com/stocklist/LSE/7.htm', 'http://eoddata.com/stocklist/LSE/8.htm', 'http://eoddata.com/stocklist/LSE/9.htm', 'http://eoddata.com/stocklist/LSE/A.htm', 'http://eoddata.com/stocklist/LSE/B.htm', 'http://eoddata.com/stocklist/LSE/C.htm', 'http://eoddata.com/stocklist/LSE/D.htm', 'http://eoddata.com/stocklist/LSE/E.htm', 'http://eoddata.com/stocklist/LSE/F.htm', 'http://eoddata.com/stocklist/LSE/G.htm', 'http://eoddata.com/stocklist/LSE/H.htm', 'http://eoddata.com/stocklist/LSE/I.htm', 'http://eoddata.com/stocklist/LSE/G.htm', 'http://eoddata.com/stocklist/LSE/K.htm', 'http://eoddata.com/stocklist/LSE/L.htm', 'http://eoddata.com/stocklist/LSE/M.htm', 'http://eoddata.com/stocklist/LSE/N.htm', 'http://eoddata.com/stocklist/LSE/O.htm', 'http://eoddata.com/stocklist/LSE/P.htm', 'http://eoddata.com/stocklist/LSE/Q.htm', 'http://eoddata.com/stocklist/LSE/R.htm', 'http://eoddata.com/stocklist/LSE/S.htm', 'http://eoddata.com/stocklist/LSE/T.htm', 'http://eoddata.com/stocklist/LSE/U.htm', 'http://eoddata.com/stocklist/LSE/V.htm', 'http://eoddata.com/stocklist/LSE/W.htm', 'http://eoddata.com/stocklist/LSE/X.htm', 'http://eoddata.com/stocklist/LSE/Y.htm', 'http://eoddata.com/stocklist/LSE/Z.htm',
'http://eoddata.com/stocklist/AMS/A.htm', 'http://eoddata.com/stocklist/AMS/B.htm', 'http://eoddata.com/stocklist/AMS/C.htm', 'http://eoddata.com/stocklist/AMS/D.htm', 'http://eoddata.com/stocklist/AMS/E.htm', 'http://eoddata.com/stocklist/AMS/F.htm', 'http://eoddata.com/stocklist/AMS/G.htm', 'http://eoddata.com/stocklist/AMS/H.htm', 'http://eoddata.com/stocklist/AMS/I.htm', 'http://eoddata.com/stocklist/AMS/J.htm', 'http://eoddata.com/stocklist/AMS/K.htm', 'http://eoddata.com/stocklist/AMS/L.htm', 'http://eoddata.com/stocklist/AMS/M.htm', 'http://eoddata.com/stocklist/AMS/N.htm', 'http://eoddata.com/stocklist/AMS/O.htm', 'http://eoddata.com/stocklist/AMS/P.htm', 'http://eoddata.com/stocklist/AMS/Q.htm', 'http://eoddata.com/stocklist/AMS/R.htm', 'http://eoddata.com/stocklist/AMS/S.htm', 'http://eoddata.com/stocklist/AMS/T.htm', 'http://eoddata.com/stocklist/AMS/U.htm', 'http://eoddata.com/stocklist/AMS/V.htm', 'http://eoddata.com/stocklist/AMS/W.htm', 'http://eoddata.com/stocklist/AMS/X.htm', 'http://eoddata.com/stocklist/AMS/Y.htm', 'http://eoddata.com/stocklist/AMS/Z.htm',
'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=A', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=B', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=C', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=D', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=E', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=F', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=G', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=H', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=I', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=J', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=K', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=L', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=M', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=N', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=O', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=P', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=Q', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=R', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=S', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=T', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=U', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=V', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=W', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=X', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=Y', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=Z',
'https://ca.finance.yahoo.com/q/cp?s=%5EHSI&alpha=0', 'https://ca.finance.yahoo.com/q/cp?s=%5EHSI&alpha=1', 'https://ca.finance.yahoo.com/q/cp?s=%5EHSI&alpha=2', 'https://ca.finance.yahoo.com/q/cp?s=%5EHSI&alpha=3',
'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=A', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=B', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=C', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=D', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=E', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=F', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=G', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=H', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=I', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=J', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=K', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=L', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=M', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=N', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=O', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=P', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=Q', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=R', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=S', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=T', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=U', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=V', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=W', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=X', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=Y', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=Z',
'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=A', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=B', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=C', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=D', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=E', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=F', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=G', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=H', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=I', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=J', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=K', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=L', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=M', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=N', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=O', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=P', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=Q', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=R', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=S', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=T', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=U', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=V', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=W', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=X', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=Y', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=Z',
'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=A', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=B', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=C', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=D', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=E', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=F', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=G', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=H', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=I', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=J', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=K', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=L', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=M', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=N', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=O', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=P', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=Q', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=R', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=S', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=T', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=U', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=V', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=W', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=X', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=Y', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=Z']
def start_requests(self):
global start_time
yield Request('http://finance.yahoo.com/q?s=CAT', self.parse)
# Waits 4 seconds to allow CAT to finish crawling
if time.time() - start_time > 0.2:
for i in self.other_urls:
yield Request(i, self.parse)
def parse(self, response):
if "eoddata" in response.url:
companyList = response.xpath('//tr[@class="ro"]/td/a/text()').extract()
for company in companyList:
if "TSX" in response.url:
go = 'http://finance.yahoo.com/q/hp?s={0}.TO&a={1}&b={2}&c={3}&d={4}&e={5}&f={6}&g={7}'.format(company, beginning_month, beginning_day, beginning_year, ending_month, ending_day, ending_year, data_intervals)
yield Request(go, self.stocks1)
elif "LSE" in response.url:
go = 'http://finance.yahoo.com/q/hp?s={0}.L&a={1}&b={2}&c={3}&d={4}&e={5}&f={6}&g={7}'.format(company, beginning_month, beginning_day, beginning_year, ending_month, ending_day, ending_year, data_intervals)
yield Request(go, self.stocks1)
elif "HKEX" in response.url:
go = 'http://finance.yahoo.com/q/hp?s={0}.HK&a={1}&b={2}&c={3}&d={4}&e={5}&f={6}&g={7}'.format(company, beginning_month, beginning_day, beginning_year, ending_month, ending_day, ending_year, data_intervals)
yield Request(go, self.stocks1)
elif "AMS" in response.url:
go = 'https://ca.finance.yahoo.com/q/hp?s={0}.AS&a={1}&b={2}&c={3}&d={4}&e={5}&f={6}&g={7}'.format(company, beginning_month, beginning_day, beginning_year, ending_month, ending_day, ending_year, data_intervals)
yield Request(go, self.stocks1)
else:
go = 'https://ca.finance.yahoo.com/q/hp?s={0}&a={1}&b={2}&c={3}&d={4}&e={5}&f={6}&g={7}'.format(company, beginning_month, beginning_day, beginning_year, ending_month, ending_day, ending_year, data_intervals)
yield Request(go, self.stocks1)
elif "http://finance.yahoo.com/q?s=CAT" in response.url:
go = 'http://finance.yahoo.com/q/hp?s=CAT&a={0}&b={1}&c={2}&d={3}&e={4}&f={5}&g={6}'.format(beginning_month, beginning_day, beginning_year, ending_month, ending_day, ending_year, data_intervals)
yield Request(go, self.stocks1)
else:
rows = response.xpath('//table[@class="yfnc_tableout1"]//table/tr')[1:]
for row in rows:
company = row.xpath('.//td[1]/b/a/text()').extract()
go = 'http://finance.yahoo.com/q/hp?s={0}&a={1}&b={2}&c={3}&d={4}&e={5}&f={6}&g={7}'.format(company, beginning_day, beginning_month, beginning_year, ending_day, ending_month, ending_year, data_intervals)
yield Request(go, self.stocks1)
def stocks1(self, response):
current_page = response.url
print current_page
# If the link is not the same as the first page, ie. stocks1 is requested through stocks2, get the stock data from stocks2
if initial_ending not in current_page[-iel:]:
returns_pages = response.meta.get('returns_pages')
# Remove the last stock price from the stock list, because it is the same as the first on the new list
if not not returns_pages:
if len(returns_pages) > 2:
returns_pages = returns_pages[:-1]
else:
# Else, if the link does match that of the first page, create a new list becuase one does not exist yet
returns_pages = []
# This grabs the stock data from the page
rows = response.xpath('//table[@class="yfnc_datamodoutline1"]//table/tr')[1:]
print "stocks1"
print returns_pages
for row in rows:
cells = row.xpath('.//td/text()').extract()
try:
values = cells[-1]
try:
float(values)
# And adds it to returns_pages
returns_pages.append(values)
except ValueError:
continue
except ValueError:
continue
print "after"
print returns_pages
# exp determines if there is a 'Next page' or not
exp = response.xpath('//td[@align="right"]/a[@rel="next"]').extract()
# If there is a 'Next Page':
if not not exp:
# And this is the first page:
if initial_ending in current_page[-iel:]:
#create necessary url for the 2nd page
next_page = current_page + "&z=66&y=66"
# If this is not the first page
else:
# This increases the end of the link by 66, thereby getting the next 66 results on for pages 2 and after
u = int(current_page[-6:].split("=",1)[1])
o = len(str(u))
u += 66
next_page = current_page[:-o] + str(u)
print next_page, "66&y in curr_page"
# Then go back to self.stocks1 to get more data on the next page
yield Request(next_page, self.stocks2, meta={'returns_pages': returns_pages}, dont_filter=True)
# Else, if there is no 'Next Link'
else:
# Send the retuns to finalize.stock to be saved in the item
yield Request(current_page, callback=self.finalize_stock, meta={'returns_pages': returns_pages}, dont_filter=True)
def stocks2(self, response):
# Prints the link of the current url
current_page = response.url
print current_page
# Gets the returns from the previous page
returns_pages = response.meta.get('returns_pages')
# Removes the last return from the previous page because it will be a duplicate
returns_pages = returns_pages[:-1]
print "stocks2"
print returns_pages
# Gets all of the returns on the page
rows = response.xpath('//table[@class="yfnc_datamodoutline1"]//table/tr')[1:]
for row in rows:
cells = row.xpath('.//td/text()').extract()
try:
values = cells[-1]
try:
float(values)
# And adds it to the previous returns
returns_pages.append(values)
except ValueError:
continue
except ValueError:
continue
print "after 2"
print returns_pages
# exp determines if there is a 'Next page' or not
exp = response.xpath('//td[@align="right"]/a[@rel="next"]').extract()
# If there is a 'Next Page':
if not not exp:
# And somehow, this is the first page (should never be true)
if initial_ending in current_page[-iel:]:
# Add necessary link to go to the second page
next_page = current_page + "&z=66&y=66"
print next_page, "66&y not in curr_page"
# Else, this is not the first page (should always be true)
else:
# add 66 to the last number on the preceeding link in order to access the second or later pages
u = int(current_page[-6:].split("=",1)[1])
o = len(str(u))
u += 66
next_page = current_page[:-o] + str(u)
print next_page, "66&y in curr_page"
# go back to self.stocks1 to get more data on the next page
yield Request(next_page, self.stocks1, meta={'returns_pages': returns_pages}, dont_filter=True)
else:
# If there is no "Next" link, send the retuns to finalize.stock to be saved in the item
yield Request(current_page, callback=self.finalize_stock, meta={'returns_pages': returns_pages}, dont_filter=True)
print "sending to finalize stock"
def finalize_stock(self,response):
current_page = response.url
print "====================="
print "finalize_stock called"
print current_page
print "====================="
unformatted_returns = response.meta.get('returns_pages')
returns = [float(i) for i in unformatted_returns]
global required_amount_of_returns, counter
if counter == 1 and "CAT" in response.url:
required_amount_of_returns = len(returns)
elif required_amount_of_returns == 0:
raise CloseSpider("'Error with initiating required amount of returns'")
counter += 1
print counter
# Iterator to calculate Rate of return
# ====================================
if data_intervals == "m":
k = 12
elif data_intervals == "w":
k = 4
else:
k = 30
sub_returns_amount = required_amount_of_returns - k
sub_returns = returns[:sub_returns_amount]
rate_of_return = []
RFR = 0.03
# Make sure list is exact length, otherwise rate_of_return will be inaccurate
# Returns has not been checked by pipeline yet, so small lists will be in the variable
if len(returns) > required_amount_of_returns:
for number in sub_returns:
numerator = number - returns[k]
rate = numerator/returns[k]
if rate == '':
rate = 0
rate_of_return.append(rate)
k += 1
item = SharpeparserItem()
items = []
item['url'] = response.url
item['name'] = response.xpath('//div[@class="title"]/h2/text()').extract()
item['avg_returns'] = numpy.average(rate_of_return)
item['var_returns'] = numpy.cov(rate_of_return)
item['sd_returns'] = numpy.std(rate_of_return)
item['returns'] = unformatted_returns
item['rate_of_returns'] = rate_of_return
item['exchange'] = response.xpath('//span[@class="rtq_exch"]/text()').extract()
item['ind_sharpe'] = ((numpy.average(rate_of_return) - RFR) / numpy.std(rate_of_return))
items.append(item)
yield item