0

I am trying to optimize this code. Right now it completes 340 requests in 10 minutes; I am trying to reach 1800 requests in 30 minutes, since the Amazon API allows one request per second. Can I use multithreading with this code to increase the number of requests?

However, I currently read the full data set inside the main function. Should I split it up now, and how can I figure out how many items each thread should take?

def newhmac():
    """Return a new HMAC object keyed with AWS_SECRET_ACCESS_KEY.

    SHA-256 is used as the digest. A separate object is created on every
    call, with no message data fed into it yet.
    """
    signer = hmac.new(AWS_SECRET_ACCESS_KEY, digestmod=sha256)
    return signer

def getSignedUrl(params):
    """Build a signed GET URL for the request described by ``params``.

    The fixed request fields (Version, AWSAccessKeyId, Service, Timestamp)
    are added to a copy of ``params``, so the caller's dict is not modified.
    The signed text is ``"GET\\n<server>\\n<path>\\n<sorted query string>"``,
    fed to the HMAC object returned by newhmac(); the base64-encoded digest
    is appended to the URL as the ``Signature`` parameter.

    Args:
        params: dict mapping query-parameter names to string values.

    Returns:
        The complete ``http://webservices.amazon.com/onca/xml?...`` URL,
        ending with ``&Signature=<percent-encoded signature>``.
    """
    action = 'GET'
    server = "webservices.amazon.com"
    path = "/onca/xml"

    # Work on a copy so that signing never alters the caller's dict.
    signed_params = dict(params)
    signed_params['Version'] = '2013-08-01'
    signed_params['AWSAccessKeyId'] = AWS_ACCESS_KEY_ID
    signed_params['Service'] = 'AWSECommerceService'
    signed_params['Timestamp'] = time.strftime("%Y-%m-%dT%H:%M:%SZ",
                                               time.gmtime())

    # Percent-encode everything except the RFC 3986 unreserved characters.
    # quote()'s default safe set ('/') would leave "/" unescaped, which does
    # not match the encoding the signature scheme expects.
    unreserved = '-_.~'
    key_values = sorted(
        (urllib.quote(k, unreserved), urllib.quote(v, unreserved))
        for k, v in signed_params.items())
    # The same query string is used both in the URL and in the signed text.
    paramstring = '&'.join('%s=%s' % (k, v) for k, v in key_values)

    signer = newhmac()
    signer.update(action + "\n" + server + "\n" + path + "\n" + paramstring)
    # b64encode emits no newlines, so no strip() is needed.
    signature = base64.b64encode(signer.digest())

    return ("http://" + server + path + "?" + paramstring +
            "&Signature=" + urllib.quote(signature, unreserved))

def readData():
    """Return the first column of every non-empty row in ASIN.csv.

    Blank lines in the file come back from csv.reader as empty rows; they
    are skipped instead of raising IndexError on ``row[0]``.

    Returns:
        A list of strings, one per non-empty row, in file order.
    """
    with open("ASIN.csv") as f:
        return [row[0] for row in csv.reader(f) if row]

def writeData(data):
    """Append the rows in ``data`` to data.csv.

    The file is opened in append mode, so rows written by earlier calls
    are kept. ``data`` is an iterable of rows, each row an iterable of
    field values.
    """
    with open("data.csv", "a") as out_file:
        csv.writer(out_file).writerows(data)

def main():
    """Look up offer data for every ASIN from readData() and save it.

    The ASINs are sent in batches of up to 10 per ItemLookup request. Each
    response is parsed with BeautifulSoup, and one (asin, price, quantity,
    prime) tuple per matched entry is collected. All collected tuples are
    passed to writeData() once, after the last request.

    The loop pauses for one second before every fourth request (requests
    0, 4, 8, ...).
    """
    data = readData()
    filtData = []
    batch_size = 10  # number of ASINs sent in one request

    # Stepping with range() covers the whole list: the previous
    # "i < len(data) - 10" condition skipped the final full batch and any
    # shorter trailing batch.
    for count, i in enumerate(range(0, len(data), batch_size)):
        if count % 4 == 0:
            time.sleep(1)
        batch = data[i:i + batch_size]
        params = {'ResponseGroup': 'OfferFull,Offers',
                  'AssociateTag': '4chin-20',
                  'Operation': 'ItemLookup',
                  'IdType': 'ASIN',
                  'ItemId': ','.join(batch)}
        url = getSignedUrl(params)
        resp = requests.get(url)
        responseSoup = BeautifulSoup(resp.text)

        # Both the quantity and price lists come from the same elements.
        summaries = responseSoup.findAll("offersummary")
        quantity = ['' if product.amount is None else product.amount.text
                    for product in summaries]
        price = ['' if product.lowestnewprice is None
                 else product.lowestnewprice.formattedprice.text
                 for product in summaries]
        prime = ['' if product.iseligibleforprime is None
                 else product.iseligibleforprime.text
                 for product in responseSoup("offer")]

        # NOTE(review): zip pairs the lists by position and stops at the
        # shortest one. This assumes the response holds exactly one
        # offersummary and one offer per requested ASIN, in request order --
        # confirm against real responses.
        for zz in zip(batch, price, quantity, prime):
            # Single-argument print(...) gives the same output on Python 2
            # and Python 3.
            print(zz)
            filtData.append(zz)

        print("%s %s" % (i, len(filtData)))
    writeData(filtData)


# Schedule a single call to main() on a timer thread, 1.0 second after start.
threading.Timer(interval=1.0, function=main).start()
Ben
  • 391
  • 1
  • 5
  • 19
  • Your code is slow because you are running your requests synchronously, one after another. You can set up a script that uses Python 3's asyncio or a threaded handler like this: http://stackoverflow.com/a/2635066/2178164 – jumbopap Jan 18 '16 at 04:23
  • @jumbopap Thanks, lemme have a look and adjust my code and see what happens. – Ben Jan 18 '16 at 04:26

1 Answer

2

If you are using Python 3.2 or later, you can use the concurrent.futures library to easily launch tasks in multiple threads. For example, here I simulate running 10 URL-parsing jobs in parallel, each of which takes 1 second. Run synchronously they would take 10 seconds, but with a thread pool of 10 workers they should take about 1 second in total:

import time
from concurrent.futures import ThreadPoolExecutor

def parse_url(url, delay=1):
    """Simulate a slow URL-parsing job.

    Sleeps for ``delay`` seconds to stand in for the real work, then prints
    ``url``.

    Args:
        url: the URL to "parse"; it is only printed.
        delay: seconds to sleep before printing. Defaults to 1, the
            duration used in the example.

    Returns:
        The string "done.".
    """
    time.sleep(delay)
    print(url)
    return "done."

# Submit 10 parse_url jobs to a pool of 10 worker threads and time the run.
st = time.time()
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = [executor.submit(parse_url, "http://google.com/%s" % i)
               for i in range(10)]
    # result() re-raises any exception raised inside a worker; without this
    # call a failed job would go unnoticed.
    for future in futures:
        future.result()

print("total time: %s" % (time.time() - st))

Output:

http://google.com/0
http://google.com/1
http://google.com/2
http://google.com/3
http://google.com/4
http://google.com/5
http://google.com/6
http://google.com/7
http://google.com/8
http://google.com/9
total time: 1.0066466331481934
Anurag Uniyal
  • 85,954
  • 40
  • 175
  • 219