I am trying to generate traffic on the network by opening a large list of sites read from a text file. For each site I want to fetch the page, collect all of its href links, visit each of those links, and then move on to the next site in the file.
The problem I have been noticing is that these requests take a long time to execute, upwards of 5 seconds per curl, as the timestamps below show. Is this because of my excessive use of try/except blocks? I'm just trying to understand where the problem may be.
2018-03-14 16:30:32.590135
http://www.ipostparcels.com/parcel-delivery/amazon-parcel-delivery
2018-03-14 16:30:37.653522
http://www.ipostparcels.com/parcel-delivery/abot-ipostparcels
2018-03-14 16:30:42.716842
http://www.ipostparcels.com/parcel-delivery/parcel-delivery-rates
2018-03-14 16:30:47.762127
http://www.ipostparcels.com/parcel-delivery/parcel-collection-and-delivery
2018-03-14 16:30:52.809792
http://www.ipostparcels.com/parcel-delivery/post-for-a-post
2018-03-14 16:30:57.876936
http://www.ipostparcels.com/parcel-delivery/discont-codes-and-offers
2018-03-14 16:31:02.947123
http://www.ipostparcels.com/corier/ebay-corier-service
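To see where each request actually spends its time, pycurl can report per-phase timings via getinfo(). Here is a minimal sketch of what I mean (timed_fetch and the example URL are just placeholders, not part of my script):

import pycurl

def timed_fetch(url):
    # Fetch one URL, discard the body, and print where the time went.
    c = pycurl.Curl()
    c.setopt(c.URL, url)
    c.setopt(c.FOLLOWLOCATION, True)
    c.setopt(c.MAXREDIRS, 5)
    c.setopt(c.TIMEOUT, 3)
    c.setopt(c.WRITEFUNCTION, lambda data: None)  # keep the body off stdout
    c.perform()
    print "dns lookup:  %.3fs" % c.getinfo(pycurl.NAMELOOKUP_TIME)
    print "connect:     %.3fs" % c.getinfo(pycurl.CONNECT_TIME)
    print "first byte:  %.3fs" % c.getinfo(pycurl.STARTTRANSFER_TIME)
    print "total:       %.3fs" % c.getinfo(pycurl.TOTAL_TIME)
    c.close()

timed_fetch('http://www.example.com/')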
#!/usr/bin/python
from bs4 import BeautifulSoup
import urllib2
import pycurl
import re
import random
from datetime import datetime

while True:
    with open("topdomains3.txt", "r") as f:
        domains = list(f)
    # randrange stays in bounds; randint(1, len(domains)) could return
    # len(domains), which makes the slice below empty
    joker = random.randint(1, len(domains) - 1)
    for i in domains[joker:]:
        i = i.replace("None", "").rstrip()  # rstrip drops the trailing newline
        print i
        links = []  # reset per site so old links aren't re-fetched every pass
        try:
            c = pycurl.Curl()
            c.setopt(c.URL, i)
            c.setopt(pycurl.TIMEOUT, 3)
            c.setopt(c.FOLLOWLOCATION, True)
            c.setopt(c.MAXREDIRS, 5)
            try:
                # each page is fetched here with urllib2 and again below with
                # pycurl, so every site costs two requests
                html_page = urllib2.urlopen('http://' + i)
                soup = BeautifulSoup(html_page, 'html5lib')
            except Exception as e:
                print e
                continue
            for link in soup.findAll('a', attrs={'href': re.compile("^http")}):
                # href is already a plain string; the old .replace("u", "")
                # stripped every letter "u" out of the URLs (hence "corier"
                # and "discont" in the output above)
                links.append(link.get('href'))
            for a in links:
                try:
                    print "----------------------------------------------------------"
                    print str(datetime.now())
                    print a
                    d = pycurl.Curl()
                    d.setopt(d.URL, str(a))
                    d.setopt(d.TIMEOUT, 3)
                    d.setopt(d.FOLLOWLOCATION, True)
                    d.setopt(d.MAXREDIRS, 5)
                    #d.setopt(pycurl.WRITEFUNCTION, lambda x: None)
                    d.perform()
                    d.close()
                except pycurl.error:
                    continue
            c.perform()
            c.close()
        except pycurl.error:
            continue
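For what it's worth, I also wanted to check whether try/except alone could account for seconds of delay. A quick timeit comparison like the following sketch should show the per-iteration cost (numbers will vary by machine):

import timeit

# How much does wrapping a trivial statement in try/except cost?
plain = timeit.timeit('x = 1', number=1000000)
wrapped = timeit.timeit('''
try:
    x = 1
except Exception:
    pass
''', number=1000000)
print "plain:       %.3fs per 1M runs" % plain
print "try/except:  %.3fs per 1M runs" % wrapped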
Any assistance would be appreciated.