I have this code that iterates through a text file of URLs and searches each page for files to download:
import csv
import os
import urlparse

from re import compile
from urllib import urlretrieve
from urllib2 import urlopen

from bs4 import BeautifulSoup as bs

URLS = open("urlfile.txt").readlines()

def downloader():
    with open('data.csv', 'w') as csvfile:
        writer = csv.writer(csvfile)
        for url in downloadtools.URLS:
            try:
                html_data = urlopen(url)
            except:
                print 'Error opening URL: ' + url
                pass
            # Creates a BS object out of the open URL.
            soup = bs(html_data)
            # Parses the URL for later use.
            urlinfo = urlparse.urlparse(url)
            domain = urlparse.urlunparse((urlinfo.scheme, urlinfo.netloc, '', '', '', ''))
            path = urlinfo.path.rsplit('/', 1)[0]
            FILETYPE = ['\.pdf$', '\.ppt$', '\.pptx$', '\.doc$', '\.docx$', '\.xls$', '\.xlsx$', '\.wmv$', '\.mp4$', '\.mp3$']
            # Iterates through the list of file types for the open URL.
            for types in FILETYPE:
                for link in soup.findAll(href=compile(types)):
                    urlfile = link.get('href')
                    filename = urlfile.split('/')[-1]
                    # Appends or increments a numeric suffix until the filename is unique.
                    while os.path.exists(filename):
                        try:
                            fileprefix = filename.split('_')[0]
                            filetype = filename.split('.')[-1]
                            num = int(filename.split('.')[0].split('_')[1])
                            filename = fileprefix + '_' + str(num + 1) + '.' + filetype
                        except:
                            filetype = filename.split('.')[1]
                            fileprefix = filename.split('.')[0] + '_' + str(1)
                            filename = fileprefix + '.' + filetype
                    # Creates a full URL if needed.
                    if '://' not in urlfile and not urlfile.startswith('//'):
                        if not urlfile.startswith('/'):
                            urlfile = urlparse.urljoin(path, urlfile)
                        urlfile = urlparse.urljoin(domain, urlfile)
                    # Downloads the urlfile or logs an error for manual inspection.
                    try:
                        urlretrieve(urlfile, filename, Percentage)  # Percentage: progress reporthook defined elsewhere
                        writer.writerow(['SUCCESS', url, urlfile, filename])
                        print " SUCCESS"
                    except:
                        print " ERROR"
                        writer.writerow(['ERROR', url, urlfile, filename])
Everything works fine except that the data is not being written to the CSV. No directories are being changed (that I know of, at least...).
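To double-check the directory side of things, something like this should show where the relative 'data.csv' path actually resolves when run from the same place as the script:

import os
# prints the current working directory and where 'data.csv' would be created
print os.getcwd()
print os.path.abspath('data.csv')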
The script iterates through the external list of URLs, finds the files, downloads them properly, and prints "SUCCESS" or "ERROR" without issue. The only thing it's NOT doing is writing to the CSV file; it runs through in its entirety without writing a single row.
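For reference, this is the bare write path I would expect to behave identically, stripped of all the download logic (the row values here are just placeholders):

import csv

with open('data.csv', 'w') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['SUCCESS', 'http://example.com/page', 'http://example.com/file.pdf', 'file.pdf'])
# the with-block flushes and closes the file on exit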
I tried running it in a virtualenv to make sure there weren't any weird package issues.
Is there something going on with my nested loops that's causing the CSV data to fail to write?