For some reason when I run this code it keeps looping over the same object and is not getting any new items from the database. In other words, the print output is just the same object over and over, when it should be iterating over items in the list. Here is my code:
article = Article.objects.filter(is_locked=False, is_downloaded=False).first()
while article:
article.is_locked = True
article.save()
print '******************************'
date = article.datetime
title = article.title
url = article.url
print('date: %s' % date)
print('url: %s' % url)
print('title: %s' % title)
get_article(url, title, article)
article = Article.objects.filter(is_locked=False, is_downloaded=False).first()
Where mldb.models is:
from django.db import models
class Article(models.Model):
url = models.CharField(max_length=1028)
title = models.CharField(max_length=1028)
category = models.CharField(max_length=128)
locale = models.CharField(max_length=128)
section = models.CharField(max_length=512)
tag = models.CharField(max_length=128)
author = models.CharField(max_length=256)
datetime = models.DateTimeField()
description = models.TextField()
article = models.TextField()
is_locked = models.BooleanField(default=False)
is_downloaded = models.BooleanField(default=False)
def __str__(self): # __unicode__ on Python 2
return self.name
class Meta:
app_label = 'mldb'
I have also tried this but it also does not loop through objects either (the loop just repeats the same object over and over):
articles = Article.objects.filter(is_locked=False, is_downloaded=False)
for article in articles:
...
Here is get_article(). This seems to be what is causing the problem (if I remove the call to this function everything works properly):
def get_article(url, title, article):
failed_attempts = 0
while True:
try:
content = urllib2.urlopen(url).read()
soup = BeautifulSoup(content, "html5lib")
description = soup.find(property="og:description")["content"] if soup.find(property="og:description") else ''
locale = soup.find(property="og:locale")["content"] if soup.find(property="og:locale") else ''
section = soup.find(property="og:article:section")["content"] if soup.find(property="og:article:section") else ''
tag = soup.find(property="og:article:tag")["content"] if soup.find(property="og:article:tag") else ''
author = soup.find(property="og:article:author")["content"] if soup.find(property="og:article:author") else ''
date = soup.find(property="og:article:published_time")["content"] if soup.find(property="og:article:published_time") else ''
print 'date'
print date
body = ''
for body_tag in soup.findAll("div", {"class" : re.compile('ArticleBody_body.*')}):
body += body_tag.text
# datetime.strptime (ts, "%Y") # 2012-01-02T04:32:57+0000
dt = dateutil.parser.parse(date, fuzzy=True)
print dt
print url
article.title = title.encode('utf-8')
article.url = url.encode('utf-8')
article.description = description.encode('utf-8')
article.locale = locale.encode('utf-8')
article.section = section.encode('utf-8')
article.tag = tag.encode('utf-8')
article.author = author.encode('utf-8')
article.body = body.encode('utf-8')
article.is_downloaded = True
article.article = body
article.save()
print(description.encode('utf-8'))
except (urllib2.HTTPError, ValueError) as err:
print err
time.sleep(20)
failed_attempts += 1
if failed_attempts < 10:
continue
Any ideas?