Basically, I pull a series of links from my database and scrape each page for the specific links I'm looking for. I then feed those new links back into the link queue that my multiple QWebViews share, and they continue pulling pages down for processing/storage.
My issue is that as this runs for, say, 200 or 500 links, it uses more and more RAM.
I have looked into this exhaustively, using heapy, memory_profiler, and objgraph, to figure out what's causing the leak. The Python heap's objects stay about the same in both count AND size over time. This made me think the C++ objects weren't getting freed. Sure enough, memory_profiler shows that RAM only goes up when the self.load(self.qurl) lines of code are called. I've tried to fix this (one example attempt is sketched at the end of this post), but to no avail.
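To make that comparison concrete, here's roughly the kind of check I'm running (a simplified sketch; report_memory is just an illustrative helper, and it assumes guppy is installed):

import resource
from guppy import hpy

heap_inspector = hpy()

def report_memory(label):
    # Resident set size of the whole process: Python AND C++ allocations.
    # On Linux, ru_maxrss is in kilobytes, so dividing by 1024 gives MB.
    rss_mb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024
    # Bytes held by objects the Python allocator knows about.
    python_heap_bytes = heap_inspector.heap().size
    print "%s: RSS %d MB, Python heap %d bytes" % (label, rss_mb, python_heap_bytes)

The Python heap number stays flat while the RSS climbs, which is why I'm convinced the growth is in allocations Python can't see, i.e. on the Qt/WebKit side.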
Code:
from PyQt4.QtCore import QUrl
from PyQt4.QtWebKit import QWebView, QWebSettings
from PyQt4.QtGui import QApplication
from lxml.etree import HTMLParser
# My functions
from util import dump_list2queue, parse_doc
class ThreadFlag:
    def __init__(self, threads, jid, db):
        self.threads = threads
        self.job_id = jid
        self.db_direct = db
        self.xml_parser = HTMLParser()
class WebView(QWebView):
    def __init__(self, thread_flag, id_no):
        super(WebView, self).__init__()
        self.loadFinished.connect(self.handleLoadFinished)
        self.settings().globalSettings().setAttribute(QWebSettings.AutoLoadImages, False)
        # This is actually a dict with a few additional details about the url we want to pull.
        self.url = None
        # Reuse a single QUrl instance to avoid memory leaks.
        self.qurl = QUrl()
        # id of the webview instance
        self.id = id_no
        # Status of the webview instance: GREEN means it isn't working, YELLOW means it is.
        self.status = 'GREEN'
        # Reference to a single universal object all the webview instances can see.
        self.thread_flag = thread_flag
    def handleLoadFinished(self):
        try:
            self.processCurrentPage()
        except Exception as e:
            print e

        self.status = 'GREEN'

        if not self.fetchNext():
            # We're finished!
            self.loadFinished.disconnect()
            self.stop()
        else:
            # We're not finished! Do the next url.
            self.qurl.setUrl(self.url['url'])
            self.load(self.qurl)
    def processCurrentPage(self):
        self.frame = str(self.page().mainFrame().toHtml().toUtf8())
        # This is the case for the initial web pages I want to gather links from.
        if 'name' in self.url:
            # Parse the html string for the links I'm looking for.
            new_links = parse_doc(self.thread_flag.xml_parser, self.url, self.frame)
            if len(new_links) == 0:
                return 0
            fkid = self.url['pkid']
            new_links = map(lambda x: (fkid, x['title'], x['url'], self.thread_flag.job_id), new_links)
            # Post links to the database; the db de-dupes, then we re-pull the ones that made it in.
            self.thread_flag.db_direct.post_links(new_links)
            added_links = self.thread_flag.db_direct.get_links(self.thread_flag.job_id, fkid)
            # Add the pulled links to the central queue all the qwebviews pull from.
            dump_list2queue(added_links, self._urls)
            del added_links
        else:
            # Process one of the links pulled from the initial set of data that was originally in the queue.
            print "Processing target link!"

    # Get the next url from the universal queue!
    def fetchNext(self):
        if self._urls and self._urls.empty():
            self.status = 'GREEN'
            return False
        else:
            self.status = 'YELLOW'
            self.url = self._urls.get()
            return True

    def start(self, urls):
        # This is where the reference to the universal queue gets made.
        self._urls = urls
        if self.fetchNext():
            self.qurl.setUrl(self.url['url'])
            self.load(self.qurl)
# uq = central url queue shared between webview instances
# ta = array of webview objects
# tf = thread flag (basically just a custom universal object that all the webviews can access).
# This main "program" is started by another script elsewhere.
def main_program(uq, ta, tf):
    app = QApplication([])
    webviews = ta
    threadflag = tf
    tf.app = app
    print "Beginning the multiple async web calls..."
    # Create n "threads" (really just webviews) that each make asynchronous calls.
    for n in range(0, threadflag.threads):
        webviews.append(WebView(threadflag, n + 1))
        webviews[n].start(uq)
    app.exec_()
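For context, the script that starts this "program" looks something like the following (a simplified sketch: FakeDB, the seed url, and the job id below are made-up stand-ins for my actual setup):

import Queue

class FakeDB(object):
    # Stand-in for my real database wrapper; only the two methods the webviews call.
    def post_links(self, links):
        pass
    def get_links(self, job_id, fkid):
        return []

# Central queue of url dicts shared by every webview. Seed entries carry
# 'name' and 'pkid' so processCurrentPage knows to harvest links from them.
url_queue = Queue.Queue()
url_queue.put({'url': 'http://example.com/start', 'name': 'seed page', 'pkid': 1})

webview_array = []
thread_flag = ThreadFlag(threads=4, jid=42, db=FakeDB())

main_program(url_queue, webview_array, thread_flag)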
Here's what my memory tools report (all of them stay roughly constant over the program's run):
- RAM: resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024
  2491 (MB)
- Objgraph most common types:
methoddescriptor 9959
function 8342
weakref 6440
tuple 6418
dict 4982
wrapper_descriptor 4380
getset_descriptor 2314
list 1890
method_descriptor 1445
builtin_function_or_method 1298
- Heapy:
Partition of a set of 9879 objects. Total size = 1510000 bytes.
Index Count % Size % Cumulative % Kind (class / dict of class)
     0   2646  27   445216  29    445216  29 str
     1    563   6   262088  17    707304  47 dict (no owner)
     2   2267  23   199496  13    906800  60 __builtin__.weakref
     3   2381  24   179128  12   1085928  72 tuple
     4    212   2   107744   7   1193672  79 dict of guppy.etc.Glue.Interface
     5     50   1    52400   3   1246072  83 dict of guppy.etc.Glue.Share
     6    121   1    40200   3   1286272  85 list
     7    116   1    32480   2   1318752  87 dict of guppy.etc.Glue.Owner
     8    240   2    30720   2   1349472  89 types.CodeType
     9     42   0    24816   2   1374288  91 dict of class
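As mentioned above, I've tried to fix the growth, to no avail. One example of the kind of thing I've been trying is shrinking WebKit's caches between loads (a sketch of one attempt, not a claimed fix; these QWebSettings calls require Qt 4.8+):

from PyQt4.QtWebKit import QWebSettings

def shrink_webkit_caches():
    # Turn off the back/forward page cache entirely.
    QWebSettings.setMaximumPagesInCache(0)
    # Zero out the object cache (min dead, max dead, and total capacities).
    QWebSettings.setObjectCacheCapacities(0, 0, 0)
    # Ask WebKit to release whatever its memory caches are holding.
    QWebSettings.clearMemoryCaches()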