multi threaded crawler in python

Question

I am trying to implement a multihtreaded crawler that takes an initial url and searches for links within that link and displays each links and at the same time look for links within each link

This is my code

import urllib.request, re, threading, csv
from queue import Queue
from bs4 import BeautifulSoup
from sys import exit

class a3_6:

    __url_q = Queue(100)
    __html_q = Queue()
    __data_q = Queue()
    __visited_urls = []

    def __init__(self, start_url, max_threads):
        self.__url_q.put(start_url)
        self.max_threads = max_threads

    def gethtml(self,url):
        try:
            req=urllib.request.Request(url)
            html=urllib.request.urlopen(req).read()
            self.__html_q.put(html)
        except urllib.error.URLError as e:
            print(e.reason)
        except:
            print("invalid: " + url)
        self.__visited_urls.append(url)

    def mine_thread(self):
        while True:
            if not self.__html_q.empty():
                soup = BeautifulSoup(self.__html_q.get(),"html.parser")
                for a in soup.find_all('a', href=True):
                    if a not in self.__visited_urls:
                        link='https://en.wikipedia.org'+a.get('href')
                        self.__url_q.put(link)
                        self.__data_q.put(link)
            else:
                break

    def store(self):
        while True:
            if not self.__data_q.empty():
                print (self.__data_q.get())

    def download_thread(self):
        while True:
            if not self.__url_q.empty():
                self.gethtml(self.__url_q.get())
            else:
                break

    def run(self):
        self.download_thread()
        self.mine_thread()
        self.store()

    def op(self):
        for x in range(self.max_threads):
            t = threading.Thread(target=self.run)
            t.daemon = True
            t.start()
        self.store()


if __name__ == '__main__':
    a=a3_6('https://en.wikipedia.org/wiki/Main_Page', 5)
    a.op()

EDIT: I edited the code and now I am getting proper results but again not ending.

When you call `self.store` from the `op` function, it will most likely return after one iteration of the `while` loop because no data will be in the `__data_q` yet. I think you want to take the `else: break` out of that loop. — horns, Nov 13 '15 at 15:22
You probably want to look at a ThreadPool or similar: http://stackoverflow.com/questions/3033952/python-thread-pool-similar-to-the-multiprocessing-pool rather than managing your own. — James Harrison, Nov 13 '15 at 15:24

score 0 · Accepted Answer · answered Nov 13 '15 at 22:04

I arrived at the solution. I took James Harrison's help. i don't know why he deleted his original solution but here it is

import urllib.request, threading
from queue import Queue
from bs4 import BeautifulSoup
from sys import exit
from a3_3 import store_to_db

class a3_5:

    __url_q = Queue(100)
    __html_q = Queue()
    __data_q = Queue()
    __visited_urls=[]

    def gethtml(self,url):
        try:
            req=urllib.request.Request(url)
            html=urllib.request.urlopen(req).read()
            self.__html_q.put(html)
            pars=urlparse(url)
        except urllib.error.URLError as e:
            print(e.reason+':'+url)
        except:
            print("invalid: " + url)

    def mine_thread(self):
        while True:
            if not self.__html_q.empty():
                soup = BeautifulSoup(self.__html_q.get(),"html.parser")
                for a in soup.find_all('a', href=True):
                    link=a.get('href')
                    """if not link.startswith('www'):
                        link=self.__prfx+link"""
                    if link not in self.__visited_urls:
                        self.__url_q.put(link)
                        self.__data_q.put(link)
            else:
                break

    def store(self):
        while True:
            if not self.__data_q.empty():
                cont=self.__data_q.get()
                print (cont)
            else:
                break

    def download_thread(self):
        while True:
            if not self.__url_q.empty():
                self.gethtml(self.__url_q.get())
                self.__url_q.task_done()

    def op(self,*urls):
        for x in range(25):
            d = threading.Thread(target=self.download_thread)
            d.setDaemon(True)
            d.start()
        for url in urls:
            self.__url_q.put(url)
        self.__url_q.join()
        self.mine_thread()
        self.store()

if __name__ == '__main__':
    urls=['https://en.wikipedia.org/wiki/Bajirao']#,'https://en.wikipedia.org/wiki/Malharrao_Holkar','https://en.wikipedia.org/wiki/Ranoji_Scindia']
    a=a3_5()
    a.op(*urls)

Essentially I had to arrange another queue where I had to set the workers to activate the threads. Also, the mine_thread and store methods needed to start after the completion of download_thread method, because the values wouldn't get stored.

multi threaded crawler in python

1 Answers1