I have a file with several thousand rows. The format is basically:
text \t url1, url2, url3
Now, I have a process that takes each row and searches for content in the URLs mentioned. Since fetching the content over the network takes some time, I want to use multiprocessing to speed things up.
import gensim, re, string
import multiprocessing
import bz2
from multiprocessing import Pool
import mwparserfromhell, os, time
import codecs
import multiprocessing as mp
from boilerpipe.extract import Extractor
#from multiprocessing import freeze_support
import WikiExtractor as wikiextractor
from gensim.utils import to_unicode,any2unicode
#from lxml.etree import tounicode
from collections import defaultdict
from itertools import izip_longest
def read_file(tot):
    """Yield (content, url_list) pairs from the sequence-url mapping file.

    Each line of the file has the form ``content||set([u'url1', u'url2'])``;
    the serialized ``set([u'...'])`` wrapper is stripped so the caller gets a
    plain list of URL strings.

    ``tot`` is accepted for interface compatibility but is currently unused.
    """
    file_to_read = os.getcwd() + "/../wikidata/sequence-url-mapper.txt"
    # 'with' guarantees the handle is closed even if the consumer abandons
    # the generator early; the original close() was only reached when the
    # generator was fully exhausted.
    with open(file_to_read, 'r') as f_r:
        for line in f_r:
            content, urls = line.strip().split("||")
            # Strip the "set([u'...'])" serialization, leaving bare URLs.
            urls = urls.replace('set([u\'', '').replace('\'])', '').split(",")
            yield content, urls
def aggregate_web_content(urls):
    """Fetch each URL and return the extracted article text, concatenated.

    For every URL, boilerpipe's ArticleExtractor is run; pages yielding five
    or fewer lines are skipped, and within a kept page only lines longer than
    10 characters (stripped) are retained.  The text for each URL is followed
    by a '||' separator.  URLs that fail to download or parse are skipped
    silently (best-effort aggregation).
    """
    parts = []  # collect pieces and join once; += in a loop is quadratic
    for url in urls:
        try:
            extractor = Extractor(extractor='ArticleExtractor', url=url)
            retrieved_text = extractor.getText()
        except Exception:
            # Best-effort: skip URLs that fail, but unlike the original bare
            # 'except:' do not swallow KeyboardInterrupt/SystemExit.
            continue
        lines = retrieved_text.split('\n')
        if len(lines) <= 5:
            continue
        for line in lines:
            if len(line.strip()) > 10:
                # Collapse runs of whitespace.  The original used
                # str.replace('\s+', ' '), which matches the literal three
                # characters '\s+' and therefore never fired.
                parts.append(' ' + re.sub(r'\s+', ' ', line))
        parts.append("||")
    return ''.join(parts)
if __name__ == "__main__":
pool = mp.Pool(processes=4)
result_list=[]
f_seq=codecs.open(os.getcwd()+'/../wikidata/sequence-content-mapper.txt','w')
k=0
for content, urllist in read_file(1):
content = content.replace('\s+',' ')
ext_content=aggregate_web_content(urllist).encode('utf-8','ignore')
if len(ext_content.strip()) > 10:
f_seq.write(content+"\t"+ext_content+'\n')
k+=1
#if k==10:
# break
#
if k>0 and k%10 == 0:
print "Done with ", k , " lines.."
if k == 10000:
break
f_seq.close()
I am not able to figure out how to use the pool so that the rows are processed separately in different worker processes.