I am getting the below error when I am downloading files using multiprocessing. I am downloading Wikipedia page views and they have it by hour so it might include a lot of downloading.
Any recommendation to why this error is caused and HOW TO SOLVE IT? Thanks
MaybeEncodingError: Error sending result: ''. Reason: 'TypeError("cannot serialize '_io.BufferedReader' object",)'
import fnmatch
import requests
import urllib.request
from bs4 import BeautifulSoup
import multiprocessing as mp
def download_it(download_file):
global path_to_save_document
filename = download_file[download_file.rfind("/")+1:]
save_file_w_submission_path = path_to_save_document + filename
request = urllib.request.Request(download_file)
response = urllib.request.urlopen(request)
data_content = response.read()
with open(save_file_w_submission_path, 'wb') as wf:
wf.write(data_content)
print(save_file_w_submission_path)
pattern = r'*200801*'
url_to_download = r'https://dumps.wikimedia.org/other/pagecounts-raw/'
path_to_save_document = r'D:\Users\Jonathan\Desktop\Wikipedia\\'
def main():
global pattern
global url_to_download
r = requests.get(url_to_download)
data = r.text
soup = BeautifulSoup(data,features="lxml")
list_of_href_year = []
for i in range(2):
if i == 0:
for link in soup.find_all('a'):
lien = link.get('href')
if len(lien) == 4:
list_of_href_year.append(url_to_download + lien + '/')
elif i == 1:
list_of_href_months = []
list_of_href_pageviews = []
for loh in list_of_href_year:
r = requests.get(loh)
data = r.text
soup = BeautifulSoup(data,features="lxml")
for link in soup.find_all('a'):
lien = link.get('href')
if len(lien) == 7:
list_of_href_months.append(loh + lien + '/')
if not list_of_href_months:
continue
for lohp in list_of_href_months:
r = requests.get(lohp)
data = r.text
soup = BeautifulSoup(data,features="lxml")
for link in soup.find_all('a'):
lien = link.get('href')
if "pagecounts" in lien:
list_of_href_pageviews.append(lohp + lien)
matching_list_of_href = fnmatch.filter(list_of_href_pageviews, pattern)
matching_list_of_href.sort()
with mp.Pool(mp.cpu_count()) as p:
print(p.map(download_it, matching_list_of_href))
if __name__ == '__main__':
main()